* [PATCH for-next] RDMA/hns: Support mmapping reset state to userspace
@ 2024-10-14 13:07 Junxian Huang
2024-11-04 8:31 ` Leon Romanovsky
2024-12-09 19:01 ` Jason Gunthorpe
0 siblings, 2 replies; 8+ messages in thread
From: Junxian Huang @ 2024-10-14 13:07 UTC (permalink / raw)
To: jgg, leon
Cc: linux-rdma, linuxarm, linux-kernel, tangchengchang, huangjunxian6
From: Chengchang Tang <tangchengchang@huawei.com>
Mmap the reset state to notify userspace about HW resets. The mmapped
flag hw_ready is initialized to a non-zero value. When the HW is reset,
the mmapped page is zapped and userspace reads hw_ready back as zero.
Signed-off-by: Chengchang Tang <tangchengchang@huawei.com>
Signed-off-by: Junxian Huang <huangjunxian6@hisilicon.com>
---
drivers/infiniband/hw/hns/hns_roce_device.h | 4 ++
drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 47 ++++++++++++++++++++-
drivers/infiniband/hw/hns/hns_roce_main.c | 36 ++++++++++++++++
include/uapi/rdma/hns-abi.h | 6 +++
4 files changed, 91 insertions(+), 2 deletions(-)
diff --git a/drivers/infiniband/hw/hns/hns_roce_device.h b/drivers/infiniband/hw/hns/hns_roce_device.h
index 0b1e21cb6d2d..59bca8067a7f 100644
--- a/drivers/infiniband/hw/hns/hns_roce_device.h
+++ b/drivers/infiniband/hw/hns/hns_roce_device.h
@@ -202,6 +202,7 @@ struct hns_roce_uar {
enum hns_roce_mmap_type {
HNS_ROCE_MMAP_TYPE_DB = 1,
HNS_ROCE_MMAP_TYPE_DWQE,
+ HNS_ROCE_MMAP_TYPE_RESET,
};
struct hns_user_mmap_entry {
@@ -216,6 +217,7 @@ struct hns_roce_ucontext {
struct list_head page_list;
struct mutex page_mutex;
struct hns_user_mmap_entry *db_mmap_entry;
+ struct hns_user_mmap_entry *reset_mmap_entry;
u32 config;
};
@@ -1020,6 +1022,8 @@ struct hns_roce_dev {
int loop_idc;
u32 sdb_offset;
u32 odb_offset;
+ struct page *reset_page; /* store reset state */
+ void *reset_kaddr; /* addr of reset page */
const struct hns_roce_hw *hw;
void *priv;
struct workqueue_struct *irq_workq;
diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
index f1feaa79f78e..2f72074b7cf9 100644
--- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
+++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
@@ -37,6 +37,7 @@
#include <linux/kernel.h>
#include <linux/types.h>
#include <linux/workqueue.h>
+#include <linux/vmalloc.h>
#include <net/addrconf.h>
#include <rdma/ib_addr.h>
#include <rdma/ib_cache.h>
@@ -2865,6 +2866,36 @@ static int free_mr_init(struct hns_roce_dev *hr_dev)
return ret;
}
+static int hns_roce_v2_get_reset_page(struct hns_roce_dev *hr_dev)
+{
+ struct hns_roce_reset_state *state;
+
+ hr_dev->reset_page = alloc_page(GFP_KERNEL | __GFP_ZERO);
+ if (!hr_dev->reset_page)
+ return -ENOMEM;
+
+ hr_dev->reset_kaddr = vmap(&hr_dev->reset_page, 1, VM_MAP, PAGE_KERNEL);
+ if (!hr_dev->reset_kaddr)
+ goto err_with_vmap;
+
+ state = hr_dev->reset_kaddr;
+ state->hw_ready = 1;
+
+ return 0;
+
+err_with_vmap:
+ put_page(hr_dev->reset_page);
+ return -ENOMEM;
+}
+
+static void hns_roce_v2_put_reset_page(struct hns_roce_dev *hr_dev)
+{
+ vunmap(hr_dev->reset_kaddr);
+ hr_dev->reset_kaddr = NULL;
+ put_page(hr_dev->reset_page);
+ hr_dev->reset_page = NULL;
+}
+
static int get_hem_table(struct hns_roce_dev *hr_dev)
{
unsigned int qpc_count;
@@ -2944,14 +2975,21 @@ static int hns_roce_v2_init(struct hns_roce_dev *hr_dev)
{
int ret;
+ ret = hns_roce_v2_get_reset_page(hr_dev);
+ if (ret) {
+ dev_err(hr_dev->dev,
+ "reset state init failed, ret = %d.\n", ret);
+ return ret;
+ }
+
/* The hns ROCEE requires the extdb info to be cleared before using */
ret = hns_roce_clear_extdb_list_info(hr_dev);
if (ret)
- return ret;
+ goto err_clear_extdb_failed;
ret = get_hem_table(hr_dev);
if (ret)
- return ret;
+ goto err_clear_extdb_failed;
if (hr_dev->is_vf)
return 0;
@@ -2967,6 +3005,9 @@ static int hns_roce_v2_init(struct hns_roce_dev *hr_dev)
err_llm_init_failed:
put_hem_table(hr_dev);
+err_clear_extdb_failed:
+ hns_roce_v2_put_reset_page(hr_dev);
+
return ret;
}
@@ -2980,6 +3021,8 @@ static void hns_roce_v2_exit(struct hns_roce_dev *hr_dev)
if (!hr_dev->is_vf)
hns_roce_free_link_table(hr_dev);
+ hns_roce_v2_put_reset_page(hr_dev);
+
if (hr_dev->pci_dev->revision == PCI_REVISION_ID_HIP09)
free_dip_list(hr_dev);
}
diff --git a/drivers/infiniband/hw/hns/hns_roce_main.c b/drivers/infiniband/hw/hns/hns_roce_main.c
index 49315f39361d..1620d4318480 100644
--- a/drivers/infiniband/hw/hns/hns_roce_main.c
+++ b/drivers/infiniband/hw/hns/hns_roce_main.c
@@ -324,6 +324,7 @@ hns_roce_user_mmap_entry_insert(struct ib_ucontext *ucontext, u64 address,
ucontext, &entry->rdma_entry, length, 0);
break;
case HNS_ROCE_MMAP_TYPE_DWQE:
+ case HNS_ROCE_MMAP_TYPE_RESET:
ret = rdma_user_mmap_entry_insert_range(
ucontext, &entry->rdma_entry, length, 1,
U32_MAX);
@@ -341,6 +342,20 @@ hns_roce_user_mmap_entry_insert(struct ib_ucontext *ucontext, u64 address,
return entry;
}
+static int hns_roce_alloc_reset_entry(struct ib_ucontext *uctx)
+{
+ struct hns_roce_ucontext *context = to_hr_ucontext(uctx);
+ struct hns_roce_dev *hr_dev = to_hr_dev(uctx->device);
+
+ context->reset_mmap_entry = hns_roce_user_mmap_entry_insert(
+ uctx, (u64)page_to_phys(hr_dev->reset_page), PAGE_SIZE,
+ HNS_ROCE_MMAP_TYPE_RESET);
+ if (!context->reset_mmap_entry)
+ return -ENOMEM;
+
+ return 0;
+}
+
static void hns_roce_dealloc_uar_entry(struct hns_roce_ucontext *context)
{
if (context->db_mmap_entry)
@@ -369,6 +384,7 @@ static int hns_roce_alloc_ucontext(struct ib_ucontext *uctx,
struct hns_roce_dev *hr_dev = to_hr_dev(uctx->device);
struct hns_roce_ib_alloc_ucontext_resp resp = {};
struct hns_roce_ib_alloc_ucontext ucmd = {};
+ struct rdma_user_mmap_entry *rdma_entry;
int ret = -EAGAIN;
if (!hr_dev->active)
@@ -421,6 +437,13 @@ static int hns_roce_alloc_ucontext(struct ib_ucontext *uctx,
resp.cqe_size = hr_dev->caps.cqe_sz;
+ ret = hns_roce_alloc_reset_entry(uctx);
+ if (ret)
+ goto error_fail_reset_entry;
+
+ rdma_entry = &context->reset_mmap_entry->rdma_entry;
+ resp.reset_mmap_key = rdma_user_mmap_get_offset(rdma_entry);
+
ret = ib_copy_to_udata(udata, &resp,
min(udata->outlen, sizeof(resp)));
if (ret)
@@ -429,6 +452,9 @@ static int hns_roce_alloc_ucontext(struct ib_ucontext *uctx,
return 0;
error_fail_copy_to_udata:
+ rdma_user_mmap_entry_remove(&context->reset_mmap_entry->rdma_entry);
+
+error_fail_reset_entry:
if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_CQ_RECORD_DB ||
hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_QP_RECORD_DB)
mutex_destroy(&context->page_mutex);
@@ -448,6 +474,8 @@ static void hns_roce_dealloc_ucontext(struct ib_ucontext *ibcontext)
struct hns_roce_ucontext *context = to_hr_ucontext(ibcontext);
struct hns_roce_dev *hr_dev = to_hr_dev(ibcontext->device);
+ rdma_user_mmap_entry_remove(&context->reset_mmap_entry->rdma_entry);
+
if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_CQ_RECORD_DB ||
hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_QP_RECORD_DB)
mutex_destroy(&context->page_mutex);
@@ -485,6 +513,14 @@ static int hns_roce_mmap(struct ib_ucontext *uctx, struct vm_area_struct *vma)
case HNS_ROCE_MMAP_TYPE_DWQE:
prot = pgprot_device(vma->vm_page_prot);
break;
+ case HNS_ROCE_MMAP_TYPE_RESET:
+ if (vma->vm_flags & (VM_WRITE | VM_EXEC)) {
+ ret = -EINVAL;
+ goto out;
+ }
+ vm_flags_set(vma, VM_DONTEXPAND);
+ prot = vma->vm_page_prot;
+ break;
default:
ret = -EINVAL;
goto out;
diff --git a/include/uapi/rdma/hns-abi.h b/include/uapi/rdma/hns-abi.h
index 94e861870e27..065eb2e0a690 100644
--- a/include/uapi/rdma/hns-abi.h
+++ b/include/uapi/rdma/hns-abi.h
@@ -136,6 +136,7 @@ struct hns_roce_ib_alloc_ucontext_resp {
__u32 max_inline_data;
__u8 congest_type;
__u8 reserved0[7];
+ __aligned_u64 reset_mmap_key;
};
struct hns_roce_ib_alloc_ucontext {
@@ -153,4 +154,9 @@ struct hns_roce_ib_create_ah_resp {
__u8 tc_mode;
};
+struct hns_roce_reset_state {
+ __u32 hw_ready;
+ __u32 reserved;
+};
+
#endif /* HNS_ABI_USER_H */
--
2.33.0
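
For illustration, a minimal sketch of the userspace side (not part of this
patch; everything except the uapi struct is an assumption about how a
provider might consume it): the reset page is mmapped read-only at the
offset given by resp.reset_mmap_key, and hw_ready reading back as zero
means the HW was reset and the page was zapped.

#include <stdint.h>
#include <sys/mman.h>

/* Mirrors struct hns_roce_reset_state from include/uapi/rdma/hns-abi.h. */
struct hns_roce_reset_state {
	uint32_t hw_ready;
	uint32_t reserved;
};

/*
 * cmd_fd is the ibv_context command FD and key is the reset_mmap_key
 * returned by alloc_ucontext. The mapping must be read-only: the
 * kernel side above rejects VM_WRITE and VM_EXEC for this mmap type.
 */
static struct hns_roce_reset_state *
hns_map_reset_state(int cmd_fd, uint64_t key, size_t page_size)
{
	void *addr = mmap(NULL, page_size, PROT_READ, MAP_SHARED,
			  cmd_fd, key);

	return addr == MAP_FAILED ? NULL : addr;
}

/* hw_ready is initialized to 1 by the driver; after a reset the zapped
 * page refaults as a zero page, so this returns 0. */
static int hns_hw_is_ready(const struct hns_roce_reset_state *state)
{
	return state->hw_ready != 0;
}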
* Re: [PATCH for-next] RDMA/hns: Support mmapping reset state to userspace
2024-10-14 13:07 [PATCH for-next] RDMA/hns: Support mmapping reset state to userspace Junxian Huang
@ 2024-11-04 8:31 ` Leon Romanovsky
2024-12-09 19:01 ` Jason Gunthorpe
1 sibling, 0 replies; 8+ messages in thread
From: Leon Romanovsky @ 2024-11-04 8:31 UTC (permalink / raw)
To: Junxian Huang; +Cc: jgg, linux-rdma, linuxarm, linux-kernel, tangchengchang
On Mon, Oct 14, 2024 at 09:07:31PM +0800, Junxian Huang wrote:
> From: Chengchang Tang <tangchengchang@huawei.com>
>
> Mmap the reset state to notify userspace about HW resets. The mmapped
> flag hw_ready is initialized to a non-zero value. When the HW is reset,
> the mmapped page is zapped and userspace reads hw_ready back as zero.
I haven't forgotten this patch, but I'm not applying it for now as it seems
extremely sketchy to me, so I'm waiting for others to weigh in with their
opinions too.
Thanks
> [...]
* Re: [PATCH for-next] RDMA/hns: Support mmapping reset state to userspace
2024-10-14 13:07 [PATCH for-next] RDMA/hns: Support mmapping reset state to userspace Junxian Huang
2024-11-04 8:31 ` Leon Romanovsky
@ 2024-12-09 19:01 ` Jason Gunthorpe
2024-12-10 6:24 ` Junxian Huang
1 sibling, 1 reply; 8+ messages in thread
From: Jason Gunthorpe @ 2024-12-09 19:01 UTC (permalink / raw)
To: Junxian Huang; +Cc: leon, linux-rdma, linuxarm, linux-kernel, tangchengchang
On Mon, Oct 14, 2024 at 09:07:31PM +0800, Junxian Huang wrote:
> From: Chengchang Tang <tangchengchang@huawei.com>
>
> Mmap the reset state to notify userspace about HW resets. The mmapped
> flag hw_ready is initialized to a non-zero value. When the HW is reset,
> the mmapped page is zapped and userspace reads hw_ready back as zero.
This needs a lot more explanation about *why* userspace needs this
information and why hns is unique here.
Usually when the HW is reset, enough existing system calls start
failing that a driver should not need to do something like this.
Jason
* Re: [PATCH for-next] RDMA/hns: Support mmapping reset state to userspace
2024-12-09 19:01 ` Jason Gunthorpe
@ 2024-12-10 6:24 ` Junxian Huang
2024-12-10 13:48 ` Jason Gunthorpe
0 siblings, 1 reply; 8+ messages in thread
From: Junxian Huang @ 2024-12-10 6:24 UTC (permalink / raw)
To: Jason Gunthorpe; +Cc: leon, linux-rdma, linuxarm, linux-kernel, tangchengchang
On 2024/12/10 3:01, Jason Gunthorpe wrote:
> On Mon, Oct 14, 2024 at 09:07:31PM +0800, Junxian Huang wrote:
>> From: Chengchang Tang <tangchengchang@huawei.com>
>>
>> Mmap the reset state to notify userspace about HW resets. The mmapped
>> flag hw_ready is initialized to a non-zero value. When the HW is reset,
>> the mmapped page is zapped and userspace reads hw_ready back as zero.
>
> This needs a lot more explanation about *why* userspace needs this
> information and why hns is unique here.
>
Our HW cannot flush WQEs by itself unless the driver posts a modify-QP-to-error
mailbox command. But when the HW is reset, it stops handling mailbox commands
too, so the HW becomes unable to produce any more CQEs for the existing WQEs.
This breaks some users' expectation that they can always poll as many CQEs as
the number of posted WQEs.
We notify userspace of the reset state so that software WCs for the existing
WQEs can be generated in userspace instead of by the HW, which is in reset.
This is what this rdma-core PR does:
https://github.com/linux-rdma/rdma-core/pull/1504
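
To make the idea concrete, a hedged sketch of the software flush (the
structure and field names below are made up for illustration; the real
bookkeeping lives in the PR above): once hw_ready reads as zero, poll_cq
stops touching the HW CQ and synthesizes flush-in-error completions for
the WQEs still outstanding.

#include <stdint.h>
#include <infiniband/verbs.h>

/* Hypothetical per-QP bookkeeping a provider would already keep. */
struct hns_sw_qp {
	uint32_t qpn;
	uint32_t sq_head;	/* index of the next WQE to post */
	uint32_t sq_tail;	/* index of the next WQE to complete */
	uint32_t sq_depth;	/* power-of-two SQ depth */
	uint64_t *sq_wrid;	/* wr_id saved per SQ slot at post time */
};

/* Drain outstanding SQ WQEs as software flush-in-error completions. */
static int hns_sw_flush_poll(struct hns_sw_qp *qp, int nwc, struct ibv_wc *wc)
{
	int n = 0;

	while (n < nwc && qp->sq_tail != qp->sq_head) {
		wc[n].wr_id = qp->sq_wrid[qp->sq_tail & (qp->sq_depth - 1)];
		wc[n].status = IBV_WC_WR_FLUSH_ERR;
		wc[n].qp_num = qp->qpn;
		qp->sq_tail++;
		n++;
	}
	return n;
}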
Junxian
> Usually when the HW is reset, enough existing system calls start
> failing that a driver should not need to do something like this.
>
> Jason
* Re: [PATCH for-next] RDMA/hns: Support mmapping reset state to userspace
2024-12-10 6:24 ` Junxian Huang
@ 2024-12-10 13:48 ` Jason Gunthorpe
2024-12-13 9:37 ` Junxian Huang
0 siblings, 1 reply; 8+ messages in thread
From: Jason Gunthorpe @ 2024-12-10 13:48 UTC (permalink / raw)
To: Junxian Huang; +Cc: leon, linux-rdma, linuxarm, linux-kernel, tangchengchang
On Tue, Dec 10, 2024 at 02:24:16PM +0800, Junxian Huang wrote:
>
>
> On 2024/12/10 3:01, Jason Gunthorpe wrote:
> > On Mon, Oct 14, 2024 at 09:07:31PM +0800, Junxian Huang wrote:
> >> From: Chengchang Tang <tangchengchang@huawei.com>
> >>
> >> Mmap the reset state to notify userspace about HW resets. The mmapped
> >> flag hw_ready is initialized to a non-zero value. When the HW is reset,
> >> the mmapped page is zapped and userspace reads hw_ready back as zero.
> >
> > This needs a lot more explanation about *why* userspace needs this
> > information and why hns is unique here.
> >
>
> Our HW cannot flush WQEs by itself unless the driver posts a modify-QP-to-error
> mailbox command. But when the HW is reset, it stops handling mailbox commands
> too, so the HW becomes unable to produce any more CQEs for the existing WQEs.
> This breaks some users' expectation that they can always poll as many CQEs as
> the number of posted WQEs.
But your reset flow partially disassociates the device. When userspace
goes back to sleep or rearms the CQ, it should get a hard failure and
do a full cleanup without relying on flushing.
> We notify userspace of the reset state so that software WCs for the existing
> WQEs can be generated in userspace instead of by the HW, which is in reset.
> This is what this rdma-core PR does:
That doesn't sound right at all. Device disassociation is a hard failure;
we don't try to do elegant things like generating completions. The device
is dead and the queues are gone.
Jason
* Re: [PATCH for-next] RDMA/hns: Support mmapping reset state to userspace
2024-12-10 13:48 ` Jason Gunthorpe
@ 2024-12-13 9:37 ` Junxian Huang
2024-12-13 12:49 ` Jason Gunthorpe
0 siblings, 1 reply; 8+ messages in thread
From: Junxian Huang @ 2024-12-13 9:37 UTC (permalink / raw)
To: Jason Gunthorpe; +Cc: leon, linux-rdma, linuxarm, linux-kernel, tangchengchang
On 2024/12/10 21:48, Jason Gunthorpe wrote:
> On Tue, Dec 10, 2024 at 02:24:16PM +0800, Junxian Huang wrote:
>>
>>
>> On 2024/12/10 3:01, Jason Gunthorpe wrote:
>>> On Mon, Oct 14, 2024 at 09:07:31PM +0800, Junxian Huang wrote:
>>>> From: Chengchang Tang <tangchengchang@huawei.com>
>>>>
>>>> Mmap the reset state to notify userspace about HW resets. The mmapped
>>>> flag hw_ready is initialized to a non-zero value. When the HW is reset,
>>>> the mmapped page is zapped and userspace reads hw_ready back as zero.
>>>
>>> This needs a lot more explanation about *why* userspace needs this
>>> information and why hns is unique here.
>>>
>>
>> Our HW cannot flush WQEs by itself unless the driver posts a modify-QP-to-error
>> mailbox command. But when the HW is reset, it stops handling mailbox commands
>> too, so the HW becomes unable to produce any more CQEs for the existing WQEs.
>> This breaks some users' expectation that they can always poll as many CQEs as
>> the number of posted WQEs.
>
> But your reset flow partially disassociates the device. When userspace
> goes back to sleep or rearms the CQ, it should get a hard failure and
> do a full cleanup without relying on flushing.
>
I'm not sure I got your point. When you said "userspace goes back to sleep",
did you mean the ibv_get_async_event() API? Are you suggesting that userspace
should call ibv_get_async_event() to monitor async events, and when it gets a
fatal event, stop polling CQs and clean up everything instead of waiting for
the remaining CQEs?
Thanks,
Junxian
>> We notify userspace of the reset state so that software WCs for the existing
>> WQEs can be generated in userspace instead of by the HW, which is in reset.
>> This is what this rdma-core PR does:
>
> That doesn't sound right at all. Device disassociation is a hard failure;
> we don't try to do elegant things like generating completions. The device
> is dead and the queues are gone.
>
> Jason
>
* Re: [PATCH for-next] RDMA/hns: Support mmapping reset state to userspace
2024-12-13 9:37 ` Junxian Huang
@ 2024-12-13 12:49 ` Jason Gunthorpe
2024-12-17 6:09 ` Junxian Huang
0 siblings, 1 reply; 8+ messages in thread
From: Jason Gunthorpe @ 2024-12-13 12:49 UTC (permalink / raw)
To: Junxian Huang; +Cc: leon, linux-rdma, linuxarm, linux-kernel, tangchengchang
On Fri, Dec 13, 2024 at 05:37:58PM +0800, Junxian Huang wrote:
> > But your reset flow partially disassociates the device. When userspace
> > goes back to sleep or rearms the CQ, it should get a hard failure and
> > do a full cleanup without relying on flushing.
>
> I'm not sure I got your point. When you said "userspace goes back to sleep",
> did you mean the ibv_get_async_event() API? Are you suggesting that userspace
> should call ibv_get_async_event() to monitor async events, and when it gets a
> fatal event, stop polling CQs and clean up everything instead of waiting for
> the remaining CQEs?
Yes, it should do that as well. This is what the device fatal event is
for.

I'm also saying that any kernel system calls, like sleeping for CQ
events, should start failing too.
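
For reference, the userspace pattern being described uses only standard
libibverbs calls; a minimal sketch (cleanup details are application-specific
and elided here):

#include <infiniband/verbs.h>

static void watch_async_events(struct ibv_context *ctx)
{
	struct ibv_async_event ev;

	/* ibv_get_async_event() blocks until the next async event. */
	while (ibv_get_async_event(ctx, &ev) == 0) {
		int fatal = (ev.event_type == IBV_EVENT_DEVICE_FATAL);

		/* Events must be acked before their objects are destroyed. */
		ibv_ack_async_event(&ev);
		if (fatal) {
			/* Stop polling CQs and tear down QPs/CQs/PDs;
			 * do not wait for the remaining CQEs. */
			break;
		}
	}
}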
Jason
* Re: [PATCH for-next] RDMA/hns: Support mmapping reset state to userspace
2024-12-13 12:49 ` Jason Gunthorpe
@ 2024-12-17 6:09 ` Junxian Huang
0 siblings, 0 replies; 8+ messages in thread
From: Junxian Huang @ 2024-12-17 6:09 UTC (permalink / raw)
To: Jason Gunthorpe; +Cc: leon, linux-rdma, linuxarm, linux-kernel, tangchengchang
On 2024/12/13 20:49, Jason Gunthorpe wrote:
> On Fri, Dec 13, 2024 at 05:37:58PM +0800, Junxian Huang wrote:
>>> But your reset flow partially disassociates the device. When userspace
>>> goes back to sleep or rearms the CQ, it should get a hard failure and
>>> do a full cleanup without relying on flushing.
>>
>> I'm not sure I got your point. When you said "userspace goes back to sleep",
>> did you mean the ibv_get_async_event() API? Are you suggesting that userspace
>> should call ibv_get_async_event() to monitor async events, and when it gets a
>> fatal event, stop polling CQs and clean up everything instead of waiting for
>> the remaining CQEs?
>
> Yes, it should do that as well. This is what the device fatal event is
> for.
>
> I'm also saying that any kernel system calls, like sleeping for CQ
> events, should start failing too.
>
> Jason
Thanks. I took a cursory look at some open-source userspace projects:
UCX and SPDK handle the device fatal event properly by doing cleanup,
but Ceph doesn't seem to have any special handling beyond logging.
Junxian