linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH net] net/smc: Fix possible access to freed memory in link clear
@ 2022-08-29 14:54 liuyacan
  2022-08-29 16:38 ` Tony Lu
                   ` (2 more replies)
  0 siblings, 3 replies; 22+ messages in thread
From: liuyacan @ 2022-08-29 14:54 UTC (permalink / raw)
  To: kgraul, davem, wenjia, edumazet, kuba, pabeni
  Cc: linux-s390, netdev, linux-kernel, liuyacan

From: liuyacan <liuyacan@corp.netease.com>

After modifying the QP to the Error state, all RX WR would be
completed with WC in IB_WC_WR_FLUSH_ERR status. Current
implementation does not wait for it is done, but free the link
directly. So there is a risk that accessing the freed link in
tasklet context.

Here is a crash example:

 BUG: unable to handle page fault for address: ffffffff8f220860
 #PF: supervisor write access in kernel mode
 #PF: error_code(0x0002) - not-present page
 PGD f7300e067 P4D f7300e067 PUD f7300f063 PMD 8c4e45063 PTE 800ffff08c9df060
 Oops: 0002 [#1] SMP PTI
 CPU: 1 PID: 0 Comm: swapper/1 Kdump: loaded Tainted: G S         OE     5.10.0-0607+ #23
 Hardware name: Inspur NF5280M4/YZMB-00689-101, BIOS 4.1.20 07/09/2018
 RIP: 0010:native_queued_spin_lock_slowpath+0x176/0x1b0
 Code: f3 90 48 8b 32 48 85 f6 74 f6 eb d5 c1 ee 12 83 e0 03 83 ee 01 48 c1 e0 05 48 63 f6 48 05 00 c8 02 00 48 03 04 f5 00 09 98 8e <48> 89 10 8b 42 08 85 c0 75 09 f3 90 8b 42 08 85 c0 74 f7 48 8b 32
 RSP: 0018:ffffb3b6c001ebd8 EFLAGS: 00010086
 RAX: ffffffff8f220860 RBX: 0000000000000246 RCX: 0000000000080000
 RDX: ffff91db1f86c800 RSI: 000000000000173c RDI: ffff91db62bace00
 RBP: ffff91db62bacc00 R08: 0000000000000000 R09: c00000010000028b
 R10: 0000000000055198 R11: ffffb3b6c001ea58 R12: ffff91db80e05010
 R13: 000000000000000a R14: 0000000000000006 R15: 0000000000000040
 FS:  0000000000000000(0000) GS:ffff91db1f840000(0000) knlGS:0000000000000000
 CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
 CR2: ffffffff8f220860 CR3: 00000001f9580004 CR4: 00000000003706e0
 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
 Call Trace:
  <IRQ>
  _raw_spin_lock_irqsave+0x30/0x40
  mlx5_ib_poll_cq+0x4c/0xc50 [mlx5_ib]
  smc_wr_rx_tasklet_fn+0x56/0xa0 [smc]
  tasklet_action_common.isra.21+0x66/0x100
  __do_softirq+0xd5/0x29c
  asm_call_irq_on_stack+0x12/0x20
  </IRQ>
  do_softirq_own_stack+0x37/0x40
  irq_exit_rcu+0x9d/0xa0
  sysvec_call_function_single+0x34/0x80
  asm_sysvec_call_function_single+0x12/0x20

Signed-off-by: liuyacan <liuyacan@corp.netease.com>
---
 net/smc/smc_core.c |  2 ++
 net/smc/smc_core.h |  2 ++
 net/smc/smc_wr.c   | 12 ++++++++++++
 net/smc/smc_wr.h   |  3 +++
 4 files changed, 19 insertions(+)

diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c
index ff49a11f5..b632a33f1 100644
--- a/net/smc/smc_core.c
+++ b/net/smc/smc_core.c
@@ -752,6 +752,7 @@ int smcr_link_init(struct smc_link_group *lgr, struct smc_link *lnk,
 	atomic_inc(&lnk->smcibdev->lnk_cnt);
 	refcount_set(&lnk->refcnt, 1); /* link refcnt is set to 1 */
 	lnk->clearing = 0;
+	lnk->rx_drained = 0;
 	lnk->path_mtu = lnk->smcibdev->pattr[lnk->ibport - 1].active_mtu;
 	lnk->link_id = smcr_next_link_id(lgr);
 	lnk->lgr = lgr;
@@ -1269,6 +1270,7 @@ void smcr_link_clear(struct smc_link *lnk, bool log)
 	smcr_buf_unmap_lgr(lnk);
 	smcr_rtoken_clear_link(lnk);
 	smc_ib_modify_qp_error(lnk);
+	smc_wr_drain_cq(lnk);
 	smc_wr_free_link(lnk);
 	smc_ib_destroy_queue_pair(lnk);
 	smc_ib_dealloc_protection_domain(lnk);
diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h
index fe8b524ad..0a469a3e7 100644
--- a/net/smc/smc_core.h
+++ b/net/smc/smc_core.h
@@ -117,6 +117,7 @@ struct smc_link {
 	u64			wr_rx_id;	/* seq # of last recv WR */
 	u32			wr_rx_cnt;	/* number of WR recv buffers */
 	unsigned long		wr_rx_tstamp;	/* jiffies when last buf rx */
+	wait_queue_head_t       wr_rx_drain_wait; /* wait for WR drain */
 
 	struct ib_reg_wr	wr_reg;		/* WR register memory region */
 	wait_queue_head_t	wr_reg_wait;	/* wait for wr_reg result */
@@ -138,6 +139,7 @@ struct smc_link {
 	u8			link_idx;	/* index in lgr link array */
 	u8			link_is_asym;	/* is link asymmetric? */
 	u8			clearing : 1;	/* link is being cleared */
+	u8                      rx_drained : 1; /* link is drained */
 	refcount_t		refcnt;		/* link reference count */
 	struct smc_link_group	*lgr;		/* parent link group */
 	struct work_struct	link_down_wrk;	/* wrk to bring link down */
diff --git a/net/smc/smc_wr.c b/net/smc/smc_wr.c
index 26f8f240d..f9992896a 100644
--- a/net/smc/smc_wr.c
+++ b/net/smc/smc_wr.c
@@ -465,6 +465,10 @@ static inline void smc_wr_rx_process_cqes(struct ib_wc wc[], int num)
 			case IB_WC_RNR_RETRY_EXC_ERR:
 			case IB_WC_WR_FLUSH_ERR:
 				smcr_link_down_cond_sched(link);
+				if (link->clearing && wc[i]->wr_id == link->wr_rx_id) {
+					link->rx_drained = 1;
+					wake_up(&link->wr_rx_drain_wait);
+				}
 				break;
 			default:
 				smc_wr_rx_post(link); /* refill WR RX */
@@ -631,6 +635,13 @@ static void smc_wr_init_sge(struct smc_link *lnk)
 	lnk->wr_reg.access = IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_WRITE;
 }
 
+void smc_wr_drain_cq(struct smc_link *lnk)
+{
+	wait_event_interruptible_timeout(lnk->wr_rx_drain_wait,
+					 (lnk->drained == 1),
+					 SMC_WR_RX_WAIT_DRAIN_TIME);
+}
+
 void smc_wr_free_link(struct smc_link *lnk)
 {
 	struct ib_device *ibdev;
@@ -889,6 +900,7 @@ int smc_wr_create_link(struct smc_link *lnk)
 	atomic_set(&lnk->wr_tx_refcnt, 0);
 	init_waitqueue_head(&lnk->wr_reg_wait);
 	atomic_set(&lnk->wr_reg_refcnt, 0);
+	init_waitqueue_head(&lnk->wr_rx_drain_wait);
 	return rc;
 
 dma_unmap:
diff --git a/net/smc/smc_wr.h b/net/smc/smc_wr.h
index a54e90a11..2a7ebdba3 100644
--- a/net/smc/smc_wr.h
+++ b/net/smc/smc_wr.h
@@ -27,6 +27,8 @@
 
 #define SMC_WR_TX_PEND_PRIV_SIZE 32
 
+#define SMC_WR_RX_WAIT_DRAIN_TIME       (2 * HZ)
+
 struct smc_wr_tx_pend_priv {
 	u8			priv[SMC_WR_TX_PEND_PRIV_SIZE];
 };
@@ -101,6 +103,7 @@ static inline int smc_wr_rx_post(struct smc_link *link)
 int smc_wr_create_link(struct smc_link *lnk);
 int smc_wr_alloc_link_mem(struct smc_link *lnk);
 int smc_wr_alloc_lgr_mem(struct smc_link_group *lgr);
+void smc_wr_drain_cq(struct smc_link *lnk);
 void smc_wr_free_link(struct smc_link *lnk);
 void smc_wr_free_link_mem(struct smc_link *lnk);
 void smc_wr_free_lgr_mem(struct smc_link_group *lgr);
-- 
2.20.1


^ permalink raw reply related	[flat|nested] 22+ messages in thread

* Re: [PATCH net] net/smc: Fix possible access to freed memory in link clear
  2022-08-29 14:54 [PATCH net] net/smc: Fix possible access to freed memory in link clear liuyacan
@ 2022-08-29 16:38 ` Tony Lu
  2022-08-30  3:05   ` [PATCH net v2] " liuyacan
  2022-08-30 14:31 ` [PATCH net] net/smc: Fix possible access to freed memory in link clear kernel test robot
  2022-08-30 16:10 ` kernel test robot
  2 siblings, 1 reply; 22+ messages in thread
From: Tony Lu @ 2022-08-29 16:38 UTC (permalink / raw)
  To: liuyacan
  Cc: kgraul, davem, wenjia, edumazet, kuba, pabeni, linux-s390, netdev,
	linux-kernel

On Mon, Aug 29, 2022 at 10:54:35PM +0800, liuyacan@corp.netease.com wrote:
> From: liuyacan <liuyacan@corp.netease.com>
> 
> After modifying the QP to the Error state, all RX WR would be
> completed with WC in IB_WC_WR_FLUSH_ERR status. Current
> implementation does not wait for it is done, but free the link
> directly. So there is a risk that accessing the freed link in
> tasklet context.
> 
> Here is a crash example:
> 
>  BUG: unable to handle page fault for address: ffffffff8f220860
>  #PF: supervisor write access in kernel mode
>  #PF: error_code(0x0002) - not-present page
>  PGD f7300e067 P4D f7300e067 PUD f7300f063 PMD 8c4e45063 PTE 800ffff08c9df060
>  Oops: 0002 [#1] SMP PTI
>  CPU: 1 PID: 0 Comm: swapper/1 Kdump: loaded Tainted: G S         OE     5.10.0-0607+ #23
>  Hardware name: Inspur NF5280M4/YZMB-00689-101, BIOS 4.1.20 07/09/2018
>  RIP: 0010:native_queued_spin_lock_slowpath+0x176/0x1b0
>  Code: f3 90 48 8b 32 48 85 f6 74 f6 eb d5 c1 ee 12 83 e0 03 83 ee 01 48 c1 e0 05 48 63 f6 48 05 00 c8 02 00 48 03 04 f5 00 09 98 8e <48> 89 10 8b 42 08 85 c0 75 09 f3 90 8b 42 08 85 c0 74 f7 48 8b 32
>  RSP: 0018:ffffb3b6c001ebd8 EFLAGS: 00010086
>  RAX: ffffffff8f220860 RBX: 0000000000000246 RCX: 0000000000080000
>  RDX: ffff91db1f86c800 RSI: 000000000000173c RDI: ffff91db62bace00
>  RBP: ffff91db62bacc00 R08: 0000000000000000 R09: c00000010000028b
>  R10: 0000000000055198 R11: ffffb3b6c001ea58 R12: ffff91db80e05010
>  R13: 000000000000000a R14: 0000000000000006 R15: 0000000000000040
>  FS:  0000000000000000(0000) GS:ffff91db1f840000(0000) knlGS:0000000000000000
>  CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
>  CR2: ffffffff8f220860 CR3: 00000001f9580004 CR4: 00000000003706e0
>  DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
>  DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
>  Call Trace:
>   <IRQ>
>   _raw_spin_lock_irqsave+0x30/0x40
>   mlx5_ib_poll_cq+0x4c/0xc50 [mlx5_ib]
>   smc_wr_rx_tasklet_fn+0x56/0xa0 [smc]
>   tasklet_action_common.isra.21+0x66/0x100
>   __do_softirq+0xd5/0x29c
>   asm_call_irq_on_stack+0x12/0x20
>   </IRQ>
>   do_softirq_own_stack+0x37/0x40
>   irq_exit_rcu+0x9d/0xa0
>   sysvec_call_function_single+0x34/0x80
>   asm_sysvec_call_function_single+0x12/0x20
> 
> Signed-off-by: liuyacan <liuyacan@corp.netease.com>
> ---
>  net/smc/smc_core.c |  2 ++
>  net/smc/smc_core.h |  2 ++
>  net/smc/smc_wr.c   | 12 ++++++++++++
>  net/smc/smc_wr.h   |  3 +++
>  4 files changed, 19 insertions(+)
> 
> diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c
> index ff49a11f5..b632a33f1 100644
> --- a/net/smc/smc_core.c
> +++ b/net/smc/smc_core.c
> @@ -752,6 +752,7 @@ int smcr_link_init(struct smc_link_group *lgr, struct smc_link *lnk,
>  	atomic_inc(&lnk->smcibdev->lnk_cnt);
>  	refcount_set(&lnk->refcnt, 1); /* link refcnt is set to 1 */
>  	lnk->clearing = 0;
> +	lnk->rx_drained = 0;
>  	lnk->path_mtu = lnk->smcibdev->pattr[lnk->ibport - 1].active_mtu;
>  	lnk->link_id = smcr_next_link_id(lgr);
>  	lnk->lgr = lgr;
> @@ -1269,6 +1270,7 @@ void smcr_link_clear(struct smc_link *lnk, bool log)
>  	smcr_buf_unmap_lgr(lnk);
>  	smcr_rtoken_clear_link(lnk);
>  	smc_ib_modify_qp_error(lnk);
> +	smc_wr_drain_cq(lnk);
>  	smc_wr_free_link(lnk);
>  	smc_ib_destroy_queue_pair(lnk);
>  	smc_ib_dealloc_protection_domain(lnk);
> diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h
> index fe8b524ad..0a469a3e7 100644
> --- a/net/smc/smc_core.h
> +++ b/net/smc/smc_core.h
> @@ -117,6 +117,7 @@ struct smc_link {
>  	u64			wr_rx_id;	/* seq # of last recv WR */
>  	u32			wr_rx_cnt;	/* number of WR recv buffers */
>  	unsigned long		wr_rx_tstamp;	/* jiffies when last buf rx */
> +	wait_queue_head_t       wr_rx_drain_wait; /* wait for WR drain */
>  
>  	struct ib_reg_wr	wr_reg;		/* WR register memory region */
>  	wait_queue_head_t	wr_reg_wait;	/* wait for wr_reg result */
> @@ -138,6 +139,7 @@ struct smc_link {
>  	u8			link_idx;	/* index in lgr link array */
>  	u8			link_is_asym;	/* is link asymmetric? */
>  	u8			clearing : 1;	/* link is being cleared */
> +	u8                      rx_drained : 1; /* link is drained */
>  	refcount_t		refcnt;		/* link reference count */
>  	struct smc_link_group	*lgr;		/* parent link group */
>  	struct work_struct	link_down_wrk;	/* wrk to bring link down */
> diff --git a/net/smc/smc_wr.c b/net/smc/smc_wr.c
> index 26f8f240d..f9992896a 100644
> --- a/net/smc/smc_wr.c
> +++ b/net/smc/smc_wr.c
> @@ -465,6 +465,10 @@ static inline void smc_wr_rx_process_cqes(struct ib_wc wc[], int num)
>  			case IB_WC_RNR_RETRY_EXC_ERR:
>  			case IB_WC_WR_FLUSH_ERR:
>  				smcr_link_down_cond_sched(link);
> +				if (link->clearing && wc[i]->wr_id == link->wr_rx_id) {
> +					link->rx_drained = 1;
> +					wake_up(&link->wr_rx_drain_wait);
> +				}

I am wondering if we should wait for all the wc comes back?

>  				break;
>  			default:
>  				smc_wr_rx_post(link); /* refill WR RX */
> @@ -631,6 +635,13 @@ static void smc_wr_init_sge(struct smc_link *lnk)
>  	lnk->wr_reg.access = IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_WRITE;
>  }
>  
> +void smc_wr_drain_cq(struct smc_link *lnk)
> +{
> +	wait_event_interruptible_timeout(lnk->wr_rx_drain_wait,
> +					 (lnk->drained == 1),
> +					 SMC_WR_RX_WAIT_DRAIN_TIME);
> +}

Should we wait for it with timeout? It should eventually be wake up
normally before freeing link. Waiting for SMC_WR_RX_WAIT_DRAIN_TIME (2s)
may also have this issue, although the probability of occurrence is
greatly reduced.

Cheers,
Tony Lu

> +
>  void smc_wr_free_link(struct smc_link *lnk)
>  {
>  	struct ib_device *ibdev;
> @@ -889,6 +900,7 @@ int smc_wr_create_link(struct smc_link *lnk)
>  	atomic_set(&lnk->wr_tx_refcnt, 0);
>  	init_waitqueue_head(&lnk->wr_reg_wait);
>  	atomic_set(&lnk->wr_reg_refcnt, 0);
> +	init_waitqueue_head(&lnk->wr_rx_drain_wait);
>  	return rc;
>  
>  dma_unmap:
> diff --git a/net/smc/smc_wr.h b/net/smc/smc_wr.h
> index a54e90a11..2a7ebdba3 100644
> --- a/net/smc/smc_wr.h
> +++ b/net/smc/smc_wr.h
> @@ -27,6 +27,8 @@
>  
>  #define SMC_WR_TX_PEND_PRIV_SIZE 32
>  
> +#define SMC_WR_RX_WAIT_DRAIN_TIME       (2 * HZ)
> +
>  struct smc_wr_tx_pend_priv {
>  	u8			priv[SMC_WR_TX_PEND_PRIV_SIZE];
>  };
> @@ -101,6 +103,7 @@ static inline int smc_wr_rx_post(struct smc_link *link)
>  int smc_wr_create_link(struct smc_link *lnk);
>  int smc_wr_alloc_link_mem(struct smc_link *lnk);
>  int smc_wr_alloc_lgr_mem(struct smc_link_group *lgr);
> +void smc_wr_drain_cq(struct smc_link *lnk);
>  void smc_wr_free_link(struct smc_link *lnk);
>  void smc_wr_free_link_mem(struct smc_link *lnk);
>  void smc_wr_free_lgr_mem(struct smc_link_group *lgr);
> -- 
> 2.20.1

^ permalink raw reply	[flat|nested] 22+ messages in thread

* [PATCH net v2] net/smc: Fix possible access to freed memory in link clear
  2022-08-29 16:38 ` Tony Lu
@ 2022-08-30  3:05   ` liuyacan
  2022-08-30  5:58     ` [PATCH net v2] net/smc: fix listen processing for SMC-Rv2 liuyacan
  0 siblings, 1 reply; 22+ messages in thread
From: liuyacan @ 2022-08-30  3:05 UTC (permalink / raw)
  To: tonylu
  Cc: davem, edumazet, kgraul, kuba, linux-kernel, linux-s390, liuyacan,
	netdev, pabeni, wenjia

From: liuyacan <liuyacan@corp.netease.com>

After modifying the QP to the Error state, all RX WR would be
completed with WC in IB_WC_WR_FLUSH_ERR status. Current
implementation does not wait for it is done, but free the link
directly. So there is a risk that accessing the freed link in
tasklet context.

Here is a crash example:

 BUG: unable to handle page fault for address: ffffffff8f220860
 #PF: supervisor write access in kernel mode
 #PF: error_code(0x0002) - not-present page
 PGD f7300e067 P4D f7300e067 PUD f7300f063 PMD 8c4e45063 PTE 800ffff08c9df060
 Oops: 0002 [#1] SMP PTI
 CPU: 1 PID: 0 Comm: swapper/1 Kdump: loaded Tainted: G S         OE     5.10.0-0607+ #23
 Hardware name: Inspur NF5280M4/YZMB-00689-101, BIOS 4.1.20 07/09/2018
 RIP: 0010:native_queued_spin_lock_slowpath+0x176/0x1b0
 Code: f3 90 48 8b 32 48 85 f6 74 f6 eb d5 c1 ee 12 83 e0 03 83 ee 01 48 c1 e0 05 48 63 f6 48 05 00 c8 02 00 48 03 04 f5 00 09 98 8e <48> 89 10 8b 42 08 85 c0 75 09 f3 90 8b 42 08 85 c0 74 f7 48 8b 32
 RSP: 0018:ffffb3b6c001ebd8 EFLAGS: 00010086
 RAX: ffffffff8f220860 RBX: 0000000000000246 RCX: 0000000000080000
 RDX: ffff91db1f86c800 RSI: 000000000000173c RDI: ffff91db62bace00
 RBP: ffff91db62bacc00 R08: 0000000000000000 R09: c00000010000028b
 R10: 0000000000055198 R11: ffffb3b6c001ea58 R12: ffff91db80e05010
 R13: 000000000000000a R14: 0000000000000006 R15: 0000000000000040
 FS:  0000000000000000(0000) GS:ffff91db1f840000(0000) knlGS:0000000000000000
 CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
 CR2: ffffffff8f220860 CR3: 00000001f9580004 CR4: 00000000003706e0
 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
 Call Trace:
  <IRQ>
  _raw_spin_lock_irqsave+0x30/0x40
  mlx5_ib_poll_cq+0x4c/0xc50 [mlx5_ib]
  smc_wr_rx_tasklet_fn+0x56/0xa0 [smc]
  tasklet_action_common.isra.21+0x66/0x100
  __do_softirq+0xd5/0x29c
  asm_call_irq_on_stack+0x12/0x20
  </IRQ>
  do_softirq_own_stack+0x37/0x40
  irq_exit_rcu+0x9d/0xa0
  sysvec_call_function_single+0x34/0x80
  asm_sysvec_call_function_single+0x12/0x20

Signed-off-by: liuyacan <liuyacan@corp.netease.com>
---
 net/smc/smc_core.c |  2 ++
 net/smc/smc_core.h |  2 ++
 net/smc/smc_wr.c   | 12 ++++++++++++
 net/smc/smc_wr.h   |  3 +++
 4 files changed, 19 insertions(+)

diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c
index ff49a11f5..b632a33f1 100644
--- a/net/smc/smc_core.c
+++ b/net/smc/smc_core.c
@@ -752,6 +752,7 @@ int smcr_link_init(struct smc_link_group *lgr, struct smc_link *lnk,
 	atomic_inc(&lnk->smcibdev->lnk_cnt);
 	refcount_set(&lnk->refcnt, 1); /* link refcnt is set to 1 */
 	lnk->clearing = 0;
+	lnk->rx_drained = 0;
 	lnk->path_mtu = lnk->smcibdev->pattr[lnk->ibport - 1].active_mtu;
 	lnk->link_id = smcr_next_link_id(lgr);
 	lnk->lgr = lgr;
@@ -1269,6 +1270,7 @@ void smcr_link_clear(struct smc_link *lnk, bool log)
 	smcr_buf_unmap_lgr(lnk);
 	smcr_rtoken_clear_link(lnk);
 	smc_ib_modify_qp_error(lnk);
+	smc_wr_drain_cq(lnk);
 	smc_wr_free_link(lnk);
 	smc_ib_destroy_queue_pair(lnk);
 	smc_ib_dealloc_protection_domain(lnk);
diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h
index fe8b524ad..0a469a3e7 100644
--- a/net/smc/smc_core.h
+++ b/net/smc/smc_core.h
@@ -117,6 +117,7 @@ struct smc_link {
 	u64			wr_rx_id;	/* seq # of last recv WR */
 	u32			wr_rx_cnt;	/* number of WR recv buffers */
 	unsigned long		wr_rx_tstamp;	/* jiffies when last buf rx */
+	wait_queue_head_t       wr_rx_drain_wait; /* wait for WR drain */
 
 	struct ib_reg_wr	wr_reg;		/* WR register memory region */
 	wait_queue_head_t	wr_reg_wait;	/* wait for wr_reg result */
@@ -138,6 +139,7 @@ struct smc_link {
 	u8			link_idx;	/* index in lgr link array */
 	u8			link_is_asym;	/* is link asymmetric? */
 	u8			clearing : 1;	/* link is being cleared */
+	u8                      rx_drained : 1; /* link is drained */
 	refcount_t		refcnt;		/* link reference count */
 	struct smc_link_group	*lgr;		/* parent link group */
 	struct work_struct	link_down_wrk;	/* wrk to bring link down */
diff --git a/net/smc/smc_wr.c b/net/smc/smc_wr.c
index 26f8f240d..3f782837b 100644
--- a/net/smc/smc_wr.c
+++ b/net/smc/smc_wr.c
@@ -465,6 +465,10 @@ static inline void smc_wr_rx_process_cqes(struct ib_wc wc[], int num)
 			case IB_WC_RNR_RETRY_EXC_ERR:
 			case IB_WC_WR_FLUSH_ERR:
 				smcr_link_down_cond_sched(link);
+				if (link->clearing && wc[i].wr_id == link->wr_rx_id) {
+					link->rx_drained = 1;
+					wake_up(&link->wr_rx_drain_wait);
+				}
 				break;
 			default:
 				smc_wr_rx_post(link); /* refill WR RX */
@@ -631,6 +635,13 @@ static void smc_wr_init_sge(struct smc_link *lnk)
 	lnk->wr_reg.access = IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_WRITE;
 }
 
+void smc_wr_drain_cq(struct smc_link *lnk)
+{
+	wait_event_interruptible_timeout(lnk->wr_rx_drain_wait,
+					 (lnk->rx_drained == 1),
+					 SMC_WR_RX_WAIT_DRAIN_TIME);
+}
+
 void smc_wr_free_link(struct smc_link *lnk)
 {
 	struct ib_device *ibdev;
@@ -889,6 +900,7 @@ int smc_wr_create_link(struct smc_link *lnk)
 	atomic_set(&lnk->wr_tx_refcnt, 0);
 	init_waitqueue_head(&lnk->wr_reg_wait);
 	atomic_set(&lnk->wr_reg_refcnt, 0);
+	init_waitqueue_head(&lnk->wr_rx_drain_wait);
 	return rc;
 
 dma_unmap:
diff --git a/net/smc/smc_wr.h b/net/smc/smc_wr.h
index a54e90a11..2a7ebdba3 100644
--- a/net/smc/smc_wr.h
+++ b/net/smc/smc_wr.h
@@ -27,6 +27,8 @@
 
 #define SMC_WR_TX_PEND_PRIV_SIZE 32
 
+#define SMC_WR_RX_WAIT_DRAIN_TIME       (2 * HZ)
+
 struct smc_wr_tx_pend_priv {
 	u8			priv[SMC_WR_TX_PEND_PRIV_SIZE];
 };
@@ -101,6 +103,7 @@ static inline int smc_wr_rx_post(struct smc_link *link)
 int smc_wr_create_link(struct smc_link *lnk);
 int smc_wr_alloc_link_mem(struct smc_link *lnk);
 int smc_wr_alloc_lgr_mem(struct smc_link_group *lgr);
+void smc_wr_drain_cq(struct smc_link *lnk);
 void smc_wr_free_link(struct smc_link *lnk);
 void smc_wr_free_link_mem(struct smc_link *lnk);
 void smc_wr_free_lgr_mem(struct smc_link_group *lgr);
-- 
2.20.1


^ permalink raw reply related	[flat|nested] 22+ messages in thread

* Re: [PATCH net v2] net/smc: fix listen processing for SMC-Rv2
  2022-08-30  3:05   ` [PATCH net v2] " liuyacan
@ 2022-08-30  5:58     ` liuyacan
  2022-08-30  8:31       ` Alexandra Winter
  2022-08-30 14:15       ` Tony Lu
  0 siblings, 2 replies; 22+ messages in thread
From: liuyacan @ 2022-08-30  5:58 UTC (permalink / raw)
  To: liuyacan
  Cc: davem, edumazet, kgraul, kuba, linux-kernel, linux-s390, netdev,
	pabeni, tonylu, wenjia

> > From: liuyacan <liuyacan@corp.netease.com>
> > 
> > After modifying the QP to the Error state, all RX WR would be
> > completed with WC in IB_WC_WR_FLUSH_ERR status. Current
> > implementation does not wait for it is done, but free the link
> > directly. So there is a risk that accessing the freed link in
> > tasklet context.
> > 
> > Here is a crash example:
> > 
> >  BUG: unable to handle page fault for address: ffffffff8f220860
> >  #PF: supervisor write access in kernel mode
> >  #PF: error_code(0x0002) - not-present page
> >  PGD f7300e067 P4D f7300e067 PUD f7300f063 PMD 8c4e45063 PTE 800ffff08c9df060
> >  Oops: 0002 [#1] SMP PTI
> >  CPU: 1 PID: 0 Comm: swapper/1 Kdump: loaded Tainted: G S         OE     5.10.0-0607+ #23
> >  Hardware name: Inspur NF5280M4/YZMB-00689-101, BIOS 4.1.20 07/09/2018
> >  RIP: 0010:native_queued_spin_lock_slowpath+0x176/0x1b0
> >  Code: f3 90 48 8b 32 48 85 f6 74 f6 eb d5 c1 ee 12 83 e0 03 83 ee 01 48 c1 e0 05 48 63 f6 48 05 00 c8 02 00 48 03 04 f5 00 09 98 8e <48> 89 10 8b 42 08 85 c0 75 09 f3 90 8b 42 08 85 c0 74 f7 48 8b 32
> >  RSP: 0018:ffffb3b6c001ebd8 EFLAGS: 00010086
> >  RAX: ffffffff8f220860 RBX: 0000000000000246 RCX: 0000000000080000
> >  RDX: ffff91db1f86c800 RSI: 000000000000173c RDI: ffff91db62bace00
> >  RBP: ffff91db62bacc00 R08: 0000000000000000 R09: c00000010000028b
> >  R10: 0000000000055198 R11: ffffb3b6c001ea58 R12: ffff91db80e05010
> >  R13: 000000000000000a R14: 0000000000000006 R15: 0000000000000040
> >  FS:  0000000000000000(0000) GS:ffff91db1f840000(0000) knlGS:0000000000000000
> >  CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> >  CR2: ffffffff8f220860 CR3: 00000001f9580004 CR4: 00000000003706e0
> >  DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
> >  DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
> >  Call Trace:
> >   <IRQ>
> >   _raw_spin_lock_irqsave+0x30/0x40
> >   mlx5_ib_poll_cq+0x4c/0xc50 [mlx5_ib]
> >   smc_wr_rx_tasklet_fn+0x56/0xa0 [smc]
> >   tasklet_action_common.isra.21+0x66/0x100
> >   __do_softirq+0xd5/0x29c
> >   asm_call_irq_on_stack+0x12/0x20
> >   </IRQ>
> >   do_softirq_own_stack+0x37/0x40
> >   irq_exit_rcu+0x9d/0xa0
> >   sysvec_call_function_single+0x34/0x80
> >   asm_sysvec_call_function_single+0x12/0x20
> > 
> > Signed-off-by: liuyacan <liuyacan@corp.netease.com>
> > ---
> >  net/smc/smc_core.c |  2 ++
> >  net/smc/smc_core.h |  2 ++
> >  net/smc/smc_wr.c   | 12 ++++++++++++
> >  net/smc/smc_wr.h   |  3 +++
> >  4 files changed, 19 insertions(+)
> > 
> > diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c
> > index ff49a11f5..b632a33f1 100644
> > --- a/net/smc/smc_core.c
> > +++ b/net/smc/smc_core.c
> > @@ -752,6 +752,7 @@ int smcr_link_init(struct smc_link_group *lgr, struct smc_link *lnk,
> >  	atomic_inc(&lnk->smcibdev->lnk_cnt);
> >  	refcount_set(&lnk->refcnt, 1); /* link refcnt is set to 1 */
> >  	lnk->clearing = 0;
> > +	lnk->rx_drained = 0;
> >  	lnk->path_mtu = lnk->smcibdev->pattr[lnk->ibport - 1].active_mtu;
> >  	lnk->link_id = smcr_next_link_id(lgr);
> >  	lnk->lgr = lgr;
> > @@ -1269,6 +1270,7 @@ void smcr_link_clear(struct smc_link *lnk, bool log)
> >  	smcr_buf_unmap_lgr(lnk);
> >  	smcr_rtoken_clear_link(lnk);
> >  	smc_ib_modify_qp_error(lnk);
> > +	smc_wr_drain_cq(lnk);
> >  	smc_wr_free_link(lnk);
> >  	smc_ib_destroy_queue_pair(lnk);
> >  	smc_ib_dealloc_protection_domain(lnk);
> > diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h
> > index fe8b524ad..0a469a3e7 100644
> > --- a/net/smc/smc_core.h
> > +++ b/net/smc/smc_core.h
> > @@ -117,6 +117,7 @@ struct smc_link {
> >  	u64			wr_rx_id;	/* seq # of last recv WR */
> >  	u32			wr_rx_cnt;	/* number of WR recv buffers */
> >  	unsigned long		wr_rx_tstamp;	/* jiffies when last buf rx */
> > +	wait_queue_head_t       wr_rx_drain_wait; /* wait for WR drain */
> >  
> >  	struct ib_reg_wr	wr_reg;		/* WR register memory region */
> >  	wait_queue_head_t	wr_reg_wait;	/* wait for wr_reg result */
> > @@ -138,6 +139,7 @@ struct smc_link {
> >  	u8			link_idx;	/* index in lgr link array */
> >  	u8			link_is_asym;	/* is link asymmetric? */
> >  	u8			clearing : 1;	/* link is being cleared */
> > +	u8                      rx_drained : 1; /* link is drained */
> >  	refcount_t		refcnt;		/* link reference count */
> >  	struct smc_link_group	*lgr;		/* parent link group */
> >  	struct work_struct	link_down_wrk;	/* wrk to bring link down */
> > diff --git a/net/smc/smc_wr.c b/net/smc/smc_wr.c
> > index 26f8f240d..f9992896a 100644
> > --- a/net/smc/smc_wr.c
> > +++ b/net/smc/smc_wr.c
> > @@ -465,6 +465,10 @@ static inline void smc_wr_rx_process_cqes(struct ib_wc wc[], int num)
> >  			case IB_WC_RNR_RETRY_EXC_ERR:
> >  			case IB_WC_WR_FLUSH_ERR:
> >  				smcr_link_down_cond_sched(link);
> > +				if (link->clearing && wc[i]->wr_id == link->wr_rx_id) {
> > +					link->rx_drained = 1;
> > +					wake_up(&link->wr_rx_drain_wait);
> > +				}
> 
> I am wondering if we should wait for all the wc comes back?

I think yes, so other processes can safely destroy qp.

> 
> >  				break;
> >  			default:
> >  				smc_wr_rx_post(link); /* refill WR RX */
> > @@ -631,6 +635,13 @@ static void smc_wr_init_sge(struct smc_link *lnk)
> >  	lnk->wr_reg.access = IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_WRITE;
> >  }
> >  
> > +void smc_wr_drain_cq(struct smc_link *lnk)
> > +{
> > +	wait_event_interruptible_timeout(lnk->wr_rx_drain_wait,
> > +					 (lnk->drained == 1),
> > +					 SMC_WR_RX_WAIT_DRAIN_TIME);
> > +}
> 
> Should we wait for it with timeout? It should eventually be wake up
> normally before freeing link. Waiting for SMC_WR_RX_WAIT_DRAIN_TIME (2s)
> may also have this issue, although the probability of occurrence is
> greatly reduced.

Indeed, there should logically probably be a perpetual wait here. I'm just worried if it 
will get stuck for some unknown reason.

> 
> Cheers,
> Tony Lu

Regards,
Yacan


^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH net v2] net/smc: fix listen processing for SMC-Rv2
  2022-08-30  5:58     ` [PATCH net v2] net/smc: fix listen processing for SMC-Rv2 liuyacan
@ 2022-08-30  8:31       ` Alexandra Winter
  2022-08-30 13:19         ` [PATCH net v3] net/smc: Fix possible access to freed memory in link clear liuyacan
  2022-08-31 16:08         ` [PATCH net v2] net/smc: fix listen processing for SMC-Rv2 liuyacan
  2022-08-30 14:15       ` Tony Lu
  1 sibling, 2 replies; 22+ messages in thread
From: Alexandra Winter @ 2022-08-30  8:31 UTC (permalink / raw)
  To: liuyacan
  Cc: davem, edumazet, kgraul, kuba, linux-kernel, linux-s390, netdev,
	pabeni, tonylu, wenjia



On 30.08.22 07:58, liuyacan@corp.netease.com wrote:
>>> From: liuyacan <liuyacan@corp.netease.com>
>>>
>>> After modifying the QP to the Error state, all RX WR would be
>>> completed with WC in IB_WC_WR_FLUSH_ERR status. Current
>>> implementation does not wait for it is done, but free the link
>>> directly. So there is a risk that accessing the freed link in
>>> tasklet context.
>>>
>>> Here is a crash example:
>>>
>>>  BUG: unable to handle page fault for address: ffffffff8f220860
>>>  #PF: supervisor write access in kernel mode
>>>  #PF: error_code(0x0002) - not-present page
>>>  PGD f7300e067 P4D f7300e067 PUD f7300f063 PMD 8c4e45063 PTE 800ffff08c9df060
>>>  Oops: 0002 [#1] SMP PTI
>>>  CPU: 1 PID: 0 Comm: swapper/1 Kdump: loaded Tainted: G S         OE     5.10.0-0607+ #23
>>>  Hardware name: Inspur NF5280M4/YZMB-00689-101, BIOS 4.1.20 07/09/2018
>>>  RIP: 0010:native_queued_spin_lock_slowpath+0x176/0x1b0
>>>  Code: f3 90 48 8b 32 48 85 f6 74 f6 eb d5 c1 ee 12 83 e0 03 83 ee 01 48 c1 e0 05 48 63 f6 48 05 00 c8 02 00 48 03 04 f5 00 09 98 8e <48> 89 10 8b 42 08 85 c0 75 09 f3 90 8b 42 08 85 c0 74 f7 48 8b 32
>>>  RSP: 0018:ffffb3b6c001ebd8 EFLAGS: 00010086
>>>  RAX: ffffffff8f220860 RBX: 0000000000000246 RCX: 0000000000080000
>>>  RDX: ffff91db1f86c800 RSI: 000000000000173c RDI: ffff91db62bace00
>>>  RBP: ffff91db62bacc00 R08: 0000000000000000 R09: c00000010000028b
>>>  R10: 0000000000055198 R11: ffffb3b6c001ea58 R12: ffff91db80e05010
>>>  R13: 000000000000000a R14: 0000000000000006 R15: 0000000000000040
>>>  FS:  0000000000000000(0000) GS:ffff91db1f840000(0000) knlGS:0000000000000000
>>>  CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
>>>  CR2: ffffffff8f220860 CR3: 00000001f9580004 CR4: 00000000003706e0
>>>  DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
>>>  DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
>>>  Call Trace:
>>>   <IRQ>
>>>   _raw_spin_lock_irqsave+0x30/0x40
>>>   mlx5_ib_poll_cq+0x4c/0xc50 [mlx5_ib]
>>>   smc_wr_rx_tasklet_fn+0x56/0xa0 [smc]
>>>   tasklet_action_common.isra.21+0x66/0x100
>>>   __do_softirq+0xd5/0x29c
>>>   asm_call_irq_on_stack+0x12/0x20
>>>   </IRQ>
>>>   do_softirq_own_stack+0x37/0x40
>>>   irq_exit_rcu+0x9d/0xa0
>>>   sysvec_call_function_single+0x34/0x80
>>>   asm_sysvec_call_function_single+0x12/0x20
>>>
>>> Signed-off-by: liuyacan <liuyacan@corp.netease.com>
>>> ---
>>>  net/smc/smc_core.c |  2 ++
>>>  net/smc/smc_core.h |  2 ++
>>>  net/smc/smc_wr.c   | 12 ++++++++++++
>>>  net/smc/smc_wr.h   |  3 +++
>>>  4 files changed, 19 insertions(+)
>>>
>>> diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c
>>> index ff49a11f5..b632a33f1 100644
>>> --- a/net/smc/smc_core.c
>>> +++ b/net/smc/smc_core.c
>>> @@ -752,6 +752,7 @@ int smcr_link_init(struct smc_link_group *lgr, struct smc_link *lnk,
>>>  	atomic_inc(&lnk->smcibdev->lnk_cnt);
>>>  	refcount_set(&lnk->refcnt, 1); /* link refcnt is set to 1 */
>>>  	lnk->clearing = 0;
>>> +	lnk->rx_drained = 0;
>>>  	lnk->path_mtu = lnk->smcibdev->pattr[lnk->ibport - 1].active_mtu;
>>>  	lnk->link_id = smcr_next_link_id(lgr);
>>>  	lnk->lgr = lgr;
>>> @@ -1269,6 +1270,7 @@ void smcr_link_clear(struct smc_link *lnk, bool log)
>>>  	smcr_buf_unmap_lgr(lnk);
>>>  	smcr_rtoken_clear_link(lnk);
>>>  	smc_ib_modify_qp_error(lnk);
>>> +	smc_wr_drain_cq(lnk);
>>>  	smc_wr_free_link(lnk);
>>>  	smc_ib_destroy_queue_pair(lnk);
>>>  	smc_ib_dealloc_protection_domain(lnk);
>>> diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h
>>> index fe8b524ad..0a469a3e7 100644
>>> --- a/net/smc/smc_core.h
>>> +++ b/net/smc/smc_core.h
>>> @@ -117,6 +117,7 @@ struct smc_link {
>>>  	u64			wr_rx_id;	/* seq # of last recv WR */
>>>  	u32			wr_rx_cnt;	/* number of WR recv buffers */
>>>  	unsigned long		wr_rx_tstamp;	/* jiffies when last buf rx */
>>> +	wait_queue_head_t       wr_rx_drain_wait; /* wait for WR drain */
>>>  
>>>  	struct ib_reg_wr	wr_reg;		/* WR register memory region */
>>>  	wait_queue_head_t	wr_reg_wait;	/* wait for wr_reg result */
>>> @@ -138,6 +139,7 @@ struct smc_link {
>>>  	u8			link_idx;	/* index in lgr link array */
>>>  	u8			link_is_asym;	/* is link asymmetric? */
>>>  	u8			clearing : 1;	/* link is being cleared */
>>> +	u8                      rx_drained : 1; /* link is drained */
>>>  	refcount_t		refcnt;		/* link reference count */
>>>  	struct smc_link_group	*lgr;		/* parent link group */
>>>  	struct work_struct	link_down_wrk;	/* wrk to bring link down */
>>> diff --git a/net/smc/smc_wr.c b/net/smc/smc_wr.c
>>> index 26f8f240d..f9992896a 100644
>>> --- a/net/smc/smc_wr.c
>>> +++ b/net/smc/smc_wr.c
>>> @@ -465,6 +465,10 @@ static inline void smc_wr_rx_process_cqes(struct ib_wc wc[], int num)
>>>  			case IB_WC_RNR_RETRY_EXC_ERR:
>>>  			case IB_WC_WR_FLUSH_ERR:
>>>  				smcr_link_down_cond_sched(link);
>>> +				if (link->clearing && wc[i]->wr_id == link->wr_rx_id) {
>>> +					link->rx_drained = 1;
>>> +					wake_up(&link->wr_rx_drain_wait);
>>> +				}
>>
>> I am wondering if we should wait for all the wc comes back?
> 
> I think yes, so other processes can safely destroy qp.
> 
>>
>>>  				break;
>>>  			default:
>>>  				smc_wr_rx_post(link); /* refill WR RX */
>>> @@ -631,6 +635,13 @@ static void smc_wr_init_sge(struct smc_link *lnk)
>>>  	lnk->wr_reg.access = IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_WRITE;
>>>  }
>>>  
>>> +void smc_wr_drain_cq(struct smc_link *lnk)
>>> +{
>>> +	wait_event_interruptible_timeout(lnk->wr_rx_drain_wait,
>>> +					 (lnk->drained == 1),
>>> +					 SMC_WR_RX_WAIT_DRAIN_TIME);
>>> +}
>>
>> Should we wait for it with timeout? It should eventually be wake up
>> normally before freeing link. Waiting for SMC_WR_RX_WAIT_DRAIN_TIME (2s)
>> may also have this issue, although the probability of occurrence is
>> greatly reduced.
> 
> Indeed, there should logically probably be a perpetual wait here. I'm just worried if it 
> will get stuck for some unknown reason.
> 
>>
>> Cheers,
>> Tony Lu
> 
> Regards,
> Yacan
> 

Thank you very much for working on a fix, Yacan.

Some comments to make reviewers' lives easier:
Please use your real name for the Signed-Off tag and Mail sender (Is it Yacan Liu ?)
(Please use the same Mail address for all your posts. In April there was a post from yacanliu@163.com. Not this one)
Important: Add a Fixes tag, when sending fixes to NET
Is this mail really a reply to your v2? Or rather a reply to Tony's comments on v1?

Kind regards
Alexandra

^ permalink raw reply	[flat|nested] 22+ messages in thread

* [PATCH net v3] net/smc: Fix possible access to freed memory in link clear
  2022-08-30  8:31       ` Alexandra Winter
@ 2022-08-30 13:19         ` liuyacan
  2022-08-31 16:08         ` [PATCH net v2] net/smc: fix listen processing for SMC-Rv2 liuyacan
  1 sibling, 0 replies; 22+ messages in thread
From: liuyacan @ 2022-08-30 13:19 UTC (permalink / raw)
  To: wintera
  Cc: davem, edumazet, kgraul, kuba, linux-kernel, linux-s390, liuyacan,
	netdev, pabeni, tonylu, wenjia, ubraun

From: Yacan Liu <liuyacan@corp.netease.com>

After modifying the QP to the Error state, all RX WR would be completed
with WC in IB_WC_WR_FLUSH_ERR status. Current implementation does not
wait for it is done, but destroy the QP and free the link group directly.
So there is a risk that accessing the freed memory in tasklet context.

Here is a crash example:

 BUG: unable to handle page fault for address: ffffffff8f220860
 #PF: supervisor write access in kernel mode
 #PF: error_code(0x0002) - not-present page
 PGD f7300e067 P4D f7300e067 PUD f7300f063 PMD 8c4e45063 PTE 800ffff08c9df060
 Oops: 0002 [#1] SMP PTI
 CPU: 1 PID: 0 Comm: swapper/1 Kdump: loaded Tainted: G S         OE     5.10.0-0607+ #23
 Hardware name: Inspur NF5280M4/YZMB-00689-101, BIOS 4.1.20 07/09/2018
 RIP: 0010:native_queued_spin_lock_slowpath+0x176/0x1b0
 Code: f3 90 48 8b 32 48 85 f6 74 f6 eb d5 c1 ee 12 83 e0 03 83 ee 01 48 c1 e0 05 48 63 f6 48 05 00 c8 02 00 48 03 04 f5 00 09 98 8e <48> 89 10 8b 42 08 85 c0 75 09 f3 90 8b 42 08 85 c0 74 f7 48 8b 32
 RSP: 0018:ffffb3b6c001ebd8 EFLAGS: 00010086
 RAX: ffffffff8f220860 RBX: 0000000000000246 RCX: 0000000000080000
 RDX: ffff91db1f86c800 RSI: 000000000000173c RDI: ffff91db62bace00
 RBP: ffff91db62bacc00 R08: 0000000000000000 R09: c00000010000028b
 R10: 0000000000055198 R11: ffffb3b6c001ea58 R12: ffff91db80e05010
 R13: 000000000000000a R14: 0000000000000006 R15: 0000000000000040
 FS:  0000000000000000(0000) GS:ffff91db1f840000(0000) knlGS:0000000000000000
 CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
 CR2: ffffffff8f220860 CR3: 00000001f9580004 CR4: 00000000003706e0
 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
 Call Trace:
  <IRQ>
  _raw_spin_lock_irqsave+0x30/0x40
  mlx5_ib_poll_cq+0x4c/0xc50 [mlx5_ib]
  smc_wr_rx_tasklet_fn+0x56/0xa0 [smc]
  tasklet_action_common.isra.21+0x66/0x100
  __do_softirq+0xd5/0x29c
  asm_call_irq_on_stack+0x12/0x20
  </IRQ>
  do_softirq_own_stack+0x37/0x40
  irq_exit_rcu+0x9d/0xa0
  sysvec_call_function_single+0x34/0x80
  asm_sysvec_call_function_single+0x12/0x20

Fixes: bd4ad57718cc ("smc: initialize IB transport incl. PD, MR, QP, CQ, event, WR")
Signed-off-by: Yacan Liu <liuyacan@corp.netease.com>

---
Change in v3:
  -- Tune commit message (Signed-Off tag, Fixes tag)
     Tune code to avoid column length exceeding
Change in v2:
  -- Fix some compile warnings and errors
---
 net/smc/smc_core.c |  2 ++
 net/smc/smc_core.h |  2 ++
 net/smc/smc_wr.c   | 13 +++++++++++++
 net/smc/smc_wr.h   |  3 +++
 4 files changed, 20 insertions(+)

diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c
index ff49a11f5..b632a33f1 100644
--- a/net/smc/smc_core.c
+++ b/net/smc/smc_core.c
@@ -752,6 +752,7 @@ int smcr_link_init(struct smc_link_group *lgr, struct smc_link *lnk,
 	atomic_inc(&lnk->smcibdev->lnk_cnt);
 	refcount_set(&lnk->refcnt, 1); /* link refcnt is set to 1 */
 	lnk->clearing = 0;
+	lnk->rx_drained = 0;
 	lnk->path_mtu = lnk->smcibdev->pattr[lnk->ibport - 1].active_mtu;
 	lnk->link_id = smcr_next_link_id(lgr);
 	lnk->lgr = lgr;
@@ -1269,6 +1270,7 @@ void smcr_link_clear(struct smc_link *lnk, bool log)
 	smcr_buf_unmap_lgr(lnk);
 	smcr_rtoken_clear_link(lnk);
 	smc_ib_modify_qp_error(lnk);
+	smc_wr_drain_cq(lnk);
 	smc_wr_free_link(lnk);
 	smc_ib_destroy_queue_pair(lnk);
 	smc_ib_dealloc_protection_domain(lnk);
diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h
index fe8b524ad..0a469a3e7 100644
--- a/net/smc/smc_core.h
+++ b/net/smc/smc_core.h
@@ -117,6 +117,7 @@ struct smc_link {
 	u64			wr_rx_id;	/* seq # of last recv WR */
 	u32			wr_rx_cnt;	/* number of WR recv buffers */
 	unsigned long		wr_rx_tstamp;	/* jiffies when last buf rx */
+	wait_queue_head_t       wr_rx_drain_wait; /* wait for WR drain */
 
 	struct ib_reg_wr	wr_reg;		/* WR register memory region */
 	wait_queue_head_t	wr_reg_wait;	/* wait for wr_reg result */
@@ -138,6 +139,7 @@ struct smc_link {
 	u8			link_idx;	/* index in lgr link array */
 	u8			link_is_asym;	/* is link asymmetric? */
 	u8			clearing : 1;	/* link is being cleared */
+	u8                      rx_drained : 1; /* link is drained */
 	refcount_t		refcnt;		/* link reference count */
 	struct smc_link_group	*lgr;		/* parent link group */
 	struct work_struct	link_down_wrk;	/* wrk to bring link down */
diff --git a/net/smc/smc_wr.c b/net/smc/smc_wr.c
index 26f8f240d..958f4b78a 100644
--- a/net/smc/smc_wr.c
+++ b/net/smc/smc_wr.c
@@ -465,6 +465,11 @@ static inline void smc_wr_rx_process_cqes(struct ib_wc wc[], int num)
 			case IB_WC_RNR_RETRY_EXC_ERR:
 			case IB_WC_WR_FLUSH_ERR:
 				smcr_link_down_cond_sched(link);
+				if (link->clearing &&
+				    wc[i].wr_id == link->wr_rx_id) {
+					link->rx_drained = 1;
+					wake_up(&link->wr_rx_drain_wait);
+				}
 				break;
 			default:
 				smc_wr_rx_post(link); /* refill WR RX */
@@ -631,6 +636,13 @@ static void smc_wr_init_sge(struct smc_link *lnk)
 	lnk->wr_reg.access = IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_WRITE;
 }
 
+void smc_wr_drain_cq(struct smc_link *lnk)
+{
+	wait_event_interruptible_timeout(lnk->wr_rx_drain_wait,
+					 (lnk->rx_drained == 1),
+					 SMC_WR_RX_WAIT_DRAIN_TIME);
+}
+
 void smc_wr_free_link(struct smc_link *lnk)
 {
 	struct ib_device *ibdev;
@@ -889,6 +901,7 @@ int smc_wr_create_link(struct smc_link *lnk)
 	atomic_set(&lnk->wr_tx_refcnt, 0);
 	init_waitqueue_head(&lnk->wr_reg_wait);
 	atomic_set(&lnk->wr_reg_refcnt, 0);
+	init_waitqueue_head(&lnk->wr_rx_drain_wait);
 	return rc;
 
 dma_unmap:
diff --git a/net/smc/smc_wr.h b/net/smc/smc_wr.h
index a54e90a11..2a7ebdba3 100644
--- a/net/smc/smc_wr.h
+++ b/net/smc/smc_wr.h
@@ -27,6 +27,8 @@
 
 #define SMC_WR_TX_PEND_PRIV_SIZE 32
 
+#define SMC_WR_RX_WAIT_DRAIN_TIME       (2 * HZ)
+
 struct smc_wr_tx_pend_priv {
 	u8			priv[SMC_WR_TX_PEND_PRIV_SIZE];
 };
@@ -101,6 +103,7 @@ static inline int smc_wr_rx_post(struct smc_link *link)
 int smc_wr_create_link(struct smc_link *lnk);
 int smc_wr_alloc_link_mem(struct smc_link *lnk);
 int smc_wr_alloc_lgr_mem(struct smc_link_group *lgr);
+void smc_wr_drain_cq(struct smc_link *lnk);
 void smc_wr_free_link(struct smc_link *lnk);
 void smc_wr_free_link_mem(struct smc_link *lnk);
 void smc_wr_free_lgr_mem(struct smc_link_group *lgr);
-- 
2.20.1


^ permalink raw reply related	[flat|nested] 22+ messages in thread

* Re: [PATCH net v2] net/smc: fix listen processing for SMC-Rv2
  2022-08-30  5:58     ` [PATCH net v2] net/smc: fix listen processing for SMC-Rv2 liuyacan
  2022-08-30  8:31       ` Alexandra Winter
@ 2022-08-30 14:15       ` Tony Lu
  2022-08-31 15:53         ` [PATCH net v4] net/smc: Fix possible access to freed memory in link clear liuyacan
  2022-08-31 16:12         ` [PATCH net v2] net/smc: fix listen processing for SMC-Rv2 liuyacan
  1 sibling, 2 replies; 22+ messages in thread
From: Tony Lu @ 2022-08-30 14:15 UTC (permalink / raw)
  To: liuyacan
  Cc: davem, edumazet, kgraul, kuba, linux-kernel, linux-s390, netdev,
	pabeni, wenjia

On Tue, Aug 30, 2022 at 01:58:06PM +0800, liuyacan@corp.netease.com wrote:
> > > From: liuyacan <liuyacan@corp.netease.com>
> > > 
> > > After modifying the QP to the Error state, all RX WR would be
> > > completed with WC in IB_WC_WR_FLUSH_ERR status. Current
> > > implementation does not wait for it is done, but free the link
> > > directly. So there is a risk that accessing the freed link in
> > > tasklet context.
> > > 
> > > Here is a crash example:
> > > 
> > >  BUG: unable to handle page fault for address: ffffffff8f220860
> > >  #PF: supervisor write access in kernel mode
> > >  #PF: error_code(0x0002) - not-present page
> > >  PGD f7300e067 P4D f7300e067 PUD f7300f063 PMD 8c4e45063 PTE 800ffff08c9df060
> > >  Oops: 0002 [#1] SMP PTI
> > >  CPU: 1 PID: 0 Comm: swapper/1 Kdump: loaded Tainted: G S         OE     5.10.0-0607+ #23
> > >  Hardware name: Inspur NF5280M4/YZMB-00689-101, BIOS 4.1.20 07/09/2018
> > >  RIP: 0010:native_queued_spin_lock_slowpath+0x176/0x1b0
> > >  Code: f3 90 48 8b 32 48 85 f6 74 f6 eb d5 c1 ee 12 83 e0 03 83 ee 01 48 c1 e0 05 48 63 f6 48 05 00 c8 02 00 48 03 04 f5 00 09 98 8e <48> 89 10 8b 42 08 85 c0 75 09 f3 90 8b 42 08 85 c0 74 f7 48 8b 32
> > >  RSP: 0018:ffffb3b6c001ebd8 EFLAGS: 00010086
> > >  RAX: ffffffff8f220860 RBX: 0000000000000246 RCX: 0000000000080000
> > >  RDX: ffff91db1f86c800 RSI: 000000000000173c RDI: ffff91db62bace00
> > >  RBP: ffff91db62bacc00 R08: 0000000000000000 R09: c00000010000028b
> > >  R10: 0000000000055198 R11: ffffb3b6c001ea58 R12: ffff91db80e05010
> > >  R13: 000000000000000a R14: 0000000000000006 R15: 0000000000000040
> > >  FS:  0000000000000000(0000) GS:ffff91db1f840000(0000) knlGS:0000000000000000
> > >  CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> > >  CR2: ffffffff8f220860 CR3: 00000001f9580004 CR4: 00000000003706e0
> > >  DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
> > >  DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
> > >  Call Trace:
> > >   <IRQ>
> > >   _raw_spin_lock_irqsave+0x30/0x40
> > >   mlx5_ib_poll_cq+0x4c/0xc50 [mlx5_ib]
> > >   smc_wr_rx_tasklet_fn+0x56/0xa0 [smc]
> > >   tasklet_action_common.isra.21+0x66/0x100
> > >   __do_softirq+0xd5/0x29c
> > >   asm_call_irq_on_stack+0x12/0x20
> > >   </IRQ>
> > >   do_softirq_own_stack+0x37/0x40
> > >   irq_exit_rcu+0x9d/0xa0
> > >   sysvec_call_function_single+0x34/0x80
> > >   asm_sysvec_call_function_single+0x12/0x20
> > > 
> > > Signed-off-by: liuyacan <liuyacan@corp.netease.com>
> > > ---
> > >  net/smc/smc_core.c |  2 ++
> > >  net/smc/smc_core.h |  2 ++
> > >  net/smc/smc_wr.c   | 12 ++++++++++++
> > >  net/smc/smc_wr.h   |  3 +++
> > >  4 files changed, 19 insertions(+)
> > > 
> > > diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c
> > > index ff49a11f5..b632a33f1 100644
> > > --- a/net/smc/smc_core.c
> > > +++ b/net/smc/smc_core.c
> > > @@ -752,6 +752,7 @@ int smcr_link_init(struct smc_link_group *lgr, struct smc_link *lnk,
> > >  	atomic_inc(&lnk->smcibdev->lnk_cnt);
> > >  	refcount_set(&lnk->refcnt, 1); /* link refcnt is set to 1 */
> > >  	lnk->clearing = 0;
> > > +	lnk->rx_drained = 0;
> > >  	lnk->path_mtu = lnk->smcibdev->pattr[lnk->ibport - 1].active_mtu;
> > >  	lnk->link_id = smcr_next_link_id(lgr);
> > >  	lnk->lgr = lgr;
> > > @@ -1269,6 +1270,7 @@ void smcr_link_clear(struct smc_link *lnk, bool log)
> > >  	smcr_buf_unmap_lgr(lnk);
> > >  	smcr_rtoken_clear_link(lnk);
> > >  	smc_ib_modify_qp_error(lnk);
> > > +	smc_wr_drain_cq(lnk);
> > >  	smc_wr_free_link(lnk);
> > >  	smc_ib_destroy_queue_pair(lnk);
> > >  	smc_ib_dealloc_protection_domain(lnk);
> > > diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h
> > > index fe8b524ad..0a469a3e7 100644
> > > --- a/net/smc/smc_core.h
> > > +++ b/net/smc/smc_core.h
> > > @@ -117,6 +117,7 @@ struct smc_link {
> > >  	u64			wr_rx_id;	/* seq # of last recv WR */
> > >  	u32			wr_rx_cnt;	/* number of WR recv buffers */
> > >  	unsigned long		wr_rx_tstamp;	/* jiffies when last buf rx */
> > > +	wait_queue_head_t       wr_rx_drain_wait; /* wait for WR drain */
> > >  
> > >  	struct ib_reg_wr	wr_reg;		/* WR register memory region */
> > >  	wait_queue_head_t	wr_reg_wait;	/* wait for wr_reg result */
> > > @@ -138,6 +139,7 @@ struct smc_link {
> > >  	u8			link_idx;	/* index in lgr link array */
> > >  	u8			link_is_asym;	/* is link asymmetric? */
> > >  	u8			clearing : 1;	/* link is being cleared */
> > > +	u8                      rx_drained : 1; /* link is drained */
> > >  	refcount_t		refcnt;		/* link reference count */
> > >  	struct smc_link_group	*lgr;		/* parent link group */
> > >  	struct work_struct	link_down_wrk;	/* wrk to bring link down */
> > > diff --git a/net/smc/smc_wr.c b/net/smc/smc_wr.c
> > > index 26f8f240d..f9992896a 100644
> > > --- a/net/smc/smc_wr.c
> > > +++ b/net/smc/smc_wr.c
> > > @@ -465,6 +465,10 @@ static inline void smc_wr_rx_process_cqes(struct ib_wc wc[], int num)
> > >  			case IB_WC_RNR_RETRY_EXC_ERR:
> > >  			case IB_WC_WR_FLUSH_ERR:
> > >  				smcr_link_down_cond_sched(link);
> > > +				if (link->clearing && wc[i]->wr_id == link->wr_rx_id) {
> > > +					link->rx_drained = 1;
> > > +					wake_up(&link->wr_rx_drain_wait);
> > > +				}
> > 
> > I am wondering if we should wait for all the wc comes back?
> 
> I think yes, so other processes can safely destroy qp.
> 
> > 
> > >  				break;
> > >  			default:
> > >  				smc_wr_rx_post(link); /* refill WR RX */
> > > @@ -631,6 +635,13 @@ static void smc_wr_init_sge(struct smc_link *lnk)
> > >  	lnk->wr_reg.access = IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_WRITE;
> > >  }
> > >  
> > > +void smc_wr_drain_cq(struct smc_link *lnk)
> > > +{
> > > +	wait_event_interruptible_timeout(lnk->wr_rx_drain_wait,
> > > +					 (lnk->drained == 1),
> > > +					 SMC_WR_RX_WAIT_DRAIN_TIME);
> > > +}
> > 
> > Should we wait for it with timeout? It should eventually be wake up
> > normally before freeing link. Waiting for SMC_WR_RX_WAIT_DRAIN_TIME (2s)
> > may also have this issue, although the probability of occurrence is
> > greatly reduced.
> 
> Indeed, there should logically probably be a perpetual wait here. I'm just worried if it 
> will get stuck for some unknown reason.

IMHO, it's better to get stuck rather than to hide unknown issues. So I
think timeout is unnecessary. 

Cheers,
Tony Lu

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH net] net/smc: Fix possible access to freed memory in link clear
  2022-08-29 14:54 [PATCH net] net/smc: Fix possible access to freed memory in link clear liuyacan
  2022-08-29 16:38 ` Tony Lu
@ 2022-08-30 14:31 ` kernel test robot
  2022-08-30 16:10 ` kernel test robot
  2 siblings, 0 replies; 22+ messages in thread
From: kernel test robot @ 2022-08-30 14:31 UTC (permalink / raw)
  To: liuyacan, kgraul, davem, wenjia, edumazet, kuba, pabeni
  Cc: kbuild-all, linux-s390, netdev, linux-kernel, liuyacan

Hi,

Thank you for the patch! Yet something to improve:

[auto build test ERROR on net/master]

url:    https://github.com/intel-lab-lkp/linux/commits/liuyacan-corp-netease-com/net-smc-Fix-possible-access-to-freed-memory-in-link-clear/20220829-231821
base:   https://git.kernel.org/pub/scm/linux/kernel/git/davem/net.git cb10b0f91c5f76de981ef927e7dadec60c5a5d96
config: arm64-randconfig-r005-20220830 (https://download.01.org/0day-ci/archive/20220830/202208302233.4HlN35vT-lkp@intel.com/config)
compiler: aarch64-linux-gcc (GCC) 12.1.0
reproduce (this is a W=1 build):
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # https://github.com/intel-lab-lkp/linux/commit/f8be00c954c559c7ae24f34abade4faebc350ec9
        git remote add linux-review https://github.com/intel-lab-lkp/linux
        git fetch --no-tags linux-review liuyacan-corp-netease-com/net-smc-Fix-possible-access-to-freed-memory-in-link-clear/20220829-231821
        git checkout f8be00c954c559c7ae24f34abade4faebc350ec9
        # save the config file
        mkdir build_dir && cp config build_dir/.config
        COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-12.1.0 make.cross W=1 O=build_dir ARCH=arm64 SHELL=/bin/bash net/smc/

If you fix the issue, kindly add following tag where applicable
Reported-by: kernel test robot <lkp@intel.com>

All errors (new ones prefixed by >>):

   net/smc/smc_wr.c: In function 'smc_wr_rx_process_cqes':
>> net/smc/smc_wr.c:468:60: error: invalid type argument of '->' (have 'struct ib_wc')
     468 |                                 if (link->clearing && wc[i]->wr_id == link->wr_rx_id) {
         |                                                            ^~
   In file included from net/smc/smc_wr.c:27:
   net/smc/smc_wr.c: In function 'smc_wr_drain_cq':
>> net/smc/smc_wr.c:641:48: error: 'struct smc_link' has no member named 'drained'; did you mean 'rx_drained'?
     641 |                                          (lnk->drained == 1),
         |                                                ^~~~~~~
   include/linux/wait.h:276:24: note: in definition of macro '___wait_cond_timeout'
     276 |         bool __cond = (condition);                                              \
         |                        ^~~~~~~~~
   net/smc/smc_wr.c:640:9: note: in expansion of macro 'wait_event_interruptible_timeout'
     640 |         wait_event_interruptible_timeout(lnk->wr_rx_drain_wait,
         |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
>> net/smc/smc_wr.c:641:48: error: 'struct smc_link' has no member named 'drained'; did you mean 'rx_drained'?
     641 |                                          (lnk->drained == 1),
         |                                                ^~~~~~~
   include/linux/wait.h:310:21: note: in definition of macro '___wait_event'
     310 |                 if (condition)                                                  \
         |                     ^~~~~~~~~
   include/linux/wait.h:506:32: note: in expansion of macro '___wait_cond_timeout'
     506 |         ___wait_event(wq_head, ___wait_cond_timeout(condition),                 \
         |                                ^~~~~~~~~~~~~~~~~~~~
   include/linux/wait.h:535:25: note: in expansion of macro '__wait_event_interruptible_timeout'
     535 |                 __ret = __wait_event_interruptible_timeout(wq_head,             \
         |                         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
   net/smc/smc_wr.c:640:9: note: in expansion of macro 'wait_event_interruptible_timeout'
     640 |         wait_event_interruptible_timeout(lnk->wr_rx_drain_wait,
         |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~


vim +468 net/smc/smc_wr.c

   449	
   450	static inline void smc_wr_rx_process_cqes(struct ib_wc wc[], int num)
   451	{
   452		struct smc_link *link;
   453		int i;
   454	
   455		for (i = 0; i < num; i++) {
   456			link = wc[i].qp->qp_context;
   457			if (wc[i].status == IB_WC_SUCCESS) {
   458				link->wr_rx_tstamp = jiffies;
   459				smc_wr_rx_demultiplex(&wc[i]);
   460				smc_wr_rx_post(link); /* refill WR RX */
   461			} else {
   462				/* handle status errors */
   463				switch (wc[i].status) {
   464				case IB_WC_RETRY_EXC_ERR:
   465				case IB_WC_RNR_RETRY_EXC_ERR:
   466				case IB_WC_WR_FLUSH_ERR:
   467					smcr_link_down_cond_sched(link);
 > 468					if (link->clearing && wc[i]->wr_id == link->wr_rx_id) {
   469						link->rx_drained = 1;
   470						wake_up(&link->wr_rx_drain_wait);
   471					}
   472					break;
   473				default:
   474					smc_wr_rx_post(link); /* refill WR RX */
   475					break;
   476				}
   477			}
   478		}
   479	}
   480	
   481	static void smc_wr_rx_tasklet_fn(struct tasklet_struct *t)
   482	{
   483		struct smc_ib_device *dev = from_tasklet(dev, t, recv_tasklet);
   484		struct ib_wc wc[SMC_WR_MAX_POLL_CQE];
   485		int polled = 0;
   486		int rc;
   487	
   488	again:
   489		polled++;
   490		do {
   491			memset(&wc, 0, sizeof(wc));
   492			rc = ib_poll_cq(dev->roce_cq_recv, SMC_WR_MAX_POLL_CQE, wc);
   493			if (polled == 1) {
   494				ib_req_notify_cq(dev->roce_cq_recv,
   495						 IB_CQ_SOLICITED_MASK
   496						 | IB_CQ_REPORT_MISSED_EVENTS);
   497			}
   498			if (!rc)
   499				break;
   500			smc_wr_rx_process_cqes(&wc[0], rc);
   501		} while (rc > 0);
   502		if (polled == 1)
   503			goto again;
   504	}
   505	
   506	void smc_wr_rx_cq_handler(struct ib_cq *ib_cq, void *cq_context)
   507	{
   508		struct smc_ib_device *dev = (struct smc_ib_device *)cq_context;
   509	
   510		tasklet_schedule(&dev->recv_tasklet);
   511	}
   512	
   513	int smc_wr_rx_post_init(struct smc_link *link)
   514	{
   515		u32 i;
   516		int rc = 0;
   517	
   518		for (i = 0; i < link->wr_rx_cnt; i++)
   519			rc = smc_wr_rx_post(link);
   520		return rc;
   521	}
   522	
   523	/***************************** init, exit, misc ******************************/
   524	
   525	void smc_wr_remember_qp_attr(struct smc_link *lnk)
   526	{
   527		struct ib_qp_attr *attr = &lnk->qp_attr;
   528		struct ib_qp_init_attr init_attr;
   529	
   530		memset(attr, 0, sizeof(*attr));
   531		memset(&init_attr, 0, sizeof(init_attr));
   532		ib_query_qp(lnk->roce_qp, attr,
   533			    IB_QP_STATE |
   534			    IB_QP_CUR_STATE |
   535			    IB_QP_PKEY_INDEX |
   536			    IB_QP_PORT |
   537			    IB_QP_QKEY |
   538			    IB_QP_AV |
   539			    IB_QP_PATH_MTU |
   540			    IB_QP_TIMEOUT |
   541			    IB_QP_RETRY_CNT |
   542			    IB_QP_RNR_RETRY |
   543			    IB_QP_RQ_PSN |
   544			    IB_QP_ALT_PATH |
   545			    IB_QP_MIN_RNR_TIMER |
   546			    IB_QP_SQ_PSN |
   547			    IB_QP_PATH_MIG_STATE |
   548			    IB_QP_CAP |
   549			    IB_QP_DEST_QPN,
   550			    &init_attr);
   551	
   552		lnk->wr_tx_cnt = min_t(size_t, SMC_WR_BUF_CNT,
   553				       lnk->qp_attr.cap.max_send_wr);
   554		lnk->wr_rx_cnt = min_t(size_t, SMC_WR_BUF_CNT * 3,
   555				       lnk->qp_attr.cap.max_recv_wr);
   556	}
   557	
   558	static void smc_wr_init_sge(struct smc_link *lnk)
   559	{
   560		int sges_per_buf = (lnk->lgr->smc_version == SMC_V2) ? 2 : 1;
   561		bool send_inline = (lnk->qp_attr.cap.max_inline_data > SMC_WR_TX_SIZE);
   562		u32 i;
   563	
   564		for (i = 0; i < lnk->wr_tx_cnt; i++) {
   565			lnk->wr_tx_sges[i].addr = send_inline ? (uintptr_t)(&lnk->wr_tx_bufs[i]) :
   566				lnk->wr_tx_dma_addr + i * SMC_WR_BUF_SIZE;
   567			lnk->wr_tx_sges[i].length = SMC_WR_TX_SIZE;
   568			lnk->wr_tx_sges[i].lkey = lnk->roce_pd->local_dma_lkey;
   569			lnk->wr_tx_rdma_sges[i].tx_rdma_sge[0].wr_tx_rdma_sge[0].lkey =
   570				lnk->roce_pd->local_dma_lkey;
   571			lnk->wr_tx_rdma_sges[i].tx_rdma_sge[0].wr_tx_rdma_sge[1].lkey =
   572				lnk->roce_pd->local_dma_lkey;
   573			lnk->wr_tx_rdma_sges[i].tx_rdma_sge[1].wr_tx_rdma_sge[0].lkey =
   574				lnk->roce_pd->local_dma_lkey;
   575			lnk->wr_tx_rdma_sges[i].tx_rdma_sge[1].wr_tx_rdma_sge[1].lkey =
   576				lnk->roce_pd->local_dma_lkey;
   577			lnk->wr_tx_ibs[i].next = NULL;
   578			lnk->wr_tx_ibs[i].sg_list = &lnk->wr_tx_sges[i];
   579			lnk->wr_tx_ibs[i].num_sge = 1;
   580			lnk->wr_tx_ibs[i].opcode = IB_WR_SEND;
   581			lnk->wr_tx_ibs[i].send_flags =
   582				IB_SEND_SIGNALED | IB_SEND_SOLICITED;
   583			if (send_inline)
   584				lnk->wr_tx_ibs[i].send_flags |= IB_SEND_INLINE;
   585			lnk->wr_tx_rdmas[i].wr_tx_rdma[0].wr.opcode = IB_WR_RDMA_WRITE;
   586			lnk->wr_tx_rdmas[i].wr_tx_rdma[1].wr.opcode = IB_WR_RDMA_WRITE;
   587			lnk->wr_tx_rdmas[i].wr_tx_rdma[0].wr.sg_list =
   588				lnk->wr_tx_rdma_sges[i].tx_rdma_sge[0].wr_tx_rdma_sge;
   589			lnk->wr_tx_rdmas[i].wr_tx_rdma[1].wr.sg_list =
   590				lnk->wr_tx_rdma_sges[i].tx_rdma_sge[1].wr_tx_rdma_sge;
   591		}
   592	
   593		if (lnk->lgr->smc_version == SMC_V2) {
   594			lnk->wr_tx_v2_sge->addr = lnk->wr_tx_v2_dma_addr;
   595			lnk->wr_tx_v2_sge->length = SMC_WR_BUF_V2_SIZE;
   596			lnk->wr_tx_v2_sge->lkey = lnk->roce_pd->local_dma_lkey;
   597	
   598			lnk->wr_tx_v2_ib->next = NULL;
   599			lnk->wr_tx_v2_ib->sg_list = lnk->wr_tx_v2_sge;
   600			lnk->wr_tx_v2_ib->num_sge = 1;
   601			lnk->wr_tx_v2_ib->opcode = IB_WR_SEND;
   602			lnk->wr_tx_v2_ib->send_flags =
   603				IB_SEND_SIGNALED | IB_SEND_SOLICITED;
   604		}
   605	
   606		/* With SMC-Rv2 there can be messages larger than SMC_WR_TX_SIZE.
   607		 * Each ib_recv_wr gets 2 sges, the second one is a spillover buffer
   608		 * and the same buffer for all sges. When a larger message arrived then
   609		 * the content of the first small sge is copied to the beginning of
   610		 * the larger spillover buffer, allowing easy data mapping.
   611		 */
   612		for (i = 0; i < lnk->wr_rx_cnt; i++) {
   613			int x = i * sges_per_buf;
   614	
   615			lnk->wr_rx_sges[x].addr =
   616				lnk->wr_rx_dma_addr + i * SMC_WR_BUF_SIZE;
   617			lnk->wr_rx_sges[x].length = SMC_WR_TX_SIZE;
   618			lnk->wr_rx_sges[x].lkey = lnk->roce_pd->local_dma_lkey;
   619			if (lnk->lgr->smc_version == SMC_V2) {
   620				lnk->wr_rx_sges[x + 1].addr =
   621						lnk->wr_rx_v2_dma_addr + SMC_WR_TX_SIZE;
   622				lnk->wr_rx_sges[x + 1].length =
   623						SMC_WR_BUF_V2_SIZE - SMC_WR_TX_SIZE;
   624				lnk->wr_rx_sges[x + 1].lkey =
   625						lnk->roce_pd->local_dma_lkey;
   626			}
   627			lnk->wr_rx_ibs[i].next = NULL;
   628			lnk->wr_rx_ibs[i].sg_list = &lnk->wr_rx_sges[x];
   629			lnk->wr_rx_ibs[i].num_sge = sges_per_buf;
   630		}
   631		lnk->wr_reg.wr.next = NULL;
   632		lnk->wr_reg.wr.num_sge = 0;
   633		lnk->wr_reg.wr.send_flags = IB_SEND_SIGNALED;
   634		lnk->wr_reg.wr.opcode = IB_WR_REG_MR;
   635		lnk->wr_reg.access = IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_WRITE;
   636	}
   637	
   638	void smc_wr_drain_cq(struct smc_link *lnk)
   639	{
   640		wait_event_interruptible_timeout(lnk->wr_rx_drain_wait,
 > 641						 (lnk->drained == 1),
   642						 SMC_WR_RX_WAIT_DRAIN_TIME);
   643	}
   644	

-- 
0-DAY CI Kernel Test Service
https://01.org/lkp

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH net] net/smc: Fix possible access to freed memory in link clear
  2022-08-29 14:54 [PATCH net] net/smc: Fix possible access to freed memory in link clear liuyacan
  2022-08-29 16:38 ` Tony Lu
  2022-08-30 14:31 ` [PATCH net] net/smc: Fix possible access to freed memory in link clear kernel test robot
@ 2022-08-30 16:10 ` kernel test robot
  2 siblings, 0 replies; 22+ messages in thread
From: kernel test robot @ 2022-08-30 16:10 UTC (permalink / raw)
  To: liuyacan, kgraul, davem, wenjia, edumazet, kuba, pabeni
  Cc: llvm, kbuild-all, linux-s390, netdev, linux-kernel, liuyacan

Hi,

Thank you for the patch! Yet something to improve:

[auto build test ERROR on net/master]

url:    https://github.com/intel-lab-lkp/linux/commits/liuyacan-corp-netease-com/net-smc-Fix-possible-access-to-freed-memory-in-link-clear/20220829-231821
base:   https://git.kernel.org/pub/scm/linux/kernel/git/davem/net.git cb10b0f91c5f76de981ef927e7dadec60c5a5d96
config: i386-randconfig-a013 (https://download.01.org/0day-ci/archive/20220831/202208310048.0GtZuOsl-lkp@intel.com/config)
compiler: clang version 14.0.6 (https://github.com/llvm/llvm-project f28c006a5895fc0e329fe15fead81e37457cb1d1)
reproduce (this is a W=1 build):
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # https://github.com/intel-lab-lkp/linux/commit/f8be00c954c559c7ae24f34abade4faebc350ec9
        git remote add linux-review https://github.com/intel-lab-lkp/linux
        git fetch --no-tags linux-review liuyacan-corp-netease-com/net-smc-Fix-possible-access-to-freed-memory-in-link-clear/20220829-231821
        git checkout f8be00c954c559c7ae24f34abade4faebc350ec9
        # save the config file
        mkdir build_dir && cp config build_dir/.config
        COMPILER_INSTALL_PATH=$HOME/0day COMPILER=clang make.cross W=1 O=build_dir ARCH=i386 SHELL=/bin/bash net/smc/

If you fix the issue, kindly add following tag where applicable
Reported-by: kernel test robot <lkp@intel.com>

All errors (new ones prefixed by >>):

>> net/smc/smc_wr.c:468:32: error: member reference type 'struct ib_wc' is not a pointer; did you mean to use '.'?
                                   if (link->clearing && wc[i]->wr_id == link->wr_rx_id) {
                                                         ~~~~~^~
                                                              .
>> net/smc/smc_wr.c:641:13: error: no member named 'drained' in 'struct smc_link'
                                            (lnk->drained == 1),
                                             ~~~  ^
   include/linux/wait.h:534:28: note: expanded from macro 'wait_event_interruptible_timeout'
           if (!___wait_cond_timeout(condition))                                   \
                                     ^~~~~~~~~
   include/linux/wait.h:276:17: note: expanded from macro '___wait_cond_timeout'
           bool __cond = (condition);                                              \
                          ^~~~~~~~~
>> net/smc/smc_wr.c:641:13: error: no member named 'drained' in 'struct smc_link'
                                            (lnk->drained == 1),
                                             ~~~  ^
   include/linux/wait.h:536:7: note: expanded from macro 'wait_event_interruptible_timeout'
                                                   condition, timeout);            \
                                                   ^~~~~~~~~
   include/linux/wait.h:506:46: note: expanded from macro '__wait_event_interruptible_timeout'
           ___wait_event(wq_head, ___wait_cond_timeout(condition),                 \
                                                       ^~~~~~~~~
   include/linux/wait.h:276:17: note: expanded from macro '___wait_cond_timeout'
           bool __cond = (condition);                                              \
                          ^~~~~~~~~
   include/linux/wait.h:310:7: note: expanded from macro '___wait_event'
                   if (condition)                                                  \
                       ^~~~~~~~~
   3 errors generated.


vim +468 net/smc/smc_wr.c

   449	
   450	static inline void smc_wr_rx_process_cqes(struct ib_wc wc[], int num)
   451	{
   452		struct smc_link *link;
   453		int i;
   454	
   455		for (i = 0; i < num; i++) {
   456			link = wc[i].qp->qp_context;
   457			if (wc[i].status == IB_WC_SUCCESS) {
   458				link->wr_rx_tstamp = jiffies;
   459				smc_wr_rx_demultiplex(&wc[i]);
   460				smc_wr_rx_post(link); /* refill WR RX */
   461			} else {
   462				/* handle status errors */
   463				switch (wc[i].status) {
   464				case IB_WC_RETRY_EXC_ERR:
   465				case IB_WC_RNR_RETRY_EXC_ERR:
   466				case IB_WC_WR_FLUSH_ERR:
   467					smcr_link_down_cond_sched(link);
 > 468					if (link->clearing && wc[i]->wr_id == link->wr_rx_id) {
   469						link->rx_drained = 1;
   470						wake_up(&link->wr_rx_drain_wait);
   471					}
   472					break;
   473				default:
   474					smc_wr_rx_post(link); /* refill WR RX */
   475					break;
   476				}
   477			}
   478		}
   479	}
   480	
   481	static void smc_wr_rx_tasklet_fn(struct tasklet_struct *t)
   482	{
   483		struct smc_ib_device *dev = from_tasklet(dev, t, recv_tasklet);
   484		struct ib_wc wc[SMC_WR_MAX_POLL_CQE];
   485		int polled = 0;
   486		int rc;
   487	
   488	again:
   489		polled++;
   490		do {
   491			memset(&wc, 0, sizeof(wc));
   492			rc = ib_poll_cq(dev->roce_cq_recv, SMC_WR_MAX_POLL_CQE, wc);
   493			if (polled == 1) {
   494				ib_req_notify_cq(dev->roce_cq_recv,
   495						 IB_CQ_SOLICITED_MASK
   496						 | IB_CQ_REPORT_MISSED_EVENTS);
   497			}
   498			if (!rc)
   499				break;
   500			smc_wr_rx_process_cqes(&wc[0], rc);
   501		} while (rc > 0);
   502		if (polled == 1)
   503			goto again;
   504	}
   505	
   506	void smc_wr_rx_cq_handler(struct ib_cq *ib_cq, void *cq_context)
   507	{
   508		struct smc_ib_device *dev = (struct smc_ib_device *)cq_context;
   509	
   510		tasklet_schedule(&dev->recv_tasklet);
   511	}
   512	
   513	int smc_wr_rx_post_init(struct smc_link *link)
   514	{
   515		u32 i;
   516		int rc = 0;
   517	
   518		for (i = 0; i < link->wr_rx_cnt; i++)
   519			rc = smc_wr_rx_post(link);
   520		return rc;
   521	}
   522	
   523	/***************************** init, exit, misc ******************************/
   524	
   525	void smc_wr_remember_qp_attr(struct smc_link *lnk)
   526	{
   527		struct ib_qp_attr *attr = &lnk->qp_attr;
   528		struct ib_qp_init_attr init_attr;
   529	
   530		memset(attr, 0, sizeof(*attr));
   531		memset(&init_attr, 0, sizeof(init_attr));
   532		ib_query_qp(lnk->roce_qp, attr,
   533			    IB_QP_STATE |
   534			    IB_QP_CUR_STATE |
   535			    IB_QP_PKEY_INDEX |
   536			    IB_QP_PORT |
   537			    IB_QP_QKEY |
   538			    IB_QP_AV |
   539			    IB_QP_PATH_MTU |
   540			    IB_QP_TIMEOUT |
   541			    IB_QP_RETRY_CNT |
   542			    IB_QP_RNR_RETRY |
   543			    IB_QP_RQ_PSN |
   544			    IB_QP_ALT_PATH |
   545			    IB_QP_MIN_RNR_TIMER |
   546			    IB_QP_SQ_PSN |
   547			    IB_QP_PATH_MIG_STATE |
   548			    IB_QP_CAP |
   549			    IB_QP_DEST_QPN,
   550			    &init_attr);
   551	
   552		lnk->wr_tx_cnt = min_t(size_t, SMC_WR_BUF_CNT,
   553				       lnk->qp_attr.cap.max_send_wr);
   554		lnk->wr_rx_cnt = min_t(size_t, SMC_WR_BUF_CNT * 3,
   555				       lnk->qp_attr.cap.max_recv_wr);
   556	}
   557	
   558	static void smc_wr_init_sge(struct smc_link *lnk)
   559	{
   560		int sges_per_buf = (lnk->lgr->smc_version == SMC_V2) ? 2 : 1;
   561		bool send_inline = (lnk->qp_attr.cap.max_inline_data > SMC_WR_TX_SIZE);
   562		u32 i;
   563	
   564		for (i = 0; i < lnk->wr_tx_cnt; i++) {
   565			lnk->wr_tx_sges[i].addr = send_inline ? (uintptr_t)(&lnk->wr_tx_bufs[i]) :
   566				lnk->wr_tx_dma_addr + i * SMC_WR_BUF_SIZE;
   567			lnk->wr_tx_sges[i].length = SMC_WR_TX_SIZE;
   568			lnk->wr_tx_sges[i].lkey = lnk->roce_pd->local_dma_lkey;
   569			lnk->wr_tx_rdma_sges[i].tx_rdma_sge[0].wr_tx_rdma_sge[0].lkey =
   570				lnk->roce_pd->local_dma_lkey;
   571			lnk->wr_tx_rdma_sges[i].tx_rdma_sge[0].wr_tx_rdma_sge[1].lkey =
   572				lnk->roce_pd->local_dma_lkey;
   573			lnk->wr_tx_rdma_sges[i].tx_rdma_sge[1].wr_tx_rdma_sge[0].lkey =
   574				lnk->roce_pd->local_dma_lkey;
   575			lnk->wr_tx_rdma_sges[i].tx_rdma_sge[1].wr_tx_rdma_sge[1].lkey =
   576				lnk->roce_pd->local_dma_lkey;
   577			lnk->wr_tx_ibs[i].next = NULL;
   578			lnk->wr_tx_ibs[i].sg_list = &lnk->wr_tx_sges[i];
   579			lnk->wr_tx_ibs[i].num_sge = 1;
   580			lnk->wr_tx_ibs[i].opcode = IB_WR_SEND;
   581			lnk->wr_tx_ibs[i].send_flags =
   582				IB_SEND_SIGNALED | IB_SEND_SOLICITED;
   583			if (send_inline)
   584				lnk->wr_tx_ibs[i].send_flags |= IB_SEND_INLINE;
   585			lnk->wr_tx_rdmas[i].wr_tx_rdma[0].wr.opcode = IB_WR_RDMA_WRITE;
   586			lnk->wr_tx_rdmas[i].wr_tx_rdma[1].wr.opcode = IB_WR_RDMA_WRITE;
   587			lnk->wr_tx_rdmas[i].wr_tx_rdma[0].wr.sg_list =
   588				lnk->wr_tx_rdma_sges[i].tx_rdma_sge[0].wr_tx_rdma_sge;
   589			lnk->wr_tx_rdmas[i].wr_tx_rdma[1].wr.sg_list =
   590				lnk->wr_tx_rdma_sges[i].tx_rdma_sge[1].wr_tx_rdma_sge;
   591		}
   592	
   593		if (lnk->lgr->smc_version == SMC_V2) {
   594			lnk->wr_tx_v2_sge->addr = lnk->wr_tx_v2_dma_addr;
   595			lnk->wr_tx_v2_sge->length = SMC_WR_BUF_V2_SIZE;
   596			lnk->wr_tx_v2_sge->lkey = lnk->roce_pd->local_dma_lkey;
   597	
   598			lnk->wr_tx_v2_ib->next = NULL;
   599			lnk->wr_tx_v2_ib->sg_list = lnk->wr_tx_v2_sge;
   600			lnk->wr_tx_v2_ib->num_sge = 1;
   601			lnk->wr_tx_v2_ib->opcode = IB_WR_SEND;
   602			lnk->wr_tx_v2_ib->send_flags =
   603				IB_SEND_SIGNALED | IB_SEND_SOLICITED;
   604		}
   605	
   606		/* With SMC-Rv2 there can be messages larger than SMC_WR_TX_SIZE.
   607		 * Each ib_recv_wr gets 2 sges, the second one is a spillover buffer
   608		 * and the same buffer for all sges. When a larger message arrived then
   609		 * the content of the first small sge is copied to the beginning of
   610		 * the larger spillover buffer, allowing easy data mapping.
   611		 */
   612		for (i = 0; i < lnk->wr_rx_cnt; i++) {
   613			int x = i * sges_per_buf;
   614	
   615			lnk->wr_rx_sges[x].addr =
   616				lnk->wr_rx_dma_addr + i * SMC_WR_BUF_SIZE;
   617			lnk->wr_rx_sges[x].length = SMC_WR_TX_SIZE;
   618			lnk->wr_rx_sges[x].lkey = lnk->roce_pd->local_dma_lkey;
   619			if (lnk->lgr->smc_version == SMC_V2) {
   620				lnk->wr_rx_sges[x + 1].addr =
   621						lnk->wr_rx_v2_dma_addr + SMC_WR_TX_SIZE;
   622				lnk->wr_rx_sges[x + 1].length =
   623						SMC_WR_BUF_V2_SIZE - SMC_WR_TX_SIZE;
   624				lnk->wr_rx_sges[x + 1].lkey =
   625						lnk->roce_pd->local_dma_lkey;
   626			}
   627			lnk->wr_rx_ibs[i].next = NULL;
   628			lnk->wr_rx_ibs[i].sg_list = &lnk->wr_rx_sges[x];
   629			lnk->wr_rx_ibs[i].num_sge = sges_per_buf;
   630		}
   631		lnk->wr_reg.wr.next = NULL;
   632		lnk->wr_reg.wr.num_sge = 0;
   633		lnk->wr_reg.wr.send_flags = IB_SEND_SIGNALED;
   634		lnk->wr_reg.wr.opcode = IB_WR_REG_MR;
   635		lnk->wr_reg.access = IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_WRITE;
   636	}
   637	
   638	void smc_wr_drain_cq(struct smc_link *lnk)
   639	{
   640		wait_event_interruptible_timeout(lnk->wr_rx_drain_wait,
 > 641						 (lnk->drained == 1),
   642						 SMC_WR_RX_WAIT_DRAIN_TIME);
   643	}
   644	

-- 
0-DAY CI Kernel Test Service
https://01.org/lkp

^ permalink raw reply	[flat|nested] 22+ messages in thread

* [PATCH net v4] net/smc: Fix possible access to freed memory in link clear
  2022-08-30 14:15       ` Tony Lu
@ 2022-08-31 15:53         ` liuyacan
  2022-09-01  6:51           ` Tony Lu
                             ` (2 more replies)
  2022-08-31 16:12         ` [PATCH net v2] net/smc: fix listen processing for SMC-Rv2 liuyacan
  1 sibling, 3 replies; 22+ messages in thread
From: liuyacan @ 2022-08-31 15:53 UTC (permalink / raw)
  To: tonylu
  Cc: davem, edumazet, kgraul, kuba, linux-kernel, linux-s390, liuyacan,
	netdev, ubraun, pabeni, wenjia, wintera

From: Yacan Liu <liuyacan@corp.netease.com>

After modifying the QP to the Error state, all RX WR would be completed
with WC in IB_WC_WR_FLUSH_ERR status. Current implementation does not
wait for it is done, but destroy the QP and free the link group directly.
So there is a risk that accessing the freed memory in tasklet context.

Here is a crash example:

 BUG: unable to handle page fault for address: ffffffff8f220860
 #PF: supervisor write access in kernel mode
 #PF: error_code(0x0002) - not-present page
 PGD f7300e067 P4D f7300e067 PUD f7300f063 PMD 8c4e45063 PTE 800ffff08c9df060
 Oops: 0002 [#1] SMP PTI
 CPU: 1 PID: 0 Comm: swapper/1 Kdump: loaded Tainted: G S         OE     5.10.0-0607+ #23
 Hardware name: Inspur NF5280M4/YZMB-00689-101, BIOS 4.1.20 07/09/2018
 RIP: 0010:native_queued_spin_lock_slowpath+0x176/0x1b0
 Code: f3 90 48 8b 32 48 85 f6 74 f6 eb d5 c1 ee 12 83 e0 03 83 ee 01 48 c1 e0 05 48 63 f6 48 05 00 c8 02 00 48 03 04 f5 00 09 98 8e <48> 89 10 8b 42 08 85 c0 75 09 f3 90 8b 42 08 85 c0 74 f7 48 8b 32
 RSP: 0018:ffffb3b6c001ebd8 EFLAGS: 00010086
 RAX: ffffffff8f220860 RBX: 0000000000000246 RCX: 0000000000080000
 RDX: ffff91db1f86c800 RSI: 000000000000173c RDI: ffff91db62bace00
 RBP: ffff91db62bacc00 R08: 0000000000000000 R09: c00000010000028b
 R10: 0000000000055198 R11: ffffb3b6c001ea58 R12: ffff91db80e05010
 R13: 000000000000000a R14: 0000000000000006 R15: 0000000000000040
 FS:  0000000000000000(0000) GS:ffff91db1f840000(0000) knlGS:0000000000000000
 CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
 CR2: ffffffff8f220860 CR3: 00000001f9580004 CR4: 00000000003706e0
 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
 Call Trace:
  <IRQ>
  _raw_spin_lock_irqsave+0x30/0x40
  mlx5_ib_poll_cq+0x4c/0xc50 [mlx5_ib]
  smc_wr_rx_tasklet_fn+0x56/0xa0 [smc]
  tasklet_action_common.isra.21+0x66/0x100
  __do_softirq+0xd5/0x29c
  asm_call_irq_on_stack+0x12/0x20
  </IRQ>
  do_softirq_own_stack+0x37/0x40
  irq_exit_rcu+0x9d/0xa0
  sysvec_call_function_single+0x34/0x80
  asm_sysvec_call_function_single+0x12/0x20

Fixes: bd4ad57718cc ("smc: initialize IB transport incl. PD, MR, QP, CQ, event, WR")
Signed-off-by: Yacan Liu <liuyacan@corp.netease.com>

---
Chagen in v4:
  -- Remove the rx_drain flag because smc_wr_rx_post() may not have been called.
  -- Remove timeout.
Change in v3:
  -- Tune commit message (Signed-Off tag, Fixes tag).
     Tune code to avoid column length exceeding.
Change in v2:
  -- Fix some compile warnings and errors.
---
 net/smc/smc_core.c | 2 ++
 net/smc/smc_core.h | 2 ++
 net/smc/smc_wr.c   | 9 +++++++++
 net/smc/smc_wr.h   | 1 +
 4 files changed, 14 insertions(+)

diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c
index ff49a11f5..f92a916e9 100644
--- a/net/smc/smc_core.c
+++ b/net/smc/smc_core.c
@@ -757,6 +757,7 @@ int smcr_link_init(struct smc_link_group *lgr, struct smc_link *lnk,
 	lnk->lgr = lgr;
 	smc_lgr_hold(lgr); /* lgr_put in smcr_link_clear() */
 	lnk->link_idx = link_idx;
+	lnk->wr_rx_id_compl = 0;
 	smc_ibdev_cnt_inc(lnk);
 	smcr_copy_dev_info_to_link(lnk);
 	atomic_set(&lnk->conn_cnt, 0);
@@ -1269,6 +1270,7 @@ void smcr_link_clear(struct smc_link *lnk, bool log)
 	smcr_buf_unmap_lgr(lnk);
 	smcr_rtoken_clear_link(lnk);
 	smc_ib_modify_qp_error(lnk);
+	smc_wr_drain_cq(lnk);
 	smc_wr_free_link(lnk);
 	smc_ib_destroy_queue_pair(lnk);
 	smc_ib_dealloc_protection_domain(lnk);
diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h
index fe8b524ad..285f9bd8e 100644
--- a/net/smc/smc_core.h
+++ b/net/smc/smc_core.h
@@ -115,8 +115,10 @@ struct smc_link {
 	dma_addr_t		wr_rx_dma_addr;	/* DMA address of wr_rx_bufs */
 	dma_addr_t		wr_rx_v2_dma_addr; /* DMA address of v2 rx buf*/
 	u64			wr_rx_id;	/* seq # of last recv WR */
+	u64			wr_rx_id_compl; /* seq # of last completed WR */
 	u32			wr_rx_cnt;	/* number of WR recv buffers */
 	unsigned long		wr_rx_tstamp;	/* jiffies when last buf rx */
+	wait_queue_head_t       wr_rx_empty_wait; /* wait for RQ empty */
 
 	struct ib_reg_wr	wr_reg;		/* WR register memory region */
 	wait_queue_head_t	wr_reg_wait;	/* wait for wr_reg result */
diff --git a/net/smc/smc_wr.c b/net/smc/smc_wr.c
index 26f8f240d..bc8793803 100644
--- a/net/smc/smc_wr.c
+++ b/net/smc/smc_wr.c
@@ -454,6 +454,7 @@ static inline void smc_wr_rx_process_cqes(struct ib_wc wc[], int num)
 
 	for (i = 0; i < num; i++) {
 		link = wc[i].qp->qp_context;
+		link->wr_rx_id_compl = wc[i].wr_id;
 		if (wc[i].status == IB_WC_SUCCESS) {
 			link->wr_rx_tstamp = jiffies;
 			smc_wr_rx_demultiplex(&wc[i]);
@@ -465,6 +466,8 @@ static inline void smc_wr_rx_process_cqes(struct ib_wc wc[], int num)
 			case IB_WC_RNR_RETRY_EXC_ERR:
 			case IB_WC_WR_FLUSH_ERR:
 				smcr_link_down_cond_sched(link);
+				if (link->wr_rx_id_compl == link->wr_rx_id)
+					wake_up(&link->wr_rx_empty_wait);
 				break;
 			default:
 				smc_wr_rx_post(link); /* refill WR RX */
@@ -631,6 +634,11 @@ static void smc_wr_init_sge(struct smc_link *lnk)
 	lnk->wr_reg.access = IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_WRITE;
 }
 
+void smc_wr_drain_cq(struct smc_link *lnk)
+{
+	wait_event(lnk->wr_rx_empty_wait, lnk->wr_rx_id_compl == lnk->wr_rx_id);
+}
+
 void smc_wr_free_link(struct smc_link *lnk)
 {
 	struct ib_device *ibdev;
@@ -889,6 +897,7 @@ int smc_wr_create_link(struct smc_link *lnk)
 	atomic_set(&lnk->wr_tx_refcnt, 0);
 	init_waitqueue_head(&lnk->wr_reg_wait);
 	atomic_set(&lnk->wr_reg_refcnt, 0);
+	init_waitqueue_head(&lnk->wr_rx_empty_wait);
 	return rc;
 
 dma_unmap:
diff --git a/net/smc/smc_wr.h b/net/smc/smc_wr.h
index a54e90a11..5ca5086ae 100644
--- a/net/smc/smc_wr.h
+++ b/net/smc/smc_wr.h
@@ -101,6 +101,7 @@ static inline int smc_wr_rx_post(struct smc_link *link)
 int smc_wr_create_link(struct smc_link *lnk);
 int smc_wr_alloc_link_mem(struct smc_link *lnk);
 int smc_wr_alloc_lgr_mem(struct smc_link_group *lgr);
+void smc_wr_drain_cq(struct smc_link *lnk);
 void smc_wr_free_link(struct smc_link *lnk);
 void smc_wr_free_link_mem(struct smc_link *lnk);
 void smc_wr_free_lgr_mem(struct smc_link_group *lgr);
-- 
2.20.1


^ permalink raw reply related	[flat|nested] 22+ messages in thread

* Re: [PATCH net v2] net/smc: fix listen processing for SMC-Rv2
  2022-08-30  8:31       ` Alexandra Winter
  2022-08-30 13:19         ` [PATCH net v3] net/smc: Fix possible access to freed memory in link clear liuyacan
@ 2022-08-31 16:08         ` liuyacan
  1 sibling, 0 replies; 22+ messages in thread
From: liuyacan @ 2022-08-31 16:08 UTC (permalink / raw)
  To: wintera
  Cc: davem, edumazet, kgraul, kuba, linux-kernel, linux-s390, liuyacan,
	netdev, pabeni, tonylu, wenjia

> >>> From: liuyacan <liuyacan@corp.netease.com>
> >>>
> >>> After modifying the QP to the Error state, all RX WR would be
> >>> completed with WC in IB_WC_WR_FLUSH_ERR status. Current
> >>> implementation does not wait for it is done, but free the link
> >>> directly. So there is a risk that accessing the freed link in
> >>> tasklet context.
> >>>
> >>> Here is a crash example:
> >>>
> >>>  BUG: unable to handle page fault for address: ffffffff8f220860
> >>>  #PF: supervisor write access in kernel mode
> >>>  #PF: error_code(0x0002) - not-present page
> >>>  PGD f7300e067 P4D f7300e067 PUD f7300f063 PMD 8c4e45063 PTE 800ffff08c9df060
> >>>  Oops: 0002 [#1] SMP PTI
> >>>  CPU: 1 PID: 0 Comm: swapper/1 Kdump: loaded Tainted: G S         OE     5.10.0-0607+ #23
> >>>  Hardware name: Inspur NF5280M4/YZMB-00689-101, BIOS 4.1.20 07/09/2018
> >>>  RIP: 0010:native_queued_spin_lock_slowpath+0x176/0x1b0
> >>>  Code: f3 90 48 8b 32 48 85 f6 74 f6 eb d5 c1 ee 12 83 e0 03 83 ee 01 48 c1 e0 05 48 63 f6 48 05 00 c8 02 00 48 03 04 f5 00 09 98 8e <48> 89 10 8b 42 08 85 c0 75 09 f3 90 8b 42 08 85 c0 74 f7 48 8b 32
> >>>  RSP: 0018:ffffb3b6c001ebd8 EFLAGS: 00010086
> >>>  RAX: ffffffff8f220860 RBX: 0000000000000246 RCX: 0000000000080000
> >>>  RDX: ffff91db1f86c800 RSI: 000000000000173c RDI: ffff91db62bace00
> >>>  RBP: ffff91db62bacc00 R08: 0000000000000000 R09: c00000010000028b
> >>>  R10: 0000000000055198 R11: ffffb3b6c001ea58 R12: ffff91db80e05010
> >>>  R13: 000000000000000a R14: 0000000000000006 R15: 0000000000000040
> >>>  FS:  0000000000000000(0000) GS:ffff91db1f840000(0000) knlGS:0000000000000000
> >>>  CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> >>>  CR2: ffffffff8f220860 CR3: 00000001f9580004 CR4: 00000000003706e0
> >>>  DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
> >>>  DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
> >>>  Call Trace:
> >>>   <IRQ>
> >>>   _raw_spin_lock_irqsave+0x30/0x40
> >>>   mlx5_ib_poll_cq+0x4c/0xc50 [mlx5_ib]
> >>>   smc_wr_rx_tasklet_fn+0x56/0xa0 [smc]
> >>>   tasklet_action_common.isra.21+0x66/0x100
> >>>   __do_softirq+0xd5/0x29c
> >>>   asm_call_irq_on_stack+0x12/0x20
> >>>   </IRQ>
> >>>   do_softirq_own_stack+0x37/0x40
> >>>   irq_exit_rcu+0x9d/0xa0
> >>>   sysvec_call_function_single+0x34/0x80
> >>>   asm_sysvec_call_function_single+0x12/0x20
> >>>
> >>> Signed-off-by: liuyacan <liuyacan@corp.netease.com>
> >>> ---
> >>>  net/smc/smc_core.c |  2 ++
> >>>  net/smc/smc_core.h |  2 ++
> >>>  net/smc/smc_wr.c   | 12 ++++++++++++
> >>>  net/smc/smc_wr.h   |  3 +++
> >>>  4 files changed, 19 insertions(+)
> >>>
> >>> diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c
> >>> index ff49a11f5..b632a33f1 100644
> >>> --- a/net/smc/smc_core.c
> >>> +++ b/net/smc/smc_core.c
> >>> @@ -752,6 +752,7 @@ int smcr_link_init(struct smc_link_group *lgr, struct smc_link *lnk,
> >>>  	atomic_inc(&lnk->smcibdev->lnk_cnt);
> >>>  	refcount_set(&lnk->refcnt, 1); /* link refcnt is set to 1 */
> >>>  	lnk->clearing = 0;
> >>> +	lnk->rx_drained = 0;
> >>>  	lnk->path_mtu = lnk->smcibdev->pattr[lnk->ibport - 1].active_mtu;
> >>>  	lnk->link_id = smcr_next_link_id(lgr);
> >>>  	lnk->lgr = lgr;
> >>> @@ -1269,6 +1270,7 @@ void smcr_link_clear(struct smc_link *lnk, bool log)
> >>>  	smcr_buf_unmap_lgr(lnk);
> >>>  	smcr_rtoken_clear_link(lnk);
> >>>  	smc_ib_modify_qp_error(lnk);
> >>> +	smc_wr_drain_cq(lnk);
> >>>  	smc_wr_free_link(lnk);
> >>>  	smc_ib_destroy_queue_pair(lnk);
> >>>  	smc_ib_dealloc_protection_domain(lnk);
> >>> diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h
> >>> index fe8b524ad..0a469a3e7 100644
> >>> --- a/net/smc/smc_core.h
> >>> +++ b/net/smc/smc_core.h
> >>> @@ -117,6 +117,7 @@ struct smc_link {
> >>>  	u64			wr_rx_id;	/* seq # of last recv WR */
> >>>  	u32			wr_rx_cnt;	/* number of WR recv buffers */
> >>>  	unsigned long		wr_rx_tstamp;	/* jiffies when last buf rx */
> >>> +	wait_queue_head_t       wr_rx_drain_wait; /* wait for WR drain */
> >>>  
> >>>  	struct ib_reg_wr	wr_reg;		/* WR register memory region */
> >>>  	wait_queue_head_t	wr_reg_wait;	/* wait for wr_reg result */
> >>> @@ -138,6 +139,7 @@ struct smc_link {
> >>>  	u8			link_idx;	/* index in lgr link array */
> >>>  	u8			link_is_asym;	/* is link asymmetric? */
> >>>  	u8			clearing : 1;	/* link is being cleared */
> >>> +	u8                      rx_drained : 1; /* link is drained */
> >>>  	refcount_t		refcnt;		/* link reference count */
> >>>  	struct smc_link_group	*lgr;		/* parent link group */
> >>>  	struct work_struct	link_down_wrk;	/* wrk to bring link down */
> >>> diff --git a/net/smc/smc_wr.c b/net/smc/smc_wr.c
> >>> index 26f8f240d..f9992896a 100644
> >>> --- a/net/smc/smc_wr.c
> >>> +++ b/net/smc/smc_wr.c
> >>> @@ -465,6 +465,10 @@ static inline void smc_wr_rx_process_cqes(struct ib_wc wc[], int num)
> >>>  			case IB_WC_RNR_RETRY_EXC_ERR:
> >>>  			case IB_WC_WR_FLUSH_ERR:
> >>>  				smcr_link_down_cond_sched(link);
> >>> +				if (link->clearing && wc[i]->wr_id == link->wr_rx_id) {
> >>> +					link->rx_drained = 1;
> >>> +					wake_up(&link->wr_rx_drain_wait);
> >>> +				}
> >>
> >> I am wondering if we should wait for all the wc comes back?
> > 
> > I think yes, so other processes can safely destroy qp.
> > 
> >>
> >>>  				break;
> >>>  			default:
> >>>  				smc_wr_rx_post(link); /* refill WR RX */
> >>> @@ -631,6 +635,13 @@ static void smc_wr_init_sge(struct smc_link *lnk)
> >>>  	lnk->wr_reg.access = IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_WRITE;
> >>>  }
> >>>  
> >>> +void smc_wr_drain_cq(struct smc_link *lnk)
> >>> +{
> >>> +	wait_event_interruptible_timeout(lnk->wr_rx_drain_wait,
> >>> +					 (lnk->drained == 1),
> >>> +					 SMC_WR_RX_WAIT_DRAIN_TIME);
> >>> +}
> >>
> >> Should we wait for it with timeout? It should eventually be wake up
> >> normally before freeing link. Waiting for SMC_WR_RX_WAIT_DRAIN_TIME (2s)
> >> may also have this issue, although the probability of occurrence is
> >> greatly reduced.
> > 
> > Indeed, there should logically probably be a perpetual wait here. I'm just worried if it 
> > will get stuck for some unknown reason.
> > 
> >>
> >> Cheers,
> >> Tony Lu
> > 
> > Regards,
> > Yacan
> > 
> 
> Thank you very much for working on a fix, Yacan.
> 
> Some comments to make reviewers' lives easier:
> Please use your real name for the Signed-Off tag and Mail sender (Is it Yacan Liu ?)
> (Please use the same Mail address for all your posts. In April there was a post from yacanliu@163.com. Not this one)
>
> Important: Add a Fixes tag, when sending fixes to NET

OK. I updated in the latest version (v4) 

> Is this mail really a reply to your v2? Or rather a reply to Tony's comments on v1?

It should be v1. But now v1~v3 are abandoned.

> 
> Kind regards
> Alexandra

Regards,
Yacan


^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH net v2] net/smc: fix listen processing for SMC-Rv2
  2022-08-30 14:15       ` Tony Lu
  2022-08-31 15:53         ` [PATCH net v4] net/smc: Fix possible access to freed memory in link clear liuyacan
@ 2022-08-31 16:12         ` liuyacan
  1 sibling, 0 replies; 22+ messages in thread
From: liuyacan @ 2022-08-31 16:12 UTC (permalink / raw)
  To: tonylu
  Cc: davem, edumazet, kgraul, kuba, linux-kernel, linux-s390, liuyacan,
	netdev, pabeni, wenjia

> > > > From: liuyacan <liuyacan@corp.netease.com>
> > > > 
> > > > After modifying the QP to the Error state, all RX WR would be
> > > > completed with WC in IB_WC_WR_FLUSH_ERR status. Current
> > > > implementation does not wait for it is done, but free the link
> > > > directly. So there is a risk that accessing the freed link in
> > > > tasklet context.
> > > > 
> > > > Here is a crash example:
> > > > 
> > > >  BUG: unable to handle page fault for address: ffffffff8f220860
> > > >  #PF: supervisor write access in kernel mode
> > > >  #PF: error_code(0x0002) - not-present page
> > > >  PGD f7300e067 P4D f7300e067 PUD f7300f063 PMD 8c4e45063 PTE 800ffff08c9df060
> > > >  Oops: 0002 [#1] SMP PTI
> > > >  CPU: 1 PID: 0 Comm: swapper/1 Kdump: loaded Tainted: G S         OE     5.10.0-0607+ #23
> > > >  Hardware name: Inspur NF5280M4/YZMB-00689-101, BIOS 4.1.20 07/09/2018
> > > >  RIP: 0010:native_queued_spin_lock_slowpath+0x176/0x1b0
> > > >  Code: f3 90 48 8b 32 48 85 f6 74 f6 eb d5 c1 ee 12 83 e0 03 83 ee 01 48 c1 e0 05 48 63 f6 48 05 00 c8 02 00 48 03 04 f5 00 09 98 8e <48> 89 10 8b 42 08 85 c0 75 09 f3 90 8b 42 08 85 c0 74 f7 48 8b 32
> > > >  RSP: 0018:ffffb3b6c001ebd8 EFLAGS: 00010086
> > > >  RAX: ffffffff8f220860 RBX: 0000000000000246 RCX: 0000000000080000
> > > >  RDX: ffff91db1f86c800 RSI: 000000000000173c RDI: ffff91db62bace00
> > > >  RBP: ffff91db62bacc00 R08: 0000000000000000 R09: c00000010000028b
> > > >  R10: 0000000000055198 R11: ffffb3b6c001ea58 R12: ffff91db80e05010
> > > >  R13: 000000000000000a R14: 0000000000000006 R15: 0000000000000040
> > > >  FS:  0000000000000000(0000) GS:ffff91db1f840000(0000) knlGS:0000000000000000
> > > >  CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> > > >  CR2: ffffffff8f220860 CR3: 00000001f9580004 CR4: 00000000003706e0
> > > >  DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
> > > >  DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
> > > >  Call Trace:
> > > >   <IRQ>
> > > >   _raw_spin_lock_irqsave+0x30/0x40
> > > >   mlx5_ib_poll_cq+0x4c/0xc50 [mlx5_ib]
> > > >   smc_wr_rx_tasklet_fn+0x56/0xa0 [smc]
> > > >   tasklet_action_common.isra.21+0x66/0x100
> > > >   __do_softirq+0xd5/0x29c
> > > >   asm_call_irq_on_stack+0x12/0x20
> > > >   </IRQ>
> > > >   do_softirq_own_stack+0x37/0x40
> > > >   irq_exit_rcu+0x9d/0xa0
> > > >   sysvec_call_function_single+0x34/0x80
> > > >   asm_sysvec_call_function_single+0x12/0x20
> > > > 
> > > > Signed-off-by: liuyacan <liuyacan@corp.netease.com>
> > > > ---
> > > >  net/smc/smc_core.c |  2 ++
> > > >  net/smc/smc_core.h |  2 ++
> > > >  net/smc/smc_wr.c   | 12 ++++++++++++
> > > >  net/smc/smc_wr.h   |  3 +++
> > > >  4 files changed, 19 insertions(+)
> > > > 
> > > > diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c
> > > > index ff49a11f5..b632a33f1 100644
> > > > --- a/net/smc/smc_core.c
> > > > +++ b/net/smc/smc_core.c
> > > > @@ -752,6 +752,7 @@ int smcr_link_init(struct smc_link_group *lgr, struct smc_link *lnk,
> > > >  	atomic_inc(&lnk->smcibdev->lnk_cnt);
> > > >  	refcount_set(&lnk->refcnt, 1); /* link refcnt is set to 1 */
> > > >  	lnk->clearing = 0;
> > > > +	lnk->rx_drained = 0;
> > > >  	lnk->path_mtu = lnk->smcibdev->pattr[lnk->ibport - 1].active_mtu;
> > > >  	lnk->link_id = smcr_next_link_id(lgr);
> > > >  	lnk->lgr = lgr;
> > > > @@ -1269,6 +1270,7 @@ void smcr_link_clear(struct smc_link *lnk, bool log)
> > > >  	smcr_buf_unmap_lgr(lnk);
> > > >  	smcr_rtoken_clear_link(lnk);
> > > >  	smc_ib_modify_qp_error(lnk);
> > > > +	smc_wr_drain_cq(lnk);
> > > >  	smc_wr_free_link(lnk);
> > > >  	smc_ib_destroy_queue_pair(lnk);
> > > >  	smc_ib_dealloc_protection_domain(lnk);
> > > > diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h
> > > > index fe8b524ad..0a469a3e7 100644
> > > > --- a/net/smc/smc_core.h
> > > > +++ b/net/smc/smc_core.h
> > > > @@ -117,6 +117,7 @@ struct smc_link {
> > > >  	u64			wr_rx_id;	/* seq # of last recv WR */
> > > >  	u32			wr_rx_cnt;	/* number of WR recv buffers */
> > > >  	unsigned long		wr_rx_tstamp;	/* jiffies when last buf rx */
> > > > +	wait_queue_head_t       wr_rx_drain_wait; /* wait for WR drain */
> > > >  
> > > >  	struct ib_reg_wr	wr_reg;		/* WR register memory region */
> > > >  	wait_queue_head_t	wr_reg_wait;	/* wait for wr_reg result */
> > > > @@ -138,6 +139,7 @@ struct smc_link {
> > > >  	u8			link_idx;	/* index in lgr link array */
> > > >  	u8			link_is_asym;	/* is link asymmetric? */
> > > >  	u8			clearing : 1;	/* link is being cleared */
> > > > +	u8                      rx_drained : 1; /* link is drained */
> > > >  	refcount_t		refcnt;		/* link reference count */
> > > >  	struct smc_link_group	*lgr;		/* parent link group */
> > > >  	struct work_struct	link_down_wrk;	/* wrk to bring link down */
> > > > diff --git a/net/smc/smc_wr.c b/net/smc/smc_wr.c
> > > > index 26f8f240d..f9992896a 100644
> > > > --- a/net/smc/smc_wr.c
> > > > +++ b/net/smc/smc_wr.c
> > > > @@ -465,6 +465,10 @@ static inline void smc_wr_rx_process_cqes(struct ib_wc wc[], int num)
> > > >  			case IB_WC_RNR_RETRY_EXC_ERR:
> > > >  			case IB_WC_WR_FLUSH_ERR:
> > > >  				smcr_link_down_cond_sched(link);
> > > > +				if (link->clearing && wc[i]->wr_id == link->wr_rx_id) {
> > > > +					link->rx_drained = 1;
> > > > +					wake_up(&link->wr_rx_drain_wait);
> > > > +				}
> > > 
> > > I am wondering if we should wait for all the wc comes back?
> > 
> > I think yes, so other processes can safely destroy qp.
> > 
> > > 
> > > >  				break;
> > > >  			default:
> > > >  				smc_wr_rx_post(link); /* refill WR RX */
> > > > @@ -631,6 +635,13 @@ static void smc_wr_init_sge(struct smc_link *lnk)
> > > >  	lnk->wr_reg.access = IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_WRITE;
> > > >  }
> > > >  
> > > > +void smc_wr_drain_cq(struct smc_link *lnk)
> > > > +{
> > > > +	wait_event_interruptible_timeout(lnk->wr_rx_drain_wait,
> > > > +					 (lnk->drained == 1),
> > > > +					 SMC_WR_RX_WAIT_DRAIN_TIME);
> > > > +}
> > > 
> > > Should we wait for it with timeout? It should eventually be wake up
> > > normally before freeing link. Waiting for SMC_WR_RX_WAIT_DRAIN_TIME (2s)
> > > may also have this issue, although the probability of occurrence is
> > > greatly reduced.
> > 
> > Indeed, there should logically probably be a perpetual wait here. I'm just worried if it 
> > will get stuck for some unknown reason.
> 
> IMHO, it's better to get stuck rather than to hide unknown issues. So I
> think timeout is unnecessary.

Ok, I accept your suggestion, thank you!

> 
> Cheers,
> Tony Lu

Regards,
Yacan


^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH net v4] net/smc: Fix possible access to freed memory in link clear
  2022-08-31 15:53         ` [PATCH net v4] net/smc: Fix possible access to freed memory in link clear liuyacan
@ 2022-09-01  6:51           ` Tony Lu
  2022-09-01  9:33           ` Wenjia Zhang
  2022-09-04 15:58           ` Tony Lu
  2 siblings, 0 replies; 22+ messages in thread
From: Tony Lu @ 2022-09-01  6:51 UTC (permalink / raw)
  To: liuyacan
  Cc: davem, edumazet, kgraul, kuba, linux-kernel, linux-s390, netdev,
	ubraun, pabeni, wenjia, wintera

On Wed, Aug 31, 2022 at 11:53:03PM +0800, liuyacan@corp.netease.com wrote:
> From: Yacan Liu <liuyacan@corp.netease.com>
> 
> After modifying the QP to the Error state, all RX WR would be completed
> with WC in IB_WC_WR_FLUSH_ERR status. Current implementation does not
> wait for it is done, but destroy the QP and free the link group directly.
> So there is a risk that accessing the freed memory in tasklet context.
> 
> Here is a crash example:
> 
>  BUG: unable to handle page fault for address: ffffffff8f220860
>  #PF: supervisor write access in kernel mode
>  #PF: error_code(0x0002) - not-present page
>  PGD f7300e067 P4D f7300e067 PUD f7300f063 PMD 8c4e45063 PTE 800ffff08c9df060
>  Oops: 0002 [#1] SMP PTI
>  CPU: 1 PID: 0 Comm: swapper/1 Kdump: loaded Tainted: G S         OE     5.10.0-0607+ #23
>  Hardware name: Inspur NF5280M4/YZMB-00689-101, BIOS 4.1.20 07/09/2018
>  RIP: 0010:native_queued_spin_lock_slowpath+0x176/0x1b0
>  Code: f3 90 48 8b 32 48 85 f6 74 f6 eb d5 c1 ee 12 83 e0 03 83 ee 01 48 c1 e0 05 48 63 f6 48 05 00 c8 02 00 48 03 04 f5 00 09 98 8e <48> 89 10 8b 42 08 85 c0 75 09 f3 90 8b 42 08 85 c0 74 f7 48 8b 32
>  RSP: 0018:ffffb3b6c001ebd8 EFLAGS: 00010086
>  RAX: ffffffff8f220860 RBX: 0000000000000246 RCX: 0000000000080000
>  RDX: ffff91db1f86c800 RSI: 000000000000173c RDI: ffff91db62bace00
>  RBP: ffff91db62bacc00 R08: 0000000000000000 R09: c00000010000028b
>  R10: 0000000000055198 R11: ffffb3b6c001ea58 R12: ffff91db80e05010
>  R13: 000000000000000a R14: 0000000000000006 R15: 0000000000000040
>  FS:  0000000000000000(0000) GS:ffff91db1f840000(0000) knlGS:0000000000000000
>  CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
>  CR2: ffffffff8f220860 CR3: 00000001f9580004 CR4: 00000000003706e0
>  DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
>  DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
>  Call Trace:
>   <IRQ>
>   _raw_spin_lock_irqsave+0x30/0x40
>   mlx5_ib_poll_cq+0x4c/0xc50 [mlx5_ib]
>   smc_wr_rx_tasklet_fn+0x56/0xa0 [smc]
>   tasklet_action_common.isra.21+0x66/0x100
>   __do_softirq+0xd5/0x29c
>   asm_call_irq_on_stack+0x12/0x20
>   </IRQ>
>   do_softirq_own_stack+0x37/0x40
>   irq_exit_rcu+0x9d/0xa0
>   sysvec_call_function_single+0x34/0x80
>   asm_sysvec_call_function_single+0x12/0x20
> 
> Fixes: bd4ad57718cc ("smc: initialize IB transport incl. PD, MR, QP, CQ, event, WR")
> Signed-off-by: Yacan Liu <liuyacan@corp.netease.com>
> 

Thanks for this fixes. I will test it in our environment.

Cheers,
Tony Lu

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH net v4] net/smc: Fix possible access to freed memory in link clear
  2022-08-31 15:53         ` [PATCH net v4] net/smc: Fix possible access to freed memory in link clear liuyacan
  2022-09-01  6:51           ` Tony Lu
@ 2022-09-01  9:33           ` Wenjia Zhang
  2022-09-01 12:26             ` liuyacan
  2022-09-04 15:58           ` Tony Lu
  2 siblings, 1 reply; 22+ messages in thread
From: Wenjia Zhang @ 2022-09-01  9:33 UTC (permalink / raw)
  To: liuyacan, tonylu
  Cc: davem, edumazet, kgraul, kuba, linux-kernel, linux-s390, netdev,
	ubraun, pabeni, wintera



On 31.08.22 17:53, liuyacan@corp.netease.com wrote:
> From: Yacan Liu <liuyacan@corp.netease.com>
> 
> After modifying the QP to the Error state, all RX WR would be completed
> with WC in IB_WC_WR_FLUSH_ERR status. Current implementation does not
> wait for it is done, but destroy the QP and free the link group directly.
> So there is a risk that accessing the freed memory in tasklet context.
> 
> Here is a crash example:
> 
>   BUG: unable to handle page fault for address: ffffffff8f220860
>   #PF: supervisor write access in kernel mode
>   #PF: error_code(0x0002) - not-present page
>   PGD f7300e067 P4D f7300e067 PUD f7300f063 PMD 8c4e45063 PTE 800ffff08c9df060
>   Oops: 0002 [#1] SMP PTI
>   CPU: 1 PID: 0 Comm: swapper/1 Kdump: loaded Tainted: G S         OE     5.10.0-0607+ #23
>   Hardware name: Inspur NF5280M4/YZMB-00689-101, BIOS 4.1.20 07/09/2018
>   RIP: 0010:native_queued_spin_lock_slowpath+0x176/0x1b0
>   Code: f3 90 48 8b 32 48 85 f6 74 f6 eb d5 c1 ee 12 83 e0 03 83 ee 01 48 c1 e0 05 48 63 f6 48 05 00 c8 02 00 48 03 04 f5 00 09 98 8e <48> 89 10 8b 42 08 85 c0 75 09 f3 90 8b 42 08 85 c0 74 f7 48 8b 32
>   RSP: 0018:ffffb3b6c001ebd8 EFLAGS: 00010086
>   RAX: ffffffff8f220860 RBX: 0000000000000246 RCX: 0000000000080000
>   RDX: ffff91db1f86c800 RSI: 000000000000173c RDI: ffff91db62bace00
>   RBP: ffff91db62bacc00 R08: 0000000000000000 R09: c00000010000028b
>   R10: 0000000000055198 R11: ffffb3b6c001ea58 R12: ffff91db80e05010
>   R13: 000000000000000a R14: 0000000000000006 R15: 0000000000000040
>   FS:  0000000000000000(0000) GS:ffff91db1f840000(0000) knlGS:0000000000000000
>   CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
>   CR2: ffffffff8f220860 CR3: 00000001f9580004 CR4: 00000000003706e0
>   DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
>   DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
>   Call Trace:
>    <IRQ>
>    _raw_spin_lock_irqsave+0x30/0x40
>    mlx5_ib_poll_cq+0x4c/0xc50 [mlx5_ib]
>    smc_wr_rx_tasklet_fn+0x56/0xa0 [smc]
>    tasklet_action_common.isra.21+0x66/0x100
>    __do_softirq+0xd5/0x29c
>    asm_call_irq_on_stack+0x12/0x20
>    </IRQ>
>    do_softirq_own_stack+0x37/0x40
>    irq_exit_rcu+0x9d/0xa0
>    sysvec_call_function_single+0x34/0x80
>    asm_sysvec_call_function_single+0x12/0x20
> 
> Fixes: bd4ad57718cc ("smc: initialize IB transport incl. PD, MR, QP, CQ, event, WR")
> Signed-off-by: Yacan Liu <liuyacan@corp.netease.com>
> 
> ---
> Chagen in v4:
>    -- Remove the rx_drain flag because smc_wr_rx_post() may not have been called.
>    -- Remove timeout.
> Change in v3:
>    -- Tune commit message (Signed-Off tag, Fixes tag).
>       Tune code to avoid column length exceeding.
> Change in v2:
>    -- Fix some compile warnings and errors.
> ---
>   net/smc/smc_core.c | 2 ++
>   net/smc/smc_core.h | 2 ++
>   net/smc/smc_wr.c   | 9 +++++++++
>   net/smc/smc_wr.h   | 1 +
>   4 files changed, 14 insertions(+)
> 
> diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c
> index ff49a11f5..f92a916e9 100644
> --- a/net/smc/smc_core.c
> +++ b/net/smc/smc_core.c
> @@ -757,6 +757,7 @@ int smcr_link_init(struct smc_link_group *lgr, struct smc_link *lnk,
>   	lnk->lgr = lgr;
>   	smc_lgr_hold(lgr); /* lgr_put in smcr_link_clear() */
>   	lnk->link_idx = link_idx;
> +	lnk->wr_rx_id_compl = 0;
>   	smc_ibdev_cnt_inc(lnk);
>   	smcr_copy_dev_info_to_link(lnk);
>   	atomic_set(&lnk->conn_cnt, 0);
> @@ -1269,6 +1270,7 @@ void smcr_link_clear(struct smc_link *lnk, bool log)
>   	smcr_buf_unmap_lgr(lnk);
>   	smcr_rtoken_clear_link(lnk);
>   	smc_ib_modify_qp_error(lnk);
> +	smc_wr_drain_cq(lnk);
>   	smc_wr_free_link(lnk);
>   	smc_ib_destroy_queue_pair(lnk);
>   	smc_ib_dealloc_protection_domain(lnk);
> diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h
> index fe8b524ad..285f9bd8e 100644
> --- a/net/smc/smc_core.h
> +++ b/net/smc/smc_core.h
> @@ -115,8 +115,10 @@ struct smc_link {
>   	dma_addr_t		wr_rx_dma_addr;	/* DMA address of wr_rx_bufs */
>   	dma_addr_t		wr_rx_v2_dma_addr; /* DMA address of v2 rx buf*/
>   	u64			wr_rx_id;	/* seq # of last recv WR */
> +	u64			wr_rx_id_compl; /* seq # of last completed WR */
>   	u32			wr_rx_cnt;	/* number of WR recv buffers */
>   	unsigned long		wr_rx_tstamp;	/* jiffies when last buf rx */
> +	wait_queue_head_t       wr_rx_empty_wait; /* wait for RQ empty */
>   
>   	struct ib_reg_wr	wr_reg;		/* WR register memory region */
>   	wait_queue_head_t	wr_reg_wait;	/* wait for wr_reg result */
> diff --git a/net/smc/smc_wr.c b/net/smc/smc_wr.c
> index 26f8f240d..bc8793803 100644
> --- a/net/smc/smc_wr.c
> +++ b/net/smc/smc_wr.c
> @@ -454,6 +454,7 @@ static inline void smc_wr_rx_process_cqes(struct ib_wc wc[], int num)
>   
>   	for (i = 0; i < num; i++) {
>   		link = wc[i].qp->qp_context;
> +		link->wr_rx_id_compl = wc[i].wr_id;
>   		if (wc[i].status == IB_WC_SUCCESS) {
>   			link->wr_rx_tstamp = jiffies;
>   			smc_wr_rx_demultiplex(&wc[i]);
> @@ -465,6 +466,8 @@ static inline void smc_wr_rx_process_cqes(struct ib_wc wc[], int num)
>   			case IB_WC_RNR_RETRY_EXC_ERR:
>   			case IB_WC_WR_FLUSH_ERR:
>   				smcr_link_down_cond_sched(link);
> +				if (link->wr_rx_id_compl == link->wr_rx_id)
> +					wake_up(&link->wr_rx_empty_wait);
>   				break;
>   			default:
>   				smc_wr_rx_post(link); /* refill WR RX */
> @@ -631,6 +634,11 @@ static void smc_wr_init_sge(struct smc_link *lnk)
>   	lnk->wr_reg.access = IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_WRITE;
>   }
>   
> +void smc_wr_drain_cq(struct smc_link *lnk)
> +{
> +	wait_event(lnk->wr_rx_empty_wait, lnk->wr_rx_id_compl == lnk->wr_rx_id);
> +}
> +
>   void smc_wr_free_link(struct smc_link *lnk)
>   {
>   	struct ib_device *ibdev;
> @@ -889,6 +897,7 @@ int smc_wr_create_link(struct smc_link *lnk)
>   	atomic_set(&lnk->wr_tx_refcnt, 0);
>   	init_waitqueue_head(&lnk->wr_reg_wait);
>   	atomic_set(&lnk->wr_reg_refcnt, 0);
> +	init_waitqueue_head(&lnk->wr_rx_empty_wait);
>   	return rc;
>   
>   dma_unmap:
> diff --git a/net/smc/smc_wr.h b/net/smc/smc_wr.h
> index a54e90a11..5ca5086ae 100644
> --- a/net/smc/smc_wr.h
> +++ b/net/smc/smc_wr.h
> @@ -101,6 +101,7 @@ static inline int smc_wr_rx_post(struct smc_link *link)
>   int smc_wr_create_link(struct smc_link *lnk);
>   int smc_wr_alloc_link_mem(struct smc_link *lnk);
>   int smc_wr_alloc_lgr_mem(struct smc_link_group *lgr);
> +void smc_wr_drain_cq(struct smc_link *lnk);
>   void smc_wr_free_link(struct smc_link *lnk);
>   void smc_wr_free_link_mem(struct smc_link *lnk);
>   void smc_wr_free_lgr_mem(struct smc_link_group *lgr);

Thank you @Yacan for the effort to improve our code! And Thank you @Tony 
for such valuable suggestions and testing!
I like the modification of this version. However, this is not a fix 
patch to upstream, since the patches "[PATCH net-next v2 00/10] optimize 
the parallelism of SMC-R connections" are still not applied. My 
sugguestions:
- Please talk to the author (D. Wythe <alibuda@linux.alibaba.com>) of 
those patches I mentioned above, and ask if he can take your patch as a 
part of the patch serie
- Fix patches should go to net-next
- Please send always send your new version separately, rather than as 
reply to your previous version. That makes people confused.

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH net v4] net/smc: Fix possible access to freed memory in link clear
  2022-09-01  9:33           ` Wenjia Zhang
@ 2022-09-01 12:26             ` liuyacan
  2022-09-01 12:45               ` Wenjia Zhang
  0 siblings, 1 reply; 22+ messages in thread
From: liuyacan @ 2022-09-01 12:26 UTC (permalink / raw)
  To: wenjia, alibuda
  Cc: davem, edumazet, kgraul, kuba, linux-kernel, linux-s390, liuyacan,
	netdev, pabeni, tonylu, ubraun, wintera

> > From: Yacan Liu <liuyacan@corp.netease.com>
> > 
> > After modifying the QP to the Error state, all RX WR would be completed
> > with WC in IB_WC_WR_FLUSH_ERR status. Current implementation does not
> > wait for it is done, but destroy the QP and free the link group directly.
> > So there is a risk that accessing the freed memory in tasklet context.
> > 
> > Here is a crash example:
> > 
> >   BUG: unable to handle page fault for address: ffffffff8f220860
> >   #PF: supervisor write access in kernel mode
> >   #PF: error_code(0x0002) - not-present page
> >   PGD f7300e067 P4D f7300e067 PUD f7300f063 PMD 8c4e45063 PTE 800ffff08c9df060
> >   Oops: 0002 [#1] SMP PTI
> >   CPU: 1 PID: 0 Comm: swapper/1 Kdump: loaded Tainted: G S         OE     5.10.0-0607+ #23
> >   Hardware name: Inspur NF5280M4/YZMB-00689-101, BIOS 4.1.20 07/09/2018
> >   RIP: 0010:native_queued_spin_lock_slowpath+0x176/0x1b0
> >   Code: f3 90 48 8b 32 48 85 f6 74 f6 eb d5 c1 ee 12 83 e0 03 83 ee 01 48 c1 e0 05 48 63 f6 48 05 00 c8 02 00 48 03 04 f5 00 09 98 8e <48> 89 10 8b 42 08 85 c0 75 09 f3 90 8b 42 08 85 c0 74 f7 48 8b 32
> >   RSP: 0018:ffffb3b6c001ebd8 EFLAGS: 00010086
> >   RAX: ffffffff8f220860 RBX: 0000000000000246 RCX: 0000000000080000
> >   RDX: ffff91db1f86c800 RSI: 000000000000173c RDI: ffff91db62bace00
> >   RBP: ffff91db62bacc00 R08: 0000000000000000 R09: c00000010000028b
> >   R10: 0000000000055198 R11: ffffb3b6c001ea58 R12: ffff91db80e05010
> >   R13: 000000000000000a R14: 0000000000000006 R15: 0000000000000040
> >   FS:  0000000000000000(0000) GS:ffff91db1f840000(0000) knlGS:0000000000000000
> >   CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> >   CR2: ffffffff8f220860 CR3: 00000001f9580004 CR4: 00000000003706e0
> >   DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
> >   DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
> >   Call Trace:
> >    <IRQ>
> >    _raw_spin_lock_irqsave+0x30/0x40
> >    mlx5_ib_poll_cq+0x4c/0xc50 [mlx5_ib]
> >    smc_wr_rx_tasklet_fn+0x56/0xa0 [smc]
> >    tasklet_action_common.isra.21+0x66/0x100
> >    __do_softirq+0xd5/0x29c
> >    asm_call_irq_on_stack+0x12/0x20
> >    </IRQ>
> >    do_softirq_own_stack+0x37/0x40
> >    irq_exit_rcu+0x9d/0xa0
> >    sysvec_call_function_single+0x34/0x80
> >    asm_sysvec_call_function_single+0x12/0x20
> > 
> > Fixes: bd4ad57718cc ("smc: initialize IB transport incl. PD, MR, QP, CQ, event, WR")
> > Signed-off-by: Yacan Liu <liuyacan@corp.netease.com>
> > 
> > ---
> > Chagen in v4:
> >    -- Remove the rx_drain flag because smc_wr_rx_post() may not have been called.
> >    -- Remove timeout.
> > Change in v3:
> >    -- Tune commit message (Signed-Off tag, Fixes tag).
> >       Tune code to avoid column length exceeding.
> > Change in v2:
> >    -- Fix some compile warnings and errors.
> > ---
> >   net/smc/smc_core.c | 2 ++
> >   net/smc/smc_core.h | 2 ++
> >   net/smc/smc_wr.c   | 9 +++++++++
> >   net/smc/smc_wr.h   | 1 +
> >   4 files changed, 14 insertions(+)
> > 
> > diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c
> > index ff49a11f5..f92a916e9 100644
> > --- a/net/smc/smc_core.c
> > +++ b/net/smc/smc_core.c
> > @@ -757,6 +757,7 @@ int smcr_link_init(struct smc_link_group *lgr, struct smc_link *lnk,
> >   	lnk->lgr = lgr;
> >   	smc_lgr_hold(lgr); /* lgr_put in smcr_link_clear() */
> >   	lnk->link_idx = link_idx;
> > +	lnk->wr_rx_id_compl = 0;
> >   	smc_ibdev_cnt_inc(lnk);
> >   	smcr_copy_dev_info_to_link(lnk);
> >   	atomic_set(&lnk->conn_cnt, 0);
> > @@ -1269,6 +1270,7 @@ void smcr_link_clear(struct smc_link *lnk, bool log)
> >   	smcr_buf_unmap_lgr(lnk);
> >   	smcr_rtoken_clear_link(lnk);
> >   	smc_ib_modify_qp_error(lnk);
> > +	smc_wr_drain_cq(lnk);
> >   	smc_wr_free_link(lnk);
> >   	smc_ib_destroy_queue_pair(lnk);
> >   	smc_ib_dealloc_protection_domain(lnk);
> > diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h
> > index fe8b524ad..285f9bd8e 100644
> > --- a/net/smc/smc_core.h
> > +++ b/net/smc/smc_core.h
> > @@ -115,8 +115,10 @@ struct smc_link {
> >   	dma_addr_t		wr_rx_dma_addr;	/* DMA address of wr_rx_bufs */
> >   	dma_addr_t		wr_rx_v2_dma_addr; /* DMA address of v2 rx buf*/
> >   	u64			wr_rx_id;	/* seq # of last recv WR */
> > +	u64			wr_rx_id_compl; /* seq # of last completed WR */
> >   	u32			wr_rx_cnt;	/* number of WR recv buffers */
> >   	unsigned long		wr_rx_tstamp;	/* jiffies when last buf rx */
> > +	wait_queue_head_t       wr_rx_empty_wait; /* wait for RQ empty */
> >   
> >   	struct ib_reg_wr	wr_reg;		/* WR register memory region */
> >   	wait_queue_head_t	wr_reg_wait;	/* wait for wr_reg result */
> > diff --git a/net/smc/smc_wr.c b/net/smc/smc_wr.c
> > index 26f8f240d..bc8793803 100644
> > --- a/net/smc/smc_wr.c
> > +++ b/net/smc/smc_wr.c
> > @@ -454,6 +454,7 @@ static inline void smc_wr_rx_process_cqes(struct ib_wc wc[], int num)
> >   
> >   	for (i = 0; i < num; i++) {
> >   		link = wc[i].qp->qp_context;
> > +		link->wr_rx_id_compl = wc[i].wr_id;
> >   		if (wc[i].status == IB_WC_SUCCESS) {
> >   			link->wr_rx_tstamp = jiffies;
> >   			smc_wr_rx_demultiplex(&wc[i]);
> > @@ -465,6 +466,8 @@ static inline void smc_wr_rx_process_cqes(struct ib_wc wc[], int num)
> >   			case IB_WC_RNR_RETRY_EXC_ERR:
> >   			case IB_WC_WR_FLUSH_ERR:
> >   				smcr_link_down_cond_sched(link);
> > +				if (link->wr_rx_id_compl == link->wr_rx_id)
> > +					wake_up(&link->wr_rx_empty_wait);
> >   				break;
> >   			default:
> >   				smc_wr_rx_post(link); /* refill WR RX */
> > @@ -631,6 +634,11 @@ static void smc_wr_init_sge(struct smc_link *lnk)
> >   	lnk->wr_reg.access = IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_WRITE;
> >   }
> >   
> > +void smc_wr_drain_cq(struct smc_link *lnk)
> > +{
> > +	wait_event(lnk->wr_rx_empty_wait, lnk->wr_rx_id_compl == lnk->wr_rx_id);
> > +}
> > +
> >   void smc_wr_free_link(struct smc_link *lnk)
> >   {
> >   	struct ib_device *ibdev;
> > @@ -889,6 +897,7 @@ int smc_wr_create_link(struct smc_link *lnk)
> >   	atomic_set(&lnk->wr_tx_refcnt, 0);
> >   	init_waitqueue_head(&lnk->wr_reg_wait);
> >   	atomic_set(&lnk->wr_reg_refcnt, 0);
> > +	init_waitqueue_head(&lnk->wr_rx_empty_wait);
> >   	return rc;
> >   
> >   dma_unmap:
> > diff --git a/net/smc/smc_wr.h b/net/smc/smc_wr.h
> > index a54e90a11..5ca5086ae 100644
> > --- a/net/smc/smc_wr.h
> > +++ b/net/smc/smc_wr.h
> > @@ -101,6 +101,7 @@ static inline int smc_wr_rx_post(struct smc_link *link)
> >   int smc_wr_create_link(struct smc_link *lnk);
> >   int smc_wr_alloc_link_mem(struct smc_link *lnk);
> >   int smc_wr_alloc_lgr_mem(struct smc_link_group *lgr);
> > +void smc_wr_drain_cq(struct smc_link *lnk);
> >   void smc_wr_free_link(struct smc_link *lnk);
> >   void smc_wr_free_link_mem(struct smc_link *lnk);
> >   void smc_wr_free_lgr_mem(struct smc_link_group *lgr);
> 
> Thank you @Yacan for the effort to improve our code! And Thank you @Tony 
> for such valuable suggestions and testing!
> I like the modification of this version. However, this is not a fix 
> patch to upstream, since the patches "[PATCH net-next v2 00/10] optimize 
> the parallelism of SMC-R connections" are still not applied. My 
> sugguestions:
> - Please talk to the author (D. Wythe <alibuda@linux.alibaba.com>) of 
> those patches I mentioned above, and ask if he can take your patch as a 
> part of the patch serie
> - Fix patches should go to net-next
> - Please send always send your new version separately, rather than as 
> reply to your previous version. That makes people confused.

@Wenjia, Thanks a lot for your suggestions and guidance ! 

@D. Wythe, Can you include this patch in your series of patches if it is 
convenient?

Regards,
Yacan


^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH net v4] net/smc: Fix possible access to freed memory in link clear
  2022-09-01 12:26             ` liuyacan
@ 2022-09-01 12:45               ` Wenjia Zhang
  2022-09-01 12:54                 ` liuyacan
  0 siblings, 1 reply; 22+ messages in thread
From: Wenjia Zhang @ 2022-09-01 12:45 UTC (permalink / raw)
  To: liuyacan
  Cc: davem, edumazet, kgraul, kuba, linux-kernel, linux-s390, netdev,
	pabeni, tonylu, ubraun, wintera, alibuda



On 01.09.22 14:26, liuyacan@corp.netease.com wrote:
>>> From: Yacan Liu <liuyacan@corp.netease.com>
>>>
>>> After modifying the QP to the Error state, all RX WR would be completed
>>> with WC in IB_WC_WR_FLUSH_ERR status. Current implementation does not
>>> wait for it is done, but destroy the QP and free the link group directly.
>>> So there is a risk that accessing the freed memory in tasklet context.
>>>
>>> Here is a crash example:
>>>
>>>    BUG: unable to handle page fault for address: ffffffff8f220860
>>>    #PF: supervisor write access in kernel mode
>>>    #PF: error_code(0x0002) - not-present page
>>>    PGD f7300e067 P4D f7300e067 PUD f7300f063 PMD 8c4e45063 PTE 800ffff08c9df060
>>>    Oops: 0002 [#1] SMP PTI
>>>    CPU: 1 PID: 0 Comm: swapper/1 Kdump: loaded Tainted: G S         OE     5.10.0-0607+ #23
>>>    Hardware name: Inspur NF5280M4/YZMB-00689-101, BIOS 4.1.20 07/09/2018
>>>    RIP: 0010:native_queued_spin_lock_slowpath+0x176/0x1b0
>>>    Code: f3 90 48 8b 32 48 85 f6 74 f6 eb d5 c1 ee 12 83 e0 03 83 ee 01 48 c1 e0 05 48 63 f6 48 05 00 c8 02 00 48 03 04 f5 00 09 98 8e <48> 89 10 8b 42 08 85 c0 75 09 f3 90 8b 42 08 85 c0 74 f7 48 8b 32
>>>    RSP: 0018:ffffb3b6c001ebd8 EFLAGS: 00010086
>>>    RAX: ffffffff8f220860 RBX: 0000000000000246 RCX: 0000000000080000
>>>    RDX: ffff91db1f86c800 RSI: 000000000000173c RDI: ffff91db62bace00
>>>    RBP: ffff91db62bacc00 R08: 0000000000000000 R09: c00000010000028b
>>>    R10: 0000000000055198 R11: ffffb3b6c001ea58 R12: ffff91db80e05010
>>>    R13: 000000000000000a R14: 0000000000000006 R15: 0000000000000040
>>>    FS:  0000000000000000(0000) GS:ffff91db1f840000(0000) knlGS:0000000000000000
>>>    CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
>>>    CR2: ffffffff8f220860 CR3: 00000001f9580004 CR4: 00000000003706e0
>>>    DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
>>>    DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
>>>    Call Trace:
>>>     <IRQ>
>>>     _raw_spin_lock_irqsave+0x30/0x40
>>>     mlx5_ib_poll_cq+0x4c/0xc50 [mlx5_ib]
>>>     smc_wr_rx_tasklet_fn+0x56/0xa0 [smc]
>>>     tasklet_action_common.isra.21+0x66/0x100
>>>     __do_softirq+0xd5/0x29c
>>>     asm_call_irq_on_stack+0x12/0x20
>>>     </IRQ>
>>>     do_softirq_own_stack+0x37/0x40
>>>     irq_exit_rcu+0x9d/0xa0
>>>     sysvec_call_function_single+0x34/0x80
>>>     asm_sysvec_call_function_single+0x12/0x20
>>>
>>> Fixes: bd4ad57718cc ("smc: initialize IB transport incl. PD, MR, QP, CQ, event, WR")
>>> Signed-off-by: Yacan Liu <liuyacan@corp.netease.com>
>>>
>>> ---
>>> Chagen in v4:
>>>     -- Remove the rx_drain flag because smc_wr_rx_post() may not have been called.
>>>     -- Remove timeout.
>>> Change in v3:
>>>     -- Tune commit message (Signed-Off tag, Fixes tag).
>>>        Tune code to avoid column length exceeding.
>>> Change in v2:
>>>     -- Fix some compile warnings and errors.
>>> ---
>>>    net/smc/smc_core.c | 2 ++
>>>    net/smc/smc_core.h | 2 ++
>>>    net/smc/smc_wr.c   | 9 +++++++++
>>>    net/smc/smc_wr.h   | 1 +
>>>    4 files changed, 14 insertions(+)
>>>
>>> diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c
>>> index ff49a11f5..f92a916e9 100644
>>> --- a/net/smc/smc_core.c
>>> +++ b/net/smc/smc_core.c
>>> @@ -757,6 +757,7 @@ int smcr_link_init(struct smc_link_group *lgr, struct smc_link *lnk,
>>>    	lnk->lgr = lgr;
>>>    	smc_lgr_hold(lgr); /* lgr_put in smcr_link_clear() */
>>>    	lnk->link_idx = link_idx;
>>> +	lnk->wr_rx_id_compl = 0;
>>>    	smc_ibdev_cnt_inc(lnk);
>>>    	smcr_copy_dev_info_to_link(lnk);
>>>    	atomic_set(&lnk->conn_cnt, 0);
>>> @@ -1269,6 +1270,7 @@ void smcr_link_clear(struct smc_link *lnk, bool log)
>>>    	smcr_buf_unmap_lgr(lnk);
>>>    	smcr_rtoken_clear_link(lnk);
>>>    	smc_ib_modify_qp_error(lnk);
>>> +	smc_wr_drain_cq(lnk);
>>>    	smc_wr_free_link(lnk);
>>>    	smc_ib_destroy_queue_pair(lnk);
>>>    	smc_ib_dealloc_protection_domain(lnk);
>>> diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h
>>> index fe8b524ad..285f9bd8e 100644
>>> --- a/net/smc/smc_core.h
>>> +++ b/net/smc/smc_core.h
>>> @@ -115,8 +115,10 @@ struct smc_link {
>>>    	dma_addr_t		wr_rx_dma_addr;	/* DMA address of wr_rx_bufs */
>>>    	dma_addr_t		wr_rx_v2_dma_addr; /* DMA address of v2 rx buf*/
>>>    	u64			wr_rx_id;	/* seq # of last recv WR */
>>> +	u64			wr_rx_id_compl; /* seq # of last completed WR */
>>>    	u32			wr_rx_cnt;	/* number of WR recv buffers */
>>>    	unsigned long		wr_rx_tstamp;	/* jiffies when last buf rx */
>>> +	wait_queue_head_t       wr_rx_empty_wait; /* wait for RQ empty */
>>>    
>>>    	struct ib_reg_wr	wr_reg;		/* WR register memory region */
>>>    	wait_queue_head_t	wr_reg_wait;	/* wait for wr_reg result */
>>> diff --git a/net/smc/smc_wr.c b/net/smc/smc_wr.c
>>> index 26f8f240d..bc8793803 100644
>>> --- a/net/smc/smc_wr.c
>>> +++ b/net/smc/smc_wr.c
>>> @@ -454,6 +454,7 @@ static inline void smc_wr_rx_process_cqes(struct ib_wc wc[], int num)
>>>    
>>>    	for (i = 0; i < num; i++) {
>>>    		link = wc[i].qp->qp_context;
>>> +		link->wr_rx_id_compl = wc[i].wr_id;
>>>    		if (wc[i].status == IB_WC_SUCCESS) {
>>>    			link->wr_rx_tstamp = jiffies;
>>>    			smc_wr_rx_demultiplex(&wc[i]);
>>> @@ -465,6 +466,8 @@ static inline void smc_wr_rx_process_cqes(struct ib_wc wc[], int num)
>>>    			case IB_WC_RNR_RETRY_EXC_ERR:
>>>    			case IB_WC_WR_FLUSH_ERR:
>>>    				smcr_link_down_cond_sched(link);
>>> +				if (link->wr_rx_id_compl == link->wr_rx_id)
>>> +					wake_up(&link->wr_rx_empty_wait);
>>>    				break;
>>>    			default:
>>>    				smc_wr_rx_post(link); /* refill WR RX */
>>> @@ -631,6 +634,11 @@ static void smc_wr_init_sge(struct smc_link *lnk)
>>>    	lnk->wr_reg.access = IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_WRITE;
>>>    }
>>>    
>>> +void smc_wr_drain_cq(struct smc_link *lnk)
>>> +{
>>> +	wait_event(lnk->wr_rx_empty_wait, lnk->wr_rx_id_compl == lnk->wr_rx_id);
>>> +}
>>> +
>>>    void smc_wr_free_link(struct smc_link *lnk)
>>>    {
>>>    	struct ib_device *ibdev;
>>> @@ -889,6 +897,7 @@ int smc_wr_create_link(struct smc_link *lnk)
>>>    	atomic_set(&lnk->wr_tx_refcnt, 0);
>>>    	init_waitqueue_head(&lnk->wr_reg_wait);
>>>    	atomic_set(&lnk->wr_reg_refcnt, 0);
>>> +	init_waitqueue_head(&lnk->wr_rx_empty_wait);
>>>    	return rc;
>>>    
>>>    dma_unmap:
>>> diff --git a/net/smc/smc_wr.h b/net/smc/smc_wr.h
>>> index a54e90a11..5ca5086ae 100644
>>> --- a/net/smc/smc_wr.h
>>> +++ b/net/smc/smc_wr.h
>>> @@ -101,6 +101,7 @@ static inline int smc_wr_rx_post(struct smc_link *link)
>>>    int smc_wr_create_link(struct smc_link *lnk);
>>>    int smc_wr_alloc_link_mem(struct smc_link *lnk);
>>>    int smc_wr_alloc_lgr_mem(struct smc_link_group *lgr);
>>> +void smc_wr_drain_cq(struct smc_link *lnk);
>>>    void smc_wr_free_link(struct smc_link *lnk);
>>>    void smc_wr_free_link_mem(struct smc_link *lnk);
>>>    void smc_wr_free_lgr_mem(struct smc_link_group *lgr);
>>
>> Thank you @Yacan for the effort to improve our code! And Thank you @Tony
>> for such valuable suggestions and testing!
>> I like the modification of this version. However, this is not a fix
>> patch to upstream, since the patches "[PATCH net-next v2 00/10] optimize
>> the parallelism of SMC-R connections" are still not applied. My
>> sugguestions:
>> - Please talk to the author (D. Wythe <alibuda@linux.alibaba.com>) of
>> those patches I mentioned above, and ask if he can take your patch as a
>> part of the patch serie
>> - Fix patches should go to net-next
>> - Please send always send your new version separately, rather than as
>> reply to your previous version. That makes people confused.
> 
> @Wenjia, Thanks a lot for your suggestions and guidance !
> 
> @D. Wythe, Can you include this patch in your series of patches if it is
> convenient?
> 
> Regards,
> Yacan
> 
One point I was confused, fixes should goto net, sorry!

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH net v4] net/smc: Fix possible access to freed memory in link clear
  2022-09-01 12:45               ` Wenjia Zhang
@ 2022-09-01 12:54                 ` liuyacan
  2022-09-01 19:16                   ` Wenjia Zhang
  0 siblings, 1 reply; 22+ messages in thread
From: liuyacan @ 2022-09-01 12:54 UTC (permalink / raw)
  To: wenjia, alibuda
  Cc: davem, edumazet, kgraul, kuba, linux-kernel, linux-s390, liuyacan,
	netdev, pabeni, tonylu, ubraun, wintera

> >>> From: Yacan Liu <liuyacan@corp.netease.com>
> >>>
> >>> After modifying the QP to the Error state, all RX WR would be completed
> >>> with WC in IB_WC_WR_FLUSH_ERR status. Current implementation does not
> >>> wait for it is done, but destroy the QP and free the link group directly.
> >>> So there is a risk that accessing the freed memory in tasklet context.
> >>>
> >>> Here is a crash example:
> >>>
> >>>    BUG: unable to handle page fault for address: ffffffff8f220860
> >>>    #PF: supervisor write access in kernel mode
> >>>    #PF: error_code(0x0002) - not-present page
> >>>    PGD f7300e067 P4D f7300e067 PUD f7300f063 PMD 8c4e45063 PTE 800ffff08c9df060
> >>>    Oops: 0002 [#1] SMP PTI
> >>>    CPU: 1 PID: 0 Comm: swapper/1 Kdump: loaded Tainted: G S         OE     5.10.0-0607+ #23
> >>>    Hardware name: Inspur NF5280M4/YZMB-00689-101, BIOS 4.1.20 07/09/2018
> >>>    RIP: 0010:native_queued_spin_lock_slowpath+0x176/0x1b0
> >>>    Code: f3 90 48 8b 32 48 85 f6 74 f6 eb d5 c1 ee 12 83 e0 03 83 ee 01 48 c1 e0 05 48 63 f6 48 05 00 c8 02 00 48 03 04 f5 00 09 98 8e <48> 89 10 8b 42 08 85 c0 75 09 f3 90 8b 42 08 85 c0 74 f7 48 8b 32
> >>>    RSP: 0018:ffffb3b6c001ebd8 EFLAGS: 00010086
> >>>    RAX: ffffffff8f220860 RBX: 0000000000000246 RCX: 0000000000080000
> >>>    RDX: ffff91db1f86c800 RSI: 000000000000173c RDI: ffff91db62bace00
> >>>    RBP: ffff91db62bacc00 R08: 0000000000000000 R09: c00000010000028b
> >>>    R10: 0000000000055198 R11: ffffb3b6c001ea58 R12: ffff91db80e05010
> >>>    R13: 000000000000000a R14: 0000000000000006 R15: 0000000000000040
> >>>    FS:  0000000000000000(0000) GS:ffff91db1f840000(0000) knlGS:0000000000000000
> >>>    CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> >>>    CR2: ffffffff8f220860 CR3: 00000001f9580004 CR4: 00000000003706e0
> >>>    DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
> >>>    DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
> >>>    Call Trace:
> >>>     <IRQ>
> >>>     _raw_spin_lock_irqsave+0x30/0x40
> >>>     mlx5_ib_poll_cq+0x4c/0xc50 [mlx5_ib]
> >>>     smc_wr_rx_tasklet_fn+0x56/0xa0 [smc]
> >>>     tasklet_action_common.isra.21+0x66/0x100
> >>>     __do_softirq+0xd5/0x29c
> >>>     asm_call_irq_on_stack+0x12/0x20
> >>>     </IRQ>
> >>>     do_softirq_own_stack+0x37/0x40
> >>>     irq_exit_rcu+0x9d/0xa0
> >>>     sysvec_call_function_single+0x34/0x80
> >>>     asm_sysvec_call_function_single+0x12/0x20
> >>>
> >>> Fixes: bd4ad57718cc ("smc: initialize IB transport incl. PD, MR, QP, CQ, event, WR")
> >>> Signed-off-by: Yacan Liu <liuyacan@corp.netease.com>
> >>>
> >>> ---
> >>> Chagen in v4:
> >>>     -- Remove the rx_drain flag because smc_wr_rx_post() may not have been called.
> >>>     -- Remove timeout.
> >>> Change in v3:
> >>>     -- Tune commit message (Signed-Off tag, Fixes tag).
> >>>        Tune code to avoid column length exceeding.
> >>> Change in v2:
> >>>     -- Fix some compile warnings and errors.
> >>> ---
> >>>    net/smc/smc_core.c | 2 ++
> >>>    net/smc/smc_core.h | 2 ++
> >>>    net/smc/smc_wr.c   | 9 +++++++++
> >>>    net/smc/smc_wr.h   | 1 +
> >>>    4 files changed, 14 insertions(+)
> >>>
> >>> diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c
> >>> index ff49a11f5..f92a916e9 100644
> >>> --- a/net/smc/smc_core.c
> >>> +++ b/net/smc/smc_core.c
> >>> @@ -757,6 +757,7 @@ int smcr_link_init(struct smc_link_group *lgr, struct smc_link *lnk,
> >>>    	lnk->lgr = lgr;
> >>>    	smc_lgr_hold(lgr); /* lgr_put in smcr_link_clear() */
> >>>    	lnk->link_idx = link_idx;
> >>> +	lnk->wr_rx_id_compl = 0;
> >>>    	smc_ibdev_cnt_inc(lnk);
> >>>    	smcr_copy_dev_info_to_link(lnk);
> >>>    	atomic_set(&lnk->conn_cnt, 0);
> >>> @@ -1269,6 +1270,7 @@ void smcr_link_clear(struct smc_link *lnk, bool log)
> >>>    	smcr_buf_unmap_lgr(lnk);
> >>>    	smcr_rtoken_clear_link(lnk);
> >>>    	smc_ib_modify_qp_error(lnk);
> >>> +	smc_wr_drain_cq(lnk);
> >>>    	smc_wr_free_link(lnk);
> >>>    	smc_ib_destroy_queue_pair(lnk);
> >>>    	smc_ib_dealloc_protection_domain(lnk);
> >>> diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h
> >>> index fe8b524ad..285f9bd8e 100644
> >>> --- a/net/smc/smc_core.h
> >>> +++ b/net/smc/smc_core.h
> >>> @@ -115,8 +115,10 @@ struct smc_link {
> >>>    	dma_addr_t		wr_rx_dma_addr;	/* DMA address of wr_rx_bufs */
> >>>    	dma_addr_t		wr_rx_v2_dma_addr; /* DMA address of v2 rx buf*/
> >>>    	u64			wr_rx_id;	/* seq # of last recv WR */
> >>> +	u64			wr_rx_id_compl; /* seq # of last completed WR */
> >>>    	u32			wr_rx_cnt;	/* number of WR recv buffers */
> >>>    	unsigned long		wr_rx_tstamp;	/* jiffies when last buf rx */
> >>> +	wait_queue_head_t       wr_rx_empty_wait; /* wait for RQ empty */
> >>>    
> >>>    	struct ib_reg_wr	wr_reg;		/* WR register memory region */
> >>>    	wait_queue_head_t	wr_reg_wait;	/* wait for wr_reg result */
> >>> diff --git a/net/smc/smc_wr.c b/net/smc/smc_wr.c
> >>> index 26f8f240d..bc8793803 100644
> >>> --- a/net/smc/smc_wr.c
> >>> +++ b/net/smc/smc_wr.c
> >>> @@ -454,6 +454,7 @@ static inline void smc_wr_rx_process_cqes(struct ib_wc wc[], int num)
> >>>    
> >>>    	for (i = 0; i < num; i++) {
> >>>    		link = wc[i].qp->qp_context;
> >>> +		link->wr_rx_id_compl = wc[i].wr_id;
> >>>    		if (wc[i].status == IB_WC_SUCCESS) {
> >>>    			link->wr_rx_tstamp = jiffies;
> >>>    			smc_wr_rx_demultiplex(&wc[i]);
> >>> @@ -465,6 +466,8 @@ static inline void smc_wr_rx_process_cqes(struct ib_wc wc[], int num)
> >>>    			case IB_WC_RNR_RETRY_EXC_ERR:
> >>>    			case IB_WC_WR_FLUSH_ERR:
> >>>    				smcr_link_down_cond_sched(link);
> >>> +				if (link->wr_rx_id_compl == link->wr_rx_id)
> >>> +					wake_up(&link->wr_rx_empty_wait);
> >>>    				break;
> >>>    			default:
> >>>    				smc_wr_rx_post(link); /* refill WR RX */
> >>> @@ -631,6 +634,11 @@ static void smc_wr_init_sge(struct smc_link *lnk)
> >>>    	lnk->wr_reg.access = IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_WRITE;
> >>>    }
> >>>    
> >>> +void smc_wr_drain_cq(struct smc_link *lnk)
> >>> +{
> >>> +	wait_event(lnk->wr_rx_empty_wait, lnk->wr_rx_id_compl == lnk->wr_rx_id);
> >>> +}
> >>> +
> >>>    void smc_wr_free_link(struct smc_link *lnk)
> >>>    {
> >>>    	struct ib_device *ibdev;
> >>> @@ -889,6 +897,7 @@ int smc_wr_create_link(struct smc_link *lnk)
> >>>    	atomic_set(&lnk->wr_tx_refcnt, 0);
> >>>    	init_waitqueue_head(&lnk->wr_reg_wait);
> >>>    	atomic_set(&lnk->wr_reg_refcnt, 0);
> >>> +	init_waitqueue_head(&lnk->wr_rx_empty_wait);
> >>>    	return rc;
> >>>    
> >>>    dma_unmap:
> >>> diff --git a/net/smc/smc_wr.h b/net/smc/smc_wr.h
> >>> index a54e90a11..5ca5086ae 100644
> >>> --- a/net/smc/smc_wr.h
> >>> +++ b/net/smc/smc_wr.h
> >>> @@ -101,6 +101,7 @@ static inline int smc_wr_rx_post(struct smc_link *link)
> >>>    int smc_wr_create_link(struct smc_link *lnk);
> >>>    int smc_wr_alloc_link_mem(struct smc_link *lnk);
> >>>    int smc_wr_alloc_lgr_mem(struct smc_link_group *lgr);
> >>> +void smc_wr_drain_cq(struct smc_link *lnk);
> >>>    void smc_wr_free_link(struct smc_link *lnk);
> >>>    void smc_wr_free_link_mem(struct smc_link *lnk);
> >>>    void smc_wr_free_lgr_mem(struct smc_link_group *lgr);
> >>
> >> Thank you @Yacan for the effort to improve our code! And Thank you @Tony
> >> for such valuable suggestions and testing!
> >> I like the modification of this version. However, this is not a fix
> >> patch to upstream, since the patches "[PATCH net-next v2 00/10] optimize
> >> the parallelism of SMC-R connections" are still not applied. My
> >> sugguestions:
> >> - Please talk to the author (D. Wythe <alibuda@linux.alibaba.com>) of
> >> those patches I mentioned above, and ask if he can take your patch as a
> >> part of the patch serie
> >> - Fix patches should go to net-next
> >> - Please send always send your new version separately, rather than as
> >> reply to your previous version. That makes people confused.
> > 
> > @Wenjia, Thanks a lot for your suggestions and guidance !
> > 
> > @D. Wythe, Can you include this patch in your series of patches if it is
> > convenient?
> > 
> > Regards,
> > Yacan
> > 
> One point I was confused, fixes should goto net, sorry!

Well, @D. Wythe, please ignore the above emails, sorry!

Regards,
Yacan


^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH net v4] net/smc: Fix possible access to freed memory in link clear
  2022-09-01 12:54                 ` liuyacan
@ 2022-09-01 19:16                   ` Wenjia Zhang
  2022-09-02  2:16                     ` liuyacan
  0 siblings, 1 reply; 22+ messages in thread
From: Wenjia Zhang @ 2022-09-01 19:16 UTC (permalink / raw)
  To: liuyacan, alibuda
  Cc: davem, edumazet, kgraul, kuba, linux-kernel, linux-s390, netdev,
	pabeni, tonylu, ubraun, wintera



On 01.09.22 14:54, liuyacan@corp.netease.com wrote:
>>>>> From: Yacan Liu <liuyacan@corp.netease.com>
>>>>>
>>>>> After modifying the QP to the Error state, all RX WR would be completed
>>>>> with WC in IB_WC_WR_FLUSH_ERR status. Current implementation does not
>>>>> wait for it is done, but destroy the QP and free the link group directly.
>>>>> So there is a risk that accessing the freed memory in tasklet context.
>>>>>
>>>>> Here is a crash example:
>>>>>
>>>>>     BUG: unable to handle page fault for address: ffffffff8f220860
>>>>>     #PF: supervisor write access in kernel mode
>>>>>     #PF: error_code(0x0002) - not-present page
>>>>>     PGD f7300e067 P4D f7300e067 PUD f7300f063 PMD 8c4e45063 PTE 800ffff08c9df060
>>>>>     Oops: 0002 [#1] SMP PTI
>>>>>     CPU: 1 PID: 0 Comm: swapper/1 Kdump: loaded Tainted: G S         OE     5.10.0-0607+ #23
>>>>>     Hardware name: Inspur NF5280M4/YZMB-00689-101, BIOS 4.1.20 07/09/2018
>>>>>     RIP: 0010:native_queued_spin_lock_slowpath+0x176/0x1b0
>>>>>     Code: f3 90 48 8b 32 48 85 f6 74 f6 eb d5 c1 ee 12 83 e0 03 83 ee 01 48 c1 e0 05 48 63 f6 48 05 00 c8 02 00 48 03 04 f5 00 09 98 8e <48> 89 10 8b 42 08 85 c0 75 09 f3 90 8b 42 08 85 c0 74 f7 48 8b 32
>>>>>     RSP: 0018:ffffb3b6c001ebd8 EFLAGS: 00010086
>>>>>     RAX: ffffffff8f220860 RBX: 0000000000000246 RCX: 0000000000080000
>>>>>     RDX: ffff91db1f86c800 RSI: 000000000000173c RDI: ffff91db62bace00
>>>>>     RBP: ffff91db62bacc00 R08: 0000000000000000 R09: c00000010000028b
>>>>>     R10: 0000000000055198 R11: ffffb3b6c001ea58 R12: ffff91db80e05010
>>>>>     R13: 000000000000000a R14: 0000000000000006 R15: 0000000000000040
>>>>>     FS:  0000000000000000(0000) GS:ffff91db1f840000(0000) knlGS:0000000000000000
>>>>>     CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
>>>>>     CR2: ffffffff8f220860 CR3: 00000001f9580004 CR4: 00000000003706e0
>>>>>     DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
>>>>>     DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
>>>>>     Call Trace:
>>>>>      <IRQ>
>>>>>      _raw_spin_lock_irqsave+0x30/0x40
>>>>>      mlx5_ib_poll_cq+0x4c/0xc50 [mlx5_ib]
>>>>>      smc_wr_rx_tasklet_fn+0x56/0xa0 [smc]
>>>>>      tasklet_action_common.isra.21+0x66/0x100
>>>>>      __do_softirq+0xd5/0x29c
>>>>>      asm_call_irq_on_stack+0x12/0x20
>>>>>      </IRQ>
>>>>>      do_softirq_own_stack+0x37/0x40
>>>>>      irq_exit_rcu+0x9d/0xa0
>>>>>      sysvec_call_function_single+0x34/0x80
>>>>>      asm_sysvec_call_function_single+0x12/0x20
>>>>>
>>>>> Fixes: bd4ad57718cc ("smc: initialize IB transport incl. PD, MR, QP, CQ, event, WR")
>>>>> Signed-off-by: Yacan Liu <liuyacan@corp.netease.com>
>>>>>
>>>>> ---
>>>>> Chagen in v4:
>>>>>      -- Remove the rx_drain flag because smc_wr_rx_post() may not have been called.
>>>>>      -- Remove timeout.
>>>>> Change in v3:
>>>>>      -- Tune commit message (Signed-Off tag, Fixes tag).
>>>>>         Tune code to avoid column length exceeding.
>>>>> Change in v2:
>>>>>      -- Fix some compile warnings and errors.
>>>>> ---
>>>>>     net/smc/smc_core.c | 2 ++
>>>>>     net/smc/smc_core.h | 2 ++
>>>>>     net/smc/smc_wr.c   | 9 +++++++++
>>>>>     net/smc/smc_wr.h   | 1 +
>>>>>     4 files changed, 14 insertions(+)
>>>>>
>>>>> diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c
>>>>> index ff49a11f5..f92a916e9 100644
>>>>> --- a/net/smc/smc_core.c
>>>>> +++ b/net/smc/smc_core.c
>>>>> @@ -757,6 +757,7 @@ int smcr_link_init(struct smc_link_group *lgr, struct smc_link *lnk,
>>>>>     	lnk->lgr = lgr;
>>>>>     	smc_lgr_hold(lgr); /* lgr_put in smcr_link_clear() */
>>>>>     	lnk->link_idx = link_idx;
>>>>> +	lnk->wr_rx_id_compl = 0;
>>>>>     	smc_ibdev_cnt_inc(lnk);
>>>>>     	smcr_copy_dev_info_to_link(lnk);
>>>>>     	atomic_set(&lnk->conn_cnt, 0);
>>>>> @@ -1269,6 +1270,7 @@ void smcr_link_clear(struct smc_link *lnk, bool log)
>>>>>     	smcr_buf_unmap_lgr(lnk);
>>>>>     	smcr_rtoken_clear_link(lnk);
>>>>>     	smc_ib_modify_qp_error(lnk);
>>>>> +	smc_wr_drain_cq(lnk);
>>>>>     	smc_wr_free_link(lnk);
>>>>>     	smc_ib_destroy_queue_pair(lnk);
>>>>>     	smc_ib_dealloc_protection_domain(lnk);
>>>>> diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h
>>>>> index fe8b524ad..285f9bd8e 100644
>>>>> --- a/net/smc/smc_core.h
>>>>> +++ b/net/smc/smc_core.h
>>>>> @@ -115,8 +115,10 @@ struct smc_link {
>>>>>     	dma_addr_t		wr_rx_dma_addr;	/* DMA address of wr_rx_bufs */
>>>>>     	dma_addr_t		wr_rx_v2_dma_addr; /* DMA address of v2 rx buf*/
>>>>>     	u64			wr_rx_id;	/* seq # of last recv WR */
>>>>> +	u64			wr_rx_id_compl; /* seq # of last completed WR */
>>>>>     	u32			wr_rx_cnt;	/* number of WR recv buffers */
>>>>>     	unsigned long		wr_rx_tstamp;	/* jiffies when last buf rx */
>>>>> +	wait_queue_head_t       wr_rx_empty_wait; /* wait for RQ empty */
>>>>>     
>>>>>     	struct ib_reg_wr	wr_reg;		/* WR register memory region */
>>>>>     	wait_queue_head_t	wr_reg_wait;	/* wait for wr_reg result */
>>>>> diff --git a/net/smc/smc_wr.c b/net/smc/smc_wr.c
>>>>> index 26f8f240d..bc8793803 100644
>>>>> --- a/net/smc/smc_wr.c
>>>>> +++ b/net/smc/smc_wr.c
>>>>> @@ -454,6 +454,7 @@ static inline void smc_wr_rx_process_cqes(struct ib_wc wc[], int num)
>>>>>     
>>>>>     	for (i = 0; i < num; i++) {
>>>>>     		link = wc[i].qp->qp_context;
>>>>> +		link->wr_rx_id_compl = wc[i].wr_id;
>>>>>     		if (wc[i].status == IB_WC_SUCCESS) {
>>>>>     			link->wr_rx_tstamp = jiffies;
>>>>>     			smc_wr_rx_demultiplex(&wc[i]);
>>>>> @@ -465,6 +466,8 @@ static inline void smc_wr_rx_process_cqes(struct ib_wc wc[], int num)
>>>>>     			case IB_WC_RNR_RETRY_EXC_ERR:
>>>>>     			case IB_WC_WR_FLUSH_ERR:
>>>>>     				smcr_link_down_cond_sched(link);
>>>>> +				if (link->wr_rx_id_compl == link->wr_rx_id)
>>>>> +					wake_up(&link->wr_rx_empty_wait);
>>>>>     				break;
>>>>>     			default:
>>>>>     				smc_wr_rx_post(link); /* refill WR RX */
>>>>> @@ -631,6 +634,11 @@ static void smc_wr_init_sge(struct smc_link *lnk)
>>>>>     	lnk->wr_reg.access = IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_WRITE;
>>>>>     }
>>>>>     
>>>>> +void smc_wr_drain_cq(struct smc_link *lnk)
>>>>> +{
>>>>> +	wait_event(lnk->wr_rx_empty_wait, lnk->wr_rx_id_compl == lnk->wr_rx_id);
>>>>> +}
>>>>> +
>>>>>     void smc_wr_free_link(struct smc_link *lnk)
>>>>>     {
>>>>>     	struct ib_device *ibdev;
>>>>> @@ -889,6 +897,7 @@ int smc_wr_create_link(struct smc_link *lnk)
>>>>>     	atomic_set(&lnk->wr_tx_refcnt, 0);
>>>>>     	init_waitqueue_head(&lnk->wr_reg_wait);
>>>>>     	atomic_set(&lnk->wr_reg_refcnt, 0);
>>>>> +	init_waitqueue_head(&lnk->wr_rx_empty_wait);
>>>>>     	return rc;
>>>>>     
>>>>>     dma_unmap:
>>>>> diff --git a/net/smc/smc_wr.h b/net/smc/smc_wr.h
>>>>> index a54e90a11..5ca5086ae 100644
>>>>> --- a/net/smc/smc_wr.h
>>>>> +++ b/net/smc/smc_wr.h
>>>>> @@ -101,6 +101,7 @@ static inline int smc_wr_rx_post(struct smc_link *link)
>>>>>     int smc_wr_create_link(struct smc_link *lnk);
>>>>>     int smc_wr_alloc_link_mem(struct smc_link *lnk);
>>>>>     int smc_wr_alloc_lgr_mem(struct smc_link_group *lgr);
>>>>> +void smc_wr_drain_cq(struct smc_link *lnk);
>>>>>     void smc_wr_free_link(struct smc_link *lnk);
>>>>>     void smc_wr_free_link_mem(struct smc_link *lnk);
>>>>>     void smc_wr_free_lgr_mem(struct smc_link_group *lgr);
>>>>
>>>> Thank you @Yacan for the effort to improve our code! And Thank you @Tony
>>>> for such valuable suggestions and testing!
>>>> I like the modification of this version. However, this is not a fix
>>>> patch to upstream, since the patches "[PATCH net-next v2 00/10] optimize
>>>> the parallelism of SMC-R connections" are still not applied. My
>>>> sugguestions:
>>>> - Please talk to the author (D. Wythe <alibuda@linux.alibaba.com>) of
>>>> those patches I mentioned above, and ask if he can take your patch as a
>>>> part of the patch serie
>>>> - Fix patches should go to net-next
>>>> - Please send always send your new version separately, rather than as
>>>> reply to your previous version. That makes people confused.
>>>
>>> @Wenjia, Thanks a lot for your suggestions and guidance !
>>>
>>> @D. Wythe, Can you include this patch in your series of patches if it is
>>> convenient?
>>>
>>> Regards,
>>> Yacan
>>>
>> One point I was confused, fixes should goto net, sorry!
> 
> Well, @D. Wythe, please ignore the above emails, sorry!
> 
> Regards,
> Yacan
> 
oh no, I didn't mean that. I think I didn't say clearly. What I mean is 
that the patch should go to net as a seperate patch if the patch serie 
from D. Wythe is already applied. But now the patch serie is still not 
applied, so you can still ask D. Wythe to take your patch as a part of 
this serie. (Just a suggestion)

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH net v4] net/smc: Fix possible access to freed memory in link clear
  2022-09-01 19:16                   ` Wenjia Zhang
@ 2022-09-02  2:16                     ` liuyacan
  2022-09-02  5:55                       ` Wenjia Zhang
  0 siblings, 1 reply; 22+ messages in thread
From: liuyacan @ 2022-09-02  2:16 UTC (permalink / raw)
  To: wenjia
  Cc: alibuda, davem, edumazet, kgraul, kuba, linux-kernel, linux-s390,
	liuyacan, netdev, pabeni, tonylu, ubraun, wintera

> >>>>> From: Yacan Liu <liuyacan@corp.netease.com>
> >>>>>
> >>>>> After modifying the QP to the Error state, all RX WR would be completed
> >>>>> with WC in IB_WC_WR_FLUSH_ERR status. Current implementation does not
> >>>>> wait for it is done, but destroy the QP and free the link group directly.
> >>>>> So there is a risk that accessing the freed memory in tasklet context.
> >>>>>
> >>>>> Here is a crash example:
> >>>>>
> >>>>>     BUG: unable to handle page fault for address: ffffffff8f220860
> >>>>>     #PF: supervisor write access in kernel mode
> >>>>>     #PF: error_code(0x0002) - not-present page
> >>>>>     PGD f7300e067 P4D f7300e067 PUD f7300f063 PMD 8c4e45063 PTE 800ffff08c9df060
> >>>>>     Oops: 0002 [#1] SMP PTI
> >>>>>     CPU: 1 PID: 0 Comm: swapper/1 Kdump: loaded Tainted: G S         OE     5.10.0-0607+ #23
> >>>>>     Hardware name: Inspur NF5280M4/YZMB-00689-101, BIOS 4.1.20 07/09/2018
> >>>>>     RIP: 0010:native_queued_spin_lock_slowpath+0x176/0x1b0
> >>>>>     Code: f3 90 48 8b 32 48 85 f6 74 f6 eb d5 c1 ee 12 83 e0 03 83 ee 01 48 c1 e0 05 48 63 f6 48 05 00 c8 02 00 48 03 04 f5 00 09 98 8e <48> 89 10 8b 42 08 85 c0 75 09 f3 90 8b 42 08 85 c0 74 f7 48 8b 32
> >>>>>     RSP: 0018:ffffb3b6c001ebd8 EFLAGS: 00010086
> >>>>>     RAX: ffffffff8f220860 RBX: 0000000000000246 RCX: 0000000000080000
> >>>>>     RDX: ffff91db1f86c800 RSI: 000000000000173c RDI: ffff91db62bace00
> >>>>>     RBP: ffff91db62bacc00 R08: 0000000000000000 R09: c00000010000028b
> >>>>>     R10: 0000000000055198 R11: ffffb3b6c001ea58 R12: ffff91db80e05010
> >>>>>     R13: 000000000000000a R14: 0000000000000006 R15: 0000000000000040
> >>>>>     FS:  0000000000000000(0000) GS:ffff91db1f840000(0000) knlGS:0000000000000000
> >>>>>     CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> >>>>>     CR2: ffffffff8f220860 CR3: 00000001f9580004 CR4: 00000000003706e0
> >>>>>     DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
> >>>>>     DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
> >>>>>     Call Trace:
> >>>>>      <IRQ>
> >>>>>      _raw_spin_lock_irqsave+0x30/0x40
> >>>>>      mlx5_ib_poll_cq+0x4c/0xc50 [mlx5_ib]
> >>>>>      smc_wr_rx_tasklet_fn+0x56/0xa0 [smc]
> >>>>>      tasklet_action_common.isra.21+0x66/0x100
> >>>>>      __do_softirq+0xd5/0x29c
> >>>>>      asm_call_irq_on_stack+0x12/0x20
> >>>>>      </IRQ>
> >>>>>      do_softirq_own_stack+0x37/0x40
> >>>>>      irq_exit_rcu+0x9d/0xa0
> >>>>>      sysvec_call_function_single+0x34/0x80
> >>>>>      asm_sysvec_call_function_single+0x12/0x20
> >>>>>
> >>>>> Fixes: bd4ad57718cc ("smc: initialize IB transport incl. PD, MR, QP, CQ, event, WR")
> >>>>> Signed-off-by: Yacan Liu <liuyacan@corp.netease.com>
> >>>>>
> >>>>> ---
> >>>>> Chagen in v4:
> >>>>>      -- Remove the rx_drain flag because smc_wr_rx_post() may not have been called.
> >>>>>      -- Remove timeout.
> >>>>> Change in v3:
> >>>>>      -- Tune commit message (Signed-Off tag, Fixes tag).
> >>>>>         Tune code to avoid column length exceeding.
> >>>>> Change in v2:
> >>>>>      -- Fix some compile warnings and errors.
> >>>>> ---
> >>>>>     net/smc/smc_core.c | 2 ++
> >>>>>     net/smc/smc_core.h | 2 ++
> >>>>>     net/smc/smc_wr.c   | 9 +++++++++
> >>>>>     net/smc/smc_wr.h   | 1 +
> >>>>>     4 files changed, 14 insertions(+)
> >>>>>
> >>>>> diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c
> >>>>> index ff49a11f5..f92a916e9 100644
> >>>>> --- a/net/smc/smc_core.c
> >>>>> +++ b/net/smc/smc_core.c
> >>>>> @@ -757,6 +757,7 @@ int smcr_link_init(struct smc_link_group *lgr, struct smc_link *lnk,
> >>>>>     	lnk->lgr = lgr;
> >>>>>     	smc_lgr_hold(lgr); /* lgr_put in smcr_link_clear() */
> >>>>>     	lnk->link_idx = link_idx;
> >>>>> +	lnk->wr_rx_id_compl = 0;
> >>>>>     	smc_ibdev_cnt_inc(lnk);
> >>>>>     	smcr_copy_dev_info_to_link(lnk);
> >>>>>     	atomic_set(&lnk->conn_cnt, 0);
> >>>>> @@ -1269,6 +1270,7 @@ void smcr_link_clear(struct smc_link *lnk, bool log)
> >>>>>     	smcr_buf_unmap_lgr(lnk);
> >>>>>     	smcr_rtoken_clear_link(lnk);
> >>>>>     	smc_ib_modify_qp_error(lnk);
> >>>>> +	smc_wr_drain_cq(lnk);
> >>>>>     	smc_wr_free_link(lnk);
> >>>>>     	smc_ib_destroy_queue_pair(lnk);
> >>>>>     	smc_ib_dealloc_protection_domain(lnk);
> >>>>> diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h
> >>>>> index fe8b524ad..285f9bd8e 100644
> >>>>> --- a/net/smc/smc_core.h
> >>>>> +++ b/net/smc/smc_core.h
> >>>>> @@ -115,8 +115,10 @@ struct smc_link {
> >>>>>     	dma_addr_t		wr_rx_dma_addr;	/* DMA address of wr_rx_bufs */
> >>>>>     	dma_addr_t		wr_rx_v2_dma_addr; /* DMA address of v2 rx buf*/
> >>>>>     	u64			wr_rx_id;	/* seq # of last recv WR */
> >>>>> +	u64			wr_rx_id_compl; /* seq # of last completed WR */
> >>>>>     	u32			wr_rx_cnt;	/* number of WR recv buffers */
> >>>>>     	unsigned long		wr_rx_tstamp;	/* jiffies when last buf rx */
> >>>>> +	wait_queue_head_t       wr_rx_empty_wait; /* wait for RQ empty */
> >>>>>     
> >>>>>     	struct ib_reg_wr	wr_reg;		/* WR register memory region */
> >>>>>     	wait_queue_head_t	wr_reg_wait;	/* wait for wr_reg result */
> >>>>> diff --git a/net/smc/smc_wr.c b/net/smc/smc_wr.c
> >>>>> index 26f8f240d..bc8793803 100644
> >>>>> --- a/net/smc/smc_wr.c
> >>>>> +++ b/net/smc/smc_wr.c
> >>>>> @@ -454,6 +454,7 @@ static inline void smc_wr_rx_process_cqes(struct ib_wc wc[], int num)
> >>>>>     
> >>>>>     	for (i = 0; i < num; i++) {
> >>>>>     		link = wc[i].qp->qp_context;
> >>>>> +		link->wr_rx_id_compl = wc[i].wr_id;
> >>>>>     		if (wc[i].status == IB_WC_SUCCESS) {
> >>>>>     			link->wr_rx_tstamp = jiffies;
> >>>>>     			smc_wr_rx_demultiplex(&wc[i]);
> >>>>> @@ -465,6 +466,8 @@ static inline void smc_wr_rx_process_cqes(struct ib_wc wc[], int num)
> >>>>>     			case IB_WC_RNR_RETRY_EXC_ERR:
> >>>>>     			case IB_WC_WR_FLUSH_ERR:
> >>>>>     				smcr_link_down_cond_sched(link);
> >>>>> +				if (link->wr_rx_id_compl == link->wr_rx_id)
> >>>>> +					wake_up(&link->wr_rx_empty_wait);
> >>>>>     				break;
> >>>>>     			default:
> >>>>>     				smc_wr_rx_post(link); /* refill WR RX */
> >>>>> @@ -631,6 +634,11 @@ static void smc_wr_init_sge(struct smc_link *lnk)
> >>>>>     	lnk->wr_reg.access = IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_WRITE;
> >>>>>     }
> >>>>>     
> >>>>> +void smc_wr_drain_cq(struct smc_link *lnk)
> >>>>> +{
> >>>>> +	wait_event(lnk->wr_rx_empty_wait, lnk->wr_rx_id_compl == lnk->wr_rx_id);
> >>>>> +}
> >>>>> +
> >>>>>     void smc_wr_free_link(struct smc_link *lnk)
> >>>>>     {
> >>>>>     	struct ib_device *ibdev;
> >>>>> @@ -889,6 +897,7 @@ int smc_wr_create_link(struct smc_link *lnk)
> >>>>>     	atomic_set(&lnk->wr_tx_refcnt, 0);
> >>>>>     	init_waitqueue_head(&lnk->wr_reg_wait);
> >>>>>     	atomic_set(&lnk->wr_reg_refcnt, 0);
> >>>>> +	init_waitqueue_head(&lnk->wr_rx_empty_wait);
> >>>>>     	return rc;
> >>>>>     
> >>>>>     dma_unmap:
> >>>>> diff --git a/net/smc/smc_wr.h b/net/smc/smc_wr.h
> >>>>> index a54e90a11..5ca5086ae 100644
> >>>>> --- a/net/smc/smc_wr.h
> >>>>> +++ b/net/smc/smc_wr.h
> >>>>> @@ -101,6 +101,7 @@ static inline int smc_wr_rx_post(struct smc_link *link)
> >>>>>     int smc_wr_create_link(struct smc_link *lnk);
> >>>>>     int smc_wr_alloc_link_mem(struct smc_link *lnk);
> >>>>>     int smc_wr_alloc_lgr_mem(struct smc_link_group *lgr);
> >>>>> +void smc_wr_drain_cq(struct smc_link *lnk);
> >>>>>     void smc_wr_free_link(struct smc_link *lnk);
> >>>>>     void smc_wr_free_link_mem(struct smc_link *lnk);
> >>>>>     void smc_wr_free_lgr_mem(struct smc_link_group *lgr);
> >>>>
> >>>> Thank you @Yacan for the effort to improve our code! And Thank you @Tony
> >>>> for such valuable suggestions and testing!
> >>>> I like the modification of this version. However, this is not a fix
> >>>> patch to upstream, since the patches "[PATCH net-next v2 00/10] optimize
> >>>> the parallelism of SMC-R connections" are still not applied. My
> >>>> sugguestions:
> >>>> - Please talk to the author (D. Wythe <alibuda@linux.alibaba.com>) of
> >>>> those patches I mentioned above, and ask if he can take your patch as a
> >>>> part of the patch serie
> >>>> - Fix patches should go to net-next
> >>>> - Please send always send your new version separately, rather than as
> >>>> reply to your previous version. That makes people confused.
> >>>
> >>> @Wenjia, Thanks a lot for your suggestions and guidance !
> >>>
> >>> @D. Wythe, Can you include this patch in your series of patches if it is
> >>> convenient?
> >>>
> >>> Regards,
> >>> Yacan
> >>>
> >> One point I was confused, fixes should goto net, sorry!
> > 
> > Well, @D. Wythe, please ignore the above emails, sorry!
> > 
> > Regards,
> > Yacan
> > 
> oh no, I didn't mean that. I think I didn't say clearly. What I mean is 
> that the patch should go to net as a seperate patch if the patch serie 
> from D. Wythe is already applied. But now the patch serie is still not 
> applied, so you can still ask D. Wythe to take your patch as a part of 
> this serie. (Just a suggestion)

Well, I misunderstood. What I'm not sure about is that the patch serie 
from D. Wythe is going to the net-next tree, but mine is going to the net. 
Will this be a problem ?

Regards,
Yacan


^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH net v4] net/smc: Fix possible access to freed memory in link clear
  2022-09-02  2:16                     ` liuyacan
@ 2022-09-02  5:55                       ` Wenjia Zhang
  0 siblings, 0 replies; 22+ messages in thread
From: Wenjia Zhang @ 2022-09-02  5:55 UTC (permalink / raw)
  To: liuyacan
  Cc: alibuda, davem, edumazet, kgraul, kuba, linux-kernel, linux-s390,
	netdev, pabeni, tonylu, ubraun, wintera



On 02.09.22 04:16, liuyacan@corp.netease.com wrote:
>>>>>>> From: Yacan Liu <liuyacan@corp.netease.com>
>>>>>>>
>>>>>>> After modifying the QP to the Error state, all RX WR would be completed
>>>>>>> with WC in IB_WC_WR_FLUSH_ERR status. Current implementation does not
>>>>>>> wait for it is done, but destroy the QP and free the link group directly.
>>>>>>> So there is a risk that accessing the freed memory in tasklet context.
>>>>>>>
>>>>>>> Here is a crash example:
>>>>>>>
>>>>>>>      BUG: unable to handle page fault for address: ffffffff8f220860
>>>>>>>      #PF: supervisor write access in kernel mode
>>>>>>>      #PF: error_code(0x0002) - not-present page
>>>>>>>      PGD f7300e067 P4D f7300e067 PUD f7300f063 PMD 8c4e45063 PTE 800ffff08c9df060
>>>>>>>      Oops: 0002 [#1] SMP PTI
>>>>>>>      CPU: 1 PID: 0 Comm: swapper/1 Kdump: loaded Tainted: G S         OE     5.10.0-0607+ #23
>>>>>>>      Hardware name: Inspur NF5280M4/YZMB-00689-101, BIOS 4.1.20 07/09/2018
>>>>>>>      RIP: 0010:native_queued_spin_lock_slowpath+0x176/0x1b0
>>>>>>>      Code: f3 90 48 8b 32 48 85 f6 74 f6 eb d5 c1 ee 12 83 e0 03 83 ee 01 48 c1 e0 05 48 63 f6 48 05 00 c8 02 00 48 03 04 f5 00 09 98 8e <48> 89 10 8b 42 08 85 c0 75 09 f3 90 8b 42 08 85 c0 74 f7 48 8b 32
>>>>>>>      RSP: 0018:ffffb3b6c001ebd8 EFLAGS: 00010086
>>>>>>>      RAX: ffffffff8f220860 RBX: 0000000000000246 RCX: 0000000000080000
>>>>>>>      RDX: ffff91db1f86c800 RSI: 000000000000173c RDI: ffff91db62bace00
>>>>>>>      RBP: ffff91db62bacc00 R08: 0000000000000000 R09: c00000010000028b
>>>>>>>      R10: 0000000000055198 R11: ffffb3b6c001ea58 R12: ffff91db80e05010
>>>>>>>      R13: 000000000000000a R14: 0000000000000006 R15: 0000000000000040
>>>>>>>      FS:  0000000000000000(0000) GS:ffff91db1f840000(0000) knlGS:0000000000000000
>>>>>>>      CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
>>>>>>>      CR2: ffffffff8f220860 CR3: 00000001f9580004 CR4: 00000000003706e0
>>>>>>>      DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
>>>>>>>      DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
>>>>>>>      Call Trace:
>>>>>>>       <IRQ>
>>>>>>>       _raw_spin_lock_irqsave+0x30/0x40
>>>>>>>       mlx5_ib_poll_cq+0x4c/0xc50 [mlx5_ib]
>>>>>>>       smc_wr_rx_tasklet_fn+0x56/0xa0 [smc]
>>>>>>>       tasklet_action_common.isra.21+0x66/0x100
>>>>>>>       __do_softirq+0xd5/0x29c
>>>>>>>       asm_call_irq_on_stack+0x12/0x20
>>>>>>>       </IRQ>
>>>>>>>       do_softirq_own_stack+0x37/0x40
>>>>>>>       irq_exit_rcu+0x9d/0xa0
>>>>>>>       sysvec_call_function_single+0x34/0x80
>>>>>>>       asm_sysvec_call_function_single+0x12/0x20
>>>>>>>
>>>>>>> Fixes: bd4ad57718cc ("smc: initialize IB transport incl. PD, MR, QP, CQ, event, WR")
>>>>>>> Signed-off-by: Yacan Liu <liuyacan@corp.netease.com>
>>>>>>>
>>>>>>> ---
>>>>>>> Chagen in v4:
>>>>>>>       -- Remove the rx_drain flag because smc_wr_rx_post() may not have been called.
>>>>>>>       -- Remove timeout.
>>>>>>> Change in v3:
>>>>>>>       -- Tune commit message (Signed-Off tag, Fixes tag).
>>>>>>>          Tune code to avoid column length exceeding.
>>>>>>> Change in v2:
>>>>>>>       -- Fix some compile warnings and errors.
>>>>>>> ---
>>>>>>>      net/smc/smc_core.c | 2 ++
>>>>>>>      net/smc/smc_core.h | 2 ++
>>>>>>>      net/smc/smc_wr.c   | 9 +++++++++
>>>>>>>      net/smc/smc_wr.h   | 1 +
>>>>>>>      4 files changed, 14 insertions(+)
>>>>>>>
>>>>>>> diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c
>>>>>>> index ff49a11f5..f92a916e9 100644
>>>>>>> --- a/net/smc/smc_core.c
>>>>>>> +++ b/net/smc/smc_core.c
>>>>>>> @@ -757,6 +757,7 @@ int smcr_link_init(struct smc_link_group *lgr, struct smc_link *lnk,
>>>>>>>      	lnk->lgr = lgr;
>>>>>>>      	smc_lgr_hold(lgr); /* lgr_put in smcr_link_clear() */
>>>>>>>      	lnk->link_idx = link_idx;
>>>>>>> +	lnk->wr_rx_id_compl = 0;
>>>>>>>      	smc_ibdev_cnt_inc(lnk);
>>>>>>>      	smcr_copy_dev_info_to_link(lnk);
>>>>>>>      	atomic_set(&lnk->conn_cnt, 0);
>>>>>>> @@ -1269,6 +1270,7 @@ void smcr_link_clear(struct smc_link *lnk, bool log)
>>>>>>>      	smcr_buf_unmap_lgr(lnk);
>>>>>>>      	smcr_rtoken_clear_link(lnk);
>>>>>>>      	smc_ib_modify_qp_error(lnk);
>>>>>>> +	smc_wr_drain_cq(lnk);
>>>>>>>      	smc_wr_free_link(lnk);
>>>>>>>      	smc_ib_destroy_queue_pair(lnk);
>>>>>>>      	smc_ib_dealloc_protection_domain(lnk);
>>>>>>> diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h
>>>>>>> index fe8b524ad..285f9bd8e 100644
>>>>>>> --- a/net/smc/smc_core.h
>>>>>>> +++ b/net/smc/smc_core.h
>>>>>>> @@ -115,8 +115,10 @@ struct smc_link {
>>>>>>>      	dma_addr_t		wr_rx_dma_addr;	/* DMA address of wr_rx_bufs */
>>>>>>>      	dma_addr_t		wr_rx_v2_dma_addr; /* DMA address of v2 rx buf*/
>>>>>>>      	u64			wr_rx_id;	/* seq # of last recv WR */
>>>>>>> +	u64			wr_rx_id_compl; /* seq # of last completed WR */
>>>>>>>      	u32			wr_rx_cnt;	/* number of WR recv buffers */
>>>>>>>      	unsigned long		wr_rx_tstamp;	/* jiffies when last buf rx */
>>>>>>> +	wait_queue_head_t       wr_rx_empty_wait; /* wait for RQ empty */
>>>>>>>      
>>>>>>>      	struct ib_reg_wr	wr_reg;		/* WR register memory region */
>>>>>>>      	wait_queue_head_t	wr_reg_wait;	/* wait for wr_reg result */
>>>>>>> diff --git a/net/smc/smc_wr.c b/net/smc/smc_wr.c
>>>>>>> index 26f8f240d..bc8793803 100644
>>>>>>> --- a/net/smc/smc_wr.c
>>>>>>> +++ b/net/smc/smc_wr.c
>>>>>>> @@ -454,6 +454,7 @@ static inline void smc_wr_rx_process_cqes(struct ib_wc wc[], int num)
>>>>>>>      
>>>>>>>      	for (i = 0; i < num; i++) {
>>>>>>>      		link = wc[i].qp->qp_context;
>>>>>>> +		link->wr_rx_id_compl = wc[i].wr_id;
>>>>>>>      		if (wc[i].status == IB_WC_SUCCESS) {
>>>>>>>      			link->wr_rx_tstamp = jiffies;
>>>>>>>      			smc_wr_rx_demultiplex(&wc[i]);
>>>>>>> @@ -465,6 +466,8 @@ static inline void smc_wr_rx_process_cqes(struct ib_wc wc[], int num)
>>>>>>>      			case IB_WC_RNR_RETRY_EXC_ERR:
>>>>>>>      			case IB_WC_WR_FLUSH_ERR:
>>>>>>>      				smcr_link_down_cond_sched(link);
>>>>>>> +				if (link->wr_rx_id_compl == link->wr_rx_id)
>>>>>>> +					wake_up(&link->wr_rx_empty_wait);
>>>>>>>      				break;
>>>>>>>      			default:
>>>>>>>      				smc_wr_rx_post(link); /* refill WR RX */
>>>>>>> @@ -631,6 +634,11 @@ static void smc_wr_init_sge(struct smc_link *lnk)
>>>>>>>      	lnk->wr_reg.access = IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_WRITE;
>>>>>>>      }
>>>>>>>      
>>>>>>> +void smc_wr_drain_cq(struct smc_link *lnk)
>>>>>>> +{
>>>>>>> +	wait_event(lnk->wr_rx_empty_wait, lnk->wr_rx_id_compl == lnk->wr_rx_id);
>>>>>>> +}
>>>>>>> +
>>>>>>>      void smc_wr_free_link(struct smc_link *lnk)
>>>>>>>      {
>>>>>>>      	struct ib_device *ibdev;
>>>>>>> @@ -889,6 +897,7 @@ int smc_wr_create_link(struct smc_link *lnk)
>>>>>>>      	atomic_set(&lnk->wr_tx_refcnt, 0);
>>>>>>>      	init_waitqueue_head(&lnk->wr_reg_wait);
>>>>>>>      	atomic_set(&lnk->wr_reg_refcnt, 0);
>>>>>>> +	init_waitqueue_head(&lnk->wr_rx_empty_wait);
>>>>>>>      	return rc;
>>>>>>>      
>>>>>>>      dma_unmap:
>>>>>>> diff --git a/net/smc/smc_wr.h b/net/smc/smc_wr.h
>>>>>>> index a54e90a11..5ca5086ae 100644
>>>>>>> --- a/net/smc/smc_wr.h
>>>>>>> +++ b/net/smc/smc_wr.h
>>>>>>> @@ -101,6 +101,7 @@ static inline int smc_wr_rx_post(struct smc_link *link)
>>>>>>>      int smc_wr_create_link(struct smc_link *lnk);
>>>>>>>      int smc_wr_alloc_link_mem(struct smc_link *lnk);
>>>>>>>      int smc_wr_alloc_lgr_mem(struct smc_link_group *lgr);
>>>>>>> +void smc_wr_drain_cq(struct smc_link *lnk);
>>>>>>>      void smc_wr_free_link(struct smc_link *lnk);
>>>>>>>      void smc_wr_free_link_mem(struct smc_link *lnk);
>>>>>>>      void smc_wr_free_lgr_mem(struct smc_link_group *lgr);
>>>>>>
>>>>>> Thank you @Yacan for the effort to improve our code! And Thank you @Tony
>>>>>> for such valuable suggestions and testing!
>>>>>> I like the modification of this version. However, this is not a fix
>>>>>> patch to upstream, since the patches "[PATCH net-next v2 00/10] optimize
>>>>>> the parallelism of SMC-R connections" are still not applied. My
>>>>>> sugguestions:
>>>>>> - Please talk to the author (D. Wythe <alibuda@linux.alibaba.com>) of
>>>>>> those patches I mentioned above, and ask if he can take your patch as a
>>>>>> part of the patch serie
>>>>>> - Fix patches should go to net-next
>>>>>> - Please send always send your new version separately, rather than as
>>>>>> reply to your previous version. That makes people confused.
>>>>>
>>>>> @Wenjia, Thanks a lot for your suggestions and guidance !
>>>>>
>>>>> @D. Wythe, Can you include this patch in your series of patches if it is
>>>>> convenient?
>>>>>
>>>>> Regards,
>>>>> Yacan
>>>>>
>>>> One point I was confused, fixes should goto net, sorry!
>>>
>>> Well, @D. Wythe, please ignore the above emails, sorry!
>>>
>>> Regards,
>>> Yacan
>>>
>> oh no, I didn't mean that. I think I didn't say clearly. What I mean is
>> that the patch should go to net as a seperate patch if the patch serie
>> from D. Wythe is already applied. But now the patch serie is still not
>> applied, so you can still ask D. Wythe to take your patch as a part of
>> this serie. (Just a suggestion)
> 
> Well, I misunderstood. What I'm not sure about is that the patch serie
> from D. Wythe is going to the net-next tree, but mine is going to the net.
> Will this be a problem ?
> 
> Regards,
> Yacan
> I don't think that would be a problem in this situation.

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH net v4] net/smc: Fix possible access to freed memory in link clear
  2022-08-31 15:53         ` [PATCH net v4] net/smc: Fix possible access to freed memory in link clear liuyacan
  2022-09-01  6:51           ` Tony Lu
  2022-09-01  9:33           ` Wenjia Zhang
@ 2022-09-04 15:58           ` Tony Lu
  2022-09-05  7:18             ` liuyacan
  2 siblings, 1 reply; 22+ messages in thread
From: Tony Lu @ 2022-09-04 15:58 UTC (permalink / raw)
  To: liuyacan
  Cc: davem, edumazet, kgraul, kuba, linux-kernel, linux-s390, netdev,
	ubraun, pabeni, wenjia, wintera

On Wed, Aug 31, 2022 at 11:53:03PM +0800, liuyacan@corp.netease.com wrote:
> From: Yacan Liu <liuyacan@corp.netease.com>
> 
> After modifying the QP to the Error state, all RX WR would be completed
> with WC in IB_WC_WR_FLUSH_ERR status. Current implementation does not
> wait for it is done, but destroy the QP and free the link group directly.
> So there is a risk that accessing the freed memory in tasklet context.
> 
> Here is a crash example:
> 
>  BUG: unable to handle page fault for address: ffffffff8f220860
>  #PF: supervisor write access in kernel mode
>  #PF: error_code(0x0002) - not-present page
>  PGD f7300e067 P4D f7300e067 PUD f7300f063 PMD 8c4e45063 PTE 800ffff08c9df060
>  Oops: 0002 [#1] SMP PTI
>  CPU: 1 PID: 0 Comm: swapper/1 Kdump: loaded Tainted: G S         OE     5.10.0-0607+ #23
>  Hardware name: Inspur NF5280M4/YZMB-00689-101, BIOS 4.1.20 07/09/2018
>  RIP: 0010:native_queued_spin_lock_slowpath+0x176/0x1b0
>  Code: f3 90 48 8b 32 48 85 f6 74 f6 eb d5 c1 ee 12 83 e0 03 83 ee 01 48 c1 e0 05 48 63 f6 48 05 00 c8 02 00 48 03 04 f5 00 09 98 8e <48> 89 10 8b 42 08 85 c0 75 09 f3 90 8b 42 08 85 c0 74 f7 48 8b 32
>  RSP: 0018:ffffb3b6c001ebd8 EFLAGS: 00010086
>  RAX: ffffffff8f220860 RBX: 0000000000000246 RCX: 0000000000080000
>  RDX: ffff91db1f86c800 RSI: 000000000000173c RDI: ffff91db62bace00
>  RBP: ffff91db62bacc00 R08: 0000000000000000 R09: c00000010000028b
>  R10: 0000000000055198 R11: ffffb3b6c001ea58 R12: ffff91db80e05010
>  R13: 000000000000000a R14: 0000000000000006 R15: 0000000000000040
>  FS:  0000000000000000(0000) GS:ffff91db1f840000(0000) knlGS:0000000000000000
>  CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
>  CR2: ffffffff8f220860 CR3: 00000001f9580004 CR4: 00000000003706e0
>  DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
>  DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
>  Call Trace:
>   <IRQ>
>   _raw_spin_lock_irqsave+0x30/0x40
>   mlx5_ib_poll_cq+0x4c/0xc50 [mlx5_ib]
>   smc_wr_rx_tasklet_fn+0x56/0xa0 [smc]
>   tasklet_action_common.isra.21+0x66/0x100
>   __do_softirq+0xd5/0x29c
>   asm_call_irq_on_stack+0x12/0x20
>   </IRQ>
>   do_softirq_own_stack+0x37/0x40
>   irq_exit_rcu+0x9d/0xa0
>   sysvec_call_function_single+0x34/0x80
>   asm_sysvec_call_function_single+0x12/0x20
> 
> Fixes: bd4ad57718cc ("smc: initialize IB transport incl. PD, MR, QP, CQ, event, WR")
> Signed-off-by: Yacan Liu <liuyacan@corp.netease.com>
> 
> ---
> Chagen in v4:
>   -- Remove the rx_drain flag because smc_wr_rx_post() may not have been called.
>   -- Remove timeout.
> Change in v3:
>   -- Tune commit message (Signed-Off tag, Fixes tag).
>      Tune code to avoid column length exceeding.
> Change in v2:
>   -- Fix some compile warnings and errors.
> ---
>  net/smc/smc_core.c | 2 ++
>  net/smc/smc_core.h | 2 ++
>  net/smc/smc_wr.c   | 9 +++++++++
>  net/smc/smc_wr.h   | 1 +
>  4 files changed, 14 insertions(+)
> 
> diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c
> index ff49a11f5..f92a916e9 100644
> --- a/net/smc/smc_core.c
> +++ b/net/smc/smc_core.c
> @@ -757,6 +757,7 @@ int smcr_link_init(struct smc_link_group *lgr, struct smc_link *lnk,
>  	lnk->lgr = lgr;
>  	smc_lgr_hold(lgr); /* lgr_put in smcr_link_clear() */
>  	lnk->link_idx = link_idx;
> +	lnk->wr_rx_id_compl = 0;
>  	smc_ibdev_cnt_inc(lnk);
>  	smcr_copy_dev_info_to_link(lnk);
>  	atomic_set(&lnk->conn_cnt, 0);
> @@ -1269,6 +1270,7 @@ void smcr_link_clear(struct smc_link *lnk, bool log)
>  	smcr_buf_unmap_lgr(lnk);
>  	smcr_rtoken_clear_link(lnk);
>  	smc_ib_modify_qp_error(lnk);
> +	smc_wr_drain_cq(lnk);

smc_wr_drain_cq() can put into smc_wr_free_link(). Since
smc_wr_free_link() do the same things together, such as waiting for
resources cleaning up about wr.

>  	smc_wr_free_link(lnk);
>  	smc_ib_destroy_queue_pair(lnk);
>  	smc_ib_dealloc_protection_domain(lnk);

<snip>

The patch tested good in our environment. If you are going to send the
v5 patch, you can go with my review.

Reviewed-by: Tony Lu <tonylu@linux.alibaba.com>

Cheers,
Tony Lu

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH net v4] net/smc: Fix possible access to freed memory in link clear
  2022-09-04 15:58           ` Tony Lu
@ 2022-09-05  7:18             ` liuyacan
  0 siblings, 0 replies; 22+ messages in thread
From: liuyacan @ 2022-09-05  7:18 UTC (permalink / raw)
  To: tonylu
  Cc: davem, edumazet, kgraul, kuba, linux-kernel, linux-s390, liuyacan,
	netdev, pabeni, ubraun, wenjia, wintera

> > From: Yacan Liu <liuyacan@corp.netease.com>
> > 
> > After modifying the QP to the Error state, all RX WR would be completed
> > with WC in IB_WC_WR_FLUSH_ERR status. Current implementation does not
> > wait for it is done, but destroy the QP and free the link group directly.
> > So there is a risk that accessing the freed memory in tasklet context.
> > 
> > Here is a crash example:
> > 
> >  BUG: unable to handle page fault for address: ffffffff8f220860
> >  #PF: supervisor write access in kernel mode
> >  #PF: error_code(0x0002) - not-present page
> >  PGD f7300e067 P4D f7300e067 PUD f7300f063 PMD 8c4e45063 PTE 800ffff08c9df060
> >  Oops: 0002 [#1] SMP PTI
> >  CPU: 1 PID: 0 Comm: swapper/1 Kdump: loaded Tainted: G S         OE     5.10.0-0607+ #23
> >  Hardware name: Inspur NF5280M4/YZMB-00689-101, BIOS 4.1.20 07/09/2018
> >  RIP: 0010:native_queued_spin_lock_slowpath+0x176/0x1b0
> >  Code: f3 90 48 8b 32 48 85 f6 74 f6 eb d5 c1 ee 12 83 e0 03 83 ee 01 48 c1 e0 05 48 63 f6 48 05 00 c8 02 00 48 03 04 f5 00 09 98 8e <48> 89 10 8b 42 08 85 c0 75 09 f3 90 8b 42 08 85 c0 74 f7 48 8b 32
> >  RSP: 0018:ffffb3b6c001ebd8 EFLAGS: 00010086
> >  RAX: ffffffff8f220860 RBX: 0000000000000246 RCX: 0000000000080000
> >  RDX: ffff91db1f86c800 RSI: 000000000000173c RDI: ffff91db62bace00
> >  RBP: ffff91db62bacc00 R08: 0000000000000000 R09: c00000010000028b
> >  R10: 0000000000055198 R11: ffffb3b6c001ea58 R12: ffff91db80e05010
> >  R13: 000000000000000a R14: 0000000000000006 R15: 0000000000000040
> >  FS:  0000000000000000(0000) GS:ffff91db1f840000(0000) knlGS:0000000000000000
> >  CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> >  CR2: ffffffff8f220860 CR3: 00000001f9580004 CR4: 00000000003706e0
> >  DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
> >  DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
> >  Call Trace:
> >   <IRQ>
> >   _raw_spin_lock_irqsave+0x30/0x40
> >   mlx5_ib_poll_cq+0x4c/0xc50 [mlx5_ib]
> >   smc_wr_rx_tasklet_fn+0x56/0xa0 [smc]
> >   tasklet_action_common.isra.21+0x66/0x100
> >   __do_softirq+0xd5/0x29c
> >   asm_call_irq_on_stack+0x12/0x20
> >   </IRQ>
> >   do_softirq_own_stack+0x37/0x40
> >   irq_exit_rcu+0x9d/0xa0
> >   sysvec_call_function_single+0x34/0x80
> >   asm_sysvec_call_function_single+0x12/0x20
> > 
> > Fixes: bd4ad57718cc ("smc: initialize IB transport incl. PD, MR, QP, CQ, event, WR")
> > Signed-off-by: Yacan Liu <liuyacan@corp.netease.com>
> > 
> > ---
> > Chagen in v4:
> >   -- Remove the rx_drain flag because smc_wr_rx_post() may not have been called.
> >   -- Remove timeout.
> > Change in v3:
> >   -- Tune commit message (Signed-Off tag, Fixes tag).
> >      Tune code to avoid column length exceeding.
> > Change in v2:
> >   -- Fix some compile warnings and errors.
> > ---
> >  net/smc/smc_core.c | 2 ++
> >  net/smc/smc_core.h | 2 ++
> >  net/smc/smc_wr.c   | 9 +++++++++
> >  net/smc/smc_wr.h   | 1 +
> >  4 files changed, 14 insertions(+)
> > 
> > diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c
> > index ff49a11f5..f92a916e9 100644
> > --- a/net/smc/smc_core.c
> > +++ b/net/smc/smc_core.c
> > @@ -757,6 +757,7 @@ int smcr_link_init(struct smc_link_group *lgr, struct smc_link *lnk,
> >  	lnk->lgr = lgr;
> >  	smc_lgr_hold(lgr); /* lgr_put in smcr_link_clear() */
> >  	lnk->link_idx = link_idx;
> > +	lnk->wr_rx_id_compl = 0;
> >  	smc_ibdev_cnt_inc(lnk);
> >  	smcr_copy_dev_info_to_link(lnk);
> >  	atomic_set(&lnk->conn_cnt, 0);
> > @@ -1269,6 +1270,7 @@ void smcr_link_clear(struct smc_link *lnk, bool log)
> >  	smcr_buf_unmap_lgr(lnk);
> >  	smcr_rtoken_clear_link(lnk);
> >  	smc_ib_modify_qp_error(lnk);
> > +	smc_wr_drain_cq(lnk);
> 
> smc_wr_drain_cq() can put into smc_wr_free_link(). Since
> smc_wr_free_link() do the same things together, such as waiting for
> resources cleaning up about wr.
> 
> >  	smc_wr_free_link(lnk);
> >  	smc_ib_destroy_queue_pair(lnk);
> >  	smc_ib_dealloc_protection_domain(lnk);
> 
> <snip>
> 
> The patch tested good in our environment. If you are going to send the
> v5 patch, you can go with my review.
> 
> Reviewed-by: Tony Lu <tonylu@linux.alibaba.com>
> 
> Cheers,
> Tony Lu

Thank you very much for your verification and suggestion!

Regards,
Yacan


^ permalink raw reply	[flat|nested] 22+ messages in thread

end of thread, other threads:[~2022-09-05  7:22 UTC | newest]

Thread overview: 22+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2022-08-29 14:54 [PATCH net] net/smc: Fix possible access to freed memory in link clear liuyacan
2022-08-29 16:38 ` Tony Lu
2022-08-30  3:05   ` [PATCH net v2] " liuyacan
2022-08-30  5:58     ` [PATCH net v2] net/smc: fix listen processing for SMC-Rv2 liuyacan
2022-08-30  8:31       ` Alexandra Winter
2022-08-30 13:19         ` [PATCH net v3] net/smc: Fix possible access to freed memory in link clear liuyacan
2022-08-31 16:08         ` [PATCH net v2] net/smc: fix listen processing for SMC-Rv2 liuyacan
2022-08-30 14:15       ` Tony Lu
2022-08-31 15:53         ` [PATCH net v4] net/smc: Fix possible access to freed memory in link clear liuyacan
2022-09-01  6:51           ` Tony Lu
2022-09-01  9:33           ` Wenjia Zhang
2022-09-01 12:26             ` liuyacan
2022-09-01 12:45               ` Wenjia Zhang
2022-09-01 12:54                 ` liuyacan
2022-09-01 19:16                   ` Wenjia Zhang
2022-09-02  2:16                     ` liuyacan
2022-09-02  5:55                       ` Wenjia Zhang
2022-09-04 15:58           ` Tony Lu
2022-09-05  7:18             ` liuyacan
2022-08-31 16:12         ` [PATCH net v2] net/smc: fix listen processing for SMC-Rv2 liuyacan
2022-08-30 14:31 ` [PATCH net] net/smc: Fix possible access to freed memory in link clear kernel test robot
2022-08-30 16:10 ` kernel test robot

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).