[PATCH net-next 07/15] net/mlx5e: xsk: Use KLM to protect frame overrun in unaligned mode

netdev.vger.kernel.org archive mirror
 help / color / mirror / Atom feed

From: Saeed Mahameed <saeed@kernel.org>
To: "David S. Miller" <davem@davemloft.net>,
	Jakub Kicinski <kuba@kernel.org>, Paolo Abeni <pabeni@redhat.com>,
	Eric Dumazet <edumazet@google.com>
Cc: Saeed Mahameed <saeedm@nvidia.com>,
	netdev@vger.kernel.org, Tariq Toukan <tariqt@nvidia.com>,
	Maxim Mikityanskiy <maximmi@nvidia.com>
Subject: [PATCH net-next 07/15] net/mlx5e: xsk: Use KLM to protect frame overrun in unaligned mode
Date: Sat,  1 Oct 2022 21:56:24 -0700	[thread overview]
Message-ID: <20221002045632.291612-8-saeed@kernel.org> (raw)
In-Reply-To: <20221002045632.291612-1-saeed@kernel.org>

From: Maxim Mikityanskiy <maximmi@nvidia.com>

XSK RQs support striding RQ linear mode, but the stride size may be
bigger than the XSK frame size, because:

1. The stride size must be a power of two.

2. The stride size must be equal to the UMR page size. Each XSK frame is
treated as a separate page, because they aren't necessarily adjacent in
physical memory, so the driver can't put more than one stride per page.

3. The minimal MTT page size is 4096 on older firmware.

That means that if XSK frame size is 2048 or not a power of two, the
strides may be bigger than XSK frames. Normally, it's not a problem if
the hardware enforces the MTU. However, traffic between vports skips the
hardware MTU check, and oversized packets may be received.

If an oversized packet is bigger than the XSK frame but not bigger than
the stride, it will cause overwriting of the adjacent UMEM region. If
the packet takes more than one stride, they can be recycled for reuse,
so it's not a problem when the XSK frame size matches the stride size.

Work around the above issue by leveraging KLM to make a more
fine-grained mapping. The beginning of each stride is mapped to the
frame memory, and the padding up to the closest power of two is mapped
to the overflow page that doesn't belong to UMEM. This way, application
data corruption won't happen upon receiving packets bigger than MTU.

Signed-off-by: Maxim Mikityanskiy <maximmi@nvidia.com>
Reviewed-by: Tariq Toukan <tariqt@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/en.h  |  1 +
 .../ethernet/mellanox/mlx5/core/en/params.c   | 45 +++++++++++++++++--
 .../ethernet/mellanox/mlx5/core/en/xsk/rx.c   | 27 +++++++++--
 .../net/ethernet/mellanox/mlx5/core/en_main.c | 27 +++++++++--
 4 files changed, 90 insertions(+), 10 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h
index 93607db1dea4..7c6861d6148d 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
@@ -680,6 +680,7 @@ struct mlx5e_hw_gro_data {
 enum mlx5e_mpwrq_umr_mode {
 	MLX5E_MPWRQ_UMR_MODE_ALIGNED,
 	MLX5E_MPWRQ_UMR_MODE_UNALIGNED,
+	MLX5E_MPWRQ_UMR_MODE_OVERSIZED,
 };
 
 struct mlx5e_rq {
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/params.c b/drivers/net/ethernet/mellanox/mlx5/core/en/params.c
index e8c3b8abf941..203448ee9594 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/params.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/params.c
@@ -36,8 +36,28 @@ mlx5e_mpwrq_umr_mode(struct mlx5_core_dev *mdev, struct mlx5e_xsk_param *xsk)
 	 * 1. MTT - direct mapping in page granularity.
 	 * 2. KSM - indirect mapping to another MKey to arbitrary addresses, but
 	 *    all mappings have the same size.
+	 * 3. KLM - indirect mapping to another MKey to arbitrary addresses, and
+	 *    mappings can have different sizes.
 	 */
+	u8 page_shift = mlx5e_mpwrq_page_shift(mdev, xsk);
 	bool unaligned = xsk ? xsk->unaligned : false;
+	bool oversized = false;
+
+	if (xsk) {
+		oversized = xsk->chunk_size < (1 << page_shift);
+		WARN_ON_ONCE(xsk->chunk_size > (1 << page_shift));
+	}
+
+	/* XSK frame size doesn't match the UMR page size, either because the
+	 * frame size is not a power of two, or it's smaller than the minimal
+	 * page size supported by the firmware.
+	 * It's possible to receive packets bigger than MTU in certain setups.
+	 * To avoid writing over the XSK frame boundary, the top region of each
+	 * stride is mapped to a garbage page, resulting in two mappings of
+	 * different sizes per frame.
+	 */
+	if (oversized)
+		return MLX5E_MPWRQ_UMR_MODE_OVERSIZED;
 
 	/* XSK frames can start at arbitrary unaligned locations, but they all
 	 * have the same size which is a power of two. It allows to optimize to
@@ -60,6 +80,8 @@ u8 mlx5e_mpwrq_umr_entry_size(enum mlx5e_mpwrq_umr_mode mode)
 		return sizeof(struct mlx5_mtt);
 	case MLX5E_MPWRQ_UMR_MODE_UNALIGNED:
 		return sizeof(struct mlx5_ksm);
+	case MLX5E_MPWRQ_UMR_MODE_OVERSIZED:
+		return sizeof(struct mlx5_klm) * 2;
 	}
 	WARN_ONCE(1, "MPWRQ UMR mode %d is not known\n", mode);
 	return 0;
@@ -145,11 +167,21 @@ u8 mlx5e_mpwrq_mtts_per_wqe(struct mlx5_core_dev *mdev, u8 page_shift,
 u32 mlx5e_mpwrq_max_num_entries(struct mlx5_core_dev *mdev,
 				enum mlx5e_mpwrq_umr_mode umr_mode)
 {
-	if (umr_mode == MLX5E_MPWRQ_UMR_MODE_UNALIGNED)
-		return min(MLX5E_MAX_RQ_NUM_KSMS,
-			   1 << MLX5_CAP_GEN(mdev, log_max_klm_list_size));
+	/* Same limits apply to KSMs and KLMs. */
+	u32 klm_limit = min(MLX5E_MAX_RQ_NUM_KSMS,
+			    1 << MLX5_CAP_GEN(mdev, log_max_klm_list_size));
 
-	return MLX5E_MAX_RQ_NUM_MTTS;
+	switch (umr_mode) {
+	case MLX5E_MPWRQ_UMR_MODE_ALIGNED:
+		return MLX5E_MAX_RQ_NUM_MTTS;
+	case MLX5E_MPWRQ_UMR_MODE_UNALIGNED:
+		return klm_limit;
+	case MLX5E_MPWRQ_UMR_MODE_OVERSIZED:
+		/* Each entry is two KLMs. */
+		return klm_limit / 2;
+	}
+	WARN_ONCE(1, "MPWRQ UMR mode %d is not known\n", umr_mode);
+	return 0;
 }
 
 static u8 mlx5e_mpwrq_max_log_rq_size(struct mlx5_core_dev *mdev, u8 page_shift,
@@ -1084,6 +1116,11 @@ static u8 mlx5e_build_icosq_log_wq_sz(struct mlx5_core_dev *mdev,
 			xsk.unaligned = true;
 			max_xsk_wqebbs = max(max_xsk_wqebbs,
 				mlx5e_mpwrq_total_umr_wqebbs(mdev, params, &xsk));
+
+			/* XSK unaligned mode, frame size is not equal to stride size. */
+			xsk.chunk_size -= 1;
+			max_xsk_wqebbs = max(max_xsk_wqebbs,
+				mlx5e_mpwrq_total_umr_wqebbs(mdev, params, &xsk));
 		}
 
 		wqebbs += max_xsk_wqebbs;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.c
index 4b2df2895505..78d746704345 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.c
@@ -41,7 +41,15 @@ int mlx5e_xsk_alloc_rx_mpwqe(struct mlx5e_rq *rq, u16 ix)
 	umr_wqe = mlx5_wq_cyc_get_wqe(wq, pi);
 	memcpy(umr_wqe, &rq->mpwqe.umr_wqe, sizeof(struct mlx5e_umr_wqe));
 
-	if (unlikely(rq->mpwqe.umr_mode == MLX5E_MPWRQ_UMR_MODE_UNALIGNED)) {
+	if (likely(rq->mpwqe.umr_mode == MLX5E_MPWRQ_UMR_MODE_ALIGNED)) {
+		for (i = 0; i < batch; i++) {
+			dma_addr_t addr = xsk_buff_xdp_get_frame_dma(wi->alloc_units[i].xsk);
+
+			umr_wqe->inline_mtts[i] = (struct mlx5_mtt) {
+				.ptag = cpu_to_be64(addr | MLX5_EN_WR),
+			};
+		}
+	} else if (unlikely(rq->mpwqe.umr_mode == MLX5E_MPWRQ_UMR_MODE_UNALIGNED)) {
 		for (i = 0; i < batch; i++) {
 			dma_addr_t addr = xsk_buff_xdp_get_frame_dma(wi->alloc_units[i].xsk);
 
@@ -51,11 +59,22 @@ int mlx5e_xsk_alloc_rx_mpwqe(struct mlx5e_rq *rq, u16 ix)
 			};
 		}
 	} else {
+		__be32 pad_size = cpu_to_be32((1 << rq->mpwqe.page_shift) -
+					      rq->xsk_pool->chunk_size);
+		__be32 frame_size = cpu_to_be32(rq->xsk_pool->chunk_size);
+
 		for (i = 0; i < batch; i++) {
 			dma_addr_t addr = xsk_buff_xdp_get_frame_dma(wi->alloc_units[i].xsk);
 
-			umr_wqe->inline_mtts[i] = (struct mlx5_mtt) {
-				.ptag = cpu_to_be64(addr | MLX5_EN_WR),
+			umr_wqe->inline_klms[i << 1] = (struct mlx5_klm) {
+				.key = rq->mkey_be,
+				.va = cpu_to_be64(addr),
+				.bcount = frame_size,
+			};
+			umr_wqe->inline_klms[(i << 1) + 1] = (struct mlx5_klm) {
+				.key = rq->mkey_be,
+				.va = cpu_to_be64(rq->wqe_overflow.addr),
+				.bcount = pad_size,
 			};
 		}
 	}
@@ -70,6 +89,8 @@ int mlx5e_xsk_alloc_rx_mpwqe(struct mlx5e_rq *rq, u16 ix)
 	offset = ix * rq->mpwqe.mtts_per_wqe;
 	if (likely(rq->mpwqe.umr_mode == MLX5E_MPWRQ_UMR_MODE_ALIGNED))
 		offset = offset * sizeof(struct mlx5_mtt) / MLX5_OCTWORD;
+	else if (unlikely(rq->mpwqe.umr_mode == MLX5E_MPWRQ_UMR_MODE_OVERSIZED))
+		offset = offset * sizeof(struct mlx5_klm) * 2 / MLX5_OCTWORD;
 	umr_wqe->uctrl.xlt_offset = cpu_to_be16(offset);
 
 	icosq->db.wqe_info[pi] = (struct mlx5e_icosq_wqe_info) {
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index 2093b6cc6c7c..ae728745379d 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -299,6 +299,8 @@ static u8 mlx5e_mpwrq_access_mode(enum mlx5e_mpwrq_umr_mode umr_mode)
 		return MLX5_MKC_ACCESS_MODE_MTT;
 	case MLX5E_MPWRQ_UMR_MODE_UNALIGNED:
 		return MLX5_MKC_ACCESS_MODE_KSM;
+	case MLX5E_MPWRQ_UMR_MODE_OVERSIZED:
+		return MLX5_MKC_ACCESS_MODE_KLMS;
 	}
 	WARN_ONCE(1, "MPWRQ UMR mode %d is not known\n", umr_mode);
 	return 0;
@@ -307,10 +309,12 @@ static u8 mlx5e_mpwrq_access_mode(enum mlx5e_mpwrq_umr_mode umr_mode)
 static int mlx5e_create_umr_mkey(struct mlx5_core_dev *mdev,
 				 u32 npages, u8 page_shift, u32 *umr_mkey,
 				 dma_addr_t filler_addr,
-				 enum mlx5e_mpwrq_umr_mode umr_mode)
+				 enum mlx5e_mpwrq_umr_mode umr_mode,
+				 u32 xsk_chunk_size)
 {
 	struct mlx5_mtt *mtt;
 	struct mlx5_ksm *ksm;
+	struct mlx5_klm *klm;
 	u32 octwords;
 	int inlen;
 	void *mkc;
@@ -347,7 +351,8 @@ static int mlx5e_create_umr_mkey(struct mlx5_core_dev *mdev,
 	MLX5_SET(mkc, mkc, pd, mdev->mlx5e_res.hw_objs.pdn);
 	MLX5_SET64(mkc, mkc, len, npages << page_shift);
 	MLX5_SET(mkc, mkc, translations_octword_size, octwords);
-	MLX5_SET(mkc, mkc, log_page_size, page_shift);
+	if (umr_mode != MLX5E_MPWRQ_UMR_MODE_OVERSIZED)
+		MLX5_SET(mkc, mkc, log_page_size, page_shift);
 	MLX5_SET(create_mkey_in, in, translations_octword_actual_size, octwords);
 
 	/* Initialize the mkey with all MTTs pointing to a default
@@ -357,6 +362,21 @@ static int mlx5e_create_umr_mkey(struct mlx5_core_dev *mdev,
 	 * to the default page.
 	 */
 	switch (umr_mode) {
+	case MLX5E_MPWRQ_UMR_MODE_OVERSIZED:
+		klm = MLX5_ADDR_OF(create_mkey_in, in, klm_pas_mtt);
+		for (i = 0; i < npages; i++) {
+			klm[i << 1] = (struct mlx5_klm) {
+				.va = cpu_to_be64(filler_addr),
+				.bcount = cpu_to_be32(xsk_chunk_size),
+				.key = cpu_to_be32(mdev->mlx5e_res.hw_objs.mkey),
+			};
+			klm[(i << 1) + 1] = (struct mlx5_klm) {
+				.va = cpu_to_be64(filler_addr),
+				.bcount = cpu_to_be32((1 << page_shift) - xsk_chunk_size),
+				.key = cpu_to_be32(mdev->mlx5e_res.hw_objs.mkey),
+			};
+		}
+		break;
 	case MLX5E_MPWRQ_UMR_MODE_UNALIGNED:
 		ksm = MLX5_ADDR_OF(create_mkey_in, in, klm_pas_mtt);
 		for (i = 0; i < npages; i++)
@@ -415,6 +435,7 @@ static int mlx5e_create_umr_klm_mkey(struct mlx5_core_dev *mdev,
 
 static int mlx5e_create_rq_umr_mkey(struct mlx5_core_dev *mdev, struct mlx5e_rq *rq)
 {
+	u32 xsk_chunk_size = rq->xsk_pool ? rq->xsk_pool->chunk_size : 0;
 	u32 wq_size = mlx5_wq_ll_get_size(&rq->mpwqe.wq);
 	u32 num_entries, max_num_entries;
 	u32 umr_mkey;
@@ -432,7 +453,7 @@ static int mlx5e_create_rq_umr_mkey(struct mlx5_core_dev *mdev, struct mlx5e_rq
 
 	err = mlx5e_create_umr_mkey(mdev, num_entries, rq->mpwqe.page_shift,
 				    &umr_mkey, rq->wqe_overflow.addr,
-				    rq->mpwqe.umr_mode);
+				    rq->mpwqe.umr_mode, xsk_chunk_size);
 	rq->mpwqe.umr_mkey_be = cpu_to_be32(umr_mkey);
 	return err;
 }
-- 
2.37.3

next prev parent reply	other threads:[~2022-10-02  5:14 UTC|newest]

Thread overview: 17+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2022-10-02  4:56 [PATCH net-next 00/15] ] mlx5 xsk updates part4 and more Saeed Mahameed
2022-10-02  4:56 ` [PATCH net-next 01/15] net/mlx5e: xsk: Flush RQ on XSK activation to save memory Saeed Mahameed
2022-10-02  4:56 ` [PATCH net-next 02/15] net/mlx5e: xsk: Set napi_id to support busy polling Saeed Mahameed
2022-10-02  4:56 ` [PATCH net-next 03/15] net/mlx5e: xsk: Include XSK skb_from_cqe callbacks in INDIRECT_CALL Saeed Mahameed
2022-10-02  4:56 ` [PATCH net-next 04/15] net/mlx5e: xsk: Improve need_wakeup logic Saeed Mahameed
2022-10-02  4:56 ` [PATCH net-next 05/15] net/mlx5e: xsk: Use umr_mode to calculate striding RQ parameters Saeed Mahameed
2022-10-02  4:56 ` [PATCH net-next 06/15] net/mlx5e: Improve MTT/KSM alignment Saeed Mahameed
2022-10-02  4:56 ` Saeed Mahameed [this message]
2022-10-02  4:56 ` [PATCH net-next 08/15] net/mlx5e: xsk: Print a warning in slow configurations Saeed Mahameed
2022-10-02  4:56 ` [PATCH net-next 09/15] net/mlx5e: xsk: Optimize for unaligned mode with 3072-byte frames Saeed Mahameed
2022-10-02  4:56 ` [PATCH net-next 10/15] net/mlx5e: Expose rx_oversize_pkts_buffer counter Saeed Mahameed
2022-10-02  4:56 ` [PATCH net-next 11/15] net/mlx5: Start health poll at earlier stage of driver load Saeed Mahameed
2022-10-02  4:56 ` [PATCH net-next 12/15] net/mlx5: Set default grace period based on function type Saeed Mahameed
2022-10-02  4:56 ` [PATCH net-next 13/15] net/mlx5: E-Switch, Allow offloading fwd dest flow table with vport Saeed Mahameed
2022-10-02  4:56 ` [PATCH net-next 14/15] net/mlx5: E-switch, Don't update group if qos is not enabled Saeed Mahameed
2022-10-02  4:56 ` [PATCH net-next 15/15] net/mlx5: E-Switch, Return EBUSY if can't get mode lock Saeed Mahameed
2022-10-04  0:10 ` [PATCH net-next 00/15] ] mlx5 xsk updates part4 and more patchwork-bot+netdevbpf

find likely ancestor, descendant, or conflicting patches for this message:
( dfblob:93607db1dea dfblob:7c6861d6148 dfblob:e8c3b8abf94
dfblob:203448ee959 dfblob:4b2df289550 dfblob:78d74670434
dfblob:2093b6cc6c7 dfblob:ae728745379 )
 OR (
bs:"[PATCH net-next 07/15] net/mlx5e: xsk: Use KLM to protect frame overrun in unaligned mode" )
	(help)

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20221002045632.291612-8-saeed@kernel.org \
    --to=saeed@kernel.org \
    --cc=davem@davemloft.net \
    --cc=edumazet@google.com \
    --cc=kuba@kernel.org \
    --cc=maximmi@nvidia.com \
    --cc=netdev@vger.kernel.org \
    --cc=pabeni@redhat.com \
    --cc=saeedm@nvidia.com \
    --cc=tariqt@nvidia.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).