Netdev List

Netdev List
 help / color / mirror / Atom feed

* [PATCH 08/17] octeontx2-af: Update bcast list upon NIXLF alloc/free
From: sunil.kovvuri @ 2018-10-19 13:37 UTC (permalink / raw)
  To: netdev, davem; +Cc: arnd, linux-soc, Sunil Goutham
In-Reply-To: <1539956258-29377-1-git-send-email-sunil.kovvuri@gmail.com>

From: Sunil Goutham <sgoutham@marvell.com>

Upon NIXLF ALLOC/FREE, add or remove corresponding PF_FUNC from
the broadcast packet replication list of the CGX LMAC mapped
RVU PF.

Signed-off-by: Sunil Goutham <sgoutham@marvell.com>
---
 .../net/ethernet/marvell/octeontx2/af/rvu_nix.c    | 133 +++++++++++++++++++++
 1 file changed, 133 insertions(+)

diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c b/drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c
index 947424a..8333283 100644
--- a/drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c
+++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c
@@ -16,6 +16,8 @@
 #include "rvu.h"
 #include "cgx.h"
 
+static int nix_update_bcast_mce_list(struct rvu *rvu, u16 pcifunc, bool add);
+
 enum mc_tbl_sz {
 	MC_TBL_SZ_256,
 	MC_TBL_SZ_512,
@@ -108,6 +110,7 @@ static int nix_interface_init(struct rvu *rvu, u16 pcifunc, int type, int nixlf)
 	struct rvu_pfvf *pfvf = rvu_get_pfvf(rvu, pcifunc);
 	u8 cgx_id, lmac_id;
 	int pkind, pf;
+	int err;
 
 	pf = rvu_get_pf(pcifunc);
 	if (!is_pf_cgxmapped(rvu, pf) && type != NIX_INTF_TYPE_LBK)
@@ -130,9 +133,30 @@ static int nix_interface_init(struct rvu *rvu, u16 pcifunc, int type, int nixlf)
 	case NIX_INTF_TYPE_LBK:
 		break;
 	}
+
+	/* Add this PF_FUNC to bcast pkt replication list */
+	err = nix_update_bcast_mce_list(rvu, pcifunc, true);
+	if (err) {
+		dev_err(rvu->dev,
+			"Bcast list, failed to enable PF_FUNC 0x%x\n",
+			pcifunc);
+	}
 	return 0;
 }
 
+static void nix_interface_deinit(struct rvu *rvu, u16 pcifunc, u8 nixlf)
+{
+	int err;
+
+	/* Remove this PF_FUNC from bcast pkt replication list */
+	err = nix_update_bcast_mce_list(rvu, pcifunc, false);
+	if (err) {
+		dev_err(rvu->dev,
+			"Bcast list, failed to disable PF_FUNC 0x%x\n",
+			pcifunc);
+	}
+}
+
 static void nix_setup_lso_tso_l3(struct rvu *rvu, int blkaddr,
 				 u64 format, bool v4, u64 *fidx)
 {
@@ -786,6 +810,8 @@ int rvu_mbox_handler_NIX_LF_FREE(struct rvu *rvu, struct msg_req *req,
 	if (nixlf < 0)
 		return NIX_AF_ERR_AF_LF_INVALID;
 
+	nix_interface_deinit(rvu, pcifunc, nixlf);
+
 	/* Reset this NIX LF */
 	err = rvu_lf_reset(rvu, block, nixlf);
 	if (err) {
@@ -1147,6 +1173,113 @@ static int nix_setup_mce(struct rvu *rvu, int mce, u8 op,
 	return 0;
 }
 
+static int nix_update_mce_list(struct nix_mce_list *mce_list,
+			       u16 pcifunc, int idx, bool add)
+{
+	struct mce *mce, *tail = NULL;
+	bool delete = false;
+
+	/* Scan through the current list */
+	hlist_for_each_entry(mce, &mce_list->head, node) {
+		/* If already exists, then delete */
+		if (mce->pcifunc == pcifunc && !add) {
+			delete = true;
+			break;
+		}
+		tail = mce;
+	}
+
+	if (delete) {
+		hlist_del(&mce->node);
+		kfree(mce);
+		mce_list->count--;
+		return 0;
+	}
+
+	if (!add)
+		return 0;
+
+	/* Add a new one to the list, at the tail */
+	mce = kzalloc(sizeof(*mce), GFP_KERNEL);
+	if (!mce)
+		return -ENOMEM;
+	mce->idx = idx;
+	mce->pcifunc = pcifunc;
+	if (!tail)
+		hlist_add_head(&mce->node, &mce_list->head);
+	else
+		hlist_add_behind(&mce->node, &tail->node);
+	mce_list->count++;
+	return 0;
+}
+
+static int nix_update_bcast_mce_list(struct rvu *rvu, u16 pcifunc, bool add)
+{
+	int err = 0, idx, next_idx, count;
+	struct nix_mce_list *mce_list;
+	struct mce *mce, *next_mce;
+	struct nix_mcast *mcast;
+	struct nix_hw *nix_hw;
+	struct rvu_pfvf *pfvf;
+	int blkaddr;
+
+	blkaddr = rvu_get_blkaddr(rvu, BLKTYPE_NIX, pcifunc);
+	if (blkaddr < 0)
+		return 0;
+
+	nix_hw = get_nix_hw(rvu->hw, blkaddr);
+	if (!nix_hw)
+		return 0;
+
+	mcast = &nix_hw->mcast;
+
+	/* Get this PF/VF func's MCE index */
+	pfvf = rvu_get_pfvf(rvu, pcifunc & ~RVU_PFVF_FUNC_MASK);
+	idx = pfvf->bcast_mce_idx + (pcifunc & RVU_PFVF_FUNC_MASK);
+
+	mce_list = &pfvf->bcast_mce_list;
+	if (idx > (pfvf->bcast_mce_idx + mce_list->max)) {
+		dev_err(rvu->dev,
+			"%s: Idx %d > max MCE idx %d, for PF%d bcast list\n",
+			__func__, idx, mce_list->max,
+			pcifunc >> RVU_PFVF_PF_SHIFT);
+		return -EINVAL;
+	}
+
+	spin_lock(&mcast->mce_lock);
+
+	err = nix_update_mce_list(mce_list, pcifunc, idx, add);
+	if (err)
+		goto end;
+
+	/* Disable MCAM entry in NPC */
+
+	if (!mce_list->count)
+		goto end;
+	count = mce_list->count;
+
+	/* Dump the updated list to HW */
+	hlist_for_each_entry(mce, &mce_list->head, node) {
+		next_idx = 0;
+		count--;
+		if (count) {
+			next_mce = hlist_entry(mce->node.next,
+					       struct mce, node);
+			next_idx = next_mce->idx;
+		}
+		/* EOL should be set in last MCE */
+		err = nix_setup_mce(rvu, mce->idx,
+				    NIX_AQ_INSTOP_WRITE, mce->pcifunc,
+				    next_idx, count ? false : true);
+		if (err)
+			goto end;
+	}
+
+end:
+	spin_unlock(&mcast->mce_lock);
+	return err;
+}
+
 static int nix_setup_bcast_tables(struct rvu *rvu, struct nix_hw *nix_hw)
 {
 	struct nix_mcast *mcast = &nix_hw->mcast;
-- 
2.7.4

^ permalink raw reply related

* [PATCH 09/17] octeontx2-af: Support for VTAG strip and capture
From: sunil.kovvuri @ 2018-10-19 13:37 UTC (permalink / raw)
  To: netdev, davem; +Cc: arnd, linux-soc, Vamsi Attunuru, Sunil Goutham
In-Reply-To: <1539956258-29377-1-git-send-email-sunil.kovvuri@gmail.com>

From: Vamsi Attunuru <vamsi.attunuru@marvell.com>

Added support for PF/VF drivers to configure NIX to
capture and/or strip VLAN tag from ingress packets.

Signed-off-by: Vamsi Attunuru <vamsi.attunuru@marvell.com>
Signed-off-by: Sunil Goutham <sgoutham@marvell.com>
---
 drivers/net/ethernet/marvell/octeontx2/af/mbox.h   | 35 ++++++++++++-
 drivers/net/ethernet/marvell/octeontx2/af/rvu.h    |  3 ++
 .../net/ethernet/marvell/octeontx2/af/rvu_nix.c    | 59 ++++++++++++++++++++++
 .../net/ethernet/marvell/octeontx2/af/rvu_struct.h |  5 ++
 4 files changed, 101 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/marvell/octeontx2/af/mbox.h b/drivers/net/ethernet/marvell/octeontx2/af/mbox.h
index f8efeaa..b60ac9d 100644
--- a/drivers/net/ethernet/marvell/octeontx2/af/mbox.h
+++ b/drivers/net/ethernet/marvell/octeontx2/af/mbox.h
@@ -156,7 +156,8 @@ M(NIX_HWCTX_DISABLE,	0x8003, hwctx_disable_req, msg_rsp)		\
 M(NIX_TXSCH_ALLOC,	0x8004, nix_txsch_alloc_req, nix_txsch_alloc_rsp) \
 M(NIX_TXSCH_FREE,	0x8005, nix_txsch_free_req, msg_rsp)		\
 M(NIX_TXSCHQ_CFG,	0x8006, nix_txschq_config, msg_rsp)		\
-M(NIX_STATS_RST,	0x8007, msg_req, msg_rsp)
+M(NIX_STATS_RST,	0x8007, msg_req, msg_rsp)			\
+M(NIX_VTAG_CFG,	0x8008, nix_vtag_config, msg_rsp)
 
 /* Messages initiated by AF (range 0xC00 - 0xDFF) */
 #define MBOX_UP_CGX_MESSAGES						\
@@ -462,4 +463,36 @@ struct nix_txschq_config {
 	u64 regval[MAX_REGS_PER_MBOX_MSG];
 };
 
+struct nix_vtag_config {
+	struct mbox_msghdr hdr;
+	u8 vtag_size;
+	/* cfg_type is '0' for tx vlan cfg
+	 * cfg_type is '1' for rx vlan cfg
+	 */
+	u8 cfg_type;
+	union {
+		/* valid when cfg_type is '0' */
+		struct {
+			/* tx vlan0 tag(C-VLAN) */
+			u64 vlan0;
+			/* tx vlan1 tag(S-VLAN) */
+			u64 vlan1;
+			/* insert tx vlan tag */
+			u8 insert_vlan :1;
+			/* insert tx double vlan tag */
+			u8 double_vlan :1;
+		} tx;
+
+		/* valid when cfg_type is '1' */
+		struct {
+			/* rx vtag type index */
+			u8 vtag_type;
+			/* rx vtag strip */
+			u8 strip_vtag :1;
+			/* rx vtag capture */
+			u8 capture_vtag :1;
+		} rx;
+	};
+};
+
 #endif /* MBOX_H */
diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu.h b/drivers/net/ethernet/marvell/octeontx2/af/rvu.h
index b39400d..1e85e80 100644
--- a/drivers/net/ethernet/marvell/octeontx2/af/rvu.h
+++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu.h
@@ -319,6 +319,9 @@ int rvu_mbox_handler_NIX_TXSCHQ_CFG(struct rvu *rvu,
 				    struct msg_rsp *rsp);
 int rvu_mbox_handler_NIX_STATS_RST(struct rvu *rvu, struct msg_req *req,
 				   struct msg_rsp *rsp);
+int rvu_mbox_handler_NIX_VTAG_CFG(struct rvu *rvu,
+				  struct nix_vtag_config *req,
+				  struct msg_rsp *rsp);
 
 /* NPC APIs */
 int rvu_npc_init(struct rvu *rvu);
diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c b/drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c
index 8333283..7de5417 100644
--- a/drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c
+++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c
@@ -1143,6 +1143,65 @@ int rvu_mbox_handler_NIX_TXSCHQ_CFG(struct rvu *rvu,
 	return 0;
 }
 
+static int nix_rx_vtag_cfg(struct rvu *rvu, int nixlf, int blkaddr,
+			   struct nix_vtag_config *req)
+{
+	u64 regval = 0;
+
+#define NIX_VTAGTYPE_MAX 0x8ull
+#define NIX_VTAGSIZE_MASK 0x7ull
+#define NIX_VTAGSTRIP_CAP_MASK 0x30ull
+
+	if (req->rx.vtag_type >= NIX_VTAGTYPE_MAX ||
+	    req->vtag_size > VTAGSIZE_T8)
+		return -EINVAL;
+
+	regval = rvu_read64(rvu, blkaddr,
+			    NIX_AF_LFX_RX_VTAG_TYPEX(nixlf, req->rx.vtag_type));
+
+	if (req->rx.strip_vtag && req->rx.capture_vtag)
+		regval |= BIT_ULL(4) | BIT_ULL(5);
+	else if (req->rx.strip_vtag)
+		regval |= BIT_ULL(4);
+	else
+		regval &= ~(BIT_ULL(4) | BIT_ULL(5));
+
+	regval &= ~NIX_VTAGSIZE_MASK;
+	regval |= req->vtag_size & NIX_VTAGSIZE_MASK;
+
+	rvu_write64(rvu, blkaddr,
+		    NIX_AF_LFX_RX_VTAG_TYPEX(nixlf, req->rx.vtag_type), regval);
+	return 0;
+}
+
+int rvu_mbox_handler_NIX_VTAG_CFG(struct rvu *rvu,
+				  struct nix_vtag_config *req,
+				  struct msg_rsp *rsp)
+{
+	struct rvu_hwinfo *hw = rvu->hw;
+	u16 pcifunc = req->hdr.pcifunc;
+	int blkaddr, nixlf, err;
+
+	blkaddr = rvu_get_blkaddr(rvu, BLKTYPE_NIX, pcifunc);
+	if (blkaddr < 0)
+		return NIX_AF_ERR_AF_LF_INVALID;
+
+	nixlf = rvu_get_lf(rvu, &hw->block[blkaddr], pcifunc, 0);
+	if (nixlf < 0)
+		return NIX_AF_ERR_AF_LF_INVALID;
+
+	if (req->cfg_type) {
+		err = nix_rx_vtag_cfg(rvu, nixlf, blkaddr, req);
+		if (err)
+			return NIX_AF_ERR_PARAM;
+	} else {
+		/* TODO: handle tx vtag configuration */
+		return 0;
+	}
+
+	return 0;
+}
+
 static int nix_setup_mce(struct rvu *rvu, int mce, u8 op,
 			 u16 pcifunc, int next, bool eol)
 {
diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu_struct.h b/drivers/net/ethernet/marvell/octeontx2/af/rvu_struct.h
index c331b237..bb2836e 100644
--- a/drivers/net/ethernet/marvell/octeontx2/af/rvu_struct.h
+++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu_struct.h
@@ -879,4 +879,9 @@ struct nix_lso_format {
 #endif
 };
 
+/* NIX VTAG size */
+enum nix_vtag_size {
+	VTAGSIZE_T4   = 0x0,
+	VTAGSIZE_T8   = 0x1,
+};
 #endif /* RVU_STRUCT_H */
-- 
2.7.4

^ permalink raw reply related

* [PATCH 10/17] octeontx2-af: Enable packet length and csum validation
From: sunil.kovvuri @ 2018-10-19 13:37 UTC (permalink / raw)
  To: netdev, davem; +Cc: arnd, linux-soc, Sunil Goutham
In-Reply-To: <1539956258-29377-1-git-send-email-sunil.kovvuri@gmail.com>

From: Sunil Goutham <sgoutham@marvell.com>

Config NPC layer info from KPU profile into protocol
checker to identify outer L2/IPv4/TCP/UDP headers in a
packet. And enable IPv4 checksum validation.

L3/L4 and L4 CSUM validation will be enabled by PF/VF
drivers by configuring NIX_AF_LF(0..127)_RX_CFG via mbox
i.e 'nix_lf_alloc_req->rx_cfg'

Also enable setting of NPC_RESULT_S[L2B] when an outer
L2 broadcast address is detected. This will help in
installing NPC MCAM rules for broadcast packets.

Signed-off-by: Sunil Goutham <sgoutham@marvell.com>
---
 drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c | 14 ++++++++++++++
 drivers/net/ethernet/marvell/octeontx2/af/rvu_npc.c | 14 ++++++++++++++
 2 files changed, 28 insertions(+)

diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c b/drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c
index 7de5417..02e1d16 100644
--- a/drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c
+++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c
@@ -14,6 +14,7 @@
 #include "rvu_struct.h"
 #include "rvu_reg.h"
 #include "rvu.h"
+#include "npc.h"
 #include "cgx.h"
 
 static int nix_update_bcast_mce_list(struct rvu *rvu, u16 pcifunc, bool add);
@@ -1630,6 +1631,19 @@ int rvu_nix_init(struct rvu *rvu)
 		err = nix_setup_mcast(rvu, hw->nix0, blkaddr);
 		if (err)
 			return err;
+
+		/* Config Outer L2, IP, TCP and UDP's NPC layer info.
+		 * This helps HW protocol checker to identify headers
+		 * and validate length and checksums.
+		 */
+		rvu_write64(rvu, blkaddr, NIX_AF_RX_DEF_OL2,
+			    (NPC_LID_LA << 8) | (NPC_LT_LA_ETHER << 4) | 0x0F);
+		rvu_write64(rvu, blkaddr, NIX_AF_RX_DEF_OUDP,
+			    (NPC_LID_LD << 8) | (NPC_LT_LD_UDP << 4) | 0x0F);
+		rvu_write64(rvu, blkaddr, NIX_AF_RX_DEF_OTCP,
+			    (NPC_LID_LD << 8) | (NPC_LT_LD_TCP << 4) | 0x0F);
+		rvu_write64(rvu, blkaddr, NIX_AF_RX_DEF_OIP4,
+			    (NPC_LID_LC << 8) | (NPC_LT_LC_IP << 4) | 0x0F);
 	}
 	return 0;
 }
diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu_npc.c b/drivers/net/ethernet/marvell/octeontx2/af/rvu_npc.c
index a973895..cc1d8c9 100644
--- a/drivers/net/ethernet/marvell/octeontx2/af/rvu_npc.c
+++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu_npc.c
@@ -220,6 +220,20 @@ int rvu_npc_init(struct rvu *rvu)
 	/* Configure KPU profile */
 	npc_parser_profile_init(rvu, blkaddr);
 
+	/* Config Outer L2, IPv4's NPC layer info */
+	rvu_write64(rvu, blkaddr, NPC_AF_PCK_DEF_OL2,
+		    (NPC_LID_LA << 8) | (NPC_LT_LA_ETHER << 4) | 0x0F);
+	rvu_write64(rvu, blkaddr, NPC_AF_PCK_DEF_OIP4,
+		    (NPC_LID_LC << 8) | (NPC_LT_LC_IP << 4) | 0x0F);
+
+	/* Enable below for Rx pkts.
+	 * - Outer IPv4 header checksum validation.
+	 * - Detect outer L2 broadcast address and set NPC_RESULT_S[L2M].
+	 */
+	rvu_write64(rvu, blkaddr, NPC_AF_PCK_CFG,
+		    rvu_read64(rvu, blkaddr, NPC_AF_PCK_CFG) |
+		    BIT_ULL(6) | BIT_ULL(2));
+
 	return 0;
 }
 
-- 
2.7.4

^ permalink raw reply related

* [PATCH 11/17] octeontx2-af: NPC MCAM and LDATA extract minimal configuration
From: sunil.kovvuri @ 2018-10-19 13:37 UTC (permalink / raw)
  To: netdev, davem; +Cc: arnd, linux-soc, Sunil Goutham
In-Reply-To: <1539956258-29377-1-git-send-email-sunil.kovvuri@gmail.com>

From: Sunil Goutham <sgoutham@marvell.com>

This patch adds some minimal configuration for NPC MCAM and
LDATA extraction which is sufficient enough to install
ucast/bcast/promiscuous forwarding rules. Below is the
config done
- LDATA extraction config to extract DMAC from pkt
  to offset 64bit in MCAM search key.
- Set MCAM lookup keysize to 224bits
- Set MCAM TX miss action to UCAST_DEFAULT
- Set MCAM RX miss action to DROP

Also inorder to have guaranteed space in MCAM to install
ucast forwarding rule for each of RVU PF/VF, reserved
one MCAM entry for each of NIXLF for ucast rule. And two
entries for each of RVU PF. One for bcast pkt replication
and other for promiscuous mode which allows all pkts
received on a HW CGX/LBK channel.

Signed-off-by: Sunil Goutham <sgoutham@marvell.com>
---
 drivers/net/ethernet/marvell/octeontx2/af/common.h |  21 ++++
 drivers/net/ethernet/marvell/octeontx2/af/rvu.h    |  14 +++
 .../net/ethernet/marvell/octeontx2/af/rvu_nix.c    |  12 ++
 .../net/ethernet/marvell/octeontx2/af/rvu_npc.c    | 134 +++++++++++++++++++++
 4 files changed, 181 insertions(+)

diff --git a/drivers/net/ethernet/marvell/octeontx2/af/common.h b/drivers/net/ethernet/marvell/octeontx2/af/common.h
index 7c53ba3..e438f92 100644
--- a/drivers/net/ethernet/marvell/octeontx2/af/common.h
+++ b/drivers/net/ethernet/marvell/octeontx2/af/common.h
@@ -143,6 +143,27 @@ enum nix_scheduler {
 	NIX_TXSCH_LVL_CNT = 0x5,
 };
 
+/* NIX RX action operation*/
+#define NIX_RX_ACTIONOP_DROP		(0x0ull)
+#define NIX_RX_ACTIONOP_UCAST		(0x1ull)
+#define NIX_RX_ACTIONOP_UCAST_IPSEC	(0x2ull)
+#define NIX_RX_ACTIONOP_MCAST		(0x3ull)
+#define NIX_RX_ACTIONOP_RSS		(0x4ull)
+
+/* NIX TX action operation*/
+#define NIX_TX_ACTIONOP_DROP		(0x0ull)
+#define NIX_TX_ACTIONOP_UCAST_DEFAULT	(0x1ull)
+#define NIX_TX_ACTIONOP_UCAST_CHAN	(0x2ull)
+#define NIX_TX_ACTIONOP_MCAST		(0x3ull)
+#define NIX_TX_ACTIONOP_DROP_VIOL	(0x5ull)
+
+#define NPC_MCAM_KEY_X1			0
+#define NPC_MCAM_KEY_X2			1
+#define NPC_MCAM_KEY_X4			2
+
+#define NIX_INTF_RX			0
+#define NIX_INTF_TX			1
+
 #define NIX_INTF_TYPE_CGX		0
 #define NIX_INTF_TYPE_LBK		1
 
diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu.h b/drivers/net/ethernet/marvell/octeontx2/af/rvu.h
index 1e85e80..9fa5183 100644
--- a/drivers/net/ethernet/marvell/octeontx2/af/rvu.h
+++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu.h
@@ -73,6 +73,18 @@ struct nix_mce_list {
 	int			max;
 };
 
+struct npc_mcam {
+	spinlock_t	lock;	/* MCAM entries and counters update lock */
+	u8	keysize;	/* MCAM keysize 112/224/448 bits */
+	u8	banks;		/* Number of MCAM banks */
+	u8	banks_per_entry;/* Number of keywords in key */
+	u16	banksize;	/* Number of MCAM entries in each bank */
+	u16	total_entries;	/* Total number of MCAM entries */
+	u16     entries;	/* Total minus reserved for NIX LFs */
+	u16	nixlf_offset;	/* Offset of nixlf rsvd uncast entries */
+	u16	pf_offset;	/* Offset of PF's rsvd bcast, promisc entries */
+};
+
 /* Structure for per RVU func info ie PF/VF */
 struct rvu_pfvf {
 	bool		npalf; /* Only one NPALF per RVU_FUNC */
@@ -144,6 +156,7 @@ struct rvu_hwinfo {
 	struct rvu_block block[BLK_COUNT]; /* Block info */
 	struct nix_hw    *nix0;
 	struct npc_pkind pkind;
+	struct npc_mcam  mcam;
 };
 
 struct rvu {
@@ -297,6 +310,7 @@ int rvu_mbox_handler_NPA_LF_FREE(struct rvu *rvu, struct msg_req *req,
 /* NIX APIs */
 int rvu_nix_init(struct rvu *rvu);
 void rvu_nix_freemem(struct rvu *rvu);
+int rvu_get_nixlf_count(struct rvu *rvu);
 int rvu_mbox_handler_NIX_LF_ALLOC(struct rvu *rvu,
 				  struct nix_lf_alloc_req *req,
 				  struct nix_lf_alloc_rsp *rsp);
diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c b/drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c
index 02e1d16..55075e7 100644
--- a/drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c
+++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c
@@ -55,6 +55,18 @@ struct mce {
 	u16			pcifunc;
 };
 
+int rvu_get_nixlf_count(struct rvu *rvu)
+{
+	struct rvu_block *block;
+	int blkaddr;
+
+	blkaddr = rvu_get_blkaddr(rvu, BLKTYPE_NIX, 0);
+	if (blkaddr < 0)
+		return 0;
+	block = &rvu->hw->block[blkaddr];
+	return block->lf.max;
+}
+
 static void nix_mce_list_init(struct nix_mce_list *list, int max)
 {
 	INIT_HLIST_HEAD(&list->head);
diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu_npc.c b/drivers/net/ethernet/marvell/octeontx2/af/rvu_npc.c
index cc1d8c9..1c29436 100644
--- a/drivers/net/ethernet/marvell/octeontx2/af/rvu_npc.c
+++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu_npc.c
@@ -17,6 +17,15 @@
 #include "npc.h"
 #include "npc_profile.h"
 
+#define RSVD_MCAM_ENTRIES_PER_PF	2 /* Bcast & Promisc */
+#define RSVD_MCAM_ENTRIES_PER_NIXLF	1 /* Ucast for LFs */
+
+#define NIXLF_UCAST_ENTRY	0
+#define NIXLF_BCAST_ENTRY	1
+#define NIXLF_PROMISC_ENTRY	2
+
+#define NPC_PARSE_RESULT_DMAC_OFFSET	8
+
 void rvu_npc_set_pkind(struct rvu *rvu, int pkind, struct rvu_pfvf *pfvf)
 {
 	int blkaddr;
@@ -45,6 +54,53 @@ int rvu_npc_get_pkind(struct rvu *rvu, u16 pf)
 	return -1;
 }
 
+#define LDATA_EXTRACT_CONFIG(intf, lid, ltype, ld, cfg) \
+	rvu_write64(rvu, blkaddr,			\
+		NPC_AF_INTFX_LIDX_LTX_LDX_CFG(intf, lid, ltype, ld), cfg)
+
+#define LDATA_FLAGS_CONFIG(intf, ld, flags, cfg)	\
+	rvu_write64(rvu, blkaddr,			\
+		NPC_AF_INTFX_LDATAX_FLAGSX_CFG(intf, ld, flags), cfg)
+
+static void npc_config_ldata_extract(struct rvu *rvu, int blkaddr)
+{
+	struct npc_mcam *mcam = &rvu->hw->mcam;
+	int lid, ltype;
+	int lid_count;
+	u64 cfg;
+
+	cfg = rvu_read64(rvu, blkaddr, NPC_AF_CONST);
+	lid_count = (cfg >> 4) & 0xF;
+
+	/* First clear any existing config i.e
+	 * disable LDATA and FLAGS extraction.
+	 */
+	for (lid = 0; lid < lid_count; lid++) {
+		for (ltype = 0; ltype < 16; ltype++) {
+			LDATA_EXTRACT_CONFIG(NIX_INTF_RX, lid, ltype, 0, 0ULL);
+			LDATA_EXTRACT_CONFIG(NIX_INTF_RX, lid, ltype, 1, 0ULL);
+			LDATA_EXTRACT_CONFIG(NIX_INTF_TX, lid, ltype, 0, 0ULL);
+			LDATA_EXTRACT_CONFIG(NIX_INTF_TX, lid, ltype, 1, 0ULL);
+
+			LDATA_FLAGS_CONFIG(NIX_INTF_RX, 0, ltype, 0ULL);
+			LDATA_FLAGS_CONFIG(NIX_INTF_RX, 1, ltype, 0ULL);
+			LDATA_FLAGS_CONFIG(NIX_INTF_TX, 0, ltype, 0ULL);
+			LDATA_FLAGS_CONFIG(NIX_INTF_TX, 1, ltype, 0ULL);
+		}
+	}
+
+	/* If we plan to extract Outer IPv4 tuple for TCP/UDP pkts
+	 * then 112bit key is not sufficient
+	 */
+	if (mcam->keysize != NPC_MCAM_KEY_X2)
+		return;
+
+	/* Start placing extracted data/flags from 64bit onwards, for now */
+	/* Extract DMAC from the packet */
+	cfg = (0x05 << 16) | BIT_ULL(7) | NPC_PARSE_RESULT_DMAC_OFFSET;
+	LDATA_EXTRACT_CONFIG(NIX_INTF_RX, NPC_LID_LA, NPC_LT_LA_ETHER, 0, cfg);
+}
+
 static void npc_config_kpuaction(struct rvu *rvu, int blkaddr,
 				 struct npc_kpu_profile_action *kpuaction,
 				 int kpu, int entry, bool pkind)
@@ -193,9 +249,61 @@ static void npc_parser_profile_init(struct rvu *rvu, int blkaddr)
 					idx, &npc_kpu_profiles[idx]);
 }
 
+static int npc_mcam_rsrcs_init(struct rvu *rvu, int blkaddr)
+{
+	int nixlf_count = rvu_get_nixlf_count(rvu);
+	struct npc_mcam *mcam = &rvu->hw->mcam;
+	int rsvd;
+	u64 cfg;
+
+	/* Get HW limits */
+	cfg = rvu_read64(rvu, blkaddr, NPC_AF_CONST);
+	mcam->banks = (cfg >> 44) & 0xF;
+	mcam->banksize = (cfg >> 28) & 0xFFFF;
+
+	/* Actual number of MCAM entries vary by entry size */
+	cfg = (rvu_read64(rvu, blkaddr,
+			  NPC_AF_INTFX_KEX_CFG(0)) >> 32) & 0x07;
+	mcam->total_entries = (mcam->banks / BIT_ULL(cfg)) * mcam->banksize;
+	mcam->keysize = cfg;
+
+	/* Number of banks combined per MCAM entry */
+	if (cfg == NPC_MCAM_KEY_X4)
+		mcam->banks_per_entry = 4;
+	else if (cfg == NPC_MCAM_KEY_X2)
+		mcam->banks_per_entry = 2;
+	else
+		mcam->banks_per_entry = 1;
+
+	/* Reserve one MCAM entry for each of the NIX LF to
+	 * guarantee space to install default matching DMAC rule.
+	 * Also reserve 2 MCAM entries for each PF for default
+	 * channel based matching or 'bcast & promisc' matching to
+	 * support BCAST and PROMISC modes of operation for PFs.
+	 * PF0 is excluded.
+	 */
+	rsvd = (nixlf_count * RSVD_MCAM_ENTRIES_PER_NIXLF) +
+		((rvu->hw->total_pfs - 1) * RSVD_MCAM_ENTRIES_PER_PF);
+	if (mcam->total_entries <= rsvd) {
+		dev_warn(rvu->dev,
+			 "Insufficient NPC MCAM size %d for pkt I/O, exiting\n",
+			 mcam->total_entries);
+		return -ENOMEM;
+	}
+
+	mcam->entries = mcam->total_entries - rsvd;
+	mcam->nixlf_offset = mcam->entries;
+	mcam->pf_offset = mcam->nixlf_offset + nixlf_count;
+
+	spin_lock_init(&mcam->lock);
+
+	return 0;
+}
+
 int rvu_npc_init(struct rvu *rvu)
 {
 	struct npc_pkind *pkind = &rvu->hw->pkind;
+	u64 keyz = NPC_MCAM_KEY_X2;
 	int blkaddr, err;
 
 	blkaddr = rvu_get_blkaddr(rvu, BLKTYPE_NPC, 0);
@@ -234,6 +342,32 @@ int rvu_npc_init(struct rvu *rvu)
 		    rvu_read64(rvu, blkaddr, NPC_AF_PCK_CFG) |
 		    BIT_ULL(6) | BIT_ULL(2));
 
+	/* Set RX and TX side MCAM search key size.
+	 * Also enable parse key extract nibbles suchthat except
+	 * layer E to H, rest of the key is included for MCAM search.
+	 */
+	rvu_write64(rvu, blkaddr, NPC_AF_INTFX_KEX_CFG(NIX_INTF_RX),
+		    ((keyz & 0x3) << 32) | ((1ULL << 20) - 1));
+	rvu_write64(rvu, blkaddr, NPC_AF_INTFX_KEX_CFG(NIX_INTF_TX),
+		    ((keyz & 0x3) << 32) | ((1ULL << 20) - 1));
+
+	err = npc_mcam_rsrcs_init(rvu, blkaddr);
+	if (err)
+		return err;
+
+	/* Config packet data and flags extraction into PARSE result */
+	npc_config_ldata_extract(rvu, blkaddr);
+
+	/* Set TX miss action to UCAST_DEFAULT i.e
+	 * transmit the packet on NIX LF SQ's default channel.
+	 */
+	rvu_write64(rvu, blkaddr, NPC_AF_INTFX_MISS_ACT(NIX_INTF_TX),
+		    NIX_TX_ACTIONOP_UCAST_DEFAULT);
+
+	/* If MCAM lookup doesn't result in a match, drop the received packet */
+	rvu_write64(rvu, blkaddr, NPC_AF_INTFX_MISS_ACT(NIX_INTF_RX),
+		    NIX_RX_ACTIONOP_DROP);
+
 	return 0;
 }
 
-- 
2.7.4

^ permalink raw reply related

* [PATCH 12/17] octeontx2-af: Add LMAC channel info to NIXLF_ALLOC response
From: sunil.kovvuri @ 2018-10-19 13:37 UTC (permalink / raw)
  To: netdev, davem
  Cc: arnd, linux-soc, Stanislaw Kardach, Tomasz Duszynski,
	Sunil Goutham
In-Reply-To: <1539956258-29377-1-git-send-email-sunil.kovvuri@gmail.com>

From: Stanislaw Kardach <skardach@marvell.com>

Add LMAC channel info like Rx/Tx channel base and count to
NIXLF_ALLOC mailbox message response. This info is used by
NIXLF attached RVU PF/VF to configure SQ's default channel,
TL3_TL2_LINKX_CFG and to install MCAM rules in NPC based
on matching ingress channel number.

Signed-off-by: Stanislaw Kardach <skardach@marvell.com>
Signed-off-by: Tomasz Duszynski <tduszynski@marvell.com>
Signed-off-by: Sunil Goutham <sgoutham@marvell.com>
---
 drivers/net/ethernet/marvell/octeontx2/af/common.h  | 1 +
 drivers/net/ethernet/marvell/octeontx2/af/mbox.h    | 4 ++++
 drivers/net/ethernet/marvell/octeontx2/af/rvu.h     | 5 +++++
 drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c | 8 ++++++++
 4 files changed, 18 insertions(+)

diff --git a/drivers/net/ethernet/marvell/octeontx2/af/common.h b/drivers/net/ethernet/marvell/octeontx2/af/common.h
index e438f92..6c8150d 100644
--- a/drivers/net/ethernet/marvell/octeontx2/af/common.h
+++ b/drivers/net/ethernet/marvell/octeontx2/af/common.h
@@ -169,6 +169,7 @@ enum nix_scheduler {
 
 #define MAX_LMAC_PKIND			12
 #define NIX_LINK_CGX_LMAC(a, b)		(0 + 4 * (a) + (b))
+#define NIX_CHAN_CGX_LMAC_CHX(a, b, c)	(0x800 + 0x100 * (a) + 0x10 * (b) + (c))
 
 /* NIX LSO format indices.
  * As of now TSO is the only one using, so statically assigning indices.
diff --git a/drivers/net/ethernet/marvell/octeontx2/af/mbox.h b/drivers/net/ethernet/marvell/octeontx2/af/mbox.h
index b60ac9d..0e2552c 100644
--- a/drivers/net/ethernet/marvell/octeontx2/af/mbox.h
+++ b/drivers/net/ethernet/marvell/octeontx2/af/mbox.h
@@ -382,6 +382,10 @@ struct nix_lf_alloc_req {
 struct nix_lf_alloc_rsp {
 	struct mbox_msghdr hdr;
 	u16	sqb_size;
+	u16	rx_chan_base;
+	u16	tx_chan_base;
+	u8      rx_chan_cnt; /* total number of RX channels */
+	u8      tx_chan_cnt; /* total number of TX channels */
 	u8	lso_tsov4_idx;
 	u8	lso_tsov6_idx;
 	u8      mac_addr[ETH_ALEN];
diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu.h b/drivers/net/ethernet/marvell/octeontx2/af/rvu.h
index 9fa5183..12391d2 100644
--- a/drivers/net/ethernet/marvell/octeontx2/af/rvu.h
+++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu.h
@@ -118,6 +118,11 @@ struct rvu_pfvf {
 	unsigned long	*rq_bmap;
 	unsigned long	*cq_bmap;
 
+	u16		rx_chan_base;
+	u16		tx_chan_base;
+	u8              rx_chan_cnt; /* total number of RX channels */
+	u8              tx_chan_cnt; /* total number of TX channels */
+
 	u8		mac_addr[ETH_ALEN]; /* MAC address of this PF/VF */
 
 	/* Broadcast pkt replication info */
diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c b/drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c
index 55075e7..86b1e9b 100644
--- a/drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c
+++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c
@@ -140,6 +140,10 @@ static int nix_interface_init(struct rvu *rvu, u16 pcifunc, int type, int nixlf)
 				"PF_Func 0x%x: Invalid pkind\n", pcifunc);
 			return -EINVAL;
 		}
+		pfvf->rx_chan_base = NIX_CHAN_CGX_LMAC_CHX(cgx_id, lmac_id, 0);
+		pfvf->tx_chan_base = pfvf->rx_chan_base;
+		pfvf->rx_chan_cnt = 1;
+		pfvf->tx_chan_cnt = 1;
 		cgx_set_pkind(rvu_cgx_pdata(cgx_id, rvu), lmac_id, pkind);
 		rvu_npc_set_pkind(rvu, pkind, pfvf);
 		break;
@@ -799,6 +803,10 @@ int rvu_mbox_handler_NIX_LF_ALLOC(struct rvu *rvu,
 	/* set SQB size info */
 	cfg = rvu_read64(rvu, blkaddr, NIX_AF_SQ_CONST);
 	rsp->sqb_size = (cfg >> 34) & 0xFFFF;
+	rsp->rx_chan_base = pfvf->rx_chan_base;
+	rsp->tx_chan_base = pfvf->tx_chan_base;
+	rsp->rx_chan_cnt = pfvf->rx_chan_cnt;
+	rsp->tx_chan_cnt = pfvf->tx_chan_cnt;
 	rsp->lso_tsov4_idx = NIX_LSO_FORMAT_IDX_TSOV4;
 	rsp->lso_tsov6_idx = NIX_LSO_FORMAT_IDX_TSOV6;
 	return rc;
-- 
2.7.4

^ permalink raw reply related

* [PATCH 13/17] octeontx2-af: Install ucast and bcast pkt forwarding rules
From: sunil.kovvuri @ 2018-10-19 13:37 UTC (permalink / raw)
  To: netdev, davem; +Cc: arnd, linux-soc, Sunil Goutham
In-Reply-To: <1539956258-29377-1-git-send-email-sunil.kovvuri@gmail.com>

From: Sunil Goutham <sgoutham@marvell.com>

Upon NIXLF_ALLOC install a unicast forwarding rule in NPC MCAM
like below
 - Match pkt DMAC with NIXLF attached PF/VF's MAC address.
 - Ingress channel
 - Action is UCAST
 - Forward to PF_FUNC of this NIXLF
And broadcast pkt forwarding rule as
 - Match L2B bit in MCAM search key
 - Ingress channel
 - Action is UCAST, for now, later it will be changed to MCAST.
Only PFs can install this rule

Upon NIXLF_FREE disable all MCAM entries in use by that NIXLF.

Signed-off-by: Sunil Goutham <sgoutham@marvell.com>
---
 drivers/net/ethernet/marvell/octeontx2/af/npc.h    |  19 ++
 drivers/net/ethernet/marvell/octeontx2/af/rvu.h    |   5 +
 .../net/ethernet/marvell/octeontx2/af/rvu_nix.c    |  14 +
 .../net/ethernet/marvell/octeontx2/af/rvu_npc.c    | 333 +++++++++++++++++++++
 4 files changed, 371 insertions(+)

diff --git a/drivers/net/ethernet/marvell/octeontx2/af/npc.h b/drivers/net/ethernet/marvell/octeontx2/af/npc.h
index 58d8f0b..9cbcac2 100644
--- a/drivers/net/ethernet/marvell/octeontx2/af/npc.h
+++ b/drivers/net/ethernet/marvell/octeontx2/af/npc.h
@@ -240,4 +240,23 @@ struct npc_kpu_pkind_cpi_def {
 	u64 ena            : 1;
 #endif
 };
+
+struct nix_rx_action {
+#if defined(__BIG_ENDIAN_BITFIELD)
+	u64	rsvd_63_61	:3;
+	u64	flow_key_alg	:5;
+	u64	match_id	:16;
+	u64	index		:20;
+	u64	pf_func		:16
+	u64	op		:4;
+#else
+	u64	op		:4;
+	u64	pf_func		:16;
+	u64	index		:20;
+	u64	match_id	:16;
+	u64	flow_key_alg	:5;
+	u64	rsvd_63_61	:3;
+#endif
+};
+
 #endif /* NPC_H */
diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu.h b/drivers/net/ethernet/marvell/octeontx2/af/rvu.h
index 12391d2..e83d324 100644
--- a/drivers/net/ethernet/marvell/octeontx2/af/rvu.h
+++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu.h
@@ -347,4 +347,9 @@ int rvu_npc_init(struct rvu *rvu);
 void rvu_npc_freemem(struct rvu *rvu);
 int rvu_npc_get_pkind(struct rvu *rvu, u16 pf);
 void rvu_npc_set_pkind(struct rvu *rvu, int pkind, struct rvu_pfvf *pfvf);
+void rvu_npc_install_ucast_entry(struct rvu *rvu, u16 pcifunc,
+				 int nixlf, u64 chan, u8 *mac_addr);
+void rvu_npc_install_bcast_match_entry(struct rvu *rvu, u16 pcifunc,
+				       int nixlf, u64 chan);
+void rvu_npc_disable_mcam_entries(struct rvu *rvu, u16 pcifunc, int nixlf);
 #endif /* RVU_H */
diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c b/drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c
index 86b1e9b..fbe4ff0 100644
--- a/drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c
+++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c
@@ -151,13 +151,24 @@ static int nix_interface_init(struct rvu *rvu, u16 pcifunc, int type, int nixlf)
 		break;
 	}
 
+	/* Add a UCAST forwarding rule in MCAM with this NIXLF attached
+	 * RVU PF/VF's MAC address.
+	 */
+	rvu_npc_install_ucast_entry(rvu, pcifunc, nixlf,
+				    pfvf->rx_chan_base, pfvf->mac_addr);
+
 	/* Add this PF_FUNC to bcast pkt replication list */
 	err = nix_update_bcast_mce_list(rvu, pcifunc, true);
 	if (err) {
 		dev_err(rvu->dev,
 			"Bcast list, failed to enable PF_FUNC 0x%x\n",
 			pcifunc);
+		return err;
 	}
+
+	rvu_npc_install_bcast_match_entry(rvu, pcifunc,
+					  nixlf, pfvf->rx_chan_base);
+
 	return 0;
 }
 
@@ -172,6 +183,9 @@ static void nix_interface_deinit(struct rvu *rvu, u16 pcifunc, u8 nixlf)
 			"Bcast list, failed to disable PF_FUNC 0x%x\n",
 			pcifunc);
 	}
+
+	/* Free and disable any MCAM entries used by this NIX LF */
+	rvu_npc_disable_mcam_entries(rvu, pcifunc, nixlf);
 }
 
 static void nix_setup_lso_tso_l3(struct rvu *rvu, int blkaddr,
diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu_npc.c b/drivers/net/ethernet/marvell/octeontx2/af/rvu_npc.c
index 1c29436..e283372 100644
--- a/drivers/net/ethernet/marvell/octeontx2/af/rvu_npc.c
+++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu_npc.c
@@ -26,6 +26,14 @@
 
 #define NPC_PARSE_RESULT_DMAC_OFFSET	8
 
+struct mcam_entry {
+#define NPC_MAX_KWS_IN_KEY	7 /* Number of keywords in max keywidth */
+	u64	kw[NPC_MAX_KWS_IN_KEY];
+	u64	kw_mask[NPC_MAX_KWS_IN_KEY];
+	u64	action;
+	u64	vtag_action;
+};
+
 void rvu_npc_set_pkind(struct rvu *rvu, int pkind, struct rvu_pfvf *pfvf)
 {
 	int blkaddr;
@@ -54,6 +62,331 @@ int rvu_npc_get_pkind(struct rvu *rvu, u16 pf)
 	return -1;
 }
 
+static int npc_get_nixlf_mcam_index(struct npc_mcam *mcam,
+				    u16 pcifunc, int nixlf, int type)
+{
+	int pf = rvu_get_pf(pcifunc);
+	int index;
+
+	/* Check if this is for a PF */
+	if (pf && !(pcifunc & RVU_PFVF_FUNC_MASK)) {
+		/* Reserved entries exclude PF0 */
+		pf--;
+		index = mcam->pf_offset + (pf * RSVD_MCAM_ENTRIES_PER_PF);
+		/* Broadcast address matching entry should be first so
+		 * that the packet can be replicated to all VFs.
+		 */
+		if (type == NIXLF_BCAST_ENTRY)
+			return index;
+		else if (type == NIXLF_PROMISC_ENTRY)
+			return index + 1;
+	}
+
+	return (mcam->nixlf_offset + (nixlf * RSVD_MCAM_ENTRIES_PER_NIXLF));
+}
+
+static int npc_get_bank(struct npc_mcam *mcam, int index)
+{
+	int bank = index / mcam->banksize;
+
+	/* 0,1 & 2,3 banks are combined for this keysize */
+	if (mcam->keysize == NPC_MCAM_KEY_X2)
+		return bank ? 2 : 0;
+
+	return bank;
+}
+
+static bool is_mcam_entry_enabled(struct rvu *rvu, struct npc_mcam *mcam,
+				  int blkaddr, int index)
+{
+	int bank = npc_get_bank(mcam, index);
+	u64 cfg;
+
+	index &= (mcam->banksize - 1);
+	cfg = rvu_read64(rvu, blkaddr, NPC_AF_MCAMEX_BANKX_CFG(index, bank));
+	return (cfg & 1);
+}
+
+static void npc_enable_mcam_entry(struct rvu *rvu, struct npc_mcam *mcam,
+				  int blkaddr, int index, bool enable)
+{
+	int bank = npc_get_bank(mcam, index);
+	int actbank = bank;
+
+	index &= (mcam->banksize - 1);
+	for (; bank < (actbank + mcam->banks_per_entry); bank++) {
+		rvu_write64(rvu, blkaddr,
+			    NPC_AF_MCAMEX_BANKX_CFG(index, bank),
+			    enable ? 1 : 0);
+	}
+}
+
+static void npc_get_keyword(struct mcam_entry *entry, int idx,
+			    u64 *cam0, u64 *cam1)
+{
+	u64 kw_mask = 0x00;
+
+#define CAM_MASK(n)	(BIT_ULL(n) - 1)
+
+	/* 0, 2, 4, 6 indices refer to BANKX_CAMX_W0 and
+	 * 1, 3, 5, 7 indices refer to BANKX_CAMX_W1.
+	 *
+	 * Also, only 48 bits of BANKX_CAMX_W1 are valid.
+	 */
+	switch (idx) {
+	case 0:
+		/* BANK(X)_CAM_W0<63:0> = MCAM_KEY[KW0]<63:0> */
+		*cam1 = entry->kw[0];
+		kw_mask = entry->kw_mask[0];
+		break;
+	case 1:
+		/* BANK(X)_CAM_W1<47:0> = MCAM_KEY[KW1]<47:0> */
+		*cam1 = entry->kw[1] & CAM_MASK(48);
+		kw_mask = entry->kw_mask[1] & CAM_MASK(48);
+		break;
+	case 2:
+		/* BANK(X + 1)_CAM_W0<15:0> = MCAM_KEY[KW1]<63:48>
+		 * BANK(X + 1)_CAM_W0<63:16> = MCAM_KEY[KW2]<47:0>
+		 */
+		*cam1 = (entry->kw[1] >> 48) & CAM_MASK(16);
+		*cam1 |= ((entry->kw[2] & CAM_MASK(48)) << 16);
+		kw_mask = (entry->kw_mask[1] >> 48) & CAM_MASK(16);
+		kw_mask |= ((entry->kw_mask[2] & CAM_MASK(48)) << 16);
+		break;
+	case 3:
+		/* BANK(X + 1)_CAM_W1<15:0> = MCAM_KEY[KW2]<63:48>
+		 * BANK(X + 1)_CAM_W1<47:16> = MCAM_KEY[KW3]<31:0>
+		 */
+		*cam1 = (entry->kw[2] >> 48) & CAM_MASK(16);
+		*cam1 |= ((entry->kw[3] & CAM_MASK(32)) << 16);
+		kw_mask = (entry->kw_mask[2] >> 48) & CAM_MASK(16);
+		kw_mask |= ((entry->kw_mask[3] & CAM_MASK(32)) << 16);
+		break;
+	case 4:
+		/* BANK(X + 2)_CAM_W0<31:0> = MCAM_KEY[KW3]<63:32>
+		 * BANK(X + 2)_CAM_W0<63:32> = MCAM_KEY[KW4]<31:0>
+		 */
+		*cam1 = (entry->kw[3] >> 32) & CAM_MASK(32);
+		*cam1 |= ((entry->kw[4] & CAM_MASK(32)) << 32);
+		kw_mask = (entry->kw_mask[3] >> 32) & CAM_MASK(32);
+		kw_mask |= ((entry->kw_mask[4] & CAM_MASK(32)) << 32);
+		break;
+	case 5:
+		/* BANK(X + 2)_CAM_W1<31:0> = MCAM_KEY[KW4]<63:32>
+		 * BANK(X + 2)_CAM_W1<47:32> = MCAM_KEY[KW5]<15:0>
+		 */
+		*cam1 = (entry->kw[4] >> 32) & CAM_MASK(32);
+		*cam1 |= ((entry->kw[5] & CAM_MASK(16)) << 32);
+		kw_mask = (entry->kw_mask[4] >> 32) & CAM_MASK(32);
+		kw_mask |= ((entry->kw_mask[5] & CAM_MASK(16)) << 32);
+		break;
+	case 6:
+		/* BANK(X + 3)_CAM_W0<47:0> = MCAM_KEY[KW5]<63:16>
+		 * BANK(X + 3)_CAM_W0<63:48> = MCAM_KEY[KW6]<15:0>
+		 */
+		*cam1 = (entry->kw[5] >> 16) & CAM_MASK(48);
+		*cam1 |= ((entry->kw[6] & CAM_MASK(16)) << 48);
+		kw_mask = (entry->kw_mask[5] >> 16) & CAM_MASK(48);
+		kw_mask |= ((entry->kw_mask[6] & CAM_MASK(16)) << 48);
+		break;
+	case 7:
+		/* BANK(X + 3)_CAM_W1<47:0> = MCAM_KEY[KW6]<63:16> */
+		*cam1 = (entry->kw[6] >> 16) & CAM_MASK(48);
+		kw_mask = (entry->kw_mask[6] >> 16) & CAM_MASK(48);
+		break;
+	}
+
+	*cam1 &= kw_mask;
+	*cam0 = ~*cam1 & kw_mask;
+}
+
+static void npc_config_mcam_entry(struct rvu *rvu, struct npc_mcam *mcam,
+				  int blkaddr, int index, u8 intf,
+				  struct mcam_entry *entry, bool enable)
+{
+	int bank = npc_get_bank(mcam, index);
+	int kw = 0, actbank, actindex;
+	u64 cam0, cam1;
+
+	actbank = bank; /* Save bank id, to set action later on */
+	actindex = index;
+	index &= (mcam->banksize - 1);
+
+	/* CAM1 takes the comparison value and
+	 * CAM0 specifies match for a bit in key being '0' or '1' or 'dontcare'.
+	 * CAM1<n> = 0 & CAM0<n> = 1 => match if key<n> = 0
+	 * CAM1<n> = 1 & CAM0<n> = 0 => match if key<n> = 1
+	 * CAM1<n> = 0 & CAM0<n> = 0 => always match i.e dontcare.
+	 */
+	for (; bank < (actbank + mcam->banks_per_entry); bank++, kw = kw + 2) {
+		/* Interface should be set in all banks */
+		rvu_write64(rvu, blkaddr,
+			    NPC_AF_MCAMEX_BANKX_CAMX_INTF(index, bank, 1),
+			    intf);
+		rvu_write64(rvu, blkaddr,
+			    NPC_AF_MCAMEX_BANKX_CAMX_INTF(index, bank, 0),
+			    ~intf & 0x3);
+
+		/* Set the match key */
+		npc_get_keyword(entry, kw, &cam0, &cam1);
+		rvu_write64(rvu, blkaddr,
+			    NPC_AF_MCAMEX_BANKX_CAMX_W0(index, bank, 1), cam1);
+		rvu_write64(rvu, blkaddr,
+			    NPC_AF_MCAMEX_BANKX_CAMX_W0(index, bank, 0), cam0);
+
+		npc_get_keyword(entry, kw + 1, &cam0, &cam1);
+		rvu_write64(rvu, blkaddr,
+			    NPC_AF_MCAMEX_BANKX_CAMX_W1(index, bank, 1), cam1);
+		rvu_write64(rvu, blkaddr,
+			    NPC_AF_MCAMEX_BANKX_CAMX_W1(index, bank, 0), cam0);
+	}
+
+	/* Set 'action' */
+	rvu_write64(rvu, blkaddr,
+		    NPC_AF_MCAMEX_BANKX_ACTION(index, actbank), entry->action);
+
+	/* Set TAG 'action' */
+	rvu_write64(rvu, blkaddr, NPC_AF_MCAMEX_BANKX_TAG_ACT(index, actbank),
+		    entry->vtag_action);
+
+	/* Enable the entry */
+	if (enable)
+		npc_enable_mcam_entry(rvu, mcam, blkaddr, actindex, true);
+	else
+		npc_enable_mcam_entry(rvu, mcam, blkaddr, actindex, false);
+}
+
+static u64 npc_get_mcam_action(struct rvu *rvu, struct npc_mcam *mcam,
+			       int blkaddr, int index)
+{
+	int bank = npc_get_bank(mcam, index);
+
+	index &= (mcam->banksize - 1);
+	return rvu_read64(rvu, blkaddr,
+			  NPC_AF_MCAMEX_BANKX_ACTION(index, bank));
+}
+
+void rvu_npc_install_ucast_entry(struct rvu *rvu, u16 pcifunc,
+				 int nixlf, u64 chan, u8 *mac_addr)
+{
+	struct npc_mcam *mcam = &rvu->hw->mcam;
+	struct mcam_entry entry = { {0} };
+	struct nix_rx_action action;
+	int blkaddr, index, kwi;
+	u64 mac = 0;
+
+	blkaddr = rvu_get_blkaddr(rvu, BLKTYPE_NPC, 0);
+	if (blkaddr < 0)
+		return;
+
+	for (index = ETH_ALEN - 1; index >= 0; index--)
+		mac |= ((u64)*mac_addr++) << (8 * index);
+
+	index = npc_get_nixlf_mcam_index(mcam, pcifunc,
+					 nixlf, NIXLF_UCAST_ENTRY);
+
+	/* Match ingress channel and DMAC */
+	entry.kw[0] = chan;
+	entry.kw_mask[0] = 0xFFFULL;
+
+	kwi = NPC_PARSE_RESULT_DMAC_OFFSET / sizeof(u64);
+	entry.kw[kwi] = mac;
+	entry.kw_mask[kwi] = BIT_ULL(48) - 1;
+
+	/* Don't change the action if entry is already enabled
+	 * Otherwise RSS action may get overwritten.
+	 */
+	if (is_mcam_entry_enabled(rvu, mcam, blkaddr, index)) {
+		*(u64 *)&action = npc_get_mcam_action(rvu, mcam,
+						      blkaddr, index);
+	} else {
+		*(u64 *)&action = 0x00;
+		action.op = NIX_RX_ACTIONOP_UCAST;
+		action.pf_func = pcifunc;
+	}
+
+	entry.action = *(u64 *)&action;
+	npc_config_mcam_entry(rvu, mcam, blkaddr, index,
+			      NIX_INTF_RX, &entry, true);
+}
+
+void rvu_npc_install_bcast_match_entry(struct rvu *rvu, u16 pcifunc,
+				       int nixlf, u64 chan)
+{
+	struct npc_mcam *mcam = &rvu->hw->mcam;
+	struct mcam_entry entry = { {0} };
+	struct nix_rx_action action;
+	struct rvu_pfvf *pfvf;
+	int blkaddr, index;
+
+	blkaddr = rvu_get_blkaddr(rvu, BLKTYPE_NPC, 0);
+	if (blkaddr < 0)
+		return;
+
+	/* Only PF can add a bcast match entry */
+	if (pcifunc & RVU_PFVF_FUNC_MASK)
+		return;
+	pfvf = rvu_get_pfvf(rvu, pcifunc & ~RVU_PFVF_FUNC_MASK);
+
+	index = npc_get_nixlf_mcam_index(mcam, pcifunc,
+					 nixlf, NIXLF_BCAST_ENTRY);
+
+	/* Check for L2B bit and LMAC channel */
+	entry.kw[0] = BIT_ULL(25) | chan;
+	entry.kw_mask[0] = BIT_ULL(25) | 0xFFFULL;
+
+	*(u64 *)&action = 0x00;
+#ifdef MCAST_MCE
+	/* Early silicon doesn't support pkt replication,
+	 * so install entry with UCAST action, so that PF
+	 * receives all broadcast packets.
+	 */
+	action.op = NIX_RX_ACTIONOP_MCAST;
+	action.pf_func = pcifunc;
+	action.index = pfvf->bcast_mce_idx;
+#else
+	action.op = NIX_RX_ACTIONOP_UCAST;
+	action.pf_func = pcifunc;
+#endif
+
+	entry.action = *(u64 *)&action;
+	npc_config_mcam_entry(rvu, mcam, blkaddr, index,
+			      NIX_INTF_RX, &entry, true);
+}
+
+void rvu_npc_disable_mcam_entries(struct rvu *rvu, u16 pcifunc, int nixlf)
+{
+	struct npc_mcam *mcam = &rvu->hw->mcam;
+	struct nix_rx_action action;
+	int blkaddr, index, bank;
+
+	blkaddr = rvu_get_blkaddr(rvu, BLKTYPE_NPC, 0);
+	if (blkaddr < 0)
+		return;
+
+	/* Disable ucast MCAM match entry of this PF/VF */
+	index = npc_get_nixlf_mcam_index(mcam, pcifunc,
+					 nixlf, NIXLF_UCAST_ENTRY);
+	npc_enable_mcam_entry(rvu, mcam, blkaddr, index, false);
+
+	/* For PF, disable promisc and bcast MCAM match entries */
+	if (!(pcifunc & RVU_PFVF_FUNC_MASK)) {
+		index = npc_get_nixlf_mcam_index(mcam, pcifunc,
+						 nixlf, NIXLF_BCAST_ENTRY);
+		/* For bcast, disable only if it's action is not
+		 * packet replication, incase if action is replication
+		 * then this PF's nixlf is removed from bcast replication
+		 * list.
+		 */
+		bank = npc_get_bank(mcam, index);
+		index &= (mcam->banksize - 1);
+		*(u64 *)&action = rvu_read64(rvu, blkaddr,
+				     NPC_AF_MCAMEX_BANKX_ACTION(index, bank));
+		if (action.op != NIX_RX_ACTIONOP_MCAST)
+			npc_enable_mcam_entry(rvu, mcam, blkaddr, index, false);
+	}
+}
+
 #define LDATA_EXTRACT_CONFIG(intf, lid, ltype, ld, cfg) \
 	rvu_write64(rvu, blkaddr,			\
 		NPC_AF_INTFX_LIDX_LTX_LDX_CFG(intf, lid, ltype, ld), cfg)
-- 
2.7.4

^ permalink raw reply related

* [PATCH 14/17] octeontx2-af: NIX Rx flowkey configuration for RSS
From: sunil.kovvuri @ 2018-10-19 13:37 UTC (permalink / raw)
  To: netdev, davem; +Cc: arnd, linux-soc, Sunil Goutham
In-Reply-To: <1539956258-29377-1-git-send-email-sunil.kovvuri@gmail.com>

From: Sunil Goutham <sgoutham@marvell.com>

Configure NIX RX flowkey algorithm configuration to support
RSS (receive side scaling). Currently support for only L3/L4
2-tuple and 4-tuple hash of IPv4/v6/TCP/UDP/SCTP is added.
HW supports upto 32 different flowkey algorithms which SW
can define, this patch defines 9. NPC RX ACTION has to point
to one of these flowkey indices for RSS to work.

The configuration is dependent on NPC parse result's layer
info. So if NPC KPU profile changes suchthat LID/LTYPE values
of above said protocols change then this configuration will
most likely be effected.

Signed-off-by: Sunil Goutham <sgoutham@marvell.com>
---
 drivers/net/ethernet/marvell/octeontx2/af/common.h |  22 ++++
 .../net/ethernet/marvell/octeontx2/af/rvu_nix.c    | 135 +++++++++++++++++++++
 .../net/ethernet/marvell/octeontx2/af/rvu_struct.h |  30 +++++
 3 files changed, 187 insertions(+)

diff --git a/drivers/net/ethernet/marvell/octeontx2/af/common.h b/drivers/net/ethernet/marvell/octeontx2/af/common.h
index 6c8150d..d39ada4 100644
--- a/drivers/net/ethernet/marvell/octeontx2/af/common.h
+++ b/drivers/net/ethernet/marvell/octeontx2/af/common.h
@@ -186,4 +186,26 @@ enum nix_scheduler {
 #define DEFAULT_RSS_CONTEXT_GROUP	0
 #define MAX_RSS_INDIR_TBL_SIZE		256 /* 1 << Max adder bits */
 
+/* NIX flow tag, key type flags */
+#define FLOW_KEY_TYPE_PORT	BIT(0)
+#define FLOW_KEY_TYPE_IPV4	BIT(1)
+#define FLOW_KEY_TYPE_IPV6	BIT(2)
+#define FLOW_KEY_TYPE_TCP	BIT(3)
+#define FLOW_KEY_TYPE_UDP	BIT(4)
+#define FLOW_KEY_TYPE_SCTP	BIT(5)
+
+/* NIX flow tag algorithm indices, max is 31 */
+enum {
+	FLOW_KEY_ALG_PORT,
+	FLOW_KEY_ALG_IP,
+	FLOW_KEY_ALG_TCP,
+	FLOW_KEY_ALG_UDP,
+	FLOW_KEY_ALG_SCTP,
+	FLOW_KEY_ALG_TCP_UDP,
+	FLOW_KEY_ALG_TCP_SCTP,
+	FLOW_KEY_ALG_UDP_SCTP,
+	FLOW_KEY_ALG_TCP_UDP_SCTP,
+	FLOW_KEY_ALG_MAX,
+};
+
 #endif /* COMMON_H */
diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c b/drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c
index fbe4ff0..e4c2c52 100644
--- a/drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c
+++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c
@@ -1536,6 +1536,139 @@ int rvu_mbox_handler_NIX_STATS_RST(struct rvu *rvu, struct msg_req *req,
 	return 0;
 }
 
+static void set_flowkey_fields(struct nix_rx_flowkey_alg *alg, u32 flow_cfg)
+{
+	struct nix_rx_flowkey_alg *field = NULL;
+	int idx, key_type;
+
+	if (!alg)
+		return;
+
+	/* FIELD0: IPv4
+	 * FIELD1: IPv6
+	 * FIELD2: TCP/UDP/SCTP/ALL
+	 * FIELD3: Unused
+	 * FIELD4: Unused
+	 *
+	 * Each of the 32 possible flow key algorithm definitions should
+	 * fall into above incremental config (except ALG0). Otherwise a
+	 * single NPC MCAM entry is not sufficient for supporting RSS.
+	 *
+	 * If a different definition or combination needed then NPC MCAM
+	 * has to be programmed to filter such pkts and it's action should
+	 * point to this definition to calculate flowtag or hash.
+	 */
+	for (idx = 0; idx < 32; idx++) {
+		key_type = flow_cfg & BIT_ULL(idx);
+		if (!key_type)
+			continue;
+		switch (key_type) {
+		case FLOW_KEY_TYPE_PORT:
+			field = &alg[0];
+			field->sel_chan = true;
+			/* This should be set to 1, when SEL_CHAN is set */
+			field->bytesm1 = 1;
+			break;
+		case FLOW_KEY_TYPE_IPV4:
+			field = &alg[0];
+			field->lid = NPC_LID_LC;
+			field->ltype_match = NPC_LT_LC_IP;
+			field->hdr_offset = 12; /* SIP offset */
+			field->bytesm1 = 7; /* SIP + DIP, 8 bytes */
+			field->ltype_mask = 0xF; /* Match only IPv4 */
+			break;
+		case FLOW_KEY_TYPE_IPV6:
+			field = &alg[1];
+			field->lid = NPC_LID_LC;
+			field->ltype_match = NPC_LT_LC_IP6;
+			field->hdr_offset = 8; /* SIP offset */
+			field->bytesm1 = 31; /* SIP + DIP, 32 bytes */
+			field->ltype_mask = 0xF; /* Match only IPv6 */
+			break;
+		case FLOW_KEY_TYPE_TCP:
+		case FLOW_KEY_TYPE_UDP:
+		case FLOW_KEY_TYPE_SCTP:
+			field = &alg[2];
+			field->lid = NPC_LID_LD;
+			field->bytesm1 = 3; /* Sport + Dport, 4 bytes */
+			if (key_type == FLOW_KEY_TYPE_TCP)
+				field->ltype_match |= NPC_LT_LD_TCP;
+			else if (key_type == FLOW_KEY_TYPE_UDP)
+				field->ltype_match |= NPC_LT_LD_UDP;
+			else if (key_type == FLOW_KEY_TYPE_SCTP)
+				field->ltype_match |= NPC_LT_LD_SCTP;
+			field->key_offset = 32; /* After IPv4/v6 SIP, DIP */
+			field->ltype_mask = ~field->ltype_match;
+			break;
+		}
+		if (field)
+			field->ena = 1;
+		field = NULL;
+	}
+}
+
+static void nix_rx_flowkey_alg_cfg(struct rvu *rvu, int blkaddr)
+{
+#define FIELDS_PER_ALG	5
+	u64 field[FLOW_KEY_ALG_MAX][FIELDS_PER_ALG];
+	u32 flowkey_cfg, minkey_cfg;
+	int alg, fid;
+
+	memset(&field, 0, sizeof(u64) * FLOW_KEY_ALG_MAX * FIELDS_PER_ALG);
+
+	/* Only incoming channel number */
+	flowkey_cfg = FLOW_KEY_TYPE_PORT;
+	set_flowkey_fields((void *)&field[FLOW_KEY_ALG_PORT], flowkey_cfg);
+
+	/* For a incoming pkt if none of the fields match then flowkey
+	 * will be zero, hence tag generated will also be zero.
+	 * RSS entry at rsse_index = NIX_AF_LF()_RSS_GRP()[OFFSET] will
+	 * be used to queue the packet.
+	 */
+
+	/* IPv4/IPv6 SIP/DIPs */
+	flowkey_cfg = FLOW_KEY_TYPE_IPV4 | FLOW_KEY_TYPE_IPV6;
+	set_flowkey_fields((void *)&field[FLOW_KEY_ALG_IP], flowkey_cfg);
+
+	/* TCPv4/v6 4-tuple, SIP, DIP, Sport, Dport */
+	minkey_cfg = flowkey_cfg;
+	flowkey_cfg = minkey_cfg | FLOW_KEY_TYPE_TCP;
+	set_flowkey_fields((void *)&field[FLOW_KEY_ALG_TCP], flowkey_cfg);
+
+	/* UDPv4/v6 4-tuple, SIP, DIP, Sport, Dport */
+	flowkey_cfg = minkey_cfg | FLOW_KEY_TYPE_UDP;
+	set_flowkey_fields((void *)&field[FLOW_KEY_ALG_UDP], flowkey_cfg);
+
+	/* SCTPv4/v6 4-tuple, SIP, DIP, Sport, Dport */
+	flowkey_cfg = minkey_cfg | FLOW_KEY_TYPE_SCTP;
+	set_flowkey_fields((void *)&field[FLOW_KEY_ALG_SCTP], flowkey_cfg);
+
+	/* TCP/UDP v4/v6 4-tuple, rest IP pkts 2-tuple */
+	flowkey_cfg = minkey_cfg | FLOW_KEY_TYPE_TCP | FLOW_KEY_TYPE_UDP;
+	set_flowkey_fields((void *)&field[FLOW_KEY_ALG_TCP_UDP], flowkey_cfg);
+
+	/* TCP/SCTP v4/v6 4-tuple, rest IP pkts 2-tuple */
+	flowkey_cfg = minkey_cfg | FLOW_KEY_TYPE_TCP | FLOW_KEY_TYPE_SCTP;
+	set_flowkey_fields((void *)&field[FLOW_KEY_ALG_TCP_SCTP], flowkey_cfg);
+
+	/* UDP/SCTP v4/v6 4-tuple, rest IP pkts 2-tuple */
+	flowkey_cfg = minkey_cfg | FLOW_KEY_TYPE_UDP | FLOW_KEY_TYPE_SCTP;
+	set_flowkey_fields((void *)&field[FLOW_KEY_ALG_UDP_SCTP], flowkey_cfg);
+
+	/* TCP/UDP/SCTP v4/v6 4-tuple, rest IP pkts 2-tuple */
+	flowkey_cfg = minkey_cfg | FLOW_KEY_TYPE_TCP |
+		      FLOW_KEY_TYPE_UDP | FLOW_KEY_TYPE_SCTP;
+	set_flowkey_fields((void *)&field[FLOW_KEY_ALG_TCP_UDP_SCTP],
+			   flowkey_cfg);
+
+	for (alg = 0; alg < FLOW_KEY_ALG_MAX; alg++) {
+		for (fid = 0; fid < FIELDS_PER_ALG; fid++)
+			rvu_write64(rvu, blkaddr,
+				    NIX_AF_RX_FLOW_KEY_ALGX_FIELDX(alg, fid),
+				    field[alg][fid]);
+	}
+}
+
 static int nix_calibrate_x2p(struct rvu *rvu, int blkaddr)
 {
 	int idx, err;
@@ -1678,6 +1811,8 @@ int rvu_nix_init(struct rvu *rvu)
 			    (NPC_LID_LD << 8) | (NPC_LT_LD_TCP << 4) | 0x0F);
 		rvu_write64(rvu, blkaddr, NIX_AF_RX_DEF_OIP4,
 			    (NPC_LID_LC << 8) | (NPC_LT_LC_IP << 4) | 0x0F);
+
+		nix_rx_flowkey_alg_cfg(rvu, blkaddr);
 	}
 	return 0;
 }
diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu_struct.h b/drivers/net/ethernet/marvell/octeontx2/af/rvu_struct.h
index bb2836e..f920dac 100644
--- a/drivers/net/ethernet/marvell/octeontx2/af/rvu_struct.h
+++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu_struct.h
@@ -879,6 +879,36 @@ struct nix_lso_format {
 #endif
 };
 
+struct nix_rx_flowkey_alg {
+#if defined(__BIG_ENDIAN_BITFIELD)
+	u64 reserved_35_63	:29;
+	u64 ltype_match		:4;
+	u64 ltype_mask		:4;
+	u64 sel_chan		:1;
+	u64 ena			:1;
+	u64 reserved_24_24	:1;
+	u64 lid			:3;
+	u64 bytesm1		:5;
+	u64 hdr_offset		:8;
+	u64 fn_mask		:1;
+	u64 ln_mask		:1;
+	u64 key_offset		:6;
+#else
+	u64 key_offset		:6;
+	u64 ln_mask		:1;
+	u64 fn_mask		:1;
+	u64 hdr_offset		:8;
+	u64 bytesm1		:5;
+	u64 lid			:3;
+	u64 reserved_24_24	:1;
+	u64 ena			:1;
+	u64 sel_chan		:1;
+	u64 ltype_mask		:4;
+	u64 ltype_match		:4;
+	u64 reserved_35_63	:29;
+#endif
+};
+
 /* NIX VTAG size */
 enum nix_vtag_size {
 	VTAGSIZE_T4   = 0x0,
-- 
2.7.4

^ permalink raw reply related

* [PATCH 15/17] octeontx2-af: Support for changing RSS algorithm
From: sunil.kovvuri @ 2018-10-19 13:37 UTC (permalink / raw)
  To: netdev, davem; +Cc: arnd, linux-soc, Sunil Goutham
In-Reply-To: <1539956258-29377-1-git-send-email-sunil.kovvuri@gmail.com>

From: Sunil Goutham <sgoutham@marvell.com>

This patch adds support for a RVU PF/VF to change
NIX Rx flowkey algorithm index in NPC RX RSS_ACTION.
eg: a ethtool command changing RSS algorithm for a netdev
interface would trigger this change in NPC.

If PF/VF doesn't specify any MCAM entry index then default
UCAST entry of the NIXLF attached to PF/VF will be updated
with RSS_ACTION and flowkey index.

Signed-off-by: Sunil Goutham <sgoutham@marvell.com>
---
 drivers/net/ethernet/marvell/octeontx2/af/mbox.h   | 10 ++++-
 drivers/net/ethernet/marvell/octeontx2/af/rvu.h    |  5 +++
 .../net/ethernet/marvell/octeontx2/af/rvu_nix.c    | 51 ++++++++++++++++++++++
 .../net/ethernet/marvell/octeontx2/af/rvu_npc.c    | 43 ++++++++++++++++++
 4 files changed, 108 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/marvell/octeontx2/af/mbox.h b/drivers/net/ethernet/marvell/octeontx2/af/mbox.h
index 0e2552c..32d70bf 100644
--- a/drivers/net/ethernet/marvell/octeontx2/af/mbox.h
+++ b/drivers/net/ethernet/marvell/octeontx2/af/mbox.h
@@ -157,7 +157,8 @@ M(NIX_TXSCH_ALLOC,	0x8004, nix_txsch_alloc_req, nix_txsch_alloc_rsp) \
 M(NIX_TXSCH_FREE,	0x8005, nix_txsch_free_req, msg_rsp)		\
 M(NIX_TXSCHQ_CFG,	0x8006, nix_txschq_config, msg_rsp)		\
 M(NIX_STATS_RST,	0x8007, msg_req, msg_rsp)			\
-M(NIX_VTAG_CFG,	0x8008, nix_vtag_config, msg_rsp)
+M(NIX_VTAG_CFG,	0x8008, nix_vtag_config, msg_rsp)		\
+M(NIX_RSS_FLOWKEY_CFG,  0x8009, nix_rss_flowkey_cfg, msg_rsp)
 
 /* Messages initiated by AF (range 0xC00 - 0xDFF) */
 #define MBOX_UP_CGX_MESSAGES						\
@@ -499,4 +500,11 @@ struct nix_vtag_config {
 	};
 };
 
+struct nix_rss_flowkey_cfg {
+	struct mbox_msghdr hdr;
+	int	mcam_index;  /* MCAM entry index to modify */
+	u32	flowkey_cfg; /* Flowkey types selected */
+	u8	group;       /* RSS context or group */
+};
+
 #endif /* MBOX_H */
diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu.h b/drivers/net/ethernet/marvell/octeontx2/af/rvu.h
index e83d324..b169657 100644
--- a/drivers/net/ethernet/marvell/octeontx2/af/rvu.h
+++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu.h
@@ -341,6 +341,9 @@ int rvu_mbox_handler_NIX_STATS_RST(struct rvu *rvu, struct msg_req *req,
 int rvu_mbox_handler_NIX_VTAG_CFG(struct rvu *rvu,
 				  struct nix_vtag_config *req,
 				  struct msg_rsp *rsp);
+int rvu_mbox_handler_NIX_RSS_FLOWKEY_CFG(struct rvu *rvu,
+					 struct nix_rss_flowkey_cfg *req,
+					 struct msg_rsp *rsp);
 
 /* NPC APIs */
 int rvu_npc_init(struct rvu *rvu);
@@ -352,4 +355,6 @@ void rvu_npc_install_ucast_entry(struct rvu *rvu, u16 pcifunc,
 void rvu_npc_install_bcast_match_entry(struct rvu *rvu, u16 pcifunc,
 				       int nixlf, u64 chan);
 void rvu_npc_disable_mcam_entries(struct rvu *rvu, u16 pcifunc, int nixlf);
+void rvu_npc_update_flowkey_alg_idx(struct rvu *rvu, u16 pcifunc, int nixlf,
+				    int group, int alg_idx, int mcam_index);
 #endif /* RVU_H */
diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c b/drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c
index e4c2c52..d4dcdbb 100644
--- a/drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c
+++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c
@@ -1536,6 +1536,57 @@ int rvu_mbox_handler_NIX_STATS_RST(struct rvu *rvu, struct msg_req *req,
 	return 0;
 }
 
+/* Returns the ALG index to be set into NPC_RX_ACTION */
+static int get_flowkey_alg_idx(u32 flow_cfg)
+{
+	u32 ip_cfg;
+
+	flow_cfg &= ~FLOW_KEY_TYPE_PORT;
+	ip_cfg = FLOW_KEY_TYPE_IPV4 | FLOW_KEY_TYPE_IPV6;
+	if (flow_cfg == ip_cfg)
+		return FLOW_KEY_ALG_IP;
+	else if (flow_cfg == (ip_cfg | FLOW_KEY_TYPE_TCP))
+		return FLOW_KEY_ALG_TCP;
+	else if (flow_cfg == (ip_cfg | FLOW_KEY_TYPE_UDP))
+		return FLOW_KEY_ALG_UDP;
+	else if (flow_cfg == (ip_cfg | FLOW_KEY_TYPE_SCTP))
+		return FLOW_KEY_ALG_SCTP;
+	else if (flow_cfg == (ip_cfg | FLOW_KEY_TYPE_TCP | FLOW_KEY_TYPE_UDP))
+		return FLOW_KEY_ALG_TCP_UDP;
+	else if (flow_cfg == (ip_cfg | FLOW_KEY_TYPE_TCP | FLOW_KEY_TYPE_SCTP))
+		return FLOW_KEY_ALG_TCP_SCTP;
+	else if (flow_cfg == (ip_cfg | FLOW_KEY_TYPE_UDP | FLOW_KEY_TYPE_SCTP))
+		return FLOW_KEY_ALG_UDP_SCTP;
+	else if (flow_cfg == (ip_cfg | FLOW_KEY_TYPE_TCP |
+			      FLOW_KEY_TYPE_UDP | FLOW_KEY_TYPE_SCTP))
+		return FLOW_KEY_ALG_TCP_UDP_SCTP;
+
+	return FLOW_KEY_ALG_PORT;
+}
+
+int rvu_mbox_handler_NIX_RSS_FLOWKEY_CFG(struct rvu *rvu,
+					 struct nix_rss_flowkey_cfg *req,
+					 struct msg_rsp *rsp)
+{
+	struct rvu_hwinfo *hw = rvu->hw;
+	u16 pcifunc = req->hdr.pcifunc;
+	int alg_idx, nixlf, blkaddr;
+
+	blkaddr = rvu_get_blkaddr(rvu, BLKTYPE_NIX, pcifunc);
+	if (blkaddr < 0)
+		return NIX_AF_ERR_AF_LF_INVALID;
+
+	nixlf = rvu_get_lf(rvu, &hw->block[blkaddr], pcifunc, 0);
+	if (nixlf < 0)
+		return NIX_AF_ERR_AF_LF_INVALID;
+
+	alg_idx = get_flowkey_alg_idx(req->flowkey_cfg);
+
+	rvu_npc_update_flowkey_alg_idx(rvu, pcifunc, nixlf, req->group,
+				       alg_idx, req->mcam_index);
+	return 0;
+}
+
 static void set_flowkey_fields(struct nix_rx_flowkey_alg *alg, u32 flow_cfg)
 {
 	struct nix_rx_flowkey_alg *field = NULL;
diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu_npc.c b/drivers/net/ethernet/marvell/octeontx2/af/rvu_npc.c
index e283372..d486112 100644
--- a/drivers/net/ethernet/marvell/octeontx2/af/rvu_npc.c
+++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu_npc.c
@@ -354,6 +354,49 @@ void rvu_npc_install_bcast_match_entry(struct rvu *rvu, u16 pcifunc,
 			      NIX_INTF_RX, &entry, true);
 }
 
+void rvu_npc_update_flowkey_alg_idx(struct rvu *rvu, u16 pcifunc, int nixlf,
+				    int group, int alg_idx, int mcam_index)
+{
+	struct npc_mcam *mcam = &rvu->hw->mcam;
+	struct nix_rx_action action;
+	int blkaddr, index, bank;
+
+	blkaddr = rvu_get_blkaddr(rvu, BLKTYPE_NPC, 0);
+	if (blkaddr < 0)
+		return;
+
+	/* Check if this is for reserved default entry */
+	if (mcam_index < 0) {
+		if (group != DEFAULT_RSS_CONTEXT_GROUP)
+			return;
+		index = npc_get_nixlf_mcam_index(mcam, pcifunc,
+						 nixlf, NIXLF_UCAST_ENTRY);
+	} else {
+		/* TODO: validate this mcam index */
+		index = mcam_index;
+	}
+
+	if (index >= mcam->total_entries)
+		return;
+
+	bank = npc_get_bank(mcam, index);
+	index &= (mcam->banksize - 1);
+
+	*(u64 *)&action = rvu_read64(rvu, blkaddr,
+				     NPC_AF_MCAMEX_BANKX_ACTION(index, bank));
+	/* Ignore if no action was set earlier */
+	if (!*(u64 *)&action)
+		return;
+
+	action.op = NIX_RX_ACTIONOP_RSS;
+	action.pf_func = pcifunc;
+	action.index = group;
+	action.flow_key_alg = alg_idx;
+
+	rvu_write64(rvu, blkaddr,
+		    NPC_AF_MCAMEX_BANKX_ACTION(index, bank), *(u64 *)&action);
+}
+
 void rvu_npc_disable_mcam_entries(struct rvu *rvu, u16 pcifunc, int nixlf)
 {
 	struct npc_mcam *mcam = &rvu->hw->mcam;
-- 
2.7.4

^ permalink raw reply related

* [PATCH 16/17] octeontx2-af: Support for setting MAC address
From: sunil.kovvuri @ 2018-10-19 13:37 UTC (permalink / raw)
  To: netdev, davem; +Cc: arnd, linux-soc, Sunil Goutham
In-Reply-To: <1539956258-29377-1-git-send-email-sunil.kovvuri@gmail.com>

From: Sunil Goutham <sgoutham@marvell.com>

Added a new mailbox message for a PF/VF to set/update
it's NIXLF's MAC address. Also updates unicast NPC
MCAM entry with this address as matching DMAC.

Signed-off-by: Sunil Goutham <sgoutham@marvell.com>
---
 drivers/net/ethernet/marvell/octeontx2/af/mbox.h   |  8 ++++++-
 drivers/net/ethernet/marvell/octeontx2/af/rvu.h    |  3 +++
 .../net/ethernet/marvell/octeontx2/af/rvu_nix.c    | 25 ++++++++++++++++++++++
 3 files changed, 35 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/marvell/octeontx2/af/mbox.h b/drivers/net/ethernet/marvell/octeontx2/af/mbox.h
index 32d70bf..afa2ead 100644
--- a/drivers/net/ethernet/marvell/octeontx2/af/mbox.h
+++ b/drivers/net/ethernet/marvell/octeontx2/af/mbox.h
@@ -158,7 +158,8 @@ M(NIX_TXSCH_FREE,	0x8005, nix_txsch_free_req, msg_rsp)		\
 M(NIX_TXSCHQ_CFG,	0x8006, nix_txschq_config, msg_rsp)		\
 M(NIX_STATS_RST,	0x8007, msg_req, msg_rsp)			\
 M(NIX_VTAG_CFG,	0x8008, nix_vtag_config, msg_rsp)		\
-M(NIX_RSS_FLOWKEY_CFG,  0x8009, nix_rss_flowkey_cfg, msg_rsp)
+M(NIX_RSS_FLOWKEY_CFG,  0x8009, nix_rss_flowkey_cfg, msg_rsp)		\
+M(NIX_SET_MAC_ADDR,	0x800a, nix_set_mac_addr, msg_rsp)
 
 /* Messages initiated by AF (range 0xC00 - 0xDFF) */
 #define MBOX_UP_CGX_MESSAGES						\
@@ -507,4 +508,9 @@ struct nix_rss_flowkey_cfg {
 	u8	group;       /* RSS context or group */
 };
 
+struct nix_set_mac_addr {
+	struct mbox_msghdr hdr;
+	u8 mac_addr[ETH_ALEN]; /* MAC address to be set for this pcifunc */
+};
+
 #endif /* MBOX_H */
diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu.h b/drivers/net/ethernet/marvell/octeontx2/af/rvu.h
index b169657..93e6891 100644
--- a/drivers/net/ethernet/marvell/octeontx2/af/rvu.h
+++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu.h
@@ -344,6 +344,9 @@ int rvu_mbox_handler_NIX_VTAG_CFG(struct rvu *rvu,
 int rvu_mbox_handler_NIX_RSS_FLOWKEY_CFG(struct rvu *rvu,
 					 struct nix_rss_flowkey_cfg *req,
 					 struct msg_rsp *rsp);
+int rvu_mbox_handler_NIX_SET_MAC_ADDR(struct rvu *rvu,
+				      struct nix_set_mac_addr *req,
+				      struct msg_rsp *rsp);
 
 /* NPC APIs */
 int rvu_npc_init(struct rvu *rvu);
diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c b/drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c
index d4dcdbb..3caf81b 100644
--- a/drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c
+++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c
@@ -1720,6 +1720,31 @@ static void nix_rx_flowkey_alg_cfg(struct rvu *rvu, int blkaddr)
 	}
 }
 
+int rvu_mbox_handler_NIX_SET_MAC_ADDR(struct rvu *rvu,
+				      struct nix_set_mac_addr *req,
+				      struct msg_rsp *rsp)
+{
+	struct rvu_hwinfo *hw = rvu->hw;
+	u16 pcifunc = req->hdr.pcifunc;
+	struct rvu_pfvf *pfvf;
+	int blkaddr, nixlf;
+
+	pfvf = rvu_get_pfvf(rvu, pcifunc);
+	blkaddr = rvu_get_blkaddr(rvu, BLKTYPE_NIX, pcifunc);
+	if (!pfvf->nixlf || blkaddr < 0)
+		return NIX_AF_ERR_AF_LF_INVALID;
+
+	nixlf = rvu_get_lf(rvu, &hw->block[blkaddr], pcifunc, 0);
+	if (nixlf < 0)
+		return NIX_AF_ERR_AF_LF_INVALID;
+
+	ether_addr_copy(pfvf->mac_addr, req->mac_addr);
+
+	rvu_npc_install_ucast_entry(rvu, pcifunc, nixlf,
+				    pfvf->rx_chan_base, req->mac_addr);
+	return 0;
+}
+
 static int nix_calibrate_x2p(struct rvu *rvu, int blkaddr)
 {
 	int idx, err;
-- 
2.7.4

^ permalink raw reply related

* [PATCH 17/17] octeontx2-af: Support for NIXLF's UCAST/PROMISC/ALLMULTI modes
From: sunil.kovvuri @ 2018-10-19 13:37 UTC (permalink / raw)
  To: netdev, davem; +Cc: arnd, linux-soc, Sunil Goutham
In-Reply-To: <1539956258-29377-1-git-send-email-sunil.kovvuri@gmail.com>

From: Sunil Goutham <sgoutham@marvell.com>

By default NIXLF is set in UCAST mode. This patch adds a new
mailbox message which when sent by a RVU PF changes this default
mode. When promiscuous mode is needed, the reserved promisc entry
for each of RVU PF is setup to match against ingress channel number
only, so that all pkts on that channel are accepted and forwarded
to the mode change requesting PF_FUNC's NIXLF.

PROMISC and ALLMULTI modes are supported only for PFs, for VFs only
UCAST mode is supported.

Signed-off-by: Sunil Goutham <sgoutham@marvell.com>
---
 drivers/net/ethernet/marvell/octeontx2/af/mbox.h   | 11 ++++-
 drivers/net/ethernet/marvell/octeontx2/af/rvu.h    |  5 ++
 .../net/ethernet/marvell/octeontx2/af/rvu_nix.c    | 33 +++++++++++++
 .../net/ethernet/marvell/octeontx2/af/rvu_npc.c    | 57 ++++++++++++++++++++++
 4 files changed, 105 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/marvell/octeontx2/af/mbox.h b/drivers/net/ethernet/marvell/octeontx2/af/mbox.h
index afa2ead..a15a59c 100644
--- a/drivers/net/ethernet/marvell/octeontx2/af/mbox.h
+++ b/drivers/net/ethernet/marvell/octeontx2/af/mbox.h
@@ -159,7 +159,8 @@ M(NIX_TXSCHQ_CFG,	0x8006, nix_txschq_config, msg_rsp)		\
 M(NIX_STATS_RST,	0x8007, msg_req, msg_rsp)			\
 M(NIX_VTAG_CFG,	0x8008, nix_vtag_config, msg_rsp)		\
 M(NIX_RSS_FLOWKEY_CFG,  0x8009, nix_rss_flowkey_cfg, msg_rsp)		\
-M(NIX_SET_MAC_ADDR,	0x800a, nix_set_mac_addr, msg_rsp)
+M(NIX_SET_MAC_ADDR,	0x800a, nix_set_mac_addr, msg_rsp)		\
+M(NIX_SET_RX_MODE,	0x800b, nix_rx_mode, msg_rsp)
 
 /* Messages initiated by AF (range 0xC00 - 0xDFF) */
 #define MBOX_UP_CGX_MESSAGES						\
@@ -513,4 +514,12 @@ struct nix_set_mac_addr {
 	u8 mac_addr[ETH_ALEN]; /* MAC address to be set for this pcifunc */
 };
 
+struct nix_rx_mode {
+	struct mbox_msghdr hdr;
+#define NIX_RX_MODE_UCAST	BIT(0)
+#define NIX_RX_MODE_PROMISC	BIT(1)
+#define NIX_RX_MODE_ALLMULTI	BIT(2)
+	u16	mode;
+};
+
 #endif /* MBOX_H */
diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu.h b/drivers/net/ethernet/marvell/octeontx2/af/rvu.h
index 93e6891..2c0580c 100644
--- a/drivers/net/ethernet/marvell/octeontx2/af/rvu.h
+++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu.h
@@ -347,6 +347,8 @@ int rvu_mbox_handler_NIX_RSS_FLOWKEY_CFG(struct rvu *rvu,
 int rvu_mbox_handler_NIX_SET_MAC_ADDR(struct rvu *rvu,
 				      struct nix_set_mac_addr *req,
 				      struct msg_rsp *rsp);
+int rvu_mbox_handler_NIX_SET_RX_MODE(struct rvu *rvu, struct nix_rx_mode *req,
+				     struct msg_rsp *rsp);
 
 /* NPC APIs */
 int rvu_npc_init(struct rvu *rvu);
@@ -355,6 +357,9 @@ int rvu_npc_get_pkind(struct rvu *rvu, u16 pf);
 void rvu_npc_set_pkind(struct rvu *rvu, int pkind, struct rvu_pfvf *pfvf);
 void rvu_npc_install_ucast_entry(struct rvu *rvu, u16 pcifunc,
 				 int nixlf, u64 chan, u8 *mac_addr);
+void rvu_npc_install_promisc_entry(struct rvu *rvu, u16 pcifunc,
+				   int nixlf, u64 chan, bool allmulti);
+void rvu_npc_disable_promisc_entry(struct rvu *rvu, u16 pcifunc, int nixlf);
 void rvu_npc_install_bcast_match_entry(struct rvu *rvu, u16 pcifunc,
 				       int nixlf, u64 chan);
 void rvu_npc_disable_mcam_entries(struct rvu *rvu, u16 pcifunc, int nixlf);
diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c b/drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c
index 3caf81b..8890c95 100644
--- a/drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c
+++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c
@@ -1745,6 +1745,39 @@ int rvu_mbox_handler_NIX_SET_MAC_ADDR(struct rvu *rvu,
 	return 0;
 }
 
+int rvu_mbox_handler_NIX_SET_RX_MODE(struct rvu *rvu, struct nix_rx_mode *req,
+				     struct msg_rsp *rsp)
+{
+	bool allmulti = false, disable_promisc = false;
+	struct rvu_hwinfo *hw = rvu->hw;
+	u16 pcifunc = req->hdr.pcifunc;
+	struct rvu_pfvf *pfvf;
+	int blkaddr, nixlf;
+
+	pfvf = rvu_get_pfvf(rvu, pcifunc);
+	blkaddr = rvu_get_blkaddr(rvu, BLKTYPE_NIX, pcifunc);
+	if (!pfvf->nixlf || blkaddr < 0)
+		return NIX_AF_ERR_AF_LF_INVALID;
+
+	nixlf = rvu_get_lf(rvu, &hw->block[blkaddr], pcifunc, 0);
+	if (nixlf < 0)
+		return NIX_AF_ERR_AF_LF_INVALID;
+
+	if (req->mode & NIX_RX_MODE_PROMISC)
+		allmulti = false;
+	else if (req->mode & NIX_RX_MODE_ALLMULTI)
+		allmulti = true;
+	else
+		disable_promisc = true;
+
+	if (disable_promisc)
+		rvu_npc_disable_promisc_entry(rvu, pcifunc, nixlf);
+	else
+		rvu_npc_install_promisc_entry(rvu, pcifunc, nixlf,
+					      pfvf->rx_chan_base, allmulti);
+	return 0;
+}
+
 static int nix_calibrate_x2p(struct rvu *rvu, int blkaddr)
 {
 	int idx, err;
diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu_npc.c b/drivers/net/ethernet/marvell/octeontx2/af/rvu_npc.c
index d486112..255e24c 100644
--- a/drivers/net/ethernet/marvell/octeontx2/af/rvu_npc.c
+++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu_npc.c
@@ -310,6 +310,61 @@ void rvu_npc_install_ucast_entry(struct rvu *rvu, u16 pcifunc,
 			      NIX_INTF_RX, &entry, true);
 }
 
+void rvu_npc_install_promisc_entry(struct rvu *rvu, u16 pcifunc,
+				   int nixlf, u64 chan, bool allmulti)
+{
+	struct npc_mcam *mcam = &rvu->hw->mcam;
+	struct mcam_entry entry = { {0} };
+	struct nix_rx_action action;
+	int blkaddr, index, kwi;
+
+	blkaddr = rvu_get_blkaddr(rvu, BLKTYPE_NPC, 0);
+	if (blkaddr < 0)
+		return;
+
+	/* Only PF or AF VF can add a promiscuous entry */
+	if (pcifunc & RVU_PFVF_FUNC_MASK)
+		return;
+
+	index = npc_get_nixlf_mcam_index(mcam, pcifunc,
+					 nixlf, NIXLF_PROMISC_ENTRY);
+
+	entry.kw[0] = chan;
+	entry.kw_mask[0] = 0xFFFULL;
+
+	if (allmulti) {
+		kwi = NPC_PARSE_RESULT_DMAC_OFFSET / sizeof(u64);
+		entry.kw[kwi] = BIT_ULL(40); /* LSB bit of 1st byte in DMAC */
+		entry.kw_mask[kwi] = BIT_ULL(40);
+	}
+
+	*(u64 *)&action = 0x00;
+	action.op = NIX_RX_ACTIONOP_UCAST;
+	action.pf_func = pcifunc;
+
+	entry.action = *(u64 *)&action;
+	npc_config_mcam_entry(rvu, mcam, blkaddr, index,
+			      NIX_INTF_RX, &entry, true);
+}
+
+void rvu_npc_disable_promisc_entry(struct rvu *rvu, u16 pcifunc, int nixlf)
+{
+	struct npc_mcam *mcam = &rvu->hw->mcam;
+	int blkaddr, index;
+
+	blkaddr = rvu_get_blkaddr(rvu, BLKTYPE_NPC, 0);
+	if (blkaddr < 0)
+		return;
+
+	/* Only PF's have a promiscuous entry */
+	if (pcifunc & RVU_PFVF_FUNC_MASK)
+		return;
+
+	index = npc_get_nixlf_mcam_index(mcam, pcifunc,
+					 nixlf, NIXLF_PROMISC_ENTRY);
+	npc_enable_mcam_entry(rvu, mcam, blkaddr, index, false);
+}
+
 void rvu_npc_install_bcast_match_entry(struct rvu *rvu, u16 pcifunc,
 				       int nixlf, u64 chan)
 {
@@ -427,6 +482,8 @@ void rvu_npc_disable_mcam_entries(struct rvu *rvu, u16 pcifunc, int nixlf)
 				     NPC_AF_MCAMEX_BANKX_ACTION(index, bank));
 		if (action.op != NIX_RX_ACTIONOP_MCAST)
 			npc_enable_mcam_entry(rvu, mcam, blkaddr, index, false);
+
+		rvu_npc_disable_promisc_entry(rvu, pcifunc, nixlf);
 	}
 }
 
-- 
2.7.4

^ permalink raw reply related

* Re: [PATCH ghak90 (was ghak32) V4 02/10] audit: add container id
From: Richard Guy Briggs @ 2018-10-19 21:50 UTC (permalink / raw)
  To: Paul Moore
  Cc: containers, linux-api, linux-audit, linux-fsdevel, linux-kernel,
	netdev, netfilter-devel, ebiederm, luto, carlos, dhowells, viro,
	simo, Eric Paris, Serge Hallyn
In-Reply-To: <CAHC9VhTT7n4wDoaDGFXAK+wtcsMDDMRkQVCQ2jNYLKSFqFc+bA@mail.gmail.com>

On 2018-10-19 15:38, Paul Moore wrote:
> On Tue, Jul 31, 2018 at 4:11 PM Richard Guy Briggs <rgb@redhat.com> wrote:
> >
> > Implement the proc fs write to set the audit container identifier of a
> > process, emitting an AUDIT_CONTAINER_OP record to document the event.
> >
> > This is a write from the container orchestrator task to a proc entry of
> > the form /proc/PID/audit_containerid where PID is the process ID of the
> > newly created task that is to become the first task in a container, or
> > an additional task added to a container.
> >
> > The write expects up to a u64 value (unset: 18446744073709551615).
> >
> > The writer must have capability CAP_AUDIT_CONTROL.
> >
> > This will produce a record such as this:
> >   type=CONTAINER_ID msg=audit(2018-06-06 12:39:29.636:26949) : op=set opid=2209 old-contid=18446744073709551615 contid=123456 pid=628 auid=root uid=root tty=ttyS0 ses=1 subj=unconfined_u:unconfined_r:unconfined_t:s0-s0:c0.c1023 comm=bash exe=/usr/bin/bash res=yes
> 
> You need to update the record type in the example above.

Yup, thanks.

> > The "op" field indicates an initial set.  The "pid" to "ses" fields are
> > the orchestrator while the "opid" field is the object's PID, the process
> > being "contained".  Old and new audit container identifier values are
> > given in the "contid" fields, while res indicates its success.
> 
> I understand Steve's concern around the "op" field, but I think it
> might be a bit premature to think we might not need to do some sort of
> audit container ID management in the future that would want to make
> use of the CONTAINER_OP message type.  I would like to see the "op"
> field preserved.

I strongly agree.

> > It is not permitted to unset the audit container identifier.
> > A child inherits its parent's audit container identifier.
> >
> > See: https://github.com/linux-audit/audit-kernel/issues/90
> > See: https://github.com/linux-audit/audit-userspace/issues/51
> > See: https://github.com/linux-audit/audit-testsuite/issues/64
> > See: https://github.com/linux-audit/audit-kernel/wiki/RFE-Audit-Container-ID
> >
> > Signed-off-by: Richard Guy Briggs <rgb@redhat.com>
> > Acked-by: Serge Hallyn <serge@hallyn.com>
> > Acked-by: Steve Grubb <sgrubb@redhat.com>
> > ---
> >  fs/proc/base.c             | 37 +++++++++++++++++++++++++
> >  include/linux/audit.h      | 24 ++++++++++++++++
> >  include/uapi/linux/audit.h |  2 ++
> >  kernel/auditsc.c           | 68 ++++++++++++++++++++++++++++++++++++++++++++++
> >  4 files changed, 131 insertions(+)
> 
> ...
> 
> > @@ -2112,6 +2114,72 @@ int audit_set_loginuid(kuid_t loginuid)
> >  }
> >
> >  /**
> > + * audit_set_contid - set current task's audit_context contid
> > + * @contid: contid value
> > + *
> > + * Returns 0 on success, -EPERM on permission failure.
> > + *
> > + * Called (set) from fs/proc/base.c::proc_contid_write().
> > + */
> > +int audit_set_contid(struct task_struct *task, u64 contid)
> > +{
> > +       u64 oldcontid;
> > +       int rc = 0;
> > +       struct audit_buffer *ab;
> > +       uid_t uid;
> > +       struct tty_struct *tty;
> > +       char comm[sizeof(current->comm)];
> > +
> > +       task_lock(task);
> > +       /* Can't set if audit disabled */
> > +       if (!task->audit) {
> > +               task_unlock(task);
> > +               return -ENOPROTOOPT;
> > +       }
> > +       oldcontid = audit_get_contid(task);
> > +       read_lock(&tasklist_lock);
> 
> I assume lockdep was happy with nesting the tasklist_lock inside the task lock?

Yup, I had gone through the logic and at first I had doubts, but the
function comments and other usage reassured me (as well as in-kernel
lock checks on boot) that this was the right order and approach.

> > +       /* Don't allow the audit containerid to be unset */
> > +       if (!audit_contid_valid(contid))
> > +               rc = -EINVAL;
> > +       /* if we don't have caps, reject */
> > +       else if (!capable(CAP_AUDIT_CONTROL))
> > +               rc = -EPERM;
> > +       /* if task has children or is not single-threaded, deny */
> > +       else if (!list_empty(&task->children))
> > +               rc = -EBUSY;
> > +       else if (!(thread_group_leader(task) && thread_group_empty(task)))
> > +               rc = -EALREADY;
> > +       read_unlock(&tasklist_lock);
> > +       if (!rc)
> > +               task->audit->contid = contid;
> > +       task_unlock(task);
> > +
> > +       if (!audit_enabled)
> > +               return rc;
> > +
> > +       ab = audit_log_start(audit_context(), GFP_KERNEL, AUDIT_CONTAINER_OP);
> > +       if (!ab)
> > +               return rc;
> > +
> > +       uid = from_kuid(&init_user_ns, task_uid(current));
> > +       tty = audit_get_tty(current);
> > +       audit_log_format(ab, "op=set opid=%d old-contid=%llu contid=%llu pid=%d uid=%u auid=%u tty=%s ses=%u",
> > +                        task_tgid_nr(task), oldcontid, contid,
> > +                        task_tgid_nr(current), uid,
> > +                        from_kuid(&init_user_ns, audit_get_loginuid(current)),
> > +                        tty ? tty_name(tty) : "(none)",
> > +                        audit_get_sessionid(current));
> > +       audit_put_tty(tty);
> > +       audit_log_task_context(ab);
> > +       audit_log_format(ab, " comm=");
> > +       audit_log_untrustedstring(ab, get_task_comm(comm, current));
> > +       audit_log_d_path_exe(ab, current->mm);
> > +       audit_log_format(ab, " res=%d", !rc);
> > +       audit_log_end(ab);
> > +       return rc;
> > +}
> 
> -- 
> paul moore
> www.paul-moore.com

- RGB

--
Richard Guy Briggs <rgb@redhat.com>
Sr. S/W Engineer, Kernel Security, Base Operating Systems
Remote, Ottawa, Red Hat Canada
IRC: rgb, SunRaycer
Voice: +1.647.777.2635, Internal: (81) 32635

^ permalink raw reply

* [PATCH bpf-next v2 0/2] improve and fix barriers for walking perf ring buffer
From: Daniel Borkmann @ 2018-10-19 13:51 UTC (permalink / raw)
  To: alexei.starovoitov
  Cc: peterz, paulmck, will.deacon, acme, yhs, john.fastabend, netdev,
	Daniel Borkmann

This set first adds smp_* barrier variants to tools infrastructure
and updates perf and libbpf to make use of them. For details, please
see individual patches, thanks!

Arnaldo, if there are no objections, could this be routed via bpf-next
with Acked-by's due to later dependencies in libbpf? Alternatively,
I could also get the 2nd patch out during merge window, but perhaps
it's okay to do in one go as there shouldn't be much conflict in perf
itself.

Thanks!

v1 -> v2:
  - add common helper and switch to acquire/release variants
    when possible, thanks Peter!

Daniel Borkmann (2):
  tools, perf: add and use optimized ring_buffer_{read_head,write_tail} helpers
  bpf, libbpf: use correct barriers in perf ring buffer walk

 tools/arch/arm64/include/asm/barrier.h    | 70 +++++++++++++++++++++++++++++
 tools/arch/ia64/include/asm/barrier.h     | 13 ++++++
 tools/arch/powerpc/include/asm/barrier.h  | 16 +++++++
 tools/arch/s390/include/asm/barrier.h     | 13 ++++++
 tools/arch/sparc/include/asm/barrier_64.h | 13 ++++++
 tools/arch/x86/include/asm/barrier.h      | 14 ++++++
 tools/include/asm/barrier.h               | 35 +++++++++++++++
 tools/include/linux/ring_buffer.h         | 73 +++++++++++++++++++++++++++++++
 tools/lib/bpf/libbpf.c                    | 10 ++---
 tools/perf/util/mmap.h                    | 15 ++-----
 10 files changed, 254 insertions(+), 18 deletions(-)
 create mode 100644 tools/include/linux/ring_buffer.h

-- 
2.9.5

^ permalink raw reply

* [PATCH bpf-next v2 2/2] bpf, libbpf: use correct barriers in perf ring buffer walk
From: Daniel Borkmann @ 2018-10-19 13:51 UTC (permalink / raw)
  To: alexei.starovoitov
  Cc: peterz, paulmck, will.deacon, acme, yhs, john.fastabend, netdev,
	Daniel Borkmann
In-Reply-To: <20181019135103.3602-1-daniel@iogearbox.net>

Given libbpf is a generic library and not restricted to x86-64 only,
the compiler barrier in bpf_perf_event_read_simple() after fetching
the head needs to be replaced with smp_rmb() at minimum. Also, writing
out the tail we should use WRITE_ONCE() to avoid store tearing.

Now that we have the logic in place in ring_buffer_read_head() and
ring_buffer_write_tail() helper also used by perf tool which would
select the correct and best variant for a given architecture (e.g.
x86-64 can avoid CPU barriers entirely), make use of these in order
to fix bpf_perf_event_read_simple().

Fixes: d0cabbb021be ("tools: bpf: move the event reading loop to libbpf")
Fixes: 39111695b1b8 ("samples: bpf: add bpf_perf_event_output example")
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Cc: Will Deacon <will.deacon@arm.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/lib/bpf/libbpf.c | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index bd71efc..0c21355 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -27,6 +27,7 @@
 #include <linux/list.h>
 #include <linux/limits.h>
 #include <linux/perf_event.h>
+#include <linux/ring_buffer.h>
 #include <sys/stat.h>
 #include <sys/types.h>
 #include <sys/vfs.h>
@@ -2418,13 +2419,12 @@ bpf_perf_event_read_simple(void *mem, unsigned long size,
 			   unsigned long page_size, void **buf, size_t *buf_len,
 			   bpf_perf_event_print_t fn, void *priv)
 {
-	volatile struct perf_event_mmap_page *header = mem;
+	struct perf_event_mmap_page *header = mem;
+	__u64 data_head = ring_buffer_read_head(header);
 	__u64 data_tail = header->data_tail;
-	__u64 data_head = header->data_head;
 	int ret = LIBBPF_PERF_EVENT_ERROR;
 	void *base, *begin, *end;
 
-	asm volatile("" ::: "memory"); /* in real code it should be smp_rmb() */
 	if (data_head == data_tail)
 		return LIBBPF_PERF_EVENT_CONT;
 
@@ -2467,8 +2467,6 @@ bpf_perf_event_read_simple(void *mem, unsigned long size,
 		data_tail += ehdr->size;
 	}
 
-	__sync_synchronize(); /* smp_mb() */
-	header->data_tail = data_tail;
-
+	ring_buffer_write_tail(header, data_tail);
 	return ret;
 }
-- 
2.9.5

^ permalink raw reply related

* [PATCH bpf-next v2 1/2] tools, perf: add and use optimized ring_buffer_{read_head,write_tail} helpers
From: Daniel Borkmann @ 2018-10-19 13:51 UTC (permalink / raw)
  To: alexei.starovoitov
  Cc: peterz, paulmck, will.deacon, acme, yhs, john.fastabend, netdev,
	Daniel Borkmann
In-Reply-To: <20181019135103.3602-1-daniel@iogearbox.net>

Currently, on x86-64, perf uses LFENCE and MFENCE (rmb() and mb(),
respectively) when processing events from the perf ring buffer which
is unnecessarily expensive as we can do more lightweight in particular
given this is critical fast-path in perf.

According to Peter rmb()/mb() were added back then via a94d342b9cb0
("tools/perf: Add required memory barriers") at a time where kernel
still supported chips that needed it, but nowadays support for these
has been ditched completely, therefore we can fix them up as well.

While for x86-64, replacing rmb() and mb() with smp_*() variants would
result in just a compiler barrier for the former and LOCK + ADD for
the latter (__sync_synchronize() uses slower MFENCE by the way), Peter
suggested we can use smp_{load_acquire,store_release}() instead for
architectures where its implementation doesn't resolve in slower smp_mb().
Thus, e.g. in x86-64 we would be able to avoid CPU barrier entirely due
to TSO. For architectures where the latter needs to use smp_mb() e.g.
on arm, we stick to cheaper smp_rmb() variant for fetching the head.

This work adds helpers ring_buffer_read_head() and ring_buffer_write_tail()
for tools infrastructure that either switches to smp_load_acquire() for
architectures where it is cheaper or uses READ_ONCE() + smp_rmb() barrier
for those where it's not in order to fetch the data_head from the perf
control page, and it uses smp_store_release() to write the data_tail.
Latter is smp_mb() + WRITE_ONCE() combination or a cheaper variant if
architecture allows for it. Those that rely on smp_rmb() and smp_mb() can
further improve performance in a follow up step by implementing the two
under tools/arch/*/include/asm/barrier.h such that they don't have to
fallback to rmb() and mb() in tools/include/asm/barrier.h.

Switch perf to use ring_buffer_read_head() and ring_buffer_write_tail()
so it can make use of the optimizations. Later, we convert libbpf as
well to use the same helpers.

Side note [0]: the topic has been raised of whether one could simply use
the C11 gcc builtins [1] for the smp_load_acquire() and smp_store_release()
instead:

  __atomic_load_n(ptr, __ATOMIC_ACQUIRE);
  __atomic_store_n(ptr, val, __ATOMIC_RELEASE);

Kernel and (presumably) tooling shipped along with the kernel has a
minimum requirement of being able to build with gcc-4.6 and the latter
does not have C11 builtins. While generally the C11 memory models don't
align with the kernel's, the C11 load-acquire and store-release alone
/could/ suffice, however. Issue is that this is implementation dependent
on how the load-acquire and store-release is done by the compiler and
the mapping of supported compilers must align to be compatible with the
kernel's implementation, and thus needs to be verified/tracked on a
case by case basis whether they match (unless an architecture uses them
also from kernel side). The implementations for smp_load_acquire() and
smp_store_release() in this patch have been adapted from the kernel side
ones to have a concrete and compatible mapping in place.

  [0] http://patchwork.ozlabs.org/patch/985422/
  [1] https://gcc.gnu.org/onlinedocs/gcc/_005f_005fatomic-Builtins.html

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Cc: Will Deacon <will.deacon@arm.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/arch/arm64/include/asm/barrier.h    | 70 +++++++++++++++++++++++++++++
 tools/arch/ia64/include/asm/barrier.h     | 13 ++++++
 tools/arch/powerpc/include/asm/barrier.h  | 16 +++++++
 tools/arch/s390/include/asm/barrier.h     | 13 ++++++
 tools/arch/sparc/include/asm/barrier_64.h | 13 ++++++
 tools/arch/x86/include/asm/barrier.h      | 14 ++++++
 tools/include/asm/barrier.h               | 35 +++++++++++++++
 tools/include/linux/ring_buffer.h         | 73 +++++++++++++++++++++++++++++++
 tools/perf/util/mmap.h                    | 15 ++-----
 9 files changed, 250 insertions(+), 12 deletions(-)
 create mode 100644 tools/include/linux/ring_buffer.h

diff --git a/tools/arch/arm64/include/asm/barrier.h b/tools/arch/arm64/include/asm/barrier.h
index 40bde6b..12835ea 100644
--- a/tools/arch/arm64/include/asm/barrier.h
+++ b/tools/arch/arm64/include/asm/barrier.h
@@ -14,4 +14,74 @@
 #define wmb()		asm volatile("dmb ishst" ::: "memory")
 #define rmb()		asm volatile("dmb ishld" ::: "memory")
 
+#define smp_store_release(p, v)					\
+do {								\
+	union { typeof(*p) __val; char __c[1]; } __u =		\
+		{ .__val = (__force typeof(*p)) (v) }; 		\
+								\
+	switch (sizeof(*p)) {					\
+	case 1:							\
+		asm volatile ("stlrb %w1, %0"			\
+				: "=Q" (*p)			\
+				: "r" (*(__u8 *)__u.__c)	\
+				: "memory");			\
+		break;						\
+	case 2:							\
+		asm volatile ("stlrh %w1, %0"			\
+				: "=Q" (*p)			\
+				: "r" (*(__u16 *)__u.__c)	\
+				: "memory");			\
+		break;						\
+	case 4:							\
+		asm volatile ("stlr %w1, %0"			\
+				: "=Q" (*p)			\
+				: "r" (*(__u32 *)__u.__c)	\
+				: "memory");			\
+		break;						\
+	case 8:							\
+		asm volatile ("stlr %1, %0"			\
+				: "=Q" (*p)			\
+				: "r" (*(__u64 *)__u.__c)	\
+				: "memory");			\
+		break;						\
+	default:						\
+		/* Only to shut up gcc ... */			\
+		mb();						\
+		break;						\
+	}							\
+} while (0)
+
+#define smp_load_acquire(p)					\
+({								\
+	union { typeof(*p) __val; char __c[1]; } __u;		\
+								\
+	switch (sizeof(*p)) {					\
+	case 1:							\
+		asm volatile ("ldarb %w0, %1"			\
+			: "=r" (*(__u8 *)__u.__c)		\
+			: "Q" (*p) : "memory");			\
+		break;						\
+	case 2:							\
+		asm volatile ("ldarh %w0, %1"			\
+			: "=r" (*(__u16 *)__u.__c)		\
+			: "Q" (*p) : "memory");			\
+		break;						\
+	case 4:							\
+		asm volatile ("ldar %w0, %1"			\
+			: "=r" (*(__u32 *)__u.__c)		\
+			: "Q" (*p) : "memory");			\
+		break;						\
+	case 8:							\
+		asm volatile ("ldar %0, %1"			\
+			: "=r" (*(__u64 *)__u.__c)		\
+			: "Q" (*p) : "memory");			\
+		break;						\
+	default:						\
+		/* Only to shut up gcc ... */			\
+		mb();						\
+		break;						\
+	}							\
+	__u.__val;						\
+})
+
 #endif /* _TOOLS_LINUX_ASM_AARCH64_BARRIER_H */
diff --git a/tools/arch/ia64/include/asm/barrier.h b/tools/arch/ia64/include/asm/barrier.h
index d808ee0..4d471d9 100644
--- a/tools/arch/ia64/include/asm/barrier.h
+++ b/tools/arch/ia64/include/asm/barrier.h
@@ -46,4 +46,17 @@
 #define rmb()		mb()
 #define wmb()		mb()
 
+#define smp_store_release(p, v)			\
+do {						\
+	barrier();				\
+	WRITE_ONCE(*p, v);			\
+} while (0)
+
+#define smp_load_acquire(p)			\
+({						\
+	typeof(*p) ___p1 = READ_ONCE(*p);	\
+	barrier();				\
+	___p1;					\
+})
+
 #endif /* _TOOLS_LINUX_ASM_IA64_BARRIER_H */
diff --git a/tools/arch/powerpc/include/asm/barrier.h b/tools/arch/powerpc/include/asm/barrier.h
index a634da0..905a2c6 100644
--- a/tools/arch/powerpc/include/asm/barrier.h
+++ b/tools/arch/powerpc/include/asm/barrier.h
@@ -27,4 +27,20 @@
 #define rmb()  __asm__ __volatile__ ("sync" : : : "memory")
 #define wmb()  __asm__ __volatile__ ("sync" : : : "memory")
 
+#if defined(__powerpc64__)
+#define smp_lwsync()	__asm__ __volatile__ ("lwsync" : : : "memory")
+
+#define smp_store_release(p, v)			\
+do {						\
+	smp_lwsync();				\
+	WRITE_ONCE(*p, v);			\
+} while (0)
+
+#define smp_load_acquire(p)			\
+({						\
+	typeof(*p) ___p1 = READ_ONCE(*p);	\
+	smp_lwsync();				\
+	___p1;					\
+})
+#endif /* defined(__powerpc64__) */
 #endif /* _TOOLS_LINUX_ASM_POWERPC_BARRIER_H */
diff --git a/tools/arch/s390/include/asm/barrier.h b/tools/arch/s390/include/asm/barrier.h
index 5030c99..de362fa6 100644
--- a/tools/arch/s390/include/asm/barrier.h
+++ b/tools/arch/s390/include/asm/barrier.h
@@ -28,4 +28,17 @@
 #define rmb()				mb()
 #define wmb()				mb()
 
+#define smp_store_release(p, v)			\
+do {						\
+	barrier();				\
+	WRITE_ONCE(*p, v);			\
+} while (0)
+
+#define smp_load_acquire(p)			\
+({						\
+	typeof(*p) ___p1 = READ_ONCE(*p);	\
+	barrier();				\
+	___p1;					\
+})
+
 #endif /* __TOOLS_LIB_ASM_BARRIER_H */
diff --git a/tools/arch/sparc/include/asm/barrier_64.h b/tools/arch/sparc/include/asm/barrier_64.h
index ba61344..cfb0fdc 100644
--- a/tools/arch/sparc/include/asm/barrier_64.h
+++ b/tools/arch/sparc/include/asm/barrier_64.h
@@ -40,4 +40,17 @@ do {	__asm__ __volatile__("ba,pt	%%xcc, 1f\n\t" \
 #define rmb()	__asm__ __volatile__("":::"memory")
 #define wmb()	__asm__ __volatile__("":::"memory")
 
+#define smp_store_release(p, v)			\
+do {						\
+	barrier();				\
+	WRITE_ONCE(*p, v);			\
+} while (0)
+
+#define smp_load_acquire(p)			\
+({						\
+	typeof(*p) ___p1 = READ_ONCE(*p);	\
+	barrier();				\
+	___p1;					\
+})
+
 #endif /* !(__TOOLS_LINUX_SPARC64_BARRIER_H) */
diff --git a/tools/arch/x86/include/asm/barrier.h b/tools/arch/x86/include/asm/barrier.h
index 8774dee..5891986 100644
--- a/tools/arch/x86/include/asm/barrier.h
+++ b/tools/arch/x86/include/asm/barrier.h
@@ -26,4 +26,18 @@
 #define wmb()	asm volatile("sfence" ::: "memory")
 #endif
 
+#if defined(__x86_64__)
+#define smp_store_release(p, v)			\
+do {						\
+	barrier();				\
+	WRITE_ONCE(*p, v);			\
+} while (0)
+
+#define smp_load_acquire(p)			\
+({						\
+	typeof(*p) ___p1 = READ_ONCE(*p);	\
+	barrier();				\
+	___p1;					\
+})
+#endif /* defined(__x86_64__) */
 #endif /* _TOOLS_LINUX_ASM_X86_BARRIER_H */
diff --git a/tools/include/asm/barrier.h b/tools/include/asm/barrier.h
index 391d942..8d378c5 100644
--- a/tools/include/asm/barrier.h
+++ b/tools/include/asm/barrier.h
@@ -1,4 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0 */
+#include <linux/compiler.h>
 #if defined(__i386__) || defined(__x86_64__)
 #include "../../arch/x86/include/asm/barrier.h"
 #elif defined(__arm__)
@@ -26,3 +27,37 @@
 #else
 #include <asm-generic/barrier.h>
 #endif
+
+/*
+ * Generic fallback smp_*() definitions for archs that haven't
+ * been updated yet.
+ */
+
+#ifndef smp_rmb
+# define smp_rmb()	rmb()
+#endif
+
+#ifndef smp_wmb
+# define smp_wmb()	wmb()
+#endif
+
+#ifndef smp_mb
+# define smp_mb()	mb()
+#endif
+
+#ifndef smp_store_release
+# define smp_store_release(p, v)		\
+do {						\
+	smp_mb();				\
+	WRITE_ONCE(*p, v);			\
+} while (0)
+#endif
+
+#ifndef smp_load_acquire
+# define smp_load_acquire(p)			\
+({						\
+	typeof(*p) ___p1 = READ_ONCE(*p);	\
+	smp_mb();				\
+	___p1;					\
+})
+#endif
diff --git a/tools/include/linux/ring_buffer.h b/tools/include/linux/ring_buffer.h
new file mode 100644
index 0000000..9a083ae
--- /dev/null
+++ b/tools/include/linux/ring_buffer.h
@@ -0,0 +1,73 @@
+#ifndef _TOOLS_LINUX_RING_BUFFER_H_
+#define _TOOLS_LINUX_RING_BUFFER_H_
+
+#include <asm/barrier.h>
+
+/*
+ * Contract with kernel for walking the perf ring buffer from
+ * user space requires the following barrier pairing (quote
+ * from kernel/events/ring_buffer.c):
+ *
+ *   Since the mmap() consumer (userspace) can run on a
+ *   different CPU:
+ *
+ *   kernel                             user
+ *
+ *   if (LOAD ->data_tail) {            LOAD ->data_head
+ *                      (A)             smp_rmb()       (C)
+ *      STORE $data                     LOAD $data
+ *      smp_wmb()       (B)             smp_mb()        (D)
+ *      STORE ->data_head               STORE ->data_tail
+ *   }
+ *
+ *   Where A pairs with D, and B pairs with C.
+ *
+ *   In our case A is a control dependency that separates the
+ *   load of the ->data_tail and the stores of $data. In case
+ *   ->data_tail indicates there is no room in the buffer to
+ *   store $data we do not.
+ *
+ *   D needs to be a full barrier since it separates the data
+ *   READ from the tail WRITE.
+ *
+ *   For B a WMB is sufficient since it separates two WRITEs,
+ *   and for C an RMB is sufficient since it separates two READs.
+ *
+ * Note, instead of B, C, D we could also use smp_store_release()
+ * in B and D as well as smp_load_acquire() in C.
+ *
+ * However, this optimization does not make sense for all kernel
+ * supported architectures since for a fair number it would
+ * resolve into READ_ONCE() + smp_mb() pair for smp_load_acquire(),
+ * and smp_mb() + WRITE_ONCE() pair for smp_store_release().
+ *
+ * Thus for those smp_wmb() in B and smp_rmb() in C would still
+ * be less expensive. For the case of D this has either the same
+ * cost or is less expensive, for example, due to TSO x86 can
+ * avoid the CPU barrier entirely.
+ */
+
+static inline u64 ring_buffer_read_head(struct perf_event_mmap_page *base)
+{
+/*
+ * Architectures where smp_load_acquire() does not fallback to
+ * READ_ONCE() + smp_mb() pair.
+ */
+#if defined(__x86_64__) || defined(__aarch64__) || defined(__powerpc64__) || \
+    defined(__ia64__) || defined(__sparc__) && defined(__arch64__)
+	return smp_load_acquire(&base->data_head);
+#else
+	u64 head = READ_ONCE(base->data_head);
+
+	smp_rmb();
+	return head;
+#endif
+}
+
+static inline void ring_buffer_write_tail(struct perf_event_mmap_page *base,
+					  u64 tail)
+{
+	smp_store_release(&base->data_tail, tail);
+}
+
+#endif /* _TOOLS_LINUX_RING_BUFFER_H_ */
diff --git a/tools/perf/util/mmap.h b/tools/perf/util/mmap.h
index 05a6d47..8f6531f 100644
--- a/tools/perf/util/mmap.h
+++ b/tools/perf/util/mmap.h
@@ -4,7 +4,7 @@
 #include <linux/compiler.h>
 #include <linux/refcount.h>
 #include <linux/types.h>
-#include <asm/barrier.h>
+#include <linux/ring_buffer.h>
 #include <stdbool.h>
 #include "auxtrace.h"
 #include "event.h"
@@ -71,21 +71,12 @@ void perf_mmap__consume(struct perf_mmap *map);
 
 static inline u64 perf_mmap__read_head(struct perf_mmap *mm)
 {
-	struct perf_event_mmap_page *pc = mm->base;
-	u64 head = READ_ONCE(pc->data_head);
-	rmb();
-	return head;
+	return ring_buffer_read_head(mm->base);
 }
 
 static inline void perf_mmap__write_tail(struct perf_mmap *md, u64 tail)
 {
-	struct perf_event_mmap_page *pc = md->base;
-
-	/*
-	 * ensure all reads are done before we write the tail out.
-	 */
-	mb();
-	pc->data_tail = tail;
+	ring_buffer_write_tail(md->base, tail);
 }
 
 union perf_event *perf_mmap__read_forward(struct perf_mmap *map);
-- 
2.9.5

^ permalink raw reply related

* [PATCH] bpf: btf: Fix a missing-check bug
From: Wenwen Wang @ 2018-10-19 22:29 UTC (permalink / raw)
  To: Wenwen Wang
  Cc: Kangjie Lu, Alexei Starovoitov, Daniel Borkmann,
	open list:BPF (Safe dynamic programs and tools),
	open list:BPF (Safe dynamic programs and tools)

In btf_parse(), the header of the user-space btf data 'btf_data' is firstly
parsed and verified through btf_parse_hdr(). In btf_parse_hdr(), the header
is copied from user-space 'btf_data' to kernel-space 'btf->hdr' and then
verified. If no error happens during the verification process, the whole
data of 'btf_data', including the header, is then copied to 'data' in
btf_parse(). It is obvious that the header is copied twice here. More
importantly, no check is enforced after the second copy to make sure the
headers obtained in these two copies are same. Given that 'btf_data'
resides in the user space, a malicious user can race to modify the header
between these two copies. By doing so, the user can inject inconsistent
data, which can cause undefined behavior of the kernel and introduce
potential security risk.

To avoid the above issue, this patch rewrites the header after the second
copy, using 'btf->hdr', which is obtained in the first copy.

Signed-off-by: Wenwen Wang <wang6495@umn.edu>
---
 kernel/bpf/btf.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 138f030..2a85f91 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -2202,6 +2202,9 @@ static struct btf *btf_parse(void __user *btf_data, u32 btf_data_size,
 		goto errout;
 	}

+	memcpy(data, &btf->hdr,
+		min_t(u32, btf->hdr.hdr_len, sizeof(btf->hdr)));
+
 	err = btf_parse_str_sec(env);
 	if (err)
 		goto errout;
-- 
2.7.4

^ permalink raw reply related

* [RFC PATCH v2 00/10] udp: implement GRO support
From: Paolo Abeni @ 2018-10-19 14:25 UTC (permalink / raw)
  To: netdev; +Cc: Willem de Bruijn, Steffen Klassert

This series implements GRO support for UDP sockets, as the RX counterpart
of commit bec1f6f69736 ("udp: generate gso with UDP_SEGMENT").
The core functionality is implemented by the second patch, introducing a new
sockopt to enable UDP_GRO, while patch 3 implements support for passing the
segment size to the user space via a new cmsg.
UDP GRO performs a socket lookup for each ingress packets and aggregate datagram
directed to UDP GRO enabled sockets with constant l4 tuple.

UDP GRO packets can land on non GRO-enabled sockets, e.g. due to iptables NAT
rules, and that could potentially confuse existing applications.

The solution adopted here is to de-segment the GRO packet before enqueuing
as needed. Since we must cope with packet reinsertion after de-segmentation,
the relevant code is factored-out in ipv4 and ipv6 specific helpers and exposed
to UDP usage.

While the current code can probably be improved, this safeguard ,implemented in
the patches 4-7, allows future enachements to enable UDP GSO offload on more
virtual devices eventually even on forwarded packets.

The last 4 for patches implement some performance and functional self-tests,
re-using the existing udpgso infrastructure. The problematic scenario described
above is explicitly tested.

v1 - v2:
 - use a new option to enable UDP GRO
 - use static keys to protect the UDP GRO socket lookup
 - cope with UDP GRO misdirection
 - add self-tests

Paolo Abeni (10):
  udp: implement complete book-keeping for encap_needed
  udp: implement GRO for plain UDP sockets.
  udp: add support for UDP_GRO cmsg
  ip: factor out protocol delivery helper
  ipv6: factor out protocol delivery helper
  udp: cope with UDP GRO packet misdirection
  selftests: add GRO support to udp bench rx program
  selftests: conditionally enable XDP support in udpgso_bench_rx
  selftests: add some benchmark for UDP GRO
  selftests: add functionals test for UDP GRO

 include/linux/udp.h                           |  42 +++-
 include/net/udp_tunnel.h                      |   6 +
 include/uapi/linux/udp.h                      |   1 +
 net/ipv4/ip_input.c                           |  73 ++++---
 net/ipv4/udp.c                                |  54 ++++-
 net/ipv4/udp_offload.c                        | 109 ++++++++--
 net/ipv6/ip6_input.c                          |  28 +--
 net/ipv6/udp.c                                |  44 +++-
 net/ipv6/udp_offload.c                        |   6 +-
 tools/testing/selftests/net/Makefile          |  70 +++++++
 tools/testing/selftests/net/udpgro.sh         | 144 +++++++++++++
 tools/testing/selftests/net/udpgro_bench.sh   |  94 +++++++++
 tools/testing/selftests/net/udpgso_bench.sh   |   2 +-
 tools/testing/selftests/net/udpgso_bench_rx.c | 195 ++++++++++++++++--
 tools/testing/selftests/net/udpgso_bench_tx.c |  22 +-
 tools/testing/selftests/net/xdp_dummy.c       |  13 ++
 16 files changed, 790 insertions(+), 113 deletions(-)
 create mode 100755 tools/testing/selftests/net/udpgro.sh
 create mode 100755 tools/testing/selftests/net/udpgro_bench.sh
 create mode 100644 tools/testing/selftests/net/xdp_dummy.c

-- 
2.17.2

^ permalink raw reply

* [RFC PATCH v2 01/10] udp: implement complete book-keeping for encap_needed
From: Paolo Abeni @ 2018-10-19 14:25 UTC (permalink / raw)
  To: netdev; +Cc: Willem de Bruijn, Steffen Klassert
In-Reply-To: <cover.1539957909.git.pabeni@redhat.com>

The *encap_needed static keys are enabled by UDP tunnels
and several UDP encapsulations type, but they are never
turned off. This can cause unneeded overall performance
degradation for systems where such features are used
transiently.

This patch introduces complete book-keeping for such keys,
decreasing the usage at socket destruction time, if needed,
and avoiding that the same socket could increase the key
usage multiple times.

Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 include/linux/udp.h      |  7 ++++++-
 include/net/udp_tunnel.h |  6 ++++++
 net/ipv4/udp.c           | 18 ++++++++++++------
 net/ipv6/udp.c           | 14 +++++++++-----
 4 files changed, 33 insertions(+), 12 deletions(-)

diff --git a/include/linux/udp.h b/include/linux/udp.h
index 320d49d85484..a4dafff407fb 100644
--- a/include/linux/udp.h
+++ b/include/linux/udp.h
@@ -49,7 +49,12 @@ struct udp_sock {
 	unsigned int	 corkflag;	/* Cork is required */
 	__u8		 encap_type;	/* Is this an Encapsulation socket? */
 	unsigned char	 no_check6_tx:1,/* Send zero UDP6 checksums on TX? */
-			 no_check6_rx:1;/* Allow zero UDP6 checksums on RX? */
+			 no_check6_rx:1,/* Allow zero UDP6 checksums on RX? */
+			 encap_enabled:1; /* This socket enabled encap
+					   * processing; UDP tunnels and
+					   * different encapsulation layer set
+					   * this
+					   */
 	/*
 	 * Following member retains the information to create a UDP header
 	 * when the socket is uncorked.
diff --git a/include/net/udp_tunnel.h b/include/net/udp_tunnel.h
index fe680ab6b15a..3fbe56430e3b 100644
--- a/include/net/udp_tunnel.h
+++ b/include/net/udp_tunnel.h
@@ -165,6 +165,12 @@ static inline int udp_tunnel_handle_offloads(struct sk_buff *skb, bool udp_csum)
 
 static inline void udp_tunnel_encap_enable(struct socket *sock)
 {
+	struct udp_sock *up = udp_sk(sock->sk);
+
+	if (up->encap_enabled)
+		return;
+
+	up->encap_enabled = 1;
 #if IS_ENABLED(CONFIG_IPV6)
 	if (sock->sk->sk_family == PF_INET6)
 		ipv6_stub->udpv6_encap_enable();
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index cf8252d05a01..9fcb5374e166 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -2382,11 +2382,15 @@ void udp_destroy_sock(struct sock *sk)
 	bool slow = lock_sock_fast(sk);
 	udp_flush_pending_frames(sk);
 	unlock_sock_fast(sk, slow);
-	if (static_branch_unlikely(&udp_encap_needed_key) && up->encap_type) {
-		void (*encap_destroy)(struct sock *sk);
-		encap_destroy = READ_ONCE(up->encap_destroy);
-		if (encap_destroy)
-			encap_destroy(sk);
+	if (static_branch_unlikely(&udp_encap_needed_key)) {
+		if (up->encap_type) {
+			void (*encap_destroy)(struct sock *sk);
+			encap_destroy = READ_ONCE(up->encap_destroy);
+			if (encap_destroy)
+				encap_destroy(sk);
+		}
+		if (up->encap_enabled)
+			static_branch_disable(&udp_encap_needed_key);
 	}
 }
 
@@ -2431,7 +2435,9 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname,
 			/* FALLTHROUGH */
 		case UDP_ENCAP_L2TPINUDP:
 			up->encap_type = val;
-			udp_encap_enable();
+			if (!up->encap_enabled)
+				udp_encap_enable();
+			up->encap_enabled = 1;
 			break;
 		default:
 			err = -ENOPROTOOPT;
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 374e7d302f26..8bb50ba32a6f 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -1460,11 +1460,15 @@ void udpv6_destroy_sock(struct sock *sk)
 	udp_v6_flush_pending_frames(sk);
 	release_sock(sk);
 
-	if (static_branch_unlikely(&udpv6_encap_needed_key) && up->encap_type) {
-		void (*encap_destroy)(struct sock *sk);
-		encap_destroy = READ_ONCE(up->encap_destroy);
-		if (encap_destroy)
-			encap_destroy(sk);
+	if (static_branch_unlikely(&udpv6_encap_needed_key)) {
+		if (up->encap_type) {
+			void (*encap_destroy)(struct sock *sk);
+			encap_destroy = READ_ONCE(up->encap_destroy);
+			if (encap_destroy)
+				encap_destroy(sk);
+		}
+		if (up->encap_enabled)
+			static_branch_disable(&udpv6_encap_needed_key);
 	}
 
 	inet6_destroy_sock(sk);
-- 
2.17.2

^ permalink raw reply related

* [RFC PATCH v2 03/10] udp: add support for UDP_GRO cmsg
From: Paolo Abeni @ 2018-10-19 14:25 UTC (permalink / raw)
  To: netdev; +Cc: Willem de Bruijn, Steffen Klassert
In-Reply-To: <cover.1539957909.git.pabeni@redhat.com>

When UDP GRO is enabled, the UDP_GRO cmsg will carry the ingress
datagram size. User-space can use such info to compute the original
packets layout.

Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
CHECK: should we use a separate setsockopt to explicitly enable
gso_size cmsg reception? So that user space can enable UDP_GRO and
fetch cmsg without forcefully receiving GRO related info.
---
 include/linux/udp.h | 11 +++++++++++
 net/ipv4/udp.c      |  4 ++++
 net/ipv6/udp.c      |  3 +++
 3 files changed, 18 insertions(+)

diff --git a/include/linux/udp.h b/include/linux/udp.h
index f613b329852e..e23d5024f42f 100644
--- a/include/linux/udp.h
+++ b/include/linux/udp.h
@@ -121,6 +121,17 @@ static inline bool udp_get_no_check6_rx(struct sock *sk)
 	return udp_sk(sk)->no_check6_rx;
 }
 
+static inline void udp_cmsg_recv(struct msghdr *msg, struct sock *sk,
+				 struct sk_buff *skb)
+{
+	int gso_size;
+
+	if (skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4) {
+		gso_size = skb_shinfo(skb)->gso_size;
+		put_cmsg(msg, SOL_UDP, UDP_GRO, sizeof(gso_size), &gso_size);
+	}
+}
+
 #define udp_portaddr_for_each_entry(__sk, list) \
 	hlist_for_each_entry(__sk, list, __sk_common.skc_portaddr_node)
 
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 3c277378814f..2331ac9de954 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -1714,6 +1714,10 @@ int udp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int noblock,
 		memset(sin->sin_zero, 0, sizeof(sin->sin_zero));
 		*addr_len = sizeof(*sin);
 	}
+
+	if (udp_sk(sk)->gro_enabled)
+		udp_cmsg_recv(msg, sk, skb);
+
 	if (inet->cmsg_flags)
 		ip_cmsg_recv_offset(msg, sk, skb, sizeof(struct udphdr), off);
 
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 8bb50ba32a6f..05f723b98cab 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -421,6 +421,9 @@ int udpv6_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
 		*addr_len = sizeof(*sin6);
 	}
 
+	if (udp_sk(sk)->gro_enabled)
+		udp_cmsg_recv(msg, sk, skb);
+
 	if (np->rxopt.all)
 		ip6_datagram_recv_common_ctl(sk, msg, skb);
 
-- 
2.17.2

^ permalink raw reply related

* [RFC PATCH v2 02/10] udp: implement GRO for plain UDP sockets.
From: Paolo Abeni @ 2018-10-19 14:25 UTC (permalink / raw)
  To: netdev; +Cc: Willem de Bruijn, Steffen Klassert
In-Reply-To: <cover.1539957909.git.pabeni@redhat.com>

This is the RX counterpart of commit bec1f6f69736 ("udp: generate gso
with UDP_SEGMENT"). When UDP_GRO is enabled, such socket is also
eligible for GRO in the rx path: UDP segments directed to such socket
are assembled into a larger GSO_UDP_L4 packet.

The core UDP GRO support is enabled with setsockopt(UDP_GRO).

Initial benchmark numbers:

Before:
udp rx:   1079 MB/s   769065 calls/s

After:
udp rx:   1466 MB/s    24877 calls/s

This change introduces a side effect in respect to UDP tunnels:
after a UDP tunnel creation, now the kernel performs a lookup per ingress
UDP packet, while before such lookup happened only if the ingress packet
carried a valid internal header csum.

v1 -> v2:
 - use a new option to enable UDP GRO
 - use static keys to protect the UDP GRO socket lookup

Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 include/linux/udp.h      |   3 +-
 include/uapi/linux/udp.h |   1 +
 net/ipv4/udp.c           |   7 +++
 net/ipv4/udp_offload.c   | 109 +++++++++++++++++++++++++++++++--------
 net/ipv6/udp_offload.c   |   6 +--
 5 files changed, 98 insertions(+), 28 deletions(-)

diff --git a/include/linux/udp.h b/include/linux/udp.h
index a4dafff407fb..f613b329852e 100644
--- a/include/linux/udp.h
+++ b/include/linux/udp.h
@@ -50,11 +50,12 @@ struct udp_sock {
 	__u8		 encap_type;	/* Is this an Encapsulation socket? */
 	unsigned char	 no_check6_tx:1,/* Send zero UDP6 checksums on TX? */
 			 no_check6_rx:1,/* Allow zero UDP6 checksums on RX? */
-			 encap_enabled:1; /* This socket enabled encap
+			 encap_enabled:1, /* This socket enabled encap
 					   * processing; UDP tunnels and
 					   * different encapsulation layer set
 					   * this
 					   */
+			 gro_enabled:1;	/* Can accept GRO packets */
 	/*
 	 * Following member retains the information to create a UDP header
 	 * when the socket is uncorked.
diff --git a/include/uapi/linux/udp.h b/include/uapi/linux/udp.h
index 09502de447f5..30baccb6c9c4 100644
--- a/include/uapi/linux/udp.h
+++ b/include/uapi/linux/udp.h
@@ -33,6 +33,7 @@ struct udphdr {
 #define UDP_NO_CHECK6_TX 101	/* Disable sending checksum for UDP6X */
 #define UDP_NO_CHECK6_RX 102	/* Disable accpeting checksum for UDP6 */
 #define UDP_SEGMENT	103	/* Set GSO segmentation size */
+#define UDP_GRO		104	/* This socket can receive UDP GRO packets */
 
 /* UDP encapsulation types */
 #define UDP_ENCAP_ESPINUDP_NON_IKE	1 /* draft-ietf-ipsec-nat-t-ike-00/01 */
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 9fcb5374e166..3c277378814f 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -115,6 +115,7 @@
 #include "udp_impl.h"
 #include <net/sock_reuseport.h>
 #include <net/addrconf.h>
+#include <net/udp_tunnel.h>
 
 struct udp_table udp_table __read_mostly;
 EXPORT_SYMBOL(udp_table);
@@ -2459,6 +2460,12 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname,
 		up->gso_size = val;
 		break;
 
+	case UDP_GRO:
+		if (valbool)
+			udp_tunnel_encap_enable(sk->sk_socket);
+		up->gro_enabled = valbool;
+		break;
+
 	/*
 	 * 	UDP-Lite's partial checksum coverage (RFC 3828).
 	 */
diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c
index 802f2bc00d69..d93c1e8097ba 100644
--- a/net/ipv4/udp_offload.c
+++ b/net/ipv4/udp_offload.c
@@ -343,6 +343,54 @@ static struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb,
 	return segs;
 }
 
+#define UDO_GRO_CNT_MAX 64
+static struct sk_buff *udp_gro_receive_segment(struct list_head *head,
+					       struct sk_buff *skb)
+{
+	struct udphdr *uh = udp_hdr(skb);
+	struct sk_buff *pp = NULL;
+	struct udphdr *uh2;
+	struct sk_buff *p;
+
+	/* requires non zero csum, for simmetry with GSO */
+	if (!uh->check) {
+		NAPI_GRO_CB(skb)->flush = 1;
+		return NULL;
+	}
+
+	/* pull encapsulating udp header */
+	skb_gro_pull(skb, sizeof(struct udphdr));
+	skb_gro_postpull_rcsum(skb, uh, sizeof(struct udphdr));
+
+	list_for_each_entry(p, head, list) {
+		if (!NAPI_GRO_CB(p)->same_flow)
+			continue;
+
+		uh2 = udp_hdr(p);
+
+		/* Match ports only, as csum is always non zero */
+		if ((*(u32 *)&uh->source != *(u32 *)&uh2->source)) {
+			NAPI_GRO_CB(p)->same_flow = 0;
+			continue;
+		}
+
+		/* Terminate the flow on len mismatch or if it grow "too much".
+		 * Under small packet flood GRO count could elsewhere grow a lot
+		 * leading to execessive truesize values
+		 */
+		if (!skb_gro_receive(p, skb) &&
+		    NAPI_GRO_CB(p)->count > UDO_GRO_CNT_MAX)
+			pp = p;
+		else if (uh->len != uh2->len)
+			pp = p;
+
+		return pp;
+	}
+
+	/* mismatch, but we never need to flush */
+	return NULL;
+}
+
 struct sk_buff *udp_gro_receive(struct list_head *head, struct sk_buff *skb,
 				struct udphdr *uh, udp_lookup_t lookup)
 {
@@ -353,23 +401,27 @@ struct sk_buff *udp_gro_receive(struct list_head *head, struct sk_buff *skb,
 	int flush = 1;
 	struct sock *sk;
 
+	rcu_read_lock();
+	sk = (*lookup)(skb, uh->source, uh->dest);
+	if (!sk)
+		goto out_unlock;
+
+	if (udp_sk(sk)->gro_enabled) {
+		pp = call_gro_receive(udp_gro_receive_segment, head, skb);
+		rcu_read_unlock();
+		return pp;
+	}
+
 	if (NAPI_GRO_CB(skb)->encap_mark ||
 	    (skb->ip_summed != CHECKSUM_PARTIAL &&
 	     NAPI_GRO_CB(skb)->csum_cnt == 0 &&
-	     !NAPI_GRO_CB(skb)->csum_valid))
-		goto out;
+	     !NAPI_GRO_CB(skb)->csum_valid) ||
+	    !udp_sk(sk)->gro_receive)
+		goto out_unlock;
 
 	/* mark that this skb passed once through the tunnel gro layer */
 	NAPI_GRO_CB(skb)->encap_mark = 1;
 
-	rcu_read_lock();
-	sk = (*lookup)(skb, uh->source, uh->dest);
-
-	if (sk && udp_sk(sk)->gro_receive)
-		goto unflush;
-	goto out_unlock;
-
-unflush:
 	flush = 0;
 
 	list_for_each_entry(p, head, list) {
@@ -394,7 +446,6 @@ struct sk_buff *udp_gro_receive(struct list_head *head, struct sk_buff *skb,
 
 out_unlock:
 	rcu_read_unlock();
-out:
 	skb_gro_flush_final(skb, pp, flush);
 	return pp;
 }
@@ -427,6 +478,19 @@ static struct sk_buff *udp4_gro_receive(struct list_head *head,
 	return NULL;
 }
 
+static int udp_gro_complete_segment(struct sk_buff *skb)
+{
+	struct udphdr *uh = udp_hdr(skb);
+
+	skb->csum_start = (unsigned char *)uh - skb->head;
+	skb->csum_offset = offsetof(struct udphdr, check);
+	skb->ip_summed = CHECKSUM_PARTIAL;
+
+	skb_shinfo(skb)->gso_segs = NAPI_GRO_CB(skb)->count;
+	skb_shinfo(skb)->gso_type |= SKB_GSO_UDP_L4;
+	return 0;
+}
+
 int udp_gro_complete(struct sk_buff *skb, int nhoff,
 		     udp_lookup_t lookup)
 {
@@ -437,16 +501,21 @@ int udp_gro_complete(struct sk_buff *skb, int nhoff,
 
 	uh->len = newlen;
 
-	/* Set encapsulation before calling into inner gro_complete() functions
-	 * to make them set up the inner offsets.
-	 */
-	skb->encapsulation = 1;
-
 	rcu_read_lock();
 	sk = (*lookup)(skb, uh->source, uh->dest);
-	if (sk && udp_sk(sk)->gro_complete)
+	if (sk && udp_sk(sk)->gro_enabled) {
+		err = udp_gro_complete_segment(skb);
+	} else if (sk && udp_sk(sk)->gro_complete) {
+		skb_shinfo(skb)->gso_type = uh->check ? SKB_GSO_UDP_TUNNEL_CSUM
+					: SKB_GSO_UDP_TUNNEL;
+
+		/* Set encapsulation before calling into inner gro_complete()
+		 * functions to make them set up the inner offsets.
+		 */
+		skb->encapsulation = 1;
 		err = udp_sk(sk)->gro_complete(sk, skb,
 				nhoff + sizeof(struct udphdr));
+	}
 	rcu_read_unlock();
 
 	if (skb->remcsum_offload)
@@ -461,13 +530,9 @@ static int udp4_gro_complete(struct sk_buff *skb, int nhoff)
 	const struct iphdr *iph = ip_hdr(skb);
 	struct udphdr *uh = (struct udphdr *)(skb->data + nhoff);
 
-	if (uh->check) {
-		skb_shinfo(skb)->gso_type |= SKB_GSO_UDP_TUNNEL_CSUM;
+	if (uh->check)
 		uh->check = ~udp_v4_check(skb->len - nhoff, iph->saddr,
 					  iph->daddr, 0);
-	} else {
-		skb_shinfo(skb)->gso_type |= SKB_GSO_UDP_TUNNEL;
-	}
 
 	return udp_gro_complete(skb, nhoff, udp4_lib_lookup_skb);
 }
diff --git a/net/ipv6/udp_offload.c b/net/ipv6/udp_offload.c
index 1b8e161ac527..828b2457f97b 100644
--- a/net/ipv6/udp_offload.c
+++ b/net/ipv6/udp_offload.c
@@ -147,13 +147,9 @@ static int udp6_gro_complete(struct sk_buff *skb, int nhoff)
 	const struct ipv6hdr *ipv6h = ipv6_hdr(skb);
 	struct udphdr *uh = (struct udphdr *)(skb->data + nhoff);
 
-	if (uh->check) {
-		skb_shinfo(skb)->gso_type |= SKB_GSO_UDP_TUNNEL_CSUM;
+	if (uh->check)
 		uh->check = ~udp_v6_check(skb->len - nhoff, &ipv6h->saddr,
 					  &ipv6h->daddr, 0);
-	} else {
-		skb_shinfo(skb)->gso_type |= SKB_GSO_UDP_TUNNEL;
-	}
 
 	return udp_gro_complete(skb, nhoff, udp6_lib_lookup_skb);
 }
-- 
2.17.2

^ permalink raw reply related

* [RFC PATCH v2 04/10] ip: factor out protocol delivery helper
From: Paolo Abeni @ 2018-10-19 14:25 UTC (permalink / raw)
  To: netdev; +Cc: Willem de Bruijn, Steffen Klassert
In-Reply-To: <cover.1539957909.git.pabeni@redhat.com>

So that we can re-use it at the UDP lavel in a later patch

Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 net/ipv4/ip_input.c | 73 ++++++++++++++++++++++-----------------------
 1 file changed, 36 insertions(+), 37 deletions(-)

diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index 35a786c0aaa0..72250b4e466d 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -188,51 +188,50 @@ bool ip_call_ra_chain(struct sk_buff *skb)
 	return false;
 }
 
-static int ip_local_deliver_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
+void ip_protocol_deliver_rcu(struct net *net, struct sk_buff *skb, int protocol)
 {
-	__skb_pull(skb, skb_network_header_len(skb));
-
-	rcu_read_lock();
-	{
-		int protocol = ip_hdr(skb)->protocol;
-		const struct net_protocol *ipprot;
-		int raw;
+	const struct net_protocol *ipprot;
+	int raw, ret;
 
-	resubmit:
-		raw = raw_local_deliver(skb, protocol);
+resubmit:
+	raw = raw_local_deliver(skb, protocol);
 
-		ipprot = rcu_dereference(inet_protos[protocol]);
-		if (ipprot) {
-			int ret;
-
-			if (!ipprot->no_policy) {
-				if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
-					kfree_skb(skb);
-					goto out;
-				}
-				nf_reset(skb);
+	ipprot = rcu_dereference(inet_protos[protocol]);
+	if (ipprot) {
+		if (!ipprot->no_policy) {
+			if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
+				kfree_skb(skb);
+				return;
 			}
-			ret = ipprot->handler(skb);
-			if (ret < 0) {
-				protocol = -ret;
-				goto resubmit;
+			nf_reset(skb);
+		}
+		ret = ipprot->handler(skb);
+		if (ret < 0) {
+			protocol = -ret;
+			goto resubmit;
+		}
+		__IP_INC_STATS(net, IPSTATS_MIB_INDELIVERS);
+	} else {
+		if (!raw) {
+			if (xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
+				__IP_INC_STATS(net, IPSTATS_MIB_INUNKNOWNPROTOS);
+				icmp_send(skb, ICMP_DEST_UNREACH,
+					  ICMP_PROT_UNREACH, 0);
 			}
-			__IP_INC_STATS(net, IPSTATS_MIB_INDELIVERS);
+			kfree_skb(skb);
 		} else {
-			if (!raw) {
-				if (xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
-					__IP_INC_STATS(net, IPSTATS_MIB_INUNKNOWNPROTOS);
-					icmp_send(skb, ICMP_DEST_UNREACH,
-						  ICMP_PROT_UNREACH, 0);
-				}
-				kfree_skb(skb);
-			} else {
-				__IP_INC_STATS(net, IPSTATS_MIB_INDELIVERS);
-				consume_skb(skb);
-			}
+			__IP_INC_STATS(net, IPSTATS_MIB_INDELIVERS);
+			consume_skb(skb);
 		}
 	}
- out:
+}
+
+static int ip_local_deliver_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
+{
+	__skb_pull(skb, skb_network_header_len(skb));
+
+	rcu_read_lock();
+	ip_protocol_deliver_rcu(net, skb, ip_hdr(skb)->protocol);
 	rcu_read_unlock();
 
 	return 0;
-- 
2.17.2

^ permalink raw reply related

* [RFC PATCH v2 05/10] ipv6: factor out protocol delivery helper
From: Paolo Abeni @ 2018-10-19 14:25 UTC (permalink / raw)
  To: netdev; +Cc: Willem de Bruijn, Steffen Klassert
In-Reply-To: <cover.1539957909.git.pabeni@redhat.com>

So that we can re-use it at the UDP lavel in the next patch

Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 net/ipv6/ip6_input.c | 28 ++++++++++++++++------------
 1 file changed, 16 insertions(+), 12 deletions(-)

diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c
index 96577e742afd..3065226bdc57 100644
--- a/net/ipv6/ip6_input.c
+++ b/net/ipv6/ip6_input.c
@@ -319,28 +319,26 @@ void ipv6_list_rcv(struct list_head *head, struct packet_type *pt,
 /*
  *	Deliver the packet to the host
  */
-
-
-static int ip6_input_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
+void ip6_protocol_deliver_rcu(struct net *net, struct sk_buff *skb, int nexthdr,
+			      bool have_final)
 {
 	const struct inet6_protocol *ipprot;
 	struct inet6_dev *idev;
 	unsigned int nhoff;
-	int nexthdr;
 	bool raw;
-	bool have_final = false;
 
 	/*
 	 *	Parse extension headers
 	 */
 
-	rcu_read_lock();
 resubmit:
 	idev = ip6_dst_idev(skb_dst(skb));
-	if (!pskb_pull(skb, skb_transport_offset(skb)))
-		goto discard;
 	nhoff = IP6CB(skb)->nhoff;
-	nexthdr = skb_network_header(skb)[nhoff];
+	if (!have_final) {
+		if (!pskb_pull(skb, skb_transport_offset(skb)))
+			goto discard;
+		nexthdr = skb_network_header(skb)[nhoff];
+	}
 
 resubmit_final:
 	raw = raw6_local_deliver(skb, nexthdr);
@@ -411,13 +409,19 @@ static int ip6_input_finish(struct net *net, struct sock *sk, struct sk_buff *sk
 			consume_skb(skb);
 		}
 	}
-	rcu_read_unlock();
-	return 0;
+	return;
 
 discard:
 	__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
-	rcu_read_unlock();
 	kfree_skb(skb);
+}
+
+static int ip6_input_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
+{
+	rcu_read_lock();
+	ip6_protocol_deliver_rcu(net, skb, 0, false);
+	rcu_read_unlock();
+
 	return 0;
 }
 
-- 
2.17.2

^ permalink raw reply related

* [RFC PATCH v2 06/10] udp: cope with UDP GRO packet misdirection
From: Paolo Abeni @ 2018-10-19 14:25 UTC (permalink / raw)
  To: netdev; +Cc: Willem de Bruijn, Steffen Klassert
In-Reply-To: <cover.1539957909.git.pabeni@redhat.com>

In some scenarios, the GRO engine can assemble an UDP GRO packet
that ultimately lands on a non GRO-enabled socket.
This patch tries to address the issue explicitly checking for the UDP
socket features before enqueuing the packet, and eventually segmenting
the unexpected GRO packet, as needed.

We must also cope with re-insertion requests: after segmentation the
UDP code calls the helper introduced by the previous patches, as needed.

Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 include/linux/udp.h | 23 +++++++++++++++++++++++
 net/ipv4/udp.c      | 25 ++++++++++++++++++++++++-
 net/ipv6/udp.c      | 27 ++++++++++++++++++++++++++-
 3 files changed, 73 insertions(+), 2 deletions(-)

diff --git a/include/linux/udp.h b/include/linux/udp.h
index e23d5024f42f..19bcb396cd1b 100644
--- a/include/linux/udp.h
+++ b/include/linux/udp.h
@@ -132,6 +132,29 @@ static inline void udp_cmsg_recv(struct msghdr *msg, struct sock *sk,
 	}
 }
 
+static inline bool udp_unexpected_gso(struct sock *sk, struct sk_buff *skb)
+{
+	return !udp_sk(sk)->gro_enabled && skb_is_gso(skb) &&
+	       skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4;
+}
+
+static inline struct sk_buff *udp_rcv_segment(struct sock *sk,
+					      struct sk_buff *skb)
+{
+	struct sk_buff *segs;
+
+	/* the GSO CB lays after the UDP one, no need to save and restore any
+	 * CB fragment, just initialize it
+	 */
+	segs = __skb_gso_segment(skb, NETIF_F_SG, false);
+	if (unlikely(IS_ERR(segs)))
+		kfree_skb(skb);
+	else if (segs)
+		consume_skb(skb);
+	return segs;
+}
+
+
 #define udp_portaddr_for_each_entry(__sk, list) \
 	hlist_for_each_entry(__sk, list, __sk_common.skc_portaddr_node)
 
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 2331ac9de954..0d55145ce9f5 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -1909,7 +1909,7 @@ EXPORT_SYMBOL(udp_encap_enable);
  * Note that in the success and error cases, the skb is assumed to
  * have either been requeued or freed.
  */
-static int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
+static int udp_queue_rcv_one_skb(struct sock *sk, struct sk_buff *skb)
 {
 	struct udp_sock *up = udp_sk(sk);
 	int is_udplite = IS_UDPLITE(sk);
@@ -2012,6 +2012,29 @@ static int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 	return -1;
 }
 
+void ip_protocol_deliver_rcu(struct net *net, struct sk_buff *skb, int proto);
+
+static int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
+{
+	struct sk_buff *next, *segs;
+	int ret;
+
+	if (likely(!udp_unexpected_gso(sk, skb)))
+		return udp_queue_rcv_one_skb(sk, skb);
+
+	BUILD_BUG_ON(sizeof(struct udp_skb_cb) > SKB_SGO_CB_OFFSET);
+	__skb_push(skb, -skb_mac_offset(skb));
+	segs = udp_rcv_segment(sk, skb);
+	for (skb = segs; skb; skb = next) {
+		next = skb->next;
+		__skb_pull(skb, skb_transport_offset(skb));
+		ret = udp_queue_rcv_one_skb(sk, skb);
+		if (ret > 0)
+			ip_protocol_deliver_rcu(dev_net(skb->dev), skb, -ret);
+	}
+	return 0;
+}
+
 /* For TCP sockets, sk_rx_dst is protected by socket lock
  * For UDP, we use xchg() to guard against concurrent changes.
  */
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 05f723b98cab..d892c064657c 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -558,7 +558,7 @@ void udpv6_encap_enable(void)
 }
 EXPORT_SYMBOL(udpv6_encap_enable);
 
-static int udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
+static int udpv6_queue_rcv_one_skb(struct sock *sk, struct sk_buff *skb)
 {
 	struct udp_sock *up = udp_sk(sk);
 	int is_udplite = IS_UDPLITE(sk);
@@ -641,6 +641,31 @@ static int udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 	return -1;
 }
 
+void ip6_protocol_deliver_rcu(struct net *net, struct sk_buff *skb, int nexthdr,
+			      bool have_final);
+
+static int udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
+{
+	struct sk_buff *next, *segs;
+	int ret;
+
+	if (likely(!udp_unexpected_gso(sk, skb)))
+		return udpv6_queue_rcv_one_skb(sk, skb);
+
+	__skb_push(skb, -skb_mac_offset(skb));
+	segs = udp_rcv_segment(sk, skb);
+	for (skb = segs; skb; skb = next) {
+		next = skb->next;
+		__skb_pull(skb, skb_transport_offset(skb));
+
+		ret = udpv6_queue_rcv_one_skb(sk, skb);
+		if (ret > 0)
+			ip6_protocol_deliver_rcu(dev_net(skb->dev), skb, ret,
+						 true);
+	}
+	return 0;
+}
+
 static bool __udp_v6_is_mcast_sock(struct net *net, struct sock *sk,
 				   __be16 loc_port, const struct in6_addr *loc_addr,
 				   __be16 rmt_port, const struct in6_addr *rmt_addr,
-- 
2.17.2

^ permalink raw reply related

* [RFC PATCH v2 07/10] selftests: add GRO support to udp bench rx program
From: Paolo Abeni @ 2018-10-19 14:25 UTC (permalink / raw)
  To: netdev; +Cc: Willem de Bruijn, Steffen Klassert
In-Reply-To: <cover.1539957909.git.pabeni@redhat.com>

And fix a couple of buglets (port option processing,
clean termination on SIGINT). This is preparatory work
for GRO tests.

Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 tools/testing/selftests/net/udpgso_bench_rx.c | 37 +++++++++++++++----
 1 file changed, 30 insertions(+), 7 deletions(-)

diff --git a/tools/testing/selftests/net/udpgso_bench_rx.c b/tools/testing/selftests/net/udpgso_bench_rx.c
index 727cf67a3f75..c55acdd1e27b 100644
--- a/tools/testing/selftests/net/udpgso_bench_rx.c
+++ b/tools/testing/selftests/net/udpgso_bench_rx.c
@@ -31,9 +31,15 @@
 #include <sys/wait.h>
 #include <unistd.h>
 
+#ifndef UDP_GRO
+#define UDP_GRO		104
+#endif
+
 static int  cfg_port		= 8000;
 static bool cfg_tcp;
 static bool cfg_verify;
+static bool cfg_read_all;
+static bool cfg_gro_segment;
 
 static bool interrupted;
 static unsigned long packets, bytes;
@@ -63,6 +69,8 @@ static void do_poll(int fd)
 
 	do {
 		ret = poll(&pfd, 1, 10);
+		if (interrupted)
+			break;
 		if (ret == -1)
 			error(1, errno, "poll");
 		if (ret == 0)
@@ -70,7 +78,7 @@ static void do_poll(int fd)
 		if (pfd.revents != POLLIN)
 			error(1, errno, "poll: 0x%x expected 0x%x\n",
 					pfd.revents, POLLIN);
-	} while (!ret && !interrupted);
+	} while (!ret);
 }
 
 static int do_socket(bool do_tcp)
@@ -102,6 +110,8 @@ static int do_socket(bool do_tcp)
 			error(1, errno, "listen");
 
 		do_poll(accept_fd);
+		if (interrupted)
+			exit(0);
 
 		fd = accept(accept_fd, NULL, NULL);
 		if (fd == -1)
@@ -167,10 +177,10 @@ static void do_verify_udp(const char *data, int len)
 /* Flush all outstanding datagrams. Verify first few bytes of each. */
 static void do_flush_udp(int fd)
 {
-	static char rbuf[ETH_DATA_LEN];
+	static char rbuf[65535];
 	int ret, len, budget = 256;
 
-	len = cfg_verify ? sizeof(rbuf) : 0;
+	len = cfg_read_all ? sizeof(rbuf) : 0;
 	while (budget--) {
 		/* MSG_TRUNC will make return value full datagram length */
 		ret = recv(fd, rbuf, len, MSG_TRUNC | MSG_DONTWAIT);
@@ -178,7 +188,7 @@ static void do_flush_udp(int fd)
 			return;
 		if (ret == -1)
 			error(1, errno, "recv");
-		if (len) {
+		if (len && cfg_verify) {
 			if (ret == 0)
 				error(1, errno, "recv: 0 byte datagram\n");
 
@@ -192,23 +202,30 @@ static void do_flush_udp(int fd)
 
 static void usage(const char *filepath)
 {
-	error(1, 0, "Usage: %s [-tv] [-p port]", filepath);
+	error(1, 0, "Usage: %s [-Grtv] [-p port]", filepath);
 }
 
 static void parse_opts(int argc, char **argv)
 {
 	int c;
 
-	while ((c = getopt(argc, argv, "ptv")) != -1) {
+	while ((c = getopt(argc, argv, "Gp:rtvx:")) != -1) {
 		switch (c) {
+		case 'G':
+			cfg_gro_segment = true;
+			break;
 		case 'p':
-			cfg_port = htons(strtoul(optarg, NULL, 0));
+			cfg_port = strtoul(optarg, NULL, 0);
+			break;
+		case 'r':
+			cfg_read_all = true;
 			break;
 		case 't':
 			cfg_tcp = true;
 			break;
 		case 'v':
 			cfg_verify = true;
+			cfg_read_all = true;
 			break;
 		}
 	}
@@ -227,6 +244,12 @@ static void do_recv(void)
 
 	fd = do_socket(cfg_tcp);
 
+	if (cfg_gro_segment && !cfg_tcp) {
+		int val = 1;
+		if (setsockopt(fd, IPPROTO_UDP, UDP_GRO, &val, sizeof(val)))
+			error(1, errno, "setsockopt UDP_GRO");
+	}
+
 	treport = gettimeofday_ms() + 1000;
 	do {
 		do_poll(fd);
-- 
2.17.2

^ permalink raw reply related

* [RFC PATCH v2 08/10] selftests: conditionally enable XDP support in udpgso_bench_rx
From: Paolo Abeni @ 2018-10-19 14:25 UTC (permalink / raw)
  To: netdev; +Cc: Willem de Bruijn, Steffen Klassert
In-Reply-To: <cover.1539957909.git.pabeni@redhat.com>

XDP support will be used by a later patch to test the GRO path
in a net namespace, leveraging the veth XDP implementation.
To avoid breaking existing setup, XDP support is conditionally
enabled and build only if llc is locally available.

Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 tools/testing/selftests/net/Makefile          | 69 +++++++++++++++++++
 tools/testing/selftests/net/udpgso_bench_rx.c | 37 ++++++++++
 tools/testing/selftests/net/xdp_dummy.c       | 13 ++++
 3 files changed, 119 insertions(+)
 create mode 100644 tools/testing/selftests/net/xdp_dummy.c

diff --git a/tools/testing/selftests/net/Makefile b/tools/testing/selftests/net/Makefile
index 256d82d5fa87..176459b7c4d6 100644
--- a/tools/testing/selftests/net/Makefile
+++ b/tools/testing/selftests/net/Makefile
@@ -16,8 +16,77 @@ TEST_GEN_PROGS = reuseport_bpf reuseport_bpf_cpu reuseport_bpf_numa
 TEST_GEN_PROGS += reuseport_dualstack reuseaddr_conflict tls
 
 KSFT_KHDR_INSTALL := 1
+
+# Allows pointing LLC/CLANG to a LLVM backend with bpf support, redefine on cmdline:
+#  make samples/bpf/ LLC=~/git/llvm/build/bin/llc CLANG=~/git/llvm/build/bin/clang
+LLC ?= llc
+CLANG ?= clang
+LLVM_OBJCOPY ?= llvm-objcopy
+BTF_PAHOLE ?= pahole
+HAS_LLC := $(shell which $(LLC) 2>/dev/null)
+
+# conditional enable testes requiring llc
+ifneq (, $(HAS_LLC))
+TEST_GEN_FILES += xdp_dummy.o
+endif
+
 include ../lib.mk
 
+ifneq (, $(HAS_LLC))
+
+# Detect that we're cross compiling and use the cross compiler
+ifdef CROSS_COMPILE
+CLANG_ARCH_ARGS = -target $(ARCH)
+endif
+
+PROBE := $(shell $(LLC) -march=bpf -mcpu=probe -filetype=null /dev/null 2>&1)
+
+# Let newer LLVM versions transparently probe the kernel for availability
+# of full BPF instruction set.
+ifeq ($(PROBE),)
+  CPU ?= probe
+else
+  CPU ?= generic
+endif
+
+SRC_PATH := $(abspath ../../../..)
+LIB_PATH := $(SRC_PATH)/tools/lib
+XDP_CFLAGS := -D SUPPORT_XDP=1 -I$(LIB_PATH)
+LIBBPF = $(LIB_PATH)/bpf/libbpf.a
+BTF_LLC_PROBE := $(shell $(LLC) -march=bpf -mattr=help 2>&1 | grep dwarfris)
+BTF_PAHOLE_PROBE := $(shell $(BTF_PAHOLE) --help 2>&1 | grep BTF)
+BTF_OBJCOPY_PROBE := $(shell $(LLVM_OBJCOPY) --help 2>&1 | grep -i 'usage.*llvm')
+CLANG_SYS_INCLUDES := $(shell $(CLANG) -v -E - </dev/null 2>&1 \
+        | sed -n '/<...> search starts here:/,/End of search list./{ s| \(/.*\)|-idirafter \1|p }')
+CLANG_FLAGS = -I. -I$(SRC_PATH)/include -I../bpf/ \
+	      $(CLANG_SYS_INCLUDES) -Wno-compare-distinct-pointer-types
+
+ifneq ($(and $(BTF_LLC_PROBE),$(BTF_PAHOLE_PROBE),$(BTF_OBJCOPY_PROBE)),)
+	CLANG_CFLAGS += -g
+	LLC_FLAGS += -mattr=dwarfris
+	DWARF2BTF = y
+endif
+
+$(LIBBPF): FORCE
+# Fix up variables inherited from Kbuild that tools/ build system won't like
+	$(MAKE) -C $(dir $@) RM='rm -rf' LDFLAGS= srctree=$(SRC_PATH) O= $(nodir $@)
+
+$(OUTPUT)/udpgso_bench_rx: $(OUTPUT)/udpgso_bench_rx.c $(LIBBPF)
+	$(CC) -o $@ $(XDP_CFLAGS) $(CFLAGS) $(LOADLIBES) $(LDLIBS) $^ -lelf
+
+FORCE:
+
+# bpf program[s] generation
+$(OUTPUT)/%.o: %.c
+	$(CLANG) $(CLANG_FLAGS) \
+		 -O2 -target bpf -emit-llvm -c $< -o - |      \
+	$(LLC) -march=bpf -mcpu=$(CPU) $(LLC_FLAGS) -filetype=obj -o $@
+ifeq ($(DWARF2BTF),y)
+	$(BTF_PAHOLE) -J $@
+endif
+
+endif
+
 $(OUTPUT)/reuseport_bpf_numa: LDFLAGS += -lnuma
 $(OUTPUT)/tcp_mmap: LDFLAGS += -lpthread
 $(OUTPUT)/tcp_inq: LDFLAGS += -lpthread
diff --git a/tools/testing/selftests/net/udpgso_bench_rx.c b/tools/testing/selftests/net/udpgso_bench_rx.c
index c55acdd1e27b..84f101852805 100644
--- a/tools/testing/selftests/net/udpgso_bench_rx.c
+++ b/tools/testing/selftests/net/udpgso_bench_rx.c
@@ -31,6 +31,10 @@
 #include <sys/wait.h>
 #include <unistd.h>
 
+#ifdef SUPPORT_XDP
+#include "bpf/libbpf.h"
+#endif
+
 #ifndef UDP_GRO
 #define UDP_GRO		104
 #endif
@@ -40,6 +44,9 @@ static bool cfg_tcp;
 static bool cfg_verify;
 static bool cfg_read_all;
 static bool cfg_gro_segment;
+#ifdef SUPPORT_XDP
+static int cfg_xdp_iface;
+#endif
 
 static bool interrupted;
 static unsigned long packets, bytes;
@@ -227,6 +234,13 @@ static void parse_opts(int argc, char **argv)
 			cfg_verify = true;
 			cfg_read_all = true;
 			break;
+#ifdef SUPPORT_XDP
+		case 'x':
+			cfg_xdp_iface = if_nametoindex(optarg);
+			if (!cfg_xdp_iface)
+				error(1, errno, "unknown interface %s", optarg);
+			break;
+#endif
 		}
 	}
 
@@ -240,6 +254,9 @@ static void parse_opts(int argc, char **argv)
 static void do_recv(void)
 {
 	unsigned long tnow, treport;
+#ifdef SUPPORT_XDP
+	int prog_fd = -1;
+#endif
 	int fd;
 
 	fd = do_socket(cfg_tcp);
@@ -250,6 +267,22 @@ static void do_recv(void)
 			error(1, errno, "setsockopt UDP_GRO");
 	}
 
+#ifdef SUPPORT_XDP
+	if (cfg_xdp_iface) {
+		struct bpf_prog_load_attr prog_load_attr = {
+			.prog_type	= BPF_PROG_TYPE_XDP,
+			.file 		= "xdp_dummy.o",
+		};
+		struct bpf_object *obj;
+
+		if (bpf_prog_load_xattr(&prog_load_attr, &obj, &prog_fd))
+			error(1, errno, "xdp program load failed\n");
+
+		if (bpf_set_link_xdp_fd(cfg_xdp_iface, prog_fd, 0) < 0)
+			error(1, errno, "link set xdp fd failed\n");
+	}
+#endif
+
 	treport = gettimeofday_ms() + 1000;
 	do {
 		do_poll(fd);
@@ -274,6 +307,10 @@ static void do_recv(void)
 
 	if (close(fd))
 		error(1, errno, "close");
+#ifdef SUPPORT_XDP
+	if (cfg_xdp_iface && bpf_set_link_xdp_fd(cfg_xdp_iface, -1, 0))
+		error(1, errno, "removing xdp program");
+#endif
 }
 
 int main(int argc, char **argv)
diff --git a/tools/testing/selftests/net/xdp_dummy.c b/tools/testing/selftests/net/xdp_dummy.c
new file mode 100644
index 000000000000..1a64cf5099ed
--- /dev/null
+++ b/tools/testing/selftests/net/xdp_dummy.c
@@ -0,0 +1,13 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#define KBUILD_MODNAME "xdp_dummy"
+#include <uapi/linux/bpf.h>
+#include "bpf_helpers.h"
+
+SEC("xdp_dummy")
+int xdp_dummy_prog(struct xdp_md *ctx)
+{
+	return XDP_PASS;
+}
+
+char _license[] SEC("license") = "GPL";
-- 
2.17.2

^ permalink raw reply related

* [RFC PATCH v2 09/10] selftests: add some benchmark for UDP GRO
From: Paolo Abeni @ 2018-10-19 14:25 UTC (permalink / raw)
  To: netdev; +Cc: Willem de Bruijn, Steffen Klassert
In-Reply-To: <cover.1539957909.git.pabeni@redhat.com>

Run on top of veth pair, using a dummy XDP program to enable the GRO.

Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 tools/testing/selftests/net/Makefile        |  1 +
 tools/testing/selftests/net/udpgro_bench.sh | 92 +++++++++++++++++++++
 2 files changed, 93 insertions(+)
 create mode 100755 tools/testing/selftests/net/udpgro_bench.sh

diff --git a/tools/testing/selftests/net/Makefile b/tools/testing/selftests/net/Makefile
index 176459b7c4d6..ac999354af54 100644
--- a/tools/testing/selftests/net/Makefile
+++ b/tools/testing/selftests/net/Makefile
@@ -7,6 +7,7 @@ CFLAGS += -I../../../../usr/include/
 TEST_PROGS := run_netsocktests run_afpackettests test_bpf.sh netdevice.sh rtnetlink.sh
 TEST_PROGS += fib_tests.sh fib-onlink-tests.sh pmtu.sh udpgso.sh ip_defrag.sh
 TEST_PROGS += udpgso_bench.sh fib_rule_tests.sh msg_zerocopy.sh psock_snd.sh
+TEST_PROGS += udpgro_bench.sh
 TEST_PROGS_EXTENDED := in_netns.sh
 TEST_GEN_FILES =  socket
 TEST_GEN_FILES += psock_fanout psock_tpacket msg_zerocopy
diff --git a/tools/testing/selftests/net/udpgro_bench.sh b/tools/testing/selftests/net/udpgro_bench.sh
new file mode 100755
index 000000000000..03d37e5e7424
--- /dev/null
+++ b/tools/testing/selftests/net/udpgro_bench.sh
@@ -0,0 +1,92 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+#
+# Run a series of udpgro benchmarks
+
+readonly PEER_NS="ns-peer-$(mktemp -u XXXXXX)"
+
+cleanup() {
+	local -r jobs="$(jobs -p)"
+	local -r ns="$(ip netns list|grep $PEER_NS)"
+
+	[ -n "${jobs}" ] && kill -INT ${jobs} 2>/dev/null
+	[ -n "$ns" ] && ip netns del $ns 2>/dev/null
+}
+trap cleanup EXIT
+
+run_one() {
+	# use 'rx' as separator between sender args and receiver args
+	local -r all="$@"
+	local -r tx_args=${all%rx*}
+	local -r rx_args=${all#*rx}
+
+	ip netns add "${PEER_NS}"
+	ip -netns "${PEER_NS}" link set lo up
+	ip link add type veth
+	ip link set dev veth0 up
+	ip addr add dev veth0 192.168.1.2/24
+	ip addr add dev veth0 2001:db8::2/64 nodad
+
+	ip link set dev veth1 netns "${PEER_NS}"
+	ip -netns "${PEER_NS}" addr add dev veth1 192.168.1.1/24
+	ip -netns "${PEER_NS}" addr add dev veth1 2001:db8::1/64 nodad
+	ip -netns "${PEER_NS}" link set dev veth1 up
+
+	ip netns exec "${PEER_NS}" ./udpgso_bench_rx ${rx_args} -r -x veth1 &
+	ip netns exec "${PEER_NS}" ./udpgso_bench_rx -t ${rx_args} -r &
+
+	# Hack: let bg programs complete the startup
+	sleep 0.1
+	./udpgso_bench_tx ${tx_args}
+}
+
+run_in_netns() {
+	local -r args=$@
+
+	./in_netns.sh $0 __subprocess ${args}
+}
+
+run_udp() {
+	local -r args=$@
+
+	echo "udp gso - over veth touching data"
+	run_in_netns ${args} -S rx
+
+	echo "udp gso and gro - over veth touching data"
+	run_in_netns ${args} -S rx -G
+}
+
+run_tcp() {
+	local -r args=$@
+
+	echo "tcp - over veth touching data"
+	run_in_netns ${args} -t rx
+}
+
+run_all() {
+	local -r core_args="-l 4"
+	local -r ipv4_args="${core_args} -4 -D 192.168.1.1"
+	local -r ipv6_args="${core_args} -6 -D 2001:db8::1"
+
+	echo "ipv4"
+	run_tcp "${ipv4_args}"
+	run_udp "${ipv4_args}"
+
+	echo "ipv6"
+	run_tcp "${ipv4_args}"
+	run_udp "${ipv6_args}"
+}
+
+if [ ! -f xdp_dummy.o ]; then
+	echo "Skipping GRO benchmarks - missing LLC"
+	exit 0
+fi
+
+if [[ $# -eq 0 ]]; then
+	run_all
+elif [[ $1 == "__subprocess" ]]; then
+	shift
+	run_one $@
+else
+	run_in_netns $@
+fi
-- 
2.17.2

^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox