Netdev List
 help / color / mirror / Atom feed
* [PATCH v3 08/15] soc: octeontx2: Add RVU block LF provisioning support
From: sunil.kovvuri @ 2018-09-04 16:28 UTC (permalink / raw)
  To: linux-kernel, arnd, olof
  Cc: linux-arm-kernel, linux-soc, andrew, davem, netdev, Sunil Goutham
In-Reply-To: <1536078525-31534-1-git-send-email-sunil.kovvuri@gmail.com>

From: Sunil Goutham <sgoutham@marvell.com>

Added support for a RVU PF/VF to request AF via mailbox
to attach or detach NPA/NIX/SSO/SSOW/TIM/CPT block LFs.
Also supports partial detachment and modifying current
LF attached count of a certian block type.

Signed-off-by: Sunil Goutham <sgoutham@marvell.com>
---
 drivers/soc/marvell/octeontx2/mbox.h    |  45 ++-
 drivers/soc/marvell/octeontx2/rvu.c     | 472 +++++++++++++++++++++++++++++++-
 drivers/soc/marvell/octeontx2/rvu.h     |   8 +-
 drivers/soc/marvell/octeontx2/rvu_reg.h |   8 +-
 4 files changed, 523 insertions(+), 10 deletions(-)

diff --git a/drivers/soc/marvell/octeontx2/mbox.h b/drivers/soc/marvell/octeontx2/mbox.h
index fc593f0..7280d49 100644
--- a/drivers/soc/marvell/octeontx2/mbox.h
+++ b/drivers/soc/marvell/octeontx2/mbox.h
@@ -118,7 +118,17 @@ static inline struct mbox_msghdr *otx2_mbox_alloc_msg(struct otx2_mbox *mbox,
 #define MBOX_MSG_MAX				0xFFFF
 
 #define MBOX_MESSAGES							\
-M(READY,		0x001, msg_req, ready_msg_rsp)
+/* Generic mbox IDs (range 0x000 - 0x1FF) */				\
+M(READY,		0x001, msg_req, ready_msg_rsp)			\
+M(ATTACH_RESOURCES,	0x002, rsrc_attach, msg_rsp)			\
+M(DETACH_RESOURCES,	0x003, rsrc_detach, msg_rsp)			\
+/* CGX mbox IDs (range 0x200 - 0x3FF) */				\
+/* NPA mbox IDs (range 0x400 - 0x5FF) */				\
+/* SSO/SSOW mbox IDs (range 0x600 - 0x7FF) */				\
+/* TIM mbox IDs (range 0x800 - 0x9FF) */				\
+/* CPT mbox IDs (range 0xA00 - 0xBFF) */				\
+/* NPC mbox IDs (range 0x6000 - 0x7FFF) */				\
+/* NIX mbox IDs (range 0x8000 - 0xFFFF) */				\
 
 enum {
 #define M(_name, _id, _1, _2) MBOX_MSG_ ## _name = _id,
@@ -147,4 +157,37 @@ struct ready_msg_rsp {
 	u16    sclk_feq;	/* SCLK frequency */
 };
 
+/* Structure for requesting resource provisioning.
+ * 'modify' flag to be used when either requesting more
+ * or to detach partial of a cetain resource type.
+ * Rest of the fields specify how many of what type to
+ * be attached.
+ */
+struct rsrc_attach {
+	struct mbox_msghdr hdr;
+	u8   modify:1;
+	u8   npalf:1;
+	u8   nixlf:1;
+	u16  sso;
+	u16  ssow;
+	u16  timlfs;
+	u16  cptlfs;
+};
+
+/* Structure for relinquishing resources.
+ * 'partial' flag to be used when relinquishing all resources
+ * but only of a certain type. If not set, all resources of all
+ * types provisioned to the RVU function will be detached.
+ */
+struct rsrc_detach {
+	struct mbox_msghdr hdr;
+	u8 partial:1;
+	u8 npalf:1;
+	u8 nixlf:1;
+	u8 sso:1;
+	u8 ssow:1;
+	u8 timlfs:1;
+	u8 cptlfs:1;
+};
+
 #endif /* MBOX_H */
diff --git a/drivers/soc/marvell/octeontx2/rvu.c b/drivers/soc/marvell/octeontx2/rvu.c
index 9539ab9..39dc45d 100644
--- a/drivers/soc/marvell/octeontx2/rvu.c
+++ b/drivers/soc/marvell/octeontx2/rvu.c
@@ -59,6 +59,41 @@ int rvu_poll_reg(struct rvu *rvu, u64 block, u64 offset, u64 mask, bool zero)
 	return -EBUSY;
 }
 
+int rvu_alloc_rsrc(struct rsrc_bmap *rsrc)
+{
+	int id;
+
+	if (!rsrc->bmap)
+		return -EINVAL;
+
+	id = find_first_zero_bit(rsrc->bmap, rsrc->max);
+	if (id >= rsrc->max)
+		return -ENOSPC;
+
+	__set_bit(id, rsrc->bmap);
+
+	return id;
+}
+
+void rvu_free_rsrc(struct rsrc_bmap *rsrc, int id)
+{
+	if (!rsrc->bmap)
+		return;
+
+	__clear_bit(id, rsrc->bmap);
+}
+
+int rvu_rsrc_free_count(struct rsrc_bmap *rsrc)
+{
+	int used;
+
+	if (!rsrc->bmap)
+		return 0;
+
+	used = bitmap_weight(rsrc->bmap, rsrc->max);
+	return (rsrc->max - used);
+}
+
 int rvu_alloc_bitmap(struct rsrc_bmap *rsrc)
 {
 	rsrc->bmap = kcalloc(BITS_TO_LONGS(rsrc->max),
@@ -68,6 +103,78 @@ int rvu_alloc_bitmap(struct rsrc_bmap *rsrc)
 	return 0;
 }
 
+/* Convert BLOCK_TYPE_E to a BLOCK_ADDR_E.
+ * Some silicon variants of OcteonTX2 supports
+ * multiple blocks of same type.
+ *
+ * @pcifunc has to be zero when no LF is yet attached.
+ */
+int rvu_get_blkaddr(struct rvu *rvu, int blktype, u16 pcifunc)
+{
+	int devnum, blkaddr = -ENODEV;
+	u64 cfg, reg;
+	bool is_pf;
+
+	switch (blktype) {
+	case BLKTYPE_NPA:
+		blkaddr = BLKADDR_NPA;
+		goto exit;
+	case BLKTYPE_NIX:
+		/* For now assume NIX0 */
+		if (!pcifunc) {
+			blkaddr = BLKADDR_NIX0;
+			goto exit;
+		}
+		break;
+	case BLKTYPE_SSO:
+		blkaddr = BLKADDR_SSO;
+		goto exit;
+	case BLKTYPE_SSOW:
+		blkaddr = BLKADDR_SSOW;
+		goto exit;
+	case BLKTYPE_TIM:
+		blkaddr = BLKADDR_TIM;
+		goto exit;
+	case BLKTYPE_CPT:
+		/* For now assume CPT0 */
+		if (!pcifunc) {
+			blkaddr = BLKADDR_CPT0;
+			goto exit;
+		}
+		break;
+	}
+
+	/* Check if this is a RVU PF or VF */
+	if (pcifunc & RVU_PFVF_FUNC_MASK) {
+		is_pf = false;
+		devnum = rvu_get_hwvf(rvu, pcifunc);
+	} else {
+		is_pf = true;
+		devnum = rvu_get_pf(pcifunc);
+	}
+
+	/* Check if the 'pcifunc' has a NIX LF from 'BLKADDR_NIX0' */
+	if (blktype == BLKTYPE_NIX) {
+		reg = is_pf ? RVU_PRIV_PFX_NIX0_CFG : RVU_PRIV_HWVFX_NIX0_CFG;
+		cfg = rvu_read64(rvu, BLKADDR_RVUM, reg | (devnum << 16));
+		if (cfg)
+			blkaddr = BLKADDR_NIX0;
+	}
+
+	/* Check if the 'pcifunc' has a CPT LF from 'BLKADDR_CPT0' */
+	if (blktype == BLKTYPE_CPT) {
+		reg = is_pf ? RVU_PRIV_PFX_CPT0_CFG : RVU_PRIV_HWVFX_CPT0_CFG;
+		cfg = rvu_read64(rvu, BLKADDR_RVUM, reg | (devnum << 16));
+		if (cfg)
+			blkaddr = BLKADDR_CPT0;
+	}
+
+exit:
+	if (is_block_implemented(rvu->hw, blkaddr))
+		return blkaddr;
+	return -ENODEV;
+}
+
 static void rvu_update_rsrc_map(struct rvu *rvu, struct rvu_pfvf *pfvf,
 				struct rvu_block *block, u16 pcifunc,
 				u16 lf, bool attach)
@@ -153,6 +260,17 @@ struct rvu_pfvf *rvu_get_pfvf(struct rvu *rvu, int pcifunc)
 		return &rvu->pf[rvu_get_pf(pcifunc)];
 }
 
+bool is_block_implemented(struct rvu_hwinfo *hw, int blkaddr)
+{
+	struct rvu_block *block;
+
+	if ((blkaddr < BLKADDR_RVUM) || (blkaddr >= BLK_COUNT))
+		return false;
+
+	block = &hw->block[blkaddr];
+	return block->implemented;
+}
+
 static void rvu_check_block_implemented(struct rvu *rvu)
 {
 	struct rvu_hwinfo *hw = rvu->hw;
@@ -273,8 +391,8 @@ static int rvu_setup_hw_resources(struct rvu *rvu)
 	block->type = BLKTYPE_NIX;
 	block->lfshift = 8;
 	block->lookup_reg = NIX_AF_RVU_LF_CFG_DEBUG;
-	block->pf_lfcnt_reg = RVU_PRIV_PFX_NIX_CFG;
-	block->vf_lfcnt_reg = RVU_PRIV_HWVFX_NIX_CFG;
+	block->pf_lfcnt_reg = RVU_PRIV_PFX_NIX0_CFG;
+	block->vf_lfcnt_reg = RVU_PRIV_HWVFX_NIX0_CFG;
 	block->lfcfg_reg = NIX_PRIV_LFX_CFG;
 	block->msixcfg_reg = NIX_PRIV_LFX_INT_CFG;
 	block->lfreset_reg = NIX_AF_LF_RST;
@@ -360,8 +478,8 @@ static int rvu_setup_hw_resources(struct rvu *rvu)
 	block->multislot = true;
 	block->lfshift = 3;
 	block->lookup_reg = CPT_AF_RVU_LF_CFG_DEBUG;
-	block->pf_lfcnt_reg = RVU_PRIV_PFX_CPT_CFG;
-	block->vf_lfcnt_reg = RVU_PRIV_HWVFX_CPT_CFG;
+	block->pf_lfcnt_reg = RVU_PRIV_PFX_CPT0_CFG;
+	block->vf_lfcnt_reg = RVU_PRIV_HWVFX_CPT0_CFG;
 	block->lfcfg_reg = CPT_PRIV_LFX_CFG;
 	block->msixcfg_reg = CPT_PRIV_LFX_INT_CFG;
 	block->lfreset_reg = CPT_AF_LF_RST;
@@ -399,6 +517,8 @@ static int rvu_setup_hw_resources(struct rvu *rvu)
 		rvu_scan_block(rvu, block);
 	}
 
+	spin_lock_init(&rvu->rsrc_lock);
+
 	return 0;
 }
 
@@ -408,6 +528,350 @@ static int rvu_mbox_handler_READY(struct rvu *rvu, struct msg_req *req,
 	return 0;
 }
 
+/* Get current count of a RVU block's LF/slots
+ * provisioned to a given RVU func.
+ */
+static u16 rvu_get_rsrc_mapcount(struct rvu_pfvf *pfvf, int blktype)
+{
+	switch (blktype) {
+	case BLKTYPE_NPA:
+		return pfvf->npalf ? 1 : 0;
+	case BLKTYPE_NIX:
+		return pfvf->nixlf ? 1 : 0;
+	case BLKTYPE_SSO:
+		return pfvf->sso;
+	case BLKTYPE_SSOW:
+		return pfvf->ssow;
+	case BLKTYPE_TIM:
+		return pfvf->timlfs;
+	case BLKTYPE_CPT:
+		return pfvf->cptlfs;
+	}
+	return 0;
+}
+
+static int rvu_lookup_rsrc(struct rvu *rvu, struct rvu_block *block,
+			   int pcifunc, int slot)
+{
+	u64 val;
+
+	val = ((u64)pcifunc << 24) | (slot << 16) | (1ULL << 13);
+	rvu_write64(rvu, block->addr, block->lookup_reg, val);
+	/* Wait for the lookup to finish */
+	/* TODO: put some timeout here */
+	while (rvu_read64(rvu, block->addr, block->lookup_reg) & (1ULL << 13))
+		;
+
+	val = rvu_read64(rvu, block->addr, block->lookup_reg);
+
+	/* Check LF valid bit */
+	if (!(val & (1ULL << 12)))
+		return -1;
+
+	return (val & 0xFFF);
+}
+
+static void rvu_detach_block(struct rvu *rvu, int pcifunc, int blktype)
+{
+	struct rvu_pfvf *pfvf = rvu_get_pfvf(rvu, pcifunc);
+	struct rvu_hwinfo *hw = rvu->hw;
+	struct rvu_block *block;
+	int slot, lf, num_lfs;
+	int blkaddr;
+
+	blkaddr = rvu_get_blkaddr(rvu, blktype, pcifunc);
+	if (blkaddr < 0)
+		return;
+
+	block = &hw->block[blkaddr];
+
+	num_lfs = rvu_get_rsrc_mapcount(pfvf, block->type);
+	if (!num_lfs)
+		return;
+
+	for (slot = 0; slot < num_lfs; slot++) {
+		lf = rvu_lookup_rsrc(rvu, block, pcifunc, slot);
+		if (lf < 0) /* This should never happen */
+			continue;
+
+		/* Disable the LF */
+		rvu_write64(rvu, blkaddr, block->lfcfg_reg |
+			    (lf << block->lfshift), 0x00ULL);
+
+		/* Update SW maintained mapping info as well */
+		rvu_update_rsrc_map(rvu, pfvf, block,
+				    pcifunc, lf, false);
+
+		/* Free the resource */
+		rvu_free_rsrc(&block->lf, lf);
+	}
+}
+
+static int rvu_detach_rsrcs(struct rvu *rvu, struct rsrc_detach *detach,
+			    u16 pcifunc)
+{
+	struct rvu_hwinfo *hw = rvu->hw;
+	bool is_pf, detach_all = true;
+	struct rvu_block *block;
+	int devnum, blkid;
+
+	/* Check if this is for a RVU PF or VF */
+	if (pcifunc & RVU_PFVF_FUNC_MASK) {
+		is_pf = false;
+		devnum = rvu_get_hwvf(rvu, pcifunc);
+	} else {
+		is_pf = true;
+		devnum = rvu_get_pf(pcifunc);
+	}
+
+	spin_lock(&rvu->rsrc_lock);
+
+	/* Check for partial resource detach */
+	if (detach && detach->partial)
+		detach_all = false;
+
+	/* Check for RVU block's LFs attached to this func,
+	 * if so, detach them.
+	 */
+	for (blkid = 0; blkid < BLK_COUNT; blkid++) {
+		block = &hw->block[blkid];
+		if (!block->lf.bmap)
+			continue;
+		if (!detach_all && detach) {
+			if ((blkid == BLKADDR_NPA) && !detach->npalf)
+				continue;
+			else if ((blkid == BLKADDR_NIX0) && !detach->nixlf)
+				continue;
+			else if ((blkid == BLKADDR_SSO) && !detach->sso)
+				continue;
+			else if ((blkid == BLKADDR_SSOW) && !detach->ssow)
+				continue;
+			else if ((blkid == BLKADDR_TIM) && !detach->timlfs)
+				continue;
+			else if ((blkid == BLKADDR_CPT0) && !detach->cptlfs)
+				continue;
+		}
+		rvu_detach_block(rvu, pcifunc, block->type);
+	}
+
+	spin_unlock(&rvu->rsrc_lock);
+	return 0;
+}
+
+static int rvu_mbox_handler_DETACH_RESOURCES(struct rvu *rvu,
+					     struct rsrc_detach *detach,
+					     struct msg_rsp *rsp)
+{
+	return rvu_detach_rsrcs(rvu, detach, detach->hdr.pcifunc);
+}
+
+static void rvu_attach_block(struct rvu *rvu, int pcifunc,
+			     int blktype, int num_lfs)
+{
+	struct rvu_pfvf *pfvf = rvu_get_pfvf(rvu, pcifunc);
+	struct rvu_hwinfo *hw = rvu->hw;
+	struct rvu_block *block;
+	int slot, lf;
+	int blkaddr;
+	u64 cfg;
+
+	if (!num_lfs)
+		return;
+
+	blkaddr = rvu_get_blkaddr(rvu, blktype, 0);
+	if (blkaddr < 0)
+		return;
+
+	block = &hw->block[blkaddr];
+	if (!block->lf.bmap)
+		return;
+
+	for (slot = 0; slot < num_lfs; slot++) {
+		/* Allocate the resource */
+		lf = rvu_alloc_rsrc(&block->lf);
+		if (lf < 0)
+			return;
+
+		cfg = (1ULL << 63) | (pcifunc << 8) | slot;
+		rvu_write64(rvu, blkaddr, block->lfcfg_reg |
+			    (lf << block->lfshift), cfg);
+		rvu_update_rsrc_map(rvu, pfvf, block,
+				    pcifunc, lf, true);
+	}
+}
+
+static int rvu_check_rsrc_availability(struct rvu *rvu,
+				       struct rsrc_attach *req, u16 pcifunc)
+{
+	struct rvu_pfvf *pfvf = rvu_get_pfvf(rvu, pcifunc);
+	struct rvu_hwinfo *hw = rvu->hw;
+	struct rvu_block *block;
+	int free_lfs, mappedlfs;
+
+	/* Only one NPA LF can be attached */
+	if (req->npalf && !rvu_get_rsrc_mapcount(pfvf, BLKTYPE_NPA)) {
+		block = &hw->block[BLKADDR_NPA];
+		free_lfs = rvu_rsrc_free_count(&block->lf);
+		if (!free_lfs)
+			goto fail;
+	} else if (req->npalf) {
+		dev_err(&rvu->pdev->dev,
+			"Func 0x%x: Invalid req, already has NPA\n",
+			 pcifunc);
+		return -EINVAL;
+	}
+
+	/* Only one NIX LF can be attached */
+	if (req->nixlf && !rvu_get_rsrc_mapcount(pfvf, BLKTYPE_NIX)) {
+		block = &hw->block[BLKADDR_NIX0];
+		free_lfs = rvu_rsrc_free_count(&block->lf);
+		if (!free_lfs)
+			goto fail;
+	} else if (req->nixlf) {
+		dev_err(&rvu->pdev->dev,
+			"Func 0x%x: Invalid req, already has NIX\n",
+			pcifunc);
+		return -EINVAL;
+	}
+
+	if (req->sso) {
+		block = &hw->block[BLKADDR_SSO];
+		/* Is request within limits ? */
+		if (req->sso > block->lf.max) {
+			dev_err(&rvu->pdev->dev,
+				"Func 0x%x: Invalid SSO req, %d > max %d\n",
+				 pcifunc, req->sso, block->lf.max);
+			return -EINVAL;
+		}
+		mappedlfs = rvu_get_rsrc_mapcount(pfvf, block->type);
+		free_lfs = rvu_rsrc_free_count(&block->lf);
+		/* Check if additional resources are available */
+		if ((req->sso > mappedlfs) &&
+		    ((req->sso - mappedlfs) > free_lfs))
+			goto fail;
+	}
+
+	if (req->ssow) {
+		block = &hw->block[BLKADDR_SSOW];
+		if (req->ssow > block->lf.max) {
+			dev_err(&rvu->pdev->dev,
+				"Func 0x%x: Invalid SSOW req, %d > max %d\n",
+				 pcifunc, req->sso, block->lf.max);
+			return -EINVAL;
+		}
+		mappedlfs = rvu_get_rsrc_mapcount(pfvf, block->type);
+		free_lfs = rvu_rsrc_free_count(&block->lf);
+		if ((req->ssow > mappedlfs) &&
+		    ((req->ssow - mappedlfs) > free_lfs))
+			goto fail;
+	}
+
+	if (req->timlfs) {
+		block = &hw->block[BLKADDR_TIM];
+		if (req->timlfs > block->lf.max) {
+			dev_err(&rvu->pdev->dev,
+				"Func 0x%x: Invalid TIMLF req, %d > max %d\n",
+				 pcifunc, req->timlfs, block->lf.max);
+			return -EINVAL;
+		}
+		mappedlfs = rvu_get_rsrc_mapcount(pfvf, block->type);
+		free_lfs = rvu_rsrc_free_count(&block->lf);
+		if ((req->timlfs > mappedlfs) &&
+		    ((req->timlfs - mappedlfs) > free_lfs))
+			goto fail;
+	}
+
+	if (req->cptlfs) {
+		block = &hw->block[BLKADDR_CPT0];
+		if (req->cptlfs > block->lf.max) {
+			dev_err(&rvu->pdev->dev,
+				"Func 0x%x: Invalid CPTLF req, %d > max %d\n",
+				 pcifunc, req->cptlfs, block->lf.max);
+			return -EINVAL;
+		}
+		mappedlfs = rvu_get_rsrc_mapcount(pfvf, block->type);
+		free_lfs = rvu_rsrc_free_count(&block->lf);
+		if ((req->cptlfs > mappedlfs) &&
+		    ((req->cptlfs - mappedlfs) > free_lfs))
+			goto fail;
+	}
+
+	return 0;
+
+fail:
+	dev_info(rvu->dev, "Request for %s failed\n", block->name);
+	return -ENOSPC;
+}
+
+static int rvu_mbox_handler_ATTACH_RESOURCES(struct rvu *rvu,
+					     struct rsrc_attach *attach,
+					     struct msg_rsp *rsp)
+{
+	u16 pcifunc = attach->hdr.pcifunc;
+	int devnum, err;
+	bool is_pf;
+
+	/* If first request, detach all existing attached resources */
+	if (!attach->modify)
+		rvu_detach_rsrcs(rvu, NULL, pcifunc);
+
+	/* Check if this is for a RVU PF or VF */
+	if (pcifunc & RVU_PFVF_FUNC_MASK) {
+		is_pf = false;
+		devnum = rvu_get_hwvf(rvu, pcifunc);
+	} else {
+		is_pf = true;
+		devnum = rvu_get_pf(pcifunc);
+	}
+
+	spin_lock(&rvu->rsrc_lock);
+
+	/* Check if the request can be accommodated */
+	err = rvu_check_rsrc_availability(rvu, attach, pcifunc);
+	if (err)
+		goto exit;
+
+	/* Now attach the requested resources */
+	if (attach->npalf)
+		rvu_attach_block(rvu, pcifunc, BLKTYPE_NPA, 1);
+
+	if (attach->nixlf)
+		rvu_attach_block(rvu, pcifunc, BLKTYPE_NIX, 1);
+
+	if (attach->sso) {
+		/* RVU func doesn't know which exact LF or slot is attached
+		 * to it, it always sees as slot 0,1,2. So for a 'modify'
+		 * request, simply detach all existing attached LFs/slots
+		 * and attach a fresh.
+		 */
+		if (attach->modify)
+			rvu_detach_block(rvu, pcifunc, BLKTYPE_SSO);
+		rvu_attach_block(rvu, pcifunc, BLKTYPE_SSO, attach->sso);
+	}
+
+	if (attach->ssow) {
+		if (attach->modify)
+			rvu_detach_block(rvu, pcifunc, BLKTYPE_SSOW);
+		rvu_attach_block(rvu, pcifunc, BLKTYPE_SSOW, attach->ssow);
+	}
+
+	if (attach->timlfs) {
+		if (attach->modify)
+			rvu_detach_block(rvu, pcifunc, BLKTYPE_TIM);
+		rvu_attach_block(rvu, pcifunc, BLKTYPE_TIM, attach->timlfs);
+	}
+
+	if (attach->cptlfs) {
+		if (attach->modify)
+			rvu_detach_block(rvu, pcifunc, BLKTYPE_CPT);
+		rvu_attach_block(rvu, pcifunc, BLKTYPE_CPT, attach->cptlfs);
+	}
+
+exit:
+	spin_unlock(&rvu->rsrc_lock);
+	return err;
+}
+
 static int rvu_process_mbox_msg(struct rvu *rvu, int devid,
 				struct mbox_msghdr *req)
 {
diff --git a/drivers/soc/marvell/octeontx2/rvu.h b/drivers/soc/marvell/octeontx2/rvu.h
index ce9897b..0f76704 100644
--- a/drivers/soc/marvell/octeontx2/rvu.h
+++ b/drivers/soc/marvell/octeontx2/rvu.h
@@ -83,6 +83,7 @@ struct rvu {
 	struct rvu_hwinfo       *hw;
 	struct rvu_pfvf		*pf;
 	struct rvu_pfvf		*hwvf;
+	spinlock_t		rsrc_lock; /* Serialize resource alloc/free */
 
 	/* Mbox */
 	struct otx2_mbox	mbox;
@@ -120,8 +121,13 @@ static inline u64 rvupf_read64(struct rvu *rvu, u64 offset)
  */
 
 int rvu_alloc_bitmap(struct rsrc_bmap *rsrc);
-int rvu_poll_reg(struct rvu *rvu, u64 block, u64 offset, u64 mask, bool zero);
+int rvu_alloc_rsrc(struct rsrc_bmap *rsrc);
+void rvu_free_rsrc(struct rsrc_bmap *rsrc, int id);
+int rvu_rsrc_free_count(struct rsrc_bmap *rsrc);
 int rvu_get_pf(u16 pcifunc);
 struct rvu_pfvf *rvu_get_pfvf(struct rvu *rvu, int pcifunc);
+bool is_block_implemented(struct rvu_hwinfo *hw, int blkaddr);
+int rvu_get_blkaddr(struct rvu *rvu, int blktype, u16 pcifunc);
+int rvu_poll_reg(struct rvu *rvu, u64 block, u64 offset, u64 mask, bool zero);
 
 #endif /* RVU_H */
diff --git a/drivers/soc/marvell/octeontx2/rvu_reg.h b/drivers/soc/marvell/octeontx2/rvu_reg.h
index 3bfb1e0..b28b310 100644
--- a/drivers/soc/marvell/octeontx2/rvu_reg.h
+++ b/drivers/soc/marvell/octeontx2/rvu_reg.h
@@ -54,20 +54,20 @@
 #define RVU_PRIV_PFX_MSIX_CFG(a)            (0x8000110 | (a) << 16)
 #define RVU_PRIV_PFX_ID_CFG(a)              (0x8000120 | (a) << 16)
 #define RVU_PRIV_PFX_INT_CFG(a)             (0x8000200 | (a) << 16)
-#define RVU_PRIV_PFX_NIX_CFG                (0x8000300)
+#define RVU_PRIV_PFX_NIX0_CFG               (0x8000300)
 #define RVU_PRIV_PFX_NPA_CFG		    (0x8000310)
 #define RVU_PRIV_PFX_SSO_CFG                (0x8000320)
 #define RVU_PRIV_PFX_SSOW_CFG               (0x8000330)
 #define RVU_PRIV_PFX_TIM_CFG                (0x8000340)
-#define RVU_PRIV_PFX_CPT_CFG                (0x8000350)
+#define RVU_PRIV_PFX_CPT0_CFG               (0x8000350)
 #define RVU_PRIV_BLOCK_TYPEX_REV(a)         (0x8000400 | (a) << 3)
 #define RVU_PRIV_HWVFX_INT_CFG(a)           (0x8001280 | (a) << 16)
-#define RVU_PRIV_HWVFX_NIX_CFG              (0x8001300)
+#define RVU_PRIV_HWVFX_NIX0_CFG             (0x8001300)
 #define RVU_PRIV_HWVFX_NPA_CFG              (0x8001310)
 #define RVU_PRIV_HWVFX_SSO_CFG              (0x8001320)
 #define RVU_PRIV_HWVFX_SSOW_CFG             (0x8001330)
 #define RVU_PRIV_HWVFX_TIM_CFG              (0x8001340)
-#define RVU_PRIV_HWVFX_CPT_CFG              (0x8001350)
+#define RVU_PRIV_HWVFX_CPT0_CFG             (0x8001350)
 
 /* RVU PF registers */
 #define	RVU_PF_VFX_PFVF_MBOX0		    (0x00000)
-- 
2.7.4

^ permalink raw reply related

* Re: [PATCH v2 net-next] failover: Add missing check to validate 'slave_dev' in net_failover_slave_unregister
From: Samudrala, Sridhar @ 2018-09-04 16:36 UTC (permalink / raw)
  To: YueHaibing, David S. Miller, Stephen Hemminger, Dan Carpenter,
	Alexander Duyck, Jeff Kirsher, Liran Alon, Joao Martins
  Cc: netdev, kernel-janitors
In-Reply-To: <1536029786-21710-1-git-send-email-yuehaibing@huawei.com>

On 9/3/2018 7:56 PM, YueHaibing wrote:
> Fixes gcc '-Wunused-but-set-variable' warning:
>
> drivers/net/net_failover.c: In function 'net_failover_slave_unregister':
> drivers/net/net_failover.c:598:35: warning:
>   variable 'primary_dev' set but not used [-Wunused-but-set-variable]
>
> There should check the validity of 'slave_dev'.
>
> Fixes: cfc80d9a1163 ("net: Introduce net_failover driver")
>
> Signed-off-by: YueHaibing <yuehaibing@huawei.com>

Acked-by: Sridhar Samudrala <sridhar.samudrala@intel.com>


> ---
> v2: use WARN_ON_ONCE as Liran Alon suggested
> ---
>   drivers/net/net_failover.c | 3 +++
>   1 file changed, 3 insertions(+)
>
> diff --git a/drivers/net/net_failover.c b/drivers/net/net_failover.c
> index 7ae1856..5a749dc 100644
> --- a/drivers/net/net_failover.c
> +++ b/drivers/net/net_failover.c
> @@ -603,6 +603,9 @@ static int net_failover_slave_unregister(struct net_device *slave_dev,
>   	primary_dev = rtnl_dereference(nfo_info->primary_dev);
>   	standby_dev = rtnl_dereference(nfo_info->standby_dev);
>   
> +	if (WARN_ON_ONCE(slave_dev != primary_dev && slave_dev != standby_dev))
> +		return -ENODEV;
> +
>   	vlan_vids_del_by_dev(slave_dev, failover_dev);
>   	dev_uc_unsync(slave_dev, failover_dev);
>   	dev_mc_unsync(slave_dev, failover_dev);
>

^ permalink raw reply

* Re: [PATCH net-next v2 1/2] netlink: ipv4 igmp join notifications
From: Patrick Ruddy @ 2018-09-04 16:36 UTC (permalink / raw)
  To: Roopa Prabhu; +Cc: netdev, Jiří Pírko, Stephen Hemminger
In-Reply-To: <CAJieiUgG+mz88jSBOMCk2LXftuxS92-7iJ89x8voxOGRh2V+QQ@mail.gmail.com>

On Mon, 2018-09-03 at 16:12 -0700, Roopa Prabhu wrote:
> On Sun, Sep 2, 2018 at 4:18 AM, Patrick Ruddy
> <pruddy@vyatta.att-mail.com> wrote:
> > Hi Roopa
> > 
> > inline
> > 
> > thx
> > 
> > -pr
> > 
> > On Fri, 2018-08-31 at 09:29 -0700, Roopa Prabhu wrote:
> > > On Fri, Aug 31, 2018 at 4:20 AM, Patrick Ruddy
> > > <pruddy@vyatta.att-mail.com> wrote:
> > > > Some userspace applications need to know about IGMP joins from the kernel
> > > > for 2 reasons
> > > > 1. To allow the programming of multicast MAC filters in hardware
> > > > 2. To form a multicast FORUS list for non link-local multicast
> > > >    groups to be sent to the kernel and from there to the interested
> > > >    party.
> > > > (1) can be fulfilled but simply sending the hardware multicast MAC
> > > > address to be programmed but (2) requires the L3 address to be sent
> > > > since this cannot be constructed from the MAC address whereas the
> > > > reverse translation is a standard library function.
> > > > 
> > > > This commit provides addition and deletion of multicast addresses
> > > > using the RTM_NEWADDR and RTM_DELADDR messages. It also provides
> > > > the RTM_GETADDR extension to allow multicast join state to be read
> > > > from the kernel.
> > > > 
> > > > Signed-off-by: Patrick Ruddy <pruddy@vyatta.att-mail.com>
> > > > ---
> > > > v2: fix kbuild warnings.
> > > 
> > > I am still going through the series, but AFAICT, user-space caches listening to
> > > RTNLGRP_IPV4_IFADDR will now also get multicast addresses by default ?
> > > 
> > 
> > Yes that's the crux of this change. It's unfortunate that I could not
> > use IFA_MULTICAST to distinguish the SAFI. I suppose the other option
> > would be to create a set of new NEW/DEL/GETMULTICAST messages but the
> > partial code for RTM_GETMULTICAST in ipv6/mcast.c complicates that
> > slightly. Happy to look at it if you think that would be be better.
> > 
> 
> yeah, true. Thinking about this some more, you are adding an interface
> for multicast entries learnt via igmp.
> There is already a netlink channel for layer2 mc addresses via igmp. I
> can't see why that cannot be used.
> It is RTM_*MDB msgs. It is currently only available for the bridge.
> But, I have a requirement for it to be
> available via a vxlan dev...so, I am looking at making it available on
> other devices.
> 
> Can you check if RTM_*MDB msgs can be made to work for your case ?.
> 
> The reason I think it should be possible is because this is similar to
> bridge fdb entries.
> The bridge fdb api  (RTM_NEWNEIGH with AF_BRIDGE) is overloaded to
> notify and dump netdev unicast addresses.
> similarly I think the mdb api can be overloaded to notify and dump
> netdev multicast addresses (statically added or learnt via igmp)

If I'm reading this correctly I think overloading this channel is
possible. 

What you're suggesting is overloading the RTM_***MDB messages with
AF_INET and AF_INET6 to carry the per-interfaces joined l3 multicast
addresses.

I've thrown together a quick test of this and it looks good. I can
polish this up and resubmit if you're happy with the approach. FWIW
isolating the multicast addresses this was seems safer and it's a
smaller patchset.

thx

-pr

^ permalink raw reply

* Re: [RFC bpf-next PATCH] samples/bpf: xdp1 add XDP hardware offload option
From: Jesper Dangaard Brouer @ 2018-09-04 16:49 UTC (permalink / raw)
  To: Jakub Kicinski
  Cc: Nick Viljoen, oss-drivers, netdev, John W. Linville, jhsiao,
	Quentin Monnet, brouer
In-Reply-To: <20180904170912.2ca43ffa@cakuba>

On Tue, 4 Sep 2018 17:09:12 +0200
Jakub Kicinski <jakub.kicinski@netronome.com> wrote:

> On Tue, 04 Sep 2018 16:59:19 +0200, Jesper Dangaard Brouer wrote:
> > Trying to use XDP hardware offloading via XDP_FLAGS_HW_MODE
> > and setting the ifindex in prog_load_attr.ifindex before
> > loading the BPF code via bpf_prog_load_xattr().
> > 
> > This unfortunately does not seem to work...
> > - Am I doing something wrong?
> > 
> > Notice, I also disable the map BPF_MAP_TYPE_PERCPU_ARRAY
> > to make sure it was not related to the map (not supporting
> > offloading).
> > 
> > Failed with:
> >  # ./xdp1 -O $(</sys/class/net/enp130s0np1/ifindex)
> >  libbpf: load bpf program failed: Invalid argument
> >  libbpf: failed to load program 'xdp1'
> >  libbpf: failed to load object './xdp1_kern.o'
> > 
> > Tested on kernel 4.18.0-2.el8.x86_64 with driver nfp
> >  Ethernet controller: Netronome Systems, Inc. Device 4000  
> 
> Are you running the BPF capable FW?
> 
> https://help.netronome.com/support/solutions/articles/36000050009-agilio-ebpf-2-0-6-extended-berkeley-packet-filter

I'm likely not running the correct firmware...

Can you tell me, with the ethtool -i output, if I'm running the
appropriate firmware?

# ethtool -i enp129s0np1
driver: nfp
version: 4.18.0-2.el8.x86_64 SMP mod_unl
firmware-version: 0.0.3.5 0.21 nic-2.0.7 nic
expansion-rom-version: 
bus-info: 0000:81:00.0
supports-statistics: yes
supports-test: no
supports-eeprom-access: no
supports-register-dump: yes
supports-priv-flags: no

If this is a firmware version case, then we should really improve the
errors we are giving the user, the -EINVAL can be anything.

 "libbpf: load bpf program failed: Invalid argument"

-- 
Best regards,
  Jesper Dangaard Brouer
  MSc.CS, Principal Kernel Engineer at Red Hat
  LinkedIn: http://www.linkedin.com/in/brouer

^ permalink raw reply

* Re: [bpf-next PATCH 3/3] xdp: split code for map vs non-map redirect
From: Jesper Dangaard Brouer @ 2018-09-04 16:57 UTC (permalink / raw)
  To: kbuild test robot
  Cc: kbuild-all, netdev, Daniel Borkmann, Alexei Starovoitov, brouer
In-Reply-To: <20180904153945.GZ17047@intel.com>

On Tue, 4 Sep 2018 23:39:45 +0800
kbuild test robot <lkp@intel.com> wrote:

> Hi Jesper,
> 
> Thank you for the patch! Perhaps something to improve:

Daniel is faster than kbuild test-robot, and have already pointed this
out, and it should be fixed in V2.

-- 
Best regards,
  Jesper Dangaard Brouer
  MSc.CS, Principal Kernel Engineer at Red Hat
  LinkedIn: http://www.linkedin.com/in/brouer

^ permalink raw reply

* Re: [oss-drivers] Re: [RFC bpf-next PATCH] samples/bpf: xdp1 add XDP hardware offload option
From: Jakub Kicinski @ 2018-09-04 16:59 UTC (permalink / raw)
  To: Jesper Dangaard Brouer
  Cc: Nick Viljoen, oss-drivers, netdev, John W. Linville, jhsiao,
	Quentin Monnet
In-Reply-To: <20180904184933.5ff1ed19@redhat.com>

On Tue, 4 Sep 2018 18:49:33 +0200, Jesper Dangaard Brouer wrote:
> On Tue, 4 Sep 2018 17:09:12 +0200
> Jakub Kicinski <jakub.kicinski@netronome.com> wrote:
> 
> > On Tue, 04 Sep 2018 16:59:19 +0200, Jesper Dangaard Brouer wrote:  
> > > Trying to use XDP hardware offloading via XDP_FLAGS_HW_MODE
> > > and setting the ifindex in prog_load_attr.ifindex before
> > > loading the BPF code via bpf_prog_load_xattr().
> > > 
> > > This unfortunately does not seem to work...
> > > - Am I doing something wrong?
> > > 
> > > Notice, I also disable the map BPF_MAP_TYPE_PERCPU_ARRAY
> > > to make sure it was not related to the map (not supporting
> > > offloading).
> > > 
> > > Failed with:
> > >  # ./xdp1 -O $(</sys/class/net/enp130s0np1/ifindex)
> > >  libbpf: load bpf program failed: Invalid argument
> > >  libbpf: failed to load program 'xdp1'
> > >  libbpf: failed to load object './xdp1_kern.o'
> > > 
> > > Tested on kernel 4.18.0-2.el8.x86_64 with driver nfp
> > >  Ethernet controller: Netronome Systems, Inc. Device 4000    
> > 
> > Are you running the BPF capable FW?
> > 
> > https://help.netronome.com/support/solutions/articles/36000050009-agilio-ebpf-2-0-6-extended-berkeley-packet-filter  
> 
> I'm likely not running the correct firmware...
> 
> Can you tell me, with the ethtool -i output, if I'm running the
> appropriate firmware?
> 
> # ethtool -i enp129s0np1
> driver: nfp
> version: 4.18.0-2.el8.x86_64 SMP mod_unl
> firmware-version: 0.0.3.5 0.21 nic-2.0.7 nic
> expansion-rom-version: 
> bus-info: 0000:81:00.0
> supports-statistics: yes
> supports-test: no
> supports-eeprom-access: no
> supports-register-dump: yes
> supports-priv-flags: no

Yup, the BPF firmware says bpf in firmware version.

> If this is a firmware version case, then we should really improve the
> errors we are giving the user, the -EINVAL can be anything.
> 
>  "libbpf: load bpf program failed: Invalid argument"

That is true.

^ permalink raw reply

* [PATCH net] net/sched: fix memory leak in act_tunnel_key_init()
From: Davide Caratti @ 2018-09-04 17:00 UTC (permalink / raw)
  To: Cong Wang, David S. Miller, Simon Horman, Amir Vadai,
	Jamal Hadi Salim, netdev

If users try to install act_tunnel_key 'set' rules with duplicate values
of 'index', the tunnel metadata are allocated, but never released. Then,
kmemleak complains as follows:

 # tc a a a tunnel_key set src_ip 1.1.1.1 dst_ip 2.2.2.2 id 42 index 111
 # echo clear > /sys/kernel/debug/kmemleak
 # tc a a a tunnel_key set src_ip 1.1.1.1 dst_ip 2.2.2.2 id 42 index 111
 Error: TC IDR already exists.
 We have an error talking to the kernel
 # echo scan > /sys/kernel/debug/kmemleak
 # cat /sys/kernel/debug/kmemleak
 unreferenced object 0xffff8800574e6c80 (size 256):
   comm "tc", pid 5617, jiffies 4298118009 (age 57.990s)
   hex dump (first 32 bytes):
     00 00 00 00 00 00 00 00 00 1c e8 b0 ff ff ff ff  ................
     81 24 c2 ad ff ff ff ff 00 00 00 00 00 00 00 00  .$..............
   backtrace:
     [<00000000b7afbf4e>] tunnel_key_init+0x8a5/0x1800 [act_tunnel_key]
     [<000000007d98fccd>] tcf_action_init_1+0x698/0xac0
     [<0000000099b8f7cc>] tcf_action_init+0x15c/0x590
     [<00000000dc60eebe>] tc_ctl_action+0x336/0x5c2
     [<000000002f5a2f7d>] rtnetlink_rcv_msg+0x357/0x8e0
     [<000000000bfe7575>] netlink_rcv_skb+0x124/0x350
     [<00000000edab656f>] netlink_unicast+0x40f/0x5d0
     [<00000000b322cdcb>] netlink_sendmsg+0x6e8/0xba0
     [<0000000063d9d490>] sock_sendmsg+0xb3/0xf0
     [<00000000f0d3315a>] ___sys_sendmsg+0x654/0x960
     [<00000000c06cbd42>] __sys_sendmsg+0xd3/0x170
     [<00000000ce72e4b0>] do_syscall_64+0xa5/0x470
     [<000000005caa2d97>] entry_SYSCALL_64_after_hwframe+0x49/0xbe
     [<00000000fac1b476>] 0xffffffffffffffff

This problem theoretically happens also in case users attempt to setup a
geneve rule having wrong configuration data, or when the kernel fails to
allocate 'params_new'. Ensure that tunnel_key_init() releases the tunnel
metadata also in the above conditions.

Addresses-Coverity-ID: 1373974 ("Resource leak")
Fixes: d0f6dd8a914f4 ("net/sched: Introduce act_tunnel_key")
Fixes: 0ed5269f9e41f ("net/sched: add tunnel option support to act_tunnel_key")
Signed-off-by: Davide Caratti <dcaratti@redhat.com>
---
 net/sched/act_tunnel_key.c | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

diff --git a/net/sched/act_tunnel_key.c b/net/sched/act_tunnel_key.c
index 420759153d5f..28d58bbc953e 100644
--- a/net/sched/act_tunnel_key.c
+++ b/net/sched/act_tunnel_key.c
@@ -317,7 +317,7 @@ static int tunnel_key_init(struct net *net, struct nlattr *nla,
 						  &metadata->u.tun_info,
 						  opts_len, extack);
 			if (ret < 0)
-				goto err_out;
+				goto release_tun_meta;
 		}
 
 		metadata->u.tun_info.mode |= IP_TUNNEL_INFO_TX;
@@ -333,23 +333,24 @@ static int tunnel_key_init(struct net *net, struct nlattr *nla,
 				     &act_tunnel_key_ops, bind, true);
 		if (ret) {
 			NL_SET_ERR_MSG(extack, "Cannot create TC IDR");
-			goto err_out;
+			goto release_tun_meta;
 		}
 
 		ret = ACT_P_CREATED;
 	} else if (!ovr) {
-		tcf_idr_release(*a, bind);
 		NL_SET_ERR_MSG(extack, "TC IDR already exists");
-		return -EEXIST;
+		ret = -EEXIST;
+		goto release_tun_meta;
 	}
 
 	t = to_tunnel_key(*a);
 
 	params_new = kzalloc(sizeof(*params_new), GFP_KERNEL);
 	if (unlikely(!params_new)) {
-		tcf_idr_release(*a, bind);
 		NL_SET_ERR_MSG(extack, "Cannot allocate tunnel key parameters");
-		return -ENOMEM;
+		ret = -ENOMEM;
+		exists = true;
+		goto release_tun_meta;
 	}
 	params_new->tcft_action = parm->t_action;
 	params_new->tcft_enc_metadata = metadata;
@@ -367,6 +368,9 @@ static int tunnel_key_init(struct net *net, struct nlattr *nla,
 
 	return ret;
 
+release_tun_meta:
+	dst_release(&metadata->dst);
+
 err_out:
 	if (exists)
 		tcf_idr_release(*a, bind);
-- 
2.17.1

^ permalink raw reply related

* [iproute PATCH] ip-route: Fix segfault with many nexthops
From: Phil Sutter @ 2018-09-04 17:15 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: netdev

It was possible to crash ip-route by adding an IPv6 route with 37
nexthop statements. A simple reproducer is:

| for i in `seq 37`; do
| 	nhs="nexthop via 1111::$i "$nhs
| done
| ip -6 route add 3333::/64 $nhs

The related code was broken in multiple ways:

* parse_one_nh() assumed that rta points to 4kB of storage but caller
  provided just 1kB. Fixed by passing 'len' parameter with the correct
  value.

* Error checking of rta_addattr*() calls in parse_one_nh() and called
  functions was completely absent, so with above fix in place output
  flood would occur due to parser looping forever.

Note that it is still not possible to add a route with more than 36
nexthops due to stack buffer sizes, this patch merely fixes error path.

Signed-off-by: Phil Sutter <phil@nwl.cc>
---
 ip/iproute.c          |  41 ++++++++++------
 ip/iproute_lwtunnel.c | 108 +++++++++++++++++++++++++-----------------
 2 files changed, 91 insertions(+), 58 deletions(-)

diff --git a/ip/iproute.c b/ip/iproute.c
index 30833414a3f7f..9e5ae48c0715c 100644
--- a/ip/iproute.c
+++ b/ip/iproute.c
@@ -941,7 +941,7 @@ int print_route(const struct sockaddr_nl *who, struct nlmsghdr *n, void *arg)
 }
 
 static int parse_one_nh(struct nlmsghdr *n, struct rtmsg *r,
-			struct rtattr *rta, struct rtnexthop *rtnh,
+			struct rtattr *rta, size_t len, struct rtnexthop *rtnh,
 			int *argcp, char ***argvp)
 {
 	int argc = *argcp;
@@ -962,11 +962,16 @@ static int parse_one_nh(struct nlmsghdr *n, struct rtmsg *r,
 			if (r->rtm_family == AF_UNSPEC)
 				r->rtm_family = addr.family;
 			if (addr.family == r->rtm_family) {
-				rta_addattr_l(rta, 4096, RTA_GATEWAY, &addr.data, addr.bytelen);
-				rtnh->rtnh_len += sizeof(struct rtattr) + addr.bytelen;
+				if (rta_addattr_l(rta, len, RTA_GATEWAY,
+						  &addr.data, addr.bytelen))
+					return -1;
+				rtnh->rtnh_len += sizeof(struct rtattr)
+						  + addr.bytelen;
 			} else {
-				rta_addattr_l(rta, 4096, RTA_VIA, &addr.family, addr.bytelen+2);
-				rtnh->rtnh_len += RTA_SPACE(addr.bytelen+2);
+				if (rta_addattr_l(rta, len, RTA_VIA,
+						  &addr.family, addr.bytelen + 2))
+					return -1;
+				rtnh->rtnh_len += RTA_SPACE(addr.bytelen + 2);
 			}
 		} else if (strcmp(*argv, "dev") == 0) {
 			NEXT_ARG();
@@ -988,13 +993,15 @@ static int parse_one_nh(struct nlmsghdr *n, struct rtmsg *r,
 			NEXT_ARG();
 			if (get_rt_realms_or_raw(&realm, *argv))
 				invarg("\"realm\" value is invalid\n", *argv);
-			rta_addattr32(rta, 4096, RTA_FLOW, realm);
+			if (rta_addattr32(rta, len, RTA_FLOW, realm))
+				return -1;
 			rtnh->rtnh_len += sizeof(struct rtattr) + 4;
 		} else if (strcmp(*argv, "encap") == 0) {
-			int len = rta->rta_len;
+			int old_len = rta->rta_len;
 
-			lwt_parse_encap(rta, 4096, &argc, &argv);
-			rtnh->rtnh_len += rta->rta_len - len;
+			if (lwt_parse_encap(rta, len, &argc, &argv))
+				return -1;
+			rtnh->rtnh_len += rta->rta_len - old_len;
 		} else if (strcmp(*argv, "as") == 0) {
 			inet_prefix addr;
 
@@ -1002,8 +1009,9 @@ static int parse_one_nh(struct nlmsghdr *n, struct rtmsg *r,
 			if (strcmp(*argv, "to") == 0)
 				NEXT_ARG();
 			get_addr(&addr, *argv, r->rtm_family);
-			rta_addattr_l(rta, 4096, RTA_NEWDST, &addr.data,
-				      addr.bytelen);
+			if (rta_addattr_l(rta, len, RTA_NEWDST,
+					  &addr.data, addr.bytelen))
+				return -1;
 			rtnh->rtnh_len += sizeof(struct rtattr) + addr.bytelen;
 		} else
 			break;
@@ -1036,15 +1044,18 @@ static int parse_nexthops(struct nlmsghdr *n, struct rtmsg *r,
 		memset(rtnh, 0, sizeof(*rtnh));
 		rtnh->rtnh_len = sizeof(*rtnh);
 		rta->rta_len += rtnh->rtnh_len;
-		if (parse_one_nh(n, r, rta, rtnh, &argc, &argv)) {
+		if (parse_one_nh(n, r, rta, 1024, rtnh, &argc, &argv)) {
 			fprintf(stderr, "Error: cannot parse nexthop\n");
 			exit(-1);
 		}
 		rtnh = RTNH_NEXT(rtnh);
 	}
 
+		return 0;
+
 	if (rta->rta_len > RTA_LENGTH(0))
-		addattr_l(n, 1024, RTA_MULTIPATH, RTA_DATA(rta), RTA_PAYLOAD(rta));
+		return addattr_l(n, 1024, RTA_MULTIPATH,
+				 RTA_DATA(rta), RTA_PAYLOAD(rta));
 	return 0;
 }
 
@@ -1484,8 +1495,8 @@ static int iproute_modify(int cmd, unsigned int flags, int argc, char **argv)
 		addattr_l(&req.n, sizeof(req), RTA_METRICS, RTA_DATA(mxrta), RTA_PAYLOAD(mxrta));
 	}
 
-	if (nhs_ok)
-		parse_nexthops(&req.n, &req.r, argc, argv);
+	if (nhs_ok && parse_nexthops(&req.n, &req.r, argc, argv))
+		return -1;
 
 	if (req.r.rtm_family == AF_UNSPEC)
 		req.r.rtm_family = AF_INET;
diff --git a/ip/iproute_lwtunnel.c b/ip/iproute_lwtunnel.c
index e604481142ec1..969a4763df71d 100644
--- a/ip/iproute_lwtunnel.c
+++ b/ip/iproute_lwtunnel.c
@@ -538,8 +538,9 @@ static int parse_encap_seg6(struct rtattr *rta, size_t len, int *argcp,
 
 	memcpy(tuninfo->srh, srh, srhlen);
 
-	rta_addattr_l(rta, len, SEG6_IPTUNNEL_SRH, tuninfo,
-		      sizeof(*tuninfo) + srhlen);
+	if (rta_addattr_l(rta, len, SEG6_IPTUNNEL_SRH, tuninfo,
+			  sizeof(*tuninfo) + srhlen))
+		return -1;
 
 	free(tuninfo);
 	free(srh);
@@ -611,6 +612,7 @@ static int parse_encap_seg6local(struct rtattr *rta, size_t len, int *argcp,
 	char segbuf[1024];
 	inet_prefix addr;
 	__u32 hmac = 0;
+	int ret = 0;
 
 	while (argc > 0) {
 		if (strcmp(*argv, "action") == 0) {
@@ -620,27 +622,28 @@ static int parse_encap_seg6local(struct rtattr *rta, size_t len, int *argcp,
 			action = read_action_type(*argv);
 			if (!action)
 				invarg("\"action\" value is invalid\n", *argv);
-			rta_addattr32(rta, len, SEG6_LOCAL_ACTION, action);
+			ret = rta_addattr32(rta, len, SEG6_LOCAL_ACTION,
+					    action);
 		} else if (strcmp(*argv, "table") == 0) {
 			NEXT_ARG();
 			if (table_ok++)
 				duparg2("table", *argv);
 			get_u32(&table, *argv, 0);
-			rta_addattr32(rta, len, SEG6_LOCAL_TABLE, table);
+			ret = rta_addattr32(rta, len, SEG6_LOCAL_TABLE, table);
 		} else if (strcmp(*argv, "nh4") == 0) {
 			NEXT_ARG();
 			if (nh4_ok++)
 				duparg2("nh4", *argv);
 			get_addr(&addr, *argv, AF_INET);
-			rta_addattr_l(rta, len, SEG6_LOCAL_NH4, &addr.data,
-				      addr.bytelen);
+			ret = rta_addattr_l(rta, len, SEG6_LOCAL_NH4,
+					    &addr.data, addr.bytelen);
 		} else if (strcmp(*argv, "nh6") == 0) {
 			NEXT_ARG();
 			if (nh6_ok++)
 				duparg2("nh6", *argv);
 			get_addr(&addr, *argv, AF_INET6);
-			rta_addattr_l(rta, len, SEG6_LOCAL_NH6, &addr.data,
-				      addr.bytelen);
+			ret = rta_addattr_l(rta, len, SEG6_LOCAL_NH6,
+					    &addr.data, addr.bytelen);
 		} else if (strcmp(*argv, "iif") == 0) {
 			NEXT_ARG();
 			if (iif_ok++)
@@ -648,7 +651,7 @@ static int parse_encap_seg6local(struct rtattr *rta, size_t len, int *argcp,
 			iif = ll_name_to_index(*argv);
 			if (!iif)
 				exit(nodev(*argv));
-			rta_addattr32(rta, len, SEG6_LOCAL_IIF, iif);
+			ret = rta_addattr32(rta, len, SEG6_LOCAL_IIF, iif);
 		} else if (strcmp(*argv, "oif") == 0) {
 			NEXT_ARG();
 			if (oif_ok++)
@@ -656,7 +659,7 @@ static int parse_encap_seg6local(struct rtattr *rta, size_t len, int *argcp,
 			oif = ll_name_to_index(*argv);
 			if (!oif)
 				exit(nodev(*argv));
-			rta_addattr32(rta, len, SEG6_LOCAL_OIF, oif);
+			ret = rta_addattr32(rta, len, SEG6_LOCAL_OIF, oif);
 		} else if (strcmp(*argv, "srh") == 0) {
 			NEXT_ARG();
 			if (srh_ok++)
@@ -691,6 +694,8 @@ static int parse_encap_seg6local(struct rtattr *rta, size_t len, int *argcp,
 		} else {
 			break;
 		}
+		if (ret)
+			return ret;
 		argc--; argv++;
 	}
 
@@ -705,14 +710,14 @@ static int parse_encap_seg6local(struct rtattr *rta, size_t len, int *argcp,
 		srh = parse_srh(segbuf, hmac,
 				action == SEG6_LOCAL_ACTION_END_B6_ENCAP);
 		srhlen = (srh->hdrlen + 1) << 3;
-		rta_addattr_l(rta, len, SEG6_LOCAL_SRH, srh, srhlen);
+		ret = rta_addattr_l(rta, len, SEG6_LOCAL_SRH, srh, srhlen);
 		free(srh);
 	}
 
 	*argcp = argc + 1;
 	*argvp = argv - 1;
 
-	return 0;
+	return ret;
 }
 
 static int parse_encap_mpls(struct rtattr *rta, size_t len,
@@ -730,8 +735,9 @@ static int parse_encap_mpls(struct rtattr *rta, size_t len,
 		exit(1);
 	}
 
-	rta_addattr_l(rta, len, MPLS_IPTUNNEL_DST, &addr.data,
-		      addr.bytelen);
+	if (rta_addattr_l(rta, len, MPLS_IPTUNNEL_DST,
+			  &addr.data, addr.bytelen))
+		return -1;
 
 	argc--;
 	argv++;
@@ -745,7 +751,8 @@ static int parse_encap_mpls(struct rtattr *rta, size_t len,
 				duparg2("ttl", *argv);
 			if (get_u8(&ttl, *argv, 0))
 				invarg("\"ttl\" value is invalid\n", *argv);
-			rta_addattr8(rta, len, MPLS_IPTUNNEL_TTL, ttl);
+			if (rta_addattr8(rta, len, MPLS_IPTUNNEL_TTL, ttl))
+				return -1;
 		} else {
 			break;
 		}
@@ -768,6 +775,7 @@ static int parse_encap_ip(struct rtattr *rta, size_t len,
 	int id_ok = 0, dst_ok = 0, tos_ok = 0, ttl_ok = 0;
 	char **argv = *argvp;
 	int argc = *argcp;
+	int ret = 0;
 
 	while (argc > 0) {
 		if (strcmp(*argv, "id") == 0) {
@@ -778,7 +786,7 @@ static int parse_encap_ip(struct rtattr *rta, size_t len,
 				duparg2("id", *argv);
 			if (get_be64(&id, *argv, 0))
 				invarg("\"id\" value is invalid\n", *argv);
-			rta_addattr64(rta, len, LWTUNNEL_IP_ID, id);
+			ret = rta_addattr64(rta, len, LWTUNNEL_IP_ID, id);
 		} else if (strcmp(*argv, "dst") == 0) {
 			inet_prefix addr;
 
@@ -786,8 +794,8 @@ static int parse_encap_ip(struct rtattr *rta, size_t len,
 			if (dst_ok++)
 				duparg2("dst", *argv);
 			get_addr(&addr, *argv, AF_INET);
-			rta_addattr_l(rta, len, LWTUNNEL_IP_DST,
-				      &addr.data, addr.bytelen);
+			ret = rta_addattr_l(rta, len, LWTUNNEL_IP_DST,
+					    &addr.data, addr.bytelen);
 		} else if (strcmp(*argv, "tos") == 0) {
 			__u32 tos;
 
@@ -796,7 +804,7 @@ static int parse_encap_ip(struct rtattr *rta, size_t len,
 				duparg2("tos", *argv);
 			if (rtnl_dsfield_a2n(&tos, *argv))
 				invarg("\"tos\" value is invalid\n", *argv);
-			rta_addattr8(rta, len, LWTUNNEL_IP_TOS, tos);
+			ret = rta_addattr8(rta, len, LWTUNNEL_IP_TOS, tos);
 		} else if (strcmp(*argv, "ttl") == 0) {
 			__u8 ttl;
 
@@ -805,10 +813,12 @@ static int parse_encap_ip(struct rtattr *rta, size_t len,
 				duparg2("ttl", *argv);
 			if (get_u8(&ttl, *argv, 0))
 				invarg("\"ttl\" value is invalid\n", *argv);
-			rta_addattr8(rta, len, LWTUNNEL_IP_TTL, ttl);
+			ret = rta_addattr8(rta, len, LWTUNNEL_IP_TTL, ttl);
 		} else {
 			break;
 		}
+		if (ret)
+			break;
 		argc--; argv++;
 	}
 
@@ -819,7 +829,7 @@ static int parse_encap_ip(struct rtattr *rta, size_t len,
 	*argcp = argc + 1;
 	*argvp = argv - 1;
 
-	return 0;
+	return ret;
 }
 
 static int parse_encap_ila(struct rtattr *rta, size_t len,
@@ -828,6 +838,7 @@ static int parse_encap_ila(struct rtattr *rta, size_t len,
 	__u64 locator;
 	int argc = *argcp;
 	char **argv = *argvp;
+	int ret = 0;
 
 	if (get_addr64(&locator, *argv) < 0) {
 		fprintf(stderr, "Bad locator: %s\n", *argv);
@@ -836,7 +847,8 @@ static int parse_encap_ila(struct rtattr *rta, size_t len,
 
 	argc--; argv++;
 
-	rta_addattr64(rta, 1024, ILA_ATTR_LOCATOR, locator);
+	if (rta_addattr64(rta, 1024, ILA_ATTR_LOCATOR, locator))
+		return -1;
 
 	while (argc > 0) {
 		if (strcmp(*argv, "csum-mode") == 0) {
@@ -849,8 +861,8 @@ static int parse_encap_ila(struct rtattr *rta, size_t len,
 				invarg("\"csum-mode\" value is invalid\n",
 				       *argv);
 
-			rta_addattr8(rta, 1024, ILA_ATTR_CSUM_MODE,
-				     (__u8)csum_mode);
+			ret = rta_addattr8(rta, 1024, ILA_ATTR_CSUM_MODE,
+					   (__u8)csum_mode);
 
 			argc--; argv++;
 		} else if (strcmp(*argv, "ident-type") == 0) {
@@ -863,8 +875,8 @@ static int parse_encap_ila(struct rtattr *rta, size_t len,
 				invarg("\"ident-type\" value is invalid\n",
 				       *argv);
 
-			rta_addattr8(rta, 1024, ILA_ATTR_IDENT_TYPE,
-				     (__u8)ident_type);
+			ret = rta_addattr8(rta, 1024, ILA_ATTR_IDENT_TYPE,
+					   (__u8)ident_type);
 
 			argc--; argv++;
 		} else if (strcmp(*argv, "hook-type") == 0) {
@@ -877,13 +889,15 @@ static int parse_encap_ila(struct rtattr *rta, size_t len,
 				invarg("\"hook-type\" value is invalid\n",
 				       *argv);
 
-			rta_addattr8(rta, 1024, ILA_ATTR_HOOK_TYPE,
-				     (__u8)hook_type);
+			ret = rta_addattr8(rta, 1024, ILA_ATTR_HOOK_TYPE,
+					   (__u8)hook_type);
 
 			argc--; argv++;
 		} else {
 			break;
 		}
+		if (ret)
+			break;
 	}
 
 	/* argv is currently the first unparsed argument,
@@ -893,7 +907,7 @@ static int parse_encap_ila(struct rtattr *rta, size_t len,
 	*argcp = argc + 1;
 	*argvp = argv - 1;
 
-	return 0;
+	return ret;
 }
 
 static int parse_encap_ip6(struct rtattr *rta, size_t len,
@@ -902,6 +916,7 @@ static int parse_encap_ip6(struct rtattr *rta, size_t len,
 	int id_ok = 0, dst_ok = 0, tos_ok = 0, ttl_ok = 0;
 	char **argv = *argvp;
 	int argc = *argcp;
+	int ret = 0;
 
 	while (argc > 0) {
 		if (strcmp(*argv, "id") == 0) {
@@ -912,7 +927,7 @@ static int parse_encap_ip6(struct rtattr *rta, size_t len,
 				duparg2("id", *argv);
 			if (get_be64(&id, *argv, 0))
 				invarg("\"id\" value is invalid\n", *argv);
-			rta_addattr64(rta, len, LWTUNNEL_IP6_ID, id);
+			ret = rta_addattr64(rta, len, LWTUNNEL_IP6_ID, id);
 		} else if (strcmp(*argv, "dst") == 0) {
 			inet_prefix addr;
 
@@ -920,8 +935,8 @@ static int parse_encap_ip6(struct rtattr *rta, size_t len,
 			if (dst_ok++)
 				duparg2("dst", *argv);
 			get_addr(&addr, *argv, AF_INET6);
-			rta_addattr_l(rta, len, LWTUNNEL_IP6_DST,
-				      &addr.data, addr.bytelen);
+			ret = rta_addattr_l(rta, len, LWTUNNEL_IP6_DST,
+					    &addr.data, addr.bytelen);
 		} else if (strcmp(*argv, "tc") == 0) {
 			__u32 tc;
 
@@ -930,7 +945,7 @@ static int parse_encap_ip6(struct rtattr *rta, size_t len,
 				duparg2("tc", *argv);
 			if (rtnl_dsfield_a2n(&tc, *argv))
 				invarg("\"tc\" value is invalid\n", *argv);
-			rta_addattr8(rta, len, LWTUNNEL_IP6_TC, tc);
+			ret = rta_addattr8(rta, len, LWTUNNEL_IP6_TC, tc);
 		} else if (strcmp(*argv, "hoplimit") == 0) {
 			__u8 hoplimit;
 
@@ -940,10 +955,13 @@ static int parse_encap_ip6(struct rtattr *rta, size_t len,
 			if (get_u8(&hoplimit, *argv, 0))
 				invarg("\"hoplimit\" value is invalid\n",
 				       *argv);
-			rta_addattr8(rta, len, LWTUNNEL_IP6_HOPLIMIT, hoplimit);
+			ret = rta_addattr8(rta, len, LWTUNNEL_IP6_HOPLIMIT,
+					   hoplimit);
 		} else {
 			break;
 		}
+		if (ret)
+			break;
 		argc--; argv++;
 	}
 
@@ -954,7 +972,7 @@ static int parse_encap_ip6(struct rtattr *rta, size_t len,
 	*argcp = argc + 1;
 	*argvp = argv - 1;
 
-	return 0;
+	return ret;
 }
 
 static void lwt_bpf_usage(void)
@@ -1021,6 +1039,7 @@ int lwt_parse_encap(struct rtattr *rta, size_t len, int *argcp, char ***argvp)
 	int argc = *argcp;
 	char **argv = *argvp;
 	__u16 type;
+	int ret = 0;
 
 	NEXT_ARG();
 	type = read_encap_type(*argv);
@@ -1037,37 +1056,40 @@ int lwt_parse_encap(struct rtattr *rta, size_t len, int *argcp, char ***argvp)
 	nest = rta_nest(rta, 1024, RTA_ENCAP);
 	switch (type) {
 	case LWTUNNEL_ENCAP_MPLS:
-		parse_encap_mpls(rta, len, &argc, &argv);
+		ret = parse_encap_mpls(rta, len, &argc, &argv);
 		break;
 	case LWTUNNEL_ENCAP_IP:
-		parse_encap_ip(rta, len, &argc, &argv);
+		ret = parse_encap_ip(rta, len, &argc, &argv);
 		break;
 	case LWTUNNEL_ENCAP_ILA:
-		parse_encap_ila(rta, len, &argc, &argv);
+		ret = parse_encap_ila(rta, len, &argc, &argv);
 		break;
 	case LWTUNNEL_ENCAP_IP6:
-		parse_encap_ip6(rta, len, &argc, &argv);
+		ret = parse_encap_ip6(rta, len, &argc, &argv);
 		break;
 	case LWTUNNEL_ENCAP_BPF:
 		if (parse_encap_bpf(rta, len, &argc, &argv) < 0)
 			exit(-1);
 		break;
 	case LWTUNNEL_ENCAP_SEG6:
-		parse_encap_seg6(rta, len, &argc, &argv);
+		ret = parse_encap_seg6(rta, len, &argc, &argv);
 		break;
 	case LWTUNNEL_ENCAP_SEG6_LOCAL:
-		parse_encap_seg6local(rta, len, &argc, &argv);
+		ret = parse_encap_seg6local(rta, len, &argc, &argv);
 		break;
 	default:
 		fprintf(stderr, "Error: unsupported encap type\n");
 		break;
 	}
+	if (ret)
+		return ret;
+
 	rta_nest_end(rta, nest);
 
-	rta_addattr16(rta, 1024, RTA_ENCAP_TYPE, type);
+	ret = rta_addattr16(rta, 1024, RTA_ENCAP_TYPE, type);
 
 	*argcp = argc;
 	*argvp = argv;
 
-	return 0;
+	return ret;
 }
-- 
2.18.0

^ permalink raw reply related

* [PATCH bpf-next 0/4] i40e AF_XDP zero-copy buffer leak fixes
From: Björn Töpel @ 2018-09-04 18:11 UTC (permalink / raw)
  To: ast, daniel, netdev, jeffrey.t.kirsher, intel-wired-lan,
	jakub.kicinski
  Cc: Björn Töpel, magnus.karlsson, magnus.karlsson

From: Björn Töpel <bjorn.topel@intel.com>

This series addresses an AF_XDP zero-copy issue that buffers passed
from userspace to the kernel was leaked when the hardware descriptor
ring was torn down.

The patches fixes the i40e AF_XDP zero-copy implementation.

Thanks to Jakub Kicinski for pointing this out!

Some background for folks that don't know the details: A zero-copy
capable driver picks buffers off the fill ring and places them on the
hardware Rx ring to be completed at a later point when DMA is
complete. Similar on the Tx side; The driver picks buffers off the Tx
ring and places them on the Tx hardware ring.

In the typical flow, the Rx buffer will be placed onto an Rx ring
(completed to the user), and the Tx buffer will be placed on the
completion ring to notify the user that the transfer is done.

However, if the driver needs to tear down the hardware rings for some
reason (interface goes down, reconfiguration and such), the userspace
buffers cannot be leaked. They have to be reused or completed back to
userspace.

The implementation does the following:

* Outstanding Tx descriptors will be passed to the completion
  ring. The Tx code has back-pressure mechanism in place, so that
  enough empty space in the completion ring is guaranteed.

* Outstanding Rx descriptors are temporarily stored on a stash/reuse
  queue. The reuse queue is based on Jakub's RFC. When/if the HW rings
  comes up again, entries from the stash are used to re-populate the
  ring.

* When AF_XDP ZC is enabled, disallow changing the number of hardware
  descriptors via ethtool. Otherwise, the size of the stash/reuse
  queue can grow unbounded.

Going forward, introducing a "zero-copy allocator" analogous to Jesper
Brouer's page pool would be a more robust and reuseable solution.

Jakub: I've made a minor checkpatch-fix to your RFC, prior adding it
into this series.


Thanks!
Björn

Björn Töpel (3):
  i40e: clean zero-copy XDP Tx ring on shutdown/reset
  i40e: clean zero-copy XDP Rx ring on shutdown/reset
  i40e: disallow changing the number of descriptors when AF_XDP is on

Jakub Kicinski (1):
  net: xsk: add a simple buffer reuse queue

 .../net/ethernet/intel/i40e/i40e_ethtool.c    |   9 +-
 drivers/net/ethernet/intel/i40e/i40e_txrx.c   |  21 ++-
 .../ethernet/intel/i40e/i40e_txrx_common.h    |   4 +
 drivers/net/ethernet/intel/i40e/i40e_xsk.c    | 152 +++++++++++++++++-
 include/net/xdp_sock.h                        |  43 +++++
 net/xdp/xdp_umem.c                            |   2 +
 net/xdp/xsk_queue.c                           |  55 +++++++
 net/xdp/xsk_queue.h                           |   3 +
 8 files changed, 273 insertions(+), 16 deletions(-)

-- 
2.17.1

^ permalink raw reply

* [PATCH bpf-next 1/4] i40e: clean zero-copy XDP Tx ring on shutdown/reset
From: Björn Töpel @ 2018-09-04 18:11 UTC (permalink / raw)
  To: ast, daniel, netdev, jeffrey.t.kirsher, intel-wired-lan,
	jakub.kicinski
  Cc: Björn Töpel, magnus.karlsson, magnus.karlsson
In-Reply-To: <20180904181105.10983-1-bjorn.topel@gmail.com>

From: Björn Töpel <bjorn.topel@intel.com>

When the zero-copy enabled XDP Tx ring is torn down, due to
configuration changes, outstandning frames on the hardware descriptor
ring are queued on the completion ring.

The completion ring has a back-pressure mechanism that will guarantee
that there is sufficient space on the ring.

Signed-off-by: Björn Töpel <bjorn.topel@intel.com>
---
 drivers/net/ethernet/intel/i40e/i40e_txrx.c   | 17 +++++++----
 .../ethernet/intel/i40e/i40e_txrx_common.h    |  2 ++
 drivers/net/ethernet/intel/i40e/i40e_xsk.c    | 30 +++++++++++++++++++
 3 files changed, 43 insertions(+), 6 deletions(-)

diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.c b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
index 37bd4e50ccde..7f85d4ba8b54 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_txrx.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
@@ -636,13 +636,18 @@ void i40e_clean_tx_ring(struct i40e_ring *tx_ring)
 	unsigned long bi_size;
 	u16 i;
 
-	/* ring already cleared, nothing to do */
-	if (!tx_ring->tx_bi)
-		return;
+	if (ring_is_xdp(tx_ring) && tx_ring->xsk_umem) {
+		i40e_xsk_clean_tx_ring(tx_ring);
+	} else {
+		/* ring already cleared, nothing to do */
+		if (!tx_ring->tx_bi)
+			return;
 
-	/* Free all the Tx ring sk_buffs */
-	for (i = 0; i < tx_ring->count; i++)
-		i40e_unmap_and_free_tx_resource(tx_ring, &tx_ring->tx_bi[i]);
+		/* Free all the Tx ring sk_buffs */
+		for (i = 0; i < tx_ring->count; i++)
+			i40e_unmap_and_free_tx_resource(tx_ring,
+							&tx_ring->tx_bi[i]);
+	}
 
 	bi_size = sizeof(struct i40e_tx_buffer) * tx_ring->count;
 	memset(tx_ring->tx_bi, 0, bi_size);
diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx_common.h b/drivers/net/ethernet/intel/i40e/i40e_txrx_common.h
index b5afd479a9c5..29c68b29d36f 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_txrx_common.h
+++ b/drivers/net/ethernet/intel/i40e/i40e_txrx_common.h
@@ -87,4 +87,6 @@ static inline void i40e_arm_wb(struct i40e_ring *tx_ring,
 	}
 }
 
+void i40e_xsk_clean_tx_ring(struct i40e_ring *tx_ring);
+
 #endif /* I40E_TXRX_COMMON_ */
diff --git a/drivers/net/ethernet/intel/i40e/i40e_xsk.c b/drivers/net/ethernet/intel/i40e/i40e_xsk.c
index 2ebfc78bbd09..99116277c4d2 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_xsk.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_xsk.c
@@ -830,3 +830,33 @@ int i40e_xsk_async_xmit(struct net_device *dev, u32 queue_id)
 
 	return 0;
 }
+
+/**
+ * i40e_xsk_clean_xdp_ring - Clean the XDP Tx ring on shutdown
+ * @xdp_ring: XDP Tx ring
+ **/
+void i40e_xsk_clean_tx_ring(struct i40e_ring *tx_ring)
+{
+	u16 ntc = tx_ring->next_to_clean, ntu = tx_ring->next_to_use;
+	struct xdp_umem *umem = tx_ring->xsk_umem;
+	struct i40e_tx_buffer *tx_bi;
+	u32 xsk_frames = 0;
+
+	while (ntc != ntu) {
+		tx_bi = &tx_ring->tx_bi[ntc];
+
+		if (tx_bi->xdpf)
+			i40e_clean_xdp_tx_buffer(tx_ring, tx_bi);
+		else
+			xsk_frames++;
+
+		tx_bi->xdpf = NULL;
+
+		ntc++;
+		if (ntc > tx_ring->count)
+			ntc = 0;
+	}
+
+	if (xsk_frames)
+		xsk_umem_complete_tx(umem, xsk_frames);
+}
-- 
2.17.1

^ permalink raw reply related

* [PATCH bpf-next 3/4] i40e: clean zero-copy XDP Rx ring on shutdown/reset
From: Björn Töpel @ 2018-09-04 18:11 UTC (permalink / raw)
  To: ast, daniel, netdev, jeffrey.t.kirsher, intel-wired-lan,
	jakub.kicinski
  Cc: Björn Töpel, magnus.karlsson, magnus.karlsson
In-Reply-To: <20180904181105.10983-1-bjorn.topel@gmail.com>

From: Björn Töpel <bjorn.topel@intel.com>

Outstanding Rx descriptors are temporarily stored on a stash/reuse
queue. When/if the HW rings comes up again, entries from the stash are
used to re-populate the ring.

The latter required some restructuring of the allocation scheme for
the AF_XDP zero-copy implementation. There is now a fast, and a slow
allocation. The "fast allocation" is used from the fast-path and
obtains free buffers from the fill ring and the internal recycle
mechanism. The "slow allocation" is only used in ring setup, and
obtains buffers from the fill ring and the stash (if any).

Signed-off-by: Björn Töpel <bjorn.topel@intel.com>
---
 drivers/net/ethernet/intel/i40e/i40e_txrx.c   |   4 +-
 .../ethernet/intel/i40e/i40e_txrx_common.h    |   1 +
 drivers/net/ethernet/intel/i40e/i40e_xsk.c    | 100 ++++++++++++++++--
 3 files changed, 96 insertions(+), 9 deletions(-)

diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.c b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
index 7f85d4ba8b54..740ea58ba938 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_txrx.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
@@ -1355,8 +1355,10 @@ void i40e_clean_rx_ring(struct i40e_ring *rx_ring)
 		rx_ring->skb = NULL;
 	}
 
-	if (rx_ring->xsk_umem)
+	if (rx_ring->xsk_umem) {
+		i40e_xsk_clean_rx_ring(rx_ring);
 		goto skip_free;
+	}
 
 	/* Free all the Rx ring sk_buffs */
 	for (i = 0; i < rx_ring->count; i++) {
diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx_common.h b/drivers/net/ethernet/intel/i40e/i40e_txrx_common.h
index 29c68b29d36f..8d46acff6f2e 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_txrx_common.h
+++ b/drivers/net/ethernet/intel/i40e/i40e_txrx_common.h
@@ -87,6 +87,7 @@ static inline void i40e_arm_wb(struct i40e_ring *tx_ring,
 	}
 }
 
+void i40e_xsk_clean_rx_ring(struct i40e_ring *rx_ring);
 void i40e_xsk_clean_tx_ring(struct i40e_ring *tx_ring);
 
 #endif /* I40E_TXRX_COMMON_ */
diff --git a/drivers/net/ethernet/intel/i40e/i40e_xsk.c b/drivers/net/ethernet/intel/i40e/i40e_xsk.c
index 99116277c4d2..e4b62e871afc 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_xsk.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_xsk.c
@@ -140,6 +140,7 @@ static void i40e_xsk_umem_dma_unmap(struct i40e_vsi *vsi, struct xdp_umem *umem)
 static int i40e_xsk_umem_enable(struct i40e_vsi *vsi, struct xdp_umem *umem,
 				u16 qid)
 {
+	struct xdp_umem_fq_reuse *reuseq;
 	bool if_running;
 	int err;
 
@@ -156,6 +157,12 @@ static int i40e_xsk_umem_enable(struct i40e_vsi *vsi, struct xdp_umem *umem,
 			return -EBUSY;
 	}
 
+	reuseq = xsk_reuseq_prepare(vsi->rx_rings[0]->count);
+	if (!reuseq)
+		return -ENOMEM;
+
+	xsk_reuseq_free(xsk_reuseq_swap(umem, reuseq));
+
 	err = i40e_xsk_umem_dma_map(vsi, umem);
 	if (err)
 		return err;
@@ -353,16 +360,46 @@ static bool i40e_alloc_buffer_zc(struct i40e_ring *rx_ring,
 }
 
 /**
- * i40e_alloc_rx_buffers_zc - Allocates a number of Rx buffers
+ * i40e_alloc_buffer_slow_zc - Allocates an i40e_rx_buffer
  * @rx_ring: Rx ring
- * @count: The number of buffers to allocate
+ * @bi: Rx buffer to populate
  *
- * This function allocates a number of Rx buffers and places them on
- * the Rx ring.
+ * This function allocates an Rx buffer. The buffer can come from fill
+ * queue, or via the reuse queue.
  *
  * Returns true for a successful allocation, false otherwise
  **/
-bool i40e_alloc_rx_buffers_zc(struct i40e_ring *rx_ring, u16 count)
+static bool i40e_alloc_buffer_slow_zc(struct i40e_ring *rx_ring,
+				      struct i40e_rx_buffer *bi)
+{
+	struct xdp_umem *umem = rx_ring->xsk_umem;
+	u64 handle, hr;
+
+	if (!xsk_umem_peek_addr_rq(umem, &handle)) {
+		rx_ring->rx_stats.alloc_page_failed++;
+		return false;
+	}
+
+	handle &= rx_ring->xsk_umem->chunk_mask;
+
+	hr = umem->headroom + XDP_PACKET_HEADROOM;
+
+	bi->dma = xdp_umem_get_dma(umem, handle);
+	bi->dma += hr;
+
+	bi->addr = xdp_umem_get_data(umem, handle);
+	bi->addr += hr;
+
+	bi->handle = handle + umem->headroom;
+
+	xsk_umem_discard_addr_rq(umem);
+	return true;
+}
+
+static __always_inline bool __i40e_alloc_rx_buffers_zc(
+	struct i40e_ring *rx_ring, u16 count,
+	bool alloc(struct i40e_ring *rx_ring,
+		   struct i40e_rx_buffer *bi))
 {
 	u16 ntu = rx_ring->next_to_use;
 	union i40e_rx_desc *rx_desc;
@@ -372,7 +409,7 @@ bool i40e_alloc_rx_buffers_zc(struct i40e_ring *rx_ring, u16 count)
 	rx_desc = I40E_RX_DESC(rx_ring, ntu);
 	bi = &rx_ring->rx_bi[ntu];
 	do {
-		if (!i40e_alloc_buffer_zc(rx_ring, bi)) {
+		if (!alloc(rx_ring, bi)) {
 			ok = false;
 			goto no_buffers;
 		}
@@ -404,6 +441,38 @@ bool i40e_alloc_rx_buffers_zc(struct i40e_ring *rx_ring, u16 count)
 	return ok;
 }
 
+/**
+ * i40e_alloc_rx_buffers_zc - Allocates a number of Rx buffers
+ * @rx_ring: Rx ring
+ * @count: The number of buffers to allocate
+ *
+ * This function allocates a number of Rx buffers from the reuse queue
+ * or fill ring and places them on the Rx ring.
+ *
+ * Returns true for a successful allocation, false otherwise
+ **/
+bool i40e_alloc_rx_buffers_zc(struct i40e_ring *rx_ring, u16 count)
+{
+	return __i40e_alloc_rx_buffers_zc(rx_ring, count,
+					  i40e_alloc_buffer_slow_zc);
+}
+
+/**
+ * i40e_alloc_rx_buffers_fast_zc - Allocates a number of Rx buffers
+ * @rx_ring: Rx ring
+ * @count: The number of buffers to allocate
+ *
+ * This function allocates a number of Rx buffers from the fill ring
+ * or the internal recycle mechanism and places them on the Rx ring.
+ *
+ * Returns true for a successful allocation, false otherwise
+ **/
+static bool i40e_alloc_rx_buffers_fast_zc(struct i40e_ring *rx_ring, u16 count)
+{
+	return __i40e_alloc_rx_buffers_zc(rx_ring, count,
+					  i40e_alloc_buffer_zc);
+}
+
 /**
  * i40e_get_rx_buffer_zc - Return the current Rx buffer
  * @rx_ring: Rx ring
@@ -571,8 +640,8 @@ int i40e_clean_rx_irq_zc(struct i40e_ring *rx_ring, int budget)
 
 		if (cleaned_count >= I40E_RX_BUFFER_WRITE) {
 			failure = failure ||
-				  !i40e_alloc_rx_buffers_zc(rx_ring,
-							    cleaned_count);
+				  !i40e_alloc_rx_buffers_fast_zc(rx_ring,
+								 cleaned_count);
 			cleaned_count = 0;
 		}
 
@@ -831,6 +900,21 @@ int i40e_xsk_async_xmit(struct net_device *dev, u32 queue_id)
 	return 0;
 }
 
+void i40e_xsk_clean_rx_ring(struct i40e_ring *rx_ring)
+{
+	u16 i;
+
+	for (i = 0; i < rx_ring->count; i++) {
+		struct i40e_rx_buffer *rx_bi = &rx_ring->rx_bi[i];
+
+		if (!rx_bi->addr)
+			continue;
+
+		xsk_umem_fq_reuse(rx_ring->xsk_umem, rx_bi->handle);
+		rx_bi->addr = NULL;
+	}
+}
+
 /**
  * i40e_xsk_clean_xdp_ring - Clean the XDP Tx ring on shutdown
  * @xdp_ring: XDP Tx ring
-- 
2.17.1

^ permalink raw reply related

* [PATCH bpf-next 2/4] net: xsk: add a simple buffer reuse queue
From: Björn Töpel @ 2018-09-04 18:11 UTC (permalink / raw)
  To: ast, daniel, netdev, jeffrey.t.kirsher, intel-wired-lan,
	jakub.kicinski
  Cc: magnus.karlsson, magnus.karlsson
In-Reply-To: <20180904181105.10983-1-bjorn.topel@gmail.com>

From: Jakub Kicinski <jakub.kicinski@netronome.com>

XSK UMEM is strongly single producer single consumer so reuse of
frames is challenging.  Add a simple "stash" of FILL packets to
reuse for drivers to optionally make use of.  This is useful
when driver has to free (ndo_stop) or resize a ring with an active
AF_XDP ZC socket.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
---
 include/net/xdp_sock.h | 43 +++++++++++++++++++++++++++++++++
 net/xdp/xdp_umem.c     |  2 ++
 net/xdp/xsk_queue.c    | 55 ++++++++++++++++++++++++++++++++++++++++++
 net/xdp/xsk_queue.h    |  3 +++
 4 files changed, 103 insertions(+)

diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h
index 932ca0dad6f3..7b55206da138 100644
--- a/include/net/xdp_sock.h
+++ b/include/net/xdp_sock.h
@@ -14,6 +14,7 @@
 #include <net/sock.h>
 
 struct net_device;
+struct xdp_umem_fq_reuse;
 struct xsk_queue;
 
 struct xdp_umem_page {
@@ -37,6 +38,7 @@ struct xdp_umem {
 	struct page **pgs;
 	u32 npgs;
 	struct net_device *dev;
+	struct xdp_umem_fq_reuse *fq_reuse;
 	u16 queue_id;
 	bool zc;
 	spinlock_t xsk_list_lock;
@@ -139,4 +141,45 @@ static inline dma_addr_t xdp_umem_get_dma(struct xdp_umem *umem, u64 addr)
 }
 #endif /* CONFIG_XDP_SOCKETS */
 
+struct xdp_umem_fq_reuse {
+	u32 nentries;
+	u32 length;
+	u64 handles[];
+};
+
+/* Following functions are not thread-safe in any way */
+struct xdp_umem_fq_reuse *xsk_reuseq_prepare(u32 nentries);
+struct xdp_umem_fq_reuse *xsk_reuseq_swap(struct xdp_umem *umem,
+					  struct xdp_umem_fq_reuse *newq);
+void xsk_reuseq_free(struct xdp_umem_fq_reuse *rq);
+
+/* Reuse-queue aware version of FILL queue helpers */
+static inline u64 *xsk_umem_peek_addr_rq(struct xdp_umem *umem, u64 *addr)
+{
+	struct xdp_umem_fq_reuse *rq = umem->fq_reuse;
+
+	if (!rq->length)
+		return xsk_umem_peek_addr(umem, addr);
+
+	*addr = rq->handles[rq->length - 1];
+	return addr;
+}
+
+static inline void xsk_umem_discard_addr_rq(struct xdp_umem *umem)
+{
+	struct xdp_umem_fq_reuse *rq = umem->fq_reuse;
+
+	if (!rq->length)
+		xsk_umem_discard_addr(umem);
+	else
+		rq->length--;
+}
+
+static inline void xsk_umem_fq_reuse(struct xdp_umem *umem, u64 addr)
+{
+	struct xdp_umem_fq_reuse *rq = umem->fq_reuse;
+
+	rq->handles[rq->length++] = addr;
+}
+
 #endif /* _LINUX_XDP_SOCK_H */
diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c
index b3b632c5aeae..555427b3e0fe 100644
--- a/net/xdp/xdp_umem.c
+++ b/net/xdp/xdp_umem.c
@@ -165,6 +165,8 @@ static void xdp_umem_release(struct xdp_umem *umem)
 		umem->cq = NULL;
 	}
 
+	xsk_reuseq_destroy(umem);
+
 	xdp_umem_unpin_pages(umem);
 
 	task = get_pid_task(umem->pid, PIDTYPE_PID);
diff --git a/net/xdp/xsk_queue.c b/net/xdp/xsk_queue.c
index 2dc1384d9f27..b66504592d9b 100644
--- a/net/xdp/xsk_queue.c
+++ b/net/xdp/xsk_queue.c
@@ -3,7 +3,9 @@
  * Copyright(c) 2018 Intel Corporation.
  */
 
+#include <linux/log2.h>
 #include <linux/slab.h>
+#include <linux/overflow.h>
 
 #include "xsk_queue.h"
 
@@ -62,3 +64,56 @@ void xskq_destroy(struct xsk_queue *q)
 	page_frag_free(q->ring);
 	kfree(q);
 }
+
+struct xdp_umem_fq_reuse *xsk_reuseq_prepare(u32 nentries)
+{
+	struct xdp_umem_fq_reuse *newq;
+
+	/* Check for overflow */
+	if (nentries > (u32)roundup_pow_of_two(nentries))
+		return NULL;
+	nentries = roundup_pow_of_two(nentries);
+
+	newq = kvmalloc(struct_size(newq, handles, nentries), GFP_KERNEL);
+	if (!newq)
+		return NULL;
+	memset(newq, 0, offsetof(typeof(*newq), handles));
+
+	newq->nentries = nentries;
+	return newq;
+}
+EXPORT_SYMBOL_GPL(xsk_reuseq_prepare);
+
+struct xdp_umem_fq_reuse *xsk_reuseq_swap(struct xdp_umem *umem,
+					  struct xdp_umem_fq_reuse *newq)
+{
+	struct xdp_umem_fq_reuse *oldq = umem->fq_reuse;
+
+	if (!oldq) {
+		umem->fq_reuse = newq;
+		return NULL;
+	}
+
+	if (newq->nentries < oldq->length)
+		return newq;
+
+	memcpy(newq->handles, oldq->handles,
+	       array_size(oldq->length, sizeof(u64)));
+	newq->length = oldq->length;
+
+	umem->fq_reuse = newq;
+	return oldq;
+}
+EXPORT_SYMBOL_GPL(xsk_reuseq_swap);
+
+void xsk_reuseq_free(struct xdp_umem_fq_reuse *rq)
+{
+	kvfree(rq);
+}
+EXPORT_SYMBOL_GPL(xsk_reuseq_free);
+
+void xsk_reuseq_destroy(struct xdp_umem *umem)
+{
+	xsk_reuseq_free(umem->fq_reuse);
+	umem->fq_reuse = NULL;
+}
diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h
index 82252cccb4e0..bcb5cbb40419 100644
--- a/net/xdp/xsk_queue.h
+++ b/net/xdp/xsk_queue.h
@@ -258,4 +258,7 @@ void xskq_set_umem(struct xsk_queue *q, u64 size, u64 chunk_mask);
 struct xsk_queue *xskq_create(u32 nentries, bool umem_queue);
 void xskq_destroy(struct xsk_queue *q_ops);
 
+/* Executed by the core when the entire UMEM gets freed */
+void xsk_reuseq_destroy(struct xdp_umem *umem);
+
 #endif /* _LINUX_XSK_QUEUE_H */
-- 
2.17.1

^ permalink raw reply related

* [PATCH bpf-next 4/4] i40e: disallow changing the number of descriptors when AF_XDP is on
From: Björn Töpel @ 2018-09-04 18:11 UTC (permalink / raw)
  To: ast, daniel, netdev, jeffrey.t.kirsher, intel-wired-lan,
	jakub.kicinski
  Cc: Björn Töpel, magnus.karlsson, magnus.karlsson
In-Reply-To: <20180904181105.10983-1-bjorn.topel@gmail.com>

From: Björn Töpel <bjorn.topel@intel.com>

When an AF_XDP UMEM is attached to any of the Rx rings, we disallow a
user to change the number of descriptors via e.g. "ethtool -G IFNAME".

Otherwise, the size of the stash/reuse queue can grow unbounded, which
would result in OOM or leaking userspace buffers.

Signed-off-by: Björn Töpel <bjorn.topel@intel.com>
---
 .../net/ethernet/intel/i40e/i40e_ethtool.c    |  9 +++++++-
 .../ethernet/intel/i40e/i40e_txrx_common.h    |  1 +
 drivers/net/ethernet/intel/i40e/i40e_xsk.c    | 22 +++++++++++++++++++
 3 files changed, 31 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/intel/i40e/i40e_ethtool.c b/drivers/net/ethernet/intel/i40e/i40e_ethtool.c
index d7d3974beca2..3cd2c88c72f8 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_ethtool.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_ethtool.c
@@ -5,7 +5,7 @@
 
 #include "i40e.h"
 #include "i40e_diag.h"
-
+#include "i40e_txrx_common.h"
 #include "i40e_ethtool_stats.h"
 
 #define I40E_PF_STAT(_name, _stat) \
@@ -1493,6 +1493,13 @@ static int i40e_set_ringparam(struct net_device *netdev,
 	    (new_rx_count == vsi->rx_rings[0]->count))
 		return 0;
 
+	/* If there is a AF_XDP UMEM attached to any of Rx rings,
+	 * disallow changing the number of descriptors -- regardless
+	 * if the netdev is running or not.
+	 */
+	if (i40e_xsk_any_rx_ring_enabled(vsi))
+		return -EBUSY;
+
 	while (test_and_set_bit(__I40E_CONFIG_BUSY, pf->state)) {
 		timeout--;
 		if (!timeout)
diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx_common.h b/drivers/net/ethernet/intel/i40e/i40e_txrx_common.h
index 8d46acff6f2e..09809dffe399 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_txrx_common.h
+++ b/drivers/net/ethernet/intel/i40e/i40e_txrx_common.h
@@ -89,5 +89,6 @@ static inline void i40e_arm_wb(struct i40e_ring *tx_ring,
 
 void i40e_xsk_clean_rx_ring(struct i40e_ring *rx_ring);
 void i40e_xsk_clean_tx_ring(struct i40e_ring *tx_ring);
+bool i40e_xsk_any_rx_ring_enabled(struct i40e_vsi *vsi);
 
 #endif /* I40E_TXRX_COMMON_ */
diff --git a/drivers/net/ethernet/intel/i40e/i40e_xsk.c b/drivers/net/ethernet/intel/i40e/i40e_xsk.c
index e4b62e871afc..119f59ec7cc0 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_xsk.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_xsk.c
@@ -944,3 +944,25 @@ void i40e_xsk_clean_tx_ring(struct i40e_ring *tx_ring)
 	if (xsk_frames)
 		xsk_umem_complete_tx(umem, xsk_frames);
 }
+
+/**
+ * i40e_xsk_any_rx_ring_enabled - Checks whether any of the Rx rings
+ * has AF_XDP UMEM attached
+ * @vsi: vsi
+ *
+ * Returns true if any of the Rx rings has an AF_XDP UMEM attached
+ **/
+bool i40e_xsk_any_rx_ring_enabled(struct i40e_vsi *vsi)
+{
+	int i;
+
+	if (!vsi->xsk_umems)
+		return false;
+
+	for (i = 0; i < vsi->num_queue_pairs; i++) {
+		if (vsi->xsk_umems[i])
+			return true;
+	}
+
+	return false;
+}
-- 
2.17.1

^ permalink raw reply related

* Re: [PATCH net] net/sched: fix memory leak in act_tunnel_key_init()
From: Cong Wang @ 2018-09-04 18:31 UTC (permalink / raw)
  To: Davide Caratti
  Cc: David Miller, Simon Horman, Amir Vadai, Jamal Hadi Salim,
	Linux Kernel Network Developers
In-Reply-To: <d0a72d1371e790505a8141592c6af904c9b24031.1536079973.git.dcaratti@redhat.com>

On Tue, Sep 4, 2018 at 10:00 AM Davide Caratti <dcaratti@redhat.com> wrote:
>
> If users try to install act_tunnel_key 'set' rules with duplicate values
> of 'index', the tunnel metadata are allocated, but never released. Then,
> kmemleak complains as follows:

Acked-by: Cong Wang <xiyou.wangcong@gmail.com>

^ permalink raw reply

* Re: [PATCH] neighbour: confirm neigh entries when ARP packet is received
From: Ihar Hrachyshka @ 2018-09-04 18:31 UTC (permalink / raw)
  To: David Miller
  Cc: Vasiliy Khoruzhick, roopa, adobriyan, jwestfall, stephen,
	anarsoul, keescook, w.bumiller, edumazet, Networking
In-Reply-To: <20180901.165129.1631978540527112789.davem@davemloft.net>

On Sat, Sep 1, 2018 at 4:51 PM, David Miller <davem@davemloft.net> wrote:
> From: Vasily Khoruzhick <vasilykh@arista.com>
> Date: Tue, 28 Aug 2018 19:48:25 -0700
>
>> Update 'confirmed' timestamp when ARP packet is received. It shouldn't
>> affect locktime logic and anyway entry can be confirmed by any higher-layer
>> protocol. Thus it makes no sense not to confirm it when ARP packet is
>> received.
>>
>> Fixes: 77d7123342 ("neighbour: update neigh timestamps iff update is
>> effective")
>>
>> Signed-off-by: Vasily Khoruzhick <vasilykh@arista.com>
>
> I'm not so sure.
>
> The comment above the code you are moving explains that the current
> behavior is intention, and it explains why too.
>
> Even if your change is correct, you're now making that comment
> inaccuratte, so you'd have to update it to match the new code.
>
> But I still think the current code is intentionally behaving that
> way, and for good reason.

Hi David,

(I am the one who put this comment there.)

I agree with the reasoning that Vasily provided for the change (we
should confirm the entry if e.g. ARP packet with identical
hwaddr/ipaddr pair arrives; just not mark it as updated). It was a
mistake of mine to put access to both updated and confirmed fields
under the "if" branch. Just leaving 'updated' there and moving
'confirmed' outside seems like the right thing to do.

The original intent was to not update 'updated' field when no update
happens (because of consequent ARP packets sent in short span of
time). The fix by Vasily should not negatively affect this scenario.

Of course, I also agree that the comment will need some adjustment to
reflect the fact that now a single timestamp is being updated. Perhaps
while at it, Vasily could also explicitly describe in a comment which
scenario the "if" branch check is supposed to cover. (I should have
done it myself, mea culpa.)

I hope it helps,
Ihar

^ permalink raw reply

* Re: [PATCH net-next v2] net: sched: action_ife: take reference to meta module
From: Cong Wang @ 2018-09-04 18:32 UTC (permalink / raw)
  To: Vlad Buslov
  Cc: Linux Kernel Network Developers, Jamal Hadi Salim, Jiri Pirko,
	David Miller
In-Reply-To: <1536011082-2043-1-git-send-email-vladbu@mellanox.com>

On Mon, Sep 3, 2018 at 2:44 PM Vlad Buslov <vladbu@mellanox.com> wrote:
>
> Recent refactoring of add_metainfo() caused use_all_metadata() to add
> metainfo to ife action metalist without taking reference to module. This
> causes warning in module_put called from ife action cleanup function.
>
> Implement add_metainfo_and_get_ops() function that returns with reference
> to module taken if metainfo was added successfully, and call it from
> use_all_metadata(), instead of calling __add_metainfo() directly.


Acked-by: Cong Wang <xiyou.wangcong@gmail.com>

This one should go to -net too.

^ permalink raw reply

* Re: [PATCH v2 00/11] mscc: ocelot: add support for SerDes muxing configuration
From: Paul Burton @ 2018-09-04 23:03 UTC (permalink / raw)
  To: Quentin Schulz
  Cc: Alexandre Belloni, David Miller, andrew, ralf, jhogan, robh+dt,
	mark.rutland, kishon, f.fainelli, allan.nielsen, linux-mips,
	devicetree, linux-kernel, netdev, thomas.petazzoni
In-Reply-To: <20180904180006.d5th3jrbhr4vtahi@qschulz>

Hi Quentin,

On Tue, Sep 04, 2018 at 08:00:06PM +0200, Quentin Schulz wrote:
> On Tue, Sep 04, 2018 at 09:10:28AM -0700, Paul Burton wrote:
> > Hi Alexandre, Quentin, all,
> > 
> > On Tue, Sep 04, 2018 at 05:16:53PM +0200, Alexandre Belloni wrote:
> > > On 03/09/2018 22:09:10-0700, David Miller wrote:
> > > > From: Alexandre Belloni <alexandre.belloni@bootlin.com>
> > > > Date: Mon, 3 Sep 2018 15:45:22 +0200
> > > > 
> > > > > On 03/09/2018 15:34:15+0200, Andrew Lunn wrote:
> > > > >> > I suggest patches 1 and 8 go through MIPS tree, 2 to 5 and 11 go through
> > > > >> > net while the others (6, 7, 9 and 10) go through the generic PHY subsystem.
> > > > >> 
> > > > >> Hi Quentin
> > > > >> 
> > > > >> Are you expecting merge conflicts? If not, it might be simpler to gets
> > > > >> ACKs from each maintainer, and then merge it though one tree.
> > > > > 
> > > > > There are some other DT changes for this cycle so those should probably
> > > > > go through MIPS.
> > > > 
> > > > No objection for this going through the MIPS tree, and from me:
> > > >
> > > > Acked-by: David S. Miller <davem@davemloft.net>
> > > 
> > > What I meant was that 1/11 and 8/11 should go through MIPS because of
> > > the potential conflicts. The other patches can go through net-next as
> > > that will make more sense. Maybe Quentin can split the series in two,
> > > one for MIPS and one for net if that makes it easier for you to apply.
> > 
> > I'd be happy to take the .dts changes through the MIPS tree, though
> > looking at them won't patch 1 break bisection?
> > 
> > Since you remove the hsio reg entry it looks to me like
> > mscc_ocelot_probe() will fail with -EINVAL (which comes from
> > devm_ioremap_resource() with res=NULL) until patch 3.
> > 
> 
> That's correct.
> 
> > I'd feel more comfortable merging this piecemeal if it doesn't result in
> > us breaking bisection for however long it takes for both the trees
> > involved to be merged.
> > 
> 
> How do you want to proceed then?

Well, it sounded like David is OK with this all going through the MIPS
tree, though we'd need an ack for the PHY parts.

Alternatively I'd be happy for the DT changes to go through the net-next
tree, which may make more sense given that the .dts changes are pretty
trivial in comparison with the driver changes. If David wants to do that
then for patches 1 & 8:

    Acked-by: Paul Burton <paul.burton@mips.com>

Either way there may be conflicts for ocelot.dtsi when it comes to
merging to master, but they should be simple to resolve. It seems
Wolfram already took your DT changes for I2C so there's probably going
to be multiple trees updating that file this cycle already anyway.

Ideally I'd say "don't break bisection" but that's sort of a separate
issue here since even if you restructure your series to do that it would
still need to go through one tree. For example you could adjust
mscc_ocelot_probe() to handle either the reg property or the syscon,
then adjust the DT to use the syscon, then remove the code dealing with
the reg property, and I'd consider that a good idea anyway but it would
still probably all need to go through one tree to make sure things get
merged in the right order & avoid breaking bisection.

Thanks,
    Paul

^ permalink raw reply

* Re: [Patch net] tipc: call start and done ops directly in __tipc_nl_compat_dumpit()
From: Cong Wang @ 2018-09-04 18:41 UTC (permalink / raw)
  To: Ying Xue; +Cc: Linux Kernel Network Developers, tipc-discussion, Jon Maloy
In-Reply-To: <941068fa-85c9-7e5b-f769-23800ca407fa@windriver.com>

On Tue, Sep 4, 2018 at 4:45 AM Ying Xue <ying.xue@windriver.com> wrote:
>
>
> On 09/04/2018 10:12 AM, Cong Wang wrote:
> > __tipc_nl_compat_dumpit() uses a netlink_callback on stack,
> > so the only way to align it with other ->dumpit() call path
> > is calling tipc_dump_start() and tipc_dump_done() directly
> > inside it. Otherwise ->dumpit() would always get NULL from
> > cb->args[0].
>
> Thank you for your fix Cong!
>
> Your solution is fine with me.
>
> When we align __tipc_nl_compat_dumpit() with ->dumpit() functions
> defined in tipc_genl_v2_ops[], cb->args[0] is used to save a
> rhashtable_iter object allocated in tipc_dump_start(), and the object
> will be retrieved with cb->args[0] in tipc_dump_done() and will be freed.
>
> But unfortunately cb->args[0] has been used to other purposes in
> tipc_nl_bearer_dump(), tipc_nl_node_dump_link(),
> tipc_nl_name_table_dump(), tipc_nl_node_dump() and tipc_nl_net_dump().
> It means cb->args[0] saved in __tipc_dump_start() will be overwritten in
> these ->dumpit() functions. As a consequence, not only the
> rhashtable_iter object allocated in tipc_dump_start() cannot be properly
> released in tipc_dump_done(), but also more kernel panics might be
> triggered in tipc_dump_done().

Ah, good catch!

The max utilization of cb->args is tipc_nl_name_table_dump():

net/tipc/name_table.c:  cb->args[0] = last_type;
net/tipc/name_table.c:  cb->args[1] = last_lower;
net/tipc/name_table.c:  cb->args[2] = last_key;
net/tipc/name_table.c:  cb->args[3] = done;

Looks like I should just use cb->args[4] for rhashtable iterator,
as we still have some room in cb->args[].

^ permalink raw reply

* Re: [PATCH] neighbour: confirm neigh entries when ARP packet is received
From: David Miller @ 2018-09-04 18:57 UTC (permalink / raw)
  To: ihrachys
  Cc: vasilykh, roopa, adobriyan, jwestfall, stephen, anarsoul,
	keescook, w.bumiller, edumazet, netdev
In-Reply-To: <CAKwN9=A6YEU5QWf9cQjHwv6L_ssOH5K_rapE6hoZtF_A4gokuQ@mail.gmail.com>

From: Ihar Hrachyshka <ihrachys@redhat.com>
Date: Tue, 4 Sep 2018 11:31:23 -0700

> Of course, I also agree that the comment will need some adjustment to
> reflect the fact that now a single timestamp is being updated. Perhaps
> while at it, Vasily could also explicitly describe in a comment which
> scenario the "if" branch check is supposed to cover. (I should have
> done it myself, mea culpa.)

Yes, that would help a lot.

^ permalink raw reply

* Re: [PATCH net] net/mlx5: Fix SQ offset in QPs with small RQ
From: David Miller @ 2018-09-04 19:18 UTC (permalink / raw)
  To: tariqt; +Cc: netdev, eranbe, saeedm, alaa
In-Reply-To: <1535987184-16417-1-git-send-email-tariqt@mellanox.com>

From: Tariq Toukan <tariqt@mellanox.com>
Date: Mon,  3 Sep 2018 18:06:24 +0300

> Correct the formula for calculating the RQ page remainder,
> which should be in byte granularity.  The result will be
> non-zero only for RQs smaller than PAGE_SIZE, as an RQ size
> is a power of 2.
> 
> Divide this by the SQ stride (MLX5_SEND_WQE_BB) to get the
> SQ offset in strides granularity.
> 
> Fixes: d7037ad73daa ("net/mlx5: Fix QP fragmented buffer allocation")
> Signed-off-by: Tariq Toukan <tariqt@mellanox.com>
> Reviewed-by: Eran Ben Elisha <eranbe@mellanox.com>
> Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
> ---
>  drivers/net/ethernet/mellanox/mlx5/core/wq.c | 5 +++--
>  1 file changed, 3 insertions(+), 2 deletions(-)
> 
> Hi Dave,
> Please queue for -stable v4.18.

Applied and queued up for -stable, thanks.

^ permalink raw reply

* Re: [Patch net] act_ife: fix a potential use-after-free
From: David Miller @ 2018-09-04 19:19 UTC (permalink / raw)
  To: xiyou.wangcong; +Cc: netdev, jhs
In-Reply-To: <20180903180815.32220-1-xiyou.wangcong@gmail.com>

From: Cong Wang <xiyou.wangcong@gmail.com>
Date: Mon,  3 Sep 2018 11:08:15 -0700

> Immediately after module_put(), user could delete this
> module, so e->ops could be already freed before we call
> e->ops->release().
> 
> Fix this by moving module_put() after ops->release().
> 
> Fixes: ef6980b6becb ("introduce IFE action")
> Signed-off-by: Cong Wang <xiyou.wangcong@gmail.com>

Applied and queued up for -stable, thanks Cong.

^ permalink raw reply

* Re: [PATCH net-next v2] net: sched: action_ife: take reference to meta module
From: David Miller @ 2018-09-04 19:21 UTC (permalink / raw)
  To: vladbu; +Cc: netdev, xiyou.wangcong, jhs, jiri
In-Reply-To: <1536011082-2043-1-git-send-email-vladbu@mellanox.com>

From: Vlad Buslov <vladbu@mellanox.com>
Date: Tue,  4 Sep 2018 00:44:42 +0300

> Recent refactoring of add_metainfo() caused use_all_metadata() to add
> metainfo to ife action metalist without taking reference to module. This
> causes warning in module_put called from ife action cleanup function.
> 
> Implement add_metainfo_and_get_ops() function that returns with reference
> to module taken if metainfo was added successfully, and call it from
> use_all_metadata(), instead of calling __add_metainfo() directly.
> 
> Example warning:
 ...
> Fixes: 5ffe57da29b3 ("act_ife: fix a potential deadlock")
> Signed-off-by: Vlad Buslov <vladbu@mellanox.com>
> ---
> 
> Changes V1->V2:
> - fold constants into helper function

Applied to 'net'.

^ permalink raw reply

* Re: [PATCH net] net: phy: sfp: Handle unimplemented hwmon limits and alarms
From: David Miller @ 2018-09-04 19:23 UTC (permalink / raw)
  To: andrew; +Cc: netdev, rmk+kernel, f.fainelli
In-Reply-To: <1536027836-19913-1-git-send-email-andrew@lunn.ch>

From: Andrew Lunn <andrew@lunn.ch>
Date: Tue,  4 Sep 2018 04:23:56 +0200

> Not all SFPs implement the registers containing sensor limits and
> alarms. Luckily, there is a bit indicating if they are implemented or
> not. Add checking for this bit, when deciding if the hwmon attributes
> should be visible.
> 
> Fixes: 1323061a018a ("net: phy: sfp: Add HWMON support for module sensors")
> Signed-off-by: Andrew Lunn <andrew@lunn.ch>

Applied, thanks Andrew.

^ permalink raw reply

* [PATCH net-next v2 2/9] if_addr: add IFA_TARGET_NETNSID
From: Christian Brauner @ 2018-09-04 19:53 UTC (permalink / raw)
  To: netdev, linux-kernel
  Cc: davem, kuznet, yoshfuji, pombredanne, kstewart, gregkh, dsahern,
	fw, ktkhai, lucien.xin, jakub.kicinski, jbenc, nicolas.dichtel,
	Christian Brauner
In-Reply-To: <20180904195355.4695-1-christian@brauner.io>

This adds a new IFA_TARGET_NETNSID property to be used by address
families such as PF_INET and PF_INET6.
The IFA_TARGET_NETNSID property can be used to send a network namespace
identifier as part of a request. If a IFA_TARGET_NETNSID property is
identified it will be used to retrieve the target network namespace in
which the request is to be made.

Signed-off-by: Christian Brauner <christian@brauner.io>
Cc: Jiri Benc <jbenc@redhat.com>
Cc: Nicolas Dichtel <nicolas.dichtel@6wind.com>
---
v1->v2:
- rename from IFA_IF_NETNSID to IFA_TARGET_NETNSID

v0->v1:
- unchanged
  Note, I did not change the property name to IFA_TARGET_NSID as there
  was no clear agreement what would be preferred. My personal preference
  is to keep the IFA_IF_NETNSID name because it aligns naturally with
  the IFLA_IF_NETNSID property for RTM_*LINK requests. Jiri seems to
  prefer this name too.
  However, if there is agreement that another property name makes more
  sense I'm happy to send a v2 that changes this.
---
 include/uapi/linux/if_addr.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/include/uapi/linux/if_addr.h b/include/uapi/linux/if_addr.h
index ebaf5701c9db..dfcf3ce0097f 100644
--- a/include/uapi/linux/if_addr.h
+++ b/include/uapi/linux/if_addr.h
@@ -34,6 +34,7 @@ enum {
 	IFA_MULTICAST,
 	IFA_FLAGS,
 	IFA_RT_PRIORITY,  /* u32, priority/metric for prefix route */
+	IFA_TARGET_NETNSID,
 	__IFA_MAX,
 };
 
-- 
2.17.1

^ permalink raw reply related

* [PATCH net-next v2 3/9] ipv4: enable IFA_TARGET_NETNSID for RTM_GETADDR
From: Christian Brauner @ 2018-09-04 19:53 UTC (permalink / raw)
  To: netdev, linux-kernel
  Cc: davem, kuznet, yoshfuji, pombredanne, kstewart, gregkh, dsahern,
	fw, ktkhai, lucien.xin, jakub.kicinski, jbenc, nicolas.dichtel,
	Christian Brauner
In-Reply-To: <20180904195355.4695-1-christian@brauner.io>

- Backwards Compatibility:
  If userspace wants to determine whether ipv4 RTM_GETADDR requests
  support the new IFA_TARGET_NETNSID property it should verify that the
  reply includes the IFA_TARGET_NETNSID property. If it does not
  userspace should assume that IFA_TARGET_NETNSID is not supported for
  ipv4 RTM_GETADDR requests on this kernel.
- From what I gather from current userspace tools that make use of
  RTM_GETADDR requests some of them pass down struct ifinfomsg when they
  should actually pass down struct ifaddrmsg. To not break existing
  tools that pass down the wrong struct we will do the same as for
  RTM_GETLINK | NLM_F_DUMP requests and not error out when the
  nlmsg_parse() fails.

- Security:
  Callers must have CAP_NET_ADMIN in the owning user namespace of the
  target network namespace.

Signed-off-by: Christian Brauner <christian@brauner.io>
---
v1->v2:
- rename from IFA_IF_NETNSID to IFA_TARGET_NETNSID

v0->v1:
- unchanged
---
 net/ipv4/devinet.c | 38 ++++++++++++++++++++++++++++++--------
 1 file changed, 30 insertions(+), 8 deletions(-)

diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index ea4bd8a52422..5cb849300b81 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -100,6 +100,7 @@ static const struct nla_policy ifa_ipv4_policy[IFA_MAX+1] = {
 	[IFA_CACHEINFO]		= { .len = sizeof(struct ifa_cacheinfo) },
 	[IFA_FLAGS]		= { .type = NLA_U32 },
 	[IFA_RT_PRIORITY]	= { .type = NLA_U32 },
+	[IFA_TARGET_NETNSID]	= { .type = NLA_S32 },
 };
 
 #define IN4_ADDR_HSIZE_SHIFT	8
@@ -1584,7 +1585,8 @@ static int put_cacheinfo(struct sk_buff *skb, unsigned long cstamp,
 }
 
 static int inet_fill_ifaddr(struct sk_buff *skb, struct in_ifaddr *ifa,
-			    u32 portid, u32 seq, int event, unsigned int flags)
+			    u32 portid, u32 seq, int event, unsigned int flags,
+			    int netnsid)
 {
 	struct ifaddrmsg *ifm;
 	struct nlmsghdr  *nlh;
@@ -1601,6 +1603,9 @@ static int inet_fill_ifaddr(struct sk_buff *skb, struct in_ifaddr *ifa,
 	ifm->ifa_scope = ifa->ifa_scope;
 	ifm->ifa_index = ifa->ifa_dev->dev->ifindex;
 
+	if (netnsid >= 0 && nla_put_s32(skb, IFA_TARGET_NETNSID, netnsid))
+		goto nla_put_failure;
+
 	if (!(ifm->ifa_flags & IFA_F_PERMANENT)) {
 		preferred = ifa->ifa_preferred_lft;
 		valid = ifa->ifa_valid_lft;
@@ -1648,6 +1653,9 @@ static int inet_fill_ifaddr(struct sk_buff *skb, struct in_ifaddr *ifa,
 static int inet_dump_ifaddr(struct sk_buff *skb, struct netlink_callback *cb)
 {
 	struct net *net = sock_net(skb->sk);
+	struct nlattr *tb[IFA_MAX+1];
+	struct net *tgt_net = net;
+	int netnsid = -1;
 	int h, s_h;
 	int idx, s_idx;
 	int ip_idx, s_ip_idx;
@@ -1660,12 +1668,23 @@ static int inet_dump_ifaddr(struct sk_buff *skb, struct netlink_callback *cb)
 	s_idx = idx = cb->args[1];
 	s_ip_idx = ip_idx = cb->args[2];
 
+	if (nlmsg_parse(cb->nlh, sizeof(struct ifaddrmsg), tb, IFA_MAX,
+			ifa_ipv4_policy, NULL) >= 0) {
+		if (tb[IFA_TARGET_NETNSID]) {
+			netnsid = nla_get_s32(tb[IFA_TARGET_NETNSID]);
+
+			tgt_net = rtnl_get_net_ns_capable(skb->sk, netnsid);
+			if (IS_ERR(tgt_net))
+				return PTR_ERR(tgt_net);
+		}
+	}
+
 	for (h = s_h; h < NETDEV_HASHENTRIES; h++, s_idx = 0) {
 		idx = 0;
-		head = &net->dev_index_head[h];
+		head = &tgt_net->dev_index_head[h];
 		rcu_read_lock();
-		cb->seq = atomic_read(&net->ipv4.dev_addr_genid) ^
-			  net->dev_base_seq;
+		cb->seq = atomic_read(&tgt_net->ipv4.dev_addr_genid) ^
+			  tgt_net->dev_base_seq;
 		hlist_for_each_entry_rcu(dev, head, index_hlist) {
 			if (idx < s_idx)
 				goto cont;
@@ -1680,9 +1699,10 @@ static int inet_dump_ifaddr(struct sk_buff *skb, struct netlink_callback *cb)
 				if (ip_idx < s_ip_idx)
 					continue;
 				if (inet_fill_ifaddr(skb, ifa,
-					     NETLINK_CB(cb->skb).portid,
-					     cb->nlh->nlmsg_seq,
-					     RTM_NEWADDR, NLM_F_MULTI) < 0) {
+						     NETLINK_CB(cb->skb).portid,
+						     cb->nlh->nlmsg_seq,
+						     RTM_NEWADDR, NLM_F_MULTI,
+						     netnsid) < 0) {
 					rcu_read_unlock();
 					goto done;
 				}
@@ -1698,6 +1718,8 @@ static int inet_dump_ifaddr(struct sk_buff *skb, struct netlink_callback *cb)
 	cb->args[0] = h;
 	cb->args[1] = idx;
 	cb->args[2] = ip_idx;
+	if (netnsid >= 0)
+		put_net(tgt_net);
 
 	return skb->len;
 }
@@ -1715,7 +1737,7 @@ static void rtmsg_ifa(int event, struct in_ifaddr *ifa, struct nlmsghdr *nlh,
 	if (!skb)
 		goto errout;
 
-	err = inet_fill_ifaddr(skb, ifa, portid, seq, event, 0);
+	err = inet_fill_ifaddr(skb, ifa, portid, seq, event, 0, -1);
 	if (err < 0) {
 		/* -EMSGSIZE implies BUG in inet_nlmsg_size() */
 		WARN_ON(err == -EMSGSIZE);
-- 
2.17.1

^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox