Netdev List

Netdev List
 help / color / mirror / Atom feed

* [PATCH net-next 2/8] drivers: net: xgene: Configure classifier with pagepool
From: Iyappan Subramanian @ 2016-12-02  0:41 UTC (permalink / raw)
  To: davem, netdev; +Cc: linux-arm-kernel, patches, Iyappan Subramanian, Quan Nguyen
In-Reply-To: <1480639304-18757-1-git-send-email-isubramanian@apm.com>

This patch configures classifier with the pagepool information.

Signed-off-by: Iyappan Subramanian <isubramanian@apm.com>
Signed-off-by: Quan Nguyen <qnguyen@apm.com>
---
 drivers/net/ethernet/apm/xgene/xgene_enet_cle.c   | 16 ++++++++++++++--
 drivers/net/ethernet/apm/xgene/xgene_enet_cle.h   |  2 ++
 drivers/net/ethernet/apm/xgene/xgene_enet_hw.c    |  7 +++++--
 drivers/net/ethernet/apm/xgene/xgene_enet_hw.h    |  6 ++++--
 drivers/net/ethernet/apm/xgene/xgene_enet_main.c  | 11 +++++++++--
 drivers/net/ethernet/apm/xgene/xgene_enet_main.h  |  3 ++-
 drivers/net/ethernet/apm/xgene/xgene_enet_sgmac.c |  9 ++++++---
 drivers/net/ethernet/apm/xgene/xgene_enet_xgmac.c |  7 +++++--
 8 files changed, 47 insertions(+), 14 deletions(-)

diff --git a/drivers/net/ethernet/apm/xgene/xgene_enet_cle.c b/drivers/net/ethernet/apm/xgene/xgene_enet_cle.c
index 7aac0fb..caa55bd 100644
--- a/drivers/net/ethernet/apm/xgene/xgene_enet_cle.c
+++ b/drivers/net/ethernet/apm/xgene/xgene_enet_cle.c
@@ -52,6 +52,7 @@ static void xgene_cle_dbptr_to_hw(struct xgene_enet_pdata *pdata,
 {
 	buf[0] = SET_VAL(CLE_DROP, dbptr->drop);
 	buf[4] = SET_VAL(CLE_FPSEL, dbptr->fpsel) |
+		 SET_VAL(CLE_NFPSEL, dbptr->nxtfpsel) |
 		 SET_VAL(CLE_DSTQIDL, dbptr->dstqid);
 
 	buf[5] = SET_VAL(CLE_DSTQIDH, (u32)dbptr->dstqid >> CLE_DSTQIDL_LEN) |
@@ -349,8 +350,12 @@ static int xgene_cle_set_rss_idt(struct xgene_enet_pdata *pdata)
 		fpsel = xgene_enet_get_fpsel(pool_id);
 		dstqid = xgene_enet_dst_ring_num(pdata->rx_ring[idx]);
 		nfpsel = 0;
-		idt_reg = 0;
+		if (pdata->rx_ring[idx]->page_pool) {
+			pool_id = pdata->rx_ring[idx]->page_pool->id;
+			nfpsel = xgene_enet_get_fpsel(pool_id);
+		}
 
+		idt_reg = 0;
 		xgene_cle_idt_to_hw(pdata, dstqid, fpsel, nfpsel, &idt_reg);
 		ret = xgene_cle_dram_wr(&pdata->cle, &idt_reg, 1, i,
 					RSS_IDT, CLE_CMD_WR);
@@ -400,9 +405,9 @@ static int xgene_cle_setup_rss(struct xgene_enet_pdata *pdata)
 static int xgene_enet_cle_init(struct xgene_enet_pdata *pdata)
 {
 	struct xgene_enet_cle *enet_cle = &pdata->cle;
+	u32 def_qid, def_fpsel, def_nxtfpsel, pool_id;
 	struct xgene_cle_dbptr dbptr[DB_MAX_PTRS];
 	struct xgene_cle_ptree_branch *br;
-	u32 def_qid, def_fpsel, pool_id;
 	struct xgene_cle_ptree *ptree;
 	struct xgene_cle_ptree_kn kn;
 	int ret;
@@ -707,13 +712,20 @@ static int xgene_enet_cle_init(struct xgene_enet_pdata *pdata)
 	def_qid = xgene_enet_dst_ring_num(pdata->rx_ring[0]);
 	pool_id = pdata->rx_ring[0]->buf_pool->id;
 	def_fpsel = xgene_enet_get_fpsel(pool_id);
+	def_nxtfpsel = 0;
+	if (pdata->rx_ring[0]->page_pool) {
+		pool_id = pdata->rx_ring[0]->page_pool->id;
+		def_nxtfpsel = xgene_enet_get_fpsel(pool_id);
+	}
 
 	memset(dbptr, 0, sizeof(struct xgene_cle_dbptr) * DB_MAX_PTRS);
 	dbptr[DB_RES_ACCEPT].fpsel =  def_fpsel;
+	dbptr[DB_RES_ACCEPT].nxtfpsel = def_nxtfpsel;
 	dbptr[DB_RES_ACCEPT].dstqid = def_qid;
 	dbptr[DB_RES_ACCEPT].cle_priority = 1;
 
 	dbptr[DB_RES_DEF].fpsel = def_fpsel;
+	dbptr[DB_RES_DEF].nxtfpsel = def_nxtfpsel;
 	dbptr[DB_RES_DEF].dstqid = def_qid;
 	dbptr[DB_RES_DEF].cle_priority = 7;
 	xgene_cle_setup_def_dbptr(pdata, enet_cle, &dbptr[DB_RES_DEF],
diff --git a/drivers/net/ethernet/apm/xgene/xgene_enet_cle.h b/drivers/net/ethernet/apm/xgene/xgene_enet_cle.h
index 9ac9f8e..903be0c 100644
--- a/drivers/net/ethernet/apm/xgene/xgene_enet_cle.h
+++ b/drivers/net/ethernet/apm/xgene/xgene_enet_cle.h
@@ -91,6 +91,8 @@
 #define CLE_DSTQIDH_LEN		5
 #define CLE_FPSEL_POS		21
 #define CLE_FPSEL_LEN		4
+#define CLE_NFPSEL_POS		17
+#define CLE_NFPSEL_LEN		4
 #define CLE_PRIORITY_POS	5
 #define CLE_PRIORITY_LEN	3
 
diff --git a/drivers/net/ethernet/apm/xgene/xgene_enet_hw.c b/drivers/net/ethernet/apm/xgene/xgene_enet_hw.c
index 1007074..c395df3 100644
--- a/drivers/net/ethernet/apm/xgene/xgene_enet_hw.c
+++ b/drivers/net/ethernet/apm/xgene/xgene_enet_hw.c
@@ -550,12 +550,14 @@ static void xgene_enet_config_ring_if_assoc(struct xgene_enet_pdata *pdata)
 }
 
 static void xgene_enet_cle_bypass(struct xgene_enet_pdata *pdata,
-				  u32 dst_ring_num, u16 bufpool_id)
+				  u32 dst_ring_num, u16 bufpool_id,
+				  u16 nxtbufpool_id)
 {
 	u32 cb;
-	u32 fpsel;
+	u32 fpsel, nxtfpsel;
 
 	fpsel = xgene_enet_get_fpsel(bufpool_id);
+	nxtfpsel = xgene_enet_get_fpsel(nxtbufpool_id);
 
 	xgene_enet_rd_csr(pdata, CLE_BYPASS_REG0_0_ADDR, &cb);
 	cb |= CFG_CLE_BYPASS_EN0;
@@ -565,6 +567,7 @@ static void xgene_enet_cle_bypass(struct xgene_enet_pdata *pdata,
 	xgene_enet_rd_csr(pdata, CLE_BYPASS_REG1_0_ADDR, &cb);
 	CFG_CLE_DSTQID0_SET(&cb, dst_ring_num);
 	CFG_CLE_FPSEL0_SET(&cb, fpsel);
+	CFG_CLE_NXTFPSEL0_SET(&cb, nxtfpsel);
 	xgene_enet_wr_csr(pdata, CLE_BYPASS_REG1_0_ADDR, cb);
 }
 
diff --git a/drivers/net/ethernet/apm/xgene/xgene_enet_hw.h b/drivers/net/ethernet/apm/xgene/xgene_enet_hw.h
index e73cbb1..bd6cb6c 100644
--- a/drivers/net/ethernet/apm/xgene/xgene_enet_hw.h
+++ b/drivers/net/ethernet/apm/xgene/xgene_enet_hw.h
@@ -165,10 +165,12 @@ enum xgene_enet_rm {
 #define CFG_CLE_IP_PROTOCOL0_SET(dst, val)	xgene_set_bits(dst, val, 16, 2)
 #define CFG_CLE_DSTQID0_SET(dst, val)		xgene_set_bits(dst, val, 0, 12)
 #define CFG_CLE_FPSEL0_SET(dst, val)		xgene_set_bits(dst, val, 16, 4)
+#define CFG_CLE_NXTFPSEL0_SET(dst, val)		xgene_set_bits(dst, val, 20, 4)
 #define CFG_MACMODE_SET(dst, val)		xgene_set_bits(dst, val, 18, 2)
 #define CFG_WAITASYNCRD_SET(dst, val)		xgene_set_bits(dst, val, 0, 16)
-#define CFG_CLE_DSTQID0(val)		(val & GENMASK(11, 0))
-#define CFG_CLE_FPSEL0(val)		((val << 16) & GENMASK(19, 16))
+#define CFG_CLE_DSTQID0(val)		((val) & GENMASK(11, 0))
+#define CFG_CLE_FPSEL0(val)		(((val) << 16) & GENMASK(19, 16))
+#define CFG_CLE_NXTFPSEL0(val)		(((val) << 20) & GENMASK(23, 20))
 #define ICM_CONFIG0_REG_0_ADDR		0x0400
 #define ICM_CONFIG2_REG_0_ADDR		0x0410
 #define RX_DV_GATE_REG_0_ADDR		0x05fc
diff --git a/drivers/net/ethernet/apm/xgene/xgene_enet_main.c b/drivers/net/ethernet/apm/xgene/xgene_enet_main.c
index 1352b52..c89acf5 100644
--- a/drivers/net/ethernet/apm/xgene/xgene_enet_main.c
+++ b/drivers/net/ethernet/apm/xgene/xgene_enet_main.c
@@ -1518,9 +1518,10 @@ static int xgene_enet_get_resources(struct xgene_enet_pdata *pdata)
 static int xgene_enet_init_hw(struct xgene_enet_pdata *pdata)
 {
 	struct xgene_enet_cle *enet_cle = &pdata->cle;
+	struct xgene_enet_desc_ring *page_pool;
 	struct net_device *ndev = pdata->ndev;
 	struct xgene_enet_desc_ring *buf_pool;
-	u16 dst_ring_num;
+	u16 dst_ring_num, ring_id;
 	int i, ret;
 
 	ret = pdata->port_ops->reset(pdata);
@@ -1558,8 +1559,14 @@ static int xgene_enet_init_hw(struct xgene_enet_pdata *pdata)
 			netdev_err(ndev, "Preclass Tree init error\n");
 			goto err;
 		}
+
 	} else {
-		pdata->port_ops->cle_bypass(pdata, dst_ring_num, buf_pool->id);
+		dst_ring_num = xgene_enet_dst_ring_num(pdata->rx_ring[0]);
+		buf_pool = pdata->rx_ring[0]->buf_pool;
+		page_pool = pdata->rx_ring[0]->page_pool;
+		ring_id = (page_pool) ? page_pool->id : 0;
+		pdata->port_ops->cle_bypass(pdata, dst_ring_num,
+					    buf_pool->id, ring_id);
 	}
 
 	pdata->phy_speed = SPEED_UNKNOWN;
diff --git a/drivers/net/ethernet/apm/xgene/xgene_enet_main.h b/drivers/net/ethernet/apm/xgene/xgene_enet_main.h
index 011965b..1fe3942 100644
--- a/drivers/net/ethernet/apm/xgene/xgene_enet_main.h
+++ b/drivers/net/ethernet/apm/xgene/xgene_enet_main.h
@@ -115,6 +115,7 @@ struct xgene_enet_desc_ring {
 	enum xgene_enet_ring_cfgsize cfgsize;
 	struct xgene_enet_desc_ring *cp_ring;
 	struct xgene_enet_desc_ring *buf_pool;
+	struct xgene_enet_desc_ring *page_pool;
 	struct napi_struct napi;
 	union {
 		void *desc_addr;
@@ -152,7 +153,7 @@ struct xgene_port_ops {
 	void (*clear)(struct xgene_enet_pdata *pdata,
 		      struct xgene_enet_desc_ring *ring);
 	void (*cle_bypass)(struct xgene_enet_pdata *pdata,
-			   u32 dst_ring_num, u16 bufpool_id);
+			   u32 dst_ring_num, u16 bufpool_id, u16 nxtbufpool_id);
 	void (*shutdown)(struct xgene_enet_pdata *pdata);
 };
 
diff --git a/drivers/net/ethernet/apm/xgene/xgene_enet_sgmac.c b/drivers/net/ethernet/apm/xgene/xgene_enet_sgmac.c
index 8e4209c..82b7a5e 100644
--- a/drivers/net/ethernet/apm/xgene/xgene_enet_sgmac.c
+++ b/drivers/net/ethernet/apm/xgene/xgene_enet_sgmac.c
@@ -484,11 +484,12 @@ static int xgene_enet_reset(struct xgene_enet_pdata *p)
 }
 
 static void xgene_enet_cle_bypass(struct xgene_enet_pdata *p,
-				  u32 dst_ring_num, u16 bufpool_id)
+				  u32 dst_ring_num, u16 bufpool_id,
+				  u16 nxtbufpool_id)
 {
-	u32 data, fpsel;
 	u32 cle_bypass_reg0, cle_bypass_reg1;
 	u32 offset = p->port_id * MAC_OFFSET;
+	u32 data, fpsel, nxtfpsel;
 
 	if (p->enet_id == XGENE_ENET1) {
 		cle_bypass_reg0 = CLE_BYPASS_REG0_0_ADDR;
@@ -502,7 +503,9 @@ static void xgene_enet_cle_bypass(struct xgene_enet_pdata *p,
 	xgene_enet_wr_csr(p, cle_bypass_reg0 + offset, data);
 
 	fpsel = xgene_enet_get_fpsel(bufpool_id);
-	data = CFG_CLE_DSTQID0(dst_ring_num) | CFG_CLE_FPSEL0(fpsel);
+	nxtfpsel = xgene_enet_get_fpsel(nxtbufpool_id);
+	data = CFG_CLE_DSTQID0(dst_ring_num) | CFG_CLE_FPSEL0(fpsel) |
+	       CFG_CLE_NXTFPSEL0(nxtfpsel);
 	xgene_enet_wr_csr(p, cle_bypass_reg1 + offset, data);
 }
 
diff --git a/drivers/net/ethernet/apm/xgene/xgene_enet_xgmac.c b/drivers/net/ethernet/apm/xgene/xgene_enet_xgmac.c
index f97e599..e4adba6 100644
--- a/drivers/net/ethernet/apm/xgene/xgene_enet_xgmac.c
+++ b/drivers/net/ethernet/apm/xgene/xgene_enet_xgmac.c
@@ -350,9 +350,10 @@ static int xgene_enet_reset(struct xgene_enet_pdata *pdata)
 }
 
 static void xgene_enet_xgcle_bypass(struct xgene_enet_pdata *pdata,
-				    u32 dst_ring_num, u16 bufpool_id)
+				    u32 dst_ring_num, u16 bufpool_id,
+				    u16 nxtbufpool_id)
 {
-	u32 cb, fpsel;
+	u32 cb, fpsel, nxtfpsel;
 
 	xgene_enet_rd_csr(pdata, XCLE_BYPASS_REG0_ADDR, &cb);
 	cb |= CFG_CLE_BYPASS_EN0;
@@ -360,9 +361,11 @@ static void xgene_enet_xgcle_bypass(struct xgene_enet_pdata *pdata,
 	xgene_enet_wr_csr(pdata, XCLE_BYPASS_REG0_ADDR, cb);
 
 	fpsel = xgene_enet_get_fpsel(bufpool_id);
+	nxtfpsel = xgene_enet_get_fpsel(nxtbufpool_id);
 	xgene_enet_rd_csr(pdata, XCLE_BYPASS_REG1_ADDR, &cb);
 	CFG_CLE_DSTQID0_SET(&cb, dst_ring_num);
 	CFG_CLE_FPSEL0_SET(&cb, fpsel);
+	CFG_CLE_NXTFPSEL0_SET(&cb, nxtfpsel);
 	xgene_enet_wr_csr(pdata, XCLE_BYPASS_REG1_ADDR, cb);
 }
 
-- 
1.9.1

^ permalink raw reply related

* [PATCH net-next 1/8] drivers: net: xgene: Add helper function
From: Iyappan Subramanian @ 2016-12-02  0:41 UTC (permalink / raw)
  To: davem, netdev; +Cc: linux-arm-kernel, patches, Iyappan Subramanian, Quan Nguyen
In-Reply-To: <1480639304-18757-1-git-send-email-isubramanian@apm.com>

This is a preparation patch that adds the xgene_enet_get_fpsel() helper
function to get the buffer pool number.

Signed-off-by: Iyappan Subramanian <isubramanian@apm.com>
Signed-off-by: Quan Nguyen <qnguyen@apm.com>
---
 drivers/net/ethernet/apm/xgene/xgene_enet_cle.c   |  4 ++--
 drivers/net/ethernet/apm/xgene/xgene_enet_hw.c    | 19 +++++++------------
 drivers/net/ethernet/apm/xgene/xgene_enet_hw.h    |  8 ++++++++
 drivers/net/ethernet/apm/xgene/xgene_enet_sgmac.c | 20 +++++++-------------
 drivers/net/ethernet/apm/xgene/xgene_enet_xgmac.c | 20 +++++++-------------
 5 files changed, 31 insertions(+), 40 deletions(-)

diff --git a/drivers/net/ethernet/apm/xgene/xgene_enet_cle.c b/drivers/net/ethernet/apm/xgene/xgene_enet_cle.c
index 23d72af..7aac0fb 100644
--- a/drivers/net/ethernet/apm/xgene/xgene_enet_cle.c
+++ b/drivers/net/ethernet/apm/xgene/xgene_enet_cle.c
@@ -346,7 +346,7 @@ static int xgene_cle_set_rss_idt(struct xgene_enet_pdata *pdata)
 	for (i = 0; i < XGENE_CLE_IDT_ENTRIES; i++) {
 		idx = i % pdata->rxq_cnt;
 		pool_id = pdata->rx_ring[idx]->buf_pool->id;
-		fpsel = xgene_enet_ring_bufnum(pool_id) - 0x20;
+		fpsel = xgene_enet_get_fpsel(pool_id);
 		dstqid = xgene_enet_dst_ring_num(pdata->rx_ring[idx]);
 		nfpsel = 0;
 		idt_reg = 0;
@@ -706,7 +706,7 @@ static int xgene_enet_cle_init(struct xgene_enet_pdata *pdata)
 
 	def_qid = xgene_enet_dst_ring_num(pdata->rx_ring[0]);
 	pool_id = pdata->rx_ring[0]->buf_pool->id;
-	def_fpsel = xgene_enet_ring_bufnum(pool_id) - 0x20;
+	def_fpsel = xgene_enet_get_fpsel(pool_id);
 
 	memset(dbptr, 0, sizeof(struct xgene_cle_dbptr) * DB_MAX_PTRS);
 	dbptr[DB_RES_ACCEPT].fpsel =  def_fpsel;
diff --git a/drivers/net/ethernet/apm/xgene/xgene_enet_hw.c b/drivers/net/ethernet/apm/xgene/xgene_enet_hw.c
index 5390ae8..1007074 100644
--- a/drivers/net/ethernet/apm/xgene/xgene_enet_hw.c
+++ b/drivers/net/ethernet/apm/xgene/xgene_enet_hw.c
@@ -555,7 +555,7 @@ static void xgene_enet_cle_bypass(struct xgene_enet_pdata *pdata,
 	u32 cb;
 	u32 fpsel;
 
-	fpsel = xgene_enet_ring_bufnum(bufpool_id) - 0x20;
+	fpsel = xgene_enet_get_fpsel(bufpool_id);
 
 	xgene_enet_rd_csr(pdata, CLE_BYPASS_REG0_0_ADDR, &cb);
 	cb |= CFG_CLE_BYPASS_EN0;
@@ -652,16 +652,14 @@ static int xgene_enet_reset(struct xgene_enet_pdata *pdata)
 static void xgene_enet_clear(struct xgene_enet_pdata *pdata,
 			     struct xgene_enet_desc_ring *ring)
 {
-	u32 addr, val, data;
-
-	val = xgene_enet_ring_bufnum(ring->id);
+	u32 addr, data;
 
 	if (xgene_enet_is_bufpool(ring->id)) {
 		addr = ENET_CFGSSQMIFPRESET_ADDR;
-		data = BIT(val - 0x20);
+		data = BIT(xgene_enet_get_fpsel(ring->id));
 	} else {
 		addr = ENET_CFGSSQMIWQRESET_ADDR;
-		data = BIT(val);
+		data = BIT(xgene_enet_ring_bufnum(ring->id));
 	}
 
 	xgene_enet_wr_ring_if(pdata, addr, data);
@@ -671,24 +669,21 @@ static void xgene_gport_shutdown(struct xgene_enet_pdata *pdata)
 {
 	struct device *dev = &pdata->pdev->dev;
 	struct xgene_enet_desc_ring *ring;
-	u32 pb, val;
+	u32 pb;
 	int i;
 
 	pb = 0;
 	for (i = 0; i < pdata->rxq_cnt; i++) {
 		ring = pdata->rx_ring[i]->buf_pool;
+		pb |= BIT(xgene_enet_get_fpsel(ring->id));
 
-		val = xgene_enet_ring_bufnum(ring->id);
-		pb |= BIT(val - 0x20);
 	}
 	xgene_enet_wr_ring_if(pdata, ENET_CFGSSQMIFPRESET_ADDR, pb);
 
 	pb = 0;
 	for (i = 0; i < pdata->txq_cnt; i++) {
 		ring = pdata->tx_ring[i];
-
-		val = xgene_enet_ring_bufnum(ring->id);
-		pb |= BIT(val);
+		pb |= BIT(xgene_enet_ring_bufnum(ring->id));
 	}
 	xgene_enet_wr_ring_if(pdata, ENET_CFGSSQMIWQRESET_ADDR, pb);
 
diff --git a/drivers/net/ethernet/apm/xgene/xgene_enet_hw.h b/drivers/net/ethernet/apm/xgene/xgene_enet_hw.h
index 06e598c..e73cbb1 100644
--- a/drivers/net/ethernet/apm/xgene/xgene_enet_hw.h
+++ b/drivers/net/ethernet/apm/xgene/xgene_enet_hw.h
@@ -346,6 +346,14 @@ static inline bool xgene_enet_is_bufpool(u16 id)
 	return ((id & RING_BUFNUM_MASK) >= 0x20) ? true : false;
 }
 
+static inline u8 xgene_enet_get_fpsel(u16 id)
+{
+	if (xgene_enet_is_bufpool(id))
+		return xgene_enet_ring_bufnum(id) - RING_BUFNUM_BUFPOOL;
+
+	return 0;
+}
+
 static inline u16 xgene_enet_get_numslots(u16 id, u32 size)
 {
 	bool is_bufpool = xgene_enet_is_bufpool(id);
diff --git a/drivers/net/ethernet/apm/xgene/xgene_enet_sgmac.c b/drivers/net/ethernet/apm/xgene/xgene_enet_sgmac.c
index d12e9cb..8e4209c 100644
--- a/drivers/net/ethernet/apm/xgene/xgene_enet_sgmac.c
+++ b/drivers/net/ethernet/apm/xgene/xgene_enet_sgmac.c
@@ -501,7 +501,7 @@ static void xgene_enet_cle_bypass(struct xgene_enet_pdata *p,
 	data = CFG_CLE_BYPASS_EN0;
 	xgene_enet_wr_csr(p, cle_bypass_reg0 + offset, data);
 
-	fpsel = xgene_enet_ring_bufnum(bufpool_id) - 0x20;
+	fpsel = xgene_enet_get_fpsel(bufpool_id);
 	data = CFG_CLE_DSTQID0(dst_ring_num) | CFG_CLE_FPSEL0(fpsel);
 	xgene_enet_wr_csr(p, cle_bypass_reg1 + offset, data);
 }
@@ -509,16 +509,14 @@ static void xgene_enet_cle_bypass(struct xgene_enet_pdata *p,
 static void xgene_enet_clear(struct xgene_enet_pdata *pdata,
 			     struct xgene_enet_desc_ring *ring)
 {
-	u32 addr, val, data;
-
-	val = xgene_enet_ring_bufnum(ring->id);
+	u32 addr, data;
 
 	if (xgene_enet_is_bufpool(ring->id)) {
 		addr = ENET_CFGSSQMIFPRESET_ADDR;
-		data = BIT(val - 0x20);
+		data = BIT(xgene_enet_get_fpsel(ring->id));
 	} else {
 		addr = ENET_CFGSSQMIWQRESET_ADDR;
-		data = BIT(val);
+		data = BIT(xgene_enet_ring_bufnum(ring->id));
 	}
 
 	xgene_enet_wr_ring_if(pdata, addr, data);
@@ -528,24 +526,20 @@ static void xgene_enet_shutdown(struct xgene_enet_pdata *p)
 {
 	struct device *dev = &p->pdev->dev;
 	struct xgene_enet_desc_ring *ring;
-	u32 pb, val;
+	u32 pb;
 	int i;
 
 	pb = 0;
 	for (i = 0; i < p->rxq_cnt; i++) {
 		ring = p->rx_ring[i]->buf_pool;
-
-		val = xgene_enet_ring_bufnum(ring->id);
-		pb |= BIT(val - 0x20);
+		pb |= BIT(xgene_enet_get_fpsel(ring->id));
 	}
 	xgene_enet_wr_ring_if(p, ENET_CFGSSQMIFPRESET_ADDR, pb);
 
 	pb = 0;
 	for (i = 0; i < p->txq_cnt; i++) {
 		ring = p->tx_ring[i];
-
-		val = xgene_enet_ring_bufnum(ring->id);
-		pb |= BIT(val);
+		pb |= BIT(xgene_enet_ring_bufnum(ring->id));
 	}
 	xgene_enet_wr_ring_if(p, ENET_CFGSSQMIWQRESET_ADDR, pb);
 
diff --git a/drivers/net/ethernet/apm/xgene/xgene_enet_xgmac.c b/drivers/net/ethernet/apm/xgene/xgene_enet_xgmac.c
index d1758b0..f97e599 100644
--- a/drivers/net/ethernet/apm/xgene/xgene_enet_xgmac.c
+++ b/drivers/net/ethernet/apm/xgene/xgene_enet_xgmac.c
@@ -359,7 +359,7 @@ static void xgene_enet_xgcle_bypass(struct xgene_enet_pdata *pdata,
 	CFG_CLE_IP_PROTOCOL0_SET(&cb, 3);
 	xgene_enet_wr_csr(pdata, XCLE_BYPASS_REG0_ADDR, cb);
 
-	fpsel = xgene_enet_ring_bufnum(bufpool_id) - 0x20;
+	fpsel = xgene_enet_get_fpsel(bufpool_id);
 	xgene_enet_rd_csr(pdata, XCLE_BYPASS_REG1_ADDR, &cb);
 	CFG_CLE_DSTQID0_SET(&cb, dst_ring_num);
 	CFG_CLE_FPSEL0_SET(&cb, fpsel);
@@ -370,24 +370,20 @@ static void xgene_enet_shutdown(struct xgene_enet_pdata *pdata)
 {
 	struct device *dev = &pdata->pdev->dev;
 	struct xgene_enet_desc_ring *ring;
-	u32 pb, val;
+	u32 pb;
 	int i;
 
 	pb = 0;
 	for (i = 0; i < pdata->rxq_cnt; i++) {
 		ring = pdata->rx_ring[i]->buf_pool;
-
-		val = xgene_enet_ring_bufnum(ring->id);
-		pb |= BIT(val - 0x20);
+		pb |= BIT(xgene_enet_get_fpsel(ring->id));
 	}
 	xgene_enet_wr_ring_if(pdata, ENET_CFGSSQMIFPRESET_ADDR, pb);
 
 	pb = 0;
 	for (i = 0; i < pdata->txq_cnt; i++) {
 		ring = pdata->tx_ring[i];
-
-		val = xgene_enet_ring_bufnum(ring->id);
-		pb |= BIT(val);
+		pb |= BIT(xgene_enet_ring_bufnum(ring->id));
 	}
 	xgene_enet_wr_ring_if(pdata, ENET_CFGSSQMIWQRESET_ADDR, pb);
 
@@ -400,16 +396,14 @@ static void xgene_enet_shutdown(struct xgene_enet_pdata *pdata)
 static void xgene_enet_clear(struct xgene_enet_pdata *pdata,
 			     struct xgene_enet_desc_ring *ring)
 {
-	u32 addr, val, data;
-
-	val = xgene_enet_ring_bufnum(ring->id);
+	u32 addr, data;
 
 	if (xgene_enet_is_bufpool(ring->id)) {
 		addr = ENET_CFGSSQMIFPRESET_ADDR;
-		data = BIT(val - 0x20);
+		data = BIT(xgene_enet_get_fpsel(ring->id));
 	} else {
 		addr = ENET_CFGSSQMIWQRESET_ADDR;
-		data = BIT(val);
+		data = BIT(xgene_enet_ring_bufnum(ring->id));
 	}
 
 	xgene_enet_wr_ring_if(pdata, addr, data);
-- 
1.9.1

^ permalink raw reply related

* [PATCH net-next 0/8] drivers: net: xgene: Add Jumbo and Pause frame support
From: Iyappan Subramanian @ 2016-12-02  0:41 UTC (permalink / raw)
  To: davem, netdev; +Cc: linux-arm-kernel, patches, Iyappan Subramanian

This patch set adds,

1. Jumbo frame support
2. Pause frame based flow control

and fixes RSS for non-TCP/UDP packets.

Signed-off-by: Iyappan Subramanian <isubramanian@apm.com>
---
Iyappan Subramanian (8):
  drivers: net: xgene: Add helper function
  drivers: net: xgene: Configure classifier with pagepool
  drivers: net: xgene: Add support for Jumbo frame
  drivers: net: xgene: Add change_mtu function
  drivers: net: xgene: fix: RSS for non-TCP/UDP
  drivers: net: xgene: Add flow control configuration
  drivers: net: xgene: Add flow control initialization
  drivers: net: xgene: ethtool: Add get/set_pauseparam

 drivers/net/ethernet/apm/xgene/xgene_enet_cle.c    | 110 ++++++-
 drivers/net/ethernet/apm/xgene/xgene_enet_cle.h    |   3 +
 .../net/ethernet/apm/xgene/xgene_enet_ethtool.c    |  70 +++++
 drivers/net/ethernet/apm/xgene/xgene_enet_hw.c     | 140 ++++++++-
 drivers/net/ethernet/apm/xgene/xgene_enet_hw.h     |  27 +-
 drivers/net/ethernet/apm/xgene/xgene_enet_main.c   | 336 +++++++++++++++++++--
 drivers/net/ethernet/apm/xgene/xgene_enet_main.h   |  30 +-
 drivers/net/ethernet/apm/xgene/xgene_enet_ring2.c  |   1 +
 drivers/net/ethernet/apm/xgene/xgene_enet_sgmac.c  | 146 +++++++--
 drivers/net/ethernet/apm/xgene/xgene_enet_xgmac.c  | 121 +++++++-
 drivers/net/ethernet/apm/xgene/xgene_enet_xgmac.h  |   9 +
 11 files changed, 895 insertions(+), 98 deletions(-)

-- 
1.9.1

^ permalink raw reply

* Re: [PATCH next] arp: avoid sending ucast probes to 00:00:00:00:00:00
From: Eric Dumazet @ 2016-12-02  0:17 UTC (permalink / raw)
  To: Mahesh Bandewar (महेश बंडेवार)
  Cc: Mahesh Bandewar, netdev, Eric Dumazet, David Miller
In-Reply-To: <CAF2d9jgQFxRFxvGSZ9vaswe27TvEY2Y-D8E1J279-gXLGfO-xw@mail.gmail.com>

On Thu, 2016-12-01 at 15:47 -0800, Mahesh Bandewar (महेश बंडेवार) wrote:
> [...]
> >> @@ -371,10 +372,12 @@ static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb)
> >>
> >>       probes -= NEIGH_VAR(neigh->parms, UCAST_PROBES);
> >>       if (probes < 0) {
> >> +             memset(&null_dev_hw_addr, 0, dev->addr_len);
> >>               if (!(neigh->nud_state & NUD_VALID))
> >>                       pr_debug("trying to ucast probe in NUD_INVALID\n");
> >>               neigh_ha_snapshot(dst_ha, neigh, dev);
> >> -             dst_hw = dst_ha;
> >> +             if (memcmp(&dst_ha, &null_dev_hw_addr, dev->addr_len) != 0)
> >> +                     dst_hw = dst_ha;
> >>       } else {
> >>               probes -= NEIGH_VAR(neigh->parms, APP_PROBES);
> >>               if (probes < 0) {
> >
> > Why is this an IPv4 specific issue ?
> I think the issue is that neigh_ha_snapshot() gets neigh->ha
> unconditionally even if the neigh state is NUD_INVALID.
> 
> > What about IPv6 ?
> Well it's not ARP. The ndisc_solicit() calls ndisc_send_ns() with
> neigh parameter for unicast probe while call with NULL for the
> broadcast probe case. However it does not use this parameter in
> unicast case and probably relies on the route-entry. Hence it is not
> subjected to the same issue.

Well, it looks like the issue is in neighbour code.

Fact that IPv6 might not be impacted is not the point.



> >
> >
> > I would try something in neighbour code, maybe :
> >
> > diff --git a/net/core/neighbour.c b/net/core/neighbour.c
> > index 782dd866366554e53dda3e6c69c807ec90bd0e08..fdfb177eecb6a9b1479eedde457cb1f652d32c68 100644
> > --- a/net/core/neighbour.c
> > +++ b/net/core/neighbour.c
> > @@ -916,7 +916,10 @@ static void neigh_timer_handler(unsigned long arg)
> >                         neigh_dbg(2, "neigh %p is probed\n", neigh);
> >                         neigh->nud_state = NUD_PROBE;
> >                         neigh->updated = jiffies;
> > -                       atomic_set(&neigh->probes, 0);
> > +                       atomic_set(&neigh->probes,
> > +                                  (neigh->output == neigh_blackhole) ?
> > +                                       NEIGH_VAR(neigh->parms, UCAST_PROBES) :
> > +                                       0);
> This would work if we change the above line (in arp_solicit() code)
> from 'if (probes < 0)' to 'if (probes <= 0)'.

Then code at line 973 is wrong ?

atomic_set(&neigh->probes,
           NEIGH_VAR(neigh->parms, UCAST_PROBES));

That would be a more serious issue :)

^ permalink raw reply

* Re: [PATCH next] arp: avoid sending ucast probes to 00:00:00:00:00:00
From: Mahesh Bandewar (महेश बंडेवार) @ 2016-12-01 23:47 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: Mahesh Bandewar, netdev, Eric Dumazet, David Miller
In-Reply-To: <1480634129.18162.350.camel@edumazet-glaptop3.roam.corp.google.com>

[...]
>> @@ -371,10 +372,12 @@ static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb)
>>
>>       probes -= NEIGH_VAR(neigh->parms, UCAST_PROBES);
>>       if (probes < 0) {
>> +             memset(&null_dev_hw_addr, 0, dev->addr_len);
>>               if (!(neigh->nud_state & NUD_VALID))
>>                       pr_debug("trying to ucast probe in NUD_INVALID\n");
>>               neigh_ha_snapshot(dst_ha, neigh, dev);
>> -             dst_hw = dst_ha;
>> +             if (memcmp(&dst_ha, &null_dev_hw_addr, dev->addr_len) != 0)
>> +                     dst_hw = dst_ha;
>>       } else {
>>               probes -= NEIGH_VAR(neigh->parms, APP_PROBES);
>>               if (probes < 0) {
>
> Why is this an IPv4 specific issue ?
I think the issue is that neigh_ha_snapshot() gets neigh->ha
unconditionally even if the neigh state is NUD_INVALID.

> What about IPv6 ?
Well it's not ARP. The ndisc_solicit() calls ndisc_send_ns() with
neigh parameter for unicast probe while call with NULL for the
broadcast probe case. However it does not use this parameter in
unicast case and probably relies on the route-entry. Hence it is not
subjected to the same issue.
>
>
> I would try something in neighbour code, maybe :
>
> diff --git a/net/core/neighbour.c b/net/core/neighbour.c
> index 782dd866366554e53dda3e6c69c807ec90bd0e08..fdfb177eecb6a9b1479eedde457cb1f652d32c68 100644
> --- a/net/core/neighbour.c
> +++ b/net/core/neighbour.c
> @@ -916,7 +916,10 @@ static void neigh_timer_handler(unsigned long arg)
>                         neigh_dbg(2, "neigh %p is probed\n", neigh);
>                         neigh->nud_state = NUD_PROBE;
>                         neigh->updated = jiffies;
> -                       atomic_set(&neigh->probes, 0);
> +                       atomic_set(&neigh->probes,
> +                                  (neigh->output == neigh_blackhole) ?
> +                                       NEIGH_VAR(neigh->parms, UCAST_PROBES) :
> +                                       0);
This would work if we change the above line (in arp_solicit() code)
from 'if (probes < 0)' to 'if (probes <= 0)'.

>                         notify = 1;
>                         next = now + NEIGH_VAR(neigh->parms, RETRANS_TIME);
>                 }
>
> Thanks.
>
>

^ permalink raw reply

* Re: Initial thoughts on TXDP
From: Rick Jones @ 2016-12-02  0:04 UTC (permalink / raw)
  To: Tom Herbert; +Cc: Sowmini Varadhan, Linux Kernel Network Developers
In-Reply-To: <CALx6S34yWeU1_uGEEJvw=+a49V=vF_rgWVwpuJGzwFvwvQOkpg@mail.gmail.com>

On 12/01/2016 02:12 PM, Tom Herbert wrote:
> We have to consider both request size and response size in RPC.
> Presumably, something like a memcache server is mostly serving data as
> opposed to reading it, so we are looking at receiving much smaller
> packets than being sent. Requests are going to be quite small say 100
> bytes and unless we are doing significant amount of pipelining on
> connections GRO would rarely kick-in. Response size will have a lot of
> variability, anything from a few kilobytes up to a megabyte. I'm sorry
> I can't be more specific this is an artifact of datacenters that have
> 100s of different applications and communication patterns. Maybe 100b
> request size, 8K, 16K, 64K response sizes might be good for test.

No worries on the specific sizes, it is a classic "How long is a piece 
of string?" sort of question.

Not surprisingly, as the size of what is being received grows, so too 
the delta between GRO on and off.

stack@np-cp1-c0-m1-mgmt:~/rjones2$ HDR="-P 1"; for r in 8K 16K 64K 1M; 
do for gro in on off; do sudo ethtool -K hed0 gro ${gro}; brand="$r gro 
$gro"; ./netperf -B "$brand" -c -H np-cp1-c1-m3-mgmt -t TCP_RR $HDR -- 
-P 12867 -r 128,${r} -o result_brand,throughput,local_sd; HDR="-P 0"; 
done; done
MIGRATED TCP REQUEST/RESPONSE TEST from 0.0.0.0 (0.0.0.0) port 12867 
AF_INET to np-cp1-c1-m3-mgmt () port 12867 AF_INET : demo : first burst 0
Result Tag,Throughput,Local Service Demand
"8K gro on",9899.84,35.947
"8K gro off",7299.54,61.097
"16K gro on",8119.38,58.367
"16K gro off",5176.87,95.317
"64K gro on",4429.57,110.629
"64K gro off",2128.58,289.913
"1M gro on",887.85,918.447
"1M gro off",335.97,3427.587

So that gives a feel for by how much this alternative mechanism would 
have to reduce path-length to maintain the CPU overhead, were the 
mechanism to preclude GRO.

rick

^ permalink raw reply

* linux-next: manual merge of the wireless-drivers-next tree with the net-next tree
From: Stephen Rothwell @ 2016-12-02  0:03 UTC (permalink / raw)
  To: Kalle Valo, Wireless, David Miller, Networking
  Cc: linux-next, linux-kernel, Sara Sharon, Johannes Berg,
	Rajkumar Manoharan

Hi all,

Today's linux-next merge of the wireless-drivers-next tree got a
conflict in:

  drivers/net/wireless/ath/ath10k/mac.c

between commit:

  f3fe4e93dd63 ("mac80211: add a HW flag for supporting HW TX fragmentation")

from the net-next tree and commit:

  ff32eeb86aa1 ("ath10k: advertize hardware packet loss mechanism")

from the wireless-drivers-next tree.

I fixed it up (see below) and can carry the fix as necessary. This
is now fixed as far as linux-next is concerned, but any non trivial
conflicts should be mentioned to your upstream maintainer when your tree
is submitted for merging.  You may also want to consider cooperating
with the maintainer of the conflicting tree to minimise any particularly
complex conflicts.

-- 
Cheers,
Stephen Rothwell

diff --cc drivers/net/wireless/ath/ath10k/mac.c
index 717b2fad9a8a,db6ddf974d1d..000000000000
--- a/drivers/net/wireless/ath/ath10k/mac.c
+++ b/drivers/net/wireless/ath/ath10k/mac.c
@@@ -8005,7 -7993,7 +7993,8 @@@ int ath10k_mac_register(struct ath10k *
  	ieee80211_hw_set(ar->hw, WANT_MONITOR_VIF);
  	ieee80211_hw_set(ar->hw, CHANCTX_STA_CSA);
  	ieee80211_hw_set(ar->hw, QUEUE_CONTROL);
 +	ieee80211_hw_set(ar->hw, SUPPORTS_TX_FRAG);
+ 	ieee80211_hw_set(ar->hw, REPORTS_LOW_ACK);
  
  	if (!test_bit(ATH10K_FLAG_RAW_MODE, &ar->dev_flags))
  		ieee80211_hw_set(ar->hw, SW_CRYPTO_CONTROL);

^ permalink raw reply

* Re: Initial thoughts on TXDP
From: Tom Herbert @ 2016-12-01 23:46 UTC (permalink / raw)
  To: Hannes Frederic Sowa
  Cc: Florian Westphal, Linux Kernel Network Developers,
	Jesper Dangaard Brouer
In-Reply-To: <859a0c99-f427-1db8-d260-1297777792fb@stressinduktion.org>

On Thu, Dec 1, 2016 at 2:47 PM, Hannes Frederic Sowa
<hannes@stressinduktion.org> wrote:
> Side note:
>
> On 01.12.2016 20:51, Tom Herbert wrote:
>>> > E.g. "mini-skb": Even if we assume that this provides a speedup
>>> > (where does that come from? should make no difference if a 32 or
>>> >  320 byte buffer gets allocated).
>>> >
>> It's the zero'ing of three cache lines. I believe we talked about that
>> at netdev.
>
> Jesper and me played with that again very recently:
>
> https://github.com/netoptimizer/prototype-kernel/blob/master/kernel/lib/time_bench_memset.c#L590
>
> In micro-benchmarks we saw a pretty good speed up not using the rep
> stosb generated by gcc builtin but plain movq's. Probably the cost model
> for __builtin_memset in gcc is wrong?
>
> When Jesper is free we wanted to benchmark this and maybe come up with an
> arch specific way of cleaning if it turns out to really improve throughput.
>
> SIMD instructions seem even faster but the kernel_fpu_begin/end() kill
> all the benefits.
>
One nice direction of XDP is that it forces drivers to defer
allocating (and hence zero'ing) skbs. In the receive path I think we
can exploit this property deeper into the stack. The only time we
_really_ need to allocate an skbuff is when we need to put the packet onto a
queue. All the other use cases are really just to pass a structure
containing a packet from function to function. For that purpose we
should be able to just pass a much smaller structure in a stack
argument and only allocate an skbuff when we need to enqueue. In cases
where we don't ever queue a packet we might never need to allocate any
skbuff-- this includes pure acks, packets that end up being dropped.
But even more than that, if a received packet generates a TX packet
(like a SYN causes a SYN-ACK) then we might even be able to just
recycle the received packet and avoid needing any skbuff allocation on
transmit (XDP_TX already does this in a limited context)--  this could
be a win to handle SYN attacks for instance. Also, since we don't
queue on the socket buffer for UDP it's conceivable we could avoid
skbuffs in an expedited UDP TX path.

Currently, nearly the whole stack depends on packets always being
passed in skbuffs, however __skb_flow_dissect is an interesting
exception as it can handle packets passed in either an skbuff or by
just a void *-- so we know that this "dual mode" is at least possible.
Trying to retrain the whole stack to be able to handle both skbuffs
and raw pages is probably untenable at this point, but selectively
augmenting some critical performance functions for dual mode (ip_rcv,
tcp_rcv, udp_rcv functions for instance) might work.

Thanks,
Tom

> Bye,
> Hannes
>

^ permalink raw reply

* [PATCH 7/7] Documentation: DT: net: cpsw: remove no_bd_ram property
From: Grygorii Strashko @ 2016-12-01 23:34 UTC (permalink / raw)
  To: David S. Miller, netdev, Mugunthan V N
  Cc: Sekhar Nori, linux-kernel, linux-omap, Ivan Khoronzhuk,
	Grygorii Strashko
In-Reply-To: <20161201233432.6182-1-grygorii.strashko@ti.com>

Even though the no_bd_ram property is described in the TI CPSW bindings,
support for it has never been introduced in the CPSW driver, so there are
no real users of it. Hence, remove the no_bd_ram property.

Signed-off-by: Grygorii Strashko <grygorii.strashko@ti.com>
---
 Documentation/devicetree/bindings/net/cpsw.txt | 3 ---
 arch/arm/boot/dts/am33xx.dtsi                  | 1 -
 arch/arm/boot/dts/am4372.dtsi                  | 1 -
 arch/arm/boot/dts/dm814x.dtsi                  | 1 -
 arch/arm/boot/dts/dra7.dtsi                    | 1 -
 5 files changed, 7 deletions(-)

diff --git a/Documentation/devicetree/bindings/net/cpsw.txt b/Documentation/devicetree/bindings/net/cpsw.txt
index b99d196..4e8b673 100644
--- a/Documentation/devicetree/bindings/net/cpsw.txt
+++ b/Documentation/devicetree/bindings/net/cpsw.txt
@@ -25,7 +25,6 @@ Required properties:
 
 Optional properties:
 - ti,hwmods		: Must be "cpgmac0"
-- no_bd_ram		: Must be 0 or 1
 - dual_emac		: Specifies Switch to act as Dual EMAC
 - syscon		: Phandle to the system control device node, which is
 			  the control module device of the am33x
@@ -73,7 +72,6 @@ Examples:
 		cpdma_channels = <8>;
 		ale_entries = <1024>;
 		bd_ram_size = <0x2000>;
-		no_bd_ram = <0>;
 		rx_descs = <64>;
 		mac_control = <0x20>;
 		slaves = <2>;
@@ -102,7 +100,6 @@ Examples:
 		cpdma_channels = <8>;
 		ale_entries = <1024>;
 		bd_ram_size = <0x2000>;
-		no_bd_ram = <0>;
 		rx_descs = <64>;
 		mac_control = <0x20>;
 		slaves = <2>;
diff --git a/arch/arm/boot/dts/am33xx.dtsi b/arch/arm/boot/dts/am33xx.dtsi
index 194d884..7af5520 100644
--- a/arch/arm/boot/dts/am33xx.dtsi
+++ b/arch/arm/boot/dts/am33xx.dtsi
@@ -777,7 +777,6 @@
 			cpdma_channels = <8>;
 			ale_entries = <1024>;
 			bd_ram_size = <0x2000>;
-			no_bd_ram = <0>;
 			mac_control = <0x20>;
 			slaves = <2>;
 			active_slave = <0>;
diff --git a/arch/arm/boot/dts/am4372.dtsi b/arch/arm/boot/dts/am4372.dtsi
index a275fa9..4f651be 100644
--- a/arch/arm/boot/dts/am4372.dtsi
+++ b/arch/arm/boot/dts/am4372.dtsi
@@ -668,7 +668,6 @@
 			cpdma_channels = <8>;
 			ale_entries = <1024>;
 			bd_ram_size = <0x2000>;
-			no_bd_ram = <0>;
 			mac_control = <0x20>;
 			slaves = <2>;
 			active_slave = <0>;
diff --git a/arch/arm/boot/dts/dm814x.dtsi b/arch/arm/boot/dts/dm814x.dtsi
index ff90a6c..614a4ba 100644
--- a/arch/arm/boot/dts/dm814x.dtsi
+++ b/arch/arm/boot/dts/dm814x.dtsi
@@ -508,7 +508,6 @@
 			cpdma_channels = <8>;
 			ale_entries = <1024>;
 			bd_ram_size = <0x2000>;
-			no_bd_ram = <0>;
 			mac_control = <0x20>;
 			slaves = <2>;
 			active_slave = <0>;
diff --git a/arch/arm/boot/dts/dra7.dtsi b/arch/arm/boot/dts/dra7.dtsi
index d4fcd68..cf7325d 100644
--- a/arch/arm/boot/dts/dra7.dtsi
+++ b/arch/arm/boot/dts/dra7.dtsi
@@ -1706,7 +1706,6 @@
 			cpdma_channels = <8>;
 			ale_entries = <1024>;
 			bd_ram_size = <0x2000>;
-			no_bd_ram = <0>;
 			mac_control = <0x20>;
 			slaves = <2>;
 			active_slave = <0>;
-- 
2.10.1

^ permalink raw reply related

* [PATCH 6/7] net: ethernet: ti: cpsw: add support for descs_pool_size dt property
From: Grygorii Strashko @ 2016-12-01 23:34 UTC (permalink / raw)
  To: David S. Miller, netdev, Mugunthan V N
  Cc: Sekhar Nori, linux-kernel, linux-omap, Ivan Khoronzhuk,
	Grygorii Strashko
In-Reply-To: <20161201233432.6182-1-grygorii.strashko@ti.com>

The CPSW CPDMA can process buffer descriptors placed either in internal
CPPI RAM or in DDR. This patch adds support in CPSW and CPDMA for
descs_pool_size DT property, which defines total number of CPDMA CPPI
descriptors to be used for both ingress/egress packets processing:
 - memory size required for CPDMA descriptor pool is calculated based
on the number of descriptors specified by user in descs_pool_size and
the CPDMA descriptor size;
 - allocate CPDMA descriptor pool in DDR if pool memory size >
internal CPPI RAM or use internal CPPI RAM otherwise;
 - if descs_pool_size not specified in DT - the default value 256 will
be used which will allow to place CPDMA descriptors pool into the
internal CPPI RAM (current default behaviour);
 - CPDMA will ignore descs_pool_size if descs_pool_size = 0 for
backward compatibility with davinci_emac.

Signed-off-by: Grygorii Strashko <grygorii.strashko@ti.com>
---
 drivers/net/ethernet/ti/cpsw.c          |  5 +++++
 drivers/net/ethernet/ti/cpsw.h          |  1 +
 drivers/net/ethernet/ti/davinci_cpdma.c | 12 ++++++++++++
 drivers/net/ethernet/ti/davinci_cpdma.h |  1 +
 4 files changed, 19 insertions(+)

diff --git a/drivers/net/ethernet/ti/cpsw.c b/drivers/net/ethernet/ti/cpsw.c
index dd5d830..a98c6260 100644
--- a/drivers/net/ethernet/ti/cpsw.c
+++ b/drivers/net/ethernet/ti/cpsw.c
@@ -145,6 +145,7 @@ do {								\
 		cpsw->data.active_slave)
 #define IRQ_NUM			2
 #define CPSW_MAX_QUEUES		8
+#define CPSW_CPDMA_DESCS_POOL_SIZE_DEFAULT 256
 
 static int debug_level;
 module_param(debug_level, int, 0);
@@ -2557,6 +2558,9 @@ static int cpsw_probe_dt(struct cpsw_platform_data *data,
 	if (of_property_read_bool(node, "dual_emac"))
 		data->dual_emac = 1;
 
+	data->descs_pool_size = CPSW_CPDMA_DESCS_POOL_SIZE_DEFAULT;
+	of_property_read_u32(node, "descs_pool_size", &data->descs_pool_size);
+
 	/*
 	 * Populate all the child nodes here...
 	 */
@@ -2967,6 +2971,7 @@ static int cpsw_probe(struct platform_device *pdev)
 	dma_params.has_ext_regs		= true;
 	dma_params.desc_hw_addr         = dma_params.desc_mem_phys;
 	dma_params.bus_freq_mhz		= cpsw->bus_freq_mhz;
+	dma_params.descs_pool_size	= cpsw->data.descs_pool_size;
 
 	cpsw->dma = cpdma_ctlr_create(&dma_params);
 	if (!cpsw->dma) {
diff --git a/drivers/net/ethernet/ti/cpsw.h b/drivers/net/ethernet/ti/cpsw.h
index 16b54c6..8835d79 100644
--- a/drivers/net/ethernet/ti/cpsw.h
+++ b/drivers/net/ethernet/ti/cpsw.h
@@ -38,6 +38,7 @@ struct cpsw_platform_data {
 	u32	mac_control;	/* Mac control register */
 	u16	default_vlan;	/* Def VLAN for ALE lookup in VLAN aware mode*/
 	bool	dual_emac;	/* Enable Dual EMAC mode */
+	u32	descs_pool_size;	/* Number of Rx/Tx Descriptios */
 };
 
 void cpsw_phy_sel(struct device *dev, phy_interface_t phy_mode, int slave);
diff --git a/drivers/net/ethernet/ti/davinci_cpdma.c b/drivers/net/ethernet/ti/davinci_cpdma.c
index ba892bb..f45bb8a 100644
--- a/drivers/net/ethernet/ti/davinci_cpdma.c
+++ b/drivers/net/ethernet/ti/davinci_cpdma.c
@@ -219,6 +219,18 @@ int cpdma_desc_pool_create(struct cpdma_ctlr *ctlr)
 				cpdma_params->desc_align);
 	pool->num_desc	= pool->mem_size / pool->desc_size;
 
+	if (cpdma_params->descs_pool_size) {
+		/* recalculate memory size required cpdma descriptor pool
+		 * basing on number of descriptors specified by user and
+		 * if memory size > CPPI internal RAM size (desc_mem_size)
+		 * then switch to use DDR
+		 */
+		pool->num_desc = cpdma_params->descs_pool_size;
+		pool->mem_size = pool->desc_size * pool->num_desc;
+		if (pool->mem_size > cpdma_params->desc_mem_size)
+			cpdma_params->desc_mem_phys = 0;
+	}
+
 	pool->gen_pool = devm_gen_pool_create(ctlr->dev, ilog2(pool->desc_size),
 					      -1, "cpdma");
 	if (IS_ERR(pool->gen_pool)) {
diff --git a/drivers/net/ethernet/ti/davinci_cpdma.h b/drivers/net/ethernet/ti/davinci_cpdma.h
index 4a167db..cb45f8f 100644
--- a/drivers/net/ethernet/ti/davinci_cpdma.h
+++ b/drivers/net/ethernet/ti/davinci_cpdma.h
@@ -37,6 +37,7 @@ struct cpdma_params {
 	int			desc_mem_size;
 	int			desc_align;
 	u32			bus_freq_mhz;
+	u32			descs_pool_size;
 
 	/*
 	 * Some instances of embedded cpdma controllers have extra control and
-- 
2.10.1

^ permalink raw reply related

* [PATCH 5/7] Documentation: DT: net: cpsw: allow to specify descriptors pool size
From: Grygorii Strashko @ 2016-12-01 23:34 UTC (permalink / raw)
  To: David S. Miller, netdev, Mugunthan V N
  Cc: Sekhar Nori, linux-kernel, linux-omap, Ivan Khoronzhuk,
	Grygorii Strashko
In-Reply-To: <20161201233432.6182-1-grygorii.strashko@ti.com>

Add optional property "descs_pool_size" to specify buffer descriptor's
pool size. The "descs_pool_size" should define total number of CPDMA
CPPI descriptors to be used for both ingress/egress packets
processing. If not specified - the default value 256 will be used
which will allow to place descriptor's pool into the internal CPPI
RAM on most of TI SoC.

Signed-off-by: Grygorii Strashko <grygorii.strashko@ti.com>
---
 Documentation/devicetree/bindings/net/cpsw.txt | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/Documentation/devicetree/bindings/net/cpsw.txt b/Documentation/devicetree/bindings/net/cpsw.txt
index 5ad439f..b99d196 100644
--- a/Documentation/devicetree/bindings/net/cpsw.txt
+++ b/Documentation/devicetree/bindings/net/cpsw.txt
@@ -35,6 +35,11 @@ Optional properties:
 			  For example in dra72x-evm, pcf gpio has to be
 			  driven low so that cpsw slave 0 and phy data
 			  lines are connected via mux.
+- descs_pool_size	: total number of CPDMA CPPI descriptors to be used for
+			  both ingress/egress packets processing. if not
+			  specified the default value 256 will be used which
+			  will allow to place descriptors pool into the
+			  internal CPPI RAM.
 
 
 Slave Properties:
-- 
2.10.1

^ permalink raw reply related

* [PATCH 4/7] net: ethernet: ti: cpdma: use devm_ioremap
From: Grygorii Strashko @ 2016-12-01 23:34 UTC (permalink / raw)
  To: David S. Miller, netdev, Mugunthan V N
  Cc: Sekhar Nori, linux-kernel, linux-omap, Ivan Khoronzhuk,
	Grygorii Strashko
In-Reply-To: <20161201233432.6182-1-grygorii.strashko@ti.com>

Use devm_ioremap() and simplify the code.

Signed-off-by: Grygorii Strashko <grygorii.strashko@ti.com>
---
 drivers/net/ethernet/ti/davinci_cpdma.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/ti/davinci_cpdma.c b/drivers/net/ethernet/ti/davinci_cpdma.c
index db0a7fd..ba892bb 100644
--- a/drivers/net/ethernet/ti/davinci_cpdma.c
+++ b/drivers/net/ethernet/ti/davinci_cpdma.c
@@ -195,8 +195,6 @@ static void cpdma_desc_pool_destroy(struct cpdma_ctlr *ctlr)
 	if (pool->cpumap)
 		dma_free_coherent(ctlr->dev, pool->mem_size, pool->cpumap,
 				  pool->phys);
-	else
-		iounmap(pool->iomap);
 }
 
 /*
@@ -231,7 +229,8 @@ int cpdma_desc_pool_create(struct cpdma_ctlr *ctlr)
 
 	if (cpdma_params->desc_mem_phys) {
 		pool->phys  = cpdma_params->desc_mem_phys;
-		pool->iomap = ioremap(pool->phys, pool->mem_size);
+		pool->iomap = devm_ioremap(ctlr->dev, pool->phys,
+					   pool->mem_size);
 		pool->hw_addr = cpdma_params->desc_hw_addr;
 	} else {
 		pool->cpumap = dma_alloc_coherent(ctlr->dev,  pool->mem_size,
-- 
2.10.1

^ permalink raw reply related

* [PATCH 3/7] net: ethernet: ti: cpdma: minimize number of parameters in cpdma_desc_pool_create/destroy()
From: Grygorii Strashko @ 2016-12-01 23:34 UTC (permalink / raw)
  To: David S. Miller, netdev, Mugunthan V N
  Cc: Sekhar Nori, linux-kernel, linux-omap, Ivan Khoronzhuk,
	Grygorii Strashko
In-Reply-To: <20161201233432.6182-1-grygorii.strashko@ti.com>

Update cpdma_desc_pool_create/destroy() to accept only one parameter
struct cpdma_ctlr*, as this structure contains all required
information for pool creation/destruction.

Signed-off-by: Grygorii Strashko <grygorii.strashko@ti.com>
---
 drivers/net/ethernet/ti/davinci_cpdma.c | 66 ++++++++++++++++-----------------
 1 file changed, 33 insertions(+), 33 deletions(-)

diff --git a/drivers/net/ethernet/ti/davinci_cpdma.c b/drivers/net/ethernet/ti/davinci_cpdma.c
index 379314f..db0a7fd 100644
--- a/drivers/net/ethernet/ti/davinci_cpdma.c
+++ b/drivers/net/ethernet/ti/davinci_cpdma.c
@@ -181,8 +181,10 @@ static struct cpdma_control_info controls[] = {
 				 (directed << CPDMA_TO_PORT_SHIFT));	\
 	} while (0)
 
-static void cpdma_desc_pool_destroy(struct cpdma_desc_pool *pool)
+static void cpdma_desc_pool_destroy(struct cpdma_ctlr *ctlr)
 {
+	struct cpdma_desc_pool *pool = ctlr->pool;
+
 	if (!pool)
 		return;
 
@@ -191,7 +193,7 @@ static void cpdma_desc_pool_destroy(struct cpdma_desc_pool *pool)
 	     gen_pool_size(pool->gen_pool),
 	     gen_pool_avail(pool->gen_pool));
 	if (pool->cpumap)
-		dma_free_coherent(pool->dev, pool->mem_size, pool->cpumap,
+		dma_free_coherent(ctlr->dev, pool->mem_size, pool->cpumap,
 				  pool->phys);
 	else
 		iounmap(pool->iomap);
@@ -203,57 +205,60 @@ static void cpdma_desc_pool_destroy(struct cpdma_desc_pool *pool)
  * devices (e.g. cpsw switches) use plain old memory.  Descriptor pools
  * abstract out these details
  */
-static struct cpdma_desc_pool *
-cpdma_desc_pool_create(struct device *dev, u32 phys, dma_addr_t hw_addr,
-				int size, int align)
+int cpdma_desc_pool_create(struct cpdma_ctlr *ctlr)
 {
+	struct cpdma_params *cpdma_params = &ctlr->params;
 	struct cpdma_desc_pool *pool;
-	int ret;
+	int ret = 0;
 
-	pool = devm_kzalloc(dev, sizeof(*pool), GFP_KERNEL);
+	pool = devm_kzalloc(ctlr->dev, sizeof(*pool), GFP_KERNEL);
 	if (!pool)
 		goto gen_pool_create_fail;
+	ctlr->pool = pool;
 
-	pool->dev	= dev;
-	pool->mem_size	= size;
-	pool->desc_size	= ALIGN(sizeof(struct cpdma_desc), align);
-	pool->num_desc	= size / pool->desc_size;
+	pool->mem_size	= cpdma_params->desc_mem_size;
+	pool->desc_size	= ALIGN(sizeof(struct cpdma_desc),
+				cpdma_params->desc_align);
+	pool->num_desc	= pool->mem_size / pool->desc_size;
 
-	pool->gen_pool = devm_gen_pool_create(dev, ilog2(pool->desc_size), -1,
-					      "cpdma");
+	pool->gen_pool = devm_gen_pool_create(ctlr->dev, ilog2(pool->desc_size),
+					      -1, "cpdma");
 	if (IS_ERR(pool->gen_pool)) {
-		dev_err(dev, "pool create failed %ld\n",
-			PTR_ERR(pool->gen_pool));
+		ret = PTR_ERR(pool->gen_pool);
+		dev_err(ctlr->dev, "pool create failed %d\n", ret);
 		goto gen_pool_create_fail;
 	}
 
-	if (phys) {
-		pool->phys  = phys;
-		pool->iomap = ioremap(phys, size); /* should be memremap? */
-		pool->hw_addr = hw_addr;
+	if (cpdma_params->desc_mem_phys) {
+		pool->phys  = cpdma_params->desc_mem_phys;
+		pool->iomap = ioremap(pool->phys, pool->mem_size);
+		pool->hw_addr = cpdma_params->desc_hw_addr;
 	} else {
-		pool->cpumap = dma_alloc_coherent(dev, size, &pool->hw_addr,
-						  GFP_KERNEL);
+		pool->cpumap = dma_alloc_coherent(ctlr->dev,  pool->mem_size,
+						  &pool->hw_addr, GFP_KERNEL);
 		pool->iomap = (void __iomem __force *)pool->cpumap;
 		pool->phys = pool->hw_addr; /* assumes no IOMMU, don't use this value */
 	}
 
-	if (!pool->iomap)
+	if (!pool->iomap) {
+		ret = -ENOMEM;
 		goto gen_pool_create_fail;
+	}
 
 	ret = gen_pool_add_virt(pool->gen_pool, (unsigned long)pool->iomap,
 				pool->phys, pool->mem_size, -1);
 	if (ret < 0) {
-		dev_err(dev, "pool add failed %d\n", ret);
+		dev_err(ctlr->dev, "pool add failed %d\n", ret);
 		goto gen_pool_add_virt_fail;
 	}
 
-	return pool;
+	return 0;
 
 gen_pool_add_virt_fail:
-	cpdma_desc_pool_destroy(pool);
+	cpdma_desc_pool_destroy(ctlr);
 gen_pool_create_fail:
-	return NULL;
+	ctlr->pool = NULL;
+	return ret;
 }
 
 static inline dma_addr_t desc_phys(struct cpdma_desc_pool *pool,
@@ -502,12 +507,7 @@ struct cpdma_ctlr *cpdma_ctlr_create(struct cpdma_params *params)
 	ctlr->chan_num = 0;
 	spin_lock_init(&ctlr->lock);
 
-	ctlr->pool = cpdma_desc_pool_create(ctlr->dev,
-					    ctlr->params.desc_mem_phys,
-					    ctlr->params.desc_hw_addr,
-					    ctlr->params.desc_mem_size,
-					    ctlr->params.desc_align);
-	if (!ctlr->pool)
+	if (cpdma_desc_pool_create(ctlr))
 		return NULL;
 
 	if (WARN_ON(ctlr->num_chan > CPDMA_MAX_CHANNELS))
@@ -623,7 +623,7 @@ int cpdma_ctlr_destroy(struct cpdma_ctlr *ctlr)
 	for (i = 0; i < ARRAY_SIZE(ctlr->channels); i++)
 		cpdma_chan_destroy(ctlr->channels[i]);
 
-	cpdma_desc_pool_destroy(ctlr->pool);
+	cpdma_desc_pool_destroy(ctlr);
 	return ret;
 }
 EXPORT_SYMBOL_GPL(cpdma_ctlr_destroy);
-- 
2.10.1

^ permalink raw reply related

* [PATCH 2/7] net: ethernet: ti: cpdma: fix desc re-queuing
From: Grygorii Strashko @ 2016-12-01 23:34 UTC (permalink / raw)
  To: David S. Miller, netdev, Mugunthan V N
  Cc: Sekhar Nori, linux-kernel, linux-omap, Ivan Khoronzhuk,
	Grygorii Strashko
In-Reply-To: <20161201233432.6182-1-grygorii.strashko@ti.com>

The currently processing cpdma descriptor with EOQ flag set may
contain two values in Next Descriptor Pointer field:
- valid pointer: means CPDMA missed addition of new desc in queue;
- null: no more descriptors in queue.
In the latter case, it's not required to write to the HDP register, but
currently CPDMA does it anyway.

Hence, add additional check for Next Descriptor Pointer != null in
cpdma_chan_process() function before writing in HDP register.

Signed-off-by: Grygorii Strashko <grygorii.strashko@ti.com>
---
 drivers/net/ethernet/ti/davinci_cpdma.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/ti/davinci_cpdma.c b/drivers/net/ethernet/ti/davinci_cpdma.c
index 0924014..379314f 100644
--- a/drivers/net/ethernet/ti/davinci_cpdma.c
+++ b/drivers/net/ethernet/ti/davinci_cpdma.c
@@ -1152,7 +1152,7 @@ static int __cpdma_chan_process(struct cpdma_chan *chan)
 	chan->count--;
 	chan->stats.good_dequeue++;
 
-	if (status & CPDMA_DESC_EOQ) {
+	if ((status & CPDMA_DESC_EOQ) && chan->head) {
 		chan->stats.requeue++;
 		chan_write(chan, hdp, desc_phys(pool, chan->head));
 	}
-- 
2.10.1

^ permalink raw reply related

* [PATCH 1/7] net: ethernet: ti: cpdma: am437x: allow descs to be plased in ddr
From: Grygorii Strashko @ 2016-12-01 23:34 UTC (permalink / raw)
  To: David S. Miller, netdev, Mugunthan V N
  Cc: Sekhar Nori, linux-kernel, linux-omap, Ivan Khoronzhuk,
	Grygorii Strashko
In-Reply-To: <20161201233432.6182-1-grygorii.strashko@ti.com>

It's observed that cpsw/cpdma is not working properly when CPPI
descriptors are placed in DDR instead of internal CPPI RAM on am437x
SoC:
- rx/tx silently stops processing packets;
- or - after boot it's working for sometime, but stuck once Network
load is increased (ping is working, but iperf is not).
(The same issue has not been reproduced on am335x and am57xx).

It seems that a write to the HDP register is processed faster by the
interconnect than a write to the descriptor memory buffer in DDR, which
is probably caused by store buffer / write buffer differences, as these
functions are implemented differently across devices. So, to fix this I
came up with two changes:

1) all accesses to the channel HDP/CP/RXFREE registers should
be done using sync IO accessors readl()/writel(), because all previous
memory writes have to be completed before starting the channel
(write to HDP) or completing desc processing.

2) change 1 alone is not sufficient on am437x: an additional read of a
descriptor field is required right after the new descriptor has been
filled with data and before a pointer to it is stored in the
prev_desc->hw_next field or the HDP register.

Signed-off-by: Grygorii Strashko <grygorii.strashko@ti.com>
---
 drivers/net/ethernet/ti/davinci_cpdma.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/ti/davinci_cpdma.c b/drivers/net/ethernet/ti/davinci_cpdma.c
index c776e45..0924014 100644
--- a/drivers/net/ethernet/ti/davinci_cpdma.c
+++ b/drivers/net/ethernet/ti/davinci_cpdma.c
@@ -167,10 +167,10 @@ static struct cpdma_control_info controls[] = {
 
 /* various accessors */
 #define dma_reg_read(ctlr, ofs)		__raw_readl((ctlr)->dmaregs + (ofs))
-#define chan_read(chan, fld)		__raw_readl((chan)->fld)
+#define chan_read(chan, fld)		readl((chan)->fld)
 #define desc_read(desc, fld)		__raw_readl(&(desc)->fld)
 #define dma_reg_write(ctlr, ofs, v)	__raw_writel(v, (ctlr)->dmaregs + (ofs))
-#define chan_write(chan, fld, v)	__raw_writel(v, (chan)->fld)
+#define chan_write(chan, fld, v)	writel(v, (chan)->fld)
 #define desc_write(desc, fld, v)	__raw_writel((u32)(v), &(desc)->fld)
 
 #define cpdma_desc_to_port(chan, mode, directed)			\
@@ -1064,6 +1064,7 @@ int cpdma_chan_submit(struct cpdma_chan *chan, void *token, void *data,
 	desc_write(desc, sw_token,  token);
 	desc_write(desc, sw_buffer, buffer);
 	desc_write(desc, sw_len,    len);
+	desc_read(desc, sw_len);
 
 	__cpdma_chan_submit(chan, desc);
 
-- 
2.10.1

^ permalink raw reply related

* [PATCH 0/7]  net: ethernet: ti: cpsw: support placing CPDMA descriptors into DDR
From: Grygorii Strashko @ 2016-12-01 23:34 UTC (permalink / raw)
  To: David S. Miller, netdev, Mugunthan V N
  Cc: Sekhar Nori, linux-kernel, linux-omap, Ivan Khoronzhuk,
	Grygorii Strashko

This series is intended to add support for placing CPDMA descriptors into DDR by
introducing new DT property "descs_pool_size" to specify buffer descriptor's
pool size. The "descs_pool_size" defines total number of CPDMA
CPPI descriptors to be used for both ingress/egress packets
processing. If not specified - the default value 256 will be used
which will allow to place descriptor's pool into the internal CPPI
RAM.

This allows the UDP packet drop rate to be reduced significantly
for bandwidth >301 Mbits/sec (am57x).

Before enabling this feature, the am437x SoC has to be fixed, as it has been
shown not to work when CPDMA descriptors are placed in DDR.
Patch 1 fixes this issue.

Grygorii Strashko (7):
  net: ethernet: ti: cpdma: am437x: allow descs to be plased in ddr
  net: ethernet: ti: cpdma: fix desc re-queuing
  net: ethernet: ti: cpdma: minimize number of parameters in
    cpdma_desc_pool_create/destroy()
  net: ethernet: ti: cpdma: use devm_ioremap
  Documentation: DT: net: cpsw: allow to specify descriptors pool size
  net: ethernet: ti: cpsw: add support for descs_pool_size dt property
  Documentation: DT: net: cpsw: remove no_bd_ram property

 Documentation/devicetree/bindings/net/cpsw.txt |  8 ++-
 arch/arm/boot/dts/am33xx.dtsi                  |  1 -
 arch/arm/boot/dts/am4372.dtsi                  |  1 -
 arch/arm/boot/dts/dm814x.dtsi                  |  1 -
 arch/arm/boot/dts/dra7.dtsi                    |  1 -
 drivers/net/ethernet/ti/cpsw.c                 |  5 ++
 drivers/net/ethernet/ti/cpsw.h                 |  1 +
 drivers/net/ethernet/ti/davinci_cpdma.c        | 90 +++++++++++++++-----------
 drivers/net/ethernet/ti/davinci_cpdma.h        |  1 +
 9 files changed, 63 insertions(+), 46 deletions(-)

-- 
2.10.1

^ permalink raw reply

* Re: [patch net-next v3 11/12] mlxsw: spectrum_router: Request a dump of FIB tables during init
From: Hannes Frederic Sowa @ 2016-12-01 23:27 UTC (permalink / raw)
  To: Ido Schimmel
  Cc: Jiri Pirko, netdev, davem, idosch, eladr, yotamg, nogahf, arkadis,
	ogerlitz, roopa, dsa, nikolay, andy, vivien.didelot, andrew,
	f.fainelli, alexander.h.duyck, kaber
In-Reply-To: <20161201231445.vtx7mjxuxusar3mu@splinter>

On 02.12.2016 00:14, Ido Schimmel wrote:

[...]

>> Basically, if you delete a node right now the kernel might simply do a
>> RCU_INIT_POINTER(ptr_location, NULL), which has absolutely no barriers
>> or synchronization with the reader side. Thus you might get a callback
>> from the notifier for a delete event on the one CPU and you end up
>> queueing this fib entry after the delete queue, because the RCU walk
>> isn't protected by any means.
>>
>> Looking closer at this series again, I overlooked the fact that you
>> fetch fib_seq using a rtnl_lock and rtnl_unlock pair, which first of all
>> orders fetching of fib_seq and thus the RCU dumping after any concurrent
>> executing fib table update, also the mutex_lock and unlock provide
>> proper acquire and release fences, so the CPU indeed sees the effect of
>> a RCU_INIT_POINTER update done on another CPU, because they pair with
>> the rtnl_unlock which might happen on the other CPU.
> 
> Yep, Exactly. I had a feeling this is the issue you were referring to,
> but then you were the one to suggest the use of RTNL, so I was quite
> confused.

At that time I actually had in mind that the fib_register would happen
under the sequence lock, so I didn't look closely at the memory barrier
pairings. I kinda still consider this to be a happy accident. ;)

>> My question is if this is a bit of luck and if we should make this
>> explicit by putting the registration itself under the protection of the
>> sequence counter. I favor the additional protection, e.g. if we some day
>> actually we optimize the fib_seq code? Otherwise we might probably
>> document this fact. :)
> 
> Well, some listeners don't require a dump, but only registration
> (rocker) and in the future we might only need a dump (e.g., port being
> moved to a different net namespace). So I'm not sure if bundling both
> together is a good idea.
> 
> Maybe we can keep register_fib_notifier() as-is and add 'bool register'
> to fib_notifier_dump() so that when set, 'nb' is also registered after
> RCU walk, but before we check if the dump is consistent (unregistered if
> inconsistent)?

I really like that. Would you mind adding this?

[...]

>> Quick follow-up question: How can I quickly find out the hw limitations
>> via the kernel api?
> 
> That's a good question. Currently, you can't. However, we already have a
> mechanism in place to read device's capabilities from the firmware and
> we can (and should) expose some of them to the user. The best API for
> that would be devlink, as it can represent the entire device as opposed
> to only a port netdev like other tools.
> 
> We're also working on making the pipeline more visible to the user, so
> that it would be easier for users to understand and debug their
> networks. I believe a colleague of mine (Matty) presented this during
> the last netdev conference.

Thanks, I will look it up!

Bye,
Hannes

^ permalink raw reply

* [PATCH 3/3] netns: fix net_generic() "id - 1" bloat
From: Alexey Dobriyan @ 2016-12-02  1:21 UTC (permalink / raw)
  To: davem; +Cc: netdev, xemul

net_generic() function is both a) inline and b) used ~600 times.

It has the following code inside

		...
	ptr = ng->ptr[id - 1];
		...

"id" is never compile time constant so compiler is forced to subtract 1.
And those decrements or LEA [r32 - 1] instructions add up.

We also start id'ing from 1 to catch bugs where a pernet subsystem id
is not initialized and is 0. This is quite a pointless idea in general
(either nothing will work, or there is immediate interference with the
first registered subsystem), but it hints at what needs to be done for
code size reduction.

Namely, overlaying allocation of pointer array and fixed part of
structure in the beginning and using usual base-0 addressing.

Ids are just cookies, their exact values do not matter, so let's start
with 3 on x86_64.

Code size savings (oh boy): -4.2 KB

As usual, ignore the initial compiler stupidity part of the table.

	add/remove: 0/0 grow/shrink: 12/670 up/down: 89/-4297 (-4208)
	function                                     old     new   delta
	tipc_nametbl_insert_publ                    1250    1270     +20
	nlmclnt_lookup_host                          686     703     +17
	nfsd4_encode_fattr                          5930    5941     +11
	nfs_get_client                              1050    1061     +11
	register_pernet_operations                   333     342      +9
	tcf_mirred_init                              843     849      +6
	tcf_bpf_init                                1143    1149      +6
	gss_setup_upcall                             990     994      +4
	idmap_name_to_id                             432     434      +2
	ops_init                                     274     275      +1
	nfsd_inject_forget_client                    259     260      +1
	nfs4_alloc_client                            612     613      +1
	tunnel_key_walker                            164     163      -1

		...

	tipc_bcbase_select_primary                   392     360     -32
	mac80211_hwsim_new_radio                    2808    2767     -41
	ipip6_tunnel_ioctl                          2228    2186     -42
	tipc_bcast_rcv                               715     672     -43
	tipc_link_build_proto_msg                   1140    1089     -51
	nfsd4_lock                                  3851    3796     -55
	tipc_mon_rcv                                1012     956     -56
	Total: Before=156643951, After=156639743, chg -0.00%


Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
---

 include/net/netns/generic.h |   16 +++++++++-------
 net/core/net_namespace.c    |   20 ++++++++++++--------
 2 files changed, 21 insertions(+), 15 deletions(-)

--- a/include/net/netns/generic.h
+++ b/include/net/netns/generic.h
@@ -25,12 +25,14 @@
  */
 
 struct net_generic {
-	struct {
-		unsigned int len;
-		struct rcu_head rcu;
-	} s;
-
-	void *ptr[0];
+	union {
+		struct {
+			unsigned int len;
+			struct rcu_head rcu;
+		} s;
+
+		void *ptr[0];
+	};
 };
 
 static inline void *net_generic(const struct net *net, unsigned int id)
@@ -40,7 +42,7 @@ static inline void *net_generic(const struct net *net, unsigned int id)
 
 	rcu_read_lock();
 	ng = rcu_dereference(net->gen);
-	ptr = ng->ptr[id - 1];
+	ptr = ng->ptr[id];
 	rcu_read_unlock();
 
 	return ptr;
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -39,6 +39,9 @@ EXPORT_SYMBOL(init_net);
 
 static bool init_net_initialized;
 
+#define MIN_PERNET_OPS_ID	\
+	((sizeof(struct net_generic) + sizeof(void *) - 1) / sizeof(void *))
+
 #define INITIAL_NET_GEN_PTRS	13 /* +1 for len +2 for rcu_head */
 
 static unsigned int max_gen_ptrs = INITIAL_NET_GEN_PTRS;
@@ -46,7 +49,7 @@ static unsigned int max_gen_ptrs = INITIAL_NET_GEN_PTRS;
 static struct net_generic *net_alloc_generic(void)
 {
 	struct net_generic *ng;
-	size_t generic_size = offsetof(struct net_generic, ptr[max_gen_ptrs]);
+	unsigned int generic_size = offsetof(struct net_generic, ptr[max_gen_ptrs]);
 
 	ng = kzalloc(generic_size, GFP_KERNEL);
 	if (ng)
@@ -60,12 +63,12 @@ static int net_assign_generic(struct net *net, unsigned int id, void *data)
 	struct net_generic *ng, *old_ng;
 
 	BUG_ON(!mutex_is_locked(&net_mutex));
-	BUG_ON(id == 0);
+	BUG_ON(id < MIN_PERNET_OPS_ID);
 
 	old_ng = rcu_dereference_protected(net->gen,
 					   lockdep_is_held(&net_mutex));
-	if (old_ng->s.len >= id) {
-		old_ng->ptr[id - 1] = data;
+	if (old_ng->s.len > id) {
+		old_ng->ptr[id] = data;
 		return 0;
 	}
 
@@ -84,8 +87,9 @@ static int net_assign_generic(struct net *net, unsigned int id, void *data)
 	 * the old copy for kfree after a grace period.
 	 */
 
-	memcpy(&ng->ptr, &old_ng->ptr, old_ng->s.len * sizeof(void*));
-	ng->ptr[id - 1] = data;
+	memcpy(&ng->ptr[MIN_PERNET_OPS_ID], &old_ng->ptr[MIN_PERNET_OPS_ID],
+	       (old_ng->s.len - MIN_PERNET_OPS_ID) * sizeof(void *));
+	ng->ptr[id] = data;
 
 	rcu_assign_pointer(net->gen, ng);
 	kfree_rcu(old_ng, s.rcu);
@@ -874,7 +878,7 @@ static int register_pernet_operations(struct list_head *list,
 
 	if (ops->id) {
 again:
-		error = ida_get_new_above(&net_generic_ids, 1, ops->id);
+		error = ida_get_new_above(&net_generic_ids, MIN_PERNET_OPS_ID, ops->id);
 		if (error < 0) {
 			if (error == -EAGAIN) {
 				ida_pre_get(&net_generic_ids, GFP_KERNEL);
@@ -882,7 +886,7 @@ static int register_pernet_operations(struct list_head *list,
 			}
 			return error;
 		}
-		max_gen_ptrs = max(max_gen_ptrs, *ops->id);
+		max_gen_ptrs = max(max_gen_ptrs, *ops->id + 1);
 	}
 	error = __register_pernet_operations(list, ops);
 	if (error) {

^ permalink raw reply

* Re: [PATCH next] arp: avoid sending ucast probes to 00:00:00:00:00:00
From: Eric Dumazet @ 2016-12-01 23:15 UTC (permalink / raw)
  To: Mahesh Bandewar; +Cc: netdev, Eric Dumazet, David Miller, Mahesh Bandewar
In-Reply-To: <1480632994-12128-1-git-send-email-mahesh@bandewar.net>

On Thu, 2016-12-01 at 14:56 -0800, Mahesh Bandewar wrote:
> From: Mahesh Bandewar <maheshb@google.com>
> 
> If initial broadcast probe(s) is/are lost, the neigh entry wont have
> valid address of the neighbour. In a situation like this, the fall
> back should be to send a broadcast probe, however the code logic
> continues sending ucast probes to 00:00:00:00:00:00. The default value
> of ucast probes is 3 so system usually recovers after three such probes
> but if the value configured is larger it takes those many probes
> (a probe is sent every second in default config) / seconds to recover
> making machine not-available on the network.
> 
> This patch just ensures that the unicast address is not NULL otherwise
> falls back to sending broadcast probe.
> 
> Signed-off-by: Mahesh Bandewar <maheshb@google.com>
> ---
>  net/ipv4/arp.c | 5 ++++-
>  1 file changed, 4 insertions(+), 1 deletion(-)
> 
> diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c
> index 89a8cac4726a..56fb33d5ed31 100644
> --- a/net/ipv4/arp.c
> +++ b/net/ipv4/arp.c
> @@ -330,6 +330,7 @@ static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb)
>  {
>  	__be32 saddr = 0;
>  	u8 dst_ha[MAX_ADDR_LEN], *dst_hw = NULL;
> +	u8 null_dev_hw_addr[MAX_ADDR_LEN];
>  	struct net_device *dev = neigh->dev;
>  	__be32 target = *(__be32 *)neigh->primary_key;
>  	int probes = atomic_read(&neigh->probes);
> @@ -371,10 +372,12 @@ static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb)
>  
>  	probes -= NEIGH_VAR(neigh->parms, UCAST_PROBES);
>  	if (probes < 0) {
> +		memset(&null_dev_hw_addr, 0, dev->addr_len);
>  		if (!(neigh->nud_state & NUD_VALID))
>  			pr_debug("trying to ucast probe in NUD_INVALID\n");
>  		neigh_ha_snapshot(dst_ha, neigh, dev);
> -		dst_hw = dst_ha;
> +		if (memcmp(&dst_ha, &null_dev_hw_addr, dev->addr_len) != 0)
> +			dst_hw = dst_ha;
>  	} else {
>  		probes -= NEIGH_VAR(neigh->parms, APP_PROBES);
>  		if (probes < 0) {

Why is this an IPv4 specific issue?
What about IPv6 ?


I would try something in neighbour code, maybe :

diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index 782dd866366554e53dda3e6c69c807ec90bd0e08..fdfb177eecb6a9b1479eedde457cb1f652d32c68 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -916,7 +916,10 @@ static void neigh_timer_handler(unsigned long arg)
 			neigh_dbg(2, "neigh %p is probed\n", neigh);
 			neigh->nud_state = NUD_PROBE;
 			neigh->updated = jiffies;
-			atomic_set(&neigh->probes, 0);
+			atomic_set(&neigh->probes,
+				   (neigh->output == neigh_blackhole) ?
+					NEIGH_VAR(neigh->parms, UCAST_PROBES) :
+					0);
 			notify = 1;
 			next = now + NEIGH_VAR(neigh->parms, RETRANS_TIME);
 		}

Thanks.

^ permalink raw reply related

* [PATCH] netlink: 2-clause nla_ok()
From: Alexey Dobriyan @ 2016-12-02  0:59 UTC (permalink / raw)
  To: davem; +Cc: netdev

nla_ok() consists of 3 clauses:

	1) int rem >= (int)sizeof(struct nlattr)

	2) u16 nla_len >= sizeof(struct nlattr)

	3) u16 nla_len <= int rem

The statement is that clause (1) is redundant.

What it does is ensure that "rem" is a positive number,
so that in clause (3) positive number will be compared to positive number
with no problems.

However, "u16" fully fits into "int" and integers do not change value
when upcasting even to signed type. Negative integers will be rejected
by clause (3) just fine. Small positive integers will be rejected
by transitivity of comparison operator.

NOTE: all of the above DOES NOT apply to nlmsg_ok() where ->nlmsg_len is
u32(!), so 3 clauses AND A CAST TO INT are necessary.

Obligatory space savings report: -1.6 KB

	$ ./scripts/bloat-o-meter ../vmlinux-000* ../vmlinux-001*
	add/remove: 0/0 grow/shrink: 3/63 up/down: 35/-1692 (-1657)
	function                                     old     new   delta
	validate_scan_freqs                          142     155     +13
	tcf_em_tree_validate                         867     879     +12
	dcbnl_ieee_del                               328     338     +10
	netlbl_cipsov4_add_common.isra               218     215      -3
		...
	ovs_nla_put_actions                          888     806     -82
	netlbl_cipsov4_add_std                      1648    1566     -82
	nl80211_parse_sched_scan                    2889    2780    -109
	ip_tun_from_nlattr                          3086    2945    -141

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
---

 include/net/netlink.h |    3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

--- a/include/net/netlink.h
+++ b/include/net/netlink.h
@@ -698,8 +698,7 @@ static inline int nla_len(const struct nlattr *nla)
  */
 static inline int nla_ok(const struct nlattr *nla, int remaining)
 {
-	return remaining >= (int) sizeof(*nla) &&
-	       nla->nla_len >= sizeof(*nla) &&
+	return nla->nla_len >= sizeof(*nla) &&
 	       nla->nla_len <= remaining;
 }
 

^ permalink raw reply

* Re: [patch net-next v3 11/12] mlxsw: spectrum_router: Request a dump of FIB tables during init
From: Ido Schimmel @ 2016-12-01 23:14 UTC (permalink / raw)
  To: Hannes Frederic Sowa
  Cc: Jiri Pirko, netdev, davem, idosch, eladr, yotamg, nogahf, arkadis,
	ogerlitz, roopa, dsa, nikolay, andy, vivien.didelot, andrew,
	f.fainelli, alexander.h.duyck, kaber
In-Reply-To: <ea2a8856-59d8-1b5f-9428-b16efa8f6979@stressinduktion.org>

On Thu, Dec 01, 2016 at 10:57:52PM +0100, Hannes Frederic Sowa wrote:
> On 30.11.2016 19:22, Ido Schimmel wrote:
> > On Wed, Nov 30, 2016 at 05:49:56PM +0100, Hannes Frederic Sowa wrote:
> >> On 30.11.2016 17:32, Ido Schimmel wrote:
> >>> On Wed, Nov 30, 2016 at 04:37:48PM +0100, Hannes Frederic Sowa wrote:
> >>>> On 30.11.2016 11:09, Jiri Pirko wrote:
> >>>>> From: Ido Schimmel <idosch@mellanox.com>
> >>>>>
> >>>>> Make sure the device has a complete view of the FIB tables by invoking
> >>>>> their dump during module init.
> >>>>>
> >>>>> Signed-off-by: Ido Schimmel <idosch@mellanox.com>
> >>>>> Signed-off-by: Jiri Pirko <jiri@mellanox.com>
> >>>>> ---
> >>>>>  .../net/ethernet/mellanox/mlxsw/spectrum_router.c  | 23 ++++++++++++++++++++++
> >>>>>  1 file changed, 23 insertions(+)
> >>>>>
> >>>>> diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c
> >>>>> index 14bed1d..d176047 100644
> >>>>> --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c
> >>>>> +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c
> >>>>> @@ -2027,8 +2027,23 @@ static int mlxsw_sp_router_fib_event(struct notifier_block *nb,
> >>>>>  	return NOTIFY_DONE;
> >>>>>  }
> >>>>>  
> >>>>> +static void mlxsw_sp_router_fib_dump_flush(struct notifier_block *nb)
> >>>>> +{
> >>>>> +	struct mlxsw_sp *mlxsw_sp = container_of(nb, struct mlxsw_sp, fib_nb);
> >>>>> +
> >>>>> +	/* Flush pending FIB notifications and then flush the device's
> >>>>> +	 * table before requesting another dump. Do that with RTNL held,
> >>>>> +	 * as FIB notification block is already registered.
> >>>>> +	 */
> >>>>> +	mlxsw_core_flush_owq();
> >>>>> +	rtnl_lock();
> >>>>> +	mlxsw_sp_router_fib_flush(mlxsw_sp);
> >>>>> +	rtnl_unlock();
> >>>>> +}
> >>>>> +
> >>>>>  int mlxsw_sp_router_init(struct mlxsw_sp *mlxsw_sp)
> >>>>>  {
> >>>>> +	fib_dump_cb_t *cb = mlxsw_sp_router_fib_dump_flush;
> >>>>>  	int err;
> >>>>>  
> >>>>>  	INIT_LIST_HEAD(&mlxsw_sp->router.nexthop_neighs_list);
> >>>>> @@ -2048,8 +2063,16 @@ int mlxsw_sp_router_init(struct mlxsw_sp *mlxsw_sp)
> >>>>>  
> >>>>>  	mlxsw_sp->fib_nb.notifier_call = mlxsw_sp_router_fib_event;
> >>>>>  	register_fib_notifier(&mlxsw_sp->fib_nb);
> >>>>
> >>>> Sorry to pick in here again:
> >>>>
> >>>> There is a race here. You need to protect the registration of the fib
> >>>> notifier as well by the sequence counter. Updates here are not ordered
> >>>> in relation to this code below.
> >>>
> >>> You mean updates that can be received after you registered the notifier
> >>> and until the dump started? I'm aware of that and that's OK. This
> >>> listener should be able to handle duplicates.
> >>
> >> I am not concerned about duplicates, but about ordering deletes and
> >> getting an add from the RCU code you will add the node to hw while it is
> >> deleted in the software path. You probably will ignore the delete
> >> because nothing is installed in hw and later add the node which was
> >> actually deleted but just reordered which happend on another CPU, no?
> > 
> > Are you referring to reordering in the workqueue? We already covered
> > this using an ordered workqueue, which has one context of execution
> > system-wide.
> 
> Ups, sorry, I missed that mail. Probably read it on the mobile phone and
> it became invisible for me later on. Busy day... ;)

Yet another reason not to read emails on your phone ;)

> The reordering in the workqueue seems fine to me and also still necessary.

Correct.

> Basically, if you delete a node right now the kernel might simply do a
> RCU_INIT_POINTER(ptr_location, NULL), which has absolutely no barriers
> or synchronization with the reader side. Thus you might get a callback
> from the notifier for a delete event on the one CPU and you end up
> queueing this fib entry after the delete queue, because the RCU walk
> isn't protected by any means.
> 
> Looking closer at this series again, I overlooked the fact that you
> fetch fib_seq using a rtnl_lock and rtnl_unlock pair, which first of all
> orders fetching of fib_seq and thus the RCU dumping after any concurrent
> executing fib table update, also the mutex_lock and unlock provide
> proper acquire and release fences, so the CPU indeed sees the effect of
> a RCU_INIT_POINTER update done on another CPU, because they pair with
> the rtnl_unlock which might happen on the other CPU.

Yep, Exactly. I had a feeling this is the issue you were referring to,
but then you were the one to suggest the use of RTNL, so I was quite
confused.

> My question is if this is a bit of luck and if we should make this
> explicit by putting the registration itself under the protection of the
> sequence counter. I favor the additional protection, e.g. if we some day
> actually we optimize the fib_seq code? Otherwise we might probably
> document this fact. :)

Well, some listeners don't require a dump, but only registration
(rocker) and in the future we might only need a dump (e.g., port being
moved to a different net namespace). So I'm not sure if bundling both
together is a good idea.

Maybe we can keep register_fib_notifier() as-is and add 'bool register'
to fib_notifier_dump() so that when set, 'nb' is also registered after
RCU walk, but before we check if the dump is consistent (unregistered if
inconsistent)?

> >>> I've a follow up patchset that introduces a new event in switchdev
> >>> notification chain called SWITCHDEV_SYNC, which is sent when port
> >>> netdevs are enslaved / released  from a master device (points in time
> >>> where kernel<->device can get out of sync). It will invoke
> >>> re-propagation of configuration from different parts of the stack
> >>> (e.g. bridge driver, 8021q driver, fib/neigh code), which can result
> >>> in duplicates.
> >>
> >> Okay, understood. I wonder how we can protect against accidentally abort
> >> calls actually. E.g. if I start to inject routes into my routing domain
> >> how can I make sure the box doesn't die after I try to insert enough
> >> routes. Do we need to touch quagga etc?
> > 
> > The whole point of moving abort mechanism to the driver is that the
> > system won't die, but instead routing will be done in the kernel. If you
> > respect hardware limitations, then there's no reason for abort mechanism
> > to kick in.
> 
> Quick follow-up question: How can I quickly find out the hw limitations
> via the kernel api?

That's a good question. Currently, you can't. However, we already have a
mechanism in place to read device's capabilities from the firmware and
we can (and should) expose some of them to the user. The best API for
that would be devlink, as it can represent the entire device as opposed
to only a port netdev like other tools.

We're also working on making the pipeline more visible to the user, so
that it would be easier for users to understand and debug their
networks. I believe a colleague of mine (Matty) presented this during
the last netdev conference.

^ permalink raw reply

* [PATCH 2/3] netns: add dummy struct inside "struct net_generic"
From: Alexey Dobriyan @ 2016-12-02  1:12 UTC (permalink / raw)
  To: davem; +Cc: netdev, xemul

This is a precursor to fixing "[id - 1]" bloat inside net_generic().

Name "s" is chosen to complement name "u" often used for dummy unions.

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
---

 include/net/netns/generic.h |    6 ++++--
 net/core/net_namespace.c    |    8 ++++----
 2 files changed, 8 insertions(+), 6 deletions(-)

--- a/include/net/netns/generic.h
+++ b/include/net/netns/generic.h
@@ -25,8 +25,10 @@
  */
 
 struct net_generic {
-	unsigned int len;
-	struct rcu_head rcu;
+	struct {
+		unsigned int len;
+		struct rcu_head rcu;
+	} s;
 
 	void *ptr[0];
 };
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -50,7 +50,7 @@ static struct net_generic *net_alloc_generic(void)
 
 	ng = kzalloc(generic_size, GFP_KERNEL);
 	if (ng)
-		ng->len = max_gen_ptrs;
+		ng->s.len = max_gen_ptrs;
 
 	return ng;
 }
@@ -64,7 +64,7 @@ static int net_assign_generic(struct net *net, unsigned int id, void *data)
 
 	old_ng = rcu_dereference_protected(net->gen,
 					   lockdep_is_held(&net_mutex));
-	if (old_ng->len >= id) {
+	if (old_ng->s.len >= id) {
 		old_ng->ptr[id - 1] = data;
 		return 0;
 	}
@@ -84,11 +84,11 @@ static int net_assign_generic(struct net *net, unsigned int id, void *data)
 	 * the old copy for kfree after a grace period.
 	 */
 
-	memcpy(&ng->ptr, &old_ng->ptr, old_ng->len * sizeof(void*));
+	memcpy(&ng->ptr, &old_ng->ptr, old_ng->s.len * sizeof(void*));
 	ng->ptr[id - 1] = data;
 
 	rcu_assign_pointer(net->gen, ng);
-	kfree_rcu(old_ng, rcu);
+	kfree_rcu(old_ng, s.rcu);
 	return 0;
 }
 

^ permalink raw reply

* [PATCH 1/3] netns: publish net_generic correctly
From: Alexey Dobriyan @ 2016-12-02  1:11 UTC (permalink / raw)
  To: davem; +Cc: netdev, xemul

Publishing the net_generic pointer is done with a silly mistake: the new array is
published BEFORE setting the freshly acquired pernet subsystem pointer.

	memcpy
	rcu_assign_pointer
	kfree_rcu
	ng->ptr[id - 1] = data;

This bug was introduced with commit dec827d174d7f76c457238800183ca864a639365
("[NETNS]: The generic per-net pointers.") in the glorious days of
chopping networking stack into containers proper 8.5 years ago (whee...)

How did it not trigger for so long?
Well, you need quite a specific set of conditions:

*) race window opens once per pernet subsystem addition
   (read: modprobe or boot)

*) not every pernet subsystem is eligible (need ->id and ->size)

*) not every pernet subsystem is vulnerable (need incorrect or absence
   of ordering of register_pernet_subsys() and actually using net_generic())

*) to hide the bug even more, default is to preallocate 13 pointers which
   is actually quite a lot. You need IPv6, netfilter, bridging etc together
   loaded to trigger reallocation in the first place. Trimmed-down
   configs are OK.

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
---

 net/core/net_namespace.c |   10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -64,9 +64,10 @@ static int net_assign_generic(struct net *net, unsigned int id, void *data)
 
 	old_ng = rcu_dereference_protected(net->gen,
 					   lockdep_is_held(&net_mutex));
-	ng = old_ng;
-	if (old_ng->len >= id)
-		goto assign;
+	if (old_ng->len >= id) {
+		old_ng->ptr[id - 1] = data;
+		return 0;
+	}
 
 	ng = net_alloc_generic();
 	if (ng == NULL)
@@ -84,11 +85,10 @@ static int net_assign_generic(struct net *net, unsigned int id, void *data)
 	 */
 
 	memcpy(&ng->ptr, &old_ng->ptr, old_ng->len * sizeof(void*));
+	ng->ptr[id - 1] = data;
 
 	rcu_assign_pointer(net->gen, ng);
 	kfree_rcu(old_ng, rcu);
-assign:
-	ng->ptr[id - 1] = data;
 	return 0;
 }
 

^ permalink raw reply

* [PATCH next] arp: avoid sending ucast probes to 00:00:00:00:00:00
From: Mahesh Bandewar @ 2016-12-01 22:56 UTC (permalink / raw)
  To: netdev, Eric Dumazet, David Miller; +Cc: Mahesh Bandewar

From: Mahesh Bandewar <maheshb@google.com>

If initial broadcast probe(s) is/are lost, the neigh entry won't have
valid address of the neighbour. In a situation like this, the fall
back should be to send a broadcast probe, however the code logic
continues sending ucast probes to 00:00:00:00:00:00. The default value
of ucast probes is 3 so system usually recovers after three such probes
but if the value configured is larger it takes those many probes
(a probe is sent every second in default config) / seconds to recover
making machine not-available on the network.

This patch just ensures that the unicast address is not NULL otherwise
falls back to sending broadcast probe.

Signed-off-by: Mahesh Bandewar <maheshb@google.com>
---
 net/ipv4/arp.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c
index 89a8cac4726a..56fb33d5ed31 100644
--- a/net/ipv4/arp.c
+++ b/net/ipv4/arp.c
@@ -330,6 +330,7 @@ static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb)
 {
 	__be32 saddr = 0;
 	u8 dst_ha[MAX_ADDR_LEN], *dst_hw = NULL;
+	u8 null_dev_hw_addr[MAX_ADDR_LEN];
 	struct net_device *dev = neigh->dev;
 	__be32 target = *(__be32 *)neigh->primary_key;
 	int probes = atomic_read(&neigh->probes);
@@ -371,10 +372,12 @@ static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb)

 	probes -= NEIGH_VAR(neigh->parms, UCAST_PROBES);
 	if (probes < 0) {
+		memset(&null_dev_hw_addr, 0, dev->addr_len);
 		if (!(neigh->nud_state & NUD_VALID))
 			pr_debug("trying to ucast probe in NUD_INVALID\n");
 		neigh_ha_snapshot(dst_ha, neigh, dev);
-		dst_hw = dst_ha;
+		if (memcmp(&dst_ha, &null_dev_hw_addr, dev->addr_len) != 0)
+			dst_hw = dst_ha;
 	} else {
 		probes -= NEIGH_VAR(neigh->parms, APP_PROBES);
 		if (probes < 0) {
-- 
2.8.0.rc3.226.g39d4020

^ permalink raw reply related

* Re: Initial thoughts on TXDP
From: Hannes Frederic Sowa @ 2016-12-01 22:55 UTC (permalink / raw)
  To: Sowmini Varadhan, Tom Herbert; +Cc: Linux Kernel Network Developers
In-Reply-To: <20161201201324.GJ24547@oracle.com>

On 01.12.2016 21:13, Sowmini Varadhan wrote:
> On (12/01/16 11:05), Tom Herbert wrote:
>>
>> Polling does not necessarily imply that networking monopolizes the CPU
>> except when the CPU is otherwise idle. Presumably the application
>> drives the polling when it is ready to receive work.
> 
> I'm not grokking that- "if the cpu is idle, we want to busy-poll
> and make it 0% idle"?  Keeping CPU 0% idle has all sorts
> of issues, see slide 20 of
>  http://www.slideshare.net/shemminger/dpdk-performance
>
>>> and one other critical difference from the hot-potato-forwarding
>>> model (the sort of OVS model that DPDK etc might aruguably be a fit for)
>>> does not apply: in order to figure out the ethernet and IP headers
>>> in the response correctly at all times (in the face of things like VRRP,
>>> gw changes, gw's mac addr changes etc) the application should really
>>> be listening on NETLINK sockets for modifications to the networking
>>> state - again points to needing a select() socket set where you can
>>> have both the I/O fds and the netlink socket,
>>>
>> I would think that that is management would not be implemented in a
>> fast path processing thread for an application.
> 
> sure, but my point was that *XDP and other stack-bypass methods needs 
> to provide a select()able socket: when your use-case is not about just
> networking, you have to snoop on changes to the control plane, and update
> your data path. In the OVS case (pure networking) the OVS control plane
> updates are intrinsic to OVS. For the rest of the request/response world,
> we need a select()able socket set to do this elegantly (not really
> possible in DPDK, for example)

Busypoll on steroids is what windows does by mapping the user space
"doorbell" into a vDSO and let user space loop on that maybe with
MWAIT/MONITOR. The interesting thing is that you can map other events to
this notification event, too. It sounds like a usable idea to me and
resembles what we already do with futexes.

Bye,
Hannes

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox