Netdev List
 help / color / mirror / Atom feed
* [PATCH 4/6] net: stmmac: dwmac1000: fix define DMA_BUS_MODE_RPBL_MASK
From: Niklas Cassel @ 2016-12-02 14:11 UTC (permalink / raw)
  To: Giuseppe Cavallaro, Alexandre Torgue; +Cc: Niklas Cassel, netdev, linux-kernel
In-Reply-To: <1480687910-9690-1-git-send-email-niklass@axis.com>

From: Niklas Cassel <niklas.cassel@axis.com>

DMA_BUS_MODE_RPBL_MASK is really 6 bits,
just like DMA_BUS_MODE_PBL_MASK.

Signed-off-by: Niklas Cassel <niklas.cassel@axis.com>
---
 drivers/net/ethernet/stmicro/stmmac/dwmac1000.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac1000.h b/drivers/net/ethernet/stmicro/stmmac/dwmac1000.h
index ff3e5ab39bd0..52b9407a8a39 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac1000.h
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac1000.h
@@ -225,7 +225,7 @@ enum rx_tx_priority_ratio {
 
 #define DMA_BUS_MODE_FB		0x00010000	/* Fixed burst */
 #define DMA_BUS_MODE_MB		0x04000000	/* Mixed burst */
-#define DMA_BUS_MODE_RPBL_MASK	0x003e0000	/* Rx-Programmable Burst Len */
+#define DMA_BUS_MODE_RPBL_MASK	0x007e0000	/* Rx-Programmable Burst Len */
 #define DMA_BUS_MODE_RPBL_SHIFT	17
 #define DMA_BUS_MODE_USP	0x00800000
 #define DMA_BUS_MODE_MAXPBL	0x01000000
-- 
2.1.4

^ permalink raw reply related

* [PATCH 3/6] net: stmmac: stmmac_platform: fix parsing of DT binding
From: Niklas Cassel @ 2016-12-02 14:11 UTC (permalink / raw)
  To: Giuseppe Cavallaro, Alexandre Torgue; +Cc: Niklas Cassel, netdev, linux-kernel
In-Reply-To: <1480687910-9690-1-git-send-email-niklass@axis.com>

From: Niklas Cassel <niklas.cassel@axis.com>

commit 64c3b252e9fc ("net: stmmac: fixed the pbl setting with DT")
changed the parsing of the DT binding.

Before 64c3b252e9fc, snps,fixed-burst and snps,mixed-burst were parsed
regardless if the property snps,pbl existed or not.
After the commit, fixed burst and mixed burst are only parsed if
snps,pbl exists. Now when snps,aal has been added, it too is only
parsed if snps,pbl exists.

Since the DT binding does not specify that fixed burst, mixed burst
or aal depend on snps,pbl being specified, undo changes introduced
by 64c3b252e9fc.

The issue commit 64c3b252e9fc ("net: stmmac: fixed the pbl setting with
DT") tries to address is solved in another way:
The databook specifies that all values other than
1, 2, 4, 8, 16, or 32 results in undefined behavior,
so snps,pbl = <0> is invalid.

If pbl is 0 after parsing, set pbl to DEFAULT_DMA_PBL.
This handles the case where the property is omitted, and also handles
the case where the property is specified without any data.

Signed-off-by: Niklas Cassel <niklas.cassel@axis.com>
---
 drivers/net/ethernet/stmicro/stmmac/stmmac_main.c  |  3 +++
 .../net/ethernet/stmicro/stmmac/stmmac_platform.c  | 27 +++++++++++-----------
 2 files changed, 16 insertions(+), 14 deletions(-)

diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
index 27a03f7ee095..9ba2eb4c22fe 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
@@ -1585,6 +1585,9 @@ static int stmmac_init_dma_engine(struct stmmac_priv *priv)
 		return -EINVAL;
 	}
 
+	if (!priv->plat->dma_cfg->pbl)
+		priv->plat->dma_cfg->pbl = DEFAULT_DMA_PBL;
+
 	if (priv->extend_desc && (priv->mode == STMMAC_RING_MODE))
 		atds = 1;
 
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c
index b46c892c7079..05b8c33effd5 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c
@@ -303,21 +303,20 @@ stmmac_probe_config_dt(struct platform_device *pdev, const char **mac)
 		plat->force_sf_dma_mode = 1;
 	}
 
-	if (of_find_property(np, "snps,pbl", NULL)) {
-		dma_cfg = devm_kzalloc(&pdev->dev, sizeof(*dma_cfg),
-				       GFP_KERNEL);
-		if (!dma_cfg) {
-			of_node_put(plat->phy_node);
-			return ERR_PTR(-ENOMEM);
-		}
-		plat->dma_cfg = dma_cfg;
-		of_property_read_u32(np, "snps,pbl", &dma_cfg->pbl);
-		dma_cfg->aal = of_property_read_bool(np, "snps,aal");
-		dma_cfg->fixed_burst =
-			of_property_read_bool(np, "snps,fixed-burst");
-		dma_cfg->mixed_burst =
-			of_property_read_bool(np, "snps,mixed-burst");
+	dma_cfg = devm_kzalloc(&pdev->dev, sizeof(*dma_cfg),
+			       GFP_KERNEL);
+	if (!dma_cfg) {
+		of_node_put(plat->phy_node);
+		return ERR_PTR(-ENOMEM);
 	}
+	plat->dma_cfg = dma_cfg;
+
+	of_property_read_u32(np, "snps,pbl", &dma_cfg->pbl);
+
+	dma_cfg->aal = of_property_read_bool(np, "snps,aal");
+	dma_cfg->fixed_burst = of_property_read_bool(np, "snps,fixed-burst");
+	dma_cfg->mixed_burst = of_property_read_bool(np, "snps,mixed-burst");
+
 	plat->force_thresh_dma_mode = of_property_read_bool(np, "snps,force_thresh_dma_mode");
 	if (plat->force_thresh_dma_mode) {
 		plat->force_sf_dma_mode = 0;
-- 
2.1.4

^ permalink raw reply related

* [PATCH 5/6] net: stmmac: add support for independent DMA pbl for tx/rx
From: Niklas Cassel @ 2016-12-02 14:11 UTC (permalink / raw)
  To: Rob Herring, Mark Rutland, Jonathan Corbet, Giuseppe Cavallaro,
	Alexandre Torgue, David S. Miller, Phil Reid, Eric Engestrom,
	Joachim Eastwood, Gabriel Fernandez, Vincent Palatin
  Cc: Niklas Cassel, netdev, devicetree, linux-kernel, linux-doc
In-Reply-To: <1480687910-9690-1-git-send-email-niklass@axis.com>

From: Niklas Cassel <niklas.cassel@axis.com>

GMAC and newer supports independent programmable burst lengths for
DMA tx/rx. Add new optional devicetree properties representing this.

To be backwards compatible, snps,pbl will still be valid, but
snps,txpbl/snps,rxpbl will override the value in snps,pbl if set.

If the IP is synthesized to use the AXI interface, there is a register
and a matching DT property inside the optional stmmac-axi-config DT node
for controlling burst lengths, named snps,blen.
However, using this register, it is not possible to control tx and rx
independently. Also, this register is not available if the IP was
synthesized with, e.g., the AHB interface.

Signed-off-by: Niklas Cassel <niklas.cassel@axis.com>
---
 Documentation/devicetree/bindings/net/stmmac.txt      |  6 +++++-
 Documentation/networking/stmmac.txt                   | 19 +++++++++++++------
 drivers/net/ethernet/stmicro/stmmac/dwmac1000_dma.c   | 12 ++++++------
 drivers/net/ethernet/stmicro/stmmac/dwmac4_dma.c      | 12 +++++++-----
 drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c |  2 ++
 include/linux/stmmac.h                                |  2 ++
 6 files changed, 35 insertions(+), 18 deletions(-)

diff --git a/Documentation/devicetree/bindings/net/stmmac.txt b/Documentation/devicetree/bindings/net/stmmac.txt
index b95ff998ba73..8080038ff1b2 100644
--- a/Documentation/devicetree/bindings/net/stmmac.txt
+++ b/Documentation/devicetree/bindings/net/stmmac.txt
@@ -34,7 +34,11 @@ Optional properties:
   platforms.
 - tx-fifo-depth: See ethernet.txt file in the same directory
 - rx-fifo-depth: See ethernet.txt file in the same directory
-- snps,pbl		Programmable Burst Length
+- snps,pbl		Programmable Burst Length (tx and rx)
+- snps,txpbl		Tx Programmable Burst Length. Only for GMAC and newer.
+			If set, DMA tx will use this value rather than snps,pbl.
+- snps,rxpbl		Rx Programmable Burst Length. Only for GMAC and newer.
+			If set, DMA rx will use this value rather than snps,pbl.
 - snps,aal		Address-Aligned Beats
 - snps,fixed-burst	Program the DMA to use the fixed burst mode
 - snps,mixed-burst	Program the DMA to use the mixed burst mode
diff --git a/Documentation/networking/stmmac.txt b/Documentation/networking/stmmac.txt
index e226f8925c9e..82c8e496b4bb 100644
--- a/Documentation/networking/stmmac.txt
+++ b/Documentation/networking/stmmac.txt
@@ -154,7 +154,8 @@ Where:
    o pbl: the Programmable Burst Length is maximum number of beats to
        be transferred in one DMA transaction.
        GMAC also enables the 4xPBL by default.
-   o fixed_burst/mixed_burst/burst_len
+   o txpbl/rxpbl: GMAC and newer supports independent DMA pbl for tx/rx.
+   o fixed_burst/mixed_burst/aal
  o clk_csr: fixed CSR Clock range selection.
  o has_gmac: uses the GMAC core.
  o enh_desc: if sets the MAC will use the enhanced descriptor structure.
@@ -206,16 +207,22 @@ tuned according to the HW capabilities.
 
 struct stmmac_dma_cfg {
 	int pbl;
+	int txpbl;
+	int rxpbl;
 	int fixed_burst;
-	int burst_len_supported;
+	int mixed_burst;
+	bool aal;
 };
 
 Where:
- o pbl: Programmable Burst Length
+ o pbl: Programmable Burst Length (tx and rx)
+ o txpbl: Transmit Programmable Burst Length. Only for GMAC and newer.
+	 If set, DMA tx will use this value rather than pbl.
+ o rxpbl: Receive Programmable Burst Length. Only for GMAC and newer.
+	 If set, DMA rx will use this value rather than pbl.
  o fixed_burst: program the DMA to use the fixed burst mode
- o burst_len: this is the value we put in the register
-	      supported values are provided as macros in
-	      linux/stmmac.h header file.
+ o mixed_burst: program the DMA to use the mixed burst mode
+ o aal: Address-Aligned Beats
 
 ---
 
diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac1000_dma.c b/drivers/net/ethernet/stmicro/stmmac/dwmac1000_dma.c
index 01d0d0f315e5..1dd34fb4c1a9 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac1000_dma.c
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac1000_dma.c
@@ -87,20 +87,20 @@ static void dwmac1000_dma_init(void __iomem *ioaddr,
 			       u32 dma_tx, u32 dma_rx, int atds)
 {
 	u32 value = readl(ioaddr + DMA_BUS_MODE);
+	int txpbl = dma_cfg->txpbl ?: dma_cfg->pbl;
+	int rxpbl = dma_cfg->rxpbl ?: dma_cfg->pbl;
 
 	/*
 	 * Set the DMA PBL (Programmable Burst Length) mode.
 	 *
 	 * Note: before stmmac core 3.50 this mode bit was 4xPBL, and
 	 * post 3.5 mode bit acts as 8*PBL.
-	 *
-	 * This configuration doesn't take care about the Separate PBL
-	 * so only the bits: 13-8 are programmed with the PBL passed from the
-	 * platform.
 	 */
 	value |= DMA_BUS_MODE_MAXPBL;
-	value &= ~DMA_BUS_MODE_PBL_MASK;
-	value |= (dma_cfg->pbl << DMA_BUS_MODE_PBL_SHIFT);
+	value |= DMA_BUS_MODE_USP;
+	value &= ~(DMA_BUS_MODE_PBL_MASK | DMA_BUS_MODE_RPBL_MASK);
+	value |= (txpbl << DMA_BUS_MODE_PBL_SHIFT);
+	value |= (rxpbl << DMA_BUS_MODE_RPBL_SHIFT);
 
 	/* Set the Fixed burst mode */
 	if (dma_cfg->fixed_burst)
diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac4_dma.c b/drivers/net/ethernet/stmicro/stmmac/dwmac4_dma.c
index 0946546d6dcd..0bf47825bfeb 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac4_dma.c
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac4_dma.c
@@ -69,11 +69,14 @@ static void dwmac4_dma_axi(void __iomem *ioaddr, struct stmmac_axi *axi)
 	writel(value, ioaddr + DMA_SYS_BUS_MODE);
 }
 
-static void dwmac4_dma_init_channel(void __iomem *ioaddr, int pbl,
+static void dwmac4_dma_init_channel(void __iomem *ioaddr,
+				    struct stmmac_dma_cfg *dma_cfg,
 				    u32 dma_tx_phy, u32 dma_rx_phy,
 				    u32 channel)
 {
 	u32 value;
+	int txpbl = dma_cfg->txpbl ?: dma_cfg->pbl;
+	int rxpbl = dma_cfg->rxpbl ?: dma_cfg->pbl;
 
 	/* set PBL for each channels. Currently we affect same configuration
 	 * on each channel
@@ -83,11 +86,11 @@ static void dwmac4_dma_init_channel(void __iomem *ioaddr, int pbl,
 	writel(value, ioaddr + DMA_CHAN_CONTROL(channel));
 
 	value = readl(ioaddr + DMA_CHAN_TX_CONTROL(channel));
-	value = value | (pbl << DMA_BUS_MODE_PBL_SHIFT);
+	value = value | (txpbl << DMA_BUS_MODE_PBL_SHIFT);
 	writel(value, ioaddr + DMA_CHAN_TX_CONTROL(channel));
 
 	value = readl(ioaddr + DMA_CHAN_RX_CONTROL(channel));
-	value = value | (pbl << DMA_BUS_MODE_RPBL_SHIFT);
+	value = value | (rxpbl << DMA_BUS_MODE_RPBL_SHIFT);
 	writel(value, ioaddr + DMA_CHAN_RX_CONTROL(channel));
 
 	/* Mask interrupts by writing to CSR7 */
@@ -118,8 +121,7 @@ static void dwmac4_dma_init(void __iomem *ioaddr,
 	writel(value, ioaddr + DMA_SYS_BUS_MODE);
 
 	for (i = 0; i < DMA_CHANNEL_NB_MAX; i++)
-		dwmac4_dma_init_channel(ioaddr, dma_cfg->pbl,
-					dma_tx, dma_rx, i);
+		dwmac4_dma_init_channel(ioaddr, dma_cfg, dma_tx, dma_rx, i);
 }
 
 static void _dwmac4_dump_dma_regs(void __iomem *ioaddr, u32 channel)
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c
index 05b8c33effd5..29faff42a5ea 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c
@@ -312,6 +312,8 @@ stmmac_probe_config_dt(struct platform_device *pdev, const char **mac)
 	plat->dma_cfg = dma_cfg;
 
 	of_property_read_u32(np, "snps,pbl", &dma_cfg->pbl);
+	of_property_read_u32(np, "snps,txpbl", &dma_cfg->txpbl);
+	of_property_read_u32(np, "snps,rxpbl", &dma_cfg->rxpbl);
 
 	dma_cfg->aal = of_property_read_bool(np, "snps,aal");
 	dma_cfg->fixed_burst = of_property_read_bool(np, "snps,fixed-burst");
diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h
index 705840e0438f..c8a7c6b278da 100644
--- a/include/linux/stmmac.h
+++ b/include/linux/stmmac.h
@@ -88,6 +88,8 @@ struct stmmac_mdio_bus_data {
 
 struct stmmac_dma_cfg {
 	int pbl;
+	int txpbl;
+	int rxpbl;
 	int fixed_burst;
 	int mixed_burst;
 	bool aal;
-- 
2.1.4

^ permalink raw reply related

* [PATCH 6/6] net: smmac: allow configuring lower pbl values
From: Niklas Cassel @ 2016-12-02 14:13 UTC (permalink / raw)
  To: Rob Herring, Mark Rutland, Jonathan Corbet, Giuseppe Cavallaro,
	Alexandre Torgue, David S. Miller, Phil Reid, Eric Engestrom,
	Vincent Palatin, Andreas Färber, Gabriel Fernandez,
	Joachim Eastwood
  Cc: Niklas Cassel, netdev, devicetree, linux-kernel, linux-doc

From: Niklas Cassel <niklas.cassel@axis.com>

The driver currently always sets the PBLx8/PBLx4 bit, which means that
the pbl values configured via the pbl/txpbl/rxpbl DT properties are
always multiplied by 8/4 in the hardware.

In order to allow the DT to configure lower pbl values, while at the
same time not changing behavior of any existing device trees using the
pbl/txpbl/rxpbl settings, add a property to disable the multiplication
of the pbl by 8/4 in the hardware.

Suggested-by: Rabin Vincent <rabinv@axis.com>
Signed-off-by: Niklas Cassel <niklas.cassel@axis.com>
---
 Documentation/devicetree/bindings/net/stmmac.txt      | 2 ++
 Documentation/networking/stmmac.txt                   | 5 ++++-
 drivers/net/ethernet/stmicro/stmmac/dwmac1000_dma.c   | 3 ++-
 drivers/net/ethernet/stmicro/stmmac/dwmac4_dma.c      | 3 ++-
 drivers/net/ethernet/stmicro/stmmac/stmmac_pci.c      | 2 ++
 drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c | 1 +
 include/linux/stmmac.h                                | 1 +
 7 files changed, 14 insertions(+), 3 deletions(-)

diff --git a/Documentation/devicetree/bindings/net/stmmac.txt b/Documentation/devicetree/bindings/net/stmmac.txt
index 8080038ff1b2..128da752fec9 100644
--- a/Documentation/devicetree/bindings/net/stmmac.txt
+++ b/Documentation/devicetree/bindings/net/stmmac.txt
@@ -39,6 +39,8 @@ Optional properties:
 			If set, DMA tx will use this value rather than snps,pbl.
 - snps,rxpbl		Rx Programmable Burst Length. Only for GMAC and newer.
 			If set, DMA rx will use this value rather than snps,pbl.
+- snps,no-pbl-x8	Don't multiply the pbl/txpbl/rxpbl values by 8.
+			For core rev < 3.50, don't multiply the values by 4.
 - snps,aal		Address-Aligned Beats
 - snps,fixed-burst	Program the DMA to use the fixed burst mode
 - snps,mixed-burst	Program the DMA to use the mixed burst mode
diff --git a/Documentation/networking/stmmac.txt b/Documentation/networking/stmmac.txt
index 82c8e496b4bb..d3376c5fbcf0 100644
--- a/Documentation/networking/stmmac.txt
+++ b/Documentation/networking/stmmac.txt
@@ -153,8 +153,9 @@ Where:
  o dma_cfg: internal DMA parameters
    o pbl: the Programmable Burst Length is maximum number of beats to
        be transferred in one DMA transaction.
-       GMAC also enables the 4xPBL by default.
+       GMAC also enables the 4xPBL by default. (8xPBL for GMAC 3.50 and newer)
    o txpbl/rxpbl: GMAC and newer supports independent DMA pbl for tx/rx.
+   o pblx8: Enable 8xPBL (4xPBL for core rev < 3.50). Enabled by default.
    o fixed_burst/mixed_burst/aal
  o clk_csr: fixed CSR Clock range selection.
  o has_gmac: uses the GMAC core.
@@ -209,6 +210,7 @@ struct stmmac_dma_cfg {
 	int pbl;
 	int txpbl;
 	int rxpbl;
+	bool pblx8;
 	int fixed_burst;
 	int mixed_burst;
 	bool aal;
@@ -220,6 +222,7 @@ Where:
 	 If set, DMA tx will use this value rather than pbl.
  o rxpbl: Receive Programmable Burst Length. Only for GMAC and newer.
 	 If set, DMA rx will use this value rather than pbl.
+ o pblx8: Enable 8xPBL (4xPBL for core rev < 3.50). Enabled by default.
  o fixed_burst: program the DMA to use the fixed burst mode
  o mixed_burst: program the DMA to use the mixed burst mode
  o aal: Address-Aligned Beats
diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac1000_dma.c b/drivers/net/ethernet/stmicro/stmmac/dwmac1000_dma.c
index 1dd34fb4c1a9..1d313af647b4 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac1000_dma.c
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac1000_dma.c
@@ -96,7 +96,8 @@ static void dwmac1000_dma_init(void __iomem *ioaddr,
 	 * Note: before stmmac core 3.50 this mode bit was 4xPBL, and
 	 * post 3.5 mode bit acts as 8*PBL.
 	 */
-	value |= DMA_BUS_MODE_MAXPBL;
+	if (dma_cfg->pblx8)
+		value |= DMA_BUS_MODE_MAXPBL;
 	value |= DMA_BUS_MODE_USP;
 	value &= ~(DMA_BUS_MODE_PBL_MASK | DMA_BUS_MODE_RPBL_MASK);
 	value |= (txpbl << DMA_BUS_MODE_PBL_SHIFT);
diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac4_dma.c b/drivers/net/ethernet/stmicro/stmmac/dwmac4_dma.c
index 0bf47825bfeb..0f7110d19a4a 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac4_dma.c
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac4_dma.c
@@ -82,7 +82,8 @@ static void dwmac4_dma_init_channel(void __iomem *ioaddr,
 	 * on each channel
 	 */
 	value = readl(ioaddr + DMA_CHAN_CONTROL(channel));
-	value = value | DMA_BUS_MODE_PBL;
+	if (dma_cfg->pblx8)
+		value = value | DMA_BUS_MODE_PBL;
 	writel(value, ioaddr + DMA_CHAN_CONTROL(channel));
 
 	value = readl(ioaddr + DMA_CHAN_TX_CONTROL(channel));
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_pci.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_pci.c
index 56c8a2342c14..a2831773431a 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_pci.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_pci.c
@@ -81,6 +81,7 @@ static void stmmac_default_data(struct plat_stmmacenet_data *plat)
 	plat->mdio_bus_data->phy_mask = 0;
 
 	plat->dma_cfg->pbl = 32;
+	plat->dma_cfg->pblx8 = true;
 	/* TODO: AXI */
 
 	/* Set default value for multicast hash bins */
@@ -115,6 +116,7 @@ static int quark_default_data(struct plat_stmmacenet_data *plat,
 	plat->mdio_bus_data->phy_mask = 0;
 
 	plat->dma_cfg->pbl = 16;
+	plat->dma_cfg->pblx8 = true;
 	plat->dma_cfg->fixed_burst = 1;
 	/* AXI (TODO) */
 
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c
index 29faff42a5ea..3a713862f0a3 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c
@@ -314,6 +314,7 @@ stmmac_probe_config_dt(struct platform_device *pdev, const char **mac)
 	of_property_read_u32(np, "snps,pbl", &dma_cfg->pbl);
 	of_property_read_u32(np, "snps,txpbl", &dma_cfg->txpbl);
 	of_property_read_u32(np, "snps,rxpbl", &dma_cfg->rxpbl);
+	dma_cfg->pblx8 = !of_property_read_bool(np, "snps,no-pbl-x8");
 
 	dma_cfg->aal = of_property_read_bool(np, "snps,aal");
 	dma_cfg->fixed_burst = of_property_read_bool(np, "snps,fixed-burst");
diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h
index c8a7c6b278da..05a8243017b0 100644
--- a/include/linux/stmmac.h
+++ b/include/linux/stmmac.h
@@ -90,6 +90,7 @@ struct stmmac_dma_cfg {
 	int pbl;
 	int txpbl;
 	int rxpbl;
+	bool pblx8;
 	int fixed_burst;
 	int mixed_burst;
 	bool aal;
-- 
2.1.4

^ permalink raw reply related

* [PATCH] net: phy: dp83848: Support ethernet pause frames
From: Jesper Nilsson @ 2016-12-02 14:22 UTC (permalink / raw)
  To: Florian Fainelli
  Cc: Roger Quadros, Andrew F. Davis, David S. Miller, Dan Murphy,
	netdev, linux-kernel

According to the documentation, the PHYs supported by this driver
can also support pause frames. Announce this to be so.
Tested with a TI83822I.

Signed-off-by: Jesper Nilsson <jesper.nilsson@axis.com>
---
 drivers/net/phy/dp83848.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/net/phy/dp83848.c b/drivers/net/phy/dp83848.c
index 800b39f..6e4117f 100644
--- a/drivers/net/phy/dp83848.c
+++ b/drivers/net/phy/dp83848.c
@@ -88,7 +88,8 @@ MODULE_DEVICE_TABLE(mdio, dp83848_tbl);
 		.phy_id		= _id,				\
 		.phy_id_mask	= 0xfffffff0,			\
 		.name		= _name,			\
-		.features	= PHY_BASIC_FEATURES,		\
+		.features	= (PHY_BASIC_FEATURES |		\
+			SUPPORTED_Pause | SUPPORTED_Asym_Pause),\
 		.flags		= PHY_HAS_INTERRUPT,		\
 								\
 		.soft_reset	= genphy_soft_reset,		\
-- 
2.1.4


/^JN - Jesper Nilsson
-- 
               Jesper Nilsson -- jesper.nilsson@axis.com

^ permalink raw reply related

* Re: [WIP] net+mlx4: auto doorbell
From: Eric Dumazet @ 2016-12-02 14:23 UTC (permalink / raw)
  To: Jesper Dangaard Brouer
  Cc: Saeed Mahameed, Rick Jones, Linux Netdev List, Saeed Mahameed,
	Tariq Toukan
In-Reply-To: <1480611857.18162.319.camel@edumazet-glaptop3.roam.corp.google.com>

On Thu, 2016-12-01 at 09:04 -0800, Eric Dumazet wrote:
> On Thu, 2016-12-01 at 17:04 +0100, Jesper Dangaard Brouer wrote:
> 
> > I think you misunderstood my concept[1].  I don't want to stop the
> > queue. The new __QUEUE_STATE_FLUSH_NEEDED does not stop the queue, is
> > it just indicating that someone need to flush/ring-doorbell.  Maybe it
> > need another name, because it also indicate that the driver can see
> > that its TX queue is so busy that we don't need to call it immediately.
> > The qdisc layer can then choose to enqueue instead if doing direct xmit.
> 
> But driver ndo_start_xmit() does not have a pointer to qdisc.
> 
> Also the concept of 'queue busy' just because we queued one packet is a
> bit flaky.
> 
> > 
> > When qdisc layer or trafgen/af_packet see this indication it knows it
> > should/must flush the queue when it don't have more work left.  Perhaps
> > through net_tx_action(), by registering itself and e.g. if qdisc_run()
> > is called and queue is empty then check if queue needs a flush. I would
> > also allow driver to flush and clear this bit.
> 
> net_tx_action() is not normally called, unless BQL limit is hit and/or
> some qdiscs with throttling (HTB, TBF, FQ, ...)
> 
> > 
> > I just see it as an extension of your solution, as we still need the
> > driver to figure out then the doorbell/flush can be delayed.
> > p.s. don't be discouraged by this feedback, I'm just very excited and
> > happy that your are working on a solution in this area. As this is a
> > problem area that I've not been able to solve myself for the last
> > approx 2 years. Keep up the good work!
> 
> Do not worry, I appreciate the feedbacks ;)
> 
> BTW, if you are doing tests on mlx4 40Gbit, would you check the
> following quick/dirty hack, using lots of low-rate flows ?
> 
> mlx4 has really hard time to transmit small TSO packets (2 or 3 MSS)
> 
> diff --git a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
> index 12ea3405f442..96940666abd3 100644
> --- a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
> +++ b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
> @@ -2631,6 +2631,11 @@ static void mlx4_en_del_vxlan_port(struct  net_device *dev,
>         queue_work(priv->mdev->workqueue, &priv->vxlan_del_task);
>  }
>  
> +static int mlx4_gso_segs_min = 4; /* TSO packets with less than 4 segments are segmented */
> +module_param_named(mlx4_gso_segs_min, mlx4_gso_segs_min, uint, 0644);
> +MODULE_PARM_DESC(mlx4_gso_segs_min, "threshold for software segmentation of small TSO packets");
> +
> +
>  static netdev_features_t mlx4_en_features_check(struct sk_buff *skb,
>                                                 struct net_device *dev,
>                                                 netdev_features_t features)
> @@ -2651,6 +2656,8 @@ static netdev_features_t mlx4_en_features_check(struct sk_buff *skb,
>                     (udp_hdr(skb)->dest != priv->vxlan_port))
>                         features &= ~(NETIF_F_CSUM_MASK | NETIF_F_GSO_MASK);
>         }
> +       if (skb_is_gso(skb) && skb_shinfo(skb)->gso_segs < mlx4_gso_segs_min)
> +               features &= NETIF_F_GSO_MASK;


Sorry, stupid typo.
This should be "features &= ~NETIF_F_GSO_MASK;" of course

>  
>         return features;
>  }
> 
> 
> 
> 

^ permalink raw reply

* Re: stmmac ethernet in kernel 4.9-rc6: coalescing related pauses.
From: Alexandre Torgue @ 2016-12-02 14:26 UTC (permalink / raw)
  To: Giuseppe CAVALLARO, Pavel Machek; +Cc: David Miller, netdev, linux-kernel
In-Reply-To: <c4748c8f-ae4b-9bbc-7d20-9c7b7ce433c8@st.com>

Hi Pavel and Peppe,

On 12/02/2016 02:51 PM, Giuseppe CAVALLARO wrote:
> On 12/2/2016 1:32 PM, Pavel Machek wrote:
>> Hi!
>>
>>>> Well, if you have a workload that sends and receive packets, it tends
>>>> to work ok, as you do tx_clean() in stmmac_poll(). My workload is not
>>>> like that -- it is "sending packets at 3MB/sec, receiving none". So
>>>> the stmmac_tx_timer() is rescheduled and rescheduled and rescheduled,
>>>> and then we run out of transmit descriptors, and then 40msec passes,
>>>> and then we clean them. Bad.
>>>>
>>>> And that's why low-res timers do not cut it.
>>>
>>> in that case, I expect that the tuning of the driver could help you.
>>> I mean, by using ethtool, it could be enough to set the IC bit on all
>>> the descriptors. You should touch the tx_coal_frames.
>>>
>>> Then you can use ethtool -S to monitor the status.
>>
>> Yes, I did something similar. Unfortnunately that meant crash within
>> minutes, at least with 4.4 kernel. (If you know what was fixed between
>> 4.4 and 4.9, that would be helpful).
>
> 4.4 has no GMAC4 support.
> Alex, do you remember any patches to fix that?

No sorry Peppe.

Pavel,

Sorry but I'm a little bit confused. I'm dropped in some mails without 
historic. I see cleanup, coalescence issue and TSO question.
What is your main issue? Are you working on gmac4 or 3.x ?
Can you refresh a little bit the story please ?

Regards
Alex
>
>>> We had experimented this tuning on STB IP where just datagrams
>>> had to send externally. To be honest, although we had seen
>>> better results w/o any timer, we kept this approach enabled
>>> because the timer was fast enough to cover our tests on SH4 boxes.
>>
>> Please reply to David, and explain how it is supposed to
>> work... because right now it does not. 40 msec delays are not
>> acceptable in default configuration.
>
> I mean, that on UP and SMP system this schema helped
> to improve the performance saving CPU on my side and this has been
> tested since a long time (~4 years).
> I tested something similar to yours where unidirectional traffic
> with limited throughput was needed and I can confirm you that
> tuning/removing coalesce parameters this helped. The tuning I decided
> to keep in the driver was suitable in several user cases and if now
> you have problems or you want to review it I can just confirm that
> there are no problems on my side. If you want to simply the logic
> around the tx process and remove timer on official driver I can accept
> that. I will just ask you uto double check if the throughput and
> CPU usage when request max throughput (better if on GiGa setup) has
> no regressions.
> Otherwise we could start thinking about adaptive schema if feasible.
>
>>>>> In the ring, some descriptors can raise the irq (according to a
>>>>> threshold) and set the IC bit. In this path, the NAPI  poll will be
>>>>> scheduled.
>>>>
>>>> Not NAPI poll but stmmac_tx_timer(), right?
>>>
>>> in the xmit according the the threshold the timer is started or the
>>> interrupt is set inside the descriptor.
>>> Then stmmac_tx_clean will be always called and, if you see the flow,
>>> no irqlock protection is needed!
>>
>> Agreed that no irqlock protection is needed if we rely on napi and
>> timers.
>
> ok
>
>>
>>>>> Concerning the lock protection, we had reviewed long time ago and
>>>>> IIRC, no raise condition should be present. Open to review it,
>>>>> again!
>> ...
>>>> There's nothing that protect stmmac_poll() from running concurently
>>>> with stmmac_dma_interrupt(), right?
>>>
>>> This is not necessary.
>>
>> dma_interrupt accesses shared priv->xstats; variables are of type
>> unsigned long (not atomic_t), yet they are accesssed from interrupt
>> context and from stmmac_ethtool without any locking. That can result
>> in broken statistics AFAICT.
>
> ok we can check this and welcome patches and I'd prefer to
> remove xstats from critical part of the code like ISR (that
> comes from old story of the driver).
>
>>
>> Please take another look. As far as I can tell, you can have two cpus
>> at #1 and #2 in the code, at the same time. It looks like napi_... has
>> some atomic opertions inside so that looks safe at the first look. But
>> I'm not sure if they also include enough memory barriers to make it
>> safe...?
>
> Although I have never reproduced related issues on SMP platforms
> due to reordering of memory operations but, as said above, welcome
> review on this especially if you are seeing problems when manage NAPI.
>
> FYI, the only memory barrier you will see in the driver are about the
> OWN_BIT setting till now.
>
>> static void stmmac_dma_interrupt(struct stmmac_priv *priv)
>> {
>> ...
>>         status = priv->hw->dma->dma_interrupt(priv->ioaddr,
>> &priv->xstats);
>>         if (likely((status & handle_rx)) || (status & handle_tx)) {
>>                 if (likely(napi_schedule_prep(&priv->napi))) {
>> #1
>>                         stmmac_disable_dma_irq(priv);
>>                         __napi_schedule(&priv->napi);
>>                 }
>>         }
>>
>>
>> static int stmmac_poll(struct napi_struct *napi, int budget)
>> {
>>         struct stmmac_priv *priv = container_of(napi, struct
>> stmmac_priv, napi);
>>         int work_done = 0;
>>
>>         priv->xstats.napi_poll++;
>>         stmmac_tx_clean(priv);
>>
>>         work_done = stmmac_rx(priv, budget);
>>     if (work_done < budget) {
>>                 napi_complete(napi);
>> #2
>>             stmmac_enable_dma_irq(priv);
>>         }
>
> hmm, I have to check (and refresh my memory) but the driver
> uses the napi_schedule_prep.
>
> Regards
>
> Peppe
>
>>         return work_done;
>> }
>>
>>
>> Best regards,
>>                                     Pavel
>>
>

^ permalink raw reply

* Re: [PATCH] net: phy: dp83848: Support ethernet pause frames
From: Andrew F. Davis @ 2016-12-02 14:35 UTC (permalink / raw)
  To: Jesper Nilsson, Florian Fainelli
  Cc: Roger Quadros, David S. Miller, Dan Murphy, netdev, linux-kernel
In-Reply-To: <20161202142203.GY19016@axis.com>

On 12/02/2016 08:22 AM, Jesper Nilsson wrote:
> According to the documentation, the PHYs supported by this driver
> can also support pause frames. Announce this to be so.
> Tested with a TI83822I.
> 

Looks like all PHYs supported by this driver do, so:

Acked-by: Andrew F. Davis <afd@ti.com>

> Signed-off-by: Jesper Nilsson <jesper.nilsson@axis.com>
> ---
>  drivers/net/phy/dp83848.c | 3 ++-
>  1 file changed, 2 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/net/phy/dp83848.c b/drivers/net/phy/dp83848.c
> index 800b39f..6e4117f 100644
> --- a/drivers/net/phy/dp83848.c
> +++ b/drivers/net/phy/dp83848.c
> @@ -88,7 +88,8 @@ MODULE_DEVICE_TABLE(mdio, dp83848_tbl);
>  		.phy_id		= _id,				\
>  		.phy_id_mask	= 0xfffffff0,			\
>  		.name		= _name,			\
> -		.features	= PHY_BASIC_FEATURES,		\
> +		.features	= (PHY_BASIC_FEATURES |		\
> +			SUPPORTED_Pause | SUPPORTED_Asym_Pause),\

Aligning these may look nicer though.

>  		.flags		= PHY_HAS_INTERRUPT,		\
>  								\
>  		.soft_reset	= genphy_soft_reset,		\
> 

^ permalink raw reply

* Re: Initial thoughts on TXDP
From: Edward Cree @ 2016-12-02 14:36 UTC (permalink / raw)
  To: Tom Herbert, Hannes Frederic Sowa
  Cc: Florian Westphal, Linux Kernel Network Developers,
	Jesper Dangaard Brouer
In-Reply-To: <CALx6S350eCYS63dGiR+X+nVvqF_uGJop9Z_m7SmZf9QXr-rrfg@mail.gmail.com>

On 01/12/16 23:46, Tom Herbert wrote:
> The only time we
> _really_ to allocate an skbuf is when we need to put the packet onto a
> queue. All the other use cases are really just to pass a structure
> containing a packet from function to function. For that purpose we
> should be able to just pass a much smaller structure in a stack
> argument and only allocate an skbuff when we need to enqueue. In cases
> where we don't ever queue a packet we might never need to allocate any
> skbuff
Now this intrigues me, because one of the objections to bundling (vs GRO)
was the memory usage of all those SKBs.  IIRC we already do a 'GRO-like'
coalescing when packets reach a TCP socket anyway (or at least in some
cases, not sure if all the different ways we can enqueue a TCP packet for
RX do it), but if we could carry the packets from NIC to socket without
SKBs, doing so in lists rather than one-at-a-time wouldn't cost any extra
memory (the packet-pages are all already allocated on the NIC RX ring).
Possibly combine the two, so that rather than having potentially four
versions of each function (skb, skbundle, void*, void* bundle) you just
have the two 'ends'.

-Ed

^ permalink raw reply

* Re: [PATCH net v2] tcp: warn on bogus MSS and try to amend it
From: Eric Dumazet @ 2016-12-02 14:45 UTC (permalink / raw)
  To: Marcelo Ricardo Leitner
  Cc: netdev, Jon Maxwell, Alex Sidorenko, Alexey Kuznetsov,
	James Morris, Hideaki YOSHIFUJI, Patrick McHardy, tlfalcon,
	Brian King, davem
In-Reply-To: <ef61092947105261f8321a67f726a7fdf9b3c14e.1480675556.git.marcelo.leitner@gmail.com>

On Fri, 2016-12-02 at 08:55 -0200, Marcelo Ricardo Leitner wrote:
> There have been some reports lately about TCP connection stalls caused
> by NIC drivers that aren't setting gso_size on aggregated packets on rx
> path. This causes TCP to assume that the MSS is actually the size of the
> aggregated packet, which is invalid.
> 
> Although the proper fix is to be done at each driver, it's often hard
> and cumbersome for one to debug, come to such root cause and report/fix
> it.
> 
> This patch amends this situation in two ways. First, it adds a warning
> on when this situation occurs, so it gives a hint to those trying to
> debug this. It also limit the maximum probed MSS to the adverised MSS,
> as it should never be any higher than that.
> 
> The result is that the connection may not have the best performance ever
> but it shouldn't stall, and the admin will have a hint on what to look
> for.
> 
> Tested with virtio by forcing gso_size to 0.
> 
> Cc: Jonathan Maxwell <jmaxwell37@gmail.com>
> Signed-off-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
> ---
> v2: Updated msg as suggested by David.
> 
>  net/ipv4/tcp_input.c | 5 ++++-
>  1 file changed, 4 insertions(+), 1 deletion(-)
> 
> diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
> index a27b9c0e27c08b4e4aeaff3d0bfdf3ae561ba4d8..fd619eb93749b6de56a41669248b337c051d9fe2 100644
> --- a/net/ipv4/tcp_input.c
> +++ b/net/ipv4/tcp_input.c
> @@ -144,7 +144,10 @@ static void tcp_measure_rcv_mss(struct sock *sk, const struct sk_buff *skb)
>  	 */
>  	len = skb_shinfo(skb)->gso_size ? : skb->len;
>  	if (len >= icsk->icsk_ack.rcv_mss) {
> -		icsk->icsk_ack.rcv_mss = len;
> +		icsk->icsk_ack.rcv_mss = min_t(unsigned int, len,
> +					       tcp_sk(sk)->advmss);
> +		if (icsk->icsk_ack.rcv_mss != len)
> +			pr_warn_once("Driver has suspect GRO implementation, TCP performance may be compromised.\n");
>  	} else {
>  		/* Otherwise, we make more careful check taking into account,
>  		 * that SACKs block is variable.


skb->dev is indeed NULL, but it might be worth getting back the device
using skb->skb_iif maybe ?

^ permalink raw reply

* Re: [PATCH] net: phy: dp83848: Support ethernet pause frames
From: Jesper Nilsson @ 2016-12-02 14:52 UTC (permalink / raw)
  To: Andrew F. Davis
  Cc: Jesper Nilsson, Florian Fainelli, Roger Quadros, David S. Miller,
	Dan Murphy, netdev, linux-kernel
In-Reply-To: <80ee8058-7d6b-ea88-7ec6-20daf50a1fce@ti.com>

On Fri, Dec 02, 2016 at 08:35:23AM -0600, Andrew F. Davis wrote:
> On 12/02/2016 08:22 AM, Jesper Nilsson wrote:
> > According to the documentation, the PHYs supported by this driver
> > can also support pause frames. Announce this to be so.
> > Tested with a TI83822I.
> > 
> 
> Looks like all PHYs supported by this driver do, so:
> 
> Acked-by: Andrew F. Davis <afd@ti.com>
> 
> > Signed-off-by: Jesper Nilsson <jesper.nilsson@axis.com>
> > ---
> >  drivers/net/phy/dp83848.c | 3 ++-
> >  1 file changed, 2 insertions(+), 1 deletion(-)
> > 
> > diff --git a/drivers/net/phy/dp83848.c b/drivers/net/phy/dp83848.c
> > index 800b39f..6e4117f 100644
> > --- a/drivers/net/phy/dp83848.c
> > +++ b/drivers/net/phy/dp83848.c
> > @@ -88,7 +88,8 @@ MODULE_DEVICE_TABLE(mdio, dp83848_tbl);
> >  		.phy_id		= _id,				\
> >  		.phy_id_mask	= 0xfffffff0,			\
> >  		.name		= _name,			\
> > -		.features	= PHY_BASIC_FEATURES,		\
> > +		.features	= (PHY_BASIC_FEATURES |		\
> > +			SUPPORTED_Pause | SUPPORTED_Asym_Pause),\
> 
> Aligning these may look nicer though.

Agreed, will send a v2.

> >  		.flags		= PHY_HAS_INTERRUPT,		\
> >  								\
> >  		.soft_reset	= genphy_soft_reset,		\
> > 

/^JN - Jesper Nilsson
-- 
               Jesper Nilsson -- jesper.nilsson@axis.com

^ permalink raw reply

* [PATCH v2] net: phy: dp83848: Support ethernet pause frames
From: Jesper Nilsson @ 2016-12-02 14:57 UTC (permalink / raw)
  To: Florian Fainelli
  Cc: Roger Quadros, Andrew F. Davis, David S. Miller, Dan Murphy,
	netdev, linux-kernel
In-Reply-To: <20161202142203.GY19016@axis.com>

According to the documentation, the PHYs supported by this driver
can also support pause frames. Announce this to be so.
Tested with a TI83822I.

Acked-by: Andrew F. Davis <afd@ti.com>
Signed-off-by: Jesper Nilsson <jesper.nilsson@axis.com>
---
 drivers/net/phy/dp83848.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/net/phy/dp83848.c b/drivers/net/phy/dp83848.c
index 800b39f..320d0dc 100644
--- a/drivers/net/phy/dp83848.c
+++ b/drivers/net/phy/dp83848.c
@@ -88,7 +88,9 @@ MODULE_DEVICE_TABLE(mdio, dp83848_tbl);
 		.phy_id		= _id,				\
 		.phy_id_mask	= 0xfffffff0,			\
 		.name		= _name,			\
-		.features	= PHY_BASIC_FEATURES,		\
+		.features	= (PHY_BASIC_FEATURES |		\
+				   SUPPORTED_Pause |		\
+				   SUPPORTED_Asym_Pause),	\
 		.flags		= PHY_HAS_INTERRUPT,		\
 								\
 		.soft_reset	= genphy_soft_reset,		\
-- 
2.1.4


/^JN - Jesper Nilsson
-- 
               Jesper Nilsson -- jesper.nilsson@axis.com

^ permalink raw reply related

* Re: net/can: warning in raw_setsockopt/__alloc_pages_slowpath
From: Oliver Hartkopp @ 2016-12-02 15:11 UTC (permalink / raw)
  To: Marc Kleine-Budde, Andrey Konovalov, David S. Miller, linux-can,
	netdev, LKML
  Cc: Dmitry Vyukov, Kostya Serebryany, syzkaller
In-Reply-To: <c47865a1-4aae-5eae-f9e0-7b4f7f7e9897@pengutronix.de>



On 12/02/2016 02:24 PM, Marc Kleine-Budde wrote:
> On 12/02/2016 01:43 PM, Andrey Konovalov wrote:


>>  [<ffffffff8369e0de>] raw_setsockopt+0x1be/0x9f0 net/can/raw.c:506
>
> We should add a check for a sensible optlen....
>
>> static int raw_setsockopt(struct socket *sock, int level, int optname,
>> 			  char __user *optval, unsigned int optlen)
>> {
>> 	struct sock *sk = sock->sk;
>> 	struct raw_sock *ro = raw_sk(sk);
>> 	struct can_filter *filter = NULL;  /* dyn. alloc'ed filters */
>> 	struct can_filter sfilter;         /* single filter */
>> 	struct net_device *dev = NULL;
>> 	can_err_mask_t err_mask = 0;
>> 	int count = 0;
>> 	int err = 0;
>>
>> 	if (level != SOL_CAN_RAW)
>> 		return -EINVAL;
>>
>> 	switch (optname) {
>>
>> 	case CAN_RAW_FILTER:
>> 		if (optlen % sizeof(struct can_filter) != 0)
>> 			return -EINVAL;
>
> here...
>
> 		if (optlen > 64 * sizeof(struct can_filter))
> 			return -EINVAL;
>

Agreed.

But what is sensible here?
64 filters is way to small IMO.

When thinking about picking a bunch of single CAN IDs I would tend to 
something like 512 filters.

Regards,
Oliver

>>
>> 		count = optlen / sizeof(struct can_filter);
>>
>> 		if (count > 1) {
>> 			/* filter does not fit into dfilter => alloc space */
>> 			filter = memdup_user(optval, optlen);
>> 			if (IS_ERR(filter))
>> 				return PTR_ERR(filter);
>> 		} else if (count == 1) {
>> 			if (copy_from_user(&sfilter, optval, sizeof(sfilter)))
>> 				return -EFAULT;
>> 		}
>>
>> 		lock_sock(sk);
>
> Marc
>

^ permalink raw reply

* Re: stmmac ethernet in kernel 4.9-rc6: coalescing related pauses.
From: Giuseppe CAVALLARO @ 2016-12-02 15:19 UTC (permalink / raw)
  To: Alexandre Torgue, Pavel Machek; +Cc: David Miller, netdev, linux-kernel
In-Reply-To: <5db2ce3e-d4eb-d5d0-594c-8cd0f2e70476@st.com>

Hi Alex

On 12/2/2016 3:26 PM, Alexandre Torgue wrote:
>> 4.4 has no GMAC4 support.
>> Alex, do you remember any patches to fix that?
>
> No sorry Peppe.
>
> Pavel,
>
> Sorry but I'm a little bit confused. I'm dropped in some mails without
> historic. I see cleanup, coalescence issue and TSO question.
> What is your main issue? Are you working on gmac4 or 3.x ?
> Can you refresh a little bit the story please ?

let me try to do a sum, please Pavel feel free to correct me.

There are some open points about the tx mitigation schema
that we are trying to detail and eventually tune or change
(but keeping the same performance on other user-case).

In particular, the test case that is raising problem is
an unicast tx bench.
I suggested Pavel to tune coalesce (IC bit settings) via
ethtool and monitor stats but he is getting problems (maybe
due to lock).

IIUC problems are mainly on new kernel and not on 4.4 where
the gmac4 is missing. Please Pavel, could you confirm?

Then, there are some open points about lock protections
for xstat and Pavel is getting some problem on SMP.
I do think that we need to review that. This also could
improve the code in critical parts.

Also there are some other discussion about the lock
protection on NAPI still under discussion. I have not
clear if in this case Pavel is getting strange behavior.

Regards
Peppe

^ permalink raw reply

* Re: stmmac: turn coalescing / NAPI off in stmmac
From: Giuseppe CAVALLARO @ 2016-12-02 15:31 UTC (permalink / raw)
  To: Pavel Machek; +Cc: David Miller, alexandre.torgue, netdev, linux-kernel
In-Reply-To: <20161202104238.GA31172@amd>

Hi Pavel

On 12/2/2016 11:42 AM, Pavel Machek wrote:
> Hi!
>
>>> Anyway... since you asked. I belive I have way to disable NAPI / tx
>>> coalescing in the driver. Unfortunately, locking is missing on the rx
>>> path, and needs to be extended to _irqsave variant on tx path.
>>
>> I have just replied to a previous thread about that...
>
> Yeah, please reply to David's mail where he describes why it can't
> work.

let me to re-check the mails :-) I can try to provide you
more details about what I experimented

>
>>> So patch currently looks like this (hand edited, can't be
>>> applied, got it working few hours ago). Does it look acceptable?
>>>
>>> I'd prefer this to go after the patch that pulls common code to single
>>> place, so that single place needs to be patched. Plus I guess I should
>>> add ifdefs, so that more advanced NAPI / tx coalescing code can be
>>> reactivated when it is fixed. Trivial fixes can go on top. Does that
>>> sound like a plan?
>>
>> Hmm, what I find strange is that, just this code is running since a
>> long time on several platforms and Chip versions. No raise condition
>> have been found or lock protection problems (also proving look
>> mechanisms).
>
> Well, it works better for me when I disable CONFIG_SMP. It is normal
> that locking problems are hard to reproduce :-(.

can you share me the test, maybe I can try to reproduce on ARM box.
Are you using 3.x or 4.x GMAC?

>> Pavel, I ask you sorry if I missed some problems so, if you can
>> (as D. Miller asked) to send us a cover letter + all patches
>> I will try to reply soon. I can do also some tests if you ask
>> me that! I could run on 3.x and 4.x but I cannot promise you
>> benchmarks.
>
> Actually... I have questions here. David normally pulls from you (can
> I have a address of your git tree?).

No I send the patches to the mailing list.

>
> Could you apply these to your git?
>
> [PATCH] stmmac ethernet: unify locking
> [PATCH] stmmac: simplify flag assignment
> [PATCH] stmmac: cleanup documenation, make it match reality
>
> They are rather trivial and independend, I'm not sure what cover
> letter would say, besides "simple fixes".
>
> Then I can re-do the reset on top of that...
>
>>> Which tree do you want patches against?
>>>
>>> https://git.kernel.org/cgit/linux/kernel/git/davem/net-next.git/ ?
>>
>> I think that bug fixing should be on top of net.git but I let Miller
>> to decide.
>
> Hmm. It is "only" a performance problem (40msec delays).. I guess
> -next is better target.

ok, maybe if you resend these with a cover-letter I can try to
contribute on reviewing (in case of I have missed some detail).

Best Regards
Peppe

>
> Best regards,
> 								Pavel
>

^ permalink raw reply

* Re: [PATCH] net/rtnetlink: fix attribute name in nlmsg_size() comments
From: David Miller @ 2016-12-02 15:35 UTC (permalink / raw)
  To: tklauser; +Cc: netdev, edumazet
In-Reply-To: <20161130133037.14315-1-tklauser@distanz.ch>

From: Tobias Klauser <tklauser@distanz.ch>
Date: Wed, 30 Nov 2016 14:30:37 +0100

> Use the correct attribute constant names IFLA_GSO_MAX_{SEGS,SIZE}
> instead of IFLA_MAX_GSO_{SEGS,SIZE} for the comments int nlmsg_size().
> 
> Cc: Eric Dumazet <edumazet@google.com>
> Signed-off-by: Tobias Klauser <tklauser@distanz.ch>

Applied, thanks.

^ permalink raw reply

* Re: [PATCH 0/2] net: Add support for SGMII PCS on Altera TSE MAC
From: David Miller @ 2016-12-02 15:37 UTC (permalink / raw)
  To: neill.whillans
  Cc: netdev, linux-kernel, linux-kernel, nios2-dev, vbridger,
	f.fainelli
In-Reply-To: <cover.1480497966.git.neill.whillans@codethink.co.uk>

From: Neill Whillans <neill.whillans@codethink.co.uk>
Date: Wed, 30 Nov 2016 13:41:03 +0000

> These patches were created as part of work to add support for SGMII
> PCS functionality to the Altera TSE MAC. Patches are based on 4.9-rc6
> git tree.
> 
> The first patch in the series adds support for the VSC8572 dual-port
> Gigabit Ethernet transceiver, used in integration testing.
> 
> The second patch adds support for the SGMII PCS functionality to the
> Altera TSE driver.

Series applied, thanks.

^ permalink raw reply

* Re: [PATCH] mm: page_alloc: High-order per-cpu page allocator v3
From: Jesper Dangaard Brouer @ 2016-12-02 15:37 UTC (permalink / raw)
  To: Paolo Abeni
  Cc: Mel Gorman, Andrew Morton, Christoph Lameter, Michal Hocko,
	Vlastimil Babka, Johannes Weiner, Linux-MM, Linux-Kernel,
	Rick Jones, netdev@vger.kernel.org, Hannes Frederic Sowa, brouer
In-Reply-To: <1480630668.5345.7.camel@redhat.com>

On Thu, 01 Dec 2016 23:17:48 +0100
Paolo Abeni <pabeni@redhat.com> wrote:

> On Thu, 2016-12-01 at 18:34 +0100, Jesper Dangaard Brouer wrote:
> > (Cc. netdev, we might have an issue with Paolo's UDP accounting and
> > small socket queues)
> > 
> > On Wed, 30 Nov 2016 16:35:20 +0000
> > Mel Gorman <mgorman@techsingularity.net> wrote:
> >   
> > > > I don't quite get why you are setting the socket recv size
> > > > (with -- -s and -S) to such a small number, size + 256.
> > > >     
> > > 
> > > Maybe I missed something at the time I wrote that but why would it
> > > need to be larger?  
> > 
> > Well, to me it is quite obvious that we need some queue to avoid packet
> > drops.  We have two processes netperf and netserver, that are sending
> > packets between each-other (UDP_STREAM mostly netperf -> netserver).
> > These PIDs are getting scheduled and migrated between CPUs, and thus
> > does not get executed equally fast, thus a queue is need absorb the
> > fluctuations.
> > 
> > The network stack is even partly catching your config "mistake" and
> > increase the socket queue size, so we minimum can handle one max frame
> > (due skb "truesize" concept approx PAGE_SIZE + overhead).
> > 
> > Hopefully for localhost testing a small queue should hopefully not
> > result in packet drops.  Testing... ups, this does result in packet
> > drops.
> > 
> > Test command extracted from mmtests, UDP_STREAM size 1024:
> > 
> >  netperf-2.4.5-installed/bin/netperf -t UDP_STREAM  -l 60  -H 127.0.0.1 \
> >    -- -s 1280 -S 1280 -m 1024 -M 1024 -P 15895
> > 
> >  UDP UNIDIRECTIONAL SEND TEST from 0.0.0.0 (0.0.0.0)
> >   port 15895 AF_INET to 127.0.0.1 (127.0.0.1) port 15895 AF_INET
> >  Socket  Message  Elapsed      Messages                
> >  Size    Size     Time         Okay Errors   Throughput
> >  bytes   bytes    secs            #      #   10^6bits/sec
> > 
> >    4608    1024   60.00     50024301      0    6829.98
> >    2560           60.00     46133211           6298.72
> > 
> >  Dropped packets: 50024301-46133211=3891090
> > 
> > To get a better drop indication, during this I run a command, to get
> > system-wide network counters from the last second, so below numbers are
> > per second.
> > 
> >  $ nstat > /dev/null && sleep 1  && nstat
> >  #kernel
> >  IpInReceives                    885162             0.0
> >  IpInDelivers                    885161             0.0
> >  IpOutRequests                   885162             0.0
> >  UdpInDatagrams                  776105             0.0
> >  UdpInErrors                     109056             0.0
> >  UdpOutDatagrams                 885160             0.0
> >  UdpRcvbufErrors                 109056             0.0
> >  IpExtInOctets                   931190476          0.0
> >  IpExtOutOctets                  931189564          0.0
> >  IpExtInNoECTPkts                885162             0.0
> > 
> > So, 885Kpps but only 776Kpps delivered and 109Kpps drops. See
> > UdpInErrors and UdpRcvbufErrors is equal (109056/sec). This drop
> > happens kernel side in __udp_queue_rcv_skb[1], because receiving
> > process didn't empty it's queue fast enough see [2].
> > 
> > Although upstream changes are coming in this area, [2] is replaced with
> > __udp_enqueue_schedule_skb, which I actually tested with... hmm
> > 
> > Retesting with kernel 4.7.0-baseline+ ... show something else. 
> > To Paolo, you might want to look into this.  And it could also explain why
> > I've not see the mentioned speedup by mm-change, as I've been testing
> > this patch on top of net-next (at 93ba2222550) with Paolo's UDP changes.  
> 
> Thank you for reporting this.
> 
> It seems that the commit 123b4a633580 ("udp: use it's own memory
> accounting schema") is too strict while checking the rcvbuf. 
> 
> For very small value of rcvbuf, it allows a single skb to be enqueued,
> while previously we allowed 2 of them to enter the queue, even if the
> first one truesize exceeded rcvbuf, as in your test-case.
> 
> Can you please try the following patch ?

Sure, it looks much better with this patch.


$ /home/jbrouer/git/mmtests/work/testdisk/sources/netperf-2.4.5-installed/bin/netperf -t UDP_STREAM  -l 60  -H 127.0.0.1    -- -s 1280 -S 1280 -m 1024 -M 1024 -P 15895
UDP UNIDIRECTIONAL SEND TEST from 0.0.0.0 (0.0.0.0) port 15895 AF_INET to 127.0.0.1 (127.0.0.1) port 15895 AF_INET
Socket  Message  Elapsed      Messages                
Size    Size     Time         Okay Errors   Throughput
bytes   bytes    secs            #      #   10^6bits/sec

  4608    1024   60.00     50191555      0    6852.82
  2560           60.00     50189872           6852.59

Only 50191555-50189872=1683 drops, approx 1683/60 = 28/sec

$ nstat > /dev/null && sleep 1  && nstat
#kernel
IpInReceives                    885417             0.0
IpInDelivers                    885416             0.0
IpOutRequests                   885417             0.0
UdpInDatagrams                  885382             0.0
UdpInErrors                     29                 0.0
UdpOutDatagrams                 885410             0.0
UdpRcvbufErrors                 29                 0.0
IpExtInOctets                   931534428          0.0
IpExtOutOctets                  931533376          0.0
IpExtInNoECTPkts                885488             0.0

 
> Thank you,
> 
> Paolo
> ---
>  net/ipv4/udp.c | 6 ++++--
>  1 file changed, 4 insertions(+), 2 deletions(-)
> 
> diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
> index e1d0bf8..2f5dc92 100644
> --- a/net/ipv4/udp.c
> +++ b/net/ipv4/udp.c
> @@ -1200,19 +1200,21 @@ int __udp_enqueue_schedule_skb(struct sock *sk, struct sk_buff *skb)
>  	struct sk_buff_head *list = &sk->sk_receive_queue;
>  	int rmem, delta, amt, err = -ENOMEM;
>  	int size = skb->truesize;
> +	int limit;
>  
>  	/* try to avoid the costly atomic add/sub pair when the receive
>  	 * queue is full; always allow at least a packet
>  	 */
>  	rmem = atomic_read(&sk->sk_rmem_alloc);
> -	if (rmem && (rmem + size > sk->sk_rcvbuf))
> +	limit = size + sk->sk_rcvbuf;
> +	if (rmem > limit)
>  		goto drop;
>  
>  	/* we drop only if the receive buf is full and the receive
>  	 * queue contains some other skb
>  	 */
>  	rmem = atomic_add_return(size, &sk->sk_rmem_alloc);
> -	if ((rmem > sk->sk_rcvbuf) && (rmem > size))
> +	if (rmem > limit)
>  		goto uncharge_drop;
>  
>  	spin_lock(&list->lock);
> 



-- 
Best regards,
  Jesper Dangaard Brouer
  MSc.CS, Principal Kernel Engineer at Red Hat
  LinkedIn: http://www.linkedin.com/in/brouer

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply

* Re: DSA vs. SWTICHDEV ?
From: Andrew Lunn @ 2016-12-02 15:38 UTC (permalink / raw)
  To: Murali Karicheri
  Cc: Joakim Tjernlund, netdev@vger.kernel.org, Roger Quadros,
	Grygorii Strashko
In-Reply-To: <58409867.50001@ti.com>

On Thu, Dec 01, 2016 at 04:38:47PM -0500, Murali Karicheri wrote:
> Hi Andrew,
> On 12/01/2016 12:31 PM, Andrew Lunn wrote:
> > Hi Murali 
> > 
> >> 2. Switch mode where it implements a simple Ethernet switch. Currently
> >>    it doesn't have address learning capability, but in future it
> >>    can.
> > 
> > If it does not have address learning capabilities, does it act like a
> > plain old hub? What comes in one port goes out all others?
> 
> Thanks for the response!
> 
> Yes. It is a plain hub. it replicates frame to both ports. So need to
> run a bridge layer for address learning in software.

Hi Murali

It would be good if you start thinking about all the different
directions. It is not just host to port A and host to port B. What
about port A to Port B? Can it do that in hardware? 

> I think not. I see we have a non Linux implementation that does address
> learning in software using a hash table and look up MAC for each packet
> to see which port it needs to be sent to.

I think i need to read more about the switch. I'm starting to wonder
if it has enough intelligence to be usable. Switchdev is about pushing
configuration down into the switch. It does not however sound like
there is that much which is configurable in this switch.

      Andrew

^ permalink raw reply

* Re: [PATCH net-next 3/6] net: dsa: mv88e6xxx: add a software reset op
From: Andrew Lunn @ 2016-12-02 15:43 UTC (permalink / raw)
  To: Vivien Didelot
  Cc: netdev, linux-kernel, kernel, David S. Miller, Florian Fainelli
In-Reply-To: <87vav3gz67.fsf@ketchup.i-did-not-set--mail-host-address--so-tickle-me>

On Thu, Dec 01, 2016 at 03:41:20PM -0500, Vivien Didelot wrote:
> Hi Andrew,
> 
> Andrew Lunn <andrew@lunn.ch> writes:
> 
> >> diff --git a/drivers/net/dsa/mv88e6xxx/mv88e6xxx.h b/drivers/net/dsa/mv88e6xxx/mv88e6xxx.h
> >> index ab52c37..9e51405 100644
> >> --- a/drivers/net/dsa/mv88e6xxx/mv88e6xxx.h
> >> +++ b/drivers/net/dsa/mv88e6xxx/mv88e6xxx.h
> >> @@ -765,6 +765,9 @@ struct mv88e6xxx_ops {
> >>  	int (*phy_write)(struct mv88e6xxx_chip *chip, int addr, int reg,
> >>  			 u16 val);
> >>  
> >> +	/* Switch Software Reset */
> >> +	int (*reset)(struct mv88e6xxx_chip *chip);
> >> +
> >
> > Hi Vivien
> >
> > In my huge patch series of 6390, i've been using a g1_ prefix for
> > functionality which is in global 1, g2_ for global 2, etc.  This has
> > worked for everything so far with the exception of setting which
> > reserved MAC addresses should be sent to the CPU. Most devices have it
> > in g2, but 6390 has it in g1.
> >
> > Please could you add the prefix.
> 
> I don't understand. It looks like you are talking about the second part
> of the comment I made on your RFC patchset, about the Rsvd2CPU feature:

Hi Vivien

I mean

+	/* Switch Software Reset */
+	int (*g1_reset)(struct mv88e6xxx_chip *chip);
+

We have a collection of function pointers with port_ prefix, another
collection with stats_, and a third with ppu_, etc. And then we have
some which do not fit a specific category. Those i have prefixed with
g1_ or g2_. I think we should have some prefix, and that is my
suggestion.

	Andrew

^ permalink raw reply

* Re: net/can: warning in raw_setsockopt/__alloc_pages_slowpath
From: Marc Kleine-Budde @ 2016-12-02 15:42 UTC (permalink / raw)
  To: Oliver Hartkopp, Andrey Konovalov, David S. Miller, linux-can,
	netdev, LKML
  Cc: Dmitry Vyukov, Kostya Serebryany, syzkaller
In-Reply-To: <73b78023-bc11-25c0-33e3-3a748dbc81cd@hartkopp.net>


[-- Attachment #1.1: Type: text/plain, Size: 1601 bytes --]

On 12/02/2016 04:11 PM, Oliver Hartkopp wrote:
> 
> 
> On 12/02/2016 02:24 PM, Marc Kleine-Budde wrote:
>> On 12/02/2016 01:43 PM, Andrey Konovalov wrote:
> 
> 
>>>  [<ffffffff8369e0de>] raw_setsockopt+0x1be/0x9f0 net/can/raw.c:506
>>
>> We should add a check for a sensible optlen....
>>
>>> static int raw_setsockopt(struct socket *sock, int level, int optname,
>>> 			  char __user *optval, unsigned int optlen)
>>> {
>>> 	struct sock *sk = sock->sk;
>>> 	struct raw_sock *ro = raw_sk(sk);
>>> 	struct can_filter *filter = NULL;  /* dyn. alloc'ed filters */
>>> 	struct can_filter sfilter;         /* single filter */
>>> 	struct net_device *dev = NULL;
>>> 	can_err_mask_t err_mask = 0;
>>> 	int count = 0;
>>> 	int err = 0;
>>>
>>> 	if (level != SOL_CAN_RAW)
>>> 		return -EINVAL;
>>>
>>> 	switch (optname) {
>>>
>>> 	case CAN_RAW_FILTER:
>>> 		if (optlen % sizeof(struct can_filter) != 0)
>>> 			return -EINVAL;
>>
>> here...
>>
>> 		if (optlen > 64 * sizeof(struct can_filter))
>> 			return -EINVAL;
>>
> 
> Agreed.
> 
> But what is sensible here?
> 64 filters is way to small IMO.
> 
> When thinking about picking a bunch of single CAN IDs I would tend to 
> something like 512 filters.

Ok - 64 was just an arbitrary chosen value for demonstration purposes :)

Marc

-- 
Pengutronix e.K.                  | Marc Kleine-Budde           |
Industrial Linux Solutions        | Phone: +49-231-2826-924     |
Vertretung West/Dortmund          | Fax:   +49-5121-206917-5555 |
Amtsgericht Hildesheim, HRA 2686  | http://www.pengutronix.de   |


[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 488 bytes --]

^ permalink raw reply

* Re: [PATCH net 0/7] net: stmmac: fix probe error handling and phydev leaks
From: David Miller @ 2016-12-02 15:43 UTC (permalink / raw)
  To: johan
  Cc: peppe.cavallaro, alexandre.torgue, manabian, carlo, khilman,
	mcoquelin.stm32, maxime.ripard, wens, netdev, linux-kernel
In-Reply-To: <1480516195-27696-1-git-send-email-johan@kernel.org>

From: Johan Hovold <johan@kernel.org>
Date: Wed, 30 Nov 2016 15:29:48 +0100

> This series fixes a number of issues with the stmmac-driver probe error
> handling, which for example left clocks enabled after probe failures.
> 
> The final patch fixes a failure to deregister and free any fixed-link
> PHYs that were registered during probe on probe errors and on driver
> unbind. It also fixes a related of-node leak on late probe errors.
> 
> This series depends on the of_phy_deregister_fixed_link() helper that
> was just merged to net.
> 
> As mentioned earlier, one staging driver also suffers from a similar
> leak and can be fixed up once the above mentioned helper hits mainline.
> 
> Note that these patches have only been compile tested.

Series applied, thanks.

^ permalink raw reply

* Re: [PATCH] mm: page_alloc: High-order per-cpu page allocator v3
From: Paolo Abeni @ 2016-12-02 15:44 UTC (permalink / raw)
  To: Jesper Dangaard Brouer
  Cc: Mel Gorman, Andrew Morton, Christoph Lameter, Michal Hocko,
	Vlastimil Babka, Johannes Weiner, Linux-MM, Linux-Kernel,
	Rick Jones, netdev@vger.kernel.org, Hannes Frederic Sowa
In-Reply-To: <20161202163758.0d8cc9bf@redhat.com>

On Fri, 2016-12-02 at 16:37 +0100, Jesper Dangaard Brouer wrote:
> On Thu, 01 Dec 2016 23:17:48 +0100
> Paolo Abeni <pabeni@redhat.com> wrote:
> 
> > On Thu, 2016-12-01 at 18:34 +0100, Jesper Dangaard Brouer wrote:
> > > (Cc. netdev, we might have an issue with Paolo's UDP accounting and
> > > small socket queues)
> > > 
> > > On Wed, 30 Nov 2016 16:35:20 +0000
> > > Mel Gorman <mgorman@techsingularity.net> wrote:
> > >   
> > > > > I don't quite get why you are setting the socket recv size
> > > > > (with -- -s and -S) to such a small number, size + 256.
> > > > >     
> > > > 
> > > > Maybe I missed something at the time I wrote that but why would it
> > > > need to be larger?  
> > > 
> > > Well, to me it is quite obvious that we need some queue to avoid packet
> > > drops.  We have two processes netperf and netserver, that are sending
> > > packets between each-other (UDP_STREAM mostly netperf -> netserver).
> > > These PIDs are getting scheduled and migrated between CPUs, and thus
> > > does not get executed equally fast, thus a queue is need absorb the
> > > fluctuations.
> > > 
> > > The network stack is even partly catching your config "mistake" and
> > > increase the socket queue size, so we minimum can handle one max frame
> > > (due skb "truesize" concept approx PAGE_SIZE + overhead).
> > > 
> > > Hopefully for localhost testing a small queue should hopefully not
> > > result in packet drops.  Testing... ups, this does result in packet
> > > drops.
> > > 
> > > Test command extracted from mmtests, UDP_STREAM size 1024:
> > > 
> > >  netperf-2.4.5-installed/bin/netperf -t UDP_STREAM  -l 60  -H 127.0.0.1 \
> > >    -- -s 1280 -S 1280 -m 1024 -M 1024 -P 15895
> > > 
> > >  UDP UNIDIRECTIONAL SEND TEST from 0.0.0.0 (0.0.0.0)
> > >   port 15895 AF_INET to 127.0.0.1 (127.0.0.1) port 15895 AF_INET
> > >  Socket  Message  Elapsed      Messages                
> > >  Size    Size     Time         Okay Errors   Throughput
> > >  bytes   bytes    secs            #      #   10^6bits/sec
> > > 
> > >    4608    1024   60.00     50024301      0    6829.98
> > >    2560           60.00     46133211           6298.72
> > > 
> > >  Dropped packets: 50024301-46133211=3891090
> > > 
> > > To get a better drop indication, during this I run a command, to get
> > > system-wide network counters from the last second, so below numbers are
> > > per second.
> > > 
> > >  $ nstat > /dev/null && sleep 1  && nstat
> > >  #kernel
> > >  IpInReceives                    885162             0.0
> > >  IpInDelivers                    885161             0.0
> > >  IpOutRequests                   885162             0.0
> > >  UdpInDatagrams                  776105             0.0
> > >  UdpInErrors                     109056             0.0
> > >  UdpOutDatagrams                 885160             0.0
> > >  UdpRcvbufErrors                 109056             0.0
> > >  IpExtInOctets                   931190476          0.0
> > >  IpExtOutOctets                  931189564          0.0
> > >  IpExtInNoECTPkts                885162             0.0
> > > 
> > > So, 885Kpps but only 776Kpps delivered and 109Kpps drops. See
> > > UdpInErrors and UdpRcvbufErrors is equal (109056/sec). This drop
> > > happens kernel side in __udp_queue_rcv_skb[1], because receiving
> > > process didn't empty it's queue fast enough see [2].
> > > 
> > > Although upstream changes are coming in this area, [2] is replaced with
> > > __udp_enqueue_schedule_skb, which I actually tested with... hmm
> > > 
> > > Retesting with kernel 4.7.0-baseline+ ... show something else. 
> > > To Paolo, you might want to look into this.  And it could also explain why
> > > I've not see the mentioned speedup by mm-change, as I've been testing
> > > this patch on top of net-next (at 93ba2222550) with Paolo's UDP changes.  
> > 
> > Thank you for reporting this.
> > 
> > It seems that the commit 123b4a633580 ("udp: use it's own memory
> > accounting schema") is too strict while checking the rcvbuf. 
> > 
> > For very small value of rcvbuf, it allows a single skb to be enqueued,
> > while previously we allowed 2 of them to enter the queue, even if the
> > first one truesize exceeded rcvbuf, as in your test-case.
> > 
> > Can you please try the following patch ?
> 
> Sure, it looks much better with this patch.

Thank you for testing. I'll send a formal patch to David soon.

BTW I see I nice performance improvement compared to 4.7...

Cheers,

Paolo

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply

* Re: [PATCH net-next V2 0/7] Mellanox 100G mlx5 updates 2016-11-29
From: David Miller @ 2016-12-02 15:47 UTC (permalink / raw)
  To: saeedm; +Cc: netdev, tariqt, ogerlitz, roid, sebott
In-Reply-To: <1480521583-12755-1-git-send-email-saeedm@mellanox.com>

From: Saeed Mahameed <saeedm@mellanox.com>
Date: Wed, 30 Nov 2016 17:59:36 +0200

> The following series from Tariq and Roi, provides some critical fixes
> and updates for the mlx5e driver.
> 
> From Tariq: 
>  - Fix driver coherent memory huge allocation issues by fragmenting
>    completion queues, in a way that is transparent to the netdev driver by
>    providing a new buffer type "mlx5_frag_buf" with the same access API.
>  - Create UMR MKey per RQ to have better scalability.
> 
> From Roi:
>  - Some fixes for the encap-decap support and tc flower added lately to the
>    mlx5e driver.
> 
> v1->v2:
>  - Fix start index in error flow of mlx5_frag_buf_alloc_node, pointed out by Eric.
> 
> This series was generated against commit:
> 31ac1c19455f ("geneve: fix ip_hdr_len reserved for geneve6 tunnel.")

Series applied, thanks.

^ permalink raw reply

* [PATCH net] geneve: avoid use-after-free of skb->data
From: Sabrina Dubroca @ 2016-12-02 15:49 UTC (permalink / raw)
  To: netdev; +Cc: Sabrina Dubroca, John W . Linville

geneve{,6}_build_skb can end up doing a pskb_expand_head(), which
makes the ip_hdr(skb) reference we stashed earlier stale. Since it's
only needed as an argument to ip_tunnel_ecn_encap(), move this
directly in the function call.

Fixes: 08399efc6319 ("geneve: ensure ECN info is handled properly in all tx/rx paths")
Signed-off-by: Sabrina Dubroca <sd@queasysnail.net>
---
 drivers/net/geneve.c | 14 ++++----------
 1 file changed, 4 insertions(+), 10 deletions(-)

diff --git a/drivers/net/geneve.c b/drivers/net/geneve.c
index 532695c8209b..bab65647b80f 100644
--- a/drivers/net/geneve.c
+++ b/drivers/net/geneve.c
@@ -859,7 +859,6 @@ static netdev_tx_t geneve_xmit_skb(struct sk_buff *skb, struct net_device *dev,
 	struct geneve_dev *geneve = netdev_priv(dev);
 	struct geneve_sock *gs4;
 	struct rtable *rt = NULL;
-	const struct iphdr *iip; /* interior IP header */
 	int err = -EINVAL;
 	struct flowi4 fl4;
 	__u8 tos, ttl;
@@ -890,8 +889,6 @@ static netdev_tx_t geneve_xmit_skb(struct sk_buff *skb, struct net_device *dev,
 	sport = udp_flow_src_port(geneve->net, skb, 1, USHRT_MAX, true);
 	skb_reset_mac_header(skb);
 
-	iip = ip_hdr(skb);
-
 	if (info) {
 		const struct ip_tunnel_key *key = &info->key;
 		u8 *opts = NULL;
@@ -911,7 +908,7 @@ static netdev_tx_t geneve_xmit_skb(struct sk_buff *skb, struct net_device *dev,
 		if (unlikely(err))
 			goto tx_error;
 
-		tos = ip_tunnel_ecn_encap(key->tos, iip, skb);
+		tos = ip_tunnel_ecn_encap(key->tos, ip_hdr(skb), skb);
 		ttl = key->ttl;
 		df = key->tun_flags & TUNNEL_DONT_FRAGMENT ? htons(IP_DF) : 0;
 	} else {
@@ -920,7 +917,7 @@ static netdev_tx_t geneve_xmit_skb(struct sk_buff *skb, struct net_device *dev,
 		if (unlikely(err))
 			goto tx_error;
 
-		tos = ip_tunnel_ecn_encap(fl4.flowi4_tos, iip, skb);
+		tos = ip_tunnel_ecn_encap(fl4.flowi4_tos, ip_hdr(skb), skb);
 		ttl = geneve->ttl;
 		if (!ttl && IN_MULTICAST(ntohl(fl4.daddr)))
 			ttl = 1;
@@ -952,7 +949,6 @@ static netdev_tx_t geneve6_xmit_skb(struct sk_buff *skb, struct net_device *dev,
 {
 	struct geneve_dev *geneve = netdev_priv(dev);
 	struct dst_entry *dst = NULL;
-	const struct iphdr *iip; /* interior IP header */
 	struct geneve_sock *gs6;
 	int err = -EINVAL;
 	struct flowi6 fl6;
@@ -982,8 +978,6 @@ static netdev_tx_t geneve6_xmit_skb(struct sk_buff *skb, struct net_device *dev,
 	sport = udp_flow_src_port(geneve->net, skb, 1, USHRT_MAX, true);
 	skb_reset_mac_header(skb);
 
-	iip = ip_hdr(skb);
-
 	if (info) {
 		const struct ip_tunnel_key *key = &info->key;
 		u8 *opts = NULL;
@@ -1004,7 +998,7 @@ static netdev_tx_t geneve6_xmit_skb(struct sk_buff *skb, struct net_device *dev,
 		if (unlikely(err))
 			goto tx_error;
 
-		prio = ip_tunnel_ecn_encap(key->tos, iip, skb);
+		prio = ip_tunnel_ecn_encap(key->tos, ip_hdr(skb), skb);
 		ttl = key->ttl;
 		label = info->key.label;
 	} else {
@@ -1014,7 +1008,7 @@ static netdev_tx_t geneve6_xmit_skb(struct sk_buff *skb, struct net_device *dev,
 			goto tx_error;
 
 		prio = ip_tunnel_ecn_encap(ip6_tclass(fl6.flowlabel),
-					   iip, skb);
+					   ip_hdr(skb), skb);
 		ttl = geneve->ttl;
 		if (!ttl && ipv6_addr_is_multicast(&fl6.daddr))
 			ttl = 1;
-- 
2.10.2

^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox