Netdev List
 help / color / mirror / Atom feed
* [PATCH V4 7/8] ARM: dts: stm32: add support of ethernet on stm32mp157c-ev1
From: Christophe Roullier @ 2018-05-23 15:47 UTC (permalink / raw)
  To: mark.rutland, mcoquelin.stm32, alexandre.torgue, peppe.cavallaro
  Cc: devicetree, linux-arm-kernel, netdev, christophe.roullier, andrew
In-Reply-To: <1527090479-5263-1-git-send-email-christophe.roullier@st.com>

MAC is connected to a PHY in RGMII mode.

Signed-off-by: Christophe Roullier <christophe.roullier@st.com>
---
 arch/arm/boot/dts/stm32mp157c-ev1.dts | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

diff --git a/arch/arm/boot/dts/stm32mp157c-ev1.dts b/arch/arm/boot/dts/stm32mp157c-ev1.dts
index 57e6dbc..a7fee5c 100644
--- a/arch/arm/boot/dts/stm32mp157c-ev1.dts
+++ b/arch/arm/boot/dts/stm32mp157c-ev1.dts
@@ -17,5 +17,25 @@
 
 	aliases {
 		serial0 = &uart4;
+		ethernet0 = &ethernet0;
+	};
+};
+
+&ethernet0 {
+	status = "okay";
+	pinctrl-0 = <&ethernet0_rgmii_pins_a>;
+	pinctrl-1 = <&ethernet0_rgmii_pins_sleep_a>;
+	pinctrl-names = "default", "sleep";
+	phy-mode = "rgmii";
+	max-speed = <1000>;
+	phy-handle = <&phy0>;
+
+	mdio0 {
+		#address-cells = <1>;
+		#size-cells = <0>;
+		compatible = "snps,dwmac-mdio";
+		phy0: ethernet-phy@0 {
+			reg = <0>;
+		};
 	};
 };
-- 
1.9.1

^ permalink raw reply related

* [PATCH V4 1/8] net: ethernet: stmmac: add adaptation for stm32mp157c.
From: Christophe Roullier @ 2018-05-23 15:47 UTC (permalink / raw)
  To: mark.rutland, mcoquelin.stm32, alexandre.torgue, peppe.cavallaro
  Cc: devicetree, linux-arm-kernel, netdev, christophe.roullier, andrew
In-Reply-To: <1527090479-5263-1-git-send-email-christophe.roullier@st.com>

Glue codes to support stm32mp157c device and stay
compatible with stm32 mcu family

Signed-off-by: Christophe Roullier <christophe.roullier@st.com>
---
 drivers/net/ethernet/stmicro/stmmac/dwmac-stm32.c | 270 ++++++++++++++++++++--
 1 file changed, 255 insertions(+), 15 deletions(-)

diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-stm32.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-stm32.c
index 9e6db16..f51e327 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac-stm32.c
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-stm32.c
@@ -16,49 +16,183 @@
 #include <linux/of_net.h>
 #include <linux/phy.h>
 #include <linux/platform_device.h>
+#include <linux/pm_wakeirq.h>
 #include <linux/regmap.h>
 #include <linux/slab.h>
 #include <linux/stmmac.h>
 
 #include "stmmac_platform.h"
 
-#define MII_PHY_SEL_MASK	BIT(23)
+#define SYSCFG_MCU_ETH_MASK		BIT(23)
+#define SYSCFG_MP1_ETH_MASK		GENMASK(23, 16)
+
+#define SYSCFG_PMCR_ETH_CLK_SEL		BIT(16)
+#define SYSCFG_PMCR_ETH_REF_CLK_SEL	BIT(17)
+#define SYSCFG_PMCR_ETH_SEL_MII		BIT(20)
+#define SYSCFG_PMCR_ETH_SEL_RGMII	BIT(21)
+#define SYSCFG_PMCR_ETH_SEL_RMII	BIT(23)
+#define SYSCFG_PMCR_ETH_SEL_GMII	0
+#define SYSCFG_MCU_ETH_SEL_MII		0
+#define SYSCFG_MCU_ETH_SEL_RMII		1
 
 struct stm32_dwmac {
 	struct clk *clk_tx;
 	struct clk *clk_rx;
+	struct clk *clk_eth_ck;
+	struct clk *clk_ethstp;
+	struct clk *syscfg_clk;
+	bool int_phyclk;	/* Clock from RCC to drive PHY */
 	u32 mode_reg;		/* MAC glue-logic mode register */
 	struct regmap *regmap;
 	u32 speed;
+	const struct stm32_ops *ops;
+	struct device *dev;
+};
+
+struct stm32_ops {
+	int (*set_mode)(struct plat_stmmacenet_data *plat_dat);
+	int (*clk_prepare)(struct stm32_dwmac *dwmac, bool prepare);
+	int (*suspend)(struct stm32_dwmac *dwmac);
+	void (*resume)(struct stm32_dwmac *dwmac);
+	int (*parse_data)(struct stm32_dwmac *dwmac,
+			  struct device *dev);
+	u32 syscfg_eth_mask;
 };
 
 static int stm32_dwmac_init(struct plat_stmmacenet_data *plat_dat)
 {
 	struct stm32_dwmac *dwmac = plat_dat->bsp_priv;
-	u32 reg = dwmac->mode_reg;
-	u32 val;
 	int ret;
 
-	val = (plat_dat->interface == PHY_INTERFACE_MODE_MII) ? 0 : 1;
-	ret = regmap_update_bits(dwmac->regmap, reg, MII_PHY_SEL_MASK, val);
-	if (ret)
-		return ret;
+	if (dwmac->ops->set_mode) {
+		ret = dwmac->ops->set_mode(plat_dat);
+		if (ret)
+			return ret;
+	}
 
 	ret = clk_prepare_enable(dwmac->clk_tx);
 	if (ret)
 		return ret;
 
-	ret = clk_prepare_enable(dwmac->clk_rx);
-	if (ret)
-		clk_disable_unprepare(dwmac->clk_tx);
+	if (!dwmac->dev->power.is_suspended) {
+		ret = clk_prepare_enable(dwmac->clk_rx);
+		if (ret) {
+			clk_disable_unprepare(dwmac->clk_tx);
+			return ret;
+		}
+	}
+
+	if (dwmac->ops->clk_prepare) {
+		ret = dwmac->ops->clk_prepare(dwmac, true);
+		if (ret) {
+			clk_disable_unprepare(dwmac->clk_rx);
+			clk_disable_unprepare(dwmac->clk_tx);
+		}
+	}
 
 	return ret;
 }
 
+static int stm32mp1_clk_prepare(struct stm32_dwmac *dwmac, bool prepare)
+{
+	int ret = 0;
+
+	if (prepare) {
+		ret = clk_prepare_enable(dwmac->syscfg_clk);
+		if (ret)
+			return ret;
+
+		if (dwmac->int_phyclk) {
+			ret = clk_prepare_enable(dwmac->clk_eth_ck);
+			if (ret) {
+				clk_disable_unprepare(dwmac->syscfg_clk);
+				return ret;
+			}
+		}
+	} else {
+		clk_disable_unprepare(dwmac->syscfg_clk);
+		if (dwmac->int_phyclk)
+			clk_disable_unprepare(dwmac->clk_eth_ck);
+	}
+	return ret;
+}
+
+static int stm32mp1_set_mode(struct plat_stmmacenet_data *plat_dat)
+{
+	struct stm32_dwmac *dwmac = plat_dat->bsp_priv;
+	u32 reg = dwmac->mode_reg;
+	int val;
+
+	switch (plat_dat->interface) {
+	case PHY_INTERFACE_MODE_MII:
+		val = SYSCFG_PMCR_ETH_SEL_MII;
+		pr_debug("SYSCFG init : PHY_INTERFACE_MODE_MII\n");
+		break;
+	case PHY_INTERFACE_MODE_GMII:
+		val = SYSCFG_PMCR_ETH_SEL_GMII;
+		if (dwmac->int_phyclk)
+			val |= SYSCFG_PMCR_ETH_CLK_SEL;
+		pr_debug("SYSCFG init : PHY_INTERFACE_MODE_GMII\n");
+		break;
+	case PHY_INTERFACE_MODE_RMII:
+		val = SYSCFG_PMCR_ETH_SEL_RMII;
+		if (dwmac->int_phyclk)
+			val |= SYSCFG_PMCR_ETH_REF_CLK_SEL;
+		pr_debug("SYSCFG init : PHY_INTERFACE_MODE_RMII\n");
+		break;
+	case PHY_INTERFACE_MODE_RGMII:
+	case PHY_INTERFACE_MODE_RGMII_ID:
+	case PHY_INTERFACE_MODE_RGMII_RXID:
+	case PHY_INTERFACE_MODE_RGMII_TXID:
+		val = SYSCFG_PMCR_ETH_SEL_RGMII;
+		if (dwmac->int_phyclk)
+			val |= SYSCFG_PMCR_ETH_CLK_SEL;
+		pr_debug("SYSCFG init : PHY_INTERFACE_MODE_RGMII\n");
+		break;
+	default:
+		pr_debug("SYSCFG init :  Do not manage %d interface\n",
+			 plat_dat->interface);
+		/* Do not manage others interfaces */
+		return -EINVAL;
+	}
+
+	return regmap_update_bits(dwmac->regmap, reg,
+				 dwmac->ops->syscfg_eth_mask, val);
+}
+
+static int stm32mcu_set_mode(struct plat_stmmacenet_data *plat_dat)
+{
+	struct stm32_dwmac *dwmac = plat_dat->bsp_priv;
+	u32 reg = dwmac->mode_reg;
+	int val;
+
+	switch (plat_dat->interface) {
+	case PHY_INTERFACE_MODE_MII:
+		val = SYSCFG_MCU_ETH_SEL_MII;
+		pr_debug("SYSCFG init : PHY_INTERFACE_MODE_MII\n");
+		break;
+	case PHY_INTERFACE_MODE_RMII:
+		val = SYSCFG_MCU_ETH_SEL_RMII;
+		pr_debug("SYSCFG init : PHY_INTERFACE_MODE_RMII\n");
+		break;
+	default:
+		pr_debug("SYSCFG init :  Do not manage %d interface\n",
+			 plat_dat->interface);
+		/* Do not manage others interfaces */
+		return -EINVAL;
+	}
+
+	return regmap_update_bits(dwmac->regmap, reg,
+				 dwmac->ops->syscfg_eth_mask, val);
+}
+
 static void stm32_dwmac_clk_disable(struct stm32_dwmac *dwmac)
 {
 	clk_disable_unprepare(dwmac->clk_tx);
 	clk_disable_unprepare(dwmac->clk_rx);
+
+	if (dwmac->ops->clk_prepare)
+		dwmac->ops->clk_prepare(dwmac, false);
 }
 
 static int stm32_dwmac_parse_data(struct stm32_dwmac *dwmac,
@@ -70,15 +204,22 @@ static int stm32_dwmac_parse_data(struct stm32_dwmac *dwmac,
 	/*  Get TX/RX clocks */
 	dwmac->clk_tx = devm_clk_get(dev, "mac-clk-tx");
 	if (IS_ERR(dwmac->clk_tx)) {
-		dev_err(dev, "No tx clock provided...\n");
+		dev_err(dev, "No ETH Tx clock provided...\n");
 		return PTR_ERR(dwmac->clk_tx);
 	}
+
 	dwmac->clk_rx = devm_clk_get(dev, "mac-clk-rx");
 	if (IS_ERR(dwmac->clk_rx)) {
-		dev_err(dev, "No rx clock provided...\n");
+		dev_err(dev, "No ETH Rx clock provided...\n");
 		return PTR_ERR(dwmac->clk_rx);
 	}
 
+	if (dwmac->ops->parse_data) {
+		err = dwmac->ops->parse_data(dwmac, dev);
+		if (err)
+			return err;
+	}
+
 	/* Get mode register */
 	dwmac->regmap = syscon_regmap_lookup_by_phandle(np, "st,syscon");
 	if (IS_ERR(dwmac->regmap))
@@ -91,11 +232,46 @@ static int stm32_dwmac_parse_data(struct stm32_dwmac *dwmac,
 	return err;
 }
 
+static int stm32mp1_parse_data(struct stm32_dwmac *dwmac,
+			       struct device *dev)
+{
+	struct device_node *np = dev->of_node;
+
+	dwmac->int_phyclk = of_property_read_bool(np, "st,int-phyclk");
+
+	/* Check if internal clk from RCC selected */
+	if (dwmac->int_phyclk) {
+		/*  Get ETH_CLK clocks */
+		dwmac->clk_eth_ck = devm_clk_get(dev, "eth-ck");
+		if (IS_ERR(dwmac->clk_eth_ck)) {
+			dev_err(dev, "No ETH CK clock provided...\n");
+			return PTR_ERR(dwmac->clk_eth_ck);
+		}
+	}
+
+	/*  Clock used for low power mode */
+	dwmac->clk_ethstp = devm_clk_get(dev, "ethstp");
+	if (IS_ERR(dwmac->clk_ethstp)) {
+		dev_err(dev, "No ETH peripheral clock provided for CStop mode ...\n");
+		return PTR_ERR(dwmac->clk_ethstp);
+	}
+
+	/*  Clock for sysconfig */
+	dwmac->syscfg_clk = devm_clk_get(dev, "syscfg-clk");
+	if (IS_ERR(dwmac->syscfg_clk)) {
+		dev_err(dev, "No syscfg clock provided...\n");
+		return PTR_ERR(dwmac->syscfg_clk);
+	}
+
+	return 0;
+}
+
 static int stm32_dwmac_probe(struct platform_device *pdev)
 {
 	struct plat_stmmacenet_data *plat_dat;
 	struct stmmac_resources stmmac_res;
 	struct stm32_dwmac *dwmac;
+	const struct stm32_ops *data;
 	int ret;
 
 	ret = stmmac_get_platform_resources(pdev, &stmmac_res);
@@ -112,6 +288,16 @@ static int stm32_dwmac_probe(struct platform_device *pdev)
 		goto err_remove_config_dt;
 	}
 
+	data = of_device_get_match_data(&pdev->dev);
+	if (!data) {
+		dev_err(&pdev->dev, "no of match data provided\n");
+		ret = -EINVAL;
+		goto err_remove_config_dt;
+	}
+
+	dwmac->ops = data;
+	dwmac->dev = &pdev->dev;
+
 	ret = stm32_dwmac_parse_data(dwmac, &pdev->dev);
 	if (ret) {
 		dev_err(&pdev->dev, "Unable to parse OF data\n");
@@ -149,15 +335,48 @@ static int stm32_dwmac_remove(struct platform_device *pdev)
 	return ret;
 }
 
+static int stm32mp1_suspend(struct stm32_dwmac *dwmac)
+{
+	int ret = 0;
+
+	ret = clk_prepare_enable(dwmac->clk_ethstp);
+	if (ret)
+		return ret;
+
+	clk_disable_unprepare(dwmac->clk_tx);
+	clk_disable_unprepare(dwmac->syscfg_clk);
+	if (dwmac->int_phyclk)
+		clk_disable_unprepare(dwmac->clk_eth_ck);
+
+	return ret;
+}
+
+static void stm32mp1_resume(struct stm32_dwmac *dwmac)
+{
+	clk_disable_unprepare(dwmac->clk_ethstp);
+}
+
+static int stm32mcu_suspend(struct stm32_dwmac *dwmac)
+{
+	clk_disable_unprepare(dwmac->clk_tx);
+	clk_disable_unprepare(dwmac->clk_rx);
+
+	return 0;
+}
+
 #ifdef CONFIG_PM_SLEEP
 static int stm32_dwmac_suspend(struct device *dev)
 {
 	struct net_device *ndev = dev_get_drvdata(dev);
 	struct stmmac_priv *priv = netdev_priv(ndev);
+	struct stm32_dwmac *dwmac = priv->plat->bsp_priv;
+
 	int ret;
 
 	ret = stmmac_suspend(dev);
-	stm32_dwmac_clk_disable(priv->plat->bsp_priv);
+
+	if (dwmac->ops->suspend)
+		ret = dwmac->ops->suspend(dwmac);
 
 	return ret;
 }
@@ -166,8 +385,12 @@ static int stm32_dwmac_resume(struct device *dev)
 {
 	struct net_device *ndev = dev_get_drvdata(dev);
 	struct stmmac_priv *priv = netdev_priv(ndev);
+	struct stm32_dwmac *dwmac = priv->plat->bsp_priv;
 	int ret;
 
+	if (dwmac->ops->resume)
+		dwmac->ops->resume(dwmac);
+
 	ret = stm32_dwmac_init(priv->plat);
 	if (ret)
 		return ret;
@@ -181,8 +404,24 @@ static int stm32_dwmac_resume(struct device *dev)
 static SIMPLE_DEV_PM_OPS(stm32_dwmac_pm_ops,
 	stm32_dwmac_suspend, stm32_dwmac_resume);
 
+static struct stm32_ops stm32mcu_dwmac_data = {
+	.set_mode = stm32mcu_set_mode,
+	.suspend = stm32mcu_suspend,
+	.syscfg_eth_mask = SYSCFG_MCU_ETH_MASK
+};
+
+static struct stm32_ops stm32mp1_dwmac_data = {
+	.set_mode = stm32mp1_set_mode,
+	.clk_prepare = stm32mp1_clk_prepare,
+	.suspend = stm32mp1_suspend,
+	.resume = stm32mp1_resume,
+	.parse_data = stm32mp1_parse_data,
+	.syscfg_eth_mask = SYSCFG_MP1_ETH_MASK
+};
+
 static const struct of_device_id stm32_dwmac_match[] = {
-	{ .compatible = "st,stm32-dwmac"},
+	{ .compatible = "st,stm32-dwmac", .data = &stm32mcu_dwmac_data},
+	{ .compatible = "st,stm32mp1-dwmac", .data = &stm32mp1_dwmac_data},
 	{ }
 };
 MODULE_DEVICE_TABLE(of, stm32_dwmac_match);
@@ -199,5 +438,6 @@ static SIMPLE_DEV_PM_OPS(stm32_dwmac_pm_ops,
 module_platform_driver(stm32_dwmac_driver);
 
 MODULE_AUTHOR("Alexandre Torgue <alexandre.torgue@gmail.com>");
-MODULE_DESCRIPTION("STMicroelectronics MCU DWMAC Specific Glue layer");
+MODULE_AUTHOR("Christophe Roullier <christophe.roullier@st.com>");
+MODULE_DESCRIPTION("STMicroelectronics STM32 DWMAC Specific Glue layer");
 MODULE_LICENSE("GPL v2");
-- 
1.9.1

^ permalink raw reply related

* [PATCH V4 4/8] ARM: dts: stm32: Add syscfg on stm32mp1
From: Christophe Roullier @ 2018-05-23 15:47 UTC (permalink / raw)
  To: mark.rutland, mcoquelin.stm32, alexandre.torgue, peppe.cavallaro
  Cc: devicetree, linux-arm-kernel, netdev, christophe.roullier, andrew
In-Reply-To: <1527090479-5263-1-git-send-email-christophe.roullier@st.com>

System configuration controller is mainly used to manage
the compensation cell and other IOs and system related
settings.

Signed-off-by: Christophe Roullier <christophe.roullier@st.com>
---
 arch/arm/boot/dts/stm32mp157c.dtsi | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/arch/arm/boot/dts/stm32mp157c.dtsi b/arch/arm/boot/dts/stm32mp157c.dtsi
index bc3eddc..3db03a2 100644
--- a/arch/arm/boot/dts/stm32mp157c.dtsi
+++ b/arch/arm/boot/dts/stm32mp157c.dtsi
@@ -167,6 +167,11 @@
 			#reset-cells = <1>;
 		};
 
+		syscfg: syscon@50020000 {
+			compatible = "st,stm32mp157-syscfg", "syscon";
+			reg = <0x50020000 0x400>;
+		};
+
 		usart1: serial@5c000000 {
 			compatible = "st,stm32h7-uart";
 			reg = <0x5c000000 0x400>;
-- 
1.9.1

^ permalink raw reply related

* Re: [RFC PATCH 0/6] net: ethernet: ti: cpsw: add MQPRIO and CBS Qdisc offload
From: Grygorii Strashko @ 2018-05-23 15:43 UTC (permalink / raw)
  To: Ivan Khoronzhuk, davem
  Cc: corbet, akpm, netdev, linux-doc, linux-kernel, linux-omap,
	vinicius.gomes, henrik, jesus.sanchez-palencia, Sekhar Nori,
	Yogesh Siraswar, Schuyler Patton
In-Reply-To: <20180518211510.13341-1-ivan.khoronzhuk@linaro.org>

Hi Ivan,

On 05/18/2018 04:15 PM, Ivan Khoronzhuk wrote:
> This series adds MQPRIO and CBS Qdisc offload for TI cpsw driver.
> It potentially can be used in audio video bridging (AVB) and time
> sensitive networking (TSN).
> 
> Patchset was tested on AM572x EVM and BBB boards. Last patch from this
> series adds detailed description of configuration with examples. For
> consistency reasons, in role of talker and listener, tools from
> patchset "TSN: Add qdisc based config interface for CBS" were used and
> can be seen here: https://www.spinics.net/lists/netdev/msg460869.html
> 
> Based on net-next/master

Thanks a lot, it is great work.

In general I have no comments as of now, but I prefer to wait a bit (few 
weeks) for more comments and possible test reports.

If no comments, pls feel free to repost as final series.


> 
> Ivan Khoronzhuk (6):
>    net: ethernet: ti: cpsw: use cpdma channels in backward order for txq
>    net: ethernet: ti: cpdma: fit rated channels in backward order
>    net: ethernet: ti: cpsw: add MQPRIO Qdisc offload
>    net: ethernet: ti: cpsw: add CBS Qdisc offload
>    net: ethernet: ti: cpsw: restore shaper configuration while down/up
>    Documentation: networking: cpsw: add MQPRIO & CBS offload examples
> 
>   Documentation/networking/cpsw.txt       | 540 ++++++++++++++++++++++++
>   drivers/net/ethernet/ti/cpsw.c          | 364 +++++++++++++++-
>   drivers/net/ethernet/ti/davinci_cpdma.c |  31 +-
>   3 files changed, 913 insertions(+), 22 deletions(-)
>   create mode 100644 Documentation/networking/cpsw.txt
> 

-- 
regards,
-grygorii

^ permalink raw reply

* Re: WARNING in ip_recv_error
From: Willem de Bruijn @ 2018-05-23 15:40 UTC (permalink / raw)
  To: David Miller
  Cc: Eric Dumazet, DaeLyong Jeong, Alexey Kuznetsov, Hideaki YOSHIFUJI,
	Network Development, LKML, Byoungyoung Lee, Kyungtae Kim,
	bammanag, Willem de Bruijn
In-Reply-To: <CAF=yD-JObh3u+41V609=zph0XfcgOxpmqm3hL5WPKWFrSnnV3Q@mail.gmail.com>

On Sun, May 20, 2018 at 7:13 PM, Willem de Bruijn
<willemdebruijn.kernel@gmail.com> wrote:
> On Fri, May 18, 2018 at 2:59 PM, Willem de Bruijn
> <willemdebruijn.kernel@gmail.com> wrote:
>> On Fri, May 18, 2018 at 2:46 PM, Willem de Bruijn
>> <willemdebruijn.kernel@gmail.com> wrote:
>>> On Fri, May 18, 2018 at 2:44 PM, Willem de Bruijn
>>> <willemdebruijn.kernel@gmail.com> wrote:
>>>> On Fri, May 18, 2018 at 1:09 PM, Willem de Bruijn
>>>> <willemdebruijn.kernel@gmail.com> wrote:
>>>>> On Fri, May 18, 2018 at 11:44 AM, David Miller <davem@davemloft.net> wrote:
>>>>>> From: Eric Dumazet <eric.dumazet@gmail.com>
>>>>>> Date: Fri, 18 May 2018 08:30:43 -0700
>>>>>>
>>>>>>> We probably need to revert Willem patch (7ce875e5ecb8562fd44040f69bda96c999e38bbc)
>>>>>>
>>>>>> Is it really valid to reach ip_recv_err with an ipv6 socket?
>>>>>
>>>>> I guess the issue is that setsockopt IPV6_ADDRFORM is not an
>>>>> atomic operation, so that the socket is neither fully ipv4 nor fully
>>>>> ipv6 by the time it reaches ip_recv_error.
>>>>>
>>>>>   sk->sk_socket->ops = &inet_dgram_ops;
>>>>>   < HERE >
>>>>>   sk->sk_family = PF_INET;
>>>>>
>>>>> Even calling inet_recv_error to demux would not necessarily help.
>>>>>
>>>>> Safest would be to look up by skb->protocol, similar to what
>>>>> ipv6_recv_error does to handle v4-mapped-v6.
>>>>>
>>>>> Or to make that function safe with PF_INET and swap the order
>>>>> of the above two operations.
>>>>>
>>>>> All sound needlessly complicated for this rare socket option, but
>>>>> I don't have a better idea yet. Dropping on the floor is not nice,
>>>>> either.
>>>>
>>>> Ensuring that ip_recv_error correctly handles packets from either
>>>> socket and removing the warning should indeed be good.
>>>>
>>>> It is robust against v4-mapped packets from an AF_INET6 socket,
>>>> but see caveat on reconnect below.
>>>>
>>>> The code between ipv6_recv_error for v4-mapped addresses and
>>>> ip_recv_error is essentially the same, the main difference being
>>>> whether to return network headers as sockaddr_in with SOL_IP
>>>> or sockaddr_in6 with SOL_IPV6.
>>>>
>>>> There are very few other locations in the stack that explicitly test
>>>> sk_family in this way and thus would be vulnerable to races with
>>>> IPV6_ADDRFORM.
>>>>
>>>> I'm not sure whether it is possible for a udpv6 socket to queue a
>>>> real ipv6 packet on the error queue, disconnect, connect to an
>>>> ipv4 address, call IPV6_ADDRFORM and then call ip_recv_error
>>>> on a true ipv6 packet. That would return buggy data, e.g., in
>>>> msg_name.
>>>
>>> In do_ipv6_setsockopt IPV6_ADDRFORM we can test that the
>>> error queue is empty, and then take its lock for the duration of the
>>> operation.
>>
>> Actually, no reason to hold the lock. This setsockopt holds the socket
>> lock, which connect would need, too. So testing that the queue
>> is empty after testing that it is connected to a v4 address is
>> sufficient to ensure that no ipv6 packets are queued for reception.
>>
>> diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c
>> index 4d780c7f0130..a975d6311341 100644
>> --- a/net/ipv6/ipv6_sockglue.c
>> +++ b/net/ipv6/ipv6_sockglue.c
>> @@ -199,6 +199,11 @@ static int do_ipv6_setsockopt(struct sock *sk,
>> int level, int optname,
>>
>>                         if (ipv6_only_sock(sk) ||
>>                             !ipv6_addr_v4mapped(&sk->sk_v6_daddr)) {
>>                                 retv = -EADDRNOTAVAIL;
>>                                 break;
>>                         }
>>
>> +                       if (!skb_queue_empty(&sk->sk_error_queue)) {
>> +                               retv = -EBUSY;
>> +                               break;
>> +                       }
>> +
>>                         fl6_free_socklist(sk);
>>                         __ipv6_sock_mc_close(sk);
>>
>> After this it should be safe to remove the warning in ip_recv_error.
>
> Hmm.. nope.
>
> This ensures that the socket cannot produce any new true v6 packets.
> But it does not guarantee that they are not already in the system, e.g.
> queued in tc, and will find their way to the error queue later.
>
> We'll have to just be able to handle ipv6 packets in ip_recv_error.
> Since IPV6_ADDRFORM is used to pass to legacy v4-only
> processes and those likely are only confused by SOL_IPV6
> error messages, it is probably best to just drop them and perhaps
> WARN_ONCE.

Even more fun, this is not limited to the error queue.

I can queue a v6 packet for reception on a socket, connect to a v4
address, call IPV6_ADDRFORM and then a regular recvfrom will
return a partial v6 address as AF_INET.

We definitely do not want to have to add a check

  if (skb->protocol == htons(ETH_P_IPV6)) {
    kfree_skb(skb);
    goto try_again;
  }

to the normal recvmsg path.

An alternative may be to tighten the check on when to allow
IPV6_ADDRFORM. Not only return EBUSY if a packet is pending,
but also if any sk_{rmem, omem, wmem}_alloc is non-zero. Only,
these tightened constraints could break a legacy application.

Either way, this race is somewhat tangential to the one that
RaceFuzzer found. The sk changes that IPV6_ADDRFORM makes
to sk_prot, sk_socket->ops and sk_family are not atomic and will
not be. They need not be, because no other code assumes this
consistency.

So I'll start by removing the warning as Eric suggested.

^ permalink raw reply

* Re: [PATCH net-next v3 0/7] Add support for QCA8334 switch
From: Florian Fainelli @ 2018-05-23 15:39 UTC (permalink / raw)
  To: Michal Vokáč, netdev
  Cc: linux-kernel, devicetree, vivien.didelot, andrew, mark.rutland,
	robh+dt, davem, michal.vokac
In-Reply-To: <1527056424-14528-1-git-send-email-michal.vokac@ysoft.com>



On 05/22/2018 11:20 PM, Michal Vokáč wrote:
> This series basically adds support for a QCA8334 ethernet switch to the
> qca8k driver. It is a four-port variant of the already supported seven
> port QCA8337. Register map is the same for the whole familly and all chips
> have the same device ID.
> 
> Major part of this series enhances the CPU port setting. Currently the CPU
> port is not set to any sensible defaults compatible with the xGMII
> interface. This series forces the CPU port to its maximum bandwidth and
> also allows to adjust the new defaults using fixed-link device tree
> sub-node.
> 
> Alongside these changes I fixed two checkpatch warnings regarding SPDX and
> redundant parentheses.

Looks great, thanks Michal! Do you have any features or things you are
working on that would be added later to the driver?

> 
> Changes in v3:
>  - Rebased on latest net-next/master.
>  - Corrected fixed-link documentation.
> 
> Michal Vokáč (7):
>   net: dsa: qca8k: Add QCA8334 binding documentation
>   net: dsa: qca8k: Add support for QCA8334 switch
>   net: dsa: qca8k: Enable RXMAC when bringing up a port
>   net: dsa: qca8k: Force CPU port to its highest bandwidth
>   net: dsa: qca8k: Allow overwriting CPU port setting
>   net: dsa: qca8k: Replace GPL boilerplate by SPDX
>   net: dsa: qca8k: Remove redundant parentheses
> 
>  .../devicetree/bindings/net/dsa/qca8k.txt          | 23 +++++++-
>  drivers/net/dsa/qca8k.c                            | 64 ++++++++++++++++++----
>  drivers/net/dsa/qca8k.h                            |  7 ++-
>  3 files changed, 79 insertions(+), 15 deletions(-)
> 

-- 
Florian

^ permalink raw reply

* Re: [PATCH net-next v3 2/7] net: dsa: qca8k: Add support for QCA8334 switch
From: Florian Fainelli @ 2018-05-23 15:38 UTC (permalink / raw)
  To: Michal Vokáč, netdev
  Cc: linux-kernel, devicetree, vivien.didelot, andrew, mark.rutland,
	robh+dt, davem, michal.vokac
In-Reply-To: <1527056424-14528-3-git-send-email-michal.vokac@ysoft.com>



On 05/22/2018 11:20 PM, Michal Vokáč wrote:
> Add support for the four-port variant of the Qualcomm QCA833x switch.
> 
> Signed-off-by: Michal Vokáč <michal.vokac@ysoft.com>
> Reviewed-by: Andrew Lunn <andrew@lunn.ch>

Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
-- 
Florian

^ permalink raw reply

* Re: [net-next 1/6] net/dcb: Add dcbnl buffer attribute
From: Huy Nguyen @ 2018-05-23 15:37 UTC (permalink / raw)
  To: John Fastabend, Jiri Pirko, Jakub Kicinski
  Cc: Saeed Mahameed, David S. Miller, netdev, Or Gerlitz
In-Reply-To: <e361efd5-293d-0712-7ddf-5ad2a838d013@gmail.com>



On 5/23/2018 8:52 AM, John Fastabend wrote:
> It would be nice though if the API gave us some hint on max/min/stride
> of allowed values. Could the get API return these along with current
> value? Presumably the allowed max size could change with devlink buffer
> changes in how the global buffer is divided up as well.
Acked. I will add Max. Let's skip min/stride since it is too hardware 
specific.

^ permalink raw reply

* Re: [PATCH net-next v3 1/7] net: dsa: qca8k: Add QCA8334 binding documentation
From: Florian Fainelli @ 2018-05-23 15:37 UTC (permalink / raw)
  To: Michal Vokáč, netdev
  Cc: linux-kernel, devicetree, vivien.didelot, andrew, mark.rutland,
	robh+dt, davem, michal.vokac
In-Reply-To: <1527056424-14528-2-git-send-email-michal.vokac@ysoft.com>



On 05/22/2018 11:20 PM, Michal Vokáč wrote:
> Add support for the four-port variant of the Qualcomm QCA833x switch.
> 
> The CPU port default link settings can be reconfigured using
> a fixed-link sub-node.
> 
> Signed-off-by: Michal Vokáč <michal.vokac@ysoft.com>
> Reviewed-by: Rob Herring <robh@kernel.org>
> Reviewed-by: Andrew Lunn <andrew@lunn.ch>

Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
-- 
Florian

^ permalink raw reply

* RE: [PATCH v2] packet: track ring entry use using a shadow ring to prevent RX ring overrun
From: Jon Rosen (jrosen) @ 2018-05-23 15:29 UTC (permalink / raw)
  To: Willem de Bruijn
  Cc: David S. Miller, Willem de Bruijn, Eric Dumazet, Kees Cook,
	David Windsor, Rosen, Rami, Reshetova, Elena, Mike Maloney,
	Benjamin Poirier, Thomas Gleixner, Greg Kroah-Hartman,
	open list:NETWORKING [GENERAL], open list
In-Reply-To: <CAF=yD-LpGOTx_jrh10G59M4Y-vpdtx_q5SG1Hjc7452y9KUt4w@mail.gmail.com>



> -----Original Message-----
> From: Willem de Bruijn [mailto:willemdebruijn.kernel@gmail.com]
> Sent: Wednesday, May 23, 2018 9:37 AM
> To: Jon Rosen (jrosen) <jrosen@cisco.com>
> Cc: David S. Miller <davem@davemloft.net>; Willem de Bruijn <willemb@google.com>; Eric Dumazet
> <edumazet@google.com>; Kees Cook <keescook@chromium.org>; David Windsor <dwindsor@gmail.com>; Rosen,
> Rami <rami.rosen@intel.com>; Reshetova, Elena <elena.reshetova@intel.com>; Mike Maloney
> <maloney@google.com>; Benjamin Poirier <bpoirier@suse.com>; Thomas Gleixner <tglx@linutronix.de>; Greg
> Kroah-Hartman <gregkh@linuxfoundation.org>; open list:NETWORKING [GENERAL] <netdev@vger.kernel.org>;
> open list <linux-kernel@vger.kernel.org>
> Subject: Re: [PATCH v2] packet: track ring entry use using a shadow ring to prevent RX ring overrun
> 
> On Wed, May 23, 2018 at 7:54 AM, Jon Rosen (jrosen) <jrosen@cisco.com> wrote:
> >> > For the ring, there is no requirement to allocate exactly the amount
> >> > specified by the user request. Safer than relying on shared memory
> >> > and simpler than the extra allocation in this patch would be to allocate
> >> > extra shadow memory at the end of the ring (and not mmap that).
> >> >
> >> > That still leaves an extra cold cacheline vs using tp_padding.
> >>
> >> Given my lack of experience and knowledge in writing kernel code
> >> it was easier for me to allocate the shadow ring as a separate
> >> structure.  Of course it's not about me and my skills so if it's
> >> more appropriate to allocate at the tail of the existing ring
> >> then certainly I can look at doing that.
> >
> > The memory for the ring is not one contiguous block, it's an array of
> > blocks of pages (or 'order' sized blocks of pages). I don't think
> > increasing the size of each of the blocks to provided storage would be
> > such a good idea as it will risk spilling over into the next order and
> > wasting lots of memory. I suspect it's also more complex than a single
> > shadow ring to do both the allocation and the access.
> >
> > It could be tacked onto the end of the pg_vec[] used to store the
> > pointers to the blocks. The challenge with that is that a pg_vec[] is
> > created for each of RX and TX rings so either it would have to
> > allocate unnecessary storage for TX or the caller will have to say if
> > extra space should be allocated or not.  E.g.:
> >
> > static struct pgv *alloc_pg_vec(struct tpacket_req *req, int order, int scratch, void **scratch_p)
> >
> > I'm not sure avoiding the extra allocation and moving it to the
> > pg_vec[] for the RX ring is going to get the simplification you were
> > hoping for.  Is there another way of storing the shadow ring which
> > I should consider?
> 
> I did indeed mean attaching extra pages to pg_vec[]. It should be
> simpler than a separate structure, but I may be wrong.

I don't think it would be too bad, it may actually turn out to be
convenient to implement.

> 
> Either way, I still would prefer to avoid the shadow buffer completely.
> It incurs complexity and cycle cost on all users because of only the
> rare (non-existent?) consumer that overwrites the padding bytes.

I prefer that as well.  I'm just not sure there is a bulletproof
solution without the shadow state.  I also wish it were only a
theoretical issue but unfortunately it is actually something our
customers have seen.
> 
> Perhaps we can use padding yet avoid deadlock by writing a
> timed value. The simplest would be jiffies >> N. Then only a
> process that writes this exact value would be subject to drops and
> then still only for a limited period.
> 
> Instead of depending on wall clock time, like jiffies, another option
> would be to keep a percpu array of values. Each cpu has a zero
> entry if it is not writing, nonzero if it is. If a writer encounters a
> number in padding that is > num_cpus, then the state is garbage
> from userspace. If <= num_cpus, it is adhered to only until that cpu
> clears its entry, which is guaranteed to happen eventually.
> 
> Just a quick thought. This might not fly at all upon closer scrutiny.

I'm not sure I understand the suggestion, but I'll think on it
some more.

Some other options maybe worth considering (in no specific order):
- test the application to see if it will consume entries if tp_status
  is set to anything other than TP_STATUS_USER, only use shadow if
  it doesn't strictly honor the TP_STATUS_USER bit.

- skip shadow if we see new TP_STATUS_USER_TO_KERNEL is used

- use tp_len == -1 to indicate inuse




^ permalink raw reply

* Re: [PATCH net-next] sfc: stop the TX queue before pushing new buffers
From: Edward Cree @ 2018-05-23 15:28 UTC (permalink / raw)
  To: Martin Habets, linux-net-drivers, davem; +Cc: netdev, jarod
In-Reply-To: <152706844446.27257.6312747433904122379.stgit@mh-desktop.uk.solarflarecom.com>

On 23/05/18 10:41, Martin Habets wrote:
> efx_enqueue_skb() can push new buffers for the xmit_more functionality.
> We must stops the TX queue before this or else the TX queue does not get
> restarted and we get a netdev watchdog.
>
> In the error handling we may now need to unwind more than 1 packet, and
> we may need to push the new buffers onto the partner queue.
>
> Fixes: e9117e5099ea ("sfc: Firmware-Assisted TSO version 2")
> Reported-by: Jarod Wilson <jarod@redhat.com>
> Signed-off-by: Martin Habets <mhabets@solarflare.com>
> ---
>
> Dave, could you please also queue up this patch for stable?
>
>  drivers/net/ethernet/sfc/tx.c |   31 +++++++++++++++++++++++--------
>  1 file changed, 23 insertions(+), 8 deletions(-)
>
> diff --git a/drivers/net/ethernet/sfc/tx.c b/drivers/net/ethernet/sfc/tx.c
> index cece961f2e82..17e0697f42d5 100644
> --- a/drivers/net/ethernet/sfc/tx.c
> +++ b/drivers/net/ethernet/sfc/tx.c
> @@ -435,17 +435,18 @@ static int efx_tx_map_data(struct efx_tx_queue *tx_queue, struct sk_buff *skb,
>  	} while (1);
>  }
>  
> -/* Remove buffers put into a tx_queue.  None of the buffers must have
> - * an skb attached.
> +/* Remove buffers put into a tx_queue for the current packet.
> + * None of the buffers must have an skb attached.
>   */
> -static void efx_enqueue_unwind(struct efx_tx_queue *tx_queue)
> +static void efx_enqueue_unwind(struct efx_tx_queue *tx_queue,
> +			       unsigned int insert_count)
>  {
>  	struct efx_tx_buffer *buffer;
>  	unsigned int bytes_compl = 0;
>  	unsigned int pkts_compl = 0;
>  
>  	/* Work backwards until we hit the original insert pointer value */
> -	while (tx_queue->insert_count != tx_queue->write_count) {
> +	while (tx_queue->insert_count != insert_count) {
>  		--tx_queue->insert_count;
>  		buffer = __efx_tx_queue_get_insert_buffer(tx_queue);
>  		efx_dequeue_buffer(tx_queue, buffer, &pkts_compl, &bytes_compl);
> @@ -504,6 +505,8 @@ static int efx_tx_tso_fallback(struct efx_tx_queue *tx_queue,
>   */
>  netdev_tx_t efx_enqueue_skb(struct efx_tx_queue *tx_queue, struct sk_buff *skb)
>  {
> +	unsigned int old_insert_count = tx_queue->insert_count;
> +	bool xmit_more = skb->xmit_more;
>  	bool data_mapped = false;
>  	unsigned int segments;
>  	unsigned int skb_len;
> @@ -553,8 +556,10 @@ netdev_tx_t efx_enqueue_skb(struct efx_tx_queue *tx_queue, struct sk_buff *skb)
>  	/* Update BQL */
>  	netdev_tx_sent_queue(tx_queue->core_txq, skb_len);
>  
> +	efx_tx_maybe_stop_queue(tx_queue);
> +
>  	/* Pass off to hardware */
> -	if (!skb->xmit_more || netif_xmit_stopped(tx_queue->core_txq)) {
> +	if (!xmit_more || netif_xmit_stopped(tx_queue->core_txq)) {
>  		struct efx_tx_queue *txq2 = efx_tx_queue_partner(tx_queue);
>  
>  		/* There could be packets left on the partner queue if those
> @@ -577,14 +582,24 @@ netdev_tx_t efx_enqueue_skb(struct efx_tx_queue *tx_queue, struct sk_buff *skb)
>  		tx_queue->tx_packets++;
>  	}
>  
> -	efx_tx_maybe_stop_queue(tx_queue);
> -
>  	return NETDEV_TX_OK;
>  
>  
>  err:
> -	efx_enqueue_unwind(tx_queue);
> +	efx_enqueue_unwind(tx_queue, old_insert_count);
>  	dev_kfree_skb_any(skb);
> +
> +	/* If we're not expecting another transmit and we had something to push
> +	 * on this queue or a partner queue then we need to push here to get the
> +	 * previous packets out.
> +	 */
> +	if (!xmit_more) {
> +		struct efx_tx_queue *txq2 = efx_tx_queue_partner(tx_queue);
> +
> +		if (txq2->xmit_more_available)
> +			efx_nic_push_buffers(txq2);
> +	}
This only pushes the partner queue, it needs to also push this queue if
 tx_queue->xmit_more_available (as your comment just above mentions).

Apart from this, LGTM.

-Ed
> +
>  	return NETDEV_TX_OK;
>  }
>  
>

^ permalink raw reply

* Re: [net-next 1/6] net/dcb: Add dcbnl buffer attribute
From: Huy Nguyen @ 2018-05-23 15:27 UTC (permalink / raw)
  To: Jakub Kicinski
  Cc: Saeed Mahameed, David S. Miller, netdev, Jiri Pirko, Or Gerlitz,
	Parav Pandit, Ido Schimmel
In-Reply-To: <20180523022314.783e47fa@cakuba>



On 5/23/2018 4:23 AM, Jakub Kicinski wrote:
> >From patch description it seems like your default setup is shared
> buffer split 50% (lossy)/50% (all prios) and the example you give
> changes that to 25% (lossy)/25%x3 prio groups.
>
> With existing devlink API could this be modelled by three ingress pools
> with 2 TCs bound each?
Yes, possible. When you map prio to tc. Please be careful with prio term 
in switch
since switch has more than 8 prio.

^ permalink raw reply

* Re: [bpf-next V4 PATCH 6/8] xdp: change ndo_xdp_xmit API to support bulking
From: Jesper Dangaard Brouer @ 2018-05-23 15:27 UTC (permalink / raw)
  To: John Fastabend
  Cc: netdev, Daniel Borkmann, Alexei Starovoitov, Christoph Hellwig,
	BjörnTöpel, Magnus Karlsson, makita.toshiaki, brouer
In-Reply-To: <c9f0c7b0-26a8-815d-71bf-4e5ccdaa74aa@gmail.com>

On Wed, 23 May 2018 07:42:14 -0700
John Fastabend <john.fastabend@gmail.com> wrote:

> On 05/18/2018 06:35 AM, Jesper Dangaard Brouer wrote:
>  [...]  
> 
> Couple suggestions for some optimizations/improvements but otherwise
> looks good to me.
> 
> Thanks,
> John
> 
[...]
> > diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.c b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
> > index 5efa68de935b..9b698c5acd05 100644
> > --- a/drivers/net/ethernet/intel/i40e/i40e_txrx.c
> > +++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
> > @@ -3664,14 +3664,19 @@ netdev_tx_t i40e_lan_xmit_frame(struct sk_buff *skb, struct net_device *netdev)
> >   * @dev: netdev
> >   * @xdp: XDP buffer
> >   *
> > - * Returns Zero if sent, else an error code
> > + * Returns number of frames successfully sent. Frames that fail are
> > + * free'ed via XDP return API.
> > + *
> > + * For error cases, a negative errno code is returned and no-frames
> > + * are transmitted (caller must handle freeing frames).
> >   **/
> > -int i40e_xdp_xmit(struct net_device *dev, struct xdp_frame *xdpf)
> > +int i40e_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames)
> >  {
> >  	struct i40e_netdev_priv *np = netdev_priv(dev);
> >  	unsigned int queue_index = smp_processor_id();
> >  	struct i40e_vsi *vsi = np->vsi;
> > -	int err;
> > +	int drops = 0;
> > +	int i;
> >  
> >  	if (test_bit(__I40E_VSI_DOWN, vsi->state))
> >  		return -ENETDOWN;
> > @@ -3679,11 +3684,18 @@ int i40e_xdp_xmit(struct net_device *dev, struct xdp_frame *xdpf)
> >  	if (!i40e_enabled_xdp_vsi(vsi) || queue_index >= vsi->num_queue_pairs)
> >  		return -ENXIO;
> >  
> > -	err = i40e_xmit_xdp_ring(xdpf, vsi->xdp_rings[queue_index]);
> > -	if (err != I40E_XDP_TX)
> > -		return -ENOSPC;
> > +	for (i = 0; i < n; i++) {
> > +		struct xdp_frame *xdpf = frames[i];
> > +		int err;
> >  
> > -	return 0;
> > +		err = i40e_xmit_xdp_ring(xdpf, vsi->xdp_rings[queue_index]);
> > +		if (err != I40E_XDP_TX) {
> > +			xdp_return_frame_rx_napi(xdpf);
> > +			drops++;
> > +		}
> > +	}
> > +  
> 
> Perhaps not needed in this series, but I wonder how useful it is to hit
> the ring with the remaining frames after the first error. The error will
> typically be due to the TX queue being full. In this case it might make
> more sense to just drop the entire train of frames vs beating the up a
> full queue.

Yes, that would be an optimization, but I think we can do that in
another series.

This patcheset actually open up for a lot of followup optimizations.
I imagine/plan to optimize i40e_xmit_xdp_ring() further, e.g. by
implementing a bulking variant (e.g check I40E_DESC_UNUSED+update
xdp_ring->next_to_use once, map several DMA pages in one call, only
call smp_wmb() once per bulk, etc.),

> > +	return n - drops;
> >  }


-- 
Best regards,
  Jesper Dangaard Brouer
  MSc.CS, Principal Kernel Engineer at Red Hat
  LinkedIn: http://www.linkedin.com/in/brouer

^ permalink raw reply

* Re: [net-next] i40iw/i40e: Remove link dependency on i40e
From: Jason Gunthorpe @ 2018-05-23 15:18 UTC (permalink / raw)
  To: Alexander Duyck
  Cc: Christoph Hellwig, Jeff Kirsher, David Miller, Doug Ledford,
	Sindhu Devale, Netdev, linux-rdma, Neil Horman, Stefan Assmann,
	John Greene, Shiraz Saleem
In-Reply-To: <CAKgT0UeZRZ8O3PQWSz2Wod8HNjwc_bcuqqb70A7qypPn5kzW+Q@mail.gmail.com>

On Wed, May 23, 2018 at 08:03:44AM -0700, Alexander Duyck wrote:
> On Tue, May 22, 2018 at 11:19 PM, Christoph Hellwig <hch@infradead.org> wrote:
> > On Tue, May 22, 2018 at 02:04:06PM -0700, Jeff Kirsher wrote:
> >> > Why would you want to do this? The rdma driver is non-functional
> >> > without the ethernet driver, so why on earth would we want to defeat
> >> > the module dependency mechanism?
> >>
> >> This change is driven by the OSV's like Red Hat, where customer's were
> >> updating the i40e driver, which in turn broke i40iw.
> >
> > Doctor it hurts when I do this..
> >
> > There is no reason to make a mess of our drivers because people are
> > doing things they should haver never done and that aren't supported
> > in Linux.
> >
> > If Intel didn;t offer any out of tree drivers I'm pretty sure no
> > customer would even attempt this.  So fix this where the problem is.
> 
> Are you serious? You are never going to see out-of-tree drivers go
> away. They exist for the simple reason that most customers/OSVs are
> slow to upgrade their kernels so we have people running on a 3.10
> something kernel on their RHEL 7.X and want to use the latest greatest
> hardware.

So provide the i40iw module when providing the i40e upgrade module?

I still can't understand why this is a problem that needs to be
solved in mainline, or why it deserves a special and unique fix to
i40e, or even what the *actual* problem is..

Jason

^ permalink raw reply

* Re: [net-next 1/6] net/dcb: Add dcbnl buffer attribute
From: Huy Nguyen @ 2018-05-23 15:08 UTC (permalink / raw)
  To: Jakub Kicinski
  Cc: Saeed Mahameed, David S. Miller, netdev, Jiri Pirko, Or Gerlitz,
	Parav Pandit, Ido Schimmel
In-Reply-To: <20180523022314.783e47fa@cakuba>

> I hope that is not true, since we (Netronome) are trying to use it for
> NIC configuration, too.  We should generalize the API if need be.
Yes, it is up to your company. devlink is static tool. DCBNL are intended to
be dynamically configured by switch. In real world, not many people 
configure NIC's qos
from host.

^ permalink raw reply

* Re: [bpf-next V4 PATCH 3/8] xdp: add tracepoint for devmap like cpumap have
From: Jesper Dangaard Brouer @ 2018-05-23 15:04 UTC (permalink / raw)
  To: John Fastabend
  Cc: netdev, Daniel Borkmann, Alexei Starovoitov, Christoph Hellwig,
	BjörnTöpel, Magnus Karlsson, makita.toshiaki, brouer
In-Reply-To: <4345d6a2-935d-e206-e309-b63825f880b8@gmail.com>

On Wed, 23 May 2018 07:24:03 -0700
John Fastabend <john.fastabend@gmail.com> wrote:

> > @@ -219,8 +221,8 @@ void __dev_map_insert_ctx(struct bpf_map *map, u32 bit)
> >  static int bq_xmit_all(struct bpf_dtab_netdev *obj,
> >  			 struct xdp_bulk_queue *bq)
> >  {
> > -	unsigned int processed = 0, drops = 0;
> >  	struct net_device *dev = obj->dev;
> > +	int sent = 0, drops = 0;
> >  	int i;
> >  
> >  	if (unlikely(!bq->count))
> > @@ -241,10 +243,13 @@ static int bq_xmit_all(struct bpf_dtab_netdev *obj,
> >  			drops++;
> >  			xdp_return_frame(xdpf);
> >  		}
> > -		processed++;
> > +		sent++;  
> 
> Do 'dropped' frames also get counted as 'sent' frames? This seems a bit
> counter-intuitive to me. Should it be 'drops+sent = total frames'
> instead?

Again, sorry for mixing this up when spliting up the patch.  The
patchset does end up with: 'drops+sent = total frames'.  (The
"processed" counter is a copy-paste of code from cpumap, which have
another semantics).  I'll clean it up in V5, to help reviewers.

-- 
Best regards,
  Jesper Dangaard Brouer
  MSc.CS, Principal Kernel Engineer at Red Hat
  LinkedIn: http://www.linkedin.com/in/brouer

^ permalink raw reply

* Re: [net-next] i40iw/i40e: Remove link dependency on i40e
From: Alexander Duyck @ 2018-05-23 15:03 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Jeff Kirsher, Jason Gunthorpe, David Miller, Doug Ledford,
	Sindhu Devale, Netdev, linux-rdma, Neil Horman, Stefan Assmann,
	John Greene, Shiraz Saleem
In-Reply-To: <20180523061922.GA4753@infradead.org>

On Tue, May 22, 2018 at 11:19 PM, Christoph Hellwig <hch@infradead.org> wrote:
> On Tue, May 22, 2018 at 02:04:06PM -0700, Jeff Kirsher wrote:
>> > Why would you want to do this? The rdma driver is non-functional
>> > without the ethernet driver, so why on earth would we want to defeat
>> > the module dependency mechanism?
>>
>> This change is driven by the OSV's like Red Hat, where customer's were
>> updating the i40e driver, which in turn broke i40iw.
>
> Doctor it hurts when I do this..
>
> There is no reason to make a mess of our drivers because people are
> doing things they should haver never done and that aren't supported
> in Linux.
>
> If Intel didn;t offer any out of tree drivers I'm pretty sure no
> customer would even attempt this.  So fix this where the problem is.

Are you serious? You are never going to see out-of-tree drivers go
away. They exist for the simple reason that most customers/OSVs are
slow to upgrade their kernels so we have people running on a 3.10
something kernel on their RHEL 7.X and want to use the latest greatest
hardware.

I find it ridiculous that you expect us to support a product with
in-kernel only and it is pretty short sighted to insist that that is
the only way to support a product.

With that said it probably wouldn't hurt to throw a WARN_ONCE in here
somewhere that basically informs the user they are doing something
stupid and essentially disabling the i40iw driver if they remove i40e.

- Alex

^ permalink raw reply

* Re: [PATCH v2 net] stmmac: strip vlan tag on reception only for 8021q tagged frames
From: Elad Nachman @ 2018-05-23 15:00 UTC (permalink / raw)
  To: Jose Abreu, Florian Fainelli, David Miller
  Cc: makita.toshiaki, netdev, peppe.cavallaro, alexandre.torgue
In-Reply-To: <09c5b6b3-faa5-fd99-76bd-72a107122f2a@synopsys.com>

Jose,

I am not sure which drivers you have checked. I guess most non-networking embedded drivers never use 802.1AD
so they stay broken unknowingly.
Specifically, I have tested Intel e1000e based card which works correctly versus stmmac which works incorrectly.

If you check netdev.c in e1000e then the vlan strip is conditional upon checking of 802.1Q bit in the descriptor,
which does not happen in stmmac_main.c - the vlan stripping happens based on any tag header check.
Once I applied my patch things started working - did not have to patch e1000e. The problem is that ip link allows to create 802.1ad devices which are not 0x8100 tagged but stmmac and probably other drivers handles the non 0x8100 tag incorrectly and the vlan slave discards the frame.

Regarding the getting the tag from the descriptor - this is of course a possibility. My original patch handled stripping of all tags but then I was told here that I cannot strip 802.1ad tags without the NETIF_F_HW_VLAN_STAG_RX feature, which is probably correct regardless of the source of the vlan tag - skb or descriptor.

I can also add the NETIF_F_HW_VLAN_STAG_RX feature plus the following original v1 patch:
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c	2018-04-11 17:04:00.586057300 +0300
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c	2018-04-11 17:05:33.601992400 +0300
@@ -3293,17 +3293,19 @@ dma_map_err:
 
 static void stmmac_rx_vlan(struct net_device *dev, struct sk_buff *skb)
 {
-	struct ethhdr *ehdr;
+	struct vlan_ethhdr *veth;
 	u16 vlanid;
+	__be16 vlan_proto;
 
 	if ((dev->features & NETIF_F_HW_VLAN_CTAG_RX) ==
 	    NETIF_F_HW_VLAN_CTAG_RX &&
 	    !__vlan_get_tag(skb, &vlanid)) {
 		/* pop the vlan tag */
-		ehdr = (struct ethhdr *)skb->data;
-		memmove(skb->data + VLAN_HLEN, ehdr, ETH_ALEN * 2);
+		veth = (struct vlan_ethhdr *)skb->data;
+		vlan_proto = veth->h_vlan_proto;
+		memmove(skb->data + VLAN_HLEN, veth, ETH_ALEN * 2);
 		skb_pull(skb, VLAN_HLEN);
-		__vlan_hwaccel_put_tag(skb, htons(ETH_P_8021Q), vlanid);
+		__vlan_hwaccel_put_tag(skb, vlan_proto, vlanid);
 	}
 }

There are three reasons why I prefer using this approach rather than the descriptor approach:

A. It is inline with the original driver code.
B. Using descriptor vlan will require to completely rewrite stmmac_rx_vlan and will result in at least 2-3 times line counts in the patch.
C. I have no idea if first silicon implementations of DWMAC IP had some kind of HW bug (for example AXI clock glitch) which caused sampling of the VLAN tag in the descriptors to be unstable, and since I have no access to such hardware for regression I risk breaking stmmac for such old SOCs in case they decide to jump kernel versions to the latest.

Thanks,

Elad.

On 22/05/18 11:56, Jose Abreu wrote:
> On 21-05-2018 17:42, Florian Fainelli wrote:
>> On 05/21/2018 08:48 AM, David Miller wrote:
>>> From: David Miller <davem@davemloft.net>
>>> Date: Thu, 17 May 2018 12:43:56 -0400 (EDT)
>>>
>>>> Giuseppe and Alexandre, please review this patch.
>>> If nobody thinks this patch is important enough to actually
>>> review, I'm tossing it.
>>>
>>> Sorry.
>>>
>> How about looping in Jose?
> 
> Thanks for the cc Florian!
> 
> Elad,
> 
> Looking at this patch I have a couple of questions and suggestions:
> 
> I see that most drivers use this pattern so, are they all broken?
> or is this a special case?
> 
> You can also get the inner/outer vlan tag by reading desc0 of
> receive descriptor. Which can make this completely agnostic of
> VLAN tag.
> 
> Thanks and Best Regards,
> Jose Miguel Abreu
> 

^ permalink raw reply

* Re: [PATCH V3 8/8] dt-bindings: stm32: add compatible for syscon
From: Rob Herring @ 2018-05-23 14:47 UTC (permalink / raw)
  To: Christophe ROULLIER
  Cc: mark.rutland@arm.com, mcoquelin.stm32@gmail.com, Alexandre TORGUE,
	Peppe CAVALLARO, devicetree@vger.kernel.org,
	linux-arm-kernel@lists.infradead.org, netdev@vger.kernel.org,
	andrew@lunn.ch
In-Reply-To: <4f6941a0-4b41-f2db-52b1-d45aa7287fa2@st.com>

On Wed, May 23, 2018 at 4:32 AM, Christophe ROULLIER
<christophe.roullier@st.com> wrote:
> On 05/22/2018 07:22 PM, Rob Herring wrote:
>> On Mon, May 21, 2018 at 10:07:26AM +0200, Christophe Roullier wrote:
>>> This patch describes syscon DT bindings.
>>>
>>> Signed-off-by: Christophe Roullier <christophe.roullier@st.com>
>>> ---
>>>   Documentation/devicetree/bindings/arm/stm32.txt | 5 +++++
>>>   1 file changed, 5 insertions(+)
>>>
>>> diff --git a/Documentation/devicetree/bindings/arm/stm32.txt b/Documentation/devicetree/bindings/arm/stm32.txt
>>> index 6808ed9..e46ebad 100644
>>> --- a/Documentation/devicetree/bindings/arm/stm32.txt
>>> +++ b/Documentation/devicetree/bindings/arm/stm32.txt
>>> @@ -8,3 +8,8 @@ using one of the following compatible strings:
>>>     st,stm32f746
>>>     st,stm32h743
>>>     st,stm32mp157
>>> +
>>> +Required nodes:
>>> +- syscon: the soc bus node must have a system controller node pointing to the
>>> +  global control registers, with the compatible string
>>> +  "st,stm32mp157-syscfg", "syscon";
>>
>> Please don't mix soc/board bindings with other nodes. So perhaps
>> stm32-syscon.txt.
>>
>> Rob
>>
>
> Hi Rob,
>
> Is it ok for you with this tree file:

Yes, one nit below.

>
> Documentation/devicetree/bindings/arm/stm32/stm32.txt
> Documentation/devicetree/bindings/arm/stm32/stm32-syscon.txt
>         With stm32-syscon.txt:
>
> ---------------------------------------------------
> STMicroelectronics STM32 Platforms System Controller
>
> Properties:
>   - compatible : should contain two values. First value must be :
>                 - " st,stm32mp157-syscfg " - for stm32mp157 based SoCs,
>                 second value must be always "syscon".
>   - reg : offset and length of the register set.
>
> Example:
>         syscfg: system-config@50020000 {

syscon@...

>                 compatible = "st,stm32mp157-syscfg", "syscon";
>                 reg = <0x50020000 0x400>;
>         };
> ---------------------------------------------------
>
> Do we need to update also all MCU family (stm32f4, stm32h7, stm32f7)
> property to be coherent ?

Yes, if they all have the same or similar syscfg block.

>
> Thanks for your feedback.
>
> Christophe.
>

^ permalink raw reply

* Re: [bpf-next V4 PATCH 6/8] xdp: change ndo_xdp_xmit API to support bulking
From: John Fastabend @ 2018-05-23 14:42 UTC (permalink / raw)
  To: Jesper Dangaard Brouer, netdev, Daniel Borkmann,
	Alexei Starovoitov
  Cc: Christoph Hellwig, BjörnTöpel, Magnus Karlsson,
	makita.toshiaki
In-Reply-To: <152665050207.21055.17828855357297094854.stgit@firesoul>

On 05/18/2018 06:35 AM, Jesper Dangaard Brouer wrote:
> This patch change the API for ndo_xdp_xmit to support bulking
> xdp_frames.
> 
> When kernel is compiled with CONFIG_RETPOLINE, XDP sees a huge slowdown.
> Most of the slowdown is caused by DMA API indirect function calls, but
> also the net_device->ndo_xdp_xmit() call.
> 
> Benchmarked patch with CONFIG_RETPOLINE, using xdp_redirect_map with
> single flow/core test (CPU E5-1650 v4 @ 3.60GHz), showed
> performance improved:
>  for driver ixgbe: 6,042,682 pps -> 6,853,768 pps = +811,086 pps
>  for driver i40e : 6,187,169 pps -> 6,724,519 pps = +537,350 pps
> 
> With frames avail as a bulk inside the driver ndo_xdp_xmit call,
> further optimizations are possible, like bulk DMA-mapping for TX.
> 
> Testing without CONFIG_RETPOLINE show the same performance for
> physical NIC drivers.
> 
> The virtual NIC driver tun sees a huge performance boost, as it can
> avoid doing per frame producer locking, but instead amortize the
> locking cost over the bulk.
> 
> V2: Fix compile errors reported by kbuild test robot <lkp@intel.com>
> V4: Isolated ndo, driver changes and callers.
> 
> Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>

Couple suggestions for some optimizations/improvements but otherwise
looks good to me.

Thanks,
John

> ---
>  drivers/net/ethernet/intel/i40e/i40e_txrx.c   |   26 +++++++---
>  drivers/net/ethernet/intel/i40e/i40e_txrx.h   |    2 -
>  drivers/net/ethernet/intel/ixgbe/ixgbe_main.c |   21 ++++++--
>  drivers/net/tun.c                             |   37 +++++++++-----
>  drivers/net/virtio_net.c                      |   66 +++++++++++++++++++------
>  include/linux/netdevice.h                     |   14 +++--
>  kernel/bpf/devmap.c                           |   31 ++++++++----
>  net/core/filter.c                             |    8 ++-
>  8 files changed, 141 insertions(+), 64 deletions(-)
> 
> diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.c b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
> index 5efa68de935b..9b698c5acd05 100644
> --- a/drivers/net/ethernet/intel/i40e/i40e_txrx.c
> +++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
> @@ -3664,14 +3664,19 @@ netdev_tx_t i40e_lan_xmit_frame(struct sk_buff *skb, struct net_device *netdev)
>   * @dev: netdev
>   * @xdp: XDP buffer
>   *
> - * Returns Zero if sent, else an error code
> + * Returns number of frames successfully sent. Frames that fail are
> + * free'ed via XDP return API.
> + *
> + * For error cases, a negative errno code is returned and no-frames
> + * are transmitted (caller must handle freeing frames).
>   **/
> -int i40e_xdp_xmit(struct net_device *dev, struct xdp_frame *xdpf)
> +int i40e_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames)
>  {
>  	struct i40e_netdev_priv *np = netdev_priv(dev);
>  	unsigned int queue_index = smp_processor_id();
>  	struct i40e_vsi *vsi = np->vsi;
> -	int err;
> +	int drops = 0;
> +	int i;
>  
>  	if (test_bit(__I40E_VSI_DOWN, vsi->state))
>  		return -ENETDOWN;
> @@ -3679,11 +3684,18 @@ int i40e_xdp_xmit(struct net_device *dev, struct xdp_frame *xdpf)
>  	if (!i40e_enabled_xdp_vsi(vsi) || queue_index >= vsi->num_queue_pairs)
>  		return -ENXIO;
>  
> -	err = i40e_xmit_xdp_ring(xdpf, vsi->xdp_rings[queue_index]);
> -	if (err != I40E_XDP_TX)
> -		return -ENOSPC;
> +	for (i = 0; i < n; i++) {
> +		struct xdp_frame *xdpf = frames[i];
> +		int err;
>  
> -	return 0;
> +		err = i40e_xmit_xdp_ring(xdpf, vsi->xdp_rings[queue_index]);
> +		if (err != I40E_XDP_TX) {
> +			xdp_return_frame_rx_napi(xdpf);
> +			drops++;
> +		}
> +	}
> +

Perhaps not needed in this series, but I wonder how useful it is to hit
the ring with the remaining frames after the first error. The error will
typically be due to the TX queue being full. In this case it might make
more sense to just drop the entire train of frames vs beating the up a
full queue.

> +	return n - drops;
>  }
>  
>  /**
> diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.h b/drivers/net/ethernet/intel/i40e/i40e_txrx.h
> index fdd2c55f03a6..eb8804b3d7b6 100644
> --- a/drivers/net/ethernet/intel/i40e/i40e_txrx.h
> +++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.h
> @@ -487,7 +487,7 @@ u32 i40e_get_tx_pending(struct i40e_ring *ring, bool in_sw);
>  void i40e_detect_recover_hung(struct i40e_vsi *vsi);
>  int __i40e_maybe_stop_tx(struct i40e_ring *tx_ring, int size);
>  bool __i40e_chk_linearize(struct sk_buff *skb);
> -int i40e_xdp_xmit(struct net_device *dev, struct xdp_frame *xdpf);
> +int i40e_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames);
>  void i40e_xdp_flush(struct net_device *dev);
>  
>  /**
> diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
> index 6652b201df5b..9645619f7729 100644
> --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
> +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
> @@ -10017,11 +10017,13 @@ static int ixgbe_xdp(struct net_device *dev, struct netdev_bpf *xdp)
>  	}
>  }
>  
> -static int ixgbe_xdp_xmit(struct net_device *dev, struct xdp_frame *xdpf)
> +static int ixgbe_xdp_xmit(struct net_device *dev, int n,
> +			  struct xdp_frame **frames)
>  {
>  	struct ixgbe_adapter *adapter = netdev_priv(dev);
>  	struct ixgbe_ring *ring;
> -	int err;
> +	int drops = 0;
> +	int i;
>  
>  	if (unlikely(test_bit(__IXGBE_DOWN, &adapter->state)))
>  		return -ENETDOWN;
> @@ -10033,11 +10035,18 @@ static int ixgbe_xdp_xmit(struct net_device *dev, struct xdp_frame *xdpf)
>  	if (unlikely(!ring))
>  		return -ENXIO;
>  
> -	err = ixgbe_xmit_xdp_ring(adapter, xdpf);
> -	if (err != IXGBE_XDP_TX)
> -		return -ENOSPC;
> +	for (i = 0; i < n; i++) {
> +		struct xdp_frame *xdpf = frames[i];
> +		int err;
>  
> -	return 0;
> +		err = ixgbe_xmit_xdp_ring(adapter, xdpf);
> +		if (err != IXGBE_XDP_TX) {
> +			xdp_return_frame_rx_napi(xdpf);
> +			drops++;
> +		}
> +	}

Same as above, how about just aborting if we get an error?

> +
> +	return n - drops;
>  }
>  
>  static void ixgbe_xdp_flush(struct net_device *dev)
> diff --git a/drivers/net/tun.c b/drivers/net/tun.c
> index 44d4f3d25350..d3dcfcb1c4b3 100644
> --- a/drivers/net/tun.c
> +++ b/drivers/net/tun.c
> @@ -70,6 +70,7 @@
>  #include <net/netns/generic.h>
>  #include <net/rtnetlink.h>
>  #include <net/sock.h>
> +#include <net/xdp.h>
>  #include <linux/seq_file.h>
>  #include <linux/uio.h>
>  #include <linux/skb_array.h>
> @@ -1290,34 +1291,44 @@ static const struct net_device_ops tun_netdev_ops = {
>  	.ndo_get_stats64	= tun_net_get_stats64,
>  };
>  
> -static int tun_xdp_xmit(struct net_device *dev, struct xdp_frame *frame)
> +static int tun_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames)
>  {
>  	struct tun_struct *tun = netdev_priv(dev);
>  	struct tun_file *tfile;
>  	u32 numqueues;
> -	int ret = 0;
> +	int drops = 0;
> +	int cnt = n;
> +	int i;
>  
>  	rcu_read_lock();
>  
>  	numqueues = READ_ONCE(tun->numqueues);
>  	if (!numqueues) {
> -		ret = -ENOSPC;
> -		goto out;
> +		rcu_read_unlock();
> +		return -ENXIO; /* Caller will free/return all frames */
>  	}
>  
>  	tfile = rcu_dereference(tun->tfiles[smp_processor_id() %
>  					    numqueues]);
> -	/* Encode the XDP flag into lowest bit for consumer to differ
> -	 * XDP buffer from sk_buff.
> -	 */
> -	if (ptr_ring_produce(&tfile->tx_ring, tun_xdp_to_ptr(frame))) {
> -		this_cpu_inc(tun->pcpu_stats->tx_dropped);
> -		ret = -ENOSPC;
> +
> +	spin_lock(&tfile->tx_ring.producer_lock);
> +	for (i = 0; i < n; i++) {
> +		struct xdp_frame *xdp = frames[i];
> +		/* Encode the XDP flag into lowest bit for consumer to differ
> +		 * XDP buffer from sk_buff.
> +		 */
> +		void *frame = tun_xdp_to_ptr(xdp);
> +
> +		if (__ptr_ring_produce(&tfile->tx_ring, frame)) {
> +			this_cpu_inc(tun->pcpu_stats->tx_dropped);
> +			xdp_return_frame_rx_napi(xdp);
> +			drops++;
> +		}

We have a ptr_ring_consume_batched() API it seems we should also have a
ptr_ring_produce_batched() now as well. Could be a follow up patch but
I think its probably worth considering.


>  	}
> +	spin_unlock(&tfile->tx_ring.producer_lock);
>  
> -out:
>  	rcu_read_unlock();
> -	return ret;
> +	return cnt - drops;
>  }
>  
>  static int tun_xdp_tx(struct net_device *dev, struct xdp_buff *xdp)
> @@ -1327,7 +1338,7 @@ static int tun_xdp_tx(struct net_device *dev, struct xdp_buff *xdp)
>  	if (unlikely(!frame))
>  		return -EOVERFLOW;
>  
> -	return tun_xdp_xmit(dev, frame);
> +	return tun_xdp_xmit(dev, 1, &frame);
>  }
>  
>  static void tun_xdp_flush(struct net_device *dev)
> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> index f34794a76c4d..39a0783d1cde 100644
> --- a/drivers/net/virtio_net.c
> +++ b/drivers/net/virtio_net.c
> @@ -419,23 +419,13 @@ static void virtnet_xdp_flush(struct net_device *dev)
>  	virtqueue_kick(sq->vq);
>  }
>  
> -static int __virtnet_xdp_xmit(struct virtnet_info *vi,
> -			       struct xdp_frame *xdpf)
> +static int __virtnet_xdp_xmit_one(struct virtnet_info *vi,
> +				   struct send_queue *sq,
> +				   struct xdp_frame *xdpf)
>  {
>  	struct virtio_net_hdr_mrg_rxbuf *hdr;
> -	struct xdp_frame *xdpf_sent;
> -	struct send_queue *sq;
> -	unsigned int len;
> -	unsigned int qp;
>  	int err;
>  
> -	qp = vi->curr_queue_pairs - vi->xdp_queue_pairs + smp_processor_id();
> -	sq = &vi->sq[qp];
> -
> -	/* Free up any pending old buffers before queueing new ones. */
> -	while ((xdpf_sent = virtqueue_get_buf(sq->vq, &len)) != NULL)
> -		xdp_return_frame(xdpf_sent);
> -
>  	/* virtqueue want to use data area in-front of packet */
>  	if (unlikely(xdpf->metasize > 0))
>  		return -EOPNOTSUPP;
> @@ -459,11 +449,40 @@ static int __virtnet_xdp_xmit(struct virtnet_info *vi,
>  	return 0;
>  }
>  
> -static int virtnet_xdp_xmit(struct net_device *dev, struct xdp_frame *xdpf)
> +static int __virtnet_xdp_tx_xmit(struct virtnet_info *vi,
> +				   struct xdp_frame *xdpf)
> +{
> +	struct xdp_frame *xdpf_sent;
> +	struct send_queue *sq;
> +	unsigned int len;
> +	unsigned int qp;
> +
> +	qp = vi->curr_queue_pairs - vi->xdp_queue_pairs + smp_processor_id();
> +	sq = &vi->sq[qp];
> +
> +	/* Free up any pending old buffers before queueing new ones. */
> +	while ((xdpf_sent = virtqueue_get_buf(sq->vq, &len)) != NULL)
> +		xdp_return_frame(xdpf_sent);
> +
> +	return __virtnet_xdp_xmit_one(vi, sq, xdpf);
> +}
> +
> +static int virtnet_xdp_xmit(struct net_device *dev,
> +			    int n, struct xdp_frame **frames)
>  {
>  	struct virtnet_info *vi = netdev_priv(dev);
>  	struct receive_queue *rq = vi->rq;
> +	struct xdp_frame *xdpf_sent;
>  	struct bpf_prog *xdp_prog;
> +	struct send_queue *sq;
> +	unsigned int len;
> +	unsigned int qp;
> +	int drops = 0;
> +	int err;
> +	int i;
> +
> +	qp = vi->curr_queue_pairs - vi->xdp_queue_pairs + smp_processor_id();
> +	sq = &vi->sq[qp];
>  
>  	/* Only allow ndo_xdp_xmit if XDP is loaded on dev, as this
>  	 * indicate XDP resources have been successfully allocated.
> @@ -472,7 +491,20 @@ static int virtnet_xdp_xmit(struct net_device *dev, struct xdp_frame *xdpf)
>  	if (!xdp_prog)
>  		return -ENXIO;
>  
> -	return __virtnet_xdp_xmit(vi, xdpf);
> +	/* Free up any pending old buffers before queueing new ones. */
> +	while ((xdpf_sent = virtqueue_get_buf(sq->vq, &len)) != NULL)
> +		xdp_return_frame(xdpf_sent);
> +
> +	for (i = 0; i < n; i++) {
> +		struct xdp_frame *xdpf = frames[i];
> +
> +		err = __virtnet_xdp_xmit_one(vi, sq, xdpf);
> +		if (err) {
> +			xdp_return_frame_rx_napi(xdpf);
> +			drops++;
> +		}
> +	}
> +	return n - drops;
>  }
>  
>  static unsigned int virtnet_get_headroom(struct virtnet_info *vi)
> @@ -616,7 +648,7 @@ static struct sk_buff *receive_small(struct net_device *dev,
>  			xdpf = convert_to_xdp_frame(&xdp);
>  			if (unlikely(!xdpf))
>  				goto err_xdp;
> -			err = __virtnet_xdp_xmit(vi, xdpf);
> +			err = __virtnet_xdp_tx_xmit(vi, xdpf);
>  			if (unlikely(err)) {
>  				trace_xdp_exception(vi->dev, xdp_prog, act);
>  				goto err_xdp;
> @@ -779,7 +811,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
>  			xdpf = convert_to_xdp_frame(&xdp);
>  			if (unlikely(!xdpf))
>  				goto err_xdp;
> -			err = __virtnet_xdp_xmit(vi, xdpf);
> +			err = __virtnet_xdp_tx_xmit(vi, xdpf);
>  			if (unlikely(err)) {
>  				trace_xdp_exception(vi->dev, xdp_prog, act);
>  				if (unlikely(xdp_page != page))
> diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
> index 03ed492c4e14..debdb6286170 100644
> --- a/include/linux/netdevice.h
> +++ b/include/linux/netdevice.h
> @@ -1185,9 +1185,13 @@ struct dev_ifalias {
>   *	This function is used to set or query state related to XDP on the
>   *	netdevice and manage BPF offload. See definition of
>   *	enum bpf_netdev_command for details.
> - * int (*ndo_xdp_xmit)(struct net_device *dev, struct xdp_frame *xdp);
> - *	This function is used to submit a XDP packet for transmit on a
> - *	netdevice.
> + * int (*ndo_xdp_xmit)(struct net_device *dev, int n, struct xdp_frame **xdp);
> + *	This function is used to submit @n XDP packets for transmit on a
> + *	netdevice. Returns number of frames successfully transmitted, frames
> + *	that got dropped are freed/returned via xdp_return_frame().
> + *	Returns negative number, means general error invoking ndo, meaning
> + *	no frames were xmit'ed and core-caller will free all frames.
> + *	TODO: Consider add flag to allow sending flush operation.
>   * void (*ndo_xdp_flush)(struct net_device *dev);
>   *	This function is used to inform the driver to flush a particular
>   *	xdp tx queue. Must be called on same CPU as xdp_xmit.
> @@ -1375,8 +1379,8 @@ struct net_device_ops {
>  						       int needed_headroom);
>  	int			(*ndo_bpf)(struct net_device *dev,
>  					   struct netdev_bpf *bpf);
> -	int			(*ndo_xdp_xmit)(struct net_device *dev,
> -						struct xdp_frame *xdp);
> +	int			(*ndo_xdp_xmit)(struct net_device *dev, int n,
> +						struct xdp_frame **xdp);
>  	void			(*ndo_xdp_flush)(struct net_device *dev);
>  };
>  
> diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
> index 6f84100723b0..1317629662ae 100644
> --- a/kernel/bpf/devmap.c
> +++ b/kernel/bpf/devmap.c
> @@ -222,7 +222,7 @@ static int bq_xmit_all(struct bpf_dtab_netdev *obj,
>  			 struct xdp_bulk_queue *bq)
>  {
>  	struct net_device *dev = obj->dev;
> -	int sent = 0, drops = 0;
> +	int sent = 0, drops = 0, err = 0;
>  	int i;
>  
>  	if (unlikely(!bq->count))
> @@ -234,23 +234,32 @@ static int bq_xmit_all(struct bpf_dtab_netdev *obj,
>  		prefetch(xdpf);
>  	}
>  
> -	for (i = 0; i < bq->count; i++) {
> -		struct xdp_frame *xdpf = bq->q[i];
> -		int err;
> -
> -		err = dev->netdev_ops->ndo_xdp_xmit(dev, xdpf);
> -		if (err) {
> -			drops++;
> -			xdp_return_frame(xdpf);
> -		}
> -		sent++;
> +	sent = dev->netdev_ops->ndo_xdp_xmit(dev, bq->count, bq->q);
> +	if (sent < 0) {
> +		err = sent;
> +		sent = 0;
> +		goto error;
>  	}
> +	drops = bq->count - sent;
> +out:
>  	bq->count = 0;
>  
>  	trace_xdp_devmap_xmit(&obj->dtab->map, obj->bit,
>  			      sent, drops, bq->dev_rx, dev);
>  	bq->dev_rx = NULL;
>  	return 0;
> +error:
> +	/* If ndo_xdp_xmit fails with an errno, no frames have been
> +	 * xmit'ed and it's our responsibility to them free all.
> +	 */
> +	for (i = 0; i < bq->count; i++) {
> +		struct xdp_frame *xdpf = bq->q[i];
> +
> +		/* RX path under NAPI protection, can return frames faster */
> +		xdp_return_frame_rx_napi(xdpf);
> +		drops++;
> +	}
> +	goto out;
>  }
>  
>  /* __dev_map_flush is called from xdp_do_flush_map() which _must_ be signaled
> diff --git a/net/core/filter.c b/net/core/filter.c
> index 4a93423cc5ea..19504b7f4959 100644
> --- a/net/core/filter.c
> +++ b/net/core/filter.c
> @@ -3035,7 +3035,7 @@ static int __bpf_tx_xdp(struct net_device *dev,
>  			u32 index)
>  {
>  	struct xdp_frame *xdpf;
> -	int err;
> +	int sent;
>  
>  	if (!dev->netdev_ops->ndo_xdp_xmit) {
>  		return -EOPNOTSUPP;
> @@ -3045,9 +3045,9 @@ static int __bpf_tx_xdp(struct net_device *dev,
>  	if (unlikely(!xdpf))
>  		return -EOVERFLOW;
>  
> -	err = dev->netdev_ops->ndo_xdp_xmit(dev, xdpf);
> -	if (err)
> -		return err;
> +	sent = dev->netdev_ops->ndo_xdp_xmit(dev, 1, &xdpf);
> +	if (sent <= 0)
> +		return sent;
>  	dev->netdev_ops->ndo_xdp_flush(dev);
>  	return 0;
>  }
> 

^ permalink raw reply

* [PATCH net-next 4/4] net/smc: longer delay when freeing client link groups
From: Ursula Braun @ 2018-05-23 14:38 UTC (permalink / raw)
  To: davem; +Cc: netdev, linux-s390, schwidefsky, heiko.carstens, raspl, ubraun
In-Reply-To: <20180523143812.25824-1-ubraun@linux.ibm.com>

Client link group creation always follows the server linkgroup creation.
If peer creates a new server link group, client has to create a new
client link group. If peer reuses a server link group for a new
connection, client has to reuse its client link group as well. To
avoid out-of-sync conditions for link groups a longer delay for
for client link group removal is defined to make sure this link group
still exists, once the peer decides to reuse a server link group.

Currently the client link group delay time is just 10 jiffies larger
than the server link group delay time. This patch increases the delay
difference to 10 seconds to have a better protection against
out-of-sync link groups.

Signed-off-by: Ursula Braun <ubraun@linux.ibm.com>
---
 net/smc/smc_core.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c
index 2bf138e7d3ec..add82b0266f3 100644
--- a/net/smc/smc_core.c
+++ b/net/smc/smc_core.c
@@ -28,7 +28,7 @@
 
 #define SMC_LGR_NUM_INCR		256
 #define SMC_LGR_FREE_DELAY_SERV		(600 * HZ)
-#define SMC_LGR_FREE_DELAY_CLNT		(SMC_LGR_FREE_DELAY_SERV + 10)
+#define SMC_LGR_FREE_DELAY_CLNT		(SMC_LGR_FREE_DELAY_SERV + 10 * HZ)
 
 static struct smc_lgr_list smc_lgr_list = {	/* established link groups */
 	.lock = __SPIN_LOCK_UNLOCKED(smc_lgr_list.lock),
-- 
2.16.3

^ permalink raw reply related

* [PATCH net-next 3/4] net/smc: urgent data support
From: Ursula Braun @ 2018-05-23 14:38 UTC (permalink / raw)
  To: davem; +Cc: netdev, linux-s390, schwidefsky, heiko.carstens, raspl, ubraun
In-Reply-To: <20180523143812.25824-1-ubraun@linux.ibm.com>

From: Stefan Raspl <raspl@linux.ibm.com>

Add support for out of band data send and receive.

Signed-off-by: Stefan Raspl <raspl@linux.ibm.com>
Signed-off-by: Ursula Braun <ubraun@linux.ibm.com>
---
 net/smc/af_smc.c   |  24 ++++++++++-
 net/smc/smc.h      |  15 +++++++
 net/smc/smc_cdc.c  |  44 ++++++++++++++++++--
 net/smc/smc_cdc.h  |  13 ++++++
 net/smc/smc_core.c |   1 +
 net/smc/smc_rx.c   | 120 +++++++++++++++++++++++++++++++++++++++++++++++------
 net/smc/smc_tx.c   |  55 ++++++++++++++++--------
 net/smc/smc_tx.h   |   2 +-
 8 files changed, 238 insertions(+), 36 deletions(-)

diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c
index f2d925921d81..2c369d4bb1c1 100644
--- a/net/smc/af_smc.c
+++ b/net/smc/af_smc.c
@@ -8,8 +8,6 @@
  *
  *  Initial restrictions:
  *    - support for alternate links postponed
- *    - partial support for non-blocking sockets only
- *    - support for urgent data postponed
  *
  *  Copyright IBM Corp. 2016, 2018
  *
@@ -1338,6 +1336,8 @@ static __poll_t smc_poll(struct file *file, struct socket *sock,
 			if (sk->sk_state == SMC_APPCLOSEWAIT1)
 				mask |= EPOLLIN;
 		}
+		if (smc->conn.urg_state == SMC_URG_VALID)
+			mask |= EPOLLPRI;
 
 	}
 	release_sock(sk);
@@ -1477,10 +1477,13 @@ static int smc_getsockopt(struct socket *sock, int level, int optname,
 static int smc_ioctl(struct socket *sock, unsigned int cmd,
 		     unsigned long arg)
 {
+	union smc_host_cursor cons, urg;
+	struct smc_connection *conn;
 	struct smc_sock *smc;
 	int answ;
 
 	smc = smc_sk(sock->sk);
+	conn = &smc->conn;
 	if (smc->use_fallback) {
 		if (!smc->clcsock)
 			return -EBADF;
@@ -1517,6 +1520,23 @@ static int smc_ioctl(struct socket *sock, unsigned int cmd,
 		else
 			answ = smc_tx_prepared_sends(&smc->conn);
 		break;
+	case SIOCATMARK:
+		if (smc->sk.sk_state == SMC_LISTEN)
+			return -EINVAL;
+		if (smc->sk.sk_state == SMC_INIT ||
+		    smc->sk.sk_state == SMC_CLOSED) {
+			answ = 0;
+		} else {
+			smc_curs_write(&cons,
+			       smc_curs_read(&conn->local_tx_ctrl.cons, conn),
+				       conn);
+			smc_curs_write(&urg,
+				       smc_curs_read(&conn->urg_curs, conn),
+				       conn);
+			answ = smc_curs_diff(conn->rmb_desc->len,
+					     &cons, &urg) == 1;
+		}
+		break;
 	default:
 		return -ENOIOCTLCMD;
 	}
diff --git a/net/smc/smc.h b/net/smc/smc.h
index a1467e411645..51ae1f10d81a 100644
--- a/net/smc/smc.h
+++ b/net/smc/smc.h
@@ -114,6 +114,12 @@ struct smc_host_cdc_msg {		/* Connection Data Control message */
 	u8				reserved[18];
 } __aligned(8);
 
+enum smc_urg_state {
+	SMC_URG_VALID,			/* data present */
+	SMC_URG_NOTYET,			/* data pending */
+	SMC_URG_READ			/* data was already read */
+};
+
 struct smc_connection {
 	struct rb_node		alert_node;
 	struct smc_link_group	*lgr;		/* link group of connection */
@@ -160,6 +166,15 @@ struct smc_connection {
 	union smc_host_cursor	rx_curs_confirmed; /* confirmed to peer
 						    * source of snd_una ?
 						    */
+	union smc_host_cursor	urg_curs;	/* points at urgent byte */
+	enum smc_urg_state	urg_state;
+	bool			urg_tx_pend;	/* urgent data staged */
+	bool			urg_rx_skip_pend;
+						/* indicate urgent oob data
+						 * read, but previous regular
+						 * data still pending
+						 */
+	char			urg_rx_byte;	/* urgent byte */
 	atomic_t		bytes_to_rcv;	/* arrived data,
 						 * not yet received
 						 */
diff --git a/net/smc/smc_cdc.c b/net/smc/smc_cdc.c
index 8d2c079c87b0..a7e8d63fc8ae 100644
--- a/net/smc/smc_cdc.c
+++ b/net/smc/smc_cdc.c
@@ -164,6 +164,28 @@ static inline bool smc_cdc_before(u16 seq1, u16 seq2)
 	return (s16)(seq1 - seq2) < 0;
 }
 
+static void smc_cdc_handle_urg_data_arrival(struct smc_sock *smc,
+					    int *diff_prod)
+{
+	struct smc_connection *conn = &smc->conn;
+	char *base;
+
+	/* new data included urgent business */
+	smc_curs_write(&conn->urg_curs,
+		       smc_curs_read(&conn->local_rx_ctrl.prod, conn),
+		       conn);
+	conn->urg_state = SMC_URG_VALID;
+	if (!sock_flag(&smc->sk, SOCK_URGINLINE))
+		/* we'll skip the urgent byte, so don't account for it */
+		(*diff_prod)--;
+	base = (char *)conn->rmb_desc->cpu_addr;
+	if (conn->urg_curs.count)
+		conn->urg_rx_byte = *(base + conn->urg_curs.count - 1);
+	else
+		conn->urg_rx_byte = *(base + conn->rmb_desc->len - 1);
+	sk_send_sigurg(&smc->sk);
+}
+
 static void smc_cdc_msg_recv_action(struct smc_sock *smc,
 				    struct smc_cdc_msg *cdc)
 {
@@ -194,15 +216,25 @@ static void smc_cdc_msg_recv_action(struct smc_sock *smc,
 	diff_prod = smc_curs_diff(conn->rmb_desc->len, &prod_old,
 				  &conn->local_rx_ctrl.prod);
 	if (diff_prod) {
+		if (conn->local_rx_ctrl.prod_flags.urg_data_present)
+			smc_cdc_handle_urg_data_arrival(smc, &diff_prod);
 		/* bytes_to_rcv is decreased in smc_recvmsg */
 		smp_mb__before_atomic();
 		atomic_add(diff_prod, &conn->bytes_to_rcv);
 		/* guarantee 0 <= bytes_to_rcv <= rmb_desc->len */
 		smp_mb__after_atomic();
 		smc->sk.sk_data_ready(&smc->sk);
-	} else if ((conn->local_rx_ctrl.prod_flags.write_blocked) ||
-		   (conn->local_rx_ctrl.prod_flags.cons_curs_upd_req)) {
-		smc->sk.sk_data_ready(&smc->sk);
+	} else {
+		if (conn->local_rx_ctrl.prod_flags.write_blocked ||
+		    conn->local_rx_ctrl.prod_flags.cons_curs_upd_req ||
+		    conn->local_rx_ctrl.prod_flags.urg_data_pending) {
+			if (conn->local_rx_ctrl.prod_flags.urg_data_pending)
+				conn->urg_state = SMC_URG_NOTYET;
+			/* force immediate tx of current consumer cursor, but
+			 * under send_lock to guarantee arrival in seqno-order
+			 */
+			smc_tx_sndbuf_nonempty(conn);
+		}
 	}
 
 	/* piggy backed tx info */
@@ -212,6 +244,12 @@ static void smc_cdc_msg_recv_action(struct smc_sock *smc,
 		/* trigger socket release if connection closed */
 		smc_close_wake_tx_prepared(smc);
 	}
+	if (diff_cons && conn->urg_tx_pend &&
+	    atomic_read(&conn->peer_rmbe_space) == conn->peer_rmbe_size) {
+		/* urg data confirmed by peer, indicate we're ready for more */
+		conn->urg_tx_pend = false;
+		smc->sk.sk_write_space(&smc->sk);
+	}
 
 	if (conn->local_rx_ctrl.conn_state_flags.peer_conn_abort) {
 		smc->sk.sk_err = ECONNRESET;
diff --git a/net/smc/smc_cdc.h b/net/smc/smc_cdc.h
index d2012fd22100..f60082fee5b8 100644
--- a/net/smc/smc_cdc.h
+++ b/net/smc/smc_cdc.h
@@ -146,6 +146,19 @@ static inline int smc_curs_diff(unsigned int size,
 	return max_t(int, 0, (new->count - old->count));
 }
 
+/* calculate cursor difference between old and new - returns negative
+ * value in case old > new
+ */
+static inline int smc_curs_comp(unsigned int size,
+				union smc_host_cursor *old,
+				union smc_host_cursor *new)
+{
+	if (old->wrap > new->wrap ||
+	    (old->wrap == new->wrap && old->count > new->count))
+		return -smc_curs_diff(size, new, old);
+	return smc_curs_diff(size, old, new);
+}
+
 static inline void smc_host_cursor_to_cdc(union smc_cdc_cursor *peer,
 					  union smc_host_cursor *local,
 					  struct smc_connection *conn)
diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c
index 21c244f53b0a..2bf138e7d3ec 100644
--- a/net/smc/smc_core.c
+++ b/net/smc/smc_core.c
@@ -544,6 +544,7 @@ int smc_conn_create(struct smc_sock *smc,
 	}
 	conn->local_tx_ctrl.common.type = SMC_CDC_MSG_TYPE;
 	conn->local_tx_ctrl.len = SMC_WR_TX_SIZE;
+	conn->urg_state = SMC_URG_READ;
 #ifndef KERNEL_HAS_ATOMIC64
 	spin_lock_init(&conn->acurs_lock);
 #endif
diff --git a/net/smc/smc_rx.c b/net/smc/smc_rx.c
index 290a434471d1..3d77b383cccd 100644
--- a/net/smc/smc_rx.c
+++ b/net/smc/smc_rx.c
@@ -47,16 +47,59 @@ static void smc_rx_wake_up(struct sock *sk)
  *   @conn   connection to update
  *   @cons   consumer cursor
  *   @len    number of Bytes consumed
+ *   Returns:
+ *   1 if we should end our receive, 0 otherwise
  */
-static void smc_rx_update_consumer(struct smc_connection *conn,
-				   union smc_host_cursor cons, size_t len)
+static int smc_rx_update_consumer(struct smc_sock *smc,
+				  union smc_host_cursor cons, size_t len)
 {
+	struct smc_connection *conn = &smc->conn;
+	struct sock *sk = &smc->sk;
+	bool force = false;
+	int diff, rc = 0;
+
 	smc_curs_add(conn->rmb_desc->len, &cons, len);
+
+	/* did we process urgent data? */
+	if (conn->urg_state == SMC_URG_VALID || conn->urg_rx_skip_pend) {
+		diff = smc_curs_comp(conn->rmb_desc->len, &cons,
+				     &conn->urg_curs);
+		if (sock_flag(sk, SOCK_URGINLINE)) {
+			if (diff == 0) {
+				force = true;
+				rc = 1;
+				conn->urg_state = SMC_URG_READ;
+			}
+		} else {
+			if (diff == 1) {
+				/* skip urgent byte */
+				force = true;
+				smc_curs_add(conn->rmb_desc->len, &cons, 1);
+				conn->urg_rx_skip_pend = false;
+			} else if (diff < -1)
+				/* we read past urgent byte */
+				conn->urg_state = SMC_URG_READ;
+		}
+	}
+
 	smc_curs_write(&conn->local_tx_ctrl.cons, smc_curs_read(&cons, conn),
 		       conn);
+
 	/* send consumer cursor update if required */
 	/* similar to advertising new TCP rcv_wnd if required */
-	smc_tx_consumer_update(conn);
+	smc_tx_consumer_update(conn, force);
+
+	return rc;
+}
+
+static void smc_rx_update_cons(struct smc_sock *smc, size_t len)
+{
+	struct smc_connection *conn = &smc->conn;
+	union smc_host_cursor cons;
+
+	smc_curs_write(&cons, smc_curs_read(&conn->local_tx_ctrl.cons, conn),
+		       conn);
+	smc_rx_update_consumer(smc, cons, len);
 }
 
 struct smc_spd_priv {
@@ -70,7 +113,6 @@ static void smc_rx_pipe_buf_release(struct pipe_inode_info *pipe,
 	struct smc_spd_priv *priv = (struct smc_spd_priv *)buf->private;
 	struct smc_sock *smc = priv->smc;
 	struct smc_connection *conn;
-	union smc_host_cursor cons;
 	struct sock *sk = &smc->sk;
 
 	if (sk->sk_state == SMC_CLOSED ||
@@ -79,9 +121,7 @@ static void smc_rx_pipe_buf_release(struct pipe_inode_info *pipe,
 		goto out;
 	conn = &smc->conn;
 	lock_sock(sk);
-	smc_curs_write(&cons, smc_curs_read(&conn->local_tx_ctrl.cons, conn),
-		       conn);
-	smc_rx_update_consumer(conn, cons, priv->len);
+	smc_rx_update_cons(smc, priv->len);
 	release_sock(sk);
 	if (atomic_sub_and_test(priv->len, &conn->splice_pending))
 		smc_rx_wake_up(sk);
@@ -184,6 +224,52 @@ int smc_rx_wait(struct smc_sock *smc, long *timeo,
 	return rc;
 }
 
+static int smc_rx_recv_urg(struct smc_sock *smc, struct msghdr *msg, int len,
+			   int flags)
+{
+	struct smc_connection *conn = &smc->conn;
+	union smc_host_cursor cons;
+	struct sock *sk = &smc->sk;
+	int rc = 0;
+
+	if (sock_flag(sk, SOCK_URGINLINE) ||
+	    !(conn->urg_state == SMC_URG_VALID) ||
+	    conn->urg_state == SMC_URG_READ)
+		return -EINVAL;
+
+	if (conn->urg_state == SMC_URG_VALID) {
+		if (!(flags & MSG_PEEK))
+			smc->conn.urg_state = SMC_URG_READ;
+		msg->msg_flags |= MSG_OOB;
+		if (len > 0) {
+			if (!(flags & MSG_TRUNC))
+				rc = memcpy_to_msg(msg, &conn->urg_rx_byte, 1);
+			len = 1;
+			smc_curs_write(&cons,
+				       smc_curs_read(&conn->local_tx_ctrl.cons,
+						     conn),
+				       conn);
+			if (smc_curs_diff(conn->rmb_desc->len, &cons,
+					  &conn->urg_curs) > 1)
+				conn->urg_rx_skip_pend = true;
+			/* Urgent Byte was already accounted for, but trigger
+			 * skipping the urgent byte in non-inline case
+			 */
+			if (!(flags & MSG_PEEK))
+				smc_rx_update_consumer(smc, cons, 0);
+		} else {
+			msg->msg_flags |= MSG_TRUNC;
+		}
+
+		return rc ? -EFAULT : len;
+	}
+
+	if (sk->sk_state == SMC_CLOSED || sk->sk_shutdown & RCV_SHUTDOWN)
+		return 0;
+
+	return -EAGAIN;
+}
+
 /* smc_rx_recvmsg - receive data from RMBE
  * @msg:	copy data to receive buffer
  * @pipe:	copy data to pipe if set - indicates splice() call
@@ -209,12 +295,12 @@ int smc_rx_recvmsg(struct smc_sock *smc, struct msghdr *msg,
 
 	if (unlikely(flags & MSG_ERRQUEUE))
 		return -EINVAL; /* future work for sk.sk_family == AF_SMC */
-	if (flags & MSG_OOB)
-		return -EINVAL; /* future work */
 
 	sk = &smc->sk;
 	if (sk->sk_state == SMC_LISTEN)
 		return -ENOTCONN;
+	if (flags & MSG_OOB)
+		return smc_rx_recv_urg(smc, msg, len, flags);
 	timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
 	target = sock_rcvlowat(sk, flags & MSG_WAITALL, len);
 
@@ -227,6 +313,9 @@ int smc_rx_recvmsg(struct smc_sock *smc, struct msghdr *msg,
 
 		if (atomic_read(&conn->bytes_to_rcv))
 			goto copy;
+		else if (conn->urg_state == SMC_URG_VALID)
+			/* we received a single urgent Byte - skip */
+			smc_rx_update_cons(smc, 0);
 
 		if (sk->sk_shutdown & RCV_SHUTDOWN ||
 		    smc_cdc_rxed_any_close_or_senddone(conn) ||
@@ -281,14 +370,18 @@ int smc_rx_recvmsg(struct smc_sock *smc, struct msghdr *msg,
 			continue;
 		}
 
-		/* not more than what user space asked for */
-		copylen = min_t(size_t, read_remaining, readable);
 		smc_curs_write(&cons,
 			       smc_curs_read(&conn->local_tx_ctrl.cons, conn),
 			       conn);
 		/* subsequent splice() calls pick up where previous left */
 		if (splbytes)
 			smc_curs_add(conn->rmb_desc->len, &cons, splbytes);
+		if (conn->urg_state == SMC_URG_VALID &&
+		    sock_flag(&smc->sk, SOCK_URGINLINE) &&
+		    readable > 1)
+			readable--;	/* always stop at urgent Byte */
+		/* not more than what user space asked for */
+		copylen = min_t(size_t, read_remaining, readable);
 		/* determine chunks where to read from rcvbuf */
 		/* either unwrapped case, or 1st chunk of wrapped case */
 		chunk_len = min_t(size_t, copylen, conn->rmb_desc->len -
@@ -333,8 +426,8 @@ int smc_rx_recvmsg(struct smc_sock *smc, struct msghdr *msg,
 			atomic_sub(copylen, &conn->bytes_to_rcv);
 			/* guarantee 0 <= bytes_to_rcv <= rmb_desc->len */
 			smp_mb__after_atomic();
-			if (msg)
-				smc_rx_update_consumer(conn, cons, copylen);
+			if (msg && smc_rx_update_consumer(smc, cons, copylen))
+				goto out;
 		}
 	} while (read_remaining);
 out:
@@ -346,4 +439,5 @@ void smc_rx_init(struct smc_sock *smc)
 {
 	smc->sk.sk_data_ready = smc_rx_wake_up;
 	atomic_set(&smc->conn.splice_pending, 0);
+	smc->conn.urg_state = SMC_URG_READ;
 }
diff --git a/net/smc/smc_tx.c b/net/smc/smc_tx.c
index 1f4a38b857f0..cee666400752 100644
--- a/net/smc/smc_tx.c
+++ b/net/smc/smc_tx.c
@@ -32,7 +32,7 @@
 /***************************** sndbuf producer *******************************/
 
 /* callback implementation for sk.sk_write_space()
- * to wakeup sndbuf producers that blocked with smc_tx_wait_memory().
+ * to wakeup sndbuf producers that blocked with smc_tx_wait().
  * called under sk_socket lock.
  */
 static void smc_tx_write_space(struct sock *sk)
@@ -56,7 +56,7 @@ static void smc_tx_write_space(struct sock *sk)
 	}
 }
 
-/* Wakeup sndbuf producers that blocked with smc_tx_wait_memory().
+/* Wakeup sndbuf producers that blocked with smc_tx_wait().
  * Cf. tcp_data_snd_check()=>tcp_check_space()=>tcp_new_space().
  */
 void smc_tx_sndbuf_nonfull(struct smc_sock *smc)
@@ -66,8 +66,10 @@ void smc_tx_sndbuf_nonfull(struct smc_sock *smc)
 		smc->sk.sk_write_space(&smc->sk);
 }
 
-/* blocks sndbuf producer until at least one byte of free space available */
-static int smc_tx_wait_memory(struct smc_sock *smc, int flags)
+/* blocks sndbuf producer until at least one byte of free space available
+ * or urgent Byte was consumed
+ */
+static int smc_tx_wait(struct smc_sock *smc, int flags)
 {
 	DEFINE_WAIT_FUNC(wait, woken_wake_function);
 	struct smc_connection *conn = &smc->conn;
@@ -103,14 +105,15 @@ static int smc_tx_wait_memory(struct smc_sock *smc, int flags)
 			break;
 		}
 		sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
-		if (atomic_read(&conn->sndbuf_space))
-			break; /* at least 1 byte of free space available */
+		if (atomic_read(&conn->sndbuf_space) && !conn->urg_tx_pend)
+			break; /* at least 1 byte of free & no urgent data */
 		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
 		sk_wait_event(sk, &timeo,
 			      sk->sk_err ||
 			      (sk->sk_shutdown & SEND_SHUTDOWN) ||
 			      smc_cdc_rxed_any_close(conn) ||
-			      atomic_read(&conn->sndbuf_space),
+			      (atomic_read(&conn->sndbuf_space) &&
+			       !conn->urg_tx_pend),
 			      &wait);
 	}
 	remove_wait_queue(sk_sleep(sk), &wait);
@@ -157,8 +160,11 @@ int smc_tx_sendmsg(struct smc_sock *smc, struct msghdr *msg, size_t len)
 		if (smc_cdc_rxed_any_close(conn))
 			return send_done ?: -ECONNRESET;
 
-		if (!atomic_read(&conn->sndbuf_space)) {
-			rc = smc_tx_wait_memory(smc, msg->msg_flags);
+		if (msg->msg_flags & MSG_OOB)
+			conn->local_tx_ctrl.prod_flags.urg_data_pending = 1;
+
+		if (!atomic_read(&conn->sndbuf_space) || conn->urg_tx_pend) {
+			rc = smc_tx_wait(smc, msg->msg_flags);
 			if (rc) {
 				if (send_done)
 					return send_done;
@@ -168,7 +174,7 @@ int smc_tx_sendmsg(struct smc_sock *smc, struct msghdr *msg, size_t len)
 		}
 
 		/* initialize variables for 1st iteration of subsequent loop */
-		/* could be just 1 byte, even after smc_tx_wait_memory above */
+		/* could be just 1 byte, even after smc_tx_wait above */
 		writespace = atomic_read(&conn->sndbuf_space);
 		/* not more than what user space asked for */
 		copylen = min_t(size_t, send_remaining, writespace);
@@ -218,6 +224,8 @@ int smc_tx_sendmsg(struct smc_sock *smc, struct msghdr *msg, size_t len)
 		/* since we just produced more new data into sndbuf,
 		 * trigger sndbuf consumer: RDMA write into peer RMBE and CDC
 		 */
+		if ((msg->msg_flags & MSG_OOB) && !send_remaining)
+			conn->urg_tx_pend = true;
 		if ((msg->msg_flags & MSG_MORE || smc_tx_is_corked(smc)) &&
 		    (atomic_read(&conn->sndbuf_space) >
 						(conn->sndbuf_desc->len >> 1)))
@@ -299,6 +307,7 @@ static int smc_tx_rdma_writes(struct smc_connection *conn)
 	union smc_host_cursor sent, prep, prod, cons;
 	struct ib_sge sges[SMC_IB_MAX_SEND_SGE];
 	struct smc_link_group *lgr = conn->lgr;
+	struct smc_cdc_producer_flags *pflags;
 	int to_send, rmbespace;
 	struct smc_link *link;
 	dma_addr_t dma_addr;
@@ -326,7 +335,8 @@ static int smc_tx_rdma_writes(struct smc_connection *conn)
 		       conn);
 
 	/* if usable snd_wnd closes ask peer to advertise once it opens again */
-	conn->local_tx_ctrl.prod_flags.write_blocked = (to_send >= rmbespace);
+	pflags = &conn->local_tx_ctrl.prod_flags;
+	pflags->write_blocked = (to_send >= rmbespace);
 	/* cf. usable snd_wnd */
 	len = min(to_send, rmbespace);
 
@@ -391,6 +401,8 @@ static int smc_tx_rdma_writes(struct smc_connection *conn)
 		src_len_sum = src_len;
 	}
 
+	if (conn->urg_tx_pend && len == to_send)
+		pflags->urg_data_present = 1;
 	smc_tx_advance_cursors(conn, &prod, &sent, len);
 	/* update connection's cursors with advanced local cursors */
 	smc_curs_write(&conn->local_tx_ctrl.prod,
@@ -410,6 +422,7 @@ static int smc_tx_rdma_writes(struct smc_connection *conn)
  */
 int smc_tx_sndbuf_nonempty(struct smc_connection *conn)
 {
+	struct smc_cdc_producer_flags *pflags;
 	struct smc_cdc_tx_pend *pend;
 	struct smc_wr_buf *wr_buf;
 	int rc;
@@ -433,14 +446,21 @@ int smc_tx_sndbuf_nonempty(struct smc_connection *conn)
 		goto out_unlock;
 	}
 
-	rc = smc_tx_rdma_writes(conn);
-	if (rc) {
-		smc_wr_tx_put_slot(&conn->lgr->lnk[SMC_SINGLE_LINK],
-				   (struct smc_wr_tx_pend_priv *)pend);
-		goto out_unlock;
+	if (!conn->local_tx_ctrl.prod_flags.urg_data_present) {
+		rc = smc_tx_rdma_writes(conn);
+		if (rc) {
+			smc_wr_tx_put_slot(&conn->lgr->lnk[SMC_SINGLE_LINK],
+					   (struct smc_wr_tx_pend_priv *)pend);
+			goto out_unlock;
+		}
 	}
 
 	rc = smc_cdc_msg_send(conn, wr_buf, pend);
+	pflags = &conn->local_tx_ctrl.prod_flags;
+	if (!rc && pflags->urg_data_present) {
+		pflags->urg_data_pending = 0;
+		pflags->urg_data_present = 0;
+	}
 
 out_unlock:
 	spin_unlock_bh(&conn->send_lock);
@@ -473,7 +493,7 @@ void smc_tx_work(struct work_struct *work)
 	release_sock(&smc->sk);
 }
 
-void smc_tx_consumer_update(struct smc_connection *conn)
+void smc_tx_consumer_update(struct smc_connection *conn, bool force)
 {
 	union smc_host_cursor cfed, cons;
 	int to_confirm;
@@ -487,6 +507,7 @@ void smc_tx_consumer_update(struct smc_connection *conn)
 	to_confirm = smc_curs_diff(conn->rmb_desc->len, &cfed, &cons);
 
 	if (conn->local_rx_ctrl.prod_flags.cons_curs_upd_req ||
+	    force ||
 	    ((to_confirm > conn->rmbe_update_limit) &&
 	     ((to_confirm > (conn->rmb_desc->len / 2)) ||
 	      conn->local_rx_ctrl.prod_flags.write_blocked))) {
diff --git a/net/smc/smc_tx.h b/net/smc/smc_tx.h
index 44d077942976..9d2238909fa0 100644
--- a/net/smc/smc_tx.h
+++ b/net/smc/smc_tx.h
@@ -32,6 +32,6 @@ void smc_tx_init(struct smc_sock *smc);
 int smc_tx_sendmsg(struct smc_sock *smc, struct msghdr *msg, size_t len);
 int smc_tx_sndbuf_nonempty(struct smc_connection *conn);
 void smc_tx_sndbuf_nonfull(struct smc_sock *smc);
-void smc_tx_consumer_update(struct smc_connection *conn);
+void smc_tx_consumer_update(struct smc_connection *conn, bool force);
 
 #endif /* SMC_TX_H */
-- 
2.16.3

^ permalink raw reply related

* [PATCH net-next 2/4] net/smc: lock smc_lgr_list in port_terminate()
From: Ursula Braun @ 2018-05-23 14:38 UTC (permalink / raw)
  To: davem; +Cc: netdev, linux-s390, schwidefsky, heiko.carstens, raspl, ubraun
In-Reply-To: <20180523143812.25824-1-ubraun@linux.ibm.com>

From: Hans Wippel <hwippel@linux.ibm.com>

Currently, smc_port_terminate() is not holding the lock of the lgr list
while it is traversing the list. This patch adds locking to this
function and changes smc_lgr_terminate() accordingly.

Signed-off-by: Hans Wippel <hwippel@linux.ibm.com>
Signed-off-by: Ursula Braun <ubraun@linux.ibm.com>
---
 net/smc/smc_core.c | 16 +++++++++++++---
 1 file changed, 13 insertions(+), 3 deletions(-)

diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c
index 1e5c0e90a706..21c244f53b0a 100644
--- a/net/smc/smc_core.c
+++ b/net/smc/smc_core.c
@@ -346,7 +346,7 @@ void smc_lgr_forget(struct smc_link_group *lgr)
 }
 
 /* terminate linkgroup abnormally */
-void smc_lgr_terminate(struct smc_link_group *lgr)
+static void __smc_lgr_terminate(struct smc_link_group *lgr)
 {
 	struct smc_connection *conn;
 	struct smc_sock *smc;
@@ -355,7 +355,8 @@ void smc_lgr_terminate(struct smc_link_group *lgr)
 	if (lgr->terminating)
 		return;	/* lgr already terminating */
 	lgr->terminating = 1;
-	smc_lgr_forget(lgr);
+	if (!list_empty(&lgr->list)) /* forget lgr */
+		list_del_init(&lgr->list);
 	smc_llc_link_inactive(&lgr->lnk[SMC_SINGLE_LINK]);
 
 	write_lock_bh(&lgr->conns_lock);
@@ -377,16 +378,25 @@ void smc_lgr_terminate(struct smc_link_group *lgr)
 	smc_lgr_schedule_free_work(lgr);
 }
 
+void smc_lgr_terminate(struct smc_link_group *lgr)
+{
+	spin_lock_bh(&smc_lgr_list.lock);
+	__smc_lgr_terminate(lgr);
+	spin_unlock_bh(&smc_lgr_list.lock);
+}
+
 /* Called when IB port is terminated */
 void smc_port_terminate(struct smc_ib_device *smcibdev, u8 ibport)
 {
 	struct smc_link_group *lgr, *l;
 
+	spin_lock_bh(&smc_lgr_list.lock);
 	list_for_each_entry_safe(lgr, l, &smc_lgr_list.list, list) {
 		if (lgr->lnk[SMC_SINGLE_LINK].smcibdev == smcibdev &&
 		    lgr->lnk[SMC_SINGLE_LINK].ibport == ibport)
-			smc_lgr_terminate(lgr);
+			__smc_lgr_terminate(lgr);
 	}
+	spin_unlock_bh(&smc_lgr_list.lock);
 }
 
 /* Determine vlan of internal TCP socket.
-- 
2.16.3

^ permalink raw reply related

* [PATCH net-next 1/4] net/smc: return 0 for ioctl calls in states INIT and CLOSED
From: Ursula Braun @ 2018-05-23 14:38 UTC (permalink / raw)
  To: davem; +Cc: netdev, linux-s390, schwidefsky, heiko.carstens, raspl, ubraun
In-Reply-To: <20180523143812.25824-1-ubraun@linux.ibm.com>

A connected SMC-socket contains addresses of descriptors for the
send buffer and the rmb (receive buffer). Fields of these descriptors
are used to determine the answer for certain ioctl requests.
Add extra handling for unconnected SMC socket states without valid
buffer descriptor addresses.

Signed-off-by: Ursula Braun <ubraun@linux.ibm.com>
Reported-by: syzbot+e6714328fda813fc670f@syzkaller.appspotmail.com
---
 net/smc/af_smc.c | 18 +++++++++++++++---
 1 file changed, 15 insertions(+), 3 deletions(-)

diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c
index 48530dab5c94..f2d925921d81 100644
--- a/net/smc/af_smc.c
+++ b/net/smc/af_smc.c
@@ -1490,20 +1490,32 @@ static int smc_ioctl(struct socket *sock, unsigned int cmd,
 	case SIOCINQ: /* same as FIONREAD */
 		if (smc->sk.sk_state == SMC_LISTEN)
 			return -EINVAL;
-		answ = atomic_read(&smc->conn.bytes_to_rcv);
+		if (smc->sk.sk_state == SMC_INIT ||
+		    smc->sk.sk_state == SMC_CLOSED)
+			answ = 0;
+		else
+			answ = atomic_read(&smc->conn.bytes_to_rcv);
 		break;
 	case SIOCOUTQ:
 		/* output queue size (not send + not acked) */
 		if (smc->sk.sk_state == SMC_LISTEN)
 			return -EINVAL;
-		answ = smc->conn.sndbuf_desc->len -
+		if (smc->sk.sk_state == SMC_INIT ||
+		    smc->sk.sk_state == SMC_CLOSED)
+			answ = 0;
+		else
+			answ = smc->conn.sndbuf_desc->len -
 					atomic_read(&smc->conn.sndbuf_space);
 		break;
 	case SIOCOUTQNSD:
 		/* output queue size (not send only) */
 		if (smc->sk.sk_state == SMC_LISTEN)
 			return -EINVAL;
-		answ = smc_tx_prepared_sends(&smc->conn);
+		if (smc->sk.sk_state == SMC_INIT ||
+		    smc->sk.sk_state == SMC_CLOSED)
+			answ = 0;
+		else
+			answ = smc_tx_prepared_sends(&smc->conn);
 		break;
 	default:
 		return -ENOIOCTLCMD;
-- 
2.16.3

^ permalink raw reply related

* [PATCH net-next 0/4] patches 2018-05-23
From: Ursula Braun @ 2018-05-23 14:38 UTC (permalink / raw)
  To: davem; +Cc: netdev, linux-s390, schwidefsky, heiko.carstens, raspl, ubraun

Dave,

here are more smc-patches for net-next:

Patch 1 fixes an ioctl problem detected by syzbot.

Patch 2 improves smc_lgr_list locking in case of abnormal link
group termination. If you want to receive a version for the net-tree,
please let me know. It would look somewhat different, since the port
terminate code has been moved to smc_core.c on net-next.

Patch 3 enables SMC to deal with urgent data.

Patch 4 is a minor improvement to avoid out-of-sync linkgroups
between 2 peers.

Thanks, Ursula

Hans Wippel (1):
  net/smc: lock smc_lgr_list in port_terminate()

Stefan Raspl (1):
  net/smc: urgent data support

Ursula Braun (2):
  net/smc: return 0 for ioctl calls in states INIT and CLOSED
  net/smc: longer delay when freeing client link groups

 net/smc/af_smc.c   |  42 ++++++++++++++++---
 net/smc/smc.h      |  15 +++++++
 net/smc/smc_cdc.c  |  44 ++++++++++++++++++--
 net/smc/smc_cdc.h  |  13 ++++++
 net/smc/smc_core.c |  19 +++++++--
 net/smc/smc_rx.c   | 120 +++++++++++++++++++++++++++++++++++++++++++++++------
 net/smc/smc_tx.c   |  55 ++++++++++++++++--------
 net/smc/smc_tx.h   |   2 +-
 8 files changed, 267 insertions(+), 43 deletions(-)

-- 
2.16.3

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox