Netdev List
 help / color / mirror / Atom feed
* [PATCH iproute2-next] bpf: replace snprintf with asprintf when dealing with long buffers
From: Andrea Claudi @ 2019-09-09 16:05 UTC (permalink / raw)
  To: netdev; +Cc: stephen, dsahern

This reduces stack usage, as asprintf allocates memory on the heap.

This indirectly fixes a snprintf truncation warning (from gcc v9.2.1):

bpf.c: In function ‘bpf_get_work_dir’:
bpf.c:784:49: warning: ‘snprintf’ output may be truncated before the last format character [-Wformat-truncation=]
  784 |  snprintf(bpf_wrk_dir, sizeof(bpf_wrk_dir), "%s/", mnt);
      |                                                 ^
bpf.c:784:2: note: ‘snprintf’ output between 2 and 4097 bytes into a destination of size 4096
  784 |  snprintf(bpf_wrk_dir, sizeof(bpf_wrk_dir), "%s/", mnt);
      |  ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Fixes: e42256699cac ("bpf: make tc's bpf loader generic and move into lib")
Signed-off-by: Andrea Claudi <aclaudi@redhat.com>
---
 lib/bpf.c | 95 +++++++++++++++++++++++++++++++++----------------------
 1 file changed, 57 insertions(+), 38 deletions(-)

diff --git a/lib/bpf.c b/lib/bpf.c
index 7d2a322ffbaec..18e0334d3f11b 100644
--- a/lib/bpf.c
+++ b/lib/bpf.c
@@ -406,13 +406,14 @@ static int bpf_derive_elf_map_from_fdinfo(int fd, struct bpf_elf_map *map,
 					  struct bpf_map_ext *ext)
 {
 	unsigned int val, owner_type = 0, owner_jited = 0;
-	char file[PATH_MAX], buff[4096];
+	char *file, buff[4096];
 	FILE *fp;
 
-	snprintf(file, sizeof(file), "/proc/%d/fdinfo/%d", getpid(), fd);
+	asprintf(&file, "/proc/%d/fdinfo/%d", getpid(), fd);
 	memset(map, 0, sizeof(*map));
 
 	fp = fopen(file, "r");
+	free(file);
 	if (!fp) {
 		fprintf(stderr, "No procfs support?!\n");
 		return -EIO;
@@ -600,7 +601,7 @@ int bpf_trace_pipe(void)
 		0,
 	};
 	int fd_in, fd_out = STDERR_FILENO;
-	char tpipe[PATH_MAX];
+	char *tpipe;
 	const char *mnt;
 
 	mnt = bpf_find_mntpt("tracefs", TRACEFS_MAGIC, tracefs_mnt,
@@ -610,9 +611,10 @@ int bpf_trace_pipe(void)
 		return -1;
 	}
 
-	snprintf(tpipe, sizeof(tpipe), "%s/trace_pipe", mnt);
+	asprintf(&tpipe, "%s/trace_pipe", mnt);
 
 	fd_in = open(tpipe, O_RDONLY);
+	free(tpipe);
 	if (fd_in < 0)
 		return -1;
 
@@ -633,37 +635,42 @@ int bpf_trace_pipe(void)
 
 static int bpf_gen_global(const char *bpf_sub_dir)
 {
-	char bpf_glo_dir[PATH_MAX];
+	char *bpf_glo_dir;
 	int ret;
 
-	snprintf(bpf_glo_dir, sizeof(bpf_glo_dir), "%s/%s/",
-		 bpf_sub_dir, BPF_DIR_GLOBALS);
+	asprintf(&bpf_glo_dir, "%s/%s/", bpf_sub_dir, BPF_DIR_GLOBALS);
 
 	ret = mkdir(bpf_glo_dir, S_IRWXU);
 	if (ret && errno != EEXIST) {
 		fprintf(stderr, "mkdir %s failed: %s\n", bpf_glo_dir,
 			strerror(errno));
-		return ret;
+		goto out;
 	}
 
-	return 0;
+	ret = 0;
+out:
+	free(bpf_glo_dir);
+	return ret;
 }
 
 static int bpf_gen_master(const char *base, const char *name)
 {
-	char bpf_sub_dir[PATH_MAX + NAME_MAX + 1];
+	char *bpf_sub_dir;
 	int ret;
 
-	snprintf(bpf_sub_dir, sizeof(bpf_sub_dir), "%s%s/", base, name);
+	asprintf(&bpf_sub_dir, "%s%s/", base, name);
 
 	ret = mkdir(bpf_sub_dir, S_IRWXU);
 	if (ret && errno != EEXIST) {
 		fprintf(stderr, "mkdir %s failed: %s\n", bpf_sub_dir,
 			strerror(errno));
-		return ret;
+		goto out;
 	}
 
-	return bpf_gen_global(bpf_sub_dir);
+	ret = bpf_gen_global(bpf_sub_dir);
+out:
+	free(bpf_sub_dir);
+	return ret;
 }
 
 static int bpf_slave_via_bind_mnt(const char *full_name,
@@ -692,13 +699,13 @@ static int bpf_slave_via_bind_mnt(const char *full_name,
 static int bpf_gen_slave(const char *base, const char *name,
 			 const char *link)
 {
-	char bpf_lnk_dir[PATH_MAX + NAME_MAX + 1];
-	char bpf_sub_dir[PATH_MAX + NAME_MAX];
+	char *bpf_lnk_dir;
+	char *bpf_sub_dir;
 	struct stat sb = {};
 	int ret;
 
-	snprintf(bpf_lnk_dir, sizeof(bpf_lnk_dir), "%s%s/", base, link);
-	snprintf(bpf_sub_dir, sizeof(bpf_sub_dir), "%s%s",  base, name);
+	asprintf(&bpf_lnk_dir, "%s%s/", base, link);
+	asprintf(&bpf_sub_dir, "%s%s",  base, name);
 
 	ret = symlink(bpf_lnk_dir, bpf_sub_dir);
 	if (ret) {
@@ -706,25 +713,30 @@ static int bpf_gen_slave(const char *base, const char *name,
 			if (errno != EPERM) {
 				fprintf(stderr, "symlink %s failed: %s\n",
 					bpf_sub_dir, strerror(errno));
-				return ret;
+				goto out;
 			}
 
-			return bpf_slave_via_bind_mnt(bpf_sub_dir,
-						      bpf_lnk_dir);
+			ret = bpf_slave_via_bind_mnt(bpf_sub_dir, bpf_lnk_dir);
+			goto out;
 		}
 
 		ret = lstat(bpf_sub_dir, &sb);
 		if (ret) {
 			fprintf(stderr, "lstat %s failed: %s\n",
 				bpf_sub_dir, strerror(errno));
-			return ret;
+			goto out;
 		}
 
-		if ((sb.st_mode & S_IFMT) != S_IFLNK)
-			return bpf_gen_global(bpf_sub_dir);
+		if ((sb.st_mode & S_IFMT) != S_IFLNK) {
+			ret = bpf_gen_global(bpf_sub_dir);
+			goto out;
+		}
 	}
 
-	return 0;
+out:
+	free(bpf_lnk_dir);
+	free(bpf_sub_dir);
+	return ret;
 }
 
 static int bpf_gen_hierarchy(const char *base)
@@ -742,7 +754,7 @@ static int bpf_gen_hierarchy(const char *base)
 static const char *bpf_get_work_dir(enum bpf_prog_type type)
 {
 	static char bpf_tmp[PATH_MAX] = BPF_DIR_MNT;
-	static char bpf_wrk_dir[PATH_MAX];
+	static char *bpf_wrk_dir;
 	static const char *mnt;
 	static bool bpf_mnt_cached;
 	const char *mnt_env = getenv(BPF_ENV_MNT);
@@ -781,7 +793,7 @@ static const char *bpf_get_work_dir(enum bpf_prog_type type)
 		}
 	}
 
-	snprintf(bpf_wrk_dir, sizeof(bpf_wrk_dir), "%s/", mnt);
+	asprintf(&bpf_wrk_dir, "%s/", mnt);
 
 	ret = bpf_gen_hierarchy(bpf_wrk_dir);
 	if (ret) {
@@ -1438,29 +1450,31 @@ static int bpf_probe_pinned(const char *name, const struct bpf_elf_ctx *ctx,
 
 static int bpf_make_obj_path(const struct bpf_elf_ctx *ctx)
 {
-	char tmp[PATH_MAX];
+	char *tmp;
 	int ret;
 
-	snprintf(tmp, sizeof(tmp), "%s/%s", bpf_get_work_dir(ctx->type),
-		 ctx->obj_uid);
+	asprintf(&tmp, "%s/%s", bpf_get_work_dir(ctx->type), ctx->obj_uid);
 
 	ret = mkdir(tmp, S_IRWXU);
 	if (ret && errno != EEXIST) {
 		fprintf(stderr, "mkdir %s failed: %s\n", tmp, strerror(errno));
-		return ret;
+		goto out;
 	}
 
-	return 0;
+	ret = 0;
+out:
+	free(tmp);
+	return ret;
 }
 
 static int bpf_make_custom_path(const struct bpf_elf_ctx *ctx,
 				const char *todo)
 {
-	char tmp[PATH_MAX], rem[PATH_MAX], *sub;
+	char *tmp, *rem, *sub;
 	int ret;
 
-	snprintf(tmp, sizeof(tmp), "%s/../", bpf_get_work_dir(ctx->type));
-	snprintf(rem, sizeof(rem), "%s/", todo);
+	asprintf(&tmp, "%s/../", bpf_get_work_dir(ctx->type));
+	asprintf(&rem, "%s/", todo);
 	sub = strtok(rem, "/");
 
 	while (sub) {
@@ -1474,13 +1488,17 @@ static int bpf_make_custom_path(const struct bpf_elf_ctx *ctx,
 		if (ret && errno != EEXIST) {
 			fprintf(stderr, "mkdir %s failed: %s\n", tmp,
 				strerror(errno));
-			return ret;
+			goto out;
 		}
 
 		sub = strtok(NULL, "/");
 	}
 
-	return 0;
+	ret = 0;
+out:
+	free(rem);
+	free(tmp);
+	return ret;
 }
 
 static int bpf_place_pinned(int fd, const char *name,
@@ -2581,14 +2599,15 @@ struct bpf_jited_aux {
 
 static int bpf_derive_prog_from_fdinfo(int fd, struct bpf_prog_data *prog)
 {
-	char file[PATH_MAX], buff[4096];
+	char *file, buff[4096];
 	unsigned int val;
 	FILE *fp;
 
-	snprintf(file, sizeof(file), "/proc/%d/fdinfo/%d", getpid(), fd);
+	asprintf(&file, "/proc/%d/fdinfo/%d", getpid(), fd);
 	memset(prog, 0, sizeof(*prog));
 
 	fp = fopen(file, "r");
+	free(file);
 	if (!fp) {
 		fprintf(stderr, "No procfs support?!\n");
 		return -EIO;
-- 
2.21.0


^ permalink raw reply related

* [PATCH v3] net: phy: dp83867: Add SGMII mode type switching
From: Vitaly Gaiduk @ 2019-09-09 16:02 UTC (permalink / raw)
  To: davem, robh+dt, f.fainelli
  Cc: Vitaly Gaiduk, Andrew Lunn, Heiner Kallweit, netdev, linux-kernel
In-Reply-To: <1568026945-3857-1-git-send-email-vitaly.gaiduk@cloudbear.ru>

This patch adds ability to switch beetween two PHY SGMII modes.
Some hardware, for example, FPGA IP designs may use 6-wire mode
which enables differential SGMII clock to MAC.

Signed-off-by: Vitaly Gaiduk <vitaly.gaiduk@cloudbear.ru>
---
Changes in v3:
- Fixed retaining DP83867_SGMII_TYPE bit

 drivers/net/phy/dp83867.c | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/drivers/net/phy/dp83867.c b/drivers/net/phy/dp83867.c
index 1f1ecee..37fceaf 100644
--- a/drivers/net/phy/dp83867.c
+++ b/drivers/net/phy/dp83867.c
@@ -37,6 +37,7 @@
 #define DP83867_STRAP_STS2	0x006f
 #define DP83867_RGMIIDCTL	0x0086
 #define DP83867_IO_MUX_CFG	0x0170
+#define DP83867_SGMIICTL	0x00D3
 #define DP83867_10M_SGMII_CFG   0x016F
 #define DP83867_10M_SGMII_RATE_ADAPT_MASK BIT(7)

@@ -61,6 +62,9 @@
 #define DP83867_RGMII_TX_CLK_DELAY_EN		BIT(1)
 #define DP83867_RGMII_RX_CLK_DELAY_EN		BIT(0)

+/* SGMIICTL bits */
+#define DP83867_SGMII_TYPE		BIT(14)
+
 /* STRAP_STS1 bits */
 #define DP83867_STRAP_STS1_RESERVED		BIT(11)

@@ -109,6 +113,7 @@ struct dp83867_private {
 	bool rxctrl_strap_quirk;
 	bool set_clk_output;
 	u32 clk_output_sel;
+	bool sgmii_ref_clk_en;
 };

 static int dp83867_ack_interrupt(struct phy_device *phydev)
@@ -197,6 +202,9 @@ static int dp83867_of_init(struct phy_device *phydev)
 	dp83867->rxctrl_strap_quirk = of_property_read_bool(of_node,
 					"ti,dp83867-rxctrl-strap-quirk");

+	dp83867->sgmii_ref_clk_en = of_property_read_bool(of_node,
+					"ti,sgmii-ref-clock-output-enable");
+
 	/* Existing behavior was to use default pin strapping delay in rgmii
 	 * mode, but rgmii should have meant no delay.  Warn existing users.
 	 */
@@ -389,6 +397,17 @@ static int dp83867_config_init(struct phy_device *phydev)

 		if (ret)
 			return ret;
+
+		val = phy_read_mmd(phydev, DP83867_DEVADDR, DP83867_SGMIICTL);
+		/* SGMII type is set to 4-wire mode by default.
+		 * If we place appropriate property in dts (see above)
+		 * switch on 6-wire mode.
+		 */
+		if (dp83867->sgmii_ref_clk_en)
+			val |= DP83867_SGMII_TYPE;
+		else
+			val &= ~DP83867_SGMII_TYPE;
+		phy_write_mmd(phydev, DP83867_DEVADDR, DP83867_SGMIICTL, val);
 	}

 	/* Enable Interrupt output INT_OE in CFG3 register */
--
2.7.4


^ permalink raw reply related

* RE: [PATCH net-next v2 2/2] net: stmmac: Support enhanced addressing mode for DWMAC 4.10
From: Jose Abreu @ 2019-09-09 16:05 UTC (permalink / raw)
  To: Thierry Reding, David S . Miller
  Cc: Giuseppe Cavallaro, Alexandre Torgue, Jon Hunter, Bitan Biswas,
	netdev@vger.kernel.org, linux-tegra@vger.kernel.org
In-Reply-To: <20190909152546.383-2-thierry.reding@gmail.com>

From: Thierry Reding <thierry.reding@gmail.com>
Date: Sep/09/2019, 16:25:46 (UTC+00:00)

> @@ -79,6 +79,10 @@ static void dwmac4_dma_init_rx_chan(void __iomem *ioaddr,
>  	value = value | (rxpbl << DMA_BUS_MODE_RPBL_SHIFT);
>  	writel(value, ioaddr + DMA_CHAN_RX_CONTROL(chan));
>  
> +	if (dma_cfg->eame)

There is no need for this check. If EAME is not enabled then upper 32 
bits will be zero.

> +		writel(upper_32_bits(dma_rx_phy),
> +		       ioaddr + DMA_CHAN_RX_BASE_ADDR_HI(chan));
> +
>  	writel(lower_32_bits(dma_rx_phy), ioaddr + DMA_CHAN_RX_BASE_ADDR(chan));
>  }

> @@ -97,6 +101,10 @@ static void dwmac4_dma_init_tx_chan(void __iomem *ioaddr,
>  
>  	writel(value, ioaddr + DMA_CHAN_TX_CONTROL(chan));
>  
> +	if (dma_cfg->eame)

Same here.

> +		writel(upper_32_bits(dma_tx_phy),
> +		       ioaddr + DMA_CHAN_TX_BASE_ADDR_HI(chan));
> +
>  	writel(lower_32_bits(dma_tx_phy), ioaddr + DMA_CHAN_TX_BASE_ADDR(chan));
>  }

Also, please provide a cover letter in next submission.

---
Thanks,
Jose Miguel Abreu

^ permalink raw reply

* RE: [PATCH net-next v2 1/2] net: stmmac: Only enable enhanced addressing mode when needed
From: Jose Abreu @ 2019-09-09 16:07 UTC (permalink / raw)
  To: Thierry Reding, David S . Miller
  Cc: Giuseppe Cavallaro, Alexandre Torgue, Jon Hunter, Bitan Biswas,
	netdev@vger.kernel.org, linux-tegra@vger.kernel.org
In-Reply-To: <20190909152546.383-1-thierry.reding@gmail.com>

From: Thierry Reding <thierry.reding@gmail.com>
Date: Sep/09/2019, 16:25:45 (UTC+00:00)

> @@ -92,6 +92,7 @@ struct stmmac_dma_cfg {
>  	int fixed_burst;
>  	int mixed_burst;
>  	bool aal;
> +	bool eame;

bools should not be used in struct's, please change to int.

---
Thanks,
Jose Miguel Abreu

^ permalink raw reply

* RE: [PATCH net-next 1/5] enetc: Fix if_mode extraction
From: Claudiu Manoil @ 2019-09-09 16:24 UTC (permalink / raw)
  To: Andrew Lunn; +Cc: David S . Miller, Alexandru Marginean, netdev@vger.kernel.org
In-Reply-To: <20190906195743.GD2339@lunn.ch>

>-----Original Message-----
>From: Andrew Lunn <andrew@lunn.ch>
>Sent: Friday, September 6, 2019 10:58 PM
>To: Claudiu Manoil <claudiu.manoil@nxp.com>
>Cc: David S . Miller <davem@davemloft.net>; Alexandru Marginean
><alexandru.marginean@nxp.com>; netdev@vger.kernel.org
>Subject: Re: [PATCH net-next 1/5] enetc: Fix if_mode extraction
>
>On Fri, Sep 06, 2019 at 05:15:40PM +0300, Claudiu Manoil wrote:
>> Fix handling of error return code. Before this fix,
>> the error code was handled as unsigned type.
>> Also, on this path if if_mode not found then just handle
>> it as fixed link (i.e mac2mac connection).
>>
>> Signed-off-by: Claudiu Manoil <claudiu.manoil@nxp.com>
>> ---
>>  drivers/net/ethernet/freescale/enetc/enetc_pf.c | 17 ++++++-----------
>>  1 file changed, 6 insertions(+), 11 deletions(-)
>>
>> diff --git a/drivers/net/ethernet/freescale/enetc/enetc_pf.c
>b/drivers/net/ethernet/freescale/enetc/enetc_pf.c
>> index 7d6513ff8507..3a556646a2fb 100644
>> --- a/drivers/net/ethernet/freescale/enetc/enetc_pf.c
>> +++ b/drivers/net/ethernet/freescale/enetc/enetc_pf.c
>> @@ -751,6 +751,7 @@ static int enetc_of_get_phy(struct enetc_ndev_priv
>*priv)
>>  	struct enetc_pf *pf = enetc_si_priv(priv->si);
>>  	struct device_node *np = priv->dev->of_node;
>>  	struct device_node *mdio_np;
>> +	int phy_mode;
>>  	int err;
>>
>>  	if (!np) {
>> @@ -784,17 +785,11 @@ static int enetc_of_get_phy(struct enetc_ndev_priv
>*priv)
>>  		}
>>  	}
>>
>> -	priv->if_mode = of_get_phy_mode(np);
>> -	if (priv->if_mode < 0) {
>> -		dev_err(priv->dev, "missing phy type\n");
>> -		of_node_put(priv->phy_node);
>> -		if (of_phy_is_fixed_link(np))
>> -			of_phy_deregister_fixed_link(np);
>> -		else
>> -			enetc_mdio_remove(pf);
>> -
>> -		return -EINVAL;
>> -	}
>
>Hi Claudiu
>
>It is not clear to me why it is no longer necessary to deregister the
>fixed link, or remove the mdio bus?
>
>> +	phy_mode = of_get_phy_mode(np);
>> +	if (phy_mode < 0)
>> +		priv->if_mode = PHY_INTERFACE_MODE_NA; /* fixed link */
>> +	else
>> +		priv->if_mode = phy_mode;
>

Hi Andrew,

The MAC2MAC connections are defined as fixed-link too, but without
phy-mode/phy-connection-type properties.  We don't want to de-register
these links.  Initial code was bogus in this regard.  So this MODE_NA interface
mode is reserved for the current representation of enetc ethernet ports
that are MAC2MAC connected with switch ports.  Also true that the current
upstream driver does not have the MAC2MAC connections enabled.

Your question however leads to the following discussion 😊.  The LS1028 SoC
has internal MAC2MAC connections between enetc eth ports and switch ports.
However the switch driver for LS1028a is not available upstream yet, it needs
to be re-worked as a DSA driver as you know (if you remember the discussions
we had on switchdev vs DSA for the Felix driver).  Now that we're at it, how
would you like to represent these MAC2MAC connections?

Current proposal is:
			ethernet@0,2 { /* SoC internal, connected to switch port 4 */
				compatible = "fsl,enetc";
				reg = <0x000200 0 0 0 0>;
				fixed-link {
					speed = <1000>;
					full-duplex;
				};
			};
			switch@0,5 {
				compatible = "mscc,felix-switch";
				[...]
				ports {
					#address-cells = <1>;
					#size-cells = <0>;

					/* external ports */
					[...]
					/* internal SoC ports */
					port@4 { /* connected to ENETC port2 */
						reg = <4>;
						fixed-link {
							speed = <1000>;
							full-duplex;
						};
					};
					port@5 { /* CPU port, connected to ENETC port3 */
						reg = <5>;
						ethernet = <&enetc_port3>;
						fixed-link {
							speed = <1000>;
							full-duplex;
						};
					};
				};
			};
			enetc_port3: ethernet@0,6 { /* SoC internal connected to switch port 5 */
				compatible = "fsl,enetc";
				reg = <0x000600 0 0 0 0>;
				fixed-link {
					speed = <1000>;
					full-duplex;
				};
			};
		};

Thanks.

Claudiu

^ permalink raw reply

* Re: [PATCH v3] net: phy: dp83867: Add SGMII mode type switching
From: Florian Fainelli @ 2019-09-09 16:40 UTC (permalink / raw)
  To: Vitaly Gaiduk, davem, robh+dt
  Cc: Andrew Lunn, Heiner Kallweit, netdev, linux-kernel
In-Reply-To: <1568044937-12526-1-git-send-email-vitaly.gaiduk@cloudbear.ru>

On 9/9/19 9:02 AM, Vitaly Gaiduk wrote:
> This patch adds ability to switch beetween two PHY SGMII modes.
> Some hardware, for example, FPGA IP designs may use 6-wire mode
> which enables differential SGMII clock to MAC.
> 
> Signed-off-by: Vitaly Gaiduk <vitaly.gaiduk@cloudbear.ru>

Thanks for addressing my comments, please re-post the entire patch
series per:

https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/Documentation/networking/netdev-FAQ.rst#n134

Thanks!
-- 
Florian

^ permalink raw reply

* [PATCH v3 1/2] net: phy: dp83867: Add documentation for SGMII mode type
From: Vitaly Gaiduk @ 2019-09-09 16:52 UTC (permalink / raw)
  To: davem, robh+dt, f.fainelli
  Cc: Vitaly Gaiduk, Mark Rutland, Trent Piepho, Andrew Lunn, netdev,
	devicetree, linux-kernel
In-Reply-To: <1568047940-14490-1-git-send-email-vitaly.gaiduk@cloudbear.ru>

Add documentation of ti,sgmii-ref-clock-output-enable
which can be used to select SGMII mode type (4 or 6-wire).

Signed-off-by: Vitaly Gaiduk <vitaly.gaiduk@cloudbear.ru>
---
 Documentation/devicetree/bindings/net/ti,dp83867.txt | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/Documentation/devicetree/bindings/net/ti,dp83867.txt b/Documentation/devicetree/bindings/net/ti,dp83867.txt
index db6aa3f..c98c682 100644
--- a/Documentation/devicetree/bindings/net/ti,dp83867.txt
+++ b/Documentation/devicetree/bindings/net/ti,dp83867.txt
@@ -37,6 +37,10 @@ Optional property:
 			      for applicable values.  The CLK_OUT pin can also
 			      be disabled by this property.  When omitted, the
 			      PHY's default will be left as is.
+	- ti,sgmii-ref-clock-output-enable - This denotes the fact which
+				    SGMII configuration is used (4 or 6-wire modes).
+				    Some MACs work with differential SGMII clock.
+				    See data manual for details.
 
 Note: ti,min-output-impedance and ti,max-output-impedance are mutually
       exclusive. When both properties are present ti,max-output-impedance
-- 
2.7.4


^ permalink raw reply related

* [PATCH net-next] net: stmmac: pci: Add HAPS support using GMAC5
From: Jose Abreu @ 2019-09-09 16:54 UTC (permalink / raw)
  To: netdev
  Cc: Joao Pinto, Jose Abreu, Giuseppe Cavallaro, Alexandre Torgue,
	David S. Miller, Maxime Coquelin, linux-stm32, linux-arm-kernel,
	linux-kernel

Add the support for Synopsys HAPS board that uses GMAC5.

Signed-off-by: Jose Abreu <joabreu@synopsys.com>

---
Cc: Giuseppe Cavallaro <peppe.cavallaro@st.com>
Cc: Alexandre Torgue <alexandre.torgue@st.com>
Cc: Jose Abreu <joabreu@synopsys.com>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Maxime Coquelin <mcoquelin.stm32@gmail.com>
Cc: netdev@vger.kernel.org
Cc: linux-stm32@st-md-mailman.stormreply.com
Cc: linux-arm-kernel@lists.infradead.org
Cc: linux-kernel@vger.kernel.org
---
 drivers/net/ethernet/stmicro/stmmac/stmmac_pci.c | 71 ++++++++++++++++++++++++
 1 file changed, 71 insertions(+)

diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_pci.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_pci.c
index 20906287b6d4..292045f4581f 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_pci.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_pci.c
@@ -375,6 +375,75 @@ static const struct stmmac_pci_info quark_pci_info = {
 	.setup = quark_default_data,
 };
 
+static int snps_gmac5_default_data(struct pci_dev *pdev,
+				   struct plat_stmmacenet_data *plat)
+{
+	int i;
+
+	plat->clk_csr = 5;
+	plat->has_gmac4 = 1;
+	plat->force_sf_dma_mode = 1;
+	plat->tso_en = 1;
+	plat->pmt = 1;
+
+	plat->mdio_bus_data->phy_mask = 0;
+
+	/* Set default value for multicast hash bins */
+	plat->multicast_filter_bins = HASH_TABLE_SIZE;
+
+	/* Set default value for unicast filter entries */
+	plat->unicast_filter_entries = 1;
+
+	/* Set the maxmtu to a default of JUMBO_LEN */
+	plat->maxmtu = JUMBO_LEN;
+
+	/* Set default number of RX and TX queues to use */
+	plat->tx_queues_to_use = 4;
+	plat->rx_queues_to_use = 4;
+
+	plat->tx_sched_algorithm = MTL_TX_ALGORITHM_WRR;
+	for (i = 0; i < plat->tx_queues_to_use; i++) {
+		plat->tx_queues_cfg[i].use_prio = false;
+		plat->tx_queues_cfg[i].mode_to_use = MTL_QUEUE_DCB;
+		plat->tx_queues_cfg[i].weight = 25;
+	}
+
+	plat->rx_sched_algorithm = MTL_RX_ALGORITHM_SP;
+	for (i = 0; i < plat->rx_queues_to_use; i++) {
+		plat->rx_queues_cfg[i].use_prio = false;
+		plat->rx_queues_cfg[i].mode_to_use = MTL_QUEUE_DCB;
+		plat->rx_queues_cfg[i].pkt_route = 0x0;
+		plat->rx_queues_cfg[i].chan = i;
+	}
+
+	plat->bus_id = 1;
+	plat->phy_addr = -1;
+	plat->interface = PHY_INTERFACE_MODE_GMII;
+
+	plat->dma_cfg->pbl = 32;
+	plat->dma_cfg->pblx8 = true;
+
+	/* Axi Configuration */
+	plat->axi = devm_kzalloc(&pdev->dev, sizeof(*plat->axi), GFP_KERNEL);
+	if (!plat->axi)
+		return -ENOMEM;
+
+	plat->axi->axi_wr_osr_lmt = 31;
+	plat->axi->axi_rd_osr_lmt = 31;
+
+	plat->axi->axi_fb = false;
+	plat->axi->axi_blen[0] = 4;
+	plat->axi->axi_blen[1] = 8;
+	plat->axi->axi_blen[2] = 16;
+	plat->axi->axi_blen[3] = 32;
+
+	return 0;
+}
+
+static const struct stmmac_pci_info snps_gmac5_pci_info = {
+	.setup = snps_gmac5_default_data,
+};
+
 /**
  * stmmac_pci_probe
  *
@@ -518,6 +587,7 @@ static SIMPLE_DEV_PM_OPS(stmmac_pm_ops, stmmac_pci_suspend, stmmac_pci_resume);
 #define STMMAC_EHL_RGMII1G_ID	0x4b30
 #define STMMAC_EHL_SGMII1G_ID	0x4b31
 #define STMMAC_TGL_SGMII1G_ID	0xa0ac
+#define STMMAC_GMAC5_ID		0x7102
 
 #define STMMAC_DEVICE(vendor_id, dev_id, info)	{	\
 	PCI_VDEVICE(vendor_id, dev_id),			\
@@ -531,6 +601,7 @@ static const struct pci_device_id stmmac_id_table[] = {
 	STMMAC_DEVICE(INTEL, STMMAC_EHL_RGMII1G_ID, ehl_rgmii1g_pci_info),
 	STMMAC_DEVICE(INTEL, STMMAC_EHL_SGMII1G_ID, ehl_sgmii1g_pci_info),
 	STMMAC_DEVICE(INTEL, STMMAC_TGL_SGMII1G_ID, tgl_sgmii1g_pci_info),
+	STMMAC_DEVICE(SYNOPSYS, STMMAC_GMAC5_ID, snps_gmac5_pci_info),
 	{}
 };
 
-- 
2.7.4


^ permalink raw reply related

* Re: [PATCH v3 2/2] net: phy: dp83867: Add SGMII mode type switching
From: Florian Fainelli @ 2019-09-09 16:56 UTC (permalink / raw)
  To: Vitaly Gaiduk, davem, robh+dt
  Cc: Mark Rutland, Andrew Lunn, Heiner Kallweit, Trent Piepho, netdev,
	devicetree, linux-kernel
In-Reply-To: <1568047940-14490-1-git-send-email-vitaly.gaiduk@cloudbear.ru>

On 9/9/19 9:52 AM, Vitaly Gaiduk wrote:
> This patch adds ability to switch beetween two PHY SGMII modes.
> Some hardware, for example, FPGA IP designs may use 6-wire mode
> which enables differential SGMII clock to MAC.
> 
> Signed-off-by: Vitaly Gaiduk <vitaly.gaiduk@cloudbear.ru>

Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
-- 
Florian

^ permalink raw reply

* Re: [PATCH v3 1/2] net: phy: dp83867: Add documentation for SGMII mode type
From: Florian Fainelli @ 2019-09-09 16:56 UTC (permalink / raw)
  To: Vitaly Gaiduk, davem, robh+dt
  Cc: Mark Rutland, Trent Piepho, Andrew Lunn, netdev, devicetree,
	linux-kernel
In-Reply-To: <1568047940-14490-2-git-send-email-vitaly.gaiduk@cloudbear.ru>

On 9/9/19 9:52 AM, Vitaly Gaiduk wrote:
> Add documentation of ti,sgmii-ref-clock-output-enable
> which can be used to select SGMII mode type (4 or 6-wire).
> 
> Signed-off-by: Vitaly Gaiduk <vitaly.gaiduk@cloudbear.ru>
> ---
>  Documentation/devicetree/bindings/net/ti,dp83867.txt | 4 ++++
>  1 file changed, 4 insertions(+)
> 
> diff --git a/Documentation/devicetree/bindings/net/ti,dp83867.txt b/Documentation/devicetree/bindings/net/ti,dp83867.txt
> index db6aa3f..c98c682 100644
> --- a/Documentation/devicetree/bindings/net/ti,dp83867.txt
> +++ b/Documentation/devicetree/bindings/net/ti,dp83867.txt
> @@ -37,6 +37,10 @@ Optional property:
>  			      for applicable values.  The CLK_OUT pin can also
>  			      be disabled by this property.  When omitted, the
>  			      PHY's default will be left as is.
> +	- ti,sgmii-ref-clock-output-enable - This denotes the fact which
> +				    SGMII configuration is used (4 or 6-wire modes).
> +				    Some MACs work with differential SGMII clock.
> +				    See data manual for details.

The wording is a bit odd here, I would just omit "the fact" to make the
sentence more readable. With that:

Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
-- 
Florian

^ permalink raw reply

* [PATCH v3 2/2] net: phy: dp83867: Add SGMII mode type switching
From: Vitaly Gaiduk @ 2019-09-09 16:52 UTC (permalink / raw)
  To: davem, robh+dt, f.fainelli
  Cc: Vitaly Gaiduk, Mark Rutland, Andrew Lunn, Heiner Kallweit,
	Trent Piepho, netdev, devicetree, linux-kernel
In-Reply-To: <1568026945-3857-1-git-send-email-vitaly.gaiduk@cloudbear.ru>

This patch adds ability to switch beetween two PHY SGMII modes.
Some hardware, for example, FPGA IP designs may use 6-wire mode
which enables differential SGMII clock to MAC.

Signed-off-by: Vitaly Gaiduk <vitaly.gaiduk@cloudbear.ru>
---
Changes in v3:
- Fixed retaining DP83867_SGMII_TYPE bit

 drivers/net/phy/dp83867.c | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/drivers/net/phy/dp83867.c b/drivers/net/phy/dp83867.c
index 1f1ecee..37fceaf 100644
--- a/drivers/net/phy/dp83867.c
+++ b/drivers/net/phy/dp83867.c
@@ -37,6 +37,7 @@
 #define DP83867_STRAP_STS2	0x006f
 #define DP83867_RGMIIDCTL	0x0086
 #define DP83867_IO_MUX_CFG	0x0170
+#define DP83867_SGMIICTL	0x00D3
 #define DP83867_10M_SGMII_CFG   0x016F
 #define DP83867_10M_SGMII_RATE_ADAPT_MASK BIT(7)

@@ -61,6 +62,9 @@
 #define DP83867_RGMII_TX_CLK_DELAY_EN		BIT(1)
 #define DP83867_RGMII_RX_CLK_DELAY_EN		BIT(0)

+/* SGMIICTL bits */
+#define DP83867_SGMII_TYPE		BIT(14)
+
 /* STRAP_STS1 bits */
 #define DP83867_STRAP_STS1_RESERVED		BIT(11)

@@ -109,6 +113,7 @@ struct dp83867_private {
 	bool rxctrl_strap_quirk;
 	bool set_clk_output;
 	u32 clk_output_sel;
+	bool sgmii_ref_clk_en;
 };

 static int dp83867_ack_interrupt(struct phy_device *phydev)
@@ -197,6 +202,9 @@ static int dp83867_of_init(struct phy_device *phydev)
 	dp83867->rxctrl_strap_quirk = of_property_read_bool(of_node,
 					"ti,dp83867-rxctrl-strap-quirk");

+	dp83867->sgmii_ref_clk_en = of_property_read_bool(of_node,
+					"ti,sgmii-ref-clock-output-enable");
+
 	/* Existing behavior was to use default pin strapping delay in rgmii
 	 * mode, but rgmii should have meant no delay.  Warn existing users.
 	 */
@@ -389,6 +397,17 @@ static int dp83867_config_init(struct phy_device *phydev)

 		if (ret)
 			return ret;
+
+		val = phy_read_mmd(phydev, DP83867_DEVADDR, DP83867_SGMIICTL);
+		/* SGMII type is set to 4-wire mode by default.
+		 * If we place appropriate property in dts (see above)
+		 * switch on 6-wire mode.
+		 */
+		if (dp83867->sgmii_ref_clk_en)
+			val |= DP83867_SGMII_TYPE;
+		else
+			val &= ~DP83867_SGMII_TYPE;
+		phy_write_mmd(phydev, DP83867_DEVADDR, DP83867_SGMIICTL, val);
 	}

 	/* Enable Interrupt output INT_OE in CFG3 register */
--
2.7.4


^ permalink raw reply related

* [PATCH v4 1/2] net: phy: dp83867: Add documentation for SGMII mode type
From: Vitaly Gaiduk @ 2019-09-09 17:19 UTC (permalink / raw)
  To: davem, robh+dt, f.fainelli
  Cc: Vitaly Gaiduk, Mark Rutland, Andrew Lunn, Trent Piepho, netdev,
	devicetree, linux-kernel
In-Reply-To: <1568049566-16708-1-git-send-email-vitaly.gaiduk@cloudbear.ru>

Add documentation of ti,sgmii-ref-clock-output-enable
which can be used to select SGMII mode type (4 or 6-wire).

Signed-off-by: Vitaly Gaiduk <vitaly.gaiduk@cloudbear.ru>
---
Changes in v4:
- Fixed the wording of property

 Documentation/devicetree/bindings/net/ti,dp83867.txt | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/Documentation/devicetree/bindings/net/ti,dp83867.txt b/Documentation/devicetree/bindings/net/ti,dp83867.txt
index db6aa3f..388ff48 100644
--- a/Documentation/devicetree/bindings/net/ti,dp83867.txt
+++ b/Documentation/devicetree/bindings/net/ti,dp83867.txt
@@ -37,6 +37,10 @@ Optional property:
 			      for applicable values.  The CLK_OUT pin can also
 			      be disabled by this property.  When omitted, the
 			      PHY's default will be left as is.
+	- ti,sgmii-ref-clock-output-enable - This denotes which
+				    SGMII configuration is used (4 or 6-wire modes).
+				    Some MACs work with differential SGMII clock.
+				    See data manual for details.

 Note: ti,min-output-impedance and ti,max-output-impedance are mutually
       exclusive. When both properties are present ti,max-output-impedance
--
2.7.4


^ permalink raw reply related

* [PATCH v4 2/2] net: phy: dp83867: Add SGMII mode type switching
From: Vitaly Gaiduk @ 2019-09-09 17:19 UTC (permalink / raw)
  To: davem, robh+dt, f.fainelli
  Cc: Vitaly Gaiduk, Mark Rutland, Andrew Lunn, Heiner Kallweit,
	Trent Piepho, netdev, devicetree, linux-kernel
In-Reply-To: <1568047940-14490-2-git-send-email-vitaly.gaiduk@cloudbear.ru>

This patch adds ability to switch beetween two PHY SGMII modes.
Some hardware, for example, FPGA IP designs may use 6-wire mode
which enables differential SGMII clock to MAC.

Signed-off-by: Vitaly Gaiduk <vitaly.gaiduk@cloudbear.ru>
---
 drivers/net/phy/dp83867.c | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/drivers/net/phy/dp83867.c b/drivers/net/phy/dp83867.c
index 1f1ecee..37fceaf 100644
--- a/drivers/net/phy/dp83867.c
+++ b/drivers/net/phy/dp83867.c
@@ -37,6 +37,7 @@
 #define DP83867_STRAP_STS2	0x006f
 #define DP83867_RGMIIDCTL	0x0086
 #define DP83867_IO_MUX_CFG	0x0170
+#define DP83867_SGMIICTL	0x00D3
 #define DP83867_10M_SGMII_CFG   0x016F
 #define DP83867_10M_SGMII_RATE_ADAPT_MASK BIT(7)
 
@@ -61,6 +62,9 @@
 #define DP83867_RGMII_TX_CLK_DELAY_EN		BIT(1)
 #define DP83867_RGMII_RX_CLK_DELAY_EN		BIT(0)
 
+/* SGMIICTL bits */
+#define DP83867_SGMII_TYPE		BIT(14)
+
 /* STRAP_STS1 bits */
 #define DP83867_STRAP_STS1_RESERVED		BIT(11)
 
@@ -109,6 +113,7 @@ struct dp83867_private {
 	bool rxctrl_strap_quirk;
 	bool set_clk_output;
 	u32 clk_output_sel;
+	bool sgmii_ref_clk_en;
 };
 
 static int dp83867_ack_interrupt(struct phy_device *phydev)
@@ -197,6 +202,9 @@ static int dp83867_of_init(struct phy_device *phydev)
 	dp83867->rxctrl_strap_quirk = of_property_read_bool(of_node,
 					"ti,dp83867-rxctrl-strap-quirk");
 
+	dp83867->sgmii_ref_clk_en = of_property_read_bool(of_node,
+					"ti,sgmii-ref-clock-output-enable");
+
 	/* Existing behavior was to use default pin strapping delay in rgmii
 	 * mode, but rgmii should have meant no delay.  Warn existing users.
 	 */
@@ -389,6 +397,17 @@ static int dp83867_config_init(struct phy_device *phydev)
 
 		if (ret)
 			return ret;
+
+		val = phy_read_mmd(phydev, DP83867_DEVADDR, DP83867_SGMIICTL);
+		/* SGMII type is set to 4-wire mode by default.
+		 * If we place appropriate property in dts (see above)
+		 * switch on 6-wire mode.
+		 */
+		if (dp83867->sgmii_ref_clk_en)
+			val |= DP83867_SGMII_TYPE;
+		else
+			val &= ~DP83867_SGMII_TYPE;
+		phy_write_mmd(phydev, DP83867_DEVADDR, DP83867_SGMIICTL, val);
 	}
 
 	/* Enable Interrupt output INT_OE in CFG3 register */
-- 
2.7.4


^ permalink raw reply related

* Re: [PATCH v4 2/2] net: phy: dp83867: Add SGMII mode type switching
From: Trent Piepho @ 2019-09-09 17:40 UTC (permalink / raw)
  To: vitaly.gaiduk@cloudbear.ru, davem@davemloft.net,
	f.fainelli@gmail.com, robh+dt@kernel.org
  Cc: mark.rutland@arm.com, hkallweit1@gmail.com,
	netdev@vger.kernel.org, andrew@lunn.ch,
	devicetree@vger.kernel.org, linux-kernel@vger.kernel.org
In-Reply-To: <1568049566-16708-1-git-send-email-vitaly.gaiduk@cloudbear.ru>

On Mon, 2019-09-09 at 20:19 +0300, Vitaly Gaiduk wrote:
> This patch adds ability to switch beetween two PHY SGMII modes.
> Some hardware, for example, FPGA IP designs may use 6-wire mode
> which enables differential SGMII clock to MAC.
> 
> +
> +		val = phy_read_mmd(phydev, DP83867_DEVADDR, DP83867_SGMIICTL);
> +		/* SGMII type is set to 4-wire mode by default.
> +		 * If we place appropriate property in dts (see above)
> +		 * switch on 6-wire mode.
> +		 */
> +		if (dp83867->sgmii_ref_clk_en)
> +			val |= DP83867_SGMII_TYPE;
> +		else
> +			val &= ~DP83867_SGMII_TYPE;
> +		phy_write_mmd(phydev, DP83867_DEVADDR, DP83867_SGMIICTL, val);

Should use phy_modify_mmd().

^ permalink raw reply

* Re: [PATCH bpf-next v10 2/4] bpf: new helper to obtain namespace data from current task New bpf helper bpf_get_current_pidns_info.
From: Carlos Antonio Neira Bustos @ 2019-09-09 17:45 UTC (permalink / raw)
  To: Yonghong Song
  Cc: Al Viro, netdev@vger.kernel.org, ebiederm@xmission.com,
	brouer@redhat.com, bpf@vger.kernel.org
In-Reply-To: <7d196a64-cf36-c2d5-7328-154aaeb929eb@fb.com>

Thanks a lot, Al Viro and Yonghong for taking the time to review this patch and
provide technical insights needed on this one.
But how do we move this forward? 
Al Viro's review is clear that this will not work and we should strip the name 
resolution code (thanks for your detailed analysis).
As there is currently only one instance of the nsfs device on the system,  
I think we could leave out the retrieval of the pidns device number and address it
when the situation changes.
What do you think?


On Sat, Sep 07, 2019 at 06:34:39AM +0000, Yonghong Song wrote:
> 
> 
> On 9/6/19 5:10 PM, Al Viro wrote:
> > On Fri, Sep 06, 2019 at 11:21:14PM +0000, Yonghong Song wrote:
> > 
> >> -bash-4.4$ readlink /proc/self/ns/pid
> >> pid:[4026531836]
> >> -bash-4.4$ stat /proc/self/ns/pid
> >>     File: ‘/proc/self/ns/pid’ -> ‘pid:[4026531836]’
> >>     Size: 0               Blocks: 0          IO Block: 1024   symbolic link
> >> Device: 4h/4d   Inode: 344795989   Links: 1
> >> Access: (0777/lrwxrwxrwx)  Uid: (128203/     yhs)   Gid: (  100/   users)
> >> Context: user_u:base_r:base_t
> >> Access: 2019-09-06 16:06:09.431616380 -0700
> >> Modify: 2019-09-06 16:06:09.431616380 -0700
> >> Change: 2019-09-06 16:06:09.431616380 -0700
> >>    Birth: -
> >> -bash-4.4$
> >>
> >> Based on a discussion with Eric Biederman back in 2019 Linux
> >> Plumbers, Eric suggested that to uniquely identify a
> >> namespace, device id (major/minor) number should also
> >> be included. Although today's kernel implementation
> >> has the same device for all namespace pseudo files,
> >> but from uapi perspective, device id should be included.
> >>
> >> That is the reason why we try to get device id which holds
> >> pid namespace pseudo file.
> >>
> >> Do you have a better suggestion on how to get
> >> the device id for 'current' pid namespace? Or from design, we
> >> really should not care about device id at all?
> > 
> > What the hell is "device id for pid namespace"?  This is the
> > first time I've heard about that mystery object, so it's
> > hard to tell where it could be found.
> > 
> > I can tell you what device numbers are involved in the areas
> > you seem to be looking in.
> > 
> > 1) there's whatever device number that gets assigned to
> > (this) procfs instance.  That, ironically, _is_ per-pidns, but
> > that of the procfs instance, not that of your process (and
> > those can be different).  That's what you get in ->st_dev
> > when doing lstat() of anything in /proc (assuming that
> > procfs is mounted there, in the first place).  NOTE:
> > that's lstat(2), not stat(2).  stat(1) uses lstat(2),
> > unless given -L (in which case it's stat(2) time).  The
> > difference:
> > 
> > root@kvm1:~# stat /proc/self/ns/pid
> >    File: /proc/self/ns/pid -> pid:[4026531836]
> >    Size: 0               Blocks: 0          IO Block: 1024   symbolic link
> > Device: 4h/4d   Inode: 17396       Links: 1
> > Access: (0777/lrwxrwxrwx)  Uid: (    0/    root)   Gid: (    0/    root)
> > Access: 2019-09-06 19:43:11.871312319 -0400
> > Modify: 2019-09-06 19:43:11.871312319 -0400
> > Change: 2019-09-06 19:43:11.871312319 -0400
> >   Birth: -
> > root@kvm1:~# stat -L /proc/self/ns/pid
> >    File: /proc/self/ns/pid
> >    Size: 0               Blocks: 0          IO Block: 4096   regular empty file
> > Device: 3h/3d   Inode: 4026531836  Links: 1
> > Access: (0444/-r--r--r--)  Uid: (    0/    root)   Gid: (    0/    root)
> > Access: 2019-09-06 19:43:15.955313293 -0400
> > Modify: 2019-09-06 19:43:15.955313293 -0400
> > Change: 2019-09-06 19:43:15.955313293 -0400
> >   Birth: -
> > 
> > The former is lstat, the latter - stat.
> > 
> > 2) device number of the filesystem where the symlink target lives.
> > In this case, it's nsfs and there's only one instance on the entire
> > system.  _That_ would be obtained by looking at st_dev in stat(2) on
> > /proc/self/ns/pid (0:3 above).
> > 
> > 3) device number *OF* the symlink.  That would be st_rdev in lstat(2).
> > There's none - it's a symlink, not a character or block device.  It's
> > always zero and always will be zero.
> > 
> > 4) the same for the target; st_rdev in stat(2) results and again,
> > there's no such beast - it's neither character nor block device.
> > 
> > Your code is looking at (3).  Please, reread any textbook on Unix
> > in the section that would cover stat(2) and discussion of the
> > difference between st_dev and st_rdev.
> > 
> > I have no idea what Eric had been talking about - it's hard to
> > reconstruct by what you said so far.  Making nsfs per-userns,
> > perhaps?  But that makes no sense whatsoever, not that userns
> > ever had...  Cheap shots aside, I really can't guess what that's
> > about.  Sorry.
> 
> Thanks for the detailed information. The device number we want
> is nsfs. Indeed, currently, there is only one instance
> on the entire system. But not exactly sure what is the possibility
> to have more than one nsfs device in the future. Maybe per-userns
> or any other criteria?
> 
> > 
> > In any case, pathname resolution is *NOT* for the situations where
> > you can't block.  Even if it's procfs (and from the same pidns as
> > the process) mounted there, there is no promise that the target
> > of /proc/self has already been looked up and not evicted from
> > memory since then.  And in case of cache miss pathwalk will
> > have to call ->lookup(), which requires locking the directory
> > (rw_sem, shared).  You can't do that in such context.
> > 
> > And that doesn't even go into the possibility that process has
> > something very different mounted on /proc.
> > 
> > Again, I don't know what it is that you want to get to, but
> > I would strongly recommend finding a way to get to that data
> > that would not involve going anywhere near pathname resolution.
> > 
> > How would you expect the userland to work with that value,
> > whatever it might be?  If it's just a 32bit field that will
> > never be read, you might as well store there the same value
> > you store now (0, that is) in much cheaper and safer way ;-)
> 
> Suppose inside pid namespace, user can pass the device number,
> say n1, (`stat -L /proc/self/ns/pid`) to bpf program (through map
> or JIT). At runtime, bpf program will try to get device number,
> say n2, for the 'current' process. If n1 is not the same as
> n2, that means they are not in the same namespace. 'current'
> is in the same pid namespace as the user iff
> n1 == n2 and also pidns id is the same for 'current' and
> the one with `lsns -t pid`.
> 
> Are you aware of any way to get the pidns device number
> for 'current' without going through the pathname
> lookup?
> 

^ permalink raw reply

* [PATCH] libbpf: Don't error out if getsockopt() fails for XDP_OPTIONS
From: Toke Høiland-Jørgensen @ 2019-09-09 17:46 UTC (permalink / raw)
  To: Alexei Starovoitov, Daniel Borkmann, netdev, bpf
  Cc: Toke Høiland-Jørgensen

The xsk_socket__create() function fails and returns an error if it cannot
get the XDP_OPTIONS through getsockopt(). However, support for XDP_OPTIONS
was not added until kernel 5.3, so this means that creating XSK sockets
always fails on older kernels.

Since the option is just used to set the zero-copy flag in the xsk struct,
there really is no need to error out if the getsockopt() call fails.

Signed-off-by: Toke Høiland-Jørgensen <toke@redhat.com>
---
 tools/lib/bpf/xsk.c | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/tools/lib/bpf/xsk.c b/tools/lib/bpf/xsk.c
index 680e63066cf3..598e487d9ce8 100644
--- a/tools/lib/bpf/xsk.c
+++ b/tools/lib/bpf/xsk.c
@@ -603,12 +603,8 @@ int xsk_socket__create(struct xsk_socket **xsk_ptr, const char *ifname,
 
 	optlen = sizeof(opts);
 	err = getsockopt(xsk->fd, SOL_XDP, XDP_OPTIONS, &opts, &optlen);
-	if (err) {
-		err = -errno;
-		goto out_mmap_tx;
-	}
-
-	xsk->zc = opts.flags & XDP_OPTIONS_ZEROCOPY;
+	if (!err)
+		xsk->zc = opts.flags & XDP_OPTIONS_ZEROCOPY;
 
 	if (!(xsk->config.libbpf_flags & XSK_LIBBPF_FLAGS__INHIBIT_PROG_LOAD)) {
 		err = xsk_setup_xdp_prog(xsk);
-- 
2.23.0


^ permalink raw reply related

* [PATCH net 0/1] net/tls(TLS_SW): double free in tls_tx_records
From: Pooja Trivedi @ 2019-09-09 17:46 UTC (permalink / raw)
  To: netdev
In-Reply-To: <CAOrEdsmxstWoBz2AotrTx_OBFN_jycqnSqtsvLxuCLGtKKi_dA@mail.gmail.com>

[-- Attachment #1: Type: text/plain, Size: 6590 bytes --]

TLS module crash while running SSL record encryption using
klts_send_[file] using crypto accelerator (Nitrox).

Following are the preconditions and steps to reproduce the issue:

Preconditions:
1) Installed 5.3-rc4
2) Nitrox5 card plugin (crypto accelerator)

Steps to reproduce the issue:
1) Installed n5pf.ko (drivers/crypto/cavium/nitrox)
2) Installed tls.ko if not is installed by default (net/tls)
3) Obtained uperf tool from GitHub
   3.1) Modified uperf to use tls module by using setsocket.
   3.2) Modified uperf tool to support sendfile with SSL.

Test:
1) Ran uperf with 4 threads
2) Each thread sends data using sendfile over SSL protocol.

After a few seconds into the test, kernel crashes because of record
list corruption

[  270.888952] ------------[ cut here ]------------
[  270.890450] list_del corruption, ffff91cc3753a800->prev is
LIST_POISON2 (dead000000000122)
[  270.891194] WARNING: CPU: 1 PID: 7387 at lib/list_debug.c:50
__list_del_entry_valid+0x62/0x90
[  270.892037] Modules linked in: n5pf(OE) netconsole tls(OE) bonding
intel_rapl_msr intel_rapl_common sb_edac x86_pkg_temp_thermal
intel_powerclamp coretemp kvm_intel kvm iTCO_wdt iTCO_vendor_support
irqbypass crct10dif_pclmul crc32_pclmul ghash_clmulni_intel
aesni_intel crypto_simd mei_me cryptd glue_helper ipmi_si sg mei
lpc_ich pcspkr joydev ioatdma i2c_i801 ipmi_devintf ipmi_msghandler
wmi ip_tables xfs libcrc32c sd_mod mgag200 drm_vram_helper ttm
drm_kms_helper syscopyarea sysfillrect sysimgblt fb_sys_fops drm isci
libsas ahci scsi_transport_sas libahci crc32c_intel serio_raw igb
libata ptp pps_core dca i2c_algo_bit dm_mirror dm_region_hash dm_log
dm_mod [last unloaded: nitrox_drv]
[  270.896836] CPU: 1 PID: 7387 Comm: uperf Kdump: loaded Tainted: G
        OE     5.3.0-rc4 #1
[  270.897711] Hardware name: Supermicro SYS-1027R-N3RF/X9DRW, BIOS
3.0c 03/24/2014
[  270.898597] RIP: 0010:__list_del_entry_valid+0x62/0x90
[  270.899478] Code: 00 00 00 c3 48 89 fe 48 89 c2 48 c7 c7 e0 f9 ee
8d e8 b2 cf c8 ff 0f 0b 31 c0 c3 48 89 fe 48 c7 c7 18 fa ee 8d e8 9e
cf c8 ff <0f> 0b 31 c0 c3 48 89 f2 48 89 fe 48 c7 c7 50 fa ee 8d e8 87
cf c8
[  270.901321] RSP: 0018:ffffb6ea86eb7c20 EFLAGS: 00010282
[  270.902240] RAX: 0000000000000000 RBX: ffff91cc3753c000 RCX: 0000000000000000
[  270.903157] RDX: ffff91bc3f867080 RSI: ffff91bc3f857738 RDI: ffff91bc3f857738
[  270.904074] RBP: ffff91bc36020940 R08: 0000000000000560 R09: 0000000000000000
[  270.904988] R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000000000
[  270.905902] R13: ffff91cc3753a800 R14: ffff91cc37cc6400 R15: ffff91cc3753a800
[  270.906809] FS:  00007f454a88d700(0000) GS:ffff91bc3f840000(0000)
knlGS:0000000000000000
[  270.907715] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[  270.908606] CR2: 00007f453c00292c CR3: 000000103554e003 CR4: 00000000001606e0
[  270.909490] Call Trace:
[  270.910373]  tls_tx_records+0x138/0x1c0 [tls]
[  270.911262]  tls_sw_sendpage+0x3e0/0x420 [tls]
[  270.912154]  inet_sendpage+0x52/0x90
[  270.913045]  ? direct_splice_actor+0x40/0x40
[  270.913941]  kernel_sendpage+0x1a/0x30
[  270.914831]  sock_sendpage+0x20/0x30
[  270.915714]  pipe_to_sendpage+0x62/0x90
[  270.916592]  __splice_from_pipe+0x80/0x180
[  270.917461]  ? direct_splice_actor+0x40/0x40
[  270.918334]  splice_from_pipe+0x5d/0x90
[  270.919208]  direct_splice_actor+0x35/0x40
[  270.920086]  splice_direct_to_actor+0x103/0x230
[  270.920966]  ? generic_pipe_buf_nosteal+0x10/0x10
[  270.921850]  do_splice_direct+0x9a/0xd0
[  270.922733]  do_sendfile+0x1c9/0x3d0
[  270.923612]  __x64_sys_sendfile64+0x5c/0xc0

Observations:
1) This issue is observed after applying "Commit a42055e8d2c3: Add
support for async encryption of records for performance"
2) 5.2.2 kernel exhibits the same issue

Attached is the complete crash log.

Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=204669

linux-crypto original post:
https://marc.info/?l=linux-crypto-vger&m=156700690108854&w=2


After adding custom profiling around lock_sock/release_sock as well as
getting backtraces of the call stack at and around the time of the
crash/race-condition, it seems like using the socket lock is not the
best way to synchronize write access to tls_tx_records, especially
when the socket lock can get released under tcp memory pressure situation.

One potential way for race condition to appear:

When under tcp memory pressure, Thread 1 takes the following code path:
do_sendfile ---> ... ---> .... ---> tls_sw_sendpage --->
tls_sw_do_sendpage ---> tls_tx_records ---> tls_push_sg --->
do_tcp_sendpages ---> sk_stream_wait_memory ---> sk_wait_event

sk_wait_event releases the socket lock and sleeps waiting for memory:

#define sk_wait_event(__sk, __timeo, __condition, __wait)       \
     ({  int __rc;                       \
         release_sock(__sk);                 \
         __rc = __condition;                 \
         if (!__rc) {                        \
             *(__timeo) = wait_woken(__wait,         \
                         TASK_INTERRUPTIBLE, \
                         *(__timeo));        \
         }                           \
         sched_annotate_sleep();                 \
         lock_sock(__sk);                    \
         __rc = __condition;                 \
         __rc;                           \
     })

Thread 2 code path:
tx_work_handler ---> tls_tx_records

Thread 2 is able to obtain the socket lock and go through the
transmission of the ctx->tx_list, deleting the sent ones (as in the
for loop below).

int tls_tx_records(struct sock *sk, int flags)
{
     ....
     ....
     ....
     ....
     list_for_each_entry_safe(rec, tmp, &ctx->tx_list, list) {
          if (READ_ONCE(rec->tx_ready)) {
              if (flags == -1)
                  tx_flags = rec->tx_flags;
              else
                  tx_flags = flags;

              msg_en = &rec->msg_encrypted;
              rc = tls_push_sg(sk, tls_ctx,
                       &msg_en->sg.data[msg_en->sg.curr],
                       0, tx_flags);
              if (rc)
                  goto tx_err;

              list_del(&rec->list); // **** crash location ****
              sk_msg_free(sk, &rec->msg_plaintext);
              kfree(rec);
          } else {
              break;
          }
      }
     ....
     ....
     ....
     ....
}

When Thread 1 wakes up from tls_push_sg call and attempts list_del on
previously grabbed record which was sent and deleted by Thread 2, it
causes the crash.


To fix this race, a flag or bool inside of ctx can be used to
synchronize access to tls_tx_records.

[-- Attachment #2: tls_crash_log_5_3_rc4.txt --]
[-- Type: text/plain, Size: 11274 bytes --]

[  270.888952] ------------[ cut here ]------------
[  270.890450] list_del corruption, ffff91cc3753a800->prev is LIST_POISON2 (dead000000000122)
[  270.891194] WARNING: CPU: 1 PID: 7387 at lib/list_debug.c:50 __list_del_entry_valid+0x62/0x90
[  270.892037] Modules linked in: n5pf(OE) netconsole tls(OE) bonding intel_rapl_msr intel_rapl_common sb_edac x86_pkg_temp_thermal intel_powerclamp coretemp kvm_intel kvm iTCO_wdt iTCO_vendor_support irqbypass crct10dif_pclmul crc32_pclmul ghash_clmulni_intel aesni_intel crypto_simd mei_me cryptd glue_helper ipmi_si sg mei lpc_ich pcspkr joydev ioatdma i2c_i801 ipmi_devintf ipmi_msghandler wmi ip_tables xfs libcrc32c sd_mod mgag200 drm_vram_helper ttm drm_kms_helper syscopyarea sysfillrect sysimgblt fb_sys_fops drm isci libsas ahci scsi_transport_sas libahci crc32c_intel serio_raw igb libata ptp pps_core dca i2c_algo_bit dm_mirror dm_region_hash dm_log dm_mod [last unloaded: nitrox_drv]
[  270.896836] CPU: 1 PID: 7387 Comm: uperf Kdump: loaded Tainted: G           OE     5.3.0-rc4 #1
[  270.897711] Hardware name: Supermicro SYS-1027R-N3RF/X9DRW, BIOS 3.0c 03/24/2014
[  270.898597] RIP: 0010:__list_del_entry_valid+0x62/0x90
[  270.899478] Code: 00 00 00 c3 48 89 fe 48 89 c2 48 c7 c7 e0 f9 ee 8d e8 b2 cf c8 ff 0f 0b 31 c0 c3 48 89 fe 48 c7 c7 18 fa ee 8d e8 9e cf c8 ff <0f> 0b 31 c0 c3 48 89 f2 48 89 fe 48 c7 c7 50 fa ee 8d e8 87 cf c8
[  270.901321] RSP: 0018:ffffb6ea86eb7c20 EFLAGS: 00010282
[  270.902240] RAX: 0000000000000000 RBX: ffff91cc3753c000 RCX: 0000000000000000
[  270.903157] RDX: ffff91bc3f867080 RSI: ffff91bc3f857738 RDI: ffff91bc3f857738
[  270.904074] RBP: ffff91bc36020940 R08: 0000000000000560 R09: 0000000000000000
[  270.904988] R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000000000
[  270.905902] R13: ffff91cc3753a800 R14: ffff91cc37cc6400 R15: ffff91cc3753a800
[  270.906809] FS:  00007f454a88d700(0000) GS:ffff91bc3f840000(0000) knlGS:0000000000000000
[  270.907715] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[  270.908606] CR2: 00007f453c00292c CR3: 000000103554e003 CR4: 00000000001606e0
[  270.909490] Call Trace:
[  270.910373]  tls_tx_records+0x138/0x1c0 [tls]
[  270.911262]  tls_sw_sendpage+0x3e0/0x420 [tls]
[  270.912154]  inet_sendpage+0x52/0x90
[  270.913045]  ? direct_splice_actor+0x40/0x40
[  270.913941]  kernel_sendpage+0x1a/0x30
[  270.914831]  sock_sendpage+0x20/0x30
[  270.915714]  pipe_to_sendpage+0x62/0x90
[  270.916592]  __splice_from_pipe+0x80/0x180
[  270.917461]  ? direct_splice_actor+0x40/0x40
[  270.918334]  splice_from_pipe+0x5d/0x90
[  270.919208]  direct_splice_actor+0x35/0x40
[  270.920086]  splice_direct_to_actor+0x103/0x230
[  270.920966]  ? generic_pipe_buf_nosteal+0x10/0x10
[  270.921850]  do_splice_direct+0x9a/0xd0
[  270.922733]  do_sendfile+0x1c9/0x3d0
[  270.923612]  __x64_sys_sendfile64+0x5c/0xc0
[  270.924485]  do_syscall_64+0x5b/0x1d0
[  270.925344]  entry_SYSCALL_64_after_hwframe+0x44/0xa9
[  270.926188] RIP: 0033:0x7f454c9bf6ba
[  270.927010] Code: 31 c0 5b c3 0f 1f 40 00 48 89 da 4c 89 ce 44 89 c7 5b e9 09 fe ff ff 66 0f 1f 84 00 00 00 00 00 49 89 ca b8 28 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d 86 27 2d 00 f7 d8 64 89 01 48
[  270.928710] RSP: 002b:00007f454a88cd08 EFLAGS: 00000246 ORIG_RAX: 0000000000000028
[  270.929573] RAX: ffffffffffffffda RBX: 0000000002025460 RCX: 00007f454c9bf6ba
[  270.930433] RDX: 00007f454a88cd18 RSI: 0000000000000005 RDI: 000000000000000a
[  270.931294] RBP: 000000000000000a R08: 00007f454cc920ec R09: 00007f454cc92140
[  270.932146] R10: 0000000000001000 R11: 0000000000000246 R12: 0000000000000000
[  270.932988] R13: 0000000000000000 R14: 0000000000000000 R15: 0000000002030788
[  270.933809] ---[ end trace 4ee70802e847a9de ]---
[  270.934624] ------------[ cut here ]------------
[  270.935957] list_del corruption, ffff91cc3753c000->prev is LIST_POISON2 (dead000000000122)
[  270.937381] WARNING: CPU: 1 PID: 7387 at lib/list_debug.c:50 __list_del_entry_valid+0x62/0x90
[  270.938384] Modules linked in: n5pf(OE) netconsole tls(OE) bonding intel_rapl_msr intel_rapl_common sb_edac x86_pkg_temp_thermal intel_powerclamp coretemp kvm_intel kvm iTCO_wdt iTCO_vendor_support irqbypass crct10dif_pclmul crc32_pclmul ghash_clmulni_intel aesni_intel crypto_simd mei_me cryptd glue_helper ipmi_si sg mei lpc_ich pcspkr joydev ioatdma i2c_i801 ipmi_devintf ipmi_msghandler wmi ip_tables xfs libcrc32c sd_mod mgag200 drm_vram_helper ttm drm_kms_helper syscopyarea sysfillrect sysimgblt fb_sys_fops drm isci libsas ahci scsi_transport_sas libahci crc32c_intel serio_raw igb libata ptp pps_core dca i2c_algo_bit dm_mirror dm_region_hash dm_log dm_mod [last unloaded: nitrox_drv]
[  270.943119] CPU: 1 PID: 7387 Comm: uperf Kdump: loaded Tainted: G        W  OE     5.3.0-rc4 #1
[  270.943920] Hardware name: Supermicro SYS-1027R-N3RF/X9DRW, BIOS 3.0c 03/24/2014
[  270.944725] RIP: 0010:__list_del_entry_valid+0x62/0x90
[  270.945522] Code: 00 00 00 c3 48 89 fe 48 89 c2 48 c7 c7 e0 f9 ee 8d e8 b2 cf c8 ff 0f 0b 31 c0 c3 48 89 fe 48 c7 c7 18 fa ee 8d e8 9e cf c8 ff <0f> 0b 31 c0 c3 48 89 f2 48 89 fe 48 c7 c7 50 fa ee 8d e8 87 cf c8
[  270.947193] RSP: 0018:ffffb6ea86eb7c20 EFLAGS: 00010282
[  270.948037] RAX: 0000000000000000 RBX: ffff91cc3753a800 RCX: 0000000000000000
[  270.948882] RDX: ffff91bc3f867080 RSI: ffff91bc3f857738 RDI: ffff91bc3f857738
[  270.949723] RBP: ffff91bc36020940 R08: 000000000000058d R09: 0000000000000000
[  270.950562] R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000000000
[  270.951387] R13: ffff91cc3753c000 R14: ffff91cc37cc6400 R15: ffff91cc3753c000
[  270.952220] FS:  00007f454a88d700(0000) GS:ffff91bc3f840000(0000) knlGS:0000000000000000
[  270.953065] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[  270.953914] CR2: 00007f453c00292c CR3: 000000103554e003 CR4: 00000000001606e0
[  270.954777] Call Trace:
[  270.955636]  tls_tx_records+0x138/0x1c0 [tls]
[  270.956499]  tls_sw_sendpage+0x3e0/0x420 [tls]
[  270.957366]  inet_sendpage+0x52/0x90
[  270.958232]  ? direct_splice_actor+0x40/0x40
[  270.959101]  kernel_sendpage+0x1a/0x30
[  270.959967]  sock_sendpage+0x20/0x30
[  270.960830]  pipe_to_sendpage+0x62/0x90
[  270.961693]  __splice_from_pipe+0x80/0x180
[  270.962556]  ? direct_splice_actor+0x40/0x40
[  270.963413]  splice_from_pipe+0x5d/0x90
[  270.964274]  direct_splice_actor+0x35/0x40
[  270.965136]  splice_direct_to_actor+0x103/0x230
[  270.966001]  ? generic_pipe_buf_nosteal+0x10/0x10
[  270.966867]  do_splice_direct+0x9a/0xd0
[  270.967727]  do_sendfile+0x1c9/0x3d0
[  270.968565]  __x64_sys_sendfile64+0x5c/0xc0
[  270.969380]  do_syscall_64+0x5b/0x1d0
[  270.970185]  entry_SYSCALL_64_after_hwframe+0x44/0xa9
[  270.970985] RIP: 0033:0x7f454c9bf6ba
[  270.971778] Code: 31 c0 5b c3 0f 1f 40 00 48 89 da 4c 89 ce 44 89 c7 5b e9 09 fe ff ff 66 0f 1f 84 00 00 00 00 00 49 89 ca b8 28 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d 86 27 2d 00 f7 d8 64 89 01 48
[  270.973438] RSP: 002b:00007f454a88cd08 EFLAGS: 00000246 ORIG_RAX: 0000000000000028
[  270.974293] RAX: ffffffffffffffda RBX: 0000000002025460 RCX: 00007f454c9bf6ba
[  270.975148] RDX: 00007f454a88cd18 RSI: 0000000000000005 RDI: 000000000000000a
[  270.976009] RBP: 000000000000000a R08: 00007f454cc920ec R09: 00007f454cc92140
[  270.976860] R10: 0000000000001000 R11: 0000000000000246 R12: 0000000000000000
[  270.977699] R13: 0000000000000000 R14: 0000000000000000 R15: 0000000002030788
[  270.978514] ---[ end trace 4ee70802e847a9df ]---
[  270.979330] BUG: unable to handle page fault for address: ffff91d383e05784
[  270.980749] #PF: supervisor read access in kernel mode
[  270.981820] #PF: error_code(0x0000) - not-present page
[  270.982571] PGD 1032e01067 P4D 1032e01067 PUD 0 
[  270.983303] Oops: 0000 [#1] SMP PTI
[  270.984020] CPU: 1 PID: 7387 Comm: uperf Kdump: loaded Tainted: G        W  OE     5.3.0-rc4 #1
[  270.984748] Hardware name: Supermicro SYS-1027R-N3RF/X9DRW, BIOS 3.0c 03/24/2014
[  270.985473] RIP: 0010:tls_push_sg+0x27/0x190 [tls]
[  270.986184] Code: 00 00 00 0f 1f 44 00 00 41 57 44 0f b7 d1 41 56 41 55 49 89 d5 41 54 45 89 c4 41 81 cc 00 00 02 00 55 48 89 fd 53 48 83 ec 10 <44> 8b 4a 0c 48 89 74 24 08 44 89 44 24 04 45 89 ce 45 29 d6 44 03
[  270.987631] RSP: 0018:ffffb6ea86eb7be0 EFLAGS: 00010282
[  270.988350] RAX: 000000074c8caca0 RBX: ffff91cc3753b800 RCX: 0000000000000000
[  270.989081] RDX: ffff91d383e05778 RSI: ffff91cc37cc6400 RDI: ffff91bc36020940
[  270.989813] RBP: ffff91bc36020940 R08: 0000000000000000 R09: 0000000000000000
[  270.990548] R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000020000
[  270.991272] R13: ffff91d383e05778 R14: ffff91cc37cc6400 R15: ffff91cc3753a800
[  270.991999] FS:  00007f454a88d700(0000) GS:ffff91bc3f840000(0000) knlGS:0000000000000000
[  270.992727] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[  270.993448] CR2: ffff91d383e05784 CR3: 000000103554e003 CR4: 00000000001606e0
[  270.994177] Call Trace:
[  270.994905]  tls_tx_records+0x128/0x1c0 [tls]
[  270.995638]  tls_sw_sendpage+0x3e0/0x420 [tls]
[  270.996365]  inet_sendpage+0x52/0x90
[  270.997095]  ? direct_splice_actor+0x40/0x40
[  270.997826]  kernel_sendpage+0x1a/0x30
[  270.998555]  sock_sendpage+0x20/0x30
[  270.999281]  pipe_to_sendpage+0x62/0x90
[  271.000010]  __splice_from_pipe+0x80/0x180
[  271.000740]  ? direct_splice_actor+0x40/0x40
[  271.001467]  splice_from_pipe+0x5d/0x90
[  271.002199]  direct_splice_actor+0x35/0x40
[  271.002930]  splice_direct_to_actor+0x103/0x230
[  271.003661]  ? generic_pipe_buf_nosteal+0x10/0x10
[  271.004387]  do_splice_direct+0x9a/0xd0
[  271.005118]  do_sendfile+0x1c9/0x3d0
[  271.005849]  __x64_sys_sendfile64+0x5c/0xc0
[  271.006579]  do_syscall_64+0x5b/0x1d0
[  271.007301]  entry_SYSCALL_64_after_hwframe+0x44/0xa9
[  271.008031] RIP: 0033:0x7f454c9bf6ba
[  271.008737] Code: 31 c0 5b c3 0f 1f 40 00 48 89 da 4c 89 ce 44 89 c7 5b e9 09 fe ff ff 66 0f 1f 84 00 00 00 00 00 49 89 ca b8 28 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d 86 27 2d 00 f7 d8 64 89 01 48
[  271.010187] RSP: 002b:00007f454a88cd08 EFLAGS: 00000246 ORIG_RAX: 0000000000000028
[  271.010915] RAX: ffffffffffffffda RBX: 0000000002025460 RCX: 00007f454c9bf6ba
[  271.011648] RDX: 00007f454a88cd18 RSI: 0000000000000005 RDI: 000000000000000a
[  271.012374] RBP: 000000000000000a R08: 00007f454cc920ec R09: 00007f454cc92140
[  271.013109] R10: 0000000000001000 R11: 0000000000000246 R12: 0000000000000000
[  271.013848] R13: 0000000000000000 R14: 0000000000000000 R15: 0000000002030788
[  271.014577] Modules linked in: n5pf(OE) netconsole tls(OE) bonding intel_rapl_msr intel_rapl_common sb_edac x86_pkg_temp_thermal intel_powerclamp coretemp kvm_intel kvm iTCO_wdt iTCO_vendor_support irqbypass crct10dif_pclmul crc32_pclmul ghash_clmulni_intel aesni_intel crypto_simd mei_me cryptd glue_helper ipmi_si sg mei lpc_ich pcspkr joydev ioatdma i2c_i801 ipmi_devintf ipmi_msghandler wmi ip_tables xfs libcrc32c sd_mod mgag200 drm_vram_helper ttm drm_kms_helper syscopyarea sysfillrect sysimgblt fb_sys_fops drm isci libsas ahci scsi_transport_sas libahci crc32c_intel serio_raw igb libata ptp pps_core dca i2c_algo_bit dm_mirror dm_region_hash dm_log dm_mod [last unloaded: nitrox_drv]
[  271.019344] CR2: ffff91d383e05784


^ permalink raw reply

* Re: [PATCH] libbpf: Don't error out if getsockopt() fails for XDP_OPTIONS
From: Yonghong Song @ 2019-09-09 17:53 UTC (permalink / raw)
  To: Toke Høiland-Jørgensen, Alexei Starovoitov,
	Daniel Borkmann, netdev@vger.kernel.org, bpf@vger.kernel.org
In-Reply-To: <20190909174619.1735-1-toke@redhat.com>



On 9/9/19 10:46 AM, Toke Høiland-Jørgensen wrote:
> The xsk_socket__create() function fails and returns an error if it cannot
> get the XDP_OPTIONS through getsockopt(). However, support for XDP_OPTIONS
> was not added until kernel 5.3, so this means that creating XSK sockets
> always fails on older kernels.
> 
> Since the option is just used to set the zero-copy flag in the xsk struct,
> there really is no need to error out if the getsockopt() call fails.
> 
> Signed-off-by: Toke Høiland-Jørgensen <toke@redhat.com>
> ---
>   tools/lib/bpf/xsk.c | 8 ++------
>   1 file changed, 2 insertions(+), 6 deletions(-)
> 
> diff --git a/tools/lib/bpf/xsk.c b/tools/lib/bpf/xsk.c
> index 680e63066cf3..598e487d9ce8 100644
> --- a/tools/lib/bpf/xsk.c
> +++ b/tools/lib/bpf/xsk.c
> @@ -603,12 +603,8 @@ int xsk_socket__create(struct xsk_socket **xsk_ptr, const char *ifname,
>   
>   	optlen = sizeof(opts);
>   	err = getsockopt(xsk->fd, SOL_XDP, XDP_OPTIONS, &opts, &optlen);
> -	if (err) {
> -		err = -errno;
> -		goto out_mmap_tx;
> -	}
> -
> -	xsk->zc = opts.flags & XDP_OPTIONS_ZEROCOPY;
> +	if (!err)
> +		xsk->zc = opts.flags & XDP_OPTIONS_ZEROCOPY;
>   
>   	if (!(xsk->config.libbpf_flags & XSK_LIBBPF_FLAGS__INHIBIT_PROG_LOAD)) {
>   		err = xsk_setup_xdp_prog(xsk);

Since 'zc' is not used by anybody, maybe all codes 'zc' related can be 
removed? It can be added back back once there is an interface to use 'zc'?

diff --git a/tools/lib/bpf/xsk.c b/tools/lib/bpf/xsk.c
index 842c4fd55859..24fa313524fb 100644
--- a/tools/lib/bpf/xsk.c
+++ b/tools/lib/bpf/xsk.c
@@ -65,7 +65,6 @@ struct xsk_socket {
         int xsks_map_fd;
         __u32 queue_id;
         char ifname[IFNAMSIZ];
-       bool zc;
  };

  struct xsk_nl_info {
@@ -491,7 +490,6 @@ int xsk_socket__create(struct xsk_socket **xsk_ptr, 
const char *ifname,
         void *rx_map = NULL, *tx_map = NULL;
         struct sockaddr_xdp sxdp = {};
         struct xdp_mmap_offsets off;
-       struct xdp_options opts;
         struct xsk_socket *xsk;
         socklen_t optlen;
         int err;
@@ -611,15 +609,6 @@ int xsk_socket__create(struct xsk_socket **xsk_ptr, 
const char *ifname,

         xsk->prog_fd = -1;

-       optlen = sizeof(opts);
-       err = getsockopt(xsk->fd, SOL_XDP, XDP_OPTIONS, &opts, &optlen);
-       if (err) {
-               err = -errno;
-               goto out_mmap_tx;
-       }
-
-       xsk->zc = opts.flags & XDP_OPTIONS_ZEROCOPY;
-
         if (!(xsk->config.libbpf_flags & 
XSK_LIBBPF_FLAGS__INHIBIT_PROG_LOAD)) {
                 err = xsk_setup_xdp_prog(xsk);
                 if (err)

^ permalink raw reply related

* [PATCH net 1/1] net/tls(TLS_SW): Fix list_del double free caused by a race condition in tls_tx_records
From: Pooja Trivedi @ 2019-09-09 18:15 UTC (permalink / raw)
  To: netdev
In-Reply-To: <CAOrEdsnNZ3GJTFzfcBhUv6wvnXTJf=b9eJ8Exk2CXR6VyLsn1Q@mail.gmail.com>

    Enclosing tls_tx_records within lock_sock/release_sock pair to ensure
    write-synchronization is not sufficient because socket lock gets released
    under memory pressure situation by sk_wait_event while it sleeps waiting
    for memory, allowing another writer into tls_tx_records. This causes a
    race condition with record deletion post transmission.

    To fix this bug, use a flag set in tx_bitmask field of TLS context to
    ensure single writer in tls_tx_records at a time

    The bug resulted in the following crash:


    [  270.888952] ------------[ cut here ]------------
    [  270.890450] list_del corruption, ffff91cc3753a800->prev is
    LIST_POISON2 (dead000000000122)
    [  270.891194] WARNING: CPU: 1 PID: 7387 at lib/list_debug.c:50
    __list_del_entry_valid+0x62/0x90
    [  270.892037] Modules linked in: n5pf(OE) netconsole tls(OE) bonding
    intel_rapl_msr intel_rapl_common sb_edac x86_pkg_temp_thermal
    intel_powerclamp coretemp kvm_intel kvm iTCO_wdt iTCO_vendor_support
    irqbypass crct10dif_pclmul crc32_pclmul ghash_clmulni_intel
    aesni_intel crypto_simd mei_me cryptd glue_helper ipmi_si sg mei
    lpc_ich pcspkr joydev ioatdma i2c_i801 ipmi_devintf ipmi_msghandler
    wmi ip_tables xfs libcrc32c sd_mod mgag200 drm_vram_helper ttm
    drm_kms_helper syscopyarea sysfillrect sysimgblt fb_sys_fops drm isci
    libsas ahci scsi_transport_sas libahci crc32c_intel serio_raw igb
    libata ptp pps_core dca i2c_algo_bit dm_mirror dm_region_hash dm_log
    dm_mod [last unloaded: nitrox_drv]
    [  270.896836] CPU: 1 PID: 7387 Comm: uperf Kdump: loaded Tainted: G
            OE     5.3.0-rc4 #1
    [  270.897711] Hardware name: Supermicro SYS-1027R-N3RF/X9DRW, BIOS
    3.0c 03/24/2014
    [  270.898597] RIP: 0010:__list_del_entry_valid+0x62/0x90
    [  270.899478] Code: 00 00 00 c3 48 89 fe 48 89 c2 48 c7 c7 e0 f9 ee
    8d e8 b2 cf c8 ff 0f 0b 31 c0 c3 48 89 fe 48 c7 c7 18 fa ee 8d e8 9e
    cf c8 ff <0f> 0b 31 c0 c3 48 89 f2 48 89 fe 48 c7 c7 50 fa ee 8d e8 87
    cf c8
    [  270.901321] RSP: 0018:ffffb6ea86eb7c20 EFLAGS: 00010282
    [  270.902240] RAX: 0000000000000000 RBX: ffff91cc3753c000 RCX:
0000000000000000
    [  270.903157] RDX: ffff91bc3f867080 RSI: ffff91bc3f857738 RDI:
ffff91bc3f857738
    [  270.904074] RBP: ffff91bc36020940 R08: 0000000000000560 R09:
0000000000000000
    [  270.904988] R10: 0000000000000000 R11: 0000000000000000 R12:
0000000000000000
    [  270.905902] R13: ffff91cc3753a800 R14: ffff91cc37cc6400 R15:
ffff91cc3753a800
    [  270.906809] FS:  00007f454a88d700(0000) GS:ffff91bc3f840000(0000)
    knlGS:0000000000000000
    [  270.907715] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
    [  270.908606] CR2: 00007f453c00292c CR3: 000000103554e003 CR4:
00000000001606e0
    [  270.909490] Call Trace:
    [  270.910373]  tls_tx_records+0x138/0x1c0 [tls]
    [  270.911262]  tls_sw_sendpage+0x3e0/0x420 [tls]
    [  270.912154]  inet_sendpage+0x52/0x90
    [  270.913045]  ? direct_splice_actor+0x40/0x40
    [  270.913941]  kernel_sendpage+0x1a/0x30
    [  270.914831]  sock_sendpage+0x20/0x30
    [  270.915714]  pipe_to_sendpage+0x62/0x90
    [  270.916592]  __splice_from_pipe+0x80/0x180
    [  270.917461]  ? direct_splice_actor+0x40/0x40
    [  270.918334]  splice_from_pipe+0x5d/0x90
    [  270.919208]  direct_splice_actor+0x35/0x40
    [  270.920086]  splice_direct_to_actor+0x103/0x230
    [  270.920966]  ? generic_pipe_buf_nosteal+0x10/0x10
    [  270.921850]  do_splice_direct+0x9a/0xd0
    [  270.922733]  do_sendfile+0x1c9/0x3d0
    [  270.923612]  __x64_sys_sendfile64+0x5c/0xc0

    Signed-off-by: Pooja Trivedi <pooja.trivedi@stackpath.com>

----------

diff --git a/include/net/tls.h b/include/net/tls.h
index 41b2d41..f346a54 100644
--- a/include/net/tls.h
+++ b/include/net/tls.h
@@ -161,6 +161,7 @@ struct tls_sw_context_tx {

 #define BIT_TX_SCHEDULED 0
 #define BIT_TX_CLOSING 1
+#define BIT_TX_IN_PROGRESS 2
  unsigned long tx_bitmask;
 };

diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c
index 91d21b0..6e99c61 100644
--- a/net/tls/tls_sw.c
+++ b/net/tls/tls_sw.c
@@ -367,6 +367,10 @@ int tls_tx_records(struct sock *sk, int flags)
  struct sk_msg *msg_en;
  int tx_flags, rc = 0;

+ /* If another writer is already in tls_tx_records, backoff and leave */
+ if (test_and_set_bit(BIT_TX_IN_PROGRESS, &ctx->tx_bitmask))
+ return 0;
+
  if (tls_is_partially_sent_record(tls_ctx)) {
  rec = list_first_entry(&ctx->tx_list,
        struct tls_rec, list);
@@ -415,6 +419,9 @@ int tls_tx_records(struct sock *sk, int flags)
  if (rc < 0 && rc != -EAGAIN)
  tls_err_abort(sk, EBADMSG);

+ /* clear the bit so another writer can get into tls_tx_records */
+ clear_bit(BIT_TX_IN_PROGRESS, &ctx->tx_bitmask);
+
  return rc;
 }

^ permalink raw reply related

* Re: ixgbe: driver drops packets routed from an IPSec interface with a "bad sa_idx" error
From: Shannon Nelson @ 2019-09-09 18:21 UTC (permalink / raw)
  To: Michael Marley, netdev, Jeff Kirsher, steffen.klassert
In-Reply-To: <10ba81d178d4ade76741c1a6e1672056@michaelmarley.com>

On 9/6/19 11:13 AM, Michael Marley wrote:
> (This is also reported at 
> https://bugzilla.kernel.org/show_bug.cgi?id=204551, but it was 
> recommended that I send it to this list as well.)
>
> I have a put together a router that routes traffic from several local 
> subnets from a switch attached to an i82599ES card through an IPSec 
> VPN interface set up with StrongSwan.  (The VPN is running on an 
> unrelated second interface with a different driver.)  Traffic from the 
> local interfaces to the VPN works as it should and eventually makes it 
> through the VPN server and out to the Internet.  The return traffic 
> makes it back to the router and tcpdump shows it leaving by the 
> i82599, but the traffic never actually makes it onto the wire and I 
> instead get one of
>
> enp1s0: ixgbe_ipsec_tx: bad sa_idx=64512 handle=0
>
> for each packet that should be transmitted.  (The sa_idx and handle 
> values are always the same.)
>
> I realized this was probably related to ixgbe's IPSec offloading 
> feature, so I tried with the motherboard's integrated e1000e device 
> and didn't have the problem.  I tried using ethtool to disable all the 
> IPSec-related offloads (tx-esp-segmentation, esp-hw-offload, 
> esp-tx-csum-hw-offload), but the problem persisted.  I then tried 
> recompiling the kernel with CONFIG_IXGBE_IPSEC=n and that worked 
> around the problem.
>
> I was also able to find another instance of the same problem reported 
> in Debian at 
> https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=930443.  That person 
> seems to be having exactly the same issue as me, down to the sa_idx 
> and handle values being the same.
>
> If there are any more details I can provide to make this easier to 
> track down, please let me know.
>
> Thanks,
>
> Michael Marley

Hi Michael,

Thanks for pointing this out.  The issue this error message is 
complaining about is that the handle given to the driver is a bad 
value.  The handle is what helps the driver find the right encryption 
information, and in this case is an index into an array, one array for 
Rx and one for Tx, each of which have up to 1024 entries.  In order to 
encode them into a single value, 1024 is added to the Tx values to make 
the handle, and 1024 is subtracted to use the handle later.  Note that 
the bad sa_idx is 64512, which happens to also be -1024; if the Tx 
handle given to ixgbe for xmit is 0, we subtract 1024 from that and get 
this bad sa_idx value.

That handle is supposed to be an opaque value only used by the driver.  
It looks to me like either (a) the driver is not setting up the handle 
correctly when the SA is first set up, or (b) something in the upper 
levels of the ipsec code is clearing the handle value. We would need to 
know more about all the bits in your SA set up to have a better idea 
what parts of the ipsec code are being exercised when this problem happens.

I currently don't have access to a good ixgbe setup on which to 
test/debug this, and I haven't been paying much attention lately to 
what's happening in the upper ipsec layers, so my help will be somewhat 
limited.  I'm hoping the the Intel folks can add a little help, so I've 
copied Jeff Kirsher on this (they'll probably point back to me since I 
wrote this chunk :-) ).  I've also copied Stephen Klassert for his ipsec 
thoughts.

In the meantime, can you give more details on the exact ipsec rules that 
are used here, and are there any error messages coming from ixgbe 
regarding the ipsec rule setup that might help us identify what's happening?

Thanks,
sln



^ permalink raw reply

* Re: ixgbe: driver drops packets routed from an IPSec interface with a "bad sa_idx" error
From: Michael Marley @ 2019-09-09 18:45 UTC (permalink / raw)
  To: Shannon Nelson; +Cc: netdev, Jeff Kirsher, steffen.klassert
In-Reply-To: <4caa4fb7-9963-99ab-318f-d8ada4f19205@pensando.io>

On 2019-09-09 14:21, Shannon Nelson wrote:
> On 9/6/19 11:13 AM, Michael Marley wrote:
>> (This is also reported at 
>> https://bugzilla.kernel.org/show_bug.cgi?id=204551, but it was 
>> recommended that I send it to this list as well.)
>> 
>> I have a put together a router that routes traffic from several local 
>> subnets from a switch attached to an i82599ES card through an IPSec 
>> VPN interface set up with StrongSwan.  (The VPN is running on an 
>> unrelated second interface with a different driver.)  Traffic from the 
>> local interfaces to the VPN works as it should and eventually makes it 
>> through the VPN server and out to the Internet.  The return traffic 
>> makes it back to the router and tcpdump shows it leaving by the 
>> i82599, but the traffic never actually makes it onto the wire and I 
>> instead get one of
>> 
>> enp1s0: ixgbe_ipsec_tx: bad sa_idx=64512 handle=0
>> 
>> for each packet that should be transmitted.  (The sa_idx and handle 
>> values are always the same.)
>> 
>> I realized this was probably related to ixgbe's IPSec offloading 
>> feature, so I tried with the motherboard's integrated e1000e device 
>> and didn't have the problem.  I tried using ethtool to disable all the 
>> IPSec-related offloads (tx-esp-segmentation, esp-hw-offload, 
>> esp-tx-csum-hw-offload), but the problem persisted.  I then tried 
>> recompiling the kernel with CONFIG_IXGBE_IPSEC=n and that worked 
>> around the problem.
>> 
>> I was also able to find another instance of the same problem reported 
>> in Debian at 
>> https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=930443.  That person 
>> seems to be having exactly the same issue as me, down to the sa_idx 
>> and handle values being the same.
>> 
>> If there are any more details I can provide to make this easier to 
>> track down, please let me know.
>> 
>> Thanks,
>> 
>> Michael Marley
> 
> Hi Michael,
> 
> Thanks for pointing this out.  The issue this error message is
> complaining about is that the handle given to the driver is a bad
> value.  The handle is what helps the driver find the right encryption
> information, and in this case is an index into an array, one array for
> Rx and one for Tx, each of which have up to 1024 entries.  In order to
> encode them into a single value, 1024 is added to the Tx values to
> make the handle, and 1024 is subtracted to use the handle later.  Note
> that the bad sa_idx is 64512, which happens to also be -1024; if the
> Tx handle given to ixgbe for xmit is 0, we subtract 1024 from that and
> get this bad sa_idx value.
> 
> That handle is supposed to be an opaque value only used by the
> driver.  It looks to me like either (a) the driver is not setting up
> the handle correctly when the SA is first set up, or (b) something in
> the upper levels of the ipsec code is clearing the handle value. We
> would need to know more about all the bits in your SA set up to have a
> better idea what parts of the ipsec code are being exercised when this
> problem happens.
> 
> I currently don't have access to a good ixgbe setup on which to
> test/debug this, and I haven't been paying much attention lately to
> what's happening in the upper ipsec layers, so my help will be
> somewhat limited.  I'm hoping the the Intel folks can add a little
> help, so I've copied Jeff Kirsher on this (they'll probably point back
> to me since I wrote this chunk :-) ).  I've also copied Stephen
> Klassert for his ipsec thoughts.
> 
> In the meantime, can you give more details on the exact ipsec rules
> that are used here, and are there any error messages coming from ixgbe
> regarding the ipsec rule setup that might help us identify what's
> happening?
> 
> Thanks,
> sln

Hi Shannon,

Thanks for your response!  I apologize, I am a bit of a newbie to IPSec 
myself, so I'm not 100% sure what is the best way to provide the 
information you need, but here is the (slightly-redacted) output of 
swanctl --list-sas first from the server and then from the client:

<servername>: #24, ESTABLISHED, IKEv2, 3cb75c180ee5dc68_i 
cc7dae551b603bb7_r*
   local  '<serverip>' @ <serverip>[4500]
   remote '<clientip>' @ <clientip>[4500]
   AES_GCM_16-256/PRF_HMAC_SHA2_512/ECP_384
   established 174180s ago
   <servername>: #110, reqid 12, INSTALLED, TUNNEL-in-UDP, 
ESP:AES_GCM_16-256/ECP_384
     installed 469s ago
     in  c51a0f11 (-|0x00000064), 1548864 bytes, 19575 packets,     6s 
ago
     out c3bd9741 (-|0x00000064), 23618807 bytes, 22865 packets,     7s 
ago
     local  0.0.0.0/0 ::/0
     remote 0.0.0.0/0 ::/0

<clientname>: #1, ESTABLISHED, IKEv2, 3cb75c180ee5dc68_i* 
cc7dae551b603bb7_r
   local  '<clientip>' @ <clientip>[4500]
   remote '<serverip>' @ <serverip>[4500]
   AES_GCM_16-256/PRF_HMAC_SHA2_512/ECP_384
   established 174013s ago
   <clientname>: #54, reqid 1, INSTALLED, TUNNEL-in-UDP, 
ESP:AES_GCM_16-256/ECP_384
     installed 303s ago, rekeying in 2979s, expires in 3657s
     in  c3bd9741 (-|0x00000064), 23178523 bytes, 20725 packets,     0s 
ago
     out c51a0f11 (-|0x00000064), 1429124 bytes, 17719 packets,     0s 
ago
     local  0.0.0.0/0 ::/0
     remote 0.0.0.0/0 ::/0

It might also be worth mentioning that I am using an xfrm interface to 
do "regular" routing rather than the policy-based routing that 
StrongSwan/IPSec normally uses. If there is anything else that would 
help more, I would be happy to provide it.

Just to be clear though, I'm not trying to run IPSec on the ixgbe 
interface at all.  The ixgbe adapter is being used to connect the router 
to the switch on the LAN side of the network.  IPSec is running on the 
WAN interface without any hardware acceleration (besides AES-NI).  The 
problem occurs when a computer on the LAN tries to access the WAN.  The 
outgoing packets work as expected and the incoming packets are routed 
back out through the ixgbe device toward the LAN client, but the driver 
drops the packets with the sa_idx error.

I hope this helps.

Thanks,

Michael

^ permalink raw reply

* Re: [PATCH net-next v2 1/2] net: stmmac: Only enable enhanced addressing mode when needed
From: Thierry Reding @ 2019-09-09 19:11 UTC (permalink / raw)
  To: Jose Abreu
  Cc: David S . Miller, Giuseppe Cavallaro, Alexandre Torgue,
	Jon Hunter, Bitan Biswas, netdev@vger.kernel.org,
	linux-tegra@vger.kernel.org
In-Reply-To: <BN8PR12MB3266B232D3B895A4A4368191D3B70@BN8PR12MB3266.namprd12.prod.outlook.com>

[-- Attachment #1: Type: text/plain, Size: 502 bytes --]

On Mon, Sep 09, 2019 at 04:07:04PM +0000, Jose Abreu wrote:
> From: Thierry Reding <thierry.reding@gmail.com>
> Date: Sep/09/2019, 16:25:45 (UTC+00:00)
> 
> > @@ -92,6 +92,7 @@ struct stmmac_dma_cfg {
> >  	int fixed_burst;
> >  	int mixed_burst;
> >  	bool aal;
> > +	bool eame;
> 
> bools should not be used in struct's, please change to int.

Huh? Since when? "aal" right above it is also bool. Can you provide a
specific rationale for why we shouldn't use bool in structs?

Thierry

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

^ permalink raw reply

* Re: [PATCH net-next v2 2/2] net: stmmac: Support enhanced addressing mode for DWMAC 4.10
From: Thierry Reding @ 2019-09-09 19:13 UTC (permalink / raw)
  To: Jose Abreu
  Cc: David S . Miller, Giuseppe Cavallaro, Alexandre Torgue,
	Jon Hunter, Bitan Biswas, netdev@vger.kernel.org,
	linux-tegra@vger.kernel.org
In-Reply-To: <BN8PR12MB3266AAC6FF4819EC25CB087BD3B70@BN8PR12MB3266.namprd12.prod.outlook.com>

[-- Attachment #1: Type: text/plain, Size: 1400 bytes --]

On Mon, Sep 09, 2019 at 04:05:52PM +0000, Jose Abreu wrote:
> From: Thierry Reding <thierry.reding@gmail.com>
> Date: Sep/09/2019, 16:25:46 (UTC+00:00)
> 
> > @@ -79,6 +79,10 @@ static void dwmac4_dma_init_rx_chan(void __iomem *ioaddr,
> >  	value = value | (rxpbl << DMA_BUS_MODE_RPBL_SHIFT);
> >  	writel(value, ioaddr + DMA_CHAN_RX_CONTROL(chan));
> >  
> > +	if (dma_cfg->eame)
> 
> There is no need for this check. If EAME is not enabled then upper 32 
> bits will be zero.

The idea here was to potentially guard against this register not being
available on some revisions. Having the check here would avoid access to
the register if the device doesn't support enhanced addressing.

> 
> > +		writel(upper_32_bits(dma_rx_phy),
> > +		       ioaddr + DMA_CHAN_RX_BASE_ADDR_HI(chan));
> > +
> >  	writel(lower_32_bits(dma_rx_phy), ioaddr + DMA_CHAN_RX_BASE_ADDR(chan));
> >  }
> 
> > @@ -97,6 +101,10 @@ static void dwmac4_dma_init_tx_chan(void __iomem *ioaddr,
> >  
> >  	writel(value, ioaddr + DMA_CHAN_TX_CONTROL(chan));
> >  
> > +	if (dma_cfg->eame)
> 
> Same here.
> 
> > +		writel(upper_32_bits(dma_tx_phy),
> > +		       ioaddr + DMA_CHAN_TX_BASE_ADDR_HI(chan));
> > +
> >  	writel(lower_32_bits(dma_tx_phy), ioaddr + DMA_CHAN_TX_BASE_ADDR(chan));
> >  }
> 
> Also, please provide a cover letter in next submission.

Alright, will do.

Thierry

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

^ permalink raw reply

* Re: [PATCH] net/mlx5: reduce stack usage in FW tracer
From: Saeed Mahameed @ 2019-09-09 19:39 UTC (permalink / raw)
  To: davem@davemloft.net, arnd@arndb.de, leon@kernel.org
  Cc: cai@lca.pw, linux-rdma@vger.kernel.org, Moshe Shemesh,
	Feras Daoud, linux-kernel@vger.kernel.org, Eran Ben Elisha,
	netdev@vger.kernel.org, Erez Shitrit
In-Reply-To: <20190906151123.1088455-1-arnd@arndb.de>

On Fri, 2019-09-06 at 17:11 +0200, Arnd Bergmann wrote:
> It's generally not ok to put a 512 byte buffer on the stack, as
> kernel
> stack is a scarce resource:
> 
> drivers/net/ethernet/mellanox/mlx5/core/diag/fw_tracer.c:660:13:
> error: stack frame size of 1032 bytes in function
> 'mlx5_fw_tracer_handle_traces' [-Werror,-Wframe-larger-than=]
> 
> This is done in a context that is allowed to sleep, so using
> dynamic allocation is ok as well. I'm not too worried about
> runtime overhead, as this already contains an snprintf() and
> other expensive functions.
> 
> Fixes: 70dd6fdb8987 ("net/mlx5: FW tracer, parse traces and kernel
> tracing support")
> Signed-off-by: Arnd Bergmann <arnd@arndb.de>
> ---
>  .../mellanox/mlx5/core/diag/fw_tracer.c       | 21 ++++++++++-------
> --
>  1 file changed, 11 insertions(+), 10 deletions(-)
> 
> diff --git a/drivers/net/ethernet/mellanox/mlx5/core/diag/fw_tracer.c
> b/drivers/net/ethernet/mellanox/mlx5/core/diag/fw_tracer.c
> index 2011eaf15cc5..d81e78060f9f 100644
> --- a/drivers/net/ethernet/mellanox/mlx5/core/diag/fw_tracer.c
> +++ b/drivers/net/ethernet/mellanox/mlx5/core/diag/fw_tracer.c
> @@ -557,16 +557,16 @@ static void mlx5_tracer_print_trace(struct
> tracer_string_format *str_frmt,
>  				    struct mlx5_core_dev *dev,
>  				    u64 trace_timestamp)
>  {
> -	char	tmp[512];
> -

Hi Arnd, thanks for the patch, 
this function is very perfomance critical when fw traces are activated
to pull some fw content on error situations, using kmalloc here might
become a problem and stall the system further more if the problem was
initially due to lack of memory.

since this function only needs 512 bytes maybe we should mark it as
noinline to avoid any extra stack usages on the caller function
mlx5_fw_tracer_handle_traces ?

> -	snprintf(tmp, sizeof(tmp), str_frmt->string,
> -		 str_frmt->params[0],
> -		 str_frmt->params[1],
> -		 str_frmt->params[2],
> -		 str_frmt->params[3],
> -		 str_frmt->params[4],
> -		 str_frmt->params[5],
> -		 str_frmt->params[6]);
> +	char *tmp = kasprintf(GFP_KERNEL, str_frmt->string,
> +			      str_frmt->params[0],
> +			      str_frmt->params[1],
> +			      str_frmt->params[2],
> +			      str_frmt->params[3],
> +			      str_frmt->params[4],
> +			      str_frmt->params[5],
> +			      str_frmt->params[6]);
> +	if (!tmp)
> +		return;
>  
>  	trace_mlx5_fw(dev->tracer, trace_timestamp, str_frmt->lost,
>  		      str_frmt->event_id, tmp);
> @@ -576,6 +576,7 @@ static void mlx5_tracer_print_trace(struct
> tracer_string_format *str_frmt,
>  
>  	/* remove it from hash */
>  	mlx5_tracer_clean_message(str_frmt);
> +	kfree(tmp);
>  }
>  
>  static int mlx5_tracer_handle_string_trace(struct mlx5_fw_tracer
> *tracer,

^ permalink raw reply

* RE: VRF Issue Since kernel 5
From: Gowen @ 2019-09-09 19:43 UTC (permalink / raw)
  To: Alexis Bauvin; +Cc: netdev@vger.kernel.org
In-Reply-To: <7CAF2F23-5D88-4BE7-B703-06B71D1EDD11@online.net>

Hi alexis,

I did this earlier today and no change.

I’ll look at trying to see if the return traffic is hitting the INPUT table tomorrow with some conntrack rules and see if it hits any of those rules. If not then do you have any hints/techniques I can use to find the source of the issue?

Gareth

-----Original Message-----
From: Alexis Bauvin <abauvin@online.net> 
Sent: 09 September 2019 13:02
To: Gowen <gowen@potatocomputing.co.uk>
Cc: netdev@vger.kernel.org
Subject: Re: VRF Issue Since kernel 5

Hi,

I guess all routing from the management VRF itself is working correctly (i.e. cURLing an IP from this VRF or digging any DNS), and it is your route leakage that’s at fault.

Could you try swapping the local and l3mdev rules?

`ip rule del pref 0; ip rule add from all lookup local pref 1001`

I faced security issues and behavioral weirdnesses from the default kernel rule ordering regarding the default vrf.

Alexis

> Le 9 sept. 2019 à 12:53, Gowen <gowen@potatocomputing.co.uk> a écrit :
> 
> Hi Alexis,
> 
> Admin@NETM06:~$ sysctl net.ipv4.tcp_l3mdev_accept 
> net.ipv4.tcp_l3mdev_accept = 1
> 
> Admin@NETM06:~$ sudo ip vrf exec mgmt-vrf curl kernel.org
> curl: (6) Could not resolve host: kernel.org
> 
> the failure to resolve is the same with all DNS lookups from any 
> process I've run
> 
> The route is there from the guide I originally used, I can't remember 
> the purpose but I know I don't need it - I've removed it now and no 
> change
> 
> Admin@NETM06:~$ ip rule show
> 0:      from all lookup local
> 1000:   from all lookup [l3mdev-table]
> 32766:  from all lookup main
> 32767:  from all lookup default
> 
> I could switch the VRFs over, but this is a test-box and i have prod boxes on this as well so not so keen on that if I can avoid it.
> 
> From what I can speculate, because the TCP return traffic is met with an RST, it looks like it may be something to do with iptables - but even if I set the policy to ACCEPT and flush all the rules, the behaviour remains the same.
> 
> Is it possible that the TCP stack isn't aware of the session (as is mapped to wrong VRF internally or something to that effect) and is therefore sending the RST?
> 
> Gareth
> From: Alexis Bauvin <abauvin@online.net>
> Sent: 09 September 2019 10:28
> To: Gowen <gowen@potatocomputing.co.uk>
> Cc: netdev@vger.kernel.org <netdev@vger.kernel.org>
> Subject: Re: VRF Issue Since kernel 5
>  
> Hi,
> 
> There has been some changes regarding VRF isolation in Linux 5 IIRC, 
> namely proper isolation of the default VRF.
> 
> Some things you may try:
> 
> - looking at the l3mdev_accept sysctls (e.g. 
> `net.ipv4.tcp_l3mdev_accept`)
> - querying stuff from the management vrf through `ip vrf exec vrf-mgmt <stuff>`
>   e.g. `ip vrf exec vrf-mgmt curl kernel.org`
>        `ip vrf exec vrf-mgmt dig @1.1.1.1 kernel.org`
> - reversing your logic: default VRF is your management one, the other one is for your
>   other boxes
> 
> Also, your `unreachable default metric 4278198272` route looks odd to me.
> 
> What are your routing rules? (`ip rule`)
> 
> Alexis
> 
> > Le 9 sept. 2019 à 09:46, Gowen <gowen@potatocomputing.co.uk> a écrit :
> > 
> > Hi there,
> > 
> > Dave A said this was the mailer to send this to:
> > 
> > 
> > I’ve been using my management interface in a VRF for several months now and it’s worked perfectly – I’ve been able to update/upgrade the packages just fine and iptables works excellently with it – exactly as I needed.
> > 
> > 
> > Since Kernel 5 though I am no longer able to update – but the issue 
> > is quite a curious one as some traffic appears to be fine (DNS 
> > lookups use VRF correctly) but others don’t (updating/upgrading the 
> > packages)
> > 
> > 
> > I have on this device 2 interfaces:
> > Eth0 for management – inbound SSH, DNS, updates/upgrades
> > Eth1 for managing other boxes (ansible using SSH)
> > 
> > 
> > Link and addr info shown below:
> > 
> > 
> > Admin@NETM06:~$ ip link show
> > 1: lo: <LOOPBACK,UP,LOWER_UP> mtu 65536 qdisc noqueue state UNKNOWN mode DEFAULT group default qlen 1000
> >     link/loopback 00:00:00:00:00:00 brd 00:00:00:00:00:00
> > 2: eth0: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc mq master mgmt-vrf state UP mode DEFAULT group default qlen 1000
> >     link/ether 00:22:48:07:cc:ad brd ff:ff:ff:ff:ff:ff
> > 3: eth1: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc mq state UP mode DEFAULT group default qlen 1000
> >     link/ether 00:22:48:07:c9:6c brd ff:ff:ff:ff:ff:ff
> > 4: mgmt-vrf: <NOARP,MASTER,UP,LOWER_UP> mtu 65536 qdisc noqueue state UP mode DEFAULT group default qlen 1000
> >     link/ether 8a:f6:26:65:02:5a brd ff:ff:ff:ff:ff:ff
> > 
> > 
> > Admin@NETM06:~$ ip addr
> > 1: lo: <LOOPBACK,UP,LOWER_UP> mtu 65536 qdisc noqueue state UNKNOWN group default qlen 1000
> >     link/loopback 00:00:00:00:00:00 brd 00:00:00:00:00:00
> >     inet 127.0.0.1/8 scope host lo
> >        valid_lft forever preferred_lft forever
> >     inet6 ::1/128 scope host
> >        valid_lft forever preferred_lft forever
> > 2: eth0: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc mq master mgmt-vrf state UP group default qlen 1000
> >     link/ether 00:22:48:07:cc:ad brd ff:ff:ff:ff:ff:ff
> >     inet 10.24.12.10/24 brd 10.24.12.255 scope global eth0
> >        valid_lft forever preferred_lft forever
> >     inet6 fe80::222:48ff:fe07:ccad/64 scope link
> >        valid_lft forever preferred_lft forever
> > 3: eth1: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc mq state UP group default qlen 1000
> >     link/ether 00:22:48:07:c9:6c brd ff:ff:ff:ff:ff:ff
> >     inet 10.24.12.9/24 brd 10.24.12.255 scope global eth1
> >        valid_lft forever preferred_lft forever
> >     inet6 fe80::222:48ff:fe07:c96c/64 scope link
> >        valid_lft forever preferred_lft forever
> > 4: mgmt-vrf: <NOARP,MASTER,UP,LOWER_UP> mtu 65536 qdisc noqueue state UP group default qlen 1000
> >     link/ether 8a:f6:26:65:02:5a brd ff:ff:ff:ff:ff:ff
> > 
> > 
> > 
> > the production traffic is all in the 10.0.0.0/8 network (eth1 global 
> > VRF) except for a few subnets (DNS) which are routed out eth0 
> > (mgmt-vrf)
> > 
> > 
> > Admin@NETM06:~$ ip route show
> > default via 10.24.12.1 dev eth0
> > 10.0.0.0/8 via 10.24.12.1 dev eth1
> > 10.24.12.0/24 dev eth1 proto kernel scope link src 10.24.12.9
> > 10.24.65.0/24 via 10.24.12.1 dev eth0
> > 10.25.65.0/24 via 10.24.12.1 dev eth0
> > 10.26.0.0/21 via 10.24.12.1 dev eth0
> > 10.26.64.0/21 via 10.24.12.1 dev eth0
> > 
> > 
> > Admin@NETM06:~$ ip route show vrf mgmt-vrf default via 10.24.12.1 
> > dev eth0 unreachable default metric 4278198272
> > 10.24.12.0/24 dev eth0 proto kernel scope link src 10.24.12.10
> > 10.24.65.0/24 via 10.24.12.1 dev eth0
> > 10.25.65.0/24 via 10.24.12.1 dev eth0
> > 10.26.0.0/21 via 10.24.12.1 dev eth0
> > 10.26.64.0/21 via 10.24.12.1 dev eth0
> > 
> > 
> > 
> > The strange activity occurs when I enter the command “sudo apt update” as I can resolve the DNS request (10.24.65.203 or 10.24.64.203, verified with tcpdump) out eth0 but for the actual update traffic there is no activity:
> > 
> > 
> > sudo tcpdump -i eth0 '(host 10.24.65.203 or host 10.25.65.203) and 
> > port 53' -n <OUTPUT OMITTED FOR BREVITY>
> > 10:06:05.268735 IP 10.24.12.10.39963 > 10.24.65.203.53: 48798+ [1au] 
> > A? security.ubuntu.com. (48) <OUTPUT OMITTED FOR BREVITY>
> > 10:06:05.284403 IP 10.24.65.203.53 > 10.24.12.10.39963: 48798 13/0/1 
> > A 91.189.91.23, A 91.189.88.24, A 91.189.91.26, A 91.189.88.162, A 
> > 91.189.88.149, A 91.189.91.24, A 91.189.88.173, A 91.189.88.177, A 
> > 91.189.88.31, A 91.189.91.14, A 91.189.88.176, A 91.189.88.175, A 
> > 91.189.88.174 (256)
> > 
> > 
> > 
> > You can see that the update traffic is returned but is not accepted 
> > by the stack and a RST is sent
> > 
> > 
> > Admin@NETM06:~$ sudo tcpdump -i eth0 '(not host 168.63.129.16 and 
> > port 80)' -n
> > tcpdump: verbose output suppressed, use -v or -vv for full protocol 
> > decode listening on eth0, link-type EN10MB (Ethernet), capture size 
> > 262144 bytes
> > 10:17:12.690658 IP 10.24.12.10.40216 > 91.189.88.175.80: Flags [S], 
> > seq 2279624826, win 64240, options [mss 1460,sackOK,TS val 
> > 2029365856 ecr 0,nop,wscale 7], length 0
> > 10:17:12.691929 IP 10.24.12.10.52362 > 91.189.95.83.80: Flags [S], seq 1465797256, win 64240, options [mss 1460,sackOK,TS val 3833463674 ecr 0,nop,wscale 7], length 0
> > 10:17:12.696270 IP 91.189.88.175.80 > 10.24.12.10.40216: Flags [S.], seq 968450722, ack 2279624827, win 28960, options [mss 1418,sackOK,TS val 81957103 ecr 2029365856,nop,wscale 7], length 0                                                                                                                            
> > 10:17:12.696301 IP 10.24.12.10.40216 > 91.189.88.175.80: Flags [R], seq 2279624827, win 0, length 0
> > 10:17:12.697884 IP 91.189.95.83.80 > 10.24.12.10.52362: Flags [S.], seq 4148330738, ack 1465797257, win 28960, options [mss 1418,sackOK,TS val 2257624414 ecr 3833463674,nop,wscale 8], length 0                                                                                                                         
> > 10:17:12.697909 IP 10.24.12.10.52362 > 91.189.95.83.80: Flags [R], 
> > seq 1465797257, win 0, length 0
> > 
> > 
> > 
> > 
> > I can emulate the DNS lookup using netcat in the vrf:
> > 
> > 
> > sudo ip vrf exec mgmt-vrf nc -u 10.24.65.203 53
> > 
> > 
> > then interactively enter the binary for a www.google.co.uk request:
> > 
> > 
> > 0035624be394010000010000000000010377777706676f6f676c6502636f02756b00
> > 000100010000290200000000000000
> > 
> > 
> > This returns as expected:
> > 
> > 
> > 00624be394010000010000000000010377777706676f6f676c6502636f02756b0000
> > 0100010000290200000000000000
> > 
> > 
> > I can run:
> > 
> > 
> > Admin@NETM06:~$ host www.google.co.uk www.google.co.uk has address 
> > 172.217.169.3 www.google.co.uk has IPv6 address 
> > 2a00:1450:4009:80d::2003
> > 
> > 
> > but I get a timeout for:
> > 
> > 
> > sudo ip vrf  exec mgmt-vrf host www.google.co.uk ;; connection timed 
> > out; no servers could be reached
> > 
> > 
> > 
> > However I can take a repo address and vrf exec to it on port 80:
> > 
> > 
> > Admin@NETM06:~$ sudo ip vrf  exec mgmt-vrf nc 91.189.91.23 80 hello
> > HTTP/1.1 400 Bad Request
> > <OUTPUT OMITTED>
> > 
> > My iptables rule:
> > 
> > 
> > sudo iptables -Z
> > Admin@NETM06:~$ sudo iptables -L -v
> > Chain INPUT (policy DROP 16 packets, 3592 bytes)
> > pkts bytes target     prot opt in     out     source               destination
> >    44  2360 ACCEPT     tcp  --  any    any     anywhere             anywhere             tcp spt:http ctstate RELATED,ESTABLISHED
> >    83 10243 ACCEPT     udp  --  any    any     anywhere             anywhere             udp spt:domain ctstate RELATED,ESTABLISHED
> > 
> > 
> > 
> > I cannot find out why the update isn’t working. Any help greatly 
> > appreciated
> > 
> > 
> > Kind Regards,
> > 
> > 
> > Gareth


^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox