Netdev List

Netdev List
 help / color / mirror / Atom feed

* [PATCH rdma-next 3/3] IB/mlx5: Add page fault handler for DC initiator WQE
From: Leon Romanovsky @ 2019-08-01 12:21 UTC (permalink / raw)
  To: Doug Ledford, Jason Gunthorpe
  Cc: Leon Romanovsky, RDMA mailing list, Michael Guralnik, Moni Shoua,
	Saeed Mahameed, linux-netdev
In-Reply-To: <20190801122139.25224-1-leon@kernel.org>

From: Michael Guralnik <michaelgur@mellanox.com>

Parsing DC initiator WQEs upon page fault requires skipping an address
vector segment, as in UD WQEs.

Signed-off-by: Michael Guralnik <michaelgur@mellanox.com>
Reviewed-by: Moni Shoua <monis@mellanox.com>
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
---
 drivers/infiniband/hw/mlx5/odp.c | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c
index 5e87a5e25574..6f1de5edbe8e 100644
--- a/drivers/infiniband/hw/mlx5/odp.c
+++ b/drivers/infiniband/hw/mlx5/odp.c
@@ -1065,6 +1065,12 @@ static int mlx5_ib_mr_initiator_pfault_handler(
 	case IB_QPT_UD:
 		transport_caps = dev->odp_caps.per_transport_caps.ud_odp_caps;
 		break;
+	case IB_QPT_DRIVER:
+		if (qp->qp_sub_type == MLX5_IB_QPT_DCI) {
+			transport_caps = dev->dc_odp_caps;
+			break;
+		}
+		/* fall through */
 	default:
 		mlx5_ib_err(dev, "ODP fault on QP of an unsupported transport 0x%x\n",
 			    qp->ibqp.qp_type);
@@ -1078,7 +1084,8 @@ static int mlx5_ib_mr_initiator_pfault_handler(
 		return -EFAULT;
 	}
 
-	if (qp->ibqp.qp_type == IB_QPT_UD) {
+	if (qp->ibqp.qp_type == IB_QPT_UD ||
+	    qp->qp_sub_type == MLX5_IB_QPT_DCI) {
 		av = *wqe;
 		if (av->dqp_dct & cpu_to_be32(MLX5_EXTENDED_UD_AV))
 			*wqe += sizeof(struct mlx5_av);
-- 
2.20.1


^ permalink raw reply related

* [PATCH net-next] mvpp2: use devm_platform_ioremap_resource() to simplify code
From: YueHaibing @ 2019-08-01 12:22 UTC (permalink / raw)
  To: davem, antoine.tenart, maxime.chevallier; +Cc: linux-kernel, netdev, YueHaibing

Use devm_platform_ioremap_resource() to simplify the code a bit.
This is detected by coccinelle.

Reported-by: Hulk Robot <hulkci@huawei.com>
Signed-off-by: YueHaibing <yuehaibing@huawei.com>
---
 drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c | 10 +++-------
 1 file changed, 3 insertions(+), 7 deletions(-)

diff --git a/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c b/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c
index 937e4b9..e9d8ffe 100644
--- a/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c
+++ b/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c
@@ -5014,7 +5014,6 @@ static int mvpp2_port_probe(struct platform_device *pdev,
 	struct device_node *port_node = to_of_node(port_fwnode);
 	netdev_features_t features;
 	struct net_device *dev;
-	struct resource *res;
 	struct phylink *phylink;
 	char *mac_from = "";
 	unsigned int ntxqs, nrxqs, thread;
@@ -5118,8 +5117,7 @@ static int mvpp2_port_probe(struct platform_device *pdev,
 	port->comphy = comphy;
 
 	if (priv->hw_version == MVPP21) {
-		res = platform_get_resource(pdev, IORESOURCE_MEM, 2 + id);
-		port->base = devm_ioremap_resource(&pdev->dev, res);
+		port->base = devm_platform_ioremap_resource(pdev, 2 + id);
 		if (IS_ERR(port->base)) {
 			err = PTR_ERR(port->base);
 			goto err_free_irq;
@@ -5551,14 +5549,12 @@ static int mvpp2_probe(struct platform_device *pdev)
 	if (priv->hw_version == MVPP21)
 		queue_mode = MVPP2_QDIST_SINGLE_MODE;
 
-	res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
-	base = devm_ioremap_resource(&pdev->dev, res);
+	base = devm_platform_ioremap_resource(pdev, 0);
 	if (IS_ERR(base))
 		return PTR_ERR(base);
 
 	if (priv->hw_version == MVPP21) {
-		res = platform_get_resource(pdev, IORESOURCE_MEM, 1);
-		priv->lms_base = devm_ioremap_resource(&pdev->dev, res);
+		priv->lms_base = devm_platform_ioremap_resource(pdev, 1);
 		if (IS_ERR(priv->lms_base))
 			return PTR_ERR(priv->lms_base);
 	} else {
-- 
2.7.4



^ permalink raw reply related

* [PATCH net-next] net: dsa: lantiq: use devm_platform_ioremap_resource() to simplify code
From: YueHaibing @ 2019-08-01 12:25 UTC (permalink / raw)
  To: davem, hauke, andrew, vivien.didelot, f.fainelli
  Cc: linux-kernel, netdev, YueHaibing

Use devm_platform_ioremap_resource() to simplify the code a bit.
This is detected by coccinelle.

Reported-by: Hulk Robot <hulkci@huawei.com>
Signed-off-by: YueHaibing <yuehaibing@huawei.com>
---
 drivers/net/dsa/lantiq_gswip.c | 10 +++-------
 1 file changed, 3 insertions(+), 7 deletions(-)

diff --git a/drivers/net/dsa/lantiq_gswip.c b/drivers/net/dsa/lantiq_gswip.c
index 4e64835..2175ec1 100644
--- a/drivers/net/dsa/lantiq_gswip.c
+++ b/drivers/net/dsa/lantiq_gswip.c
@@ -1822,7 +1822,6 @@ static int gswip_gphy_fw_list(struct gswip_priv *priv,
 static int gswip_probe(struct platform_device *pdev)
 {
 	struct gswip_priv *priv;
-	struct resource *gswip_res, *mdio_res, *mii_res;
 	struct device_node *mdio_np, *gphy_fw_np;
 	struct device *dev = &pdev->dev;
 	int err;
@@ -1833,18 +1832,15 @@ static int gswip_probe(struct platform_device *pdev)
 	if (!priv)
 		return -ENOMEM;
 
-	gswip_res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
-	priv->gswip = devm_ioremap_resource(dev, gswip_res);
+	priv->gswip = devm_platform_ioremap_resource(pdev, 0);
 	if (IS_ERR(priv->gswip))
 		return PTR_ERR(priv->gswip);
 
-	mdio_res = platform_get_resource(pdev, IORESOURCE_MEM, 1);
-	priv->mdio = devm_ioremap_resource(dev, mdio_res);
+	priv->mdio = devm_platform_ioremap_resource(pdev, 1);
 	if (IS_ERR(priv->mdio))
 		return PTR_ERR(priv->mdio);
 
-	mii_res = platform_get_resource(pdev, IORESOURCE_MEM, 2);
-	priv->mii = devm_ioremap_resource(dev, mii_res);
+	priv->mii = devm_platform_ioremap_resource(pdev, 2);
 	if (IS_ERR(priv->mii))
 		return PTR_ERR(priv->mii);
 
-- 
2.7.4



^ permalink raw reply related

* [PATCH net-next] net: dsa: b53: use devm_platform_ioremap_resource() to simplify code
From: YueHaibing @ 2019-08-01 12:27 UTC (permalink / raw)
  To: davem, f.fainelli, andrew, vivien.didelot
  Cc: linux-kernel, netdev, YueHaibing

Use devm_platform_ioremap_resource() to simplify the code a bit.
This is detected by coccinelle.

Reported-by: Hulk Robot <hulkci@huawei.com>
Signed-off-by: YueHaibing <yuehaibing@huawei.com>
---
 drivers/net/dsa/b53/b53_srab.c | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/drivers/net/dsa/b53/b53_srab.c b/drivers/net/dsa/b53/b53_srab.c
index d9c56a7..0a1be52 100644
--- a/drivers/net/dsa/b53/b53_srab.c
+++ b/drivers/net/dsa/b53/b53_srab.c
@@ -536,7 +536,6 @@ static void b53_srab_mux_init(struct platform_device *pdev)
 	struct b53_device *dev = platform_get_drvdata(pdev);
 	struct b53_srab_priv *priv = dev->priv;
 	struct b53_srab_port_priv *p;
-	struct resource *r;
 	unsigned int port;
 	u32 reg, off = 0;
 	int ret;
@@ -544,8 +543,7 @@ static void b53_srab_mux_init(struct platform_device *pdev)
 	if (dev->pdata && dev->pdata->chip_id != BCM58XX_DEVICE_ID)
 		return;
 
-	r = platform_get_resource(pdev, IORESOURCE_MEM, 1);
-	priv->mux_config = devm_ioremap_resource(&pdev->dev, r);
+	priv->mux_config = devm_platform_ioremap_resource(pdev, 1);
 	if (IS_ERR(priv->mux_config))
 		return;
 
@@ -593,7 +591,6 @@ static int b53_srab_probe(struct platform_device *pdev)
 	const struct of_device_id *of_id = NULL;
 	struct b53_srab_priv *priv;
 	struct b53_device *dev;
-	struct resource *r;
 
 	if (dn)
 		of_id = of_match_node(b53_srab_of_match, dn);
@@ -610,8 +607,7 @@ static int b53_srab_probe(struct platform_device *pdev)
 	if (!priv)
 		return -ENOMEM;
 
-	r = platform_get_resource(pdev, IORESOURCE_MEM, 0);
-	priv->regs = devm_ioremap_resource(&pdev->dev, r);
+	priv->regs = devm_platform_ioremap_resource(pdev, 0);
 	if (IS_ERR(priv->regs))
 		return -ENOMEM;
 
-- 
2.7.4



^ permalink raw reply related

* [PATCH net-next] net: dsa: bcm_sf2: use devm_platform_ioremap_resource() to simplify code
From: YueHaibing @ 2019-08-01 12:29 UTC (permalink / raw)
  To: davem, andrew, vivien.didelot, f.fainelli
  Cc: linux-kernel, netdev, YueHaibing

Use devm_platform_ioremap_resource() to simplify the code a bit.
This is detected by coccinelle.

Reported-by: Hulk Robot <hulkci@huawei.com>
Signed-off-by: YueHaibing <yuehaibing@huawei.com>
---
 drivers/net/dsa/bcm_sf2.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/drivers/net/dsa/bcm_sf2.c b/drivers/net/dsa/bcm_sf2.c
index 3811fdb..49f9943 100644
--- a/drivers/net/dsa/bcm_sf2.c
+++ b/drivers/net/dsa/bcm_sf2.c
@@ -1041,7 +1041,6 @@ static int bcm_sf2_sw_probe(struct platform_device *pdev)
 	struct b53_device *dev;
 	struct dsa_switch *ds;
 	void __iomem **base;
-	struct resource *r;
 	unsigned int i;
 	u32 reg, rev;
 	int ret;
@@ -1107,8 +1106,7 @@ static int bcm_sf2_sw_probe(struct platform_device *pdev)
 
 	base = &priv->core;
 	for (i = 0; i < BCM_SF2_REGS_NUM; i++) {
-		r = platform_get_resource(pdev, IORESOURCE_MEM, i);
-		*base = devm_ioremap_resource(&pdev->dev, r);
+		*base = devm_platform_ioremap_resource(pdev, i);
 		if (IS_ERR(*base)) {
 			pr_err("unable to find register: %s\n", reg_names[i]);
 			return PTR_ERR(*base);
-- 
2.7.4



^ permalink raw reply related

* [PATCH net-next] net: mediatek: use devm_platform_ioremap_resource() to simplify code
From: YueHaibing @ 2019-08-01 12:33 UTC (permalink / raw)
  To: davem, nbd, john, sean.wang, nelson.chang, matthias.bgg
  Cc: linux-kernel, netdev, linux-mediatek, linux-arm-kernel,
	YueHaibing

Use devm_platform_ioremap_resource() to simplify the code a bit.
This is detected by coccinelle.

Reported-by: Hulk Robot <hulkci@huawei.com>
Signed-off-by: YueHaibing <yuehaibing@huawei.com>
---
 drivers/net/ethernet/mediatek/mtk_eth_soc.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/mediatek/mtk_eth_soc.c b/drivers/net/ethernet/mediatek/mtk_eth_soc.c
index e529d86..ddbffeb 100644
--- a/drivers/net/ethernet/mediatek/mtk_eth_soc.c
+++ b/drivers/net/ethernet/mediatek/mtk_eth_soc.c
@@ -2447,7 +2447,6 @@ static int mtk_add_mac(struct mtk_eth *eth, struct device_node *np)
 
 static int mtk_probe(struct platform_device *pdev)
 {
-	struct resource *res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
 	struct device_node *mac_np;
 	struct mtk_eth *eth;
 	int err;
@@ -2460,7 +2459,7 @@ static int mtk_probe(struct platform_device *pdev)
 	eth->soc = of_device_get_match_data(&pdev->dev);
 
 	eth->dev = &pdev->dev;
-	eth->base = devm_ioremap_resource(&pdev->dev, res);
+	eth->base = devm_platform_ioremap_resource(pdev, 0);
 	if (IS_ERR(eth->base))
 		return PTR_ERR(eth->base);
 
-- 
2.7.4



^ permalink raw reply related

* Re: [PATCH rdma-next 2/3] IB/mlx5: Expose ODP for DC capabilities to user
From: Gal Pressman @ 2019-08-01 12:34 UTC (permalink / raw)
  To: Leon Romanovsky, Doug Ledford, Jason Gunthorpe
  Cc: Leon Romanovsky, RDMA mailing list, Michael Guralnik, Moni Shoua,
	Saeed Mahameed, linux-netdev
In-Reply-To: <20190801122139.25224-3-leon@kernel.org>

On 01/08/2019 15:21, Leon Romanovsky wrote:
>  enum mlx5_user_cmds_supp_uhw {
> @@ -147,6 +148,7 @@ struct mlx5_ib_alloc_ucontext_resp {
>  	__u32	num_uars_per_page;
>  	__u32	num_dyn_bfregs;
>  	__u32	dump_fill_mkey;
> +	__u32	dc_odp_caps;

This should be padded to 64 bits.

>  };
>  
>  struct mlx5_ib_alloc_pd_resp {
> 

^ permalink raw reply

* [PATCH net-next] net: qcom/emac: use devm_platform_ioremap_resource() to simplify code
From: YueHaibing @ 2019-08-01 12:34 UTC (permalink / raw)
  To: davem, timur; +Cc: linux-kernel, netdev, YueHaibing

Use devm_platform_ioremap_resource() to simplify the code a bit.
This is detected by coccinelle.

Reported-by: Hulk Robot <hulkci@huawei.com>
Signed-off-by: YueHaibing <yuehaibing@huawei.com>
---
 drivers/net/ethernet/qualcomm/emac/emac.c | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/drivers/net/ethernet/qualcomm/emac/emac.c b/drivers/net/ethernet/qualcomm/emac/emac.c
index bfe1046..c84ab05 100644
--- a/drivers/net/ethernet/qualcomm/emac/emac.c
+++ b/drivers/net/ethernet/qualcomm/emac/emac.c
@@ -544,7 +544,6 @@ static int emac_probe_resources(struct platform_device *pdev,
 				struct emac_adapter *adpt)
 {
 	struct net_device *netdev = adpt->netdev;
-	struct resource *res;
 	char maddr[ETH_ALEN];
 	int ret = 0;
 
@@ -561,14 +560,12 @@ static int emac_probe_resources(struct platform_device *pdev,
 	adpt->irq.irq = ret;
 
 	/* base register address */
-	res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
-	adpt->base = devm_ioremap_resource(&pdev->dev, res);
+	adpt->base = devm_platform_ioremap_resource(pdev, 0);
 	if (IS_ERR(adpt->base))
 		return PTR_ERR(adpt->base);
 
 	/* CSR register address */
-	res = platform_get_resource(pdev, IORESOURCE_MEM, 1);
-	adpt->csr = devm_ioremap_resource(&pdev->dev, res);
+	adpt->csr = devm_platform_ioremap_resource(pdev, 1);
 	if (IS_ERR(adpt->csr))
 		return PTR_ERR(adpt->csr);
 
-- 
2.7.4



^ permalink raw reply related

* [PATCH net-next] bcm63xx_enet: use devm_platform_ioremap_resource() to simplify code
From: YueHaibing @ 2019-08-01 12:39 UTC (permalink / raw)
  To: davem, f.fainelli, bcm-kernel-feedback-list, andrew,
	linux-arm-kernel
  Cc: linux-kernel, netdev, YueHaibing

Use devm_platform_ioremap_resource() to simplify the code a bit.
This is detected by coccinelle.

Reported-by: Hulk Robot <hulkci@huawei.com>
Signed-off-by: YueHaibing <yuehaibing@huawei.com>
---
 drivers/net/ethernet/broadcom/bcm63xx_enet.c | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/drivers/net/ethernet/broadcom/bcm63xx_enet.c b/drivers/net/ethernet/broadcom/bcm63xx_enet.c
index 291e4af..620cd3f 100644
--- a/drivers/net/ethernet/broadcom/bcm63xx_enet.c
+++ b/drivers/net/ethernet/broadcom/bcm63xx_enet.c
@@ -1693,7 +1693,7 @@ static int bcm_enet_probe(struct platform_device *pdev)
 	struct bcm_enet_priv *priv;
 	struct net_device *dev;
 	struct bcm63xx_enet_platform_data *pd;
-	struct resource *res_mem, *res_irq, *res_irq_rx, *res_irq_tx;
+	struct resource *res_irq, *res_irq_rx, *res_irq_tx;
 	struct mii_bus *bus;
 	int i, ret;
 
@@ -1719,8 +1719,7 @@ static int bcm_enet_probe(struct platform_device *pdev)
 	if (ret)
 		goto out;
 
-	res_mem = platform_get_resource(pdev, IORESOURCE_MEM, 0);
-	priv->base = devm_ioremap_resource(&pdev->dev, res_mem);
+	priv->base = devm_platform_ioremap_resource(pdev, 0);
 	if (IS_ERR(priv->base)) {
 		ret = PTR_ERR(priv->base);
 		goto out;
@@ -2762,15 +2761,13 @@ struct platform_driver bcm63xx_enetsw_driver = {
 /* reserve & remap memory space shared between all macs */
 static int bcm_enet_shared_probe(struct platform_device *pdev)
 {
-	struct resource *res;
 	void __iomem *p[3];
 	unsigned int i;
 
 	memset(bcm_enet_shared_base, 0, sizeof(bcm_enet_shared_base));
 
 	for (i = 0; i < 3; i++) {
-		res = platform_get_resource(pdev, IORESOURCE_MEM, i);
-		p[i] = devm_ioremap_resource(&pdev->dev, res);
+		p[i] = devm_platform_ioremap_resource(pdev, i);
 		if (IS_ERR(p[i]))
 			return PTR_ERR(p[i]);
 	}
-- 
2.7.4



^ permalink raw reply related

* [PATCH net-next] net: phy: xgene: use devm_platform_ioremap_resource() to simplify code
From: YueHaibing @ 2019-08-01 12:46 UTC (permalink / raw)
  To: davem, iyappan, keyur, quan, andrew, f.fainelli, hkallweit1
  Cc: linux-kernel, netdev, YueHaibing

Use devm_platform_ioremap_resource() to simplify the code a bit.
This is detected by coccinelle.

Reported-by: Hulk Robot <hulkci@huawei.com>
Signed-off-by: YueHaibing <yuehaibing@huawei.com>
---
 drivers/net/phy/mdio-xgene.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/drivers/net/phy/mdio-xgene.c b/drivers/net/phy/mdio-xgene.c
index 717cc2a..34990ea 100644
--- a/drivers/net/phy/mdio-xgene.c
+++ b/drivers/net/phy/mdio-xgene.c
@@ -328,7 +328,6 @@ static int xgene_mdio_probe(struct platform_device *pdev)
 	struct device *dev = &pdev->dev;
 	struct mii_bus *mdio_bus;
 	const struct of_device_id *of_id;
-	struct resource *res;
 	struct xgene_mdio_pdata *pdata;
 	void __iomem *csr_base;
 	int mdio_id = 0, ret = 0;
@@ -355,8 +354,7 @@ static int xgene_mdio_probe(struct platform_device *pdev)
 	pdata->mdio_id = mdio_id;
 	pdata->dev = dev;
 
-	res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
-	csr_base = devm_ioremap_resource(dev, res);
+	csr_base = devm_platform_ioremap_resource(pdev, 0);
 	if (IS_ERR(csr_base))
 		return PTR_ERR(csr_base);
 	pdata->mac_csr_addr = csr_base;
-- 
2.7.4



^ permalink raw reply related

* Re: [PATCH net v2] mvpp2: fix panic on module removal
From: Antoine Tenart @ 2019-08-01 12:48 UTC (permalink / raw)
  To: Matteo Croce
  Cc: netdev, Miquel Raynal, linux-kernel, Lorenzo Bianconi,
	Antoine Tenart, Maxime Chevallier, David S. Miller, Marcin Wojtas,
	Stefan Chulski
In-Reply-To: <20190801121330.30823-1-mcroce@redhat.com>

Hello Matteo,

On Thu, Aug 01, 2019 at 02:13:30PM +0200, Matteo Croce wrote:
> mvpp2 uses a delayed workqueue to gather traffic statistics.
> On module removal the workqueue can be destroyed before calling
> cancel_delayed_work_sync() on its works.
> Fix it by moving the destroy_workqueue() call after mvpp2_port_remove().
> Also remove an unneeded call to flush_workqueue()
> 
>     # rmmod mvpp2
>     [ 2743.311722] mvpp2 f4000000.ethernet eth1: phy link down 10gbase-kr/10Gbps/Full
>     [ 2743.320063] mvpp2 f4000000.ethernet eth1: Link is Down
>     [ 2743.572263] mvpp2 f4000000.ethernet eth2: phy link down sgmii/1Gbps/Full
>     [ 2743.580076] mvpp2 f4000000.ethernet eth2: Link is Down
>     [ 2744.102169] mvpp2 f2000000.ethernet eth0: phy link down 10gbase-kr/10Gbps/Full
>     [ 2744.110441] mvpp2 f2000000.ethernet eth0: Link is Down
>     [ 2744.115614] Unable to handle kernel NULL pointer dereference at virtual address 0000000000000000
>     [ 2744.115615] Mem abort info:
>     [ 2744.115616]   ESR = 0x96000005
>     [ 2744.115617]   Exception class = DABT (current EL), IL = 32 bits
>     [ 2744.115618]   SET = 0, FnV = 0
>     [ 2744.115619]   EA = 0, S1PTW = 0
>     [ 2744.115620] Data abort info:
>     [ 2744.115621]   ISV = 0, ISS = 0x00000005
>     [ 2744.115622]   CM = 0, WnR = 0
>     [ 2744.115624] user pgtable: 4k pages, 39-bit VAs, pgdp=0000000422681000
>     [ 2744.115626] [0000000000000000] pgd=0000000000000000, pud=0000000000000000
>     [ 2744.115630] Internal error: Oops: 96000005 [#1] SMP
>     [ 2744.115632] Modules linked in: mvpp2(-) algif_hash af_alg nls_iso8859_1 nls_cp437 vfat fat xhci_plat_hcd m25p80 spi_nor xhci_hcd mtd usbcore i2c_mv64xxx sfp usb_common marvell10g phy_generic spi_orion mdio_i2c i2c_core mvmdio phylink sbsa_gwdt ip_tables x_tables autofs4 [last unloaded: mvpp2]
>     [ 2744.115654] CPU: 3 PID: 8357 Comm: kworker/3:2 Not tainted 5.3.0-rc2 #1
>     [ 2744.115655] Hardware name: Marvell 8040 MACCHIATOBin Double-shot (DT)
>     [ 2744.115665] Workqueue: events_power_efficient phylink_resolve [phylink]
>     [ 2744.115669] pstate: a0000085 (NzCv daIf -PAN -UAO)
>     [ 2744.115675] pc : __queue_work+0x9c/0x4d8
>     [ 2744.115677] lr : __queue_work+0x170/0x4d8
>     [ 2744.115678] sp : ffffff801001bd50
>     [ 2744.115680] x29: ffffff801001bd50 x28: ffffffc422597600
>     [ 2744.115684] x27: ffffff80109ae6f0 x26: ffffff80108e4018
>     [ 2744.115688] x25: 0000000000000003 x24: 0000000000000004
>     [ 2744.115691] x23: ffffff80109ae6e0 x22: 0000000000000017
>     [ 2744.115694] x21: ffffffc42c030000 x20: ffffffc42209e8f8
>     [ 2744.115697] x19: 0000000000000000 x18: 0000000000000000
>     [ 2744.115699] x17: 0000000000000000 x16: 0000000000000000
>     [ 2744.115701] x15: 0000000000000010 x14: ffffffffffffffff
>     [ 2744.115702] x13: ffffff8090e2b95f x12: ffffff8010e2b967
>     [ 2744.115704] x11: ffffff8010906000 x10: 0000000000000040
>     [ 2744.115706] x9 : ffffff80109223b8 x8 : ffffff80109223b0
>     [ 2744.115707] x7 : ffffffc42bc00068 x6 : 0000000000000000
>     [ 2744.115709] x5 : ffffffc42bc00000 x4 : 0000000000000000
>     [ 2744.115710] x3 : 0000000000000000 x2 : 0000000000000000
>     [ 2744.115712] x1 : 0000000000000008 x0 : ffffffc42c030000
>     [ 2744.115714] Call trace:
>     [ 2744.115716]  __queue_work+0x9c/0x4d8
>     [ 2744.115718]  delayed_work_timer_fn+0x28/0x38
>     [ 2744.115722]  call_timer_fn+0x3c/0x180
>     [ 2744.115723]  expire_timers+0x60/0x168
>     [ 2744.115724]  run_timer_softirq+0xbc/0x1e8
>     [ 2744.115727]  __do_softirq+0x128/0x320
>     [ 2744.115731]  irq_exit+0xa4/0xc0
>     [ 2744.115734]  __handle_domain_irq+0x70/0xc0
>     [ 2744.115735]  gic_handle_irq+0x58/0xa8
>     [ 2744.115737]  el1_irq+0xb8/0x140
>     [ 2744.115738]  console_unlock+0x3a0/0x568
>     [ 2744.115740]  vprintk_emit+0x200/0x2a0
>     [ 2744.115744]  dev_vprintk_emit+0x1c8/0x1e4
>     [ 2744.115747]  dev_printk_emit+0x6c/0x7c
>     [ 2744.115751]  __netdev_printk+0x104/0x1d8
>     [ 2744.115752]  netdev_printk+0x60/0x70
>     [ 2744.115756]  phylink_resolve+0x38c/0x3c8 [phylink]
>     [ 2744.115758]  process_one_work+0x1f8/0x448
>     [ 2744.115760]  worker_thread+0x54/0x500
>     [ 2744.115762]  kthread+0x12c/0x130
>     [ 2744.115764]  ret_from_fork+0x10/0x1c
>     [ 2744.115768] Code: aa1403e0 97fffbbe aa0003f5 b4000700 (f9400261)
> 
> Fixes: 118d6298f6f0 ("net: mvpp2: add ethtool GOP statistics")
> Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
> Signed-off-by: Matteo Croce <mcroce@redhat.com>

Acked-by: Antoine Tenart <antoine.tenart@bootlin.com>

Thanks!
Antoine

> ---
>  drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c | 5 ++---
>  1 file changed, 2 insertions(+), 3 deletions(-)
> 
> diff --git a/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c b/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c
> index c51f1d5b550b..ad42cc0a2b4a 100644
> --- a/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c
> +++ b/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c
> @@ -5759,9 +5759,6 @@ static int mvpp2_remove(struct platform_device *pdev)
>  
>  	mvpp2_dbgfs_cleanup(priv);
>  
> -	flush_workqueue(priv->stats_queue);
> -	destroy_workqueue(priv->stats_queue);
> -
>  	fwnode_for_each_available_child_node(fwnode, port_fwnode) {
>  		if (priv->port_list[i]) {
>  			mutex_destroy(&priv->port_list[i]->gather_stats_lock);
> @@ -5770,6 +5767,8 @@ static int mvpp2_remove(struct platform_device *pdev)
>  		i++;
>  	}
>  
> +	destroy_workqueue(priv->stats_queue);
> +
>  	for (i = 0; i < MVPP2_BM_POOLS_NUM; i++) {
>  		struct mvpp2_bm_pool *bm_pool = &priv->bm_pools[i];
>  
> -- 
> 2.21.0
> 

-- 
Antoine Ténart, Bootlin
Embedded Linux and Kernel engineering
https://bootlin.com

^ permalink raw reply

* Re: [PATCH rdma-next 2/3] IB/mlx5: Expose ODP for DC capabilities to user
From: Leon Romanovsky @ 2019-08-01 12:50 UTC (permalink / raw)
  To: Gal Pressman
  Cc: Doug Ledford, Jason Gunthorpe, RDMA mailing list,
	Michael Guralnik, Moni Shoua, Saeed Mahameed, linux-netdev
In-Reply-To: <12331110-ecfd-fb2f-460a-be41be13b2d3@amazon.com>

On Thu, Aug 01, 2019 at 03:34:24PM +0300, Gal Pressman wrote:
> On 01/08/2019 15:21, Leon Romanovsky wrote:
> >  enum mlx5_user_cmds_supp_uhw {
> > @@ -147,6 +148,7 @@ struct mlx5_ib_alloc_ucontext_resp {
> >  	__u32	num_uars_per_page;
> >  	__u32	num_dyn_bfregs;
> >  	__u32	dump_fill_mkey;
> > +	__u32	dc_odp_caps;
>
> This should be padded to 64 bits.

Thanks a lot, I fixed it locally.

>
> >  };
> >
> >  struct mlx5_ib_alloc_pd_resp {
> >

^ permalink raw reply

* [net-next,rfc] net: bridge: mdb: Extend with multicast LLADDR
From: Horatiu Vultur @ 2019-08-01 12:50 UTC (permalink / raw)
  To: nikolay, idosch, andrew, allan.nielsen
  Cc: davem, roopa, petrm, tglx, fw, netdev, linux-kernel, bridge,
	Horatiu Vultur

Based on the discussion on the topic[1], we extend the functionality of
the 'bridge mdb' command to accept link-layer multicast addresses. This
required only a few changes, it fits nicely with the current
implementation, and the old implementation was not changed.

In this patch, we have added a MAC address to the union in 'struct br_ip'.
If we want to continue like this we should properly rename the structure as
it is not an IP any more.

To create a group for two of the front ports the following entries can
be added:
bridge mdb add dev br0 port eth0 grp 01:00:00:00:00:04 permanent vid 1
bridge mdb add dev br0 port eth1 grp 01:00:00:00:00:04 permanent vid 1

Now the entries will be displayed as follows:
dev br0 port eth0 grp 01:00:00:00:00:04 permanent offload vid 1
dev br0 port eth1 grp 01:00:00:00:00:04 permanent offload vid 1

This requires changes to iproute2 as well; see the following patch for that.

Now suppose a frame with dmac '01:00:00:00:00:04' arrives at one of the front
ports. If we have HW offload support, then the frame will be forwarded by
the switch, and need not go to the CPU. In a pure SW world, the frame is
forwarded by the SW bridge, which will flood it only to the ports which are
part of the group.

So far so good. This is an important part of the problem we wanted to solve.

But there is one drawback to this approach. If you want to add two of the
front ports and br0 to receive the frame, then I can't see a way of doing it
with the bridge mdb command. Doing that requires many more changes to
the existing code.

Example:
bridge mdb add dev br0 port eth0 grp 01:00:00:00:00:04 permanent vid 1
bridge mdb add dev br0 port eth1 grp 01:00:00:00:00:04 permanent vid 1
bridge mdb add dev br0 port br0 grp 01:00:00:00:00:04 permanent vid 1 // This looks wrong.

We believe we have come a long way by re-using the facilities in MDB (thanks
for convincing us to do this), but we are still not completely happy with
the result.

If I only look at the user-interface (iproute2), and completely ignore all
the implementation details, then I still think that the FDB sub command is
more suitable for this. Today, FDB is for unicast, and MDB is for multicast.
We could change this such that MDB is for IP-multicast, and FDB is
forwarding in general (we should prevent FDB from installing IP-multicast
rules, but we suggest allowing it to install MAC-multicast rules).

The example from above would now look like this:
bridge fdb add 01:00:00:00:00:04 dev eth0 static self master
bridge fdb add 01:00:00:00:00:04 dev eth1 static self master
bridge fdb add 01:00:00:00:00:04 dev br0 static self master

It would be very similar to the "bridge vlan" command, which also allows
specifying groups with and without br0.

The next observation is on the hashing data structure. In
'net_bridge_mdb_entry' we only use/need the net_bridge_port_group/ports member
(and the MAC address, which we hacked into the br_ip struct) when we are a
L2-multicast entry. This type allows us to re-use the br_multicast_flood
function, which does a lot of the work for us.

Also, the key used to do the lookup in the FDB is already a MAC address
(no need to hack the br_ip).

Regarding the events generated by switchdev: In the current proposal this
is a SWITCHDEV_OBJ_ID_PORT_MDB, which refers to the switchdev_obj_port_mdb
type. If we change to use the SWITCHDEV_FDB_ADD_TO_BRIDGE event, then
the associated data type would be switchdev_notifier_fdb_info - which also
has the information we need.

Using the FDB database, we can still reuse the net_bridge_port_group type (and
associated functions), and other parts from the MDB call graph as well.

If this sounds appealing, then we can do a proposal based on the idea.

If the MDB patch is what we can agree on, then we will continue to polish this
and look for a solution to control the inclusion/exclusion of the br0
device (hints will be most appreciated).

[1] https://patchwork.ozlabs.org/patch/1136878/

Signed-off-by: Horatiu Vultur <horatiu.vultur@microchip.com>
Co-developed-by: Allan W. Nielsen <allan.nielsen@microchip.com>
Signed-off-by: Allan W. Nielsen <allan.nielsen@microchip.com>
---
 include/linux/if_bridge.h      |  1 +
 include/uapi/linux/if_bridge.h |  1 +
 net/bridge/br_device.c         |  7 +++++--
 net/bridge/br_forward.c        |  3 ++-
 net/bridge/br_input.c          | 13 ++++++++++--
 net/bridge/br_mdb.c            | 47 +++++++++++++++++++++++++++++++++++-------
 net/bridge/br_multicast.c      |  4 +++-
 net/bridge/br_private.h        |  3 ++-
 8 files changed, 64 insertions(+), 15 deletions(-)

diff --git a/include/linux/if_bridge.h b/include/linux/if_bridge.h
index f3fab5d..07b092a 100644
--- a/include/linux/if_bridge.h
+++ b/include/linux/if_bridge.h
@@ -16,6 +16,7 @@
 struct br_ip {
 	union {
 		__be32	ip4;
+		__u8	mac[ETH_ALEN];
 #if IS_ENABLED(CONFIG_IPV6)
 		struct in6_addr ip6;
 #endif
diff --git a/include/uapi/linux/if_bridge.h b/include/uapi/linux/if_bridge.h
index 773e476..e535a81 100644
--- a/include/uapi/linux/if_bridge.h
+++ b/include/uapi/linux/if_bridge.h
@@ -243,6 +243,7 @@ struct br_mdb_entry {
 		union {
 			__be32	ip4;
 			struct in6_addr ip6;
+			__u8	mac[ETH_ALEN];
 		} u;
 		__be16		proto;
 	} addr;
diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c
index c05def8..b2d9041 100644
--- a/net/bridge/br_device.c
+++ b/net/bridge/br_device.c
@@ -83,7 +83,7 @@ netdev_tx_t br_dev_xmit(struct sk_buff *skb, struct net_device *dev)
 		br_flood(br, skb, BR_PKT_BROADCAST, false, true);
 	} else if (is_multicast_ether_addr(dest)) {
 		if (unlikely(netpoll_tx_running(dev))) {
-			br_flood(br, skb, BR_PKT_MULTICAST, false, true);
+			br_flood(br, skb, BR_PKT_MULTICAST_IP, false, true);
 			goto out;
 		}
 		if (br_multicast_rcv(br, NULL, skb, vid)) {
@@ -95,8 +95,11 @@ netdev_tx_t br_dev_xmit(struct sk_buff *skb, struct net_device *dev)
 		if ((mdst || BR_INPUT_SKB_CB_MROUTERS_ONLY(skb)) &&
 		    br_multicast_querier_exists(br, eth_hdr(skb)))
 			br_multicast_flood(mdst, skb, false, true);
+		else if (mdst && skb->protocol != htons(ETH_P_IP) &&
+			 skb->protocol != htons(ETH_P_IPV6))
+			br_multicast_flood(mdst, skb, false, true);
 		else
-			br_flood(br, skb, BR_PKT_MULTICAST, false, true);
+			br_flood(br, skb, BR_PKT_MULTICAST_IP, false, true);
 	} else if ((dst = br_fdb_find_rcu(br, dest, vid)) != NULL) {
 		br_forward(dst->dst, skb, false, true);
 	} else {
diff --git a/net/bridge/br_forward.c b/net/bridge/br_forward.c
index 8663700..36b58e8 100644
--- a/net/bridge/br_forward.c
+++ b/net/bridge/br_forward.c
@@ -203,7 +203,8 @@ void br_flood(struct net_bridge *br, struct sk_buff *skb,
 			if (!(p->flags & BR_FLOOD))
 				continue;
 			break;
-		case BR_PKT_MULTICAST:
+		case BR_PKT_MULTICAST_IP:
+		case BR_PKT_MULTICAST_L2:
 			if (!(p->flags & BR_MCAST_FLOOD) && skb->dev != br->dev)
 				continue;
 			break;
diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c
index 21b74e7..a7db0c5 100644
--- a/net/bridge/br_input.c
+++ b/net/bridge/br_input.c
@@ -99,9 +99,13 @@ int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb
 			pkt_type = BR_PKT_BROADCAST;
 			local_rcv = true;
 		} else {
-			pkt_type = BR_PKT_MULTICAST;
+			pkt_type = BR_PKT_MULTICAST_IP;
 			if (br_multicast_rcv(br, p, skb, vid))
 				goto drop;
+
+			if (skb->protocol != htons(ETH_P_IP) &&
+			    skb->protocol != htons(ETH_P_IPV6))
+				pkt_type = BR_PKT_MULTICAST_L2;
 		}
 	}
 
@@ -129,7 +133,12 @@ int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb
 	}
 
 	switch (pkt_type) {
-	case BR_PKT_MULTICAST:
+	case BR_PKT_MULTICAST_L2:
+		mdst = br_mdb_get(br, skb, vid);
+		if (mdst)
+			mcast_hit = true;
+		break;
+	case BR_PKT_MULTICAST_IP:
 		mdst = br_mdb_get(br, skb, vid);
 		if ((mdst || BR_INPUT_SKB_CB_MROUTERS_ONLY(skb)) &&
 		    br_multicast_querier_exists(br, eth_hdr(skb))) {
diff --git a/net/bridge/br_mdb.c b/net/bridge/br_mdb.c
index bf6acd3..b337a30 100644
--- a/net/bridge/br_mdb.c
+++ b/net/bridge/br_mdb.c
@@ -67,12 +67,19 @@ static void __mdb_entry_to_br_ip(struct br_mdb_entry *entry, struct br_ip *ip)
 	memset(ip, 0, sizeof(struct br_ip));
 	ip->vid = entry->vid;
 	ip->proto = entry->addr.proto;
-	if (ip->proto == htons(ETH_P_IP))
+	switch (ip->proto) {
+	case htons(ETH_P_IP):
 		ip->u.ip4 = entry->addr.u.ip4;
+		break;
 #if IS_ENABLED(CONFIG_IPV6)
-	else
+	case htons(ETH_P_IPV6):
 		ip->u.ip6 = entry->addr.u.ip6;
+		break;
 #endif
+	default:
+		ether_addr_copy(ip->u.mac, entry->addr.u.mac);
+		break;
+	}
 }
 
 static int br_mdb_fill_info(struct sk_buff *skb, struct netlink_callback *cb,
@@ -117,12 +124,19 @@ static int br_mdb_fill_info(struct sk_buff *skb, struct netlink_callback *cb,
 			e.ifindex = port->dev->ifindex;
 			e.vid = p->addr.vid;
 			__mdb_entry_fill_flags(&e, p->flags);
-			if (p->addr.proto == htons(ETH_P_IP))
+			switch (p->addr.proto) {
+			case htons(ETH_P_IP):
 				e.addr.u.ip4 = p->addr.u.ip4;
+				break;
 #if IS_ENABLED(CONFIG_IPV6)
-			if (p->addr.proto == htons(ETH_P_IPV6))
+			case htons(ETH_P_IPV6):
 				e.addr.u.ip6 = p->addr.u.ip6;
+				break;
 #endif
+			default:
+				ether_addr_copy(e.addr.u.mac, p->addr.u.mac);
+				break;
+			}
 			e.addr.proto = p->addr.proto;
 			nest_ent = nla_nest_start_noflag(skb,
 							 MDBA_MDB_ENTRY_INFO);
@@ -322,12 +336,19 @@ static void br_mdb_switchdev_host_port(struct net_device *dev,
 		.vid = entry->vid,
 	};
 
-	if (entry->addr.proto == htons(ETH_P_IP))
+	switch (entry->addr.proto) {
+	case htons(ETH_P_IP):
 		ip_eth_mc_map(entry->addr.u.ip4, mdb.addr);
+		break;
 #if IS_ENABLED(CONFIG_IPV6)
-	else
+	case htons(ETH_P_IPV6):
 		ipv6_eth_mc_map(&entry->addr.u.ip6, mdb.addr);
+		break;
 #endif
+	default:
+		ether_addr_copy(mdb.addr, entry->addr.u.mac);
+		break;
+	}
 
 	mdb.obj.orig_dev = dev;
 	switch (type) {
@@ -367,12 +388,19 @@ static void __br_mdb_notify(struct net_device *dev, struct net_bridge_port *p,
 	int err = -ENOBUFS;
 
 	port_dev = __dev_get_by_index(net, entry->ifindex);
-	if (entry->addr.proto == htons(ETH_P_IP))
+	switch (entry->addr.proto) {
+	case htons(ETH_P_IP):
 		ip_eth_mc_map(entry->addr.u.ip4, mdb.addr);
+		break;
 #if IS_ENABLED(CONFIG_IPV6)
-	else
+	case htons(ETH_P_IPV6):
 		ipv6_eth_mc_map(&entry->addr.u.ip6, mdb.addr);
+		break;
 #endif
+	default:
+		ether_addr_copy(mdb.addr, entry->addr.u.mac);
+		break;
+	}
 
 	mdb.obj.orig_dev = port_dev;
 	if (p && port_dev && type == RTM_NEWMDB) {
@@ -423,6 +451,7 @@ void br_mdb_notify(struct net_device *dev, struct net_bridge_port *port,
 #if IS_ENABLED(CONFIG_IPV6)
 	entry.addr.u.ip6 = group->u.ip6;
 #endif
+	ether_addr_copy(group->u.mac, entry.addr.u.mac);
 	entry.vid = group->vid;
 	__mdb_entry_fill_flags(&entry, flags);
 	__br_mdb_notify(dev, port, &entry, type);
@@ -510,6 +539,8 @@ static bool is_valid_mdb_entry(struct br_mdb_entry *entry)
 		if (ipv6_addr_is_ll_all_nodes(&entry->addr.u.ip6))
 			return false;
 #endif
+	} else if (is_multicast_ether_addr(entry->addr.u.mac)) {
+		;
 	} else
 		return false;
 	if (entry->state != MDB_PERMANENT && entry->state != MDB_TEMPORARY)
diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c
index de22c8f..01250c1 100644
--- a/net/bridge/br_multicast.c
+++ b/net/bridge/br_multicast.c
@@ -133,7 +133,9 @@ struct net_bridge_mdb_entry *br_mdb_get(struct net_bridge *br,
 		break;
 #endif
 	default:
-		return NULL;
+		ip.proto = 0;
+		ether_addr_copy(ip.u.mac, eth_hdr(skb)->h_dest);
+		break;
 	}
 
 	return br_mdb_ip_get_rcu(br, &ip);
diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h
index 159a0e2..60e2430d 100644
--- a/net/bridge/br_private.h
+++ b/net/bridge/br_private.h
@@ -590,7 +590,8 @@ void br_fdb_offloaded_set(struct net_bridge *br, struct net_bridge_port *p,
 /* br_forward.c */
 enum br_pkt_type {
 	BR_PKT_UNICAST,
-	BR_PKT_MULTICAST,
+	BR_PKT_MULTICAST_IP,
+	BR_PKT_MULTICAST_L2,
 	BR_PKT_BROADCAST
 };
 int br_dev_queue_push_xmit(struct net *net, struct sock *sk, struct sk_buff *skb);
-- 
2.7.4


^ permalink raw reply related

* [iproute2,rfc] bridge: mdb: Extend with LLADDR
From: Horatiu Vultur @ 2019-08-01 12:51 UTC (permalink / raw)
  To: nikolay, idosch, andrew, allan.nielsen
  Cc: davem, roopa, petrm, tglx, fw, netdev, linux-kernel, bridge,
	Horatiu Vultur

Extend the bridge mdb command to also accept link-layer multicast
addresses as the group. The old behaviour is not changed.

To add new mdb entry:
bridge mdb add dev br0 port eth0 grp 11:22:33:44:55:66 permanent vid 1

To display existing entries:
bridge mdb
dev br0 port eth4 grp 01:00:00:00:00:01 permanent offload vid 1

Signed-off-by: Horatiu Vultur <horatiu.vultur@microchip.com>
---
 bridge/mdb.c                   | 29 ++++++++++++++++++++++++-----
 include/uapi/linux/if_bridge.h |  1 +
 2 files changed, 25 insertions(+), 5 deletions(-)

diff --git a/bridge/mdb.c b/bridge/mdb.c
index 928ae56..057c0b6 100644
--- a/bridge/mdb.c
+++ b/bridge/mdb.c
@@ -138,9 +138,21 @@ static void print_mdb_entry(FILE *f, int ifindex, const struct br_mdb_entry *e,
 	print_string(PRINT_ANY, "port", " port %s",
 		     ll_index_to_name(e->ifindex));
 
-	print_color_string(PRINT_ANY, ifa_family_color(af),
-			    "grp", " grp %s",
-			    inet_ntop(af, src, abuf, sizeof(abuf)));
+	if (e->addr.proto == htons(ETH_P_IP) ||
+	    e->addr.proto == htons(ETH_P_IPV6)) {
+		print_color_string(PRINT_ANY, ifa_family_color(af),
+				   "grp", " grp %s",
+				   inet_ntop(af, src, abuf, sizeof(abuf)));
+	} else {
+		const char *lladdr;
+		SPRINT_BUF(b1);
+
+		lladdr = ll_addr_n2a(e->addr.u.mac, ETH_ALEN, 0, b1,
+				     sizeof(b1));
+
+		print_color_string(PRINT_ANY, COLOR_MAC, "grp", " grp %s",
+				   lladdr);
+	}
 
 	print_string(PRINT_ANY, "state", " %s",
 			   (e->state & MDB_PERMANENT) ? "permanent" : "temp");
@@ -380,6 +392,7 @@ static int mdb_modify(int cmd, int flags, int argc, char **argv)
 	};
 	struct br_mdb_entry entry = {};
 	char *d = NULL, *p = NULL, *grp = NULL;
+	char abuf[ETH_ALEN];
 	short vid = 0;
 
 	while (argc > 0) {
@@ -422,8 +435,14 @@ static int mdb_modify(int cmd, int flags, int argc, char **argv)
 
 	if (!inet_pton(AF_INET, grp, &entry.addr.u.ip4)) {
 		if (!inet_pton(AF_INET6, grp, &entry.addr.u.ip6)) {
-			fprintf(stderr, "Invalid address \"%s\"\n", grp);
-			return -1;
+			if (sscanf(grp, "%hhx:%hhx:%hhx:%hhx:%hhx:%hhx",
+				   abuf, abuf+1, abuf+2, abuf+3, abuf+4,
+				   abuf+5) != 6) {
+				fprintf(stderr, "Invalid address \"%s\"\n", grp);
+				return -1;
+			}
+			memcpy(entry.addr.u.mac, abuf, ETH_ALEN);
+			entry.addr.proto = 0;
 		} else
 			entry.addr.proto = htons(ETH_P_IPV6);
 	} else
diff --git a/include/uapi/linux/if_bridge.h b/include/uapi/linux/if_bridge.h
index 04f763c..88aad9c 100644
--- a/include/uapi/linux/if_bridge.h
+++ b/include/uapi/linux/if_bridge.h
@@ -243,6 +243,7 @@ struct br_mdb_entry {
 		union {
 			__be32	ip4;
 			struct in6_addr ip6;
+			__u8	mac[ETH_ALEN];
 		} u;
 		__be16		proto;
 	} addr;
-- 
2.7.4


^ permalink raw reply related

* [PATCH] net: sched: use temporary variable for actions indexes
From: dmitrolin @ 2019-08-01 13:02 UTC (permalink / raw)
  To: netdev; +Cc: davem, jiri, jhs, xiyou.wangcong, Dmytro Linkin, Vlad Buslov

From: Dmytro Linkin <dmitrolin@mellanox.com>

Currently the init calls of all actions (except ipt) initialize their
'parm' structure as a direct pointer to the nla data in the skb. This leads
to a race condition when some of the filter actions were initialized
successfully (and were assigned an idr action index that was written
directly into the nla data), but then were deleted and retried (due to a
following action module missing or a classifier-initiated retry), in which
case the action init code tries to insert the action into the idr with the
index that was assigned on the previous iteration. During the retry the
index can be reused by another action that was inserted concurrently, which
causes unintended action sharing between filters.
To fix the described race condition, save the action idr index to a
temporary stack-allocated variable instead of the nla data.

Fixes: 0190c1d452a9 ("net: sched: atomically check-allocate action")
Signed-off-by: Dmytro Linkin <dmitrolin@mellanox.com>
Signed-off-by: Vlad Buslov <vladbu@mellanox.com>
---
 net/sched/act_bpf.c        |  9 +++++----
 net/sched/act_connmark.c   |  9 +++++----
 net/sched/act_csum.c       |  9 +++++----
 net/sched/act_ct.c         |  9 +++++----
 net/sched/act_ctinfo.c     |  9 +++++----
 net/sched/act_gact.c       |  8 +++++---
 net/sched/act_ife.c        |  8 +++++---
 net/sched/act_mirred.c     | 13 +++++++------
 net/sched/act_mpls.c       |  8 +++++---
 net/sched/act_nat.c        |  9 +++++----
 net/sched/act_pedit.c      | 10 ++++++----
 net/sched/act_police.c     |  8 +++++---
 net/sched/act_sample.c     | 10 +++++-----
 net/sched/act_simple.c     | 10 ++++++----
 net/sched/act_skbedit.c    | 11 ++++++-----
 net/sched/act_skbmod.c     | 11 ++++++-----
 net/sched/act_tunnel_key.c |  8 +++++---
 net/sched/act_vlan.c       | 16 +++++++++-------
 18 files changed, 100 insertions(+), 75 deletions(-)

diff --git a/net/sched/act_bpf.c b/net/sched/act_bpf.c
index 8126b26..fd1f7e7 100644
--- a/net/sched/act_bpf.c
+++ b/net/sched/act_bpf.c
@@ -285,6 +285,7 @@ static int tcf_bpf_init(struct net *net, struct nlattr *nla,
 	struct tcf_bpf *prog;
 	bool is_bpf, is_ebpf;
 	int ret, res = 0;
+	u32 index;
 
 	if (!nla)
 		return -EINVAL;
@@ -298,13 +299,13 @@ static int tcf_bpf_init(struct net *net, struct nlattr *nla,
 		return -EINVAL;
 
 	parm = nla_data(tb[TCA_ACT_BPF_PARMS]);
-
-	ret = tcf_idr_check_alloc(tn, &parm->index, act, bind);
+	index = parm->index;
+	ret = tcf_idr_check_alloc(tn, &index, act, bind);
 	if (!ret) {
-		ret = tcf_idr_create(tn, parm->index, est, act,
+		ret = tcf_idr_create(tn, index, est, act,
 				     &act_bpf_ops, bind, true);
 		if (ret < 0) {
-			tcf_idr_cleanup(tn, parm->index);
+			tcf_idr_cleanup(tn, index);
 			return ret;
 		}
 
diff --git a/net/sched/act_connmark.c b/net/sched/act_connmark.c
index ce36b0f..32ac04d 100644
--- a/net/sched/act_connmark.c
+++ b/net/sched/act_connmark.c
@@ -103,6 +103,7 @@ static int tcf_connmark_init(struct net *net, struct nlattr *nla,
 	struct tcf_connmark_info *ci;
 	struct tc_connmark *parm;
 	int ret = 0, err;
+	u32 index;
 
 	if (!nla)
 		return -EINVAL;
@@ -116,13 +117,13 @@ static int tcf_connmark_init(struct net *net, struct nlattr *nla,
 		return -EINVAL;
 
 	parm = nla_data(tb[TCA_CONNMARK_PARMS]);
-
-	ret = tcf_idr_check_alloc(tn, &parm->index, a, bind);
+	index = parm->index;
+	ret = tcf_idr_check_alloc(tn, &index, a, bind);
 	if (!ret) {
-		ret = tcf_idr_create(tn, parm->index, est, a,
+		ret = tcf_idr_create(tn, index, est, a,
 				     &act_connmark_ops, bind, false);
 		if (ret) {
-			tcf_idr_cleanup(tn, parm->index);
+			tcf_idr_cleanup(tn, index);
 			return ret;
 		}
 
diff --git a/net/sched/act_csum.c b/net/sched/act_csum.c
index 621fb22..9b92882 100644
--- a/net/sched/act_csum.c
+++ b/net/sched/act_csum.c
@@ -52,6 +52,7 @@ static int tcf_csum_init(struct net *net, struct nlattr *nla,
 	struct tc_csum *parm;
 	struct tcf_csum *p;
 	int ret = 0, err;
+	u32 index;
 
 	if (nla == NULL)
 		return -EINVAL;
@@ -64,13 +65,13 @@ static int tcf_csum_init(struct net *net, struct nlattr *nla,
 	if (tb[TCA_CSUM_PARMS] == NULL)
 		return -EINVAL;
 	parm = nla_data(tb[TCA_CSUM_PARMS]);
-
-	err = tcf_idr_check_alloc(tn, &parm->index, a, bind);
+	index = parm->index;
+	err = tcf_idr_check_alloc(tn, &index, a, bind);
 	if (!err) {
-		ret = tcf_idr_create(tn, parm->index, est, a,
+		ret = tcf_idr_create(tn, index, est, a,
 				     &act_csum_ops, bind, true);
 		if (ret) {
-			tcf_idr_cleanup(tn, parm->index);
+			tcf_idr_cleanup(tn, index);
 			return ret;
 		}
 		ret = ACT_P_CREATED;
diff --git a/net/sched/act_ct.c b/net/sched/act_ct.c
index b501ce0..33a1a74 100644
--- a/net/sched/act_ct.c
+++ b/net/sched/act_ct.c
@@ -666,6 +666,7 @@ static int tcf_ct_init(struct net *net, struct nlattr *nla,
 	struct tc_ct *parm;
 	struct tcf_ct *c;
 	int err, res = 0;
+	u32 index;
 
 	if (!nla) {
 		NL_SET_ERR_MSG_MOD(extack, "Ct requires attributes to be passed");
@@ -681,16 +682,16 @@ static int tcf_ct_init(struct net *net, struct nlattr *nla,
 		return -EINVAL;
 	}
 	parm = nla_data(tb[TCA_CT_PARMS]);
-
-	err = tcf_idr_check_alloc(tn, &parm->index, a, bind);
+	index = parm->index;
+	err = tcf_idr_check_alloc(tn, &index, a, bind);
 	if (err < 0)
 		return err;
 
 	if (!err) {
-		err = tcf_idr_create(tn, parm->index, est, a,
+		err = tcf_idr_create(tn, index, est, a,
 				     &act_ct_ops, bind, true);
 		if (err) {
-			tcf_idr_cleanup(tn, parm->index);
+			tcf_idr_cleanup(tn, index);
 			return err;
 		}
 		res = ACT_P_CREATED;
diff --git a/net/sched/act_ctinfo.c b/net/sched/act_ctinfo.c
index 10eb2bb..06ef74b 100644
--- a/net/sched/act_ctinfo.c
+++ b/net/sched/act_ctinfo.c
@@ -157,10 +157,10 @@ static int tcf_ctinfo_init(struct net *net, struct nlattr *nla,
 			   struct netlink_ext_ack *extack)
 {
 	struct tc_action_net *tn = net_generic(net, ctinfo_net_id);
+	u32 dscpmask = 0, dscpstatemask, index;
 	struct nlattr *tb[TCA_CTINFO_MAX + 1];
 	struct tcf_ctinfo_params *cp_new;
 	struct tcf_chain *goto_ch = NULL;
-	u32 dscpmask = 0, dscpstatemask;
 	struct tc_ctinfo *actparm;
 	struct tcf_ctinfo *ci;
 	u8 dscpmaskshift;
@@ -206,12 +206,13 @@ static int tcf_ctinfo_init(struct net *net, struct nlattr *nla,
 	}
 
 	/* done the validation:now to the actual action allocation */
-	err = tcf_idr_check_alloc(tn, &actparm->index, a, bind);
+	index = actparm->index;
+	err = tcf_idr_check_alloc(tn, &index, a, bind);
 	if (!err) {
-		ret = tcf_idr_create(tn, actparm->index, est, a,
+		ret = tcf_idr_create(tn, index, est, a,
 				     &act_ctinfo_ops, bind, false);
 		if (ret) {
-			tcf_idr_cleanup(tn, actparm->index);
+			tcf_idr_cleanup(tn, index);
 			return ret;
 		}
 		ret = ACT_P_CREATED;
diff --git a/net/sched/act_gact.c b/net/sched/act_gact.c
index b2380c5..8f0140c 100644
--- a/net/sched/act_gact.c
+++ b/net/sched/act_gact.c
@@ -61,6 +61,7 @@ static int tcf_gact_init(struct net *net, struct nlattr *nla,
 	struct tc_gact *parm;
 	struct tcf_gact *gact;
 	int ret = 0;
+	u32 index;
 	int err;
 #ifdef CONFIG_GACT_PROB
 	struct tc_gact_p *p_parm = NULL;
@@ -77,6 +78,7 @@ static int tcf_gact_init(struct net *net, struct nlattr *nla,
 	if (tb[TCA_GACT_PARMS] == NULL)
 		return -EINVAL;
 	parm = nla_data(tb[TCA_GACT_PARMS]);
+	index = parm->index;
 
 #ifndef CONFIG_GACT_PROB
 	if (tb[TCA_GACT_PROB] != NULL)
@@ -94,12 +96,12 @@ static int tcf_gact_init(struct net *net, struct nlattr *nla,
 	}
 #endif
 
-	err = tcf_idr_check_alloc(tn, &parm->index, a, bind);
+	err = tcf_idr_check_alloc(tn, &index, a, bind);
 	if (!err) {
-		ret = tcf_idr_create(tn, parm->index, est, a,
+		ret = tcf_idr_create(tn, index, est, a,
 				     &act_gact_ops, bind, true);
 		if (ret) {
-			tcf_idr_cleanup(tn, parm->index);
+			tcf_idr_cleanup(tn, index);
 			return ret;
 		}
 		ret = ACT_P_CREATED;
diff --git a/net/sched/act_ife.c b/net/sched/act_ife.c
index 41d5398..bd1a541 100644
--- a/net/sched/act_ife.c
+++ b/net/sched/act_ife.c
@@ -479,6 +479,7 @@ static int tcf_ife_init(struct net *net, struct nlattr *nla,
 	u8 *saddr = NULL;
 	bool exists = false;
 	int ret = 0;
+	u32 index;
 	int err;
 
 	err = nla_parse_nested_deprecated(tb, TCA_IFE_MAX, nla, ife_policy,
@@ -502,7 +503,8 @@ static int tcf_ife_init(struct net *net, struct nlattr *nla,
 	if (!p)
 		return -ENOMEM;
 
-	err = tcf_idr_check_alloc(tn, &parm->index, a, bind);
+	index = parm->index;
+	err = tcf_idr_check_alloc(tn, &index, a, bind);
 	if (err < 0) {
 		kfree(p);
 		return err;
@@ -514,10 +516,10 @@ static int tcf_ife_init(struct net *net, struct nlattr *nla,
 	}
 
 	if (!exists) {
-		ret = tcf_idr_create(tn, parm->index, est, a, &act_ife_ops,
+		ret = tcf_idr_create(tn, index, est, a, &act_ife_ops,
 				     bind, true);
 		if (ret) {
-			tcf_idr_cleanup(tn, parm->index);
+			tcf_idr_cleanup(tn, index);
 			kfree(p);
 			return ret;
 		}
diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c
index 055faa2..be3f88d 100644
--- a/net/sched/act_mirred.c
+++ b/net/sched/act_mirred.c
@@ -104,6 +104,7 @@ static int tcf_mirred_init(struct net *net, struct nlattr *nla,
 	struct net_device *dev;
 	bool exists = false;
 	int ret, err;
+	u32 index;
 
 	if (!nla) {
 		NL_SET_ERR_MSG_MOD(extack, "Mirred requires attributes to be passed");
@@ -118,8 +119,8 @@ static int tcf_mirred_init(struct net *net, struct nlattr *nla,
 		return -EINVAL;
 	}
 	parm = nla_data(tb[TCA_MIRRED_PARMS]);
-
-	err = tcf_idr_check_alloc(tn, &parm->index, a, bind);
+	index = parm->index;
+	err = tcf_idr_check_alloc(tn, &index, a, bind);
 	if (err < 0)
 		return err;
 	exists = err;
@@ -136,21 +137,21 @@ static int tcf_mirred_init(struct net *net, struct nlattr *nla,
 		if (exists)
 			tcf_idr_release(*a, bind);
 		else
-			tcf_idr_cleanup(tn, parm->index);
+			tcf_idr_cleanup(tn, index);
 		NL_SET_ERR_MSG_MOD(extack, "Unknown mirred option");
 		return -EINVAL;
 	}
 
 	if (!exists) {
 		if (!parm->ifindex) {
-			tcf_idr_cleanup(tn, parm->index);
+			tcf_idr_cleanup(tn, index);
 			NL_SET_ERR_MSG_MOD(extack, "Specified device does not exist");
 			return -EINVAL;
 		}
-		ret = tcf_idr_create(tn, parm->index, est, a,
+		ret = tcf_idr_create(tn, index, est, a,
 				     &act_mirred_ops, bind, true);
 		if (ret) {
-			tcf_idr_cleanup(tn, parm->index);
+			tcf_idr_cleanup(tn, index);
 			return ret;
 		}
 		ret = ACT_P_CREATED;
diff --git a/net/sched/act_mpls.c b/net/sched/act_mpls.c
index ca2597c..0f299e3 100644
--- a/net/sched/act_mpls.c
+++ b/net/sched/act_mpls.c
@@ -138,6 +138,7 @@ static int tcf_mpls_init(struct net *net, struct nlattr *nla,
 	struct tcf_mpls *m;
 	int ret = 0, err;
 	u8 mpls_ttl = 0;
+	u32 index;
 
 	if (!nla) {
 		NL_SET_ERR_MSG_MOD(extack, "Missing netlink attributes");
@@ -153,6 +154,7 @@ static int tcf_mpls_init(struct net *net, struct nlattr *nla,
 		return -EINVAL;
 	}
 	parm = nla_data(tb[TCA_MPLS_PARMS]);
+	index = parm->index;
 
 	/* Verify parameters against action type. */
 	switch (parm->m_action) {
@@ -209,7 +211,7 @@ static int tcf_mpls_init(struct net *net, struct nlattr *nla,
 		return -EINVAL;
 	}
 
-	err = tcf_idr_check_alloc(tn, &parm->index, a, bind);
+	err = tcf_idr_check_alloc(tn, &index, a, bind);
 	if (err < 0)
 		return err;
 	exists = err;
@@ -217,10 +219,10 @@ static int tcf_mpls_init(struct net *net, struct nlattr *nla,
 		return 0;
 
 	if (!exists) {
-		ret = tcf_idr_create(tn, parm->index, est, a,
+		ret = tcf_idr_create(tn, index, est, a,
 				     &act_mpls_ops, bind, true);
 		if (ret) {
-			tcf_idr_cleanup(tn, parm->index);
+			tcf_idr_cleanup(tn, index);
 			return ret;
 		}
 
diff --git a/net/sched/act_nat.c b/net/sched/act_nat.c
index 45923eb..7b858c1 100644
--- a/net/sched/act_nat.c
+++ b/net/sched/act_nat.c
@@ -44,6 +44,7 @@ static int tcf_nat_init(struct net *net, struct nlattr *nla, struct nlattr *est,
 	struct tc_nat *parm;
 	int ret = 0, err;
 	struct tcf_nat *p;
+	u32 index;
 
 	if (nla == NULL)
 		return -EINVAL;
@@ -56,13 +57,13 @@ static int tcf_nat_init(struct net *net, struct nlattr *nla, struct nlattr *est,
 	if (tb[TCA_NAT_PARMS] == NULL)
 		return -EINVAL;
 	parm = nla_data(tb[TCA_NAT_PARMS]);
-
-	err = tcf_idr_check_alloc(tn, &parm->index, a, bind);
+	index = parm->index;
+	err = tcf_idr_check_alloc(tn, &index, a, bind);
 	if (!err) {
-		ret = tcf_idr_create(tn, parm->index, est, a,
+		ret = tcf_idr_create(tn, index, est, a,
 				     &act_nat_ops, bind, false);
 		if (ret) {
-			tcf_idr_cleanup(tn, parm->index);
+			tcf_idr_cleanup(tn, index);
 			return ret;
 		}
 		ret = ACT_P_CREATED;
diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c
index 45e9d6b..17360c6 100644
--- a/net/sched/act_pedit.c
+++ b/net/sched/act_pedit.c
@@ -149,6 +149,7 @@ static int tcf_pedit_init(struct net *net, struct nlattr *nla,
 	struct tcf_pedit *p;
 	int ret = 0, err;
 	int ksize;
+	u32 index;
 
 	if (!nla) {
 		NL_SET_ERR_MSG_MOD(extack, "Pedit requires attributes to be passed");
@@ -179,18 +180,19 @@ static int tcf_pedit_init(struct net *net, struct nlattr *nla,
 	if (IS_ERR(keys_ex))
 		return PTR_ERR(keys_ex);
 
-	err = tcf_idr_check_alloc(tn, &parm->index, a, bind);
+	index = parm->index;
+	err = tcf_idr_check_alloc(tn, &index, a, bind);
 	if (!err) {
 		if (!parm->nkeys) {
-			tcf_idr_cleanup(tn, parm->index);
+			tcf_idr_cleanup(tn, index);
 			NL_SET_ERR_MSG_MOD(extack, "Pedit requires keys to be passed");
 			ret = -EINVAL;
 			goto out_free;
 		}
-		ret = tcf_idr_create(tn, parm->index, est, a,
+		ret = tcf_idr_create(tn, index, est, a,
 				     &act_pedit_ops, bind, false);
 		if (ret) {
-			tcf_idr_cleanup(tn, parm->index);
+			tcf_idr_cleanup(tn, index);
 			goto out_free;
 		}
 		ret = ACT_P_CREATED;
diff --git a/net/sched/act_police.c b/net/sched/act_police.c
index a065f62..49cec3e 100644
--- a/net/sched/act_police.c
+++ b/net/sched/act_police.c
@@ -57,6 +57,7 @@ static int tcf_police_init(struct net *net, struct nlattr *nla,
 	struct tc_action_net *tn = net_generic(net, police_net_id);
 	struct tcf_police_params *new;
 	bool exists = false;
+	u32 index;
 
 	if (nla == NULL)
 		return -EINVAL;
@@ -73,7 +74,8 @@ static int tcf_police_init(struct net *net, struct nlattr *nla,
 		return -EINVAL;
 
 	parm = nla_data(tb[TCA_POLICE_TBF]);
-	err = tcf_idr_check_alloc(tn, &parm->index, a, bind);
+	index = parm->index;
+	err = tcf_idr_check_alloc(tn, &index, a, bind);
 	if (err < 0)
 		return err;
 	exists = err;
@@ -81,10 +83,10 @@ static int tcf_police_init(struct net *net, struct nlattr *nla,
 		return 0;
 
 	if (!exists) {
-		ret = tcf_idr_create(tn, parm->index, NULL, a,
+		ret = tcf_idr_create(tn, index, NULL, a,
 				     &act_police_ops, bind, true);
 		if (ret) {
-			tcf_idr_cleanup(tn, parm->index);
+			tcf_idr_cleanup(tn, index);
 			return ret;
 		}
 		ret = ACT_P_CREATED;
diff --git a/net/sched/act_sample.c b/net/sched/act_sample.c
index 274d7a0..595308d 100644
--- a/net/sched/act_sample.c
+++ b/net/sched/act_sample.c
@@ -41,8 +41,8 @@ static int tcf_sample_init(struct net *net, struct nlattr *nla,
 	struct tc_action_net *tn = net_generic(net, sample_net_id);
 	struct nlattr *tb[TCA_SAMPLE_MAX + 1];
 	struct psample_group *psample_group;
+	u32 psample_group_num, rate, index;
 	struct tcf_chain *goto_ch = NULL;
-	u32 psample_group_num, rate;
 	struct tc_sample *parm;
 	struct tcf_sample *s;
 	bool exists = false;
@@ -59,8 +59,8 @@ static int tcf_sample_init(struct net *net, struct nlattr *nla,
 		return -EINVAL;
 
 	parm = nla_data(tb[TCA_SAMPLE_PARMS]);
-
-	err = tcf_idr_check_alloc(tn, &parm->index, a, bind);
+	index = parm->index;
+	err = tcf_idr_check_alloc(tn, &index, a, bind);
 	if (err < 0)
 		return err;
 	exists = err;
@@ -68,10 +68,10 @@ static int tcf_sample_init(struct net *net, struct nlattr *nla,
 		return 0;
 
 	if (!exists) {
-		ret = tcf_idr_create(tn, parm->index, est, a,
+		ret = tcf_idr_create(tn, index, est, a,
 				     &act_sample_ops, bind, true);
 		if (ret) {
-			tcf_idr_cleanup(tn, parm->index);
+			tcf_idr_cleanup(tn, index);
 			return ret;
 		}
 		ret = ACT_P_CREATED;
diff --git a/net/sched/act_simple.c b/net/sched/act_simple.c
index f28ddba..33aefa2 100644
--- a/net/sched/act_simple.c
+++ b/net/sched/act_simple.c
@@ -95,6 +95,7 @@ static int tcf_simp_init(struct net *net, struct nlattr *nla,
 	struct tcf_defact *d;
 	bool exists = false;
 	int ret = 0, err;
+	u32 index;
 
 	if (nla == NULL)
 		return -EINVAL;
@@ -108,7 +109,8 @@ static int tcf_simp_init(struct net *net, struct nlattr *nla,
 		return -EINVAL;
 
 	parm = nla_data(tb[TCA_DEF_PARMS]);
-	err = tcf_idr_check_alloc(tn, &parm->index, a, bind);
+	index = parm->index;
+	err = tcf_idr_check_alloc(tn, &index, a, bind);
 	if (err < 0)
 		return err;
 	exists = err;
@@ -119,15 +121,15 @@ static int tcf_simp_init(struct net *net, struct nlattr *nla,
 		if (exists)
 			tcf_idr_release(*a, bind);
 		else
-			tcf_idr_cleanup(tn, parm->index);
+			tcf_idr_cleanup(tn, index);
 		return -EINVAL;
 	}
 
 	if (!exists) {
-		ret = tcf_idr_create(tn, parm->index, est, a,
+		ret = tcf_idr_create(tn, index, est, a,
 				     &act_simp_ops, bind, false);
 		if (ret) {
-			tcf_idr_cleanup(tn, parm->index);
+			tcf_idr_cleanup(tn, index);
 			return ret;
 		}
 
diff --git a/net/sched/act_skbedit.c b/net/sched/act_skbedit.c
index 215a067..b100870 100644
--- a/net/sched/act_skbedit.c
+++ b/net/sched/act_skbedit.c
@@ -99,6 +99,7 @@ static int tcf_skbedit_init(struct net *net, struct nlattr *nla,
 	u16 *queue_mapping = NULL, *ptype = NULL;
 	bool exists = false;
 	int ret = 0, err;
+	u32 index;
 
 	if (nla == NULL)
 		return -EINVAL;
@@ -146,8 +147,8 @@ static int tcf_skbedit_init(struct net *net, struct nlattr *nla,
 	}
 
 	parm = nla_data(tb[TCA_SKBEDIT_PARMS]);
-
-	err = tcf_idr_check_alloc(tn, &parm->index, a, bind);
+	index = parm->index;
+	err = tcf_idr_check_alloc(tn, &index, a, bind);
 	if (err < 0)
 		return err;
 	exists = err;
@@ -158,15 +159,15 @@ static int tcf_skbedit_init(struct net *net, struct nlattr *nla,
 		if (exists)
 			tcf_idr_release(*a, bind);
 		else
-			tcf_idr_cleanup(tn, parm->index);
+			tcf_idr_cleanup(tn, index);
 		return -EINVAL;
 	}
 
 	if (!exists) {
-		ret = tcf_idr_create(tn, parm->index, est, a,
+		ret = tcf_idr_create(tn, index, est, a,
 				     &act_skbedit_ops, bind, true);
 		if (ret) {
-			tcf_idr_cleanup(tn, parm->index);
+			tcf_idr_cleanup(tn, index);
 			return ret;
 		}
 
diff --git a/net/sched/act_skbmod.c b/net/sched/act_skbmod.c
index 4f07706..7da3518 100644
--- a/net/sched/act_skbmod.c
+++ b/net/sched/act_skbmod.c
@@ -87,12 +87,12 @@ static int tcf_skbmod_init(struct net *net, struct nlattr *nla,
 	struct tcf_skbmod_params *p, *p_old;
 	struct tcf_chain *goto_ch = NULL;
 	struct tc_skbmod *parm;
+	u32 lflags = 0, index;
 	struct tcf_skbmod *d;
 	bool exists = false;
 	u8 *daddr = NULL;
 	u8 *saddr = NULL;
 	u16 eth_type = 0;
-	u32 lflags = 0;
 	int ret = 0, err;
 
 	if (!nla)
@@ -122,10 +122,11 @@ static int tcf_skbmod_init(struct net *net, struct nlattr *nla,
 	}
 
 	parm = nla_data(tb[TCA_SKBMOD_PARMS]);
+	index = parm->index;
 	if (parm->flags & SKBMOD_F_SWAPMAC)
 		lflags = SKBMOD_F_SWAPMAC;
 
-	err = tcf_idr_check_alloc(tn, &parm->index, a, bind);
+	err = tcf_idr_check_alloc(tn, &index, a, bind);
 	if (err < 0)
 		return err;
 	exists = err;
@@ -136,15 +137,15 @@ static int tcf_skbmod_init(struct net *net, struct nlattr *nla,
 		if (exists)
 			tcf_idr_release(*a, bind);
 		else
-			tcf_idr_cleanup(tn, parm->index);
+			tcf_idr_cleanup(tn, index);
 		return -EINVAL;
 	}
 
 	if (!exists) {
-		ret = tcf_idr_create(tn, parm->index, est, a,
+		ret = tcf_idr_create(tn, index, est, a,
 				     &act_skbmod_ops, bind, true);
 		if (ret) {
-			tcf_idr_cleanup(tn, parm->index);
+			tcf_idr_cleanup(tn, index);
 			return ret;
 		}
 
diff --git a/net/sched/act_tunnel_key.c b/net/sched/act_tunnel_key.c
index 10dffda..6d0debd 100644
--- a/net/sched/act_tunnel_key.c
+++ b/net/sched/act_tunnel_key.c
@@ -225,6 +225,7 @@ static int tunnel_key_init(struct net *net, struct nlattr *nla,
 	__be16 flags = 0;
 	u8 tos, ttl;
 	int ret = 0;
+	u32 index;
 	int err;
 
 	if (!nla) {
@@ -245,7 +246,8 @@ static int tunnel_key_init(struct net *net, struct nlattr *nla,
 	}
 
 	parm = nla_data(tb[TCA_TUNNEL_KEY_PARMS]);
-	err = tcf_idr_check_alloc(tn, &parm->index, a, bind);
+	index = parm->index;
+	err = tcf_idr_check_alloc(tn, &index, a, bind);
 	if (err < 0)
 		return err;
 	exists = err;
@@ -345,7 +347,7 @@ static int tunnel_key_init(struct net *net, struct nlattr *nla,
 	}
 
 	if (!exists) {
-		ret = tcf_idr_create(tn, parm->index, est, a,
+		ret = tcf_idr_create(tn, index, est, a,
 				     &act_tunnel_key_ops, bind, true);
 		if (ret) {
 			NL_SET_ERR_MSG(extack, "Cannot create TC IDR");
@@ -403,7 +405,7 @@ static int tunnel_key_init(struct net *net, struct nlattr *nla,
 	if (exists)
 		tcf_idr_release(*a, bind);
 	else
-		tcf_idr_cleanup(tn, parm->index);
+		tcf_idr_cleanup(tn, index);
 	return ret;
 }
 
diff --git a/net/sched/act_vlan.c b/net/sched/act_vlan.c
index 9269d35..984b05a 100644
--- a/net/sched/act_vlan.c
+++ b/net/sched/act_vlan.c
@@ -116,6 +116,7 @@ static int tcf_vlan_init(struct net *net, struct nlattr *nla,
 	u8 push_prio = 0;
 	bool exists = false;
 	int ret = 0, err;
+	u32 index;
 
 	if (!nla)
 		return -EINVAL;
@@ -128,7 +129,8 @@ static int tcf_vlan_init(struct net *net, struct nlattr *nla,
 	if (!tb[TCA_VLAN_PARMS])
 		return -EINVAL;
 	parm = nla_data(tb[TCA_VLAN_PARMS]);
-	err = tcf_idr_check_alloc(tn, &parm->index, a, bind);
+	index = parm->index;
+	err = tcf_idr_check_alloc(tn, &index, a, bind);
 	if (err < 0)
 		return err;
 	exists = err;
@@ -144,7 +146,7 @@ static int tcf_vlan_init(struct net *net, struct nlattr *nla,
 			if (exists)
 				tcf_idr_release(*a, bind);
 			else
-				tcf_idr_cleanup(tn, parm->index);
+				tcf_idr_cleanup(tn, index);
 			return -EINVAL;
 		}
 		push_vid = nla_get_u16(tb[TCA_VLAN_PUSH_VLAN_ID]);
@@ -152,7 +154,7 @@ static int tcf_vlan_init(struct net *net, struct nlattr *nla,
 			if (exists)
 				tcf_idr_release(*a, bind);
 			else
-				tcf_idr_cleanup(tn, parm->index);
+				tcf_idr_cleanup(tn, index);
 			return -ERANGE;
 		}
 
@@ -166,7 +168,7 @@ static int tcf_vlan_init(struct net *net, struct nlattr *nla,
 				if (exists)
 					tcf_idr_release(*a, bind);
 				else
-					tcf_idr_cleanup(tn, parm->index);
+					tcf_idr_cleanup(tn, index);
 				return -EPROTONOSUPPORT;
 			}
 		} else {
@@ -180,16 +182,16 @@ static int tcf_vlan_init(struct net *net, struct nlattr *nla,
 		if (exists)
 			tcf_idr_release(*a, bind);
 		else
-			tcf_idr_cleanup(tn, parm->index);
+			tcf_idr_cleanup(tn, index);
 		return -EINVAL;
 	}
 	action = parm->v_action;
 
 	if (!exists) {
-		ret = tcf_idr_create(tn, parm->index, est, a,
+		ret = tcf_idr_create(tn, index, est, a,
 				     &act_vlan_ops, bind, true);
 		if (ret) {
-			tcf_idr_cleanup(tn, parm->index);
+			tcf_idr_cleanup(tn, index);
 			return ret;
 		}
 
-- 
1.8.3.1


^ permalink raw reply related

* Re: [PATCH v4 1/5] vsock/virtio: limit the memory used per-socket
From: Michael S. Tsirkin @ 2019-08-01 13:21 UTC (permalink / raw)
  To: Stefano Garzarella
  Cc: netdev, linux-kernel, Stefan Hajnoczi, David S. Miller,
	virtualization, Jason Wang, kvm
In-Reply-To: <20190801104754.lb3ju5xjfmnxioii@steredhat>

On Thu, Aug 01, 2019 at 12:47:54PM +0200, Stefano Garzarella wrote:
> On Tue, Jul 30, 2019 at 04:42:25PM -0400, Michael S. Tsirkin wrote:
> > On Tue, Jul 30, 2019 at 11:35:39AM +0200, Stefano Garzarella wrote:
> 
> (...)
> 
> > > 
> > > The problem here is the compatibility. Before this series virtio-vsock
> > > and vhost-vsock modules had the RX buffer size hard-coded
> > > (VIRTIO_VSOCK_DEFAULT_RX_BUF_SIZE = 4K). So, if we send a buffer smaller
> > > of 4K, there might be issues.
> > 
> > Shouldn't be if they are following the spec. If not let's fix
> > the broken parts.
> > 
> > > 
> > > Maybe it is the time to add add 'features' to virtio-vsock device.
> > > 
> > > Thanks,
> > > Stefano
> > 
> > Why would a remote care about buffer sizes?
> > 
> > Let's first see what the issues are. If they exist
> > we can either fix the bugs, or code the bug as a feature in spec.
> > 
> 
> The vhost_transport '.stream_enqueue' callback
> [virtio_transport_stream_enqueue()] calls the virtio_transport_send_pkt_info(),
> passing the user message. This function allocates a new packet, copying
> the user message, but (before this series) it limits the packet size to
> the VIRTIO_VSOCK_DEFAULT_RX_BUF_SIZE (4K):
> 
> static int virtio_transport_send_pkt_info(struct vsock_sock *vsk,
> 					  struct virtio_vsock_pkt_info *info)
> {
>  ...
> 	/* we can send less than pkt_len bytes */
> 	if (pkt_len > VIRTIO_VSOCK_DEFAULT_RX_BUF_SIZE)
> 		pkt_len = VIRTIO_VSOCK_DEFAULT_RX_BUF_SIZE;
> 
> 	/* virtio_transport_get_credit might return less than pkt_len credit */
> 	pkt_len = virtio_transport_get_credit(vvs, pkt_len);
> 
> 	/* Do not send zero length OP_RW pkt */
> 	if (pkt_len == 0 && info->op == VIRTIO_VSOCK_OP_RW)
> 		return pkt_len;
>  ...
> }
> 
> then it queues the packet for the TX worker calling .send_pkt()
> [vhost_transport_send_pkt() in the vhost_transport case]
> 
> The main function executed by the TX worker is
> vhost_transport_do_send_pkt() that picks up a buffer from the virtqueue
> and it tries to copy the packet (up to 4K) on it.  If the buffer
> allocated from the guest will be smaller then 4K, I think here it will
> be discarded with an error:
> 
> static void
> vhost_transport_do_send_pkt(struct vhost_vsock *vsock,
> 				struct vhost_virtqueue *vq)
> {
>  ...
> 		nbytes = copy_to_iter(pkt->buf, pkt->len, &iov_iter);

isn't pkt len the actual length though?

> 		if (nbytes != pkt->len) {
> 			virtio_transport_free_pkt(pkt);
> 			vq_err(vq, "Faulted on copying pkt buf\n");
> 			break;
> 		}
>  ...
> }
> 
> 
> This series changes this behavior since now we will split the packet in
> vhost_transport_do_send_pkt() depending on the buffer found in the
> virtqueue.
> 
> We didn't change the buffer size in this series, so we still backward
> compatible, but if we will use buffers smaller than 4K, we should
> encounter the error described above.
> 
> How do you suggest we proceed if we want to change the buffer size?
> Maybe adding a feature to "support any buffer size"?
> 
> Thanks,
> Stefano



^ permalink raw reply

* Re: [PATCH] tipc: compat: allow tipc commands without arguments
From: Ying Xue @ 2019-08-01 13:12 UTC (permalink / raw)
  To: Taras Kondratiuk, Jon Maloy, David S. Miller
  Cc: netdev, tipc-discussion, linux-kernel, xe-linux-external, stable
In-Reply-To: <20190729221507.48893-1-takondra@cisco.com>

On 7/30/19 6:15 AM, Taras Kondratiuk wrote:
> Commit 2753ca5d9009 ("tipc: fix uninit-value in tipc_nl_compat_doit")
> broke older tipc tools that use compat interface (e.g. tipc-config from
> tipcutils package):
> 
> % tipc-config -p
> operation not supported
> 
> The commit started to reject TIPC netlink compat messages that do not
> have attributes. It is too restrictive because some of such messages are
> valid (they don't need any arguments):
> 
> % grep 'tx none' include/uapi/linux/tipc_config.h
> #define  TIPC_CMD_NOOP              0x0000    /* tx none, rx none */
> #define  TIPC_CMD_GET_MEDIA_NAMES   0x0002    /* tx none, rx media_name(s) */
> #define  TIPC_CMD_GET_BEARER_NAMES  0x0003    /* tx none, rx bearer_name(s) */
> #define  TIPC_CMD_SHOW_PORTS        0x0006    /* tx none, rx ultra_string */
> #define  TIPC_CMD_GET_REMOTE_MNG    0x4003    /* tx none, rx unsigned */
> #define  TIPC_CMD_GET_MAX_PORTS     0x4004    /* tx none, rx unsigned */
> #define  TIPC_CMD_GET_NETID         0x400B    /* tx none, rx unsigned */
> #define  TIPC_CMD_NOT_NET_ADMIN     0xC001    /* tx none, rx none */
> 
> This patch relaxes the original fix and rejects messages without
> arguments only if such arguments are expected by a command (reg_type is
> non zero).
> 
> Fixes: 2753ca5d9009 ("tipc: fix uninit-value in tipc_nl_compat_doit")
> Cc: stable@vger.kernel.org
> Signed-off-by: Taras Kondratiuk <takondra@cisco.com>

Acked-by: Ying Xue <ying.xue@windriver.com>

> ---
> The patch is based on v5.3-rc2.
> 
>  net/tipc/netlink_compat.c | 11 +++++++----
>  1 file changed, 7 insertions(+), 4 deletions(-)
> 
> diff --git a/net/tipc/netlink_compat.c b/net/tipc/netlink_compat.c
> index d86030ef1232..e135d4e11231 100644
> --- a/net/tipc/netlink_compat.c
> +++ b/net/tipc/netlink_compat.c
> @@ -55,6 +55,7 @@ struct tipc_nl_compat_msg {
>  	int rep_type;
>  	int rep_size;
>  	int req_type;
> +	int req_size;
>  	struct net *net;
>  	struct sk_buff *rep;
>  	struct tlv_desc *req;
> @@ -257,7 +258,8 @@ static int tipc_nl_compat_dumpit(struct tipc_nl_compat_cmd_dump *cmd,
>  	int err;
>  	struct sk_buff *arg;
>  
> -	if (msg->req_type && !TLV_CHECK_TYPE(msg->req, msg->req_type))
> +	if (msg->req_type && (!msg->req_size ||
> +			      !TLV_CHECK_TYPE(msg->req, msg->req_type)))
>  		return -EINVAL;
>  
>  	msg->rep = tipc_tlv_alloc(msg->rep_size);
> @@ -354,7 +356,8 @@ static int tipc_nl_compat_doit(struct tipc_nl_compat_cmd_doit *cmd,
>  {
>  	int err;
>  
> -	if (msg->req_type && !TLV_CHECK_TYPE(msg->req, msg->req_type))
> +	if (msg->req_type && (!msg->req_size ||
> +			      !TLV_CHECK_TYPE(msg->req, msg->req_type)))
>  		return -EINVAL;
>  
>  	err = __tipc_nl_compat_doit(cmd, msg);
> @@ -1278,8 +1281,8 @@ static int tipc_nl_compat_recv(struct sk_buff *skb, struct genl_info *info)
>  		goto send;
>  	}
>  
> -	len = nlmsg_attrlen(req_nlh, GENL_HDRLEN + TIPC_GENL_HDRLEN);
> -	if (!len || !TLV_OK(msg.req, len)) {
> +	msg.req_size = nlmsg_attrlen(req_nlh, GENL_HDRLEN + TIPC_GENL_HDRLEN);
> +	if (msg.req_size && !TLV_OK(msg.req, msg.req_size)) {
>  		msg.rep = tipc_get_err_tlv(TIPC_CFG_NOT_SUPPORTED);
>  		err = -EOPNOTSUPP;
>  		goto send;
> 

^ permalink raw reply

* Re: [PATCH v4 1/5] vsock/virtio: limit the memory used per-socket
From: Stefano Garzarella @ 2019-08-01 13:36 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: netdev, linux-kernel, Stefan Hajnoczi, David S. Miller,
	virtualization, Jason Wang, kvm
In-Reply-To: <20190801091106-mutt-send-email-mst@kernel.org>

On Thu, Aug 01, 2019 at 09:21:15AM -0400, Michael S. Tsirkin wrote:
> On Thu, Aug 01, 2019 at 12:47:54PM +0200, Stefano Garzarella wrote:
> > On Tue, Jul 30, 2019 at 04:42:25PM -0400, Michael S. Tsirkin wrote:
> > > On Tue, Jul 30, 2019 at 11:35:39AM +0200, Stefano Garzarella wrote:
> > 
> > (...)
> > 
> > > > 
> > > > The problem here is the compatibility. Before this series virtio-vsock
> > > > and vhost-vsock modules had the RX buffer size hard-coded
> > > > (VIRTIO_VSOCK_DEFAULT_RX_BUF_SIZE = 4K). So, if we send a buffer smaller
> > > > of 4K, there might be issues.
> > > 
> > > Shouldn't be if they are following the spec. If not let's fix
> > > the broken parts.
> > > 
> > > > 
> > > > Maybe it is the time to add add 'features' to virtio-vsock device.
> > > > 
> > > > Thanks,
> > > > Stefano
> > > 
> > > Why would a remote care about buffer sizes?
> > > 
> > > Let's first see what the issues are. If they exist
> > > we can either fix the bugs, or code the bug as a feature in spec.
> > > 
> > 
> > The vhost_transport '.stream_enqueue' callback
> > [virtio_transport_stream_enqueue()] calls the virtio_transport_send_pkt_info(),
> > passing the user message. This function allocates a new packet, copying
> > the user message, but (before this series) it limits the packet size to
> > the VIRTIO_VSOCK_DEFAULT_RX_BUF_SIZE (4K):
> > 
> > static int virtio_transport_send_pkt_info(struct vsock_sock *vsk,
> > 					  struct virtio_vsock_pkt_info *info)
> > {
> >  ...
> > 	/* we can send less than pkt_len bytes */
> > 	if (pkt_len > VIRTIO_VSOCK_DEFAULT_RX_BUF_SIZE)
> > 		pkt_len = VIRTIO_VSOCK_DEFAULT_RX_BUF_SIZE;
> > 
> > 	/* virtio_transport_get_credit might return less than pkt_len credit */
> > 	pkt_len = virtio_transport_get_credit(vvs, pkt_len);
> > 
> > 	/* Do not send zero length OP_RW pkt */
> > 	if (pkt_len == 0 && info->op == VIRTIO_VSOCK_OP_RW)
> > 		return pkt_len;
> >  ...
> > }
> > 
> > then it queues the packet for the TX worker calling .send_pkt()
> > [vhost_transport_send_pkt() in the vhost_transport case]
> > 
> > The main function executed by the TX worker is
> > vhost_transport_do_send_pkt() that picks up a buffer from the virtqueue
> > and it tries to copy the packet (up to 4K) on it.  If the buffer
> > allocated from the guest will be smaller then 4K, I think here it will
> > be discarded with an error:
> > 

I'm adding more lines to explain better.

> > static void
> > vhost_transport_do_send_pkt(struct vhost_vsock *vsock,
> > 				struct vhost_virtqueue *vq)
> > {
		...

		head = vhost_get_vq_desc(vq, vq->iov, ARRAY_SIZE(vq->iov),
					 &out, &in, NULL, NULL);

		...

		len = iov_length(&vq->iov[out], in);
		iov_iter_init(&iov_iter, READ, &vq->iov[out], in, len);

		nbytes = copy_to_iter(&pkt->hdr, sizeof(pkt->hdr), &iov_iter);
		if (nbytes != sizeof(pkt->hdr)) {
			virtio_transport_free_pkt(pkt);
			vq_err(vq, "Faulted on copying pkt hdr\n");
			break;
		}

> >  ...
> > 		nbytes = copy_to_iter(pkt->buf, pkt->len, &iov_iter);
> 
> isn't pck len the actual length though?
> 

It is the length of the packet that we are copying into the guest RX
buffers pointed to by the iov_iter. The guest allocates an iovec with 2
buffers, one for the header and one for the payload (4KB).

> > 		if (nbytes != pkt->len) {
> > 			virtio_transport_free_pkt(pkt);
> > 			vq_err(vq, "Faulted on copying pkt buf\n");
> > 			break;
> > 		}
> >  ...
> > }
> > 
> > 
> > This series changes this behavior since now we will split the packet in
> > vhost_transport_do_send_pkt() depending on the buffer found in the
> > virtqueue.
> > 
> > We didn't change the buffer size in this series, so we still backward
> > compatible, but if we will use buffers smaller than 4K, we should
> > encounter the error described above.
> > 
> > How do you suggest we proceed if we want to change the buffer size?
> > Maybe adding a feature to "support any buffer size"?
> > 
> > Thanks,
> > Stefano
> 
> 

-- 

^ permalink raw reply

* Re: [PATCH] net: sched: use temporary variable for actions indexes
From: Dmytro Linkin @ 2019-08-01 13:39 UTC (permalink / raw)
  To: netdev; +Cc: davem, jiri, jhs, xiyou.wangcong, Vlad Buslov
In-Reply-To: <1564664571-31508-1-git-send-email-dmitrolin@mellanox.com>

On Thu, Aug 01, 2019 at 01:02:51PM +0000, dmitrolin@mellanox.com wrote:
> From: Dmytro Linkin <dmitrolin@mellanox.com>
> 
> Currently init call of all actions (except ipt) init their 'parm'
> structure as a direct pointer to nla data in skb. This leads to race
> condition when some of the filter actions were initialized successfully
> (and were assigned with idr action index that was written directly
> into nla data), but then were deleted and retried (due to following
> action module missing or classifier-initiated retry), in which case
> action init code tries to insert action to idr with index that was
> assigned on previous iteration. During retry the index can be reused
> by another action that was inserted concurrently, which causes
> unintended action sharing between filters.
> To fix the described race condition, save the action idr index to a temporary
> stack-allocated variable instead of nla data.
> 
> Fixes: 0190c1d452a9 ("net: sched: atomically check-allocate action")
> Signed-off-by: Dmytro Linkin <dmitrolin@mellanox.com>
> Signed-off-by: Vlad Buslov <vladbu@mellanox.com>
> ---

Hi,
Forgot to add target branch - net

Sincerely,
Dmytro

^ permalink raw reply

* [PATCH v2] net/mlx5e: always initialize frag->last_in_page
From: Qian Cai @ 2019-08-01 13:52 UTC (permalink / raw)
  To: davem; +Cc: saeedm, tariqt, netdev, linux-kernel, Qian Cai

Commit 069d11465a80 ("net/mlx5e: RX, Enhance legacy Receive Queue
memory scheme") introduced the undefined behaviour shown below, because
"frag->last_in_page" is only initialized in mlx5e_init_frags_partition()
when,

if (next_frag.offset + frag_info[f].frag_stride > PAGE_SIZE)

or after bailing out of the loop,

for (i = 0; i < mlx5_wq_cyc_get_size(&rq->wqe.wq); i++)

As a result, some "frag" entries could have an uninitialized
value of "last_in_page".

Later, get_frag() obtains those "frag" and "frag->last_in_page" is checked
in mlx5e_put_rx_frag(), which triggers the error during boot. Fix it by always
initializing "frag->last_in_page" to "false" in
mlx5e_init_frags_partition().

UBSAN: Undefined behaviour in
drivers/net/ethernet/mellanox/mlx5/core/en_rx.c:325:12
load of value 170 is not a valid value for type 'bool' (aka '_Bool')
Call trace:
 dump_backtrace+0x0/0x264
 show_stack+0x20/0x2c
 dump_stack+0xb0/0x104
 __ubsan_handle_load_invalid_value+0x104/0x128
 mlx5e_handle_rx_cqe+0x8e8/0x12cc [mlx5_core]
 mlx5e_poll_rx_cq+0xca8/0x1a94 [mlx5_core]
 mlx5e_napi_poll+0x17c/0xa30 [mlx5_core]
 net_rx_action+0x248/0x940
 __do_softirq+0x350/0x7b8
 irq_exit+0x200/0x26c
 __handle_domain_irq+0xc8/0x128
 gic_handle_irq+0x138/0x228
 el1_irq+0xb8/0x140
 arch_cpu_idle+0x1a4/0x348
 do_idle+0x114/0x1b0
 cpu_startup_entry+0x24/0x28
 rest_init+0x1ac/0x1dc
 arch_call_rest_init+0x10/0x18
 start_kernel+0x4d4/0x57c

Fixes: 069d11465a80 ("net/mlx5e: RX, Enhance legacy Receive Queue memory scheme")
Signed-off-by: Qian Cai <cai@lca.pw>
---

v2: zero-init the whole struct instead per Tariq.

 drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index 47eea6b3a1c3..e1810c03a510 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -331,12 +331,11 @@ static inline u64 mlx5e_get_mpwqe_offset(struct mlx5e_rq *rq, u16 wqe_ix)
 
 static void mlx5e_init_frags_partition(struct mlx5e_rq *rq)
 {
-	struct mlx5e_wqe_frag_info next_frag, *prev;
+	struct mlx5e_wqe_frag_info next_frag = {};
+	struct mlx5e_wqe_frag_info *prev = NULL;
 	int i;
 
 	next_frag.di = &rq->wqe.di[0];
-	next_frag.offset = 0;
-	prev = NULL;
 
 	for (i = 0; i < mlx5_wq_cyc_get_size(&rq->wqe.wq); i++) {
 		struct mlx5e_rq_frag_info *frag_info = &rq->wqe.info.arr[0];
-- 
1.8.3.1


^ permalink raw reply related

* Re: [PATCH v2 net-next] be2net: disable bh with spin_lock in be_process_mcc
From: Willem de Bruijn @ 2019-08-01 14:03 UTC (permalink / raw)
  To: Denis Kirjanov
  Cc: sathya.perla, ajit.khaparde, sriharsha.basavapatna,
	Network Development
In-Reply-To: <20190801092420.34502-1-dkirjanov@suse.com>

On Thu, Aug 1, 2019 at 5:24 AM Denis Kirjanov <kda@linux-powerpc.org> wrote:
>
> be_process_mcc() is invoked in 3 different places and
> always with BHs disabled except the be_poll function
> but since it's invoked from softirq with BHs
> disabled it won't hurt.

This describes the current state. What is the benefit of removing the
local_bh_disable/local_bh_enable pair from one caller (be_worker), but
not another (be_mcc_wait_compl), and then converting process_mcc to
disable bh itself with spin_lock_bh?

^ permalink raw reply

* Re: [net-next,rfc] net: bridge: mdb: Extend with multicast LLADDR
From: Nikolay Aleksandrov @ 2019-08-01 14:07 UTC (permalink / raw)
  To: Horatiu Vultur, idosch, andrew, allan.nielsen
  Cc: davem, roopa, petrm, tglx, fw, netdev, linux-kernel, bridge
In-Reply-To: <1564663840-27721-1-git-send-email-horatiu.vultur@microchip.com>

[-- Attachment #1: Type: text/plain, Size: 13857 bytes --]

Hi Horatiu,
Overall I think MDB is the right way, we'd like to contain the multicast code.
A few comments below.

On 01/08/2019 15:50, Horatiu Vultur wrote:
> Based on the discussion on the topic[1], we extend the functionality of
> the 'bridge mdb' command to accept link layer multicast address. This
> required only few changes and it fits nicely with the current
> implementation and also the old implementation was not changed.
> 
> In this patch, we have added a MAC address to the union in 'struct br_ip'.
> If we want to continue like this we should properly rename the structure as
> it is not an IP any more.
> 
> To create a group for two of the front ports the following entries can
> be added:
> bridge mdb add dev br0 port eth0 grp 01:00:00:00:00:04 permanent vid 1
> bridge mdb add dev br0 port eth1 grp 01:00:00:00:00:04 permanent vid 1
> 
> Now the entries will be display as following:
> dev br0 port eth0 grp 01:00:00:00:00:04 permanent offload vid 1
> dev br0 port eth1 grp 01:00:00:00:00:04 permanent offload vid 1
> 
> This requires changes to iproute2 as well, see the follogin patch for that.
> 
> Now if frame with dmac '01:00:00:00:00:04' will arrive at one of the front
> ports. If we have HW offload support, then the frame will be forwarded by
> the switch, and need not to go to the CPU. In a pure SW world, the frame is
> forwarded by the SW bridge, which will flooded it only the ports which are
> part of the group.
> 
> So far so good. This is an important part of the problem we wanted to solve.
> 
> But, there is one drawback of this approach. If you want to add two of the
> front ports and br0 to receive the frame then I can't see a way of doing it
> with the bridge mdb command. To do that it requireds many more changes to
> the existing code.
> 
> Example:
> bridge mdb add dev br0 port eth0 grp 01:00:00:00:00:04 permanent vid 1
> bridge mdb add dev br0 port eth1 grp 01:00:00:00:00:04 permanent vid 1
> bridge mdb add dev br0 port br0 grp 01:00:00:00:00:04 permanent vid 1 // This looks wrong.
> 
> We believe we come a long way by re-using the facilities in MDB (thanks for
> convincing us in doing this), but we are still not completely happy with
> the result.
> 

Just add a 'self' argument to the bridge mdb command, no need to specify it twice.

> If I only look at the user-interface (iproute2), and completely ignore all
> the implementation details, then I still think that the FDB sub command is
> more suitable for this. Today, FDB is for unicast, and MDB is for multicast.
> We could change this such that MDB is for IP-multicast, and FDB is
> forwarding in general (we should prevent FDB in install IP-multicast rules,
> but we suggest to allow it to install MAC-Multicast rules).
> 
> The example from above would now look like this:
> bridge fdb add 01:00:00:00:00:04 dev eth0 static self master
> bridge fdb add 01:00:00:00:00:04 dev eth1 static self master
> bridge fdb add 01:00:00:00:00:04 dev br0 static self master
> 
> It would be very similar to the "bridge vlan" command which also allow to
> specify groups with and without br0.
> 
> Next observation is on the hashing data structure. In 'net_bridge_mdb_entry'
> we only use/need the net_bridge_port_group/ports member (and the MAC
> address, which we hacked into the br_ip struct) when we are a L2-multicast
> entry. This type allow use to re-use the br_multicast_flood function
> which does a lot of the work for us.
> 
> Also, the key used to do the lookup in the FDB is already a MAC address
> (no need to hack the br_ip).
> 

Look at it as extending br_ip, it's not a hack but a valid mcast use-case.
In fact br_ip is an internal structure which can easily be renamed.

> Regarding the events generated by switchdev: In the current proposal this
> is a SWITCHDEV_OBJ_ID_PORT_MDB which refer to the switchdev_obj_port_mdb
> type. If we changed to use the SWITCHDEV_FDB_ADD_TO_BRIDGE event, then
> the associated data type would be switchdev_notifier_fdb_info - which also
> has the information we need.
> 
> Using the FDB database, can still reuse the net_bridge_port_group type (and
> associated functions), and I other parts from the MDB call graph as well.
> 

We don't want to mix these.

> If this sounds appealing, then we can do a proposal based on the idea.
> 
> If the MDB patch is what we can agree on, then we will continue polish this
> and look for a solution to control the inclusion/exclusion of the br0
> device (hints will be most appreciated).
> 

Yes, please. Let's work on this implementation. Some code comments below.

> [1] https://patchwork.ozlabs.org/patch/1136878/
> 
> Signed-off-by: Horatiu Vultur <horatiu.vultur@microchip.com>
> Co-developed-by: Allan W. Nielsen <allan.nielsen@microchip.com>
> Signed-off-by: Allan W. Nielsen <allan.nielsen@microchip.com>
> ---
>  include/linux/if_bridge.h      |  1 +
>  include/uapi/linux/if_bridge.h |  1 +
>  net/bridge/br_device.c         |  7 +++++--
>  net/bridge/br_forward.c        |  3 ++-
>  net/bridge/br_input.c          | 13 ++++++++++--
>  net/bridge/br_mdb.c            | 47 +++++++++++++++++++++++++++++++++++-------
>  net/bridge/br_multicast.c      |  4 +++-
>  net/bridge/br_private.h        |  3 ++-
>  8 files changed, 64 insertions(+), 15 deletions(-)
> 

Overall I don't think we need this BR_PKT_MULTICAST_L2; we could do the below much
more easily and without the checks if you use a per-mdb flag that says it's to be treated
as a MULTICAST_L2 entry. Then you remove all of the BR_PKT_MULTICAST_L2 code (see the
attached patch based on this one for example) and continue processing it as it is processed today.
We'll keep the fast-path with a minimal number of new conditionals.

Something like the patch I've attached to this reply; note that it is not complete,
it is just to show the intent. You'll have to re-work br_mdb_notify() to make it proper
and there are most probably other details I've missed. If you find an even better/less
complex way to do it then please do.

Cheers,
 Nik

> diff --git a/include/linux/if_bridge.h b/include/linux/if_bridge.h
> index f3fab5d..07b092a 100644
> --- a/include/linux/if_bridge.h
> +++ b/include/linux/if_bridge.h
> @@ -16,6 +16,7 @@
>  struct br_ip {
>  	union {
>  		__be32	ip4;
> +		__u8	mac[ETH_ALEN];
>  #if IS_ENABLED(CONFIG_IPV6)
>  		struct in6_addr ip6;
>  #endif
> diff --git a/include/uapi/linux/if_bridge.h b/include/uapi/linux/if_bridge.h
> index 773e476..e535a81 100644
> --- a/include/uapi/linux/if_bridge.h
> +++ b/include/uapi/linux/if_bridge.h
> @@ -243,6 +243,7 @@ struct br_mdb_entry {
>  		union {
>  			__be32	ip4;
>  			struct in6_addr ip6;
> +			__u8	mac[ETH_ALEN];
>  		} u;
>  		__be16		proto;
>  	} addr;
> diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c
> index c05def8..b2d9041 100644
> --- a/net/bridge/br_device.c
> +++ b/net/bridge/br_device.c
> @@ -83,7 +83,7 @@ netdev_tx_t br_dev_xmit(struct sk_buff *skb, struct net_device *dev)
>  		br_flood(br, skb, BR_PKT_BROADCAST, false, true);
>  	} else if (is_multicast_ether_addr(dest)) {
>  		if (unlikely(netpoll_tx_running(dev))) {
> -			br_flood(br, skb, BR_PKT_MULTICAST, false, true);
> +			br_flood(br, skb, BR_PKT_MULTICAST_IP, false, true);
>  			goto out;
>  		}
>  		if (br_multicast_rcv(br, NULL, skb, vid)) {
> @@ -95,8 +95,11 @@ netdev_tx_t br_dev_xmit(struct sk_buff *skb, struct net_device *dev)
>  		if ((mdst || BR_INPUT_SKB_CB_MROUTERS_ONLY(skb)) &&
>  		    br_multicast_querier_exists(br, eth_hdr(skb)))
>  			br_multicast_flood(mdst, skb, false, true);
> +		else if (mdst && skb->protocol != htons(ETH_P_IP) &&
> +			 skb->protocol != htons(ETH_P_IPV6))
> +			br_multicast_flood(mdst, skb, false, true);
>  		else
> -			br_flood(br, skb, BR_PKT_MULTICAST, false, true);
> +			br_flood(br, skb, BR_PKT_MULTICAST_IP, false, true);
>  	} else if ((dst = br_fdb_find_rcu(br, dest, vid)) != NULL) {
>  		br_forward(dst->dst, skb, false, true);
>  	} else {
> diff --git a/net/bridge/br_forward.c b/net/bridge/br_forward.c
> index 8663700..36b58e8 100644
> --- a/net/bridge/br_forward.c
> +++ b/net/bridge/br_forward.c
> @@ -203,7 +203,8 @@ void br_flood(struct net_bridge *br, struct sk_buff *skb,
>  			if (!(p->flags & BR_FLOOD))
>  				continue;
>  			break;
> -		case BR_PKT_MULTICAST:
> +		case BR_PKT_MULTICAST_IP:
> +		case BR_PKT_MULTICAST_L2:
>  			if (!(p->flags & BR_MCAST_FLOOD) && skb->dev != br->dev)
>  				continue;
>  			break;
> diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c
> index 21b74e7..a7db0c5 100644
> --- a/net/bridge/br_input.c
> +++ b/net/bridge/br_input.c
> @@ -99,9 +99,13 @@ int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb
>  			pkt_type = BR_PKT_BROADCAST;
>  			local_rcv = true;
>  		} else {
> -			pkt_type = BR_PKT_MULTICAST;
> +			pkt_type = BR_PKT_MULTICAST_IP;
>  			if (br_multicast_rcv(br, p, skb, vid))
>  				goto drop;
> +
> +			if (skb->protocol != htons(ETH_P_IP) &&
> +			    skb->protocol != htons(ETH_P_IPV6))
> +				pkt_type = BR_PKT_MULTICAST_L2;
>  		}
>  	}
>  
> @@ -129,7 +133,12 @@ int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb
>  	}
>  
>  	switch (pkt_type) {
> -	case BR_PKT_MULTICAST:
> +	case BR_PKT_MULTICAST_L2:
> +		mdst = br_mdb_get(br, skb, vid);
> +		if (mdst)
> +			mcast_hit = true;
> +		break;
> +	case BR_PKT_MULTICAST_IP:
>  		mdst = br_mdb_get(br, skb, vid);
>  		if ((mdst || BR_INPUT_SKB_CB_MROUTERS_ONLY(skb)) &&
>  		    br_multicast_querier_exists(br, eth_hdr(skb))) {
> diff --git a/net/bridge/br_mdb.c b/net/bridge/br_mdb.c
> index bf6acd3..b337a30 100644
> --- a/net/bridge/br_mdb.c
> +++ b/net/bridge/br_mdb.c
> @@ -67,12 +67,19 @@ static void __mdb_entry_to_br_ip(struct br_mdb_entry *entry, struct br_ip *ip)
>  	memset(ip, 0, sizeof(struct br_ip));
>  	ip->vid = entry->vid;
>  	ip->proto = entry->addr.proto;
> -	if (ip->proto == htons(ETH_P_IP))
> +	switch (ip->proto) {
> +	case htons(ETH_P_IP):
>  		ip->u.ip4 = entry->addr.u.ip4;
> +		break;
>  #if IS_ENABLED(CONFIG_IPV6)
> -	else
> +	case htons(ETH_P_IPV6):
>  		ip->u.ip6 = entry->addr.u.ip6;
> +		break;
>  #endif
> +	default:
> +		ether_addr_copy(ip->u.mac, entry->addr.u.mac);
> +		break;
> +	}
>  }
>  
>  static int br_mdb_fill_info(struct sk_buff *skb, struct netlink_callback *cb,
> @@ -117,12 +124,19 @@ static int br_mdb_fill_info(struct sk_buff *skb, struct netlink_callback *cb,
>  			e.ifindex = port->dev->ifindex;
>  			e.vid = p->addr.vid;
>  			__mdb_entry_fill_flags(&e, p->flags);
> -			if (p->addr.proto == htons(ETH_P_IP))
> +			switch (p->addr.proto) {
> +			case htons(ETH_P_IP):
>  				e.addr.u.ip4 = p->addr.u.ip4;
> +				break;
>  #if IS_ENABLED(CONFIG_IPV6)
> -			if (p->addr.proto == htons(ETH_P_IPV6))
> +			case htons(ETH_P_IPV6):
>  				e.addr.u.ip6 = p->addr.u.ip6;
> +				break;
>  #endif
> +			default:
> +				ether_addr_copy(e.addr.u.mac, p->addr.u.mac);
> +				break;
> +			}
>  			e.addr.proto = p->addr.proto;
>  			nest_ent = nla_nest_start_noflag(skb,
>  							 MDBA_MDB_ENTRY_INFO);
> @@ -322,12 +336,19 @@ static void br_mdb_switchdev_host_port(struct net_device *dev,
>  		.vid = entry->vid,
>  	};
>  
> -	if (entry->addr.proto == htons(ETH_P_IP))
> +	switch (entry->addr.proto) {
> +	case htons(ETH_P_IP):
>  		ip_eth_mc_map(entry->addr.u.ip4, mdb.addr);
> +		break;
>  #if IS_ENABLED(CONFIG_IPV6)
> -	else
> +	case htons(ETH_P_IPV6):
>  		ipv6_eth_mc_map(&entry->addr.u.ip6, mdb.addr);
> +		break;
>  #endif
> +	default:
> +		ether_addr_copy(mdb.addr, entry->addr.u.mac);
> +		break;
> +	}
>  
>  	mdb.obj.orig_dev = dev;
>  	switch (type) {
> @@ -367,12 +388,19 @@ static void __br_mdb_notify(struct net_device *dev, struct net_bridge_port *p,
>  	int err = -ENOBUFS;
>  
>  	port_dev = __dev_get_by_index(net, entry->ifindex);
> -	if (entry->addr.proto == htons(ETH_P_IP))
> +	switch (entry->addr.proto) {
> +	case htons(ETH_P_IP):
>  		ip_eth_mc_map(entry->addr.u.ip4, mdb.addr);
> +		break;
>  #if IS_ENABLED(CONFIG_IPV6)
> -	else
> +	case htons(ETH_P_IPV6):
>  		ipv6_eth_mc_map(&entry->addr.u.ip6, mdb.addr);
> +		break;
>  #endif
> +	default:
> +		ether_addr_copy(mdb.addr, entry->addr.u.mac);
> +		break;
> +	}
>  
>  	mdb.obj.orig_dev = port_dev;
>  	if (p && port_dev && type == RTM_NEWMDB) {
> @@ -423,6 +451,7 @@ void br_mdb_notify(struct net_device *dev, struct net_bridge_port *port,
>  #if IS_ENABLED(CONFIG_IPV6)
>  	entry.addr.u.ip6 = group->u.ip6;
>  #endif
> +	ether_addr_copy(group->u.mac, entry.addr.u.mac);
>  	entry.vid = group->vid;
>  	__mdb_entry_fill_flags(&entry, flags);
>  	__br_mdb_notify(dev, port, &entry, type);
> @@ -510,6 +539,8 @@ static bool is_valid_mdb_entry(struct br_mdb_entry *entry)
>  		if (ipv6_addr_is_ll_all_nodes(&entry->addr.u.ip6))
>  			return false;
>  #endif
> +	} else if (is_multicast_ether_addr(entry->addr.u.mac)) {
> +		;
>  	} else
>  		return false;
>  	if (entry->state != MDB_PERMANENT && entry->state != MDB_TEMPORARY)
> diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c
> index de22c8f..01250c1 100644
> --- a/net/bridge/br_multicast.c
> +++ b/net/bridge/br_multicast.c
> @@ -133,7 +133,9 @@ struct net_bridge_mdb_entry *br_mdb_get(struct net_bridge *br,
>  		break;
>  #endif
>  	default:
> -		return NULL;
> +		ip.proto = 0;
> +		ether_addr_copy(ip.u.mac, eth_hdr(skb)->h_dest);
> +		break;
>  	}
>  
>  	return br_mdb_ip_get_rcu(br, &ip);
> diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h
> index 159a0e2..60e2430d 100644
> --- a/net/bridge/br_private.h
> +++ b/net/bridge/br_private.h
> @@ -590,7 +590,8 @@ void br_fdb_offloaded_set(struct net_bridge *br, struct net_bridge_port *p,
>  /* br_forward.c */
>  enum br_pkt_type {
>  	BR_PKT_UNICAST,
> -	BR_PKT_MULTICAST,
> +	BR_PKT_MULTICAST_IP,
> +	BR_PKT_MULTICAST_L2,
>  	BR_PKT_BROADCAST
>  };
>  int br_dev_queue_push_xmit(struct net *net, struct sock *sk, struct sk_buff *skb);
> 


[-- Attachment #2: 0001-net-bridge-mdb-Extend-with-multicast-LLADDR.patch --]
[-- Type: text/x-patch, Size: 12289 bytes --]

From 611cb2250c06a22d08819bc2d3d67bb7a2867cc4 Mon Sep 17 00:00:00 2001
From: Horatiu Vultur <horatiu.vultur@microchip.com>
Date: Thu, 1 Aug 2019 14:50:40 +0200
Subject: [RFC incomplete] net: bridge: mdb: Extend with multicast LLADDR

Based on the discussion on the topic[1], we extend the functionality of
the 'bridge mdb' command to accept link layer multicast address. This
required only few changes and it fits nicely with the current
implementation and also the old implementation was not changed.

In this patch, we have added a MAC address to the union in 'struct br_ip'.
If we want to continue like this we should properly rename the structure as
it is not an IP any more.

To create a group for two of the front ports the following entries can
be added:
bridge mdb add dev br0 port eth0 grp 01:00:00:00:00:04 permanent vid 1
bridge mdb add dev br0 port eth1 grp 01:00:00:00:00:04 permanent vid 1

Now the entries will be displayed as follows:
dev br0 port eth0 grp 01:00:00:00:00:04 permanent offload vid 1
dev br0 port eth1 grp 01:00:00:00:00:04 permanent offload vid 1

This requires changes to iproute2 as well; see the following patch for that.

Now suppose a frame with dmac '01:00:00:00:00:04' arrives at one of the front
ports. If we have HW offload support, then the frame will be forwarded by
the switch, and need not go to the CPU. In a pure SW world, the frame is
forwarded by the SW bridge, which will flood it only to the ports which are
part of the group.

So far so good. This is an important part of the problem we wanted to solve.

But, there is one drawback of this approach. If you want to add two of the
front ports and br0 to receive the frame then I can't see a way of doing it
with the bridge mdb command. To do that requires many more changes to
the existing code.

Example:
bridge mdb add dev br0 port eth0 grp 01:00:00:00:00:04 permanent vid 1
bridge mdb add dev br0 port eth1 grp 01:00:00:00:00:04 permanent vid 1
bridge mdb add dev br0 port br0 grp 01:00:00:00:00:04 permanent vid 1 // This looks wrong.

We believe we come a long way by re-using the facilities in MDB (thanks for
convincing us in doing this), but we are still not completely happy with
the result.

If I only look at the user-interface (iproute2), and completely ignore all
the implementation details, then I still think that the FDB sub command is
more suitable for this. Today, FDB is for unicast, and MDB is for multicast.
We could change this such that MDB is for IP-multicast, and FDB is for
forwarding in general (we should prevent FDB from installing IP-multicast
rules, but we suggest allowing it to install MAC-multicast rules).

The example from above would now look like this:
bridge fdb add 01:00:00:00:00:04 dev eth0 static self master
bridge fdb add 01:00:00:00:00:04 dev eth1 static self master
bridge fdb add 01:00:00:00:00:04 dev br0 static self master

It would be very similar to the "bridge vlan" command which also allow to
specify groups with and without br0.

Next observation is on the hashing data structure. In 'net_bridge_mdb_entry'
we only use/need the net_bridge_port_group/ports member (and the MAC
address, which we hacked into the br_ip struct) when we are a L2-multicast
entry. This type allows us to re-use the br_multicast_flood function
which does a lot of the work for us.

Also, the key used to do the lookup in the FDB is already a MAC address
(no need to hack the br_ip).

Regarding the events generated by switchdev: In the current proposal this
is a SWITCHDEV_OBJ_ID_PORT_MDB which refer to the switchdev_obj_port_mdb
type. If we changed to use the SWITCHDEV_FDB_ADD_TO_BRIDGE event, then
the associated data type would be switchdev_notifier_fdb_info - which also
has the information we need.

Using the FDB database, we can still reuse the net_bridge_port_group type (and
associated functions), and other parts from the MDB call graph as well.

If this sounds appealing, then we can do a proposal based on the idea.

If the MDB patch is what we can agree on, then we will continue to polish this
and look for a solution to control the inclusion/exclusion of the br0
device (hints will be most appreciated).

[1] https://patchwork.ozlabs.org/patch/1136878/

Signed-off-by: Horatiu Vultur <horatiu.vultur@microchip.com>
Co-developed-by: Allan W. Nielsen <allan.nielsen@microchip.com>
Signed-off-by: Allan W. Nielsen <allan.nielsen@microchip.com>
---
 include/linux/if_bridge.h      |  1 +
 include/uapi/linux/if_bridge.h |  1 +
 net/bridge/br_device.c         |  7 +++--
 net/bridge/br_forward.c        |  3 ++-
 net/bridge/br_input.c          | 13 ++++++++--
 net/bridge/br_mdb.c            | 47 ++++++++++++++++++++++++++++------
 net/bridge/br_multicast.c      |  4 ++-
 net/bridge/br_private.h        |  3 ++-
 8 files changed, 64 insertions(+), 15 deletions(-)

diff --git a/include/linux/if_bridge.h b/include/linux/if_bridge.h
index 9e57c4411734..68f2558b1a23 100644
--- a/include/linux/if_bridge.h
+++ b/include/linux/if_bridge.h
@@ -16,6 +16,7 @@
 struct br_ip {
 	union {
 		__be32	ip4;
+		__u8	mac[ETH_ALEN];
 #if IS_ENABLED(CONFIG_IPV6)
 		struct in6_addr ip6;
 #endif
diff --git a/include/uapi/linux/if_bridge.h b/include/uapi/linux/if_bridge.h
index 1b3c2b643a02..49a2de0b7420 100644
--- a/include/uapi/linux/if_bridge.h
+++ b/include/uapi/linux/if_bridge.h
@@ -244,6 +244,7 @@ struct br_mdb_entry {
 		union {
 			__be32	ip4;
 			struct in6_addr ip6;
+			__u8	mac[ETH_ALEN];
 		} u;
 		__be16		proto;
 	} addr;
diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c
index 681b72862c16..581d6875cdb2 100644
--- a/net/bridge/br_device.c
+++ b/net/bridge/br_device.c
@@ -84,7 +84,7 @@ netdev_tx_t br_dev_xmit(struct sk_buff *skb, struct net_device *dev)
 		br_flood(br, skb, BR_PKT_BROADCAST, false, true);
 	} else if (is_multicast_ether_addr(dest)) {
 		if (unlikely(netpoll_tx_running(dev))) {
-			br_flood(br, skb, BR_PKT_MULTICAST, false, true);
+			br_flood(br, skb, BR_PKT_MULTICAST_IP, false, true);
 			goto out;
 		}
 		if (br_multicast_rcv(br, NULL, skb, vid)) {
@@ -96,8 +96,11 @@ netdev_tx_t br_dev_xmit(struct sk_buff *skb, struct net_device *dev)
 		if ((mdst || BR_INPUT_SKB_CB_MROUTERS_ONLY(skb)) &&
 		    br_multicast_querier_exists(br, eth_hdr(skb)))
 			br_multicast_flood(mdst, skb, false, true);
+		else if (mdst && skb->protocol != htons(ETH_P_IP) &&
+			 skb->protocol != htons(ETH_P_IPV6))
+			br_multicast_flood(mdst, skb, false, true);
 		else
-			br_flood(br, skb, BR_PKT_MULTICAST, false, true);
+			br_flood(br, skb, BR_PKT_MULTICAST_IP, false, true);
 	} else if ((dst = br_fdb_find_rcu(br, dest, vid)) != NULL) {
 		br_forward(dst->dst, skb, false, true);
 	} else {
diff --git a/net/bridge/br_forward.c b/net/bridge/br_forward.c
index 86637000f275..36b58e86d719 100644
--- a/net/bridge/br_forward.c
+++ b/net/bridge/br_forward.c
@@ -203,7 +203,8 @@ void br_flood(struct net_bridge *br, struct sk_buff *skb,
 			if (!(p->flags & BR_FLOOD))
 				continue;
 			break;
-		case BR_PKT_MULTICAST:
+		case BR_PKT_MULTICAST_IP:
+		case BR_PKT_MULTICAST_L2:
 			if (!(p->flags & BR_MCAST_FLOOD) && skb->dev != br->dev)
 				continue;
 			break;
diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c
index 09b1dd8cd853..f69e710a7f55 100644
--- a/net/bridge/br_input.c
+++ b/net/bridge/br_input.c
@@ -97,9 +97,13 @@ int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb
 			pkt_type = BR_PKT_BROADCAST;
 			local_rcv = true;
 		} else {
-			pkt_type = BR_PKT_MULTICAST;
+			pkt_type = BR_PKT_MULTICAST_IP;
 			if (br_multicast_rcv(br, p, skb, vid))
 				goto drop;
+
+			if (skb->protocol != htons(ETH_P_IP) &&
+			    skb->protocol != htons(ETH_P_IPV6))
+				pkt_type = BR_PKT_MULTICAST_L2;
 		}
 	}
 
@@ -127,7 +131,12 @@ int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb
 	}
 
 	switch (pkt_type) {
-	case BR_PKT_MULTICAST:
+	case BR_PKT_MULTICAST_L2:
+		mdst = br_mdb_get(br, skb, vid);
+		if (mdst)
+			mcast_hit = true;
+		break;
+	case BR_PKT_MULTICAST_IP:
 		mdst = br_mdb_get(br, skb, vid);
 		if ((mdst || BR_INPUT_SKB_CB_MROUTERS_ONLY(skb)) &&
 		    br_multicast_querier_exists(br, eth_hdr(skb))) {
diff --git a/net/bridge/br_mdb.c b/net/bridge/br_mdb.c
index 428af1abf8cc..25d182f45fde 100644
--- a/net/bridge/br_mdb.c
+++ b/net/bridge/br_mdb.c
@@ -69,12 +69,19 @@ static void __mdb_entry_to_br_ip(struct br_mdb_entry *entry, struct br_ip *ip)
 	memset(ip, 0, sizeof(struct br_ip));
 	ip->vid = entry->vid;
 	ip->proto = entry->addr.proto;
-	if (ip->proto == htons(ETH_P_IP))
+	switch (ip->proto) {
+	case htons(ETH_P_IP):
 		ip->u.ip4 = entry->addr.u.ip4;
+		break;
 #if IS_ENABLED(CONFIG_IPV6)
-	else
+	case htons(ETH_P_IPV6):
 		ip->u.ip6 = entry->addr.u.ip6;
+		break;
 #endif
+	default:
+		ether_addr_copy(ip->u.mac, entry->addr.u.mac);
+		break;
+	}
 }
 
 static int br_mdb_fill_info(struct sk_buff *skb, struct netlink_callback *cb,
@@ -119,12 +126,19 @@ static int br_mdb_fill_info(struct sk_buff *skb, struct netlink_callback *cb,
 			e.ifindex = port->dev->ifindex;
 			e.vid = p->addr.vid;
 			__mdb_entry_fill_flags(&e, p->flags);
-			if (p->addr.proto == htons(ETH_P_IP))
+			switch (p->addr.proto) {
+			case htons(ETH_P_IP):
 				e.addr.u.ip4 = p->addr.u.ip4;
+				break;
 #if IS_ENABLED(CONFIG_IPV6)
-			if (p->addr.proto == htons(ETH_P_IPV6))
+			case htons(ETH_P_IPV6):
 				e.addr.u.ip6 = p->addr.u.ip6;
+				break;
 #endif
+			default:
+				ether_addr_copy(e.addr.u.mac, p->addr.u.mac);
+				break;
+			}
 			e.addr.proto = p->addr.proto;
 			nest_ent = nla_nest_start_noflag(skb,
 							 MDBA_MDB_ENTRY_INFO);
@@ -324,12 +338,19 @@ static void br_mdb_switchdev_host_port(struct net_device *dev,
 		.vid = entry->vid,
 	};
 
-	if (entry->addr.proto == htons(ETH_P_IP))
+	switch (entry->addr.proto) {
+	case htons(ETH_P_IP):
 		ip_eth_mc_map(entry->addr.u.ip4, mdb.addr);
+		break;
 #if IS_ENABLED(CONFIG_IPV6)
-	else
+	case htons(ETH_P_IPV6):
 		ipv6_eth_mc_map(&entry->addr.u.ip6, mdb.addr);
+		break;
 #endif
+	default:
+		ether_addr_copy(mdb.addr, entry->addr.u.mac);
+		break;
+	}
 
 	mdb.obj.orig_dev = dev;
 	switch (type) {
@@ -369,12 +390,19 @@ static void __br_mdb_notify(struct net_device *dev, struct net_bridge_port *p,
 	int err = -ENOBUFS;
 
 	port_dev = __dev_get_by_index(net, entry->ifindex);
-	if (entry->addr.proto == htons(ETH_P_IP))
+	switch (entry->addr.proto) {
+	case htons(ETH_P_IP):
 		ip_eth_mc_map(entry->addr.u.ip4, mdb.addr);
+		break;
 #if IS_ENABLED(CONFIG_IPV6)
-	else
+	case htons(ETH_P_IPV6):
 		ipv6_eth_mc_map(&entry->addr.u.ip6, mdb.addr);
+		break;
 #endif
+	default:
+		ether_addr_copy(mdb.addr, entry->addr.u.mac);
+		break;
+	}
 
 	mdb.obj.orig_dev = port_dev;
 	if (p && port_dev && type == RTM_NEWMDB) {
@@ -425,6 +453,7 @@ void br_mdb_notify(struct net_device *dev, struct net_bridge_port *port,
 #if IS_ENABLED(CONFIG_IPV6)
 	entry.addr.u.ip6 = group->u.ip6;
 #endif
+	ether_addr_copy(group->u.mac, entry.addr.u.mac);
 	entry.vid = group->vid;
 	__mdb_entry_fill_flags(&entry, flags);
 	__br_mdb_notify(dev, port, &entry, type);
@@ -512,6 +541,8 @@ static bool is_valid_mdb_entry(struct br_mdb_entry *entry)
 		if (ipv6_addr_is_ll_all_nodes(&entry->addr.u.ip6))
 			return false;
 #endif
+	} else if (is_multicast_ether_addr(entry->addr.u.mac)) {
+		;
 	} else
 		return false;
 	if (entry->state != MDB_PERMANENT && entry->state != MDB_TEMPORARY)
diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c
index 3d4b2817687f..aa28a322ce9d 100644
--- a/net/bridge/br_multicast.c
+++ b/net/bridge/br_multicast.c
@@ -133,7 +133,9 @@ struct net_bridge_mdb_entry *br_mdb_get(struct net_bridge *br,
 		break;
 #endif
 	default:
-		return NULL;
+		ip.proto = 0;
+		ether_addr_copy(ip.u.mac, eth_hdr(skb)->h_dest);
+		break;
 	}
 
 	return br_mdb_ip_get_rcu(br, &ip);
diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h
index c4fd307fbfdc..1f2a880a9d17 100644
--- a/net/bridge/br_private.h
+++ b/net/bridge/br_private.h
@@ -592,7 +592,8 @@ void br_fdb_offloaded_set(struct net_bridge *br, struct net_bridge_port *p,
 /* br_forward.c */
 enum br_pkt_type {
 	BR_PKT_UNICAST,
-	BR_PKT_MULTICAST,
+	BR_PKT_MULTICAST_IP,
+	BR_PKT_MULTICAST_L2,
 	BR_PKT_BROADCAST
 };
 int br_dev_queue_push_xmit(struct net *net, struct sock *sk, struct sk_buff *skb);
-- 
2.21.0


^ permalink raw reply related

* Re: [PATCH net] ibmveth: use net_err_ratelimited when set_multicast_list
From: Hangbin Liu @ 2019-08-01 14:10 UTC (permalink / raw)
  To: Joe Perches; +Cc: netdev, Thomas Falcon, David S . Miller
In-Reply-To: <209f7fe62e2a79cd8c02b104b8e3babdd16bff30.camel@perches.com>

On Thu, Aug 01, 2019 at 03:28:43AM -0700, Joe Perches wrote:
> Perhaps add the netdev_<level>_ratelimited variants and use that instead
> 
> Something like:

Yes, that looks better. Do you mind if I take your code and add your
Signed-off-by info?

Thanks
Hangbin
> 
> ---
>  include/linux/netdevice.h | 54 +++++++++++++++++++++++++++++++++++++++++++++++
>  1 file changed, 54 insertions(+)
> 
> diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
> index 88292953aa6f..37116019e14f 100644
> --- a/include/linux/netdevice.h
> +++ b/include/linux/netdevice.h
> @@ -4737,6 +4737,60 @@ do {								\
>  #define netdev_info_once(dev, fmt, ...) \
>  	netdev_level_once(KERN_INFO, dev, fmt, ##__VA_ARGS__)
>  
> +#define netdev_level_ratelimited(netdev_level, dev, fmt, ...)		\
> +do {									\
> +	static DEFINE_RATELIMIT_STATE(_rs,				\
> +				      DEFAULT_RATELIMIT_INTERVAL,	\
> +				      DEFAULT_RATELIMIT_BURST);		\
> +	if (__ratelimit(&_rs))						\
> +		netdev_level(dev, fmt, ##__VA_ARGS__);			\
> +} while (0)
> +
> +#define netdev_emerg_ratelimited(dev, fmt, ...)				\
> +	netdev_level_ratelimited(netdev_emerg, dev, fmt, ##__VA_ARGS__)
> +#define netdev_alert_ratelimited(dev, fmt, ...)				\
> +	netdev_level_ratelimited(netdev_alert, dev, fmt, ##__VA_ARGS__)
> +#define netdev_crit_ratelimited(dev, fmt, ...)				\
> +	netdev_level_ratelimited(netdev_crit, dev, fmt, ##__VA_ARGS__)
> +#define netdev_err_ratelimited(dev, fmt, ...)				\
> +	netdev_level_ratelimited(netdev_err, dev, fmt, ##__VA_ARGS__)
> +#define netdev_warn_ratelimited(dev, fmt, ...)				\
> +	netdev_level_ratelimited(netdev_warn, dev, fmt, ##__VA_ARGS__)
> +#define netdev_notice_ratelimited(dev, fmt, ...)			\
> +	netdev_level_ratelimited(netdev_notice, dev, fmt, ##__VA_ARGS__)
> +#define netdev_info_ratelimited(dev, fmt, ...)				\
> +	netdev_level_ratelimited(netdev_info, dev, fmt, ##__VA_ARGS__)
> +
> +#if defined(CONFIG_DYNAMIC_DEBUG)
> +/* descriptor check is first to prevent flooding with "callbacks suppressed" */
> +#define netdev_dbg_ratelimited(dev, fmt, ...)				\
> +do {									\
> +	static DEFINE_RATELIMIT_STATE(_rs,				\
> +				      DEFAULT_RATELIMIT_INTERVAL,	\
> +				      DEFAULT_RATELIMIT_BURST);		\
> +	DEFINE_DYNAMIC_DEBUG_METADATA(descriptor, fmt);			\
> +	if (DYNAMIC_DEBUG_BRANCH(descriptor) &&				\
> +	    __ratelimit(&_rs))						\
> +		__dynamic_netdev_dbg(&descriptor, dev, fmt,		\
> +				     ##__VA_ARGS__);			\
> +} while (0)
> +#elif defined(DEBUG)
> +#define netdev_dbg_ratelimited(dev, fmt, ...)				\
> +do {									\
> +	static DEFINE_RATELIMIT_STATE(_rs,				\
> +				      DEFAULT_RATELIMIT_INTERVAL,	\
> +				      DEFAULT_RATELIMIT_BURST);		\
> +	if (__ratelimit(&_rs))						\
> +		netdev_printk(KERN_DEBUG, dev, fmt, ##__VA_ARGS__);	\
> +} while (0)
> +#else
> +#define netdev_dbg_ratelimited(dev, fmt, ...)				\
> +do {									\
> +	if (0)								\
> +		netdev_printk(KERN_DEBUG, dev, fmt, ##__VA_ARGS__);	\
> +} while (0)
> +#endif
> +
>  #define MODULE_ALIAS_NETDEV(device) \
>  	MODULE_ALIAS("netdev-" device)
>  
> 
> 

^ permalink raw reply

* Re: [net-next,rfc] net: bridge: mdb: Extend with multicast LLADDR
From: Nikolay Aleksandrov @ 2019-08-01 14:11 UTC (permalink / raw)
  To: Horatiu Vultur, idosch, andrew, allan.nielsen
  Cc: davem, roopa, petrm, tglx, fw, netdev, linux-kernel, bridge
In-Reply-To: <f758fdbf-4e0a-57b3-f13d-23e893ba7458@cumulusnetworks.com>

[-- Attachment #1: Type: text/plain, Size: 1750 bytes --]

On 01/08/2019 17:07, Nikolay Aleksandrov wrote:
> Hi Horatiu,
> Overall I think MDB is the right way, we'd like to contain the multicast code.
> A few comments below.
> 
> On 01/08/2019 15:50, Horatiu Vultur wrote:
[snip]
>>
>> Signed-off-by: Horatiu Vultur <horatiu.vultur@microchip.com>
>> Co-developed-by: Allan W. Nielsen <allan.nielsen@microchip.com>
>> Signed-off-by: Allan W. Nielsen <allan.nielsen@microchip.com>
>> ---
>>  include/linux/if_bridge.h      |  1 +
>>  include/uapi/linux/if_bridge.h |  1 +
>>  net/bridge/br_device.c         |  7 +++++--
>>  net/bridge/br_forward.c        |  3 ++-
>>  net/bridge/br_input.c          | 13 ++++++++++--
>>  net/bridge/br_mdb.c            | 47 +++++++++++++++++++++++++++++++++++-------
>>  net/bridge/br_multicast.c      |  4 +++-
>>  net/bridge/br_private.h        |  3 ++-
>>  8 files changed, 64 insertions(+), 15 deletions(-)
>>
> 
> Overall I don't think we need this BR_PKT_MULTICAST_L2, we could do the below much
> easier and without the checks if you use a per-mdb flag that says it's to be treated
> as a MULTICAST_L2 entry. Then you remove all of the BR_PKT_MULTICAST_L2 code (see the
> attached patch based on this one for example). and continue processing it as it is processed today.
> We'll keep the fast-path with minimal number of new conditionals.
> 
> Something like the patch I've attached to this reply, note that it is not complete
> just to show the intent, you'll have to re-work br_mdb_notify() to make it proper
> and there're most probably other details I've missed. If you find even better/less
> complex way to do it then please do.
> 
> Cheers,
>  Nik

Oops, I sent back your original patch. Here's the actually changed version
I was talking about.

Thanks,
 Nik




[-- Attachment #2: 0001-net-bridge-mdb-Extend-with-multicast-LLADDR.patch --]
[-- Type: text/x-patch, Size: 12962 bytes --]

From 01dbe0b22da96efcc6fbf46bd3b22353fca32f5d Mon Sep 17 00:00:00 2001
From: Horatiu Vultur <horatiu.vultur@microchip.com>
Date: Thu, 1 Aug 2019 14:50:40 +0200
Subject: [RFC incomplete] net: bridge: mdb: Extend with multicast LLADDR

Based on the discussion on the topic[1], we extend the functionality of
the 'bridge mdb' command to accept link layer multicast address. This
required only few changes and it fits nicely with the current
implementation and also the old implementation was not changed.

In this patch, we have added a MAC address to the union in 'struct br_ip'.
If we want to continue like this we should properly rename the structure as
it is not an IP any more.

To create a group for two of the front ports the following entries can
be added:
bridge mdb add dev br0 port eth0 grp 01:00:00:00:00:04 permanent vid 1
bridge mdb add dev br0 port eth1 grp 01:00:00:00:00:04 permanent vid 1

Now the entries will be displayed as follows:
dev br0 port eth0 grp 01:00:00:00:00:04 permanent offload vid 1
dev br0 port eth1 grp 01:00:00:00:00:04 permanent offload vid 1

This requires changes to iproute2 as well; see the following patch for that.

Now suppose a frame with dmac '01:00:00:00:00:04' arrives at one of the front
ports. If we have HW offload support, then the frame will be forwarded by
the switch, and need not go to the CPU. In a pure SW world, the frame is
forwarded by the SW bridge, which will flood it only to the ports which are
part of the group.

So far so good. This is an important part of the problem we wanted to solve.

But, there is one drawback of this approach. If you want to add two of the
front ports and br0 to receive the frame then I can't see a way of doing it
with the bridge mdb command. To do that requires many more changes to
the existing code.

Example:
bridge mdb add dev br0 port eth0 grp 01:00:00:00:00:04 permanent vid 1
bridge mdb add dev br0 port eth1 grp 01:00:00:00:00:04 permanent vid 1
bridge mdb add dev br0 port br0 grp 01:00:00:00:00:04 permanent vid 1 // This looks wrong.

We believe we come a long way by re-using the facilities in MDB (thanks for
convincing us in doing this), but we are still not completely happy with
the result.

If I only look at the user-interface (iproute2), and completely ignore all
the implementation details, then I still think that the FDB sub command is
more suitable for this. Today, FDB is for unicast, and MDB is for multicast.
We could change this such that MDB is for IP-multicast, and FDB is for
forwarding in general (we should prevent FDB from installing IP-multicast
rules, but we suggest allowing it to install MAC-multicast rules).

The example from above would now look like this:
bridge fdb add 01:00:00:00:00:04 dev eth0 static self master
bridge fdb add 01:00:00:00:00:04 dev eth1 static self master
bridge fdb add 01:00:00:00:00:04 dev br0 static self master

It would be very similar to the "bridge vlan" command which also allow to
specify groups with and without br0.

Next observation is on the hashing data structure. In 'net_bridge_mdb_entry'
we only use/need the net_bridge_port_group/ports member (and the MAC
address, which we hacked into the br_ip struct) when we are a L2-multicast
entry. This type allows us to re-use the br_multicast_flood function
which does a lot of the work for us.

Also, the key used to do the lookup in the FDB is already a MAC address
(no need to hack the br_ip).

Regarding the events generated by switchdev: In the current proposal this
is a SWITCHDEV_OBJ_ID_PORT_MDB which refer to the switchdev_obj_port_mdb
type. If we changed to use the SWITCHDEV_FDB_ADD_TO_BRIDGE event, then
the associated data type would be switchdev_notifier_fdb_info - which also
has the information we need.

Using the FDB database, we can still reuse the net_bridge_port_group type (and
associated functions), and other parts from the MDB call graph as well.

If this sounds appealing, then we can do a proposal based on the idea.

If the MDB patch is what we can agree on, then we will continue polish this
and look for a solution to control the inclusion/exclusion of the br0
device (hints will be most appreciated).

[1] https://patchwork.ozlabs.org/patch/1136878/

Signed-off-by: Horatiu Vultur <horatiu.vultur@microchip.com>
Co-developed-by: Allan W. Nielsen <allan.nielsen@microchip.com>
Signed-off-by: Allan W. Nielsen <allan.nielsen@microchip.com>
---
 include/linux/if_bridge.h      |  1 +
 include/uapi/linux/if_bridge.h |  2 ++
 net/bridge/br_device.c         |  2 +-
 net/bridge/br_input.c          |  2 +-
 net/bridge/br_mdb.c            | 58 ++++++++++++++++++++++++++++------
 net/bridge/br_multicast.c      |  6 ++--
 net/bridge/br_private.h        | 10 ++++--
 7 files changed, 64 insertions(+), 17 deletions(-)

diff --git a/include/linux/if_bridge.h b/include/linux/if_bridge.h
index 9e57c4411734..68f2558b1a23 100644
--- a/include/linux/if_bridge.h
+++ b/include/linux/if_bridge.h
@@ -16,6 +16,7 @@
 struct br_ip {
 	union {
 		__be32	ip4;
+		__u8	mac[ETH_ALEN];
 #if IS_ENABLED(CONFIG_IPV6)
 		struct in6_addr ip6;
 #endif
diff --git a/include/uapi/linux/if_bridge.h b/include/uapi/linux/if_bridge.h
index 1b3c2b643a02..50b4b481fac5 100644
--- a/include/uapi/linux/if_bridge.h
+++ b/include/uapi/linux/if_bridge.h
@@ -238,12 +238,14 @@ struct br_mdb_entry {
 	__u8 state;
 #define MDB_FLAGS_OFFLOAD	(1 << 0)
 #define MDB_FLAGS_FAST_LEAVE	(1 << 1)
+#define MDB_FLAGS_L2MCAST	(1 << 2)
 	__u8 flags;
 	__u16 vid;
 	struct {
 		union {
 			__be32	ip4;
 			struct in6_addr ip6;
+			__u8	mac[ETH_ALEN];
 		} u;
 		__be16		proto;
 	} addr;
diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c
index 681b72862c16..8ffceaac4c80 100644
--- a/net/bridge/br_device.c
+++ b/net/bridge/br_device.c
@@ -94,7 +94,7 @@ netdev_tx_t br_dev_xmit(struct sk_buff *skb, struct net_device *dev)
 
 		mdst = br_mdb_get(br, skb, vid);
 		if ((mdst || BR_INPUT_SKB_CB_MROUTERS_ONLY(skb)) &&
-		    br_multicast_querier_exists(br, eth_hdr(skb)))
+		    br_multicast_querier_exists(br, eth_hdr(skb), mdst))
 			br_multicast_flood(mdst, skb, false, true);
 		else
 			br_flood(br, skb, BR_PKT_MULTICAST, false, true);
diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c
index 09b1dd8cd853..331a1ee87c62 100644
--- a/net/bridge/br_input.c
+++ b/net/bridge/br_input.c
@@ -130,7 +130,7 @@ int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb
 	case BR_PKT_MULTICAST:
 		mdst = br_mdb_get(br, skb, vid);
 		if ((mdst || BR_INPUT_SKB_CB_MROUTERS_ONLY(skb)) &&
-		    br_multicast_querier_exists(br, eth_hdr(skb))) {
+		    br_multicast_querier_exists(br, eth_hdr(skb), mdst)) {
 			if ((mdst && mdst->host_joined) ||
 			    br_multicast_is_router(br)) {
 				local_rcv = true;
diff --git a/net/bridge/br_mdb.c b/net/bridge/br_mdb.c
index 428af1abf8cc..2da17b769760 100644
--- a/net/bridge/br_mdb.c
+++ b/net/bridge/br_mdb.c
@@ -62,6 +62,8 @@ static void __mdb_entry_fill_flags(struct br_mdb_entry *e, unsigned char flags)
 		e->flags |= MDB_FLAGS_OFFLOAD;
 	if (flags & MDB_PG_FLAGS_FAST_LEAVE)
 		e->flags |= MDB_FLAGS_FAST_LEAVE;
+	if (flags & MDB_PG_FLAGS_L2MCAST)
+		e->flags |= MDB_FLAGS_L2MCAST;
 }
 
 static void __mdb_entry_to_br_ip(struct br_mdb_entry *entry, struct br_ip *ip)
@@ -69,12 +71,19 @@ static void __mdb_entry_to_br_ip(struct br_mdb_entry *entry, struct br_ip *ip)
 	memset(ip, 0, sizeof(struct br_ip));
 	ip->vid = entry->vid;
 	ip->proto = entry->addr.proto;
-	if (ip->proto == htons(ETH_P_IP))
+	switch (ip->proto) {
+	case htons(ETH_P_IP):
 		ip->u.ip4 = entry->addr.u.ip4;
+		break;
 #if IS_ENABLED(CONFIG_IPV6)
-	else
+	case htons(ETH_P_IPV6):
 		ip->u.ip6 = entry->addr.u.ip6;
+		break;
 #endif
+	default:
+		ether_addr_copy(ip->u.mac, entry->addr.u.mac);
+		break;
+	}
 }
 
 static int br_mdb_fill_info(struct sk_buff *skb, struct netlink_callback *cb,
@@ -110,6 +119,7 @@ static int br_mdb_fill_info(struct sk_buff *skb, struct netlink_callback *cb,
 		      pp = &p->next) {
 			struct nlattr *nest_ent;
 			struct br_mdb_entry e;
+			u16 flags = p->flags;
 
 			port = p->port;
 			if (!port)
@@ -118,13 +128,22 @@ static int br_mdb_fill_info(struct sk_buff *skb, struct netlink_callback *cb,
 			memset(&e, 0, sizeof(e));
 			e.ifindex = port->dev->ifindex;
 			e.vid = p->addr.vid;
-			__mdb_entry_fill_flags(&e, p->flags);
-			if (p->addr.proto == htons(ETH_P_IP))
+			if (mp->l2_mcast)
+				flags |= MDB_PG_FLAGS_L2MCAST;
+			__mdb_entry_fill_flags(&e, flags);
+			switch (p->addr.proto) {
+			case htons(ETH_P_IP):
 				e.addr.u.ip4 = p->addr.u.ip4;
+				break;
 #if IS_ENABLED(CONFIG_IPV6)
-			if (p->addr.proto == htons(ETH_P_IPV6))
+			case htons(ETH_P_IPV6):
 				e.addr.u.ip6 = p->addr.u.ip6;
+				break;
 #endif
+			default:
+				ether_addr_copy(e.addr.u.mac, p->addr.u.mac);
+				break;
+			}
 			e.addr.proto = p->addr.proto;
 			nest_ent = nla_nest_start_noflag(skb,
 							 MDBA_MDB_ENTRY_INFO);
@@ -324,12 +343,19 @@ static void br_mdb_switchdev_host_port(struct net_device *dev,
 		.vid = entry->vid,
 	};
 
-	if (entry->addr.proto == htons(ETH_P_IP))
+	switch (entry->addr.proto) {
+	case htons(ETH_P_IP):
 		ip_eth_mc_map(entry->addr.u.ip4, mdb.addr);
+		break;
 #if IS_ENABLED(CONFIG_IPV6)
-	else
+	case htons(ETH_P_IPV6):
 		ipv6_eth_mc_map(&entry->addr.u.ip6, mdb.addr);
+		break;
 #endif
+	default:
+		ether_addr_copy(mdb.addr, entry->addr.u.mac);
+		break;
+	}
 
 	mdb.obj.orig_dev = dev;
 	switch (type) {
@@ -369,12 +395,19 @@ static void __br_mdb_notify(struct net_device *dev, struct net_bridge_port *p,
 	int err = -ENOBUFS;
 
 	port_dev = __dev_get_by_index(net, entry->ifindex);
-	if (entry->addr.proto == htons(ETH_P_IP))
+	switch (entry->addr.proto) {
+	case htons(ETH_P_IP):
 		ip_eth_mc_map(entry->addr.u.ip4, mdb.addr);
+		break;
 #if IS_ENABLED(CONFIG_IPV6)
-	else
+	case htons(ETH_P_IPV6):
 		ipv6_eth_mc_map(&entry->addr.u.ip6, mdb.addr);
+		break;
 #endif
+	default:
+		ether_addr_copy(mdb.addr, entry->addr.u.mac);
+		break;
+	}
 
 	mdb.obj.orig_dev = port_dev;
 	if (p && port_dev && type == RTM_NEWMDB) {
@@ -425,6 +458,7 @@ void br_mdb_notify(struct net_device *dev, struct net_bridge_port *port,
 #if IS_ENABLED(CONFIG_IPV6)
 	entry.addr.u.ip6 = group->u.ip6;
 #endif
+	ether_addr_copy(group->u.mac, entry.addr.u.mac);
 	entry.vid = group->vid;
 	__mdb_entry_fill_flags(&entry, flags);
 	__br_mdb_notify(dev, port, &entry, type);
@@ -512,8 +546,12 @@ static bool is_valid_mdb_entry(struct br_mdb_entry *entry)
 		if (ipv6_addr_is_ll_all_nodes(&entry->addr.u.ip6))
 			return false;
 #endif
-	} else
+	} else if ((entry->flags & MDB_FLAGS_L2MCAST) &&
+		   !is_multicast_ether_addr(entry->addr.u.mac)) {
 		return false;
+	} else {
+		return false;
+	}
 	if (entry->state != MDB_PERMANENT && entry->state != MDB_TEMPORARY)
 		return false;
 	if (entry->vid >= VLAN_VID_MASK)
diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c
index 3d4b2817687f..43cfc8b18765 100644
--- a/net/bridge/br_multicast.c
+++ b/net/bridge/br_multicast.c
@@ -133,7 +133,9 @@ struct net_bridge_mdb_entry *br_mdb_get(struct net_bridge *br,
 		break;
 #endif
 	default:
-		return NULL;
+		ip.proto = 0;
+		ether_addr_copy(ip.u.mac, eth_hdr(skb)->h_dest);
+		break;
 	}
 
 	return br_mdb_ip_get_rcu(br, &ip);
@@ -2233,7 +2235,7 @@ bool br_multicast_has_querier_anywhere(struct net_device *dev, int proto)
 	memset(&eth, 0, sizeof(eth));
 	eth.h_proto = htons(proto);
 
-	ret = br_multicast_querier_exists(br, &eth);
+	ret = br_multicast_querier_exists(br, &eth, NULL);
 
 unlock:
 	rcu_read_unlock();
diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h
index c4fd307fbfdc..be9e5f327c86 100644
--- a/net/bridge/br_private.h
+++ b/net/bridge/br_private.h
@@ -200,6 +200,7 @@ struct net_bridge_fdb_entry {
 #define MDB_PG_FLAGS_PERMANENT	BIT(0)
 #define MDB_PG_FLAGS_OFFLOAD	BIT(1)
 #define MDB_PG_FLAGS_FAST_LEAVE	BIT(2)
+#define MDB_PG_FLAGS_L2MCAST	BIT(3)
 
 struct net_bridge_port_group {
 	struct net_bridge_port		*port;
@@ -220,6 +221,7 @@ struct net_bridge_mdb_entry {
 	struct timer_list		timer;
 	struct br_ip			addr;
 	bool				host_joined;
+	bool				l2_mcast;
 	struct hlist_node		mdb_node;
 };
 
@@ -734,7 +736,8 @@ __br_multicast_querier_exists(struct net_bridge *br,
 }
 
 static inline bool br_multicast_querier_exists(struct net_bridge *br,
-					       struct ethhdr *eth)
+					       struct ethhdr *eth,
+					       struct net_bridge_mdb_entry *dst)
 {
 	switch (eth->h_proto) {
 	case (htons(ETH_P_IP)):
@@ -746,7 +749,7 @@ static inline bool br_multicast_querier_exists(struct net_bridge *br,
 			&br->ip6_other_query, true);
 #endif
 	default:
-		return false;
+		return !!(dst && dst->l2_mcast);
 	}
 }
 
@@ -814,7 +817,8 @@ static inline bool br_multicast_is_router(struct net_bridge *br)
 }
 
 static inline bool br_multicast_querier_exists(struct net_bridge *br,
-					       struct ethhdr *eth)
+					       struct ethhdr *eth,
+					       struct net_bridge_mdb_entry *dst)
 {
 	return false;
 }
-- 
2.21.0


^ permalink raw reply related

* Re: [PATCH V2 7/9] vhost: do not use RCU to synchronize MMU notifier with worker
From: Jason Gunthorpe @ 2019-08-01 14:15 UTC (permalink / raw)
  To: Jason Wang; +Cc: mst, kvm, virtualization, netdev, linux-kernel, linux-mm
In-Reply-To: <a3bde826-6329-68e4-2826-8a9de4c5bd1e@redhat.com>

On Thu, Aug 01, 2019 at 01:02:18PM +0800, Jason Wang wrote:
> 
> On 2019/8/1 上午3:30, Jason Gunthorpe wrote:
> > On Wed, Jul 31, 2019 at 09:28:20PM +0800, Jason Wang wrote:
> > > On 2019/7/31 下午8:39, Jason Gunthorpe wrote:
> > > > On Wed, Jul 31, 2019 at 04:46:53AM -0400, Jason Wang wrote:
> > > > > > We used to use RCU to synchronize MMU notifier with worker. This leads to
> > > > > > calling synchronize_rcu() in invalidate_range_start(). But on a busy
> > > > > system, there would be many factors that may slow down the
> > > > > synchronize_rcu() which makes it unsuitable to be called in MMU
> > > > > notifier.
> > > > > 
> > > > > A solution is SRCU but its overhead is obvious with the expensive full
> > > > > memory barrier. Another choice is to use seqlock, but it doesn't
> > > > > provide a synchronization method between readers and writers. The last
> > > > > > choice is to use vq mutex, but it needs to deal with the worst case
> > > > > > where the MMU notifier must block and wait for swap-in to finish.
> > > > > 
> > > > > > So this patch switches to using a counter to track whether or not the map
> > > > > > is in use. The counter is incremented when the vq starts or finishes
> > > > > > using the map. This means that when it is even, we're sure there are no
> > > > > > readers and the MMU notifier is synchronized. When it is odd, there is
> > > > > > a reader, and we need to wait for it to become even again before we are
> > > > > > synchronized.
> > > > You just described a seqlock.
> > > 
> > > Kind of, see my explanation below.
> > > 
> > > 
> > > > We've been talking about providing this as some core service from mmu
> > > > notifiers because nearly every use of this API needs it.
> > > 
> > > That would be very helpful.
> > > 
> > > 
> > > > IMHO this gets the whole thing backwards, the common pattern is to
> > > > protect the 'shadow pte' data with a seqlock (usually open coded),
> > > > > such that the mmu notifier side has the write side of that lock and
> > > > the read side is consumed by the thread accessing or updating the SPTE.
> > > 
> > > > Yes, I've considered something like that. But the problem is, the mmu notifier
> > > > (writer) needs to wait for the vhost worker to finish the read before it can
> > > > do things like setting dirty pages and unmapping pages.  It looks to me
> > > > like seqlock doesn't provide things like this.
> > The seqlock is usually used to prevent a 2nd thread from accessing the
> > VA while it is being changed by the mm. ie you use something seqlocky
> > instead of the ugly mmu_notifier_unregister/register cycle.
> 
> 
> Yes, so we have two mappings:
> 
> [1] vring address to VA
> [2] VA to PA
> 
> And have several readers and writers
> 
> 1) set_vring_num_addr(): writer of both [1] and [2]
> 2) MMU notifier: reader of [1] writer of [2]
> 3) GUP: reader of [1] writer of [2]
> 4) memory accessors: reader of [1] and [2]
> 
> Fortunately, 1) 3) and 4) have already synchronized through vq->mutex. We
> only need to deal with synchronization between 2) and each of the rest:
> Sync between 1) and 2): For mapping [1], I do
> mmu_notifier_unregister/register. This helps to avoid holding any lock to do
> the overlap check.

I suspect you could have done this with a RCU technique instead of
register/unregister.

> Sync between 2) and 4): For mapping [1], both are readers, so no
> synchronization is needed. For mapping [2], synchronize through RCU (or something
> similar to seqlock).

You can't really use a seqlock, seqlocks are collision-retry locks,
and the semantic here is that invalidate_range_start *MUST* not
continue until the thread doing #4 above is guaranteed to no longer be
touching the memory.

This must be a proper barrier, like a spinlock, mutex, or
synchronize_rcu.

And, again, you can't re-invent a spinlock with open coding and get
something better.

Jason

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox