Netdev List
 help / color / mirror / Atom feed
* [PATCH net-next v4 5/8] dsa: port: Ignore bridge VLAN events
From: Petr Machata @ 2018-05-30  0:59 UTC (permalink / raw)
  To: netdev, devel, bridge
  Cc: jiri, idosch, davem, razvan.stefanescu, gregkh, stephen, andrew,
	vivien.didelot, f.fainelli, nikolay, dan.carpenter
In-Reply-To: <cover.1527641426.git.petrm@mellanox.com>

A follow-up patch enables emitting VLAN notifications for the bridge CPU
port in addition to the existing slave port notifications. These
notifications have orig_dev set to the bridge in question.

Because there's no specific support for these VLANs, just ignore the
notifications to maintain the current behavior.

Signed-off-by: Petr Machata <petrm@mellanox.com>
Reviewed-by: Vivien Didelot <vivien.didelot@savoirfairelinux.com>
---
 net/dsa/port.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/net/dsa/port.c b/net/dsa/port.c
index 2413beb..ed05954 100644
--- a/net/dsa/port.c
+++ b/net/dsa/port.c
@@ -252,6 +252,9 @@ int dsa_port_vlan_add(struct dsa_port *dp,
 		.vlan = vlan,
 	};
 
+	if (netif_is_bridge_master(vlan->obj.orig_dev))
+		return -EOPNOTSUPP;
+
 	if (br_vlan_enabled(dp->bridge_dev))
 		return dsa_port_notify(dp, DSA_NOTIFIER_VLAN_ADD, &info);
 
@@ -267,6 +270,9 @@ int dsa_port_vlan_del(struct dsa_port *dp,
 		.vlan = vlan,
 	};
 
+	if (netif_is_bridge_master(vlan->obj.orig_dev))
+		return -EOPNOTSUPP;
+
 	if (br_vlan_enabled(dp->bridge_dev))
 		return dsa_port_notify(dp, DSA_NOTIFIER_VLAN_DEL, &info);
 
-- 
2.4.11

^ permalink raw reply related

* [PATCH net-next v4 7/8] net: bridge: Notify about bridge VLANs
From: Petr Machata @ 2018-05-30  1:00 UTC (permalink / raw)
  To: netdev, devel, bridge
  Cc: f.fainelli, andrew, nikolay, gregkh, vivien.didelot, idosch, jiri,
	razvan.stefanescu, davem, dan.carpenter
In-Reply-To: <cover.1527641426.git.petrm@mellanox.com>

A driver might need to react to changes in settings of brentry VLANs.
Therefore send switchdev port notifications for these as well. Reuse
SWITCHDEV_OBJ_ID_PORT_VLAN for this purpose. Listeners should use
netif_is_bridge_master() on orig_dev to determine whether the
notification is about a bridge port or a bridge.

Signed-off-by: Petr Machata <petrm@mellanox.com>
---
 net/bridge/br_vlan.c | 28 +++++++++++++++++++++++++---
 1 file changed, 25 insertions(+), 3 deletions(-)

diff --git a/net/bridge/br_vlan.c b/net/bridge/br_vlan.c
index 602c869..7df2690 100644
--- a/net/bridge/br_vlan.c
+++ b/net/bridge/br_vlan.c
@@ -246,6 +246,10 @@ static int __vlan_add(struct net_bridge_vlan *v, u16 flags)
 			goto out_filt;
 		v->brvlan = masterv;
 		v->stats = masterv->stats;
+	} else {
+		err = br_switchdev_port_vlan_add(dev, v->vid, flags);
+		if (err && err != -EOPNOTSUPP)
+			goto out;
 	}
 
 	/* Add the dev mac and count the vlan only if it's usable */
@@ -281,6 +285,8 @@ static int __vlan_add(struct net_bridge_vlan *v, u16 flags)
 			br_vlan_put_master(masterv);
 			v->brvlan = NULL;
 		}
+	} else {
+		br_switchdev_port_vlan_del(dev, v->vid);
 	}
 
 	goto out;
@@ -306,6 +312,11 @@ static int __vlan_del(struct net_bridge_vlan *v)
 		err = __vlan_vid_del(p->dev, p->br, v->vid);
 		if (err)
 			goto out;
+	} else {
+		err = br_switchdev_port_vlan_del(v->br->dev, v->vid);
+		if (err && err != -EOPNOTSUPP)
+			goto out;
+		err = 0;
 	}
 
 	if (br_vlan_should_use(v)) {
@@ -558,16 +569,22 @@ static int br_vlan_add_existing(struct net_bridge *br,
 {
 	int err;
 
+	err = br_switchdev_port_vlan_add(br->dev, vlan->vid, flags);
+	if (err && err != -EOPNOTSUPP)
+		return err;
+
 	if (!br_vlan_is_brentry(vlan)) {
 		/* Trying to change flags of non-existent bridge vlan */
-		if (!(flags & BRIDGE_VLAN_INFO_BRENTRY))
-			return -EINVAL;
+		if (!(flags & BRIDGE_VLAN_INFO_BRENTRY)) {
+			err = -EINVAL;
+			goto err_flags;
+		}
 		/* It was only kept for port vlans, now make it real */
 		err = br_fdb_insert(br, NULL, br->dev->dev_addr,
 				    vlan->vid);
 		if (err) {
 			br_err(br, "failed to insert local address into bridge forwarding table\n");
-			return err;
+			goto err_fdb_insert;
 		}
 
 		refcount_inc(&vlan->refcnt);
@@ -580,6 +597,11 @@ static int br_vlan_add_existing(struct net_bridge *br,
 		*changed = true;
 
 	return 0;
+
+err_fdb_insert:
+err_flags:
+	br_switchdev_port_vlan_del(br->dev, vlan->vid);
+	return err;
 }
 
 /* Must be protected by RTNL.
-- 
2.4.11

^ permalink raw reply related

* [PATCH net-next v4 6/8] staging: fsl-dpaa2: ethsw: Ignore bridge VLAN events
From: Petr Machata @ 2018-05-30  1:00 UTC (permalink / raw)
  To: netdev, devel, bridge
  Cc: jiri, idosch, davem, razvan.stefanescu, gregkh, stephen, andrew,
	vivien.didelot, f.fainelli, nikolay, dan.carpenter
In-Reply-To: <cover.1527641426.git.petrm@mellanox.com>

A follow-up patch enables emitting VLAN notifications for the bridge CPU
port in addition to the existing slave port notifications. These
notifications have orig_dev set to the bridge in question.

Because there's no specific support for these VLANs, just ignore the
notifications to maintain the current behavior.

Signed-off-by: Petr Machata <petrm@mellanox.com>
---
 drivers/staging/fsl-dpaa2/ethsw/ethsw.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/drivers/staging/fsl-dpaa2/ethsw/ethsw.c b/drivers/staging/fsl-dpaa2/ethsw/ethsw.c
index c723a04..a17dd29 100644
--- a/drivers/staging/fsl-dpaa2/ethsw/ethsw.c
+++ b/drivers/staging/fsl-dpaa2/ethsw/ethsw.c
@@ -719,6 +719,9 @@ static int port_vlans_add(struct net_device *netdev,
 	struct ethsw_port_priv *port_priv = netdev_priv(netdev);
 	int vid, err;
 
+	if (netif_is_bridge_master(vlan->obj.orig_dev))
+		return -EOPNOTSUPP;
+
 	if (switchdev_trans_ph_prepare(trans))
 		return 0;
 
@@ -873,6 +876,9 @@ static int port_vlans_del(struct net_device *netdev,
 	struct ethsw_port_priv *port_priv = netdev_priv(netdev);
 	int vid, err;
 
+	if (netif_is_bridge_master(vlan->obj.orig_dev))
+		return -EOPNOTSUPP;
+
 	for (vid = vlan->vid_begin; vid <= vlan->vid_end; vid++) {
 		err = ethsw_port_del_vlan(port_priv, vid);
 		if (err)
-- 
2.4.11

^ permalink raw reply related

* [PATCH net-next v4 8/8] mlxsw: spectrum_switchdev: Schedule respin during trans prepare
From: Petr Machata @ 2018-05-30  1:00 UTC (permalink / raw)
  To: netdev, devel, bridge
  Cc: f.fainelli, andrew, nikolay, gregkh, vivien.didelot, idosch, jiri,
	razvan.stefanescu, davem, dan.carpenter
In-Reply-To: <cover.1527641426.git.petrm@mellanox.com>

Since there's no special support for the bridge events, the driver
returns -EOPNOTSUPP, and thus the commit never happens. Therefore
schedule respin during the prepare stage: there's no real difference one
way or another.

This fixes the problem that mirror-to-gretap offload wouldn't adapt to
changes in bridge vlan configuration right away and another notification
would have to arrive for mlxsw to catch up.

Signed-off-by: Petr Machata <petrm@mellanox.com>
Reviewed-by: Ido Schimmel <idosch@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c
index cbc8fab..8a15ac4 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c
@@ -1697,7 +1697,7 @@ static int mlxsw_sp_port_obj_add(struct net_device *dev,
 		vlan = SWITCHDEV_OBJ_PORT_VLAN(obj);
 		err = mlxsw_sp_port_vlans_add(mlxsw_sp_port, vlan, trans);
 
-		if (switchdev_trans_ph_commit(trans)) {
+		if (switchdev_trans_ph_prepare(trans)) {
 			/* The event is emitted before the changes are actually
 			 * applied to the bridge. Therefore schedule the respin
 			 * call for later, so that the respin logic sees the
-- 
2.4.11

^ permalink raw reply related

* Re: [PATCH mlx5-next 1/2] net/mlx5: Add temperature warning event to log
From: Andrew Lunn @ 2018-05-30  1:04 UTC (permalink / raw)
  To: Saeed Mahameed
  Cc: netdev, linux-rdma, Leon Romanovsky, Jason Gunthorpe, Ilan Tayari,
	Adi Nissim
In-Reply-To: <20180530001954.12000-2-saeedm@mellanox.com>

On Tue, May 29, 2018 at 05:19:53PM -0700, Saeed Mahameed wrote:
> From: Ilan Tayari <ilant@mellanox.com>
> 
> Temperature warning event is sent by FW to indicate high temperature
> as detected by one of the sensors on the board.
> Add handling of this event by writing the numbers of the alert sensors
> to the kernel log.

Hi Saaed

Is the temperature itself available? If so, it would be better to
expose this as a hwmon device per temperature sensor.

       Andrew

^ permalink raw reply

* Re: [PATCH mlx5-next 2/2] net/mlx5: Add FPGA QP error event
From: Andrew Lunn @ 2018-05-30  1:07 UTC (permalink / raw)
  To: Saeed Mahameed
  Cc: netdev, linux-rdma, Leon Romanovsky, Jason Gunthorpe, Ilan Tayari,
	Adi Nissim
In-Reply-To: <20180530001954.12000-3-saeedm@mellanox.com>

On Tue, May 29, 2018 at 05:19:54PM -0700, Saeed Mahameed wrote:
> From: Ilan Tayari <ilant@mellanox.com>
> 
> The FPGA QP event fires whenever a QP on the FPGA trasitions
> to the error state.

FPGA i know, field programmable gate array. Could you offer some clue
as to what QP means?

   Thanks
	Andrew

^ permalink raw reply

* [PATCH v2] Revert "alx: remove WoL support"
From: AceLan Kao @ 2018-05-30  2:10 UTC (permalink / raw)
  To: Jay Cliburn, Chris Snook, David S . Miller, Rakesh Pandit, netdev,
	Emily Chien, Andrew Lunn, Johannes Berg, Johannes Stezenbach,
	linux-kernel

This reverts commit bc2bebe8de8ed4ba6482c9cc370b0dd72ffe8cd2.

The WoL feature is a must to pass Energy Star 6.1 and above,
the power consumption will be measured during S3 with WoL is enabled.

Reverting "alx: remove WoL support", and will try to fix the unintentional
wake up issue when WoL is enabled.

Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=61651

Signed-off-by: AceLan Kao <acelan.kao@canonical.com>
---
 drivers/net/ethernet/atheros/alx/ethtool.c |  36 +++++
 drivers/net/ethernet/atheros/alx/hw.c      | 154 ++++++++++++++++++++-
 drivers/net/ethernet/atheros/alx/hw.h      |   5 +
 drivers/net/ethernet/atheros/alx/main.c    | 142 +++++++++++++++++--
 4 files changed, 326 insertions(+), 11 deletions(-)

diff --git a/drivers/net/ethernet/atheros/alx/ethtool.c b/drivers/net/ethernet/atheros/alx/ethtool.c
index 2f4eabf652e8..859e27236ce4 100644
--- a/drivers/net/ethernet/atheros/alx/ethtool.c
+++ b/drivers/net/ethernet/atheros/alx/ethtool.c
@@ -310,11 +310,47 @@ static int alx_get_sset_count(struct net_device *netdev, int sset)
 	}
 }
 
+static void alx_get_wol(struct net_device *netdev, struct ethtool_wolinfo *wol)
+{
+	struct alx_priv *alx = netdev_priv(netdev);
+	struct alx_hw *hw = &alx->hw;
+
+	wol->supported = WAKE_MAGIC | WAKE_PHY;
+	wol->wolopts = 0;
+
+	if (hw->sleep_ctrl & ALX_SLEEP_WOL_MAGIC)
+		wol->wolopts |= WAKE_MAGIC;
+	if (hw->sleep_ctrl & ALX_SLEEP_WOL_PHY)
+		wol->wolopts |= WAKE_PHY;
+}
+
+static int alx_set_wol(struct net_device *netdev, struct ethtool_wolinfo *wol)
+{
+	struct alx_priv *alx = netdev_priv(netdev);
+	struct alx_hw *hw = &alx->hw;
+
+	if (wol->wolopts & ~(WAKE_MAGIC | WAKE_PHY))
+		return -EOPNOTSUPP;
+
+	hw->sleep_ctrl = 0;
+
+	if (wol->wolopts & WAKE_MAGIC)
+		hw->sleep_ctrl |= ALX_SLEEP_WOL_MAGIC;
+	if (wol->wolopts & WAKE_PHY)
+		hw->sleep_ctrl |= ALX_SLEEP_WOL_PHY;
+
+	device_set_wakeup_enable(&alx->hw.pdev->dev, hw->sleep_ctrl);
+
+	return 0;
+}
+
 const struct ethtool_ops alx_ethtool_ops = {
 	.get_pauseparam	= alx_get_pauseparam,
 	.set_pauseparam	= alx_set_pauseparam,
 	.get_msglevel	= alx_get_msglevel,
 	.set_msglevel	= alx_set_msglevel,
+	.get_wol	= alx_get_wol,
+	.set_wol	= alx_set_wol,
 	.get_link	= ethtool_op_get_link,
 	.get_strings	= alx_get_strings,
 	.get_sset_count	= alx_get_sset_count,
diff --git a/drivers/net/ethernet/atheros/alx/hw.c b/drivers/net/ethernet/atheros/alx/hw.c
index 6ac40b0003a3..f9bf612550ab 100644
--- a/drivers/net/ethernet/atheros/alx/hw.c
+++ b/drivers/net/ethernet/atheros/alx/hw.c
@@ -332,6 +332,16 @@ void alx_set_macaddr(struct alx_hw *hw, const u8 *addr)
 	alx_write_mem32(hw, ALX_STAD1, val);
 }
 
+static void alx_enable_osc(struct alx_hw *hw)
+{
+	u32 val;
+
+	/* rising edge */
+	val = alx_read_mem32(hw, ALX_MISC);
+	alx_write_mem32(hw, ALX_MISC, val & ~ALX_MISC_INTNLOSC_OPEN);
+	alx_write_mem32(hw, ALX_MISC, val | ALX_MISC_INTNLOSC_OPEN);
+}
+
 static void alx_reset_osc(struct alx_hw *hw, u8 rev)
 {
 	u32 val, val2;
@@ -774,7 +784,6 @@ int alx_setup_speed_duplex(struct alx_hw *hw, u32 ethadv, u8 flowctrl)
 	return err;
 }
 
-
 void alx_post_phy_link(struct alx_hw *hw)
 {
 	u16 phy_val, len, agc;
@@ -848,6 +857,65 @@ void alx_post_phy_link(struct alx_hw *hw)
 	}
 }
 
+/* NOTE:
+ *    1. phy link must be established before calling this function
+ *    2. wol option (pattern,magic,link,etc.) is configed before call it.
+ */
+int alx_pre_suspend(struct alx_hw *hw, int speed, u8 duplex)
+{
+	u32 master, mac, phy, val;
+	int err = 0;
+
+	master = alx_read_mem32(hw, ALX_MASTER);
+	master &= ~ALX_MASTER_PCLKSEL_SRDS;
+	mac = hw->rx_ctrl;
+	/* 10/100 half */
+	ALX_SET_FIELD(mac, ALX_MAC_CTRL_SPEED,  ALX_MAC_CTRL_SPEED_10_100);
+	mac &= ~(ALX_MAC_CTRL_FULLD | ALX_MAC_CTRL_RX_EN | ALX_MAC_CTRL_TX_EN);
+
+	phy = alx_read_mem32(hw, ALX_PHY_CTRL);
+	phy &= ~(ALX_PHY_CTRL_DSPRST_OUT | ALX_PHY_CTRL_CLS);
+	phy |= ALX_PHY_CTRL_RST_ANALOG | ALX_PHY_CTRL_HIB_PULSE |
+	       ALX_PHY_CTRL_HIB_EN;
+
+	/* without any activity  */
+	if (!(hw->sleep_ctrl & ALX_SLEEP_ACTIVE)) {
+		err = alx_write_phy_reg(hw, ALX_MII_IER, 0);
+		if (err)
+			return err;
+		phy |= ALX_PHY_CTRL_IDDQ | ALX_PHY_CTRL_POWER_DOWN;
+	} else {
+		if (hw->sleep_ctrl & (ALX_SLEEP_WOL_MAGIC | ALX_SLEEP_CIFS))
+			mac |= ALX_MAC_CTRL_RX_EN | ALX_MAC_CTRL_BRD_EN;
+		if (hw->sleep_ctrl & ALX_SLEEP_CIFS)
+			mac |= ALX_MAC_CTRL_TX_EN;
+		if (duplex == DUPLEX_FULL)
+			mac |= ALX_MAC_CTRL_FULLD;
+		if (speed == SPEED_1000)
+			ALX_SET_FIELD(mac, ALX_MAC_CTRL_SPEED,
+				      ALX_MAC_CTRL_SPEED_1000);
+		phy |= ALX_PHY_CTRL_DSPRST_OUT;
+		err = alx_write_phy_ext(hw, ALX_MIIEXT_ANEG,
+					ALX_MIIEXT_S3DIG10,
+					ALX_MIIEXT_S3DIG10_SL);
+		if (err)
+			return err;
+	}
+
+	alx_enable_osc(hw);
+	hw->rx_ctrl = mac;
+	alx_write_mem32(hw, ALX_MASTER, master);
+	alx_write_mem32(hw, ALX_MAC_CTRL, mac);
+	alx_write_mem32(hw, ALX_PHY_CTRL, phy);
+
+	/* set val of PDLL D3PLLOFF */
+	val = alx_read_mem32(hw, ALX_PDLL_TRNS1);
+	val |= ALX_PDLL_TRNS1_D3PLLOFF_EN;
+	alx_write_mem32(hw, ALX_PDLL_TRNS1, val);
+
+	return 0;
+}
+
 bool alx_phy_configured(struct alx_hw *hw)
 {
 	u32 cfg, hw_cfg;
@@ -920,6 +988,26 @@ int alx_clear_phy_intr(struct alx_hw *hw)
 	return alx_read_phy_reg(hw, ALX_MII_ISR, &isr);
 }
 
+int alx_config_wol(struct alx_hw *hw)
+{
+	u32 wol = 0;
+	int err = 0;
+
+	/* turn on magic packet event */
+	if (hw->sleep_ctrl & ALX_SLEEP_WOL_MAGIC)
+		wol |= ALX_WOL0_MAGIC_EN | ALX_WOL0_PME_MAGIC_EN;
+
+	/* turn on link up event */
+	if (hw->sleep_ctrl & ALX_SLEEP_WOL_PHY) {
+		wol |=  ALX_WOL0_LINK_EN | ALX_WOL0_PME_LINK;
+		/* only link up can wake up */
+		err = alx_write_phy_reg(hw, ALX_MII_IER, ALX_IER_LINK_UP);
+	}
+	alx_write_mem32(hw, ALX_WOL0, wol);
+
+	return err;
+}
+
 void alx_disable_rss(struct alx_hw *hw)
 {
 	u32 ctrl = alx_read_mem32(hw, ALX_RXQ0);
@@ -1044,6 +1132,70 @@ void alx_mask_msix(struct alx_hw *hw, int index, bool mask)
 	alx_post_write(hw);
 }
 
+int alx_select_powersaving_speed(struct alx_hw *hw, int *speed, u8 *duplex)
+{
+	int i, err;
+	u16 lpa;
+
+	err = alx_read_phy_link(hw);
+	if (err)
+		return err;
+
+	if (hw->link_speed == SPEED_UNKNOWN) {
+		*speed = SPEED_UNKNOWN;
+		*duplex = DUPLEX_UNKNOWN;
+		return 0;
+	}
+
+	err = alx_read_phy_reg(hw, MII_LPA, &lpa);
+	if (err)
+		return err;
+
+	if (!(lpa & LPA_LPACK)) {
+		*speed = hw->link_speed;
+		return 0;
+	}
+
+	if (lpa & LPA_10FULL) {
+		*speed = SPEED_10;
+		*duplex = DUPLEX_FULL;
+	} else if (lpa & LPA_10HALF) {
+		*speed = SPEED_10;
+		*duplex = DUPLEX_HALF;
+	} else if (lpa & LPA_100FULL) {
+		*speed = SPEED_100;
+		*duplex = DUPLEX_FULL;
+	} else {
+		*speed = SPEED_100;
+		*duplex = DUPLEX_HALF;
+	}
+
+	if (*speed == hw->link_speed && *duplex == hw->duplex)
+		return 0;
+	err = alx_write_phy_reg(hw, ALX_MII_IER, 0);
+	if (err)
+		return err;
+	err = alx_setup_speed_duplex(hw, alx_speed_to_ethadv(*speed, *duplex) |
+					ADVERTISED_Autoneg, ALX_FC_ANEG |
+					ALX_FC_RX | ALX_FC_TX);
+	if (err)
+		return err;
+
+	/* wait for linkup */
+	for (i = 0; i < ALX_MAX_SETUP_LNK_CYCLE; i++) {
+		msleep(100);
+
+		err = alx_read_phy_link(hw);
+		if (err < 0)
+			return err;
+		if (hw->link_speed != SPEED_UNKNOWN)
+			break;
+	}
+	if (i == ALX_MAX_SETUP_LNK_CYCLE)
+		return -ETIMEDOUT;
+
+	return 0;
+}
 
 bool alx_get_phy_info(struct alx_hw *hw)
 {
diff --git a/drivers/net/ethernet/atheros/alx/hw.h b/drivers/net/ethernet/atheros/alx/hw.h
index e42d7e0947eb..a7fb6c8d846a 100644
--- a/drivers/net/ethernet/atheros/alx/hw.h
+++ b/drivers/net/ethernet/atheros/alx/hw.h
@@ -487,6 +487,8 @@ struct alx_hw {
 	u8 flowctrl;
 	u32 adv_cfg;
 
+	u32 sleep_ctrl;
+
 	spinlock_t mdio_lock;
 	struct mdio_if_info mdio;
 	u16 phy_id[2];
@@ -549,12 +551,14 @@ void alx_reset_pcie(struct alx_hw *hw);
 void alx_enable_aspm(struct alx_hw *hw, bool l0s_en, bool l1_en);
 int alx_setup_speed_duplex(struct alx_hw *hw, u32 ethadv, u8 flowctrl);
 void alx_post_phy_link(struct alx_hw *hw);
+int alx_pre_suspend(struct alx_hw *hw, int speed, u8 duplex);
 int alx_read_phy_reg(struct alx_hw *hw, u16 reg, u16 *phy_data);
 int alx_write_phy_reg(struct alx_hw *hw, u16 reg, u16 phy_data);
 int alx_read_phy_ext(struct alx_hw *hw, u8 dev, u16 reg, u16 *pdata);
 int alx_write_phy_ext(struct alx_hw *hw, u8 dev, u16 reg, u16 data);
 int alx_read_phy_link(struct alx_hw *hw);
 int alx_clear_phy_intr(struct alx_hw *hw);
+int alx_config_wol(struct alx_hw *hw);
 void alx_cfg_mac_flowcontrol(struct alx_hw *hw, u8 fc);
 void alx_start_mac(struct alx_hw *hw);
 int alx_reset_mac(struct alx_hw *hw);
@@ -563,6 +567,7 @@ bool alx_phy_configured(struct alx_hw *hw);
 void alx_configure_basic(struct alx_hw *hw);
 void alx_mask_msix(struct alx_hw *hw, int index, bool mask);
 void alx_disable_rss(struct alx_hw *hw);
+int alx_select_powersaving_speed(struct alx_hw *hw, int *speed, u8 *duplex);
 bool alx_get_phy_info(struct alx_hw *hw);
 void alx_update_hw_stats(struct alx_hw *hw);
 
diff --git a/drivers/net/ethernet/atheros/alx/main.c b/drivers/net/ethernet/atheros/alx/main.c
index 567ee54504bc..c0e2bb22ce24 100644
--- a/drivers/net/ethernet/atheros/alx/main.c
+++ b/drivers/net/ethernet/atheros/alx/main.c
@@ -1070,6 +1070,7 @@ static int alx_init_sw(struct alx_priv *alx)
 	alx->dev->max_mtu = ALX_MAX_FRAME_LEN(ALX_MAX_FRAME_SIZE);
 	alx->tx_ringsz = 256;
 	alx->rx_ringsz = 512;
+	hw->sleep_ctrl = ALX_SLEEP_WOL_MAGIC | ALX_SLEEP_WOL_PHY;
 	hw->imt = 200;
 	alx->int_mask = ALX_ISR_MISC;
 	hw->dma_chnl = hw->max_dma_chnl;
@@ -1346,6 +1347,66 @@ static int alx_stop(struct net_device *netdev)
 	return 0;
 }
 
+static int __alx_shutdown(struct pci_dev *pdev, bool *wol_en)
+{
+	struct alx_priv *alx = pci_get_drvdata(pdev);
+	struct net_device *netdev = alx->dev;
+	struct alx_hw *hw = &alx->hw;
+	int err, speed;
+	u8 duplex;
+
+	netif_device_detach(netdev);
+
+	if (netif_running(netdev))
+		__alx_stop(alx);
+
+#ifdef CONFIG_PM_SLEEP
+	err = pci_save_state(pdev);
+	if (err)
+		return err;
+#endif
+
+	err = alx_select_powersaving_speed(hw, &speed, &duplex);
+	if (err)
+		return err;
+	err = alx_clear_phy_intr(hw);
+	if (err)
+		return err;
+	err = alx_pre_suspend(hw, speed, duplex);
+	if (err)
+		return err;
+	err = alx_config_wol(hw);
+	if (err)
+		return err;
+
+	*wol_en = false;
+	if (hw->sleep_ctrl & ALX_SLEEP_ACTIVE) {
+		netif_info(alx, wol, netdev,
+			   "wol: ctrl=%X, speed=%X\n",
+			   hw->sleep_ctrl, speed);
+		device_set_wakeup_enable(&pdev->dev, true);
+		*wol_en = true;
+	}
+
+	pci_disable_device(pdev);
+
+	return 0;
+}
+
+static void alx_shutdown(struct pci_dev *pdev)
+{
+	int err;
+	bool wol_en;
+
+	err = __alx_shutdown(pdev, &wol_en);
+	if (!err) {
+		pci_wake_from_d3(pdev, wol_en);
+		pci_set_power_state(pdev, PCI_D3hot);
+	} else {
+		dev_err(&pdev->dev, "shutdown fail %d\n", err);
+	}
+}
+
 static void alx_link_check(struct work_struct *work)
 {
 	struct alx_priv *alx;
@@ -1841,6 +1902,8 @@ static int alx_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
 		goto out_unmap;
 	}
 
+	device_set_wakeup_enable(&pdev->dev, hw->sleep_ctrl);
+
 	netdev_info(netdev,
 		    "Qualcomm Atheros AR816x/AR817x Ethernet [%pM]\n",
 		    netdev->dev_addr);
@@ -1883,12 +1946,22 @@ static void alx_remove(struct pci_dev *pdev)
 static int alx_suspend(struct device *dev)
 {
 	struct pci_dev *pdev = to_pci_dev(dev);
-	struct alx_priv *alx = pci_get_drvdata(pdev);
+	int err;
+	bool wol_en;
+
+	err = __alx_shutdown(pdev, &wol_en);
+	if (err) {
+		dev_err(&pdev->dev, "shutdown fail in suspend %d\n", err);
+		return err;
+	}
+
+	if (wol_en) {
+		pci_prepare_to_sleep(pdev);
+	} else {
+		pci_wake_from_d3(pdev, false);
+		pci_set_power_state(pdev, PCI_D3hot);
+	}
 
-	if (!netif_running(alx->dev))
-		return 0;
-	netif_device_detach(alx->dev);
-	__alx_stop(alx);
 	return 0;
 }
 
@@ -1896,23 +1969,69 @@ static int alx_resume(struct device *dev)
 {
 	struct pci_dev *pdev = to_pci_dev(dev);
 	struct alx_priv *alx = pci_get_drvdata(pdev);
+	struct net_device *netdev = alx->dev;
 	struct alx_hw *hw = &alx->hw;
+	int err;
+
+	pci_set_power_state(pdev, PCI_D0);
+	pci_restore_state(pdev);
+	pci_save_state(pdev);
+
+	pci_enable_wake(pdev, PCI_D3hot, 0);
+	pci_enable_wake(pdev, PCI_D3cold, 0);
 
+	hw->link_speed = SPEED_UNKNOWN;
+	alx->int_mask = ALX_ISR_MISC;
+
+	alx_reset_pcie(hw);
 	alx_reset_phy(hw);
 
-	if (!netif_running(alx->dev))
-		return 0;
-	netif_device_attach(alx->dev);
-	return __alx_open(alx, true);
+	pci_set_power_state(pdev, PCI_D0);
+	pci_restore_state(pdev);
+	pci_save_state(pdev);
+
+	pci_enable_wake(pdev, PCI_D3hot, 0);
+	pci_enable_wake(pdev, PCI_D3cold, 0);
+
+	hw->link_speed = SPEED_UNKNOWN;
+	alx->int_mask = ALX_ISR_MISC;
+
+	alx_reset_pcie(hw);
+	alx_reset_phy(hw);
+
+	err = alx_reset_mac(hw);
+	if (err) {
+		netif_err(alx, hw, alx->dev,
+			  "resume:reset_mac fail %d\n", err);
+		return -EIO;
+	}
+
+	err = alx_setup_speed_duplex(hw, hw->adv_cfg, hw->flowctrl);
+	if (err) {
+		netif_err(alx, hw, alx->dev,
+			  "resume:setup_speed_duplex fail %d\n", err);
+		return -EIO;
+	}
+
+	if (netif_running(netdev)) {
+		err = __alx_open(alx, true);
+		if (err)
+			return err;
+	}
+
+	netif_device_attach(netdev);
+
+	return err;
 }
+#endif
 
+#ifdef CONFIG_PM_SLEEP
 static SIMPLE_DEV_PM_OPS(alx_pm_ops, alx_suspend, alx_resume);
 #define ALX_PM_OPS      (&alx_pm_ops)
 #else
 #define ALX_PM_OPS      NULL
 #endif
 
-
 static pci_ers_result_t alx_pci_error_detected(struct pci_dev *pdev,
 					       pci_channel_state_t state)
 {
@@ -1955,6 +2074,8 @@ static pci_ers_result_t alx_pci_error_slot_reset(struct pci_dev *pdev)
 	}
 
 	pci_set_master(pdev);
+	pci_enable_wake(pdev, PCI_D3hot, 0);
+	pci_enable_wake(pdev, PCI_D3cold, 0);
 
 	alx_reset_pcie(hw);
 	if (!alx_reset_mac(hw))
@@ -2011,6 +2132,7 @@ static struct pci_driver alx_driver = {
 	.id_table    = alx_pci_tbl,
 	.probe       = alx_probe,
 	.remove      = alx_remove,
+	.shutdown    = alx_shutdown,
 	.err_handler = &alx_err_handlers,
 	.driver.pm   = ALX_PM_OPS,
 };
-- 
2.17.0

^ permalink raw reply related

* Re: [PATCH net-next v4 5/8] dsa: port: Ignore bridge VLAN events
From: Florian Fainelli @ 2018-05-30  2:13 UTC (permalink / raw)
  To: Petr Machata, netdev, devel, bridge
  Cc: jiri, idosch, davem, razvan.stefanescu, gregkh, stephen, andrew,
	vivien.didelot, nikolay, dan.carpenter
In-Reply-To: <aa04e548376becfc699273d023dff0197ed97999.1527641426.git.petrm@mellanox.com>



On 05/29/2018 05:59 PM, Petr Machata wrote:
> A follow-up patch enables emitting VLAN notifications for the bridge CPU
> port in addition to the existing slave port notifications. These
> notifications have orig_dev set to the bridge in question.
> 
> Because there's no specific support for these VLANs, just ignore the
> notifications to maintain the current behavior.
> 
> Signed-off-by: Petr Machata <petrm@mellanox.com>
> Reviewed-by: Vivien Didelot <vivien.didelot@savoirfairelinux.com>

Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
-- 
Florian

^ permalink raw reply

* Re: [net] vhost: Use kzalloc() to allocate vhost_msg_node
From: Michael S. Tsirkin @ 2018-05-30  3:01 UTC (permalink / raw)
  To: Guenter Roeck
  Cc: Kevin Easton, kvm, netdev, syzkaller-bugs, linux-kernel,
	virtualization
In-Reply-To: <20180529221908.GA22742@roeck-us.net>

On Tue, May 29, 2018 at 03:19:08PM -0700, Guenter Roeck wrote:
> On Fri, Apr 27, 2018 at 11:45:02AM -0400, Kevin Easton wrote:
> > The struct vhost_msg within struct vhost_msg_node is copied to userspace,
> > so it should be allocated with kzalloc() to ensure all structure padding
> > is zeroed.
> > 
> > Signed-off-by: Kevin Easton <kevin@guarana.org>
> > Reported-by: syzbot+87cfa083e727a224754b@syzkaller.appspotmail.com
> 
> Is this patch going anywhere ?
> 
> The patch fixes CVE-2018-1118. It would be useful to understand if and when
> this problem is going to be fixed.
> 
> Thanks,
> Guenter
> > ---
> >  drivers/vhost/vhost.c | 2 +-
> >  1 file changed, 1 insertion(+), 1 deletion(-)
> > 
> > diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
> > index f3bd8e9..1b84dcff 100644
> > --- a/drivers/vhost/vhost.c
> > +++ b/drivers/vhost/vhost.c
> > @@ -2339,7 +2339,7 @@ EXPORT_SYMBOL_GPL(vhost_disable_notify);
> >  /* Create a new message. */
> >  struct vhost_msg_node *vhost_new_msg(struct vhost_virtqueue *vq, int type)
> >  {
> > -	struct vhost_msg_node *node = kmalloc(sizeof *node, GFP_KERNEL);
> > +	struct vhost_msg_node *node = kzalloc(sizeof *node, GFP_KERNEL);
> >  	if (!node)
> >  		return NULL;
> >  	node->vq = vq;

As I pointed out, we don't need to init the whole structure. The proper
fix is thus (I think) below.

Could you use your testing infrastructure to confirm this fixes the issue?

Thanks!

Signed-off-by: Michael S. Tsirkin <mst@redhat.com>

diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index f3bd8e941224..58d9aec90afb 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -2342,6 +2342,9 @@ struct vhost_msg_node *vhost_new_msg(struct vhost_virtqueue *vq, int type)
 	struct vhost_msg_node *node = kmalloc(sizeof *node, GFP_KERNEL);
 	if (!node)
 		return NULL;
+
+	/* Make sure all padding within the structure is initialized. */
+	memset(&node->msg, 0, sizeof node->msg);
 	node->vq = vq;
 	node->msg.type = type;
 	return node;

^ permalink raw reply related

* [PATCH net] net/sonic: Use dma_mapping_error()
From: Finn Thain @ 2018-05-30  3:03 UTC (permalink / raw)
  To: David S. Miller; +Cc: Thomas Bogendoerfer, netdev, linux-kernel

With CONFIG_DMA_API_DEBUG=y, calling sonic_open() produces the
message, "DMA-API: device driver failed to check map error".
Add the missing dma_mapping_error() call.

Cc: Thomas Bogendoerfer <tsbogend@alpha.franken.de>
Signed-off-by: Finn Thain <fthain@telegraphics.com.au>
---
 drivers/net/ethernet/natsemi/sonic.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/natsemi/sonic.c b/drivers/net/ethernet/natsemi/sonic.c
index 7ed08486ae23..c805dcbebd02 100644
--- a/drivers/net/ethernet/natsemi/sonic.c
+++ b/drivers/net/ethernet/natsemi/sonic.c
@@ -84,7 +84,7 @@ static int sonic_open(struct net_device *dev)
 	for (i = 0; i < SONIC_NUM_RRS; i++) {
 		dma_addr_t laddr = dma_map_single(lp->device, skb_put(lp->rx_skb[i], SONIC_RBSIZE),
 		                                  SONIC_RBSIZE, DMA_FROM_DEVICE);
-		if (!laddr) {
+		if (dma_mapping_error(lp->device, laddr)) {
 			while(i > 0) { /* free any that were mapped successfully */
 				i--;
 				dma_unmap_single(lp->device, lp->rx_laddr[i], SONIC_RBSIZE, DMA_FROM_DEVICE);
-- 
2.16.1

^ permalink raw reply related

* Re: [PATCH net-next 1/3] net: Add support to configure SR-IOV VF minimum and maximum queues.
From: Michael Chan @ 2018-05-30  3:19 UTC (permalink / raw)
  To: Samudrala, Sridhar; +Cc: David Miller, Netdev
In-Reply-To: <1de4ef51-dbe8-4d7e-0e43-9e92773cf52b@intel.com>

On Tue, May 29, 2018 at 1:46 PM, Samudrala, Sridhar
<sridhar.samudrala@intel.com> wrote:

>
> Isn't ndo_set_vf_xxx() considered a legacy interface and not planned to be
> extended?

I didn't know about that.

> Shouldn't we enable this via ethtool on the port representor netdev?
>
>

We discussed about this.  ethtool on the VF representor will only work
in switchdev mode and also will not support min/max values.

^ permalink raw reply

* Re: [PATCH V4] mlx4_core: allocate ICM memory in page size chunks
From: Eric Dumazet @ 2018-05-30  3:34 UTC (permalink / raw)
  To: David Miller, qing.huang
  Cc: tariqt, haakon.bugge, yanjun.zhu, netdev, linux-rdma,
	linux-kernel, gi-oh.kim
In-Reply-To: <20180525.102321.858995452200286788.davem@davemloft.net>



On 05/25/2018 10:23 AM, David Miller wrote:
> From: Qing Huang <qing.huang@oracle.com>
> Date: Wed, 23 May 2018 16:22:46 -0700
> 
>> When a system is under memory presure (high usage with fragments),
>> the original 256KB ICM chunk allocations will likely trigger kernel
>> memory management to enter slow path doing memory compact/migration
>> ops in order to complete high order memory allocations.
>>
>> When that happens, user processes calling uverb APIs may get stuck
>> for more than 120s easily even though there are a lot of free pages
>> in smaller chunks available in the system.
>>
>> Syslog:
>> ...
>> Dec 10 09:04:51 slcc03db02 kernel: [397078.572732] INFO: task
>> oracle_205573_e:205573 blocked for more than 120 seconds.
>> ...
>>
>> With 4KB ICM chunk size on x86_64 arch, the above issue is fixed.
>>
>> However in order to support smaller ICM chunk size, we need to fix
>> another issue in large size kcalloc allocations.
>>
>> E.g.
>> Setting log_num_mtt=30 requires 1G mtt entries. With the 4KB ICM chunk
>> size, each ICM chunk can only hold 512 mtt entries (8 bytes for each mtt
>> entry). So we need a 16MB allocation for a table->icm pointer array to
>> hold 2M pointers which can easily cause kcalloc to fail.
>>
>> The solution is to use kvzalloc to replace kcalloc which will fall back
>> to vmalloc automatically if kmalloc fails.
>>
>> Signed-off-by: Qing Huang <qing.huang@oracle.com>
>> Acked-by: Daniel Jurgens <danielj@mellanox.com>
>> Reviewed-by: Zhu Yanjun <yanjun.zhu@oracle.com>
> 
> Applied, thanks.
> 

I must say this patch causes regressions here.

KASAN is not happy.

It looks that you guys did not really looked at mlx4_alloc_icm()

This function is properly handling high order allocations with fallbacks to order-0 pages
under high memory pressure.

BUG: KASAN: slab-out-of-bounds in to_rdma_ah_attr+0x808/0x9e0 [mlx4_ib]
Read of size 4 at addr ffff8817df584f68 by task qp_listing_test/92585

CPU: 38 PID: 92585 Comm: qp_listing_test Tainted: G           O     
Call Trace:
 [<ffffffffba80d7bb>] dump_stack+0x4d/0x72
 [<ffffffffb951dc5f>] print_address_description+0x6f/0x260
 [<ffffffffb951e1c7>] kasan_report+0x257/0x370
 [<ffffffffb951e339>] __asan_report_load4_noabort+0x19/0x20
 [<ffffffffc0256d28>] to_rdma_ah_attr+0x808/0x9e0 [mlx4_ib]
 [<ffffffffc02785b3>] mlx4_ib_query_qp+0x1213/0x1660 [mlx4_ib]
 [<ffffffffc02dbfdb>] qpstat_print_qp+0x13b/0x500 [ib_uverbs]
 [<ffffffffc02dc3ea>] qpstat_seq_show+0x4a/0xb0 [ib_uverbs]
 [<ffffffffb95f125c>] seq_read+0xa9c/0x1230
 [<ffffffffb96e0821>] proc_reg_read+0xc1/0x180
 [<ffffffffb9577918>] __vfs_read+0xe8/0x730
 [<ffffffffb9578057>] vfs_read+0xf7/0x300
 [<ffffffffb95794d2>] SyS_read+0xd2/0x1b0
 [<ffffffffb8e06b16>] do_syscall_64+0x186/0x420
 [<ffffffffbaa00071>] entry_SYSCALL_64_after_hwframe+0x3d/0xa2
RIP: 0033:0x7f851a7bb30d
RSP: 002b:00007ffd09a758c0 EFLAGS: 00000293 ORIG_RAX: 0000000000000000
RAX: ffffffffffffffda RBX: 00007f84ff959440 RCX: 00007f851a7bb30d
RDX: 000000000003fc00 RSI: 00007f84ff60a000 RDI: 000000000000000b
RBP: 00007ffd09a75900 R08: 00000000ffffffff R09: 0000000000000000
R10: 0000000000000022 R11: 0000000000000293 R12: 0000000000000000
R13: 000000000003ffff R14: 000000000003ffff R15: 00007f84ff60a000

Allocated by task 4488: 
 save_stack+0x46/0xd0
 kasan_kmalloc+0xad/0xe0
 __kmalloc+0x101/0x5e0
 ib_register_device+0xc03/0x1250 [ib_core]
 mlx4_ib_add+0x27d6/0x4dd0 [mlx4_ib]
 mlx4_add_device+0xa9/0x340 [mlx4_core]
 mlx4_register_interface+0x16e/0x390 [mlx4_core]
 xhci_pci_remove+0x7a/0x180 [xhci_pci]
 do_one_initcall+0xa0/0x230
 do_init_module+0x1b9/0x5a4
 load_module+0x63e6/0x94c0
 SYSC_init_module+0x1a4/0x1c0
 SyS_init_module+0xe/0x10
 do_syscall_64+0x186/0x420
 entry_SYSCALL_64_after_hwframe+0x3d/0xa2

Freed by task 0:
(stack is not available)

The buggy address belongs to the object at ffff8817df584f40
 which belongs to the cache kmalloc-32 of size 32
The buggy address is located 8 bytes to the right of
 32-byte region [ffff8817df584f40, ffff8817df584f60)
The buggy address belongs to the page:
page:ffffea005f7d6100 count:1 mapcount:0 mapping:ffff8817df584000 index:0xffff8817df584fc1
flags: 0x880000000000100(slab)
raw: 0880000000000100 ffff8817df584000 ffff8817df584fc1 000000010000003f
raw: ffffea005f3ac0a0 ffffea005c476760 ffff8817fec00900 ffff883ff78d26c0
page dumped because: kasan: bad access detected
page->mem_cgroup:ffff883ff78d26c0

Memory state around the buggy address:
 ffff8817df584e00: 00 03 fc fc fc fc fc fc 00 03 fc fc fc fc fc fc
 ffff8817df584e80: 00 00 00 04 fc fc fc fc 00 00 00 fc fc fc fc fc
>ffff8817df584f00: fb fb fb fb fc fc fc fc 00 00 00 00 fc fc fc fc
                                                          ^
 ffff8817df584f80: fb fb fb fb fc fc fc fc fc fc fc fc fc fc fc fc
 ffff8817df585000: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb

I will test :

diff --git a/drivers/net/ethernet/mellanox/mlx4/icm.c b/drivers/net/ethernet/mellanox/mlx4/icm.c
index 685337d58276fc91baeeb64387c52985e1bc6dda..4d2a71381acb739585d662175e86caef72338097 100644
--- a/drivers/net/ethernet/mellanox/mlx4/icm.c
+++ b/drivers/net/ethernet/mellanox/mlx4/icm.c
@@ -43,12 +43,13 @@
 #include "fw.h"
 
 /*
- * We allocate in page size (default 4KB on many archs) chunks to avoid high
- * order memory allocations in fragmented/high usage memory situation.
+ * We allocate in as big chunks as we can, up to a maximum of 256 KB
+ * per chunk. Note that the chunks are not necessarily in contiguous
+ * physical memory.
  */
 enum {
-       MLX4_ICM_ALLOC_SIZE     = PAGE_SIZE,
-       MLX4_TABLE_CHUNK_SIZE   = PAGE_SIZE,
+       MLX4_ICM_ALLOC_SIZE     = 1 << 18,
+       MLX4_TABLE_CHUNK_SIZE   = 1 << 18
 };
 
 static void mlx4_free_icm_pages(struct mlx4_dev *dev, struct mlx4_icm_chunk *chunk)

^ permalink raw reply related

* Re: [net] vhost: Use kzalloc() to allocate vhost_msg_node
From: Guenter Roeck @ 2018-05-30  3:42 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: Kevin Easton, Jason Wang, kvm, virtualization, netdev,
	linux-kernel, syzkaller-bugs
In-Reply-To: <20180530055704-mutt-send-email-mst@kernel.org>

On 05/29/2018 08:01 PM, Michael S. Tsirkin wrote:
> On Tue, May 29, 2018 at 03:19:08PM -0700, Guenter Roeck wrote:
>> On Fri, Apr 27, 2018 at 11:45:02AM -0400, Kevin Easton wrote:
>>> The struct vhost_msg within struct vhost_msg_node is copied to userspace,
>>> so it should be allocated with kzalloc() to ensure all structure padding
>>> is zeroed.
>>>
>>> Signed-off-by: Kevin Easton <kevin@guarana.org>
>>> Reported-by: syzbot+87cfa083e727a224754b@syzkaller.appspotmail.com
>>
>> Is this patch going anywhere ?
>>
>> The patch fixes CVE-2018-1118. It would be useful to understand if and when
>> this problem is going to be fixed.
>>
>> Thanks,
>> Guenter
>>> ---
>>>   drivers/vhost/vhost.c | 2 +-
>>>   1 file changed, 1 insertion(+), 1 deletion(-)
>>>
>>> diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
>>> index f3bd8e9..1b84dcff 100644
>>> --- a/drivers/vhost/vhost.c
>>> +++ b/drivers/vhost/vhost.c
>>> @@ -2339,7 +2339,7 @@ EXPORT_SYMBOL_GPL(vhost_disable_notify);
>>>   /* Create a new message. */
>>>   struct vhost_msg_node *vhost_new_msg(struct vhost_virtqueue *vq, int type)
>>>   {
>>> -	struct vhost_msg_node *node = kmalloc(sizeof *node, GFP_KERNEL);
>>> +	struct vhost_msg_node *node = kzalloc(sizeof *node, GFP_KERNEL);
>>>   	if (!node)
>>>   		return NULL;
>>>   	node->vq = vq;
> 
> As I pointed out, we don't need to init the whole structure. The proper
> fix is thus (I think) below.
> 
> Could you use your testing infrastructure to confirm this fixes the issue?
> 

Sorry, I don't have the means to test the fix.

Guenter

> Thanks!
> 
> Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
> 
> diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
> index f3bd8e941224..58d9aec90afb 100644
> --- a/drivers/vhost/vhost.c
> +++ b/drivers/vhost/vhost.c
> @@ -2342,6 +2342,9 @@ struct vhost_msg_node *vhost_new_msg(struct vhost_virtqueue *vq, int type)
>   	struct vhost_msg_node *node = kmalloc(sizeof *node, GFP_KERNEL);
>   	if (!node)
>   		return NULL;
> +
> +	/* Make sure all padding within the structure is initialized. */
> +	memset(&node->msg, 0, sizeof node->msg);
>   	node->vq = vq;
>   	node->msg.type = type;
>   	return node;
> 

^ permalink raw reply

* Re: [PATCH V4] mlx4_core: allocate ICM memory in page size chunks
From: Eric Dumazet @ 2018-05-30  3:44 UTC (permalink / raw)
  To: Eric Dumazet, David Miller, qing.huang
  Cc: tariqt, haakon.bugge, yanjun.zhu, netdev, linux-rdma,
	linux-kernel, gi-oh.kim
In-Reply-To: <7a353b65-6b7f-1aee-1c48-e83c8e02f693@gmail.com>



On 05/29/2018 11:34 PM, Eric Dumazet wrote:

> I will test :
> 
> diff --git a/drivers/net/ethernet/mellanox/mlx4/icm.c b/drivers/net/ethernet/mellanox/mlx4/icm.c
> index 685337d58276fc91baeeb64387c52985e1bc6dda..4d2a71381acb739585d662175e86caef72338097 100644
> --- a/drivers/net/ethernet/mellanox/mlx4/icm.c
> +++ b/drivers/net/ethernet/mellanox/mlx4/icm.c
> @@ -43,12 +43,13 @@
>  #include "fw.h"
>  
>  /*
> - * We allocate in page size (default 4KB on many archs) chunks to avoid high
> - * order memory allocations in fragmented/high usage memory situation.
> + * We allocate in as big chunks as we can, up to a maximum of 256 KB
> + * per chunk. Note that the chunks are not necessarily in contiguous
> + * physical memory.
>   */
>  enum {
> -       MLX4_ICM_ALLOC_SIZE     = PAGE_SIZE,
> -       MLX4_TABLE_CHUNK_SIZE   = PAGE_SIZE,
> +       MLX4_ICM_ALLOC_SIZE     = 1 << 18,
> +       MLX4_TABLE_CHUNK_SIZE   = 1 << 18
>  };
>  
>  static void mlx4_free_icm_pages(struct mlx4_dev *dev, struct mlx4_icm_chunk *chunk)
> 

And I will add this simple fix, this really should address your initial concern much better.

@@ -99,6 +100,8 @@ static int mlx4_alloc_icm_pages(struct scatterlist *mem, int order,
 {
        struct page *page;
 
+       if (order)
+               gfp_mask |= __GFP_NORETRY;
        page = alloc_pages_node(node, gfp_mask, order);
        if (!page) {
                page = alloc_pages(gfp_mask, order);

^ permalink raw reply

* Re: [PATCH V4] mlx4_core: allocate ICM memory in page size chunks
From: Eric Dumazet @ 2018-05-30  3:49 UTC (permalink / raw)
  To: Eric Dumazet, David Miller, qing.huang
  Cc: tariqt, haakon.bugge, yanjun.zhu, netdev, linux-rdma,
	linux-kernel, gi-oh.kim
In-Reply-To: <0e11e0fc-6ccf-aa93-9c4f-b9eae1b90643@gmail.com>



On 05/29/2018 11:44 PM, Eric Dumazet wrote:

> 
> And I will add this simple fix, this really should address your initial concern much better.
> 
> @@ -99,6 +100,8 @@ static int mlx4_alloc_icm_pages(struct scatterlist *mem, int order,
>  {
>         struct page *page;
>  
> +       if (order)
> +               gfp_mask |= __GFP_NORETRY;

    and also      gfp_mask &= ~__GFP_DIRECT_RECLAIM


>         page = alloc_pages_node(node, gfp_mask, order);
>         if (!page) {
>                 page = alloc_pages(gfp_mask, order);
> 

^ permalink raw reply

* [PATCH net] mlx4_core: restore optimal ICM memory allocation
From: Eric Dumazet @ 2018-05-30  4:11 UTC (permalink / raw)
  To: David S . Miller
  Cc: netdev, Eric Dumazet, Eric Dumazet, John Sperbeck, Tarick Bedeir,
	Qing Huang, Daniel Jurgens, Zhu Yanjun, Tariq Toukan

Commit 1383cb8103bb ("mlx4_core: allocate ICM memory in page size chunks")
brought a regression caught in our regression suite, thanks to KASAN.

Note that mlx4_alloc_icm() is already able to try high order allocations
and fallback to low-order allocations under high memory pressure.

We only have to tweak gfp_mask a bit, to help falling back faster,
without risking OOM killings.

BUG: KASAN: slab-out-of-bounds in to_rdma_ah_attr+0x808/0x9e0 [mlx4_ib]
Read of size 4 at addr ffff8817df584f68 by task qp_listing_test/92585

CPU: 38 PID: 92585 Comm: qp_listing_test Tainted: G           O
Call Trace:
 [<ffffffffba80d7bb>] dump_stack+0x4d/0x72
 [<ffffffffb951dc5f>] print_address_description+0x6f/0x260
 [<ffffffffb951e1c7>] kasan_report+0x257/0x370
 [<ffffffffb951e339>] __asan_report_load4_noabort+0x19/0x20
 [<ffffffffc0256d28>] to_rdma_ah_attr+0x808/0x9e0 [mlx4_ib]
 [<ffffffffc02785b3>] mlx4_ib_query_qp+0x1213/0x1660 [mlx4_ib]
 [<ffffffffc02dbfdb>] qpstat_print_qp+0x13b/0x500 [ib_uverbs]
 [<ffffffffc02dc3ea>] qpstat_seq_show+0x4a/0xb0 [ib_uverbs]
 [<ffffffffb95f125c>] seq_read+0xa9c/0x1230
 [<ffffffffb96e0821>] proc_reg_read+0xc1/0x180
 [<ffffffffb9577918>] __vfs_read+0xe8/0x730
 [<ffffffffb9578057>] vfs_read+0xf7/0x300
 [<ffffffffb95794d2>] SyS_read+0xd2/0x1b0
 [<ffffffffb8e06b16>] do_syscall_64+0x186/0x420
 [<ffffffffbaa00071>] entry_SYSCALL_64_after_hwframe+0x3d/0xa2
RIP: 0033:0x7f851a7bb30d
RSP: 002b:00007ffd09a758c0 EFLAGS: 00000293 ORIG_RAX: 0000000000000000
RAX: ffffffffffffffda RBX: 00007f84ff959440 RCX: 00007f851a7bb30d
RDX: 000000000003fc00 RSI: 00007f84ff60a000 RDI: 000000000000000b
RBP: 00007ffd09a75900 R08: 00000000ffffffff R09: 0000000000000000
R10: 0000000000000022 R11: 0000000000000293 R12: 0000000000000000
R13: 000000000003ffff R14: 000000000003ffff R15: 00007f84ff60a000

Allocated by task 4488:
 save_stack+0x46/0xd0
 kasan_kmalloc+0xad/0xe0
 __kmalloc+0x101/0x5e0
 ib_register_device+0xc03/0x1250 [ib_core]
 mlx4_ib_add+0x27d6/0x4dd0 [mlx4_ib]
 mlx4_add_device+0xa9/0x340 [mlx4_core]
 mlx4_register_interface+0x16e/0x390 [mlx4_core]
 xhci_pci_remove+0x7a/0x180 [xhci_pci]
 do_one_initcall+0xa0/0x230
 do_init_module+0x1b9/0x5a4
 load_module+0x63e6/0x94c0
 SYSC_init_module+0x1a4/0x1c0
 SyS_init_module+0xe/0x10
 do_syscall_64+0x186/0x420
 entry_SYSCALL_64_after_hwframe+0x3d/0xa2

Freed by task 0:
(stack is not available)

The buggy address belongs to the object at ffff8817df584f40
 which belongs to the cache kmalloc-32 of size 32
The buggy address is located 8 bytes to the right of
 32-byte region [ffff8817df584f40, ffff8817df584f60)
The buggy address belongs to the page:
page:ffffea005f7d6100 count:1 mapcount:0 mapping:ffff8817df584000 index:0xffff8817df584fc1
flags: 0x880000000000100(slab)
raw: 0880000000000100 ffff8817df584000 ffff8817df584fc1 000000010000003f
raw: ffffea005f3ac0a0 ffffea005c476760 ffff8817fec00900 ffff883ff78d26c0
page dumped because: kasan: bad access detected
page->mem_cgroup:ffff883ff78d26c0

Memory state around the buggy address:
 ffff8817df584e00: 00 03 fc fc fc fc fc fc 00 03 fc fc fc fc fc fc
 ffff8817df584e80: 00 00 00 04 fc fc fc fc 00 00 00 fc fc fc fc fc
> ffff8817df584f00: fb fb fb fb fc fc fc fc 00 00 00 00 fc fc fc fc
                                                          ^
 ffff8817df584f80: fb fb fb fb fc fc fc fc fc fc fc fc fc fc fc fc
 ffff8817df585000: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb

Fixes: 1383cb8103bb ("mlx4_core: allocate ICM memory in page size chunks")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: John Sperbeck <jsperbeck@google.com>
Cc: Tarick Bedeir <tarick@google.com>
Cc: Qing Huang <qing.huang@oracle.com>
Cc: Daniel Jurgens <danielj@mellanox.com>
Cc: Zhu Yanjun <yanjun.zhu@oracle.com>
Cc: Tariq Toukan <tariqt@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx4/icm.c | 17 +++++++++++------
 1 file changed, 11 insertions(+), 6 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx4/icm.c b/drivers/net/ethernet/mellanox/mlx4/icm.c
index 685337d58276fc91baeeb64387c52985e1bc6dda..cae33d5c7dbd9ba7929adcf2127b104f6796fa5a 100644
--- a/drivers/net/ethernet/mellanox/mlx4/icm.c
+++ b/drivers/net/ethernet/mellanox/mlx4/icm.c
@@ -43,12 +43,13 @@
 #include "fw.h"
 
 /*
- * We allocate in page size (default 4KB on many archs) chunks to avoid high
- * order memory allocations in fragmented/high usage memory situation.
+ * We allocate in as big chunks as we can, up to a maximum of 256 KB
+ * per chunk. Note that the chunks are not necessarily in contiguous
+ * physical memory.
  */
 enum {
-	MLX4_ICM_ALLOC_SIZE	= PAGE_SIZE,
-	MLX4_TABLE_CHUNK_SIZE	= PAGE_SIZE,
+	MLX4_ICM_ALLOC_SIZE	= 1 << 18,
+	MLX4_TABLE_CHUNK_SIZE	= 1 << 18,
 };
 
 static void mlx4_free_icm_pages(struct mlx4_dev *dev, struct mlx4_icm_chunk *chunk)
@@ -135,6 +136,7 @@ struct mlx4_icm *mlx4_alloc_icm(struct mlx4_dev *dev, int npages,
 	struct mlx4_icm *icm;
 	struct mlx4_icm_chunk *chunk = NULL;
 	int cur_order;
+	gfp_t mask;
 	int ret;
 
 	/* We use sg_set_buf for coherent allocs, which assumes low memory */
@@ -178,13 +180,16 @@ struct mlx4_icm *mlx4_alloc_icm(struct mlx4_dev *dev, int npages,
 		while (1 << cur_order > npages)
 			--cur_order;
 
+		mask = gfp_mask;
+		if (cur_order)
+			mask = (mask & ~__GFP_DIRECT_RECLAIM) | __GFP_NORETRY;
 		if (coherent)
 			ret = mlx4_alloc_icm_coherent(&dev->persist->pdev->dev,
 						      &chunk->mem[chunk->npages],
-						      cur_order, gfp_mask);
+						      cur_order, mask);
 		else
 			ret = mlx4_alloc_icm_pages(&chunk->mem[chunk->npages],
-						   cur_order, gfp_mask,
+						   cur_order, mask,
 						   dev->numa_node);
 
 		if (ret) {
-- 
2.17.0.921.gf22659ad46-goog

^ permalink raw reply related

* WARNING in iov_iter_revert
From: Gerb Stralko @ 2018-05-30  4:58 UTC (permalink / raw)
  To: aviadye, davejwatson, davem, ilyal, linux-kernel, netdev,
	syzkaller-bugs

Hello,

I'm in the process of fixing this bug 
(https://syzkaller.appspot.com/bug?id=1339e0a805a4ddb11eaee6fb6b1bc905493ded77)

However, there are a couple things I'm trying to wrap my head around to understand and fix this issue properly.

I'm using the C reproducer from syzkaller[1] to trigger this issue which has been a huge help in learning the Linux kernel.

First in tls_sw.c in tls_sw_sendmsg we having the following: 

`do_tcp_sendpages
   `tls_push_sg
      `tls_push_record 

returns -EPIPE (-32) because sk->sk_shudown is set.  See do_tcp_sendpages 
in net/ipv4/tcp.c

Do we need to check for -EPIPE before continuing in tls_sw.c after 
tls_push_record, for example:

if (ret == -EAGAIN || ret == -EPIPE)
                     ^^^^^^^^^^^^^^
  goto send_end;

If -EPIPE is valid and we don't need to check for that particular 
condition 
then calling iov_iter_revert might be wrong because of the second 
argument:

ctx->sg_plaintext_size - orig_size

Because that statement will evaluate to -1 because ctx->sg_plaintext_size 
has been set to zero from earlier when calling:

`free_sg
  `tls_push_sg  

And orig_size is set to 1 from the beginning of the while loop:

orig_size = ctx->sg_plaintext_size

So calling iov_iter_revert with the second argument being -1 will trigger 
the warning in lib/iov_iter.c:iov_iter_revert:

if (WARN_ON(unroll > MAX_RW_COUNT))
     return;

Am I correct in thinking EPIPE should be checked after returning from 
tls_push_record?  Or do we need to do some sanity checks on 
sg_plaintext_size and orig_size before calling iov_iter_revert?

Any help would be greatly appreciated.

[1] https://syzkaller.appspot.com/text?tag=ReproC&x=141d5417800000

On Sunday, May 13, 2018 at 9:28:03 AM UTC-7, syzbot wrote:
>
> Hello, 
>
> syzbot found the following crash on: 
>
> HEAD commit:    427fbe89261d Merge branch 'next' of 
git://git.kernel.org/p.. 
>
> git tree:       upstream 
> console output: https://syzkaller.appspot.com/x/log.txt?x=16b33477800000 
> kernel config:  
https://syzkaller.appspot.com/x/.config?x=fcce42b221691ff9 
> dashboard link: 
> https://syzkaller.appspot.com/bug?extid=c226690f7b3126c5ee04 
> compiler:       gcc (GCC) 8.0.1 20180413 (experimental) 
> syzkaller 
repro:https://syzkaller.appspot.com/x/repro.syz?x=144f1997800000 
> C reproducer:   https://syzkaller.appspot.com/x/repro.c?x=141d5417800000 
>
> IMPORTANT: if you fix the bug, please add the following tag to the 
commit: 
> Reported-by: syzbot+c22669...@syzkaller.appspotmail.com <javascript:> 
>
> random: sshd: uninitialized urandom read (32 bytes read) 
> random: sshd: uninitialized urandom read (32 bytes read) 
> random: sshd: uninitialized urandom read (32 bytes read) 
> random: sshd: uninitialized urandom read (32 bytes read) 
> random: sshd: uninitialized urandom read (32 bytes read) 
> WARNING: CPU: 1 PID: 4542 at lib/iov_iter.c:857 
> iov_iter_revert+0x2ee/0xaa0   
> lib/iov_iter.c:857 
> Kernel panic - not syncing: panic_on_warn set ... 
>
> CPU: 1 PID: 4542 Comm: syz-executor650 Not tainted 4.17.0-rc4+ #44 
> Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS   
> Google 01/01/2011 
> Call Trace: 
>   __dump_stack lib/dump_stack.c:77 [inline] 
>   dump_stack+0x1b9/0x294 lib/dump_stack.c:113 
>   panic+0x22f/0x4de kernel/panic.c:184 
>   __warn.cold.8+0x163/0x1b3 kernel/panic.c:536 
>   report_bug+0x252/0x2d0 lib/bug.c:186 
>   fixup_bug arch/x86/kernel/traps.c:178 [inline] 
>   do_error_trap+0x1de/0x490 arch/x86/kernel/traps.c:296 
>   do_invalid_op+0x1b/0x20 arch/x86/kernel/traps.c:315 
>   invalid_op+0x14/0x20 arch/x86/entry/entry_64.S:992 
> RIP: 0010:iov_iter_revert+0x2ee/0xaa0 lib/iov_iter.c:857 
> RSP: 0018:ffff8801ad1bf700 EFLAGS: 00010293 
> RAX: ffff8801ac55e6c0 RBX: 00000000ffffffff RCX: ffffffff835104a1 
> RDX: 0000000000000000 RSI: ffffffff8351074e RDI: 0000000000000007 
> RBP: ffff8801ad1bf760 R08: ffff8801ac55e6c0 R09: ffffed003b5e46c2 
> R10: 0000000000000003 R11: 0000000000000001 R12: 0000000000000001 
> R13: ffff8801ad1bfd60 R14: 0000000000000011 R15: ffff8801ae9ac040 
>   tls_sw_sendmsg+0xf1c/0x12d0 net/tls/tls_sw.c:448 
>   inet_sendmsg+0x19f/0x690 net/ipv4/af_inet.c:798 
>   sock_sendmsg_nosec net/socket.c:629 [inline] 
>   sock_sendmsg+0xd5/0x120 net/socket.c:639 
>   ___sys_sendmsg+0x805/0x940 net/socket.c:2117 
>   __sys_sendmsg+0x115/0x270 net/socket.c:2155 
>   __do_sys_sendmsg net/socket.c:2164 [inline] 
>   __se_sys_sendmsg net/socket.c:2162 [inline] 
>   __x64_sys_sendmsg+0x78/0xb0 net/socket.c:2162 
>   do_syscall_64+0x1b1/0x800 arch/x86/entry/common.c:287 
>   entry_SYSCALL_64_after_hwframe+0x49/0xbe 
> RIP: 0033:0x4403a9 
> RSP: 002b:00007ffdcdfbd6c8 EFLAGS: 00000207 ORIG_RAX: 000000000000002e 
> RAX: ffffffffffffffda RBX: 00000000004002c8 RCX: 00000000004403a9 
> RDX: 0000000000000000 RSI: 0000000020001340 RDI: 0000000000000003 
> RBP: 00000000006ca018 R08: 000000000000001c R09: 000000000000001c 
> R10: 0000000020000180 R11: 0000000000000207 R12: 0000000000401cd0 
> R13: 0000000000401d60 R14: 0000000000000000 R15: 0000000000000000 
> Dumping ftrace buffer: 
>     (ftrace buffer empty) 
> Kernel Offset: disabled 
> Rebooting in 86400 seconds.. 
>
>
> --- 
> This bug is generated by a bot. It may contain errors. 
> See https://goo.gl/tpsmEJ for more information about syzbot. 
> syzbot engineers can be reached at syzk...@googlegroups.com 
<javascript:>. 
>
>
> syzbot will keep track of this bug report. See: 
> https://goo.gl/tpsmEJ#bug-status-tracking for how to communicate with   
> syzbot. 
> syzbot can test patches for this bug, for details see: 
> https://goo.gl/tpsmEJ#testing-patches 
>

^ permalink raw reply

* Re: [PATCH bpf v2 0/5] fix test_sockmap
From: John Fastabend @ 2018-05-30  5:12 UTC (permalink / raw)
  To: Prashant Bhole, Alexei Starovoitov, Daniel Borkmann
  Cc: David S . Miller, Shuah Khan, netdev, linux-kselftest
In-Reply-To: <95bb9c9a-25ed-4b10-a181-987a61e0a24c@lab.ntt.co.jp>

On 05/29/2018 05:44 PM, Prashant Bhole wrote:
> 
> 
> On 5/30/2018 12:48 AM, John Fastabend wrote:
>> On 05/27/2018 09:37 PM, Prashant Bhole wrote:
>>> This series fixes error handling, timeout and data verification in
>>> test_sockmap. Previously it was not able to detect failure/timeout in
>>> RX/TX thread because error was not notified to the main thread.
>>>
>>> Also slightly improved test output by printing parameter values (cork,
>>> apply, start, end) so that parameters for all tests are displayed.
>>>
>>> Prashant Bhole (5):
>>>    selftests/bpf: test_sockmap, check test failure
>>>    selftests/bpf: test_sockmap, join cgroup in selftest mode
>>>    selftests/bpf: test_sockmap, fix test timeout
>>>    selftests/bpf: test_sockmap, fix data verification
>>>    selftests/bpf: test_sockmap, print additional test options
>>>
>>>   tools/testing/selftests/bpf/test_sockmap.c | 76 +++++++++++++++++-----
>>>   1 file changed, 58 insertions(+), 18 deletions(-)
>>>
>>
>> After first patch "check test failure" how do we handle the case
>> where test is known to cause timeouts because we are specifically testing
>> these cases. This is the 'cork' parameter we discussed in the last
>> series. It looks like with this series the test may still throw an
>> error?
> 
> Sorry. In your comment in last series, did you mean to skip error
> checking only for all cork tests (for now)?
> 
> -Prashant
> 

Hi, After this is applied are any errors returned from test_sockmap?
When I read the first patch it looked like timeouts from the cork
tests may result in errors "FAILED" tests. If this is the case then
yes we need skip error checking on all tests or just the corked
tests.

.John

^ permalink raw reply

* Re: [PATCH bpf v2 0/5] fix test_sockmap
From: Prashant Bhole @ 2018-05-30  5:31 UTC (permalink / raw)
  To: John Fastabend, Alexei Starovoitov, Daniel Borkmann
  Cc: David S . Miller, Shuah Khan, netdev, linux-kselftest
In-Reply-To: <f477f42d-f3fb-6138-b0af-589d049790a0@gmail.com>



On 5/30/2018 2:12 PM, John Fastabend wrote:
> On 05/29/2018 05:44 PM, Prashant Bhole wrote:
>>
>>
>> On 5/30/2018 12:48 AM, John Fastabend wrote:
>>> On 05/27/2018 09:37 PM, Prashant Bhole wrote:
>>>> This series fixes error handling, timeout and data verification in
>>>> test_sockmap. Previously it was not able to detect failure/timeout in
>>>> RX/TX thread because error was not notified to the main thread.
>>>>
>>>> Also slightly improved test output by printing parameter values (cork,
>>>> apply, start, end) so that parameters for all tests are displayed.
>>>>
>>>> Prashant Bhole (5):
>>>>     selftests/bpf: test_sockmap, check test failure
>>>>     selftests/bpf: test_sockmap, join cgroup in selftest mode
>>>>     selftests/bpf: test_sockmap, fix test timeout
>>>>     selftests/bpf: test_sockmap, fix data verification
>>>>     selftests/bpf: test_sockmap, print additional test options
>>>>
>>>>    tools/testing/selftests/bpf/test_sockmap.c | 76 +++++++++++++++++-----
>>>>    1 file changed, 58 insertions(+), 18 deletions(-)
>>>>
>>>
>>> After first patch "check test failure" how do we handle the case
>>> where test is known to cause timeouts because we are specifically testing
>>> these cases. This is the 'cork' parameter we discussed in the last
>>> series. It looks like with this series the test may still throw an
>>> error?
>>
>> Sorry. In your comment in last series, did you mean to skip error
>> checking only for all cork tests (for now)?
>>
>> -Prashant
>>
> 
> Hi, After this is applied are any errors returned from test_sockmap?
> When I read the first patch it looked like timeouts from the cork
> tests may result in errors "FAILED" tests. If this is the case then
> yes we need skip error checking on all tests or just the corked
> tests.

Yes errors returned after applying this series. I will skip error 
checking on just corked tests.

-Prashant

^ permalink raw reply

* [PATCH v5 net] stmmac: 802.1ad tag stripping fix
From: Elad Nachman @ 2018-05-30  5:48 UTC (permalink / raw)
  To: Toshiaki Makita, David Miller
  Cc: Jose Abreu, Florian Fainelli, netdev, peppe.cavallaro,
	alexandre.torgue, eladv6
In-Reply-To: <8fcb3661-40f5-dc40-0800-d47494e021c1@lab.ntt.co.jp>

stmmac reception handler calls stmmac_rx_vlan() to strip the vlan before calling napi_gro_receive().

The function assumes VLAN tagged frames are always tagged with 802.1Q protocol,
and assigns ETH_P_8021Q to the skb by hard-coding the parameter on call to __vlan_hwaccel_put_tag() .

This causes packets not to be passed to the VLAN slave if it was created with 802.1AD protocol
(ip link add link eth0 eth0.100 type vlan proto 802.1ad id 100).

This fix passes the protocol from the VLAN header into __vlan_hwaccel_put_tag()
instead of using the hard-coded value of ETH_P_8021Q.
NETIF_F_HW_VLAN_CTAG_RX check was removed and NETIF_F_HW_VLAN_STAG_RX feature was added to be in line with the driver actual abilities.

Signed-off-by: Elad Nachman <eladn@gilat.com>

---
 drivers/net/ethernet/stmicro/stmmac/stmmac_main.c | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
index b65e2d1..f680bcf 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
@@ -3293,17 +3293,17 @@ static netdev_tx_t stmmac_xmit(struct sk_buff *skb, struct net_device *dev)
 
 static void stmmac_rx_vlan(struct net_device *dev, struct sk_buff *skb)
 {
-	struct ethhdr *ehdr;
+	struct vlan_ethhdr *veth;
 	u16 vlanid;
+	__be16 vlan_proto;
 
-	if ((dev->features & NETIF_F_HW_VLAN_CTAG_RX) ==
-	    NETIF_F_HW_VLAN_CTAG_RX &&
-	    !__vlan_get_tag(skb, &vlanid)) {
+	if (!__vlan_get_tag(skb, &vlanid)) {
 		/* pop the vlan tag */
-		ehdr = (struct ethhdr *)skb->data;
-		memmove(skb->data + VLAN_HLEN, ehdr, ETH_ALEN * 2);
+		veth = (struct vlan_ethhdr *)skb->data;
+		vlan_proto = veth->h_vlan_proto;
+		memmove(skb->data + VLAN_HLEN, veth, ETH_ALEN * 2);
 		skb_pull(skb, VLAN_HLEN);
-		__vlan_hwaccel_put_tag(skb, htons(ETH_P_8021Q), vlanid);
+		__vlan_hwaccel_put_tag(skb, vlan_proto, vlanid);
 	}
 }
 
@@ -4344,7 +4344,7 @@ int stmmac_dvr_probe(struct device *device,
 	ndev->watchdog_timeo = msecs_to_jiffies(watchdog);
 #ifdef STMMAC_VLAN_TAG_USED
 	/* Both mac100 and gmac support receive VLAN tag detection */
-	ndev->features |= NETIF_F_HW_VLAN_CTAG_RX;
+	ndev->features |= NETIF_F_HW_VLAN_CTAG_RX | NETIF_F_HW_VLAN_STAG_RX;
 #endif
 	priv->msg_enable = netif_msg_init(debug, default_msg_level);
 
-- 
2.7.4

^ permalink raw reply related

* Re: [PATCH net-next 1/3] net: Add support to configure SR-IOV VF minimum and maximum queues.
From: Jakub Kicinski @ 2018-05-30  5:56 UTC (permalink / raw)
  To: Michael Chan, Samudrala, Sridhar; +Cc: David Miller, Netdev, Or Gerlitz

On Tue, 29 May 2018 20:19:54 -0700, Michael Chan wrote:
> On Tue, May 29, 2018 at 1:46 PM, Samudrala, Sridhar wrote:
> > Isn't ndo_set_vf_xxx() considered a legacy interface and not planned to be
> > extended?

+1 it's painful to see this feature being added to the legacy
API :(  Another duplicated configuration knob.

> I didn't know about that.
>
> > Shouldn't we enable this via ethtool on the port representor netdev?
>
> We discussed about this.  ethtool on the VF representor will only work
> in switchdev mode and also will not support min/max values.

Ethtool channel API may be overdue a rewrite in devlink anyway, but I
feel like implementing switchdev mode and rewriting features in devlink
may be too much to ask.

^ permalink raw reply

* [PATCH bpf v3 0/5] fix test_sockmap
From: Prashant Bhole @ 2018-05-30  5:56 UTC (permalink / raw)
  To: Alexei Starovoitov, Daniel Borkmann, John Fastabend
  Cc: Prashant Bhole, David S . Miller, Shuah Khan, netdev,
	linux-kselftest

This series fixes error handling, timeout and data verification in
test_sockmap. Previously it was not able to detect failure/timeout in
RX/TX thread because error was not notified to the main thread.

Also slightly improved test output by printing parameter values (cork,
apply, start, end) so that parameters for all tests are displayed.

Changes in v3:
  - Skipped error checking for corked tests

Prashant Bhole (5):
  selftests/bpf: test_sockmap, check test failure
  selftests/bpf: test_sockmap, join cgroup in selftest mode
  selftests/bpf: test_sockmap, fix test timeout
  selftests/bpf: test_sockmap, fix data verification
  selftests/bpf: test_sockmap, print additional test options

 tools/testing/selftests/bpf/test_sockmap.c | 76 +++++++++++++++++-----
 1 file changed, 58 insertions(+), 18 deletions(-)

-- 
2.17.0

^ permalink raw reply

* [PATCH bpf v3 1/5] selftests/bpf: test_sockmap, check test failure
From: Prashant Bhole @ 2018-05-30  5:56 UTC (permalink / raw)
  To: Alexei Starovoitov, Daniel Borkmann, John Fastabend
  Cc: Prashant Bhole, David S . Miller, Shuah Khan, netdev,
	linux-kselftest
In-Reply-To: <20180530055611.10216-1-bhole_prashant_q7@lab.ntt.co.jp>

Test failures are not identified because exit code of RX/TX threads
is not checked. Also threads are not returning correct exit code.

- Return exit code from threads depending on test execution status
- In main thread, check the exit code of RX/TX threads
- Skip error checking for corked tests as they are expected to timeout

Fixes: 16962b2404ac ("bpf: sockmap, add selftests")
Signed-off-by: Prashant Bhole <bhole_prashant_q7@lab.ntt.co.jp>
---
 tools/testing/selftests/bpf/test_sockmap.c | 25 ++++++++++++++++------
 1 file changed, 19 insertions(+), 6 deletions(-)

diff --git a/tools/testing/selftests/bpf/test_sockmap.c b/tools/testing/selftests/bpf/test_sockmap.c
index eb17fae458e6..01bc9c6745e8 100644
--- a/tools/testing/selftests/bpf/test_sockmap.c
+++ b/tools/testing/selftests/bpf/test_sockmap.c
@@ -429,8 +429,8 @@ static int sendmsg_test(struct sockmap_options *opt)
 	struct msg_stats s = {0};
 	int iov_count = opt->iov_count;
 	int iov_buf = opt->iov_length;
+	int rx_status, tx_status;
 	int cnt = opt->rate;
-	int status;
 
 	errno = 0;
 
@@ -442,7 +442,7 @@ static int sendmsg_test(struct sockmap_options *opt)
 	rxpid = fork();
 	if (rxpid == 0) {
 		if (opt->drop_expected)
-			exit(1);
+			exit(0);
 
 		if (opt->sendpage)
 			iov_count = 1;
@@ -463,7 +463,7 @@ static int sendmsg_test(struct sockmap_options *opt)
 				"rx_sendmsg: TX: %zuB %fB/s %fGB/s RX: %zuB %fB/s %fGB/s\n",
 				s.bytes_sent, sent_Bps, sent_Bps/giga,
 				s.bytes_recvd, recvd_Bps, recvd_Bps/giga);
-		exit(1);
+		exit(err ? 1 : 0);
 	} else if (rxpid == -1) {
 		perror("msg_loop_rx: ");
 		return errno;
@@ -491,14 +491,27 @@ static int sendmsg_test(struct sockmap_options *opt)
 				"tx_sendmsg: TX: %zuB %fB/s %f GB/s RX: %zuB %fB/s %fGB/s\n",
 				s.bytes_sent, sent_Bps, sent_Bps/giga,
 				s.bytes_recvd, recvd_Bps, recvd_Bps/giga);
-		exit(1);
+		exit(err ? 1 : 0);
 	} else if (txpid == -1) {
 		perror("msg_loop_tx: ");
 		return errno;
 	}
 
-	assert(waitpid(rxpid, &status, 0) == rxpid);
-	assert(waitpid(txpid, &status, 0) == txpid);
+	assert(waitpid(rxpid, &rx_status, 0) == rxpid);
+	assert(waitpid(txpid, &tx_status, 0) == txpid);
+	if (WIFEXITED(rx_status)) {
+		err = WEXITSTATUS(rx_status);
+		if (err && !txmsg_cork) {
+			fprintf(stderr, "rx thread exited with err %d. ", err);
+			goto out;
+		}
+	}
+	if (WIFEXITED(tx_status)) {
+		err = WEXITSTATUS(tx_status);
+		if (err)
+			fprintf(stderr, "tx thread exited with err %d. ", err);
+	}
+out:
 	return err;
 }
 
-- 
2.17.0

^ permalink raw reply related

* [PATCH bpf v3 2/5] selftests/bpf: test_sockmap, join cgroup in selftest mode
From: Prashant Bhole @ 2018-05-30  5:56 UTC (permalink / raw)
  To: Alexei Starovoitov, Daniel Borkmann, John Fastabend
  Cc: Prashant Bhole, David S . Miller, Shuah Khan, netdev,
	linux-kselftest
In-Reply-To: <20180530055611.10216-1-bhole_prashant_q7@lab.ntt.co.jp>

In case of selftest mode, temporary cgroup environment is created but
cgroup is not joined. It causes test failures. Fixed by joining the
cgroup

Fixes: 16962b2404ac ("bpf: sockmap, add selftests")
Acked-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Prashant Bhole <bhole_prashant_q7@lab.ntt.co.jp>
---
 tools/testing/selftests/bpf/test_sockmap.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/tools/testing/selftests/bpf/test_sockmap.c b/tools/testing/selftests/bpf/test_sockmap.c
index 01bc9c6745e8..64f9e25c451f 100644
--- a/tools/testing/selftests/bpf/test_sockmap.c
+++ b/tools/testing/selftests/bpf/test_sockmap.c
@@ -1342,6 +1342,11 @@ static int __test_suite(char *bpf_file)
 		return cg_fd;
 	}
 
+	if (join_cgroup(CG_PATH)) {
+		fprintf(stderr, "ERROR: failed to join cgroup\n");
+		return -EINVAL;
+	}
+
 	/* Tests basic commands and APIs with range of iov values */
 	txmsg_start = txmsg_end = 0;
 	err = test_txmsg(cg_fd);
-- 
2.17.0

^ permalink raw reply related

* [PATCH bpf v3 3/5] selftests/bpf: test_sockmap, fix test timeout
From: Prashant Bhole @ 2018-05-30  5:56 UTC (permalink / raw)
  To: Alexei Starovoitov, Daniel Borkmann, John Fastabend
  Cc: Prashant Bhole, David S . Miller, Shuah Khan, netdev,
	linux-kselftest
In-Reply-To: <20180530055611.10216-1-bhole_prashant_q7@lab.ntt.co.jp>

In order to reduce runtime of tests, recently timout for select() call
was reduced from 1sec to 10usec. This was causing many tests failures.
It was caught with failure handling commits in this series.

Restoring the timeout from 10usec to 1sec

Fixes: a18fda1a62c3 ("bpf: reduce runtime of test_sockmap tests")
Signed-off-by: Prashant Bhole <bhole_prashant_q7@lab.ntt.co.jp>
---
 tools/testing/selftests/bpf/test_sockmap.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tools/testing/selftests/bpf/test_sockmap.c b/tools/testing/selftests/bpf/test_sockmap.c
index 64f9e25c451f..9d01f5c2abe2 100644
--- a/tools/testing/selftests/bpf/test_sockmap.c
+++ b/tools/testing/selftests/bpf/test_sockmap.c
@@ -345,8 +345,8 @@ static int msg_loop(int fd, int iov_count, int iov_length, int cnt,
 		if (err < 0)
 			perror("recv start time: ");
 		while (s->bytes_recvd < total_bytes) {
-			timeout.tv_sec = 0;
-			timeout.tv_usec = 10;
+			timeout.tv_sec = 1;
+			timeout.tv_usec = 0;
 
 			/* FD sets */
 			FD_ZERO(&w);
-- 
2.17.0

^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox