Netdev List
 help / color / mirror / Atom feed
* [PATCH net-next V2 1/7] net: Add net-device param to the get offloaded stats ndo
From: Saeed Mahameed @ 2016-11-22 21:09 UTC (permalink / raw)
  To: David S. Miller; +Cc: netdev, Or Gerlitz, Roi Dayan, Saeed Mahameed
In-Reply-To: <1479849000-14902-1-git-send-email-saeedm@mellanox.com>

From: Or Gerlitz <ogerlitz@mellanox.com>

Some drivers need to check a few internal matters before reporting
whether offload stats are supported. To be used in a downstream mlx5 commit.

Signed-off-by: Or Gerlitz <ogerlitz@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlxsw/spectrum.c | 2 +-
 include/linux/netdevice.h                      | 4 ++--
 net/core/rtnetlink.c                           | 4 ++--
 3 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
index 4a1f9d5..e0d7d5a 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
@@ -857,7 +857,7 @@ mlxsw_sp_port_get_sw_stats64(const struct net_device *dev,
 	return 0;
 }
 
-static bool mlxsw_sp_port_has_offload_stats(int attr_id)
+static bool mlxsw_sp_port_has_offload_stats(const struct net_device *dev, int attr_id)
 {
 	switch (attr_id) {
 	case IFLA_OFFLOAD_XSTATS_CPU_HIT:
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index e84800e..ae32a27 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -925,7 +925,7 @@ struct netdev_xdp {
  *	3. Update dev->stats asynchronously and atomically, and define
  *	   neither operation.
  *
- * bool (*ndo_has_offload_stats)(int attr_id)
+ * bool (*ndo_has_offload_stats)(const struct net_device *dev, int attr_id)
  *	Return true if this device supports offload stats of this attr_id.
  *
  * int (*ndo_get_offload_stats)(int attr_id, const struct net_device *dev,
@@ -1165,7 +1165,7 @@ struct net_device_ops {
 
 	struct rtnl_link_stats64* (*ndo_get_stats64)(struct net_device *dev,
 						     struct rtnl_link_stats64 *storage);
-	bool			(*ndo_has_offload_stats)(int attr_id);
+	bool			(*ndo_has_offload_stats)(const struct net_device *dev, int attr_id);
 	int			(*ndo_get_offload_stats)(int attr_id,
 							 const struct net_device *dev,
 							 void *attr_data);
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index db313ec..f5a8d8a 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -3665,7 +3665,7 @@ static int rtnl_get_offload_stats(struct sk_buff *skb, struct net_device *dev,
 		if (!size)
 			continue;
 
-		if (!dev->netdev_ops->ndo_has_offload_stats(attr_id))
+		if (!dev->netdev_ops->ndo_has_offload_stats(dev, attr_id))
 			continue;
 
 		attr = nla_reserve_64bit(skb, attr_id, size,
@@ -3706,7 +3706,7 @@ static int rtnl_get_offload_stats_size(const struct net_device *dev)
 
 	for (attr_id = IFLA_OFFLOAD_XSTATS_FIRST;
 	     attr_id <= IFLA_OFFLOAD_XSTATS_MAX; attr_id++) {
-		if (!dev->netdev_ops->ndo_has_offload_stats(attr_id))
+		if (!dev->netdev_ops->ndo_has_offload_stats(dev, attr_id))
 			continue;
 		size = rtnl_get_offload_stats_attr_size(attr_id);
 		nla_size += nla_total_size_64bit(size);
-- 
2.7.4

^ permalink raw reply related

* [PATCH net-next V2 6/7] net/mlx5: E-Switch, Add control for inline mode
From: Saeed Mahameed @ 2016-11-22 21:09 UTC (permalink / raw)
  To: David S. Miller; +Cc: netdev, Or Gerlitz, Roi Dayan, Saeed Mahameed
In-Reply-To: <1479849000-14902-1-git-send-email-saeedm@mellanox.com>

From: Roi Dayan <roid@mellanox.com>

Implement devlink show and set of HW inline-mode.
The supported modes: none, link, network, transport.
We currently support one mode for all vports so set is done on all vports.
When eswitch is first initialized the inline-mode is queried from the FW.

Signed-off-by: Roi Dayan <roid@mellanox.com>
Reviewed-by: Or Gerlitz <ogerlitz@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/eswitch.c  |   1 +
 drivers/net/ethernet/mellanox/mlx5/core/eswitch.h  |   4 +
 .../ethernet/mellanox/mlx5/core/eswitch_offloads.c | 141 +++++++++++++++++++++
 drivers/net/ethernet/mellanox/mlx5/core/main.c     |   2 +
 4 files changed, 148 insertions(+)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
index 9734ac8..d6807c3 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
@@ -1798,6 +1798,7 @@ int mlx5_eswitch_init(struct mlx5_core_dev *dev)
 	esw->total_vports = total_vports;
 	esw->enabled_vports = 0;
 	esw->mode = SRIOV_NONE;
+	esw->offloads.inline_mode = MLX5_INLINE_MODE_NONE;
 
 	dev->priv.eswitch = esw;
 	return 0;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
index 40482e8..cf1aa56 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
@@ -200,6 +200,7 @@ struct mlx5_esw_offload {
 	struct mlx5_flow_group *vport_rx_group;
 	struct mlx5_eswitch_rep *vport_reps;
 	DECLARE_HASHTABLE(encap_tbl, 8);
+	u8 inline_mode;
 };
 
 struct mlx5_eswitch {
@@ -309,6 +310,9 @@ void mlx5_eswitch_sqs2vport_stop(struct mlx5_eswitch *esw,
 
 int mlx5_devlink_eswitch_mode_set(struct devlink *devlink, u16 mode);
 int mlx5_devlink_eswitch_mode_get(struct devlink *devlink, u16 *mode);
+int mlx5_devlink_eswitch_inline_mode_set(struct devlink *devlink, u8 mode);
+int mlx5_devlink_eswitch_inline_mode_get(struct devlink *devlink, u8 *mode);
+int mlx5_eswitch_inline_mode_get(struct mlx5_eswitch *esw, int nvfs, u8 *mode);
 void mlx5_eswitch_register_vport_rep(struct mlx5_eswitch *esw,
 				     int vport_index,
 				     struct mlx5_eswitch_rep *rep);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
index 731f286..5c01550 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
@@ -657,6 +657,14 @@ static int esw_offloads_start(struct mlx5_eswitch *esw)
 		if (err1)
 			esw_warn(esw->dev, "Failed setting eswitch back to legacy, err %d\n", err);
 	}
+	if (esw->offloads.inline_mode == MLX5_INLINE_MODE_NONE) {
+		if (mlx5_eswitch_inline_mode_get(esw,
+						 num_vfs,
+						 &esw->offloads.inline_mode)) {
+			esw->offloads.inline_mode = MLX5_INLINE_MODE_L2;
+			esw_warn(esw->dev, "Inline mode is different between vports\n");
+		}
+	}
 	return err;
 }
 
@@ -771,6 +779,50 @@ static int esw_mode_to_devlink(u16 mlx5_mode, u16 *mode)
 	return 0;
 }
 
+static int esw_inline_mode_from_devlink(u8 mode, u8 *mlx5_mode)
+{
+	switch (mode) {
+	case DEVLINK_ESWITCH_INLINE_MODE_NONE:
+		*mlx5_mode = MLX5_INLINE_MODE_NONE;
+		break;
+	case DEVLINK_ESWITCH_INLINE_MODE_LINK:
+		*mlx5_mode = MLX5_INLINE_MODE_L2;
+		break;
+	case DEVLINK_ESWITCH_INLINE_MODE_NETWORK:
+		*mlx5_mode = MLX5_INLINE_MODE_IP;
+		break;
+	case DEVLINK_ESWITCH_INLINE_MODE_TRANSPORT:
+		*mlx5_mode = MLX5_INLINE_MODE_TCP_UDP;
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static int esw_inline_mode_to_devlink(u8 mlx5_mode, u8 *mode)
+{
+	switch (mlx5_mode) {
+	case MLX5_INLINE_MODE_NONE:
+		*mode = DEVLINK_ESWITCH_INLINE_MODE_NONE;
+		break;
+	case MLX5_INLINE_MODE_L2:
+		*mode = DEVLINK_ESWITCH_INLINE_MODE_LINK;
+		break;
+	case MLX5_INLINE_MODE_IP:
+		*mode = DEVLINK_ESWITCH_INLINE_MODE_NETWORK;
+		break;
+	case MLX5_INLINE_MODE_TCP_UDP:
+		*mode = DEVLINK_ESWITCH_INLINE_MODE_TRANSPORT;
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
 int mlx5_devlink_eswitch_mode_set(struct devlink *devlink, u16 mode)
 {
 	struct mlx5_core_dev *dev;
@@ -815,6 +867,95 @@ int mlx5_devlink_eswitch_mode_get(struct devlink *devlink, u16 *mode)
 	return esw_mode_to_devlink(dev->priv.eswitch->mode, mode);
 }
 
+int mlx5_devlink_eswitch_inline_mode_set(struct devlink *devlink, u8 mode)
+{
+	struct mlx5_core_dev *dev = devlink_priv(devlink);
+	struct mlx5_eswitch *esw = dev->priv.eswitch;
+	int num_vports = esw->enabled_vports;
+	int err;
+	int vport;
+	u8 mlx5_mode;
+
+	if (!MLX5_CAP_GEN(dev, vport_group_manager))
+		return -EOPNOTSUPP;
+
+	if (esw->mode == SRIOV_NONE)
+		return -EOPNOTSUPP;
+
+	if (MLX5_CAP_ETH(dev, wqe_inline_mode) !=
+	    MLX5_CAP_INLINE_MODE_VPORT_CONTEXT)
+		return -EOPNOTSUPP;
+
+	err = esw_inline_mode_from_devlink(mode, &mlx5_mode);
+	if (err)
+		goto out;
+
+	for (vport = 1; vport < num_vports; vport++) {
+		err = mlx5_modify_nic_vport_min_inline(dev, vport, mlx5_mode);
+		if (err) {
+			esw_warn(dev, "Failed to set min inline on vport %d\n",
+				 vport);
+			goto revert_inline_mode;
+		}
+	}
+
+	esw->offloads.inline_mode = mlx5_mode;
+	return 0;
+
+revert_inline_mode:
+	while (--vport > 0)
+		mlx5_modify_nic_vport_min_inline(dev,
+						 vport,
+						 esw->offloads.inline_mode);
+out:
+	return err;
+}
+
+int mlx5_devlink_eswitch_inline_mode_get(struct devlink *devlink, u8 *mode)
+{
+	struct mlx5_core_dev *dev = devlink_priv(devlink);
+	struct mlx5_eswitch *esw = dev->priv.eswitch;
+
+	if (!MLX5_CAP_GEN(dev, vport_group_manager))
+		return -EOPNOTSUPP;
+
+	if (esw->mode == SRIOV_NONE)
+		return -EOPNOTSUPP;
+
+	if (MLX5_CAP_ETH(dev, wqe_inline_mode) !=
+	    MLX5_CAP_INLINE_MODE_VPORT_CONTEXT)
+		return -EOPNOTSUPP;
+
+	return esw_inline_mode_to_devlink(esw->offloads.inline_mode, mode);
+}
+
+int mlx5_eswitch_inline_mode_get(struct mlx5_eswitch *esw, int nvfs, u8 *mode)
+{
+	struct mlx5_core_dev *dev = esw->dev;
+	int vport;
+	u8 prev_mlx5_mode, mlx5_mode = MLX5_INLINE_MODE_L2;
+
+	if (!MLX5_CAP_GEN(dev, vport_group_manager))
+		return -EOPNOTSUPP;
+
+	if (esw->mode == SRIOV_NONE)
+		return -EOPNOTSUPP;
+
+	if (MLX5_CAP_ETH(dev, wqe_inline_mode) !=
+	    MLX5_CAP_INLINE_MODE_VPORT_CONTEXT)
+		return -EOPNOTSUPP;
+
+	for (vport = 1; vport <= nvfs; vport++) {
+		mlx5_query_nic_vport_min_inline(dev, vport, &mlx5_mode);
+		if (vport > 1 && prev_mlx5_mode != mlx5_mode)
+			return -EINVAL;
+		prev_mlx5_mode = mlx5_mode;
+	}
+
+	*mode = mlx5_mode;
+	return 0;
+}
+
 void mlx5_eswitch_register_vport_rep(struct mlx5_eswitch *esw,
 				     int vport_index,
 				     struct mlx5_eswitch_rep *__rep)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c
index f28df33..b440a16 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c
@@ -1239,6 +1239,8 @@ static const struct devlink_ops mlx5_devlink_ops = {
 #ifdef CONFIG_MLX5_CORE_EN
 	.eswitch_mode_set = mlx5_devlink_eswitch_mode_set,
 	.eswitch_mode_get = mlx5_devlink_eswitch_mode_get,
+	.eswitch_inline_mode_set = mlx5_devlink_eswitch_inline_mode_set,
+	.eswitch_inline_mode_get = mlx5_devlink_eswitch_inline_mode_get,
 #endif
 };
 
-- 
2.7.4

^ permalink raw reply related

* [PATCH net-next V2 7/7] net/mlx5e: Enforce min inline mode when offloading flows
From: Saeed Mahameed @ 2016-11-22 21:10 UTC (permalink / raw)
  To: David S. Miller; +Cc: netdev, Or Gerlitz, Roi Dayan, Saeed Mahameed
In-Reply-To: <1479849000-14902-1-git-send-email-saeedm@mellanox.com>

From: Roi Dayan <roid@mellanox.com>

A flow should be offloaded only if the matches are
allowed according to min inline mode.

Signed-off-by: Roi Dayan <roid@mellanox.com>
Reviewed-by: Or Gerlitz <ogerlitz@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/en_tc.c | 46 +++++++++++++++++++++++--
 1 file changed, 44 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
index 4b99112..4d06fab 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
@@ -279,8 +279,10 @@ static int parse_tunnel_attr(struct mlx5e_priv *priv,
 	return 0;
 }
 
-static int parse_cls_flower(struct mlx5e_priv *priv, struct mlx5_flow_spec *spec,
-			    struct tc_cls_flower_offload *f)
+static int __parse_cls_flower(struct mlx5e_priv *priv,
+			      struct mlx5_flow_spec *spec,
+			      struct tc_cls_flower_offload *f,
+			      u8 *min_inline)
 {
 	void *headers_c = MLX5_ADDR_OF(fte_match_param, spec->match_criteria,
 				       outer_headers);
@@ -289,6 +291,8 @@ static int parse_cls_flower(struct mlx5e_priv *priv, struct mlx5_flow_spec *spec
 	u16 addr_type = 0;
 	u8 ip_proto = 0;
 
+	*min_inline = MLX5_INLINE_MODE_L2;
+
 	if (f->dissector->used_keys &
 	    ~(BIT(FLOW_DISSECTOR_KEY_CONTROL) |
 	      BIT(FLOW_DISSECTOR_KEY_BASIC) |
@@ -362,6 +366,9 @@ static int parse_cls_flower(struct mlx5e_priv *priv, struct mlx5_flow_spec *spec
 			 mask->ip_proto);
 		MLX5_SET(fte_match_set_lyr_2_4, headers_v, ip_protocol,
 			 key->ip_proto);
+
+		if (mask->ip_proto)
+			*min_inline = MLX5_INLINE_MODE_IP;
 	}
 
 	if (dissector_uses_key(f->dissector, FLOW_DISSECTOR_KEY_ETH_ADDRS)) {
@@ -432,6 +439,9 @@ static int parse_cls_flower(struct mlx5e_priv *priv, struct mlx5_flow_spec *spec
 		memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_v,
 				    dst_ipv4_dst_ipv6.ipv4_layout.ipv4),
 		       &key->dst, sizeof(key->dst));
+
+		if (mask->src || mask->dst)
+			*min_inline = MLX5_INLINE_MODE_IP;
 	}
 
 	if (addr_type == FLOW_DISSECTOR_KEY_IPV6_ADDRS) {
@@ -457,6 +467,10 @@ static int parse_cls_flower(struct mlx5e_priv *priv, struct mlx5_flow_spec *spec
 		memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_v,
 				    dst_ipv4_dst_ipv6.ipv6_layout.ipv6),
 		       &key->dst, sizeof(key->dst));
+
+		if (ipv6_addr_type(&mask->src) != IPV6_ADDR_ANY ||
+		    ipv6_addr_type(&mask->dst) != IPV6_ADDR_ANY)
+			*min_inline = MLX5_INLINE_MODE_IP;
 	}
 
 	if (dissector_uses_key(f->dissector, FLOW_DISSECTOR_KEY_PORTS)) {
@@ -497,11 +511,39 @@ static int parse_cls_flower(struct mlx5e_priv *priv, struct mlx5_flow_spec *spec
 				   "Only UDP and TCP transport are supported\n");
 			return -EINVAL;
 		}
+
+		if (mask->src || mask->dst)
+			*min_inline = MLX5_INLINE_MODE_TCP_UDP;
 	}
 
 	return 0;
 }
 
+static int parse_cls_flower(struct mlx5e_priv *priv,
+			    struct mlx5_flow_spec *spec,
+			    struct tc_cls_flower_offload *f)
+{
+	struct mlx5_core_dev *dev = priv->mdev;
+	struct mlx5_eswitch *esw = dev->priv.eswitch;
+	struct mlx5_eswitch_rep *rep = priv->ppriv;
+	u8 min_inline;
+	int err;
+
+	err = __parse_cls_flower(priv, spec, f, &min_inline);
+
+	if (!err && esw->mode == SRIOV_OFFLOADS &&
+	    rep->vport != FDB_UPLINK_VPORT) {
+		if (min_inline > esw->offloads.inline_mode) {
+			netdev_warn(priv->netdev,
+				    "Flow is not offloaded due to min inline setting, required %d actual %d\n",
+				    min_inline, esw->offloads.inline_mode);
+			return -EOPNOTSUPP;
+		}
+	}
+
+	return err;
+}
+
 static int parse_tc_nic_actions(struct mlx5e_priv *priv, struct tcf_exts *exts,
 				u32 *action, u32 *flow_tag)
 {
-- 
2.7.4

^ permalink raw reply related

* Re: [PATCH net-next] ethtool: Protect {get,set}_phy_tunable with PHY device mutex
From: Florian Fainelli @ 2016-11-22 21:02 UTC (permalink / raw)
  To: netdev
  Cc: davem, bcm-kernel-feedback-list, andrew, allan.nielsen,
	raju.lakkaraju, vivien.didelot
In-Reply-To: <20161122201316.10830-1-f.fainelli@gmail.com>

On 11/22/2016 12:13 PM, Florian Fainelli wrote:
> PHY drivers should be able to rely on the caller of {get,set}_tunable to
> have acquired the PHY device mutex, in order to both serialize against
> concurrent calls of these functions, but also against PHY state machine
> changes. All ethtool PHY-level functions do this, except
> {get,set}_tunable, so we make them consistent here as well.
> 
> Fixes: 968ad9da7e0e ("ethtool: Implements ETHTOOL_PHY_GTUNABLE/ETHTOOL_PHY_STUNABLE")
> Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>

David, please discard, this is going to create problems for the
Microsemi PHY driver since it also acquires phydev->lock (the patch has
been marked accordingly in patchwork).
Thanks!
-- 
Florian

^ permalink raw reply

* [PATCH net-next v2] ethtool: Protect {get,set}_phy_tunable with PHY device mutex
From: Florian Fainelli @ 2016-11-22 21:55 UTC (permalink / raw)
  To: netdev
  Cc: davem, bcm-kernel-feedback-list, andrew, allan.nielsen,
	raju.lakkaraju, vivien.didelot, Florian Fainelli

PHY drivers should be able to rely on the caller of {get,set}_tunable to
have acquired the PHY device mutex, in order to both serialize against
concurrent calls of these functions, but also against PHY state machine
changes. All ethtool PHY-level functions do this, except
{get,set}_tunable, so we make them consistent here as well.

We need to update the Microsemi PHY driver in the same commit to avoid
introducing either deadlocks, or lack of proper locking.

Fixes: 968ad9da7e0e ("ethtool: Implements ETHTOOL_PHY_GTUNABLE/ETHTOOL_PHY_STUNABLE")
Fixes: 310d9ad57ae0 ("net: phy: Add downshift get/set support in Microsemi PHYs driver")
Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
---
Changes in v2:

- also patch drivers/net/phy/mscc.c in the same commit

 drivers/net/phy/mscc.c | 16 +++++-----------
 net/core/ethtool.c     |  4 ++++
 2 files changed, 9 insertions(+), 11 deletions(-)

diff --git a/drivers/net/phy/mscc.c b/drivers/net/phy/mscc.c
index 92018ba6209e..7a3740c7bf6d 100644
--- a/drivers/net/phy/mscc.c
+++ b/drivers/net/phy/mscc.c
@@ -115,10 +115,9 @@ static int vsc85xx_downshift_get(struct phy_device *phydev, u8 *count)
 	int rc;
 	u16 reg_val;
 
-	mutex_lock(&phydev->lock);
 	rc = vsc85xx_phy_page_set(phydev, MSCC_PHY_PAGE_EXTENDED);
 	if (rc != 0)
-		goto out_unlock;
+		goto out;
 
 	reg_val = phy_read(phydev, MSCC_PHY_ACTIPHY_CNTL);
 	reg_val &= DOWNSHIFT_CNTL_MASK;
@@ -128,9 +127,7 @@ static int vsc85xx_downshift_get(struct phy_device *phydev, u8 *count)
 		*count = ((reg_val & ~DOWNSHIFT_EN) >> DOWNSHIFT_CNTL_POS) + 2;
 	rc = vsc85xx_phy_page_set(phydev, MSCC_PHY_PAGE_STANDARD);
 
-out_unlock:
-	mutex_unlock(&phydev->lock);
-
+out:
 	return rc;
 }
 
@@ -150,23 +147,20 @@ static int vsc85xx_downshift_set(struct phy_device *phydev, u8 count)
 		count = (((count - 2) << DOWNSHIFT_CNTL_POS) | DOWNSHIFT_EN);
 	}
 
-	mutex_lock(&phydev->lock);
 	rc = vsc85xx_phy_page_set(phydev, MSCC_PHY_PAGE_EXTENDED);
 	if (rc != 0)
-		goto out_unlock;
+		goto out;
 
 	reg_val = phy_read(phydev, MSCC_PHY_ACTIPHY_CNTL);
 	reg_val &= ~(DOWNSHIFT_CNTL_MASK);
 	reg_val |= count;
 	rc = phy_write(phydev, MSCC_PHY_ACTIPHY_CNTL, reg_val);
 	if (rc != 0)
-		goto out_unlock;
+		goto out;
 
 	rc = vsc85xx_phy_page_set(phydev, MSCC_PHY_PAGE_STANDARD);
 
-out_unlock:
-	mutex_unlock(&phydev->lock);
-
+out:
 	return rc;
 }
 
diff --git a/net/core/ethtool.c b/net/core/ethtool.c
index e9b4556751ff..0adb3bec5b5a 100644
--- a/net/core/ethtool.c
+++ b/net/core/ethtool.c
@@ -2466,7 +2466,9 @@ static int get_phy_tunable(struct net_device *dev, void __user *useraddr)
 	data = kmalloc(tuna.len, GFP_USER);
 	if (!data)
 		return -ENOMEM;
+	mutex_lock(&phydev->lock);
 	ret = phydev->drv->get_tunable(phydev, &tuna, data);
+	mutex_unlock(&phydev->lock);
 	if (ret)
 		goto out;
 	useraddr += sizeof(tuna);
@@ -2501,7 +2503,9 @@ static int set_phy_tunable(struct net_device *dev, void __user *useraddr)
 	ret = -EFAULT;
 	if (copy_from_user(data, useraddr, tuna.len))
 		goto out;
+	mutex_lock(&phydev->lock);
 	ret = phydev->drv->set_tunable(phydev, &tuna, data);
+	mutex_unlock(&phydev->lock);
 
 out:
 	kfree(data);
-- 
2.9.3

^ permalink raw reply related

* Re: [net] 34fad54c25: kernel BUG at include/linux/skbuff.h:1935!
From: Linus Torvalds @ 2016-11-22 22:04 UTC (permalink / raw)
  To: kernel test robot, David Miller, Eric Dumazet
  Cc: LKP, LKML, Alexei Starovoitov, Willem de Bruijn, Alexander Duyck,
	Network Development
In-Reply-To: <582b7c30.nXQXP2V4/6pFiYwt%xiaolong.ye@intel.com>

David, Eric,

 what's the situation on this issue? The bisection looks a bit odd,
but the commit in question does end up changing the key_control->thoff
value for the failure case, so maybe that in turn ends up screwing up
a later skb_pull.

I'm not seeing anything that might fix this in the last networking
pull, but I may have missed something.

I also noticed that the kernel test robot had screwed up the
participants list for some reason, and had

  "Acked-by: Alexander Duyck <alexander.h.duyck@intel.com>, David S.
Miller" <davem@davemloft.net>

as one of the participants. So there's some odd commit parsing issue
there somewhere. But Alexander seems to have seen this report despite
that, it just never went anywhere that I can tell.

                Linus

On Tue, Nov 15, 2016 at 1:20 PM, kernel test robot
<xiaolong.ye@intel.com> wrote:
>
> FYI, we noticed the following commit:
>
> https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git master
> commit 34fad54c2537f7c99d07375e50cb30aa3c23bd83 ("net: __skb_flow_dissect() must cap its return value")
>
> in testcase: pbzip2
> with following parameters:
>
>         nr_threads: 25%
>         blocksize: 900K
>         cpufreq_governor: performance
>
>
>
> on test machine: 48 threads 2 sockets Intel(R) Xeon(R) CPU E5-2697 v2 @ 2.70GHz with 64G memory
>
> caused below changes:
>
>
> +------------------------------------------------------------------+------------+------------+
> |                                                                  | 79774d6bfa | 34fad54c25 |
> +------------------------------------------------------------------+------------+------------+
> | boot_successes                                                   | 0          | 2          |
> | boot_failures                                                    | 2          | 20         |
> | invoked_oom-killer:gfp_mask=0x                                   | 2          | 2          |
> | Mem-Info                                                         | 2          | 2          |
> | Kernel_panic-not_syncing:Out_of_memory_and_no_killable_processes | 2          | 2          |
> | kernel_BUG_at_include/linux/skbuff.h                             | 0          | 16         |
> | invalid_opcode:#[##]SMP                                          | 0          | 16         |
> | RIP:eth_type_trans                                               | 0          | 16         |
> | Kernel_panic-not_syncing:Fatal_exception_in_interrupt            | 0          | 15         |
> | calltrace:hub_event                                              | 0          | 1          |
> | WARNING:at_fs/sysfs/dir.c:#sysfs_warn_dup                        | 0          | 2          |
> | calltrace:parport_pc_init                                        | 0          | 2          |
> | calltrace:SyS_finit_module                                       | 0          | 2          |
> | WARNING:at_lib/kobject.c:#kobject_add_internal                   | 0          | 2          |
> +------------------------------------------------------------------+------------+------------+
>
>
>
> [   19.375251] IPv6: ADDRCONF(NETDEV_UP): eth1: link is not ready
> [   19.388892] Sending DHCP requests .
> [   19.388892] ------------[ cut here ]------------
> [   19.388894] kernel BUG at include/linux/skbuff.h:1935!
> [   19.388895] invalid opcode: 0000 [#1] SMP
> [   19.388896] Modules linked in:
> [   19.388897] CPU: 0 PID: 0 Comm: swapper/0 Not tainted 4.9.0-rc3-00320-g34fad54 #1
> [   19.388898] Hardware name: Intel Corporation S2600WP/S2600WP, BIOS SE5C600.86B.02.02.0002.122320131210 12/23/2013
> [   19.388899] task: ffffffff81e0e4c0 task.stack: ffffffff81e00000
> [   19.388904] RIP: 0010:[<ffffffff81837c48>]  [<ffffffff81837c48>] eth_type_trans+0xe8/0x140
> [   19.388904] RSP: 0000:ffff88081e803db8  EFLAGS: 00010297
> [   19.388905] RAX: 0000000000000152 RBX: ffff88080221f200 RCX: 0000000000001073
> [   19.388905] RDX: ffff8808013afdc0 RSI: ffff880801114000 RDI: ffff880819407c00
> [   19.388906] RBP: ffff88081e803e20 R08: ffff880801114000 R09: 0000000000000800
> [   19.388907] R10: ffff8808013afec0 R11: ffffea003fd5a880 R12: ffff880819407c00
> [   19.388907] R13: ffff881033408000 R14: ffffc9000843e000 R15: 0000000000000158
> [   19.388908] FS:  0000000000000000(0000) GS:ffff88081e800000(0000) knlGS:0000000000000000
> [   19.388909] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> [   19.388910] CR2: ffff88103ffff000 CR3: 0000000001e07000 CR4: 00000000001406f0
> [   19.388910] Stack:
> [   19.388912]  ffffffff816905a7 ffffea003fd5a880 ffffea0000000008 ffff88080221f050
> [   19.388913]  ffff88080221f000 0000004000000160 ffffea003fd5a880 0000000000000000
> [   19.388915]  0000000000000040 0000000000000000 ffff88080221f050 ffff88100d216000
> [   19.388915] Call Trace:
> [   19.388919]  <IRQ>
> [   19.388919]  [<ffffffff816905a7>] ? igb_clean_rx_irq+0x6a7/0x7d0
> [   19.388921]  [<ffffffff81690a52>] igb_poll+0x382/0x700
> [   19.388922]  [<ffffffff81690a67>] ? igb_poll+0x397/0x700
> [   19.388925]  [<ffffffff8180f2d7>] net_rx_action+0x217/0x360
> [   19.388928]  [<ffffffff81957fb4>] __do_softirq+0x104/0x2ab
> [   19.388931]  [<ffffffff81086961>] irq_exit+0xf1/0x100
> [   19.388932]  [<ffffffff81957cf4>] do_IRQ+0x54/0xd0
> [   19.388935]  [<ffffffff81955b8c>] common_interrupt+0x8c/0x8c
> [   19.388938]  <EOI>
> [   19.388938]  [<ffffffff817c1d12>] ? cpuidle_enter_state+0x122/0x2e0
> [   19.388939]  [<ffffffff817c1f07>] cpuidle_enter+0x17/0x20
> [   19.388942]  [<ffffffff810c64c3>] call_cpuidle+0x23/0x40
> [   19.388944]  [<ffffffff810c66f4>] cpu_startup_entry+0x114/0x200
> [   19.388946]  [<ffffffff81947675>] rest_init+0x85/0x90
> [   19.388950]  [<ffffffff81ffbf5c>] start_kernel+0x407/0x414
> [   19.388952]  [<ffffffff81ffb120>] ? early_idt_handler_array+0x120/0x120
> [   19.388953]  [<ffffffff81ffb2d6>] x86_64_start_reservations+0x2a/0x2c
> [   19.388955]  [<ffffffff81ffb415>] x86_64_start_kernel+0x13d/0x14c
> [   19.388968] Code: 00 04 00 00 c9 c3 48 33 86 70 03 00 00 48 c1 e0 10 48 85 c0 0f b6 87 90 00 00 00 75 28 83 e0 f8 83 c8 01 88 87 90 00 00 00 eb 82 <0f> 0b 0f b6 87 90 00 00 00 83 e0 f8 83 c8 03 88 87 90 00 00 00
> [   19.388970] RIP  [<ffffffff81837c48>] eth_type_trans+0xe8/0x140
> [   19.388970]  RSP <ffff88081e803db8>
> [   19.388996] ---[ end trace 107996155a43a15c ]---
> [   19.393422] Kernel panic - not syncing: Fatal exception in interrupt
>
>
> To reproduce:
>
>         git clone git://git.kernel.org/pub/scm/linux/kernel/git/wfg/lkp-tests.git
>         cd lkp-tests
>         bin/lkp install job.yaml  # job file is attached in this email
>         bin/lkp run     job.yaml
>
>
>
> Thanks,
> Kernel Test Robot

^ permalink raw reply

* Re: [RFC net-next 0/3] net: bridge: Allow CPU port configuration
From: Jiri Pirko @ 2016-11-22 22:08 UTC (permalink / raw)
  To: Andrew Lunn
  Cc: Ido Schimmel, Florian Fainelli, netdev, davem, bridge, stephen,
	vivien.didelot, jiri, idosch
In-Reply-To: <20161122174829.GD14947@lunn.ch>

Tue, Nov 22, 2016 at 06:48:29PM CET, andrew@lunn.ch wrote:
>Hi Ido
> 
>> First of all, I want to be sure that when we say "CPU port", we're
>> talking about the same thing. In mlxsw, the CPU port is a pipe between
>> the device and the host, through which all packets trapped to the host
>> go through. So, when a packet is trapped, the driver reads its Rx
>> descriptor, checks through which port it ingressed, resolves its netdev,
>> sets skb->dev accordingly and injects it to the Rx path via
>> netif_receive_skb(). The CPU port itself isn't represented using a
>> netdev.
>
>With DSA, we have a real physical ethernet network interface for the
>'cpu' port. It connects to one of the ports of the switch. Frames on

Every port should be visible as a netdevice, including cpu port.
Would it make sense to have representors for those?

>this interface have an extra header, indicating which switch port it
>came from, and we do a similar resolving it to a slave netdev, strip
>of the header and injecting it into the receiver path via
>netif_receive_skb().
>
>	Andrew

^ permalink raw reply

* Re: [PATCH 2/2] net: qcom/emac: add support for the Qualcomm Technologies QDF2400
From: Timur Tabi @ 2016-11-22 22:14 UTC (permalink / raw)
  To: David Miller, alokc, netdev
In-Reply-To: <1479769102-27909-3-git-send-email-timur@codeaurora.org>

On 11/21/2016 04:58 PM, Timur Tabi wrote:
> The QDF2432 and the QDF2400 have slightly different internal PHYs,
> so there are some programming differences.  Some of the registers in
> the QDF2400 have moved, and some registers require different values
> during initialization.
>
> Because of the differences, the internal PHY on the QDF2400 has a new
> ACPI HID, QCOM8072.
>
> Signed-off-by: Timur Tabi<timur@codeaurora.org>

There seems to be some disagreement internally as to whether a new HID 
is the right approach.  Please hold off on applying patch [2/2] for now.

Patch [1/2] can be applied, however, if it passes review.

-- 
Qualcomm Datacenter Technologies, Inc. as an affiliate of Qualcomm
Technologies, Inc.  Qualcomm Technologies, Inc. is a member of the
Code Aurora Forum, a Linux Foundation Collaborative Project.

^ permalink raw reply

* Re: [PATCH] net: dsa: mv88e6xxx: egress all frames
From: Vivien Didelot @ 2016-11-22 22:15 UTC (permalink / raw)
  To: Andrew Lunn, Stefan Eichenberger; +Cc: Stefan Eichenberger, f.fainelli, netdev
In-Reply-To: <20161122190206.GE14947@lunn.ch>

Hi Andrew, Stefan,

Andrew Lunn <andrew@lunn.ch> writes:

> What you might find useful is
>
> https://github.com/vivien/linux.git 161b96bd7d16d21b0f046c935b70c3b2d277ccc2
>
> although it might need some changes for recent commits.
>
> With that, you can see deeper into the switches registers.

FYI, I have rebased it on top of the latest net-next (f9aa9dc7d2d0):

    https://github.com/vivien/linux.git dsa/dev

Thanks,

        Vivien

^ permalink raw reply

* Re: [net] 34fad54c25: kernel BUG at include/linux/skbuff.h:1935!
From: Eric Dumazet @ 2016-11-22 22:28 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: kernel test robot, David Miller, LKP, LKML, Alexei Starovoitov,
	Willem de Bruijn, Alexander Duyck, Network Development
In-Reply-To: <CA+55aFxV7Bq583QOdYauuo2jY9EkAmgnceBukrN27ArjzFszYg@mail.gmail.com>

On Tue, Nov 22, 2016 at 2:04 PM, Linus Torvalds
<torvalds@linux-foundation.org> wrote:
> David, Eric,
>
>  what's the situation on this issue? The bisection looks a bit odd,
> but the commit in question does end up changing the key_control->thoff
> value for the failure case, so maybe that in turn ends up screwing up
> a later skb_pull.
>
> I'm not seeing anything that might fix this in the last networking
> pull, but I may have missed something.
>
> I also noticed that the kernel test robot had screwed up the
> participants list for some reason, and had
>
>   "Acked-by: Alexander Duyck <alexander.h.duyck@intel.com>, David S.
> Miller" <davem@davemloft.net>
>
> as one of the participants. So there's some odd commit parsing issue
> there somewhere. But Alexander seems to have seen this report despite
> that, it just never went anywhere that I can tell.
>
>                 Linus
>

This is fixed by :
https://git.kernel.org/cgit/linux/kernel/git/davem/net.git/commit/?id=c9b8af1330198ae241cd545e1f040019010d44d9

Thanks

^ permalink raw reply

* Re: [net] 34fad54c25: kernel BUG at include/linux/skbuff.h:1935!
From: Andre Noll @ 2016-11-22 22:30 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: kernel test robot, David Miller, Eric Dumazet, LKP, LKML,
	Alexei Starovoitov, Willem de Bruijn, Alexander Duyck,
	Network Development
In-Reply-To: <CA+55aFxV7Bq583QOdYauuo2jY9EkAmgnceBukrN27ArjzFszYg@mail.gmail.com>

[-- Attachment #1: Type: text/plain, Size: 771 bytes --]

On Tue, Nov 22, 14:04, Linus Torvalds wrote
>  what's the situation on this issue? The bisection looks a bit odd,
> but the commit in question does end up changing the key_control->thoff
> value for the failure case, so maybe that in turn ends up screwing up
> a later skb_pull.
> 
> I'm not seeing anything that might fix this in the last networking
> pull, but I may have missed something.

I think that's the bug Eric has fixed today. See thread

	[PATCH net] flow_dissect: call init_default_flow_dissectors() earlier

David has queued up the fix and will send it your way shortly.

Andre
-- 
Max Planck Institute for Developmental Biology
Spemannstraße 35, 72076 Tübingen, Germany. Phone: (+49) 7071 601 829
http://people.tuebingen.mpg.de/maan/

[-- Attachment #2: Digital signature --]
[-- Type: application/pgp-signature, Size: 181 bytes --]

^ permalink raw reply

* Re: [RFC 02/10] IB/hfi-vnic: Virtual Network Interface Controller (VNIC) Bus driver
From: Christoph Lameter @ 2016-11-22 23:04 UTC (permalink / raw)
  To: Vishwanathapura, Niranjana
  Cc: Jason Gunthorpe, Doug Ledford, linux-rdma-u79uwXL29TY76Z2rM5mHXA,
	netdev-u79uwXL29TY76Z2rM5mHXA, Dennis Dalessandro
In-Reply-To: <20161122194918.GA69241-wPcXA7LoDC+1XWohqUldA0EOCMrvLtNR@public.gmane.org>

On Tue, 22 Nov 2016, Vishwanathapura, Niranjana wrote:

> Ok, I do understand Jason's point that we should probably not put this driver
> under drivers/infiniband/sw/.., as this driver is not a HCA.
> It is an ULP similar to ipoib, built on top of Omni-path irrespective of
> whether we register a hfi_vnic_bus or a direct custom interface with HFI1.
> This ULP will transmit and receive Omni-path packets over the fabric, and is
> dependent on IB MAD interface and the HFI1 driver.

This is something that encapsulates IP (v4 right?) in something else.
Would belong into

	linux/net/ipv4

You already have similar implementations there

See f.e. ipip.c, ip_tunnel.c and lots more (try
	ls linux/net/ipv4/*tunnel*

)

If this is more like a device then it would belong into

linux/drivers/net/hfi or so (see also linux/drivers/net/ppp, plip,
loopback, etc etc)



--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: [net] 34fad54c25: kernel BUG at include/linux/skbuff.h:1935!
From: Linus Torvalds @ 2016-11-22 23:30 UTC (permalink / raw)
  To: Eric Dumazet, Andre Noll
  Cc: kernel test robot, David Miller, LKP, LKML, Alexei Starovoitov,
	Willem de Bruijn, Alexander Duyck, Network Development
In-Reply-To: <CANn89i+B8F8thJRCdNPHA=BddTTxHCHz+xkvT_wuZ+b8osoTVQ@mail.gmail.com>

On Tue, Nov 22, 2016 at 2:28 PM, Eric Dumazet <edumazet@google.com> wrote:
>
> This is fixed by :
> https://git.kernel.org/cgit/linux/kernel/git/davem/net.git/commit/?id=c9b8af1330198ae241cd545e1f040019010d44d9

Thanks guys. This was one of the less esoteric-looking regressions, so
I'm happy to hear it's solved.

             Linus

^ permalink raw reply

* Re: [PATCH net-next] net/sched: cls_flower: verify root pointer before dereferncing it
From: John Fastabend @ 2016-11-22 23:36 UTC (permalink / raw)
  To: Daniel Borkmann, Cong Wang, Jiri Pirko
  Cc: Roi Dayan, David S. Miller, Linux Kernel Network Developers,
	Jiri Pirko, Or Gerlitz, Cong Wang
In-Reply-To: <5834AD97.1020600@iogearbox.net>

On 16-11-22 12:41 PM, Daniel Borkmann wrote:
> On 11/22/2016 08:28 PM, Cong Wang wrote:
>> On Tue, Nov 22, 2016 at 8:11 AM, Jiri Pirko <jiri@resnulli.us> wrote:
>>> Tue, Nov 22, 2016 at 05:04:11PM CET, daniel@iogearbox.net wrote:
>>>> Hmm, I don't think we want to have such an additional test in fast
>>>> path for each and every classifier. Can we think of ways to avoid that?
>>>>
>>>> My question is, since we unlink individual instances from such
>>>> tp-internal
>>>> lists through RCU and release the instance through call_rcu() as
>>>> well as
>>>> the head (tp->root) via kfree_rcu() eventually, against what are we
>>>> protecting
>>>> setting RCU_INIT_POINTER(tp->root, NULL) in ->destroy() callback?
>>>> Something
>>>> not respecting grace period?
>>>
>>> If you call tp->ops->destroy in call_rcu, you don't have to set tp->root
>>> to null.
> 
> But that's not really an answer to my question. ;)
> 
>> We do need to respect the grace period if we touch the globally visible
>> data structure tp in tcf_destroy(). Therefore Roi's patch is not
>> fixing the
>> right place.
> 
> I think there may be multiple issues actually.
> 
> At the time we go into tc_classify(), from ingress as well as egress side,
> we're under RCU, but BH variant. In cls delete()/destroy() callbacks, we
> everywhere use call_rcu() and kfree_rcu(), same as for tcf_destroy() where
> we use kfree_rcu() on tp, although we iterate tps (and implicitly inner
> filters)
> via rcu_dereference_bh() from reader side. Is there a reason why we don't
> use call_rcu_bh() variant on destruction for all this instead?

I can't think of any. If it's all under _bh we can convert the call_rcu to
call_rcu_bh; it just needs an audit.

> 
> Just looking at cls_bpf and others, what protects
> RCU_INIT_POINTER(tp->root,
> NULL) against? The tp is unlinked in tc_ctl_tfilter() from the tp chain in
> tcf_destroy() cases. Still active readers under RCU BH can race against
> this
> (tp->root being NULL), as the commit identified. Only the get() callback
> checks
> for head against NULL, but both are serialized under rtnl, and the only
> place
> we call this is tc_ctl_tfilter(). Even if we create a new tp, head
> should not
> be NULL there, if it was assigned during the init() cb, but contains an
> empty
> list. (It's different for things like cls_cgroup, though.) So, I'm
> wondering
> if the RCU_INIT_POINTER(tp->root, NULL) can just be removed instead
> (unless I'm
> missing something obvious)?


Just took a look at this; I think there are a couple of possible solutions.
The easiest is likely to fix all the call sites so that 'tp' is unlinked
before calling the destroy() handlers AND not doing the NULL set. I only
see one such call site where destroy is called before unlinking at the
moment. This should enforce that after a grace period there is no path
to reach the classifiers because 'tp' is unlinked. Calling destroy
before unlinking 'tp' however could cause a small race between grace
period of 'tp' and grace period of the filter.

Another would be to only call the destroy path from the call_rcu path
of the 'tp' object so that destroy is only ever called after the object
is guaranteed to be unlinked from the tc_filter path.

I think both solutions would be fine.

Cong, were you working on one of these? Or do you have another idea?


> 
>> Also I don't know why you blame my commit, this problem should already
>> exist prior to my commit, probably date back to John's RCU patches.
> 
> It seems so.

^ permalink raw reply

* [PATCH RFC v1] ethtool: implement helper to get flow_type value
From: Jacob Keller @ 2016-11-22 23:44 UTC (permalink / raw)
  To: netdev, Intel Wired LAN; +Cc: Jacob Keller

Often a driver wants to store the flow type and thus it must mask the
extra fields. This is a task that could grow more complex as more flags
are added in the future. Add a helper function that masks the flags for
marking additional fields.

Modify drivers in drivers/net/ethernet that currently check for FLOW_EXT
and FLOW_MAC_EXT to use the helper. Currently this is only the mellanox
drivers.

I chose not to modify other drivers as I'm actually unsure whether we
should always mask the flow type even for drivers which don't recognize
the newer flags. On the one hand, today's drivers (generally)
automatically fail when a new flag is used because they won't mask it
and their checks against flow_type will not match. On the other hand, it
means another place that you have to update when you begin implementing
a flag.

An alternative is to have the driver store a set of flags that it knows
about, and then have ethtool core do the check for us to discard frames.
I haven't implemented this quite yet.

Signed-off-by: Jacob Keller <jacob.e.keller@intel.com>
---
I plan on using this helper when fixing the mask code for ntuple filters
in the Intel i40e driver. I wanted to see whether this approach was
acceptable, and whether we should implement additional checks. The
primary reason is that today's drivers are "fail closed" in that a new
flag type will probably fail on drivers due to checking for flow types
they recognize. Since drivers only remove the masked bits they recognize
this works. However, this gets cumbersome if new additional flags get
added in the future. I would like some sort of helper, but if we
encourage its use, and a new flag gets added, the helper will then
unfortunately make the driver "fail open" in that a new flag will get
ignored as the driver won't know to return -EINVAL.

I think the right solution will be to add some sort of checks in core
ethtool which we can basically set the recognized flags in some way for
all drivers such that the ethtool core can drop requests for flows with
unknown flag types. I'm unsure how to implement this though.

Thoughts?

 drivers/net/ethernet/mellanox/mlx4/en_ethtool.c         |  4 ++--
 drivers/net/ethernet/mellanox/mlx5/core/en_fs_ethtool.c |  6 +++---
 include/uapi/linux/ethtool.h                            | 11 ++++++++---
 3 files changed, 13 insertions(+), 8 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx4/en_ethtool.c b/drivers/net/ethernet/mellanox/mlx4/en_ethtool.c
index 487a58f9c192..d8f9839ce2a3 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_ethtool.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_ethtool.c
@@ -1270,7 +1270,7 @@ static int mlx4_en_validate_flow(struct net_device *dev,
 			return -EINVAL;
 	}
 
-	switch (cmd->fs.flow_type & ~(FLOW_EXT | FLOW_MAC_EXT)) {
+	switch (ethtool_get_flow_spec_type(cmd->fs.flow_type)) {
 	case TCP_V4_FLOW:
 	case UDP_V4_FLOW:
 		if (cmd->fs.m_u.tcp_ip4_spec.tos)
@@ -1493,7 +1493,7 @@ static int mlx4_en_ethtool_to_net_trans_rule(struct net_device *dev,
 	if (err)
 		return err;
 
-	switch (cmd->fs.flow_type & ~(FLOW_EXT | FLOW_MAC_EXT)) {
+	switch (ethtool_get_flow_spec_type(cmd->fs.flow_type)) {
 	case ETHER_FLOW:
 		spec_l2 = kzalloc(sizeof(*spec_l2), GFP_KERNEL);
 		if (!spec_l2)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_fs_ethtool.c b/drivers/net/ethernet/mellanox/mlx5/core/en_fs_ethtool.c
index 3691451c728c..066e6c5cf38b 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_fs_ethtool.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_fs_ethtool.c
@@ -63,7 +63,7 @@ static struct mlx5e_ethtool_table *get_flow_table(struct mlx5e_priv *priv,
 	int table_size;
 	int prio;
 
-	switch (fs->flow_type & ~(FLOW_EXT | FLOW_MAC_EXT)) {
+	switch (ethtool_get_flow_spec_type(fs->flow_type)) {
 	case TCP_V4_FLOW:
 	case UDP_V4_FLOW:
 		max_tuples = ETHTOOL_NUM_L3_L4_FTS;
@@ -147,7 +147,7 @@ static int set_flow_attrs(u32 *match_c, u32 *match_v,
 					     outer_headers);
 	void *outer_headers_v = MLX5_ADDR_OF(fte_match_param, match_v,
 					     outer_headers);
-	u32 flow_type = fs->flow_type & ~(FLOW_EXT | FLOW_MAC_EXT);
+	u32 flow_type = ethtool_get_flow_spec_type(fs->flow_type);
 	struct ethtool_tcpip4_spec *l4_mask;
 	struct ethtool_tcpip4_spec *l4_val;
 	struct ethtool_usrip4_spec *l3_mask;
@@ -393,7 +393,7 @@ static int validate_flow(struct mlx5e_priv *priv,
 	    fs->ring_cookie != RX_CLS_FLOW_DISC)
 		return -EINVAL;
 
-	switch (fs->flow_type & ~(FLOW_EXT | FLOW_MAC_EXT)) {
+	switch (ethtool_get_flow_spec_type(fs->flow_type)) {
 	case ETHER_FLOW:
 		eth_mask = &fs->m_u.ether_spec;
 		if (!is_zero_ether_addr(eth_mask->h_dest))
diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h
index f0db7788f887..e6d7d2aea56c 100644
--- a/include/uapi/linux/ethtool.h
+++ b/include/uapi/linux/ethtool.h
@@ -880,6 +880,14 @@ struct ethtool_rx_flow_spec {
 	__u32		location;
 };
 
+/* Flag to enable additional fields in struct ethtool_rx_flow_spec */
+#define	FLOW_EXT	0x80000000
+#define	FLOW_MAC_EXT	0x40000000
+static inline __u32 ethtool_get_flow_spec_type(__u32 flow_type)
+{
+	return flow_type & ~(FLOW_EXT | FLOW_MAC_EXT); /* strip ext flags, keep base flow type */
+}
+
 /* How rings are layed out when accessing virtual functions or
  * offloaded queues is device specific. To allow users to do flow
  * steering and specify these queues the ring cookie is partitioned
@@ -1579,9 +1587,6 @@ static inline int ethtool_validate_duplex(__u8 duplex)
 #define	IPV4_FLOW	0x10	/* hash only */
 #define	IPV6_FLOW	0x11	/* hash only */
 #define	ETHER_FLOW	0x12	/* spec only (ether_spec) */
-/* Flag to enable additional fields in struct ethtool_rx_flow_spec */
-#define	FLOW_EXT	0x80000000
-#define	FLOW_MAC_EXT	0x40000000
 
 /* L3-L4 network traffic flow hash options */
 #define	RXH_L2DA	(1 << 1)
-- 
2.11.0.rc2.152.g4d04e67

^ permalink raw reply related

* Re: [RFC PATCH net-next] net: ethtool: add support for forward error correction modes
From: Casey Leedom @ 2016-11-22 23:41 UTC (permalink / raw)
  To: netdev@vger.kernel.org
In-Reply-To: <DM5PR12MB1786073763933EAE09E36978C8B40@DM5PR12MB1786.namprd12.prod.outlook.com>

  And by the way, we currently have two ethtool APIs which pump in an Auto-Negotiation indication -- set_link_ksettings() and set_pauseparam().  Now we're talking about adding a third, set_fecparam().  Are all of the calls to these three APIs supposed to agree on the concept of Auto-Negotiations?  I.e. what's it mean if set_link_ksettings() gets called with link_ksettings->base.autoneg == AUTONEG_ENABLE but set_pauseparam() gets called with epause->autoneg == AUTONEG_DISABLE?  And now adding set_fecparam() into the system with a similar ability to specify the state of Auto-Negotiation is even more confusing.

Casey

^ permalink raw reply

* [PATCH ethtool] ethtool: Fix the "advertise" parameter logic.
From: Michael Chan @ 2016-11-22 23:55 UTC (permalink / raw)
  To: linville; +Cc: netdev

From: Michael Chan <mchan@broadcom.com>

The current code ignores the value of the advertise parameter.  For example,

ethtool -s ethx advertise 0x1000

The full_advertising_wanted parameter of 0x1000 is not passed to the kernel.
The reason is that advertising_wanted is NULL in this case, and ethtool
will think that the user has given no advertisement input and so it will
proceed to pass all supported advertisement speeds to the kernel.

The older legacy ethtool with similar logic worked because
advertising_wanted was an integer and could take on -1 and 0.  It would pass
the full_advertising_wanted value if advertising_wanted == -1.

This fix is to pass all supported advertisement speeds only when both
advertising_wanted == NULL && full_advertising_wanted == NULL.

Signed-off-by: Michael Chan <michael.chan@broadcom.com>
---
 ethtool.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/ethtool.c b/ethtool.c
index 49ac94e..7715823 100644
--- a/ethtool.c
+++ b/ethtool.c
@@ -2971,7 +2971,8 @@ static int do_sset(struct cmd_context *ctx)
 				fprintf(stderr,	"\n");
 			}
 			if (autoneg_wanted == AUTONEG_ENABLE &&
-			    advertising_wanted == NULL) {
+			    advertising_wanted == NULL &&
+			    full_advertising_wanted == NULL) {
 				unsigned int i;
 
 				/* Auto negotiation enabled, but with
-- 
1.8.4.5

^ permalink raw reply related

* [PATCH net-next] mlx4: reorganize struct mlx4_en_tx_ring
From: Eric Dumazet @ 2016-11-22 23:56 UTC (permalink / raw)
  To: David Miller; +Cc: netdev, Tariq Toukan

From: Eric Dumazet <edumazet@google.com>

Goal is to reorganize this critical structure to increase performance.

ndo_start_xmit() should only dirty one cache line, and access as few
cache lines as possible.

Add sp_ (Slow Path) prefix to fields that are not used in fast path,
to make clear what is going on.

After this patch pahole reports something much better, as all
ndo_start_xmit() needed fields are packed into two cache lines instead
of seven or eight :

struct mlx4_en_tx_ring {
	u32                        last_nr_txbb;         /*     0   0x4 */
	u32                        cons;                 /*   0x4   0x4 */
	long unsigned int          wake_queue;           /*   0x8   0x8 */
	struct netdev_queue *      tx_queue;             /*  0x10   0x8 */
	u32                        (*free_tx_desc)(struct mlx4_en_priv *, struct mlx4_en_tx_ring *, int, u8, u64, int); /*  0x18   0x8 */
	struct mlx4_en_rx_ring *   recycle_ring;         /*  0x20   0x8 */

	/* XXX 24 bytes hole, try to pack */

	/* --- cacheline 1 boundary (64 bytes) --- */
	u32                        prod;                 /*  0x40   0x4 */
	unsigned int               tx_dropped;           /*  0x44   0x4 */
	long unsigned int          bytes;                /*  0x48   0x8 */
	long unsigned int          packets;              /*  0x50   0x8 */
	long unsigned int          tx_csum;              /*  0x58   0x8 */
	long unsigned int          tso_packets;          /*  0x60   0x8 */
	long unsigned int          xmit_more;            /*  0x68   0x8 */
	struct mlx4_bf             bf;                   /*  0x70  0x18 */
	/* --- cacheline 2 boundary (128 bytes) was 8 bytes ago --- */
	__be32                     doorbell_qpn;         /*  0x88   0x4 */
	__be32                     mr_key;               /*  0x8c   0x4 */
	u32                        size;                 /*  0x90   0x4 */
	u32                        size_mask;            /*  0x94   0x4 */
	u32                        full_size;            /*  0x98   0x4 */
	u32                        buf_size;             /*  0x9c   0x4 */
	void *                     buf;                  /*  0xa0   0x8 */
	struct mlx4_en_tx_info *   tx_info;              /*  0xa8   0x8 */
	int                        qpn;                  /*  0xb0   0x4 */
	u8                         queue_index;          /*  0xb4   0x1 */
	bool                       bf_enabled;           /*  0xb5   0x1 */
	bool                       bf_alloced;           /*  0xb6   0x1 */
	u8                         hwtstamp_tx_type;     /*  0xb7   0x1 */
	u8 *                       bounce_buf;           /*  0xb8   0x8 */
	/* --- cacheline 3 boundary (192 bytes) --- */
	long unsigned int          queue_stopped;        /*  0xc0   0x8 */
	struct mlx4_hwq_resources  sp_wqres;             /*  0xc8  0x58 */
	/* --- cacheline 4 boundary (256 bytes) was 32 bytes ago --- */
	struct mlx4_qp             sp_qp;                /* 0x120  0x30 */
	/* --- cacheline 5 boundary (320 bytes) was 16 bytes ago --- */
	struct mlx4_qp_context     sp_context;           /* 0x150  0xf8 */
	/* --- cacheline 9 boundary (576 bytes) was 8 bytes ago --- */
	cpumask_t                  sp_affinity_mask;     /* 0x248  0x20 */
	enum mlx4_qp_state         sp_qp_state;          /* 0x268   0x4 */
	u16                        sp_stride;            /* 0x26c   0x2 */
	u16                        sp_cqn;               /* 0x26e   0x2 */

	/* size: 640, cachelines: 10, members: 36 */
	/* sum members: 600, holes: 1, sum holes: 24 */
	/* padding: 16 */
};

Instead of this silly placement :

struct mlx4_en_tx_ring {
	u32                        last_nr_txbb;         /*     0   0x4 */
	u32                        cons;                 /*   0x4   0x4 */
	long unsigned int          wake_queue;           /*   0x8   0x8 */

	/* XXX 48 bytes hole, try to pack */

	/* --- cacheline 1 boundary (64 bytes) --- */
	u32                        prod;                 /*  0x40   0x4 */

	/* XXX 4 bytes hole, try to pack */

	long unsigned int          bytes;                /*  0x48   0x8 */
	long unsigned int          packets;              /*  0x50   0x8 */
	long unsigned int          tx_csum;              /*  0x58   0x8 */
	long unsigned int          tso_packets;          /*  0x60   0x8 */
	long unsigned int          xmit_more;            /*  0x68   0x8 */
	unsigned int               tx_dropped;           /*  0x70   0x4 */

	/* XXX 4 bytes hole, try to pack */

	struct mlx4_bf             bf;                   /*  0x78  0x18 */
	/* --- cacheline 2 boundary (128 bytes) was 16 bytes ago --- */
	long unsigned int          queue_stopped;        /*  0x90   0x8 */
	cpumask_t                  affinity_mask;        /*  0x98  0x10 */
	struct mlx4_qp             qp;                   /*  0xa8  0x30 */
	/* --- cacheline 3 boundary (192 bytes) was 24 bytes ago --- */
	struct mlx4_hwq_resources  wqres;                /*  0xd8  0x58 */
	/* --- cacheline 4 boundary (256 bytes) was 48 bytes ago --- */
	u32                        size;                 /* 0x130   0x4 */
	u32                        size_mask;            /* 0x134   0x4 */
	u16                        stride;               /* 0x138   0x2 */

	/* XXX 2 bytes hole, try to pack */

	u32                        full_size;            /* 0x13c   0x4 */
	/* --- cacheline 5 boundary (320 bytes) --- */
	u16                        cqn;                  /* 0x140   0x2 */

	/* XXX 2 bytes hole, try to pack */

	u32                        buf_size;             /* 0x144   0x4 */
	__be32                     doorbell_qpn;         /* 0x148   0x4 */
	__be32                     mr_key;               /* 0x14c   0x4 */
	void *                     buf;                  /* 0x150   0x8 */
	struct mlx4_en_tx_info *   tx_info;              /* 0x158   0x8 */
	struct mlx4_en_rx_ring *   recycle_ring;         /* 0x160   0x8 */
	u32                        (*free_tx_desc)(struct mlx4_en_priv *, struct mlx4_en_tx_ring *, int, u8, u64, int); /* 0x168   0x8 */
	u8 *                       bounce_buf;           /* 0x170   0x8 */
	struct mlx4_qp_context     context;              /* 0x178  0xf8 */
	/* --- cacheline 9 boundary (576 bytes) was 48 bytes ago --- */
	int                        qpn;                  /* 0x270   0x4 */
	enum mlx4_qp_state         qp_state;             /* 0x274   0x4 */
	u8                         queue_index;          /* 0x278   0x1 */
	bool                       bf_enabled;           /* 0x279   0x1 */
	bool                       bf_alloced;           /* 0x27a   0x1 */

	/* XXX 5 bytes hole, try to pack */

	/* --- cacheline 10 boundary (640 bytes) --- */
	struct netdev_queue *      tx_queue;             /* 0x280   0x8 */
	int                        hwtstamp_tx_type;     /* 0x288   0x4 */

	/* size: 704, cachelines: 11, members: 36 */
	/* sum members: 587, holes: 6, sum holes: 65 */
	/* padding: 52 */
};


Signed-off-by: Eric Dumazet <edumazet@google.com>
---
 drivers/net/ethernet/mellanox/mlx4/en_netdev.c |    2 
 drivers/net/ethernet/mellanox/mlx4/en_tx.c     |   48 +++++++--------
 drivers/net/ethernet/mellanox/mlx4/mlx4_en.h   |   42 +++++++------
 3 files changed, 48 insertions(+), 44 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
index 9a807e93c9fdd81e61e561208aa1480a244d0bdb..9018bb1b2e12142e048281a9d28ddf95e0023a61 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
@@ -1305,7 +1305,7 @@ static void mlx4_en_tx_timeout(struct net_device *dev)
 		if (!netif_tx_queue_stopped(netdev_get_tx_queue(dev, i)))
 			continue;
 		en_warn(priv, "TX timeout on queue: %d, QP: 0x%x, CQ: 0x%x, Cons: 0x%x, Prod: 0x%x\n",
-			i, tx_ring->qpn, tx_ring->cqn,
+			i, tx_ring->qpn, tx_ring->sp_cqn,
 			tx_ring->cons, tx_ring->prod);
 	}
 
diff --git a/drivers/net/ethernet/mellanox/mlx4/en_tx.c b/drivers/net/ethernet/mellanox/mlx4/en_tx.c
index 5de3cbe24f2bf467f9a8f7d499e131b6d2a1844c..4b597dca5c52d114344d638895275ed0d378bd96 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_tx.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_tx.c
@@ -66,7 +66,7 @@ int mlx4_en_create_tx_ring(struct mlx4_en_priv *priv,
 
 	ring->size = size;
 	ring->size_mask = size - 1;
-	ring->stride = stride;
+	ring->sp_stride = stride;
 	ring->full_size = ring->size - HEADROOM - MAX_DESC_TXBBS;
 
 	tmp = size * sizeof(struct mlx4_en_tx_info);
@@ -90,22 +90,22 @@ int mlx4_en_create_tx_ring(struct mlx4_en_priv *priv,
 			goto err_info;
 		}
 	}
-	ring->buf_size = ALIGN(size * ring->stride, MLX4_EN_PAGE_SIZE);
+	ring->buf_size = ALIGN(size * ring->sp_stride, MLX4_EN_PAGE_SIZE);
 
 	/* Allocate HW buffers on provided NUMA node */
 	set_dev_node(&mdev->dev->persist->pdev->dev, node);
-	err = mlx4_alloc_hwq_res(mdev->dev, &ring->wqres, ring->buf_size);
+	err = mlx4_alloc_hwq_res(mdev->dev, &ring->sp_wqres, ring->buf_size);
 	set_dev_node(&mdev->dev->persist->pdev->dev, mdev->dev->numa_node);
 	if (err) {
 		en_err(priv, "Failed allocating hwq resources\n");
 		goto err_bounce;
 	}
 
-	ring->buf = ring->wqres.buf.direct.buf;
+	ring->buf = ring->sp_wqres.buf.direct.buf;
 
 	en_dbg(DRV, priv, "Allocated TX ring (addr:%p) - buf:%p size:%d buf_size:%d dma:%llx\n",
 	       ring, ring->buf, ring->size, ring->buf_size,
-	       (unsigned long long) ring->wqres.buf.direct.map);
+	       (unsigned long long) ring->sp_wqres.buf.direct.map);
 
 	err = mlx4_qp_reserve_range(mdev->dev, 1, 1, &ring->qpn,
 				    MLX4_RESERVE_ETH_BF_QP);
@@ -114,12 +114,12 @@ int mlx4_en_create_tx_ring(struct mlx4_en_priv *priv,
 		goto err_hwq_res;
 	}
 
-	err = mlx4_qp_alloc(mdev->dev, ring->qpn, &ring->qp, GFP_KERNEL);
+	err = mlx4_qp_alloc(mdev->dev, ring->qpn, &ring->sp_qp, GFP_KERNEL);
 	if (err) {
 		en_err(priv, "Failed allocating qp %d\n", ring->qpn);
 		goto err_reserve;
 	}
-	ring->qp.event = mlx4_en_sqp_event;
+	ring->sp_qp.event = mlx4_en_sqp_event;
 
 	err = mlx4_bf_alloc(mdev->dev, &ring->bf, node);
 	if (err) {
@@ -141,7 +141,7 @@ int mlx4_en_create_tx_ring(struct mlx4_en_priv *priv,
 	if (queue_index < priv->num_tx_rings_p_up)
 		cpumask_set_cpu(cpumask_local_spread(queue_index,
 						     priv->mdev->dev->numa_node),
-				&ring->affinity_mask);
+				&ring->sp_affinity_mask);
 
 	*pring = ring;
 	return 0;
@@ -149,7 +149,7 @@ int mlx4_en_create_tx_ring(struct mlx4_en_priv *priv,
 err_reserve:
 	mlx4_qp_release_range(mdev->dev, ring->qpn, 1);
 err_hwq_res:
-	mlx4_free_hwq_res(mdev->dev, &ring->wqres, ring->buf_size);
+	mlx4_free_hwq_res(mdev->dev, &ring->sp_wqres, ring->buf_size);
 err_bounce:
 	kfree(ring->bounce_buf);
 	ring->bounce_buf = NULL;
@@ -171,10 +171,10 @@ void mlx4_en_destroy_tx_ring(struct mlx4_en_priv *priv,
 
 	if (ring->bf_alloced)
 		mlx4_bf_free(mdev->dev, &ring->bf);
-	mlx4_qp_remove(mdev->dev, &ring->qp);
-	mlx4_qp_free(mdev->dev, &ring->qp);
+	mlx4_qp_remove(mdev->dev, &ring->sp_qp);
+	mlx4_qp_free(mdev->dev, &ring->sp_qp);
 	mlx4_qp_release_range(priv->mdev->dev, ring->qpn, 1);
-	mlx4_free_hwq_res(mdev->dev, &ring->wqres, ring->buf_size);
+	mlx4_free_hwq_res(mdev->dev, &ring->sp_wqres, ring->buf_size);
 	kfree(ring->bounce_buf);
 	ring->bounce_buf = NULL;
 	kvfree(ring->tx_info);
@@ -190,7 +190,7 @@ int mlx4_en_activate_tx_ring(struct mlx4_en_priv *priv,
 	struct mlx4_en_dev *mdev = priv->mdev;
 	int err;
 
-	ring->cqn = cq;
+	ring->sp_cqn = cq;
 	ring->prod = 0;
 	ring->cons = 0xffffffff;
 	ring->last_nr_txbb = 1;
@@ -198,21 +198,21 @@ int mlx4_en_activate_tx_ring(struct mlx4_en_priv *priv,
 	memset(ring->buf, 0, ring->buf_size);
 	ring->free_tx_desc = mlx4_en_free_tx_desc;
 
-	ring->qp_state = MLX4_QP_STATE_RST;
-	ring->doorbell_qpn = cpu_to_be32(ring->qp.qpn << 8);
+	ring->sp_qp_state = MLX4_QP_STATE_RST;
+	ring->doorbell_qpn = cpu_to_be32(ring->sp_qp.qpn << 8);
 	ring->mr_key = cpu_to_be32(mdev->mr.key);
 
-	mlx4_en_fill_qp_context(priv, ring->size, ring->stride, 1, 0, ring->qpn,
-				ring->cqn, user_prio, &ring->context);
+	mlx4_en_fill_qp_context(priv, ring->size, ring->sp_stride, 1, 0, ring->qpn,
+				ring->sp_cqn, user_prio, &ring->sp_context);
 	if (ring->bf_alloced)
-		ring->context.usr_page =
+		ring->sp_context.usr_page =
 			cpu_to_be32(mlx4_to_hw_uar_index(mdev->dev,
 							 ring->bf.uar->index));
 
-	err = mlx4_qp_to_ready(mdev->dev, &ring->wqres.mtt, &ring->context,
-			       &ring->qp, &ring->qp_state);
-	if (!cpumask_empty(&ring->affinity_mask))
-		netif_set_xps_queue(priv->dev, &ring->affinity_mask,
+	err = mlx4_qp_to_ready(mdev->dev, &ring->sp_wqres.mtt, &ring->sp_context,
+			       &ring->sp_qp, &ring->sp_qp_state);
+	if (!cpumask_empty(&ring->sp_affinity_mask))
+		netif_set_xps_queue(priv->dev, &ring->sp_affinity_mask,
 				    ring->queue_index);
 
 	return err;
@@ -223,8 +223,8 @@ void mlx4_en_deactivate_tx_ring(struct mlx4_en_priv *priv,
 {
 	struct mlx4_en_dev *mdev = priv->mdev;
 
-	mlx4_qp_modify(mdev->dev, NULL, ring->qp_state,
-		       MLX4_QP_STATE_RST, NULL, 0, 0, &ring->qp);
+	mlx4_qp_modify(mdev->dev, NULL, ring->sp_qp_state,
+		       MLX4_QP_STATE_RST, NULL, 0, 0, &ring->sp_qp);
 }
 
 static inline bool mlx4_en_is_tx_ring_full(struct mlx4_en_tx_ring *ring)
diff --git a/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h b/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h
index eff21651b67308a17fe3d60d236cd0b6800a3fd2..574bcbb1b38fc4758511d8f7bd17a87b0a507a73 100644
--- a/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h
+++ b/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h
@@ -281,46 +281,50 @@ struct mlx4_en_tx_ring {
 	u32			last_nr_txbb;
 	u32			cons;
 	unsigned long		wake_queue;
+	struct netdev_queue	*tx_queue;
+	u32			(*free_tx_desc)(struct mlx4_en_priv *priv,
+						struct mlx4_en_tx_ring *ring,
+						int index, u8 owner,
+						u64 timestamp, int napi_mode);
+	struct mlx4_en_rx_ring	*recycle_ring;
 
 	/* cache line used and dirtied in mlx4_en_xmit() */
 	u32			prod ____cacheline_aligned_in_smp;
+	unsigned int		tx_dropped;
 	unsigned long		bytes;
 	unsigned long		packets;
 	unsigned long		tx_csum;
 	unsigned long		tso_packets;
 	unsigned long		xmit_more;
-	unsigned int		tx_dropped;
 	struct mlx4_bf		bf;
-	unsigned long		queue_stopped;
 
 	/* Following part should be mostly read */
-	cpumask_t		affinity_mask;
-	struct mlx4_qp		qp;
-	struct mlx4_hwq_resources wqres;
+	__be32			doorbell_qpn;
+	__be32			mr_key;
 	u32			size; /* number of TXBBs */
 	u32			size_mask;
-	u16			stride;
 	u32			full_size;
-	u16			cqn;	/* index of port CQ associated with this ring */
 	u32			buf_size;
-	__be32			doorbell_qpn;
-	__be32			mr_key;
 	void			*buf;
 	struct mlx4_en_tx_info	*tx_info;
-	struct mlx4_en_rx_ring	*recycle_ring;
-	u32			(*free_tx_desc)(struct mlx4_en_priv *priv,
-						struct mlx4_en_tx_ring *ring,
-						int index, u8 owner,
-						u64 timestamp, int napi_mode);
-	u8			*bounce_buf;
-	struct mlx4_qp_context	context;
 	int			qpn;
-	enum mlx4_qp_state	qp_state;
 	u8			queue_index;
 	bool			bf_enabled;
 	bool			bf_alloced;
-	struct netdev_queue	*tx_queue;
-	int			hwtstamp_tx_type;
+	u8			hwtstamp_tx_type;
+	u8			*bounce_buf;
+
+	/* Not used in fast path
+	 * Only queue_stopped might be used if BQL is not properly working.
+	 */
+	unsigned long		queue_stopped;
+	struct mlx4_hwq_resources sp_wqres;
+	struct mlx4_qp		sp_qp;
+	struct mlx4_qp_context	sp_context;
+	cpumask_t		sp_affinity_mask;
+	enum mlx4_qp_state	sp_qp_state;
+	u16			sp_stride;
+	u16			sp_cqn;	/* index of port CQ associated with this ring */
 } ____cacheline_aligned_in_smp;
 
 struct mlx4_en_rx_desc {

^ permalink raw reply related

* Re: [RFC 02/10] IB/hfi-vnic: Virtual Network Interface Controller (VNIC) Bus driver
From: ira.weiny @ 2016-11-23  0:05 UTC (permalink / raw)
  To: Jason Gunthorpe
  Cc: Vishwanathapura, Niranjana, Doug Ledford,
	linux-rdma-u79uwXL29TY76Z2rM5mHXA, netdev-u79uwXL29TY76Z2rM5mHXA,
	Dennis Dalessandro
In-Reply-To: <20161122170407.GE3956-ePGOBjL8dl3ta4EC/59zMFaTQe2KTcn/@public.gmane.org>

On Tue, Nov 22, 2016 at 10:04:07AM -0700, Jason Gunthorpe wrote:
> On Mon, Nov 21, 2016 at 05:53:04PM -0800, Vishwanathapura, Niranjana wrote:
> > There are many example drivers in kernel which are using bus_register() in
> > an initcall.
> 
> There really are not, certainly not in major subsystems.

I see 2 drivers in the Block subsystem which do this:


19   5354  /nfs/site/home/iweiny/linux-stable/drivers/block/cciss.c <<cciss_init>>
	err = bus_register(&cciss_bus_type);
20   6447 /nfs/site/home/iweiny/linux-stable/drivers/block/rbd.c <<rbd_sysfs_init>>
	ret = bus_register(&rbd_bus_type);

2 drivers in the drm subsystem which do this:


29   1155  /nfs/site/home/iweiny/linux-stable/drivers/gpu/drm/drm_mipi_dsi.c <<mipi_dsi_bus_init>>
	return bus_register(&mipi_dsi_bus_type);
30    242 /nfs/site/home/iweiny/linux-stable/drivers/gpu/host1x/dev.c <<tegra_host1x_init>>
	err = bus_register(&host1x_bus_type);

And I think there are a couple others.

I'm not sure what these devices/buses do but they are registering their own bus
while being in another major subsystem.  Is what we are doing really so
crazy/wrong?


>
> > We could add a custom Interface between HFI1 driver and hfi_vnic drivers
> > without involving a bus.
> 
> hfi is already registering on the infiniband class, just use that.
> 

I don't understand what you mean here?

The bus_register provides a really clean way for the hfi1 driver and hfi_vnic
driver to find each other.  This includes being able to support hfi1 with or
without hfi_vnic being loaded.  Note that without configuration from the "EM"
Ethernet Manager the hfi_vnic does not export a net device.

Why wouldn't we use this core kernel support?[*]

> > But using the existing bus model gave a lot of in-built flexibility in
> > decoupling devices from the drivers.
> 
> If you want to have your own bus then you need your own hfi
> subsystem. drivers/infiniband is not a dumping ground..
> 

We don't consider drivers/infiniband a "dumping ground".  There is a
requirement on ib_mad from the hfi_vnic driver.

Ira

[*] As an aside why does the ib_core not use this methodology?  It dawned on
me that this may be a better way to fix our module load problems.  However, I
have not looked into details.

> Jason
> --
> To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
> the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: [RFC 02/10] IB/hfi-vnic: Virtual Network Interface Controller (VNIC) Bus driver
From: Andrew Lunn @ 2016-11-23  0:06 UTC (permalink / raw)
  To: Vishwanathapura, Niranjana
  Cc: Jason Gunthorpe, Doug Ledford, linux-rdma, netdev,
	Dennis Dalessandro
In-Reply-To: <20161122194918.GA69241@knc-06.sc.intel.com>

On Tue, Nov 22, 2016 at 11:49:18AM -0800, Vishwanathapura, Niranjana wrote:
> Ok, I do understand Jason's point that we should probably not put
> this driver under drivers/infiniband/sw/.., as this driver is not a
> HCA.
> It is an ULP similar to ipoib, built on top of Omni-path
> irrespective of whether we register a hfi_vnic_bus or a direct
> custom interface with HFI1.
> This ULP will transmit and receive Omni-path packets over the
> fabric, and is dependent on IB MAD interface and the HFI1 driver.
> 
> Doug,
> Will it be acceptable if we put it under 'drivers/infiniband/ulp/hfi_vnic'?

How about turning this whole discussion around. 

This is a network driver. So ask the network maintainer where he
wants it. Send the patch to David Miller <davem@davemloft.net> and
netdev with the question, where does this code belong?

	Andrew

^ permalink raw reply

* Re: [PATCH net-next 1/1] ipv6: sr: add option to control lwtunnel support
From: Alexei Starovoitov @ 2016-11-23  0:16 UTC (permalink / raw)
  To: David Miller
  Cc: david.lebrun, netdev@vger.kernel.org, Lorenzo Colitti,
	Roopa Prabhu, Eric Dumazet

On Wed, Nov 16, 2016 at 8:32 AM, David Miller <davem@davemloft.net> wrote:
> From: David Lebrun <david.lebrun@uclouvain.be>
> Date: Tue, 15 Nov 2016 16:14:04 +0100
>
>> This patch adds a new option CONFIG_IPV6_SEG6_LWTUNNEL to enable/disable
>> support of encapsulation with the lightweight tunnels. When this option
>> is enabled, CONFIG_LWTUNNEL is automatically selected.
>>
>> Fix commit 6c8702c60b88 ("ipv6: sr: add support for SRH encapsulation and injection with lwtunnels")
>>
>> Without a proper option to control lwtunnel support for SR-IPv6, if
>> CONFIG_LWTUNNEL=n then the IPv6 initialization fails as a consequence
>> of seg6_iptunnel_init() failure with EOPNOTSUPP:
>>
>> NET: Registered protocol family 10
>> IPv6: Attempt to unregister permanent protocol 6
>> IPv6: Attempt to unregister permanent protocol 136
>> IPv6: Attempt to unregister permanent protocol 17
>> NET: Unregistered protocol family 10
>>
>> Tested (compiling, booting, and loading ipv6 module when relevant)
>> with possible combinations of CONFIG_IPV6={y,m,n},
>> CONFIG_IPV6_SEG6_LWTUNNEL={y,n} and CONFIG_LWTUNNEL={y,n}.
>>
>> Reported-by: Lorenzo Colitti <lorenzo@google.com>
>> Suggested-by: Roopa Prabhu <roopa@cumulusnetworks.com>
>> Signed-off-by: David Lebrun <david.lebrun@uclouvain.be>
>
> Applied.

ipv6 seems to be still broken in the latest net-next
when CONFIG_LWTUNNEL is not set:
# ping 127.0.0.1
ping: socket: Address family not supported by protocol
# ping -4 127.0.0.1
PING localhost.localdomain (127.0.0.1) 56(84) bytes of data.
64 bytes from localhost.localdomain (127.0.0.1): icmp_seq=1 ttl=64 time=0.067 ms

it works with CONFIG_LWTUNNEL=y

Roopa, David, please take a look.

Thanks!

^ permalink raw reply

* Re: [RFC net-next 0/3] net: bridge: Allow CPU port configuration
From: Florian Fainelli @ 2016-11-23  0:24 UTC (permalink / raw)
  To: Jiri Pirko, Andrew Lunn
  Cc: idosch, vivien.didelot, netdev, bridge, Ido Schimmel, jiri, davem
In-Reply-To: <20161122220859.GF1819@nanopsycho>

On 11/22/2016 02:08 PM, Jiri Pirko wrote:
> Tue, Nov 22, 2016 at 06:48:29PM CET, andrew@lunn.ch wrote:
>> Hi Ido
>>
>>> First of all, I want to be sure that when we say "CPU port", we're
>>> talking about the same thing. In mlxsw, the CPU port is a pipe between
>>> the device and the host, through which all packets trapped to the host
>>> go through. So, when a packet is trapped, the driver reads its Rx
>>> descriptor, checks through which port it ingressed, resolves its netdev,
>>> sets skb->dev accordingly and injects it to the Rx path via
>>> netif_receive_skb(). The CPU port itself isn't represented using a
>>> netdev.
>>
>> With DSA, we have a real physical ethernet network interface for the
>> 'cpu' port. It connects to one of the ports of the switch. Frames on
> 
> Every port should be visible as a netdevice, including cpu port.
> Would it make sense to have representors for those?

The CPU port is kind of already visible with DSA since you need the
switch to be attached to a normal Ethernet MAC driver (later referenced
as eth0 for simplicity). Since eth0 is going to potentially receive/send
switch tagged traffic, and the model is to terminate the interfaces at
the port level, this interface does not really have any meaningful use
from a data exchange perspective, apart from multiplexing/demultiplexing
switch tags (when enabled).

If we did create a "cpu" network device, this interface would not be
able to send/receive traffic either, because the per-port network
interfaces are terminated at their level, and the conduit interface is
just used for transmitting/receiving switch tagged traffic. It does have
value as a controlling interface only though.

As a controlling interface, this can be helpful, but we need to decide
which side of the switch this CPU interface would represent: is it the
switch's view of the CPU port, or is it the Ethernet MAC's view of the
switch's CPU port attached to it (especially true with discrete switch
chips)?

If we did use eth0 as a controlling interface, we need to somehow be
able to overload (in an object-oriented fashion) the netdev_ops,
ethtool_ops and switchdev_ops for that interface so as to make it
participate in the switch configuration (we actually do this already for
ethtool statistics, but this is ugly).
-- 
Florian

^ permalink raw reply

* Re: [RFC 02/10] IB/hfi-vnic: Virtual Network Interface Controller (VNIC) Bus driver
From: Jason Gunthorpe @ 2016-11-23  0:49 UTC (permalink / raw)
  To: ira.weiny
  Cc: Vishwanathapura, Niranjana, Doug Ledford, linux-rdma, netdev,
	Dennis Dalessandro
In-Reply-To: <20161123000502.GA27968@phlsvsds.ph.intel.com>

On Tue, Nov 22, 2016 at 07:05:05PM -0500, ira.weiny wrote:
> On Tue, Nov 22, 2016 at 10:04:07AM -0700, Jason Gunthorpe wrote:
> > On Mon, Nov 21, 2016 at 05:53:04PM -0800, Vishwanathapura, Niranjana wrote:
> > > There are many example drivers in kernel which are using bus_register() in
> > > an initcall.
> > 
> > There really are not, certainly not in major subsystems.
> 
> I see 2 drivers in the Block subsystem which do this:
> 
> 
> 19   5354  /nfs/site/home/iweiny/linux-stable/drivers/block/cciss.c <<cciss_init>>
> 	err = bus_register(&cciss_bus_type);
> 20   6447 /nfs/site/home/iweiny/linux-stable/drivers/block/rbd.c <<rbd_sysfs_init>>
> 	ret = bus_register(&rbd_bus_type);
> 
> 2 drivers in the drm subsystem which do this:
> 
> 
> 29   1155  /nfs/site/home/iweiny/linux-stable/drivers/gpu/drm/drm_mipi_dsi.c <<mipi_dsi_bus_init>>
> 	return bus_register(&mipi_dsi_bus_type);
> 30    242 /nfs/site/home/iweiny/linux-stable/drivers/gpu/host1x/dev.c <<tegra_host1x_init>>
> 	err = bus_register(&host1x_bus_type);

IMHO this is all obscure or legacy stuff (eg cciss lost its bus when
it was reworked into hpsa). Who knows about that SOC stuff, maybe
there really is a special on-chip bus under those drivers.

The point is using a bus as a generic interconnect between two driver
modules seems very rare, and is not what we have historically ever
done in drivers/infiniband - all our split drivers use a trivial
register scheme. eg see cxgb4_register_uld/mlx4_register_interface/etc.

Should a multi-function driver use a bus or class to connect its
parts? Who knows. Maybe Greg KH/etc has an opinion. But that is not
what we have been doing, it doesn't seem very simplifying, and
this series doesn't even make module auto-loading work...

Since doing this creates a bunch of uapis (again, from a driver, ugh) it
seems like a bad idea without more support as 'the right way'

.. and yes, it would be nice to have a lightweight mechanism to
replace those register functions that could handle module auto loading
too, and maybe that is a 'multi part driver bus/class' or somesuch
... This is really a topic for the device core maintainers, IMHO.

> > > We could add a custom Interface between HFI1 driver and hfi_vnic drivers
> > > without involving a bus.
> > 
> > hfi is already registering on the infiniband class, just use that.
> 
> I don't understand what you mean here?

Get the struct ib_device for the hfi and then do something to get hfi
specific function calls.

Or work it backwards with a _register function..

> [*] As an aside why does the ib_core not use this methodology?  It dawned on
> me that this may be a better way to fix our module load problems.  However, I
> have not looked into details.

ib_core is a class, which is appropriate. RDMA devices are not busses.

Jason

^ permalink raw reply

* [PATCH net-next 2/2] samples/bpf: fix bpf loader
From: Alexei Starovoitov @ 2016-11-23  0:52 UTC (permalink / raw)
  To: David S . Miller; +Cc: Daniel Borkmann, netdev
In-Reply-To: <1479862329-2361912-1-git-send-email-ast@fb.com>

llvm can emit relocations into sections other than program code
(like debug info sections). Ignore them when parsing the ELF file.

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 samples/bpf/bpf_load.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/samples/bpf/bpf_load.c b/samples/bpf/bpf_load.c
index 97913e109b14..62f54d6eb8bf 100644
--- a/samples/bpf/bpf_load.c
+++ b/samples/bpf/bpf_load.c
@@ -317,6 +317,10 @@ int load_bpf_file(char *path)
 				    &shdr_prog, &data_prog))
 				continue;
 
+			if (shdr_prog.sh_type != SHT_PROGBITS ||
+			    !(shdr_prog.sh_flags & SHF_EXECINSTR))
+				continue;
+
 			insns = (struct bpf_insn *) data_prog->d_buf;
 
 			processed_sec[shdr.sh_info] = true;
-- 
2.8.0

^ permalink raw reply related

* [PATCH net-next 1/2] samples/bpf: fix sockex2 example
From: Alexei Starovoitov @ 2016-11-23  0:52 UTC (permalink / raw)
  To: David S . Miller; +Cc: Daniel Borkmann, netdev

since llvm commit "Do not expand UNDEF SDNode during insn selection lowering"
llvm will generate code that uses uninitialized registers for cases
where C code actually uses uninitialized data.
So this sockex2 example is technically broken.
Fix it by fully initializing the on-stack variable.
Also increase verifier buffer limit, since verifier output
may not fit in 64k for this sockex2 code depending on llvm version.

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 samples/bpf/libbpf.h       | 2 +-
 samples/bpf/sockex2_kern.c | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/samples/bpf/libbpf.h b/samples/bpf/libbpf.h
index ac6edb61b64a..de96a935068d 100644
--- a/samples/bpf/libbpf.h
+++ b/samples/bpf/libbpf.h
@@ -18,7 +18,7 @@ int bpf_prog_load(enum bpf_prog_type prog_type,
 int bpf_obj_pin(int fd, const char *pathname);
 int bpf_obj_get(const char *pathname);
 
-#define LOG_BUF_SIZE 65536
+#define LOG_BUF_SIZE (256 * 1024)
 extern char bpf_log_buf[LOG_BUF_SIZE];
 
 /* ALU ops on registers, bpf_add|sub|...: dst_reg += src_reg */
diff --git a/samples/bpf/sockex2_kern.c b/samples/bpf/sockex2_kern.c
index 44e5846c988f..f58acfc92556 100644
--- a/samples/bpf/sockex2_kern.c
+++ b/samples/bpf/sockex2_kern.c
@@ -198,7 +198,7 @@ struct bpf_map_def SEC("maps") hash_map = {
 SEC("socket2")
 int bpf_prog2(struct __sk_buff *skb)
 {
-	struct bpf_flow_keys flow;
+	struct bpf_flow_keys flow = {};
 	struct pair *value;
 	u32 key;
 
-- 
2.8.0

^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox