Netdev List

Netdev List
 help / color / mirror / Atom feed

* [PATCH net 6/9] net/mlx5e: Use vport MTU rather than physical port MTU
From: Saeed Mahameed @ 2016-04-19 12:33 UTC (permalink / raw)
  To: David S. Miller
  Cc: netdev, Or Gerlitz, Tal Alon, Eran Ben Elisha, Saeed Mahameed
In-Reply-To: <1461069222-27076-1-git-send-email-saeedm@mellanox.com>

Set and report vport MTU rather than physical MTU,
Driver will set both vport and physical port mtu and will
rely on the query of vport mtu.

SRIOV VFs have to report their MTU to their vport manager (PF),
and this will allow them to work with any MTU they need
without failing the request.

Also for some cases where the PF is not a port owner, PF can
work with MTU less than the physical port mtu if set physical
port mtu didn't take effect.

Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c |   44 ++++++++++++++++----
 drivers/net/ethernet/mellanox/mlx5/core/vport.c   |   40 +++++++++++++++++++
 include/linux/mlx5/vport.h                        |    2 +
 3 files changed, 77 insertions(+), 9 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index 93e4ef4..85773f8 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -1404,24 +1404,50 @@ static int mlx5e_refresh_tirs_self_loopback_enable(struct mlx5e_priv *priv)
 	return 0;
 }
 
-static int mlx5e_set_dev_port_mtu(struct net_device *netdev)
+static int mlx5e_set_mtu(struct mlx5e_priv *priv, u16 mtu)
 {
-	struct mlx5e_priv *priv = netdev_priv(netdev);
 	struct mlx5_core_dev *mdev = priv->mdev;
-	u16 hw_mtu;
+	u16 hw_mtu = MLX5E_SW2HW_MTU(mtu);
 	int err;
 
-	err = mlx5_set_port_mtu(mdev, MLX5E_SW2HW_MTU(netdev->mtu), 1);
+	err = mlx5_set_port_mtu(mdev, hw_mtu, 1);
 	if (err)
 		return err;
 
-	mlx5_query_port_oper_mtu(mdev, &hw_mtu, 1);
+	/* Update vport context MTU */
+	mlx5_modify_nic_vport_mtu(mdev, hw_mtu);
+	return 0;
+}
+
+static void mlx5e_query_mtu(struct mlx5e_priv *priv, u16 *mtu)
+{
+	struct mlx5_core_dev *mdev = priv->mdev;
+	u16 hw_mtu = 0;
+	int err;
+
+	err = mlx5_query_nic_vport_mtu(mdev, &hw_mtu);
+	if (err || !hw_mtu) /* fallback to port oper mtu */
+		mlx5_query_port_oper_mtu(mdev, &hw_mtu, 1);
+
+	*mtu = MLX5E_HW2SW_MTU(hw_mtu);
+}
+
+static int mlx5e_set_dev_port_mtu(struct net_device *netdev)
+{
+	struct mlx5e_priv *priv = netdev_priv(netdev);
+	u16 mtu;
+	int err;
+
+	err = mlx5e_set_mtu(priv, netdev->mtu);
+	if (err)
+		return err;
 
-	if (MLX5E_HW2SW_MTU(hw_mtu) != netdev->mtu)
-		netdev_warn(netdev, "%s: Port MTU %d is different than netdev mtu %d\n",
-			    __func__, MLX5E_HW2SW_MTU(hw_mtu), netdev->mtu);
+	mlx5e_query_mtu(priv, &mtu);
+	if (mtu != netdev->mtu)
+		netdev_warn(netdev, "%s: VPort MTU %d is different than netdev mtu %d\n",
+			    __func__, mtu, netdev->mtu);
 
-	netdev->mtu = MLX5E_HW2SW_MTU(hw_mtu);
+	netdev->mtu = mtu;
 	return 0;
 }
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/vport.c b/drivers/net/ethernet/mellanox/mlx5/core/vport.c
index bd51840..b69dadc 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/vport.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/vport.c
@@ -196,6 +196,46 @@ int mlx5_modify_nic_vport_mac_address(struct mlx5_core_dev *mdev,
 }
 EXPORT_SYMBOL_GPL(mlx5_modify_nic_vport_mac_address);
 
+int mlx5_query_nic_vport_mtu(struct mlx5_core_dev *mdev, u16 *mtu)
+{
+	int outlen = MLX5_ST_SZ_BYTES(query_nic_vport_context_out);
+	u32 *out;
+	int err;
+
+	out = mlx5_vzalloc(outlen);
+	if (!out)
+		return -ENOMEM;
+
+	err = mlx5_query_nic_vport_context(mdev, 0, out, outlen);
+	if (!err)
+		*mtu = MLX5_GET(query_nic_vport_context_out, out,
+				nic_vport_context.mtu);
+
+	kvfree(out);
+	return err;
+}
+EXPORT_SYMBOL_GPL(mlx5_query_nic_vport_mtu);
+
+int mlx5_modify_nic_vport_mtu(struct mlx5_core_dev *mdev, u16 mtu)
+{
+	int inlen = MLX5_ST_SZ_BYTES(modify_nic_vport_context_in);
+	void *in;
+	int err;
+
+	in = mlx5_vzalloc(inlen);
+	if (!in)
+		return -ENOMEM;
+
+	MLX5_SET(modify_nic_vport_context_in, in, field_select.mtu, 1);
+	MLX5_SET(modify_nic_vport_context_in, in, nic_vport_context.mtu, mtu);
+
+	err = mlx5_modify_nic_vport_context(mdev, in, inlen);
+
+	kvfree(in);
+	return err;
+}
+EXPORT_SYMBOL_GPL(mlx5_modify_nic_vport_mtu);
+
 int mlx5_query_nic_vport_mac_list(struct mlx5_core_dev *dev,
 				  u32 vport,
 				  enum mlx5_list_type list_type,
diff --git a/include/linux/mlx5/vport.h b/include/linux/mlx5/vport.h
index bd93e63..301da4a 100644
--- a/include/linux/mlx5/vport.h
+++ b/include/linux/mlx5/vport.h
@@ -45,6 +45,8 @@ int mlx5_query_nic_vport_mac_address(struct mlx5_core_dev *mdev,
 				     u16 vport, u8 *addr);
 int mlx5_modify_nic_vport_mac_address(struct mlx5_core_dev *dev,
 				      u16 vport, u8 *addr);
+int mlx5_query_nic_vport_mtu(struct mlx5_core_dev *mdev, u16 *mtu);
+int mlx5_modify_nic_vport_mtu(struct mlx5_core_dev *mdev, u16 mtu);
 int mlx5_query_nic_vport_system_image_guid(struct mlx5_core_dev *mdev,
 					   u64 *system_image_guid);
 int mlx5_query_nic_vport_node_guid(struct mlx5_core_dev *mdev, u64 *node_guid);
-- 
1.7.1

^ permalink raw reply related

* [PATCH net 8/9] net/mlx5: Add pci shutdown callback
From: Saeed Mahameed @ 2016-04-19 12:33 UTC (permalink / raw)
  To: David S. Miller
  Cc: netdev, Or Gerlitz, Tal Alon, Eran Ben Elisha, Majd Dibbiny,
	Tariq Toukan, Haggai Abramovsky, Saeed Mahameed
In-Reply-To: <1461069222-27076-1-git-send-email-saeedm@mellanox.com>

From: Majd Dibbiny <majd@mellanox.com>

This patch introduces kexec support for mlx5.
When switching kernels, kexec() calls shutdown, which unloads
the driver and cleans its resources.

In addition, remove unregister netdev from shutdown flow. This will
allow a clean shutdown, even if some netdev clients did not release their
reference from this netdev. Releasing The HW resources only is enough as
the kernel is shutting down

Signed-off-by: Majd Dibbiny <majd@mellanox.com>
Signed-off-by: Tariq Toukan <tariqt@mellanox.com>
Signed-off-by: Haggai Abramovsky <hagaya@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c |   15 +++++++++++-
 drivers/net/ethernet/mellanox/mlx5/core/main.c    |   23 +++++++++++++++++---
 include/linux/mlx5/driver.h                       |    7 +++--
 3 files changed, 36 insertions(+), 9 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index 85773f8..67d548b 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -2633,7 +2633,16 @@ static void mlx5e_destroy_netdev(struct mlx5_core_dev *mdev, void *vpriv)
 	schedule_work(&priv->set_rx_mode_work);
 	mlx5e_disable_async_events(priv);
 	flush_scheduled_work();
-	unregister_netdev(netdev);
+	if (test_bit(MLX5_INTERFACE_STATE_SHUTDOWN, &mdev->intf_state)) {
+		netif_device_detach(netdev);
+		mutex_lock(&priv->state_lock);
+		if (test_bit(MLX5E_STATE_OPENED, &priv->state))
+			mlx5e_close_locked(netdev);
+		mutex_unlock(&priv->state_lock);
+	} else {
+		unregister_netdev(netdev);
+	}
+
 	mlx5e_tc_cleanup(priv);
 	mlx5e_vxlan_cleanup(priv);
 	mlx5e_destroy_flow_tables(priv);
@@ -2646,7 +2655,9 @@ static void mlx5e_destroy_netdev(struct mlx5_core_dev *mdev, void *vpriv)
 	mlx5_core_dealloc_transport_domain(priv->mdev, priv->tdn);
 	mlx5_core_dealloc_pd(priv->mdev, priv->pdn);
 	mlx5_unmap_free_uar(priv->mdev, &priv->cq_uar);
-	free_netdev(netdev);
+
+	if (!test_bit(MLX5_INTERFACE_STATE_SHUTDOWN, &mdev->intf_state))
+		free_netdev(netdev);
 }
 
 static void *mlx5e_get_netdev(void *vpriv)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c
index ddd352a..6892746 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c
@@ -966,7 +966,7 @@ static int mlx5_load_one(struct mlx5_core_dev *dev, struct mlx5_priv *priv)
 	int err;
 
 	mutex_lock(&dev->intf_state_mutex);
-	if (dev->interface_state == MLX5_INTERFACE_STATE_UP) {
+	if (test_bit(MLX5_INTERFACE_STATE_UP, &dev->intf_state)) {
 		dev_warn(&dev->pdev->dev, "%s: interface is up, NOP\n",
 			 __func__);
 		goto out;
@@ -1133,7 +1133,8 @@ static int mlx5_load_one(struct mlx5_core_dev *dev, struct mlx5_priv *priv)
 	if (err)
 		pr_info("failed request module on %s\n", MLX5_IB_MOD);
 
-	dev->interface_state = MLX5_INTERFACE_STATE_UP;
+	clear_bit(MLX5_INTERFACE_STATE_DOWN, &dev->intf_state);
+	set_bit(MLX5_INTERFACE_STATE_UP, &dev->intf_state);
 out:
 	mutex_unlock(&dev->intf_state_mutex);
 
@@ -1207,7 +1208,7 @@ static int mlx5_unload_one(struct mlx5_core_dev *dev, struct mlx5_priv *priv)
 	}
 
 	mutex_lock(&dev->intf_state_mutex);
-	if (dev->interface_state == MLX5_INTERFACE_STATE_DOWN) {
+	if (test_bit(MLX5_INTERFACE_STATE_DOWN, &dev->intf_state)) {
 		dev_warn(&dev->pdev->dev, "%s: interface is down, NOP\n",
 			 __func__);
 		goto out;
@@ -1241,7 +1242,8 @@ static int mlx5_unload_one(struct mlx5_core_dev *dev, struct mlx5_priv *priv)
 	mlx5_cmd_cleanup(dev);
 
 out:
-	dev->interface_state = MLX5_INTERFACE_STATE_DOWN;
+	clear_bit(MLX5_INTERFACE_STATE_UP, &dev->intf_state);
+	set_bit(MLX5_INTERFACE_STATE_DOWN, &dev->intf_state);
 	mutex_unlock(&dev->intf_state_mutex);
 	return err;
 }
@@ -1452,6 +1454,18 @@ static const struct pci_error_handlers mlx5_err_handler = {
 	.resume		= mlx5_pci_resume
 };
 
+static void shutdown(struct pci_dev *pdev)
+{
+	struct mlx5_core_dev *dev  = pci_get_drvdata(pdev);
+	struct mlx5_priv *priv = &dev->priv;
+
+	dev_info(&pdev->dev, "Shutdown was called\n");
+	/* Notify mlx5 clients that the kernel is being shut down */
+	set_bit(MLX5_INTERFACE_STATE_SHUTDOWN, &dev->intf_state);
+	mlx5_unload_one(dev, priv);
+	mlx5_pci_disable_device(dev);
+}
+
 static const struct pci_device_id mlx5_core_pci_table[] = {
 	{ PCI_VDEVICE(MELLANOX, 0x1011) },			/* Connect-IB */
 	{ PCI_VDEVICE(MELLANOX, 0x1012), MLX5_PCI_DEV_IS_VF},	/* Connect-IB VF */
@@ -1471,6 +1485,7 @@ static struct pci_driver mlx5_core_driver = {
 	.id_table       = mlx5_core_pci_table,
 	.probe          = init_one,
 	.remove         = remove_one,
+	.shutdown	= shutdown,
 	.err_handler	= &mlx5_err_handler,
 	.sriov_configure   = mlx5_core_sriov_configure,
 };
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index dcd5ac8..369c837 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -519,8 +519,9 @@ enum mlx5_device_state {
 };
 
 enum mlx5_interface_state {
-	MLX5_INTERFACE_STATE_DOWN,
-	MLX5_INTERFACE_STATE_UP,
+	MLX5_INTERFACE_STATE_DOWN = BIT(0),
+	MLX5_INTERFACE_STATE_UP = BIT(1),
+	MLX5_INTERFACE_STATE_SHUTDOWN = BIT(2),
 };
 
 enum mlx5_pci_status {
@@ -544,7 +545,7 @@ struct mlx5_core_dev {
 	enum mlx5_device_state	state;
 	/* sync interface state */
 	struct mutex		intf_state_mutex;
-	enum mlx5_interface_state interface_state;
+	unsigned long		intf_state;
 	void			(*event) (struct mlx5_core_dev *dev,
 					  enum mlx5_dev_event event,
 					  unsigned long param);
-- 
1.7.1

^ permalink raw reply related

* [PATCH net 0/9] mlx5 driver updates and fixes
From: Saeed Mahameed @ 2016-04-19 12:33 UTC (permalink / raw)
  To: David S. Miller
  Cc: netdev, Or Gerlitz, Tal Alon, Eran Ben Elisha, Saeed Mahameed

Hi Dave,

This series has few bug fixes for mlx5 core and ethernet driver.

Eli fixed a wrong static local variable declaration in flow steering API.
Majd added the support of ConnectX-5 PF and VF and added the support
for kernel shutdown pci callback for more robust reboot procedures.
Maor fixed a soft lockup in flow steering.
Rana fixed a wrog speed define in mlx5 EN driver.
I also had the chance to introduce some bug fixes in mlx5 EN mtu 
reporting and handling, and added the ability to reset link speed 
to device's default.

For -stable:
	net/mlx5_core: Fix soft lockup in steering error flow
	net/mlx5e: Device's mtu field is u16 and not int
	net/mlx5e: Fix minimum MTU
	net/mlx5e: Use vport MTU rather than physical port MTU

Thanks,
Saeed

Eli Cohen (1):
  net/mlx5_core: Remove static from local variable

Majd Dibbiny (2):
  net/mlx5_core: Add ConnectX-5 to list of supported devices
  net/mlx5: Add pci shutdown callback

Maor Gottlieb (1):
  net/mlx5_core: Fix soft lockup in steering error flow

Rana Shahout (1):
  net/mlx5e: Fix MLX5E_100BASE_T define

Saeed Mahameed (4):
  net/mlx5e: Device's mtu field is u16 and not int
  net/mlx5e: Fix minimum MTU
  net/mlx5e: Use vport MTU rather than physical port MTU
  net/mlx5e: Reset link modes upon setting speed to zero

 drivers/net/ethernet/mellanox/mlx5/core/en.h       |    2 +-
 .../net/ethernet/mellanox/mlx5/core/en_ethtool.c   |   26 ++++---
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c  |   72 +++++++++++++++----
 drivers/net/ethernet/mellanox/mlx5/core/fs_core.c  |   48 +++++--------
 drivers/net/ethernet/mellanox/mlx5/core/main.c     |   25 ++++++-
 drivers/net/ethernet/mellanox/mlx5/core/port.c     |   10 ++--
 drivers/net/ethernet/mellanox/mlx5/core/vport.c    |   40 +++++++++++
 include/linux/mlx5/driver.h                        |    7 +-
 include/linux/mlx5/port.h                          |    6 +-
 include/linux/mlx5/vport.h                         |    2 +
 10 files changed, 166 insertions(+), 72 deletions(-)

^ permalink raw reply

* [PATCH net 3/9] net/mlx5_core: Add ConnectX-5 to list of supported devices
From: Saeed Mahameed @ 2016-04-19 12:33 UTC (permalink / raw)
  To: David S. Miller
  Cc: netdev, Or Gerlitz, Tal Alon, Eran Ben Elisha, Majd Dibbiny,
	Saeed Mahameed
In-Reply-To: <1461069222-27076-1-git-send-email-saeedm@mellanox.com>

From: Majd Dibbiny <majd@mellanox.com>

Add the upcoming ConnectX-5 devices (PF and VF) to the list of
supported devices by the mlx5 driver.

Signed-off-by: Majd Dibbiny <majd@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/main.c |    2 ++
 1 files changed, 2 insertions(+), 0 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c
index 3f3b2fa..ddd352a 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c
@@ -1459,6 +1459,8 @@ static const struct pci_device_id mlx5_core_pci_table[] = {
 	{ PCI_VDEVICE(MELLANOX, 0x1014), MLX5_PCI_DEV_IS_VF},	/* ConnectX-4 VF */
 	{ PCI_VDEVICE(MELLANOX, 0x1015) },			/* ConnectX-4LX */
 	{ PCI_VDEVICE(MELLANOX, 0x1016), MLX5_PCI_DEV_IS_VF},	/* ConnectX-4LX VF */
+	{ PCI_VDEVICE(MELLANOX, 0x1017) },			/* ConnectX-5 */
+	{ PCI_VDEVICE(MELLANOX, 0x1018), MLX5_PCI_DEV_IS_VF},	/* ConnectX-5 VF */
 	{ 0, }
 };
 
-- 
1.7.1

^ permalink raw reply related

* [PATCH net 7/9] net/mlx5_core: Remove static from local variable
From: Saeed Mahameed @ 2016-04-19 12:33 UTC (permalink / raw)
  To: David S. Miller
  Cc: netdev, Or Gerlitz, Tal Alon, Eran Ben Elisha, Eli Cohen,
	Saeed Mahameed
In-Reply-To: <1461069222-27076-1-git-send-email-saeedm@mellanox.com>

From: Eli Cohen <eli@mellanox.com>

The static is not required and breaks re-entrancy if it will be required.

Fixes: 2530236303d9 ("net/mlx5_core: Flow steering tree initialization")
Signed-off-by: Eli Cohen <eli@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/fs_core.c |    2 +-
 1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
index 3c7e3e5..89cce97 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
@@ -1276,7 +1276,7 @@ struct mlx5_flow_namespace *mlx5_get_flow_namespace(struct mlx5_core_dev *dev,
 {
 	struct mlx5_flow_root_namespace *root_ns = dev->priv.root_ns;
 	int prio;
-	static struct fs_prio *fs_prio;
+	struct fs_prio *fs_prio;
 	struct mlx5_flow_namespace *ns;
 
 	if (!root_ns)
-- 
1.7.1

^ permalink raw reply related

* [PATCH net 1/9] net/mlx5_core: Fix soft lockup in steering error flow
From: Saeed Mahameed @ 2016-04-19 12:33 UTC (permalink / raw)
  To: David S. Miller
  Cc: netdev, Or Gerlitz, Tal Alon, Eran Ben Elisha, Maor Gottlieb,
	Saeed Mahameed
In-Reply-To: <1461069222-27076-1-git-send-email-saeedm@mellanox.com>

From: Maor Gottlieb <maorg@mellanox.com>

In the error flow of adding flow rule to auto-grouped flow
table, we call to tree_remove_node.

tree_remove_node locks the node's parent, however the node's parent
is already locked by mlx5_add_flow_rule and this causes a deadlock.
After this patch, if we failed to add the flow rule, we unlock the
flow table before calling to tree_remove_node.

fixes: f0d22d187473 ('net/mlx5_core: Introduce flow steering autogrouped
flow table')
Signed-off-by: Maor Gottlieb <maorg@mellanox.com>
Reported-by: Amir Vadai <amir@vadai.me>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/fs_core.c |   46 ++++++++-------------
 1 files changed, 17 insertions(+), 29 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
index 5121be4..3c7e3e5 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
@@ -1065,33 +1065,6 @@ unlock_fg:
 	return rule;
 }
 
-static struct mlx5_flow_rule *add_rule_to_auto_fg(struct mlx5_flow_table *ft,
-						  u8 match_criteria_enable,
-						  u32 *match_criteria,
-						  u32 *match_value,
-						  u8 action,
-						  u32 flow_tag,
-						  struct mlx5_flow_destination *dest)
-{
-	struct mlx5_flow_rule *rule;
-	struct mlx5_flow_group *g;
-
-	g = create_autogroup(ft, match_criteria_enable, match_criteria);
-	if (IS_ERR(g))
-		return (void *)g;
-
-	rule = add_rule_fg(g, match_value,
-			   action, flow_tag, dest);
-	if (IS_ERR(rule)) {
-		/* Remove assumes refcount > 0 and autogroup creates a group
-		 * with a refcount = 0.
-		 */
-		tree_get_node(&g->node);
-		tree_remove_node(&g->node);
-	}
-	return rule;
-}
-
 static struct mlx5_flow_rule *
 _mlx5_add_flow_rule(struct mlx5_flow_table *ft,
 		    u8 match_criteria_enable,
@@ -1119,8 +1092,23 @@ _mlx5_add_flow_rule(struct mlx5_flow_table *ft,
 				goto unlock;
 		}
 
-	rule = add_rule_to_auto_fg(ft, match_criteria_enable, match_criteria,
-				   match_value, action, flow_tag, dest);
+	g = create_autogroup(ft, match_criteria_enable, match_criteria);
+	if (IS_ERR(g)) {
+		rule = (void *)g;
+		goto unlock;
+	}
+
+	rule = add_rule_fg(g, match_value,
+			   action, flow_tag, dest);
+	if (IS_ERR(rule)) {
+		/* Remove assumes refcount > 0 and autogroup creates a group
+		 * with a refcount = 0.
+		 */
+		unlock_ref_node(&ft->node);
+		tree_get_node(&g->node);
+		tree_remove_node(&g->node);
+		return rule;
+	}
 unlock:
 	unlock_ref_node(&ft->node);
 	return rule;
-- 
1.7.1

^ permalink raw reply related

* [PATCH net 4/9] net/mlx5e: Device's mtu field is u16 and not int
From: Saeed Mahameed @ 2016-04-19 12:33 UTC (permalink / raw)
  To: David S. Miller
  Cc: netdev, Or Gerlitz, Tal Alon, Eran Ben Elisha, Saeed Mahameed
In-Reply-To: <1461069222-27076-1-git-send-email-saeedm@mellanox.com>

For set/query MTU port firmware commands the MTU field
is 16 bits, here I changed all the "int mtu" parameters
of the functions wrapping those firmware commands to be u16.

Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c |    4 ++--
 drivers/net/ethernet/mellanox/mlx5/core/port.c    |   10 +++++-----
 include/linux/mlx5/port.h                         |    6 +++---
 3 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index e0adb60..2fbbc62 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -1408,7 +1408,7 @@ static int mlx5e_set_dev_port_mtu(struct net_device *netdev)
 {
 	struct mlx5e_priv *priv = netdev_priv(netdev);
 	struct mlx5_core_dev *mdev = priv->mdev;
-	int hw_mtu;
+	u16 hw_mtu;
 	int err;
 
 	err = mlx5_set_port_mtu(mdev, MLX5E_SW2HW_MTU(netdev->mtu), 1);
@@ -2004,7 +2004,7 @@ static int mlx5e_change_mtu(struct net_device *netdev, int new_mtu)
 	struct mlx5e_priv *priv = netdev_priv(netdev);
 	struct mlx5_core_dev *mdev = priv->mdev;
 	bool was_opened;
-	int max_mtu;
+	u16 max_mtu;
 	int err = 0;
 
 	mlx5_query_port_max_mtu(mdev, &max_mtu, 1);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/port.c b/drivers/net/ethernet/mellanox/mlx5/core/port.c
index ae378c5..53cc1e2 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/port.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/port.c
@@ -247,8 +247,8 @@ int mlx5_query_port_admin_status(struct mlx5_core_dev *dev,
 }
 EXPORT_SYMBOL_GPL(mlx5_query_port_admin_status);
 
-static void mlx5_query_port_mtu(struct mlx5_core_dev *dev, int *admin_mtu,
-				int *max_mtu, int *oper_mtu, u8 port)
+static void mlx5_query_port_mtu(struct mlx5_core_dev *dev, u16 *admin_mtu,
+				u16 *max_mtu, u16 *oper_mtu, u8 port)
 {
 	u32 in[MLX5_ST_SZ_DW(pmtu_reg)];
 	u32 out[MLX5_ST_SZ_DW(pmtu_reg)];
@@ -268,7 +268,7 @@ static void mlx5_query_port_mtu(struct mlx5_core_dev *dev, int *admin_mtu,
 		*admin_mtu = MLX5_GET(pmtu_reg, out, admin_mtu);
 }
 
-int mlx5_set_port_mtu(struct mlx5_core_dev *dev, int mtu, u8 port)
+int mlx5_set_port_mtu(struct mlx5_core_dev *dev, u16 mtu, u8 port)
 {
 	u32 in[MLX5_ST_SZ_DW(pmtu_reg)];
 	u32 out[MLX5_ST_SZ_DW(pmtu_reg)];
@@ -283,14 +283,14 @@ int mlx5_set_port_mtu(struct mlx5_core_dev *dev, int mtu, u8 port)
 }
 EXPORT_SYMBOL_GPL(mlx5_set_port_mtu);
 
-void mlx5_query_port_max_mtu(struct mlx5_core_dev *dev, int *max_mtu,
+void mlx5_query_port_max_mtu(struct mlx5_core_dev *dev, u16 *max_mtu,
 			     u8 port)
 {
 	mlx5_query_port_mtu(dev, NULL, max_mtu, NULL, port);
 }
 EXPORT_SYMBOL_GPL(mlx5_query_port_max_mtu);
 
-void mlx5_query_port_oper_mtu(struct mlx5_core_dev *dev, int *oper_mtu,
+void mlx5_query_port_oper_mtu(struct mlx5_core_dev *dev, u16 *oper_mtu,
 			      u8 port)
 {
 	mlx5_query_port_mtu(dev, NULL, NULL, oper_mtu, port);
diff --git a/include/linux/mlx5/port.h b/include/linux/mlx5/port.h
index a1d145a..b30250a 100644
--- a/include/linux/mlx5/port.h
+++ b/include/linux/mlx5/port.h
@@ -54,9 +54,9 @@ int mlx5_set_port_admin_status(struct mlx5_core_dev *dev,
 int mlx5_query_port_admin_status(struct mlx5_core_dev *dev,
 				 enum mlx5_port_status *status);
 
-int mlx5_set_port_mtu(struct mlx5_core_dev *dev, int mtu, u8 port);
-void mlx5_query_port_max_mtu(struct mlx5_core_dev *dev, int *max_mtu, u8 port);
-void mlx5_query_port_oper_mtu(struct mlx5_core_dev *dev, int *oper_mtu,
+int mlx5_set_port_mtu(struct mlx5_core_dev *dev, u16 mtu, u8 port);
+void mlx5_query_port_max_mtu(struct mlx5_core_dev *dev, u16 *max_mtu, u8 port);
+void mlx5_query_port_oper_mtu(struct mlx5_core_dev *dev, u16 *oper_mtu,
 			      u8 port);
 
 int mlx5_query_port_vl_hw_cap(struct mlx5_core_dev *dev,
-- 
1.7.1

^ permalink raw reply related

* [PATCH net 2/9] net/mlx5e: Fix MLX5E_100BASE_T define
From: Saeed Mahameed @ 2016-04-19 12:33 UTC (permalink / raw)
  To: David S. Miller
  Cc: netdev, Or Gerlitz, Tal Alon, Eran Ben Elisha, Rana Shahout,
	Saeed Mahameed
In-Reply-To: <1461069222-27076-1-git-send-email-saeedm@mellanox.com>

From: Rana Shahout <ranas@mellanox.com>

Bit 25 of eth_proto_capability in PTYS register is
1000Base-TT and not 100Base-T.

Fixes: f62b8bb8f2d3 ('net/mlx5: Extend mlx5_core to
support ConnectX-4 Ethernet functionality')
Signed-off-by: Rana Shahout <ranas@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/en.h       |    2 +-
 .../net/ethernet/mellanox/mlx5/core/en_ethtool.c   |    8 ++++----
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h
index 879e627..e80ce94 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
@@ -609,7 +609,7 @@ enum mlx5e_link_mode {
 	MLX5E_100GBASE_KR4	 = 22,
 	MLX5E_100GBASE_LR4	 = 23,
 	MLX5E_100BASE_TX	 = 24,
-	MLX5E_100BASE_T		 = 25,
+	MLX5E_1000BASE_T	 = 25,
 	MLX5E_10GBASE_T		 = 26,
 	MLX5E_25GBASE_CR	 = 27,
 	MLX5E_25GBASE_KR	 = 28,
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c
index 68834b7..3476ab8 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c
@@ -138,10 +138,10 @@ static const struct {
 	[MLX5E_100BASE_TX]   = {
 		.speed      = 100,
 	},
-	[MLX5E_100BASE_T]    = {
-		.supported  = SUPPORTED_100baseT_Full,
-		.advertised = ADVERTISED_100baseT_Full,
-		.speed      = 100,
+	[MLX5E_1000BASE_T]    = {
+		.supported  = SUPPORTED_1000baseT_Full,
+		.advertised = ADVERTISED_1000baseT_Full,
+		.speed      = 1000,
 	},
 	[MLX5E_10GBASE_T]    = {
 		.supported  = SUPPORTED_10000baseT_Full,
-- 
1.7.1

^ permalink raw reply related

* [PATCH net 9/9] net/mlx5e: Reset link modes upon setting speed to zero
From: Saeed Mahameed @ 2016-04-19 12:33 UTC (permalink / raw)
  To: David S. Miller
  Cc: netdev, Or Gerlitz, Tal Alon, Eran Ben Elisha, Saeed Mahameed
In-Reply-To: <1461069222-27076-1-git-send-email-saeedm@mellanox.com>

Upon ethtool request to set speed to 0 we handle it as a special request
to reset link modes to Device's defaults.

Fixes: f62b8bb8f2d3 ("net/mlx5: Extend mlx5_core to support ConnectX-4
Ethernet functionality")
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 .../net/ethernet/mellanox/mlx5/core/en_ethtool.c   |   18 +++++++++++-------
 1 files changed, 11 insertions(+), 7 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c
index 3476ab8..206f7fc 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c
@@ -640,17 +640,10 @@ static int mlx5e_set_settings(struct net_device *netdev,
 	struct mlx5e_priv *priv    = netdev_priv(netdev);
 	struct mlx5_core_dev *mdev = priv->mdev;
 	u32 link_modes;
-	u32 speed;
 	u32 eth_proto_cap, eth_proto_admin;
 	enum mlx5_port_status ps;
 	int err;
 
-	speed = ethtool_cmd_speed(cmd);
-
-	link_modes = cmd->autoneg == AUTONEG_ENABLE ?
-		mlx5e_ethtool2ptys_adver_link(cmd->advertising) :
-		mlx5e_ethtool2ptys_speed_link(speed);
-
 	err = mlx5_query_port_proto_cap(mdev, &eth_proto_cap, MLX5_PTYS_EN);
 	if (err) {
 		netdev_err(netdev, "%s: query port eth proto cap failed: %d\n",
@@ -658,6 +651,17 @@ static int mlx5e_set_settings(struct net_device *netdev,
 		goto out;
 	}
 
+	if (cmd->autoneg == AUTONEG_ENABLE) {
+		link_modes = mlx5e_ethtool2ptys_adver_link(cmd->advertising);
+	} else {
+		u32 speed = ethtool_cmd_speed(cmd);
+
+		if (speed)
+			link_modes = mlx5e_ethtool2ptys_speed_link(speed);
+		else /* speed 0 means reset link_modes to Device's default */
+			link_modes = eth_proto_cap;
+	}
+
 	link_modes = link_modes & eth_proto_cap;
 	if (!link_modes) {
 		netdev_err(netdev, "%s: Not supported link mode(s) requested",
-- 
1.7.1

^ permalink raw reply related

* [PATCH net 5/9] net/mlx5e: Fix minimum MTU
From: Saeed Mahameed @ 2016-04-19 12:33 UTC (permalink / raw)
  To: David S. Miller
  Cc: netdev, Or Gerlitz, Tal Alon, Eran Ben Elisha, Saeed Mahameed
In-Reply-To: <1461069222-27076-1-git-send-email-saeedm@mellanox.com>

Minimum MTU that can be set in Connectx4 device is 68.

This fixes the case where a user wants to set invalid MTU,
the driver will fail to satisfy this request and the interface
will stay down.

It is better to report an error and continue working with old
mtu.

Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c |   11 ++++++++---
 1 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index 2fbbc62..93e4ef4 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -1999,22 +1999,27 @@ static int mlx5e_set_features(struct net_device *netdev,
 	return err;
 }
 
+#define MXL5_HW_MIN_MTU 64
+#define MXL5E_MIN_MTU (MXL5_HW_MIN_MTU + ETH_FCS_LEN)
+
 static int mlx5e_change_mtu(struct net_device *netdev, int new_mtu)
 {
 	struct mlx5e_priv *priv = netdev_priv(netdev);
 	struct mlx5_core_dev *mdev = priv->mdev;
 	bool was_opened;
 	u16 max_mtu;
+	u16 min_mtu;
 	int err = 0;
 
 	mlx5_query_port_max_mtu(mdev, &max_mtu, 1);
 
 	max_mtu = MLX5E_HW2SW_MTU(max_mtu);
+	min_mtu = MLX5E_HW2SW_MTU(MXL5E_MIN_MTU);
 
-	if (new_mtu > max_mtu) {
+	if (new_mtu > max_mtu || new_mtu < min_mtu) {
 		netdev_err(netdev,
-			   "%s: Bad MTU (%d) > (%d) Max\n",
-			   __func__, new_mtu, max_mtu);
+			   "%s: Bad MTU (%d), valid range is: [%d..%d]\n",
+			   __func__, new_mtu, min_mtu, max_mtu);
 		return -EINVAL;
 	}
 
-- 
1.7.1

^ permalink raw reply related

* Re: [PATCHv3 net-next] net: use jiffies_to_msecs to replace EXPIRES_IN_MS in inet/sctp_diag
From: Xin Long @ 2016-04-19 12:37 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: network dev, linux-sctp, Marcelo Ricardo Leitner, Vlad Yasevich,
	daniel, davem, jsitnick
In-Reply-To: <1461068732.10638.169.camel@edumazet-glaptop3.roam.corp.google.com>

On Tue, Apr 19, 2016 at 8:25 PM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
> On Tue, 2016-04-19 at 15:10 +0800, Xin Long wrote:
>> EXPIRES_IN_MS macro comes from net/ipv4/inet_diag.c and dates
>> back to before jiffies_to_msecs() has been introduced.
>>
>> Now we can remove it and use jiffies_to_msecs().
>>
>> Signed-off-by: Xin Long <lucien.xin@gmail.com>
>> ---
>
> Ok. Note that you could have mentioned this was
>
> Suggested-by: Jakub Sitnicki <jkbs@redhat.com>
>
> Even coworkers deserve credits ;)
yeah, didn't know the tag "Suggested-by:" before.
Thanks. :D

>
> Acked-by: Eric Dumazet <edumazet@google.com>
>
> Thanks.
>
>
>

^ permalink raw reply

* [PATCH 1/4] net: thunderx: Introduce a mailbox message to reset VF counters
From: sunil.kovvuri @ 2016-04-19 13:33 UTC (permalink / raw)
  To: netdev
  Cc: linux-kernel, linux-arm-kernel, sgoutham, robert.richter,
	Jerin Jacob
In-Reply-To: <1461072812-44239-1-git-send-email-sunil.kovvuri@gmail.com>

From: Jerin Jacob <jerin.jacob@caviumnetworks.com>

Write access to VF statistics counter register is only allowed from PF.
Added a new mailbox message to reset VF's Rx/Tx counters, this is used
by userspace DPDK.

Signed-off-by: Jerin Jacob <jerin.jacob@caviumnetworks.com>
Signed-off-by: Sunil Goutham <sgoutham@cavium.com>
---
 drivers/net/ethernet/cavium/thunder/nic.h      |   27 ++++++++++++++
 drivers/net/ethernet/cavium/thunder/nic_main.c |   45 ++++++++++++++++++++++++
 2 files changed, 72 insertions(+), 0 deletions(-)

diff --git a/drivers/net/ethernet/cavium/thunder/nic.h b/drivers/net/ethernet/cavium/thunder/nic.h
index 83025bb..e2ac9bd 100644
--- a/drivers/net/ethernet/cavium/thunder/nic.h
+++ b/drivers/net/ethernet/cavium/thunder/nic.h
@@ -368,6 +368,7 @@ struct nicvf {
 #define	NIC_MBOX_MSG_PNICVF_PTR		0x14	/* Get primary qset nicvf ptr */
 #define	NIC_MBOX_MSG_SNICVF_PTR		0x15	/* Send sqet nicvf ptr to PVF */
 #define	NIC_MBOX_MSG_LOOPBACK		0x16	/* Set interface in loopback */
+#define	NIC_MBOX_MSG_RESET_STAT_COUNTER 0x17	/* Reset statistics counters */
 #define	NIC_MBOX_MSG_CFG_DONE		0xF0	/* VF configuration done */
 #define	NIC_MBOX_MSG_SHUTDOWN		0xF1	/* VF is being shutdown */
 
@@ -484,6 +485,31 @@ struct set_loopback {
 	bool  enable;
 };
 
+/* Reset statistics counters */
+struct reset_stat_cfg {
+	u8    msg;
+	/* Bitmap to select NIC_PF_VNIC(vf_id)_RX_STAT(0..13) */
+	u16   rx_stat_mask;
+	/* Bitmap to select NIC_PF_VNIC(vf_id)_TX_STAT(0..4) */
+	u8    tx_stat_mask;
+	/* Bitmap to select NIC_PF_QS(0..127)_RQ(0..7)_STAT(0..1)
+	 * bit14, bit15 NIC_PF_QS(vf_id)_RQ7_STAT(0..1)
+	 * bit12, bit13 NIC_PF_QS(vf_id)_RQ6_STAT(0..1)
+	 * ..
+	 * bit2, bit3 NIC_PF_QS(vf_id)_RQ1_STAT(0..1)
+	 * bit0, bit1 NIC_PF_QS(vf_id)_RQ0_STAT(0..1)
+	 */
+	u16   rq_stat_mask;
+	/* Bitmap to select NIC_PF_QS(0..127)_SQ(0..7)_STAT(0..1)
+	 * bit14, bit15 NIC_PF_QS(vf_id)_SQ7_STAT(0..1)
+	 * bit12, bit13 NIC_PF_QS(vf_id)_SQ6_STAT(0..1)
+	 * ..
+	 * bit2, bit3 NIC_PF_QS(vf_id)_SQ1_STAT(0..1)
+	 * bit0, bit1 NIC_PF_QS(vf_id)_SQ0_STAT(0..1)
+	 */
+	u16   sq_stat_mask;
+};
+
 /* 128 bit shared memory between PF and each VF */
 union nic_mbx {
 	struct { u8 msg; }	msg;
@@ -501,6 +527,7 @@ union nic_mbx {
 	struct sqs_alloc        sqs_alloc;
 	struct nicvf_ptr	nicvf;
 	struct set_loopback	lbk;
+	struct reset_stat_cfg	reset_stat;
 };
 
 #define NIC_NODE_ID_MASK	0x03
diff --git a/drivers/net/ethernet/cavium/thunder/nic_main.c b/drivers/net/ethernet/cavium/thunder/nic_main.c
index 95f17f8..77ee260 100644
--- a/drivers/net/ethernet/cavium/thunder/nic_main.c
+++ b/drivers/net/ethernet/cavium/thunder/nic_main.c
@@ -616,6 +616,48 @@ static int nic_config_loopback(struct nicpf *nic, struct set_loopback *lbk)
 	return 0;
 }
 
+/* Reset statistics counters */
+static int nic_reset_stat_counters(struct nicpf *nic,
+				   int vf, struct reset_stat_cfg *cfg)
+{
+	int i, stat, qnum;
+	u64 reg_addr;
+
+	for (i = 0; i < RX_STATS_ENUM_LAST; i++) {
+		if (cfg->rx_stat_mask & BIT(i)) {
+			reg_addr = NIC_PF_VNIC_0_127_RX_STAT_0_13 |
+				   (vf << NIC_QS_ID_SHIFT) |
+				   (i << 3);
+			nic_reg_write(nic, reg_addr, 0);
+		}
+	}
+
+	for (i = 0; i < TX_STATS_ENUM_LAST; i++) {
+		if (cfg->tx_stat_mask & BIT(i)) {
+			reg_addr = NIC_PF_VNIC_0_127_TX_STAT_0_4 |
+				   (vf << NIC_QS_ID_SHIFT) |
+				   (i << 3);
+			nic_reg_write(nic, reg_addr, 0);
+		}
+	}
+
+	for (i = 0; i <= 15; i++) {
+		qnum = i >> 1;
+		stat = i & 1 ? 1 : 0;
+		reg_addr = (vf << NIC_QS_ID_SHIFT) |
+			   (qnum << NIC_Q_NUM_SHIFT) | (stat << 3);
+		if (cfg->rq_stat_mask & BIT(i)) {
+			reg_addr |= NIC_PF_QSET_0_127_RQ_0_7_STAT_0_1;
+			nic_reg_write(nic, reg_addr, 0);
+		}
+		if (cfg->sq_stat_mask & BIT(i)) {
+			reg_addr |= NIC_PF_QSET_0_127_SQ_0_7_STAT_0_1;
+			nic_reg_write(nic, reg_addr, 0);
+		}
+	}
+	return 0;
+}
+
 static void nic_enable_vf(struct nicpf *nic, int vf, bool enable)
 {
 	int bgx, lmac;
@@ -757,6 +799,9 @@ static void nic_handle_mbx_intr(struct nicpf *nic, int vf)
 	case NIC_MBOX_MSG_LOOPBACK:
 		ret = nic_config_loopback(nic, &mbx.lbk);
 		break;
+	case NIC_MBOX_MSG_RESET_STAT_COUNTER:
+		ret = nic_reset_stat_counters(nic, vf, &mbx.reset_stat);
+		break;
 	default:
 		dev_err(&nic->pdev->dev,
 			"Invalid msg from VF%d, msg 0x%x\n", vf, mbx.msg.msg);
-- 
1.7.1

^ permalink raw reply related

* [RFC PATCH net-next 0/8] Handle multiple received packets at each stage
From: Edward Cree @ 2016-04-19 13:33 UTC (permalink / raw)
  To: netdev, David Miller; +Cc: Jesper Dangaard Brouer, linux-net-drivers

Earlier discussions on this list[1] suggested that having multiple packets
traverse the network stack together (rather than calling the stack for each
packet singly) could improve performance through better cache locality.
This patch series is an attempt to implement this by having drivers pass an
SKB list to the stack at the end of the NAPI poll.  The stack then attempts
to keep the list together, only splitting it when either packets need to be
treated differently, or the next layer of the stack is not list-aware.

The first two patches simply place received packets on a list during the
event processing loop on the sfc EF10 architecture, then call the normal
stack for each packet singly at the end of the NAPI poll.
The remaining patches extend the 'listified' processing as far as the IP
receive handler.

Packet rate was tested with NetPerf UDP_STREAM, with 10 streams of 1-byte
packets, and the process and interrupt pinned to a single core on the RX
side.
The NIC was a 40G Solarflare 7x42Q; the CPU was a Xeon E3-1220V2 @ 3.10GHz.
Baseline:      5.07Mpps
after patch 2: 5.59Mpps (10.2% above baseline)
after patch 8: 6.44Mpps (25.6% above baseline)

I also attempted to measure the latency, but couldn't get reliable numbers;
my best estimate is that the series cost about 160ns if interrupt moderation
is disabled and busy-poll is enabled; about 60ns vice-versa.
I tried adding a check in the driver to only perform bundling if interrupt
moderation was active on the channel, but was unable to demonstrate any
latency gain from this, so I have omitted it from this series.

[1] http://thread.gmane.org/gmane.linux.network/395502

Edward Cree (8):
  net: core: trivial netif_receive_skb_list() entry point
  sfc: batch up RX delivery on EF10
  net: core: unwrap skb list receive slightly further
  net: core: Another step of skb receive list processing
  net: core: another layer of lists, around PF_MEMALLOC skb handling
  net: core: propagate SKB lists through packet_type lookup
  net: ipv4: listified version of ip_rcv
  net: ipv4: listify ip_rcv_finish

 drivers/net/ethernet/sfc/ef10.c       |   9 ++
 drivers/net/ethernet/sfc/efx.c        |   2 +
 drivers/net/ethernet/sfc/net_driver.h |   3 +
 drivers/net/ethernet/sfc/rx.c         |   7 +-
 include/linux/netdevice.h             |   4 +
 include/linux/netfilter.h             |  27 ++++
 include/linux/skbuff.h                |  16 +++
 include/net/ip.h                      |   2 +
 include/trace/events/net.h            |  14 ++
 net/core/dev.c                        | 245 ++++++++++++++++++++++++++++------
 net/ipv4/af_inet.c                    |   1 +
 net/ipv4/ip_input.c                   | 127 ++++++++++++++++--
 12 files changed, 409 insertions(+), 48 deletions(-)

^ permalink raw reply

* [PATCH 0/4] net: thunderx: Add multiqset support for DPDK
From: sunil.kovvuri @ 2016-04-19 13:33 UTC (permalink / raw)
  To: netdev; +Cc: linux-kernel, linux-arm-kernel, sgoutham, robert.richter

From: Sunil Goutham <sgoutham@cavium.com>

This patch series mainly adds support for userspace application
like DPDK with a VNIC VF attached to request additional QSets
for having morethan the default 8 queues.

Jerin Jacob (1):
  net: thunderx: Introduce a mailbox message to reset VF counters

Radoslaw Biernacki (3):
  net: thunderx: Add multiqset support for dataplane apps
  net: thunderx: add sysfs attribute for SQS/SVF assigments
  net: thunderx: Improvement for MBX interface debug messages

 drivers/net/ethernet/cavium/thunder/nic.h        |   32 ++-
 drivers/net/ethernet/cavium/thunder/nic_main.c   |  321 +++++++++++++++++++---
 drivers/net/ethernet/cavium/thunder/nicvf_main.c |   10 +-
 3 files changed, 324 insertions(+), 39 deletions(-)

^ permalink raw reply

* [PATCH 2/4] net: thunderx: Add multiqset support for dataplane apps
From: sunil.kovvuri @ 2016-04-19 13:33 UTC (permalink / raw)
  To: netdev
  Cc: linux-kernel, linux-arm-kernel, sgoutham, robert.richter,
	Radoslaw Biernacki
In-Reply-To: <1461072812-44239-1-git-send-email-sunil.kovvuri@gmail.com>

From: Radoslaw Biernacki <rad@semihalf.com>

This patch adds support to PF for allocating additional Qsets to
dataplane apps such as DPDK. Till now PF, upon host bound interface's
request it used to allocate Qsets from the free ones, but for dataplane
apps support has been added for it to request specific Qsets instead of
just PF's choice.

And also adds validation checks at different places, these are needed to
 have proper secondary Qset allocation when interfaces in different domain
i.e Host, VFIO, DPDK e.t.c exist at the same time.

Some of the checks are
- Check if RSS indirection table has valid entries.
- When host bound interface requests additional Qsets, PF should
  assign only those which in host domain i.e both primary VF and
  secondary VFs should be using same driver. Hence added PCI driver
  checks.
- If dataplane app terminates without proper shutdown then when
  restarted it will request the same or different SQsets as were
  assigned before. This is taken care of otherwise application
  won't recover.

Removed 'sqs_used' which became redundant due to new SQset allocation scheme.

Signed-off-by: Radoslaw Biernacki <rad@semihalf.com>
Signed-off-by: Sunil Goutham <sgoutham@cavium.com>
---
 drivers/net/ethernet/cavium/thunder/nic.h        |    5 +-
 drivers/net/ethernet/cavium/thunder/nic_main.c   |  193 +++++++++++++++++++---
 drivers/net/ethernet/cavium/thunder/nicvf_main.c |    2 +-
 3 files changed, 170 insertions(+), 30 deletions(-)

diff --git a/drivers/net/ethernet/cavium/thunder/nic.h b/drivers/net/ethernet/cavium/thunder/nic.h
index e2ac9bd..b63278a 100644
--- a/drivers/net/ethernet/cavium/thunder/nic.h
+++ b/drivers/net/ethernet/cavium/thunder/nic.h
@@ -463,11 +463,12 @@ struct bgx_link_status {
 	u32   speed;
 };
 
-/* Get Extra Qset IDs */
+/* Allocate additional SQS to VF */
 struct sqs_alloc {
 	u8    msg;
-	u8    vf_id;
+	u8    spec; /* 1 - For specific SQS allocation, 0 - For PF's choice */
 	u8    qs_count;
+	u8    svf[MAX_SQS_PER_VF]; /* SQS VF ids for specific allocation */
 };
 
 struct nicvf_ptr {
diff --git a/drivers/net/ethernet/cavium/thunder/nic_main.c b/drivers/net/ethernet/cavium/thunder/nic_main.c
index 77ee260..d6a6914 100644
--- a/drivers/net/ethernet/cavium/thunder/nic_main.c
+++ b/drivers/net/ethernet/cavium/thunder/nic_main.c
@@ -29,9 +29,9 @@ struct nicpf {
 	void __iomem		*reg_base;       /* Register start address */
 	u8			num_sqs_en;	/* Secondary qsets enabled */
 	u64			nicvf[MAX_NUM_VFS_SUPPORTED];
+#define	NIC_VF_UNASSIGNED	((u8)0xFF)
 	u8			vf_sqs[MAX_NUM_VFS_SUPPORTED][MAX_SQS_PER_VF];
 	u8			pqs_vf[MAX_NUM_VFS_SUPPORTED];
-	bool			sqs_used[MAX_NUM_VFS_SUPPORTED];
 	struct pkind_cfg	pkind;
 #define	NIC_SET_VF_LMAC_MAP(bgx, lmac)	(((bgx & 0xF) << 4) | (lmac & 0xF))
 #define	NIC_GET_BGX_FROM_VF_LMAC_MAP(map)	((map >> 4) & 0xF)
@@ -46,6 +46,7 @@ struct nicpf {
 	u16			rssi_base[MAX_NUM_VFS_SUPPORTED];
 	u16			rss_ind_tbl_size;
 	bool			mbx_lock[MAX_NUM_VFS_SUPPORTED];
+	struct pci_dev		*vf_pdev[MAX_NUM_VFS_SUPPORTED];
 
 	/* MSI-X */
 	bool			msix_enabled;
@@ -458,10 +459,18 @@ static void nic_config_rss(struct nicpf *nic, struct rss_cfg_msg *cfg)
 	for (; rssi < (rssi_base + cfg->tbl_len); rssi++) {
 		u8 svf = cfg->ind_tbl[idx] >> 3;
 
-		if (svf)
+		if (svf && (svf <= MAX_SQS_PER_VF)) {
 			qset = nic->vf_sqs[cfg->vf_id][svf - 1];
-		else
+			if ((qset >= MAX_NUM_VFS_SUPPORTED) ||
+			    (nic->pqs_vf[qset] != cfg->vf_id)) {
+				dev_err(&nic->pdev->dev,
+					"Invalid rss table entry %d from VF %d\n",
+					cfg->ind_tbl[idx], cfg->vf_id);
+				qset = cfg->vf_id;
+			}
+		} else {
 			qset = cfg->vf_id;
+		}
 		nic_reg_write(nic, NIC_PF_RSSI_0_4097_RQ | (rssi << 3),
 			      (qset << 3) | (cfg->ind_tbl[idx] & 0x7));
 		idx++;
@@ -550,7 +559,19 @@ static void nic_send_pnicvf(struct nicpf *nic, int sqs)
 static void nic_send_snicvf(struct nicpf *nic, struct nicvf_ptr *nicvf)
 {
 	union nic_mbx mbx = {};
-	int sqs_id = nic->vf_sqs[nicvf->vf_id][nicvf->sqs_id];
+	int sqs_id;
+
+	if (nicvf->sqs_id >= MAX_SQS_PER_VF) {
+		nic_mbx_send_nack(nic, nicvf->vf_id);
+		return;
+	}
+
+	sqs_id = nic->vf_sqs[nicvf->vf_id][nicvf->sqs_id];
+	if ((sqs_id < nic->num_vf_en) ||
+	    (nic->pqs_vf[sqs_id] != nicvf->vf_id)) {
+		nic_mbx_send_nack(nic, nicvf->vf_id);
+		return;
+	}
 
 	mbx.nicvf.msg = NIC_MBOX_MSG_SNICVF_PTR;
 	mbx.nicvf.sqs_id = nicvf->sqs_id;
@@ -558,47 +579,150 @@ static void nic_send_snicvf(struct nicpf *nic, struct nicvf_ptr *nicvf)
 	nic_send_msg_to_vf(nic, nicvf->vf_id, &mbx);
 }
 
+/* Find and take reference to all vf devices */
+static void nic_get_vf_pdev(struct nicpf *nic, int vf_en)
+{
+	struct pci_dev *pdev = nic->pdev;
+	struct pci_dev *vfdev;
+	u16 vid = pdev->vendor;
+	u16 devid;
+	int vf = 0, pos;
+
+	pos = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_SRIOV);
+	if (!pos)
+		return;
+	pci_read_config_word(pdev, pos + PCI_SRIOV_VF_DID, &devid);
+
+	vfdev = pci_get_device(vid, devid, NULL);
+	for (; vfdev; vfdev = pci_get_device(vid, devid, vfdev)) {
+		if (!vfdev->is_virtfn)
+			continue;
+		if (vfdev->physfn != pdev)
+			continue;
+		if (vf >= vf_en)
+			continue;
+		nic->vf_pdev[vf] = vfdev;
+		pci_dev_get(vfdev);
+		++vf;
+	}
+}
+
+/* Release references to all vf devices */
+static void nic_put_vf_pdev(struct nicpf *nic)
+{
+	int vf;
+
+	for (vf = 0; vf < MAX_NUM_VFS_SUPPORTED; vf++) {
+		struct pci_dev *vfdev = nic->vf_pdev[vf];
+
+		nic->vf_pdev[vf] = NULL;
+		if (vfdev)
+			pci_dev_put(vfdev);
+	}
+}
+
+/* Check if pri.VF and sec.VF are in same domain i.e bound to same driver */
+static bool nic_check_svf_drv(struct nicpf *nic, u8 pvf, u8 svf)
+{
+	return pci_dev_driver(nic->vf_pdev[pvf]) ==
+	       pci_dev_driver(nic->vf_pdev[svf]);
+}
+
 /* Find next available Qset that can be assigned as a
  * secondary Qset to a VF.
  */
-static int nic_nxt_avail_sqs(struct nicpf *nic)
+static int nic_nxt_avail_sqs(struct nicpf *nic, u8 pvf)
 {
 	int sqs;
 
-	for (sqs = 0; sqs < nic->num_sqs_en; sqs++) {
-		if (!nic->sqs_used[sqs])
-			nic->sqs_used[sqs] = true;
+	for (sqs = nic->num_vf_en;
+	     sqs < (nic->num_vf_en + nic->num_sqs_en); sqs++) {
+		if ((nic->pqs_vf[sqs] == NIC_VF_UNASSIGNED) &&
+		    nic_check_svf_drv(nic, pvf, sqs))
+			nic->pqs_vf[sqs] = pvf;
 		else
 			continue;
-		return sqs + nic->num_vf_en;
+		return sqs;
 	}
 	return -1;
 }
 
 /* Allocate additional Qsets for requested VF */
-static void nic_alloc_sqs(struct nicpf *nic, struct sqs_alloc *sqs)
+static void nic_alloc_sqs(struct nicpf *nic, u8 pvf, struct sqs_alloc *sqs)
 {
 	union nic_mbx mbx = {};
 	int idx, alloc_qs = 0;
 	int sqs_id;
 
-	if (!nic->num_sqs_en)
+	if (!nic->num_sqs_en || (sqs->qs_count > MAX_SQS_PER_VF))
 		goto send_mbox;
 
-	for (idx = 0; idx < sqs->qs_count; idx++) {
-		sqs_id = nic_nxt_avail_sqs(nic);
-		if (sqs_id < 0)
-			break;
-		nic->vf_sqs[sqs->vf_id][idx] = sqs_id;
-		nic->pqs_vf[sqs_id] = sqs->vf_id;
-		alloc_qs++;
+	if (sqs->spec) {
+		for (idx = 0; idx < sqs->qs_count; idx++) {
+			sqs_id = sqs->svf[idx];
+
+			/* Check if desired SQS is within the allowed range */
+			if (!((sqs_id >= nic->num_vf_en) &&
+			      (sqs_id < (nic->num_vf_en + nic->num_sqs_en)))) {
+				dev_err(&nic->pdev->dev,
+					"Req SQS is invalid sqs->svf[%d]=%u",
+					idx, sqs_id);
+				break;
+			}
+
+			/* Check if desired SQS is free or assigned to a PVF */
+			if ((nic->pqs_vf[sqs_id] != NIC_VF_UNASSIGNED) &&
+			    (nic->pqs_vf[sqs_id] != pvf)) {
+				dev_err(&nic->pdev->dev,
+					"SQS%d is already allocated to VF%u",
+					sqs_id, nic->pqs_vf[sqs_id]);
+				break;
+			}
+
+			/* Check if SQS is bound to the same driver as PVF */
+			if (!nic_check_svf_drv(nic, pvf, sqs_id)) {
+				dev_err(&nic->pdev->dev,
+					"SQS%d use different driver", sqs_id);
+				break;
+			}
+		}
+
+		if (idx != sqs->qs_count)
+			goto send_mbox;
+
+		/* Clear any existing assignments */
+		for (idx = 0; idx < MAX_SQS_PER_VF; idx++)
+			nic->vf_sqs[pvf][idx] = NIC_VF_UNASSIGNED;
+		for (idx = nic->num_vf_en;
+		     idx < (nic->num_vf_en + nic->num_sqs_en); idx++) {
+			if (nic->pqs_vf[idx] == pvf)
+				nic->pqs_vf[idx] = NIC_VF_UNASSIGNED;
+		}
+
+		/* Populate VF's SQS table */
+		for (idx = 0; idx < sqs->qs_count; idx++) {
+			sqs_id = sqs->svf[idx];
+			nic->vf_sqs[pvf][idx] = sqs_id;
+			nic->pqs_vf[sqs_id] = pvf;
+			mbx.sqs_alloc.svf[idx] = sqs_id;
+		}
+		alloc_qs = idx;
+	} else {
+		for (idx = 0; idx < sqs->qs_count; idx++) {
+			sqs_id = nic_nxt_avail_sqs(nic, pvf);
+			if (sqs_id < 0)
+				break;
+			nic->vf_sqs[pvf][idx] = sqs_id;
+			nic->pqs_vf[sqs_id] = pvf;
+			mbx.sqs_alloc.svf[idx] = sqs_id;
+		}
+		alloc_qs = idx;
 	}
 
 send_mbox:
 	mbx.sqs_alloc.msg = NIC_MBOX_MSG_ALLOC_SQS;
-	mbx.sqs_alloc.vf_id = sqs->vf_id;
 	mbx.sqs_alloc.qs_count = alloc_qs;
-	nic_send_msg_to_vf(nic, sqs->vf_id, &mbx);
+	nic_send_msg_to_vf(nic, pvf, &mbx);
 }
 
 static int nic_config_loopback(struct nicpf *nic, struct set_loopback *lbk)
@@ -776,13 +900,15 @@ static void nic_handle_mbx_intr(struct nicpf *nic, int vf)
 		goto unlock;
 	case NIC_MBOX_MSG_SHUTDOWN:
 		/* First msg in VF teardown sequence */
-		if (vf >= nic->num_vf_en)
-			nic->sqs_used[vf - nic->num_vf_en] = false;
-		nic->pqs_vf[vf] = 0;
+		if (vf < nic->num_vf_en) {
+			for (i = 0; i < MAX_SQS_PER_VF; i++)
+				nic->vf_sqs[vf][i] = NIC_VF_UNASSIGNED;
+		}
+		nic->pqs_vf[vf] = NIC_VF_UNASSIGNED;
 		nic_enable_vf(nic, vf, false);
 		break;
 	case NIC_MBOX_MSG_ALLOC_SQS:
-		nic_alloc_sqs(nic, &mbx.sqs_alloc);
+		nic_alloc_sqs(nic, vf, &mbx.sqs_alloc);
 		goto unlock;
 	case NIC_MBOX_MSG_NICVF_PTR:
 		nic->nicvf[vf] = mbx.nicvf.nicvf;
@@ -979,6 +1105,8 @@ static int nic_sriov_init(struct pci_dev *pdev, struct nicpf *nic)
 		return err;
 	}
 
+	nic_get_vf_pdev(nic, vf_en);
+
 	dev_info(&pdev->dev, "SRIOV enabled, number of VF available %d\n",
 		 vf_en);
 
@@ -1035,7 +1163,7 @@ static int nic_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
 {
 	struct device *dev = &pdev->dev;
 	struct nicpf *nic;
-	int    err;
+	int    err, vf, sqs;
 
 	BUILD_BUG_ON(sizeof(union nic_mbx) > 16);
 
@@ -1090,6 +1218,13 @@ static int nic_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
 	/* Set RSS TBL size for each VF */
 	nic->rss_ind_tbl_size = NIC_MAX_RSS_IDR_TBL_SIZE;
 
+	/* Initialize all VF's primary Qset */
+	for (vf = 0; vf < MAX_NUM_VFS_SUPPORTED; vf++) {
+		nic->pqs_vf[vf] = NIC_VF_UNASSIGNED;
+		for (sqs = 0; sqs < MAX_SQS_PER_VF; sqs++)
+			nic->vf_sqs[vf][sqs] = NIC_VF_UNASSIGNED;
+	}
+
 	/* Register interrupts */
 	err = nic_register_interrupts(nic);
 	if (err)
@@ -1114,8 +1249,10 @@ static int nic_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
 	return 0;
 
 err_disable_sriov:
-	if (nic->flags & NIC_SRIOV_ENABLED)
+	if (nic->flags & NIC_SRIOV_ENABLED) {
+		nic_put_vf_pdev(nic);
 		pci_disable_sriov(pdev);
+	}
 err_unregister_interrupts:
 	nic_unregister_interrupts(nic);
 err_release_regions:
@@ -1130,8 +1267,10 @@ static void nic_remove(struct pci_dev *pdev)
 {
 	struct nicpf *nic = pci_get_drvdata(pdev);
 
-	if (nic->flags & NIC_SRIOV_ENABLED)
+	if (nic->flags & NIC_SRIOV_ENABLED) {
+		nic_put_vf_pdev(nic);
 		pci_disable_sriov(pdev);
+	}
 
 	if (nic->check_link) {
 		/* Destroy work Queue */
diff --git a/drivers/net/ethernet/cavium/thunder/nicvf_main.c b/drivers/net/ethernet/cavium/thunder/nicvf_main.c
index bfee298..87d0f56 100644
--- a/drivers/net/ethernet/cavium/thunder/nicvf_main.c
+++ b/drivers/net/ethernet/cavium/thunder/nicvf_main.c
@@ -386,7 +386,7 @@ static void nicvf_request_sqs(struct nicvf *nic)
 		return;
 
 	mbx.sqs_alloc.msg = NIC_MBOX_MSG_ALLOC_SQS;
-	mbx.sqs_alloc.vf_id = nic->vf_id;
+	mbx.sqs_alloc.spec = 0; /* Let PF choose which SQS to alloc */
 	mbx.sqs_alloc.qs_count = nic->sqs_count;
 	if (nicvf_send_msg_to_pf(nic, &mbx)) {
 		/* No response from PF */
-- 
1.7.1

^ permalink raw reply related

* [PATCH 3/4] net: thunderx: add sysfs attribute for SQS/SVF assigments
From: sunil.kovvuri @ 2016-04-19 13:33 UTC (permalink / raw)
  To: netdev
  Cc: linux-kernel, linux-arm-kernel, sgoutham, robert.richter,
	Radoslaw Biernacki
In-Reply-To: <1461072812-44239-1-git-send-email-sunil.kovvuri@gmail.com>

From: Radoslaw Biernacki <rad@semihalf.com>

With this sysfs attribute (sriov_sqs_assignment) administrator will be
able to read the current assigment of SQS/SVF for a given VF. This is
useful to decide which VFs needs to be attached to UIO for a successful
allocation of secondary Qsets

Signed-off-by: Radoslaw Biernacki <rad@semihalf.com>
Signed-off-by: Sunil Goutham <sgoutham@cavium.com>
---
 drivers/net/ethernet/cavium/thunder/nic_main.c |   67 +++++++++++++++++++++++-
 1 files changed, 66 insertions(+), 1 deletions(-)

diff --git a/drivers/net/ethernet/cavium/thunder/nic_main.c b/drivers/net/ethernet/cavium/thunder/nic_main.c
index d6a6914..e2d8db9 100644
--- a/drivers/net/ethernet/cavium/thunder/nic_main.c
+++ b/drivers/net/ethernet/cavium/thunder/nic_main.c
@@ -1159,6 +1159,60 @@ static void nic_poll_for_link(struct work_struct *work)
 	queue_delayed_work(nic->check_link, &nic->dwork, HZ * 2);
 }
 
+ssize_t sriov_sqs_assignment_show(struct device *dev,
+				  struct device_attribute *attr, char *buf)
+{
+	struct pci_dev *pdev = container_of(dev, struct pci_dev, dev);
+	struct pci_dev *vf_dev;
+	struct pci_driver *vf_drv;
+	struct nicpf *nic = pci_get_drvdata(pdev);
+	size_t vf, off, svf_idx;
+
+	off = scnprintf(buf, PAGE_SIZE, "%u\n", nic->num_vf_en);
+
+	for (vf = 0; vf < nic->num_vf_en; vf++) {
+		vf_dev = nic->vf_pdev[vf];
+		vf_drv = vf_dev ? pci_dev_driver(vf_dev) : NULL;
+		if (off >= PAGE_SIZE)
+			break;
+		off += scnprintf(&buf[off], PAGE_SIZE - off,
+				 "%zu %04x:%02x:%02x.%d %s %c:",
+				 vf, pci_domain_nr(vf_dev->bus),
+				 vf_dev->bus->number, PCI_SLOT(vf_dev->devfn),
+				 PCI_FUNC(vf_dev->devfn),
+				 vf_drv ? vf_drv->name : "no-driver",
+				 nic->vf_enabled[vf] ? '+' : '-');
+		for (svf_idx = 0; svf_idx < MAX_SQS_PER_VF; svf_idx++) {
+			if (off >= PAGE_SIZE)
+				break;
+			if (nic->vf_sqs[vf][svf_idx] == NIC_VF_UNASSIGNED)
+				break;
+			off += scnprintf(&buf[off], PAGE_SIZE - off, " %d",
+					 nic->vf_sqs[vf][svf_idx]);
+		}
+		if (off >= PAGE_SIZE)
+			break;
+		off += scnprintf(&buf[off], PAGE_SIZE - off, "\n");
+	}
+
+	for (vf = nic->num_vf_en; vf < nic->num_vf_en + nic->num_sqs_en; vf++) {
+		vf_dev = nic->vf_pdev[vf];
+		vf_drv = vf_dev ? pci_dev_driver(vf_dev) : NULL;
+		if (off >= PAGE_SIZE)
+			break;
+		off += scnprintf(&buf[off], PAGE_SIZE - off,
+				 "%zu %04x:%02x:%02x.%d %s: %u\n",
+				 vf, pci_domain_nr(vf_dev->bus),
+				 vf_dev->bus->number, PCI_SLOT(vf_dev->devfn),
+				 PCI_FUNC(vf_dev->devfn),
+				 vf_drv ? vf_drv->name : "no-driver",
+				 nic->pqs_vf[vf]);
+	}
+
+	return off;
+}
+DEVICE_ATTR_RO(sriov_sqs_assignment);
+
 static int nic_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
 {
 	struct device *dev = &pdev->dev;
@@ -1235,12 +1289,18 @@ static int nic_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
 	if (err)
 		goto err_unregister_interrupts;
 
+	err = device_create_file(dev, &dev_attr_sriov_sqs_assignment);
+	if (err) {
+		err = -ENOMEM;
+		goto err_disable_sriov;
+	}
+
 	/* Register a physical link status poll fn() */
 	nic->check_link = alloc_workqueue("check_link_status",
 					  WQ_UNBOUND | WQ_MEM_RECLAIM, 1);
 	if (!nic->check_link) {
 		err = -ENOMEM;
-		goto err_disable_sriov;
+		goto err_remove_sysfs_attr;
 	}
 
 	INIT_DELAYED_WORK(&nic->dwork, nic_poll_for_link);
@@ -1248,6 +1308,8 @@ static int nic_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
 
 	return 0;
 
+err_remove_sysfs_attr:
+	device_remove_file(dev, &dev_attr_sriov_sqs_assignment);
 err_disable_sriov:
 	if (nic->flags & NIC_SRIOV_ENABLED) {
 		nic_put_vf_pdev(nic);
@@ -1266,6 +1328,9 @@ err_disable_device:
 static void nic_remove(struct pci_dev *pdev)
 {
 	struct nicpf *nic = pci_get_drvdata(pdev);
+	struct device *dev = &pdev->dev;
+
+	device_remove_file(dev, &dev_attr_sriov_sqs_assignment);
 
 	if (nic->flags & NIC_SRIOV_ENABLED) {
 		nic_put_vf_pdev(nic);
-- 
1.7.1

^ permalink raw reply related

* [PATCH 4/4] net: thunderx: Improvement for MBX interface debug messages
From: sunil.kovvuri @ 2016-04-19 13:33 UTC (permalink / raw)
  To: netdev
  Cc: linux-kernel, linux-arm-kernel, sgoutham, robert.richter,
	Radoslaw Biernacki
In-Reply-To: <1461072812-44239-1-git-send-email-sunil.kovvuri@gmail.com>

From: Radoslaw Biernacki <rad@semihalf.com>

Adding debug messages in case of NACK for a mailbox message, also
did small cleanups.

Signed-off-by: Radoslaw Biernacki <rad@semihalf.com>
Signed-off-by: Sunil Goutham <sgoutham@cavium.com>
---
 drivers/net/ethernet/cavium/thunder/nic_main.c   |   16 ++++++++++------
 drivers/net/ethernet/cavium/thunder/nicvf_main.c |    8 ++++++--
 2 files changed, 16 insertions(+), 8 deletions(-)

diff --git a/drivers/net/ethernet/cavium/thunder/nic_main.c b/drivers/net/ethernet/cavium/thunder/nic_main.c
index e2d8db9..550a950 100644
--- a/drivers/net/ethernet/cavium/thunder/nic_main.c
+++ b/drivers/net/ethernet/cavium/thunder/nic_main.c
@@ -820,7 +820,7 @@ static void nic_handle_mbx_intr(struct nicpf *nic, int vf)
 		mbx_addr += sizeof(u64);
 	}
 
-	dev_dbg(&nic->pdev->dev, "%s: Mailbox msg %d from VF%d\n",
+	dev_dbg(&nic->pdev->dev, "%s: Mailbox msg 0x%02x from VF%d\n",
 		__func__, mbx.msg.msg, vf);
 	switch (mbx.msg.msg) {
 	case NIC_MBOX_MSG_READY:
@@ -830,8 +830,7 @@ static void nic_handle_mbx_intr(struct nicpf *nic, int vf)
 			nic->duplex[vf] = 0;
 			nic->speed[vf] = 0;
 		}
-		ret = 1;
-		break;
+		goto unlock;
 	case NIC_MBOX_MSG_QS_CFG:
 		reg_addr = NIC_PF_QSET_0_127_CFG |
 			   (mbx.qs.num << NIC_QS_ID_SHIFT);
@@ -873,8 +872,10 @@ static void nic_handle_mbx_intr(struct nicpf *nic, int vf)
 		nic_tx_channel_cfg(nic, mbx.qs.num, &mbx.sq);
 		break;
 	case NIC_MBOX_MSG_SET_MAC:
-		if (vf >= nic->num_vf_en)
+		if (vf >= nic->num_vf_en) {
+			ret = -1; /* NACK */
 			break;
+		}
 		lmac = mbx.mac.vf_id;
 		bgx = NIC_GET_BGX_FROM_VF_LMAC_MAP(nic->vf_lmac_map[lmac]);
 		lmac = NIC_GET_LMAC_FROM_VF_LMAC_MAP(nic->vf_lmac_map[lmac]);
@@ -934,10 +935,13 @@ static void nic_handle_mbx_intr(struct nicpf *nic, int vf)
 		break;
 	}
 
-	if (!ret)
+	if (!ret) {
 		nic_mbx_send_ack(nic, vf);
-	else if (mbx.msg.msg != NIC_MBOX_MSG_READY)
+	} else if (mbx.msg.msg != NIC_MBOX_MSG_READY) {
+		dev_err(&nic->pdev->dev, "NACK for MBOX 0x%02x from VF %d\n",
+			mbx.msg.msg, vf);
 		nic_mbx_send_nack(nic, vf);
+	}
 unlock:
 	nic->mbx_lock[vf] = false;
 }
diff --git a/drivers/net/ethernet/cavium/thunder/nicvf_main.c b/drivers/net/ethernet/cavium/thunder/nicvf_main.c
index 87d0f56..12ea73a 100644
--- a/drivers/net/ethernet/cavium/thunder/nicvf_main.c
+++ b/drivers/net/ethernet/cavium/thunder/nicvf_main.c
@@ -134,15 +134,19 @@ int nicvf_send_msg_to_pf(struct nicvf *nic, union nic_mbx *mbx)
 
 	/* Wait for previous message to be acked, timeout 2sec */
 	while (!nic->pf_acked) {
-		if (nic->pf_nacked)
+		if (nic->pf_nacked) {
+			netdev_err(nic->netdev,
+				   "PF NACK to mbox msg 0x%02x from VF%d\n",
+				   (mbx->msg.msg & 0xFF), nic->vf_id);
 			return -EINVAL;
+		}
 		msleep(sleep);
 		if (nic->pf_acked)
 			break;
 		timeout -= sleep;
 		if (!timeout) {
 			netdev_err(nic->netdev,
-				   "PF didn't ack to mbox msg %d from VF%d\n",
+				   "PF didn't ACK to mbox msg 0x%02x from VF%d\n",
 				   (mbx->msg.msg & 0xFF), nic->vf_id);
 			return -EBUSY;
 		}
-- 
1.7.1

^ permalink raw reply related

* [RFC PATCH net-next 1/8] net: core: trivial netif_receive_skb_list() entry point
From: Edward Cree @ 2016-04-19 13:34 UTC (permalink / raw)
  To: netdev, David Miller; +Cc: Jesper Dangaard Brouer, linux-net-drivers
In-Reply-To: <5716338E.4050003@solarflare.com>

Just calls netif_receive_skb() in a loop.

Signed-off-by: Edward Cree <ecree@solarflare.com>
---
 include/linux/netdevice.h |  1 +
 net/core/dev.c            | 20 ++++++++++++++++++++
 2 files changed, 21 insertions(+)

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index a3bb534..682d0ad 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -3218,6 +3218,7 @@ static inline void dev_consume_skb_any(struct sk_buff *skb)
 int netif_rx(struct sk_buff *skb);
 int netif_rx_ni(struct sk_buff *skb);
 int netif_receive_skb(struct sk_buff *skb);
+void netif_receive_skb_list(struct sk_buff_head *list);
 gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb);
 void napi_gro_flush(struct napi_struct *napi, bool flush_old);
 struct sk_buff *napi_get_frags(struct napi_struct *napi);
diff --git a/net/core/dev.c b/net/core/dev.c
index 52d446b..7abcb1d 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4292,6 +4292,26 @@ int netif_receive_skb(struct sk_buff *skb)
 }
 EXPORT_SYMBOL(netif_receive_skb);
 
+/**
+ *	netif_receive_skb_list - process many receive buffers from network
+ *	@list: list of skbs to process.  Must not be shareable (e.g. it may
+ *	be on the stack)
+ *
+ *	For now, just calls netif_receive_skb() in a loop, ignoring the
+ *	return value.
+ *
+ *	This function may only be called from softirq context and interrupts
+ *	should be enabled.
+ */
+void netif_receive_skb_list(struct sk_buff_head *list)
+{
+	struct sk_buff *skb;
+
+	while ((skb = __skb_dequeue(list)) != NULL)
+		netif_receive_skb(skb);
+}
+EXPORT_SYMBOL(netif_receive_skb_list);
+
 /* Network device is going away, flush any packets still pending
  * Called with irqs disabled.
  */

^ permalink raw reply related

* [RFC PATCH net-next 2/8] sfc: batch up RX delivery on EF10
From: Edward Cree @ 2016-04-19 13:35 UTC (permalink / raw)
  To: netdev, David Miller; +Cc: Jesper Dangaard Brouer, linux-net-drivers
In-Reply-To: <5716338E.4050003@solarflare.com>

Improves packet rate of 1-byte UDP receives by 10%.

Signed-off-by: Edward Cree <ecree@solarflare.com>
---
 drivers/net/ethernet/sfc/ef10.c       | 9 +++++++++
 drivers/net/ethernet/sfc/efx.c        | 2 ++
 drivers/net/ethernet/sfc/net_driver.h | 3 +++
 drivers/net/ethernet/sfc/rx.c         | 7 ++++++-
 4 files changed, 20 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/sfc/ef10.c b/drivers/net/ethernet/sfc/ef10.c
index 98d33d4..e348f8f 100644
--- a/drivers/net/ethernet/sfc/ef10.c
+++ b/drivers/net/ethernet/sfc/ef10.c
@@ -2656,6 +2656,7 @@ static int efx_ef10_ev_process(struct efx_channel *channel, int quota)
 {
 	struct efx_nic *efx = channel->efx;
 	efx_qword_t event, *p_event;
+	struct sk_buff_head rx_list;
 	unsigned int read_ptr;
 	int ev_code;
 	int tx_descs = 0;
@@ -2664,6 +2665,11 @@ static int efx_ef10_ev_process(struct efx_channel *channel, int quota)
 	if (quota <= 0)
 		return spent;
 
+	/* Prepare the batch receive list */
+	EFX_BUG_ON_PARANOID(channel->rx_list != NULL);
+	channel->rx_list = &rx_list;
+	__skb_queue_head_init(channel->rx_list);
+
 	read_ptr = channel->eventq_read_ptr;
 
 	for (;;) {
@@ -2724,6 +2730,9 @@ static int efx_ef10_ev_process(struct efx_channel *channel, int quota)
 	}
 
 out:
+	/* Receive any packets we queued up */
+	netif_receive_skb_list(channel->rx_list);
+	channel->rx_list = NULL;
 	channel->eventq_read_ptr = read_ptr;
 	return spent;
 }
diff --git a/drivers/net/ethernet/sfc/efx.c b/drivers/net/ethernet/sfc/efx.c
index 0705ec86..e004c0b 100644
--- a/drivers/net/ethernet/sfc/efx.c
+++ b/drivers/net/ethernet/sfc/efx.c
@@ -527,6 +527,8 @@ static int efx_probe_channel(struct efx_channel *channel)
 			goto fail;
 	}
 
+	channel->rx_list = NULL;
+
 	return 0;
 
 fail:
diff --git a/drivers/net/ethernet/sfc/net_driver.h b/drivers/net/ethernet/sfc/net_driver.h
index 38c4223..d969c85 100644
--- a/drivers/net/ethernet/sfc/net_driver.h
+++ b/drivers/net/ethernet/sfc/net_driver.h
@@ -418,6 +418,7 @@ enum efx_sync_events_state {
  *	__efx_rx_packet(), or zero if there is none
  * @rx_pkt_index: Ring index of first buffer for next packet to be delivered
  *	by __efx_rx_packet(), if @rx_pkt_n_frags != 0
+ * @rx_list: list of SKBs from current RX, awaiting processing
  * @rx_queue: RX queue for this channel
  * @tx_queue: TX queues for this channel
  * @sync_events_state: Current state of sync events on this channel
@@ -462,6 +463,8 @@ struct efx_channel {
 	unsigned int rx_pkt_n_frags;
 	unsigned int rx_pkt_index;
 
+	struct sk_buff_head *rx_list;
+
 	struct efx_rx_queue rx_queue;
 	struct efx_tx_queue tx_queue[EFX_TXQ_TYPES];
 
diff --git a/drivers/net/ethernet/sfc/rx.c b/drivers/net/ethernet/sfc/rx.c
index 8956995..025e387 100644
--- a/drivers/net/ethernet/sfc/rx.c
+++ b/drivers/net/ethernet/sfc/rx.c
@@ -642,7 +642,12 @@ static void efx_rx_deliver(struct efx_channel *channel, u8 *eh,
 			return;
 
 	/* Pass the packet up */
-	netif_receive_skb(skb);
+	if (channel->rx_list != NULL)
+		/* Add to list, will pass up later */
+		__skb_queue_tail(channel->rx_list, skb);
+	else
+		/* No list, so pass it up now */
+		netif_receive_skb(skb);
 }
 
 /* Handle a received packet.  Second half: Touches packet payload. */

^ permalink raw reply related

* [RFC PATCH net-next 3/8] net: core: unwrap skb list receive slightly further
From: Edward Cree @ 2016-04-19 13:35 UTC (permalink / raw)
  To: netdev, David Miller; +Cc: Jesper Dangaard Brouer, linux-net-drivers
In-Reply-To: <5716338E.4050003@solarflare.com>

Adds iterator skb_queue_for_each() to run over a list without modifying it.

Signed-off-by: Edward Cree <ecree@solarflare.com>
---
 include/linux/skbuff.h     | 16 ++++++++++++++++
 include/trace/events/net.h |  7 +++++++
 net/core/dev.c             |  4 +++-
 3 files changed, 26 insertions(+), 1 deletion(-)

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index da0ace3..2851c38 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -1501,6 +1501,22 @@ static inline struct sk_buff *skb_peek_tail(const struct sk_buff_head *list_)
 }
 
 /**
+ *	skb_queue_for_each - iterate over an skb queue
+ *	@pos:        the &struct sk_buff to use as a loop cursor.
+ *	@head:       the &struct sk_buff_head for your list.
+ *
+ *	The reference count is not incremented and the reference is therefore
+ *	volatile; the list lock is not taken either. Use with caution.
+ *
+ *	The list must not be modified (though the individual skbs can be)
+ *	within the loop body.
+ *
+ *	After loop completion, @pos will be %NULL.
+ */
+#define skb_queue_for_each(pos, head) \
+	for (pos = skb_peek(head); pos != NULL; pos = skb_peek_next(pos, head))
+
+/**
  *	skb_queue_len	- get queue length
  *	@list_: list to measure
  *
diff --git a/include/trace/events/net.h b/include/trace/events/net.h
index 49cc7c3..30f359c 100644
--- a/include/trace/events/net.h
+++ b/include/trace/events/net.h
@@ -222,6 +222,13 @@ DEFINE_EVENT(net_dev_rx_verbose_template, netif_receive_skb_entry,
 	TP_ARGS(skb)
 );
 
+DEFINE_EVENT(net_dev_rx_verbose_template, netif_receive_skb_list_entry,
+
+	TP_PROTO(const struct sk_buff *skb),
+
+	TP_ARGS(skb)
+);
+
 DEFINE_EVENT(net_dev_rx_verbose_template, netif_rx_entry,
 
 	TP_PROTO(const struct sk_buff *skb),
diff --git a/net/core/dev.c b/net/core/dev.c
index 7abcb1d..4bb6724 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4307,8 +4307,10 @@ void netif_receive_skb_list(struct sk_buff_head *list)
 {
 	struct sk_buff *skb;
 
+	skb_queue_for_each(skb, list)
+		trace_netif_receive_skb_list_entry(skb);
 	while ((skb = __skb_dequeue(list)) != NULL)
-		netif_receive_skb(skb);
+		netif_receive_skb_internal(skb);
 }
 EXPORT_SYMBOL(netif_receive_skb_list);
 

^ permalink raw reply related

* [RFC PATCH net-next 4/8] net: core: Another step of skb receive list processing
From: Edward Cree @ 2016-04-19 13:35 UTC (permalink / raw)
  To: netdev, David Miller; +Cc: Jesper Dangaard Brouer, linux-net-drivers
In-Reply-To: <5716338E.4050003@solarflare.com>

netif_receive_skb_list_internal() now processes a list and hands it
on to the next function.

The code duplication is unfortunate, but the common part between the list
and non-list versions of the function takes a lock (rcu_read_lock()), so
factoring it out would be a little ugly.

Signed-off-by: Edward Cree <ecree@solarflare.com>
---
 net/core/dev.c | 50 ++++++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 46 insertions(+), 4 deletions(-)

diff --git a/net/core/dev.c b/net/core/dev.c
index 4bb6724..586807d 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4241,6 +4241,14 @@ static int __netif_receive_skb(struct sk_buff *skb)
 	return ret;
 }
 
+static void __netif_receive_skb_list(struct sk_buff_head *list)
+{
+	struct sk_buff *skb;
+
+	while ((skb = __skb_dequeue(list)) != NULL)
+		__netif_receive_skb(skb);
+}
+
 static int netif_receive_skb_internal(struct sk_buff *skb)
 {
 	int ret;
@@ -4269,6 +4277,41 @@ static int netif_receive_skb_internal(struct sk_buff *skb)
 	return ret;
 }
 
+static void netif_receive_skb_list_internal(struct sk_buff_head *list)
+{
+	struct sk_buff_head sublist;
+	struct sk_buff *skb;
+
+	__skb_queue_head_init(&sublist);
+
+	rcu_read_lock();
+	while ((skb = __skb_dequeue(list)) != NULL) {
+		net_timestamp_check(netdev_tstamp_prequeue, skb);
+		if (skb_defer_rx_timestamp(skb)) {
+			/* Handled, don't add to sublist */
+			continue;
+		}
+
+#ifdef CONFIG_RPS
+		if (static_key_false(&rps_needed)) {
+			struct rps_dev_flow voidflow, *rflow = &voidflow;
+			int cpu = get_rps_cpu(skb->dev, skb, &rflow);
+
+			if (cpu >= 0) {
+				enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
+				/* Handled, don't add to sublist */
+				continue;
+			}
+		}
+#endif
+		__skb_queue_tail(&sublist, skb);
+	}
+
+	__netif_receive_skb_list(&sublist);
+	rcu_read_unlock();
+	return;
+}
+
 /**
  *	netif_receive_skb - process receive buffer from network
  *	@skb: buffer to process
@@ -4297,8 +4340,8 @@ EXPORT_SYMBOL(netif_receive_skb);
  *	@list: list of skbs to process.  Must not be shareable (e.g. it may
  *	be on the stack)
  *
- *	For now, just calls netif_receive_skb() in a loop, ignoring the
- *	return value.
+ *	Since return value of netif_receive_skb() is normally ignored, and
+ *	wouldn't be meaningful for a list, this function returns void.
  *
  *	This function may only be called from softirq context and interrupts
  *	should be enabled.
@@ -4309,8 +4352,7 @@ void netif_receive_skb_list(struct sk_buff_head *list)
 
 	skb_queue_for_each(skb, list)
 		trace_netif_receive_skb_list_entry(skb);
-	while ((skb = __skb_dequeue(list)) != NULL)
-		netif_receive_skb_internal(skb);
+	netif_receive_skb_list_internal(list);
 }
 EXPORT_SYMBOL(netif_receive_skb_list);
 

^ permalink raw reply related

* [RFC PATCH net-next 5/8] net: core: another layer of lists, around PF_MEMALLOC skb handling
From: Edward Cree @ 2016-04-19 13:36 UTC (permalink / raw)
  To: netdev, David Miller; +Cc: Jesper Dangaard Brouer, linux-net-drivers
In-Reply-To: <5716338E.4050003@solarflare.com>

First example of a layer splitting the list (rather than merely taking
individual packets off it).

Again, trying to factor the common parts wouldn't make this any nicer.

Signed-off-by: Edward Cree <ecree@solarflare.com>
---
 net/core/dev.c | 36 ++++++++++++++++++++++++++++++++++--
 1 file changed, 34 insertions(+), 2 deletions(-)

diff --git a/net/core/dev.c b/net/core/dev.c
index 586807d..0f914bf 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4216,6 +4216,14 @@ out:
 	return ret;
 }
 
+static void __netif_receive_skb_list_core(struct sk_buff_head *list, bool pfmemalloc)
+{
+	struct sk_buff *skb;
+
+	while ((skb = __skb_dequeue(list)) != NULL)
+		__netif_receive_skb_core(skb, pfmemalloc);
+}
+
 static int __netif_receive_skb(struct sk_buff *skb)
 {
 	int ret;
@@ -4243,10 +4251,34 @@ static int __netif_receive_skb(struct sk_buff *skb)
 
 static void __netif_receive_skb_list(struct sk_buff_head *list)
 {
+	struct sk_buff_head sublist;
+	bool pfmemalloc = false; /* Is current sublist PF_MEMALLOC? */
+	unsigned long pflags;
 	struct sk_buff *skb;
 
-	while ((skb = __skb_dequeue(list)) != NULL)
-		__netif_receive_skb(skb);
+	__skb_queue_head_init(&sublist);
+
+	while ((skb = __skb_dequeue(list)) != NULL) {
+		if ((sk_memalloc_socks() && skb_pfmemalloc(skb)) != pfmemalloc) {
+			/* Handle the previous sublist */
+			__netif_receive_skb_list_core(&sublist, pfmemalloc);
+			pfmemalloc = !pfmemalloc;
+			/* See comments in __netif_receive_skb */
+			if (pfmemalloc) {
+				pflags = current->flags;
+				current->flags |= PF_MEMALLOC;
+			} else {
+				tsk_restore_flags(current, pflags, PF_MEMALLOC);
+			}
+			__skb_queue_head_init(&sublist);
+		}
+		__skb_queue_tail(&sublist, skb);
+	}
+	/* Handle the last sublist */
+	__netif_receive_skb_list_core(&sublist, pfmemalloc);
+	/* Restore pflags */
+	if (pfmemalloc)
+		tsk_restore_flags(current, pflags, PF_MEMALLOC);
 }
 
 static int netif_receive_skb_internal(struct sk_buff *skb)

^ permalink raw reply related

* [RFC PATCH net-next 6/8] net: core: propagate SKB lists through packet_type lookup
From: Edward Cree @ 2016-04-19 13:36 UTC (permalink / raw)
  To: netdev, David Miller; +Cc: Jesper Dangaard Brouer, linux-net-drivers
In-Reply-To: <5716338E.4050003@solarflare.com>

This could maybe be made more efficient if we first split the list based on
 skb->protocol, and then did ptype lookup for each sublist.  Unfortunately,
 there are things liks sch_handle_ingress and the rx_handlers that can
 produce different results per packet.

Signed-off-by: Edward Cree <ecree@solarflare.com>
---
 include/trace/events/net.h |   7 +++
 net/core/dev.c             | 146 ++++++++++++++++++++++++++++++++-------------
 2 files changed, 113 insertions(+), 40 deletions(-)

diff --git a/include/trace/events/net.h b/include/trace/events/net.h
index 30f359c..7a17a31 100644
--- a/include/trace/events/net.h
+++ b/include/trace/events/net.h
@@ -130,6 +130,13 @@ DEFINE_EVENT(net_dev_template, netif_receive_skb,
 	TP_ARGS(skb)
 );
 
+DEFINE_EVENT(net_dev_template, netif_receive_skb_list,
+
+	TP_PROTO(struct sk_buff *skb),
+
+	TP_ARGS(skb)
+);
+
 DEFINE_EVENT(net_dev_template, netif_rx,
 
 	TP_PROTO(struct sk_buff *skb),
diff --git a/net/core/dev.c b/net/core/dev.c
index 0f914bf..db1d16a 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4061,12 +4061,13 @@ static inline int nf_ingress(struct sk_buff *skb, struct packet_type **pt_prev,
 	return 0;
 }
 
-static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc)
+static int __netif_receive_skb_taps(struct sk_buff *skb, bool pfmemalloc,
+				    struct packet_type **pt_prev)
 {
-	struct packet_type *ptype, *pt_prev;
 	rx_handler_func_t *rx_handler;
 	struct net_device *orig_dev;
 	bool deliver_exact = false;
+	struct packet_type *ptype;
 	int ret = NET_RX_DROP;
 	__be16 type;
 
@@ -4081,7 +4082,7 @@ static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc)
 		skb_reset_transport_header(skb);
 	skb_reset_mac_len(skb);
 
-	pt_prev = NULL;
+	*pt_prev = NULL;
 
 another_round:
 	skb->skb_iif = skb->dev->ifindex;
@@ -4106,25 +4107,25 @@ another_round:
 		goto skip_taps;
 
 	list_for_each_entry_rcu(ptype, &ptype_all, list) {
-		if (pt_prev)
-			ret = deliver_skb(skb, pt_prev, orig_dev);
-		pt_prev = ptype;
+		if (*pt_prev)
+			ret = deliver_skb(skb, *pt_prev, orig_dev);
+		*pt_prev = ptype;
 	}
 
 	list_for_each_entry_rcu(ptype, &skb->dev->ptype_all, list) {
-		if (pt_prev)
-			ret = deliver_skb(skb, pt_prev, orig_dev);
-		pt_prev = ptype;
+		if (*pt_prev)
+			ret = deliver_skb(skb, *pt_prev, orig_dev);
+		*pt_prev = ptype;
 	}
 
 skip_taps:
 #ifdef CONFIG_NET_INGRESS
 	if (static_key_false(&ingress_needed)) {
-		skb = sch_handle_ingress(skb, &pt_prev, &ret, orig_dev);
+		skb = sch_handle_ingress(skb, pt_prev, &ret, orig_dev);
 		if (!skb)
 			goto out;
 
-		if (nf_ingress(skb, &pt_prev, &ret, orig_dev) < 0)
+		if (nf_ingress(skb, pt_prev, &ret, orig_dev) < 0)
 			goto out;
 	}
 #endif
@@ -4136,9 +4137,9 @@ ncls:
 		goto drop;
 
 	if (skb_vlan_tag_present(skb)) {
-		if (pt_prev) {
-			ret = deliver_skb(skb, pt_prev, orig_dev);
-			pt_prev = NULL;
+		if (*pt_prev) {
+			ret = deliver_skb(skb, *pt_prev, orig_dev);
+			*pt_prev = NULL;
 		}
 		if (vlan_do_receive(&skb))
 			goto another_round;
@@ -4148,9 +4149,9 @@ ncls:
 
 	rx_handler = rcu_dereference(skb->dev->rx_handler);
 	if (rx_handler) {
-		if (pt_prev) {
-			ret = deliver_skb(skb, pt_prev, orig_dev);
-			pt_prev = NULL;
+		if (*pt_prev) {
+			ret = deliver_skb(skb, *pt_prev, orig_dev);
+			*pt_prev = NULL;
 		}
 		switch (rx_handler(&skb)) {
 		case RX_HANDLER_CONSUMED:
@@ -4181,47 +4182,112 @@ ncls:
 
 	/* deliver only exact match when indicated */
 	if (likely(!deliver_exact)) {
-		deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
+		deliver_ptype_list_skb(skb, pt_prev, orig_dev, type,
 				       &ptype_base[ntohs(type) &
 						   PTYPE_HASH_MASK]);
 	}
 
-	deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
+	deliver_ptype_list_skb(skb, pt_prev, orig_dev, type,
 			       &orig_dev->ptype_specific);
 
 	if (unlikely(skb->dev != orig_dev)) {
-		deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
+		deliver_ptype_list_skb(skb, pt_prev, orig_dev, type,
 				       &skb->dev->ptype_specific);
 	}
-
-	if (pt_prev) {
-		if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
-			goto drop;
-		else
-			ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
-	} else {
+	if (*pt_prev && unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
+		goto drop;
+	return ret;
 drop:
-		if (!deliver_exact)
-			atomic_long_inc(&skb->dev->rx_dropped);
-		else
-			atomic_long_inc(&skb->dev->rx_nohandler);
-		kfree_skb(skb);
-		/* Jamal, now you will not able to escape explaining
-		 * me how you were going to use this. :-)
-		 */
-		ret = NET_RX_DROP;
-	}
-
+	if (!deliver_exact)
+		atomic_long_inc(&skb->dev->rx_dropped);
+	else
+		atomic_long_inc(&skb->dev->rx_nohandler);
+	kfree_skb(skb);
+	/* Jamal, now you will not able to escape explaining
+	 * me how you were going to use this. :-)
+	 */
+	ret = NET_RX_DROP;
 out:
+	*pt_prev = NULL;
 	return ret;
 }
 
-static void __netif_receive_skb_list_core(struct sk_buff_head *list, bool pfmemalloc)
+static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc)
+{
+	struct net_device *orig_dev = skb->dev;
+	struct packet_type *pt_prev;
+	int ret;
+
+	ret = __netif_receive_skb_taps(skb, pfmemalloc, &pt_prev);
+	if (pt_prev)
+		ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
+	return ret;
+}
+
+static inline void __netif_receive_skb_list_ptype(struct sk_buff_head *list,
+						  struct packet_type *pt_prev,
+						  struct net_device *orig_dev)
 {
 	struct sk_buff *skb;
 
 	while ((skb = __skb_dequeue(list)) != NULL)
-		__netif_receive_skb_core(skb, pfmemalloc);
+		pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
+}
+
+static void __netif_receive_skb_list_core(struct sk_buff_head *list, bool pfmemalloc)
+{
+	/* Fast-path assumptions:
+	 * - There is no RX handler.
+	 * - Only one packet_type matches.
+	 * If either of these fails, we will end up doing some per-packet
+	 * processing in-line, then handling the 'last ptype' for the whole
+	 * sublist.  This can't cause out-of-order delivery to any single ptype,
+	 * because the 'last ptype' must be constant across the sublist, and all
+	 * other ptypes are handled per-packet.  Unless, that is, a ptype can
+	 * be delivered to more than once for a single packet - but that seems
+	 * like it would be a bad idea anyway.
+	 * So it should be fine (at least, I think so), but you'll lose the
+	 * (putative) performance benefits of batching.
+	 */
+	/* Current (common) ptype of sublist */
+	struct packet_type *pt_curr = NULL;
+	/* In the normal (device RX) case, orig_dev should be the same for
+	 * every skb in the list.  But as I'm not certain of this, I check
+	 * it's constant and split the list if not.
+	 * So, od_curr is the current (common) orig_dev of sublist.
+	 */
+	struct net_device *od_curr = NULL;
+	struct sk_buff_head sublist;
+	struct sk_buff *skb;
+
+	__skb_queue_head_init(&sublist);
+
+	while ((skb = __skb_dequeue(list)) != NULL) {
+		struct packet_type *pt_prev;
+		struct net_device *orig_dev = skb->dev;
+
+		__netif_receive_skb_taps(skb, pfmemalloc, &pt_prev);
+		if (pt_prev) {
+			if (skb_queue_empty(&sublist)) {
+				pt_curr = pt_prev;
+				od_curr = orig_dev;
+			} else if (!(pt_curr == pt_prev &&
+				     od_curr == orig_dev)) {
+				/* dispatch old sublist */
+				__netif_receive_skb_list_ptype(&sublist,
+							       pt_curr,
+							       od_curr);
+				/* start new sublist */
+				__skb_queue_head_init(&sublist);
+				pt_curr = pt_prev;
+				od_curr = orig_dev;
+			}
+			__skb_queue_tail(&sublist, skb);
+		}
+	}
+
+	/* dispatch final sublist */
+	__netif_receive_skb_list_ptype(&sublist, pt_curr, od_curr);
 }
 
 static int __netif_receive_skb(struct sk_buff *skb)

^ permalink raw reply related

* [RFC PATCH net-next 7/8] net: ipv4: listified version of ip_rcv
From: Edward Cree @ 2016-04-19 13:37 UTC (permalink / raw)
  To: netdev, David Miller; +Cc: Jesper Dangaard Brouer, linux-net-drivers
In-Reply-To: <5716338E.4050003@solarflare.com>

Also involved adding a way to run a netfilter hook over a list of packets.
Rather than attempting to make netfilter know about lists (which would be
horrendous) we just let it call the regular okfn (in this case
ip_rcv_finish()) for any packets it steals, and have it give us back a list
of packets it's synchronously accepted (which normally NF_HOOK would
automatically call okfn() on, but we want to be able to potentially pass
the list to a listified version of okfn().)

There is potential for out-of-order receives if the netfilter hook ends up
synchronously stealing packets, as they will be processed before any accepts
earlier in the list.  However, it was already possible for an asynchronous
accept to cause out-of-order receives, so hopefully I haven't broken
anything that wasn't broken already.

Signed-off-by: Edward Cree <ecree@solarflare.com>
---
 include/linux/netdevice.h |  3 ++
 include/linux/netfilter.h | 27 +++++++++++++++++
 include/net/ip.h          |  2 ++
 net/core/dev.c            | 11 +++++--
 net/ipv4/af_inet.c        |  1 +
 net/ipv4/ip_input.c       | 75 ++++++++++++++++++++++++++++++++++++++++++-----
 6 files changed, 110 insertions(+), 9 deletions(-)

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 682d0ad..292f2d5 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -2143,6 +2143,9 @@ struct packet_type {
 					 struct net_device *,
 					 struct packet_type *,
 					 struct net_device *);
+	void			(*list_func) (struct sk_buff_head *,
+					      struct packet_type *,
+					      struct net_device *);
 	bool			(*id_match)(struct packet_type *ptype,
 					    struct sock *sk);
 	void			*af_packet_priv;
diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h
index 9230f9a..e18e91b 100644
--- a/include/linux/netfilter.h
+++ b/include/linux/netfilter.h
@@ -220,6 +220,24 @@ NF_HOOK_THRESH(uint8_t pf, unsigned int hook, struct net *net, struct sock *sk,
 	return ret;
 }
 
+static inline void
+NF_HOOK_LIST_THRESH(uint8_t pf, unsigned int hook, struct net *net, struct sock *sk,
+		    struct sk_buff_head *list, struct sk_buff_head *sublist,
+		    struct net_device *in, struct net_device *out,
+		    int (*okfn)(struct net *, struct sock *, struct sk_buff *),
+		    int thresh)
+{
+	struct sk_buff *skb;
+
+	__skb_queue_head_init(sublist); /* list of synchronously ACCEPTed skbs */
+	while ((skb = __skb_dequeue(list)) != NULL) {
+		int ret = nf_hook_thresh(pf, hook, net, sk, skb, in, out, okfn,
+					 thresh);
+		if (ret == 1)
+			__skb_queue_tail(sublist, skb);
+	}
+}
+
 static inline int
 NF_HOOK_COND(uint8_t pf, unsigned int hook, struct net *net, struct sock *sk,
 	     struct sk_buff *skb, struct net_device *in, struct net_device *out,
@@ -242,6 +260,15 @@ NF_HOOK(uint8_t pf, unsigned int hook, struct net *net, struct sock *sk, struct
 	return NF_HOOK_THRESH(pf, hook, net, sk, skb, in, out, okfn, INT_MIN);
 }
 
+static inline void
+NF_HOOK_LIST(uint8_t pf, unsigned int hook, struct net *net, struct sock *sk,
+	     struct sk_buff_head *list, struct sk_buff_head *sublist,
+	     struct net_device *in, struct net_device *out,
+	     int (*okfn)(struct net *, struct sock *, struct sk_buff *))
+{
+	NF_HOOK_LIST_THRESH(pf, hook, net, sk, list, sublist, in, out, okfn, INT_MIN);
+}
+
 /* Call setsockopt() */
 int nf_setsockopt(struct sock *sk, u_int8_t pf, int optval, char __user *opt,
 		  unsigned int len);
diff --git a/include/net/ip.h b/include/net/ip.h
index 93725e5..c994c44 100644
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -106,6 +106,8 @@ int ip_build_and_send_pkt(struct sk_buff *skb, const struct sock *sk,
 			  struct ip_options_rcu *opt);
 int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt,
 	   struct net_device *orig_dev);
+void ip_list_rcv(struct sk_buff_head *list, struct packet_type *pt,
+		 struct net_device *orig_dev);
 int ip_local_deliver(struct sk_buff *skb);
 int ip_mr_input(struct sk_buff *skb);
 int ip_output(struct net *net, struct sock *sk, struct sk_buff *skb);
diff --git a/net/core/dev.c b/net/core/dev.c
index db1d16a..da768e2 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4230,8 +4230,15 @@ static inline void __netif_receive_skb_list_ptype(struct sk_buff_head *list,
 {
 	struct sk_buff *skb;
 
-	while ((skb = __skb_dequeue(list)) != NULL)
-		pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
+	if (!pt_prev)
+		return;
+	if (skb_queue_empty(list))
+		return;
+	if (pt_prev->list_func != NULL)
+		pt_prev->list_func(list, pt_prev, orig_dev);
+	else
+		while ((skb = __skb_dequeue(list)) != NULL)
+			pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
 }
 
 static void __netif_receive_skb_list_core(struct sk_buff_head *list, bool pfmemalloc)
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 2e6e65f..1424147 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -1757,6 +1757,7 @@ fs_initcall(ipv4_offload_init);
 static struct packet_type ip_packet_type __read_mostly = {
 	.type = cpu_to_be16(ETH_P_IP),
 	.func = ip_rcv,
+	.list_func = ip_list_rcv,
 };
 
 static int __init inet_init(void)
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index e3d7827..e7d0d85 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -395,10 +395,9 @@ drop:
 /*
  * 	Main IP Receive routine.
  */
-int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev)
+static struct sk_buff *ip_rcv_core(struct sk_buff *skb, struct net *net)
 {
 	const struct iphdr *iph;
-	struct net *net;
 	u32 len;
 
 	/* When the interface is in promisc. mode, drop all the crap
@@ -408,7 +407,6 @@ int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt,
 		goto drop;
 
 
-	net = dev_net(dev);
 	IP_UPD_PO_STATS_BH(net, IPSTATS_MIB_IN, skb->len);
 
 	skb = skb_share_check(skb, GFP_ATOMIC);
@@ -475,9 +473,7 @@ int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt,
 	/* Must drop socket now because of tproxy. */
 	skb_orphan(skb);
 
-	return NF_HOOK(NFPROTO_IPV4, NF_INET_PRE_ROUTING,
-		       net, NULL, skb, dev, NULL,
-		       ip_rcv_finish);
+	return skb;
 
 csum_error:
 	IP_INC_STATS_BH(net, IPSTATS_MIB_CSUMERRORS);
@@ -486,5 +482,70 @@ inhdr_error:
 drop:
 	kfree_skb(skb);
 out:
-	return NET_RX_DROP;
+	return NULL;
+}
+
+/*
+ * IP receive entry point
+ */
+int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt,
+	   struct net_device *orig_dev)
+{
+	struct net *net = dev_net(dev);
+
+	skb = ip_rcv_core(skb, net);
+	if (skb == NULL)
+		return NET_RX_DROP;
+	return NF_HOOK(NFPROTO_IPV4, NF_INET_PRE_ROUTING,
+		       net, NULL, skb, dev, NULL,
+		       ip_rcv_finish);
+}
+
+static void ip_sublist_rcv(struct sk_buff_head *list, struct net_device *dev,
+			   struct net *net)
+{
+	struct sk_buff_head sublist;
+	struct sk_buff *skb;
+
+	NF_HOOK_LIST(NFPROTO_IPV4, NF_INET_PRE_ROUTING, net, NULL,
+		     list, &sublist, dev, NULL, ip_rcv_finish);
+	while ((skb = __skb_dequeue(&sublist)) != NULL)
+		ip_rcv_finish(net, NULL, skb);
+}
+
+/* Receive a list of IP packets */
+void ip_list_rcv(struct sk_buff_head *list, struct packet_type *pt,
+		 struct net_device *orig_dev)
+{
+	struct net_device *curr_dev = NULL;
+	struct net *curr_net = NULL;
+	struct sk_buff_head sublist;
+	struct sk_buff *skb;
+
+	__skb_queue_head_init(&sublist);
+
+	while ((skb = __skb_dequeue(list)) != NULL) {
+		struct net_device *dev = skb->dev;
+		struct net *net = dev_net(dev);
+
+		skb = ip_rcv_core(skb, net);
+		if (skb == NULL)
+			continue;
+
+		if (skb_queue_empty(&sublist)) {
+			curr_dev = dev;
+			curr_net = net;
+		} else if (curr_dev != dev || curr_net != net) {
+			/* dispatch old sublist */
+			ip_sublist_rcv(&sublist, dev, net);
+			/* start new sublist */
+			__skb_queue_head_init(&sublist);
+			curr_dev = dev;
+			curr_net = net;
+		}
+		/* add to current sublist */
+		__skb_queue_tail(&sublist, skb);
+	}
+	/* dispatch final sublist */
+	ip_sublist_rcv(&sublist, curr_dev, curr_net);
 }

^ permalink raw reply related

* [RFC PATCH net-next 8/8] net: ipv4: listify ip_rcv_finish
From: Edward Cree @ 2016-04-19 13:37 UTC (permalink / raw)
  To: netdev, David Miller; +Cc: Jesper Dangaard Brouer, linux-net-drivers
In-Reply-To: <5716338E.4050003@solarflare.com>

Signed-off-by: Edward Cree <ecree@solarflare.com>
---
 net/ipv4/ip_input.c | 58 ++++++++++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 53 insertions(+), 5 deletions(-)

diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index e7d0d85..5bbc409 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -308,7 +308,8 @@ drop:
 	return true;
 }
 
-static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
+static int ip_rcv_finish_core(struct net *net, struct sock *sk,
+			      struct sk_buff *skb)
 {
 	const struct iphdr *iph = ip_hdr(skb);
 	struct rtable *rt;
@@ -385,13 +386,22 @@ static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
 			goto drop;
 	}
 
-	return dst_input(skb);
+	return NET_RX_SUCCESS;
 
 drop:
 	kfree_skb(skb);
 	return NET_RX_DROP;
 }
 
+static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
+{
+	int ret = ip_rcv_finish_core(net, sk, skb);
+
+	if (ret != NET_RX_DROP)
+		ret = dst_input(skb);
+	return ret;
+}
+
 /*
  * 	Main IP Receive routine.
  */
@@ -501,16 +511,54 @@ int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt,
 		       ip_rcv_finish);
 }
 
+static void ip_sublist_rcv_finish(struct sk_buff_head *list)
+{
+	struct sk_buff *skb;
+
+	while ((skb = __skb_dequeue(list)) != NULL)
+		dst_input(skb);
+}
+
+static void ip_list_rcv_finish(struct net *net, struct sock *sk,
+			       struct sk_buff_head *list)
+{
+	struct dst_entry *curr_dst = NULL;
+	struct sk_buff_head sublist;
+	struct sk_buff *skb;
+
+	__skb_queue_head_init(&sublist);
+
+	while ((skb = __skb_dequeue(list)) != NULL) {
+		struct dst_entry *dst;
+
+		if (ip_rcv_finish_core(net, sk, skb) == NET_RX_DROP)
+			continue;
+
+		dst = skb_dst(skb);
+		if (skb_queue_empty(&sublist)) {
+			curr_dst = dst;
+		} else if (curr_dst != dst) {
+			/* dispatch old sublist */
+			ip_sublist_rcv_finish(&sublist);
+			/* start new sublist */
+			__skb_queue_head_init(&sublist);
+			curr_dst = dst;
+		}
+		/* add to current sublist */
+		__skb_queue_tail(&sublist, skb);
+	}
+	/* dispatch final sublist */
+	ip_sublist_rcv_finish(&sublist);
+}
+
 static void ip_sublist_rcv(struct sk_buff_head *list, struct net_device *dev,
 			   struct net *net)
 {
 	struct sk_buff_head sublist;
-	struct sk_buff *skb;
 
 	NF_HOOK_LIST(NFPROTO_IPV4, NF_INET_PRE_ROUTING, net, NULL,
 		     list, &sublist, dev, NULL, ip_rcv_finish);
-	while ((skb = __skb_dequeue(&sublist)) != NULL)
-		ip_rcv_finish(net, NULL, skb);
+	ip_list_rcv_finish(net, NULL, &sublist);
 }
 
 /* Receive a list of IP packets */

^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox