Netdev List

Netdev List
 help / color / mirror / Atom feed

* [net-next 01/15] i40e: Introduce VF port representor/control netdevs
From: Jeff Kirsher @ 2016-09-21  3:43 UTC (permalink / raw)
  To: davem
  Cc: Sridhar Samudrala, netdev, nhorman, sassmann, jogreene,
	guru.anbalagane, Jeff Kirsher
In-Reply-To: <1474429432-102772-1-git-send-email-jeffrey.t.kirsher@intel.com>

From: Sridhar Samudrala <sridhar.samudrala@intel.com>

This patch enables creation of a VF Port representor/Control netdev
associated with each VF. These netdevs can be used to control and configure
VFs from the PF's namespace. They enable exposing VF statistics, configuring
link state, mtu, fdb/vlan entries etc.

    # echo 2 > /sys/class/net/enp5s0f0/device/sriov_numvfs
    # ip l show
    297: enp5s0f0: <BROADCAST,MULTICAST> mtu 1500 qdisc noop portid 6805ca2e7268 state DOWN mode DEFAULT group default qlen 1000
    link/ether 68:05:ca:2e:72:68 brd ff:ff:ff:ff:ff:ff
    vf 0 MAC 00:00:00:00:00:00, spoof checking on, link-state auto, trust off
    vf 1 MAC 00:00:00:00:00:00, spoof checking on, link-state auto, trust off
    299: enp5s0f0-vf0: <BROADCAST,MULTICAST> mtu 1500 qdisc noop state DOWN mode DEFAULT group default qlen 1000
    link/ether 00:00:00:00:00:00 brd ff:ff:ff:ff:ff:ff
    300: enp5s0f0-vf1: <BROADCAST,MULTICAST> mtu 1500 qdisc noop state DOWN mode DEFAULT group default qlen 1000
    link/ether 00:00:00:00:00:00 brd ff:ff:ff:ff:ff:ff

Signed-off-by: Sridhar Samudrala <sridhar.samudrala@intel.com>
Tested-by: Andrew Bowers <andrewx.bowers@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c | 88 ++++++++++++++++++++++
 drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.h | 14 ++++
 2 files changed, 102 insertions(+)

diff --git a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c
index da34235..11f6970 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c
@@ -1003,6 +1003,90 @@ complete_reset:
 	clear_bit(__I40E_VF_DISABLE, &pf->state);
 }
 
+static int i40e_vf_netdev_open(struct net_device *dev)
+{
+	return 0;
+}
+
+static int i40e_vf_netdev_stop(struct net_device *dev)
+{
+	return 0;
+}
+
+static const struct net_device_ops i40e_vf_netdev_ops = {
+	.ndo_open = i40e_vf_netdev_open,
+	.ndo_stop = i40e_vf_netdev_stop,
+};
+
+/**
+ * i40e_alloc_vf_netdev
+ * @vf: pointer to the VF structure
+ * @vf_num: VF number
+ *
+ * Create VF representor/control netdev
+ **/
+int i40e_alloc_vf_netdev(struct i40e_vf *vf, u16 vf_num)
+{
+	struct i40e_pf *pf = vf->pf;
+	struct i40e_vsi *vsi = pf->vsi[pf->lan_vsi];
+	struct i40e_vf_netdev_priv *priv;
+	char netdev_name[IFNAMSIZ];
+	struct net_device *netdev;
+	int err;
+
+	snprintf(netdev_name, IFNAMSIZ, "%s-vf%d", vsi->netdev->name, vf_num);
+	netdev = alloc_netdev(sizeof(struct i40e_vf_netdev_priv), netdev_name,
+			      NET_NAME_UNKNOWN, ether_setup);
+	if (!netdev) {
+		dev_err(&pf->pdev->dev, "alloc_netdev failed for vf:%d\n",
+			vf_num);
+		return -ENOMEM;
+	}
+
+	pf->vf[vf_num].ctrl_netdev = netdev;
+
+	priv = netdev_priv(netdev);
+	priv->vf = &(pf->vf[vf_num]);
+
+	netdev->netdev_ops = &i40e_vf_netdev_ops;
+
+	netif_carrier_off(netdev);
+	netif_tx_disable(netdev);
+
+	err = register_netdev(netdev);
+	if (err) {
+		dev_err(&pf->pdev->dev, "register_netdev failed for vf: %s\n",
+			vf->ctrl_netdev->name);
+		free_netdev(netdev);
+		return err;
+	}
+
+	dev_info(&pf->pdev->dev, "VF representor(%s) created for VF %d\n",
+		 vf->ctrl_netdev->name, vf_num);
+
+	return 0;
+}
+
+/**
+ * i40e_free_vf_netdev
+ * @vf: pointer to the VF structure
+ *
+ * Free VF representor/control netdev
+ **/
+void i40e_free_vf_netdev(struct i40e_vf *vf)
+{
+	struct i40e_pf *pf = vf->pf;
+
+	if (!vf->ctrl_netdev)
+		return;
+
+	dev_info(&pf->pdev->dev, "Freeing VF representor(%s)\n",
+		 vf->ctrl_netdev->name);
+
+	unregister_netdev(vf->ctrl_netdev);
+	free_netdev(vf->ctrl_netdev);
+}
+
 /**
  * i40e_free_vfs
  * @pf: pointer to the PF structure
@@ -1045,6 +1129,8 @@ void i40e_free_vfs(struct i40e_pf *pf)
 			i40e_free_vf_res(&pf->vf[i]);
 		/* disable qp mappings */
 		i40e_disable_vf_mappings(&pf->vf[i]);
+
+		i40e_free_vf_netdev(&pf->vf[i]);
 	}
 
 	kfree(pf->vf);
@@ -1112,6 +1198,8 @@ int i40e_alloc_vfs(struct i40e_pf *pf, u16 num_alloc_vfs)
 		/* VF resources get allocated during reset */
 		i40e_reset_vf(&vfs[i], false);
 
+		i40e_alloc_vf_netdev(&vfs[i], i);
+
 	}
 	pf->num_alloc_vfs = num_alloc_vfs;
 
diff --git a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.h b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.h
index 8751741..1d54b95 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.h
+++ b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.h
@@ -72,10 +72,21 @@ enum i40e_vf_capabilities {
 	I40E_VIRTCHNL_VF_CAP_IWARP,
 };
 
+/* VF Ctrl netdev private structure */
+struct i40e_vf_netdev_priv {
+	struct i40e_vf *vf;
+};
+
 /* VF information structure */
 struct i40e_vf {
 	struct i40e_pf *pf;
 
+	/* VF Port representor netdev that allows control and configuration
+	 * of VFs from the host. Enables returning VF stats, configuring link
+	 * state, mtu, fdb/vlans etc.
+	 */
+	struct net_device *ctrl_netdev;
+
 	/* VF id in the PF space */
 	s16 vf_id;
 	/* all VF vsis connect to the same parent */
@@ -142,4 +153,7 @@ int i40e_ndo_set_vf_spoofchk(struct net_device *netdev, int vf_id, bool enable);
 void i40e_vc_notify_link_state(struct i40e_pf *pf);
 void i40e_vc_notify_reset(struct i40e_pf *pf);
 
+int i40e_alloc_vf_netdev(struct i40e_vf *vf, u16 vf_num);
+void i40e_free_vf_netdev(struct i40e_vf *vf);
+
 #endif /* _I40E_VIRTCHNL_PF_H_ */
-- 
2.7.4

^ permalink raw reply related

* [net-next 02/15] i40e: Enable VF specific ethtool statistics via VF Port representor netdevs
From: Jeff Kirsher @ 2016-09-21  3:43 UTC (permalink / raw)
  To: davem
  Cc: Sridhar Samudrala, netdev, nhorman, sassmann, jogreene,
	guru.anbalagane, Jeff Kirsher
In-Reply-To: <1474429432-102772-1-git-send-email-jeffrey.t.kirsher@intel.com>

From: Sridhar Samudrala <sridhar.samudrala@intel.com>

Sample script that shows ethtool stats on VF representor netdev
PF: enp5s0f0, VF0: enp5s2  VF_REP0: enp5s0f0-vf0

   # echo 2 > /sys/class/net/enp5s0f0/device/sriov_numvfs
   # ip link set enp5s2 up
   # ethtool -S enp5s0f0-vf0
   NIC statistics:
     tx_bytes: 0
     tx_unicast: 0
     tx_multicast: 0
     tx_broadcast: 0
     tx_discards: 0
     tx_errors: 0
     rx_bytes: 140
     rx_unicast: 0
     rx_multicast: 2
     rx_broadcast: 0
     rx_discards: 0
     rx_unknown_protocol: 0

Signed-off-by: Sridhar Samudrala <sridhar.samudrala@intel.com>
Tested-by: Andrew Bowers <andrewx.bowers@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ethernet/intel/i40e/i40e.h             |  1 +
 drivers/net/ethernet/intel/i40e/i40e_ethtool.c     | 72 ++++++++++++++++++++++
 drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c |  1 +
 3 files changed, 74 insertions(+)

diff --git a/drivers/net/ethernet/intel/i40e/i40e.h b/drivers/net/ethernet/intel/i40e/i40e.h
index 19103a6..13b1f75 100644
--- a/drivers/net/ethernet/intel/i40e/i40e.h
+++ b/drivers/net/ethernet/intel/i40e/i40e.h
@@ -866,4 +866,5 @@ i40e_status i40e_get_npar_bw_setting(struct i40e_pf *pf);
 i40e_status i40e_set_npar_bw_setting(struct i40e_pf *pf);
 i40e_status i40e_commit_npar_bw_setting(struct i40e_pf *pf);
 void i40e_print_link_message(struct i40e_vsi *vsi, bool isup);
+void i40e_set_vf_netdev_ethtool_ops(struct net_device *netdev);
 #endif /* _I40E_H_ */
diff --git a/drivers/net/ethernet/intel/i40e/i40e_ethtool.c b/drivers/net/ethernet/intel/i40e/i40e_ethtool.c
index 1835186..1f3bbb05 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_ethtool.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_ethtool.c
@@ -3116,3 +3116,75 @@ void i40e_set_ethtool_ops(struct net_device *netdev)
 {
 	netdev->ethtool_ops = &i40e_ethtool_ops;
 }
+
+/* As the VF Port representor(VFPR) represents the switch port corresponding
+ * to a VF, the tx_ and rx_ strings are swapped to indicate that the frames
+ * transmitted from VF are received on VFPR and the frames received on VF are
+ * transmitted from VFPR.
+ */
+static const char i40e_vf_netdev_ethtool_sset[][ETH_GSTRING_LEN] = {
+	"tx_bytes",
+	"tx_unicast",
+	"tx_multicast",
+	"tx_broadcast",
+	"tx_discards",
+	"tx_errors",
+	"rx_bytes",
+	"rx_unicast",
+	"rx_multicast",
+	"rx_broadcast",
+	"rx_discards",
+	"rx_unknown_protocol",
+};
+
+#define I40E_VF_NETDEV_ETHTOOL_STAT_COUNT \
+			ARRAY_SIZE(i40e_vf_netdev_ethtool_sset)
+
+static void i40e_vf_netdev_ethtool_get_strings(struct net_device *dev,
+					       u32 stringset,
+					       u8 *ethtool_strings)
+{
+	switch (stringset) {
+	case ETH_SS_STATS:
+		memcpy(ethtool_strings, &i40e_vf_netdev_ethtool_sset,
+		       sizeof(i40e_vf_netdev_ethtool_sset));
+		break;
+	}
+}
+
+static int i40e_vf_netdev_ethtool_get_sset_count(struct net_device *dev,
+						 int stringset)
+{
+	switch (stringset) {
+	case ETH_SS_STATS:
+		return I40E_VF_NETDEV_ETHTOOL_STAT_COUNT;
+	default:
+		return -EOPNOTSUPP;
+	}
+}
+
+static void i40e_vf_netdev_ethtool_get_stats(struct net_device *dev,
+				struct ethtool_stats *target_ethtool_stats,
+				u64 *target_stat_values)
+{
+	struct i40e_vf_netdev_priv *priv = netdev_priv(dev);
+	struct i40e_vf *vf = priv->vf;
+	struct i40e_pf *pf = vf->pf;
+	struct i40e_vsi *vsi;
+
+	vsi = pf->vsi[vf->lan_vsi_idx];
+	i40e_update_stats(vsi);
+	memcpy(target_stat_values, &vsi->eth_stats,
+	       I40E_VF_NETDEV_ETHTOOL_STAT_COUNT * 8);
+}
+
+static const struct ethtool_ops i40e_vf_netdev_ethtool_ops = {
+	.get_strings		= i40e_vf_netdev_ethtool_get_strings,
+	.get_ethtool_stats	= i40e_vf_netdev_ethtool_get_stats,
+	.get_sset_count		= i40e_vf_netdev_ethtool_get_sset_count,
+};
+
+void i40e_set_vf_netdev_ethtool_ops(struct net_device *netdev)
+{
+	netdev->ethtool_ops = &i40e_vf_netdev_ethtool_ops;
+}
diff --git a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c
index 11f6970..cacb797 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c
@@ -1049,6 +1049,7 @@ int i40e_alloc_vf_netdev(struct i40e_vf *vf, u16 vf_num)
 	priv->vf = &(pf->vf[vf_num]);
 
 	netdev->netdev_ops = &i40e_vf_netdev_ops;
+	i40e_set_vf_netdev_ethtool_ops(netdev);
 
 	netif_carrier_off(netdev);
 	netif_tx_disable(netdev);
-- 
2.7.4

^ permalink raw reply related

* [net-next 00/15][pull request] 40GbE Intel Wired LAN Driver Updates 2016-09-20
From: Jeff Kirsher @ 2016-09-21  3:43 UTC (permalink / raw)
  To: davem; +Cc: Jeff Kirsher, netdev, nhorman, sassmann, jogreene,
	guru.anbalagane

This series contains updates to i40e and i40evf only.

Sridhar enables creation of a VF port Representor/Control netdev
associated with each VF, which allows controlling and configuring VFs from
the PF's namespace.  Then enables the VF specific ethtool statistics via the
VF port Representor.  Adds initial devlink support to set/get the mode
of a SRIOV switch.  Fixes link state event handling by updating the
carrier and starts/stops the Tx queues based on the link state
notification from PF.

Brady fixes an issue where a user defined RSS hash key was not being
set because a user defined indirection table is not supplied when changing
the hash key, so if an indirection table is not supplied now, then a
default one is created and the hash key is correctly set.  Also fixed
an issue where when NPAR was enabled, we were still using pf->mac_seid
to perform the dump port query. Instead, go through the VSI to determine
the correct ID to use in either case.

Mitch provides one fix where a conditional return code was reversed, so
he does a "switcheroo" to fix the issue.

Carolyn has two fixes, first fixes an issue in the virt channel code,
where a return code was not checked for NULL when applicable.  Second,
fixes an issue where we were byte swapping the port parameter, then
byte swapping it again in function execution.

Colin Ian King fixes a potential NULL pointer dereference.

Amritha adds support for switchdev ops on the VF port representors and
the PF uplink.

Bimmy changes up i40evf_up_complete() to be void since it always returns
success anyways, which allows cleaning up of code which checked the
return code from this function.

Alex fixed an issue where the driver was incorrectly assuming that we
would always be pulling no more than 1 descriptor from each fragment.
So to correct this, we just need to make certain to test all the way to
the end of the fragments as it is possible for us to span 2 descriptors
in the block before us so we need to guarantee that even the last 6
descriptors have enough data to fill a full frame.

The following are changes since commit 5737f6c92681939e417579b421f81f035e57c582:
  mlx4: add missed recycle opportunity for XDP_TX on TX failure
and are available in the git repository at:
  git://git.kernel.org/pub/scm/linux/kernel/git/jkirsher/next-queue 40GbE

Alan Brady (2):
  i40e: fix setting user defined RSS hash key
  i40e: fix "dump port" command when NPAR enabled

Alexander Duyck (1):
  i40e: Limit TX descriptor count in cases where frag size is greater
    than 16K

Amritha Nambiar (1):
  i40e: Add support for switchdev API for Switch ID

Bimmy Pujari (1):
  i40evf: remove unnecessary error checking against i40evf_up_complete

Carolyn Wyborny (2):
  i40e: Fix to check for NULL
  i40e: Fix for extra byte swap in tunnel setup

Colin Ian King (1):
  i40e: avoid potential null pointer dereference when assigning len

Lihong Yang (1):
  i40evf: remove unnecessary error checking against i40e_shutdown_adminq

Mitch Williams (1):
  i40e: return correct opcode to VF

Sridhar Samudrala (5):
  i40e: Introduce VF port representor/control netdevs
  i40e: Enable VF specific ethtool statistics via VF Port representor
    netdevs
  i40e: Introduce devlink interface
  i40evf: Fix link state event handling
  i40e: Sync link state between VFs and VF Port representors(VFPR)

 drivers/net/ethernet/intel/Kconfig                 |   1 +
 drivers/net/ethernet/intel/i40e/i40e.h             |   7 +
 drivers/net/ethernet/intel/i40e/i40e_debugfs.c     |   7 +-
 drivers/net/ethernet/intel/i40e/i40e_ethtool.c     |  84 ++++++++-
 drivers/net/ethernet/intel/i40e/i40e_main.c        | 120 +++++++++++--
 drivers/net/ethernet/intel/i40e/i40e_txrx.c        |   7 +-
 drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c | 196 ++++++++++++++++++++-
 drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.h |  16 ++
 drivers/net/ethernet/intel/i40evf/i40e_common.c    |   3 +-
 drivers/net/ethernet/intel/i40evf/i40e_txrx.c      |   7 +-
 drivers/net/ethernet/intel/i40evf/i40evf_main.c    |  18 +-
 .../net/ethernet/intel/i40evf/i40evf_virtchnl.c    |  10 +-
 12 files changed, 428 insertions(+), 48 deletions(-)

-- 
2.7.4

^ permalink raw reply

* Re: [PATCH v5 net-next 1/1] net sched actions: fix GETing actions
From: David Miller @ 2016-09-21  3:37 UTC (permalink / raw)
  To: jhs; +Cc: netdev, xiyou.wangcong
In-Reply-To: <1474326171-29035-1-git-send-email-jhs@emojatatu.com>

From: Jamal Hadi Salim <jhs@mojatatu.com>
Date: Mon, 19 Sep 2016 19:02:51 -0400

> From: Jamal Hadi Salim <jhs@mojatatu.com>
> 
> With the batch changes that translated transient actions into
> a temporary list lost in the translation was the fact that
> tcf_action_destroy() will eventually delete the action from
> the permanent location if the refcount is zero.
> 
> Example of what broke:
> ...add a gact action to drop
> sudo $TC actions add action drop index 10
> ...now retrieve it, looks good
> sudo $TC actions get action gact index 10
> ...retrieve it again and find it is gone!
> sudo $TC actions get action gact index 10
> 
> Fixes: 22dc13c837c3 ("net_sched: convert tcf_exts from list to pointer array"),
> Fixes: 824a7e8863b3 ("net_sched: remove an unnecessary list_del()")
> Fixes: f07fed82ad79 ("net_sched: remove the leftover cleanup_a()")
> 
> Acked-by: Cong Wang <xiyou.wangcong@gmail.com>
> Signed-off-by: Jamal Hadi Salim <jhs@mojatatu.com>

Applied.

^ permalink raw reply

* Re: [PATCH net-next 0/3] BPF direct packet access improvements
From: David Miller @ 2016-09-21  3:37 UTC (permalink / raw)
  To: daniel; +Cc: alexei.starovoitov, tgraf, jakub.kicinski, netdev
In-Reply-To: <cover.1474323281.git.daniel@iogearbox.net>

From: Daniel Borkmann <daniel@iogearbox.net>
Date: Tue, 20 Sep 2016 00:26:11 +0200

> This set adds write support to the currently available read support
> for {cls,act}_bpf programs. First one is a fix for affected commit
> sitting in net-next and prerequisite for the second one, last patch
> adds a number of test cases against the verifier. For details, please
> see individual patches.

Series applied.

^ permalink raw reply

* [PATCH net-next] MAINTAINERS: Update b44 maintainer.
From: Michael Chan @ 2016-09-21  3:33 UTC (permalink / raw)
  To: davem; +Cc: netdev, f.fainelli

Taking over as maintainer since Gary Zambrano is no longer working
for Broadcom.

Signed-off-by: Michael Chan <michael.chan@broadcom.com>
---
 MAINTAINERS | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/MAINTAINERS b/MAINTAINERS
index ce80b36..7626f7836 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -2509,7 +2509,7 @@ S:	Supported
 F:	kernel/bpf/
 
 BROADCOM B44 10/100 ETHERNET DRIVER
-M:	Gary Zambrano <zambrano@broadcom.com>
+M:	Michael Chan <michael.chan@broadcom.com>
 L:	netdev@vger.kernel.org
 S:	Supported
 F:	drivers/net/ethernet/broadcom/b44.*
-- 
1.8.3.1

^ permalink raw reply related

* Re: [PATCH net-next 5/7] rhashtable: abstract out function to get hash
From: Tom Herbert @ 2016-09-21  3:17 UTC (permalink / raw)
  To: Herbert Xu
  Cc: Thomas Graf, David S. Miller, Linux Kernel Network Developers,
	Roopa Prabhu, Kernel Team
In-Reply-To: <20160921024613.GA27473@gondor.apana.org.au>

On Tue, Sep 20, 2016 at 7:46 PM, Herbert Xu <herbert@gondor.apana.org.au> wrote:
> On Tue, Sep 20, 2016 at 07:58:03PM +0200, Thomas Graf wrote:
>>
>> I understand this particular patch as an effort not to duplicate
>> hash function selection such as jhash vs jhash2 based on key_len.
>
> If the rhashtable params stay non-const as is then this is going
> to produce some monstrous code which will be worse than using
> jhash unconditionally.
>
I will look at keeping the params constant.

Tom

> If the rhashtable params are made const then you'll already know
> whether jhash or jhash2 is used.
>
> Cheers,
> --
> Email: Herbert Xu <herbert@gondor.apana.org.au>
> Home Page: http://gondor.apana.org.au/~herbert/
> PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply

* Re: [PATCH v4 net-next 16/16] tcp_bbr: add BBR congestion control
From: Neal Cardwell @ 2016-09-21  2:57 UTC (permalink / raw)
  To: Stephen Hemminger
  Cc: David Miller, Netdev, Van Jacobson, Yuchung Cheng,
	Nandita Dukkipati, Eric Dumazet, Soheil Hassas Yeganeh
In-Reply-To: <CADVnQym2JZk9naoygkwbMP5og7ZP+q5GgjxpWc8A3uMZzTwsaQ@mail.gmail.com>

On Tue, Sep 20, 2016 at 2:50 PM, Neal Cardwell <ncardwell@google.com> wrote:
> On Tue, Sep 20, 2016 at 2:48 PM, Stephen Hemminger
> <stephen@networkplumber.org> wrote:
>>
>> On Mon, 19 Sep 2016 23:39:23 -0400
>> Neal Cardwell <ncardwell@google.com> wrote:
>>
>> > +/* INET_DIAG_BBRINFO */
>> > +
>> > +struct tcp_bbr_info {
>> > +     /* u64 bw: max-filtered BW (app throughput) estimate in Byte per sec: */
>> > +     __u32   bbr_bw_lo;              /* lower 32 bits of bw */
>> > +     __u32   bbr_bw_hi;              /* upper 32 bits of bw */
>> > +     __u32   bbr_min_rtt;            /* min-filtered RTT in uSec */
>> > +     __u32   bbr_pacing_gain;        /* pacing gain shifted left 8 bits */
>> > +     __u32   bbr_cwnd_gain;          /* cwnd gain shifted left 8 bits */
>> > +};
>> > +
>>
>> I assume there is a change to iproute (ss) to dump this info?
>
> Yes, we have a patch for iproute2 (inet_diag.h and ss.c), which we've
> been using. We'll send that out ASAP.

Here are the patches with proposed iproute2 support to dump this info:

http://patchwork.ozlabs.org/patch/672538/
http://patchwork.ozlabs.org/patch/672539/
http://patchwork.ozlabs.org/patch/672540/

thanks,
neal

^ permalink raw reply

* Re: [PATCH] tcp: fix wrong checksum calculation on MTU probing
From: David Miller @ 2016-09-21  2:57 UTC (permalink / raw)
  To: douglascs; +Cc: kuznet, jmorris, yoshfuji, kaber, netdev
In-Reply-To: <15e3bab9-f21e-9a55-a87e-e389f633f755@taghos.com.br>

This patch is whitespace damaged by your email client.

Please fix this, email the patch to yourself, and only resubmit this
when you can successfully apply the patch you emailed to yourself.

Thanks.

^ permalink raw reply

* Re: [PATCH next] ipvlan: Fix dependency issue
From: David Miller @ 2016-09-21  2:56 UTC (permalink / raw)
  To: mahesh; +Cc: netdev, edumazet, maheshb
In-Reply-To: <1474318589-6452-1-git-send-email-mahesh@bandewar.net>

From: Mahesh Bandewar <mahesh@bandewar.net>
Date: Mon, 19 Sep 2016 13:56:29 -0700

> From: Mahesh Bandewar <maheshb@google.com>
> 
> kbuild-build-bot reported that if NETFILTER is not selected, the
> build fails pointing to netfilter symbols.
> 
> Fixes: 4fbae7d83c98 ("ipvlan: Introduce l3s mode")
> 
> Signed-off-by: Mahesh Bandewar <maheshb@google.com>

Applied.

^ permalink raw reply

* Re: [PATCH net-next 2/2] openvswitch: avoid resetting flow key while installing new flow.
From: David Miller @ 2016-09-21  2:54 UTC (permalink / raw)
  To: pshelar; +Cc: netdev
In-Reply-To: <1474318260-16988-2-git-send-email-pshelar@ovn.org>

From: Pravin B Shelar <pshelar@ovn.org>
Date: Mon, 19 Sep 2016 13:51:00 -0700

> since commit db74a3335e0f6 ("openvswitch: use percpu
> flow stats") flow alloc resets flow-key. So there is no need
> to reset the flow-key again if OVS is using newly allocated
> flow-key.
> 
> Signed-off-by: Pravin B Shelar <pshelar@ovn.org>

Applied.

^ permalink raw reply

* Re: [PATCH net-next 1/2] openvswitch: Fix Frame-size larger than 1024 bytes warning.
From: David Miller @ 2016-09-21  2:54 UTC (permalink / raw)
  To: pshelar; +Cc: netdev
In-Reply-To: <1474318260-16988-1-git-send-email-pshelar@ovn.org>

From: Pravin B Shelar <pshelar@ovn.org>
Date: Mon, 19 Sep 2016 13:50:59 -0700

> There is no need to declare separate key on stack,
> we can just use sw_flow->key to store the key directly.
> 
> This commit fixes following warning:
> 
> net/openvswitch/datapath.c: In function ‘ovs_flow_cmd_new’:
> net/openvswitch/datapath.c:1080:1: warning: the frame size of 1040 bytes
> is larger than 1024 bytes [-Wframe-larger-than=]
> 
> Signed-off-by: Pravin B Shelar <pshelar@ovn.org>

Applied.

^ permalink raw reply

* Re: pull request: bluetooth-next 2016-09-19
From: David Miller @ 2016-09-21  2:53 UTC (permalink / raw)
  To: johan.hedberg; +Cc: linux-bluetooth, netdev
In-Reply-To: <20160919193742.GA10296@t440s>

From: Johan Hedberg <johan.hedberg@gmail.com>
Date: Mon, 19 Sep 2016 22:37:42 +0300

> Here's the main bluetooth-next pull request for the 4.9 kernel.
> 
>  - Added new messages for monitor sockets for better mgmt tracing
>  - Added local name and appearance support in scan response
>  - Added new Qualcomm WCNSS SMD based HCI driver
>  - Minor fixes & cleanup to 802.15.4 code
>  - New USB ID to btusb driver
>  - Added Marvell support to HCI UART driver
>  - Add combined LED trigger for controller power
>  - Other minor fixes here and there
> 
> Please let me know if there are any issues pulling. Thanks.

Pulled, thanks Johan.

^ permalink raw reply

* Re: [PATCH] 6pack: fix buffer length mishandling
From: David Miller @ 2016-09-21  2:51 UTC (permalink / raw)
  To: alan; +Cc: netdev
In-Reply-To: <147431251411.53012.5151201638797223106.stgit@localhost.localdomain>

From: Alan <alan@linux.intel.com>
Date: Mon, 19 Sep 2016 20:15:24 +0100

> Dmitry Vyukov wrote:
>> different runs). Looking at code, the following looks suspicious -- we
>> limit copy by 512 bytes, but use the original count which can be
>> larger than 512:
>>
>> static void sixpack_receive_buf(struct tty_struct *tty,
>>     const unsigned char *cp, char *fp, int count)
>> {
>>     unsigned char buf[512];
>>     ....
>>     memcpy(buf, cp, count < sizeof(buf) ? count : sizeof(buf));
>>     ....
>>     sixpack_decode(sp, buf, count1);
> 
> With the sane tty locking we now have I believe the following is safe as
> we consume the bytes and move them into the decoded buffer before
> returning.
> 
> Signed-off-by: Alan Cox <alan@linux.intel.com>

Applied to net-next, thanks Alan.

^ permalink raw reply

* Re: pull-request: can 2016-09-19
From: David Miller @ 2016-09-21  2:48 UTC (permalink / raw)
  To: mkl; +Cc: netdev, linux-can, kernel
In-Reply-To: <20160919141949.5826-1-mkl@pengutronix.de>

From: Marc Kleine-Budde <mkl@pengutronix.de>
Date: Mon, 19 Sep 2016 16:19:48 +0200

> this is a pull request of one patch for the upcoming linux-4.8 release.
> 
> The patch by Fabio Estevam fixes the pm handling in the flexcan driver.

Pulled, thanks.

^ permalink raw reply

* Re: XDP (eXpress Data Path) documentation
From: Alexei Starovoitov via iovisor-dev @ 2016-09-21  2:47 UTC (permalink / raw)
  To: Jesper Dangaard Brouer
  Cc: Nathan Willis, Alexei Starovoitov,
	netdev-u79uwXL29TY76Z2rM5mHXA@public.gmane.org,
	iovisor-dev-9jONkmmOlFHEE9lA1F8Ukti2O/JbrIOy@public.gmane.org,
	Jonathan Corbet, linux-doc-u79uwXL29TY76Z2rM5mHXA, Saeed Mahameed,
	Tom Herbert
In-Reply-To: <20160920110844.661965be-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>

On Tue, Sep 20, 2016 at 11:08:44AM +0200, Jesper Dangaard Brouer via iovisor-dev wrote:
> Hi all,
> 
> As promised, I've started documenting XDP (eXpress Data Path):
> 
>  [1] https://prototype-kernel.readthedocs.io/en/latest/networking/XDP/index.html
> 
> IMHO the documentation has reached a stage where it is useful for the
> XDP project, BUT I request collaboration on improving the documentation
> from all. (Native English speakers are encouraged to send grammar fixes ;-))
> 
> You wouldn't believe it: But this pretty looking documentation actually
> follows the new Kernel documentation format.  It is actually just
> ".rst" text files stored in my github repository under kernel/Documentation [2]
> 
>  [2] https://github.com/netoptimizer/prototype-kernel/tree/master/kernel/Documentation

Thanks so much for doing it. This is a great start!
Some minor editing is needed here and there.
To make it into official doc do you mind preparing a patch for Jon's doc tree ?
If you think the doc is too volatile and not suitable for kernel.org,
another alternative is to host it on https://github.com/iovisor
since it's an LF collaborative project it won't disappear suddenly.
You can be a maintainer of that repo if you like.

^ permalink raw reply

* Re: [PATCH net-next 5/7] rhashtable: abstract out function to get hash
From: Herbert Xu @ 2016-09-21  2:46 UTC (permalink / raw)
  To: Thomas Graf; +Cc: Tom Herbert, davem, netdev, roopa, kernel-team
In-Reply-To: <20160920175803.GE3291@pox.localdomain>

On Tue, Sep 20, 2016 at 07:58:03PM +0200, Thomas Graf wrote:
> 
> I understand this particular patch as an effort not to duplicate
> hash function selection such as jhash vs jhash2 based on key_len.

If the rhashtable params stay non-const as is then this is going
to produce some monstrous code which will be worse than using
jhash unconditionally.

If the rhashtable params are made const then you'll already know
whether jhash or jhash2 is used.

Cheers,
-- 
Email: Herbert Xu <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply

* [PATCH iproute2 3/3] ss: output TCP BBR diag information
From: Neal Cardwell @ 2016-09-21  2:43 UTC (permalink / raw)
  To: Stephen Hemminger
  Cc: netdev, Neal Cardwell, Yuchung Cheng, Eric Dumazet,
	Soheil Hassas Yeganeh
In-Reply-To: <1474425824-22646-1-git-send-email-ncardwell@google.com>

Dump useful TCP BBR state information from a struct tcp_bbr_info that
was grabbed using the inet_diag API.

We tolerate info that is shorter or longer than expected, in case the
kernel is older or newer than the ss binary. We simply print the
minimum of what is expected from the kernel and what is provided from
the kernel. We use the same trick as that used for struct tcp_info:
when the info from the kernel is shorter than we hoped, we pad the end
with zeroes, and don't print fields if they are zero.

The BBR output looks like:
  bbr:(bw:1.2Mbps,mrtt:18.965,pacing_gain:2.88672,cwnd_gain:2.88672)

The motivation here is to be consistent with DCTCP, which looks like:
  dctcp(ce_state:23,alpha:23,ab_ecn:23,ab_tot:23)

Signed-off-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Soheil Hassas Yeganeh <soheil@google.com>
---
 misc/ss.c | 31 +++++++++++++++++++++++++++++++
 1 file changed, 31 insertions(+)

diff --git a/misc/ss.c b/misc/ss.c
index 9c456d4..14fff46 100644
--- a/misc/ss.c
+++ b/misc/ss.c
@@ -784,6 +784,7 @@ struct tcpstat {
 	bool		    has_fastopen_opt;
 	bool		    has_wscale_opt;
 	struct dctcpstat    *dctcp;
+	struct tcp_bbr_info *bbr_info;
 };
 
 static void sock_state_print(struct sockstat *s, const char *sock_name)
@@ -1727,6 +1728,25 @@ static void tcp_stats_print(struct tcpstat *s)
 		printf(" dctcp:fallback_mode");
 	}
 
+	if (s->bbr_info) {
+		__u64 bw;
+
+		bw = s->bbr_info->bbr_bw_hi;
+		bw <<= 32;
+		bw |= s->bbr_info->bbr_bw_lo;
+
+		printf(" bbr:(bw:%sbps,mrtt:%g",
+		       sprint_bw(b1, bw * 8.0),
+		       (double)s->bbr_info->bbr_min_rtt / 1000.0);
+		if (s->bbr_info->bbr_pacing_gain)
+			printf(",pacing_gain:%g",
+			       (double)s->bbr_info->bbr_pacing_gain / 256.0);
+		if (s->bbr_info->bbr_cwnd_gain)
+			printf(",cwnd_gain:%g",
+			       (double)s->bbr_info->bbr_cwnd_gain / 256.0);
+		printf(")");
+	}
+
 	if (s->send_bps)
 		printf(" send %sbps", sprint_bw(b1, s->send_bps));
 	if (s->lastsnd)
@@ -2005,6 +2025,16 @@ static void tcp_show_info(const struct nlmsghdr *nlh, struct inet_diag_msg *r,
 			s.dctcp		= dctcp;
 		}
 
+		if (tb[INET_DIAG_BBRINFO]) {
+			const void *bbr_info = RTA_DATA(tb[INET_DIAG_BBRINFO]);
+			int len = min(RTA_PAYLOAD(tb[INET_DIAG_BBRINFO]),
+				      sizeof(*s.bbr_info));
+
+			s.bbr_info = calloc(1, sizeof(*s.bbr_info));
+			if (s.bbr_info && bbr_info)
+				memcpy(s.bbr_info, bbr_info, len);
+		}
+
 		if (rtt > 0 && info->tcpi_snd_mss && info->tcpi_snd_cwnd) {
 			s.send_bps = (double) info->tcpi_snd_cwnd *
 				(double)info->tcpi_snd_mss * 8000000. / rtt;
@@ -2027,6 +2057,7 @@ static void tcp_show_info(const struct nlmsghdr *nlh, struct inet_diag_msg *r,
 			s.min_rtt = (double) info->tcpi_min_rtt / 1000;
 		tcp_stats_print(&s);
 		free(s.dctcp);
+		free(s.bbr_info);
 	}
 }
 
-- 
2.8.0.rc3.226.g39d4020

^ permalink raw reply related

* [PATCH iproute2 2/3] Update inet_diag.h to include INET_DIAG_BBRINFO and related structs
From: Neal Cardwell @ 2016-09-21  2:43 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: netdev, Neal Cardwell
In-Reply-To: <1474425824-22646-1-git-send-email-ncardwell@google.com>

Update to include the inet_diag.h changes in:
  "tcp_bbr: add BBR congestion control"

Signed-off-by: Neal Cardwell <ncardwell@google.com>
---
 include/linux/inet_diag.h | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/include/linux/inet_diag.h b/include/linux/inet_diag.h
index 5dac049..529a5a2 100644
--- a/include/linux/inet_diag.h
+++ b/include/linux/inet_diag.h
@@ -117,6 +117,7 @@ enum {
 	INET_DIAG_PEERS,
 	INET_DIAG_PAD,
 	INET_DIAG_MARK,
+	INET_DIAG_BBRINFO,
 	__INET_DIAG_MAX,
 };
 
@@ -150,8 +151,20 @@ struct tcp_dctcp_info {
 	__u32	dctcp_ab_tot;
 };
 
+/* INET_DIAG_BBRINFO */
+
+struct tcp_bbr_info {
+	/* u64 bw: max-filtered BW (app throughput) estimate in Byte per sec: */
+	__u32	bbr_bw_lo;		/* lower 32 bits of bw */
+	__u32	bbr_bw_hi;		/* upper 32 bits of bw */
+	__u32	bbr_min_rtt;		/* min-filtered RTT in uSec */
+	__u32	bbr_pacing_gain;	/* pacing gain shifted left 8 bits */
+	__u32	bbr_cwnd_gain;		/* cwnd gain shifted left 8 bits */
+};
+
 union tcp_cc_info {
 	struct tcpvegas_info	vegas;
 	struct tcp_dctcp_info	dctcp;
+	struct tcp_bbr_info	bbr;
 };
 #endif /* _INET_DIAG_H_ */
-- 
2.8.0.rc3.226.g39d4020

^ permalink raw reply related

* [PATCH iproute2 1/3] Update inet_diag.h header to pick up INET_DIAG_MARK
From: Neal Cardwell @ 2016-09-21  2:43 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: netdev, Neal Cardwell

To ease the upcoming addition of BBR-related data to inet_diag.h, add
the declaration of INET_DIAG_MARK. That way the BBR-related patches
only contain BBR-related pieces.

Signed-off-by: Neal Cardwell <ncardwell@google.com>
---
 include/linux/inet_diag.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/include/linux/inet_diag.h b/include/linux/inet_diag.h
index 07e486c..5dac049 100644
--- a/include/linux/inet_diag.h
+++ b/include/linux/inet_diag.h
@@ -116,6 +116,7 @@ enum {
 	INET_DIAG_LOCALS,
 	INET_DIAG_PEERS,
 	INET_DIAG_PAD,
+	INET_DIAG_MARK,
 	__INET_DIAG_MAX,
 };
 
-- 
2.8.0.rc3.226.g39d4020

^ permalink raw reply related

* Re: [PATCH net-next 6/7] net/faraday: Fix phy link irq on Aspeed G5 SoCs
From: Joel Stanley @ 2016-09-21  2:02 UTC (permalink / raw)
  To: Andrew Lunn
  Cc: Benjamin Herrenschmidt, davem, Gavin Shan, Andrew Jeffery, netdev,
	linux-kernel
In-Reply-To: <20160920152940.GI22292@lunn.ch>

On Wed, Sep 21, 2016 at 12:59 AM, Andrew Lunn <andrew@lunn.ch> wrote:
> On Tue, Sep 20, 2016 at 10:13:14PM +1000, Benjamin Herrenschmidt wrote:
>> On Tue, 2016-09-20 at 16:00 +0930, Joel Stanley wrote:
>> > On Aspeed SoC with a direct PHY connection (non-NCSI), we receive
>> > continual PHYSTS interrupts:
>> >
>> >  [   20.280000] ftgmac100 1e660000.ethernet eth0: [ISR] = 0x200: PHYSTS_CHG
>> >  [   20.280000] ftgmac100 1e660000.ethernet eth0: [ISR] = 0x200: PHYSTS_CHG
>> >  [   20.280000] ftgmac100 1e660000.ethernet eth0: [ISR] = 0x200: PHYSTS_CHG
>> >  [   20.300000] ftgmac100 1e660000.ethernet eth0: [ISR] = 0x200: PHYSTS_CHG
>> >
>> > This is because the driver was enabling low-level sensitive interrupt
>> > generation where the systems are wired for high-level. All CPU cycles
>> > are spent servicing this interrupt.
>>
>> If this is a system wiring issue, should it be represented by a DT
>> property ?
>
> Is there a device tree binding document somewhere?
>
> Is it possible just to put ACTIVE_HIGH in the right place in the
> binding?

I wrote "wired for high level" wrt the SoC internals. To be honest I
wondered the same thing but it's hard with only one (non-NCSI) system
to test on.

I had a look at the eval board schematic and it appears that the line
has pull down resistors on it, explaining why the IRQ fires when it's
configured to active low. Other machines re-use the pin as a GPIO.
So yes, I will change this to a dt property in v2. That will mean
dropping 4/7 "net/faraday: Avoid PHYSTS_CHG interrupt" as well.

Cheers,

Joel

^ permalink raw reply

* [PATCH net] net: get rid of a signed integer overflow in ip_idents_reserve()
From: Eric Dumazet @ 2016-09-21  1:06 UTC (permalink / raw)
  To: Jiri Pirko, David Miller; +Cc: netdev
In-Reply-To: <1474378115.23058.2.camel@edumazet-glaptop3.roam.corp.google.com>

From: Eric Dumazet <edumazet@google.com>

Jiri Pirko reported an UBSAN warning happening in ip_idents_reserve()

[] UBSAN: Undefined behaviour in ./arch/x86/include/asm/atomic.h:156:11
[] signed integer overflow:
[] -2117905507 + -695755206 cannot be represented in type 'int'

Since we do not have uatomic_add_return() yet, use atomic_cmpxchg()
so that the arithmetic can be done using unsigned int.

Fixes: 04ca6973f7c1 ("ip: make IP identifiers less predictable")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Reported-by: Jiri Pirko <jiri@resnulli.us>
---
David, Jiri, I removed the prandom_u32() stuff in favor of a traditional
loop to meet stable requirements. Thanks !

 net/ipv4/route.c |   10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index b52496fd51075821c39435f50ac62f813967aecc..654a9af201366887652a4e19a6f1261e5e747056 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -476,12 +476,18 @@ u32 ip_idents_reserve(u32 hash, int segs)
 	atomic_t *p_id = ip_idents + hash % IP_IDENTS_SZ;
 	u32 old = ACCESS_ONCE(*p_tstamp);
 	u32 now = (u32)jiffies;
-	u32 delta = 0;
+	u32 new, delta = 0;
 
 	if (old != now && cmpxchg(p_tstamp, old, now) == old)
 		delta = prandom_u32_max(now - old);
 
-	return atomic_add_return(segs + delta, p_id) - segs;
+	/* Do not use atomic_add_return() as it makes UBSAN unhappy */
+	do {
+		old = (u32)atomic_read(p_id);
+		new = old + delta + segs;
+	} while (atomic_cmpxchg(p_id, old, new) != old);
+
+	return new - segs;
 }
 EXPORT_SYMBOL(ip_idents_reserve);
 

^ permalink raw reply related

* Re: [PATCHv3 net-next 1/2] net: dsa: mv88e6xxx: Add helper for accessing port registers
From: Vivien Didelot @ 2016-09-21  0:43 UTC (permalink / raw)
  To: Andrew Lunn, David Miller; +Cc: netdev, Andrew Lunn
In-Reply-To: <1474414832-1638-2-git-send-email-andrew@lunn.ch>

Hi Andrew,

Andrew Lunn <andrew@lunn.ch> writes:

> There is a device coming soon which places its port registers
> somewhere different to all other Marvell switches supported so far.
> Add helper functions for reading/writing port registers, making it
> easier to handle this new device.
>
> Signed-off-by: Andrew Lunn <andrew@lunn.ch>

Reviewed-by: Vivien Didelot <vivien.didelot@savoirfairelinux.com>

Thanks!

        Vivien

^ permalink raw reply

* Re: [PATCH RFC 1/3] xdp: Infrastructure to generalize XDP
From: Alexei Starovoitov @ 2016-09-21  0:13 UTC (permalink / raw)
  To: Tom Herbert, Thomas Graf
  Cc: David S. Miller, Linux Kernel Network Developers, Kernel Team,
	Tariq Toukan, Brenden Blanco, Alexei Starovoitov, Eric Dumazet,
	Jesper Dangaard Brouer
In-Reply-To: <CALx6S35LmUtwRA85Kg6s7OR=e5Pj9ssqmWLsjtmXfZTXeG02zQ@mail.gmail.com>

On 9/20/16 4:59 PM, Tom Herbert wrote:
> I am looking at using this for ILA router. The problem I am hitting is
> that not all packets that we need to translate go through the XDP
> path. Some would go through the kernel path, some through XDP path but
> that would mean I need parallel lookup tables to be maintained for the
> two paths which won't scale. ILA translation is so trivial and not
> really something that we need to be user programmable, the fast path
> is really for accelerating an existing kernel capability. If I can
> reuse the kernel code already written and the existing kernel data
> structures to make a fast path in XDP there is a lot of value in that
> for me.

sounds like you want to add hard coded ILA rewriter to the driver
instead of doing it as BPF program?!
That is 180 degree turn vs the whole protocol ossification tune
that I thought you strongly believe in.

What kernel data structures do you want to reuse?
ILA rewriter needs single hash lookup. Several different
types of hash maps exist on bpf side already and
even more are coming that will be usable by both tc and xdp side.
csum adjustment? we have them for tc. Not for xdp yet,
but it's trivial to allow them on xdp side too.
Maybe we should talk about the real motivation for the patches
and see what is the best solution.

^ permalink raw reply

* Re: [PATCH RFC 1/3] xdp: Infrastructure to generalize XDP
From: Alexei Starovoitov @ 2016-09-21  0:01 UTC (permalink / raw)
  To: Tom Herbert, davem, netdev
  Cc: kernel-team, tariqt, bblanco, alexei.starovoitov, eric.dumazet,
	brouer
In-Reply-To: <1474408824-418864-2-git-send-email-tom@herbertland.com>

On 9/20/16 3:00 PM, Tom Herbert wrote:
> +static inline int __xdp_hook_run(struct list_head *list_head,
> +				 struct xdp_buff *xdp)
> +{
> +	struct xdp_hook_ops *elem;
> +	int ret = XDP_PASS;
> +
> +	list_for_each_entry(elem, list_head, list) {
> +		ret = elem->hook(elem->priv, xdp);
> +		if (ret != XDP_PASS)
> +			break;
> +	}
> +
> +	return ret;
> +}
> +
> +/* Run the XDP hooks for a napi device. Called from a driver's receive
> + * routine
> + */
> +static inline int xdp_hook_run(struct napi_struct *napi, struct xdp_buff *xdp)
> +{
> +	struct net_device *dev = napi->dev;
> +	int ret = XDP_PASS;
> +
> +	if (static_branch_unlikely(&xdp_hooks_needed)) {
> +		/* Run hooks in napi first */
> +		ret = __xdp_hook_run(&napi->xdp_hook_list, xdp);
> +		if (ret != XDP_PASS)
> +			return ret;
> +
> +		/* Now run device hooks */
> +		ret = __xdp_hook_run(&dev->xdp_hook_list, xdp);
> +		if (ret != XDP_PASS)
> +			return ret;
> +	}
> +
> +	return ret;
> +}

it's an interesting idea to move prog pointer into napi struct,
but certainly not at such huge cost.
Right now it's 1 load + 1 cmp + 1 indirect jump per packet
to invoke the program, with above approach it becomes
6 loads + 3 cmp (just to get through run_needed_check() check)
+ 6 loads + 3 cmp + 2 indirect jumps.
(I may be a little bit off, +- a few loads)
That is a non-starter.
When we were optimizing the receive path of the tc clsact ingress hook
we saw 1Mpps saved for every load+cmp+indirect jump removed.

We're working on inlining of bpf_map_lookup to save one
indirect call per lookup, we cannot just waste them here.

We need to save cycles instead, especially when it doesn't
really solve your goals. It seems the goals are:

 >- Allows alternative users of the XDP hooks other than the original
 >    BPF

this should be achieved by their own hooks while reusing
return codes XDP_TX, XDP_PASS to keep driver side the same.
I'm not against other packet processing engines, but not
at the cost of lower performance.

 >  - Allows a means to pipeline XDP programs together

this can be done already via bpf_tail_call. No changes needed.

 >  - Reduces the amount of code and complexity needed in drivers to
 >    manage XDP

hmm:
534 insertions(+), 144 deletions(-)
looks like increase in complexity instead.

 >  - Provides a more structured environment that is extensible to new
 >    features while being mostly transparent to the drivers

don't see that in these patches either.
Things like packet size change (that we're working on) still
have to be implemented for every driver.
Existing XDP_TX, XDP_DROP have to be implemented per driver as well.

Also introduction of xdp.h breaks existing UAPI.
That's not acceptable either.

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox