Netdev List

Netdev List
 help / color / mirror / Atom feed

* [net 03/10] net/mlx5: Use u16 for Work Queue buffer fragment size
From: Saeed Mahameed @ 2018-09-06  4:09 UTC (permalink / raw)
  To: David S. Miller; +Cc: netdev, Tariq Toukan, Saeed Mahameed
In-Reply-To: <20180906040952.29684-1-saeedm@mellanox.com>

From: Tariq Toukan <tariqt@mellanox.com>

Minimal stride size is 16.
Hence, the number of strides in a fragment (of PAGE_SIZE)
is <= PAGE_SIZE / 16 <= 4K.

u16 is sufficient to represent this.

Fixes: 388ca8be0037 ("IB/mlx5: Implement fragmented completion queue (CQ)")
Signed-off-by: Tariq Toukan <tariqt@mellanox.com>
Reviewed-by: Eran Ben Elisha <eranbe@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/wq.c | 4 ++--
 drivers/net/ethernet/mellanox/mlx5/core/wq.h | 2 +-
 include/linux/mlx5/driver.h                  | 2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/wq.c b/drivers/net/ethernet/mellanox/mlx5/core/wq.c
index c8c315eb5128..d838af9539b1 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/wq.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/wq.c
@@ -39,9 +39,9 @@ u32 mlx5_wq_cyc_get_size(struct mlx5_wq_cyc *wq)
 	return (u32)wq->fbc.sz_m1 + 1;
 }
 
-u32 mlx5_wq_cyc_get_frag_size(struct mlx5_wq_cyc *wq)
+u16 mlx5_wq_cyc_get_frag_size(struct mlx5_wq_cyc *wq)
 {
-	return (u32)wq->fbc.frag_sz_m1 + 1;
+	return wq->fbc.frag_sz_m1 + 1;
 }
 
 u32 mlx5_cqwq_get_size(struct mlx5_cqwq *wq)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/wq.h b/drivers/net/ethernet/mellanox/mlx5/core/wq.h
index 2bd4c3184eba..3a1a170bb2d7 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/wq.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/wq.h
@@ -80,7 +80,7 @@ int mlx5_wq_cyc_create(struct mlx5_core_dev *mdev, struct mlx5_wq_param *param,
 		       void *wqc, struct mlx5_wq_cyc *wq,
 		       struct mlx5_wq_ctrl *wq_ctrl);
 u32 mlx5_wq_cyc_get_size(struct mlx5_wq_cyc *wq);
-u32 mlx5_wq_cyc_get_frag_size(struct mlx5_wq_cyc *wq);
+u16 mlx5_wq_cyc_get_frag_size(struct mlx5_wq_cyc *wq);
 
 int mlx5_wq_qp_create(struct mlx5_core_dev *mdev, struct mlx5_wq_param *param,
 		      void *qpc, struct mlx5_wq_qp *wq,
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index aa65f58c6610..3a1258fd8ac3 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -362,7 +362,7 @@ struct mlx5_frag_buf {
 struct mlx5_frag_buf_ctrl {
 	struct mlx5_frag_buf	frag_buf;
 	u32			sz_m1;
-	u32			frag_sz_m1;
+	u16			frag_sz_m1;
 	u32			strides_offset;
 	u8			log_sz;
 	u8			log_stride;
-- 
2.17.1

^ permalink raw reply related

* [net 02/10] net/mlx5: Fix debugfs cleanup in the device init/remove flow
From: Saeed Mahameed @ 2018-09-06  4:09 UTC (permalink / raw)
  To: David S. Miller; +Cc: netdev, Jack Morgenstein, Saeed Mahameed
In-Reply-To: <20180906040952.29684-1-saeedm@mellanox.com>

From: Jack Morgenstein <jackm@dev.mellanox.co.il>

When initializing the device (procedure init_one), the driver
calls mlx5_pci_init to perform pci initialization. As part of this
initialization, mlx5_pci_init creates a debugfs directory.
If this creation fails, init_one aborts, returning failure to
the caller (which is the probe method caller).

The main reason for such a failure to occur is if the debugfs
directory already exists. This can happen if the last time
mlx5_pci_close was called, debugfs_remove (silently) failed due
to the debugfs directory not being empty.

Guarantee that such a debugfs_remove failure will not occur by
instead calling debugfs_remove_recursive in procedure mlx5_pci_close.

Fixes: 59211bd3b632 ("net/mlx5: Split the load/unload flow into hardware and software flows")
Signed-off-by: Jack Morgenstein <jackm@dev.mellanox.co.il>
Reviewed-by: Daniel Jurgens <danielj@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/main.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c
index 739aad0a0b35..b5e9f664fc66 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c
@@ -878,8 +878,10 @@ static int mlx5_pci_init(struct mlx5_core_dev *dev, struct mlx5_priv *priv)
 	priv->numa_node = dev_to_node(&dev->pdev->dev);

 	priv->dbg_root = debugfs_create_dir(dev_name(&pdev->dev), mlx5_debugfs_root);
-	if (!priv->dbg_root)
+	if (!priv->dbg_root) {
+		dev_err(&pdev->dev, "Cannot create debugfs dir, aborting\n");
 		return -ENOMEM;
+	}

 	err = mlx5_pci_enable_device(dev);
 	if (err) {
@@ -928,7 +930,7 @@ static void mlx5_pci_close(struct mlx5_core_dev *dev, struct mlx5_priv *priv)
 	pci_clear_master(dev->pdev);
 	release_bar(dev->pdev);
 	mlx5_pci_disable_device(dev);
-	debugfs_remove(priv->dbg_root);
+	debugfs_remove_recursive(priv->dbg_root);
 }

 static int mlx5_init_once(struct mlx5_core_dev *dev, struct mlx5_priv *priv)
-- 
2.17.1

^ permalink raw reply related

* [net 04/10] net/mlx5: Use u16 for Work Queue buffer strides offset
From: Saeed Mahameed @ 2018-09-06  4:09 UTC (permalink / raw)
  To: David S. Miller; +Cc: netdev, Tariq Toukan, Saeed Mahameed
In-Reply-To: <20180906040952.29684-1-saeedm@mellanox.com>

From: Tariq Toukan <tariqt@mellanox.com>

Minimal stride size is 16.
Hence, the number of strides in a fragment (of PAGE_SIZE)
is <= PAGE_SIZE / 16 <= 4K.

u16 is sufficient to represent this.

Fixes: d7037ad73daa ("net/mlx5: Fix QP fragmented buffer allocation")
Signed-off-by: Tariq Toukan <tariqt@mellanox.com>
Reviewed-by: Eran Ben Elisha <eranbe@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/wq.c | 2 +-
 include/linux/mlx5/driver.h                  | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/wq.c b/drivers/net/ethernet/mellanox/mlx5/core/wq.c
index d838af9539b1..68e7f8df2a6d 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/wq.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/wq.c
@@ -138,7 +138,7 @@ int mlx5_wq_qp_create(struct mlx5_core_dev *mdev, struct mlx5_wq_param *param,
 		      void *qpc, struct mlx5_wq_qp *wq,
 		      struct mlx5_wq_ctrl *wq_ctrl)
 {
-	u32 sq_strides_offset;
+	u16 sq_strides_offset;
 	u32 rq_pg_remainder;
 	int err;
 
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index 3a1258fd8ac3..66d94b4557cf 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -363,7 +363,7 @@ struct mlx5_frag_buf_ctrl {
 	struct mlx5_frag_buf	frag_buf;
 	u32			sz_m1;
 	u16			frag_sz_m1;
-	u32			strides_offset;
+	u16			strides_offset;
 	u8			log_sz;
 	u8			log_stride;
 	u8			log_frag_strides;
@@ -995,7 +995,7 @@ static inline u32 mlx5_base_mkey(const u32 key)
 }
 
 static inline void mlx5_fill_fbc_offset(u8 log_stride, u8 log_sz,
-					u32 strides_offset,
+					u16 strides_offset,
 					struct mlx5_frag_buf_ctrl *fbc)
 {
 	fbc->log_stride = log_stride;
-- 
2.17.1

^ permalink raw reply related

* [net 01/10] net/mlx5: Fix use-after-free in self-healing flow
From: Saeed Mahameed @ 2018-09-06  4:09 UTC (permalink / raw)
  To: David S. Miller; +Cc: netdev, Jack Morgenstein, Saeed Mahameed
In-Reply-To: <20180906040952.29684-1-saeedm@mellanox.com>

From: Jack Morgenstein <jackm@dev.mellanox.co.il>

When the mlx5 health mechanism detects a problem while the driver
is in the middle of init_one or remove_one, the driver needs to prevent
the health mechanism from scheduling future work; if future work
is scheduled, there is a problem with use-after-free: the system WQ
tries to run the work item (which has been freed) at the scheduled
future time.

Prevent this by disabling work item scheduling in the health mechanism
when the driver is in the middle of init_one() or remove_one().

Fixes: e126ba97dba9 ("mlx5: Add driver for Mellanox Connect-IB adapters")
Signed-off-by: Jack Morgenstein <jackm@dev.mellanox.co.il>
Reviewed-by: Feras Daoud <ferasda@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/health.c | 10 +++++++++-
 drivers/net/ethernet/mellanox/mlx5/core/main.c   |  6 +++---
 include/linux/mlx5/driver.h                      |  2 +-
 3 files changed, 13 insertions(+), 5 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/health.c b/drivers/net/ethernet/mellanox/mlx5/core/health.c
index d39b0b7011b2..9f39aeca863f 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/health.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/health.c
@@ -331,9 +331,17 @@ void mlx5_start_health_poll(struct mlx5_core_dev *dev)
 	add_timer(&health->timer);
 }
 
-void mlx5_stop_health_poll(struct mlx5_core_dev *dev)
+void mlx5_stop_health_poll(struct mlx5_core_dev *dev, bool disable_health)
 {
 	struct mlx5_core_health *health = &dev->priv.health;
+	unsigned long flags;
+
+	if (disable_health) {
+		spin_lock_irqsave(&health->wq_lock, flags);
+		set_bit(MLX5_DROP_NEW_HEALTH_WORK, &health->flags);
+		set_bit(MLX5_DROP_NEW_RECOVERY_WORK, &health->flags);
+		spin_unlock_irqrestore(&health->wq_lock, flags);
+	}
 
 	del_timer_sync(&health->timer);
 }
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c
index cf3e4a659052..739aad0a0b35 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c
@@ -1286,7 +1286,7 @@ static int mlx5_load_one(struct mlx5_core_dev *dev, struct mlx5_priv *priv,
 		mlx5_cleanup_once(dev);
 
 err_stop_poll:
-	mlx5_stop_health_poll(dev);
+	mlx5_stop_health_poll(dev, boot);
 	if (mlx5_cmd_teardown_hca(dev)) {
 		dev_err(&dev->pdev->dev, "tear_down_hca failed, skip cleanup\n");
 		goto out_err;
@@ -1346,7 +1346,7 @@ static int mlx5_unload_one(struct mlx5_core_dev *dev, struct mlx5_priv *priv,
 	mlx5_free_irq_vectors(dev);
 	if (cleanup)
 		mlx5_cleanup_once(dev);
-	mlx5_stop_health_poll(dev);
+	mlx5_stop_health_poll(dev, cleanup);
 	err = mlx5_cmd_teardown_hca(dev);
 	if (err) {
 		dev_err(&dev->pdev->dev, "tear_down_hca failed, skip cleanup\n");
@@ -1608,7 +1608,7 @@ static int mlx5_try_fast_unload(struct mlx5_core_dev *dev)
 	 * with the HCA, so the health polll is no longer needed.
 	 */
 	mlx5_drain_health_wq(dev);
-	mlx5_stop_health_poll(dev);
+	mlx5_stop_health_poll(dev, false);
 
 	ret = mlx5_cmd_force_teardown_hca(dev);
 	if (ret) {
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index 7a452716de4b..aa65f58c6610 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -1052,7 +1052,7 @@ int mlx5_cmd_free_uar(struct mlx5_core_dev *dev, u32 uarn);
 void mlx5_health_cleanup(struct mlx5_core_dev *dev);
 int mlx5_health_init(struct mlx5_core_dev *dev);
 void mlx5_start_health_poll(struct mlx5_core_dev *dev);
-void mlx5_stop_health_poll(struct mlx5_core_dev *dev);
+void mlx5_stop_health_poll(struct mlx5_core_dev *dev, bool disable_health);
 void mlx5_drain_health_wq(struct mlx5_core_dev *dev);
 void mlx5_trigger_health_work(struct mlx5_core_dev *dev);
 void mlx5_drain_health_recovery(struct mlx5_core_dev *dev);
-- 
2.17.1

^ permalink raw reply related

* [pull request][net 00/10] Mellanox, mlx5 fixes 2018-09-05
From: Saeed Mahameed @ 2018-09-06  4:09 UTC (permalink / raw)
  To: David S. Miller; +Cc: netdev, Saeed Mahameed

Hi Dave,

This pull request contains some fixes for mlx5 etherent netdevice and
core driver.

Please pull and let me know if there's any problem.

For -stable v4.9:
('net/mlx5: Fix debugfs cleanup in the device init/remove flow')

For -stable v4.12:
("net/mlx5: E-Switch, Fix memory leak when creating switchdev mode FDB tables")

For -stable v4.13:
("net/mlx5: Fix use-after-free in self-healing flow")

For -stable v4.14:
("net/mlx5: Check for error in mlx5_attach_interface")

For -stable v4.15:
("net/mlx5: Fix not releasing read lock when adding flow rules")

For -stable v4.17:
("net/mlx5: Fix possible deadlock from lockdep when adding fte to fg")

For -stable v4.18:
("net/mlx5: Use u16 for Work Queue buffer fragment size")

Thanks,
Saeed.

---

The following changes since commit e65a9e480e91ddf9e15155454d370cead64689c8:

  net: qca_spi: Fix race condition in spi transfers (2018-09-05 08:09:35 -0700)

are available in the Git repository at:

  git://git.kernel.org/pub/scm/linux/kernel/git/saeed/linux.git tags/mlx5e-fixes-2018-09-05

for you to fetch changes up to ad9421e36a77056a4f095d49b9605e80b4d216ed:

  net/mlx5: Fix possible deadlock from lockdep when adding fte to fg (2018-09-05 17:08:34 -0700)

----------------------------------------------------------------
mlx5e-fixes-2018-09-05

----------------------------------------------------------------
Daniel Jurgens (1):
      net/mlx5: Consider PCI domain in search for next dev

Huy Nguyen (1):
      net/mlx5: Check for error in mlx5_attach_interface

Jack Morgenstein (2):
      net/mlx5: Fix use-after-free in self-healing flow
      net/mlx5: Fix debugfs cleanup in the device init/remove flow

Raed Salem (1):
      net/mlx5: E-Switch, Fix memory leak when creating switchdev mode FDB tables

Roi Dayan (2):
      net/mlx5: Fix not releasing read lock when adding flow rules
      net/mlx5: Fix possible deadlock from lockdep when adding fte to fg

Saeed Mahameed (1):
      net/mlx5e: Ethtool steering, fix udp source port value

Tariq Toukan (2):
      net/mlx5: Use u16 for Work Queue buffer fragment size
      net/mlx5: Use u16 for Work Queue buffer strides offset

 drivers/net/ethernet/mellanox/mlx5/core/dev.c      | 22 ++++---
 .../ethernet/mellanox/mlx5/core/en_fs_ethtool.c    |  2 +-
 .../ethernet/mellanox/mlx5/core/eswitch_offloads.c |  1 +
 drivers/net/ethernet/mellanox/mlx5/core/fs_core.c  | 76 +++++++++++-----------
 drivers/net/ethernet/mellanox/mlx5/core/health.c   | 10 ++-
 drivers/net/ethernet/mellanox/mlx5/core/main.c     | 12 ++--
 drivers/net/ethernet/mellanox/mlx5/core/wq.c       |  6 +-
 drivers/net/ethernet/mellanox/mlx5/core/wq.h       |  2 +-
 include/linux/mlx5/driver.h                        |  8 +--
 9 files changed, 79 insertions(+), 60 deletions(-)

^ permalink raw reply

* Re: [PATCH mlx5-next v1 05/15] net/mlx5: Break encap/decap into two separated flow table creation flags
From: Leon Romanovsky @ 2018-09-06  4:09 UTC (permalink / raw)
  To: Or Gerlitz
  Cc: Jason Gunthorpe, Doug Ledford, RDMA mailing list, Ariel Levkovich,
	Mark Bloch, Or Gerlitz, Saeed Mahameed, linux-netdev
In-Reply-To: <CAJ3xEMjrrhjCRmnGyD4SYit+pgJpakXLSrF6i35qaZiDfGsAxw@mail.gmail.com>

[-- Attachment #1: Type: text/plain, Size: 1730 bytes --]

On Thu, Sep 06, 2018 at 12:37:17AM +0300, Or Gerlitz wrote:
> On Wed, Sep 5, 2018 at 9:11 PM, Leon Romanovsky  wrote:
> > On Wed, Sep 05, 2018 at 10:38:00AM -0600, Jason Gunthorpe wrote:
> >> On Wed, Sep 05, 2018 at 08:10:25AM +0300, Leon Romanovsky wrote:
> >> > > > -       int en_encap_decap = !!(flags & MLX5_FLOW_TABLE_TUNNEL_EN);
> >> > > > +       int en_encap = !!(flags & MLX5_FLOW_TABLE_TUNNEL_EN_ENCAP);
> >> > > > +       int en_decap = !!(flags & MLX5_FLOW_TABLE_TUNNEL_EN_DECAP);
> >> > >
> >> > > Yuk, please don't use !!.
> >> > >
> >> > >   bool en_decap = flags & MLX5_FLOW_TABLE_TUNNEL_EN_DECAP;
> >> >
> >> > We need to provide en_encap and en_decap as an input to MLX5_SET(...)
> >> > which is passed to FW as 0 or 1.
> >> >
> >> > Boolean type is declared in C as int and treated as zero for false
> >> > and any other value for true,
> >>
> >> No, that isn't right, the kernel uses C99's _Bool intrinsic type, which
> >> is guaranteed to only hold 0 or 1 by the compiler.
> >>
> >> See types.h:
> >>
> >> typedef _Bool                   bool;
> >
> > Exciting, it took me a while to find C99 standard and relevant 6.3.1.2.
> > Anyway, this patch didn't change previous functionality, which used "!!"
> > convention.
>
> so? if we didn't do things properly prior to the patch, why not fixing it along
> with the patch? lets fix

Or,

What exactly "to fix"? Both code lines:
1. Have correct syntax
2. Implement proper C99
3. Give same compiler code
4. Have same readability

There is nothing to fix.

And this patch is already merged, so if you truly care about this,
please go ahead and prepare patch for whole driver, or better for
whole kernel.

 kernel git:(rdma-next) git grep "\!\!" |wc -l
8125

Thanks

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 801 bytes --]

^ permalink raw reply

* [PATCH net-next 10/11] tap: accept an array of XDP buffs through sendmsg()
From: Jason Wang @ 2018-09-06  4:05 UTC (permalink / raw)
  To: netdev, linux-kernel; +Cc: kvm, virtualization, mst, jasowang
In-Reply-To: <20180906040526.22518-1-jasowang@redhat.com>

This patch implement TUN_MSG_PTR msg_control type. This type allows
the caller to pass an array of XDP buffs to tuntap through ptr field
of the tun_msg_control. Tap will build skb through those XDP buffers.

This will avoid lots of indirect calls thus improves the icache
utilization and allows to do XDP batched flushing when doing XDP
redirection.

Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 drivers/net/tap.c | 73 +++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 71 insertions(+), 2 deletions(-)

diff --git a/drivers/net/tap.c b/drivers/net/tap.c
index 7996ed7cbf18..50eb7bf22225 100644
--- a/drivers/net/tap.c
+++ b/drivers/net/tap.c
@@ -1146,14 +1146,83 @@ static const struct file_operations tap_fops = {
 #endif
 };
 
+static int tap_get_user_xdp(struct tap_queue *q, struct xdp_buff *xdp)
+{
+	struct virtio_net_hdr *gso = xdp->data_hard_start + sizeof(int);
+	int buflen = *(int *)xdp->data_hard_start;
+	int vnet_hdr_len = 0;
+	struct tap_dev *tap;
+	struct sk_buff *skb;
+	int err, depth;
+
+	if (q->flags & IFF_VNET_HDR)
+		vnet_hdr_len = READ_ONCE(q->vnet_hdr_sz);
+
+	skb = build_skb(xdp->data_hard_start, buflen);
+	if (!skb) {
+		err = -ENOMEM;
+		goto err;
+	}
+
+	skb_reserve(skb, xdp->data - xdp->data_hard_start);
+	skb_put(skb, xdp->data_end - xdp->data);
+
+	skb_set_network_header(skb, ETH_HLEN);
+	skb_reset_mac_header(skb);
+	skb->protocol = eth_hdr(skb)->h_proto;
+
+	if (vnet_hdr_len) {
+		err = virtio_net_hdr_to_skb(skb, gso, tap_is_little_endian(q));
+		if (err)
+			goto err_kfree;
+	}
+
+	skb_probe_transport_header(skb, ETH_HLEN);
+
+	/* Move network header to the right position for VLAN tagged packets */
+	if ((skb->protocol == htons(ETH_P_8021Q) ||
+	     skb->protocol == htons(ETH_P_8021AD)) &&
+	    __vlan_get_protocol(skb, skb->protocol, &depth) != 0)
+		skb_set_network_header(skb, depth);
+
+	rcu_read_lock();
+	tap = rcu_dereference(q->tap);
+	if (tap) {
+		skb->dev = tap->dev;
+		dev_queue_xmit(skb);
+	} else {
+		kfree_skb(skb);
+	}
+	rcu_read_unlock();
+
+	return 0;
+
+err_kfree:
+	kfree_skb(skb);
+err:
+	rcu_read_lock();
+		tap = rcu_dereference(q->tap);
+	if (tap && tap->count_tx_dropped)
+		tap->count_tx_dropped(tap);
+	rcu_read_unlock();
+	return err;
+}
+
 static int tap_sendmsg(struct socket *sock, struct msghdr *m,
 		       size_t total_len)
 {
 	struct tap_queue *q = container_of(sock, struct tap_queue, sock);
 	struct tun_msg_ctl *ctl = m->msg_control;
+	struct xdp_buff *xdp;
+	int i;
 
-	if (ctl && ctl->type != TUN_MSG_UBUF)
-		return -EINVAL;
+	if (ctl && ((ctl->type & 0xF) == TUN_MSG_PTR)) {
+		for (i = 0; i < ctl->type >> 16; i++) {
+			xdp = &((struct xdp_buff *)ctl->ptr)[i];
+			tap_get_user_xdp(q, xdp);
+		}
+		return 0;
+	}
 
 	return tap_get_user(q, ctl ? ctl->ptr : NULL, &m->msg_iter,
 			    m->msg_flags & MSG_DONTWAIT);
-- 
2.17.1

^ permalink raw reply related

* [PATCH net-next 03/11] tuntap: enable bh early during processing XDP
From: Jason Wang @ 2018-09-06  4:05 UTC (permalink / raw)
  To: netdev, linux-kernel; +Cc: kvm, virtualization, mst, jasowang
In-Reply-To: <20180906040526.22518-1-jasowang@redhat.com>

This patch move the bh enabling a little bit earlier, this will be
used for factoring out the core XDP logic of tuntap.

Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 drivers/net/tun.c | 10 +++-------
 1 file changed, 3 insertions(+), 7 deletions(-)

diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index d3677a544b56..372caf7d67d9 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -1726,22 +1726,18 @@ static struct sk_buff *tun_build_skb(struct tun_struct *tun,
 			goto err_xdp;
 		}
 	}
+	rcu_read_unlock();
+	local_bh_enable();
 
 	skb = build_skb(buf, buflen);
-	if (!skb) {
-		rcu_read_unlock();
-		local_bh_enable();
+	if (!skb)
 		return ERR_PTR(-ENOMEM);
-	}
 
 	skb_reserve(skb, pad - delta);
 	skb_put(skb, len);
 	get_page(alloc_frag->page);
 	alloc_frag->offset += buflen;
 
-	rcu_read_unlock();
-	local_bh_enable();
-
 	return skb;
 
 err_redirect:
-- 
2.17.1

^ permalink raw reply related

* [PATCH net-next 01/11] net: sock: introduce SOCK_XDP
From: Jason Wang @ 2018-09-06  4:05 UTC (permalink / raw)
  To: netdev, linux-kernel; +Cc: kvm, virtualization, mst, jasowang
In-Reply-To: <20180906040526.22518-1-jasowang@redhat.com>

This patch introduces a new sock flag - SOCK_XDP. This will be used
for notifying the upper layer that XDP program is attached on the
lower socket, and requires for extra headroom.

TUN will be the first user.

Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 drivers/net/tun.c  | 19 +++++++++++++++++++
 include/net/sock.h |  1 +
 2 files changed, 20 insertions(+)

diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index ebd07ad82431..2c548bd20393 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -869,6 +869,9 @@ static int tun_attach(struct tun_struct *tun, struct file *file,
 		tun_napi_init(tun, tfile, napi);
 	}
 
+	if (rtnl_dereference(tun->xdp_prog))
+		sock_set_flag(&tfile->sk, SOCK_XDP);
+
 	tun_set_real_num_queues(tun);
 
 	/* device is allowed to go away first, so no need to hold extra
@@ -1241,13 +1244,29 @@ static int tun_xdp_set(struct net_device *dev, struct bpf_prog *prog,
 		       struct netlink_ext_ack *extack)
 {
 	struct tun_struct *tun = netdev_priv(dev);
+	struct tun_file *tfile;
 	struct bpf_prog *old_prog;
+	int i;
 
 	old_prog = rtnl_dereference(tun->xdp_prog);
 	rcu_assign_pointer(tun->xdp_prog, prog);
 	if (old_prog)
 		bpf_prog_put(old_prog);
 
+	for (i = 0; i < tun->numqueues; i++) {
+		tfile = rtnl_dereference(tun->tfiles[i]);
+		if (prog)
+			sock_set_flag(&tfile->sk, SOCK_XDP);
+		else
+			sock_reset_flag(&tfile->sk, SOCK_XDP);
+	}
+	list_for_each_entry(tfile, &tun->disabled, next) {
+		if (prog)
+			sock_set_flag(&tfile->sk, SOCK_XDP);
+		else
+			sock_reset_flag(&tfile->sk, SOCK_XDP);
+	}
+
 	return 0;
 }
 
diff --git a/include/net/sock.h b/include/net/sock.h
index 433f45fc2d68..38cae35f6e16 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -800,6 +800,7 @@ enum sock_flags {
 	SOCK_SELECT_ERR_QUEUE, /* Wake select on error queue */
 	SOCK_RCU_FREE, /* wait rcu grace period in sk_destruct() */
 	SOCK_TXTIME,
+	SOCK_XDP, /* XDP is attached */
 };
 
 #define SK_FLAGS_TIMESTAMP ((1UL << SOCK_TIMESTAMP) | (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE))
-- 
2.17.1

^ permalink raw reply related

* [PATCH] ath9k: add reset for airtime station debugfs
From: Louie Lu @ 2018-09-06  3:53 UTC (permalink / raw)
  To: kvalo; +Cc: ath9k-devel, davem, linux-wireless, netdev, toke

Let user can reset station airtime status by debugfs, it will
reset all airtime deficit to ATH_AIRTIME_QUANTUM and reset rx/tx
airtime accumulate to 0.

Signed-off-by: Louie Lu <git@louie.lu>
---
 drivers/net/wireless/ath/ath9k/debug_sta.c | 18 +++++++++++++++++-
 1 file changed, 17 insertions(+), 1 deletion(-)

diff --git a/drivers/net/wireless/ath/ath9k/debug_sta.c b/drivers/net/wireless/ath/ath9k/debug_sta.c
index ed8b77a74630..e8fcd3e1c470 100644
--- a/drivers/net/wireless/ath/ath9k/debug_sta.c
+++ b/drivers/net/wireless/ath/ath9k/debug_sta.c
@@ -286,9 +286,25 @@ static ssize_t read_airtime(struct file *file, char __user *user_buf,
 	return retval;
 }
 
+static ssize_t
+write_airtime_reset_stub(struct file *file, const char __user *ubuf,
+		   size_t count, loff_t *ppos)
+{
+	struct ath_node *an = file->private_data;
+	struct ath_airtime_stats *astats;
+	int i;
+
+	astats = &an->airtime_stats;
+	astats->rx_airtime = 0;
+	astats->tx_airtime = 0;
+	for (i = 0; i < 4; i++)
+		an->airtime_deficit[i] = ATH_AIRTIME_QUANTUM;
+	return count;
+}
 
 static const struct file_operations fops_airtime = {
 	.read = read_airtime,
+	.write = write_airtime_reset_stub,
 	.open = simple_open,
 	.owner = THIS_MODULE,
 	.llseek = default_llseek,
@@ -304,5 +320,5 @@ void ath9k_sta_add_debugfs(struct ieee80211_hw *hw,
 
 	debugfs_create_file("node_aggr", 0444, dir, an, &fops_node_aggr);
 	debugfs_create_file("node_recv", 0444, dir, an, &fops_node_recv);
-	debugfs_create_file("airtime", 0444, dir, an, &fops_airtime);
+	debugfs_create_file("airtime", 0644, dir, an, &fops_airtime);
 }
-- 
2.18.0

^ permalink raw reply related

* Re: [PATCH net] devlink: Fix devlink_param_driverinit_value_set() stub return code
From: Moshe Shemesh @ 2018-09-06  7:30 UTC (permalink / raw)
  To: David Ahern, David S. Miller; +Cc: Jiri Pirko, netdev, linux-kernel
In-Reply-To: <b660e196-9277-f66a-5be8-4288f73b9305@gmail.com>



On 9/5/2018 5:26 PM, David Ahern wrote:
> On 9/5/18 6:48 AM, Moshe Shemesh wrote:
>>
>>
>> On 9/4/2018 7:13 PM, David Ahern wrote:
>>> On 9/4/18 7:04 AM, Moshe Shemesh wrote:
>>>> The stub function returned -EOPNOTSUPP while CONFIG_NET_DEVLINK is off.
>>>> It caused false warning during driver load. Driver needs to update
>>>> devlink on a parameter value if devlink module is there, if not it
>>>> doesn't need any error code.
>>>>
>>>> Fixes: ec01aeb1803e ("devlink: Add support for get/set driverinit
>>>> value")
>>>> Signed-off-by: Moshe Shemesh <moshe@mellanox.com>
>>>> Acked-by: Jiri Pirko <jiri@mellanox.com>
>>>> ---
>>>>    include/net/devlink.h | 2 +-
>>>>    1 file changed, 1 insertion(+), 1 deletion(-)
>>>>
>>>> diff --git a/include/net/devlink.h b/include/net/devlink.h
>>>> index b9b89d6..b467357 100644
>>>> --- a/include/net/devlink.h
>>>> +++ b/include/net/devlink.h
>>>> @@ -781,7 +781,7 @@ static inline bool
>>>> devlink_dpipe_table_counter_enabled(struct devlink *devlink,
>>>>    devlink_param_driverinit_value_set(struct devlink *devlink, u32
>>>> param_id,
>>>>                       union devlink_param_value init_val)
>>>>    {
>>>> -    return -EOPNOTSUPP;
>>>> +    return 0;
>>>>    }
>>>>      static inline void
>>>>
>>>
>>> This should be handled by the driver -- check for -EOPNOTSUPP and not
>>> log an error.
>>
>> This is a stub inline function.
>> The return value would be ambiguous as the non-stub function can also
>> return -EOPNOTSUPP, in case the driver-init mode is not supported for a
>> parameter.
>>>
>>> devlink is generic infrastructure. If a call is made and the operation
>>> is not supported, then devlink should return an error.
>>
>> In general the stub functions should take care that the driver won't
>> feel the missing code as much as possible. That's why
>> devlink_params_register() returns success and so should this function.
>>>
> 
> A driver is accessing core infrastructure to set a value; core infra can
> not because the code is not compiled in. The driver should be told that
> the option is not enabled, and it is at the moment with the -EOPNOTSUPP
> return code.
> 
> That is similar to devlink_dpipe_table_resource_set which returns
> -EOPNOTSUPP when devlink is compiled out.

Yes, that was my original thought when I wrote this stub, but actually 
in this case the driver has the value and it wants to update devlink, so 
devlink will have it too. If devlink module is not there why should it 
really care ?

> 

^ permalink raw reply

* Re: KASAN: slab-out-of-bounds Read in _decode_session6
From: Eric Dumazet @ 2018-09-06  7:00 UTC (permalink / raw)
  To: syzbot, ast, daniel, davem, dvyukov, herbert, kuznet,
	linux-kernel, netdev, steffen.klassert, syzkaller-bugs, yoshfuji
In-Reply-To: <0000000000002bef6405752b530b@google.com>



On 09/05/2018 08:17 PM, syzbot wrote:
> syzbot has found a reproducer for the following crash on:
> 
> HEAD commit:    b36fdc6853a3 Merge tag 'gpio-v4.19-2' of git://git.kernel...
> git tree:       upstream
> console output: https://syzkaller.appspot.com/x/log.txt?x=164938d1400000
> kernel config:  https://syzkaller.appspot.com/x/.config?x=4c7e83258d6e0156
> dashboard link: https://syzkaller.appspot.com/bug?extid=acffccec848dc13fe459
> compiler:       gcc (GCC) 8.0.1 20180413 (experimental)
> syz repro:      https://syzkaller.appspot.com/x/repro.syz?x=115f172e400000
> C reproducer:   https://syzkaller.appspot.com/x/repro.c?x=16399be1400000
> 
> IMPORTANT: if you fix the bug, please add the following tag to the commit:
> Reported-by: syzbot+acffccec848dc13fe459@syzkaller.appspotmail.com
> 
> IPv6: ADDRCONF(NETDEV_UP): veth1: link is not ready
> IPv6: ADDRCONF(NETDEV_CHANGE): veth1: link becomes ready
> IPv6: ADDRCONF(NETDEV_CHANGE): veth0: link becomes ready
> 8021q: adding VLAN 0 to HW filter on device team0
> ==================================================================
> BUG: KASAN: slab-out-of-bounds in _decode_session6+0x1331/0x14e0 net/ipv6/xfrm6_policy.c:161
> Read of size 1 at addr ffff8801d4a67f07 by task syz-executor092/4673
> 
> CPU: 1 PID: 4673 Comm: syz-executor092 Not tainted 4.19.0-rc2+ #223
> Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
> Call Trace:
>  __dump_stack lib/dump_stack.c:77 [inline]
>  dump_stack+0x1c9/0x2b4 lib/dump_stack.c:113
>  print_address_description+0x6c/0x20b mm/kasan/report.c:256
>  kasan_report_error mm/kasan/report.c:354 [inline]
>  kasan_report.cold.7+0x242/0x30d mm/kasan/report.c:412
>  __asan_report_load1_noabort+0x14/0x20 mm/kasan/report.c:430
>  _decode_session6+0x1331/0x14e0 net/ipv6/xfrm6_policy.c:161
>  __xfrm_decode_session+0x71/0x140 net/xfrm/xfrm_policy.c:2299
>  xfrm_decode_session include/net/xfrm.h:1232 [inline]
>  vti6_tnl_xmit+0x3fc/0x1bb1 net/ipv6/ip6_vti.c:542
>  __netdev_start_xmit include/linux/netdevice.h:4287 [inline]
>  netdev_start_xmit include/linux/netdevice.h:4296 [inline]
>  xmit_one net/core/dev.c:3216 [inline]
>  dev_hard_start_xmit+0x272/0xc10 net/core/dev.c:3232
>  __dev_queue_xmit+0x2ab2/0x3870 net/core/dev.c:3802
>  dev_queue_xmit+0x17/0x20 net/core/dev.c:3835
>  __bpf_tx_skb net/core/filter.c:2012 [inline]
>  __bpf_redirect_common net/core/filter.c:2050 [inline]
>  __bpf_redirect+0x5b7/0xae0 net/core/filter.c:2057
>  ____bpf_clone_redirect net/core/filter.c:2090 [inline]
>  bpf_clone_redirect+0x2f6/0x490 net/core/filter.c:2062
>  bpf_prog_c39d1ba309a769f7+0xe9e/0x1000
> 
> Allocated by task 4673:
>  save_stack+0x43/0xd0 mm/kasan/kasan.c:448
>  set_track mm/kasan/kasan.c:460 [inline]
>  kasan_kmalloc+0xc4/0xe0 mm/kasan/kasan.c:553
>  __do_kmalloc_node mm/slab.c:3682 [inline]
>  __kmalloc_node_track_caller+0x47/0x70 mm/slab.c:3696
>  __kmalloc_reserve.isra.41+0x3a/0xe0 net/core/skbuff.c:137
>  pskb_expand_head+0x230/0x10e0 net/core/skbuff.c:1463
>  skb_ensure_writable+0x3dd/0x640 net/core/skbuff.c:5129
>  __bpf_try_make_writable net/core/filter.c:1633 [inline]
>  bpf_try_make_writable net/core/filter.c:1639 [inline]
>  bpf_try_make_head_writable net/core/filter.c:1647 [inline]
>  ____bpf_clone_redirect net/core/filter.c:2084 [inline]
>  bpf_clone_redirect+0x14a/0x490 net/core/filter.c:2062
>  bpf_prog_c39d1ba309a769f7+0xe9e/0x1000
> 
> Freed by task 3286:
>  save_stack+0x43/0xd0 mm/kasan/kasan.c:448
>  set_track mm/kasan/kasan.c:460 [inline]
>  __kasan_slab_free+0x11a/0x170 mm/kasan/kasan.c:521
>  kasan_slab_free+0xe/0x10 mm/kasan/kasan.c:528
>  __cache_free mm/slab.c:3498 [inline]
>  kfree+0xd9/0x210 mm/slab.c:3813
>  load_elf_binary+0x2569/0x5610 fs/binfmt_elf.c:1118
>  search_binary_handler+0x17d/0x570 fs/exec.c:1653
>  exec_binprm fs/exec.c:1695 [inline]
>  __do_execve_file.isra.35+0x15ff/0x2460 fs/exec.c:1819
>  do_execveat_common fs/exec.c:1866 [inline]
>  do_execve fs/exec.c:1883 [inline]
>  __do_sys_execve fs/exec.c:1964 [inline]
>  __se_sys_execve fs/exec.c:1959 [inline]
>  __x64_sys_execve+0x8f/0xc0 fs/exec.c:1959
>  do_syscall_64+0x1b9/0x820 arch/x86/entry/common.c:290
>  entry_SYSCALL_64_after_hwframe+0x49/0xbe
> 
> The buggy address belongs to the object at ffff8801d4a67d00
>  which belongs to the cache kmalloc-512 of size 512
> The buggy address is located 7 bytes to the right of
>  512-byte region [ffff8801d4a67d00, ffff8801d4a67f00)
> The buggy address belongs to the page:
> page:ffffea00075299c0 count:1 mapcount:0 mapping:ffff8801dac00940 index:0x0
> flags: 0x2fffc0000000100(slab)
> raw: 02fffc0000000100 ffffea0007529988 ffffea0007529a48 ffff8801dac00940
> raw: 0000000000000000 ffff8801d4a67080 0000000100000006 0000000000000000
> page dumped because: kasan: bad access detected
> 
> Memory state around the buggy address:
>  ffff8801d4a67e00: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
>  ffff8801d4a67e80: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
>> ffff8801d4a67f00: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
>                    ^
>  ffff8801d4a67f80: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
>  ffff8801d4a68000: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
> ==================================================================
> 


What about :

diff --git a/net/core/filter.c b/net/core/filter.c
index aecdeba052d3f0ff3d4f0a33ec36891f9738052c..a662f59786bd0677850c1c60a2c92faa6fb6c5bb 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -2081,7 +2081,7 @@ BPF_CALL_3(bpf_clone_redirect, struct sk_buff *, skb, u32, ifindex, u64, flags)
         * here, we need to free the just generated clone to unclone once
         * again.
         */
-       ret = bpf_try_make_head_writable(skb);
+       ret = bpf_try_make_head_writable(clone);
        if (unlikely(ret)) {
                kfree_skb(clone);
                return -ENOMEM;

^ permalink raw reply related

* [PATCH] ath10k: snoc: remove set but not used variable 'ar_snoc'
From: YueHaibing @ 2018-09-06  2:29 UTC (permalink / raw)
  To: Kalle Valo; +Cc: Yue Haibing, ath10k, netdev, linux-wireless, kernel-janitors

From: Yue Haibing <yuehaibing@huawei.com>

Fixes gcc '-Wunused-but-set-variable' warning:

drivers/net/wireless/ath/ath10k/snoc.c: In function 'ath10k_snoc_tx_pipe_cleanup':
drivers/net/wireless/ath/ath10k/snoc.c:681:22: warning:
 variable 'ar_snoc' set but not used [-Wunused-but-set-variable]

Signed-off-by: Yue Haibing <yuehaibing@huawei.com>
---
 drivers/net/wireless/ath/ath10k/snoc.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/drivers/net/wireless/ath/ath10k/snoc.c b/drivers/net/wireless/ath/ath10k/snoc.c
index fa1843a..2d6ae00 100644
--- a/drivers/net/wireless/ath/ath10k/snoc.c
+++ b/drivers/net/wireless/ath/ath10k/snoc.c
@@ -678,13 +678,11 @@ static void ath10k_snoc_tx_pipe_cleanup(struct ath10k_snoc_pipe *snoc_pipe)
 {
 	struct ath10k_ce_pipe *ce_pipe;
 	struct ath10k_ce_ring *ce_ring;
-	struct ath10k_snoc *ar_snoc;
 	struct sk_buff *skb;
 	struct ath10k *ar;
 	int i;
 
 	ar = snoc_pipe->hif_ce_state;
-	ar_snoc = ath10k_snoc_priv(ar);
 	ce_pipe = snoc_pipe->ce_hdl;
 	ce_ring = ce_pipe->src_ring;

^ permalink raw reply related

* Re: [PATCH net-next v3 5/5] net: dsa: b53: Add SerDes support
From: Andrew Lunn @ 2018-09-06  1:53 UTC (permalink / raw)
  To: Florian Fainelli; +Cc: netdev, vivien.didelot, davem
In-Reply-To: <20180905194215.29301-6-f.fainelli@gmail.com>

On Wed, Sep 05, 2018 at 12:42:15PM -0700, Florian Fainelli wrote:
> Add support for the Northstar Plus SerDes which is accessed through a
> special page of the switch. Since this is something that most people
> probably will not want to use, make it a configurable option with a
> default on ARCH_BCM_NSP where it is the most useful currently.
> 
> The SerDes supports both SGMII and 1000baseX modes for both lanes, and
> 2500baseX for one of the lanes, and is internally looking like a
> seemingly standard MII PHY, except for the few bits that got repurposed.
> 
> Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
 
Reviewed-by: Andrew Lunn <andrew@lunn.ch>

    Andrew

^ permalink raw reply

* [PATCH net-next] liquidio: Add spoof checking on a VF MAC address
From: Felix Manlunas @ 2018-09-06  1:40 UTC (permalink / raw)
  To: davem
  Cc: netdev, raghu.vatsavayi, derek.chickles, satananda.burla,
	felix.manlunas, weilin.chang

From: Weilin Chang <weilin.chang@cavium.com>

1. Provide the API to set/unset the spoof checking feature.
2. Add a function to periodically provide the count of found
   packets with spoof VF MAC address.
3. Prevent VF MAC address changing while the spoofchk of the VF is
   on unless the changing MAC address is issued from PF.

Signed-off-by: Weilin Chang <weilin.chang@cavium.com>
Signed-off-by: Felix Manlunas <felix.manlunas@cavium.com>
---
 drivers/net/ethernet/cavium/liquidio/lio_core.c    | 74 ++++++++++++++++++++++
 drivers/net/ethernet/cavium/liquidio/lio_ethtool.c |  3 +-
 drivers/net/ethernet/cavium/liquidio/lio_main.c    | 63 ++++++++++++++++++
 drivers/net/ethernet/cavium/liquidio/lio_vf_main.c |  8 +++
 .../net/ethernet/cavium/liquidio/liquidio_common.h | 24 +++++--
 .../net/ethernet/cavium/liquidio/octeon_device.h   |  5 ++
 .../net/ethernet/cavium/liquidio/octeon_network.h  |  6 ++
 drivers/net/ethernet/cavium/liquidio/octeon_nic.c  | 16 +++--
 8 files changed, 187 insertions(+), 12 deletions(-)

diff --git a/drivers/net/ethernet/cavium/liquidio/lio_core.c b/drivers/net/ethernet/cavium/liquidio/lio_core.c
index cdc26ca..0284204 100644
--- a/drivers/net/ethernet/cavium/liquidio/lio_core.c
+++ b/drivers/net/ethernet/cavium/liquidio/lio_core.c
@@ -1357,6 +1357,69 @@ octnet_nic_stats_callback(struct octeon_device *oct_dev,
 	}
 }
 
+int lio_fetch_vf_stats(struct lio *lio)
+{
+	struct octeon_device *oct_dev = lio->oct_dev;
+	struct octeon_soft_command *sc;
+	struct oct_nic_vf_stats_resp *resp;
+
+	int retval;
+
+	/* Alloc soft command */
+	sc = (struct octeon_soft_command *)
+		octeon_alloc_soft_command(oct_dev,
+					  0,
+					  sizeof(struct oct_nic_vf_stats_resp),
+					  0);
+
+	if (!sc) {
+		dev_err(&oct_dev->pci_dev->dev, "Soft command allocation failed\n");
+		retval = -ENOMEM;
+		goto lio_fetch_vf_stats_exit;
+	}
+
+	resp = (struct oct_nic_vf_stats_resp *)sc->virtrptr;
+	memset(resp, 0, sizeof(struct oct_nic_vf_stats_resp));
+
+	init_completion(&sc->complete);
+	sc->sc_status = OCTEON_REQUEST_PENDING;
+
+	sc->iq_no = lio->linfo.txpciq[0].s.q_no;
+
+	octeon_prepare_soft_command(oct_dev, sc, OPCODE_NIC,
+				    OPCODE_NIC_VF_PORT_STATS, 0, 0, 0);
+
+	retval = octeon_send_soft_command(oct_dev, sc);
+	if (retval == IQ_SEND_FAILED) {
+		octeon_free_soft_command(oct_dev, sc);
+		goto lio_fetch_vf_stats_exit;
+	}
+
+	retval =
+		wait_for_sc_completion_timeout(oct_dev, sc,
+					       (2 * LIO_SC_MAX_TMO_MS));
+	if (retval)  {
+		dev_err(&oct_dev->pci_dev->dev,
+			"sc OPCODE_NIC_VF_PORT_STATS command failed\n");
+		goto lio_fetch_vf_stats_exit;
+	}
+
+	if (sc->sc_status != OCTEON_REQUEST_TIMEOUT && !resp->status) {
+		octeon_swap_8B_data((u64 *)&resp->spoofmac_cnt,
+				    (sizeof(u64)) >> 3);
+
+		if (resp->spoofmac_cnt != 0) {
+			dev_warn(&oct_dev->pci_dev->dev,
+				 "%llu Spoofed packets detected\n",
+				 resp->spoofmac_cnt);
+		}
+	}
+	WRITE_ONCE(sc->caller_is_done, 1);
+
+lio_fetch_vf_stats_exit:
+	return retval;
+}
+
 void lio_fetch_stats(struct work_struct *work)
 {
 	struct cavium_wk *wk = (struct cavium_wk *)work;
@@ -1367,6 +1430,17 @@ void lio_fetch_stats(struct work_struct *work)
 	unsigned long time_in_jiffies;
 	int retval;
 
+	if (OCTEON_CN23XX_PF(oct_dev)) {
+		/* report spoofchk every 2 seconds */
+		if (!(oct_dev->vfstats_poll % LIO_VFSTATS_POLL) &&
+		    (oct_dev->fw_info.app_cap_flags & LIQUIDIO_SPOOFCHK_CAP) &&
+		    oct_dev->sriov_info.num_vfs_alloced) {
+			lio_fetch_vf_stats(lio);
+		}
+
+		oct_dev->vfstats_poll++;
+	}
+
 	/* Alloc soft command */
 	sc = (struct octeon_soft_command *)
 		octeon_alloc_soft_command(oct_dev,
diff --git a/drivers/net/ethernet/cavium/liquidio/lio_ethtool.c b/drivers/net/ethernet/cavium/liquidio/lio_ethtool.c
index 46d8379..e1e5808 100644
--- a/drivers/net/ethernet/cavium/liquidio/lio_ethtool.c
+++ b/drivers/net/ethernet/cavium/liquidio/lio_ethtool.c
@@ -1713,7 +1713,8 @@ static void lio_vf_get_ethtool_stats(struct net_device *netdev,
 	  */
 	data[i++] = lstats.rx_dropped;
 	/* sum of oct->instr_queue[iq_no]->stats.tx_dropped */
-	data[i++] = lstats.tx_dropped;
+	data[i++] = lstats.tx_dropped +
+		oct_dev->link_stats.fromhost.fw_err_drop;
 
 	data[i++] = oct_dev->link_stats.fromwire.fw_total_mcast;
 	data[i++] = oct_dev->link_stats.fromhost.fw_total_mcast_sent;
diff --git a/drivers/net/ethernet/cavium/liquidio/lio_main.c b/drivers/net/ethernet/cavium/liquidio/lio_main.c
index e973662..40f941f 100644
--- a/drivers/net/ethernet/cavium/liquidio/lio_main.c
+++ b/drivers/net/ethernet/cavium/liquidio/lio_main.c
@@ -2858,6 +2858,62 @@ static int liquidio_set_vf_mac(struct net_device *netdev, int vfidx, u8 *mac)
 	return retval;
 }
 
+static int liquidio_set_vf_spoofchk(struct net_device *netdev, int vfidx,
+				    bool enable)
+{
+	struct lio *lio = GET_LIO(netdev);
+	struct octeon_device *oct = lio->oct_dev;
+	struct octnic_ctrl_pkt nctrl;
+	int retval;
+
+	if (!(oct->fw_info.app_cap_flags & LIQUIDIO_SPOOFCHK_CAP)) {
+		netif_info(lio, drv, lio->netdev,
+			   "firmware does not support spoofchk\n");
+		return -EOPNOTSUPP;
+	}
+
+	if (vfidx < 0 || vfidx >= oct->sriov_info.num_vfs_alloced) {
+		netif_info(lio, drv, lio->netdev, "Invalid vfidx %d\n", vfidx);
+		return -EINVAL;
+	}
+
+	if (enable) {
+		if (oct->sriov_info.vf_spoofchk[vfidx])
+			return 0;
+	} else {
+		/* Clear */
+		if (!oct->sriov_info.vf_spoofchk[vfidx])
+			return 0;
+	}
+
+	memset(&nctrl, 0, sizeof(struct octnic_ctrl_pkt));
+	nctrl.ncmd.s.cmdgroup = OCTNET_CMD_GROUP1;
+	nctrl.ncmd.s.cmd = OCTNET_CMD_SET_VF_SPOOFCHK;
+	nctrl.ncmd.s.param1 =
+		vfidx + 1; /* vfidx is 0 based,
+			    * but vf_num (param1) is 1 based
+			    */
+	nctrl.ncmd.s.param2 = enable;
+	nctrl.ncmd.s.more = 0;
+	nctrl.iq_no = lio->linfo.txpciq[0].s.q_no;
+	nctrl.cb_fn = 0;
+
+	retval = octnet_send_nic_ctrl_pkt(oct, &nctrl);
+
+	if (retval) {
+		netif_info(lio, drv, lio->netdev,
+			   "Failed to set VF %d spoofchk %s\n", vfidx,
+			enable ? "on" : "off");
+		return -1;
+	}
+
+	oct->sriov_info.vf_spoofchk[vfidx] = enable;
+	netif_info(lio, drv, lio->netdev, "VF %u spoofchk is %s\n", vfidx,
+		   enable ? "on" : "off");
+
+	return 0;
+}
+
 static int liquidio_set_vf_vlan(struct net_device *netdev, int vfidx,
 				u16 vlan, u8 qos, __be16 vlan_proto)
 {
@@ -2920,6 +2976,8 @@ static int liquidio_get_vf_config(struct net_device *netdev, int vfidx,
 	if (vfidx < 0 || vfidx >= oct->sriov_info.num_vfs_alloced)
 		return -EINVAL;
 
+	memset(ivi, 0, sizeof(struct ifla_vf_info));
+
 	ivi->vf = vfidx;
 	macaddr = 2 + (u8 *)&oct->sriov_info.vf_macaddr[vfidx];
 	ether_addr_copy(&ivi->mac[0], macaddr);
@@ -2931,6 +2989,10 @@ static int liquidio_get_vf_config(struct net_device *netdev, int vfidx,
 	else
 		ivi->trusted = false;
 	ivi->linkstate = oct->sriov_info.vf_linkstate[vfidx];
+	ivi->spoofchk = oct->sriov_info.vf_spoofchk[vfidx];
+	ivi->max_tx_rate = lio->linfo.link.s.speed;
+	ivi->min_tx_rate = 0;
+
 	return 0;
 }
 
@@ -3180,6 +3242,7 @@ static const struct net_device_ops lionetdevops = {
 	.ndo_set_vf_mac		= liquidio_set_vf_mac,
 	.ndo_set_vf_vlan	= liquidio_set_vf_vlan,
 	.ndo_get_vf_config	= liquidio_get_vf_config,
+	.ndo_set_vf_spoofchk	= liquidio_set_vf_spoofchk,
 	.ndo_set_vf_trust	= liquidio_set_vf_trust,
 	.ndo_set_vf_link_state  = liquidio_set_vf_link_state,
 	.ndo_get_vf_stats	= liquidio_get_vf_stats,
diff --git a/drivers/net/ethernet/cavium/liquidio/lio_vf_main.c b/drivers/net/ethernet/cavium/liquidio/lio_vf_main.c
index fe3d935..8fa7ac3 100644
--- a/drivers/net/ethernet/cavium/liquidio/lio_vf_main.c
+++ b/drivers/net/ethernet/cavium/liquidio/lio_vf_main.c
@@ -1135,6 +1135,12 @@ static int liquidio_set_mac(struct net_device *netdev, void *p)
 		return -ENOMEM;
 	}
 
+	if (nctrl.sc_status ==
+	    FIRMWARE_STATUS_CODE(OCTEON_REQUEST_NO_PERMISSION)) {
+		dev_err(&oct->pci_dev->dev, "MAC Address change failed: no permission\n");
+		return -EPERM;
+	}
+
 	memcpy(netdev->dev_addr, addr->sa_data, netdev->addr_len);
 	ether_addr_copy(((u8 *)&lio->linfo.hw_addr) + 2, addr->sa_data);
 
@@ -2049,6 +2055,8 @@ static int setup_nic_devices(struct octeon_device *octeon_dev)
 		lio->linfo.link.u64 = resp->cfg_info.linfo.link.u64;
 		lio->linfo.macaddr_is_admin_asgnd =
 			resp->cfg_info.linfo.macaddr_is_admin_asgnd;
+		lio->linfo.macaddr_spoofchk =
+			resp->cfg_info.linfo.macaddr_spoofchk;
 
 		lio->msg_enable = netif_msg_init(debug, DEFAULT_MSG_ENABLE);
 
diff --git a/drivers/net/ethernet/cavium/liquidio/liquidio_common.h b/drivers/net/ethernet/cavium/liquidio/liquidio_common.h
index 0decc72..8fcb07d 100644
--- a/drivers/net/ethernet/cavium/liquidio/liquidio_common.h
+++ b/drivers/net/ethernet/cavium/liquidio/liquidio_common.h
@@ -118,6 +118,10 @@ enum octeon_tag_type {
 /* App specific capabilities from firmware to pf driver */
 #define LIQUIDIO_TIME_SYNC_CAP 0x1
 #define LIQUIDIO_SWITCHDEV_CAP 0x2
+#define LIQUIDIO_SPOOFCHK_CAP  0x4
+
+/* error status return from firmware */
+#define OCTEON_REQUEST_NO_PERMISSION 0xc
 
 static inline u32 incr_index(u32 index, u32 count, u32 max)
 {
@@ -241,6 +245,10 @@ static inline void add_sg_size(struct octeon_sg_entry *sg_entry,
 
 #define   OCTNET_CMD_QUEUE_COUNT_CTL	0x1f
 
+#define   OCTNET_CMD_GROUP1             1
+#define   OCTNET_CMD_SET_VF_SPOOFCHK    0x1
+#define   OCTNET_GROUP1_LAST_CMD        OCTNET_CMD_SET_VF_SPOOFCHK
+
 #define   OCTNET_CMD_VXLAN_PORT_ADD    0x0
 #define   OCTNET_CMD_VXLAN_PORT_DEL    0x1
 #define   OCTNET_CMD_RXCSUM_ENABLE     0x0
@@ -255,6 +263,8 @@ static inline void add_sg_size(struct octeon_sg_entry *sg_entry,
 #define   SEAPI_CMD_SPEED_SET           0x2
 #define   SEAPI_CMD_SPEED_GET           0x3
 
+#define OPCODE_NIC_VF_PORT_STATS        0x22
+
 #define   LIO_CMD_WAIT_TM 100
 
 /* RX(packets coming from wire) Checksum verification flags */
@@ -303,7 +313,8 @@ union octnet_cmd {
 
 		u64 more:6; /* How many udd words follow the command */
 
-		u64 reserved:29;
+		u64 cmdgroup:8;
+		u64 reserved:21;
 
 		u64 param1:16;
 
@@ -315,7 +326,8 @@ union octnet_cmd {
 
 		u64 param1:16;
 
-		u64 reserved:29;
+		u64 reserved:21;
+		u64 cmdgroup:8;
 
 		u64 more:6;
 
@@ -759,13 +771,17 @@ struct oct_link_info {
 #ifdef __BIG_ENDIAN_BITFIELD
 	u64 gmxport:16;
 	u64 macaddr_is_admin_asgnd:1;
-	u64 rsvd:31;
+	u64 rsvd:13;
+	u64 macaddr_spoofchk:1;
+	u64 rsvd1:17;
 	u64 num_txpciq:8;
 	u64 num_rxpciq:8;
 #else
 	u64 num_rxpciq:8;
 	u64 num_txpciq:8;
-	u64 rsvd:31;
+	u64 rsvd1:17;
+	u64 macaddr_spoofchk:1;
+	u64 rsvd:13;
 	u64 macaddr_is_admin_asgnd:1;
 	u64 gmxport:16;
 #endif
diff --git a/drivers/net/ethernet/cavium/liquidio/octeon_device.h b/drivers/net/ethernet/cavium/liquidio/octeon_device.h
index d99ca6b..881e40d 100644
--- a/drivers/net/ethernet/cavium/liquidio/octeon_device.h
+++ b/drivers/net/ethernet/cavium/liquidio/octeon_device.h
@@ -397,6 +397,8 @@ struct octeon_sriov_info {
 
 	int	vf_linkstate[MAX_POSSIBLE_VFS];
 
+	bool    vf_spoofchk[MAX_POSSIBLE_VFS];
+
 	u64	vf_drv_loaded_mask;
 };
 
@@ -607,6 +609,9 @@ struct octeon_device {
 	u8  speed_boot;
 	u8  speed_setting;
 	u8  no_speed_setting;
+
+	u32    vfstats_poll;
+#define LIO_VFSTATS_POLL 10
 };
 
 #define  OCT_DRV_ONLINE 1
diff --git a/drivers/net/ethernet/cavium/liquidio/octeon_network.h b/drivers/net/ethernet/cavium/liquidio/octeon_network.h
index ecd2a26..9117b5a 100644
--- a/drivers/net/ethernet/cavium/liquidio/octeon_network.h
+++ b/drivers/net/ethernet/cavium/liquidio/octeon_network.h
@@ -71,6 +71,12 @@ struct oct_nic_stats_resp {
 	u64     status;
 };
 
+struct oct_nic_vf_stats_resp {
+	u64     rh;
+	u64	spoofmac_cnt;
+	u64     status;
+};
+
 struct oct_nic_stats_ctrl {
 	struct completion complete;
 	struct net_device *netdev;
diff --git a/drivers/net/ethernet/cavium/liquidio/octeon_nic.c b/drivers/net/ethernet/cavium/liquidio/octeon_nic.c
index 676fe0b..1a706f8 100644
--- a/drivers/net/ethernet/cavium/liquidio/octeon_nic.c
+++ b/drivers/net/ethernet/cavium/liquidio/octeon_nic.c
@@ -172,13 +172,15 @@ octnet_send_nic_ctrl_pkt(struct octeon_device *oct,
 
 	spin_unlock_bh(&oct->cmd_resp_wqlock);
 
-	switch (nctrl->ncmd.s.cmd) {
-		/* caller holds lock, can not sleep */
-	case OCTNET_CMD_CHANGE_DEVFLAGS:
-	case OCTNET_CMD_SET_MULTI_LIST:
-	case OCTNET_CMD_SET_UC_LIST:
-		WRITE_ONCE(sc->caller_is_done, true);
-		return retval;
+	if (nctrl->ncmd.s.cmdgroup == 0) {
+		switch (nctrl->ncmd.s.cmd) {
+			/* caller holds lock, can not sleep */
+		case OCTNET_CMD_CHANGE_DEVFLAGS:
+		case OCTNET_CMD_SET_MULTI_LIST:
+		case OCTNET_CMD_SET_UC_LIST:
+			WRITE_ONCE(sc->caller_is_done, true);
+			return retval;
+		}
 	}
 
 	retval = wait_for_sc_completion_timeout(oct, sc, 0);
-- 
2.7.4

^ permalink raw reply related

* Re: [PATCH net-next v2 0/9] rtnetlink: add IFA_TARGET_NETNSID for RTM_GETADDR
From: David Miller @ 2018-09-06  5:27 UTC (permalink / raw)
  To: christian
  Cc: netdev, linux-kernel, kuznet, yoshfuji, pombredanne, kstewart,
	gregkh, dsahern, fw, ktkhai, lucien.xin, jakub.kicinski, jbenc,
	nicolas.dichtel
In-Reply-To: <20180904195355.4695-1-christian@brauner.io>

From: Christian Brauner <christian@brauner.io>
Date: Tue,  4 Sep 2018 21:53:46 +0200

> # v2 introduction:
 ...

This looks good to me, series applied.

Thank you!

^ permalink raw reply

* Re: [PATCH V2 0/4 next] net: lan78xx: Minor improvements
From: David Miller @ 2018-09-06  5:20 UTC (permalink / raw)
  To: stefan.wahren
  Cc: woojung.huh, UNGLinuxDriver, netdev, linux-usb, linux-kernel
In-Reply-To: <1536082152-3669-1-git-send-email-stefan.wahren@i2se.com>

From: Stefan Wahren <stefan.wahren@i2se.com>
Date: Tue,  4 Sep 2018 19:29:08 +0200

> This patch series contains some minor improvements for the lan78xx
> driver.
> 
> Changes in V2:
> - Keep Copyright comment as multi-line
> - Add Raghuram's Reviewed-by

Series applied, thanks.

^ permalink raw reply

* Re: [RFC PATCH net-next V2 0/6] XDP rx handler
From: Jason Wang @ 2018-09-06  5:12 UTC (permalink / raw)
  To: David Ahern, Jesper Dangaard Brouer
  Cc: Alexei Starovoitov, netdev, linux-kernel, ast, daniel, mst
In-Reply-To: <99d3e3d0-a14d-7789-2777-67421c7d4a20@gmail.com>

On 2018年09月06日 01:20, David Ahern wrote:
> [ sorry for the delay; focused on the nexthop RFC ]

No problem. Your comments is appreciated.

> On 8/20/18 12:34 AM, Jason Wang wrote:
>>
>> On 2018年08月18日 05:15, David Ahern wrote:
>>> On 8/15/18 9:34 PM, Jason Wang wrote:
>>>> I may miss something but BPF forbids loop. Without a loop how can we
>>>> make sure all stacked devices is enumerated correctly without knowing
>>>> the topology in advance?
>>> netdev_for_each_upper_dev_rcu
>>>
>>> BPF helpers allow programs to do lookups in kernel tables, in this case
>>> the ability to find an upper device that would receive the packet.
>> So if I understand correctly, you mean using
>> netdev_for_each_upper_dev_rcu() inside a BPF helper? If yes, I think we
>> may still need device specific logic. E.g for macvlan,
>> netdev_for_each_upper_dev_rcu() enumerates all macvlan devices on top a
>> lower device. But what we need is one of the macvlan that matches the
>> dst mac address which is similar to what XDP rx handler did. And it
>> would become more complicated if we have multiple layers of device.
> My device lookup helper takes the base port index (starting device),
> vlan protocol, vlan tag and dest mac. So, yes, the mac address is used
> to uniquely identify the stacked device.

Ok.

>
>> So let's consider a simple case, consider we have 5 macvlan devices:
>>
>> macvlan0: doing some packet filtering before passing packets to TCP/IP
>> stack
>> macvlan1: modify packets and redirect to another interface
>> macvlan2: modify packets and transmit packet back through XDP_TX
>> macvlan3: deliver packets to AF_XDP
>> macvtap0: deliver packets raw XDP to VM
>>
>> So, with XDP rx handler, what we need to just to attach five different
>> XDP programs to each macvlan device. Your idea is to do all things in
>> the root device XDP program. This looks complicated and not flexible
>> since it needs to care a lot of things, e.g adding/removing
>> actions/policies. And XDP program needs to call BPF helper that use
>> netdev_for_each_upper_dev_rcu() to work correctly with stacked device.
>>
> Stacking on top of a nic port can have all kinds of combinations of
> vlans, bonds, bridges, vlans on bonds and bridges, macvlans, etc. I
> suspect trying to install a program for layer 3 forwarding on each one
> and iteratively running the programs would kill the performance gained
> from forwarding with xdp.

Yes, the performance may drop but it's still much faster than XDP 
generic path.

One reason for the drop is the device specific logic like mac address 
matching which is also needed for the case of a single XDP program on 
the root device. For macvlan, if we allow attach XDP on macvlan, we can 
offload the mac address lookup to hardware through L2 forwarding 
offload, this can give us no performance drop I believe. The only reason 
that was introduced by XDP rx handler itself is probably the indirect 
calls. We can try to amortize them by introducing some kind of batching 
on top. For the issue of multiple XDP program iterations, for this RFC, 
if we have N stacked devices, there's no need to attach XDP program on 
each layer, the only thing that need is the XDP_PASS action in the root 
device, then you can attach XDP program on any one or some stacked 
devices on top.

So the RFC is not intended to replace any exist solution, it just 
provides some flexibility for having native XDP on stacked device (which 
is based on rx handler) and benefit from exist tools to do the 
configuration. If user want to do all things in the root device, that 
should work well without any issues.

Thanks

^ permalink raw reply

* Re: [PATCH] netlink: Make groups check less stupid in netlink_bind()
From: David Miller @ 2018-09-06  5:11 UTC (permalink / raw)
  To: dima; +Cc: linux-kernel, 0x7f454c46, torvalds, herbert, steffen.klassert,
	netdev
In-Reply-To: <20180903214051.4433-1-dima@arista.com>

From: Dmitry Safonov <dima@arista.com>
Date: Mon,  3 Sep 2018 22:40:51 +0100

> As Linus noted, the test for 0 is needless, groups type can follow the
> usual kernel style and 8*sizeof(unsigned long) is BITS_PER_LONG:
> 
>> The code [..] isn't technically incorrect...
>> But it is stupid.
>> Why stupid? Because the test for 0 is pointless.
>>
>> Just doing
>>        if (nlk->ngroups < 8*sizeof(groups))
>>                groups &= (1UL << nlk->ngroups) - 1;
>>
>> would have been fine and more understandable, since the "mask by shift
>> count" already does the right thing for a ngroups value of 0. Now that
>> test for zero makes me go "what's special about zero?". It turns out
>> that the answer to that is "nothing".
> [..]
>> The type of "groups" is kind of silly too.
>>
>> Yeah, "long unsigned int" isn't _technically_ wrong. But we normally
>> call that type "unsigned long".
> 
> Cleanup my piece of pointlessness.
> 
> Cc: "David S. Miller" <davem@davemloft.net>
> Cc: Herbert Xu <herbert@gondor.apana.org.au>
> Cc: Steffen Klassert <steffen.klassert@secunet.com>
> Cc: netdev@vger.kernel.org
> Fairly-blamed-by: Linus Torvalds <torvalds@linux-foundation.org>
> Signed-off-by: Dmitry Safonov <dima@arista.com>

Applied to net-next, thanks.

^ permalink raw reply

* lening
From: Funding BVBA Finances @ 2018-09-05 23:55 UTC (permalink / raw)





Goede dag


We zijn Funding BVBA Financiering van leningen per postadvertentie. Heeft u financi&euml;le hulp nodig? Een legitieme kredietwaardigheid voor rente nodig? Heeft u een zakelijke lening nodig? Heb je een lening nodig om een huis, auto te kopen, je rekeningen en schulden te betalen? Heeft u geld nodig om problemen op te lossen? (leningen op korte en lange termijn, persoonlijke leningen, leningen aan bedrijven, enz.) met een rente van 3%. Onze leningen vari&euml;ren van 5.000,00 tot 20.000.000,00 US Dollar of Euro of Pond met een maximale duur van 15 jaar.


INFORMATIE NODIG


Jullie namen:
Adres: ...........
Telefoon: ...........
Benodigde hoeveelheid: .......
Looptijd: ...............
Beroep: ...........
Maandelijks inkomensniveau: ......
Geslacht: ..............
Geboortedatum: .......
Staat: ...............
Land: .........
Doel: .........


"We laten u een betere manier zien om uw financi&euml;le vrijheid te vergroten"


Met vriendelijke groet,


Sigurd Pedersen



+448709453058

info@fundingbvbafinances.com

Website: http://www.fundingbvbafinance.com

^ permalink raw reply

* [PATCH bpf-next 4/4] tools/bpf: bpftool: add net support
From: Yonghong Song @ 2018-09-05 23:58 UTC (permalink / raw)
  To: ast, daniel, netdev; +Cc: kernel-team
In-Reply-To: <20180905235806.1536396-1-yhs@fb.com>

Add "bpftool net" support. Networking devices are enumerated
to dump device index/name associated with xdp progs.

For each networking device, tc classes and qdiscs are enumerated
in order to check their bpf filters.
In addition, root handle and clsact ingress/egress are also checked for
bpf filters.  Not all filter information is printed out. Only ifindex,
kind, filter name, prog_id and tag are printed out, which are good
enough to show attachment information. If the filter action
is a bpf action, its bpf program id, bpf name and tag will be
printed out as well.

For example,
  $ ./bpftool net
  xdp [
  ifindex 2 devname eth0 prog_id 198
  ]
  tc_filters [
  ifindex 2 kind qdisc_htb name prefix_matcher.o:[cls_prefix_matcher_htb]
            prog_id 111727 tag d08fe3b4319bc2fd act []
  ifindex 2 kind qdisc_clsact_ingress name fbflow_icmp
            prog_id 130246 tag 3f265c7f26db62c9 act []
  ifindex 2 kind qdisc_clsact_egress name prefix_matcher.o:[cls_prefix_matcher_clsact]
            prog_id 111726 tag 99a197826974c876
  ifindex 2 kind qdisc_clsact_egress name cls_fg_dscp
            prog_id 108619 tag dc4630674fd72dcc act []
  ifindex 2 kind qdisc_clsact_egress name fbflow_egress
            prog_id 130245 tag 72d2d830d6888d2c
  ]
  $ ./bpftool -jp net
  [{
        "xdp": [{
                "ifindex": 2,
                "devname": "eth0",
                "prog_id": 198
            }
        ],
        "tc_filters": [{
                "ifindex": 2,
                "kind": "qdisc_htb",
                "name": "prefix_matcher.o:[cls_prefix_matcher_htb]",
                "prog_id": 111727,
                "tag": "d08fe3b4319bc2fd",
                "act": []
            },{
                "ifindex": 2,
                "kind": "qdisc_clsact_ingress",
                "name": "fbflow_icmp",
                "prog_id": 130246,
                "tag": "3f265c7f26db62c9",
                "act": []
            },{
                "ifindex": 2,
                "kind": "qdisc_clsact_egress",
                "name": "prefix_matcher.o:[cls_prefix_matcher_clsact]",
                "prog_id": 111726,
                "tag": "99a197826974c876"
            },{
                "ifindex": 2,
                "kind": "qdisc_clsact_egress",
                "name": "cls_fg_dscp",
                "prog_id": 108619,
                "tag": "dc4630674fd72dcc",
                "act": []
            },{
                "ifindex": 2,
                "kind": "qdisc_clsact_egress",
                "name": "fbflow_egress",
                "prog_id": 130245,
                "tag": "72d2d830d6888d2c"
            }
        ]
    }
  ]

Signed-off-by: Yonghong Song <yhs@fb.com>
---
 .../bpf/bpftool/Documentation/bpftool-net.rst | 133 ++++++++++
 tools/bpf/bpftool/Documentation/bpftool.rst   |   6 +-
 tools/bpf/bpftool/bash-completion/bpftool     |  17 +-
 tools/bpf/bpftool/main.c                      |   3 +-
 tools/bpf/bpftool/main.h                      |   7 +
 tools/bpf/bpftool/net.c                       | 233 ++++++++++++++++++
 tools/bpf/bpftool/netlink_dumper.c            | 181 ++++++++++++++
 tools/bpf/bpftool/netlink_dumper.h            | 103 ++++++++
 8 files changed, 676 insertions(+), 7 deletions(-)
 create mode 100644 tools/bpf/bpftool/Documentation/bpftool-net.rst
 create mode 100644 tools/bpf/bpftool/net.c
 create mode 100644 tools/bpf/bpftool/netlink_dumper.c
 create mode 100644 tools/bpf/bpftool/netlink_dumper.h

diff --git a/tools/bpf/bpftool/Documentation/bpftool-net.rst b/tools/bpf/bpftool/Documentation/bpftool-net.rst
new file mode 100644
index 000000000000..48a61837a264
--- /dev/null
+++ b/tools/bpf/bpftool/Documentation/bpftool-net.rst
@@ -0,0 +1,133 @@
+================
+bpftool-net
+================
+-------------------------------------------------------------------------------
+tool for inspection of netdev/tc related bpf prog attachments
+-------------------------------------------------------------------------------
+
+:Manual section: 8
+
+SYNOPSIS
+========
+
+	**bpftool** [*OPTIONS*] **net** *COMMAND*
+
+	*OPTIONS* := { [{ **-j** | **--json** }] [{ **-p** | **--pretty** }] }
+
+	*COMMANDS* :=
+	{ **show** | **list** } [ **dev** name ] | **help**
+
+NET COMMANDS
+============
+
+|	**bpftool** **net { show | list } [ dev name ]**
+|	**bpftool** **net help**
+
+DESCRIPTION
+===========
+	**bpftool net { show | list } [ dev name ]**
+		  List all networking device driver and tc attachment in the system.
+
+                  Output will start with all xdp program attachment, followed by
+                  all tc class/qdisc bpf program attachments. Both xdp programs and
+                  tc programs are ordered based on ifindex number. If multiple bpf
+                  programs attached to the same networking device through **tc filter**,
+                  the order will be first all bpf programs attached to tc classes, then
+                  all bpf programs attached to non clsact qdiscs, and finally all
+                  bpf programs attached to root and clsact qdisc.
+
+	**bpftool net help**
+		  Print short help message.
+
+OPTIONS
+=======
+	-h, --help
+		  Print short generic help message (similar to **bpftool help**).
+
+	-v, --version
+		  Print version number (similar to **bpftool version**).
+
+	-j, --json
+		  Generate JSON output. For commands that cannot produce JSON, this
+		  option has no effect.
+
+	-p, --pretty
+		  Generate human-readable JSON output. Implies **-j**.
+
+EXAMPLES
+========
+
+| **# bpftool net**
+
+::
+
+      xdp [
+      ifindex 2 devname eth0 prog_id 198
+      ]
+      tc_filters [
+      ifindex 2 kind qdisc_htb name prefix_matcher.o:[cls_prefix_matcher_htb]
+                prog_id 111727 tag d08fe3b4319bc2fd act []
+      ifindex 2 kind qdisc_clsact_ingress name fbflow_icmp
+                prog_id 130246 tag 3f265c7f26db62c9 act []
+      ifindex 2 kind qdisc_clsact_egress name prefix_matcher.o:[cls_prefix_matcher_clsact]
+                prog_id 111726 tag 99a197826974c876
+      ifindex 2 kind qdisc_clsact_egress name cls_fg_dscp
+                prog_id 108619 tag dc4630674fd72dcc act []
+      ifindex 2 kind qdisc_clsact_egress name fbflow_egress
+                prog_id 130245 tag 72d2d830d6888d2c
+      ]
+
+|
+| **# bpftool -jp net**
+
+::
+
+    [{
+            "xdp": [{
+                    "ifindex": 2,
+                    "devname": "eth0",
+                    "prog_id": 198
+                }
+            ],
+            "tc_filters": [{
+                    "ifindex": 2,
+                    "kind": "qdisc_htb",
+                    "name": "prefix_matcher.o:[cls_prefix_matcher_htb]",
+                    "prog_id": 111727,
+                    "tag": "d08fe3b4319bc2fd",
+                    "act": []
+                },{
+                    "ifindex": 2,
+                    "kind": "qdisc_clsact_ingress",
+                    "name": "fbflow_icmp",
+                    "prog_id": 130246,
+                    "tag": "3f265c7f26db62c9",
+                    "act": []
+                },{
+                    "ifindex": 2,
+                    "kind": "qdisc_clsact_egress",
+                    "name": "prefix_matcher.o:[cls_prefix_matcher_clsact]",
+                    "prog_id": 111726,
+                    "tag": "99a197826974c876"
+                },{
+                    "ifindex": 2,
+                    "kind": "qdisc_clsact_egress",
+                    "name": "cls_fg_dscp",
+                    "prog_id": 108619,
+                    "tag": "dc4630674fd72dcc",
+                    "act": []
+                },{
+                    "ifindex": 2,
+                    "kind": "qdisc_clsact_egress",
+                    "name": "fbflow_egress",
+                    "prog_id": 130245,
+                    "tag": "72d2d830d6888d2c"
+                }
+            ]
+        }
+    ]
+
+
+SEE ALSO
+========
+	**bpftool**\ (8), **bpftool-prog**\ (8), **bpftool-map**\ (8)
diff --git a/tools/bpf/bpftool/Documentation/bpftool.rst b/tools/bpf/bpftool/Documentation/bpftool.rst
index b6f5d560460d..8dda77daeda9 100644
--- a/tools/bpf/bpftool/Documentation/bpftool.rst
+++ b/tools/bpf/bpftool/Documentation/bpftool.rst
@@ -16,7 +16,7 @@ SYNOPSIS
 
 	**bpftool** **version**
 
-	*OBJECT* := { **map** | **program** | **cgroup** | **perf** }
+	*OBJECT* := { **map** | **program** | **cgroup** | **perf** | **net** }
 
 	*OPTIONS* := { { **-V** | **--version** } | { **-h** | **--help** }
 	| { **-j** | **--json** } [{ **-p** | **--pretty** }] }
@@ -32,6 +32,8 @@ SYNOPSIS
 
 	*PERF-COMMANDS* := { **show** | **list** | **help** }
 
+	*NET-COMMANDS* := { **show** | **list** | **help** }
+
 DESCRIPTION
 ===========
 	*bpftool* allows for inspection and simple modification of BPF objects
@@ -58,4 +60,4 @@ OPTIONS
 SEE ALSO
 ========
 	**bpftool-map**\ (8), **bpftool-prog**\ (8), **bpftool-cgroup**\ (8)
-        **bpftool-perf**\ (8)
+        **bpftool-perf**\ (8), **bpftool-net**\ (8)
diff --git a/tools/bpf/bpftool/bash-completion/bpftool b/tools/bpf/bpftool/bash-completion/bpftool
index 598066c40191..df1060b852c1 100644
--- a/tools/bpf/bpftool/bash-completion/bpftool
+++ b/tools/bpf/bpftool/bash-completion/bpftool
@@ -494,10 +494,10 @@ _bpftool()
                     _filedir
                     return 0
                     ;;
-		tree)
-		    _filedir
-		    return 0
-		    ;;
+                tree)
+                    _filedir
+                    return 0
+                    ;;
                 attach|detach)
                     local ATTACH_TYPES='ingress egress sock_create sock_ops \
                         device bind4 bind6 post_bind4 post_bind6 connect4 \
@@ -552,6 +552,15 @@ _bpftool()
                     ;;
             esac
             ;;
+        net)
+            case $command in
+                *)
+                    [[ $prev == $object ]] && \
+                        COMPREPLY=( $( compgen -W 'help \
+                            show list' -- "$cur" ) )
+                    ;;
+            esac
+            ;;
     esac
 } &&
 complete -F _bpftool bpftool
diff --git a/tools/bpf/bpftool/main.c b/tools/bpf/bpftool/main.c
index d15a62be6cf0..79dc3f193547 100644
--- a/tools/bpf/bpftool/main.c
+++ b/tools/bpf/bpftool/main.c
@@ -85,7 +85,7 @@ static int do_help(int argc, char **argv)
 		"       %s batch file FILE\n"
 		"       %s version\n"
 		"\n"
-		"       OBJECT := { prog | map | cgroup | perf }\n"
+		"       OBJECT := { prog | map | cgroup | perf | net }\n"
 		"       " HELP_SPEC_OPTIONS "\n"
 		"",
 		bin_name, bin_name, bin_name);
@@ -215,6 +215,7 @@ static const struct cmd cmds[] = {
 	{ "map",	do_map },
 	{ "cgroup",	do_cgroup },
 	{ "perf",	do_perf },
+	{ "net",	do_net },
 	{ "version",	do_version },
 	{ 0 }
 };
diff --git a/tools/bpf/bpftool/main.h b/tools/bpf/bpftool/main.h
index 238e734d75b3..02dfbcb92a23 100644
--- a/tools/bpf/bpftool/main.h
+++ b/tools/bpf/bpftool/main.h
@@ -136,6 +136,7 @@ int do_map(int argc, char **arg);
 int do_event_pipe(int argc, char **argv);
 int do_cgroup(int argc, char **arg);
 int do_perf(int argc, char **arg);
+int do_net(int argc, char **arg);
 
 int prog_parse_fd(int *argc, char ***argv);
 int map_parse_fd(int *argc, char ***argv);
@@ -165,4 +166,10 @@ struct btf_dumper {
  */
 int btf_dumper_type(const struct btf_dumper *d, __u32 type_id,
 		    const void *data);
+
+struct nlattr;
+struct ifinfomsg;
+struct tcmsg;
+int do_xdp_dump(struct ifinfomsg *ifinfo, struct nlattr **tb);
+int do_filter_dump(struct tcmsg *ifinfo, struct nlattr **tb, const char *kind);
 #endif
diff --git a/tools/bpf/bpftool/net.c b/tools/bpf/bpftool/net.c
new file mode 100644
index 000000000000..77dd73dd9ade
--- /dev/null
+++ b/tools/bpf/bpftool/net.c
@@ -0,0 +1,233 @@
+// SPDX-License-Identifier: GPL-2.0+
+// Copyright (C) 2018 Facebook
+
+#define _GNU_SOURCE
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <libbpf.h>
+#include <net/if.h>
+#include <linux/if.h>
+#include <linux/rtnetlink.h>
+#include <linux/tc_act/tc_bpf.h>
+#include <sys/socket.h>
+
+#include <bpf.h>
+#include <nlattr.h>
+#include "main.h"
+#include "netlink_dumper.h"
+
+struct bpf_netdev_t {
+	int	*ifindex_array;
+	int	used_len;
+	int	array_len;
+	int	filter_idx;
+};
+
+struct tc_kind_handle {
+	char	kind[64];
+	int	handle;
+};
+
+struct bpf_tcinfo_t {
+	struct tc_kind_handle	*handle_array;
+	int			used_len;
+	int			array_len;
+	bool			is_qdisc;
+};
+
+static int dump_link_nlmsg(void *cookie, void *msg, struct nlattr **tb)
+{
+	struct bpf_netdev_t *netinfo = cookie;
+	struct ifinfomsg *ifinfo = msg;
+
+	if (netinfo->filter_idx > 0 && netinfo->filter_idx != ifinfo->ifi_index)
+		return 0;
+
+	if (netinfo->used_len == netinfo->array_len) {
+		netinfo->ifindex_array = realloc(netinfo->ifindex_array,
+			(netinfo->array_len + 16) * sizeof(int));
+		netinfo->array_len += 16;
+	}
+	netinfo->ifindex_array[netinfo->used_len++] = ifinfo->ifi_index;
+
+	return do_xdp_dump(ifinfo, tb);
+}
+
+static int dump_class_qdisc_nlmsg(void *cookie, void *msg, struct nlattr **tb)
+{
+	struct bpf_tcinfo_t *tcinfo = cookie;
+	struct tcmsg *info = msg;
+
+	if (tcinfo->is_qdisc) {
+		/* skip clsact qdisc */
+		if (tb[TCA_KIND] &&
+		    strcmp(nla_data(tb[TCA_KIND]), "clsact") == 0)
+			return 0;
+		if (info->tcm_handle == 0)
+			return 0;
+	}
+
+	if (tcinfo->used_len == tcinfo->array_len) {
+		tcinfo->handle_array = realloc(tcinfo->handle_array,
+			(tcinfo->array_len + 16) * sizeof(struct tc_kind_handle));
+		tcinfo->array_len += 16;
+	}
+	tcinfo->handle_array[tcinfo->used_len].handle = info->tcm_handle;
+	snprintf(tcinfo->handle_array[tcinfo->used_len].kind,
+		 sizeof(tcinfo->handle_array[tcinfo->used_len].kind),
+		 "%s_%s",
+		 tcinfo->is_qdisc ? "qdisc" : "class",
+		 tb[TCA_KIND] ? nla_getattr_str(tb[TCA_KIND]) : "unknown");
+	tcinfo->used_len++;
+
+	return 0;
+}
+
+static int dump_filter_nlmsg(void *cookie, void *msg, struct nlattr **tb)
+{
+	const char *kind = cookie;
+
+	return do_filter_dump((struct tcmsg *)msg, tb, kind);
+}
+
+static int show_dev_tc_bpf(int sock, unsigned int nl_pid, int ifindex)
+{
+	struct bpf_tcinfo_t tcinfo;
+	int i, handle, ret;
+
+	tcinfo.handle_array = NULL;
+	tcinfo.used_len = 0;
+	tcinfo.array_len = 0;
+
+	tcinfo.is_qdisc = false;
+	ret = nl_get_class(sock, nl_pid, ifindex, dump_class_qdisc_nlmsg,
+			   &tcinfo);
+	if (ret)
+		return ret;
+
+	tcinfo.is_qdisc = true;
+	ret = nl_get_qdisc(sock, nl_pid, ifindex, dump_class_qdisc_nlmsg,
+			   &tcinfo);
+	if (ret)
+		return ret;
+
+	for (i = 0; i < tcinfo.used_len; i++) {
+		ret = nl_get_filter(sock, nl_pid, ifindex,
+				    tcinfo.handle_array[i].handle,
+				    dump_filter_nlmsg,
+				    tcinfo.handle_array[i].kind);
+		if (ret)
+			return ret;
+	}
+
+	/* root, ingress and egress handle */
+	handle = TC_H_ROOT;
+	ret = nl_get_filter(sock, nl_pid, ifindex, handle, dump_filter_nlmsg,
+			    "root");
+	if (ret)
+		return ret;
+
+	handle = TC_H_MAKE(TC_H_CLSACT, TC_H_MIN_INGRESS);
+	ret = nl_get_filter(sock, nl_pid, ifindex, handle, dump_filter_nlmsg,
+			    "qdisc_clsact_ingress");
+	if (ret)
+		return ret;
+
+	handle = TC_H_MAKE(TC_H_CLSACT, TC_H_MIN_EGRESS);
+	ret = nl_get_filter(sock, nl_pid, ifindex, handle, dump_filter_nlmsg,
+			    "qdisc_clsact_egress");
+	if (ret)
+		return ret;
+
+	return 0;
+}
+
+static int do_show(int argc, char **argv)
+{
+	int i, sock, ret, filter_idx = -1;
+	struct bpf_netdev_t dev_array;
+	unsigned int nl_pid;
+	char err_buf[256];
+
+	if (argc == 2) {
+		if (strcmp(argv[0], "dev") != 0)
+			usage();
+		filter_idx = if_nametoindex(argv[1]);
+		if (filter_idx == 0) {
+			fprintf(stderr, "invalid dev name %s\n", argv[1]);
+			return -1;
+		}
+	} else if (argc != 0) {
+		usage();
+	}
+
+	sock = bpf_netlink_open(&nl_pid);
+	if (sock < 0) {
+		fprintf(stderr, "failed to open netlink sock\n");
+		return -1;
+	}
+
+	dev_array.ifindex_array = NULL;
+	dev_array.used_len = 0;
+	dev_array.array_len = 0;
+	dev_array.filter_idx = filter_idx;
+
+	if (json_output)
+		jsonw_start_array(json_wtr);
+	NET_START_OBJECT;
+	NET_START_ARRAY("xdp", "\n");
+	ret = nl_get_link(sock, nl_pid, dump_link_nlmsg, &dev_array);
+	NET_END_ARRAY("\n");
+
+	if (!ret) {
+		NET_START_ARRAY("tc_filters", "\n");
+		for (i = 0; i < dev_array.used_len; i++) {
+			ret = show_dev_tc_bpf(sock, nl_pid,
+					      dev_array.ifindex_array[i]);
+			if (ret)
+				break;
+		}
+		NET_END_ARRAY("\n");
+	}
+	NET_END_OBJECT;
+	if (json_output)
+		jsonw_end_array(json_wtr);
+
+	if (ret) {
+		if (json_output)
+			jsonw_null(json_wtr);
+		libbpf_strerror(ret, err_buf, sizeof(err_buf));
+		fprintf(stderr, "Error: %s\n", err_buf);
+	}
+	free(dev_array.ifindex_array);
+	close(sock);
+	return ret;
+}
+
+static int do_help(int argc, char **argv)
+{
+	if (json_output) {
+		jsonw_null(json_wtr);
+		return 0;
+	}
+
+	fprintf(stderr,
+		"Usage: %s %s { show | list } [dev <devname>]\n"
+		"       %s %s help\n",
+		bin_name, argv[-2], bin_name, argv[-2]);
+
+	return 0;
+}
+
+static const struct cmd cmds[] = {
+	{ "show",	do_show },
+	{ "list",	do_show },
+	{ "help",	do_help },
+	{ 0 }
+};
+
+int do_net(int argc, char **argv)
+{
+	return cmd_select(cmds, argc, argv, do_help);
+}
diff --git a/tools/bpf/bpftool/netlink_dumper.c b/tools/bpf/bpftool/netlink_dumper.c
new file mode 100644
index 000000000000..e12494fd1d2e
--- /dev/null
+++ b/tools/bpf/bpftool/netlink_dumper.c
@@ -0,0 +1,181 @@
+// SPDX-License-Identifier: GPL-2.0+
+// Copyright (C) 2018 Facebook
+
+#include <stdlib.h>
+#include <string.h>
+#include <libbpf.h>
+#include <linux/rtnetlink.h>
+#include <linux/tc_act/tc_bpf.h>
+
+#include <nlattr.h>
+#include "main.h"
+#include "netlink_dumper.h"
+
+static void xdp_dump_prog_id(struct nlattr **tb, int attr,
+			     const char *type)
+{
+	if (!tb[attr])
+		return;
+
+	NET_DUMP_UINT(type, nla_getattr_u32(tb[attr]))
+}
+
+static int do_xdp_dump_one(struct nlattr *attr, unsigned int ifindex,
+			   const char *name)
+{
+	struct nlattr *tb[IFLA_XDP_MAX + 1];
+	unsigned char mode;
+
+	if (nla_parse_nested(tb, IFLA_XDP_MAX, attr, NULL) < 0)
+		return -1;
+
+	if (!tb[IFLA_XDP_ATTACHED])
+		return 0;
+
+	mode = nla_getattr_u8(tb[IFLA_XDP_ATTACHED]);
+	if (mode == XDP_ATTACHED_NONE)
+		return 0;
+
+	NET_START_OBJECT;
+	NET_DUMP_UINT("ifindex", ifindex);
+
+	if (name)
+		NET_DUMP_STR("devname", name);
+
+	if (tb[IFLA_XDP_PROG_ID])
+		NET_DUMP_UINT("prog_id", nla_getattr_u32(tb[IFLA_XDP_PROG_ID]));
+
+	if (mode == XDP_ATTACHED_MULTI) {
+		xdp_dump_prog_id(tb, IFLA_XDP_SKB_PROG_ID, "generic_prog_id");
+		xdp_dump_prog_id(tb, IFLA_XDP_DRV_PROG_ID, "drv_prog_id");
+		xdp_dump_prog_id(tb, IFLA_XDP_HW_PROG_ID, "offload_prog_id");
+	}
+
+	NET_END_OBJECT_FINAL;
+	return 0;
+}
+
+int do_xdp_dump(struct ifinfomsg *ifinfo, struct nlattr **tb)
+{
+	if (!tb[IFLA_XDP])
+		return 0;
+
+	return do_xdp_dump_one(tb[IFLA_XDP], ifinfo->ifi_index,
+			       nla_getattr_str(tb[IFLA_IFNAME]));
+}
+
+static char *hexstring_n2a(const unsigned char *str, int len,
+			   char *buf, int blen)
+{
+	char *ptr = buf;
+	int i;
+
+	for (i = 0; i < len; i++) {
+		if (blen < 3)
+			break;
+		sprintf(ptr, "%02x", str[i]);
+		ptr += 2;
+		blen -= 2;
+	}
+	return buf;
+}
+
+static int do_bpf_dump_one_act(struct nlattr *attr)
+{
+	struct nlattr *tb[TCA_ACT_BPF_MAX + 1];
+	char buf[256];
+
+	if (nla_parse_nested(tb, TCA_ACT_BPF_MAX, attr, NULL) < 0)
+		return -LIBBPF_ERRNO__NLPARSE;
+
+	if (!tb[TCA_ACT_BPF_PARMS])
+		return -LIBBPF_ERRNO__NLPARSE;
+
+	NET_START_OBJECT_NESTED2;
+	if (tb[TCA_ACT_BPF_NAME])
+		NET_DUMP_STR("name", nla_getattr_str(tb[TCA_ACT_BPF_NAME]));
+	if (tb[TCA_ACT_BPF_ID])
+		NET_DUMP_UINT("bpf_id", nla_getattr_u32(tb[TCA_ACT_BPF_ID]));
+	if (tb[TCA_ACT_BPF_TAG])
+		NET_DUMP_STR("tag", hexstring_n2a(nla_data(tb[TCA_ACT_BPF_TAG]),
+						  nla_len(tb[TCA_ACT_BPF_TAG]),
+						  buf, sizeof(buf)));
+	NET_END_OBJECT_NESTED;
+	return 0;
+}
+
+static int do_dump_one_act(struct nlattr *attr)
+{
+	struct nlattr *tb[TCA_ACT_MAX + 1];
+
+	if (!attr)
+		return 0;
+
+	if (nla_parse_nested(tb, TCA_ACT_MAX, attr, NULL) < 0)
+		return -LIBBPF_ERRNO__NLPARSE;
+
+	if (tb[TCA_ACT_KIND] && strcmp(nla_data(tb[TCA_ACT_KIND]), "bpf") == 0)
+		return do_bpf_dump_one_act(tb[TCA_ACT_OPTIONS]);
+
+	return 0;
+}
+
+static int do_bpf_act_dump(struct nlattr *attr)
+{
+	struct nlattr *tb[TCA_ACT_MAX_PRIO + 1];
+	int act, ret;
+
+	if (nla_parse_nested(tb, TCA_ACT_MAX_PRIO, attr, NULL) < 0)
+		return -LIBBPF_ERRNO__NLPARSE;
+
+	NET_START_ARRAY("act", "");
+	for (act = 0; act <= TCA_ACT_MAX_PRIO; act++) {
+		ret = do_dump_one_act(tb[act]);
+		if (ret)
+			break;
+	}
+	NET_END_ARRAY(" ");
+
+	return ret;
+}
+
+static int do_bpf_filter_dump(struct nlattr *attr)
+{
+	struct nlattr *tb[TCA_BPF_MAX + 1];
+	char buf[256];
+	int ret;
+
+	if (nla_parse_nested(tb, TCA_BPF_MAX, attr, NULL) < 0)
+		return -LIBBPF_ERRNO__NLPARSE;
+
+	if (tb[TCA_BPF_NAME])
+		NET_DUMP_STR("name", nla_getattr_str(tb[TCA_BPF_NAME]));
+	if (tb[TCA_BPF_ID])
+		NET_DUMP_UINT("prog_id", nla_getattr_u32(tb[TCA_BPF_ID]));
+	if (tb[TCA_BPF_TAG])
+		NET_DUMP_STR("tag", hexstring_n2a(nla_data(tb[TCA_BPF_TAG]),
+						  nla_len(tb[TCA_BPF_TAG]),
+						  buf, sizeof(buf)));
+	if (tb[TCA_BPF_ACT]) {
+		ret = do_bpf_act_dump(tb[TCA_BPF_ACT]);
+		if (ret)
+			return ret;
+	}
+
+	return 0;
+}
+
+int do_filter_dump(struct tcmsg *info, struct nlattr **tb, const char *kind)
+{
+	int ret = 0;
+
+	if (tb[TCA_OPTIONS] && strcmp(nla_data(tb[TCA_KIND]), "bpf") == 0) {
+		NET_START_OBJECT;
+		NET_DUMP_UINT("ifindex", info->tcm_ifindex);
+		NET_DUMP_STR("kind", kind);
+		ret = do_bpf_filter_dump(tb[TCA_OPTIONS]);
+		NET_END_OBJECT_FINAL;
+	}
+
+	return ret;
+}
diff --git a/tools/bpf/bpftool/netlink_dumper.h b/tools/bpf/bpftool/netlink_dumper.h
new file mode 100644
index 000000000000..552d8851ac06
--- /dev/null
+++ b/tools/bpf/bpftool/netlink_dumper.h
@@ -0,0 +1,103 @@
+// SPDX-License-Identifier: GPL-2.0+
+// Copyright (C) 2018 Facebook
+
+#ifndef _NETLINK_DUMPER_H_
+#define _NETLINK_DUMPER_H_
+
+#define NET_START_OBJECT				\
+{							\
+	if (json_output)				\
+		jsonw_start_object(json_wtr);		\
+}
+
+#define NET_START_OBJECT_NESTED(name)			\
+{							\
+	if (json_output) {				\
+		jsonw_name(json_wtr, name);		\
+		jsonw_start_object(json_wtr);		\
+	} else {					\
+		fprintf(stderr, "%s {", name);		\
+	}						\
+}
+
+#define NET_START_OBJECT_NESTED2			\
+{							\
+	if (json_output)				\
+		jsonw_start_object(json_wtr);		\
+	else						\
+		fprintf(stderr, "{");			\
+}
+
+#define NET_END_OBJECT_NESTED				\
+{							\
+	if (json_output)				\
+		jsonw_end_object(json_wtr);		\
+	else						\
+		fprintf(stderr, "}");			\
+}
+
+#define NET_END_OBJECT					\
+{							\
+	if (json_output)				\
+		jsonw_end_object(json_wtr);		\
+}
+
+#define NET_END_OBJECT_FINAL				\
+{							\
+	if (json_output)				\
+		jsonw_end_object(json_wtr);		\
+	else						\
+		fprintf(stderr, "\n");			\
+}
+
+#define NET_START_ARRAY(name, newline)			\
+{							\
+	if (json_output) {				\
+		jsonw_name(json_wtr, name);		\
+		jsonw_start_array(json_wtr);		\
+	} else {					\
+		fprintf(stderr, "%s [%s", name, newline);\
+	}						\
+}
+
+#define NET_END_ARRAY(endstr)				\
+{							\
+	if (json_output)				\
+		jsonw_end_array(json_wtr);		\
+	else						\
+		fprintf(stderr, "]%s", endstr);		\
+}
+
+#define NET_DUMP_UINT(name, val)			\
+{							\
+	if (json_output)				\
+		jsonw_uint_field(json_wtr, name, val);	\
+	else						\
+		fprintf(stderr, "%s %d ", name, val);	\
+}
+
+#define NET_DUMP_LLUINT(name, val)			\
+{							\
+	if (json_output)				\
+		jsonw_lluint_field(json_wtr, name, val);\
+	else						\
+		fprintf(stderr, "%s %lld ", name, val);	\
+}
+
+#define NET_DUMP_STR(name, str)				\
+{							\
+	if (json_output)				\
+		jsonw_string_field(json_wtr, name, str);\
+	else						\
+		fprintf(stderr, "%s %s ", name, str);	\
+}
+
+#define NET_DUMP_STR_ONLY(str)				\
+{							\
+	if (json_output)				\
+		jsonw_string(json_wtr, str);		\
+	else						\
+		fprintf(stderr, "%s ", str);		\
+}
+
+#endif
-- 
2.17.1

^ permalink raw reply related

* [PATCH bpf-next 0/4] tools/bpf: add bpftool net support
From: Yonghong Song @ 2018-09-05 23:58 UTC (permalink / raw)
  To: ast, daniel, netdev; +Cc: kernel-team

As bpf usage becomes more pervasive, people starts to worry
about their cpu and memory cost. On a particular host,
people often wanted to know all running bpf programs
and their attachment context. So they can relate
a performance/memory anormly quickly to a particular bpf
program or an application.

bpftool already provides a pretty good coverage for perf
and cgroup related attachments. This patch set enabled
to dump attachment info for xdp and tc bpf programs.

Currently, users can already use "ip link show <dev>" and
"tc filter show dev <dev> ..." to dump bpf program attachment
information for xdp and tc bpf programs. The main reason
to implement such functionality in bpftool as well is for
better user experience. We want the bpftool to be the
ultimate tool for bpf introspection. The bpftool net
implementation will only present necessary bpf attachment
information to the user, ignoring most other ip/tc
specific information.

For example, the below is a pretty json print for xdp
and tc_filters.

  $ ./bpftool -jp net
  [{
        "xdp": [{
                "ifindex": 2,
                "devname": "eth0",
                "prog_id": 198
            }
        ],
        "tc_filters": [{
                "ifindex": 2,
                "kind": "qdisc_htb",
                "name": "prefix_matcher.o:[cls_prefix_matcher_htb]",
                "prog_id": 111727,
                "tag": "d08fe3b4319bc2fd",
                "act": []
            },{
                "ifindex": 2,
                "kind": "qdisc_clsact_ingress",
                "name": "fbflow_icmp",
                "prog_id": 130246,
                "tag": "3f265c7f26db62c9",
                "act": []
            },{
                "ifindex": 2,
                "kind": "qdisc_clsact_egress",
                "name": "prefix_matcher.o:[cls_prefix_matcher_clsact]",
                "prog_id": 111726,
                "tag": "99a197826974c876"
            },{
                "ifindex": 2,
                "kind": "qdisc_clsact_egress",
                "name": "cls_fg_dscp",
                "prog_id": 108619,
                "tag": "dc4630674fd72dcc",
                "act": []
            },{
                "ifindex": 2,
                "kind": "qdisc_clsact_egress",
                "name": "fbflow_egress",
                "prog_id": 130245,
                "tag": "72d2d830d6888d2c"
            }
        ]
    }
  ]

Patch #1 synced kernel uapi header if_link.h to tools directory.
Patch #2 moved tools/bpf/lib/bpf.c netlink related functions to
a new file. Patch #3 implemented additional functions
in libbpf which will be used in Patch #4.
Patch #4 implemented bpftool net support to dump
xdp and tc bpf program attachments.

Yonghong Song (4):
  tools/bpf: sync kernel uapi header if_link.h to tools
  tools/bpf: move bpf/lib netlink related functions into a new file
  tools/bpf: add more netlink functionalities in lib/bpf
  tools/bpf: bpftool: add net support

 .../bpf/bpftool/Documentation/bpftool-net.rst | 133 +++++++
 tools/bpf/bpftool/Documentation/bpftool.rst   |   6 +-
 tools/bpf/bpftool/bash-completion/bpftool     |  17 +-
 tools/bpf/bpftool/main.c                      |   3 +-
 tools/bpf/bpftool/main.h                      |   7 +
 tools/bpf/bpftool/net.c                       | 233 +++++++++++++
 tools/bpf/bpftool/netlink_dumper.c            | 181 ++++++++++
 tools/bpf/bpftool/netlink_dumper.h            | 103 ++++++
 tools/include/uapi/linux/if_link.h            |  17 +
 tools/lib/bpf/Build                           |   2 +-
 tools/lib/bpf/bpf.c                           | 129 -------
 tools/lib/bpf/libbpf.h                        |  16 +
 tools/lib/bpf/libbpf_errno.c                  |   1 +
 tools/lib/bpf/netlink.c                       | 324 ++++++++++++++++++
 tools/lib/bpf/nlattr.c                        |  33 +-
 tools/lib/bpf/nlattr.h                        |  38 ++
 16 files changed, 1094 insertions(+), 149 deletions(-)
 create mode 100644 tools/bpf/bpftool/Documentation/bpftool-net.rst
 create mode 100644 tools/bpf/bpftool/net.c
 create mode 100644 tools/bpf/bpftool/netlink_dumper.c
 create mode 100644 tools/bpf/bpftool/netlink_dumper.h
 create mode 100644 tools/lib/bpf/netlink.c

-- 
2.17.1

^ permalink raw reply

* [PATCH bpf-next 3/4] tools/bpf: add more netlink functionalities in lib/bpf
From: Yonghong Song @ 2018-09-05 23:58 UTC (permalink / raw)
  To: ast, daniel, netdev; +Cc: kernel-team
In-Reply-To: <20180905235806.1536396-1-yhs@fb.com>

This patch added a few netlink attribute parsing functions
and the netlink API functions to query networking links, tc classes,
tc qdiscs and tc filters. For example, the following API is
to get networking links:
  int nl_get_link(int sock, unsigned int nl_pid,
                  dump_nlmsg_t dump_link_nlmsg,
                  void *cookie);

Note that when the API is called, the user also provided a
callback function with the following signature:
  int (*dump_nlmsg_t)(void *cookie, void *msg, struct nlattr **tb);

The "cookie" is the parameter the user passed to the API and will
be available for the callback function.
The "msg" is the information about the result, e.g., ifinfomsg or
tcmsg. The "tb" is the parsed netlink attributes.

Signed-off-by: Yonghong Song <yhs@fb.com>
---
 tools/lib/bpf/libbpf.h       |  16 ++++
 tools/lib/bpf/libbpf_errno.c |   1 +
 tools/lib/bpf/netlink.c      | 165 ++++++++++++++++++++++++++++++++++-
 tools/lib/bpf/nlattr.c       |  33 ++++---
 tools/lib/bpf/nlattr.h       |  38 ++++++++
 5 files changed, 238 insertions(+), 15 deletions(-)

diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h
index 96c55fac54c3..e3b00e23e181 100644
--- a/tools/lib/bpf/libbpf.h
+++ b/tools/lib/bpf/libbpf.h
@@ -46,6 +46,7 @@ enum libbpf_errno {
 	LIBBPF_ERRNO__PROGTYPE,	/* Kernel doesn't support this program type */
 	LIBBPF_ERRNO__WRNGPID,	/* Wrong pid in netlink message */
 	LIBBPF_ERRNO__INVSEQ,	/* Invalid netlink sequence */
+	LIBBPF_ERRNO__NLPARSE,	/* netlink parsing error */
 	__LIBBPF_ERRNO__END,
 };
 
@@ -297,4 +298,19 @@ int bpf_perf_event_read_simple(void *mem, unsigned long size,
 			       unsigned long page_size,
 			       void **buf, size_t *buf_len,
 			       bpf_perf_event_print_t fn, void *priv);
+
+struct nlmsghdr;
+struct nlattr;
+typedef int (*dump_nlmsg_t)(void *cookie, void *msg, struct nlattr **tb);
+typedef int (*__dump_nlmsg_t)(struct nlmsghdr *nlmsg, dump_nlmsg_t,
+			      void *cookie);
+int bpf_netlink_open(unsigned int *nl_pid);
+int nl_get_link(int sock, unsigned int nl_pid, dump_nlmsg_t dump_link_nlmsg,
+		void *cookie);
+int nl_get_class(int sock, unsigned int nl_pid, int ifindex,
+		 dump_nlmsg_t dump_class_nlmsg, void *cookie);
+int nl_get_qdisc(int sock, unsigned int nl_pid, int ifindex,
+		 dump_nlmsg_t dump_qdisc_nlmsg, void *cookie);
+int nl_get_filter(int sock, unsigned int nl_pid, int ifindex, int handle,
+		  dump_nlmsg_t dump_filter_nlmsg, void *cookie);
 #endif
diff --git a/tools/lib/bpf/libbpf_errno.c b/tools/lib/bpf/libbpf_errno.c
index d9ba851bd7f9..2464ade3b326 100644
--- a/tools/lib/bpf/libbpf_errno.c
+++ b/tools/lib/bpf/libbpf_errno.c
@@ -42,6 +42,7 @@ static const char *libbpf_strerror_table[NR_ERRNO] = {
 	[ERRCODE_OFFSET(PROGTYPE)]	= "Kernel doesn't support this program type",
 	[ERRCODE_OFFSET(WRNGPID)]	= "Wrong pid in netlink message",
 	[ERRCODE_OFFSET(INVSEQ)]	= "Invalid netlink sequence",
+	[ERRCODE_OFFSET(NLPARSE)]	= "Incorrect netlink message parsing",
 };
 
 int libbpf_strerror(int err, char *buf, size_t size)
diff --git a/tools/lib/bpf/netlink.c b/tools/lib/bpf/netlink.c
index ccaa991fe9d8..469e068dd0c5 100644
--- a/tools/lib/bpf/netlink.c
+++ b/tools/lib/bpf/netlink.c
@@ -18,7 +18,7 @@
 #define SOL_NETLINK 270
 #endif
 
-static int bpf_netlink_open(__u32 *nl_pid)
+int bpf_netlink_open(__u32 *nl_pid)
 {
 	struct sockaddr_nl sa;
 	socklen_t addrlen;
@@ -61,7 +61,9 @@ static int bpf_netlink_open(__u32 *nl_pid)
 	return ret;
 }
 
-static int bpf_netlink_recv(int sock, __u32 nl_pid, int seq)
+static int bpf_netlink_recv(int sock, __u32 nl_pid, int seq,
+			    __dump_nlmsg_t _fn, dump_nlmsg_t fn,
+			    void *cookie)
 {
 	struct nlmsgerr *err;
 	struct nlmsghdr *nh;
@@ -98,6 +100,11 @@ static int bpf_netlink_recv(int sock, __u32 nl_pid, int seq)
 			default:
 				break;
 			}
+			if (_fn) {
+				ret = _fn(nh, fn, cookie);
+				if (ret)
+					return ret;
+			}
 		}
 	}
 	ret = 0;
@@ -157,9 +164,161 @@ int bpf_set_link_xdp_fd(int ifindex, int fd, __u32 flags)
 		ret = -errno;
 		goto cleanup;
 	}
-	ret = bpf_netlink_recv(sock, nl_pid, seq);
+	ret = bpf_netlink_recv(sock, nl_pid, seq, NULL, NULL, NULL);
 
 cleanup:
 	close(sock);
 	return ret;
 }
+
+static int __dump_link_nlmsg(struct nlmsghdr *nlh, dump_nlmsg_t dump_link_nlmsg,
+			     void *cookie)
+{
+	struct nlattr *tb[IFLA_MAX + 1], *attr;
+	struct ifinfomsg *ifi = NLMSG_DATA(nlh);
+	int len;
+
+	len = nlh->nlmsg_len - NLMSG_LENGTH(sizeof(*ifi));
+	attr = (struct nlattr *) ((void *) ifi + NLMSG_ALIGN(sizeof(*ifi)));
+	if (nla_parse(tb, IFLA_MAX, attr, len, NULL) != 0)
+		return -LIBBPF_ERRNO__NLPARSE;
+
+	return dump_link_nlmsg(cookie, ifi, tb);
+}
+
+int nl_get_link(int sock, unsigned int nl_pid, dump_nlmsg_t dump_link_nlmsg,
+		void *cookie)
+{
+	struct {
+		struct nlmsghdr nlh;
+		struct ifinfomsg ifm;
+	} req = {
+		.nlh.nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg)),
+		.nlh.nlmsg_type = RTM_GETLINK,
+		.nlh.nlmsg_flags = NLM_F_DUMP | NLM_F_REQUEST,
+		.ifm.ifi_family = AF_PACKET,
+	};
+	int seq = time(NULL);
+
+	req.nlh.nlmsg_seq = seq;
+	if (send(sock, &req, req.nlh.nlmsg_len, 0) < 0)
+		return -errno;
+
+	return bpf_netlink_recv(sock, nl_pid, seq, __dump_link_nlmsg,
+				dump_link_nlmsg, cookie);
+}
+
+static int __dump_class_nlmsg(struct nlmsghdr *nlh,
+			      dump_nlmsg_t dump_class_nlmsg, void *cookie)
+{
+	struct nlattr *tb[TCA_MAX + 1], *attr;
+	struct tcmsg *t = NLMSG_DATA(nlh);
+	int len;
+
+	len = nlh->nlmsg_len - NLMSG_LENGTH(sizeof(*t));
+	attr = (struct nlattr *) ((void *) t + NLMSG_ALIGN(sizeof(*t)));
+	if (nla_parse(tb, TCA_MAX, attr, len, NULL) != 0)
+		return -LIBBPF_ERRNO__NLPARSE;
+
+	return dump_class_nlmsg(cookie, t, tb);
+}
+
+int nl_get_class(int sock, unsigned int nl_pid, int ifindex,
+		 dump_nlmsg_t dump_class_nlmsg, void *cookie)
+{
+	struct {
+		struct nlmsghdr nlh;
+		struct tcmsg t;
+	} req = {
+		.nlh.nlmsg_len = NLMSG_LENGTH(sizeof(struct tcmsg)),
+		.nlh.nlmsg_type = RTM_GETTCLASS,
+		.nlh.nlmsg_flags = NLM_F_DUMP | NLM_F_REQUEST,
+		.t.tcm_family = AF_UNSPEC,
+		.t.tcm_ifindex = ifindex,
+	};
+	int seq = time(NULL);
+
+	req.nlh.nlmsg_seq = seq;
+	if (send(sock, &req, req.nlh.nlmsg_len, 0) < 0)
+		return -errno;
+
+	return bpf_netlink_recv(sock, nl_pid, seq, __dump_class_nlmsg,
+				dump_class_nlmsg, cookie);
+}
+
+static int __dump_qdisc_nlmsg(struct nlmsghdr *nlh,
+			      dump_nlmsg_t dump_qdisc_nlmsg, void *cookie)
+{
+	struct nlattr *tb[TCA_MAX + 1], *attr;
+	struct tcmsg *t = NLMSG_DATA(nlh);
+	int len;
+
+	len = nlh->nlmsg_len - NLMSG_LENGTH(sizeof(*t));
+	attr = (struct nlattr *) ((void *) t + NLMSG_ALIGN(sizeof(*t)));
+	if (nla_parse(tb, TCA_MAX, attr, len, NULL) != 0)
+		return -LIBBPF_ERRNO__NLPARSE;
+
+	return dump_qdisc_nlmsg(cookie, t, tb);
+}
+
+int nl_get_qdisc(int sock, unsigned int nl_pid, int ifindex,
+		 dump_nlmsg_t dump_qdisc_nlmsg, void *cookie)
+{
+	struct {
+		struct nlmsghdr nlh;
+		struct tcmsg t;
+	} req = {
+		.nlh.nlmsg_len = NLMSG_LENGTH(sizeof(struct tcmsg)),
+		.nlh.nlmsg_type = RTM_GETQDISC,
+		.nlh.nlmsg_flags = NLM_F_DUMP | NLM_F_REQUEST,
+		.t.tcm_family = AF_UNSPEC,
+		.t.tcm_ifindex = ifindex,
+	};
+	int seq = time(NULL);
+
+	req.nlh.nlmsg_seq = seq;
+	if (send(sock, &req, req.nlh.nlmsg_len, 0) < 0)
+		return -errno;
+
+	return bpf_netlink_recv(sock, nl_pid, seq, __dump_qdisc_nlmsg,
+				dump_qdisc_nlmsg, cookie);
+}
+
+static int __dump_filter_nlmsg(struct nlmsghdr *nlh,
+			       dump_nlmsg_t dump_filter_nlmsg, void *cookie)
+{
+	struct nlattr *tb[TCA_MAX + 1], *attr;
+	struct tcmsg *t = NLMSG_DATA(nlh);
+	int len;
+
+	len = nlh->nlmsg_len - NLMSG_LENGTH(sizeof(*t));
+	attr = (struct nlattr *) ((void *) t + NLMSG_ALIGN(sizeof(*t)));
+	if (nla_parse(tb, TCA_MAX, attr, len, NULL) != 0)
+		return -LIBBPF_ERRNO__NLPARSE;
+
+	return dump_filter_nlmsg(cookie, t, tb);
+}
+
+int nl_get_filter(int sock, unsigned int nl_pid, int ifindex, int handle,
+		  dump_nlmsg_t dump_filter_nlmsg, void *cookie)
+{
+	struct {
+		struct nlmsghdr nlh;
+		struct tcmsg t;
+	} req = {
+		.nlh.nlmsg_len = NLMSG_LENGTH(sizeof(struct tcmsg)),
+		.nlh.nlmsg_type = RTM_GETTFILTER,
+		.nlh.nlmsg_flags = NLM_F_DUMP | NLM_F_REQUEST,
+		.t.tcm_family = AF_UNSPEC,
+		.t.tcm_ifindex = ifindex,
+		.t.tcm_parent = handle,
+	};
+	int seq = time(NULL);
+
+	req.nlh.nlmsg_seq = seq;
+	if (send(sock, &req, req.nlh.nlmsg_len, 0) < 0)
+		return -errno;
+
+	return bpf_netlink_recv(sock, nl_pid, seq, __dump_filter_nlmsg,
+				dump_filter_nlmsg, cookie);
+}
diff --git a/tools/lib/bpf/nlattr.c b/tools/lib/bpf/nlattr.c
index 4719434278b2..49f514119bdb 100644
--- a/tools/lib/bpf/nlattr.c
+++ b/tools/lib/bpf/nlattr.c
@@ -26,11 +26,6 @@ static uint16_t nla_attr_minlen[NLA_TYPE_MAX+1] = {
 	[NLA_FLAG]	= 0,
 };
 
-static int nla_len(const struct nlattr *nla)
-{
-	return nla->nla_len - NLA_HDRLEN;
-}
-
 static struct nlattr *nla_next(const struct nlattr *nla, int *remaining)
 {
 	int totlen = NLA_ALIGN(nla->nla_len);
@@ -46,11 +41,6 @@ static int nla_ok(const struct nlattr *nla, int remaining)
 	       nla->nla_len <= remaining;
 }
 
-static void *nla_data(const struct nlattr *nla)
-{
-	return (char *) nla + NLA_HDRLEN;
-}
-
 static int nla_type(const struct nlattr *nla)
 {
 	return nla->nla_type & NLA_TYPE_MASK;
@@ -114,8 +104,8 @@ static inline int nlmsg_len(const struct nlmsghdr *nlh)
  * @see nla_validate
  * @return 0 on success or a negative error code.
  */
-static int nla_parse(struct nlattr *tb[], int maxtype, struct nlattr *head, int len,
-		     struct nla_policy *policy)
+int nla_parse(struct nlattr *tb[], int maxtype, struct nlattr *head, int len,
+	      struct nla_policy *policy)
 {
 	struct nlattr *nla;
 	int rem, err;
@@ -146,6 +136,25 @@ static int nla_parse(struct nlattr *tb[], int maxtype, struct nlattr *head, int
 	return err;
 }
 
+/**
+ * Create attribute index based on nested attribute
+ * @arg tb              Index array to be filled (maxtype+1 elements).
+ * @arg maxtype         Maximum attribute type expected and accepted.
+ * @arg nla             Nested Attribute.
+ * @arg policy          Attribute validation policy.
+ *
+ * Feeds the stream of attributes nested into the specified attribute
+ * to nla_parse().
+ *
+ * @see nla_parse
+ * @return 0 on success or a negative error code.
+ */
+int nla_parse_nested(struct nlattr *tb[], int maxtype, struct nlattr *nla,
+		     struct nla_policy *policy)
+{
+	return nla_parse(tb, maxtype, nla_data(nla), nla_len(nla), policy);
+}
+
 /* dump netlink extended ack error message */
 int nla_dump_errormsg(struct nlmsghdr *nlh)
 {
diff --git a/tools/lib/bpf/nlattr.h b/tools/lib/bpf/nlattr.h
index 931a71f68f93..a6e2396bce7c 100644
--- a/tools/lib/bpf/nlattr.h
+++ b/tools/lib/bpf/nlattr.h
@@ -67,6 +67,44 @@ struct nla_policy {
 	     nla_ok(pos, rem); \
 	     pos = nla_next(pos, &(rem)))
 
+/**
+ * nla_data - head of payload
+ * @nla: netlink attribute
+ */
+static inline void *nla_data(const struct nlattr *nla)
+{
+	return (char *) nla + NLA_HDRLEN;
+}
+
+static inline uint8_t nla_getattr_u8(const struct nlattr *nla)
+{
+	return *(uint8_t *)nla_data(nla);
+}
+
+static inline uint32_t nla_getattr_u32(const struct nlattr *nla)
+{
+	return *(uint32_t *)nla_data(nla);
+}
+
+static inline const char *nla_getattr_str(const struct nlattr *nla)
+{
+	return (const char *)nla_data(nla);
+}
+
+/**
+ * nla_len - length of payload
+ * @nla: netlink attribute
+ */
+static inline int nla_len(const struct nlattr *nla)
+{
+	return nla->nla_len - NLA_HDRLEN;
+}
+
+int nla_parse(struct nlattr *tb[], int maxtype, struct nlattr *head, int len,
+	      struct nla_policy *policy);
+int nla_parse_nested(struct nlattr *tb[], int maxtype, struct nlattr *nla,
+		     struct nla_policy *policy);
+
 int nla_dump_errormsg(struct nlmsghdr *nlh);
 
 #endif /* __NLATTR_H */
-- 
2.17.1

^ permalink raw reply related

* [PATCH bpf-next 1/4] tools/bpf: sync kernel uapi header if_link.h to tools
From: Yonghong Song @ 2018-09-05 23:58 UTC (permalink / raw)
  To: ast, daniel, netdev; +Cc: kernel-team
In-Reply-To: <20180905235806.1536396-1-yhs@fb.com>

Among others, this header will be used later for
bpftool net support.

Signed-off-by: Yonghong Song <yhs@fb.com>
---
 tools/include/uapi/linux/if_link.h | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/tools/include/uapi/linux/if_link.h b/tools/include/uapi/linux/if_link.h
index cf01b6824244..43391e2d1153 100644
--- a/tools/include/uapi/linux/if_link.h
+++ b/tools/include/uapi/linux/if_link.h
@@ -164,6 +164,8 @@ enum {
 	IFLA_CARRIER_UP_COUNT,
 	IFLA_CARRIER_DOWN_COUNT,
 	IFLA_NEW_IFINDEX,
+	IFLA_MIN_MTU,
+	IFLA_MAX_MTU,
 	__IFLA_MAX
 };
 
@@ -334,6 +336,7 @@ enum {
 	IFLA_BRPORT_GROUP_FWD_MASK,
 	IFLA_BRPORT_NEIGH_SUPPRESS,
 	IFLA_BRPORT_ISOLATED,
+	IFLA_BRPORT_BACKUP_PORT,
 	__IFLA_BRPORT_MAX
 };
 #define IFLA_BRPORT_MAX (__IFLA_BRPORT_MAX - 1)
@@ -459,6 +462,16 @@ enum {
 
 #define IFLA_MACSEC_MAX (__IFLA_MACSEC_MAX - 1)
 
+/* XFRM section */
+enum {
+	IFLA_XFRM_UNSPEC,
+	IFLA_XFRM_LINK,
+	IFLA_XFRM_IF_ID,
+	__IFLA_XFRM_MAX
+};
+
+#define IFLA_XFRM_MAX (__IFLA_XFRM_MAX - 1)
+
 enum macsec_validation_type {
 	MACSEC_VALIDATE_DISABLED = 0,
 	MACSEC_VALIDATE_CHECK = 1,
@@ -920,6 +933,7 @@ enum {
 	XDP_ATTACHED_DRV,
 	XDP_ATTACHED_SKB,
 	XDP_ATTACHED_HW,
+	XDP_ATTACHED_MULTI,
 };
 
 enum {
@@ -928,6 +942,9 @@ enum {
 	IFLA_XDP_ATTACHED,
 	IFLA_XDP_FLAGS,
 	IFLA_XDP_PROG_ID,
+	IFLA_XDP_DRV_PROG_ID,
+	IFLA_XDP_SKB_PROG_ID,
+	IFLA_XDP_HW_PROG_ID,
 	__IFLA_XDP_MAX,
 };
 
-- 
2.17.1

^ permalink raw reply related

* [PATCH bpf-next 2/4] tools/bpf: move bpf/lib netlink related functions into a new file
From: Yonghong Song @ 2018-09-05 23:58 UTC (permalink / raw)
  To: ast, daniel, netdev; +Cc: kernel-team
In-Reply-To: <20180905235806.1536396-1-yhs@fb.com>

There are no functionality change for this patch.

In the subsequent patches, more netlink related library functions
will be added and a separate file is better than cluttering bpf.c.

Signed-off-by: Yonghong Song <yhs@fb.com>
---
 tools/lib/bpf/Build     |   2 +-
 tools/lib/bpf/bpf.c     | 129 -------------------------------
 tools/lib/bpf/netlink.c | 165 ++++++++++++++++++++++++++++++++++++++++
 3 files changed, 166 insertions(+), 130 deletions(-)
 create mode 100644 tools/lib/bpf/netlink.c

diff --git a/tools/lib/bpf/Build b/tools/lib/bpf/Build
index 13a861135127..512b2c0ba0d2 100644
--- a/tools/lib/bpf/Build
+++ b/tools/lib/bpf/Build
@@ -1 +1 @@
-libbpf-y := libbpf.o bpf.o nlattr.o btf.o libbpf_errno.o
+libbpf-y := libbpf.o bpf.o nlattr.o btf.o libbpf_errno.o netlink.o
diff --git a/tools/lib/bpf/bpf.c b/tools/lib/bpf/bpf.c
index 60aa4ca8b2c5..3878a26a2071 100644
--- a/tools/lib/bpf/bpf.c
+++ b/tools/lib/bpf/bpf.c
@@ -28,16 +28,8 @@
 #include <linux/bpf.h>
 #include "bpf.h"
 #include "libbpf.h"
-#include "nlattr.h"
-#include <linux/rtnetlink.h>
-#include <linux/if_link.h>
-#include <sys/socket.h>
 #include <errno.h>
 
-#ifndef SOL_NETLINK
-#define SOL_NETLINK 270
-#endif
-
 /*
  * When building perf, unistd.h is overridden. __NR_bpf is
  * required to be defined explicitly.
@@ -499,127 +491,6 @@ int bpf_raw_tracepoint_open(const char *name, int prog_fd)
 	return sys_bpf(BPF_RAW_TRACEPOINT_OPEN, &attr, sizeof(attr));
 }
 
-int bpf_set_link_xdp_fd(int ifindex, int fd, __u32 flags)
-{
-	struct sockaddr_nl sa;
-	int sock, seq = 0, len, ret = -1;
-	char buf[4096];
-	struct nlattr *nla, *nla_xdp;
-	struct {
-		struct nlmsghdr  nh;
-		struct ifinfomsg ifinfo;
-		char             attrbuf[64];
-	} req;
-	struct nlmsghdr *nh;
-	struct nlmsgerr *err;
-	socklen_t addrlen;
-	int one = 1;
-
-	memset(&sa, 0, sizeof(sa));
-	sa.nl_family = AF_NETLINK;
-
-	sock = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);
-	if (sock < 0) {
-		return -errno;
-	}
-
-	if (setsockopt(sock, SOL_NETLINK, NETLINK_EXT_ACK,
-		       &one, sizeof(one)) < 0) {
-		fprintf(stderr, "Netlink error reporting not supported\n");
-	}
-
-	if (bind(sock, (struct sockaddr *)&sa, sizeof(sa)) < 0) {
-		ret = -errno;
-		goto cleanup;
-	}
-
-	addrlen = sizeof(sa);
-	if (getsockname(sock, (struct sockaddr *)&sa, &addrlen) < 0) {
-		ret = -errno;
-		goto cleanup;
-	}
-
-	if (addrlen != sizeof(sa)) {
-		ret = -LIBBPF_ERRNO__INTERNAL;
-		goto cleanup;
-	}
-
-	memset(&req, 0, sizeof(req));
-	req.nh.nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg));
-	req.nh.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
-	req.nh.nlmsg_type = RTM_SETLINK;
-	req.nh.nlmsg_pid = 0;
-	req.nh.nlmsg_seq = ++seq;
-	req.ifinfo.ifi_family = AF_UNSPEC;
-	req.ifinfo.ifi_index = ifindex;
-
-	/* started nested attribute for XDP */
-	nla = (struct nlattr *)(((char *)&req)
-				+ NLMSG_ALIGN(req.nh.nlmsg_len));
-	nla->nla_type = NLA_F_NESTED | IFLA_XDP;
-	nla->nla_len = NLA_HDRLEN;
-
-	/* add XDP fd */
-	nla_xdp = (struct nlattr *)((char *)nla + nla->nla_len);
-	nla_xdp->nla_type = IFLA_XDP_FD;
-	nla_xdp->nla_len = NLA_HDRLEN + sizeof(int);
-	memcpy((char *)nla_xdp + NLA_HDRLEN, &fd, sizeof(fd));
-	nla->nla_len += nla_xdp->nla_len;
-
-	/* if user passed in any flags, add those too */
-	if (flags) {
-		nla_xdp = (struct nlattr *)((char *)nla + nla->nla_len);
-		nla_xdp->nla_type = IFLA_XDP_FLAGS;
-		nla_xdp->nla_len = NLA_HDRLEN + sizeof(flags);
-		memcpy((char *)nla_xdp + NLA_HDRLEN, &flags, sizeof(flags));
-		nla->nla_len += nla_xdp->nla_len;
-	}
-
-	req.nh.nlmsg_len += NLA_ALIGN(nla->nla_len);
-
-	if (send(sock, &req, req.nh.nlmsg_len, 0) < 0) {
-		ret = -errno;
-		goto cleanup;
-	}
-
-	len = recv(sock, buf, sizeof(buf), 0);
-	if (len < 0) {
-		ret = -errno;
-		goto cleanup;
-	}
-
-	for (nh = (struct nlmsghdr *)buf; NLMSG_OK(nh, len);
-	     nh = NLMSG_NEXT(nh, len)) {
-		if (nh->nlmsg_pid != sa.nl_pid) {
-			ret = -LIBBPF_ERRNO__WRNGPID;
-			goto cleanup;
-		}
-		if (nh->nlmsg_seq != seq) {
-			ret = -LIBBPF_ERRNO__INVSEQ;
-			goto cleanup;
-		}
-		switch (nh->nlmsg_type) {
-		case NLMSG_ERROR:
-			err = (struct nlmsgerr *)NLMSG_DATA(nh);
-			if (!err->error)
-				continue;
-			ret = err->error;
-			nla_dump_errormsg(nh);
-			goto cleanup;
-		case NLMSG_DONE:
-			break;
-		default:
-			break;
-		}
-	}
-
-	ret = 0;
-
-cleanup:
-	close(sock);
-	return ret;
-}
-
 int bpf_load_btf(void *btf, __u32 btf_size, char *log_buf, __u32 log_buf_size,
 		 bool do_log)
 {
diff --git a/tools/lib/bpf/netlink.c b/tools/lib/bpf/netlink.c
new file mode 100644
index 000000000000..ccaa991fe9d8
--- /dev/null
+++ b/tools/lib/bpf/netlink.c
@@ -0,0 +1,165 @@
+// SPDX-License-Identifier: LGPL-2.1
+/* Copyright (c) 2018 Facebook */
+
+#include <stdlib.h>
+#include <memory.h>
+#include <unistd.h>
+#include <linux/bpf.h>
+#include <linux/rtnetlink.h>
+#include <sys/socket.h>
+#include <errno.h>
+#include <time.h>
+
+#include "bpf.h"
+#include "libbpf.h"
+#include "nlattr.h"
+
+#ifndef SOL_NETLINK
+#define SOL_NETLINK 270
+#endif
+
+static int bpf_netlink_open(__u32 *nl_pid)
+{
+	struct sockaddr_nl sa;
+	socklen_t addrlen;
+	int one = 1, ret;
+	int sock;
+
+	memset(&sa, 0, sizeof(sa));
+	sa.nl_family = AF_NETLINK;
+
+	sock = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);
+	if (sock < 0)
+		return -errno;
+
+	if (setsockopt(sock, SOL_NETLINK, NETLINK_EXT_ACK,
+		       &one, sizeof(one)) < 0) {
+		fprintf(stderr, "Netlink error reporting not supported\n");
+	}
+
+	if (bind(sock, (struct sockaddr *)&sa, sizeof(sa)) < 0) {
+		ret = -errno;
+		goto cleanup;
+	}
+
+	addrlen = sizeof(sa);
+	if (getsockname(sock, (struct sockaddr *)&sa, &addrlen) < 0) {
+		ret = -errno;
+		goto cleanup;
+	}
+
+	if (addrlen != sizeof(sa)) {
+		ret = -LIBBPF_ERRNO__INTERNAL;
+		goto cleanup;
+	}
+
+	*nl_pid = sa.nl_pid;
+	return sock;
+
+cleanup:
+	close(sock);
+	return ret;
+}
+
+static int bpf_netlink_recv(int sock, __u32 nl_pid, int seq)
+{
+	struct nlmsgerr *err;
+	struct nlmsghdr *nh;
+	char buf[4096];
+	int len, ret;
+
+	while (1) {
+		len = recv(sock, buf, sizeof(buf), 0);
+		if (len < 0) {
+			ret = -errno;
+			goto done;
+		}
+
+		for (nh = (struct nlmsghdr *)buf; NLMSG_OK(nh, len);
+		     nh = NLMSG_NEXT(nh, len)) {
+			if (nh->nlmsg_pid != nl_pid) {
+				ret = -LIBBPF_ERRNO__WRNGPID;
+				goto done;
+			}
+			if (nh->nlmsg_seq != seq) {
+				ret = -LIBBPF_ERRNO__INVSEQ;
+				goto done;
+			}
+			switch (nh->nlmsg_type) {
+			case NLMSG_ERROR:
+				err = (struct nlmsgerr *)NLMSG_DATA(nh);
+				if (!err->error)
+					continue;
+				ret = err->error;
+				nla_dump_errormsg(nh);
+				goto done;
+			case NLMSG_DONE:
+				return 0;
+			default:
+				break;
+			}
+		}
+	}
+	ret = 0;
+done:
+	return ret;
+}
+
+int bpf_set_link_xdp_fd(int ifindex, int fd, __u32 flags)
+{
+	int sock, seq = 0, ret;
+	struct nlattr *nla, *nla_xdp;
+	struct {
+		struct nlmsghdr  nh;
+		struct ifinfomsg ifinfo;
+		char             attrbuf[64];
+	} req;
+	__u32 nl_pid;
+
+	sock = bpf_netlink_open(&nl_pid);
+	if (sock < 0)
+		return sock;
+
+	memset(&req, 0, sizeof(req));
+	req.nh.nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg));
+	req.nh.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
+	req.nh.nlmsg_type = RTM_SETLINK;
+	req.nh.nlmsg_pid = 0;
+	req.nh.nlmsg_seq = ++seq;
+	req.ifinfo.ifi_family = AF_UNSPEC;
+	req.ifinfo.ifi_index = ifindex;
+
+	/* started nested attribute for XDP */
+	nla = (struct nlattr *)(((char *)&req)
+				+ NLMSG_ALIGN(req.nh.nlmsg_len));
+	nla->nla_type = NLA_F_NESTED | IFLA_XDP;
+	nla->nla_len = NLA_HDRLEN;
+
+	/* add XDP fd */
+	nla_xdp = (struct nlattr *)((char *)nla + nla->nla_len);
+	nla_xdp->nla_type = IFLA_XDP_FD;
+	nla_xdp->nla_len = NLA_HDRLEN + sizeof(int);
+	memcpy((char *)nla_xdp + NLA_HDRLEN, &fd, sizeof(fd));
+	nla->nla_len += nla_xdp->nla_len;
+
+	/* if user passed in any flags, add those too */
+	if (flags) {
+		nla_xdp = (struct nlattr *)((char *)nla + nla->nla_len);
+		nla_xdp->nla_type = IFLA_XDP_FLAGS;
+		nla_xdp->nla_len = NLA_HDRLEN + sizeof(flags);
+		memcpy((char *)nla_xdp + NLA_HDRLEN, &flags, sizeof(flags));
+		nla->nla_len += nla_xdp->nla_len;
+	}
+
+	req.nh.nlmsg_len += NLA_ALIGN(nla->nla_len);
+
+	if (send(sock, &req, req.nh.nlmsg_len, 0) < 0) {
+		ret = -errno;
+		goto cleanup;
+	}
+	ret = bpf_netlink_recv(sock, nl_pid, seq);
+
+cleanup:
+	close(sock);
+	return ret;
+}
-- 
2.17.1

^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox