Netdev List

Netdev List
 help / color / mirror / Atom feed

* [PATCH bpf-next] bpf: offload: add priv field for drivers
From: Jakub Kicinski @ 2019-02-12  8:20 UTC (permalink / raw)
  To: alexei.starovoitov, daniel; +Cc: oss-drivers, netdev, Jakub Kicinski

Currently bpf_offload_dev does not have any priv pointer, forcing
the drivers to work backwards from the netdev in program metadata.
This is not great given programs are conceptually associated with
the offload device, and it means one or two unnecessary dereferences.
Add a priv pointer to bpf_offload_dev.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Quentin Monnet <quentin.monnet@netronome.com>
---
 drivers/net/ethernet/netronome/nfp/bpf/main.c    |  2 +-
 drivers/net/ethernet/netronome/nfp/bpf/offload.c |  4 +---
 drivers/net/netdevsim/bpf.c                      |  5 +++--
 include/linux/bpf.h                              |  3 ++-
 kernel/bpf/offload.c                             | 10 +++++++++-
 5 files changed, 16 insertions(+), 8 deletions(-)

diff --git a/drivers/net/ethernet/netronome/nfp/bpf/main.c b/drivers/net/ethernet/netronome/nfp/bpf/main.c
index dccae0319204..275de9f4c61c 100644
--- a/drivers/net/ethernet/netronome/nfp/bpf/main.c
+++ b/drivers/net/ethernet/netronome/nfp/bpf/main.c
@@ -465,7 +465,7 @@ static int nfp_bpf_init(struct nfp_app *app)
 		app->ctrl_mtu = nfp_bpf_ctrl_cmsg_mtu(bpf);
 	}
 
-	bpf->bpf_dev = bpf_offload_dev_create(&nfp_bpf_dev_ops);
+	bpf->bpf_dev = bpf_offload_dev_create(&nfp_bpf_dev_ops, bpf);
 	err = PTR_ERR_OR_ZERO(bpf->bpf_dev);
 	if (err)
 		goto err_free_neutral_maps;
diff --git a/drivers/net/ethernet/netronome/nfp/bpf/offload.c b/drivers/net/ethernet/netronome/nfp/bpf/offload.c
index 55c7dbf8b421..15dce97650a5 100644
--- a/drivers/net/ethernet/netronome/nfp/bpf/offload.c
+++ b/drivers/net/ethernet/netronome/nfp/bpf/offload.c
@@ -185,8 +185,6 @@ static void nfp_prog_free(struct nfp_prog *nfp_prog)
 
 static int nfp_bpf_verifier_prep(struct bpf_prog *prog)
 {
-	struct nfp_net *nn = netdev_priv(prog->aux->offload->netdev);
-	struct nfp_app *app = nn->app;
 	struct nfp_prog *nfp_prog;
 	int ret;
 
@@ -197,7 +195,7 @@ static int nfp_bpf_verifier_prep(struct bpf_prog *prog)
 
 	INIT_LIST_HEAD(&nfp_prog->insns);
 	nfp_prog->type = prog->type;
-	nfp_prog->bpf = app->priv;
+	nfp_prog->bpf = bpf_offload_dev_priv(prog->aux->offload->offdev);
 
 	ret = nfp_prog_prepare(nfp_prog, prog->insnsi, prog->len);
 	if (ret)
diff --git a/drivers/net/netdevsim/bpf.c b/drivers/net/netdevsim/bpf.c
index 172b271c8bd2..f92c43453ec6 100644
--- a/drivers/net/netdevsim/bpf.c
+++ b/drivers/net/netdevsim/bpf.c
@@ -248,7 +248,7 @@ static int nsim_bpf_create_prog(struct netdevsim *ns, struct bpf_prog *prog)
 
 static int nsim_bpf_verifier_prep(struct bpf_prog *prog)
 {
-	struct netdevsim *ns = netdev_priv(prog->aux->offload->netdev);
+	struct netdevsim *ns = bpf_offload_dev_priv(prog->aux->offload->offdev);
 
 	if (!ns->bpf_bind_accept)
 		return -EOPNOTSUPP;
@@ -589,7 +589,8 @@ int nsim_bpf_init(struct netdevsim *ns)
 		if (IS_ERR_OR_NULL(ns->sdev->ddir_bpf_bound_progs))
 			return -ENOMEM;
 
-		ns->sdev->bpf_dev = bpf_offload_dev_create(&nsim_bpf_dev_ops);
+		ns->sdev->bpf_dev = bpf_offload_dev_create(&nsim_bpf_dev_ops,
+							   ns);
 		err = PTR_ERR_OR_ZERO(ns->sdev->bpf_dev);
 		if (err)
 			return err;
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index bd169a7bcc93..d3126ff4994a 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -767,8 +767,9 @@ int bpf_map_offload_get_next_key(struct bpf_map *map,
 bool bpf_offload_prog_map_match(struct bpf_prog *prog, struct bpf_map *map);
 
 struct bpf_offload_dev *
-bpf_offload_dev_create(const struct bpf_prog_offload_ops *ops);
+bpf_offload_dev_create(const struct bpf_prog_offload_ops *ops, void *priv);
 void bpf_offload_dev_destroy(struct bpf_offload_dev *offdev);
+void *bpf_offload_dev_priv(struct bpf_offload_dev *offdev);
 int bpf_offload_dev_netdev_register(struct bpf_offload_dev *offdev,
 				    struct net_device *netdev);
 void bpf_offload_dev_netdev_unregister(struct bpf_offload_dev *offdev,
diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c
index 39dba8c90331..ba635209ae9a 100644
--- a/kernel/bpf/offload.c
+++ b/kernel/bpf/offload.c
@@ -35,6 +35,7 @@ static DECLARE_RWSEM(bpf_devs_lock);
 struct bpf_offload_dev {
 	const struct bpf_prog_offload_ops *ops;
 	struct list_head netdevs;
+	void *priv;
 };
 
 struct bpf_offload_netdev {
@@ -669,7 +670,7 @@ void bpf_offload_dev_netdev_unregister(struct bpf_offload_dev *offdev,
 EXPORT_SYMBOL_GPL(bpf_offload_dev_netdev_unregister);
 
 struct bpf_offload_dev *
-bpf_offload_dev_create(const struct bpf_prog_offload_ops *ops)
+bpf_offload_dev_create(const struct bpf_prog_offload_ops *ops, void *priv)
 {
 	struct bpf_offload_dev *offdev;
 	int err;
@@ -688,6 +689,7 @@ bpf_offload_dev_create(const struct bpf_prog_offload_ops *ops)
 		return ERR_PTR(-ENOMEM);
 
 	offdev->ops = ops;
+	offdev->priv = priv;
 	INIT_LIST_HEAD(&offdev->netdevs);
 
 	return offdev;
@@ -700,3 +702,9 @@ void bpf_offload_dev_destroy(struct bpf_offload_dev *offdev)
 	kfree(offdev);
 }
 EXPORT_SYMBOL_GPL(bpf_offload_dev_destroy);
+
+void *bpf_offload_dev_priv(struct bpf_offload_dev *offdev)
+{
+	return offdev->priv;
+}
+EXPORT_SYMBOL_GPL(bpf_offload_dev_priv);
-- 
2.19.2


^ permalink raw reply related

* Re: [net-next PATCH 2/2] net: page_pool: don't use page->private to store dma_addr_t
From: Jesper Dangaard Brouer @ 2019-02-12  8:23 UTC (permalink / raw)
  To: Alexander Duyck
  Cc: Netdev, linux-mm, Toke Høiland-Jørgensen,
	Ilias Apalodimas, Matthew Wilcox, Saeed Mahameed, Andrew Morton,
	Mel Gorman, David S. Miller, Tariq Toukan, brouer
In-Reply-To: <CAKgT0Ucw_HGaice7cjM7e_nYuvjU_TKVd54Yc_fHen1pZRkUJw@mail.gmail.com>

On Mon, 11 Feb 2019 11:31:13 -0800
Alexander Duyck <alexander.duyck@gmail.com> wrote:

> On Mon, Feb 11, 2019 at 8:07 AM Jesper Dangaard Brouer
> <brouer@redhat.com> wrote:
> >
> > From: Ilias Apalodimas <ilias.apalodimas@linaro.org>
> >
> > As pointed out by David Miller the current page_pool implementation
> > stores dma_addr_t in page->private.
> > This won't work on 32-bit platforms with 64-bit DMA addresses since the
> > page->private is an unsigned long and the dma_addr_t a u64.
> >
> > A previous patch is adding dma_addr_t on struct page to accommodate this.
> > This patch adapts the page_pool related functions to use the newly added
> > struct for storing and retrieving DMA addresses from network drivers.
> >
> > Signed-off-by: Ilias Apalodimas <ilias.apalodimas@linaro.org>
> > Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
> > ---
> >  net/core/page_pool.c |   13 +++++++++----
> >  1 file changed, 9 insertions(+), 4 deletions(-)
> >
> > diff --git a/net/core/page_pool.c b/net/core/page_pool.c
> > index 43a932cb609b..897a69a1477e 100644
> > --- a/net/core/page_pool.c
> > +++ b/net/core/page_pool.c
> > @@ -136,7 +136,9 @@ static struct page *__page_pool_alloc_pages_slow(struct page_pool *pool,
> >         if (!(pool->p.flags & PP_FLAG_DMA_MAP))
> >                 goto skip_dma_map;
> >
> > -       /* Setup DMA mapping: use page->private for DMA-addr
> > +       /* Setup DMA mapping: use 'struct page' area for storing DMA-addr
> > +        * since dma_addr_t can be either 32 or 64 bits and does not always fit
> > +        * into page private data (i.e 32bit cpu with 64bit DMA caps)
> >          * This mapping is kept for lifetime of page, until leaving pool.
> >          */
> >         dma = dma_map_page(pool->p.dev, page, 0,
> > @@ -146,7 +148,7 @@ static struct page *__page_pool_alloc_pages_slow(struct page_pool *pool,
> >                 put_page(page);
> >                 return NULL;
> >         }
> > -       set_page_private(page, dma); /* page->private = dma; */
> > +       page->dma_addr = dma;
> >
> >  skip_dma_map:
> >         /* When page just alloc'ed is should/must have refcnt 1. */
> > @@ -175,13 +177,16 @@ EXPORT_SYMBOL(page_pool_alloc_pages);
> >  static void __page_pool_clean_page(struct page_pool *pool,
> >                                    struct page *page)
> >  {
> > +       dma_addr_t dma;
> > +
> >         if (!(pool->p.flags & PP_FLAG_DMA_MAP))
> >                 return;
> >
> > +       dma = page->dma_addr;
> >         /* DMA unmap */
> > -       dma_unmap_page(pool->p.dev, page_private(page),
> > +       dma_unmap_page(pool->p.dev, dma,
> >                        PAGE_SIZE << pool->p.order, pool->p.dma_dir);
> > -       set_page_private(page, 0);
> > +       page->dma_addr = 0;
> >  }
> >
> >  /* Return a page to the page allocator, cleaning up our state */  
> 
> This comment is unrelated to this patch specifically, but applies more
> generally to the page_pool use of dma_unmap_page.
> 
> So just looking at this I am pretty sure the use of just
> dma_unmap_page isn't correct here. You should probably be using
> dma_unmap_page_attrs and specifically be passing the attribute
> DMA_ATTR_SKIP_CPU_SYNC so that you can tear down the mapping without
> invalidating the contents of the page.

It is unrelated to this patch, but YES you are right.  I was aware of
this, but it slipped my mind.  You were the one that taught me the
principle page_pool is based on, that we keep the DMA mapping, but
instead let the driver perform the DMA-sync operations.

Thanks for catching this!  I actually think that the current small
ARM64 board we are playing with at the moment (Espressobin) will have a
performance benefit from doing this.


> This is something that will work for most cases but if you run into a
> case where this is used with SWIOTLB in bounce buffer mode you would
> end up potentially corrupting data on the unmap call.

I do have a board, the MACCHIATObin, that operates with SWIOTLB bounce
buffers, which it is not supposed to, and that is something I'll hopefully
get around to fixing soon.  But we have not implemented use of page_pool
on that board yet. So, thanks for catching this.

-- 
Best regards,
  Jesper Dangaard Brouer
  MSc.CS, Principal Kernel Engineer at Red Hat
  LinkedIn: http://www.linkedin.com/in/brouer

^ permalink raw reply

* Re: [net-next PATCH 1/2] mm: add dma_addr_t to struct page
From: Jesper Dangaard Brouer @ 2019-02-12  8:28 UTC (permalink / raw)
  To: Andrew Morton
  Cc: netdev, linux-mm, Toke Høiland-Jørgensen,
	Ilias Apalodimas, willy, Saeed Mahameed, mgorman, David S. Miller,
	Tariq Toukan, brouer
In-Reply-To: <20190211121624.30c601d0fa4c0f972eeaf1c6@linux-foundation.org>

On Mon, 11 Feb 2019 12:16:24 -0800
Andrew Morton <akpm@linux-foundation.org> wrote:

> On Mon, 11 Feb 2019 17:06:46 +0100 Jesper Dangaard Brouer <brouer@redhat.com> wrote:
> 
> > The page_pool API is using page->private to store DMA addresses.
> > As pointed out by David Miller we can't use that on 32-bit architectures
> > with 64-bit DMA
> > 
> > This patch adds a new dma_addr_t struct to allow storing DMA addresses
> > 
> > ..
> >
> > --- a/include/linux/mm_types.h
> > +++ b/include/linux/mm_types.h
> > @@ -95,6 +95,14 @@ struct page {
> >  			 */
> >  			unsigned long private;
> >  		};
> > +		struct {	/* page_pool used by netstack */
> > +			/**
> > +			 * @dma_addr: Page_pool need to store DMA-addr, and
> > +			 * cannot use @private, as DMA-mappings can be 64-bit
> > +			 * even on 32-bit Architectures.
> > +			 */  
> 
> This comment is a bit awkward.  The discussion about why it doesn't use
> ->private is uninteresting going forward and is more material for a  
> changelog.
> 
> How about
> 
> 			/**
> 			 * @dma_addr: page_pool requires a 64-bit value even on
> 			 * 32-bit architectures.
> 			 */

Much better, I'll use that!

> Otherwise,
> 
> Acked-by: Andrew Morton <akpm@linux-foundation.org>

Thanks!

-- 
Best regards,
  Jesper Dangaard Brouer
  MSc.CS, Principal Kernel Engineer at Red Hat
  LinkedIn: http://www.linkedin.com/in/brouer

^ permalink raw reply

* Re: [PATCH] ser_gigaset: mark expected switch fall-through
From: Paul Bolle @ 2019-02-12  8:45 UTC (permalink / raw)
  To: Gustavo A. R. Silva
  Cc: gigaset307x-common, netdev, linux-kernel, Kees Cook, Karsten Keil
In-Reply-To: <20190211223444.GA29517@embeddedor>

Gustavo A. R. Silva schreef op ma 11-02-2019 om 16:34 [-0600]:
> In preparation to enabling -Wimplicit-fallthrough, mark switch
> cases where we are expecting to fall through.
> 
> This patch fixes the following warning:
> 
> drivers/isdn/gigaset/ser-gigaset.c: In function ‘gigaset_tty_ioctl’:
> drivers/isdn/gigaset/ser-gigaset.c:627:3: warning: this statement may fall through [-Wimplicit-fallthrough=]
>    switch (arg) {
>    ^~~~~~
> drivers/isdn/gigaset/ser-gigaset.c:638:2: note: here
>   default:
>   ^~~~~~~
> 
> Warning level 3 was used: -Wimplicit-fallthrough=3
> 
> Notice that, in this particular case, the code comment is modified
> in accordance with what GCC is expecting to find.
> 
> This patch is part of the ongoing efforts to enable
> -Wimplicit-fallthrough.
> 
> Signed-off-by: Gustavo A. R. Silva <gustavo@embeddedor.com>

Acked-by: Paul Bolle <pebolle@tiscali.nl>

Thanks,


Paul Bolle


^ permalink raw reply

* [PATCH 0/2] i40e: fix regression that enables AF_XDP ZC unconditionally
From: Björn Töpel @ 2019-02-12  8:52 UTC (permalink / raw)
  To: intel-wired-lan
  Cc: Björn Töpel, magnus.karlsson, magnus.karlsson, netdev,
	jan.sokolowski

This series addresses a recent AF_XDP zero-copy regression.

In commit f3fef2b6e1cc ("i40e: Remove umem from VSI") a regression was
introduced; When the VSI was reset, the setup code would try to enable
AF_XDP ZC unconditionally (as long as there was a umem placed in the
netdev._rx struct). Here, we add a bitmap to the VSI that tracks if a
certain queue pair has been "zero-copy enabled" via the ndo_bpf. The
bitmap is used in i40e_xsk_umem, and enables zero-copy if and only if
XDP is enabled, the corresponding qid in the bitmap is set and the
umem is non-NULL.

Thanks,
Björn

Björn Töpel (2):
  i40e: move i40e_xsk_umem function
  i40e: add tracking of AF_XDP ZC state for each queue pair

 drivers/net/ethernet/intel/i40e/i40e.h      | 16 ++----------
 drivers/net/ethernet/intel/i40e/i40e_main.c | 28 +++++++++++++++++++++
 drivers/net/ethernet/intel/i40e/i40e_xsk.c  |  3 +++
 3 files changed, 33 insertions(+), 14 deletions(-)

-- 
2.19.1

^ permalink raw reply

* [PATCH 1/2] i40e: move i40e_xsk_umem function
From: Björn Töpel @ 2019-02-12  8:52 UTC (permalink / raw)
  To: intel-wired-lan
  Cc: Björn Töpel, magnus.karlsson, magnus.karlsson, netdev,
	jan.sokolowski
In-Reply-To: <20190212085205.7848-1-bjorn.topel@gmail.com>

From: Björn Töpel <bjorn.topel@intel.com>

The i40e_xsk_umem function was explicitly inlined in i40e.h. There is
no reason for that, so move it to i40e_main.c instead.

Signed-off-by: Björn Töpel <bjorn.topel@intel.com>
---
 drivers/net/ethernet/intel/i40e/i40e.h      | 14 --------------
 drivers/net/ethernet/intel/i40e/i40e_main.c | 20 ++++++++++++++++++++
 2 files changed, 20 insertions(+), 14 deletions(-)

diff --git a/drivers/net/ethernet/intel/i40e/i40e.h b/drivers/net/ethernet/intel/i40e/i40e.h
index d684998ba2b0..cc583ad5236b 100644
--- a/drivers/net/ethernet/intel/i40e/i40e.h
+++ b/drivers/net/ethernet/intel/i40e/i40e.h
@@ -1096,20 +1096,6 @@ static inline bool i40e_enabled_xdp_vsi(struct i40e_vsi *vsi)
 	return !!vsi->xdp_prog;
 }
 
-static inline struct xdp_umem *i40e_xsk_umem(struct i40e_ring *ring)
-{
-	bool xdp_on = i40e_enabled_xdp_vsi(ring->vsi);
-	int qid = ring->queue_index;
-
-	if (ring_is_xdp(ring))
-		qid -= ring->vsi->alloc_queue_pairs;
-
-	if (!xdp_on)
-		return NULL;
-
-	return xdp_get_umem_from_qid(ring->vsi->netdev, qid);
-}
-
 int i40e_create_queue_channel(struct i40e_vsi *vsi, struct i40e_channel *ch);
 int i40e_set_bw_limit(struct i40e_vsi *vsi, u16 seid, u64 max_tx_rate);
 int i40e_add_del_cloud_filter(struct i40e_vsi *vsi,
diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c
index 44856a84738d..ba1a84a2c8e5 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_main.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_main.c
@@ -3063,6 +3063,26 @@ static void i40e_config_xps_tx_ring(struct i40e_ring *ring)
 			    ring->queue_index);
 }
 
+/**
+ * i40e_xsk_umem - Retrieve the AF_XDP ZC if XDP and ZC is enabled
+ * @ring: The Tx or Rx ring
+ *
+ * Returns the UMEM or NULL.
+ **/
+static struct xdp_umem *i40e_xsk_umem(struct i40e_ring *ring)
+{
+	bool xdp_on = i40e_enabled_xdp_vsi(ring->vsi);
+	int qid = ring->queue_index;
+
+	if (ring_is_xdp(ring))
+		qid -= ring->vsi->alloc_queue_pairs;
+
+	if (!xdp_on)
+		return NULL;
+
+	return xdp_get_umem_from_qid(ring->vsi->netdev, qid);
+}
+
 /**
  * i40e_configure_tx_ring - Configure a transmit ring context and rest
  * @ring: The Tx ring to configure
-- 
2.19.1


^ permalink raw reply related

* [PATCH 2/2] i40e: add tracking of AF_XDP ZC state for each queue pair
From: Björn Töpel @ 2019-02-12  8:52 UTC (permalink / raw)
  To: intel-wired-lan
  Cc: Björn Töpel, magnus.karlsson, magnus.karlsson, netdev,
	jan.sokolowski
In-Reply-To: <20190212085205.7848-1-bjorn.topel@gmail.com>

From: Björn Töpel <bjorn.topel@intel.com>

In commit f3fef2b6e1cc ("i40e: Remove umem from VSI") a regression was
introduced; When the VSI was reset, the setup code would try to enable
AF_XDP ZC unconditionally (as long as there was a umem placed in the
netdev._rx struct). Here, we add a bitmap to the VSI that tracks if a
certain queue pair has been "zero-copy enabled" via the ndo_bpf. The
bitmap is used in i40e_xsk_umem, and enables zero-copy if and only if
XDP is enabled, the corresponding qid in the bitmap is set and the
umem is non-NULL.

Fixes: f3fef2b6e1cc ("i40e: Remove umem from VSI")
Signed-off-by: Björn Töpel <bjorn.topel@intel.com>
---
 drivers/net/ethernet/intel/i40e/i40e.h      |  2 ++
 drivers/net/ethernet/intel/i40e/i40e_main.c | 10 +++++++++-
 drivers/net/ethernet/intel/i40e/i40e_xsk.c  |  3 +++
 3 files changed, 14 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/intel/i40e/i40e.h b/drivers/net/ethernet/intel/i40e/i40e.h
index cc583ad5236b..d3cc3427caad 100644
--- a/drivers/net/ethernet/intel/i40e/i40e.h
+++ b/drivers/net/ethernet/intel/i40e/i40e.h
@@ -790,6 +790,8 @@ struct i40e_vsi {
 
 	/* VSI specific handlers */
 	irqreturn_t (*irq_handler)(int irq, void *data);
+
+	unsigned long *af_xdp_zc_qps; /* tracks AF_XDP ZC enabled qps */
 } ____cacheline_internodealigned_in_smp;
 
 struct i40e_netdev_priv {
diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c
index ba1a84a2c8e5..0dd00d58c524 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_main.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_main.c
@@ -3077,7 +3077,7 @@ static struct xdp_umem *i40e_xsk_umem(struct i40e_ring *ring)
 	if (ring_is_xdp(ring))
 		qid -= ring->vsi->alloc_queue_pairs;
 
-	if (!xdp_on)
+	if (!xdp_on || !test_bit(qid, ring->vsi->af_xdp_zc_qps))
 		return NULL;
 
 	return xdp_get_umem_from_qid(ring->vsi->netdev, qid);
@@ -10076,6 +10076,12 @@ static int i40e_vsi_mem_alloc(struct i40e_pf *pf, enum i40e_vsi_type type)
 	hash_init(vsi->mac_filter_hash);
 	vsi->irqs_ready = false;
 
+	if (type == I40E_VSI_MAIN) {
+		vsi->af_xdp_zc_qps = bitmap_zalloc(pf->num_lan_qps, GFP_KERNEL);
+		if (!vsi->af_xdp_zc_qps)
+			goto err_rings;
+	}
+
 	ret = i40e_set_num_rings_in_vsi(vsi);
 	if (ret)
 		goto err_rings;
@@ -10094,6 +10100,7 @@ static int i40e_vsi_mem_alloc(struct i40e_pf *pf, enum i40e_vsi_type type)
 	goto unlock_pf;
 
 err_rings:
+	bitmap_free(vsi->af_xdp_zc_qps);
 	pf->next_vsi = i - 1;
 	kfree(vsi);
 unlock_pf:
@@ -10174,6 +10181,7 @@ static int i40e_vsi_clear(struct i40e_vsi *vsi)
 	i40e_put_lump(pf->qp_pile, vsi->base_queue, vsi->idx);
 	i40e_put_lump(pf->irq_pile, vsi->base_vector, vsi->idx);
 
+	bitmap_free(vsi->af_xdp_zc_qps);
 	i40e_vsi_free_arrays(vsi, true);
 	i40e_clear_rss_config_user(vsi);
 
diff --git a/drivers/net/ethernet/intel/i40e/i40e_xsk.c b/drivers/net/ethernet/intel/i40e/i40e_xsk.c
index 96d849460d9b..2737fee338c4 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_xsk.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_xsk.c
@@ -102,6 +102,8 @@ static int i40e_xsk_umem_enable(struct i40e_vsi *vsi, struct xdp_umem *umem,
 	if (err)
 		return err;
 
+	set_bit(qid, vsi->af_xdp_zc_qps);
+
 	if_running = netif_running(vsi->netdev) && i40e_enabled_xdp_vsi(vsi);
 
 	if (if_running) {
@@ -143,6 +145,7 @@ static int i40e_xsk_umem_disable(struct i40e_vsi *vsi, u16 qid)
 			return err;
 	}
 
+	clear_bit(qid, vsi->af_xdp_zc_qps);
 	i40e_xsk_umem_dma_unmap(vsi, umem);
 
 	if (if_running) {
-- 
2.19.1


^ permalink raw reply related

* Re: Stack sends oversize UDP packet to the driver
From: Michael Chan @ 2019-02-12  8:55 UTC (permalink / raw)
  To: Mahesh Bandewar (महेश बंडेवार)
  Cc: Daniel Axtens, Netdev, David Miller, Eric Dumazet,
	Willem de Bruijn
In-Reply-To: <CAF2d9jjwuT1WSz7P9QFbjdpp8GhhV-XBLzVC8H94Le_JCxL0fg@mail.gmail.com>

On Fri, Feb 8, 2019 at 12:26 PM Mahesh Bandewar (महेश बंडेवार)
<maheshb@google.com> wrote:
>
> On Wed, Feb 6, 2019 at 8:51 PM Mahesh Bandewar (महेश बंडेवार)
> <maheshb@google.com> wrote:
> >
> > On Tue, Feb 5, 2019 at 11:36 AM Michael Chan <michael.chan@broadcom.com> wrote:
> > > I've looked at this a little more.  The blackhole_dev is not IFF_UP |
> > > IFF_RUNNING, right?  May be that's why the packets are never getting
> > > to the xmit function?
> > Yes, so I added those two flags and ended up writing a test-module for
> > the device (which I will include while posting the patch-series).
> > However, adding those flags is also not sufficient since the qdisc is
> > initialized to noop_qdisc so qdisc enqueue will drop packets before
> > hitting the ndo_start_xmit().
>
> I have another version of the fix (with help from Eric) and this
> should hit the .ndo_start_xmit() of the blackhole_dev. I'm adding
> these flags during the setup and then calling dev_activate() to change
> noop qdisc to null qdisc. Please give this patch set a try and let me
> know if the blackhole_dev xmit path gets exercised in your test
> scenario.

The new version still works in the sense that no oversize packets are
seen in the NIC driver's xmit function.  But I still don't see any
packets hitting the blackhole's xmit function.  I'm not 100% sure but
I think the blackhole dev has no IP address and so the UDP packets are
dropped in ip_finish_output2() because there is no neigh.  Something
like that.

^ permalink raw reply

* Re: [PATCH net-next 2/2] devlink: Fix list access without lock while reading region
From: Sergei Shtylyov @ 2019-02-12  9:01 UTC (permalink / raw)
  To: Parav Pandit, jiri, davem, netdev
In-Reply-To: <1549955377-15828-1-git-send-email-parav@mellanox.com>

Hello!

On 12.02.2019 10:09, Parav Pandit wrote:

> While finding the devlink device during region reading,
> devlink device list is accessed and devlink device is
> returned without holding a lock. This could lead to user-after-free

    Use-after-free, perhaps?

> accesses.
> 
> While at it, add lockdep assert to ensure that all future callers hold
> the lock when calling devlink_get_from_attrs().
> 
> Fixes: 4e54795a27f5 ("devlink: Add support for region snapshot read command")
> Signed-off-by: Parav Pandit <parav@mellanox.com>
> Acked-by: Jiri Pirko <jiri@mellanox.com>
[...]

MBR, Sergei

^ permalink raw reply

* Re: [PATCH net-next 0/3] Remove getting SWITCHDEV_ATTR_ID_PORT_BRIDGE_FLAGS
From: Jiri Pirko @ 2019-02-12  8:55 UTC (permalink / raw)
  To: Florian Fainelli
  Cc: netdev, David S. Miller, Ido Schimmel, open list,
	open list:STAGING SUBSYSTEM, moderated list:ETHERNET BRIDGE, jiri,
	andrew, vivien.didelot
In-Reply-To: <20190211211749.19847-1-f.fainelli@gmail.com>

Mon, Feb 11, 2019 at 10:17:46PM CET, f.fainelli@gmail.com wrote:
>Hi all,
>
>AFAICT there is no code that attempts to get the value of the attribute
>SWITCHDEV_ATTR_ID_PORT_BRIDGE_FLAGS while it is used with
>switchdev_port_attr_set().
>
>This is effectively no doing anything and it can slow down future work
>that tries to make modifications in these areas so remove that.
>
>David, there should be no dependency with previous patch series, but
>again, feedback from Ido and Jiri would be welcome in case this was
>added for a reason.

It was originally used by:
switchdev_port_bridge_getlink()
removed by:
commit 29ab586c3d83f81c435e269cace9a1619afb5bbd
Author: Arkadi Sharshevsky <arkadis@mellanox.com>
Date:   Sun Aug 6 16:15:51 2017 +0300

    net: switchdev: Remove bridge bypass support from switchdev

So these are just leftovers. Let's flush them.

^ permalink raw reply

* Re: [PATCH net-next 1/3] mlxsw: spectrum_switchdev: Remove getting PORT_BRIDGE_FLAGS
From: Jiri Pirko @ 2019-02-12  8:55 UTC (permalink / raw)
  To: Florian Fainelli
  Cc: netdev, David S. Miller, Ido Schimmel, open list,
	open list:STAGING SUBSYSTEM, moderated list:ETHERNET BRIDGE, jiri,
	andrew, vivien.didelot
In-Reply-To: <20190211211749.19847-2-f.fainelli@gmail.com>

Mon, Feb 11, 2019 at 10:17:47PM CET, f.fainelli@gmail.com wrote:
>There is no code that will query the SWITCHDEV_ATTR_ID_PORT_BRIDGE_FLAGS
>attribute remove support for that.
>
>Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>

Acked-by: Jiri Pirko <jiri@mellanox.com>

^ permalink raw reply

* Re: [PATCH net-next 2/3] rocker: Remove getting PORT_BRIDGE_FLAGS
From: Jiri Pirko @ 2019-02-12  8:56 UTC (permalink / raw)
  To: Florian Fainelli
  Cc: netdev, David S. Miller, Ido Schimmel, open list,
	open list:STAGING SUBSYSTEM, moderated list:ETHERNET BRIDGE, jiri,
	andrew, vivien.didelot
In-Reply-To: <20190211211749.19847-3-f.fainelli@gmail.com>

Mon, Feb 11, 2019 at 10:17:48PM CET, f.fainelli@gmail.com wrote:
>There is no code that attempts to get the
>SWITCHDEV_ATTR_ID_PORT_BRIDGE_FLAGS attribute, remove support for that.
>
>Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>

Acked-by: Jiri Pirko <jiri@mellanox.com>

^ permalink raw reply

* Re: [PATCH net-next 3/3] staging: fsl-dpaa2: ethsw: Remove getting PORT_BRIDGE_FLAGS
From: Jiri Pirko @ 2019-02-12  8:56 UTC (permalink / raw)
  To: Florian Fainelli
  Cc: netdev, David S. Miller, Ido Schimmel, open list,
	open list:STAGING SUBSYSTEM, moderated list:ETHERNET BRIDGE, jiri,
	andrew, vivien.didelot
In-Reply-To: <20190211211749.19847-4-f.fainelli@gmail.com>

Mon, Feb 11, 2019 at 10:17:49PM CET, f.fainelli@gmail.com wrote:
>There is no code that tries to get the attribute
>SWITCHDEV_ATTR_ID_PORT_BRIDGE_FLAGS, remove support for doing that.
>
>Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>

Acked-by: Jiri Pirko <jiri@mellanox.com>

^ permalink raw reply

* Re: [Patch net] team: avoid complex list operations in team_nl_cmd_options_set()
From: Jiri Pirko @ 2019-02-12  9:26 UTC (permalink / raw)
  To: Cong Wang
  Cc: netdev, syzbot+4d4af685432dc0e56c91, syzbot+68ee510075cf64260cc4,
	Paolo Abeni
In-Reply-To: <20190212055951.6712-1-xiyou.wangcong@gmail.com>

Tue, Feb 12, 2019 at 06:59:51AM CET, xiyou.wangcong@gmail.com wrote:
>The current opt_inst_list operations inside team_nl_cmd_options_set()
>is too complex to track:
>
>    LIST_HEAD(opt_inst_list);
>    nla_for_each_nested(...) {
>        list_for_each_entry(opt_inst, &team->option_inst_list, list) {
>            if (__team_option_inst_tmp_find(&opt_inst_list, opt_inst))
>                continue;
>            list_add(&opt_inst->tmp_list, &opt_inst_list);
>        }
>    }
>    team_nl_send_event_options_get(team, &opt_inst_list);
>
>as while we retrieve 'opt_inst' from team->option_inst_list, it could
>be added to the local 'opt_inst_list' for multiple times. The
>__team_option_inst_tmp_find() doesn't work, as the setter
>team_mode_option_set() still calls team->ops.exit() which uses
>->tmp_list too in __team_options_change_check().
>
>Simplify the list operations by moving the 'opt_inst_list' and
>team_nl_send_event_options_get() into the nla_for_each_nested() loop so
>that it can be guranteed that we won't insert a same list entry for
>multiple times. Therefore, __team_option_inst_tmp_find() can be removed
>too.
>
>Fixes: 4fb0534fb7bb ("team: avoid adding twice the same option to the event list")
>Fixes: 2fcdb2c9e659 ("team: allow to send multiple set events in one message")
>Reported-by: syzbot+4d4af685432dc0e56c91@syzkaller.appspotmail.com
>Reported-by: syzbot+68ee510075cf64260cc4@syzkaller.appspotmail.com
>Cc: Jiri Pirko <jiri@resnulli.us>
>Cc: Paolo Abeni <pabeni@redhat.com>
>Signed-off-by: Cong Wang <xiyou.wangcong@gmail.com>

Acked-by: Jiri Pirko <jiri@mellanox.com>

Thanks!

^ permalink raw reply

* Re: [net-next PATCH 1/2] mm: add dma_addr_t to struct page
From: Jesper Dangaard Brouer @ 2019-02-12 10:06 UTC (permalink / raw)
  To: Matthew Wilcox
  Cc: netdev, linux-mm, Toke Høiland-Jørgensen,
	Ilias Apalodimas, Saeed Mahameed, Andrew Morton, mgorman,
	David S. Miller, Tariq Toukan, brouer, Willem de Bruijn
In-Reply-To: <20190211165551.GD12668@bombadil.infradead.org>

On Mon, 11 Feb 2019 08:55:51 -0800
Matthew Wilcox <willy@infradead.org> wrote:

> On Mon, Feb 11, 2019 at 05:06:46PM +0100, Jesper Dangaard Brouer wrote:
> > The page_pool API is using page->private to store DMA addresses.
> > As pointed out by David Miller we can't use that on 32-bit architectures
> > with 64-bit DMA
> > 
> > This patch adds a new dma_addr_t struct to allow storing DMA addresses
> > 
> > Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
> > Signed-off-by: Ilias Apalodimas <ilias.apalodimas@linaro.org>  
> 
> Reviewed-by: Matthew Wilcox <willy@infradead.org>
> 
> > +		struct {	/* page_pool used by netstack */
> > +			/**
> > +			 * @dma_addr: Page_pool need to store DMA-addr, and  
> 
> s/need/needs/
> 
> > +			 * cannot use @private, as DMA-mappings can be 64-bit  
> 
> s/DMA-mappings/DMA addresses/
> 
> > +			 * even on 32-bit Architectures.  
> 
> s/A/a/

Yes, that comment needs improvement. I think I'll use AKPM's suggestion.

> > +			 */
> > +			dma_addr_t dma_addr; /* Shares area with @lru */  
> 
> It also shares with @slab_list, @next, @compound_head, @pgmap and
> @rcu_head.  I think it's pointless to try to document which other fields
> something shares space with; the places which do it are a legacy from
> before I rearranged struct page last year.  Anyone looking at this should
> now be able to see "Oh, this is a union, only use the fields which are
> in the union for the type of struct page I have here".

I agree, I'll strip that comment.

> Are the pages allocated from this API ever supposed to be mapped to
> userspace?

I would like to know what fields on struct-page we cannot touch if we
want to keep this a possibility?

That said, I hope we don't need to do this. But as I integrate this
further into the netstack code, we might have to support this, or
at-least release the page_pool "state" (currently only DMA-addr) before
the skb_zcopy code path.  First iteration will not do zero-copy stuff,
and later I'll coordinate with Willem how to add this, if needed.

My general opinion is that if an end-user wants to have pages mapped to
userspace, then page_pool (MEM_TYPE_PAGE_POOL) is not the right choice,
but instead use MEM_TYPE_ZERO_COPY (see enum xdp_mem_type).  We are
generally working towards allowing NIC drivers to have a different
memory type per RX-ring.

> You also say in the documentation:
> 
>  * If no DMA mapping is done, then it can act as shim-layer that
>  * fall-through to alloc_page.  As no state is kept on the page, the
>  * regular put_page() call is sufficient.
> 
> I think this is probably a dangerous precedent to set.  Better to require
> exactly one call to page_pool_put_page() (with the understanding that the
> refcount may be elevated, so this may not be the final free of the page,
> but the page will no longer be usable for its page_pool purpose).

Yes, this is actually how it is implemented today, and the comment should
be improved.  Today, when the refcount is elevated, __page_pool_put_page()
does call __page_pool_clean_page() to release the page's page_pool
state, and the page is in principle no longer "usable" for page_pool purposes.
BUT I have considered removing this, as it might not fit how we want to
use the API. In our current RFC we found a need for (and introduced) a
page_pool_unmap_page() call (that calls __page_pool_clean_page()), for when
a driver hits cases where the code path doesn't have a call-back to
page_pool_put_page() but instead ends up calling put_page().

-- 
Best regards,
  Jesper Dangaard Brouer
  MSc.CS, Principal Kernel Engineer at Red Hat
  LinkedIn: http://www.linkedin.com/in/brouer

^ permalink raw reply

* Re: [PATCH v2 bpf-next] tools: bpftool: doc, add text about feature-subcommand
From: Quentin Monnet @ 2019-02-12 10:12 UTC (permalink / raw)
  To: Prashant Bhole, Alexei Starovoitov, Daniel Borkmann; +Cc: netdev
In-Reply-To: <20190212012512.9060-1-bhole_prashant_q7@lab.ntt.co.jp>

2019-02-12 10:25 UTC+0900 ~ Prashant Bhole <bhole_prashant_q7@lab.ntt.co.jp>
> This patch adds missing information about feature-subcommand in
> bpftool.rst
> 
> Signed-off-by: Prashant Bhole <bhole_prashant_q7@lab.ntt.co.jp>
> ---
> 
> v2: used tabs instead of spaces

Thanks a lot!

Reviewed-by: Quentin Monnet <quentin.monnet@netronome.com>

^ permalink raw reply

* Re: [Patch net] team: avoid complex list operations in team_nl_cmd_options_set()
From: Paolo Abeni @ 2019-02-12 10:32 UTC (permalink / raw)
  To: Cong Wang, netdev
  Cc: syzbot+4d4af685432dc0e56c91, syzbot+68ee510075cf64260cc4,
	Jiri Pirko
In-Reply-To: <20190212055951.6712-1-xiyou.wangcong@gmail.com>

On Mon, 2019-02-11 at 21:59 -0800, Cong Wang wrote:
> The current opt_inst_list operations inside team_nl_cmd_options_set()
> is too complex to track:

Indeed !

>     LIST_HEAD(opt_inst_list);
>     nla_for_each_nested(...) {
>         list_for_each_entry(opt_inst, &team->option_inst_list, list) {
>             if (__team_option_inst_tmp_find(&opt_inst_list, opt_inst))
>                 continue;
>             list_add(&opt_inst->tmp_list, &opt_inst_list);
>         }
>     }
>     team_nl_send_event_options_get(team, &opt_inst_list);
> 
> as while we retrieve 'opt_inst' from team->option_inst_list, it could
> be added to the local 'opt_inst_list' for multiple times. The
> __team_option_inst_tmp_find() doesn't work, as the setter
> team_mode_option_set() still calls team->ops.exit() which uses
> ->tmp_list too in __team_options_change_check().
> Simplify the list operations by moving the 'opt_inst_list' and
> team_nl_send_event_options_get() into the nla_for_each_nested() loop so
> that it can be guaranteed that we won't insert the same list entry for
> multiple times. Therefore, __team_option_inst_tmp_find() can be removed
> too.
> 
> Fixes: 4fb0534fb7bb ("team: avoid adding twice the same option to the event list")
> Fixes: 2fcdb2c9e659 ("team: allow to send multiple set events in one message")
> Reported-by: syzbot+4d4af685432dc0e56c91@syzkaller.appspotmail.com
> Reported-by: syzbot+68ee510075cf64260cc4@syzkaller.appspotmail.com
> Cc: Jiri Pirko <jiri@resnulli.us>
> Cc: Paolo Abeni <pabeni@redhat.com>
> Signed-off-by: Cong Wang <xiyou.wangcong@gmail.com>

Reviewed-by: Paolo Abeni <pabeni@redhat.com>

Thank you!

Paolo


^ permalink raw reply

* [PATCH net] sctp: call gso_reset_checksum when computing checksum in sctp_gso_segment
From: Xin Long @ 2019-02-12 10:47 UTC (permalink / raw)
  To: linux-kernel, network dev, linux-sctp
  Cc: davem, Marcelo Ricardo Leitner, Neil Horman

Jianlin reported a panic when running sctp gso over gre over vlan device:

  [   84.772930] RIP: 0010:do_csum+0x6d/0x170
  [   84.790605] Call Trace:
  [   84.791054]  csum_partial+0xd/0x20
  [   84.791657]  gre_gso_segment+0x2c3/0x390
  [   84.792364]  inet_gso_segment+0x161/0x3e0
  [   84.793071]  skb_mac_gso_segment+0xb8/0x120
  [   84.793846]  __skb_gso_segment+0x7e/0x180
  [   84.794581]  validate_xmit_skb+0x141/0x2e0
  [   84.795297]  __dev_queue_xmit+0x258/0x8f0
  [   84.795949]  ? eth_header+0x26/0xc0
  [   84.796581]  ip_finish_output2+0x196/0x430
  [   84.797295]  ? skb_gso_validate_network_len+0x11/0x80
  [   84.798183]  ? ip_finish_output+0x169/0x270
  [   84.798875]  ip_output+0x6c/0xe0
  [   84.799413]  ? ip_append_data.part.50+0xc0/0xc0
  [   84.800145]  iptunnel_xmit+0x144/0x1c0
  [   84.800814]  ip_tunnel_xmit+0x62d/0x930 [ip_tunnel]
  [   84.801699]  gre_tap_xmit+0xac/0xf0 [ip_gre]
  [   84.802395]  dev_hard_start_xmit+0xa5/0x210
  [   84.803086]  sch_direct_xmit+0x14f/0x340
  [   84.803733]  __dev_queue_xmit+0x799/0x8f0
  [   84.804472]  ip_finish_output2+0x2e0/0x430
  [   84.805255]  ? skb_gso_validate_network_len+0x11/0x80
  [   84.806154]  ip_output+0x6c/0xe0
  [   84.806721]  ? ip_append_data.part.50+0xc0/0xc0
  [   84.807516]  sctp_packet_transmit+0x716/0xa10 [sctp]
  [   84.808337]  sctp_outq_flush+0xd7/0x880 [sctp]

It was caused by SKB_GSO_CB(skb)->csum_start not set in sctp_gso_segment.
sctp_gso_segment() calls skb_segment() with 'feature | NETIF_F_HW_CSUM',
which causes SKB_GSO_CB(skb)->csum_start not to be set in skb_segment().

For TCP/UDP, when feature supports HW_CSUM, CHECKSUM_PARTIAL will be set
and gso_reset_checksum will be called to set SKB_GSO_CB(skb)->csum_start.

So SCTP should do the same as TCP/UDP, to call gso_reset_checksum() when
computing checksum in sctp_gso_segment.

Reported-by: Jianlin Shi <jishi@redhat.com>
Signed-off-by: Xin Long <lucien.xin@gmail.com>
---
 net/sctp/offload.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/net/sctp/offload.c b/net/sctp/offload.c
index 123e9f2..edfcf16 100644
--- a/net/sctp/offload.c
+++ b/net/sctp/offload.c
@@ -36,6 +36,7 @@ static __le32 sctp_gso_make_checksum(struct sk_buff *skb)
 {
 	skb->ip_summed = CHECKSUM_NONE;
 	skb->csum_not_inet = 0;
+	gso_reset_checksum(skb, ~0);
 	return sctp_compute_cksum(skb, skb_transport_offset(skb));
 }
 
-- 
2.1.0


^ permalink raw reply related

* [PATCH net] sctp: set stream ext to NULL after freeing it in sctp_stream_outq_migrate
From: Xin Long @ 2019-02-12 10:51 UTC (permalink / raw)
  To: linux-kernel, network dev, linux-sctp
  Cc: davem, Marcelo Ricardo Leitner, Neil Horman

In sctp_stream_init(), after sctp_stream_outq_migrate() freed the
surplus streams' ext, but sctp_stream_alloc_out() returns -ENOMEM,
stream->outcnt will not be set to 'outcnt'.

With the bigger value on stream->outcnt, when closing the assoc and
freeing its streams, the ext of those surplus streams will be freed
again since those stream exts were not set to NULL after freeing in
sctp_stream_outq_migrate(). Then the invalid-free issue reported by
syzbot would be triggered.

We fix it by simply setting them to NULL after freeing.

Fixes: 5bbbbe32a431 ("sctp: introduce stream scheduler foundations")
Reported-by: syzbot+58e480e7b28f2d890bfd@syzkaller.appspotmail.com
Signed-off-by: Xin Long <lucien.xin@gmail.com>
---
 net/sctp/stream.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/net/sctp/stream.c b/net/sctp/stream.c
index f246331..2936ed1 100644
--- a/net/sctp/stream.c
+++ b/net/sctp/stream.c
@@ -144,8 +144,10 @@ static void sctp_stream_outq_migrate(struct sctp_stream *stream,
 		}
 	}

-	for (i = outcnt; i < stream->outcnt; i++)
+	for (i = outcnt; i < stream->outcnt; i++) {
 		kfree(SCTP_SO(stream, i)->ext);
+		SCTP_SO(stream, i)->ext = NULL;
+	}
 }

 static int sctp_stream_alloc_out(struct sctp_stream *stream, __u16 outcnt,
-- 
2.1.0

^ permalink raw reply related

* Re: [v3,4/5] net: phy: at803x: Disable phy delay for RGMII mode
From: Peter Ujfalusi @ 2019-02-12 10:55 UTC (permalink / raw)
  To: Vinod Koul, David S Miller, Roger Quadros
  Cc: netdev, linux-arm-msm, Niklas Cassel, Bjorn Andersson,
	Andrew Lunn, Florian Fainelli, Nori, Sekhar
In-Reply-To: <20190121091318.20079-5-vkoul@kernel.org>

Vinod,

On 21/01/2019 11.13, Vinod Koul wrote:
> For RGMII mode, phy delay should be disabled. Add this case along
> with disable delay routines.

In next-20190211 I need to revert this patch to get cpsw networking to work on am335x-evmsk. The board uses AR8031_AL1A PHY, which is handled by the phy/at803x.c

On next-20190211:
[    3.374601] net eth0: initializing cpsw version 1.12 (0)
[    3.384484] Atheros 8031 ethernet 4a101000.mdio:00: attached PHY driver [Atheros 8031 ethernet] (mii_bus:phy_addr=4a101000.mdio:00, irq=POLL)
[    3.400041] cpsw 4a100000.ethernet eth0: Link is Up - 1Gbps/Full - flow control rx/tx
[    3.410813] mmc1: new SDIO card at address 0001
[    3.439362] IP-Config: Complete:
[    3.442649]      device=eth0, hwaddr=bc:6a:29:7d:2c:a9, ipaddr=10.0.0.90, mask=255.255.255.0, gw=10.0.0.1
[    3.452840]      host=10.0.0.90, domain=, nis-domain=(none)
[    3.458462]      bootserver=10.0.0.30, rootserver=10.0.0.30, rootpath=
[    3.466296] vwl1271: disabling
[    3.470195] ALSA device list:
[    3.473189]   #0: AM335x-EVMSK

After reverting this patch:
[    3.374636] net eth0: initializing cpsw version 1.12 (0)
[    3.384534] Atheros 8031 ethernet 4a101000.mdio:00: attached PHY driver [Atheros 8031 ethernet] (mii_bus:phy_addr=4a101000.mdio:00, irq=POLL)
[    3.400125] cpsw 4a100000.ethernet eth0: Link is Up - 1Gbps/Full - flow control rx/tx
[    3.410866] mmc1: new SDIO card at address 0001
[    3.439379] IP-Config: Complete:
[    3.442666]      device=eth0, hwaddr=bc:6a:29:7d:2c:a9, ipaddr=10.0.0.90, mask=255.255.255.0, gw=10.0.0.1
[    3.452865]      host=10.0.0.90, domain=, nis-domain=(none)
[    3.458482]      bootserver=10.0.0.30, rootserver=10.0.0.30, rootpath=
[    3.466334] vwl1271: disabling
[    3.470245] ALSA device list:
[    3.473241]   #0: AM335x-EVMSK
[    3.501052] VFS: Mounted root (nfs filesystem) readonly on device 0:15.
[    3.508694] devtmpfs: mounted
[    3.514546] Freeing unused kernel memory: 1024K
[    3.520567] Run /sbin/init as init process

and the board boots to nfsroot fine.

 
> Signed-off-by: Vinod Koul <vkoul@kernel.org>
> ---
>  drivers/net/phy/at803x.c | 22 ++++++++++++----------
>  1 file changed, 12 insertions(+), 10 deletions(-)
> 
> diff --git a/drivers/net/phy/at803x.c b/drivers/net/phy/at803x.c
> index f9432d053a22..8ff12938ab47 100644
> --- a/drivers/net/phy/at803x.c
> +++ b/drivers/net/phy/at803x.c
> @@ -110,16 +110,16 @@ static int at803x_debug_reg_mask(struct phy_device *phydev, u16 reg,
>  	return phy_write(phydev, AT803X_DEBUG_DATA, val);
>  }
>  
> -static inline int at803x_enable_rx_delay(struct phy_device *phydev)
> +static inline int at803x_disable_rx_delay(struct phy_device *phydev)
>  {
> -	return at803x_debug_reg_mask(phydev, AT803X_DEBUG_REG_0, 0,
> -					AT803X_DEBUG_RX_CLK_DLY_EN);
> +	return at803x_debug_reg_mask(phydev, AT803X_DEBUG_REG_0,
> +				     AT803X_DEBUG_RX_CLK_DLY_EN, 0);
>  }
>  
> -static inline int at803x_enable_tx_delay(struct phy_device *phydev)
> +static inline int at803x_disable_tx_delay(struct phy_device *phydev)
>  {
> -	return at803x_debug_reg_mask(phydev, AT803X_DEBUG_REG_5, 0,
> -					AT803X_DEBUG_TX_CLK_DLY_EN);
> +	return at803x_debug_reg_mask(phydev, AT803X_DEBUG_REG_5,
> +				     AT803X_DEBUG_TX_CLK_DLY_EN, 0);
>  }
>  
>  /* save relevant PHY registers to private copy */
> @@ -256,15 +256,17 @@ static int at803x_config_init(struct phy_device *phydev)
>  		return ret;
>  
>  	if (phydev->interface == PHY_INTERFACE_MODE_RGMII_RXID ||
> -			phydev->interface == PHY_INTERFACE_MODE_RGMII_ID) {
> -		ret = at803x_enable_rx_delay(phydev);
> +			phydev->interface == PHY_INTERFACE_MODE_RGMII_ID ||
> +			phydev->interface == PHY_INTERFACE_MODE_RGMII) {
> +		ret = at803x_disable_rx_delay(phydev);
>  		if (ret < 0)
>  			return ret;
>  	}
>  
>  	if (phydev->interface == PHY_INTERFACE_MODE_RGMII_TXID ||
> -			phydev->interface == PHY_INTERFACE_MODE_RGMII_ID) {
> -		ret = at803x_enable_tx_delay(phydev);
> +			phydev->interface == PHY_INTERFACE_MODE_RGMII_ID ||
> +			phydev->interface == PHY_INTERFACE_MODE_RGMII) {
> +		ret = at803x_disable_tx_delay(phydev);
>  		if (ret < 0)
>  			return ret;
>  	}
> 

- Péter

Texas Instruments Finland Oy, Porkkalankatu 22, 00180 Helsinki. Y-tunnus/Business ID: 0615521-4. Kotipaikka/Domicile: Helsinki

^ permalink raw reply

* Re: [PATCH V1 net 2/2] net: ena: update driver version from 2.0.2 to 2.0.3
From: Moritz Fischer @ 2019-02-12 11:12 UTC (permalink / raw)
  To: akiyano
  Cc: David S. Miller, netdev, dwmw, zorik, matua, saeedb, msw,
	aliguori, nafea, gtzalik, netanel, alisaidi
In-Reply-To: <1549905464-13758-3-git-send-email-akiyano@amazon.com>

Hi Arthur,

On Mon, Feb 11, 2019 at 9:19 AM <akiyano@amazon.com> wrote:
>
> From: Arthur Kiyanovski <akiyano@amazon.com>
>
> Update driver version due to bug fix.

Wouldn't you want to do this atomically with the actual fix in one commit?

Thanks,
Moritz

^ permalink raw reply

* Re: [v3,4/5] net: phy: at803x: Disable phy delay for RGMII mode
From: Vinod Koul @ 2019-02-12 11:31 UTC (permalink / raw)
  To: Peter Ujfalusi
  Cc: David S Miller, Roger Quadros, netdev, linux-arm-msm,
	Niklas Cassel, Bjorn Andersson, Andrew Lunn, Florian Fainelli,
	Nori, Sekhar
In-Reply-To: <147151c4-d162-4ebe-189a-564492d84d18@ti.com>

Hi Peter,

On 12-02-19, 12:55, Peter Ujfalusi wrote:
> Vinod,
> 
> On 21/01/2019 11.13, Vinod Koul wrote:
> > For RGMII mode, phy delay should be disabled. Add this case along
> > with disable delay routines.
> 
> In next-20190211 I need to revert this patch to get cpsw networking to
> work on am335x-evmsk. The board uses AR8031_AL1A PHY, which is handled
> by the phy/at803x.c

I see that DTS specifies that you are using phy-mode = "rgmii-txid".
RGMII mode implies that we should not have any delay in the
phy, so this patch does the right thing.

In the previous version of the patch I did propose to add a DT entry so
that current users who are wrongly using this would not be impacted but
the suggestion was to get them fixed.

So in your case, do you need rgmii-txid mode? If so, why should the delay be
enabled for this? We can add a patch that enables the delay for your
controller, but that can't be rgmii mode.

Thanks


> 
> On next-20190211:
> [    3.374601] net eth0: initializing cpsw version 1.12 (0)
> [    3.384484] Atheros 8031 ethernet 4a101000.mdio:00: attached PHY driver [Atheros 8031 ethernet] (mii_bus:phy_addr=4a101000.mdio:00, irq=POLL)
> [    3.400041] cpsw 4a100000.ethernet eth0: Link is Up - 1Gbps/Full - flow control rx/tx
> [    3.410813] mmc1: new SDIO card at address 0001
> [    3.439362] IP-Config: Complete:
> [    3.442649]      device=eth0, hwaddr=bc:6a:29:7d:2c:a9, ipaddr=10.0.0.90, mask=255.255.255.0, gw=10.0.0.1
> [    3.452840]      host=10.0.0.90, domain=, nis-domain=(none)
> [    3.458462]      bootserver=10.0.0.30, rootserver=10.0.0.30, rootpath=
> [    3.466296] vwl1271: disabling
> [    3.470195] ALSA device list:
> [    3.473189]   #0: AM335x-EVMSK
> 
> After reverting this patch:
> [    3.374636] net eth0: initializing cpsw version 1.12 (0)
> [    3.384534] Atheros 8031 ethernet 4a101000.mdio:00: attached PHY driver [Atheros 8031 ethernet] (mii_bus:phy_addr=4a101000.mdio:00, irq=POLL)
> [    3.400125] cpsw 4a100000.ethernet eth0: Link is Up - 1Gbps/Full - flow control rx/tx
> [    3.410866] mmc1: new SDIO card at address 0001
> [    3.439379] IP-Config: Complete:
> [    3.442666]      device=eth0, hwaddr=bc:6a:29:7d:2c:a9, ipaddr=10.0.0.90, mask=255.255.255.0, gw=10.0.0.1
> [    3.452865]      host=10.0.0.90, domain=, nis-domain=(none)
> [    3.458482]      bootserver=10.0.0.30, rootserver=10.0.0.30, rootpath=
> [    3.466334] vwl1271: disabling
> [    3.470245] ALSA device list:
> [    3.473241]   #0: AM335x-EVMSK
> [    3.501052] VFS: Mounted root (nfs filesystem) readonly on device 0:15.
> [    3.508694] devtmpfs: mounted
> [    3.514546] Freeing unused kernel memory: 1024K
> [    3.520567] Run /sbin/init as init process
> 
> and the board boots to nfsroot fine.
> 
>  
> > Signed-off-by: Vinod Koul <vkoul@kernel.org>
> > ---
> >  drivers/net/phy/at803x.c | 22 ++++++++++++----------
> >  1 file changed, 12 insertions(+), 10 deletions(-)
> > 
> > diff --git a/drivers/net/phy/at803x.c b/drivers/net/phy/at803x.c
> > index f9432d053a22..8ff12938ab47 100644
> > --- a/drivers/net/phy/at803x.c
> > +++ b/drivers/net/phy/at803x.c
> > @@ -110,16 +110,16 @@ static int at803x_debug_reg_mask(struct phy_device *phydev, u16 reg,
> >  	return phy_write(phydev, AT803X_DEBUG_DATA, val);
> >  }
> >  
> > -static inline int at803x_enable_rx_delay(struct phy_device *phydev)
> > +static inline int at803x_disable_rx_delay(struct phy_device *phydev)
> >  {
> > -	return at803x_debug_reg_mask(phydev, AT803X_DEBUG_REG_0, 0,
> > -					AT803X_DEBUG_RX_CLK_DLY_EN);
> > +	return at803x_debug_reg_mask(phydev, AT803X_DEBUG_REG_0,
> > +				     AT803X_DEBUG_RX_CLK_DLY_EN, 0);
> >  }
> >  
> > -static inline int at803x_enable_tx_delay(struct phy_device *phydev)
> > +static inline int at803x_disable_tx_delay(struct phy_device *phydev)
> >  {
> > -	return at803x_debug_reg_mask(phydev, AT803X_DEBUG_REG_5, 0,
> > -					AT803X_DEBUG_TX_CLK_DLY_EN);
> > +	return at803x_debug_reg_mask(phydev, AT803X_DEBUG_REG_5,
> > +				     AT803X_DEBUG_TX_CLK_DLY_EN, 0);
> >  }
> >  
> >  /* save relevant PHY registers to private copy */
> > @@ -256,15 +256,17 @@ static int at803x_config_init(struct phy_device *phydev)
> >  		return ret;
> >  
> >  	if (phydev->interface == PHY_INTERFACE_MODE_RGMII_RXID ||
> > -			phydev->interface == PHY_INTERFACE_MODE_RGMII_ID) {
> > -		ret = at803x_enable_rx_delay(phydev);
> > +			phydev->interface == PHY_INTERFACE_MODE_RGMII_ID ||
> > +			phydev->interface == PHY_INTERFACE_MODE_RGMII) {
> > +		ret = at803x_disable_rx_delay(phydev);
> >  		if (ret < 0)
> >  			return ret;
> >  	}
> >  
> >  	if (phydev->interface == PHY_INTERFACE_MODE_RGMII_TXID ||
> > -			phydev->interface == PHY_INTERFACE_MODE_RGMII_ID) {
> > -		ret = at803x_enable_tx_delay(phydev);
> > +			phydev->interface == PHY_INTERFACE_MODE_RGMII_ID ||
> > +			phydev->interface == PHY_INTERFACE_MODE_RGMII) {
> > +		ret = at803x_disable_tx_delay(phydev);
> >  		if (ret < 0)
> >  			return ret;
> >  	}
> > 
> 
> - Péter
> 
> Texas Instruments Finland Oy, Porkkalankatu 22, 00180 Helsinki. Y-tunnus/Business ID: 0615521-4. Kotipaikka/Domicile: Helsinki

-- 
~Vinod

^ permalink raw reply

* [bpf-next 1/2] tcp: replace SOCK_DEBUG() with tcp_stats()
From: Yafang Shao @ 2019-02-12 11:31 UTC (permalink / raw)
  To: daniel, ast
  Cc: yhs, brakmo, edumazet, davem, netdev, linux-kernel, shaoyafang,
	Yafang Shao
In-Reply-To: <1549971097-12627-1-git-send-email-laoar.shao@gmail.com>

SOCK_DEBUG is a very ancient debugging interface, and it's not very useful
for debugging.
So this patch removes SOCK_DEBUG() and introduces a new function
tcp_stats() to trace this kind of event.
Some MIBs are added for these events.

Regarding the SO_DEBUG in sock_{s,g}etsockopt, I think it is better to
keep as-is, because if we return an errno to tell the application that
this optname isn't supported for TCP, it may break the application.
The application can still use this option, but it will not have any effect
for TCP.

Signed-off-by: Yafang Shao <laoar.shao@gmail.com>
---
 include/uapi/linux/snmp.h |  3 +++
 net/ipv4/proc.c           |  3 +++
 net/ipv4/tcp_input.c      | 26 +++++++++++---------------
 net/ipv6/tcp_ipv6.c       |  2 --
 4 files changed, 17 insertions(+), 17 deletions(-)

diff --git a/include/uapi/linux/snmp.h b/include/uapi/linux/snmp.h
index 86dc24a..fd5c09c 100644
--- a/include/uapi/linux/snmp.h
+++ b/include/uapi/linux/snmp.h
@@ -283,6 +283,9 @@ enum
 	LINUX_MIB_TCPACKCOMPRESSED,		/* TCPAckCompressed */
 	LINUX_MIB_TCPZEROWINDOWDROP,		/* TCPZeroWindowDrop */
 	LINUX_MIB_TCPRCVQDROP,			/* TCPRcvQDrop */
+	LINUX_MIB_TCPINVALIDACK,		/* TCPInvalidAck */
+	LINUX_MIB_TCPOLDACK,			/* TCPOldAck */
+	LINUX_MIB_TCPPARTIALPACKET,		/* TCPPartialPacket */
 	__LINUX_MIB_MAX
 };
 
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
index c3610b3..1b0320a 100644
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -291,6 +291,9 @@ static int sockstat_seq_show(struct seq_file *seq, void *v)
 	SNMP_MIB_ITEM("TCPAckCompressed", LINUX_MIB_TCPACKCOMPRESSED),
 	SNMP_MIB_ITEM("TCPZeroWindowDrop", LINUX_MIB_TCPZEROWINDOWDROP),
 	SNMP_MIB_ITEM("TCPRcvQDrop", LINUX_MIB_TCPRCVQDROP),
+	SNMP_MIB_ITEM("TCPInvalidAck", LINUX_MIB_TCPINVALIDACK),
+	SNMP_MIB_ITEM("TCPOldAck", LINUX_MIB_TCPOLDACK),
+	SNMP_MIB_ITEM("TCPPartialPacket", LINUX_MIB_TCPPARTIALPACKET),
 	SNMP_MIB_SENTINEL
 };
 
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 7a027dec..88deb1f 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -3554,6 +3554,11 @@ static u32 tcp_newly_delivered(struct sock *sk, u32 prior_delivered, int flag)
 	return delivered;
 }
 
+static void tcp_stats(struct sock *sk, int mib_idx)
+{
+	NET_INC_STATS(sock_net(sk), mib_idx);
+}
+
 /* This routine deals with incoming acks, but not outgoing ones. */
 static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
 {
@@ -3715,7 +3720,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
 	return 1;
 
 invalid_ack:
-	SOCK_DEBUG(sk, "Ack %u after %u:%u\n", ack, tp->snd_una, tp->snd_nxt);
+	tcp_stats(sk, LINUX_MIB_TCPINVALIDACK);
 	return -1;
 
 old_ack:
@@ -3731,7 +3736,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
 		tcp_xmit_recovery(sk, rexmit);
 	}
 
-	SOCK_DEBUG(sk, "Ack %u before %u:%u\n", ack, tp->snd_una, tp->snd_nxt);
+	tcp_stats(sk, LINUX_MIB_TCPOLDACK);
 	return 0;
 }
 
@@ -4432,13 +4437,10 @@ static void tcp_ofo_queue(struct sock *sk)
 		rb_erase(&skb->rbnode, &tp->out_of_order_queue);
 
 		if (unlikely(!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt))) {
-			SOCK_DEBUG(sk, "ofo packet was already received\n");
+			tcp_stats(sk, LINUX_MIB_TCPOFODROP);
 			tcp_drop(sk, skb);
 			continue;
 		}
-		SOCK_DEBUG(sk, "ofo requeuing : rcv_next %X seq %X - %X\n",
-			   tp->rcv_nxt, TCP_SKB_CB(skb)->seq,
-			   TCP_SKB_CB(skb)->end_seq);
 
 		tail = skb_peek_tail(&sk->sk_receive_queue);
 		eaten = tail && tcp_try_coalesce(sk, tail, skb, &fragstolen);
@@ -4499,11 +4501,9 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
 	tp->pred_flags = 0;
 	inet_csk_schedule_ack(sk);
 
-	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFOQUEUE);
 	seq = TCP_SKB_CB(skb)->seq;
 	end_seq = TCP_SKB_CB(skb)->end_seq;
-	SOCK_DEBUG(sk, "out of order segment: rcv_next %X seq %X - %X\n",
-		   tp->rcv_nxt, seq, end_seq);
+	tcp_stats(sk, LINUX_MIB_TCPOFOQUEUE);
 
 	p = &tp->out_of_order_queue.rb_node;
 	if (RB_EMPTY_ROOT(&tp->out_of_order_queue)) {
@@ -4779,9 +4779,7 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
 
 	if (before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) {
 		/* Partial packet, seq < rcv_next < end_seq */
-		SOCK_DEBUG(sk, "partial packet: rcv_next %X seq %X - %X\n",
-			   tp->rcv_nxt, TCP_SKB_CB(skb)->seq,
-			   TCP_SKB_CB(skb)->end_seq);
+		tcp_stats(sk, LINUX_MIB_TCPPARTIALPACKET);
 
 		tcp_dsack_set(sk, TCP_SKB_CB(skb)->seq, tp->rcv_nxt);
 
@@ -5061,9 +5059,7 @@ static int tcp_prune_queue(struct sock *sk)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 
-	SOCK_DEBUG(sk, "prune_queue: c=%x\n", tp->copied_seq);
-
-	NET_INC_STATS(sock_net(sk), LINUX_MIB_PRUNECALLED);
+	tcp_stats(sk, LINUX_MIB_PRUNECALLED);
 
 	if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf)
 		tcp_clamp_window(sk);
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index e51cda7..57ef69a1 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -220,8 +220,6 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
 		u32 exthdrlen = icsk->icsk_ext_hdr_len;
 		struct sockaddr_in sin;
 
-		SOCK_DEBUG(sk, "connect: ipv4 mapped\n");
-
 		if (__ipv6_only_sock(sk))
 			return -ENETUNREACH;
 
-- 
1.8.3.1


^ permalink raw reply related

* [bpf-next 2/2] bpf: add BPF_SOCK_OPS_STATS_CB for tcp_stats()
From: Yafang Shao @ 2019-02-12 11:31 UTC (permalink / raw)
  To: daniel, ast
  Cc: yhs, brakmo, edumazet, davem, netdev, linux-kernel, shaoyafang,
	Yafang Shao
In-Reply-To: <1549971097-12627-1-git-send-email-laoar.shao@gmail.com>

Introduce this new op BPF_SOCK_OPS_STATS_CB for tcp_stats() such that it
can be traced via BPF on a per socket basis.
There's one argument in BPF_SOCK_OPS_STATS_CB, which is Linux MIB index
LINUX_MIB_* to indicate the TCP event.
All these Linux MIBs are defined in include/uapi/linux/snmp.h.

Signed-off-by: Yafang Shao <laoar.shao@gmail.com>
---
 include/uapi/linux/bpf.h | 5 +++++
 net/ipv4/tcp_input.c     | 1 +
 2 files changed, 6 insertions(+)

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 1777fa0..0314ddd 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -2894,6 +2894,11 @@ enum {
 	BPF_SOCK_OPS_TCP_LISTEN_CB,	/* Called on listen(2), right after
 					 * socket transition to LISTEN state.
 					 */
+	BPF_SOCK_OPS_STATS_CB,		/*
+					 * Called on tcp_stats().
+					 * Arg1: Linux MIB index
+					 * 	 LINUX_MIB_*
+					 */
 };
 
 /* List of TCP states. There is a build check in net/ipv4/tcp.c to detect
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 88deb1f..4acf458 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -3557,6 +3557,7 @@ static u32 tcp_newly_delivered(struct sock *sk, u32 prior_delivered, int flag)
 static void tcp_stats(struct sock *sk, int mib_idx)
 {
 	NET_INC_STATS(sock_net(sk), mib_idx);
+	tcp_call_bpf(sk, BPF_SOCK_OPS_STATS_CB, 1, &mib_idx);
 }
 
 /* This routine deals with incoming acks, but not outgoing ones. */
-- 
1.8.3.1


^ permalink raw reply related

* [bpf-next 0/2] cleanup SOCK_DEBUG() and introduce BPF_SOCK_OPS_STATS_CB
From: Yafang Shao @ 2019-02-12 11:31 UTC (permalink / raw)
  To: daniel, ast
  Cc: yhs, brakmo, edumazet, davem, netdev, linux-kernel, shaoyafang,
	Yafang Shao

SOCK_DEBUG is a very ancient debugging interface, and it's not very useful
for debugging.
This patchset cleans up SOCK_DEBUG() and replaces it with a new method
based on BPF.

I cleanup SOCK_DEBUG() only for TCP, and other protocols are kept as is.

After this patchset, the SO_DEBUG interface will not take any effect for
TCP, but I still keep it in sock_{s,g}etsockopt() for TCP to avoid breaking
applications.

In the future we may extend tcp_stats() as below or something else to
cover all the LINUX_MIB_* and TCP_MIB_* proposed[0] in the netconf2018.

now:
	tcp_stats(struct sock *sk, int mib_idx)
future:
	tcp_stats(struct sock *sk, int mib_idx, int packets)
	The argument packets can be 1 to indicate that this is an event only,
	or skb_shinfo(skb)->gso_segs to indicate that the number of packets
	is also of concern.

[0] page 14,
http://vger.kernel.org/netconf2018_files/BrendanGregg_netconf2018.pdf

Yafang Shao (2):
  tcp: replace SOCK_DEBUG() with tcp_stats()
  bpf: add BPF_SOCK_OPS_STATS_CB for tcp_stats()

 include/uapi/linux/bpf.h  |  5 +++++
 include/uapi/linux/snmp.h |  3 +++
 net/ipv4/proc.c           |  3 +++
 net/ipv4/tcp_input.c      | 27 ++++++++++++---------------
 net/ipv6/tcp_ipv6.c       |  2 --
 5 files changed, 23 insertions(+), 17 deletions(-)

-- 
1.8.3.1

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox