Netdev List

Netdev List
 help / color / mirror / Atom feed

* [PATCH net-next] ixgbe: introduce a helper to simplify code
From: YueHaibing @ 2018-05-23 12:08 UTC (permalink / raw)
  To: davem, jeffrey.t.kirsher; +Cc: netdev, intel-wired-lan, YueHaibing

ixgbe_dbg_reg_ops_read and ixgbe_dbg_netdev_ops_read copy-pasting
the same code except for ixgbe_dbg_netdev_ops_buf/ixgbe_dbg_reg_ops_buf,
so introduce a helper ixgbe_dbg_common_ops_read to remove redundant code.

Signed-off-by: YueHaibing <yuehaibing@huawei.com>
---
 drivers/net/ethernet/intel/ixgbe/ixgbe_debugfs.c | 57 +++++++++---------------
 1 file changed, 21 insertions(+), 36 deletions(-)

diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_debugfs.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_debugfs.c
index 55fe811..50dfb02 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_debugfs.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_debugfs.c
@@ -10,15 +10,9 @@ static struct dentry *ixgbe_dbg_root;
 
 static char ixgbe_dbg_reg_ops_buf[256] = "";
 
-/**
- * ixgbe_dbg_reg_ops_read - read for reg_ops datum
- * @filp: the opened file
- * @buffer: where to write the data for the user to read
- * @count: the size of the user's buffer
- * @ppos: file position offset
- **/
-static ssize_t ixgbe_dbg_reg_ops_read(struct file *filp, char __user *buffer,
-				    size_t count, loff_t *ppos)
+static ssize_t ixgbe_dbg_common_ops_read(struct file *filp, char __user *buffer,
+					 size_t count, loff_t *ppos,
+					 char *dbg_buf)
 {
 	struct ixgbe_adapter *adapter = filp->private_data;
 	char *buf;
@@ -29,8 +23,7 @@ static ssize_t ixgbe_dbg_reg_ops_read(struct file *filp, char __user *buffer,
 		return 0;
 
 	buf = kasprintf(GFP_KERNEL, "%s: %s\n",
-			adapter->netdev->name,
-			ixgbe_dbg_reg_ops_buf);
+			adapter->netdev->name, dbg_buf);
 	if (!buf)
 		return -ENOMEM;
 
@@ -46,6 +39,20 @@ static ssize_t ixgbe_dbg_reg_ops_read(struct file *filp, char __user *buffer,
 }
 
 /**
+ * ixgbe_dbg_reg_ops_read - read for reg_ops datum
+ * @filp: the opened file
+ * @buffer: where to write the data for the user to read
+ * @count: the size of the user's buffer
+ * @ppos: file position offset
+ **/
+static ssize_t ixgbe_dbg_reg_ops_read(struct file *filp, char __user *buffer,
+				      size_t count, loff_t *ppos)
+{
+	return ixgbe_dbg_common_ops_read(filp, buffer, count, ppos,
+					 ixgbe_dbg_reg_ops_buf);
+}
+
+/**
  * ixgbe_dbg_reg_ops_write - write into reg_ops datum
  * @filp: the opened file
  * @buffer: where to find the user's data
@@ -121,33 +128,11 @@ static char ixgbe_dbg_netdev_ops_buf[256] = "";
  * @count: the size of the user's buffer
  * @ppos: file position offset
  **/
-static ssize_t ixgbe_dbg_netdev_ops_read(struct file *filp,
-					 char __user *buffer,
+static ssize_t ixgbe_dbg_netdev_ops_read(struct file *filp, char __user *buffer,
 					 size_t count, loff_t *ppos)
 {
-	struct ixgbe_adapter *adapter = filp->private_data;
-	char *buf;
-	int len;
-
-	/* don't allow partial reads */
-	if (*ppos != 0)
-		return 0;
-
-	buf = kasprintf(GFP_KERNEL, "%s: %s\n",
-			adapter->netdev->name,
-			ixgbe_dbg_netdev_ops_buf);
-	if (!buf)
-		return -ENOMEM;
-
-	if (count < strlen(buf)) {
-		kfree(buf);
-		return -ENOSPC;
-	}
-
-	len = simple_read_from_buffer(buffer, count, ppos, buf, strlen(buf));
-
-	kfree(buf);
-	return len;
+	return ixgbe_dbg_common_ops_read(filp, buffer, count, ppos,
+					 ixgbe_dbg_netdev_ops_buf);
 }
 
 /**
-- 
2.7.0

^ permalink raw reply related

* Re: [PATCH 0/3] bpf: add boot parameters for sysctl knobs
From: Jesper Dangaard Brouer @ 2018-05-23 12:00 UTC (permalink / raw)
  To: Eugene Syromiatnikov
  Cc: Alexei Starovoitov, netdev, linux-doc, Kees Cook, Kai-Heng Feng,
	Daniel Borkmann, Alexei Starovoitov, Jonathan Corbet, Jiri Olsa,
	brouer
In-Reply-To: <20180523113542.GD3888@asgard.redhat.com>

On Wed, 23 May 2018 13:35:47 +0200
Eugene Syromiatnikov <esyr@redhat.com> wrote:

> On Mon, May 21, 2018 at 11:58:13AM -0700, Alexei Starovoitov wrote:
> > On Mon, May 21, 2018 at 02:29:30PM +0200, Eugene Syromiatnikov wrote:  
> > > Hello.
> > > 
> > > This patch set adds ability to set default values for
> > > kernel.unprivileged_bpf_disable, net.core.bpf_jit_harden,
> > > net.core.bpf_jit_kallsyms sysctl knobs as well as option to override
> > > them via a boot-time kernel parameter.  
> > 
> > Commits log not only should explain 'what' is being done by the patch,
> > but 'why' as well.  
> 
> Some BPF sysctl knobs affect the loading of BPF programs, and during
> system boot/init stages these sysctls are not yet configured. A
> concrete example is systemd, that has implemented loading of BPF
> programs.
> 
> Thus, to allow controlling these setting at early boot, this patch set
> adds the ability to change the default setting of these sysctl knobs
> as well as option to override them via a boot-time kernel parameter
> (in order to avoid rebuilding kernel each time a need of changing these
> defaults arises).
> 
> The sysctl knobs in question are kernel.unprivileged_bpf_disable,
> net.core.bpf_jit_harden, and net.core.bpf_jit_kallsyms.

Hi Eugene,

You have to resend the entire patchset with this explanation in the
cover-letter.  Your old patchset have been dropped from patchwork[1]
due to being marked with "Changes Requested".

Please remember to give it a "V2" tag and also specify which git tree
you are targeting[2], like [PATCH bpf-next V2].


[1] http://patchwork.ozlabs.org/project/netdev/list/?series=45617&state=%2a

[2] https://github.com/netoptimizer/linux/blob/bpf_doc10/Documentation/bpf/bpf_devel_QA.rst#q-how-do-i-indicate-which-tree-bpf-vs-bpf-next-my-patch-should-be-applied-to
-- 
Best regards,
  Jesper Dangaard Brouer
  MSc.CS, Principal Kernel Engineer at Red Hat
  LinkedIn: http://www.linkedin.com/in/brouer

^ permalink raw reply

* RE: [PATCH v2] packet: track ring entry use using a shadow ring to prevent RX ring overrun
From: Jon Rosen (jrosen) @ 2018-05-23 11:54 UTC (permalink / raw)
  To: Willem de Bruijn
  Cc: David S. Miller, Willem de Bruijn, Eric Dumazet, Kees Cook,
	David Windsor, Rosen, Rami, Reshetova, Elena, Mike Maloney,
	Benjamin Poirier, Thomas Gleixner, Greg Kroah-Hartman,
	open list:NETWORKING [GENERAL], open list
In-Reply-To: <CAF=yD-KvTyTibhqpKP79Obfk8KRVDLMyb6THNwNgrJzs47bAtQ@mail.gmail.com>

> > For the ring, there is no requirement to allocate exactly the amount
> > specified by the user request. Safer than relying on shared memory
> > and simpler than the extra allocation in this patch would be to allocate
> > extra shadow memory at the end of the ring (and not mmap that).
> >
> > That still leaves an extra cold cacheline vs using tp_padding.
> 
> Given my lack of experience and knowledge in writing kernel code
> it was easier for me to allocate the shadow ring as a separate
> structure.  Of course it's not about me and my skills so if it's
> more appropriate to allocate at the tail of the existing ring
> then certainly I can look at doing that.

The memory for the ring is not one contiguous block, it's an array of
blocks of pages (or 'order' sized blocks of pages). I don't think
increasing the size of each of the blocks to provided storage would be
such a good idea as it will risk spilling over into the next order and
wasting lots of memory. I suspect it's also more complex than a single
shadow ring to do both the allocation and the access.

It could be tacked onto the end of the pg_vec[] used to store the
pointers to the blocks. The challenge with that is that a pg_vec[] is
created for each of RX and TX rings so either it would have to
allocate unnecessary storage for TX or the caller will have to say if
extra space should be allocated or not.  E.g.:

static struct pgv *alloc_pg_vec(struct tpacket_req *req, int order, int scratch, void **scratch_p)

I'm not sure avoiding the extra allocation and moving it to the
pg_vec[] for the RX ring is going to get the simplification you were
hoping for.  Is there another way of storing the shadow ring which
I should consider?

^ permalink raw reply

* Re: [PATCH 0/3] bpf: add boot parameters for sysctl knobs
From: Eugene Syromiatnikov @ 2018-05-23 11:35 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: netdev, linux-kernel, linux-doc, Kees Cook, Kai-Heng Feng,
	Daniel Borkmann, Alexei Starovoitov, Jonathan Corbet, Jiri Olsa,
	Jesper Dangaard Brouer
In-Reply-To: <20180521185811.2r6372timh5rfs5n@ast-mbp.dhcp.thefacebook.com>

On Mon, May 21, 2018 at 11:58:13AM -0700, Alexei Starovoitov wrote:
> On Mon, May 21, 2018 at 02:29:30PM +0200, Eugene Syromiatnikov wrote:
> > Hello.
> > 
> > This patch set adds ability to set default values for
> > kernel.unprivileged_bpf_disable, net.core.bpf_jit_harden,
> > net.core.bpf_jit_kallsyms sysctl knobs as well as option to override
> > them via a boot-time kernel parameter.
> 
> Commits log not only should explain 'what' is being done by the patch,
> but 'why' as well.

Some BPF sysctl knobs affect the loading of BPF programs, and during
system boot/init stages these sysctls are not yet configured. A
concrete example is systemd, that has implemented loading of BPF
programs.

Thus, to allow controlling these setting at early boot, this patch set
adds the ability to change the default setting of these sysctl knobs
as well as option to override them via a boot-time kernel parameter
(in order to avoid rebuilding kernel each time a need of changing these
defaults arises).

The sysctl knobs in question are kernel.unprivileged_bpf_disable,
net.core.bpf_jit_harden, and net.core.bpf_jit_kallsyms.

^ permalink raw reply

* Re: [PATCH v2 1/1] drivers core: multi-threading device shutdown
From: Pavel Tatashin @ 2018-05-23 11:31 UTC (permalink / raw)
  To: pavel
  Cc: Steven Sistare, Daniel Jordan, LKML, jeffrey.t.kirsher,
	intel-wired-lan, netdev, gregkh, Alexander Duyck, tobin
In-Reply-To: <20180523104029.GC15312@amd>

Hi Pavel,

Thank you for looking at this patch. BTW, the version 5 is out. The latest
thread is anchered here:
http://lkml.kernel.org/r/20180516024004.28977-1-pasha.tatashin@oracle.com

> ixgbe is network card, right? So ... it does not have any persistent
> state and no moving parts, and there's no reason we could not "just
> power it down"?

True to what you said, but the same path is used for both regular reboot,
and kexec reboot. In the later case we want to make sure that devices are
quieced and do not send any interrupts, do not do any DMA activity, and
basically in the same state as when system was first cold booted, so the
driver initializing functions can pick and bring the devices up. My
understanding is that because we do not want to diverge the regular reboot
and kexec reboot, we do it for both cases.

^ permalink raw reply

* Re: [bpf-next V4 PATCH 1/8] bpf: devmap introduce dev_map_enqueue
From: Jesper Dangaard Brouer @ 2018-05-23 11:12 UTC (permalink / raw)
  To: Daniel Borkmann
  Cc: netdev, Daniel Borkmann, Alexei Starovoitov, Christoph Hellwig,
	BjörnTöpel, Magnus Karlsson, makita.toshiaki, brouer
In-Reply-To: <1cb01875-6044-540a-e1f8-0cafd9587ee8@iogearbox.net>


On Wed, 23 May 2018 11:34:22 +0200 Daniel Borkmann <daniel@iogearbox.net> wrote:

> > +int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp)
> > +{
> > +	struct net_device *dev = dst->dev;
> > +	struct xdp_frame *xdpf;
> > +	int err;
> > +
> > +	if (!dev->netdev_ops->ndo_xdp_xmit)
> > +		return -EOPNOTSUPP;
> > +
> > +	xdpf = convert_to_xdp_frame(xdp);
> > +	if (unlikely(!xdpf))
> > +		return -EOVERFLOW;
> > +
> > +	/* TODO: implement a bulking/enqueue step later */
> > +	err = dev->netdev_ops->ndo_xdp_xmit(dev, xdpf);
> > +	if (err)
> > +		return err;
> > +
> > +	return 0;  
> 
> The 'err' is just unnecessary, lets just do:
> 
>   return dev->netdev_ops->ndo_xdp_xmit(dev, xdpf);
> 
> Later after the other patches this becomes:
> 
>   return bq_enqueue(dst, xdpf, dev_rx);

I agree, I'll fix this up in V5.

After this patchset gets applied, there are also other opportunities to
do similar micro-optimizations.  I have a branch (on top of this
patchset) which does such micro-optimizations (including this) plus
I've looked at the resulting asm-code layout.  But my benchmarks only
show a 2 nanosec improvement for all these micro-optimizations (where
the focus is to reduce the asm-code I-cache size of xdp_do_redirect).

-- 
Best regards,
  Jesper Dangaard Brouer
  MSc.CS, Principal Kernel Engineer at Red Hat
  LinkedIn: http://www.linkedin.com/in/brouer

^ permalink raw reply

* RE: [PATCH v2] packet: track ring entry use using a shadow ring to prevent RX ring overrun
From: Jon Rosen (jrosen) @ 2018-05-23 11:08 UTC (permalink / raw)
  To: Willem de Bruijn
  Cc: David S. Miller, Willem de Bruijn, Eric Dumazet, Kees Cook,
	David Windsor, Rosen, Rami, Reshetova, Elena, Mike Maloney,
	Benjamin Poirier, Thomas Gleixner, Greg Kroah-Hartman,
	open list:NETWORKING [GENERAL], open list
In-Reply-To: <CAF=yD-KHFwhVt4xzED7FPLd_xej9nLmoZOgZG0PmsVgbrnN0Ng@mail.gmail.com>

> >>> I think the bigger issues as you've pointed out are the cost of
> >>> the additional spin lock and should the additional state be
> >>> stored in-band (fewer cache lines) or out-of band (less risk of
> >>> breaking due to unpredictable application behavior).
> >>
> >> We don't need the spinlock if clearing the shadow byte after
> >> setting the status to user.
> >>
> >> Worst case, user will set it back to kernel while the shadow
> >> byte is not cleared yet and the next producer will drop a packet.
> >> But next producers will make progress, so there is no deadlock
> >> or corruption.
> >
> > I thought so too for a while but after spending more time than I
> > care to admit I relized the following sequence was occuring:
> >
> >    Core A                       Core B
> >    ------                       ------
> >    - Enter spin_lock
> >    -   Get tp_status of head (X)
> >        tp_status == 0
> >    -   Check inuse
> >        inuse == 0
> >    -   Allocate entry X
> >        advance head (X+1)
> >        set inuse=1
> >    - Exit spin_lock
> >
> >      <very long delay>
> >
> >                                 <allocate N-1 entries
> >                                 where N = size of ring>
> >
> >                                 - Enter spin_lock
> >                                 -   get tp_status of head (X+N)
> >                                     tp_status == 0 (but slot
> >                                     in use for X on core A)
> >
> >    - write tp_status of         <--- trouble!
> >      X = TP_STATUS_USER         <--- trouble!
> >    - write inuse=0              <--- trouble!
> >
> >                                 -   Check inuse
> >                                     inuse == 0
> >                                 -   Allocate entry X+N
> >                                     advance head (X+N+1)
> >                                     set inuse=1
> >                                 - Exit spin_lock
> >
> >
> > At this point Core A just passed slot X to userspace with a
> > packet and Core B has just been assigned slot X+N (same slot as
> > X) for it's new packet. Both cores A and B end up filling in that
> > slot.  Tracking ths donw was one of the reasons it took me a
> > while to produce these updated diffs.
> 
> Is this not just an ordering issue? Since inuse is set after tp_status,
> it has to be tested first (and barriers are needed to avoid reordering).

I changed the code as you suggest to do the inuse check first and
removed the extra added spin_lock/unlock and it seems to be working.
I was able to run through the night without an issue (normally I would
hit the ring corruption in 1 to 2 hours).

Thanks for pointing that out, I should have caught that myself.  Next
I'll look at your suggestion for where to put the shadow ring.

^ permalink raw reply

* Re: [v8, bpf-next, 4/9] net/wireless/iwlwifi: fix iwlwifi_dev_ucode_error tracepoint
From: Johannes Berg @ 2018-05-23 11:03 UTC (permalink / raw)
  To: Alexei Starovoitov, davem-fT/PcQaiUtIeIZ0/mPfg9Q
  Cc: daniel-FeC+5ew28dpmcu3hnIyYJQ,
	torvalds-de/tnXTf+JLsfHDXvbKv3WD2FQJk+8+b,
	peterz-wEGCiKHe2LqWVfeAwA7xHQ, rostedt-nx8X9YLhiw1AfugRpC6u6w,
	mathieu.desnoyers-vg+e7yoeK/dWk0Htik3J/w,
	netdev-u79uwXL29TY76Z2rM5mHXA, kernel-team-b10kYP2dOMg,
	linux-api-u79uwXL29TY76Z2rM5mHXA,
	linux-wireless-u79uwXL29TY76Z2rM5mHXA
In-Reply-To: <20180328190540.370956-5-ast-DgEjT+Ai2ygdnm+yROfE0A@public.gmane.org>

On Wed, 2018-03-28 at 12:05 -0700, Alexei Starovoitov wrote:
> fix iwlwifi_dev_ucode_error tracepoint to pass pointer to a table
> instead of all 17 arguments by value.
> dvm/main.c and mvm/utils.c have 'struct iwl_error_event_table'
> defined with very similar yet subtly different fields and offsets.
> tracepoint is still common and using definition of 'struct iwl_error_event_table'
> from dvm/commands.h while copying fields.
> Long term this tracepoint probably should be split into two.

It would've been nice to CC the wireless list for wireless related
patches ...

> --- a/drivers/net/wireless/intel/iwlwifi/iwl-devtrace.c
> +++ b/drivers/net/wireless/intel/iwlwifi/iwl-devtrace.c
> @@ -30,6 +30,7 @@
>  #ifndef __CHECKER__
>  #include "iwl-trans.h"
>  
> +#include "dvm/commands.h"

In particular, this breaks the whole driver abstraction.

> +++ b/drivers/net/wireless/intel/iwlwifi/mvm/utils.c
> @@ -549,12 +549,7 @@ static void iwl_mvm_dump_lmac_error_log(struct iwl_mvm *mvm, u32 base)
>  
>         IWL_ERR(mvm, "Loaded firmware version: %s\n", mvm->fw->fw_version);
>  
> -       trace_iwlwifi_dev_ucode_error(trans->dev, table.error_id, table.tsf_low,
> -                                     table.data1, table.data2, table.data3,
> -                                     table.blink2, table.ilink1,
> -                                     table.ilink2, table.bcon_time, table.gp1,
> -                                     table.gp2, table.fw_rev_type, table.major,
> -                                     table.minor, table.hw_ver, table.brd_ver);
> +       trace_iwlwifi_dev_ucode_error(trans->dev, &table, table.hw_ver, table.brd_ver);

This is also utterly wrong because mvm has - for better or worse - a
different type "struct iwl_error_event_table" in this file ...

This really should never have gotten into the tree.

johannes

^ permalink raw reply

* [PATCH] netfilter: nft_numgen: fix ptr_ret.cocci warnings
From: kbuild test robot @ 2018-05-23 10:58 UTC (permalink / raw)
  To: Laura Garcia Liebana
  Cc: kbuild-all, Pablo Neira Ayuso, Jozsef Kadlecsik, Florian Westphal,
	netfilter-devel, coreteam, netdev, linux-kernel
In-Reply-To: <201805231855.2507c2Vh%fengguang.wu@intel.com>

From: kbuild test robot <fengguang.wu@intel.com>

net/netfilter/nft_numgen.c:117:1-3: WARNING: PTR_ERR_OR_ZERO can be used


 Use PTR_ERR_OR_ZERO rather than if(IS_ERR(...)) + PTR_ERR

Generated by: scripts/coccinelle/api/ptr_ret.cocci

Fixes: d734a2888922 ("netfilter: nft_numgen: add map lookups for numgen statements")
CC: Laura Garcia Liebana <nevola@gmail.com>
Signed-off-by: kbuild test robot <fengguang.wu@intel.com>
---

 nft_numgen.c |    5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

--- a/net/netfilter/nft_numgen.c
+++ b/net/netfilter/nft_numgen.c
@@ -114,10 +114,7 @@ static int nft_ng_inc_map_init(const str
 					  tb[NFTA_NG_SET_NAME],
 					  tb[NFTA_NG_SET_ID], genmask);
 
-	if (IS_ERR(priv->map))
-		return PTR_ERR(priv->map);
-
-	return 0;
+	return PTR_ERR_OR_ZERO(priv->map);
 }
 
 static int nft_ng_dump(struct sk_buff *skb, enum nft_registers dreg,

^ permalink raw reply

* Re: [bpf-next V4 PATCH 2/8] bpf: devmap prepare xdp frames for bulking
From: Daniel Borkmann @ 2018-05-23 10:45 UTC (permalink / raw)
  To: Jesper Dangaard Brouer
  Cc: netdev, Daniel Borkmann, Alexei Starovoitov, Christoph Hellwig,
	BjörnTöpel, Magnus Karlsson, makita.toshiaki
In-Reply-To: <20180523122917.05f45694@redhat.com>

On 05/23/2018 12:29 PM, Jesper Dangaard Brouer wrote:
> On Wed, 23 May 2018 11:54:38 +0200
> Daniel Borkmann <daniel@iogearbox.net> wrote:
> 
>> Please fix this bug up and send a fresh series, so we can fix this
>> right away without needing a follow-up in bpf-next.
> 
> Okay.  So I assume this means you reverted/didn't push this patchset in
> bpf-next tree... so I have time to respin the series, and I will send a V5.

Yeah, I tossed it from there, so we have a chance to fix it up in the series
itself, which will then later on land in a clean state in net-next eventually.
So please resend the series in a v5. Thanks Jesper!

^ permalink raw reply

* Re: [PATCH 1/1] tools/lib/libbpf.c: fix string format to allow build on arm32
From: Daniel Borkmann @ 2018-05-23 10:41 UTC (permalink / raw)
  To: Sirio Balmelli; +Cc: netdev, kafai
In-Reply-To: <20180521065940.srb4rqwaohkn43i6@vm4>

[ +Martin ]

On 05/21/2018 08:59 AM, Sirio Balmelli wrote:
> On arm32, 'cd tools/testing/selftests/bpf && make' fails with:
> 
> libbpf.c:80:10: error: format ‘%ld’ expects argument of type ‘long int’, but argument 4 has type ‘int64_t {aka long long int}’ [-Werror=format=]
>    (func)("libbpf: " fmt, ##__VA_ARGS__); \
>           ^
> libbpf.c:83:30: note: in expansion of macro ‘__pr’
>  #define pr_warning(fmt, ...) __pr(__pr_warning, fmt, ##__VA_ARGS__)
>                               ^~~~
> libbpf.c:1072:3: note: in expansion of macro ‘pr_warning’
>    pr_warning("map:%s value_type:%s has BTF type_size:%ld != value_size:%u\n",
> 
> To fix, include 'inttypes.h' and change format directive to 'PRIi64'.
> 
> Signed-off-by: Sirio Balmelli <sirio@b-ad.ch>

Thanks for the fix! One minor comment below.

> ---
>  tools/lib/bpf/libbpf.c | 6 ++++--
>  1 file changed, 4 insertions(+), 2 deletions(-)
> 
> diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
> index 3dbe217bf23e..e2cc8f10c188 100644
> --- a/tools/lib/bpf/libbpf.c
> +++ b/tools/lib/bpf/libbpf.c
> @@ -48,6 +48,8 @@
>  #include "bpf.h"
>  #include "btf.h"
>  
> +#include <inttypes.h>   /* PRIi64 */
> +
>  #ifndef EM_BPF
>  #define EM_BPF 247
>  #endif
> @@ -1042,7 +1044,7 @@ static int bpf_map_find_btf_info(struct bpf_map *map, const struct btf *btf)
>  	}
>  
>  	if (def->key_size != key_size) {
> -		pr_warning("map:%s key_type:%s has BTF type_size:%ld != key_size:%u\n",
> +		pr_warning("map:%s key_type:%s has BTF type_size:%"PRIi64" != key_size:%u\n",
>  			   map->name, name, key_size, def->key_size);
>  		return -EINVAL;
>  	}
> @@ -1069,7 +1071,7 @@ static int bpf_map_find_btf_info(struct bpf_map *map, const struct btf *btf)
>  	}
>  
>  	if (def->value_size != value_size) {
> -		pr_warning("map:%s value_type:%s has BTF type_size:%ld != value_size:%u\n",
> +		pr_warning("map:%s value_type:%s has BTF type_size:%"PRIi64" != value_size:%u\n",
>  			   map->name, name, value_size, def->value_size);

I don't think we need the PRIi64 in here. Could you just change it to 'type_size:%u'
and then cast the 'key_size' resp. 'value_size' to unsigned int? We know at this
point that the type size value is positive anyway, so we only want to show the size
mismatch so simple cast should suffice. Thanks!

>  		return -EINVAL;
>  	}
> 

^ permalink raw reply

* Re: [PATCH v2 1/1] drivers core: multi-threading device shutdown
From: Pavel Machek @ 2018-05-23 10:40 UTC (permalink / raw)
  To: Pavel Tatashin
  Cc: steven.sistare, daniel.m.jordan, linux-kernel, jeffrey.t.kirsher,
	intel-wired-lan, netdev, gregkh, alexander.duyck, tobin
In-Reply-To: <20180505154040.28614-2-pasha.tatashin@oracle.com>

[-- Attachment #1: Type: text/plain, Size: 1260 bytes --]

Hi!

> When system is rebooted, halted or kexeced device_shutdown() is
> called.
> 
> This function shuts down every single device by calling either:
> 
> 	dev->bus->shutdown(dev)
> 	dev->driver->shutdown(dev)
...
> Finally, with ixgbe_shutdown() it takes 0.37 for each device, but that time
> is spread all over the place, with bigger offenders:
> 
>     ixgbe_shutdown
>       __ixgbe_shutdown
>         ixgbe_close_suspend
>           ixgbe_down
>             ixgbe_init_hw_generic
>               ixgbe_reset_hw_X540
>                 msleep(100);                        0.104483472
>                 ixgbe_get_san_mac_addr_generic      0.048414851
>                 ixgbe_get_wwn_prefix_generic        0.048409893
>               ixgbe_start_hw_X540
>                 ixgbe_start_hw_generic
>                   ixgbe_clear_hw_cntrs_generic      0.048581502
>                   ixgbe_setup_fc_generic            0.024225800

ixgbe is network card, right? So ... it does not have any persistent
state and no moving parts, and there's no reason we could not "just
power it down"?
>  /*

-- 
(english) http://www.livejournal.com/~pavelmachek
(cesky, pictures) http://atrey.karlin.mff.cuni.cz/~pavel/picture/horses/blog.html

[-- Attachment #2: Digital signature --]
[-- Type: application/pgp-signature, Size: 181 bytes --]

^ permalink raw reply

* Re: [bpf-next V4 PATCH 2/8] bpf: devmap prepare xdp frames for bulking
From: Jesper Dangaard Brouer @ 2018-05-23 10:38 UTC (permalink / raw)
  To: Daniel Borkmann
  Cc: netdev, Daniel Borkmann, Alexei Starovoitov, Christoph Hellwig,
	BjörnTöpel, Magnus Karlsson, makita.toshiaki, brouer
In-Reply-To: <1db29c53-1568-a3b5-f2d3-1c830aefe33b@iogearbox.net>

On Wed, 23 May 2018 11:54:38 +0200
Daniel Borkmann <daniel@iogearbox.net> wrote:

> > +	for (i = 0; i < bq->count; i++) {
> > +		struct xdp_frame *xdpf = bq->q[i];
> > +		int err;
> > +
> > +		err = dev->netdev_ops->ndo_xdp_xmit(dev, xdpf);
> > +		if (err) {
> > +			drops++;
> > +			xdp_return_frame(xdpf);
> > +		}
> > +		processed++;  
> 
> This sort of thing makes it really hard to review. 'processed' and
> 'drops' are not read anywhere in this function. So I need to go and
> check all the other patches whether there's further logic involved here
> or not. I had to review your series after applying all patches in a
> local branch, please never do this. Add the logic in a patch where it's
> self-contained and obvious to review.

Sorry, 'processed' and 'drops' were used by the tracepoint that Alexei
asked me to split into another (next patch).  And I can see that I have
renamed 'processed' to 'sent' in the next tracepoint patch, which makes
reviewing even harder sorry.  Those lines should have been moved to the
tracepoint patch. My mistake when splitting up the patches.

-- 
Best regards,
  Jesper Dangaard Brouer
  MSc.CS, Principal Kernel Engineer at Red Hat
  LinkedIn: http://www.linkedin.com/in/brouer

^ permalink raw reply

* Re: [PATCH bpf-next v3 10/10] tools: bpftool: add delimiters to multi-function JITed dumps
From: Sandipan Das @ 2018-05-23 10:37 UTC (permalink / raw)
  To: Daniel Borkmann, Jakub Kicinski
  Cc: ast, netdev, linuxppc-dev, mpe, naveen.n.rao, Quentin Monnet
In-Reply-To: <2dabfa7f-15b8-236c-7724-33bc3da7e549@iogearbox.net>


On 05/23/2018 02:38 PM, Daniel Borkmann wrote:
> On 05/22/2018 09:55 PM, Jakub Kicinski wrote:
>> On Tue, 22 May 2018 22:46:13 +0530, Sandipan Das wrote:
>>> +		if (info.nr_jited_func_lens && info.jited_func_lens) {
>>> +			struct kernel_sym *sym = NULL;
>>> +			unsigned char *img = buf;
>>> +			__u64 *ksyms = NULL;
>>> +			__u32 *lens;
>>> +			__u32 i;
>>> +
>>> +			if (info.nr_jited_ksyms) {
>>> +				kernel_syms_load(&dd);
>>> +				ksyms = (__u64 *) info.jited_ksyms;
>>> +			}
>>> +
>>> +			lens = (__u32 *) info.jited_func_lens;
>>> +			for (i = 0; i < info.nr_jited_func_lens; i++) {
>>> +				if (ksyms) {
>>> +					sym = kernel_syms_search(&dd, ksyms[i]);
>>> +					if (sym)
>>> +						printf("%s:\n", sym->name);
>>> +					else
>>> +						printf("%016llx:\n", ksyms[i]);
>>> +				}
>>> +
>>> +				disasm_print_insn(img, lens[i], opcodes, name);
>>> +				img += lens[i];
>>> +				printf("\n");
>>> +			}
>>> +		} else {
>>
>> The output doesn't seem to be JSON-compatible :(  We try to make sure
>> all bpftool command can produce valid JSON when run with -j (or -p)
>> switch.
>>
>> Would it be possible to make each function a separate JSON object with
>> "name" and "insn" array?  Would that work?
> 
> Sandipan, could you take a look at this? Given there's json output today we
> should definitely try not to break it; presumably this would be one final
> respin of your series with this fixed.
> 
> 

Sure. With a few changes, I am able get JSON output like the following:

# echo 0 > /proc/sys/net/core/bpf_jit_kallsyms
# bpftool prog -p dump jited id 1

[{
        "name": "0xd00000000aa80000",
        "insns": [{
                "pc": "0x0",
                "operation": "nop",
                "operands": [null
                ]
            },{
                "pc": "0x4",
                "operation": "nop",
                "operands": [null
                ]
            },{
                "pc": "0x8",
                "operation": "mflr",
                "operands": ["r0"
                ]
            },{
                "pc": "0xc",
                "operation": "std",
                "operands": ["r0","16","(","r1",")"
                ]
            },{
                "pc": "0x10",
                "operation": "stdu",
                "operands": ["r1","-112","(","r1",")"
                ]
            },{
                ...
            }
        ]
    },{
        "name": "0xd00000000aae0000",
        "insns": [{
                "pc": "0x0",
                "operation": "nop",
                "operands": [null
                ]
            },{
                "pc": "0x4",
                "operation": "nop",
                "operands": [null
                ]
            },{
                "pc": "0x8",
                "operation": "mflr",
                "operands": ["r0"
                ]
            },{
                ...
            }
        ]
    }
]

# echo 1 > /proc/sys/net/core/bpf_jit_kallsyms
# bpftool prog -p dump jited id 1

[{
        "name": "bpf_prog_b811aab41a39ad3d_foo",
        "insns": [{
                "pc": "0x0",
                "operation": "nop",
                "operands": [null
                ]
            },{
                "pc": "0x4",
                "operation": "nop",
                "operands": [null
                ]
            },{
                "pc": "0x8",
                "operation": "mflr",
                "operands": ["r0"
                ]
            },{
                "pc": "0xc",
                "operation": "std",
                "operands": ["r0","16","(","r1",")"
                ]
            },{
                "pc": "0x10",
                "operation": "stdu",
                "operands": ["r1","-112","(","r1",")"
                ]
            },{
                ...
            }
        ]
    },{
        "name": "bpf_prog_196af774a3477707_F",
        "insns": [{
                "pc": "0x0",
                "operation": "nop",
                "operands": [null
                ]
            },{
                "pc": "0x4",
                "operation": "nop",
                "operands": [null
                ]
            },{
                "pc": "0x8",
                "operation": "mflr",
                "operands": ["r0"
                ]
            },{
                ...
            }
        ]
    }
]

If this is okay, I can send out the next revision with these changes.



Other than that, for powerpc64, there is a problem with the way the
binutils disassembler code (in "opcodes/ppc-dis.c") passes arguments
to the callback fprintf_json().

In fprintf_json(), we always expect the va_list elements to resolve
to strings (char *). But for powerpc64, the register or immediate
operands are always passed as integers. So, when the code attempts
to resolve these operands using va_arg(ap, char *), bpftool crashes.
For now, I am using a workaround based on vsnprintf() but this does
not get the semantics correct for memory operands. You can probably
see that for the store instructions in the JSON dump above this.

Daniel,

Would it be okay if I send out a fix for this in a different series?

- Sandipan

^ permalink raw reply

* [PATCH] atmel: Add missing call to pci_disable_device()
From: YueHaibing @ 2018-05-23 10:34 UTC (permalink / raw)
  To: simon-xn1N/tgparsycpQjotevgVpr/1R2p/CL,
	kvalo-sgV2jX0FEOL9JmXXK+q4OQ, davem-fT/PcQaiUtIeIZ0/mPfg9Q
  Cc: netdev-u79uwXL29TY76Z2rM5mHXA,
	linux-wireless-u79uwXL29TY76Z2rM5mHXA, YueHaibing

add pci_disable_device in error handling while init_atmel_card failed.

Signed-off-by: YueHaibing <yuehaibing-hv44wF8Li93QT0dZR+AlfA@public.gmane.org>
---
 drivers/net/wireless/atmel/atmel_pci.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/net/wireless/atmel/atmel_pci.c b/drivers/net/wireless/atmel/atmel_pci.c
index bcf1f27..30df58a 100644
--- a/drivers/net/wireless/atmel/atmel_pci.c
+++ b/drivers/net/wireless/atmel/atmel_pci.c
@@ -61,8 +61,10 @@ static int atmel_pci_probe(struct pci_dev *pdev,
 	dev = init_atmel_card(pdev->irq, pdev->resource[1].start,
 			      ATMEL_FW_TYPE_506,
 			      &pdev->dev, NULL, NULL);
-	if (!dev)
+	if (!dev) {
+		pci_disable_device(pdev);
 		return -ENODEV;
+	}
 
 	pci_set_drvdata(pdev, dev);
 	return 0;
-- 
2.7.0

^ permalink raw reply related

* Re: [PATCH bpf-next v2 0/7] BTF uapi cleanup
From: Daniel Borkmann @ 2018-05-23 10:29 UTC (permalink / raw)
  To: Martin KaFai Lau, netdev; +Cc: Alexei Starovoitov, Yonghong Song, kernel-team
In-Reply-To: <20180522215723.784040-1-kafai@fb.com>

On 05/22/2018 11:57 PM, Martin KaFai Lau wrote:
> This patch set makes some changes to cleanup the unused
> bits in BTF uapi.  It also makes the btf_header extensible.
> 
> Please see individual patches for details.
> 
> v2:
> - Remove NR_SECS from patch 2
> - Remove "unsigned" check on array->index_type from patch 3
> - Remove BTF_INT_VARARGS and further limit BTF_INT_ENCODING
>   from 8 bits to 4 bits in patch 4
> - Adjustments in test_btf.c to reflect changes in v2
> 
> Martin KaFai Lau (7):
>   bpf: Expose check_uarg_tail_zero()
>   bpf: btf: Change how section is supported in btf_header
>   bpf: btf: Check array->index_type
>   bpf: btf: Remove unused bits from uapi/linux/btf.h
>   bpf: btf: Rename btf_key_id and btf_value_id in bpf_map_info
>   bpf: btf: Sync bpf.h and btf.h to tools
>   bpf: btf: Add tests for the btf uapi changes
> 
>  include/linux/bpf.h                    |   6 +-
>  include/uapi/linux/bpf.h               |   8 +-
>  include/uapi/linux/btf.h               |  37 +--
>  kernel/bpf/arraymap.c                  |   2 +-
>  kernel/bpf/btf.c                       | 335 +++++++++++++++------
>  kernel/bpf/syscall.c                   |  32 +-
>  tools/include/uapi/linux/bpf.h         |   8 +-
>  tools/include/uapi/linux/btf.h         |  37 +--
>  tools/lib/bpf/bpf.c                    |   4 +-
>  tools/lib/bpf/bpf.h                    |   4 +-
>  tools/lib/bpf/btf.c                    |   5 +-
>  tools/lib/bpf/libbpf.c                 |  34 +--
>  tools/lib/bpf/libbpf.h                 |   4 +-
>  tools/testing/selftests/bpf/test_btf.c | 521 +++++++++++++++++++++++++++------
>  14 files changed, 753 insertions(+), 284 deletions(-)
> 

Applied to bpf-next, thanks Martin!

^ permalink raw reply

* Re: [PATCH v2] selftests: net: reuseport_bpf_numa: don't fail if no numa support
From: Daniel Borkmann @ 2018-05-23 10:29 UTC (permalink / raw)
  To: Anders Roxell, davem, shuah; +Cc: netdev, linux-kselftest, linux-kernel
In-Reply-To: <20180518222737.16006-1-anders.roxell@linaro.org>

On 05/19/2018 12:27 AM, Anders Roxell wrote:
> The reuseport_bpf_numa test case fails there's no numa support.  The
> test shouldn't fail if there's no support it should be skipped.
> 
> Fixes: 3c2c3c16aaf6 ("reuseport, bpf: add test case for bpf_get_numa_node_id")
> Signed-off-by: Anders Roxell <anders.roxell@linaro.org>

Applied to bpf tree, thanks Anders!

^ permalink raw reply

* Re: [bpf-next V4 PATCH 2/8] bpf: devmap prepare xdp frames for bulking
From: Jesper Dangaard Brouer @ 2018-05-23 10:29 UTC (permalink / raw)
  To: Daniel Borkmann
  Cc: netdev, Daniel Borkmann, Alexei Starovoitov, Christoph Hellwig,
	BjörnTöpel, Magnus Karlsson, makita.toshiaki, brouer
In-Reply-To: <1db29c53-1568-a3b5-f2d3-1c830aefe33b@iogearbox.net>

On Wed, 23 May 2018 11:54:38 +0200
Daniel Borkmann <daniel@iogearbox.net> wrote:

> Please fix this bug up and send a fresh series, so we can fix this
> right away without needing a follow-up in bpf-next.

Okay.  So I assume this means you reverted/didn't push this patchset in
bpf-next tree... so I have time to respin the series, and I will send a V5.

-- 
Best regards,
  Jesper Dangaard Brouer
  MSc.CS, Principal Kernel Engineer at Red Hat
  LinkedIn: http://www.linkedin.com/in/brouer

^ permalink raw reply

* Re: [bpf-next V4 PATCH 2/8] bpf: devmap prepare xdp frames for bulking
From: Daniel Borkmann @ 2018-05-23  9:54 UTC (permalink / raw)
  To: Jesper Dangaard Brouer, netdev, Daniel Borkmann,
	Alexei Starovoitov
  Cc: Christoph Hellwig, BjörnTöpel, Magnus Karlsson,
	makita.toshiaki
In-Reply-To: <152665048175.21055.15345477060144555643.stgit@firesoul>

On 05/18/2018 03:34 PM, Jesper Dangaard Brouer wrote:
> Like cpumap create queue for xdp frames that will be bulked.  For now,
> this patch simply invoke ndo_xdp_xmit foreach frame.  This happens,
> either when the map flush operation is envoked, or when the limit
> DEV_MAP_BULK_SIZE is reached.
> 
> Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
> ---
>  kernel/bpf/devmap.c |   77 ++++++++++++++++++++++++++++++++++++++++++++++++---
>  1 file changed, 73 insertions(+), 4 deletions(-)
> 
> diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
> index 808808bf2bf2..cab72c100bb5 100644
> --- a/kernel/bpf/devmap.c
> +++ b/kernel/bpf/devmap.c
> @@ -54,11 +54,18 @@
>  #define DEV_CREATE_FLAG_MASK \
>  	(BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY)
>  
> +#define DEV_MAP_BULK_SIZE 16
> +struct xdp_bulk_queue {
> +	struct xdp_frame *q[DEV_MAP_BULK_SIZE];
> +	unsigned int count;
> +};
> +
>  /* objects in the map */
>  struct bpf_dtab_netdev {
>  	struct net_device *dev; /* must be first member, due to tracepoint */
>  	struct bpf_dtab *dtab;
>  	unsigned int bit;
> +	struct xdp_bulk_queue __percpu *bulkq;
>  	struct rcu_head rcu;
>  };
>  
> @@ -209,6 +216,38 @@ void __dev_map_insert_ctx(struct bpf_map *map, u32 bit)
>  	__set_bit(bit, bitmap);
>  }
>  
> +static int bq_xmit_all(struct bpf_dtab_netdev *obj,
> +			 struct xdp_bulk_queue *bq)
> +{
> +	unsigned int processed = 0, drops = 0;
> +	struct net_device *dev = obj->dev;
> +	int i;
> +
> +	if (unlikely(!bq->count))
> +		return 0;
> +
> +	for (i = 0; i < bq->count; i++) {
> +		struct xdp_frame *xdpf = bq->q[i];
> +
> +		prefetch(xdpf);
> +	}
> +
> +	for (i = 0; i < bq->count; i++) {
> +		struct xdp_frame *xdpf = bq->q[i];
> +		int err;
> +
> +		err = dev->netdev_ops->ndo_xdp_xmit(dev, xdpf);
> +		if (err) {
> +			drops++;
> +			xdp_return_frame(xdpf);
> +		}
> +		processed++;

This sort of thing makes it really hard to review. 'processed' and
'drops' are not read anywhere in this function. So I need to go and
check all the other patches whether there's further logic involved here
or not. I had to review your series after applying all patches in a
local branch, please never do this. Add the logic in a patch where it's
self-contained and obvious to review.

> +	}
> +	bq->count = 0;
> +
> +	return 0;
> +}
> +
>  /* __dev_map_flush is called from xdp_do_flush_map() which _must_ be signaled
>   * from the driver before returning from its napi->poll() routine. The poll()
>   * routine is called either from busy_poll context or net_rx_action signaled
> @@ -224,6 +263,7 @@ void __dev_map_flush(struct bpf_map *map)
>  
>  	for_each_set_bit(bit, bitmap, map->max_entries) {
>  		struct bpf_dtab_netdev *dev = READ_ONCE(dtab->netdev_map[bit]);
> +		struct xdp_bulk_queue *bq;
>  		struct net_device *netdev;
>  
>  		/* This is possible if the dev entry is removed by user space
> @@ -233,6 +273,9 @@ void __dev_map_flush(struct bpf_map *map)
>  			continue;
>  
>  		__clear_bit(bit, bitmap);
> +
> +		bq = this_cpu_ptr(dev->bulkq);
> +		bq_xmit_all(dev, bq);
>  		netdev = dev->dev;
>  		if (likely(netdev->netdev_ops->ndo_xdp_flush))
>  			netdev->netdev_ops->ndo_xdp_flush(netdev);
> @@ -255,6 +298,20 @@ struct bpf_dtab_netdev *__dev_map_lookup_elem(struct bpf_map *map, u32 key)
>  	return obj;
>  }
>  
> +/* Runs under RCU-read-side, plus in softirq under NAPI protection.
> + * Thus, safe percpu variable access.
> + */
> +static int bq_enqueue(struct bpf_dtab_netdev *obj, struct xdp_frame *xdpf)
> +{
> +	struct xdp_bulk_queue *bq = this_cpu_ptr(obj->bulkq);
> +
> +	if (unlikely(bq->count == DEV_MAP_BULK_SIZE))
> +		bq_xmit_all(obj, bq);
> +
> +	bq->q[bq->count++] = xdpf;
> +	return 0;
> +}
> +
>  int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp)
>  {
>  	struct net_device *dev = dst->dev;
> @@ -268,8 +325,7 @@ int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp)
>  	if (unlikely(!xdpf))
>  		return -EOVERFLOW;
>  
> -	/* TODO: implement a bulking/enqueue step later */
> -	err = dev->netdev_ops->ndo_xdp_xmit(dev, xdpf);
> +	err = bq_enqueue(dst, xdpf);
>  	if (err)
>  		return err;
>  
> @@ -288,13 +344,18 @@ static void dev_map_flush_old(struct bpf_dtab_netdev *dev)
>  {
>  	if (dev->dev->netdev_ops->ndo_xdp_flush) {
>  		struct net_device *fl = dev->dev;
> +		struct xdp_bulk_queue *bq;
>  		unsigned long *bitmap;
> +
>  		int cpu;

Please remove the newline from above.

>  		for_each_online_cpu(cpu) {
>  			bitmap = per_cpu_ptr(dev->dtab->flush_needed, cpu);
>  			__clear_bit(dev->bit, bitmap);
>  
> +			bq = per_cpu_ptr(dev->bulkq, cpu);
> +			bq_xmit_all(dev, bq);
> +
>  			fl->netdev_ops->ndo_xdp_flush(dev->dev);
>  		}
>  	}
> @@ -306,6 +367,7 @@ static void __dev_map_entry_free(struct rcu_head *rcu)
>  
>  	dev = container_of(rcu, struct bpf_dtab_netdev, rcu);
>  	dev_map_flush_old(dev);
> +	free_percpu(dev->bulkq);
>  	dev_put(dev->dev);
>  	kfree(dev);
>  }
> @@ -338,6 +400,7 @@ static int dev_map_update_elem(struct bpf_map *map, void *key, void *value,
>  {
>  	struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
>  	struct net *net = current->nsproxy->net_ns;
> +	gfp_t gfp = GFP_ATOMIC | __GFP_NOWARN;
>  	struct bpf_dtab_netdev *dev, *old_dev;
>  	u32 i = *(u32 *)key;
>  	u32 ifindex = *(u32 *)value;
> @@ -352,11 +415,17 @@ static int dev_map_update_elem(struct bpf_map *map, void *key, void *value,
>  	if (!ifindex) {
>  		dev = NULL;
>  	} else {
> -		dev = kmalloc_node(sizeof(*dev), GFP_ATOMIC | __GFP_NOWARN,
> -				   map->numa_node);
> +		dev = kmalloc_node(sizeof(*dev), gfp, map->numa_node);
>  		if (!dev)
>  			return -ENOMEM;
>  
> +		dev->bulkq = __alloc_percpu_gfp(sizeof(*dev->bulkq),
> +						sizeof(void *), gfp);
> +		if (!dev->bulkq) {
> +			kfree(dev);
> +			return -ENOMEM;
> +		}
> +
>  		dev->dev = dev_get_by_index(net, ifindex);
>  		if (!dev->dev) {
>  			kfree(dev);

Ahh well, and I just realized that here you are leaking memory in the error path. :(

A free_percpu(dev->bulkq) is missing.

Please fix this bug up and send a fresh series, so we can fix this right away without
needing a follow-up in bpf-next.

Thanks,
Daniel

^ permalink raw reply

* Re: [PATCH net-next] sctp: add support for SCTP_REUSE_PORT sockopt
From: Neil Horman @ 2018-05-23  9:48 UTC (permalink / raw)
  To: Xin Long
  Cc: Michael Tuexen, Marcelo Ricardo Leitner, network dev, linux-sctp,
	davem
In-Reply-To: <CADvbK_fRLwT=SHHahPwpJvxnFQWbuwrM6PWLW+20vmXhUgAZxA@mail.gmail.com>

On Wed, May 23, 2018 at 03:04:53PM +0800, Xin Long wrote:
> On Tue, May 22, 2018 at 7:51 PM, Neil Horman <nhorman@tuxdriver.com> wrote:
> > On Tue, May 22, 2018 at 03:07:57PM +0800, Xin Long wrote:
> >> On Mon, May 21, 2018 at 9:48 PM, Neil Horman <nhorman@tuxdriver.com> wrote:
> >> > On Mon, May 21, 2018 at 02:16:56PM +0200, Michael Tuexen wrote:
> >> >> > On 21. May 2018, at 13:39, Neil Horman <nhorman@tuxdriver.com> wrote:
> >> >> >
> >> >> > On Sun, May 20, 2018 at 10:54:04PM -0300, Marcelo Ricardo Leitner wrote:
> >> >> >> On Sun, May 20, 2018 at 08:50:59PM -0400, Neil Horman wrote:
> >> >> >>> On Sat, May 19, 2018 at 03:44:40PM +0800, Xin Long wrote:
> >> >> >>>> This feature is actually already supported by sk->sk_reuse which can be
> >> >> >>>> set by SO_REUSEADDR. But it's not working exactly as RFC6458 demands in
> >> >> >>>> section 8.1.27, like:
> >> >> >>>>
> >> >> >>>>  - This option only supports one-to-one style SCTP sockets
> >> >> >>>>  - This socket option must not be used after calling bind()
> >> >> >>>>    or sctp_bindx().
> >> >> >>>>
> >> >> >>>> Besides, SCTP_REUSE_PORT sockopt should be provided for user's programs.
> >> >> >>>> Otherwise, the programs with SCTP_REUSE_PORT from other systems will not
> >> >> >>>> work in linux.
> >> >> >>>>
> >> >> >>>> This patch reuses sk->sk_reuse and works pretty much as SO_REUSEADDR,
> >> >> >>>> just with some extra setup limitations that are neeeded when it is being
> >> >> >>>> enabled.
> >> >> >>>>
> >> >> >>>> "It should be noted that the behavior of the socket-level socket option
> >> >> >>>> to reuse ports and/or addresses for SCTP sockets is unspecified", so it
> >> >> >>>> leaves SO_REUSEADDR as is for the compatibility.
> >> >> >>>>
> >> >> >>>> Signed-off-by: Xin Long <lucien.xin@gmail.com>
> >> >> >>>> ---
> >> >> >>>> include/uapi/linux/sctp.h |  1 +
> >> >> >>>> net/sctp/socket.c         | 48 +++++++++++++++++++++++++++++++++++++++++++++++
> >> >> >>>> 2 files changed, 49 insertions(+)
> >> >> >>>>
> >> >> >>> A few things:
> >> >> >>>
> >> >> >>> 1) I agree with Tom, this feature is a complete duplication of the SK_REUSEPORT
> >> >> >>> socket option.  I understand that this is an implementation of the option in the
> >> >> >>> RFC, but its definately a duplication of a feature, which makes several things
> >> >> >>> really messy.
> >> >> >>>
> >> >> >>> 2) The overloading of the sk_reuse opeion is a bad idea, for several reasons.
> >> >> >>> Chief among them is the behavioral interference between this patch and the
> >> >> >>> SO_REUSEADDR socket level option, that also sets this feature.  If you set
> >> >> >>> sk_reuse via SO_REUSEADDR, you will set the SCTP port reuse feature regardless
> >> >> >>> of the bind or 1:1/1:m state of the socket.  Vice versa, if you set this socket
> >> >> >>> option via the SCTP_PORT_REUSE option you will inadvertently turn on address
> >> >> >>> reuse for the socket.  We can't do that.
> >> >> >>
> >> >> >> Given your comments, going a bit further here, one other big
> >> >> >> implication is that a port would never be able to be considered to
> >> >> >> fully meet SCTP standards regarding reuse because a rogue application
> >> >> >> may always abuse of the socket level opt to gain access to the port.
> >> >> >>
> >> >> >> IOW, the patch allows the application to use such restrictions against
> >> >> >> itself and nothing else, which undermines the patch idea.
> >> >> >>
> >> >> > Agreed.
> >> >> >
> >> >> >> I lack the knowledge on why the SCTP option was proposed in the RFC. I
> >> >> >> guess they had a good reason to add the restriction on 1:1/1:m style.
> >> >> >> Does the usage of the current imply in any risk to SCTP sockets? If
> >> >> >> yes, that would give some grounds for going forward with the SCTP
> >> >> >> option.
> >> >> >>
> >> >> > I'm also not privy to why the sctp option was proposed, though I expect that the
> >> >> > lack of standardization of SO_REUSEPORT probably had something to do with it.
> >> >> > As for the reasoning behind restriction to only 1:1 sockets, if I had to guess,
> >> >> > I would say it likely because it creates ordering difficulty at the application
> >> >> > level.
> >> >> >
> >> >> > CC-ing Michael Tuxen, who I believe had some input on this RFC.  Hopefully he
> >> >> > can shed some light on this.
> >> >> Dear all,
> >> >>
> >> >> the reason this was added is to have a specified way to allow a system to
> >> >> behave like a client and server making use of the INIT collision.
> >> >>
> >> >> For 1-to-many style sockets you can do this by creating a socket, binding it,
> >> >> calling listen on it and trying to connect to the peer.
> >> >>
> >> >> For 1-to-1 style sockets you need two sockets for it. One listener and one
> >> >> you use to connect (and close it in case of failure, open a new one...).
> >> >>
> >> >> It was not clear if one can achieve this with SO_REUSEPORT and/or SO_REUSEADDR
> >> >> on all platforms. We left that unspecified.
> >> >>
> >> >> I hope this makes the intention clearer.
> >> >>
> >> > I think it makes the intention clearer yes, but it unfortunately does nothing in
> >> > my mind to clarify how the implementation should best handle the potential
> >> > overlap in functionality.  What I see here is that we have two functional paths
> >> > (the SO_REUSEPORT path and the SCTP_PORT_REUSE path), which may or may not
> >> > (depending on the OS implementation achieve the same functional goal (allowing
> >> > multiple sockets to share a port while allowing one socket to listen and the
> >> > other connect to a remote peer).  If both implementations do the same thing on a
> >> > given platform, we can either just alias one to another and be done, but if they
> >> > don't then we either have to implement both paths, and ensure that the
> >> > SO_REUSEPORT path is a no-op/error return for SCTP sockets, or that each path
> >> > implements a distinct feature set that is cleaarly documented.
> >> >
> >> > That said, I think we may be in luck.  Looking at the connect and listen paths,
> >> > it appears to me that:
> >> >
> >> > 1) Sockets ignore SO_REUSEPORT in the connect and listen paths (save for any
> >> > autobinding) so it would appear that the intent of the SCTP rfc can be honored
> >> > via SO_REUSEPORT on linux.
> >> >
> >> > 2) SO_REUSEPORT prevents changing state after a bind has occured, so we can honr
> >> > that part of the SCTP RFC.
> >> >
> >> > The only missing part is the restriction that SCTP_REUSE_PORT has which is
> >> > unaccounted for is that 1:M sockets aren't allowed to enable port reuse.
> >> > However, I think the implication from Michaels description above is that port
> >> > reuse on a 1:M socket is implicit because a single socket can connect and listen
> >> > in that use case, rather than there being a danger to doing so.
> >> >
> >> > As such, I would propose that we implement this socket option by simply setting
> >> > the sk->sk_reuseport field in the sock structure, and document the fact that
> >> > linux does not restrict port reuse from 1:M sockets.
> >> Note that, sk->sk_reuseport is not affecting linux SCTP socket at all now.
> >> linux SCTP socket doesn't really have SO_REUSEADDR (sk->sk_reuse)
> >> support, but use sk->sk_reuse as REUSE_PORT, (yes, it is confusing).
> >> Pls refer to sctp_get_port_local().
> >>
> > No, its not used now, but if you do use it to do something specific to SCTP (via
> > the SCTP_REUSE_PORT socket option), you risk aliasing SO_REUSEPORT behavior to
> > it, and if it doesn't match what the RFC behavior mandates, thats a problem.
> >
> >> So I'm not sure using sk->sk_reuseport here means we will drop sk->sk_reuse
> >> use in linux SCTP but use sk->sk_reuseport instead, or we will think that socket
> >> enables 'port reuse' when either of them is set.
> >>
> > I don't think we would drop the behavior of sk_reuse here, why would we?  As far
> > as I can see, the behavior of SO_REUSEADDR (not SO_REUSEPORT), isn't in
> > question, is it?
> >
> >> Note some users may be already using SO_REUSEADDR to enable the 'port
> >> reuse' in linux sctp socket. If we're changing to sk->sk_reuseport, we may face
> >> a compatibility problem.
> >>
> > I don't see how the behavior of SO_REUSEADDR is in question here.  All I'm
> > suggesting is that you simplify this patch so that the SCTP_REUSE_PORT socket
> > option set sk_reuseport, as that option to my eyes conforms to the sctp rfc
> > requirements.  Or am I' missing something?
> No, I am :)
> sk_reuseport seems more complicated than sk_reuse. I kind of mixed them.
> I need to check more beofore continuing. Thanks.
> 
Thank you!
Neil

> >
> > Neil
> >
> >>
> >> >
> >> > Thoughts?
> >> > Neil
> >> >
> >>
> 

^ permalink raw reply

* pull-request: mac80211 2018-05-23
From: Johannes Berg @ 2018-05-23  9:47 UTC (permalink / raw)
  To: David Miller; +Cc: netdev, linux-wireless

Hi Dave,

Just another handful of fixes as we wind down towards the
merge window.

Please pull and let me know if there's any problem.

Thanks,
johannes



The following changes since commit 6caf9fb3bda17df59de4ed6ed4950c43ca1361e3:

  Merge git://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf (2018-05-17 23:33:52 -0400)

are available in the Git repository at:

  git://git.kernel.org/pub/scm/linux/kernel/git/jberg/mac80211.git tags/mac80211-for-davem-2018-05-23

for you to fetch changes up to fed4825096cfbbfd654cb292ab6eb193911aef01:

  mac80211_hwsim: Fix radio dump for radio idx 0 (2018-05-22 10:24:17 +0200)

----------------------------------------------------------------
A handful of fixes:
 * hwsim radio dump wasn't working for the first radio
 * mesh was updating statistics incorrectly
 * a netlink message allocation was possibly too short
 * wiphy name limit was still too long
 * in certain cases regdb query could find a NULL pointer

----------------------------------------------------------------
Andrew Zaborowski (1):
      mac80211_hwsim: Fix radio dump for radio idx 0

Bob Copeland (1):
      mac80211: mesh: fix premature update of rc stats

Dedy Lansky (1):
      nl80211: fix nlmsg allocation in cfg80211_ft_event

Eric Biggers (1):
      cfg80211: further limit wiphy names to 64 bytes

Haim Dreyfuss (1):
      cfg80211: fix NULL pointer derference when querying regdb

 drivers/net/wireless/mac80211_hwsim.c | 4 ++--
 include/uapi/linux/nl80211.h          | 2 +-
 net/mac80211/mesh_plink.c             | 8 ++++----
 net/wireless/nl80211.c                | 3 ++-
 net/wireless/reg.c                    | 3 +++
 5 files changed, 12 insertions(+), 8 deletions(-)

^ permalink raw reply

* Re: [PATCH bpf-next 0/5] fix test_sockmap
From: Prashant Bhole @ 2018-05-23  9:44 UTC (permalink / raw)
  To: John Fastabend, Alexei Starovoitov, Daniel Borkmann
  Cc: David S . Miller, Shuah Khan, netdev
In-Reply-To: <f9f46ed7-43ae-4421-4546-ffb09fabd1fe@gmail.com>



On 5/22/2018 2:08 AM, John Fastabend wrote:
> On 05/20/2018 10:13 PM, Prashant Bhole wrote:
>>
>>
>> On 5/19/2018 1:42 AM, John Fastabend wrote:
>>> On 05/18/2018 12:17 AM, Prashant Bhole wrote:
>>>> This series fixes bugs in test_sockmap code. They weren't caught
>>>> previously because failure in RX/TX thread was not notified to the
>>>> main thread.
>>>>
>>>> Also fixed data verification logic and slightly improved test output
>>>> such that parameters values (cork, apply, start, end) of failed test
>>>> can be easily seen.
>>>>
>>>
>>> Great, this was on my list so thanks for taking care of it.
>>>
>>>> Note: Even after fixing above problems there are issues with tests
>>>> which set cork parameter. Tests fail (RX thread timeout) when cork
>>>> value is non-zero and overall data sent by TX thread isn't multiples
>>>> of cork value.
>>>
>>>
>>> This is expected. When 'cork' is set the sender should only xmit
>>> the data when 'cork' bytes are available. If the user doesn't
>>> provide the N bytes the data is cork'ed waiting for the bytes and
>>> if the socket is closed the state is cleaned up. What these tests
>>> are testing is the cleanup path when a user doesn't provide the
>>> N bytes. In practice this is used to validate headers and prevent
>>> users from sending partial headers. We want to keep these tests because
>>> they verify a tear-down path in the code.
>>
>> Ok.
>>
>>>
>>> After your changes do these get reported as failures? If so we
>>> need to account for the above in the calculations.
>>
>> Yes, cork related test are reported as failures because of RX thread
>> timeout.
>>
>> So with your above description, I think we need to differentiate cork
>> tests with partial data and full data. In partial data test we can have
>> something like "timeout_expected" flag. Any other way to fix it?
>>
> 
> Adding a flag seems reasonable to me. Lets do this for now. Also I
> plan to add more negative tests so we can either use the same
> flag or a new one for those cases as well.
> 

John,
I worked on this for some time and noticed that the RX-timeout of tests 
with cork parameter is dependent on various parameters. So we can not 
set a flag like the way 'drop_expected' flag is set before executing the 
test.

So I decided to write a function which judges all parameters before each 
test and decides whether a test with cork parameter will timeout or not. 
Then the conditions in the function became complicated. For example some 
tests fail if opt->rate < 17 (with some other conditions). Here is 17 is 
related to FRAGS_PER_SKB. Consider following two examples.

./test_sockmap --cgroup /mnt/cgroup2 -r 16 -i 1 -l 30 -t sendpage 
--txmsg --txmsg_cork 1024   # RX timeout occurs

./test_sockmap --cgroup /mnt/cgroup2 -r 17 -i 1 -l 30 -t sendpage 
--txmsg --txmsg_cork 1024   # Success!

Do we need to keep such tests? if yes, then I will continue with adding 
such conditions in the function.


-Prashant

^ permalink raw reply

* Re: [net-next 1/6] net/dcb: Add dcbnl buffer attribute
From: Jiri Pirko @ 2018-05-23  9:43 UTC (permalink / raw)
  To: Jakub Kicinski
  Cc: Saeed Mahameed, David S. Miller, netdev, Huy Nguyen, Or Gerlitz
In-Reply-To: <20180521222026.4f54f479@cakuba>

Tue, May 22, 2018 at 07:20:26AM CEST, jakub.kicinski@netronome.com wrote:
>On Mon, 21 May 2018 14:04:57 -0700, Saeed Mahameed wrote:
>> From: Huy Nguyen <huyn@mellanox.com>
>> 
>> In this patch, we add dcbnl buffer attribute to allow user
>> change the NIC's buffer configuration such as priority
>> to buffer mapping and buffer size of individual buffer.
>> 
>> This attribute combined with pfc attribute allows advance user to
>> fine tune the qos setting for specific priority queue. For example,
>> user can give dedicated buffer for one or more prirorities or user
>> can give large buffer to certain priorities.
>> 
>> We present an use case scenario where dcbnl buffer attribute configured
>> by advance user helps reduce the latency of messages of different sizes.
>> 
>> Scenarios description:
>> On ConnectX-5, we run latency sensitive traffic with
>> small/medium message sizes ranging from 64B to 256KB and bandwidth sensitive
>> traffic with large messages sizes 512KB and 1MB. We group small, medium,
>> and large message sizes to their own pfc enables priorities as follow.
>>   Priorities 1 & 2 (64B, 256B and 1KB)
>>   Priorities 3 & 4 (4KB, 8KB, 16KB, 64KB, 128KB and 256KB)
>>   Priorities 5 & 6 (512KB and 1MB)
>> 
>> By default, ConnectX-5 maps all pfc enabled priorities to a single
>> lossless fixed buffer size of 50% of total available buffer space. The
>> other 50% is assigned to lossy buffer. Using dcbnl buffer attribute,
>> we create three equal size lossless buffers. Each buffer has 25% of total
>> available buffer space. Thus, the lossy buffer size reduces to 25%. Priority
>> to lossless  buffer mappings are set as follow.
>>   Priorities 1 & 2 on lossless buffer #1
>>   Priorities 3 & 4 on lossless buffer #2
>>   Priorities 5 & 6 on lossless buffer #3
>> 
>> We observe improvements in latency for small and medium message sizes
>> as follows. Please note that the large message sizes bandwidth performance is
>> reduced but the total bandwidth remains the same.
>>   256B message size (42 % latency reduction)
>>   4K message size (21% latency reduction)
>>   64K message size (16% latency reduction)
>> 
>> Signed-off-by: Huy Nguyen <huyn@mellanox.com>
>> Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
>
>On a cursory look this bares a lot of resemblance to devlink shared
>buffer configuration ABI.  Did you look into using that?  
>
>Just to be clear devlink shared buffer ABIs don't require representors
>and "switchdev mode".

If the CX5 buffer they are trying to utilize here is per port and not a
shared one, it would seem ok for me to not have it in "devlink sb".

^ permalink raw reply

* [PATCH net-next] sfc: stop the TX queue before pushing new buffers
From: Martin Habets @ 2018-05-23  9:41 UTC (permalink / raw)
  To: linux-net-drivers, davem; +Cc: netdev, jarod

efx_enqueue_skb() can push new buffers for the xmit_more functionality.
We must stops the TX queue before this or else the TX queue does not get
restarted and we get a netdev watchdog.

In the error handling we may now need to unwind more than 1 packet, and
we may need to push the new buffers onto the partner queue.

Fixes: e9117e5099ea ("sfc: Firmware-Assisted TSO version 2")
Reported-by: Jarod Wilson <jarod@redhat.com>
Signed-off-by: Martin Habets <mhabets@solarflare.com>
---

Dave, could you please also queue up this patch for stable?

 drivers/net/ethernet/sfc/tx.c |   31 +++++++++++++++++++++++--------
 1 file changed, 23 insertions(+), 8 deletions(-)

diff --git a/drivers/net/ethernet/sfc/tx.c b/drivers/net/ethernet/sfc/tx.c
index cece961f2e82..17e0697f42d5 100644
--- a/drivers/net/ethernet/sfc/tx.c
+++ b/drivers/net/ethernet/sfc/tx.c
@@ -435,17 +435,18 @@ static int efx_tx_map_data(struct efx_tx_queue *tx_queue, struct sk_buff *skb,
 	} while (1);
 }
 
-/* Remove buffers put into a tx_queue.  None of the buffers must have
- * an skb attached.
+/* Remove buffers put into a tx_queue for the current packet.
+ * None of the buffers must have an skb attached.
  */
-static void efx_enqueue_unwind(struct efx_tx_queue *tx_queue)
+static void efx_enqueue_unwind(struct efx_tx_queue *tx_queue,
+			       unsigned int insert_count)
 {
 	struct efx_tx_buffer *buffer;
 	unsigned int bytes_compl = 0;
 	unsigned int pkts_compl = 0;
 
 	/* Work backwards until we hit the original insert pointer value */
-	while (tx_queue->insert_count != tx_queue->write_count) {
+	while (tx_queue->insert_count != insert_count) {
 		--tx_queue->insert_count;
 		buffer = __efx_tx_queue_get_insert_buffer(tx_queue);
 		efx_dequeue_buffer(tx_queue, buffer, &pkts_compl, &bytes_compl);
@@ -504,6 +505,8 @@ static int efx_tx_tso_fallback(struct efx_tx_queue *tx_queue,
  */
 netdev_tx_t efx_enqueue_skb(struct efx_tx_queue *tx_queue, struct sk_buff *skb)
 {
+	unsigned int old_insert_count = tx_queue->insert_count;
+	bool xmit_more = skb->xmit_more;
 	bool data_mapped = false;
 	unsigned int segments;
 	unsigned int skb_len;
@@ -553,8 +556,10 @@ netdev_tx_t efx_enqueue_skb(struct efx_tx_queue *tx_queue, struct sk_buff *skb)
 	/* Update BQL */
 	netdev_tx_sent_queue(tx_queue->core_txq, skb_len);
 
+	efx_tx_maybe_stop_queue(tx_queue);
+
 	/* Pass off to hardware */
-	if (!skb->xmit_more || netif_xmit_stopped(tx_queue->core_txq)) {
+	if (!xmit_more || netif_xmit_stopped(tx_queue->core_txq)) {
 		struct efx_tx_queue *txq2 = efx_tx_queue_partner(tx_queue);
 
 		/* There could be packets left on the partner queue if those
@@ -577,14 +582,24 @@ netdev_tx_t efx_enqueue_skb(struct efx_tx_queue *tx_queue, struct sk_buff *skb)
 		tx_queue->tx_packets++;
 	}
 
-	efx_tx_maybe_stop_queue(tx_queue);
-
 	return NETDEV_TX_OK;
 
 
 err:
-	efx_enqueue_unwind(tx_queue);
+	efx_enqueue_unwind(tx_queue, old_insert_count);
 	dev_kfree_skb_any(skb);
+
+	/* If we're not expecting another transmit and we had something to push
+	 * on this queue or a partner queue then we need to push here to get the
+	 * previous packets out.
+	 */
+	if (!xmit_more) {
+		struct efx_tx_queue *txq2 = efx_tx_queue_partner(tx_queue);
+
+		if (txq2->xmit_more_available)
+			efx_nic_push_buffers(txq2);
+	}
+
 	return NETDEV_TX_OK;
 }
 

^ permalink raw reply related

* Re: [bpf-next V4 PATCH 1/8] bpf: devmap introduce dev_map_enqueue
From: Daniel Borkmann @ 2018-05-23  9:34 UTC (permalink / raw)
  To: Jesper Dangaard Brouer, netdev, Daniel Borkmann,
	Alexei Starovoitov
  Cc: Christoph Hellwig, BjörnTöpel, Magnus Karlsson,
	makita.toshiaki
In-Reply-To: <152665047666.21055.16395048652428831814.stgit@firesoul>

On 05/18/2018 03:34 PM, Jesper Dangaard Brouer wrote:
> Functionality is the same, but the ndo_xdp_xmit call is now
> simply invoked from inside the devmap.c code.
> 
> V2: Fix compile issue reported by kbuild test robot <lkp@intel.com>
> 
> Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
> ---
>  include/linux/bpf.h        |   14 +++++++++++---
>  include/trace/events/xdp.h |    9 ++++++++-
>  kernel/bpf/devmap.c        |   37 +++++++++++++++++++++++++++++++------
>  net/core/filter.c          |   15 ++-------------
>  4 files changed, 52 insertions(+), 23 deletions(-)
> 
> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> index ed0122b45b63..fc1459bdcafc 100644
> --- a/include/linux/bpf.h
> +++ b/include/linux/bpf.h
> @@ -485,14 +485,15 @@ int bpf_check(struct bpf_prog **fp, union bpf_attr *attr);
>  void bpf_patch_call_args(struct bpf_insn *insn, u32 stack_depth);
>  
>  /* Map specifics */
> -struct net_device  *__dev_map_lookup_elem(struct bpf_map *map, u32 key);
> +struct xdp_buff;
> +struct bpf_dtab_netdev *__dev_map_lookup_elem(struct bpf_map *map, u32 key);

When you have some follow-up patches, would be great if you could clean this
up a bit. At least a newline after the struct declaration would make it a bit
more readable.

>  void __dev_map_insert_ctx(struct bpf_map *map, u32 index);
>  void __dev_map_flush(struct bpf_map *map);
> +int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp);
>  
>  struct bpf_cpu_map_entry *__cpu_map_lookup_elem(struct bpf_map *map, u32 key);
>  void __cpu_map_insert_ctx(struct bpf_map *map, u32 index);
>  void __cpu_map_flush(struct bpf_map *map);
> -struct xdp_buff;
>  int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_buff *xdp,
>  		    struct net_device *dev_rx);
>  
> @@ -571,6 +572,14 @@ static inline void __dev_map_flush(struct bpf_map *map)
>  {
>  }
>  
> +struct xdp_buff;
> +struct bpf_dtab_netdev;
> +static inline

In particular here as well.

> +int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp)
> +{
> +	return 0;
> +}
> +
>  static inline
>  struct bpf_cpu_map_entry *__cpu_map_lookup_elem(struct bpf_map *map, u32 key)
>  {
> @@ -585,7 +594,6 @@ static inline void __cpu_map_flush(struct bpf_map *map)
>  {
>  }
>  
> -struct xdp_buff;
>  static inline int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu,
>  				  struct xdp_buff *xdp,
>  				  struct net_device *dev_rx)
> diff --git a/include/trace/events/xdp.h b/include/trace/events/xdp.h
> index 8989a92c571a..96104610d40e 100644
> --- a/include/trace/events/xdp.h
> +++ b/include/trace/events/xdp.h
> @@ -138,11 +138,18 @@ DEFINE_EVENT_PRINT(xdp_redirect_template, xdp_redirect_map_err,
>  		  __entry->map_id, __entry->map_index)
>  );
>  
> +#ifndef __DEVMAP_OBJ_TYPE
> +#define __DEVMAP_OBJ_TYPE
> +struct _bpf_dtab_netdev {
> +	struct net_device *dev;
> +};
> +#endif /* __DEVMAP_OBJ_TYPE */

The __DEVMAP_OBJ_TYPE is not used anywhere, what's its purpose?

Also if you define struct _bpf_dtab_netdev this is rather fragile when mapping
to struct bpf_dtab_netdev. Best way of guarding this is to make a BUILD_BUG_ON()
where you compare both offsets in the struct and bail out compilation whenever
this changes.

>  #define devmap_ifindex(fwd, map)				\
>  	(!fwd ? 0 :						\
>  	 (!map ? 0 :						\
>  	  ((map->map_type == BPF_MAP_TYPE_DEVMAP) ?		\
> -	   ((struct net_device *)fwd)->ifindex : 0)))
> +	   ((struct _bpf_dtab_netdev *)fwd)->dev->ifindex : 0)))
>  
>  #define _trace_xdp_redirect_map(dev, xdp, fwd, map, idx)		\
>  	 trace_xdp_redirect_map(dev, xdp, devmap_ifindex(fwd, map),	\
> diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
> index 565f9ece9115..808808bf2bf2 100644
> --- a/kernel/bpf/devmap.c
> +++ b/kernel/bpf/devmap.c
> @@ -48,18 +48,21 @@
>   * calls will fail at this point.
>   */
>  #include <linux/bpf.h>
> +#include <net/xdp.h>
>  #include <linux/filter.h>
>  
>  #define DEV_CREATE_FLAG_MASK \
>  	(BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY)
>  
> +/* objects in the map */

This comment is unnecessary.

>  struct bpf_dtab_netdev {
> -	struct net_device *dev;
> +	struct net_device *dev; /* must be first member, due to tracepoint */
>  	struct bpf_dtab *dtab;
>  	unsigned int bit;
>  	struct rcu_head rcu;
>  };
>  
> +/* bpf map container */

Ditto. Why add it? If it's unclear from the code, then it would probably be
better to clean up the code a bit to make it more obvious. Comments should
explain *why* we do certain things, not *what* the code is doing. Latter is
just a sign that the code itself should be improved potentially. :)

>  struct bpf_dtab {
>  	struct bpf_map map;
>  	struct bpf_dtab_netdev **netdev_map;
> @@ -240,21 +243,43 @@ void __dev_map_flush(struct bpf_map *map)
>   * update happens in parallel here a dev_put wont happen until after reading the
>   * ifindex.
>   */
> -struct net_device  *__dev_map_lookup_elem(struct bpf_map *map, u32 key)
> +struct bpf_dtab_netdev *__dev_map_lookup_elem(struct bpf_map *map, u32 key)
>  {
>  	struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
> -	struct bpf_dtab_netdev *dev;
> +	struct bpf_dtab_netdev *obj;
>  
>  	if (key >= map->max_entries)
>  		return NULL;
>  
> -	dev = READ_ONCE(dtab->netdev_map[key]);
> -	return dev ? dev->dev : NULL;
> +	obj = READ_ONCE(dtab->netdev_map[key]);
> +	return obj;
> +}
> +
> +int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp)
> +{
> +	struct net_device *dev = dst->dev;
> +	struct xdp_frame *xdpf;
> +	int err;
> +
> +	if (!dev->netdev_ops->ndo_xdp_xmit)
> +		return -EOPNOTSUPP;
> +
> +	xdpf = convert_to_xdp_frame(xdp);
> +	if (unlikely(!xdpf))
> +		return -EOVERFLOW;
> +
> +	/* TODO: implement a bulking/enqueue step later */
> +	err = dev->netdev_ops->ndo_xdp_xmit(dev, xdpf);
> +	if (err)
> +		return err;
> +
> +	return 0;

The 'err' is just unnecessary, lets just do:

  return dev->netdev_ops->ndo_xdp_xmit(dev, xdpf);

Later after the other patches this becomes:

  return bq_enqueue(dst, xdpf, dev_rx);

>  }
>  
>  static void *dev_map_lookup_elem(struct bpf_map *map, void *key)
>  {
> -	struct net_device *dev = __dev_map_lookup_elem(map, *(u32 *)key);
> +	struct bpf_dtab_netdev *obj = __dev_map_lookup_elem(map, *(u32 *)key);
> +	struct net_device *dev = dev = obj ? obj->dev : NULL;
>  
>  	return dev ? &dev->ifindex : NULL;
>  }
> diff --git a/net/core/filter.c b/net/core/filter.c
> index 6d0d1560bd70..1447ec94ef74 100644
> --- a/net/core/filter.c
> +++ b/net/core/filter.c
> @@ -3061,20 +3061,9 @@ static int __bpf_tx_xdp_map(struct net_device *dev_rx, void *fwd,
>  
>  	switch (map->map_type) {
>  	case BPF_MAP_TYPE_DEVMAP: {
> -		struct net_device *dev = fwd;
> -		struct xdp_frame *xdpf;
> +		struct bpf_dtab_netdev *dst = fwd;
>  
> -		if (!dev->netdev_ops->ndo_xdp_xmit)
> -			return -EOPNOTSUPP;
> -
> -		xdpf = convert_to_xdp_frame(xdp);
> -		if (unlikely(!xdpf))
> -			return -EOVERFLOW;
> -
> -		/* TODO: move to inside map code instead, for bulk support
> -		 * err = dev_map_enqueue(dev, xdp);
> -		 */
> -		err = dev->netdev_ops->ndo_xdp_xmit(dev, xdpf);
> +		err = dev_map_enqueue(dst, xdp);
>  		if (err)
>  			return err;
>  		__dev_map_insert_ctx(map, index);
> 

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox