Netdev List
 help / color / mirror / Atom feed
* [PATCH] Bluetooth: 6lowpan: Make variable header_ops constant
From: Nishka Dasgupta @ 2019-08-15  5:52 UTC (permalink / raw)
  To: marcel, johan.hedberg, linux-bluetooth, linux-kernel, davem,
	netdev
  Cc: Nishka Dasgupta

Static variable header_ops, of type header_ops, is used only once, when
it is assigned to field header_ops of a variable having type net_device.
This corresponding field is declared as const in the definition of
net_device. Hence make header_ops constant as well to protect it from
unnecessary modification.
Issue found with Coccinelle.

Signed-off-by: Nishka Dasgupta <nishkadg.linux@gmail.com>
---
 net/bluetooth/6lowpan.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/bluetooth/6lowpan.c b/net/bluetooth/6lowpan.c
index 9d41de1ec90f..bb55d92691b0 100644
--- a/net/bluetooth/6lowpan.c
+++ b/net/bluetooth/6lowpan.c
@@ -583,7 +583,7 @@ static const struct net_device_ops netdev_ops = {
 	.ndo_start_xmit		= bt_xmit,
 };
 
-static struct header_ops header_ops = {
+static const struct header_ops header_ops = {
 	.create	= header_create,
 };
 
-- 
2.19.1


^ permalink raw reply related

* Re: tun: mark small packets as owned by the tap sock
From: Jason Wang @ 2019-08-15  5:11 UTC (permalink / raw)
  To: Dave Jones; +Cc: Alexis Bauvin, netdev
In-Reply-To: <20190813140025.GA17823@codemonkey.org.uk>


On 2019/8/13 下午10:00, Dave Jones wrote:
> On Tue, Aug 13, 2019 at 04:33:59PM +0800, Jason Wang wrote:
>   >
>   > On 2019/8/13 上午6:19, Dave Jones wrote:
>   > > On Wed, Aug 07, 2019 at 12:30:07AM +0000, Linux Kernel wrote:
>   > >   > Commit:     4b663366246be1d1d4b1b8b01245b2e88ad9e706
>   > >   > Parent:     16b2084a8afa1432d14ba72b7c97d7908e178178
>   > >   > Web:        https://git.kernel.org/torvalds/c/4b663366246be1d1d4b1b8b01245b2e88ad9e706
>   > >   > Author:     Alexis Bauvin <abauvin@scaleway.com>
>   > >   > AuthorDate: Tue Jul 23 16:23:01 2019 +0200
>   > >   >
>   > >   >     tun: mark small packets as owned by the tap sock
>   > >   >
>   > >   >     - v1 -> v2: Move skb_set_owner_w to __tun_build_skb to reduce patch size
>   > >
>   > > This commit breaks ipv6 routing when I deployed on it a linode.
>   > > It seems to work briefly after boot, and then silently all packets get
>   > > dropped. (Presumably, it's dropping RA or ND packets)
>   > >
>   > > With this reverted, everything works as it did in rc3.
>   > >
>   > Two questions:
>   >
>   > - Are you using XDP for TUN?
>
> not knowingly.
> $ grep XDP .config
> # CONFIG_XDP_SOCKETS is not set
>
> What's configured on the hypervisor side I have no idea.


Ok, please tell me more about your setups:

- Are you using TUN in host or guest?

- Are you using it for VM or VPN(tunneling)?

- Where did the packet get dropped?


>
>   > - Does it work before 66ccbc9c87c2?
>
> that's been around since 4.14-rc1, and at one point it ran whatever was
> in debian9 (4.9).  I don't recall it ever not working, so I'd say yes.
>
> I can build a 4.13 if it'll prove something, but it'll take me a while.
> (This is my primary MX, so it's dropping email while it's on the broken
>   kernel, so I need to plan some time to be around to babysit it)


If possible please try that.


>
>   > If yes, could you show us the result of net_dropmonitor?
>
> where do I get that?  It doesn't seem packaged for debian.
>
> 	Dave


It's part of perf-script(1). You can simply start it through perf script 
record net_dropmonitor.

Thanks

>

^ permalink raw reply

* [PATCH bpf-next 1/5] xsk: Convert bool 'zc' field in struct xdp_umem to a u32 bitmap
From: Sridhar Samudrala @ 2019-08-15  3:46 UTC (permalink / raw)
  To: magnus.karlsson, bjorn.topel, netdev, bpf, sridhar.samudrala,
	intel-wired-lan, maciej.fijalkowski, tom.herbert
In-Reply-To: <1565840783-8269-1-git-send-email-sridhar.samudrala@intel.com>

The bool 'zc' field in struct xdp_uem is replaced with a u32 flags
field and a bit within flags is used to indicate zerocopy. This flags
field will be used in later patches for other bit fields.

Also, removed the bool 'zc' field from struct xdp_sock as it can be
accessed via flags in xs->umem.

Signed-off-by: Sridhar Samudrala <sridhar.samudrala@intel.com>
---
 include/net/xdp_sock.h | 12 ++++++++++--
 net/xdp/xdp_umem.c     |  6 +++---
 net/xdp/xsk.c          | 12 +++++++++---
 net/xdp/xsk_diag.c     |  3 ++-
 4 files changed, 24 insertions(+), 9 deletions(-)

diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h
index 69796d264f06..b6716dbdce1a 100644
--- a/include/net/xdp_sock.h
+++ b/include/net/xdp_sock.h
@@ -27,6 +27,9 @@ struct xdp_umem_fq_reuse {
 	u64 handles[];
 };
 
+/* Bits for the umem flags field. */
+#define XDP_UMEM_F_ZEROCOPY	(1 << 0)
+
 struct xdp_umem {
 	struct xsk_queue *fq;
 	struct xsk_queue *cq;
@@ -45,7 +48,7 @@ struct xdp_umem {
 	struct net_device *dev;
 	struct xdp_umem_fq_reuse *fq_reuse;
 	u16 queue_id;
-	bool zc;
+	u32 flags;
 	spinlock_t xsk_list_lock;
 	struct list_head xsk_list;
 };
@@ -58,7 +61,6 @@ struct xdp_sock {
 	struct xdp_umem *umem;
 	struct list_head flush_node;
 	u16 queue_id;
-	bool zc;
 	enum {
 		XSK_READY = 0,
 		XSK_BOUND,
@@ -95,6 +97,7 @@ struct xdp_umem_fq_reuse *xsk_reuseq_swap(struct xdp_umem *umem,
 					  struct xdp_umem_fq_reuse *newq);
 void xsk_reuseq_free(struct xdp_umem_fq_reuse *rq);
 struct xdp_umem *xdp_get_umem_from_qid(struct net_device *dev, u16 queue_id);
+bool xsk_umem_zerocopy(struct xdp_umem *umem);
 
 static inline char *xdp_umem_get_data(struct xdp_umem *umem, u64 addr)
 {
@@ -213,6 +216,11 @@ static inline struct xdp_umem *xdp_get_umem_from_qid(struct net_device *dev,
 	return NULL;
 }
 
+static inline bool xsk_umem_zerocopy(struct xdp_umem *umem)
+{
+	return false;
+}
+
 static inline char *xdp_umem_get_data(struct xdp_umem *umem, u64 addr)
 {
 	return NULL;
diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c
index a0607969f8c0..411b3e3498c4 100644
--- a/net/xdp/xdp_umem.c
+++ b/net/xdp/xdp_umem.c
@@ -126,7 +126,7 @@ int xdp_umem_assign_dev(struct xdp_umem *umem, struct net_device *dev,
 	if (err)
 		goto err_unreg_umem;
 
-	umem->zc = true;
+	umem->flags |= XDP_UMEM_F_ZEROCOPY;
 	return 0;
 
 err_unreg_umem:
@@ -147,7 +147,7 @@ void xdp_umem_clear_dev(struct xdp_umem *umem)
 	if (!umem->dev)
 		return;
 
-	if (umem->zc) {
+	if (xsk_umem_zerocopy(umem)) {
 		bpf.command = XDP_SETUP_XSK_UMEM;
 		bpf.xsk.umem = NULL;
 		bpf.xsk.queue_id = umem->queue_id;
@@ -162,7 +162,7 @@ void xdp_umem_clear_dev(struct xdp_umem *umem)
 
 	dev_put(umem->dev);
 	umem->dev = NULL;
-	umem->zc = false;
+	umem->flags &= ~XDP_UMEM_F_ZEROCOPY;
 }
 
 static void xdp_umem_unmap_pages(struct xdp_umem *umem)
diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
index 59b57d708697..ca95676ef75d 100644
--- a/net/xdp/xsk.c
+++ b/net/xdp/xsk.c
@@ -295,6 +295,12 @@ static int xsk_generic_xmit(struct sock *sk, struct msghdr *m,
 	return err;
 }
 
+bool xsk_umem_zerocopy(struct xdp_umem *umem)
+{
+	return (umem && (umem->flags & XDP_UMEM_F_ZEROCOPY));
+}
+EXPORT_SYMBOL(xsk_umem_zerocopy);
+
 static int xsk_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len)
 {
 	bool need_wait = !(m->msg_flags & MSG_DONTWAIT);
@@ -310,7 +316,8 @@ static int xsk_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len)
 	if (need_wait)
 		return -EOPNOTSUPP;
 
-	return (xs->zc) ? xsk_zc_xmit(sk) : xsk_generic_xmit(sk, m, total_len);
+	return xsk_umem_zerocopy(xs->umem) ? xsk_zc_xmit(sk) :
+					     xsk_generic_xmit(sk, m, total_len);
 }
 
 static unsigned int xsk_poll(struct file *file, struct socket *sock,
@@ -503,7 +510,6 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
 	}
 
 	xs->dev = dev;
-	xs->zc = xs->umem->zc;
 	xs->queue_id = qid;
 	xskq_set_umem(xs->rx, xs->umem->size, xs->umem->chunk_mask);
 	xskq_set_umem(xs->tx, xs->umem->size, xs->umem->chunk_mask);
@@ -683,7 +689,7 @@ static int xsk_getsockopt(struct socket *sock, int level, int optname,
 			return -EINVAL;
 
 		mutex_lock(&xs->mutex);
-		if (xs->zc)
+		if (xsk_umem_zerocopy(xs->umem))
 			opts.flags |= XDP_OPTIONS_ZEROCOPY;
 		mutex_unlock(&xs->mutex);
 
diff --git a/net/xdp/xsk_diag.c b/net/xdp/xsk_diag.c
index d5e06c8e0cbf..8a19b7e87cfb 100644
--- a/net/xdp/xsk_diag.c
+++ b/net/xdp/xsk_diag.c
@@ -10,6 +10,7 @@
 #include <net/xdp_sock.h>
 #include <linux/xdp_diag.h>
 #include <linux/sock_diag.h>
+#include <net/xdp_sock.h>
 
 #include "xsk_queue.h"
 #include "xsk.h"
@@ -61,7 +62,7 @@ static int xsk_diag_put_umem(const struct xdp_sock *xs, struct sk_buff *nlskb)
 	du.ifindex = umem->dev ? umem->dev->ifindex : 0;
 	du.queue_id = umem->queue_id;
 	du.flags = 0;
-	if (umem->zc)
+	if (xsk_umem_zerocopy(umem))
 		du.flags |= XDP_DU_F_ZEROCOPY;
 	du.refs = refcount_read(&umem->users);
 
-- 
2.20.1


^ permalink raw reply related

* [PATCH bpf-next 4/5] ixgbe: Enable XDP_SKIP_BPF option for AF_XDP sockets
From: Sridhar Samudrala @ 2019-08-15  3:46 UTC (permalink / raw)
  To: magnus.karlsson, bjorn.topel, netdev, bpf, sridhar.samudrala,
	intel-wired-lan, maciej.fijalkowski, tom.herbert
In-Reply-To: <1565840783-8269-1-git-send-email-sridhar.samudrala@intel.com>

This patch skips calling BPF program in the receive path if
the queue is associated with UMEM that is not shared and
bound to an AF_XDP socket that has enabled skip bpf during
bind() call.

Signed-off-by: Sridhar Samudrala <sridhar.samudrala@intel.com>
---
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 20 +++++++++++++++++--
 drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c  | 16 +++++++++++++--
 2 files changed, 32 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
index dc7b128c780e..594792860cdd 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
@@ -2197,6 +2197,7 @@ static struct sk_buff *ixgbe_run_xdp(struct ixgbe_adapter *adapter,
 	int err, result = IXGBE_XDP_PASS;
 	struct bpf_prog *xdp_prog;
 	struct xdp_frame *xdpf;
+	struct xdp_umem *umem;
 	u32 act;
 
 	rcu_read_lock();
@@ -2207,6 +2208,13 @@ static struct sk_buff *ixgbe_run_xdp(struct ixgbe_adapter *adapter,
 
 	prefetchw(xdp->data_hard_start); /* xdp_frame write */
 
+	umem = xdp_get_umem_from_qid(rx_ring->netdev, rx_ring->queue_index);
+	if (xsk_umem_skip_bpf(umem)) {
+		err = xsk_umem_rcv(umem, xdp);
+		result = !err ? IXGBE_XDP_REDIR : IXGBE_XDP_CONSUMED;
+		goto xdp_out;
+	}
+
 	act = bpf_prog_run_xdp(xdp_prog, xdp);
 	switch (act) {
 	case XDP_PASS:
@@ -2400,8 +2408,16 @@ static int ixgbe_clean_rx_irq(struct ixgbe_q_vector *q_vector,
 		total_rx_packets++;
 	}
 
-	if (xdp_xmit & IXGBE_XDP_REDIR)
-		xdp_do_flush_map();
+	if (xdp_xmit & IXGBE_XDP_REDIR) {
+		struct xdp_umem *umem;
+
+		umem = xdp_get_umem_from_qid(rx_ring->netdev,
+					     rx_ring->queue_index);
+		if (xsk_umem_skip_bpf(umem))
+			xsk_umem_flush(umem);
+		else
+			xdp_do_flush_map();
+	}
 
 	if (xdp_xmit & IXGBE_XDP_TX) {
 		struct ixgbe_ring *ring = adapter->xdp_ring[smp_processor_id()];
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c
index 6b609553329f..9ea8a769d7a8 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c
@@ -148,6 +148,12 @@ static int ixgbe_run_xdp_zc(struct ixgbe_adapter *adapter,
 	struct xdp_frame *xdpf;
 	u32 act;
 
+	if (xsk_umem_skip_bpf(rx_ring->xsk_umem)) {
+		err = xsk_umem_rcv(rx_ring->xsk_umem, xdp);
+		result = !err ? IXGBE_XDP_REDIR : IXGBE_XDP_CONSUMED;
+		return result;
+	}
+
 	rcu_read_lock();
 	xdp_prog = READ_ONCE(rx_ring->xdp_prog);
 	act = bpf_prog_run_xdp(xdp_prog, xdp);
@@ -527,8 +533,14 @@ int ixgbe_clean_rx_irq_zc(struct ixgbe_q_vector *q_vector,
 		ixgbe_rx_skb(q_vector, skb);
 	}
 
-	if (xdp_xmit & IXGBE_XDP_REDIR)
-		xdp_do_flush_map();
+	if (xdp_xmit & IXGBE_XDP_REDIR) {
+		struct xdp_umem *umem = rx_ring->xsk_umem;
+
+		if (xsk_umem_skip_bpf(umem))
+			xsk_umem_flush(umem);
+		else
+			xdp_do_flush_map();
+	}
 
 	if (xdp_xmit & IXGBE_XDP_TX) {
 		struct ixgbe_ring *ring = adapter->xdp_ring[smp_processor_id()];
-- 
2.20.1


^ permalink raw reply related

* [PATCH bpf-next 5/5] xdpsock_user: Add skip_bpf option
From: Sridhar Samudrala @ 2019-08-15  3:46 UTC (permalink / raw)
  To: magnus.karlsson, bjorn.topel, netdev, bpf, sridhar.samudrala,
	intel-wired-lan, maciej.fijalkowski, tom.herbert
In-Reply-To: <1565840783-8269-1-git-send-email-sridhar.samudrala@intel.com>

Signed-off-by: Sridhar Samudrala <sridhar.samudrala@intel.com>
---
 samples/bpf/xdpsock_user.c | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/samples/bpf/xdpsock_user.c b/samples/bpf/xdpsock_user.c
index 93eaaf7239b2..509fc6a18af9 100644
--- a/samples/bpf/xdpsock_user.c
+++ b/samples/bpf/xdpsock_user.c
@@ -123,6 +123,9 @@ static void print_benchmark(bool running)
 	if (opt_poll)
 		printf("poll() ");
 
+	if (opt_xdp_bind_flags & XDP_SKIP_BPF)
+		printf("skip-bpf ");
+
 	if (running) {
 		printf("running...");
 		fflush(stdout);
@@ -352,6 +355,7 @@ static struct option long_options[] = {
 	{"zero-copy", no_argument, 0, 'z'},
 	{"copy", no_argument, 0, 'c'},
 	{"frame-size", required_argument, 0, 'f'},
+	{"skip-bpf", no_argument, 0, 's'},
 	{0, 0, 0, 0}
 };
 
@@ -372,6 +376,7 @@ static void usage(const char *prog)
 		"  -z, --zero-copy      Force zero-copy mode.\n"
 		"  -c, --copy           Force copy mode.\n"
 		"  -f, --frame-size=n   Set the frame size (must be a power of two, default is %d).\n"
+		"  -s, --skip-bpf       Skip running bpf program.\n"
 		"\n";
 	fprintf(stderr, str, prog, XSK_UMEM__DEFAULT_FRAME_SIZE);
 	exit(EXIT_FAILURE);
@@ -430,6 +435,9 @@ static void parse_command_line(int argc, char **argv)
 		case 'f':
 			opt_xsk_frame_size = atoi(optarg);
 			break;
+		case 's':
+			opt_xdp_bind_flags |= XDP_SKIP_BPF;
+			break;
 		default:
 			usage(basename(argv[0]));
 		}
-- 
2.20.1


^ permalink raw reply related

* [PATCH bpf-next 2/5] xsk: Introduce XDP_SKIP_BPF bind option
From: Sridhar Samudrala @ 2019-08-15  3:46 UTC (permalink / raw)
  To: magnus.karlsson, bjorn.topel, netdev, bpf, sridhar.samudrala,
	intel-wired-lan, maciej.fijalkowski, tom.herbert
In-Reply-To: <1565840783-8269-1-git-send-email-sridhar.samudrala@intel.com>

This option enables an AF_XDP socket to specify XDP_SKIP_BPF flag
with the bind() call to skip calling the BPF program in the receive
path and pass the XDP buffer directly to the socket.

When a single AF_XDP socket is associated with a queue and a HW
filter is used to redirect the packets and the app is interested in
receiving all the packets on that queue, we don't need an additional 
BPF program to do further filtering or lookup/redirect to a socket.

Signed-off-by: Sridhar Samudrala <sridhar.samudrala@intel.com>
---
 include/net/xdp_sock.h        |  9 +++++++++
 include/uapi/linux/if_xdp.h   |  1 +
 include/uapi/linux/xdp_diag.h |  1 +
 net/xdp/xdp_umem.c            |  5 ++++-
 net/xdp/xsk.c                 | 31 +++++++++++++++++++++++++++++--
 net/xdp/xsk_diag.c            |  2 ++
 6 files changed, 46 insertions(+), 3 deletions(-)

diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h
index b6716dbdce1a..ad132a69db7c 100644
--- a/include/net/xdp_sock.h
+++ b/include/net/xdp_sock.h
@@ -29,6 +29,7 @@ struct xdp_umem_fq_reuse {
 
 /* Bits for the umem flags field. */
 #define XDP_UMEM_F_ZEROCOPY	(1 << 0)
+#define XDP_UMEM_F_SKIP_BPF	(1 << 1)
 
 struct xdp_umem {
 	struct xsk_queue *fq;
@@ -98,6 +99,9 @@ struct xdp_umem_fq_reuse *xsk_reuseq_swap(struct xdp_umem *umem,
 void xsk_reuseq_free(struct xdp_umem_fq_reuse *rq);
 struct xdp_umem *xdp_get_umem_from_qid(struct net_device *dev, u16 queue_id);
 bool xsk_umem_zerocopy(struct xdp_umem *umem);
+bool xsk_umem_skip_bpf(struct xdp_umem *umem);
+void xsk_umem_flush(struct xdp_umem *umem);
+int xsk_umem_rcv(struct xdp_umem *umem, struct xdp_buff *xdp);
 
 static inline char *xdp_umem_get_data(struct xdp_umem *umem, u64 addr)
 {
@@ -221,6 +225,11 @@ static inline bool xsk_umem_zerocopy(struct xdp_umem *umem)
 	return false;
 }
 
+static inline bool xsk_umem_skip_bpf(struct xdp_umem *umem)
+{
+	return false;
+}
+
 static inline char *xdp_umem_get_data(struct xdp_umem *umem, u64 addr)
 {
 	return NULL;
diff --git a/include/uapi/linux/if_xdp.h b/include/uapi/linux/if_xdp.h
index faaa5ca2a117..881447ebf3c9 100644
--- a/include/uapi/linux/if_xdp.h
+++ b/include/uapi/linux/if_xdp.h
@@ -16,6 +16,7 @@
 #define XDP_SHARED_UMEM	(1 << 0)
 #define XDP_COPY	(1 << 1) /* Force copy-mode */
 #define XDP_ZEROCOPY	(1 << 2) /* Force zero-copy mode */
+#define XDP_SKIP_BPF	(1 << 3) /* Skip running BPF program */
 
 struct sockaddr_xdp {
 	__u16 sxdp_family;
diff --git a/include/uapi/linux/xdp_diag.h b/include/uapi/linux/xdp_diag.h
index 78b2591a7782..6caf3d9c9abe 100644
--- a/include/uapi/linux/xdp_diag.h
+++ b/include/uapi/linux/xdp_diag.h
@@ -56,6 +56,7 @@ struct xdp_diag_ring {
 };
 
 #define XDP_DU_F_ZEROCOPY (1 << 0)
+#define XDP_DU_F_SKIP_BPF (2 << 0)
 
 struct xdp_diag_umem {
 	__u64	size;
diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c
index 411b3e3498c4..cbc02509dc90 100644
--- a/net/xdp/xdp_umem.c
+++ b/net/xdp/xdp_umem.c
@@ -106,6 +106,9 @@ int xdp_umem_assign_dev(struct xdp_umem *umem, struct net_device *dev,
 	umem->dev = dev;
 	umem->queue_id = queue_id;
 
+	if (flags & XDP_SKIP_BPF)
+		umem->flags |= XDP_UMEM_F_SKIP_BPF;
+
 	dev_hold(dev);
 
 	if (force_copy)
@@ -162,7 +165,7 @@ void xdp_umem_clear_dev(struct xdp_umem *umem)
 
 	dev_put(umem->dev);
 	umem->dev = NULL;
-	umem->flags &= ~XDP_UMEM_F_ZEROCOPY;
+	umem->flags &= ~(XDP_UMEM_F_ZEROCOPY | XDP_UMEM_F_SKIP_BPF);
 }
 
 static void xdp_umem_unmap_pages(struct xdp_umem *umem)
diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
index ca95676ef75d..bcb6a77fae22 100644
--- a/net/xdp/xsk.c
+++ b/net/xdp/xsk.c
@@ -166,6 +166,27 @@ int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
 	return err;
 }
 
+void xsk_umem_flush(struct xdp_umem *umem)
+{
+	struct xdp_sock *xs;
+
+	if (!list_empty(&umem->xsk_list)) {
+		xs = list_first_entry(&umem->xsk_list, struct xdp_sock, list);
+		xsk_flush(xs);
+	}
+}
+EXPORT_SYMBOL(xsk_umem_flush);
+
+int xsk_umem_rcv(struct xdp_umem *umem, struct xdp_buff *xdp)
+{
+	struct xdp_sock *xs;
+
+	xs = list_first_entry(&umem->xsk_list, struct xdp_sock, list);
+	xdp->handle += xdp->data - xdp->data_hard_start;
+	return xsk_rcv(xs, xdp);
+}
+EXPORT_SYMBOL(xsk_umem_rcv);
+
 void xsk_umem_complete_tx(struct xdp_umem *umem, u32 nb_entries)
 {
 	xskq_produce_flush_addr_n(umem->cq, nb_entries);
@@ -301,6 +322,12 @@ bool xsk_umem_zerocopy(struct xdp_umem *umem)
 }
 EXPORT_SYMBOL(xsk_umem_zerocopy);
 
+bool xsk_umem_skip_bpf(struct xdp_umem *umem)
+{
+	return (umem && (umem->flags & XDP_UMEM_F_SKIP_BPF));
+}
+EXPORT_SYMBOL(xsk_umem_skip_bpf);
+
 static int xsk_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len)
 {
 	bool need_wait = !(m->msg_flags & MSG_DONTWAIT);
@@ -434,7 +461,7 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
 		return -EINVAL;
 
 	flags = sxdp->sxdp_flags;
-	if (flags & ~(XDP_SHARED_UMEM | XDP_COPY | XDP_ZEROCOPY))
+	if (flags & ~(XDP_SHARED_UMEM | XDP_COPY | XDP_ZEROCOPY | XDP_SKIP_BPF))
 		return -EINVAL;
 
 	rtnl_lock();
@@ -461,7 +488,7 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
 		struct xdp_sock *umem_xs;
 		struct socket *sock;
 
-		if ((flags & XDP_COPY) || (flags & XDP_ZEROCOPY)) {
+		if (flags & (XDP_COPY | XDP_ZEROCOPY | XDP_SKIP_BPF)) {
 			/* Cannot specify flags for shared sockets. */
 			err = -EINVAL;
 			goto out_unlock;
diff --git a/net/xdp/xsk_diag.c b/net/xdp/xsk_diag.c
index 8a19b7e87cfb..f6f4b7912a22 100644
--- a/net/xdp/xsk_diag.c
+++ b/net/xdp/xsk_diag.c
@@ -64,6 +64,8 @@ static int xsk_diag_put_umem(const struct xdp_sock *xs, struct sk_buff *nlskb)
 	du.flags = 0;
 	if (xsk_umem_zerocopy(umem))
 		du.flags |= XDP_DU_F_ZEROCOPY;
+	if (xsk_umem_skip_bpf(umem))
+		du.flags |= XDP_DU_F_SKIP_BPF;
 	du.refs = refcount_read(&umem->users);
 
 	err = nla_put(nlskb, XDP_DIAG_UMEM, sizeof(du), &du);
-- 
2.20.1


^ permalink raw reply related

* [PATCH bpf-next 3/5] i40e: Enable XDP_SKIP_BPF option for AF_XDP sockets
From: Sridhar Samudrala @ 2019-08-15  3:46 UTC (permalink / raw)
  To: magnus.karlsson, bjorn.topel, netdev, bpf, sridhar.samudrala,
	intel-wired-lan, maciej.fijalkowski, tom.herbert
In-Reply-To: <1565840783-8269-1-git-send-email-sridhar.samudrala@intel.com>

This patch skips calling BPF program in the receive path if
the queue is associated with UMEM that is not shared and
bound to an AF_XDP socket that has enabled skip bpf during
bind() call.

Here are some performance numbers collected on 
  - 2 socket 28 core Intel(R) Xeon(R) Platinum 8180 CPU @ 2.50GHz
  - Intel 40Gb Ethernet NIC (i40e)

All tests use 2 cores and the results are in Mpps.

turbo on (default)
---------------------------------------------	
                      no-skip-bpf    skip-bpf
---------------------------------------------	
rxdrop zerocopy           21.9         38.5 
l2fwd  zerocopy           17.0         20.5
rxdrop copy               11.1         13.3
l2fwd  copy                1.9          2.0

no turbo :  echo 1 > /sys/devices/system/cpu/intel_pstate/no_turbo
---------------------------------------------	
                      no-skip-bpf    skip-bpf
---------------------------------------------	
rxdrop zerocopy           15.4         29.0
l2fwd  zerocopy           11.8         18.2
rxdrop copy                8.2         10.5
l2fwd  copy                1.7          1.7
---------------------------------------------	

Signed-off-by: Sridhar Samudrala <sridhar.samudrala@intel.com>
---
 drivers/net/ethernet/intel/i40e/i40e_txrx.c | 22 +++++++++++++++++++--
 drivers/net/ethernet/intel/i40e/i40e_xsk.c  |  6 ++++++
 2 files changed, 26 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.c b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
index e3f29dc8b290..5e63e3644e87 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_txrx.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
@@ -2199,6 +2199,7 @@ static struct sk_buff *i40e_run_xdp(struct i40e_ring *rx_ring,
 	int err, result = I40E_XDP_PASS;
 	struct i40e_ring *xdp_ring;
 	struct bpf_prog *xdp_prog;
+	struct xdp_umem *umem;
 	u32 act;
 
 	rcu_read_lock();
@@ -2209,6 +2210,13 @@ static struct sk_buff *i40e_run_xdp(struct i40e_ring *rx_ring,
 
 	prefetchw(xdp->data_hard_start); /* xdp_frame write */
 
+	umem = xdp_get_umem_from_qid(rx_ring->netdev, rx_ring->queue_index);
+	if (xsk_umem_skip_bpf(umem)) {
+		err = xsk_umem_rcv(umem, xdp);
+		result = !err ? I40E_XDP_REDIR : I40E_XDP_CONSUMED;
+		goto xdp_out;
+	}
+
 	act = bpf_prog_run_xdp(xdp_prog, xdp);
 	switch (act) {
 	case XDP_PASS:
@@ -2303,8 +2311,18 @@ void i40e_update_rx_stats(struct i40e_ring *rx_ring,
  **/
 void i40e_finalize_xdp_rx(struct i40e_ring *rx_ring, unsigned int xdp_res)
 {
-	if (xdp_res & I40E_XDP_REDIR)
-		xdp_do_flush_map();
+	if (xdp_res & I40E_XDP_REDIR) {
+		struct xdp_umem *umem;
+
+		umem = rx_ring->xsk_umem;
+		if (!umem)
+			umem = xdp_get_umem_from_qid(rx_ring->netdev,
+						     rx_ring->queue_index);
+		if (xsk_umem_skip_bpf(umem))
+			xsk_umem_flush(umem);
+		else
+			xdp_do_flush_map();
+	}
 
 	if (xdp_res & I40E_XDP_TX) {
 		struct i40e_ring *xdp_ring =
diff --git a/drivers/net/ethernet/intel/i40e/i40e_xsk.c b/drivers/net/ethernet/intel/i40e/i40e_xsk.c
index 32bad014d76c..cc538479c95d 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_xsk.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_xsk.c
@@ -195,6 +195,12 @@ static int i40e_run_xdp_zc(struct i40e_ring *rx_ring, struct xdp_buff *xdp)
 	struct bpf_prog *xdp_prog;
 	u32 act;
 
+	if (xsk_umem_skip_bpf(rx_ring->xsk_umem)) {
+		err = xsk_umem_rcv(rx_ring->xsk_umem, xdp);
+		result = !err ? I40E_XDP_REDIR : I40E_XDP_CONSUMED;
+		return result;
+	}
+
 	rcu_read_lock();
 	/* NB! xdp_prog will always be !NULL, due to the fact that
 	 * this path is enabled by setting an XDP program.
-- 
2.20.1


^ permalink raw reply related

* [PATCH bpf-next 0/5] Add support for SKIP_BPF flag for AF_XDP sockets
From: Sridhar Samudrala @ 2019-08-15  3:46 UTC (permalink / raw)
  To: magnus.karlsson, bjorn.topel, netdev, bpf, sridhar.samudrala,
	intel-wired-lan, maciej.fijalkowski, tom.herbert

This patch series introduces XDP_SKIP_BPF flag that can be specified
during the bind() call of an AF_XDP socket to skip calling the BPF 
program in the receive path and pass the buffer directly to the socket.

When a single AF_XDP socket is associated with a queue and a HW
filter is used to redirect the packets and the app is interested in
receiving all the packets on that queue, we don't need an additional 
BPF program to do further filtering or lookup/redirect to a socket.

Here are some performance numbers collected on 
  - 2 socket 28 core Intel(R) Xeon(R) Platinum 8180 CPU @ 2.50GHz
  - Intel 40Gb Ethernet NIC (i40e)

All tests use 2 cores and the results are in Mpps.

turbo on (default)
---------------------------------------------	
                      no-skip-bpf    skip-bpf
---------------------------------------------	
rxdrop zerocopy           21.9         38.5 
l2fwd  zerocopy           17.0         20.5
rxdrop copy               11.1         13.3
l2fwd  copy                1.9          2.0

no turbo :  echo 1 > /sys/devices/system/cpu/intel_pstate/no_turbo
---------------------------------------------	
                      no-skip-bpf    skip-bpf
---------------------------------------------	
rxdrop zerocopy           15.4         29.0
l2fwd  zerocopy           11.8         18.2
rxdrop copy                8.2         10.5
l2fwd  copy                1.7          1.7
---------------------------------------------	

Sridhar Samudrala (5):
  xsk: Convert bool 'zc' field in struct xdp_umem to a u32 bitmap
  xsk: Introduce XDP_SKIP_BPF bind option
  i40e: Enable XDP_SKIP_BPF option for AF_XDP sockets
  ixgbe: Enable XDP_SKIP_BPF option for AF_XDP sockets
  xdpsock_user: Add skip_bpf option

 drivers/net/ethernet/intel/i40e/i40e_txrx.c   | 22 +++++++++-
 drivers/net/ethernet/intel/i40e/i40e_xsk.c    |  6 +++
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 20 ++++++++-
 drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c  | 16 ++++++-
 include/net/xdp_sock.h                        | 21 ++++++++-
 include/uapi/linux/if_xdp.h                   |  1 +
 include/uapi/linux/xdp_diag.h                 |  1 +
 net/xdp/xdp_umem.c                            |  9 ++--
 net/xdp/xsk.c                                 | 43 ++++++++++++++++---
 net/xdp/xsk_diag.c                            |  5 ++-
 samples/bpf/xdpsock_user.c                    |  8 ++++
 11 files changed, 135 insertions(+), 17 deletions(-)

-- 
2.20.1


^ permalink raw reply

* Re: [PATCH V5 0/9] Fixes for vhost metadata acceleration
From: Jason Wang @ 2019-08-15  3:28 UTC (permalink / raw)
  To: Christoph Hellwig, Jason Gunthorpe
  Cc: Michael S. Tsirkin, kvm, virtualization, netdev, linux-kernel,
	linux-mm
In-Reply-To: <20190813164105.GD22640@infradead.org>


On 2019/8/14 上午12:41, Christoph Hellwig wrote:
> On Tue, Aug 13, 2019 at 08:57:07AM -0300, Jason Gunthorpe wrote:
>> On Tue, Aug 13, 2019 at 04:31:07PM +0800, Jason Wang wrote:
>>
>>> What kind of issues do you see? Spinlock is to synchronize GUP with MMU
>>> notifier in this series.
>> A GUP that can't sleep can't pagefault which makes it a really weird
>> pattern
> get_user_pages/get_user_pages_fast must not be called under a spinlock.
> We have the somewhat misnamed __get_user_page_fast that just does a
> lookup for existing pages and never faults for a few places that need
> to do that lookup from contexts where we can't sleep.


Yes, I do use __get_user_pages_fast() in the code.

Thanks


^ permalink raw reply

* Re: [PATCH V5 0/9] Fixes for vhost metadata acceleration
From: Jason Wang @ 2019-08-15  3:26 UTC (permalink / raw)
  To: Jason Gunthorpe
  Cc: Michael S. Tsirkin, kvm, virtualization, netdev, linux-kernel,
	linux-mm
In-Reply-To: <20190813115707.GC29508@ziepe.ca>


On 2019/8/13 下午7:57, Jason Gunthorpe wrote:
> On Tue, Aug 13, 2019 at 04:31:07PM +0800, Jason Wang wrote:
>
>> What kind of issues do you see? Spinlock is to synchronize GUP with MMU
>> notifier in this series.
> A GUP that can't sleep can't pagefault which makes it a really weird
> pattern


My understanding is __get_user_pages_fast() assumes caller can fail or 
have fallback. And we have graceful fallback to copy_{to|from}_user().


>
>> Btw, back to the original question. May I know why synchronize_rcu() is not
>> suitable? Consider:
> We already went over this. You'd need to determine it doesn't somehow
> deadlock the mm on reclaim paths. Maybe it is OK, the rcq_gq_wq is
> marked WQ_MEM_RECLAIM at least..


Yes, will take a look at this.


>
> I also think Michael was concerned about the latency spikes a long RCU
> delay would cause.


I don't think it's a real problem consider MMU notifier could be 
preempted or blocked.

Thanks


>
> Jason

^ permalink raw reply

* [net] tipc: fix false detection of retransmit failures
From: Tuong Lien @ 2019-08-15  3:24 UTC (permalink / raw)
  To: davem, jon.maloy, maloy, ying.xue, netdev; +Cc: tipc-discussion

This commit eliminates the use of the link 'stale_limit' & 'prev_from'
(besides the already removed - 'stale_cnt') variables in the detection
of repeated retransmit failures as there is no proper way to initialize
them to avoid a false detection, i.e. it is not really a retransmission
failure but due to a garbage values in the variables.

Instead, a jiffies variable will be added to individual skbs (like the
way we restrict the skb retransmissions) in order to mark the first skb
retransmit time. Later on, at the next retransmissions, the timestamp
will be checked to see if the skb in the link transmq is "too stale",
that is, the link tolerance time has passed, so that a link reset will
be ordered. Note, just checking on the first skb in the queue is fine
enough since it must be the oldest one.
A counter is also added to keep track the actual skb retransmissions'
number for later checking when the failure happens.

The downside of this approach is that the skb->cb[] buffer is about to
be exhausted, however it is always able to allocate another memory area
and keep a reference to it when needed.

Fixes: 77cf8edbc0e7 ("tipc: simplify stale link failure criteria")
Reported-by: Hoang Le <hoang.h.le@dektech.com.au>
Acked-by: Ying Xue <ying.xue@windriver.com>
Acked-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: Tuong Lien <tuong.t.lien@dektech.com.au>
---
 net/tipc/link.c | 92 ++++++++++++++++++++++++++++++++-------------------------
 net/tipc/msg.h  |  8 +++--
 2 files changed, 57 insertions(+), 43 deletions(-)

diff --git a/net/tipc/link.c b/net/tipc/link.c
index dd3155b14654..01d76bf16e9d 100644
--- a/net/tipc/link.c
+++ b/net/tipc/link.c
@@ -106,8 +106,6 @@ struct tipc_stats {
  * @transmitq: queue for sent, non-acked messages
  * @backlogq: queue for messages waiting to be sent
  * @snt_nxt: next sequence number to use for outbound messages
- * @prev_from: sequence number of most previous retransmission request
- * @stale_limit: time when repeated identical retransmits must force link reset
  * @ackers: # of peers that needs to ack each packet before it can be released
  * @acked: # last packet acked by a certain peer. Used for broadcast.
  * @rcv_nxt: next sequence number to expect for inbound messages
@@ -164,9 +162,7 @@ struct tipc_link {
 		u16 limit;
 	} backlog[5];
 	u16 snd_nxt;
-	u16 prev_from;
 	u16 window;
-	unsigned long stale_limit;
 
 	/* Reception */
 	u16 rcv_nxt;
@@ -1063,47 +1059,53 @@ static void tipc_link_advance_backlog(struct tipc_link *l,
  * link_retransmit_failure() - Detect repeated retransmit failures
  * @l: tipc link sender
  * @r: tipc link receiver (= l in case of unicast)
- * @from: seqno of the 1st packet in retransmit request
  * @rc: returned code
  *
  * Return: true if the repeated retransmit failures happens, otherwise
  * false
  */
 static bool link_retransmit_failure(struct tipc_link *l, struct tipc_link *r,
-				    u16 from, int *rc)
+				    int *rc)
 {
 	struct sk_buff *skb = skb_peek(&l->transmq);
 	struct tipc_msg *hdr;
 
 	if (!skb)
 		return false;
-	hdr = buf_msg(skb);
 
-	/* Detect repeated retransmit failures on same packet */
-	if (r->prev_from != from) {
-		r->prev_from = from;
-		r->stale_limit = jiffies + msecs_to_jiffies(r->tolerance);
-	} else if (time_after(jiffies, r->stale_limit)) {
-		pr_warn("Retransmission failure on link <%s>\n", l->name);
-		link_print(l, "State of link ");
-		pr_info("Failed msg: usr %u, typ %u, len %u, err %u\n",
-			msg_user(hdr), msg_type(hdr), msg_size(hdr),
-			msg_errcode(hdr));
-		pr_info("sqno %u, prev: %x, src: %x\n",
-			msg_seqno(hdr), msg_prevnode(hdr), msg_orignode(hdr));
-
-		trace_tipc_list_dump(&l->transmq, true, "retrans failure!");
-		trace_tipc_link_dump(l, TIPC_DUMP_NONE, "retrans failure!");
-		trace_tipc_link_dump(r, TIPC_DUMP_NONE, "retrans failure!");
+	if (!TIPC_SKB_CB(skb)->retr_cnt)
+		return false;
 
-		if (link_is_bc_sndlink(l))
-			*rc = TIPC_LINK_DOWN_EVT;
+	if (!time_after(jiffies, TIPC_SKB_CB(skb)->retr_stamp +
+			msecs_to_jiffies(r->tolerance)))
+		return false;
+
+	hdr = buf_msg(skb);
+	if (link_is_bc_sndlink(l) && !less(r->acked, msg_seqno(hdr)))
+		return false;
 
+	pr_warn("Retransmission failure on link <%s>\n", l->name);
+	link_print(l, "State of link ");
+	pr_info("Failed msg: usr %u, typ %u, len %u, err %u\n",
+		msg_user(hdr), msg_type(hdr), msg_size(hdr), msg_errcode(hdr));
+	pr_info("sqno %u, prev: %x, dest: %x\n",
+		msg_seqno(hdr), msg_prevnode(hdr), msg_destnode(hdr));
+	pr_info("retr_stamp %d, retr_cnt %d\n",
+		jiffies_to_msecs(TIPC_SKB_CB(skb)->retr_stamp),
+		TIPC_SKB_CB(skb)->retr_cnt);
+
+	trace_tipc_list_dump(&l->transmq, true, "retrans failure!");
+	trace_tipc_link_dump(l, TIPC_DUMP_NONE, "retrans failure!");
+	trace_tipc_link_dump(r, TIPC_DUMP_NONE, "retrans failure!");
+
+	if (link_is_bc_sndlink(l)) {
+		r->state = LINK_RESET;
+		*rc = TIPC_LINK_DOWN_EVT;
+	} else {
 		*rc = tipc_link_fsm_evt(l, LINK_FAILURE_EVT);
-		return true;
 	}
 
-	return false;
+	return true;
 }
 
 /* tipc_link_bc_retrans() - retransmit zero or more packets
@@ -1129,7 +1131,7 @@ static int tipc_link_bc_retrans(struct tipc_link *l, struct tipc_link *r,
 
 	trace_tipc_link_retrans(r, from, to, &l->transmq);
 
-	if (link_retransmit_failure(l, r, from, &rc))
+	if (link_retransmit_failure(l, r, &rc))
 		return rc;
 
 	skb_queue_walk(&l->transmq, skb) {
@@ -1138,11 +1140,10 @@ static int tipc_link_bc_retrans(struct tipc_link *l, struct tipc_link *r,
 			continue;
 		if (more(msg_seqno(hdr), to))
 			break;
-		if (link_is_bc_sndlink(l)) {
-			if (time_before(jiffies, TIPC_SKB_CB(skb)->nxt_retr))
-				continue;
-			TIPC_SKB_CB(skb)->nxt_retr = TIPC_BC_RETR_LIM;
-		}
+
+		if (time_before(jiffies, TIPC_SKB_CB(skb)->nxt_retr))
+			continue;
+		TIPC_SKB_CB(skb)->nxt_retr = TIPC_BC_RETR_LIM;
 		_skb = __pskb_copy(skb, LL_MAX_HEADER + MIN_H_SIZE, GFP_ATOMIC);
 		if (!_skb)
 			return 0;
@@ -1152,6 +1153,10 @@ static int tipc_link_bc_retrans(struct tipc_link *l, struct tipc_link *r,
 		_skb->priority = TC_PRIO_CONTROL;
 		__skb_queue_tail(xmitq, _skb);
 		l->stats.retransmitted++;
+
+		/* Increase actual retrans counter & mark first time */
+		if (!TIPC_SKB_CB(skb)->retr_cnt++)
+			TIPC_SKB_CB(skb)->retr_stamp = jiffies;
 	}
 	return 0;
 }
@@ -1393,12 +1398,10 @@ static int tipc_link_advance_transmq(struct tipc_link *l, u16 acked, u16 gap,
 	struct tipc_msg *hdr;
 	u16 bc_ack = l->bc_rcvlink->rcv_nxt - 1;
 	u16 ack = l->rcv_nxt - 1;
+	bool passed = false;
 	u16 seqno, n = 0;
 	int rc = 0;
 
-	if (gap && link_retransmit_failure(l, l, acked + 1, &rc))
-		return rc;
-
 	skb_queue_walk_safe(&l->transmq, skb, tmp) {
 		seqno = buf_seqno(skb);
 
@@ -1408,12 +1411,17 @@ static int tipc_link_advance_transmq(struct tipc_link *l, u16 acked, u16 gap,
 			__skb_unlink(skb, &l->transmq);
 			kfree_skb(skb);
 		} else if (less_eq(seqno, acked + gap)) {
-			/* retransmit skb */
+			/* First, check if repeated retrans failures occurs? */
+			if (!passed && link_retransmit_failure(l, l, &rc))
+				return rc;
+			passed = true;
+
+			/* retransmit skb if unrestricted*/
 			if (time_before(jiffies, TIPC_SKB_CB(skb)->nxt_retr))
 				continue;
 			TIPC_SKB_CB(skb)->nxt_retr = TIPC_UC_RETR_TIME;
-
-			_skb = __pskb_copy(skb, MIN_H_SIZE, GFP_ATOMIC);
+			_skb = __pskb_copy(skb, LL_MAX_HEADER + MIN_H_SIZE,
+					   GFP_ATOMIC);
 			if (!_skb)
 				continue;
 			hdr = buf_msg(_skb);
@@ -1422,6 +1430,10 @@ static int tipc_link_advance_transmq(struct tipc_link *l, u16 acked, u16 gap,
 			_skb->priority = TC_PRIO_CONTROL;
 			__skb_queue_tail(xmitq, _skb);
 			l->stats.retransmitted++;
+
+			/* Increase actual retrans counter & mark first time */
+			if (!TIPC_SKB_CB(skb)->retr_cnt++)
+				TIPC_SKB_CB(skb)->retr_stamp = jiffies;
 		} else {
 			/* retry with Gap ACK blocks if any */
 			if (!ga || n >= ga->gack_cnt)
@@ -2681,7 +2693,7 @@ int tipc_link_dump(struct tipc_link *l, u16 dqueues, char *buf)
 	i += scnprintf(buf + i, sz - i, " %x", l->peer_caps);
 	i += scnprintf(buf + i, sz - i, " %u", l->silent_intv_cnt);
 	i += scnprintf(buf + i, sz - i, " %u", l->rst_cnt);
-	i += scnprintf(buf + i, sz - i, " %u", l->prev_from);
+	i += scnprintf(buf + i, sz - i, " %u", 0);
 	i += scnprintf(buf + i, sz - i, " %u", 0);
 	i += scnprintf(buf + i, sz - i, " %u", l->acked);
 
diff --git a/net/tipc/msg.h b/net/tipc/msg.h
index 1c8c8dd32a4e..0daa6f04ca81 100644
--- a/net/tipc/msg.h
+++ b/net/tipc/msg.h
@@ -102,13 +102,15 @@ struct plist;
 #define TIPC_MEDIA_INFO_OFFSET	5
 
 struct tipc_skb_cb {
-	u32 bytes_read;
-	u32 orig_member;
 	struct sk_buff *tail;
 	unsigned long nxt_retr;
-	bool validated;
+	unsigned long retr_stamp;
+	u32 bytes_read;
+	u32 orig_member;
 	u16 chain_imp;
 	u16 ackers;
+	u16 retr_cnt;
+	bool validated;
 };
 
 #define TIPC_SKB_CB(__skb) ((struct tipc_skb_cb *)&((__skb)->cb[0]))
-- 
2.13.7


^ permalink raw reply related

* Re: [PATCH] virtio-net: lower min ring num_free for efficiency
From: Jason Wang @ 2019-08-15  3:17 UTC (permalink / raw)
  To: 冉 jiang, mst@redhat.com
  Cc: davem@davemloft.net, ast@kernel.org, daniel@iogearbox.net,
	jakub.kicinski@netronome.com, hawk@kernel.org,
	john.fastabend@gmail.com, kafai@fb.com, songliubraving@fb.com,
	yhs@fb.com, virtualization@lists.linux-foundation.org,
	netdev@vger.kernel.org, linux-kernel@vger.kernel.org,
	xdp-newbies@vger.kernel.org, bpf@vger.kernel.org,
	jiangran.jr@alibaba-inc.com
In-Reply-To: <BYAPR14MB3205B734E554EACEEE337ADDA6AC0@BYAPR14MB3205.namprd14.prod.outlook.com>


On 2019/8/15 上午11:11, 冉 jiang wrote:
> On 2019/8/15 11:01, Jason Wang wrote:
>> On 2019/8/14 上午10:06, ? jiang wrote:
>>> This change lowers ring buffer reclaim threshold from 1/2*queue to
>>> budget
>>> for better performance. According to our test with qemu + dpdk, packet
>>> dropping happens when the guest is not able to provide free buffer in
>>> avail ring timely with default 1/2*queue. The value in the patch has
>>> been
>>> tested and does show better performance.
>>
>> Please add your tests setup and result here.
>>
>> Thanks
>>
>>
>>> Signed-off-by: jiangkidd <jiangkidd@hotmail.com>
>>> ---
>>>    drivers/net/virtio_net.c | 2 +-
>>>    1 file changed, 1 insertion(+), 1 deletion(-)
>>>
>>> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
>>> index 0d4115c9e20b..bc08be7925eb 100644
>>> --- a/drivers/net/virtio_net.c
>>> +++ b/drivers/net/virtio_net.c
>>> @@ -1331,7 +1331,7 @@ static int virtnet_receive(struct receive_queue
>>> *rq, int budget,
>>>            }
>>>        }
>>>    -    if (rq->vq->num_free > virtqueue_get_vring_size(rq->vq) / 2) {
>>> +    if (rq->vq->num_free > min((unsigned int)budget,
>>> virtqueue_get_vring_size(rq->vq)) / 2) {
>>>            if (!try_fill_recv(vi, rq, GFP_ATOMIC))
>>>                schedule_delayed_work(&vi->refill, 0);
>>>        }
> Sure, here are the details:


Thanks for the details, but I meant it's better if you could summarize 
you test result in the commit log in a compact way.

Btw, some comments, see below:


>
>
> Test setup & result:
>
> ----------------------------------------------------
>
> Below is the snippet from our test result. Test1 was done with default
> driver with the value of 1/2 * queue, while test2 is with my patch. We
> can see average
> drop packets do decrease a lot in test2.
>
> test1Time    avgDropPackets    test2Time    avgDropPackets pps
>
> 16:21.0    12.295    56:50.4    0 300k
> 17:19.1    15.244    56:50.4    0    300k
> 18:17.5    18.789    56:50.4    0    300k
> 19:15.1    14.208    56:50.4    0    300k
> 20:13.2    20.818    56:50.4    0.267    300k
> 21:11.2    12.397    56:50.4    0    300k
> 22:09.3    12.599    56:50.4    0    300k
> 23:07.3    15.531    57:48.4    0    300k
> 24:05.5    13.664    58:46.5    0    300k
> 25:03.7    13.158    59:44.5    4.73    300k
> 26:01.1    2.486    00:42.6    0    300k
> 26:59.1    11.241    01:40.6    0    300k
> 27:57.2    20.521    02:38.6    0    300k
> 28:55.2    30.094    03:36.7    0    300k
> 29:53.3    16.828    04:34.7    0.963    300k
> 30:51.3    46.916    05:32.8    0    400k
> 31:49.3    56.214    05:32.8    0    400k
> 32:47.3    58.69    05:32.8    0    400k
> 33:45.3    61.486    05:32.8    0    400k
> 34:43.3    72.175    05:32.8    0.598    400k
> 35:41.3    56.699    05:32.8    0    400k
> 36:39.3    61.071    05:32.8    0    400k
> 37:37.3    43.355    06:30.8    0    400k
> 38:35.4    44.644    06:30.8    0    400k
> 39:33.4    72.336    06:30.8    0    400k
> 40:31.4    70.676    06:30.8    0    400k
> 41:29.4    108.009    06:30.8    0    400k
> 42:27.4    65.216    06:30.8    0    400k


Why there're difference in test time? Could you summarize them like:

Test setup: e.g testpmd or pktgen to generate packets to guest

avg packets drop before: XXX

avg packets drop after: YYY(-ZZZ%)

Thanks


>
>
> Data to prove why the patch helps:
>
> ----------------------------------------------------
>
> We did have completed several rounds of test with setting the value to
> budget (64 as the default value). It does improve a lot with pps is
> below 400pps for a single stream. We are confident that it runs out of free
> buffer in avail ring when packet dropping happens with below systemtap:
>
> Just a snippet:
>
> probe module("virtio_ring").function("virtqueue_get_buf")
> {
>        x = (@cast($_vq, "vring_virtqueue")->vring->used->idx)-
> (@cast($_vq, "vring_virtqueue")->last_used_idx) ---> we use this one
> to verify if the queue is full, which means guest is not able to take
> buffer from the queue timely
>
>        if (x<0 && (x+65535)<4096)
>            x = x+65535
>
>        if((x==1024) && @cast($_vq, "vring_virtqueue")->vq->callback ==
> callback_addr)
>            netrxcount[x] <<< gettimeofday_s()
> }
>
>
> probe module("virtio_ring").function("virtqueue_add_inbuf")
> {
>        y = (@cast($vq, "vring_virtqueue")->vring->avail->idx)-
> (@cast($vq, "vring_virtqueue")->vring->used->idx) ---> we use this one
> to verify if we run out of free buffer in avail ring
>        if (y<0 && (y+65535)<4096)
>            y = y+65535
>
>        if(@2=="debugon")
>        {
>            if(y==0 && @cast($vq, "vring_virtqueue")->vq->callback ==
> callback_addr)
>            {
>                netrxfreecount[y] <<< gettimeofday_s()
>
>                printf("no avail ring left seen, printing most recent 5
> num free, vq: %lx, current index: %d\n", $vq, recentfreecount)
>                for(i=recentfreecount; i!=((recentfreecount+4) % 5);
> i=((i+1) % 5))
>                {
>                    printf("index: %d, num free: %d\n", i, recentfree[$vq,
> i])
>                }
>
>                printf("index: %d, num free: %d\n", i, recentfree[$vq, i])
>                //exit()
>            }
>        }
> }
>
>
> probe
> module("virtio_net").statement("virtnet_receive@drivers/net/virtio_net.c:732")
>
> {
>        recentfreecount++
>        recentfreecount = recentfreecount % 5
>        recentfree[$rq->vq, recentfreecount] = $rq->vq->num_free --->
> record the num_free for the last 5 calls to virtnet_receive, so we can
> see if lowering the bar helps.
> }
>
>
> Here is the result:
>
> no avail ring left seen, printing most recent 5 num free, vq:
> ffff9c13c1200000, current index: 1
> index: 1, num free: 561
> index: 2, num free: 305
> index: 3, num free: 369
> index: 4, num free: 433
> index: 0, num free: 497
> no avail ring left seen, printing most recent 5 num free, vq:
> ffff9c13c1200000, current index: 1
> index: 1, num free: 543
> index: 2, num free: 463
> index: 3, num free: 469
> index: 4, num free: 476
> index: 0, num free: 479
> no avail ring left seen, printing most recent 5 num free, vq:
> ffff9c13c1200000, current index: 2
> index: 2, num free: 555
> index: 3, num free: 414
> index: 4, num free: 420
> index: 0, num free: 427
> index: 1, num free: 491
>
> We can see in the last 4 calls to virtnet_receive before we run out
> of free buffer and start to relaim, num_free is quite high. So if we
> can do the reclaim earlier, it will certainly help.
>
> Jiang


Right, but I think there's no need to put those thing in the commit log.

Thanks



^ permalink raw reply

* Re: [PATCH] cxgb4: fix a memory leak bug
From: David Miller @ 2019-08-15  3:03 UTC (permalink / raw)
  To: wenwen; +Cc: vishal, netdev, linux-kernel
In-Reply-To: <1565687932-2870-1-git-send-email-wenwen@cs.uga.edu>

From: Wenwen Wang <wenwen@cs.uga.edu>
Date: Tue, 13 Aug 2019 04:18:52 -0500

> In blocked_fl_write(), 't' is not deallocated if bitmap_parse_user() fails,
> leading to a memory leak bug. To fix this issue, free t before returning
> the error.
> 
> Signed-off-by: Wenwen Wang <wenwen@cs.uga.edu>

Applied.

^ permalink raw reply

* Re: [PATCH net-next] net/mvpp2: Replace tasklet with softirq hrtimer
From: David Miller @ 2019-08-15  3:02 UTC (permalink / raw)
  To: bigeasy; +Cc: netdev, tglx, maxime.chevallier
In-Reply-To: <20190813080025.2j3cj6gfyzzxek7x@linutronix.de>

From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Tue, 13 Aug 2019 10:00:25 +0200

> From: Thomas Gleixner <tglx@linutronix.de>
> 
> The tx_done_tasklet tasklet is used in invoke the hrtimer
> (mvpp2_hr_timer_cb) in softirq context. This can be also achieved without
> the tasklet but with HRTIMER_MODE_SOFT as hrtimer mode.
> 
> Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
> Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>

Applied.

^ permalink raw reply

* Re: [PATCH] virtio-net: lower min ring num_free for efficiency
From: Jason Wang @ 2019-08-15  3:01 UTC (permalink / raw)
  To: ? jiang, mst@redhat.com
  Cc: davem@davemloft.net, ast@kernel.org, daniel@iogearbox.net,
	jakub.kicinski@netronome.com, hawk@kernel.org,
	john.fastabend@gmail.com, kafai@fb.com, songliubraving@fb.com,
	yhs@fb.com, virtualization@lists.linux-foundation.org,
	netdev@vger.kernel.org, linux-kernel@vger.kernel.org,
	xdp-newbies@vger.kernel.org, bpf@vger.kernel.org,
	jiangran.jr@alibaba-inc.com
In-Reply-To: <BYAPR14MB3205E4E194942B0A1A91A222A6AD0@BYAPR14MB3205.namprd14.prod.outlook.com>


On 2019/8/14 上午10:06, ? jiang wrote:
> This change lowers ring buffer reclaim threshold from 1/2*queue to budget
> for better performance. According to our test with qemu + dpdk, packet
> dropping happens when the guest is not able to provide free buffer in
> avail ring timely with default 1/2*queue. The value in the patch has been
> tested and does show better performance.


Please add your tests setup and result here.

Thanks


>
> Signed-off-by: jiangkidd <jiangkidd@hotmail.com>
> ---
>   drivers/net/virtio_net.c | 2 +-
>   1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> index 0d4115c9e20b..bc08be7925eb 100644
> --- a/drivers/net/virtio_net.c
> +++ b/drivers/net/virtio_net.c
> @@ -1331,7 +1331,7 @@ static int virtnet_receive(struct receive_queue *rq, int budget,
>   		}
>   	}
>   
> -	if (rq->vq->num_free > virtqueue_get_vring_size(rq->vq) / 2) {
> +	if (rq->vq->num_free > min((unsigned int)budget, virtqueue_get_vring_size(rq->vq)) / 2) {
>   		if (!try_fill_recv(vi, rq, GFP_ATOMIC))
>   			schedule_delayed_work(&vi->refill, 0);
>   	}

^ permalink raw reply

* Re: [PATCH 0/2] Netfilter updates for net-next
From: David Miller @ 2019-08-15  2:59 UTC (permalink / raw)
  To: pablo; +Cc: netfilter-devel, netdev
In-Reply-To: <20190814214347.4940-1-pablo@netfilter.org>

From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Wed, 14 Aug 2019 23:43:45 +0200

> The following patchset contains Netfilter updates for net-next.
> This round addresses fallout from previous pull request:
> 
> 1) Remove #warning from ipt_LOG.h and ip6t_LOG.h headers,
>    from Jeremy Sowden.
> 
> 2) Incorrect parens in memcmp() in nft_bitwise, from Nathan Chancellor.
> 
> You can pull these changes from:
> 
>   git://git.kernel.org/pub/scm/linux/kernel/git/pablo/nf-next.git

Pulled, thanks.

^ permalink raw reply

* Re: WARNING in cgroup_rstat_updated
From: syzbot @ 2019-08-15  1:26 UTC (permalink / raw)
  To: ast, daniel, hdanton, john.fastabend, linux-kernel, linux-mm,
	netdev, syzkaller-bugs
In-Reply-To: <000000000000b851cb058f7e637f@google.com>

syzbot has bisected this bug to:

commit e9db4ef6bf4ca9894bb324c76e01b8f1a16b2650
Author: John Fastabend <john.fastabend@gmail.com>
Date:   Sat Jun 30 13:17:47 2018 +0000

     bpf: sockhash fix omitted bucket lock in sock_close

bisection log:  https://syzkaller.appspot.com/x/bisect.txt?x=143286e2600000
start commit:   31cc088a Merge tag 'drm-next-2019-07-19' of git://anongit...
git tree:       net-next
final crash:    https://syzkaller.appspot.com/x/report.txt?x=163286e2600000
console output: https://syzkaller.appspot.com/x/log.txt?x=123286e2600000
kernel config:  https://syzkaller.appspot.com/x/.config?x=4dba67bf8b8c9ad7
dashboard link: https://syzkaller.appspot.com/bug?extid=370e4739fa489334a4ef
syz repro:      https://syzkaller.appspot.com/x/repro.syz?x=16dd57dc600000

Reported-by: syzbot+370e4739fa489334a4ef@syzkaller.appspotmail.com
Fixes: e9db4ef6bf4c ("bpf: sockhash fix omitted bucket lock in sock_close")

For information about bisection process see: https://goo.gl/tpsmEJ#bisection

^ permalink raw reply

* Re: [PATCH net-next 1/5] RDS: Re-add pf/sol access via sysctl
From: David Miller @ 2019-08-15  1:25 UTC (permalink / raw)
  To: gerd.rausch; +Cc: santosh.shilimkar, dledford, netdev, linux-rdma, rds-devel
In-Reply-To: <1c6d1f04-96d5-94e5-3140-d3da194e14f3@oracle.com>

From: Gerd Rausch <gerd.rausch@oracle.com>
Date: Wed, 14 Aug 2019 14:45:21 -0700

> a) It is utterly inappropriate to have Oracle applications
>    rely on a /proc/sys API that was kept out of Upstream-Linux
>    for this long

Yes.

> 
> b) It is utterly inappropriate to include such a /proc/sys API
>    that Oracle applications have depended on this late

Also yes.

^ permalink raw reply

* Re: [patch net-next v3 1/2] netdevsim: implement support for devlink region and snapshots
From: Jakub Kicinski @ 2019-08-15  1:14 UTC (permalink / raw)
  To: Jiri Pirko; +Cc: netdev, davem, mlxsw
In-Reply-To: <20190814153735.6923-2-jiri@resnulli.us>

On Wed, 14 Aug 2019 17:37:34 +0200, Jiri Pirko wrote:
> diff --git a/drivers/net/netdevsim/dev.c b/drivers/net/netdevsim/dev.c
> index 08ca59fc189b..125a0358bc04 100644
> --- a/drivers/net/netdevsim/dev.c
> +++ b/drivers/net/netdevsim/dev.c
> @@ -27,6 +27,41 @@
>  
>  static struct dentry *nsim_dev_ddir;
>  
> +#define NSIM_DEV_DUMMY_REGION_SIZE (1024 * 32)
> +
> +static ssize_t nsim_dev_take_snapshot_write(struct file *file,
> +					    const char __user *data,
> +					    size_t count, loff_t *ppos)
> +{
> +	struct nsim_dev *nsim_dev = file->private_data;
> +	void *dummy_data;
> +	u32 id;
> +	int err;

If you have to rebase on top of Ido's changes and repost, please do
reverse xmas tree.

^ permalink raw reply

* Re: [patch net-next v2 2/2] selftests: netdevsim: add devlink params tests
From: Jakub Kicinski @ 2019-08-15  1:09 UTC (permalink / raw)
  To: Jiri Pirko; +Cc: netdev, davem, mlxsw
In-Reply-To: <20190814152604.6385-3-jiri@resnulli.us>

On Wed, 14 Aug 2019 17:26:04 +0200, Jiri Pirko wrote:
> From: Jiri Pirko <jiri@mellanox.com>
> 
> Test recently added netdevsim devlink param implementation.
> 
> Signed-off-by: Jiri Pirko <jiri@mellanox.com>
> ---
> v1->v2:
> -using cmd_jq helper

Still failing here :(

# ./devlink.sh 
TEST: fw flash test                                                 [ OK ]
TEST: params test                                                   [FAIL]
	Failed to get test1 param value
TEST: regions test                                                  [ OK ]

# jq --version
jq-1.5-1-a5b5cbe
# echo '{ "a" : false }' | jq -e -r '.[]'
false
# echo $?
1

On another machine:

$ echo '{ "a" : false }' | jq -e -r '.[]'
false
$ echo $?
1

Did you mean to drop the -e ?

^ permalink raw reply

* Re: [PATCH net-next v2 13/14] selftests: devlink_trap: Add test cases for devlink-trap
From: Jakub Kicinski @ 2019-08-15  0:42 UTC (permalink / raw)
  To: Ido Schimmel
  Cc: netdev, davem, nhorman, jiri, toke, dsahern, roopa, nikolay, andy,
	f.fainelli, andrew, vivien.didelot, mlxsw, Ido Schimmel
In-Reply-To: <20190813075400.11841-14-idosch@idosch.org>

On Tue, 13 Aug 2019 10:53:59 +0300, Ido Schimmel wrote:
> From: Ido Schimmel <idosch@mellanox.com>
> 
> Add test cases for devlink-trap on top of the netdevsim implementation.
> 
> The tests focus on the devlink-trap core infrastructure and user space
> API. They test both good and bad flows and also dismantle of the netdev
> and devlink device used to report trapped packets.
> 
> This allows device drivers to focus their tests on device-specific
> functionality.
> 
> Signed-off-by: Ido Schimmel <idosch@mellanox.com>
> Acked-by: Jiri Pirko <jiri@mellanox.com>

Thanks for the test!

Should it perhaps live in:
tools/testing/selftests/drivers/net/netdevsim/
?

That's where Jiri puts his devlink tests..

Also the test seems to require netdevsim to be loaded, otherwise:
# ./devlink_trap.sh 
SKIP: No netdevsim support

Is that expected?

^ permalink raw reply

* Re: [PATCH v2 bpf-next 1/4] bpf: unprivileged BPF access via /dev/bpf
From: Alexei Starovoitov @ 2019-08-15  0:36 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Andy Lutomirski, Daniel Colascione, Song Liu, Kees Cook,
	Networking, bpf, Alexei Starovoitov, Daniel Borkmann, Kernel Team,
	Lorenz Bauer, Jann Horn, Greg KH, Linux API, LSM List
In-Reply-To: <317422C3-ACE3-42A7-A287-7B8FEE12E33A@amacapital.net>

On Wed, Aug 14, 2019 at 04:59:18PM -0700, Andy Lutomirski wrote:
> 
> 
> > On Aug 14, 2019, at 4:33 PM, Alexei Starovoitov <alexei.starovoitov@gmail.com> wrote:
> > 
> >> On Wed, Aug 14, 2019 at 03:30:51PM -0700, Andy Lutomirski wrote:
> >> 
> >> 
> >>>> On Aug 14, 2019, at 3:05 PM, Alexei Starovoitov <alexei.starovoitov@gmail.com> wrote:
> >>>> 
> >>>> On Wed, Aug 14, 2019 at 10:51:23AM -0700, Andy Lutomirski wrote:
> >>>> 
> >>>> If eBPF is genuinely not usable by programs that are not fully trusted
> >>>> by the admin, then no kernel changes at all are needed.  Programs that
> >>>> want to reduce their own privileges can easily fork() a privileged
> >>>> subprocess or run a little helper to which they delegate BPF
> >>>> operations.  This is far more flexible than anything that will ever be
> >>>> in the kernel because it allows the helper to verify that the rest of
> >>>> the program is doing exactly what it's supposed to and restrict eBPF
> >>>> operations to exactly the subset that is needed.  So a container
> >>>> manager or network manager that drops some provilege could have a
> >>>> little bpf-helper that manages its BPF XDP, firewalling, etc
> >>>> configuration.  The two processes would talk over a socketpair.
> >>> 
> >>> there were three projects that tried to delegate bpf operations.
> >>> All of them failed.
> >>> bpf operational workflow is much more complex than you're imagining.
> >>> fork() also doesn't work for all cases.
> >>> I gave this example before: consider multiple systemd-like deamons
> >>> that need to do bpf operations that want to pass this 'bpf capability'
> >>> to other deamons written by other teams. Some of them will start
> >>> non-root, but still need to do bpf. They will be rpm installed
> >>> and live upgraded while running.
> >>> We considered to make systemd such centralized bpf delegation
> >>> authority too. It didn't work. bpf in kernel grows quickly.
> >>> libbpf part grows independently. llvm keeps evolving.
> >>> All of them are being changed while system overall has to stay
> >>> operational. Centralized approach breaks apart.
> >>> 
> >>>> The interesting cases you're talking about really *do* involved
> >>>> unprivileged or less privileged eBPF, though.  Let's see:
> >>>> 
> >>>> systemd --user: systemd --user *is not privileged at all*.  There's no
> >>>> issue of reducing privilege, since systemd --user doesn't have any
> >>>> privilege to begin with.  But systemd supports some eBPF features, and
> >>>> presumably it would like to support them in the systemd --user case.
> >>>> This is unprivileged eBPF.
> >>> 
> >>> Let's disambiguate the terminology.
> >>> This /dev/bpf patch set started as describing the feature as 'unprivileged bpf'.
> >>> I think that was a mistake.
> >>> Let's call systemd-like deamon usage of bpf 'less privileged bpf'.
> >>> This is not unprivileged.
> >>> 'unprivileged bpf' is what sysctl kernel.unprivileged_bpf_disabled controls.
> >>> 
> >>> There is a huge difference between the two.
> >>> I'm against extending 'unprivileged bpf' even a bit more than what it is
> >>> today for many reasons mentioned earlier.
> >>> The /dev/bpf is about 'less privileged'.
> >>> Less privileged than root. We need to split part of full root capability
> >>> into bpf capability. So that most of the root can be dropped.
> >>> This is very similar to what cap_net_admin does.
> >>> cap_net_amdin can bring down eth0 which is just as bad as crashing the box.
> >>> cap_net_admin is very much privileged. Just 'less privileged' than root.
> >>> Same thing for cap_bpf.
> >> 
> >> The new pseudo-capability in this patch set is absurdly broad. I’ve proposed some finer-grained divisions in this thread. Do you have comments on them?
> > 
> > Initially I agreed that it's probably too broad, but then realized
> > that they're perfect as-is. There is no need to partition further.
> > 
> >>> May be we should do both cap_bpf and /dev/bpf to make it clear that
> >>> this is the same thing. Two interfaces to achieve the same result.
> >> 
> >> What for?  If there’s a CAP_BPF, then why do you want /dev/bpf? Especially if you define it to do the same thing.
> > 
> > Indeed, ambient capabilities should work for all cases.
> > 
> >> No, I’m not.  I have no objection at all if you try to come up with a clear definition of what the capability checks do and what it means to grant a new permission to a task.  Changing *all* of the capable checks is needlessly broad.
> > 
> > There are not that many bits left. I prefer to consume single CAP_BPF bit.
> > All capable(CAP_SYS_ADMIN) checks in kernel/bpf/ will become CAP_BPF.
> > This is no-brainer.
> > 
> > The only question is whether few cases of CAP_NET_ADMIN in kernel/bpf/
> > should be extended to CAP_BPF or not.
> > imo devmap and xskmap can stay CAP_NET_ADMIN,
> > but cgroup bpf attach/detach should be either CAP_NET_ADMIN or CAP_BPF.
> > Initially cgroup-bpf hooks were limited to networking.
> > It's no longer the case. Requiring NET_ADMIN there make little sense now.
> > 
> 
> Cgroup bpf attach/detach, with the current API, gives very strong control over the whole system, and it will just get stronger as bpf gains features. Making it CAP_BPF means that you will never have the ability to make CAP_BPF safe to give to anything other than an extremely highly trusted process.  Unsafe pointers are similar. 

'never to less trusted process' ? why do you think so?
I don't see a problem adding /dev/bpf/foo in the future and make things
more granular. There is no such use case today. Hence I don't want to
spend time and design something without clear use case in mind.

> Do new programs really need the by_id calls? 

yes. Lorenz gave an example earlier. map-in-map returns map_id.
To operate on that map by_id is needed.


^ permalink raw reply

* [PATCH] ipvlan: set hw_enc_features like macvlan
From: Bill Sommerfeld @ 2019-08-15  0:10 UTC (permalink / raw)
  To: David S. Miller, Petr Machata
  Cc: Jiri Pirko, Ido Schimmel, Daniel Borkmann, YueHaibing,
	Thomas Gleixner, Miaohe Lin, Eric Dumazet, Mahesh Bandewar,
	netdev, linux-kernel, Bill Sommerfeld

Allow encapsulated packets sent to tunnels layered over ipvlan to use
offloads rather than forcing SW fallbacks.

Since commit f21e5077010acda73a60 ("macvlan: add offload features for
encapsulation"), macvlan has set dev->hw_enc_features to include
everything in dev->features; do likewise in ipvlan.

Signed-off-by: Bill Sommerfeld <wsommerfeld@google.com>
---
 drivers/net/ipvlan/ipvlan_main.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/net/ipvlan/ipvlan_main.c b/drivers/net/ipvlan/ipvlan_main.c
index 1c96bed5a7c4..887bbba4631e 100644
--- a/drivers/net/ipvlan/ipvlan_main.c
+++ b/drivers/net/ipvlan/ipvlan_main.c
@@ -126,6 +126,7 @@ static int ipvlan_init(struct net_device *dev)
 		     (phy_dev->state & IPVLAN_STATE_MASK);
 	dev->features = phy_dev->features & IPVLAN_FEATURES;
 	dev->features |= NETIF_F_LLTX | NETIF_F_VLAN_CHALLENGED;
+	dev->hw_enc_features |= dev->features;
 	dev->gso_max_size = phy_dev->gso_max_size;
 	dev->gso_max_segs = phy_dev->gso_max_segs;
 	dev->hard_header_len = phy_dev->hard_header_len;
-- 
2.23.0.rc1.153.gdeed80330f-goog


^ permalink raw reply related

* [PATCH bpf-next] tools: libbpf: update extended attributes version of bpf_object__open()
From: Anton Protopopov @ 2019-08-15  0:03 UTC (permalink / raw)
  To: Andrii Nakryiko, Alexei Starovoitov, Daniel Borkmann,
	Martin KaFai Lau, Song Liu, Yonghong Song, netdev, bpf,
	linux-kernel
  Cc: Anton Protopopov

Update the bpf_object_open_attr structure and corresponding code so that the
bpf_object__open_xattr function could be used to open objects from buffers as
well as from files.  The reason for this change is that the existing
bpf_object__open_buffer function doesn't provide a way to specify neither the
needs_kver nor flags parameters to the internal call to __bpf_object__open
which makes it inconvenient for loading BPF objects which doesn't require a
kernel version.

Two new fields, obj_buf and obj_buf_sz, were added to the structure, and the
file field was union'ed with obj_name so that one can open an object like this:

    struct bpf_object_open_attr attr = {
        .obj_name   = name,
        .obj_buf    = obj_buf,
        .obj_buf_sz = obj_buf_sz,
        .prog_type  = BPF_PROG_TYPE_UNSPEC,
    };
    return bpf_object__open_xattr(&attr);

while still being able to use the file semantics:

    struct bpf_object_open_attr attr = {
        .file       = path,
        .prog_type  = BPF_PROG_TYPE_UNSPEC,
    };
    return bpf_object__open_xattr(&attr);

Another thing to note is that since the commit c034a177d3c8 ("bpf: bpftool, add
flag to allow non-compat map definitions") which introduced a MAPS_RELAX_COMPAT
flag to load objects with non-compat map definitions, bpf_object__open_buffer
was called with this flag enabled (it was passed as the boolean true value in
flags argument to __bpf_object__open).  The default behaviour for all open
functions is to clear this flag and this patch changes bpf_object__open_buffer
to clears this flag.  It can be enabled, if needed, by opening an object from
buffer using __bpf_object__open_xattr.

Signed-off-by: Anton Protopopov <a.s.protopopov@gmail.com>
---
 tools/lib/bpf/libbpf.c | 45 ++++++++++++++++++++++++++----------------
 tools/lib/bpf/libbpf.h |  7 ++++++-
 2 files changed, 34 insertions(+), 18 deletions(-)

diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index 2233f919dd88..7c8054afd901 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -3630,13 +3630,31 @@ __bpf_object__open(const char *path, void *obj_buf, size_t obj_buf_sz,
 struct bpf_object *__bpf_object__open_xattr(struct bpf_object_open_attr *attr,
 					    int flags)
 {
+	char tmp_name[64];
+
 	/* param validation */
-	if (!attr->file)
+	if (!attr)
 		return NULL;
 
-	pr_debug("loading %s\n", attr->file);
+	if (attr->obj_buf) {
+		if (attr->obj_buf_sz <= 0)
+			return NULL;
+		if (!attr->file) {
+			snprintf(tmp_name, sizeof(tmp_name), "%lx-%lx",
+				 (unsigned long)attr->obj_buf,
+				 (unsigned long)attr->obj_buf_sz);
+			attr->obj_name = tmp_name;
+		}
+		pr_debug("loading object '%s' from buffer\n", attr->obj_name);
+	} else if (!attr->file) {
+		return NULL;
+	} else {
+		attr->obj_buf_sz = 0;
 
-	return __bpf_object__open(attr->file, NULL, 0,
+		pr_debug("loading object file '%s'\n", attr->file);
+	}
+
+	return __bpf_object__open(attr->file, attr->obj_buf, attr->obj_buf_sz,
 				  bpf_prog_type__needs_kver(attr->prog_type),
 				  flags);
 }
@@ -3660,21 +3678,14 @@ struct bpf_object *bpf_object__open_buffer(void *obj_buf,
 					   size_t obj_buf_sz,
 					   const char *name)
 {
-	char tmp_name[64];
-
-	/* param validation */
-	if (!obj_buf || obj_buf_sz <= 0)
-		return NULL;
-
-	if (!name) {
-		snprintf(tmp_name, sizeof(tmp_name), "%lx-%lx",
-			 (unsigned long)obj_buf,
-			 (unsigned long)obj_buf_sz);
-		name = tmp_name;
-	}
-	pr_debug("loading object '%s' from buffer\n", name);
+	struct bpf_object_open_attr attr = {
+		.obj_name	= name,
+		.obj_buf	= obj_buf,
+		.obj_buf_sz	= obj_buf_sz,
+		.prog_type	= BPF_PROG_TYPE_UNSPEC,
+	};
 
-	return __bpf_object__open(name, obj_buf, obj_buf_sz, true, true);
+	return bpf_object__open_xattr(&attr);
 }
 
 int bpf_object__unload(struct bpf_object *obj)
diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h
index e8f70977d137..634f278578dd 100644
--- a/tools/lib/bpf/libbpf.h
+++ b/tools/lib/bpf/libbpf.h
@@ -63,8 +63,13 @@ LIBBPF_API libbpf_print_fn_t libbpf_set_print(libbpf_print_fn_t fn);
 struct bpf_object;
 
 struct bpf_object_open_attr {
-	const char *file;
+	union {
+		const char *file;
+		const char *obj_name;
+	};
 	enum bpf_prog_type prog_type;
+	void *obj_buf;
+	size_t obj_buf_sz;
 };
 
 LIBBPF_API struct bpf_object *bpf_object__open(const char *path);
-- 
2.19.1


^ permalink raw reply related

* Re: [PATCH net-next v2 11/14] netdevsim: Add devlink-trap support
From: Jakub Kicinski @ 2019-08-14 23:59 UTC (permalink / raw)
  To: Ido Schimmel
  Cc: netdev, davem, nhorman, jiri, toke, dsahern, roopa, nikolay, andy,
	f.fainelli, andrew, vivien.didelot, mlxsw, Ido Schimmel
In-Reply-To: <20190813075400.11841-12-idosch@idosch.org>

On Tue, 13 Aug 2019 10:53:57 +0300, Ido Schimmel wrote:
> diff --git a/drivers/net/netdevsim/dev.c b/drivers/net/netdevsim/dev.c
> index 08ca59fc189b..2758d95c8d18 100644
> --- a/drivers/net/netdevsim/dev.c
> +++ b/drivers/net/netdevsim/dev.c
> @@ -17,11 +17,21 @@
>  
>  #include <linux/debugfs.h>
>  #include <linux/device.h>
> +#include <linux/etherdevice.h>
> +#include <linux/inet.h>
> +#include <linux/jiffies.h>
> +#include <linux/kernel.h>
>  #include <linux/list.h>
>  #include <linux/mutex.h>
>  #include <linux/random.h>
> +#include <linux/workqueue.h>
> +#include <linux/random.h>
>  #include <linux/rtnetlink.h>
>  #include <net/devlink.h>
> +#include <net/ip.h>
> +#include <uapi/linux/devlink.h>
> +#include <uapi/linux/ip.h>
> +#include <uapi/linux/udp.h>

Please keep includes ordered alphabetically. You're adding
linux/random.h second time.

>  #include "netdevsim.h"

> +static void nsim_dev_trap_report(struct nsim_dev_port *nsim_dev_port)
> +{
> +	struct nsim_dev *nsim_dev = nsim_dev_port->ns->nsim_dev;
> +	struct nsim_trap_data *nsim_trap_data = nsim_dev->trap_data;
> +	struct devlink *devlink = priv_to_devlink(nsim_dev);
> +	int i;

reverse christmas tree, please

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox