Netdev List

Netdev List
 help / color / mirror / Atom feed

* [PATCH bpf v2 4/6] tools: bpf: sync bpf uapi header
From: Sandipan Das @ 2018-05-18 12:50 UTC (permalink / raw)
  To: ast, daniel; +Cc: netdev, linuxppc-dev, naveen.n.rao, mpe, jakub.kicinski
In-Reply-To: <20180518125039.6500-1-sandipan@linux.vnet.ibm.com>

Syncing the bpf.h uapi header with tools so that struct
bpf_prog_info has the two new fields for passing on the
addresses of the kernel symbols corresponding to each
function in a JITed program.

Signed-off-by: Sandipan Das <sandipan@linux.vnet.ibm.com>
---
 tools/include/uapi/linux/bpf.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index d94d333a8225..040c9cac7303 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -2188,6 +2188,8 @@ struct bpf_prog_info {
 	__u32 xlated_prog_len;
 	__aligned_u64 jited_prog_insns;
 	__aligned_u64 xlated_prog_insns;
+	__aligned_u64 jited_ksyms;
+	__u32 nr_jited_ksyms;
 	__u64 load_time;	/* ns since boottime */
 	__u32 created_by_uid;
 	__u32 nr_map_ids;
-- 
2.14.3

^ permalink raw reply related

* [PATCH bpf v2 5/6] tools: bpftool: resolve calls without using imm field
From: Sandipan Das @ 2018-05-18 12:50 UTC (permalink / raw)
  To: ast, daniel; +Cc: netdev, linuxppc-dev, naveen.n.rao, mpe, jakub.kicinski
In-Reply-To: <20180518125039.6500-1-sandipan@linux.vnet.ibm.com>

Currently, we resolve the callee's address for a JITed function
call by using the imm field of the call instruction as an offset
from __bpf_call_base. If bpf_jit_kallsyms is enabled, we further
use this address to get the callee's kernel symbol's name.

For some architectures, such as powerpc64, the imm field is not
large enough to hold this offset. So, instead of assigning this
offset to the imm field, the verifier now assigns the subprog
id. Also, a list of kernel symbol addresses for all the JITed
functions is provided in the program info. We now use the imm
field as an index for this list to lookup a callee's symbol's
address and resolve its name.

Suggested-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Sandipan Das <sandipan@linux.vnet.ibm.com>
---
v2:
 - Order variables from longest to shortest
 - Make sure that ksyms_ptr and ksyms_len are always initialized
 - Simplify code
---
 tools/bpf/bpftool/prog.c          | 29 +++++++++++++++++++++++++++++
 tools/bpf/bpftool/xlated_dumper.c | 10 +++++++++-
 tools/bpf/bpftool/xlated_dumper.h |  2 ++
 3 files changed, 40 insertions(+), 1 deletion(-)

diff --git a/tools/bpf/bpftool/prog.c b/tools/bpf/bpftool/prog.c
index 9bdfdf2d3fbe..e2f8f8f259fc 100644
--- a/tools/bpf/bpftool/prog.c
+++ b/tools/bpf/bpftool/prog.c
@@ -421,19 +421,26 @@ static int do_show(int argc, char **argv)
 static int do_dump(int argc, char **argv)
 {
 	struct bpf_prog_info info = {};
+	unsigned long *addrs = NULL;
 	struct dump_data dd = {};
 	__u32 len = sizeof(info);
 	unsigned int buf_size;
+	unsigned int nr_addrs;
 	char *filepath = NULL;
 	bool opcodes = false;
 	bool visual = false;
 	unsigned char *buf;
 	__u32 *member_len;
 	__u64 *member_ptr;
+	__u32 *ksyms_len;
+	__u64 *ksyms_ptr;
 	ssize_t n;
 	int err;
 	int fd;
 
+	ksyms_len = &info.nr_jited_ksyms;
+	ksyms_ptr = &info.jited_ksyms;
+
 	if (is_prefix(*argv, "jited")) {
 		member_len = &info.jited_prog_len;
 		member_ptr = &info.jited_prog_insns;
@@ -496,10 +503,22 @@ static int do_dump(int argc, char **argv)
 		return -1;
 	}
 
+	nr_addrs = *ksyms_len;
+	if (nr_addrs) {
+		addrs = malloc(nr_addrs * sizeof(__u64));
+		if (!addrs) {
+			p_err("mem alloc failed");
+			close(fd);
+			goto err_free;
+		}
+	}
+
 	memset(&info, 0, sizeof(info));
 
 	*member_ptr = ptr_to_u64(buf);
 	*member_len = buf_size;
+	*ksyms_ptr = ptr_to_u64(addrs);
+	*ksyms_len = nr_addrs;
 
 	err = bpf_obj_get_info_by_fd(fd, &info, &len);
 	close(fd);
@@ -513,6 +532,11 @@ static int do_dump(int argc, char **argv)
 		goto err_free;
 	}
 
+	if (*ksyms_len > nr_addrs) {
+		p_err("too many addresses returned");
+		goto err_free;
+	}
+
 	if ((member_len == &info.jited_prog_len &&
 	     info.jited_prog_insns == 0) ||
 	    (member_len == &info.xlated_prog_len &&
@@ -558,6 +582,9 @@ static int do_dump(int argc, char **argv)
 			dump_xlated_cfg(buf, *member_len);
 	} else {
 		kernel_syms_load(&dd);
+		dd.nr_jited_ksyms = *ksyms_len;
+		dd.jited_ksyms = (__u64 *) *ksyms_ptr;
+
 		if (json_output)
 			dump_xlated_json(&dd, buf, *member_len, opcodes);
 		else
@@ -566,10 +593,12 @@ static int do_dump(int argc, char **argv)
 	}
 
 	free(buf);
+	free(addrs);
 	return 0;
 
 err_free:
 	free(buf);
+	free(addrs);
 	return -1;
 }
 
diff --git a/tools/bpf/bpftool/xlated_dumper.c b/tools/bpf/bpftool/xlated_dumper.c
index 7a3173b76c16..fb065b55db6d 100644
--- a/tools/bpf/bpftool/xlated_dumper.c
+++ b/tools/bpf/bpftool/xlated_dumper.c
@@ -174,7 +174,11 @@ static const char *print_call_pcrel(struct dump_data *dd,
 				    unsigned long address,
 				    const struct bpf_insn *insn)
 {
-	if (sym)
+	if (!dd->nr_jited_ksyms)
+		/* Do not show address for interpreted programs */
+		snprintf(dd->scratch_buff, sizeof(dd->scratch_buff),
+			"%+d", insn->off);
+	else if (sym)
 		snprintf(dd->scratch_buff, sizeof(dd->scratch_buff),
 			 "%+d#%s", insn->off, sym->name);
 	else
@@ -203,6 +207,10 @@ static const char *print_call(void *private_data,
 	unsigned long address = dd->address_call_base + insn->imm;
 	struct kernel_sym *sym;
 
+	if (insn->src_reg == BPF_PSEUDO_CALL &&
+		(__u32) insn->imm < dd->nr_jited_ksyms)
+		address = dd->jited_ksyms[insn->imm];
+
 	sym = kernel_syms_search(dd, address);
 	if (insn->src_reg == BPF_PSEUDO_CALL)
 		return print_call_pcrel(dd, sym, address, insn);
diff --git a/tools/bpf/bpftool/xlated_dumper.h b/tools/bpf/bpftool/xlated_dumper.h
index b34affa7ef2d..eafbb49c8d0b 100644
--- a/tools/bpf/bpftool/xlated_dumper.h
+++ b/tools/bpf/bpftool/xlated_dumper.h
@@ -49,6 +49,8 @@ struct dump_data {
 	unsigned long address_call_base;
 	struct kernel_sym *sym_mapping;
 	__u32 sym_count;
+	__u64 *jited_ksyms;
+	__u32 nr_jited_ksyms;
 	char scratch_buff[SYM_MAX_NAME + 8];
 };
 
-- 
2.14.3

^ permalink raw reply related

* [PATCH bpf v2 6/6] bpf: fix JITed dump for multi-function programs via syscall
From: Sandipan Das @ 2018-05-18 12:50 UTC (permalink / raw)
  To: ast, daniel; +Cc: netdev, linuxppc-dev, naveen.n.rao, mpe, jakub.kicinski
In-Reply-To: <20180518125039.6500-1-sandipan@linux.vnet.ibm.com>

Currently, for multi-function programs, we cannot get the JITed
instructions using the bpf system call's BPF_OBJ_GET_INFO_BY_FD
command. Because of this, userspace tools such as bpftool fail
to identify a multi-function program as being JITed or not.

With the JIT enabled and the test program running, this can be
verified as follows:

  # cat /proc/sys/net/core/bpf_jit_enable
  1

Before applying this patch:

  # bpftool prog list
  1: kprobe  name foo  tag b811aab41a39ad3d  gpl
          loaded_at 2018-05-16T11:43:38+0530  uid 0
          xlated 216B  not jited  memlock 65536B
  ...

  # bpftool prog dump jited id 1
  no instructions returned

After applying this patch:

  # bpftool prog list
  1: kprobe  name foo  tag b811aab41a39ad3d  gpl
          loaded_at 2018-05-16T12:13:01+0530  uid 0
          xlated 216B  jited 308B  memlock 65536B
  ...

  # bpftool prog dump jited id 1
     0:   nop
     4:   nop
     8:   mflr    r0
     c:   std     r0,16(r1)
    10:   stdu    r1,-112(r1)
    14:   std     r31,104(r1)
    18:   addi    r31,r1,48
    1c:   li      r3,10
  ...

Signed-off-by: Sandipan Das <sandipan@linux.vnet.ibm.com>
---
 kernel/bpf/syscall.c | 38 ++++++++++++++++++++++++++++++++------
 1 file changed, 32 insertions(+), 6 deletions(-)

diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 54a72fafe57c..2430d159078c 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1896,7 +1896,7 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog,
 	struct bpf_prog_info info = {};
 	u32 info_len = attr->info.info_len;
 	char __user *uinsns;
-	u32 ulen;
+	u32 ulen, i;
 	int err;
 
 	err = check_uarg_tail_zero(uinfo, sizeof(info), info_len);
@@ -1922,7 +1922,6 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog,
 	ulen = min_t(u32, info.nr_map_ids, ulen);
 	if (ulen) {
 		u32 __user *user_map_ids = u64_to_user_ptr(info.map_ids);
-		u32 i;
 
 		for (i = 0; i < ulen; i++)
 			if (put_user(prog->aux->used_maps[i]->id,
@@ -1970,13 +1969,41 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog,
 	 * for offload.
 	 */
 	ulen = info.jited_prog_len;
-	info.jited_prog_len = prog->jited_len;
+	if (prog->aux->func_cnt) {
+		info.jited_prog_len = 0;
+		for (i = 0; i < prog->aux->func_cnt; i++)
+			info.jited_prog_len += prog->aux->func[i]->jited_len;
+	} else {
+		info.jited_prog_len = prog->jited_len;
+	}
+
 	if (info.jited_prog_len && ulen) {
 		if (bpf_dump_raw_ok()) {
 			uinsns = u64_to_user_ptr(info.jited_prog_insns);
 			ulen = min_t(u32, info.jited_prog_len, ulen);
-			if (copy_to_user(uinsns, prog->bpf_func, ulen))
-				return -EFAULT;
+
+			/* for multi-function programs, copy the JITed
+			 * instructions for all the functions
+			 */
+			if (prog->aux->func_cnt) {
+				u32 len, free;
+				u8 *img;
+
+				free = ulen;
+				for (i = 0; i < prog->aux->func_cnt; i++) {
+					len = prog->aux->func[i]->jited_len;
+					img = (u8 *) prog->aux->func[i]->bpf_func;
+					if (len > free)
+						break;
+					if (copy_to_user(uinsns, img, len))
+						return -EFAULT;
+					uinsns += len;
+					free -= len;
+				}
+			} else {
+				if (copy_to_user(uinsns, prog->bpf_func, ulen))
+					return -EFAULT;
+			}
 		} else {
 			info.jited_prog_insns = 0;
 		}
@@ -1987,7 +2014,6 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog,
 	if (info.nr_jited_ksyms && ulen) {
 		u64 __user *user_jited_ksyms = u64_to_user_ptr(info.jited_ksyms);
 		ulong ksym_addr;
-		u32 i;
 
 		/* copy the address of the kernel symbol corresponding to
 		 * each function
-- 
2.14.3

^ permalink raw reply related

* [PATCH net] net: sched: red: avoid hashing NULL child
From: Paolo Abeni @ 2018-05-18 12:51 UTC (permalink / raw)
  To: netdev
  Cc: Jamal Hadi Salim, Cong Wang, Jiri Pirko, David S. Miller,
	Hangbin Liu, Jiri Kosina

Hangbin reported an Oops triggered by the syzkaller qdisc rules:

 kasan: GPF could be caused by NULL-ptr deref or user memory access
 general protection fault: 0000 [#1] SMP KASAN PTI
 Modules linked in: sch_red
 CPU: 0 PID: 28699 Comm: syz-executor5 Not tainted 4.17.0-rc4.kcov #1
 Hardware name: Red Hat KVM, BIOS 0.5.1 01/01/2011
 RIP: 0010:qdisc_hash_add+0x26/0xa0
 RSP: 0018:ffff8800589cf470 EFLAGS: 00010203
 RAX: dffffc0000000000 RBX: 0000000000000000 RCX: ffffffff824ad971
 RDX: 0000000000000007 RSI: ffffc9000ce9f000 RDI: 000000000000003c
 RBP: 0000000000000001 R08: ffffed000b139ea2 R09: ffff8800589cf4f0
 R10: ffff8800589cf50f R11: ffffed000b139ea2 R12: ffff880054019fc0
 R13: ffff880054019fb4 R14: ffff88005c0af600 R15: ffff880054019fb0
 FS:  00007fa6edcb1700(0000) GS:ffff88005ce00000(0000) knlGS:0000000000000000
 CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
 CR2: 0000000020000740 CR3: 000000000fc16000 CR4: 00000000000006f0
 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
 Call Trace:
  red_change+0x2d2/0xed0 [sch_red]
  qdisc_create+0x57e/0xef0
  tc_modify_qdisc+0x47f/0x14e0
  rtnetlink_rcv_msg+0x6a8/0x920
  netlink_rcv_skb+0x2a2/0x3c0
  netlink_unicast+0x511/0x740
  netlink_sendmsg+0x825/0xc30
  sock_sendmsg+0xc5/0x100
  ___sys_sendmsg+0x778/0x8e0
  __sys_sendmsg+0xf5/0x1b0
  do_syscall_64+0xbd/0x3b0
  entry_SYSCALL_64_after_hwframe+0x44/0xa9
 RIP: 0033:0x450869
 RSP: 002b:00007fa6edcb0c48 EFLAGS: 00000246 ORIG_RAX: 000000000000002e
 RAX: ffffffffffffffda RBX: 00007fa6edcb16b4 RCX: 0000000000450869
 RDX: 0000000000000000 RSI: 00000000200000c0 RDI: 0000000000000013
 RBP: 000000000072bea0 R08: 0000000000000000 R09: 0000000000000000
 R10: 0000000000000000 R11: 0000000000000246 R12: 00000000ffffffff
 R13: 0000000000008778 R14: 0000000000702838 R15: 00007fa6edcb1700
 Code: e9 0b fe ff ff 0f 1f 44 00 00 55 53 48 89 fb 89 f5 e8 3f 07 f3 fe 48 8d 7b 3c 48 b8 00 00 00 00 00 fc ff df 48 89 fa 48 c1 ea 03 <0f> b6 14 02 48 89 f8 83 e0 07 83 c0 03 38 d0 7c 04 84 d2 75 51
 RIP: qdisc_hash_add+0x26/0xa0 RSP: ffff8800589cf470

When a red qdisc is updated with a 0 limit, the child qdisc is left
unmodified, no additional scheduler is created in red_change(),
the 'child' local variable is rightfully NULL and must not add it
to the hash table.

This change addresses the above issue moving qdisc_hash_add() right
after the child qdisc creation. It additionally removes unneeded checks
for noop_qdisc.

Reported-by: Hangbin Liu <liuhangbin@gmail.com>
Fixes: 49b499718fa1 ("net: sched: make default fifo qdiscs appear in the dump")
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 net/sched/sch_red.c | 5 +++--
 net/sched/sch_tbf.c | 5 +++--
 2 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/net/sched/sch_red.c b/net/sched/sch_red.c
index 16644b3d2362..56c181c3feeb 100644
--- a/net/sched/sch_red.c
+++ b/net/sched/sch_red.c
@@ -222,10 +222,11 @@ static int red_change(struct Qdisc *sch, struct nlattr *opt,
 					 extack);
 		if (IS_ERR(child))
 			return PTR_ERR(child);
-	}
 
-	if (child != &noop_qdisc)
+		/* child is fifo, no need to check for noop_qdisc */
 		qdisc_hash_add(child, true);
+	}
+
 	sch_tree_lock(sch);
 	q->flags = ctl->flags;
 	q->limit = ctl->limit;
diff --git a/net/sched/sch_tbf.c b/net/sched/sch_tbf.c
index 03225a8df973..6f74a426f159 100644
--- a/net/sched/sch_tbf.c
+++ b/net/sched/sch_tbf.c
@@ -383,6 +383,9 @@ static int tbf_change(struct Qdisc *sch, struct nlattr *opt,
 			err = PTR_ERR(child);
 			goto done;
 		}
+
+		/* child is fifo, no need to check for noop_qdisc */
+		qdisc_hash_add(child, true);
 	}
 
 	sch_tree_lock(sch);
@@ -391,8 +394,6 @@ static int tbf_change(struct Qdisc *sch, struct nlattr *opt,
 					  q->qdisc->qstats.backlog);
 		qdisc_destroy(q->qdisc);
 		q->qdisc = child;
-		if (child != &noop_qdisc)
-			qdisc_hash_add(child, true);
 	}
 	q->limit = qopt->limit;
 	if (tb[TCA_TBF_PBURST])
-- 
2.14.3

^ permalink raw reply related

* Re: [PATCH 28/42] drbd: switch to proc_create_single
From: Lars Ellenberg @ 2018-05-18 12:55 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: linux-rtc-u79uwXL29TY76Z2rM5mHXA, Alessandro Zummo,
	Alexandre Belloni, devel-gWbeCf7V1WCQmaza687I9mD2FQJk+8+b,
	linux-scsi-u79uwXL29TY76Z2rM5mHXA,
	linux-acpi-u79uwXL29TY76Z2rM5mHXA, Greg Kroah-Hartman, Jiri Slaby,
	megaraidlinux.pdl-dY08KVG/lbpWk0Htik3J/w,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA, Alexey Dobriyan,
	linux-ide-u79uwXL29TY76Z2rM5mHXA,
	netfilter-devel-u79uwXL29TY76Z2rM5mHXA, Alexander Viro,
	netdev-u79uwXL29TY76Z2rM5mHXA, Andrew Morton,
	linux-ext4-u79uwXL29TY76Z2rM5mHXA,
	linux-afs-IAPFreCvJWM7uuMidbF8XUB+6BGkLq7r,
	jfs-discussion-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f,
	drbd-dev-cunTk1MwBs8qoQakbn7OcQ
In-Reply-To: <20180516094346.20506-29-hch-jcswGhMUV9g@public.gmane.org>

On Wed, May 16, 2018 at 11:43:32AM +0200, Christoph Hellwig wrote:
> And stop messing with try_module_get on THIS_MODULE, which doesn't make
> any sense here.

The idea was to increase module count on /proc/drbd access.

If someone holds /proc/drbd open, previously rmmod would
"succeed" in starting the unload, but then block on remove_proc_entry,
leading to a situation where the lsmod does not show drbd anymore,
but /proc/drbd being still there (but no longer accessible).

I'd rather have rmmod fail up front in this case.
And try_module_get() seemed most appropriate.

    Lars

^ permalink raw reply

* [PATCH net] tuntap: raise EPOLLOUT on device up
From: Jason Wang @ 2018-05-18 13:00 UTC (permalink / raw)
  To: netdev, linux-kernel; +Cc: mst, Jason Wang, Hannes Frederic Sowa, Eric Dumazet

We return -EIO on device down but can not raise EPOLLOUT after it was
up. This may confuse user like vhost which expects tuntap to raise
EPOLLOUT to re-enable its TX routine after tuntap is down. This could
be easily reproduced by transmitting packets from VM while down and up
the tap device. Fixing this by set SOCKWQ_ASYNC_NOSPACE on -EIO.

Cc: Hannes Frederic Sowa <hannes@stressinduktion.org>
Cc: Eric Dumazet <edumazet@google.com>
Fixes: 1bd4978a88ac2 ("tun: honor IFF_UP in tun_get_user()")
Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 drivers/net/tun.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index d45ac37..1b29761 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -1734,8 +1734,10 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
 	int skb_xdp = 1;
 	bool frags = tun_napi_frags_enabled(tun);
 
-	if (!(tun->dev->flags & IFF_UP))
+	if (!(tun->dev->flags & IFF_UP)) {
+		set_bit(SOCKWQ_ASYNC_NOSPACE, &tfile->socket.flags);
 		return -EIO;
+	}
 
 	if (!(tun->flags & IFF_NO_PI)) {
 		if (len < sizeof(pi))
-- 
2.7.4

^ permalink raw reply related

* Re: [PATCH net] sock_diag: fix use-after-free read in __sk_free
From: Craig Gallek @ 2018-05-18 13:05 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: David S . Miller, netdev, Eric Dumazet

On Fri, May 18, 2018 at 7:47 AM, Eric Dumazet <edumazet@google.com> wrote:
> We must not call sock_diag_has_destroy_listeners(sk) on a socket
> that has no reference on net structure.
>
> BUG: KASAN: use-after-free in sock_diag_has_destroy_listeners include/linux/sock_diag.h:75 [inline]
> BUG: KASAN: use-after-free in __sk_free+0x329/0x340 net/core/sock.c:1609
> Read of size 8 at addr ffff88018a02e3a0 by task swapper/1/0
>
> CPU: 1 PID: 0 Comm: swapper/1 Not tainted 4.17.0-rc5+ #54
> Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
> Call Trace:
>  <IRQ>
>  __dump_stack lib/dump_stack.c:77 [inline]
>  dump_stack+0x1b9/0x294 lib/dump_stack.c:113
>  print_address_description+0x6c/0x20b mm/kasan/report.c:256
>  kasan_report_error mm/kasan/report.c:354 [inline]
>  kasan_report.cold.7+0x242/0x2fe mm/kasan/report.c:412
>  __asan_report_load8_noabort+0x14/0x20 mm/kasan/report.c:433
>  sock_diag_has_destroy_listeners include/linux/sock_diag.h:75 [inline]
>  __sk_free+0x329/0x340 net/core/sock.c:1609
>  sk_free+0x42/0x50 net/core/sock.c:1623
>  sock_put include/net/sock.h:1664 [inline]
>  reqsk_free include/net/request_sock.h:116 [inline]
>  reqsk_put include/net/request_sock.h:124 [inline]
>  inet_csk_reqsk_queue_drop_and_put net/ipv4/inet_connection_sock.c:672 [inline]
>  reqsk_timer_handler+0xe27/0x10e0 net/ipv4/inet_connection_sock.c:739
>  call_timer_fn+0x230/0x940 kernel/time/timer.c:1326
>  expire_timers kernel/time/timer.c:1363 [inline]
>  __run_timers+0x79e/0xc50 kernel/time/timer.c:1666
>  run_timer_softirq+0x4c/0x70 kernel/time/timer.c:1692
>  __do_softirq+0x2e0/0xaf5 kernel/softirq.c:285
>  invoke_softirq kernel/softirq.c:365 [inline]
>  irq_exit+0x1d1/0x200 kernel/softirq.c:405
>  exiting_irq arch/x86/include/asm/apic.h:525 [inline]
>  smp_apic_timer_interrupt+0x17e/0x710 arch/x86/kernel/apic/apic.c:1052
>  apic_timer_interrupt+0xf/0x20 arch/x86/entry/entry_64.S:863
>  </IRQ>
> RIP: 0010:native_safe_halt+0x6/0x10 arch/x86/include/asm/irqflags.h:54
> RSP: 0018:ffff8801d9ae7c38 EFLAGS: 00000282 ORIG_RAX: ffffffffffffff13
> RAX: dffffc0000000000 RBX: 1ffff1003b35cf8a RCX: 0000000000000000
> RDX: 1ffffffff11a30d0 RSI: 0000000000000001 RDI: ffffffff88d18680
> RBP: ffff8801d9ae7c38 R08: ffffed003b5e46c3 R09: 0000000000000000
> R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000000001
> R13: ffff8801d9ae7cf0 R14: ffffffff897bef20 R15: 0000000000000000
>  arch_safe_halt arch/x86/include/asm/paravirt.h:94 [inline]
>  default_idle+0xc2/0x440 arch/x86/kernel/process.c:354
>  arch_cpu_idle+0x10/0x20 arch/x86/kernel/process.c:345
>  default_idle_call+0x6d/0x90 kernel/sched/idle.c:93
>  cpuidle_idle_call kernel/sched/idle.c:153 [inline]
>  do_idle+0x395/0x560 kernel/sched/idle.c:262
>  cpu_startup_entry+0x104/0x120 kernel/sched/idle.c:368
>  start_secondary+0x426/0x5b0 arch/x86/kernel/smpboot.c:269
>  secondary_startup_64+0xa5/0xb0 arch/x86/kernel/head_64.S:242
>
> Allocated by task 4557:
>  save_stack+0x43/0xd0 mm/kasan/kasan.c:448
>  set_track mm/kasan/kasan.c:460 [inline]
>  kasan_kmalloc+0xc4/0xe0 mm/kasan/kasan.c:553
>  kasan_slab_alloc+0x12/0x20 mm/kasan/kasan.c:490
>  kmem_cache_alloc+0x12e/0x760 mm/slab.c:3554
>  kmem_cache_zalloc include/linux/slab.h:691 [inline]
>  net_alloc net/core/net_namespace.c:383 [inline]
>  copy_net_ns+0x159/0x4c0 net/core/net_namespace.c:423
>  create_new_namespaces+0x69d/0x8f0 kernel/nsproxy.c:107
>  unshare_nsproxy_namespaces+0xc3/0x1f0 kernel/nsproxy.c:206
>  ksys_unshare+0x708/0xf90 kernel/fork.c:2408
>  __do_sys_unshare kernel/fork.c:2476 [inline]
>  __se_sys_unshare kernel/fork.c:2474 [inline]
>  __x64_sys_unshare+0x31/0x40 kernel/fork.c:2474
>  do_syscall_64+0x1b1/0x800 arch/x86/entry/common.c:287
>  entry_SYSCALL_64_after_hwframe+0x49/0xbe
>
> Freed by task 69:
>  save_stack+0x43/0xd0 mm/kasan/kasan.c:448
>  set_track mm/kasan/kasan.c:460 [inline]
>  __kasan_slab_free+0x11a/0x170 mm/kasan/kasan.c:521
>  kasan_slab_free+0xe/0x10 mm/kasan/kasan.c:528
>  __cache_free mm/slab.c:3498 [inline]
>  kmem_cache_free+0x86/0x2d0 mm/slab.c:3756
>  net_free net/core/net_namespace.c:399 [inline]
>  net_drop_ns.part.14+0x11a/0x130 net/core/net_namespace.c:406
>  net_drop_ns net/core/net_namespace.c:405 [inline]
>  cleanup_net+0x6a1/0xb20 net/core/net_namespace.c:541
>  process_one_work+0xc1e/0x1b50 kernel/workqueue.c:2145
>  worker_thread+0x1cc/0x1440 kernel/workqueue.c:2279
>  kthread+0x345/0x410 kernel/kthread.c:240
>  ret_from_fork+0x3a/0x50 arch/x86/entry/entry_64.S:412
>
> The buggy address belongs to the object at ffff88018a02c140
>  which belongs to the cache net_namespace of size 8832
> The buggy address is located 8800 bytes inside of
>  8832-byte region [ffff88018a02c140, ffff88018a02e3c0)
> The buggy address belongs to the page:
> page:ffffea0006280b00 count:1 mapcount:0 mapping:ffff88018a02c140 index:0x0 compound_mapcount: 0
> flags: 0x2fffc0000008100(slab|head)
> raw: 02fffc0000008100 ffff88018a02c140 0000000000000000 0000000100000001
> raw: ffffea00062a1320 ffffea0006268020 ffff8801d9bdde40 0000000000000000
> page dumped because: kasan: bad access detected
>
> Fixes: b922622ec6ef ("sock_diag: don't broadcast kernel sockets")
> Signed-off-by: Eric Dumazet <edumazet@google.com>
> Cc: Craig Gallek <kraig@google.com>
> Reported-by: syzbot <syzkaller@googlegroups.com>

Acked-by: Craig Gallek <kraig@google.com>

Thanks Eric!

^ permalink raw reply

* Re: [PATCH net-next] net:sched: add action inheritdsfield to skbmod
From: Jamal Hadi Salim @ 2018-05-18 13:06 UTC (permalink / raw)
  To: Fu, Qiaobin, davem@davemloft.net; +Cc: netdev@vger.kernel.org, Michel Machado
In-Reply-To: <2F042100-2BAC-48E5-887C-5D426B1D5A5B@bu.edu>

On 17/05/18 03:33 PM, Fu, Qiaobin wrote:
> net/sched: add action inheritdsfield to skbmod
> 
> The new action inheritdsfield copies the field DS of
> IPv4 and IPv6 packets into skb->prioriry. This enables
> later classification of packets based on the DS field.
> 
> Original idea by Jamal Hadi Salim <jhs@mojatatu.com>
> 
> Signed-off-by: Qiaobin Fu <qiaobinf@bu.edu>
> Reviewed-by: Michel Machado <michel@digirati.com.br>

LGTM. Thanks for putting the effort.
As a tradition we also require that you post the iproute2
patch and make sure this works. Can you please do that?
Also: If you can add at least a test on tdc it would help
immensely. Other than that:

Acked-by: Jamal Hadi Salim <jhs@mojatatu.com>

cheers,
jamal

^ permalink raw reply

* Re: [PATCH net] tuntap: raise EPOLLOUT on device up
From: Michael S. Tsirkin @ 2018-05-18 13:13 UTC (permalink / raw)
  To: Jason Wang; +Cc: netdev, linux-kernel, Hannes Frederic Sowa, Eric Dumazet
In-Reply-To: <1526648443-24128-1-git-send-email-jasowang@redhat.com>

On Fri, May 18, 2018 at 09:00:43PM +0800, Jason Wang wrote:
> We return -EIO on device down but can not raise EPOLLOUT after it was
> up. This may confuse user like vhost which expects tuntap to raise
> EPOLLOUT to re-enable its TX routine after tuntap is down. This could
> be easily reproduced by transmitting packets from VM while down and up
> the tap device. Fixing this by set SOCKWQ_ASYNC_NOSPACE on -EIO.
> 
> Cc: Hannes Frederic Sowa <hannes@stressinduktion.org>
> Cc: Eric Dumazet <edumazet@google.com>
> Fixes: 1bd4978a88ac2 ("tun: honor IFF_UP in tun_get_user()")
> Signed-off-by: Jason Wang <jasowang@redhat.com>
> ---
>  drivers/net/tun.c | 4 +++-
>  1 file changed, 3 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/net/tun.c b/drivers/net/tun.c
> index d45ac37..1b29761 100644
> --- a/drivers/net/tun.c
> +++ b/drivers/net/tun.c
> @@ -1734,8 +1734,10 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
>  	int skb_xdp = 1;
>  	bool frags = tun_napi_frags_enabled(tun);
>  
> -	if (!(tun->dev->flags & IFF_UP))
> +	if (!(tun->dev->flags & IFF_UP)) {

Isn't this racy?  What if flag is cleared at this point?

> +		set_bit(SOCKWQ_ASYNC_NOSPACE, &tfile->socket.flags);
>  		return -EIO;
> +	}
>  
>  	if (!(tun->flags & IFF_NO_PI)) {
>  		if (len < sizeof(pi))
> -- 
> 2.7.4

^ permalink raw reply

* Re: [RFC v4 3/5] virtio_ring: add packed ring support
From: Jason Wang @ 2018-05-18 13:17 UTC (permalink / raw)
  To: Tiwei Bie; +Cc: mst, virtualization, linux-kernel, netdev, wexu, jfreimann
In-Reply-To: <20180518112950.GA28224@debian>



On 2018年05月18日 19:29, Tiwei Bie wrote:
> On Thu, May 17, 2018 at 08:01:52PM +0800, Jason Wang wrote:
>> On 2018年05月16日 22:33, Tiwei Bie wrote:
>>> On Wed, May 16, 2018 at 10:05:44PM +0800, Jason Wang wrote:
>>>> On 2018年05月16日 21:45, Tiwei Bie wrote:
>>>>> On Wed, May 16, 2018 at 08:51:43PM +0800, Jason Wang wrote:
>>>>>> On 2018年05月16日 20:39, Tiwei Bie wrote:
>>>>>>> On Wed, May 16, 2018 at 07:50:16PM +0800, Jason Wang wrote:
>>>>>>>> On 2018年05月16日 16:37, Tiwei Bie wrote:
>>> [...]
>>>>>>>>> +static void detach_buf_packed(struct vring_virtqueue *vq, unsigned int head,
>>>>>>>>> +			      unsigned int id, void **ctx)
>>>>>>>>> +{
>>>>>>>>> +	struct vring_packed_desc *desc;
>>>>>>>>> +	unsigned int i, j;
>>>>>>>>> +
>>>>>>>>> +	/* Clear data ptr. */
>>>>>>>>> +	vq->desc_state[id].data = NULL;
>>>>>>>>> +
>>>>>>>>> +	i = head;
>>>>>>>>> +
>>>>>>>>> +	for (j = 0; j < vq->desc_state[id].num; j++) {
>>>>>>>>> +		desc = &vq->vring_packed.desc[i];
>>>>>>>>> +		vring_unmap_one_packed(vq, desc);
>>>>>>>> As mentioned in previous discussion, this probably won't work for the case
>>>>>>>> of out of order completion since it depends on the information in the
>>>>>>>> descriptor ring. We probably need to extend ctx to record such information.
>>>>>>> Above code doesn't depend on the information in the descriptor
>>>>>>> ring. The vq->desc_state[] is the extended ctx.
>>>>>>>
>>>>>>> Best regards,
>>>>>>> Tiwei Bie
>>>>>> Yes, but desc is a pointer to descriptor ring I think so
>>>>>> vring_unmap_one_packed() still depends on the content of descriptor ring?
>>>>>>
>>>>> I got your point now. I think it makes sense to reserve
>>>>> the bits of the addr field. Driver shouldn't try to get
>>>>> addrs from the descriptors when cleanup the descriptors
>>>>> no matter whether we support out-of-order or not.
>>>> Maybe I was wrong, but I remember spec mentioned something like this.
>>> You're right. Spec mentioned this. I was just repeating
>>> the spec to emphasize that it does make sense. :)
>>>
>>>>> But combining it with the out-of-order support, it will
>>>>> mean that the driver still needs to maintain a desc/ctx
>>>>> list that is very similar to the desc ring in the split
>>>>> ring. I'm not quite sure whether it's something we want.
>>>>> If it is true, I'll do it. So do you think we also want
>>>>> to maintain such a desc/ctx list for packed ring?
>>>> To make it work for OOO backends I think we need something like this
>>>> (hardware NIC drivers are usually have something like this).
>>> Which hardware NIC drivers have this?
>> It's quite common I think, e.g driver track e.g dma addr and page frag
>> somewhere. e.g the ring->rx_info in mlx4 driver.
> It seems that I had a misunderstanding on your
> previous comments. I know it's quite common for
> drivers to track e.g. DMA addrs somewhere (and
> I think one reason behind this is that they want
> to reuse the bits of addr field).

Yes, we may want this for virtio-net as well in the future.

>   But tracking
> addrs somewhere doesn't means supporting OOO.
> I thought you were saying it's quite common for
> hardware NIC drivers to support OOO (i.e. NICs
> will return the descriptors OOO):
>
> I'm not familiar with mlx4, maybe I'm wrong.
> I just had a quick glance. And I found below
> comments in mlx4_en_process_rx_cq():
>
> ```
> /* We assume a 1:1 mapping between CQEs and Rx descriptors, so Rx
>   * descriptor offset can be deduced from the CQE index instead of
>   * reading 'cqe->index' */
> index = cq->mcq.cons_index & ring->size_mask;
> cqe = mlx4_en_get_cqe(cq->buf, index, priv->cqe_size) + factor;
> ```
>
> It seems that although they have a completion
> queue, they are still using the ring in order.

I guess so (at least from the above bits). Git grep -i "out of order" in 
drivers/net gives some hints. Looks like there're few deivces do this.

> I guess maybe storage device may want OOO.

Right, some iSCSI did.

But tracking them elsewhere is not only for OOO.

Spec said:

for element address

"
In a used descriptor, Element Address is unused.
"

for Next flag:

"
For example, if descriptors are used in the same order in which they are 
made available, this will result in
the used descriptor overwriting the first available descriptor in the 
list, the used descriptor for the next list
overwriting the first available descriptor in the next list, etc.
"

for in order completion:

"
This will result in the used descriptor overwriting the first available 
descriptor in the batch, the used descriptor
for the next batch overwriting the first available descriptor in the 
next batch, etc.
"

So:

- It's an alignment to the spec
- device may (or should) overwrite the descriptor make also make address 
field useless.

Thanks

>
> Best regards,
> Tiwei Bie
>
>> Thanks
>>
>>>> Not for the patch, but it looks like having a OUT_OF_ORDER feature bit is
>>>> much more simpler to be started with.
>>> +1
>>>
>>> Best regards,
>>> Tiwei Bie

^ permalink raw reply

* VLAN performance issues with Open vSwitch when 8021q not loaded
From: Mikhail Sennikovsky @ 2018-05-18 13:24 UTC (permalink / raw)
  To: netdev; +Cc: Mikhail Sennikovskii

Hi all,

We were recently experiencing network performance issues with VLAN
networking setup with Open vSwitch, for the ingress traffic coming to
VLAN trunk port of the ovs.
As we discovered, the issue was caused by gro not working for it,
which in turn was because the gro receive callbacks for 802.1Q payload
type are defined in the 8021q module (see
https://elixir.bootlin.com/linux/v4.16.9/source/net/8021q/vlan.c#L762
), which was not loaded.
This resulted in a significant bandwidth performance drop, having
~3Gbps instead of the expected ~7Gbps for a simple iperf3 test in our
case.

The obvious work-around would be to load the 8021q module, which
indeed makes bandwidth performance back to the expected numbers.
This seems like a hidden and not obvious magic however.

So I'm questioning, whether it makes sense to have the gro receive
callbacks for 802.1Q and 802.1ad moved to some common place, that
would be used/enabled by both 8021q and openvswitch modules.

Regards,
Mikhail

^ permalink raw reply

* Re: [PATCH net] tuntap: raise EPOLLOUT on device up
From: Jason Wang @ 2018-05-18 13:26 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: netdev, linux-kernel, Hannes Frederic Sowa, Eric Dumazet
In-Reply-To: <20180518161253-mutt-send-email-mst@kernel.org>



On 2018年05月18日 21:13, Michael S. Tsirkin wrote:
> On Fri, May 18, 2018 at 09:00:43PM +0800, Jason Wang wrote:
>> We return -EIO on device down but can not raise EPOLLOUT after it was
>> up. This may confuse user like vhost which expects tuntap to raise
>> EPOLLOUT to re-enable its TX routine after tuntap is down. This could
>> be easily reproduced by transmitting packets from VM while down and up
>> the tap device. Fixing this by set SOCKWQ_ASYNC_NOSPACE on -EIO.
>>
>> Cc: Hannes Frederic Sowa <hannes@stressinduktion.org>
>> Cc: Eric Dumazet <edumazet@google.com>
>> Fixes: 1bd4978a88ac2 ("tun: honor IFF_UP in tun_get_user()")
>> Signed-off-by: Jason Wang <jasowang@redhat.com>
>> ---
>>   drivers/net/tun.c | 4 +++-
>>   1 file changed, 3 insertions(+), 1 deletion(-)
>>
>> diff --git a/drivers/net/tun.c b/drivers/net/tun.c
>> index d45ac37..1b29761 100644
>> --- a/drivers/net/tun.c
>> +++ b/drivers/net/tun.c
>> @@ -1734,8 +1734,10 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
>>   	int skb_xdp = 1;
>>   	bool frags = tun_napi_frags_enabled(tun);
>>   
>> -	if (!(tun->dev->flags & IFF_UP))
>> +	if (!(tun->dev->flags & IFF_UP)) {
> Isn't this racy?  What if flag is cleared at this point?

I think you mean "set at this point"? Then yes, so we probably need to 
set the bit during tun_net_close().

Thanks

>> +		set_bit(SOCKWQ_ASYNC_NOSPACE, &tfile->socket.flags);
>>   		return -EIO;
>> +	}
>>   
>>   	if (!(tun->flags & IFF_NO_PI)) {
>>   		if (len < sizeof(pi))
>> -- 
>> 2.7.4

^ permalink raw reply

* Re: [PATCH v3 2/2] bpf: add selftest for rawir_event type program
From: Sean Young @ 2018-05-18 13:33 UTC (permalink / raw)
  To: Quentin Monnet
  Cc: Y Song, linux-media, linux-kernel, Alexei Starovoitov,
	Mauro Carvalho Chehab, Daniel Borkmann, netdev, Matthias Reichl,
	Devin Heitmueller
In-Reply-To: <86ffb16c-9b4e-c826-ecd2-82266e7b8c2e@netronome.com>

On Fri, May 18, 2018 at 11:13:07AM +0100, Quentin Monnet wrote:
> 2018-05-17 22:01 UTC+0100 ~ Sean Young <sean@mess.org>
> > On Thu, May 17, 2018 at 10:17:59AM -0700, Y Song wrote:
> >> On Wed, May 16, 2018 at 2:04 PM, Sean Young <sean@mess.org> wrote:
> >>> This is simple test over rc-loopback.
> >>>
> >>> Signed-off-by: Sean Young <sean@mess.org>
> >>> ---
> >>>  tools/bpf/bpftool/prog.c                      |   1 +
> >>>  tools/include/uapi/linux/bpf.h                |  57 +++++++-
> >>>  tools/lib/bpf/libbpf.c                        |   1 +
> >>>  tools/testing/selftests/bpf/Makefile          |   8 +-
> >>>  tools/testing/selftests/bpf/bpf_helpers.h     |   6 +
> >>>  tools/testing/selftests/bpf/test_rawir.sh     |  37 +++++
> >>>  .../selftests/bpf/test_rawir_event_kern.c     |  26 ++++
> >>>  .../selftests/bpf/test_rawir_event_user.c     | 130 ++++++++++++++++++
> >>>  8 files changed, 261 insertions(+), 5 deletions(-)
> >>>  create mode 100755 tools/testing/selftests/bpf/test_rawir.sh
> >>>  create mode 100644 tools/testing/selftests/bpf/test_rawir_event_kern.c
> >>>  create mode 100644 tools/testing/selftests/bpf/test_rawir_event_user.c
> 
> [...]
> 
> >> Most people probably not really familiar with lircN device. It would be
> >> good to provide more information about how to enable this, e.g.,
> >>   CONFIG_RC_CORE=y
> >>   CONFIG_BPF_RAWIR_EVENT=y
> >>   CONFIG_RC_LOOPBACK=y
> >>   ......
> > 
> > Good point. I'll add some words explaining what is and how to make it work.
> > 
> > Thanks
> > Sean
> 
> 
> By the way, shouldn't the two eBPF helpers bpf_rc_keydown() and
> bpf_rc_repeat() be compiled out in patch 1 if e.g.
> CONFIG_BPF_RAWIR_EVENT is not set? There are some other helpers that are
> compiled only if relevant config options are set (bpf_get_xfrm_state()
> for example).

So if CONFIG_BPF_RAWIR_EVENT is not set, then bpf-rawir-event.c is not
compiled. Stubs are created in include/linux/bpf_rcdev.h, so this is
already the case if I understand your correctly.
 
> (If you were to change that, please also update helper documentations to
> indicate what configuration options are required to be able to use the
> helpers.)

Ok, I'll add that.

Thanks again!

Sean

^ permalink raw reply

* [bpf-next V4 PATCH 0/8] xdp: introduce bulking for ndo_xdp_xmit API
From: Jesper Dangaard Brouer @ 2018-05-18 13:34 UTC (permalink / raw)
  To: netdev, Daniel Borkmann, Alexei Starovoitov,
	Jesper Dangaard Brouer
  Cc: Christoph Hellwig, BjörnTöpel, Magnus Karlsson,
	makita.toshiaki

This patchset change ndo_xdp_xmit API to take a bulk of xdp frames.

In this V4 patchset, I've split-out the patches from 4 to 8 patches.
I cannot split the driver changes from the NDO change, but I've tried
to isolated the NDO change together with the driver change as much as
possible.

When kernel is compiled with CONFIG_RETPOLINE, every indirect function
pointer (branch) call hurts performance. For XDP this have a huge
negative performance impact.

This patchset reduce the needed (indirect) calls to ndo_xdp_xmit, but
also prepares for further optimizations.  The DMA APIs use of indirect
function pointer calls is the primary source the regression.  It is
left for a followup patchset, to use bulking calls towards the DMA API
(via the scatter-gatter calls).

The other advantage of this API change is that drivers can easier
amortize the cost of any sync/locking scheme, over the bulk of
packets.  The assumption of the current API is that the driver
implemementing the NDO will also allocate a dedicated XDP TX queue for
every CPU in the system.  Which is not always possible or practical to
configure. E.g. ixgbe cannot load an XDP program on a machine with
more than 96 CPUs, due to limited hardware TX queues.  E.g. virtio_net
is hard to configure as it requires manually increasing the
queues. E.g. tun driver chooses to use a per XDP frame producer lock
modulo smp_processor_id over avail queues.

I'm considered adding 'flags' to ndo_xdp_xmit, but it's not part of
this patchset.  This will be a followup patchset, once we know if this
will be needed (e.g. for non-map xdp_redirect flush-flag, and if
AF_XDP chooses to use ndo_xdp_xmit for TX).

---

Jesper Dangaard Brouer (8):
      bpf: devmap introduce dev_map_enqueue
      bpf: devmap prepare xdp frames for bulking
      xdp: add tracepoint for devmap like cpumap have
      samples/bpf: xdp_monitor use tracepoint xdp:xdp_devmap_xmit
      xdp: introduce xdp_return_frame_rx_napi
      xdp: change ndo_xdp_xmit API to support bulking
      xdp/trace: extend tracepoint in devmap with an err
      samples/bpf: xdp_monitor use err code from tracepoint xdp:xdp_devmap_xmit

 drivers/net/ethernet/intel/i40e/i40e_txrx.c   |   26 ++++-
 drivers/net/ethernet/intel/i40e/i40e_txrx.h   |    2 
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c |   21 +++-
 drivers/net/tun.c                             |   37 ++++---
 drivers/net/virtio_net.c                      |   66 +++++++++---
 include/linux/bpf.h                           |   16 ++-
 include/linux/netdevice.h                     |   14 ++-
 include/net/page_pool.h                       |    5 +
 include/net/xdp.h                             |    1 
 include/trace/events/xdp.h                    |   50 +++++++++
 kernel/bpf/cpumap.c                           |    2 
 kernel/bpf/devmap.c                           |  134 ++++++++++++++++++++++++-
 net/core/filter.c                             |   23 +---
 net/core/xdp.c                                |   20 +++-
 samples/bpf/xdp_monitor_kern.c                |   49 +++++++++
 samples/bpf/xdp_monitor_user.c                |   69 +++++++++++++
 16 files changed, 449 insertions(+), 86 deletions(-)

^ permalink raw reply

* [bpf-next V4 PATCH 1/8] bpf: devmap introduce dev_map_enqueue
From: Jesper Dangaard Brouer @ 2018-05-18 13:34 UTC (permalink / raw)
  To: netdev, Daniel Borkmann, Alexei Starovoitov,
	Jesper Dangaard Brouer
  Cc: Christoph Hellwig, BjörnTöpel, Magnus Karlsson,
	makita.toshiaki
In-Reply-To: <152665044141.21055.1276346542020340263.stgit@firesoul>

Functionality is the same, but the ndo_xdp_xmit call is now
simply invoked from inside the devmap.c code.

V2: Fix compile issue reported by kbuild test robot <lkp@intel.com>

Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
---
 include/linux/bpf.h        |   14 +++++++++++---
 include/trace/events/xdp.h |    9 ++++++++-
 kernel/bpf/devmap.c        |   37 +++++++++++++++++++++++++++++++------
 net/core/filter.c          |   15 ++-------------
 4 files changed, 52 insertions(+), 23 deletions(-)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index ed0122b45b63..fc1459bdcafc 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -485,14 +485,15 @@ int bpf_check(struct bpf_prog **fp, union bpf_attr *attr);
 void bpf_patch_call_args(struct bpf_insn *insn, u32 stack_depth);
 
 /* Map specifics */
-struct net_device  *__dev_map_lookup_elem(struct bpf_map *map, u32 key);
+struct xdp_buff;
+struct bpf_dtab_netdev *__dev_map_lookup_elem(struct bpf_map *map, u32 key);
 void __dev_map_insert_ctx(struct bpf_map *map, u32 index);
 void __dev_map_flush(struct bpf_map *map);
+int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp);
 
 struct bpf_cpu_map_entry *__cpu_map_lookup_elem(struct bpf_map *map, u32 key);
 void __cpu_map_insert_ctx(struct bpf_map *map, u32 index);
 void __cpu_map_flush(struct bpf_map *map);
-struct xdp_buff;
 int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_buff *xdp,
 		    struct net_device *dev_rx);
 
@@ -571,6 +572,14 @@ static inline void __dev_map_flush(struct bpf_map *map)
 {
 }
 
+struct xdp_buff;
+struct bpf_dtab_netdev;
+static inline
+int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp)
+{
+	return 0;
+}
+
 static inline
 struct bpf_cpu_map_entry *__cpu_map_lookup_elem(struct bpf_map *map, u32 key)
 {
@@ -585,7 +594,6 @@ static inline void __cpu_map_flush(struct bpf_map *map)
 {
 }
 
-struct xdp_buff;
 static inline int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu,
 				  struct xdp_buff *xdp,
 				  struct net_device *dev_rx)
diff --git a/include/trace/events/xdp.h b/include/trace/events/xdp.h
index 8989a92c571a..96104610d40e 100644
--- a/include/trace/events/xdp.h
+++ b/include/trace/events/xdp.h
@@ -138,11 +138,18 @@ DEFINE_EVENT_PRINT(xdp_redirect_template, xdp_redirect_map_err,
 		  __entry->map_id, __entry->map_index)
 );
 
+#ifndef __DEVMAP_OBJ_TYPE
+#define __DEVMAP_OBJ_TYPE
+struct _bpf_dtab_netdev {
+	struct net_device *dev;
+};
+#endif /* __DEVMAP_OBJ_TYPE */
+
 #define devmap_ifindex(fwd, map)				\
 	(!fwd ? 0 :						\
 	 (!map ? 0 :						\
 	  ((map->map_type == BPF_MAP_TYPE_DEVMAP) ?		\
-	   ((struct net_device *)fwd)->ifindex : 0)))
+	   ((struct _bpf_dtab_netdev *)fwd)->dev->ifindex : 0)))
 
 #define _trace_xdp_redirect_map(dev, xdp, fwd, map, idx)		\
 	 trace_xdp_redirect_map(dev, xdp, devmap_ifindex(fwd, map),	\
diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
index 565f9ece9115..808808bf2bf2 100644
--- a/kernel/bpf/devmap.c
+++ b/kernel/bpf/devmap.c
@@ -48,18 +48,21 @@
  * calls will fail at this point.
  */
 #include <linux/bpf.h>
+#include <net/xdp.h>
 #include <linux/filter.h>
 
 #define DEV_CREATE_FLAG_MASK \
 	(BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY)
 
+/* objects in the map */
 struct bpf_dtab_netdev {
-	struct net_device *dev;
+	struct net_device *dev; /* must be first member, due to tracepoint */
 	struct bpf_dtab *dtab;
 	unsigned int bit;
 	struct rcu_head rcu;
 };
 
+/* bpf map container */
 struct bpf_dtab {
 	struct bpf_map map;
 	struct bpf_dtab_netdev **netdev_map;
@@ -240,21 +243,43 @@ void __dev_map_flush(struct bpf_map *map)
  * update happens in parallel here a dev_put wont happen until after reading the
  * ifindex.
  */
-struct net_device  *__dev_map_lookup_elem(struct bpf_map *map, u32 key)
+struct bpf_dtab_netdev *__dev_map_lookup_elem(struct bpf_map *map, u32 key)
 {
 	struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
-	struct bpf_dtab_netdev *dev;
+	struct bpf_dtab_netdev *obj;
 
 	if (key >= map->max_entries)
 		return NULL;
 
-	dev = READ_ONCE(dtab->netdev_map[key]);
-	return dev ? dev->dev : NULL;
+	obj = READ_ONCE(dtab->netdev_map[key]);
+	return obj;
+}
+
+int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp)
+{
+	struct net_device *dev = dst->dev;
+	struct xdp_frame *xdpf;
+	int err;
+
+	if (!dev->netdev_ops->ndo_xdp_xmit)
+		return -EOPNOTSUPP;
+
+	xdpf = convert_to_xdp_frame(xdp);
+	if (unlikely(!xdpf))
+		return -EOVERFLOW;
+
+	/* TODO: implement a bulking/enqueue step later */
+	err = dev->netdev_ops->ndo_xdp_xmit(dev, xdpf);
+	if (err)
+		return err;
+
+	return 0;
 }
 
 static void *dev_map_lookup_elem(struct bpf_map *map, void *key)
 {
-	struct net_device *dev = __dev_map_lookup_elem(map, *(u32 *)key);
+	struct bpf_dtab_netdev *obj = __dev_map_lookup_elem(map, *(u32 *)key);
+	struct net_device *dev = dev = obj ? obj->dev : NULL;
 
 	return dev ? &dev->ifindex : NULL;
 }
diff --git a/net/core/filter.c b/net/core/filter.c
index 6d0d1560bd70..1447ec94ef74 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -3061,20 +3061,9 @@ static int __bpf_tx_xdp_map(struct net_device *dev_rx, void *fwd,
 
 	switch (map->map_type) {
 	case BPF_MAP_TYPE_DEVMAP: {
-		struct net_device *dev = fwd;
-		struct xdp_frame *xdpf;
+		struct bpf_dtab_netdev *dst = fwd;
 
-		if (!dev->netdev_ops->ndo_xdp_xmit)
-			return -EOPNOTSUPP;
-
-		xdpf = convert_to_xdp_frame(xdp);
-		if (unlikely(!xdpf))
-			return -EOVERFLOW;
-
-		/* TODO: move to inside map code instead, for bulk support
-		 * err = dev_map_enqueue(dev, xdp);
-		 */
-		err = dev->netdev_ops->ndo_xdp_xmit(dev, xdpf);
+		err = dev_map_enqueue(dst, xdp);
 		if (err)
 			return err;
 		__dev_map_insert_ctx(map, index);

^ permalink raw reply related

* [bpf-next V4 PATCH 2/8] bpf: devmap prepare xdp frames for bulking
From: Jesper Dangaard Brouer @ 2018-05-18 13:34 UTC (permalink / raw)
  To: netdev, Daniel Borkmann, Alexei Starovoitov,
	Jesper Dangaard Brouer
  Cc: Christoph Hellwig, BjörnTöpel, Magnus Karlsson,
	makita.toshiaki
In-Reply-To: <152665044141.21055.1276346542020340263.stgit@firesoul>

Like cpumap create queue for xdp frames that will be bulked.  For now,
this patch simply invoke ndo_xdp_xmit foreach frame.  This happens,
either when the map flush operation is envoked, or when the limit
DEV_MAP_BULK_SIZE is reached.

Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
---
 kernel/bpf/devmap.c |   77 ++++++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 73 insertions(+), 4 deletions(-)

diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
index 808808bf2bf2..cab72c100bb5 100644
--- a/kernel/bpf/devmap.c
+++ b/kernel/bpf/devmap.c
@@ -54,11 +54,18 @@
 #define DEV_CREATE_FLAG_MASK \
 	(BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY)
 
+#define DEV_MAP_BULK_SIZE 16
+struct xdp_bulk_queue {
+	struct xdp_frame *q[DEV_MAP_BULK_SIZE];
+	unsigned int count;
+};
+
 /* objects in the map */
 struct bpf_dtab_netdev {
 	struct net_device *dev; /* must be first member, due to tracepoint */
 	struct bpf_dtab *dtab;
 	unsigned int bit;
+	struct xdp_bulk_queue __percpu *bulkq;
 	struct rcu_head rcu;
 };
 
@@ -209,6 +216,38 @@ void __dev_map_insert_ctx(struct bpf_map *map, u32 bit)
 	__set_bit(bit, bitmap);
 }
 
+static int bq_xmit_all(struct bpf_dtab_netdev *obj,
+			 struct xdp_bulk_queue *bq)
+{
+	unsigned int processed = 0, drops = 0;
+	struct net_device *dev = obj->dev;
+	int i;
+
+	if (unlikely(!bq->count))
+		return 0;
+
+	for (i = 0; i < bq->count; i++) {
+		struct xdp_frame *xdpf = bq->q[i];
+
+		prefetch(xdpf);
+	}
+
+	for (i = 0; i < bq->count; i++) {
+		struct xdp_frame *xdpf = bq->q[i];
+		int err;
+
+		err = dev->netdev_ops->ndo_xdp_xmit(dev, xdpf);
+		if (err) {
+			drops++;
+			xdp_return_frame(xdpf);
+		}
+		processed++;
+	}
+	bq->count = 0;
+
+	return 0;
+}
+
 /* __dev_map_flush is called from xdp_do_flush_map() which _must_ be signaled
  * from the driver before returning from its napi->poll() routine. The poll()
  * routine is called either from busy_poll context or net_rx_action signaled
@@ -224,6 +263,7 @@ void __dev_map_flush(struct bpf_map *map)
 
 	for_each_set_bit(bit, bitmap, map->max_entries) {
 		struct bpf_dtab_netdev *dev = READ_ONCE(dtab->netdev_map[bit]);
+		struct xdp_bulk_queue *bq;
 		struct net_device *netdev;
 
 		/* This is possible if the dev entry is removed by user space
@@ -233,6 +273,9 @@ void __dev_map_flush(struct bpf_map *map)
 			continue;
 
 		__clear_bit(bit, bitmap);
+
+		bq = this_cpu_ptr(dev->bulkq);
+		bq_xmit_all(dev, bq);
 		netdev = dev->dev;
 		if (likely(netdev->netdev_ops->ndo_xdp_flush))
 			netdev->netdev_ops->ndo_xdp_flush(netdev);
@@ -255,6 +298,20 @@ struct bpf_dtab_netdev *__dev_map_lookup_elem(struct bpf_map *map, u32 key)
 	return obj;
 }
 
+/* Runs under RCU-read-side, plus in softirq under NAPI protection.
+ * Thus, safe percpu variable access.
+ */
+static int bq_enqueue(struct bpf_dtab_netdev *obj, struct xdp_frame *xdpf)
+{
+	struct xdp_bulk_queue *bq = this_cpu_ptr(obj->bulkq);
+
+	if (unlikely(bq->count == DEV_MAP_BULK_SIZE))
+		bq_xmit_all(obj, bq);
+
+	bq->q[bq->count++] = xdpf;
+	return 0;
+}
+
 int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp)
 {
 	struct net_device *dev = dst->dev;
@@ -268,8 +325,7 @@ int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp)
 	if (unlikely(!xdpf))
 		return -EOVERFLOW;
 
-	/* TODO: implement a bulking/enqueue step later */
-	err = dev->netdev_ops->ndo_xdp_xmit(dev, xdpf);
+	err = bq_enqueue(dst, xdpf);
 	if (err)
 		return err;
 
@@ -288,13 +344,18 @@ static void dev_map_flush_old(struct bpf_dtab_netdev *dev)
 {
 	if (dev->dev->netdev_ops->ndo_xdp_flush) {
 		struct net_device *fl = dev->dev;
+		struct xdp_bulk_queue *bq;
 		unsigned long *bitmap;
+
 		int cpu;
 
 		for_each_online_cpu(cpu) {
 			bitmap = per_cpu_ptr(dev->dtab->flush_needed, cpu);
 			__clear_bit(dev->bit, bitmap);
 
+			bq = per_cpu_ptr(dev->bulkq, cpu);
+			bq_xmit_all(dev, bq);
+
 			fl->netdev_ops->ndo_xdp_flush(dev->dev);
 		}
 	}
@@ -306,6 +367,7 @@ static void __dev_map_entry_free(struct rcu_head *rcu)
 
 	dev = container_of(rcu, struct bpf_dtab_netdev, rcu);
 	dev_map_flush_old(dev);
+	free_percpu(dev->bulkq);
 	dev_put(dev->dev);
 	kfree(dev);
 }
@@ -338,6 +400,7 @@ static int dev_map_update_elem(struct bpf_map *map, void *key, void *value,
 {
 	struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
 	struct net *net = current->nsproxy->net_ns;
+	gfp_t gfp = GFP_ATOMIC | __GFP_NOWARN;
 	struct bpf_dtab_netdev *dev, *old_dev;
 	u32 i = *(u32 *)key;
 	u32 ifindex = *(u32 *)value;
@@ -352,11 +415,17 @@ static int dev_map_update_elem(struct bpf_map *map, void *key, void *value,
 	if (!ifindex) {
 		dev = NULL;
 	} else {
-		dev = kmalloc_node(sizeof(*dev), GFP_ATOMIC | __GFP_NOWARN,
-				   map->numa_node);
+		dev = kmalloc_node(sizeof(*dev), gfp, map->numa_node);
 		if (!dev)
 			return -ENOMEM;
 
+		dev->bulkq = __alloc_percpu_gfp(sizeof(*dev->bulkq),
+						sizeof(void *), gfp);
+		if (!dev->bulkq) {
+			kfree(dev);
+			return -ENOMEM;
+		}
+
 		dev->dev = dev_get_by_index(net, ifindex);
 		if (!dev->dev) {
 			kfree(dev);

^ permalink raw reply related

* [bpf-next V4 PATCH 3/8] xdp: add tracepoint for devmap like cpumap have
From: Jesper Dangaard Brouer @ 2018-05-18 13:34 UTC (permalink / raw)
  To: netdev, Daniel Borkmann, Alexei Starovoitov,
	Jesper Dangaard Brouer
  Cc: Christoph Hellwig, BjörnTöpel, Magnus Karlsson,
	makita.toshiaki
In-Reply-To: <152665044141.21055.1276346542020340263.stgit@firesoul>

Notice how this allow us get XDP statistic without affecting the XDP
performance, as tracepoint is no-longer activated on a per packet basis.

Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
---
 include/linux/bpf.h        |    6 ++++--
 include/trace/events/xdp.h |   39 +++++++++++++++++++++++++++++++++++++++
 kernel/bpf/devmap.c        |   25 ++++++++++++++++++++-----
 net/core/filter.c          |    2 +-
 4 files changed, 64 insertions(+), 8 deletions(-)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index fc1459bdcafc..ca7110b81793 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -489,7 +489,8 @@ struct xdp_buff;
 struct bpf_dtab_netdev *__dev_map_lookup_elem(struct bpf_map *map, u32 key);
 void __dev_map_insert_ctx(struct bpf_map *map, u32 index);
 void __dev_map_flush(struct bpf_map *map);
-int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp);
+int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp,
+		    struct net_device *dev_rx);
 
 struct bpf_cpu_map_entry *__cpu_map_lookup_elem(struct bpf_map *map, u32 key);
 void __cpu_map_insert_ctx(struct bpf_map *map, u32 index);
@@ -575,7 +576,8 @@ static inline void __dev_map_flush(struct bpf_map *map)
 struct xdp_buff;
 struct bpf_dtab_netdev;
 static inline
-int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp)
+int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp,
+		    struct net_device *dev_rx)
 {
 	return 0;
 }
diff --git a/include/trace/events/xdp.h b/include/trace/events/xdp.h
index 96104610d40e..2e9ef0650144 100644
--- a/include/trace/events/xdp.h
+++ b/include/trace/events/xdp.h
@@ -229,6 +229,45 @@ TRACE_EVENT(xdp_cpumap_enqueue,
 		  __entry->to_cpu)
 );
 
+TRACE_EVENT(xdp_devmap_xmit,
+
+	TP_PROTO(const struct bpf_map *map, u32 map_index,
+		 int sent, int drops,
+		 const struct net_device *from_dev,
+		 const struct net_device *to_dev),
+
+	TP_ARGS(map, map_index, sent, drops, from_dev, to_dev),
+
+	TP_STRUCT__entry(
+		__field(int, map_id)
+		__field(u32, act)
+		__field(u32, map_index)
+		__field(int, drops)
+		__field(int, sent)
+		__field(int, from_ifindex)
+		__field(int, to_ifindex)
+	),
+
+	TP_fast_assign(
+		__entry->map_id		= map->id;
+		__entry->act		= XDP_REDIRECT;
+		__entry->map_index	= map_index;
+		__entry->drops		= drops;
+		__entry->sent		= sent;
+		__entry->from_ifindex	= from_dev->ifindex;
+		__entry->to_ifindex	= to_dev->ifindex;
+	),
+
+	TP_printk("ndo_xdp_xmit"
+		  " map_id=%d map_index=%d action=%s"
+		  " sent=%d drops=%d"
+		  " from_ifindex=%d to_ifindex=%d",
+		  __entry->map_id, __entry->map_index,
+		  __print_symbolic(__entry->act, __XDP_ACT_SYM_TAB),
+		  __entry->sent, __entry->drops,
+		  __entry->from_ifindex, __entry->to_ifindex)
+);
+
 #endif /* _TRACE_XDP_H */
 
 #include <trace/define_trace.h>
diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
index cab72c100bb5..6f84100723b0 100644
--- a/kernel/bpf/devmap.c
+++ b/kernel/bpf/devmap.c
@@ -50,6 +50,7 @@
 #include <linux/bpf.h>
 #include <net/xdp.h>
 #include <linux/filter.h>
+#include <trace/events/xdp.h>
 
 #define DEV_CREATE_FLAG_MASK \
 	(BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY)
@@ -57,6 +58,7 @@
 #define DEV_MAP_BULK_SIZE 16
 struct xdp_bulk_queue {
 	struct xdp_frame *q[DEV_MAP_BULK_SIZE];
+	struct net_device *dev_rx;
 	unsigned int count;
 };
 
@@ -219,8 +221,8 @@ void __dev_map_insert_ctx(struct bpf_map *map, u32 bit)
 static int bq_xmit_all(struct bpf_dtab_netdev *obj,
 			 struct xdp_bulk_queue *bq)
 {
-	unsigned int processed = 0, drops = 0;
 	struct net_device *dev = obj->dev;
+	int sent = 0, drops = 0;
 	int i;
 
 	if (unlikely(!bq->count))
@@ -241,10 +243,13 @@ static int bq_xmit_all(struct bpf_dtab_netdev *obj,
 			drops++;
 			xdp_return_frame(xdpf);
 		}
-		processed++;
+		sent++;
 	}
 	bq->count = 0;
 
+	trace_xdp_devmap_xmit(&obj->dtab->map, obj->bit,
+			      sent, drops, bq->dev_rx, dev);
+	bq->dev_rx = NULL;
 	return 0;
 }
 
@@ -301,18 +306,28 @@ struct bpf_dtab_netdev *__dev_map_lookup_elem(struct bpf_map *map, u32 key)
 /* Runs under RCU-read-side, plus in softirq under NAPI protection.
  * Thus, safe percpu variable access.
  */
-static int bq_enqueue(struct bpf_dtab_netdev *obj, struct xdp_frame *xdpf)
+static int bq_enqueue(struct bpf_dtab_netdev *obj, struct xdp_frame *xdpf,
+		      struct net_device *dev_rx)
+
 {
 	struct xdp_bulk_queue *bq = this_cpu_ptr(obj->bulkq);
 
 	if (unlikely(bq->count == DEV_MAP_BULK_SIZE))
 		bq_xmit_all(obj, bq);
 
+	/* Ingress dev_rx will be the same for all xdp_frame's in
+	 * bulk_queue, because bq stored per-CPU and must be flushed
+	 * from net_device drivers NAPI func end.
+	 */
+	if (!bq->dev_rx)
+		bq->dev_rx = dev_rx;
+
 	bq->q[bq->count++] = xdpf;
 	return 0;
 }
 
-int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp)
+int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp,
+		    struct net_device *dev_rx)
 {
 	struct net_device *dev = dst->dev;
 	struct xdp_frame *xdpf;
@@ -325,7 +340,7 @@ int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp)
 	if (unlikely(!xdpf))
 		return -EOVERFLOW;
 
-	err = bq_enqueue(dst, xdpf);
+	err = bq_enqueue(dst, xdpf, dev_rx);
 	if (err)
 		return err;
 
diff --git a/net/core/filter.c b/net/core/filter.c
index 1447ec94ef74..4a93423cc5ea 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -3063,7 +3063,7 @@ static int __bpf_tx_xdp_map(struct net_device *dev_rx, void *fwd,
 	case BPF_MAP_TYPE_DEVMAP: {
 		struct bpf_dtab_netdev *dst = fwd;
 
-		err = dev_map_enqueue(dst, xdp);
+		err = dev_map_enqueue(dst, xdp, dev_rx);
 		if (err)
 			return err;
 		__dev_map_insert_ctx(map, index);

^ permalink raw reply related

* [bpf-next V4 PATCH 4/8] samples/bpf: xdp_monitor use tracepoint xdp:xdp_devmap_xmit
From: Jesper Dangaard Brouer @ 2018-05-18 13:34 UTC (permalink / raw)
  To: netdev, Daniel Borkmann, Alexei Starovoitov,
	Jesper Dangaard Brouer
  Cc: Christoph Hellwig, BjörnTöpel, Magnus Karlsson,
	makita.toshiaki
In-Reply-To: <152665044141.21055.1276346542020340263.stgit@firesoul>

The xdp_monitor sample/tool is updated to use the new tracepoint
xdp:xdp_devmap_xmit the previous patch just introduced.

Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
---
 samples/bpf/xdp_monitor_kern.c |   39 +++++++++++++++++++++++++++++++++++
 samples/bpf/xdp_monitor_user.c |   44 +++++++++++++++++++++++++++++++++++++++-
 2 files changed, 82 insertions(+), 1 deletion(-)

diff --git a/samples/bpf/xdp_monitor_kern.c b/samples/bpf/xdp_monitor_kern.c
index 211db8ded0de..2854aa0665ea 100644
--- a/samples/bpf/xdp_monitor_kern.c
+++ b/samples/bpf/xdp_monitor_kern.c
@@ -208,3 +208,42 @@ int trace_xdp_cpumap_kthread(struct cpumap_kthread_ctx *ctx)
 
 	return 0;
 }
+
+struct bpf_map_def SEC("maps") devmap_xmit_cnt = {
+	.type		= BPF_MAP_TYPE_PERCPU_ARRAY,
+	.key_size	= sizeof(u32),
+	.value_size	= sizeof(struct datarec),
+	.max_entries	= 1,
+};
+
+/* Tracepoint: /sys/kernel/debug/tracing/events/xdp/xdp_devmap_xmit/format
+ * Code in:         kernel/include/trace/events/xdp.h
+ */
+struct devmap_xmit_ctx {
+	u64 __pad;		// First 8 bytes are not accessible by bpf code
+	int map_id;		//	offset:8;  size:4; signed:1;
+	u32 act;		//	offset:12; size:4; signed:0;
+	u32 map_index;		//	offset:16; size:4; signed:0;
+	int drops;		//	offset:20; size:4; signed:1;
+	int sent;		//	offset:24; size:4; signed:1;
+	int from_ifindex;	//	offset:28; size:4; signed:1;
+	int to_ifindex;		//	offset:32; size:4; signed:1;
+};
+
+SEC("tracepoint/xdp/xdp_devmap_xmit")
+int trace_xdp_devmap_xmit(struct devmap_xmit_ctx *ctx)
+{
+	struct datarec *rec;
+	u32 key = 0;
+
+	rec = bpf_map_lookup_elem(&devmap_xmit_cnt, &key);
+	if (!rec)
+		return 0;
+	rec->processed += ctx->sent;
+	rec->dropped   += ctx->drops;
+
+	/* Record bulk events, then userspace can calc average bulk size */
+	rec->info += 1;
+
+	return 1;
+}
diff --git a/samples/bpf/xdp_monitor_user.c b/samples/bpf/xdp_monitor_user.c
index bf09b5188acd..7e18a454924c 100644
--- a/samples/bpf/xdp_monitor_user.c
+++ b/samples/bpf/xdp_monitor_user.c
@@ -141,6 +141,7 @@ struct stats_record {
 	struct record_u64 xdp_exception[XDP_ACTION_MAX];
 	struct record xdp_cpumap_kthread;
 	struct record xdp_cpumap_enqueue[MAX_CPUS];
+	struct record xdp_devmap_xmit;
 };
 
 static bool map_collect_record(int fd, __u32 key, struct record *rec)
@@ -397,7 +398,7 @@ static void stats_print(struct stats_record *stats_rec,
 			info = calc_info(r, p, t);
 			if (info > 0)
 				i_str = "sched";
-			if (pps > 0)
+			if (pps > 0 || drop > 0)
 				printf(fmt1, "cpumap-kthread",
 				       i, pps, drop, info, i_str);
 		}
@@ -409,6 +410,42 @@ static void stats_print(struct stats_record *stats_rec,
 		printf(fmt2, "cpumap-kthread", "total", pps, drop, info, i_str);
 	}
 
+	/* devmap ndo_xdp_xmit stats */
+	{
+		char *fmt1 = "%-15s %-7d %'-12.0f %'-12.0f %'-10.2f %s\n";
+		char *fmt2 = "%-15s %-7s %'-12.0f %'-12.0f %'-10.2f %s\n";
+		struct record *rec, *prev;
+		double drop, info;
+		char *i_str = "";
+
+		rec  =  &stats_rec->xdp_devmap_xmit;
+		prev = &stats_prev->xdp_devmap_xmit;
+		t = calc_period(rec, prev);
+		for (i = 0; i < nr_cpus; i++) {
+			struct datarec *r = &rec->cpu[i];
+			struct datarec *p = &prev->cpu[i];
+
+			pps  = calc_pps(r, p, t);
+			drop = calc_drop(r, p, t);
+			info = calc_info(r, p, t);
+			if (info > 0) {
+				i_str = "bulk-average";
+				info = (pps+drop) / info; /* calc avg bulk */
+			}
+			if (pps > 0 || drop > 0)
+				printf(fmt1, "devmap-xmit",
+				       i, pps, drop, info, i_str);
+		}
+		pps = calc_pps(&rec->total, &prev->total, t);
+		drop = calc_drop(&rec->total, &prev->total, t);
+		info = calc_info(&rec->total, &prev->total, t);
+		if (info > 0) {
+			i_str = "bulk-average";
+			info = (pps+drop) / info; /* calc avg bulk */
+		}
+		printf(fmt2, "devmap-xmit", "total", pps, drop, info, i_str);
+	}
+
 	printf("\n");
 }
 
@@ -437,6 +474,9 @@ static bool stats_collect(struct stats_record *rec)
 	fd = map_data[3].fd; /* map3: cpumap_kthread_cnt */
 	map_collect_record(fd, 0, &rec->xdp_cpumap_kthread);
 
+	fd = map_data[4].fd; /* map4: devmap_xmit_cnt */
+	map_collect_record(fd, 0, &rec->xdp_devmap_xmit);
+
 	return true;
 }
 
@@ -480,6 +520,7 @@ static struct stats_record *alloc_stats_record(void)
 
 	rec_sz = sizeof(struct datarec);
 	rec->xdp_cpumap_kthread.cpu = alloc_rec_per_cpu(rec_sz);
+	rec->xdp_devmap_xmit.cpu    = alloc_rec_per_cpu(rec_sz);
 
 	for (i = 0; i < MAX_CPUS; i++)
 		rec->xdp_cpumap_enqueue[i].cpu = alloc_rec_per_cpu(rec_sz);
@@ -498,6 +539,7 @@ static void free_stats_record(struct stats_record *r)
 		free(r->xdp_exception[i].cpu);
 
 	free(r->xdp_cpumap_kthread.cpu);
+	free(r->xdp_devmap_xmit.cpu);
 
 	for (i = 0; i < MAX_CPUS; i++)
 		free(r->xdp_cpumap_enqueue[i].cpu);

^ permalink raw reply related

* [bpf-next V4 PATCH 5/8] xdp: introduce xdp_return_frame_rx_napi
From: Jesper Dangaard Brouer @ 2018-05-18 13:34 UTC (permalink / raw)
  To: netdev, Daniel Borkmann, Alexei Starovoitov,
	Jesper Dangaard Brouer
  Cc: Christoph Hellwig, BjörnTöpel, Magnus Karlsson,
	makita.toshiaki
In-Reply-To: <152665044141.21055.1276346542020340263.stgit@firesoul>

When sending an xdp_frame through xdp_do_redirect call, then error
cases can happen where the xdp_frame needs to be dropped, and
returning an -errno code isn't sufficient/possible any-longer
(e.g. for cpumap case). This is already fully supported, by simply
calling xdp_return_frame.

This patch is an optimization, which provides xdp_return_frame_rx_napi,
which is a faster variant for these error cases.  It take advantage of
the protection provided by XDP RX running under NAPI protection.

This change is mostly relevant for drivers using the page_pool
allocator as it can take advantage of this. (Tested with mlx5).
---
 include/net/page_pool.h |    5 +++--
 include/net/xdp.h       |    1 +
 kernel/bpf/cpumap.c     |    2 +-
 net/core/xdp.c          |   20 ++++++++++++++++----
 4 files changed, 21 insertions(+), 7 deletions(-)

diff --git a/include/net/page_pool.h b/include/net/page_pool.h
index c79087153148..694d055e01ef 100644
--- a/include/net/page_pool.h
+++ b/include/net/page_pool.h
@@ -115,13 +115,14 @@ void page_pool_destroy(struct page_pool *pool);
 void __page_pool_put_page(struct page_pool *pool,
 			  struct page *page, bool allow_direct);
 
-static inline void page_pool_put_page(struct page_pool *pool, struct page *page)
+static inline void page_pool_put_page(struct page_pool *pool,
+				      struct page *page, bool allow_direct)
 {
 	/* When page_pool isn't compiled-in, net/core/xdp.c doesn't
 	 * allow registering MEM_TYPE_PAGE_POOL, but shield linker.
 	 */
 #ifdef CONFIG_PAGE_POOL
-	__page_pool_put_page(pool, page, false);
+	__page_pool_put_page(pool, page, allow_direct);
 #endif
 }
 /* Very limited use-cases allow recycle direct */
diff --git a/include/net/xdp.h b/include/net/xdp.h
index 0b689cf561c7..7ad779237ae8 100644
--- a/include/net/xdp.h
+++ b/include/net/xdp.h
@@ -104,6 +104,7 @@ struct xdp_frame *convert_to_xdp_frame(struct xdp_buff *xdp)
 }
 
 void xdp_return_frame(struct xdp_frame *xdpf);
+void xdp_return_frame_rx_napi(struct xdp_frame *xdpf);
 void xdp_return_buff(struct xdp_buff *xdp);
 
 int xdp_rxq_info_reg(struct xdp_rxq_info *xdp_rxq,
diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c
index c95b04ec103e..e0918d180f08 100644
--- a/kernel/bpf/cpumap.c
+++ b/kernel/bpf/cpumap.c
@@ -578,7 +578,7 @@ static int bq_flush_to_queue(struct bpf_cpu_map_entry *rcpu,
 		err = __ptr_ring_produce(q, xdpf);
 		if (err) {
 			drops++;
-			xdp_return_frame(xdpf);
+			xdp_return_frame_rx_napi(xdpf);
 		}
 		processed++;
 	}
diff --git a/net/core/xdp.c b/net/core/xdp.c
index bf6758f74339..cb8c4e061a5a 100644
--- a/net/core/xdp.c
+++ b/net/core/xdp.c
@@ -308,7 +308,13 @@ int xdp_rxq_info_reg_mem_model(struct xdp_rxq_info *xdp_rxq,
 }
 EXPORT_SYMBOL_GPL(xdp_rxq_info_reg_mem_model);
 
-static void xdp_return(void *data, struct xdp_mem_info *mem)
+/* XDP RX runs under NAPI protection, and in different delivery error
+ * scenarios (e.g. queue full), it is possible to return the xdp_frame
+ * while still leveraging this protection.  The @napi_direct boolian
+ * is used for those calls sites.  Thus, allowing for faster recycling
+ * of xdp_frames/pages in those cases.
+ */
+static void __xdp_return(void *data, struct xdp_mem_info *mem, bool napi_direct)
 {
 	struct xdp_mem_allocator *xa;
 	struct page *page;
@@ -320,7 +326,7 @@ static void xdp_return(void *data, struct xdp_mem_info *mem)
 		xa = rhashtable_lookup(mem_id_ht, &mem->id, mem_id_rht_params);
 		page = virt_to_head_page(data);
 		if (xa)
-			page_pool_put_page(xa->page_pool, page);
+			page_pool_put_page(xa->page_pool, page, napi_direct);
 		else
 			put_page(page);
 		rcu_read_unlock();
@@ -340,12 +346,18 @@ static void xdp_return(void *data, struct xdp_mem_info *mem)
 
 void xdp_return_frame(struct xdp_frame *xdpf)
 {
-	xdp_return(xdpf->data, &xdpf->mem);
+	__xdp_return(xdpf->data, &xdpf->mem, false);
 }
 EXPORT_SYMBOL_GPL(xdp_return_frame);
 
+void xdp_return_frame_rx_napi(struct xdp_frame *xdpf)
+{
+	__xdp_return(xdpf->data, &xdpf->mem, true);
+}
+EXPORT_SYMBOL_GPL(xdp_return_frame_rx_napi);
+
 void xdp_return_buff(struct xdp_buff *xdp)
 {
-	xdp_return(xdp->data, &xdp->rxq->mem);
+	__xdp_return(xdp->data, &xdp->rxq->mem, true);
 }
 EXPORT_SYMBOL_GPL(xdp_return_buff);

^ permalink raw reply related

* [bpf-next V4 PATCH 6/8] xdp: change ndo_xdp_xmit API to support bulking
From: Jesper Dangaard Brouer @ 2018-05-18 13:35 UTC (permalink / raw)
  To: netdev, Daniel Borkmann, Alexei Starovoitov,
	Jesper Dangaard Brouer
  Cc: Christoph Hellwig, BjörnTöpel, Magnus Karlsson,
	makita.toshiaki
In-Reply-To: <152665044141.21055.1276346542020340263.stgit@firesoul>

This patch change the API for ndo_xdp_xmit to support bulking
xdp_frames.

When kernel is compiled with CONFIG_RETPOLINE, XDP sees a huge slowdown.
Most of the slowdown is caused by DMA API indirect function calls, but
also the net_device->ndo_xdp_xmit() call.

Benchmarked patch with CONFIG_RETPOLINE, using xdp_redirect_map with
single flow/core test (CPU E5-1650 v4 @ 3.60GHz), showed
performance improved:
 for driver ixgbe: 6,042,682 pps -> 6,853,768 pps = +811,086 pps
 for driver i40e : 6,187,169 pps -> 6,724,519 pps = +537,350 pps

With frames avail as a bulk inside the driver ndo_xdp_xmit call,
further optimizations are possible, like bulk DMA-mapping for TX.

Testing without CONFIG_RETPOLINE show the same performance for
physical NIC drivers.

The virtual NIC driver tun sees a huge performance boost, as it can
avoid doing per frame producer locking, but instead amortize the
locking cost over the bulk.

V2: Fix compile errors reported by kbuild test robot <lkp@intel.com>
V4: Isolated ndo, driver changes and callers.

Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
---
 drivers/net/ethernet/intel/i40e/i40e_txrx.c   |   26 +++++++---
 drivers/net/ethernet/intel/i40e/i40e_txrx.h   |    2 -
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c |   21 ++++++--
 drivers/net/tun.c                             |   37 +++++++++-----
 drivers/net/virtio_net.c                      |   66 +++++++++++++++++++------
 include/linux/netdevice.h                     |   14 +++--
 kernel/bpf/devmap.c                           |   31 ++++++++----
 net/core/filter.c                             |    8 ++-
 8 files changed, 141 insertions(+), 64 deletions(-)

diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.c b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
index 5efa68de935b..9b698c5acd05 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_txrx.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
@@ -3664,14 +3664,19 @@ netdev_tx_t i40e_lan_xmit_frame(struct sk_buff *skb, struct net_device *netdev)
  * @dev: netdev
  * @xdp: XDP buffer
  *
- * Returns Zero if sent, else an error code
+ * Returns number of frames successfully sent. Frames that fail are
+ * free'ed via XDP return API.
+ *
+ * For error cases, a negative errno code is returned and no-frames
+ * are transmitted (caller must handle freeing frames).
  **/
-int i40e_xdp_xmit(struct net_device *dev, struct xdp_frame *xdpf)
+int i40e_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames)
 {
 	struct i40e_netdev_priv *np = netdev_priv(dev);
 	unsigned int queue_index = smp_processor_id();
 	struct i40e_vsi *vsi = np->vsi;
-	int err;
+	int drops = 0;
+	int i;
 
 	if (test_bit(__I40E_VSI_DOWN, vsi->state))
 		return -ENETDOWN;
@@ -3679,11 +3684,18 @@ int i40e_xdp_xmit(struct net_device *dev, struct xdp_frame *xdpf)
 	if (!i40e_enabled_xdp_vsi(vsi) || queue_index >= vsi->num_queue_pairs)
 		return -ENXIO;
 
-	err = i40e_xmit_xdp_ring(xdpf, vsi->xdp_rings[queue_index]);
-	if (err != I40E_XDP_TX)
-		return -ENOSPC;
+	for (i = 0; i < n; i++) {
+		struct xdp_frame *xdpf = frames[i];
+		int err;
 
-	return 0;
+		err = i40e_xmit_xdp_ring(xdpf, vsi->xdp_rings[queue_index]);
+		if (err != I40E_XDP_TX) {
+			xdp_return_frame_rx_napi(xdpf);
+			drops++;
+		}
+	}
+
+	return n - drops;
 }
 
 /**
diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.h b/drivers/net/ethernet/intel/i40e/i40e_txrx.h
index fdd2c55f03a6..eb8804b3d7b6 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_txrx.h
+++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.h
@@ -487,7 +487,7 @@ u32 i40e_get_tx_pending(struct i40e_ring *ring, bool in_sw);
 void i40e_detect_recover_hung(struct i40e_vsi *vsi);
 int __i40e_maybe_stop_tx(struct i40e_ring *tx_ring, int size);
 bool __i40e_chk_linearize(struct sk_buff *skb);
-int i40e_xdp_xmit(struct net_device *dev, struct xdp_frame *xdpf);
+int i40e_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames);
 void i40e_xdp_flush(struct net_device *dev);
 
 /**
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
index 6652b201df5b..9645619f7729 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
@@ -10017,11 +10017,13 @@ static int ixgbe_xdp(struct net_device *dev, struct netdev_bpf *xdp)
 	}
 }
 
-static int ixgbe_xdp_xmit(struct net_device *dev, struct xdp_frame *xdpf)
+static int ixgbe_xdp_xmit(struct net_device *dev, int n,
+			  struct xdp_frame **frames)
 {
 	struct ixgbe_adapter *adapter = netdev_priv(dev);
 	struct ixgbe_ring *ring;
-	int err;
+	int drops = 0;
+	int i;
 
 	if (unlikely(test_bit(__IXGBE_DOWN, &adapter->state)))
 		return -ENETDOWN;
@@ -10033,11 +10035,18 @@ static int ixgbe_xdp_xmit(struct net_device *dev, struct xdp_frame *xdpf)
 	if (unlikely(!ring))
 		return -ENXIO;
 
-	err = ixgbe_xmit_xdp_ring(adapter, xdpf);
-	if (err != IXGBE_XDP_TX)
-		return -ENOSPC;
+	for (i = 0; i < n; i++) {
+		struct xdp_frame *xdpf = frames[i];
+		int err;
 
-	return 0;
+		err = ixgbe_xmit_xdp_ring(adapter, xdpf);
+		if (err != IXGBE_XDP_TX) {
+			xdp_return_frame_rx_napi(xdpf);
+			drops++;
+		}
+	}
+
+	return n - drops;
 }
 
 static void ixgbe_xdp_flush(struct net_device *dev)
diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index 44d4f3d25350..d3dcfcb1c4b3 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -70,6 +70,7 @@
 #include <net/netns/generic.h>
 #include <net/rtnetlink.h>
 #include <net/sock.h>
+#include <net/xdp.h>
 #include <linux/seq_file.h>
 #include <linux/uio.h>
 #include <linux/skb_array.h>
@@ -1290,34 +1291,44 @@ static const struct net_device_ops tun_netdev_ops = {
 	.ndo_get_stats64	= tun_net_get_stats64,
 };
 
-static int tun_xdp_xmit(struct net_device *dev, struct xdp_frame *frame)
+static int tun_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames)
 {
 	struct tun_struct *tun = netdev_priv(dev);
 	struct tun_file *tfile;
 	u32 numqueues;
-	int ret = 0;
+	int drops = 0;
+	int cnt = n;
+	int i;
 
 	rcu_read_lock();
 
 	numqueues = READ_ONCE(tun->numqueues);
 	if (!numqueues) {
-		ret = -ENOSPC;
-		goto out;
+		rcu_read_unlock();
+		return -ENXIO; /* Caller will free/return all frames */
 	}
 
 	tfile = rcu_dereference(tun->tfiles[smp_processor_id() %
 					    numqueues]);
-	/* Encode the XDP flag into lowest bit for consumer to differ
-	 * XDP buffer from sk_buff.
-	 */
-	if (ptr_ring_produce(&tfile->tx_ring, tun_xdp_to_ptr(frame))) {
-		this_cpu_inc(tun->pcpu_stats->tx_dropped);
-		ret = -ENOSPC;
+
+	spin_lock(&tfile->tx_ring.producer_lock);
+	for (i = 0; i < n; i++) {
+		struct xdp_frame *xdp = frames[i];
+		/* Encode the XDP flag into lowest bit for consumer to differ
+		 * XDP buffer from sk_buff.
+		 */
+		void *frame = tun_xdp_to_ptr(xdp);
+
+		if (__ptr_ring_produce(&tfile->tx_ring, frame)) {
+			this_cpu_inc(tun->pcpu_stats->tx_dropped);
+			xdp_return_frame_rx_napi(xdp);
+			drops++;
+		}
 	}
+	spin_unlock(&tfile->tx_ring.producer_lock);
 
-out:
 	rcu_read_unlock();
-	return ret;
+	return cnt - drops;
 }
 
 static int tun_xdp_tx(struct net_device *dev, struct xdp_buff *xdp)
@@ -1327,7 +1338,7 @@ static int tun_xdp_tx(struct net_device *dev, struct xdp_buff *xdp)
 	if (unlikely(!frame))
 		return -EOVERFLOW;
 
-	return tun_xdp_xmit(dev, frame);
+	return tun_xdp_xmit(dev, 1, &frame);
 }
 
 static void tun_xdp_flush(struct net_device *dev)
diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index f34794a76c4d..39a0783d1cde 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -419,23 +419,13 @@ static void virtnet_xdp_flush(struct net_device *dev)
 	virtqueue_kick(sq->vq);
 }
 
-static int __virtnet_xdp_xmit(struct virtnet_info *vi,
-			       struct xdp_frame *xdpf)
+static int __virtnet_xdp_xmit_one(struct virtnet_info *vi,
+				   struct send_queue *sq,
+				   struct xdp_frame *xdpf)
 {
 	struct virtio_net_hdr_mrg_rxbuf *hdr;
-	struct xdp_frame *xdpf_sent;
-	struct send_queue *sq;
-	unsigned int len;
-	unsigned int qp;
 	int err;
 
-	qp = vi->curr_queue_pairs - vi->xdp_queue_pairs + smp_processor_id();
-	sq = &vi->sq[qp];
-
-	/* Free up any pending old buffers before queueing new ones. */
-	while ((xdpf_sent = virtqueue_get_buf(sq->vq, &len)) != NULL)
-		xdp_return_frame(xdpf_sent);
-
 	/* virtqueue want to use data area in-front of packet */
 	if (unlikely(xdpf->metasize > 0))
 		return -EOPNOTSUPP;
@@ -459,11 +449,40 @@ static int __virtnet_xdp_xmit(struct virtnet_info *vi,
 	return 0;
 }
 
-static int virtnet_xdp_xmit(struct net_device *dev, struct xdp_frame *xdpf)
+static int __virtnet_xdp_tx_xmit(struct virtnet_info *vi,
+				   struct xdp_frame *xdpf)
+{
+	struct xdp_frame *xdpf_sent;
+	struct send_queue *sq;
+	unsigned int len;
+	unsigned int qp;
+
+	qp = vi->curr_queue_pairs - vi->xdp_queue_pairs + smp_processor_id();
+	sq = &vi->sq[qp];
+
+	/* Free up any pending old buffers before queueing new ones. */
+	while ((xdpf_sent = virtqueue_get_buf(sq->vq, &len)) != NULL)
+		xdp_return_frame(xdpf_sent);
+
+	return __virtnet_xdp_xmit_one(vi, sq, xdpf);
+}
+
+static int virtnet_xdp_xmit(struct net_device *dev,
+			    int n, struct xdp_frame **frames)
 {
 	struct virtnet_info *vi = netdev_priv(dev);
 	struct receive_queue *rq = vi->rq;
+	struct xdp_frame *xdpf_sent;
 	struct bpf_prog *xdp_prog;
+	struct send_queue *sq;
+	unsigned int len;
+	unsigned int qp;
+	int drops = 0;
+	int err;
+	int i;
+
+	qp = vi->curr_queue_pairs - vi->xdp_queue_pairs + smp_processor_id();
+	sq = &vi->sq[qp];
 
 	/* Only allow ndo_xdp_xmit if XDP is loaded on dev, as this
 	 * indicate XDP resources have been successfully allocated.
@@ -472,7 +491,20 @@ static int virtnet_xdp_xmit(struct net_device *dev, struct xdp_frame *xdpf)
 	if (!xdp_prog)
 		return -ENXIO;
 
-	return __virtnet_xdp_xmit(vi, xdpf);
+	/* Free up any pending old buffers before queueing new ones. */
+	while ((xdpf_sent = virtqueue_get_buf(sq->vq, &len)) != NULL)
+		xdp_return_frame(xdpf_sent);
+
+	for (i = 0; i < n; i++) {
+		struct xdp_frame *xdpf = frames[i];
+
+		err = __virtnet_xdp_xmit_one(vi, sq, xdpf);
+		if (err) {
+			xdp_return_frame_rx_napi(xdpf);
+			drops++;
+		}
+	}
+	return n - drops;
 }
 
 static unsigned int virtnet_get_headroom(struct virtnet_info *vi)
@@ -616,7 +648,7 @@ static struct sk_buff *receive_small(struct net_device *dev,
 			xdpf = convert_to_xdp_frame(&xdp);
 			if (unlikely(!xdpf))
 				goto err_xdp;
-			err = __virtnet_xdp_xmit(vi, xdpf);
+			err = __virtnet_xdp_tx_xmit(vi, xdpf);
 			if (unlikely(err)) {
 				trace_xdp_exception(vi->dev, xdp_prog, act);
 				goto err_xdp;
@@ -779,7 +811,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
 			xdpf = convert_to_xdp_frame(&xdp);
 			if (unlikely(!xdpf))
 				goto err_xdp;
-			err = __virtnet_xdp_xmit(vi, xdpf);
+			err = __virtnet_xdp_tx_xmit(vi, xdpf);
 			if (unlikely(err)) {
 				trace_xdp_exception(vi->dev, xdp_prog, act);
 				if (unlikely(xdp_page != page))
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 03ed492c4e14..debdb6286170 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1185,9 +1185,13 @@ struct dev_ifalias {
  *	This function is used to set or query state related to XDP on the
  *	netdevice and manage BPF offload. See definition of
  *	enum bpf_netdev_command for details.
- * int (*ndo_xdp_xmit)(struct net_device *dev, struct xdp_frame *xdp);
- *	This function is used to submit a XDP packet for transmit on a
- *	netdevice.
+ * int (*ndo_xdp_xmit)(struct net_device *dev, int n, struct xdp_frame **xdp);
+ *	This function is used to submit @n XDP packets for transmit on a
+ *	netdevice. Returns number of frames successfully transmitted, frames
+ *	that got dropped are freed/returned via xdp_return_frame().
+ *	Returns negative number, means general error invoking ndo, meaning
+ *	no frames were xmit'ed and core-caller will free all frames.
+ *	TODO: Consider add flag to allow sending flush operation.
  * void (*ndo_xdp_flush)(struct net_device *dev);
  *	This function is used to inform the driver to flush a particular
  *	xdp tx queue. Must be called on same CPU as xdp_xmit.
@@ -1375,8 +1379,8 @@ struct net_device_ops {
 						       int needed_headroom);
 	int			(*ndo_bpf)(struct net_device *dev,
 					   struct netdev_bpf *bpf);
-	int			(*ndo_xdp_xmit)(struct net_device *dev,
-						struct xdp_frame *xdp);
+	int			(*ndo_xdp_xmit)(struct net_device *dev, int n,
+						struct xdp_frame **xdp);
 	void			(*ndo_xdp_flush)(struct net_device *dev);
 };
 
diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
index 6f84100723b0..1317629662ae 100644
--- a/kernel/bpf/devmap.c
+++ b/kernel/bpf/devmap.c
@@ -222,7 +222,7 @@ static int bq_xmit_all(struct bpf_dtab_netdev *obj,
 			 struct xdp_bulk_queue *bq)
 {
 	struct net_device *dev = obj->dev;
-	int sent = 0, drops = 0;
+	int sent = 0, drops = 0, err = 0;
 	int i;
 
 	if (unlikely(!bq->count))
@@ -234,23 +234,32 @@ static int bq_xmit_all(struct bpf_dtab_netdev *obj,
 		prefetch(xdpf);
 	}
 
-	for (i = 0; i < bq->count; i++) {
-		struct xdp_frame *xdpf = bq->q[i];
-		int err;
-
-		err = dev->netdev_ops->ndo_xdp_xmit(dev, xdpf);
-		if (err) {
-			drops++;
-			xdp_return_frame(xdpf);
-		}
-		sent++;
+	sent = dev->netdev_ops->ndo_xdp_xmit(dev, bq->count, bq->q);
+	if (sent < 0) {
+		err = sent;
+		sent = 0;
+		goto error;
 	}
+	drops = bq->count - sent;
+out:
 	bq->count = 0;
 
 	trace_xdp_devmap_xmit(&obj->dtab->map, obj->bit,
 			      sent, drops, bq->dev_rx, dev);
 	bq->dev_rx = NULL;
 	return 0;
+error:
+	/* If ndo_xdp_xmit fails with an errno, no frames have been
+	 * xmit'ed and it's our responsibility to them free all.
+	 */
+	for (i = 0; i < bq->count; i++) {
+		struct xdp_frame *xdpf = bq->q[i];
+
+		/* RX path under NAPI protection, can return frames faster */
+		xdp_return_frame_rx_napi(xdpf);
+		drops++;
+	}
+	goto out;
 }
 
 /* __dev_map_flush is called from xdp_do_flush_map() which _must_ be signaled
diff --git a/net/core/filter.c b/net/core/filter.c
index 4a93423cc5ea..19504b7f4959 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -3035,7 +3035,7 @@ static int __bpf_tx_xdp(struct net_device *dev,
 			u32 index)
 {
 	struct xdp_frame *xdpf;
-	int err;
+	int sent;
 
 	if (!dev->netdev_ops->ndo_xdp_xmit) {
 		return -EOPNOTSUPP;
@@ -3045,9 +3045,9 @@ static int __bpf_tx_xdp(struct net_device *dev,
 	if (unlikely(!xdpf))
 		return -EOVERFLOW;
 
-	err = dev->netdev_ops->ndo_xdp_xmit(dev, xdpf);
-	if (err)
-		return err;
+	sent = dev->netdev_ops->ndo_xdp_xmit(dev, 1, &xdpf);
+	if (sent <= 0)
+		return sent;
 	dev->netdev_ops->ndo_xdp_flush(dev);
 	return 0;
 }

^ permalink raw reply related

* [bpf-next V4 PATCH 7/8] xdp/trace: extend tracepoint in devmap with an err
From: Jesper Dangaard Brouer @ 2018-05-18 13:35 UTC (permalink / raw)
  To: netdev, Daniel Borkmann, Alexei Starovoitov,
	Jesper Dangaard Brouer
  Cc: Christoph Hellwig, BjörnTöpel, Magnus Karlsson,
	makita.toshiaki
In-Reply-To: <152665044141.21055.1276346542020340263.stgit@firesoul>

Extending tracepoint xdp:xdp_devmap_xmit in devmap with an err code
allow people to easier identify the reason behind the ndo_xdp_xmit
call to a given driver is failing.
---
 include/trace/events/xdp.h |   10 ++++++----
 kernel/bpf/devmap.c        |    2 +-
 2 files changed, 7 insertions(+), 5 deletions(-)

diff --git a/include/trace/events/xdp.h b/include/trace/events/xdp.h
index 2e9ef0650144..1ecf4c67fcf7 100644
--- a/include/trace/events/xdp.h
+++ b/include/trace/events/xdp.h
@@ -234,9 +234,9 @@ TRACE_EVENT(xdp_devmap_xmit,
 	TP_PROTO(const struct bpf_map *map, u32 map_index,
 		 int sent, int drops,
 		 const struct net_device *from_dev,
-		 const struct net_device *to_dev),
+		 const struct net_device *to_dev, int err),
 
-	TP_ARGS(map, map_index, sent, drops, from_dev, to_dev),
+	TP_ARGS(map, map_index, sent, drops, from_dev, to_dev, err),
 
 	TP_STRUCT__entry(
 		__field(int, map_id)
@@ -246,6 +246,7 @@ TRACE_EVENT(xdp_devmap_xmit,
 		__field(int, sent)
 		__field(int, from_ifindex)
 		__field(int, to_ifindex)
+		__field(int, err)
 	),
 
 	TP_fast_assign(
@@ -256,16 +257,17 @@ TRACE_EVENT(xdp_devmap_xmit,
 		__entry->sent		= sent;
 		__entry->from_ifindex	= from_dev->ifindex;
 		__entry->to_ifindex	= to_dev->ifindex;
+		__entry->err		= err;
 	),
 
 	TP_printk("ndo_xdp_xmit"
 		  " map_id=%d map_index=%d action=%s"
 		  " sent=%d drops=%d"
-		  " from_ifindex=%d to_ifindex=%d",
+		  " from_ifindex=%d to_ifindex=%d err=%d",
 		  __entry->map_id, __entry->map_index,
 		  __print_symbolic(__entry->act, __XDP_ACT_SYM_TAB),
 		  __entry->sent, __entry->drops,
-		  __entry->from_ifindex, __entry->to_ifindex)
+		  __entry->from_ifindex, __entry->to_ifindex, __entry->err)
 );
 
 #endif /* _TRACE_XDP_H */
diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
index 1317629662ae..4dd8f0e3a8d9 100644
--- a/kernel/bpf/devmap.c
+++ b/kernel/bpf/devmap.c
@@ -245,7 +245,7 @@ static int bq_xmit_all(struct bpf_dtab_netdev *obj,
 	bq->count = 0;
 
 	trace_xdp_devmap_xmit(&obj->dtab->map, obj->bit,
-			      sent, drops, bq->dev_rx, dev);
+			      sent, drops, bq->dev_rx, dev, err);
 	bq->dev_rx = NULL;
 	return 0;
 error:

^ permalink raw reply related

* [bpf-next V4 PATCH 8/8] samples/bpf: xdp_monitor use err code from tracepoint xdp:xdp_devmap_xmit
From: Jesper Dangaard Brouer @ 2018-05-18 13:35 UTC (permalink / raw)
  To: netdev, Daniel Borkmann, Alexei Starovoitov,
	Jesper Dangaard Brouer
  Cc: Christoph Hellwig, BjörnTöpel, Magnus Karlsson,
	makita.toshiaki
In-Reply-To: <152665044141.21055.1276346542020340263.stgit@firesoul>

Update xdp_monitor to use the recently added err code introduced
in tracepoint xdp:xdp_devmap_xmit, to show if the drop count is
caused by some driver general delivery problem.  Other kind of drops
will likely just be more normal TX space issues.
---
 samples/bpf/xdp_monitor_kern.c |   10 ++++++++++
 samples/bpf/xdp_monitor_user.c |   35 ++++++++++++++++++++++++++++++-----
 2 files changed, 40 insertions(+), 5 deletions(-)

diff --git a/samples/bpf/xdp_monitor_kern.c b/samples/bpf/xdp_monitor_kern.c
index 2854aa0665ea..ad10fe700d7d 100644
--- a/samples/bpf/xdp_monitor_kern.c
+++ b/samples/bpf/xdp_monitor_kern.c
@@ -125,6 +125,7 @@ struct datarec {
 	u64 processed;
 	u64 dropped;
 	u64 info;
+	u64 err;
 };
 #define MAX_CPUS 64
 
@@ -228,6 +229,7 @@ struct devmap_xmit_ctx {
 	int sent;		//	offset:24; size:4; signed:1;
 	int from_ifindex;	//	offset:28; size:4; signed:1;
 	int to_ifindex;		//	offset:32; size:4; signed:1;
+	int err;		//	offset:36; size:4; signed:1;
 };
 
 SEC("tracepoint/xdp/xdp_devmap_xmit")
@@ -245,5 +247,13 @@ int trace_xdp_devmap_xmit(struct devmap_xmit_ctx *ctx)
 	/* Record bulk events, then userspace can calc average bulk size */
 	rec->info += 1;
 
+	/* Record error cases, where no frame were sent */
+	if (ctx->err)
+		rec->err++;
+
+	/* Catch API error of drv ndo_xdp_xmit sent more than count */
+	if (ctx->drops < 0)
+		rec->err++;
+
 	return 1;
 }
diff --git a/samples/bpf/xdp_monitor_user.c b/samples/bpf/xdp_monitor_user.c
index 7e18a454924c..dd558cbb2309 100644
--- a/samples/bpf/xdp_monitor_user.c
+++ b/samples/bpf/xdp_monitor_user.c
@@ -117,6 +117,7 @@ struct datarec {
 	__u64 processed;
 	__u64 dropped;
 	__u64 info;
+	__u64 err;
 };
 #define MAX_CPUS 64
 
@@ -152,6 +153,7 @@ static bool map_collect_record(int fd, __u32 key, struct record *rec)
 	__u64 sum_processed = 0;
 	__u64 sum_dropped = 0;
 	__u64 sum_info = 0;
+	__u64 sum_err = 0;
 	int i;
 
 	if ((bpf_map_lookup_elem(fd, &key, values)) != 0) {
@@ -170,10 +172,13 @@ static bool map_collect_record(int fd, __u32 key, struct record *rec)
 		sum_dropped        += values[i].dropped;
 		rec->cpu[i].info = values[i].info;
 		sum_info        += values[i].info;
+		rec->cpu[i].err = values[i].err;
+		sum_err        += values[i].err;
 	}
 	rec->total.processed = sum_processed;
 	rec->total.dropped   = sum_dropped;
 	rec->total.info      = sum_info;
+	rec->total.err       = sum_err;
 	return true;
 }
 
@@ -274,6 +279,18 @@ static double calc_info(struct datarec *r, struct datarec *p, double period)
 	return pps;
 }
 
+static double calc_err(struct datarec *r, struct datarec *p, double period)
+{
+	__u64 packets = 0;
+	double pps = 0;
+
+	if (period > 0) {
+		packets = r->err - p->err;
+		pps = packets / period;
+	}
+	return pps;
+}
+
 static void stats_print(struct stats_record *stats_rec,
 			struct stats_record *stats_prev,
 			bool err_only)
@@ -412,11 +429,12 @@ static void stats_print(struct stats_record *stats_rec,
 
 	/* devmap ndo_xdp_xmit stats */
 	{
-		char *fmt1 = "%-15s %-7d %'-12.0f %'-12.0f %'-10.2f %s\n";
-		char *fmt2 = "%-15s %-7s %'-12.0f %'-12.0f %'-10.2f %s\n";
+		char *fmt1 = "%-15s %-7d %'-12.0f %'-12.0f %'-10.2f %s %s\n";
+		char *fmt2 = "%-15s %-7s %'-12.0f %'-12.0f %'-10.2f %s %s\n";
 		struct record *rec, *prev;
-		double drop, info;
+		double drop, info, err;
 		char *i_str = "";
+		char *err_str = "";
 
 		rec  =  &stats_rec->xdp_devmap_xmit;
 		prev = &stats_prev->xdp_devmap_xmit;
@@ -428,22 +446,29 @@ static void stats_print(struct stats_record *stats_rec,
 			pps  = calc_pps(r, p, t);
 			drop = calc_drop(r, p, t);
 			info = calc_info(r, p, t);
+			err  = calc_err(r, p, t);
 			if (info > 0) {
 				i_str = "bulk-average";
 				info = (pps+drop) / info; /* calc avg bulk */
 			}
+			if (err > 0)
+				err_str = "drv-err";
 			if (pps > 0 || drop > 0)
 				printf(fmt1, "devmap-xmit",
-				       i, pps, drop, info, i_str);
+				       i, pps, drop, info, i_str, err_str);
 		}
 		pps = calc_pps(&rec->total, &prev->total, t);
 		drop = calc_drop(&rec->total, &prev->total, t);
 		info = calc_info(&rec->total, &prev->total, t);
+		err  = calc_err(&rec->total, &prev->total, t);
 		if (info > 0) {
 			i_str = "bulk-average";
 			info = (pps+drop) / info; /* calc avg bulk */
 		}
-		printf(fmt2, "devmap-xmit", "total", pps, drop, info, i_str);
+		if (err > 0)
+			err_str = "drv-err";
+		printf(fmt2, "devmap-xmit", "total", pps, drop,
+		       info, i_str, err_str);
 	}
 
 	printf("\n");

^ permalink raw reply related

* [PATCH net-next] cxgb4: collect SGE PF/VF queue map
From: Rahul Lakkireddy @ 2018-05-18 13:42 UTC (permalink / raw)
  To: netdev; +Cc: davem, ganeshgr, nirranjan, indranil, Rahul Lakkireddy

For T6, collect info on queue mapping to corresponding PF/VF in SGE.

Signed-off-by: Rahul Lakkireddy <rahul.lakkireddy@chelsio.com>
Signed-off-by: Ganesh Goudar <ganeshgr@chelsio.com>
---
 drivers/net/ethernet/chelsio/cxgb4/cudbg_entity.h | 17 ++++++++
 drivers/net/ethernet/chelsio/cxgb4/cudbg_lib.c    | 47 ++++++++++++++++++++++-
 drivers/net/ethernet/chelsio/cxgb4/cxgb4_cudbg.c  |  3 +-
 3 files changed, 65 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/chelsio/cxgb4/cudbg_entity.h b/drivers/net/ethernet/chelsio/cxgb4/cudbg_entity.h
index 740a18ba4229..c333e25620a7 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/cudbg_entity.h
+++ b/drivers/net/ethernet/chelsio/cxgb4/cudbg_entity.h
@@ -62,6 +62,18 @@ struct cudbg_hw_sched {
 	u32 map;
 };
 
+#define SGE_QBASE_DATA_REG_NUM 4
+
+struct sge_qbase_reg_field {
+	u32 reg_addr;
+	u32 reg_data[SGE_QBASE_DATA_REG_NUM];
+	/* Max supported PFs */
+	u32 pf_data_value[PCIE_FW_MASTER_M + 1][SGE_QBASE_DATA_REG_NUM];
+	/* Max supported VFs */
+	u32 vf_data_value[T6_VF_M + 1][SGE_QBASE_DATA_REG_NUM];
+	u32 vfcount; /* Actual number of max vfs in current configuration */
+};
+
 struct ireg_field {
 	u32 ireg_addr;
 	u32 ireg_data;
@@ -357,6 +369,11 @@ static const u32 t5_sge_dbg_index_array[2][IREG_NUM_ELEM] = {
 	{0x10cc, 0x10d4, 0x0, 16},
 };
 
+static const u32 t6_sge_qbase_index_array[] = {
+	/* 1 addr reg SGE_QBASE_INDEX and 4 data reg SGE_QBASE_MAP[0-3] */
+	0x1250, 0x1240, 0x1244, 0x1248, 0x124c,
+};
+
 static const u32 t5_pcie_pdbg_array[][IREG_NUM_ELEM] = {
 	{0x5a04, 0x5a0c, 0x00, 0x20}, /* t5_pcie_pdbg_regs_00_to_20 */
 	{0x5a04, 0x5a0c, 0x21, 0x20}, /* t5_pcie_pdbg_regs_21_to_40 */
diff --git a/drivers/net/ethernet/chelsio/cxgb4/cudbg_lib.c b/drivers/net/ethernet/chelsio/cxgb4/cudbg_lib.c
index 4feb7eca0acf..0afcfe99bff3 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/cudbg_lib.c
+++ b/drivers/net/ethernet/chelsio/cxgb4/cudbg_lib.c
@@ -1339,16 +1339,39 @@ int cudbg_collect_tp_indirect(struct cudbg_init *pdbg_init,
 	return cudbg_write_and_release_buff(pdbg_init, &temp_buff, dbg_buff);
 }
 
+static void cudbg_read_sge_qbase_indirect_reg(struct adapter *padap,
+					      struct sge_qbase_reg_field *qbase,
+					      u32 func, bool is_pf)
+{
+	u32 *buff, i;
+
+	if (is_pf) {
+		buff = qbase->pf_data_value[func];
+	} else {
+		buff = qbase->vf_data_value[func];
+		/* In SGE_QBASE_INDEX,
+		 * Entries 0->7 are PF0->7, Entries 8->263 are VFID0->256.
+		 */
+		func += 8;
+	}
+
+	t4_write_reg(padap, qbase->reg_addr, func);
+	for (i = 0; i < SGE_QBASE_DATA_REG_NUM; i++, buff++)
+		*buff = t4_read_reg(padap, qbase->reg_data[i]);
+}
+
 int cudbg_collect_sge_indirect(struct cudbg_init *pdbg_init,
 			       struct cudbg_buffer *dbg_buff,
 			       struct cudbg_error *cudbg_err)
 {
 	struct adapter *padap = pdbg_init->adap;
 	struct cudbg_buffer temp_buff = { 0 };
+	struct sge_qbase_reg_field *sge_qbase;
 	struct ireg_buf *ch_sge_dbg;
 	int i, rc;
 
-	rc = cudbg_get_buff(pdbg_init, dbg_buff, sizeof(*ch_sge_dbg) * 2,
+	rc = cudbg_get_buff(pdbg_init, dbg_buff,
+			    sizeof(*ch_sge_dbg) * 2 + sizeof(*sge_qbase),
 			    &temp_buff);
 	if (rc)
 		return rc;
@@ -1370,6 +1393,28 @@ int cudbg_collect_sge_indirect(struct cudbg_init *pdbg_init,
 				 sge_pio->ireg_local_offset);
 		ch_sge_dbg++;
 	}
+
+	if (CHELSIO_CHIP_VERSION(padap->params.chip) > CHELSIO_T5) {
+		sge_qbase = (struct sge_qbase_reg_field *)ch_sge_dbg;
+		/* 1 addr reg SGE_QBASE_INDEX and 4 data reg
+		 * SGE_QBASE_MAP[0-3]
+		 */
+		sge_qbase->reg_addr = t6_sge_qbase_index_array[0];
+		for (i = 0; i < SGE_QBASE_DATA_REG_NUM; i++)
+			sge_qbase->reg_data[i] =
+				t6_sge_qbase_index_array[i + 1];
+
+		for (i = 0; i <= PCIE_FW_MASTER_M; i++)
+			cudbg_read_sge_qbase_indirect_reg(padap, sge_qbase,
+							  i, true);
+
+		for (i = 0; i < padap->params.arch.vfcount; i++)
+			cudbg_read_sge_qbase_indirect_reg(padap, sge_qbase,
+							  i, false);
+
+		sge_qbase->vfcount = padap->params.arch.vfcount;
+	}
+
 	return cudbg_write_and_release_buff(pdbg_init, &temp_buff, dbg_buff);
 }
 
diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_cudbg.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_cudbg.c
index 085691eb2b95..8d751efcb90e 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_cudbg.c
+++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_cudbg.c
@@ -214,7 +214,8 @@ static u32 cxgb4_get_entity_length(struct adapter *adap, u32 entity)
 		len = sizeof(struct ireg_buf) * n;
 		break;
 	case CUDBG_SGE_INDIRECT:
-		len = sizeof(struct ireg_buf) * 2;
+		len = sizeof(struct ireg_buf) * 2 +
+		      sizeof(struct sge_qbase_reg_field);
 		break;
 	case CUDBG_ULPRX_LA:
 		len = sizeof(struct cudbg_ulprx_la);
-- 
2.14.1

^ permalink raw reply related

* Re: [PATCH bpf-next v3 00/15] Introducing AF_XDP support
From: Daniel Borkmann @ 2018-05-18 13:43 UTC (permalink / raw)
  To: Alexei Starovoitov, Björn Töpel, Alexei Starovoitov
  Cc: Karlsson, Magnus, Duyck, Alexander H, Alexander Duyck,
	John Fastabend, Jesper Dangaard Brouer, Willem de Bruijn,
	Michael S. Tsirkin, Netdev, Björn Töpel,
	michael.lundkvist, Brandeburg, Jesse, Singhai, Anjali,
	Zhang, Qi Z
In-Reply-To: <cb3aa2a3-f72c-54bf-e883-88922e372c58@fb.com>

On 05/18/2018 05:38 AM, Alexei Starovoitov wrote:
> On 5/16/18 11:46 PM, Björn Töpel wrote:
>> 2018-05-04 1:38 GMT+02:00 Alexei Starovoitov <alexei.starovoitov@gmail.com>:
>>> On Fri, May 04, 2018 at 12:49:09AM +0200, Daniel Borkmann wrote:
>>>> On 05/02/2018 01:01 PM, Björn Töpel wrote:
>>>>> From: Björn Töpel <bjorn.topel@intel.com>
>>>>>
>>>>> This patch set introduces a new address family called AF_XDP that is
>>>>> optimized for high performance packet processing and, in upcoming
>>>>> patch sets, zero-copy semantics. In this patch set, we have removed
>>>>> all zero-copy related code in order to make it smaller, simpler and
>>>>> hopefully more review friendly. This patch set only supports copy-mode
>>>>> for the generic XDP path (XDP_SKB) for both RX and TX and copy-mode
>>>>> for RX using the XDP_DRV path. Zero-copy support requires XDP and
>>>>> driver changes that Jesper Dangaard Brouer is working on. Some of his
>>>>> work has already been accepted. We will publish our zero-copy support
>>>>> for RX and TX on top of his patch sets at a later point in time.
>>>>
>>>> +1, would be great to see it land this cycle. Saw few minor nits here
>>>> and there but nothing to hold it up, for the series:
>>>>
>>>> Acked-by: Daniel Borkmann <daniel@iogearbox.net>
>>>>
>>>> Thanks everyone!
>>>
>>> Great stuff!
>>>
>>> Applied to bpf-next, with one condition.
>>> Upcoming zero-copy patches for both RX and TX need to be posted
>>> and reviewed within this release window.
>>> If netdev community as a whole won't be able to agree on the zero-copy
>>> bits we'd need to revert this feature before the next merge window.
>>>
>>> Few other minor nits:
>>> patch 3:
>>> +struct xdp_ring {
>>> +       __u32 producer __attribute__((aligned(64)));
>>> +       __u32 consumer __attribute__((aligned(64)));
>>> +};
>>> It kinda begs for ____cacheline_aligned_in_smp to be introduced for uapi headers.
>>
>> Hmm, I need some guidance on what a sane uapi variant would be. We
>> can't have the uapi depend on the kernel build. ARM64, e.g., can have
>> both 64B and 128B according to the specs. Contemporary IA processors
>> have 64B.
>>
>> The simplest, and maybe most future-proof, would be 128B aligned for
>> all. Another is having 128B for ARM and 64B for all IA. A third option
>> is having a hand-shaking API (I think virtio has that) for determine
>> the cache line size, but I'd rather not go down that route.
>>
>> Thoughts/ideas on how a uapi ____cacheline_aligned_in_smp version
>> would look like?
> 
> I suspect i40e+arm combination wasn't tested anyway.
> The api may have endianness issues too on something like sparc.
> I think the way to be backwards compatible in this area
> is to make the api usable on x86 only by adding
> to include/uapi/linux/if_xdp.h
> #if defined(__x86_64__)
> #define AF_XDP_CACHE_BYTES 64
> #else
> #error "AF_XDP support is not yet available for this architecture"
> #endif
> and doing:
>     __u32 producer __attribute__((aligned(AF_XDP_CACHE_BYTES)));
>     __u32 consumer __attribute__((aligned(AF_XDP_CACHE_BYTES)));
> 
> And progressively add to this for arm64 and few other archs.
> Eventually removing #error and adding some generic define
> that's good enough for long tail of architectures that
> we really cannot test.

Been looking into this yesterday as well a bit, and it's a bit of a mess what
uapi headers do on this regard (though there are just a handful of such headers).
Some of the kernel uapi headers hard-code generally 64 bytes regardless of the
underlying arch. In general, the kernel does expose it to user space via sysfs
(coherency_line_size). Here's what perf does to retrieve it:

#ifdef _SC_LEVEL1_DCACHE_LINESIZE
#define cache_line_size(cacheline_sizep) *cacheline_sizep = sysconf(_SC_LEVEL1_DCACHE_LINESIZE)
#else
static void cache_line_size(int *cacheline_sizep)
{
        if (sysfs__read_int("devices/system/cpu/cpu0/cache/index0/coherency_line_size", cacheline_sizep))
                pr_debug("cannot determine cache line size");
}
#endif

The sysconf() implementation for _SC_LEVEL1_DCACHE_LINESIZE seems also only
available for x86, arm64, s390 and ppc on a cursory glance in the glibc code.
In the x86 case it retrieves the info from cpuid insn. In order to generically
use it in combination with the header you'd have some probe which would then
set this as a define before including the header.

Then projects like urcu, they do ...

#define ____cacheline_internodealigned_in_smp \
        __attribute__((__aligned__(CAA_CACHE_LINE_SIZE)))

... and then hard code CAA_CACHE_LINE_SIZE for x86 (== 128), s390 (== 128),
ppc (== 256) and sparc64 (== 256) with a generic fallback to 64.

Hmm, perhaps a combination of the two would make sense where in case of known
cacheline size it can still be used and we only have the fallback in such way.
Like:

#ifndef XDP_CACHE_BYTES
# if defined(__x86_64__)
#  define XDP_CACHE_BYTES	64
# else
#  error "Please define XDP_CACHE_BYTES for this architecture!"
# endif
#endif

Too bad there's no asm uapi header at least for the archs where it's fixed
anyway such that not every project out there has to redefine all of it from
scratch and we could just include it (and the generic-asm one would throw
a compile error if it's not externally defined or such).

Cheers,
Daniel

^ permalink raw reply

* [PATCH net] cxgb4: fix offset in collecting TX rate limit info
From: Rahul Lakkireddy @ 2018-05-18 13:43 UTC (permalink / raw)
  To: netdev; +Cc: davem, ganeshgr, nirranjan, indranil, Rahul Lakkireddy

Correct the indirect register offsets in collecting TX rate limit info
in UP CIM logs.

Also, T5 doesn't support these indirect register offsets, so remove
them from collection logic.

Fixes: be6e36d916b1 ("cxgb4: collect TX rate limit info in UP CIM logs")
Signed-off-by: Rahul Lakkireddy <rahul.lakkireddy@chelsio.com>
Signed-off-by: Ganesh Goudar <ganeshgr@chelsio.com>
---
 drivers/net/ethernet/chelsio/cxgb4/cudbg_entity.h | 28 ++++++++---------------
 1 file changed, 9 insertions(+), 19 deletions(-)

diff --git a/drivers/net/ethernet/chelsio/cxgb4/cudbg_entity.h b/drivers/net/ethernet/chelsio/cxgb4/cudbg_entity.h
index b57acb8dc35b..dc25066c59a1 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/cudbg_entity.h
+++ b/drivers/net/ethernet/chelsio/cxgb4/cudbg_entity.h
@@ -419,15 +419,15 @@ static const u32 t6_up_cim_reg_array[][IREG_NUM_ELEM + 1] = {
 	{0x7b50, 0x7b54, 0x280, 0x20, 0}, /* up_cim_280_to_2fc */
 	{0x7b50, 0x7b54, 0x300, 0x20, 0}, /* up_cim_300_to_37c */
 	{0x7b50, 0x7b54, 0x380, 0x14, 0}, /* up_cim_380_to_3cc */
-	{0x7b50, 0x7b54, 0x2900, 0x4, 0x4}, /* up_cim_2900_to_3d40 */
-	{0x7b50, 0x7b54, 0x2904, 0x4, 0x4}, /* up_cim_2904_to_3d44 */
-	{0x7b50, 0x7b54, 0x2908, 0x4, 0x4}, /* up_cim_2908_to_3d48 */
-	{0x7b50, 0x7b54, 0x2910, 0x4, 0x4}, /* up_cim_2910_to_3d4c */
-	{0x7b50, 0x7b54, 0x2914, 0x4, 0x4}, /* up_cim_2914_to_3d50 */
-	{0x7b50, 0x7b54, 0x2920, 0x10, 0x10}, /* up_cim_2920_to_2a10 */
-	{0x7b50, 0x7b54, 0x2924, 0x10, 0x10}, /* up_cim_2924_to_2a14 */
-	{0x7b50, 0x7b54, 0x2928, 0x10, 0x10}, /* up_cim_2928_to_2a18 */
-	{0x7b50, 0x7b54, 0x292c, 0x10, 0x10}, /* up_cim_292c_to_2a1c */
+	{0x7b50, 0x7b54, 0x4900, 0x4, 0x4}, /* up_cim_4900_to_4c60 */
+	{0x7b50, 0x7b54, 0x4904, 0x4, 0x4}, /* up_cim_4904_to_4c64 */
+	{0x7b50, 0x7b54, 0x4908, 0x4, 0x4}, /* up_cim_4908_to_4c68 */
+	{0x7b50, 0x7b54, 0x4910, 0x4, 0x4}, /* up_cim_4910_to_4c70 */
+	{0x7b50, 0x7b54, 0x4914, 0x4, 0x4}, /* up_cim_4914_to_4c74 */
+	{0x7b50, 0x7b54, 0x4920, 0x10, 0x10}, /* up_cim_4920_to_4a10 */
+	{0x7b50, 0x7b54, 0x4924, 0x10, 0x10}, /* up_cim_4924_to_4a14 */
+	{0x7b50, 0x7b54, 0x4928, 0x10, 0x10}, /* up_cim_4928_to_4a18 */
+	{0x7b50, 0x7b54, 0x492c, 0x10, 0x10}, /* up_cim_492c_to_4a1c */
 };
 
 static const u32 t5_up_cim_reg_array[][IREG_NUM_ELEM + 1] = {
@@ -444,16 +444,6 @@ static const u32 t5_up_cim_reg_array[][IREG_NUM_ELEM + 1] = {
 	{0x7b50, 0x7b54, 0x280, 0x20, 0}, /* up_cim_280_to_2fc */
 	{0x7b50, 0x7b54, 0x300, 0x20, 0}, /* up_cim_300_to_37c */
 	{0x7b50, 0x7b54, 0x380, 0x14, 0}, /* up_cim_380_to_3cc */
-	{0x7b50, 0x7b54, 0x2900, 0x4, 0x4}, /* up_cim_2900_to_3d40 */
-	{0x7b50, 0x7b54, 0x2904, 0x4, 0x4}, /* up_cim_2904_to_3d44 */
-	{0x7b50, 0x7b54, 0x2908, 0x4, 0x4}, /* up_cim_2908_to_3d48 */
-	{0x7b50, 0x7b54, 0x2910, 0x4, 0x4}, /* up_cim_2910_to_3d4c */
-	{0x7b50, 0x7b54, 0x2914, 0x4, 0x4}, /* up_cim_2914_to_3d50 */
-	{0x7b50, 0x7b54, 0x2918, 0x4, 0x4}, /* up_cim_2918_to_3d54 */
-	{0x7b50, 0x7b54, 0x291c, 0x4, 0x4}, /* up_cim_291c_to_3d58 */
-	{0x7b50, 0x7b54, 0x2924, 0x10, 0x10}, /* up_cim_2924_to_2914 */
-	{0x7b50, 0x7b54, 0x2928, 0x10, 0x10}, /* up_cim_2928_to_2a18 */
-	{0x7b50, 0x7b54, 0x292c, 0x10, 0x10}, /* up_cim_292c_to_2a1c */
 };
 
 static const u32 t6_hma_ireg_array[][IREG_NUM_ELEM] = {
-- 
2.14.1

^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox