Netdev List

Netdev List
 help / color / mirror / Atom feed

* [PATCH] net: mvpp2: typo and cosmetic fixes
From: Antoine Tenart @ 2018-05-18 12:34 UTC (permalink / raw)
  To: davem
  Cc: Antoine Tenart, netdev, linux-kernel, thomas.petazzoni,
	maxime.chevallier, gregory.clement, miquel.raynal, nadavh,
	stefanc, ymarkman, mw

This patch on the Marvell PPv2 driver is only cosmetic. Two typos are
removed as well as other cosmetic fixes, such as extra new lines or tabs
vs spaces.

Suggested-by: Stefan Chulski <stefanc@marvell.com>
Signed-off-by: Antoine Tenart <antoine.tenart@bootlin.com>
---
 drivers/net/ethernet/marvell/mvpp2.c | 12 ++++--------
 1 file changed, 4 insertions(+), 8 deletions(-)

diff --git a/drivers/net/ethernet/marvell/mvpp2.c b/drivers/net/ethernet/marvell/mvpp2.c
index f8ed983bc767..7f54bb0334f1 100644
--- a/drivers/net/ethernet/marvell/mvpp2.c
+++ b/drivers/net/ethernet/marvell/mvpp2.c
@@ -1757,7 +1757,6 @@ static void mvpp2_prs_tcam_ai_update(struct mvpp2_prs_entry *pe,
 	int i, ai_idx = MVPP2_PRS_TCAM_AI_BYTE;
 
 	for (i = 0; i < MVPP2_PRS_AI_BITS; i++) {
-
 		if (!(enable & BIT(i)))
 			continue;
 
@@ -1841,7 +1840,6 @@ static void mvpp2_prs_sram_ai_update(struct mvpp2_prs_entry *pe,
 	int ai_off = MVPP2_PRS_SRAM_AI_OFFS;
 
 	for (i = 0; i < MVPP2_PRS_SRAM_AI_CTRL_BITS; i++) {
-
 		if (!(mask & BIT(i)))
 			continue;
 
@@ -4937,7 +4935,7 @@ static void mvpp22_gop_mask_irq(struct mvpp2_port *port)
 	if (port->gop_id == 0) {
 		val = readl(port->base + MVPP22_XLG_EXT_INT_MASK);
 		val &= ~(MVPP22_XLG_EXT_INT_MASK_XLG |
-		         MVPP22_XLG_EXT_INT_MASK_GIG);
+			 MVPP22_XLG_EXT_INT_MASK_GIG);
 		writel(val, port->base + MVPP22_XLG_EXT_INT_MASK);
 	}
 
@@ -5471,7 +5469,6 @@ static void mvpp2_aggr_txq_pend_desc_add(struct mvpp2_port *port, int pending)
 			   MVPP2_AGGR_TXQ_UPDATE_REG, pending);
 }
 
-
 /* Check if there are enough free descriptors in aggregated txq.
  * If not, update the number of occupied descriptors and repeat the check.
  *
@@ -5551,7 +5548,7 @@ static int mvpp2_txq_reserved_desc_num_proc(struct mvpp2 *priv,
 
 	txq_pcpu->reserved_num += mvpp2_txq_alloc_reserved_desc(priv, txq, req);
 
-	/* OK, the descriptor cound has been updated: check again. */
+	/* OK, the descriptor could have been updated: check again. */
 	if (txq_pcpu->reserved_num < num)
 		return -ENOMEM;
 	return 0;
@@ -6033,7 +6030,7 @@ static int mvpp2_txq_init(struct mvpp2_port *port,
 	/* Calculate base address in prefetch buffer. We reserve 16 descriptors
 	 * for each existing TXQ.
 	 * TCONTS for PON port must be continuous from 0 to MVPP2_MAX_TCONT
-	 * GBE ports assumed to be continious from 0 to MVPP2_MAX_PORTS
+	 * GBE ports assumed to be continuous from 0 to MVPP2_MAX_PORTS
 	 */
 	desc_per_txq = 16;
 	desc = (port->id * MVPP2_MAX_TXQ * desc_per_txq) +
@@ -6603,8 +6600,7 @@ static int mvpp2_tx_frag_process(struct mvpp2_port *port, struct sk_buff *skb,
 		mvpp2_txdesc_size_set(port, tx_desc, frag->size);
 
 		buf_dma_addr = dma_map_single(port->dev->dev.parent, addr,
-					       frag->size,
-					       DMA_TO_DEVICE);
+					      frag->size, DMA_TO_DEVICE);
 		if (dma_mapping_error(port->dev->dev.parent, buf_dma_addr)) {
 			mvpp2_txq_desc_put(txq);
 			goto cleanup;
-- 
2.17.0

^ permalink raw reply related

* Re: pull-request Cavium liquidio firmware v1.7.2
From: Josh Boyer @ 2018-05-18 12:36 UTC (permalink / raw)
  To: Felix Manlunas
  Cc: Linux Firmware, netdev, raghu.vatsavayi, derek.chickles,
	satananda.burla
In-Reply-To: <20180509022157.GA1404@felix-thinkpad.cavium.com>

On Wed, May 9, 2018 at 12:54 AM Felix Manlunas <felix.manlunas@cavium.com>
wrote:

> The following changes since commit
8fc2d4e55685bf73b6f7752383da9067404a74bb:

>    Merge git://git.marvell.com/mwifiex-firmware (2018-05-07 09:09:40 -0400)

> are available in the git repository at:

>    https://github.com/felix-cavium/linux-firmware.git
for-upstreaming-v1.7.2

> for you to fetch changes up to d3b6941e1a85cbff895a92aa9e36b50deaeac970:

>    linux-firmware: liquidio: update firmware to v1.7.2 (2018-05-08
19:02:41 -0700)

> Signed-off-by: Felix Manlunas <felix.manlunas@cavium.com>
> ----------------------------------------------------------------
> Felix Manlunas (1):
>        linux-firmware: liquidio: update firmware to v1.7.2

>   WHENCE                     |   8 ++++----
>   liquidio/lio_210nv_nic.bin | Bin 1265368 -> 1281464 bytes
>   liquidio/lio_210sv_nic.bin | Bin 1163128 -> 1179352 bytes
>   liquidio/lio_23xx_nic.bin  | Bin 1271456 -> 1287264 bytes
>   liquidio/lio_410nv_nic.bin | Bin 1265368 -> 1281464 bytes
>   5 files changed, 4 insertions(+), 4 deletions(-)

Pulled and pushed out.  Thanks.

josh

^ permalink raw reply

* [PATCH bpf v2 0/6] bpf: enhancements for multi-function programs
From: Sandipan Das @ 2018-05-18 12:50 UTC (permalink / raw)
  To: ast, daniel; +Cc: netdev, linuxppc-dev, naveen.n.rao, mpe, jakub.kicinski

This patch series introduces the following:

[1] Support for bpf-to-bpf function calls in the powerpc64 JIT compiler.

[2] Provide a way for resolving function calls because of the way JITed
    images are allocated in powerpc64.

[3] Fix to get JITed instruction dumps for multi-function programs from
    the bpf system call.

v2:
 - Incorporate review comments from Jakub

Sandipan Das (6):
  bpf: support 64-bit offsets for bpf function calls
  bpf: powerpc64: add JIT support for multi-function programs
  bpf: get kernel symbol addresses via syscall
  tools: bpf: sync bpf uapi header
  tools: bpftool: resolve calls without using imm field
  bpf: fix JITed dump for multi-function programs via syscall

 arch/powerpc/net/bpf_jit_comp64.c | 79 ++++++++++++++++++++++++++++++++++-----
 include/uapi/linux/bpf.h          |  2 +
 kernel/bpf/syscall.c              | 56 ++++++++++++++++++++++++---
 kernel/bpf/verifier.c             | 22 +++++++----
 tools/bpf/bpftool/prog.c          | 29 ++++++++++++++
 tools/bpf/bpftool/xlated_dumper.c | 10 ++++-
 tools/bpf/bpftool/xlated_dumper.h |  2 +
 tools/include/uapi/linux/bpf.h    |  2 +
 8 files changed, 179 insertions(+), 23 deletions(-)

-- 
2.14.3

^ permalink raw reply

* [PATCH bpf v2 1/6] bpf: support 64-bit offsets for bpf function calls
From: Sandipan Das @ 2018-05-18 12:50 UTC (permalink / raw)
  To: ast, daniel; +Cc: netdev, linuxppc-dev, naveen.n.rao, mpe, jakub.kicinski
In-Reply-To: <20180518125039.6500-1-sandipan@linux.vnet.ibm.com>

The imm field of a bpf instruction is a signed 32-bit integer.
For JIT bpf-to-bpf function calls, it stores the offset of the
start address of the callee's JITed image from __bpf_call_base.

For some architectures, such as powerpc64, this offset may be
as large as 64 bits and cannot be accomodated in the imm field
without truncation.

We resolve this by:

[1] Additionally using the auxillary data of each function to
    keep a list of start addresses of the JITed images for all
    functions determined by the verifier.

[2] Retaining the subprog id inside the off field of the call
    instructions and using it to index into the list mentioned
    above and lookup the callee's address.

To make sure that the existing JIT compilers continue to work
without requiring changes, we keep the imm field as it is.

Signed-off-by: Sandipan Das <sandipan@linux.vnet.ibm.com>
---
 kernel/bpf/verifier.c | 15 ++++++++++++++-
 1 file changed, 14 insertions(+), 1 deletion(-)

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index a9e4b1372da6..6c56cce9c4e3 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -5383,11 +5383,24 @@ static int jit_subprogs(struct bpf_verifier_env *env)
 			    insn->src_reg != BPF_PSEUDO_CALL)
 				continue;
 			subprog = insn->off;
-			insn->off = 0;
 			insn->imm = (u64 (*)(u64, u64, u64, u64, u64))
 				func[subprog]->bpf_func -
 				__bpf_call_base;
 		}
+
+		/* we use the aux data to keep a list of the start addresses
+		 * of the JITed images for each function in the program
+		 *
+		 * for some architectures, such as powerpc64, the imm field
+		 * might not be large enough to hold the offset of the start
+		 * address of the callee's JITed image from __bpf_call_base
+		 *
+		 * in such cases, we can lookup the start address of a callee
+		 * by using its subprog id, available from the off field of
+		 * the call instruction, as an index for this list
+		 */
+		func[i]->aux->func = func;
+		func[i]->aux->func_cnt = env->subprog_cnt + 1;
 	}
 	for (i = 0; i < env->subprog_cnt; i++) {
 		old_bpf_func = func[i]->bpf_func;
-- 
2.14.3

^ permalink raw reply related

* [PATCH bpf v2 2/6] bpf: powerpc64: add JIT support for multi-function programs
From: Sandipan Das @ 2018-05-18 12:50 UTC (permalink / raw)
  To: ast, daniel; +Cc: netdev, linuxppc-dev, naveen.n.rao, mpe, jakub.kicinski
In-Reply-To: <20180518125039.6500-1-sandipan@linux.vnet.ibm.com>

This adds support for bpf-to-bpf function calls in the powerpc64
JIT compiler. The JIT compiler converts the bpf call instructions
to native branch instructions. After a round of the usual passes,
the start addresses of the JITed images for the callee functions
are known. Finally, to fixup the branch target addresses, we need
to perform an extra pass.

Because of the address range in which JITed images are allocated
on powerpc64, the offsets of the start addresses of these images
from __bpf_call_base are as large as 64 bits. So, for a function
call, we cannot use the imm field of the instruction to determine
the callee's address. Instead, we use the alternative method of
getting it from the list of function addresses in the auxillary
data of the caller by using the off field as an index.

Signed-off-by: Sandipan Das <sandipan@linux.vnet.ibm.com>
---
 arch/powerpc/net/bpf_jit_comp64.c | 79 ++++++++++++++++++++++++++++++++++-----
 1 file changed, 69 insertions(+), 10 deletions(-)

diff --git a/arch/powerpc/net/bpf_jit_comp64.c b/arch/powerpc/net/bpf_jit_comp64.c
index 1bdb1aff0619..25939892d8f7 100644
--- a/arch/powerpc/net/bpf_jit_comp64.c
+++ b/arch/powerpc/net/bpf_jit_comp64.c
@@ -256,7 +256,7 @@ static void bpf_jit_emit_tail_call(u32 *image, struct codegen_context *ctx, u32
 /* Assemble the body code between the prologue & epilogue */
 static int bpf_jit_build_body(struct bpf_prog *fp, u32 *image,
 			      struct codegen_context *ctx,
-			      u32 *addrs)
+			      u32 *addrs, bool extra_pass)
 {
 	const struct bpf_insn *insn = fp->insnsi;
 	int flen = fp->len;
@@ -712,11 +712,23 @@ static int bpf_jit_build_body(struct bpf_prog *fp, u32 *image,
 			break;
 
 		/*
-		 * Call kernel helper
+		 * Call kernel helper or bpf function
 		 */
 		case BPF_JMP | BPF_CALL:
 			ctx->seen |= SEEN_FUNC;
-			func = (u8 *) __bpf_call_base + imm;
+
+			/* bpf function call */
+			if (insn[i].src_reg == BPF_PSEUDO_CALL && extra_pass)
+				if (fp->aux->func && off < fp->aux->func_cnt)
+					/* use the subprog id from the off
+					 * field to lookup the callee address
+					 */
+					func = (u8 *) fp->aux->func[off]->bpf_func;
+				else
+					return -EINVAL;
+			/* kernel helper call */
+			else
+				func = (u8 *) __bpf_call_base + imm;
 
 			bpf_jit_emit_func_call(image, ctx, (u64)func);
 
@@ -864,6 +876,14 @@ static int bpf_jit_build_body(struct bpf_prog *fp, u32 *image,
 	return 0;
 }
 
+struct powerpc64_jit_data {
+	struct bpf_binary_header *header;
+	u32 *addrs;
+	u8 *image;
+	u32 proglen;
+	struct codegen_context ctx;
+};
+
 struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *fp)
 {
 	u32 proglen;
@@ -871,6 +891,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *fp)
 	u8 *image = NULL;
 	u32 *code_base;
 	u32 *addrs;
+	struct powerpc64_jit_data *jit_data;
 	struct codegen_context cgctx;
 	int pass;
 	int flen;
@@ -878,6 +899,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *fp)
 	struct bpf_prog *org_fp = fp;
 	struct bpf_prog *tmp_fp;
 	bool bpf_blinded = false;
+	bool extra_pass = false;
 
 	if (!fp->jit_requested)
 		return org_fp;
@@ -891,7 +913,28 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *fp)
 		fp = tmp_fp;
 	}
 
+	jit_data = fp->aux->jit_data;
+	if (!jit_data) {
+		jit_data = kzalloc(sizeof(*jit_data), GFP_KERNEL);
+		if (!jit_data) {
+			fp = org_fp;
+			goto out;
+		}
+		fp->aux->jit_data = jit_data;
+	}
+
 	flen = fp->len;
+	addrs = jit_data->addrs;
+	if (addrs) {
+		cgctx = jit_data->ctx;
+		image = jit_data->image;
+		bpf_hdr = jit_data->header;
+		proglen = jit_data->proglen;
+		alloclen = proglen + FUNCTION_DESCR_SIZE;
+		extra_pass = true;
+		goto skip_init_ctx;
+	}
+
 	addrs = kzalloc((flen+1) * sizeof(*addrs), GFP_KERNEL);
 	if (addrs == NULL) {
 		fp = org_fp;
@@ -904,10 +947,10 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *fp)
 	cgctx.stack_size = round_up(fp->aux->stack_depth, 16);
 
 	/* Scouting faux-generate pass 0 */
-	if (bpf_jit_build_body(fp, 0, &cgctx, addrs)) {
+	if (bpf_jit_build_body(fp, 0, &cgctx, addrs, false)) {
 		/* We hit something illegal or unsupported. */
 		fp = org_fp;
-		goto out;
+		goto out_addrs;
 	}
 
 	/*
@@ -925,9 +968,10 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *fp)
 			bpf_jit_fill_ill_insns);
 	if (!bpf_hdr) {
 		fp = org_fp;
-		goto out;
+		goto out_addrs;
 	}
 
+skip_init_ctx:
 	code_base = (u32 *)(image + FUNCTION_DESCR_SIZE);
 
 	/* Code generation passes 1-2 */
@@ -935,7 +979,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *fp)
 		/* Now build the prologue, body code & epilogue for real. */
 		cgctx.idx = 0;
 		bpf_jit_build_prologue(code_base, &cgctx);
-		bpf_jit_build_body(fp, code_base, &cgctx, addrs);
+		bpf_jit_build_body(fp, code_base, &cgctx, addrs, extra_pass);
 		bpf_jit_build_epilogue(code_base, &cgctx);
 
 		if (bpf_jit_enable > 1)
@@ -956,15 +1000,30 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *fp)
 	((u64 *)image)[1] = local_paca->kernel_toc;
 #endif
 
+	bpf_flush_icache(bpf_hdr, (u8 *)bpf_hdr + (bpf_hdr->pages * PAGE_SIZE));
+
+	if (!fp->is_func || extra_pass) {
+		bpf_jit_binary_lock_ro(bpf_hdr);
+	} else {
+		jit_data->addrs = addrs;
+		jit_data->ctx = cgctx;
+		jit_data->proglen = proglen;
+		jit_data->image = image;
+		jit_data->header = bpf_hdr;
+	}
+
 	fp->bpf_func = (void *)image;
 	fp->jited = 1;
 	fp->jited_len = alloclen;
 
-	bpf_flush_icache(bpf_hdr, (u8 *)bpf_hdr + (bpf_hdr->pages * PAGE_SIZE));
+	if (!fp->is_func || extra_pass) {
+out_addrs:
+		kfree(addrs);
+		kfree(jit_data);
+		fp->aux->jit_data = NULL;
+	}
 
 out:
-	kfree(addrs);
-
 	if (bpf_blinded)
 		bpf_jit_prog_release_other(fp, fp == org_fp ? tmp_fp : org_fp);
 
-- 
2.14.3

^ permalink raw reply related

* [PATCH bpf v2 3/6] bpf: get kernel symbol addresses via syscall
From: Sandipan Das @ 2018-05-18 12:50 UTC (permalink / raw)
  To: ast, daniel; +Cc: netdev, linuxppc-dev, naveen.n.rao, mpe, jakub.kicinski
In-Reply-To: <20180518125039.6500-1-sandipan@linux.vnet.ibm.com>

This adds new two new fields to struct bpf_prog_info. For
multi-function programs, these fields can be used to pass
a list of kernel symbol addresses for all functions in a
given program and to userspace using the bpf system call
with the BPF_OBJ_GET_INFO_BY_FD command.

When bpf_jit_kallsyms is enabled, we can get the address
of the corresponding kernel symbol for a callee function
and resolve the symbol's name. The address is determined
by adding the value of the call instruction's imm field
to __bpf_call_base. This offset gets assigned to the imm
field by the verifier.

For some architectures, such as powerpc64, the imm field
is not large enough to hold this offset.

We resolve this by:

[1] Assigning the subprog id to the imm field of a call
    instruction in the verifier instead of the offset of
    the callee's symbol's address from __bpf_call_base.

[2] Determining the address of a callee's corresponding
    symbol by using the imm field as an index for the
    list of kernel symbol addresses now available from
    the program info.

Suggested-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Sandipan Das <sandipan@linux.vnet.ibm.com>
---
 include/uapi/linux/bpf.h |  2 ++
 kernel/bpf/syscall.c     | 20 ++++++++++++++++++++
 kernel/bpf/verifier.c    |  7 +------
 3 files changed, 23 insertions(+), 6 deletions(-)

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index d94d333a8225..040c9cac7303 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -2188,6 +2188,8 @@ struct bpf_prog_info {
 	__u32 xlated_prog_len;
 	__aligned_u64 jited_prog_insns;
 	__aligned_u64 xlated_prog_insns;
+	__aligned_u64 jited_ksyms;
+	__u32 nr_jited_ksyms;
 	__u64 load_time;	/* ns since boottime */
 	__u32 created_by_uid;
 	__u32 nr_map_ids;
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index bfcde949c7f8..54a72fafe57c 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1933,6 +1933,7 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog,
 	if (!capable(CAP_SYS_ADMIN)) {
 		info.jited_prog_len = 0;
 		info.xlated_prog_len = 0;
+		info.nr_jited_ksyms = 0;
 		goto done;
 	}
 
@@ -1981,6 +1982,25 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog,
 		}
 	}
 
+	ulen = info.nr_jited_ksyms;
+	info.nr_jited_ksyms = prog->aux->func_cnt;
+	if (info.nr_jited_ksyms && ulen) {
+		u64 __user *user_jited_ksyms = u64_to_user_ptr(info.jited_ksyms);
+		ulong ksym_addr;
+		u32 i;
+
+		/* copy the address of the kernel symbol corresponding to
+		 * each function
+		 */
+		ulen = min_t(u32, info.nr_jited_ksyms, ulen);
+		for (i = 0; i < ulen; i++) {
+			ksym_addr = (ulong) prog->aux->func[i]->bpf_func;
+			ksym_addr &= PAGE_MASK;
+			if (put_user((u64) ksym_addr, &user_jited_ksyms[i]))
+				return -EFAULT;
+		}
+	}
+
 done:
 	if (copy_to_user(uinfo, &info, info_len) ||
 	    put_user(info_len, &uattr->info.info_len))
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 6c56cce9c4e3..e826c396aba2 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -5426,17 +5426,12 @@ static int jit_subprogs(struct bpf_verifier_env *env)
 	 * later look the same as if they were interpreted only.
 	 */
 	for (i = 0, insn = prog->insnsi; i < prog->len; i++, insn++) {
-		unsigned long addr;
-
 		if (insn->code != (BPF_JMP | BPF_CALL) ||
 		    insn->src_reg != BPF_PSEUDO_CALL)
 			continue;
 		insn->off = env->insn_aux_data[i].call_imm;
 		subprog = find_subprog(env, i + insn->off + 1);
-		addr  = (unsigned long)func[subprog]->bpf_func;
-		addr &= PAGE_MASK;
-		insn->imm = (u64 (*)(u64, u64, u64, u64, u64))
-			    addr - __bpf_call_base;
+		insn->imm = subprog;
 	}
 
 	prog->jited = 1;
-- 
2.14.3

^ permalink raw reply related

* [PATCH bpf v2 4/6] tools: bpf: sync bpf uapi header
From: Sandipan Das @ 2018-05-18 12:50 UTC (permalink / raw)
  To: ast, daniel; +Cc: netdev, linuxppc-dev, naveen.n.rao, mpe, jakub.kicinski
In-Reply-To: <20180518125039.6500-1-sandipan@linux.vnet.ibm.com>

Syncing the bpf.h uapi header with tools so that struct
bpf_prog_info has the two new fields for passing on the
addresses of the kernel symbols corresponding to each
function in a JITed program.

Signed-off-by: Sandipan Das <sandipan@linux.vnet.ibm.com>
---
 tools/include/uapi/linux/bpf.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index d94d333a8225..040c9cac7303 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -2188,6 +2188,8 @@ struct bpf_prog_info {
 	__u32 xlated_prog_len;
 	__aligned_u64 jited_prog_insns;
 	__aligned_u64 xlated_prog_insns;
+	__aligned_u64 jited_ksyms;
+	__u32 nr_jited_ksyms;
 	__u64 load_time;	/* ns since boottime */
 	__u32 created_by_uid;
 	__u32 nr_map_ids;
-- 
2.14.3

^ permalink raw reply related

* [PATCH bpf v2 5/6] tools: bpftool: resolve calls without using imm field
From: Sandipan Das @ 2018-05-18 12:50 UTC (permalink / raw)
  To: ast, daniel; +Cc: netdev, linuxppc-dev, naveen.n.rao, mpe, jakub.kicinski
In-Reply-To: <20180518125039.6500-1-sandipan@linux.vnet.ibm.com>

Currently, we resolve the callee's address for a JITed function
call by using the imm field of the call instruction as an offset
from __bpf_call_base. If bpf_jit_kallsyms is enabled, we further
use this address to get the callee's kernel symbol's name.

For some architectures, such as powerpc64, the imm field is not
large enough to hold this offset. So, instead of assigning this
offset to the imm field, the verifier now assigns the subprog
id. Also, a list of kernel symbol addresses for all the JITed
functions is provided in the program info. We now use the imm
field as an index for this list to lookup a callee's symbol's
address and resolve its name.

Suggested-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Sandipan Das <sandipan@linux.vnet.ibm.com>
---
v2:
 - Order variables from longest to shortest
 - Make sure that ksyms_ptr and ksyms_len are always initialized
 - Simplify code
---
 tools/bpf/bpftool/prog.c          | 29 +++++++++++++++++++++++++++++
 tools/bpf/bpftool/xlated_dumper.c | 10 +++++++++-
 tools/bpf/bpftool/xlated_dumper.h |  2 ++
 3 files changed, 40 insertions(+), 1 deletion(-)

diff --git a/tools/bpf/bpftool/prog.c b/tools/bpf/bpftool/prog.c
index 9bdfdf2d3fbe..e2f8f8f259fc 100644
--- a/tools/bpf/bpftool/prog.c
+++ b/tools/bpf/bpftool/prog.c
@@ -421,19 +421,26 @@ static int do_show(int argc, char **argv)
 static int do_dump(int argc, char **argv)
 {
 	struct bpf_prog_info info = {};
+	unsigned long *addrs = NULL;
 	struct dump_data dd = {};
 	__u32 len = sizeof(info);
 	unsigned int buf_size;
+	unsigned int nr_addrs;
 	char *filepath = NULL;
 	bool opcodes = false;
 	bool visual = false;
 	unsigned char *buf;
 	__u32 *member_len;
 	__u64 *member_ptr;
+	__u32 *ksyms_len;
+	__u64 *ksyms_ptr;
 	ssize_t n;
 	int err;
 	int fd;
 
+	ksyms_len = &info.nr_jited_ksyms;
+	ksyms_ptr = &info.jited_ksyms;
+
 	if (is_prefix(*argv, "jited")) {
 		member_len = &info.jited_prog_len;
 		member_ptr = &info.jited_prog_insns;
@@ -496,10 +503,22 @@ static int do_dump(int argc, char **argv)
 		return -1;
 	}
 
+	nr_addrs = *ksyms_len;
+	if (nr_addrs) {
+		addrs = malloc(nr_addrs * sizeof(__u64));
+		if (!addrs) {
+			p_err("mem alloc failed");
+			close(fd);
+			goto err_free;
+		}
+	}
+
 	memset(&info, 0, sizeof(info));
 
 	*member_ptr = ptr_to_u64(buf);
 	*member_len = buf_size;
+	*ksyms_ptr = ptr_to_u64(addrs);
+	*ksyms_len = nr_addrs;
 
 	err = bpf_obj_get_info_by_fd(fd, &info, &len);
 	close(fd);
@@ -513,6 +532,11 @@ static int do_dump(int argc, char **argv)
 		goto err_free;
 	}
 
+	if (*ksyms_len > nr_addrs) {
+		p_err("too many addresses returned");
+		goto err_free;
+	}
+
 	if ((member_len == &info.jited_prog_len &&
 	     info.jited_prog_insns == 0) ||
 	    (member_len == &info.xlated_prog_len &&
@@ -558,6 +582,9 @@ static int do_dump(int argc, char **argv)
 			dump_xlated_cfg(buf, *member_len);
 	} else {
 		kernel_syms_load(&dd);
+		dd.nr_jited_ksyms = *ksyms_len;
+		dd.jited_ksyms = (__u64 *) *ksyms_ptr;
+
 		if (json_output)
 			dump_xlated_json(&dd, buf, *member_len, opcodes);
 		else
@@ -566,10 +593,12 @@ static int do_dump(int argc, char **argv)
 	}
 
 	free(buf);
+	free(addrs);
 	return 0;
 
 err_free:
 	free(buf);
+	free(addrs);
 	return -1;
 }
 
diff --git a/tools/bpf/bpftool/xlated_dumper.c b/tools/bpf/bpftool/xlated_dumper.c
index 7a3173b76c16..fb065b55db6d 100644
--- a/tools/bpf/bpftool/xlated_dumper.c
+++ b/tools/bpf/bpftool/xlated_dumper.c
@@ -174,7 +174,11 @@ static const char *print_call_pcrel(struct dump_data *dd,
 				    unsigned long address,
 				    const struct bpf_insn *insn)
 {
-	if (sym)
+	if (!dd->nr_jited_ksyms)
+		/* Do not show address for interpreted programs */
+		snprintf(dd->scratch_buff, sizeof(dd->scratch_buff),
+			"%+d", insn->off);
+	else if (sym)
 		snprintf(dd->scratch_buff, sizeof(dd->scratch_buff),
 			 "%+d#%s", insn->off, sym->name);
 	else
@@ -203,6 +207,10 @@ static const char *print_call(void *private_data,
 	unsigned long address = dd->address_call_base + insn->imm;
 	struct kernel_sym *sym;
 
+	if (insn->src_reg == BPF_PSEUDO_CALL &&
+		(__u32) insn->imm < dd->nr_jited_ksyms)
+		address = dd->jited_ksyms[insn->imm];
+
 	sym = kernel_syms_search(dd, address);
 	if (insn->src_reg == BPF_PSEUDO_CALL)
 		return print_call_pcrel(dd, sym, address, insn);
diff --git a/tools/bpf/bpftool/xlated_dumper.h b/tools/bpf/bpftool/xlated_dumper.h
index b34affa7ef2d..eafbb49c8d0b 100644
--- a/tools/bpf/bpftool/xlated_dumper.h
+++ b/tools/bpf/bpftool/xlated_dumper.h
@@ -49,6 +49,8 @@ struct dump_data {
 	unsigned long address_call_base;
 	struct kernel_sym *sym_mapping;
 	__u32 sym_count;
+	__u64 *jited_ksyms;
+	__u32 nr_jited_ksyms;
 	char scratch_buff[SYM_MAX_NAME + 8];
 };
 
-- 
2.14.3

^ permalink raw reply related

* [PATCH bpf v2 6/6] bpf: fix JITed dump for multi-function programs via syscall
From: Sandipan Das @ 2018-05-18 12:50 UTC (permalink / raw)
  To: ast, daniel; +Cc: netdev, linuxppc-dev, naveen.n.rao, mpe, jakub.kicinski
In-Reply-To: <20180518125039.6500-1-sandipan@linux.vnet.ibm.com>

Currently, for multi-function programs, we cannot get the JITed
instructions using the bpf system call's BPF_OBJ_GET_INFO_BY_FD
command. Because of this, userspace tools such as bpftool fail
to identify a multi-function program as being JITed or not.

With the JIT enabled and the test program running, this can be
verified as follows:

  # cat /proc/sys/net/core/bpf_jit_enable
  1

Before applying this patch:

  # bpftool prog list
  1: kprobe  name foo  tag b811aab41a39ad3d  gpl
          loaded_at 2018-05-16T11:43:38+0530  uid 0
          xlated 216B  not jited  memlock 65536B
  ...

  # bpftool prog dump jited id 1
  no instructions returned

After applying this patch:

  # bpftool prog list
  1: kprobe  name foo  tag b811aab41a39ad3d  gpl
          loaded_at 2018-05-16T12:13:01+0530  uid 0
          xlated 216B  jited 308B  memlock 65536B
  ...

  # bpftool prog dump jited id 1
     0:   nop
     4:   nop
     8:   mflr    r0
     c:   std     r0,16(r1)
    10:   stdu    r1,-112(r1)
    14:   std     r31,104(r1)
    18:   addi    r31,r1,48
    1c:   li      r3,10
  ...

Signed-off-by: Sandipan Das <sandipan@linux.vnet.ibm.com>
---
 kernel/bpf/syscall.c | 38 ++++++++++++++++++++++++++++++++------
 1 file changed, 32 insertions(+), 6 deletions(-)

diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 54a72fafe57c..2430d159078c 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1896,7 +1896,7 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog,
 	struct bpf_prog_info info = {};
 	u32 info_len = attr->info.info_len;
 	char __user *uinsns;
-	u32 ulen;
+	u32 ulen, i;
 	int err;
 
 	err = check_uarg_tail_zero(uinfo, sizeof(info), info_len);
@@ -1922,7 +1922,6 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog,
 	ulen = min_t(u32, info.nr_map_ids, ulen);
 	if (ulen) {
 		u32 __user *user_map_ids = u64_to_user_ptr(info.map_ids);
-		u32 i;
 
 		for (i = 0; i < ulen; i++)
 			if (put_user(prog->aux->used_maps[i]->id,
@@ -1970,13 +1969,41 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog,
 	 * for offload.
 	 */
 	ulen = info.jited_prog_len;
-	info.jited_prog_len = prog->jited_len;
+	if (prog->aux->func_cnt) {
+		info.jited_prog_len = 0;
+		for (i = 0; i < prog->aux->func_cnt; i++)
+			info.jited_prog_len += prog->aux->func[i]->jited_len;
+	} else {
+		info.jited_prog_len = prog->jited_len;
+	}
+
 	if (info.jited_prog_len && ulen) {
 		if (bpf_dump_raw_ok()) {
 			uinsns = u64_to_user_ptr(info.jited_prog_insns);
 			ulen = min_t(u32, info.jited_prog_len, ulen);
-			if (copy_to_user(uinsns, prog->bpf_func, ulen))
-				return -EFAULT;
+
+			/* for multi-function programs, copy the JITed
+			 * instructions for all the functions
+			 */
+			if (prog->aux->func_cnt) {
+				u32 len, free;
+				u8 *img;
+
+				free = ulen;
+				for (i = 0; i < prog->aux->func_cnt; i++) {
+					len = prog->aux->func[i]->jited_len;
+					img = (u8 *) prog->aux->func[i]->bpf_func;
+					if (len > free)
+						break;
+					if (copy_to_user(uinsns, img, len))
+						return -EFAULT;
+					uinsns += len;
+					free -= len;
+				}
+			} else {
+				if (copy_to_user(uinsns, prog->bpf_func, ulen))
+					return -EFAULT;
+			}
 		} else {
 			info.jited_prog_insns = 0;
 		}
@@ -1987,7 +2014,6 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog,
 	if (info.nr_jited_ksyms && ulen) {
 		u64 __user *user_jited_ksyms = u64_to_user_ptr(info.jited_ksyms);
 		ulong ksym_addr;
-		u32 i;
 
 		/* copy the address of the kernel symbol corresponding to
 		 * each function
-- 
2.14.3

^ permalink raw reply related

* [PATCH net] net: sched: red: avoid hashing NULL child
From: Paolo Abeni @ 2018-05-18 12:51 UTC (permalink / raw)
  To: netdev
  Cc: Jamal Hadi Salim, Cong Wang, Jiri Pirko, David S. Miller,
	Hangbin Liu, Jiri Kosina

Hangbin reported an Oops triggered by the syzkaller qdisc rules:

 kasan: GPF could be caused by NULL-ptr deref or user memory access
 general protection fault: 0000 [#1] SMP KASAN PTI
 Modules linked in: sch_red
 CPU: 0 PID: 28699 Comm: syz-executor5 Not tainted 4.17.0-rc4.kcov #1
 Hardware name: Red Hat KVM, BIOS 0.5.1 01/01/2011
 RIP: 0010:qdisc_hash_add+0x26/0xa0
 RSP: 0018:ffff8800589cf470 EFLAGS: 00010203
 RAX: dffffc0000000000 RBX: 0000000000000000 RCX: ffffffff824ad971
 RDX: 0000000000000007 RSI: ffffc9000ce9f000 RDI: 000000000000003c
 RBP: 0000000000000001 R08: ffffed000b139ea2 R09: ffff8800589cf4f0
 R10: ffff8800589cf50f R11: ffffed000b139ea2 R12: ffff880054019fc0
 R13: ffff880054019fb4 R14: ffff88005c0af600 R15: ffff880054019fb0
 FS:  00007fa6edcb1700(0000) GS:ffff88005ce00000(0000) knlGS:0000000000000000
 CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
 CR2: 0000000020000740 CR3: 000000000fc16000 CR4: 00000000000006f0
 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
 Call Trace:
  red_change+0x2d2/0xed0 [sch_red]
  qdisc_create+0x57e/0xef0
  tc_modify_qdisc+0x47f/0x14e0
  rtnetlink_rcv_msg+0x6a8/0x920
  netlink_rcv_skb+0x2a2/0x3c0
  netlink_unicast+0x511/0x740
  netlink_sendmsg+0x825/0xc30
  sock_sendmsg+0xc5/0x100
  ___sys_sendmsg+0x778/0x8e0
  __sys_sendmsg+0xf5/0x1b0
  do_syscall_64+0xbd/0x3b0
  entry_SYSCALL_64_after_hwframe+0x44/0xa9
 RIP: 0033:0x450869
 RSP: 002b:00007fa6edcb0c48 EFLAGS: 00000246 ORIG_RAX: 000000000000002e
 RAX: ffffffffffffffda RBX: 00007fa6edcb16b4 RCX: 0000000000450869
 RDX: 0000000000000000 RSI: 00000000200000c0 RDI: 0000000000000013
 RBP: 000000000072bea0 R08: 0000000000000000 R09: 0000000000000000
 R10: 0000000000000000 R11: 0000000000000246 R12: 00000000ffffffff
 R13: 0000000000008778 R14: 0000000000702838 R15: 00007fa6edcb1700
 Code: e9 0b fe ff ff 0f 1f 44 00 00 55 53 48 89 fb 89 f5 e8 3f 07 f3 fe 48 8d 7b 3c 48 b8 00 00 00 00 00 fc ff df 48 89 fa 48 c1 ea 03 <0f> b6 14 02 48 89 f8 83 e0 07 83 c0 03 38 d0 7c 04 84 d2 75 51
 RIP: qdisc_hash_add+0x26/0xa0 RSP: ffff8800589cf470

When a red qdisc is updated with a 0 limit, the child qdisc is left
unmodified, no additional scheduler is created in red_change(),
the 'child' local variable is rightfully NULL and must not add it
to the hash table.

This change addresses the above issue moving qdisc_hash_add() right
after the child qdisc creation. It additionally removes unneeded checks
for noop_qdisc.

Reported-by: Hangbin Liu <liuhangbin@gmail.com>
Fixes: 49b499718fa1 ("net: sched: make default fifo qdiscs appear in the dump")
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 net/sched/sch_red.c | 5 +++--
 net/sched/sch_tbf.c | 5 +++--
 2 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/net/sched/sch_red.c b/net/sched/sch_red.c
index 16644b3d2362..56c181c3feeb 100644
--- a/net/sched/sch_red.c
+++ b/net/sched/sch_red.c
@@ -222,10 +222,11 @@ static int red_change(struct Qdisc *sch, struct nlattr *opt,
 					 extack);
 		if (IS_ERR(child))
 			return PTR_ERR(child);
-	}
 
-	if (child != &noop_qdisc)
+		/* child is fifo, no need to check for noop_qdisc */
 		qdisc_hash_add(child, true);
+	}
+
 	sch_tree_lock(sch);
 	q->flags = ctl->flags;
 	q->limit = ctl->limit;
diff --git a/net/sched/sch_tbf.c b/net/sched/sch_tbf.c
index 03225a8df973..6f74a426f159 100644
--- a/net/sched/sch_tbf.c
+++ b/net/sched/sch_tbf.c
@@ -383,6 +383,9 @@ static int tbf_change(struct Qdisc *sch, struct nlattr *opt,
 			err = PTR_ERR(child);
 			goto done;
 		}
+
+		/* child is fifo, no need to check for noop_qdisc */
+		qdisc_hash_add(child, true);
 	}
 
 	sch_tree_lock(sch);
@@ -391,8 +394,6 @@ static int tbf_change(struct Qdisc *sch, struct nlattr *opt,
 					  q->qdisc->qstats.backlog);
 		qdisc_destroy(q->qdisc);
 		q->qdisc = child;
-		if (child != &noop_qdisc)
-			qdisc_hash_add(child, true);
 	}
 	q->limit = qopt->limit;
 	if (tb[TCA_TBF_PBURST])
-- 
2.14.3

^ permalink raw reply related

* Re: [PATCH 28/42] drbd: switch to proc_create_single
From: Lars Ellenberg @ 2018-05-18 12:55 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: linux-rtc-u79uwXL29TY76Z2rM5mHXA, Alessandro Zummo,
	Alexandre Belloni, devel-gWbeCf7V1WCQmaza687I9mD2FQJk+8+b,
	linux-scsi-u79uwXL29TY76Z2rM5mHXA,
	linux-acpi-u79uwXL29TY76Z2rM5mHXA, Greg Kroah-Hartman, Jiri Slaby,
	megaraidlinux.pdl-dY08KVG/lbpWk0Htik3J/w,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA, Alexey Dobriyan,
	linux-ide-u79uwXL29TY76Z2rM5mHXA,
	netfilter-devel-u79uwXL29TY76Z2rM5mHXA, Alexander Viro,
	netdev-u79uwXL29TY76Z2rM5mHXA, Andrew Morton,
	linux-ext4-u79uwXL29TY76Z2rM5mHXA,
	linux-afs-IAPFreCvJWM7uuMidbF8XUB+6BGkLq7r,
	jfs-discussion-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f,
	drbd-dev-cunTk1MwBs8qoQakbn7OcQ
In-Reply-To: <20180516094346.20506-29-hch-jcswGhMUV9g@public.gmane.org>

On Wed, May 16, 2018 at 11:43:32AM +0200, Christoph Hellwig wrote:
> And stop messing with try_module_get on THIS_MODULE, which doesn't make
> any sense here.

The idea was to increase module count on /proc/drbd access.

If someone holds /proc/drbd open, previously rmmod would
"succeed" in starting the unload, but then block on remove_proc_entry,
leading to a situation where the lsmod does not show drbd anymore,
but /proc/drbd being still there (but no longer accessible).

I'd rather have rmmod fail up front in this case.
And try_module_get() seemed most appropriate.

    Lars

^ permalink raw reply

* [PATCH net] tuntap: raise EPOLLOUT on device up
From: Jason Wang @ 2018-05-18 13:00 UTC (permalink / raw)
  To: netdev, linux-kernel; +Cc: mst, Jason Wang, Hannes Frederic Sowa, Eric Dumazet

We return -EIO on device down but can not raise EPOLLOUT after it was
up. This may confuse user like vhost which expects tuntap to raise
EPOLLOUT to re-enable its TX routine after tuntap is down. This could
be easily reproduced by transmitting packets from VM while down and up
the tap device. Fixing this by set SOCKWQ_ASYNC_NOSPACE on -EIO.

Cc: Hannes Frederic Sowa <hannes@stressinduktion.org>
Cc: Eric Dumazet <edumazet@google.com>
Fixes: 1bd4978a88ac2 ("tun: honor IFF_UP in tun_get_user()")
Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 drivers/net/tun.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index d45ac37..1b29761 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -1734,8 +1734,10 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
 	int skb_xdp = 1;
 	bool frags = tun_napi_frags_enabled(tun);
 
-	if (!(tun->dev->flags & IFF_UP))
+	if (!(tun->dev->flags & IFF_UP)) {
+		set_bit(SOCKWQ_ASYNC_NOSPACE, &tfile->socket.flags);
 		return -EIO;
+	}
 
 	if (!(tun->flags & IFF_NO_PI)) {
 		if (len < sizeof(pi))
-- 
2.7.4

^ permalink raw reply related

* Re: [PATCH net] sock_diag: fix use-after-free read in __sk_free
From: Craig Gallek @ 2018-05-18 13:05 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: David S . Miller, netdev, Eric Dumazet

On Fri, May 18, 2018 at 7:47 AM, Eric Dumazet <edumazet@google.com> wrote:
> We must not call sock_diag_has_destroy_listeners(sk) on a socket
> that has no reference on net structure.
>
> BUG: KASAN: use-after-free in sock_diag_has_destroy_listeners include/linux/sock_diag.h:75 [inline]
> BUG: KASAN: use-after-free in __sk_free+0x329/0x340 net/core/sock.c:1609
> Read of size 8 at addr ffff88018a02e3a0 by task swapper/1/0
>
> CPU: 1 PID: 0 Comm: swapper/1 Not tainted 4.17.0-rc5+ #54
> Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
> Call Trace:
>  <IRQ>
>  __dump_stack lib/dump_stack.c:77 [inline]
>  dump_stack+0x1b9/0x294 lib/dump_stack.c:113
>  print_address_description+0x6c/0x20b mm/kasan/report.c:256
>  kasan_report_error mm/kasan/report.c:354 [inline]
>  kasan_report.cold.7+0x242/0x2fe mm/kasan/report.c:412
>  __asan_report_load8_noabort+0x14/0x20 mm/kasan/report.c:433
>  sock_diag_has_destroy_listeners include/linux/sock_diag.h:75 [inline]
>  __sk_free+0x329/0x340 net/core/sock.c:1609
>  sk_free+0x42/0x50 net/core/sock.c:1623
>  sock_put include/net/sock.h:1664 [inline]
>  reqsk_free include/net/request_sock.h:116 [inline]
>  reqsk_put include/net/request_sock.h:124 [inline]
>  inet_csk_reqsk_queue_drop_and_put net/ipv4/inet_connection_sock.c:672 [inline]
>  reqsk_timer_handler+0xe27/0x10e0 net/ipv4/inet_connection_sock.c:739
>  call_timer_fn+0x230/0x940 kernel/time/timer.c:1326
>  expire_timers kernel/time/timer.c:1363 [inline]
>  __run_timers+0x79e/0xc50 kernel/time/timer.c:1666
>  run_timer_softirq+0x4c/0x70 kernel/time/timer.c:1692
>  __do_softirq+0x2e0/0xaf5 kernel/softirq.c:285
>  invoke_softirq kernel/softirq.c:365 [inline]
>  irq_exit+0x1d1/0x200 kernel/softirq.c:405
>  exiting_irq arch/x86/include/asm/apic.h:525 [inline]
>  smp_apic_timer_interrupt+0x17e/0x710 arch/x86/kernel/apic/apic.c:1052
>  apic_timer_interrupt+0xf/0x20 arch/x86/entry/entry_64.S:863
>  </IRQ>
> RIP: 0010:native_safe_halt+0x6/0x10 arch/x86/include/asm/irqflags.h:54
> RSP: 0018:ffff8801d9ae7c38 EFLAGS: 00000282 ORIG_RAX: ffffffffffffff13
> RAX: dffffc0000000000 RBX: 1ffff1003b35cf8a RCX: 0000000000000000
> RDX: 1ffffffff11a30d0 RSI: 0000000000000001 RDI: ffffffff88d18680
> RBP: ffff8801d9ae7c38 R08: ffffed003b5e46c3 R09: 0000000000000000
> R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000000001
> R13: ffff8801d9ae7cf0 R14: ffffffff897bef20 R15: 0000000000000000
>  arch_safe_halt arch/x86/include/asm/paravirt.h:94 [inline]
>  default_idle+0xc2/0x440 arch/x86/kernel/process.c:354
>  arch_cpu_idle+0x10/0x20 arch/x86/kernel/process.c:345
>  default_idle_call+0x6d/0x90 kernel/sched/idle.c:93
>  cpuidle_idle_call kernel/sched/idle.c:153 [inline]
>  do_idle+0x395/0x560 kernel/sched/idle.c:262
>  cpu_startup_entry+0x104/0x120 kernel/sched/idle.c:368
>  start_secondary+0x426/0x5b0 arch/x86/kernel/smpboot.c:269
>  secondary_startup_64+0xa5/0xb0 arch/x86/kernel/head_64.S:242
>
> Allocated by task 4557:
>  save_stack+0x43/0xd0 mm/kasan/kasan.c:448
>  set_track mm/kasan/kasan.c:460 [inline]
>  kasan_kmalloc+0xc4/0xe0 mm/kasan/kasan.c:553
>  kasan_slab_alloc+0x12/0x20 mm/kasan/kasan.c:490
>  kmem_cache_alloc+0x12e/0x760 mm/slab.c:3554
>  kmem_cache_zalloc include/linux/slab.h:691 [inline]
>  net_alloc net/core/net_namespace.c:383 [inline]
>  copy_net_ns+0x159/0x4c0 net/core/net_namespace.c:423
>  create_new_namespaces+0x69d/0x8f0 kernel/nsproxy.c:107
>  unshare_nsproxy_namespaces+0xc3/0x1f0 kernel/nsproxy.c:206
>  ksys_unshare+0x708/0xf90 kernel/fork.c:2408
>  __do_sys_unshare kernel/fork.c:2476 [inline]
>  __se_sys_unshare kernel/fork.c:2474 [inline]
>  __x64_sys_unshare+0x31/0x40 kernel/fork.c:2474
>  do_syscall_64+0x1b1/0x800 arch/x86/entry/common.c:287
>  entry_SYSCALL_64_after_hwframe+0x49/0xbe
>
> Freed by task 69:
>  save_stack+0x43/0xd0 mm/kasan/kasan.c:448
>  set_track mm/kasan/kasan.c:460 [inline]
>  __kasan_slab_free+0x11a/0x170 mm/kasan/kasan.c:521
>  kasan_slab_free+0xe/0x10 mm/kasan/kasan.c:528
>  __cache_free mm/slab.c:3498 [inline]
>  kmem_cache_free+0x86/0x2d0 mm/slab.c:3756
>  net_free net/core/net_namespace.c:399 [inline]
>  net_drop_ns.part.14+0x11a/0x130 net/core/net_namespace.c:406
>  net_drop_ns net/core/net_namespace.c:405 [inline]
>  cleanup_net+0x6a1/0xb20 net/core/net_namespace.c:541
>  process_one_work+0xc1e/0x1b50 kernel/workqueue.c:2145
>  worker_thread+0x1cc/0x1440 kernel/workqueue.c:2279
>  kthread+0x345/0x410 kernel/kthread.c:240
>  ret_from_fork+0x3a/0x50 arch/x86/entry/entry_64.S:412
>
> The buggy address belongs to the object at ffff88018a02c140
>  which belongs to the cache net_namespace of size 8832
> The buggy address is located 8800 bytes inside of
>  8832-byte region [ffff88018a02c140, ffff88018a02e3c0)
> The buggy address belongs to the page:
> page:ffffea0006280b00 count:1 mapcount:0 mapping:ffff88018a02c140 index:0x0 compound_mapcount: 0
> flags: 0x2fffc0000008100(slab|head)
> raw: 02fffc0000008100 ffff88018a02c140 0000000000000000 0000000100000001
> raw: ffffea00062a1320 ffffea0006268020 ffff8801d9bdde40 0000000000000000
> page dumped because: kasan: bad access detected
>
> Fixes: b922622ec6ef ("sock_diag: don't broadcast kernel sockets")
> Signed-off-by: Eric Dumazet <edumazet@google.com>
> Cc: Craig Gallek <kraig@google.com>
> Reported-by: syzbot <syzkaller@googlegroups.com>

Acked-by: Craig Gallek <kraig@google.com>

Thanks Eric!

^ permalink raw reply

* Re: [PATCH net-next] net:sched: add action inheritdsfield to skbmod
From: Jamal Hadi Salim @ 2018-05-18 13:06 UTC (permalink / raw)
  To: Fu, Qiaobin, davem@davemloft.net; +Cc: netdev@vger.kernel.org, Michel Machado
In-Reply-To: <2F042100-2BAC-48E5-887C-5D426B1D5A5B@bu.edu>

On 17/05/18 03:33 PM, Fu, Qiaobin wrote:
> net/sched: add action inheritdsfield to skbmod
> 
> The new action inheritdsfield copies the field DS of
> IPv4 and IPv6 packets into skb->prioriry. This enables
> later classification of packets based on the DS field.
> 
> Original idea by Jamal Hadi Salim <jhs@mojatatu.com>
> 
> Signed-off-by: Qiaobin Fu <qiaobinf@bu.edu>
> Reviewed-by: Michel Machado <michel@digirati.com.br>

LGTM. Thanks for putting the effort.
As a tradition we also require that you post the iproute2
patch and make sure this works. Can you please do that?
Also: If you can add at least a test on tdc it would help
immensely. Other than that:

Acked-by: Jamal Hadi Salim <jhs@mojatatu.com>

cheers,
jamal

^ permalink raw reply

* Re: [PATCH net] tuntap: raise EPOLLOUT on device up
From: Michael S. Tsirkin @ 2018-05-18 13:13 UTC (permalink / raw)
  To: Jason Wang; +Cc: netdev, linux-kernel, Hannes Frederic Sowa, Eric Dumazet
In-Reply-To: <1526648443-24128-1-git-send-email-jasowang@redhat.com>

On Fri, May 18, 2018 at 09:00:43PM +0800, Jason Wang wrote:
> We return -EIO on device down but can not raise EPOLLOUT after it was
> up. This may confuse user like vhost which expects tuntap to raise
> EPOLLOUT to re-enable its TX routine after tuntap is down. This could
> be easily reproduced by transmitting packets from VM while down and up
> the tap device. Fixing this by set SOCKWQ_ASYNC_NOSPACE on -EIO.
> 
> Cc: Hannes Frederic Sowa <hannes@stressinduktion.org>
> Cc: Eric Dumazet <edumazet@google.com>
> Fixes: 1bd4978a88ac2 ("tun: honor IFF_UP in tun_get_user()")
> Signed-off-by: Jason Wang <jasowang@redhat.com>
> ---
>  drivers/net/tun.c | 4 +++-
>  1 file changed, 3 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/net/tun.c b/drivers/net/tun.c
> index d45ac37..1b29761 100644
> --- a/drivers/net/tun.c
> +++ b/drivers/net/tun.c
> @@ -1734,8 +1734,10 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
>  	int skb_xdp = 1;
>  	bool frags = tun_napi_frags_enabled(tun);
>  
> -	if (!(tun->dev->flags & IFF_UP))
> +	if (!(tun->dev->flags & IFF_UP)) {

Isn't this racy?  What if flag is cleared at this point?

> +		set_bit(SOCKWQ_ASYNC_NOSPACE, &tfile->socket.flags);
>  		return -EIO;
> +	}
>  
>  	if (!(tun->flags & IFF_NO_PI)) {
>  		if (len < sizeof(pi))
> -- 
> 2.7.4

^ permalink raw reply

* Re: [RFC v4 3/5] virtio_ring: add packed ring support
From: Jason Wang @ 2018-05-18 13:17 UTC (permalink / raw)
  To: Tiwei Bie; +Cc: mst, virtualization, linux-kernel, netdev, wexu, jfreimann
In-Reply-To: <20180518112950.GA28224@debian>



On 2018年05月18日 19:29, Tiwei Bie wrote:
> On Thu, May 17, 2018 at 08:01:52PM +0800, Jason Wang wrote:
>> On 2018年05月16日 22:33, Tiwei Bie wrote:
>>> On Wed, May 16, 2018 at 10:05:44PM +0800, Jason Wang wrote:
>>>> On 2018年05月16日 21:45, Tiwei Bie wrote:
>>>>> On Wed, May 16, 2018 at 08:51:43PM +0800, Jason Wang wrote:
>>>>>> On 2018年05月16日 20:39, Tiwei Bie wrote:
>>>>>>> On Wed, May 16, 2018 at 07:50:16PM +0800, Jason Wang wrote:
>>>>>>>> On 2018年05月16日 16:37, Tiwei Bie wrote:
>>> [...]
>>>>>>>>> +static void detach_buf_packed(struct vring_virtqueue *vq, unsigned int head,
>>>>>>>>> +			      unsigned int id, void **ctx)
>>>>>>>>> +{
>>>>>>>>> +	struct vring_packed_desc *desc;
>>>>>>>>> +	unsigned int i, j;
>>>>>>>>> +
>>>>>>>>> +	/* Clear data ptr. */
>>>>>>>>> +	vq->desc_state[id].data = NULL;
>>>>>>>>> +
>>>>>>>>> +	i = head;
>>>>>>>>> +
>>>>>>>>> +	for (j = 0; j < vq->desc_state[id].num; j++) {
>>>>>>>>> +		desc = &vq->vring_packed.desc[i];
>>>>>>>>> +		vring_unmap_one_packed(vq, desc);
>>>>>>>> As mentioned in previous discussion, this probably won't work for the case
>>>>>>>> of out of order completion since it depends on the information in the
>>>>>>>> descriptor ring. We probably need to extend ctx to record such information.
>>>>>>> Above code doesn't depend on the information in the descriptor
>>>>>>> ring. The vq->desc_state[] is the extended ctx.
>>>>>>>
>>>>>>> Best regards,
>>>>>>> Tiwei Bie
>>>>>> Yes, but desc is a pointer to descriptor ring I think so
>>>>>> vring_unmap_one_packed() still depends on the content of descriptor ring?
>>>>>>
>>>>> I got your point now. I think it makes sense to reserve
>>>>> the bits of the addr field. Driver shouldn't try to get
>>>>> addrs from the descriptors when cleanup the descriptors
>>>>> no matter whether we support out-of-order or not.
>>>> Maybe I was wrong, but I remember spec mentioned something like this.
>>> You're right. Spec mentioned this. I was just repeating
>>> the spec to emphasize that it does make sense. :)
>>>
>>>>> But combining it with the out-of-order support, it will
>>>>> mean that the driver still needs to maintain a desc/ctx
>>>>> list that is very similar to the desc ring in the split
>>>>> ring. I'm not quite sure whether it's something we want.
>>>>> If it is true, I'll do it. So do you think we also want
>>>>> to maintain such a desc/ctx list for packed ring?
>>>> To make it work for OOO backends I think we need something like this
>>>> (hardware NIC drivers are usually have something like this).
>>> Which hardware NIC drivers have this?
>> It's quite common I think, e.g driver track e.g dma addr and page frag
>> somewhere. e.g the ring->rx_info in mlx4 driver.
> It seems that I had a misunderstanding on your
> previous comments. I know it's quite common for
> drivers to track e.g. DMA addrs somewhere (and
> I think one reason behind this is that they want
> to reuse the bits of addr field).

Yes, we may want this for virtio-net as well in the future.

>   But tracking
> addrs somewhere doesn't means supporting OOO.
> I thought you were saying it's quite common for
> hardware NIC drivers to support OOO (i.e. NICs
> will return the descriptors OOO):
>
> I'm not familiar with mlx4, maybe I'm wrong.
> I just had a quick glance. And I found below
> comments in mlx4_en_process_rx_cq():
>
> ```
> /* We assume a 1:1 mapping between CQEs and Rx descriptors, so Rx
>   * descriptor offset can be deduced from the CQE index instead of
>   * reading 'cqe->index' */
> index = cq->mcq.cons_index & ring->size_mask;
> cqe = mlx4_en_get_cqe(cq->buf, index, priv->cqe_size) + factor;
> ```
>
> It seems that although they have a completion
> queue, they are still using the ring in order.

I guess so (at least from the above bits). Git grep -i "out of order" in 
drivers/net gives some hints. Looks like there're few deivces do this.

> I guess maybe storage device may want OOO.

Right, some iSCSI did.

But tracking them elsewhere is not only for OOO.

Spec said:

for element address

"
In a used descriptor, Element Address is unused.
"

for Next flag:

"
For example, if descriptors are used in the same order in which they are 
made available, this will result in
the used descriptor overwriting the first available descriptor in the 
list, the used descriptor for the next list
overwriting the first available descriptor in the next list, etc.
"

for in order completion:

"
This will result in the used descriptor overwriting the first available 
descriptor in the batch, the used descriptor
for the next batch overwriting the first available descriptor in the 
next batch, etc.
"

So:

- It's an alignment to the spec
- device may (or should) overwrite the descriptor make also make address 
field useless.

Thanks

>
> Best regards,
> Tiwei Bie
>
>> Thanks
>>
>>>> Not for the patch, but it looks like having a OUT_OF_ORDER feature bit is
>>>> much more simpler to be started with.
>>> +1
>>>
>>> Best regards,
>>> Tiwei Bie

^ permalink raw reply

* VLAN performance issues with Open vSwitch when 8021q not loaded
From: Mikhail Sennikovsky @ 2018-05-18 13:24 UTC (permalink / raw)
  To: netdev; +Cc: Mikhail Sennikovskii

Hi all,

We were recently experiencing network performance issues with VLAN
networking setup with Open vSwitch, for the ingress traffic coming to
VLAN trunk port of the ovs.
As we discovered, the issue was caused by gro not working for it,
which in turn was because the gro receive callbacks for 802.1Q payload
type are defined in the 8021q module (see
https://elixir.bootlin.com/linux/v4.16.9/source/net/8021q/vlan.c#L762
), which was not loaded.
This resulted in a significant bandwidth performance drop, having
~3Gbps instead of the expected ~7Gbps for a simple iperf3 test in our
case.

The obvious work-around would be to load the 8021q module, which
indeed makes bandwidth performance back to the expected numbers.
This seems like a hidden and not obvious magic however.

So I'm questioning, whether it makes sense to have the gro receive
callbacks for 802.1Q and 802.1ad moved to some common place, that
would be used/enabled by both 8021q and openvswitch modules.

Regards,
Mikhail

^ permalink raw reply

* Re: [PATCH net] tuntap: raise EPOLLOUT on device up
From: Jason Wang @ 2018-05-18 13:26 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: netdev, linux-kernel, Hannes Frederic Sowa, Eric Dumazet
In-Reply-To: <20180518161253-mutt-send-email-mst@kernel.org>



On 2018年05月18日 21:13, Michael S. Tsirkin wrote:
> On Fri, May 18, 2018 at 09:00:43PM +0800, Jason Wang wrote:
>> We return -EIO on device down but can not raise EPOLLOUT after it was
>> up. This may confuse user like vhost which expects tuntap to raise
>> EPOLLOUT to re-enable its TX routine after tuntap is down. This could
>> be easily reproduced by transmitting packets from VM while down and up
>> the tap device. Fixing this by set SOCKWQ_ASYNC_NOSPACE on -EIO.
>>
>> Cc: Hannes Frederic Sowa <hannes@stressinduktion.org>
>> Cc: Eric Dumazet <edumazet@google.com>
>> Fixes: 1bd4978a88ac2 ("tun: honor IFF_UP in tun_get_user()")
>> Signed-off-by: Jason Wang <jasowang@redhat.com>
>> ---
>>   drivers/net/tun.c | 4 +++-
>>   1 file changed, 3 insertions(+), 1 deletion(-)
>>
>> diff --git a/drivers/net/tun.c b/drivers/net/tun.c
>> index d45ac37..1b29761 100644
>> --- a/drivers/net/tun.c
>> +++ b/drivers/net/tun.c
>> @@ -1734,8 +1734,10 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
>>   	int skb_xdp = 1;
>>   	bool frags = tun_napi_frags_enabled(tun);
>>   
>> -	if (!(tun->dev->flags & IFF_UP))
>> +	if (!(tun->dev->flags & IFF_UP)) {
> Isn't this racy?  What if flag is cleared at this point?

I think you mean "set at this point"? Then yes, so we probably need to 
set the bit during tun_net_close().

Thanks

>> +		set_bit(SOCKWQ_ASYNC_NOSPACE, &tfile->socket.flags);
>>   		return -EIO;
>> +	}
>>   
>>   	if (!(tun->flags & IFF_NO_PI)) {
>>   		if (len < sizeof(pi))
>> -- 
>> 2.7.4

^ permalink raw reply

* Re: [PATCH v3 2/2] bpf: add selftest for rawir_event type program
From: Sean Young @ 2018-05-18 13:33 UTC (permalink / raw)
  To: Quentin Monnet
  Cc: Y Song, linux-media, linux-kernel, Alexei Starovoitov,
	Mauro Carvalho Chehab, Daniel Borkmann, netdev, Matthias Reichl,
	Devin Heitmueller
In-Reply-To: <86ffb16c-9b4e-c826-ecd2-82266e7b8c2e@netronome.com>

On Fri, May 18, 2018 at 11:13:07AM +0100, Quentin Monnet wrote:
> 2018-05-17 22:01 UTC+0100 ~ Sean Young <sean@mess.org>
> > On Thu, May 17, 2018 at 10:17:59AM -0700, Y Song wrote:
> >> On Wed, May 16, 2018 at 2:04 PM, Sean Young <sean@mess.org> wrote:
> >>> This is simple test over rc-loopback.
> >>>
> >>> Signed-off-by: Sean Young <sean@mess.org>
> >>> ---
> >>>  tools/bpf/bpftool/prog.c                      |   1 +
> >>>  tools/include/uapi/linux/bpf.h                |  57 +++++++-
> >>>  tools/lib/bpf/libbpf.c                        |   1 +
> >>>  tools/testing/selftests/bpf/Makefile          |   8 +-
> >>>  tools/testing/selftests/bpf/bpf_helpers.h     |   6 +
> >>>  tools/testing/selftests/bpf/test_rawir.sh     |  37 +++++
> >>>  .../selftests/bpf/test_rawir_event_kern.c     |  26 ++++
> >>>  .../selftests/bpf/test_rawir_event_user.c     | 130 ++++++++++++++++++
> >>>  8 files changed, 261 insertions(+), 5 deletions(-)
> >>>  create mode 100755 tools/testing/selftests/bpf/test_rawir.sh
> >>>  create mode 100644 tools/testing/selftests/bpf/test_rawir_event_kern.c
> >>>  create mode 100644 tools/testing/selftests/bpf/test_rawir_event_user.c
> 
> [...]
> 
> >> Most people probably not really familiar with lircN device. It would be
> >> good to provide more information about how to enable this, e.g.,
> >>   CONFIG_RC_CORE=y
> >>   CONFIG_BPF_RAWIR_EVENT=y
> >>   CONFIG_RC_LOOPBACK=y
> >>   ......
> > 
> > Good point. I'll add some words explaining what is and how to make it work.
> > 
> > Thanks
> > Sean
> 
> 
> By the way, shouldn't the two eBPF helpers bpf_rc_keydown() and
> bpf_rc_repeat() be compiled out in patch 1 if e.g.
> CONFIG_BPF_RAWIR_EVENT is not set? There are some other helpers that are
> compiled only if relevant config options are set (bpf_get_xfrm_state()
> for example).

So if CONFIG_BPF_RAWIR_EVENT is not set, then bpf-rawir-event.c is not
compiled. Stubs are created in include/linux/bpf_rcdev.h, so this is
already the case if I understand your correctly.
 
> (If you were to change that, please also update helper documentations to
> indicate what configuration options are required to be able to use the
> helpers.)

Ok, I'll add that.

Thanks again!

Sean

^ permalink raw reply

* [bpf-next V4 PATCH 0/8] xdp: introduce bulking for ndo_xdp_xmit API
From: Jesper Dangaard Brouer @ 2018-05-18 13:34 UTC (permalink / raw)
  To: netdev, Daniel Borkmann, Alexei Starovoitov,
	Jesper Dangaard Brouer
  Cc: Christoph Hellwig, BjörnTöpel, Magnus Karlsson,
	makita.toshiaki

This patchset change ndo_xdp_xmit API to take a bulk of xdp frames.

In this V4 patchset, I've split-out the patches from 4 to 8 patches.
I cannot split the driver changes from the NDO change, but I've tried
to isolated the NDO change together with the driver change as much as
possible.

When kernel is compiled with CONFIG_RETPOLINE, every indirect function
pointer (branch) call hurts performance. For XDP this have a huge
negative performance impact.

This patchset reduce the needed (indirect) calls to ndo_xdp_xmit, but
also prepares for further optimizations.  The DMA APIs use of indirect
function pointer calls is the primary source the regression.  It is
left for a followup patchset, to use bulking calls towards the DMA API
(via the scatter-gatter calls).

The other advantage of this API change is that drivers can easier
amortize the cost of any sync/locking scheme, over the bulk of
packets.  The assumption of the current API is that the driver
implemementing the NDO will also allocate a dedicated XDP TX queue for
every CPU in the system.  Which is not always possible or practical to
configure. E.g. ixgbe cannot load an XDP program on a machine with
more than 96 CPUs, due to limited hardware TX queues.  E.g. virtio_net
is hard to configure as it requires manually increasing the
queues. E.g. tun driver chooses to use a per XDP frame producer lock
modulo smp_processor_id over avail queues.

I'm considered adding 'flags' to ndo_xdp_xmit, but it's not part of
this patchset.  This will be a followup patchset, once we know if this
will be needed (e.g. for non-map xdp_redirect flush-flag, and if
AF_XDP chooses to use ndo_xdp_xmit for TX).

---

Jesper Dangaard Brouer (8):
      bpf: devmap introduce dev_map_enqueue
      bpf: devmap prepare xdp frames for bulking
      xdp: add tracepoint for devmap like cpumap have
      samples/bpf: xdp_monitor use tracepoint xdp:xdp_devmap_xmit
      xdp: introduce xdp_return_frame_rx_napi
      xdp: change ndo_xdp_xmit API to support bulking
      xdp/trace: extend tracepoint in devmap with an err
      samples/bpf: xdp_monitor use err code from tracepoint xdp:xdp_devmap_xmit

 drivers/net/ethernet/intel/i40e/i40e_txrx.c   |   26 ++++-
 drivers/net/ethernet/intel/i40e/i40e_txrx.h   |    2 
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c |   21 +++-
 drivers/net/tun.c                             |   37 ++++---
 drivers/net/virtio_net.c                      |   66 +++++++++---
 include/linux/bpf.h                           |   16 ++-
 include/linux/netdevice.h                     |   14 ++-
 include/net/page_pool.h                       |    5 +
 include/net/xdp.h                             |    1 
 include/trace/events/xdp.h                    |   50 +++++++++
 kernel/bpf/cpumap.c                           |    2 
 kernel/bpf/devmap.c                           |  134 ++++++++++++++++++++++++-
 net/core/filter.c                             |   23 +---
 net/core/xdp.c                                |   20 +++-
 samples/bpf/xdp_monitor_kern.c                |   49 +++++++++
 samples/bpf/xdp_monitor_user.c                |   69 +++++++++++++
 16 files changed, 449 insertions(+), 86 deletions(-)

^ permalink raw reply

* [bpf-next V4 PATCH 1/8] bpf: devmap introduce dev_map_enqueue
From: Jesper Dangaard Brouer @ 2018-05-18 13:34 UTC (permalink / raw)
  To: netdev, Daniel Borkmann, Alexei Starovoitov,
	Jesper Dangaard Brouer
  Cc: Christoph Hellwig, BjörnTöpel, Magnus Karlsson,
	makita.toshiaki
In-Reply-To: <152665044141.21055.1276346542020340263.stgit@firesoul>

Functionality is the same, but the ndo_xdp_xmit call is now
simply invoked from inside the devmap.c code.

V2: Fix compile issue reported by kbuild test robot <lkp@intel.com>

Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
---
 include/linux/bpf.h        |   14 +++++++++++---
 include/trace/events/xdp.h |    9 ++++++++-
 kernel/bpf/devmap.c        |   37 +++++++++++++++++++++++++++++++------
 net/core/filter.c          |   15 ++-------------
 4 files changed, 52 insertions(+), 23 deletions(-)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index ed0122b45b63..fc1459bdcafc 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -485,14 +485,15 @@ int bpf_check(struct bpf_prog **fp, union bpf_attr *attr);
 void bpf_patch_call_args(struct bpf_insn *insn, u32 stack_depth);
 
 /* Map specifics */
-struct net_device  *__dev_map_lookup_elem(struct bpf_map *map, u32 key);
+struct xdp_buff;
+struct bpf_dtab_netdev *__dev_map_lookup_elem(struct bpf_map *map, u32 key);
 void __dev_map_insert_ctx(struct bpf_map *map, u32 index);
 void __dev_map_flush(struct bpf_map *map);
+int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp);
 
 struct bpf_cpu_map_entry *__cpu_map_lookup_elem(struct bpf_map *map, u32 key);
 void __cpu_map_insert_ctx(struct bpf_map *map, u32 index);
 void __cpu_map_flush(struct bpf_map *map);
-struct xdp_buff;
 int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_buff *xdp,
 		    struct net_device *dev_rx);
 
@@ -571,6 +572,14 @@ static inline void __dev_map_flush(struct bpf_map *map)
 {
 }
 
+struct xdp_buff;
+struct bpf_dtab_netdev;
+static inline
+int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp)
+{
+	return 0;
+}
+
 static inline
 struct bpf_cpu_map_entry *__cpu_map_lookup_elem(struct bpf_map *map, u32 key)
 {
@@ -585,7 +594,6 @@ static inline void __cpu_map_flush(struct bpf_map *map)
 {
 }
 
-struct xdp_buff;
 static inline int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu,
 				  struct xdp_buff *xdp,
 				  struct net_device *dev_rx)
diff --git a/include/trace/events/xdp.h b/include/trace/events/xdp.h
index 8989a92c571a..96104610d40e 100644
--- a/include/trace/events/xdp.h
+++ b/include/trace/events/xdp.h
@@ -138,11 +138,18 @@ DEFINE_EVENT_PRINT(xdp_redirect_template, xdp_redirect_map_err,
 		  __entry->map_id, __entry->map_index)
 );
 
+#ifndef __DEVMAP_OBJ_TYPE
+#define __DEVMAP_OBJ_TYPE
+struct _bpf_dtab_netdev {
+	struct net_device *dev;
+};
+#endif /* __DEVMAP_OBJ_TYPE */
+
 #define devmap_ifindex(fwd, map)				\
 	(!fwd ? 0 :						\
 	 (!map ? 0 :						\
 	  ((map->map_type == BPF_MAP_TYPE_DEVMAP) ?		\
-	   ((struct net_device *)fwd)->ifindex : 0)))
+	   ((struct _bpf_dtab_netdev *)fwd)->dev->ifindex : 0)))
 
 #define _trace_xdp_redirect_map(dev, xdp, fwd, map, idx)		\
 	 trace_xdp_redirect_map(dev, xdp, devmap_ifindex(fwd, map),	\
diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
index 565f9ece9115..808808bf2bf2 100644
--- a/kernel/bpf/devmap.c
+++ b/kernel/bpf/devmap.c
@@ -48,18 +48,21 @@
  * calls will fail at this point.
  */
 #include <linux/bpf.h>
+#include <net/xdp.h>
 #include <linux/filter.h>
 
 #define DEV_CREATE_FLAG_MASK \
 	(BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY)
 
+/* objects in the map */
 struct bpf_dtab_netdev {
-	struct net_device *dev;
+	struct net_device *dev; /* must be first member, due to tracepoint */
 	struct bpf_dtab *dtab;
 	unsigned int bit;
 	struct rcu_head rcu;
 };
 
+/* bpf map container */
 struct bpf_dtab {
 	struct bpf_map map;
 	struct bpf_dtab_netdev **netdev_map;
@@ -240,21 +243,43 @@ void __dev_map_flush(struct bpf_map *map)
  * update happens in parallel here a dev_put wont happen until after reading the
  * ifindex.
  */
-struct net_device  *__dev_map_lookup_elem(struct bpf_map *map, u32 key)
+struct bpf_dtab_netdev *__dev_map_lookup_elem(struct bpf_map *map, u32 key)
 {
 	struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
-	struct bpf_dtab_netdev *dev;
+	struct bpf_dtab_netdev *obj;
 
 	if (key >= map->max_entries)
 		return NULL;
 
-	dev = READ_ONCE(dtab->netdev_map[key]);
-	return dev ? dev->dev : NULL;
+	obj = READ_ONCE(dtab->netdev_map[key]);
+	return obj;
+}
+
+int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp)
+{
+	struct net_device *dev = dst->dev;
+	struct xdp_frame *xdpf;
+	int err;
+
+	if (!dev->netdev_ops->ndo_xdp_xmit)
+		return -EOPNOTSUPP;
+
+	xdpf = convert_to_xdp_frame(xdp);
+	if (unlikely(!xdpf))
+		return -EOVERFLOW;
+
+	/* TODO: implement a bulking/enqueue step later */
+	err = dev->netdev_ops->ndo_xdp_xmit(dev, xdpf);
+	if (err)
+		return err;
+
+	return 0;
 }
 
 static void *dev_map_lookup_elem(struct bpf_map *map, void *key)
 {
-	struct net_device *dev = __dev_map_lookup_elem(map, *(u32 *)key);
+	struct bpf_dtab_netdev *obj = __dev_map_lookup_elem(map, *(u32 *)key);
+	struct net_device *dev = dev = obj ? obj->dev : NULL;
 
 	return dev ? &dev->ifindex : NULL;
 }
diff --git a/net/core/filter.c b/net/core/filter.c
index 6d0d1560bd70..1447ec94ef74 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -3061,20 +3061,9 @@ static int __bpf_tx_xdp_map(struct net_device *dev_rx, void *fwd,
 
 	switch (map->map_type) {
 	case BPF_MAP_TYPE_DEVMAP: {
-		struct net_device *dev = fwd;
-		struct xdp_frame *xdpf;
+		struct bpf_dtab_netdev *dst = fwd;
 
-		if (!dev->netdev_ops->ndo_xdp_xmit)
-			return -EOPNOTSUPP;
-
-		xdpf = convert_to_xdp_frame(xdp);
-		if (unlikely(!xdpf))
-			return -EOVERFLOW;
-
-		/* TODO: move to inside map code instead, for bulk support
-		 * err = dev_map_enqueue(dev, xdp);
-		 */
-		err = dev->netdev_ops->ndo_xdp_xmit(dev, xdpf);
+		err = dev_map_enqueue(dst, xdp);
 		if (err)
 			return err;
 		__dev_map_insert_ctx(map, index);

^ permalink raw reply related

* [bpf-next V4 PATCH 2/8] bpf: devmap prepare xdp frames for bulking
From: Jesper Dangaard Brouer @ 2018-05-18 13:34 UTC (permalink / raw)
  To: netdev, Daniel Borkmann, Alexei Starovoitov,
	Jesper Dangaard Brouer
  Cc: Christoph Hellwig, BjörnTöpel, Magnus Karlsson,
	makita.toshiaki
In-Reply-To: <152665044141.21055.1276346542020340263.stgit@firesoul>

Like cpumap create queue for xdp frames that will be bulked.  For now,
this patch simply invoke ndo_xdp_xmit foreach frame.  This happens,
either when the map flush operation is envoked, or when the limit
DEV_MAP_BULK_SIZE is reached.

Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
---
 kernel/bpf/devmap.c |   77 ++++++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 73 insertions(+), 4 deletions(-)

diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
index 808808bf2bf2..cab72c100bb5 100644
--- a/kernel/bpf/devmap.c
+++ b/kernel/bpf/devmap.c
@@ -54,11 +54,18 @@
 #define DEV_CREATE_FLAG_MASK \
 	(BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY)
 
+#define DEV_MAP_BULK_SIZE 16
+struct xdp_bulk_queue {
+	struct xdp_frame *q[DEV_MAP_BULK_SIZE];
+	unsigned int count;
+};
+
 /* objects in the map */
 struct bpf_dtab_netdev {
 	struct net_device *dev; /* must be first member, due to tracepoint */
 	struct bpf_dtab *dtab;
 	unsigned int bit;
+	struct xdp_bulk_queue __percpu *bulkq;
 	struct rcu_head rcu;
 };
 
@@ -209,6 +216,38 @@ void __dev_map_insert_ctx(struct bpf_map *map, u32 bit)
 	__set_bit(bit, bitmap);
 }
 
+static int bq_xmit_all(struct bpf_dtab_netdev *obj,
+			 struct xdp_bulk_queue *bq)
+{
+	unsigned int processed = 0, drops = 0;
+	struct net_device *dev = obj->dev;
+	int i;
+
+	if (unlikely(!bq->count))
+		return 0;
+
+	for (i = 0; i < bq->count; i++) {
+		struct xdp_frame *xdpf = bq->q[i];
+
+		prefetch(xdpf);
+	}
+
+	for (i = 0; i < bq->count; i++) {
+		struct xdp_frame *xdpf = bq->q[i];
+		int err;
+
+		err = dev->netdev_ops->ndo_xdp_xmit(dev, xdpf);
+		if (err) {
+			drops++;
+			xdp_return_frame(xdpf);
+		}
+		processed++;
+	}
+	bq->count = 0;
+
+	return 0;
+}
+
 /* __dev_map_flush is called from xdp_do_flush_map() which _must_ be signaled
  * from the driver before returning from its napi->poll() routine. The poll()
  * routine is called either from busy_poll context or net_rx_action signaled
@@ -224,6 +263,7 @@ void __dev_map_flush(struct bpf_map *map)
 
 	for_each_set_bit(bit, bitmap, map->max_entries) {
 		struct bpf_dtab_netdev *dev = READ_ONCE(dtab->netdev_map[bit]);
+		struct xdp_bulk_queue *bq;
 		struct net_device *netdev;
 
 		/* This is possible if the dev entry is removed by user space
@@ -233,6 +273,9 @@ void __dev_map_flush(struct bpf_map *map)
 			continue;
 
 		__clear_bit(bit, bitmap);
+
+		bq = this_cpu_ptr(dev->bulkq);
+		bq_xmit_all(dev, bq);
 		netdev = dev->dev;
 		if (likely(netdev->netdev_ops->ndo_xdp_flush))
 			netdev->netdev_ops->ndo_xdp_flush(netdev);
@@ -255,6 +298,20 @@ struct bpf_dtab_netdev *__dev_map_lookup_elem(struct bpf_map *map, u32 key)
 	return obj;
 }
 
+/* Runs under RCU-read-side, plus in softirq under NAPI protection.
+ * Thus, safe percpu variable access.
+ */
+static int bq_enqueue(struct bpf_dtab_netdev *obj, struct xdp_frame *xdpf)
+{
+	struct xdp_bulk_queue *bq = this_cpu_ptr(obj->bulkq);
+
+	if (unlikely(bq->count == DEV_MAP_BULK_SIZE))
+		bq_xmit_all(obj, bq);
+
+	bq->q[bq->count++] = xdpf;
+	return 0;
+}
+
 int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp)
 {
 	struct net_device *dev = dst->dev;
@@ -268,8 +325,7 @@ int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp)
 	if (unlikely(!xdpf))
 		return -EOVERFLOW;
 
-	/* TODO: implement a bulking/enqueue step later */
-	err = dev->netdev_ops->ndo_xdp_xmit(dev, xdpf);
+	err = bq_enqueue(dst, xdpf);
 	if (err)
 		return err;
 
@@ -288,13 +344,18 @@ static void dev_map_flush_old(struct bpf_dtab_netdev *dev)
 {
 	if (dev->dev->netdev_ops->ndo_xdp_flush) {
 		struct net_device *fl = dev->dev;
+		struct xdp_bulk_queue *bq;
 		unsigned long *bitmap;
+
 		int cpu;
 
 		for_each_online_cpu(cpu) {
 			bitmap = per_cpu_ptr(dev->dtab->flush_needed, cpu);
 			__clear_bit(dev->bit, bitmap);
 
+			bq = per_cpu_ptr(dev->bulkq, cpu);
+			bq_xmit_all(dev, bq);
+
 			fl->netdev_ops->ndo_xdp_flush(dev->dev);
 		}
 	}
@@ -306,6 +367,7 @@ static void __dev_map_entry_free(struct rcu_head *rcu)
 
 	dev = container_of(rcu, struct bpf_dtab_netdev, rcu);
 	dev_map_flush_old(dev);
+	free_percpu(dev->bulkq);
 	dev_put(dev->dev);
 	kfree(dev);
 }
@@ -338,6 +400,7 @@ static int dev_map_update_elem(struct bpf_map *map, void *key, void *value,
 {
 	struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
 	struct net *net = current->nsproxy->net_ns;
+	gfp_t gfp = GFP_ATOMIC | __GFP_NOWARN;
 	struct bpf_dtab_netdev *dev, *old_dev;
 	u32 i = *(u32 *)key;
 	u32 ifindex = *(u32 *)value;
@@ -352,11 +415,17 @@ static int dev_map_update_elem(struct bpf_map *map, void *key, void *value,
 	if (!ifindex) {
 		dev = NULL;
 	} else {
-		dev = kmalloc_node(sizeof(*dev), GFP_ATOMIC | __GFP_NOWARN,
-				   map->numa_node);
+		dev = kmalloc_node(sizeof(*dev), gfp, map->numa_node);
 		if (!dev)
 			return -ENOMEM;
 
+		dev->bulkq = __alloc_percpu_gfp(sizeof(*dev->bulkq),
+						sizeof(void *), gfp);
+		if (!dev->bulkq) {
+			kfree(dev);
+			return -ENOMEM;
+		}
+
 		dev->dev = dev_get_by_index(net, ifindex);
 		if (!dev->dev) {
 			kfree(dev);

^ permalink raw reply related

* [bpf-next V4 PATCH 3/8] xdp: add tracepoint for devmap like cpumap have
From: Jesper Dangaard Brouer @ 2018-05-18 13:34 UTC (permalink / raw)
  To: netdev, Daniel Borkmann, Alexei Starovoitov,
	Jesper Dangaard Brouer
  Cc: Christoph Hellwig, BjörnTöpel, Magnus Karlsson,
	makita.toshiaki
In-Reply-To: <152665044141.21055.1276346542020340263.stgit@firesoul>

Notice how this allow us get XDP statistic without affecting the XDP
performance, as tracepoint is no-longer activated on a per packet basis.

Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
---
 include/linux/bpf.h        |    6 ++++--
 include/trace/events/xdp.h |   39 +++++++++++++++++++++++++++++++++++++++
 kernel/bpf/devmap.c        |   25 ++++++++++++++++++++-----
 net/core/filter.c          |    2 +-
 4 files changed, 64 insertions(+), 8 deletions(-)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index fc1459bdcafc..ca7110b81793 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -489,7 +489,8 @@ struct xdp_buff;
 struct bpf_dtab_netdev *__dev_map_lookup_elem(struct bpf_map *map, u32 key);
 void __dev_map_insert_ctx(struct bpf_map *map, u32 index);
 void __dev_map_flush(struct bpf_map *map);
-int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp);
+int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp,
+		    struct net_device *dev_rx);
 
 struct bpf_cpu_map_entry *__cpu_map_lookup_elem(struct bpf_map *map, u32 key);
 void __cpu_map_insert_ctx(struct bpf_map *map, u32 index);
@@ -575,7 +576,8 @@ static inline void __dev_map_flush(struct bpf_map *map)
 struct xdp_buff;
 struct bpf_dtab_netdev;
 static inline
-int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp)
+int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp,
+		    struct net_device *dev_rx)
 {
 	return 0;
 }
diff --git a/include/trace/events/xdp.h b/include/trace/events/xdp.h
index 96104610d40e..2e9ef0650144 100644
--- a/include/trace/events/xdp.h
+++ b/include/trace/events/xdp.h
@@ -229,6 +229,45 @@ TRACE_EVENT(xdp_cpumap_enqueue,
 		  __entry->to_cpu)
 );
 
+TRACE_EVENT(xdp_devmap_xmit,
+
+	TP_PROTO(const struct bpf_map *map, u32 map_index,
+		 int sent, int drops,
+		 const struct net_device *from_dev,
+		 const struct net_device *to_dev),
+
+	TP_ARGS(map, map_index, sent, drops, from_dev, to_dev),
+
+	TP_STRUCT__entry(
+		__field(int, map_id)
+		__field(u32, act)
+		__field(u32, map_index)
+		__field(int, drops)
+		__field(int, sent)
+		__field(int, from_ifindex)
+		__field(int, to_ifindex)
+	),
+
+	TP_fast_assign(
+		__entry->map_id		= map->id;
+		__entry->act		= XDP_REDIRECT;
+		__entry->map_index	= map_index;
+		__entry->drops		= drops;
+		__entry->sent		= sent;
+		__entry->from_ifindex	= from_dev->ifindex;
+		__entry->to_ifindex	= to_dev->ifindex;
+	),
+
+	TP_printk("ndo_xdp_xmit"
+		  " map_id=%d map_index=%d action=%s"
+		  " sent=%d drops=%d"
+		  " from_ifindex=%d to_ifindex=%d",
+		  __entry->map_id, __entry->map_index,
+		  __print_symbolic(__entry->act, __XDP_ACT_SYM_TAB),
+		  __entry->sent, __entry->drops,
+		  __entry->from_ifindex, __entry->to_ifindex)
+);
+
 #endif /* _TRACE_XDP_H */
 
 #include <trace/define_trace.h>
diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
index cab72c100bb5..6f84100723b0 100644
--- a/kernel/bpf/devmap.c
+++ b/kernel/bpf/devmap.c
@@ -50,6 +50,7 @@
 #include <linux/bpf.h>
 #include <net/xdp.h>
 #include <linux/filter.h>
+#include <trace/events/xdp.h>
 
 #define DEV_CREATE_FLAG_MASK \
 	(BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY)
@@ -57,6 +58,7 @@
 #define DEV_MAP_BULK_SIZE 16
 struct xdp_bulk_queue {
 	struct xdp_frame *q[DEV_MAP_BULK_SIZE];
+	struct net_device *dev_rx;
 	unsigned int count;
 };
 
@@ -219,8 +221,8 @@ void __dev_map_insert_ctx(struct bpf_map *map, u32 bit)
 static int bq_xmit_all(struct bpf_dtab_netdev *obj,
 			 struct xdp_bulk_queue *bq)
 {
-	unsigned int processed = 0, drops = 0;
 	struct net_device *dev = obj->dev;
+	int sent = 0, drops = 0;
 	int i;
 
 	if (unlikely(!bq->count))
@@ -241,10 +243,13 @@ static int bq_xmit_all(struct bpf_dtab_netdev *obj,
 			drops++;
 			xdp_return_frame(xdpf);
 		}
-		processed++;
+		sent++;
 	}
 	bq->count = 0;
 
+	trace_xdp_devmap_xmit(&obj->dtab->map, obj->bit,
+			      sent, drops, bq->dev_rx, dev);
+	bq->dev_rx = NULL;
 	return 0;
 }
 
@@ -301,18 +306,28 @@ struct bpf_dtab_netdev *__dev_map_lookup_elem(struct bpf_map *map, u32 key)
 /* Runs under RCU-read-side, plus in softirq under NAPI protection.
  * Thus, safe percpu variable access.
  */
-static int bq_enqueue(struct bpf_dtab_netdev *obj, struct xdp_frame *xdpf)
+static int bq_enqueue(struct bpf_dtab_netdev *obj, struct xdp_frame *xdpf,
+		      struct net_device *dev_rx)
+
 {
 	struct xdp_bulk_queue *bq = this_cpu_ptr(obj->bulkq);
 
 	if (unlikely(bq->count == DEV_MAP_BULK_SIZE))
 		bq_xmit_all(obj, bq);
 
+	/* Ingress dev_rx will be the same for all xdp_frame's in
+	 * bulk_queue, because bq stored per-CPU and must be flushed
+	 * from net_device drivers NAPI func end.
+	 */
+	if (!bq->dev_rx)
+		bq->dev_rx = dev_rx;
+
 	bq->q[bq->count++] = xdpf;
 	return 0;
 }
 
-int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp)
+int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp,
+		    struct net_device *dev_rx)
 {
 	struct net_device *dev = dst->dev;
 	struct xdp_frame *xdpf;
@@ -325,7 +340,7 @@ int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp)
 	if (unlikely(!xdpf))
 		return -EOVERFLOW;
 
-	err = bq_enqueue(dst, xdpf);
+	err = bq_enqueue(dst, xdpf, dev_rx);
 	if (err)
 		return err;
 
diff --git a/net/core/filter.c b/net/core/filter.c
index 1447ec94ef74..4a93423cc5ea 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -3063,7 +3063,7 @@ static int __bpf_tx_xdp_map(struct net_device *dev_rx, void *fwd,
 	case BPF_MAP_TYPE_DEVMAP: {
 		struct bpf_dtab_netdev *dst = fwd;
 
-		err = dev_map_enqueue(dst, xdp);
+		err = dev_map_enqueue(dst, xdp, dev_rx);
 		if (err)
 			return err;
 		__dev_map_insert_ctx(map, index);

^ permalink raw reply related

* [bpf-next V4 PATCH 4/8] samples/bpf: xdp_monitor use tracepoint xdp:xdp_devmap_xmit
From: Jesper Dangaard Brouer @ 2018-05-18 13:34 UTC (permalink / raw)
  To: netdev, Daniel Borkmann, Alexei Starovoitov,
	Jesper Dangaard Brouer
  Cc: Christoph Hellwig, BjörnTöpel, Magnus Karlsson,
	makita.toshiaki
In-Reply-To: <152665044141.21055.1276346542020340263.stgit@firesoul>

The xdp_monitor sample/tool is updated to use the new tracepoint
xdp:xdp_devmap_xmit the previous patch just introduced.

Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
---
 samples/bpf/xdp_monitor_kern.c |   39 +++++++++++++++++++++++++++++++++++
 samples/bpf/xdp_monitor_user.c |   44 +++++++++++++++++++++++++++++++++++++++-
 2 files changed, 82 insertions(+), 1 deletion(-)

diff --git a/samples/bpf/xdp_monitor_kern.c b/samples/bpf/xdp_monitor_kern.c
index 211db8ded0de..2854aa0665ea 100644
--- a/samples/bpf/xdp_monitor_kern.c
+++ b/samples/bpf/xdp_monitor_kern.c
@@ -208,3 +208,42 @@ int trace_xdp_cpumap_kthread(struct cpumap_kthread_ctx *ctx)
 
 	return 0;
 }
+
+struct bpf_map_def SEC("maps") devmap_xmit_cnt = {
+	.type		= BPF_MAP_TYPE_PERCPU_ARRAY,
+	.key_size	= sizeof(u32),
+	.value_size	= sizeof(struct datarec),
+	.max_entries	= 1,
+};
+
+/* Tracepoint: /sys/kernel/debug/tracing/events/xdp/xdp_devmap_xmit/format
+ * Code in:         kernel/include/trace/events/xdp.h
+ */
+struct devmap_xmit_ctx {
+	u64 __pad;		// First 8 bytes are not accessible by bpf code
+	int map_id;		//	offset:8;  size:4; signed:1;
+	u32 act;		//	offset:12; size:4; signed:0;
+	u32 map_index;		//	offset:16; size:4; signed:0;
+	int drops;		//	offset:20; size:4; signed:1;
+	int sent;		//	offset:24; size:4; signed:1;
+	int from_ifindex;	//	offset:28; size:4; signed:1;
+	int to_ifindex;		//	offset:32; size:4; signed:1;
+};
+
+SEC("tracepoint/xdp/xdp_devmap_xmit")
+int trace_xdp_devmap_xmit(struct devmap_xmit_ctx *ctx)
+{
+	struct datarec *rec;
+	u32 key = 0;
+
+	rec = bpf_map_lookup_elem(&devmap_xmit_cnt, &key);
+	if (!rec)
+		return 0;
+	rec->processed += ctx->sent;
+	rec->dropped   += ctx->drops;
+
+	/* Record bulk events, then userspace can calc average bulk size */
+	rec->info += 1;
+
+	return 1;
+}
diff --git a/samples/bpf/xdp_monitor_user.c b/samples/bpf/xdp_monitor_user.c
index bf09b5188acd..7e18a454924c 100644
--- a/samples/bpf/xdp_monitor_user.c
+++ b/samples/bpf/xdp_monitor_user.c
@@ -141,6 +141,7 @@ struct stats_record {
 	struct record_u64 xdp_exception[XDP_ACTION_MAX];
 	struct record xdp_cpumap_kthread;
 	struct record xdp_cpumap_enqueue[MAX_CPUS];
+	struct record xdp_devmap_xmit;
 };
 
 static bool map_collect_record(int fd, __u32 key, struct record *rec)
@@ -397,7 +398,7 @@ static void stats_print(struct stats_record *stats_rec,
 			info = calc_info(r, p, t);
 			if (info > 0)
 				i_str = "sched";
-			if (pps > 0)
+			if (pps > 0 || drop > 0)
 				printf(fmt1, "cpumap-kthread",
 				       i, pps, drop, info, i_str);
 		}
@@ -409,6 +410,42 @@ static void stats_print(struct stats_record *stats_rec,
 		printf(fmt2, "cpumap-kthread", "total", pps, drop, info, i_str);
 	}
 
+	/* devmap ndo_xdp_xmit stats */
+	{
+		char *fmt1 = "%-15s %-7d %'-12.0f %'-12.0f %'-10.2f %s\n";
+		char *fmt2 = "%-15s %-7s %'-12.0f %'-12.0f %'-10.2f %s\n";
+		struct record *rec, *prev;
+		double drop, info;
+		char *i_str = "";
+
+		rec  =  &stats_rec->xdp_devmap_xmit;
+		prev = &stats_prev->xdp_devmap_xmit;
+		t = calc_period(rec, prev);
+		for (i = 0; i < nr_cpus; i++) {
+			struct datarec *r = &rec->cpu[i];
+			struct datarec *p = &prev->cpu[i];
+
+			pps  = calc_pps(r, p, t);
+			drop = calc_drop(r, p, t);
+			info = calc_info(r, p, t);
+			if (info > 0) {
+				i_str = "bulk-average";
+				info = (pps+drop) / info; /* calc avg bulk */
+			}
+			if (pps > 0 || drop > 0)
+				printf(fmt1, "devmap-xmit",
+				       i, pps, drop, info, i_str);
+		}
+		pps = calc_pps(&rec->total, &prev->total, t);
+		drop = calc_drop(&rec->total, &prev->total, t);
+		info = calc_info(&rec->total, &prev->total, t);
+		if (info > 0) {
+			i_str = "bulk-average";
+			info = (pps+drop) / info; /* calc avg bulk */
+		}
+		printf(fmt2, "devmap-xmit", "total", pps, drop, info, i_str);
+	}
+
 	printf("\n");
 }
 
@@ -437,6 +474,9 @@ static bool stats_collect(struct stats_record *rec)
 	fd = map_data[3].fd; /* map3: cpumap_kthread_cnt */
 	map_collect_record(fd, 0, &rec->xdp_cpumap_kthread);
 
+	fd = map_data[4].fd; /* map4: devmap_xmit_cnt */
+	map_collect_record(fd, 0, &rec->xdp_devmap_xmit);
+
 	return true;
 }
 
@@ -480,6 +520,7 @@ static struct stats_record *alloc_stats_record(void)
 
 	rec_sz = sizeof(struct datarec);
 	rec->xdp_cpumap_kthread.cpu = alloc_rec_per_cpu(rec_sz);
+	rec->xdp_devmap_xmit.cpu    = alloc_rec_per_cpu(rec_sz);
 
 	for (i = 0; i < MAX_CPUS; i++)
 		rec->xdp_cpumap_enqueue[i].cpu = alloc_rec_per_cpu(rec_sz);
@@ -498,6 +539,7 @@ static void free_stats_record(struct stats_record *r)
 		free(r->xdp_exception[i].cpu);
 
 	free(r->xdp_cpumap_kthread.cpu);
+	free(r->xdp_devmap_xmit.cpu);
 
 	for (i = 0; i < MAX_CPUS; i++)
 		free(r->xdp_cpumap_enqueue[i].cpu);

^ permalink raw reply related

* [bpf-next V4 PATCH 5/8] xdp: introduce xdp_return_frame_rx_napi
From: Jesper Dangaard Brouer @ 2018-05-18 13:34 UTC (permalink / raw)
  To: netdev, Daniel Borkmann, Alexei Starovoitov,
	Jesper Dangaard Brouer
  Cc: Christoph Hellwig, BjörnTöpel, Magnus Karlsson,
	makita.toshiaki
In-Reply-To: <152665044141.21055.1276346542020340263.stgit@firesoul>

When sending an xdp_frame through xdp_do_redirect call, then error
cases can happen where the xdp_frame needs to be dropped, and
returning an -errno code isn't sufficient/possible any-longer
(e.g. for cpumap case). This is already fully supported, by simply
calling xdp_return_frame.

This patch is an optimization, which provides xdp_return_frame_rx_napi,
which is a faster variant for these error cases.  It take advantage of
the protection provided by XDP RX running under NAPI protection.

This change is mostly relevant for drivers using the page_pool
allocator as it can take advantage of this. (Tested with mlx5).
---
 include/net/page_pool.h |    5 +++--
 include/net/xdp.h       |    1 +
 kernel/bpf/cpumap.c     |    2 +-
 net/core/xdp.c          |   20 ++++++++++++++++----
 4 files changed, 21 insertions(+), 7 deletions(-)

diff --git a/include/net/page_pool.h b/include/net/page_pool.h
index c79087153148..694d055e01ef 100644
--- a/include/net/page_pool.h
+++ b/include/net/page_pool.h
@@ -115,13 +115,14 @@ void page_pool_destroy(struct page_pool *pool);
 void __page_pool_put_page(struct page_pool *pool,
 			  struct page *page, bool allow_direct);
 
-static inline void page_pool_put_page(struct page_pool *pool, struct page *page)
+static inline void page_pool_put_page(struct page_pool *pool,
+				      struct page *page, bool allow_direct)
 {
 	/* When page_pool isn't compiled-in, net/core/xdp.c doesn't
 	 * allow registering MEM_TYPE_PAGE_POOL, but shield linker.
 	 */
 #ifdef CONFIG_PAGE_POOL
-	__page_pool_put_page(pool, page, false);
+	__page_pool_put_page(pool, page, allow_direct);
 #endif
 }
 /* Very limited use-cases allow recycle direct */
diff --git a/include/net/xdp.h b/include/net/xdp.h
index 0b689cf561c7..7ad779237ae8 100644
--- a/include/net/xdp.h
+++ b/include/net/xdp.h
@@ -104,6 +104,7 @@ struct xdp_frame *convert_to_xdp_frame(struct xdp_buff *xdp)
 }
 
 void xdp_return_frame(struct xdp_frame *xdpf);
+void xdp_return_frame_rx_napi(struct xdp_frame *xdpf);
 void xdp_return_buff(struct xdp_buff *xdp);
 
 int xdp_rxq_info_reg(struct xdp_rxq_info *xdp_rxq,
diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c
index c95b04ec103e..e0918d180f08 100644
--- a/kernel/bpf/cpumap.c
+++ b/kernel/bpf/cpumap.c
@@ -578,7 +578,7 @@ static int bq_flush_to_queue(struct bpf_cpu_map_entry *rcpu,
 		err = __ptr_ring_produce(q, xdpf);
 		if (err) {
 			drops++;
-			xdp_return_frame(xdpf);
+			xdp_return_frame_rx_napi(xdpf);
 		}
 		processed++;
 	}
diff --git a/net/core/xdp.c b/net/core/xdp.c
index bf6758f74339..cb8c4e061a5a 100644
--- a/net/core/xdp.c
+++ b/net/core/xdp.c
@@ -308,7 +308,13 @@ int xdp_rxq_info_reg_mem_model(struct xdp_rxq_info *xdp_rxq,
 }
 EXPORT_SYMBOL_GPL(xdp_rxq_info_reg_mem_model);
 
-static void xdp_return(void *data, struct xdp_mem_info *mem)
+/* XDP RX runs under NAPI protection, and in different delivery error
+ * scenarios (e.g. queue full), it is possible to return the xdp_frame
+ * while still leveraging this protection.  The @napi_direct boolian
+ * is used for those calls sites.  Thus, allowing for faster recycling
+ * of xdp_frames/pages in those cases.
+ */
+static void __xdp_return(void *data, struct xdp_mem_info *mem, bool napi_direct)
 {
 	struct xdp_mem_allocator *xa;
 	struct page *page;
@@ -320,7 +326,7 @@ static void xdp_return(void *data, struct xdp_mem_info *mem)
 		xa = rhashtable_lookup(mem_id_ht, &mem->id, mem_id_rht_params);
 		page = virt_to_head_page(data);
 		if (xa)
-			page_pool_put_page(xa->page_pool, page);
+			page_pool_put_page(xa->page_pool, page, napi_direct);
 		else
 			put_page(page);
 		rcu_read_unlock();
@@ -340,12 +346,18 @@ static void xdp_return(void *data, struct xdp_mem_info *mem)
 
 void xdp_return_frame(struct xdp_frame *xdpf)
 {
-	xdp_return(xdpf->data, &xdpf->mem);
+	__xdp_return(xdpf->data, &xdpf->mem, false);
 }
 EXPORT_SYMBOL_GPL(xdp_return_frame);
 
+void xdp_return_frame_rx_napi(struct xdp_frame *xdpf)
+{
+	__xdp_return(xdpf->data, &xdpf->mem, true);
+}
+EXPORT_SYMBOL_GPL(xdp_return_frame_rx_napi);
+
 void xdp_return_buff(struct xdp_buff *xdp)
 {
-	xdp_return(xdp->data, &xdp->rxq->mem);
+	__xdp_return(xdp->data, &xdp->rxq->mem, true);
 }
 EXPORT_SYMBOL_GPL(xdp_return_buff);

^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox