Netdev List

Netdev List
 help / color / mirror / Atom feed

* [PATCH bpf-next v6 05/10] bpf/verifier: improve register value range tracking with ARSH
From: Yonghong Song @ 2018-04-23 21:27 UTC (permalink / raw)
  To: ast, daniel, netdev, ecree; +Cc: kernel-team
In-Reply-To: <20180423212752.986580-1-yhs@fb.com>

When helpers like bpf_get_stack returns an int value
and later on used for arithmetic computation, the LSH and ARSH
operations are often required to get proper sign extension into
64-bit. For example, without this patch:
    54: R0=inv(id=0,umax_value=800)
    54: (bf) r8 = r0
    55: R0=inv(id=0,umax_value=800) R8_w=inv(id=0,umax_value=800)
    55: (67) r8 <<= 32
    56: R8_w=inv(id=0,umax_value=3435973836800,var_off=(0x0; 0x3ff00000000))
    56: (c7) r8 s>>= 32
    57: R8=inv(id=0)
With this patch:
    54: R0=inv(id=0,umax_value=800)
    54: (bf) r8 = r0
    55: R0=inv(id=0,umax_value=800) R8_w=inv(id=0,umax_value=800)
    55: (67) r8 <<= 32
    56: R8_w=inv(id=0,umax_value=3435973836800,var_off=(0x0; 0x3ff00000000))
    56: (c7) r8 s>>= 32
    57: R8=inv(id=0, umax_value=800,var_off=(0x0; 0x3ff))
With better range of "R8", later on when "R8" is added to other register,
e.g., a map pointer or scalar-value register, the better register
range can be derived and verifier failure may be avoided.

In our later example,
    ......
    usize = bpf_get_stack(ctx, raw_data, max_len, BPF_F_USER_STACK);
    if (usize < 0)
        return 0;
    ksize = bpf_get_stack(ctx, raw_data + usize, max_len - usize, 0);
    ......
Without improving ARSH value range tracking, the register representing
"max_len - usize" will have smin_value equal to S64_MIN and will be
rejected by verifier.

Signed-off-by: Yonghong Song <yhs@fb.com>
---
 include/linux/tnum.h  |  4 +++-
 kernel/bpf/tnum.c     | 10 ++++++++++
 kernel/bpf/verifier.c | 41 +++++++++++++++++++++++++++++++++++++++++
 3 files changed, 54 insertions(+), 1 deletion(-)

diff --git a/include/linux/tnum.h b/include/linux/tnum.h
index 0d2d3da..c7dc2b5 100644
--- a/include/linux/tnum.h
+++ b/include/linux/tnum.h
@@ -23,8 +23,10 @@ struct tnum tnum_range(u64 min, u64 max);
 /* Arithmetic and logical ops */
 /* Shift a tnum left (by a fixed shift) */
 struct tnum tnum_lshift(struct tnum a, u8 shift);
-/* Shift a tnum right (by a fixed shift) */
+/* Shift (rsh) a tnum right (by a fixed shift) */
 struct tnum tnum_rshift(struct tnum a, u8 shift);
+/* Shift (arsh) a tnum right (by a fixed min_shift) */
+struct tnum tnum_arshift(struct tnum a, u8 min_shift);
 /* Add two tnums, return @a + @b */
 struct tnum tnum_add(struct tnum a, struct tnum b);
 /* Subtract two tnums, return @a - @b */
diff --git a/kernel/bpf/tnum.c b/kernel/bpf/tnum.c
index 1f4bf68..938d412 100644
--- a/kernel/bpf/tnum.c
+++ b/kernel/bpf/tnum.c
@@ -43,6 +43,16 @@ struct tnum tnum_rshift(struct tnum a, u8 shift)
 	return TNUM(a.value >> shift, a.mask >> shift);
 }
 
+struct tnum tnum_arshift(struct tnum a, u8 min_shift)
+{
+	/* if a.value is negative, arithmetic shifting by minimum shift
+	 * will have larger negative offset compared to more shifting.
+	 * If a.value is nonnegative, arithmetic shifting by minimum shift
+	 * will have larger positive offset compare to more shifting.
+	 */
+	return TNUM((s64)a.value >> min_shift, (s64)a.mask >> min_shift);
+}
+
 struct tnum tnum_add(struct tnum a, struct tnum b)
 {
 	u64 sm, sv, sigma, chi, mu;
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 217d92a..643923e 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -2968,6 +2968,47 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env,
 		/* We may learn something more from the var_off */
 		__update_reg_bounds(dst_reg);
 		break;
+	case BPF_ARSH:
+		if (umax_val >= insn_bitness) {
+			/* Shifts greater than 31 or 63 are undefined.
+			 * This includes shifts by a negative number.
+			 */
+			mark_reg_unknown(env, regs, insn->dst_reg);
+			break;
+		}
+
+		/* BPF_ARSH is an arithmetic shift. The new range of
+		 * smin_value and smax_value should take the sign
+		 * into consideration.
+		 *
+		 * For example, if smin_value = -16, umin_val = 0
+		 * and umax_val = 2, the new smin_value should be
+		 * -16 >> 0 = -16 since -16 >> 2 = -4.
+		 * If smin_value = 16, umin_val = 0 and umax_val = 2,
+		 * the new smin_value should be 16 >> 2 = 4.
+		 *
+		 * Now suppose smax_value = -4, umin_val = 0 and
+		 * umax_val = 2, the new smax_value should be
+		 * -4 >> 2 = -1. If smax_value = 32 with the same
+		 * umin_val/umax_val, the new smax_value should remain 32.
+		 */
+		if (dst_reg->smin_value < 0)
+			dst_reg->smin_value >>= umin_val;
+		else
+			dst_reg->smin_value >>= umax_val;
+		if (dst_reg->smax_value < 0)
+			dst_reg->smax_value >>= umax_val;
+		else
+			dst_reg->smax_value >>= umin_val;
+		dst_reg->var_off = tnum_arshift(dst_reg->var_off, umin_val);
+
+		/* blow away the dst_reg umin_value/umax_value and rely on
+		 * dst_reg var_off to refine the result.
+		 */
+		dst_reg->umin_value = 0;
+		dst_reg->umax_value = U64_MAX;
+		__update_reg_bounds(dst_reg);
+		break;
 	default:
 		mark_reg_unknown(env, regs, insn->dst_reg);
 		break;
-- 
2.9.5

^ permalink raw reply related

* [PATCH bpf-next v6 03/10] bpf/verifier: refine retval R0 state for bpf_get_stack helper
From: Yonghong Song @ 2018-04-23 21:27 UTC (permalink / raw)
  To: ast, daniel, netdev, ecree; +Cc: kernel-team
In-Reply-To: <20180423212752.986580-1-yhs@fb.com>

The special property of return values for helpers bpf_get_stack
and bpf_probe_read_str are captured in verifier.
Both helpers return a negative error code or
a length, which is equal to or smaller than the buffer
size argument. This additional information in the
verifier can avoid the condition such as "retval > bufsize"
in the bpf program. For example, for the code blow,
    usize = bpf_get_stack(ctx, raw_data, max_len, BPF_F_USER_STACK);
    if (usize < 0 || usize > max_len)
        return 0;
The verifier may have the following errors:
    52: (85) call bpf_get_stack#65
     R0=map_value(id=0,off=0,ks=4,vs=1600,imm=0) R1_w=ctx(id=0,off=0,imm=0)
     R2_w=map_value(id=0,off=0,ks=4,vs=1600,imm=0) R3_w=inv800 R4_w=inv256
     R6=ctx(id=0,off=0,imm=0) R7=map_value(id=0,off=0,ks=4,vs=1600,imm=0)
     R9_w=inv800 R10=fp0,call_-1
    53: (bf) r8 = r0
    54: (bf) r1 = r8
    55: (67) r1 <<= 32
    56: (bf) r2 = r1
    57: (77) r2 >>= 32
    58: (25) if r2 > 0x31f goto pc+33
     R0=inv(id=0) R1=inv(id=0,smax_value=9223372032559808512,
                         umax_value=18446744069414584320,
                         var_off=(0x0; 0xffffffff00000000))
     R2=inv(id=0,umax_value=799,var_off=(0x0; 0x3ff))
     R6=ctx(id=0,off=0,imm=0) R7=map_value(id=0,off=0,ks=4,vs=1600,imm=0)
     R8=inv(id=0) R9=inv800 R10=fp0,call_-1
    59: (1f) r9 -= r8
    60: (c7) r1 s>>= 32
    61: (bf) r2 = r7
    62: (0f) r2 += r1
    math between map_value pointer and register with unbounded
    min value is not allowed
The failure is due to llvm compiler optimization where register "r2",
which is a copy of "r1", is tested for condition while later on "r1"
is used for map_ptr operation. The verifier is not able to track such
inst sequence effectively.

Without the "usize > max_len" condition, there is no llvm optimization
and the below generated code passed verifier:
    52: (85) call bpf_get_stack#65
     R0=map_value(id=0,off=0,ks=4,vs=1600,imm=0) R1_w=ctx(id=0,off=0,imm=0)
     R2_w=map_value(id=0,off=0,ks=4,vs=1600,imm=0) R3_w=inv800 R4_w=inv256
     R6=ctx(id=0,off=0,imm=0) R7=map_value(id=0,off=0,ks=4,vs=1600,imm=0)
     R9_w=inv800 R10=fp0,call_-1
    53: (b7) r1 = 0
    54: (bf) r8 = r0
    55: (67) r8 <<= 32
    56: (c7) r8 s>>= 32
    57: (6d) if r1 s> r8 goto pc+24
     R0=inv(id=0,umax_value=800,var_off=(0x0; 0x3ff))
     R1=inv0 R6=ctx(id=0,off=0,imm=0)
     R7=map_value(id=0,off=0,ks=4,vs=1600,imm=0)
     R8=inv(id=0,umax_value=800,var_off=(0x0; 0x3ff)) R9=inv800
     R10=fp0,call_-1
    58: (bf) r2 = r7
    59: (0f) r2 += r8
    60: (1f) r9 -= r8
    61: (bf) r1 = r6

Signed-off-by: Yonghong Song <yhs@fb.com>
---
 kernel/bpf/verifier.c | 27 +++++++++++++++++++++++++++
 1 file changed, 27 insertions(+)

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index aba9425..d531119 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -164,6 +164,8 @@ struct bpf_call_arg_meta {
 	bool pkt_access;
 	int regno;
 	int access_size;
+	s64 msize_smax_value;
+	u64 msize_umax_value;
 };
 
 static DEFINE_MUTEX(bpf_verifier_lock);
@@ -1994,6 +1996,12 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
 	} else if (arg_type_is_mem_size(arg_type)) {
 		bool zero_size_allowed = (arg_type == ARG_CONST_SIZE_OR_ZERO);
 
+		/* remember the mem_size which may be used later
+		 * to refine return values.
+		 */
+		meta->msize_smax_value = reg->smax_value;
+		meta->msize_umax_value = reg->umax_value;
+
 		/* The register is SCALAR_VALUE; the access check
 		 * happens using its boundaries.
 		 */
@@ -2333,6 +2341,23 @@ static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx)
 	return 0;
 }
 
+static void do_refine_retval_range(struct bpf_reg_state *regs, int ret_type,
+				   int func_id,
+				   struct bpf_call_arg_meta *meta)
+{
+	struct bpf_reg_state *ret_reg = &regs[BPF_REG_0];
+
+	if (ret_type != RET_INTEGER ||
+	    (func_id != BPF_FUNC_get_stack &&
+	     func_id != BPF_FUNC_probe_read_str))
+		return;
+
+	ret_reg->smax_value = meta->msize_smax_value;
+	ret_reg->umax_value = meta->msize_umax_value;
+	__reg_deduce_bounds(ret_reg);
+	__reg_bound_offset(ret_reg);
+}
+
 static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn_idx)
 {
 	const struct bpf_func_proto *fn = NULL;
@@ -2456,6 +2481,8 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn
 		return -EINVAL;
 	}
 
+	do_refine_retval_range(regs, fn->ret_type, func_id, &meta);
+
 	err = check_map_func_compatibility(env, meta.map_ptr, func_id);
 	if (err)
 		return err;
-- 
2.9.5

^ permalink raw reply related

* [PATCH bpf-next v6 01/10] bpf: change prototype for stack_map_get_build_id_offset
From: Yonghong Song @ 2018-04-23 21:27 UTC (permalink / raw)
  To: ast, daniel, netdev, ecree; +Cc: kernel-team
In-Reply-To: <20180423212752.986580-1-yhs@fb.com>

This patch didn't incur functionality change. The function prototype
got changed so that the same function can be reused later.

Signed-off-by: Yonghong Song <yhs@fb.com>
---
 kernel/bpf/stackmap.c | 13 +++++--------
 1 file changed, 5 insertions(+), 8 deletions(-)

diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index 57eeb12..04f6ec1 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -262,16 +262,11 @@ static int stack_map_get_build_id(struct vm_area_struct *vma,
 	return ret;
 }
 
-static void stack_map_get_build_id_offset(struct bpf_map *map,
-					  struct stack_map_bucket *bucket,
+static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs,
 					  u64 *ips, u32 trace_nr, bool user)
 {
 	int i;
 	struct vm_area_struct *vma;
-	struct bpf_stack_build_id *id_offs;
-
-	bucket->nr = trace_nr;
-	id_offs = (struct bpf_stack_build_id *)bucket->data;
 
 	/*
 	 * We cannot do up_read() in nmi context, so build_id lookup is
@@ -361,8 +356,10 @@ BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map,
 			pcpu_freelist_pop(&smap->freelist);
 		if (unlikely(!new_bucket))
 			return -ENOMEM;
-		stack_map_get_build_id_offset(map, new_bucket, ips,
-					      trace_nr, user);
+		new_bucket->nr = trace_nr;
+		stack_map_get_build_id_offset(
+			(struct bpf_stack_build_id *)new_bucket->data,
+			ips, trace_nr, user);
 		trace_len = trace_nr * sizeof(struct bpf_stack_build_id);
 		if (hash_matches && bucket->nr == trace_nr &&
 		    memcmp(bucket->data, new_bucket->data, trace_len) == 0) {
-- 
2.9.5

^ permalink raw reply related

* [PATCH bpf-next v6 04/10] bpf: remove never-hit branches in verifier adjust_scalar_min_max_vals
From: Yonghong Song @ 2018-04-23 21:27 UTC (permalink / raw)
  To: ast, daniel, netdev, ecree; +Cc: kernel-team
In-Reply-To: <20180423212752.986580-1-yhs@fb.com>

In verifier function adjust_scalar_min_max_vals,
when src_known is false and the opcode is BPF_LSH/BPF_RSH,
early return will happen in the function. So remove
the branch in handling BPF_LSH/BPF_RSH when src_known is false.

Signed-off-by: Yonghong Song <yhs@fb.com>
---
 kernel/bpf/verifier.c | 11 ++---------
 1 file changed, 2 insertions(+), 9 deletions(-)

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index d531119..217d92a 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -2934,10 +2934,7 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env,
 			dst_reg->umin_value <<= umin_val;
 			dst_reg->umax_value <<= umax_val;
 		}
-		if (src_known)
-			dst_reg->var_off = tnum_lshift(dst_reg->var_off, umin_val);
-		else
-			dst_reg->var_off = tnum_lshift(tnum_unknown, umin_val);
+		dst_reg->var_off = tnum_lshift(dst_reg->var_off, umin_val);
 		/* We may learn something more from the var_off */
 		__update_reg_bounds(dst_reg);
 		break;
@@ -2965,11 +2962,7 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env,
 		 */
 		dst_reg->smin_value = S64_MIN;
 		dst_reg->smax_value = S64_MAX;
-		if (src_known)
-			dst_reg->var_off = tnum_rshift(dst_reg->var_off,
-						       umin_val);
-		else
-			dst_reg->var_off = tnum_rshift(tnum_unknown, umin_val);
+		dst_reg->var_off = tnum_rshift(dst_reg->var_off, umin_val);
 		dst_reg->umin_value >>= umax_val;
 		dst_reg->umax_value >>= umin_val;
 		/* We may learn something more from the var_off */
-- 
2.9.5

^ permalink raw reply related

* Re: [PATCH net-next v2 0/2] openvswitch: Support conntrack zone limit
From: Yi-Hung Wei @ 2018-04-23 21:19 UTC (permalink / raw)
  To: Pravin Shelar
  Cc: David Miller, Linux Kernel Network Developers, Florian Westphal
In-Reply-To: <CAOrHB_CzX4cR6KKkwC=2ap3gaxnX7FD1fytyAQBcgbQof_ixDA@mail.gmail.com>

On Mon, Apr 23, 2018 at 1:10 PM, Pravin Shelar <pshelar@ovn.org> wrote:
> On Mon, Apr 23, 2018 at 6:39 AM, David Miller <davem@davemloft.net> wrote:
>> From: Yi-Hung Wei <yihung.wei@gmail.com>
>> Date: Tue, 17 Apr 2018 17:30:27 -0700
>>
>>> Currently, nf_conntrack_max is used to limit the maximum number of
>>> conntrack entries in the conntrack table for every network namespace.
>>> For the VMs and containers that reside in the same namespace,
>>> they share the same conntrack table, and the total # of conntrack entries
>>> for all the VMs and containers are limited by nf_conntrack_max.  In this
>>> case, if one of the VM/container abuses the usage the conntrack entries,
>>> it blocks the others from committing valid conntrack entries into the
>>> conntrack table.  Even if we can possibly put the VM in different network
>>> namespace, the current nf_conntrack_max configuration is kind of rigid
>>> that we cannot limit different VM/container to have different # conntrack
>>> entries.
>>>
>
> Hi
> This looks like general problem related to nf zone usage limit, Did
> you considered changing nf-conntrack to have a per zone limit, so that
> all users of nf-filter can use it. I prefer this to adding a wrapper
> in OVS nf-filter layer.
>
> Thanks,
> Pravin.
>

Hi Prvain,

Thanks for your comment.  Originally, I was thinking to add this
feature in nf_conntrack and had some discussion with Florian.  It
turns out that iptables and nft have their own way to keep track of
the connection limits, and it sounds reasonable to share the backend
that counts the number of connections, but each module can enforce the
connection limit in their own way.  Therefore, Florian helped to pull
out the common backend to nf_conncount in the following commit. The
nf_conncount then can be used by xtables, nft, and ovs.

commit 625c556118f3c2fd28bb8ef6da18c53bd4037be4
Author: Florian Westphal <fw@strlen.de>
Date:   Sat Dec 9 21:01:08 2017 +0100

    netfilter: connlimit: split xt_connlimit into front and backend

This allows to reuse xt_connlimit infrastructure from nf_tables.
The upcoming nf_tables frontend can just pass in an nftables register
as input key, this allows limiting by any nft-supported key, including
concatenations.  For xt_connlimit, pass in the zone and the ip/ipv6 addres.
....

Basically, to achieve conntrack zone limit in OVS.  We need the
following 3 parts.
1. Count the number of connections (this is provided by netfilter's
nf_conncount backend)
2. Keep track of the connection limits of zones, and check if it
exceeds the limit.
3. An API for userspace to set/delete/get the conntrack zone limit.

This patch series implements item 2 and 3, and it reuses the
nf_conncount from netfiler for the first part.

Thanks,

-Yi-Hung

^ permalink raw reply

* Re: [PATCH bpf-next v4 1/2] bpf: allow map helpers access to map values directly
From: Daniel Borkmann @ 2018-04-23 21:18 UTC (permalink / raw)
  To: Paul Chaignon, Alexei Starovoitov, netdev; +Cc: iovisor-dev, paul.chaignon
In-Reply-To: <3c8b73a6ea66eb01419da4ff0c611f72c7905a5a.1524407665.git.paul.chaignon@orange.com>

On 04/22/2018 11:52 PM, Paul Chaignon wrote:
> Helpers that expect ARG_PTR_TO_MAP_KEY and ARG_PTR_TO_MAP_VALUE can only
> access stack and packet memory.  Allow these helpers to directly access
> map values by passing registers of type PTR_TO_MAP_VALUE.
> 
> This change removes the need for an extra copy to the stack when using a
> map value to perform a second map lookup, as in the following:
> 
> struct bpf_map_def SEC("maps") infobyreq = {
>     .type = BPF_MAP_TYPE_HASHMAP,
>     .key_size = sizeof(struct request *),
>     .value_size = sizeof(struct info_t),
>     .max_entries = 1024,
> };
> struct bpf_map_def SEC("maps") counts = {
>     .type = BPF_MAP_TYPE_HASHMAP,
>     .key_size = sizeof(struct info_t),
>     .value_size = sizeof(u64),
>     .max_entries = 1024,
> };
> SEC("kprobe/blk_account_io_start")
> int bpf_blk_account_io_start(struct pt_regs *ctx)
> {
>     struct info_t *info = bpf_map_lookup_elem(&infobyreq, &ctx->di);
>     u64 *count = bpf_map_lookup_elem(&counts, info);
>     (*count)++;
> }
> 
> Signed-off-by: Paul Chaignon <paul.chaignon@orange.com>
> ---
>  kernel/bpf/verifier.c | 9 ++++++++-
>  1 file changed, 8 insertions(+), 1 deletion(-)
> 
> diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
> index 5dd1dcb902bf..70e00beade03 100644
> --- a/kernel/bpf/verifier.c
> +++ b/kernel/bpf/verifier.c
> @@ -1914,7 +1914,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
>  	if (arg_type == ARG_PTR_TO_MAP_KEY ||
>  	    arg_type == ARG_PTR_TO_MAP_VALUE) {
>  		expected_type = PTR_TO_STACK;
> -		if (!type_is_pkt_pointer(type) &&
> +		if (!type_is_pkt_pointer(type) && type != PTR_TO_MAP_VALUE &&
>  		    type != expected_type)
>  			goto err_type;
>  	} else if (arg_type == ARG_CONST_SIZE ||
> @@ -1970,6 +1970,9 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
>  			err = check_packet_access(env, regno, reg->off,
>  						  meta->map_ptr->key_size,
>  						  false);
> +		else if (type == PTR_TO_MAP_VALUE)
> +			err = check_map_access(env, regno, reg->off,
> +					       meta->map_ptr->key_size, false);
>  		else
>  			err = check_stack_boundary(env, regno,
>  						   meta->map_ptr->key_size,

We should reuse check_helper_mem_access() here which covers all three cases
from above already and simplifies the code a bit.

> @@ -1987,6 +1990,10 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
>  			err = check_packet_access(env, regno, reg->off,
>  						  meta->map_ptr->value_size,
>  						  false);
> +		else if (type == PTR_TO_MAP_VALUE)
> +			err = check_map_access(env, regno, reg->off,
> +					       meta->map_ptr->value_size,
> +					       false);
>  		else
>  			err = check_stack_boundary(env, regno,
>  						   meta->map_ptr->value_size,
> 

Ditto.

Thanks,
Daniel

^ permalink raw reply

* Re: [PATCH net-next 0/4] mm,tcp: provide mmap_hook to solve lockdep issue
From: Andy Lutomirski @ 2018-04-23 21:14 UTC (permalink / raw)
  To: Eric Dumazet, David S . Miller
  Cc: netdev, linux-kernel, Soheil Hassas Yeganeh, Eric Dumazet,
	linux-mm, Linux API
In-Reply-To: <20180420155542.122183-1-edumazet@google.com>

On 04/20/2018 08:55 AM, Eric Dumazet wrote:
> This patch series provide a new mmap_hook to fs willing to grab
> a mutex before mm->mmap_sem is taken, to ensure lockdep sanity.
> 
> This hook allows us to shorten tcp_mmap() execution time (while mmap_sem
> is held), and improve multi-threading scalability.
> 

I think that the right solution is to rework mmap() on TCP sockets a 
bit.  The current approach in net-next is very strange for a few reasons:

1. It uses mmap() as an operation that has side effects besides just 
creating a mapping.  If nothing else, it's surprising, since mmap() 
doesn't usually do that.  But it's also causing problems like what 
you're seeing.

2. The performance is worse than it needs to be.  mmap() is slow, and I 
doubt you'll find many mm developers who consider this particular abuse 
of mmap() to be a valid thing to optimize for.

3. I'm not at all convinced the accounting is sane.  As far as I can 
tell, you're allowing unprivileged users to increment the count on 
network-owned pages, limited only by available virtual memory, without 
obviously charging it to the socket buffer limits.  It looks like a 
program that simply forgot to call munmap() would cause the system to 
run out of memory, and I see no reason to expect the OOM killer to have 
any real chance of killing the right task.

4. Error handling sucks.  If I try to mmap() a large range (which is the 
whole point -- using a small range will kill performance) and not quite 
all of it can be mapped, then I waste a bunch of time in the kernel and 
get *none* of the range mapped.

I would suggest that you rework the interface a bit.  First a user would 
call mmap() on a TCP socket, which would create an empty VMA.  (It would 
set vm_ops to point to tcp_vm_ops or similar so that the TCP code could 
recognize it, but it would have no effect whatsoever on the TCP state 
machine.  Reading the VMA would get SIGBUS.)  Then a user would call a 
new ioctl() or setsockopt() function and pass something like:

struct tcp_zerocopy_receive {
   void *address;
   size_t length;
};

The kernel would verify that [address, address+length) is entirely 
inside a single TCP VMA and then would do the vm_insert_range magic.  On 
success, length is changed to the length that actually got mapped.  The 
kernel could do this while holding mmap_sem for *read*, and it could get 
the lock ordering right.  If and when mm range locks ever get merged, it 
could switch to using a range lock.

Then you could use MADV_DONTNEED or another ioctl/setsockopt to zap the 
part of the mapping that you're done with.

Does this seem reasonable?  It should involve very little code change, 
it will run faster, it will scale better, and it is much less weird IMO.

^ permalink raw reply

* [Patch nf] ipvs: initialize tbl->entries in ip_vs_lblc_init_svc()
From: Cong Wang @ 2018-04-23 21:04 UTC (permalink / raw)
  To: netdev
  Cc: lvs-devel, netfilter-devel, Cong Wang, Simon Horman,
	Julian Anastasov, Pablo Neira Ayuso

Similarly, tbl->entries is not initialized after kmalloc(),
therefore causes an uninit-value warning in ip_vs_lblc_check_expire(),
as reported by syzbot.

Reported-by: <syzbot+3e9695f147fb529aa9bc@syzkaller.appspotmail.com>
Cc: Simon Horman <horms@verge.net.au>
Cc: Julian Anastasov <ja@ssi.bg>
Cc: Pablo Neira Ayuso <pablo@netfilter.org>
Signed-off-by: Cong Wang <xiyou.wangcong@gmail.com>
---
 net/netfilter/ipvs/ip_vs_lblc.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/net/netfilter/ipvs/ip_vs_lblc.c b/net/netfilter/ipvs/ip_vs_lblc.c
index 3057e453bf31..83918119ceb8 100644
--- a/net/netfilter/ipvs/ip_vs_lblc.c
+++ b/net/netfilter/ipvs/ip_vs_lblc.c
@@ -371,6 +371,7 @@ static int ip_vs_lblc_init_svc(struct ip_vs_service *svc)
 	tbl->counter = 1;
 	tbl->dead = false;
 	tbl->svc = svc;
+	atomic_set(&tbl->entries, 0);
 
 	/*
 	 *    Hook periodic timer for garbage collection
-- 
2.13.0

^ permalink raw reply related

* [Patch nf] ipvs: initialize tbl->entries after allocation
From: Cong Wang @ 2018-04-23 20:53 UTC (permalink / raw)
  To: netdev
  Cc: lvs-devel, netfilter-devel, Cong Wang, Simon Horman,
	Julian Anastasov, Pablo Neira Ayuso

tbl->entries is not initialized after kmalloc(), therefore
causes an uninit-value warning in ip_vs_lblc_check_expire()
as reported by syzbot.

Reported-by: <syzbot+3dfdea57819073a04f21@syzkaller.appspotmail.com>
Cc: Simon Horman <horms@verge.net.au>
Cc: Julian Anastasov <ja@ssi.bg>
Cc: Pablo Neira Ayuso <pablo@netfilter.org>
Signed-off-by: Cong Wang <xiyou.wangcong@gmail.com>
---
 net/netfilter/ipvs/ip_vs_lblcr.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/net/netfilter/ipvs/ip_vs_lblcr.c b/net/netfilter/ipvs/ip_vs_lblcr.c
index 92adc04557ed..bc2bc5eebcb8 100644
--- a/net/netfilter/ipvs/ip_vs_lblcr.c
+++ b/net/netfilter/ipvs/ip_vs_lblcr.c
@@ -534,6 +534,7 @@ static int ip_vs_lblcr_init_svc(struct ip_vs_service *svc)
 	tbl->counter = 1;
 	tbl->dead = false;
 	tbl->svc = svc;
+	atomic_set(&tbl->entries, 0);
 
 	/*
 	 *    Hook periodic timer for garbage collection
-- 
2.13.0

^ permalink raw reply related

* Re: [PATCH net-next v2 0/2] openvswitch: Support conntrack zone limit
From: Pravin Shelar @ 2018-04-23 20:10 UTC (permalink / raw)
  To: David Miller; +Cc: Yi-Hung Wei, Linux Kernel Network Developers
In-Reply-To: <20180423.093927.2276689799061091037.davem@davemloft.net>

On Mon, Apr 23, 2018 at 6:39 AM, David Miller <davem@davemloft.net> wrote:
> From: Yi-Hung Wei <yihung.wei@gmail.com>
> Date: Tue, 17 Apr 2018 17:30:27 -0700
>
>> Currently, nf_conntrack_max is used to limit the maximum number of
>> conntrack entries in the conntrack table for every network namespace.
>> For the VMs and containers that reside in the same namespace,
>> they share the same conntrack table, and the total # of conntrack entries
>> for all the VMs and containers are limited by nf_conntrack_max.  In this
>> case, if one of the VM/container abuses the usage the conntrack entries,
>> it blocks the others from committing valid conntrack entries into the
>> conntrack table.  Even if we can possibly put the VM in different network
>> namespace, the current nf_conntrack_max configuration is kind of rigid
>> that we cannot limit different VM/container to have different # conntrack
>> entries.
>>

Hi
This looks like general problem related to nf zone usage limit, Did
you considered changing nf-conntrack to have a per zone limit, so that
all users of nf-filter can use it. I prefer this to adding a wrapper
in OVS nf-filter layer.

Thanks,
Pravin.

>> To address the aforementioned issue, this patch proposes to have a
>> fine-grained mechanism that could further limit the # of conntrack entries
>> per-zone.  For example, we can designate different zone to different VM,
>> and set conntrack limit to each zone.  By providing this isolation, a
>> mis-behaved VM only consumes the conntrack entries in its own zone, and
>> it will not influence other well-behaved VMs.  Moreover, the users can
>> set various conntrack limit to different zone based on their preference.
>>
>> The proposed implementation utilizes Netfilter's nf_conncount backend
>> to count the number of connections in a particular zone.  If the number of
>> connection is above a configured limitation, OVS will return ENOMEM to the
>> userspace.  If userspace does not configure the zone limit, the limit
>> defaults to zero that is no limitation, which is backward compatible to
>> the behavior without this patch.
>>
>> The first patch defines the conntrack limit netlink definition, and the
>> second patch provides the implementation.
>
> Pravin, I need this series reviewed.
>
> Thank you.

^ permalink raw reply

* Re: [PATCH bpf-next 02/15] xsk: add user memory registration support sockopt
From: Michael S. Tsirkin @ 2018-04-23 20:26 UTC (permalink / raw)
  To: Björn Töpel
  Cc: Karlsson, Magnus, Duyck, Alexander H, Alexander Duyck,
	John Fastabend, Alexei Starovoitov, Jesper Dangaard Brouer,
	Willem de Bruijn, Daniel Borkmann, Netdev, Björn Töpel,
	michael.lundkvist, Brandeburg, Jesse, Singhai, Anjali,
	Zhang, Qi Z
In-Reply-To: <CAJ+HfNhbNeBfiMP+dK0ZGU0dfYrkWpo5jOAidUoUnkxRMBb0xg@mail.gmail.com>

On Mon, Apr 23, 2018 at 10:15:18PM +0200, Björn Töpel wrote:
> 2018-04-23 22:11 GMT+02:00 Michael S. Tsirkin <mst@redhat.com>:
> > On Mon, Apr 23, 2018 at 10:00:15PM +0200, Björn Töpel wrote:
> >> 2018-04-23 18:18 GMT+02:00 Michael S. Tsirkin <mst@redhat.com>:
> >>
> >> [...]
> >>
> >> >> +static void xdp_umem_unpin_pages(struct xdp_umem *umem)
> >> >> +{
> >> >> +     unsigned int i;
> >> >> +
> >> >> +     if (umem->pgs) {
> >> >> +             for (i = 0; i < umem->npgs; i++)
> >> >
> >> > Since you pin them with FOLL_WRITE, I assume these pages
> >> > are written to.
> >> > Don't you need set_page_dirty_lock here?
> >> >
> >>
> >> Hmm, I actually *removed* it from the RFC V2, but after doing some
> >> homework, I think you're right. Thanks for pointing this out!
> >>
> >> Thinking more about this; This function is called from sk_destruct,
> >> and in the Tx case the sk_destruct can be called from interrupt
> >> context, where set_page_dirty_lock cannot be called.
> >>
> >> Are there any preferred ways of solving this? Scheduling the whole
> >> xsk_destruct call to a workqueue is one way (I think). Any
> >> cleaner/better way?
> >>
> >> [...]
> >
> > Defer unpinning pages until the next tx call?
> >
> 
> If the sock is released, there wont be another tx call.

unpin them on socket release too?

> Or am I
> missing something obvious?
> 
> >
> >> >> +static int __xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr)
> >> >> +{
> >> >> +     u32 frame_size = mr->frame_size, frame_headroom = mr->frame_headroom;
> >> >> +     u64 addr = mr->addr, size = mr->len;
> >> >> +     unsigned int nframes;
> >> >> +     int size_chk, err;
> >> >> +
> >> >> +     if (frame_size < XDP_UMEM_MIN_FRAME_SIZE || frame_size > PAGE_SIZE) {
> >> >> +             /* Strictly speaking we could support this, if:
> >> >> +              * - huge pages, or*
> >> >
> >> > what does "or*" here mean?
> >> >
> >>
> >> Oops, I'll change to just 'or' in the next revision.
> >>
> >>
> >> Thanks!
> >> Björn

^ permalink raw reply

* Re: [PATCH 00/12] Netfilter/IPVS fixes for net
From: David Miller @ 2018-04-23 20:22 UTC (permalink / raw)
  To: pablo; +Cc: netfilter-devel, netdev
In-Reply-To: <20180423175714.9794-1-pablo@netfilter.org>

From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Mon, 23 Apr 2018 19:57:02 +0200

> The following patchset contains Netfilter/IPVS fixes for your net tree,
> they are:
 ...
> You can pull these changes from:
> 
>   git://git.kernel.org/pub/scm/linux/kernel/git/pablo/nf.git

Pulled, thank you.

^ permalink raw reply

* Re: [bpf-next PATCH 3/3] bpf: add sample program to trace map events
From: Jesper Dangaard Brouer @ 2018-04-23 20:22 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: Sebastiano Miano, netdev, ast, daniel, mingo, rostedt,
	fulvio.risso, David S. Miller, brouer
In-Reply-To: <20180423200801.m2kcrjcoavuibbmk@ast-mbp>

On Mon, 23 Apr 2018 14:08:02 -0600
Alexei Starovoitov <alexei.starovoitov@gmail.com> wrote:

> On Mon, Apr 23, 2018 at 04:08:36PM +0200, Sebastiano Miano wrote:
> > 
> > That's in fact the real use case for the first two patches. Since bpf
> > tracepoints are still a rather common (and easy to use) troubleshooting and
> > monitoring tool why shouldn't we "enhance" their support with the newly
> > added map/prog IDs?  
> 
> because these tracepoints can be abused in the way that this patch demonstrated.
> Whether to keep this patch in the series or not is irrelevant.

I don't understand your abuse use-case, can you explain what you mean?

You do realize that these tracepoints can _only_ monitor the userspace
map activity (not kernel map changes) ... and we _do_ need a way to
debug this (and without the map_id I can tell which map).

-- 
Best regards,
  Jesper Dangaard Brouer
  MSc.CS, Principal Kernel Engineer at Red Hat
  LinkedIn: http://www.linkedin.com/in/brouer

^ permalink raw reply

* Re: [nf-next] netfilter: extend SRH match to support matching previous, next and last SID
From: Ahmed Abdelsalam @ 2018-04-23 20:16 UTC (permalink / raw)
  To: Florian Westphal
  Cc: Pablo Neira Ayuso, davem, dav.lebrun, linux-kernel,
	netfilter-devel, coreteam, netdev
In-Reply-To: <20180423200844.bq3ksj262brrifnj@breakpoint.cc>

On Mon, 23 Apr 2018 22:08:44 +0200
Florian Westphal <fw@strlen.de> wrote:

> Ahmed Abdelsalam <amsalam20@gmail.com> wrote:
> > > > @@ -50,6 +62,12 @@ struct ip6t_srh {
> > > >  	__u8                    segs_left;
> > > >  	__u8                    last_entry;
> > > >  	__u16                   tag;
> > > > +	struct in6_addr		psid_addr;
> > > > +	struct in6_addr		nsid_addr;
> > > > +	struct in6_addr		lsid_addr;
> > > > +	struct in6_addr		psid_msk;
> > > > +	struct in6_addr		nsid_msk;
> > > > +	struct in6_addr		lsid_msk;
> > > 
> > > This is changing something exposed through UAPI, so you will need a
> > > new revision for this.
> > 
> > Could you please advice what should be done in this case? 
> 
> You need to add
> struct ip6t_srh_v1 {
> 	/* copy of struct ip6t_srh here */
> 
> 	/* new fields go here */
> };
> 
> 
> Look at xt_conntrack.c, conntrack_mt_reg[] for an example of
> multi-revision match.
> 
> You can probably re-origanise code to avoid too much duplication.
> See 5a786232eb69a1f870ddc0cfd69d5bdef241a2ea in nf.git for an example,
> it makes v0 into a v1 struct at runtime and re-uses new v1 code
> for old v0.
> 
> 

Thanks Florian!  

-- 
Ahmed Abdelsalam <amsalam20@gmail.com>

^ permalink raw reply

* Re: [PATCH bpf-next 02/15] xsk: add user memory registration support sockopt
From: Björn Töpel @ 2018-04-23 20:15 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: Karlsson, Magnus, Duyck, Alexander H, Alexander Duyck,
	John Fastabend, Alexei Starovoitov, Jesper Dangaard Brouer,
	Willem de Bruijn, Daniel Borkmann, Netdev, Björn Töpel,
	michael.lundkvist, Brandeburg, Jesse, Singhai, Anjali,
	Zhang, Qi Z
In-Reply-To: <20180423231026-mutt-send-email-mst@kernel.org>

2018-04-23 22:11 GMT+02:00 Michael S. Tsirkin <mst@redhat.com>:
> On Mon, Apr 23, 2018 at 10:00:15PM +0200, Björn Töpel wrote:
>> 2018-04-23 18:18 GMT+02:00 Michael S. Tsirkin <mst@redhat.com>:
>>
>> [...]
>>
>> >> +static void xdp_umem_unpin_pages(struct xdp_umem *umem)
>> >> +{
>> >> +     unsigned int i;
>> >> +
>> >> +     if (umem->pgs) {
>> >> +             for (i = 0; i < umem->npgs; i++)
>> >
>> > Since you pin them with FOLL_WRITE, I assume these pages
>> > are written to.
>> > Don't you need set_page_dirty_lock here?
>> >
>>
>> Hmm, I actually *removed* it from the RFC V2, but after doing some
>> homework, I think you're right. Thanks for pointing this out!
>>
>> Thinking more about this; This function is called from sk_destruct,
>> and in the Tx case the sk_destruct can be called from interrupt
>> context, where set_page_dirty_lock cannot be called.
>>
>> Are there any preferred ways of solving this? Scheduling the whole
>> xsk_destruct call to a workqueue is one way (I think). Any
>> cleaner/better way?
>>
>> [...]
>
> Defer unpinning pages until the next tx call?
>

If the sock is released, there wont be another tx call. Or am I
missing something obvious?

>
>> >> +static int __xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr)
>> >> +{
>> >> +     u32 frame_size = mr->frame_size, frame_headroom = mr->frame_headroom;
>> >> +     u64 addr = mr->addr, size = mr->len;
>> >> +     unsigned int nframes;
>> >> +     int size_chk, err;
>> >> +
>> >> +     if (frame_size < XDP_UMEM_MIN_FRAME_SIZE || frame_size > PAGE_SIZE) {
>> >> +             /* Strictly speaking we could support this, if:
>> >> +              * - huge pages, or*
>> >
>> > what does "or*" here mean?
>> >
>>
>> Oops, I'll change to just 'or' in the next revision.
>>
>>
>> Thanks!
>> Björn

^ permalink raw reply

* Re: [PATCH net-next 0/2] net/ipv6: couple of fixes for rcu change to from
From: David Miller @ 2018-04-23 20:13 UTC (permalink / raw)
  To: dsahern; +Cc: netdev
In-Reply-To: <20180423183207.9124-1-dsahern@gmail.com>

From: David Ahern <dsahern@gmail.com>
Date: Mon, 23 Apr 2018 11:32:05 -0700

> So many details... I am thankful for all the robots running the
> permutations and tools.

This is why I am not afraid of the robots taking over. :-)

> Two bug fixes from the rcu change to rt->from:
> 1. missing rcu lock in ip6_negative_advice
> 2. rcu dereferences in 2 sites

Series applied, thanks David.

^ permalink raw reply

* Re: [RFC V3 PATCH 0/8] Packed ring for vhost
From: Konrad Rzeszutek Wilk @ 2018-04-23 20:11 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: kvm, netdev, linux-kernel, virtualization, wexu
In-Reply-To: <20180423224715-mutt-send-email-mst@kernel.org>

On Mon, Apr 23, 2018 at 10:59:43PM +0300, Michael S. Tsirkin wrote:
> On Mon, Apr 23, 2018 at 03:31:20PM -0400, Konrad Rzeszutek Wilk wrote:
> > On Mon, Apr 23, 2018 at 01:34:52PM +0800, Jason Wang wrote:
> > > Hi all:
> > > 
> > > This RFC implement packed ring layout. The code were tested with
> > > Tiwei's RFC V2 a thttps://lkml.org/lkml/2018/4/1/48. Some fixups and
> > > tweaks were needed on top of Tiwei's code to make it run. TCP stream
> > > and pktgen does not show obvious difference compared with split ring.
> > 
> > I have to ask then - what is the benefit of this?
> 
> You can use this with dpdk within guest.
> The DPDK version did see a performance improvement so hopefully with

Is there a link to this performance improvement document?

> future versions this will too.
> 
> > > 
> > > Changes from V2:
> > > - do not use & in checking desc_event_flags
> > > - off should be most significant bit
> > > - remove the workaround of mergeable buffer for dpdk prototype
> > > - id should be in the last descriptor in the chain
> > > - keep _F_WRITE for write descriptor when adding used
> > > - device flags updating should use ADDR_USED type
> > > - return error on unexpected unavail descriptor in a chain
> > > - return false in vhost_ve_avail_empty is descriptor is available
> > > - track last seen avail_wrap_counter
> > > - correctly examine available descriptor in get_indirect_packed()
> > > - vhost_idx_diff should return u16 instead of bool
> > > 
> > > Changes from V1:
> > > 
> > > - Refactor vhost used elem code to avoid open coding on used elem
> > > - Event suppression support (compile test only).
> > > - Indirect descriptor support (compile test only).
> > > - Zerocopy support.
> > > - vIOMMU support.
> > > - SCSI/VSOCK support (compile test only).
> > > - Fix several bugs
> > > 
> > > For simplicity, I don't implement batching or other optimizations.
> > > 
> > > Please review.
> > > 
> > > Jason Wang (8):
> > >   vhost: move get_rx_bufs to vhost.c
> > >   vhost: hide used ring layout from device
> > >   vhost: do not use vring_used_elem
> > >   vhost_net: do not explicitly manipulate vhost_used_elem
> > >   vhost: vhost_put_user() can accept metadata type
> > >   virtio: introduce packed ring defines
> > >   vhost: packed ring support
> > >   vhost: event suppression for packed ring
> > > 
> > >  drivers/vhost/net.c                | 136 ++----
> > >  drivers/vhost/scsi.c               |  62 +--
> > >  drivers/vhost/vhost.c              | 824 ++++++++++++++++++++++++++++++++++---
> > >  drivers/vhost/vhost.h              |  47 ++-
> > >  drivers/vhost/vsock.c              |  42 +-
> > >  include/uapi/linux/virtio_config.h |   9 +
> > >  include/uapi/linux/virtio_ring.h   |  32 ++
> > >  7 files changed, 926 insertions(+), 226 deletions(-)
> > > 
> > > -- 
> > > 2.7.4
> > > 

^ permalink raw reply

* Re: [PATCH bpf-next 02/15] xsk: add user memory registration support sockopt
From: Michael S. Tsirkin @ 2018-04-23 20:11 UTC (permalink / raw)
  To: Björn Töpel
  Cc: Karlsson, Magnus, Duyck, Alexander H, Alexander Duyck,
	John Fastabend, Alexei Starovoitov, Jesper Dangaard Brouer,
	Willem de Bruijn, Daniel Borkmann, Netdev, Björn Töpel,
	michael.lundkvist, Brandeburg, Jesse, Singhai, Anjali,
	Zhang, Qi Z
In-Reply-To: <CAJ+HfNjwMFig+aJbacFK--5_1i8F2DLSyAUOvU12Xc-OvJBAzQ@mail.gmail.com>

On Mon, Apr 23, 2018 at 10:00:15PM +0200, Björn Töpel wrote:
> 2018-04-23 18:18 GMT+02:00 Michael S. Tsirkin <mst@redhat.com>:
> 
> [...]
> 
> >> +static void xdp_umem_unpin_pages(struct xdp_umem *umem)
> >> +{
> >> +     unsigned int i;
> >> +
> >> +     if (umem->pgs) {
> >> +             for (i = 0; i < umem->npgs; i++)
> >
> > Since you pin them with FOLL_WRITE, I assume these pages
> > are written to.
> > Don't you need set_page_dirty_lock here?
> >
> 
> Hmm, I actually *removed* it from the RFC V2, but after doing some
> homework, I think you're right. Thanks for pointing this out!
> 
> Thinking more about this; This function is called from sk_destruct,
> and in the Tx case the sk_destruct can be called from interrupt
> context, where set_page_dirty_lock cannot be called.
> 
> Are there any preferred ways of solving this? Scheduling the whole
> xsk_destruct call to a workqueue is one way (I think). Any
> cleaner/better way?
> 
> [...]

Defer unpinning pages until the next tx call?


> >> +static int __xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr)
> >> +{
> >> +     u32 frame_size = mr->frame_size, frame_headroom = mr->frame_headroom;
> >> +     u64 addr = mr->addr, size = mr->len;
> >> +     unsigned int nframes;
> >> +     int size_chk, err;
> >> +
> >> +     if (frame_size < XDP_UMEM_MIN_FRAME_SIZE || frame_size > PAGE_SIZE) {
> >> +             /* Strictly speaking we could support this, if:
> >> +              * - huge pages, or*
> >
> > what does "or*" here mean?
> >
> 
> Oops, I'll change to just 'or' in the next revision.
> 
> 
> Thanks!
> Björn

^ permalink raw reply

* Re: [PATCH 0/3] bpf: Store/dump license string for loaded program
From: Alexei Starovoitov @ 2018-04-23 20:11 UTC (permalink / raw)
  To: Jiri Olsa
  Cc: Alexei Starovoitov, Daniel Borkmann, lkml, netdev, Quentin Monnet
In-Reply-To: <20180423065927.23127-1-jolsa@kernel.org>

On Mon, Apr 23, 2018 at 08:59:24AM +0200, Jiri Olsa wrote:
> hi,
> sending the change to store and dump the license
> info for loaded BPF programs. It's important for
> us get the license info, when investigating on
> screwed up machine.

hmm. boolean flag whether bpf prog is gpl or not
is already exposed via bpf_prog_info.
I see no point of wasting extra 128 bytes of kernel memory.

^ permalink raw reply

* Re: [nf-next] netfilter: extend SRH match to support matching previous, next and last SID
From: Florian Westphal @ 2018-04-23 20:08 UTC (permalink / raw)
  To: Ahmed Abdelsalam
  Cc: Pablo Neira Ayuso, fw, davem, dav.lebrun, linux-kernel,
	netfilter-devel, coreteam, netdev
In-Reply-To: <20180423220148.03800031d0cb8e8a7a83dc31@gmail.com>

Ahmed Abdelsalam <amsalam20@gmail.com> wrote:
> > > @@ -50,6 +62,12 @@ struct ip6t_srh {
> > >  	__u8                    segs_left;
> > >  	__u8                    last_entry;
> > >  	__u16                   tag;
> > > +	struct in6_addr		psid_addr;
> > > +	struct in6_addr		nsid_addr;
> > > +	struct in6_addr		lsid_addr;
> > > +	struct in6_addr		psid_msk;
> > > +	struct in6_addr		nsid_msk;
> > > +	struct in6_addr		lsid_msk;
> > 
> > This is changing something exposed through UAPI, so you will need a
> > new revision for this.
> 
> Could you please advice what should be done in this case? 

You need to add
struct ip6t_srh_v1 {
	/* copy of struct ip6t_srh here */

	/* new fields go here */
};


Look at xt_conntrack.c, conntrack_mt_reg[] for an example of
multi-revision match.

You can probably re-origanise code to avoid too much duplication.
See 5a786232eb69a1f870ddc0cfd69d5bdef241a2ea in nf.git for an example,
it makes v0 into a v1 struct at runtime and re-uses new v1 code
for old v0.

^ permalink raw reply

* Re: [bpf-next PATCH 3/3] bpf: add sample program to trace map events
From: Alexei Starovoitov @ 2018-04-23 20:08 UTC (permalink / raw)
  To: Sebastiano Miano
  Cc: Jesper Dangaard Brouer, netdev, ast, daniel, mingo, rostedt,
	fulvio.risso, David S. Miller
In-Reply-To: <062780b1-6a24-a7f3-175c-db3b02605850@polito.it>

On Mon, Apr 23, 2018 at 04:08:36PM +0200, Sebastiano Miano wrote:
> 
> That's in fact the real use case for the first two patches. Since bpf
> tracepoints are still a rather common (and easy to use) troubleshooting and
> monitoring tool why shouldn't we "enhance" their support with the newly
> added map/prog IDs?

because these tracepoints can be abused in the way that this patch demonstrated.
Whether to keep this patch in the series or not is irrelevant.

^ permalink raw reply

* Re: [PATCH v7 net-next 4/4] netvsc: refactor notifier/event handling code to use the failover framework
From: Michael S. Tsirkin @ 2018-04-23 20:06 UTC (permalink / raw)
  To: Siwei Liu
  Cc: Stephen Hemminger, Jiri Pirko, Sridhar Samudrala, David Miller,
	Netdev, virtualization, virtio-dev, Brandeburg, Jesse,
	Alexander Duyck, Jakub Kicinski, Jason Wang
In-Reply-To: <CADGSJ20ge75T+ddxtUBT4d9m1i3=HLOAHJEoS7Cg0bqnXrutwA@mail.gmail.com>

On Mon, Apr 23, 2018 at 12:44:39PM -0700, Siwei Liu wrote:
> On Mon, Apr 23, 2018 at 10:56 AM, Michael S. Tsirkin <mst@redhat.com> wrote:
> > On Mon, Apr 23, 2018 at 10:44:40AM -0700, Stephen Hemminger wrote:
> >> On Mon, 23 Apr 2018 20:24:56 +0300
> >> "Michael S. Tsirkin" <mst@redhat.com> wrote:
> >>
> >> > On Mon, Apr 23, 2018 at 10:04:06AM -0700, Stephen Hemminger wrote:
> >> > > > >
> >> > > > >I will NAK patches to change to common code for netvsc especially the
> >> > > > >three device model.  MS worked hard with distro vendors to support transparent
> >> > > > >mode, ans we really can't have a new model; or do backport.
> >> > > > >
> >> > > > >Plus, DPDK is now dependent on existing model.
> >> > > >
> >> > > > Sorry, but nobody here cares about dpdk or other similar oddities.
> >> > >
> >> > > The network device model is a userspace API, and DPDK is a userspace application.
> >> >
> >> > It is userspace but are you sure dpdk is actually poking at netdevs?
> >> > AFAIK it's normally banging device registers directly.
> >> >
> >> > > You can't go breaking userspace even if you don't like the application.
> >> >
> >> > Could you please explain how is the proposed patchset breaking
> >> > userspace? Ignoring DPDK for now, I don't think it changes the userspace
> >> > API at all.
> >> >
> >>
> >> The DPDK has a device driver vdev_netvsc which scans the Linux network devices
> >> to look for Linux netvsc device and the paired VF device and setup the
> >> DPDK environment.  This setup creates a DPDK failsafe (bondingish) instance
> >> and sets up TAP support over the Linux netvsc device as well as the Mellanox
> >> VF device.
> >>
> >> So it depends on existing 2 device model. You can't go to a 3 device model
> >> or start hiding devices from userspace.
> >
> > Okay so how does the existing patch break that? IIUC does not go to
> > a 3 device model since netvsc calls failover_register directly.
> >
> >> Also, I am working on associating netvsc and VF device based on serial number
> >> rather than MAC address. The serial number is how Windows works now, and it makes
> >> sense for Linux and Windows to use the same mechanism if possible.
> >
> > Maybe we should support same for virtio ...
> > Which serial do you mean? From vpd?
> >
> > I guess you will want to keep supporting MAC for old hypervisors?
> >
> > It all seems like a reasonable thing to support in the generic core.
> 
> That's the reason why I chose explicit identifier rather than rely on
> MAC address to bind/pair a device. MAC address can change. Even if it
> can't, malicious guest user can fake MAC address to skip binding.
> 
> -Siwei

Address should be sampled at device creation to prevent this
kind of hack. Not that it buys the malicious user much:
if you can poke at MAC addresses you probably already can
break networking.




> 
> >
> > --
> > MST

^ permalink raw reply

* Re: KASAN: null-ptr-deref Read in refcount_inc_not_zero
From: Cong Wang @ 2018-04-23 20:05 UTC (permalink / raw)
  To: syzbot
  Cc: David Miller, Denys Vlasenko, LKML,
	Linux Kernel Network Developers, syzkaller-bugs, xiaolou4617
In-Reply-To: <000000000000eeeeff056a79954c@google.com>

#syz fix: llc: fix NULL pointer deref for SOCK_ZAPPED

^ permalink raw reply

* Re: [PATCH bpf] bpf: disable and restore preemption in __BPF_PROG_RUN_ARRAY
From: Alexei Starovoitov @ 2018-04-23 20:05 UTC (permalink / raw)
  To: Roman Gushchin; +Cc: netdev, kernel-team, Alexei Starovoitov, Daniel Borkmann
In-Reply-To: <20180423170921.16162-1-guro@fb.com>

On Mon, Apr 23, 2018 at 06:09:21PM +0100, Roman Gushchin wrote:
> Running bpf programs requires disabled preemption,
> however at least some* of the BPF_PROG_RUN_ARRAY users
> do not follow this rule.
> 
> To fix this bug, and also to make it not happen in the future,
> let's add explicit preemption disabling/re-enabling
> to the __BPF_PROG_RUN_ARRAY code.
> 
> * for example:
>  [   17.624472] RIP: 0010:__cgroup_bpf_run_filter_sk+0x1c4/0x1d0
>  ...
>  [   17.640890]  inet6_create+0x3eb/0x520
>  [   17.641405]  __sock_create+0x242/0x340
>  [   17.641939]  __sys_socket+0x57/0xe0
>  [   17.642370]  ? trace_hardirqs_off_thunk+0x1a/0x1c
>  [   17.642944]  SyS_socket+0xa/0x10
>  [   17.643357]  do_syscall_64+0x79/0x220
>  [   17.643879]  entry_SYSCALL_64_after_hwframe+0x42/0xb7
> 
> Signed-off-by: Roman Gushchin <guro@fb.com>
> Cc: Alexei Starovoitov <ast@kernel.org>
> Cc: Daniel Borkmann <daniel@iogearbox.net>

Acked-by: Alexei Starovoitov <ast@kernel.org>

^ permalink raw reply

* Re: [nf-next] netfilter: extend SRH match to support matching previous, next and last SID
From: Ahmed Abdelsalam @ 2018-04-23 20:01 UTC (permalink / raw)
  To: Pablo Neira Ayuso
  Cc: fw, davem, dav.lebrun, linux-kernel, netfilter-devel, coreteam,
	netdev
In-Reply-To: <20180423173047.gsf2xjlmpichyvte@salvia>

On Mon, 23 Apr 2018 19:30:47 +0200
Pablo Neira Ayuso <pablo@netfilter.org> wrote:

> On Mon, Apr 23, 2018 at 05:48:22AM -0500, Ahmed Abdelsalam wrote:
> > Signed-off-by: Ahmed Abdelsalam <amsalam20@gmail.com>
> > ---
> >  include/uapi/linux/netfilter_ipv6/ip6t_srh.h | 22 +++++++++++++--
> >  net/ipv6/netfilter/ip6t_srh.c                | 41 +++++++++++++++++++++++++++-
> >  2 files changed, 60 insertions(+), 3 deletions(-)
> > 
> > diff --git a/include/uapi/linux/netfilter_ipv6/ip6t_srh.h b/include/uapi/linux/netfilter_ipv6/ip6t_srh.h
> > index f3cc0ef..9808382 100644
> > --- a/include/uapi/linux/netfilter_ipv6/ip6t_srh.h
> > +++ b/include/uapi/linux/netfilter_ipv6/ip6t_srh.h
> > @@ -17,7 +17,10 @@
> >  #define IP6T_SRH_LAST_GT        0x0100
> >  #define IP6T_SRH_LAST_LT        0x0200
> >  #define IP6T_SRH_TAG            0x0400
> > -#define IP6T_SRH_MASK           0x07FF
> > +#define IP6T_SRH_PSID           0x0800
> > +#define IP6T_SRH_NSID           0x1000
> > +#define IP6T_SRH_LSID           0x2000
> > +#define IP6T_SRH_MASK           0x3FFF
> >  
> >  /* Values for "mt_invflags" field in struct ip6t_srh */
> >  #define IP6T_SRH_INV_NEXTHDR    0x0001
> > @@ -31,7 +34,10 @@
> >  #define IP6T_SRH_INV_LAST_GT    0x0100
> >  #define IP6T_SRH_INV_LAST_LT    0x0200
> >  #define IP6T_SRH_INV_TAG        0x0400
> > -#define IP6T_SRH_INV_MASK       0x07FF
> > +#define IP6T_SRH_INV_PSID       0x0800
> > +#define IP6T_SRH_INV_NSID       0x1000
> > +#define IP6T_SRH_INV_LSID       0x2000
> > +#define IP6T_SRH_INV_MASK       0x3FFF
> >  
> >  /**
> >   *      struct ip6t_srh - SRH match options
> > @@ -40,6 +46,12 @@
> >   *      @ segs_left: Segments left field of SRH
> >   *      @ last_entry: Last entry field of SRH
> >   *      @ tag: Tag field of SRH
> > + *      @ psid_addr: Address of previous SID in SRH SID list
> > + *      @ nsid_addr: Address of NEXT SID in SRH SID list
> > + *      @ lsid_addr: Address of LAST SID in SRH SID list
> > + *      @ psid_msk: Mask of previous SID in SRH SID list
> > + *      @ nsid_msk: Mask of next SID in SRH SID list
> > + *      @ lsid_msk: MAsk of last SID in SRH SID list
> >   *      @ mt_flags: match options
> >   *      @ mt_invflags: Invert the sense of match options
> >   */
> > @@ -50,6 +62,12 @@ struct ip6t_srh {
> >  	__u8                    segs_left;
> >  	__u8                    last_entry;
> >  	__u16                   tag;
> > +	struct in6_addr		psid_addr;
> > +	struct in6_addr		nsid_addr;
> > +	struct in6_addr		lsid_addr;
> > +	struct in6_addr		psid_msk;
> > +	struct in6_addr		nsid_msk;
> > +	struct in6_addr		lsid_msk;
> 
> This is changing something exposed through UAPI, so you will need a
> new revision for this.

Could you please advice what should be done in this case? 

> 
> >  	__u16                   mt_flags;
> >  	__u16                   mt_invflags;
> >  };
> > diff --git a/net/ipv6/netfilter/ip6t_srh.c b/net/ipv6/netfilter/ip6t_srh.c
> > index 33719d5..2b5cc73 100644
> > --- a/net/ipv6/netfilter/ip6t_srh.c
> > +++ b/net/ipv6/netfilter/ip6t_srh.c
> > @@ -30,7 +30,9 @@ static bool srh_mt6(const struct sk_buff *skb, struct xt_action_param *par)
> >  	const struct ip6t_srh *srhinfo = par->matchinfo;
> >  	struct ipv6_sr_hdr *srh;
> >  	struct ipv6_sr_hdr _srh;
> > -	int hdrlen, srhoff = 0;
> > +	int hdrlen, psidoff, nsidoff, lsidoff, srhoff = 0;
> > +	struct in6_addr *psid, *nsid, *lsid;
> > +	struct in6_addr _psid, _nsid, _lsid;
> 
> Could you rearrange variable definitions? ie. longest line first, eg.
> 
> 	int hdrlen, psidoff, nsidoff, lsidoff, srhoff = 0;
>   	const struct ip6t_srh *srhinfo = par->matchinfo;
> 	struct in6_addr *psid, *nsid, *lsid;
>   	struct ipv6_sr_hdr *srh;
>   	struct ipv6_sr_hdr _srh;
> 

Ok I will re-arrange them in reverse christmas tree form. 

Ahmed 

-- 
Ahmed Abdelsalam <amsalam20@gmail.com>

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox