Netdev List

Netdev List
 help / color / mirror / Atom feed

* Re: [bpf-next PATCH] bpf: sockmap, fix skmsg recvmsg handler to track size correctly
From: Daniel Borkmann @ 2018-10-17  0:32 UTC (permalink / raw)
  To: John Fastabend, ast; +Cc: netdev
In-Reply-To: <20181016173601.9211.1573.stgit@john-Precision-Tower-5810>

On 10/16/2018 07:36 PM, John Fastabend wrote:
> When converting sockmap to new skmsg generic data structures we missed
> that the recvmsg handler did not correctly use sg.size and instead was
> using individual elements length. The result is if a sock is closed
> with outstanding data we omit the call to sk_mem_uncharge() and can
> get the warning below.
> 
> [   66.728282] WARNING: CPU: 6 PID: 5783 at net/core/stream.c:206 sk_stream_kill_queues+0x1fa/0x210
> 
> To fix this correct the redirect handler to xfer the size along with
> the scatterlist and also decrement the size from the recvmsg handler.
> Now when a sock is closed the remaining 'size' will be decremented
> with sk_mem_uncharge().
> 
> Signed-off-by: John Fastabend <john.fastabend@gmail.com>

Applied to bpf-next, thanks!

^ permalink raw reply

* Re: [bpf-next PATCH 0/3] sockmap support for msg_peek flag
From: Daniel Borkmann @ 2018-10-17  0:32 UTC (permalink / raw)
  To: Alexei Starovoitov, John Fastabend; +Cc: ast, netdev
In-Reply-To: <20181016184144.dryrzu7zbfzioz7q@ast-mbp.dhcp.thefacebook.com>

On 10/16/2018 08:41 PM, Alexei Starovoitov wrote:
> On Tue, Oct 16, 2018 at 11:07:54AM -0700, John Fastabend wrote:
>> This adds support for the MSG_PEEK flag when redirecting into an
>> ingress psock sk_msg queue.
>>
>> The first patch adds some base support to the helpers, then the
>> feature, and finally we add an option for the test suite to do
>> a duplicate MSG_PEEK call on every recv to test the feature.
>>
>> With duplicate MSG_PEEK call all tests continue to PASS.
> 
> for the set
> Acked-by: Alexei Starovoitov <ast@kernel.org>

Applied to bpf-next, thanks!

^ permalink raw reply

* Re: [PATCH bpf-next 05/13] bpf: get better bpf_prog ksyms based on btf func type_id
From: Yonghong Song @ 2018-10-17  0:32 UTC (permalink / raw)
  To: Martin Lau
  Cc: Alexei Starovoitov, daniel@iogearbox.net, netdev@vger.kernel.org,
	Kernel Team
In-Reply-To: <20181015231223.cwe3zblnh4wt7xds@kafai-mbp.dhcp.thefacebook.com>



On 10/15/18 4:12 PM, Martin Lau wrote:
> On Fri, Oct 12, 2018 at 11:54:42AM -0700, Yonghong Song wrote:
>> This patch added interface to load a program with the following
>> additional information:
>>     . prog_btf_fd
>>     . func_info and func_info_len
>> where func_info will provides function range and type_id
>> corresponding to each function.
>>
>> If verifier agrees with function range provided by the user,
>> the bpf_prog ksym for each function will use the func name
>> provided in the type_id, which is supposed to provide better
>> encoding as it is not limited by 16 bytes program name
>> limitation and this is better for bpf program which contains
>> multiple subprograms.
>>
>> The bpf_prog_info interface is also extended to
>> return btf_id and jited_func_types, so user spaces can
>> print out the function prototype for each jited function.
> Some nits.
> 
>>
>> Signed-off-by: Yonghong Song <yhs@fb.com>
>> ---
>>   include/linux/bpf.h          |  2 +
>>   include/linux/bpf_verifier.h |  1 +
>>   include/linux/btf.h          |  2 +
>>   include/uapi/linux/bpf.h     | 11 +++++
>>   kernel/bpf/btf.c             | 16 +++++++
>>   kernel/bpf/core.c            |  9 ++++
>>   kernel/bpf/syscall.c         | 86 +++++++++++++++++++++++++++++++++++-
>>   kernel/bpf/verifier.c        | 50 +++++++++++++++++++++
>>   8 files changed, 176 insertions(+), 1 deletion(-)
>>
>> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
>> index 9b558713447f..e9c63ffa01af 100644
>> --- a/include/linux/bpf.h
>> +++ b/include/linux/bpf.h
>> @@ -308,6 +308,8 @@ struct bpf_prog_aux {
>>   	void *security;
>>   #endif
>>   	struct bpf_prog_offload *offload;
>> +	struct btf *btf;
>> +	u32 type_id; /* type id for this prog/func */
>>   	union {
>>   		struct work_struct work;
>>   		struct rcu_head	rcu;
>> diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
>> index 9e8056ec20fa..e84782ec50ac 100644
>> --- a/include/linux/bpf_verifier.h
>> +++ b/include/linux/bpf_verifier.h
>> @@ -201,6 +201,7 @@ static inline bool bpf_verifier_log_needed(const struct bpf_verifier_log *log)
>>   struct bpf_subprog_info {
>>   	u32 start; /* insn idx of function entry point */
>>   	u16 stack_depth; /* max. stack depth used by this function */
>> +	u32 type_id; /* btf type_id for this subprog */
>>   };
>>   
>>   /* single container for all structs
>> diff --git a/include/linux/btf.h b/include/linux/btf.h
>> index e076c4697049..90e91b52aa90 100644
>> --- a/include/linux/btf.h
>> +++ b/include/linux/btf.h
>> @@ -46,5 +46,7 @@ void btf_type_seq_show(const struct btf *btf, u32 type_id, void *obj,
>>   		       struct seq_file *m);
>>   int btf_get_fd_by_id(u32 id);
>>   u32 btf_id(const struct btf *btf);
>> +bool is_btf_func_type(const struct btf *btf, u32 type_id);
>> +const char *btf_get_name_by_id(const struct btf *btf, u32 type_id);
>>   
>>   #endif
>> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
>> index f9187b41dff6..7ebbf4f06a65 100644
>> --- a/include/uapi/linux/bpf.h
>> +++ b/include/uapi/linux/bpf.h
>> @@ -332,6 +332,9 @@ union bpf_attr {
>>   		 * (context accesses, allowed helpers, etc).
>>   		 */
>>   		__u32		expected_attach_type;
>> +		__u32		prog_btf_fd;	/* fd pointing to BTF type data */
>> +		__u32		func_info_len;	/* func_info length */
>> +		__aligned_u64	func_info;	/* func type info */
>>   	};
>>   
>>   	struct { /* anonymous struct used by BPF_OBJ_* commands */
>> @@ -2585,6 +2588,9 @@ struct bpf_prog_info {
>>   	__u32 nr_jited_func_lens;
>>   	__aligned_u64 jited_ksyms;
>>   	__aligned_u64 jited_func_lens;
>> +	__u32 btf_id;
>> +	__u32 nr_jited_func_types;
>> +	__aligned_u64 jited_func_types;
>>   } __attribute__((aligned(8)));
>>   
>>   struct bpf_map_info {
>> @@ -2896,4 +2902,9 @@ struct bpf_flow_keys {
>>   	};
>>   };
>>   
>> +struct bpf_func_info {
>> +	__u32	insn_offset;
>> +	__u32	type_id;
>> +};
>> +
>>   #endif /* _UAPI__LINUX_BPF_H__ */
>> diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
>> index 794a185f11bf..85b8eeccddbd 100644
>> --- a/kernel/bpf/btf.c
>> +++ b/kernel/bpf/btf.c
>> @@ -486,6 +486,15 @@ static const struct btf_type *btf_type_by_id(const struct btf *btf, u32 type_id)
>>   	return btf->types[type_id];
>>   }
>>   
>> +bool is_btf_func_type(const struct btf *btf, u32 type_id)
>> +{
>> +	const struct btf_type *type = btf_type_by_id(btf, type_id);
>> +
>> +	if (!type || BTF_INFO_KIND(type->info) != BTF_KIND_FUNC)
>> +		return false;
>> +	return true;
>> +}
> Can btf_type_is_func() (from patch 2) be reused?
> The btf_type_by_id() can be done by the caller.
> I don't think it worths to add a similar helper
> for just one user for now.

Currently, btf_type_by_id() is not exposed.

bpf/btf.c:
static const struct btf_type *btf_type_by_id(const struct btf *btf, u32 
type_id)

Do you want to expose this function as global one?
We cannot put the whole definition in the header as it touches
btf internals.

> 
> The !type check can be added to btf_type_is_func() if
> it is needed.
> 
>> +
>>   /*
>>    * Regular int is not a bit field and it must be either
>>    * u8/u16/u32/u64.
>> @@ -2579,3 +2588,10 @@ u32 btf_id(const struct btf *btf)
>>   {
>>   	return btf->id;
>>   }
>> +
>> +const char *btf_get_name_by_id(const struct btf *btf, u32 type_id)
>> +{
>> +	const struct btf_type *t = btf_type_by_id(btf, type_id);
>> +
>> +	return btf_name_by_offset(btf, t->name_off);
>> +}
>> diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
>> index 3f5bf1af0826..add3866a120e 100644
>> --- a/kernel/bpf/core.c
>> +++ b/kernel/bpf/core.c
>> @@ -27,6 +27,7 @@
>>   #include <linux/random.h>
>>   #include <linux/moduleloader.h>
>>   #include <linux/bpf.h>
>> +#include <linux/btf.h>
>>   #include <linux/frame.h>
>>   #include <linux/rbtree_latch.h>
>>   #include <linux/kallsyms.h>
>> @@ -387,6 +388,7 @@ bpf_get_prog_addr_region(const struct bpf_prog *prog,
>>   static void bpf_get_prog_name(const struct bpf_prog *prog, char *sym)
>>   {
>>   	const char *end = sym + KSYM_NAME_LEN;
>> +	const char *func_name;
>>   
>>   	BUILD_BUG_ON(sizeof("bpf_prog_") +
>>   		     sizeof(prog->tag) * 2 +
>> @@ -401,6 +403,13 @@ static void bpf_get_prog_name(const struct bpf_prog *prog, char *sym)
>>   
>>   	sym += snprintf(sym, KSYM_NAME_LEN, "bpf_prog_");
>>   	sym  = bin2hex(sym, prog->tag, sizeof(prog->tag));
>> +
>> +	if (prog->aux->btf) {
>> +		func_name = btf_get_name_by_id(prog->aux->btf, prog->aux->type_id);
>> +		snprintf(sym, (size_t)(end - sym), "_%s", func_name);
>> +		return;
>> +	}
>> +
>>   	if (prog->aux->name[0])
>>   		snprintf(sym, (size_t)(end - sym), "_%s", prog->aux->name);
>>   	else
>> diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
>> index 4f416234251f..aa4688a1a137 100644
>> --- a/kernel/bpf/syscall.c
>> +++ b/kernel/bpf/syscall.c
>> @@ -1120,6 +1120,7 @@ static void __bpf_prog_put(struct bpf_prog *prog, bool do_idr_lock)
>>   		/* bpf_prog_free_id() must be called first */
>>   		bpf_prog_free_id(prog, do_idr_lock);
>>   		bpf_prog_kallsyms_del_all(prog);
>> +		btf_put(prog->aux->btf);
>>   
>>   		call_rcu(&prog->aux->rcu, __bpf_prog_put_rcu);
>>   	}
>> @@ -1343,8 +1344,45 @@ bpf_prog_load_check_attach_type(enum bpf_prog_type prog_type,
>>   	}
>>   }
>>   
>> +static int prog_check_btf(const struct bpf_prog *prog, const struct btf *btf,
>> +			  union bpf_attr *attr)
>> +{
>> +	struct bpf_func_info __user *uinfo, info;
>> +	int i, nfuncs, usize;
>> +	u32 prev_offset;
>> +
>> +	usize = sizeof(struct bpf_func_info);
>> +	if (attr->func_info_len % usize)
>> +		return -EINVAL;
>> +
>> +	/* func_info section should have increasing and valid insn_offset
>> +	 * and type should be BTF_KIND_FUNC.
>> +	 */
>> +	nfuncs = attr->func_info_len / usize;
>> +	uinfo = u64_to_user_ptr(attr->func_info);
>> +	for (i = 0; i < nfuncs; i++) {
>> +		if (copy_from_user(&info, uinfo, usize))
>> +			return -EFAULT;
>> +
>> +		if (!is_btf_func_type(btf, info.type_id))
>> +			return -EINVAL;
>> +
>> +		if (i == 0) {
>> +			if (info.insn_offset)
>> +				return -EINVAL;
>> +			prev_offset = 0;
>> +		} else if (info.insn_offset < prev_offset) {
>> +			return -EINVAL;
>> +		}
>> +
>> +		prev_offset = info.insn_offset;
>> +	}
>> +
>> +	return 0;
>> +}
>> +
>>   /* last field in 'union bpf_attr' used by this command */
>> -#define	BPF_PROG_LOAD_LAST_FIELD expected_attach_type
>> +#define	BPF_PROG_LOAD_LAST_FIELD func_info
>>   
>>   static int bpf_prog_load(union bpf_attr *attr)
>>   {
>> @@ -1431,6 +1469,27 @@ static int bpf_prog_load(union bpf_attr *attr)
>>   	if (err)
>>   		goto free_prog;
>>   
>> +	/* copy func_info before verifier which may make
>> +	 * some adjustment.
>> +	 */
> Is it a left over comment?  I don't see the intention of
> copying func_info to avoid verifier modification from below.
> I could be missing something.
> 
> or should the comments be moved to the new "check_btf_func()" below?
> 
>> +	if (attr->func_info_len) {
>> +		struct btf *btf;
>> +
>> +		btf = btf_get_by_fd(attr->prog_btf_fd);
>> +		if (IS_ERR(btf)) {
>> +			err = PTR_ERR(btf);
>> +			goto free_prog;
>> +		}
>> +
>> +		err = prog_check_btf(prog, btf, attr);
>> +		if (err) {
>> +			btf_put(btf);
>> +			goto free_prog;
>> +		}
>> +
>> +		prog->aux->btf = btf;
>> +	}
>> +
>>   	/* run eBPF verifier */
>>   	err = bpf_check(&prog, attr);
>>   	if (err < 0)
>> @@ -1463,6 +1522,7 @@ static int bpf_prog_load(union bpf_attr *attr)
>>   	bpf_prog_kallsyms_del_subprogs(prog);
>>   	free_used_maps(prog->aux);
>>   free_prog:
>> +	btf_put(prog->aux->btf);
>>   	bpf_prog_uncharge_memlock(prog);
>>   free_prog_sec:
>>   	security_bpf_prog_free(prog->aux);
>> @@ -2108,6 +2168,30 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog,
>>   		}
>>   	}
>>   
>> +	if (prog->aux->btf) {
>> +		info.btf_id = btf_id(prog->aux->btf);
>> +
>> +		ulen = info.nr_jited_func_types;
>> +		info.nr_jited_func_types = prog->aux->func_cnt;
>> +		if (info.nr_jited_func_types && ulen) {
>> +			if (bpf_dump_raw_ok()) {
>> +				u32 __user *user_types;
>> +				u32 func_type, i;
>> +
>> +				ulen = min_t(u32, info.nr_jited_func_types,
>> +					     ulen);
>> +				user_types = u64_to_user_ptr(info.jited_func_types);
>> +				for (i = 0; i < ulen; i++) {
>> +					func_type = prog->aux->func[i]->aux->type_id;
>> +					if (put_user(func_type, &user_types[i]))
>> +						return -EFAULT;
>> +				}
>> +			} else {
>> +				info.jited_func_types = 0;
>> +			}
>> +		}
>> +	}
>> +
>>   done:
>>   	if (copy_to_user(uinfo, &info, info_len) ||
>>   	    put_user(info_len, &uattr->info.info_len))
>> diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
>> index 3f93a548a642..97c408e84322 100644
>> --- a/kernel/bpf/verifier.c
>> +++ b/kernel/bpf/verifier.c
>> @@ -4589,6 +4589,50 @@ static int check_cfg(struct bpf_verifier_env *env)
>>   	return ret;
>>   }
>>   
>> +static int check_btf_func(struct bpf_prog *prog, struct bpf_verifier_env *env,
>> +			  union bpf_attr *attr)
>> +{
>> +	struct bpf_func_info *data;
>> +	int i, nfuncs, ret = 0;
>> +
>> +	if (!attr->func_info_len)
>> +		return 0;
>> +
>> +	nfuncs = attr->func_info_len / sizeof(struct bpf_func_info);
>> +	if (env->subprog_cnt != nfuncs) {
>> +		verbose(env, "number of funcs in func_info does not match verifier\n");
>> +		return -EINVAL;
>> +	}
>> +
>> +	data = kvmalloc(attr->func_info_len, GFP_KERNEL | __GFP_NOWARN);
>> +	if (!data) {
>> +		verbose(env, "no memory to allocate attr func_info\n");
>> +		return -ENOMEM;
>> +	}
>> +
>> +	if (copy_from_user(data, u64_to_user_ptr(attr->func_info),
>> +			   attr->func_info_len)) {
>> +		verbose(env, "memory copy error for attr func_info\n");
>> +		ret = -EFAULT;
>> +		goto cleanup;
>> +		}
> Extra indentation.
> 
>> +
>> +	for (i = 0; i < nfuncs; i++) {
>> +		if (env->subprog_info[i].start != data[i].insn_offset) {
>> +			verbose(env, "func_info subprog start (%d) does not match verifier (%d)\n",
>> +				env->subprog_info[i].start, data[i].insn_offset);
> The printing args are swapped? env->subprog_info[i].start should
> go to the "verifier (%d)"?
> 
> and s/%d/%u/
> 
>> +			ret = -EINVAL;
>> +			goto cleanup;
>> +		}
>> +		env->subprog_info[i].type_id = data[i].type_id;
>> +	}
>> +
>> +	prog->aux->type_id = data[0].type_id;
>> +cleanup:
>> +	kvfree(data);
>> +	return ret;
>> +}
>> +
>>   /* check %cur's range satisfies %old's */
>>   static bool range_within(struct bpf_reg_state *old,
>>   			 struct bpf_reg_state *cur)
>> @@ -5873,6 +5917,8 @@ static int jit_subprogs(struct bpf_verifier_env *env)
>>   		func[i]->aux->name[0] = 'F';
>>   		func[i]->aux->stack_depth = env->subprog_info[i].stack_depth;
>>   		func[i]->jit_requested = 1;
>> +		func[i]->aux->btf = prog->aux->btf;
>> +		func[i]->aux->type_id = env->subprog_info[i].type_id;
>>   		func[i] = bpf_int_jit_compile(func[i]);
>>   		if (!func[i]->jited) {
>>   			err = -ENOTSUPP;
>> @@ -6307,6 +6353,10 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr)
>>   	if (ret < 0)
>>   		goto skip_full_check;
>>   
>> +	ret = check_btf_func(env->prog, env, attr);
>> +	if (ret < 0)
>> +		goto skip_full_check;
>> +
>>   	ret = do_check(env);
>>   	if (env->cur_state) {
>>   		free_verifier_state(env->cur_state, true);
>> -- 
>> 2.17.1
>>

^ permalink raw reply

* Re: [PATCH net] r8169: fix NAPI handling under high load
From: Eric Dumazet @ 2018-10-17  0:23 UTC (permalink / raw)
  To: Florian Fainelli, Stephen Hemminger, Holger Hoffstätte
  Cc: Heiner Kallweit, David Miller, Realtek linux nic maintainers,
	netdev@vger.kernel.org
In-Reply-To: <92db72c9-1f8c-d6a5-bcf4-241fa4c5a310@gmail.com>



On 10/16/2018 04:08 PM, Florian Fainelli wrote:

> I had started doing that about a month ago in light of the ixbge
> ndo_poll_controller vs. napi problem, but have not had time to submit
> that series yet:
> 
> https://github.com/ffainelli/linux/commits/napi-check
> 
> feel free to piggy back on top of that series if you would like to
> address this.


But the root cause was different, you remember ?

We fixed the real issue with netpoll ability to stick all NIC IRQ onto one
victim CPU.

^ permalink raw reply

* Re: [PATCH net] r8169: fix NAPI handling under high load
From: Eric Dumazet @ 2018-10-17  0:21 UTC (permalink / raw)
  To: Stephen Hemminger, Holger Hoffstätte
  Cc: Heiner Kallweit, David Miller, Realtek linux nic maintainers,
	netdev@vger.kernel.org
In-Reply-To: <20181016160355.1cc0a2e9@xeon-e3>



On 10/16/2018 04:03 PM, Stephen Hemminger wrote:

> Many drivers have buggy usage of napi_complete_done.
> 
> Might even be worth forcing all network drivers to check the return
> value. But fixing 150 broken drivers will be a nuisance.
> 
> diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
> index dc1d9ed33b31..c38bc66ffe74 100644
> --- a/include/linux/netdevice.h
> +++ b/include/linux/netdevice.h
> @@ -466,7 +466,8 @@ static inline bool napi_reschedule(struct napi_struct *napi)
>         return false;
>  }
>  
> -bool napi_complete_done(struct napi_struct *n, int work_done);
> +bool __must_check napi_complete_done(struct napi_struct *n, int work_done);
> +
>  /**
>   *     napi_complete - NAPI processing complete
>   *     @n: NAPI context
> 

I disagree completely.
This never has been a requirement.

^ permalink raw reply

* Re: [PATCH net] r8169: fix NAPI handling under high load
From: Eric Dumazet @ 2018-10-17  0:19 UTC (permalink / raw)
  To: Stephen Hemminger, Heiner Kallweit
  Cc: David Miller, Realtek linux nic maintainers,
	netdev@vger.kernel.org
In-Reply-To: <20181016151708.3fff9bd9@xeon-e3>



On 10/16/2018 03:17 PM, Stephen Hemminger wrote:
> On Tue, 16 Oct 2018 22:37:31 +0200
> Heiner Kallweit <hkallweit1@gmail.com> wrote:
> 
>> rtl_rx() and rtl_tx() are called only if the respective bits are set
>> in the interrupt status register. Under high load NAPI may not be
>> able to process all data (work_done == budget) and it will schedule
>> subsequent calls to the poll callback.
>> rtl_ack_events() however resets the bits in the interrupt status
>> register, therefore subsequent calls to rtl8169_poll() won't call
>> rtl_rx() and rtl_tx() - chip interrupts are still disabled.
>>
>> Fix this by calling rtl_rx() and rtl_tx() independent of the bits
>> set in the interrupt status register. Both functions will detect
>> if there's nothing to do for them.
>>
>> This issue has been there more or less forever (at least it exists in
>> 3.16 already), so I can't provide a "Fixes" tag. 
>>
>> Signed-off-by: Heiner Kallweit <hkallweit1@gmail.com>
> 
> Another issue is this:
> 
> 	if (work_done < budget) {
> 		napi_complete_done(napi, work_done);
> 
> 		rtl_irq_enable(tp, enable_mask);
> 		mmiowb();
> 	}
> 
> 	return work_done;
> }
> 
> 
> The code needs to check return value of napi_complete_done.
> 
> 	if (work_done < budget &&
> 	    napi_complete_done(napi, work_done) {
> 		rtl_irq_enable(tp, enable_mask);
> 		mmiowb();
> 	}
> 
> 	return work_done;
> }
> 
> Try that, it might fix the problem and your logic would
> be unnecessary 
> 

Well, I do not believe this is related.

Testing napi_complete_done() is not mandatory, it is only an optimization [1]
and only for busy polling users.

In short, by default, this should not matter, since busy-polling is not enabled by default.


[1] busy polling users are spinning anyway, so it is not even clear if this
   is really an optimization, unless maybe the cost of irq enabling is really really high...

^ permalink raw reply

* [PATCH net-next 2/2] tcp_bbr: centralize code to set gains
From: Neal Cardwell @ 2018-10-17  0:16 UTC (permalink / raw)
  To: David Miller
  Cc: netdev, Neal Cardwell, Yuchung Cheng, Soheil Hassas Yeganeh,
	Priyaranjan Jha, Eric Dumazet
In-Reply-To: <20181017001645.261770-1-ncardwell@google.com>

Centralize the code that sets gains used for computing cwnd and pacing
rate. This simplifies the code and makes it easier to change the state
machine or (in the future) dynamically change the gain values and
ensure that the correct gain values are always used.

Signed-off-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: Soheil Hassas Yeganeh <soheil@google.com>
Signed-off-by: Priyaranjan Jha <priyarjha@google.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
---
 net/ipv4/tcp_bbr.c | 40 ++++++++++++++++++++++++++++++----------
 1 file changed, 30 insertions(+), 10 deletions(-)

diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c
index 4cc2223d2cd54..9277abdd822a0 100644
--- a/net/ipv4/tcp_bbr.c
+++ b/net/ipv4/tcp_bbr.c
@@ -521,8 +521,6 @@ static void bbr_advance_cycle_phase(struct sock *sk)
 
 	bbr->cycle_idx = (bbr->cycle_idx + 1) & (CYCLE_LEN - 1);
 	bbr->cycle_mstamp = tp->delivered_mstamp;
-	bbr->pacing_gain = bbr->lt_use_bw ? BBR_UNIT :
-					    bbr_pacing_gain[bbr->cycle_idx];
 }
 
 /* Gain cycling: cycle pacing gain to converge to fair share of available bw. */
@@ -540,8 +538,6 @@ static void bbr_reset_startup_mode(struct sock *sk)
 	struct bbr *bbr = inet_csk_ca(sk);
 
 	bbr->mode = BBR_STARTUP;
-	bbr->pacing_gain = bbr_high_gain;
-	bbr->cwnd_gain	 = bbr_high_gain;
 }
 
 static void bbr_reset_probe_bw_mode(struct sock *sk)
@@ -549,8 +545,6 @@ static void bbr_reset_probe_bw_mode(struct sock *sk)
 	struct bbr *bbr = inet_csk_ca(sk);
 
 	bbr->mode = BBR_PROBE_BW;
-	bbr->pacing_gain = BBR_UNIT;
-	bbr->cwnd_gain = bbr_cwnd_gain;
 	bbr->cycle_idx = CYCLE_LEN - 1 - prandom_u32_max(bbr_cycle_rand);
 	bbr_advance_cycle_phase(sk);	/* flip to next phase of gain cycle */
 }
@@ -768,8 +762,6 @@ static void bbr_check_drain(struct sock *sk, const struct rate_sample *rs)
 
 	if (bbr->mode == BBR_STARTUP && bbr_full_bw_reached(sk)) {
 		bbr->mode = BBR_DRAIN;	/* drain queue we created */
-		bbr->pacing_gain = bbr_drain_gain;	/* pace slow to drain */
-		bbr->cwnd_gain = bbr_high_gain;	/* maintain cwnd */
 		tcp_sk(sk)->snd_ssthresh =
 				bbr_target_cwnd(sk, bbr_max_bw(sk), BBR_UNIT);
 	}	/* fall through to check if in-flight is already small: */
@@ -831,8 +823,6 @@ static void bbr_update_min_rtt(struct sock *sk, const struct rate_sample *rs)
 	if (bbr_probe_rtt_mode_ms > 0 && filter_expired &&
 	    !bbr->idle_restart && bbr->mode != BBR_PROBE_RTT) {
 		bbr->mode = BBR_PROBE_RTT;  /* dip, drain queue */
-		bbr->pacing_gain = BBR_UNIT;
-		bbr->cwnd_gain = BBR_UNIT;
 		bbr_save_cwnd(sk);  /* note cwnd so we can restore it */
 		bbr->probe_rtt_done_stamp = 0;
 	}
@@ -860,6 +850,35 @@ static void bbr_update_min_rtt(struct sock *sk, const struct rate_sample *rs)
 		bbr->idle_restart = 0;
 }
 
+static void bbr_update_gains(struct sock *sk)
+{
+	struct bbr *bbr = inet_csk_ca(sk);
+
+	switch (bbr->mode) {
+	case BBR_STARTUP:
+		bbr->pacing_gain = bbr_high_gain;
+		bbr->cwnd_gain	 = bbr_high_gain;
+		break;
+	case BBR_DRAIN:
+		bbr->pacing_gain = bbr_drain_gain;	/* slow, to drain */
+		bbr->cwnd_gain	 = bbr_high_gain;	/* keep cwnd */
+		break;
+	case BBR_PROBE_BW:
+		bbr->pacing_gain = (bbr->lt_use_bw ?
+				    BBR_UNIT :
+				    bbr_pacing_gain[bbr->cycle_idx]);
+		bbr->cwnd_gain	 = bbr_cwnd_gain;
+		break;
+	case BBR_PROBE_RTT:
+		bbr->pacing_gain = BBR_UNIT;
+		bbr->cwnd_gain	 = BBR_UNIT;
+		break;
+	default:
+		WARN_ONCE(1, "BBR bad mode: %u\n", bbr->mode);
+		break;
+	}
+}
+
 static void bbr_update_model(struct sock *sk, const struct rate_sample *rs)
 {
 	bbr_update_bw(sk, rs);
@@ -867,6 +886,7 @@ static void bbr_update_model(struct sock *sk, const struct rate_sample *rs)
 	bbr_check_full_bw_reached(sk, rs);
 	bbr_check_drain(sk, rs);
 	bbr_update_min_rtt(sk, rs);
+	bbr_update_gains(sk);
 }
 
 static void bbr_main(struct sock *sk, const struct rate_sample *rs)
-- 
2.19.1.331.ge82ca0e54c-goog

^ permalink raw reply related

* [PATCH net-next 1/2] tcp_bbr: adjust TCP BBR for departure time pacing
From: Neal Cardwell @ 2018-10-17  0:16 UTC (permalink / raw)
  To: David Miller; +Cc: netdev, Neal Cardwell, Yuchung Cheng, Eric Dumazet
In-Reply-To: <20181017001645.261770-1-ncardwell@google.com>

Adjust TCP BBR for the new departure time pacing model in the recent
commit ab408b6dc7449 ("tcp: switch tcp and sch_fq to new earliest
departure time model").

With TSQ and pacing at lower layers, there are often several skbs
queued in the pacing layer, and thus there is less data "in the
network" than "in flight".

With departure time pacing at lower layers (e.g. fq or potential
future NICs), the data in the pacing layer now has a pre-scheduled
("baked-in") departure time that cannot be changed, even if the
congestion control algorithm decides to use a new pacing rate.

This means that there can be a non-trivial lag between when BBR makes
a pacing rate change and when the inter-skb pacing delays
change. After a pacing rate change, the number of packets in the
network can gradually evolve to be higher or lower, depending on
whether the sending rate is higher or lower than the delivery
rate. Thus ignoring this lag can cause significant overshoot, with the
flow ending up with too many or too few packets in the network.

This commit changes BBR to adapt its pacing rate based on the amount
of data in the network that it estimates has already been "baked in"
by previous departure time decisions. We estimate the number of our
packets that will be in the network at the earliest departure time
(EDT) for the next skb scheduled as:

   in_network_at_edt = inflight_at_edt - (EDT - now) * bw

If we're increasing the amount of data in the network ("in_network"),
then we want to know if the transmit of the EDT skb will push
in_network above the target, so our answer includes
bbr_tso_segs_goal() from the skb departing at EDT. If we're decreasing
in_network, then we want to know if in_network will sink too low just
before the EDT transmit, so our answer does not include the segments
from the skb departing at EDT.

Why do we treat pacing_gain > 1.0 case and pacing_gain < 1.0 case
differently? The in_network curve is a step function: in_network goes
up on transmits, and down on ACKs. To accurately predict when
in_network will go beyond our target value, this will happen on
different events, depending on whether we're concerned about
in_network potentially going too high or too low:

 o if pushing in_network up (pacing_gain > 1.0),
   then in_network goes above target upon a transmit event

 o if pushing in_network down (pacing_gain < 1.0),
   then in_network goes below target upon an ACK event

This commit changes the BBR state machine to use this estimated
"packets in network" value to make its decisions.

Signed-off-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
---
 net/ipv4/tcp_bbr.c | 37 +++++++++++++++++++++++++++++++++++--
 1 file changed, 35 insertions(+), 2 deletions(-)

diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c
index b88081285fd17..4cc2223d2cd54 100644
--- a/net/ipv4/tcp_bbr.c
+++ b/net/ipv4/tcp_bbr.c
@@ -369,6 +369,39 @@ static u32 bbr_target_cwnd(struct sock *sk, u32 bw, int gain)
 	return cwnd;
 }

+/* With pacing at lower layers, there's often less data "in the network" than
+ * "in flight". With TSQ and departure time pacing at lower layers (e.g. fq),
+ * we often have several skbs queued in the pacing layer with a pre-scheduled
+ * earliest departure time (EDT). BBR adapts its pacing rate based on the
+ * inflight level that it estimates has already been "baked in" by previous
+ * departure time decisions. We calculate a rough estimate of the number of our
+ * packets that might be in the network at the earliest departure time for the
+ * next skb scheduled:
+ *   in_network_at_edt = inflight_at_edt - (EDT - now) * bw
+ * If we're increasing inflight, then we want to know if the transmit of the
+ * EDT skb will push inflight above the target, so inflight_at_edt includes
+ * bbr_tso_segs_goal() from the skb departing at EDT. If decreasing inflight,
+ * then estimate if inflight will sink too low just before the EDT transmit.
+ */
+static u32 bbr_packets_in_net_at_edt(struct sock *sk, u32 inflight_now)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct bbr *bbr = inet_csk_ca(sk);
+	u64 now_ns, edt_ns, interval_us;
+	u32 interval_delivered, inflight_at_edt;
+
+	now_ns = tp->tcp_clock_cache;
+	edt_ns = max(tp->tcp_wstamp_ns, now_ns);
+	interval_us = div_u64(edt_ns - now_ns, NSEC_PER_USEC);
+	interval_delivered = (u64)bbr_bw(sk) * interval_us >> BW_SCALE;
+	inflight_at_edt = inflight_now;
+	if (bbr->pacing_gain > BBR_UNIT)              /* increasing inflight */
+		inflight_at_edt += bbr_tso_segs_goal(sk);  /* include EDT skb */
+	if (interval_delivered >= inflight_at_edt)
+		return 0;
+	return inflight_at_edt - interval_delivered;
+}
+
 /* An optimization in BBR to reduce losses: On the first round of recovery, we
  * follow the packet conservation principle: send P packets per P packets acked.
  * After that, we slow-start and send at most 2*P packets per P packets acked.
@@ -460,7 +493,7 @@ static bool bbr_is_next_cycle_phase(struct sock *sk,
 	if (bbr->pacing_gain == BBR_UNIT)
 		return is_full_length;		/* just use wall clock time */

-	inflight = rs->prior_in_flight;  /* what was in-flight before ACK? */
+	inflight = bbr_packets_in_net_at_edt(sk, rs->prior_in_flight);
 	bw = bbr_max_bw(sk);

 	/* A pacing_gain > 1.0 probes for bw by trying to raise inflight to at
@@ -741,7 +774,7 @@ static void bbr_check_drain(struct sock *sk, const struct rate_sample *rs)
 				bbr_target_cwnd(sk, bbr_max_bw(sk), BBR_UNIT);
 	}	/* fall through to check if in-flight is already small: */
 	if (bbr->mode == BBR_DRAIN &&
-	    tcp_packets_in_flight(tcp_sk(sk)) <=
+	    bbr_packets_in_net_at_edt(sk, tcp_packets_in_flight(tcp_sk(sk))) <=
 	    bbr_target_cwnd(sk, bbr_max_bw(sk), BBR_UNIT))
 		bbr_reset_probe_bw_mode(sk);  /* we estimate queue is drained */
 }
-- 
2.19.1.331.ge82ca0e54c-goog

^ permalink raw reply related

* [PATCH net-next 0/2] tcp_bbr: TCP BBR changes for EDT pacing model
From: Neal Cardwell @ 2018-10-17  0:16 UTC (permalink / raw)
  To: David Miller; +Cc: netdev, Neal Cardwell

Two small patches for TCP BBR to follow up with Eric's recent work to change
the TCP and fq pacing machinery to an "earliest departure time" (EDT) model:

- The first patch adjusts the TCP BBR logic to work with the new
  "earliest departure time" (EDT) pacing model.

- The second patch adjusts the TCP BBR logic to centralize the setting
  of gain values, to simplify the code and prepare for future changes.

Neal Cardwell (2):
  tcp_bbr: adjust TCP BBR for departure time pacing
  tcp_bbr: centralize code to set gains

 net/ipv4/tcp_bbr.c | 77 ++++++++++++++++++++++++++++++++++++++--------
 1 file changed, 65 insertions(+), 12 deletions(-)

-- 
2.19.1.331.ge82ca0e54c-goog

^ permalink raw reply

* [PATCH v2] virtio_net: avoid using netif_tx_disable() for serializing tx routine
From: Ake Koomsin @ 2018-10-17  7:59 UTC (permalink / raw)
  To: Jason Wang
  Cc: ake, Michael S. Tsirkin, David S. Miller, virtualization, netdev,
	linux-kernel
In-Reply-To: <f04bee7a-3892-102d-2be0-f4fda4048e74@redhat.com>

Commit 713a98d90c5e ("virtio-net: serialize tx routine during reset")
introduces netif_tx_disable() after netif_device_detach() in order to
avoid use-after-free of tx queues. However, there are two issues.

1) Its operation is redundant with netif_device_detach() if the interface
   is running.
2) In case of the interface is not running before suspending and
   resuming, the tx does not get resumed by netif_device_attach().
   This results in losing network connectivity.

It is better to use netif_tx_lock()/netif_tx_unlock() instead for
serializing tx routine during reset. This also preserves the symmetry
of netif_device_detach() and netif_device_attach().

Fixes commit 713a98d90c5e ("virtio-net: serialize tx routine during reset")
Signed-off-by: Ake Koomsin <ake@igel.co.jp>
---
 drivers/net/virtio_net.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 3f5aa59c37b7..41ccf9c994a4 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -2267,8 +2267,9 @@ static void virtnet_freeze_down(struct virtio_device *vdev)
 	/* Make sure no work handler is accessing the device */
 	flush_work(&vi->config_work);

+	netif_tx_lock(vi->dev);
 	netif_device_detach(vi->dev);
-	netif_tx_disable(vi->dev);
+	netif_tx_unlock(vi->dev);
 	cancel_delayed_work_sync(&vi->refill);

 	if (netif_running(vi->dev)) {
@@ -2304,7 +2305,9 @@ static int virtnet_restore_up(struct virtio_device *vdev)
 		}
 	}

+	netif_tx_lock(vi->dev);
 	netif_device_attach(vi->dev);
+	netif_tx_unlock(vi->dev);
 	return err;
 }

-- 
2.19.1

^ permalink raw reply related

* Re: [PATCH] iwlwifi: fix spelling mistake "registrating" -> "registering"
From: Luciano Coelho @ 2018-10-17  7:41 UTC (permalink / raw)
  To: Colin King, Johannes Berg, Emmanuel Grumbach,
	Intel Linux Wireless, Kalle Valo, David S . Miller,
	linux-wireless, netdev
  Cc: kernel-janitors, linux-kernel
In-Reply-To: <20181011095757.18475-1-colin.king@canonical.com>

On Thu, 2018-10-11 at 10:57 +0100, Colin King wrote:
> From: Colin Ian King <colin.king@canonical.com>
> 
> Trivial fix to spelling mistake in IWL_ERR error message
> 
> Signed-off-by: Colin Ian King <colin.king@canonical.com>
> ---

Thanks Colin.  I've applied this in our internal tree and it will
eventually reach the mainline, following our normal upstreaming
process.

--
Cheers,
Luca.

^ permalink raw reply

* Re: [PATCH] iwlwifi: mvm: fix spelling mistake "Recieved" -> "Received"
From: Luciano Coelho @ 2018-10-17  7:38 UTC (permalink / raw)
  To: Colin King, Johannes Berg, Emmanuel Grumbach,
	Intel Linux Wireless, Kalle Valo, David S . Miller,
	linux-wireless, netdev
  Cc: kernel-janitors, linux-kernel
In-Reply-To: <20181016172251.1227-1-colin.king@canonical.com>

On Tue, 2018-10-16 at 18:22 +0100, Colin King wrote:
> From: Colin Ian King <colin.king@canonical.com>
> 
> Trivial fix to spelling mistake in IWL_DEBUG_TE message text.
> 
> Signed-off-by: Colin Ian King <colin.king@canonical.com>
> ---

Hi Colin,

You already sent this patch before and it has already been applied in
our internal tree and will reach upstream at some point.

Please don't keep re-sending identical patches once I replied that it
has been applied.

--
Cheers,
Luca.

^ permalink raw reply

* Re: [PATCH bpf-next 2/3] bpf: emit RECORD_MMAP events for bpf prog load/unload
From: David Ahern @ 2018-10-16 23:43 UTC (permalink / raw)
  To: Song Liu, Alexei Starovoitov
  Cc: acme, Peter Zijlstra, Alexei Starovoitov, David S . Miller,
	Daniel Borkmann, Networking, kernel-team
In-Reply-To: <CAPhsuW7BO3gaaTPEyn2XdFTb+y5JBxU0oocJvMak=4jGHTJTqw@mail.gmail.com>

On 10/15/18 4:33 PM, Song Liu wrote:
> I am working with Alexei on the idea of fetching BPF program information via
> BPF_OBJ_GET_INFO_BY_FD cmd. I added PERF_RECORD_BPF_EVENT
> to perf_event_type, and dumped these events to perf event ring buffer.
> 
> I found that perf will not process event until the end of perf-record:
> 
> root@virt-test:~# ~/perf record -ag -- sleep 10
> ...... 10 seconds later
> [ perf record: Woken up 34 times to write data ]
> machine__process_bpf_event: prog_id 6 loaded
> machine__process_bpf_event: prog_id 6 unloaded
> [ perf record: Captured and wrote 9.337 MB perf.data (93178 samples) ]
> 
> In this example, the bpf program was loaded and then unloaded in
> another terminal. When machine__process_bpf_event() processes
> the load event, the bpf program is already unloaded. Therefore,
> machine__process_bpf_event() will not be able to get information
> about the program via BPF_OBJ_GET_INFO_BY_FD cmd.
> 
> To solve this problem, we will need to run BPF_OBJ_GET_INFO_BY_FD
> as soon as perf get the event from kernel. I looked around the perf
> code for a while. But I haven't found a good example where some
> events are processed before the end of perf-record. Could you
> please help me with this?

perf record does not process events as they are generated. Its sole job
is pushing data from the maps to a file as fast as possible meaning in
bulk based on current read and write locations.

Adding code to process events will add significant overhead to the
record command and will not really solve your race problem.

^ permalink raw reply

* Re: [PATCH] iwlwifi: mvm: remove set but not used variable 'he_phy_data'
From: Luciano Coelho @ 2018-10-17  7:16 UTC (permalink / raw)
  To: YueHaibing, Johannes Berg, Emmanuel Grumbach,
	Intel Linux Wireless, Kalle Valo, Golan Ben Ami, Sara Sharon,
	Shaul Triebitz, Liad Kaufman
  Cc: linux-wireless-u79uwXL29TY76Z2rM5mHXA,
	kernel-janitors-u79uwXL29TY76Z2rM5mHXA,
	netdev-u79uwXL29TY76Z2rM5mHXA
In-Reply-To: <1538736421-97401-1-git-send-email-yuehaibing-hv44wF8Li93QT0dZR+AlfA@public.gmane.org>

On Fri, 2018-10-05 at 10:47 +0000, YueHaibing wrote:
> Fixes gcc '-Wunused-but-set-variable' warning:
> 
> drivers/net/wireless/intel/iwlwifi/mvm/rxmq.c: In function
> 'iwl_mvm_rx_mpdu_mq':
> drivers/net/wireless/intel/iwlwifi/mvm/rxmq.c:1386:7: warning:
>  variable 'he_phy_data' set but not used [-Wunused-but-set-variable]
>    u64 he_phy_data;
>    
> 'he_phy_data' never used since be introduce in
> commit 18ead597daa1 ("iwlwifi: support new rx_mpdu_desc api")
> 
> Signed-off-by: YueHaibing <yuehaibing-hv44wF8Li93QT0dZR+AlfA@public.gmane.org>
> ---

Thanks, Yue! This was a merge damage when I applied the HE patches for
iwlwifi.

I've queued this and will send it out as part of the series for 4.21. 
Then it will trickle down as needed.

--
Cheers,
Luca.

^ permalink raw reply

* Re: [PATCH bpf-next v2 3/7] bpf: add MAP_LOOKUP_AND_DELETE_ELEM syscall
From: Alexei Starovoitov @ 2018-10-16 23:20 UTC (permalink / raw)
  To: Mauricio Vasquez
  Cc: Song Liu, Alexei Starovoitov, Daniel Borkmann, Networking
In-Reply-To: <08a63a3e-50a4-5f34-ac86-a7793407858f@polito.it>

On Tue, Oct 16, 2018 at 04:16:39PM -0500, Mauricio Vasquez wrote:
> 
> 
> On 10/11/2018 06:51 PM, Alexei Starovoitov wrote:
> > On Wed, Oct 10, 2018 at 05:50:01PM -0500, Mauricio Vasquez wrote:
> > > > > Does it make sense to you?
> > > > I reread the other patch, and found it does NOT use the following logic for
> > > > queue and stack:
> > > > 
> > > >                  rcu_read_lock();
> > > >                  ptr = map->ops->map_lookup_and_delete_elem(map, key);
> > > >                  if (ptr)
> > > >                          memcpy(value, ptr, value_size);
> > > > 
> > > > I guess this part is not used at all? Can we just remove it?
> > > > 
> > > > Thanks,
> > > > Song
> > > This is the base code for map_lookup_and_delete support, it is not used in
> > > queue/stack maps.
> > > 
> > > I think we can leave it there, so when somebody implements lookup_and_delete
> > > for other maps doesn't have to care about implementing also this.
> > The code looks useful to me, but I also agree with Song. And in the kernel
> > we don't typically add 'almost dead code'.
> > May be provide an implementation of the lookup_and_delete for hash map
> > so it's actually used ?
> 
> I haven't written any code but I think there is a potential problem here.
> Current lookup_and_delete returns a pointer to the element, hence deletion
> of the element should be done using call_rcu to guarantee this is valid
> after returning.
> In the hashtab, the deletion only uses call_rcu when there is not prealloc,
> otherwise the element is pushed on the list of free elements immediately.
> If we move the logic to push elements into the free list under a call_rcu
> invocation, it could happen that the free list is empty because the call_rcu
> is still pending (a similar issue we had with the queue/stack maps when they
> used a pass by reference API).
> 
> There is another way to implement it without this issue, in syscall.c:
> l = ops->lookup(key);
> memcpy(l, some_buffer)
> ops->delete(key)
> 
> The point here is that the lookup_and_delete operation is not being used at
> all, so, is lookup + delete = lookup_and_delete?, can it be generalized?
> If this is true, then what is the sense of having lookup_and_delete
> syscall?,

I though of lookup_and_delete command as atomic operation.
Only in such case it would make sense.
Otherwise there is no point in having additional cmd.
In case of hash map the implementation would be similar to delete:
  raw_spin_lock_irqsave(&b->lock, flags);
  l = lookup_elem_raw(head, hash, key, key_size);
  if (l) {
          hlist_nulls_del_rcu(&l->hash_node);
          bpf_long_memcpy(); // into temp kernel area
          free_htab_elem(htab, l);
          ret = 0;
  }
  raw_spin_unlock_irqrestore(&b->lock, flags);
  copy_to_user();

there is a chance that some other cpu is doing lookup in parallel
and may be modifying value, so bpf_long_mempcy() isn't fully atomic.
But bpf side is written together with user space side,
so above almost-atomic lookup_and_delete is usable instead
of lookup and then delete which races too much.

Having said that I think it's fine to defer this new ndo for now
and leave lookup_and_delete syscall cmd for stack/queue map only.

^ permalink raw reply

* Re: [PATCH net] r8169: fix NAPI handling under high load
From: Florian Fainelli @ 2018-10-16 23:08 UTC (permalink / raw)
  To: Stephen Hemminger, Holger Hoffstätte
  Cc: Heiner Kallweit, David Miller, Realtek linux nic maintainers,
	netdev@vger.kernel.org
In-Reply-To: <20181016160355.1cc0a2e9@xeon-e3>

On 10/16/2018 04:03 PM, Stephen Hemminger wrote:
> On Tue, 16 Oct 2018 23:17:31 +0200
> Holger Hoffstätte <holger@applied-asynchrony.com> wrote:
> 
>> On 10/16/18 22:37, Heiner Kallweit wrote:
>>> rtl_rx() and rtl_tx() are called only if the respective bits are set
>>> in the interrupt status register. Under high load NAPI may not be
>>> able to process all data (work_done == budget) and it will schedule
>>> subsequent calls to the poll callback.
>>> rtl_ack_events() however resets the bits in the interrupt status
>>> register, therefore subsequent calls to rtl8169_poll() won't call
>>> rtl_rx() and rtl_tx() - chip interrupts are still disabled.  
>>
>> Very interesting! Could this be the reason for the mysterious
>> hangs & resets we experienced when enabling BQL for r8169?
>> They happened more often with TSO/GSO enabled and several people
>> attempted to fix those hangs unsuccessfully; it was later reverted
>> and has been since then (#87cda7cb43).
>> If this bug has been there "forever" it might be tempting to
>> re-apply BQL and see what happens. Any chance you could give that
>> a try? I'll gladly test patches, just like I'll run this one.
>>
>> cheers
>> Holger
> 
> Many drivers have buggy usage of napi_complete_done.
> 
> Might even be worth forcing all network drivers to check the return
> value. But fixing 150 broken drivers will be a nuisance.

I had started doing that about a month ago in light of the ixbge
ndo_poll_controller vs. napi problem, but have not had time to submit
that series yet:

https://github.com/ffainelli/linux/commits/napi-check

feel free to piggy back on top of that series if you would like to
address this.

> 
> diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
> index dc1d9ed33b31..c38bc66ffe74 100644
> --- a/include/linux/netdevice.h
> +++ b/include/linux/netdevice.h
> @@ -466,7 +466,8 @@ static inline bool napi_reschedule(struct napi_struct *napi)
>         return false;
>  }
>  
> -bool napi_complete_done(struct napi_struct *n, int work_done);
> +bool __must_check napi_complete_done(struct napi_struct *n, int work_done);
> +
>  /**
>   *     napi_complete - NAPI processing complete
>   *     @n: NAPI context
> 


-- 
Florian

^ permalink raw reply

* Re: [PATCH net] r8169: fix NAPI handling under high load
From: Stephen Hemminger @ 2018-10-16 23:03 UTC (permalink / raw)
  To: Holger Hoffstätte
  Cc: Heiner Kallweit, David Miller, Realtek linux nic maintainers,
	netdev@vger.kernel.org
In-Reply-To: <ada39488-8e9c-d6a8-461e-672642497db2@applied-asynchrony.com>

On Tue, 16 Oct 2018 23:17:31 +0200
Holger Hoffstätte <holger@applied-asynchrony.com> wrote:

> On 10/16/18 22:37, Heiner Kallweit wrote:
> > rtl_rx() and rtl_tx() are called only if the respective bits are set
> > in the interrupt status register. Under high load NAPI may not be
> > able to process all data (work_done == budget) and it will schedule
> > subsequent calls to the poll callback.
> > rtl_ack_events() however resets the bits in the interrupt status
> > register, therefore subsequent calls to rtl8169_poll() won't call
> > rtl_rx() and rtl_tx() - chip interrupts are still disabled.  
> 
> Very interesting! Could this be the reason for the mysterious
> hangs & resets we experienced when enabling BQL for r8169?
> They happened more often with TSO/GSO enabled and several people
> attempted to fix those hangs unsuccessfully; it was later reverted
> and has been since then (#87cda7cb43).
> If this bug has been there "forever" it might be tempting to
> re-apply BQL and see what happens. Any chance you could give that
> a try? I'll gladly test patches, just like I'll run this one.
> 
> cheers
> Holger

Many drivers have buggy usage of napi_complete_done.

Might even be worth forcing all network drivers to check the return
value. But fixing 150 broken drivers will be a nuisance.

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index dc1d9ed33b31..c38bc66ffe74 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -466,7 +466,8 @@ static inline bool napi_reschedule(struct napi_struct *napi)
        return false;
 }
 
-bool napi_complete_done(struct napi_struct *n, int work_done);
+bool __must_check napi_complete_done(struct napi_struct *n, int work_done);
+
 /**
  *     napi_complete - NAPI processing complete
  *     @n: NAPI context

^ permalink raw reply related

* Re: [PATCH net-next V2 6/8] vhost: packed ring support
From: Jason Wang @ 2018-10-17  6:54 UTC (permalink / raw)
  To: Maxime Coquelin, Michael S. Tsirkin, Tiwei Bie
  Cc: kvm, virtualization, netdev, linux-kernel, wexu, jfreimann
In-Reply-To: <1df62bd3-3cc9-d04a-2939-4570d37faa68@redhat.com>


On 2018/10/16 下午9:58, Maxime Coquelin wrote:
>
> On 10/15/2018 04:22 AM, Jason Wang wrote:
>>
>>
>> On 2018年10月13日 01:23, Michael S. Tsirkin wrote:
>>> On Fri, Oct 12, 2018 at 10:32:44PM +0800, Tiwei Bie wrote:
>>>> On Mon, Jul 16, 2018 at 11:28:09AM +0800, Jason Wang wrote:
>>>> [...]
>>>>> @@ -1367,10 +1397,48 @@ long vhost_vring_ioctl(struct vhost_dev 
>>>>> *d, unsigned int ioctl, void __user *arg
>>>>>           vq->last_avail_idx = s.num;
>>>>>           /* Forget the cached index value. */
>>>>>           vq->avail_idx = vq->last_avail_idx;
>>>>> +        if (vhost_has_feature(vq, VIRTIO_F_RING_PACKED)) {
>>>>> +            vq->last_avail_wrap_counter = wrap_counter;
>>>>> +            vq->avail_wrap_counter = vq->last_avail_wrap_counter;
>>>>> +        }
>>>>>           break;
>>>>>       case VHOST_GET_VRING_BASE:
>>>>>           s.index = idx;
>>>>>           s.num = vq->last_avail_idx;
>>>>> +        if (vhost_has_feature(vq, VIRTIO_F_RING_PACKED))
>>>>> +            s.num |= vq->last_avail_wrap_counter << 31;
>>>>> +        if (copy_to_user(argp, &s, sizeof(s)))
>>>>> +            r = -EFAULT;
>>>>> +        break;
>>>>> +    case VHOST_SET_VRING_USED_BASE:
>>>>> +        /* Moving base with an active backend?
>>>>> +         * You don't want to do that.
>>>>> +         */
>>>>> +        if (vq->private_data) {
>>>>> +            r = -EBUSY;
>>>>> +            break;
>>>>> +        }
>>>>> +        if (copy_from_user(&s, argp, sizeof(s))) {
>>>>> +            r = -EFAULT;
>>>>> +            break;
>>>>> +        }
>>>>> +        if (vhost_has_feature(vq, VIRTIO_F_RING_PACKED)) {
>>>>> +            wrap_counter = s.num >> 31;
>>>>> +            s.num &= ~(1 << 31);
>>>>> +        }
>>>>> +        if (s.num > 0xffff) {
>>>>> +            r = -EINVAL;
>>>>> +            break;
>>>>> +        }
>>>> Do we want to put wrap_counter at bit 15?
>>> I think I second that - seems to be consistent with
>>> e.g. event suppression structure and the proposed
>>> extension to driver notifications.
>>
>> Ok, I assumes packed virtqueue support 64K but looks not. I can 
>> change it to bit 15 and GET_VRING_BASE need to be changed as well.
>>
>>>
>>>
>>>> If put wrap_counter at bit 31, the check (s.num > 0xffff)
>>>> won't be able to catch the illegal index 0x8000~0xffff for
>>>> packed ring.
>>>>
>>
>> Do we need to clarify this in the spec?
>>
>>>>> +        vq->last_used_idx = s.num;
>>>>> +        if (vhost_has_feature(vq, VIRTIO_F_RING_PACKED))
>>>>> +            vq->last_used_wrap_counter = wrap_counter;
>>>>> +        break;
>>>>> +    case VHOST_GET_VRING_USED_BASE:
>>>> Do we need the new VHOST_GET_VRING_USED_BASE and
>>>> VHOST_SET_VRING_USED_BASE ops?
>>>>
>>>> We are going to merge below series in DPDK:
>>>>
>>>> http://patches.dpdk.org/patch/45874/
>>>>
>>>> We may need to reach an agreement first.
>>
>> If we agree that 64K virtqueue won't be supported, I'm ok with either.
>
> I'm fine to put wrap_counter at bit 15.
> I will post a new version of the DPDK series soon.
>
>> Btw the code assumes used_wrap_counter is equal to avail_wrap_counter 
>> which looks wrong?
>
> For split ring, we used to set the last_used_idx to the same value as
> last_avail_idx as VHOST_USER_GET_VRING_BASE cannot be called while the
> ring is being processed, so their value is always the same at the time
> the request is handled.


I may miss something, but it looks to me we should sync last_used_idx 
from used_idx.

Thanks


>
>
> I kept the same behavior for packed ring, and so the wrap counter have
> to be the same.
>
> Regards,
> Maxime
>
>> Thanks
>>
>>>>
>>>>> +        s.index = idx;
>>>>> +        s.num = vq->last_used_idx;
>>>>> +        if (vhost_has_feature(vq, VIRTIO_F_RING_PACKED))
>>>>> +            s.num |= vq->last_used_wrap_counter << 31;
>>>>>           if (copy_to_user(argp, &s, sizeof s))
>>>>>               r = -EFAULT;
>>>>>           break;
>>>> [...]
>>

^ permalink raw reply

* Re: netconsole warning in 4.19.0-rc7
From: Meelis Roos @ 2018-10-17  6:22 UTC (permalink / raw)
  To: Dave Jones, Cong Wang, LKML, Linux Kernel Network Developers
In-Reply-To: <20181017035038.czaqown24rnjn2pw@codemonkey.org.uk>

> I took another look at that error path. Turns out this is all we need I
> think..

With this patch applied on top of 4.19-rc8, I stll get the warning:

[   13.722919] WARNING: CPU: 0 PID: 0 at kernel/softirq.c:168 __local_bh_enable_ip+0x2e/0x44
[   13.722922] Modules linked in:
[   13.722924] CPU: 0 PID: 0 Comm: swapper Not tainted 4.19.0-rc8-dirty #212
[   13.722926] Hardware name: MicroLink                       /D850MV                         , BIOS MV85010A.86A.0067.P24.0304081124 04/08/2003
[   13.722927] EIP: __local_bh_enable_ip+0x2e/0x44
[   13.722929] Code: cc 02 5f de a9 00 00 0f 00 75 1f 83 ea 01 f7 da 01 15 cc 02 5f de a1 cc 02 5f de a9 00 ff 1f 00 74 0c ff 0d cc 02 5f de 5d c3 <0f> 0b eb dd 66 a1 80 cd 5e de 66 85 c0 74 e9 e8 87 ff ff ff eb e2
[   13.722931] EAX: 80010200 EBX: f602b000 ECX: 36346b30 EDX: 00000200
[   13.722932] ESI: f620ed20 EDI: f620ebac EBP: f600de40 ESP: f600de40
[   13.722933] DS: 007b ES: 007b FS: 0000 GS: 00e0 SS: 0068 EFLAGS: 00010006
[   13.722934] CR0: 80050033 CR2: bfc87ffc CR3: 3633c000 CR4: 000006d0
[   13.722935] Call Trace:
[   13.722936]  <IRQ>
[   13.722937]  netpoll_send_skb_on_dev+0xa5/0x1bf
[   13.722938]  netpoll_send_udp+0x2ab/0x462
[   13.722939]  write_msg+0xc9/0xd4
[   13.722940]  ? netconsole_netdev_event+0xd0/0xd0
[   13.722941]  console_unlock+0x283/0x376
[   13.722942]  vprintk_emit+0xfa/0x13a
[   13.722943]  vprintk_default+0x32/0x34
[   13.722944]  vprintk_func+0x26/0x82
[   13.722946]  ? __wake_up+0x1a/0x20
[   13.722947]  printk+0xe/0x10
[   13.722948]  crng_fast_load+0x138/0x142
[   13.722949]  add_interrupt_randomness+0x1cd/0x1ec
[   13.722950]  ? handle_fasteoi_irq+0x127/0x127
[   13.722951]  handle_irq_event_percpu+0x33/0x6d
[   13.722952]  ? irqd_cfg+0xc/0x11
[   13.722953]  handle_irq_event+0x1d/0x29
[   13.722954]  handle_edge_irq+0x61/0x161
[   13.722955]  handle_irq+0x83/0xa0
[   13.722956]  </IRQ>
[   13.722956]  do_IRQ+0x3b/0xd7
[   13.722957]  common_interrupt+0xd3/0xd8
[   13.722958] EIP: default_idle+0x5/0x7
[   13.722961] Code: ff ff ff fb eb e7 b8 00 00 01 00 0f c1 03 31 c0 eb ac e8 ba cc bc ff 90 90 90 90 90 90 90 90 90 90 90 90 90 90 55 89 e5 fb f4 <5d> c3 55 89 e5 a1 d4 02 5f de 80 48 02 20 8b 10 83 e2 08 75 2c 8b
[   13.722962] EAX: de4795c0 EBX: 00000000 ECX: 31ef550d EDX: 00000000
[   13.722963] ESI: 00000000 EDI: f73fd684 EBP: de5d5f20 ESP: de5d5f20
[   13.722964] DS: 007b ES: 007b FS: 0000 GS: 00e0 SS: 0068 EFLAGS: 00000246
[   13.722965]  ? __cpuidle_text_start+0x8/0x8
[   13.722966]  arch_cpu_idle+0xd/0xf
[   13.722967]  default_idle_call+0x1a/0x29
[   13.722968]  do_idle+0x131/0x18e
[   13.722969]  cpu_startup_entry+0x52/0x54
[   13.722970]  rest_init+0x6e/0x70
[   13.722971]  start_kernel+0x365/0x37e
[   13.722972]  i386_start_kernel+0x92/0x96
[   13.722973]  startup_32_smp+0x164/0x168
[   13.722974] ---[ end trace 74d36b4b49749ac1 ]---

-- 
Meelis Roos <mroos@linux.ee>

^ permalink raw reply

* Re: [bpf-next PATCH] bpf: sockmap, fix skmsg recvmsg handler to track size correctly
From: Alexei Starovoitov @ 2018-10-16 22:27 UTC (permalink / raw)
  To: John Fastabend; +Cc: ast, daniel, netdev
In-Reply-To: <20181016173601.9211.1573.stgit@john-Precision-Tower-5810>

On Tue, Oct 16, 2018 at 10:36:01AM -0700, John Fastabend wrote:
> When converting sockmap to new skmsg generic data structures we missed
> that the recvmsg handler did not correctly use sg.size and instead was
> using individual elements length. The result is if a sock is closed
> with outstanding data we omit the call to sk_mem_uncharge() and can
> get the warning below.
> 
> [   66.728282] WARNING: CPU: 6 PID: 5783 at net/core/stream.c:206 sk_stream_kill_queues+0x1fa/0x210
> 
> To fix this correct the redirect handler to xfer the size along with
> the scatterlist and also decrement the size from the recvmsg handler.
> Now when a sock is closed the remaining 'size' will be decremented
> with sk_mem_uncharge().
> 
> Signed-off-by: John Fastabend <john.fastabend@gmail.com>

Acked-by: Alexei Starovoitov <ast@kernel.org>

^ permalink raw reply

* Re: [PATCH bpf-next 0/2] nfp: bpf: improve offload checks
From: Alexei Starovoitov @ 2018-10-16 22:26 UTC (permalink / raw)
  To: Jakub Kicinski; +Cc: daniel, netdev, oss-drivers
In-Reply-To: <20181016221910.9947-1-jakub.kicinski@netronome.com>

On Tue, Oct 16, 2018 at 03:19:08PM -0700, Jakub Kicinski wrote:
> Hi,
> 
> this set adds check to make sure offload behaviour is correct.
> First when atomic counters are used, we must make sure the map
> does not already contain data we did not prepare for holding
> atomics.
> 
> Second patch double checks vNIC capabilities for program offload
> in case program is shared by multiple vNICs with different
> constraints.

1st patch is quite a hack, but until we have proper BTF annotations
I don't see any other way to solve it.

Applied, Thanks

^ permalink raw reply

* Re: [PATCH] virtio_net: enable tx after resuming from suspend
From: Jason Wang @ 2018-10-17  6:18 UTC (permalink / raw)
  To: ake
  Cc: Michael S. Tsirkin, David S. Miller, virtualization, netdev,
	linux-kernel
In-Reply-To: <0e8b0747-cc9f-8863-2ee1-2b032d6c0fae@igel.co.jp>


On 2018/10/16 下午6:15, ake wrote:
>
> On 2018年10月16日 17:53, Jason Wang wrote:
>> On 2018/10/15 下午6:08, ake wrote:
>>> On 2018年10月12日 18:18, ake wrote:
>>>> On 2018年10月12日 17:23, Jason Wang wrote:
>>>>> On 2018年10月12日 12:30, ake wrote:
>>>>>> On 2018年10月11日 22:06, Jason Wang wrote:
>>>>>>> On 2018年10月11日 18:22, ake wrote:
>>>>>>>> On 2018年10月11日 18:44, Jason Wang wrote:
>>>>>>>>> On 2018年10月11日 15:51, Ake Koomsin wrote:
>>>>>>>>>> commit 713a98d90c5e ("virtio-net: serialize tx routine during
>>>>>>>>>> reset")
>>>>>>>>>> disabled the virtio tx before going to suspend to avoid a use
>>>>>>>>>> after
>>>>>>>>>> free.
>>>>>>>>>> However, after resuming, it causes the virtio_net device to
>>>>>>>>>> lose its
>>>>>>>>>> network connectivity.
>>>>>>>>>>
>>>>>>>>>> To solve the issue, we need to enable tx after resuming.
>>>>>>>>>>
>>>>>>>>>> Fixes commit 713a98d90c5e ("virtio-net: serialize tx routine
>>>>>>>>>> during
>>>>>>>>>> reset")
>>>>>>>>>> Signed-off-by: Ake Koomsin <ake@igel.co.jp>
>>>>>>>>>> ---
>>>>>>>>>>       drivers/net/virtio_net.c | 1 +
>>>>>>>>>>       1 file changed, 1 insertion(+)
>>>>>>>>>>
>>>>>>>>>> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
>>>>>>>>>> index dab504ec5e50..3453d80f5f81 100644
>>>>>>>>>> --- a/drivers/net/virtio_net.c
>>>>>>>>>> +++ b/drivers/net/virtio_net.c
>>>>>>>>>> @@ -2256,6 +2256,7 @@ static int virtnet_restore_up(struct
>>>>>>>>>> virtio_device *vdev)
>>>>>>>>>>           }
>>>>>>>>>>             netif_device_attach(vi->dev);
>>>>>>>>>> +    netif_start_queue(vi->dev);
>>>>>>>>> I believe this is duplicated with netif_tx_wake_all_queues() in
>>>>>>>>> netif_device_attach() above?
>>>>>>>> Thank you for your review.
>>>>>>>>
>>>>>>>> If both netif_tx_wake_all_queues() and netif_start_queue() result in
>>>>>>>> clearing __QUEUE_STATE_DRV_XOFF, then is it possible that some
>>>>>>>> conditions in netif_device_attach() is not satisfied?
>>>>>>> Yes, maybe. One case I can see now is when the device is down, in
>>>>>>> this
>>>>>>> case netif_device_attach() won't try to wakeup the queue.
>>>>>>>
>>>>>>>>      Without
>>>>>>>> netif_start_queue(), the virtio_net device does not resume properly
>>>>>>>> after waking up.
>>>>>>> How do you trigger the issue? Just do suspend/resume?
>>>>>> Yes, simply suspend and resume.
>>>>>>
>>>>>> Here is how I trigger the issue:
>>>>>>
>>>>>> 1) Start the Virtual Machine Manager GUI program.
>>>>>> 2) Create a guest Linux OS. Make sure that the guest OS kernel is
>>>>>>       >= 4.12. Make sure that it uses virtio_net as its network device.
>>>>>>       In addition, make sure that the video adapter is VGA. Otherwise,
>>>>>>       waking up with the virtual power button does not work.
>>>>>> 3) After installing the guest OS, log in, and test the network
>>>>>>       connectivity by ping the host machine.
>>>>>> 4) Suspend. After this, the screen is blank.
>>>>>> 5) Resume by hitting the virtual power button. The login screen
>>>>>>       appears again.
>>>>>> 6) Log in again. The guest loses its network connection.
>>>>>>
>>>>>> In my test:
>>>>>> Guest: Ubuntu 16.04/18.04 with kernel 4.15.0-36-generic
>>>>>> Host: Ubuntu 16.04 with kernel 4.15.0-36-generic/4.4.0-137-generic
>>>>> I can not reproduce this issue if virtio-net interface is up in guest
>>>>> before the suspend. I'm using net-next.git and qemu master. But I do
>>>>> reproduce when virtio-net interface is down in guest before suspend,
>>>>> after resume, even if I make it up, the network is still lost.
>>>>>
>>>>> I think the interface is up in your case, but please confirm this.
>>>> If you mean the interface state before I hit the suspend button,
>>>> the answer is yes. The interface is up before I suspend the guest
>>>> machine.
>>>>
>>>> Note that my current QEMU version is QEMU emulator version 2.5.0
>>>> (Debian 1:2.5+dfsg-5ubuntu10.32).
>>>>
>>>> I will try with net-next.git and qemu master later and see if I can
>>>> reproduce the issue.
>>> Update. I tried with net-next and qemu master. Interestingly, the result
>>> is different from yours. The network is lost even if the virtio_net
>>> interface is up before suspending.
>>>
>>> Host: Ubuntu 16.04 with net-next kernel (default configuration)
>>> Guest: Ubuntu 18.04 with net-next kernel (default configuration)
>>> Qemu: master
>>> Qemu command:
>>> qemu-system-x86_64 -cpu host -m 2048 -enable-kvm \
>>> -bios /usr/share/OVMF/OVMF_CODE.fd \
>>> -drive file=/var/lib/libvirt/images/virtio_test.qcow2,if=virtio \
>>> -netdev user,id=hostnet0 \
>>> -device virtio-net-pci,netdev=hostnet0 \
>>> -device VGA,id=video0,vgamem_mb=16 \
>>> -global PIIX4_PM.disable_s3=1 \
>>> -global PIIX4_PM.disable_s4=1 -monitor stdio
>>
>> Interesting, just notice you're using userspace network. To isolate the
>> issue, can you retry with e.g tap or e1000 to make sure it's not a fault
>> of slirp or virito-net?
> I will try.
>
>> Thanks
>>
> There is another thing that I want to discuss. I notice that
> netif_device_detach() should result in setting __QUEUE_STATE_DRV_XOFF if
> the network interface is running. By calling netif_tx_disable() after
> netif_device_detach(), isn't it redundant in case of the network
> interface is running? If the goal is to serialize tx routine, would
> netif_tx_lock() and net_tx_unlock() are more appropriate? Like this:
>
> netif_tx_lock(vi->dev);
> netif_device_detach(vi->dev);
> netif_tx_unlock(vi->dev);
>
> Currently, netif_tx_disable() seems to disturb the symmetry of
> netif_device_detach() and netif_device_attach(). That is the reason
> why you can reproduce the problem when the interface is down before
> suspending.


Yes I agree.

Thanks

^ permalink raw reply

* Re: [PATCH bpf-next] libbpf: Per-symbol visibility for DSO
From: Alexei Starovoitov @ 2018-10-16 22:19 UTC (permalink / raw)
  To: Andrey Ignatov; +Cc: netdev, ast, daniel, kernel-team
In-Reply-To: <20181016055034.1515270-1-rdna@fb.com>

On Mon, Oct 15, 2018 at 10:50:34PM -0700, Andrey Ignatov wrote:
> Make global symbols in libbpf DSO hidden by default with
> -fvisibility=hidden and export symbols that are part of ABI explicitly
> with __attribute__((visibility("default"))).
> 
> This is common practice that should prevent from accidentally exporting
> a symbol, that is not supposed to be a part of ABI what, in turn,
> improves both libbpf developer- and user-experiences. See [1] for more
> details.
> 
> Export control becomes more important since more and more projects use
> libbpf.
> 
> The patch doesn't export a bunch of netlink related functions since as
> agreed in [2] they'll be reworked. That doesn't break bpftool since
> bpftool links libbpf statically.
> 
> [1] https://www.akkadia.org/drepper/dsohowto.pdf (2.2 Export Control)
> [2] https://www.mail-archive.com/netdev@vger.kernel.org/msg251434.html
> 
> Signed-off-by: Andrey Ignatov <rdna@fb.com>

Applied, Thanks

^ permalink raw reply

* [PATCH bpf-next 2/2] nfp: bpf: double check vNIC capabilities after object sharing
From: Jakub Kicinski @ 2018-10-16 22:19 UTC (permalink / raw)
  To: alexei.starovoitov, daniel; +Cc: netdev, oss-drivers, Jakub Kicinski
In-Reply-To: <20181016221910.9947-1-jakub.kicinski@netronome.com>

Program translation stage checks that program can be offloaded to
the netdev which was passed during the load (bpf_attr->prog_ifindex).
After program sharing was introduced, however, the netdev on which
program is loaded can theoretically be different, and therefore
we should recheck the program size and max stack size at load time.

This was found by code inspection, AFAIK today all vNICs have
identical caps.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Quentin Monnet <quentin.monnet@netronome.com>
---
 drivers/net/ethernet/netronome/nfp/bpf/main.h     |  3 +++
 drivers/net/ethernet/netronome/nfp/bpf/offload.c  | 14 +++++++++++++-
 drivers/net/ethernet/netronome/nfp/bpf/verifier.c | 11 ++++++-----
 3 files changed, 22 insertions(+), 6 deletions(-)

diff --git a/drivers/net/ethernet/netronome/nfp/bpf/main.h b/drivers/net/ethernet/netronome/nfp/bpf/main.h
index 12e98a0a58e5..7f591d71ab28 100644
--- a/drivers/net/ethernet/netronome/nfp/bpf/main.h
+++ b/drivers/net/ethernet/netronome/nfp/bpf/main.h
@@ -441,6 +441,7 @@ struct nfp_bpf_subprog_info {
  * @prog: machine code
  * @prog_len: number of valid instructions in @prog array
  * @__prog_alloc_len: alloc size of @prog array
+ * @stack_size: total amount of stack used
  * @verifier_meta: temporary storage for verifier's insn meta
  * @type: BPF program type
  * @last_bpf_off: address of the last instruction translated from BPF
@@ -465,6 +466,8 @@ struct nfp_prog {
 	unsigned int prog_len;
 	unsigned int __prog_alloc_len;
 
+	unsigned int stack_size;
+
 	struct nfp_insn_meta *verifier_meta;
 
 	enum bpf_prog_type type;
diff --git a/drivers/net/ethernet/netronome/nfp/bpf/offload.c b/drivers/net/ethernet/netronome/nfp/bpf/offload.c
index 658c7143d59c..ba8ceedcf6a2 100644
--- a/drivers/net/ethernet/netronome/nfp/bpf/offload.c
+++ b/drivers/net/ethernet/netronome/nfp/bpf/offload.c
@@ -489,7 +489,7 @@ nfp_net_bpf_load(struct nfp_net *nn, struct bpf_prog *prog,
 		 struct netlink_ext_ack *extack)
 {
 	struct nfp_prog *nfp_prog = prog->aux->offload->dev_priv;
-	unsigned int max_mtu;
+	unsigned int max_mtu, max_stack, max_prog_len;
 	dma_addr_t dma_addr;
 	void *img;
 	int err;
@@ -500,6 +500,18 @@ nfp_net_bpf_load(struct nfp_net *nn, struct bpf_prog *prog,
 		return -EOPNOTSUPP;
 	}
 
+	max_stack = nn_readb(nn, NFP_NET_CFG_BPF_STACK_SZ) * 64;
+	if (nfp_prog->stack_size > max_stack) {
+		NL_SET_ERR_MSG_MOD(extack, "stack too large");
+		return -EOPNOTSUPP;
+	}
+
+	max_prog_len = nn_readw(nn, NFP_NET_CFG_BPF_MAX_LEN);
+	if (nfp_prog->prog_len > max_prog_len) {
+		NL_SET_ERR_MSG_MOD(extack, "program too long");
+		return -EOPNOTSUPP;
+	}
+
 	img = nfp_bpf_relo_for_vnic(nfp_prog, nn->app_priv);
 	if (IS_ERR(img))
 		return PTR_ERR(img);
diff --git a/drivers/net/ethernet/netronome/nfp/bpf/verifier.c b/drivers/net/ethernet/netronome/nfp/bpf/verifier.c
index e04035c116a4..99f977bfd8cc 100644
--- a/drivers/net/ethernet/netronome/nfp/bpf/verifier.c
+++ b/drivers/net/ethernet/netronome/nfp/bpf/verifier.c
@@ -747,9 +747,9 @@ nfp_bpf_get_stack_usage(struct nfp_prog *nfp_prog, unsigned int cnt)
 
 static int nfp_bpf_finalize(struct bpf_verifier_env *env)
 {
-	unsigned int stack_size, stack_needed;
 	struct bpf_subprog_info *info;
 	struct nfp_prog *nfp_prog;
+	unsigned int max_stack;
 	struct nfp_net *nn;
 	int i;
 
@@ -777,11 +777,12 @@ static int nfp_bpf_finalize(struct bpf_verifier_env *env)
 	}
 
 	nn = netdev_priv(env->prog->aux->offload->netdev);
-	stack_size = nn_readb(nn, NFP_NET_CFG_BPF_STACK_SZ) * 64;
-	stack_needed = nfp_bpf_get_stack_usage(nfp_prog, env->prog->len);
-	if (stack_needed > stack_size) {
+	max_stack = nn_readb(nn, NFP_NET_CFG_BPF_STACK_SZ) * 64;
+	nfp_prog->stack_size = nfp_bpf_get_stack_usage(nfp_prog,
+						       env->prog->len);
+	if (nfp_prog->stack_size > max_stack) {
 		pr_vlog(env, "stack too large: program %dB > FW stack %dB\n",
-			stack_needed, stack_size);
+			nfp_prog->stack_size, max_stack);
 		return -EOPNOTSUPP;
 	}
 
-- 
2.17.1

^ permalink raw reply related

* [PATCH bpf-next 1/2] nfp: bpf: protect against mis-initializing atomic counters
From: Jakub Kicinski @ 2018-10-16 22:19 UTC (permalink / raw)
  To: alexei.starovoitov, daniel; +Cc: netdev, oss-drivers, Jakub Kicinski
In-Reply-To: <20181016221910.9947-1-jakub.kicinski@netronome.com>

Atomic operations on the NFP are currently always in big endian.
The driver keeps track of regions of memory storing atomic values
and byte swaps them accordingly.  There are corner cases where
the map values may be initialized before the driver knows they
are used as atomic counters.  This can happen either when the
datapath is performing the update and the stack contents are
unknown or when map is updated before the program which will
use it for atomic values is loaded.

To avoid situation where user initializes the value to 0 1 2 3
and then after loading a program which uses the word as an atomic
counter starts reading 3 2 1 0 - only allow atomic counters to be
initialized to endian-neutral values.

For updates from the datapath the stack information may not be
as precise, so just allow initializing such values to 0.

Example code which would break:
struct bpf_map_def SEC("maps") rxcnt = {
       .type = BPF_MAP_TYPE_HASH,
       .key_size = sizeof(__u32),
       .value_size = sizeof(__u64),
       .max_entries = 1,
};

int xdp_prog1()
{
      	__u64 nonzeroval = 3;
	__u32 key = 0;
	__u64 *value;

	value = bpf_map_lookup_elem(&rxcnt, &key);
	if (!value)
		bpf_map_update_elem(&rxcnt, &key, &nonzeroval, BPF_ANY);
	else
		__sync_fetch_and_add(value, 1);

	return XDP_PASS;
}

$ offload bpftool map dump
key: 00 00 00 00 value: 00 00 00 03 00 00 00 00

should be:

$ offload bpftool map dump
key: 00 00 00 00 value: 03 00 00 00 00 00 00 00

Reported-by: David Beckett <david.beckett@netronome.com>
Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Quentin Monnet <quentin.monnet@netronome.com>
---
 drivers/net/ethernet/netronome/nfp/bpf/main.h |  7 ++-
 .../net/ethernet/netronome/nfp/bpf/offload.c  | 18 +++++-
 .../net/ethernet/netronome/nfp/bpf/verifier.c | 58 +++++++++++++++++--
 3 files changed, 76 insertions(+), 7 deletions(-)

diff --git a/drivers/net/ethernet/netronome/nfp/bpf/main.h b/drivers/net/ethernet/netronome/nfp/bpf/main.h
index 52457ae3b259..12e98a0a58e5 100644
--- a/drivers/net/ethernet/netronome/nfp/bpf/main.h
+++ b/drivers/net/ethernet/netronome/nfp/bpf/main.h
@@ -189,6 +189,11 @@ enum nfp_bpf_map_use {
 	NFP_MAP_USE_ATOMIC_CNT,
 };
 
+struct nfp_bpf_map_word {
+	unsigned char type		:4;
+	unsigned char non_zero_update	:1;
+};
+
 /**
  * struct nfp_bpf_map - private per-map data attached to BPF maps for offload
  * @offmap:	pointer to the offloaded BPF map
@@ -202,7 +207,7 @@ struct nfp_bpf_map {
 	struct nfp_app_bpf *bpf;
 	u32 tid;
 	struct list_head l;
-	enum nfp_bpf_map_use use_map[];
+	struct nfp_bpf_map_word use_map[];
 };
 
 struct nfp_bpf_neutral_map {
diff --git a/drivers/net/ethernet/netronome/nfp/bpf/offload.c b/drivers/net/ethernet/netronome/nfp/bpf/offload.c
index 927e038d9f77..658c7143d59c 100644
--- a/drivers/net/ethernet/netronome/nfp/bpf/offload.c
+++ b/drivers/net/ethernet/netronome/nfp/bpf/offload.c
@@ -262,10 +262,25 @@ static void nfp_map_bpf_byte_swap(struct nfp_bpf_map *nfp_map, void *value)
 	unsigned int i;
 
 	for (i = 0; i < DIV_ROUND_UP(nfp_map->offmap->map.value_size, 4); i++)
-		if (nfp_map->use_map[i] == NFP_MAP_USE_ATOMIC_CNT)
+		if (nfp_map->use_map[i].type == NFP_MAP_USE_ATOMIC_CNT)
 			word[i] = (__force u32)cpu_to_be32(word[i]);
 }
 
+/* Mark value as unsafely initialized in case it becomes atomic later
+ * and we didn't byte swap something non-byte swap neutral.
+ */
+static void
+nfp_map_bpf_byte_swap_record(struct nfp_bpf_map *nfp_map, void *value)
+{
+	u32 *word = value;
+	unsigned int i;
+
+	for (i = 0; i < DIV_ROUND_UP(nfp_map->offmap->map.value_size, 4); i++)
+		if (nfp_map->use_map[i].type == NFP_MAP_UNUSED &&
+		    word[i] != (__force u32)cpu_to_be32(word[i]))
+			nfp_map->use_map[i].non_zero_update = 1;
+}
+
 static int
 nfp_bpf_map_lookup_entry(struct bpf_offloaded_map *offmap,
 			 void *key, void *value)
@@ -285,6 +300,7 @@ nfp_bpf_map_update_entry(struct bpf_offloaded_map *offmap,
 			 void *key, void *value, u64 flags)
 {
 	nfp_map_bpf_byte_swap(offmap->dev_priv, value);
+	nfp_map_bpf_byte_swap_record(offmap->dev_priv, value);
 	return nfp_bpf_ctrl_update_entry(offmap, key, value, flags);
 }
 
diff --git a/drivers/net/ethernet/netronome/nfp/bpf/verifier.c b/drivers/net/ethernet/netronome/nfp/bpf/verifier.c
index 193dd685b365..e04035c116a4 100644
--- a/drivers/net/ethernet/netronome/nfp/bpf/verifier.c
+++ b/drivers/net/ethernet/netronome/nfp/bpf/verifier.c
@@ -80,6 +80,46 @@ nfp_record_adjust_head(struct nfp_app_bpf *bpf, struct nfp_prog *nfp_prog,
 	nfp_prog->adjust_head_location = location;
 }
 
+static bool nfp_bpf_map_update_value_ok(struct bpf_verifier_env *env)
+{
+	const struct bpf_reg_state *reg1 = cur_regs(env) + BPF_REG_1;
+	const struct bpf_reg_state *reg3 = cur_regs(env) + BPF_REG_3;
+	struct bpf_offloaded_map *offmap;
+	struct bpf_func_state *state;
+	struct nfp_bpf_map *nfp_map;
+	int off, i;
+
+	state = env->cur_state->frame[reg3->frameno];
+
+	/* We need to record each time update happens with non-zero words,
+	 * in case such word is used in atomic operations.
+	 * Implicitly depend on nfp_bpf_stack_arg_ok(reg3) being run before.
+	 */
+
+	offmap = map_to_offmap(reg1->map_ptr);
+	nfp_map = offmap->dev_priv;
+	off = reg3->off + reg3->var_off.value;
+
+	for (i = 0; i < offmap->map.value_size; i++) {
+		struct bpf_stack_state *stack_entry;
+		unsigned int soff;
+
+		soff = -(off + i) - 1;
+		stack_entry = &state->stack[soff / BPF_REG_SIZE];
+		if (stack_entry->slot_type[soff % BPF_REG_SIZE] == STACK_ZERO)
+			continue;
+
+		if (nfp_map->use_map[i / 4].type == NFP_MAP_USE_ATOMIC_CNT) {
+			pr_vlog(env, "value at offset %d/%d may be non-zero, bpf_map_update_elem() is required to initialize atomic counters to zero to avoid offload endian issues\n",
+				i, soff);
+			return false;
+		}
+		nfp_map->use_map[i / 4].non_zero_update = 1;
+	}
+
+	return true;
+}
+
 static int
 nfp_bpf_stack_arg_ok(const char *fname, struct bpf_verifier_env *env,
 		     const struct bpf_reg_state *reg,
@@ -171,7 +211,8 @@ nfp_bpf_check_helper_call(struct nfp_prog *nfp_prog,
 					 bpf->helpers.map_update, reg1) ||
 		    !nfp_bpf_stack_arg_ok("map_update", env, reg2,
 					  meta->func_id ? &meta->arg2 : NULL) ||
-		    !nfp_bpf_stack_arg_ok("map_update", env, reg3, NULL))
+		    !nfp_bpf_stack_arg_ok("map_update", env, reg3, NULL) ||
+		    !nfp_bpf_map_update_value_ok(env))
 			return -EOPNOTSUPP;
 		break;
 
@@ -352,15 +393,22 @@ nfp_bpf_map_mark_used_one(struct bpf_verifier_env *env,
 			  struct nfp_bpf_map *nfp_map,
 			  unsigned int off, enum nfp_bpf_map_use use)
 {
-	if (nfp_map->use_map[off / 4] != NFP_MAP_UNUSED &&
-	    nfp_map->use_map[off / 4] != use) {
+	if (nfp_map->use_map[off / 4].type != NFP_MAP_UNUSED &&
+	    nfp_map->use_map[off / 4].type != use) {
 		pr_vlog(env, "map value use type conflict %s vs %s off: %u\n",
-			nfp_bpf_map_use_name(nfp_map->use_map[off / 4]),
+			nfp_bpf_map_use_name(nfp_map->use_map[off / 4].type),
 			nfp_bpf_map_use_name(use), off);
 		return -EOPNOTSUPP;
 	}
 
-	nfp_map->use_map[off / 4] = use;
+	if (nfp_map->use_map[off / 4].non_zero_update &&
+	    use == NFP_MAP_USE_ATOMIC_CNT) {
+		pr_vlog(env, "atomic counter in map value may already be initialized to non-zero value off: %u\n",
+			off);
+		return -EOPNOTSUPP;
+	}
+
+	nfp_map->use_map[off / 4].type = use;
 
 	return 0;
 }
-- 
2.17.1

^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox