Netdev List
 help / color / mirror / Atom feed
* Re: [bpf-next PATCH v2 15/18] bpf: sockmap sample support for bpf_msg_cork_bytes()
From: Alexei Starovoitov @ 2018-03-15 20:15 UTC (permalink / raw)
  To: John Fastabend; +Cc: davem, ast, daniel, davejwatson, netdev
In-Reply-To: <20180312192421.8039.90630.stgit@john-Precision-Tower-5810>

On Mon, Mar 12, 2018 at 12:24:21PM -0700, John Fastabend wrote:
> Add sample application support for the bpf_msg_cork_bytes helper. This
> lets the user specify how many bytes each verdict should apply to.
> 
> Similar to apply_bytes() tests these can be run as a stand-alone test
> when used without other options or inline with other tests by using
> the txmsg_cork option along with any of the basic tests txmsg,
> txmsg_redir, txmsg_drop.
> 
> Signed-off-by: John Fastabend <john.fastabend@gmail.com>
> ---
>  include/uapi/linux/bpf_common.h           |    7 ++--
>  samples/sockmap/sockmap_kern.c            |   53 +++++++++++++++++++++++++----
>  samples/sockmap/sockmap_user.c            |   19 ++++++++++
>  tools/include/uapi/linux/bpf.h            |    3 +-
>  tools/testing/selftests/bpf/bpf_helpers.h |    2 +
>  5 files changed, 71 insertions(+), 13 deletions(-)
> 
> diff --git a/include/uapi/linux/bpf_common.h b/include/uapi/linux/bpf_common.h
> index ee97668..18be907 100644
> --- a/include/uapi/linux/bpf_common.h
> +++ b/include/uapi/linux/bpf_common.h
> @@ -15,10 +15,9 @@
>  
>  /* ld/ldx fields */
>  #define BPF_SIZE(code)  ((code) & 0x18)
> -#define		BPF_W		0x00 /* 32-bit */
> -#define		BPF_H		0x08 /* 16-bit */
> -#define		BPF_B		0x10 /*  8-bit */
> -/* eBPF		BPF_DW		0x18    64-bit */
> +#define		BPF_W		0x00
> +#define		BPF_H		0x08
> +#define		BPF_B		0x10

this hunk seems wrong here. Botched rebase?

^ permalink raw reply

* Re: [bpf-next PATCH v2 08/18] bpf: sk_msg program helper bpf_sk_msg_pull_data
From: Daniel Borkmann @ 2018-03-15 20:25 UTC (permalink / raw)
  To: John Fastabend, davem, ast, davejwatson; +Cc: netdev
In-Reply-To: <20180312192345.8039.27774.stgit@john-Precision-Tower-5810>

On 03/12/2018 08:23 PM, John Fastabend wrote:
[...]
>  /* integer value in 'imm' field of BPF_CALL instruction selects which helper
>   * function eBPF program intends to call
> diff --git a/net/core/filter.c b/net/core/filter.c
> index 2c73af0..7b9e63e 100644
> --- a/net/core/filter.c
> +++ b/net/core/filter.c
> @@ -1956,6 +1956,134 @@ struct sock *do_msg_redirect_map(struct sk_msg_buff *msg)
>  	.arg2_type      = ARG_ANYTHING,
>  };
>  
> +BPF_CALL_4(bpf_msg_pull_data,
> +	   struct sk_msg_buff *, msg, u32, start, u32, end, u64, flags)
> +{
> +	unsigned int len = 0, offset = 0, copy = 0;
> +	struct scatterlist *sg = msg->sg_data;
> +	int first_sg, last_sg, i, shift;
> +	unsigned char *p, *to, *from;
> +	int bytes = end - start;
> +	struct page *page;
> +
> +	if (unlikely(end < start))
> +		return -EINVAL;

Actually should be:

if (unlikely(flags || end <= start))
	return -EINVAL;

> +	/* First find the starting scatterlist element */
> +	i = msg->sg_start;
> +	do {
> +		len = sg[i].length;
> +		offset += len;
> +		if (start < offset + len)
> +			break;
> +		i++;
> +		if (i == MAX_SKB_FRAGS)
> +			i = 0;
> +	} while (i != msg->sg_end);
> +
> +	if (unlikely(start >= offset + len))
> +		return -EINVAL;
> +
> +	if (!msg->sg_copy[i] && bytes <= len)
> +		goto out;
> +
> +	first_sg = i;
> +
> +	/* At this point we need to linearize multiple scatterlist
> +	 * elements or a single shared page. Either way we need to
> +	 * copy into a linear buffer exclusively owned by BPF. Then
> +	 * place the buffer in the scatterlist and fixup the original
> +	 * entries by removing the entries now in the linear buffer
> +	 * and shifting the remaining entries. For now we do not try
> +	 * to copy partial entries to avoid complexity of running out
> +	 * of sg_entry slots. The downside is reading a single byte
> +	 * will copy the entire sg entry.
> +	 */
> +	do {
> +		copy += sg[i].length;
> +		i++;
> +		if (i == MAX_SKB_FRAGS)
> +			i = 0;
> +		if (bytes < copy)
> +			break;
> +	} while (i != msg->sg_end);
> +	last_sg = i;
> +
> +	if (unlikely(copy < end - start))
> +		return -EINVAL;
> +
> +	page = alloc_pages(__GFP_NOWARN | GFP_ATOMIC, get_order(copy));

if (unlikely(!page))
	return -ENOMEM;

> +	p = page_address(page);
> +	offset = 0;
> +
> +	i = first_sg;
> +	do {
> +		from = sg_virt(&sg[i]);
> +		len = sg[i].length;
> +		to = p + offset;
> +
> +		memcpy(to, from, len);
> +		offset += len;
> +		sg[i].length = 0;
> +		put_page(sg_page(&sg[i]));
> +
> +		i++;
> +		if (i == MAX_SKB_FRAGS)
> +			i = 0;
> +	} while (i != last_sg);
> +
> +	sg[first_sg].length = copy;
> +	sg_set_page(&sg[first_sg], page, copy, 0);
> +
> +	/* To repair sg ring we need to shift entries. If we only
> +	 * had a single entry though we can just replace it and
> +	 * be done. Otherwise walk the ring and shift the entries.
> +	 */
> +	shift = last_sg - first_sg - 1;
> +	if (!shift)
> +		goto out;
> +
> +	i = first_sg + 1;
> +	do {
> +		int move_from;
> +
> +		if (i + shift >= MAX_SKB_FRAGS)
> +			move_from = i + shift - MAX_SKB_FRAGS;
> +		else
> +			move_from = i + shift;
> +
> +		if (move_from == msg->sg_end)
> +			break;
> +
> +		sg[i] = sg[move_from];
> +		sg[move_from].length = 0;
> +		sg[move_from].page_link = 0;
> +		sg[move_from].offset = 0;
> +
> +		i++;
> +		if (i == MAX_SKB_FRAGS)
> +			i = 0;
> +	} while (1);
> +	msg->sg_end -= shift;
> +	if (msg->sg_end < 0)
> +		msg->sg_end += MAX_SKB_FRAGS;
> +out:
> +	msg->data = sg_virt(&sg[i]) + start - offset;
> +	msg->data_end = msg->data + bytes;
> +
> +	return 0;
> +}
> +
> +static const struct bpf_func_proto bpf_msg_pull_data_proto = {
> +	.func		= bpf_msg_pull_data,
> +	.gpl_only	= false,
> +	.ret_type	= RET_INTEGER,
> +	.arg1_type	= ARG_PTR_TO_CTX,
> +	.arg2_type	= ARG_ANYTHING,
> +	.arg3_type	= ARG_ANYTHING,
> +	.arg4_type	= ARG_ANYTHING,
> +};
> +
>  BPF_CALL_1(bpf_get_cgroup_classid, const struct sk_buff *, skb)
>  {
>  	return task_get_classid(skb);
> @@ -2897,7 +3025,8 @@ bool bpf_helper_changes_pkt_data(void *func)
>  	    func == bpf_l3_csum_replace ||
>  	    func == bpf_l4_csum_replace ||
>  	    func == bpf_xdp_adjust_head ||
> -	    func == bpf_xdp_adjust_meta)
> +	    func == bpf_xdp_adjust_meta ||
> +	    func == bpf_msg_pull_data)
>  		return true;
>  
>  	return false;
> @@ -3666,6 +3795,8 @@ static const struct bpf_func_proto *sk_msg_func_proto(enum bpf_func_id func_id)
>  		return &bpf_msg_apply_bytes_proto;
>  	case BPF_FUNC_msg_cork_bytes:
>  		return &bpf_msg_cork_bytes_proto;
> +	case BPF_FUNC_msg_pull_data:
> +		return &bpf_msg_pull_data_proto;
>  	default:
>  		return bpf_base_func_proto(func_id);
>  	}
> 

^ permalink raw reply

* Re: [RFC,POC] iptables/nftables to epbf/xdp via common intermediate layer
From: Alexei Starovoitov @ 2018-03-15 20:26 UTC (permalink / raw)
  To: Florian Westphal
  Cc: Edward Cree, Daniel Borkmann, netdev, ast, pablo, David S. Miller
In-Reply-To: <20180315170022.GA21181@breakpoint.cc>

On Thu, Mar 15, 2018 at 06:00:22PM +0100, Florian Westphal wrote:
> Alexei Starovoitov <alexei.starovoitov@gmail.com> wrote:
> > The way this IMR defined today looks pretty much like nft and
> > it feels a bit too low level than iptable conversion would need.
> 
> It wasn't so much about a specific IMR but to avoid code duplication
> between nft and iptables translators.
> 
> > I think it would be simpler to have user space only extensions
> > and opcodes added to bpf for the purpose of the translation.
> > Like there is no bpf instruction called 'load from IP header',
> > but we can make one. Just extend extended bpf with an instruction
> > like this and on the first pass do full conversion of nft
> > directly into this 'extended extended bpf'.
> 
> I don't want to duplicate any ebpf conversion (and optimisations)
> in the nft part.
> 
> If nft can be translated to this 'extended extended bpf' and
> this then generates bpf code from nft input all is good.

if possible it's great to avoid duplication, but it shouldn't be
such ultimate goal that it cripples iptable->bpf conversion
just to reuse nft->bpf bits.

^ permalink raw reply

* Re: [RFC PATCH V1 01/12] audit: add container id
From: Stefan Berger @ 2018-03-15 20:27 UTC (permalink / raw)
  To: Richard Guy Briggs, cgroups-u79uwXL29TY76Z2rM5mHXA,
	containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA,
	linux-api-u79uwXL29TY76Z2rM5mHXA, Linux-Audit Mailing List,
	linux-fsdevel-u79uwXL29TY76Z2rM5mHXA, LKML,
	netdev-u79uwXL29TY76Z2rM5mHXA
  Cc: mszeredi-H+wXaHxf7aLQT0dZR+AlfA, ebiederm-aS9lmoZGLiVWk0Htik3J/w,
	simo-H+wXaHxf7aLQT0dZR+AlfA, jlayton-H+wXaHxf7aLQT0dZR+AlfA,
	carlos-H+wXaHxf7aLQT0dZR+AlfA, dhowells-H+wXaHxf7aLQT0dZR+AlfA,
	madzcar-Re5JQEeQqe8AvxtiuMwx3w,
	viro-RmSDqhL/yNMiFSDQTTA3OLVCufUGDwFn,
	luto-DgEjT+Ai2ygdnm+yROfE0A, eparis-FjpueFixGhCM4zKIHC2jIg,
	trondmy-7I+n7zu2hftEKMMhf/gKZA
In-Reply-To: <2e5d93ee46feca915a101c2fc3062da674a98223.1519930146.git.rgb-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>

On 03/01/2018 02:41 PM, Richard Guy Briggs wrote:
> Implement the proc fs write to set the audit container ID of a process,
> emitting an AUDIT_CONTAINER record to document the event.
>
> This is a write from the container orchestrator task to a proc entry of
> the form /proc/PID/containerid where PID is the process ID of the newly
> created task that is to become the first task in a container, or an
> additional task added to a container.
>
> The write expects up to a u64 value (unset: 18446744073709551615).
>
> This will produce a record such as this:
> type=UNKNOWN[1333] msg=audit(1519903238.968:261): op=set pid=596 uid=0 subj=unconfined_u:unconfined_r:unconfined_t:s0-s0:c0.c1023 auid=0 tty=pts0 ses=1 opid=596 old-contid=18446744073709551615 contid=123455 res=0
>
> The "op" field indicates an initial set.  The "pid" to "ses" fields are
> the orchestrator while the "opid" field is the object's PID, the process
> being "contained".  Old and new container ID values are given in the
> "contid" fields, while res indicates its success.
>
> It is not permitted to self-set, unset or re-set the container ID.  A
> child inherits its parent's container ID, but then can be set only once
> after.
>
> See: https://github.com/linux-audit/audit-kernel/issues/32
>
>
>   /* audit_rule_data supports filter rules with both integer and string
>    * fields.  It corresponds with AUDIT_ADD_RULE, AUDIT_DEL_RULE and
> diff --git a/kernel/auditsc.c b/kernel/auditsc.c
> index 4e0a4ac..0ee1e59 100644
> --- a/kernel/auditsc.c
> +++ b/kernel/auditsc.c
> @@ -2073,6 +2073,92 @@ int audit_set_loginuid(kuid_t loginuid)
>   	return rc;
>   }
>
> +static int audit_set_containerid_perm(struct task_struct *task, u64 containerid)
> +{
> +	struct task_struct *parent;
> +	u64 pcontainerid, ccontainerid;
> +	pid_t ppid;
> +
> +	/* Don't allow to set our own containerid */
> +	if (current == task)
> +		return -EPERM;
> +	/* Don't allow the containerid to be unset */
> +	if (!cid_valid(containerid))
> +		return -EINVAL;
> +	/* if we don't have caps, reject */
> +	if (!capable(CAP_AUDIT_CONTROL))
> +		return -EPERM;
> +	/* if containerid is unset, allow */
> +	if (!audit_containerid_set(task))
> +		return 0;

I am wondering whether there should be a check for the target process 
that will receive the containerid to not have CAP_SYS_ADMIN that would 
otherwise allow it to arbitrarily unshare()/clone() and leave the set of 
namespaces that may make up the container whose containerid we assign here?

> +	/* it is already set, and not inherited from the parent, reject */
> +	ccontainerid = audit_get_containerid(task);
> +	rcu_read_lock();
> +	parent = rcu_dereference(task->real_parent);
> +	rcu_read_unlock();
> +	task_lock(parent);
> +	pcontainerid = audit_get_containerid(parent);
> +	ppid = task_tgid_nr(parent);
ppid not needed...

> +	task_unlock(parent);
> +	if (ccontainerid != pcontainerid)
> +		return -EPERM;
> +	return 0;
> +}
> +
> +static void audit_log_set_containerid(struct task_struct *task, u64 oldcontainerid,
> +				      u64 containerid, int rc)
> +{
> +	struct audit_buffer *ab;
> +	uid_t uid;
> +	struct tty_struct *tty;
> +
> +	if (!audit_enabled)
> +		return;
> +
> +	ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONTAINER);
> +	if (!ab)
> +		return;
> +
> +	uid = from_kuid(&init_user_ns, task_uid(current));
> +	tty = audit_get_tty(current);
> +
> +	audit_log_format(ab, "op=set pid=%d uid=%u", task_tgid_nr(current), uid);
> +	audit_log_task_context(ab);
> +	audit_log_format(ab, " auid=%u tty=%s ses=%u opid=%d old-contid=%llu contid=%llu res=%d",
> +			 from_kuid(&init_user_ns, audit_get_loginuid(current)),
> +			 tty ? tty_name(tty) : "(none)", audit_get_sessionid(current),
> +			 task_tgid_nr(task), oldcontainerid, containerid, !rc);
> +
> +	audit_put_tty(tty);
> +	audit_log_end(ab);
> +}
> +
> +/**
> + * audit_set_containerid - set current task's audit_context containerid
> + * @containerid: containerid value
> + *
> + * Returns 0 on success, -EPERM on permission failure.
> + *
> + * Called (set) from fs/proc/base.c::proc_containerid_write().
> + */
> +int audit_set_containerid(struct task_struct *task, u64 containerid)
> +{
> +	u64 oldcontainerid;
> +	int rc;
> +
> +	oldcontainerid = audit_get_containerid(task);
> +
> +	rc = audit_set_containerid_perm(task, containerid);
> +	if (!rc) {
> +		task_lock(task);
> +		task->containerid = containerid;
> +		task_unlock(task);
> +	}
> +
> +	audit_log_set_containerid(task, oldcontainerid, containerid, rc);
> +	return rc;
> +}
> +
>   /**
>    * __audit_mq_open - record audit data for a POSIX MQ open
>    * @oflag: open flag

^ permalink raw reply

* Re: [PATCH net-next 0/6] Converting pernet_operations (part #8)
From: Julian Anastasov @ 2018-03-15 20:32 UTC (permalink / raw)
  To: Kirill Tkhai
  Cc: dev-yBygre7rU0TnMu66kgdUjQ, Wensong Zhang,
	g.nault-pHk1y4uTXVDytLWWfqlThQ,
	dan.j.williams-ral2JQCrhuEAvxtiuMwx3w,
	netdev-u79uwXL29TY76Z2rM5mHXA,
	roopa-qUQiAmfTcIp+XZJcv9eMoEEOCMrvLtNR,
	dsahern-Re5JQEeQqe8AvxtiuMwx3w, fw-HFFVJYpyMKqzQB+pC5nmwQ,
	jchapman-Bm0nJX+W7e9BDgjK7y7TUQ, lvs-devel-u79uwXL29TY76Z2rM5mHXA,
	netfilter-devel-u79uwXL29TY76Z2rM5mHXA,
	elena.reshetova-ral2JQCrhuEAvxtiuMwx3w,
	amine.kherbouche-pdR9zngts4EAvxtiuMwx3w,
	kadlec-K40Dz/62t/MgiyqX0sVFJYdd74u8MsAO,
	rshearma-43mecJUBy8ZBDgjK7y7TUQ, David S. Miller,
	pablo-Cap9r6Oaw4JrovVCs/uTlw, dwindsor-Re5JQEeQqe8AvxtiuMwx3w
In-Reply-To: <152110491273.28582.13804059107038714030.stgit-bi+AKbBUZKY6gyzm1THtWbp2dZbC/Bob@public.gmane.org>


	Hello,

On Thu, 15 Mar 2018, Kirill Tkhai wrote:

> Hi,
> 
> this series continues to review and to convert pernet_operations
> to make them possible to be executed in parallel for several
> net namespaces at the same time. There are different operations
> over the tree, mostly are ipvs.
> 
> 
> Thanks,
> Kirill
> ---
> 
> Kirill Tkhai (6):
>       net: Convert l2tp_net_ops
>       net: Convert mpls_net_ops
>       net: Convert ovs_net_ops
>       net: Convert ipvs_core_ops
>       net: Convert ipvs_core_dev_ops
>       net: Convert ip_vs_ftp_ops

	The IPVS patches 4-6 look good to me,

Acked-by: Julian Anastasov <ja-FgGsKACvmQM@public.gmane.org>

>  net/l2tp/l2tp_core.c            |    1 +
>  net/mpls/af_mpls.c              |    1 +
>  net/netfilter/ipvs/ip_vs_core.c |    2 ++
>  net/netfilter/ipvs/ip_vs_ftp.c  |    1 +
>  net/openvswitch/datapath.c      |    1 +
>  5 files changed, 6 insertions(+)
> 
> --
> Signed-off-by: Kirill Tkhai <ktkhai-5HdwGun5lf+gSpxsJD1C4w@public.gmane.org>

Regards

--
Julian Anastasov <ja-FgGsKACvmQM@public.gmane.org>

^ permalink raw reply

* Re: [bpf-next PATCH v2 06/18] bpf: sockmap, add bpf_msg_apply_bytes() helper
From: Daniel Borkmann @ 2018-03-15 20:32 UTC (permalink / raw)
  To: John Fastabend, davem, ast, davejwatson; +Cc: netdev
In-Reply-To: <20180312192334.8039.98220.stgit@john-Precision-Tower-5810>

On 03/12/2018 08:23 PM, John Fastabend wrote:
> A single sendmsg or sendfile system call can contain multiple logical
> messages that a BPF program may want to read and apply a verdict. But,
> without an apply_bytes helper any verdict on the data applies to all
> bytes in the sendmsg/sendfile. Alternatively, a BPF program may only
> care to read the first N bytes of a msg. If the payload is large say
> MB or even GB setting up and calling the BPF program repeatedly for
> all bytes, even though the verdict is already known, creates
> unnecessary overhead.
> 
> To allow BPF programs to control how many bytes a given verdict
> applies to we implement a bpf_msg_apply_bytes() helper. When called
> from within a BPF program this sets a counter, internal to the
> BPF infrastructure, that applies the last verdict to the next N
> bytes. If the N is smaller than the current data being processed
> from a sendmsg/sendfile call, the first N bytes will be sent and
> the BPF program will be re-run with start_data pointing to the N+1
> byte. If N is larger than the current data being processed the
> BPF verdict will be applied to multiple sendmsg/sendfile calls
> until N bytes are consumed.
> 
> Note1 if a socket closes with apply_bytes counter non-zero this
> is not a problem because data is not being buffered for N bytes
> and is sent as its received.
> 
> Note2 if this is operating in the sendpage context the data
> pointers may be zeroed after this call if the apply walks beyond
> a msg_pull_data() call specified data range. (helper implemented
> shortly in this series).
> 
> Signed-off-by: John Fastabend <john.fastabend@gmail.com>
> ---
>  include/uapi/linux/bpf.h |    3 ++-
>  net/core/filter.c        |   16 ++++++++++++++++
>  2 files changed, 18 insertions(+), 1 deletion(-)
> 
> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index b8275f0..e50c61f 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -769,7 +769,8 @@ enum bpf_attach_type {
>  	FN(getsockopt),			\
>  	FN(override_return),		\
>  	FN(sock_ops_cb_flags_set),	\
> -	FN(msg_redirect_map),
> +	FN(msg_redirect_map),		\
> +	FN(msg_apply_bytes),
>  
>  /* integer value in 'imm' field of BPF_CALL instruction selects which helper
>   * function eBPF program intends to call
> diff --git a/net/core/filter.c b/net/core/filter.c
> index 314c311..df2a8f4 100644
> --- a/net/core/filter.c
> +++ b/net/core/filter.c
> @@ -1928,6 +1928,20 @@ struct sock *do_msg_redirect_map(struct sk_msg_buff *msg)
>  	.arg4_type      = ARG_ANYTHING,
>  };
>  
> +BPF_CALL_2(bpf_msg_apply_bytes, struct sk_msg_buff *, msg, u64, bytes)
> +{
> +	msg->apply_bytes = bytes;

Here in bpf_msg_apply_bytes() but also in bpf_msg_cork_bytes() the signature
is u64, but in struct sk_msg_buff and struct smap_psock it's type int, so
user provided u64 will make these negative. Is there a reason to have this
allow a negative value and not being of type u32 everywhere?

> +	return 0;
> +}
> +
> +static const struct bpf_func_proto bpf_msg_apply_bytes_proto = {
> +	.func           = bpf_msg_apply_bytes,
> +	.gpl_only       = false,
> +	.ret_type       = RET_INTEGER,
> +	.arg1_type	= ARG_PTR_TO_CTX,
> +	.arg2_type      = ARG_ANYTHING,
> +};
> +
>  BPF_CALL_1(bpf_get_cgroup_classid, const struct sk_buff *, skb)
>  {
>  	return task_get_classid(skb);
> @@ -3634,6 +3648,8 @@ static const struct bpf_func_proto *sk_msg_func_proto(enum bpf_func_id func_id)
>  	switch (func_id) {
>  	case BPF_FUNC_msg_redirect_map:
>  		return &bpf_msg_redirect_map_proto;
> +	case BPF_FUNC_msg_apply_bytes:
> +		return &bpf_msg_apply_bytes_proto;
>  	default:
>  		return bpf_base_func_proto(func_id);
>  	}
> 

^ permalink raw reply

* Re: [RFC 2/2] page_frag_cache: Store metadata in struct page
From: Eric Dumazet @ 2018-03-15 21:03 UTC (permalink / raw)
  To: Matthew Wilcox, Alexander Duyck; +Cc: linux-mm, netdev, Matthew Wilcox
In-Reply-To: <20180315195329.7787-3-willy@infradead.org>



On 03/15/2018 12:53 PM, Matthew Wilcox wrote:
> From: Matthew Wilcox <mawilcox@microsoft.com>
> 
> Shrink page_frag_cache from 24 to 8 bytes (a single pointer to the
> currently-in-use struct page) by using the page's refcount directly
> (instead of maintaining a bias) and storing our current progress through
> the page in the same bits currently used for page->index.  We no longer
> need to reflect the page pfmemalloc state if we're storing the page
> directly.
> 
> On the downside, we now call page_address() on every allocation, and we
> do an atomic_inc() rather than a non-atomic decrement, but we should
> touch the same number of cachelines and there is far less code (and
> the code is less complex).
> 
> Signed-off-by: Matthew Wilcox <mawilcox@microsoft.com>
> ---


One point about storing metadata not in struct page was to avoid false sharing.

consumers of page frags call put_page() thus touching page refcount.

With the pagecnt_bias (and other fields) we store in another cache line 
(private to the producer) we amortize the access to the shared cache line only when needed.

^ permalink raw reply

* Re: [PATCH] [v2] Bluetooth: btrsi: rework dependencies
From: Arnd Bergmann @ 2018-03-15 21:16 UTC (permalink / raw)
  To: Marcel Holtmann
  Cc: Johan Hedberg, Kalle Valo, Sebastian Reichel, Amitkumar Karwar,
	Siva Rebbagondla, Linux Bluetooth mailing list, LKML,
	linux-wireless, Networking
In-Reply-To: <988BDE3E-B2D2-4ABF-B644-4BFF41AF4157@holtmann.org>

On Thu, Mar 15, 2018 at 7:30 PM, Marcel Holtmann <marcel@holtmann.org> wrote:
> Hi Arnd,
>
>> The linkage between the bluetooth driver and the wireless
>> driver is not defined properly, leading to build problems
>> such as:
>>
>> warning: (BT_HCIRSI) selects RSI_COEX which has unmet direct dependencies (NETDEVICES && WLAN && WLAN_VENDOR_RSI && BT_HCIRSI && RSI_91X)
>> drivers/net/wireless/rsi/rsi_91x_main.o: In function `rsi_read_pkt':
>> (.text+0x205): undefined reference to `rsi_bt_ops'
>>
>> As the dependency is actually the reverse (RSI_91X uses
>> the BT_RSI driver, not the other way round), this changes
>> the dependency to match, and enables the bluetooth driver
>> from the RSI_COEX symbol.
>>
>> Fixes: 38aa4da50483 ("Bluetooth: btrsi: add new rsi bluetooth driver")
>> Signed-off-by: Arnd Bergmann <arnd@arndb.de>
>> ---
>> v2: Pick a different from v1
>> ---
>> drivers/bluetooth/Kconfig        | 4 +---
>> drivers/net/wireless/rsi/Kconfig | 4 +++-
>> 2 files changed, 4 insertions(+), 4 deletions(-)
>
> Acked-by: Marcel Holtmann <marcel@holtmann.org>
>
> Since I think Kalle still has to take it through his tree until the btrsi driver makes it into net-next.

Ok. Kalle, please wait for v3 though, I just ran into another build failure
caused by a typo in v2.

        Arnd

^ permalink raw reply

* [PATCH] [v3] Bluetooth: btrsi: rework dependencies
From: Arnd Bergmann @ 2018-03-15 21:18 UTC (permalink / raw)
  To: Kalle Valo
  Cc: Arnd Bergmann, Marcel Holtmann, Johan Hedberg, Johan Hovold,
	Amitkumar Karwar, Siva Rebbagondla, linux-bluetooth, linux-kernel,
	linux-wireless, netdev

The linkage between the bluetooth driver and the wireless
driver is not defined properly, leading to build problems
such as:

warning: (BT_HCIRSI) selects RSI_COEX which has unmet direct dependencies (NETDEVICES && WLAN && WLAN_VENDOR_RSI && BT_HCIRSI && RSI_91X)
drivers/net/wireless/rsi/rsi_91x_main.o: In function `rsi_read_pkt':
(.text+0x205): undefined reference to `rsi_bt_ops'

As the dependency is actually the reverse (RSI_91X uses
the BT_RSI driver, not the other way round), this changes
the dependency to match, and enables the bluetooth driver
from the RSI_COEX symbol.

Fixes: 38aa4da50483 ("Bluetooth: btrsi: add new rsi bluetooth driver")
Acked-by; Marcel Holtmann <marcel@holtmann.org>
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
---
v2: Pick a different approach from v1
v3: fix typo: s/BT_RSI/BT_HCIRSI/
---
 drivers/bluetooth/Kconfig        | 4 +---
 drivers/net/wireless/rsi/Kconfig | 4 +++-
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/drivers/bluetooth/Kconfig b/drivers/bluetooth/Kconfig
index d8bbd661dbdb..149a38ee1fce 100644
--- a/drivers/bluetooth/Kconfig
+++ b/drivers/bluetooth/Kconfig
@@ -393,9 +393,7 @@ config BT_QCOMSMD
 	  kernel or say M to compile as a module.
 
 config BT_HCIRSI
-	tristate "Redpine HCI support"
-	default n
-	select RSI_COEX
+	tristate
 	help
 	  Redpine BT driver.
 	  This driver handles BT traffic from upper layers and pass
diff --git a/drivers/net/wireless/rsi/Kconfig b/drivers/net/wireless/rsi/Kconfig
index f004be33fcfa..976c21866230 100644
--- a/drivers/net/wireless/rsi/Kconfig
+++ b/drivers/net/wireless/rsi/Kconfig
@@ -13,6 +13,7 @@ if WLAN_VENDOR_RSI
 
 config RSI_91X
 	tristate "Redpine Signals Inc 91x WLAN driver support"
+	select BT_HCIRSI if RSI_COEX
 	depends on MAC80211
 	---help---
 	  This option enabes support for RSI 1x1 devices.
@@ -44,7 +45,8 @@ config RSI_USB
 
 config RSI_COEX
 	bool "Redpine Signals WLAN BT Coexistence support"
-	depends on BT_HCIRSI && RSI_91X
+	depends on BT && RSI_91X
+	depends on !(BT=m && RSI_91X=y)
 	default y
 	---help---
 	  This option enables the WLAN BT coex support in rsi drivers.
-- 
2.9.0

^ permalink raw reply related

* [next-queue 1/4] ixgbe: no need for ipsec csum feature check
From: Shannon Nelson @ 2018-03-15 21:23 UTC (permalink / raw)
  To: intel-wired-lan; +Cc: netdev, steffen.klassert
In-Reply-To: <1521149003-1433-1-git-send-email-shannon.nelson@oracle.com>

With the patch
commit f8aa2696b4af ("esp: check the NETIF_F_HW_ESP_TX_CSUM bit before segmenting")
we no longer need to protect ourself from checksum
offload requests on IPsec packets, so we can remove
the check in our .ndo_features_check callback.

Signed-off-by: Shannon Nelson <shannon.nelson@oracle.com>
---
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
index 8536942..153cd9e 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
@@ -9908,12 +9908,6 @@ ixgbe_features_check(struct sk_buff *skb, struct net_device *dev,
 	if (skb->encapsulation && !(features & NETIF_F_TSO_MANGLEID))
 		features &= ~NETIF_F_TSO;
 
-#ifdef CONFIG_XFRM_OFFLOAD
-	/* IPsec offload doesn't get along well with others *yet* */
-	if (skb->sp)
-		features &= ~(NETIF_F_TSO | NETIF_F_HW_CSUM);
-#endif
-
 	return features;
 }
 
-- 
2.7.4

^ permalink raw reply related

* [next-queue 2/4] ixgbe: remove unneeded ipsec test in TX path
From: Shannon Nelson @ 2018-03-15 21:23 UTC (permalink / raw)
  To: intel-wired-lan; +Cc: netdev, steffen.klassert
In-Reply-To: <1521149003-1433-1-git-send-email-shannon.nelson@oracle.com>

Since the ipsec data fields will be zero anyway in the non-ipsec
case, we can remove the conditional jump.

Suggested-by: Alexander Duyck <alexander.duyck@gmail.com>
Signed-off-by: Shannon Nelson <shannon.nelson@oracle.com>
---
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
index 153cd9e..a54f3d8 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
@@ -7864,10 +7864,8 @@ static void ixgbe_tx_csum(struct ixgbe_ring *tx_ring,
 	vlan_macip_lens |= skb_network_offset(skb) << IXGBE_ADVTXD_MACLEN_SHIFT;
 	vlan_macip_lens |= first->tx_flags & IXGBE_TX_FLAGS_VLAN_MASK;
 
-	if (first->tx_flags & IXGBE_TX_FLAGS_IPSEC) {
-		fceof_saidx |= itd->sa_idx;
-		type_tucmd |= itd->flags | itd->trailer_len;
-	}
+	fceof_saidx |= itd->sa_idx;
+	type_tucmd |= itd->flags | itd->trailer_len;
 
 	ixgbe_tx_ctxtdesc(tx_ring, vlan_macip_lens, fceof_saidx, type_tucmd, 0);
 }
-- 
2.7.4

^ permalink raw reply related

* [next-queue 0/4] ixgbe: Enable tso and checksum offload with ipsec
From: Shannon Nelson @ 2018-03-15 21:23 UTC (permalink / raw)
  To: intel-wired-lan; +Cc: netdev, steffen.klassert

This patchset fixes up the bits for supporting TSO and checksum
offload in conjunction with IPsec offload.  This brings the
throughput of a simple iperf test back up to nearly line rate.

Shannon Nelson (4):
  ixgbe: no need for ipsec csum feature check
  ixgbe: remove unneeded ipsec test in Tx path
  ixgbe: no need for esp trailer if gso
  ixgbe: enable tso with ipsec offload

 drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c | 45 +++++++++++++++-----------
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c  | 30 ++++++++---------
 2 files changed, 42 insertions(+), 33 deletions(-)

-- 
2.7.4

^ permalink raw reply

* [next-queue 4/4] ixgbe: enable tso with ipsec offload
From: Shannon Nelson @ 2018-03-15 21:23 UTC (permalink / raw)
  To: intel-wired-lan; +Cc: netdev, steffen.klassert
In-Reply-To: <1521149003-1433-1-git-send-email-shannon.nelson@oracle.com>

Fix things up to support TSO offload in conjunction
with IPsec hw offload.  This raises throughput with
IPsec offload on to nearly line rate.

Signed-off-by: Shannon Nelson <shannon.nelson@oracle.com>
---
 drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c |  7 +++++--
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c  | 25 +++++++++++++++++++------
 2 files changed, 24 insertions(+), 8 deletions(-)

diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c
index 5ddea43..bfbcfc2 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c
@@ -896,6 +896,7 @@ void ixgbe_ipsec_rx(struct ixgbe_ring *rx_ring,
 void ixgbe_init_ipsec_offload(struct ixgbe_adapter *adapter)
 {
 	struct ixgbe_ipsec *ipsec;
+	netdev_features_t features;
 	size_t size;
 
 	if (adapter->hw.mac.type == ixgbe_mac_82598EB)
@@ -929,8 +930,10 @@ void ixgbe_init_ipsec_offload(struct ixgbe_adapter *adapter)
 	ixgbe_ipsec_clear_hw_tables(adapter);
 
 	adapter->netdev->xfrmdev_ops = &ixgbe_xfrmdev_ops;
-	adapter->netdev->features |= NETIF_F_HW_ESP;
-	adapter->netdev->hw_enc_features |= NETIF_F_HW_ESP;
+
+	features = NETIF_F_HW_ESP | NETIF_F_HW_ESP_TX_CSUM | NETIF_F_GSO_ESP;
+	adapter->netdev->features |= features;
+	adapter->netdev->hw_enc_features |= features;
 
 	return;
 
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
index a54f3d8..6022666 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
@@ -7721,9 +7721,11 @@ static void ixgbe_service_task(struct work_struct *work)
 
 static int ixgbe_tso(struct ixgbe_ring *tx_ring,
 		     struct ixgbe_tx_buffer *first,
-		     u8 *hdr_len)
+		     u8 *hdr_len,
+		     struct ixgbe_ipsec_tx_data *itd)
 {
 	u32 vlan_macip_lens, type_tucmd, mss_l4len_idx;
+	u32 fceof_saidx = 0;
 	struct sk_buff *skb = first->skb;
 	union {
 		struct iphdr *v4;
@@ -7762,9 +7764,13 @@ static int ixgbe_tso(struct ixgbe_ring *tx_ring,
 		unsigned char *trans_start = ip.hdr + (ip.v4->ihl * 4);
 
 		/* IP header will have to cancel out any data that
-		 * is not a part of the outer IP header
+		 * is not a part of the outer IP header, except for
+		 * IPsec where we want the IP+ESP header.
 		 */
-		ip.v4->check = csum_fold(csum_partial(trans_start,
+		if (first->tx_flags & IXGBE_TX_FLAGS_IPSEC)
+			ip.v4->check = 0;
+		else
+			ip.v4->check = csum_fold(csum_partial(trans_start,
 						      csum_start - trans_start,
 						      0));
 		type_tucmd |= IXGBE_ADVTXD_TUCMD_IPV4;
@@ -7797,12 +7803,15 @@ static int ixgbe_tso(struct ixgbe_ring *tx_ring,
 	mss_l4len_idx = (*hdr_len - l4_offset) << IXGBE_ADVTXD_L4LEN_SHIFT;
 	mss_l4len_idx |= skb_shinfo(skb)->gso_size << IXGBE_ADVTXD_MSS_SHIFT;
 
+	fceof_saidx |= itd->sa_idx;
+	type_tucmd |= itd->flags | itd->trailer_len;
+
 	/* vlan_macip_lens: HEADLEN, MACLEN, VLAN tag */
 	vlan_macip_lens = l4.hdr - ip.hdr;
 	vlan_macip_lens |= (ip.hdr - skb->data) << IXGBE_ADVTXD_MACLEN_SHIFT;
 	vlan_macip_lens |= first->tx_flags & IXGBE_TX_FLAGS_VLAN_MASK;
 
-	ixgbe_tx_ctxtdesc(tx_ring, vlan_macip_lens, 0, type_tucmd,
+	ixgbe_tx_ctxtdesc(tx_ring, vlan_macip_lens, fceof_saidx, type_tucmd,
 			  mss_l4len_idx);
 
 	return 1;
@@ -8493,7 +8502,8 @@ netdev_tx_t ixgbe_xmit_frame_ring(struct sk_buff *skb,
 	if (skb->sp && !ixgbe_ipsec_tx(tx_ring, first, &ipsec_tx))
 		goto out_drop;
 #endif
-	tso = ixgbe_tso(tx_ring, first, &hdr_len);
+
+	tso = ixgbe_tso(tx_ring, first, &hdr_len, &ipsec_tx);
 	if (tso < 0)
 		goto out_drop;
 	else if (!tso)
@@ -9902,8 +9912,11 @@ ixgbe_features_check(struct sk_buff *skb, struct net_device *dev,
 
 	/* We can only support IPV4 TSO in tunnels if we can mangle the
 	 * inner IP ID field, so strip TSO if MANGLEID is not supported.
+	 * IPsec offoad sets skb->encapsulation but still can handle
+	 * the TSO, so it's the exception.
 	 */
-	if (skb->encapsulation && !(features & NETIF_F_TSO_MANGLEID))
+	if (skb->encapsulation && !(features & NETIF_F_TSO_MANGLEID) &&
+	    !skb->sp)
 		features &= ~NETIF_F_TSO;
 
 	return features;
-- 
2.7.4

^ permalink raw reply related

* [next-queue 3/4] ixgbe: no need for esp trailer if gso
From: Shannon Nelson @ 2018-03-15 21:23 UTC (permalink / raw)
  To: intel-wired-lan; +Cc: netdev, steffen.klassert
In-Reply-To: <1521149003-1433-1-git-send-email-shannon.nelson@oracle.com>

There is no need to calculate the trailer length if we're doing
a GSO/TSO, as there is no trailer added to the packet data.
Also, don't bother clearing the flags field as it was already
cleared earlier.

Signed-off-by: Shannon Nelson <shannon.nelson@oracle.com>
---
 drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c | 37 +++++++++++++++-----------
 1 file changed, 21 insertions(+), 16 deletions(-)

diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c
index f225452..5ddea43 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c
@@ -774,11 +774,7 @@ int ixgbe_ipsec_tx(struct ixgbe_ring *tx_ring,
 
 	first->tx_flags |= IXGBE_TX_FLAGS_IPSEC | IXGBE_TX_FLAGS_CC;
 
-	itd->flags = 0;
 	if (xs->id.proto == IPPROTO_ESP) {
-		struct sk_buff *skb = first->skb;
-		int ret, authlen, trailerlen;
-		u8 padlen;
 
 		itd->flags |= IXGBE_ADVTXD_TUCMD_IPSEC_TYPE_ESP |
 			      IXGBE_ADVTXD_TUCMD_L4T_TCP;
@@ -790,19 +786,28 @@ int ixgbe_ipsec_tx(struct ixgbe_ring *tx_ring,
 		 * padlen bytes of padding.  This ends up not the same
 		 * as the static value found in xs->props.trailer_len (21).
 		 *
-		 * The "correct" way to get the auth length would be to use
-		 *    authlen = crypto_aead_authsize(xs->data);
-		 * but since we know we only have one size to worry about
-		 * we can let the compiler use the constant and save us a
-		 * few CPU cycles.
+		 * ... but if we're doing GSO, don't bother as the stack
+		 * doesn't add a trailer for those.
 		 */
-		authlen = IXGBE_IPSEC_AUTH_BITS / 8;
-
-		ret = skb_copy_bits(skb, skb->len - (authlen + 2), &padlen, 1);
-		if (unlikely(ret))
-			return 0;
-		trailerlen = authlen + 2 + padlen;
-		itd->trailer_len = trailerlen;
+		if (!skb_is_gso(first->skb)) {
+			/* The "correct" way to get the auth length would be
+			 * to use
+			 *    authlen = crypto_aead_authsize(xs->data);
+			 * but since we know we only have one size to worry
+			 * about * we can let the compiler use the constant
+			 * and save us a few CPU cycles.
+			 */
+			const int authlen = IXGBE_IPSEC_AUTH_BITS / 8;
+			struct sk_buff *skb = first->skb;
+			u8 padlen;
+			int ret;
+
+			ret = skb_copy_bits(skb, skb->len - (authlen + 2),
+					    &padlen, 1);
+			if (unlikely(ret))
+				return 0;
+			itd->trailer_len = authlen + 2 + padlen;
+		}
 	}
 	if (tsa->encrypt)
 		itd->flags |= IXGBE_ADVTXD_TUCMD_IPSEC_ENCRYPT_EN;
-- 
2.7.4

^ permalink raw reply related

* Re: [RFC 2/2] page_frag_cache: Store metadata in struct page
From: Alexander Duyck @ 2018-03-15 21:26 UTC (permalink / raw)
  To: Matthew Wilcox; +Cc: Alexander Duyck, linux-mm, Netdev, Matthew Wilcox
In-Reply-To: <20180315195329.7787-3-willy@infradead.org>

On Thu, Mar 15, 2018 at 12:53 PM, Matthew Wilcox <willy@infradead.org> wrote:
> From: Matthew Wilcox <mawilcox@microsoft.com>
>
> Shrink page_frag_cache from 24 to 8 bytes (a single pointer to the
> currently-in-use struct page) by using the page's refcount directly
> (instead of maintaining a bias) and storing our current progress through
> the page in the same bits currently used for page->index.  We no longer
> need to reflect the page pfmemalloc state if we're storing the page
> directly.
>
> On the downside, we now call page_address() on every allocation, and we
> do an atomic_inc() rather than a non-atomic decrement, but we should
> touch the same number of cachelines and there is far less code (and
> the code is less complex).
>
> Signed-off-by: Matthew Wilcox <mawilcox@microsoft.com>

So my concern with this patch is that it is essentially trading off
CPU performance for reduced size. One of the reasons for getting away
from using the page pointer is because it is expensive to access the
page when the ref_count is bouncing on multiple cache lines. In
addition converting from a page to a virtual address is actually more
expensive then you would think it should be. I went through and
optimized that the best I could, but there is still a pretty
significant cost to the call.

I won't be able to test the patches until next week, but I expect I
will probably see a noticeable regression when performing a small
packet routing test.

- Alex

^ permalink raw reply

* Re: [PATCH RFC 0/7] net: phy: patch series aiming to improve few aspects of phylib
From: Heiner Kallweit @ 2018-03-15 21:08 UTC (permalink / raw)
  To: Florian Fainelli, Andrew Lunn; +Cc: Geert Uytterhoeven, netdev@vger.kernel.org
In-Reply-To: <90ec1e79-d5fd-2871-3b99-fcc8e51804d6@gmail.com>

Am 15.03.2018 um 00:53 schrieb Florian Fainelli:
> On 03/14/2018 01:10 PM, Heiner Kallweit wrote:
>> This patch series aims to tackle few issues with phylib:
>>  
>> - address issues with patch series [1] (smsc911x + phylib changes)
>> - make phy_stop synchronous
>> - get rid of phy_start/stop_machine and handle it in phy_start/phy_stop
>> - in mdio_suspend consider runtime pm state of mdio bus parent
>> - consider more WOL conditions when deciding whether PHY is allowed to
>>   suspend
>> - only resume phy after system suspend if needed
>>
>> [1] https://www.mail-archive.com/netdev@vger.kernel.org/msg196061.html
>>
>> It works fine here but other NIC drivers may use phylib differently. 
>> Therefore I'd appreciate feedback and more testing.
>>
>> I could think of some subsequent patches, e.g. phy_error() could be
>> reduced to calling phy_stop() and printing an error message
>> (today it silently sets the PHY state to PHY_HALTED).
> 
> Thanks for the patch series, I will give it a spin on a number of
> devices using different PHYLIB integration and see if something breaks.
> 
Great, and thanks for the immediate feedback.
I'll prepare a v2 based on it, also considerung Geert's feedback.

>>
>> Heiner Kallweit (7):
>>   net: phy: unconditionally resume and re-enable interrupts in phy_start
>>   net: phy: improve checking for when PHY is allowed to suspend
>>   net: phy: resume PHY only if needed in mdio_bus_phy_suspend
>>   net: phy: remove phy_start_machine
>>   net: phy: make phy_stop synchronous
>>   net: phy: use new function phy_stop_suspending in mdio_bus_phy_suspend
>>   net: phy: remove phy_stop_machine
>>
>>  drivers/net/phy/phy.c        | 102 +++++++++++++++++--------------------------
>>  drivers/net/phy/phy_device.c |  80 ++++++++++++++++++++-------------
>>  drivers/net/phy/phylink.c    |   1 -
>>  include/linux/phy.h          |  14 ++++--
>>  4 files changed, 100 insertions(+), 97 deletions(-)
>>
> 
> 

^ permalink raw reply

* Re: [PATCH RFC 3/7] net: phy: resume PHY only if needed in, mdio_bus_phy_suspend
From: Heiner Kallweit @ 2018-03-15 21:25 UTC (permalink / raw)
  To: Florian Fainelli, Andrew Lunn; +Cc: Geert Uytterhoeven, netdev@vger.kernel.org
In-Reply-To: <0215270c-af88-5720-457c-2113712365b2@gmail.com>

Am 15.03.2018 um 00:50 schrieb Florian Fainelli:
> On 03/14/2018 01:16 PM, Heiner Kallweit wrote:
>> Currently the PHY is unconditionally resumed in mdio_bus_phy_suspend().
>> In cases where the PHY was sleepinh before suspending or if somebody else
>> takes care of resuming later, this is not needed and wastes energy.
>>
>> Also start the state machine only if it's used by the driver (indicated
>> by the adjust_link callback being defined).
>>
>> Signed-off-by: Heiner Kallweit <hkallweit1@gmail.com>
>> ---
>>  drivers/net/phy/phy_device.c | 33 +++++++++++++++++++++++----------
>>  1 file changed, 23 insertions(+), 10 deletions(-)
>>
>> diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c
>> index a5691536f..c6fd79758 100644
>> --- a/drivers/net/phy/phy_device.c
>> +++ b/drivers/net/phy/phy_device.c
>> @@ -124,6 +124,18 @@ static bool phy_may_suspend(struct phy_device *phydev)
>>  }
>>  
>>  #ifdef CONFIG_PM
>> +
>> +static bool mdio_bus_phy_needs_start(struct phy_device *phydev)
>> +{
>> +	bool start;
> 
> How about needs_start? This is uber nitpicking but it seems to be more
> in line with what is being tested for here.
> 
Agree ..

>> +
>> +	mutex_lock(&phydev->lock);
>> +	start = phydev->state == PHY_UP && phydev->adjust_link;
> 
> You probably need to add an || phydev->phy_link_change here because that
> is what PHYLINK uses, it does not register an adjust_link callback, but
> would likely expect similar semantics. Even better, introduce a helper
> function that tests for both to avoid possible issues...
> 

phydev->phy_link_change is set in phy_attach_direct(). Therefore it's
always set if the device is attached. And mdio_bus_phy_needs_start()
is only used after we have verified that the device is attached.
Having said that I don't see when phydev->phy_link_change could
be NULL.

When talking about phydev->phy_link_change, why does it exist at all?
I found no driver setting an own callback, replacing the default
phy_link_change(). So we could use the default directly.
Or in which use case would a driver set an own callback?

>> +	mutex_unlock(&phydev->lock);
>> +
>> +	return start;
>> +}
>> +
>>  static int mdio_bus_phy_suspend(struct device *dev)
>>  {
>>  	struct phy_device *phydev = to_phy_device(dev);
>> @@ -142,25 +154,25 @@ static int mdio_bus_phy_suspend(struct device *dev)
>>  static int mdio_bus_phy_resume(struct device *dev)
>>  {
>>  	struct phy_device *phydev = to_phy_device(dev);
>> -	int ret;
>> +	int ret = 0;
>>  
>> -	ret = phy_resume(phydev);
>> -	if (ret < 0)
>> -		return ret;
>> +	if (!phydev->attached_dev)
>> +		return 0;
>>  
>> -	if (phydev->attached_dev && phydev->adjust_link)
>> -		phy_start_machine(phydev);
>> +	if (mdio_bus_phy_needs_start(phydev))
>> +		phy_start(phydev);
>> +	else if (!phydev->adjust_link)
>> +		ret = phy_resume(phydev);
> 
> Humm, under which conditions can you not have phydev->attached_dev and
> also not phydev->adjust_link being set? As mentioned earlier, you would
> likely need to test for phy_link_change too here.
> 
We come here only if phydev->attached_dev is set. If this is the case
and phydev->adjust_link is not set this indicates that the driver
doesn't use the phylib state machine.
And in this case I'd prefer to just call phy_resume().

>>  
>> -	return 0;
>> +	return ret;
>>  }
>>  
>>  static int mdio_bus_phy_restore(struct device *dev)
>>  {
>>  	struct phy_device *phydev = to_phy_device(dev);
>> -	struct net_device *netdev = phydev->attached_dev;
>>  	int ret;
>>  
>> -	if (!netdev)
>> +	if (!phydev->attached_dev)
>>  		return 0;
> 
> That does not seem to be making any functional difference, so I would
> just drop this for now.
> 
>>  
>>  	ret = phy_init_hw(phydev);
>> @@ -171,7 +183,8 @@ static int mdio_bus_phy_restore(struct device *dev)
>>  	phydev->link = 0;
>>  	phydev->state = PHY_UP;
>>  
>> -	phy_start_machine(phydev);
>> +	if (mdio_bus_phy_needs_start(phydev))
>> +		phy_start(phydev);
>>  
>>  	return 0;
>>  }
>>
> 
> 

^ permalink raw reply

* Re: [PATCH RFC 0/7] net: phy: patch series aiming to improve few aspects of phylib
From: Heiner Kallweit @ 2018-03-15 21:34 UTC (permalink / raw)
  To: Geert Uytterhoeven
  Cc: Florian Fainelli, Andrew Lunn, Geert Uytterhoeven,
	netdev@vger.kernel.org
In-Reply-To: <CAMuHMdUR8U5hEnbsPOrWpMHaLYMs=1dsNJLgAO6_TrzpBjFW0Q@mail.gmail.com>

Am 15.03.2018 um 11:07 schrieb Geert Uytterhoeven:
> Hi Heiner,
> 
> On Wed, Mar 14, 2018 at 9:10 PM, Heiner Kallweit <hkallweit1@gmail.com> wrote:
>> This patch series aims to tackle few issues with phylib:
>>
>> - address issues with patch series [1] (smsc911x + phylib changes)
>> - make phy_stop synchronous
>> - get rid of phy_start/stop_machine and handle it in phy_start/phy_stop
>> - in mdio_suspend consider runtime pm state of mdio bus parent
>> - consider more WOL conditions when deciding whether PHY is allowed to
>>   suspend
>> - only resume phy after system suspend if needed
>>
>> [1] https://www.mail-archive.com/netdev@vger.kernel.org/msg196061.html
>>
>> It works fine here but other NIC drivers may use phylib differently.
>> Therefore I'd appreciate feedback and more testing.
>>
>> I could think of some subsequent patches, e.g. phy_error() could be
>> reduced to calling phy_stop() and printing an error message
>> (today it silently sets the PHY state to PHY_HALTED).
>>
>> Heiner Kallweit (7):
>>   net: phy: unconditionally resume and re-enable interrupts in phy_start
>>   net: phy: improve checking for when PHY is allowed to suspend
>>   net: phy: resume PHY only if needed in mdio_bus_phy_suspend
>>   net: phy: remove phy_start_machine
>>   net: phy: make phy_stop synchronous
>>   net: phy: use new function phy_stop_suspending in mdio_bus_phy_suspend
>>   net: phy: remove phy_stop_machine
> 
> Thanks for your series!
> 
> I've gave this a try on a few machines, incl. r8a73a4/ape6evm and
> sh73a0/kzm9g, which have smsc911x Ethernet chips on a power-managed bus.
> 
> On both machines it crashes during system suspend, which means the smsc911c's
> registers are accessed while the device is suspended:
> 
> PM: suspend entry (deep)
> PM: Syncing filesystems ... done.
> Freezing user space processes ... (elapsed 0.001 seconds) done.
> OOM killer disabled.
> Freezing remaining freezable tasks ... (elapsed 0.001 seconds) done.
> PM: suspend devices took 0.130 seconds
> Disabling non-boot CPUs ...
> Unhandled fault: imprecise external abort (0x1406) at 0x000ce408
> pgd = f4465d7b
> [000ce408] *pgd=00000000
> Internal error: : 1406 [#1] SMP ARM
> Modules linked in:
> CPU: 1 PID: 20 Comm: kworker/1:1 Not tainted
> 4.16.0-rc5-kzm9g-00470-g319cfb3643965f46-dirty #1030
> Hardware name: Generic SH73A0 (Flattened Device Tree)
> Workqueue: events linkwatch_event
> PC is at __smsc911x_reg_read+0x1c/0x60
> LR is at smsc911x_tx_get_txstatus+0x2c/0x7c
> pc : [<c03eefd4>]    lr : [<c03ef820>]    psr: 20010093
> sp : df51bd38  ip : df51bce0  fp : 00000000
> r10: 00000000  r9 : 00000000  r8 : c0909b58
> r7 : a0010013  r6 : df636e08  r5 : df636dc0  r4 : df636800
> r3 : e0903000  r2 : 00000001  r1 : e0903080  r0 : 00000000
> Flags: nzCv  IRQs off  FIQs on  Mode SVC_32  ISA ARM  Segment none
> Control: 10c5387d  Table: 5ee4004a  DAC: 00000051
> Process kworker/1:1 (pid: 20, stack limit = 0x1e2af6bb)
> Stack: (0xdf51bd38 to 0xdf51c000)
> bd20:                                                       c03efb14 df636800
> bd40: df636dc0 c063c198 df51bdb0 c03efa80 c03efb14 df636800 df636800 c03efb20
> bd60: c03efb14 dec5e8f4 df636800 c063c198 df51bdb0 c04b4494 dec5e8f0 dec4ea80
> bd80: df636800 c04d7c28 dec4ea80 df636800 dec5e800 c04d3d68 0000002a 00000000
> bda0: c04d3990 c020af0c df400a80 00000000 00000000 00000000 00000000 00000000
> bdc0: 00000000 00000000 00000050 00000000 df51be03 c04a5828 00000580 c04a5758
> bde0: dec4ea80 000004db 014000c0 c0908448 00000001 c04a58a0 df51be03 c04d14e0
> be00: 00000000 3cef0b86 c04d13bc dec4ea80 df636800 00000010 00000000 00000000
> be20: df636800 00000000 c0931b44 c04d73c0 00000000 00000000 00000000 00000000
> be40: 00000000 00000000 00000000 ffffffff 014000c0 df636800 c0931ad8 df51bed4
> be60: c0931ad8 c04d7468 014000c0 00000000 00000000 c014404c c0908448 c0908448
> be80: df636800 c04d7534 014000c0 00000000 00000000 014000c0 c0908448 c04b9d8c
> bea0: df636800 00000000 00000000 3cef0b86 c0931b44 df636800 c0931b44 c04d8854
> bec0: df636aac c04d8b10 df51bf2c c0908448 00000000 df51bed4 df51bed4 3cef0b86
> bee0: df51bf2c df50dc80 c0931ad8 dfbdaac0 df51bf2c dfbddd00 00000000 00000001
> bf00: 00000000 c04d8b98 c04d8b74 c013cc8c 00000001 00000000 c013cc14 c013d214
> bf20: c0908448 00000000 00000004 c0931ad8 00000000 00000000 c075f7d9 3cef0b86
> bf40: c0905900 df50dc80 dfbdaac0 dfbdaac0 df51a000 dfbdaaf4 c0905900 df50dc98
> bf60: 00000008 c013d4b0 df518540 df50de80 df5110c0 00000000 df491eb0 df50dc80
> bf80: c013d1e4 df50deb8 00000000 c014293c df5110c0 c014281c 00000000 00000000
> bfa0: 00000000 00000000 00000000 c01010b4 00000000 00000000 00000000 00000000
> bfc0: 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000
> bfe0: 00000000 00000000 00000000 00000000 00000013 00000000 7fdfffff fff7fdff
> [<c03eefd4>] (__smsc911x_reg_read) from [<c03ef820>]
> (smsc911x_tx_get_txstatus+0x2c/0x7c)
> [<c03ef820>] (smsc911x_tx_get_txstatus) from [<c03efa80>]
> (smsc911x_tx_update_txcounters+0x14/0xa8)
> [<c03efa80>] (smsc911x_tx_update_txcounters) from [<c03efb20>]
> (smsc911x_get_stats+0xc/0x58)
> [<c03efb20>] (smsc911x_get_stats) from [<c04b4494>] (dev_get_stats+0x58/0xa8)
> [<c04b4494>] (dev_get_stats) from [<c04d7c28>] (rtnl_fill_stats+0x38/0x118)
> [<c04d7c28>] (rtnl_fill_stats) from [<c04d3d68>]
> (rtnl_fill_ifinfo.constprop.12+0x6a8/0x105c)
> [<c04d3d68>] (rtnl_fill_ifinfo.constprop.12) from [<c04d73c0>]
> (rtmsg_ifinfo_build_skb+0x7c/0xd0)
> [<c04d73c0>] (rtmsg_ifinfo_build_skb) from [<c04d7468>]
> (rtmsg_ifinfo_event.part.6+0x28/0x4c)
> [<c04d7468>] (rtmsg_ifinfo_event.part.6) from [<c04d7534>]
> (rtmsg_ifinfo+0x24/0x2c)
> [<c04d7534>] (rtmsg_ifinfo) from [<c04b9d8c>] (netdev_state_change+0x5c/0x80)
> [<c04b9d8c>] (netdev_state_change) from [<c04d8854>]
> (linkwatch_do_dev+0x50/0x74)
> [<c04d8854>] (linkwatch_do_dev) from [<c04d8b10>]
> (__linkwatch_run_queue+0x138/0x19c)
> [<c04d8b10>] (__linkwatch_run_queue) from [<c04d8b98>]
> (linkwatch_event+0x24/0x34)
> [<c04d8b98>] (linkwatch_event) from [<c013cc8c>] (process_one_work+0x250/0x41c)
> [<c013cc8c>] (process_one_work) from [<c013d4b0>] (worker_thread+0x2cc/0x40c)
> [<c013d4b0>] (worker_thread) from [<c014293c>] (kthread+0x120/0x140)
> [<c014293c>] (kthread) from [<c01010b4>] (ret_from_fork+0x14/0x20)
> Exception stack(0xdf51bfb0 to 0xdf51bff8)
> bfa0:                                     00000000 00000000 00000000 00000000
> bfc0: 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000
> bfe0: 00000000 00000000 00000000 00000000 00000013 00000000
> Code: e5903000 e0831001 e5910000 f57ff04f (e12fff1e)
> 
> I've bisected this to  "net: phy: use new function phy_stop_suspending in,
> mdio_bus_phy_suspend".
> 
Thanks a lot for testing and the quick feedback!
Reason for the problem seems to be that mdio_bus_phy_suspend() now includes
a call to phy_link_down() which eventually fires an asynchronous linkwatch
event. And this asynchronous event (accessing the chip registers) is
processed only after the chip has been powered down.

Maybe it's sufficient if I set the do_carrier parameter of phy_link_down()
to false if suspending. Have to spend few more thoughts on this.

Regards, Heiner


> Gr{oetje,eeting}s,
> 
>                         Geert
> 
> --
> Geert Uytterhoeven -- There's lots of Linux beyond ia32 -- geert@linux-m68k.org
> 
> In personal conversations with technical people, I call myself a hacker. But
> when I'm talking to journalists I just say "programmer" or something like that.
>                                 -- Linus Torvalds
> 

^ permalink raw reply

* Re: [RFC net-next 2/6] driver: net: bonding: allow registration of tc offload callbacks in bond
From: Or Gerlitz @ 2018-03-15 21:38 UTC (permalink / raw)
  To: Jiri Pirko
  Cc: Jiri Pirko, Rabie Loulou, John Hurley, Jakub Kicinski,
	Simon Horman, Linux Netdev List, ASAP_Direct_Dev, mlxsw
In-Reply-To: <20180314155640.GF2130@nanopsycho>

On Wed, Mar 14, 2018 at 5:56 PM, Jiri Pirko <jiri@resnulli.us> wrote:
> Wed, Mar 14, 2018 at 12:23:59PM CET, gerlitz.or@gmail.com wrote:
>>On Wed, Mar 14, 2018 at 11:50 AM, Jiri Pirko <jiri@resnulli.us> wrote:
>>> Tue, Mar 13, 2018 at 04:51:02PM CET, gerlitz.or@gmail.com wrote:
>>>>On Wed, Mar 7, 2018 at 12:57 PM, Jiri Pirko <jiri@resnulli.us> wrote:
>>
>>>>This sounds nice for the case where one install ingress tc rules on
>>>>the bond (lets
>>>>call them type 1, see next)
>>>>
>>>>One obstacle pointed by my colleague, Rabie, is that when the upper layer
>>>>issues stat call on the filter, they will get two replies, this can confuse them
>>>>and lead to wrong decisions (aging). I wonder if/how we can set a knob
>>>
>>> The bonding itself would not do anything on stats update
>>> command (TC_CLSFLOWER_STATS for example). Only the slaves would do
>>> update. So there will be only reply from slaves.
>>>
>>> Bond/team is just going to probagare block bind/unbind down. Nothing else.
>>
>>Do we agree that user space will get the replies of all lower (slave) devices,
>>or I am missing something here?
>
> "user space will get the replies" - not sure what exactly do you mean by
> this. The stats would be accumulated over all devices/drivers who
> registered block callback.

OK, this is probably something I have to check, thanks


>>>>2. bond being egress port of a rule
>>>>2.1 VF rep --> uplink 0
>>>>2.2 VF rep --> uplink 1
>>>>
>>>>and we do that in the driver (add/del two HW rules, combine the stat
>>>>results, etc)
>>>
>>> That is up to the driver. If the driver can share block between 2
>>> devices, he can do that. If he cannot share, it will just report stats
>>> for every device separatelly (2 block cbs registered) and tc will see
>>> them both together. No need to do anything in driver.
>>
>>right
>>
>>>>3. ingress rule on VF rep port with shared tunnel device being the
>>>>egress (encap)
>>>>and where the routing of the underlay (tunnel) goes through LAG.
>>
>>> Same as "2."
>>
>>ok
>>
>>>>4. ingress rule shared tunnel device being the ingress and VF rep port being the egress (decap)

>>> I don't follow :(

>> the way tunneling is handled in tc classifier/action is

>> encap:  ingress: net port, action1: tunnel key set action2: mirred to
>> shared-tunnel device

>> decap: ingress: shared tunnel device, action1: tunnel key unset
>> action2: mirred to net port

>> type 4 are the decap rules, when we offload it to as HW ACL we stretch
>> the line and the ingress in a HW port too (e.g uplink port in NICs)

> Okay, I see. But where's the bond here? Is it the one I mentioned as
> "mirred redirect to lag"?

since the ingress port is not HW port, we will use the egdev approach
and offload the rule as the uplink of this VF rep port being the ingress.

Since we will see that this uplink is into LAG, we will offload another rule
which the 2nd uplink being the ingress

>>> I see another thing we need to sanitize: vxlan rule ingress match action
>>> mirred redirect to lag
>>right, we don't have  for NIC but for switch ASIC, I guess it is applicable
> Yes, it is. For future NICs I guess it is going to be as well.

might

^ permalink raw reply

* Re: [PATCH v4 1/2] kernel.h: Introduce const_max() for VLA removal
From: Linus Torvalds @ 2018-03-15 21:42 UTC (permalink / raw)
  To: Kees Cook
  Cc: Andrew Morton, Josh Poimboeuf, Rasmus Villemoes, Randy Dunlap,
	Miguel Ojeda, Ingo Molnar, David Laight, Ian Abbott, linux-input,
	linux-btrfs, Network Development, Linux Kernel Mailing List,
	Kernel Hardening
In-Reply-To: <1521143266-31350-2-git-send-email-keescook@chromium.org>

On Thu, Mar 15, 2018 at 12:47 PM, Kees Cook <keescook@chromium.org> wrote:
>
> To gain the ability to compare differing types, the arguments are
> explicitly cast to size_t.

Ugh, I really hate this.

It silently does insane things if you do

   const_max(-1,6)

and there is nothing in the name that implies that you can't use
negative constants.

                   Linus

^ permalink raw reply

* Re: [bpf-next PATCH v2 06/18] bpf: sockmap, add bpf_msg_apply_bytes() helper
From: Alexei Starovoitov @ 2018-03-15 21:45 UTC (permalink / raw)
  To: John Fastabend; +Cc: davem, ast, daniel, davejwatson, netdev
In-Reply-To: <20180312192334.8039.98220.stgit@john-Precision-Tower-5810>

On Mon, Mar 12, 2018 at 12:23:34PM -0700, John Fastabend wrote:
> A single sendmsg or sendfile system call can contain multiple logical
> messages that a BPF program may want to read and apply a verdict. But,
> without an apply_bytes helper any verdict on the data applies to all
> bytes in the sendmsg/sendfile. Alternatively, a BPF program may only
> care to read the first N bytes of a msg. If the payload is large say
> MB or even GB setting up and calling the BPF program repeatedly for
> all bytes, even though the verdict is already known, creates
> unnecessary overhead.
> 
> To allow BPF programs to control how many bytes a given verdict
> applies to we implement a bpf_msg_apply_bytes() helper. When called
> from within a BPF program this sets a counter, internal to the
> BPF infrastructure, that applies the last verdict to the next N
> bytes. If the N is smaller than the current data being processed
> from a sendmsg/sendfile call, the first N bytes will be sent and
> the BPF program will be re-run with start_data pointing to the N+1
> byte. If N is larger than the current data being processed the
> BPF verdict will be applied to multiple sendmsg/sendfile calls
> until N bytes are consumed.
> 
> Note1 if a socket closes with apply_bytes counter non-zero this
> is not a problem because data is not being buffered for N bytes
> and is sent as its received.
> 
> Note2 if this is operating in the sendpage context the data
> pointers may be zeroed after this call if the apply walks beyond
> a msg_pull_data() call specified data range. (helper implemented
> shortly in this series).

instead of 'shortly in this seris' you meant 'implemented earlier'?
patch 5 handles it, but it's set here, right?

The semantics of the helper looks great.

^ permalink raw reply

* Re: [bpf-next PATCH v2 06/18] bpf: sockmap, add bpf_msg_apply_bytes() helper
From: John Fastabend @ 2018-03-15 21:59 UTC (permalink / raw)
  To: Alexei Starovoitov; +Cc: davem, ast, daniel, davejwatson, netdev
In-Reply-To: <20180315214554.2gspfp255qqxbjcb@ast-mbp>

On 03/15/2018 02:45 PM, Alexei Starovoitov wrote:
> On Mon, Mar 12, 2018 at 12:23:34PM -0700, John Fastabend wrote:
>> A single sendmsg or sendfile system call can contain multiple logical
>> messages that a BPF program may want to read and apply a verdict. But,
>> without an apply_bytes helper any verdict on the data applies to all
>> bytes in the sendmsg/sendfile. Alternatively, a BPF program may only
>> care to read the first N bytes of a msg. If the payload is large say
>> MB or even GB setting up and calling the BPF program repeatedly for
>> all bytes, even though the verdict is already known, creates
>> unnecessary overhead.
>>
>> To allow BPF programs to control how many bytes a given verdict
>> applies to we implement a bpf_msg_apply_bytes() helper. When called
>> from within a BPF program this sets a counter, internal to the
>> BPF infrastructure, that applies the last verdict to the next N
>> bytes. If the N is smaller than the current data being processed
>> from a sendmsg/sendfile call, the first N bytes will be sent and
>> the BPF program will be re-run with start_data pointing to the N+1
>> byte. If N is larger than the current data being processed the
>> BPF verdict will be applied to multiple sendmsg/sendfile calls
>> until N bytes are consumed.
>>
>> Note1 if a socket closes with apply_bytes counter non-zero this
>> is not a problem because data is not being buffered for N bytes
>> and is sent as its received.
>>
>> Note2 if this is operating in the sendpage context the data
>> pointers may be zeroed after this call if the apply walks beyond
>> a msg_pull_data() call specified data range. (helper implemented
>> shortly in this series).
> 
> instead of 'shortly in this seris' you meant 'implemented earlier'?
> patch 5 handles it, but it's set here, right?
> 

Yep just a hold-over from an earlier patch description. I'll remove
that entire note2 and fixup a couple small things Daniel noticed
with a v3.

> The semantics of the helper looks great.
> 

Great!

^ permalink raw reply

* Re: [bpf-next PATCH v2 05/18] bpf: create tcp_bpf_ulp allowing BPF to monitor socket TX/RX data
From: Alexei Starovoitov @ 2018-03-15 21:59 UTC (permalink / raw)
  To: John Fastabend; +Cc: davem, ast, daniel, davejwatson, netdev
In-Reply-To: <20180312192329.8039.75277.stgit@john-Precision-Tower-5810>

On Mon, Mar 12, 2018 at 12:23:29PM -0700, John Fastabend wrote:
>  
> +/* User return codes for SK_MSG prog type. */
> +enum sk_msg_action {
> +	SK_MSG_DROP = 0,
> +	SK_MSG_PASS,
> +};

do we really need new enum here?
It's the same as 'enum sk_action' and SK_DROP == SK_MSG_DROP
and there will be only drop/pass in both enums.
Also I don't see where these two new SK_MSG_* are used...

> +
> +/* user accessible metadata for SK_MSG packet hook, new fields must
> + * be added to the end of this structure
> + */
> +struct sk_msg_md {
> +	__u32 data;
> +	__u32 data_end;
> +};

I think it's time for me to ask for forgiveness :)
I used __u32 for data and data_end only because all other fields
in __sk_buff were __u32 at the time and I couldn't easily figure out
how to teach verifier to recognize 8-byte rewrites.
Unfortunately my mistake stuck and was copied over into xdp.
Since this is new struct let's do it right and add
'void *data, *data_end' here,
since bpf prog will use them as 'void *' pointers.
There are no compat issues here, since bpf is always 64-bit.

> +static int bpf_map_msg_verdict(int _rc, struct sk_msg_buff *md)
> +{
> +	return ((_rc == SK_PASS) ?
> +	       (md->map ? __SK_REDIRECT : __SK_PASS) :
> +	       __SK_DROP);

you're using old SK_PASS here too ;)
that's to my point of not adding SK_MSG_PASS...

Overall the patch set looks absolutely great.
Thank you for working on it.

^ permalink raw reply

* Re: [bpf-next PATCH v2 06/18] bpf: sockmap, add bpf_msg_apply_bytes() helper
From: John Fastabend @ 2018-03-15 22:02 UTC (permalink / raw)
  To: Daniel Borkmann, davem, ast, davejwatson; +Cc: netdev
In-Reply-To: <0ae5f4fd-2916-01e2-dba1-bd2505a1677c@iogearbox.net>

On 03/15/2018 01:32 PM, Daniel Borkmann wrote:
> On 03/12/2018 08:23 PM, John Fastabend wrote:
>> A single sendmsg or sendfile system call can contain multiple logical
>> messages that a BPF program may want to read and apply a verdict. But,
>> without an apply_bytes helper any verdict on the data applies to all
>> bytes in the sendmsg/sendfile. Alternatively, a BPF program may only
>> care to read the first N bytes of a msg. If the payload is large say
>> MB or even GB setting up and calling the BPF program repeatedly for
>> all bytes, even though the verdict is already known, creates
>> unnecessary overhead.
>>
>> To allow BPF programs to control how many bytes a given verdict
>> applies to we implement a bpf_msg_apply_bytes() helper. When called
>> from within a BPF program this sets a counter, internal to the
>> BPF infrastructure, that applies the last verdict to the next N
>> bytes. If the N is smaller than the current data being processed
>> from a sendmsg/sendfile call, the first N bytes will be sent and
>> the BPF program will be re-run with start_data pointing to the N+1
>> byte. If N is larger than the current data being processed the
>> BPF verdict will be applied to multiple sendmsg/sendfile calls
>> until N bytes are consumed.
>>
>> Note1 if a socket closes with apply_bytes counter non-zero this
>> is not a problem because data is not being buffered for N bytes
>> and is sent as its received.
>>
>> Note2 if this is operating in the sendpage context the data
>> pointers may be zeroed after this call if the apply walks beyond
>> a msg_pull_data() call specified data range. (helper implemented
>> shortly in this series).
>>
>> Signed-off-by: John Fastabend <john.fastabend@gmail.com>
>> ---
>>  include/uapi/linux/bpf.h |    3 ++-
>>  net/core/filter.c        |   16 ++++++++++++++++
>>  2 files changed, 18 insertions(+), 1 deletion(-)
>>
>> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
>> index b8275f0..e50c61f 100644
>> --- a/include/uapi/linux/bpf.h
>> +++ b/include/uapi/linux/bpf.h
>> @@ -769,7 +769,8 @@ enum bpf_attach_type {
>>  	FN(getsockopt),			\
>>  	FN(override_return),		\
>>  	FN(sock_ops_cb_flags_set),	\
>> -	FN(msg_redirect_map),
>> +	FN(msg_redirect_map),		\
>> +	FN(msg_apply_bytes),
>>  
>>  /* integer value in 'imm' field of BPF_CALL instruction selects which helper
>>   * function eBPF program intends to call
>> diff --git a/net/core/filter.c b/net/core/filter.c
>> index 314c311..df2a8f4 100644
>> --- a/net/core/filter.c
>> +++ b/net/core/filter.c
>> @@ -1928,6 +1928,20 @@ struct sock *do_msg_redirect_map(struct sk_msg_buff *msg)
>>  	.arg4_type      = ARG_ANYTHING,
>>  };
>>  
>> +BPF_CALL_2(bpf_msg_apply_bytes, struct sk_msg_buff *, msg, u64, bytes)
>> +{
>> +	msg->apply_bytes = bytes;
> 
> Here in bpf_msg_apply_bytes() but also in bpf_msg_cork_bytes() the signature
> is u64, but in struct sk_msg_buff and struct smap_psock it's type int, so
> user provided u64 will make these negative. Is there a reason to have this
> allow a negative value and not being of type u32 everywhere?
> 

Nope no reason for negative values, we can make it consistently
u32.

^ permalink raw reply

* Re: [Intel-wired-lan] [next-queue 4/4] ixgbe: enable tso with ipsec offload
From: Alexander Duyck @ 2018-03-15 22:03 UTC (permalink / raw)
  To: Shannon Nelson; +Cc: intel-wired-lan, Steffen Klassert, Netdev
In-Reply-To: <1521149003-1433-5-git-send-email-shannon.nelson@oracle.com>

On Thu, Mar 15, 2018 at 2:23 PM, Shannon Nelson
<shannon.nelson@oracle.com> wrote:
> Fix things up to support TSO offload in conjunction
> with IPsec hw offload.  This raises throughput with
> IPsec offload on to nearly line rate.
>
> Signed-off-by: Shannon Nelson <shannon.nelson@oracle.com>
> ---
>  drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c |  7 +++++--
>  drivers/net/ethernet/intel/ixgbe/ixgbe_main.c  | 25 +++++++++++++++++++------
>  2 files changed, 24 insertions(+), 8 deletions(-)
>
> diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c
> index 5ddea43..bfbcfc2 100644
> --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c
> +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c
> @@ -896,6 +896,7 @@ void ixgbe_ipsec_rx(struct ixgbe_ring *rx_ring,
>  void ixgbe_init_ipsec_offload(struct ixgbe_adapter *adapter)
>  {
>         struct ixgbe_ipsec *ipsec;
> +       netdev_features_t features;
>         size_t size;
>
>         if (adapter->hw.mac.type == ixgbe_mac_82598EB)
> @@ -929,8 +930,10 @@ void ixgbe_init_ipsec_offload(struct ixgbe_adapter *adapter)
>         ixgbe_ipsec_clear_hw_tables(adapter);
>
>         adapter->netdev->xfrmdev_ops = &ixgbe_xfrmdev_ops;
> -       adapter->netdev->features |= NETIF_F_HW_ESP;
> -       adapter->netdev->hw_enc_features |= NETIF_F_HW_ESP;
> +
> +       features = NETIF_F_HW_ESP | NETIF_F_HW_ESP_TX_CSUM | NETIF_F_GSO_ESP;
> +       adapter->netdev->features |= features;
> +       adapter->netdev->hw_enc_features |= features;

Instead of adding the local variable you might just create a new
define that includes these 3 feature flags and then use that here. You
could use the way I did IXGBE_GSO_PARTIAL_FEATURES as an example.

>         return;
>
> diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
> index a54f3d8..6022666 100644
> --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
> +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
> @@ -7721,9 +7721,11 @@ static void ixgbe_service_task(struct work_struct *work)
>
>  static int ixgbe_tso(struct ixgbe_ring *tx_ring,
>                      struct ixgbe_tx_buffer *first,
> -                    u8 *hdr_len)
> +                    u8 *hdr_len,
> +                    struct ixgbe_ipsec_tx_data *itd)
>  {
>         u32 vlan_macip_lens, type_tucmd, mss_l4len_idx;
> +       u32 fceof_saidx = 0;
>         struct sk_buff *skb = first->skb;

Reverse xmas tree this. It should probably be moved down to just past
the declaration of paylen and l4_offset.

>         union {
>                 struct iphdr *v4;
> @@ -7762,9 +7764,13 @@ static int ixgbe_tso(struct ixgbe_ring *tx_ring,
>                 unsigned char *trans_start = ip.hdr + (ip.v4->ihl * 4);
>
>                 /* IP header will have to cancel out any data that
> -                * is not a part of the outer IP header
> +                * is not a part of the outer IP header, except for
> +                * IPsec where we want the IP+ESP header.
>                  */
> -               ip.v4->check = csum_fold(csum_partial(trans_start,
> +               if (first->tx_flags & IXGBE_TX_FLAGS_IPSEC)
> +                       ip.v4->check = 0;
> +               else
> +                       ip.v4->check = csum_fold(csum_partial(trans_start,
>                                                       csum_start - trans_start,
>                                                       0));
>                 type_tucmd |= IXGBE_ADVTXD_TUCMD_IPV4;

I would say this should be flipped like so:
ip.v4->check =  (skb_shinfo(skb)->gso_type & SKB_GSO_PARTIAL) ?
                         csum_fold(csum_partial(trans_start,
csum_start - trans_start, 0) : 0;

> @@ -7797,12 +7803,15 @@ static int ixgbe_tso(struct ixgbe_ring *tx_ring,
>         mss_l4len_idx = (*hdr_len - l4_offset) << IXGBE_ADVTXD_L4LEN_SHIFT;
>         mss_l4len_idx |= skb_shinfo(skb)->gso_size << IXGBE_ADVTXD_MSS_SHIFT;
>
> +       fceof_saidx |= itd->sa_idx;
> +       type_tucmd |= itd->flags | itd->trailer_len;
> +
>         /* vlan_macip_lens: HEADLEN, MACLEN, VLAN tag */
>         vlan_macip_lens = l4.hdr - ip.hdr;
>         vlan_macip_lens |= (ip.hdr - skb->data) << IXGBE_ADVTXD_MACLEN_SHIFT;
>         vlan_macip_lens |= first->tx_flags & IXGBE_TX_FLAGS_VLAN_MASK;
>
> -       ixgbe_tx_ctxtdesc(tx_ring, vlan_macip_lens, 0, type_tucmd,
> +       ixgbe_tx_ctxtdesc(tx_ring, vlan_macip_lens, fceof_saidx, type_tucmd,
>                           mss_l4len_idx);
>
>         return 1;
> @@ -8493,7 +8502,8 @@ netdev_tx_t ixgbe_xmit_frame_ring(struct sk_buff *skb,
>         if (skb->sp && !ixgbe_ipsec_tx(tx_ring, first, &ipsec_tx))
>                 goto out_drop;
>  #endif
> -       tso = ixgbe_tso(tx_ring, first, &hdr_len);
> +
> +       tso = ixgbe_tso(tx_ring, first, &hdr_len, &ipsec_tx);
>         if (tso < 0)
>                 goto out_drop;
>         else if (!tso)

No need for the extra blank line. I would say just leave it as is and
add your extra argument.

> @@ -9902,8 +9912,11 @@ ixgbe_features_check(struct sk_buff *skb, struct net_device *dev,
>
>         /* We can only support IPV4 TSO in tunnels if we can mangle the
>          * inner IP ID field, so strip TSO if MANGLEID is not supported.
> +        * IPsec offoad sets skb->encapsulation but still can handle
> +        * the TSO, so it's the exception.
>          */
> -       if (skb->encapsulation && !(features & NETIF_F_TSO_MANGLEID))
> +       if (skb->encapsulation && !(features & NETIF_F_TSO_MANGLEID) &&
> +           !skb->sp)
>                 features &= ~NETIF_F_TSO;
>
>         return features;
> --
> 2.7.4
>
> _______________________________________________
> Intel-wired-lan mailing list
> Intel-wired-lan@osuosl.org
> https://lists.osuosl.org/mailman/listinfo/intel-wired-lan

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox