* Re: [PATCH bpf-next 03/15] xsk: add umem fill queue support and mmap
From: Michael S. Tsirkin @ 2018-04-23 23:16 UTC (permalink / raw)
To: Björn Töpel
Cc: magnus.karlsson, alexander.h.duyck, alexander.duyck,
john.fastabend, ast, brouer, willemdebruijn.kernel, daniel,
netdev, michael.lundkvist, jesse.brandeburg, anjali.singhai,
qi.z.zhang
In-Reply-To: <20180423135619.7179-4-bjorn.topel@gmail.com>
On Mon, Apr 23, 2018 at 03:56:07PM +0200, Björn Töpel wrote:
> From: Magnus Karlsson <magnus.karlsson@intel.com>
>
> Here, we add another setsockopt for registered user memory (umem)
> called XDP_UMEM_FILL_QUEUE. Using this socket option, the process can
> ask the kernel to allocate a queue (ring buffer) and also mmap it
> (XDP_UMEM_PGOFF_FILL_QUEUE) into the process.
>
> The queue is used to explicitly pass ownership of umem frames from the
> user process to the kernel. These frames will in a later patch be
> filled in with Rx packet data by the kernel.
>
> Signed-off-by: Magnus Karlsson <magnus.karlsson@intel.com>
> ---
> include/uapi/linux/if_xdp.h | 15 +++++++++++
> net/xdp/Makefile | 2 +-
> net/xdp/xdp_umem.c | 5 ++++
> net/xdp/xdp_umem.h | 2 ++
> net/xdp/xsk.c | 62 ++++++++++++++++++++++++++++++++++++++++++++-
> net/xdp/xsk_queue.c | 58 ++++++++++++++++++++++++++++++++++++++++++
> net/xdp/xsk_queue.h | 38 +++++++++++++++++++++++++++
> 7 files changed, 180 insertions(+), 2 deletions(-)
> create mode 100644 net/xdp/xsk_queue.c
> create mode 100644 net/xdp/xsk_queue.h
>
> diff --git a/include/uapi/linux/if_xdp.h b/include/uapi/linux/if_xdp.h
> index 41252135a0fe..975661e1baca 100644
> --- a/include/uapi/linux/if_xdp.h
> +++ b/include/uapi/linux/if_xdp.h
> @@ -23,6 +23,7 @@
>
> /* XDP socket options */
> #define XDP_UMEM_REG 3
> +#define XDP_UMEM_FILL_RING 4
>
> struct xdp_umem_reg {
> __u64 addr; /* Start of packet data area */
> @@ -31,4 +32,18 @@ struct xdp_umem_reg {
> __u32 frame_headroom; /* Frame head room */
> };
>
> +/* Pgoff for mmaping the rings */
> +#define XDP_UMEM_PGOFF_FILL_RING 0x100000000
> +
> +struct xdp_ring {
> + __u32 producer __attribute__((aligned(64)));
> + __u32 consumer __attribute__((aligned(64)));
> +};
> +
> +/* Used for the fill and completion queues for buffers */
> +struct xdp_umem_ring {
> + struct xdp_ring ptrs;
> + __u32 desc[0] __attribute__((aligned(64)));
> +};
> +
> #endif /* _LINUX_IF_XDP_H */
> diff --git a/net/xdp/Makefile b/net/xdp/Makefile
> index a5d736640a0f..074fb2b2d51c 100644
> --- a/net/xdp/Makefile
> +++ b/net/xdp/Makefile
> @@ -1,2 +1,2 @@
> -obj-$(CONFIG_XDP_SOCKETS) += xsk.o xdp_umem.o
> +obj-$(CONFIG_XDP_SOCKETS) += xsk.o xdp_umem.o xsk_queue.o
>
> diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c
> index bff058f5a769..6fc233e03f30 100644
> --- a/net/xdp/xdp_umem.c
> +++ b/net/xdp/xdp_umem.c
> @@ -62,6 +62,11 @@ static void xdp_umem_release(struct xdp_umem *umem)
> struct mm_struct *mm;
> unsigned long diff;
>
> + if (umem->fq) {
> + xskq_destroy(umem->fq);
> + umem->fq = NULL;
> + }
> +
> if (umem->pgs) {
> xdp_umem_unpin_pages(umem);
>
> diff --git a/net/xdp/xdp_umem.h b/net/xdp/xdp_umem.h
> index 58714f4f7f25..3086091aebdd 100644
> --- a/net/xdp/xdp_umem.h
> +++ b/net/xdp/xdp_umem.h
> @@ -18,9 +18,11 @@
> #include <linux/mm.h>
> #include <linux/if_xdp.h>
>
> +#include "xsk_queue.h"
> #include "xdp_umem_props.h"
>
> struct xdp_umem {
> + struct xsk_queue *fq;
> struct page **pgs;
> struct xdp_umem_props props;
> u32 npgs;
> diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
> index 19fc719cbe0d..bf6a1151df28 100644
> --- a/net/xdp/xsk.c
> +++ b/net/xdp/xsk.c
> @@ -32,6 +32,7 @@
> #include <linux/netdevice.h>
> #include <net/sock.h>
>
> +#include "xsk_queue.h"
> #include "xdp_umem.h"
>
> struct xdp_sock {
> @@ -47,6 +48,21 @@ static struct xdp_sock *xdp_sk(struct sock *sk)
> return (struct xdp_sock *)sk;
> }
>
> +static int xsk_init_queue(u32 entries, struct xsk_queue **queue)
> +{
> + struct xsk_queue *q;
> +
> + if (entries == 0 || *queue || !is_power_of_2(entries))
> + return -EINVAL;
> +
> + q = xskq_create(entries);
> + if (!q)
> + return -ENOMEM;
> +
> + *queue = q;
> + return 0;
> +}
> +
> static int xsk_release(struct socket *sock)
> {
> struct sock *sk = sock->sk;
> @@ -109,6 +125,23 @@ static int xsk_setsockopt(struct socket *sock, int level, int optname,
> mutex_unlock(&xs->mutex);
> return 0;
> }
> + case XDP_UMEM_FILL_RING:
> + {
> + struct xsk_queue **q;
> + int entries;
> +
> + if (!xs->umem)
> + return -EINVAL;
> +
> + if (copy_from_user(&entries, optval, sizeof(entries)))
> + return -EFAULT;
> +
> + mutex_lock(&xs->mutex);
> + q = &xs->umem->fq;
> + err = xsk_init_queue(entries, q);
> + mutex_unlock(&xs->mutex);
> + return err;
> + }
> default:
> break;
> }
> @@ -116,6 +149,33 @@ static int xsk_setsockopt(struct socket *sock, int level, int optname,
> return -ENOPROTOOPT;
> }
>
> +static int xsk_mmap(struct file *file, struct socket *sock,
> + struct vm_area_struct *vma)
> +{
> + unsigned long offset = vma->vm_pgoff << PAGE_SHIFT;
> + unsigned long size = vma->vm_end - vma->vm_start;
> + struct xdp_sock *xs = xdp_sk(sock->sk);
> + struct xsk_queue *q;
> + unsigned long pfn;
> + struct page *qpg;
> +
> + if (!xs->umem)
> + return -EINVAL;
> +
> + if (offset == XDP_UMEM_PGOFF_FILL_RING)
> + q = xs->umem->fq;
> + else
> + return -EINVAL;
> +
> + qpg = virt_to_head_page(q->ring);
> + if (size > (PAGE_SIZE << compound_order(qpg)))
> + return -EINVAL;
> +
> + pfn = virt_to_phys(q->ring) >> PAGE_SHIFT;
> + return remap_pfn_range(vma, vma->vm_start, pfn,
> + size, vma->vm_page_prot);
> +}
> +
> static struct proto xsk_proto = {
> .name = "XDP",
> .owner = THIS_MODULE,
> @@ -139,7 +199,7 @@ static const struct proto_ops xsk_proto_ops = {
> .getsockopt = sock_no_getsockopt,
> .sendmsg = sock_no_sendmsg,
> .recvmsg = sock_no_recvmsg,
> - .mmap = sock_no_mmap,
> + .mmap = xsk_mmap,
> .sendpage = sock_no_sendpage,
> };
>
> diff --git a/net/xdp/xsk_queue.c b/net/xdp/xsk_queue.c
> new file mode 100644
> index 000000000000..23da4f29d3fb
> --- /dev/null
> +++ b/net/xdp/xsk_queue.c
> @@ -0,0 +1,58 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/* XDP user-space ring structure
> + * Copyright(c) 2018 Intel Corporation.
> + *
> + * This program is free software; you can redistribute it and/or modify it
> + * under the terms and conditions of the GNU General Public License,
> + * version 2, as published by the Free Software Foundation.
> + *
> + * This program is distributed in the hope it will be useful, but WITHOUT
> + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
> + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
> + * more details.
> + */
> +
> +#include <linux/slab.h>
> +
> +#include "xsk_queue.h"
> +
> +static u32 xskq_umem_get_ring_size(struct xsk_queue *q)
> +{
> + return sizeof(struct xdp_umem_ring) + q->nentries * sizeof(u32);
> +}
> +
> +struct xsk_queue *xskq_create(u32 nentries)
> +{
> + struct xsk_queue *q;
> + gfp_t gfp_flags;
> + size_t size;
> +
> + q = kzalloc(sizeof(*q), GFP_KERNEL);
> + if (!q)
> + return NULL;
> +
> + q->nentries = nentries;
> + q->ring_mask = nentries - 1;
> +
> + gfp_flags = GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN |
> + __GFP_COMP | __GFP_NORETRY;
> + size = xskq_umem_get_ring_size(q);
> +
> + q->ring = (struct xdp_ring *)__get_free_pages(gfp_flags,
> + get_order(size));
> + if (!q->ring) {
> + kfree(q);
> + return NULL;
> + }
> +
> + return q;
> +}
> +
> +void xskq_destroy(struct xsk_queue *q)
> +{
> + if (!q)
> + return;
> +
> + page_frag_free(q->ring);
> + kfree(q);
> +}
> diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h
> new file mode 100644
> index 000000000000..7eb556bf73be
> --- /dev/null
> +++ b/net/xdp/xsk_queue.h
> @@ -0,0 +1,38 @@
> +/* SPDX-License-Identifier: GPL-2.0
> + * XDP user-space ring structure
> + * Copyright(c) 2018 Intel Corporation.
> + *
> + * This program is free software; you can redistribute it and/or modify it
> + * under the terms and conditions of the GNU General Public License,
> + * version 2, as published by the Free Software Foundation.
> + *
> + * This program is distributed in the hope it will be useful, but WITHOUT
> + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
> + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
> + * more details.
> + */
> +
> +#ifndef _LINUX_XSK_QUEUE_H
> +#define _LINUX_XSK_QUEUE_H
> +
> +#include <linux/types.h>
> +#include <linux/if_xdp.h>
> +
> +#include "xdp_umem_props.h"
> +
> +struct xsk_queue {
> + struct xdp_umem_props umem_props;
> + u32 ring_mask;
> + u32 nentries;
> + u32 prod_head;
> + u32 prod_tail;
> + u32 cons_head;
> + u32 cons_tail;
> + struct xdp_ring *ring;
> + u64 invalid_descs;
> +};
Any documentation on how e.g. the locking works here?
> +
> +struct xsk_queue *xskq_create(u32 nentries);
> +void xskq_destroy(struct xsk_queue *q);
> +
> +#endif /* _LINUX_XSK_QUEUE_H */
> --
> 2.14.1
^ permalink raw reply
* Re: [RFC PATCH ghak32 V2 01/13] audit: add container id
From: Paul Moore @ 2018-04-23 23:15 UTC (permalink / raw)
To: Richard Guy Briggs
Cc: cgroups, containers, linux-api, Linux-Audit Mailing List,
linux-fsdevel, LKML, netdev, ebiederm, luto, jlayton, carlos,
dhowells, viro, simo, Eric Paris, serge
In-Reply-To: <20180421143443.faaput5g2rn6ul7p@madcap2.tricolour.ca>
On Sat, Apr 21, 2018 at 10:34 AM, Richard Guy Briggs <rgb@redhat.com> wrote:
> On 2018-04-18 19:47, Paul Moore wrote:
>> On Fri, Mar 16, 2018 at 5:00 AM, Richard Guy Briggs <rgb@redhat.com> wrote:
>> > Implement the proc fs write to set the audit container ID of a process,
>> > emitting an AUDIT_CONTAINER record to document the event.
>> >
>> > This is a write from the container orchestrator task to a proc entry of
>> > the form /proc/PID/containerid where PID is the process ID of the newly
>> > created task that is to become the first task in a container, or an
>> > additional task added to a container.
>> >
>> > The write expects up to a u64 value (unset: 18446744073709551615).
>> >
>> > This will produce a record such as this:
>> > type=CONTAINER msg=audit(1519903238.968:261): op=set pid=596 uid=0 subj=unconfined_u:unconfined_r:unconfined_t:s0-s0:c0.c1023 auid=0 tty=pts0 ses=1 opid=596 old-contid=18446744073709551615 contid=123455 res=0
>> >
>> > The "op" field indicates an initial set. The "pid" to "ses" fields are
>> > the orchestrator while the "opid" field is the object's PID, the process
>> > being "contained". Old and new container ID values are given in the
>> > "contid" fields, while res indicates its success.
>> >
>> > It is not permitted to self-set, unset or re-set the container ID. A
>> > child inherits its parent's container ID, but then can be set only once
>> > after.
>> >
>> > See: https://github.com/linux-audit/audit-kernel/issues/32
>> >
>> > Signed-off-by: Richard Guy Briggs <rgb@redhat.com>
>> > ---
>> > fs/proc/base.c | 37 ++++++++++++++++++++
>> > include/linux/audit.h | 16 +++++++++
>> > include/linux/init_task.h | 4 ++-
>> > include/linux/sched.h | 1 +
>> > include/uapi/linux/audit.h | 2 ++
>> > kernel/auditsc.c | 84 ++++++++++++++++++++++++++++++++++++++++++++++
>> > 6 files changed, 143 insertions(+), 1 deletion(-)
...
>> > diff --git a/include/linux/sched.h b/include/linux/sched.h
>> > index d258826..1b82191 100644
>> > --- a/include/linux/sched.h
>> > +++ b/include/linux/sched.h
>> > @@ -796,6 +796,7 @@ struct task_struct {
>> > #ifdef CONFIG_AUDITSYSCALL
>> > kuid_t loginuid;
>> > unsigned int sessionid;
>> > + u64 containerid;
>>
>> This one line addition to the task_struct scares me the most of
>> anything in this patchset. Why? It's a field named "containerid" in
>> a perhaps one of the most widely used core kernel structures; the
>> possibilities for abuse are endless, and it's foolish to think we
>> would ever be able to adequately police this.
>
> Fair enough.
>
>> Unfortunately, we can't add the field to audit_context as things
>> currently stand because we don't always allocate an audit_context,
>> it's dependent on the system's configuration, and we need to track the
>> audit container ID for a given process, regardless of the audit
>> configuration. Pretty much the same reason why loginuid and sessionid
>> are located directly in task_struct now. As I stressed during the
>> design phase, I really want to keep this as an *audit* container ID
>> and not a general purpose kernel wide container ID. If the kernel
>> ever grows a general purpose container ID token, I'll be the first in
>> line to convert the audit code, but I don't want audit to be that
>> general purpose mechanism ... audit is hated enough as-is ;)
>
> When would we need an audit container ID when audit is not enabled
> enough to have an audit_context?
I'm thinking of the audit_alloc() case where audit_filter_task()
returns AUDIT_DISABLED.
I believe this is the same reason why loginuid and sessionid live
directly in the task_struct and not in the audit_context; they need to
persist for the lifetime of the task.
> If it is only used for audit, and audit is the only consumer, and audit
> can only use it when it is enabled, then we can just return success to
> any write to the proc filehandle, or not even present it. Nothing will
> be able to know that value wasn't used.
>
> When are loginuid and sessionid used now when audit is not enabled (or
> should I say, explicitly disabled)?
See above. I think that should answer these questions.
>> > diff --git a/include/uapi/linux/audit.h b/include/uapi/linux/audit.h
>> > index 4e61a9e..921a71f 100644
>> > --- a/include/uapi/linux/audit.h
>> > +++ b/include/uapi/linux/audit.h
>> > @@ -71,6 +71,7 @@
>> > #define AUDIT_TTY_SET 1017 /* Set TTY auditing status */
>> > #define AUDIT_SET_FEATURE 1018 /* Turn an audit feature on or off */
>> > #define AUDIT_GET_FEATURE 1019 /* Get which features are enabled */
>> > +#define AUDIT_CONTAINER 1020 /* Define the container id and information */
>> >
>> > #define AUDIT_FIRST_USER_MSG 1100 /* Userspace messages mostly uninteresting to kernel */
>> > #define AUDIT_USER_AVC 1107 /* We filter this differently */
>> > @@ -465,6 +466,7 @@ struct audit_tty_status {
>> > };
>> >
>> > #define AUDIT_UID_UNSET (unsigned int)-1
>> > +#define AUDIT_CID_UNSET ((u64)-1)
>>
>> I think we need to decide if we want to distinguish between the "host"
>> (e.g. init ns) and "unset". Looking at this patch (I've only quickly
>> skimmed the others so far) it would appear that you don't think we
>> need to worry about this distinction; that's fine, but let's make it
>> explicit with a comment in the code that AUDIT_CID_UNSET means "unset"
>> as well as "host".
>
> I don't see any reason to distinguish between "host" and "unset". Since
> a container doesn't have a concrete definition based in namespaces, the
> initial namespace set is meaningless here.
Okay, that sounds reasonable.
> Is there value in having a container orchestrator process have a
> reserved container ID that has a policy distinct from any other
> container?
I'm open to arguments for this idea, but I don't see a point to it right now.
> If so, then I could see the value in making the distinction.
> For example, I've heard of interest in systemd acting as a container
> orchestrator, so if it took on that role as PID 1, then every process in
> the system would inherit that ID and none would be unset.
>
> I can't picture how having seperate "host" and "unset" values helps us.
I don't have a strong feeling either way, I just wanted to ask the question.
>> > /* audit_rule_data supports filter rules with both integer and string
>> > * fields. It corresponds with AUDIT_ADD_RULE, AUDIT_DEL_RULE and
>> > diff --git a/kernel/auditsc.c b/kernel/auditsc.c
>> > index 4e0a4ac..29c8482 100644
>> > --- a/kernel/auditsc.c
>> > +++ b/kernel/auditsc.c
>> > @@ -2073,6 +2073,90 @@ int audit_set_loginuid(kuid_t loginuid)
>> > return rc;
>> > }
>> >
>> > +static int audit_set_containerid_perm(struct task_struct *task, u64 containerid)
>> > +{
>> > + struct task_struct *parent;
>> > + u64 pcontainerid, ccontainerid;
>> > +
>> > + /* Don't allow to set our own containerid */
>> > + if (current == task)
>> > + return -EPERM;
>>
>> Why not? Is there some obvious security concern that I missing?
>
> We then lose the distinction in the AUDIT_CONTAINER record between the
> initiating PID and the target PID. This was outlined in the proposal.
I just went back and reread the v3 proposal and I still don't see a
good explanation of this. Why is this bad? What's the security
concern?
> Having said that, I'm still not sure we have protected sufficiently from
> a child turning around and setting it's parent's as yet unset or
> inherited audit container ID.
Yes, I believe we only want to let a task set the audit container for
it's children (or itself/threads if we decide to allow that, see
above). There *has* to be a function to check to see if a task if a
child of a given task ... right? ... although this is likely to be a
pointer traversal and locking nightmare ... hmmm.
>> I ask because I suppose it might be possible for some container
>> runtime to do a fork, setup some of the environment and them exec the
>> container (before you answer the obvious "namespaces!" please remember
>> we're not trying to define containers).
>
> I don't think namespaces have any bearing on this concern since none are
> required.
>
>> > + /* Don't allow the containerid to be unset */
>> > + if (!cid_valid(containerid))
>> > + return -EINVAL;
>> > + /* if we don't have caps, reject */
>> > + if (!capable(CAP_AUDIT_CONTROL))
>> > + return -EPERM;
>> > + /* if containerid is unset, allow */
>> > + if (!audit_containerid_set(task))
>> > + return 0;
>> > + /* it is already set, and not inherited from the parent, reject */
>> > + ccontainerid = audit_get_containerid(task);
>> > + rcu_read_lock();
>> > + parent = rcu_dereference(task->real_parent);
>> > + rcu_read_unlock();
>> > + task_lock(parent);
>> > + pcontainerid = audit_get_containerid(parent);
>> > + task_unlock(parent);
>> > + if (ccontainerid != pcontainerid)
>> > + return -EPERM;
>> > + return 0;
>> > +}
>> > +
>> > +static void audit_log_set_containerid(struct task_struct *task, u64 oldcontainerid,
>> > + u64 containerid, int rc)
>> > +{
>> > + struct audit_buffer *ab;
>> > + uid_t uid;
>> > + struct tty_struct *tty;
>> > +
>> > + if (!audit_enabled)
>> > + return;
>> > +
>> > + ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONTAINER);
>> > + if (!ab)
>> > + return;
>> > +
>> > + uid = from_kuid(&init_user_ns, task_uid(current));
>> > + tty = audit_get_tty(current);
>> > +
>> > + audit_log_format(ab, "op=set pid=%d uid=%u", task_tgid_nr(current), uid);
>> > + audit_log_task_context(ab);
>> > + audit_log_format(ab, " auid=%u tty=%s ses=%u opid=%d old-contid=%llu contid=%llu res=%d",
>> > + from_kuid(&init_user_ns, audit_get_loginuid(current)),
>> > + tty ? tty_name(tty) : "(none)", audit_get_sessionid(current),
>> > + task_tgid_nr(task), oldcontainerid, containerid, !rc);
>> > +
>> > + audit_put_tty(tty);
>> > + audit_log_end(ab);
>> > +}
>> > +
>> > +/**
>> > + * audit_set_containerid - set current task's audit_context containerid
>> > + * @containerid: containerid value
>> > + *
>> > + * Returns 0 on success, -EPERM on permission failure.
>> > + *
>> > + * Called (set) from fs/proc/base.c::proc_containerid_write().
>> > + */
>> > +int audit_set_containerid(struct task_struct *task, u64 containerid)
>> > +{
>> > + u64 oldcontainerid;
>> > + int rc;
>> > +
>> > + oldcontainerid = audit_get_containerid(task);
>> > +
>> > + rc = audit_set_containerid_perm(task, containerid);
>> > + if (!rc) {
>> > + task_lock(task);
>> > + task->containerid = containerid;
>> > + task_unlock(task);
>> > + }
>> > +
>> > + audit_log_set_containerid(task, oldcontainerid, containerid, rc);
>> > + return rc;
>>
>> Why are audit_set_containerid_perm() and audit_log_containerid()
>> separate functions?
>
> (I assume you mean audit_log_set_containerid()?)
Yep. My fingers got tired typing in that function name and decided a
shortcut was necessary.
> It seemed clearer that all the permission checking was in one function
> and its return code could be used to report the outcome when logging the
> (attempted) action. This is the same structure as audit_set_loginuid()
> and it made sense.
When possible I really like it when the permission checks are in the
same function as the code which does the work; it's less likely to get
abused that way (you have to willfully bypass the access checks). The
exceptions might be if you wanted to reuse the access control code, or
insert a modular access mechanism (e.g. LSMs).
I'm less concerned about audit_log_set_containerid(), but the usual
idea of avoiding single-use function within the same scope applies
here.
> This would be the time to connect it to a syscall if that seems like a
> good idea and remove pid, uid, auid, tty, ses fields.
Ah yes, I missed that. You know my stance on connecting records by
now (hint: yes, connect them) so I think that would be a good thing to
do for the next round.
--
paul moore
www.paul-moore.com
^ permalink raw reply
* Re: dev_loopback_xmit parameters
From: Emanuele @ 2018-04-23 23:11 UTC (permalink / raw)
To: Eric Dumazet; +Cc: Linux Kernel Network Developers
In-Reply-To: <c91f52c8-c7de-e555-a184-b62d39665b9f@gmail.com>
Ok, clear now.
Even though I don't understand what to set to avoid triggering the
WARN_ON(!skb_dst(skb));
inside dev_loopback_xmit.
I just would like to send the skb in loopback, i.e. moving the packet
from the sending to the receiving queue of a certain struct net_device.
On 24/04/2018 00:36, Eric Dumazet wrote:
>
> On 04/23/2018 02:40 PM, Emanuele wrote:
>> Hello,
>>
>> I don't know if this is the right place where to ask, but I was wondering why the dev_loopback_xmit function defined in /net/core/dev.c takes struct net * and struct sock * as parameters. They are never used, so I believe passing only the struct sk_buff * should be enough.
>>
> Look at net/ipv6/ip6_output.c where NF_HOOK() uses dev_loopback_xmit().
>
>> In addition, it would like to know where I can read what is and how to set a skb dst_entry, since I don't really understand it.
>>
>> Thanks a lot,
>>
>> Emanuele
>>
^ permalink raw reply
* Re: [PATCH bpf-next 02/15] xsk: add user memory registration support sockopt
From: Willem de Bruijn @ 2018-04-23 23:04 UTC (permalink / raw)
To: Björn Töpel
Cc: Karlsson, Magnus, Alexander Duyck, Alexander Duyck,
John Fastabend, Alexei Starovoitov, Jesper Dangaard Brouer,
Daniel Borkmann, Michael S. Tsirkin, Network Development,
Björn Töpel, michael.lundkvist, Brandeburg, Jesse,
Singhai, Anjali, Zhang, Qi Z
In-Reply-To: <20180423135619.7179-3-bjorn.topel@gmail.com>
On Mon, Apr 23, 2018 at 9:56 AM, Björn Töpel <bjorn.topel@gmail.com> wrote:
> From: Björn Töpel <bjorn.topel@intel.com>
>
> In this commit the base structure of the AF_XDP address family is set
> up. Further, we introduce the abilty register a window of user memory
> to the kernel via the XDP_UMEM_REG setsockopt syscall. The memory
> window is viewed by an AF_XDP socket as a set of equally large
> frames. After a user memory registration all frames are "owned" by the
> user application, and not the kernel.
>
> Co-authored-by: Magnus Karlsson <magnus.karlsson@intel.com>
> Signed-off-by: Magnus Karlsson <magnus.karlsson@intel.com>
> Signed-off-by: Björn Töpel <bjorn.topel@intel.com>
> +static void xdp_umem_release(struct xdp_umem *umem)
> +{
> + struct task_struct *task;
> + struct mm_struct *mm;
> + unsigned long diff;
> +
> + if (umem->pgs) {
> + xdp_umem_unpin_pages(umem);
> +
> + task = get_pid_task(umem->pid, PIDTYPE_PID);
> + put_pid(umem->pid);
> + if (!task)
> + goto out;
> + mm = get_task_mm(task);
> + put_task_struct(task);
> + if (!mm)
> + goto out;
> +
> + diff = umem->size >> PAGE_SHIFT;
Need to round up or size must always be a multiple of PAGE_SIZE.
> +
> + down_write(&mm->mmap_sem);
> + mm->pinned_vm -= diff;
> + up_write(&mm->mmap_sem);
When using user->locked_vm for resource limit checks, no need
to also update mm->pinned_vm?
> +static int __xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr)
> +{
> + u32 frame_size = mr->frame_size, frame_headroom = mr->frame_headroom;
> + u64 addr = mr->addr, size = mr->len;
> + unsigned int nframes;
> + int size_chk, err;
> +
> + if (frame_size < XDP_UMEM_MIN_FRAME_SIZE || frame_size > PAGE_SIZE) {
> + /* Strictly speaking we could support this, if:
> + * - huge pages, or*
> + * - using an IOMMU, or
> + * - making sure the memory area is consecutive
> + * but for now, we simply say "computer says no".
> + */
> + return -EINVAL;
> + }
Ideally, AF_XDP subsumes all packet socket use cases. It does not
have packet v3's small packet optimizations of variable sized frames
and block signaling.
I don't suggest adding that now. But for the non-zerocopy case, it may
make sense to ensure that nothing is blocking a later addition of these
features. Especially for header-only (snaplen) workloads. So far, I don't
see any issues.
> + if (!is_power_of_2(frame_size))
> + return -EINVAL;
> +
> + if (!PAGE_ALIGNED(addr)) {
> + /* Memory area has to be page size aligned. For
> + * simplicity, this might change.
> + */
> + return -EINVAL;
> + }
> +
> + if ((addr + size) < addr)
> + return -EINVAL;
> +
> + nframes = size / frame_size;
> + if (nframes == 0 || nframes > UINT_MAX)
> + return -EINVAL;
You may also want a check here that nframes * frame_size is at least
PAGE_SIZE and probably a multiple of that.
> + frame_headroom = ALIGN(frame_headroom, 64);
> +
> + size_chk = frame_size - frame_headroom - XDP_PACKET_HEADROOM;
> + if (size_chk < 0)
> + return -EINVAL;
> +
> + umem->pid = get_task_pid(current, PIDTYPE_PID);
> + umem->size = (size_t)size;
> + umem->address = (unsigned long)addr;
> + umem->props.frame_size = frame_size;
> + umem->props.nframes = nframes;
> + umem->frame_headroom = frame_headroom;
> + umem->npgs = size / PAGE_SIZE;
> + umem->pgs = NULL;
> + umem->user = NULL;
> +
> + umem->frame_size_log2 = ilog2(frame_size);
> + umem->nfpp_mask = (PAGE_SIZE / frame_size) - 1;
> + umem->nfpplog2 = ilog2(PAGE_SIZE / frame_size);
> + atomic_set(&umem->users, 1);
> +
> + err = xdp_umem_account_pages(umem);
> + if (err)
> + goto out;
> +
> + err = xdp_umem_pin_pages(umem);
> + if (err)
need to call xdp_umem_unaccount_pages on error
> + goto out;
> + return 0;
> +
> +out:
> + put_pid(umem->pid);
> + return err;
> +}
^ permalink raw reply
* [PATCH v2] selftests: bpf: update .gitignore with missing file
From: Anders Roxell @ 2018-04-23 22:53 UTC (permalink / raw)
To: ast, daniel, shuah; +Cc: netdev, linux-kernel, linux-kselftest, Anders Roxell
In-Reply-To: <065a8bc2-6065-3d43-8b03-35ed693de7ce@iogearbox.net>
Fixes: c0fa1b6c3efc ("bpf: btf: Add BTF tests")
Signed-off-by: Anders Roxell <anders.roxell@linaro.org>
---
Rebased against bpf-next.
tools/testing/selftests/bpf/.gitignore | 1 +
1 file changed, 1 insertion(+)
diff --git a/tools/testing/selftests/bpf/.gitignore b/tools/testing/selftests/bpf/.gitignore
index 9cf83f895d98..da19f0562bf8 100644
--- a/tools/testing/selftests/bpf/.gitignore
+++ b/tools/testing/selftests/bpf/.gitignore
@@ -12,3 +12,4 @@ test_tcpbpf_user
test_verifier_log
feature
test_libbpf_open
+test_btf
--
2.17.0
^ permalink raw reply related
* Re: [bpf PATCH v3 0/3] BPF, a couple sockmap fixes
From: Daniel Borkmann @ 2018-04-23 22:52 UTC (permalink / raw)
To: John Fastabend, ast; +Cc: netdev
In-Reply-To: <20180423223712.16388.63625.stgit@john-Precision-Tower-5810>
On 04/24/2018 12:39 AM, John Fastabend wrote:
> While testing sockmap with more programs (besides our test programs)
> I found a couple issues.
>
> The attached series fixes an issue where pinned maps were not
> working correctly, blocking sockets returned zero, and an error
> path that when the sock hit an out of memory case resulted in a
> double page_put() while doing ingress redirects.
>
> See individual patches for more details.
>
> v2: Incorporated Daniel's feedback to use map ops for uref put op
> which also fixed the build error discovered in v1.
> v3: rename map_put_uref to map_release_uref
Applied to bpf tree, thanks John!
^ permalink raw reply
* [PATCH net v2] net: ethtool: Add missing kernel doc for FEC parameters
From: Florian Fainelli @ 2018-04-23 22:51 UTC (permalink / raw)
To: netdev; +Cc: davem, vidya.chowdary, dustin, roopa, Florian Fainelli
While adding support for ethtool::get_fecparam and set_fecparam, kernel
doc for these functions was missed, add those.
Fixes: 1a5f3da20bd9 ("net: ethtool: add support for forward error correction modes")
Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
---
Changes in v2:
- corrected set_fecparam in commit message
include/linux/ethtool.h | 2 ++
1 file changed, 2 insertions(+)
diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h
index ebe41811ed34..b32cd2062f18 100644
--- a/include/linux/ethtool.h
+++ b/include/linux/ethtool.h
@@ -310,6 +310,8 @@ bool ethtool_convert_link_mode_to_legacy_u32(u32 *legacy_u32,
* fields should be ignored (use %__ETHTOOL_LINK_MODE_MASK_NBITS
* instead of the latter), any change to them will be overwritten
* by kernel. Returns a negative error code or zero.
+ * @get_fecparam: Get the network device Forward Error Correction parameters.
+ * @set_fecparam: Set the network device Forward Error Correction parameters.
*
* All operations are optional (i.e. the function pointer may be set
* to %NULL) and callers must take this into account. Callers must
--
2.14.1
^ permalink raw reply related
* [bpf PATCH v3 3/3] bpf: sockmap, fix double page_put on ENOMEM error in redirect path
From: John Fastabend @ 2018-04-23 22:39 UTC (permalink / raw)
To: ast, daniel; +Cc: netdev
In-Reply-To: <20180423223712.16388.63625.stgit@john-Precision-Tower-5810>
In the case where the socket memory boundary is hit the redirect
path returns an ENOMEM error. However, before checking for this
condition the redirect scatterlist buffer is setup with a valid
page and length. This is never unwound so when the buffers are
released latter in the error path we do a put_page() and clear
the scatterlist fields. But, because the initial error happens
before completing the scatterlist buffer we end up with both the
original buffer and the redirect buffer pointing to the same page
resulting in duplicate put_page() calls.
To fix this simply move the initial configuration of the redirect
scatterlist buffer below the sock memory check.
Found this while running TCP_STREAM test with netperf using Cilium.
Fixes: fa246693a111 ("bpf: sockmap, BPF_F_INGRESS flag for BPF_SK_SKB_STREAM_VERDICT")
Signed-off-by: John Fastabend <john.fastabend@gmail.com>
---
0 files changed
diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c
index aaf50ec..634415c 100644
--- a/kernel/bpf/sockmap.c
+++ b/kernel/bpf/sockmap.c
@@ -524,8 +524,6 @@ static int bpf_tcp_ingress(struct sock *sk, int apply_bytes,
i = md->sg_start;
do {
- r->sg_data[i] = md->sg_data[i];
-
size = (apply && apply_bytes < md->sg_data[i].length) ?
apply_bytes : md->sg_data[i].length;
@@ -536,6 +534,7 @@ static int bpf_tcp_ingress(struct sock *sk, int apply_bytes,
}
sk_mem_charge(sk, size);
+ r->sg_data[i] = md->sg_data[i];
r->sg_data[i].length = size;
md->sg_data[i].length -= size;
md->sg_data[i].offset += size;
^ permalink raw reply related
* [bpf PATCH v3 2/3] bpf: sockmap, sk_wait_event needed to handle blocking cases
From: John Fastabend @ 2018-04-23 22:39 UTC (permalink / raw)
To: ast, daniel; +Cc: netdev
In-Reply-To: <20180423223712.16388.63625.stgit@john-Precision-Tower-5810>
In the recvmsg handler we need to add a wait event to support the
blocking use cases. Without this we return zero and may confuse
user applications. In the wait event any data received on the
sk either via sk_receive_queue or the psock ingress list will
wake up the sock.
Fixes: fa246693a111 ("bpf: sockmap, BPF_F_INGRESS flag for BPF_SK_SKB_STREAM_VERDICT")
Signed-off-by: John Fastabend <john.fastabend@gmail.com>
---
0 files changed
diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c
index a73d484..aaf50ec 100644
--- a/kernel/bpf/sockmap.c
+++ b/kernel/bpf/sockmap.c
@@ -43,6 +43,7 @@
#include <net/tcp.h>
#include <linux/ptr_ring.h>
#include <net/inet_common.h>
+#include <linux/sched/signal.h>
#define SOCK_CREATE_FLAG_MASK \
(BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY)
@@ -732,6 +733,26 @@ static int bpf_exec_tx_verdict(struct smap_psock *psock,
return err;
}
+static int bpf_wait_data(struct sock *sk,
+ struct smap_psock *psk, int flags,
+ long timeo, int *err)
+{
+ int rc;
+
+ DEFINE_WAIT_FUNC(wait, woken_wake_function);
+
+ add_wait_queue(sk_sleep(sk), &wait);
+ sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
+ rc = sk_wait_event(sk, &timeo,
+ !list_empty(&psk->ingress) ||
+ !skb_queue_empty(&sk->sk_receive_queue),
+ &wait);
+ sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
+ remove_wait_queue(sk_sleep(sk), &wait);
+
+ return rc;
+}
+
static int bpf_tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
int nonblock, int flags, int *addr_len)
{
@@ -755,6 +776,7 @@ static int bpf_tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
return tcp_recvmsg(sk, msg, len, nonblock, flags, addr_len);
lock_sock(sk);
+bytes_ready:
while (copied != len) {
struct scatterlist *sg;
struct sk_msg_buff *md;
@@ -809,6 +831,28 @@ static int bpf_tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
}
}
+ if (!copied) {
+ long timeo;
+ int data;
+ int err = 0;
+
+ timeo = sock_rcvtimeo(sk, nonblock);
+ data = bpf_wait_data(sk, psock, flags, timeo, &err);
+
+ if (data) {
+ if (!skb_queue_empty(&sk->sk_receive_queue)) {
+ release_sock(sk);
+ smap_release_sock(psock, sk);
+ copied = tcp_recvmsg(sk, msg, len, nonblock, flags, addr_len);
+ return copied;
+ }
+ goto bytes_ready;
+ }
+
+ if (err)
+ copied = err;
+ }
+
release_sock(sk);
smap_release_sock(psock, sk);
return copied;
^ permalink raw reply related
* [bpf PATCH v3 1/3] bpf: sockmap, map_release does not hold refcnt for pinned maps
From: John Fastabend @ 2018-04-23 22:39 UTC (permalink / raw)
To: ast, daniel; +Cc: netdev
In-Reply-To: <20180423223712.16388.63625.stgit@john-Precision-Tower-5810>
Relying on map_release hook to decrement the reference counts when a
map is removed only works if the map is not being pinned. In the
pinned case the ref is decremented immediately and the BPF programs
released. After this BPF programs may not be in-use which is not
what the user would expect.
This patch moves the release logic into bpf_map_put_uref() and brings
sockmap in-line with how a similar case is handled in prog array maps.
Fixes: 3d9e952697de ("bpf: sockmap, fix leaking maps with attached but not detached progs")
Signed-off-by: John Fastabend <john.fastabend@gmail.com>
---
0 files changed
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 0ac4340..a5782d0 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -33,6 +33,7 @@ struct bpf_map_ops {
void (*map_release)(struct bpf_map *map, struct file *map_file);
void (*map_free)(struct bpf_map *map);
int (*map_get_next_key)(struct bpf_map *map, void *key, void *next_key);
+ void (*map_release_uref)(struct bpf_map *map);
/* funcs callable from userspace and from eBPF programs */
void *(*map_lookup_elem)(struct bpf_map *map, void *key);
@@ -448,7 +449,6 @@ int bpf_percpu_array_update(struct bpf_map *map, void *key, void *value,
int bpf_fd_array_map_update_elem(struct bpf_map *map, struct file *map_file,
void *key, void *value, u64 map_flags);
int bpf_fd_array_map_lookup_elem(struct bpf_map *map, void *key, u32 *value);
-void bpf_fd_array_map_clear(struct bpf_map *map);
int bpf_fd_htab_map_update_elem(struct bpf_map *map, struct file *map_file,
void *key, void *value, u64 map_flags);
int bpf_fd_htab_map_lookup_elem(struct bpf_map *map, void *key, u32 *value);
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 02a1893..0fd8d8f 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -526,7 +526,7 @@ static u32 prog_fd_array_sys_lookup_elem(void *ptr)
}
/* decrement refcnt of all bpf_progs that are stored in this map */
-void bpf_fd_array_map_clear(struct bpf_map *map)
+static void bpf_fd_array_map_clear(struct bpf_map *map)
{
struct bpf_array *array = container_of(map, struct bpf_array, map);
int i;
@@ -545,6 +545,7 @@ void bpf_fd_array_map_clear(struct bpf_map *map)
.map_fd_get_ptr = prog_fd_array_get_ptr,
.map_fd_put_ptr = prog_fd_array_put_ptr,
.map_fd_sys_lookup_elem = prog_fd_array_sys_lookup_elem,
+ .map_release_uref = bpf_fd_array_map_clear,
};
static struct bpf_event_entry *bpf_event_entry_gen(struct file *perf_file,
diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c
index a3b2138..a73d484 100644
--- a/kernel/bpf/sockmap.c
+++ b/kernel/bpf/sockmap.c
@@ -1831,7 +1831,7 @@ static int sock_map_update_elem(struct bpf_map *map,
return err;
}
-static void sock_map_release(struct bpf_map *map, struct file *map_file)
+static void sock_map_release(struct bpf_map *map)
{
struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
struct bpf_prog *orig;
@@ -1855,7 +1855,7 @@ static void sock_map_release(struct bpf_map *map, struct file *map_file)
.map_get_next_key = sock_map_get_next_key,
.map_update_elem = sock_map_update_elem,
.map_delete_elem = sock_map_delete_elem,
- .map_release = sock_map_release,
+ .map_release_uref = sock_map_release,
};
BPF_CALL_4(bpf_sock_map_update, struct bpf_sock_ops_kern *, bpf_sock,
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index fe23dc5a..a22c26e 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -260,8 +260,8 @@ static void bpf_map_free_deferred(struct work_struct *work)
static void bpf_map_put_uref(struct bpf_map *map)
{
if (atomic_dec_and_test(&map->usercnt)) {
- if (map->map_type == BPF_MAP_TYPE_PROG_ARRAY)
- bpf_fd_array_map_clear(map);
+ if (map->ops->map_release_uref)
+ map->ops->map_release_uref(map);
}
}
^ permalink raw reply related
* [bpf PATCH v3 0/3] BPF, a couple sockmap fixes
From: John Fastabend @ 2018-04-23 22:39 UTC (permalink / raw)
To: ast, daniel; +Cc: netdev
While testing sockmap with more programs (besides our test programs)
I found a couple issues.
The attached series fixes an issue where pinned maps were not
working correctly, blocking sockets returned zero, and an error
path that when the sock hit an out of memory case resulted in a
double page_put() while doing ingress redirects.
See individual patches for more details.
v2: Incorporated Daniel's feedback to use map ops for uref put op
which also fixed the build error discovered in v1.
v3: rename map_put_uref to map_release_uref
---
John Fastabend (3):
bpf: sockmap, map_release does not hold refcnt for pinned maps
bpf: sockmap, sk_wait_event needed to handle blocking cases
bpf: sockmap, fix double page_put on ENOMEM error in redirect path
0 files changed
^ permalink raw reply
* Re: dev_loopback_xmit parameters
From: Eric Dumazet @ 2018-04-23 22:36 UTC (permalink / raw)
To: Emanuele, Linux Kernel Network Developers
In-Reply-To: <eb1bf67b-7f38-a3b5-4974-6ee1d198887f@gmail.com>
On 04/23/2018 02:40 PM, Emanuele wrote:
> Hello,
>
> I don't know if this is the right place where to ask, but I was wondering why the dev_loopback_xmit function defined in /net/core/dev.c takes struct net * and struct sock * as parameters. They are never used, so I believe passing only the struct sk_buff * should be enough.
>
Look at net/ipv6/ip6_output.c where NF_HOOK() uses dev_loopback_xmit().
> In addition, it would like to know where I can read what is and how to set a skb dst_entry, since I don't really understand it.
>
> Thanks a lot,
>
> Emanuele
>
^ permalink raw reply
* Re: [PATCH] selftests: bpf: update .gitignore with missing file
From: Daniel Borkmann @ 2018-04-23 22:31 UTC (permalink / raw)
To: Anders Roxell
Cc: ast, Shuah Khan, netdev, Linux Kernel Mailing List,
linux-kselftest
In-Reply-To: <CADYN=9KAorrGBh+2_LZ+6cxaf4yU-pOmU4BSzkMKT8sdU7hhUw@mail.gmail.com>
On 04/24/2018 12:14 AM, Anders Roxell wrote:
> On 23 April 2018 at 23:34, Daniel Borkmann <daniel@iogearbox.net> wrote:
>> On 04/23/2018 03:50 PM, Anders Roxell wrote:
>>> Fixes: c0fa1b6c3efc ("bpf: btf: Add BTF tests")
>>> Signed-off-by: Anders Roxell <anders.roxell@linaro.org>
>>> ---
>>> tools/testing/selftests/bpf/.gitignore | 1 +
>>> 1 file changed, 1 insertion(+)
>>>
>>> diff --git a/tools/testing/selftests/bpf/.gitignore b/tools/testing/selftests/bpf/.gitignore
>>> index 5e1ab2f0eb79..3e3b3ced3f7c 100644
>>> --- a/tools/testing/selftests/bpf/.gitignore
>>> +++ b/tools/testing/selftests/bpf/.gitignore
>>> @@ -15,3 +15,4 @@ test_libbpf_open
>>> test_sock
>>> test_sock_addr
>>> urandom_read
>>> +test_btf
>>
>> Against which tree is this? This doesn't apply to bpf-next. It would
>> apply against bpf tree, but c0fa1b6c3efc ("bpf: btf: Add BTF tests")
>> is part of bpf-next, so fits to neither of them.
>
> I'm sorry,
>
> I did it against this patch [1] that I thought was already applied to
> the bpf tree.
That was bpf tree since the original change already went to mainline; the
BTF is in bpf-next however, so you need to rebase your change against that.
Thanks,
Daniel
^ permalink raw reply
* [PATCH 4/4] rhashtable: improve rhashtable_walk stability when stop/start used.
From: NeilBrown @ 2018-04-23 22:29 UTC (permalink / raw)
To: Thomas Graf, Herbert Xu, David Miller; +Cc: netdev, linux-kernel
In-Reply-To: <152452244405.1456.8175298512483573078.stgit@noble>
When a walk of an rhashtable is interrupted with rhastable_walk_stop()
and then rhashtable_walk_start(), the location to restart from is based
on a 'skip' count in the current hash chain, and this can be incorrect
if insertions or deletions have happened. This does not happen when
the walk is not stopped and started as iter->p is a placeholder which
is safe to use while holding the RCU read lock.
In rhashtable_walk_start() we can revalidate that 'p' is still in the
same hash chain. If it isn't then the current method is still used.
With this patch, if a rhashtable walker ensures that the current
object remains in the table over a stop/start period (possibly by
elevating the reference count if that is sufficient), it can be sure
that a walk will not miss objects that were in the hashtable for the
whole time of the walk.
rhashtable_walk_start() may not find the object even though it is
still in the hashtable if a rehash has moved it to a new table. In
this case it will (eventually) get -EAGAIN and will need to proceed
through the whole table again to be sure to see everything at least
once.
Acked-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: NeilBrown <neilb@suse.com>
---
lib/rhashtable.c | 44 +++++++++++++++++++++++++++++++++++++++++---
1 file changed, 41 insertions(+), 3 deletions(-)
diff --git a/lib/rhashtable.c b/lib/rhashtable.c
index 81edf1ab38ab..9427b5766134 100644
--- a/lib/rhashtable.c
+++ b/lib/rhashtable.c
@@ -727,6 +727,7 @@ int rhashtable_walk_start_check(struct rhashtable_iter *iter)
__acquires(RCU)
{
struct rhashtable *ht = iter->ht;
+ bool rhlist = ht->rhlist;
rcu_read_lock();
@@ -735,13 +736,52 @@ int rhashtable_walk_start_check(struct rhashtable_iter *iter)
list_del(&iter->walker.list);
spin_unlock(&ht->lock);
- if (!iter->walker.tbl && !iter->end_of_table) {
+ if (iter->end_of_table)
+ return 0;
+ if (!iter->walker.tbl) {
iter->walker.tbl = rht_dereference_rcu(ht->tbl, ht);
iter->slot = 0;
iter->skip = 0;
return -EAGAIN;
}
+ if (iter->p && !rhlist) {
+ /*
+ * We need to validate that 'p' is still in the table, and
+ * if so, update 'skip'
+ */
+ struct rhash_head *p;
+ int skip = 0;
+ rht_for_each_rcu(p, iter->walker.tbl, iter->slot) {
+ skip++;
+ if (p == iter->p) {
+ iter->skip = skip;
+ goto found;
+ }
+ }
+ iter->p = NULL;
+ } else if (iter->p && rhlist) {
+ /* Need to validate that 'list' is still in the table, and
+ * if so, update 'skip' and 'p'.
+ */
+ struct rhash_head *p;
+ struct rhlist_head *list;
+ int skip = 0;
+ rht_for_each_rcu(p, iter->walker.tbl, iter->slot) {
+ for (list = container_of(p, struct rhlist_head, rhead);
+ list;
+ list = rcu_dereference(list->next)) {
+ skip++;
+ if (list == iter->list) {
+ iter->p = p;
+ skip = skip;
+ goto found;
+ }
+ }
+ }
+ iter->p = NULL;
+ }
+found:
return 0;
}
EXPORT_SYMBOL_GPL(rhashtable_walk_start_check);
@@ -917,8 +957,6 @@ void rhashtable_walk_stop(struct rhashtable_iter *iter)
iter->walker.tbl = NULL;
spin_unlock(&ht->lock);
- iter->p = NULL;
-
out:
rcu_read_unlock();
}
^ permalink raw reply related
* [PATCH 3/4] rhashtable: reset iter when rhashtable_walk_start sees new table
From: NeilBrown @ 2018-04-23 22:29 UTC (permalink / raw)
To: Thomas Graf, Herbert Xu, David Miller; +Cc: netdev, linux-kernel
In-Reply-To: <152452244405.1456.8175298512483573078.stgit@noble>
The documentation claims that when rhashtable_walk_start_check()
detects a resize event, it will rewind back to the beginning
of the table. This is not true. We need to set ->slot and
->skip to be zero for it to be true.
Acked-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: NeilBrown <neilb@suse.com>
---
lib/rhashtable.c | 2 ++
1 file changed, 2 insertions(+)
diff --git a/lib/rhashtable.c b/lib/rhashtable.c
index 6d490f51174e..81edf1ab38ab 100644
--- a/lib/rhashtable.c
+++ b/lib/rhashtable.c
@@ -737,6 +737,8 @@ int rhashtable_walk_start_check(struct rhashtable_iter *iter)
if (!iter->walker.tbl && !iter->end_of_table) {
iter->walker.tbl = rht_dereference_rcu(ht->tbl, ht);
+ iter->slot = 0;
+ iter->skip = 0;
return -EAGAIN;
}
^ permalink raw reply related
* [PATCH 2/4] rhashtable: Revise incorrect comment on r{hl, hash}table_walk_enter()
From: NeilBrown @ 2018-04-23 22:29 UTC (permalink / raw)
To: Thomas Graf, Herbert Xu, David Miller; +Cc: netdev, linux-kernel
In-Reply-To: <152452244405.1456.8175298512483573078.stgit@noble>
Neither rhashtable_walk_enter() or rhltable_walk_enter() sleep, though
they do take a spinlock without irq protection.
So revise the comments to accurately state the contexts in which
these functions can be called.
Acked-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: NeilBrown <neilb@suse.com>
---
include/linux/rhashtable.h | 5 +++--
lib/rhashtable.c | 5 +++--
2 files changed, 6 insertions(+), 4 deletions(-)
diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h
index 87d443a5b11d..4e1f535c2034 100644
--- a/include/linux/rhashtable.h
+++ b/include/linux/rhashtable.h
@@ -1268,8 +1268,9 @@ static inline int rhashtable_walk_init(struct rhashtable *ht,
* For a completely stable walk you should construct your own data
* structure outside the hash table.
*
- * This function may sleep so you must not call it from interrupt
- * context or with spin locks held.
+ * This function may be called from any process context, including
+ * non-preemptable context, but cannot be called from softirq or
+ * hardirq context.
*
* You must call rhashtable_walk_exit after this function returns.
*/
diff --git a/lib/rhashtable.c b/lib/rhashtable.c
index 2b2b79974b61..6d490f51174e 100644
--- a/lib/rhashtable.c
+++ b/lib/rhashtable.c
@@ -668,8 +668,9 @@ EXPORT_SYMBOL_GPL(rhashtable_insert_slow);
* For a completely stable walk you should construct your own data
* structure outside the hash table.
*
- * This function may sleep so you must not call it from interrupt
- * context or with spin locks held.
+ * This function may be called from any process context, including
+ * non-preemptable context, but cannot be called from softirq or
+ * hardirq context.
*
* You must call rhashtable_walk_exit after this function returns.
*/
^ permalink raw reply related
* [PATCH 1/4] rhashtable: remove outdated comments about grow_decision etc
From: NeilBrown @ 2018-04-23 22:29 UTC (permalink / raw)
To: Thomas Graf, Herbert Xu, David Miller; +Cc: netdev, linux-kernel
In-Reply-To: <152452244405.1456.8175298512483573078.stgit@noble>
grow_decision and shink_decision no longer exist, so remove
the remaining references to them.
Acked-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: NeilBrown <neilb@suse.com>
---
include/linux/rhashtable.h | 33 ++++++++++++++-------------------
1 file changed, 14 insertions(+), 19 deletions(-)
diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h
index 1f8ad121eb43..87d443a5b11d 100644
--- a/include/linux/rhashtable.h
+++ b/include/linux/rhashtable.h
@@ -836,9 +836,8 @@ static inline void *__rhashtable_insert_fast(
*
* It is safe to call this function from atomic context.
*
- * Will trigger an automatic deferred table resizing if the size grows
- * beyond the watermark indicated by grow_decision() which can be passed
- * to rhashtable_init().
+ * Will trigger an automatic deferred table resizing if residency in the
+ * table grows beyond 70%.
*/
static inline int rhashtable_insert_fast(
struct rhashtable *ht, struct rhash_head *obj,
@@ -866,9 +865,8 @@ static inline int rhashtable_insert_fast(
*
* It is safe to call this function from atomic context.
*
- * Will trigger an automatic deferred table resizing if the size grows
- * beyond the watermark indicated by grow_decision() which can be passed
- * to rhashtable_init().
+ * Will trigger an automatic deferred table resizing if residency in the
+ * table grows beyond 70%.
*/
static inline int rhltable_insert_key(
struct rhltable *hlt, const void *key, struct rhlist_head *list,
@@ -890,9 +888,8 @@ static inline int rhltable_insert_key(
*
* It is safe to call this function from atomic context.
*
- * Will trigger an automatic deferred table resizing if the size grows
- * beyond the watermark indicated by grow_decision() which can be passed
- * to rhashtable_init().
+ * Will trigger an automatic deferred table resizing if residency in the
+ * table grows beyond 70%.
*/
static inline int rhltable_insert(
struct rhltable *hlt, struct rhlist_head *list,
@@ -922,9 +919,8 @@ static inline int rhltable_insert(
*
* It is safe to call this function from atomic context.
*
- * Will trigger an automatic deferred table resizing if the size grows
- * beyond the watermark indicated by grow_decision() which can be passed
- * to rhashtable_init().
+ * Will trigger an automatic deferred table resizing if residency in the
+ * table grows beyond 70%.
*/
static inline int rhashtable_lookup_insert_fast(
struct rhashtable *ht, struct rhash_head *obj,
@@ -981,9 +977,8 @@ static inline void *rhashtable_lookup_get_insert_fast(
*
* Lookups may occur in parallel with hashtable mutations and resizing.
*
- * Will trigger an automatic deferred table resizing if the size grows
- * beyond the watermark indicated by grow_decision() which can be passed
- * to rhashtable_init().
+ * Will trigger an automatic deferred table resizing if residency in the
+ * table grows beyond 70%.
*
* Returns zero on success.
*/
@@ -1134,8 +1129,8 @@ static inline int __rhashtable_remove_fast(
* walk the bucket chain upon removal. The removal operation is thus
* considerable slow if the hash table is not correctly sized.
*
- * Will automatically shrink the table via rhashtable_expand() if the
- * shrink_decision function specified at rhashtable_init() returns true.
+ * Will automatically shrink the table if permitted when residency drops
+ * below 30%.
*
* Returns zero on success, -ENOENT if the entry could not be found.
*/
@@ -1156,8 +1151,8 @@ static inline int rhashtable_remove_fast(
* walk the bucket chain upon removal. The removal operation is thus
* considerable slow if the hash table is not correctly sized.
*
- * Will automatically shrink the table via rhashtable_expand() if the
- * shrink_decision function specified at rhashtable_init() returns true.
+ * Will automatically shrink the table if permitted when residency drops
+ * below 30%
*
* Returns zero on success, -ENOENT if the entry could not be found.
*/
^ permalink raw reply related
* [PATCH 0/4] A few rhashtables cleanups
From: NeilBrown @ 2018-04-23 22:29 UTC (permalink / raw)
To: Thomas Graf, Herbert Xu, David Miller; +Cc: netdev, linux-kernel
2 patches fixes documentation
1 fixes a bit in rhashtable_walk_start()
1 improves rhashtable_walk stability.
All reviewed and Acked.
Thanks,
NeilBrown
---
NeilBrown (4):
rhashtable: remove outdated comments about grow_decision etc
rhashtable: Revise incorrect comment on r{hl,hash}table_walk_enter()
rhashtable: reset iter when rhashtable_walk_start sees new table
rhashtable: improve rhashtable_walk stability when stop/start used.
include/linux/rhashtable.h | 38 +++++++++++++++------------------
lib/rhashtable.c | 51 ++++++++++++++++++++++++++++++++++++++++----
2 files changed, 63 insertions(+), 26 deletions(-)
--
Signature
^ permalink raw reply
* Re: [PATCH] selftests: bpf: update .gitignore with missing file
From: Anders Roxell @ 2018-04-23 22:14 UTC (permalink / raw)
To: Daniel Borkmann
Cc: ast, Shuah Khan, netdev, Linux Kernel Mailing List,
linux-kselftest
In-Reply-To: <5ea150a6-8af1-3aa8-7ff0-5fdb64054bab@iogearbox.net>
On 23 April 2018 at 23:34, Daniel Borkmann <daniel@iogearbox.net> wrote:
> On 04/23/2018 03:50 PM, Anders Roxell wrote:
>> Fixes: c0fa1b6c3efc ("bpf: btf: Add BTF tests")
>> Signed-off-by: Anders Roxell <anders.roxell@linaro.org>
>> ---
>> tools/testing/selftests/bpf/.gitignore | 1 +
>> 1 file changed, 1 insertion(+)
>>
>> diff --git a/tools/testing/selftests/bpf/.gitignore b/tools/testing/selftests/bpf/.gitignore
>> index 5e1ab2f0eb79..3e3b3ced3f7c 100644
>> --- a/tools/testing/selftests/bpf/.gitignore
>> +++ b/tools/testing/selftests/bpf/.gitignore
>> @@ -15,3 +15,4 @@ test_libbpf_open
>> test_sock
>> test_sock_addr
>> urandom_read
>> +test_btf
>
> Against which tree is this? This doesn't apply to bpf-next. It would
> apply against bpf tree, but c0fa1b6c3efc ("bpf: btf: Add BTF tests")
> is part of bpf-next, so fits to neither of them.
I'm sorry,
I did it against this patch [1] that I thought was already applied to
the bpf tree.
Cheers,
Anders
[1] https://patchwork.kernel.org/patch/10332907/
^ permalink raw reply
* Re: [bpf PATCH v2 1/3] bpf: sockmap, map_release does not hold refcnt for pinned maps
From: Daniel Borkmann @ 2018-04-23 21:53 UTC (permalink / raw)
To: John Fastabend, ast; +Cc: netdev
In-Reply-To: <20180423182929.17999.59712.stgit@john-Precision-Tower-5810>
On 04/23/2018 08:29 PM, John Fastabend wrote:
> Relying on map_release hook to decrement the reference counts when a
> map is removed only works if the map is not being pinned. In the
> pinned case the ref is decremented immediately and the BPF programs
> released. After this BPF programs may not be in-use which is not
> what the user would expect.
>
> This patch moves the release logic into bpf_map_put_uref() and brings
> sockmap in-line with how a similar case is handled in prog array maps.
>
> Fixes: 3d9e952697de ("bpf: sockmap, fix leaking maps with attached but not detached progs")
> Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Patches look good, but one trivial request below.
[...]
> diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
> index 4ca46df..4b70439 100644
> --- a/kernel/bpf/syscall.c
> +++ b/kernel/bpf/syscall.c
> @@ -257,8 +257,8 @@ static void bpf_map_free_deferred(struct work_struct *work)
> static void bpf_map_put_uref(struct bpf_map *map)
> {
> if (atomic_dec_and_test(&map->usercnt)) {
> - if (map->map_type == BPF_MAP_TYPE_PROG_ARRAY)
> - bpf_fd_array_map_clear(map);
> + if (map->ops->map_put_uref)
> + map->ops->map_put_uref(map);
Could you change the callback name into something like 'map_release_uref'?
Naming it 'map_put_uref' is a bit confusing since this is only called when
the uref reference count already dropped to zero, and here we really release
the last reference point to user space. Given this is BPF core infra, would
be nice to still fix this up before applying.
Thanks,
Daniel
^ permalink raw reply
* [PATCH v2] net: qrtr: Expose tunneling endpoint to user space
From: Bjorn Andersson @ 2018-04-23 21:46 UTC (permalink / raw)
To: David S. Miller; +Cc: Chris Lew, linux-kernel, netdev, linux-arm-msm
This implements a misc character device named "qrtr-tun" for the purpose
of allowing user space applications to implement endpoints in the qrtr
network.
This allows more advanced (and dynamic) testing of the qrtr code as well
as opens up the ability of tunneling qrtr over a network or USB link.
Signed-off-by: Bjorn Andersson <bjorn.andersson@linaro.org>
---
Changes since v1:
- Dropped queue lock
net/qrtr/Kconfig | 7 +++
net/qrtr/Makefile | 2 +
net/qrtr/tun.c | 146 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
3 files changed, 155 insertions(+)
create mode 100644 net/qrtr/tun.c
diff --git a/net/qrtr/Kconfig b/net/qrtr/Kconfig
index 326fd97444f5..1944834d225c 100644
--- a/net/qrtr/Kconfig
+++ b/net/qrtr/Kconfig
@@ -21,4 +21,11 @@ config QRTR_SMD
Say Y here to support SMD based ipcrouter channels. SMD is the
most common transport for IPC Router.
+config QRTR_TUN
+ tristate "TUN device for Qualcomm IPC Router"
+ ---help---
+ Say Y here to expose a character device that allows user space to
+ implement endpoints of QRTR, for purpose of tunneling data to other
+ hosts or testing purposes.
+
endif # QRTR
diff --git a/net/qrtr/Makefile b/net/qrtr/Makefile
index ab09e40f7c74..be012bfd3e52 100644
--- a/net/qrtr/Makefile
+++ b/net/qrtr/Makefile
@@ -2,3 +2,5 @@ obj-$(CONFIG_QRTR) := qrtr.o
obj-$(CONFIG_QRTR_SMD) += qrtr-smd.o
qrtr-smd-y := smd.o
+obj-$(CONFIG_QRTR_TUN) += qrtr-tun.o
+qrtr-tun-y := tun.o
diff --git a/net/qrtr/tun.c b/net/qrtr/tun.c
new file mode 100644
index 000000000000..48b2147c98f8
--- /dev/null
+++ b/net/qrtr/tun.c
@@ -0,0 +1,146 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2018, Linaro Ltd */
+
+#include <linux/miscdevice.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/uaccess.h>
+
+#include "qrtr.h"
+
+struct qrtr_tun {
+ struct qrtr_endpoint ep;
+
+ struct sk_buff_head queue;
+ wait_queue_head_t readq;
+};
+
+static int qrtr_tun_send(struct qrtr_endpoint *ep, struct sk_buff *skb)
+{
+ struct qrtr_tun *tun = container_of(ep, struct qrtr_tun, ep);
+
+ skb_queue_tail(&tun->queue, skb);
+
+ /* wake up any blocking processes, waiting for new data */
+ wake_up_interruptible(&tun->readq);
+
+ return 0;
+}
+
+static int qrtr_tun_open(struct inode *inode, struct file *filp)
+{
+ struct qrtr_tun *tun;
+
+ tun = kzalloc(sizeof(*tun), GFP_KERNEL);
+ if (!tun)
+ return -ENOMEM;
+
+ skb_queue_head_init(&tun->queue);
+ init_waitqueue_head(&tun->readq);
+
+ tun->ep.xmit = qrtr_tun_send;
+
+ filp->private_data = tun;
+
+ return qrtr_endpoint_register(&tun->ep, QRTR_EP_NID_AUTO);
+}
+
+static ssize_t qrtr_tun_read_iter(struct kiocb *iocb, struct iov_iter *to)
+{
+ struct file *filp = iocb->ki_filp;
+ struct qrtr_tun *tun = filp->private_data;
+ struct sk_buff *skb;
+ int count;
+
+ while (!(skb = skb_dequeue(&tun->queue))) {
+ if (filp->f_flags & O_NONBLOCK)
+ return -EAGAIN;
+
+ /* Wait until we get data or the endpoint goes away */
+ if (wait_event_interruptible(tun->readq,
+ !skb_queue_empty(&tun->queue)))
+ return -ERESTARTSYS;
+ }
+
+ count = min_t(size_t, iov_iter_count(to), skb->len);
+ if (copy_to_iter(skb->data, count, to) != count)
+ count = -EFAULT;
+
+ kfree_skb(skb);
+
+ return count;
+}
+
+static ssize_t qrtr_tun_write_iter(struct kiocb *iocb, struct iov_iter *from)
+{
+ struct file *filp = iocb->ki_filp;
+ struct qrtr_tun *tun = filp->private_data;
+ size_t len = iov_iter_count(from);
+ ssize_t ret;
+ void *kbuf;
+
+ kbuf = kzalloc(len, GFP_KERNEL);
+ if (!kbuf)
+ return -ENOMEM;
+
+ if (!copy_from_iter_full(kbuf, len, from))
+ return -EFAULT;
+
+ ret = qrtr_endpoint_post(&tun->ep, kbuf, len);
+
+ return ret < 0 ? ret : len;
+}
+
+static int qrtr_tun_release(struct inode *inode, struct file *filp)
+{
+ struct qrtr_tun *tun = filp->private_data;
+ struct sk_buff *skb;
+
+ qrtr_endpoint_unregister(&tun->ep);
+
+ /* Discard all SKBs */
+ while (!skb_queue_empty(&tun->queue)) {
+ skb = skb_dequeue(&tun->queue);
+ kfree_skb(skb);
+ }
+
+ kfree(tun);
+
+ return 0;
+}
+
+static const struct file_operations qrtr_tun_ops = {
+ .owner = THIS_MODULE,
+ .open = qrtr_tun_open,
+ .read_iter = qrtr_tun_read_iter,
+ .write_iter = qrtr_tun_write_iter,
+ .release = qrtr_tun_release,
+};
+
+static struct miscdevice qrtr_tun_miscdev = {
+ MISC_DYNAMIC_MINOR,
+ "qrtr-tun",
+ &qrtr_tun_ops,
+};
+
+static int __init qrtr_tun_init(void)
+{
+ int ret;
+
+ ret = misc_register(&qrtr_tun_miscdev);
+ if (ret)
+ pr_err("failed to register Qualcomm IPC Router tun device\n");
+
+ return ret;
+}
+
+static void __exit qrtr_tun_exit(void)
+{
+ misc_deregister(&qrtr_tun_miscdev);
+}
+
+module_init(qrtr_tun_init);
+module_exit(qrtr_tun_exit);
+
+MODULE_DESCRIPTION("Qualcomm IPC Router TUN device");
+MODULE_LICENSE("GPL v2");
--
2.16.2
^ permalink raw reply related
* [PATCH net-next] tcp: md5: only call tp->af_specific->md5_lookup() for md5 sockets
From: Eric Dumazet @ 2018-04-23 21:46 UTC (permalink / raw)
To: David S . Miller; +Cc: netdev, Eric Dumazet, Eric Dumazet
RETPOLINE made calls to tp->af_specific->md5_lookup() quite expensive,
given they have no result.
We can omit the calls for sockets that have no md5 keys.
Signed-off-by: Eric Dumazet <edumazet@google.com>
---
net/ipv4/tcp_output.c | 26 ++++++++++++++------------
1 file changed, 14 insertions(+), 12 deletions(-)
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 383cac0ff0ec059ca7dbc1a6304cc7f8183e008d..95feffb6d53f8a9eadfb15a2fffeec498d6e993a 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -585,14 +585,15 @@ static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb,
unsigned int remaining = MAX_TCP_OPTION_SPACE;
struct tcp_fastopen_request *fastopen = tp->fastopen_req;
+ *md5 = NULL;
#ifdef CONFIG_TCP_MD5SIG
- *md5 = tp->af_specific->md5_lookup(sk, sk);
- if (*md5) {
- opts->options |= OPTION_MD5;
- remaining -= TCPOLEN_MD5SIG_ALIGNED;
+ if (unlikely(rcu_access_pointer(tp->md5sig_info))) {
+ *md5 = tp->af_specific->md5_lookup(sk, sk);
+ if (*md5) {
+ opts->options |= OPTION_MD5;
+ remaining -= TCPOLEN_MD5SIG_ALIGNED;
+ }
}
-#else
- *md5 = NULL;
#endif
/* We always get an MSS option. The option bytes which will be seen in
@@ -720,14 +721,15 @@ static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb
opts->options = 0;
+ *md5 = NULL;
#ifdef CONFIG_TCP_MD5SIG
- *md5 = tp->af_specific->md5_lookup(sk, sk);
- if (unlikely(*md5)) {
- opts->options |= OPTION_MD5;
- size += TCPOLEN_MD5SIG_ALIGNED;
+ if (unlikely(rcu_access_pointer(tp->md5sig_info))) {
+ *md5 = tp->af_specific->md5_lookup(sk, sk);
+ if (*md5) {
+ opts->options |= OPTION_MD5;
+ size += TCPOLEN_MD5SIG_ALIGNED;
+ }
}
-#else
- *md5 = NULL;
#endif
if (likely(tp->rx_opt.tstamp_ok)) {
--
2.17.0.484.g0c8726318c-goog
^ permalink raw reply related
* Re: [bpf PATCH 1/2] bpf: Document sockmap '-target bpf' requirement for PROG_TYPE_SK_MSG
From: Daniel Borkmann @ 2018-04-23 21:44 UTC (permalink / raw)
To: John Fastabend, ast; +Cc: netdev
In-Reply-To: <20180423191102.21348.85601.stgit@john-Precision-Tower-5810>
Applied both to bpf tree, thanks John!
^ permalink raw reply
* dev_loopback_xmit parameters
From: Emanuele @ 2018-04-23 21:40 UTC (permalink / raw)
To: Linux Kernel Network Developers
Hello,
I don't know if this is the right place where to ask, but I was
wondering why the dev_loopback_xmit function defined in /net/core/dev.c
takes struct net * and struct sock * as parameters. They are never
used, so I believe passing only the struct sk_buff * should be enough.
In addition, it would like to know where I can read what is and how to
set a skb dst_entry, since I don't really understand it.
Thanks a lot,
Emanuele
^ permalink raw reply
* Re: WARNING in refcount_dec
From: Willem de Bruijn @ 2018-04-23 21:38 UTC (permalink / raw)
To: DaeRyong Jeong
Cc: Cong Wang, Byoungyoung Lee, LKML, Kyungtae Kim,
Linux Kernel Network Developers, Willem de Bruijn
In-Reply-To: <CAF=yD-JR1Bg1FLat41Uua9vi1-ALyuQttChAWyWz0Yx6_01_BA@mail.gmail.com>
On Thu, Apr 19, 2018 at 2:55 PM, Willem de Bruijn
<willemdebruijn.kernel@gmail.com> wrote:
> On Thu, Apr 19, 2018 at 2:32 AM, DaeRyong Jeong <threeearcat@gmail.com> wrote:
>> Hello.
>> We have analyzed the cause of the crash in v4.16-rc3, WARNING in refcount_dec,
>> which is found by RaceFuzzer (a modified version of Syzkaller).
>>
>> Since struct packet_sock's member variables, running, has_vnet_hdr, origdev
>> and auxdata are declared as bitfields, accessing these variables can race if
>> there is no synchronization mechanism.
>
> Great catch.
>
> These fields po->{running, auxdata, origdev, has_vnet_hdr} are
> accessed without a uniform locking strategy.
>
> po->running is always accessed with po->bind_lock held (with the
> exception of reading in packet_seq_show, but that is best effort).
>
> That is the only field written to outside setsockopt. If it is moved to
> a separate word, it will no longer interfere with the others.
>
> The other fields are read lockless in the various recv and send
> functions, but only set in setsockopt. We've had enough
> locking bugs around setsockopt that I suggest we wrap all of
> those in lock_sock, like the example I gave before for
> has_vnet_hdr.
Sent http://patchwork.ozlabs.org/patch/903190/
^ permalink raw reply
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox