Netdev List

Netdev List
 help / color / mirror / Atom feed

* Re: [RFC PATCH ghak32 V2 11/13] audit: add support for containerid to network namespaces
From: Richard Guy Briggs @ 2018-04-20 20:02 UTC (permalink / raw)
  To: Paul Moore
  Cc: cgroups, containers, linux-api, Linux-Audit Mailing List,
	linux-fsdevel, LKML, netdev, ebiederm, luto, jlayton, carlos,
	dhowells, viro, simo, Eric Paris, serge
In-Reply-To: <CAHC9VhRkstDMjd5T3w+iOUDjzDAs1AOm0xd3p6v_xn6fNGYQhA@mail.gmail.com>

On 2018-04-18 21:46, Paul Moore wrote:
> On Fri, Mar 16, 2018 at 5:00 AM, Richard Guy Briggs <rgb@redhat.com> wrote:
> > Audit events could happen in a network namespace outside of a task
> > context due to packets received from the net that trigger an auditing
> > rule prior to being associated with a running task.  The network
> > namespace could be in use by multiple containers by association to the
> > tasks in that network namespace.  We still want a way to attribute
> > these events to any potential containers.  Keep a list per network
> > namespace to track these container identifiers.
> >
> > Add/increment the container identifier on:
> > - initial setting of the container id via /proc
> > - clone/fork call that inherits a container identifier
> > - unshare call that inherits a container identifier
> > - setns call that inherits a container identifier
> > Delete/decrement the container identifier on:
> > - an inherited container id dropped when child set
> > - process exit
> > - unshare call that drops a net namespace
> > - setns call that drops a net namespace
> >
> > See: https://github.com/linux-audit/audit-kernel/issues/32
> > See: https://github.com/linux-audit/audit-testsuite/issues/64
> > Signed-off-by: Richard Guy Briggs <rgb@redhat.com>
> > ---
> >  include/linux/audit.h       |  7 +++++++
> >  include/net/net_namespace.h | 12 ++++++++++++
> >  kernel/auditsc.c            |  9 ++++++---
> >  kernel/nsproxy.c            |  6 ++++++
> >  net/core/net_namespace.c    | 45 +++++++++++++++++++++++++++++++++++++++++++++
> >  5 files changed, 76 insertions(+), 3 deletions(-)
> 
> ...
> 
> > diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h
> > index 0490084..343a428 100644
> > --- a/include/net/net_namespace.h
> > +++ b/include/net/net_namespace.h
> > @@ -33,6 +33,7 @@
> >  #include <linux/ns_common.h>
> >  #include <linux/idr.h>
> >  #include <linux/skbuff.h>
> > +#include <linux/audit.h>
> >
> >  struct user_namespace;
> >  struct proc_dir_entry;
> > @@ -150,6 +151,7 @@ struct net {
> >  #endif
> >         struct sock             *diag_nlsk;
> >         atomic_t                fnhe_genid;
> > +       struct list_head        audit_containerid;
> >  } __randomize_layout;
> 
> We talked about this briefly off-list, you should be using audit_net
> and the net_generic mechanism instead of this.
> 
> >  #include <linux/seq_file_net.h>
> > @@ -301,6 +303,16 @@ static inline struct net *read_pnet(const possible_net_t *pnet)
> >  #define __net_initconst        __initconst
> >  #endif
> >
> > +#ifdef CONFIG_NET_NS
> > +void net_add_audit_containerid(struct net *net, u64 containerid);
> > +void net_del_audit_containerid(struct net *net, u64 containerid);
> > +#else
> > +static inline void net_add_audit_containerid(struct net *, u64)
> > +{ }
> > +static inline void net_del_audit_containerid(struct net *, u64)
> > +{ }
> > +#endif
> > +
> >  int peernet2id_alloc(struct net *net, struct net *peer);
> >  int peernet2id(struct net *net, struct net *peer);
> >  bool peernet_has_id(struct net *net, struct net *peer);
> > diff --git a/kernel/auditsc.c b/kernel/auditsc.c
> > index 2f02ed9..208da962 100644
> > --- a/kernel/auditsc.c
> > +++ b/kernel/auditsc.c
> > @@ -75,6 +75,7 @@
> >  #include <linux/uaccess.h>
> >  #include <linux/fsnotify_backend.h>
> >  #include <uapi/linux/limits.h>
> > +#include <net/net_namespace.h>
> >
> >  #include "audit.h"
> >
> > @@ -2175,16 +2176,18 @@ static void audit_log_set_containerid(struct task_struct *task, u64 oldcontainer
> >   */
> >  int audit_set_containerid(struct task_struct *task, u64 containerid)
> >  {
> > -       u64 oldcontainerid;
> > +       u64 oldcontainerid = audit_get_containerid(task);
> >         int rc;
> > -
> > -       oldcontainerid = audit_get_containerid(task);
> > +       struct net *net = task->nsproxy->net_ns;
> >
> >         rc = audit_set_containerid_perm(task, containerid);
> >         if (!rc) {
> > +               if (cid_valid(oldcontainerid))
> > +                       net_del_audit_containerid(net, oldcontainerid);
> 
> Using audit_net we can handle this internal to audit, which is a Good Thing.

No problem, done.

> >                 task_lock(task);
> >                 task->containerid = containerid;
> >                 task_unlock(task);
> > +               net_add_audit_containerid(net, containerid);
> 
> Same.
> 
> >         }
> >
> >         audit_log_set_containerid(task, oldcontainerid, containerid, rc);
> > diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
> > index f6c5d33..d9f1090 100644
> > --- a/kernel/nsproxy.c
> > +++ b/kernel/nsproxy.c
> > @@ -140,6 +140,7 @@ int copy_namespaces(unsigned long flags, struct task_struct *tsk)
> >         struct nsproxy *old_ns = tsk->nsproxy;
> >         struct user_namespace *user_ns = task_cred_xxx(tsk, user_ns);
> >         struct nsproxy *new_ns;
> > +       u64 containerid = audit_get_containerid(tsk);
> >
> >         if (likely(!(flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC |
> >                               CLONE_NEWPID | CLONE_NEWNET |
> > @@ -167,6 +168,7 @@ int copy_namespaces(unsigned long flags, struct task_struct *tsk)
> >                 return  PTR_ERR(new_ns);
> >
> >         tsk->nsproxy = new_ns;
> > +       net_add_audit_containerid(new_ns->net_ns, containerid);
> >         return 0;
> >  }
> 
> Hopefully we can handle this in audit_net_init(), we just need to
> figure out where we can get the correct task_struct for the audit
> container ID (some backpointer in the net struct?).

I don't follow.  This needs to happen on every task startup.
audit_net_init() is only called when a new network namespace starts up.

> > @@ -217,6 +219,7 @@ int unshare_nsproxy_namespaces(unsigned long unshare_flags,
> >  void switch_task_namespaces(struct task_struct *p, struct nsproxy *new)
> >  {
> >         struct nsproxy *ns;
> > +       u64 containerid = audit_get_containerid(p);
> >
> >         might_sleep();
> >
> > @@ -224,6 +227,9 @@ void switch_task_namespaces(struct task_struct *p, struct nsproxy *new)
> >         ns = p->nsproxy;
> >         p->nsproxy = new;
> >         task_unlock(p);
> > +       net_del_audit_containerid(ns->net_ns, containerid);
> > +       if (new)
> > +               net_add_audit_containerid(new->net_ns, containerid);
> 
> Okay, we might need a hook here for switching namespaces, but I would
> much rather it be a generic audit hook that calls directly into audit.

Trivial, done.

> paul moore

- RGB

--
Richard Guy Briggs <rgb@redhat.com>
Sr. S/W Engineer, Kernel Security, Base Operating Systems
Remote, Ottawa, Red Hat Canada
IRC: rgb, SunRaycer
Voice: +1.647.777.2635, Internal: (81) 32635

^ permalink raw reply

* Re: [PATCH RFC net-next 00/11] udp gso
From: Alexander Duyck @ 2018-04-20 20:08 UTC (permalink / raw)
  To: Tushar Dave
  Cc: David Miller, Sowmini Varadhan, Willem de Bruijn,
	Samudrala, Sridhar, Netdev, Willem de Bruijn
In-Reply-To: <5195a7d9-7b7d-044b-0031-6fa13c0fe48a@oracle.com>

On Fri, Apr 20, 2018 at 11:27 AM, Tushar Dave <tushar.n.dave@oracle.com> wrote:
>
>
> On 04/18/2018 11:12 AM, Alexander Duyck wrote:
>>
>> On Wed, Apr 18, 2018 at 10:28 AM, David Miller <davem@davemloft.net>
>> wrote:
>>>
>>> From: Sowmini Varadhan <sowmini.varadhan@oracle.com>
>>> Date: Wed, 18 Apr 2018 08:31:03 -0400
>>>
>>>> However, I share Sridhar's concerns about the very fundamental change
>>>> to UDP message boundary semantics here.  There is actually no such thing
>>>> as a "segment" in udp, so in general this feature makes me a little
>>>> uneasy.  Well behaved udp applications should already be sending mtu
>>>> sized datagrams. And the not-so-well-behaved ones are probably relying
>>>> on IP fragmentation/reassembly to take care of datagram boundary
>>>> semantics
>>>> for them?
>>>>
>>>> As Sridhar points out, the feature is not really "negotiated" - one side
>>>> unilaterally sets the option. If the receiver is a classic/POSIX UDP
>>>> implementation, it will have no way of knowing that message boundaries
>>>> have been re-adjusted at the sender.
>>>
>>>
>>> There are no "semantics".
>>>
>>> What ends up on the wire is the same before the kernel/app changes as
>>> afterwards.
>>>
>>> The only difference is that instead of the application doing N - 1
>>> sendmsg() calls with mtu sized writes, it's giving everything all at
>>> once and asking the kernel to segment.
>>>
>>> It even gives the application control over the size of the packets,
>>> which I think is completely prudent in this situation.
>>
>>
>> My only concern with the patch set is verifying what mitigations are
>> in place so that we aren't trying to set an MSS size that results in a
>> frame larger than MTU. I'm still digging through the code and trying
>> to grok it, but I figured I might just put the question out there to
>> make my reviewing easier.
>>
>> Also any plans for HW offload support for this? I vaguely recall that
>> the igb and ixgbe parts had support for something like this in
>> hardware. I would have to double check to see what exactly is
>> supported.
>
>
> Alex,
>
> If by HW support you meant UFO (UDP Fragmentation Offload), then I
> dug into that last year using ixgbe. And I found that Intel 10G HW does
> break large UDP packets into MTU size however it does not generate
> *true* IP fragments. Instead, when large (> MTU) size UDP packet is
> given to NIC, HW generates unique UDP packets with distinct IP
> fragments. This makes it impossible for receiving station to reassemble
> them into one UDP packet.
>
> I am not sure about igb!
>
> -Tushar

Tushar,

I am not sure you have been following this thread, but this is about
adding UDP GSO support, not fragmentation offload. With GSO support
the UDP frames are not expected to be reassembled; they are meant to be
handled as individual frames.

What you have described is why I am interested. This patch set adds
support for GSO segmentation, not fragmentation.

Thanks.

- Alex

^ permalink raw reply

* Re: [PATCH RFC net-next 00/11] udp gso
From: Tushar Dave @ 2018-04-21  3:11 UTC (permalink / raw)
  To: Alexander Duyck
  Cc: David Miller, Sowmini Varadhan, Willem de Bruijn,
	Samudrala, Sridhar, Netdev, Willem de Bruijn
In-Reply-To: <CAKgT0UddxeAR2Sf4gtHOyN6o7+9p02xFnwYB0ceYxBCV4iTi-w@mail.gmail.com>



On 04/20/2018 01:08 PM, Alexander Duyck wrote:
> On Fri, Apr 20, 2018 at 11:27 AM, Tushar Dave <tushar.n.dave@oracle.com> wrote:
>>
>>
>> On 04/18/2018 11:12 AM, Alexander Duyck wrote:
>>>
>>> On Wed, Apr 18, 2018 at 10:28 AM, David Miller <davem@davemloft.net>
>>> wrote:
>>>>
>>>> From: Sowmini Varadhan <sowmini.varadhan@oracle.com>
>>>> Date: Wed, 18 Apr 2018 08:31:03 -0400
>>>>
>>>>> However, I share Sridhar's concerns about the very fundamental change
>>>>> to UDP message boundary semantics here.  There is actually no such thing
>>>>> as a "segment" in udp, so in general this feature makes me a little
>>>>> uneasy.  Well behaved udp applications should already be sending mtu
>>>>> sized datagrams. And the not-so-well-behaved ones are probably relying
>>>>> on IP fragmentation/reassembly to take care of datagram boundary
>>>>> semantics
>>>>> for them?
>>>>>
>>>>> As Sridhar points out, the feature is not really "negotiated" - one side
>>>>> unilaterally sets the option. If the receiver is a classic/POSIX UDP
>>>>> implementation, it will have no way of knowing that message boundaries
>>>>> have been re-adjusted at the sender.
>>>>
>>>>
>>>> There are no "semantics".
>>>>
>>>> What ends up on the wire is the same before the kernel/app changes as
>>>> afterwards.
>>>>
>>>> The only difference is that instead of the application doing N - 1
>>>> sendmsg() calls with mtu sized writes, it's giving everything all at
>>>> once and asking the kernel to segment.
>>>>
>>>> It even gives the application control over the size of the packets,
>>>> which I think is completely prudent in this situation.
>>>
>>>
>>> My only concern with the patch set is verifying what mitigations are
>>> in place so that we aren't trying to set an MSS size that results in a
>>> frame larger than MTU. I'm still digging through the code and trying
>>> to grok it, but I figured I might just put the question out there to
>>> make my reviewing easier.
>>>
>>> Also any plans for HW offload support for this? I vaguely recall that
>>> the igb and ixgbe parts had support for something like this in
>>> hardware. I would have to double check to see what exactly is
>>> supported.
>>
>>
>> Alex,
>>
>> If by HW support you meant UFO (UDP Fragmentation Offload), then I
>> dug into that last year using ixgbe. And I found that Intel 10G HW does
>> break large UDP packets into MTU size however it does not generate
>> *true* IP fragments. Instead, when large (> MTU) size UDP packet is
>> given to NIC, HW generates unique UDP packets with distinct IP
>> fragments. This makes it impossible for receiving station to reassemble
>> them into one UDP packet.
>>
>> I am not sure about igb!
>>
>> -Tushar
> 
> Tushar,
> 
> I am not sure you have been following this thread, but this is about
> adding UDP GSO support, not fragmentation offload. With GSO support
> the UDP frames are not expected to be reassembled; they are meant to be
> handled as individual frames.
> 
> What you have described is why I am interested. This patch set adds
> support for GSO segmentation, not fragmentation.
I see. Never mind.
Thanks.

-Tushar
> 
> Thanks.
> 
> - Alex
> 

^ permalink raw reply

* Re: [PATCH net] bpf: sockmap remove dead check
From: Daniel Borkmann @ 2018-04-20 20:13 UTC (permalink / raw)
  To: Jann Horn, ast, netdev, linux-kernel, john.fastabend
In-Reply-To: <20180420161630.233178-1-jannh@google.com>

On 04/20/2018 06:16 PM, Jann Horn wrote:
> Remove dead code that bails on `attr->value_size > KMALLOC_MAX_SIZE` - the
> previous check already bails on `attr->value_size != 4`.
> 
> Signed-off-by: Jann Horn <jannh@google.com>

Applied to bpf tree, thanks Jann!

^ permalink raw reply

* Re: [RFC PATCH ghak32 V2 11/13] audit: add support for containerid to network namespaces
From: Paul Moore @ 2018-04-20 20:22 UTC (permalink / raw)
  To: Richard Guy Briggs
  Cc: cgroups, containers, linux-api, Linux-Audit Mailing List,
	linux-fsdevel, LKML, netdev, ebiederm, luto, jlayton, carlos,
	dhowells, viro, simo, Eric Paris, serge
In-Reply-To: <20180420200226.7tyxzuovdbgclw3m@madcap2.tricolour.ca>

On Fri, Apr 20, 2018 at 4:02 PM, Richard Guy Briggs <rgb@redhat.com> wrote:
> On 2018-04-18 21:46, Paul Moore wrote:
>> On Fri, Mar 16, 2018 at 5:00 AM, Richard Guy Briggs <rgb@redhat.com> wrote:
>> > Audit events could happen in a network namespace outside of a task
>> > context due to packets received from the net that trigger an auditing
>> > rule prior to being associated with a running task.  The network
>> > namespace could be in use by multiple containers by association to the
>> > tasks in that network namespace.  We still want a way to attribute
>> > these events to any potential containers.  Keep a list per network
>> > namespace to track these container identifiers.
>> >
>> > Add/increment the container identifier on:
>> > - initial setting of the container id via /proc
>> > - clone/fork call that inherits a container identifier
>> > - unshare call that inherits a container identifier
>> > - setns call that inherits a container identifier
>> > Delete/decrement the container identifier on:
>> > - an inherited container id dropped when child set
>> > - process exit
>> > - unshare call that drops a net namespace
>> > - setns call that drops a net namespace
>> >
>> > See: https://github.com/linux-audit/audit-kernel/issues/32
>> > See: https://github.com/linux-audit/audit-testsuite/issues/64
>> > Signed-off-by: Richard Guy Briggs <rgb@redhat.com>
>> > ---
>> >  include/linux/audit.h       |  7 +++++++
>> >  include/net/net_namespace.h | 12 ++++++++++++
>> >  kernel/auditsc.c            |  9 ++++++---
>> >  kernel/nsproxy.c            |  6 ++++++
>> >  net/core/net_namespace.c    | 45 +++++++++++++++++++++++++++++++++++++++++++++
>> >  5 files changed, 76 insertions(+), 3 deletions(-)

...

>> > diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
>> > index f6c5d33..d9f1090 100644
>> > --- a/kernel/nsproxy.c
>> > +++ b/kernel/nsproxy.c
>> > @@ -140,6 +140,7 @@ int copy_namespaces(unsigned long flags, struct task_struct *tsk)
>> >         struct nsproxy *old_ns = tsk->nsproxy;
>> >         struct user_namespace *user_ns = task_cred_xxx(tsk, user_ns);
>> >         struct nsproxy *new_ns;
>> > +       u64 containerid = audit_get_containerid(tsk);
>> >
>> >         if (likely(!(flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC |
>> >                               CLONE_NEWPID | CLONE_NEWNET |
>> > @@ -167,6 +168,7 @@ int copy_namespaces(unsigned long flags, struct task_struct *tsk)
>> >                 return  PTR_ERR(new_ns);
>> >
>> >         tsk->nsproxy = new_ns;
>> > +       net_add_audit_containerid(new_ns->net_ns, containerid);
>> >         return 0;
>> >  }
>>
>> Hopefully we can handle this in audit_net_init(), we just need to
>> figure out where we can get the correct task_struct for the audit
>> container ID (some backpointer in the net struct?).
>
> I don't follow.  This needs to happen on every task startup.
> audit_net_init() is only called when a new network namespace starts up.

Yep, sorry, my mistake.  I must have confused myself when I was
looking at the code.

I'm thinking out loud here, bear with me ...

Assuming we move the netns/audit-container-ID tracking to audit_net,
and considering we already have an audit hook in copy_process() (it
calls audit_alloc()), would this be better handled by the
copy_process() hook?  This ignores naming, audit_alloc() reuse, etc.;
those can be easily fixed.  I'm just thinking of ways to limit our
impact on the core kernel and leverage our existing interaction
points.

-- 
paul moore
www.paul-moore.com

^ permalink raw reply

* Re: [RFC PATCH ghak32 V2 11/13] audit: add support for containerid to network namespaces
From: Richard Guy Briggs @ 2018-04-20 20:42 UTC (permalink / raw)
  To: Paul Moore
  Cc: simo, jlayton, carlos, linux-api, containers, LKML, Eric Paris,
	dhowells, Linux-Audit Mailing List, ebiederm, luto, netdev,
	linux-fsdevel, cgroups, serge, viro
In-Reply-To: <CAHC9VhTOYUAyCJidm99som6FVmjouQUGsEHarQ4h_NhwJxQQfw@mail.gmail.com>

On 2018-04-20 16:22, Paul Moore wrote:
> On Fri, Apr 20, 2018 at 4:02 PM, Richard Guy Briggs <rgb@redhat.com> wrote:
> > On 2018-04-18 21:46, Paul Moore wrote:
> >> On Fri, Mar 16, 2018 at 5:00 AM, Richard Guy Briggs <rgb@redhat.com> wrote:
> >> > Audit events could happen in a network namespace outside of a task
> >> > context due to packets received from the net that trigger an auditing
> >> > rule prior to being associated with a running task.  The network
> >> > namespace could be in use by multiple containers by association to the
> >> > tasks in that network namespace.  We still want a way to attribute
> >> > these events to any potential containers.  Keep a list per network
> >> > namespace to track these container identifiers.
> >> >
> >> > Add/increment the container identifier on:
> >> > - initial setting of the container id via /proc
> >> > - clone/fork call that inherits a container identifier
> >> > - unshare call that inherits a container identifier
> >> > - setns call that inherits a container identifier
> >> > Delete/decrement the container identifier on:
> >> > - an inherited container id dropped when child set
> >> > - process exit
> >> > - unshare call that drops a net namespace
> >> > - setns call that drops a net namespace
> >> >
> >> > See: https://github.com/linux-audit/audit-kernel/issues/32
> >> > See: https://github.com/linux-audit/audit-testsuite/issues/64
> >> > Signed-off-by: Richard Guy Briggs <rgb@redhat.com>
> >> > ---
> >> >  include/linux/audit.h       |  7 +++++++
> >> >  include/net/net_namespace.h | 12 ++++++++++++
> >> >  kernel/auditsc.c            |  9 ++++++---
> >> >  kernel/nsproxy.c            |  6 ++++++
> >> >  net/core/net_namespace.c    | 45 +++++++++++++++++++++++++++++++++++++++++++++
> >> >  5 files changed, 76 insertions(+), 3 deletions(-)
> 
> ...
> 
> >> > diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
> >> > index f6c5d33..d9f1090 100644
> >> > --- a/kernel/nsproxy.c
> >> > +++ b/kernel/nsproxy.c
> >> > @@ -140,6 +140,7 @@ int copy_namespaces(unsigned long flags, struct task_struct *tsk)
> >> >         struct nsproxy *old_ns = tsk->nsproxy;
> >> >         struct user_namespace *user_ns = task_cred_xxx(tsk, user_ns);
> >> >         struct nsproxy *new_ns;
> >> > +       u64 containerid = audit_get_containerid(tsk);
> >> >
> >> >         if (likely(!(flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC |
> >> >                               CLONE_NEWPID | CLONE_NEWNET |
> >> > @@ -167,6 +168,7 @@ int copy_namespaces(unsigned long flags, struct task_struct *tsk)
> >> >                 return  PTR_ERR(new_ns);
> >> >
> >> >         tsk->nsproxy = new_ns;
> >> > +       net_add_audit_containerid(new_ns->net_ns, containerid);
> >> >         return 0;
> >> >  }
> >>
> >> Hopefully we can handle this in audit_net_init(), we just need to
> >> figure out where we can get the correct task_struct for the audit
> >> container ID (some backpointer in the net struct?).
> >
> > I don't follow.  This needs to happen on every task startup.
> > audit_net_init() is only called when a new network namespace starts up.
> 
> Yep, sorry, my mistake.  I must have confused myself when I was
> looking at the code.
> 
> I'm thinking out loud here, bear with me ...
> 
> Assuming we move the netns/audit-container-ID tracking to audit_net,
> and considering we already have an audit hook in copy_process() (it
> calls audit_alloc()), would this be better handled by the
> copy_process() hook?  This ignores naming, audit_alloc() reuse, etc.;
> those can be easily fixed.  I'm just thinking of ways to limit our
> impact on the core kernel and leverage our existing interaction
> points.

The new namespace hasn't been cloned yet and this is the only function
where we have access to both namespaces, so I don't see how that could
work...

> paul moore

- RGB

--
Richard Guy Briggs <rgb@redhat.com>
Sr. S/W Engineer, Kernel Security, Base Operating Systems
Remote, Ottawa, Red Hat Canada
IRC: rgb, SunRaycer
Voice: +1.647.777.2635, Internal: (81) 32635

^ permalink raw reply

* Re: [PATCH RFC net-next] net: ipvs: Adjust gso_size for IPPROTO_TCP
From: Julian Anastasov @ 2018-04-20 20:43 UTC (permalink / raw)
  To: Martin KaFai Lau
  Cc: netdev, Tom Herbert, Eric Dumazet, Nikita Shirokov, kernel-team
In-Reply-To: <20180419212324.1542504-1-kafai@fb.com>


	Hello,

On Thu, 19 Apr 2018, Martin KaFai Lau wrote:

> This patch is not a proper fix and mainly serves for discussion purpose.
> It is based on net-next which I have been using to debug the issue.
> 
> The change that works around the issue is in ensure_mtu_is_adequate().
> Other changes are the rippling effect in function arg.
> 
> This bug was uncovered by one of our legacy services that
> is still using ipvs for load balancing.  In that setup,
> ipvs encapsulates the ipv6-tcp packet in another ipv6 hdr
> before transmitting it out to eth0.
> 
> The problem is the kernel stack could pass a skb (which was
> originated from a sys_write(tcp_fd)) to the driver with skb->len
> bigger than the device MTU.  In one NIC setup (with gso and tso off)
> that we are using, it upset the NIC/driver and caused the tx queue
> to stall for tens of seconds, which is how it got uncovered.
> (On the NIC side, the NIC firmware and driver have been fixed
> to avoid this tx queue stall after seeing this skb).
> 
> On the kernel side, based on the commit log, this bug should have
> been exposed after commit 815d22e55b0e ("ip6ip6: Support for GSO/GRO").

	Before I go to deeply analyze the GSO code, here
are some preliminary observations:

- IPVS started to use iptunnel_handle_offloads() around 2014,
commit ea1d5d7755a3

- later (2016) the "ip6ip6: Support for GSO/GRO" commits started
to use skb_set_inner_ipproto(). But it is missing in the IPVS code.
I'm not sure if such call can help. At least, I don't see what
different happens in IPVS compared to ip6ip6_tnl_xmit(), for example.

> Before commit 815d22e55b0e, ipv6_gso_segment() would just error
> out (-EPROTONOSUPPORT) because the tx-ing packet is an ip6ip6.
> Due to this error out, it avoided passing it to the driver.  The TCP
> stack then timed out and the TCP mtu probing eventually kicked in to
> lower the skb->len enough to avoid gso_segment.
> 
> After commit 815d22e55b0e, ipv6_gso_segment() -> ipv6_gso_segment()
> -> tcp6_gso_segment() which segments the packet based on a mss
> that does not account for the extra IPv6 hdr.
> 
> Here is a stack from the WARN_ON() that we added to the driver to
> capture the issue:
> [ 1128.611875] WARNING: CPU: 40 PID: 31495 at drivers/net/ethernet/mellanox/mlx5/core/en_tx.c:424 mlx5e_xmit+0x814
> ...
> [ 1129.016536] Call Trace:
> [ 1129.021412]  ? skb_release_data+0xfc/0x120
> [ 1129.029587]  ? kfree_skbmem+0x64/0x70
> [ 1129.036905]  dev_hard_start_xmit+0xa4/0x200
> [ 1129.045262]  sch_direct_xmit+0x10f/0x280
> [ 1129.053111]  __qdisc_run+0x223/0x5a0
> [ 1129.060251]  __dev_queue_xmit+0x245/0x7d0
> [ 1129.068268]  dev_queue_xmit+0x10/0x20
> [ 1129.075573]  ? dev_queue_xmit+0x10/0x20
> [ 1129.083218]  ip6_finish_output2+0x2db/0x490
> [ 1129.091573]  ip6_finish_output+0x125/0x190
> [ 1129.099754]  ip6_output+0x5f/0x100
> [ 1129.106548]  ? ip6_fragment+0x9f0/0x9f0
> [ 1129.114212]  ip6_local_out+0x35/0x40
> [ 1129.121356]  ip_vs_tunnel_xmit_v6+0x267/0x290 [ip_vs]
> [ 1129.131443]  ip_vs_in.part.24+0x302/0x710 [ip_vs]
> [ 1129.140837]  ? ip_vs_in.part.24+0x302/0x710 [ip_vs]
> [ 1129.150578]  ? ip_vs_conn_out_get+0x17/0x140 [ip_vs]
> [ 1129.160493]  ? ip_vs_conn_out_get_proto+0x25/0x30 [ip_vs]
> [ 1129.171273]  ip_vs_in+0x43/0x130 [ip_vs]
> [ 1129.179109]  ip_vs_local_request6+0x26/0x30 [ip_vs]
> [ 1129.188849]  nf_hook_slow+0x3e/0xc0
> [ 1129.195800]  ip6_xmit+0x30b/0x540
> [ 1129.202421]  ? ac6_proc_exit+0x20/0x20
> [ 1129.209909]  inet6_csk_xmit+0x82/0xd0
> [ 1129.217207]  ? lock_timer_base+0x76/0xa0
> [ 1129.225043]  tcp_transmit_skb+0x56f/0xa40
> [ 1129.233051]  tcp_write_xmit+0x2b2/0x11b0
> [ 1129.240885]  __tcp_push_pending_frames+0x33/0xa0
> [ 1129.250106]  tcp_push+0xde/0x100
> [ 1129.256554]  tcp_sendmsg_locked+0x9ca/0xca0
> [ 1129.264910]  tcp_sendmsg+0x2c/0x50
> [ 1129.271703]  inet_sendmsg+0x31/0xb0
> [ 1129.278672]  sock_write_iter+0xf8/0x110
> [ 1129.286335]  new_sync_write+0xd9/0x120
> [ 1129.293823]  vfs_write+0x18d/0x1e0
> [ 1129.300614]  SyS_write+0x48/0xa0
> [ 1129.307045]  do_syscall_64+0x69/0x1e0
> [ 1129.314361]  entry_SYSCALL_64_after_hwframe+0x3d/0xa2
> ...
> [ 1129.648183] ---[ end trace 635061c9c300799e ]---
> [ 1129.657407] skb->len:1554 MTU:1522
> 
> The tcp flow is connecting from the address ending ':27:0' to the ':85'.
> 
> [host-a] > ip -6 r show table local
> local 2401:db00:1011:1f01:face:b00c:0:85 dev lo src 2401:db00:1011:10af:face:0:27:0 metric 1024 advmss 1440 pref medium
> 
> [host-a] > ip -6 a
> 1: lo: <LOOPBACK,UP,LOWER_UP> mtu 65536 state UNKNOWN qlen 1000
>     inet6 2401:db00:1011:1f01:face:b00c:0:85/128 scope global
>        valid_lft forever preferred_lft forever
> 2: eth0: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 state UP qlen 1000
>     inet6 2401:db00:1011:10af:face:0:27:0/64 scope global
>        valid_lft forever preferred_lft forever
> 
> [host-a] > cat /proc/net/ip_vs
> TCP  [2401:db00:1011:1f01:face:b00c:0000:0085]:01BB rr
>   -> [2401:db00:1011:10cc:face:0000:0091:0000]:01BB      Tunnel  6772   9          6
>   -> [2401:db00:1011:10d8:face:0000:0091:0000]:01BB      Tunnel  6772   8          6
>   -> [2401:db00:1011:10d2:face:0000:0091:0000]:01BB      Tunnel  6772   19         7
> 
> [host-a] > openssl s_client -connect [2401:db00:1011:1f01:face:b00c:0:85]:443
> send-something-long-here-to-trigger-the-bug
> 
> Changing the local route mtu to 1460 to account for the extra ipv6 tunnel header
> can also side step the issue.  Like this:

	Yes, reducing the MTU was a solution recommended from
long time ago in the IPVS HOWTO.

> 
> > ip -6 r show table local
> local 2401:db00:1011:1f01:face:b00c:0:85 dev lo src 2401:db00:1011:10af:face:0:27:0 metric 1024 mtu 1460 advmss 1440 pref medium
> 
> Signed-off-by: Martin KaFai Lau <kafai@fb.com>
> ---
>  net/netfilter/ipvs/ip_vs_xmit.c | 49 +++++++++++++++++++++++++++--------------
>  1 file changed, 33 insertions(+), 16 deletions(-)
> 
> diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c
> index 11c416f3d6e3..88cc0d53ebce 100644
> --- a/net/netfilter/ipvs/ip_vs_xmit.c
> +++ b/net/netfilter/ipvs/ip_vs_xmit.c
> @@ -212,13 +212,15 @@ static inline void maybe_update_pmtu(int skb_af, struct sk_buff *skb, int mtu)
>  		ort->dst.ops->update_pmtu(&ort->dst, sk, NULL, mtu);
>  }
>  
> -static inline bool ensure_mtu_is_adequate(struct netns_ipvs *ipvs, int skb_af,
> +static inline bool ensure_mtu_is_adequate(struct ip_vs_conn *cp,
>  					  int rt_mode,
>  					  struct ip_vs_iphdr *ipvsh,
>  					  struct sk_buff *skb, int mtu)
>  {
> +	struct netns_ipvs *ipvs = cp->ipvs;
> +
>  #ifdef CONFIG_IP_VS_IPV6
> -	if (skb_af == AF_INET6) {
> +	if (cp->af == AF_INET6) {
>  		struct net *net = ipvs->net;
>  
>  		if (unlikely(__mtu_check_toobig_v6(skb, mtu))) {
> @@ -251,6 +253,17 @@ static inline bool ensure_mtu_is_adequate(struct netns_ipvs *ipvs, int skb_af,
>  		}
>  	}
>  
> +	if (skb_shinfo(skb)->gso_size && cp->protocol == IPPROTO_TCP) {
> +		const struct tcphdr *th = (struct tcphdr *)skb_transport_header(skb);
> +		unsigned short hdr_len = (th->doff << 2) +
> +			skb_network_header_len(skb);
> +
> +		if (mtu > hdr_len && mtu - hdr_len < skb_shinfo(skb)->gso_size)
> +			skb_decrease_gso_size(skb_shinfo(skb),
> +					      skb_shinfo(skb)->gso_size -
> +					      (mtu - hdr_len));

	So, software segmentation happens and we want the
tunnel header to be accounted immediately and not after PMTU
probing period? Is this a problem only for IPVS tunnels?
Do we see such delays with other tunnels? May be this should
be solved for all protocols (not just TCP) and for all tunnels?
Looking at ip6_xmit, on GSO we do not return -EMSGSIZE to
local sender, so we should really alter the gso_size for proper
segmentation?

Regards

^ permalink raw reply

* Re: [pci PATCH v8 0/4] Add support for unmanaged SR-IOV
From: Randy Dunlap @ 2018-04-20 20:47 UTC (permalink / raw)
  To: Alexander Duyck
  Cc: Alexander Duyck, Bjorn Helgaas, linux-pci, virtio-dev, kvm,
	Netdev, Daly, Dan, LKML, linux-nvme, Keith Busch, netanel,
	Don Dutile, Maximilian Heyne, Wang, Liang-min, Rustad, Mark D,
	David Woodhouse, Christoph Hellwig, dwmw
In-Reply-To: <CAKgT0UewuO7LWc8wkHPTv2cksF39-8+5nFfEP_DmXVRQtgOWdg@mail.gmail.com>

On 04/20/18 13:01, Alexander Duyck wrote:
> On Fri, Apr 20, 2018 at 10:23 AM, Randy Dunlap <rdunlap@infradead.org> wrote:
>> On 04/20/18 09:28, Alexander Duyck wrote:
>>> This series is meant to add support for SR-IOV on devices when the VFs are
>>> not managed by the kernel. Examples of recent patches attempting to do this
>>> include:
>>> virtio - https://patchwork.kernel.org/patch/10241225/
>>> pci-stub - https://patchwork.kernel.org/patch/10109935/
>>> vfio - https://patchwork.kernel.org/patch/10103353/
>>> uio - https://patchwork.kernel.org/patch/9974031/
>>
>> Hi,
>>
>> Somewhere in this patch series it would be nice to tell us what the heck
>> a "PF" is.  :)
>>
>> Thanks.
> 
> Sorry, I was kind of operating on the assumption of everyone
> understanding SR-IOV nomenclature.

Yes, I understood that. :)

> A "PF" is a PCIe Physical Function. When you bring up a PCIe device
> that supports SR-IOV it is the device that is there to begin with.
> 
> A "VF" is a PCIe Virtual Function. You could think of as a logical
> device that is spawned from the physical function using a combination
> of hardware configuration via the SR-IOV block in the PCIe extended
> configuration space and kernel/driver features.
> 
> There are also a number of online resources you could use to research
> SR-IOV further. Hope that helps to clarify some of this.
> 
> Thanks.

Thank you.


-- 
~Randy

^ permalink raw reply

* Re: [PATCH] kvmalloc: always use vmalloc if CONFIG_DEBUG_VM
From: Mikulas Patocka @ 2018-04-20 20:54 UTC (permalink / raw)
  To: Michal Hocko
  Cc: David Miller, Andrew Morton, linux-mm, eric.dumazet, edumazet,
	bhutchings, netdev, linux-kernel, mst, jasowang, virtualization,
	dm-devel, Vlastimil Babka
In-Reply-To: <20180420130852.GC16083@dhcp22.suse.cz>

On Fri, 20 Apr 2018, Michal Hocko wrote:

> On Thu 19-04-18 12:12:38, Mikulas Patocka wrote:
> [...]
> > From: Mikulas Patocka <mpatocka@redhat.com>
> > Subject: [PATCH] kvmalloc: always use vmalloc if CONFIG_DEBUG_VM
> > 
> > The kvmalloc function tries to use kmalloc and falls back to vmalloc if
> > kmalloc fails.
> > 
> > Unfortunatelly, some kernel code has bugs - it uses kvmalloc and then
> > uses DMA-API on the returned memory or frees it with kfree. Such bugs were
> > found in the virtio-net driver, dm-integrity or RHEL7 powerpc-specific
> > code.
> > 
> > These bugs are hard to reproduce because vmalloc falls back to kmalloc
> > only if memory is fragmented.
> > 
> > In order to detect these bugs reliably I submit this patch that changes
> > kvmalloc to always use vmalloc if CONFIG_DEBUG_VM is turned on.
> 
> No way. This is just wrong! First of all, you will explode most likely
> on many allocations of small sizes. Second, CONFIG_DEBUG_VM tends to be
> enabled quite often.

You're an evil person who doesn't want to fix bugs.

You refused to fix vmalloc(GFP_NOIO) misbehavior a year ago (did you make 
some progress with it since that time?) and you refuse to fix kvmalloc 
misuses.

I tried this patch on text-only virtual machine and /proc/vmallocinfo 
shows 614kB more memory. I tried it on a desktop machine with the chrome 
browser open and /proc/vmallocinfo space is increased by 7MB. So no - this 
won't exhaust memory and kill the machine.

Arguing that this increases memory consumption is as bogus as arguing that 
CONFIG_LOCKDEP increases memory consumption. No one is forcing you to 
enable CONFIG_LOCKDEP and no one is forcing you to enable this kvmalloc 
test too.

Mikulas

^ permalink raw reply

* Re: [PATCH] kvmalloc: always use vmalloc if CONFIG_DEBUG_VM
From: Mikulas Patocka @ 2018-04-20 20:56 UTC (permalink / raw)
  To: Michal Hocko
  Cc: Matthew Wilcox, David Miller, Andrew Morton, linux-mm,
	eric.dumazet, edumazet, bhutchings, netdev, linux-kernel, mst,
	jasowang, virtualization, dm-devel, Vlastimil Babka
In-Reply-To: <20180420134901.GB17484@dhcp22.suse.cz>

On Fri, 20 Apr 2018, Michal Hocko wrote:

> On Fri 20-04-18 06:41:36, Matthew Wilcox wrote:
> > On Fri, Apr 20, 2018 at 03:08:52PM +0200, Michal Hocko wrote:
> > > > In order to detect these bugs reliably I submit this patch that changes
> > > > kvmalloc to always use vmalloc if CONFIG_DEBUG_VM is turned on.
> > > 
> > > No way. This is just wrong! First of all, you will explode most likely
> > > on many allocations of small sizes. Second, CONFIG_DEBUG_VM tends to be
> > > enabled quite often.
> > 
> > I think it'll still suit Mikulas' debugging needs if we always use
> > vmalloc for sizes above PAGE_SIZE?
> 
> Even if that was the case then this doesn't sounds like CONFIG_DEBUG_VM
> material. We do not want a completely different behavior when the config
> 
> -- 
> Michal Hocko
> SUSE Labs

I'm not arguing that it must be turned on exactly by CONFIG_DEBUG_VM. It 
may be turned on by some other option that is enabled in debug kernels 
(CONFIG_DEBUG_SG may be a better option, because you'll get meaningful 
stack traces from DMA API then).

> is enabled. If we really need some better fallback testing coverage
> then the fault injection, as suggested by Vlastimil, sounds much more
> reasonable to me

People who test kernels will install the kernel-debug package, reboot to 
the debug kernel and run their testsuites. They won't turn on magic 
options in debugfs or use some hideous kernel commandline arguments. If 
the kvmalloc test isn't in the debug kernel, then the testing crew won't 
test it - that's it.

Mikulas

^ permalink raw reply

* Re: [PATCH] kvmalloc: always use vmalloc if CONFIG_DEBUG_VM
From: Matthew Wilcox @ 2018-04-20 21:02 UTC (permalink / raw)
  To: Mikulas Patocka
  Cc: Michal Hocko, David Miller, Andrew Morton, linux-mm, eric.dumazet,
	edumazet, bhutchings, netdev, linux-kernel, mst, jasowang,
	virtualization, dm-devel, Vlastimil Babka
In-Reply-To: <alpine.LRH.2.02.1804201635180.25408@file01.intranet.prod.int.rdu2.redhat.com>

On Fri, Apr 20, 2018 at 04:54:53PM -0400, Mikulas Patocka wrote:
> On Fri, 20 Apr 2018, Michal Hocko wrote:
> > No way. This is just wrong! First of all, you will explode most likely
> > on many allocations of small sizes. Second, CONFIG_DEBUG_VM tends to be
> > enabled quite often.
> 
> You're an evil person who doesn't want to fix bugs.

Steady on.  There's no need for that.  Michal isn't evil.  Please
apologise.

> You refused to fix vmalloc(GFP_NOIO) misbehavior a year ago (did you make 
> some progress with it since that time?) and you refuse to fix kvmalloc 
> misuses.

I understand you're frustrated, but this is not the way to get the problems
fixed.

> I tried this patch on text-only virtual machine and /proc/vmallocinfo 
> shows 614kB more memory. I tried it on a desktop machine with the chrome 
> browser open and /proc/vmallocinfo space is increased by 7MB. So no - this 
> won't exhaust memory and kill the machine.

This is good data, thank you for providing it.

> Arguing that this increases memory consumption is as bogus as arguing that 
> CONFIG_LOCKDEP increses memory consumption. No one is forcing you to 
> enable CONFIG_LOCKDEP and no one is forcing you to enable this kvmalloc 
> test too.

I think there's a real problem which is that CONFIG_DEBUG_VM is too broad.
It inserts code in a *lot* of places, some of which is quite expensive.
We would do better to split it into more granular pieces ... although
an explosion of configuration options isn't great either.  Maybe just
CONFIG_DEBUG_VM and CONFIG_DEBUG_VM_EXPENSIVE.

Michal may be wrong, but he's not evil.

^ permalink raw reply

* Re: [PATCH RFC net-next] igb: adjust SYSTIM register using TIMADJ register
From: Richard Cochran @ 2018-04-20 21:12 UTC (permalink / raw)
  To: Kshitiz Gupta; +Cc: jeffrey.t.kirsher, intel-wired-lan, netdev, linux-kernel
In-Reply-To: <1524254196-21705-1-git-send-email-kshitiz.gupta@ni.com>

On Fri, Apr 20, 2018 at 02:56:36PM -0500, Kshitiz Gupta wrote:
> Currently the driver adjusts time by reading the current time and then
> modifying it before writing to SYSTIM register. This can introduce
> inaccuracies in SYSTIM. With a PREEMPT_RT kernel, spinlocks may be
> interrupted, which in the existing implementation may lead to increased
> time between the read and the write.
> 
> Alternatively as per section 7.8.3.2 in the i210 data sheet, this
> operation can be done more accurately by using the TIMADJ registers,
> but this should only be used for adjustments less than one 8th of the
> sync interval. Once this register is written, the software can poll on
> TSICR.TADJ to make sure that adjustment operation is completed.

I doubt the utility of this.  The first jump is typically to correct a
large offset of seconds, minutes, or even months.  After that, the
servo corrects any remaining error.

It would help if you would show us a clearly improved servo response
with this change applied.

Thanks,
Richard

^ permalink raw reply

* Re: [PATCH] kvmalloc: always use vmalloc if CONFIG_DEBUG_VM
From: Mikulas Patocka @ 2018-04-20 21:21 UTC (permalink / raw)
  To: Matthew Wilcox
  Cc: Michal Hocko, David Miller, Andrew Morton, linux-mm, eric.dumazet,
	edumazet, bhutchings, netdev, linux-kernel, mst, jasowang,
	virtualization, dm-devel, Vlastimil Babka
In-Reply-To: <20180420210200.GH10788@bombadil.infradead.org>



On Fri, 20 Apr 2018, Matthew Wilcox wrote:

> On Fri, Apr 20, 2018 at 04:54:53PM -0400, Mikulas Patocka wrote:
> > On Fri, 20 Apr 2018, Michal Hocko wrote:
> > > No way. This is just wrong! First of all, you will explode most likely
> > > on many allocations of small sizes. Second, CONFIG_DEBUG_VM tends to be
> > > enabled quite often.
> > 
> > You're an evil person who doesn't want to fix bugs.
> 
> Steady on.  There's no need for that.  Michal isn't evil.  Please
> apologise.

I see this attitude from Michal again and again.

He didn't want to fix vmalloc(GFP_NOIO), he didn't want to fix alloc_pages 
sleeping when __GFP_NORETRY is used. So what should I say? Fix them and 
you won't be evil :-)

(he could also fix the oom killer, so that it is triggered when 
free_memory+cache+free_swap goes beyond a threshold and not when you loop 
too long in the allocator)

> > You refused to fix vmalloc(GFP_NOIO) misbehavior a year ago (did you make 
> > some progress with it since that time?) and you refuse to fix kvmalloc 
> > misuses.
> 
> I understand you're frustrated, but this is not the way to get the problems
> fixed.
> 
> > I tried this patch on text-only virtual machine and /proc/vmallocinfo 
> > shows 614kB more memory. I tried it on a desktop machine with the chrome 
> > browser open and /proc/vmallocinfo space is increased by 7MB. So no - this 
> > won't exhaust memory and kill the machine.
> 
> This is good data, thank you for providing it.
> 
> > Arguing that this increases memory consumption is as bogus as arguing that 
> > CONFIG_LOCKDEP increses memory consumption. No one is forcing you to 
> > enable CONFIG_LOCKDEP and no one is forcing you to enable this kvmalloc 
> > test too.
> 
> I think there's a real problem which is that CONFIG_DEBUG_VM is too broad.
> It inserts code in a *lot* of places, some of which is quite expensive.
> We would do better to split it into more granular pieces ... although
> an explosion of configuration options isn't great either.  Maybe just
> CONFIG_DEBUG_VM and CONFIG_DEBUG_VM_EXPENSIVE.
> 
> Michal may be wrong, but he's not evil.

I already said that we can change it from CONFIG_DEBUG_VM to 
CONFIG_DEBUG_SG - or to whatever other option you may want, just to make 
sure that it is enabled in distro debug kernels by default.

Mikulas

^ permalink raw reply

* Re: [PATCH RFC net-next 00/11] udp gso
From: Willem de Bruijn @ 2018-04-20 21:58 UTC (permalink / raw)
  To: Alexander Duyck
  Cc: David Miller, Sowmini Varadhan, Samudrala, Sridhar, Netdev,
	Willem de Bruijn
In-Reply-To: <CAKgT0UcaapGYAwdQ7udr0qN=r+8d_ii6MZw2uQy6Ct+tMtvfpA@mail.gmail.com>

>>> Also any plans for HW offload support for this? I vaguely recall that
>>> the igb and ixgbe parts had support for something like this in
>>> hardware. I would have to double check to see what exactly is
>>> supported.
>>
>> I hadn't given that much thought until the request yesterday to
>> expose the NETIF_F_GSO_UDP_L4 flag through ethtool. By
>> virtue of having only a single fixed segmentation length, it
>> appears reasonably straightforward to offload.
>
> Actually I just got a chance to start on a review of things. Do we
> need to have to use both GSO_UDP and and GSO_UDP_L4? It might be
> better if we could split these up and specifically call out GSO_UDP as
> UFO and GSO_UDP_L4 as being UDP segmentation.

Thanks for taking a look, Alex.

Agreed, I'll revise that. My initial thought was that both gso skbs need
to take the same udp gso special case branches in places like act_csum
and ovs. But on rereading that seems an unsafe approach, as some
branches are fragmentation specific. I'll review them all and add
separate SKB_GSO_UDP_L4 cases where needed, instead.

^ permalink raw reply

* Re: [PATCH net-next] net: phy: mdio-boardinfo: Allow recursive mdiobus_register()
From: Andrew Lunn @ 2018-04-20 21:59 UTC (permalink / raw)
  To: David Miller; +Cc: netdev, f.fainelli, vivien.didelot
In-Reply-To: <20180420.103430.1692883931904134627.davem@davemloft.net>

On Fri, Apr 20, 2018 at 10:34:30AM -0400, David Miller wrote:
> From: Andrew Lunn <andrew@lunn.ch>
> Date: Thu, 19 Apr 2018 02:00:47 +0200
> 
> > mdiobus_register will search for any mdiobus board info registered for
> > the bus being registered. If found, it will probe devices on the bus.
> > That device, if for example it is an ethernet switch, may then try to
> > register an mdio bus. Thus we need to allow recursive calls to
> > mdiobus_register.
> > 
> > Holding the mdio_board_lock will cause a deadlock during this
> > recursion. Release the lock and use list_for_each_entry_safe.
> > 
> > Signed-off-by: Andrew Lunn <andrew@lunn.ch>
> 
> Applied.
> 
> While looking over this code I see that we currently never unregister
> mdio boardinfo objects.
> 
> If we have drivers that can be unloaded, as it seems the one you plan
> to add that needs this change should be, the situation could get more
> tricky here.

Hi David

That is in fact normal, if you look at the i2c and spi versions of
this. This is/was generally used for ARM platforms, from before the
times of DT. There was a board file setting up platform devices for
the different bits of the hardware. The hardware never goes away, so
there is no need to remove the description of the hardware, what
devices are on which bus, etc. The drivers for that hardware can
however be removed.

The platform i'm working on is however x86. So i don't have a board
file as such, just a driver which gets loaded because of matches with
ACPI DMI strings. It populates this mdio boardinfo, as well as i2c
boardinfo, causing other drivers to be loaded. And i don't implement a
remove function, so the driver can never be unloaded. I'm happy with
that.

	Andrew

^ permalink raw reply

* Re: Q: force netif ON even when there is no real link ?
From: Andrew Lunn @ 2018-04-20 22:04 UTC (permalink / raw)
  To: Ran Shalit; +Cc: netdev
In-Reply-To: <CAJ2oMhLPKfzi=-ZvXiHbG8rcEZc39nmU3qocvZc6NrmHwHQadg@mail.gmail.com>

> By saying "mac driver", I mean Ethernet driver with fixed phy.

The fixed-phy implements the usual PHY API. So the MAC driver just
sees a PHY which has a fixed speed, and is up.

     Andrew

^ permalink raw reply

* Re: [pci PATCH v8 1/4] pci: Add pci_sriov_configure_simple for PFs that don't manage VF resources
From: Gregory Rose @ 2018-04-20 22:06 UTC (permalink / raw)
  To: Alexander Duyck, bhelgaas, linux-pci
  Cc: virtio-dev, kvm, netdev, dan.daly, linux-kernel, linux-nvme,
	keith.busch, netanel, ddutile, mheyne, liang-min.wang,
	mark.d.rustad, dwmw2, hch, dwmw
In-Reply-To: <20180420162814.46077.11939.stgit@ahduyck-green-test.jf.intel.com>


On 4/20/2018 9:28 AM, Alexander Duyck wrote:
> This patch adds a common configuration function called
> pci_sriov_configure_simple that will allow for managing VFs on devices
> where the PF is not capable of managing VF resources.
>
> Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
> Tested-by: Mark Rustad <mark.d.rustad@intel.com>
> ---
>
> v5: New patch replacing pci_sriov_configure_unmanaged with
>        pci_sriov_configure_simple
>      Dropped bits related to autoprobe changes
> v6: Defined pci_sriov_configure_simple as NULL if IOV is disabled
> v7: Updated pci_sriov_configure_simple to drop need for err value
>      Fixed comment explaining why pci_sriov_configure_simple is NULL
>
>   drivers/pci/iov.c   |   31 +++++++++++++++++++++++++++++++
>   include/linux/pci.h |    3 +++
>   2 files changed, 34 insertions(+)
>
> diff --git a/drivers/pci/iov.c b/drivers/pci/iov.c
> index 677924a..3e0a7fd 100644
> --- a/drivers/pci/iov.c
> +++ b/drivers/pci/iov.c
> @@ -807,3 +807,34 @@ int pci_sriov_get_totalvfs(struct pci_dev *dev)
>   	return dev->sriov->total_VFs;
>   }
>   EXPORT_SYMBOL_GPL(pci_sriov_get_totalvfs);
> +
> +/**
> + * pci_sriov_configure_simple - helper to configure unmanaged SR-IOV
> + * @dev: the PCI device
> + * @nr_virtfn: number of virtual functions to enable, 0 to disable
> + *
> + * Used to provide generic enable/disable SR-IOV option for devices
> + * that do not manage the VFs generated by their driver. Return value
> + * is negative on error, or number of VFs allocated on success.
> + */
> +int pci_sriov_configure_simple(struct pci_dev *dev, int nr_virtfn)
> +{
> +	might_sleep();
> +
> +	if (!dev->is_physfn)
> +		return -ENODEV;
> +
> +	if (pci_vfs_assigned(dev)) {
> +		pci_warn(dev,
> +			 "Cannot modify SR-IOV while VFs are assigned\n");
> +		return -EPERM;
> +	}
> +
> +	if (!nr_virtfn) {
> +		sriov_disable(dev);
> +		return 0;
> +	}
> +
> +	return sriov_enable(dev, nr_virtfn) ? : nr_virtfn;
> +}
> +EXPORT_SYMBOL_GPL(pci_sriov_configure_simple);
> diff --git a/include/linux/pci.h b/include/linux/pci.h
> index ae42289..7d36e39 100644
> --- a/include/linux/pci.h
> +++ b/include/linux/pci.h
> @@ -1955,6 +1955,7 @@ static inline void pci_mmcfg_late_init(void) { }
>   int pci_vfs_assigned(struct pci_dev *dev);
>   int pci_sriov_set_totalvfs(struct pci_dev *dev, u16 numvfs);
>   int pci_sriov_get_totalvfs(struct pci_dev *dev);
> +int pci_sriov_configure_simple(struct pci_dev *dev, int nr_virtfn);
>   resource_size_t pci_iov_resource_size(struct pci_dev *dev, int resno);
>   void pci_vf_drivers_autoprobe(struct pci_dev *dev, bool probe);
>   #else
> @@ -1982,6 +1983,8 @@ static inline int pci_sriov_set_totalvfs(struct pci_dev *dev, u16 numvfs)
>   { return 0; }
>   static inline int pci_sriov_get_totalvfs(struct pci_dev *dev)
>   { return 0; }
> +/* this is expected to be used as a function pointer, just define as NULL */
> +#define pci_sriov_configure_simple NULL
>   static inline resource_size_t pci_iov_resource_size(struct pci_dev *dev, int resno)
>   { return 0; }
>   static inline void pci_vf_drivers_autoprobe(struct pci_dev *dev, bool probe) { }
>

Thanks Alex!

Reviewed-by: Greg Rose <gvrose8192@gmail.com>

^ permalink raw reply

* Re: [pci PATCH v8 4/4] pci-pf-stub: Add PF driver stub for PFs that function only to enable VFs
From: Gregory Rose @ 2018-04-20 22:08 UTC (permalink / raw)
  To: Alexander Duyck, bhelgaas, linux-pci
  Cc: virtio-dev, kvm, netdev, dan.daly, linux-kernel, linux-nvme,
	keith.busch, netanel, ddutile, mheyne, liang-min.wang,
	mark.d.rustad, dwmw2, hch, dwmw
In-Reply-To: <20180420163109.46077.60334.stgit@ahduyck-green-test.jf.intel.com>

On 4/20/2018 9:31 AM, Alexander Duyck wrote:
> Add a new driver called "pci-pf-stub" to act as a "white-list" for PF
> devices that provide no other functionality other then acting as a means of
> allocating a set of VFs. For now I only have one example ID provided by
> Amazon in terms of devices that require this functionality. The general
> idea is that in the future we will see other devices added as vendors come
> up with devices where the PF is more or less just a lightweight shim used
> to allocate VFs.
>
> Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
> ---
>
> v6: New driver to address concerns about Amazon devices left unsupported
> v7: Dropped pci_id table explanation from pci-pf-stub driver
>
>   drivers/pci/Kconfig       |   12 ++++++++++
>   drivers/pci/Makefile      |    2 ++
>   drivers/pci/pci-pf-stub.c |   54 +++++++++++++++++++++++++++++++++++++++++++++
>   include/linux/pci_ids.h   |    2 ++
>   4 files changed, 70 insertions(+)
>   create mode 100644 drivers/pci/pci-pf-stub.c
>
> diff --git a/drivers/pci/Kconfig b/drivers/pci/Kconfig
> index 34b56a8..cdef2a2 100644
> --- a/drivers/pci/Kconfig
> +++ b/drivers/pci/Kconfig
> @@ -71,6 +71,18 @@ config PCI_STUB
>   
>   	  When in doubt, say N.
>   
> +config PCI_PF_STUB
> +	tristate "PCI PF Stub driver"
> +	depends on PCI
> +	depends on PCI_IOV
> +	help
> +	  Say Y or M here if you want to enable support for devices that
> +	  require SR-IOV support, while at the same time the PF itself is
> +	  not providing any actual services on the host itself such as
> +	  storage or networking.
> +
> +	  When in doubt, say N.
> +
>   config XEN_PCIDEV_FRONTEND
>           tristate "Xen PCI Frontend"
>           depends on PCI && X86 && XEN
> diff --git a/drivers/pci/Makefile b/drivers/pci/Makefile
> index 9419709..4e133d3 100644
> --- a/drivers/pci/Makefile
> +++ b/drivers/pci/Makefile
> @@ -43,6 +43,8 @@ obj-$(CONFIG_PCI_SYSCALL) += syscall.o
>   
>   obj-$(CONFIG_PCI_STUB) += pci-stub.o
>   
> +obj-$(CONFIG_PCI_PF_STUB) += pci-pf-stub.o
> +
>   obj-$(CONFIG_PCI_ECAM) += ecam.o
>   
>   obj-$(CONFIG_XEN_PCIDEV_FRONTEND) += xen-pcifront.o
> diff --git a/drivers/pci/pci-pf-stub.c b/drivers/pci/pci-pf-stub.c
> new file mode 100644
> index 0000000..9d5fdf2
> --- /dev/null
> +++ b/drivers/pci/pci-pf-stub.c
> @@ -0,0 +1,54 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/* pci-pf-stub - simple stub driver for PCI SR-IOV PF device
> + *
> + * This driver is meant to act as a "white-list" for devices that provde
> + * SR-IOV functionality while at the same time not actually needing a
> + * driver of their own.
> + */
> +
> +#include <linux/module.h>
> +#include <linux/pci.h>
> +
> +/**
> + * pci_pf_stub_white_list - White list of devices to bind pci-pf-stub onto
> + *
> + * This table provides the list of IDs this driver is supposed to bind
> + * onto. You could think of this as a list of "quirked" devices where we
> + * are adding support for SR-IOV here since there are no other drivers
> + * that they would be running under.
> + */
> +static const struct pci_device_id pci_pf_stub_white_list[] = {
> +	{ PCI_VDEVICE(AMAZON, 0x0053) },
> +	/* required last entry */
> +	{ 0 }
> +};
> +MODULE_DEVICE_TABLE(pci, pci_pf_stub_white_list);
> +
> +static int pci_pf_stub_probe(struct pci_dev *dev,
> +			     const struct pci_device_id *id)
> +{
> +	pci_info(dev, "claimed by pci-pf-stub\n");
> +	return 0;
> +}
> +
> +static struct pci_driver pf_stub_driver = {
> +	.name			= "pci-pf-stub",
> +	.id_table		= pci_pf_stub_white_list,
> +	.probe			= pci_pf_stub_probe,
> +	.sriov_configure	= pci_sriov_configure_simple,
> +};
> +
> +static int __init pci_pf_stub_init(void)
> +{
> +	return pci_register_driver(&pf_stub_driver);
> +}
> +
> +static void __exit pci_pf_stub_exit(void)
> +{
> +	pci_unregister_driver(&pf_stub_driver);
> +}
> +
> +module_init(pci_pf_stub_init);
> +module_exit(pci_pf_stub_exit);
> +
> +MODULE_LICENSE("GPL");
> diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h
> index a637a7d..62dab14 100644
> --- a/include/linux/pci_ids.h
> +++ b/include/linux/pci_ids.h
> @@ -2549,6 +2549,8 @@
>   #define PCI_VENDOR_ID_CIRCUITCO		0x1cc8
>   #define PCI_SUBSYSTEM_ID_CIRCUITCO_MINNOWBOARD	0x0001
>   
> +#define PCI_VENDOR_ID_AMAZON		0x1d0f
> +
>   #define PCI_VENDOR_ID_TEKRAM		0x1de1
>   #define PCI_DEVICE_ID_TEKRAM_DC290	0xdc29
>   
>

LGTM

Reviewed-by: Greg Rose <gvrose8192@gmail.com>

^ permalink raw reply

* Re: [pci PATCH v8 2/4] ena: Migrate over to unmanaged SR-IOV support
From: Gregory Rose @ 2018-04-20 22:10 UTC (permalink / raw)
  To: Alexander Duyck, bhelgaas, linux-pci
  Cc: virtio-dev, kvm, netdev, dan.daly, linux-kernel, linux-nvme,
	keith.busch, netanel, ddutile, mheyne, liang-min.wang,
	mark.d.rustad, dwmw2, hch, dwmw
In-Reply-To: <20180420162841.46077.17967.stgit@ahduyck-green-test.jf.intel.com>

On 4/20/2018 9:30 AM, Alexander Duyck wrote:
> Instead of implementing our own version of a SR-IOV configuration stub in
> the ena driver we can just reuse the existing
> pci_sriov_configure_simple function.
>
> Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
> ---
>
> v5: Replaced call to pci_sriov_configure_unmanaged with
>          pci_sriov_configure_simple
> v6: Dropped "#ifdef" checks for IOV wrapping sriov_configure definition
> v7: No change
>
>   drivers/net/ethernet/amazon/ena/ena_netdev.c |   28 +-------------------------
>   1 file changed, 1 insertion(+), 27 deletions(-)
>
> diff --git a/drivers/net/ethernet/amazon/ena/ena_netdev.c b/drivers/net/ethernet/amazon/ena/ena_netdev.c
> index a822e70..f2af87d 100644
> --- a/drivers/net/ethernet/amazon/ena/ena_netdev.c
> +++ b/drivers/net/ethernet/amazon/ena/ena_netdev.c
> @@ -3386,32 +3386,6 @@ static int ena_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
>   }
>   
>   /*****************************************************************************/
> -static int ena_sriov_configure(struct pci_dev *dev, int numvfs)
> -{
> -	int rc;
> -
> -	if (numvfs > 0) {
> -		rc = pci_enable_sriov(dev, numvfs);
> -		if (rc != 0) {
> -			dev_err(&dev->dev,
> -				"pci_enable_sriov failed to enable: %d vfs with the error: %d\n",
> -				numvfs, rc);
> -			return rc;
> -		}
> -
> -		return numvfs;
> -	}
> -
> -	if (numvfs == 0) {
> -		pci_disable_sriov(dev);
> -		return 0;
> -	}
> -
> -	return -EINVAL;
> -}
> -
> -/*****************************************************************************/
> -/*****************************************************************************/
>   
>   /* ena_remove - Device Removal Routine
>    * @pdev: PCI device information struct
> @@ -3526,7 +3500,7 @@ static int ena_resume(struct pci_dev *pdev)
>   	.suspend    = ena_suspend,
>   	.resume     = ena_resume,
>   #endif
> -	.sriov_configure = ena_sriov_configure,
> +	.sriov_configure = pci_sriov_configure_simple,
>   };
>   
>   static int __init ena_init(void)
>

Reviewed-by: Greg Rose <gvrose8192@gmail.com>

^ permalink raw reply

* [PATCH bpf-next v3 1/9] bpf: change prototype for stack_map_get_build_id_offset
From: Yonghong Song @ 2018-04-20 22:18 UTC (permalink / raw)
  To: ast, daniel, netdev; +Cc: kernel-team
In-Reply-To: <20180420221842.742330-1-yhs@fb.com>

This patch does not introduce any functional change. The function
prototype is changed so that the same function can be reused later.

Signed-off-by: Yonghong Song <yhs@fb.com>
---
 kernel/bpf/stackmap.c | 13 +++++--------
 1 file changed, 5 insertions(+), 8 deletions(-)

diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index 57eeb12..04f6ec1 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -262,16 +262,11 @@ static int stack_map_get_build_id(struct vm_area_struct *vma,
 	return ret;
 }
 
-static void stack_map_get_build_id_offset(struct bpf_map *map,
-					  struct stack_map_bucket *bucket,
+static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs,
 					  u64 *ips, u32 trace_nr, bool user)
 {
 	int i;
 	struct vm_area_struct *vma;
-	struct bpf_stack_build_id *id_offs;
-
-	bucket->nr = trace_nr;
-	id_offs = (struct bpf_stack_build_id *)bucket->data;
 
 	/*
 	 * We cannot do up_read() in nmi context, so build_id lookup is
@@ -361,8 +356,10 @@ BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map,
 			pcpu_freelist_pop(&smap->freelist);
 		if (unlikely(!new_bucket))
 			return -ENOMEM;
-		stack_map_get_build_id_offset(map, new_bucket, ips,
-					      trace_nr, user);
+		new_bucket->nr = trace_nr;
+		stack_map_get_build_id_offset(
+			(struct bpf_stack_build_id *)new_bucket->data,
+			ips, trace_nr, user);
 		trace_len = trace_nr * sizeof(struct bpf_stack_build_id);
 		if (hash_matches && bucket->nr == trace_nr &&
 		    memcmp(bucket->data, new_bucket->data, trace_len) == 0) {
-- 
2.9.5

^ permalink raw reply related

* [PATCH bpf-next v3 7/9] tools/bpf: add a verifier test case for bpf_get_stack helper and ARSH
From: Yonghong Song @ 2018-04-20 22:18 UTC (permalink / raw)
  To: ast, daniel, netdev; +Cc: kernel-team
In-Reply-To: <20180420221842.742330-1-yhs@fb.com>

The test_verifier already has a few ARSH test cases.
This patch adds a new test case which takes advantage of newly
improved verifier behavior for bpf_get_stack and ARSH.

Signed-off-by: Yonghong Song <yhs@fb.com>
---
 tools/testing/selftests/bpf/test_verifier.c | 45 +++++++++++++++++++++++++++++
 1 file changed, 45 insertions(+)

diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c
index 3e7718b..cd595ba 100644
--- a/tools/testing/selftests/bpf/test_verifier.c
+++ b/tools/testing/selftests/bpf/test_verifier.c
@@ -11423,6 +11423,51 @@ static struct bpf_test tests[] = {
 		.errstr = "BPF_XADD stores into R2 packet",
 		.prog_type = BPF_PROG_TYPE_XDP,
 	},
+	{
+		"bpf_get_stack return R0 within range",
+		.insns = {
+			BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),
+			BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
+			BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+			BPF_LD_MAP_FD(BPF_REG_1, 0),
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0,
+				     BPF_FUNC_map_lookup_elem),
+			BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 28),
+			BPF_MOV64_REG(BPF_REG_7, BPF_REG_0),
+			BPF_MOV64_IMM(BPF_REG_9, sizeof(struct test_val)),
+			BPF_MOV64_REG(BPF_REG_1, BPF_REG_6),
+			BPF_MOV64_REG(BPF_REG_2, BPF_REG_7),
+			BPF_MOV64_IMM(BPF_REG_3, sizeof(struct test_val)),
+			BPF_MOV64_IMM(BPF_REG_4, 256),
+			BPF_EMIT_CALL(BPF_FUNC_get_stack),
+			BPF_MOV64_IMM(BPF_REG_1, 0),
+			BPF_MOV64_REG(BPF_REG_8, BPF_REG_0),
+			BPF_ALU64_IMM(BPF_LSH, BPF_REG_8, 32),
+			BPF_ALU64_IMM(BPF_ARSH, BPF_REG_8, 32),
+			BPF_JMP_REG(BPF_JSLT, BPF_REG_1, BPF_REG_8, 16),
+			BPF_ALU64_REG(BPF_SUB, BPF_REG_9, BPF_REG_8),
+			BPF_MOV64_REG(BPF_REG_2, BPF_REG_7),
+			BPF_ALU64_REG(BPF_ADD, BPF_REG_2, BPF_REG_8),
+			BPF_MOV64_REG(BPF_REG_1, BPF_REG_9),
+			BPF_ALU64_IMM(BPF_LSH, BPF_REG_1, 32),
+			BPF_ALU64_IMM(BPF_ARSH, BPF_REG_1, 32),
+			BPF_MOV64_REG(BPF_REG_3, BPF_REG_2),
+			BPF_ALU64_REG(BPF_ADD, BPF_REG_3, BPF_REG_1),
+			BPF_MOV64_REG(BPF_REG_1, BPF_REG_7),
+			BPF_MOV64_IMM(BPF_REG_5, sizeof(struct test_val)),
+			BPF_ALU64_REG(BPF_ADD, BPF_REG_1, BPF_REG_5),
+			BPF_JMP_REG(BPF_JGE, BPF_REG_3, BPF_REG_1, 4),
+			BPF_MOV64_REG(BPF_REG_1, BPF_REG_6),
+			BPF_MOV64_REG(BPF_REG_3, BPF_REG_9),
+			BPF_MOV64_IMM(BPF_REG_4, 0),
+			BPF_EMIT_CALL(BPF_FUNC_get_stack),
+			BPF_EXIT_INSN(),
+		},
+		.fixup_map2 = { 4 },
+		.result = ACCEPT,
+		.prog_type = BPF_PROG_TYPE_TRACEPOINT,
+	},
 };
 
 static int probe_filter_length(const struct bpf_insn *fp)
-- 
2.9.5

^ permalink raw reply related

* [PATCH bpf-next v3 3/9] bpf/verifier: refine retval R0 state for bpf_get_stack helper
From: Yonghong Song @ 2018-04-20 22:18 UTC (permalink / raw)
  To: ast, daniel, netdev; +Cc: kernel-team
In-Reply-To: <20180420221842.742330-1-yhs@fb.com>

The special property of the return values of the helpers bpf_get_stack
and bpf_probe_read_str is now captured in the verifier.
Both helpers return a negative error code or
a length, which is equal to or smaller than the buffer
size argument. With this additional information, the
verifier makes a condition such as "retval > bufsize"
unnecessary in the bpf program. For example, for the code below,
    usize = bpf_get_stack(ctx, raw_data, max_len, BPF_F_USER_STACK);
    if (usize < 0 || usize > max_len)
        return 0;
The verifier may have the following errors:
    52: (85) call bpf_get_stack#65
     R0=map_value(id=0,off=0,ks=4,vs=1600,imm=0) R1_w=ctx(id=0,off=0,imm=0)
     R2_w=map_value(id=0,off=0,ks=4,vs=1600,imm=0) R3_w=inv800 R4_w=inv256
     R6=ctx(id=0,off=0,imm=0) R7=map_value(id=0,off=0,ks=4,vs=1600,imm=0)
     R9_w=inv800 R10=fp0,call_-1
    53: (bf) r8 = r0
    54: (bf) r1 = r8
    55: (67) r1 <<= 32
    56: (bf) r2 = r1
    57: (77) r2 >>= 32
    58: (25) if r2 > 0x31f goto pc+33
     R0=inv(id=0) R1=inv(id=0,smax_value=9223372032559808512,
                         umax_value=18446744069414584320,
                         var_off=(0x0; 0xffffffff00000000))
     R2=inv(id=0,umax_value=799,var_off=(0x0; 0x3ff))
     R6=ctx(id=0,off=0,imm=0) R7=map_value(id=0,off=0,ks=4,vs=1600,imm=0)
     R8=inv(id=0) R9=inv800 R10=fp0,call_-1
    59: (1f) r9 -= r8
    60: (c7) r1 s>>= 32
    61: (bf) r2 = r7
    62: (0f) r2 += r1
    math between map_value pointer and register with unbounded
    min value is not allowed
The failure is due to llvm compiler optimization where register "r2",
which is a copy of "r1", is tested for condition while later on "r1"
is used for map_ptr operation. The verifier is not able to track such
instruction sequences effectively.

Without the "usize > max_len" condition, there is no llvm optimization
and the generated code below passes the verifier:
    52: (85) call bpf_get_stack#65
     R0=map_value(id=0,off=0,ks=4,vs=1600,imm=0) R1_w=ctx(id=0,off=0,imm=0)
     R2_w=map_value(id=0,off=0,ks=4,vs=1600,imm=0) R3_w=inv800 R4_w=inv256
     R6=ctx(id=0,off=0,imm=0) R7=map_value(id=0,off=0,ks=4,vs=1600,imm=0)
     R9_w=inv800 R10=fp0,call_-1
    53: (b7) r1 = 0
    54: (bf) r8 = r0
    55: (67) r8 <<= 32
    56: (c7) r8 s>>= 32
    57: (6d) if r1 s> r8 goto pc+24
     R0=inv(id=0,umax_value=800) R1=inv0 R6=ctx(id=0,off=0,imm=0)
     R7=map_value(id=0,off=0,ks=4,vs=1600,imm=0)
     R8=inv(id=0,umax_value=800,var_off=(0x0; 0x3ff)) R9=inv800
     R10=fp0,call_-1
    58: (bf) r2 = r7
    59: (0f) r2 += r8
    60: (1f) r9 -= r8
    61: (bf) r1 = r6

Signed-off-by: Yonghong Song <yhs@fb.com>
---
 kernel/bpf/verifier.c | 27 +++++++++++++++++++++++++++
 1 file changed, 27 insertions(+)

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index aba9425..3c8bb92 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -164,6 +164,8 @@ struct bpf_call_arg_meta {
 	bool pkt_access;
 	int regno;
 	int access_size;
+	s64 msize_smax_value;
+	u64 msize_umax_value;
 };
 
 static DEFINE_MUTEX(bpf_verifier_lock);
@@ -2027,6 +2029,14 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
 		err = check_helper_mem_access(env, regno - 1,
 					      reg->umax_value,
 					      zero_size_allowed, meta);
+
+		if (!err && !!meta) {
+			/* remember the mem_size which may be used later
+			 * to refine return values.
+			 */
+			meta->msize_smax_value = reg->smax_value;
+			meta->msize_umax_value = reg->umax_value;
+		}
 	}
 
 	return err;
@@ -2333,6 +2343,21 @@ static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx)
 	return 0;
 }
 
+static void do_refine_retval_range(struct bpf_reg_state *regs, int ret_type,
+				   int func_id,
+				   struct bpf_call_arg_meta *meta)
+{
+	struct bpf_reg_state *ret_reg = &regs[BPF_REG_0];
+
+	if (ret_type != RET_INTEGER ||
+	    (func_id != BPF_FUNC_get_stack &&
+	     func_id != BPF_FUNC_probe_read_str))
+		return;
+
+	ret_reg->smax_value = meta->msize_smax_value;
+	ret_reg->umax_value = meta->msize_umax_value;
+}
+
 static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn_idx)
 {
 	const struct bpf_func_proto *fn = NULL;
@@ -2456,6 +2481,8 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn
 		return -EINVAL;
 	}
 
+	do_refine_retval_range(regs, fn->ret_type, func_id, &meta);
+
 	err = check_map_func_compatibility(env, meta.map_ptr, func_id);
 	if (err)
 		return err;
-- 
2.9.5

^ permalink raw reply related

* [PATCH bpf-next v3 9/9] tools/bpf: add a test for bpf_get_stack with tracepoint prog
From: Yonghong Song @ 2018-04-20 22:18 UTC (permalink / raw)
  To: ast, daniel, netdev; +Cc: kernel-team
In-Reply-To: <20180420221842.742330-1-yhs@fb.com>

The test_stacktrace_map and test_stacktrace_build_id tests are
enhanced to call bpf_get_stack from their bpf programs to get the
stack trace as well.  The stack traces from bpf_get_stack
and bpf_get_stackid are compared to ensure that, for the
same stack (as represented by the same hash), their ip addresses
or build ids are the same.

Signed-off-by: Yonghong Song <yhs@fb.com>
---
 tools/testing/selftests/bpf/test_progs.c           | 63 +++++++++++++++++++---
 .../selftests/bpf/test_stacktrace_build_id.c       | 20 ++++++-
 tools/testing/selftests/bpf/test_stacktrace_map.c  | 20 +++++--
 3 files changed, 92 insertions(+), 11 deletions(-)

diff --git a/tools/testing/selftests/bpf/test_progs.c b/tools/testing/selftests/bpf/test_progs.c
index dad4c3f..06b922a 100644
--- a/tools/testing/selftests/bpf/test_progs.c
+++ b/tools/testing/selftests/bpf/test_progs.c
@@ -897,11 +897,40 @@ static int compare_map_keys(int map1_fd, int map2_fd)
 	return 0;
 }
 
+static int compare_stack_ips(int smap_fd, int amap_fd, int stack_trace_len)
+{
+	__u32 key, next_key, *cur_key_p, *next_key_p;
+	char val_buf1[stack_trace_len], val_buf2[stack_trace_len];
+	int i, err;
+
+	cur_key_p = NULL;
+	next_key_p = &key;
+	while (bpf_map_get_next_key(smap_fd, cur_key_p, next_key_p) == 0) {
+		err = bpf_map_lookup_elem(smap_fd, next_key_p, val_buf1);
+		if (err)
+			return err;
+		err = bpf_map_lookup_elem(amap_fd, next_key_p, val_buf2);
+		if (err)
+			return err;
+		for (i = 0; i < stack_trace_len; i++) {
+			if (val_buf1[i] != val_buf2[i])
+				return -1;
+		}
+		key = *next_key_p;
+		cur_key_p = &key;
+		next_key_p = &next_key;
+	}
+	if (errno != ENOENT)
+		return -1;
+
+	return 0;
+}
+
 static void test_stacktrace_map()
 {
-	int control_map_fd, stackid_hmap_fd, stackmap_fd;
+	int control_map_fd, stackid_hmap_fd, stackmap_fd, stack_amap_fd;
 	const char *file = "./test_stacktrace_map.o";
-	int bytes, efd, err, pmu_fd, prog_fd;
+	int bytes, efd, err, pmu_fd, prog_fd, stack_trace_len;
 	struct perf_event_attr attr = {};
 	__u32 key, val, duration = 0;
 	struct bpf_object *obj;
@@ -957,6 +986,10 @@ static void test_stacktrace_map()
 	if (stackmap_fd < 0)
 		goto disable_pmu;
 
+	stack_amap_fd = bpf_find_map(__func__, obj, "stack_amap");
+	if (stack_amap_fd < 0)
+		goto disable_pmu;
+
 	/* give some time for bpf program run */
 	sleep(1);
 
@@ -978,6 +1011,12 @@ static void test_stacktrace_map()
 		  "err %d errno %d\n", err, errno))
 		goto disable_pmu_noerr;
 
+	stack_trace_len = PERF_MAX_STACK_DEPTH * sizeof(__u64);
+	err = compare_stack_ips(stackmap_fd, stack_amap_fd, stack_trace_len);
+	if (CHECK(err, "compare_stack_ips stackmap vs. stack_amap",
+		  "err %d errno %d\n", err, errno))
+		goto disable_pmu_noerr;
+
 	goto disable_pmu_noerr;
 disable_pmu:
 	error_cnt++;
@@ -1071,9 +1110,9 @@ static int extract_build_id(char *build_id, size_t size)
 
 static void test_stacktrace_build_id(void)
 {
-	int control_map_fd, stackid_hmap_fd, stackmap_fd;
+	int control_map_fd, stackid_hmap_fd, stackmap_fd, stack_amap_fd;
 	const char *file = "./test_stacktrace_build_id.o";
-	int bytes, efd, err, pmu_fd, prog_fd;
+	int bytes, efd, err, pmu_fd, prog_fd, stack_trace_len;
 	struct perf_event_attr attr = {};
 	__u32 key, previous_key, val, duration = 0;
 	struct bpf_object *obj;
@@ -1138,6 +1177,11 @@ static void test_stacktrace_build_id(void)
 		  err, errno))
 		goto disable_pmu;
 
+	stack_amap_fd = bpf_find_map(__func__, obj, "stack_amap");
+	if (CHECK(stack_amap_fd < 0, "bpf_find_map stack_amap",
+		  "err %d errno %d\n", err, errno))
+		goto disable_pmu;
+
 	assert(system("dd if=/dev/urandom of=/dev/zero count=4 2> /dev/null")
 	       == 0);
 	assert(system("./urandom_read if=/dev/urandom of=/dev/zero count=4 2> /dev/null") == 0);
@@ -1189,8 +1233,15 @@ static void test_stacktrace_build_id(void)
 		previous_key = key;
 	} while (bpf_map_get_next_key(stackmap_fd, &previous_key, &key) == 0);
 
-	CHECK(build_id_matches < 1, "build id match",
-	      "Didn't find expected build ID from the map");
+	if (CHECK(build_id_matches < 1, "build id match",
+		  "Didn't find expected build ID from the map"))
+		goto disable_pmu;
+
+	stack_trace_len = PERF_MAX_STACK_DEPTH
+		* sizeof(struct bpf_stack_build_id);
+	err = compare_stack_ips(stackmap_fd, stack_amap_fd, stack_trace_len);
+	CHECK(err, "compare_stack_ips stackmap vs. stack_amap",
+	      "err %d errno %d\n", err, errno);
 
 disable_pmu:
 	ioctl(pmu_fd, PERF_EVENT_IOC_DISABLE);
diff --git a/tools/testing/selftests/bpf/test_stacktrace_build_id.c b/tools/testing/selftests/bpf/test_stacktrace_build_id.c
index b755bd7..d86c281 100644
--- a/tools/testing/selftests/bpf/test_stacktrace_build_id.c
+++ b/tools/testing/selftests/bpf/test_stacktrace_build_id.c
@@ -19,7 +19,7 @@ struct bpf_map_def SEC("maps") stackid_hmap = {
 	.type = BPF_MAP_TYPE_HASH,
 	.key_size = sizeof(__u32),
 	.value_size = sizeof(__u32),
-	.max_entries = 10000,
+	.max_entries = 16384,
 };
 
 struct bpf_map_def SEC("maps") stackmap = {
@@ -31,6 +31,14 @@ struct bpf_map_def SEC("maps") stackmap = {
 	.map_flags = BPF_F_STACK_BUILD_ID,
 };
 
+struct bpf_map_def SEC("maps") stack_amap = {
+	.type = BPF_MAP_TYPE_ARRAY,
+	.key_size = sizeof(__u32),
+	.value_size = sizeof(struct bpf_stack_build_id)
+		* PERF_MAX_STACK_DEPTH,
+	.max_entries = 128,
+};
+
 /* taken from /sys/kernel/debug/tracing/events/random/urandom_read/format */
 struct random_urandom_args {
 	unsigned long long pad;
@@ -42,7 +50,10 @@ struct random_urandom_args {
 SEC("tracepoint/random/urandom_read")
 int oncpu(struct random_urandom_args *args)
 {
+	__u32 max_len = sizeof(struct bpf_stack_build_id)
+			* PERF_MAX_STACK_DEPTH;
 	__u32 key = 0, val = 0, *value_p;
+	void *stack_p;
 
 	value_p = bpf_map_lookup_elem(&control_map, &key);
 	if (value_p && *value_p)
@@ -50,8 +61,13 @@ int oncpu(struct random_urandom_args *args)
 
 	/* The size of stackmap and stackid_hmap should be the same */
 	key = bpf_get_stackid(args, &stackmap, BPF_F_USER_STACK);
-	if ((int)key >= 0)
+	if ((int)key >= 0) {
 		bpf_map_update_elem(&stackid_hmap, &key, &val, 0);
+		stack_p = bpf_map_lookup_elem(&stack_amap, &key);
+		if (stack_p)
+			bpf_get_stack(args, stack_p, max_len,
+				      BPF_F_USER_STACK | BPF_F_USER_BUILD_ID);
+	}
 
 	return 0;
 }
diff --git a/tools/testing/selftests/bpf/test_stacktrace_map.c b/tools/testing/selftests/bpf/test_stacktrace_map.c
index 76d85c5d..f83c7b6 100644
--- a/tools/testing/selftests/bpf/test_stacktrace_map.c
+++ b/tools/testing/selftests/bpf/test_stacktrace_map.c
@@ -19,14 +19,21 @@ struct bpf_map_def SEC("maps") stackid_hmap = {
 	.type = BPF_MAP_TYPE_HASH,
 	.key_size = sizeof(__u32),
 	.value_size = sizeof(__u32),
-	.max_entries = 10000,
+	.max_entries = 16384,
 };
 
 struct bpf_map_def SEC("maps") stackmap = {
 	.type = BPF_MAP_TYPE_STACK_TRACE,
 	.key_size = sizeof(__u32),
 	.value_size = sizeof(__u64) * PERF_MAX_STACK_DEPTH,
-	.max_entries = 10000,
+	.max_entries = 16384,
+};
+
+struct bpf_map_def SEC("maps") stack_amap = {
+	.type = BPF_MAP_TYPE_ARRAY,
+	.key_size = sizeof(__u32),
+	.value_size = sizeof(__u64) * PERF_MAX_STACK_DEPTH,
+	.max_entries = 16384,
 };
 
 /* taken from /sys/kernel/debug/tracing/events/sched/sched_switch/format */
@@ -44,7 +51,10 @@ struct sched_switch_args {
 SEC("tracepoint/sched/sched_switch")
 int oncpu(struct sched_switch_args *ctx)
 {
+	__u32 max_len = PERF_MAX_STACK_DEPTH * sizeof(__u64);
 	__u32 key = 0, val = 0, *value_p;
+	void *stack_p;
+
 
 	value_p = bpf_map_lookup_elem(&control_map, &key);
 	if (value_p && *value_p)
@@ -52,8 +62,12 @@ int oncpu(struct sched_switch_args *ctx)
 
 	/* The size of stackmap and stackid_hmap should be the same */
 	key = bpf_get_stackid(ctx, &stackmap, 0);
-	if ((int)key >= 0)
+	if ((int)key >= 0) {
 		bpf_map_update_elem(&stackid_hmap, &key, &val, 0);
+		stack_p = bpf_map_lookup_elem(&stack_amap, &key);
+		if (stack_p)
+			bpf_get_stack(ctx, stack_p, max_len, 0);
+	}
 
 	return 0;
 }
-- 
2.9.5

^ permalink raw reply related

* [PATCH bpf-next v3 0/9] bpf: add bpf_get_stack helper
From: Yonghong Song @ 2018-04-20 22:18 UTC (permalink / raw)
  To: ast, daniel, netdev; +Cc: kernel-team

Currently, stackmap and bpf_get_stackid helper are provided
for bpf program to get the stack trace. This approach has
a limitation though. If two stack traces have the same hash,
only one will get stored in the stackmap table regardless of
whether BPF_F_REUSE_STACKID is specified or not,
so some stack traces may be missing from user perspective.

This patch set implements a new helper, bpf_get_stack, which
sends stack traces directly to the bpf program. The bpf program
is able to see all stack traces, and then can do in-kernel
processing or send stack traces to user space through
a shared map or bpf_perf_event_output.

Patches #1 and #2 implemented the core kernel support.
Patches #3 and #4 are two verifier improvements to make
bpf programming easier. Patch #5 synced the new helper
to tools headers. Patch #6 moved perf_event polling code
and ksym lookup code from samples/bpf to
tools/testing/selftests/bpf. Patch #7 added a verifier
test in tools/bpf for the new verifier changes.
Patches #8 and #9 added tests for raw tracepoint prog
and tracepoint prog respectively.

Changelogs:
  v2 -> v3:
    . use meta to track helper memory size argument
    . implement range checking for ARSH in verifier
    . move perf event polling and ksym related functions
      from samples/bpf to tools/bpf
    . added test to compare build id's between bpf_get_stackid
      and bpf_get_stack
  v1 -> v2:
    . fix compilation error when CONFIG_PERF_EVENTS is not enabled

Yonghong Song (9):
  bpf: change prototype for stack_map_get_build_id_offset
  bpf: add bpf_get_stack helper
  bpf/verifier: refine retval R0 state for bpf_get_stack helper
  bpf/verifier: improve register value range tracking with ARSH
  tools/bpf: add bpf_get_stack helper to tools headers
  samples/bpf: move common-purpose trace functions to selftests
  tools/bpf: add a verifier test case for bpf_get_stack helper and ARSH
  tools/bpf: add a test for bpf_get_stack with raw tracepoint prog
  tools/bpf: add a test for bpf_get_stack with tracepoint prog

 include/linux/bpf.h                                |   1 +
 include/linux/filter.h                             |   3 +-
 include/uapi/linux/bpf.h                           |  19 ++-
 kernel/bpf/core.c                                  |   5 +
 kernel/bpf/stackmap.c                              |  80 ++++++++-
 kernel/bpf/syscall.c                               |  10 ++
 kernel/bpf/verifier.c                              |  56 +++++++
 kernel/trace/bpf_trace.c                           |  50 +++++-
 samples/bpf/Makefile                               |  11 +-
 samples/bpf/bpf_load.c                             |  63 -------
 samples/bpf/bpf_load.h                             |   7 -
 samples/bpf/offwaketime_user.c                     |   1 +
 samples/bpf/sampleip_user.c                        |   1 +
 samples/bpf/spintest_user.c                        |   1 +
 samples/bpf/trace_event_user.c                     |   1 +
 samples/bpf/trace_output_user.c                    | 125 ++------------
 tools/include/uapi/linux/bpf.h                     |  19 ++-
 tools/testing/selftests/bpf/Makefile               |   3 +-
 tools/testing/selftests/bpf/bpf_helpers.h          |   3 +-
 tools/testing/selftests/bpf/test_get_stack_rawtp.c | 102 +++++++++++
 tools/testing/selftests/bpf/test_progs.c           | 178 +++++++++++++++++++-
 .../selftests/bpf/test_stacktrace_build_id.c       |  20 ++-
 tools/testing/selftests/bpf/test_stacktrace_map.c  |  20 ++-
 tools/testing/selftests/bpf/test_verifier.c        |  45 +++++
 tools/testing/selftests/bpf/trace_helpers.c        | 186 +++++++++++++++++++++
 tools/testing/selftests/bpf/trace_helpers.h        |  24 +++
 26 files changed, 825 insertions(+), 209 deletions(-)
 create mode 100644 tools/testing/selftests/bpf/test_get_stack_rawtp.c
 create mode 100644 tools/testing/selftests/bpf/trace_helpers.c
 create mode 100644 tools/testing/selftests/bpf/trace_helpers.h

-- 
2.9.5

^ permalink raw reply

* [PATCH bpf-next v3 2/9] bpf: add bpf_get_stack helper
From: Yonghong Song @ 2018-04-20 22:18 UTC (permalink / raw)
  To: ast, daniel, netdev; +Cc: kernel-team
In-Reply-To: <20180420221842.742330-1-yhs@fb.com>

Currently, stackmap and the bpf_get_stackid helper are provided
for a bpf program to get the stack trace. This approach has
a limitation though. If two stack traces have the same hash,
only one will get stored in the stackmap table,
so some stack traces are missing from the user's perspective.

This patch implements a new helper, bpf_get_stack, which
sends stack traces directly to the bpf program. The bpf program
is able to see all stack traces, and then can do in-kernel
processing or send stack traces to user space through
a shared map or bpf_perf_event_output.

Acked-by: Alexei Starovoitov <ast@fb.com>
Signed-off-by: Yonghong Song <yhs@fb.com>
---
 include/linux/bpf.h      |  1 +
 include/linux/filter.h   |  3 ++-
 include/uapi/linux/bpf.h | 19 ++++++++++++--
 kernel/bpf/core.c        |  5 ++++
 kernel/bpf/stackmap.c    | 67 ++++++++++++++++++++++++++++++++++++++++++++++++
 kernel/bpf/syscall.c     | 10 ++++++++
 kernel/bpf/verifier.c    |  3 +++
 kernel/trace/bpf_trace.c | 50 +++++++++++++++++++++++++++++++++++-
 8 files changed, 154 insertions(+), 4 deletions(-)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index ee5275e..2c520b4 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -690,6 +690,7 @@ extern const struct bpf_func_proto bpf_get_current_comm_proto;
 extern const struct bpf_func_proto bpf_skb_vlan_push_proto;
 extern const struct bpf_func_proto bpf_skb_vlan_pop_proto;
 extern const struct bpf_func_proto bpf_get_stackid_proto;
+extern const struct bpf_func_proto bpf_get_stack_proto;
 extern const struct bpf_func_proto bpf_sock_map_update_proto;
 
 /* Shared helpers among cBPF and eBPF. */
diff --git a/include/linux/filter.h b/include/linux/filter.h
index 4da8b23..044d30e 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -468,7 +468,8 @@ struct bpf_prog {
 				dst_needed:1,	/* Do we need dst entry? */
 				blinded:1,	/* Was blinded */
 				is_func:1,	/* program is a bpf function */
-				kprobe_override:1; /* Do we override a kprobe? */
+				kprobe_override:1, /* Do we override a kprobe? */
+				need_callchain_buf:1; /* Needs callchain buffer? */
 	enum bpf_prog_type	type;		/* Type of BPF program */
 	enum bpf_attach_type	expected_attach_type; /* For some prog types */
 	u32			len;		/* Number of filter blocks */
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index c8383a2..470f3a2 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -529,6 +529,17 @@ union bpf_attr {
  *             other bits - reserved
  *     Return: >= 0 stackid on success or negative error
  *
+ * int bpf_get_stack(ctx, buf, size, flags)
+ *     walk user or kernel stack and store the ips in buf
+ *     @ctx: struct pt_regs*
+ *     @buf: user buffer to fill stack
+ *     @size: the buf size
+ *     @flags: bits 0-7 - number of stack frames to skip
+ *             bit 8 - collect user stack instead of kernel
+ *             bit 11 - get build-id as well if user stack
+ *             other bits - reserved
+ *     Return: >= 0 size copied on success or negative error
+ *
  * s64 bpf_csum_diff(from, from_size, to, to_size, seed)
  *     calculate csum diff
  *     @from: raw from buffer
@@ -841,7 +852,8 @@ union bpf_attr {
 	FN(msg_cork_bytes),		\
 	FN(msg_pull_data),		\
 	FN(bind),			\
-	FN(xdp_adjust_tail),
+	FN(xdp_adjust_tail),		\
+	FN(get_stack),
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
@@ -875,11 +887,14 @@ enum bpf_func_id {
 /* BPF_FUNC_skb_set_tunnel_key and BPF_FUNC_skb_get_tunnel_key flags. */
 #define BPF_F_TUNINFO_IPV6		(1ULL << 0)
 
-/* BPF_FUNC_get_stackid flags. */
+/* flags for both BPF_FUNC_get_stackid and BPF_FUNC_get_stack. */
 #define BPF_F_SKIP_FIELD_MASK		0xffULL
 #define BPF_F_USER_STACK		(1ULL << 8)
+/* flags used by BPF_FUNC_get_stackid only. */
 #define BPF_F_FAST_STACK_CMP		(1ULL << 9)
 #define BPF_F_REUSE_STACKID		(1ULL << 10)
+/* flags used by BPF_FUNC_get_stack only. */
+#define BPF_F_USER_BUILD_ID		(1ULL << 11)
 
 /* BPF_FUNC_skb_set_tunnel_key flags. */
 #define BPF_F_ZERO_CSUM_TX		(1ULL << 1)
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index d315b39..bf22eca 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -31,6 +31,7 @@
 #include <linux/rbtree_latch.h>
 #include <linux/kallsyms.h>
 #include <linux/rcupdate.h>
+#include <linux/perf_event.h>
 
 #include <asm/unaligned.h>
 
@@ -1709,6 +1710,10 @@ static void bpf_prog_free_deferred(struct work_struct *work)
 	aux = container_of(work, struct bpf_prog_aux, work);
 	if (bpf_prog_is_dev_bound(aux))
 		bpf_prog_offload_destroy(aux->prog);
+#ifdef CONFIG_PERF_EVENTS
+	if (aux->prog->need_callchain_buf)
+		put_callchain_buffers();
+#endif
 	for (i = 0; i < aux->func_cnt; i++)
 		bpf_jit_free(aux->func[i]);
 	if (aux->func_cnt) {
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index 04f6ec1..4477cf6 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -402,6 +402,73 @@ const struct bpf_func_proto bpf_get_stackid_proto = {
 	.arg3_type	= ARG_ANYTHING,
 };
 
+BPF_CALL_4(bpf_get_stack, struct pt_regs *, regs, void *, buf, u32, size,
+	   u64, flags)
+{
+	u32 init_nr, trace_nr, copy_len, elem_size, num_elem;
+	bool user_build_id = flags & BPF_F_USER_BUILD_ID;
+	u32 skip = flags & BPF_F_SKIP_FIELD_MASK;
+	bool user = flags & BPF_F_USER_STACK;
+	struct perf_callchain_entry *trace;
+	bool kernel = !user;
+	int err = -EINVAL;
+	u64 *ips;
+
+	if (unlikely(flags & ~(BPF_F_SKIP_FIELD_MASK | BPF_F_USER_STACK |
+			       BPF_F_USER_BUILD_ID)))
+		goto clear;
+	if (kernel && user_build_id)
+		goto clear;
+
+	elem_size = (user && user_build_id) ? sizeof(struct bpf_stack_build_id)
+					    : sizeof(u64);
+	if (unlikely(size % elem_size))
+		goto clear;
+
+	num_elem = size / elem_size;
+	if (sysctl_perf_event_max_stack < num_elem)
+		init_nr = 0;
+	else
+		init_nr = sysctl_perf_event_max_stack - num_elem;
+	trace = get_perf_callchain(regs, init_nr, kernel, user,
+				   sysctl_perf_event_max_stack, false, false);
+	if (unlikely(!trace))
+		goto err_fault;
+
+	trace_nr = trace->nr - init_nr;
+	if (trace_nr <= skip)
+		goto err_fault;
+
+	trace_nr -= skip;
+	trace_nr = (trace_nr <= num_elem) ? trace_nr : num_elem;
+	copy_len = trace_nr * elem_size;
+	ips = trace->ip + skip + init_nr;
+	if (user && user_build_id)
+		stack_map_get_build_id_offset(buf, ips, trace_nr, user);
+	else
+		memcpy(buf, ips, copy_len);
+
+	if (size > copy_len)
+		memset(buf + copy_len, 0, size - copy_len);
+	return copy_len;
+
+err_fault:
+	err = -EFAULT;
+clear:
+	memset(buf, 0, size);
+	return err;
+}
+
+const struct bpf_func_proto bpf_get_stack_proto = {
+	.func		= bpf_get_stack,
+	.gpl_only	= true,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_CTX,
+	.arg2_type	= ARG_PTR_TO_UNINIT_MEM,
+	.arg3_type	= ARG_CONST_SIZE_OR_ZERO,
+	.arg4_type	= ARG_ANYTHING,
+};
+
 /* Called from eBPF program */
 static void *stack_map_lookup_elem(struct bpf_map *map, void *key)
 {
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index fe23dc5a..1ee71f6 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1360,6 +1360,16 @@ static int bpf_prog_load(union bpf_attr *attr)
 	if (err)
 		goto free_used_maps;
 
+	if (prog->need_callchain_buf) {
+#ifdef CONFIG_PERF_EVENTS
+		err = get_callchain_buffers(sysctl_perf_event_max_stack);
+#else
+		err = -ENOTSUPP;
+#endif
+		if (err)
+			goto free_used_maps;
+	}
+
 	err = bpf_prog_new_fd(prog);
 	if (err < 0) {
 		/* failed to allocate fd.
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 5dd1dcb..aba9425 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -2460,6 +2460,9 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn
 	if (err)
 		return err;
 
+	if (func_id == BPF_FUNC_get_stack)
+		env->prog->need_callchain_buf = true;
+
 	if (changes_data)
 		clear_all_pkt_pointers(env);
 	return 0;
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index d88e96d..fe8476f 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -20,6 +20,7 @@
 #include "trace.h"
 
 u64 bpf_get_stackid(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5);
+u64 bpf_get_stack(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5);
 
 /**
  * trace_call_bpf - invoke BPF program
@@ -577,6 +578,8 @@ kprobe_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 		return &bpf_perf_event_output_proto;
 	case BPF_FUNC_get_stackid:
 		return &bpf_get_stackid_proto;
+	case BPF_FUNC_get_stack:
+		return &bpf_get_stack_proto;
 	case BPF_FUNC_perf_event_read_value:
 		return &bpf_perf_event_read_value_proto;
 #ifdef CONFIG_BPF_KPROBE_OVERRIDE
@@ -664,6 +667,25 @@ static const struct bpf_func_proto bpf_get_stackid_proto_tp = {
 	.arg3_type	= ARG_ANYTHING,
 };
 
+BPF_CALL_4(bpf_get_stack_tp, void *, tp_buff, void *, buf, u32, size,
+	   u64, flags)
+{
+	struct pt_regs *regs = *(struct pt_regs **)tp_buff;
+
+	return bpf_get_stack((unsigned long) regs, (unsigned long) buf,
+			     (unsigned long) size, flags, 0);
+}
+
+static const struct bpf_func_proto bpf_get_stack_proto_tp = {
+	.func		= bpf_get_stack_tp,
+	.gpl_only	= true,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_CTX,
+	.arg2_type	= ARG_PTR_TO_UNINIT_MEM,
+	.arg3_type	= ARG_CONST_SIZE_OR_ZERO,
+	.arg4_type	= ARG_ANYTHING,
+};
+
 static const struct bpf_func_proto *
 tp_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 {
@@ -672,6 +694,8 @@ tp_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 		return &bpf_perf_event_output_proto_tp;
 	case BPF_FUNC_get_stackid:
 		return &bpf_get_stackid_proto_tp;
+	case BPF_FUNC_get_stack:
+		return &bpf_get_stack_proto_tp;
 	default:
 		return tracing_func_proto(func_id, prog);
 	}
@@ -734,6 +758,8 @@ pe_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 		return &bpf_perf_event_output_proto_tp;
 	case BPF_FUNC_get_stackid:
 		return &bpf_get_stackid_proto_tp;
+	case BPF_FUNC_get_stack:
+		return &bpf_get_stack_proto_tp;
 	case BPF_FUNC_perf_prog_read_value:
 		return &bpf_perf_prog_read_value_proto;
 	default:
@@ -744,7 +770,7 @@ pe_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 /*
  * bpf_raw_tp_regs are separate from bpf_pt_regs used from skb/xdp
  * to avoid potential recursive reuse issue when/if tracepoints are added
- * inside bpf_*_event_output and/or bpf_get_stack_id
+ * inside bpf_*_event_output, bpf_get_stackid and/or bpf_get_stack
  */
 static DEFINE_PER_CPU(struct pt_regs, bpf_raw_tp_regs);
 BPF_CALL_5(bpf_perf_event_output_raw_tp, struct bpf_raw_tracepoint_args *, args,
@@ -787,6 +813,26 @@ static const struct bpf_func_proto bpf_get_stackid_proto_raw_tp = {
 	.arg3_type	= ARG_ANYTHING,
 };
 
+BPF_CALL_4(bpf_get_stack_raw_tp, struct bpf_raw_tracepoint_args *, args,
+	   void *, buf, u32, size, u64, flags)
+{
+	struct pt_regs *regs = this_cpu_ptr(&bpf_raw_tp_regs);
+
+	perf_fetch_caller_regs(regs);
+	return bpf_get_stack((unsigned long) regs, (unsigned long) buf,
+			     (unsigned long) size, flags, 0);
+}
+
+static const struct bpf_func_proto bpf_get_stack_proto_raw_tp = {
+	.func		= bpf_get_stack_raw_tp,
+	.gpl_only	= true,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_CTX,
+	.arg2_type	= ARG_PTR_TO_MEM,
+	.arg3_type	= ARG_CONST_SIZE_OR_ZERO,
+	.arg4_type	= ARG_ANYTHING,
+};
+
 static const struct bpf_func_proto *
 raw_tp_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 {
@@ -795,6 +841,8 @@ raw_tp_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 		return &bpf_perf_event_output_proto_raw_tp;
 	case BPF_FUNC_get_stackid:
 		return &bpf_get_stackid_proto_raw_tp;
+	case BPF_FUNC_get_stack:
+		return &bpf_get_stack_proto_raw_tp;
 	default:
 		return tracing_func_proto(func_id, prog);
 	}
-- 
2.9.5

^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox