Netdev List

Netdev List
 help / color / mirror / Atom feed

* [PATCH v2 bpf 1/3] bpf: fix lockdep false positive in percpu_freelist
From: Alexei Starovoitov @ 2019-01-31  2:12 UTC (permalink / raw)
  To: davem; +Cc: daniel, peterz, edumazet, jannh, netdev, kernel-team
In-Reply-To: <20190131021245.1905869-1-ast@kernel.org>

Lockdep warns about false positive:
[   12.492084] 00000000e6b28347 (&head->lock){+...}, at: pcpu_freelist_push+0x2a/0x40
[   12.492696] but this lock was taken by another, HARDIRQ-safe lock in the past:
[   12.493275]  (&rq->lock){-.-.}
[   12.493276]
[   12.493276]
[   12.493276] and interrupts could create inverse lock ordering between them.
[   12.493276]
[   12.494435]
[   12.494435] other info that might help us debug this:
[   12.494979]  Possible interrupt unsafe locking scenario:
[   12.494979]
[   12.495518]        CPU0                    CPU1
[   12.495879]        ----                    ----
[   12.496243]   lock(&head->lock);
[   12.496502]                                local_irq_disable();
[   12.496969]                                lock(&rq->lock);
[   12.497431]                                lock(&head->lock);
[   12.497890]   <Interrupt>
[   12.498104]     lock(&rq->lock);
[   12.498368]
[   12.498368]  *** DEADLOCK ***
[   12.498368]
[   12.498837] 1 lock held by dd/276:
[   12.499110]  #0: 00000000c58cb2ee (rcu_read_lock){....}, at: trace_call_bpf+0x5e/0x240
[   12.499747]
[   12.499747] the shortest dependencies between 2nd lock and 1st lock:
[   12.500389]  -> (&rq->lock){-.-.} {
[   12.500669]     IN-HARDIRQ-W at:
[   12.500934]                       _raw_spin_lock+0x2f/0x40
[   12.501373]                       scheduler_tick+0x4c/0xf0
[   12.501812]                       update_process_times+0x40/0x50
[   12.502294]                       tick_periodic+0x27/0xb0
[   12.502723]                       tick_handle_periodic+0x1f/0x60
[   12.503203]                       timer_interrupt+0x11/0x20
[   12.503651]                       __handle_irq_event_percpu+0x43/0x2c0
[   12.504167]                       handle_irq_event_percpu+0x20/0x50
[   12.504674]                       handle_irq_event+0x37/0x60
[   12.505139]                       handle_level_irq+0xa7/0x120
[   12.505601]                       handle_irq+0xa1/0x150
[   12.506018]                       do_IRQ+0x77/0x140
[   12.506411]                       ret_from_intr+0x0/0x1d
[   12.506834]                       _raw_spin_unlock_irqrestore+0x53/0x60
[   12.507362]                       __setup_irq+0x481/0x730
[   12.507789]                       setup_irq+0x49/0x80
[   12.508195]                       hpet_time_init+0x21/0x32
[   12.508644]                       x86_late_time_init+0xb/0x16
[   12.509106]                       start_kernel+0x390/0x42a
[   12.509554]                       secondary_startup_64+0xa4/0xb0
[   12.510034]     IN-SOFTIRQ-W at:
[   12.510305]                       _raw_spin_lock+0x2f/0x40
[   12.510772]                       try_to_wake_up+0x1c7/0x4e0
[   12.511220]                       swake_up_locked+0x20/0x40
[   12.511657]                       swake_up_one+0x1a/0x30
[   12.512070]                       rcu_process_callbacks+0xc5/0x650
[   12.512553]                       __do_softirq+0xe6/0x47b
[   12.512978]                       irq_exit+0xc3/0xd0
[   12.513372]                       smp_apic_timer_interrupt+0xa9/0x250
[   12.513876]                       apic_timer_interrupt+0xf/0x20
[   12.514343]                       default_idle+0x1c/0x170
[   12.514765]                       do_idle+0x199/0x240
[   12.515159]                       cpu_startup_entry+0x19/0x20
[   12.515614]                       start_kernel+0x422/0x42a
[   12.516045]                       secondary_startup_64+0xa4/0xb0
[   12.516521]     INITIAL USE at:
[   12.516774]                      _raw_spin_lock_irqsave+0x38/0x50
[   12.517258]                      rq_attach_root+0x16/0xd0
[   12.517685]                      sched_init+0x2f2/0x3eb
[   12.518096]                      start_kernel+0x1fb/0x42a
[   12.518525]                      secondary_startup_64+0xa4/0xb0
[   12.518986]   }
[   12.519132]   ... key      at: [<ffffffff82b7bc28>] __key.71384+0x0/0x8
[   12.519649]   ... acquired at:
[   12.519892]    pcpu_freelist_pop+0x7b/0xd0
[   12.520221]    bpf_get_stackid+0x1d2/0x4d0
[   12.520563]    ___bpf_prog_run+0x8b4/0x11a0
[   12.520887]
[   12.521008] -> (&head->lock){+...} {
[   12.521292]    HARDIRQ-ON-W at:
[   12.521539]                     _raw_spin_lock+0x2f/0x40
[   12.521950]                     pcpu_freelist_push+0x2a/0x40
[   12.522396]                     bpf_get_stackid+0x494/0x4d0
[   12.522828]                     ___bpf_prog_run+0x8b4/0x11a0
[   12.523296]    INITIAL USE at:
[   12.523537]                    _raw_spin_lock+0x2f/0x40
[   12.523944]                    pcpu_freelist_populate+0xc0/0x120
[   12.524417]                    htab_map_alloc+0x405/0x500
[   12.524835]                    __do_sys_bpf+0x1a3/0x1a90
[   12.525253]                    do_syscall_64+0x4a/0x180
[   12.525659]                    entry_SYSCALL_64_after_hwframe+0x49/0xbe
[   12.526167]  }
[   12.526311]  ... key      at: [<ffffffff838f7668>] __key.13130+0x0/0x8
[   12.526812]  ... acquired at:
[   12.527047]    __lock_acquire+0x521/0x1350
[   12.527371]    lock_acquire+0x98/0x190
[   12.527680]    _raw_spin_lock+0x2f/0x40
[   12.527994]    pcpu_freelist_push+0x2a/0x40
[   12.528325]    bpf_get_stackid+0x494/0x4d0
[   12.528645]    ___bpf_prog_run+0x8b4/0x11a0
[   12.528970]
[   12.529092]
[   12.529092] stack backtrace:
[   12.529444] CPU: 0 PID: 276 Comm: dd Not tainted 5.0.0-rc3-00018-g2fa53f892422 #475
[   12.530043] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.11.0-2.el7 04/01/2014
[   12.530750] Call Trace:
[   12.530948]  dump_stack+0x5f/0x8b
[   12.531248]  check_usage_backwards+0x10c/0x120
[   12.531598]  ? ___bpf_prog_run+0x8b4/0x11a0
[   12.531935]  ? mark_lock+0x382/0x560
[   12.532229]  mark_lock+0x382/0x560
[   12.532496]  ? print_shortest_lock_dependencies+0x180/0x180
[   12.532928]  __lock_acquire+0x521/0x1350
[   12.533271]  ? find_get_entry+0x17f/0x2e0
[   12.533586]  ? find_get_entry+0x19c/0x2e0
[   12.533902]  ? lock_acquire+0x98/0x190
[   12.534196]  lock_acquire+0x98/0x190
[   12.534482]  ? pcpu_freelist_push+0x2a/0x40
[   12.534810]  _raw_spin_lock+0x2f/0x40
[   12.535099]  ? pcpu_freelist_push+0x2a/0x40
[   12.535432]  pcpu_freelist_push+0x2a/0x40
[   12.535750]  bpf_get_stackid+0x494/0x4d0
[   12.536062]  ___bpf_prog_run+0x8b4/0x11a0

It has been explained that is a false positive here:
https://lkml.org/lkml/2018/7/25/756
Recap:
- stackmap uses pcpu_freelist
- The lock in pcpu_freelist is a percpu lock
- stackmap is only used by tracing bpf_prog
- A tracing bpf_prog cannot be run if another bpf_prog
  has already been running (ensured by the percpu bpf_prog_active counter).

Eric pointed out that this lockdep splats stops other
legit lockdep splats in selftests/bpf/test_progs.c.

Fix this by calling local_irq_save/restore for stackmap.

Another false positive had also been worked around by calling
local_irq_save in commit 89ad2fa3f043 ("bpf: fix lockdep splat").
That commit added unnecessary irq_save/restore to fast path of
bpf hash map. irqs are already disabled at that point, since htab
is holding per bucket spin_lock with irqsave.

Let's reduce overhead for htab by introducing __pcpu_freelist_push/pop
function w/o irqsave and convert pcpu_freelist_push/pop to irqsave
to be used elsewhere (right now only in stackmap).
It stops lockdep false positive in stackmap with a bit of acceptable overhead.

Fixes: 557c0c6e7df8 ("bpf: convert stackmap to pre-allocation")
Reported-by: Naresh Kamboju <naresh.kamboju@linaro.org>
Reported-by: Eric Dumazet <eric.dumazet@gmail.com>
Acked-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/hashtab.c         |  4 ++--
 kernel/bpf/percpu_freelist.c | 41 +++++++++++++++++++++++++-----------
 kernel/bpf/percpu_freelist.h |  4 ++++
 3 files changed, 35 insertions(+), 14 deletions(-)

diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index 4b7c76765d9d..f9274114c88d 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -686,7 +686,7 @@ static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l)
 	}
 
 	if (htab_is_prealloc(htab)) {
-		pcpu_freelist_push(&htab->freelist, &l->fnode);
+		__pcpu_freelist_push(&htab->freelist, &l->fnode);
 	} else {
 		atomic_dec(&htab->count);
 		l->htab = htab;
@@ -748,7 +748,7 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
 		} else {
 			struct pcpu_freelist_node *l;
 
-			l = pcpu_freelist_pop(&htab->freelist);
+			l = __pcpu_freelist_pop(&htab->freelist);
 			if (!l)
 				return ERR_PTR(-E2BIG);
 			l_new = container_of(l, struct htab_elem, fnode);
diff --git a/kernel/bpf/percpu_freelist.c b/kernel/bpf/percpu_freelist.c
index 673fa6fe2d73..0c1b4ba9e90e 100644
--- a/kernel/bpf/percpu_freelist.c
+++ b/kernel/bpf/percpu_freelist.c
@@ -28,8 +28,8 @@ void pcpu_freelist_destroy(struct pcpu_freelist *s)
 	free_percpu(s->freelist);
 }
 
-static inline void __pcpu_freelist_push(struct pcpu_freelist_head *head,
-					struct pcpu_freelist_node *node)
+static inline void ___pcpu_freelist_push(struct pcpu_freelist_head *head,
+					 struct pcpu_freelist_node *node)
 {
 	raw_spin_lock(&head->lock);
 	node->next = head->first;
@@ -37,12 +37,22 @@ static inline void __pcpu_freelist_push(struct pcpu_freelist_head *head,
 	raw_spin_unlock(&head->lock);
 }
 
-void pcpu_freelist_push(struct pcpu_freelist *s,
+void __pcpu_freelist_push(struct pcpu_freelist *s,
 			struct pcpu_freelist_node *node)
 {
 	struct pcpu_freelist_head *head = this_cpu_ptr(s->freelist);
 
-	__pcpu_freelist_push(head, node);
+	___pcpu_freelist_push(head, node);
+}
+
+void pcpu_freelist_push(struct pcpu_freelist *s,
+			struct pcpu_freelist_node *node)
+{
+	unsigned long flags;
+
+	local_irq_save(flags);
+	__pcpu_freelist_push(s, node);
+	local_irq_restore(flags);
 }
 
 void pcpu_freelist_populate(struct pcpu_freelist *s, void *buf, u32 elem_size,
@@ -63,7 +73,7 @@ void pcpu_freelist_populate(struct pcpu_freelist *s, void *buf, u32 elem_size,
 	for_each_possible_cpu(cpu) {
 again:
 		head = per_cpu_ptr(s->freelist, cpu);
-		__pcpu_freelist_push(head, buf);
+		___pcpu_freelist_push(head, buf);
 		i++;
 		buf += elem_size;
 		if (i == nr_elems)
@@ -74,14 +84,12 @@ void pcpu_freelist_populate(struct pcpu_freelist *s, void *buf, u32 elem_size,
 	local_irq_restore(flags);
 }
 
-struct pcpu_freelist_node *pcpu_freelist_pop(struct pcpu_freelist *s)
+struct pcpu_freelist_node *__pcpu_freelist_pop(struct pcpu_freelist *s)
 {
 	struct pcpu_freelist_head *head;
 	struct pcpu_freelist_node *node;
-	unsigned long flags;
 	int orig_cpu, cpu;
 
-	local_irq_save(flags);
 	orig_cpu = cpu = raw_smp_processor_id();
 	while (1) {
 		head = per_cpu_ptr(s->freelist, cpu);
@@ -89,16 +97,25 @@ struct pcpu_freelist_node *pcpu_freelist_pop(struct pcpu_freelist *s)
 		node = head->first;
 		if (node) {
 			head->first = node->next;
-			raw_spin_unlock_irqrestore(&head->lock, flags);
+			raw_spin_unlock(&head->lock);
 			return node;
 		}
 		raw_spin_unlock(&head->lock);
 		cpu = cpumask_next(cpu, cpu_possible_mask);
 		if (cpu >= nr_cpu_ids)
 			cpu = 0;
-		if (cpu == orig_cpu) {
-			local_irq_restore(flags);
+		if (cpu == orig_cpu)
 			return NULL;
-		}
 	}
 }
+
+struct pcpu_freelist_node *pcpu_freelist_pop(struct pcpu_freelist *s)
+{
+	struct pcpu_freelist_node *ret;
+	unsigned long flags;
+
+	local_irq_save(flags);
+	ret = __pcpu_freelist_pop(s);
+	local_irq_restore(flags);
+	return ret;
+}
diff --git a/kernel/bpf/percpu_freelist.h b/kernel/bpf/percpu_freelist.h
index 3049aae8ea1e..c3960118e617 100644
--- a/kernel/bpf/percpu_freelist.h
+++ b/kernel/bpf/percpu_freelist.h
@@ -22,8 +22,12 @@ struct pcpu_freelist_node {
 	struct pcpu_freelist_node *next;
 };
 
+/* pcpu_freelist_* do spin_lock_irqsave. */
 void pcpu_freelist_push(struct pcpu_freelist *, struct pcpu_freelist_node *);
 struct pcpu_freelist_node *pcpu_freelist_pop(struct pcpu_freelist *);
+/* __pcpu_freelist_* do spin_lock only. caller must disable irqs. */
+void __pcpu_freelist_push(struct pcpu_freelist *, struct pcpu_freelist_node *);
+struct pcpu_freelist_node *__pcpu_freelist_pop(struct pcpu_freelist *);
 void pcpu_freelist_populate(struct pcpu_freelist *s, void *buf, u32 elem_size,
 			    u32 nr_elems);
 int pcpu_freelist_init(struct pcpu_freelist *);
-- 
2.20.0


^ permalink raw reply related

* [PATCH v2 bpf 0/3] bpf: fixes for lockdep and deadlocks
From: Alexei Starovoitov @ 2019-01-31  2:12 UTC (permalink / raw)
  To: davem; +Cc: daniel, peterz, edumazet, jannh, netdev, kernel-team

v1->v2:
- reworded 2nd patch. It's a real dead lock. Not a false positive
- dropped the lockdep fix for up_read_non_owner in bpf_get_stackid

In addition to preempt_disable patch for socket filters
https://patchwork.ozlabs.org/patch/1032437/
First patch fixes lockdep false positive in percpu_freelist
Second patch fixes potential deadlock in bpf_prog_register
Third patch fixes another potential deadlock in stackmap access
from tracing bpf prog and from syscall.

Alexei Starovoitov (2):
  bpf: fix lockdep false positive in percpu_freelist
  bpf: fix potential deadlock in bpf_prog_register

Martin KaFai Lau (1):
  bpf: Fix syscall's stackmap lookup potential deadlock

 kernel/bpf/hashtab.c         |  4 ++--
 kernel/bpf/percpu_freelist.c | 41 +++++++++++++++++++++++++-----------
 kernel/bpf/percpu_freelist.h |  4 ++++
 kernel/bpf/syscall.c         | 12 +++++++++--
 kernel/trace/bpf_trace.c     | 14 ++----------
 5 files changed, 47 insertions(+), 28 deletions(-)

-- 
2.20.0


^ permalink raw reply

* Re: Kernel memory corruption in CIPSO labeled TCP packets processing.
From: Paul Moore @ 2019-01-31  2:10 UTC (permalink / raw)
  To: Nazarov Sergey
  Cc: linux-security-module@vger.kernel.org, selinux@vger.kernel.org,
	netdev@vger.kernel.org, Casey Schaufler
In-Reply-To: <3191601548853902@myt6-23299ba78d64.qloud-c.yandex.net>

On Wed, Jan 30, 2019 at 8:11 AM Nazarov Sergey <s-nazarov@yandex.ru> wrote:
> 30.01.2019, 01:42, "Paul Moore" <paul@paul-moore.com>:
> > There are several cases where the stack ends up calling icmp_send()
> > after the skb has been through ip_options_compile(), that should be
> > okay.
> >
> > --
> > paul moore
> > www.paul-moore.com
>
> In those cases precompiled ip_options struct used, without the need to reuse ip_options_compile.
> I think, for error ICMP packet, we can discard all other options except CIPSO. It will be better, than
> send packet, contains wrong option's data. Modified patch 2:
> ---
>  net/ipv4/cipso_ipv4.c |   24 ++++++++++++++++++++++--
>  1 files changed, 22 insertions(+), 2 deletions(-)

This isn't how the rest of the stack works, look at
ip_local_deliver_finish() for one example.  Perhaps the behavior you
are proposing is correct, but please show me where in the various RFC
specs it is defined so that I can better understand why it should work
this way.

> diff --git a/net/ipv4/cipso_ipv4.c b/net/ipv4/cipso_ipv4.c
> index 777fa3b..797826c 100644
> --- a/net/ipv4/cipso_ipv4.c
> +++ b/net/ipv4/cipso_ipv4.c
> @@ -1735,13 +1735,33 @@ int cipso_v4_validate(const struct sk_buff *skb, unsigned char **option)
>   */
>  void cipso_v4_error(struct sk_buff *skb, int error, u32 gateway)
>  {
> +       struct ip_options opt;
> +       unsigned char *optptr;
> +
>         if (ip_hdr(skb)->protocol == IPPROTO_ICMP || error != -EACCES)
>                 return;
>
> +       /*
> +        * We might be called above the IP layer,
> +        * so we can not use icmp_send and IPCB here.
> +        *
> +        * For the generated ICMP packet, we create a
> +        * temporary ip _options structure, contains
> +        * the CIPSO option only, since the other options data
> +        * could be modified when the original packet receiving.
> +        */
> +
> +       memset(&opt, 0, sizeof(struct ip_options));
> +       optptr = cipso_v4_optptr(skb);
> +       if (optptr) {
> +               opt.optlen = optptr[1];
> +               opt.cipso = optptr - skb_network_header(skb);
> +       }
> +
>         if (gateway)
> -               icmp_send(skb, ICMP_DEST_UNREACH, ICMP_NET_ANO, 0);
> +               __icmp_send(skb, ICMP_DEST_UNREACH, ICMP_NET_ANO, 0, &opt);
>         else
> -               icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_ANO, 0);
> +               __icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_ANO, 0, &opt);
>  }
>
>  /**
>


-- 
paul moore
www.paul-moore.com

^ permalink raw reply

* Re: [PATCH bpf-next 2/4] bpf: fix lockdep false positive in stackmap
From: Alexei Starovoitov @ 2019-01-31  2:01 UTC (permalink / raw)
  To: Waiman Long
  Cc: Peter Zijlstra, Alexei Starovoitov, davem, daniel, edumazet,
	jannh, netdev, kernel-team
In-Reply-To: <ac6334bc-1e74-ec9b-11ab-91199fb88fd0@redhat.com>

On Wed, Jan 30, 2019 at 04:32:12PM -0500, Waiman Long wrote:
> On 01/30/2019 04:11 PM, Waiman Long wrote:
> > On 01/30/2019 03:10 PM, Alexei Starovoitov wrote:
> >> On Wed, Jan 30, 2019 at 02:42:23PM -0500, Waiman Long wrote:
> >>> On 01/30/2019 02:30 PM, Alexei Starovoitov wrote:
> >>>> On Wed, Jan 30, 2019 at 11:15:30AM +0100, Peter Zijlstra wrote:
> >>>>> On Tue, Jan 29, 2019 at 08:04:56PM -0800, Alexei Starovoitov wrote:
> >>>>>> Lockdep warns about false positive:
> >>>>> This is not a false positive, and you probably also need to use
> >>>>> down_read_non_owner() to match this up_read_non_owner().
> >>>>>
> >>>>> {up,down}_read() and {up,down}_read_non_owner() are not only different
> >>>>> in the lockdep annotation; there is also optimistic spin stuff that
> >>>>> relies on 'owner' tracking.
> >>>> Can you point out in the code the spin bit?
> >>>> As far as I can see sem->owner is debug only feature.
> >>>> All owner checks are done under CONFIG_DEBUG_RWSEMS.
> >>> No, sem->owner is mainly for performing optimistic spinning which is a
> >>> performance feature to make rwsem writer-lock performs similar to mutex.
> >>> The debugging part is just an add-on. It is not the reason for the
> >>> presence of sem->owner.
> >> I see. Got it.
> >>
> >>>> Also there is no down_read_trylock_non_owner() at the moment.
> >>>> We can argue about it for -next, but I'd rather silence lockdep
> >>>> with this patch today.
> >>>>
> >>> We can add down_read_trylock_non_owner() if there is a need for it. It
> >>> should be easy to do.
> >> Yes, but looking through the code it's not clear to me that it's safe
> >> to mix non_owner() versions with regular.
> >> bpf/stackmap.c does down_read_trylock + up_read.
> >> If we add new down_read_trylock_non_owner that set the owner to
> >> NULL | RWSEM_* bits is this safe with conccurent read/write
> >> that do regular versions?
> >> rwsem_can_spin_on_owner() does:
> >>         if (owner) {
> >>                 ret = is_rwsem_owner_spinnable(owner) &&
> >>                       owner_on_cpu(owner);
> >> that looks correct.
> >> For a second I thought there could be fault here due to non_owner.
> >> But there could be other places where it's assumed that owner
> >> is never null?
> > The content of owner is not the cause of the lockdep warning. The
> > lockdep code assumes that the task that acquires the lock will release
> > it some time later. That is not the case when you need to acquire the
> > lock by one task and released by another. In this case, you have to use
> > the non_owner version of down/up_read which disable the lockdep
> > acquire/release tracking. That will be the only difference between the
> > two set of APIs.
> >
> > Cheers,
> > Longman
> 
> BTW, you may want to do something like that to make sure that the lock
> ownership is probably tracked.
> 
> -Longman
> 
> diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
> index d43b145..79eef9d 100644
> --- a/kernel/bpf/stackmap.c
> +++ b/kernel/bpf/stackmap.c
> @@ -338,6 +338,13 @@ static void stack_map_get_build_id_offset(struct
> bpf_stack_
>         } else {
>                 work->sem = &current->mm->mmap_sem;
>                 irq_work_queue(&work->irq_work);
> +
> +               /*
> +                * The irq_work will release the mmap_sem with
> +                * up_read_non_owner(). The rwsem_release() is called
> +                * here to release the lock from lockdep's perspective.
> +                */
> +               rwsem_release(&current->mm->mmap_sem.dep_map, 1, _RET_IP_);

are you saying the above is enough coupled with up_read_non_owner?
Looking at how amdgpu is using this api... I think they just use non_owner
version when doing things from different task.
So I don't think pairing non_owner with non_owner is strictly necessary.
It seems fine to use down_read_trylock() with above rwsem_release() hack
plus up_read_non_owner().
Am I missing something?


^ permalink raw reply

* Re: [PATCH net-next] net: udp Allow CHECKSUM_UNNECESSARY packets to do GRO.
From: maowenan @ 2019-01-31  1:57 UTC (permalink / raw)
  To: Tom Herbert
  Cc: Linux Kernel Network Developers, David S. Miller, Eric Dumazet
In-Reply-To: <CALx6S369gR+RPB3Gb_QwE9HAPKmFunFNkVZxo4GJZ8Jx++aDzw@mail.gmail.com>



On 2019/1/30 4:24, Tom Herbert wrote:
> On Tue, Jan 29, 2019 at 12:08 AM maowenan <maowenan@huawei.com> wrote:
>>
>>
>>
>> On 2019/1/29 14:24, Tom Herbert wrote:
>>> On Mon, Jan 28, 2019 at 10:04 PM maowenan <maowenan@huawei.com> wrote:
>>>>
>>>>
>>>>
>>>> On 2019/1/29 12:01, Tom Herbert wrote:
>>>>> On Mon, Jan 28, 2019 at 7:00 PM maowenan <maowenan@huawei.com> wrote:
>>>>>>
>>>>>> Hi all，
>>>>>> Do you have any comments about this change?
>>>>>>
>>>>>>
>>>>>> On 2019/1/23 11:33, Mao Wenan wrote:
>>>>>>> When udp4_gro_receive() get one packet that uh->check=0,
>>>>>>> skb_gro_checksum_validate_zero_check() will set the
>>>>>>> skb->ip_summed = CHECKSUM_UNNECESSARY;
>>>>>>> skb->csum_level = 0;
>>>>>>> Then udp_gro_receive() will flush the packet which is not CHECKSUM_PARTIAL,
>>>>>>> It is not our expect,  because check=0 in udp header indicates this
>>>>>>> packet is no need to caculate checksum, we should go further to do GRO.
>>>>>>>
>>>>>>> This patch changes the value of csum_cnt according to skb->csum_level.
>>>>>>> ---
>>>>>>>  include/linux/netdevice.h | 1 +
>>>>>>>  1 file changed, 1 insertion(+)
>>>>>>>
>>>>>>> diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
>>>>>>> index 1377d08..9c819f1 100644
>>>>>>> --- a/include/linux/netdevice.h
>>>>>>> +++ b/include/linux/netdevice.h
>>>>>>> @@ -2764,6 +2764,7 @@ static inline void skb_gro_incr_csum_unnecessary(struct sk_buff *skb)
>>>>>>>                * during GRO. This saves work if we fallback to normal path.
>>>>>>>                */
>>>>>>>               __skb_incr_checksum_unnecessary(skb);
>>>>>>> +             NAPI_GRO_CB(skb)->csum_cnt = skb->csum_level + 1;
>>>>>
>>>>> That doesn't look right. This would be reinitializing the GRO
>>>>> checksums from the beginning.
>>>>>
>>>>>>>       }
>>>>>>>  }
>>>>>>>
>>>>>>>
>>>>>>
>>>>> I assume the code is bailing on this conditional:
>>>>>
>>>>> if (NAPI_GRO_CB(skb)->encap_mark ||
>>>>>             (skb->ip_summed != CHECKSUM_PARTIAL &&
>>>>>              NAPI_GRO_CB(skb)->csum_cnt == 0 &&
>>>>>              !NAPI_GRO_CB(skb)->csum_valid) ||
>>>>>             !udp_sk(sk)->gro_receive)
>>>>>                 goto out_unlock;
>>>>>
>>>>> I am trying to remember why this needs to check csum_cnt. If there was
>>>>> a csum_cnt for the UDP csum being zero from checksum-unnecessary, it
>>>>> was consumed by skb_gro_checksum_validate_zero_check in UDP4 GRO
>>>>> received.
>>>>
>>>> We have met the scene about two VMs in different host with vxlan packets, when udp4_gro_receive receives
>>>> one packet with ip_summed=CHECKSUM_NONE,csum_cnt=0,csum_valid=0,and udp->check=0, then skb_gro_checksum_validate_zero_check()->
>>>> skb_gro_incr_csum_unnecessary() validate it and set ip_summed=CHECKSUM_UNNECESSARY,csum_level=0, but csum_cnt and csum_valid
>>>> keep zero value. Then it will be flushed in udp_gro_receive(), the codes as you have showed.
>>>>
>>>> so I think it forgets to modify csum_cnt since csum_level is changed in skb_gro_incr_csum_unnecessary()->__skb_incr_checksum_unnecessary().
>>>>
>>> Yes, but the csum_level is changing since we've gone beyond the
>>> checksums initially reported inc checksum-unnecessary. GRO csum_cnt is
>>> initialized to skb->csum_level + 1 at the start of GRO processing.
>>>
>>> If I recall, the rule is that UDP GRO requires at least one non-zero
>>> checksum to be verified. The idea is that if we end up computing
>>> packet checksums on the host for inner checksums like TCP during GRO,
>>> then that's negating the performance benefits of GRO. Had UDP check
>>> not been zero then we would do checksum unnecessary conversion and so
>>> csum_valid would be set for the remainded of GRO processing. The
>>> existing code is following the rule I believe, so this may be working
>>> as intended.
>>
>> Do you have any suggestion if I need do GRO as udp->check is zero?
>> My previous modification which works fine as below:
>>         if (NAPI_GRO_CB(skb)->encap_mark ||
>>             (skb->ip_summed != CHECKSUM_PARTIAL &&
>> +            skb->ip_summed != CHECKSUM_UNNECESSARY &&
> 
> That's effectively disabling the rule that we need a real checksum
> calculation to proceed with GRO. Besides that, the device returning
> one checksum-unnecessary level because UDP csum is zero is pretty
> pointelss; we can just as easily deduce get to same state just by
> looking at the field with CHECKSUM_NONE. What we really want to see
> for GRO is a real checksum computation being done on the packet.
> 
> A few questions:
> 
> What type of packets are being GROed? Are these TCP? What performance
> difference do you see with our patch? Can you try enabling UDP
> checksums, and even RCO with VXLAN? With UDP encapsulation we
> generally see better performance with checksum enabled since UDP
> checksum offload is ubiquitous and we can easily convert
> checksum-unnecessary (with non-zero csum) to checksum-complete.

We use the physical network card calculate the checksum of the inner packet with checksum offload.
Set the udp checksum of the vxlan header is 0.

With this patch, the bandwidth of TCP between two VMs increase from 2Gbit/s to 6Gbit/s.

> 
> Tom
> 
> 
> 
> 
>>              NAPI_GRO_CB(skb)->csum_cnt == 0 &&
>>              !NAPI_GRO_CB(skb)->csum_valid) ||
>>             !udp_sk(sk)->gro_receive)
>>                 goto out_unlock;
>>
>>
>>>
>>> Tom
>>>
>>>>>
>>>>> .
>>>>>
>>>>
>>>
>>> .
>>>
>>
> 
> .
> 


^ permalink raw reply

* Re: [PATCH] rtlwifi: remove set but not used variable 'cmd_seq'
From: YueHaibing @ 2019-01-31  1:53 UTC (permalink / raw)
  To: Kalle Valo, Pkshih; +Cc: linux-wireless@vger.kernel.org, netdev@vger.kernel.org
In-Reply-To: <87zhrijuay.fsf@kamboji.qca.qualcomm.com>


On 2019/1/30 18:47, Kalle Valo wrote:
> Pkshih <pkshih@realtek.com> writes:
> 
>> On Tue, 2019-01-29 at 14:03 +0800, YueHaibing wrote:
>>> +cc netdev@vger.kernel.org
>>>
>>> On 2019/1/29 13:57, YueHaibing wrote:
>>>> ping...
>>>>  
>>>> On 2018/9/11 20:12, YueHaibing wrote:
>>>>> Fixes gcc '-Wunused-but-set-variable' warning:
>>>>>
>>>>> drivers/net/wireless/realtek/rtlwifi/base.c: In function
>>> 'rtl_c2h_content_parsing':
>>>>> drivers/net/wireless/realtek/rtlwifi/base.c:2313:13: warning:
>>>>>   variable 'cmd_seq' set but not used [-Wunused-but-set-variable]
>>>>>
>>>>> Signed-off-by: YueHaibing <yuehaibing@huawei.com>
>>
>> Thanks for your fix.
>>
>> Acked-by: Ping-Ke Shih <pkshih@realtek.com>
> 
> For some reason I couldn't find the original patch from patchwork.
> I didn't find the version sent today though:
> 
> https://patchwork.kernel.org/patch/10787619/
> 
> BTW YueHaibing, you can check the linux-wireless patch status yourself from
> the patchwork:

Thanks, I'll check it.


> 
> https://wireless.wiki.kernel.org/en/developers/documentation/submittingpatches#checking_state_of_patches_from_patchwork
> 


^ permalink raw reply

* Re: net: phylink: dsa: mv88e6xxx: flaky link detection on switch ports with internal PHYs
From: John David Anglin @ 2019-01-31  1:27 UTC (permalink / raw)
  To: Andrew Lunn; +Cc: Russell King, Vivien Didelot, Florian Fainelli, netdev
In-Reply-To: <20190130223846.GB30115@lunn.ch>

On 2019-01-30 5:38 p.m., Andrew Lunn wrote:
> I'd suggest you take a look at the datasheet for the 37xx and check
> what the hardware actually supports. You might need to extend the
> driver.
I did look and the GIC does support level interrupts.  But all the
documentation is in
generic ARM documents that I don't currently have.  I'll see if I can
find them tomorrow.

Dave

-- 
John David Anglin  dave.anglin@bell.net


^ permalink raw reply

* Re: [PATCH net-next v2 01/12] net: bridge: multicast: Propagate br_mc_disabled_update() return
From: Florian Fainelli @ 2019-01-31  1:00 UTC (permalink / raw)
  To: Ido Schimmel
  Cc: netdev@vger.kernel.org, andrew@lunn.ch, vivien.didelot@gmail.com,
	davem@davemloft.net, Jiri Pirko, ilias.apalodimas@linaro.org,
	ivan.khoronzhuk@linaro.org, roopa@cumulusnetworks.com,
	nikolay@cumulusnetworks.com
In-Reply-To: <20190130073656.GA22227@splinter>

On 1/29/19 11:36 PM, Ido Schimmel wrote:
> On Tue, Jan 29, 2019 at 04:55:37PM -0800, Florian Fainelli wrote:
>> Some Ethernet switches might not be able to support disabling multicast
>> flooding globally when e.g: several bridges span the same physical
>> device, propagate the return value of br_mc_disabled_update() such that
>> this propagates correctly to user-space.
>>
>> Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
>> ---
>>  net/bridge/br_multicast.c | 23 ++++++++++++++++-------
>>  1 file changed, 16 insertions(+), 7 deletions(-)
>>
>> diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c
>> index 3aeff0895669..aff5e003d34f 100644
>> --- a/net/bridge/br_multicast.c
>> +++ b/net/bridge/br_multicast.c
>> @@ -813,20 +813,22 @@ static void br_ip6_multicast_port_query_expired(struct timer_list *t)
>>  }
>>  #endif
>>  
>> -static void br_mc_disabled_update(struct net_device *dev, bool value)
>> +static int br_mc_disabled_update(struct net_device *dev, bool value)
>>  {
>>  	struct switchdev_attr attr = {
>>  		.orig_dev = dev,
>>  		.id = SWITCHDEV_ATTR_ID_BRIDGE_MC_DISABLED,
>> -		.flags = SWITCHDEV_F_DEFER,
>> +		.flags = SWITCHDEV_F_DEFER | SWITCHDEV_F_SKIP_EOPNOTSUPP,
> 
> Actually, since the operation is deferred I don't think the return value
> from the driver is ever checked. Can you test it?

You are right, you get a WARN() from switchdev_attr_port_set_now(), but
this does not propagate back to the caller, so you can still create a
bridge device and enslave a device successfully despite getting warnings
on the console.

> 
> I think it would be good to convert the attributes to use the switchdev
> notifier like commit d17d9f5e5143 ("switchdev: Replace port obj add/del
> SDO with a notification") did for objects. Then you can have your
> listener veto the operation in the same context it is happening.

Alright, working on it. Would you do that just for the attr_set, or for
attr_get as well (to be symmetrical)?
-- 
Florian

^ permalink raw reply

* Re: Stack sends oversize UDP packet to the driver
From: Mahesh Bandewar (महेश बंडेवार) @ 2019-01-31  1:00 UTC (permalink / raw)
  To: Michael Chan
  Cc: Daniel Axtens, Netdev, David Miller, Eric Dumazet,
	Willem de Bruijn
In-Reply-To: <CACKFLi=O8fDPbtPeiPcrkSxp3U5EgNzhOsU9yJ3c-cU01p-dhQ@mail.gmail.com>

On Wed, Jan 30, 2019 at 1:07 AM Michael Chan <michael.chan@broadcom.com> wrote:
>
> On Tue, Jan 22, 2019 at 10:29 AM Mahesh Bandewar (महेश बंडेवार)
> <maheshb@google.com> wrote:
>
> >
> > The idea behind the fix is very simple and it is to create a dst-only
> > (unregistered) device with a very low MTU and use it instead of 'lo'
> > while invalidating the dst. This would make it *not* forward packets
> > to driver which might need fragmentation.
> >
>
> We tested the 2 patches many times and including an overnight test.  I
> can confirm that the oversize UDP packets are no longer seen with the
> patches applied.  However, I don't see the blackhole xmit function
> getting called to free the SKBs though.
>
Thanks for the confirmation Michael. The blackhole device mtu is
really small, so I would assume the fragmentation code dropped those
packets before calling the xmit function (in ip_fragment), you could
verify that with icmp counters.

> Thanks.

^ permalink raw reply

* Re: [PATCH net-next] nfp: use struct_size() in kzalloc()
From: Gustavo A. R. Silva @ 2019-01-31  0:57 UTC (permalink / raw)
  To: Jakub Kicinski; +Cc: David S. Miller, oss-drivers, netdev, linux-kernel
In-Reply-To: <20190130165221.243d37f9@cakuba.hsd1.ca.comcast.net>



On 1/30/19 6:52 PM, Jakub Kicinski wrote:
> On Wed, 30 Jan 2019 18:38:59 -0600, Gustavo A. R. Silva wrote:
>> One of the more common cases of allocation size calculations is finding
>> the size of a structure that has a zero-sized array at the end, along
>> with memory for some number of elements for that array. For example:
>>
>> struct foo {
>>     int stuff;
>>     struct boo entry[];
>> };
>>
>> instance = kzalloc(sizeof(struct foo) + count * sizeof(struct boo), GFP_KERNEL);
>>
>> Instead of leaving these open-coded and prone to type mistakes, we can
>> now use the new struct_size() helper:
>>
>> instance = kzalloc(struct_size(instance, entry, count), GFP_KERNEL);
>>
>> This code was detected with the help of Coccinelle.
>>
>> Signed-off-by: Gustavo A. R. Silva <gustavo@embeddedor.com>
> 
> Acked-by: Jakub Kicinski <jakub.kicinski@netronome.com>
> 

Thanks, Jakub.

--
Gustavo

^ permalink raw reply

* [PATCH net-next] tulip: eeprom: use struct_size() in kmalloc()
From: Gustavo A. R. Silva @ 2019-01-31  0:33 UTC (permalink / raw)
  To: David S. Miller; +Cc: netdev, linux-parisc, linux-kernel, Gustavo A. R. Silva

One of the more common cases of allocation size calculations is finding
the size of a structure that has a zero-sized array at the end, along
with memory for some number of elements for that array. For example:

struct foo {
    int stuff;
    struct boo entry[];
};

instance = kmalloc(sizeof(struct foo) + count * sizeof(struct boo), GFP_KERNEL);

Instead of leaving these open-coded and prone to type mistakes, we can
now use the new struct_size() helper:

instance = kmalloc(struct_size(instance, entry, count), GFP_KERNEL);

This code was detected with the help of Coccinelle.

Signed-off-by: Gustavo A. R. Silva <gustavo@embeddedor.com>
---
 drivers/net/ethernet/dec/tulip/eeprom.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/dec/tulip/eeprom.c b/drivers/net/ethernet/dec/tulip/eeprom.c
index 1812f4916917..ba0a69b363f8 100644
--- a/drivers/net/ethernet/dec/tulip/eeprom.c
+++ b/drivers/net/ethernet/dec/tulip/eeprom.c
@@ -224,9 +224,7 @@ void tulip_parse_eeprom(struct net_device *dev)
 		        return;
 		}
 
-		mtable = kmalloc(sizeof(struct mediatable) +
-				 count * sizeof(struct medialeaf),
-				 GFP_KERNEL);
+		mtable = kmalloc(struct_size(mtable, mleaf, count), GFP_KERNEL);
 		if (mtable == NULL)
 			return;				/* Horrible, impossible failure. */
 		last_mediatable = tp->mtable = mtable;
-- 
2.20.1


^ permalink raw reply related

* Re: [PATCH net-next] nfp: use struct_size() in kzalloc()
From: Jakub Kicinski @ 2019-01-31  0:52 UTC (permalink / raw)
  To: Gustavo A. R. Silva; +Cc: David S. Miller, oss-drivers, netdev, linux-kernel
In-Reply-To: <20190131003859.GA28539@embeddedor>

On Wed, 30 Jan 2019 18:38:59 -0600, Gustavo A. R. Silva wrote:
> One of the more common cases of allocation size calculations is finding
> the size of a structure that has a zero-sized array at the end, along
> with memory for some number of elements for that array. For example:
> 
> struct foo {
>     int stuff;
>     struct boo entry[];
> };
> 
> instance = kzalloc(sizeof(struct foo) + count * sizeof(struct boo), GFP_KERNEL);
> 
> Instead of leaving these open-coded and prone to type mistakes, we can
> now use the new struct_size() helper:
> 
> instance = kzalloc(struct_size(instance, entry, count), GFP_KERNEL);
> 
> This code was detected with the help of Coccinelle.
> 
> Signed-off-by: Gustavo A. R. Silva <gustavo@embeddedor.com>

Acked-by: Jakub Kicinski <jakub.kicinski@netronome.com>

^ permalink raw reply

* [PATCH net-next] ipv4: fib: use struct_size() in kzalloc()
From: Gustavo A. R. Silva @ 2019-01-31  0:51 UTC (permalink / raw)
  To: David S. Miller, Alexey Kuznetsov, Hideaki YOSHIFUJI
  Cc: netdev, linux-kernel, Gustavo A. R. Silva

One of the more common cases of allocation size calculations is finding
the size of a structure that has a zero-sized array at the end, along
with memory for some number of elements for that array. For example:

struct foo {
    int stuff;
    struct boo entry[];
};

instance = kzalloc(sizeof(struct foo) + count * sizeof(struct boo), GFP_KERNEL);

Instead of leaving these open-coded and prone to type mistakes, we can
now use the new struct_size() helper:

instance = kzalloc(struct_size(instance, entry, count), GFP_KERNEL);

This code was detected with the help of Coccinelle.

Signed-off-by: Gustavo A. R. Silva <gustavo@embeddedor.com>
---
 net/ipv4/fib_semantics.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index 5022bc63863a..8e185b5a2bf6 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -1072,7 +1072,7 @@ struct fib_info *fib_create_info(struct fib_config *cfg,
 			goto failure;
 	}

-	fi = kzalloc(sizeof(*fi)+nhs*sizeof(struct fib_nh), GFP_KERNEL);
+	fi = kzalloc(struct_size(fi, fib_nh, nhs), GFP_KERNEL);
 	if (!fi)
 		goto failure;
 	fi->fib_metrics = ip_fib_metrics_init(fi->fib_net, cfg->fc_mx,
-- 
2.20.1

^ permalink raw reply related

* [PATCH net-next] xprtrdma: Use struct_size() in kzalloc()
From: Gustavo A. R. Silva @ 2019-01-31  0:46 UTC (permalink / raw)
  To: Trond Myklebust, Anna Schumaker, J. Bruce Fields, Jeff Layton,
	David S. Miller
  Cc: linux-nfs, netdev, linux-kernel, Gustavo A. R. Silva

One of the more common cases of allocation size calculations is finding
the size of a structure that has a zero-sized array at the end, along
with memory for some number of elements for that array. For example:

struct foo {
    int stuff;
    struct boo entry[];
};

instance = kzalloc(sizeof(struct foo) + count * sizeof(struct boo), GFP_KERNEL);

Instead of leaving these open-coded and prone to type mistakes, we can
now use the new struct_size() helper:

instance = kzalloc(struct_size(instance, entry, count), GFP_KERNEL);

This code was detected with the help of Coccinelle.

Signed-off-by: Gustavo A. R. Silva <gustavo@embeddedor.com>
---
 net/sunrpc/xprtrdma/verbs.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
index 4994e75945b8..9e8cf7456840 100644
--- a/net/sunrpc/xprtrdma/verbs.c
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -811,8 +811,7 @@ static struct rpcrdma_sendctx *rpcrdma_sendctx_create(struct rpcrdma_ia *ia)
 {
 	struct rpcrdma_sendctx *sc;
 
-	sc = kzalloc(sizeof(*sc) +
-		     ia->ri_max_send_sges * sizeof(struct ib_sge),
+	sc = kzalloc(struct_size(sc, sc_sges, ia->ri_max_send_sges),
 		     GFP_KERNEL);
 	if (!sc)
 		return NULL;
-- 
2.20.1


^ permalink raw reply related

* Re: [PATCH net-next v2 1/2] net: dsa: mv88e6xxx: Save switch rules
From: Vivien Didelot @ 2019-01-31  0:46 UTC (permalink / raw)
  To: Miquel Raynal
  Cc: Andrew Lunn, Florian Fainelli, David S. Miller, netdev,
	linux-kernel, Thomas Petazzoni, Gregory Clement, Antoine Tenart,
	Maxime Chevallier, Nadav Haklai
In-Reply-To: <20190130104606.31639abb@xps13>

Hi Miquèl,

On Wed, 30 Jan 2019 10:46:06 +0100, Miquel Raynal <miquel.raynal@bootlin.com> wrote:

> > > > Today, there is no S2RAM support for switches. First, I proposed to add
> > > > suspend/resume callbacks to the mv88e6xxx driver - just enough to avoid
> > > > crashing the kernel.  
> > > 
> > > Then i would suggest the mv88e6xxx refuses the suspend. Actually that
> > > probably is the first correct step. We don't have suspend support, so
> > > stop the suspend happening, so preventing the kernel crash.  

Actually can you show me the crash that is happening?

^ permalink raw reply

* [PATCH net-next] nfp: use struct_size() in kzalloc()
From: Gustavo A. R. Silva @ 2019-01-31  0:38 UTC (permalink / raw)
  To: Jakub Kicinski, David S. Miller
  Cc: oss-drivers, netdev, linux-kernel, Gustavo A. R. Silva

One of the more common cases of allocation size calculations is finding
the size of a structure that has a zero-sized array at the end, along
with memory for some number of elements for that array. For example:

struct foo {
    int stuff;
    struct boo entry[];
};

instance = kzalloc(sizeof(struct foo) + count * sizeof(struct boo), GFP_KERNEL);

Instead of leaving these open-coded and prone to type mistakes, we can
now use the new struct_size() helper:

instance = kzalloc(struct_size(instance, entry, count), GFP_KERNEL);

This code was detected with the help of Coccinelle.

Signed-off-by: Gustavo A. R. Silva <gustavo@embeddedor.com>
---
 drivers/net/ethernet/netronome/nfp/nfpcore/nfp_nsp_eth.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/netronome/nfp/nfpcore/nfp_nsp_eth.c b/drivers/net/ethernet/netronome/nfp/nfpcore/nfp_nsp_eth.c
index 802c9224bb32..f6f028fa5db9 100644
--- a/drivers/net/ethernet/netronome/nfp/nfpcore/nfp_nsp_eth.c
+++ b/drivers/net/ethernet/netronome/nfp/nfpcore/nfp_nsp_eth.c
@@ -269,8 +269,7 @@ __nfp_eth_read_ports(struct nfp_cpp *cpp, struct nfp_nsp *nsp)
 		goto err;
 	}

-	table = kzalloc(sizeof(*table) +
-			sizeof(struct nfp_eth_table_port) * cnt, GFP_KERNEL);
+	table = kzalloc(struct_size(table, ports, cnt), GFP_KERNEL);
 	if (!table)
 		goto err;

-- 
2.20.1

^ permalink raw reply related

* Re: [PATCH] tty: Fix WARNING in tty_set_termios
From: shuah @ 2019-01-31  0:35 UTC (permalink / raw)
  To: Johan Hovold
  Cc: Al Viro, linux-wireless, chris, devel, robh, sameo, linux-serial,
	jslaby, santhameena13, kirk, johan.hedberg, arnd, marcel,
	samuel.thibault, m.maya.nakamura, zhongjiang, gregkh, speakup,
	linux-kernel, linux-bluetooth, netdev, nishka.dasgupta_ug18,
	davem, shuah
In-Reply-To: <20190130103227.GR3691@localhost>

On 1/30/19 3:32 AM, Johan Hovold wrote:
> On Mon, Jan 28, 2019 at 02:29:22PM -0700, shuah wrote:
>> On 1/25/19 9:14 PM, Al Viro wrote:
>>> On Fri, Jan 25, 2019 at 04:29:05PM -0700, Shuah Khan wrote:
>>>> tty_set_termios() has the following WARMN_ON which can be triggered with a
>>>> syscall to invoke TIOCGETD __NR_ioctl.
> 
> You meant TIOCSETD here, and in fact its the call which sets the uart
> protocol that triggers the warning.

Right. It is a TIOCSETD.

> 
>>>> WARN_ON(tty->driver->type == TTY_DRIVER_TYPE_PTY &&
>>>>                   tty->driver->subtype == PTY_TYPE_MASTER);
>>>> Reference: https://syzkaller.appspot.com/bug?id=2410d22f1d8e5984217329dd0884b01d99e3e48d
>>>>
>>>> A simple change would have been to print error message instead of WARN_ON.
>>>> However, the callers assume that tty_set_termios() always returns 0 and
>>>> don't check return value. The complete solution is fixing all the callers
>>>> to check error and bail out to fix the WARN_ON.
>>>>
>>>> This fix changes tty_set_termios() to return error and all the callers
>>>> to check error and bail out. The reproducer is used to reproduce the
>>>> problem and verify the fix.
>>>
>>>> --- a/drivers/bluetooth/hci_ldisc.c
>>>> +++ b/drivers/bluetooth/hci_ldisc.c
>>>> @@ -321,6 +321,8 @@ void hci_uart_set_flow_control(struct hci_uart *hu, bool enable)
>>>>    		status = tty_set_termios(tty, &ktermios);
>>>>    		BT_DBG("Disabling hardware flow control: %s",
>>>>    		       status ? "failed" : "success");
>>>> +		if (status)
>>>> +			return;
>>>
>>> Can that ldisc end up set on pty master?  And does it make any sense there?
>>
>> The initial objective of the patch is to prevent the WARN_ON by making
>> the change to return error instead of WARN_ON. However, without changes
>> to places that don't check the return and keep making progress, there
>> will be secondary problems.
>>
>> Without this change to return here, instead of WARN_ON, it will fail
>> with the following NULL pointer dereference at the next thing
>> hci_uart_set_flow_control() attempts.
>>
>> status = tty->driver->ops->tiocmget(tty);
>>
>> kernel: [10140.649783] BUG: unable to handle kernel NULL pointer
> 
> That's a separate issue, which is being fixed:
> 
> 	https://lkml.kernel.org/r/20190130095938.GP3691@localhost
> 

Ah good to know.

>>> IOW, I don't believe that this patch makes any sense.  If anything,
>>> we need to prevent unconditional tty_set_termios() on the path
>>> that *does* lead to calling it for pty.
>>
>> I don't think preventing unconditional tty_set_termios() is enough to
>> prevent secondary problems such as the one above.
>>
>> For example, the following call chain leads to the WARN_ON that was
>> reported. Even if void hci_uart_set_baudrate() prevents the very first
>> tty_set_termios() call, its caller hci_uart_setup() continues with
>> more tty setup. It goes ahead to call driver setup callback. The
>> driver callback goes on to do more setup calling tty_set_termios().
>>
>> WARN_ON call path:
>>    hci_uart_set_baudrate+0x1cc/0x250 drivers/bluetooth/hci_ldisc.c:378
>>    hci_uart_setup+0xa2/0x490 drivers/bluetooth/hci_ldisc.c:401
>>    hci_dev_do_open+0x6b1/0x1920 net/bluetooth/hci_core.c:1423
>>
>> Once this WARN_ON is changed to return error, the following
>> happens, when hci_uart_setup() does driver setup callback.
>>
>> kernel: [10140.649836]  mrvl_setup+0x17/0x80 [hci_uart]
>> kernel: [10140.649840]  hci_uart_setup+0x56/0x160 [hci_uart]
>> kernel: [10140.649850]  hci_dev_do_open+0xe6/0x630 [bluetooth]
>> kernel: [10140.649860]  hci_power_on+0x52/0x220 [bluetooth]
>>
>> I think continuing to catch the invalid condition in tty_set_termios()
>> and preventing progress by checking return value is a straight forward
>> change to avoid secondary problems, and it might be difficult to catch
>> all the cases where it could fail.
> 
> I agree with Al that this change doesn't make much sense. The WARN_ON
> is there to catch any bugs leading to the termios being changed for a
> master side pty. Those should bugs should be fixed, and not worked
> around in order to silence a WARN_ON.
> 
> The problem started with 7721383f4199 ("Bluetooth: hci_uart: Support
> operational speed during setup") which introduced a new way for how
> tty_set_termios() could end up being called for a master pty.
> 

Ah. Thanks for the context.

> As Al hinted at, setting these ldiscs for a master pty really makes no
> sense and perhaps that is what we should prevent unless simply making
> sure they do not call tty_set_termios() is sufficient for the time
> being.
> 

I will take a look to see if not calling tty_set_termios() is enough.

> Finally, note that serdev never operates on a pty, and that this is only
> an issue for (the three) line disciplines.
> 

Thanks for the detailed message explaining the evolution. Now it makes
sense.

-- Shuah

^ permalink raw reply

* [PATCH net-next] cxgb4: smt: use struct_size() in kvzalloc()
From: Gustavo A. R. Silva @ 2019-01-31  0:27 UTC (permalink / raw)
  To: Vishal Kulkarni, David S. Miller
  Cc: netdev, linux-kernel, Gustavo A. R. Silva

One of the more common cases of allocation size calculations is
finding the size of a structure that has a zero-sized array at
the end, along with memory for some number of elements for that
array. For example:

struct foo {
    int stuff;
    struct boo entry[];
};

instance = kvzalloc(sizeof(struct foo) + count * sizeof(struct boo), GFP_KERNEL);

Instead of leaving these open-coded and prone to type mistakes, we can now
use the new struct_size() helper:

instance = kvzalloc(struct_size(instance, entry, count), GFP_KERNEL);

This code was detected with the help of Coccinelle.

Signed-off-by: Gustavo A. R. Silva <gustavo@embeddedor.com>
---
 drivers/net/ethernet/chelsio/cxgb4/smt.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/chelsio/cxgb4/smt.c b/drivers/net/ethernet/chelsio/cxgb4/smt.c
index 7b2207a2a130..eaf1fb74689c 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/smt.c
+++ b/drivers/net/ethernet/chelsio/cxgb4/smt.c
@@ -47,8 +47,7 @@ struct smt_data *t4_init_smt(void)

 	smt_size = SMT_SIZE;

-	s = kvzalloc(sizeof(*s) + smt_size * sizeof(struct smt_entry),
-		     GFP_KERNEL);
+	s = kvzalloc(struct_size(s, smtab, smt_size), GFP_KERNEL);
 	if (!s)
 		return NULL;
 	s->smt_size = smt_size;
-- 
2.20.1

^ permalink raw reply related

* Re: [RFC 00/14] netlink/hierarchical stats
From: Jakub Kicinski @ 2019-01-31  0:24 UTC (permalink / raw)
  To: Roopa Prabhu
  Cc: David Miller, oss-drivers, netdev, Jiří Pírko,
	Florian Fainelli, Andrew Lunn, Michal Kubecek, David Ahern,
	Simon Horman, Brandeburg, Jesse, maciejromanfijalkowski,
	vasundhara-v.volam, Michael Chan, shalomt, Ido Schimmel
In-Reply-To: <CAJieiUgF4tHtqbzwC4g4HDOtss=Q14auyG=7WZXnQHaj6DXSmQ@mail.gmail.com>

On Wed, 30 Jan 2019 14:14:34 -0800, Roopa Prabhu wrote:
> On Mon, Jan 28, 2019 at 3:45 PM Jakub Kicinski wrote:
> > Hi!
> >
> > As I tried to explain in my slides at netconf 2018 we are lacking
> > an expressive, standard API to report device statistics.
> >
> > Networking silicon generally maintains some IEEE 802.3 and/or RMON
> > statistics.  Today those all end up in ethtool -S.  Here is a simple
> > attempt (admittedly very imprecise) of counting how many names driver
> > authors invented for IETF RFC2819 etherStatsPkts512to1023Octets
> > statistics (RX and TX):
> >
> > $ git grep '".*512.*1023.*"' -- drivers/net/ | \
> >     sed -e 's/.*"\(.*\)".*/\1/' | sort | uniq | wc -l
> > 63
> >
> > Interestingly only two drivers in the tree use the name the standard
> > gave us (etherStatsPkts512to1023, modulo case).
> >
> > I set out to working on this set in an attempt to give drivers a way
> > to express clearly to user space standard-compliant counters.
> >
> > Second most common use for custom statistics is per-queue counters.
> > This is where the "hierarchical" part of this set comes in, as
> > groups can be nested, and user space tools can handle the aggregation
> > inside the groups if needed.
> >
> > This set also tries to address the problem of users not knowing if
> > a statistic is reported by hardware or the driver.  Many modern drivers
> > use some prefix in ethtool -S to indicate MAC/PHY stats.  At a quick
> > glance: Netronome uses "mac.", Intel "port." and Mellanox "_phy".
> > In this set, netlink attributes describe whether a group of statistics
> > is RX or TX, maintained by device or driver.
> >
> > The purpose of this patch set is _not_ to replace ethtool -S.  It is
> > an incredibly useful tool, and we will certainly continue using it.
> > However, for standard-based and commonly maintained statistics a more
> > structured API seems warranted.
> >
> > There are two things missing from these patches, which I initially
> > planned to address as well: filtering, and refresh rate control.
> >
> > Filtering doesn't need much explanation, users should be able to request
> > only a subset of statistics (like only SW stats or only given ID).  The
> > bitmap of statistics in each group is there for filtering later on.
> >
> > By refresh control I mean the ability for user space to indicate how
> > "fresh" values it expects.  Sometimes reading the HW counters requires
> > slow register reads or FW communication, in such cases drivers may cache
> > the result.  (Privileged) user space should be able to add a "not older
> > than" timestamp to indicate how fresh statistics it expects.  And vice
> > versa, drivers can then also put the timestamp of when the statistics
> > were last refreshed in the dump for more precise bandwidth estimation.  
> 
> Jakub, Glad to see hw stats in the RTM_*STATS api. I do see you
> mention 'partial' support for ethtool stats. I understand the reason
> you say its partial.
> But while at it, why not also include the ability to have driver
> extensible stats here ? ie make it complete. We have talked about
> making all hw stats available
> via the RTM_*STATS api in the past..., so just want to make sure the
> new HSTATS infra you are adding to the RTM_*STATS api
> covers or at-least makes it possible to include driver extensible
> stats in the future where the driver gets to define the stats id +
> value (This is very useful).
>  It would be nice if you can account for that in this new HSTATS API.

My thinking was that we should leave truly custom/strange stats to
ethtool API which works quite well for that and at the same time be
very accepting of people adding new IDs to HSTAT (only requirement is
basically defining the meaning very clearly).  

For the first stab I looked at two drivers and added all the stats that
were common.

Given this set is identifying statistics by ID - how would we make that
extensible to drivers?  Would we go back to strings or have some
"driver specific" ID space?

Is there any particular type of statistic you'd expect drivers to want
to add?  For NICs I think IEEE/RMON should pretty much cover the
silicon ones, but I don't know much about switches :)

^ permalink raw reply

* [PATCH net-next] cxgb4: sched: use struct_size() in kvzalloc()
From: Gustavo A. R. Silva @ 2019-01-31  0:23 UTC (permalink / raw)
  To: Vishal Kulkarni, David S. Miller
  Cc: netdev, linux-kernel, Gustavo A. R. Silva

One of the more common cases of allocation size calculations is
finding the size of a structure that has a zero-sized array at
the end, along with memory for some number of elements for that
array. For example:

struct foo {
    int stuff;
    struct boo entry[];
};

instance = kvzalloc(sizeof(struct foo) + count * sizeof(struct boo), GFP_KERNEL);

Instead of leaving these open-coded and prone to type mistakes, we can now
use the new struct_size() helper:

instance = kvzalloc(struct_size(instance, entry, count), GFP_KERNEL);

This code was detected with the help of Coccinelle.

Signed-off-by: Gustavo A. R. Silva <gustavo@embeddedor.com>
---
 drivers/net/ethernet/chelsio/cxgb4/sched.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/chelsio/cxgb4/sched.c b/drivers/net/ethernet/chelsio/cxgb4/sched.c
index 52edb688942b..ba6c153ee45c 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/sched.c
+++ b/drivers/net/ethernet/chelsio/cxgb4/sched.c
@@ -478,7 +478,7 @@ struct sched_table *t4_init_sched(unsigned int sched_size)
 	struct sched_table *s;
 	unsigned int i;

-	s = kvzalloc(sizeof(*s) + sched_size * sizeof(struct sched_class), GFP_KERNEL);
+	s = kvzalloc(struct_size(s, tab, sched_size), GFP_KERNEL);
 	if (!s)
 		return NULL;

-- 
2.20.1

^ permalink raw reply related

* Re: [PATCH net-next v8 0/8] devlink: Add configuration parameters support for devlink_port
From: Jakub Kicinski @ 2019-01-30 23:57 UTC (permalink / raw)
  To: Vasundhara Volam; +Cc: davem, michael.chan, jiri, mkubecek, netdev
In-Reply-To: <1548678627-21938-1-git-send-email-vasundhara-v.volam@broadcom.com>

On Mon, 28 Jan 2019 18:00:19 +0530, Vasundhara Volam wrote:
> This patchset adds support for configuration parameters setting through
> devlink_port.  Each device registers supported configuration parameters
> table.
> 
> The user can retrieve data on these parameters by
> "devlink port param show" command and can set new value to a
> parameter by "devlink port param set" command.
> All configuration modes supported by devlink_dev are supported
> by devlink_port also.

Hm, I think we were kind of going somewhere with the ethtool/nl
attribute encapsulation idea.  You seem to have ignored those comments
on v7 and reposted v8 a day after.  

I think we should explore the nesting further.  The only obstacle is
that ethtool netlink conversion is not yet finished, but that's just 
a simple matter of programming.  Do you disagree with that direction?
Please comment.

^ permalink raw reply

* [PATCH bpf-next v5 5/5] selftests: bpf: add test_lwt_ip_encap selftest
From: Peter Oskolkov @ 2019-01-30 23:51 UTC (permalink / raw)
  To: Alexei Starovoitov, Daniel Borkmann, netdev
  Cc: Peter Oskolkov, David Ahern, Peter Oskolkov
In-Reply-To: <20190130235136.136527-1-posk@google.com>

This patch adds a bpf self-test to cover BPF_LWT_ENCAP_IP mode
in bpf_lwt_push_encap.

Covered:
- encapping in LWT_IN and LWT_XMIT
- IPv4 and IPv6

Signed-off-by: Peter Oskolkov <posk@google.com>
Change-Id: I9d0d1003a40c28a41467116f3c32a84730ff39b2
---
 tools/testing/selftests/bpf/Makefile          |   5 +-
 .../testing/selftests/bpf/test_lwt_ip_encap.c |  85 +++++
 .../selftests/bpf/test_lwt_ip_encap.sh        | 311 ++++++++++++++++++
 3 files changed, 399 insertions(+), 2 deletions(-)
 create mode 100644 tools/testing/selftests/bpf/test_lwt_ip_encap.c
 create mode 100755 tools/testing/selftests/bpf/test_lwt_ip_encap.sh

diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile
index 8993e9c8f410..28aa3b3e297e 100644
--- a/tools/testing/selftests/bpf/Makefile
+++ b/tools/testing/selftests/bpf/Makefile
@@ -35,7 +35,7 @@ BPF_OBJ_FILES = \
 	sendmsg4_prog.o sendmsg6_prog.o test_lirc_mode2_kern.o \
 	get_cgroup_id_kern.o socket_cookie_prog.o test_select_reuseport_kern.o \
 	test_skb_cgroup_id_kern.o bpf_flow.o netcnt_prog.o test_xdp_vlan.o \
-	xdp_dummy.o test_map_in_map.o
+	xdp_dummy.o test_map_in_map.o test_lwt_ip_encap.o
 
 # Objects are built with default compilation flags and with sub-register
 # code-gen enabled.
@@ -73,7 +73,8 @@ TEST_PROGS := test_kmod.sh \
 	test_lirc_mode2.sh \
 	test_skb_cgroup_id.sh \
 	test_flow_dissector.sh \
-	test_xdp_vlan.sh
+	test_xdp_vlan.sh \
+	test_lwt_ip_encap.sh
 
 TEST_PROGS_EXTENDED := with_addr.sh \
 	with_tunnels.sh \
diff --git a/tools/testing/selftests/bpf/test_lwt_ip_encap.c b/tools/testing/selftests/bpf/test_lwt_ip_encap.c
new file mode 100644
index 000000000000..c957d6dfe6d7
--- /dev/null
+++ b/tools/testing/selftests/bpf/test_lwt_ip_encap.c
@@ -0,0 +1,85 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <stddef.h>
+#include <string.h>
+#include <linux/bpf.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+#include "bpf_helpers.h"
+#include "bpf_endian.h"
+
+struct grehdr {
+	__be16 flags;
+	__be16 protocol;
+};
+
+SEC("encap_gre")
+int bpf_lwt_encap_gre(struct __sk_buff *skb)
+{
+	struct encap_hdr {
+		struct iphdr iph;
+		struct grehdr greh;
+	} hdr;
+	int err;
+
+	memset(&hdr, 0, sizeof(struct encap_hdr));
+
+	hdr.iph.ihl = 5;
+	hdr.iph.version = 4;
+	hdr.iph.ttl = 0x40;
+	hdr.iph.protocol = 47;  /* IPPROTO_GRE */
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+	hdr.iph.saddr = 0x640110ac;  /* 172.16.1.100 */
+	hdr.iph.daddr = 0x641010ac;  /* 172.16.16.100 */
+#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+	hdr.iph.saddr = 0xac100164;  /* 172.16.1.100 */
+	hdr.iph.daddr = 0xac101064;  /* 172.16.16.100 */
+#else
+#error "Fix your compiler's __BYTE_ORDER__?!"
+#endif
+	hdr.iph.tot_len = bpf_htons(skb->len + sizeof(struct encap_hdr));
+
+	hdr.greh.protocol = skb->protocol;
+
+	err = bpf_lwt_push_encap(skb, BPF_LWT_ENCAP_IP, &hdr,
+				 sizeof(struct encap_hdr));
+	if (err)
+		return BPF_DROP;
+
+	return BPF_LWT_REROUTE;
+}
+
+SEC("encap_gre6")
+int bpf_lwt_encap_gre6(struct __sk_buff *skb)
+{
+	struct encap_hdr {
+		struct ipv6hdr ip6hdr;
+		struct grehdr greh;
+	} hdr;
+	int err;
+
+	memset(&hdr, 0, sizeof(struct encap_hdr));
+
+	hdr.ip6hdr.version = 6;
+	hdr.ip6hdr.payload_len = bpf_htons(skb->len + sizeof(struct grehdr));
+	hdr.ip6hdr.nexthdr = 47;  /* IPPROTO_GRE */
+	hdr.ip6hdr.hop_limit = 0x40;
+	/* fb01::1 */
+	hdr.ip6hdr.saddr.s6_addr[0] = 0xfb;
+	hdr.ip6hdr.saddr.s6_addr[1] = 1;
+	hdr.ip6hdr.saddr.s6_addr[15] = 1;
+	/* fb10::1 */
+	hdr.ip6hdr.daddr.s6_addr[0] = 0xfb;
+	hdr.ip6hdr.daddr.s6_addr[1] = 0x10;
+	hdr.ip6hdr.daddr.s6_addr[15] = 1;
+
+	hdr.greh.protocol = skb->protocol;
+
+	err = bpf_lwt_push_encap(skb, BPF_LWT_ENCAP_IP, &hdr,
+				 sizeof(struct encap_hdr));
+	if (err)
+		return BPF_DROP;
+
+	return BPF_LWT_REROUTE;
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/test_lwt_ip_encap.sh b/tools/testing/selftests/bpf/test_lwt_ip_encap.sh
new file mode 100755
index 000000000000..4ca714e23ab0
--- /dev/null
+++ b/tools/testing/selftests/bpf/test_lwt_ip_encap.sh
@@ -0,0 +1,311 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+#
+# Setup/topology:
+#
+#    NS1             NS2             NS3
+#   veth1 <---> veth2   veth3 <---> veth4 (the top route)
+#   veth5 <---> veth6   veth7 <---> veth8 (the bottom route)
+#
+#   each vethN gets IPv[4|6]_N address
+#
+#   IPv*_SRC = IPv*_1
+#   IPv*_DST = IPv*_4
+#
+#   all tests test pings from IPv*_SRC to IPv*_DST
+#
+#   by default, routes are configured to allow packets to go
+#   IP*_1 <=> IP*_2 <=> IP*_3 <=> IP*_4 (the top route)
+#
+#   a GRE device is installed in NS3 with IPv*_GRE, and
+#   NS1/NS2 are configured to route packets to IPv*_GRE via IP*_8
+#   (the bottom route)
+#
+# Tests:
+#
+#   1. routes NS2->IPv*_DST are brought down, so the only way a ping
+#      from IP*_SRC to IP*_DST can work is via IPv*_GRE
+#
+#   2a. in an egress test, a bpf LWT_XMIT program is installed on veth1
+#       that encaps the packets with an IP/GRE header to route to IPv*_GRE
+#
+#       ping: SRC->[encap at veth1:egress]->GRE:decap->DST
+#       ping replies go DST->SRC directly
+#
+#   2b. in an ingress test, a bpf LWT_IN program is installed on veth2
+#       that encaps the packets with an IP/GRE header to route to IPv*_GRE
+#
+#       ping: SRC->[encap at veth2:ingress]->GRE:decap->DST
+#       ping replies go DST->SRC directly
+
+set -e  # exit on error
+
+if [[ $EUID -ne 0 ]]; then
+	echo "This script must be run as root"
+	echo "FAIL"
+	exit 1
+fi
+
+readonly NS1="ns1-$(mktemp -u XXXXXX)"
+readonly NS2="ns2-$(mktemp -u XXXXXX)"
+readonly NS3="ns3-$(mktemp -u XXXXXX)"
+
+readonly IPv4_1="172.16.1.100"
+readonly IPv4_2="172.16.2.100"
+readonly IPv4_3="172.16.3.100"
+readonly IPv4_4="172.16.4.100"
+readonly IPv4_5="172.16.5.100"
+readonly IPv4_6="172.16.6.100"
+readonly IPv4_7="172.16.7.100"
+readonly IPv4_8="172.16.8.100"
+readonly IPv4_GRE="172.16.16.100"
+
+readonly IPv4_SRC=$IPv4_1
+readonly IPv4_DST=$IPv4_4
+
+readonly IPv6_1="fb01::1"
+readonly IPv6_2="fb02::1"
+readonly IPv6_3="fb03::1"
+readonly IPv6_4="fb04::1"
+readonly IPv6_5="fb05::1"
+readonly IPv6_6="fb06::1"
+readonly IPv6_7="fb07::1"
+readonly IPv6_8="fb08::1"
+readonly IPv6_GRE="fb10::1"
+
+readonly IPv6_SRC=$IPv6_1
+readonly IPv6_DST=$IPv6_4
+
+setup() {
+set -e  # exit on error
+	# create devices and namespaces
+	ip netns add "${NS1}"
+	ip netns add "${NS2}"
+	ip netns add "${NS3}"
+
+	ip link add veth1 type veth peer name veth2
+	ip link add veth3 type veth peer name veth4
+	ip link add veth5 type veth peer name veth6
+	ip link add veth7 type veth peer name veth8
+
+	ip netns exec ${NS2} sysctl -wq net.ipv4.ip_forward=1
+	ip netns exec ${NS2} sysctl -wq net.ipv6.conf.all.forwarding=1
+
+	ip link set veth1 netns ${NS1}
+	ip link set veth2 netns ${NS2}
+	ip link set veth3 netns ${NS2}
+	ip link set veth4 netns ${NS3}
+	ip link set veth5 netns ${NS1}
+	ip link set veth6 netns ${NS2}
+	ip link set veth7 netns ${NS2}
+	ip link set veth8 netns ${NS3}
+
+	# configure addesses: the top route (1-2-3-4)
+	ip -netns ${NS1}    addr add ${IPv4_1}/24  dev veth1
+	ip -netns ${NS2}    addr add ${IPv4_2}/24  dev veth2
+	ip -netns ${NS2}    addr add ${IPv4_3}/24  dev veth3
+	ip -netns ${NS3}    addr add ${IPv4_4}/24  dev veth4
+	ip -netns ${NS1} -6 addr add ${IPv6_1}/128 nodad dev veth1
+	ip -netns ${NS2} -6 addr add ${IPv6_2}/128 nodad dev veth2
+	ip -netns ${NS2} -6 addr add ${IPv6_3}/128 nodad dev veth3
+	ip -netns ${NS3} -6 addr add ${IPv6_4}/128 nodad dev veth4
+
+	# configure addresses: the bottom route (5-6-7-8)
+	ip -netns ${NS1}    addr add ${IPv4_5}/24  dev veth5
+	ip -netns ${NS2}    addr add ${IPv4_6}/24  dev veth6
+	ip -netns ${NS2}    addr add ${IPv4_7}/24  dev veth7
+	ip -netns ${NS3}    addr add ${IPv4_8}/24  dev veth8
+	ip -netns ${NS1} -6 addr add ${IPv6_5}/128 nodad dev veth5
+	ip -netns ${NS2} -6 addr add ${IPv6_6}/128 nodad dev veth6
+	ip -netns ${NS2} -6 addr add ${IPv6_7}/128 nodad dev veth7
+	ip -netns ${NS3} -6 addr add ${IPv6_8}/128 nodad dev veth8
+
+
+	ip -netns ${NS1} link set dev veth1 up
+	ip -netns ${NS2} link set dev veth2 up
+	ip -netns ${NS2} link set dev veth3 up
+	ip -netns ${NS3} link set dev veth4 up
+	ip -netns ${NS1} link set dev veth5 up
+	ip -netns ${NS2} link set dev veth6 up
+	ip -netns ${NS2} link set dev veth7 up
+	ip -netns ${NS3} link set dev veth8 up
+
+	# configure routes: IP*_SRC -> veth1/IP*_2 (= top route) default;
+	# the bottom route to specific bottom addresses
+
+	# NS1
+	# top route
+	ip -netns ${NS1}    route add ${IPv4_2}/32  dev veth1
+	ip -netns ${NS1}    route add default dev veth1 via ${IPv4_2}  # go top by default
+	ip -netns ${NS1} -6 route add ${IPv6_2}/128 dev veth1
+	ip -netns ${NS1} -6 route add default dev veth1 via ${IPv6_2}  # go top by default
+	# bottom route
+	ip -netns ${NS1}    route add ${IPv4_6}/32  dev veth5
+	ip -netns ${NS1}    route add ${IPv4_7}/32  dev veth5 via ${IPv4_6}
+	ip -netns ${NS1}    route add ${IPv4_8}/32  dev veth5 via ${IPv4_6}
+	ip -netns ${NS1} -6 route add ${IPv6_6}/128 dev veth5
+	ip -netns ${NS1} -6 route add ${IPv6_7}/128 dev veth5 via ${IPv6_6}
+	ip -netns ${NS1} -6 route add ${IPv6_8}/128 dev veth5 via ${IPv6_6}
+
+	# NS2
+	# top route
+	ip -netns ${NS2}    route add ${IPv4_1}/32  dev veth2
+	ip -netns ${NS2}    route add ${IPv4_4}/32  dev veth3
+	ip -netns ${NS2} -6 route add ${IPv6_1}/128 dev veth2
+	ip -netns ${NS2} -6 route add ${IPv6_4}/128 dev veth3
+	# bottom route
+	ip -netns ${NS2}    route add ${IPv4_5}/32  dev veth6
+	ip -netns ${NS2}    route add ${IPv4_8}/32  dev veth7
+	ip -netns ${NS2} -6 route add ${IPv6_5}/128 dev veth6
+	ip -netns ${NS2} -6 route add ${IPv6_8}/128 dev veth7
+
+	# NS3
+	# top route
+	ip -netns ${NS3}    route add ${IPv4_3}/32  dev veth4
+	ip -netns ${NS3}    route add ${IPv4_1}/32  dev veth4 via ${IPv4_3}
+	ip -netns ${NS3}    route add ${IPv4_2}/32  dev veth4 via ${IPv4_3}
+	ip -netns ${NS3} -6 route add ${IPv6_3}/128 dev veth4
+	ip -netns ${NS3} -6 route add ${IPv6_1}/128 dev veth4 via ${IPv6_3}
+	ip -netns ${NS3} -6 route add ${IPv6_2}/128 dev veth4 via ${IPv6_3}
+	# bottom route
+	ip -netns ${NS3}    route add ${IPv4_7}/32  dev veth8
+	ip -netns ${NS3}    route add ${IPv4_5}/32  dev veth8 via ${IPv4_7}
+	ip -netns ${NS3}    route add ${IPv4_6}/32  dev veth8 via ${IPv4_7}
+	ip -netns ${NS3} -6 route add ${IPv6_7}/128 dev veth8
+	ip -netns ${NS3} -6 route add ${IPv6_5}/128 dev veth8 via ${IPv6_7}
+	ip -netns ${NS3} -6 route add ${IPv6_6}/128 dev veth8 via ${IPv6_7}
+
+	# configure IPv4 GRE device in NS3, and a route to it via the "bottom" route
+	ip -netns ${NS3} tunnel add gre_dev mode gre remote ${IPv4_1} local ${IPv4_GRE} ttl 255
+	ip -netns ${NS3} link set gre_dev up
+	ip -netns ${NS3} addr add ${IPv4_GRE} dev gre_dev
+	ip -netns ${NS1} route add ${IPv4_GRE}/32 dev veth5 via ${IPv4_6}
+	ip -netns ${NS2} route add ${IPv4_GRE}/32 dev veth7 via ${IPv4_8}
+
+
+	# configure IPv6 GRE device in NS3, and a route to it via the "bottom" route
+	ip -netns ${NS3} -6 tunnel add name gre6_dev mode ip6gre remote ${IPv6_1} local ${IPv6_GRE} ttl 255
+	ip -netns ${NS3} link set gre6_dev up
+	ip -netns ${NS3} -6 addr add ${IPv6_GRE} nodad dev gre6_dev
+	ip -netns ${NS1} -6 route add ${IPv6_GRE}/128 dev veth5 via ${IPv6_6}
+	ip -netns ${NS2} -6 route add ${IPv6_GRE}/128 dev veth7 via ${IPv6_8}
+
+	# rp_filter gets confused by what these tests are doing, so disable it
+	ip netns exec ${NS1} sysctl -wq net.ipv4.conf.all.rp_filter=0
+	ip netns exec ${NS2} sysctl -wq net.ipv4.conf.all.rp_filter=0
+	ip netns exec ${NS3} sysctl -wq net.ipv4.conf.all.rp_filter=0
+}
+
+cleanup() {
+	ip netns del ${NS1} 2> /dev/null
+	ip netns del ${NS2} 2> /dev/null
+	ip netns del ${NS3} 2> /dev/null
+}
+
+trap cleanup EXIT
+
+test_ping() {
+	local readonly PROTO=$1
+	local readonly EXPECTED=$2
+	local RET=0
+
+	set +e
+	if [ "${PROTO}" == "IPv4" ] ; then
+		ip netns exec ${NS1} ping  -c 1 -W 1 -I ${IPv4_SRC} ${IPv4_DST} 2>&1 > /dev/null
+		RET=$?
+	elif [ "${PROTO}" == "IPv6" ] ; then
+		ip netns exec ${NS1} ping6 -c 1 -W 6 -I ${IPv6_SRC} ${IPv6_DST} 2>&1 > /dev/null
+		RET=$?
+	else
+		echo "test_ping: unknown PROTO: ${PROTO}"
+		exit 1
+	fi
+	set -e
+
+	if [ "0" != "${RET}" ]; then
+		RET=1
+	fi
+
+	if [ "${EXPECTED}" != "${RET}" ] ; then
+		echo "FAIL: test_ping: ${RET}"
+		exit 1
+	fi
+}
+
+test_egress() {
+	local readonly ENCAP=$1
+	echo "starting egress ${ENCAP} encap test"
+	setup
+
+	# need to wait a bit for IPv6 to autoconf, otherwise
+	# ping6 sometimes fails with "unable to bind to address"
+
+	# by default, pings work
+	test_ping IPv4 0
+	test_ping IPv6 0
+
+	# remove NS2->DST routes, ping fails
+	ip -netns ${NS2}    route del ${IPv4_DST}/32  dev veth3
+	ip -netns ${NS2} -6 route del ${IPv6_DST}/128 dev veth3
+	test_ping IPv4 1
+	test_ping IPv6 1
+
+	# install replacement routes (LWT/eBPF), pings succeed
+	if [ "${ENCAP}" == "IPv4" ] ; then
+		ip -netns ${NS1} route add ${IPv4_DST} encap bpf xmit obj test_lwt_ip_encap.o sec encap_gre dev veth1
+		ip -netns ${NS1} -6 route add ${IPv6_DST} encap bpf xmit obj test_lwt_ip_encap.o sec encap_gre dev veth1
+	elif [ "${ENCAP}" == "IPv6" ] ; then
+		ip -netns ${NS1} route add ${IPv4_DST} encap bpf xmit obj test_lwt_ip_encap.o sec encap_gre6 dev veth1
+		ip -netns ${NS1} -6 route add ${IPv6_DST} encap bpf xmit obj test_lwt_ip_encap.o sec encap_gre6 dev veth1
+	else
+		echo "FAIL: unknown encap ${ENCAP}"
+	fi
+	test_ping IPv4 0
+	test_ping IPv6 0
+
+	cleanup
+	echo "PASS"
+}
+
+test_ingress() {
+	local readonly ENCAP=$1
+	echo "starting ingress ${ENCAP} encap test"
+	setup
+
+	# need to wait a bit for IPv6 to autoconf, otherwise
+	# ping6 sometimes fails with "unable to bind to address"
+
+	# by default, pings work
+	test_ping IPv4 0
+	test_ping IPv6 0
+
+	# remove NS2->DST routes, pings fail
+	ip -netns ${NS2}    route del ${IPv4_DST}/32  dev veth3
+	ip -netns ${NS2} -6 route del ${IPv6_DST}/128 dev veth3
+	test_ping IPv4 1
+	test_ping IPv6 1
+
+	# install replacement routes (LWT/eBPF), pings succeed
+	if [ "${ENCAP}" == "IPv4" ] ; then
+		ip -netns ${NS2} route add ${IPv4_DST} encap bpf in obj test_lwt_ip_encap.o sec encap_gre dev veth2
+		ip -netns ${NS2} -6 route add ${IPv6_DST} encap bpf in obj test_lwt_ip_encap.o sec encap_gre dev veth2
+	elif [ "${ENCAP}" == "IPv6" ] ; then
+		ip -netns ${NS2} route add ${IPv4_DST} encap bpf in obj test_lwt_ip_encap.o sec encap_gre6 dev veth2
+		ip -netns ${NS2} -6 route add ${IPv6_DST} encap bpf in obj test_lwt_ip_encap.o sec encap_gre6 dev veth2
+	else
+		echo "FAIL: unknown encap ${ENCAP}"
+	fi
+	test_ping IPv4 0
+	test_ping IPv6 0
+
+	cleanup
+	echo "PASS"
+}
+
+test_egress IPv4
+test_egress IPv6
+
+test_ingress IPv4
+test_ingress IPv6
+
+echo "all tests passed"
-- 
2.20.1.495.gaa96b0ce6b-goog


^ permalink raw reply related

* [PATCH bpf-next v5 4/5] bpf: sync <kdir>/<uapi>/bpf.h with tools/<uapi>/bpf.h
From: Peter Oskolkov @ 2019-01-30 23:51 UTC (permalink / raw)
  To: Alexei Starovoitov, Daniel Borkmann, netdev
  Cc: Peter Oskolkov, David Ahern, Peter Oskolkov
In-Reply-To: <20190130235136.136527-1-posk@google.com>

This patch copies changes in bpf.h done by a previous patch
in this patchset from the kernel uapi include dir into tools
uapi include dir.

Signed-off-by: Peter Oskolkov <posk@google.com>
---
 tools/include/uapi/linux/bpf.h | 23 +++++++++++++++++++++--
 1 file changed, 21 insertions(+), 2 deletions(-)

diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 60b99b730a41..911c15585fab 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -2015,6 +2015,16 @@ union bpf_attr {
  *			Only works if *skb* contains an IPv6 packet. Insert a
  *			Segment Routing Header (**struct ipv6_sr_hdr**) inside
  *			the IPv6 header.
+ *		**BPF_LWT_ENCAP_IP**
+ *			IP encapsulation (GRE/GUE/IPIP/etc). The outer header
+ *			must be IPv4 or IPv6, followed by zero or more
+ *			additional headers, up to LWT_BPF_MAX_HEADROOM total
+ *			bytes in all prepended headers.
+ *
+ *		BPF_LWT_ENCAP_SEG6*** types can be called by bpf programs of
+ *		type BPF_PROG_TYPE_LWT_IN; BPF_LWT_ENCAP_IP type can be called
+ *		by bpf programs of types BPF_PROG_TYPE_LWT_IN and
+ *		BPF_PROG_TYPE_LWT_XMIT.
  *
  * 		A call to this helper is susceptible to change the underlaying
  * 		packet buffer. Therefore, at load time, all checks on pointers
@@ -2495,7 +2505,8 @@ enum bpf_hdr_start_off {
 /* Encapsulation type for BPF_FUNC_lwt_push_encap helper. */
 enum bpf_lwt_encap_mode {
 	BPF_LWT_ENCAP_SEG6,
-	BPF_LWT_ENCAP_SEG6_INLINE
+	BPF_LWT_ENCAP_SEG6_INLINE,
+	BPF_LWT_ENCAP_IP,
 };
 
 #define __bpf_md_ptr(type, name)	\
@@ -2583,7 +2594,15 @@ enum bpf_ret_code {
 	BPF_DROP = 2,
 	/* 3-6 reserved */
 	BPF_REDIRECT = 7,
-	/* >127 are reserved for prog type specific return codes */
+	/* >127 are reserved for prog type specific return codes.
+	 *
+	 * BPF_LWT_REROUTE: used by BPF_PROG_TYPE_LWT_IN and
+	 *    BPF_PROG_TYPE_LWT_XMIT to indicate that skb had been
+	 *    changed and should be routed based on its new L3 header.
+	 *    (This is an L3 redirect, as opposed to L2 redirect
+	 *    represented by BPF_REDIRECT above).
+	 */
+	BPF_LWT_REROUTE = 128,
 };
 
 struct bpf_sock {
-- 
2.20.1.495.gaa96b0ce6b-goog


^ permalink raw reply related

* [PATCH bpf-next v5 3/5] bpf: add handling of BPF_LWT_REROUTE to lwt_bpf.c
From: Peter Oskolkov @ 2019-01-30 23:51 UTC (permalink / raw)
  To: Alexei Starovoitov, Daniel Borkmann, netdev
  Cc: Peter Oskolkov, David Ahern, Peter Oskolkov
In-Reply-To: <20190130235136.136527-1-posk@google.com>

This patch builds on top of the previous patch in the patchset,
which added BPF_LWT_ENCAP_IP mode to bpf_lwt_push_encap. As the
encapping can result in the skb needing to go via a different
interface/route/dst, bpf programs can indicate this by returning
BPF_LWT_REROUTE, which triggers a new route lookup for the skb.

Signed-off-by: Peter Oskolkov <posk@google.com>
---
 net/core/lwt_bpf.c | 125 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 125 insertions(+)

diff --git a/net/core/lwt_bpf.c b/net/core/lwt_bpf.c
index 6a6e9acab73d..20581567f84a 100644
--- a/net/core/lwt_bpf.c
+++ b/net/core/lwt_bpf.c
@@ -16,6 +16,7 @@
 #include <linux/types.h>
 #include <linux/bpf.h>
 #include <net/lwtunnel.h>
+#include <net/ip6_route.h>
 
 struct bpf_lwt_prog {
 	struct bpf_prog *prog;
@@ -55,6 +56,7 @@ static int run_lwt_bpf(struct sk_buff *skb, struct bpf_lwt_prog *lwt,
 
 	switch (ret) {
 	case BPF_OK:
+	case BPF_LWT_REROUTE:
 		break;
 
 	case BPF_REDIRECT:
@@ -87,6 +89,32 @@ static int run_lwt_bpf(struct sk_buff *skb, struct bpf_lwt_prog *lwt,
 	return ret;
 }
 
+static int bpf_lwt_input_reroute(struct sk_buff *skb)
+{
+	int err = -EINVAL;
+
+	if (skb->protocol == htons(ETH_P_IP)) {
+		struct iphdr *iph = ip_hdr(skb);
+
+		err = ip_route_input_noref(skb, iph->daddr, iph->saddr,
+					   iph->tos, skb_dst(skb)->dev);
+	} else if (skb->protocol == htons(ETH_P_IPV6)) {
+		ip6_route_input(skb);
+		err = skb_dst(skb)->error;
+	} else {
+		pr_warn_once("BPF_LWT_REROUTE input: unsupported proto %d\n",
+			     skb->protocol);
+	}
+
+	if (err)
+		goto err;
+	return dst_input(skb);
+
+err:
+	kfree_skb(skb);
+	return err;
+}
+
 static int bpf_input(struct sk_buff *skb)
 {
 	struct dst_entry *dst = skb_dst(skb);
@@ -98,6 +126,8 @@ static int bpf_input(struct sk_buff *skb)
 		ret = run_lwt_bpf(skb, &bpf->in, dst, NO_REDIRECT);
 		if (ret < 0)
 			return ret;
+		if (ret == BPF_LWT_REROUTE)
+			return bpf_lwt_input_reroute(skb);
 	}
 
 	if (unlikely(!dst->lwtstate->orig_input)) {
@@ -147,6 +177,90 @@ static int xmit_check_hhlen(struct sk_buff *skb)
 	return 0;
 }
 
+static int bpf_lwt_xmit_reroute(struct sk_buff *skb)
+{
+	struct net_device *l3mdev = l3mdev_master_dev_rcu(skb_dst(skb)->dev);
+	int oif = l3mdev ? l3mdev->ifindex : 0;
+	struct dst_entry *dst = NULL;
+	struct sock *sk;
+	struct net *net;
+	bool ipv4;
+	int err;
+
+	if (skb->protocol == htons(ETH_P_IP)) {
+		ipv4 = true;
+	} else if (skb->protocol == htons(ETH_P_IPV6)) {
+		ipv4 = false;
+	} else {
+		pr_warn_once("BPF_LWT_REROUTE xmit: unsupported proto %d\n",
+			     skb->protocol);
+		return -EINVAL;
+	}
+
+	sk = sk_to_full_sk(skb->sk);
+	if (sk) {
+		if (sk->sk_bound_dev_if)
+			oif = sk->sk_bound_dev_if;
+		net = sock_net(sk);
+	} else {
+		net = dev_net(skb_dst(skb)->dev);
+	}
+
+	if (ipv4) {
+		struct iphdr *iph = ip_hdr(skb);
+		struct flowi4 fl4 = {0};
+		struct rtable *rt;
+
+		fl4.flowi4_oif = oif;
+		fl4.flowi4_mark = skb->mark;
+		fl4.flowi4_uid = sock_net_uid(net, sk);
+		fl4.flowi4_tos = RT_TOS(iph->tos);
+		fl4.flowi4_flags = FLOWI_FLAG_ANYSRC;
+		fl4.flowi4_proto = iph->protocol;
+		fl4.daddr = iph->daddr;
+		fl4.saddr = iph->saddr;
+
+		rt = ip_route_output_key(net, &fl4);
+		if (IS_ERR(rt) || rt->dst.error)
+			return -EINVAL;
+		dst = &rt->dst;
+	} else {
+		struct ipv6hdr *iph6 = ipv6_hdr(skb);
+		struct flowi6 fl6 = {0};
+
+		fl6.flowi6_oif = oif;
+		fl6.flowi6_mark = skb->mark;
+		fl6.flowi6_uid = sock_net_uid(net, sk);
+		fl6.flowlabel = ip6_flowinfo(iph6);
+		fl6.flowi6_proto = iph6->nexthdr;
+		fl6.daddr = iph6->daddr;
+		fl6.saddr = iph6->saddr;
+
+		dst = ip6_route_output(net, skb->sk, &fl6);
+		if (IS_ERR(dst) || dst->error)
+			return -EINVAL;
+	}
+
+	/* Although skb header was reserved in bpf_lwt_push_ip_encap(), it
+	 * was done for the previous dst, so we are doing it here again, in
+	 * case the new dst needs much more space. The call below is a noop
+	 * if there is enough header space in skb.
+	 */
+	err = skb_cow_head(skb, LL_RESERVED_SPACE(dst->dev));
+	if (unlikely(err))
+		return err;
+
+	skb_dst_drop(skb);
+	skb_dst_set(skb, dst);
+
+	err = dst_output(dev_net(skb_dst(skb)->dev), skb->sk, skb);
+	if (unlikely(err))
+		return err;
+
+	/* ip[6]_finish_output2 understand LWTUNNEL_XMIT_DONE */
+	return LWTUNNEL_XMIT_DONE;
+}
+
 static int bpf_xmit(struct sk_buff *skb)
 {
 	struct dst_entry *dst = skb_dst(skb);
@@ -154,11 +268,20 @@ static int bpf_xmit(struct sk_buff *skb)
 
 	bpf = bpf_lwt_lwtunnel(dst->lwtstate);
 	if (bpf->xmit.prog) {
+		__be16 proto = skb->protocol;
 		int ret;
 
 		ret = run_lwt_bpf(skb, &bpf->xmit, dst, CAN_REDIRECT);
 		switch (ret) {
 		case BPF_OK:
+			/* If the header changed, e.g. via bpf_lwt_push_encap,
+			 * BPF_LWT_REROUTE below should have been used if the
+			 * protocol was also changed.
+			 */
+			if (skb->protocol != proto) {
+				kfree_skb(skb);
+				return -EINVAL;
+			}
 			/* If the header was expanded, headroom might be too
 			 * small for L2 header to come, expand as needed.
 			 */
@@ -169,6 +292,8 @@ static int bpf_xmit(struct sk_buff *skb)
 			return LWTUNNEL_XMIT_CONTINUE;
 		case BPF_REDIRECT:
 			return LWTUNNEL_XMIT_DONE;
+		case BPF_LWT_REROUTE:
+			return bpf_lwt_xmit_reroute(skb);
 		default:
 			return ret;
 		}
-- 
2.20.1.495.gaa96b0ce6b-goog


^ permalink raw reply related

* [PATCH bpf-next v5 2/5] bpf: implement BPF_LWT_ENCAP_IP mode in bpf_lwt_push_encap
From: Peter Oskolkov @ 2019-01-30 23:51 UTC (permalink / raw)
  To: Alexei Starovoitov, Daniel Borkmann, netdev
  Cc: Peter Oskolkov, David Ahern, Peter Oskolkov
In-Reply-To: <20190130235136.136527-1-posk@google.com>

This patch implements BPF_LWT_ENCAP_IP mode in bpf_lwt_push_encap
BPF helper. It enables BPF programs (specifically, BPF_PROG_TYPE_LWT_IN
and BPF_PROG_TYPE_LWT_XMIT prog types) to add IP encapsulation headers
to packets (e.g. IP/GRE, GUE, IPIP).

This is useful when thousands of different short-lived flows should be
encapped, each with different and dynamically determined destination.
Although lwtunnels can be used in some of these scenarios, the ability
to dynamically generate encap headers adds more flexibility, e.g.
when routing depends on the state of the host (reflected in global bpf
maps).

Signed-off-by: Peter Oskolkov <posk@google.com>
---
 include/net/lwtunnel.h |  3 +++
 net/core/filter.c      |  3 ++-
 net/core/lwt_bpf.c     | 59 ++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 64 insertions(+), 1 deletion(-)

diff --git a/include/net/lwtunnel.h b/include/net/lwtunnel.h
index 33fd9ba7e0e5..f0973eca8036 100644
--- a/include/net/lwtunnel.h
+++ b/include/net/lwtunnel.h
@@ -126,6 +126,8 @@ int lwtunnel_cmp_encap(struct lwtunnel_state *a, struct lwtunnel_state *b);
 int lwtunnel_output(struct net *net, struct sock *sk, struct sk_buff *skb);
 int lwtunnel_input(struct sk_buff *skb);
 int lwtunnel_xmit(struct sk_buff *skb);
+int bpf_lwt_push_ip_encap(struct sk_buff *skb, void *hdr, u32 len,
+			  bool ingress);
 
 static inline void lwtunnel_set_redirect(struct dst_entry *dst)
 {
@@ -138,6 +140,7 @@ static inline void lwtunnel_set_redirect(struct dst_entry *dst)
 		dst->input = lwtunnel_input;
 	}
 }
+
 #else
 
 static inline void lwtstate_free(struct lwtunnel_state *lws)
diff --git a/net/core/filter.c b/net/core/filter.c
index 27d3fbe4b77b..de6bd4b4e0a3 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -73,6 +73,7 @@
 #include <linux/seg6_local.h>
 #include <net/seg6.h>
 #include <net/seg6_local.h>
+#include <net/lwtunnel.h>
 
 /**
  *	sk_filter_trim_cap - run a packet through a socket filter
@@ -4804,7 +4805,7 @@ static int bpf_push_seg6_encap(struct sk_buff *skb, u32 type, void *hdr, u32 len
 static int bpf_push_ip_encap(struct sk_buff *skb, void *hdr, u32 len,
 			     bool ingress)
 {
-	return -EINVAL;  /* Implemented in the next patch. */
+	return bpf_lwt_push_ip_encap(skb, hdr, len, ingress);
 }
 
 BPF_CALL_4(bpf_lwt_in_push_encap, struct sk_buff *, skb, u32, type, void *, hdr,
diff --git a/net/core/lwt_bpf.c b/net/core/lwt_bpf.c
index a648568c5e8f..6a6e9acab73d 100644
--- a/net/core/lwt_bpf.c
+++ b/net/core/lwt_bpf.c
@@ -390,6 +390,65 @@ static const struct lwtunnel_encap_ops bpf_encap_ops = {
 	.owner		= THIS_MODULE,
 };
 
+int bpf_lwt_push_ip_encap(struct sk_buff *skb, void *hdr, u32 len, bool ingress)
+{
+	struct iphdr *iph;
+	bool ipv4;
+	int err;
+
+	if (unlikely(len < sizeof(struct iphdr) || len > LWT_BPF_MAX_HEADROOM))
+		return -EINVAL;
+
+	/* validate protocol and length */
+	iph = (struct iphdr *)hdr;
+	if (iph->version == 4) {
+		ipv4 = true;
+		if (unlikely(len < iph->ihl * 4))
+			return -EINVAL;
+	} else if (iph->version == 6) {
+		ipv4 = false;
+		if (unlikely(len < sizeof(struct ipv6hdr)))
+			return -EINVAL;
+	} else {
+		return -EINVAL;
+	}
+
+	if (ingress)
+		err = skb_cow_head(skb, len + skb->mac_len);
+	else
+		err = skb_cow_head(skb,
+				   len + LL_RESERVED_SPACE(skb_dst(skb)->dev));
+	if (unlikely(err))
+		return err;
+
+	/* push the encap headers and fix pointers */
+	skb_reset_inner_headers(skb);
+	skb->encapsulation = 1;
+	skb_push(skb, len);
+	if (ingress)
+		skb_postpush_rcsum(skb, iph, len);
+	skb_reset_network_header(skb);
+	memcpy(skb_network_header(skb), hdr, len);
+	bpf_compute_data_pointers(skb);
+
+	if (ipv4) {
+		skb->protocol = htons(ETH_P_IP);
+		iph = ip_hdr(skb);
+		if (iph->ihl * 4 < len)
+			skb_set_transport_header(skb, iph->ihl * 4);
+
+		if (!iph->check)
+			iph->check = ip_fast_csum((unsigned char *)iph,
+						  iph->ihl);
+	} else {
+		skb->protocol = htons(ETH_P_IPV6);
+		if (sizeof(struct ipv6hdr) < len)
+			skb_set_transport_header(skb, sizeof(struct ipv6hdr));
+	}
+
+	return 0;
+}
+
 static int __init bpf_lwt_init(void)
 {
 	return lwtunnel_encap_add_ops(&bpf_encap_ops, LWTUNNEL_ENCAP_BPF);
-- 
2.20.1.495.gaa96b0ce6b-goog


^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox