Netdev List

Netdev List
 help / color / mirror / Atom feed

* Re: [PATCH bpf-next] bpf: check that BPF programs run with preemption disabled
From: Song Liu @ 2019-01-29  7:00 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: David Miller, Daniel Borkmann, peterz@infradead.org,
	jannh@google.com, paulmck@linux.ibm.com, will.deacon@arm.com,
	mingo@redhat.com, netdev@vger.kernel.org, Kernel Team
In-Reply-To: <20190129012152.251061-1-ast@kernel.org>



> On Jan 28, 2019, at 5:21 PM, Alexei Starovoitov <ast@kernel.org> wrote:
> 
> From: Peter Zijlstra <peterz@infradead.org>
> 
> Introduce cant_sleep() macro for annotation of functions that cannot sleep.
> 
> Use it in BPF_PROG_RUN to catch execution of BPF programs
> in preemptable context.
> 
> Suggested-by: Jann Horn <jannh@google.com>
> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> Signed-off-by: Alexei Starovoitov <ast@kernel.org>

Acked-by: Song Liu <songliubraving@fb.com>


> ---
> include/linux/filter.h |  2 +-
> include/linux/kernel.h | 14 ++++++++++++--
> kernel/sched/core.c    | 28 ++++++++++++++++++++++++++++
> 3 files changed, 41 insertions(+), 3 deletions(-)
> 
> diff --git a/include/linux/filter.h b/include/linux/filter.h
> index e4b473f85b46..7e87863617b3 100644
> --- a/include/linux/filter.h
> +++ b/include/linux/filter.h
> @@ -533,7 +533,7 @@ struct sk_filter {
> 	struct bpf_prog	*prog;
> };
> 
> -#define BPF_PROG_RUN(filter, ctx)  (*(filter)->bpf_func)(ctx, (filter)->insnsi)
> +#define BPF_PROG_RUN(filter, ctx)  ({ cant_sleep(); (*(filter)->bpf_func)(ctx, (filter)->insnsi); })
> 
> #define BPF_SKB_CB_LEN QDISC_CB_PRIV_LEN
> 
> diff --git a/include/linux/kernel.h b/include/linux/kernel.h
> index 8f0e68e250a7..a8868a32098c 100644
> --- a/include/linux/kernel.h
> +++ b/include/linux/kernel.h
> @@ -245,8 +245,10 @@ extern int _cond_resched(void);
> #endif
> 
> #ifdef CONFIG_DEBUG_ATOMIC_SLEEP
> -  void ___might_sleep(const char *file, int line, int preempt_offset);
> -  void __might_sleep(const char *file, int line, int preempt_offset);
> +extern void ___might_sleep(const char *file, int line, int preempt_offset);
> +extern void __might_sleep(const char *file, int line, int preempt_offset);
> +extern void __cant_sleep(const char *file, int line, int preempt_offset);
> +
> /**
>  * might_sleep - annotation for functions that can sleep
>  *
> @@ -259,6 +261,13 @@ extern int _cond_resched(void);
>  */
> # define might_sleep() \
> 	do { __might_sleep(__FILE__, __LINE__, 0); might_resched(); } while (0)
> +/**
> + * cant_sleep - annotation for functions that cannot sleep
> + *
> + * this macro will print a stack trace if it is executed with preemption enabled
> + */
> +# define cant_sleep() \
> +	do { __cant_sleep(__FILE__, __LINE__, 0); } while (0)
> # define sched_annotate_sleep()	(current->task_state_change = 0)
> #else
>   static inline void ___might_sleep(const char *file, int line,
> @@ -266,6 +275,7 @@ extern int _cond_resched(void);
>   static inline void __might_sleep(const char *file, int line,
> 				   int preempt_offset) { }
> # define might_sleep() do { might_resched(); } while (0)
> +# define cant_sleep() do { } while (0)
> # define sched_annotate_sleep() do { } while (0)
> #endif
> 
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index a674c7db2f29..1dcbff62f973 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -6149,6 +6149,34 @@ void ___might_sleep(const char *file, int line, int preempt_offset)
> 	add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
> }
> EXPORT_SYMBOL(___might_sleep);
> +
> +void __cant_sleep(const char *file, int line, int preempt_offset)
> +{
> +	static unsigned long prev_jiffy;
> +
> +	if (irqs_disabled())
> +		return;
> +
> +	if (!IS_ENABLED(CONFIG_PREEMPT_COUNT))
> +		return;
> +
> +	if (preempt_count() > preempt_offset)
> +		return;
> +
> +	if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
> +		return;
> +	prev_jiffy = jiffies;
> +
> +	printk(KERN_ERR "BUG: assuming atomic context at %s:%d\n", file, line);
> +	printk(KERN_ERR "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n",
> +			in_atomic(), irqs_disabled(),
> +			current->pid, current->comm);
> +
> +	debug_show_held_locks(current);
> +	dump_stack();
> +	add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
> +}
> +EXPORT_SYMBOL_GPL(__cant_sleep);
> #endif
> 
> #ifdef CONFIG_MAGIC_SYSRQ
> -- 
> 2.20.0
> 


^ permalink raw reply

* Re: [PATCH bpf] bpf: run bpf programs with preemption disabled
From: Song Liu @ 2019-01-29  7:01 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: David Miller, daniel@iogearbox.net, peterz@infradead.org,
	jannh@google.com, paulmck@linux.ibm.com, will.deacon@arm.com,
	mingo@redhat.com, netdev@vger.kernel.org, Kernel Team
In-Reply-To: <20190129024334.1100196-1-ast@kernel.org>



> On Jan 28, 2019, at 6:43 PM, Alexei Starovoitov <ast@kernel.org> wrote:
> 
> Disabled preemption is necessary for proper access to per-cpu maps
> from BPF programs.
> 
> But the sender side of socket filters didn't have preemption disabled:
> unix_dgram_sendmsg->sk_filter->sk_filter_trim_cap->bpf_prog_run_save_cb->BPF_PROG_RUN
> 
> and a combination of af_packet with tun device didn't disable either:
> tpacket_snd->packet_direct_xmit->packet_pick_tx_queue->ndo_select_queue->
>  tun_select_queue->tun_ebpf_select_queue->bpf_prog_run_clear_cb->BPF_PROG_RUN
> 
> Disable preemption before executing BPF programs (both classic and extended).
> 
> Reported-by: Jann Horn <jannh@google.com>
> Signed-off-by: Alexei Starovoitov <ast@kernel.org>

Acked-by: Song Liu <songliubraving@fb.com>


> ---
> include/linux/filter.h | 21 ++++++++++++++++++---
> kernel/bpf/cgroup.c    |  2 +-
> 2 files changed, 19 insertions(+), 4 deletions(-)
> 
> diff --git a/include/linux/filter.h b/include/linux/filter.h
> index ad106d845b22..e532fcc6e4b5 100644
> --- a/include/linux/filter.h
> +++ b/include/linux/filter.h
> @@ -591,8 +591,8 @@ static inline u8 *bpf_skb_cb(struct sk_buff *skb)
> 	return qdisc_skb_cb(skb)->data;
> }
> 
> -static inline u32 bpf_prog_run_save_cb(const struct bpf_prog *prog,
> -				       struct sk_buff *skb)
> +static inline u32 __bpf_prog_run_save_cb(const struct bpf_prog *prog,
> +					 struct sk_buff *skb)
> {
> 	u8 *cb_data = bpf_skb_cb(skb);
> 	u8 cb_saved[BPF_SKB_CB_LEN];
> @@ -611,15 +611,30 @@ static inline u32 bpf_prog_run_save_cb(const struct bpf_prog *prog,
> 	return res;
> }
> 
> +static inline u32 bpf_prog_run_save_cb(const struct bpf_prog *prog,
> +				       struct sk_buff *skb)
> +{
> +	u32 res;
> +
> +	preempt_disable();
> +	res = __bpf_prog_run_save_cb(prog, skb);
> +	preempt_enable();
> +	return res;
> +}
> +
> static inline u32 bpf_prog_run_clear_cb(const struct bpf_prog *prog,
> 					struct sk_buff *skb)
> {
> 	u8 *cb_data = bpf_skb_cb(skb);
> +	u32 res;
> 
> 	if (unlikely(prog->cb_access))
> 		memset(cb_data, 0, BPF_SKB_CB_LEN);
> 
> -	return BPF_PROG_RUN(prog, skb);
> +	preempt_disable();
> +	res = BPF_PROG_RUN(prog, skb);
> +	preempt_enable();
> +	return res;
> }
> 
> static __always_inline u32 bpf_prog_run_xdp(const struct bpf_prog *prog,
> diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
> index ab612fe9862f..d17d05570a3f 100644
> --- a/kernel/bpf/cgroup.c
> +++ b/kernel/bpf/cgroup.c
> @@ -572,7 +572,7 @@ int __cgroup_bpf_run_filter_skb(struct sock *sk,
> 	bpf_compute_and_save_data_end(skb, &saved_data_end);
> 
> 	ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], skb,
> -				 bpf_prog_run_save_cb);
> +				 __bpf_prog_run_save_cb);
> 	bpf_restore_data_end(skb, saved_data_end);
> 	__skb_pull(skb, offset);
> 	skb->sk = save_sk;
> -- 
> 2.20.0
> 


^ permalink raw reply

* Re: [PATCH net-next] net: nci: remove set-but-not-used variable 'status'
From: David Miller @ 2019-01-29  7:04 UTC (permalink / raw)
  To: yuehaibing; +Cc: sameo, linux-kernel, netdev, linux-wireless
In-Reply-To: <20190129064028.7768-1-yuehaibing@huawei.com>

From: YueHaibing <yuehaibing@huawei.com>
Date: Tue, 29 Jan 2019 14:40:28 +0800

> gcc warns about this:
> net/nfc/nci/hci.c:373:5:
>  warning: variable 'status' set but not used [-Wunused-but-set-variable]
> 
> It has not been used since commit d8cd37ed2fc8 ("NFC: nci: Fix improper
> management of HCI return code"), so it can be removed.
> 
> Signed-off-by: YueHaibing <yuehaibing@huawei.com>

I'll let the wireless/nfc folks pick this up.

^ permalink raw reply

* Re: [PATCH net] net: tls: Save iv in tls_rec for async crypto requests
From: David Miller @ 2019-01-29  7:06 UTC (permalink / raw)
  To: davejwatson; +Cc: netdev, vakul.garg, borisp, aviadye, john.fastabend, daniel
In-Reply-To: <20190127005735.k3qh34hctskwdvzy@davejwatson-mba.local.dhcp.thefacebook.com>

From: Dave Watson <davejwatson@fb.com>
Date: Sun, 27 Jan 2019 00:57:38 +0000

> aead_request_set_crypt takes an iv pointer, and we change the iv
> soon after setting it.  Some async crypto algorithms don't save the iv,
> so we need to save it in the tls_rec for async requests.
> 
> Found by hardcoding x64 aesni to use async crypto manager (to test the async
> codepath), however I don't think this combination can happen in the wild.
> Presumably other hardware offloads will need this fix, but there have been
> no user reports.
> 
> Fixes: a42055e8d2c30 ("Add support for async encryption of records...")
> Signed-off-by: Dave Watson <davejwatson@fb.com>

Applied and queued up for -stable.

^ permalink raw reply

* Re: [PATCH] kernel/bpf/cgroup.c - clean up kerneldoc warnings
From: Song Liu @ 2019-01-29  7:07 UTC (permalink / raw)
  To: Valdis Kletnieks
  Cc: Alexei Starovoitov, Daniel Borkmann, Networking, open list
In-Reply-To: <10746.1548744426@turing-police.cc.vt.edu>

On Mon, Jan 28, 2019 at 10:48 PM <valdis.kletnieks@vt.edu> wrote:
>
> Building with W=1 reveals some bitrot
>
>   CC      kernel/bpf/cgroup.o
> kernel/bpf/cgroup.c:238: warning: Function parameter or member 'flags' not described in '__cgroup_bpf_attach'
> kernel/bpf/cgroup.c:367: warning: Function parameter or member 'unused_flags' not described in '__cgroup_bpf_detach'
>
> Add a kerneldoc line for 'flags'.
>
> Fixing the warning for 'unused_flags' is best approached by
> removing the unused parameter on the function call.
>
> Signed-off-by: Valdis Kletnieks <valdis.kletnieks@vt.edu>

Thanks!

Acked-by: Song Liu <songliubraving@fb.com>

PS: For future patches, please specify which tree the patch should be
applied to with [PATCH bpf] or [PATCH bpf-next].

>
> diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h
> index 588dd5f0bd85..695b2a880d9a 100644
> --- a/include/linux/bpf-cgroup.h
> +++ b/include/linux/bpf-cgroup.h
> @@ -78,7 +78,7 @@ int cgroup_bpf_inherit(struct cgroup *cgrp);
>  int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog,
>                         enum bpf_attach_type type, u32 flags);
>  int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
> -                       enum bpf_attach_type type, u32 flags);
> +                       enum bpf_attach_type type);
>  int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
>                        union bpf_attr __user *uattr);
>
> diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
> index ab612fe9862f..d78cfec5807d 100644
> --- a/kernel/bpf/cgroup.c
> +++ b/kernel/bpf/cgroup.c
> @@ -230,6 +230,7 @@ static int update_effective_progs(struct cgroup *cgrp,
>   * @cgrp: The cgroup which descendants to traverse
>   * @prog: A program to attach
>   * @type: Type of attach operation
> + * @flags: Option flags
>   *
>   * Must be called with cgroup_mutex held.
>   */
> @@ -363,7 +364,7 @@ int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog,
>   * Must be called with cgroup_mutex held.
>   */
>  int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
> -                       enum bpf_attach_type type, u32 unused_flags)
> +                       enum bpf_attach_type type)
>  {
>         struct list_head *progs = &cgrp->bpf.progs[type];
>         enum bpf_cgroup_storage_type stype;
> diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
> index 6d03a27918f4..9802ab424397 100644
> --- a/kernel/cgroup/cgroup.c
> +++ b/kernel/cgroup/cgroup.c
> @@ -6059,7 +6059,7 @@ int cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
>         int ret;
>
>         mutex_lock(&cgroup_mutex);
> -       ret = __cgroup_bpf_detach(cgrp, prog, type, flags);
> +       ret = __cgroup_bpf_detach(cgrp, prog, type);
>         mutex_unlock(&cgroup_mutex);
>         return ret;
>  }
>

^ permalink raw reply

* Re: [PATCH net] net: tls: Fix deadlock in free_resources tx
From: David Miller @ 2019-01-29  7:07 UTC (permalink / raw)
  To: davejwatson; +Cc: netdev, vakul.garg, borisp, aviadye, john.fastabend, daniel
In-Reply-To: <20190127005901.osdc5t5xvbk4lwb7@davejwatson-mba.local.dhcp.thefacebook.com>

From: Dave Watson <davejwatson@fb.com>
Date: Sun, 27 Jan 2019 00:59:03 +0000

> If there are outstanding async tx requests (when crypto returns EINPROGRESS),
> there is a potential deadlock: the tx work acquires the lock, while we
> cancel_delayed_work_sync() while holding the lock.  Drop the lock while waiting
> for the work to complete.
> 
> Fixes: a42055e8d2c30 ("Add support for async encryption of records...")
> Signed-off-by: Dave Watson <davejwatson@fb.com>

Applied and queued up for -stable.

^ permalink raw reply

* Re: [PATCH] include/linux/bpf.h - fix missing prototype warnings...
From: Song Liu @ 2019-01-29  7:07 UTC (permalink / raw)
  To: Valdis Kletnieks
  Cc: Alexei Starovoitov, Daniel Borkmann, Networking, open list
In-Reply-To: <6349.1548741865@turing-police.cc.vt.edu>

On Mon, Jan 28, 2019 at 10:05 PM <valdis.kletnieks@vt.edu> wrote:
>
> Compiling with W=1 generates warnings:
>
>   CC      kernel/bpf/core.o
> kernel/bpf/core.c:721:12: warning: no previous prototype for 'bpf_jit_alloc_exec_limit' [-Wmissing-prototypes]
>   721 | u64 __weak bpf_jit_alloc_exec_limit(void)
>       |            ^~~~~~~~~~~~~~~~~~~~~~~~
> kernel/bpf/core.c:757:14: warning: no previous prototype for 'bpf_jit_alloc_exec' [-Wmissing-prototypes]
>   757 | void *__weak bpf_jit_alloc_exec(unsigned long size)
>       |              ^~~~~~~~~~~~~~~~~~
> kernel/bpf/core.c:762:13: warning: no previous prototype for 'bpf_jit_free_exec' [-Wmissing-prototypes]
>   762 | void __weak bpf_jit_free_exec(void *addr)
>       |             ^~~~~~~~~~~~~~~~~
>
> All three are weak functions that archs can override, although none do so
> currently.  Provide prototypes for when a new arch provides its own.
>
> Signed-off-by: Valdis Kletnieks <valdis.kletnieks@vt.edu>

Acked-by: Song Liu <songliubraving@fb.com>

>
> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> index 3851529062ec..99e55313123f 100644
> --- a/include/linux/bpf.h
> +++ b/include/linux/bpf.h
> @@ -472,6 +472,10 @@ _out:                                                      \
>  #define BPF_PROG_RUN_ARRAY_CHECK(array, ctx, func)     \
>         __BPF_PROG_RUN_ARRAY(array, ctx, func, true)
>
> +u64 __weak bpf_jit_alloc_exec_limit(void);
> +void *__weak bpf_jit_alloc_exec(unsigned long size);
> +void __weak bpf_jit_free_exec(void *addr);
> +
>  #ifdef CONFIG_BPF_SYSCALL
>  DECLARE_PER_CPU(int, bpf_prog_active);
>
>

^ permalink raw reply

* Re: net: alteon: replace dev_kfree_skb_irq by dev_consume_skb_irq
From: David Miller @ 2019-01-29  7:11 UTC (permalink / raw)
  To: albin_yang; +Cc: linux-acenic, netdev, jes, yang.wei9
In-Reply-To: <1548604594-4181-1-git-send-email-albin_yang@163.com>

From: Yang Wei <albin_yang@163.com>
Date: Sun, 27 Jan 2019 23:56:34 +0800

> dev_consume_skb_irq() should be called in ace_tx_int() when xmit
> done. It makes drop profiles more friendly.
> 
> Signed-off-by: Yang Wei <yang.wei9@zte.com.cn>

Applied.

^ permalink raw reply

* Re: net: amd8111e: replace dev_kfree_skb_irq by dev_consume_skb_irq
From: David Miller @ 2019-01-29  7:11 UTC (permalink / raw)
  To: albin_yang; +Cc: netdev, yang.wei9
In-Reply-To: <1548604705-4251-1-git-send-email-albin_yang@163.com>

From: Yang Wei <albin_yang@163.com>
Date: Sun, 27 Jan 2019 23:58:25 +0800

> dev_consume_skb_irq() should be called in amd8111e_tx() when xmit
> done. It makes drop profiles more friendly.
> 
> Signed-off-by: Yang Wei <yang.wei9@zte.com.cn>

Applied.

^ permalink raw reply

* Re: [PATCH net] net: apple: replace dev_kfree_skb_irq by dev_consume_skb_irq for drop profiles
From: David Miller @ 2019-01-29  7:11 UTC (permalink / raw)
  To: yang.wei9
  Cc: krzk, herbert, yuehaibing, netdev, linux-kernel, xue.zhihong,
	wang.yi59
In-Reply-To: <1548718753-41047-1-git-send-email-yang.wei9@zte.com.cn>

From: Yang Wei <yang.wei9@zte.com.cn>
Date: Tue, 29 Jan 2019 07:39:13 +0800

> dev_consume_skb_irq() should be called in bmac_txdma_intr() when 
> xmit done. It makes drop profiles more friendly.
> 
> Signed-off-by: Yang Wei <yang.wei9@zte.com.cn>

Applied.

^ permalink raw reply

* Re: [PATCH net] net: ti: replace dev_kfree_skb_irq by dev_consume_skb_irq for drop profiles
From: David Miller @ 2019-01-29  7:11 UTC (permalink / raw)
  To: yang.wei9; +Cc: f.fainelli, netdev, linux-kernel, xue.zhihong, wang.yi59
In-Reply-To: <1548718810-41315-1-git-send-email-yang.wei9@zte.com.cn>

From: Yang Wei <yang.wei9@zte.com.cn>
Date: Tue, 29 Jan 2019 07:40:10 +0800

> dev_consume_skb_irq() should be called in cpmac_end_xmit() when
> xmit done. It makes drop profiles more friendly.
> 
> Signed-off-by: Yang Wei <yang.wei9@zte.com.cn>

Applied.

^ permalink raw reply

* Re: [PATCH net-next] liquidio: fix the validation of rx checksum status from NIC hardware
From: David Miller @ 2019-01-29  7:13 UTC (permalink / raw)
  To: fmanlunas; +Cc: netdev, dchickles, sburla, vburru
In-Reply-To: <20190128193828.GA32464@felix-thinkpad.marvell.com>

From: Felix Manlunas <fmanlunas@marvell.com>
Date: Mon, 28 Jan 2019 19:38:31 +0000

> From: Veerasenareddy Burru <vburru@marvell.com>
> 
> Fixed the code that was incorrectly interpreting the rx checksum validation
> status from hardware, and updating kernel that the packet arrived with
> correct checksum though the packet arrived with incorrect checksum and
> hardware also indicated checksum is not correct.
> 
> Signed-off-by: Veerasenareddy Burru <vburru@marvell.com>
> Acked-by: Derek Chickles <dchickles@marvell.com>
> Signed-off-by: Felix Manlunas <fmanlunas@marvell.com>

Applied, thanks.

^ permalink raw reply

* Re: TCP/IPv4 sending using MSG_ZEROCOPY and closing the socket
From: mathias_koehrer @ 2019-01-29  7:14 UTC (permalink / raw)
  To: Willem de Bruijn; +Cc: Network Development, Willem de Bruijn
In-Reply-To: <CAF=yD-Kn_7zdg7r7jc8GmqKnS46cLktWMWsZRP6K6VP=KqURKw@mail.gmail.com>

>> Hi all,
>> 
>> I have one question on the behavior of TCP/IPv4 sending using the MSG_ZEROCOPY flag,
>> the kernel version is 4.19.18.
>> 
>> What happens if I close the sending socket immediately after performing a socket
>> send() or sendmsg() call (called with the MSG_ZEROCOPY flag)?
>> I.e. in this situation not all messages have been sent yet, however - as the
>> socket is closed - it is no longer possible to retrieve the completion
>> notification via the error channel.
>> 
>> Is it fine for the user space program to free all outstanding messages after the
>> socket close() has returned?
>> Or is there anything else that has to be considered?
> 
> If closing the socket while user memory is still in transmission, it
> will not be possible to safely reuse the memory, as the process has no
> way of discovering when the kernel has finished transmission.
> 
> Depending on type of memory, there may be workarounds to avoid
> unbound virtual memory growth, such as unmapping the virtual
> address range in the case of mmap()ed data.
> 
> But in general, the right approach is to wait for all completions
> before closing a socket.
> 
> If this takes a long time, say due to the TCP stack holding on to data for
> retransmission in the case a peer does not properly close its side,
> disconnect (connect() AF_UNSPEC) can be used to purge the
> queues and trigger notifications. Again, this is a last resort and
> usually not needed.
Hi Willem!
Thank you for the clarification.
Do you think you could add this to the documentation of MSG_ZEROCOPY?
I think this could be really helpful for all users of the zerocopy feature.

Thanks a lot

Mathias

^ permalink raw reply

* Re: Kernel memory corruption in CIPSO labeled TCP packets processing.
From: Nazarov Sergey @ 2019-01-29  7:23 UTC (permalink / raw)
  To: Paul Moore
  Cc: linux-security-module@vger.kernel.org, selinux@vger.kernel.org,
	netdev@vger.kernel.org, Casey Schaufler
In-Reply-To: <CAHC9VhRC8BC9Ocs9FYVrwnWutWD4Dow_MjwpvowxmJd=NtVN5g@mail.gmail.com>

29.01.2019, 01:18, "Paul Moore" <paul@paul-moore.com>:
> If we don't pass a skb into ip_options_compile(), meaning both "skb"
> and "rt" will be NULL, then I don't believe the option data will
> change. Am I missing something?
>
> --
> paul moore
> www.paul-moore.com

I mean, in cipso_v4_error we copy option data from skb before ip_options_compile call:
+       memcpy(opt->__data, (unsigned char *)&(ip_hdr(skb)[1]), opt->optlen);
But skb IP header data could be already changed by first call of ip_options_compile
when packet received.

^ permalink raw reply

* Re: WoL broken in r8169.c since kernel 4.19
From: Marc Haber @ 2019-01-29  7:32 UTC (permalink / raw)
  To: Heiner Kallweit; +Cc: netdev@vger.kernel.org
In-Reply-To: <b19894de-5d8d-0578-8efd-7e61773f383c@gmail.com>

On Mon, Jan 28, 2019 at 09:28:13PM +0100, Heiner Kallweit wrote:
> Because we're interested in file r8169.c only, you could test r8169.c
> from the oops-ing kernel on top of a working kernel version.

That's a really nice idea, and so obvious once one thinks about it.

According to bisect, the first bad commit is
19725496da5602b401eae389736ab00d1817e264

commit 19725496da5602b401eae389736ab00d1817e264
Merge: aea5f654e6b7 9981b4fb8684
Author: David S. Miller <davem@davemloft.net>
Date:   Tue Jul 24 19:21:58 2018 -0700

    Merge ra.kernel.org:/pub/scm/linux/kernel/git/davem/net

diff --cc drivers/net/ethernet/realtek/r8169.c
index 49a6e25ddc2b,eaedc11ed686..8ea1fa36ca43
--- a/drivers/net/ethernet/realtek/r8169.c
+++ b/drivers/net/ethernet/realtek/r8169.c
@@@ -7396,9 -7734,24 +7396,8 @@@ static int rtl_init_one(struct pci_dev 
                return rc;
        }
  
-       /* override BIOS settings, use userspace tools to enable WOL */
-       __rtl8169_set_wol(tp, 0);
+       tp->saved_wolopts = __rtl8169_get_wol(tp);
  
 -      if (rtl_tbi_enabled(tp)) {
 -              tp->set_speed = rtl8169_set_speed_tbi;
 -              tp->get_link_ksettings = rtl8169_get_link_ksettings_tbi;
 -              tp->phy_reset_enable = rtl8169_tbi_reset_enable;
 -              tp->phy_reset_pending = rtl8169_tbi_reset_pending;
 -              tp->link_ok = rtl8169_tbi_link_ok;
 -              tp->do_ioctl = rtl_tbi_ioctl;
 -      } else {
 -              tp->set_speed = rtl8169_set_speed_xmii;
 -              tp->get_link_ksettings = rtl8169_get_link_ksettings_xmii;
 -              tp->phy_reset_enable = rtl8169_xmii_reset_enable;
 -              tp->phy_reset_pending = rtl8169_xmii_reset_pending;
 -              tp->link_ok = rtl8169_xmii_link_ok;
 -              tp->do_ioctl = rtl_xmii_ioctl;
 -      }
 -
        mutex_init(&tp->wk.mutex);
        u64_stats_init(&tp->rx_stats.syncp);
        u64_stats_init(&tp->tx_stats.syncp);

What confuses me here is the big part where the "-" is not in column 1, and
patch -R calls it garbage.

Greetings
Marc

-- 
-----------------------------------------------------------------------------
Marc Haber         | "I don't trust Computers. They | Mailadresse im Header
Leimen, Germany    |  lose things."    Winona Ryder | Fon: *49 6224 1600402
Nordisch by Nature |  How to make an American Quilt | Fax: *49 6224 1600421

^ permalink raw reply

* [PATCH] netfilter: nft_tunnel: Add NFTA_TUNNEL_MODE options
From: wenxu @ 2019-01-29  7:51 UTC (permalink / raw)
  To: pablo, netfilter-devel; +Cc: netdev

From: wenxu <wenxu@ucloud.cn>

The nft "tunnel" expression matches the tun_info of both RX and TX. This
patch provides NFTA_TUNNEL_MODE to match the tun_info of RX or TX
individually.

Signed-off-by: wenxu <wenxu@ucloud.cn>
---
 include/uapi/linux/netfilter/nf_tables.h |  9 +++++++++
 net/netfilter/nft_tunnel.c               | 34 ++++++++++++++++++++++++++++++--
 2 files changed, 41 insertions(+), 2 deletions(-)

diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h
index 0303028..a66c8de 100644
--- a/include/uapi/linux/netfilter/nf_tables.h
+++ b/include/uapi/linux/netfilter/nf_tables.h
@@ -1727,10 +1727,19 @@ enum nft_tunnel_keys {
 };
 #define NFT_TUNNEL_MAX	(__NFT_TUNNEL_MAX - 1)
 
+enum nft_tunnel_mode {
+	NFT_TUNNEL_MODE_NONE,
+	NFT_TUNNEL_MODE_RX,
+	NFT_TUNNEL_MODE_TX,
+	__NFT_TUNNEL_MODE_MAX
+};
+#define NFT_TUNNEL_MODE_MAX	(__NFT_TUNNEL_MODE_MAX - 1)
+
 enum nft_tunnel_attributes {
 	NFTA_TUNNEL_UNSPEC,
 	NFTA_TUNNEL_KEY,
 	NFTA_TUNNEL_DREG,
+	NFTA_TUNNEL_MODE,
 	__NFTA_TUNNEL_MAX
 };
 #define NFTA_TUNNEL_MAX	(__NFTA_TUNNEL_MAX - 1)
diff --git a/net/netfilter/nft_tunnel.c b/net/netfilter/nft_tunnel.c
index 3a15f21..ea28588 100644
--- a/net/netfilter/nft_tunnel.c
+++ b/net/netfilter/nft_tunnel.c
@@ -15,6 +15,7 @@
 struct nft_tunnel {
 	enum nft_tunnel_keys	key:8;
 	enum nft_registers	dreg:8;
+	enum nft_tunnel_mode	mode:8;
 };
 
 static void nft_tunnel_get_eval(const struct nft_expr *expr,
@@ -29,14 +30,32 @@ static void nft_tunnel_get_eval(const struct nft_expr *expr,
 
 	switch (priv->key) {
 	case NFT_TUNNEL_PATH:
-		nft_reg_store8(dest, !!tun_info);
+		if (!tun_info) {
+			nft_reg_store8(dest, false);
+			return;
+		}
+		if (priv->mode == NFT_TUNNEL_MODE_NONE ||
+		    (priv->mode == NFT_TUNNEL_MODE_RX &&
+		     !(tun_info->mode & IP_TUNNEL_INFO_TX)) ||
+		    (priv->mode == NFT_TUNNEL_MODE_TX &&
+		     (tun_info->mode & IP_TUNNEL_INFO_TX)))
+			nft_reg_store8(dest, true);
+		else
+			nft_reg_store8(dest, false);
 		break;
 	case NFT_TUNNEL_ID:
 		if (!tun_info) {
 			regs->verdict.code = NFT_BREAK;
 			return;
 		}
-		*dest = ntohl(tunnel_id_to_key32(tun_info->key.tun_id));
+		if (priv->mode == NFT_TUNNEL_MODE_NONE ||
+		    (priv->mode == NFT_TUNNEL_MODE_RX &&
+		     !(tun_info->mode & IP_TUNNEL_INFO_TX)) ||
+		    (priv->mode == NFT_TUNNEL_MODE_TX &&
+		     (tun_info->mode & IP_TUNNEL_INFO_TX)))
+			*dest = ntohl(tunnel_id_to_key32(tun_info->key.tun_id));
+		else
+			regs->verdict.code = NFT_BREAK;
 		break;
 	default:
 		WARN_ON(1);
@@ -47,6 +66,7 @@ static void nft_tunnel_get_eval(const struct nft_expr *expr,
 static const struct nla_policy nft_tunnel_policy[NFTA_TUNNEL_MAX + 1] = {
 	[NFTA_TUNNEL_KEY]	= { .type = NLA_U32 },
 	[NFTA_TUNNEL_DREG]	= { .type = NLA_U32 },
+	[NFTA_TUNNEL_MODE]	= { .type = NLA_U32 },
 };
 
 static int nft_tunnel_get_init(const struct nft_ctx *ctx,
@@ -74,6 +94,14 @@ static int nft_tunnel_get_init(const struct nft_ctx *ctx,
 
 	priv->dreg = nft_parse_register(tb[NFTA_TUNNEL_DREG]);
 
+	if (tb[NFTA_TUNNEL_MODE]) {
+		priv->mode = ntohl(nla_get_be32(tb[NFTA_TUNNEL_MODE]));
+		if (priv->mode > NFT_TUNNEL_MODE_MAX)
+			return -EOPNOTSUPP;
+	} else {
+		priv->mode = NFT_TUNNEL_MODE_NONE;
+	}
+
 	return nft_validate_register_store(ctx, priv->dreg, NULL,
 					   NFT_DATA_VALUE, len);
 }
@@ -87,6 +115,8 @@ static int nft_tunnel_get_dump(struct sk_buff *skb,
 		goto nla_put_failure;
 	if (nft_dump_register(skb, NFTA_TUNNEL_DREG, priv->dreg))
 		goto nla_put_failure;
+	if (nla_put_be32(skb, NFTA_TUNNEL_MODE, htonl(priv->mode)))
+		goto nla_put_failure;
 	return 0;
 
 nla_put_failure:
-- 
1.8.3.1


^ permalink raw reply related

* Re: [PATCH 2/7] sh_eth: RX checksum offload support
From: Simon Horman @ 2019-01-29  7:58 UTC (permalink / raw)
  To: Sergei Shtylyov; +Cc: netdev, David S. Miller, linux-renesas-soc, linux-sh
In-Reply-To: <d95fff4f-7750-87c4-3664-eb5efb593b75@cogentembedded.com>

Hi Sergei,

On Mon, Jan 28, 2019 at 06:45:26PM +0300, Sergei Shtylyov wrote:
> Hello!
> 
> On 01/28/2019 03:18 PM, Simon Horman wrote:
> 
> >> Add support for the RX checksum offload. This is enabled by default and
> >> may be disabled and re-enabled using 'ethtool':
> >>
> >> # ethtool -K eth0 rx {on|off}
> >>
> >> Some Ether MACs provide a simple checksumming scheme which appears to be
> >> completely compatible with CHECKSUM_COMPLETE: sum of all packet data after
> >> the L2 header is appended to packet data; this may be trivially read by
> >> the driver and used to update the skb accordingly. The same checksumming
> >> scheme is implemented in the EtherAVB MACs and now supported by the 'ravb'
> >> driver.
> >>
> >> In terms of performance, throughput is close to gigabit line rate with the
> >> RX checksum offload both enabled and disabled.  The 'perf' output, however,
> >> appears to indicate that significantly less time is spent in do_csum() --
> >> this is as expected.
> > 
> > Nice.
> > 
> > FYI, this seems similar to what I observed for RAVB, perhaps on H3 I don't
> > exactly recall. On E3, which has less CPU power, I recently observed that
> > with rx-csum enabled I can achieve gigabit line rate, but with rx-csum
> > disabled throughput is significantly lower. I.e. on that system throughput
> > is CPU bound with 1500 byte packets unless rx-csum enabled.
> 
>    Unfortunately, we can't test these patches on the other gen3 boards. ISTR
> you have RZ/A1H board... if it's still with you, I'd appreciate testing.

Unfortunately, as of a few weeks ago, I no longer have that board.

> > Next point:
> > 
> > 2da64300fbc ("ravb: expand rx descriptor data to accommodate hw checksum")
> > is fresh in my mind and I wonder if mdp->rx_buf_sz needs to grow to ensure
> > that there is always enough space for the csum.
> 
>    Well, if you look at sh_eth_ring_init(), you'll see that the driver reserves
> plenty of space at the end of the RX buffers.

Yes, I see that. And I assume that was enough space before this patch.
But is it still enough space now that 2 bytes are needed for the hardware csum?
2 bytes that might have previously been used as packet data in some
circumstances.

> > In particular, have you
> > tested this with MTU-size frames with VLANs? (My test is to run iperf3 over
> > a VLAN netdev, netperf over a VLAN netdev would likely work just as well.)
> 
>    Could you refresh me on how to bring up a VLAN on a given interface?

You will need a kernel with CONFIG_VLAN_8021Q enabled.

Then you can do something like this:

	ip link add link eth0 name eth0.1 type vlan id 1
	ip addr add 10.1.1.100/24 dev eth0.1
	ip link set dev eth0.1 up


> [...]
> >> The above results collected on the R-Car V3H Starter Kit board.
> >>
> >> Based on the commit 4d86d3818627 ("ravb: RX checksum offload")...
> >>
> >> Signed-off-by: Sergei Shtylyov <sergei.shtylyov@cogentembedded.com>
> [...]
> 
> MBR, Sergei
> 

^ permalink raw reply

* Re: [PATCH mlx5-next 0/4] mlx5 next misc updates
From: Leon Romanovsky @ 2019-01-29  7:58 UTC (permalink / raw)
  To: Saeed Mahameed
  Cc: saeedm@dev.mellanox.co.il, Jason Gunthorpe,
	netdev@vger.kernel.org, linux-rdma@vger.kernel.org
In-Reply-To: <812ce9a19d55ed16af677622a39ee484eb9c508f.camel@mellanox.com>

[-- Attachment #1: Type: text/plain, Size: 2180 bytes --]

On Mon, Jan 28, 2019 at 07:11:01PM +0000, Saeed Mahameed wrote:
> On Sun, 2019-01-27 at 07:51 +0000, Leon Romanovsky wrote:
> > On Fri, Jan 25, 2019 at 10:08:00AM -0800, Saeed Mahameed wrote:
> > > On Thu, Jan 24, 2019 at 4:30 AM Leon Romanovsky <
> > > leonro@mellanox.com> wrote:
> > > > On Fri, Jan 18, 2019 at 04:33:09PM -0800, Saeed Mahameed wrote:
> > > > > Hi all,
> > > > >
> > > > > This series includes updates to mlx5-next shared branch.
> > > > >
> > > > > 1) from Jason, improve mlx5_cmd_exec_cb async API to be safer
> > > > > 2) from Maxim Mikityanskiy, cleanups for mlx5_write64 doorbell
> > > > > API
> > > > > 3) from Michael Guralnik, Add pci AtomicOps request
> > > > >
> > > > > Thanks,
> > > > > Saeed.
> > > > >
> > > > > ---
> > > > >
> > > > > Jason Gunthorpe (1):
> > > > >   net/mlx5: Make mlx5_cmd_exec_cb() a safe API
> > > > >
> > > > > Michael Guralnik (1):
> > > > >   net/mlx5: Add pci AtomicOps request
> > > >
> > > > Those two were applied to mlx5-next branch.
> > > >
> > > > ce4eee5340a9 (mlx5-next) net/mlx5: Add pci AtomicOps request
> > > > e355477ed9e4 net/mlx5: Make mlx5_cmd_exec_cb() a safe API
> > > >
> > > > > Maxim Mikityanskiy (2):
> > > > >   net/mlx5: Remove unused MLX5_*_DOORBELL_LOCK macros
> > > > >   net/mlx5: Remove spinlock support from mlx5_write64
> > > >
> > > > Those two need extra work,
> > >
> > > What extra work ?
> >
> > You got two comments for area you are touching:
> > 1. Replace _rww writes to something else.
>
> Not related to this cleanup patchset.
>
> > 2. Protect with spinlock 32-bits writes instead of ignoring it.
>
> Same as above, I already explained this.
>
> >
> > Both of those changes will touch the same 2-4 lines and there
> > is very little benefit in creating more than one-two patches
> > just for that.
> >
>
> Future work, as it needs verification and careful testing.
>
> Leon I would like to move on with those 2 small cleanup patches, no
> functionality change here, please confirm you are ok with them.

At least write a large and scary comment that this mode was always broken.

>
> Thanks,
> Saeed.
>
> > Thanks
> >
> > > > Thanks
> > > >
> > > > > --
> > > > > 2.20.1
> > > > >

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 801 bytes --]

^ permalink raw reply

* [RFC PATCH net-next 0/6 v2] connection tracking in tc and OVS offload
From: Paul Blakey @ 2019-01-29  7:59 UTC (permalink / raw)
  To: Guy Shattah, Marcelo Leitner, Aaron Conole, John Hurley,
	Simon Horman, Justin Pettit, Gregory Rose, Eelco Chaudron,
	Flavio Leitner, Florian Westphal, Jiri Pirko, Rashid Khan,
	Sushil Kulkarni, Andy Gospodarek, Roi Dayan, Yossi Kuperman,
	Or Gerlitz, Rony Efraim, davem@davemloft.net, netdev
  Cc: Paul Blakey

Hi,
As you may know, we have been working on connection tracking for a while, and we already
had patches for tc that matched our connection tracking offload RFC.

Marcelo already shared his tc patches for a similar action ct and flower match on ct_info state,
and these patches are pretty close to his. We would like to share ours and see what the differences are
so maybe we can merge the two.

I think the main difference here is that we propose the usage of a new metadata that resembles
ovs recirc id, so one can use tc recirculation in a similar way that ovs does.

The plan is to support offloading of OVS rules to tc, so this recirculation id will
be shared with and from OVS.

The following is an example using the recirc id metadata (aa_rep and bb_rep are two veth devices)

tc qdisc add dev bb_rep ingress
tc qdisc add dev aa_rep ingress
tc filter add dev aa_rep ingress prio 1 chain 0 proto ip flower match 'meta(tc_recirc mask 0xffffffff eq 0x1)' ct_state +trk+est  ip_proto tcp action mirred egress redirect dev bb_rep
tc filter add dev aa_rep ingress prio 1 chain 0 proto ip flower ct_state -trk ip_proto tcp action ct recirc 1 reclassify
tc filter add dev aa_rep ingress prio 1 chain 0 proto ip flower match 'meta(tc_recirc mask 0xffffffff eq 0x1)' ct_state +trk+new ip_proto tcp action ct commit pipe action mirred egress redirect dev bb_rep

tc filter add dev bb_rep ingress prio 1 chain 0 proto ip flower ct_state -trk ip_proto tcp action ct recirc 2 reclassify
tc filter add dev bb_rep ingress prio 1 chain 0 proto ip flower match 'meta(tc_recirc mask 0xffffffff eq 0x2)' ct_state +trk+est ip_proto tcp  action mirred egress redirect dev aa_rep

of course, goto chain instead of reclassify will also work.

There might be some difference in how we handle action ct and I'll follow up on that.


Changelog:
  v1->v2:
    Missed first patch :|
    Added netdev mailing list

Paul Blakey (6):
  net/sched: Introduce act_ct
  net/sched: cls_flower: add match on ct info
  net/sched: cls_flower: Add ematch support
  net: Add new tc recirc id skb extension
  net/sched: em_meta: add match on tc recirc id skb extension
  net/sched: act_ct: Add tc recirc id set/del support

 include/linux/skbuff.h                    |   1 +
 include/net/tc_act/tc_ct.h                |   2 +
 include/uapi/linux/pkt_cls.h              |  19 ++++
 include/uapi/linux/tc_act/tc_ct.h         |   2 +
 include/uapi/linux/tc_ematch/tc_em_meta.h |   1 +
 net/core/skbuff.c                         |   2 +
 net/sched/act_ct.c                        |  18 ++++
 net/sched/cls_flower.c                    | 148 ++++++++++++++++++++++++++++--
 net/sched/em_meta.c                       |   8 ++
 9 files changed, 194 insertions(+), 7 deletions(-)

-- 
1.8.3.1


^ permalink raw reply

* Re: [PATCH] selftests/netfilter: fix config fragment CONFIG_NF_TABLES_INET
From: Florian Westphal @ 2019-01-29  7:59 UTC (permalink / raw)
  To: Naresh Kamboju; +Cc: netfilter-devel, netdev, fw, pablo, davem, linux-kselftest
In-Reply-To: <20190129062835.31122-1-naresh.kamboju@linaro.org>

Naresh Kamboju <naresh.kamboju@linaro.org> wrote:
> In selftests the config fragment for netfilter was added as
> NF_TABLES_INET=y and this patch corrects it to CONFIG_NF_TABLES_INET=y

Oh, silly me.  Thanks for fixing this up.

Acked-by: Florian Westphal <fw@strlen.de>

^ permalink raw reply

* Re: [PATCH 3/7] sh_eth: offload RX checksum on R7S72100
From: Simon Horman @ 2019-01-29  8:00 UTC (permalink / raw)
  To: Sergei Shtylyov; +Cc: netdev, David S. Miller, linux-renesas-soc, linux-sh
In-Reply-To: <916ed105-d499-4ca1-8418-4743b5a7a60d@cogentembedded.com>

On Mon, Jan 28, 2019 at 06:21:11PM +0300, Sergei Shtylyov wrote:
> On 01/28/2019 03:20 PM, Simon Horman wrote:
> 
> >> The RZ/A1H (R7S721000) SoC manual describes the Ether MAC's RX checksum
> >> offload the same way as it's implemented in the EtherAVB MACs...
> >>
> >> Signed-off-by: Sergei Shtylyov <sergei.shtylyov@cogentembedded.com>
> > 
> > Regarding this and the remaining patches in this series,
> > which add rx-csum offload support in the sh_eth driver for
> > various SoCs: has this been tested?
> 
>    As I said, I've only tested it on R8A77980.

Thanks, I missed that.

As you may have guessed the implication of my question is that
IMHO it would be best only to add this feature to SoCs where
it has been tested.

^ permalink raw reply

* Re: [PATCH bpf-next v4 5/7] samples/bpf: Add a "force" flag to XDP samples
From: Jesper Dangaard Brouer @ 2019-01-29  8:00 UTC (permalink / raw)
  To: Maciej Fijalkowski; +Cc: daniel, ast, netdev, jakub.kicinski, brouer
In-Reply-To: <20190128191613.11705-6-maciejromanfijalkowski@gmail.com>

On Mon, 28 Jan 2019 20:16:11 +0100
Maciej Fijalkowski <maciejromanfijalkowski@gmail.com> wrote:

> From: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
> 
> Make xdp samples consistent with iproute2 behavior and set the
> XDP_FLAGS_UPDATE_IF_NOEXIST by default when setting the xdp program on
> interface. Provide an option for user to force the program loading,
> which as a result will not include the mentioned flag in
> bpf_set_link_xdp_fd call.

I like the idea, but what is the error message users get after this
change?

-- 
Best regards,
  Jesper Dangaard Brouer
  MSc.CS, Principal Kernel Engineer at Red Hat
  LinkedIn: http://www.linkedin.com/in/brouer

^ permalink raw reply

* [RFC PATCH net-next 1/6 v2] net/sched: Introduce act_ct
From: Paul Blakey @ 2019-01-29  8:02 UTC (permalink / raw)
  To: Guy Shattah, Marcelo Leitner, Aaron Conole, John Hurley,
	Simon Horman, Justin Pettit, Gregory Rose, Eelco Chaudron,
	Flavio Leitner, Florian Westphal, Jiri Pirko, Rashid Khan,
	Sushil Kulkarni, Andy Gospodarek, Roi Dayan, Yossi Kuperman,
	Or Gerlitz, Rony Efraim, davem@davemloft.net, netdev
  Cc: Paul Blakey

Act ct will send packets to conntrack on a specific zone,
and when committing a connection, a ct label and ct mark can be set.

Signed-off-by: Paul Blakey <paulb@mellanox.com>
---
 include/net/tc_act/tc_ct.h        |  37 +++
 include/uapi/linux/tc_act/tc_ct.h |  29 +++
 net/sched/Kconfig                 |  11 +
 net/sched/Makefile                |   1 +
 net/sched/act_ct.c                | 465 ++++++++++++++++++++++++++++++++++++++
 5 files changed, 543 insertions(+)
 create mode 100644 include/net/tc_act/tc_ct.h
 create mode 100644 include/uapi/linux/tc_act/tc_ct.h
 create mode 100644 net/sched/act_ct.c

diff --git a/include/net/tc_act/tc_ct.h b/include/net/tc_act/tc_ct.h
new file mode 100644
index 0000000..4a16375
--- /dev/null
+++ b/include/net/tc_act/tc_ct.h
@@ -0,0 +1,37 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __NET_TC_CT_H
+#define __NET_TC_CT_H
+
+#include <net/act_api.h>
+#include <uapi/linux/tc_act/tc_ct.h>
+
+#define TCA_ACT_CT_LABEL_SIZE 4
+struct tcf_ct {
+	struct tc_action common;
+	struct net *net;
+	struct nf_conn *tmpl;
+	u32 labels[TCA_ACT_CT_LABEL_SIZE];
+	u32 labels_mask[TCA_ACT_CT_LABEL_SIZE];
+	u32 mark;
+	u32 mark_mask;
+	u16 zone;
+	bool commit;
+};
+
+#define to_ct(a) ((struct tcf_ct *)a)
+
+static inline bool is_tcf_ct(const struct tc_action *a)
+{
+#ifdef CONFIG_NET_CLS_ACT
+	if (a->ops && a->ops->type == TCA_ACT_CT)
+		return true;
+#endif
+	return false;
+}
+
+static inline struct tcf_ct *tcf_ct(const struct tc_action *a)
+{
+	return to_ct(a);
+}
+
+#endif /* __NET_TC_CT_H */
diff --git a/include/uapi/linux/tc_act/tc_ct.h b/include/uapi/linux/tc_act/tc_ct.h
new file mode 100644
index 0000000..6dbd771
--- /dev/null
+++ b/include/uapi/linux/tc_act/tc_ct.h
@@ -0,0 +1,29 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+#ifndef __UAPI_TC_CT_H
+#define __UAPI_TC_CT_H
+
+#include <linux/types.h>
+#include <linux/pkt_cls.h>
+
+#define TCA_ACT_CT 18
+
+struct tc_ct {
+	tc_gen;
+	__u16 zone;
+	__u32 labels[4];
+	__u32 labels_mask[4];
+	__u32 mark;
+	__u32 mark_mask;
+	bool commit;
+};
+
+enum {
+	TCA_CT_UNSPEC,
+	TCA_CT_PARMS,
+	TCA_CT_TM,
+	TCA_CT_PAD,
+	__TCA_CT_MAX
+};
+#define TCA_CT_MAX (__TCA_CT_MAX - 1)
+
+#endif /* __UAPI_TC_CT_H */
diff --git a/net/sched/Kconfig b/net/sched/Kconfig
index 1b9afde..935a327 100644
--- a/net/sched/Kconfig
+++ b/net/sched/Kconfig
@@ -912,6 +912,17 @@ config NET_ACT_TUNNEL_KEY
 	  To compile this code as a module, choose M here: the
 	  module will be called act_tunnel_key.
 
+config NET_ACT_CT
+        tristate "connection tracking action"
+        depends on NET_CLS_ACT
+        ---help---
+	  Say Y here to allow sending the packets to conntrack module
+
+	  If unsure, say N.
+
+	  To compile this code as a module, choose M here: the
+	  module will be called act_ct.
+
 config NET_IFE_SKBMARK
         tristate "Support to encoding decoding skb mark on IFE action"
         depends on NET_ACT_IFE
diff --git a/net/sched/Makefile b/net/sched/Makefile
index 8a40431..c0a02de 100644
--- a/net/sched/Makefile
+++ b/net/sched/Makefile
@@ -27,6 +27,7 @@ obj-$(CONFIG_NET_IFE_SKBMARK)	+= act_meta_mark.o
 obj-$(CONFIG_NET_IFE_SKBPRIO)	+= act_meta_skbprio.o
 obj-$(CONFIG_NET_IFE_SKBTCINDEX)	+= act_meta_skbtcindex.o
 obj-$(CONFIG_NET_ACT_TUNNEL_KEY)+= act_tunnel_key.o
+obj-$(CONFIG_NET_ACT_CT)	+= act_ct.o
 obj-$(CONFIG_NET_SCH_FIFO)	+= sch_fifo.o
 obj-$(CONFIG_NET_SCH_CBQ)	+= sch_cbq.o
 obj-$(CONFIG_NET_SCH_HTB)	+= sch_htb.o
diff --git a/net/sched/act_ct.c b/net/sched/act_ct.c
new file mode 100644
index 0000000..61155cc
--- /dev/null
+++ b/net/sched/act_ct.c
@@ -0,0 +1,465 @@
+/*
+ * net/sched/act_ct.c  connection tracking action
+ *
+ * Copyright (c) 2018
+ *
+ * Authors:	Yossi Kuperman <yossiku@mellanox.com>
+ *		Paul Blakey <paulb@mellanox.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+*/
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/skbuff.h>
+#include <linux/rtnetlink.h>
+#include <linux/pkt_cls.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+#include <net/netlink.h>
+#include <net/pkt_sched.h>
+#include <net/act_api.h>
+#include <uapi/linux/tc_act/tc_ct.h>
+#include <net/tc_act/tc_ct.h>
+
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_core.h>
+#include <net/netfilter/nf_conntrack_zones.h>
+#include <net/netfilter/nf_conntrack_helper.h>
+#include <net/netfilter/nf_conntrack_labels.h>
+
+#include <net/pkt_cls.h>
+
+static unsigned int ct_net_id;
+static struct tc_action_ops act_ct_ops;
+
+/* Determine whether skb->_nfct is equal to the result of conntrack lookup. */
+static bool skb_nfct_cached(struct net *net, struct sk_buff *skb, u16 zone_id)
+{
+	enum ip_conntrack_info ctinfo;
+	struct nf_conn *ct;
+
+	ct = nf_ct_get(skb, &ctinfo);
+	if (!ct)
+		return false;
+	if (!net_eq(net, read_pnet(&ct->ct_net)))
+		return false;
+	if (nf_ct_zone(ct)->id != zone_id)
+		return false;
+	return true;
+}
+
+/* Trim the skb to the length specified by the IP/IPv6 header,
+ * removing any trailing lower-layer padding. This prepares the skb
+ * for higher-layer processing that assumes skb->len excludes padding
+ * (such as nf_ip_checksum). The caller needs to pull the skb to the
+ * network header, and ensure ip_hdr/ipv6_hdr points to valid data.
+ */
+static int tcf_skb_network_trim(struct sk_buff *skb)
+{
+	unsigned int len;
+	int err;
+
+	switch (skb->protocol) {
+	case htons(ETH_P_IP):
+		len = ntohs(ip_hdr(skb)->tot_len);
+		break;
+	case htons(ETH_P_IPV6):
+		len = sizeof(struct ipv6hdr)
+			+ ntohs(ipv6_hdr(skb)->payload_len);
+		break;
+	default:
+		len = skb->len;
+	}
+
+	err = pskb_trim_rcsum(skb, len);
+
+	return err;
+}
+
+static u_int8_t tcf_skb_family(struct sk_buff *skb)
+{
+	u_int8_t family = PF_UNSPEC;
+
+	switch (skb->protocol) {
+	case htons(ETH_P_IP):
+		family = PF_INET;
+		break;
+	case htons(ETH_P_IPV6):
+		family = PF_INET6;
+		break;
+	default:
+        break;
+	}
+
+	return family;
+}
+
+static bool labels_nonzero(const u32 *labels_mask)
+{
+	return !!memchr_inv(labels_mask, 0, 4);
+}
+
+static int tcf_ct_act(struct sk_buff *skb, const struct tc_action *a,
+		      struct tcf_result *res)
+{
+	struct net *net = dev_net(skb->dev);
+	struct tcf_ct *ca = to_ct(a);
+	enum ip_conntrack_info ctinfo;
+	struct nf_conn *tmpl = NULL;
+	struct nf_hook_state state;
+	struct nf_conn *ct;
+	int nh_ofs, err;
+	u_int8_t family;
+	bool cached;
+
+	/* The conntrack module expects to be working at L3. */
+	nh_ofs = skb_network_offset(skb);
+	skb_pull_rcsum(skb, nh_ofs);
+
+	err = tcf_skb_network_trim(skb);
+	if (err)
+		goto drop;
+
+	family = tcf_skb_family(skb);
+	if (family == PF_UNSPEC)
+		goto drop;
+
+	state.hook = NF_INET_PRE_ROUTING,
+	state.net = net,
+	state.pf = family;
+
+	spin_lock(&ca->tcf_lock);
+	tcf_lastuse_update(&ca->tcf_tm);
+	bstats_update(&ca->tcf_bstats, skb);
+	tmpl = ca->tmpl;
+
+	/* If we are recirculating packets to match on ct fields and
+	 * committing with a separate ct action, then we don't need to
+	 * actually run the packet through conntrack twice unless it's for a
+	 * different zone. */
+	cached = skb_nfct_cached(net, skb, ca->zone);
+	if (!cached) {
+		/* Associate skb with specified zone. */
+		if (tmpl) {
+			if (skb_nfct(skb))
+				nf_conntrack_put(skb_nfct(skb));
+			nf_conntrack_get(&tmpl->ct_general);
+			nf_ct_set(skb, tmpl, IP_CT_NEW);
+		}
+
+		err = nf_conntrack_in(skb, &state);
+		if (err != NF_ACCEPT)
+			goto out;
+	}
+
+	ct = nf_ct_get(skb, &ctinfo);
+	if (!ct)
+		goto out;
+	nf_ct_deliver_cached_events(ct);
+
+	if (ca->commit) {
+		u32 *labels = ca->labels;
+		u32 *labels_m = ca->labels_mask;
+
+#if IS_ENABLED(CONFIG_NF_CONNTRACK_MARK)
+		if (ca->mark_mask) {
+			u32 ct_mark = ca->mark;
+			u32 mask = ca->mark_mask;
+			u32 new_mark;
+
+			new_mark = ct_mark | (ct->mark & ~(mask));
+			if (ct->mark != new_mark) {
+				ct->mark = new_mark;
+				if (nf_ct_is_confirmed(ct))
+					nf_conntrack_event_cache(IPCT_MARK, ct);
+			}
+		}
+#endif
+		if (!nf_ct_is_confirmed(ct)) {
+			bool have_mask = labels_nonzero(labels_m);
+			struct nf_conn_labels *cl, *master_cl;
+
+			/* Inherit master's labels to the related connection? */
+			master_cl = ct->master ? nf_ct_labels_find(ct->master) : NULL;
+
+			if (!master_cl && !have_mask)
+				goto skip; /* Nothing to do. */
+
+			/* Get labels or add them */
+			cl = nf_ct_labels_find(ct);
+			if (!cl) {
+				nf_ct_labels_ext_add(ct);
+				cl = nf_ct_labels_find(ct);
+			}
+			if (!cl)
+				goto out;
+
+			/* Inherit the master's labels, if any.  Must use memcpy for backport
+			 * as struct assignment only copies the length field in older
+			 * kernels.
+	 		*/
+			if (master_cl)
+				memcpy(cl->bits, master_cl->bits, NF_CT_LABELS_MAX_SIZE);
+
+			if (have_mask) {
+				u32 *dst = (u32 *)cl->bits;
+				int i;
+
+				for (i = 0; i < 4; i++)
+					dst[i] = (dst[i] & ~labels_m[i]) | (labels[i] & labels_m[i]);
+			}
+
+			/* Labels are included in the IPCTNL_MSG_CT_NEW event only if the
+			 * IPCT_LABEL bit is set in the event cache.
+			 */
+			nf_conntrack_event_cache(IPCT_LABEL, ct);
+		} else if (labels_nonzero(labels_m)) {
+			struct nf_conn_labels *cl;
+
+			cl = nf_ct_labels_find(ct);
+			if (!cl) {
+				nf_ct_labels_ext_add(ct);
+				cl = nf_ct_labels_find(ct);
+			}
+
+			if (!cl)
+				goto out;
+
+			nf_connlabels_replace(ct, ca->labels, ca->labels_mask, 4);
+		}
+skip:
+		/* This will take care of sending queued events even if the connection
+		 * is already confirmed. */
+		nf_conntrack_confirm(skb);
+	}
+
+out:
+	skb_push(skb, nh_ofs);
+	skb_postpush_rcsum(skb, skb->data, nh_ofs);
+
+	spin_unlock(&ca->tcf_lock);
+	return ca->tcf_action;
+
+drop:
+	spin_lock(&ca->tcf_lock);
+	ca->tcf_qstats.drops++;
+	spin_unlock(&ca->tcf_lock);
+	return TC_ACT_SHOT;
+}
+
+static const struct nla_policy ct_policy[TCA_CT_MAX + 1] = {
+	[TCA_CT_PARMS] = { .len = sizeof(struct tc_ct) },
+};
+
+static int tcf_ct_init(struct net *net, struct nlattr *nla,
+		       struct nlattr *est, struct tc_action **a,
+		       int ovr, int bind, bool rtnl_held,
+		       struct netlink_ext_ack *extack)
+{
+	struct tc_action_net *tn = net_generic(net, ct_net_id);
+	struct nlattr *tb[TCA_CT_MAX + 1];
+	struct nf_conntrack_zone zone;
+	struct nf_conn *tmpl = NULL;
+	bool exists = false;
+	struct tc_ct *parm;
+	struct tcf_ct *ci;
+	int ret, err;
+
+	if (!nla) {
+		NL_SET_ERR_MSG_MOD(extack, "Ct requires attributes to be passed");
+		return -EINVAL;
+	}
+
+	ret = nla_parse_nested(tb, TCA_CT_MAX, nla, ct_policy, extack);
+	if (ret < 0)
+		return ret;
+
+	if (!tb[TCA_CT_PARMS]) {
+		NL_SET_ERR_MSG_MOD(extack, "Missing required ct parameters");
+		return -EINVAL;
+	}
+
+	parm = nla_data(tb[TCA_CT_PARMS]);
+
+	err = tcf_idr_check_alloc(tn, &parm->index, a, bind);
+	if (err < 0)
+		return err;
+	exists = err;
+	if (exists && bind)
+		return 0;
+
+	if (!exists) {
+		ret = tcf_idr_create(tn, parm->index, est, a, &act_ct_ops, bind, false);
+		if (ret) {
+			tcf_idr_cleanup(tn, parm->index);
+			return ret;
+		}
+
+		ci = to_ct(*a);
+		ci->tcf_action = parm->action;
+		ci->net = net;
+		ci->commit = parm->commit;
+		ci->zone = parm->zone;
+#if !IS_ENABLED(CONFIG_NF_CONNTRACK_MARK)
+		if (parm->mark_mask) {
+			NL_SET_ERR_MSG_MOD(extack, "Mark not supported by kernel config");
+			return -EOPNOTSUPP;
+		}
+#endif
+#if !IS_ENABLED(CONFIG_NF_CONNTRACK_LABELS)
+		if (labels_nonzero(parm->labels_mask)) {
+			NL_SET_ERR_MSG_MOD(extack, "Labels not supported by kernel config");
+			return -EOPNOTSUPP;
+		}
+#endif
+		if (parm->zone != NF_CT_DEFAULT_ZONE_ID) {
+			nf_ct_zone_init(&zone, parm->zone,
+					NF_CT_DEFAULT_ZONE_DIR, 0);
+
+			tmpl = nf_ct_tmpl_alloc(net, &zone, GFP_ATOMIC);
+			if (!tmpl) {
+				NL_SET_ERR_MSG_MOD(extack, "Failed to allocate conntrack template");
+				tcf_idr_cleanup(tn, parm->index);
+				return -ENOMEM;
+			}
+			__set_bit(IPS_CONFIRMED_BIT, &tmpl->status);
+			nf_conntrack_get(&tmpl->ct_general);
+		}
+
+		ci->tmpl = tmpl;
+		ci->mark = parm->mark;
+		ci->mark_mask = parm->mark_mask;
+		memcpy(ci->labels, parm->labels, sizeof(parm->labels));
+		memcpy(ci->labels_mask, parm->labels_mask, sizeof(parm->labels_mask));
+
+		tcf_idr_insert(tn, *a);
+		ret = ACT_P_CREATED;
+	} else {
+		/* TODO: handle replace */
+		NL_SET_ERR_MSG_MOD(extack, "Ct can only be created");
+		tcf_idr_cleanup(tn, parm->index);
+		return -EOPNOTSUPP;
+	}
+
+	return ret;
+}
+
+static void tcf_ct_release(struct tc_action *a)
+{
+	struct tcf_ct *ci = to_ct(a);
+
+	if (ci->tmpl)
+		nf_conntrack_put(&ci->tmpl->ct_general);
+}
+
+static inline int tcf_ct_dump(struct sk_buff *skb, struct tc_action *a,
+				    int bind, int ref)
+{
+	unsigned char *b = skb_tail_pointer(skb);
+	struct tcf_ct *ci = to_ct(a);
+
+	struct tc_ct opt = {
+		.index   = ci->tcf_index,
+		.refcnt  = refcount_read(&ci->tcf_refcnt) - ref,
+		.bindcnt = atomic_read(&ci->tcf_bindcnt) - bind,
+	};
+	struct tcf_t t;
+
+	spin_lock_bh(&ci->tcf_lock);
+	opt.action  = ci->tcf_action,
+	opt.zone   = ci->zone,
+	opt.commit = ci->commit,
+	opt.mark = ci->mark,
+	opt.mark_mask = ci->mark_mask,
+	memcpy(opt.labels, ci->labels, sizeof(opt.labels));
+	memcpy(opt.labels_mask, ci->labels_mask, sizeof(opt.labels_mask));
+
+	if (nla_put(skb, TCA_CT_PARMS, sizeof(opt), &opt))
+		goto nla_put_failure;
+
+	tcf_tm_dump(&t, &ci->tcf_tm);
+	if (nla_put_64bit(skb, TCA_CT_TM, sizeof(t), &t, TCA_CT_PAD))
+		goto nla_put_failure;
+	spin_unlock_bh(&ci->tcf_lock);
+
+	return skb->len;
+nla_put_failure:
+	spin_unlock_bh(&ci->tcf_lock);
+	nlmsg_trim(skb, b);
+	return -1;
+}
+
+static int tcf_ct_walker(struct net *net, struct sk_buff *skb,
+			 struct netlink_callback *cb, int type,
+			 const struct tc_action_ops *ops,
+			 struct netlink_ext_ack *extack)
+{
+	struct tc_action_net *tn = net_generic(net, ct_net_id);
+
+	return tcf_generic_walker(tn, skb, cb, type, ops, extack);
+}
+
+static int tcf_ct_search(struct net *net, struct tc_action **a, u32 index)
+{
+	struct tc_action_net *tn = net_generic(net, ct_net_id);
+
+	return tcf_idr_search(tn, a, index);
+}
+
+static struct tc_action_ops act_ct_ops = {
+	.kind		=	"ct",
+	.type		=	TCA_ACT_CT,
+	.owner		=	THIS_MODULE,
+	.act		=	tcf_ct_act,
+	.dump		=	tcf_ct_dump,
+	.init		=	tcf_ct_init,
+	.cleanup	=	tcf_ct_release,
+	.walk		=	tcf_ct_walker,
+	.lookup		=	tcf_ct_search,
+	.size		=	sizeof(struct tcf_ct),
+};
+
+static __net_init int ct_init_net(struct net *net)
+{
+	struct tc_action_net *tn = net_generic(net, ct_net_id);
+
+	return tc_action_net_init(tn, &act_ct_ops);
+}
+
+static void __net_exit ct_exit_net(struct list_head *net_list)
+{
+	tc_action_net_exit(net_list, ct_net_id);
+}
+
+static struct pernet_operations ct_net_ops = {
+	.init = ct_init_net,
+	.exit_batch = ct_exit_net,
+	.id   = &ct_net_id,
+	.size = sizeof(struct tc_action_net),
+};
+
+static int __init ct_init_module(void)
+{
+	char *mark = IS_ENABLED(CONFIG_NF_CONNTRACK_MARK) ? "on" : "off";
+	char *label = IS_ENABLED(CONFIG_NF_CONNTRACK_LABELS) ? "on" : "off";
+
+	pr_info("ct action on, mark: %s, label: %s\n", mark, label);
+	return tcf_register_action(&act_ct_ops, &ct_net_ops);
+}
+
+static void __exit ct_cleanup_module(void)
+{
+	tcf_unregister_action(&act_ct_ops, &ct_net_ops);
+}
+
+module_init(ct_init_module);
+module_exit(ct_cleanup_module);
+MODULE_AUTHOR("Yossi Kuperman <yossiku@mellanox.com>");
+MODULE_DESCRIPTION("Connection tracking action");
+MODULE_LICENSE("GPL");
+
-- 
1.8.3.1


^ permalink raw reply related

* [RFC PATCH net-next 4/6 v2] net: Add new tc recirc id skb extension
From: Paul Blakey @ 2019-01-29  8:02 UTC (permalink / raw)
  To: Guy Shattah, Marcelo Leitner, Aaron Conole, John Hurley,
	Simon Horman, Justin Pettit, Gregory Rose, Eelco Chaudron,
	Flavio Leitner, Florian Westphal, Jiri Pirko, Rashid Khan,
	Sushil Kulkarni, Andy Gospodarek, Roi Dayan, Yossi Kuperman,
	Or Gerlitz, Rony Efraim, davem@davemloft.net, netdev
  Cc: Paul Blakey
In-Reply-To: <1548748926-23822-2-git-send-email-paulb@mellanox.com>

This will be used by a follow-up patch to tc act ct to
recirculate the packet after going to the connection tracking module and
share this recirculation from tc to OVS.

Signed-off-by: Paul Blakey <paulb@mellanox.com>
---
 include/linux/skbuff.h | 1 +
 net/core/skbuff.c      | 2 ++
 2 files changed, 3 insertions(+)

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 95d25b0..02768c7 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -3912,6 +3912,7 @@ enum skb_ext_id {
 #ifdef CONFIG_XFRM
 	SKB_EXT_SEC_PATH,
 #endif
+	SKB_EXT_TC_RECIRC_ID,
 	SKB_EXT_NUM, /* must be last */
 };
 
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 26d8484..57a2655 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -3911,6 +3911,7 @@ int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb)
 #ifdef CONFIG_XFRM
 	[SKB_EXT_SEC_PATH] = SKB_EXT_CHUNKSIZEOF(struct sec_path),
 #endif
+	[SKB_EXT_TC_RECIRC_ID] = SKB_EXT_CHUNKSIZEOF(uint32_t),
 };
 
 static __always_inline unsigned int skb_ext_total_length(void)
@@ -3922,6 +3923,7 @@ static __always_inline unsigned int skb_ext_total_length(void)
 #ifdef CONFIG_XFRM
 		skb_ext_type_len[SKB_EXT_SEC_PATH] +
 #endif
+		skb_ext_type_len[SKB_EXT_TC_RECIRC_ID] +
 		0;
 }
 
-- 
1.8.3.1


^ permalink raw reply related

* [RFC PATCH net-next 2/6 v2] net/sched: cls_flower: add match on ct info
From: Paul Blakey @ 2019-01-29  8:02 UTC (permalink / raw)
  To: Guy Shattah, Marcelo Leitner, Aaron Conole, John Hurley,
	Simon Horman, Justin Pettit, Gregory Rose, Eelco Chaudron,
	Flavio Leitner, Florian Westphal, Jiri Pirko, Rashid Khan,
	Sushil Kulkarni, Andy Gospodarek, Roi Dayan, Yossi Kuperman,
	Or Gerlitz, Rony Efraim, davem@davemloft.net, netdev
  Cc: Paul Blakey
In-Reply-To: <1548748926-23822-2-git-send-email-paulb@mellanox.com>

New match on ct state, mark, and label from ct_info on the skb.
This can be set via sending the packet to ct via the ct action.

Signed-off-by: Paul Blakey <paulb@mellanox.com>
---
 include/uapi/linux/pkt_cls.h |  17 ++++++
 net/sched/cls_flower.c       | 126 +++++++++++++++++++++++++++++++++++++++++--
 2 files changed, 140 insertions(+), 3 deletions(-)

diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h
index 02ac251..121f1ef 100644
--- a/include/uapi/linux/pkt_cls.h
+++ b/include/uapi/linux/pkt_cls.h
@@ -497,11 +497,28 @@ enum {
 	TCA_FLOWER_KEY_PORT_DST_MIN,	/* be16 */
 	TCA_FLOWER_KEY_PORT_DST_MAX,	/* be16 */
 
+	TCA_FLOWER_KEY_CT_STATE,
+	TCA_FLOWER_KEY_CT_STATE_MASK,
+	TCA_FLOWER_KEY_CT_ZONE,
+	TCA_FLOWER_KEY_CT_ZONE_MASK,
+	TCA_FLOWER_KEY_CT_MARK,
+	TCA_FLOWER_KEY_CT_MARK_MASK,
+	TCA_FLOWER_KEY_CT_LABELS,
+	TCA_FLOWER_KEY_CT_LABELS_MASK,
+
 	__TCA_FLOWER_MAX,
 };
 
 #define TCA_FLOWER_MAX (__TCA_FLOWER_MAX - 1)
 
+
+#define TCA_FLOWER_KEY_CT_FLAGS_NEW               0x01 /* Beginning of a new connection. */
+#define TCA_FLOWER_KEY_CT_FLAGS_ESTABLISHED       0x02 /* Part of an existing connection. */
+#define TCA_FLOWER_KEY_CT_FLAGS_RELATED           0x04 /* Related to an established connection. */
+#define TCA_FLOWER_KEY_CT_FLAGS_INVALID           0x10 /* Could not track connection. */
+#define TCA_FLOWER_KEY_CT_FLAGS_TRACKED           0x20 /* Conntrack has occurred. */
+
+
 enum {
 	TCA_FLOWER_KEY_ENC_OPTS_UNSPEC,
 	TCA_FLOWER_KEY_ENC_OPTS_GENEVE, /* Nested
diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c
index f6aa57f..bf74a31 100644
--- a/net/sched/cls_flower.c
+++ b/net/sched/cls_flower.c
@@ -29,6 +29,9 @@
 #include <net/dst.h>
 #include <net/dst_metadata.h>
 
+#include <net/netfilter/nf_conntrack_core.h>
+#include <net/netfilter/nf_conntrack_labels.h>
+
 struct fl_flow_key {
 	int	indev_ifindex;
 	struct flow_dissector_key_control control;
@@ -57,6 +60,11 @@ struct fl_flow_key {
 	struct flow_dissector_key_enc_opts enc_opts;
 	struct flow_dissector_key_ports tp_min;
 	struct flow_dissector_key_ports tp_max;
+
+	u8	ct_state;
+	u16	ct_zone;
+	u32	ct_mark;
+	u32	ct_labels[NF_CT_LABELS_MAX_SIZE / sizeof(u32)];
 } __aligned(BITS_PER_LONG / 8); /* Ensure that we can do comparisons as longs. */
 
 struct fl_flow_mask_range {
@@ -265,19 +273,55 @@ static struct cls_fl_filter *fl_lookup(struct fl_flow_mask *mask,
 	return __fl_lookup(mask, mkey);
 }
 
+static u8 fl_ct_get_state(enum ip_conntrack_info ctinfo)
+{
+	u8 ct_state = TCA_FLOWER_KEY_CT_FLAGS_TRACKED;
+
+	switch (ctinfo) {
+	case IP_CT_ESTABLISHED:
+	case IP_CT_ESTABLISHED_REPLY:
+		ct_state |= TCA_FLOWER_KEY_CT_FLAGS_ESTABLISHED;
+		break;
+	case IP_CT_RELATED:
+	case IP_CT_RELATED_REPLY:
+		ct_state |= TCA_FLOWER_KEY_CT_FLAGS_RELATED;
+		break;
+	case IP_CT_NEW:
+		ct_state |= TCA_FLOWER_KEY_CT_FLAGS_NEW;
+		break;
+	default:
+		break;
+	}
+
+	return ct_state;
+}
+
 static int fl_classify(struct sk_buff *skb, const struct tcf_proto *tp,
 		       struct tcf_result *res)
 {
 	struct cls_fl_head *head = rcu_dereference_bh(tp->root);
-	struct cls_fl_filter *f;
-	struct fl_flow_mask *mask;
-	struct fl_flow_key skb_key;
+	enum ip_conntrack_info ctinfo;
 	struct fl_flow_key skb_mkey;
+	struct fl_flow_key skb_key;
+	struct fl_flow_mask *mask;
+	struct nf_conn_labels *cl;
+	struct cls_fl_filter *f;
+	struct nf_conn *ct;
 
 	list_for_each_entry_rcu(mask, &head->masks, list) {
 		fl_clear_masked_range(&skb_key, mask);
 
 		skb_key.indev_ifindex = skb->skb_iif;
+		ct = nf_ct_get(skb, &ctinfo);
+		if (ct) {
+			skb_key.ct_state = fl_ct_get_state(ctinfo);
+			skb_key.ct_zone = ct->zone.id;
+			skb_key.ct_mark = ct->mark;
+
+			cl = nf_ct_labels_find(ct);
+			if (cl)
+				memcpy(skb_key.ct_labels, cl->bits, sizeof(skb_key.ct_labels));
+		}
 		/* skb_flow_dissect() does not set n_proto in case an unknown
 		 * protocol, so do it rather here.
 		 */
@@ -562,6 +606,14 @@ static void *fl_get(struct tcf_proto *tp, u32 handle)
 	[TCA_FLOWER_KEY_ENC_IP_TTL_MASK] = { .type = NLA_U8 },
 	[TCA_FLOWER_KEY_ENC_OPTS]	= { .type = NLA_NESTED },
 	[TCA_FLOWER_KEY_ENC_OPTS_MASK]	= { .type = NLA_NESTED },
+	[TCA_FLOWER_KEY_CT_STATE]	= { .type = NLA_U8 },
+	[TCA_FLOWER_KEY_CT_STATE_MASK]	= { .type = NLA_U8 },
+	[TCA_FLOWER_KEY_CT_ZONE]	= { .type = NLA_U16 },
+	[TCA_FLOWER_KEY_CT_ZONE_MASK]	= { .type = NLA_U16 },
+	[TCA_FLOWER_KEY_CT_MARK]	= { .type = NLA_U32 },
+	[TCA_FLOWER_KEY_CT_MARK_MASK]	= { .type = NLA_U32 },
+	[TCA_FLOWER_KEY_CT_LABELS]	= { .type = NLA_UNSPEC, .len = 16 },
+	[TCA_FLOWER_KEY_CT_LABELS_MASK]	= { .type = NLA_UNSPEC, .len = 16 },
 };
 
 static const struct nla_policy
@@ -872,6 +924,36 @@ static int fl_set_enc_opt(struct nlattr **tb, struct fl_flow_key *key,
 	return 0;
 }
 
+static int fl_set_key_ct(struct nlattr **tb, struct fl_flow_key *key,
+			 struct fl_flow_key *mask,
+			 struct netlink_ext_ack *extack)
+{
+	size_t label_len = 0;
+
+	if (tb[TCA_FLOWER_KEY_CT_STATE]) {
+		key->ct_state = nla_get_u8(tb[TCA_FLOWER_KEY_CT_STATE]);
+		mask->ct_state = nla_get_u8(tb[TCA_FLOWER_KEY_CT_STATE_MASK]);
+	}
+
+	if (tb[TCA_FLOWER_KEY_CT_ZONE_MASK]) {
+		key->ct_zone = nla_get_u16(tb[TCA_FLOWER_KEY_CT_ZONE]);
+		mask->ct_zone = nla_get_u16(tb[TCA_FLOWER_KEY_CT_ZONE_MASK]);
+	}
+
+	if (tb[TCA_FLOWER_KEY_CT_MARK_MASK]) {
+		key->ct_mark = nla_get_u32(tb[TCA_FLOWER_KEY_CT_MARK]);
+		mask->ct_mark = nla_get_u32(tb[TCA_FLOWER_KEY_CT_MARK_MASK]);
+	}
+
+	if (tb[TCA_FLOWER_KEY_CT_LABELS_MASK]) {
+		label_len = nla_len(tb[TCA_FLOWER_KEY_CT_LABELS]);
+		memcpy(key->ct_labels, nla_data(tb[TCA_FLOWER_KEY_CT_LABELS]), label_len);
+		memcpy(mask->ct_labels, nla_data(tb[TCA_FLOWER_KEY_CT_LABELS_MASK]), label_len);
+	}
+
+	return 0;
+}
+
 static int fl_set_key(struct net *net, struct nlattr **tb,
 		      struct fl_flow_key *key, struct fl_flow_key *mask,
 		      struct netlink_ext_ack *extack)
@@ -1082,6 +1164,10 @@ static int fl_set_key(struct net *net, struct nlattr **tb,
 			return ret;
 	}
 
+	ret = fl_set_key_ct(tb, key, mask, extack);
+	if (ret)
+		return ret;
+
 	if (tb[TCA_FLOWER_KEY_FLAGS])
 		ret = fl_set_key_flags(tb, &key->control.flags, &mask->control.flags);
 
@@ -1761,6 +1847,37 @@ static int fl_dump_key_geneve_opt(struct sk_buff *skb,
 	return -EMSGSIZE;
 }
 
+static int fl_dump_key_ct(struct sk_buff *skb,
+			  struct fl_flow_key *key,
+			  struct fl_flow_key *mask)
+{
+	if(fl_dump_key_val(skb, &key->ct_state, TCA_FLOWER_KEY_CT_STATE,
+			   &mask->ct_state, TCA_FLOWER_KEY_CT_STATE_MASK,
+			   sizeof(key->ct_state)))
+		goto nla_put_failure;
+
+	if (fl_dump_key_val(skb, &key->ct_zone, TCA_FLOWER_KEY_CT_ZONE,
+			    &mask->ct_zone, TCA_FLOWER_KEY_CT_ZONE_MASK,
+			    sizeof(key->ct_zone)))
+		goto nla_put_failure;
+
+	if (fl_dump_key_val(skb, &key->ct_mark, TCA_FLOWER_KEY_CT_MARK,
+			    &mask->ct_mark, TCA_FLOWER_KEY_CT_MARK_MASK,
+			    sizeof(key->ct_mark)))
+		goto nla_put_failure;
+
+	if (fl_dump_key_val(skb, &key->ct_labels, TCA_FLOWER_KEY_CT_LABELS,
+			    &mask->ct_labels, TCA_FLOWER_KEY_CT_LABELS_MASK,
+			    sizeof(key->ct_labels)))
+		goto nla_put_failure;
+
+	return 0;
+
+nla_put_failure:
+	return -EMSGSIZE;
+}
+
+
 static int fl_dump_key_options(struct sk_buff *skb, int enc_opt_type,
 			       struct flow_dissector_key_enc_opts *enc_opts)
 {
@@ -1994,6 +2111,9 @@ static int fl_dump_key(struct sk_buff *skb, struct net *net,
 	    fl_dump_key_enc_opt(skb, &key->enc_opts, &mask->enc_opts))
 		goto nla_put_failure;
 
+	if (fl_dump_key_ct(skb, key, mask))
+		goto nla_put_failure;
+
 	if (fl_dump_key_flags(skb, key->control.flags, mask->control.flags))
 		goto nla_put_failure;
 
-- 
1.8.3.1


^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox