Netdev List

Netdev List
 help / color / mirror / Atom feed

* LOAN OFFER
From: 005 @ 2013-10-04  7:02 UTC (permalink / raw)
  To: Recipients


 Do You need  Business Loan or Personal if Yes?Name,Amount,Country,Loan Durations,Phone Number Email;changloanoffer@vip.se.com

^ permalink raw reply

* [PATCH v4 net-next] fix unsafe set_memory_rw from softirq
From: Alexei Starovoitov @ 2013-10-04  7:14 UTC (permalink / raw)
  To: David S. Miller
  Cc: Daniel Borkmann, Eric Dumazet, Heiko Carstens, linux-arm-kernel,
	linuxppc-dev, linux-s390, netdev

on x86 system with net.core.bpf_jit_enable = 1

sudo tcpdump -i eth1 'tcp port 22'

causes the warning:
[   56.766097]  Possible unsafe locking scenario:
[   56.766097]
[   56.780146]        CPU0
[   56.786807]        ----
[   56.793188]   lock(&(&vb->lock)->rlock);
[   56.799593]   <Interrupt>
[   56.805889]     lock(&(&vb->lock)->rlock);
[   56.812266]
[   56.812266]  *** DEADLOCK ***
[   56.812266]
[   56.830670] 1 lock held by ksoftirqd/1/13:
[   56.836838]  #0:  (rcu_read_lock){.+.+..}, at: [<ffffffff8118f44c>] vm_unmap_aliases+0x8c/0x380
[   56.849757]
[   56.849757] stack backtrace:
[   56.862194] CPU: 1 PID: 13 Comm: ksoftirqd/1 Not tainted 3.12.0-rc3+ #45
[   56.868721] Hardware name: System manufacturer System Product Name/P8Z77 WS, BIOS 3007 07/26/2012
[   56.882004]  ffffffff821944c0 ffff88080bbdb8c8 ffffffff8175a145 0000000000000007
[   56.895630]  ffff88080bbd5f40 ffff88080bbdb928 ffffffff81755b14 0000000000000001
[   56.909313]  ffff880800000001 ffff880800000000 ffffffff8101178f 0000000000000001
[   56.923006] Call Trace:
[   56.929532]  [<ffffffff8175a145>] dump_stack+0x55/0x76
[   56.936067]  [<ffffffff81755b14>] print_usage_bug+0x1f7/0x208
[   56.942445]  [<ffffffff8101178f>] ? save_stack_trace+0x2f/0x50
[   56.948932]  [<ffffffff810cc0a0>] ? check_usage_backwards+0x150/0x150
[   56.955470]  [<ffffffff810ccb52>] mark_lock+0x282/0x2c0
[   56.961945]  [<ffffffff810ccfed>] __lock_acquire+0x45d/0x1d50
[   56.968474]  [<ffffffff810cce6e>] ? __lock_acquire+0x2de/0x1d50
[   56.975140]  [<ffffffff81393bf5>] ? cpumask_next_and+0x55/0x90
[   56.981942]  [<ffffffff810cef72>] lock_acquire+0x92/0x1d0
[   56.988745]  [<ffffffff8118f52a>] ? vm_unmap_aliases+0x16a/0x380
[   56.995619]  [<ffffffff817628f1>] _raw_spin_lock+0x41/0x50
[   57.002493]  [<ffffffff8118f52a>] ? vm_unmap_aliases+0x16a/0x380
[   57.009447]  [<ffffffff8118f52a>] vm_unmap_aliases+0x16a/0x380
[   57.016477]  [<ffffffff8118f44c>] ? vm_unmap_aliases+0x8c/0x380
[   57.023607]  [<ffffffff810436b0>] change_page_attr_set_clr+0xc0/0x460
[   57.030818]  [<ffffffff810cfb8d>] ? trace_hardirqs_on+0xd/0x10
[   57.037896]  [<ffffffff811a8330>] ? kmem_cache_free+0xb0/0x2b0
[   57.044789]  [<ffffffff811b59c3>] ? free_object_rcu+0x93/0xa0
[   57.051720]  [<ffffffff81043d9f>] set_memory_rw+0x2f/0x40
[   57.058727]  [<ffffffff8104e17c>] bpf_jit_free+0x2c/0x40
[   57.065577]  [<ffffffff81642cba>] sk_filter_release_rcu+0x1a/0x30
[   57.072338]  [<ffffffff811108e2>] rcu_process_callbacks+0x202/0x7c0
[   57.078962]  [<ffffffff81057f17>] __do_softirq+0xf7/0x3f0
[   57.085373]  [<ffffffff81058245>] run_ksoftirqd+0x35/0x70

cannot reuse jited filter memory, since it's readonly,
so use original bpf insns memory to hold work_struct

defer kfree of sk_filter until jit completed freeing

tested on x86_64 and i386

Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
---
 arch/arm/net/bpf_jit_32.c       |    1 +
 arch/powerpc/net/bpf_jit_comp.c |    1 +
 arch/s390/net/bpf_jit_comp.c    |    4 +++-
 arch/sparc/net/bpf_jit_comp.c   |    1 +
 arch/x86/net/bpf_jit_comp.c     |   18 +++++++++++++-----
 include/linux/filter.h          |   15 +++++++++++----
 include/net/sock.h              |    6 ++----
 net/core/filter.c               |    8 ++++----
 8 files changed, 36 insertions(+), 18 deletions(-)

diff --git a/arch/arm/net/bpf_jit_32.c b/arch/arm/net/bpf_jit_32.c
index f50d223..99b44e0 100644
--- a/arch/arm/net/bpf_jit_32.c
+++ b/arch/arm/net/bpf_jit_32.c
@@ -930,4 +930,5 @@ void bpf_jit_free(struct sk_filter *fp)
 {
 	if (fp->bpf_func != sk_run_filter)
 		module_free(NULL, fp->bpf_func);
+	kfree(fp);
 }
diff --git a/arch/powerpc/net/bpf_jit_comp.c b/arch/powerpc/net/bpf_jit_comp.c
index bf56e33..2345bdb 100644
--- a/arch/powerpc/net/bpf_jit_comp.c
+++ b/arch/powerpc/net/bpf_jit_comp.c
@@ -691,4 +691,5 @@ void bpf_jit_free(struct sk_filter *fp)
 {
 	if (fp->bpf_func != sk_run_filter)
 		module_free(NULL, fp->bpf_func);
+	kfree(fp);
 }
diff --git a/arch/s390/net/bpf_jit_comp.c b/arch/s390/net/bpf_jit_comp.c
index 7092392..a5df511 100644
--- a/arch/s390/net/bpf_jit_comp.c
+++ b/arch/s390/net/bpf_jit_comp.c
@@ -881,7 +881,9 @@ void bpf_jit_free(struct sk_filter *fp)
 	struct bpf_binary_header *header = (void *)addr;
 
 	if (fp->bpf_func == sk_run_filter)
-		return;
+		goto free_filter;
 	set_memory_rw(addr, header->pages);
 	module_free(NULL, header);
+free_filter:
+	kfree(fp);
 }
diff --git a/arch/sparc/net/bpf_jit_comp.c b/arch/sparc/net/bpf_jit_comp.c
index 9c7be59..218b6b2 100644
--- a/arch/sparc/net/bpf_jit_comp.c
+++ b/arch/sparc/net/bpf_jit_comp.c
@@ -808,4 +808,5 @@ void bpf_jit_free(struct sk_filter *fp)
 {
 	if (fp->bpf_func != sk_run_filter)
 		module_free(NULL, fp->bpf_func);
+	kfree(fp);
 }
diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index 79c216a..516593e 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -772,13 +772,21 @@ out:
 	return;
 }
 
+static void bpf_jit_free_deferred(struct work_struct *work)
+{
+	struct sk_filter *fp = container_of(work, struct sk_filter, work);
+	unsigned long addr = (unsigned long)fp->bpf_func & PAGE_MASK;
+	struct bpf_binary_header *header = (void *)addr;
+
+	set_memory_rw(addr, header->pages);
+	module_free(NULL, header);
+	kfree(fp);
+}
+
 void bpf_jit_free(struct sk_filter *fp)
 {
 	if (fp->bpf_func != sk_run_filter) {
-		unsigned long addr = (unsigned long)fp->bpf_func & PAGE_MASK;
-		struct bpf_binary_header *header = (void *)addr;
-
-		set_memory_rw(addr, header->pages);
-		module_free(NULL, header);
+		INIT_WORK(&fp->work, bpf_jit_free_deferred);
+		schedule_work(&fp->work);
 	}
 }
diff --git a/include/linux/filter.h b/include/linux/filter.h
index a6ac848..ff4e40c 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -6,6 +6,7 @@
 
 #include <linux/atomic.h>
 #include <linux/compat.h>
+#include <linux/workqueue.h>
 #include <uapi/linux/filter.h>
 
 #ifdef CONFIG_COMPAT
@@ -25,15 +26,19 @@ struct sk_filter
 {
 	atomic_t		refcnt;
 	unsigned int         	len;	/* Number of filter blocks */
+	struct rcu_head		rcu;
 	unsigned int		(*bpf_func)(const struct sk_buff *skb,
 					    const struct sock_filter *filter);
-	struct rcu_head		rcu;
-	struct sock_filter     	insns[0];
+	union {
+		struct sock_filter     	insns[0];
+		struct work_struct	work;
+	};
 };
 
-static inline unsigned int sk_filter_len(const struct sk_filter *fp)
+static inline unsigned int sk_filter_size(unsigned int proglen)
 {
-	return fp->len * sizeof(struct sock_filter) + sizeof(*fp);
+	return max(sizeof(struct sk_filter),
+		   offsetof(struct sk_filter, insns[proglen]));
 }
 
 extern int sk_filter(struct sock *sk, struct sk_buff *skb);
@@ -67,11 +72,13 @@ static inline void bpf_jit_dump(unsigned int flen, unsigned int proglen,
 }
 #define SK_RUN_FILTER(FILTER, SKB) (*FILTER->bpf_func)(SKB, FILTER->insns)
 #else
+#include <linux/slab.h>
 static inline void bpf_jit_compile(struct sk_filter *fp)
 {
 }
 static inline void bpf_jit_free(struct sk_filter *fp)
 {
+	kfree(fp);
 }
 #define SK_RUN_FILTER(FILTER, SKB) sk_run_filter(SKB, FILTER->insns)
 #endif
diff --git a/include/net/sock.h b/include/net/sock.h
index e3bf213..1105357 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1612,16 +1612,14 @@ static inline void sk_filter_release(struct sk_filter *fp)
 
 static inline void sk_filter_uncharge(struct sock *sk, struct sk_filter *fp)
 {
-	unsigned int size = sk_filter_len(fp);
-
-	atomic_sub(size, &sk->sk_omem_alloc);
+	atomic_sub(sk_filter_size(fp->len), &sk->sk_omem_alloc);
 	sk_filter_release(fp);
 }
 
 static inline void sk_filter_charge(struct sock *sk, struct sk_filter *fp)
 {
 	atomic_inc(&fp->refcnt);
-	atomic_add(sk_filter_len(fp), &sk->sk_omem_alloc);
+	atomic_add(sk_filter_size(fp->len), &sk->sk_omem_alloc);
 }
 
 /*
diff --git a/net/core/filter.c b/net/core/filter.c
index 6438f29..01b7808 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -644,7 +644,6 @@ void sk_filter_release_rcu(struct rcu_head *rcu)
 	struct sk_filter *fp = container_of(rcu, struct sk_filter, rcu);
 
 	bpf_jit_free(fp);
-	kfree(fp);
 }
 EXPORT_SYMBOL(sk_filter_release_rcu);
 
@@ -683,7 +682,7 @@ int sk_unattached_filter_create(struct sk_filter **pfp,
 	if (fprog->filter == NULL)
 		return -EINVAL;
 
-	fp = kmalloc(fsize + sizeof(*fp), GFP_KERNEL);
+	fp = kmalloc(sk_filter_size(fprog->len), GFP_KERNEL);
 	if (!fp)
 		return -ENOMEM;
 	memcpy(fp->insns, fprog->filter, fsize);
@@ -723,6 +722,7 @@ int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk)
 {
 	struct sk_filter *fp, *old_fp;
 	unsigned int fsize = sizeof(struct sock_filter) * fprog->len;
+	unsigned int sk_fsize = sk_filter_size(fprog->len);
 	int err;
 
 	if (sock_flag(sk, SOCK_FILTER_LOCKED))
@@ -732,11 +732,11 @@ int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk)
 	if (fprog->filter == NULL)
 		return -EINVAL;
 
-	fp = sock_kmalloc(sk, fsize+sizeof(*fp), GFP_KERNEL);
+	fp = sock_kmalloc(sk, sk_fsize, GFP_KERNEL);
 	if (!fp)
 		return -ENOMEM;
 	if (copy_from_user(fp->insns, fprog->filter, fsize)) {
-		sock_kfree_s(sk, fp, fsize+sizeof(*fp));
+		sock_kfree_s(sk, fp, sk_fsize);
 		return -EFAULT;
 	}
 
-- 
1.7.9.5

^ permalink raw reply related

* Re: [PATCH RFC 03/77] PCI/MSI/s390: Fix single MSI only check
From: Martin Schwidefsky @ 2013-10-04  7:39 UTC (permalink / raw)
  To: Alexander Gordeev
  Cc: linux-kernel, Bjorn Helgaas, Ralf Baechle, Michael Ellerman,
	Benjamin Herrenschmidt, Ingo Molnar, Tejun Heo, Dan Williams,
	Andy King, Jon Mason, Matt Porter, stable, linux-pci, linux-mips,
	linuxppc-dev, linux390, linux-s390, x86, linux-ide,
	iss_storagedev, linux-nvme, linux-rdma, netdev, e1000-devel,
	linux-driver, Solarflare linux maintainers, VMware, Inc., lin
In-Reply-To: <8c9811b13fd93e73641dab8e3bd1bd5b2dc37a61.1380703262.git.agordeev@redhat.com>

On Wed,  2 Oct 2013 12:48:19 +0200
Alexander Gordeev <agordeev@redhat.com> wrote:

> Multiple MSIs have never been supported on s390 architecture,
> but the platform code fails to report single MSI only.
> 
> Signed-off-by: Alexander Gordeev <agordeev@redhat.com>
> ---
>  arch/s390/pci/pci.c |    2 ++
>  1 files changed, 2 insertions(+), 0 deletions(-)
> 
> diff --git a/arch/s390/pci/pci.c b/arch/s390/pci/pci.c
> index f17a834..c79c6e4 100644
> --- a/arch/s390/pci/pci.c
> +++ b/arch/s390/pci/pci.c
> @@ -427,6 +427,8 @@ int arch_setup_msi_irqs(struct pci_dev *pdev, int nvec, int type)
>  	pr_debug("%s: requesting %d MSI-X interrupts...", __func__, nvec);
>  	if (type != PCI_CAP_ID_MSIX && type != PCI_CAP_ID_MSI)
>  		return -EINVAL;
> +	if (type == PCI_CAP_ID_MSI && nvec > 1)
> +		return 1;
>  	msi_vecs = min(nvec, ZPCI_MSI_VEC_MAX);
>  	msi_vecs = min_t(unsigned int, msi_vecs, CONFIG_PCI_NR_MSI);
> 

Acked-by: Martin Schwidefsky <schwidefsky@de.ibm.com>

-- 
blue skies,
   Martin.

"Reality continues to ruin my life." - Calvin.

^ permalink raw reply

* Re: [PATCH RFC 04/77] PCI/MSI/s390: Remove superfluous check of MSI type
From: Martin Schwidefsky @ 2013-10-04  7:40 UTC (permalink / raw)
  To: Alexander Gordeev
  Cc: linux-kernel, Bjorn Helgaas, Ralf Baechle, Michael Ellerman,
	Benjamin Herrenschmidt, Ingo Molnar, Tejun Heo, Dan Williams,
	Andy King, Jon Mason, Matt Porter, stable, linux-pci, linux-mips,
	linuxppc-dev, linux390, linux-s390, x86, linux-ide,
	iss_storagedev, linux-nvme, linux-rdma, netdev, e1000-devel,
	linux-driver, Solarflare linux maintainers, VMware, Inc., lin
In-Reply-To: <bae65aa3e30dfd23bd5ed47add7310cfbb96243a.1380703262.git.agordeev@redhat.com>

On Wed,  2 Oct 2013 12:48:20 +0200
Alexander Gordeev <agordeev@redhat.com> wrote:

> arch_setup_msi_irqs() hook can only be called from the generic
> MSI code which ensures correct MSI type parameter.
> 
> Signed-off-by: Alexander Gordeev <agordeev@redhat.com>
> ---
>  arch/s390/pci/pci.c |    2 --
>  1 files changed, 0 insertions(+), 2 deletions(-)
> 
> diff --git a/arch/s390/pci/pci.c b/arch/s390/pci/pci.c
> index c79c6e4..61a3c2c 100644
> --- a/arch/s390/pci/pci.c
> +++ b/arch/s390/pci/pci.c
> @@ -425,8 +425,6 @@ int arch_setup_msi_irqs(struct pci_dev *pdev, int nvec, int type)
>  	int rc;
> 
>  	pr_debug("%s: requesting %d MSI-X interrupts...", __func__, nvec);
> -	if (type != PCI_CAP_ID_MSIX && type != PCI_CAP_ID_MSI)
> -		return -EINVAL;
>  	if (type == PCI_CAP_ID_MSI && nvec > 1)
>  		return 1;
>  	msi_vecs = min(nvec, ZPCI_MSI_VEC_MAX);

Acked-by: Martin Schwidefsky <schwidefsky@de.ibm.com>

-- 
blue skies,
   Martin.

"Reality continues to ruin my life." - Calvin.

^ permalink raw reply

* Re: [PATCH v3 net-next] fix unsafe set_memory_rw from softirq
From: Ingo Molnar @ 2013-10-04  7:51 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: Heiko Carstens, Eric Dumazet, Paul Mackerras, H. Peter Anvin,
	sparclinux, Nicolas Dichtel, linux-s390, Russell King, x86,
	James Morris, Ingo Molnar, Alexey Kuznetsov, Paul E. McKenney,
	Xi Wang, Matt Evans, Thomas Gleixner, linux-arm-kernel,
	Stelian Nirlu, Nicolas Schichan, Hideaki YOSHIFUJI, netdev,
	linux-kernel, David S. Miller, Mircea Gherzan, Daniel Borkmann
In-Reply-To: <1380853446-30537-1-git-send-email-ast@plumgrid.com>


* Alexei Starovoitov <ast@plumgrid.com> wrote:

> on x86 system with net.core.bpf_jit_enable = 1
> 
> sudo tcpdump -i eth1 'tcp port 22'
> 
> causes the warning:
> [   56.766097]  Possible unsafe locking scenario:
> [   56.766097]
> [   56.780146]        CPU0
> [   56.786807]        ----
> [   56.793188]   lock(&(&vb->lock)->rlock);
> [   56.799593]   <Interrupt>
> [   56.805889]     lock(&(&vb->lock)->rlock);
> [   56.812266]
> [   56.812266]  *** DEADLOCK ***
> [   56.812266]
> [   56.830670] 1 lock held by ksoftirqd/1/13:
> [   56.836838]  #0:  (rcu_read_lock){.+.+..}, at: [<ffffffff8118f44c>] vm_unmap_aliases+0x8c/0x380
> [   56.849757]
> [   56.849757] stack backtrace:
> [   56.862194] CPU: 1 PID: 13 Comm: ksoftirqd/1 Not tainted 3.12.0-rc3+ #45
> [   56.868721] Hardware name: System manufacturer System Product Name/P8Z77 WS, BIOS 3007 07/26/2012
> [   56.882004]  ffffffff821944c0 ffff88080bbdb8c8 ffffffff8175a145 0000000000000007
> [   56.895630]  ffff88080bbd5f40 ffff88080bbdb928 ffffffff81755b14 0000000000000001
> [   56.909313]  ffff880800000001 ffff880800000000 ffffffff8101178f 0000000000000001
> [   56.923006] Call Trace:
> [   56.929532]  [<ffffffff8175a145>] dump_stack+0x55/0x76
> [   56.936067]  [<ffffffff81755b14>] print_usage_bug+0x1f7/0x208
> [   56.942445]  [<ffffffff8101178f>] ? save_stack_trace+0x2f/0x50
> [   56.948932]  [<ffffffff810cc0a0>] ? check_usage_backwards+0x150/0x150
> [   56.955470]  [<ffffffff810ccb52>] mark_lock+0x282/0x2c0
> [   56.961945]  [<ffffffff810ccfed>] __lock_acquire+0x45d/0x1d50
> [   56.968474]  [<ffffffff810cce6e>] ? __lock_acquire+0x2de/0x1d50
> [   56.975140]  [<ffffffff81393bf5>] ? cpumask_next_and+0x55/0x90
> [   56.981942]  [<ffffffff810cef72>] lock_acquire+0x92/0x1d0
> [   56.988745]  [<ffffffff8118f52a>] ? vm_unmap_aliases+0x16a/0x380
> [   56.995619]  [<ffffffff817628f1>] _raw_spin_lock+0x41/0x50
> [   57.002493]  [<ffffffff8118f52a>] ? vm_unmap_aliases+0x16a/0x380
> [   57.009447]  [<ffffffff8118f52a>] vm_unmap_aliases+0x16a/0x380
> [   57.016477]  [<ffffffff8118f44c>] ? vm_unmap_aliases+0x8c/0x380
> [   57.023607]  [<ffffffff810436b0>] change_page_attr_set_clr+0xc0/0x460
> [   57.030818]  [<ffffffff810cfb8d>] ? trace_hardirqs_on+0xd/0x10
> [   57.037896]  [<ffffffff811a8330>] ? kmem_cache_free+0xb0/0x2b0
> [   57.044789]  [<ffffffff811b59c3>] ? free_object_rcu+0x93/0xa0
> [   57.051720]  [<ffffffff81043d9f>] set_memory_rw+0x2f/0x40
> [   57.058727]  [<ffffffff8104e17c>] bpf_jit_free+0x2c/0x40
> [   57.065577]  [<ffffffff81642cba>] sk_filter_release_rcu+0x1a/0x30
> [   57.072338]  [<ffffffff811108e2>] rcu_process_callbacks+0x202/0x7c0
> [   57.078962]  [<ffffffff81057f17>] __do_softirq+0xf7/0x3f0
> [   57.085373]  [<ffffffff81058245>] run_ksoftirqd+0x35/0x70
> 
> cannot reuse jited filter memory, since it's readonly,
> so use original bpf insns memory to hold work_struct
> 
> defer kfree of sk_filter until jit completed freeing
> 
> tested on x86_64 and i386
> 
> Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
> ---
>  arch/arm/net/bpf_jit_32.c       |    1 +
>  arch/powerpc/net/bpf_jit_comp.c |    1 +
>  arch/s390/net/bpf_jit_comp.c    |    4 +++-
>  arch/sparc/net/bpf_jit_comp.c   |    1 +
>  arch/x86/net/bpf_jit_comp.c     |   20 +++++++++++++++-----
>  include/linux/filter.h          |   11 +++++++++--
>  net/core/filter.c               |   11 +++++++----
>  7 files changed, 37 insertions(+), 12 deletions(-)
> 
> diff --git a/arch/arm/net/bpf_jit_32.c b/arch/arm/net/bpf_jit_32.c
> index f50d223..99b44e0 100644
> --- a/arch/arm/net/bpf_jit_32.c
> +++ b/arch/arm/net/bpf_jit_32.c
> @@ -930,4 +930,5 @@ void bpf_jit_free(struct sk_filter *fp)
>  {
>  	if (fp->bpf_func != sk_run_filter)
>  		module_free(NULL, fp->bpf_func);
> +	kfree(fp);
>  }
> diff --git a/arch/powerpc/net/bpf_jit_comp.c b/arch/powerpc/net/bpf_jit_comp.c
> index bf56e33..2345bdb 100644
> --- a/arch/powerpc/net/bpf_jit_comp.c
> +++ b/arch/powerpc/net/bpf_jit_comp.c
> @@ -691,4 +691,5 @@ void bpf_jit_free(struct sk_filter *fp)
>  {
>  	if (fp->bpf_func != sk_run_filter)
>  		module_free(NULL, fp->bpf_func);
> +	kfree(fp);
>  }
> diff --git a/arch/s390/net/bpf_jit_comp.c b/arch/s390/net/bpf_jit_comp.c
> index 7092392..a5df511 100644
> --- a/arch/s390/net/bpf_jit_comp.c
> +++ b/arch/s390/net/bpf_jit_comp.c
> @@ -881,7 +881,9 @@ void bpf_jit_free(struct sk_filter *fp)
>  	struct bpf_binary_header *header = (void *)addr;
>  
>  	if (fp->bpf_func == sk_run_filter)
> -		return;
> +		goto free_filter;
>  	set_memory_rw(addr, header->pages);
>  	module_free(NULL, header);
> +free_filter:
> +	kfree(fp);
>  }
> diff --git a/arch/sparc/net/bpf_jit_comp.c b/arch/sparc/net/bpf_jit_comp.c
> index 9c7be59..218b6b2 100644
> --- a/arch/sparc/net/bpf_jit_comp.c
> +++ b/arch/sparc/net/bpf_jit_comp.c
> @@ -808,4 +808,5 @@ void bpf_jit_free(struct sk_filter *fp)
>  {
>  	if (fp->bpf_func != sk_run_filter)
>  		module_free(NULL, fp->bpf_func);
> +	kfree(fp);
>  }
> diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
> index 79c216a..1396a0a 100644
> --- a/arch/x86/net/bpf_jit_comp.c
> +++ b/arch/x86/net/bpf_jit_comp.c
> @@ -772,13 +772,23 @@ out:
>  	return;
>  }
>  
> +static void bpf_jit_free_deferred(struct work_struct *work)
> +{
> +	struct sk_filter *fp = container_of((void *)work, struct sk_filter,
> +					    insns);
> +	unsigned long addr = (unsigned long)fp->bpf_func & PAGE_MASK;
> +	struct bpf_binary_header *header = (void *)addr;
> +
> +	set_memory_rw(addr, header->pages);
> +	module_free(NULL, header);
> +	kfree(fp);
> +}

Using the data type suggestions I make further below, this could be 
written in a simpler form, as:

	struct sk_filter *fp = container_of(work, struct sk_filter, work);

Also, a question, why do you mask with PAGE_MASK here:

	unsigned long addr = (unsigned long)fp->bpf_func & PAGE_MASK;

?

AFAICS bpf_func is the module_alloc() result - and module code is page 
aligned. So ->bpf_func is always page aligned here. (The sk_run_filter 
special case cannot happen here.)

> +
>  void bpf_jit_free(struct sk_filter *fp)
>  {
>  	if (fp->bpf_func != sk_run_filter) {
> -		unsigned long addr = (unsigned long)fp->bpf_func & PAGE_MASK;
> -		struct bpf_binary_header *header = (void *)addr;
> -
> -		set_memory_rw(addr, header->pages);
> -		module_free(NULL, header);
> +		struct work_struct *work = (struct work_struct *)fp->insns;
> +		INIT_WORK(work, bpf_jit_free_deferred);

Missing newline between local variables and statements.

> +		schedule_work(work);
>  	}
>  }
> diff --git a/include/linux/filter.h b/include/linux/filter.h
> index a6ac848..5d66cd9 100644
> --- a/include/linux/filter.h
> +++ b/include/linux/filter.h
> @@ -25,15 +25,20 @@ struct sk_filter
>  {
>  	atomic_t		refcnt;
>  	unsigned int         	len;	/* Number of filter blocks */
> +	struct rcu_head		rcu;
>  	unsigned int		(*bpf_func)(const struct sk_buff *skb,
>  					    const struct sock_filter *filter);
> -	struct rcu_head		rcu;
> +	/* insns start right after bpf_func, so that sk_run_filter() fetches
> +	 * first insn from the same cache line that was used to call into
> +	 * sk_run_filter()
> +	 */
>  	struct sock_filter     	insns[0];

Please use the customary (multi-line) comment style:

  /*
   * Comment .....
   * ...... goes here.
   */

specified in Documentation/CodingStyle.

>  };
>  
>  static inline unsigned int sk_filter_len(const struct sk_filter *fp)
>  {
> -	return fp->len * sizeof(struct sock_filter) + sizeof(*fp);
> +	return max(fp->len * sizeof(struct sock_filter),
> +		   sizeof(struct work_struct)) + sizeof(*fp);

So, "sizeof(struct work_struct)) + sizeof(*fp)" is a pattern that repeats 
a couple of times. Might make sense to stick that into a helper of its own 
- but in general this open coded overlay allocation method looks a bit 
fragile and not very obvious in isolation.

So it could be done a bit cleaner, using an anonymous union:

	/*
	 * These two overlay, the work struct is used during workqueue 
	 * driven teardown, when the instructions are not used anymore:
	 */
	union {
		struct sock_filter     	insns[0];
		struct work_struct	work;
	};

And then all the sizeof() calculations become obvious and sk_filter_len() 
could be eliminated - I've marked the conversions in the code further 
below.

>  extern int sk_filter(struct sock *sk, struct sk_buff *skb);
> @@ -67,11 +72,13 @@ static inline void bpf_jit_dump(unsigned int flen, unsigned int proglen,
>  }
>  #define SK_RUN_FILTER(FILTER, SKB) (*FILTER->bpf_func)(SKB, FILTER->insns)
>  #else
> +#include <linux/slab.h>

Inlines in the middle of header files are generally frowned upon.

The standard pattern is to put them at the top, that way it's easier to 
see the dependencies and there's also less .config dependent inclusion, 
which makes header hell cleanup work easier.

>  static inline void bpf_jit_compile(struct sk_filter *fp)
>  {
>  }
>  static inline void bpf_jit_free(struct sk_filter *fp)
>  {
> +	kfree(fp);
>  }
>  #define SK_RUN_FILTER(FILTER, SKB) sk_run_filter(SKB, FILTER->insns)
>  #endif
> diff --git a/net/core/filter.c b/net/core/filter.c
> index 6438f29..ad5eaba 100644
> --- a/net/core/filter.c
> +++ b/net/core/filter.c
> @@ -644,7 +644,6 @@ void sk_filter_release_rcu(struct rcu_head *rcu)
>  	struct sk_filter *fp = container_of(rcu, struct sk_filter, rcu);
>  
>  	bpf_jit_free(fp);
> -	kfree(fp);
>  }
>  EXPORT_SYMBOL(sk_filter_release_rcu);
>  
> @@ -677,13 +676,15 @@ int sk_unattached_filter_create(struct sk_filter **pfp,
>  {
>  	struct sk_filter *fp;
>  	unsigned int fsize = sizeof(struct sock_filter) * fprog->len;
> +	unsigned int sk_fsize = max_t(u32, fsize, sizeof(struct work_struct))
> +		+ sizeof(*fp);

Using the structure definition I suggested, this could be replaced with 
the more obvious:

	unsigned int sk_fsize = max(fsize, sizeof(*fp));

>  	int err;
>  
>  	/* Make sure new filter is there and in the right amounts. */
>  	if (fprog->filter == NULL)
>  		return -EINVAL;
>  
> -	fp = kmalloc(fsize + sizeof(*fp), GFP_KERNEL);
> +	fp = kmalloc(sk_fsize, GFP_KERNEL);
>  	if (!fp)
>  		return -ENOMEM;
>  	memcpy(fp->insns, fprog->filter, fsize);
> @@ -723,6 +724,8 @@ int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk)
>  {
>  	struct sk_filter *fp, *old_fp;
>  	unsigned int fsize = sizeof(struct sock_filter) * fprog->len;
> +	unsigned int sk_fsize = max_t(u32, fsize, sizeof(struct work_struct))
> +		+ sizeof(*fp);

This too could be written as:

	unsigned int sk_fsize = max(fsize, sizeof(*fp));

>  	int err;
>  
>  	if (sock_flag(sk, SOCK_FILTER_LOCKED))
> @@ -732,11 +735,11 @@ int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk)
>  	if (fprog->filter == NULL)
>  		return -EINVAL;
>  
> -	fp = sock_kmalloc(sk, fsize+sizeof(*fp), GFP_KERNEL);
> +	fp = sock_kmalloc(sk, sk_fsize, GFP_KERNEL);
>  	if (!fp)
>  		return -ENOMEM;
>  	if (copy_from_user(fp->insns, fprog->filter, fsize)) {
> -		sock_kfree_s(sk, fp, fsize+sizeof(*fp));
> +		sock_kfree_s(sk, fp, sk_fsize);
>  		return -EFAULT;
>  	}

A couple of questions/suggestions:

1)

I took a brief look at arch/x86/net/bpf_jit_comp.c while reviewing this 
patch.

You need to split up bpf_jit_compile(), it's an obscenely large, ~600 
lines long function. We don't do that in modern, maintainable kernel code.

2)

This 128 bytes extra padding:

        /* Most of BPF filters are really small,
         * but if some of them fill a page, allow at least
         * 128 extra bytes to insert a random section of int3
         */
        sz = round_up(proglen + sizeof(*header) + 128, PAGE_SIZE);

why is it done? It's not clear to me from the comment.

3)

It's nice code altogether! Are there any plans to generalize its 
interfaces, to allow arbitrary bytecode to be used by other kernel 
subsystems as well? In particular tracing filters could make use of it, 
but it would also allow safe probe points.

Thanks,

	Ingo

^ permalink raw reply

* [PATCH v2.42 0/5] MPLS actions and matches
From: Simon Horman @ 2013-10-04  8:09 UTC (permalink / raw)
  To: dev-yBygre7rU0TnMu66kgdUjQ, netdev-u79uwXL29TY76Z2rM5mHXA,
	Jesse Gross, Ben Pfaff
  Cc: Isaku Yamahata, Ravi K

Hi,

This series implements MPLS actions and matches based on work by
Ravi K, Leo Alterman, Yamahata-san and Joe Stringer.

This series provides two changes

* Patches 1 - 3

  Provide user-space support for the VLAN/MPLS tag insertion order
  up to and including OpenFlow 1.2, and the different ordering
  specified from OpenFlow 1.3. In a nutshell the datapath always
  uses the OpenFlow 1.3 ordering, which is to always insert tags
  immediately after the L2 header, regardless of the presence of other
  tags. And ovs-vswtichd provides compatibility for the behaviour up
  to OpenFlow 1.2, which is that MPLS tags should follow VLAN tags
  if present.

  These patches have been updated since v2.40.

  Ben, these are for you to review.

* Patches 4 and 5

  Adding basic MPLS action and match support to the kernel datapath

  These patches have not been updated since v2.40.

  Jesse, these are for you to review.


Differences between v2.42 and v2.41:

v2.42
* Rebase for:
  + 0585f7a ("datapath: Simplify mega-flow APIs.")
  + a097c0b ("datapath: Restructure datapath.c and flow.c")
* As suggested by Jesse Gross
  + Take into account that push_mpls() will have freed the skb on error
  + Remove dubious !eth_p_mpls(skb->protocol) condition from push_mpls
    The !eth_p_mpls(skb->protocol) condition on setting inner_protocol
    has no effect. Its motivation was to ensure that inner_protocol was
    only set the first time that mpls_push occured. However this is already
    ensured by the !ovs_skb_get_inner_protocol(skb) condition.
  + Return -EINVAL instead of -ENOMEM from pop_mpls() if the skb is too short
  + Do not add @inner_protocol to kernel doc for struct ovs_skb_cb.
    The patch no longer adds an inner_protocol member to struct ovs_skb_cb
  + Do not add and set otherwise unsued inner_protocol variable in
    rpl_dev_queue_xmit()
* As suggested by Pravin Shelar
  + Implement compatibility code in existing rpl_skb_gso_segment
    rather than introducing to use rpl___skb_gso_segment


Differences between v2.41 and v2.40:

* As suggested by Ben Pfaff
  + Expand struct ofpact_reg_load to include a mpls_before_vlan field
    rather than using the compat field of the ofpact field of
    struct ofpact_reg_load.
  + Pass version to  ofpacts_pull_openflow11_actions and
    ofpacts_pull_openflow11_instructions.  This removes the invalid
    assumption that that these functions are passed a full message and are
    thus able to deduce the OpenFlow version.


Differences between v2.40 and v2.39:

* Rebase for:
  + New dev_queue_xmit compat code
  + Updated put_vlan()
  + Removal of mpls_depth field from struct flow
* As suggested by Jesse Gross
  + Remove bogus mac_len update from push_mpls()
  + Slightly simplify push_mpls() by using eth_hdr()
  + Remove dubious condition !eth_p_mpls(inner_protocol) on
    an skb being considered to be MPLS in netdev_send()
  + Only use compatibility code for MPLS GSO segmentation on kernels
    older than 3.11
  + Revamp setting of inner_protocol
    1. Do not unconditionally set inner_protocol to the value of
       skb->protocol in ovs_execute_actions().
    2. Initialise inner_protocol it to zero only if compatibility code is in
       use. In the case where compatibility code is not in use it will either
       be zero due since the allocation of the skb or some other value set
       by some other user.
    3. Conditionally set the inner_protocol in push_mpls() to the value of
       skb->protocol when entering push_mpls(). The condition is that
       inner_protocol is zero and the value of skb->protocol is not an MPLS
       ethernet type.
    - This new scheme:
      + Pushes logic to set inner_protocol closer to the case where it is
	needed.
      + Avoids over-writing values set by other users.
* As suggested by Pravin Shelar
  + Only set and restore skb->protocol in rpl___skb_gso_segment() in the
    case of MPLS
  + Add inner_protocol field to struct ovs_gso_cb instead of ovs_skb_cb.
    This moves compatibility code closer to where it is used
    and creates fewer differences with mainline.
* Update comment on mac_len updates in datapath/actions.c
* Remove HAVE_INNER_PROCOTOL and instead just check
  against kernel version 3.11 directly.
  HAVE_INNER_PROCOTOL is a hang-over from work done prior
  to the merge of inner_protocol into the kernel.
* Remove dubious condition !eth_p_mpls(inner_protocol) on
  using inner_protocol as the type in rpl_skb_network_protocol()
* Do not update type of features in rpl_dev_queue_xmit.
  Though arguably correct this is not an inherent part of
  the changes made by this patch.
* Use skb_cow_head() in push_mpls()
  + Call skb_cow_head(skb, MPLS_HLEN) instead of
    make_writable(skb, skb->mac_len) to ensure that there is enough head
    room to push an MPLS LSE regardless of whether the skb is cloned or not.
  + This is consistent with the behaviour of rpl__vlan_put_tag().
  + This is a fix for crashes reported when performing mpls_push
    with headroom less than 4. This problem was introduced in v3.36.
* Skip popping in mpls_pop if the skb is too short to contain an MPLS LSE


Differences between v2.39 and v2.38:

* Rebase for removal of vlan, checksum and skb->mark compat code
  - This includes adding adding a new patch,
    "[PATCH v2.39 6/7] datapath: Break out deacceleration portion of
    vlan_push" to allow re-use of some existing code.


Differences between v2.38 and v2.37:

* Rebase for SCTP support
* Refactor validate_tp_port() to iterate over eth_types rather
  than open-coding the loop. With the addition of SCTP this logic
  is now used three times.


Differences between v2.37 and v2.36:

* Rebase


Differences between v2.36 and v2.35:

* Rebase

* Do not add set_ethertype() to datapath/actions.c.
  As this patch has evolved this function had devolved into
  to sets of functionality wrapped into a single function with
  only one line of common code. Refactor things to simply
  open-code setting the ether type in the two locations where
  set_ethertype() was previously used. The aim here is to improve
  readability.

* Update setting skb->ethertype after mpls push and pop.
  - In the case of push_mpls it should be set unconditionally
    as in v2.35 the behaviour of this function to always push
    an MPLS LSE before any VLAN tags.
  - In the case of mpls_pop eth_p_mpls(skb->protocol) is a better
    test than skb->protocol != htons(ETH_P_8021Q) as it will give the
    correct behaviour in the presence of other VLAN ethernet types,
    for example 0x88a8 which is used by 802.1ad. Moreover, it seems
    correct to update the ethernet type if it was previously set
    according to the top-most MPLS LSE.

* Deaccelerate VLANs when pushing MPLS tags the
  - Since v2.35 MPLS push will insert an MPLS LSE before any VLAN tags.
    This means that if an accelerated tag is present it should be
    deaccelerated to ensure it ends up in the correct position.

* Update skb->mac_len in push_mpls() so that it will be correct
  when used by a subsequent call to pop_mpls().

  As things stand I do not believe this is strictly necessary as
  ovs-vswitchd will not send a pop MPLS action after a push MPLS action.
  However, I have added this in order to code more defensively as I believe
  that if such a sequence did occur it would be rather unobvious why
  it didn't work.

* Do not add skb_cow_head() call in push_mpls().
  It is unnecessary as there is a make_writable() call.
  This change was also made in v2.30 but some how the
  code regressed between then and v2.35.


Differences between v2.35 and v2.34:

* Add support for the tag ordering specified up until OpenFlow 1.2 and
  the ordering specified from OpenFlow 1.3.

* Correct error in datapath patch's handling of GSO in the presence
  of MPLS and absence of VLANs.


To aid review this series is available in git at:

git://github.com/horms/openvswitch.git devel/mpls-v2.42


Patch list and overall diffstat:

Joe Stringer (3):
  odp: Allow VLAN actions after MPLS actions
  ofp-actions: Add separate OpenFlow 1.3 action parser
  lib: Support pushing of MPLS LSE before or after VLAN tag

Simon Horman (2):
  datapath: Break out deacceleration portion of vlan_push
  datapath: Add basic MPLS support to kernel

 datapath/Modules.mk                             |   1 +
 datapath/actions.c                              | 158 ++++++++-
 datapath/datapath.c                             |   4 +-
 datapath/flow.c                                 |  29 ++
 datapath/flow.h                                 |  17 +-
 datapath/flow_netlink.c                         | 286 +++++++++++++--
 datapath/flow_netlink.h                         |   2 +-
 datapath/linux/compat/gso.c                     |  70 +++-
 datapath/linux/compat/gso.h                     |  41 +++
 datapath/linux/compat/include/linux/netdevice.h |   6 +-
 datapath/linux/compat/netdevice.c               |  10 +-
 datapath/mpls.h                                 |  15 +
 include/linux/openvswitch.h                     |   7 +-
 lib/flow.c                                      |   2 +-
 lib/odp-util.c                                  |   9 +-
 lib/odp-util.h                                  |   2 +-
 lib/ofp-actions.c                               |  68 +++-
 lib/ofp-actions.h                               |   9 +
 lib/ofp-print.c                                 |   2 +-
 lib/ofp-util.c                                  |  24 +-
 lib/ofp-util.h                                  |   2 +-
 lib/packets.c                                   |  10 +-
 lib/packets.h                                   |   2 +-
 ofproto/ofproto-dpif-xlate.c                    | 110 ++++--
 ofproto/ofproto-dpif-xlate.h                    |   5 +
 tests/ofproto-dpif.at                           | 446 ++++++++++++++++++++++++
 utilities/ovs-ofctl.c                           |   8 +-
 27 files changed, 1233 insertions(+), 112 deletions(-)
 create mode 100644 datapath/mpls.h

-- 
1.8.4

^ permalink raw reply

* [PATCH v2.42 1/5] odp: Allow VLAN actions after MPLS actions
From: Simon Horman @ 2013-10-04  8:09 UTC (permalink / raw)
  To: dev-yBygre7rU0TnMu66kgdUjQ, netdev-u79uwXL29TY76Z2rM5mHXA,
	Jesse Gross, Ben Pfaff
  Cc: Isaku Yamahata, Ravi K
In-Reply-To: <1380874200-8981-1-git-send-email-horms-/R6kz+dDXgpPR4JQBCEnsQ@public.gmane.org>

From: Joe Stringer <joe-Q1GJJQv1iO6lP80pJB477g@public.gmane.org>

OpenFlow 1.1 and 1.2, and 1.3 differ in their handling of MPLS actions in the
presence of VLAN tags. To allow correct behaviour to be committed in
each situation, this patch adds a second round of VLAN tag action
handling to commit_odp_actions(), which occurs after MPLS actions. This
is implemented with a new field in 'struct xlate_in' called 'vlan_tci'.

When an push_mpls action is composed, the flow's current VLAN state is
stored into xin->vlan_tci, and flow->vlan_tci is set to 0 (pop_vlan). If
a VLAN tag is present, it is stripped; if not, then there is no change.
Any later modifications to the VLAN state is written to xin->vlan_tci.
When committing the actions, flow->vlan_tci is used before MPLS actions,
and xin->vlan_tci is used afterwards. This retains the current datapath
behaviour, but allows VLAN actions to be applied in a more flexible
manner.

Both before and after this patch MPLS LSEs are pushed onto a packet after
any VLAN tags that may be present. This is the behaviour described in
OpenFlow 1.1 and 1.2. OpenFlow 1.3 specifies that MPLS LSEs should be
pushed onto a packet before any VLAN tags that are present. Support
for this will be added by a subsequent patch that makes use of
the infrastructure added by this patch.

Signed-off-by: Joe Stringer <joe-Q1GJJQv1iO6lP80pJB477g@public.gmane.org>
Signed-off-by: Simon Horman <horms-/R6kz+dDXgpPR4JQBCEnsQ@public.gmane.org>

---

v2.41
* Rework comments to make things a little clearer

v2.40
* Rebase for removal of mpls_depth from struct flow

v2.38 - v2.39
* No change

v2.37
* Rebase

v2.36
* No change

v2.5
* First post
---
 lib/odp-util.c               |   9 +-
 lib/odp-util.h               |   2 +-
 ofproto/ofproto-dpif-xlate.c | 101 ++++++++++++++++-----
 ofproto/ofproto-dpif-xlate.h |   5 ++
 tests/ofproto-dpif.at        | 209 +++++++++++++++++++++++++++++++++++++++++++
 5 files changed, 303 insertions(+), 23 deletions(-)

diff --git a/lib/odp-util.c b/lib/odp-util.c
index 5c7ccfb..4028167 100644
--- a/lib/odp-util.c
+++ b/lib/odp-util.c
@@ -3600,11 +3600,15 @@ commit_set_pkt_mark_action(const struct flow *flow, struct flow *base,
  * key from 'base' into 'flow', and then changes 'base' the same way.  Does not
  * commit set_tunnel actions.  Users should call commit_odp_tunnel_action()
  * in addition to this function if needed.  Sets fields in 'wc' that are
- * used as part of the action. */
+ * used as part of the action.
+ *
+ * VLAN actions may be committed twice; If vlan_tci in 'flow' differs from the
+ * one in 'base', then it is committed before MPLS actions. If 'final_vlan_tci'
+ * differs from 'flow->vlan_tci', it is committed afterwards. */
 void
 commit_odp_actions(const struct flow *flow, struct flow *base,
                    struct ofpbuf *odp_actions, struct flow_wildcards *wc,
-                   int *mpls_depth_delta)
+                   int *mpls_depth_delta, ovs_be16 final_vlan_tci)
 {
     commit_set_ether_addr_action(flow, base, odp_actions, wc);
     commit_vlan_action(flow->vlan_tci, base, odp_actions, wc);
@@ -3615,6 +3619,7 @@ commit_odp_actions(const struct flow *flow, struct flow *base,
      * that it is no longer IP and thus nw and port actions are no longer valid.
      */
     commit_mpls_action(flow, base, odp_actions, wc, mpls_depth_delta);
+    commit_vlan_action(final_vlan_tci, base, odp_actions, wc);
     commit_set_priority_action(flow, base, odp_actions, wc);
     commit_set_pkt_mark_action(flow, base, odp_actions, wc);
 }
diff --git a/lib/odp-util.h b/lib/odp-util.h
index 2712cb0..a7bca43 100644
--- a/lib/odp-util.h
+++ b/lib/odp-util.h
@@ -143,7 +143,7 @@ void commit_odp_tunnel_action(const struct flow *, struct flow *base,
                               struct ofpbuf *odp_actions);
 void commit_odp_actions(const struct flow *, struct flow *base,
                         struct ofpbuf *odp_actions, struct flow_wildcards *wc,
-                        int *mpls_depth_delta);
+                        int *mpls_depth_delta, ovs_be16 final_vlan_tci);
 \f
 /* ofproto-dpif interface.
  *
diff --git a/ofproto/ofproto-dpif-xlate.c b/ofproto/ofproto-dpif-xlate.c
index cced7cc..6757933 100644
--- a/ofproto/ofproto-dpif-xlate.c
+++ b/ofproto/ofproto-dpif-xlate.c
@@ -982,10 +982,11 @@ static void
 output_normal(struct xlate_ctx *ctx, const struct xbundle *out_xbundle,
               uint16_t vlan)
 {
-    ovs_be16 *flow_tci = &ctx->xin->flow.vlan_tci;
+    ovs_be16 *flow_tci = &ctx->xin->vlan_tci;
     uint16_t vid;
     ovs_be16 tci, old_tci;
     struct xport *xport;
+    bool flow_tci_equal_to_xin = (*flow_tci == ctx->xin->flow.vlan_tci);
 
     vid = output_vlan_to_vid(out_xbundle, vlan);
     if (list_is_empty(&out_xbundle->xports)) {
@@ -1016,9 +1017,15 @@ output_normal(struct xlate_ctx *ctx, const struct xbundle *out_xbundle,
         }
     }
     *flow_tci = tci;
+    if (flow_tci_equal_to_xin) {
+        ctx->xin->flow.vlan_tci = tci;
+    }
 
     compose_output_action(ctx, xport->ofp_port);
     *flow_tci = old_tci;
+    if (flow_tci_equal_to_xin) {
+        ctx->xin->flow.vlan_tci = old_tci;
+    }
 }
 
 /* A VM broadcasts a gratuitous ARP to indicate that it has resumed after
@@ -1251,7 +1258,7 @@ xlate_normal(struct xlate_ctx *ctx)
 
     /* Drop malformed frames. */
     if (flow->dl_type == htons(ETH_TYPE_VLAN) &&
-        !(flow->vlan_tci & htons(VLAN_CFI))) {
+        !(ctx->xin->vlan_tci & htons(VLAN_CFI))) {
         if (ctx->xin->packet != NULL) {
             static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
             VLOG_WARN_RL(&rl, "bridge %s: dropping packet with partial "
@@ -1275,7 +1282,7 @@ xlate_normal(struct xlate_ctx *ctx)
     }
 
     /* Check VLAN. */
-    vid = vlan_tci_to_vid(flow->vlan_tci);
+    vid = vlan_tci_to_vid(ctx->xin->vlan_tci);
     if (!input_vid_is_valid(vid, in_xbundle, ctx->xin->packet != NULL)) {
         xlate_report(ctx, "disallowed VLAN VID for this input port, dropping");
         return;
@@ -1533,7 +1540,7 @@ compose_output_action__(struct xlate_ctx *ctx, ofp_port_t ofp_port,
     const struct xport *xport = get_ofp_port(ctx->xbridge, ofp_port);
     struct flow_wildcards *wc = &ctx->xout->wc;
     struct flow *flow = &ctx->xin->flow;
-    ovs_be16 flow_vlan_tci;
+    ovs_be16 flow_vlan_tci, xin_vlan_tci;
     uint32_t flow_pkt_mark;
     uint8_t flow_nw_tos;
     odp_port_t out_port, odp_port;
@@ -1602,6 +1609,7 @@ compose_output_action__(struct xlate_ctx *ctx, ofp_port_t ofp_port,
     }
 
     flow_vlan_tci = flow->vlan_tci;
+    xin_vlan_tci = ctx->xin->vlan_tci;
     flow_pkt_mark = flow->pkt_mark;
     flow_nw_tos = flow->nw_tos;
 
@@ -1641,19 +1649,20 @@ compose_output_action__(struct xlate_ctx *ctx, ofp_port_t ofp_port,
             wc->masks.vlan_tci |= htons(VLAN_VID_MASK | VLAN_CFI);
         }
         vlandev_port = vsp_realdev_to_vlandev(ctx->xbridge->ofproto, ofp_port,
-                                              flow->vlan_tci);
+                                              ctx->xin->vlan_tci);
         if (vlandev_port == ofp_port) {
             out_port = odp_port;
         } else {
             out_port = ofp_port_to_odp_port(ctx->xbridge, vlandev_port);
             flow->vlan_tci = htons(0);
+            ctx->xin->vlan_tci = htons(0);
         }
     }
 
     if (out_port != ODPP_NONE) {
         commit_odp_actions(flow, &ctx->base_flow,
                            &ctx->xout->odp_actions, &ctx->xout->wc,
-                           &ctx->mpls_depth_delta);
+                           &ctx->mpls_depth_delta, ctx->xin->vlan_tci);
         nl_msg_put_odp_port(&ctx->xout->odp_actions, OVS_ACTION_ATTR_OUTPUT,
                             out_port);
 
@@ -1665,6 +1674,7 @@ compose_output_action__(struct xlate_ctx *ctx, ofp_port_t ofp_port,
  out:
     /* Restore flow */
     flow->vlan_tci = flow_vlan_tci;
+    ctx->xin->vlan_tci = xin_vlan_tci;
     flow->pkt_mark = flow_pkt_mark;
     flow->nw_tos = flow_nw_tos;
 }
@@ -1809,7 +1819,7 @@ execute_controller_action(struct xlate_ctx *ctx, int len,
 
     commit_odp_actions(&ctx->xin->flow, &ctx->base_flow,
                        &ctx->xout->odp_actions, &ctx->xout->wc,
-                       &ctx->mpls_depth_delta);
+                       &ctx->mpls_depth_delta, ctx->xin->vlan_tci);
 
     odp_execute_actions(NULL, packet, &key, ctx->xout->odp_actions.data,
                         ctx->xout->odp_actions.size, NULL, NULL);
@@ -2197,7 +2207,7 @@ xlate_sample_action(struct xlate_ctx *ctx,
 
   commit_odp_actions(&ctx->xin->flow, &ctx->base_flow,
                      &ctx->xout->odp_actions, &ctx->xout->wc,
-                     &ctx->mpls_depth_delta);
+                     &ctx->mpls_depth_delta, ctx->xin->vlan_tci);
 
   compose_flow_sample_cookie(os->probability, os->collector_set_id,
                              os->obs_domain_id, os->obs_point_id, &cookie);
@@ -2226,13 +2236,35 @@ may_receive(const struct xport *xport, struct xlate_ctx *ctx)
 }
 
 static void
+vlan_tci_restore(struct xlate_in *xin, ovs_be16 *tci_ptr, ovs_be16 orig_tci)
+{
+    /* If MPLS actions were executed after vlan actions then
+     * copy the final vlan_tci out and restore the intermediate VLAN state. */
+    if (xin->flow.vlan_tci != orig_tci && tci_ptr == &xin->vlan_tci) {
+        xin->vlan_tci = xin->flow.vlan_tci;
+        xin->flow.vlan_tci = orig_tci;
+    }
+}
+
+static void
 do_xlate_actions(const struct ofpact *ofpacts, size_t ofpacts_len,
                  struct xlate_ctx *ctx)
 {
     struct flow_wildcards *wc = &ctx->xout->wc;
     struct flow *flow = &ctx->xin->flow;
+    ovs_be16 *vlan_tci;
     const struct ofpact *a;
 
+
+    /* VLAN actions are stored in '*vlan_tci'. This variable initially
+     * points to 'xin->flow->vlan_tci', so that VLAN actions are applied
+     * before any MPLS actions. When an MPLS action is translated,
+     * 'vlan_tci' is updated to point to 'xin->vlan_tci'. This causes later
+     * VLAN actions to be applied after MPLS actions.  For each iteration
+     * of the loop 'xin->vlan_tci' is updated to reflect the final VLAN
+     * state of the flow. */
+    vlan_tci = &ctx->xin->flow.vlan_tci;
+
     OFPACT_FOR_EACH (a, ofpacts, ofpacts_len) {
         struct ofpact_controller *controller;
         const struct ofpact_metadata *metadata;
@@ -2241,6 +2273,9 @@ do_xlate_actions(const struct ofpact *ofpacts, size_t ofpacts_len,
             break;
         }
 
+        /* Update the final vlan state to match the current state. */
+        ctx->xin->vlan_tci = *vlan_tci;
+
         switch (a->type) {
         case OFPACT_OUTPUT:
             xlate_output_action(ctx, ofpact_get_OUTPUT(a)->port,
@@ -2264,28 +2299,28 @@ do_xlate_actions(const struct ofpact *ofpacts, size_t ofpacts_len,
 
         case OFPACT_SET_VLAN_VID:
             wc->masks.vlan_tci |= htons(VLAN_VID_MASK | VLAN_CFI);
-            flow->vlan_tci &= ~htons(VLAN_VID_MASK);
-            flow->vlan_tci |= (htons(ofpact_get_SET_VLAN_VID(a)->vlan_vid)
-                               | htons(VLAN_CFI));
+            *vlan_tci &= ~htons(VLAN_VID_MASK);
+            *vlan_tci |= (htons(ofpact_get_SET_VLAN_VID(a)->vlan_vid)
+                          | htons(VLAN_CFI));
             break;
 
         case OFPACT_SET_VLAN_PCP:
-            wc->masks.vlan_tci |= htons(VLAN_PCP_MASK | VLAN_CFI);
-            flow->vlan_tci &= ~htons(VLAN_PCP_MASK);
-            flow->vlan_tci |=
+            wc->masks.vlan_tci |= htons(VLAN_VID_MASK | VLAN_CFI);
+            *vlan_tci &= ~htons(VLAN_PCP_MASK);
+            *vlan_tci |=
                 htons((ofpact_get_SET_VLAN_PCP(a)->vlan_pcp << VLAN_PCP_SHIFT)
                       | VLAN_CFI);
             break;
 
         case OFPACT_STRIP_VLAN:
             memset(&wc->masks.vlan_tci, 0xff, sizeof wc->masks.vlan_tci);
-            flow->vlan_tci = htons(0);
+            *vlan_tci = htons(0);
             break;
 
         case OFPACT_PUSH_VLAN:
             /* XXX 802.1AD(QinQ) */
             memset(&wc->masks.vlan_tci, 0xff, sizeof wc->masks.vlan_tci);
-            flow->vlan_tci = htons(VLAN_CFI);
+            *vlan_tci = htons(VLAN_CFI);
             break;
 
         case OFPACT_SET_ETH_SRC:
@@ -2353,29 +2388,54 @@ do_xlate_actions(const struct ofpact *ofpacts, size_t ofpacts_len,
             flow->skb_priority = ctx->orig_skb_priority;
             break;
 
-        case OFPACT_REG_MOVE:
+        case OFPACT_REG_MOVE: {
+            ovs_be16 orig_tci = flow->vlan_tci;
             nxm_execute_reg_move(ofpact_get_REG_MOVE(a), flow, wc);
+            vlan_tci_restore(ctx->xin, vlan_tci, orig_tci);
             break;
+        }
 
-        case OFPACT_REG_LOAD:
+        case OFPACT_REG_LOAD: {
+            ovs_be16 orig_tci = flow->vlan_tci;
             nxm_execute_reg_load(ofpact_get_REG_LOAD(a), flow);
+            vlan_tci_restore(ctx->xin, vlan_tci, orig_tci);
             break;
+        }
 
-        case OFPACT_STACK_PUSH:
+        case OFPACT_STACK_PUSH: {
+            ovs_be16 orig_tci = flow->vlan_tci;
+            flow->vlan_tci = *vlan_tci;
             nxm_execute_stack_push(ofpact_get_STACK_PUSH(a), flow, wc,
                                    &ctx->stack);
+            flow->vlan_tci = orig_tci;
             break;
+        }
 
-        case OFPACT_STACK_POP:
+        case OFPACT_STACK_POP: {
+            ovs_be16 orig_tci = flow->vlan_tci;
             nxm_execute_stack_pop(ofpact_get_STACK_POP(a), flow, wc,
                                   &ctx->stack);
+            vlan_tci_restore(ctx->xin, vlan_tci, orig_tci);
             break;
+        }
 
         case OFPACT_PUSH_MPLS:
             if (compose_mpls_push_action(ctx,
                                          ofpact_get_PUSH_MPLS(a)->ethertype)) {
                 return;
             }
+
+            /* Save and pop any existing VLAN tags. They will be pushed
+             * back onto the packet after pushing the MPLS LSE. The overall
+             * effect is to push the MPLS LSE after any VLAN tags that may
+             * be present. This is the behaviour described for OpenFlow 1.1
+             * and 1.2. This code needs to be enhanced to make this
+             * conditional to also and support pushing the MPLS LSE before
+             * any VLAN tags that may be present, the behaviour described
+             * for OpenFlow 1.3. */
+            ctx->xin->vlan_tci = *vlan_tci;
+            flow->vlan_tci = htons(0);
+            vlan_tci = &ctx->xin->vlan_tci;
             break;
 
         case OFPACT_POP_MPLS:
@@ -2477,6 +2537,7 @@ xlate_in_init(struct xlate_in *xin, struct ofproto_dpif *ofproto,
 {
     xin->ofproto = ofproto;
     xin->flow = *flow;
+    xin->vlan_tci = flow->vlan_tci;
     xin->packet = packet;
     xin->may_learn = packet != NULL;
     xin->rule = rule;
diff --git a/ofproto/ofproto-dpif-xlate.h b/ofproto/ofproto-dpif-xlate.h
index 6403f50..54fd36d 100644
--- a/ofproto/ofproto-dpif-xlate.h
+++ b/ofproto/ofproto-dpif-xlate.h
@@ -60,6 +60,11 @@ struct xlate_in {
      * this flow when actions change header fields. */
     struct flow flow;
 
+    /* If MPLS and VLAN actions were both present in the translation, and VLAN
+     * actions should occur after the MPLS actions, then this field is used
+     * to store the final vlan_tci state. */
+    ovs_be16 vlan_tci;
+
     /* The packet corresponding to 'flow', or a null pointer if we are
      * revalidating without a packet to refer to. */
     const struct ofpbuf *packet;
diff --git a/tests/ofproto-dpif.at b/tests/ofproto-dpif.at
index ea36020..3d57f37 100644
--- a/tests/ofproto-dpif.at
+++ b/tests/ofproto-dpif.at
@@ -869,6 +869,215 @@ done
 OVS_VSWITCHD_STOP
 AT_CLEANUP
 
+AT_SETUP([ofproto-dpif - OF1.2 VLAN+MPLS handling])
+OVS_VSWITCHD_START([dnl
+   add-port br0 p1 -- set Interface p1 type=dummy
+])
+ON_EXIT([kill `cat ovs-ofctl.pid`])
+
+AT_CAPTURE_FILE([ofctl_monitor.log])
+AT_DATA([flows.txt], [dnl
+cookie=0xa dl_src=40:44:44:44:54:50 actions=push_mpls:0x8847,load:10->OXM_OF_MPLS_LABEL[[]],push_vlan:0x8100,mod_vlan_vid:99,mod_vlan_pcp:1,controller
+cookie=0xa dl_src=40:44:44:44:54:51 actions=push_mpls:0x8847,load:10->OXM_OF_MPLS_LABEL[[]],push_vlan:0x8100,mod_vlan_vid:99,mod_vlan_pcp:1,controller
+cookie=0xa dl_src=40:44:44:44:54:52 actions=push_mpls:0x8847,load:10->OXM_OF_MPLS_LABEL[[]],push_vlan:0x8100,load:99->OXM_OF_VLAN_VID[[]],mod_vlan_pcp:1,controller
+cookie=0xa dl_src=40:44:44:44:54:53 actions=push_mpls:0x8847,load:10->OXM_OF_MPLS_LABEL[[]],push_vlan:0x8100,load:99->OXM_OF_VLAN_VID[[]],mod_vlan_pcp:1,controller
+cookie=0xa dl_src=40:44:44:44:54:54 actions=push_vlan:0x8100,mod_vlan_vid:99,mod_vlan_pcp:1,push_mpls:0x8847,load:10->OXM_OF_MPLS_LABEL[[]],controller
+cookie=0xa dl_src=40:44:44:44:54:55 actions=push_vlan:0x8100,mod_vlan_vid:99,mod_vlan_pcp:1,push_mpls:0x8847,load:10->OXM_OF_MPLS_LABEL[[]],controller
+cookie=0xa dl_src=40:44:44:44:54:56 actions=push_vlan:0x8100,load:99->OXM_OF_VLAN_VID[[]],mod_vlan_pcp:1,push_mpls:0x8847,load:10->OXM_OF_MPLS_LABEL[[]],controller
+cookie=0xa dl_src=40:44:44:44:54:57 actions=push_vlan:0x8100,load:99->OXM_OF_VLAN_VID[[]],mod_vlan_pcp:1,push_mpls:0x8847,load:10->OXM_OF_MPLS_LABEL[[]],controller
+])
+AT_CHECK([ovs-ofctl --protocols=OpenFlow12 add-flows br0 flows.txt])
+
+dnl Modified MPLS controller action.
+dnl In this test, we push the MPLS tag before pushing a VLAN tag, so we see
+dnl both of these in the final flow
+AT_CHECK([ovs-ofctl monitor br0 65534 -P nxm --detach --pidfile 2> ofctl_monitor.log])
+
+for i in 1 2 3; do
+    ovs-appctl netdev-dummy/receive p1 'in_port(1),eth(src=40:44:44:44:54:50,dst=50:54:00:00:00:07),eth_type(0x0800),ipv4(src=192.168.0.1,dst=192.168.0.2,proto=6,tos=0,ttl=64,frag=no)'
+done
+OVS_WAIT_UNTIL([ovs-appctl -t ovs-ofctl exit])
+
+AT_CHECK([cat ofctl_monitor.log], [0], [dnl
+NXT_PACKET_IN (xid=0x0): cookie=0xa total_len=68 in_port=1 (via action) data_len=68 (unbuffered)
+mpls,metadata=0,in_port=0,dl_vlan=99,dl_vlan_pcp=1,dl_src=40:44:44:44:54:50,dl_dst=50:54:00:00:00:07,mpls_label=10,mpls_tc=0,mpls_ttl=64,mpls_bos=1
+dnl
+NXT_PACKET_IN (xid=0x0): cookie=0xa total_len=68 in_port=1 (via action) data_len=68 (unbuffered)
+mpls,metadata=0,in_port=0,dl_vlan=99,dl_vlan_pcp=1,dl_src=40:44:44:44:54:50,dl_dst=50:54:00:00:00:07,mpls_label=10,mpls_tc=0,mpls_ttl=64,mpls_bos=1
+dnl
+NXT_PACKET_IN (xid=0x0): cookie=0xa total_len=68 in_port=1 (via action) data_len=68 (unbuffered)
+mpls,metadata=0,in_port=0,dl_vlan=99,dl_vlan_pcp=1,dl_src=40:44:44:44:54:50,dl_dst=50:54:00:00:00:07,mpls_label=10,mpls_tc=0,mpls_ttl=64,mpls_bos=1
+])
+
+dnl Modified MPLS controller action.
+dnl In this test, the input packet in vlan-tagged, which should be stripped
+dnl before we push the MPLS and VLAN tags.
+AT_CHECK([ovs-ofctl monitor br0 65534 -P nxm --detach --pidfile 2> ofctl_monitor.log])
+
+for i in 1 2 3; do
+    ovs-appctl netdev-dummy/receive p1 'in_port(1),eth(src=40:44:44:44:54:51,dst=50:54:00:00:00:07),eth_type(0x8100),vlan(vid=88,pcp=7),encap(eth_type(0x0800),ipv4(src=192.168.0.1,dst=192.168.0.2,proto=6,tos=0,ttl=64,frag=no))'
+done
+OVS_WAIT_UNTIL([test `wc -l < ofctl_monitor.log` -ge 6])
+ovs-appctl -t ovs-ofctl exit
+
+AT_CHECK([cat ofctl_monitor.log], [0], [dnl
+NXT_PACKET_IN (xid=0x0): cookie=0xa total_len=64 in_port=1 (via action) data_len=64 (unbuffered)
+mpls,metadata=0,in_port=0,dl_vlan=99,dl_vlan_pcp=1,dl_src=40:44:44:44:54:51,dl_dst=50:54:00:00:00:07,mpls_label=10,mpls_tc=0,mpls_ttl=64,mpls_bos=1
+dnl
+NXT_PACKET_IN (xid=0x0): cookie=0xa total_len=64 in_port=1 (via action) data_len=64 (unbuffered)
+mpls,metadata=0,in_port=0,dl_vlan=99,dl_vlan_pcp=1,dl_src=40:44:44:44:54:51,dl_dst=50:54:00:00:00:07,mpls_label=10,mpls_tc=0,mpls_ttl=64,mpls_bos=1
+dnl
+NXT_PACKET_IN (xid=0x0): cookie=0xa total_len=64 in_port=1 (via action) data_len=64 (unbuffered)
+mpls,metadata=0,in_port=0,dl_vlan=99,dl_vlan_pcp=1,dl_src=40:44:44:44:54:51,dl_dst=50:54:00:00:00:07,mpls_label=10,mpls_tc=0,mpls_ttl=64,mpls_bos=1
+])
+
+dnl Modified MPLS controller action.
+dnl In this test, we push the MPLS tag before pushing a VLAN tag, so we see
+dnl both of these in the final flow
+AT_CHECK([ovs-ofctl monitor br0 65534 -P nxm --detach --pidfile 2> ofctl_monitor.log])
+
+for i in 1 2 3; do
+    ovs-appctl netdev-dummy/receive p1 'in_port(1),eth(src=40:44:44:44:54:52,dst=52:54:00:00:00:07),eth_type(0x0800),ipv4(src=192.168.0.1,dst=192.168.0.2,proto=6,tos=0,ttl=64,frag=no)'
+done
+OVS_WAIT_UNTIL([ovs-appctl -t ovs-ofctl exit])
+
+AT_CHECK([cat ofctl_monitor.log], [0], [dnl
+NXT_PACKET_IN (xid=0x0): cookie=0xa total_len=68 in_port=1 (via action) data_len=68 (unbuffered)
+mpls,metadata=0,in_port=0,dl_vlan=99,dl_vlan_pcp=1,dl_src=40:44:44:44:54:52,dl_dst=52:54:00:00:00:07,mpls_label=10,mpls_tc=0,mpls_ttl=64,mpls_bos=1
+dnl
+NXT_PACKET_IN (xid=0x0): cookie=0xa total_len=68 in_port=1 (via action) data_len=68 (unbuffered)
+mpls,metadata=0,in_port=0,dl_vlan=99,dl_vlan_pcp=1,dl_src=40:44:44:44:54:52,dl_dst=52:54:00:00:00:07,mpls_label=10,mpls_tc=0,mpls_ttl=64,mpls_bos=1
+dnl
+NXT_PACKET_IN (xid=0x0): cookie=0xa total_len=68 in_port=1 (via action) data_len=68 (unbuffered)
+mpls,metadata=0,in_port=0,dl_vlan=99,dl_vlan_pcp=1,dl_src=40:44:44:44:54:52,dl_dst=52:54:00:00:00:07,mpls_label=10,mpls_tc=0,mpls_ttl=64,mpls_bos=1
+])
+
+dnl Modified MPLS controller action.
+dnl In this test, the input packet in vlan-tagged, which should be stripped
+dnl before we push the MPLS and VLAN tags.
+AT_CHECK([ovs-ofctl monitor br0 65534 -P nxm --detach --pidfile 2> ofctl_monitor.log])
+
+for i in 1 2 3; do
+    ovs-appctl netdev-dummy/receive p1 'in_port(1),eth(src=40:44:44:44:54:53,dst=50:54:00:00:00:07),eth_type(0x8100),vlan(vid=88,pcp=7),encap(eth_type(0x0800),ipv4(src=192.168.0.1,dst=192.168.0.2,proto=6,tos=0,ttl=64,frag=no))'
+done
+OVS_WAIT_UNTIL([test `wc -l < ofctl_monitor.log` -ge 6])
+ovs-appctl -t ovs-ofctl exit
+
+AT_CHECK([cat ofctl_monitor.log], [0], [dnl
+NXT_PACKET_IN (xid=0x0): cookie=0xa total_len=64 in_port=1 (via action) data_len=64 (unbuffered)
+mpls,metadata=0,in_port=0,dl_vlan=99,dl_vlan_pcp=1,dl_src=40:44:44:44:54:53,dl_dst=50:54:00:00:00:07,mpls_label=10,mpls_tc=0,mpls_ttl=64,mpls_bos=1
+dnl
+NXT_PACKET_IN (xid=0x0): cookie=0xa total_len=64 in_port=1 (via action) data_len=64 (unbuffered)
+mpls,metadata=0,in_port=0,dl_vlan=99,dl_vlan_pcp=1,dl_src=40:44:44:44:54:53,dl_dst=50:54:00:00:00:07,mpls_label=10,mpls_tc=0,mpls_ttl=64,mpls_bos=1
+dnl
+NXT_PACKET_IN (xid=0x0): cookie=0xa total_len=64 in_port=1 (via action) data_len=64 (unbuffered)
+mpls,metadata=0,in_port=0,dl_vlan=99,dl_vlan_pcp=1,dl_src=40:44:44:44:54:53,dl_dst=50:54:00:00:00:07,mpls_label=10,mpls_tc=0,mpls_ttl=64,mpls_bos=1
+])
+
+dnl Modified MPLS controller action.
+dnl In this test, we push the VLAN tag before pushing a MPLS tag, but these
+dnl actions are reordered, so we see both of these in the final flow.
+AT_CHECK([ovs-ofctl monitor br0 65534 -P nxm --detach --pidfile 2> ofctl_monitor.log])
+
+for i in 1 2 3; do
+    ovs-appctl netdev-dummy/receive p1 'in_port(1),eth(src=40:44:44:44:54:54,dst=50:54:00:00:00:07),eth_type(0x0800),ipv4(src=192.168.0.1,dst=192.168.0.2,proto=6,tos=0,ttl=64,frag=no)'
+done
+OVS_WAIT_UNTIL([test `wc -l < ofctl_monitor.log` -ge 6])
+ovs-appctl -t ovs-ofctl exit
+
+AT_CHECK([cat ofctl_monitor.log], [0], [dnl
+NXT_PACKET_IN (xid=0x0): cookie=0xa total_len=68 in_port=1 (via action) data_len=68 (unbuffered)
+mpls,metadata=0,in_port=0,dl_vlan=99,dl_vlan_pcp=1,dl_src=40:44:44:44:54:54,dl_dst=50:54:00:00:00:07,mpls_label=10,mpls_tc=0,mpls_ttl=64,mpls_bos=1
+dnl
+NXT_PACKET_IN (xid=0x0): cookie=0xa total_len=68 in_port=1 (via action) data_len=68 (unbuffered)
+mpls,metadata=0,in_port=0,dl_vlan=99,dl_vlan_pcp=1,dl_src=40:44:44:44:54:54,dl_dst=50:54:00:00:00:07,mpls_label=10,mpls_tc=0,mpls_ttl=64,mpls_bos=1
+dnl
+NXT_PACKET_IN (xid=0x0): cookie=0xa total_len=68 in_port=1 (via action) data_len=68 (unbuffered)
+mpls,metadata=0,in_port=0,dl_vlan=99,dl_vlan_pcp=1,dl_src=40:44:44:44:54:54,dl_dst=50:54:00:00:00:07,mpls_label=10,mpls_tc=0,mpls_ttl=64,mpls_bos=1
+])
+
+dnl Modified MPLS controller action.
+dnl In this test, the input packet in vlan-tagged, which should be stripped
+dnl before we push the MPLS and VLAN tags.
+AT_CHECK([ovs-ofctl monitor br0 65534 -P nxm --detach --pidfile 2> ofctl_monitor.log])
+
+for i in 1 2 3; do
+    ovs-appctl netdev-dummy/receive p1 'in_port(1),eth(src=40:44:44:44:54:55,dst=50:54:00:00:00:07),eth_type(0x8100),vlan(vid=88,pcp=7),encap(eth_type(0x0800),ipv4(src=192.168.0.1,dst=192.168.0.2,proto=6,tos=0,ttl=64,frag=no))'
+done
+OVS_WAIT_UNTIL([test `wc -l < ofctl_monitor.log` -ge 6])
+ovs-appctl -t ovs-ofctl exit
+
+AT_CHECK([cat ofctl_monitor.log], [0], [dnl
+NXT_PACKET_IN (xid=0x0): cookie=0xa total_len=64 in_port=1 (via action) data_len=64 (unbuffered)
+mpls,metadata=0,in_port=0,dl_vlan=99,dl_vlan_pcp=1,dl_src=40:44:44:44:54:55,dl_dst=50:54:00:00:00:07,mpls_label=10,mpls_tc=0,mpls_ttl=64,mpls_bos=1
+dnl
+NXT_PACKET_IN (xid=0x0): cookie=0xa total_len=64 in_port=1 (via action) data_len=64 (unbuffered)
+mpls,metadata=0,in_port=0,dl_vlan=99,dl_vlan_pcp=1,dl_src=40:44:44:44:54:55,dl_dst=50:54:00:00:00:07,mpls_label=10,mpls_tc=0,mpls_ttl=64,mpls_bos=1
+dnl
+NXT_PACKET_IN (xid=0x0): cookie=0xa total_len=64 in_port=1 (via action) data_len=64 (unbuffered)
+mpls,metadata=0,in_port=0,dl_vlan=99,dl_vlan_pcp=1,dl_src=40:44:44:44:54:55,dl_dst=50:54:00:00:00:07,mpls_label=10,mpls_tc=0,mpls_ttl=64,mpls_bos=1
+])
+
+dnl Modified MPLS controller action.
+dnl In this test, we push the VLAN tag before pushing a MPLS tag, but these
+dnl actions are reordered, so we see both of these in the final flow.
+AT_CHECK([ovs-ofctl monitor br0 65534 -P nxm --detach --pidfile 2> ofctl_monitor.log])
+
+for i in 1 2 3; do
+    ovs-appctl netdev-dummy/receive p1 'in_port(1),eth(src=40:44:44:44:54:56,dst=50:54:00:00:00:07),eth_type(0x0800),ipv4(src=192.168.0.1,dst=192.168.0.2,proto=6,tos=0,ttl=64,frag=no)'
+done
+OVS_WAIT_UNTIL([test `wc -l < ofctl_monitor.log` -ge 6])
+ovs-appctl -t ovs-ofctl exit
+
+AT_CHECK([cat ofctl_monitor.log], [0], [dnl
+NXT_PACKET_IN (xid=0x0): cookie=0xa total_len=68 in_port=1 (via action) data_len=68 (unbuffered)
+mpls,metadata=0,in_port=0,dl_vlan=99,dl_vlan_pcp=1,dl_src=40:44:44:44:54:56,dl_dst=50:54:00:00:00:07,mpls_label=10,mpls_tc=0,mpls_ttl=64,mpls_bos=1
+dnl
+NXT_PACKET_IN (xid=0x0): cookie=0xa total_len=68 in_port=1 (via action) data_len=68 (unbuffered)
+mpls,metadata=0,in_port=0,dl_vlan=99,dl_vlan_pcp=1,dl_src=40:44:44:44:54:56,dl_dst=50:54:00:00:00:07,mpls_label=10,mpls_tc=0,mpls_ttl=64,mpls_bos=1
+dnl
+NXT_PACKET_IN (xid=0x0): cookie=0xa total_len=68 in_port=1 (via action) data_len=68 (unbuffered)
+mpls,metadata=0,in_port=0,dl_vlan=99,dl_vlan_pcp=1,dl_src=40:44:44:44:54:56,dl_dst=50:54:00:00:00:07,mpls_label=10,mpls_tc=0,mpls_ttl=64,mpls_bos=1
+])
+
+dnl Modified MPLS controller action.
+dnl In this test, the input packet in vlan-tagged, which should be stripped
+dnl before we push the MPLS and VLAN tags.
+AT_CHECK([ovs-ofctl monitor br0 65534 -P nxm --detach --pidfile 2> ofctl_monitor.log])
+
+for i in 1 2 3; do
+    ovs-appctl netdev-dummy/receive p1 'in_port(1),eth(src=40:44:44:44:54:57,dst=50:54:00:00:00:07),eth_type(0x8100),vlan(vid=88,pcp=7),encap(eth_type(0x0800),ipv4(src=192.168.0.1,dst=192.168.0.2,proto=6,tos=0,ttl=64,frag=no))'
+done
+OVS_WAIT_UNTIL([test `wc -l < ofctl_monitor.log` -ge 6])
+ovs-appctl -t ovs-ofctl exit
+
+AT_CHECK([cat ofctl_monitor.log], [0], [dnl
+NXT_PACKET_IN (xid=0x0): cookie=0xa total_len=64 in_port=1 (via action) data_len=64 (unbuffered)
+mpls,metadata=0,in_port=0,dl_vlan=99,dl_vlan_pcp=1,dl_src=40:44:44:44:54:57,dl_dst=50:54:00:00:00:07,mpls_label=10,mpls_tc=0,mpls_ttl=64,mpls_bos=1
+dnl
+NXT_PACKET_IN (xid=0x0): cookie=0xa total_len=64 in_port=1 (via action) data_len=64 (unbuffered)
+mpls,metadata=0,in_port=0,dl_vlan=99,dl_vlan_pcp=1,dl_src=40:44:44:44:54:57,dl_dst=50:54:00:00:00:07,mpls_label=10,mpls_tc=0,mpls_ttl=64,mpls_bos=1
+dnl
+NXT_PACKET_IN (xid=0x0): cookie=0xa total_len=64 in_port=1 (via action) data_len=64 (unbuffered)
+mpls,metadata=0,in_port=0,dl_vlan=99,dl_vlan_pcp=1,dl_src=40:44:44:44:54:57,dl_dst=50:54:00:00:00:07,mpls_label=10,mpls_tc=0,mpls_ttl=64,mpls_bos=1
+])
+
+AT_CHECK([ovs-appctl time/warp 5000], [0], [ignore])
+AT_CHECK([ovs-ofctl dump-flows br0 | ofctl_strip | sort], [0], [dnl
+ cookie=0xa, n_packets=3, n_bytes=180, dl_src=40:44:44:44:54:50 actions=push_mpls:0x8847,load:0xa->OXM_OF_MPLS_LABEL[[]],mod_vlan_vid:99,mod_vlan_pcp:1,CONTROLLER:65535
+ cookie=0xa, n_packets=3, n_bytes=180, dl_src=40:44:44:44:54:51 actions=push_mpls:0x8847,load:0xa->OXM_OF_MPLS_LABEL[[]],mod_vlan_vid:99,mod_vlan_pcp:1,CONTROLLER:65535
+ cookie=0xa, n_packets=3, n_bytes=180, dl_src=40:44:44:44:54:52 actions=push_mpls:0x8847,load:0xa->OXM_OF_MPLS_LABEL[[]],load:0x63->OXM_OF_VLAN_VID[[]],mod_vlan_pcp:1,CONTROLLER:65535
+ cookie=0xa, n_packets=3, n_bytes=180, dl_src=40:44:44:44:54:53 actions=push_mpls:0x8847,load:0xa->OXM_OF_MPLS_LABEL[[]],load:0x63->OXM_OF_VLAN_VID[[]],mod_vlan_pcp:1,CONTROLLER:65535
+ cookie=0xa, n_packets=3, n_bytes=180, dl_src=40:44:44:44:54:54 actions=mod_vlan_vid:99,mod_vlan_pcp:1,push_mpls:0x8847,load:0xa->OXM_OF_MPLS_LABEL[[]],CONTROLLER:65535
+ cookie=0xa, n_packets=3, n_bytes=180, dl_src=40:44:44:44:54:55 actions=mod_vlan_vid:99,mod_vlan_pcp:1,push_mpls:0x8847,load:0xa->OXM_OF_MPLS_LABEL[[]],CONTROLLER:65535
+ cookie=0xa, n_packets=3, n_bytes=180, dl_src=40:44:44:44:54:56 actions=load:0x63->OXM_OF_VLAN_VID[[]],mod_vlan_pcp:1,push_mpls:0x8847,load:0xa->OXM_OF_MPLS_LABEL[[]],CONTROLLER:65535
+ cookie=0xa, n_packets=3, n_bytes=180, dl_src=40:44:44:44:54:57 actions=load:0x63->OXM_OF_VLAN_VID[[]],mod_vlan_pcp:1,push_mpls:0x8847,load:0xa->OXM_OF_MPLS_LABEL[[]],CONTROLLER:65535
+NXST_FLOW reply:
+])
+
+OVS_VSWITCHD_STOP
+AT_CLEANUP
+
 AT_SETUP([ofproto-dpif - fragment handling])
 OVS_VSWITCHD_START
 ADD_OF_PORTS([br0], [1], [2], [3], [4], [5], [6], [90])
-- 
1.8.4

^ permalink raw reply related

* [PATCH v2.42 2/5] ofp-actions: Add separate OpenFlow 1.3 action parser
From: Simon Horman @ 2013-10-04  8:09 UTC (permalink / raw)
  To: dev-yBygre7rU0TnMu66kgdUjQ, netdev-u79uwXL29TY76Z2rM5mHXA,
	Jesse Gross, Ben Pfaff
  Cc: Isaku Yamahata, Ravi K
In-Reply-To: <1380874200-8981-1-git-send-email-horms-/R6kz+dDXgpPR4JQBCEnsQ@public.gmane.org>

From: Joe Stringer <joe-Q1GJJQv1iO6lP80pJB477g@public.gmane.org>

This patch adds new ofpact_from_openflow13() and
ofpacts_from_openflow13() functions parallel to the existing ofpact
handling code. In the OpenFlow 1.3 version, push_mpls is handled
differently, but all other actions are handled by the existing code.

In the case of push_mpls for OpenFlow 1.3 the new mpls_before_vlan field of
struct ofpact_push_mpls is set to true.  This will be used by a subsequent
patch to allow allow the correct VLAN+MPLS datapath behaviour to be
determined at odp translation time.

Signed-off-by: Joe Stringer <joe-Q1GJJQv1iO6lP80pJB477g@public.gmane.org>
Signed-off-by: Simon Horman <horms-/R6kz+dDXgpPR4JQBCEnsQ@public.gmane.org>

---

v2.41 [Simon Horman]
* As suggested by Ben Pfaff
  + Expand struct ofpact_reg_load to include a mpls_before_vlan field
    rather than using the compat field of the ofpact field of
    struct ofpact_reg_load.
  + Pass version to  ofpacts_pull_openflow11_actions and
    ofpacts_pull_openflow11_instructions.  This removes the invalid
    assumption that that these functions are passed a full message and are
    thus able to deduce the OpenFlow version.

v2.36 - v2.40
* No change

v2.35 [Joe Stringer]
* First post
---
 lib/ofp-actions.c     | 68 ++++++++++++++++++++++++++++++++++++++++++++++++---
 lib/ofp-actions.h     |  9 +++++++
 lib/ofp-print.c       |  2 +-
 lib/ofp-util.c        | 24 ++++++++++--------
 lib/ofp-util.h        |  2 +-
 utilities/ovs-ofctl.c |  8 +++---
 6 files changed, 93 insertions(+), 20 deletions(-)

diff --git a/lib/ofp-actions.c b/lib/ofp-actions.c
index 65430f3..434d020 100644
--- a/lib/ofp-actions.c
+++ b/lib/ofp-actions.c
@@ -881,6 +881,40 @@ ofpacts_from_openflow11(const union ofp_action *in, size_t n_in,
     return ofpacts_from_openflow(in, n_in, out, ofpact_from_openflow11);
 }
 \f
+static enum ofperr
+ofpact_from_openflow13(const union ofp_action *a, struct ofpbuf *out)
+{
+    enum ofputil_action_code code;
+    enum ofperr error;
+
+    error = decode_openflow11_action(a, &code);
+    if (error) {
+        return error;
+    }
+
+    if (code == OFPUTIL_OFPAT11_PUSH_MPLS) {
+        struct ofpact_push_mpls *oam;
+        struct ofp11_action_push *oap = (struct ofp11_action_push *)a;
+        if (!eth_type_mpls(oap->ethertype)) {
+            return OFPERR_OFPBAC_BAD_ARGUMENT;
+        }
+        oam = ofpact_put_PUSH_MPLS(out);
+        oam->ethertype = oap->ethertype;
+        oam->mpls_before_vlan = true;
+    } else {
+        return ofpact_from_openflow11(a, out);
+    }
+
+    return error;
+}
+
+static enum ofperr
+ofpacts_from_openflow13(const union ofp_action *in, size_t n_in,
+                        struct ofpbuf *out)
+{
+    return ofpacts_from_openflow(in, n_in, out, ofpact_from_openflow13);
+}
+\f
 /* OpenFlow 1.1 instructions. */
 
 #define DEFINE_INST(ENUM, STRUCT, EXTENSIBLE, NAME)             \
@@ -1085,11 +1119,14 @@ get_actions_from_instruction(const struct ofp11_instruction *inst,
     *n_actions = (ntohs(inst->len) - sizeof *inst) / OFP11_INSTRUCTION_ALIGN;
 }
 
-/* Attempts to convert 'actions_len' bytes of OpenFlow 1.1 actions from the
+/* Attempts to convert 'actions_len' bytes of OpenFlow actions from the
  * front of 'openflow' into ofpacts.  On success, replaces any existing content
  * in 'ofpacts' by the converted ofpacts; on failure, clears 'ofpacts'.
  * Returns 0 if successful, otherwise an OpenFlow error.
  *
+ * Actions are processed according to their OpenFlow version which
+ * is provided in the 'version' parameter.
+ *
  * In most places in OpenFlow 1.1 and 1.2, actions appear encapsulated in
  * instructions, so you should call ofpacts_pull_openflow11_instructions()
  * instead of this function.
@@ -1101,15 +1138,27 @@ get_actions_from_instruction(const struct ofp11_instruction *inst,
  * valid in a specific context. */
 enum ofperr
 ofpacts_pull_openflow11_actions(struct ofpbuf *openflow,
+                                enum ofp_version version,
                                 unsigned int actions_len,
                                 struct ofpbuf *ofpacts)
 {
-    return ofpacts_pull_actions(openflow, actions_len, ofpacts,
-                                ofpacts_from_openflow11);
+    switch (version) {
+    case OFP10_VERSION:
+    case OFP11_VERSION:
+    case OFP12_VERSION:
+        return ofpacts_pull_actions(openflow, actions_len, ofpacts,
+                                    ofpacts_from_openflow11);
+    case OFP13_VERSION:
+        return ofpacts_pull_actions(openflow, actions_len, ofpacts,
+                                    ofpacts_from_openflow13);
+    default:
+        NOT_REACHED();
+    }
 }
 
 enum ofperr
 ofpacts_pull_openflow11_instructions(struct ofpbuf *openflow,
+                                     enum ofp_version version,
                                      unsigned int instructions_len,
                                      struct ofpbuf *ofpacts)
 {
@@ -1160,7 +1209,18 @@ ofpacts_pull_openflow11_instructions(struct ofpbuf *openflow,
 
         get_actions_from_instruction(insts[OVSINST_OFPIT11_APPLY_ACTIONS],
                                      &actions, &n_actions);
-        error = ofpacts_from_openflow11(actions, n_actions, ofpacts);
+        switch (version) {
+        case OFP10_VERSION:
+        case OFP11_VERSION:
+        case OFP12_VERSION:
+            error = ofpacts_from_openflow11(actions, n_actions, ofpacts);
+            break;
+        case OFP13_VERSION:
+            error = ofpacts_from_openflow13(actions, n_actions, ofpacts);
+            break;
+        default:
+            NOT_REACHED();
+        }
         if (error) {
             goto exit;
         }
diff --git a/lib/ofp-actions.h b/lib/ofp-actions.h
index 0876ed7..d67fc3f 100644
--- a/lib/ofp-actions.h
+++ b/lib/ofp-actions.h
@@ -332,6 +332,13 @@ struct ofpact_reg_load {
 struct ofpact_push_mpls {
     struct ofpact ofpact;
     ovs_be16 ethertype;
+    bool mpls_before_vlan;  /* If true then the MPLS LSE will be added
+                             * immediately after the ethernet header
+                             * and before any VLAN tags that are present.
+                             * This is the behaviour for OpenFlow 1.3+.
+                             * Otherwise the MPLS LSE will be added after
+                             * any VLAN tags that are present.  This is the
+                             * behaviour for OpenFlow 1.1 and 1.2. */
 };
 
 /* OFPACT_POP_MPLS
@@ -504,9 +511,11 @@ enum ofperr ofpacts_pull_openflow10(struct ofpbuf *openflow,
                                     unsigned int actions_len,
                                     struct ofpbuf *ofpacts);
 enum ofperr ofpacts_pull_openflow11_actions(struct ofpbuf *openflow,
+                                            enum ofp_version version,
                                             unsigned int actions_len,
                                             struct ofpbuf *ofpacts);
 enum ofperr ofpacts_pull_openflow11_instructions(struct ofpbuf *openflow,
+                                                 enum ofp_version version,
                                                  unsigned int instructions_len,
                                                  struct ofpbuf *ofpacts);
 enum ofperr ofpacts_check(const struct ofpact[], size_t ofpacts_len,
diff --git a/lib/ofp-print.c b/lib/ofp-print.c
index 6fe1cee..a0615b5 100644
--- a/lib/ofp-print.c
+++ b/lib/ofp-print.c
@@ -2224,7 +2224,7 @@ ofp_print_group_desc(struct ds *s, const struct ofp_header *oh)
         struct ofputil_group_desc gd;
         int retval;
 
-        retval = ofputil_decode_group_desc_reply(&gd, &b);
+        retval = ofputil_decode_group_desc_reply(&gd, &b, oh->version);
         if (retval) {
             if (retval != EOF) {
                 ds_put_cstr(s, " ***parse error***");
diff --git a/lib/ofp-util.c b/lib/ofp-util.c
index 173b534..570447a 100644
--- a/lib/ofp-util.c
+++ b/lib/ofp-util.c
@@ -1504,7 +1504,8 @@ ofputil_decode_flow_mod(struct ofputil_flow_mod *fm,
             return error;
         }
 
-        error = ofpacts_pull_openflow11_instructions(&b, b.size, ofpacts);
+        error = ofpacts_pull_openflow11_instructions(&b, oh->version,
+                                                     b.size, ofpacts);
         if (error) {
             return error;
         }
@@ -2360,7 +2361,8 @@ ofputil_decode_flow_stats_reply(struct ofputil_flow_stats *fs,
             return EINVAL;
         }
 
-        if (ofpacts_pull_openflow11_instructions(msg, length - sizeof *ofs -
+        if (ofpacts_pull_openflow11_instructions(msg, oh->version,
+                                                 length - sizeof *ofs -
                                                  padded_match_len, ofpacts)) {
             VLOG_WARN_RL(&bad_ofmsg_rl, "OFPST_FLOW reply bad instructions");
             return EINVAL;
@@ -3092,7 +3094,8 @@ ofputil_decode_packet_out(struct ofputil_packet_out *po,
             return error;
         }
 
-        error = ofpacts_pull_openflow11_actions(&b, ntohs(opo->actions_len),
+        error = ofpacts_pull_openflow11_actions(&b, oh->version,
+                                                ntohs(opo->actions_len),
                                                 ofpacts);
         if (error) {
             return error;
@@ -5674,8 +5677,8 @@ ofputil_append_group_desc_reply(const struct ofputil_group_desc *gds,
 }
 
 static enum ofperr
-ofputil_pull_buckets(struct ofpbuf *msg, size_t buckets_length,
-                     struct list *buckets)
+ofputil_pull_buckets(struct ofpbuf *msg, enum ofp_version version,
+                     size_t buckets_length, struct list *buckets)
 {
     struct ofp11_bucket *ob;
 
@@ -5708,8 +5711,8 @@ ofputil_pull_buckets(struct ofpbuf *msg, size_t buckets_length,
         buckets_length -= ob_len;
 
         ofpbuf_init(&ofpacts, 0);
-        error = ofpacts_pull_openflow11_actions(msg, ob_len - sizeof *ob,
-                                                &ofpacts);
+        error = ofpacts_pull_openflow11_actions(msg, version,
+                                                ob_len - sizeof *ob, &ofpacts);
         if (error) {
             ofpbuf_uninit(&ofpacts);
             ofputil_bucket_list_destroy(buckets);
@@ -5745,7 +5748,7 @@ ofputil_pull_buckets(struct ofpbuf *msg, size_t buckets_length,
  * otherwise a positive errno value. */
 int
 ofputil_decode_group_desc_reply(struct ofputil_group_desc *gd,
-                                struct ofpbuf *msg)
+                                struct ofpbuf *msg, enum ofp_version version)
 {
     struct ofp11_group_desc_stats *ogds;
     size_t length;
@@ -5774,7 +5777,8 @@ ofputil_decode_group_desc_reply(struct ofputil_group_desc *gd,
         return OFPERR_OFPBRC_BAD_LEN;
     }
 
-    return ofputil_pull_buckets(msg, length - sizeof *ogds, &gd->buckets);
+    return ofputil_pull_buckets(msg, version, length - sizeof *ogds,
+                                &gd->buckets);
 }
 
 /* Converts abstract group mod 'gm' into a message for OpenFlow version
@@ -5857,7 +5861,7 @@ ofputil_decode_group_mod(const struct ofp_header *oh,
     gm->type = ogm->type;
     gm->group_id = ntohl(ogm->group_id);
 
-    return ofputil_pull_buckets(&msg, msg.size, &gm->buckets);
+    return ofputil_pull_buckets(&msg, oh->version, msg.size, &gm->buckets);
 }
 
 /* Parse a queue status request message into 'oqsr'.
diff --git a/lib/ofp-util.h b/lib/ofp-util.h
index d5f34d7..5fa8fee 100644
--- a/lib/ofp-util.h
+++ b/lib/ofp-util.h
@@ -973,7 +973,7 @@ int ofputil_decode_group_stats_reply(struct ofpbuf *,
                                      struct ofputil_group_stats *);
 
 int ofputil_decode_group_desc_reply(struct ofputil_group_desc *,
-                                    struct ofpbuf *);
+                                    struct ofpbuf *, enum ofp_version);
 
 void ofputil_append_group_desc_reply(const struct ofputil_group_desc *,
                                      struct list *buckets,
diff --git a/utilities/ovs-ofctl.c b/utilities/ovs-ofctl.c
index c2cc1f6..00d35aa 100644
--- a/utilities/ovs-ofctl.c
+++ b/utilities/ovs-ofctl.c
@@ -2968,8 +2968,8 @@ ofctl_parse_ofp11_actions(int argc OVS_UNUSED, char *argv[] OVS_UNUSED)
         /* Convert to ofpacts. */
         ofpbuf_init(&ofpacts, 0);
         size = of11_in.size;
-        error = ofpacts_pull_openflow11_actions(&of11_in, of11_in.size,
-                                                &ofpacts);
+        error = ofpacts_pull_openflow11_actions(&of11_in, OFP11_VERSION,
+                                                of11_in.size, &ofpacts);
         if (error) {
             printf("bad OF1.1 actions: %s\n\n", ofperr_get_name(error));
             ofpbuf_uninit(&ofpacts);
@@ -3036,8 +3036,8 @@ ofctl_parse_ofp11_instructions(int argc OVS_UNUSED, char *argv[] OVS_UNUSED)
         /* Convert to ofpacts. */
         ofpbuf_init(&ofpacts, 0);
         size = of11_in.size;
-        error = ofpacts_pull_openflow11_instructions(&of11_in, of11_in.size,
-                                                     &ofpacts);
+        error = ofpacts_pull_openflow11_instructions(&of11_in, OFP11_VERSION,
+                                                     of11_in.size, &ofpacts);
         if (!error) {
             /* Verify actions. */
             struct flow flow;
-- 
1.8.4

^ permalink raw reply related

* [PATCH v2.42 3/5] lib: Support pushing of MPLS LSE before or after VLAN tag
From: Simon Horman @ 2013-10-04  8:09 UTC (permalink / raw)
  To: dev-yBygre7rU0TnMu66kgdUjQ, netdev-u79uwXL29TY76Z2rM5mHXA,
	Jesse Gross, Ben Pfaff
  Cc: Isaku Yamahata, Ravi K
In-Reply-To: <1380874200-8981-1-git-send-email-horms-/R6kz+dDXgpPR4JQBCEnsQ@public.gmane.org>

From: Joe Stringer <joe-Q1GJJQv1iO6lP80pJB477g@public.gmane.org>

This patch modifies the push_mpls behaviour to allow
pushing of an MPLS LSE either before any VLAN tag that may be present.

Pushing the MPLS LSE before any VLAN tag that is present is the
behaviour specified in OpenFlow 1.3.

Pushing the MPLS LSE after the any VLAN tag that is present is the
behaviour specified in OpenFlow 1.1 and 1.2. This is the only behaviour
that was supported prior to this patch.

When an push_mpls action has been inserted using OpenFlow 1.2 or earlier
the behaviour of pushing the MPLS LSE before any VLAN tag that may be
present is implemented by by inserting VLAN actions around the MPLS push
action during odp translation; Pop VLAN tags before committing MPLS
actions, and push the expected VLAN tag afterwards.

The trigger condition for the two different behaviours is the value of the
mpls_before_vlan field of struct ofpact_push_mpls.  This field is set when
parsing OpenFlow actions.

Signed-off-by: Joe Stringer <joe-Q1GJJQv1iO6lP80pJB477g@public.gmane.org>
Signed-off-by: Simon Horman <horms-/R6kz+dDXgpPR4JQBCEnsQ@public.gmane.org>

---

v2.41 [Simon Horman]
* Use mpls_before_vlan field of struct ofpact_reg_load.
* Reword changelog to describe the difference in behaviour between
  different Open Flow versions.

v2.40 [Simon Horman]
* Trivial rebase for removal of set_ethertype()

v2.36 - v2.39
* No change

v2.35 [Joe Stringer]
* First post
---
 lib/flow.c                   |   2 +-
 lib/packets.c                |  10 +-
 lib/packets.h                |   2 +-
 ofproto/ofproto-dpif-xlate.c |  27 ++---
 tests/ofproto-dpif.at        | 237 +++++++++++++++++++++++++++++++++++++++++++
 5 files changed, 259 insertions(+), 19 deletions(-)

diff --git a/lib/flow.c b/lib/flow.c
index 0678c6f..6242ef9 100644
--- a/lib/flow.c
+++ b/lib/flow.c
@@ -1061,7 +1061,7 @@ flow_compose(struct ofpbuf *b, const struct flow *flow)
     }
 
     if (eth_type_mpls(flow->dl_type)) {
-        b->l2_5 = b->l3;
+        b->l2_5 = (char*)b->l2 + ETH_HEADER_LEN;
         push_mpls(b, flow->dl_type, flow->mpls_lse);
     }
 }
diff --git a/lib/packets.c b/lib/packets.c
index 922c5db..f8a58b6 100644
--- a/lib/packets.c
+++ b/lib/packets.c
@@ -220,11 +220,11 @@ eth_pop_vlan(struct ofpbuf *packet)
 
 /* Set ethertype of the packet. */
 void
-set_ethertype(struct ofpbuf *packet, ovs_be16 eth_type)
+set_ethertype(struct ofpbuf *packet, ovs_be16 eth_type, bool inner)
 {
     struct eth_header *eh = packet->data;
 
-    if (eh->eth_type == htons(ETH_TYPE_VLAN)) {
+    if (inner && eh->eth_type == htons(ETH_TYPE_VLAN)) {
         ovs_be16 *p;
         p = ALIGNED_CAST(ovs_be16 *,
                 (char *)(packet->l2_5 ? packet->l2_5 : packet->l3) - 2);
@@ -332,8 +332,8 @@ push_mpls(struct ofpbuf *packet, ovs_be16 ethtype, ovs_be32 lse)
 
     if (!is_mpls(packet)) {
         /* Set ethtype and MPLS label stack entry. */
-        set_ethertype(packet, ethtype);
-        packet->l2_5 = packet->l3;
+        set_ethertype(packet, ethtype, false);
+        packet->l2_5 = (char*)packet->l2 + ETH_HEADER_LEN;
     }
 
     /* Push new MPLS shim header onto packet. */
@@ -354,7 +354,7 @@ pop_mpls(struct ofpbuf *packet, ovs_be16 ethtype)
         size_t len;
         mh = packet->l2_5;
         len = (char*)packet->l2_5 - (char*)packet->l2;
-        set_ethertype(packet, ethtype);
+        set_ethertype(packet, ethtype, true);
         if (mh->mpls_lse & htonl(MPLS_BOS_MASK)) {
             packet->l2_5 = NULL;
         } else {
diff --git a/lib/packets.h b/lib/packets.h
index 7388152..38fec70 100644
--- a/lib/packets.h
+++ b/lib/packets.h
@@ -143,7 +143,7 @@ void compose_rarp(struct ofpbuf *, const uint8_t eth_src[ETH_ADDR_LEN]);
 void eth_push_vlan(struct ofpbuf *, ovs_be16 tci);
 void eth_pop_vlan(struct ofpbuf *);
 
-void set_ethertype(struct ofpbuf *packet, ovs_be16 eth_type);
+void set_ethertype(struct ofpbuf *packet, ovs_be16 eth_type, bool inner);
 
 const char *eth_from_hex(const char *hex, struct ofpbuf **packetp);
 void eth_format_masked(const uint8_t eth[ETH_ADDR_LEN],
diff --git a/ofproto/ofproto-dpif-xlate.c b/ofproto/ofproto-dpif-xlate.c
index 6757933..2afd760 100644
--- a/ofproto/ofproto-dpif-xlate.c
+++ b/ofproto/ofproto-dpif-xlate.c
@@ -2419,24 +2419,27 @@ do_xlate_actions(const struct ofpact *ofpacts, size_t ofpacts_len,
             break;
         }
 
-        case OFPACT_PUSH_MPLS:
-            if (compose_mpls_push_action(ctx,
-                                         ofpact_get_PUSH_MPLS(a)->ethertype)) {
+        case OFPACT_PUSH_MPLS: {
+            struct ofpact_push_mpls *oam = ofpact_get_PUSH_MPLS(a);
+
+            if (compose_mpls_push_action(ctx, oam->ethertype)) {
                 return;
             }
 
-            /* Save and pop any existing VLAN tags. They will be pushed
-             * back onto the packet after pushing the MPLS LSE. The overall
-             * effect is to push the MPLS LSE after any VLAN tags that may
-             * be present. This is the behaviour described for OpenFlow 1.1
-             * and 1.2. This code needs to be enhanced to make this
-             * conditional to also and support pushing the MPLS LSE before
-             * any VLAN tags that may be present, the behaviour described
-             * for OpenFlow 1.3. */
+            /* Save and pop any existing VLAN tags if the MPLS LSE should
+             * be pushed after VLAN tags.  The overall effect is to push
+             * the MPLS LSE after any VLAN tags that may be present.  This
+             * is the behaviour described for OpenFlow 1.1 and 1.2.
+             * Do not save and therefore pop the VLAN tags if the MPLS LSE
+             * should be pushed before any VLAN tags that are present.
+             * This is the behaviour described for OpenFlow 1.3. */
             ctx->xin->vlan_tci = *vlan_tci;
-            flow->vlan_tci = htons(0);
+            if (!oam->mpls_before_vlan) {
+                flow->vlan_tci = htons(0);
+            }
             vlan_tci = &ctx->xin->vlan_tci;
             break;
+        }
 
         case OFPACT_POP_MPLS:
             if (compose_mpls_pop_action(ctx,
diff --git a/tests/ofproto-dpif.at b/tests/ofproto-dpif.at
index 3d57f37..cb4be48 100644
--- a/tests/ofproto-dpif.at
+++ b/tests/ofproto-dpif.at
@@ -1078,6 +1078,243 @@ NXST_FLOW reply:
 OVS_VSWITCHD_STOP
 AT_CLEANUP
 
+AT_SETUP([ofproto-dpif - OF1.3+ VLAN+MPLS handling])
+OVS_VSWITCHD_START([dnl
+   add-port br0 p1 -- set Interface p1 type=dummy
+])
+ON_EXIT([kill `cat ovs-ofctl.pid`])
+
+AT_CAPTURE_FILE([ofctl_monitor.log])
+AT_DATA([flows.txt], [dnl
+cookie=0xa dl_src=40:44:44:44:55:44 actions=push_mpls:0x8847,load:10->OXM_OF_MPLS_LABEL[[]],load:3->OXM_OF_MPLS_TC[[]],controller
+cookie=0xa dl_src=40:44:44:44:55:45 actions=push_vlan:0x8100,mod_vlan_vid:99,push_mpls:0x8847,load:10->OXM_OF_MPLS_LABEL[[]],load:3->OXM_OF_MPLS_TC[[]],controller
+cookie=0xa dl_src=40:44:44:44:55:46 actions=push_vlan:0x8100,mod_vlan_vid:99,push_mpls:0x8847,load:10->OXM_OF_MPLS_LABEL[[]],load:3->OXM_OF_MPLS_TC[[]],controller
+cookie=0xa dl_src=40:44:44:44:55:47 actions=push_vlan:0x8100,load:99->OXM_OF_VLAN_VID[[]],push_mpls:0x8847,load:10->OXM_OF_MPLS_LABEL[[]],load:3->OXM_OF_MPLS_TC[[]],controller
+cookie=0xa dl_src=40:44:44:44:55:48 actions=push_vlan:0x8100,load:99->OXM_OF_VLAN_VID[[]],push_mpls:0x8847,load:10->OXM_OF_MPLS_LABEL[[]],load:3->OXM_OF_MPLS_TC[[]],controller
+cookie=0xa dl_src=40:44:44:44:55:49 actions=push_mpls:0x8847,load:10->OXM_OF_MPLS_LABEL[[]],load:3->OXM_OF_MPLS_TC[[]],push_vlan:0x8100,mod_vlan_vid:99,controller
+cookie=0xa dl_src=40:44:44:44:55:50 actions=push_mpls:0x8847,load:10->OXM_OF_MPLS_LABEL[[]],load:3->OXM_OF_MPLS_TC[[]],push_vlan:0x8100,mod_vlan_vid:99,controller
+cookie=0xa dl_src=40:44:44:44:55:51 actions=push_mpls:0x8847,load:10->OXM_OF_MPLS_LABEL[[]],load:3->OXM_OF_MPLS_TC[[]],push_vlan:0x8100,load:99->OXM_OF_VLAN_VID[[]],mod_vlan_pcp:1,controller
+cookie=0xa dl_src=40:44:44:44:55:52 actions=push_mpls:0x8847,load:10->OXM_OF_MPLS_LABEL[[]],load:3->OXM_OF_MPLS_TC[[]],push_vlan:0x8100,load:99->OXM_OF_VLAN_VID[[]],mod_vlan_pcp:1,controller
+])
+AT_CHECK([ovs-ofctl --protocols=OpenFlow13 add-flows br0 flows.txt])
+
+dnl Modified MPLS controller action.
+dnl The input packet has a VLAN tag, but because we push an MPLS tag in
+dnl OF1.3 mode, we can no longer see it.
+AT_CHECK([ovs-ofctl monitor br0 65534 -P nxm --detach --pidfile 2> ofctl_monitor.log])
+
+for i in 1 2 3; do
+    ovs-appctl netdev-dummy/receive p1 'in_port(1),eth(src=40:44:44:44:55:44,dst=50:54:00:00:00:07),eth_type(0x8100),vlan(vid=88,pcp=7),encap(eth_type(0x0800),ipv4(src=192.168.0.1,dst=192.168.0.2,proto=6,tos=0,ttl=64,frag=no))'
+done
+OVS_WAIT_UNTIL([test `wc -l < ofctl_monitor.log` -ge 6])
+ovs-appctl -t ovs-ofctl exit
+
+AT_CHECK([cat ofctl_monitor.log], [0], [dnl
+NXT_PACKET_IN (xid=0x0): cookie=0xa total_len=64 in_port=1 (via action) data_len=64 (unbuffered)
+mpls,metadata=0,in_port=0,vlan_tci=0x0000,dl_src=40:44:44:44:55:44,dl_dst=50:54:00:00:00:07,mpls_label=10,mpls_tc=3,mpls_ttl=64,mpls_bos=1
+dnl
+NXT_PACKET_IN (xid=0x0): cookie=0xa total_len=64 in_port=1 (via action) data_len=64 (unbuffered)
+mpls,metadata=0,in_port=0,vlan_tci=0x0000,dl_src=40:44:44:44:55:44,dl_dst=50:54:00:00:00:07,mpls_label=10,mpls_tc=3,mpls_ttl=64,mpls_bos=1
+dnl
+NXT_PACKET_IN (xid=0x0): cookie=0xa total_len=64 in_port=1 (via action) data_len=64 (unbuffered)
+mpls,metadata=0,in_port=0,vlan_tci=0x0000,dl_src=40:44:44:44:55:44,dl_dst=50:54:00:00:00:07,mpls_label=10,mpls_tc=3,mpls_ttl=64,mpls_bos=1
+])
+
+dnl Modified MPLS controller action.
+dnl In this test, we push a VLAN tag, then an MPLS tag in OF1.3 mode, so we
+dnl can only see the MPLS tag in the result.
+AT_CHECK([ovs-ofctl monitor br0 65534 -P nxm --detach --pidfile 2> ofctl_monitor.log])
+
+for i in 1 2 3; do
+    ovs-appctl netdev-dummy/receive p1 'in_port(1),eth(src=40:44:44:44:55:45,dst=50:54:00:00:00:07),eth_type(0x0800),ipv4(src=192.168.0.1,dst=192.168.0.2,proto=6,tos=0,ttl=64,frag=no)'
+done
+OVS_WAIT_UNTIL([test `wc -l < ofctl_monitor.log` -ge 6])
+ovs-appctl -t ovs-ofctl exit
+
+AT_CHECK([cat ofctl_monitor.log], [0], [dnl
+NXT_PACKET_IN (xid=0x0): cookie=0xa total_len=68 in_port=1 (via action) data_len=68 (unbuffered)
+mpls,metadata=0,in_port=0,vlan_tci=0x0000,dl_src=40:44:44:44:55:45,dl_dst=50:54:00:00:00:07,mpls_label=10,mpls_tc=3,mpls_ttl=64,mpls_bos=1
+dnl
+NXT_PACKET_IN (xid=0x0): cookie=0xa total_len=68 in_port=1 (via action) data_len=68 (unbuffered)
+mpls,metadata=0,in_port=0,vlan_tci=0x0000,dl_src=40:44:44:44:55:45,dl_dst=50:54:00:00:00:07,mpls_label=10,mpls_tc=3,mpls_ttl=64,mpls_bos=1
+dnl
+NXT_PACKET_IN (xid=0x0): cookie=0xa total_len=68 in_port=1 (via action) data_len=68 (unbuffered)
+mpls,metadata=0,in_port=0,vlan_tci=0x0000,dl_src=40:44:44:44:55:45,dl_dst=50:54:00:00:00:07,mpls_label=10,mpls_tc=3,mpls_ttl=64,mpls_bos=1
+])
+
+dnl Modified MPLS controller action.
+dnl In this test, the input packet is vlan-tagged; we update this tag then
+dnl push an MPLS tag in OF1.3 mode. As such, we can only see the MPLS tag in
+dnl the result.
+AT_CHECK([ovs-ofctl monitor br0 65534 -P nxm --detach --pidfile 2> ofctl_monitor.log])
+
+for i in 1 2 3; do
+    ovs-appctl netdev-dummy/receive p1 'in_port(1),eth(src=40:44:44:44:55:46,dst=50:54:00:00:00:07),eth_type(0x8100),vlan(vid=88,pcp=7),encap(eth_type(0x0800),ipv4(src=192.168.0.1,dst=192.168.0.2,proto=6,tos=0,ttl=64,frag=no))'
+done
+OVS_WAIT_UNTIL([test `wc -l < ofctl_monitor.log` -ge 6])
+ovs-appctl -t ovs-ofctl exit
+
+AT_CHECK([cat ofctl_monitor.log], [0], [dnl
+NXT_PACKET_IN (xid=0x0): cookie=0xa total_len=64 in_port=1 (via action) data_len=64 (unbuffered)
+mpls,metadata=0,in_port=0,vlan_tci=0x0000,dl_src=40:44:44:44:55:46,dl_dst=50:54:00:00:00:07,mpls_label=10,mpls_tc=3,mpls_ttl=64,mpls_bos=1
+dnl
+NXT_PACKET_IN (xid=0x0): cookie=0xa total_len=64 in_port=1 (via action) data_len=64 (unbuffered)
+mpls,metadata=0,in_port=0,vlan_tci=0x0000,dl_src=40:44:44:44:55:46,dl_dst=50:54:00:00:00:07,mpls_label=10,mpls_tc=3,mpls_ttl=64,mpls_bos=1
+dnl
+NXT_PACKET_IN (xid=0x0): cookie=0xa total_len=64 in_port=1 (via action) data_len=64 (unbuffered)
+mpls,metadata=0,in_port=0,vlan_tci=0x0000,dl_src=40:44:44:44:55:46,dl_dst=50:54:00:00:00:07,mpls_label=10,mpls_tc=3,mpls_ttl=64,mpls_bos=1
+])
+
+dnl Modified MPLS controller action.
+dnl In this test, we push a VLAN tag, then an MPLS tag in OF1.3 mode, so we
+dnl can only see the MPLS tag in the result.
+AT_CHECK([ovs-ofctl monitor br0 65534 -P nxm --detach --pidfile 2> ofctl_monitor.log])
+
+for i in 1 2 3; do
+    ovs-appctl netdev-dummy/receive p1 'in_port(1),eth(src=40:44:44:44:55:47,dst=50:54:00:00:00:07),eth_type(0x0800),ipv4(src=192.168.0.1,dst=192.168.0.2,proto=6,tos=0,ttl=64,frag=no)'
+done
+OVS_WAIT_UNTIL([test `wc -l < ofctl_monitor.log` -ge 6])
+ovs-appctl -t ovs-ofctl exit
+
+AT_CHECK([cat ofctl_monitor.log], [0], [dnl
+NXT_PACKET_IN (xid=0x0): cookie=0xa total_len=68 in_port=1 (via action) data_len=68 (unbuffered)
+mpls,metadata=0,in_port=0,vlan_tci=0x0000,dl_src=40:44:44:44:55:47,dl_dst=50:54:00:00:00:07,mpls_label=10,mpls_tc=3,mpls_ttl=64,mpls_bos=1
+dnl
+NXT_PACKET_IN (xid=0x0): cookie=0xa total_len=68 in_port=1 (via action) data_len=68 (unbuffered)
+mpls,metadata=0,in_port=0,vlan_tci=0x0000,dl_src=40:44:44:44:55:47,dl_dst=50:54:00:00:00:07,mpls_label=10,mpls_tc=3,mpls_ttl=64,mpls_bos=1
+dnl
+NXT_PACKET_IN (xid=0x0): cookie=0xa total_len=68 in_port=1 (via action) data_len=68 (unbuffered)
+mpls,metadata=0,in_port=0,vlan_tci=0x0000,dl_src=40:44:44:44:55:47,dl_dst=50:54:00:00:00:07,mpls_label=10,mpls_tc=3,mpls_ttl=64,mpls_bos=1
+])
+
+dnl Modified MPLS controller action.
+dnl In this test, the input packet is vlan-tagged; we update this tag then
+dnl push an MPLS tag in OF1.3 mode. As such, we can only see the MPLS tag in
+dnl the result.
+AT_CHECK([ovs-ofctl monitor br0 65534 -P nxm --detach --pidfile 2> ofctl_monitor.log])
+
+for i in 1 2 3; do
+    ovs-appctl netdev-dummy/receive p1 'in_port(1),eth(src=40:44:44:44:55:48,dst=50:54:00:00:00:07),eth_type(0x8100),vlan(vid=88,pcp=7),encap(eth_type(0x0800),ipv4(src=192.168.0.1,dst=192.168.0.2,proto=6,tos=0,ttl=64,frag=no))'
+done
+OVS_WAIT_UNTIL([test `wc -l < ofctl_monitor.log` -ge 6])
+ovs-appctl -t ovs-ofctl exit
+
+AT_CHECK([cat ofctl_monitor.log], [0], [dnl
+NXT_PACKET_IN (xid=0x0): cookie=0xa total_len=64 in_port=1 (via action) data_len=64 (unbuffered)
+mpls,metadata=0,in_port=0,vlan_tci=0x0000,dl_src=40:44:44:44:55:48,dl_dst=50:54:00:00:00:07,mpls_label=10,mpls_tc=3,mpls_ttl=64,mpls_bos=1
+dnl
+NXT_PACKET_IN (xid=0x0): cookie=0xa total_len=64 in_port=1 (via action) data_len=64 (unbuffered)
+mpls,metadata=0,in_port=0,vlan_tci=0x0000,dl_src=40:44:44:44:55:48,dl_dst=50:54:00:00:00:07,mpls_label=10,mpls_tc=3,mpls_ttl=64,mpls_bos=1
+dnl
+NXT_PACKET_IN (xid=0x0): cookie=0xa total_len=64 in_port=1 (via action) data_len=64 (unbuffered)
+mpls,metadata=0,in_port=0,vlan_tci=0x0000,dl_src=40:44:44:44:55:48,dl_dst=50:54:00:00:00:07,mpls_label=10,mpls_tc=3,mpls_ttl=64,mpls_bos=1
+])
+
+dnl Modified MPLS controller action.
+dnl In this test, we push the MPLS tag before pushing a VLAN tag, so we see
+dnl both of these in the final flow.
+AT_CHECK([ovs-ofctl monitor br0 65534 -P nxm --detach --pidfile 2> ofctl_monitor.log])
+
+for i in 1 2 3; do
+    ovs-appctl netdev-dummy/receive p1 'in_port(1),eth(src=40:44:44:44:55:49,dst=50:54:00:00:00:07),eth_type(0x0800),ipv4(src=192.168.0.1,dst=192.168.0.2,proto=6,tos=0,ttl=64,frag=no)'
+done
+OVS_WAIT_UNTIL([test `wc -l < ofctl_monitor.log` -ge 6])
+ovs-appctl -t ovs-ofctl exit
+
+AT_CHECK([cat ofctl_monitor.log], [0], [dnl
+NXT_PACKET_IN (xid=0x0): cookie=0xa total_len=68 in_port=1 (via action) data_len=68 (unbuffered)
+mpls,metadata=0,in_port=0,dl_vlan=99,dl_vlan_pcp=0,dl_src=40:44:44:44:55:49,dl_dst=50:54:00:00:00:07,mpls_label=10,mpls_tc=3,mpls_ttl=64,mpls_bos=1
+dnl
+NXT_PACKET_IN (xid=0x0): cookie=0xa total_len=68 in_port=1 (via action) data_len=68 (unbuffered)
+mpls,metadata=0,in_port=0,dl_vlan=99,dl_vlan_pcp=0,dl_src=40:44:44:44:55:49,dl_dst=50:54:00:00:00:07,mpls_label=10,mpls_tc=3,mpls_ttl=64,mpls_bos=1
+dnl
+NXT_PACKET_IN (xid=0x0): cookie=0xa total_len=68 in_port=1 (via action) data_len=68 (unbuffered)
+mpls,metadata=0,in_port=0,dl_vlan=99,dl_vlan_pcp=0,dl_src=40:44:44:44:55:49,dl_dst=50:54:00:00:00:07,mpls_label=10,mpls_tc=3,mpls_ttl=64,mpls_bos=1
+])
+
+dnl Modified MPLS controller action.
+dnl In this test, the input packet in vlan-tagged, which should be stripped
+dnl before we push the MPLS and VLAN tags.
+AT_CHECK([ovs-ofctl monitor br0 65534 -P nxm --detach --pidfile 2> ofctl_monitor.log])
+
+for i in 1 2 3; do
+    ovs-appctl netdev-dummy/receive p1 'in_port(1),eth(src=40:44:44:44:55:50,dst=50:54:00:00:00:07),eth_type(0x8100),vlan(vid=88,pcp=7),encap(eth_type(0x0800),ipv4(src=192.168.0.1,dst=192.168.0.2,proto=6,tos=0,ttl=64,frag=no))'
+done
+OVS_WAIT_UNTIL([test `wc -l < ofctl_monitor.log` -ge 6])
+ovs-appctl -t ovs-ofctl exit
+
+AT_CHECK([cat ofctl_monitor.log], [0], [dnl
+NXT_PACKET_IN (xid=0x0): cookie=0xa total_len=68 in_port=1 (via action) data_len=68 (unbuffered)
+mpls,metadata=0,in_port=0,dl_vlan=99,dl_vlan_pcp=0,dl_src=40:44:44:44:55:50,dl_dst=50:54:00:00:00:07,mpls_label=10,mpls_tc=3,mpls_ttl=64,mpls_bos=1
+dnl
+NXT_PACKET_IN (xid=0x0): cookie=0xa total_len=68 in_port=1 (via action) data_len=68 (unbuffered)
+mpls,metadata=0,in_port=0,dl_vlan=99,dl_vlan_pcp=0,dl_src=40:44:44:44:55:50,dl_dst=50:54:00:00:00:07,mpls_label=10,mpls_tc=3,mpls_ttl=64,mpls_bos=1
+dnl
+NXT_PACKET_IN (xid=0x0): cookie=0xa total_len=68 in_port=1 (via action) data_len=68 (unbuffered)
+mpls,metadata=0,in_port=0,dl_vlan=99,dl_vlan_pcp=0,dl_src=40:44:44:44:55:50,dl_dst=50:54:00:00:00:07,mpls_label=10,mpls_tc=3,mpls_ttl=64,mpls_bos=1
+])
+
+dnl Modified MPLS controller action.
+dnl In this test, we push the MPLS tag before pushing a VLAN tag, so we see
+dnl both of these in the final flow.
+AT_CHECK([ovs-ofctl monitor br0 65534 -P nxm --detach --pidfile 2> ofctl_monitor.log])
+
+for i in 1 2 3; do
+    ovs-appctl netdev-dummy/receive p1 'in_port(1),eth(src=40:44:44:44:55:51,dst=50:54:00:00:00:07),eth_type(0x0800),ipv4(src=192.168.0.1,dst=192.168.0.2,proto=6,tos=0,ttl=64,frag=no)'
+done
+OVS_WAIT_UNTIL([test `wc -l < ofctl_monitor.log` -ge 6])
+ovs-appctl -t ovs-ofctl exit
+
+AT_CHECK([cat ofctl_monitor.log], [0], [dnl
+NXT_PACKET_IN (xid=0x0): cookie=0xa total_len=68 in_port=1 (via action) data_len=68 (unbuffered)
+mpls,metadata=0,in_port=0,dl_vlan=99,dl_vlan_pcp=1,dl_src=40:44:44:44:55:51,dl_dst=50:54:00:00:00:07,mpls_label=10,mpls_tc=3,mpls_ttl=64,mpls_bos=1
+dnl
+NXT_PACKET_IN (xid=0x0): cookie=0xa total_len=68 in_port=1 (via action) data_len=68 (unbuffered)
+mpls,metadata=0,in_port=0,dl_vlan=99,dl_vlan_pcp=1,dl_src=40:44:44:44:55:51,dl_dst=50:54:00:00:00:07,mpls_label=10,mpls_tc=3,mpls_ttl=64,mpls_bos=1
+dnl
+NXT_PACKET_IN (xid=0x0): cookie=0xa total_len=68 in_port=1 (via action) data_len=68 (unbuffered)
+mpls,metadata=0,in_port=0,dl_vlan=99,dl_vlan_pcp=1,dl_src=40:44:44:44:55:51,dl_dst=50:54:00:00:00:07,mpls_label=10,mpls_tc=3,mpls_ttl=64,mpls_bos=1
+])
+
+dnl Modified MPLS controller action.
+dnl In this test, the input packet in vlan-tagged, which should be stripped
+dnl before we push the MPLS and VLAN tags.
+AT_CHECK([ovs-ofctl monitor br0 65534 -P nxm --detach --pidfile 2> ofctl_monitor.log])
+
+for i in 1 2 3; do
+    ovs-appctl netdev-dummy/receive p1 'in_port(1),eth(src=40:44:44:44:55:52,dst=50:54:00:00:00:07),eth_type(0x8100),vlan(vid=88,pcp=7),encap(eth_type(0x0800),ipv4(src=192.168.0.1,dst=192.168.0.2,proto=6,tos=0,ttl=64,frag=no))'
+done
+OVS_WAIT_UNTIL([test `wc -l < ofctl_monitor.log` -ge 6])
+ovs-appctl -t ovs-ofctl exit
+
+AT_CHECK([cat ofctl_monitor.log], [0], [dnl
+NXT_PACKET_IN (xid=0x0): cookie=0xa total_len=68 in_port=1 (via action) data_len=68 (unbuffered)
+mpls,metadata=0,in_port=0,dl_vlan=99,dl_vlan_pcp=1,dl_src=40:44:44:44:55:52,dl_dst=50:54:00:00:00:07,mpls_label=10,mpls_tc=3,mpls_ttl=64,mpls_bos=1
+dnl
+NXT_PACKET_IN (xid=0x0): cookie=0xa total_len=68 in_port=1 (via action) data_len=68 (unbuffered)
+mpls,metadata=0,in_port=0,dl_vlan=99,dl_vlan_pcp=1,dl_src=40:44:44:44:55:52,dl_dst=50:54:00:00:00:07,mpls_label=10,mpls_tc=3,mpls_ttl=64,mpls_bos=1
+dnl
+NXT_PACKET_IN (xid=0x0): cookie=0xa total_len=68 in_port=1 (via action) data_len=68 (unbuffered)
+mpls,metadata=0,in_port=0,dl_vlan=99,dl_vlan_pcp=1,dl_src=40:44:44:44:55:52,dl_dst=50:54:00:00:00:07,mpls_label=10,mpls_tc=3,mpls_ttl=64,mpls_bos=1
+])
+
+AT_CHECK([ovs-appctl time/warp 5000], [0], [ignore])
+AT_CHECK([ovs-ofctl dump-flows br0 | ofctl_strip | sort], [0], [dnl
+ cookie=0xa, n_packets=3, n_bytes=180, dl_src=40:44:44:44:55:44 actions=push_mpls:0x8847,load:0xa->OXM_OF_MPLS_LABEL[[]],load:0x3->OXM_OF_MPLS_TC[[]],CONTROLLER:65535
+ cookie=0xa, n_packets=3, n_bytes=180, dl_src=40:44:44:44:55:45 actions=mod_vlan_vid:99,push_mpls:0x8847,load:0xa->OXM_OF_MPLS_LABEL[[]],load:0x3->OXM_OF_MPLS_TC[[]],CONTROLLER:65535
+ cookie=0xa, n_packets=3, n_bytes=180, dl_src=40:44:44:44:55:46 actions=mod_vlan_vid:99,push_mpls:0x8847,load:0xa->OXM_OF_MPLS_LABEL[[]],load:0x3->OXM_OF_MPLS_TC[[]],CONTROLLER:65535
+ cookie=0xa, n_packets=3, n_bytes=180, dl_src=40:44:44:44:55:47 actions=load:0x63->OXM_OF_VLAN_VID[[]],push_mpls:0x8847,load:0xa->OXM_OF_MPLS_LABEL[[]],load:0x3->OXM_OF_MPLS_TC[[]],CONTROLLER:65535
+ cookie=0xa, n_packets=3, n_bytes=180, dl_src=40:44:44:44:55:48 actions=load:0x63->OXM_OF_VLAN_VID[[]],push_mpls:0x8847,load:0xa->OXM_OF_MPLS_LABEL[[]],load:0x3->OXM_OF_MPLS_TC[[]],CONTROLLER:65535
+ cookie=0xa, n_packets=3, n_bytes=180, dl_src=40:44:44:44:55:49 actions=push_mpls:0x8847,load:0xa->OXM_OF_MPLS_LABEL[[]],load:0x3->OXM_OF_MPLS_TC[[]],mod_vlan_vid:99,CONTROLLER:65535
+ cookie=0xa, n_packets=3, n_bytes=180, dl_src=40:44:44:44:55:50 actions=push_mpls:0x8847,load:0xa->OXM_OF_MPLS_LABEL[[]],load:0x3->OXM_OF_MPLS_TC[[]],mod_vlan_vid:99,CONTROLLER:65535
+ cookie=0xa, n_packets=3, n_bytes=180, dl_src=40:44:44:44:55:51 actions=push_mpls:0x8847,load:0xa->OXM_OF_MPLS_LABEL[[]],load:0x3->OXM_OF_MPLS_TC[[]],load:0x63->OXM_OF_VLAN_VID[[]],mod_vlan_pcp:1,CONTROLLER:65535
+ cookie=0xa, n_packets=3, n_bytes=180, dl_src=40:44:44:44:55:52 actions=push_mpls:0x8847,load:0xa->OXM_OF_MPLS_LABEL[[]],load:0x3->OXM_OF_MPLS_TC[[]],load:0x63->OXM_OF_VLAN_VID[[]],mod_vlan_pcp:1,CONTROLLER:65535
+NXST_FLOW reply:
+])
+
+OVS_VSWITCHD_STOP
+AT_CLEANUP
+
 AT_SETUP([ofproto-dpif - fragment handling])
 OVS_VSWITCHD_START
 ADD_OF_PORTS([br0], [1], [2], [3], [4], [5], [6], [90])
-- 
1.8.4

^ permalink raw reply related

* [PATCH v2.42 4/5] datapath: Break out deacceleration portion of vlan_push
From: Simon Horman @ 2013-10-04  8:09 UTC (permalink / raw)
  To: dev-yBygre7rU0TnMu66kgdUjQ, netdev-u79uwXL29TY76Z2rM5mHXA,
	Jesse Gross, Ben Pfaff
  Cc: Isaku Yamahata, Ravi K
In-Reply-To: <1380874200-8981-1-git-send-email-horms-/R6kz+dDXgpPR4JQBCEnsQ@public.gmane.org>

Break out deacceleration portion of vlan_push into vlan_put
so that it may be re-used by mpls_push.

For both vlan_push and mpls_push if there is an accelerated VLAN tag
present then it should be deaccelerated, adding it to the data of
the skb, before the new tag is added.

Signed-off-by: Simon Horman <horms-/R6kz+dDXgpPR4JQBCEnsQ@public.gmane.org>

---
v2.40
* As suggested by Jesse Gross
  + Simplify vlan_push by returning an error code
    rather than an error code encoded as a struct xkb_buff *

v2.39
* First post
---
 datapath/actions.c | 29 +++++++++++++++++++----------
 1 file changed, 19 insertions(+), 10 deletions(-)

diff --git a/datapath/actions.c b/datapath/actions.c
index 30ea1d2..d961e5d 100644
--- a/datapath/actions.c
+++ b/datapath/actions.c
@@ -105,22 +105,31 @@ static int pop_vlan(struct sk_buff *skb)
 	return 0;
 }
 
-static int push_vlan(struct sk_buff *skb, const struct ovs_action_push_vlan *vlan)
+/* push down current VLAN tag */
+static int put_vlan(struct sk_buff *skb)
 {
-	if (unlikely(vlan_tx_tag_present(skb))) {
-		u16 current_tag;
+	u16 current_tag = vlan_tx_tag_get(skb);
 
-		/* push down current VLAN tag */
-		current_tag = vlan_tx_tag_get(skb);
+	if (!__vlan_put_tag(skb, skb->vlan_proto, current_tag))
+		return -ENOMEM;
 
-		if (!__vlan_put_tag(skb, skb->vlan_proto, current_tag))
-			return -ENOMEM;
+	if (skb->ip_summed == CHECKSUM_COMPLETE)
+		skb->csum = csum_add(skb->csum, csum_partial(skb->data
+				+ (2 * ETH_ALEN), VLAN_HLEN, 0));
 
-		if (skb->ip_summed == CHECKSUM_COMPLETE)
-			skb->csum = csum_add(skb->csum, csum_partial(skb->data
-					+ (2 * ETH_ALEN), VLAN_HLEN, 0));
+	return 0;
+}
 
+static int push_vlan(struct sk_buff *skb, const struct ovs_action_push_vlan *vlan)
+{
+	if (unlikely(vlan_tx_tag_present(skb))) {
+		int err;
+
+		err = put_vlan(skb);
+		if (unlikely(err))
+			return err;
 	}
+
 	__vlan_hwaccel_put_tag(skb, vlan->vlan_tpid, ntohs(vlan->vlan_tci) & ~VLAN_TAG_PRESENT);
 	return 0;
 }
-- 
1.8.4

^ permalink raw reply related

* [PATCH v2.42 5/5] datapath: Add basic MPLS support to kernel
From: Simon Horman @ 2013-10-04  8:10 UTC (permalink / raw)
  To: dev-yBygre7rU0TnMu66kgdUjQ, netdev-u79uwXL29TY76Z2rM5mHXA,
	Jesse Gross, Ben Pfaff
  Cc: Isaku Yamahata, Ravi K
In-Reply-To: <1380874200-8981-1-git-send-email-horms-/R6kz+dDXgpPR4JQBCEnsQ@public.gmane.org>

Allow datapath to recognize and extract MPLS labels into flow keys
and execute actions which push, pop, and set labels on packets.

Based heavily on work by Leo Alterman, Ravi K, Isaku Yamahata and Joe Stringer.

Cc: Ravi K <rkerur-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
Cc: Leo Alterman <lalterman-l0M0P4e3n4LQT0dZR+AlfA@public.gmane.org>
Cc: Isaku Yamahata <yamahata-jCdQPDEk3idL9jVzuh4AOg@public.gmane.org>
Cc: Joe Stringer <joe-Q1GJJQv1iO6lP80pJB477g@public.gmane.org>
Signed-off-by: Simon Horman <horms-/R6kz+dDXgpPR4JQBCEnsQ@public.gmane.org>

---

v2.42
* Rebase for:
  + 0585f7a ("datapath: Simplify mega-flow APIs.")
  + a097c0b ("datapath: Restructure datapath.c and flow.c")
* As suggested by Jesse Gross
  + Take into account that push_mpls() will have freed the skb on error
  + Remove dubious !eth_p_mpls(skb->protocol) condition from push_mpls
    The !eth_p_mpls(skb->protocol) condition on setting inner_protocol
    has no effect. Its motivation was to ensure that inner_protocol was
    only set the first time that mpls_push occured. However this is already
    ensured by the !ovs_skb_get_inner_protocol(skb) condition.
  + Return -EINVAL instead of -ENOMEM from pop_mpls() if the skb is too short
  + Do not add @inner_protocol to kernel doc for struct ovs_skb_cb.
    The patch no longer adds an inner_protocol member to struct ovs_skb_cb
  + Do not add and set otherwise unsued inner_protocol variable in
    rpl_dev_queue_xmit()
* As suggested by Pravin Shelar
  + Implement compatibility code in existing rpl_skb_gso_segment
    rather than introducing to use rpl___skb_gso_segment

v2.41
* No change

v2.40
* Rebase for:
  + New dev_queue_xmit compat code
  + Updated put_vlan()
* As suggested by Jesse Gross
  + Remove bogus mac_len update from push_mpls()
  + Slightly simplify push_mpls() by using eth_hdr()
  + Remove dubious condition !eth_p_mpls(inner_protocol) on
    an skb being considered to be MPLS in netdev_send()
  + Only use compatibility code for MPLS GSO segmentation on kernels
    older than 3.11
  + Revamp setting of inner_protocol
    1. Do not unconditionally set inner_protocol to the value of
       skb->protocol in ovs_execute_actions().
    2. Initialise inner_protocol it to zero only if compatibility code is in
       use. In the case where compatibility code is not in use it will either
       be zero due since the allocation of the skb or some other value set
       by some other user.
    3. Conditionally set the inner_protocol in push_mpls() to the value of
       skb->protocol when entering push_mpls(). The condition is that
       inner_protocol is zero and the value of skb->protocol is not an MPLS
       ethernet type.
    - This new scheme:
      + Pushes logic to set inner_protocol closer to the case where it is
	needed.
      + Avoids over-writing values set by other users.
* As suggested by Pravin Shelar
  + Only set and restore skb->protocol in rpl___skb_gso_segment() in the
    case of MPLS
  + Add inner_protocol field to struct ovs_gso_cb instead of ovs_skb_cb.
    This moves compatibility code closer to where it is used
    and creates fewer differences with mainline.
* Update comment on mac_len updates in datapath/actions.c
* Remove HAVE_INNER_PROCOTOL and instead just check
  against kernel version 3.11 directly.
  HAVE_INNER_PROCOTOL is a hang-over from work done prior
  to the merge of inner_protocol into the kernel.
* Remove dubious condition !eth_p_mpls(inner_protocol) on
  using inner_protocol as the type in rpl_skb_network_protocol()
* Do not update type of features in rpl_dev_queue_xmit.
  Though arguably correct this is not an inherent part of
  the changes made by this patch.
* Use skb_cow_head() in push_mpls()
  + Call skb_cow_head(skb, MPLS_HLEN) instead of
    make_writable(skb, skb->mac_len) to ensure that there is enough head
    room to push an MPLS LSE regardless of whether the skb is cloned or not.
  + This is consistent with the behaviour of rpl__vlan_put_tag().
  + This is a fix for crashes reported when performing mpls_push
    with headroom less than 4. This problem was introduced in v3.36.
* Skip popping in mpls_pop if the skb is too short to contain an MPLS LSE

v2.39
* Rebase for removal of vlan, checksum and skb->mark compat code

v2.38
* Rebase for SCTP support
* Refactor validate_tp_port() to iterate over eth_types rather
  than open-coding the loop. With the addition of SCTP this logic
  is now used three times.

v2.36 - v2.37
* Rebase

* Do not add set_ethertype() to datapath/actions.c.
  As this patch has evolved this function had devolved into
  to sets of functionality wrapped into a single function with
  only one line of common code. Refactor things to simply
  open-code setting the ether type in the two locations where
  set_ethertype() was previously used. The aim here is to improve
  readability.

* Update setting skb->ethertype after mpls push and pop.
  - In the case of push_mpls it should be set unconditionally
    as in v2.35 the behaviour of this function to always push
    an MPLS LSE before any VLAN tags.
  - In the case of mpls_pop eth_p_mpls(skb->protocol) is a better
    test than skb->protocol != htons(ETH_P_8021Q) as it will give the
    correct behaviour in the presence of other VLAN ethernet types,
    for example 0x88a8 which is used by 802.1ad. Moreover, it seems
    correct to update the ethernet type if it was previously set
    according to the top-most MPLS LSE.

* Deaccelerate VLANs when pushing MPLS tags the
  - Since v2.35 MPLS push will insert an MPLS LSE before any VLAN tags.
    This means that if an accelerated tag is present it should be
    deaccelerated to ensure it ends up in the correct position.

* Update skb->mac_len in push_mpls() so that it will be correct
  when used by a subsequent call to pop_mpls().

  As things stand I do not believe this is strictly necessary as
  ovs-vswitchd will not send a pop MPLS action after a push MPLS action.
  However, I have added this in order to code more defensively as I believe
  that if such a sequence did occur it would be rather unobvious why
  it didn't work.

* Do not add skb_cow_head() call in push_mpls().
  It is unnecessary as there is a make_writable() call.
  This change was also made in v2.30 but some how the
  code regressed between then and v2.35.

v2.35
* Rebase
* Move MPLS constants to mpls.h
* Push MPLS tags after ethernet, before VLAN tags
  - This is consistent with the OpenFlow 1.3 specification
  - Compatibility with OpenFlow 1.2 and earlier versions
    may be provided by ovs-vswitchd.
* Correct GSO behaviour in the presence of MPLS but absence of VLANs

v2.34
* Rebase for megaflow changes

v2.33
* Ensure that inner_protocol is always set to to the current
  skb->protocol value in ovs_execute_actions(). This ensures
  it is set to the correct value in the absence of a push_mpls action.
  Also remove setting of inner_protocol in push_mpls() as
  it duplicates the code now in ovs_execute_actions().
* Call __skb_gso_segment() instead of skb_gso_segment() from
  rpl___skb_gso_segment() in the case that HAVE___SKB_GSO_SEGMENT is set.
  This was a typo.

v2.32
* As suggested by Jesse Gross
  - Use int instead of size_t in validate_and_copy_actions__().
  - Fix crazy edit mess in pop_mpls() action comment
  - Move eth_p_mpls() into mpls.h
  - Refactor skb_gso_segment MPLS handling into rpl_skb_gso_segment
    Address Jesse's comments regarding this code:
    "Can we push this completely into the skb_gso_segment() compatibility
     code? It's both nicer and may make the interactions with the vlan code
     less confusing."
  - Move GSO compatibility code into linux/compat/gso.*
  - Set skb->protocol on mpls_push and mpls_pop in the presence
    of an offloaded VLAN.

v2.31
* As suggested by Jesse Gross
  - There is no need to make mac_header_end inline as it is not in a header file
  - Remove dubious if (*skb_ethertype == ethertype) optimisation from
    set_ethertype
  - Only set skb->protocol in push_mpls() or pop_mpls() for non-VLAN packets
  - Use MAX_ETH_TYPES instead of SAMPLE_ACTION_DEPTH for array size
    of types in struct eth_types. This corrects a typo/thinko.
  - Correct eth type tracking logic such that start isn't advanced
    when entering a sample action, ensuring that all possibly types
    are checked when verifying nested actions.
* Define HAVE_INNER_PROTOCOL based on kernel version.
  inner_protocol has been merged into net-next and should appear in
  v3.11 so there is no longer a need for a acinclude.m4 test to check for it.
* Add MPLS GSO compatibility code.
  This is for use on kernels that do not have MPLS GSO support.
  Thanks to Joe Stringer for his work on this.

v2.30
* As suggested by Jesse Gross
  - Use skb_cow_head in push_mpls to ensure there is sufficient headroom for
    skb_push
  - Call make_writable with skb->mac_len instead of skb->mac_len + MPLS_HLEN
    in push_mpls as only the first skb->mac_len bytes of existing packet data
    are modified.
  - Rename skb_mac_header_end as mac_header_end, this seems
    to be a more appropriate name for a local function.
  - Remove OVS_CSUM_COMPLETE code from set_ethertype().
    Inside OVS the ethernet header is not covered by OVS_CSUM_COMPLETE.
  - Use __skb_pull() instead of skb_pull() in pop_mpls()
  - Decrement and decrement skb->mac_len when poping and pushing VLAN tags.
    Previously mac_len was reset, but this would result in forgetting
    the MPLS label stack.
  - Remove spurious comment from before do_execute_actions().
  - Move OVS_KEY_ATTR_MPLS attribute to its final, upstreamable, location.
  - Correct ethertype check for OVS_ACTION_ATTR_POP_MPLS case in
    validate_and_copy_actions() to check for MPLS ethertypes rather than
    ETH_P_IP.
  - Rewrite tracking of eth types used to verify actions in the presence
    of sample actions. There is a large comment above struct eth_types
    describing the new implementation.

v2.29
* Break include/ and lib/ portions of the patch out into a
  separate patch "datapath: Add basic MPLS support to kernel"
* Update for new MPLS GSO scheme
  - skb->protocol is set to the new ethertype of the packet
    on MPLS push and pop
  - When pushing the first MPLS LSE onto a previously non-MPLS
    packet set skb->inner_protocol to the original ethertype.
  - skb->inner_protocol may be used by the network stack
    for GSO of the inner-packet.
* Drop const from ethertype parameter of set_ethertype.
  This appears to be a legacy of this parameter being a pointer.
* Pass the ethertype patrameter of pop_mpls as a value rather
  than a pointer.

v2.28
* Kernel Datapath changes as suggested by Jarno Rajahalme
  + Correct the logic introduced in v2.27 to set the network_header
    to after the MPLS label stack in the case of an MPLS packet.
    - Increment stack_len offset so that label stacks of depth greater
      than two do not cause an infinite loop.
    - Correct offset passed to check_header to include skb->mac len

v2.27
* Kernel Datapath changes as suggested by Jarno Rajahalme and Jesse Gross:
  + Previously the mac_len and network_header of an skb corresponded
    to the end of the L2 header.  To support GSO, just before transmission,
    do_output, with the results as follows:

    Input: non-MPLS skb: Output: network header and mac_len correspond
                         to the beginning of the L3 headers
    Input: MPLS:         Output: network header and mac_len correspond to the
                         end of the L2 headers.

    This is somewhat confusing.

  + The new scheme is as follows:
    - The mac_len always corresponds to the end of the L2 header.
    - The network header always corresponds to the beginning of the
      L3 header.

  + Note that in the case of MPLS output the end of the L2 headers and the
  beginning of the L3 headers will differ.

* Remove unused declaration of skb_cb_mpls_stack()

v2.26
* Rebase on master
* Kernel Datapath changes as suggested by Jarno Rajahalme
  - Use skb_network_header() instead of skb_mac_header() to locate
    the ethertype to set in set_ethertype() as the latter will
    be wrong in the presence of VLAN tags. This resolves
    a regression introduced in v2.24.
  - Enhance comment in do_output()
  - do_execute_actions(): Do not alter mpls_stack_depth if
    a MPLS push or pop action fail. This is achieved by altering
    mpls_stack_depth at the end of push_mpls() and pop_mpls().

v2.25
* Rebase on master
* Pass big-endian value as the last argument of eth_types_set() in
  validate_and_copy_actions__()
* Use revised GSO support as provided by the patch series
  "[PATCH 0/2] Small Modifications to GSO to allow segmentation of MPLS"
  - Set skb->mac_len to the length of the l2 header + MPLS stack length
  - Update skb->network_header accordingly
  - Set skb->encapsulated_features

v2.24
* Use skb_mac_header() in set_ethertype()
* Set skb->encapsulation in set_ethertype() to support MPLS GSO.
  Also add a note about the other requirements for MPLS GSO.
  MPLS GSO support will be posted as a patch net-next (Linux mainline)
  "MPLS: Add limited GSO support"
* Do not add ETH_TYPE_MIN, it is no longer used

v2.23
* As suggested by Jesse Gross:
  - Verify the current ethernet type when validating sample actions
    both for the taken and not-taken path if the sample action.
  - Document that the OVS_KEY_ATTR_MPLS attribute accepts a list of
    struct ovs_key_mpls but that an implementation may restrict
    the length it accepts.
  - Restrict the array length of the OVS_KEY_ATTR_MPLS to one.
    + Don't add ovs_flow_verify_key_len as it was added to
      handle attributes whose values are arrays but there are
      no attributes with values that are arrays (of length greater than one).

v2.22
* As suggested by Jesse Gross:
  - Fix sparse warning in validate_and_copy_actions()
    I have no idea why sparse doesn't show this up this on my system.
  - Remove call to skb_cow_head() from push_mpls() as it
    is already covered by a call to make_writable()
  - Check (key_type > OVS_KEY_ATTR_MAX) in ovs_flow_verify_key_len()
  - Disallow set actions on l2.5+ data and MPLS push and pop actions
    after an MPLS pop action as there is no verification that the packet
    is actually of the new ethernet type. This may later be supported
    using recirculation or by other means.
  - Do not add spurious debuging message to ovs_flow_cmd_new_or_set()

v2.21
* As suggested by Jesse Gross:
  - Verify that l3 and l4 actions always always occur prior to
    a push_mpls action and use the network header pointer of an skb
    to track the top of the MPLS stack. This avoids adding an l2_size
    element to the skb callback.

v2.20
* As suggested by Jesse Gross:
  - Do not add ovs_dp_ioctl_hook
    + This appears to be garbage from a rebase
  - Do not add skb_cb_set_l2_size. Instead set OVS_CB(skb)->l2_size
    in ovs_flow_extract().
  - Do not free skb on error in push_mpls(), it is freed in the caller
  - Call skb_reset_mac_len() in pop_mpls() and push_mpls()
  - Update checksums in pop_mpls(), push_mpls() and set_mpls().
  - Rename skb_cb_mpls_bos() as skb_cb_mpls_stack().
    It returns the top not the bottom of the stack.
  - Track the current eth_type in validate_and_copy_actions
    which is initially the eth_type of the flow and may be modified
    by push_mpls and pop_mpls actions. Use this to correctly validate
    mpls_set actions. This is to allow mpls_set actions to be applied
    to a non-MPLS frame after an mpls_push action (although ovs-vswitchd
    doesn't currently do that).
    Also:
    + Remove the check of the eth_type in set_mpls() as the new validation
      scheme should ensure it cannot be incorrect.
    + Use the current eth_type to validate mpls_pop actions and remove
      the eth_type check from pop_mpls().
  - Move OVS_KEY_ATTR_MPLS to non-upstream group in ovs_key_lens
  - Remove unnecessary memset of mpls_key in ovs_flow_to_nlattrs()
  - Make a union of the mpls and ip elements of struct sw_flow_key.
    Currently the code stops parsing after an MPLS header so it is
    not possible for the ip and mpls elements to be used simultaneously
    and some space can be saved by using a union.
  - Allow an array of MPLS key attributes
    + Currently all but the first element is ignored
    + User-space needs to be updated to accept more than one element,
      currently it will treat their presence as an error
  - Do not update network header in ovs_flow_extract() for after parsing
    the MPLS stack as it is never used because no l3+ processing
    occurs on MPLS frames.
  - Allow multiple MPLS entries in a match by allowing the OVS_KEY_ATTR_MPLS
    to be an array of struct ovs_key_mpls with at least one entry.
    Currently only one entry is used which is byte-for-byte compatible with
    the previous scheme of having OVS_KEY_ATTR_MPLS as a struct
    ovs_key_mpls.
* Make skb writable in pop_mpls(), push_mpls() and set_mpls().

v2.18 - v2.19
* No change

v2.17
* As suggested by Ben Pfaff
  - Use consistent terminology for MPLS.
    + Consistently refer to the MPLS component of a packet as the
      MPLS label stack and entries in the stack as MPLS label stack entries
      (LSE).  An MPLS label is a component of an MPLS label stack entry.
      The other components are the traffic class (TC), time to live (TTL)
      and bottom of stack (BoS) bit.
  - Rename compose_.*mpls_ functions as execute_.*mpls_

v2.16
* No change

v2.15
* As suggested by Ben Pfaff
  - Use OVS_ACTION_SET to set OVS_KEY_ATTR_MPLS instead of
    OVS_ACTION_ATTR_SET_MPLS

v2.14
* Remove include/linux/openvswitch.h portion which added add
  new key and action attributes. This
  now present in "User-Space MPLS actions and matches"
  which is now a dependency of this patch

v2.13
* As suggested by Jarno Rajahalme
  - Rename mpls_bos element of ovs_skb_cb as l2_size as it is set and used
    regardless of if an MPLS stack is present or not. Update the name of
    helper functions and documentation accordingly.
  - Ensure that skb_cb_mpls_bos() never returns NULL
* Correct endieness in eth_p_mpls()

v2.12
* Update skb and network header on MPLS extraction in ovs_flow_extract()
* Use NULL in skb_cb_mpls_bos()
* Add eth_p_mpls helper

v2.10 - v2.11
* No change

v2.9
* datapath: Always update the mpls bos if  vlan_pop is successful

  Regardless of the details of how a successful
  vlan_pop is achieved, the mpls bos needs to be updated.

  Without this fix it has been observed that the following
  results in malformed packets

v2.8
* No change

v2.7
* Rebase

v2.6
* As suggested by Yamahata-san
  - Do not guard against label == 0 for
    OVS_ACTION_ATTR_SET_MPLS in validate_actions().
    A label of 0 is valid
  - Remove comment stupulating that if
    the top_label element of struct sw_flow_key is 0 then
    there is no MPLS label. An MPLS label of 0 is valid
    and the correct check if ethertype is
    ntohs(ETH_TYPE_MPLS) or ntohs(ETH_TYPE_MPLS_MCAST)

v2.4 - v2.5
* No change

v2.3
* s/mpls_stack/mpls_bos/
  This is in keeping with the naming used in the OpenFlow 1.3 specification

v2.2
* Call skb_reset_mac_header() in skb_cb_set_mpls_stack()
  eth_hdr(skb) is non-NULL when called in skb_cb_set_mpls_stack().
* Add a call to skb_cb_set_mpls_stack() in ovs_packet_cmd_execute().
  I apologise that I have mislaid my notes on this but
  it avoids a kernel panic. I can investigate again if necessary.
* Use struct ovs_action_push_mpls instead of
  __be16 to decode OVS_ACTION_ATTR_PUSH_MPLS in validate_actions(). This is
  consistent with the data format for the attribute.
* Indentation fix in skb_cb_mpls_stack(). [cosmetic]

v2.1
* Manual rebase
---
 datapath/Modules.mk                             |   1 +
 datapath/actions.c                              | 131 ++++++++++-
 datapath/datapath.c                             |   4 +-
 datapath/flow.c                                 |  29 +++
 datapath/flow.h                                 |  17 +-
 datapath/flow_netlink.c                         | 286 ++++++++++++++++++++++--
 datapath/flow_netlink.h                         |   2 +-
 datapath/linux/compat/gso.c                     |  70 +++++-
 datapath/linux/compat/gso.h                     |  41 ++++
 datapath/linux/compat/include/linux/netdevice.h |   6 +-
 datapath/linux/compat/netdevice.c               |  10 +-
 datapath/mpls.h                                 |  15 ++
 include/linux/openvswitch.h                     |   7 +-
 13 files changed, 569 insertions(+), 50 deletions(-)
 create mode 100644 datapath/mpls.h

diff --git a/datapath/Modules.mk b/datapath/Modules.mk
index b652411..6aa80e5 100644
--- a/datapath/Modules.mk
+++ b/datapath/Modules.mk
@@ -26,6 +26,7 @@ openvswitch_headers = \
 	flow.h \
 	flow_netlink.h \
 	flow_table.h \
+	mpls.h \
 	vlan.h \
 	vport.h \
 	vport-internal_dev.h \
diff --git a/datapath/actions.c b/datapath/actions.c
index d961e5d..8babfc4 100644
--- a/datapath/actions.c
+++ b/datapath/actions.c
@@ -35,6 +35,8 @@
 #include <net/sctp/checksum.h>
 
 #include "datapath.h"
+#include "gso.h"
+#include "mpls.h"
 #include "vlan.h"
 #include "vport.h"
 
@@ -71,7 +73,8 @@ static int __pop_vlan_tci(struct sk_buff *skb, __be16 *current_tci)
 
 	vlan_set_encap_proto(skb, vhdr);
 	skb->mac_header += VLAN_HLEN;
-	skb_reset_mac_len(skb);
+	/* Update mac_len for subsequent MPLS actions */
+	skb->mac_len -= VLAN_HLEN;
 
 	return 0;
 }
@@ -113,6 +116,9 @@ static int put_vlan(struct sk_buff *skb)
 	if (!__vlan_put_tag(skb, skb->vlan_proto, current_tag))
 		return -ENOMEM;
 
+	/* update mac_len for subsequent MPLS actions */
+	skb->mac_len += VLAN_HLEN;
+
 	if (skb->ip_summed == CHECKSUM_COMPLETE)
 		skb->csum = csum_add(skb->csum, csum_partial(skb->data
 				+ (2 * ETH_ALEN), VLAN_HLEN, 0));
@@ -134,6 +140,114 @@ static int push_vlan(struct sk_buff *skb, const struct ovs_action_push_vlan *vla
 	return 0;
 }
 
+/* The end of the mac header.
+ *
+ * For non-MPLS skbs this will correspond to the network header.
+ * For MPLS skbs it will be before the network_header as the MPLS
+ * label stack lies between the end of the mac header and the network
+ * header. That is, for MPLS skbs the end of the mac header
+ * is the top of the MPLS label stack.
+ */
+static unsigned char *mac_header_end(const struct sk_buff *skb)
+{
+	return skb_mac_header(skb) + skb->mac_len;
+}
+
+/* Push MPLS after the ethernet header. */
+static int push_mpls(struct sk_buff *skb,
+		     const struct ovs_action_push_mpls *mpls)
+{
+	__be32 *new_mpls_lse;
+	struct ethhdr *hdr;
+
+	if (unlikely(vlan_tx_tag_present(skb))) {
+		int err;
+
+		err = put_vlan(skb);
+		if (unlikely(err))
+			return err;
+
+	        vlan_set_tci(skb, 0);
+	}
+
+	if (skb_cow_head(skb, MPLS_HLEN) < 0) {
+		kfree_skb(skb);
+		return -ENOMEM;
+	}
+	skb_push(skb, MPLS_HLEN);
+
+	memmove(skb_mac_header(skb) - MPLS_HLEN, skb_mac_header(skb),
+		ETH_HLEN);
+	skb_reset_mac_header(skb);
+
+	new_mpls_lse = (__be32 *)(skb_mac_header(skb) + ETH_HLEN);
+	*new_mpls_lse = mpls->mpls_lse;
+
+	if (skb->ip_summed == CHECKSUM_COMPLETE)
+		skb->csum = csum_add(skb->csum, csum_partial(new_mpls_lse,
+							     MPLS_HLEN, 0));
+
+	hdr = eth_hdr(skb);
+	hdr->h_proto = mpls->mpls_ethertype;
+	if (!ovs_skb_get_inner_protocol(skb))
+		ovs_skb_set_inner_protocol(skb, skb->protocol);
+	skb->protocol = mpls->mpls_ethertype;
+	return 0;
+}
+
+static int pop_mpls(struct sk_buff *skb, const __be16 ethertype)
+{
+	struct ethhdr *hdr;
+	int err;
+
+	err = make_writable(skb, skb->mac_len + MPLS_HLEN);
+	if (unlikely(err))
+		return err;
+
+	if (unlikely(skb->len < skb->mac_len + MPLS_HLEN))
+		return -EINVAL;
+
+	if (skb->ip_summed == CHECKSUM_COMPLETE)
+		skb->csum = csum_sub(skb->csum,
+				     csum_partial(mac_header_end(skb),
+						  MPLS_HLEN, 0));
+
+	memmove(skb_mac_header(skb) + MPLS_HLEN, skb_mac_header(skb),
+		skb->mac_len);
+
+	__skb_pull(skb, MPLS_HLEN);
+	skb_reset_mac_header(skb);
+
+	/* mac_header_end() is used to locate the ethertype
+	 * field correctly in the presence of VLAN tags.
+	 */
+	hdr = (struct ethhdr *)(mac_header_end(skb) - ETH_HLEN);
+	hdr->h_proto = ethertype;
+	if (eth_p_mpls(skb->protocol))
+		skb->protocol = ethertype;
+	return 0;
+}
+
+static int set_mpls(struct sk_buff *skb, const __be32 *mpls_lse)
+{
+	__be32 *stack = (__be32 *)mac_header_end(skb);
+	int err;
+
+	err = make_writable(skb, skb->mac_len + MPLS_HLEN);
+	if (unlikely(err))
+		return err;
+
+	if (skb->ip_summed == CHECKSUM_COMPLETE) {
+		__be32 diff[] = { ~(*stack), *mpls_lse };
+		skb->csum = ~csum_partial((char *)diff, sizeof(diff),
+					  ~skb->csum);
+	}
+
+	*stack = *mpls_lse;
+
+	return 0;
+}
+
 static int set_eth_addr(struct sk_buff *skb,
 			const struct ovs_key_ethernet *eth_key)
 {
@@ -509,6 +623,9 @@ static int execute_set_action(struct sk_buff *skb,
 
 	case OVS_KEY_ATTR_SCTP:
 		err = set_sctp(skb, nla_data(nested_attr));
+
+	case OVS_KEY_ATTR_MPLS:
+		err = set_mpls(skb, nla_data(nested_attr));
 		break;
 	}
 
@@ -545,6 +662,16 @@ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb,
 			output_userspace(dp, skb, a);
 			break;
 
+		case OVS_ACTION_ATTR_PUSH_MPLS:
+			err = push_mpls(skb, nla_data(a));
+			if (unlikely(err)) /* skb already freed. */
+				return err;
+			break;
+
+		case OVS_ACTION_ATTR_POP_MPLS:
+			err = pop_mpls(skb, nla_get_be16(a));
+			break;
+
 		case OVS_ACTION_ATTR_PUSH_VLAN:
 			err = push_vlan(skb, nla_data(a));
 			if (unlikely(err)) /* skb already freed. */
@@ -618,6 +745,8 @@ int ovs_execute_actions(struct datapath *dp, struct sk_buff *skb)
 		goto out_loop;
 	}
 
+	ovs_skb_init_inner_protocol(skb);
+
 	OVS_CB(skb)->tun_key = NULL;
 	error = do_execute_actions(dp, skb, acts->actions,
 					 acts->actions_len, false);
diff --git a/datapath/datapath.c b/datapath/datapath.c
index 9e6df12..4a2afad 100644
--- a/datapath/datapath.c
+++ b/datapath/datapath.c
@@ -518,7 +518,7 @@ static int ovs_packet_cmd_execute(struct sk_buff *skb, struct genl_info *info)
 		goto err_flow_free;
 
 	err = ovs_nla_copy_actions(a[OVS_PACKET_ATTR_ACTIONS],
-				   &flow->key, 0, &acts);
+				   &flow->key, &acts);
 	rcu_assign_pointer(flow->sf_acts, acts);
 	if (err)
 		goto err_flow_free;
@@ -779,7 +779,7 @@ static int ovs_flow_cmd_new_or_set(struct sk_buff *skb, struct genl_info *info)
 
 		ovs_flow_mask_key(&masked_key, &key, &mask);
 		error = ovs_nla_copy_actions(a[OVS_FLOW_ATTR_ACTIONS],
-					     &masked_key, 0, &acts);
+					     &masked_key, &acts);
 		if (error) {
 			OVS_NLERR("Flow actions may not be safe on all matching packets.\n");
 			goto err_kfree;
diff --git a/datapath/flow.c b/datapath/flow.c
index faa4e15..91f428d 100644
--- a/datapath/flow.c
+++ b/datapath/flow.c
@@ -44,6 +44,7 @@
 #include <net/ipv6.h>
 #include <net/ndisc.h>
 
+#include "mpls.h"
 #include "vlan.h"
 
 u64 ovs_flow_used_time(unsigned long flow_jiffies)
@@ -391,6 +392,7 @@ int ovs_flow_extract(struct sk_buff *skb, u16 in_port, struct sw_flow_key *key)
 		return -ENOMEM;
 
 	skb_reset_network_header(skb);
+	skb_reset_mac_len(skb);
 	__skb_push(skb, skb->data - skb_mac_header(skb));
 
 	/* Network layer. */
@@ -473,6 +475,33 @@ int ovs_flow_extract(struct sk_buff *skb, u16 in_port, struct sw_flow_key *key)
 			memcpy(key->ipv4.arp.sha, arp->ar_sha, ETH_ALEN);
 			memcpy(key->ipv4.arp.tha, arp->ar_tha, ETH_ALEN);
 		}
+	} else if (eth_p_mpls(key->eth.type)) {
+		size_t stack_len = MPLS_HLEN;
+
+		/* In the presence of an MPLS label stack the end of the L2
+		 * header and the beginning of the L3 header differ.
+		 *
+		 * Advance network_header to the beginning of the L3
+		 * header. mac_len corresponds to the end of the L2 header.
+		 */
+		while (1) {
+			__be32 lse;
+
+			error = check_header(skb, skb->mac_len + stack_len);
+			if (unlikely(error))
+				return 0;
+
+			memcpy(&lse, skb_network_header(skb), MPLS_HLEN);
+
+			if (stack_len == MPLS_HLEN)
+				memcpy(&key->mpls.top_lse, &lse, MPLS_HLEN);
+
+			skb_set_network_header(skb, skb->mac_len + stack_len);
+			if (lse & htonl(MPLS_BOS_MASK))
+				break;
+
+			stack_len += MPLS_HLEN;
+		}
 	} else if (key->eth.type == htons(ETH_P_IPV6)) {
 		int nh_len;             /* IPv6 Header + Extensions */
 
diff --git a/datapath/flow.h b/datapath/flow.h
index 91a3022..eff0023 100644
--- a/datapath/flow.h
+++ b/datapath/flow.h
@@ -79,12 +79,17 @@ struct sw_flow_key {
 		__be16 tci;		/* 0 if no VLAN, VLAN_TAG_PRESENT set otherwise. */
 		__be16 type;		/* Ethernet frame type. */
 	} eth;
-	struct {
-		u8     proto;		/* IP protocol or lower 8 bits of ARP opcode. */
-		u8     tos;		/* IP ToS. */
-		u8     ttl;		/* IP TTL/hop limit. */
-		u8     frag;		/* One of OVS_FRAG_TYPE_*. */
-	} ip;
+	union {
+		struct {
+			__be32 top_lse;		/* top label stack entry */
+		} mpls;
+		struct {
+			u8     proto;		/* IP protocol or lower 8 bits of ARP opcode. */
+			u8     tos;		/* IP ToS. */
+			u8     ttl;		/* IP TTL/hop limit. */
+			u8     frag;		/* One of OVS_FRAG_TYPE_*. */
+		} ip;
+	};
 	union {
 		struct {
 			struct {
diff --git a/datapath/flow_netlink.c b/datapath/flow_netlink.c
index 515a9f6..eaf41dc 100644
--- a/datapath/flow_netlink.c
+++ b/datapath/flow_netlink.c
@@ -18,6 +18,7 @@
 
 #include "flow.h"
 #include "datapath.h"
+#include "mpls.h"
 #include <linux/uaccess.h>
 #include <linux/netdevice.h>
 #include <linux/etherdevice.h>
@@ -119,7 +120,8 @@ static bool match_validate(const struct sw_flow_match *match,
 			| (1ULL << OVS_KEY_ATTR_ICMP)
 			| (1ULL << OVS_KEY_ATTR_ICMPV6)
 			| (1ULL << OVS_KEY_ATTR_ARP)
-			| (1ULL << OVS_KEY_ATTR_ND));
+			| (1ULL << OVS_KEY_ATTR_ND)
+			| (1ULL << OVS_KEY_ATTR_MPLS));
 
 	/* Always allowed mask fields. */
 	mask_allowed |= ((1ULL << OVS_KEY_ATTR_TUNNEL)
@@ -134,6 +136,13 @@ static bool match_validate(const struct sw_flow_match *match,
 			mask_allowed |= 1ULL << OVS_KEY_ATTR_ARP;
 	}
 
+
+	if (eth_p_mpls(match->key->eth.type)) {
+		key_expected |= 1ULL << OVS_KEY_ATTR_MPLS;
+		if (match->mask && (match->mask->key.eth.type == htons(0xffff)))
+			mask_allowed |= 1ULL << OVS_KEY_ATTR_MPLS;
+	}
+
 	if (match->key->eth.type == htons(ETH_P_IP)) {
 		key_expected |= 1ULL << OVS_KEY_ATTR_IPV4;
 		if (match->mask && (match->mask->key.eth.type == htons(0xffff)))
@@ -242,6 +251,7 @@ static const int ovs_key_lens[OVS_KEY_ATTR_MAX + 1] = {
 	[OVS_KEY_ATTR_ARP] = sizeof(struct ovs_key_arp),
 	[OVS_KEY_ATTR_ND] = sizeof(struct ovs_key_nd),
 	[OVS_KEY_ATTR_TUNNEL] = -1,
+	[OVS_KEY_ATTR_MPLS] = sizeof(struct ovs_key_mpls),
 };
 
 static bool is_all_zero(const u8 *fp, size_t size)
@@ -616,6 +626,16 @@ static int ovs_key_from_nlattrs(struct sw_flow_match *match,  u64 attrs,
 		attrs &= ~(1ULL << OVS_KEY_ATTR_ARP);
 	}
 
+	if (attrs & (1ULL << OVS_KEY_ATTR_MPLS)) {
+		const struct ovs_key_mpls *mpls_key;
+
+		mpls_key = nla_data(a[OVS_KEY_ATTR_MPLS]);
+		SW_FLOW_KEY_PUT(match, mpls.top_lse,
+				mpls_key->mpls_lse, is_mask);
+
+		attrs &= ~(1ULL << OVS_KEY_ATTR_MPLS);
+	 }
+
 	if (attrs & (1ULL << OVS_KEY_ATTR_TCP)) {
 		const struct ovs_key_tcp *tcp_key;
 
@@ -988,6 +1008,14 @@ int ovs_nla_put_flow(const struct sw_flow_key *swkey,
 		arp_key->arp_op = htons(output->ip.proto);
 		memcpy(arp_key->arp_sha, output->ipv4.arp.sha, ETH_ALEN);
 		memcpy(arp_key->arp_tha, output->ipv4.arp.tha, ETH_ALEN);
+	} else if (eth_p_mpls(swkey->eth.type)) {
+		struct ovs_key_mpls *mpls_key;
+
+		nla = nla_reserve(skb, OVS_KEY_ATTR_MPLS, sizeof(*mpls_key));
+		if (!nla)
+			goto nla_put_failure;
+		mpls_key = nla_data(nla);
+		mpls_key->mpls_lse = output->mpls.top_lse;
 	}
 
 	if ((swkey->eth.type == htons(ETH_P_IP) ||
@@ -1190,15 +1218,133 @@ static inline void add_nested_action_end(struct sw_flow_actions *sfa,
 
 	a->nla_len = sfa->actions_len - st_offset;
 }
+#define MAX_ETH_TYPES 16 /* Arbitrary Limit */
+
+/* struct eth_types - possible eth types
+ * @types: provides storage for the possible eth types.
+ * @start: is the index of the first entry of types which is possible.
+ * @end: is the index of the last entry of types which is possible.
+ * @cursor: is the index of the entry which should be updated if an action
+ * changes the eth type.
+ *
+ * Due to the sample action there may be multiple possible eth types.
+ * In order to correctly validate actions all possible types are tracked
+ * and verified. This is done using struct eth_types.
+ *
+ * Initially start, end and cursor should be 0, and the first element of
+ * types should be set to the eth type of the flow.
+ *
+ * When an action changes the eth type then the values of start and end are
+ * updated to the value of cursor. The new type is stored at types[cursor].
+ *
+ * When entering a sample action the start and cursor values are saved. The
+ * value of cursor is set to the value of end plus one.
+ *
+ * When leaving a sample action the start and cursor values are restored to
+ * their saved values.
+ *
+ * An example follows.
+ *
+ * actions: pop_mpls(A),sample(pop_mpls(B)),sample(pop_mpls(C)),pop_mpls(D)
+ *
+ * 0. Initial state:
+ *	types = { original_eth_type }
+ * 	start = end = cursor = 0;
+ *
+ * 1. pop_mpls(A)
+ *    a. Check types from start (0) to end (0) inclusive
+ *       i.e. Check against original_eth_type
+ *    b. Set start = end = cursor
+ *    c. Set types[cursor] = A
+ *    New state:
+ *	types = { A }
+ *	start = end = cursor = 0;
+ *
+ * 2. Enter first sample()
+ *    a. Save start and cursor
+ *    b. Set cursor = end + 1
+ *    New state:
+ *	types = { A }
+ *	start = end = 0;
+ *	cursor = 1;
+ *
+ * 3. pop_mpls(B)
+ *    a. Check types from start (0) to end (0)
+ *       i.e: Check against A
+ *    b. Set start = end = cursor
+ *    c. Set types[cursor] = B
+ *    New state:
+ *	types = { A, B }
+ *	start = end = cursor = 1;
+ *
+ * 4. Leave first sample()
+ *    a. Restore start and cursor to the values when entering 2.
+ *    New state:
+ *	types = { A, B }
+ *	start = cursor = 0;
+ *	end = 1;
+ *
+ * 5. Enter second sample()
+ *    a. Save start and cursor
+ *    b. Set cursor = end + 1
+ *    New state:
+ *	types = { A, B }
+ *	start = 0;
+ *	end = 1;
+ *	cursor = 2;
+ *
+ * 6. pop_mpls(C)
+ *    a. Check types from start (0) to end (1) inclusive
+ *       i.e: Check against A and B
+ *    b. Set start = end = cursor
+ *    c. Set types[cursor] = C
+ *    New state:
+ *	types = { A, B, C }
+ *	start = end = cursor = 2;
+ *
+ * 7. Leave second sample()
+ *    a. Restore start and cursor to the values when entering 5.
+ *    New state:
+ *	types = { A, B, C }
+ *	start = cursor = 0;
+ *	end = 2;
+ *
+ * 8. pop_mpls(D)
+ *    a. Check types from start (0) to end (2) inclusive
+ *       i.e: Check against A, B and C
+ *    b. Set start = end = cursor
+ *    c. Set types[cursor] = D
+ *    New state:
+ *	types = { D } // Trailing entries of type are no longer used end = 0
+ *	start = end = cursor = 0;
+ */
+struct eth_types {
+	int start, end, cursor;
+	__be16 types[MAX_ETH_TYPES];
+};
+
+static void eth_types_set(struct eth_types *types, __be16 type)
+{
+	types->start = types->end = types->cursor;
+	types->types[types->cursor] = type;
+}
+
+static int ovs_nla_copy_actions__(const struct nlattr *attr,
+				  const struct sw_flow_key *key,
+				  int depth,
+				  struct sw_flow_actions **sfa,
+				  struct eth_types *eth_types);
 
 static int validate_and_copy_sample(const struct nlattr *attr,
 				    const struct sw_flow_key *key, int depth,
-				    struct sw_flow_actions **sfa)
+				    struct sw_flow_actions **sfa,
+				    struct eth_types *eth_types)
 {
 	const struct nlattr *attrs[OVS_SAMPLE_ATTR_MAX + 1];
 	const struct nlattr *probability, *actions;
 	const struct nlattr *a;
 	int rem, start, err, st_acts;
+	int saved_eth_types_start, saved_eth_types_cursor;
 
 	memset(attrs, 0, sizeof(attrs));
 	nla_for_each_nested(a, attr, rem) {
@@ -1230,22 +1376,38 @@ static int validate_and_copy_sample(const struct nlattr *attr,
 	if (st_acts < 0)
 		return st_acts;
 
-	err = ovs_nla_copy_actions(actions, key, depth + 1, sfa);
+	/* Save and update eth_types cursor and start.  Please see the
+	 * comment for struct eth_types for a discussion of this.
+	 */
+	saved_eth_types_start = eth_types->start;
+	saved_eth_types_cursor = eth_types->cursor;
+	eth_types->cursor = eth_types->end + 1;
+	if (eth_types->cursor == MAX_ETH_TYPES)
+		return -EINVAL;
+
+	err = ovs_nla_copy_actions__(actions, key, depth + 1, sfa, eth_types);
 	if (err)
 		return err;
 
+	/* Restore eth_types cursor and start.  Please see the
+	 * comment for struct eth_types for a discussion of this.
+	 */
+	eth_types->cursor = saved_eth_types_cursor;
+	eth_types->start = saved_eth_types_start;
+
 	add_nested_action_end(*sfa, st_acts);
 	add_nested_action_end(*sfa, start);
 
 	return 0;
 }
 
-static int validate_tp_port(const struct sw_flow_key *flow_key)
+static int validate_tp_port__(const struct sw_flow_key *flow_key,
+			      __be16 eth_type)
 {
-	if (flow_key->eth.type == htons(ETH_P_IP)) {
+	if (eth_type == htons(ETH_P_IP)) {
 		if (flow_key->ipv4.tp.src || flow_key->ipv4.tp.dst)
 			return 0;
-	} else if (flow_key->eth.type == htons(ETH_P_IPV6)) {
+	} else  if (eth_type == htons(ETH_P_IPV6)) {
 		if (flow_key->ipv6.tp.src || flow_key->ipv6.tp.dst)
 			return 0;
 	}
@@ -1253,6 +1415,21 @@ static int validate_tp_port(const struct sw_flow_key *flow_key)
 	return -EINVAL;
 }
 
+static int validate_tp_port(const struct sw_flow_key *flow_key,
+                           const struct eth_types *eth_types)
+{
+	int i;
+
+	for (i = eth_types->start; i < eth_types->end; i++) {
+		int ret = validate_tp_port__(flow_key, eth_types->types[i]);
+
+		if (ret)
+			return ret;
+	}
+
+	return 0;
+}
+
 void ovs_match_init(struct sw_flow_match *match,
 		    struct sw_flow_key *key,
 		    struct sw_flow_mask *mask)
@@ -1295,7 +1472,7 @@ static int validate_and_copy_set_tun(const struct nlattr *attr,
 static int validate_set(const struct nlattr *a,
 			const struct sw_flow_key *flow_key,
 			struct sw_flow_actions **sfa,
-			bool *set_tun)
+			bool *set_tun, struct eth_types *eth_types)
 {
 	const struct nlattr *ovs_key = nla_data(a);
 	int key_type = nla_type(ovs_key);
@@ -1326,9 +1503,12 @@ static int validate_set(const struct nlattr *a,
 			return err;
 		break;
 
-	case OVS_KEY_ATTR_IPV4:
-		if (flow_key->eth.type != htons(ETH_P_IP))
-			return -EINVAL;
+	case OVS_KEY_ATTR_IPV4: {
+		int i;
+
+		for (i = eth_types->start; i <= eth_types->end; i++)
+			if (eth_types->types[i] != htons(ETH_P_IP))
+				return -EINVAL;
 
 		if (!flow_key->ip.proto)
 			return -EINVAL;
@@ -1341,10 +1521,14 @@ static int validate_set(const struct nlattr *a,
 			return -EINVAL;
 
 		break;
+	}
 
-	case OVS_KEY_ATTR_IPV6:
-		if (flow_key->eth.type != htons(ETH_P_IPV6))
-			return -EINVAL;
+	case OVS_KEY_ATTR_IPV6: {
+		int i;
+
+		for (i = eth_types->start; i <= eth_types->end; i++)
+			if (eth_types->types[i] != htons(ETH_P_IPV6))
+				return -EINVAL;
 
 		if (!flow_key->ip.proto)
 			return -EINVAL;
@@ -1360,24 +1544,35 @@ static int validate_set(const struct nlattr *a,
 			return -EINVAL;
 
 		break;
+	}
+
 
 	case OVS_KEY_ATTR_TCP:
 		if (flow_key->ip.proto != IPPROTO_TCP)
 			return -EINVAL;
 
-		return validate_tp_port(flow_key);
+		return validate_tp_port(flow_key, eth_types);
 
 	case OVS_KEY_ATTR_UDP:
 		if (flow_key->ip.proto != IPPROTO_UDP)
 			return -EINVAL;
 
-		return validate_tp_port(flow_key);
+		return validate_tp_port(flow_key, eth_types);
+
+	case OVS_KEY_ATTR_MPLS: {
+		int i;
+
+		for (i = eth_types->start; i < eth_types->end; i++)
+			if (!eth_p_mpls(eth_types->types[i]))
+				return -EINVAL;
+		break;
+	}
 
 	case OVS_KEY_ATTR_SCTP:
 		if (flow_key->ip.proto != IPPROTO_SCTP)
 			return -EINVAL;
 
-		return validate_tp_port(flow_key);
+		return validate_tp_port(flow_key, eth_types);
 
 	default:
 		return -EINVAL;
@@ -1421,10 +1616,11 @@ static int copy_action(const struct nlattr *from,
 	return 0;
 }
 
-int ovs_nla_copy_actions(const struct nlattr *attr,
-			 const struct sw_flow_key *key,
-			 int depth,
-			 struct sw_flow_actions **sfa)
+static int ovs_nla_copy_actions__(const struct nlattr *attr,
+				  const struct sw_flow_key *key,
+				  int depth,
+				  struct sw_flow_actions **sfa,
+				  struct eth_types *eth_types)
 {
 	const struct nlattr *a;
 	int rem, err;
@@ -1437,6 +1633,8 @@ int ovs_nla_copy_actions(const struct nlattr *attr,
 		static const u32 action_lens[OVS_ACTION_ATTR_MAX + 1] = {
 			[OVS_ACTION_ATTR_OUTPUT] = sizeof(u32),
 			[OVS_ACTION_ATTR_USERSPACE] = (u32)-1,
+			[OVS_ACTION_ATTR_PUSH_MPLS] = sizeof(struct ovs_action_push_mpls),
+			[OVS_ACTION_ATTR_POP_MPLS] = sizeof(__be16),
 			[OVS_ACTION_ATTR_PUSH_VLAN] = sizeof(struct ovs_action_push_vlan),
 			[OVS_ACTION_ATTR_POP_VLAN] = 0,
 			[OVS_ACTION_ATTR_SET] = (u32)-1,
@@ -1479,14 +1677,44 @@ int ovs_nla_copy_actions(const struct nlattr *attr,
 				return -EINVAL;
 			break;
 
+		case OVS_ACTION_ATTR_PUSH_MPLS: {
+			const struct ovs_action_push_mpls *mpls = nla_data(a);
+			if (!eth_p_mpls(mpls->mpls_ethertype))
+				return -EINVAL;
+			eth_types_set(eth_types, mpls->mpls_ethertype);
+			break;
+		}
+
+		case OVS_ACTION_ATTR_POP_MPLS: {
+			int i;
+
+			for (i = eth_types->start; i <= eth_types->end; i++)
+				if (!eth_p_mpls(eth_types->types[i]))
+					return -EINVAL;
+
+			/* Disallow subsequent L2.5+ set and mpls_pop actions
+			 * as there is no check here to ensure that the new
+			 * eth_type is valid and thus set actions could
+			 * write off the end of the packet or otherwise
+			 * corrupt it.
+			 *
+			 * Support for these actions is planned using packet
+			 * recirculation.
+			 */
+			eth_types_set(eth_types, htons(0));
+			break;
+		}
+
 		case OVS_ACTION_ATTR_SET:
-			err = validate_set(a, key, sfa, &skip_copy);
+			err = validate_set(a, key, sfa, &skip_copy,
+					   eth_types);
 			if (err)
 				return err;
 			break;
 
 		case OVS_ACTION_ATTR_SAMPLE:
-			err = validate_and_copy_sample(a, key, depth, sfa);
+			err = validate_and_copy_sample(a, key, depth, sfa,
+						       eth_types);
 			if (err)
 				return err;
 			skip_copy = true;
@@ -1508,6 +1736,20 @@ int ovs_nla_copy_actions(const struct nlattr *attr,
 	return 0;
 }
 
+int ovs_nla_copy_actions(const struct nlattr *attr,
+			 const struct sw_flow_key *key,
+			 struct sw_flow_actions **sfa)
+{
+	struct eth_types eth_type = {
+		.start = 0,
+		.end = 0,
+		.cursor = 0,
+		.types = { key->eth.type, },
+	};
+
+	return ovs_nla_copy_actions__(attr, key, 0, sfa, &eth_type);
+}
+
 static int sample_action_to_attr(const struct nlattr *attr, struct sk_buff *skb)
 {
 	const struct nlattr *a;
diff --git a/datapath/flow_netlink.h b/datapath/flow_netlink.h
index 4401510..b471ece 100644
--- a/datapath/flow_netlink.h
+++ b/datapath/flow_netlink.h
@@ -49,7 +49,7 @@ int ovs_nla_get_match(struct sw_flow_match *match,
 		      const struct nlattr *);
 
 int ovs_nla_copy_actions(const struct nlattr *attr,
-			 const struct sw_flow_key *key, int depth,
+			 const struct sw_flow_key *key,
 			 struct sw_flow_actions **sfa);
 int ovs_nla_put_actions(const struct nlattr *attr,
 			int len, struct sk_buff *skb);
diff --git a/datapath/linux/compat/gso.c b/datapath/linux/compat/gso.c
index 32f906c..43f0d16 100644
--- a/datapath/linux/compat/gso.c
+++ b/datapath/linux/compat/gso.c
@@ -19,6 +19,7 @@
 #include <linux/module.h>
 #include <linux/if.h>
 #include <linux/if_tunnel.h>
+#include <linux/if_vlan.h>
 #include <linux/icmp.h>
 #include <linux/in.h>
 #include <linux/ip.h>
@@ -35,6 +36,8 @@
 #include <net/xfrm.h>
 
 #include "gso.h"
+#include "mpls.h"
+#include "vlan.h"
 
 #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,37) && \
 	!defined(HAVE_VLAN_BUG_WORKAROUND)
@@ -47,10 +50,12 @@ MODULE_PARM_DESC(vlan_tso, "Enable TSO for VLAN packets");
 #define vlan_tso true
 #endif
 
-#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,37)
+#if LINUX_VERSION_CODE < KERNEL_VERSION(3,11,0)
 static bool dev_supports_vlan_tx(struct net_device *dev)
 {
-#if defined(HAVE_VLAN_BUG_WORKAROUND)
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,37)
+	return true;
+#elif defined(HAVE_VLAN_BUG_WORKAROUND)
 	return dev->features & NETIF_F_HW_VLAN_TX;
 #else
 	/* Assume that the driver is buggy. */
@@ -58,24 +63,64 @@ static bool dev_supports_vlan_tx(struct net_device *dev)
 #endif
 }
 
+/* Strictly this is not needed and will be optimised out
+ * as this code is guarded by if LINUX_VERSION_CODE < KERNEL_VERSION(3,11,0).
+ * It is here to make things explicit should the compatibility
+ * code be extended in some way prior extending its life-span
+ * beyond v3.11.
+ */
+static bool supports_mpls_gso(void)
+{
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,11,0)
+	return true;
+#else
+	return false;
+#endif
+}
+
 int rpl_dev_queue_xmit(struct sk_buff *skb)
 {
 #undef dev_queue_xmit
 	int err = -ENOMEM;
+	bool vlan, mpls;
+
+	vlan = mpls = false;
+
+	if (eth_p_mpls(skb->protocol) && !supports_mpls_gso())
+		mpls = true;
+
+	if (vlan_tx_tag_present(skb) && !dev_supports_vlan_tx(skb->dev))
+		vlan = true;
 
-	if (vlan_tx_tag_present(skb) && !dev_supports_vlan_tx(skb->dev)) {
+	if (vlan || mpls) {
 		int features;
 
 		features = netif_skb_features(skb);
 
-		if (!vlan_tso)
-			features &= ~(NETIF_F_TSO | NETIF_F_TSO6 |
-				      NETIF_F_UFO | NETIF_F_FSO);
+		if (vlan) {
+			if (!vlan_tso)
+				features &= ~(NETIF_F_TSO | NETIF_F_TSO6 |
+					      NETIF_F_UFO | NETIF_F_FSO);
 
-		skb = __vlan_put_tag(skb, skb->vlan_proto, vlan_tx_tag_get(skb));
-		if (unlikely(!skb))
-			return err;
-		vlan_set_tci(skb, 0);
+			skb = __vlan_put_tag(skb, skb->vlan_proto,
+					     vlan_tx_tag_get(skb));
+			if (unlikely(!skb))
+				return err;
+			vlan_set_tci(skb, 0);
+		}
+
+		/* As of v3.11 the kernel provides an mpls_features field in
+		 * struct net_device which allows devices to advertise which
+		 * features its supports for MPLS. This value defaults to
+		 * NETIF_F_SG and as of v3.11.
+		 *
+		 * This compatibility code is intended for kernels older
+		 * than v3.11 that do not support MPLS GSO and thus do not
+		 * provide mpls_features. Thus this code uses NETIF_F_SG
+		 * directly in place of mpls_features.
+		 */
+		if (mpls)
+			features &= NETIF_F_SG;
 
 		if (netif_needs_gso(skb, features)) {
 			struct sk_buff *nskb;
@@ -114,13 +159,15 @@ drop:
 	kfree_skb(skb);
 	return err;
 }
-#endif /* kernel version < 2.6.37 */
 
 static __be16 __skb_network_protocol(struct sk_buff *skb)
 {
 	__be16 type = skb->protocol;
 	int vlan_depth = ETH_HLEN;
 
+	if (eth_p_mpls(skb->protocol))
+		type = ovs_skb_get_inner_protocol(skb);
+
 	while (type == htons(ETH_P_8021Q) || type == htons(ETH_P_8021AD)) {
 		struct vlan_hdr *vh;
 
@@ -134,6 +181,7 @@ static __be16 __skb_network_protocol(struct sk_buff *skb)
 
 	return type;
 }
+#endif /* kernel version < 3.11.0 */
 
 static struct sk_buff *tnl_skb_gso_segment(struct sk_buff *skb,
 					   netdev_features_t features,
diff --git a/datapath/linux/compat/gso.h b/datapath/linux/compat/gso.h
index 44fd213..d7a9cea 100644
--- a/datapath/linux/compat/gso.h
+++ b/datapath/linux/compat/gso.h
@@ -1,6 +1,7 @@
 #ifndef __LINUX_GSO_WRAPPER_H
 #define __LINUX_GSO_WRAPPER_H
 
+#include <linux/netdevice.h>
 #include <linux/skbuff.h>
 #include <net/protocol.h>
 
@@ -11,6 +12,9 @@ struct ovs_gso_cb {
 	sk_buff_data_t	inner_network_header;
 	sk_buff_data_t	inner_mac_header;
 	void (*fix_segment)(struct sk_buff *);
+#if LINUX_VERSION_CODE < KERNEL_VERSION(3,11,0)
+	__be16			inner_protocol;
+#endif
 };
 #define OVS_GSO_CB(skb) ((struct ovs_gso_cb *)(skb)->cb)
 
@@ -69,4 +73,41 @@ static inline void skb_reset_inner_headers(struct sk_buff *skb)
 
 #define ip_local_out rpl_ip_local_out
 int ip_local_out(struct sk_buff *skb);
+
+#if LINUX_VERSION_CODE < KERNEL_VERSION(3,11,0)
+static inline void ovs_skb_init_inner_protocol(struct sk_buff *skb) {
+	OVS_GSO_CB(skb)->inner_protocol = htons(0);
+}
+
+static inline void ovs_skb_set_inner_protocol(struct sk_buff *skb,
+					      __be16 ethertype) {
+	OVS_GSO_CB(skb)->inner_protocol = ethertype;
+}
+
+static inline __be16 ovs_skb_get_inner_protocol(struct sk_buff *skb)
+{
+	return OVS_GSO_CB(skb)->inner_protocol;
+}
+
+#else
+
+static inline void ovs_skb_init_inner_protocol(struct sk_buff *skb) {
+	/* Nothing to do. The inner_protocol is either zero or
+	 * has been set to a value by another user.
+	 * Either way it may be considered initialised.
+	 */
+}
+
+static inline void ovs_skb_set_inner_protocol(struct sk_buff *skb,
+					      __be16 ethertype)
+{
+	skb->inner_protocol = ethertype;
+}
+
+static inline __be16 ovs_skb_get_inner_protocol(struct sk_buff *skb)
+{
+	return skb->inner_protocol;
+}
+#endif
+
 #endif
diff --git a/datapath/linux/compat/include/linux/netdevice.h b/datapath/linux/compat/include/linux/netdevice.h
index 2b2c855..dd47af1 100644
--- a/datapath/linux/compat/include/linux/netdevice.h
+++ b/datapath/linux/compat/include/linux/netdevice.h
@@ -73,10 +73,12 @@ static inline struct net_device *dev_get_by_index_rcu(struct net *net, int ifind
 #define NETIF_F_FSO 0
 #endif
 
-#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,38)
+#if LINUX_VERSION_CODE < KERNEL_VERSION(3,11,0)
 #define skb_gso_segment rpl_skb_gso_segment
 struct sk_buff *rpl_skb_gso_segment(struct sk_buff *skb, u32 features);
+#endif
 
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,38)
 #define netif_skb_features rpl_netif_skb_features
 u32 rpl_netif_skb_features(struct sk_buff *skb);
 
@@ -120,7 +122,7 @@ static inline void netdev_upper_dev_unlink(struct net_device *dev,
 }
 #endif
 
-#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,37)
+#if LINUX_VERSION_CODE < KERNEL_VERSION(3,11,0)
 #define dev_queue_xmit rpl_dev_queue_xmit
 int dev_queue_xmit(struct sk_buff *skb);
 #endif
diff --git a/datapath/linux/compat/netdevice.c b/datapath/linux/compat/netdevice.c
index 248066d..7c1f045 100644
--- a/datapath/linux/compat/netdevice.c
+++ b/datapath/linux/compat/netdevice.c
@@ -1,6 +1,9 @@
 #include <linux/netdevice.h>
 #include <linux/if_vlan.h>
 
+#include "mpls.h"
+#include "gso.h"
+
 #ifdef HAVE_RHEL_OVS_HOOK
 int nr_bridges = 0;
 #endif
@@ -71,7 +74,9 @@ u32 rpl_netif_skb_features(struct sk_buff *skb)
 		return harmonize_features(skb, protocol, features);
 	}
 }
+#endif	/* kernel version < 2.6.38 */
 
+#if LINUX_VERSION_CODE < KERNEL_VERSION(3,11,0)
 struct sk_buff *rpl_skb_gso_segment(struct sk_buff *skb, u32 features)
 {
 	int vlan_depth = ETH_HLEN;
@@ -79,6 +84,9 @@ struct sk_buff *rpl_skb_gso_segment(struct sk_buff *skb, u32 features)
 	__be16 skb_proto;
 	struct sk_buff *skb_gso;
 
+	if (eth_p_mpls(skb->protocol))
+		type = ovs_skb_get_inner_protocol(skb);
+
 	while (type == htons(ETH_P_8021Q)) {
 		struct vlan_hdr *vh;
 
@@ -99,4 +107,4 @@ struct sk_buff *rpl_skb_gso_segment(struct sk_buff *skb, u32 features)
 	skb->protocol = skb_proto;
 	return skb_gso;
 }
-#endif	/* kernel version < 2.6.38 */
+#endif	/* kernel version < 3.11.0 */
diff --git a/datapath/mpls.h b/datapath/mpls.h
new file mode 100644
index 0000000..7eab104
--- /dev/null
+++ b/datapath/mpls.h
@@ -0,0 +1,15 @@
+#ifndef MPLS_H
+#define MPLS_H 1
+
+#include <linux/if_ether.h>
+
+#define MPLS_BOS_MASK	0x00000100
+#define MPLS_HLEN 4
+
+static inline bool eth_p_mpls(__be16 eth_type)
+{
+	return eth_type == htons(ETH_P_MPLS_UC) ||
+		eth_type == htons(ETH_P_MPLS_MC);
+}
+
+#endif
diff --git a/include/linux/openvswitch.h b/include/linux/openvswitch.h
index 09c26b5..1ef98a8 100644
--- a/include/linux/openvswitch.h
+++ b/include/linux/openvswitch.h
@@ -283,14 +283,13 @@ enum ovs_key_attr {
 	OVS_KEY_ATTR_SKB_MARK,  /* u32 skb mark */
 	OVS_KEY_ATTR_TUNNEL,	/* Nested set of ovs_tunnel attributes */
 	OVS_KEY_ATTR_SCTP,      /* struct ovs_key_sctp */
+	OVS_KEY_ATTR_MPLS,      /* array of struct ovs_key_mpls.
+				 * The implementation may restrict
+				 * the accepted length of the array. */
 
 #ifdef __KERNEL__
 	OVS_KEY_ATTR_IPV4_TUNNEL,  /* struct ovs_key_ipv4_tunnel */
 #endif
-
-	OVS_KEY_ATTR_MPLS = 62, /* array of struct ovs_key_mpls.
-				 * The implementation may restrict
-				 * the accepted length of the array. */
 	__OVS_KEY_ATTR_MAX
 };
 
-- 
1.8.4

^ permalink raw reply related

* Re: [PATCH RFC 00/77] Re-design MSI/MSI-X interrupts enablement pattern
From: Alexander Gordeev @ 2013-10-04  8:29 UTC (permalink / raw)
  To: Ben Hutchings
  Cc: linux-kernel, Bjorn Helgaas, Ralf Baechle, Michael Ellerman,
	Benjamin Herrenschmidt, Martin Schwidefsky, Ingo Molnar,
	Tejun Heo, Dan Williams, Andy King, Jon Mason, Matt Porter,
	linux-pci, linux-mips, linuxppc-dev, linux390, linux-s390, x86,
	linux-ide, iss_storagedev, linux-nvme, linux-rdma, netdev,
	e1000-devel, linux-dr
In-Reply-To: <1380840585.3419.50.camel@bwh-desktop.uk.level5networks.com>

On Thu, Oct 03, 2013 at 11:49:45PM +0100, Ben Hutchings wrote:
> On Wed, 2013-10-02 at 12:48 +0200, Alexander Gordeev wrote:
> > This update converts pci_enable_msix() and pci_enable_msi_block()
> > interfaces to canonical kernel functions and makes them return a
> > error code in case of failure or 0 in case of success.
> [...]
> 
> I think this is fundamentally flawed: pci_msix_table_size() and
> pci_get_msi_cap() can only report the limits of the *device* (which the
> driver usually already knows), whereas MSI allocation can also be
> constrained due to *global* limits on the number of distinct IRQs.

Even the current implementation by no means addresses it. Although it
might seem a case for architectures to report the number of IRQs available
for a driver to retry, in fact they all just fail. The same applies to
*any* other type of resource involved: irq_desc's, CPU interrupt vector
space, msi_desc's etc. No platform cares about it and just bails out once
a constrain met (please correct me if I am wrong here). Given that Linux
has been doing well even on embedded I think we should not change it.

The only exception to the above is pSeries platform which takes advantage
of the current design (to implement MSI quota). There are indications we
can satisfy pSeries requirements, but the design proposed in this RFC
is not going to change drastically anyway. The start of the discusstion
is here: https://lkml.org/lkml/2013/9/5/293

> Currently pci_enable_msix() will report a positive value if it fails due
> to the global limit.  Your patch 7 removes that.  pci_enable_msi_block()
> unfortunately doesn't appear to do this.

pci_enable_msi_block() can do more than one MSI only on x86 (with IOMMU),
but it does not bother to return positive numbers, indeed.

> It seems to me that a more useful interface would take a minimum and
> maximum number of vectors from the driver.  This wouldn't allow the
> driver to specify that it could only accept, say, any even number within
> a certain range, but you could still leave the current functions
> available for any driver that needs that.

Mmmm.. I am not sure I am getting it. Could you please rephrase?

> Ben.

-- 
Regards,
Alexander Gordeev
agordeev@redhat.com

^ permalink raw reply

* Re: [PATCH RFC 00/77] Re-design MSI/MSI-X interrupts enablement pattern
From: David Laight @ 2013-10-04  8:31 UTC (permalink / raw)
  To: Alexander Gordeev, Ben Hutchings
  Cc: linux-mips, VMware, Inc., linux-pci, linux-nvme, linux-ide,
	Martin Schwidefsky, linux-s390, Andy King, linux-scsi,
	e1000-devel, x86, Ingo Molnar, iss_storagedev, Tejun Heo,
	Bjorn Helgaas, Dan Williams, Jon Mason,
	Solarflare linux maintainers, netdev, linux-kernel, Ralf Baechle,
	linux-rdma, linux-driver, linux390, linuxppc-dev
In-Reply-To: <20131004082920.GA4536@dhcp-26-207.brq.redhat.com>

> > It seems to me that a more useful interface would take a minimum and
> > maximum number of vectors from the driver.  This wouldn't allow the
> > driver to specify that it could only accept, say, any even number within
> > a certain range, but you could still leave the current functions
> > available for any driver that needs that.
> 
> Mmmm.. I am not sure I am getting it. Could you please rephrase?

One possibility is for drivers than can use a lot of interrupts to
request a minimum number initially and then request the additional
ones much later on.
That would make it less likely that none will be available for
devices/drivers that need them but are initialised later.

	David




------------------------------------------------------------------------------
October Webinars: Code for Performance
Free Intel webinars can help you accelerate application performance.
Explore tips for MPI, OpenMP, advanced profiling, and more. Get the most from 
the latest Intel processors and coprocessors. See abstracts and register >
http://pubads.g.doubleclick.net/gampad/clk?id=60134791&iu=/4140/ostg.clktrk
_______________________________________________
E1000-devel mailing list
E1000-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/e1000-devel
To learn more about Intel&#174; Ethernet, visit http://communities.intel.com/community/wired

^ permalink raw reply

* [PATCH 00/33] Netfilter updates for net-next
From: Pablo Neira Ayuso @ 2013-10-04  8:32 UTC (permalink / raw)
  To: netfilter-devel; +Cc: davem, netdev

Hi David,

The following patchset contains Netfilter updates for your net-next tree,
mostly ipset improvements and enhancements features, they are:

* Don't call ip_nest_end needlessly in the error path from me, suggested
  by Pablo Neira Ayuso, from Jozsef Kadlecsik.

* Fixed sparse warnings about shadowed variable and missing rcu annotation
  and fix of "may be used uninitialized" warnings, also from Jozsef.

* Renamed simple macro names to avoid namespace issues, reported by David
  Laight, again from Jozsef.

* Use fix sized type for timeout in the extension part, and cosmetic
  ordering of matches and targets separatedly in xt_set.c, from Jozsef.

* Support package fragments for IPv4 protos without ports from Anders K.
  Pedersen. For example this allows a hash:ip,port ipset containing the
  entry 192.168.0.1,gre:0 to match all package fragments for PPTP VPN
  tunnels to/from the host. Without this patch only the first package
  fragment (with fragment offset 0) was matched.

* Introduced a new operation to get both setname and family, from Jozsef.
  ip[6]tables set match and SET target need to know the family of the set
  in order to reject adding rules which refer to a set with a non-mathcing
  family. Currently such rules are silently accepted and then ignored
  instead of generating an error message to the user.

* Reworked extensions support in ipset types from Jozsef. The approach of
  defining structures with all variations is not manageable as the
  number of extensions grows. Therefore a blob for the extensions is
  introduced, somewhat similar to conntrack. The support of extensions
  which need a per data destroy function is added as well.

* When an element timed out in a list:set type of set, the garbage
  collector skipped the checking of the next element. So the purging
  was delayed to the next run of the gc, fixed by Jozsef.

* A small Kconfig fix: NETFILTER_NETLINK cannot be selected and
  ipset requires it.

* hash:net,net type from Oliver Smith. The type provides the ability to
  store pairs of subnets in a set.

* Comment for ipset entries from Oliver Smith. This makes possible to
  annotate entries in a set with comments, for example:

  ipset n foo hash:net,net comment
  ipset a foo 10.0.0.0/21,192.168.1.0/24 comment "office nets A and B"

* Fix of hash types resizing with comment extension from Jozsef.

* Fix of new extensions for list:set type when an element is added
  into a slot from where another element was pushed away from Jozsef.

* Introduction of a common function for the listing of the element
  extensions from Jozsef.

* Net namespace support for ipset from Vitaly Lavrov.

* hash:net,port,net type from Oliver Smith, which makes possible
  to store the triples of two subnets and a protocol, port pair in
  a set.

* Get xt_TCPMSS working with net namespace, by Gao feng.

* Use the proper net netnamespace to allocate skbs, also by Gao feng.

* A couple of cleanups for the conntrack SIP helper, by Holger
  Eitzenberger.

* Extend cttimeout to allow setting default conntrack timeouts via
  nfnetlink, so we can get rid of all our sysctl/proc interfaces in
  the future for timeout tuning, from me.

You can pull these changes from:

  git://git.kernel.org/pub/scm/linux/kernel/git/pablo/nf-next.git master

Thanks!

----------------------------------------------------------------

The following changes since commit 8ce440610357b77587433d0df647cea69a6890a8:

  ipv6: do not allow ipv6 module to be removed (2013-09-24 11:31:58 -0400)

are available in the git repository at:

  git://git.kernel.org/pub/scm/linux/kernel/git/pablo/nf-next.git master

for you to fetch changes up to 91cb498e6a34b429a032f8cfbb57dde28cd20e0c:

  netfilter: cttimeout: allow to set/get default protocol timeouts (2013-10-01 13:17:39 +0200)

----------------------------------------------------------------
Anders K. Pedersen (1):
      netfilter: ipset: Support package fragments for IPv4 protos without ports

Gao feng (4):
      netfilter: xt_TCPMSS: Get mtu only if clamp-mss-to-pmtu is specified
      netfilter: xt_TCPMSS: lookup route from proper net namespace
      netfilter: nfnetlink_queue: use proper net namespace to allocate skb
      netfilter: nfnetlink_log: use proper net to allocate skb

Jozsef Kadlecsik (18):
      netfilter: ipset: Don't call ip_nest_end needlessly in the error path
      netfilter: ipset: Sparse warning about shadowed variable fixed
      netfilter: ipset: Fix sparse warnings due to missing rcu annotations
      netfilter: ipset: Rename simple macro names to avoid namespace issues.
      netfilter: ipset: Fix "may be used uninitialized" warnings
      netfilter: ipset: Use fix sized type for timeout in the extension part
      netfilter: ipset: order matches and targets separatedly in xt_set.c
      netfilter: ipset: Introduce new operation to get both setname and family
      netfilter: ipset: Prepare ipset to support multiple networks for hash types
      netfilter: ipset: Rename extension offset ids to extension ids
      netfilter: ipset: Move extension data to set structure
      netfilter: ipset: Generalize extensions support
      netfilter: ipset: Support extensions which need a per data destroy function
      netfilter: ipset: list:set: make sure all elements are checked by the gc
      netfilter: ipset: Kconfig: ipset needs NETFILTER_NETLINK
      netfilter: ipset: Fix hash resizing with comments
      netfilter: ipset: For set:list types, replaced elements must be zeroed out
      netfilter: ipset: Use a common function at listing the extensions

Oliver Smith (6):
      netfilter: ipset: Add hash:net,net module to kernel.
      netfilter: ipset: Support comments for ipset entries in the core.
      netfilter: ipset: Support comments in bitmap-type ipsets.
      netfilter: ipset: Support comments in the list-type ipset.
      netfilter: ipset: Support comments in hash-type ipsets.
      netfilter: ipset: Add hash:net,port,net module to kernel.

Pablo Neira Ayuso (1):
      netfilter: cttimeout: allow to set/get default protocol timeouts

Vitaly Lavrov (1):
      netfiler: ipset: Add net namespace for ipset

holger@eitzenberger.org (2):
      netfilter: nf_ct_sip: extend RCU read lock in set_expected_rtp_rtcp()
      netfilter: nf_ct_sip: consolidate NAT hook functions

 include/linux/netfilter/ipset/ip_set.h             |  151 ++++-
 include/linux/netfilter/ipset/ip_set_comment.h     |   57 ++
 include/linux/netfilter/ipset/ip_set_timeout.h     |    4 +-
 include/linux/netfilter/nf_conntrack_sip.h         |  107 ++--
 include/uapi/linux/netfilter/ipset/ip_set.h        |   16 +-
 include/uapi/linux/netfilter/nfnetlink_cttimeout.h |    2 +
 net/netfilter/ipset/Kconfig                        |   20 +-
 net/netfilter/ipset/Makefile                       |    2 +
 net/netfilter/ipset/ip_set_bitmap_gen.h            |  163 +++---
 net/netfilter/ipset/ip_set_bitmap_ip.c             |  125 ++---
 net/netfilter/ipset/ip_set_bitmap_ipmac.c          |  156 ++----
 net/netfilter/ipset/ip_set_bitmap_port.c           |  112 +---
 net/netfilter/ipset/ip_set_core.c                  |  361 ++++++++----
 net/netfilter/ipset/ip_set_getport.c               |   18 +-
 net/netfilter/ipset/ip_set_hash_gen.h              |  526 ++++++++---------
 net/netfilter/ipset/ip_set_hash_ip.c               |   58 +-
 net/netfilter/ipset/ip_set_hash_ipport.c           |   80 +--
 net/netfilter/ipset/ip_set_hash_ipportip.c         |   86 +--
 net/netfilter/ipset/ip_set_hash_ipportnet.c        |  108 +---
 net/netfilter/ipset/ip_set_hash_net.c              |   85 +--
 net/netfilter/ipset/ip_set_hash_netiface.c         |   98 +---
 net/netfilter/ipset/ip_set_hash_netnet.c           |  483 ++++++++++++++++
 net/netfilter/ipset/ip_set_hash_netport.c          |   92 +--
 net/netfilter/ipset/ip_set_hash_netportnet.c       |  588 ++++++++++++++++++++
 net/netfilter/ipset/ip_set_list_set.c              |  263 ++++-----
 net/netfilter/nf_conntrack_sip.c                   |  133 ++---
 net/netfilter/nf_nat_sip.c                         |   35 +-
 net/netfilter/nfnetlink_cttimeout.c                |  161 +++++-
 net/netfilter/nfnetlink_log.c                      |   11 +-
 net/netfilter/nfnetlink_queue_core.c               |    6 +-
 net/netfilter/xt_TCPMSS.c                          |   72 +--
 net/netfilter/xt_set.c                             |  222 ++++----
 net/sched/em_ipset.c                               |    7 +-
 33 files changed, 2677 insertions(+), 1731 deletions(-)
 create mode 100644 include/linux/netfilter/ipset/ip_set_comment.h
 create mode 100644 net/netfilter/ipset/ip_set_hash_netnet.c
 create mode 100644 net/netfilter/ipset/ip_set_hash_netportnet.c

^ permalink raw reply

* [PATCH 01/33] netfilter: nf_ct_sip: extend RCU read lock in set_expected_rtp_rtcp()
From: Pablo Neira Ayuso @ 2013-10-04  8:32 UTC (permalink / raw)
  To: netfilter-devel; +Cc: davem, netdev
In-Reply-To: <1380875598-5250-1-git-send-email-pablo@netfilter.org>

From: "holger@eitzenberger.org" <holger@eitzenberger.org>

Currently set_expected_rtp_rtcp() in the SIP helper uses
rcu_dereference() two times to access two different NAT hook
functions. However, only the first one is protected by the RCU
reader lock, but the 2nd isn't. Fix it by extending the RCU
protected area.

This is more a cosmetic thing since we rely on all netfilter hooks
being rcu_read_lock()ed by nf_hook_slow() in many places anyways,
as Patrick McHardy clarified.

Signed-off-by: Holger Eitzenberger <holger.eitzenberger@sophos.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nf_conntrack_sip.c |    6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/net/netfilter/nf_conntrack_sip.c b/net/netfilter/nf_conntrack_sip.c
index e0c4373..5ed8c44 100644
--- a/net/netfilter/nf_conntrack_sip.c
+++ b/net/netfilter/nf_conntrack_sip.c
@@ -966,7 +966,6 @@ static int set_expected_rtp_rtcp(struct sk_buff *skb, unsigned int protoff,
 #endif
 			skip_expect = 1;
 	} while (!skip_expect);
-	rcu_read_unlock();
 
 	base_port = ntohs(tuple.dst.u.udp.port) & ~1;
 	rtp_port = htons(base_port);
@@ -980,8 +979,10 @@ static int set_expected_rtp_rtcp(struct sk_buff *skb, unsigned int protoff,
 			goto err1;
 	}
 
-	if (skip_expect)
+	if (skip_expect) {
+		rcu_read_unlock();
 		return NF_ACCEPT;
+	}
 
 	rtp_exp = nf_ct_expect_alloc(ct);
 	if (rtp_exp == NULL)
@@ -1012,6 +1013,7 @@ static int set_expected_rtp_rtcp(struct sk_buff *skb, unsigned int protoff,
 err2:
 	nf_ct_expect_put(rtp_exp);
 err1:
+	rcu_read_unlock();
 	return ret;
 }
 
-- 
1.7.10.4


^ permalink raw reply related

* [PATCH 02/33] netfilter: xt_TCPMSS: Get mtu only if clamp-mss-to-pmtu is specified
From: Pablo Neira Ayuso @ 2013-10-04  8:32 UTC (permalink / raw)
  To: netfilter-devel; +Cc: davem, netdev
In-Reply-To: <1380875598-5250-1-git-send-email-pablo@netfilter.org>

From: Gao feng <gaofeng@cn.fujitsu.com>

This patch refactors the code to skip tcpmss_reverse_mtu if no
clamp-mss-to-pmtu is specified.

Signed-off-by: Gao feng <gaofeng@cn.fujitsu.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/xt_TCPMSS.c |   70 +++++++++++++++++++++++----------------------
 1 file changed, 36 insertions(+), 34 deletions(-)

diff --git a/net/netfilter/xt_TCPMSS.c b/net/netfilter/xt_TCPMSS.c
index cd24290..62776de 100644
--- a/net/netfilter/xt_TCPMSS.c
+++ b/net/netfilter/xt_TCPMSS.c
@@ -43,10 +43,41 @@ optlen(const u_int8_t *opt, unsigned int offset)
 		return opt[offset+1];
 }
 
+static u_int32_t tcpmss_reverse_mtu(const struct sk_buff *skb,
+				    unsigned int family)
+{
+	struct flowi fl;
+	const struct nf_afinfo *ai;
+	struct rtable *rt = NULL;
+	u_int32_t mtu     = ~0U;
+
+	if (family == PF_INET) {
+		struct flowi4 *fl4 = &fl.u.ip4;
+		memset(fl4, 0, sizeof(*fl4));
+		fl4->daddr = ip_hdr(skb)->saddr;
+	} else {
+		struct flowi6 *fl6 = &fl.u.ip6;
+
+		memset(fl6, 0, sizeof(*fl6));
+		fl6->daddr = ipv6_hdr(skb)->saddr;
+	}
+	rcu_read_lock();
+	ai = nf_get_afinfo(family);
+	if (ai != NULL)
+		ai->route(&init_net, (struct dst_entry **)&rt, &fl, false);
+	rcu_read_unlock();
+
+	if (rt != NULL) {
+		mtu = dst_mtu(&rt->dst);
+		dst_release(&rt->dst);
+	}
+	return mtu;
+}
+
 static int
 tcpmss_mangle_packet(struct sk_buff *skb,
 		     const struct xt_action_param *par,
-		     unsigned int in_mtu,
+		     unsigned int family,
 		     unsigned int tcphoff,
 		     unsigned int minlen)
 {
@@ -76,6 +107,8 @@ tcpmss_mangle_packet(struct sk_buff *skb,
 		return -1;
 
 	if (info->mss == XT_TCPMSS_CLAMP_PMTU) {
+		unsigned int in_mtu = tcpmss_reverse_mtu(skb, family);
+
 		if (dst_mtu(skb_dst(skb)) <= minlen) {
 			net_err_ratelimited("unknown or invalid path-MTU (%u)\n",
 					    dst_mtu(skb_dst(skb)));
@@ -165,37 +198,6 @@ tcpmss_mangle_packet(struct sk_buff *skb,
 	return TCPOLEN_MSS;
 }
 
-static u_int32_t tcpmss_reverse_mtu(const struct sk_buff *skb,
-				    unsigned int family)
-{
-	struct flowi fl;
-	const struct nf_afinfo *ai;
-	struct rtable *rt = NULL;
-	u_int32_t mtu     = ~0U;
-
-	if (family == PF_INET) {
-		struct flowi4 *fl4 = &fl.u.ip4;
-		memset(fl4, 0, sizeof(*fl4));
-		fl4->daddr = ip_hdr(skb)->saddr;
-	} else {
-		struct flowi6 *fl6 = &fl.u.ip6;
-
-		memset(fl6, 0, sizeof(*fl6));
-		fl6->daddr = ipv6_hdr(skb)->saddr;
-	}
-	rcu_read_lock();
-	ai = nf_get_afinfo(family);
-	if (ai != NULL)
-		ai->route(&init_net, (struct dst_entry **)&rt, &fl, false);
-	rcu_read_unlock();
-
-	if (rt != NULL) {
-		mtu = dst_mtu(&rt->dst);
-		dst_release(&rt->dst);
-	}
-	return mtu;
-}
-
 static unsigned int
 tcpmss_tg4(struct sk_buff *skb, const struct xt_action_param *par)
 {
@@ -204,7 +206,7 @@ tcpmss_tg4(struct sk_buff *skb, const struct xt_action_param *par)
 	int ret;
 
 	ret = tcpmss_mangle_packet(skb, par,
-				   tcpmss_reverse_mtu(skb, PF_INET),
+				   PF_INET,
 				   iph->ihl * 4,
 				   sizeof(*iph) + sizeof(struct tcphdr));
 	if (ret < 0)
@@ -233,7 +235,7 @@ tcpmss_tg6(struct sk_buff *skb, const struct xt_action_param *par)
 	if (tcphoff < 0)
 		return NF_DROP;
 	ret = tcpmss_mangle_packet(skb, par,
-				   tcpmss_reverse_mtu(skb, PF_INET6),
+				   PF_INET6,
 				   tcphoff,
 				   sizeof(*ipv6h) + sizeof(struct tcphdr));
 	if (ret < 0)
-- 
1.7.10.4


^ permalink raw reply related

* [PATCH 03/33] netfilter: xt_TCPMSS: lookup route from proper net namespace
From: Pablo Neira Ayuso @ 2013-10-04  8:32 UTC (permalink / raw)
  To: netfilter-devel; +Cc: davem, netdev
In-Reply-To: <1380875598-5250-1-git-send-email-pablo@netfilter.org>

From: Gao feng <gaofeng@cn.fujitsu.com>

Otherwise the pmtu will be incorrect.

Signed-off-by: Gao feng <gaofeng@cn.fujitsu.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/xt_TCPMSS.c |    8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/net/netfilter/xt_TCPMSS.c b/net/netfilter/xt_TCPMSS.c
index 62776de..e762de5 100644
--- a/net/netfilter/xt_TCPMSS.c
+++ b/net/netfilter/xt_TCPMSS.c
@@ -43,7 +43,8 @@ optlen(const u_int8_t *opt, unsigned int offset)
 		return opt[offset+1];
 }
 
-static u_int32_t tcpmss_reverse_mtu(const struct sk_buff *skb,
+static u_int32_t tcpmss_reverse_mtu(struct net *net,
+				    const struct sk_buff *skb,
 				    unsigned int family)
 {
 	struct flowi fl;
@@ -64,7 +65,7 @@ static u_int32_t tcpmss_reverse_mtu(const struct sk_buff *skb,
 	rcu_read_lock();
 	ai = nf_get_afinfo(family);
 	if (ai != NULL)
-		ai->route(&init_net, (struct dst_entry **)&rt, &fl, false);
+		ai->route(net, (struct dst_entry **)&rt, &fl, false);
 	rcu_read_unlock();
 
 	if (rt != NULL) {
@@ -107,7 +108,8 @@ tcpmss_mangle_packet(struct sk_buff *skb,
 		return -1;
 
 	if (info->mss == XT_TCPMSS_CLAMP_PMTU) {
-		unsigned int in_mtu = tcpmss_reverse_mtu(skb, family);
+		struct net *net = dev_net(par->in ? par->in : par->out);
+		unsigned int in_mtu = tcpmss_reverse_mtu(net, skb, family);
 
 		if (dst_mtu(skb_dst(skb)) <= minlen) {
 			net_err_ratelimited("unknown or invalid path-MTU (%u)\n",
-- 
1.7.10.4


^ permalink raw reply related

* [PATCH 04/33] netfilter: ipset: Don't call ip_nest_end needlessly in the error path
From: Pablo Neira Ayuso @ 2013-10-04  8:32 UTC (permalink / raw)
  To: netfilter-devel; +Cc: davem, netdev
In-Reply-To: <1380875598-5250-1-git-send-email-pablo@netfilter.org>

From: Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>

Suggested-by: Pablo Neira Ayuso <pablo@netfilter.org>
Signed-off-by: Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
---
 net/netfilter/ipset/ip_set_bitmap_gen.h |    2 +-
 net/netfilter/ipset/ip_set_hash_gen.h   |    2 +-
 net/netfilter/ipset/ip_set_list_set.c   |    2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/net/netfilter/ipset/ip_set_bitmap_gen.h b/net/netfilter/ipset/ip_set_bitmap_gen.h
index 2524337..f6af97c 100644
--- a/net/netfilter/ipset/ip_set_bitmap_gen.h
+++ b/net/netfilter/ipset/ip_set_bitmap_gen.h
@@ -228,11 +228,11 @@ mtype_list(const struct ip_set *set,
 
 nla_put_failure:
 	nla_nest_cancel(skb, nested);
-	ipset_nest_end(skb, adt);
 	if (unlikely(id == first)) {
 		cb->args[2] = 0;
 		return -EMSGSIZE;
 	}
+	ipset_nest_end(skb, adt);
 	return 0;
 }
 
diff --git a/net/netfilter/ipset/ip_set_hash_gen.h b/net/netfilter/ipset/ip_set_hash_gen.h
index 707bc52..7ff20ec 100644
--- a/net/netfilter/ipset/ip_set_hash_gen.h
+++ b/net/netfilter/ipset/ip_set_hash_gen.h
@@ -909,13 +909,13 @@ mtype_list(const struct ip_set *set,
 
 nla_put_failure:
 	nlmsg_trim(skb, incomplete);
-	ipset_nest_end(skb, atd);
 	if (unlikely(first == cb->args[2])) {
 		pr_warning("Can't list set %s: one bucket does not fit into "
 			   "a message. Please report it!\n", set->name);
 		cb->args[2] = 0;
 		return -EMSGSIZE;
 	}
+	ipset_nest_end(skb, atd);
 	return 0;
 }
 
diff --git a/net/netfilter/ipset/ip_set_list_set.c b/net/netfilter/ipset/ip_set_list_set.c
index 979b8c9..68299ee 100644
--- a/net/netfilter/ipset/ip_set_list_set.c
+++ b/net/netfilter/ipset/ip_set_list_set.c
@@ -550,11 +550,11 @@ finish:
 
 nla_put_failure:
 	nla_nest_cancel(skb, nested);
-	ipset_nest_end(skb, atd);
 	if (unlikely(i == first)) {
 		cb->args[2] = 0;
 		return -EMSGSIZE;
 	}
+	ipset_nest_end(skb, atd);
 	return 0;
 }
 
-- 
1.7.10.4


^ permalink raw reply related

* [PATCH 07/33] netfilter: ipset: Rename simple macro names to avoid namespace issues.
From: Pablo Neira Ayuso @ 2013-10-04  8:32 UTC (permalink / raw)
  To: netfilter-devel; +Cc: davem, netdev
In-Reply-To: <1380875598-5250-1-git-send-email-pablo@netfilter.org>

From: Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>

Reported-by: David Laight <David.Laight@ACULAB.COM>
Signed-off-by: Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
---
 include/linux/netfilter/ipset/ip_set.h      |    3 +
 net/netfilter/ipset/ip_set_bitmap_gen.h     |   47 ++++-----
 net/netfilter/ipset/ip_set_bitmap_ip.c      |   10 +-
 net/netfilter/ipset/ip_set_bitmap_ipmac.c   |   10 +-
 net/netfilter/ipset/ip_set_bitmap_port.c    |   10 +-
 net/netfilter/ipset/ip_set_hash_gen.h       |  147 ++++++++++++++-------------
 net/netfilter/ipset/ip_set_hash_ip.c        |   10 +-
 net/netfilter/ipset/ip_set_hash_ipport.c    |   12 +--
 net/netfilter/ipset/ip_set_hash_ipportip.c  |   12 +--
 net/netfilter/ipset/ip_set_hash_ipportnet.c |   16 +--
 net/netfilter/ipset/ip_set_hash_net.c       |   14 +--
 net/netfilter/ipset/ip_set_hash_netiface.c  |   14 +--
 net/netfilter/ipset/ip_set_hash_netport.c   |   16 +--
 net/netfilter/ipset/ip_set_list_set.c       |   10 +-
 14 files changed, 169 insertions(+), 162 deletions(-)

diff --git a/include/linux/netfilter/ipset/ip_set.h b/include/linux/netfilter/ipset/ip_set.h
index 9ac9fbd..f900f33 100644
--- a/include/linux/netfilter/ipset/ip_set.h
+++ b/include/linux/netfilter/ipset/ip_set.h
@@ -398,4 +398,7 @@ bitmap_bytes(u32 a, u32 b)
 	{ .bytes = ULLONG_MAX, .packets = ULLONG_MAX,	\
 	  .timeout = (map)->timeout }
 
+#define IPSET_CONCAT(a, b)		a##b
+#define IPSET_TOKEN(a, b)		IPSET_CONCAT(a, b)
+
 #endif /*_IP_SET_H */
diff --git a/net/netfilter/ipset/ip_set_bitmap_gen.h b/net/netfilter/ipset/ip_set_bitmap_gen.h
index f6af97c..d39905e 100644
--- a/net/netfilter/ipset/ip_set_bitmap_gen.h
+++ b/net/netfilter/ipset/ip_set_bitmap_gen.h
@@ -8,31 +8,28 @@
 #ifndef __IP_SET_BITMAP_IP_GEN_H
 #define __IP_SET_BITMAP_IP_GEN_H
 
-#define CONCAT(a, b)		a##b
-#define TOKEN(a,b)		CONCAT(a, b)
-
-#define mtype_do_test		TOKEN(MTYPE, _do_test)
-#define mtype_gc_test		TOKEN(MTYPE, _gc_test)
-#define mtype_is_filled		TOKEN(MTYPE, _is_filled)
-#define mtype_do_add		TOKEN(MTYPE, _do_add)
-#define mtype_do_del		TOKEN(MTYPE, _do_del)
-#define mtype_do_list		TOKEN(MTYPE, _do_list)
-#define mtype_do_head		TOKEN(MTYPE, _do_head)
-#define mtype_adt_elem		TOKEN(MTYPE, _adt_elem)
-#define mtype_add_timeout	TOKEN(MTYPE, _add_timeout)
-#define mtype_gc_init		TOKEN(MTYPE, _gc_init)
-#define mtype_kadt		TOKEN(MTYPE, _kadt)
-#define mtype_uadt		TOKEN(MTYPE, _uadt)
-#define mtype_destroy		TOKEN(MTYPE, _destroy)
-#define mtype_flush		TOKEN(MTYPE, _flush)
-#define mtype_head		TOKEN(MTYPE, _head)
-#define mtype_same_set		TOKEN(MTYPE, _same_set)
-#define mtype_elem		TOKEN(MTYPE, _elem)
-#define mtype_test		TOKEN(MTYPE, _test)
-#define mtype_add		TOKEN(MTYPE, _add)
-#define mtype_del		TOKEN(MTYPE, _del)
-#define mtype_list		TOKEN(MTYPE, _list)
-#define mtype_gc		TOKEN(MTYPE, _gc)
+#define mtype_do_test		IPSET_TOKEN(MTYPE, _do_test)
+#define mtype_gc_test		IPSET_TOKEN(MTYPE, _gc_test)
+#define mtype_is_filled		IPSET_TOKEN(MTYPE, _is_filled)
+#define mtype_do_add		IPSET_TOKEN(MTYPE, _do_add)
+#define mtype_do_del		IPSET_TOKEN(MTYPE, _do_del)
+#define mtype_do_list		IPSET_TOKEN(MTYPE, _do_list)
+#define mtype_do_head		IPSET_TOKEN(MTYPE, _do_head)
+#define mtype_adt_elem		IPSET_TOKEN(MTYPE, _adt_elem)
+#define mtype_add_timeout	IPSET_TOKEN(MTYPE, _add_timeout)
+#define mtype_gc_init		IPSET_TOKEN(MTYPE, _gc_init)
+#define mtype_kadt		IPSET_TOKEN(MTYPE, _kadt)
+#define mtype_uadt		IPSET_TOKEN(MTYPE, _uadt)
+#define mtype_destroy		IPSET_TOKEN(MTYPE, _destroy)
+#define mtype_flush		IPSET_TOKEN(MTYPE, _flush)
+#define mtype_head		IPSET_TOKEN(MTYPE, _head)
+#define mtype_same_set		IPSET_TOKEN(MTYPE, _same_set)
+#define mtype_elem		IPSET_TOKEN(MTYPE, _elem)
+#define mtype_test		IPSET_TOKEN(MTYPE, _test)
+#define mtype_add		IPSET_TOKEN(MTYPE, _add)
+#define mtype_del		IPSET_TOKEN(MTYPE, _del)
+#define mtype_list		IPSET_TOKEN(MTYPE, _list)
+#define mtype_gc		IPSET_TOKEN(MTYPE, _gc)
 #define mtype			MTYPE
 
 #define ext_timeout(e, m)	\
diff --git a/net/netfilter/ipset/ip_set_bitmap_ip.c b/net/netfilter/ipset/ip_set_bitmap_ip.c
index f1a8128..c2f89b1 100644
--- a/net/netfilter/ipset/ip_set_bitmap_ip.c
+++ b/net/netfilter/ipset/ip_set_bitmap_ip.c
@@ -25,12 +25,12 @@
 #include <linux/netfilter/ipset/ip_set.h>
 #include <linux/netfilter/ipset/ip_set_bitmap.h>
 
-#define REVISION_MIN	0
-#define REVISION_MAX	1	/* Counter support added */
+#define IPSET_TYPE_REV_MIN	0
+#define IPSET_TYPE_REV_MAX	1	/* Counter support added */
 
 MODULE_LICENSE("GPL");
 MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>");
-IP_SET_MODULE_DESC("bitmap:ip", REVISION_MIN, REVISION_MAX);
+IP_SET_MODULE_DESC("bitmap:ip", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX);
 MODULE_ALIAS("ip_set_bitmap:ip");
 
 #define MTYPE		bitmap_ip
@@ -401,8 +401,8 @@ static struct ip_set_type bitmap_ip_type __read_mostly = {
 	.features	= IPSET_TYPE_IP,
 	.dimension	= IPSET_DIM_ONE,
 	.family		= NFPROTO_IPV4,
-	.revision_min	= REVISION_MIN,
-	.revision_max	= REVISION_MAX,
+	.revision_min	= IPSET_TYPE_REV_MIN,
+	.revision_max	= IPSET_TYPE_REV_MAX,
 	.create		= bitmap_ip_create,
 	.create_policy	= {
 		[IPSET_ATTR_IP]		= { .type = NLA_NESTED },
diff --git a/net/netfilter/ipset/ip_set_bitmap_ipmac.c b/net/netfilter/ipset/ip_set_bitmap_ipmac.c
index 3b30e0b..1d6551c 100644
--- a/net/netfilter/ipset/ip_set_bitmap_ipmac.c
+++ b/net/netfilter/ipset/ip_set_bitmap_ipmac.c
@@ -25,12 +25,12 @@
 #include <linux/netfilter/ipset/ip_set.h>
 #include <linux/netfilter/ipset/ip_set_bitmap.h>
 
-#define REVISION_MIN	0
-#define REVISION_MAX	1	/* Counter support added */
+#define IPSET_TYPE_REV_MIN	0
+#define IPSET_TYPE_REV_MAX	1	/* Counter support added */
 
 MODULE_LICENSE("GPL");
 MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>");
-IP_SET_MODULE_DESC("bitmap:ip,mac", REVISION_MIN, REVISION_MAX);
+IP_SET_MODULE_DESC("bitmap:ip,mac", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX);
 MODULE_ALIAS("ip_set_bitmap:ip,mac");
 
 #define MTYPE		bitmap_ipmac
@@ -460,8 +460,8 @@ static struct ip_set_type bitmap_ipmac_type = {
 	.features	= IPSET_TYPE_IP | IPSET_TYPE_MAC,
 	.dimension	= IPSET_DIM_TWO,
 	.family		= NFPROTO_IPV4,
-	.revision_min	= REVISION_MIN,
-	.revision_max	= REVISION_MAX,
+	.revision_min	= IPSET_TYPE_REV_MIN,
+	.revision_max	= IPSET_TYPE_REV_MAX,
 	.create		= bitmap_ipmac_create,
 	.create_policy	= {
 		[IPSET_ATTR_IP]		= { .type = NLA_NESTED },
diff --git a/net/netfilter/ipset/ip_set_bitmap_port.c b/net/netfilter/ipset/ip_set_bitmap_port.c
index 8207d1f..b220489 100644
--- a/net/netfilter/ipset/ip_set_bitmap_port.c
+++ b/net/netfilter/ipset/ip_set_bitmap_port.c
@@ -20,12 +20,12 @@
 #include <linux/netfilter/ipset/ip_set_bitmap.h>
 #include <linux/netfilter/ipset/ip_set_getport.h>
 
-#define REVISION_MIN	0
-#define REVISION_MAX	1	/* Counter support added */
+#define IPSET_TYPE_REV_MIN	0
+#define IPSET_TYPE_REV_MAX	1	/* Counter support added */
 
 MODULE_LICENSE("GPL");
 MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>");
-IP_SET_MODULE_DESC("bitmap:port", REVISION_MIN, REVISION_MAX);
+IP_SET_MODULE_DESC("bitmap:port", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX);
 MODULE_ALIAS("ip_set_bitmap:port");
 
 #define MTYPE		bitmap_port
@@ -333,8 +333,8 @@ static struct ip_set_type bitmap_port_type = {
 	.features	= IPSET_TYPE_PORT,
 	.dimension	= IPSET_DIM_ONE,
 	.family		= NFPROTO_UNSPEC,
-	.revision_min	= REVISION_MIN,
-	.revision_max	= REVISION_MAX,
+	.revision_min	= IPSET_TYPE_REV_MIN,
+	.revision_max	= IPSET_TYPE_REV_MAX,
 	.create		= bitmap_port_create,
 	.create_policy	= {
 		[IPSET_ATTR_PORT]	= { .type = NLA_U16 },
diff --git a/net/netfilter/ipset/ip_set_hash_gen.h b/net/netfilter/ipset/ip_set_hash_gen.h
index 09a21dd..68b9cce 100644
--- a/net/netfilter/ipset/ip_set_hash_gen.h
+++ b/net/netfilter/ipset/ip_set_hash_gen.h
@@ -17,9 +17,6 @@
 
 #define rcu_dereference_bh_nfnl(p)	rcu_dereference_bh_check(p, 1)
 
-#define CONCAT(a, b)		a##b
-#define TOKEN(a, b)		CONCAT(a, b)
-
 /* Hashing which uses arrays to resolve clashing. The hash table is resized
  * (doubled) when searching becomes too long.
  * Internally jhash is used with the assumption that the size of the
@@ -222,41 +219,41 @@ hbucket_elem_add(struct hbucket *n, u8 ahash_max, size_t dsize)
 
 #undef HKEY
 
-#define mtype_data_equal	TOKEN(MTYPE, _data_equal)
+#define mtype_data_equal	IPSET_TOKEN(MTYPE, _data_equal)
 #ifdef IP_SET_HASH_WITH_NETS
-#define mtype_do_data_match	TOKEN(MTYPE, _do_data_match)
+#define mtype_do_data_match	IPSET_TOKEN(MTYPE, _do_data_match)
 #else
 #define mtype_do_data_match(d)	1
 #endif
-#define mtype_data_set_flags	TOKEN(MTYPE, _data_set_flags)
-#define mtype_data_reset_flags	TOKEN(MTYPE, _data_reset_flags)
-#define mtype_data_netmask	TOKEN(MTYPE, _data_netmask)
-#define mtype_data_list		TOKEN(MTYPE, _data_list)
-#define mtype_data_next		TOKEN(MTYPE, _data_next)
-#define mtype_elem		TOKEN(MTYPE, _elem)
-#define mtype_add_cidr		TOKEN(MTYPE, _add_cidr)
-#define mtype_del_cidr		TOKEN(MTYPE, _del_cidr)
-#define mtype_ahash_memsize	TOKEN(MTYPE, _ahash_memsize)
-#define mtype_flush		TOKEN(MTYPE, _flush)
-#define mtype_destroy		TOKEN(MTYPE, _destroy)
-#define mtype_gc_init		TOKEN(MTYPE, _gc_init)
-#define mtype_same_set		TOKEN(MTYPE, _same_set)
-#define mtype_kadt		TOKEN(MTYPE, _kadt)
-#define mtype_uadt		TOKEN(MTYPE, _uadt)
+#define mtype_data_set_flags	IPSET_TOKEN(MTYPE, _data_set_flags)
+#define mtype_data_reset_flags	IPSET_TOKEN(MTYPE, _data_reset_flags)
+#define mtype_data_netmask	IPSET_TOKEN(MTYPE, _data_netmask)
+#define mtype_data_list		IPSET_TOKEN(MTYPE, _data_list)
+#define mtype_data_next		IPSET_TOKEN(MTYPE, _data_next)
+#define mtype_elem		IPSET_TOKEN(MTYPE, _elem)
+#define mtype_add_cidr		IPSET_TOKEN(MTYPE, _add_cidr)
+#define mtype_del_cidr		IPSET_TOKEN(MTYPE, _del_cidr)
+#define mtype_ahash_memsize	IPSET_TOKEN(MTYPE, _ahash_memsize)
+#define mtype_flush		IPSET_TOKEN(MTYPE, _flush)
+#define mtype_destroy		IPSET_TOKEN(MTYPE, _destroy)
+#define mtype_gc_init		IPSET_TOKEN(MTYPE, _gc_init)
+#define mtype_same_set		IPSET_TOKEN(MTYPE, _same_set)
+#define mtype_kadt		IPSET_TOKEN(MTYPE, _kadt)
+#define mtype_uadt		IPSET_TOKEN(MTYPE, _uadt)
 #define mtype			MTYPE
 
-#define mtype_elem		TOKEN(MTYPE, _elem)
-#define mtype_add		TOKEN(MTYPE, _add)
-#define mtype_del		TOKEN(MTYPE, _del)
-#define mtype_test_cidrs	TOKEN(MTYPE, _test_cidrs)
-#define mtype_test		TOKEN(MTYPE, _test)
-#define mtype_expire		TOKEN(MTYPE, _expire)
-#define mtype_resize		TOKEN(MTYPE, _resize)
-#define mtype_head		TOKEN(MTYPE, _head)
-#define mtype_list		TOKEN(MTYPE, _list)
-#define mtype_gc		TOKEN(MTYPE, _gc)
-#define mtype_variant		TOKEN(MTYPE, _variant)
-#define mtype_data_match	TOKEN(MTYPE, _data_match)
+#define mtype_elem		IPSET_TOKEN(MTYPE, _elem)
+#define mtype_add		IPSET_TOKEN(MTYPE, _add)
+#define mtype_del		IPSET_TOKEN(MTYPE, _del)
+#define mtype_test_cidrs	IPSET_TOKEN(MTYPE, _test_cidrs)
+#define mtype_test		IPSET_TOKEN(MTYPE, _test)
+#define mtype_expire		IPSET_TOKEN(MTYPE, _expire)
+#define mtype_resize		IPSET_TOKEN(MTYPE, _resize)
+#define mtype_head		IPSET_TOKEN(MTYPE, _head)
+#define mtype_list		IPSET_TOKEN(MTYPE, _list)
+#define mtype_gc		IPSET_TOKEN(MTYPE, _gc)
+#define mtype_variant		IPSET_TOKEN(MTYPE, _variant)
+#define mtype_data_match	IPSET_TOKEN(MTYPE, _data_match)
 
 #ifndef HKEY_DATALEN
 #define HKEY_DATALEN		sizeof(struct mtype_elem)
@@ -941,13 +938,13 @@ nla_put_failure:
 }
 
 static int
-TOKEN(MTYPE, _kadt)(struct ip_set *set, const struct sk_buff *skb,
-	      const struct xt_action_param *par,
-	      enum ipset_adt adt, struct ip_set_adt_opt *opt);
+IPSET_TOKEN(MTYPE, _kadt)(struct ip_set *set, const struct sk_buff *skb,
+	    const struct xt_action_param *par,
+	    enum ipset_adt adt, struct ip_set_adt_opt *opt);
 
 static int
-TOKEN(MTYPE, _uadt)(struct ip_set *set, struct nlattr *tb[],
-	      enum ipset_adt adt, u32 *lineno, u32 flags, bool retried);
+IPSET_TOKEN(MTYPE, _uadt)(struct ip_set *set, struct nlattr *tb[],
+	    enum ipset_adt adt, u32 *lineno, u32 flags, bool retried);
 
 static const struct ip_set_type_variant mtype_variant = {
 	.kadt	= mtype_kadt,
@@ -967,7 +964,7 @@ static const struct ip_set_type_variant mtype_variant = {
 
 #ifdef IP_SET_EMIT_CREATE
 static int
-TOKEN(HTYPE, _create)(struct ip_set *set, struct nlattr *tb[], u32 flags)
+IPSET_TOKEN(HTYPE, _create)(struct ip_set *set, struct nlattr *tb[], u32 flags)
 {
 	u32 hashsize = IPSET_DEFAULT_HASHSIZE, maxelem = IPSET_DEFAULT_MAXELEM;
 	u32 cadt_flags = 0;
@@ -1045,9 +1042,9 @@ TOKEN(HTYPE, _create)(struct ip_set *set, struct nlattr *tb[], u32 flags)
 
 	set->data = h;
 	if (set->family ==  NFPROTO_IPV4)
-		set->variant = &TOKEN(HTYPE, 4_variant);
+		set->variant = &IPSET_TOKEN(HTYPE, 4_variant);
 	else
-		set->variant = &TOKEN(HTYPE, 6_variant);
+		set->variant = &IPSET_TOKEN(HTYPE, 6_variant);
 
 	if (tb[IPSET_ATTR_CADT_FLAGS])
 		cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]);
@@ -1058,64 +1055,74 @@ TOKEN(HTYPE, _create)(struct ip_set *set, struct nlattr *tb[], u32 flags)
 				ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]);
 			set->extensions |= IPSET_EXT_TIMEOUT;
 			if (set->family == NFPROTO_IPV4) {
-				h->dsize =
-					sizeof(struct TOKEN(HTYPE, 4ct_elem));
+				h->dsize = sizeof(struct
+					IPSET_TOKEN(HTYPE, 4ct_elem));
 				h->offset[IPSET_OFFSET_TIMEOUT] =
-					offsetof(struct TOKEN(HTYPE, 4ct_elem),
-						 timeout);
+					offsetof(struct
+						IPSET_TOKEN(HTYPE, 4ct_elem),
+						timeout);
 				h->offset[IPSET_OFFSET_COUNTER] =
-					offsetof(struct TOKEN(HTYPE, 4ct_elem),
-						 counter);
-				TOKEN(HTYPE, 4_gc_init)(set,
-					TOKEN(HTYPE, 4_gc));
+					offsetof(struct
+						IPSET_TOKEN(HTYPE, 4ct_elem),
+						counter);
+				IPSET_TOKEN(HTYPE, 4_gc_init)(set,
+					IPSET_TOKEN(HTYPE, 4_gc));
 			} else {
-				h->dsize =
-					sizeof(struct TOKEN(HTYPE, 6ct_elem));
+				h->dsize = sizeof(struct
+					IPSET_TOKEN(HTYPE, 6ct_elem));
 				h->offset[IPSET_OFFSET_TIMEOUT] =
-					offsetof(struct TOKEN(HTYPE, 6ct_elem),
-						 timeout);
+					offsetof(struct
+						IPSET_TOKEN(HTYPE, 6ct_elem),
+						timeout);
 				h->offset[IPSET_OFFSET_COUNTER] =
-					offsetof(struct TOKEN(HTYPE, 6ct_elem),
-						 counter);
-				TOKEN(HTYPE, 6_gc_init)(set,
-					TOKEN(HTYPE, 6_gc));
+					offsetof(struct
+						IPSET_TOKEN(HTYPE, 6ct_elem),
+						counter);
+				IPSET_TOKEN(HTYPE, 6_gc_init)(set,
+					IPSET_TOKEN(HTYPE, 6_gc));
 			}
 		} else {
 			if (set->family == NFPROTO_IPV4) {
 				h->dsize =
-					sizeof(struct TOKEN(HTYPE, 4c_elem));
+					sizeof(struct
+						IPSET_TOKEN(HTYPE, 4c_elem));
 				h->offset[IPSET_OFFSET_COUNTER] =
-					offsetof(struct TOKEN(HTYPE, 4c_elem),
-						 counter);
+					offsetof(struct
+						IPSET_TOKEN(HTYPE, 4c_elem),
+						counter);
 			} else {
 				h->dsize =
-					sizeof(struct TOKEN(HTYPE, 6c_elem));
+					sizeof(struct
+						IPSET_TOKEN(HTYPE, 6c_elem));
 				h->offset[IPSET_OFFSET_COUNTER] =
-					offsetof(struct TOKEN(HTYPE, 6c_elem),
-						 counter);
+					offsetof(struct
+						IPSET_TOKEN(HTYPE, 6c_elem),
+						counter);
 			}
 		}
 	} else if (tb[IPSET_ATTR_TIMEOUT]) {
 		h->timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]);
 		set->extensions |= IPSET_EXT_TIMEOUT;
 		if (set->family == NFPROTO_IPV4) {
-			h->dsize = sizeof(struct TOKEN(HTYPE, 4t_elem));
+			h->dsize = sizeof(struct IPSET_TOKEN(HTYPE, 4t_elem));
 			h->offset[IPSET_OFFSET_TIMEOUT] =
-				offsetof(struct TOKEN(HTYPE, 4t_elem),
+				offsetof(struct IPSET_TOKEN(HTYPE, 4t_elem),
 					 timeout);
-			TOKEN(HTYPE, 4_gc_init)(set, TOKEN(HTYPE, 4_gc));
+			IPSET_TOKEN(HTYPE, 4_gc_init)(set,
+				IPSET_TOKEN(HTYPE, 4_gc));
 		} else {
-			h->dsize = sizeof(struct TOKEN(HTYPE, 6t_elem));
+			h->dsize = sizeof(struct IPSET_TOKEN(HTYPE, 6t_elem));
 			h->offset[IPSET_OFFSET_TIMEOUT] =
-				offsetof(struct TOKEN(HTYPE, 6t_elem),
+				offsetof(struct IPSET_TOKEN(HTYPE, 6t_elem),
 					 timeout);
-			TOKEN(HTYPE, 6_gc_init)(set, TOKEN(HTYPE, 6_gc));
+			IPSET_TOKEN(HTYPE, 6_gc_init)(set,
+				IPSET_TOKEN(HTYPE, 6_gc));
 		}
 	} else {
 		if (set->family == NFPROTO_IPV4)
-			h->dsize = sizeof(struct TOKEN(HTYPE, 4_elem));
+			h->dsize = sizeof(struct IPSET_TOKEN(HTYPE, 4_elem));
 		else
-			h->dsize = sizeof(struct TOKEN(HTYPE, 6_elem));
+			h->dsize = sizeof(struct IPSET_TOKEN(HTYPE, 6_elem));
 	}
 
 	pr_debug("create %s hashsize %u (%u) maxelem %u: %p(%p)\n",
diff --git a/net/netfilter/ipset/ip_set_hash_ip.c b/net/netfilter/ipset/ip_set_hash_ip.c
index c74e6e1..de44fca 100644
--- a/net/netfilter/ipset/ip_set_hash_ip.c
+++ b/net/netfilter/ipset/ip_set_hash_ip.c
@@ -23,12 +23,12 @@
 #include <linux/netfilter/ipset/ip_set.h>
 #include <linux/netfilter/ipset/ip_set_hash.h>
 
-#define REVISION_MIN	0
-#define REVISION_MAX	1	/* Counters support */
+#define IPSET_TYPE_REV_MIN	0
+#define IPSET_TYPE_REV_MAX	1	/* Counters support */
 
 MODULE_LICENSE("GPL");
 MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>");
-IP_SET_MODULE_DESC("hash:ip", REVISION_MIN, REVISION_MAX);
+IP_SET_MODULE_DESC("hash:ip", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX);
 MODULE_ALIAS("ip_set_hash:ip");
 
 /* Type specific function prefix */
@@ -304,8 +304,8 @@ static struct ip_set_type hash_ip_type __read_mostly = {
 	.features	= IPSET_TYPE_IP,
 	.dimension	= IPSET_DIM_ONE,
 	.family		= NFPROTO_UNSPEC,
-	.revision_min	= REVISION_MIN,
-	.revision_max	= REVISION_MAX,
+	.revision_min	= IPSET_TYPE_REV_MIN,
+	.revision_max	= IPSET_TYPE_REV_MAX,
 	.create		= hash_ip_create,
 	.create_policy	= {
 		[IPSET_ATTR_HASHSIZE]	= { .type = NLA_U32 },
diff --git a/net/netfilter/ipset/ip_set_hash_ipport.c b/net/netfilter/ipset/ip_set_hash_ipport.c
index 7a2d2bd..b514ff4 100644
--- a/net/netfilter/ipset/ip_set_hash_ipport.c
+++ b/net/netfilter/ipset/ip_set_hash_ipport.c
@@ -24,13 +24,13 @@
 #include <linux/netfilter/ipset/ip_set_getport.h>
 #include <linux/netfilter/ipset/ip_set_hash.h>
 
-#define REVISION_MIN	0
-/*			1    SCTP and UDPLITE support added */
-#define REVISION_MAX	2 /* Counters support added */
+#define IPSET_TYPE_REV_MIN	0
+/*				1    SCTP and UDPLITE support added */
+#define IPSET_TYPE_REV_MAX	2 /* Counters support added */
 
 MODULE_LICENSE("GPL");
 MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>");
-IP_SET_MODULE_DESC("hash:ip,port", REVISION_MIN, REVISION_MAX);
+IP_SET_MODULE_DESC("hash:ip,port", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX);
 MODULE_ALIAS("ip_set_hash:ip,port");
 
 /* Type specific function prefix */
@@ -396,8 +396,8 @@ static struct ip_set_type hash_ipport_type __read_mostly = {
 	.features	= IPSET_TYPE_IP | IPSET_TYPE_PORT,
 	.dimension	= IPSET_DIM_TWO,
 	.family		= NFPROTO_UNSPEC,
-	.revision_min	= REVISION_MIN,
-	.revision_max	= REVISION_MAX,
+	.revision_min	= IPSET_TYPE_REV_MIN,
+	.revision_max	= IPSET_TYPE_REV_MAX,
 	.create		= hash_ipport_create,
 	.create_policy	= {
 		[IPSET_ATTR_HASHSIZE]	= { .type = NLA_U32 },
diff --git a/net/netfilter/ipset/ip_set_hash_ipportip.c b/net/netfilter/ipset/ip_set_hash_ipportip.c
index 34e8a1a..d05070d 100644
--- a/net/netfilter/ipset/ip_set_hash_ipportip.c
+++ b/net/netfilter/ipset/ip_set_hash_ipportip.c
@@ -24,13 +24,13 @@
 #include <linux/netfilter/ipset/ip_set_getport.h>
 #include <linux/netfilter/ipset/ip_set_hash.h>
 
-#define REVISION_MIN	0
-/*			1    SCTP and UDPLITE support added */
-#define REVISION_MAX	2 /* Counters support added */
+#define IPSET_TYPE_REV_MIN	0
+/*				1    SCTP and UDPLITE support added */
+#define IPSET_TYPE_REV_MAX	2 /* Counters support added */
 
 MODULE_LICENSE("GPL");
 MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>");
-IP_SET_MODULE_DESC("hash:ip,port,ip", REVISION_MIN, REVISION_MAX);
+IP_SET_MODULE_DESC("hash:ip,port,ip", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX);
 MODULE_ALIAS("ip_set_hash:ip,port,ip");
 
 /* Type specific function prefix */
@@ -414,8 +414,8 @@ static struct ip_set_type hash_ipportip_type __read_mostly = {
 	.features	= IPSET_TYPE_IP | IPSET_TYPE_PORT | IPSET_TYPE_IP2,
 	.dimension	= IPSET_DIM_THREE,
 	.family		= NFPROTO_UNSPEC,
-	.revision_min	= REVISION_MIN,
-	.revision_max	= REVISION_MAX,
+	.revision_min	= IPSET_TYPE_REV_MIN,
+	.revision_max	= IPSET_TYPE_REV_MAX,
 	.create		= hash_ipportip_create,
 	.create_policy	= {
 		[IPSET_ATTR_HASHSIZE]	= { .type = NLA_U32 },
diff --git a/net/netfilter/ipset/ip_set_hash_ipportnet.c b/net/netfilter/ipset/ip_set_hash_ipportnet.c
index 9a80d8b..7d1dede 100644
--- a/net/netfilter/ipset/ip_set_hash_ipportnet.c
+++ b/net/netfilter/ipset/ip_set_hash_ipportnet.c
@@ -24,15 +24,15 @@
 #include <linux/netfilter/ipset/ip_set_getport.h>
 #include <linux/netfilter/ipset/ip_set_hash.h>
 
-#define REVISION_MIN	0
-/*			1    SCTP and UDPLITE support added */
-/*			2    Range as input support for IPv4 added */
-/*			3    nomatch flag support added */
-#define REVISION_MAX	4 /* Counters support added */
+#define IPSET_TYPE_REV_MIN	0
+/*				1    SCTP and UDPLITE support added */
+/*				2    Range as input support for IPv4 added */
+/*				3    nomatch flag support added */
+#define IPSET_TYPE_REV_MAX	4 /* Counters support added */
 
 MODULE_LICENSE("GPL");
 MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>");
-IP_SET_MODULE_DESC("hash:ip,port,net", REVISION_MIN, REVISION_MAX);
+IP_SET_MODULE_DESC("hash:ip,port,net", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX);
 MODULE_ALIAS("ip_set_hash:ip,port,net");
 
 /* Type specific function prefix */
@@ -574,8 +574,8 @@ static struct ip_set_type hash_ipportnet_type __read_mostly = {
 			  IPSET_TYPE_NOMATCH,
 	.dimension	= IPSET_DIM_THREE,
 	.family		= NFPROTO_UNSPEC,
-	.revision_min	= REVISION_MIN,
-	.revision_max	= REVISION_MAX,
+	.revision_min	= IPSET_TYPE_REV_MIN,
+	.revision_max	= IPSET_TYPE_REV_MAX,
 	.create		= hash_ipportnet_create,
 	.create_policy	= {
 		[IPSET_ATTR_HASHSIZE]	= { .type = NLA_U32 },
diff --git a/net/netfilter/ipset/ip_set_hash_net.c b/net/netfilter/ipset/ip_set_hash_net.c
index 223e9f5..9cb9ef4 100644
--- a/net/netfilter/ipset/ip_set_hash_net.c
+++ b/net/netfilter/ipset/ip_set_hash_net.c
@@ -22,14 +22,14 @@
 #include <linux/netfilter/ipset/ip_set.h>
 #include <linux/netfilter/ipset/ip_set_hash.h>
 
-#define REVISION_MIN	0
-/*			1    Range as input support for IPv4 added */
-/*			2    nomatch flag support added */
-#define REVISION_MAX	3 /* Counters support added */
+#define IPSET_TYPE_REV_MIN	0
+/*				1    Range as input support for IPv4 added */
+/*				2    nomatch flag support added */
+#define IPSET_TYPE_REV_MAX	3 /* Counters support added */
 
 MODULE_LICENSE("GPL");
 MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>");
-IP_SET_MODULE_DESC("hash:net", REVISION_MIN, REVISION_MAX);
+IP_SET_MODULE_DESC("hash:net", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX);
 MODULE_ALIAS("ip_set_hash:net");
 
 /* Type specific function prefix */
@@ -406,8 +406,8 @@ static struct ip_set_type hash_net_type __read_mostly = {
 	.features	= IPSET_TYPE_IP | IPSET_TYPE_NOMATCH,
 	.dimension	= IPSET_DIM_ONE,
 	.family		= NFPROTO_UNSPEC,
-	.revision_min	= REVISION_MIN,
-	.revision_max	= REVISION_MAX,
+	.revision_min	= IPSET_TYPE_REV_MIN,
+	.revision_max	= IPSET_TYPE_REV_MAX,
 	.create		= hash_net_create,
 	.create_policy	= {
 		[IPSET_ATTR_HASHSIZE]	= { .type = NLA_U32 },
diff --git a/net/netfilter/ipset/ip_set_hash_netiface.c b/net/netfilter/ipset/ip_set_hash_netiface.c
index 7d798d5..2310fc2 100644
--- a/net/netfilter/ipset/ip_set_hash_netiface.c
+++ b/net/netfilter/ipset/ip_set_hash_netiface.c
@@ -23,14 +23,14 @@
 #include <linux/netfilter/ipset/ip_set.h>
 #include <linux/netfilter/ipset/ip_set_hash.h>
 
-#define REVISION_MIN	0
-/*			1    nomatch flag support added */
-/*			2    /0 support added */
-#define REVISION_MAX	3 /* Counters support added */
+#define IPSET_TYPE_REV_MIN	0
+/*				1    nomatch flag support added */
+/*				2    /0 support added */
+#define IPSET_TYPE_REV_MAX	3 /* Counters support added */
 
 MODULE_LICENSE("GPL");
 MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>");
-IP_SET_MODULE_DESC("hash:net,iface", REVISION_MIN, REVISION_MAX);
+IP_SET_MODULE_DESC("hash:net,iface", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX);
 MODULE_ALIAS("ip_set_hash:net,iface");
 
 /* Interface name rbtree */
@@ -645,8 +645,8 @@ static struct ip_set_type hash_netiface_type __read_mostly = {
 			  IPSET_TYPE_NOMATCH,
 	.dimension	= IPSET_DIM_TWO,
 	.family		= NFPROTO_UNSPEC,
-	.revision_min	= REVISION_MIN,
-	.revision_max	= REVISION_MAX,
+	.revision_min	= IPSET_TYPE_REV_MIN,
+	.revision_max	= IPSET_TYPE_REV_MAX,
 	.create		= hash_netiface_create,
 	.create_policy	= {
 		[IPSET_ATTR_HASHSIZE]	= { .type = NLA_U32 },
diff --git a/net/netfilter/ipset/ip_set_hash_netport.c b/net/netfilter/ipset/ip_set_hash_netport.c
index 09d6690..1601d48 100644
--- a/net/netfilter/ipset/ip_set_hash_netport.c
+++ b/net/netfilter/ipset/ip_set_hash_netport.c
@@ -23,15 +23,15 @@
 #include <linux/netfilter/ipset/ip_set_getport.h>
 #include <linux/netfilter/ipset/ip_set_hash.h>
 
-#define REVISION_MIN	0
-/*			1    SCTP and UDPLITE support added */
-/*			2    Range as input support for IPv4 added */
-/*			3    nomatch flag support added */
-#define REVISION_MAX	4 /* Counters support added */
+#define IPSET_TYPE_REV_MIN	0
+/*				1    SCTP and UDPLITE support added */
+/*				2    Range as input support for IPv4 added */
+/*				3    nomatch flag support added */
+#define IPSET_TYPE_REV_MAX	4 /* Counters support added */
 
 MODULE_LICENSE("GPL");
 MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>");
-IP_SET_MODULE_DESC("hash:net,port", REVISION_MIN, REVISION_MAX);
+IP_SET_MODULE_DESC("hash:net,port", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX);
 MODULE_ALIAS("ip_set_hash:net,port");
 
 /* Type specific function prefix */
@@ -518,8 +518,8 @@ static struct ip_set_type hash_netport_type __read_mostly = {
 	.features	= IPSET_TYPE_IP | IPSET_TYPE_PORT | IPSET_TYPE_NOMATCH,
 	.dimension	= IPSET_DIM_TWO,
 	.family		= NFPROTO_UNSPEC,
-	.revision_min	= REVISION_MIN,
-	.revision_max	= REVISION_MAX,
+	.revision_min	= IPSET_TYPE_REV_MIN,
+	.revision_max	= IPSET_TYPE_REV_MAX,
 	.create		= hash_netport_create,
 	.create_policy	= {
 		[IPSET_ATTR_HASHSIZE]	= { .type = NLA_U32 },
diff --git a/net/netfilter/ipset/ip_set_list_set.c b/net/netfilter/ipset/ip_set_list_set.c
index 68299ee..a9e301f 100644
--- a/net/netfilter/ipset/ip_set_list_set.c
+++ b/net/netfilter/ipset/ip_set_list_set.c
@@ -15,12 +15,12 @@
 #include <linux/netfilter/ipset/ip_set.h>
 #include <linux/netfilter/ipset/ip_set_list.h>
 
-#define REVISION_MIN	0
-#define REVISION_MAX	1 /* Counters support added */
+#define IPSET_TYPE_REV_MIN	0
+#define IPSET_TYPE_REV_MAX	1 /* Counters support added */
 
 MODULE_LICENSE("GPL");
 MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>");
-IP_SET_MODULE_DESC("list:set", REVISION_MIN, REVISION_MAX);
+IP_SET_MODULE_DESC("list:set", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX);
 MODULE_ALIAS("ip_set_list:set");
 
 /* Member elements  */
@@ -703,8 +703,8 @@ static struct ip_set_type list_set_type __read_mostly = {
 	.features	= IPSET_TYPE_NAME | IPSET_DUMP_LAST,
 	.dimension	= IPSET_DIM_ONE,
 	.family		= NFPROTO_UNSPEC,
-	.revision_min	= REVISION_MIN,
-	.revision_max	= REVISION_MAX,
+	.revision_min	= IPSET_TYPE_REV_MIN,
+	.revision_max	= IPSET_TYPE_REV_MAX,
 	.create		= list_set_create,
 	.create_policy	= {
 		[IPSET_ATTR_SIZE]	= { .type = NLA_U32 },
-- 
1.7.10.4


^ permalink raw reply related

* [PATCH 08/33] netfilter: ipset: Fix "may be used uninitialized" warnings
From: Pablo Neira Ayuso @ 2013-10-04  8:32 UTC (permalink / raw)
  To: netfilter-devel; +Cc: davem, netdev
In-Reply-To: <1380875598-5250-1-git-send-email-pablo@netfilter.org>

From: Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>

Reported-by: Pablo Neira Ayuso <pablo@netfilter.org>
Signed-off-by: Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
---
 net/netfilter/ipset/ip_set_bitmap_ip.c      |    4 ++--
 net/netfilter/ipset/ip_set_bitmap_ipmac.c   |    4 ++--
 net/netfilter/ipset/ip_set_hash_ip.c        |    2 +-
 net/netfilter/ipset/ip_set_hash_ipport.c    |    2 +-
 net/netfilter/ipset/ip_set_hash_ipportip.c  |    2 +-
 net/netfilter/ipset/ip_set_hash_ipportnet.c |    4 ++--
 net/netfilter/ipset/ip_set_hash_net.c       |    2 +-
 net/netfilter/ipset/ip_set_hash_netiface.c  |    2 +-
 net/netfilter/ipset/ip_set_hash_netport.c   |    2 +-
 9 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/net/netfilter/ipset/ip_set_bitmap_ip.c b/net/netfilter/ipset/ip_set_bitmap_ip.c
index c2f89b1..ce99d26 100644
--- a/net/netfilter/ipset/ip_set_bitmap_ip.c
+++ b/net/netfilter/ipset/ip_set_bitmap_ip.c
@@ -131,7 +131,7 @@ bitmap_ip_uadt(struct ip_set *set, struct nlattr *tb[],
 {
 	struct bitmap_ip *map = set->data;
 	ipset_adtfn adtfn = set->variant->adt[adt];
-	u32 ip, ip_to;
+	u32 ip = 0, ip_to = 0;
 	struct bitmap_ip_adt_elem e = { };
 	struct ip_set_ext ext = IP_SET_INIT_UEXT(map);
 	int ret = 0;
@@ -264,7 +264,7 @@ static int
 bitmap_ip_create(struct ip_set *set, struct nlattr *tb[], u32 flags)
 {
 	struct bitmap_ip *map;
-	u32 first_ip, last_ip, hosts, cadt_flags = 0;
+	u32 first_ip = 0, last_ip = 0, hosts, cadt_flags = 0;
 	u64 elements;
 	u8 netmask = 32;
 	int ret;
diff --git a/net/netfilter/ipset/ip_set_bitmap_ipmac.c b/net/netfilter/ipset/ip_set_bitmap_ipmac.c
index 1d6551c..6d5bad9 100644
--- a/net/netfilter/ipset/ip_set_bitmap_ipmac.c
+++ b/net/netfilter/ipset/ip_set_bitmap_ipmac.c
@@ -246,7 +246,7 @@ bitmap_ipmac_uadt(struct ip_set *set, struct nlattr *tb[],
 	ipset_adtfn adtfn = set->variant->adt[adt];
 	struct bitmap_ipmac_adt_elem e = {};
 	struct ip_set_ext ext = IP_SET_INIT_UEXT(map);
-	u32 ip;
+	u32 ip = 0;
 	int ret = 0;
 
 	if (unlikely(!tb[IPSET_ATTR_IP] ||
@@ -355,7 +355,7 @@ static int
 bitmap_ipmac_create(struct ip_set *set, struct nlattr *tb[],
 		    u32 flags)
 {
-	u32 first_ip, last_ip, cadt_flags = 0;
+	u32 first_ip = 0, last_ip = 0, cadt_flags = 0;
 	u64 elements;
 	struct bitmap_ipmac *map;
 	int ret;
diff --git a/net/netfilter/ipset/ip_set_hash_ip.c b/net/netfilter/ipset/ip_set_hash_ip.c
index de44fca..260c9a8 100644
--- a/net/netfilter/ipset/ip_set_hash_ip.c
+++ b/net/netfilter/ipset/ip_set_hash_ip.c
@@ -119,7 +119,7 @@ hash_ip4_uadt(struct ip_set *set, struct nlattr *tb[],
 	ipset_adtfn adtfn = set->variant->adt[adt];
 	struct hash_ip4_elem e = {};
 	struct ip_set_ext ext = IP_SET_INIT_UEXT(h);
-	u32 ip, ip_to, hosts;
+	u32 ip = 0, ip_to = 0, hosts;
 	int ret = 0;
 
 	if (unlikely(!tb[IPSET_ATTR_IP] ||
diff --git a/net/netfilter/ipset/ip_set_hash_ipport.c b/net/netfilter/ipset/ip_set_hash_ipport.c
index b514ff4..64caad3 100644
--- a/net/netfilter/ipset/ip_set_hash_ipport.c
+++ b/net/netfilter/ipset/ip_set_hash_ipport.c
@@ -137,7 +137,7 @@ hash_ipport4_uadt(struct ip_set *set, struct nlattr *tb[],
 	ipset_adtfn adtfn = set->variant->adt[adt];
 	struct hash_ipport4_elem e = { };
 	struct ip_set_ext ext = IP_SET_INIT_UEXT(h);
-	u32 ip, ip_to, p = 0, port, port_to;
+	u32 ip, ip_to = 0, p = 0, port, port_to;
 	bool with_ports = false;
 	int ret;
 
diff --git a/net/netfilter/ipset/ip_set_hash_ipportip.c b/net/netfilter/ipset/ip_set_hash_ipportip.c
index d05070d..2873bbc 100644
--- a/net/netfilter/ipset/ip_set_hash_ipportip.c
+++ b/net/netfilter/ipset/ip_set_hash_ipportip.c
@@ -142,7 +142,7 @@ hash_ipportip4_uadt(struct ip_set *set, struct nlattr *tb[],
 	ipset_adtfn adtfn = set->variant->adt[adt];
 	struct hash_ipportip4_elem e = { };
 	struct ip_set_ext ext = IP_SET_INIT_UEXT(h);
-	u32 ip, ip_to, p = 0, port, port_to;
+	u32 ip, ip_to = 0, p = 0, port, port_to;
 	bool with_ports = false;
 	int ret;
 
diff --git a/net/netfilter/ipset/ip_set_hash_ipportnet.c b/net/netfilter/ipset/ip_set_hash_ipportnet.c
index 7d1dede..f111558 100644
--- a/net/netfilter/ipset/ip_set_hash_ipportnet.c
+++ b/net/netfilter/ipset/ip_set_hash_ipportnet.c
@@ -196,8 +196,8 @@ hash_ipportnet4_uadt(struct ip_set *set, struct nlattr *tb[],
 	ipset_adtfn adtfn = set->variant->adt[adt];
 	struct hash_ipportnet4_elem e = { .cidr = HOST_MASK - 1 };
 	struct ip_set_ext ext = IP_SET_INIT_UEXT(h);
-	u32 ip, ip_to, p = 0, port, port_to;
-	u32 ip2_from, ip2_to, ip2_last, ip2;
+	u32 ip = 0, ip_to = 0, p = 0, port, port_to;
+	u32 ip2_from = 0, ip2_to = 0, ip2_last, ip2;
 	bool with_ports = false;
 	u8 cidr;
 	int ret;
diff --git a/net/netfilter/ipset/ip_set_hash_net.c b/net/netfilter/ipset/ip_set_hash_net.c
index 9cb9ef4..0a64dad 100644
--- a/net/netfilter/ipset/ip_set_hash_net.c
+++ b/net/netfilter/ipset/ip_set_hash_net.c
@@ -166,7 +166,7 @@ hash_net4_uadt(struct ip_set *set, struct nlattr *tb[],
 	ipset_adtfn adtfn = set->variant->adt[adt];
 	struct hash_net4_elem e = { .cidr = HOST_MASK };
 	struct ip_set_ext ext = IP_SET_INIT_UEXT(h);
-	u32 ip = 0, ip_to, last;
+	u32 ip = 0, ip_to = 0, last;
 	int ret;
 
 	if (unlikely(!tb[IPSET_ATTR_IP] ||
diff --git a/net/netfilter/ipset/ip_set_hash_netiface.c b/net/netfilter/ipset/ip_set_hash_netiface.c
index 2310fc2..846371b 100644
--- a/net/netfilter/ipset/ip_set_hash_netiface.c
+++ b/net/netfilter/ipset/ip_set_hash_netiface.c
@@ -320,7 +320,7 @@ hash_netiface4_uadt(struct ip_set *set, struct nlattr *tb[],
 	ipset_adtfn adtfn = set->variant->adt[adt];
 	struct hash_netiface4_elem e = { .cidr = HOST_MASK, .elem = 1 };
 	struct ip_set_ext ext = IP_SET_INIT_UEXT(h);
-	u32 ip = 0, ip_to, last;
+	u32 ip = 0, ip_to = 0, last;
 	char iface[IFNAMSIZ];
 	int ret;
 
diff --git a/net/netfilter/ipset/ip_set_hash_netport.c b/net/netfilter/ipset/ip_set_hash_netport.c
index 1601d48..d98a685 100644
--- a/net/netfilter/ipset/ip_set_hash_netport.c
+++ b/net/netfilter/ipset/ip_set_hash_netport.c
@@ -187,7 +187,7 @@ hash_netport4_uadt(struct ip_set *set, struct nlattr *tb[],
 	ipset_adtfn adtfn = set->variant->adt[adt];
 	struct hash_netport4_elem e = { .cidr = HOST_MASK - 1 };
 	struct ip_set_ext ext = IP_SET_INIT_UEXT(h);
-	u32 port, port_to, p = 0, ip = 0, ip_to, last;
+	u32 port, port_to, p = 0, ip = 0, ip_to = 0, last;
 	bool with_ports = false;
 	u8 cidr;
 	int ret;
-- 
1.7.10.4


^ permalink raw reply related

* [PATCH 09/33] netfilter: ipset: Use fix sized type for timeout in the extension part
From: Pablo Neira Ayuso @ 2013-10-04  8:32 UTC (permalink / raw)
  To: netfilter-devel; +Cc: davem, netdev
In-Reply-To: <1380875598-5250-1-git-send-email-pablo@netfilter.org>

From: Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>

Signed-off-by: Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
---
 include/linux/netfilter/ipset/ip_set.h |    2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/linux/netfilter/ipset/ip_set.h b/include/linux/netfilter/ipset/ip_set.h
index f900f33..69aa604 100644
--- a/include/linux/netfilter/ipset/ip_set.h
+++ b/include/linux/netfilter/ipset/ip_set.h
@@ -67,7 +67,7 @@ enum ip_set_offset {
 #define SET_WITH_COUNTER(s)	((s)->extensions & IPSET_EXT_COUNTER)
 
 struct ip_set_ext {
-	unsigned long timeout;
+	u32 timeout;
 	u64 packets;
 	u64 bytes;
 };
-- 
1.7.10.4


^ permalink raw reply related

* [PATCH 10/33] netfilter: ipset: Support package fragments for IPv4 protos without ports
From: Pablo Neira Ayuso @ 2013-10-04  8:32 UTC (permalink / raw)
  To: netfilter-devel; +Cc: davem, netdev
In-Reply-To: <1380875598-5250-1-git-send-email-pablo@netfilter.org>

From: "Anders K. Pedersen" <akp@surftown.com>

Enable ipset port set types to match IPv4 package fragments for
protocols that doesn't have ports (or the port information isn't
supported by ipset).

For example this allows a hash:ip,port ipset containing the entry
192.168.0.1,gre:0 to match all package fragments for PPTP VPN tunnels
to/from the host. Without this patch only the first package fragment
(with fragment offset 0) was matched, while subsequent fragments wasn't.

This is not possible for IPv6, where the protocol is in the fragmented
part of the package unlike IPv4, where the protocol is in the IP header.

IPPROTO_ICMPV6 is deliberately not included, because it isn't relevant
for IPv4.

Signed-off-by: Anders K. Pedersen <akp@surftown.com>
Signed-off-by: Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
---
 net/netfilter/ipset/ip_set_getport.c |   18 +++++++++++++++++-
 1 file changed, 17 insertions(+), 1 deletion(-)

diff --git a/net/netfilter/ipset/ip_set_getport.c b/net/netfilter/ipset/ip_set_getport.c
index dac156f..29fb01d 100644
--- a/net/netfilter/ipset/ip_set_getport.c
+++ b/net/netfilter/ipset/ip_set_getport.c
@@ -102,9 +102,25 @@ ip_set_get_ip4_port(const struct sk_buff *skb, bool src,
 	int protocol = iph->protocol;

 	/* See comments at tcp_match in ip_tables.c */
-	if (protocol <= 0 || (ntohs(iph->frag_off) & IP_OFFSET))
+	if (protocol <= 0)
 		return false;

+	if (ntohs(iph->frag_off) & IP_OFFSET)
+		switch (protocol) {
+		case IPPROTO_TCP:
+		case IPPROTO_SCTP:
+		case IPPROTO_UDP:
+		case IPPROTO_UDPLITE:
+		case IPPROTO_ICMP:
+			/* Port info not available for fragment offset > 0 */
+			return false;
+		default:
+			/* Other protocols doesn't have ports,
+			   so we can match fragments */
+			*proto = protocol;
+			return true;
+		}
+
 	return get_port(skb, protocol, protooff, src, port, proto);
 }
 EXPORT_SYMBOL_GPL(ip_set_get_ip4_port);
-- 
1.7.10.4

^ permalink raw reply related

* [PATCH 11/33] netfilter: ipset: order matches and targets separatedly in xt_set.c
From: Pablo Neira Ayuso @ 2013-10-04  8:32 UTC (permalink / raw)
  To: netfilter-devel; +Cc: davem, netdev
In-Reply-To: <1380875598-5250-1-git-send-email-pablo@netfilter.org>

From: Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>

Signed-off-by: Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
---
 net/netfilter/xt_set.c |  188 ++++++++++++++++++++++++------------------------
 1 file changed, 96 insertions(+), 92 deletions(-)

diff --git a/net/netfilter/xt_set.c b/net/netfilter/xt_set.c
index 31790e7..2095488 100644
--- a/net/netfilter/xt_set.c
+++ b/net/netfilter/xt_set.c
@@ -109,6 +109,101 @@ set_match_v0_destroy(const struct xt_mtdtor_param *par)
 	ip_set_nfnl_put(info->match_set.index);
 }
 
+/* Revision 1 match */
+
+static bool
+set_match_v1(const struct sk_buff *skb, struct xt_action_param *par)
+{
+	const struct xt_set_info_match_v1 *info = par->matchinfo;
+	ADT_OPT(opt, par->family, info->match_set.dim,
+		info->match_set.flags, 0, UINT_MAX);
+
+	if (opt.flags & IPSET_RETURN_NOMATCH)
+		opt.cmdflags |= IPSET_FLAG_RETURN_NOMATCH;
+
+	return match_set(info->match_set.index, skb, par, &opt,
+			 info->match_set.flags & IPSET_INV_MATCH);
+}
+
+static int
+set_match_v1_checkentry(const struct xt_mtchk_param *par)
+{
+	struct xt_set_info_match_v1 *info = par->matchinfo;
+	ip_set_id_t index;
+
+	index = ip_set_nfnl_get_byindex(info->match_set.index);
+
+	if (index == IPSET_INVALID_ID) {
+		pr_warning("Cannot find set indentified by id %u to match\n",
+			   info->match_set.index);
+		return -ENOENT;
+	}
+	if (info->match_set.dim > IPSET_DIM_MAX) {
+		pr_warning("Protocol error: set match dimension "
+			   "is over the limit!\n");
+		ip_set_nfnl_put(info->match_set.index);
+		return -ERANGE;
+	}
+
+	return 0;
+}
+
+static void
+set_match_v1_destroy(const struct xt_mtdtor_param *par)
+{
+	struct xt_set_info_match_v1 *info = par->matchinfo;
+
+	ip_set_nfnl_put(info->match_set.index);
+}
+
+/* Revision 3 match */
+
+static bool
+match_counter(u64 counter, const struct ip_set_counter_match *info)
+{
+	switch (info->op) {
+	case IPSET_COUNTER_NONE:
+		return true;
+	case IPSET_COUNTER_EQ:
+		return counter == info->value;
+	case IPSET_COUNTER_NE:
+		return counter != info->value;
+	case IPSET_COUNTER_LT:
+		return counter < info->value;
+	case IPSET_COUNTER_GT:
+		return counter > info->value;
+	}
+	return false;
+}
+
+static bool
+set_match_v3(const struct sk_buff *skb, struct xt_action_param *par)
+{
+	const struct xt_set_info_match_v3 *info = par->matchinfo;
+	ADT_OPT(opt, par->family, info->match_set.dim,
+		info->match_set.flags, info->flags, UINT_MAX);
+	int ret;
+
+	if (info->packets.op != IPSET_COUNTER_NONE ||
+	    info->bytes.op != IPSET_COUNTER_NONE)
+		opt.cmdflags |= IPSET_FLAG_MATCH_COUNTERS;
+
+	ret = match_set(info->match_set.index, skb, par, &opt,
+			info->match_set.flags & IPSET_INV_MATCH);
+
+	if (!(ret && opt.cmdflags & IPSET_FLAG_MATCH_COUNTERS))
+		return ret;
+
+	if (!match_counter(opt.ext.packets, &info->packets))
+		return 0;
+	return match_counter(opt.ext.bytes, &info->bytes);
+}
+
+#define set_match_v3_checkentry	set_match_v1_checkentry
+#define set_match_v3_destroy	set_match_v1_destroy
+
+/* Revision 0 interface: backward compatible with netfilter/iptables */
+
 static unsigned int
 set_target_v0(struct sk_buff *skb, const struct xt_action_param *par)
 {
@@ -180,52 +275,7 @@ set_target_v0_destroy(const struct xt_tgdtor_param *par)
 		ip_set_nfnl_put(info->del_set.index);
 }
 
-/* Revision 1 match and target */
-
-static bool
-set_match_v1(const struct sk_buff *skb, struct xt_action_param *par)
-{
-	const struct xt_set_info_match_v1 *info = par->matchinfo;
-	ADT_OPT(opt, par->family, info->match_set.dim,
-		info->match_set.flags, 0, UINT_MAX);
-
-	if (opt.flags & IPSET_RETURN_NOMATCH)
-		opt.cmdflags |= IPSET_FLAG_RETURN_NOMATCH;
-
-	return match_set(info->match_set.index, skb, par, &opt,
-			 info->match_set.flags & IPSET_INV_MATCH);
-}
-
-static int
-set_match_v1_checkentry(const struct xt_mtchk_param *par)
-{
-	struct xt_set_info_match_v1 *info = par->matchinfo;
-	ip_set_id_t index;
-
-	index = ip_set_nfnl_get_byindex(info->match_set.index);
-
-	if (index == IPSET_INVALID_ID) {
-		pr_warning("Cannot find set indentified by id %u to match\n",
-			   info->match_set.index);
-		return -ENOENT;
-	}
-	if (info->match_set.dim > IPSET_DIM_MAX) {
-		pr_warning("Protocol error: set match dimension "
-			   "is over the limit!\n");
-		ip_set_nfnl_put(info->match_set.index);
-		return -ERANGE;
-	}
-
-	return 0;
-}
-
-static void
-set_match_v1_destroy(const struct xt_mtdtor_param *par)
-{
-	struct xt_set_info_match_v1 *info = par->matchinfo;
-
-	ip_set_nfnl_put(info->match_set.index);
-}
+/* Revision 1 target */
 
 static unsigned int
 set_target_v1(struct sk_buff *skb, const struct xt_action_param *par)
@@ -320,52 +370,6 @@ set_target_v2(struct sk_buff *skb, const struct xt_action_param *par)
 #define set_target_v2_checkentry	set_target_v1_checkentry
 #define set_target_v2_destroy		set_target_v1_destroy
 
-/* Revision 3 match */
-
-static bool
-match_counter(u64 counter, const struct ip_set_counter_match *info)
-{
-	switch (info->op) {
-	case IPSET_COUNTER_NONE:
-		return true;
-	case IPSET_COUNTER_EQ:
-		return counter == info->value;
-	case IPSET_COUNTER_NE:
-		return counter != info->value;
-	case IPSET_COUNTER_LT:
-		return counter < info->value;
-	case IPSET_COUNTER_GT:
-		return counter > info->value;
-	}
-	return false;
-}
-
-static bool
-set_match_v3(const struct sk_buff *skb, struct xt_action_param *par)
-{
-	const struct xt_set_info_match_v3 *info = par->matchinfo;
-	ADT_OPT(opt, par->family, info->match_set.dim,
-		info->match_set.flags, info->flags, UINT_MAX);
-	int ret;
-
-	if (info->packets.op != IPSET_COUNTER_NONE ||
-	    info->bytes.op != IPSET_COUNTER_NONE)
-		opt.cmdflags |= IPSET_FLAG_MATCH_COUNTERS;
-
-	ret = match_set(info->match_set.index, skb, par, &opt,
-			info->match_set.flags & IPSET_INV_MATCH);
-
-	if (!(ret && opt.cmdflags & IPSET_FLAG_MATCH_COUNTERS))
-		return ret;
-
-	if (!match_counter(opt.ext.packets, &info->packets))
-		return 0;
-	return match_counter(opt.ext.bytes, &info->bytes);
-}
-
-#define set_match_v3_checkentry	set_match_v1_checkentry
-#define set_match_v3_destroy	set_match_v1_destroy
-
 static struct xt_match set_matches[] __read_mostly = {
 	{
 		.name		= "set",
-- 
1.7.10.4


^ permalink raw reply related

* [PATCH 14/33] netfilter: ipset: Rename extension offset ids to extension ids
From: Pablo Neira Ayuso @ 2013-10-04  8:32 UTC (permalink / raw)
  To: netfilter-devel; +Cc: davem, netdev
In-Reply-To: <1380875598-5250-1-git-send-email-pablo@netfilter.org>

From: Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>

Signed-off-by: Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
---
 include/linux/netfilter/ipset/ip_set.h    |   16 ++++++++--------
 net/netfilter/ipset/ip_set_bitmap_gen.h   |    4 ++--
 net/netfilter/ipset/ip_set_bitmap_ip.c    |   10 +++++-----
 net/netfilter/ipset/ip_set_bitmap_ipmac.c |   10 +++++-----
 net/netfilter/ipset/ip_set_bitmap_port.c  |   10 +++++-----
 net/netfilter/ipset/ip_set_hash_gen.h     |   22 +++++++++++-----------
 net/netfilter/ipset/ip_set_list_set.c     |   14 +++++++-------
 7 files changed, 43 insertions(+), 43 deletions(-)

diff --git a/include/linux/netfilter/ipset/ip_set.h b/include/linux/netfilter/ipset/ip_set.h
index 56012a3..b4db791 100644
--- a/include/linux/netfilter/ipset/ip_set.h
+++ b/include/linux/netfilter/ipset/ip_set.h
@@ -56,20 +56,20 @@ enum ip_set_extension {
 	IPSET_EXT_COUNTER = (1 << IPSET_EXT_BIT_COUNTER),
 };
 
-/* Extension offsets */
-enum ip_set_offset {
-	IPSET_OFFSET_TIMEOUT = 0,
-	IPSET_OFFSET_COUNTER,
-	IPSET_OFFSET_MAX,
-};
-
 #define SET_WITH_TIMEOUT(s)	((s)->extensions & IPSET_EXT_TIMEOUT)
 #define SET_WITH_COUNTER(s)	((s)->extensions & IPSET_EXT_COUNTER)
 
+/* Extension id, in size order */
+enum ip_set_ext_id {
+	IPSET_EXT_ID_COUNTER = 0,
+	IPSET_EXT_ID_TIMEOUT,
+	IPSET_EXT_ID_MAX,
+};
+
 struct ip_set_ext {
-	u32 timeout;
 	u64 packets;
 	u64 bytes;
+	u32 timeout;
 };
 
 struct ip_set;
diff --git a/net/netfilter/ipset/ip_set_bitmap_gen.h b/net/netfilter/ipset/ip_set_bitmap_gen.h
index d39905e..889a929 100644
--- a/net/netfilter/ipset/ip_set_bitmap_gen.h
+++ b/net/netfilter/ipset/ip_set_bitmap_gen.h
@@ -33,9 +33,9 @@
 #define mtype			MTYPE
 
 #define ext_timeout(e, m)	\
-	(unsigned long *)((e) + (m)->offset[IPSET_OFFSET_TIMEOUT])
+	(unsigned long *)((e) + (m)->offset[IPSET_EXT_ID_TIMEOUT])
 #define ext_counter(e, m)	\
-	(struct ip_set_counter *)((e) + (m)->offset[IPSET_OFFSET_COUNTER])
+	(struct ip_set_counter *)((e) + (m)->offset[IPSET_EXT_ID_COUNTER])
 #define get_ext(map, id)	((map)->extensions + (map)->dsize * (id))
 
 static void
diff --git a/net/netfilter/ipset/ip_set_bitmap_ip.c b/net/netfilter/ipset/ip_set_bitmap_ip.c
index ce99d26..2ee210e 100644
--- a/net/netfilter/ipset/ip_set_bitmap_ip.c
+++ b/net/netfilter/ipset/ip_set_bitmap_ip.c
@@ -45,7 +45,7 @@ struct bitmap_ip {
 	u32 hosts;		/* number of hosts in a subnet */
 	size_t memsize;		/* members size */
 	size_t dsize;		/* extensions struct size */
-	size_t offset[IPSET_OFFSET_MAX]; /* Offsets to extensions */
+	size_t offset[IPSET_EXT_ID_MAX]; /* Offsets to extensions */
 	u8 netmask;		/* subnet netmask */
 	u32 timeout;		/* timeout parameter */
 	struct timer_list gc;	/* garbage collection */
@@ -342,9 +342,9 @@ bitmap_ip_create(struct ip_set *set, struct nlattr *tb[], u32 flags)
 		set->extensions |= IPSET_EXT_COUNTER;
 		if (tb[IPSET_ATTR_TIMEOUT]) {
 			map->dsize = sizeof(struct bitmap_ipct_elem);
-			map->offset[IPSET_OFFSET_TIMEOUT] =
+			map->offset[IPSET_EXT_ID_TIMEOUT] =
 				offsetof(struct bitmap_ipct_elem, timeout);
-			map->offset[IPSET_OFFSET_COUNTER] =
+			map->offset[IPSET_EXT_ID_COUNTER] =
 				offsetof(struct bitmap_ipct_elem, counter);
 
 			if (!init_map_ip(set, map, first_ip, last_ip,
@@ -360,7 +360,7 @@ bitmap_ip_create(struct ip_set *set, struct nlattr *tb[], u32 flags)
 			bitmap_ip_gc_init(set, bitmap_ip_gc);
 		} else {
 			map->dsize = sizeof(struct bitmap_ipc_elem);
-			map->offset[IPSET_OFFSET_COUNTER] =
+			map->offset[IPSET_EXT_ID_COUNTER] =
 				offsetof(struct bitmap_ipc_elem, counter);
 
 			if (!init_map_ip(set, map, first_ip, last_ip,
@@ -371,7 +371,7 @@ bitmap_ip_create(struct ip_set *set, struct nlattr *tb[], u32 flags)
 		}
 	} else if (tb[IPSET_ATTR_TIMEOUT]) {
 		map->dsize = sizeof(struct bitmap_ipt_elem);
-		map->offset[IPSET_OFFSET_TIMEOUT] =
+		map->offset[IPSET_EXT_ID_TIMEOUT] =
 			offsetof(struct bitmap_ipt_elem, timeout);
 
 		if (!init_map_ip(set, map, first_ip, last_ip,
diff --git a/net/netfilter/ipset/ip_set_bitmap_ipmac.c b/net/netfilter/ipset/ip_set_bitmap_ipmac.c
index 6d5bad9..e711875 100644
--- a/net/netfilter/ipset/ip_set_bitmap_ipmac.c
+++ b/net/netfilter/ipset/ip_set_bitmap_ipmac.c
@@ -52,7 +52,7 @@ struct bitmap_ipmac {
 	struct timer_list gc;	/* garbage collector */
 	size_t memsize;		/* members size */
 	size_t dsize;		/* size of element */
-	size_t offset[IPSET_OFFSET_MAX]; /* Offsets to extensions */
+	size_t offset[IPSET_EXT_ID_MAX]; /* Offsets to extensions */
 };
 
 /* ADT structure for generic function args */
@@ -405,9 +405,9 @@ bitmap_ipmac_create(struct ip_set *set, struct nlattr *tb[],
 		set->extensions |= IPSET_EXT_COUNTER;
 		if (tb[IPSET_ATTR_TIMEOUT]) {
 			map->dsize = sizeof(struct bitmap_ipmacct_elem);
-			map->offset[IPSET_OFFSET_TIMEOUT] =
+			map->offset[IPSET_EXT_ID_TIMEOUT] =
 				offsetof(struct bitmap_ipmacct_elem, timeout);
-			map->offset[IPSET_OFFSET_COUNTER] =
+			map->offset[IPSET_EXT_ID_COUNTER] =
 				offsetof(struct bitmap_ipmacct_elem, counter);
 
 			if (!init_map_ipmac(set, map, first_ip, last_ip,
@@ -421,7 +421,7 @@ bitmap_ipmac_create(struct ip_set *set, struct nlattr *tb[],
 			bitmap_ipmac_gc_init(set, bitmap_ipmac_gc);
 		} else {
 			map->dsize = sizeof(struct bitmap_ipmacc_elem);
-			map->offset[IPSET_OFFSET_COUNTER] =
+			map->offset[IPSET_EXT_ID_COUNTER] =
 				offsetof(struct bitmap_ipmacc_elem, counter);
 
 			if (!init_map_ipmac(set, map, first_ip, last_ip,
@@ -432,7 +432,7 @@ bitmap_ipmac_create(struct ip_set *set, struct nlattr *tb[],
 		}
 	} else if (tb[IPSET_ATTR_TIMEOUT]) {
 		map->dsize = sizeof(struct bitmap_ipmact_elem);
-		map->offset[IPSET_OFFSET_TIMEOUT] =
+		map->offset[IPSET_EXT_ID_TIMEOUT] =
 			offsetof(struct bitmap_ipmact_elem, timeout);
 
 		if (!init_map_ipmac(set, map, first_ip, last_ip, elements)) {
diff --git a/net/netfilter/ipset/ip_set_bitmap_port.c b/net/netfilter/ipset/ip_set_bitmap_port.c
index b220489..bebc137 100644
--- a/net/netfilter/ipset/ip_set_bitmap_port.c
+++ b/net/netfilter/ipset/ip_set_bitmap_port.c
@@ -39,7 +39,7 @@ struct bitmap_port {
 	u32 elements;		/* number of max elements in the set */
 	size_t memsize;		/* members size */
 	size_t dsize;		/* extensions struct size */
-	size_t offset[IPSET_OFFSET_MAX]; /* Offsets to extensions */
+	size_t offset[IPSET_EXT_ID_MAX]; /* Offsets to extensions */
 	u32 timeout;		/* timeout parameter */
 	struct timer_list gc;	/* garbage collection */
 };
@@ -282,9 +282,9 @@ bitmap_port_create(struct ip_set *set, struct nlattr *tb[], u32 flags)
 		set->extensions |= IPSET_EXT_COUNTER;
 		if (tb[IPSET_ATTR_TIMEOUT]) {
 			map->dsize = sizeof(struct bitmap_portct_elem);
-			map->offset[IPSET_OFFSET_TIMEOUT] =
+			map->offset[IPSET_EXT_ID_TIMEOUT] =
 				offsetof(struct bitmap_portct_elem, timeout);
-			map->offset[IPSET_OFFSET_COUNTER] =
+			map->offset[IPSET_EXT_ID_COUNTER] =
 				offsetof(struct bitmap_portct_elem, counter);
 			if (!init_map_port(set, map, first_port, last_port)) {
 				kfree(map);
@@ -297,7 +297,7 @@ bitmap_port_create(struct ip_set *set, struct nlattr *tb[], u32 flags)
 			bitmap_port_gc_init(set, bitmap_port_gc);
 		} else {
 			map->dsize = sizeof(struct bitmap_portc_elem);
-			map->offset[IPSET_OFFSET_COUNTER] =
+			map->offset[IPSET_EXT_ID_COUNTER] =
 				offsetof(struct bitmap_portc_elem, counter);
 			if (!init_map_port(set, map, first_port, last_port)) {
 				kfree(map);
@@ -306,7 +306,7 @@ bitmap_port_create(struct ip_set *set, struct nlattr *tb[], u32 flags)
 		}
 	} else if (tb[IPSET_ATTR_TIMEOUT]) {
 		map->dsize = sizeof(struct bitmap_portt_elem);
-		map->offset[IPSET_OFFSET_TIMEOUT] =
+		map->offset[IPSET_EXT_ID_TIMEOUT] =
 			offsetof(struct bitmap_portt_elem, timeout);
 		if (!init_map_port(set, map, first_port, last_port)) {
 			kfree(map);
diff --git a/net/netfilter/ipset/ip_set_hash_gen.h b/net/netfilter/ipset/ip_set_hash_gen.h
index a833240..e4db925 100644
--- a/net/netfilter/ipset/ip_set_hash_gen.h
+++ b/net/netfilter/ipset/ip_set_hash_gen.h
@@ -179,9 +179,9 @@ hbucket_elem_add(struct hbucket *n, u8 ahash_max, size_t dsize)
 #endif /* IP_SET_HASH_WITH_NETS */
 
 #define ext_timeout(e, h)	\
-(unsigned long *)(((void *)(e)) + (h)->offset[IPSET_OFFSET_TIMEOUT])
+(unsigned long *)(((void *)(e)) + (h)->offset[IPSET_EXT_ID_TIMEOUT])
 #define ext_counter(e, h)	\
-(struct ip_set_counter *)(((void *)(e)) + (h)->offset[IPSET_OFFSET_COUNTER])
+(struct ip_set_counter *)(((void *)(e)) + (h)->offset[IPSET_EXT_ID_COUNTER])
 
 #endif /* _IP_SET_HASH_GEN_H */
 
@@ -278,7 +278,7 @@ struct htype {
 	u32 initval;		/* random jhash init value */
 	u32 timeout;		/* timeout value, if enabled */
 	size_t dsize;		/* data struct size */
-	size_t offset[IPSET_OFFSET_MAX]; /* Offsets to extensions */
+	size_t offset[IPSET_EXT_ID_MAX]; /* Offsets to extensions */
 	struct timer_list gc;	/* garbage collection when timeout enabled */
 	struct mtype_elem next; /* temporary storage for uadd */
 #ifdef IP_SET_HASH_WITH_MULTI
@@ -1059,11 +1059,11 @@ IPSET_TOKEN(HTYPE, _create)(struct ip_set *set, struct nlattr *tb[], u32 flags)
 			if (set->family == NFPROTO_IPV4) {
 				h->dsize = sizeof(struct
 					IPSET_TOKEN(HTYPE, 4ct_elem));
-				h->offset[IPSET_OFFSET_TIMEOUT] =
+				h->offset[IPSET_EXT_ID_TIMEOUT] =
 					offsetof(struct
 						IPSET_TOKEN(HTYPE, 4ct_elem),
 						timeout);
-				h->offset[IPSET_OFFSET_COUNTER] =
+				h->offset[IPSET_EXT_ID_COUNTER] =
 					offsetof(struct
 						IPSET_TOKEN(HTYPE, 4ct_elem),
 						counter);
@@ -1072,11 +1072,11 @@ IPSET_TOKEN(HTYPE, _create)(struct ip_set *set, struct nlattr *tb[], u32 flags)
 			} else {
 				h->dsize = sizeof(struct
 					IPSET_TOKEN(HTYPE, 6ct_elem));
-				h->offset[IPSET_OFFSET_TIMEOUT] =
+				h->offset[IPSET_EXT_ID_TIMEOUT] =
 					offsetof(struct
 						IPSET_TOKEN(HTYPE, 6ct_elem),
 						timeout);
-				h->offset[IPSET_OFFSET_COUNTER] =
+				h->offset[IPSET_EXT_ID_COUNTER] =
 					offsetof(struct
 						IPSET_TOKEN(HTYPE, 6ct_elem),
 						counter);
@@ -1088,7 +1088,7 @@ IPSET_TOKEN(HTYPE, _create)(struct ip_set *set, struct nlattr *tb[], u32 flags)
 				h->dsize =
 					sizeof(struct
 						IPSET_TOKEN(HTYPE, 4c_elem));
-				h->offset[IPSET_OFFSET_COUNTER] =
+				h->offset[IPSET_EXT_ID_COUNTER] =
 					offsetof(struct
 						IPSET_TOKEN(HTYPE, 4c_elem),
 						counter);
@@ -1096,7 +1096,7 @@ IPSET_TOKEN(HTYPE, _create)(struct ip_set *set, struct nlattr *tb[], u32 flags)
 				h->dsize =
 					sizeof(struct
 						IPSET_TOKEN(HTYPE, 6c_elem));
-				h->offset[IPSET_OFFSET_COUNTER] =
+				h->offset[IPSET_EXT_ID_COUNTER] =
 					offsetof(struct
 						IPSET_TOKEN(HTYPE, 6c_elem),
 						counter);
@@ -1107,14 +1107,14 @@ IPSET_TOKEN(HTYPE, _create)(struct ip_set *set, struct nlattr *tb[], u32 flags)
 		set->extensions |= IPSET_EXT_TIMEOUT;
 		if (set->family == NFPROTO_IPV4) {
 			h->dsize = sizeof(struct IPSET_TOKEN(HTYPE, 4t_elem));
-			h->offset[IPSET_OFFSET_TIMEOUT] =
+			h->offset[IPSET_EXT_ID_TIMEOUT] =
 				offsetof(struct IPSET_TOKEN(HTYPE, 4t_elem),
 					 timeout);
 			IPSET_TOKEN(HTYPE, 4_gc_init)(set,
 				IPSET_TOKEN(HTYPE, 4_gc));
 		} else {
 			h->dsize = sizeof(struct IPSET_TOKEN(HTYPE, 6t_elem));
-			h->offset[IPSET_OFFSET_TIMEOUT] =
+			h->offset[IPSET_EXT_ID_TIMEOUT] =
 				offsetof(struct IPSET_TOKEN(HTYPE, 6t_elem),
 					 timeout);
 			IPSET_TOKEN(HTYPE, 6_gc_init)(set,
diff --git a/net/netfilter/ipset/ip_set_list_set.c b/net/netfilter/ipset/ip_set_list_set.c
index a9e301f..0ed19b5 100644
--- a/net/netfilter/ipset/ip_set_list_set.c
+++ b/net/netfilter/ipset/ip_set_list_set.c
@@ -59,7 +59,7 @@ struct set_adt_elem {
 /* Type structure */
 struct list_set {
 	size_t dsize;		/* element size */
-	size_t offset[IPSET_OFFSET_MAX]; /* Offsets to extensions */
+	size_t offset[IPSET_EXT_ID_MAX]; /* Offsets to extensions */
 	u32 size;		/* size of set list array */
 	u32 timeout;		/* timeout value */
 	struct timer_list gc;	/* garbage collection */
@@ -73,9 +73,9 @@ list_set_elem(const struct list_set *map, u32 id)
 }
 
 #define ext_timeout(e, m)	\
-(unsigned long *)((void *)(e) + (m)->offset[IPSET_OFFSET_TIMEOUT])
+(unsigned long *)((void *)(e) + (m)->offset[IPSET_EXT_ID_TIMEOUT])
 #define ext_counter(e, m)	\
-(struct ip_set_counter *)((void *)(e) + (m)->offset[IPSET_OFFSET_COUNTER])
+(struct ip_set_counter *)((void *)(e) + (m)->offset[IPSET_EXT_ID_COUNTER])
 
 static int
 list_set_ktest(struct ip_set *set, const struct sk_buff *skb,
@@ -667,9 +667,9 @@ list_set_create(struct ip_set *set, struct nlattr *tb[], u32 flags)
 			if (!map)
 				return -ENOMEM;
 			set->extensions |= IPSET_EXT_TIMEOUT;
-			map->offset[IPSET_OFFSET_TIMEOUT] =
+			map->offset[IPSET_EXT_ID_TIMEOUT] =
 				offsetof(struct setct_elem, timeout);
-			map->offset[IPSET_OFFSET_COUNTER] =
+			map->offset[IPSET_EXT_ID_COUNTER] =
 				offsetof(struct setct_elem, counter);
 			list_set_gc_init(set, list_set_gc);
 		} else {
@@ -677,7 +677,7 @@ list_set_create(struct ip_set *set, struct nlattr *tb[], u32 flags)
 					    sizeof(struct setc_elem), 0);
 			if (!map)
 				return -ENOMEM;
-			map->offset[IPSET_OFFSET_COUNTER] =
+			map->offset[IPSET_EXT_ID_COUNTER] =
 				offsetof(struct setc_elem, counter);
 		}
 	} else if (tb[IPSET_ATTR_TIMEOUT]) {
@@ -686,7 +686,7 @@ list_set_create(struct ip_set *set, struct nlattr *tb[], u32 flags)
 		if (!map)
 			return -ENOMEM;
 		set->extensions |= IPSET_EXT_TIMEOUT;
-		map->offset[IPSET_OFFSET_TIMEOUT] =
+		map->offset[IPSET_EXT_ID_TIMEOUT] =
 			offsetof(struct sett_elem, timeout);
 		list_set_gc_init(set, list_set_gc);
 	} else {
-- 
1.7.10.4


^ permalink raw reply related

* [PATCH 17/33] netfilter: ipset: Support extensions which need a per data destroy function
From: Pablo Neira Ayuso @ 2013-10-04  8:33 UTC (permalink / raw)
  To: netfilter-devel; +Cc: davem, netdev
In-Reply-To: <1380875598-5250-1-git-send-email-pablo@netfilter.org>

From: Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>

Signed-off-by: Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
---
 include/linux/netfilter/ipset/ip_set.h  |   22 +++++++---
 net/netfilter/ipset/ip_set_bitmap_gen.h |   38 ++++++++++++++---
 net/netfilter/ipset/ip_set_hash_gen.h   |   71 ++++++++++++++++++++-----------
 net/netfilter/ipset/ip_set_list_set.c   |   19 ++++++---
 4 files changed, 107 insertions(+), 43 deletions(-)

diff --git a/include/linux/netfilter/ipset/ip_set.h b/include/linux/netfilter/ipset/ip_set.h
index 66d6bd4..6372ee2 100644
--- a/include/linux/netfilter/ipset/ip_set.h
+++ b/include/linux/netfilter/ipset/ip_set.h
@@ -49,11 +49,13 @@ enum ip_set_feature {
 
 /* Set extensions */
 enum ip_set_extension {
-	IPSET_EXT_NONE = 0,
-	IPSET_EXT_BIT_TIMEOUT = 1,
+	IPSET_EXT_BIT_TIMEOUT = 0,
 	IPSET_EXT_TIMEOUT = (1 << IPSET_EXT_BIT_TIMEOUT),
-	IPSET_EXT_BIT_COUNTER = 2,
+	IPSET_EXT_BIT_COUNTER = 1,
 	IPSET_EXT_COUNTER = (1 << IPSET_EXT_BIT_COUNTER),
+	/* Mark set with an extension which needs to call destroy */
+	IPSET_EXT_BIT_DESTROY = 7,
+	IPSET_EXT_DESTROY = (1 << IPSET_EXT_BIT_DESTROY),
 };
 
 #define SET_WITH_TIMEOUT(s)	((s)->extensions & IPSET_EXT_TIMEOUT)
@@ -68,6 +70,8 @@ enum ip_set_ext_id {
 
 /* Extension type */
 struct ip_set_ext_type {
+	/* Destroy extension private data (can be NULL) */
+	void (*destroy)(void *ext);
 	enum ip_set_extension type;
 	enum ipset_cadt_flags flag;
 	/* Size and minimal alignment */
@@ -88,13 +92,21 @@ struct ip_set_counter {
 	atomic64_t packets;
 };
 
+struct ip_set;
+
+static inline void
+ip_set_ext_destroy(struct ip_set *set, void *data)
+{
+	/* Check that the extension is enabled for the set and
+	 * call it's destroy function for its extension part in data.
+	 */
+}
+
 #define ext_timeout(e, s)	\
 (unsigned long *)(((void *)(e)) + (s)->offset[IPSET_EXT_ID_TIMEOUT])
 #define ext_counter(e, s)	\
 (struct ip_set_counter *)(((void *)(e)) + (s)->offset[IPSET_EXT_ID_COUNTER])
 
-struct ip_set;
-
 typedef int (*ipset_adtfn)(struct ip_set *set, void *value,
 			   const struct ip_set_ext *ext,
 			   struct ip_set_ext *mext, u32 cmdflags);
diff --git a/net/netfilter/ipset/ip_set_bitmap_gen.h b/net/netfilter/ipset/ip_set_bitmap_gen.h
index f32ddbc..4515fe8 100644
--- a/net/netfilter/ipset/ip_set_bitmap_gen.h
+++ b/net/netfilter/ipset/ip_set_bitmap_gen.h
@@ -12,6 +12,7 @@
 #define mtype_gc_test		IPSET_TOKEN(MTYPE, _gc_test)
 #define mtype_is_filled		IPSET_TOKEN(MTYPE, _is_filled)
 #define mtype_do_add		IPSET_TOKEN(MTYPE, _do_add)
+#define mtype_ext_cleanup	IPSET_TOKEN(MTYPE, _ext_cleanup)
 #define mtype_do_del		IPSET_TOKEN(MTYPE, _do_del)
 #define mtype_do_list		IPSET_TOKEN(MTYPE, _do_list)
 #define mtype_do_head		IPSET_TOKEN(MTYPE, _do_head)
@@ -47,6 +48,17 @@ mtype_gc_init(struct ip_set *set, void (*gc)(unsigned long ul_set))
 }
 
 static void
+mtype_ext_cleanup(struct ip_set *set)
+{
+	struct mtype *map = set->data;
+	u32 id;
+
+	for (id = 0; id < map->elements; id++)
+		if (test_bit(id, map->members))
+			ip_set_ext_destroy(set, get_ext(set, map, id));
+}
+
+static void
 mtype_destroy(struct ip_set *set)
 {
 	struct mtype *map = set->data;
@@ -55,8 +67,11 @@ mtype_destroy(struct ip_set *set)
 		del_timer_sync(&map->gc);
 
 	ip_set_free(map->members);
-	if (set->dsize)
+	if (set->dsize) {
+		if (set->extensions & IPSET_EXT_DESTROY)
+			mtype_ext_cleanup(set);
 		ip_set_free(map->extensions);
+	}
 	kfree(map);
 
 	set->data = NULL;
@@ -67,6 +82,8 @@ mtype_flush(struct ip_set *set)
 {
 	struct mtype *map = set->data;
 
+	if (set->extensions & IPSET_EXT_DESTROY)
+		mtype_ext_cleanup(set);
 	memset(map->members, 0, map->memsize);
 }
 
@@ -132,6 +149,8 @@ mtype_add(struct ip_set *set, void *value, const struct ip_set_ext *ext,
 			ret = 0;
 		else if (!(flags & IPSET_FLAG_EXIST))
 			return -IPSET_ERR_EXIST;
+		/* Element is re-added, cleanup extensions */
+		ip_set_ext_destroy(set, x);
 	}
 
 	if (SET_WITH_TIMEOUT(set))
@@ -152,11 +171,14 @@ mtype_del(struct ip_set *set, void *value, const struct ip_set_ext *ext,
 {
 	struct mtype *map = set->data;
 	const struct mtype_adt_elem *e = value;
-	const void *x = get_ext(set, map, e->id);
+	void *x = get_ext(set, map, e->id);
 
-	if (mtype_do_del(e, map) ||
-	    (SET_WITH_TIMEOUT(set) &&
-	     ip_set_timeout_expired(ext_timeout(x, set))))
+	if (mtype_do_del(e, map))
+		return -IPSET_ERR_EXIST;
+
+	ip_set_ext_destroy(set, x);
+	if (SET_WITH_TIMEOUT(set) &&
+	    ip_set_timeout_expired(ext_timeout(x, set)))
 		return -IPSET_ERR_EXIST;
 
 	return 0;
@@ -235,7 +257,7 @@ mtype_gc(unsigned long ul_set)
 {
 	struct ip_set *set = (struct ip_set *) ul_set;
 	struct mtype *map = set->data;
-	const void *x;
+	void *x;
 	u32 id;
 
 	/* We run parallel with other readers (test element)
@@ -244,8 +266,10 @@ mtype_gc(unsigned long ul_set)
 	for (id = 0; id < map->elements; id++)
 		if (mtype_gc_test(id, map, set->dsize)) {
 			x = get_ext(set, map, id);
-			if (ip_set_timeout_expired(ext_timeout(x, set)))
+			if (ip_set_timeout_expired(ext_timeout(x, set))) {
 				clear_bit(id, map->members);
+				ip_set_ext_destroy(set, x);
+			}
 		}
 	read_unlock_bh(&set->lock);
 
diff --git a/net/netfilter/ipset/ip_set_hash_gen.h b/net/netfilter/ipset/ip_set_hash_gen.h
index 3999f17..3c26e5b 100644
--- a/net/netfilter/ipset/ip_set_hash_gen.h
+++ b/net/netfilter/ipset/ip_set_hash_gen.h
@@ -117,23 +117,6 @@ htable_bits(u32 hashsize)
 	return bits;
 }
 
-/* Destroy the hashtable part of the set */
-static void
-ahash_destroy(struct htable *t)
-{
-	struct hbucket *n;
-	u32 i;
-
-	for (i = 0; i < jhash_size(t->htable_bits); i++) {
-		n = hbucket(t, i);
-		if (n->size)
-			/* FIXME: use slab cache */
-			kfree(n->value);
-	}
-
-	ip_set_free(t);
-}
-
 static int
 hbucket_elem_add(struct hbucket *n, u8 ahash_max, size_t dsize)
 {
@@ -192,6 +175,8 @@ hbucket_elem_add(struct hbucket *n, u8 ahash_max, size_t dsize)
 #undef mtype_data_next
 #undef mtype_elem
 
+#undef mtype_ahash_destroy
+#undef mtype_ext_cleanup
 #undef mtype_add_cidr
 #undef mtype_del_cidr
 #undef mtype_ahash_memsize
@@ -230,6 +215,8 @@ hbucket_elem_add(struct hbucket *n, u8 ahash_max, size_t dsize)
 #define mtype_data_list		IPSET_TOKEN(MTYPE, _data_list)
 #define mtype_data_next		IPSET_TOKEN(MTYPE, _data_next)
 #define mtype_elem		IPSET_TOKEN(MTYPE, _elem)
+#define mtype_ahash_destroy	IPSET_TOKEN(MTYPE, _ahash_destroy)
+#define mtype_ext_cleanup	IPSET_TOKEN(MTYPE, _ext_cleanup)
 #define mtype_add_cidr		IPSET_TOKEN(MTYPE, _add_cidr)
 #define mtype_del_cidr		IPSET_TOKEN(MTYPE, _del_cidr)
 #define mtype_ahash_memsize	IPSET_TOKEN(MTYPE, _ahash_memsize)
@@ -359,6 +346,19 @@ mtype_ahash_memsize(const struct htype *h, const struct htable *t,
 	return memsize;
 }
 
+/* Get the ith element from the array block n */
+#define ahash_data(n, i, dsize)	\
+	((struct mtype_elem *)((n)->value + ((i) * (dsize))))
+
+static void
+mtype_ext_cleanup(struct ip_set *set, struct hbucket *n)
+{
+	int i;
+
+	for (i = 0; i < n->pos; i++)
+		ip_set_ext_destroy(set, ahash_data(n, i, set->dsize));
+}
+
 /* Flush a hash type of set: destroy all elements */
 static void
 mtype_flush(struct ip_set *set)
@@ -372,6 +372,8 @@ mtype_flush(struct ip_set *set)
 	for (i = 0; i < jhash_size(t->htable_bits); i++) {
 		n = hbucket(t, i);
 		if (n->size) {
+			if (set->extensions & IPSET_EXT_DESTROY)
+				mtype_ext_cleanup(set, n);
 			n->size = n->pos = 0;
 			/* FIXME: use slab cache */
 			kfree(n->value);
@@ -383,6 +385,26 @@ mtype_flush(struct ip_set *set)
 	h->elements = 0;
 }
 
+/* Destroy the hashtable part of the set */
+static void
+mtype_ahash_destroy(struct ip_set *set, struct htable *t)
+{
+	struct hbucket *n;
+	u32 i;
+
+	for (i = 0; i < jhash_size(t->htable_bits); i++) {
+		n = hbucket(t, i);
+		if (n->size) {
+			if (set->extensions & IPSET_EXT_DESTROY)
+				mtype_ext_cleanup(set, n);
+			/* FIXME: use slab cache */
+			kfree(n->value);
+		}
+	}
+
+	ip_set_free(t);
+}
+
 /* Destroy a hash type of set */
 static void
 mtype_destroy(struct ip_set *set)
@@ -392,7 +414,7 @@ mtype_destroy(struct ip_set *set)
 	if (set->extensions & IPSET_EXT_TIMEOUT)
 		del_timer_sync(&h->gc);
 
-	ahash_destroy(rcu_dereference_bh_nfnl(h->table));
+	mtype_ahash_destroy(set, rcu_dereference_bh_nfnl(h->table));
 #ifdef IP_SET_HASH_WITH_RBTREE
 	rbtree_destroy(&h->rbtree);
 #endif
@@ -430,10 +452,6 @@ mtype_same_set(const struct ip_set *a, const struct ip_set *b)
 	       a->extensions == b->extensions;
 }
 
-/* Get the ith element from the array block n */
-#define ahash_data(n, i, dsize)	\
-	((struct mtype_elem *)((n)->value + ((i) * (dsize))))
-
 /* Delete expired elements from the hashtable */
 static void
 mtype_expire(struct ip_set *set, struct htype *h, u8 nets_length, size_t dsize)
@@ -456,6 +474,7 @@ mtype_expire(struct ip_set *set, struct htype *h, u8 nets_length, size_t dsize)
 				mtype_del_cidr(h, CIDR(data->cidr),
 					       nets_length, 0);
 #endif
+				ip_set_ext_destroy(set, data);
 				if (j != n->pos - 1)
 					/* Not last one */
 					memcpy(data,
@@ -557,7 +576,7 @@ retry:
 				mtype_data_reset_flags(data, &flags);
 #endif
 				read_unlock_bh(&set->lock);
-				ahash_destroy(t);
+				mtype_ahash_destroy(set, t);
 				if (ret == -EAGAIN)
 					goto retry;
 				return ret;
@@ -578,7 +597,7 @@ retry:
 
 	pr_debug("set %s resized from %u (%p) to %u (%p)\n", set->name,
 		 orig->htable_bits, orig, t->htable_bits, t);
-	ahash_destroy(orig);
+	mtype_ahash_destroy(set, orig);
 
 	return 0;
 }
@@ -642,6 +661,7 @@ reuse_slot:
 		mtype_del_cidr(h, CIDR(data->cidr), NLEN(set->family), 0);
 		mtype_add_cidr(h, CIDR(d->cidr), NLEN(set->family), 0);
 #endif
+		ip_set_ext_destroy(set, data);
 	} else {
 		/* Use/create a new slot */
 		TUNE_AHASH_MAX(h, multi);
@@ -707,6 +727,7 @@ mtype_del(struct ip_set *set, void *value, const struct ip_set_ext *ext,
 #ifdef IP_SET_HASH_WITH_NETS
 		mtype_del_cidr(h, CIDR(d->cidr), NLEN(set->family), 0);
 #endif
+		ip_set_ext_destroy(set, data);
 		if (n->pos + AHASH_INIT_SIZE < n->size) {
 			void *tmp = kzalloc((n->size - AHASH_INIT_SIZE)
 					    * set->dsize,
@@ -1033,7 +1054,7 @@ IPSET_TOKEN(HTYPE, _create)(struct ip_set *set, struct nlattr *tb[], u32 flags)
 	rcu_assign_pointer(h->table, t);
 
 	set->data = h;
-	if (set->family ==  NFPROTO_IPV4) {
+	if (set->family == NFPROTO_IPV4) {
 		set->variant = &IPSET_TOKEN(HTYPE, 4_variant);
 		set->dsize = ip_set_elem_len(set, tb,
 				sizeof(struct IPSET_TOKEN(HTYPE, 4_elem)));
diff --git a/net/netfilter/ipset/ip_set_list_set.c b/net/netfilter/ipset/ip_set_list_set.c
index 7fd11c7..e44986a 100644
--- a/net/netfilter/ipset/ip_set_list_set.c
+++ b/net/netfilter/ipset/ip_set_list_set.c
@@ -168,16 +168,19 @@ list_set_add(struct ip_set *set, u32 i, struct set_adt_elem *d,
 	struct set_elem *e = list_set_elem(set, map, i);
 
 	if (e->id != IPSET_INVALID_ID) {
-		if (i == map->size - 1)
+		if (i == map->size - 1) {
 			/* Last element replaced: e.g. add new,before,last */
 			ip_set_put_byindex(e->id);
-		else {
+			ip_set_ext_destroy(set, e);
+		} else {
 			struct set_elem *x = list_set_elem(set, map,
 							   map->size - 1);
 
 			/* Last element pushed off */
-			if (x->id != IPSET_INVALID_ID)
+			if (x->id != IPSET_INVALID_ID) {
 				ip_set_put_byindex(x->id);
+				ip_set_ext_destroy(set, x);
+			}
 			memmove(list_set_elem(set, map, i + 1), e,
 				set->dsize * (map->size - (i + 1)));
 		}
@@ -198,6 +201,7 @@ list_set_del(struct ip_set *set, u32 i)
 	struct set_elem *e = list_set_elem(set, map, i);
 
 	ip_set_put_byindex(e->id);
+	ip_set_ext_destroy(set, e);
 
 	if (i < map->size - 1)
 		memmove(e, list_set_elem(set, map, i + 1),
@@ -266,14 +270,14 @@ list_set_uadd(struct ip_set *set, void *value, const struct ip_set_ext *ext,
 	bool flag_exist = flags & IPSET_FLAG_EXIST;
 	u32 i, ret = 0;
 
+	if (SET_WITH_TIMEOUT(set))
+		set_cleanup_entries(set);
+
 	/* Check already added element */
 	for (i = 0; i < map->size; i++) {
 		e = list_set_elem(set, map, i);
 		if (e->id == IPSET_INVALID_ID)
 			goto insert;
-		else if (SET_WITH_TIMEOUT(set) &&
-			 ip_set_timeout_expired(ext_timeout(e, set)))
-			continue;
 		else if (e->id != d->id)
 			continue;
 
@@ -286,6 +290,8 @@ list_set_uadd(struct ip_set *set, void *value, const struct ip_set_ext *ext,
 			/* Can't re-add */
 			return -IPSET_ERR_EXIST;
 		/* Update extensions */
+		ip_set_ext_destroy(set, e);
+
 		if (SET_WITH_TIMEOUT(set))
 			ip_set_timeout_set(ext_timeout(e, set), ext->timeout);
 		if (SET_WITH_COUNTER(set))
@@ -423,6 +429,7 @@ list_set_flush(struct ip_set *set)
 		e = list_set_elem(set, map, i);
 		if (e->id != IPSET_INVALID_ID) {
 			ip_set_put_byindex(e->id);
+			ip_set_ext_destroy(set, e);
 			e->id = IPSET_INVALID_ID;
 		}
 	}
-- 
1.7.10.4


^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox