Netdev List

Netdev List
 help / color / mirror / Atom feed

* [PATCH iproute2] tests: Move tc related tests to testsuite/tests/tc folder
From: Vadim Kochan @ 2014-10-31  8:03 UTC (permalink / raw)
  To: netdev; +Cc: Vadim Kochan

With this change the results of tc tests will be recorded under:

    testsuite/results/tc/

The ip related tests can be added under:

    testsuite/tests/ip

Signed-off-by: Vadim Kochan <vadim4j@gmail.com>
---
 testsuite/tests/cbq.t            | 10 ------
 testsuite/tests/cls-testbed.t    | 73 ----------------------------------------
 testsuite/tests/dsmark.t         | 31 -----------------
 testsuite/tests/policer.t        | 13 -------
 testsuite/tests/tc/cbq.t         | 10 ++++++
 testsuite/tests/tc/cls-testbed.t | 73 ++++++++++++++++++++++++++++++++++++++++
 testsuite/tests/tc/dsmark.t      | 31 +++++++++++++++++
 testsuite/tests/tc/policer.t     | 13 +++++++
 8 files changed, 127 insertions(+), 127 deletions(-)
 delete mode 100755 testsuite/tests/cbq.t
 delete mode 100755 testsuite/tests/cls-testbed.t
 delete mode 100755 testsuite/tests/dsmark.t
 delete mode 100755 testsuite/tests/policer.t
 create mode 100755 testsuite/tests/tc/cbq.t
 create mode 100755 testsuite/tests/tc/cls-testbed.t
 create mode 100755 testsuite/tests/tc/dsmark.t
 create mode 100755 testsuite/tests/tc/policer.t

diff --git a/testsuite/tests/cbq.t b/testsuite/tests/cbq.t
deleted file mode 100755
index bff814b..0000000
--- a/testsuite/tests/cbq.t
+++ /dev/null
@@ -1,10 +0,0 @@
-#!/bin/sh
-$TC qdisc del dev $DEV root >/dev/null 2>&1
-$TC qdisc add dev $DEV root handle 10:0 cbq bandwidth 100Mbit avpkt 1400 mpu 64
-$TC class add dev $DEV parent 10:0  classid 10:12   cbq bandwidth 100mbit rate 100mbit allot 1514 prio 3 maxburst 1 avpkt  500 bounded
-$TC qdisc list dev $DEV
-$TC qdisc del dev $DEV root
-$TC qdisc list dev $DEV
-$TC qdisc add dev $DEV root handle 10:0 cbq bandwidth 100Mbit avpkt 1400 mpu 64
-$TC class add dev $DEV parent 10:0  classid 10:12   cbq bandwidth 100mbit rate 100mbit allot 1514 prio 3 maxburst 1 avpkt  500 bounded
-$TC qdisc del dev $DEV root
diff --git a/testsuite/tests/cls-testbed.t b/testsuite/tests/cls-testbed.t
deleted file mode 100755
index 2afc26f..0000000
--- a/testsuite/tests/cls-testbed.t
+++ /dev/null
@@ -1,73 +0,0 @@
-#!/bin/bash
-# vim: ft=sh
-
-source lib/generic.sh
-
-QDISCS="cbq htb dsmark"
-
-if [ ! -d tests/cls ]; then
-    ts_log "tests/cls folder does not exist"
-    ts_skip
-fi
-
-for q in ${QDISCS}; do
-	ts_log "Preparing classifier testbed with qdisc $q"
-
-	for c in tests/cls/*.c; do
-
-		case "$q" in
-		cbq)
-			ts_tc "cls-testbed" "cbq root qdisc creation" \
-				qdisc add dev $DEV root handle 10:0 \
-				cbq bandwidth 100Mbit avpkt 1400 mpu 64
-			ts_tc "cls-testbed" "cbq root class creation" \
-				class add dev $DEV parent 10:0  classid 10:12 \
-				cbq bandwidth 100mbit rate 100mbit allot 1514 prio 3 \
-				maxburst 1 avpkt  500 bounded
-			;;
-		htb)
-			ts_qdisc_available "htb"
-			if [ $? -eq 0 ]; then
-				ts_log "cls-testbed: HTB is unsupported by $TC, skipping"
-				continue;
-			fi
-			ts_tc "cls-testbed" "htb root qdisc creation" \
-				qdisc add dev $DEV root handle 10:0 htb
-			ts_tc "cls-testbed" "htb root class creation" \
-				class add dev $DEV parent 10:0 classid 10:12 \
-				htb rate 100Mbit quantum 1514
-			;;
-		dsmark)
-			ts_qdisc_available "dsmark"
-			if [ $? -eq 0 ]; then
-				ts_log "cls-testbed: dsmark is unsupported by $TC, skipping"
-				continue;
-			fi
-			ts_tc "cls-testbed" "dsmark root qdisc creation" \
-				qdisc add dev $DEV root handle 20:0 \
-				dsmark indices 64 default_index 1 set_tc_index
-			ts_tc "cls-testbed" "dsmark class creation" \
-				class change dev $DEV parent 20:0 classid 20:12 \
-				dsmark mask 0xff value 2
-			ts_tc "cls-testbed" "prio inner qdisc creation" \
-				qdisc add dev $DEV parent 20:0 handle 10:0 prio
-			;;
-		*)
-			ts_err "cls-testbed: no testbed configuration found for qdisc $q"
-			continue
-			;;
-		esac
-
-		ts_tc "cls-testbed" "tree listing" qdisc list dev eth0
-		ts_tc "cls-testbed" "tree class listing" class list dev eth0
-		ts_log "cls-testbed: starting classifier test $c"
-		$c 
-
-		case "$q" in
-		*)
-			ts_tc "cls-testbed" "generic qdisc tree deletion" \
-				qdisc del dev $DEV root
-			;;
-		esac
-	done
-done
diff --git a/testsuite/tests/dsmark.t b/testsuite/tests/dsmark.t
deleted file mode 100755
index 6934165..0000000
--- a/testsuite/tests/dsmark.t
+++ /dev/null
@@ -1,31 +0,0 @@
-#!/bin/bash
-# vim: ft=sh
-
-source lib/generic.sh
-
-ts_qdisc_available "dsmark"
-if [ $? -eq 0 ]; then
-	ts_log "dsmark: Unsupported by $TC, skipping"
-	exit 127
-fi
-
-ts_tc "dsmark" "dsmark root qdisc creation" \
-	qdisc add dev $DEV root handle 10:0 \
-	dsmark indices 64 default_index 1 set_tc_index
-
-ts_tc "dsmark" "dsmark class 1 creation" \
-	class change dev $DEV parent 10:0 classid 10:12 \
-	dsmark mask 0xff value 2
-
-ts_tc "dsmark" "dsmark class 2 creation" \
-	class change dev $DEV parent 10:0 classid 10:13 \
-	dsmark mask 0xfc value 4
-
-ts_tc "dsmark" "dsmark dump qdisc" \
-	qdisc list dev $DEV
-
-ts_tc "dsmark" "dsmark dump class" \
-	class list dev $DEV parent 10:0
-
-ts_tc "dsmark" "generic qdisc tree deletion" \
-	qdisc del dev $DEV root
diff --git a/testsuite/tests/policer.t b/testsuite/tests/policer.t
deleted file mode 100755
index eaf16ac..0000000
--- a/testsuite/tests/policer.t
+++ /dev/null
@@ -1,13 +0,0 @@
-#!/bin/sh
-$TC qdisc del dev $DEV root >/dev/null 2>&1
-$TC qdisc add dev $DEV root handle 10:0 cbq bandwidth 100Mbit avpkt 1400 mpu 64
-$TC class add dev $DEV parent 10:0  classid 10:12   cbq bandwidth 100mbit rate 100mbit allot 1514 prio 3 maxburst 1 avpkt  500 bounded
-$TC filter add dev $DEV parent 10:0 protocol ip prio 10 u32 match ip protocol 1 0xff police rate 2kbit buffer 10k drop flowid 10:12
-$TC qdisc list dev $DEV
-$TC filter list dev $DEV parent 10:0
-$TC qdisc del dev $DEV root
-$TC qdisc list dev $DEV
-$TC qdisc add dev $DEV root handle 10:0 cbq bandwidth 100Mbit avpkt 1400 mpu 64
-$TC class add dev $DEV parent 10:0  classid 10:12   cbq bandwidth 100mbit rate 100mbit allot 1514 prio 3 maxburst 1 avpkt  500 bounded
-$TC filter add dev $DEV parent 10:0 protocol ip prio 10 u32 match ip protocol 1 0xff police rate 2kbit buffer 10k drop flowid 10:12
-$TC qdisc del dev $DEV root
diff --git a/testsuite/tests/tc/cbq.t b/testsuite/tests/tc/cbq.t
new file mode 100755
index 0000000..bff814b
--- /dev/null
+++ b/testsuite/tests/tc/cbq.t
@@ -0,0 +1,10 @@
+#!/bin/sh
+$TC qdisc del dev $DEV root >/dev/null 2>&1
+$TC qdisc add dev $DEV root handle 10:0 cbq bandwidth 100Mbit avpkt 1400 mpu 64
+$TC class add dev $DEV parent 10:0  classid 10:12   cbq bandwidth 100mbit rate 100mbit allot 1514 prio 3 maxburst 1 avpkt  500 bounded
+$TC qdisc list dev $DEV
+$TC qdisc del dev $DEV root
+$TC qdisc list dev $DEV
+$TC qdisc add dev $DEV root handle 10:0 cbq bandwidth 100Mbit avpkt 1400 mpu 64
+$TC class add dev $DEV parent 10:0  classid 10:12   cbq bandwidth 100mbit rate 100mbit allot 1514 prio 3 maxburst 1 avpkt  500 bounded
+$TC qdisc del dev $DEV root
diff --git a/testsuite/tests/tc/cls-testbed.t b/testsuite/tests/tc/cls-testbed.t
new file mode 100755
index 0000000..2afc26f
--- /dev/null
+++ b/testsuite/tests/tc/cls-testbed.t
@@ -0,0 +1,73 @@
+#!/bin/bash
+# vim: ft=sh
+
+source lib/generic.sh
+
+QDISCS="cbq htb dsmark"
+
+if [ ! -d tests/cls ]; then
+    ts_log "tests/cls folder does not exist"
+    ts_skip
+fi
+
+for q in ${QDISCS}; do
+	ts_log "Preparing classifier testbed with qdisc $q"
+
+	for c in tests/cls/*.c; do
+
+		case "$q" in
+		cbq)
+			ts_tc "cls-testbed" "cbq root qdisc creation" \
+				qdisc add dev $DEV root handle 10:0 \
+				cbq bandwidth 100Mbit avpkt 1400 mpu 64
+			ts_tc "cls-testbed" "cbq root class creation" \
+				class add dev $DEV parent 10:0  classid 10:12 \
+				cbq bandwidth 100mbit rate 100mbit allot 1514 prio 3 \
+				maxburst 1 avpkt  500 bounded
+			;;
+		htb)
+			ts_qdisc_available "htb"
+			if [ $? -eq 0 ]; then
+				ts_log "cls-testbed: HTB is unsupported by $TC, skipping"
+				continue;
+			fi
+			ts_tc "cls-testbed" "htb root qdisc creation" \
+				qdisc add dev $DEV root handle 10:0 htb
+			ts_tc "cls-testbed" "htb root class creation" \
+				class add dev $DEV parent 10:0 classid 10:12 \
+				htb rate 100Mbit quantum 1514
+			;;
+		dsmark)
+			ts_qdisc_available "dsmark"
+			if [ $? -eq 0 ]; then
+				ts_log "cls-testbed: dsmark is unsupported by $TC, skipping"
+				continue;
+			fi
+			ts_tc "cls-testbed" "dsmark root qdisc creation" \
+				qdisc add dev $DEV root handle 20:0 \
+				dsmark indices 64 default_index 1 set_tc_index
+			ts_tc "cls-testbed" "dsmark class creation" \
+				class change dev $DEV parent 20:0 classid 20:12 \
+				dsmark mask 0xff value 2
+			ts_tc "cls-testbed" "prio inner qdisc creation" \
+				qdisc add dev $DEV parent 20:0 handle 10:0 prio
+			;;
+		*)
+			ts_err "cls-testbed: no testbed configuration found for qdisc $q"
+			continue
+			;;
+		esac
+
+		ts_tc "cls-testbed" "tree listing" qdisc list dev eth0
+		ts_tc "cls-testbed" "tree class listing" class list dev eth0
+		ts_log "cls-testbed: starting classifier test $c"
+		$c 
+
+		case "$q" in
+		*)
+			ts_tc "cls-testbed" "generic qdisc tree deletion" \
+				qdisc del dev $DEV root
+			;;
+		esac
+	done
+done
diff --git a/testsuite/tests/tc/dsmark.t b/testsuite/tests/tc/dsmark.t
new file mode 100755
index 0000000..6934165
--- /dev/null
+++ b/testsuite/tests/tc/dsmark.t
@@ -0,0 +1,31 @@
+#!/bin/bash
+# vim: ft=sh
+
+source lib/generic.sh
+
+ts_qdisc_available "dsmark"
+if [ $? -eq 0 ]; then
+	ts_log "dsmark: Unsupported by $TC, skipping"
+	exit 127
+fi
+
+ts_tc "dsmark" "dsmark root qdisc creation" \
+	qdisc add dev $DEV root handle 10:0 \
+	dsmark indices 64 default_index 1 set_tc_index
+
+ts_tc "dsmark" "dsmark class 1 creation" \
+	class change dev $DEV parent 10:0 classid 10:12 \
+	dsmark mask 0xff value 2
+
+ts_tc "dsmark" "dsmark class 2 creation" \
+	class change dev $DEV parent 10:0 classid 10:13 \
+	dsmark mask 0xfc value 4
+
+ts_tc "dsmark" "dsmark dump qdisc" \
+	qdisc list dev $DEV
+
+ts_tc "dsmark" "dsmark dump class" \
+	class list dev $DEV parent 10:0
+
+ts_tc "dsmark" "generic qdisc tree deletion" \
+	qdisc del dev $DEV root
diff --git a/testsuite/tests/tc/policer.t b/testsuite/tests/tc/policer.t
new file mode 100755
index 0000000..eaf16ac
--- /dev/null
+++ b/testsuite/tests/tc/policer.t
@@ -0,0 +1,13 @@
+#!/bin/sh
+$TC qdisc del dev $DEV root >/dev/null 2>&1
+$TC qdisc add dev $DEV root handle 10:0 cbq bandwidth 100Mbit avpkt 1400 mpu 64
+$TC class add dev $DEV parent 10:0  classid 10:12   cbq bandwidth 100mbit rate 100mbit allot 1514 prio 3 maxburst 1 avpkt  500 bounded
+$TC filter add dev $DEV parent 10:0 protocol ip prio 10 u32 match ip protocol 1 0xff police rate 2kbit buffer 10k drop flowid 10:12
+$TC qdisc list dev $DEV
+$TC filter list dev $DEV parent 10:0
+$TC qdisc del dev $DEV root
+$TC qdisc list dev $DEV
+$TC qdisc add dev $DEV root handle 10:0 cbq bandwidth 100Mbit avpkt 1400 mpu 64
+$TC class add dev $DEV parent 10:0  classid 10:12   cbq bandwidth 100mbit rate 100mbit allot 1514 prio 3 maxburst 1 avpkt  500 bounded
+$TC filter add dev $DEV parent 10:0 protocol ip prio 10 u32 match ip protocol 1 0xff police rate 2kbit buffer 10k drop flowid 10:12
+$TC qdisc del dev $DEV root
-- 
2.1.0

^ permalink raw reply related

* Re: DMA allocations from CMA and fatal_signal_pending check
From: Joonsoo Kim @ 2014-10-31  8:28 UTC (permalink / raw)
  To: Florian Fainelli
  Cc: linux-arm-kernel, Brian Norris, Gregory Fong, linux-kernel,
	linux-mm, lauraa, gioh.kim, aneesh.kumar, mina86, m.szyprowski,
	akpm, netdev@vger.kernel.org
In-Reply-To: <544FE9BE.6040503@gmail.com>

On Tue, Oct 28, 2014 at 12:08:46PM -0700, Florian Fainelli wrote:
> Hello,
> 
> While debugging why some dma_alloc_coherent() allocations where
> returning NULL on our brcmstb platform, specifically with
> drivers/net/ethernet/broadcom/bcmcsysport.c, I came across the
> fatal_signal_pending() check in mm/page_alloc.c which is there.
> 
> This driver calls dma_alloc_coherent(, GFP_KERNEL) which ends up making
> a coherent allocation from a CMA region on our platform. Since that
> allocation is allowed to sleep, and because we are in bcm_syport_open(),
> executed from process context, a pending signal makes
> dma_alloc_coherent() return NULL.

Hello, Florian.

fatal_signal_pending means that there is SIGKILL on that process.
I guess that caller of dma_alloc_coherent() will die soon.
In this case, why CMA should be succeed?

> 
> There are two ways I could fix this:
> 
> - use a GFP_ATOMIC allocation, which would avoid this sensitivity to a
> pending signal being fatal (we suffer from the same issue in
> bcm_sysport_resume)
> 
> - move the DMA coherent allocation before bcm_sysport_open(), in the
> driver's probe function, but if the network interface is never used, we
> would be waisting precious DMA coherent memory for nothing (it is only 4
> bytes times 32 but still

I guess that it is okay that bcm_sysport_open() return -EINTR?

Thanks.

^ permalink raw reply

* iwlwifi: mvm:  BACKPORT_WANT_DEV_COREDUMP?
From: Paul Bolle @ 2014-10-31  8:40 UTC (permalink / raw)
  To: Johannes Berg
  Cc: Valentin Rothberg, Emmanuel Grumbach, John W. Linville,
	Intel Linux Wireless, linux-wireless, netdev, linux-kernel

Your commit aadede6e9f4c ("iwlwifi: mvm: port to devcoredump framework")
landed in today's linux-next (next-20141031). It adds a select statement
for BACKPORT_WANT_DEV_COREDUMP. There's no Kconfig symbol 
BACKPORT_WANT_DEV_COREDUMP so this select is currently a nop. (In
https://lkml.org/lkml/2014/9/30/578 I proposed a patch that emits a
warning in cases like this.)

Did you perhaps meant to select WANT_DEV_COREDUMP? Or is the Kconfig
symbol BACKPORT_WANT_DEV_COREDUMP queued somewhere?

Paul Bolle

^ permalink raw reply

* Re: iwlwifi: mvm:  BACKPORT_WANT_DEV_COREDUMP?
From: Johannes Berg @ 2014-10-31  8:45 UTC (permalink / raw)
  To: Paul Bolle
  Cc: Valentin Rothberg, Emmanuel Grumbach, John W. Linville,
	Intel Linux Wireless, linux-wireless, netdev, linux-kernel
In-Reply-To: <1414744834.1964.7.camel@x220>

On Fri, 2014-10-31 at 09:40 +0100, Paul Bolle wrote:
> Your commit aadede6e9f4c ("iwlwifi: mvm: port to devcoredump framework")
> landed in today's linux-next (next-20141031). It adds a select statement
> for BACKPORT_WANT_DEV_COREDUMP. There's no Kconfig symbol 
> BACKPORT_WANT_DEV_COREDUMP so this select is currently a nop. (In
> https://lkml.org/lkml/2014/9/30/578 I proposed a patch that emits a
> warning in cases like this.)
> 
> Did you perhaps meant to select WANT_DEV_COREDUMP?

Yes. We'll fix it up in the iwlwifi tree.

Thanks for the report!

johannes

^ permalink raw reply

* Re: iwlwifi: mvm:  BACKPORT_WANT_DEV_COREDUMP?
From: Paul Bolle @ 2014-10-31  9:06 UTC (permalink / raw)
  To: Johannes Berg
  Cc: Valentin Rothberg, Emmanuel Grumbach, John W. Linville,
	Intel Linux Wireless, linux-wireless, netdev, linux-kernel
In-Reply-To: <1414745106.3014.17.camel@jlt4.sipsolutions.net>

On Fri, 2014-10-31 at 09:45 +0100, Johannes Berg wrote:
> On Fri, 2014-10-31 at 09:40 +0100, Paul Bolle wrote:
> > Your commit aadede6e9f4c ("iwlwifi: mvm: port to devcoredump framework")
> > landed in today's linux-next (next-20141031). It adds a select statement
> > for BACKPORT_WANT_DEV_COREDUMP. There's no Kconfig symbol 
> > BACKPORT_WANT_DEV_COREDUMP so this select is currently a nop. (In
> > https://lkml.org/lkml/2014/9/30/578 I proposed a patch that emits a
> > warning in cases like this.)
> > 
> > Did you perhaps meant to select WANT_DEV_COREDUMP?
> 
> Yes. We'll fix it up in the iwlwifi tree.
> 
> Thanks for the report!

Perhaps you could also look into somehow guarding the call of
dev_coredumpm(), that this commit added, with checks for
CONFIG_DEV_COREDUMP. See, I had a quick look at all this and selecting
WANT_DEV_COREDUMP might not be enough, because DISABLE_DEV_COREDUMP can
still, well, disable DEV_COREDUMP. Or am I misreading the Kconfig
symbols that regulate DEV_COREDUMP?

Thanks,


Paul Bolle

^ permalink raw reply

* Re: iwlwifi: mvm:  BACKPORT_WANT_DEV_COREDUMP?
From: Johannes Berg @ 2014-10-31  9:08 UTC (permalink / raw)
  To: Paul Bolle
  Cc: Valentin Rothberg, Emmanuel Grumbach, John W. Linville,
	Intel Linux Wireless, linux-wireless, netdev, linux-kernel
In-Reply-To: <1414746397.1964.13.camel@x220>

On Fri, 2014-10-31 at 10:06 +0100, Paul Bolle wrote:
> On Fri, 2014-10-31 at 09:45 +0100, Johannes Berg wrote:
> > On Fri, 2014-10-31 at 09:40 +0100, Paul Bolle wrote:
> > > Your commit aadede6e9f4c ("iwlwifi: mvm: port to devcoredump framework")
> > > landed in today's linux-next (next-20141031). It adds a select statement
> > > for BACKPORT_WANT_DEV_COREDUMP. There's no Kconfig symbol 
> > > BACKPORT_WANT_DEV_COREDUMP so this select is currently a nop. (In
> > > https://lkml.org/lkml/2014/9/30/578 I proposed a patch that emits a
> > > warning in cases like this.)
> > > 
> > > Did you perhaps meant to select WANT_DEV_COREDUMP?
> > 
> > Yes. We'll fix it up in the iwlwifi tree.
> > 
> > Thanks for the report!
> 
> Perhaps you could also look into somehow guarding the call of
> dev_coredumpm(), that this commit added, with checks for
> CONFIG_DEV_COREDUMP. See, I had a quick look at all this and selecting
> WANT_DEV_COREDUMP might not be enough, because DISABLE_DEV_COREDUMP can
> still, well, disable DEV_COREDUMP. Or am I misreading the Kconfig
> symbols that regulate DEV_COREDUMP?

No, you're correctly reading that. However, the devcoredump header file
provides simple functions in this case. That means there's some extra
work (allocating and filling the buffer just to free it immediately) but
it simplifies the code.

johannes

^ permalink raw reply

* Re: iwlwifi: mvm:  BACKPORT_WANT_DEV_COREDUMP?
From: Paul Bolle @ 2014-10-31  9:17 UTC (permalink / raw)
  To: Johannes Berg
  Cc: Valentin Rothberg, Emmanuel Grumbach, John W. Linville,
	Intel Linux Wireless, linux-wireless, netdev, linux-kernel
In-Reply-To: <1414746518.3014.18.camel@jlt4.sipsolutions.net>

On Fri, 2014-10-31 at 10:08 +0100, Johannes Berg wrote:
> On Fri, 2014-10-31 at 10:06 +0100, Paul Bolle wrote:
> > Perhaps you could also look into somehow guarding the call of
> > dev_coredumpm(), that this commit added, with checks for
> > CONFIG_DEV_COREDUMP. See, I had a quick look at all this and selecting
> > WANT_DEV_COREDUMP might not be enough, because DISABLE_DEV_COREDUMP can
> > still, well, disable DEV_COREDUMP. Or am I misreading the Kconfig
> > symbols that regulate DEV_COREDUMP?
> 
> No, you're correctly reading that. However, the devcoredump header file
> provides simple functions in this case. That means there's some extra
> work (allocating and filling the buffer just to free it immediately) but
> it simplifies the code.

I see. More than a quick look was required here. Thanks for explaining
this!


Paul Bolle

^ permalink raw reply

* Re: [PATCH -next 0/2] net: allow setting ecn via routing table
From: Daniel Borkmann @ 2014-10-31  9:24 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: Florian Westphal, David Miller, netdev
In-Reply-To: <1414710342.15352.14.camel@edumazet-glaptop2.roam.corp.google.com>

On 10/31/2014 12:05 AM, Eric Dumazet wrote:
> On Thu, 2014-10-30 at 23:15 +0100, Florian Westphal wrote:
>
>> Do you think a fallback to non-ecn for retransmitted syns would help?
>> If not, do you think having ecn tunable available via route is helpful?
>
> Unfortunately some firewalls are buggy and accept a single SYN per flow.
>
> You would need to blacklist ecn at first sign of a possible ecn problem,
> for all following connections attempts.
>
> Note that ECN problems are not only contained in SYN packets being
> eventually dropped. You might have a success to establish a flow, and
> later get CE marks for all packets and cwnd converges to 1.

Wow, that is buggy! Btw, fwiw, there was a recent study [1] (paper
not public yet) which scanned the Alexa's publicly available top
million websites list from a vantage point in US, Europe and Asia:

Half of the Alexa list will now happily use ECN (tcp_ecn=2, most
likely blamed to commit 255cac91c3c9 ;)), the break in connectivity
on-path was found is about 1 in 10,000 cases. Timeouts rather than
receiving back RSTs were much more common in the negotiation phase
(and mostly seen in the Alexa middle band, ranks around 50k-150k):
from 12-thousand hosts on which there _may_ be ECN-linked connection
failures, only 79 failed with RST when _not_ failing with RST when
ECN is not requested.

It's unclear though, how much equipment actually marks the CE, and
as you mention above, marks it correctly ...

> This is really a lot of work to get all sorted out.

Yep, you are right.

  [1] http://ecn.ethz.ch/

^ permalink raw reply

* Re: [PATCH net-next v4 1/4] netns: add genl cmd to add and get peer netns ids
From: Nicolas Dichtel @ 2014-10-31  9:41 UTC (permalink / raw)
  To: Eric W. Biederman
  Cc: netdev-u79uwXL29TY76Z2rM5mHXA,
	containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA,
	linux-api-u79uwXL29TY76Z2rM5mHXA, davem-fT/PcQaiUtIeIZ0/mPfg9Q,
	stephen-OTpzqLSitTUnbdJkjeBofR2eb7JE58TQ,
	akpm-de/tnXTf+JLsfHDXvbKv3WD2FQJk+8+b,
	luto-kltTT9wpgjJwATOyAt5JVQ, cwang-xCSkyg8dI+0RB7SZvlqPiA
In-Reply-To: <874mulh0cs.fsf-JOvCrm2gF+uungPnsOpG7nhyD016LWXt@public.gmane.org>

Le 30/10/2014 19:35, Eric W. Biederman a écrit :
> Nicolas Dichtel <nicolas.dichtel-pdR9zngts4EAvxtiuMwx3w@public.gmane.org> writes:
>
>> With this patch, a user can define an id for a peer netns by providing a FD or a
>> PID. These ids are local to netns (ie valid only into one netns).
>
> Scratches head.  Do you actually find value in using the pid instead of
> a file descriptor?
I copied the mechanism from rtnl_link_get_net():
First check if the user provides a PID, if not, check for a FD.

>
> Doing things by pid was an early attempt to make things work, and has
> been a bit clutsy.  If you don't find value in it I would recommend just
> supporting getting/setting the network namespace by file descriptor.
Hmm, if I understand well, it's what is done in the patch:

[snip]
>> +static int netns_nl_cmd_newid(struct sk_buff *skb, struct genl_info *info)
>> +{
[snip]
>> +	if (info->attrs[NETNSA_PID])
>> +		peer = get_net_ns_by_pid(nla_get_u32(info->attrs[NETNSA_PID]));
>> +	else if (info->attrs[NETNSA_FD])
>> +		peer = get_net_ns_by_fd(nla_get_u32(info->attrs[NETNSA_FD]));
>> +	else
>> +		return -EINVAL;
>> +	if (IS_ERR(peer))
>> +		return PTR_ERR(peer);

Am I right?


Regards,
Nicolas

^ permalink raw reply

* Re: [PATCH net-next v4 0/4] netns: allow to identify peer netns
From: Nicolas Dichtel @ 2014-10-31  9:48 UTC (permalink / raw)
  To: Eric W. Biederman
  Cc: netdev-u79uwXL29TY76Z2rM5mHXA,
	containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA,
	linux-api-u79uwXL29TY76Z2rM5mHXA, davem-fT/PcQaiUtIeIZ0/mPfg9Q,
	stephen-OTpzqLSitTUnbdJkjeBofR2eb7JE58TQ,
	akpm-de/tnXTf+JLsfHDXvbKv3WD2FQJk+8+b,
	luto-kltTT9wpgjJwATOyAt5JVQ, cwang-xCSkyg8dI+0RB7SZvlqPiA
In-Reply-To: <871tpph03k.fsf-JOvCrm2gF+uungPnsOpG7nhyD016LWXt@public.gmane.org>

Le 30/10/2014 19:41, Eric W. Biederman a écrit :
> Nicolas Dichtel <nicolas.dichtel-pdR9zngts4EAvxtiuMwx3w@public.gmane.org> writes:
>
>> The goal of this serie is to be able to multicast netlink messages with an
>> attribute that identify a peer netns.
>> This is needed by the userland to interpret some informations contained in
>> netlink messages (like IFLA_LINK value, but also some other attributes in case
>> of x-netns netdevice (see also
>> http://thread.gmane.org/gmane.linux.network/315933/focus=316064 and
>> http://thread.gmane.org/gmane.linux.kernel.containers/28301/focus=4239)).
>>
>> Ids of peer netns are set by userland via a new genl messages. These ids are
>> stored per netns and are local (ie only valid in the netns where they are set).
>> To avoid allocating an int for each peer netns, I use idr_for_each() to retrieve
>> the id of a peer netns. Note that it will be possible to add a table (struct net
>> -> id) later to optimize this lookup if needed.
>>
>> Patch 1/4 introduces the netlink API mechanism to set and get these ids.
>> Patch 2/4 and 3/4 implements an example of how to use these ids in rtnetlink
>> messages. And patch 4/4 shows that the netlink messages can be symetric between
>> a GET and a SET.
>>
>> iproute2 patches are available, I can send them on demand.
>
> A quick reply.  I think this patchset is in the right general direction.
> There are some oddball details that seem odd/awkward to me such as using
> genetlink instead of rtnetlink to get and set the ids, and not having
> ids if they are not set (that feels like a maintenance/usability challenge).
No problem to use rtnetlink, in fact, I hesitated.

For the second point, I'm not sure to follow you: how to have an id, which will
not break migration, without asking the user to set it?
Note that if the user does not provide an id, you still have a magic value to
say "it's a peer netns but we don't know which one".

>
> I would like to give your patches a deep review, but I won't be able to
> do that for a couple of weeks.  I am deep in the process of moving,
> and will be mostly offline until about the Nov 11th.

No problem, I will wait.
I would be great to get a final version for the 3.19 ;-)


Thank you,
Nicolas

^ permalink raw reply

* [PATCH net-next v2 0/3] Code adjustment
From: Hayes Wang @ 2014-10-31  9:56 UTC (permalink / raw)
  To: netdev; +Cc: nic_swsd, linux-kernel, linux-usb, Hayes Wang
In-Reply-To: <1394712342-15778-75-Taiwan-albertk@realtek.com>

v2:
 Correct the spelling error for the comment of patch #3.

v1:
Adjust some codes to make them more reasonable.

Hayes Wang (3):
  r8152: remove the duplicate init for the list of rx_done
  r8152: clear the flag of SCHEDULE_TASKLET in tasklet
  r8152: check RTL8152_UNPLUG and netif_running before autoresume

 drivers/net/usb/r8152.c | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

-- 
1.9.3

^ permalink raw reply

* [PATCH net-next v2 1/3] r8152: remove the duplicate init for the list of rx_done
From: Hayes Wang @ 2014-10-31  9:56 UTC (permalink / raw)
  To: netdev; +Cc: nic_swsd, linux-kernel, linux-usb, Hayes Wang
In-Reply-To: <1394712342-15778-79-Taiwan-albertk@realtek.com>

The INIT_LIST_HEAD(&tp->rx_done) would be done in rtl_start_rx(),
so remove the unnecessary one in alloc_all_mem().

Signed-off-by: Hayes Wang <hayeswang@realtek.com>
---
 drivers/net/usb/r8152.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/drivers/net/usb/r8152.c b/drivers/net/usb/r8152.c
index f116335..ff54098 100644
--- a/drivers/net/usb/r8152.c
+++ b/drivers/net/usb/r8152.c
@@ -1256,7 +1256,6 @@ static int alloc_all_mem(struct r8152 *tp)
 
 	spin_lock_init(&tp->rx_lock);
 	spin_lock_init(&tp->tx_lock);
-	INIT_LIST_HEAD(&tp->rx_done);
 	INIT_LIST_HEAD(&tp->tx_free);
 	skb_queue_head_init(&tp->tx_queue);
 
-- 
1.9.3

^ permalink raw reply related

* [PATCH net-next v2 3/3] r8152: check RTL8152_UNPLUG and netif_running before autoresume
From: Hayes Wang @ 2014-10-31  9:56 UTC (permalink / raw)
  To: netdev; +Cc: nic_swsd, linux-kernel, linux-usb, Hayes Wang
In-Reply-To: <1394712342-15778-79-Taiwan-albertk@realtek.com>

If the device is unplugged or !netif_running(), the workqueue
doesn't need to wake the device, and could return directly.

Signed-off-by: Hayes Wang <hayeswang@realtek.com>
---
 drivers/net/usb/r8152.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/drivers/net/usb/r8152.c b/drivers/net/usb/r8152.c
index 670279a..90cc89c 100644
--- a/drivers/net/usb/r8152.c
+++ b/drivers/net/usb/r8152.c
@@ -2859,15 +2859,18 @@ static void rtl_work_func_t(struct work_struct *work)
 {
 	struct r8152 *tp = container_of(work, struct r8152, schedule.work);
 
+	/* If the device is unplugged or !netif_running(), the workqueue
+	 * doesn't need to wake the device, and could return directly.
+	 */
+	if (test_bit(RTL8152_UNPLUG, &tp->flags) || !netif_running(tp->netdev))
+		return;
+
 	if (usb_autopm_get_interface(tp->intf) < 0)
 		return;
 
 	if (!test_bit(WORK_ENABLE, &tp->flags))
 		goto out1;
 
-	if (test_bit(RTL8152_UNPLUG, &tp->flags))
-		goto out1;
-
 	if (!mutex_trylock(&tp->control)) {
 		schedule_delayed_work(&tp->schedule, 0);
 		goto out1;
-- 
1.9.3

^ permalink raw reply related

* [PATCH net-next v2 2/3] r8152: clear the flag of SCHEDULE_TASKLET in tasklet
From: Hayes Wang @ 2014-10-31  9:56 UTC (permalink / raw)
  To: netdev; +Cc: nic_swsd, linux-kernel, linux-usb, Hayes Wang
In-Reply-To: <1394712342-15778-79-Taiwan-albertk@realtek.com>

Clear the flag of SCHEDULE_TASKLET in bottom_half() to avoid
re-schedule the tasklet again by workqueue.

Signed-off-by: Hayes Wang <hayeswang@realtek.com>
---
 drivers/net/usb/r8152.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/net/usb/r8152.c b/drivers/net/usb/r8152.c
index ff54098..670279a 100644
--- a/drivers/net/usb/r8152.c
+++ b/drivers/net/usb/r8152.c
@@ -1798,6 +1798,9 @@ static void bottom_half(unsigned long data)
 	if (!netif_carrier_ok(tp->netdev))
 		return;
 
+	if (test_bit(SCHEDULE_TASKLET, &tp->flags))
+		clear_bit(SCHEDULE_TASKLET, &tp->flags);
+
 	rx_bottom(tp);
 	tx_bottom(tp);
 }
-- 
1.9.3

^ permalink raw reply related

* Re: [PATCH v3 1/1] ip-link: add switch to show human readable output
From: Christian Hesse @ 2014-10-31 10:17 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: netdev
In-Reply-To: <1414660599-25707-1-git-send-email-mail@eworm.de>

[-- Attachment #1: Type: text/plain, Size: 3127 bytes --]

Stephen Hemminger <stephen@networkplumber.org> on Wed, 2014/10/29 22:47:
> I like the idea as a concept

Great! ;)

> but there are two issues:
>   1. The IEC suffix is a rarely used thing and is non-standard
>      for communications where K = 1000 M = 1000000 etc.
>      Please just use standard suffices

Removed the suffix in patch v3. Not sure if it is correct, though. I do use a
base of 2, so K = 1024, M = 1048576, ...

This is what ifconfig behaves as well. Output should give the exact same
numbers (except the suffix):

# ip -s -h link list en
3: en: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc fq_codel state UP
mode DEFAULT group default qlen 1000
    link/ether 00:de:ad:be:ee:ef brd ff:ff:ff:ff:ff:ff
    RX: bytes  packets  errors  dropped overrun mcast
    26.3M      59.2K    0       68      0       0
    TX: bytes  packets  errors  dropped carrier collsns
    37.0M      67.6K    0       0       0       0
# ifconfig en 
en: flags=4163<UP,BROADCAST,RUNNING,MULTICAST>  mtu 1500
        inet XX.XX.XX.XX  netmask 255.255.255.0  broadcast XX.XX.XX.255
        ether 00:de:ad:be:ee:ef  txqueuelen 1000  (Ethernet)
        RX packets 60677  bytes 27671339 (26.3 MiB)
        RX errors 0  dropped 68  overruns 0  frame 0
        TX packets 69350  bytes 38874609 (37.0 MiB)
        TX errors 0  dropped 0 overruns 0  carrier 0  collisions 0

>   2. Don't double print the data, if the user asks for human
>      format, only show the human format.

Fixed.

Hopefully I got the line breaks right. How to check print_link_stats32()?
Even von i686 ip calls print_link_stats64() on my systems.

I did some minor changes to the error stats column alignment. Is that ok? See
the difference of before and after:

# ip -s -s link list en    # before
2: eth0: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc pfifo_fast state UP
mode DEFAULT group default qlen 1000
    link/ether 00:de:ad:be:ee:ef brd ff:ff:ff:ff:ff:ff 
    RX: bytes  packets  errors  dropped overrun mcast   
    144544239  100441   0       0       0       1      
    RX errors: length  crc     frame   fifo    missed
               0        0       0       0       0      
    TX: bytes  packets  errors  dropped carrier collsns 
    808343     9630     0       0       0       0      
    TX errors: aborted fifo    window  heartbeat transns
               0        0       0       0        2      
# ip -s -s link list en    # after
2: eth0: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc pfifo_fast state UP
mode DEFAULT group default qlen 1000
    link/ether 00:de:ad:be:ee:ef brd ff:ff:ff:ff:ff:ff
    RX: bytes  packets  errors  dropped overrun mcast
    87432187   60642    0       0       0       1
    RX errors: length   crc     frame   fifo    missed
               0        0       0       0       0
    TX: bytes  packets  errors  dropped carrier collsns 
    450437     5793     0       0       0       0      
    TX errors: aborted  fifo   window heartbeat transns
               0        0       0       0       2      
-- 
Best regards,
Christian Hesse

[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 819 bytes --]

^ permalink raw reply

* Re: [GIT PULL nf] IPVS Fixes for v3.18
From: Pablo Neira Ayuso @ 2014-10-31 11:47 UTC (permalink / raw)
  To: Simon Horman
  Cc: lvs-devel, netdev, netfilter-devel, Wensong Zhang,
	Julian Anastasov
In-Reply-To: <1414458334-22479-1-git-send-email-horms@verge.net.au>

On Tue, Oct 28, 2014 at 10:05:33AM +0900, Simon Horman wrote:
> Hi Pablo,
> 
> please consider this fix for v3.18.
> 
> It fixes a null-pointer dereference that may occur when logging
> errors.
> 
> This problem was introduced by 4a4739d56b0 ("ipvs: Pull out
> crosses_local_route_boundary logic") in v3.17-rc5. As such I would
> also like it considered for 3.17-stable.
> 
> 
> The following changes since commit 7965ee93719921ea5978f331da653dfa2d7b99f5:
> 
>   netfilter: nft_compat: fix wrong target lookup in nft_target_select_ops() (2014-10-27 22:17:46 +0100)
> 
> are available in the git repository at:
> 
>   https://git.kernel.org/pub/scm/linux/kernel/git/horms/ipvs.git tags/ipvs-fixes-for-v3.18

Pulled, thanks Simon.

^ permalink raw reply

* [PATCH -next v2 0/2] net: allow setting ecn via routing table
From: Florian Westphal @ 2014-10-31 12:13 UTC (permalink / raw)
  To: netdev

Here is the resend of the v1 patchset, as requested.
Thanks again for all the valuable comments and the feedback received for
the original submission.

I did a quick re-test, with a 'feature ecn' route ecn set and tcp_ecn != 1
and the various combinations look ok:

host with ecnroute is initiator
- connect to host with tcp_ecn != 0: connection uses ecn
- connect to host with tcp_ecn == 0: ecn is announced but not used

host with ecnroute is responder:
- connect from host with tcp_ecn == 1: connection uses ecn
- connect from host with tcp_ecn != 1: connection does not uses ecn

original cover letter below:

These two patches allow turning on explicit congestion notification
based on the destination network.

For example, assuming the default tcp_ecn sysctl '2', the following will
enable ecn (tcp_ecn=1 behaviour, i.e. request ecn to be enabled for a
tcp connection) for all connections to hosts inside the 192.168.2/24 network:

ip route change 192.168.2.0/24 dev eth0 features ecn

Having a more fine-grained per-route setting can be beneficial for
various reasons, for example 1) within data centers, or 2) local ISPs
may deploy ECN support for their own video/streaming services [1], etc.

Joint work with Daniel Borkmann, feature suggested by Hannes Frederic Sowa.

The patch to enable this in iproute2 will be posted shortly, it is currently
also available here:
http://git.breakpoint.cc/cgit/fw/iproute2.git/commit/?h=iproute_features&id=8843d2d8973fb81c78a7efe6d42e3a17d739003e

[1] http://www.ietf.org/proceedings/89/slides/slides-89-tsvarea-1.pdf, p.15

^ permalink raw reply

* [PATCH -next v2 1/2] syncookies: remove ecn_ok validation when decoding option timestamp
From: Florian Westphal @ 2014-10-31 12:13 UTC (permalink / raw)
  To: netdev; +Cc: Florian Westphal
In-Reply-To: <1414757602-27637-1-git-send-email-fw@strlen.de>

Won't work anymore when tcp_ecn=0 and RTAX_FEATURES route attribute did
allow ecn.

Just turn on ecn if the client ts says so.

This means that while syn cookies are in use clients can turn on ecn
even if it is off on the server.

However, there seems to be no harm in permitting this.

Alternatively one can extend the test to also perform route lookup and
check RTAX_FEATURES, but it simply doesn't appear to be worth the effort.

Signed-off-by: Florian Westphal <fw@strlen.de>
---
 Changes since v1:
  - reword commit message

 include/net/tcp.h     | 3 +--
 net/ipv4/syncookies.c | 6 ++----
 net/ipv6/syncookies.c | 2 +-
 3 files changed, 4 insertions(+), 7 deletions(-)

diff --git a/include/net/tcp.h b/include/net/tcp.h
index 3a35b15..57521de 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -493,8 +493,7 @@ __u32 cookie_v4_init_sequence(struct sock *sk, const struct sk_buff *skb,
 #endif
 
 __u32 cookie_init_timestamp(struct request_sock *req);
-bool cookie_check_timestamp(struct tcp_options_received *opt, struct net *net,
-			    bool *ecn_ok);
+bool cookie_check_timestamp(struct tcp_options_received *opt, bool *ecn_ok);
 
 /* From net/ipv6/syncookies.c */
 int __cookie_v6_check(const struct ipv6hdr *iph, const struct tcphdr *th,
diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
index 4ac7bca..c4e5e2d 100644
--- a/net/ipv4/syncookies.c
+++ b/net/ipv4/syncookies.c
@@ -225,7 +225,7 @@ static inline struct sock *get_cookie_sock(struct sock *sk, struct sk_buff *skb,
  * return false if we decode an option that should not be.
  */
 bool cookie_check_timestamp(struct tcp_options_received *tcp_opt,
-			struct net *net, bool *ecn_ok)
+			    bool *ecn_ok)
 {
 	/* echoed timestamp, lowest bits contain options */
 	u32 options = tcp_opt->rcv_tsecr & TSMASK;
@@ -240,8 +240,6 @@ bool cookie_check_timestamp(struct tcp_options_received *tcp_opt,
 
 	tcp_opt->sack_ok = (options & (1 << 4)) ? TCP_SACK_SEEN : 0;
 	*ecn_ok = (options >> 5) & 1;
-	if (*ecn_ok && !net->ipv4.sysctl_tcp_ecn)
-		return false;
 
 	if (tcp_opt->sack_ok && !sysctl_tcp_sack)
 		return false;
@@ -290,7 +288,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb)
 	memset(&tcp_opt, 0, sizeof(tcp_opt));
 	tcp_parse_options(skb, &tcp_opt, 0, NULL);
 
-	if (!cookie_check_timestamp(&tcp_opt, sock_net(sk), &ecn_ok))
+	if (!cookie_check_timestamp(&tcp_opt, &ecn_ok))
 		goto out;
 
 	ret = NULL;
diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c
index be291ba..a08062c 100644
--- a/net/ipv6/syncookies.c
+++ b/net/ipv6/syncookies.c
@@ -186,7 +186,7 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb)
 	memset(&tcp_opt, 0, sizeof(tcp_opt));
 	tcp_parse_options(skb, &tcp_opt, 0, NULL);
 
-	if (!cookie_check_timestamp(&tcp_opt, sock_net(sk), &ecn_ok))
+	if (!cookie_check_timestamp(&tcp_opt, &ecn_ok))
 		goto out;
 
 	ret = NULL;
-- 
2.0.4

^ permalink raw reply related

* [PATCH -next v2 2/2] net: allow setting ecn via routing table
From: Florian Westphal @ 2014-10-31 12:13 UTC (permalink / raw)
  To: netdev; +Cc: Florian Westphal, Daniel Borkmann
In-Reply-To: <1414757602-27637-1-git-send-email-fw@strlen.de>

Allows to set ECN on a per-route basis in case the sysctl tcp_ecn is not set to 1.
IOW, when ECN is set for specific routes, it provides a tcp_ecn=1 behaviour for that
route while the rest of the stack acts according to the global settings.

Having a more fine-grained per-route setting can be beneficial for various reasons,
for example a) within data centers, or b) local ISPs may deploy ECN support for
their own video/streaming services [1], etc.

One can use 'ip route change dev $dev $net features ecn' to toggle this.

[1] http://www.ietf.org/proceedings/89/slides/slides-89-tsvarea-1.pdf, p.15

Joint work with Daniel Borkmann.

Reference: http://thread.gmane.org/gmane.linux.network/335797
Suggested-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
Signed-off-by: Daniel Borkmann <dborkman@redhat.com>
Signed-off-by: Florian Westphal <fw@strlen.de>
---
 Changes since v1:
 reword commit message and add reference to the discussion of the v1 patchset

 net/ipv4/tcp_input.c  | 25 +++++++++++++++----------
 net/ipv4/tcp_output.c | 12 ++++++++++--
 2 files changed, 25 insertions(+), 12 deletions(-)

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 4e4617e..9db942a 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -5876,20 +5876,22 @@ static inline void pr_drop_req(struct request_sock *req, __u16 port, int family)
  */
 static void tcp_ecn_create_request(struct request_sock *req,
 				   const struct sk_buff *skb,
-				   const struct sock *listen_sk)
+				   const struct sock *listen_sk,
+				   struct dst_entry *dst)
 {
 	const struct tcphdr *th = tcp_hdr(skb);
 	const struct net *net = sock_net(listen_sk);
 	bool th_ecn = th->ece && th->cwr;
-	bool ect, need_ecn;
+	bool ect, need_ecn, ecn_ok;
 
 	if (!th_ecn)
 		return;
 
 	ect = !INET_ECN_is_not_ect(TCP_SKB_CB(skb)->ip_dsfield);
 	need_ecn = tcp_ca_needs_ecn(listen_sk);
+	ecn_ok = net->ipv4.sysctl_tcp_ecn || dst_feature(dst, RTAX_FEATURE_ECN);
 
-	if (!ect && !need_ecn && net->ipv4.sysctl_tcp_ecn)
+	if (!ect && !need_ecn && ecn_ok)
 		inet_rsk(req)->ecn_ok = 1;
 	else if (ect && need_ecn)
 		inet_rsk(req)->ecn_ok = 1;
@@ -5954,13 +5956,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
 	if (security_inet_conn_request(sk, skb, req))
 		goto drop_and_free;
 
-	if (!want_cookie || tmp_opt.tstamp_ok)
-		tcp_ecn_create_request(req, skb, sk);
-
-	if (want_cookie) {
-		isn = cookie_init_sequence(af_ops, sk, skb, &req->mss);
-		req->cookie_ts = tmp_opt.tstamp_ok;
-	} else if (!isn) {
+	if (!want_cookie && !isn) {
 		/* VJ's idea. We save last timestamp seen
 		 * from the destination in peer table, when entering
 		 * state TIME-WAIT, and check against it before
@@ -6008,6 +6004,15 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
 			goto drop_and_free;
 	}
 
+	tcp_ecn_create_request(req, skb, sk, dst);
+
+	if (want_cookie) {
+		isn = cookie_init_sequence(af_ops, sk, skb, &req->mss);
+		req->cookie_ts = tmp_opt.tstamp_ok;
+		if (!tmp_opt.tstamp_ok)
+			inet_rsk(req)->ecn_ok = 0;
+	}
+
 	tcp_rsk(req)->snt_isn = isn;
 	tcp_openreq_init_rwin(req, sk, dst);
 	fastopen = !want_cookie &&
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 3af2129..b1c6296 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -333,10 +333,18 @@ static void tcp_ecn_send_synack(struct sock *sk, struct sk_buff *skb)
 static void tcp_ecn_send_syn(struct sock *sk, struct sk_buff *skb)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
+	bool use_ecn = sock_net(sk)->ipv4.sysctl_tcp_ecn == 1 ||
+		       tcp_ca_needs_ecn(sk);
+
+	if (!use_ecn) {
+		const struct dst_entry *dst = __sk_dst_get(sk);
+		if (dst && dst_feature(dst, RTAX_FEATURE_ECN))
+			use_ecn = true;
+	}
 
 	tp->ecn_flags = 0;
-	if (sock_net(sk)->ipv4.sysctl_tcp_ecn == 1 ||
-	    tcp_ca_needs_ecn(sk)) {
+
+	if (use_ecn) {
 		TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_ECE | TCPHDR_CWR;
 		tp->ecn_flags = TCP_ECN_OK;
 		if (tcp_ca_needs_ecn(sk))
-- 
2.0.4

^ permalink raw reply related

* [PATCH 3/6] netfilter: nf_reject_ipv4: split nf_send_reset() in smaller functions
From: Pablo Neira Ayuso @ 2014-10-31 12:18 UTC (permalink / raw)
  To: netfilter-devel; +Cc: davem, netdev
In-Reply-To: <1414757912-29150-1-git-send-email-pablo@netfilter.org>

That can be reused by the reject bridge expression to build the reject
packet. The new functions are:

* nf_reject_ip_tcphdr_get(): to sanitize and to obtain the TCP header.
* nf_reject_iphdr_put(): to build the IPv4 header.
* nf_reject_ip_tcphdr_put(): to build the TCP header.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/ipv4/nf_reject.h |   10 ++++
 net/ipv4/netfilter/nf_reject_ipv4.c    |   88 ++++++++++++++++++++++----------
 2 files changed, 72 insertions(+), 26 deletions(-)

diff --git a/include/net/netfilter/ipv4/nf_reject.h b/include/net/netfilter/ipv4/nf_reject.h
index e842719..03e928a 100644
--- a/include/net/netfilter/ipv4/nf_reject.h
+++ b/include/net/netfilter/ipv4/nf_reject.h
@@ -1,6 +1,8 @@
 #ifndef _IPV4_NF_REJECT_H
 #define _IPV4_NF_REJECT_H
 
+#include <linux/skbuff.h>
+#include <net/ip.h>
 #include <net/icmp.h>
 
 static inline void nf_send_unreach(struct sk_buff *skb_in, int code)
@@ -10,4 +12,12 @@ static inline void nf_send_unreach(struct sk_buff *skb_in, int code)
 
 void nf_send_reset(struct sk_buff *oldskb, int hook);
 
+const struct tcphdr *nf_reject_ip_tcphdr_get(struct sk_buff *oldskb,
+					     struct tcphdr *_oth, int hook);
+struct iphdr *nf_reject_iphdr_put(struct sk_buff *nskb,
+				  const struct sk_buff *oldskb,
+				  __be16 protocol, int ttl);
+void nf_reject_ip_tcphdr_put(struct sk_buff *nskb, const struct sk_buff *oldskb,
+			     const struct tcphdr *oth);
+
 #endif /* _IPV4_NF_REJECT_H */
diff --git a/net/ipv4/netfilter/nf_reject_ipv4.c b/net/ipv4/netfilter/nf_reject_ipv4.c
index 92b303d..1baaa83 100644
--- a/net/ipv4/netfilter/nf_reject_ipv4.c
+++ b/net/ipv4/netfilter/nf_reject_ipv4.c
@@ -12,43 +12,39 @@
 #include <net/route.h>
 #include <net/dst.h>
 #include <linux/netfilter_ipv4.h>
+#include <net/netfilter/ipv4/nf_reject.h>
 
-/* Send RST reply */
-void nf_send_reset(struct sk_buff *oldskb, int hook)
+const struct tcphdr *nf_reject_ip_tcphdr_get(struct sk_buff *oldskb,
+					     struct tcphdr *_oth, int hook)
 {
-	struct sk_buff *nskb;
-	const struct iphdr *oiph;
-	struct iphdr *niph;
 	const struct tcphdr *oth;
-	struct tcphdr _otcph, *tcph;
 
 	/* IP header checks: fragment. */
 	if (ip_hdr(oldskb)->frag_off & htons(IP_OFFSET))
-		return;
+		return NULL;
 
 	oth = skb_header_pointer(oldskb, ip_hdrlen(oldskb),
-				 sizeof(_otcph), &_otcph);
+				 sizeof(struct tcphdr), _oth);
 	if (oth == NULL)
-		return;
+		return NULL;
 
 	/* No RST for RST. */
 	if (oth->rst)
-		return;
-
-	if (skb_rtable(oldskb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
-		return;
+		return NULL;
 
 	/* Check checksum */
 	if (nf_ip_checksum(oldskb, hook, ip_hdrlen(oldskb), IPPROTO_TCP))
-		return;
-	oiph = ip_hdr(oldskb);
+		return NULL;
 
-	nskb = alloc_skb(sizeof(struct iphdr) + sizeof(struct tcphdr) +
-			 LL_MAX_HEADER, GFP_ATOMIC);
-	if (!nskb)
-		return;
+	return oth;
+}
+EXPORT_SYMBOL_GPL(nf_reject_ip_tcphdr_get);
 
-	skb_reserve(nskb, LL_MAX_HEADER);
+struct iphdr *nf_reject_iphdr_put(struct sk_buff *nskb,
+				  const struct sk_buff *oldskb,
+				  __be16 protocol, int ttl)
+{
+	struct iphdr *niph, *oiph = ip_hdr(oldskb);
 
 	skb_reset_network_header(nskb);
 	niph = (struct iphdr *)skb_put(nskb, sizeof(struct iphdr));
@@ -57,10 +53,23 @@ void nf_send_reset(struct sk_buff *oldskb, int hook)
 	niph->tos	= 0;
 	niph->id	= 0;
 	niph->frag_off	= htons(IP_DF);
-	niph->protocol	= IPPROTO_TCP;
+	niph->protocol	= protocol;
 	niph->check	= 0;
 	niph->saddr	= oiph->daddr;
 	niph->daddr	= oiph->saddr;
+	niph->ttl	= ttl;
+
+	nskb->protocol = htons(ETH_P_IP);
+
+	return niph;
+}
+EXPORT_SYMBOL_GPL(nf_reject_iphdr_put);
+
+void nf_reject_ip_tcphdr_put(struct sk_buff *nskb, const struct sk_buff *oldskb,
+			  const struct tcphdr *oth)
+{
+	struct iphdr *niph = ip_hdr(nskb);
+	struct tcphdr *tcph;
 
 	skb_reset_transport_header(nskb);
 	tcph = (struct tcphdr *)skb_put(nskb, sizeof(struct tcphdr));
@@ -69,9 +78,9 @@ void nf_send_reset(struct sk_buff *oldskb, int hook)
 	tcph->dest	= oth->source;
 	tcph->doff	= sizeof(struct tcphdr) / 4;
 
-	if (oth->ack)
+	if (oth->ack) {
 		tcph->seq = oth->ack_seq;
-	else {
+	} else {
 		tcph->ack_seq = htonl(ntohl(oth->seq) + oth->syn + oth->fin +
 				      oldskb->len - ip_hdrlen(oldskb) -
 				      (oth->doff << 2));
@@ -84,16 +93,43 @@ void nf_send_reset(struct sk_buff *oldskb, int hook)
 	nskb->ip_summed = CHECKSUM_PARTIAL;
 	nskb->csum_start = (unsigned char *)tcph - nskb->head;
 	nskb->csum_offset = offsetof(struct tcphdr, check);
+}
+EXPORT_SYMBOL_GPL(nf_reject_ip_tcphdr_put);
+
+/* Send RST reply */
+void nf_send_reset(struct sk_buff *oldskb, int hook)
+{
+	struct sk_buff *nskb;
+	const struct iphdr *oiph;
+	struct iphdr *niph;
+	const struct tcphdr *oth;
+	struct tcphdr _oth;
+
+	oth = nf_reject_ip_tcphdr_get(oldskb, &_oth, hook);
+	if (!oth)
+		return;
+
+	if (skb_rtable(oldskb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
+		return;
+
+	oiph = ip_hdr(oldskb);
+
+	nskb = alloc_skb(sizeof(struct iphdr) + sizeof(struct tcphdr) +
+			 LL_MAX_HEADER, GFP_ATOMIC);
+	if (!nskb)
+		return;
 
 	/* ip_route_me_harder expects skb->dst to be set */
 	skb_dst_set_noref(nskb, skb_dst(oldskb));
 
-	nskb->protocol = htons(ETH_P_IP);
+	skb_reserve(nskb, LL_MAX_HEADER);
+	niph = nf_reject_iphdr_put(nskb, oldskb, IPPROTO_TCP,
+				   ip4_dst_hoplimit(skb_dst(nskb)));
+	nf_reject_ip_tcphdr_put(nskb, oldskb, oth);
+
 	if (ip_route_me_harder(nskb, RTN_UNSPEC))
 		goto free_nskb;
 
-	niph->ttl	= ip4_dst_hoplimit(skb_dst(nskb));
-
 	/* "Never happens" */
 	if (nskb->len > dst_mtu(skb_dst(nskb)))
 		goto free_nskb;
-- 
1.7.10.4

^ permalink raw reply related

* [PATCH 5/6] netfilter: nft_reject_bridge: don't use IP stack to reject traffic
From: Pablo Neira Ayuso @ 2014-10-31 12:18 UTC (permalink / raw)
  To: netfilter-devel; +Cc: davem, netdev
In-Reply-To: <1414757912-29150-1-git-send-email-pablo@netfilter.org>

If the packet is received via the bridge stack, this cannot reject
packets from the IP stack.

This adds functions to build the reject packet and send it from the
bridge stack. Comments and assumptions on this patch:

1) Validate the IPv4 and IPv6 headers before further processing,
   given that the packet comes from the bridge stack, we cannot assume
   they are clean. Truncated packets are dropped, we follow similar
   approach in the existing iptables match/target extensions that need
   to inspect layer 4 headers that is not available. This also includes
   packets that are directed to multicast and broadcast ethernet
   addresses.

2) br_deliver() is exported to inject the reject packet via
   bridge localout -> postrouting. So the approach is similar to what
   we already do in the iptables reject target. The reject packet is
   sent to the bridge port from which we have received the original
   packet.

3) The reject packet is forged based on the original packet. The TTL
   is set based on sysctl_ip_default_ttl for IPv4 and per-net
   ipv6.devconf_all hoplimit for IPv6.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/bridge/br_forward.c                  |    1 +
 net/bridge/netfilter/nft_reject_bridge.c |  263 ++++++++++++++++++++++++++++--
 2 files changed, 254 insertions(+), 10 deletions(-)

diff --git a/net/bridge/br_forward.c b/net/bridge/br_forward.c
index 992ec49..44cb786 100644
--- a/net/bridge/br_forward.c
+++ b/net/bridge/br_forward.c
@@ -112,6 +112,7 @@ void br_deliver(const struct net_bridge_port *to, struct sk_buff *skb)
 
 	kfree_skb(skb);
 }
+EXPORT_SYMBOL_GPL(br_deliver);
 
 /* called with rcu_read_lock */
 void br_forward(const struct net_bridge_port *to, struct sk_buff *skb, struct sk_buff *skb0)
diff --git a/net/bridge/netfilter/nft_reject_bridge.c b/net/bridge/netfilter/nft_reject_bridge.c
index a764795..31b27e1 100644
--- a/net/bridge/netfilter/nft_reject_bridge.c
+++ b/net/bridge/netfilter/nft_reject_bridge.c
@@ -16,6 +16,237 @@
 #include <net/netfilter/nft_reject.h>
 #include <net/netfilter/ipv4/nf_reject.h>
 #include <net/netfilter/ipv6/nf_reject.h>
+#include <linux/ip.h>
+#include <net/ip.h>
+#include "../br_private.h"
+
+static void nft_reject_br_push_etherhdr(struct sk_buff *oldskb,
+					struct sk_buff *nskb)
+{
+	struct ethhdr *eth;
+
+	eth = (struct ethhdr *)skb_push(nskb, ETH_HLEN);
+	skb_reset_mac_header(nskb);
+	ether_addr_copy(eth->h_source, eth_hdr(oldskb)->h_dest);
+	ether_addr_copy(eth->h_dest, eth_hdr(oldskb)->h_source);
+	eth->h_proto = eth_hdr(oldskb)->h_proto;
+	skb_pull(nskb, ETH_HLEN);
+}
+
+static int nft_reject_iphdr_validate(struct sk_buff *oldskb)
+{
+	struct iphdr *iph;
+	u32 len;
+
+	if (!pskb_may_pull(oldskb, sizeof(struct iphdr)))
+		return 0;
+
+	iph = ip_hdr(oldskb);
+	if (iph->ihl < 5 || iph->version != 4)
+		return 0;
+
+	len = ntohs(iph->tot_len);
+	if (oldskb->len < len)
+		return 0;
+	else if (len < (iph->ihl*4))
+		return 0;
+
+	if (!pskb_may_pull(oldskb, iph->ihl*4))
+		return 0;
+
+	return 1;
+}
+
+static void nft_reject_br_send_v4_tcp_reset(struct sk_buff *oldskb, int hook)
+{
+	struct sk_buff *nskb;
+	struct iphdr *niph;
+	const struct tcphdr *oth;
+	struct tcphdr _oth;
+
+	if (!nft_reject_iphdr_validate(oldskb))
+		return;
+
+	oth = nf_reject_ip_tcphdr_get(oldskb, &_oth, hook);
+	if (!oth)
+		return;
+
+	nskb = alloc_skb(sizeof(struct iphdr) + sizeof(struct tcphdr) +
+			 LL_MAX_HEADER, GFP_ATOMIC);
+	if (!nskb)
+		return;
+
+	skb_reserve(nskb, LL_MAX_HEADER);
+	niph = nf_reject_iphdr_put(nskb, oldskb, IPPROTO_TCP,
+				   sysctl_ip_default_ttl);
+	nf_reject_ip_tcphdr_put(nskb, oldskb, oth);
+	niph->ttl	= sysctl_ip_default_ttl;
+	niph->tot_len	= htons(nskb->len);
+	ip_send_check(niph);
+
+	nft_reject_br_push_etherhdr(oldskb, nskb);
+
+	br_deliver(br_port_get_rcu(oldskb->dev), nskb);
+}
+
+static void nft_reject_br_send_v4_unreach(struct sk_buff *oldskb, int hook,
+					  u8 code)
+{
+	struct sk_buff *nskb;
+	struct iphdr *niph;
+	struct icmphdr *icmph;
+	unsigned int len;
+	void *payload;
+	__wsum csum;
+
+	if (!nft_reject_iphdr_validate(oldskb))
+		return;
+
+	/* IP header checks: fragment. */
+	if (ip_hdr(oldskb)->frag_off & htons(IP_OFFSET))
+		return;
+
+	/* RFC says return as much as we can without exceeding 576 bytes. */
+	len = min_t(unsigned int, 536, oldskb->len);
+
+	if (!pskb_may_pull(oldskb, len))
+		return;
+
+	if (nf_ip_checksum(oldskb, hook, ip_hdrlen(oldskb), 0))
+		return;
+
+	nskb = alloc_skb(sizeof(struct iphdr) + sizeof(struct icmphdr) +
+			 LL_MAX_HEADER + len, GFP_ATOMIC);
+	if (!nskb)
+		return;
+
+	skb_reserve(nskb, LL_MAX_HEADER);
+	niph = nf_reject_iphdr_put(nskb, oldskb, IPPROTO_ICMP,
+				   sysctl_ip_default_ttl);
+
+	skb_reset_transport_header(nskb);
+	icmph = (struct icmphdr *)skb_put(nskb, sizeof(struct icmphdr));
+	memset(icmph, 0, sizeof(*icmph));
+	icmph->type     = ICMP_DEST_UNREACH;
+	icmph->code	= code;
+
+	payload = skb_put(nskb, len);
+	memcpy(payload, skb_network_header(oldskb), len);
+
+	csum = csum_partial((void *)icmph, len + sizeof(struct icmphdr), 0);
+	icmph->checksum = csum_fold(csum);
+
+	niph->tot_len	= htons(nskb->len);
+	ip_send_check(niph);
+
+	nft_reject_br_push_etherhdr(oldskb, nskb);
+
+	br_deliver(br_port_get_rcu(oldskb->dev), nskb);
+}
+
+static int nft_reject_ip6hdr_validate(struct sk_buff *oldskb)
+{
+	struct ipv6hdr *hdr;
+	u32 pkt_len;
+
+	if (!pskb_may_pull(oldskb, sizeof(struct ipv6hdr)))
+		return 0;
+
+	hdr = ipv6_hdr(oldskb);
+	if (hdr->version != 6)
+		return 0;
+
+	pkt_len = ntohs(hdr->payload_len);
+	if (pkt_len + sizeof(struct ipv6hdr) > oldskb->len)
+		return 0;
+
+	return 1;
+}
+
+static void nft_reject_br_send_v6_tcp_reset(struct net *net,
+					    struct sk_buff *oldskb, int hook)
+{
+	struct sk_buff *nskb;
+	const struct tcphdr *oth;
+	struct tcphdr _oth;
+	unsigned int otcplen;
+	struct ipv6hdr *nip6h;
+
+	if (!nft_reject_ip6hdr_validate(oldskb))
+		return;
+
+	oth = nf_reject_ip6_tcphdr_get(oldskb, &_oth, &otcplen, hook);
+	if (!oth)
+		return;
+
+	nskb = alloc_skb(sizeof(struct ipv6hdr) + sizeof(struct tcphdr) +
+			 LL_MAX_HEADER, GFP_ATOMIC);
+	if (!nskb)
+		return;
+
+	skb_reserve(nskb, LL_MAX_HEADER);
+	nip6h = nf_reject_ip6hdr_put(nskb, oldskb, IPPROTO_TCP,
+				     net->ipv6.devconf_all->hop_limit);
+	nf_reject_ip6_tcphdr_put(nskb, oldskb, oth, otcplen);
+	nip6h->payload_len = htons(nskb->len - sizeof(struct ipv6hdr));
+
+	nft_reject_br_push_etherhdr(oldskb, nskb);
+
+	br_deliver(br_port_get_rcu(oldskb->dev), nskb);
+}
+
+static void nft_reject_br_send_v6_unreach(struct net *net,
+					  struct sk_buff *oldskb, int hook,
+					  u8 code)
+{
+	struct sk_buff *nskb;
+	struct ipv6hdr *nip6h;
+	struct icmp6hdr *icmp6h;
+	unsigned int len;
+	void *payload;
+
+	if (!nft_reject_ip6hdr_validate(oldskb))
+		return;
+
+	/* Include "As much of invoking packet as possible without the ICMPv6
+	 * packet exceeding the minimum IPv6 MTU" in the ICMP payload.
+	 */
+	len = min_t(unsigned int, 1220, oldskb->len);
+
+	if (!pskb_may_pull(oldskb, len))
+		return;
+
+	nskb = alloc_skb(sizeof(struct iphdr) + sizeof(struct icmp6hdr) +
+			 LL_MAX_HEADER + len, GFP_ATOMIC);
+	if (!nskb)
+		return;
+
+	skb_reserve(nskb, LL_MAX_HEADER);
+	nip6h = nf_reject_ip6hdr_put(nskb, oldskb, IPPROTO_ICMPV6,
+				     net->ipv6.devconf_all->hop_limit);
+
+	skb_reset_transport_header(nskb);
+	icmp6h = (struct icmp6hdr *)skb_put(nskb, sizeof(struct icmp6hdr));
+	memset(icmp6h, 0, sizeof(*icmp6h));
+	icmp6h->icmp6_type = ICMPV6_DEST_UNREACH;
+	icmp6h->icmp6_code = code;
+
+	payload = skb_put(nskb, len);
+	memcpy(payload, skb_network_header(oldskb), len);
+	nip6h->payload_len = htons(nskb->len - sizeof(struct ipv6hdr));
+
+	icmp6h->icmp6_cksum =
+		csum_ipv6_magic(&nip6h->saddr, &nip6h->daddr,
+				nskb->len - sizeof(struct ipv6hdr),
+				IPPROTO_ICMPV6,
+				csum_partial(icmp6h,
+					     nskb->len - sizeof(struct ipv6hdr),
+					     0));
+
+	nft_reject_br_push_etherhdr(oldskb, nskb);
+
+	br_deliver(br_port_get_rcu(oldskb->dev), nskb);
+}
 
 static void nft_reject_bridge_eval(const struct nft_expr *expr,
 				 struct nft_data data[NFT_REG_MAX + 1],
@@ -23,35 +254,46 @@ static void nft_reject_bridge_eval(const struct nft_expr *expr,
 {
 	struct nft_reject *priv = nft_expr_priv(expr);
 	struct net *net = dev_net((pkt->in != NULL) ? pkt->in : pkt->out);
+	const unsigned char *dest = eth_hdr(pkt->skb)->h_dest;
+
+	if (is_broadcast_ether_addr(dest) ||
+	    is_multicast_ether_addr(dest))
+		goto out;
 
 	switch (eth_hdr(pkt->skb)->h_proto) {
 	case htons(ETH_P_IP):
 		switch (priv->type) {
 		case NFT_REJECT_ICMP_UNREACH:
-			nf_send_unreach(pkt->skb, priv->icmp_code);
+			nft_reject_br_send_v4_unreach(pkt->skb,
+						      pkt->ops->hooknum,
+						      priv->icmp_code);
 			break;
 		case NFT_REJECT_TCP_RST:
-			nf_send_reset(pkt->skb, pkt->ops->hooknum);
+			nft_reject_br_send_v4_tcp_reset(pkt->skb,
+							pkt->ops->hooknum);
 			break;
 		case NFT_REJECT_ICMPX_UNREACH:
-			nf_send_unreach(pkt->skb,
-					nft_reject_icmp_code(priv->icmp_code));
+			nft_reject_br_send_v4_unreach(pkt->skb,
+						      pkt->ops->hooknum,
+						      nft_reject_icmp_code(priv->icmp_code));
 			break;
 		}
 		break;
 	case htons(ETH_P_IPV6):
 		switch (priv->type) {
 		case NFT_REJECT_ICMP_UNREACH:
-			nf_send_unreach6(net, pkt->skb, priv->icmp_code,
-					 pkt->ops->hooknum);
+			nft_reject_br_send_v6_unreach(net, pkt->skb,
+						      pkt->ops->hooknum,
+						      priv->icmp_code);
 			break;
 		case NFT_REJECT_TCP_RST:
-			nf_send_reset6(net, pkt->skb, pkt->ops->hooknum);
+			nft_reject_br_send_v6_tcp_reset(net, pkt->skb,
+							pkt->ops->hooknum);
 			break;
 		case NFT_REJECT_ICMPX_UNREACH:
-			nf_send_unreach6(net, pkt->skb,
-					 nft_reject_icmpv6_code(priv->icmp_code),
-					 pkt->ops->hooknum);
+			nft_reject_br_send_v6_unreach(net, pkt->skb,
+						      pkt->ops->hooknum,
+						      nft_reject_icmpv6_code(priv->icmp_code));
 			break;
 		}
 		break;
@@ -59,6 +301,7 @@ static void nft_reject_bridge_eval(const struct nft_expr *expr,
 		/* No explicit way to reject this protocol, drop it. */
 		break;
 	}
+out:
 	data[NFT_REG_VERDICT].verdict = NF_DROP;
 }
 
-- 
1.7.10.4

^ permalink raw reply related

* [PATCH 4/6] netfilter: nf_reject_ipv6: split nf_send_reset6() in smaller functions
From: Pablo Neira Ayuso @ 2014-10-31 12:18 UTC (permalink / raw)
  To: netfilter-devel; +Cc: davem, netdev
In-Reply-To: <1414757912-29150-1-git-send-email-pablo@netfilter.org>

That can be reused by the reject bridge expression to build the reject
packet. The new functions are:

* nf_reject_ip6_tcphdr_get(): to sanitize and to obtain the TCP header.
* nf_reject_ip6hdr_put(): to build the IPv6 header.
* nf_reject_ip6_tcphdr_put(): to build the TCP header.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/ipv6/nf_reject.h |   10 ++
 net/ipv6/netfilter/nf_reject_ipv6.c    |  175 ++++++++++++++++++++------------
 2 files changed, 119 insertions(+), 66 deletions(-)

diff --git a/include/net/netfilter/ipv6/nf_reject.h b/include/net/netfilter/ipv6/nf_reject.h
index 48e1881..23216d4 100644
--- a/include/net/netfilter/ipv6/nf_reject.h
+++ b/include/net/netfilter/ipv6/nf_reject.h
@@ -15,4 +15,14 @@ nf_send_unreach6(struct net *net, struct sk_buff *skb_in, unsigned char code,
 
 void nf_send_reset6(struct net *net, struct sk_buff *oldskb, int hook);
 
+const struct tcphdr *nf_reject_ip6_tcphdr_get(struct sk_buff *oldskb,
+					      struct tcphdr *otcph,
+					      unsigned int *otcplen, int hook);
+struct ipv6hdr *nf_reject_ip6hdr_put(struct sk_buff *nskb,
+				     const struct sk_buff *oldskb,
+				     __be16 protocol, int hoplimit);
+void nf_reject_ip6_tcphdr_put(struct sk_buff *nskb,
+			      const struct sk_buff *oldskb,
+			      const struct tcphdr *oth, unsigned int otcplen);
+
 #endif /* _IPV6_NF_REJECT_H */
diff --git a/net/ipv6/netfilter/nf_reject_ipv6.c b/net/ipv6/netfilter/nf_reject_ipv6.c
index 20d9def..015eb8a 100644
--- a/net/ipv6/netfilter/nf_reject_ipv6.c
+++ b/net/ipv6/netfilter/nf_reject_ipv6.c
@@ -12,116 +12,102 @@
 #include <net/ip6_fib.h>
 #include <net/ip6_checksum.h>
 #include <linux/netfilter_ipv6.h>
+#include <net/netfilter/ipv6/nf_reject.h>
 
-void nf_send_reset6(struct net *net, struct sk_buff *oldskb, int hook)
+const struct tcphdr *nf_reject_ip6_tcphdr_get(struct sk_buff *oldskb,
+					      struct tcphdr *otcph,
+					      unsigned int *otcplen, int hook)
 {
-	struct sk_buff *nskb;
-	struct tcphdr otcph, *tcph;
-	unsigned int otcplen, hh_len;
-	int tcphoff, needs_ack;
 	const struct ipv6hdr *oip6h = ipv6_hdr(oldskb);
-	struct ipv6hdr *ip6h;
-#define DEFAULT_TOS_VALUE	0x0U
-	const __u8 tclass = DEFAULT_TOS_VALUE;
-	struct dst_entry *dst = NULL;
 	u8 proto;
 	__be16 frag_off;
-	struct flowi6 fl6;
-
-	if ((!(ipv6_addr_type(&oip6h->saddr) & IPV6_ADDR_UNICAST)) ||
-	    (!(ipv6_addr_type(&oip6h->daddr) & IPV6_ADDR_UNICAST))) {
-		pr_debug("addr is not unicast.\n");
-		return;
-	}
+	int tcphoff;
 
 	proto = oip6h->nexthdr;
-	tcphoff = ipv6_skip_exthdr(oldskb, ((u8*)(oip6h+1) - oldskb->data), &proto, &frag_off);
+	tcphoff = ipv6_skip_exthdr(oldskb, ((u8*)(oip6h+1) - oldskb->data),
+				   &proto, &frag_off);
 
 	if ((tcphoff < 0) || (tcphoff > oldskb->len)) {
 		pr_debug("Cannot get TCP header.\n");
-		return;
+		return NULL;
 	}
 
-	otcplen = oldskb->len - tcphoff;
+	*otcplen = oldskb->len - tcphoff;
 
 	/* IP header checks: fragment, too short. */
-	if (proto != IPPROTO_TCP || otcplen < sizeof(struct tcphdr)) {
-		pr_debug("proto(%d) != IPPROTO_TCP, "
-			 "or too short. otcplen = %d\n",
-			 proto, otcplen);
-		return;
+	if (proto != IPPROTO_TCP || *otcplen < sizeof(struct tcphdr)) {
+		pr_debug("proto(%d) != IPPROTO_TCP or too short (len = %d)\n",
+			 proto, *otcplen);
+		return NULL;
 	}
 
-	if (skb_copy_bits(oldskb, tcphoff, &otcph, sizeof(struct tcphdr)))
-		BUG();
+	otcph = skb_header_pointer(oldskb, tcphoff, sizeof(struct tcphdr),
+				   otcph);
+	if (otcph == NULL)
+		return NULL;
 
 	/* No RST for RST. */
-	if (otcph.rst) {
+	if (otcph->rst) {
 		pr_debug("RST is set\n");
-		return;
+		return NULL;
 	}
 
 	/* Check checksum. */
 	if (nf_ip6_checksum(oldskb, hook, tcphoff, IPPROTO_TCP)) {
 		pr_debug("TCP checksum is invalid\n");
-		return;
-	}
-
-	memset(&fl6, 0, sizeof(fl6));
-	fl6.flowi6_proto = IPPROTO_TCP;
-	fl6.saddr = oip6h->daddr;
-	fl6.daddr = oip6h->saddr;
-	fl6.fl6_sport = otcph.dest;
-	fl6.fl6_dport = otcph.source;
-	security_skb_classify_flow(oldskb, flowi6_to_flowi(&fl6));
-	dst = ip6_route_output(net, NULL, &fl6);
-	if (dst == NULL || dst->error) {
-		dst_release(dst);
-		return;
-	}
-	dst = xfrm_lookup(net, dst, flowi6_to_flowi(&fl6), NULL, 0);
-	if (IS_ERR(dst))
-		return;
-
-	hh_len = (dst->dev->hard_header_len + 15)&~15;
-	nskb = alloc_skb(hh_len + 15 + dst->header_len + sizeof(struct ipv6hdr)
-			 + sizeof(struct tcphdr) + dst->trailer_len,
-			 GFP_ATOMIC);
-
-	if (!nskb) {
-		net_dbg_ratelimited("cannot alloc skb\n");
-		dst_release(dst);
-		return;
+		return NULL;
 	}
 
-	skb_dst_set(nskb, dst);
+	return otcph;
+}
+EXPORT_SYMBOL_GPL(nf_reject_ip6_tcphdr_get);
 
-	skb_reserve(nskb, hh_len + dst->header_len);
+struct ipv6hdr *nf_reject_ip6hdr_put(struct sk_buff *nskb,
+				     const struct sk_buff *oldskb,
+				     __be16 protocol, int hoplimit)
+{
+	struct ipv6hdr *ip6h;
+	const struct ipv6hdr *oip6h = ipv6_hdr(oldskb);
+#define DEFAULT_TOS_VALUE	0x0U
+	const __u8 tclass = DEFAULT_TOS_VALUE;
 
 	skb_put(nskb, sizeof(struct ipv6hdr));
 	skb_reset_network_header(nskb);
 	ip6h = ipv6_hdr(nskb);
 	ip6_flow_hdr(ip6h, tclass, 0);
-	ip6h->hop_limit = ip6_dst_hoplimit(dst);
-	ip6h->nexthdr = IPPROTO_TCP;
+	ip6h->hop_limit = hoplimit;
+	ip6h->nexthdr = protocol;
 	ip6h->saddr = oip6h->daddr;
 	ip6h->daddr = oip6h->saddr;
 
+	nskb->protocol = htons(ETH_P_IPV6);
+
+	return ip6h;
+}
+EXPORT_SYMBOL_GPL(nf_reject_ip6hdr_put);
+
+void nf_reject_ip6_tcphdr_put(struct sk_buff *nskb,
+			      const struct sk_buff *oldskb,
+			      const struct tcphdr *oth, unsigned int otcplen)
+{
+	struct tcphdr *tcph;
+	int needs_ack;
+
 	skb_reset_transport_header(nskb);
 	tcph = (struct tcphdr *)skb_put(nskb, sizeof(struct tcphdr));
 	/* Truncate to length (no data) */
 	tcph->doff = sizeof(struct tcphdr)/4;
-	tcph->source = otcph.dest;
-	tcph->dest = otcph.source;
+	tcph->source = oth->dest;
+	tcph->dest = oth->source;
 
-	if (otcph.ack) {
+	if (oth->ack) {
 		needs_ack = 0;
-		tcph->seq = otcph.ack_seq;
+		tcph->seq = oth->ack_seq;
 		tcph->ack_seq = 0;
 	} else {
 		needs_ack = 1;
-		tcph->ack_seq = htonl(ntohl(otcph.seq) + otcph.syn + otcph.fin
-				      + otcplen - (otcph.doff<<2));
+		tcph->ack_seq = htonl(ntohl(oth->seq) + oth->syn + oth->fin +
+				      otcplen - (oth->doff<<2));
 		tcph->seq = 0;
 	}
 
@@ -139,6 +125,63 @@ void nf_send_reset6(struct net *net, struct sk_buff *oldskb, int hook)
 				      sizeof(struct tcphdr), IPPROTO_TCP,
 				      csum_partial(tcph,
 						   sizeof(struct tcphdr), 0));
+}
+EXPORT_SYMBOL_GPL(nf_reject_ip6_tcphdr_put);
+
+void nf_send_reset6(struct net *net, struct sk_buff *oldskb, int hook)
+{
+	struct sk_buff *nskb;
+	struct tcphdr _otcph;
+	const struct tcphdr *otcph;
+	unsigned int otcplen, hh_len;
+	const struct ipv6hdr *oip6h = ipv6_hdr(oldskb);
+	struct ipv6hdr *ip6h;
+	struct dst_entry *dst = NULL;
+	struct flowi6 fl6;
+
+	if ((!(ipv6_addr_type(&oip6h->saddr) & IPV6_ADDR_UNICAST)) ||
+	    (!(ipv6_addr_type(&oip6h->daddr) & IPV6_ADDR_UNICAST))) {
+		pr_debug("addr is not unicast.\n");
+		return;
+	}
+
+	otcph = nf_reject_ip6_tcphdr_get(oldskb, &_otcph, &otcplen, hook);
+	if (!otcph)
+		return;
+
+	memset(&fl6, 0, sizeof(fl6));
+	fl6.flowi6_proto = IPPROTO_TCP;
+	fl6.saddr = oip6h->daddr;
+	fl6.daddr = oip6h->saddr;
+	fl6.fl6_sport = otcph->dest;
+	fl6.fl6_dport = otcph->source;
+	security_skb_classify_flow(oldskb, flowi6_to_flowi(&fl6));
+	dst = ip6_route_output(net, NULL, &fl6);
+	if (dst == NULL || dst->error) {
+		dst_release(dst);
+		return;
+	}
+	dst = xfrm_lookup(net, dst, flowi6_to_flowi(&fl6), NULL, 0);
+	if (IS_ERR(dst))
+		return;
+
+	hh_len = (dst->dev->hard_header_len + 15)&~15;
+	nskb = alloc_skb(hh_len + 15 + dst->header_len + sizeof(struct ipv6hdr)
+			 + sizeof(struct tcphdr) + dst->trailer_len,
+			 GFP_ATOMIC);
+
+	if (!nskb) {
+		net_dbg_ratelimited("cannot alloc skb\n");
+		dst_release(dst);
+		return;
+	}
+
+	skb_dst_set(nskb, dst);
+
+	skb_reserve(nskb, hh_len + dst->header_len);
+	ip6h = nf_reject_ip6hdr_put(nskb, oldskb, IPPROTO_TCP,
+				    ip6_dst_hoplimit(dst));
+	nf_reject_ip6_tcphdr_put(nskb, oldskb, otcph, otcplen);
 
 	nf_ct_attach(nskb, oldskb);
 
-- 
1.7.10.4

^ permalink raw reply related

* [PATCH 2/6] netfilter: nf_tables_bridge: update hook_mask to allow {pre,post}routing
From: Pablo Neira Ayuso @ 2014-10-31 12:18 UTC (permalink / raw)
  To: netfilter-devel; +Cc: davem, netdev
In-Reply-To: <1414757912-29150-1-git-send-email-pablo@netfilter.org>

Fixes: 36d2af5 ("netfilter: nf_tables: allow to filter from prerouting and postrouting")
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/bridge/netfilter/nf_tables_bridge.c |    6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/net/bridge/netfilter/nf_tables_bridge.c b/net/bridge/netfilter/nf_tables_bridge.c
index da17a5e..074c557 100644
--- a/net/bridge/netfilter/nf_tables_bridge.c
+++ b/net/bridge/netfilter/nf_tables_bridge.c
@@ -75,9 +75,11 @@ static const struct nf_chain_type filter_bridge = {
 	.type		= NFT_CHAIN_T_DEFAULT,
 	.family		= NFPROTO_BRIDGE,
 	.owner		= THIS_MODULE,
-	.hook_mask	= (1 << NF_BR_LOCAL_IN) |
+	.hook_mask	= (1 << NF_BR_PRE_ROUTING) |
+			  (1 << NF_BR_LOCAL_IN) |
 			  (1 << NF_BR_FORWARD) |
-			  (1 << NF_BR_LOCAL_OUT),
+			  (1 << NF_BR_LOCAL_OUT) |
+			  (1 << NF_BR_POST_ROUTING),
 };
 
 static int __init nf_tables_bridge_init(void)
-- 
1.7.10.4

^ permalink raw reply related

* [PATCH 0/6] netfilter/ipvs fixes for net
From: Pablo Neira Ayuso @ 2014-10-31 12:18 UTC (permalink / raw)
  To: netfilter-devel; +Cc: davem, netdev

Hi David,

The following patchset contains fixes for netfilter/ipvs. This round of
fixes is larger than usual at this stage, specifically because of the
nf_tables bridge reject fixes that I would like to see in 3.18. The
patches are:

1) Fix a null-pointer dereference that may occur when logging
   errors. This problem was introduced by 4a4739d56b0 ("ipvs: Pull
   out crosses_local_route_boundary logic") in v3.17-rc5.

2) Update hook mask in nft_reject_bridge so we can also filter out
   packets from there. This fixes 36d2af5 ("netfilter: nf_tables: allow
   to filter from prerouting and postrouting"), which needs this chunk
   to work.

3) Two patches to refactor common code to forge the IPv4 and IPv6
   reject packets from the bridge. These are required by the nf_tables
   reject bridge fix.

4) Fix nft_reject_bridge by avoiding the use of the IP stack to reject
   packets from the bridge. The idea is to forge the reject packets and
   inject them to the original port via br_deliver() which is now
   exported for that purpose.

5) Restrict nft_reject_bridge to bridge prerouting and input hooks.
   the original skbuff may cloned after prerouting when the bridge stack
   needs to flood it to several bridge ports, it is too late to reject
   the traffic.

You can pull these changes from:

  git://git.kernel.org/pub/scm/linux/kernel/git/pablo/nf.git

Thanks!

----------------------------------------------------------------

The following changes since commit 7965ee93719921ea5978f331da653dfa2d7b99f5:

  netfilter: nft_compat: fix wrong target lookup in nft_target_select_ops() (2014-10-27 22:17:46 +0100)

are available in the git repository at:

  git://git.kernel.org/pub/scm/linux/kernel/git/pablo/nf.git master

for you to fetch changes up to 127917c29a432c3b798e014a1714e9c1af0f87fe:

  netfilter: nft_reject_bridge: restrict reject to prerouting and input (2014-10-31 12:50:09 +0100)

----------------------------------------------------------------
Alex Gartrell (1):
      ipvs: Avoid null-pointer deref in debug code

Pablo Neira Ayuso (5):
      netfilter: nf_tables_bridge: update hook_mask to allow {pre,post}routing
      netfilter: nf_reject_ipv4: split nf_send_reset() in smaller functions
      netfilter: nf_reject_ipv6: split nf_send_reset6() in smaller functions
      netfilter: nft_reject_bridge: don't use IP stack to reject traffic
      netfilter: nft_reject_bridge: restrict reject to prerouting and input

 include/net/netfilter/ipv4/nf_reject.h   |   10 +
 include/net/netfilter/ipv6/nf_reject.h   |   10 +
 net/bridge/br_forward.c		  |    1 +
 net/bridge/netfilter/nf_tables_bridge.c  |    6 +-
 net/bridge/netfilter/nft_reject_bridge.c |  296 ++++++++++++++++++++++++++++--
 net/ipv4/netfilter/nf_reject_ipv4.c	  |   88 ++++++---
 net/ipv6/netfilter/nf_reject_ipv6.c	  |  175 +++++++++++-------
 net/netfilter/ipvs/ip_vs_xmit.c	  |    4 +-
 8 files changed, 483 insertions(+), 107 deletions(-)

^ permalink raw reply

* [PATCH 1/6] ipvs: Avoid null-pointer deref in debug code
From: Pablo Neira Ayuso @ 2014-10-31 12:18 UTC (permalink / raw)
  To: netfilter-devel; +Cc: davem, netdev
In-Reply-To: <1414757912-29150-1-git-send-email-pablo@netfilter.org>

From: Alex Gartrell <agartrell@fb.com>

Use daddr instead of reaching into dest.

Reported-by: Dan Carpenter <dan.carpenter@oracle.com>
Signed-off-by: Alex Gartrell <agartrell@fb.com>
Signed-off-by: Simon Horman <horms@verge.net.au>
---
 net/netfilter/ipvs/ip_vs_xmit.c |    4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c
index 91f17c1..437a366 100644
--- a/net/netfilter/ipvs/ip_vs_xmit.c
+++ b/net/netfilter/ipvs/ip_vs_xmit.c
@@ -316,7 +316,7 @@ __ip_vs_get_out_rt(int skb_af, struct sk_buff *skb, struct ip_vs_dest *dest,
 	if (unlikely(crosses_local_route_boundary(skb_af, skb, rt_mode,
 						  local))) {
 		IP_VS_DBG_RL("We are crossing local and non-local addresses"
-			     " daddr=%pI4\n", &dest->addr.ip);
+			     " daddr=%pI4\n", &daddr);
 		goto err_put;
 	}
 
@@ -458,7 +458,7 @@ __ip_vs_get_out_rt_v6(int skb_af, struct sk_buff *skb, struct ip_vs_dest *dest,
 	if (unlikely(crosses_local_route_boundary(skb_af, skb, rt_mode,
 						  local))) {
 		IP_VS_DBG_RL("We are crossing local and non-local addresses"
-			     " daddr=%pI6\n", &dest->addr.in6);
+			     " daddr=%pI6\n", daddr);
 		goto err_put;
 	}
 
-- 
1.7.10.4


^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox