Netdev List
 help / color / mirror / Atom feed
* [PATCH net-next] bpf: Preserve const register type on const OR alu ops
From: Alexei Starovoitov @ 2016-12-03 20:31 UTC (permalink / raw)
  To: David S . Miller; +Cc: Daniel Borkmann, Gianluca Borello, netdev

From: Gianluca Borello <g.borello@gmail.com>

Occasionally, clang (e.g. version 3.8.1) translates a sum between two
constant operands using a BPF_OR instead of a BPF_ADD. The verifier is
currently not handling this scenario, and the destination register type
becomes UNKNOWN_VALUE even if it's still storing a constant. As a result,
the destination register cannot be used as argument to a helper function
expecting a ARG_CONST_STACK_*, limiting some use cases.

Modify the verifier to handle this case, and add a few tests to make sure
all combinations are supported, and stack boundaries are still verified
even with BPF_OR.

Signed-off-by: Gianluca Borello <g.borello@gmail.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/verifier.c                       |  9 ++++-
 tools/testing/selftests/bpf/.gitignore      |  1 +
 tools/testing/selftests/bpf/test_verifier.c | 60 +++++++++++++++++++++++++++++
 3 files changed, 68 insertions(+), 2 deletions(-)

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 0e742210750e..38d05da84a49 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -1481,14 +1481,19 @@ static int evaluate_reg_imm_alu(struct bpf_verifier_env *env,
 	struct bpf_reg_state *src_reg = &regs[insn->src_reg];
 	u8 opcode = BPF_OP(insn->code);
 
-	/* dst_reg->type == CONST_IMM here, simulate execution of 'add' insn.
-	 * Don't care about overflow or negative values, just add them
+	/* dst_reg->type == CONST_IMM here, simulate execution of 'add'/'or'
+	 * insn. Don't care about overflow or negative values, just add them
 	 */
 	if (opcode == BPF_ADD && BPF_SRC(insn->code) == BPF_K)
 		dst_reg->imm += insn->imm;
 	else if (opcode == BPF_ADD && BPF_SRC(insn->code) == BPF_X &&
 		 src_reg->type == CONST_IMM)
 		dst_reg->imm += src_reg->imm;
+	else if (opcode == BPF_OR && BPF_SRC(insn->code) == BPF_K)
+		dst_reg->imm |= insn->imm;
+	else if (opcode == BPF_OR && BPF_SRC(insn->code) == BPF_X &&
+		 src_reg->type == CONST_IMM)
+		dst_reg->imm |= src_reg->imm;
 	else
 		mark_reg_unknown_value(regs, insn->dst_reg);
 	return 0;
diff --git a/tools/testing/selftests/bpf/.gitignore b/tools/testing/selftests/bpf/.gitignore
index 3c59f96e3ed8..071431bedde8 100644
--- a/tools/testing/selftests/bpf/.gitignore
+++ b/tools/testing/selftests/bpf/.gitignore
@@ -1,2 +1,3 @@
 test_verifier
 test_maps
+test_lru_map
diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c
index 5da2e9d7689c..8d71e44b319d 100644
--- a/tools/testing/selftests/bpf/test_verifier.c
+++ b/tools/testing/selftests/bpf/test_verifier.c
@@ -2683,6 +2683,66 @@ static struct bpf_test tests[] = {
 		.errstr_unpriv = "R0 pointer arithmetic prohibited",
 		.result_unpriv = REJECT,
 	},
+	{
+		"constant register |= constant should keep constant type",
+		.insns = {
+			BPF_MOV64_REG(BPF_REG_1, BPF_REG_10),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -48),
+			BPF_MOV64_IMM(BPF_REG_2, 34),
+			BPF_ALU64_IMM(BPF_OR, BPF_REG_2, 13),
+			BPF_MOV64_IMM(BPF_REG_3, 0),
+			BPF_EMIT_CALL(BPF_FUNC_probe_read),
+			BPF_EXIT_INSN(),
+		},
+		.result = ACCEPT,
+		.prog_type = BPF_PROG_TYPE_TRACEPOINT,
+	},
+	{
+		"constant register |= constant should not bypass stack boundary checks",
+		.insns = {
+			BPF_MOV64_REG(BPF_REG_1, BPF_REG_10),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -48),
+			BPF_MOV64_IMM(BPF_REG_2, 34),
+			BPF_ALU64_IMM(BPF_OR, BPF_REG_2, 24),
+			BPF_MOV64_IMM(BPF_REG_3, 0),
+			BPF_EMIT_CALL(BPF_FUNC_probe_read),
+			BPF_EXIT_INSN(),
+		},
+		.errstr = "invalid stack type R1 off=-48 access_size=58",
+		.result = REJECT,
+		.prog_type = BPF_PROG_TYPE_TRACEPOINT,
+	},
+	{
+		"constant register |= constant register should keep constant type",
+		.insns = {
+			BPF_MOV64_REG(BPF_REG_1, BPF_REG_10),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -48),
+			BPF_MOV64_IMM(BPF_REG_2, 34),
+			BPF_MOV64_IMM(BPF_REG_4, 13),
+			BPF_ALU64_REG(BPF_OR, BPF_REG_2, BPF_REG_4),
+			BPF_MOV64_IMM(BPF_REG_3, 0),
+			BPF_EMIT_CALL(BPF_FUNC_probe_read),
+			BPF_EXIT_INSN(),
+		},
+		.result = ACCEPT,
+		.prog_type = BPF_PROG_TYPE_TRACEPOINT,
+	},
+	{
+		"constant register |= constant register should not bypass stack boundary checks",
+		.insns = {
+			BPF_MOV64_REG(BPF_REG_1, BPF_REG_10),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -48),
+			BPF_MOV64_IMM(BPF_REG_2, 34),
+			BPF_MOV64_IMM(BPF_REG_4, 24),
+			BPF_ALU64_REG(BPF_OR, BPF_REG_2, BPF_REG_4),
+			BPF_MOV64_IMM(BPF_REG_3, 0),
+			BPF_EMIT_CALL(BPF_FUNC_probe_read),
+			BPF_EXIT_INSN(),
+		},
+		.errstr = "invalid stack type R1 off=-48 access_size=58",
+		.result = REJECT,
+		.prog_type = BPF_PROG_TYPE_TRACEPOINT,
+	},
 };
 
 static int probe_filter_length(const struct bpf_insn *fp)
-- 
2.8.0

^ permalink raw reply related

* Re: [PATCH -next] net: ethernet: ti: davinci_cpdma: add missing EXPORTs
From: David Miller @ 2016-12-03 20:27 UTC (permalink / raw)
  To: paul.gortmaker
  Cc: ivan.khoronzhuk, mugunthanvnm, grygorii.strashko, linux-omap,
	netdev
In-Reply-To: <20161201202528.12531-1-paul.gortmaker@windriver.com>

From: Paul Gortmaker <paul.gortmaker@windriver.com>
Date: Thu, 1 Dec 2016 15:25:28 -0500

> As of commit 8f32b90981dcdb355516fb95953133f8d4e6b11d
> ("net: ethernet: ti: davinci_cpdma: add set rate for a channel") the
> ARM allmodconfig builds would fail modpost with:
> 
> ERROR: "cpdma_chan_set_weight" [drivers/net/ethernet/ti/ti_cpsw.ko] undefined!
> ERROR: "cpdma_chan_get_rate" [drivers/net/ethernet/ti/ti_cpsw.ko] undefined!
> ERROR: "cpdma_chan_get_min_rate" [drivers/net/ethernet/ti/ti_cpsw.ko] undefined!
> ERROR: "cpdma_chan_set_rate" [drivers/net/ethernet/ti/ti_cpsw.ko] undefined!
> 
> Since these weren't declared as static, it is assumed they were
> meant to be shared outside the file, and that modular build testing
> was simply overlooked.
> 
> Fixes: 8f32b90981dc ("net: ethernet: ti: davinci_cpdma: add set rate for a channel")
> Cc: Ivan Khoronzhuk <ivan.khoronzhuk@linaro.org>
> Cc: Mugunthan V N <mugunthanvnm@ti.com>
> Cc: Grygorii Strashko <grygorii.strashko@ti.com>
> Cc: linux-omap@vger.kernel.org
> Cc: netdev@vger.kernel.org
> Signed-off-by: Paul Gortmaker <paul.gortmaker@windriver.com>

Applied.

^ permalink raw reply

* Re: pull-request: can-next 2016-12-01,pull-request: can-next 2016-12-01
From: David Miller @ 2016-12-03 20:27 UTC (permalink / raw)
  To: mkl; +Cc: netdev, kernel, linux-can
In-Reply-To: <de63a1d8-2822-2d98-973c-c1b2f8c493cf@pengutronix.de>

From: Marc Kleine-Budde <mkl@pengutronix.de>
Date: Thu, 1 Dec 2016 21:21:44 +0100

> this is a pull request of 4 patches for net-next/master.
> 
> There are two patches by Chris Paterson for the rcar_can and rcar_canfd
> device tree binding documentation. And a patch by Geert Uytterhoeven
> that corrects the order of interrupt specifiers.
> 
> The fourth patch by Colin Ian King fixes a spelling error in the
> kvaser_usb driver.

Pulled, thanks.

^ permalink raw reply

* Re: [PATCH V2 net-next] net: hns: Fix to conditionally convey RX checksum flag to stack
From: David Miller @ 2016-12-03 20:25 UTC (permalink / raw)
  To: salil.mehta; +Cc: yisen.zhuang, mehta.salil.lnk, netdev, linux-kernel, linuxarm
In-Reply-To: <F4CC6FACFEB3C54C9141D49AD221F7F91A7D5EF1@lhreml503-mbx>

From: Salil Mehta <salil.mehta@huawei.com>
Date: Thu, 1 Dec 2016 16:59:14 +0000

> It looks to me the cumbersome check in the PATCH V2 should
> be retained.

I really want something simpler with small checks that are
done in logical pieces in a straightforward progression.

The code in V2 is completely unreadable.

^ permalink raw reply

* Re: [PATCH 2/2] net: stmmac: unify mdio functions
From: David Miller @ 2016-12-03 20:24 UTC (permalink / raw)
  To: clabbe.montjoie; +Cc: peppe.cavallaro, alexandre.torgue, netdev, linux-kernel
In-Reply-To: <1480605581-13350-2-git-send-email-clabbe.montjoie@gmail.com>

From: Corentin Labbe <clabbe.montjoie@gmail.com>
Date: Thu,  1 Dec 2016 16:19:41 +0100

> stmmac_mdio_{read|write} and stmmac_mdio_{read|write}_gmac4 are not
> different enough to be split.
> The only differences between those two functions are shift/mask for
> addr/reg/clk_csr.
> 
> This patch introduces a per-platform set of variables for setting those
> shift/mask and unifies the mdio read and write functions.
> 
> Signed-off-by: Corentin Labbe <clabbe.montjoie@gmail.com>

Applied.

^ permalink raw reply

* Re: [PATCH net-next 1/4] bpf: xdp: Allow head adjustment in XDP prog
From: Daniel Borkmann @ 2016-12-03 20:21 UTC (permalink / raw)
  To: Martin KaFai Lau, Jesper Dangaard Brouer
  Cc: netdev, Alexei Starovoitov, Brenden Blanco, David Miller,
	Saeed Mahameed, Tariq Toukan, Kernel Team
In-Reply-To: <20161203193249.GC70461@kafai-mba.local>

On 12/03/2016 08:32 PM, Martin KaFai Lau wrote:
> On Sat, Dec 03, 2016 at 04:24:13PM +0100, Jesper Dangaard Brouer wrote:
>> On Fri, 2 Dec 2016 15:23:30 -0800
>> Martin KaFai Lau <kafai@fb.com> wrote:
>>
>>> -bool bpf_helper_changes_skb_data(void *func)
>>> +BPF_CALL_2(bpf_xdp_adjust_head, struct xdp_buff *, xdp, int, offset)
>>> +{
>>> +	/* Both mlx4 and mlx5 driver align each packet to PAGE_SIZE when
>>> +	 * XDP prog is set.
>>> +	 * If the above is not true for the other drivers to support
>>> +	 * bpf_xdp_adjust_head, struct xdp_buff can be extended.
>>> +	 */
>>> +	void *head = (void *)((unsigned long)xdp->data & PAGE_MASK);
>>> +	void *new_data = xdp->data + offset;
>>> +
>>> +	if (new_data < head || new_data >= xdp->data_end)
>>> +		/* The packet length must be >=1 */
>>> +		return -EINVAL;
>>> +
>>> +	xdp->data = new_data;
>>> +
>>> +	return 0;
>>> +}
>>
>> First time I read this code, I was about to complain about you didn't
>> use XDP_PACKET_HEADROOM in your boundary check.  But then I noticed the
>> PAGE_MASK.  If you rename "head" to "page_boundary" or "page_start"
>> then IMHO the code would be more readable.
> bpf_xdp_adjust_head() could be called multiple times.  Hence,
> XDP_PACKET_HEADROOM is not used in the boundary check.
>
> My thinking is "head" here can closely resemble the meaning of
> skb->head as a boundary.  I think missing the info on
> what head it is could be the confusing part.
>
> Instead of skb boundary (there is no skb here) or
> page boundary (other future XDP driver may not align like mlx4/5),
> I think maybe "pkt_head" can give more clarity here and also
> for future XDP-capable drivers?

I think as-is with head is also fine with me, but if it should be
something better readable (?), perhaps as such (modulo the min len
part):

BPF_CALL_2(bpf_xdp_adjust_head, struct xdp_buff *, xdp, int, offset)
{
	unsigned long addr = (unsigned long)xdp->data & PAGE_MASK;
	void *data_hard_start = (void *)addr;
	void *data = xdp->data + offset;

	if (unlikely(data < data_hard_start || data >= xdp->data_end))
		return -EINVAL;

	xdp->data = data;
	return 0;
}

Thanks,
Daniel

^ permalink raw reply

* Re: [PATCH net-next 1/4] bpf: xdp: Allow head adjustment in XDP prog
From: Martin KaFai Lau @ 2016-12-03 19:32 UTC (permalink / raw)
  To: Jesper Dangaard Brouer
  Cc: netdev, Alexei Starovoitov, Brenden Blanco, Daniel Borkmann,
	David Miller, Saeed Mahameed, Tariq Toukan, Kernel Team
In-Reply-To: <20161203162413.5f305f9f@redhat.com>

On Sat, Dec 03, 2016 at 04:24:13PM +0100, Jesper Dangaard Brouer wrote:
> On Fri, 2 Dec 2016 15:23:30 -0800
> Martin KaFai Lau <kafai@fb.com> wrote:
>
> > -bool bpf_helper_changes_skb_data(void *func)
> > +BPF_CALL_2(bpf_xdp_adjust_head, struct xdp_buff *, xdp, int, offset)
> > +{
> > +	/* Both mlx4 and mlx5 driver align each packet to PAGE_SIZE when
> > +	 * XDP prog is set.
> > +	 * If the above is not true for the other drivers to support
> > +	 * bpf_xdp_adjust_head, struct xdp_buff can be extended.
> > +	 */
> > +	void *head = (void *)((unsigned long)xdp->data & PAGE_MASK);
> > +	void *new_data = xdp->data + offset;
> > +
> > +	if (new_data < head || new_data >= xdp->data_end)
> > +		/* The packet length must be >=1 */
> > +		return -EINVAL;
> > +
> > +	xdp->data = new_data;
> > +
> > +	return 0;
> > +}
>
> First time I read this code, I was about to complain about you didn't
> use XDP_PACKET_HEADROOM in your boundary check.  But then I noticed the
> PAGE_MASK.  If you rename "head" to "page_boundary" or "page_start"
> then IMHO the code would be more readable.
bpf_xdp_adjust_head() could be called multiple times.  Hence,
XDP_PACKET_HEADROOM is not used in the boundary check.

My thinking is "head" here can closely resemble the meaning of
skb->head as a boundary.  I think missing the info on
what head it is could be the confusing part.

Instead of skb boundary (there is no skb here) or
page boundary (other future XDP driver may not align like mlx4/5),
I think maybe "pkt_head" can give more clarity here and also
for future XDP-capable drivers?

^ permalink raw reply

* Re: [PATCH] irda: w83977af_ir: fix damaged whitespace
From: David Miller @ 2016-12-03 20:09 UTC (permalink / raw)
  To: arnd; +Cc: samuel, netdev, linux-kernel
In-Reply-To: <20161128141951.2674778-1-arnd@arndb.de>

From: Arnd Bergmann <arnd@arndb.de>
Date: Mon, 28 Nov 2016 15:19:43 +0100

> As David Miller pointed out for the previous patch, the whitespace
> in some functions looks rather odd. This was caused by commit 6329da5f258a
> ("obsolete config in kernel source: USE_INTERNAL_TIMER"), which removed
> some conditions but did not reindent the code.
> 
> This fixes the indentation in the file and removes extraneous whitespace
> at the end of the lines and before tabs.
> 
> There are many other minor coding style problems in the driver, but I'm
> not touching those here.
> 
> Signed-off-by: Arnd Bergmann <arnd@arndb.de>

Applied, thanks Arnd.

^ permalink raw reply

* Re: [PATCH V2 net-next] net: hns: Fix to conditionally convey RX checksum flag to stack
From: David Miller @ 2016-12-03 20:09 UTC (permalink / raw)
  To: salil.mehta; +Cc: yisen.zhuang, mehta.salil.lnk, netdev, linux-kernel, linuxarm
In-Reply-To: <F4CC6FACFEB3C54C9141D49AD221F7F91A7D4439@lhreml503-mbx>

From: Salil Mehta <salil.mehta@huawei.com>
Date: Thu, 1 Dec 2016 12:09:22 +0000

> But maybe now since we don't have any method to de-multiplex the kind of
> checksum error (cannot depend upon register) we can have below code
> re-arrangement:
> 
> hns_nic_rx_checksum() {
>       /* check supported L3 protocol */
> 	if (l3 != IPV4 && l3 != IPV6)
> 		return;
>       /* check if L3 protocols error */
>       if (l3e)
> 	 	return;
> 
>       /* check if the packets are fragmented */
> 	If (l3frags)
> 		Return;
> 
>       /* check supported L4 protocol */
>  	if (l4 != UDP && l4 != TCP && l4 != SCTP)
>  		return;
>       /* check if any L4 protocol error */
>       if (l3e)
> 	 	return;
> 
>       /* packet with valid checksum - convey to stack */
>       skb->ip_summed = CHECKSUM_UNNECESSARY
> }

This looks a lot cleaner and easier to understand.

^ permalink raw reply

* Re: [PATCH] stmmac: cleanup documenation, make it match reality
From: David Miller @ 2016-12-03 20:07 UTC (permalink / raw)
  To: pavel; +Cc: akpm, peppe.cavallaro, netdev, linux-kernel
In-Reply-To: <20161201103218.GB19056@amd>

From: Pavel Machek <pavel@ucw.cz>
Date: Thu, 1 Dec 2016 11:32:18 +0100

> Fix English in documentation, make documentation match reality, remove
> options that were removed from code.
>     
> Signed-off-by: Pavel Machek <pavel@denx.de>

Applied.

^ permalink raw reply

* [PATCH net-next] r8169: Add support for restarting auto-negotiation
From: Florian Fainelli @ 2016-12-03 20:01 UTC (permalink / raw)
  To: netdev
  Cc: davem, hau, romieu, Florian Fainelli,
	Realtek linux nic maintainers, open list

Implement ethtool::nway_reset by utilizing mii_nway_restart.

Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
---
 drivers/net/ethernet/realtek/r8169.c | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/drivers/net/ethernet/realtek/r8169.c b/drivers/net/ethernet/realtek/r8169.c
index 2830190aaace..f9b97f5946f8 100644
--- a/drivers/net/ethernet/realtek/r8169.c
+++ b/drivers/net/ethernet/realtek/r8169.c
@@ -2344,6 +2344,13 @@ static void rtl8169_get_strings(struct net_device *dev, u32 stringset, u8 *data)
 	}
 }
 
+static int rtl8169_nway_reset(struct net_device *dev)
+{
+	struct rtl8169_private *tp = netdev_priv(dev);
+
+	return mii_nway_restart(&tp->mii);
+}
+
 static const struct ethtool_ops rtl8169_ethtool_ops = {
 	.get_drvinfo		= rtl8169_get_drvinfo,
 	.get_regs_len		= rtl8169_get_regs_len,
@@ -2359,6 +2366,7 @@ static const struct ethtool_ops rtl8169_ethtool_ops = {
 	.get_sset_count		= rtl8169_get_sset_count,
 	.get_ethtool_stats	= rtl8169_get_ethtool_stats,
 	.get_ts_info		= ethtool_op_get_ts_info,
+	.nway_reset		= rtl8169_nway_reset,
 };
 
 static void rtl8169_get_mac_version(struct rtl8169_private *tp,
-- 
2.9.3

^ permalink raw reply related

* pull request: bluetooth-next 2016-12-03
From: Johan Hedberg @ 2016-12-03 19:53 UTC (permalink / raw)
  To: davem; +Cc: linux-bluetooth, netdev

[-- Attachment #1: Type: text/plain, Size: 2179 bytes --]

Hi Dave,

Here's a set of Bluetooth & 802.15.4 patches for net-next (i.e. 4.10
kernel):

 - Fix for a potential NULL deref in the ieee802154 netlink code
 - Fix for the ED values of the at86rf2xx driver
 - Documentation updates to ieee802154
 - Cleanups to u8 vs __u8 usage
 - Timer API usage cleanups in HCI drivers

Please let me know if there are any issues pulling. Thanks.

Johan

---
The following changes since commit 0b42f25d2f123bb7fbd3565d003a8ea9e1e810fe:

  Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net (2016-11-26 23:42:21 -0500)

are available in the git repository at:

  git://git.kernel.org/pub/scm/linux/kernel/git/bluetooth/bluetooth-next.git for-upstream

for you to fetch changes up to 6bf0d84d13e968b4f8bf0710e0cae785e228dbba:

  docs: ieee802154: update main documentation file (2016-11-30 12:33:07 +0100)

----------------------------------------------------------------
Alexander Aring (1):
      at86rf230: fix cca ed values for rf233

Pavel Machek (1):
      Bluetooth: __ variants of u8 and friends are not neccessary inside kernel

Prasanna Karthik (3):
      Bluetooth: hci_bcsp: Use setup_timer Kernel API instead of init_timer
      Bluetooth: hci_h5: Use setup_timer Kernel API instead of init_timer
      Bluetooth: hci_qca: Use setup_timer Kernel API instead of init_timer

Stefan Schmidt (3):
      ieee802154: add myself as co-maintainer to MAINTAINERS file
      ieee802154: fakelb: print number of created fake devices during probe
      docs: ieee802154: update main documentation file

vegard.nossum@oracle.com (1):
      ieee802154: check device type

 Documentation/networking/ieee802154.txt | 26 +++++++++++---------------
 MAINTAINERS                             |  1 +
 drivers/bluetooth/hci_bcsp.c            |  4 +---
 drivers/bluetooth/hci_h5.c              |  4 +---
 drivers/bluetooth/hci_qca.c             |  9 +++------
 drivers/net/ieee802154/at86rf230.c      | 16 +++++++++++-----
 drivers/net/ieee802154/fakelb.c         |  2 +-
 include/net/bluetooth/bluetooth.h       | 25 +++++++++++++------------
 net/ieee802154/nl-phy.c                 |  6 +++++-
 9 files changed, 47 insertions(+), 46 deletions(-)

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 801 bytes --]

^ permalink raw reply

* Re: [flamebait] xdp, well meaning but pointless
From: John Fastabend @ 2016-12-03 19:48 UTC (permalink / raw)
  To: Willem de Bruijn, Jesper Dangaard Brouer
  Cc: Florian Westphal, Network Development
In-Reply-To: <CAF=yD-+Wx9Pkt55-dUU6dYM7P==kQoKE4YXKoFSq1yi7rY5rDA@mail.gmail.com>

On 16-12-03 08:19 AM, Willem de Bruijn wrote:
> On Fri, Dec 2, 2016 at 12:22 PM, Jesper Dangaard Brouer
> <brouer@redhat.com> wrote:
>>
>> On Thu, 1 Dec 2016 10:11:08 +0100 Florian Westphal <fw@strlen.de> wrote:
>>
>>> In light of DPDKs existence it make a lot more sense to me to provide
>>> a). a faster mmap based interface (possibly AF_PACKET based) that allows
>>> to map nic directly into userspace, detaching tx/rx queue from kernel.
>>>
>>> John Fastabend sent something like this last year as a proof of
>>> concept, iirc it was rejected because register space got exposed directly
>>> to userspace.  I think we should re-consider merging netmap
>>> (or something conceptually close to its design).
>>
>> I'm actually working in this direction, of zero-copy RX mapping packets
>> into userspace.  This work is mostly related to page_pool, and I only
>> plan to use XDP as a filter for selecting packets going to userspace,
>> as this choice need to be taken very early.
>>
>> My design is here:
>>  https://prototype-kernel.readthedocs.io/en/latest/vm/page_pool/design/memory_model_nic.html
>>
>> This is mostly about changing the memory model in the drivers, to allow
>> for safely mapping pages to userspace.  (An efficient queue mechanism is
>> not covered).
> 
> Virtio virtqueues are used in various other locations in the stack.
> With separate memory pools and send + completion descriptor rings,
> signal moderation, careful avoidance of cacheline bouncing, etc. these
> seem like a good opportunity for a TPACKET_V4 format.
> 

FWIW. After we rejected exposing the register space to user space due to
valid security issues we fell back to using VFIO which works nicely for
mapping virtual functions into userspace and VMs. The main drawback is
user space has to manage the VF but that is mostly a solved problem at
this point. Deployment concerns aside.

There was a TPACKET_V4 version we had a prototype of that passed
buffers down to the hardware to use with the dma engine. This gives
zero-copy but same as VFs requires the hardware to do all the steering
of traffic and any expected policy in front of the application. Due to
requiring user space to kick hardware and vice versa though it was
somewhat slower so I didn't finish it up. The kick was implemented as a
syscall iirc. I can maybe look at it a bit more next week and see if it's
worth reviving now in this context.

I don't think any of this requires page pools though. Or rather tpacket
and vhost/virtio already know how to do page pools is perhaps the other
way to look at it.

One idea I've been playing around with is a vhost backend using
tpacketv{3|4} so we don't require socket manipulation.

Thanks,
John

^ permalink raw reply

* [PATCH v2 net-next 0/8] tcp: tsq: performance series
From: Eric Dumazet @ 2016-12-03 19:14 UTC (permalink / raw)
  To: David S . Miller; +Cc: netdev, Eric Dumazet, Yuchung Cheng, Eric Dumazet

Under very high TX stress, CPU handling NIC TX completions can spend
considerable amount of cycles handling TSQ (TCP Small Queues) logic.

This patch series avoids some atomic operations, but most notable
patch is the 3rd one, allowing other cpus processing ACK packets and
calling tcp_write_xmit() to grab TCP_TSQ_DEFERRED so that
tcp_tasklet_func() can skip already processed sockets.

This avoids lots of lock acquisitions and cache line accesses,
particularly under load.

In v2, I added :

- tcp_small_queue_check() change to allow 1st and 2nd packets
  in write queue to be sent, even in the case TX completion of
  already acknowledged packets did not happen yet.
  This helps when TX completion coalescing parameters are set
  even to insane values, and/or busy polling is used.

- A reorganization of struct sock fields to
  lower false sharing and increase data locality.

- Then I moved tsq_flags from tcp_sock to struct sock also
  to reduce cache line misses during TX completions.

I measured an overall throughput gain of 22 % for heavy TCP use
over a single TX queue.

Eric Dumazet (8):
  tcp: tsq: add tsq_flags / tsq_enum
  tcp: tsq: remove one locked operation in tcp_wfree()
  tcp: tsq: add shortcut in tcp_tasklet_func()
  tcp: tsq: avoid one atomic in tcp_wfree()
  tcp: tsq: add a shortcut in tcp_small_queue_check()
  tcp: tcp_mtu_probe() is likely to exit early
  net: reorganize struct sock for better data locality
  tcp: tsq: move tsq_flags close to sk_wmem_alloc

 include/linux/tcp.h   | 12 +++++--
 include/net/sock.h    | 51 +++++++++++++++--------------
 net/ipv4/tcp.c        |  4 +--
 net/ipv4/tcp_ipv4.c   |  2 +-
 net/ipv4/tcp_output.c | 91 +++++++++++++++++++++++++++++++--------------------
 net/ipv4/tcp_timer.c  |  4 +--
 net/ipv6/tcp_ipv6.c   |  2 +-
 7 files changed, 98 insertions(+), 68 deletions(-)

-- 
2.8.0.rc3.226.g39d4020

^ permalink raw reply

* Hello Beautiful
From: Bentley @ 2016-12-03 12:12 UTC (permalink / raw)


Hello beautiful, How you doing today? I hope you are doing well. My name is Bentley, from the US. I'm in Syria right now fighting ISIS. I want to get to know you better, if I may be so bold. I consider myself an easy-going man, and I am currently looking for a relationship in which I feel loved. Please tell me more about yourself, if you don't mind.

Hope to hear from you soon.

Regards,
Bentley.

^ permalink raw reply

* [PATCH v2 net-next 6/8] tcp: tcp_mtu_probe() is likely to exit early
From: Eric Dumazet @ 2016-12-03 19:14 UTC (permalink / raw)
  To: David S . Miller; +Cc: netdev, Eric Dumazet, Yuchung Cheng, Eric Dumazet
In-Reply-To: <1480792497-16607-1-git-send-email-edumazet@google.com>

Adding a likely() in tcp_mtu_probe() moves its code which used to
be inlined in front of tcp_write_xmit()

We still have a cache line miss to access icsk->icsk_mtup.enabled,
we will probably have to reorganize fields to help data locality.

Signed-off-by: Eric Dumazet <edumazet@google.com>
---
 net/ipv4/tcp_output.c | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index d5c46749adab..5f04bee4c86a 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -1932,26 +1932,26 @@ static inline void tcp_mtu_check_reprobe(struct sock *sk)
  */
 static int tcp_mtu_probe(struct sock *sk)
 {
-	struct tcp_sock *tp = tcp_sk(sk);
 	struct inet_connection_sock *icsk = inet_csk(sk);
+	struct tcp_sock *tp = tcp_sk(sk);
 	struct sk_buff *skb, *nskb, *next;
 	struct net *net = sock_net(sk);
-	int len;
 	int probe_size;
 	int size_needed;
-	int copy;
+	int copy, len;
 	int mss_now;
 	int interval;
 
 	/* Not currently probing/verifying,
 	 * not in recovery,
 	 * have enough cwnd, and
-	 * not SACKing (the variable headers throw things off) */
-	if (!icsk->icsk_mtup.enabled ||
-	    icsk->icsk_mtup.probe_size ||
-	    inet_csk(sk)->icsk_ca_state != TCP_CA_Open ||
-	    tp->snd_cwnd < 11 ||
-	    tp->rx_opt.num_sacks || tp->rx_opt.dsack)
+	 * not SACKing (the variable headers throw things off)
+	 */
+	if (likely(!icsk->icsk_mtup.enabled ||
+		   icsk->icsk_mtup.probe_size ||
+		   inet_csk(sk)->icsk_ca_state != TCP_CA_Open ||
+		   tp->snd_cwnd < 11 ||
+		   tp->rx_opt.num_sacks || tp->rx_opt.dsack))
 		return -1;
 
 	/* Use binary search for probe_size between tcp_mss_base,
-- 
2.8.0.rc3.226.g39d4020

^ permalink raw reply related

* [PATCH v2 net-next 7/8] net: reorganize struct sock for better data locality
From: Eric Dumazet @ 2016-12-03 19:14 UTC (permalink / raw)
  To: David S . Miller; +Cc: netdev, Eric Dumazet, Yuchung Cheng, Eric Dumazet
In-Reply-To: <1480792497-16607-1-git-send-email-edumazet@google.com>

Group fields used in TX path, and keep some cache lines mostly read
to permit sharing among cpus.

Gained two 4-byte holes on 64bit arches.

Added a place holder for tcp tsq_flags, next to sk_wmem_alloc
to speed up tcp_wfree() in the following patch.

I have not added ____cacheline_aligned_in_smp, this might be done later.
I prefer doing this once inet and tcp/udp sockets reorg is also done.

Tested with both TCP and UDP.

UDP receiver performance under flood increased by ~20 % :
Accessing sk_filter/sk_wq/sk_napi_id no longer stalls because sk_drops
was moved away from a critical cache line, now mostly read and shared.

	/* --- cacheline 4 boundary (256 bytes) --- */
	unsigned int               sk_napi_id;           /* 0x100   0x4 */
	int                        sk_rcvbuf;            /* 0x104   0x4 */
	struct sk_filter *         sk_filter;            /* 0x108   0x8 */
	union {
		struct socket_wq * sk_wq;                /*         0x8 */
		struct socket_wq * sk_wq_raw;            /*         0x8 */
	};                                               /* 0x110   0x8 */
	struct xfrm_policy *       sk_policy[2];         /* 0x118  0x10 */
	struct dst_entry *         sk_rx_dst;            /* 0x128   0x8 */
	struct dst_entry *         sk_dst_cache;         /* 0x130   0x8 */
	atomic_t                   sk_omem_alloc;        /* 0x138   0x4 */
	int                        sk_sndbuf;            /* 0x13c   0x4 */
	/* --- cacheline 5 boundary (320 bytes) --- */
	int                        sk_wmem_queued;       /* 0x140   0x4 */
	atomic_t                   sk_wmem_alloc;        /* 0x144   0x4 */
	long unsigned int          sk_tsq_flags;         /* 0x148   0x8 */
	struct sk_buff *           sk_send_head;         /* 0x150   0x8 */
	struct sk_buff_head        sk_write_queue;       /* 0x158  0x18 */
	__s32                      sk_peek_off;          /* 0x170   0x4 */
	int                        sk_write_pending;     /* 0x174   0x4 */
	long int                   sk_sndtimeo;          /* 0x178   0x8 */

Signed-off-by: Eric Dumazet <edumazet@google.com>
---
 include/net/sock.h | 51 +++++++++++++++++++++++++++------------------------
 1 file changed, 27 insertions(+), 24 deletions(-)

diff --git a/include/net/sock.h b/include/net/sock.h
index 69afda6bea15..6dfe3aa22b97 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -343,6 +343,9 @@ struct sock {
 #define sk_rxhash		__sk_common.skc_rxhash
 
 	socket_lock_t		sk_lock;
+	atomic_t		sk_drops;
+	int			sk_rcvlowat;
+	struct sk_buff_head	sk_error_queue;
 	struct sk_buff_head	sk_receive_queue;
 	/*
 	 * The backlog queue is special, it is always used with
@@ -359,14 +362,13 @@ struct sock {
 		struct sk_buff	*tail;
 	} sk_backlog;
 #define sk_rmem_alloc sk_backlog.rmem_alloc
-	int			sk_forward_alloc;
 
-	__u32			sk_txhash;
+	int			sk_forward_alloc;
 #ifdef CONFIG_NET_RX_BUSY_POLL
-	unsigned int		sk_napi_id;
 	unsigned int		sk_ll_usec;
+	/* ===== mostly read cache line ===== */
+	unsigned int		sk_napi_id;
 #endif
-	atomic_t		sk_drops;
 	int			sk_rcvbuf;
 
 	struct sk_filter __rcu	*sk_filter;
@@ -379,11 +381,30 @@ struct sock {
 #endif
 	struct dst_entry	*sk_rx_dst;
 	struct dst_entry __rcu	*sk_dst_cache;
-	/* Note: 32bit hole on 64bit arches */
-	atomic_t		sk_wmem_alloc;
 	atomic_t		sk_omem_alloc;
 	int			sk_sndbuf;
+
+	/* ===== cache line for TX ===== */
+	int			sk_wmem_queued;
+	atomic_t		sk_wmem_alloc;
+	unsigned long		sk_tsq_flags;
+	struct sk_buff		*sk_send_head;
 	struct sk_buff_head	sk_write_queue;
+	__s32			sk_peek_off;
+	int			sk_write_pending;
+	long			sk_sndtimeo;
+	struct timer_list	sk_timer;
+	__u32			sk_priority;
+	__u32			sk_mark;
+	u32			sk_pacing_rate; /* bytes per second */
+	u32			sk_max_pacing_rate;
+	struct page_frag	sk_frag;
+	netdev_features_t	sk_route_caps;
+	netdev_features_t	sk_route_nocaps;
+	int			sk_gso_type;
+	unsigned int		sk_gso_max_size;
+	gfp_t			sk_allocation;
+	__u32			sk_txhash;
 
 	/*
 	 * Because of non atomicity rules, all
@@ -414,42 +435,24 @@ struct sock {
 #define SK_PROTOCOL_MAX U8_MAX
 	kmemcheck_bitfield_end(flags);
 
-	int			sk_wmem_queued;
-	gfp_t			sk_allocation;
-	u32			sk_pacing_rate; /* bytes per second */
-	u32			sk_max_pacing_rate;
-	netdev_features_t	sk_route_caps;
-	netdev_features_t	sk_route_nocaps;
-	int			sk_gso_type;
-	unsigned int		sk_gso_max_size;
 	u16			sk_gso_max_segs;
-	int			sk_rcvlowat;
 	unsigned long	        sk_lingertime;
-	struct sk_buff_head	sk_error_queue;
 	struct proto		*sk_prot_creator;
 	rwlock_t		sk_callback_lock;
 	int			sk_err,
 				sk_err_soft;
 	u32			sk_ack_backlog;
 	u32			sk_max_ack_backlog;
-	__u32			sk_priority;
-	__u32			sk_mark;
 	kuid_t			sk_uid;
 	struct pid		*sk_peer_pid;
 	const struct cred	*sk_peer_cred;
 	long			sk_rcvtimeo;
-	long			sk_sndtimeo;
-	struct timer_list	sk_timer;
 	ktime_t			sk_stamp;
 	u16			sk_tsflags;
 	u8			sk_shutdown;
 	u32			sk_tskey;
 	struct socket		*sk_socket;
 	void			*sk_user_data;
-	struct page_frag	sk_frag;
-	struct sk_buff		*sk_send_head;
-	__s32			sk_peek_off;
-	int			sk_write_pending;
 #ifdef CONFIG_SECURITY
 	void			*sk_security;
 #endif
-- 
2.8.0.rc3.226.g39d4020

^ permalink raw reply related

* [PATCH v2 net-next 8/8] tcp: tsq: move tsq_flags close to sk_wmem_alloc
From: Eric Dumazet @ 2016-12-03 19:14 UTC (permalink / raw)
  To: David S . Miller; +Cc: netdev, Eric Dumazet, Yuchung Cheng, Eric Dumazet
In-Reply-To: <1480792497-16607-1-git-send-email-edumazet@google.com>

tsq_flags being in the same cache line as sk_wmem_alloc
makes a lot of sense. Both fields are changed from tcp_wfree()
and more generally by various TSQ related functions.

Prior patch made room in struct sock and added sk_tsq_flags,
this patch deletes tsq_flags from struct tcp_sock.

Signed-off-by: Eric Dumazet <edumazet@google.com>
---
 include/linux/tcp.h   |  1 -
 net/ipv4/tcp.c        |  4 ++--
 net/ipv4/tcp_ipv4.c   |  2 +-
 net/ipv4/tcp_output.c | 24 +++++++++++-------------
 net/ipv4/tcp_timer.c  |  4 ++--
 net/ipv6/tcp_ipv6.c   |  2 +-
 6 files changed, 17 insertions(+), 20 deletions(-)

diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index d8be083ab0b0..fc5848dad7a4 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -186,7 +186,6 @@ struct tcp_sock {
 	u32	tsoffset;	/* timestamp offset */
 
 	struct list_head tsq_node; /* anchor in tsq_tasklet.head list */
-	unsigned long	tsq_flags;
 
 	/* Data for direct copy to user */
 	struct {
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 1149b48700a1..1ef3165114ba 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -663,9 +663,9 @@ static void tcp_push(struct sock *sk, int flags, int mss_now,
 	if (tcp_should_autocork(sk, skb, size_goal)) {
 
 		/* avoid atomic op if TSQ_THROTTLED bit is already set */
-		if (!test_bit(TSQ_THROTTLED, &tp->tsq_flags)) {
+		if (!test_bit(TSQ_THROTTLED, &sk->sk_tsq_flags)) {
 			NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPAUTOCORKING);
-			set_bit(TSQ_THROTTLED, &tp->tsq_flags);
+			set_bit(TSQ_THROTTLED, &sk->sk_tsq_flags);
 		}
 		/* It is possible TX completion already happened
 		 * before we set TSQ_THROTTLED.
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index b50f05905ced..30d81f533ada 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -443,7 +443,7 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
 			if (!sock_owned_by_user(sk)) {
 				tcp_v4_mtu_reduced(sk);
 			} else {
-				if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &tp->tsq_flags))
+				if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
 					sock_hold(sk);
 			}
 			goto out;
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 5f04bee4c86a..b45101f3d2bd 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -767,14 +767,15 @@ static void tcp_tasklet_func(unsigned long data)
 	list_for_each_safe(q, n, &list) {
 		tp = list_entry(q, struct tcp_sock, tsq_node);
 		list_del(&tp->tsq_node);
-		clear_bit(TSQ_QUEUED, &tp->tsq_flags);
 
 		sk = (struct sock *)tp;
+		clear_bit(TSQ_QUEUED, &sk->sk_tsq_flags);
+
 		if (!sk->sk_lock.owned &&
-		    test_bit(TCP_TSQ_DEFERRED, &tp->tsq_flags)) {
+		    test_bit(TCP_TSQ_DEFERRED, &sk->sk_tsq_flags)) {
 			bh_lock_sock(sk);
 			if (!sock_owned_by_user(sk)) {
-				clear_bit(TCP_TSQ_DEFERRED, &tp->tsq_flags);
+				clear_bit(TCP_TSQ_DEFERRED, &sk->sk_tsq_flags);
 				tcp_tsq_handler(sk);
 			}
 			bh_unlock_sock(sk);
@@ -797,16 +798,15 @@ static void tcp_tasklet_func(unsigned long data)
  */
 void tcp_release_cb(struct sock *sk)
 {
-	struct tcp_sock *tp = tcp_sk(sk);
 	unsigned long flags, nflags;
 
 	/* perform an atomic operation only if at least one flag is set */
 	do {
-		flags = tp->tsq_flags;
+		flags = sk->sk_tsq_flags;
 		if (!(flags & TCP_DEFERRED_ALL))
 			return;
 		nflags = flags & ~TCP_DEFERRED_ALL;
-	} while (cmpxchg(&tp->tsq_flags, flags, nflags) != flags);
+	} while (cmpxchg(&sk->sk_tsq_flags, flags, nflags) != flags);
 
 	if (flags & TCPF_TSQ_DEFERRED)
 		tcp_tsq_handler(sk);
@@ -878,7 +878,7 @@ void tcp_wfree(struct sk_buff *skb)
 	if (wmem >= SKB_TRUESIZE(1) && this_cpu_ksoftirqd() == current)
 		goto out;
 
-	for (oval = READ_ONCE(tp->tsq_flags);; oval = nval) {
+	for (oval = READ_ONCE(sk->sk_tsq_flags);; oval = nval) {
 		struct tsq_tasklet *tsq;
 		bool empty;
 
@@ -886,7 +886,7 @@ void tcp_wfree(struct sk_buff *skb)
 			goto out;
 
 		nval = (oval & ~TSQF_THROTTLED) | TSQF_QUEUED | TCPF_TSQ_DEFERRED;
-		nval = cmpxchg(&tp->tsq_flags, oval, nval);
+		nval = cmpxchg(&sk->sk_tsq_flags, oval, nval);
 		if (nval != oval)
 			continue;
 
@@ -2100,7 +2100,7 @@ static bool tcp_small_queue_check(struct sock *sk, const struct sk_buff *skb,
 		    skb->prev == sk->sk_write_queue.next)
 			return false;
 
-		set_bit(TSQ_THROTTLED, &tcp_sk(sk)->tsq_flags);
+		set_bit(TSQ_THROTTLED, &sk->sk_tsq_flags);
 		/* It is possible TX completion already happened
 		 * before we set TSQ_THROTTLED, so we must
 		 * test again the condition.
@@ -2241,8 +2241,8 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
 		    unlikely(tso_fragment(sk, skb, limit, mss_now, gfp)))
 			break;
 
-		if (test_bit(TCP_TSQ_DEFERRED, &tp->tsq_flags))
-			clear_bit(TCP_TSQ_DEFERRED, &tp->tsq_flags);
+		if (test_bit(TCP_TSQ_DEFERRED, &sk->sk_tsq_flags))
+			clear_bit(TCP_TSQ_DEFERRED, &sk->sk_tsq_flags);
 		if (tcp_small_queue_check(sk, skb, 0))
 			break;
 
@@ -3545,8 +3545,6 @@ void tcp_send_ack(struct sock *sk)
 	/* We do not want pure acks influencing TCP Small Queues or fq/pacing
 	 * too much.
 	 * SKB_TRUESIZE(max(1 .. 66, MAX_TCP_HEADER)) is unfortunately ~784
-	 * We also avoid tcp_wfree() overhead (cache line miss accessing
-	 * tp->tsq_flags) by using regular sock_wfree()
 	 */
 	skb_set_tcp_pure_ack(buff);
 
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index 3ea1cf804748..3705075f42c3 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -310,7 +310,7 @@ static void tcp_delack_timer(unsigned long data)
 		inet_csk(sk)->icsk_ack.blocked = 1;
 		__NET_INC_STATS(sock_net(sk), LINUX_MIB_DELAYEDACKLOCKED);
 		/* deleguate our work to tcp_release_cb() */
-		if (!test_and_set_bit(TCP_DELACK_TIMER_DEFERRED, &tcp_sk(sk)->tsq_flags))
+		if (!test_and_set_bit(TCP_DELACK_TIMER_DEFERRED, &sk->sk_tsq_flags))
 			sock_hold(sk);
 	}
 	bh_unlock_sock(sk);
@@ -592,7 +592,7 @@ static void tcp_write_timer(unsigned long data)
 		tcp_write_timer_handler(sk);
 	} else {
 		/* delegate our work to tcp_release_cb() */
-		if (!test_and_set_bit(TCP_WRITE_TIMER_DEFERRED, &tcp_sk(sk)->tsq_flags))
+		if (!test_and_set_bit(TCP_WRITE_TIMER_DEFERRED, &sk->sk_tsq_flags))
 			sock_hold(sk);
 	}
 	bh_unlock_sock(sk);
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index a2185a214abc..73bc8fc68acd 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -399,7 +399,7 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
 		if (!sock_owned_by_user(sk))
 			tcp_v6_mtu_reduced(sk);
 		else if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED,
-					   &tp->tsq_flags))
+					   &sk->sk_tsq_flags))
 			sock_hold(sk);
 		goto out;
 	}
-- 
2.8.0.rc3.226.g39d4020

^ permalink raw reply related

* [PATCH v2 net-next 4/8] tcp: tsq: avoid one atomic in tcp_wfree()
From: Eric Dumazet @ 2016-12-03 19:14 UTC (permalink / raw)
  To: David S . Miller; +Cc: netdev, Eric Dumazet, Yuchung Cheng, Eric Dumazet
In-Reply-To: <1480792497-16607-1-git-send-email-edumazet@google.com>

Under high load, tcp_wfree() has an atomic operation trying
to schedule a tasklet over and over.

We can schedule it only if our per cpu list was empty.

Signed-off-by: Eric Dumazet <edumazet@google.com>
---
 net/ipv4/tcp_output.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index fa23b688a6f3..0db63efe5b8b 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -880,6 +880,7 @@ void tcp_wfree(struct sk_buff *skb)
 
 	for (oval = READ_ONCE(tp->tsq_flags);; oval = nval) {
 		struct tsq_tasklet *tsq;
+		bool empty;
 
 		if (!(oval & TSQF_THROTTLED) || (oval & TSQF_QUEUED))
 			goto out;
@@ -892,8 +893,10 @@ void tcp_wfree(struct sk_buff *skb)
 		/* queue this socket to tasklet queue */
 		local_irq_save(flags);
 		tsq = this_cpu_ptr(&tsq_tasklet);
+		empty = list_empty(&tsq->head);
 		list_add(&tp->tsq_node, &tsq->head);
-		tasklet_schedule(&tsq->tasklet);
+		if (empty)
+			tasklet_schedule(&tsq->tasklet);
 		local_irq_restore(flags);
 		return;
 	}
-- 
2.8.0.rc3.226.g39d4020

^ permalink raw reply related

* [PATCH v2 net-next 3/8] tcp: tsq: add shortcut in tcp_tasklet_func()
From: Eric Dumazet @ 2016-12-03 19:14 UTC (permalink / raw)
  To: David S . Miller; +Cc: netdev, Eric Dumazet, Yuchung Cheng, Eric Dumazet
In-Reply-To: <1480792497-16607-1-git-send-email-edumazet@google.com>

Under high stress, I've seen tcp_tasklet_func() consuming
~700 usec, handling ~150 tcp sockets.

By setting TCP_TSQ_DEFERRED in tcp_wfree(), we give a chance
for other cpus/threads entering tcp_write_xmit() to grab it,
allowing tcp_tasklet_func() to skip sockets that already did
an xmit cycle.

In the future, we might give ACK processing an increased budget
to further reduce the amount of work done by tcp_tasklet_func().

Signed-off-by: Eric Dumazet <edumazet@google.com>
---
 net/ipv4/tcp_output.c | 22 ++++++++++++----------
 1 file changed, 12 insertions(+), 10 deletions(-)

diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 4adaf8e1bb63..fa23b688a6f3 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -767,19 +767,19 @@ static void tcp_tasklet_func(unsigned long data)
 	list_for_each_safe(q, n, &list) {
 		tp = list_entry(q, struct tcp_sock, tsq_node);
 		list_del(&tp->tsq_node);
+		clear_bit(TSQ_QUEUED, &tp->tsq_flags);
 
 		sk = (struct sock *)tp;
-		bh_lock_sock(sk);
-
-		if (!sock_owned_by_user(sk)) {
-			tcp_tsq_handler(sk);
-		} else {
-			/* defer the work to tcp_release_cb() */
-			set_bit(TCP_TSQ_DEFERRED, &tp->tsq_flags);
+		if (!sk->sk_lock.owned &&
+		    test_bit(TCP_TSQ_DEFERRED, &tp->tsq_flags)) {
+			bh_lock_sock(sk);
+			if (!sock_owned_by_user(sk)) {
+				clear_bit(TCP_TSQ_DEFERRED, &tp->tsq_flags);
+				tcp_tsq_handler(sk);
+			}
+			bh_unlock_sock(sk);
 		}
-		bh_unlock_sock(sk);
 
-		clear_bit(TSQ_QUEUED, &tp->tsq_flags);
 		sk_free(sk);
 	}
 }
@@ -884,7 +884,7 @@ void tcp_wfree(struct sk_buff *skb)
 		if (!(oval & TSQF_THROTTLED) || (oval & TSQF_QUEUED))
 			goto out;
 
-		nval = (oval & ~TSQF_THROTTLED) | TSQF_QUEUED;
+		nval = (oval & ~TSQF_THROTTLED) | TSQF_QUEUED | TCPF_TSQ_DEFERRED;
 		nval = cmpxchg(&tp->tsq_flags, oval, nval);
 		if (nval != oval)
 			continue;
@@ -2229,6 +2229,8 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
 		    unlikely(tso_fragment(sk, skb, limit, mss_now, gfp)))
 			break;
 
+		if (test_bit(TCP_TSQ_DEFERRED, &tp->tsq_flags))
+			clear_bit(TCP_TSQ_DEFERRED, &tp->tsq_flags);
 		if (tcp_small_queue_check(sk, skb, 0))
 			break;
 
-- 
2.8.0.rc3.226.g39d4020

^ permalink raw reply related

* [PATCH v2 net-next 5/8] tcp: tsq: add a shortcut in tcp_small_queue_check()
From: Eric Dumazet @ 2016-12-03 19:14 UTC (permalink / raw)
  To: David S . Miller; +Cc: netdev, Eric Dumazet, Yuchung Cheng, Eric Dumazet
In-Reply-To: <1480792497-16607-1-git-send-email-edumazet@google.com>

Always allow the first two skbs in the write queue to be sent,
regardless of sk_wmem_alloc/sk_pacing_rate values.

This helps a lot in situations where TX completions are delayed either
because of driver latencies or softirq latencies.

Test is done with no cache line misses.

Signed-off-by: Eric Dumazet <edumazet@google.com>
---
 net/ipv4/tcp_output.c | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 0db63efe5b8b..d5c46749adab 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -2091,6 +2091,15 @@ static bool tcp_small_queue_check(struct sock *sk, const struct sk_buff *skb,
 	limit <<= factor;
 
 	if (atomic_read(&sk->sk_wmem_alloc) > limit) {
+		/* Always send the 1st or 2nd skb in write queue.
+		 * No need to wait for TX completion to call us back,
+		 * after softirq/tasklet schedule.
+		 * This helps when TX completions are delayed too much.
+		 */
+		if (skb == sk->sk_write_queue.next ||
+		    skb->prev == sk->sk_write_queue.next)
+			return false;
+
 		set_bit(TSQ_THROTTLED, &tcp_sk(sk)->tsq_flags);
 		/* It is possible TX completion already happened
 		 * before we set TSQ_THROTTLED, so we must
-- 
2.8.0.rc3.226.g39d4020

^ permalink raw reply related

* [PATCH v2 net-next 2/8] tcp: tsq: remove one locked operation in tcp_wfree()
From: Eric Dumazet @ 2016-12-03 19:14 UTC (permalink / raw)
  To: David S . Miller; +Cc: netdev, Eric Dumazet, Yuchung Cheng, Eric Dumazet
In-Reply-To: <1480792497-16607-1-git-send-email-edumazet@google.com>

Instead of atomically clearing TSQ_THROTTLED and atomically setting TSQ_QUEUED
bits, use one cmpxchg() to perform a single locked operation.

Since the following patch will also set TCP_TSQ_DEFERRED here,
this cmpxchg() will make this addition free.

Signed-off-by: Eric Dumazet <edumazet@google.com>
---
 net/ipv4/tcp_output.c | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 8f0289b0fb24..4adaf8e1bb63 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -860,6 +860,7 @@ void tcp_wfree(struct sk_buff *skb)
 {
 	struct sock *sk = skb->sk;
 	struct tcp_sock *tp = tcp_sk(sk);
+	unsigned long flags, nval, oval;
 	int wmem;
 
 	/* Keep one reference on sk_wmem_alloc.
@@ -877,11 +878,17 @@ void tcp_wfree(struct sk_buff *skb)
 	if (wmem >= SKB_TRUESIZE(1) && this_cpu_ksoftirqd() == current)
 		goto out;
 
-	if (test_and_clear_bit(TSQ_THROTTLED, &tp->tsq_flags) &&
-	    !test_and_set_bit(TSQ_QUEUED, &tp->tsq_flags)) {
-		unsigned long flags;
+	for (oval = READ_ONCE(tp->tsq_flags);; oval = nval) {
 		struct tsq_tasklet *tsq;
 
+		if (!(oval & TSQF_THROTTLED) || (oval & TSQF_QUEUED))
+			goto out;
+
+		nval = (oval & ~TSQF_THROTTLED) | TSQF_QUEUED;
+		nval = cmpxchg(&tp->tsq_flags, oval, nval);
+		if (nval != oval)
+			continue;
+
 		/* queue this socket to tasklet queue */
 		local_irq_save(flags);
 		tsq = this_cpu_ptr(&tsq_tasklet);
-- 
2.8.0.rc3.226.g39d4020

^ permalink raw reply related

* [PATCH v2 net-next 1/8] tcp: tsq: add tsq_flags / tsq_enum
From: Eric Dumazet @ 2016-12-03 19:14 UTC (permalink / raw)
  To: David S . Miller; +Cc: netdev, Eric Dumazet, Yuchung Cheng, Eric Dumazet
In-Reply-To: <1480792497-16607-1-git-send-email-edumazet@google.com>

This is a cleanup, to ease code review of following patches.

Old 'enum tsq_flags' is renamed, and a new enumeration is added
with the flags used in cmpxchg() operations as opposed to
single bit operations.

Signed-off-by: Eric Dumazet <edumazet@google.com>
---
 include/linux/tcp.h   | 11 ++++++++++-
 net/ipv4/tcp_output.c | 16 ++++++++--------
 2 files changed, 18 insertions(+), 9 deletions(-)

diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 734bab4c3bef..d8be083ab0b0 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -364,7 +364,7 @@ struct tcp_sock {
 	u32	*saved_syn;
 };
 
-enum tsq_flags {
+enum tsq_enum {
 	TSQ_THROTTLED,
 	TSQ_QUEUED,
 	TCP_TSQ_DEFERRED,	   /* tcp_tasklet_func() found socket was owned */
@@ -375,6 +375,15 @@ enum tsq_flags {
 				    */
 };
 
+enum tsq_flags {
+	TSQF_THROTTLED			= (1UL << TSQ_THROTTLED),
+	TSQF_QUEUED			= (1UL << TSQ_QUEUED),
+	TCPF_TSQ_DEFERRED		= (1UL << TCP_TSQ_DEFERRED),
+	TCPF_WRITE_TIMER_DEFERRED	= (1UL << TCP_WRITE_TIMER_DEFERRED),
+	TCPF_DELACK_TIMER_DEFERRED	= (1UL << TCP_DELACK_TIMER_DEFERRED),
+	TCPF_MTU_REDUCED_DEFERRED	= (1UL << TCP_MTU_REDUCED_DEFERRED),
+};
+
 static inline struct tcp_sock *tcp_sk(const struct sock *sk)
 {
 	return (struct tcp_sock *)sk;
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index c7adcb57654e..8f0289b0fb24 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -784,10 +784,10 @@ static void tcp_tasklet_func(unsigned long data)
 	}
 }
 
-#define TCP_DEFERRED_ALL ((1UL << TCP_TSQ_DEFERRED) |		\
-			  (1UL << TCP_WRITE_TIMER_DEFERRED) |	\
-			  (1UL << TCP_DELACK_TIMER_DEFERRED) |	\
-			  (1UL << TCP_MTU_REDUCED_DEFERRED))
+#define TCP_DEFERRED_ALL (TCPF_TSQ_DEFERRED |		\
+			  TCPF_WRITE_TIMER_DEFERRED |	\
+			  TCPF_DELACK_TIMER_DEFERRED |	\
+			  TCPF_MTU_REDUCED_DEFERRED)
 /**
  * tcp_release_cb - tcp release_sock() callback
  * @sk: socket
@@ -808,7 +808,7 @@ void tcp_release_cb(struct sock *sk)
 		nflags = flags & ~TCP_DEFERRED_ALL;
 	} while (cmpxchg(&tp->tsq_flags, flags, nflags) != flags);
 
-	if (flags & (1UL << TCP_TSQ_DEFERRED))
+	if (flags & TCPF_TSQ_DEFERRED)
 		tcp_tsq_handler(sk);
 
 	/* Here begins the tricky part :
@@ -822,15 +822,15 @@ void tcp_release_cb(struct sock *sk)
 	 */
 	sock_release_ownership(sk);
 
-	if (flags & (1UL << TCP_WRITE_TIMER_DEFERRED)) {
+	if (flags & TCPF_WRITE_TIMER_DEFERRED) {
 		tcp_write_timer_handler(sk);
 		__sock_put(sk);
 	}
-	if (flags & (1UL << TCP_DELACK_TIMER_DEFERRED)) {
+	if (flags & TCPF_DELACK_TIMER_DEFERRED) {
 		tcp_delack_timer_handler(sk);
 		__sock_put(sk);
 	}
-	if (flags & (1UL << TCP_MTU_REDUCED_DEFERRED)) {
+	if (flags & TCPF_MTU_REDUCED_DEFERRED) {
 		inet_csk(sk)->icsk_af_ops->mtu_reduced(sk);
 		__sock_put(sk);
 	}
-- 
2.8.0.rc3.226.g39d4020

^ permalink raw reply related

* Re: [Patch net-next] act_mirred: fix a typo in get_dev
From: Eric Dumazet @ 2016-12-03 18:59 UTC (permalink / raw)
  To: Cong Wang; +Cc: netdev, Hadar Hen Zion, Jiri Pirko
In-Reply-To: <1480790161-8097-1-git-send-email-xiyou.wangcong@gmail.com>

On Sat, 2016-12-03 at 10:36 -0800, Cong Wang wrote:
> Cc: Hadar Hen Zion <hadarh@mellanox.com>
> Cc: Jiri Pirko <jiri@mellanox.com>
> Signed-off-by: Cong Wang <xiyou.wangcong@gmail.com>
> ---
>  net/sched/act_mirred.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c
> index bb09ba3..2d9fa6e 100644
> --- a/net/sched/act_mirred.c
> +++ b/net/sched/act_mirred.c
> @@ -321,7 +321,7 @@ static int tcf_mirred_device(const struct tc_action *a, struct net *net,
>  	int ifindex = tcf_mirred_ifindex(a);
>  
>  	*mirred_dev = __dev_get_by_index(net, ifindex);
> -	if (!mirred_dev)
> +	if (!*mirred_dev)
>  		return -EINVAL;
>  	return 0;
>  }

Fixes: 255cb30425c0 ("net/sched: act_mirred: Add new tc_action_ops get_dev()")
Acked-by: Eric Dumazet <edumazet@google.com>

^ permalink raw reply

* Re: [PATCH v2 net-next 1/2] flow dissector: ICMP support
From: Tom Herbert @ 2016-12-03 18:52 UTC (permalink / raw)
  To: Jiri Pirko
  Cc: Simon Horman, David Miller, Linux Kernel Network Developers,
	Jay Vosburgh, Veaceslav Falico, Andy Gospodarek, Jamal Hadi Salim,
	Jiri Pirko
In-Reply-To: <20161203104918.GA1880@nanopsycho.orion>

On Sat, Dec 3, 2016 at 2:49 AM, Jiri Pirko <jiri@resnulli.us> wrote:
> Fri, Dec 02, 2016 at 09:31:41PM CET, simon.horman@netronome.com wrote:
>>Allow dissection of ICMP(V6) type and code. This re-uses transport layer
>>port dissection code as although ICMP is not a transport protocol and their
>>type and code are not ports this allows sharing of both code and storage.
>>
>>Signed-off-by: Simon Horman <simon.horman@netronome.com>
>>---
>> drivers/net/bonding/bond_main.c |  6 +++--
>> include/linux/skbuff.h          |  5 +++++
>> include/net/flow_dissector.h    | 50 ++++++++++++++++++++++++++++++++++++++---
>> net/core/flow_dissector.c       | 34 +++++++++++++++++++++++++---
>> net/sched/cls_flow.c            |  4 ++--
>> 5 files changed, 89 insertions(+), 10 deletions(-)
>>
>>diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
>>index 8029dd4912b6..a6f75cfb2bf7 100644
>>--- a/drivers/net/bonding/bond_main.c
>>+++ b/drivers/net/bonding/bond_main.c
>>@@ -3181,7 +3181,8 @@ static bool bond_flow_dissect(struct bonding *bond, struct sk_buff *skb,
>>       } else {
>>               return false;
>>       }
>>-      if (bond->params.xmit_policy == BOND_XMIT_POLICY_LAYER34 && proto >= 0)
>>+      if (bond->params.xmit_policy == BOND_XMIT_POLICY_LAYER34 &&
>>+          proto >= 0 && !skb_flow_is_icmp_any(skb, proto))
>>               fk->ports.ports = skb_flow_get_ports(skb, noff, proto);
>>
>>       return true;
>>@@ -3209,7 +3210,8 @@ u32 bond_xmit_hash(struct bonding *bond, struct sk_buff *skb)
>>               return bond_eth_hash(skb);
>>
>>       if (bond->params.xmit_policy == BOND_XMIT_POLICY_LAYER23 ||
>>-          bond->params.xmit_policy == BOND_XMIT_POLICY_ENCAP23)
>>+          bond->params.xmit_policy == BOND_XMIT_POLICY_ENCAP23 ||
>>+          flow_keys_are_icmp_any(&flow))
>>               hash = bond_eth_hash(skb);
>>       else
>>               hash = (__force u32)flow.ports.ports;
>>diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
>>index 9c535fbccf2c..44a8f69a9198 100644
>>--- a/include/linux/skbuff.h
>>+++ b/include/linux/skbuff.h
>>@@ -1094,6 +1094,11 @@ u32 __skb_get_poff(const struct sk_buff *skb, void *data,
>> __be32 __skb_flow_get_ports(const struct sk_buff *skb, int thoff, u8 ip_proto,
>>                           void *data, int hlen_proto);
>>
>>+static inline bool skb_flow_is_icmp_any(const struct sk_buff *skb, u8 ip_proto)
>>+{
>>+      return flow_protos_are_icmp_any(skb->protocol, ip_proto);
>>+}
>>+
>> static inline __be32 skb_flow_get_ports(const struct sk_buff *skb,
>>                                       int thoff, u8 ip_proto)
>> {
>>diff --git a/include/net/flow_dissector.h b/include/net/flow_dissector.h
>>index c4f31666afd2..5540dfa18872 100644
>>--- a/include/net/flow_dissector.h
>>+++ b/include/net/flow_dissector.h
>>@@ -2,6 +2,7 @@
>> #define _NET_FLOW_DISSECTOR_H
>>
>> #include <linux/types.h>
>>+#include <linux/in.h>
>> #include <linux/in6.h>
>> #include <uapi/linux/if_ether.h>
>>
>>@@ -89,10 +90,15 @@ struct flow_dissector_key_addrs {
>> };
>>
>> /**
>>- * flow_dissector_key_tp_ports:
>>- *    @ports: port numbers of Transport header
>>+ * flow_dissector_key_ports:
>>+ *    @ports: port numbers of Transport header or
>>+ *            type and code of ICMP header
>>+ *            ports: source (high) and destination (low) port numbers
>>  *            src: source port number
>>  *            dst: destination port number
>>+ *            icmp: ICMP type (high) and code (low)
>>+ *            type: ICMP type
>>+ *            type: ICMP code
>>  */
>> struct flow_dissector_key_ports {
>>       union {
>>@@ -101,6 +107,11 @@ struct flow_dissector_key_ports {
>>                       __be16 src;
>>                       __be16 dst;
>>               };
>>+              __be16 icmp;
>>+              struct {
>>+                      u8 type;
>>+                      u8 code;
>>+              };
>
> Digging into this a bit more. I think it would be much nicer not to mix
> up l4 ports and icmp stuff.
>
> How about to have FLOW_DISSECTOR_KEY_ICMP
> and
> struct flow_dissector_key_icmp {
>         u8 type;
>         u8 code;
> };
>
> Then you can make this structure and struct flow_dissector_key_ports into
> a union in struct flow_keys.
>
> Looks much cleaner to me.
>
I agree, this patch adds too many conditionals into the fast path for
ICMP handling. Neither is there much point in using type and code as
input to the packet hash.

Tom

>
>
>>       };
>> };
>>
>>@@ -188,9 +199,42 @@ struct flow_keys_digest {
>> void make_flow_keys_digest(struct flow_keys_digest *digest,
>>                          const struct flow_keys *flow);
>>
>>+static inline bool flow_protos_are_icmpv4(__be16 n_proto, u8 ip_proto)
>>+{
>>+      return n_proto == htons(ETH_P_IP) && ip_proto == IPPROTO_ICMP;
>>+}
>>+
>>+static inline bool flow_protos_are_icmpv6(__be16 n_proto, u8 ip_proto)
>>+{
>>+      return n_proto == htons(ETH_P_IPV6) && ip_proto == IPPROTO_ICMPV6;
>>+}
>>+
>>+static inline bool flow_protos_are_icmp_any(__be16 n_proto, u8 ip_proto)
>>+{
>>+      return flow_protos_are_icmpv4(n_proto, ip_proto) ||
>>+              flow_protos_are_icmpv6(n_proto, ip_proto);
>>+}
>>+
>>+static inline bool flow_basic_key_is_icmpv4(const struct flow_dissector_key_basic *basic)
>>+{
>>+      return flow_protos_are_icmpv4(basic->n_proto, basic->ip_proto);
>>+}
>>+
>>+static inline bool flow_basic_key_is_icmpv6(const struct flow_dissector_key_basic *basic)
>>+{
>>+      return flow_protos_are_icmpv6(basic->n_proto, basic->ip_proto);
>>+}
>>+
>>+static inline bool flow_keys_are_icmp_any(const struct flow_keys *keys)
>>+{
>>+      return flow_protos_are_icmp_any(keys->basic.n_proto,
>>+                                      keys->basic.ip_proto);
>>+}
>>+
>> static inline bool flow_keys_have_l4(const struct flow_keys *keys)
>> {
>>-      return (keys->ports.ports || keys->tags.flow_label);
>>+      return (!flow_keys_are_icmp_any(keys) && keys->ports.ports) ||
>>+              keys->tags.flow_label;
>> }
>>
>> u32 flow_hash_from_keys(struct flow_keys *keys);
>>diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
>>index 1eb6f949e5b2..0584b4bb4390 100644
>>--- a/net/core/flow_dissector.c
>>+++ b/net/core/flow_dissector.c
>>@@ -58,6 +58,28 @@ void skb_flow_dissector_init(struct flow_dissector *flow_dissector,
>> EXPORT_SYMBOL(skb_flow_dissector_init);
>>
>> /**
>>+ * skb_flow_get_be16 - extract be16 entity
>>+ * @skb: sk_buff to extract from
>>+ * @poff: offset to extract at
>>+ * @data: raw buffer pointer to the packet
>>+ * @hlen: packet header length
>>+ *
>>+ * The function will try to retrieve a be32 entity at
>>+ * offset poff
>>+ */
>>+__be16 skb_flow_get_be16(const struct sk_buff *skb, int poff, void *data,
>>+                       int hlen)
>>+{
>>+      __be16 *u, _u;
>>+
>>+      u = __skb_header_pointer(skb, poff, sizeof(_u), data, hlen, &_u);
>>+      if (u)
>>+              return *u;
>>+
>>+      return 0;
>>+}
>>+
>>+/**
>>  * __skb_flow_get_ports - extract the upper layer ports and return them
>>  * @skb: sk_buff to extract the ports from
>>  * @thoff: transport header offset
>>@@ -542,8 +564,13 @@ bool __skb_flow_dissect(const struct sk_buff *skb,
>>               key_ports = skb_flow_dissector_target(flow_dissector,
>>                                                     FLOW_DISSECTOR_KEY_PORTS,
>>                                                     target_container);
>>-              key_ports->ports = __skb_flow_get_ports(skb, nhoff, ip_proto,
>>-                                                      data, hlen);
>>+              if (flow_protos_are_icmp_any(proto, ip_proto))
>>+                      key_ports->icmp = skb_flow_get_be16(skb, nhoff, data,
>>+                                                          hlen);
>>+              else
>>+                      key_ports->ports = __skb_flow_get_ports(skb, nhoff,
>>+                                                              ip_proto, data,
>>+                                                              hlen);
>>       }
>>
>> out_good:
>>@@ -718,7 +745,8 @@ void make_flow_keys_digest(struct flow_keys_digest *digest,
>>
>>       data->n_proto = flow->basic.n_proto;
>>       data->ip_proto = flow->basic.ip_proto;
>>-      data->ports = flow->ports.ports;
>>+      if (flow_keys_have_l4(flow))
>>+              data->ports = flow->ports.ports;
>>       data->src = flow->addrs.v4addrs.src;
>>       data->dst = flow->addrs.v4addrs.dst;
>> }
>>diff --git a/net/sched/cls_flow.c b/net/sched/cls_flow.c
>>index e39672394c7b..a1a7ae71aa62 100644
>>--- a/net/sched/cls_flow.c
>>+++ b/net/sched/cls_flow.c
>>@@ -96,7 +96,7 @@ static u32 flow_get_proto(const struct sk_buff *skb,
>> static u32 flow_get_proto_src(const struct sk_buff *skb,
>>                             const struct flow_keys *flow)
>> {
>>-      if (flow->ports.ports)
>>+      if (!flow_keys_are_icmp_any(flow) && flow->ports.ports)
>>               return ntohs(flow->ports.src);
>>
>>       return addr_fold(skb->sk);
>>@@ -105,7 +105,7 @@ static u32 flow_get_proto_src(const struct sk_buff *skb,
>> static u32 flow_get_proto_dst(const struct sk_buff *skb,
>>                             const struct flow_keys *flow)
>> {
>>-      if (flow->ports.ports)
>>+      if (!flow_keys_are_icmp_any(flow) && flow->ports.ports)
>>               return ntohs(flow->ports.dst);
>>
>>       return addr_fold(skb_dst(skb)) ^ (__force u16) tc_skb_protocol(skb);
>>--
>>2.7.0.rc3.207.g0ac5344
>>

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox