Netdev List
 help / color / mirror / Atom feed
* [net-next PATCH 1/3] bnx2x: link code refactoring
From: Yuval Mintz @ 2012-09-13 12:56 UTC (permalink / raw)
  To: davem, netdev; +Cc: eilong, ariele, Yaniv Rosner, Yuval Mintz
In-Reply-To: <1347540981-16198-1-git-send-email-yuvalmin@broadcom.com>

From: Yaniv Rosner <yaniv.rosner@broadcom.com>

Separate the interrupt setting part of each external PHY to a specific
function.
This allows calling the interrupt setting in case of link-flap avoidance,
since some link owners may not enable the interrupt on their own.

Signed-off-by: Yaniv Rosner <yaniv.rosner@broadcom.com>
Signed-off-by: Yuval Mintz <yuvalmin@broadcom.com>
Signed-off-by: Eilon Greenstein <eilong@broadcom.com>
---
 drivers/net/ethernet/broadcom/bnx2x/bnx2x_link.c |  192 +++++++++++++---------
 drivers/net/ethernet/broadcom/bnx2x/bnx2x_link.h |    1 +
 2 files changed, 114 insertions(+), 79 deletions(-)

diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_link.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_link.c
index f4beb46..05620ef 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_link.c
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_link.c
@@ -7203,6 +7203,22 @@ static void bnx2x_8073_set_pause_cl37(struct link_params *params,
 	msleep(500);
 }
 
+static void bnx2x_8073_specific_func(struct bnx2x_phy *phy,
+				     struct link_params *params,
+				     u32 action)
+{
+	struct bnx2x *bp = params->bp;
+	switch (action) {
+	case PHY_INIT:
+		/* Enable LASI */
+		bnx2x_cl45_write(bp, phy,
+				 MDIO_PMA_DEVAD, MDIO_PMA_LASI_RXCTRL, (1<<2));
+		bnx2x_cl45_write(bp, phy,
+				 MDIO_PMA_DEVAD, MDIO_PMA_LASI_CTRL,  0x0004);
+		break;
+	}
+}
+
 static int bnx2x_8073_config_init(struct bnx2x_phy *phy,
 				  struct link_params *params,
 				  struct link_vars *vars)
@@ -7223,12 +7239,7 @@ static int bnx2x_8073_config_init(struct bnx2x_phy *phy,
 	bnx2x_set_gpio(bp, MISC_REGISTERS_GPIO_1,
 		       MISC_REGISTERS_GPIO_OUTPUT_HIGH, gpio_port);
 
-	/* Enable LASI */
-	bnx2x_cl45_write(bp, phy,
-			 MDIO_PMA_DEVAD, MDIO_PMA_LASI_RXCTRL, (1<<2));
-	bnx2x_cl45_write(bp, phy,
-			 MDIO_PMA_DEVAD, MDIO_PMA_LASI_CTRL,  0x0004);
-
+	bnx2x_8073_specific_func(phy, params, PHY_INIT);
 	bnx2x_8073_set_pause_cl37(params, phy, vars);
 
 	bnx2x_cl45_read(bp, phy,
@@ -8263,7 +8274,7 @@ static void bnx2x_8727_specific_func(struct bnx2x_phy *phy,
 				     u32 action)
 {
 	struct bnx2x *bp = params->bp;
-
+	u16 val;
 	switch (action) {
 	case DISABLE_TX:
 		bnx2x_sfp_set_transmitter(params, phy, 0);
@@ -8272,6 +8283,40 @@ static void bnx2x_8727_specific_func(struct bnx2x_phy *phy,
 		if (!(phy->flags & FLAGS_SFP_NOT_APPROVED))
 			bnx2x_sfp_set_transmitter(params, phy, 1);
 		break;
+	case PHY_INIT:
+		bnx2x_cl45_write(bp, phy,
+				 MDIO_PMA_DEVAD, MDIO_PMA_LASI_RXCTRL,
+				 (1<<2) | (1<<5));
+		bnx2x_cl45_write(bp, phy,
+				 MDIO_PMA_DEVAD, MDIO_PMA_LASI_TXCTRL,
+				 0);
+		bnx2x_cl45_write(bp, phy,
+				 MDIO_PMA_DEVAD, MDIO_PMA_LASI_CTRL, 0x0006);
+		/* Make MOD_ABS give interrupt on change */
+		bnx2x_cl45_read(bp, phy, MDIO_PMA_DEVAD,
+				MDIO_PMA_REG_8727_PCS_OPT_CTRL,
+				&val);
+		val |= (1<<12);
+		if (phy->flags & FLAGS_NOC)
+			val |= (3<<5);
+		/* Set 8727 GPIOs to input to allow reading from the 8727 GPIO0
+		 * status which reflect SFP+ module over-current
+		 */
+		if (!(phy->flags & FLAGS_NOC))
+			val &= 0xff8f; /* Reset bits 4-6 */
+		bnx2x_cl45_write(bp, phy,
+				 MDIO_PMA_DEVAD, MDIO_PMA_REG_8727_PCS_OPT_CTRL,
+				 val);
+
+		/* Set 2-wire transfer rate of SFP+ module EEPROM
+		 * to 100Khz since some DACs(direct attached cables) do
+		 * not work at 400Khz.
+		 */
+		bnx2x_cl45_write(bp, phy,
+				 MDIO_PMA_DEVAD,
+				 MDIO_PMA_REG_8727_TWO_WIRE_SLAVE_ADDR,
+				 0xa001);
+		break;
 	default:
 		DP(NETIF_MSG_LINK, "Function 0x%x not supported by 8727\n",
 		   action);
@@ -9054,28 +9099,15 @@ static int bnx2x_8727_config_init(struct bnx2x_phy *phy,
 				  struct link_vars *vars)
 {
 	u32 tx_en_mode;
-	u16 tmp1, val, mod_abs, tmp2;
-	u16 rx_alarm_ctrl_val;
-	u16 lasi_ctrl_val;
+	u16 tmp1, mod_abs, tmp2;
 	struct bnx2x *bp = params->bp;
 	/* Enable PMD link, MOD_ABS_FLT, and 1G link alarm */
 
 	bnx2x_wait_reset_complete(bp, phy, params);
-	rx_alarm_ctrl_val = (1<<2) | (1<<5) ;
-	/* Should be 0x6 to enable XS on Tx side. */
-	lasi_ctrl_val = 0x0006;
 
 	DP(NETIF_MSG_LINK, "Initializing BCM8727\n");
-	/* Enable LASI */
-	bnx2x_cl45_write(bp, phy,
-			 MDIO_PMA_DEVAD, MDIO_PMA_LASI_RXCTRL,
-			 rx_alarm_ctrl_val);
-	bnx2x_cl45_write(bp, phy,
-			 MDIO_PMA_DEVAD, MDIO_PMA_LASI_TXCTRL,
-			 0);
-	bnx2x_cl45_write(bp, phy,
-			 MDIO_PMA_DEVAD, MDIO_PMA_LASI_CTRL, lasi_ctrl_val);
 
+	bnx2x_8727_specific_func(phy, params, PHY_INIT);
 	/* Initially configure MOD_ABS to interrupt when module is
 	 * presence( bit 8)
 	 */
@@ -9091,25 +9123,9 @@ static int bnx2x_8727_config_init(struct bnx2x_phy *phy,
 	bnx2x_cl45_write(bp, phy,
 			 MDIO_PMA_DEVAD, MDIO_PMA_REG_PHY_IDENTIFIER, mod_abs);
 
-
 	/* Enable/Disable PHY transmitter output */
 	bnx2x_set_disable_pmd_transmit(params, phy, 0);
 
-	/* Make MOD_ABS give interrupt on change */
-	bnx2x_cl45_read(bp, phy, MDIO_PMA_DEVAD, MDIO_PMA_REG_8727_PCS_OPT_CTRL,
-			&val);
-	val |= (1<<12);
-	if (phy->flags & FLAGS_NOC)
-		val |= (3<<5);
-
-	/* Set 8727 GPIOs to input to allow reading from the 8727 GPIO0
-	 * status which reflect SFP+ module over-current
-	 */
-	if (!(phy->flags & FLAGS_NOC))
-		val &= 0xff8f; /* Reset bits 4-6 */
-	bnx2x_cl45_write(bp, phy,
-			 MDIO_PMA_DEVAD, MDIO_PMA_REG_8727_PCS_OPT_CTRL, val);
-
 	bnx2x_8727_power_module(bp, phy, 1);
 
 	bnx2x_cl45_read(bp, phy,
@@ -9119,13 +9135,7 @@ static int bnx2x_8727_config_init(struct bnx2x_phy *phy,
 			MDIO_PMA_DEVAD, MDIO_PMA_LASI_RXSTAT, &tmp1);
 
 	bnx2x_8727_config_speed(phy, params);
-	/* Set 2-wire transfer rate of SFP+ module EEPROM
-	 * to 100Khz since some DACs(direct attached cables) do
-	 * not work at 400Khz.
-	 */
-	bnx2x_cl45_write(bp, phy,
-			 MDIO_PMA_DEVAD, MDIO_PMA_REG_8727_TWO_WIRE_SLAVE_ADDR,
-			 0xa001);
+
 
 	/* Set TX PreEmphasis if needed */
 	if ((params->feature_config_flags &
@@ -9554,6 +9564,29 @@ static void bnx2x_848xx_set_led(struct bnx2x *bp,
 			 0xFFFB, 0xFFFD);
 }
 
+static void bnx2x_848xx_specific_func(struct bnx2x_phy *phy,
+				      struct link_params *params,
+				      u32 action)
+{
+	struct bnx2x *bp = params->bp;
+	switch (action) {
+	case PHY_INIT:
+		if (phy->type != PORT_HW_CFG_XGXS_EXT_PHY_TYPE_BCM84833) {
+			/* Save spirom version */
+			bnx2x_save_848xx_spirom_version(phy, bp, params->port);
+		}
+		/* This phy uses the NIG latch mechanism since link indication
+		 * arrives through its LED4 and not via its LASI signal, so we
+		 * get steady signal instead of clear on read
+		 */
+		bnx2x_bits_en(bp, NIG_REG_LATCH_BC_0 + params->port*4,
+			      1 << NIG_LATCH_BC_ENABLE_MI_INT);
+
+		bnx2x_848xx_set_led(bp, phy);
+		break;
+	}
+}
+
 static int bnx2x_848xx_cmn_config_init(struct bnx2x_phy *phy,
 				       struct link_params *params,
 				       struct link_vars *vars)
@@ -9561,22 +9594,10 @@ static int bnx2x_848xx_cmn_config_init(struct bnx2x_phy *phy,
 	struct bnx2x *bp = params->bp;
 	u16 autoneg_val, an_1000_val, an_10_100_val, an_10g_val;
 
-	if (phy->type != PORT_HW_CFG_XGXS_EXT_PHY_TYPE_BCM84833) {
-		/* Save spirom version */
-		bnx2x_save_848xx_spirom_version(phy, bp, params->port);
-	}
-	/* This phy uses the NIG latch mechanism since link indication
-	 * arrives through its LED4 and not via its LASI signal, so we
-	 * get steady signal instead of clear on read
-	 */
-	bnx2x_bits_en(bp, NIG_REG_LATCH_BC_0 + params->port*4,
-		      1 << NIG_LATCH_BC_ENABLE_MI_INT);
-
+	bnx2x_848xx_specific_func(phy, params, PHY_INIT);
 	bnx2x_cl45_write(bp, phy,
 			 MDIO_PMA_DEVAD, MDIO_PMA_REG_CTRL, 0x0000);
 
-	bnx2x_848xx_set_led(bp, phy);
-
 	/* set 1000 speed advertisement */
 	bnx2x_cl45_read(bp, phy,
 			MDIO_AN_DEVAD, MDIO_AN_REG_8481_1000T_CTRL,
@@ -10565,6 +10586,35 @@ static void bnx2x_848xx_set_link_led(struct bnx2x_phy *phy,
 /******************************************************************/
 /*			54618SE PHY SECTION			  */
 /******************************************************************/
+static void bnx2x_54618se_specific_func(struct bnx2x_phy *phy,
+					struct link_params *params,
+					u32 action)
+{
+	struct bnx2x *bp = params->bp;
+	u16 temp;
+	switch (action) {
+	case PHY_INIT:
+		/* Configure LED4: set to INTR (0x6). */
+		/* Accessing shadow register 0xe. */
+		bnx2x_cl22_write(bp, phy,
+				 MDIO_REG_GPHY_SHADOW,
+				 MDIO_REG_GPHY_SHADOW_LED_SEL2);
+		bnx2x_cl22_read(bp, phy,
+				MDIO_REG_GPHY_SHADOW,
+				&temp);
+		temp &= ~(0xf << 4);
+		temp |= (0x6 << 4);
+		bnx2x_cl22_write(bp, phy,
+				 MDIO_REG_GPHY_SHADOW,
+				 MDIO_REG_GPHY_SHADOW_WR_ENA | temp);
+		/* Configure INTR based on link status change. */
+		bnx2x_cl22_write(bp, phy,
+				 MDIO_REG_INTR_MASK,
+				 ~MDIO_REG_INTR_MASK_LINK_STATUS);
+		break;
+	}
+}
+
 static int bnx2x_54618se_config_init(struct bnx2x_phy *phy,
 					       struct link_params *params,
 					       struct link_vars *vars)
@@ -10602,24 +10652,8 @@ static int bnx2x_54618se_config_init(struct bnx2x_phy *phy,
 	/* Wait for GPHY to reset */
 	msleep(50);
 
-	/* Configure LED4: set to INTR (0x6). */
-	/* Accessing shadow register 0xe. */
-	bnx2x_cl22_write(bp, phy,
-			MDIO_REG_GPHY_SHADOW,
-			MDIO_REG_GPHY_SHADOW_LED_SEL2);
-	bnx2x_cl22_read(bp, phy,
-			MDIO_REG_GPHY_SHADOW,
-			&temp);
-	temp &= ~(0xf << 4);
-	temp |= (0x6 << 4);
-	bnx2x_cl22_write(bp, phy,
-			MDIO_REG_GPHY_SHADOW,
-			MDIO_REG_GPHY_SHADOW_WR_ENA | temp);
-	/* Configure INTR based on link status change. */
-	bnx2x_cl22_write(bp, phy,
-			MDIO_REG_INTR_MASK,
-			~MDIO_REG_INTR_MASK_LINK_STATUS);
 
+	bnx2x_54618se_specific_func(phy, params, PHY_INIT);
 	/* Flip the signal detect polarity (set 0x1c.0x1e[8]). */
 	bnx2x_cl22_write(bp, phy,
 			MDIO_REG_GPHY_SHADOW,
@@ -11349,7 +11383,7 @@ static struct bnx2x_phy phy_8073 = {
 	.format_fw_ver	= (format_fw_ver_t)bnx2x_format_ver,
 	.hw_reset	= (hw_reset_t)NULL,
 	.set_link_led	= (set_link_led_t)NULL,
-	.phy_specific_func = (phy_specific_func_t)NULL
+	.phy_specific_func = (phy_specific_func_t)bnx2x_8073_specific_func
 };
 static struct bnx2x_phy phy_8705 = {
 	.type		= PORT_HW_CFG_XGXS_EXT_PHY_TYPE_BCM8705,
@@ -11542,7 +11576,7 @@ static struct bnx2x_phy phy_84823 = {
 	.format_fw_ver	= (format_fw_ver_t)bnx2x_848xx_format_ver,
 	.hw_reset	= (hw_reset_t)NULL,
 	.set_link_led	= (set_link_led_t)bnx2x_848xx_set_link_led,
-	.phy_specific_func = (phy_specific_func_t)NULL
+	.phy_specific_func = (phy_specific_func_t)bnx2x_848xx_specific_func
 };
 
 static struct bnx2x_phy phy_84833 = {
@@ -11578,7 +11612,7 @@ static struct bnx2x_phy phy_84833 = {
 	.format_fw_ver	= (format_fw_ver_t)bnx2x_848xx_format_ver,
 	.hw_reset	= (hw_reset_t)bnx2x_84833_hw_reset_phy,
 	.set_link_led	= (set_link_led_t)bnx2x_848xx_set_link_led,
-	.phy_specific_func = (phy_specific_func_t)NULL
+	.phy_specific_func = (phy_specific_func_t)bnx2x_848xx_specific_func
 };
 
 static struct bnx2x_phy phy_54618se = {
@@ -11612,7 +11646,7 @@ static struct bnx2x_phy phy_54618se = {
 	.format_fw_ver	= (format_fw_ver_t)NULL,
 	.hw_reset	= (hw_reset_t)NULL,
 	.set_link_led	= (set_link_led_t)bnx2x_5461x_set_link_led,
-	.phy_specific_func = (phy_specific_func_t)NULL
+	.phy_specific_func = (phy_specific_func_t)bnx2x_54618se_specific_func
 };
 /*****************************************************************/
 /*                                                               */
diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_link.h b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_link.h
index 51cac81..600ffda 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_link.h
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_link.h
@@ -216,6 +216,7 @@ struct bnx2x_phy {
 	phy_specific_func_t phy_specific_func;
 #define DISABLE_TX	1
 #define ENABLE_TX	2
+#define PHY_INIT	3
 };
 
 /* Inputs parameters to the CLC */
-- 
1.7.9.rc2

^ permalink raw reply related

* [net-next PATCH 0/3] bnx2x: Link flap avoidance added
From: Yuval Mintz @ 2012-09-13 12:56 UTC (permalink / raw)
  To: davem, netdev; +Cc: eilong, ariele, Yuval Mintz

Hi Dave,

In various flows in the bnx2x driver, the link is toggled unnecessarily -
In such flows, if the link is already up it would be pulled down than
raised up again, even if no change in the link was requested by the
user.

This patch series tries to eliminate this problem, or at least to greatly
reduce the number of cases that would actually cause such a scenario to
happen.

Please consider applying this patch series to 'net-next'.

Thanks,
Yuval

^ permalink raw reply

* Re: GRO aggregation
From: Or Gerlitz @ 2012-09-13 12:47 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Shlomo Pongartz, Rick Jones, netdev@vger.kernel.org, Tom Herbert,
	Yevgeny Petrilin
In-Reply-To: <1347537926.13103.1530.camel@edumazet-glaptop>

On Thu, Sep 13, 2012 at 3:05 PM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
> On Thu, 2012-09-13 at 12:59 +0300, Or Gerlitz wrote:
>> On Thu, Sep 13, 2012 at 11:11 AM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
>> > MAX_SKB_FRAGS is 16
>> > skb_gro_receive() will return -E2BIG once this limit is hit.
>> > If you use a MSS = 100 (instead of MSS = 1460), then GRO skb will
>> > contain only at most 1700 bytes, but TSO packets can still be 64KB, if
>> > the sender NIC can afford it (some NICS wont work quite well)

>> Addressing this assertion of yours, Shlomo showed that with ixgbe he managed
>> to see GRO aggregating 32KB which means 20-21 packets that is > 16 fragments
>> in this notation, can it be related to the way ixgbe is actually allocating skbs?

> Hard to say without knowing exact kernel version, as things change a lot in this area.

As Shlomo wrote earlier on this thread his testbed is 3.6-rc1


> You have several kind of GRO. One fast and one slow.
> The slow one uses a linked list of skbs (pinfo->frag_list), while the
> fast one uses fragments (pinfo->nr_frags)
>
> For example, some drivers (mellanox one is in this lot) pull too many
> bytes in skb->head and this defeats the fast GRO :
> Part of payload is in skb->head, remaining part in pinfo->frags[0]
>
> skb_gro_receive() then has to allocate a new head skb, to link skbs into
> head->frag_list. The total skb->truesize is not reduced at all, its
> increased.
>
> So you might think GRO is working, but its only a hack, as one skb has a
> list of skbs, and this makes TCP read() slower, and defeats TCP
> coalescing as well. Whats the point of delivering fat skbs to TCP stack
> if it slows down the consumer, because of increased cache line misses ?

Shlomo is dealing with making the IPoIB driver work well with GRO,
thanks for the
comments on the Mellanox Ethernet driver, we will look there too
(added Yevgeny)...

As for IPoIB it has two modes, connected which irrelevant for this
discussion, and datagram
- who is under the  scope here. Its MTU is typically 2044 but can be
4092 as well, the allocation
of skb's for this mode is done in ipoib_alloc_rx_skb() -- which you've
patched recently...

Following your comment we noted that if using the lower/typical mtu of
2044 which means
we are below the ipoib_ud_need_sg() threshold, skbs are allocated on
one "form" and if using
the 4092 mtu in another "form" - do you see each of the form to fall
into different GRO flow, e.g
2044 to the "slow" and 4092 to the "fast"?!

Or.

^ permalink raw reply

* Re: GRO aggregation
From: Eric Dumazet @ 2012-09-13 12:34 UTC (permalink / raw)
  To: Or Gerlitz
  Cc: Shlomo Pongartz, Rick Jones, netdev@vger.kernel.org, Tom Herbert
In-Reply-To: <1347537926.13103.1530.camel@edumazet-glaptop>

On Thu, 2012-09-13 at 14:05 +0200, Eric Dumazet wrote:

> But there is no real difference in throughput.
> 
> diff --git a/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h b/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h
> index 6c4f935..435c35e 100644
> --- a/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h
> +++ b/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h
> @@ -96,8 +96,8 @@
>  /* Receive fragment sizes; we use at most 4 fragments (for 9600 byte MTU
>   * and 4K allocations) */
>  enum {
> -	FRAG_SZ0 = 512 - NET_IP_ALIGN,
> -	FRAG_SZ1 = 1024,
> +	FRAG_SZ0 = 1536 - NET_IP_ALIGN,
> +	FRAG_SZ1 = 2048,
>         FRAG_SZ2 = 4096,
>         FRAG_SZ3 = MLX4_EN_ALLOC_SIZE
>  };
> 

Oh well, adding one prefetch() is giving ~10% more throughput.

I guess this mlx4 driver needs some care.

diff --git a/drivers/net/ethernet/mellanox/mlx4/en_rx.c b/drivers/net/ethernet/mellanox/mlx4/en_rx.c
index 5aba5ec..547eec8 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_rx.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_rx.c
@@ -38,6 +38,7 @@
 #include <linux/if_ether.h>
 #include <linux/if_vlan.h>
 #include <linux/vmalloc.h>
+#include <linux/prefetch.h>
 
 #include "mlx4_en.h"
 
@@ -617,7 +618,8 @@ int mlx4_en_process_rx_cq(struct net_device *dev, struct mlx4_en_cq *cq, int bud
 		    !((dev->features & NETIF_F_LOOPBACK) ||
 		      priv->validate_loopback))
 			goto next;
-
+		/* avoid cache miss in tcp_gro_receive() */
+		prefetch((char *)ethh + 64);
 		/*
 		 * Packet is OK - process it.
 		 */

^ permalink raw reply related

* Re: GRO aggregation
From: Eric Dumazet @ 2012-09-13 12:05 UTC (permalink / raw)
  To: Or Gerlitz
  Cc: Shlomo Pongartz, Rick Jones, netdev@vger.kernel.org, Tom Herbert
In-Reply-To: <CAJZOPZL8qrqYfAvcQBDB9CFy7WwztWchaMyADGBnwKpW-r1Q4g@mail.gmail.com>

On Thu, 2012-09-13 at 12:59 +0300, Or Gerlitz wrote:
> On Thu, Sep 13, 2012 at 11:11 AM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
> > MAX_SKB_FRAGS is 16
> > skb_gro_receive() will return -E2BIG once this limit is hit.
> > If you use a MSS = 100 (instead of MSS = 1460), then GRO skb will
> > contain only at most 1700 bytes, but TSO packets can still be 64KB, if
> > the sender NIC can afford it (some NICS wont work quite well)
> 
> Hi Eric,
> 
> Addressing this assertion of yours, Shlomo showed that with ixgbe he managed
> to see GRO aggregating 32KB which means 20-21 packets that is > 16 fragments
> in this notation, can it be related to the way ixgbe is actually
> allocating skbs?
> 

Hard to say without knowing exact kernel version, as things change a lot
in this area.

You have several kind of GRO. One fast and one slow.

The slow one uses a linked list of skbs (pinfo->frag_list), while the
fast one uses fragments (pinfo->nr_frags)

For example, some drivers (mellanox one is in this lot) pull too many
bytes in skb->head and this defeats the fast GRO :
Part of payload is in skb->head, remaining part in pinfo->frags[0]

skb_gro_receive() then has to allocate a new head skb, to link skbs into
head->frag_list. The total skb->truesize is not reduced at all, its
increased.

So you might think GRO is working, but its only a hack, as one skb has a
list of skbs, and this makes TCP read() slower, and defeats TCP
coalescing as well. Whats the point of delivering fat skbs to TCP stack
if it slows down the consumer, because of increased cache line misses ?

I am not _very_ interested in the slow GRO behavior, I try to improve
the fast path.

ixgbe uses the fast GRO, at least on recent kernels.

In my tests on mellanox, it only aggregates 8 frames per skb, and still
we reach 10Gbps...

03:41:40.128074 IP 7.7.7.84.38079 > 7.7.7.83.52113: . 1563841:1575425(11584) ack 0 win 229 <nop,nop,timestamp 137349733 152427711>
03:41:40.128080 IP 7.7.7.84.38079 > 7.7.7.83.52113: . 1575425:1587009(11584) ack 0 win 229 <nop,nop,timestamp 137349733 152427711>
03:41:40.128085 IP 7.7.7.84.38079 > 7.7.7.83.52113: . 1587009:1598593(11584) ack 0 win 229 <nop,nop,timestamp 137349733 152427711>
03:41:40.128089 IP 7.7.7.84.38079 > 7.7.7.83.52113: . 1598593:1610177(11584) ack 0 win 229 <nop,nop,timestamp 137349733 152427711>
03:41:40.128093 IP 7.7.7.84.38079 > 7.7.7.83.52113: . 1610177:1621761(11584) ack 0 win 229 <nop,nop,timestamp 137349733 152427711>
03:41:40.128103 IP 7.7.7.84.38079 > 7.7.7.83.52113: . 1633345:1644929(11584) ack 0 win 229 <nop,nop,timestamp 137349733 152427711>
03:41:40.128116 IP 7.7.7.84.38079 > 7.7.7.83.52113: . 1668097:1679681(11584) ack 0 win 229 <nop,nop,timestamp 137349733 152427711>
03:41:40.128121 IP 7.7.7.84.38079 > 7.7.7.83.52113: . 1679681:1691265(11584) ack 0 win 229 <nop,nop,timestamp 137349733 152427711>
03:41:40.128134 IP 7.7.7.84.38079 > 7.7.7.83.52113: . 1714433:1726017(11584) ack 0 win 229 <nop,nop,timestamp 137349733 152427711>
03:41:40.128146 IP 7.7.7.84.38079 > 7.7.7.83.52113: . 1749185:1759321(10136) ack 0 win 229 <nop,nop,timestamp 137349733 152427711>
03:41:40.128163 IP 7.7.7.83.52113 > 7.7.7.84.38079: . ack 1575425 win 4147 <nop,nop,timestamp 152427711 137349733>
03:41:40.128193 IP 7.7.7.83.52113 > 7.7.7.84.38079: . ack 1759321 win 3339 <nop,nop,timestamp 152427711 137349733>

And it aggregates 8 frames per skb because each individual frame uses 2 fragments :

One of 512 bytes and one of 1024 bytes : total of 1536 bytes,
instead of the typical 2048 bytes used by other NIC

To get better performance, mellanox could use only one frag
per MTU (if MTU <= 1500), using 1536 bytes frags.

I tried this and this gives now :

05:00:12.507398 IP 7.7.7.84.63422 > 7.7.7.83.37622: . 2064384:2089000(24616) ack 1 win 229 <nop,nop,timestamp 142062123 4294793380>
05:00:12.507419 IP 7.7.7.84.63422 > 7.7.7.83.37622: . 2138232:2161400(23168) ack 1 win 229 <nop,nop,timestamp 142062123 4294793380>
05:00:12.507489 IP 7.7.7.84.63422 > 7.7.7.83.37622: . 2244664:2269280(24616) ack 1 win 229 <nop,nop,timestamp 142062123 4294793380>
05:00:12.507509 IP 7.7.7.83.37622 > 7.7.7.84.63422: . ack 2244664 win 16384 <nop,nop,timestamp 4294793380 142062123>

But there is no real difference in throughput.

diff --git a/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h b/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h
index 6c4f935..435c35e 100644
--- a/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h
+++ b/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h
@@ -96,8 +96,8 @@
 /* Receive fragment sizes; we use at most 4 fragments (for 9600 byte MTU
  * and 4K allocations) */
 enum {
-	FRAG_SZ0 = 512 - NET_IP_ALIGN,
-	FRAG_SZ1 = 1024,
+	FRAG_SZ0 = 1536 - NET_IP_ALIGN,
+	FRAG_SZ1 = 2048,
        FRAG_SZ2 = 4096,
        FRAG_SZ3 = MLX4_EN_ALLOC_SIZE
 };

^ permalink raw reply related

* Re: [PATCH net-next V3 1/2] IB/ipoib: Add rtnl_link_ops support
From: Or Gerlitz @ 2012-09-13 11:28 UTC (permalink / raw)
  To: Rami Rosen; +Cc: Patrick McHardy, Eric Dumazet, netdev, Shlomo Pongratz
In-Reply-To: <CAKoUAr=8zZA9EgvJEVrd0a-Uw=zzksHj+5P2Sp3Lb4vXgSJRYA@mail.gmail.com>

On 12/09/2012 17:53, Rami Rosen wrote:
>  From the dump of CPU #1, it seems indeed not related at all to "modprobe -r".
>
> Could it be that there is some IB stack sysfs write activity?
> (regardless of the modprobe -r" you issued) ?  I see some candidates for it.
>
> delete_child() is a method of the IB stack (ipoib/ipoib_main.c)
>
> Maybe in order to help debug the problem, you might try to add in
> delete_child() method, print of the name of the attribute which is being deleted?
>
> (struct device_attribute has a a member "struct attribute attr",
> which in turn has  "const char *name").
>

> the existing dependency chain (in reverse order) is:
>
> -> #1 (rtnl_mutex){+.+.+.}:
>        [<ffffffff81072b30>] lock_acquire+0x14f/0x19b
>        [<ffffffff81396a43>] mutex_lock_nested+0x64/0x2ce
>        [<ffffffff812fc103>] rtnl_lock+0x12/0x14
>        [<ffffffff812eecf1>] netdev_run_todo+0xa5/0x27e
>        [<ffffffff812fc0dd>] rtnl_unlock+0x9/0xb
>        [<ffffffffa0394889>] ipoib_vlan_delete+0x111/0x148 [ib_ipoib]
>        [<ffffffffa038d29b>] delete_child+0x44/0x60 [ib_ipoib]
>        [<ffffffff81247bd8>] dev_attr_store+0x1b/0x1d
>        [<ffffffff8114e223>] sysfs_write_file+0x103/0x13f
>        [<ffffffff810f206b>] vfs_write+0xae/0x133
>        [<ffffffff810f21a9>] sys_write+0x45/0x6c
>        [<ffffffff813a05e2>] system_call_fastpath+0x16/0x1b 

I've added code in ipoib_delete_child to print the caller PID and dump 
the stack,
its triggeredwhen I do "echo 0x8001 > /sys/class/net/ib0/delete_child" 
but the lockdep
warningis raised only when I actually unload the module, and no print... 
that is
ipoib_delete_child  isn't called.

Or.

^ permalink raw reply

* Re: [PATCH net-next V3 1/2] IB/ipoib: Add rtnl_link_ops support
From: Or Gerlitz @ 2012-09-13 10:54 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: Rami Rosen, Patrick McHardy, netdev, Shlomo Pongratz
In-Reply-To: <1347462823.13103.1085.camel@edumazet-glaptop>

On 12/09/2012 18:13, Eric Dumazet wrote:
> It might be related to module load/unload udevd or some external 
> daemon can access sysfs files while you unload the module 

Hi Eric,

I see, the IPoIB add/delete child sysfs handlers (ipoib_vlan_add/delete) 
use
RTNL locking to protect against netdev changes while the handlers are in 
action.

IPoIB uses devide_create_file to add the sysfs entries, but doesn't care 
to use
device_remove_file as these entries sit under the netdevice 
/sys/class/net/DEV
directory and are removed by higher layer when DEV gets 
unregistered/deleted, I'm
not sure if/how the driver is supposed to protect against access to 
these entries
while going down, when the module is unloaded, etc, any idea will be 
appreciated.


Or.

^ permalink raw reply

* [PATCH 3/5] netfilter: nfnetlink_queue: remove pointless conditional before kfree_skb()
From: pablo @ 2012-09-13 11:01 UTC (permalink / raw)
  To: netfilter-devel; +Cc: davem, netdev
In-Reply-To: <1347534092-3579-1-git-send-email-pablo@netfilter.org>

From: Wei Yongjun <yongjun_wei@trendmicro.com.cn>

Remove pointless conditional before kfree_skb().

Signed-off-by: Wei Yongjun <yongjun_wei@trendmicro.com.cn>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nfnetlink_queue_core.c |    3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/net/netfilter/nfnetlink_queue_core.c b/net/netfilter/nfnetlink_queue_core.c
index c0496a5..5c2d78d 100644
--- a/net/netfilter/nfnetlink_queue_core.c
+++ b/net/netfilter/nfnetlink_queue_core.c
@@ -406,8 +406,7 @@ nfqnl_build_packet_message(struct nfqnl_instance *queue,
 	return skb;
 
 nla_put_failure:
-	if (skb)
-		kfree_skb(skb);
+	kfree_skb(skb);
 	net_err_ratelimited("nf_queue: error creating packet message\n");
 	return NULL;
 }
-- 
1.7.10.4

^ permalink raw reply related

* [PATCH 5/5] netfilter: ctnetlink: fix module auto-load in ctnetlink_parse_nat
From: pablo @ 2012-09-13 11:01 UTC (permalink / raw)
  To: netfilter-devel; +Cc: davem, netdev
In-Reply-To: <1347534092-3579-1-git-send-email-pablo@netfilter.org>

From: Pablo Neira Ayuso <pablo@netfilter.org>

(c7232c9 netfilter: add protocol independent NAT core) added
incorrect locking for the module auto-load case in ctnetlink_parse_nat.

That function is always called from ctnetlink_create_conntrack which
requires no locking.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nf_conntrack_netlink.c |    3 ---
 1 file changed, 3 deletions(-)

diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
index a205bd6..090d267 100644
--- a/net/netfilter/nf_conntrack_netlink.c
+++ b/net/netfilter/nf_conntrack_netlink.c
@@ -1120,16 +1120,13 @@ ctnetlink_parse_nat_setup(struct nf_conn *ct,
 	if (err == -EAGAIN) {
 #ifdef CONFIG_MODULES
 		rcu_read_unlock();
-		spin_unlock_bh(&nf_conntrack_lock);
 		nfnl_unlock();
 		if (request_module("nf-nat-%u", nf_ct_l3num(ct)) < 0) {
 			nfnl_lock();
-			spin_lock_bh(&nf_conntrack_lock);
 			rcu_read_lock();
 			return -EOPNOTSUPP;
 		}
 		nfnl_lock();
-		spin_lock_bh(&nf_conntrack_lock);
 		rcu_read_lock();
 #else
 		err = -EOPNOTSUPP;
-- 
1.7.10.4


^ permalink raw reply related

* [PATCH 4/5] ipvs: use list_del_init instead of list_del/INIT_LIST_HEAD
From: pablo @ 2012-09-13 11:01 UTC (permalink / raw)
  To: netfilter-devel; +Cc: davem, netdev
In-Reply-To: <1347534092-3579-1-git-send-email-pablo@netfilter.org>

From: Wei Yongjun <yongjun_wei@trendmicro.com.cn>

Using list_del_init() instead of list_del() + INIT_LIST_HEAD().

spatch with a semantic match is used to found this problem.
(http://coccinelle.lip6.fr/)

Signed-off-by: Wei Yongjun <yongjun_wei@trendmicro.com.cn>
Acked-by: Simon Horman <horms@verge.net.au>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/ipvs/ip_vs_ctl.c |    3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c
index 767cc12..37b38d0 100644
--- a/net/netfilter/ipvs/ip_vs_ctl.c
+++ b/net/netfilter/ipvs/ip_vs_ctl.c
@@ -539,8 +539,7 @@ static int ip_vs_rs_unhash(struct ip_vs_dest *dest)
 	 * Remove it from the rs_table table.
 	 */
 	if (!list_empty(&dest->d_list)) {
-		list_del(&dest->d_list);
-		INIT_LIST_HEAD(&dest->d_list);
+		list_del_init(&dest->d_list);
 	}
 
 	return 1;
-- 
1.7.10.4


^ permalink raw reply related

* [PATCH 2/5] netfilter: nf_nat: fix out-of-bounds access in address selection
From: pablo @ 2012-09-13 11:01 UTC (permalink / raw)
  To: netfilter-devel; +Cc: davem, netdev
In-Reply-To: <1347534092-3579-1-git-send-email-pablo@netfilter.org>

From: Florian Westphal <fw@strlen.de>

include/linux/jhash.h:138:16: warning: array subscript is above array bounds
[jhash2() expects the number of u32 in the key]

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nf_nat_core.c |    2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c
index 29d4452..1816ad3 100644
--- a/net/netfilter/nf_nat_core.c
+++ b/net/netfilter/nf_nat_core.c
@@ -255,7 +255,7 @@ find_best_ips_proto(u16 zone, struct nf_conntrack_tuple *tuple,
 	 * client coming from the same IP (some Internet Banking sites
 	 * like this), even across reboots.
 	 */
-	j = jhash2((u32 *)&tuple->src.u3, sizeof(tuple->src.u3),
+	j = jhash2((u32 *)&tuple->src.u3, sizeof(tuple->src.u3) / sizeof(u32),
 		   range->flags & NF_NAT_RANGE_PERSISTENT ?
 			0 : (__force u32)tuple->dst.u3.all[max] ^ zone);
 
-- 
1.7.10.4


^ permalink raw reply related

* [PATCH 1/5] netfilter: fix crash during boot if NAT has been compiled built-in
From: pablo @ 2012-09-13 11:01 UTC (permalink / raw)
  To: netfilter-devel; +Cc: davem, netdev
In-Reply-To: <1347534092-3579-1-git-send-email-pablo@netfilter.org>

From: Pablo Neira Ayuso <pablo@netfilter.org>

(c7232c9 netfilter: add protocol independent NAT core) introduced a
problem that leads to crashing during boot due to NULL pointer
dereference. It seems that xt_nat calls xt_register_target() before
xt_init():

net/netfilter/x_tables.c:static struct xt_af *xt; is NULL and we crash on
xt_register_target(struct xt_target *target)
{
        u_int8_t af = target->family;
        int ret;

        ret = mutex_lock_interruptible(&xt[af].mutex);
...

Fix this by changing the linking order, to make sure that x_tables
comes before xt_nat.

Reported-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/Makefile |    2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile
index 98244d4..0baa3f1 100644
--- a/net/netfilter/Makefile
+++ b/net/netfilter/Makefile
@@ -47,7 +47,6 @@ nf_nat-y	:= nf_nat_core.o nf_nat_proto_unknown.o nf_nat_proto_common.o \
 		   nf_nat_proto_udp.o nf_nat_proto_tcp.o nf_nat_helper.o
 
 obj-$(CONFIG_NF_NAT) += nf_nat.o
-obj-$(CONFIG_NF_NAT) += xt_nat.o
 
 # NAT protocols (nf_nat)
 obj-$(CONFIG_NF_NAT_PROTO_DCCP) += nf_nat_proto_dccp.o
@@ -71,6 +70,7 @@ obj-$(CONFIG_NETFILTER_XTABLES) += x_tables.o xt_tcpudp.o
 obj-$(CONFIG_NETFILTER_XT_MARK) += xt_mark.o
 obj-$(CONFIG_NETFILTER_XT_CONNMARK) += xt_connmark.o
 obj-$(CONFIG_NETFILTER_XT_SET) += xt_set.o
+obj-$(CONFIG_NF_NAT) += xt_nat.o
 
 # targets
 obj-$(CONFIG_NETFILTER_XT_TARGET_AUDIT) += xt_AUDIT.o
-- 
1.7.10.4


^ permalink raw reply related

* [PATCH 0/5] Netfilter updates for net-next
From: pablo @ 2012-09-13 11:01 UTC (permalink / raw)
  To: netfilter-devel; +Cc: davem, netdev

From: Pablo Neira Ayuso <pablo@netfilter.org>

Hi David,

The following patchset contains four Netfilter updates, mostly targeting
to fix issues added with IPv6 NAT, and one little IPVS update for net-next:

* Remove unneeded conditional free of skb in nfnetlink_queue, from
  Wei Yongjun.

* One semantic path from coccinelle detected the use of list_del +
  INIT_LIST_HEAD, instead of list_del_init, again from Wei Yongjun.

* Fix out-of-bound memory access in the NAT address selection, from
  Florian Westphal. This was introduced with the IPv6 NAT patches.

* Two fixes for crashes that were introduced in the recently merged
  IPv6 NAT support, from myself.

You can pull these changes from:

git://1984.lsi.us.es/nf-next master

Thanks!

Florian Westphal (1):
  netfilter: nf_nat: fix out-of-bounds access in address selection

Pablo Neira Ayuso (2):
  netfilter: fix crash during boot if NAT has been compiled built-in
  netfilter: ctnetlink: fix module auto-load in ctnetlink_parse_nat

Wei Yongjun (2):
  netfilter: nfnetlink_queue: remove pointless conditional before kfree_skb()
  ipvs: use list_del_init instead of list_del/INIT_LIST_HEAD

 net/netfilter/Makefile               |    2 +-
 net/netfilter/ipvs/ip_vs_ctl.c       |    3 +--
 net/netfilter/nf_conntrack_netlink.c |    3 ---
 net/netfilter/nf_nat_core.c          |    2 +-
 net/netfilter/nfnetlink_queue_core.c |    3 +--
 5 files changed, 4 insertions(+), 9 deletions(-)

-- 
1.7.10.4


^ permalink raw reply

* [PATCH 2/4] netfilter: Mark SYN/ACK packets as invalid from original direction
From: pablo @ 2012-09-13 10:54 UTC (permalink / raw)
  To: netfilter-devel; +Cc: davem, netdev
In-Reply-To: <1347533648-3451-1-git-send-email-pablo@netfilter.org>

From: Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>

Clients should not send such packets. By accepting them, we open
up a hole by wich ephemeral ports can be discovered in an off-path
attack.

See: "Reflection scan: an Off-Path Attack on TCP" by Jan Wrobel,
http://arxiv.org/abs/1201.2074

Signed-off-by: Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nf_conntrack_proto_tcp.c |   19 ++++++++-----------
 1 file changed, 8 insertions(+), 11 deletions(-)

diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c
index a5ac11e..aba98f9 100644
--- a/net/netfilter/nf_conntrack_proto_tcp.c
+++ b/net/netfilter/nf_conntrack_proto_tcp.c
@@ -158,21 +158,18 @@ static const u8 tcp_conntracks[2][6][TCP_CONNTRACK_MAX] = {
  *	sCL -> sSS
  */
 /* 	     sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2	*/
-/*synack*/ { sIV, sIV, sIG, sIG, sIG, sIG, sIG, sIG, sIG, sSR },
+/*synack*/ { sIV, sIV, sSR, sIV, sIV, sIV, sIV, sIV, sIV, sSR },
 /*
  *	sNO -> sIV	Too late and no reason to do anything
  *	sSS -> sIV	Client can't send SYN and then SYN/ACK
  *	sS2 -> sSR	SYN/ACK sent to SYN2 in simultaneous open
- *	sSR -> sIG
- *	sES -> sIG	Error: SYNs in window outside the SYN_SENT state
- *			are errors. Receiver will reply with RST
- *			and close the connection.
- *			Or we are not in sync and hold a dead connection.
- *	sFW -> sIG
- *	sCW -> sIG
- *	sLA -> sIG
- *	sTW -> sIG
- *	sCL -> sIG
+ *	sSR -> sSR	Late retransmitted SYN/ACK in simultaneous open
+ *	sES -> sIV	Invalid SYN/ACK packets sent by the client
+ *	sFW -> sIV
+ *	sCW -> sIV
+ *	sLA -> sIV
+ *	sTW -> sIV
+ *	sCL -> sIV
  */
 /* 	     sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2	*/
 /*fin*/    { sIV, sIV, sFW, sFW, sLA, sLA, sLA, sTW, sCL, sIV },
-- 
1.7.10.4

^ permalink raw reply related

* Re: xt_nat_init: BUG: unable to handle kernel NULL pointer dereference at 00000000000000e0
From: Fengguang Wu @ 2012-09-13 10:55 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Patrick McHardy, networking, Pablo Neira Ayuso,
	Netfilter Development Maili..., LKML, Florian Westphal
In-Reply-To: <1347528823.13103.1427.camel@edumazet-glaptop>

> Wasnt it already solved ?
> 
> http://1984.lsi.us.es/git/nf-next/commit/?id=00545bec9412d130c77f72a08d6c8b6ad21d4a1
> 
> Just have to wait that netfilter fixes are pushed upstream

OK, sorry. I didn't subscribe many mailing lists and rely on the
search results in google and LKML to avoid duplicate reports..

Thanks,
Fengguang

^ permalink raw reply

* [PATCH 4/4] netfilter: log: Fix log-level processing
From: pablo @ 2012-09-13 10:54 UTC (permalink / raw)
  To: netfilter-devel; +Cc: davem, netdev
In-Reply-To: <1347533648-3451-1-git-send-email-pablo@netfilter.org>

From: Joe Perches <joe@perches.com>

auto75914331@hushmail.com reports that iptables does not correctly
output the KERN_<level>.

$IPTABLES -A RULE_0_in  -j LOG  --log-level notice --log-prefix "DENY  in: "

result with linux 3.6-rc5
Sep 12 06:37:29 xxxxx kernel: <5>DENY  in: IN=eth0 OUT= MAC=.......

result with linux 3.5.3 and older:
Sep  9 10:43:01 xxxxx kernel: DENY  in: IN=eth0 OUT= MAC......

commit 04d2c8c83d0
("printk: convert the format for KERN_<LEVEL> to a 2 byte pattern")
updated the syslog header style but did not update netfilter uses.

Do so.

Use KERN_SOH and string concatenation instead of "%c" KERN_SOH_ASCII
as suggested by Eric Dumazet.

Signed-off-by: Joe Perches <joe@perches.com>
cc: auto75914331@hushmail.com
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/bridge/netfilter/ebt_log.c |    2 +-
 net/netfilter/xt_LOG.c         |    4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/net/bridge/netfilter/ebt_log.c b/net/bridge/netfilter/ebt_log.c
index f88ee53..92de5e5 100644
--- a/net/bridge/netfilter/ebt_log.c
+++ b/net/bridge/netfilter/ebt_log.c
@@ -80,7 +80,7 @@ ebt_log_packet(u_int8_t pf, unsigned int hooknum,
 	unsigned int bitmask;
 
 	spin_lock_bh(&ebt_log_lock);
-	printk("<%c>%s IN=%s OUT=%s MAC source = %pM MAC dest = %pM proto = 0x%04x",
+	printk(KERN_SOH "%c%s IN=%s OUT=%s MAC source = %pM MAC dest = %pM proto = 0x%04x",
 	       '0' + loginfo->u.log.level, prefix,
 	       in ? in->name : "", out ? out->name : "",
 	       eth_hdr(skb)->h_source, eth_hdr(skb)->h_dest,
diff --git a/net/netfilter/xt_LOG.c b/net/netfilter/xt_LOG.c
index 2a4f969..91e9af4 100644
--- a/net/netfilter/xt_LOG.c
+++ b/net/netfilter/xt_LOG.c
@@ -443,8 +443,8 @@ log_packet_common(struct sbuff *m,
 		  const struct nf_loginfo *loginfo,
 		  const char *prefix)
 {
-	sb_add(m, "<%d>%sIN=%s OUT=%s ", loginfo->u.log.level,
-	       prefix,
+	sb_add(m, KERN_SOH "%c%sIN=%s OUT=%s ",
+	       '0' + loginfo->u.log.level, prefix,
 	       in ? in->name : "",
 	       out ? out->name : "");
 #ifdef CONFIG_BRIDGE_NETFILTER
-- 
1.7.10.4


^ permalink raw reply related

* [PATCH 3/4] netfilter: Validate the sequence number of dataless ACK packets as well
From: pablo @ 2012-09-13 10:54 UTC (permalink / raw)
  To: netfilter-devel; +Cc: davem, netdev
In-Reply-To: <1347533648-3451-1-git-send-email-pablo@netfilter.org>

From: Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>

We spare nothing by not validating the sequence number of dataless
ACK packets and enabling it makes harder off-path attacks.

See: "Reflection scan: an Off-Path Attack on TCP" by Jan Wrobel,
http://arxiv.org/abs/1201.2074

Signed-off-by: Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nf_conntrack_proto_tcp.c |   10 ++--------
 1 file changed, 2 insertions(+), 8 deletions(-)

diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c
index aba98f9..e046b37 100644
--- a/net/netfilter/nf_conntrack_proto_tcp.c
+++ b/net/netfilter/nf_conntrack_proto_tcp.c
@@ -630,15 +630,9 @@ static bool tcp_in_window(const struct nf_conn *ct,
 		ack = sack = receiver->td_end;
 	}
 
-	if (seq == end
-	    && (!tcph->rst
-		|| (seq == 0 && state->state == TCP_CONNTRACK_SYN_SENT)))
+	if (tcph->rst && seq == 0 && state->state == TCP_CONNTRACK_SYN_SENT)
 		/*
-		 * Packets contains no data: we assume it is valid
-		 * and check the ack value only.
-		 * However RST segments are always validated by their
-		 * SEQ number, except when seq == 0 (reset sent answering
-		 * SYN.
+		 * RST sent answering SYN.
 		 */
 		seq = end = sender->td_end;
 
-- 
1.7.10.4


^ permalink raw reply related

* [PATCH 1/4] netfilter: take care of timewait sockets
From: pablo @ 2012-09-13 10:54 UTC (permalink / raw)
  To: netfilter-devel; +Cc: davem, netdev
In-Reply-To: <1347533648-3451-1-git-send-email-pablo@netfilter.org>

From: Eric Dumazet <edumazet@google.com>

Sami Farin reported crashes in xt_LOG because it assumes skb->sk is a
full blown socket.

Since (41063e9 ipv4: Early TCP socket demux), we can have skb->sk
pointing to a timewait socket.

Same fix is needed in nfnetlink_log.

Diagnosed-by: Florian Westphal <fw@strlen.de>
Reported-by: Sami Farin <hvtaifwkbgefbaei@gmail.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nfnetlink_log.c |   14 ++++++++------
 net/netfilter/xt_LOG.c        |   33 +++++++++++++++++----------------
 2 files changed, 25 insertions(+), 22 deletions(-)

diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c
index 14e2f39..5cfb5be 100644
--- a/net/netfilter/nfnetlink_log.c
+++ b/net/netfilter/nfnetlink_log.c
@@ -381,6 +381,7 @@ __build_packet_message(struct nfulnl_instance *inst,
 	struct nlmsghdr *nlh;
 	struct nfgenmsg *nfmsg;
 	sk_buff_data_t old_tail = inst->skb->tail;
+	struct sock *sk;
 
 	nlh = nlmsg_put(inst->skb, 0, 0,
 			NFNL_SUBSYS_ULOG << 8 | NFULNL_MSG_PACKET,
@@ -499,18 +500,19 @@ __build_packet_message(struct nfulnl_instance *inst,
 	}
 
 	/* UID */
-	if (skb->sk) {
-		read_lock_bh(&skb->sk->sk_callback_lock);
-		if (skb->sk->sk_socket && skb->sk->sk_socket->file) {
-			struct file *file = skb->sk->sk_socket->file;
+	sk = skb->sk;
+	if (sk && sk->sk_state != TCP_TIME_WAIT) {
+		read_lock_bh(&sk->sk_callback_lock);
+		if (sk->sk_socket && sk->sk_socket->file) {
+			struct file *file = sk->sk_socket->file;
 			__be32 uid = htonl(file->f_cred->fsuid);
 			__be32 gid = htonl(file->f_cred->fsgid);
-			read_unlock_bh(&skb->sk->sk_callback_lock);
+			read_unlock_bh(&sk->sk_callback_lock);
 			if (nla_put_be32(inst->skb, NFULA_UID, uid) ||
 			    nla_put_be32(inst->skb, NFULA_GID, gid))
 				goto nla_put_failure;
 		} else
-			read_unlock_bh(&skb->sk->sk_callback_lock);
+			read_unlock_bh(&sk->sk_callback_lock);
 	}
 
 	/* local sequence number */
diff --git a/net/netfilter/xt_LOG.c b/net/netfilter/xt_LOG.c
index ff5f75f..2a4f969 100644
--- a/net/netfilter/xt_LOG.c
+++ b/net/netfilter/xt_LOG.c
@@ -145,6 +145,19 @@ static int dump_tcp_header(struct sbuff *m, const struct sk_buff *skb,
 	return 0;
 }
 
+static void dump_sk_uid_gid(struct sbuff *m, struct sock *sk)
+{
+	if (!sk || sk->sk_state == TCP_TIME_WAIT)
+		return;
+
+	read_lock_bh(&sk->sk_callback_lock);
+	if (sk->sk_socket && sk->sk_socket->file)
+		sb_add(m, "UID=%u GID=%u ",
+			sk->sk_socket->file->f_cred->fsuid,
+			sk->sk_socket->file->f_cred->fsgid);
+	read_unlock_bh(&sk->sk_callback_lock);
+}
+
 /* One level of recursion won't kill us */
 static void dump_ipv4_packet(struct sbuff *m,
 			const struct nf_loginfo *info,
@@ -361,14 +374,8 @@ static void dump_ipv4_packet(struct sbuff *m,
 	}
 
 	/* Max length: 15 "UID=4294967295 " */
-	if ((logflags & XT_LOG_UID) && !iphoff && skb->sk) {
-		read_lock_bh(&skb->sk->sk_callback_lock);
-		if (skb->sk->sk_socket && skb->sk->sk_socket->file)
-			sb_add(m, "UID=%u GID=%u ",
-				skb->sk->sk_socket->file->f_cred->fsuid,
-				skb->sk->sk_socket->file->f_cred->fsgid);
-		read_unlock_bh(&skb->sk->sk_callback_lock);
-	}
+	if ((logflags & XT_LOG_UID) && !iphoff)
+		dump_sk_uid_gid(m, skb->sk);
 
 	/* Max length: 16 "MARK=0xFFFFFFFF " */
 	if (!iphoff && skb->mark)
@@ -717,14 +724,8 @@ static void dump_ipv6_packet(struct sbuff *m,
 	}
 
 	/* Max length: 15 "UID=4294967295 " */
-	if ((logflags & XT_LOG_UID) && recurse && skb->sk) {
-		read_lock_bh(&skb->sk->sk_callback_lock);
-		if (skb->sk->sk_socket && skb->sk->sk_socket->file)
-			sb_add(m, "UID=%u GID=%u ",
-				skb->sk->sk_socket->file->f_cred->fsuid,
-				skb->sk->sk_socket->file->f_cred->fsgid);
-		read_unlock_bh(&skb->sk->sk_callback_lock);
-	}
+	if ((logflags & XT_LOG_UID) && recurse)
+		dump_sk_uid_gid(m, skb->sk);
 
 	/* Max length: 16 "MARK=0xFFFFFFFF " */
 	if (!recurse && skb->mark)
-- 
1.7.10.4


^ permalink raw reply related

* [PATCH 0/4] netfilter updates for 3.6-rc5
From: pablo @ 2012-09-13 10:54 UTC (permalink / raw)
  To: netfilter-devel; +Cc: davem, netdev

From: Pablo Neira Ayuso <pablo@netfilter.org>

Hi David,

The following patchset contains four updates for your net tree, they are:

* Fix crash on timewait sockets, since the TCP early demux was added,
  in nfnetlink_log, from Eric Dumazet.

* Fix broken syslog log-level for xt_LOG and ebt_log since printk format was
  converted from <.> to a 2 bytes pattern using ASCII SOH, from Joe Perches.

* Two security fixes for the TCP connection tracking targeting off-path attacks,
  from Jozsef Kadlecsik. The problem was discovered by Jan Wrobel and it is
  documented in: http://mixedbit.org/reflection_scan/reflection_scan.pdf.

You can pull these changes from:

git://1984.lsi.us.es/nf master

Thanks!

Eric Dumazet (1):
  netfilter: take care of timewait sockets

Joe Perches (1):
  netfilter: log: Fix log-level processing

Jozsef Kadlecsik (2):
  netfilter: Mark SYN/ACK packets as invalid from original direction
  netfilter: Validate the sequence number of dataless ACK packets as well

 net/bridge/netfilter/ebt_log.c         |    2 +-
 net/netfilter/nf_conntrack_proto_tcp.c |   29 +++++++++----------------
 net/netfilter/nfnetlink_log.c          |   14 ++++++------
 net/netfilter/xt_LOG.c                 |   37 ++++++++++++++++----------------
 4 files changed, 38 insertions(+), 44 deletions(-)

-- 
1.7.10.4


^ permalink raw reply

* Re: GRO aggregation
From: Or Gerlitz @ 2012-09-13  9:59 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: Shlomo Pongartz, Rick Jones, netdev@vger.kernel.org
In-Reply-To: <1347523865.13103.1423.camel@edumazet-glaptop>

On Thu, Sep 13, 2012 at 11:11 AM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
> MAX_SKB_FRAGS is 16
> skb_gro_receive() will return -E2BIG once this limit is hit.
> If you use a MSS = 100 (instead of MSS = 1460), then GRO skb will
> contain only at most 1700 bytes, but TSO packets can still be 64KB, if
> the sender NIC can afford it (some NICS wont work quite well)

Hi Eric,

Addressing this assertion of yours, Shlomo showed that with ixgbe he managed
to see GRO aggregating 32KB which means 20-21 packets that is > 16 fragments
in this notation, can it be related to the way ixgbe is actually
allocating skbs?

Or.

^ permalink raw reply

* Re: [PATCH 9/9] drivers/isdn/gigaset/common.c: Remove useless kfree
From: Tilman Schmidt @ 2012-09-13  9:52 UTC (permalink / raw)
  To: Peter Senna Tschudin
  Cc: Hansjoerg Lipp, kernel-janitors, Karsten Keil, gigaset307x-common,
	netdev, linux-kernel
In-Reply-To: <1347462407-13499-9-git-send-email-peter.senna@gmail.com>

[-- Attachment #1: Type: text/plain, Size: 1196 bytes --]

Am 12.09.2012 17:06, schrieb Peter Senna Tschudin:
> From: Peter Senna Tschudin <peter.senna@gmail.com>
> 
> Remove useless kfree() and clean up code related to the removal.
> 
> The semantic patch that finds this problem is as follows:
[...]
> 
> Signed-off-by: Peter Senna Tschudin <peter.senna@gmail.com>

Acked-by: Tilman Schmidt <tilman@imap.cc>

> ---
>  drivers/isdn/gigaset/common.c |    1 -
>  1 file changed, 1 deletion(-)
> 
> diff --git a/drivers/isdn/gigaset/common.c b/drivers/isdn/gigaset/common.c
> index aa41485..30a6b17 100644
> --- a/drivers/isdn/gigaset/common.c
> +++ b/drivers/isdn/gigaset/common.c
> @@ -1123,7 +1123,6 @@ struct gigaset_driver *gigaset_initdriver(unsigned minor, unsigned minors,
>  	return drv;
>  
>  error:
> -	kfree(drv->cs);
>  	kfree(drv);
>  	return NULL;
>  }

This is indeed vestigial code, left in when another error path
that needed it was removed. Though innocuous, it's better to
remove it.

Thanks,
Tilman

-- 
Tilman Schmidt                    E-Mail: tilman@imap.cc
Bonn, Germany
Diese Nachricht besteht zu 100% aus wiederverwerteten Bits.
Ungeöffnet mindestens haltbar bis: (siehe Rückseite)




[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 260 bytes --]

^ permalink raw reply

* Interface working only after first packet is sent out ...
From: Sylvain Munaut @ 2012-09-13  9:48 UTC (permalink / raw)
  To: netdev

Hi,


I've been having an issue for a fairly long time, on several system
(either physical or virtual machine).

Basically I don't have network connectivity until the machine
initiates it's first packet out.

So let's say I just booted the machine (and it has no sw running on
it, just base system), if I try to ping it, it won't respond.
In a tcpdump on the interface, I can see the incoming ping, but not
outgoing responses.

Now from the machine itself, I generate a single ping to anywhere,
then everything will start working as expected and it will last
forever until the next reboot.

Has anyone ever seen something like this ?

Most of the time I don't notice it because either it's a desktop
machine and then even just a DHCP request makes it work from boot, or
even on server, the ntp daemon generates the first packet and it
works.

I don't know if it's a bug or if I'm doing something wrong somewhere,
but I really don't see what it could be ....



Cheers,

    Sylvain

^ permalink raw reply

* Re: xt_nat_init: BUG: unable to handle kernel NULL pointer dereference at 00000000000000e0
From: Pablo Neira Ayuso @ 2012-09-13  9:42 UTC (permalink / raw)
  To: Florian Westphal
  Cc: Fengguang Wu, Patrick McHardy, networking,
	Netfilter Development Maili..., LKML
In-Reply-To: <20120913093024.GL14750@breakpoint.cc>

On Thu, Sep 13, 2012 at 11:30:24AM +0200, Florian Westphal wrote:
> Fengguang Wu <fengguang.wu@intel.com> wrote:
> > Hi Patrick,
> > 
> > This happens in today's linux-next tree and is pretty reproducible.
> > [    1.834544] nf_conntrack version 0.5.0 (1786 buckets, 7144 max)
> > [    1.835406] ctnetlink v0.93: registering with nfnetlink.
> > [    1.836202] BUG: unable to handle kernel NULL pointer dereference at 00000000000000e0
> > [    1.837539] IP: [<ffffffff81a19123>] mutex_lock_interruptible+0x23/0x70
> 
> Should be fixed by
> commit 00545bec9412d130c77f72a08d6c8b6ad21d4a1e
> Author: Pablo Neira Ayuso <pablo@netfilter.org>
> Subject: netfilter: fix crash during boot if NAT has been compiled built-in
> 
> But seems its not yet in net-next.  Pablo?

I'll request the pull asap.

^ permalink raw reply

* Re: xt_nat_init: BUG: unable to handle kernel NULL pointer dereference at 00000000000000e0
From: Eric Dumazet @ 2012-09-13  9:33 UTC (permalink / raw)
  To: Fengguang Wu
  Cc: Patrick McHardy, networking, Pablo Neira Ayuso,
	Netfilter Development Maili..., LKML
In-Reply-To: <20120913091629.GA28022@localhost>

On Thu, 2012-09-13 at 17:16 +0800, Fengguang Wu wrote:
> Hi Patrick,
> 
> This happens in today's linux-next tree and is pretty reproducible.
> Bisection has been started.
> 
> [    1.834544] nf_conntrack version 0.5.0 (1786 buckets, 7144 max)
> [    1.835406] ctnetlink v0.93: registering with nfnetlink.
> [    1.836202] BUG: unable to handle kernel NULL pointer dereference at 00000000000000e0
> [    1.837539] IP: [<ffffffff81a19123>] mutex_lock_interruptible+0x23/0x70
> [    1.838493] PGD 0 
> [    1.838983] Oops: 0002 [#1] PREEMPT SMP 
> [    1.839786] CPU 0 
> [    1.840050] Pid: 1, comm: swapper/0 Not tainted 3.6.0-rc5-07686-gdbf978b #58 Bochs Bochs
> [    1.840172] RIP: 0010:[<ffffffff81a19123>]  [<ffffffff81a19123>] mutex_lock_interruptible+0x23/0x70
> [    1.840172] RSP: 0018:ffff88000d845e40  EFLAGS: 00010202
> [    1.840172] RAX: 0000000000000246 RBX: 00000000000000e0 RCX: ffff88000d848000
> [    1.840172] RDX: 0000000000000000 RSI: 0000000000000175 RDI: ffffffff81e6bc88
> [    1.840172] RBP: ffff88000d845e50 R08: ffffffff821bd170 R09: 0000000000000000
> [    1.840172] R10: 0000000000000000 R11: dead000000200200 R12: 00000000ffffffff
> [    1.840172] R13: 0000000000000004 R14: ffffffff821c1ce0 R15: 0000000000000000
> [    1.840172] FS:  0000000000000000(0000) GS:ffff88000de00000(0000) knlGS:0000000000000000
> [    1.840172] CS:  0010 DS: 0000 ES: 0000 CR0: 000000008005003b
> [    1.840172] CR2: 00000000000000e0 CR3: 000000000200b000 CR4: 00000000000006f0
> [    1.840172] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
> [    1.840172] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
> [    1.840172] Process swapper/0 (pid: 1, threadinfo ffff88000d844000, task ffff88000d848000)
> [    1.840172] Stack:
> [    1.840172]  00000000000000e0 ffffffff821c1ce0 ffff88000d845e80 ffffffff8187c204
> [    1.840172]  0000000000000000 ffffffff820f7ea0 0000000000000000 ffffffff821c1ce0
> [    1.840172]  ffff88000d845ec0 ffffffff8187c334 ffff88000d845eb0 0000000000000000
> [    1.840172] Call Trace:
> [    1.840172]  [<ffffffff8187c204>] xt_register_target+0x34/0x70
> [    1.840172]  [<ffffffff8187c334>] xt_register_targets+0x34/0x70
> [    1.840172]  [<ffffffff8221f336>] ? nf_nat_init+0x12c/0x12c
> [    1.840172]  [<ffffffff8221f34b>] xt_nat_init+0x15/0x17
> [    1.840172]  [<ffffffff821dcc3e>] do_one_initcall+0x7a/0x134
> [    1.840172]  [<ffffffff821dcdfb>] kernel_init+0x103/0x187
> [    1.864014]  [<ffffffff821dc615>] ? do_early_param+0x8d/0x8d
> [    1.864014]  [<ffffffff81a1dc44>] kernel_thread_helper+0x4/0x10
> [    1.864014]  [<ffffffff821dccf8>] ? do_one_initcall+0x134/0x134
> [    1.864014]  [<ffffffff81a1dc40>] ? gs_change+0x13/0x13
> [    1.864014] Code: 8b 65 f8 c9 c3 0f 1f 00 55 31 d2 be 75 01 00 00 48 89 e5 41 54 41 bc ff ff ff ff 53 48 89 fb 48 c7 c7 88 bc e6 81 e8 dd 73 6e ff <f0> 44 0f c1 23 41 83 ec 01 31 d2 48 c7 c7 c8 b7 13 82 41 c1 ec 
> [    1.864014] RIP  [<ffffffff81a19123>] mutex_lock_interruptible+0x23/0x70
> [    1.864014]  RSP <ffff88000d845e40>
> [    1.864014] CR2: 00000000000000e0
> 
> Thanks,
> Fengguang

Wasnt it already solved ?

http://1984.lsi.us.es/git/nf-next/commit/?id=00545bec9412d130c77f72a08d6c8b6ad21d4a1

Just have to wait that netfilter fixes are pushed upstream

Thanks




^ permalink raw reply

* Re: xt_nat_init: BUG: unable to handle kernel NULL pointer dereference at 00000000000000e0
From: Florian Westphal @ 2012-09-13  9:30 UTC (permalink / raw)
  To: Fengguang Wu
  Cc: Patrick McHardy, networking, Pablo Neira Ayuso,
	Netfilter Development Maili..., LKML
In-Reply-To: <20120913091629.GA28022@localhost>

Fengguang Wu <fengguang.wu@intel.com> wrote:
> Hi Patrick,
> 
> This happens in today's linux-next tree and is pretty reproducible.
> [    1.834544] nf_conntrack version 0.5.0 (1786 buckets, 7144 max)
> [    1.835406] ctnetlink v0.93: registering with nfnetlink.
> [    1.836202] BUG: unable to handle kernel NULL pointer dereference at 00000000000000e0
> [    1.837539] IP: [<ffffffff81a19123>] mutex_lock_interruptible+0x23/0x70

Should be fixed by
commit 00545bec9412d130c77f72a08d6c8b6ad21d4a1e
Author: Pablo Neira Ayuso <pablo@netfilter.org>
Subject: netfilter: fix crash during boot if NAT has been compiled built-in

But seems its not yet in net-next.  Pablo?

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox