Netdev List
 help / color / mirror / Atom feed
* [PATCH net 6/6] sfc: Only bind to EF10 functions with the LinkCtrl and Trusted flags
From: Ben Hutchings @ 2013-10-08 17:45 UTC (permalink / raw)
  To: David Miller; +Cc: netdev, linux-net-drivers
In-Reply-To: <1381254091.1530.9.camel@bwh-desktop.uk.level5networks.com>

Although we do not yet enable multiple PFs per port, it is possible
that a board will be reconfigured to enable them while the driver has
not yet been updated to fully support this.

The most obvious problem is that multiple functions may try to set
conflicting link settings.  But we will also run into trouble if the
firmware doesn't consider us fully trusted.  So, abort probing unless
both the LinkCtrl and Trusted flags are set for this function.

Signed-off-by: Ben Hutchings <bhutchings@solarflare.com>
---
 drivers/net/ethernet/sfc/mcdi.c | 18 +++++++++++++++++-
 1 file changed, 17 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/sfc/mcdi.c b/drivers/net/ethernet/sfc/mcdi.c
index c082562..366c8e3 100644
--- a/drivers/net/ethernet/sfc/mcdi.c
+++ b/drivers/net/ethernet/sfc/mcdi.c
@@ -963,7 +963,7 @@ static int efx_mcdi_drv_attach(struct efx_nic *efx, bool driver_operating,
 			       bool *was_attached)
 {
 	MCDI_DECLARE_BUF(inbuf, MC_CMD_DRV_ATTACH_IN_LEN);
-	MCDI_DECLARE_BUF(outbuf, MC_CMD_DRV_ATTACH_OUT_LEN);
+	MCDI_DECLARE_BUF(outbuf, MC_CMD_DRV_ATTACH_EXT_OUT_LEN);
 	size_t outlen;
 	int rc;
 
@@ -981,6 +981,22 @@ static int efx_mcdi_drv_attach(struct efx_nic *efx, bool driver_operating,
 		goto fail;
 	}
 
+	/* We currently assume we have control of the external link
+	 * and are completely trusted by firmware.  Abort probing
+	 * if that's not true for this function.
+	 */
+	if (driver_operating &&
+	    outlen >= MC_CMD_DRV_ATTACH_EXT_OUT_LEN &&
+	    (MCDI_DWORD(outbuf, DRV_ATTACH_EXT_OUT_FUNC_FLAGS) &
+	     (1 << MC_CMD_DRV_ATTACH_EXT_OUT_FLAG_LINKCTRL |
+	      1 << MC_CMD_DRV_ATTACH_EXT_OUT_FLAG_TRUSTED)) !=
+	    (1 << MC_CMD_DRV_ATTACH_EXT_OUT_FLAG_LINKCTRL |
+	     1 << MC_CMD_DRV_ATTACH_EXT_OUT_FLAG_TRUSTED)) {
+		netif_err(efx, probe, efx->net_dev,
+			  "This driver version only supports one function per port\n");
+		return -ENODEV;
+	}
+
 	if (was_attached != NULL)
 		*was_attached = MCDI_DWORD(outbuf, DRV_ATTACH_OUT_OLD_STATE);
 	return 0;

-- 
Ben Hutchings, Staff Engineer, Solarflare
Not speaking for my employer; that's the marketing department's job.
They asked us to note that Solarflare product names are trademarked.

^ permalink raw reply related

* [PATCH net 5/6] sfc: Add PM and RXDP drop counters to ethtool stats
From: Ben Hutchings @ 2013-10-08 17:44 UTC (permalink / raw)
  To: David Miller; +Cc: netdev, linux-net-drivers
In-Reply-To: <1381254091.1530.9.camel@bwh-desktop.uk.level5networks.com>

From: Edward Cree <ecree@solarflare.com>

Recognise the new Packet Memory and RX Data Path counters.

The following counters are added:
rx_pm_{trunc,discard}_bb_overflow - burst buffer overflowed.  This should not
 occur if BB correctly configured.
rx_pm_{trunc,discard}_vfifo_full - not enough space in packet memory.  May
 indicate RX performance problems.
rx_pm_{trunc,discard}_qbb - dropped by 802.1Qbb early discard mechanism.
 Since Qbb is not supported at present, this should not occur.
rx_pm_discard_mapping - 802.1p priority configured to be dropped.  This should
 not occur in normal operation.
rx_dp_q_disabled_packets - packet was to be delivered to a queue but queue is
 disabled.  May indicate misconfiguration by the driver.
rx_dp_di_dropped_packets - parser-dispatcher indicated that a packet should be
 dropped.
rx_dp_streaming_packets - packet was sent to the RXDP streaming bus, ie. a
 filter directed the packet to the MCPU.
rx_dp_emerg_{fetch,wait} - RX datapath had to wait for descriptors to be
 loaded.  Indicates performance problems but not drops.

These are only provided if the MC firmware has the
PM_AND_RXDP_COUNTERS capability.  Otherwise, mask them out.

Signed-off-by: Ben Hutchings <bhutchings@solarflare.com>
---
 drivers/net/ethernet/sfc/ef10.c | 35 +++++++++++++++++++++++++++++++++++
 drivers/net/ethernet/sfc/nic.h  | 12 ++++++++++++
 2 files changed, 47 insertions(+)

diff --git a/drivers/net/ethernet/sfc/ef10.c b/drivers/net/ethernet/sfc/ef10.c
index c00a5d6..21f9ad6 100644
--- a/drivers/net/ethernet/sfc/ef10.c
+++ b/drivers/net/ethernet/sfc/ef10.c
@@ -444,6 +444,18 @@ static const struct efx_hw_stat_desc efx_ef10_stat_desc[EF10_STAT_COUNT] = {
 	EF10_DMA_STAT(rx_align_error, RX_ALIGN_ERROR_PKTS),
 	EF10_DMA_STAT(rx_length_error, RX_LENGTH_ERROR_PKTS),
 	EF10_DMA_STAT(rx_nodesc_drops, RX_NODESC_DROPS),
+	EF10_DMA_STAT(rx_pm_trunc_bb_overflow, PM_TRUNC_BB_OVERFLOW),
+	EF10_DMA_STAT(rx_pm_discard_bb_overflow, PM_DISCARD_BB_OVERFLOW),
+	EF10_DMA_STAT(rx_pm_trunc_vfifo_full, PM_TRUNC_VFIFO_FULL),
+	EF10_DMA_STAT(rx_pm_discard_vfifo_full, PM_DISCARD_VFIFO_FULL),
+	EF10_DMA_STAT(rx_pm_trunc_qbb, PM_TRUNC_QBB),
+	EF10_DMA_STAT(rx_pm_discard_qbb, PM_DISCARD_QBB),
+	EF10_DMA_STAT(rx_pm_discard_mapping, PM_DISCARD_MAPPING),
+	EF10_DMA_STAT(rx_dp_q_disabled_packets, RXDP_Q_DISABLED_PKTS),
+	EF10_DMA_STAT(rx_dp_di_dropped_packets, RXDP_DI_DROPPED_PKTS),
+	EF10_DMA_STAT(rx_dp_streaming_packets, RXDP_STREAMING_PKTS),
+	EF10_DMA_STAT(rx_dp_emerg_fetch, RXDP_EMERGENCY_FETCH_CONDITIONS),
+	EF10_DMA_STAT(rx_dp_emerg_wait, RXDP_EMERGENCY_WAIT_CONDITIONS),
 };
 
 #define HUNT_COMMON_STAT_MASK ((1ULL << EF10_STAT_tx_bytes) |		\
@@ -498,15 +510,38 @@ static const struct efx_hw_stat_desc efx_ef10_stat_desc[EF10_STAT_COUNT] = {
 #define HUNT_40G_EXTRA_STAT_MASK ((1ULL << EF10_STAT_rx_align_error) |	\
 				  (1ULL << EF10_STAT_rx_length_error))
 
+/* These statistics are only provided if the firmware supports the
+ * capability PM_AND_RXDP_COUNTERS.
+ */
+#define HUNT_PM_AND_RXDP_STAT_MASK (					\
+	(1ULL << EF10_STAT_rx_pm_trunc_bb_overflow) |			\
+	(1ULL << EF10_STAT_rx_pm_discard_bb_overflow) |			\
+	(1ULL << EF10_STAT_rx_pm_trunc_vfifo_full) |			\
+	(1ULL << EF10_STAT_rx_pm_discard_vfifo_full) |			\
+	(1ULL << EF10_STAT_rx_pm_trunc_qbb) |				\
+	(1ULL << EF10_STAT_rx_pm_discard_qbb) |				\
+	(1ULL << EF10_STAT_rx_pm_discard_mapping) |			\
+	(1ULL << EF10_STAT_rx_dp_q_disabled_packets) |			\
+	(1ULL << EF10_STAT_rx_dp_di_dropped_packets) |			\
+	(1ULL << EF10_STAT_rx_dp_streaming_packets) |			\
+	(1ULL << EF10_STAT_rx_dp_emerg_fetch) |				\
+	(1ULL << EF10_STAT_rx_dp_emerg_wait))
+
 static u64 efx_ef10_raw_stat_mask(struct efx_nic *efx)
 {
 	u64 raw_mask = HUNT_COMMON_STAT_MASK;
 	u32 port_caps = efx_mcdi_phy_get_caps(efx);
+	struct efx_ef10_nic_data *nic_data = efx->nic_data;
 
 	if (port_caps & (1 << MC_CMD_PHY_CAP_40000FDX_LBN))
 		raw_mask |= HUNT_40G_EXTRA_STAT_MASK;
 	else
 		raw_mask |= HUNT_10G_ONLY_STAT_MASK;
+
+	if (nic_data->datapath_caps &
+	    (1 << MC_CMD_GET_CAPABILITIES_OUT_PM_AND_RXDP_COUNTERS_LBN))
+		raw_mask |= HUNT_PM_AND_RXDP_STAT_MASK;
+
 	return raw_mask;
 }
 
diff --git a/drivers/net/ethernet/sfc/nic.h b/drivers/net/ethernet/sfc/nic.h
index fda29d3..890bbbe 100644
--- a/drivers/net/ethernet/sfc/nic.h
+++ b/drivers/net/ethernet/sfc/nic.h
@@ -386,6 +386,18 @@ enum {
 	EF10_STAT_rx_align_error,
 	EF10_STAT_rx_length_error,
 	EF10_STAT_rx_nodesc_drops,
+	EF10_STAT_rx_pm_trunc_bb_overflow,
+	EF10_STAT_rx_pm_discard_bb_overflow,
+	EF10_STAT_rx_pm_trunc_vfifo_full,
+	EF10_STAT_rx_pm_discard_vfifo_full,
+	EF10_STAT_rx_pm_trunc_qbb,
+	EF10_STAT_rx_pm_discard_qbb,
+	EF10_STAT_rx_pm_discard_mapping,
+	EF10_STAT_rx_dp_q_disabled_packets,
+	EF10_STAT_rx_dp_di_dropped_packets,
+	EF10_STAT_rx_dp_streaming_packets,
+	EF10_STAT_rx_dp_emerg_fetch,
+	EF10_STAT_rx_dp_emerg_wait,
 	EF10_STAT_COUNT
 };
 


-- 
Ben Hutchings, Staff Engineer, Solarflare
Not speaking for my employer; that's the marketing department's job.
They asked us to note that Solarflare product names are trademarked.

^ permalink raw reply related

* [PATCH net 4/6] sfc: Add definitions for new stats counters and capability flag
From: Ben Hutchings @ 2013-10-08 17:44 UTC (permalink / raw)
  To: David Miller; +Cc: netdev, linux-net-drivers
In-Reply-To: <1381254091.1530.9.camel@bwh-desktop.uk.level5networks.com>

From: Matthew Slattery <mslattery@solarflare.com>

Signed-off-by: Ben Hutchings <bhutchings@solarflare.com>
---
 drivers/net/ethernet/sfc/mcdi_pcol.h | 56 ++++++++++++++++++++++++++++++++++--
 1 file changed, 54 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/sfc/mcdi_pcol.h b/drivers/net/ethernet/sfc/mcdi_pcol.h
index b5cf624..e0a63dd 100644
--- a/drivers/net/ethernet/sfc/mcdi_pcol.h
+++ b/drivers/net/ethernet/sfc/mcdi_pcol.h
@@ -2574,8 +2574,58 @@
 #define          MC_CMD_MAC_RX_LANES01_DISP_ERR  0x39 /* enum */
 #define          MC_CMD_MAC_RX_LANES23_DISP_ERR  0x3a /* enum */
 #define          MC_CMD_MAC_RX_MATCH_FAULT  0x3b /* enum */
-#define          MC_CMD_GMAC_DMABUF_START  0x40 /* enum */
-#define          MC_CMD_GMAC_DMABUF_END    0x5f /* enum */
+/* enum: PM trunc_bb_overflow counter. Valid for EF10 with PM_AND_RXDP_COUNTERS
+ * capability only.
+ */
+#define          MC_CMD_MAC_PM_TRUNC_BB_OVERFLOW  0x3c
+/* enum: PM discard_bb_overflow counter. Valid for EF10 with
+ * PM_AND_RXDP_COUNTERS capability only.
+ */
+#define          MC_CMD_MAC_PM_DISCARD_BB_OVERFLOW  0x3d
+/* enum: PM trunc_vfifo_full counter. Valid for EF10 with PM_AND_RXDP_COUNTERS
+ * capability only.
+ */
+#define          MC_CMD_MAC_PM_TRUNC_VFIFO_FULL  0x3e
+/* enum: PM discard_vfifo_full counter. Valid for EF10 with
+ * PM_AND_RXDP_COUNTERS capability only.
+ */
+#define          MC_CMD_MAC_PM_DISCARD_VFIFO_FULL  0x3f
+/* enum: PM trunc_qbb counter. Valid for EF10 with PM_AND_RXDP_COUNTERS
+ * capability only.
+ */
+#define          MC_CMD_MAC_PM_TRUNC_QBB  0x40
+/* enum: PM discard_qbb counter. Valid for EF10 with PM_AND_RXDP_COUNTERS
+ * capability only.
+ */
+#define          MC_CMD_MAC_PM_DISCARD_QBB  0x41
+/* enum: PM discard_mapping counter. Valid for EF10 with PM_AND_RXDP_COUNTERS
+ * capability only.
+ */
+#define          MC_CMD_MAC_PM_DISCARD_MAPPING  0x42
+/* enum: RXDP counter: Number of packets dropped due to the queue being
+ * disabled. Valid for EF10 with PM_AND_RXDP_COUNTERS capability only.
+ */
+#define          MC_CMD_MAC_RXDP_Q_DISABLED_PKTS  0x43
+/* enum: RXDP counter: Number of packets dropped by the DICPU. Valid for EF10
+ * with PM_AND_RXDP_COUNTERS capability only.
+ */
+#define          MC_CMD_MAC_RXDP_DI_DROPPED_PKTS  0x45
+/* enum: RXDP counter: Number of non-host packets. Valid for EF10 with
+ * PM_AND_RXDP_COUNTERS capability only.
+ */
+#define          MC_CMD_MAC_RXDP_STREAMING_PKTS  0x46
+/* enum: RXDP counter: Number of times an emergency descriptor fetch was
+ * performed. Valid for EF10 with PM_AND_RXDP_COUNTERS capability only.
+ */
+#define          MC_CMD_MAC_RXDP_EMERGENCY_FETCH_CONDITIONS  0x47
+/* enum: RXDP counter: Number of times the DPCPU waited for an existing
+ * descriptor fetch. Valid for EF10 with PM_AND_RXDP_COUNTERS capability only.
+ */
+#define          MC_CMD_MAC_RXDP_EMERGENCY_WAIT_CONDITIONS  0x48
+/* enum: Start of GMAC stats buffer space, for Siena only. */
+#define          MC_CMD_GMAC_DMABUF_START  0x40
+/* enum: End of GMAC stats buffer space, for Siena only. */
+#define          MC_CMD_GMAC_DMABUF_END    0x5f
 #define          MC_CMD_MAC_GENERATION_END 0x60 /* enum */
 #define          MC_CMD_MAC_NSTATS  0x61 /* enum */
 
@@ -5065,6 +5115,8 @@
 #define        MC_CMD_GET_CAPABILITIES_OUT_RX_BATCHING_WIDTH 1
 #define        MC_CMD_GET_CAPABILITIES_OUT_MCAST_FILTER_CHAINING_LBN 26
 #define        MC_CMD_GET_CAPABILITIES_OUT_MCAST_FILTER_CHAINING_WIDTH 1
+#define        MC_CMD_GET_CAPABILITIES_OUT_PM_AND_RXDP_COUNTERS_LBN 27
+#define        MC_CMD_GET_CAPABILITIES_OUT_PM_AND_RXDP_COUNTERS_WIDTH 1
 /* RxDPCPU firmware id. */
 #define       MC_CMD_GET_CAPABILITIES_OUT_RX_DPCPU_FW_ID_OFST 4
 #define       MC_CMD_GET_CAPABILITIES_OUT_RX_DPCPU_FW_ID_LEN 2


-- 
Ben Hutchings, Staff Engineer, Solarflare
Not speaking for my employer; that's the marketing department's job.
They asked us to note that Solarflare product names are trademarked.

^ permalink raw reply related

* [PATCH net 3/6] sfc: Refactor EF10 stat mask code to allow for more conditional stats
From: Ben Hutchings @ 2013-10-08 17:44 UTC (permalink / raw)
  To: David Miller; +Cc: netdev, linux-net-drivers
In-Reply-To: <1381254091.1530.9.camel@bwh-desktop.uk.level5networks.com>

From: Edward Cree <ecree@solarflare.com>

Previously, efx_ef10_stat_mask returned a static const unsigned long[], which
meant that each possible mask had to be declared statically with
STAT_MASK_BITMAP.  Since adding a condition would double the size of the
decision tree, we now create the bitmask dynamically.

To do this, we have two functions efx_ef10_raw_stat_mask, which returns a u64,
and efx_ef10_get_stat_mask, which fills in an unsigned long * argument.

Signed-off-by: Ben Hutchings <bhutchings@solarflare.com>
---
 drivers/net/ethernet/sfc/ef10.c | 49 +++++++++++++++++++++++------------------
 1 file changed, 28 insertions(+), 21 deletions(-)

diff --git a/drivers/net/ethernet/sfc/ef10.c b/drivers/net/ethernet/sfc/ef10.c
index a4fbb38..c00a5d6 100644
--- a/drivers/net/ethernet/sfc/ef10.c
+++ b/drivers/net/ethernet/sfc/ef10.c
@@ -498,44 +498,49 @@ static const struct efx_hw_stat_desc efx_ef10_stat_desc[EF10_STAT_COUNT] = {
 #define HUNT_40G_EXTRA_STAT_MASK ((1ULL << EF10_STAT_rx_align_error) |	\
 				  (1ULL << EF10_STAT_rx_length_error))
 
-#if BITS_PER_LONG == 64
-#define STAT_MASK_BITMAP(bits) (bits)
-#else
-#define STAT_MASK_BITMAP(bits) (bits) & 0xffffffff, (bits) >> 32
-#endif
-
-static const unsigned long *efx_ef10_stat_mask(struct efx_nic *efx)
+static u64 efx_ef10_raw_stat_mask(struct efx_nic *efx)
 {
-	static const unsigned long hunt_40g_stat_mask[] = {
-		STAT_MASK_BITMAP(HUNT_COMMON_STAT_MASK |
-				 HUNT_40G_EXTRA_STAT_MASK)
-	};
-	static const unsigned long hunt_10g_only_stat_mask[] = {
-		STAT_MASK_BITMAP(HUNT_COMMON_STAT_MASK |
-				 HUNT_10G_ONLY_STAT_MASK)
-	};
+	u64 raw_mask = HUNT_COMMON_STAT_MASK;
 	u32 port_caps = efx_mcdi_phy_get_caps(efx);
 
 	if (port_caps & (1 << MC_CMD_PHY_CAP_40000FDX_LBN))
-		return hunt_40g_stat_mask;
+		raw_mask |= HUNT_40G_EXTRA_STAT_MASK;
 	else
-		return hunt_10g_only_stat_mask;
+		raw_mask |= HUNT_10G_ONLY_STAT_MASK;
+	return raw_mask;
+}
+
+static void efx_ef10_get_stat_mask(struct efx_nic *efx, unsigned long *mask)
+{
+	u64 raw_mask = efx_ef10_raw_stat_mask(efx);
+
+#if BITS_PER_LONG == 64
+	mask[0] = raw_mask;
+#else
+	mask[0] = raw_mask & 0xffffffff;
+	mask[1] = raw_mask >> 32;
+#endif
 }
 
 static size_t efx_ef10_describe_stats(struct efx_nic *efx, u8 *names)
 {
+	DECLARE_BITMAP(mask, EF10_STAT_COUNT);
+
+	efx_ef10_get_stat_mask(efx, mask);
 	return efx_nic_describe_stats(efx_ef10_stat_desc, EF10_STAT_COUNT,
-				      efx_ef10_stat_mask(efx), names);
+				      mask, names);
 }
 
 static int efx_ef10_try_update_nic_stats(struct efx_nic *efx)
 {
 	struct efx_ef10_nic_data *nic_data = efx->nic_data;
-	const unsigned long *stats_mask = efx_ef10_stat_mask(efx);
+	DECLARE_BITMAP(mask, EF10_STAT_COUNT);
 	__le64 generation_start, generation_end;
 	u64 *stats = nic_data->stats;
 	__le64 *dma_stats;
 
+	efx_ef10_get_stat_mask(efx, mask);
+
 	dma_stats = efx->stats_buffer.addr;
 	nic_data = efx->nic_data;
 
@@ -543,7 +548,7 @@ static int efx_ef10_try_update_nic_stats(struct efx_nic *efx)
 	if (generation_end == EFX_MC_STATS_GENERATION_INVALID)
 		return 0;
 	rmb();
-	efx_nic_update_stats(efx_ef10_stat_desc, EF10_STAT_COUNT, stats_mask,
+	efx_nic_update_stats(efx_ef10_stat_desc, EF10_STAT_COUNT, mask,
 			     stats, efx->stats_buffer.addr, false);
 	rmb();
 	generation_start = dma_stats[MC_CMD_MAC_GENERATION_START];
@@ -564,12 +569,14 @@ static int efx_ef10_try_update_nic_stats(struct efx_nic *efx)
 static size_t efx_ef10_update_stats(struct efx_nic *efx, u64 *full_stats,
 				    struct rtnl_link_stats64 *core_stats)
 {
-	const unsigned long *mask = efx_ef10_stat_mask(efx);
+	DECLARE_BITMAP(mask, EF10_STAT_COUNT);
 	struct efx_ef10_nic_data *nic_data = efx->nic_data;
 	u64 *stats = nic_data->stats;
 	size_t stats_count = 0, index;
 	int retry;
 
+	efx_ef10_get_stat_mask(efx, mask);
+
 	/* If we're unlucky enough to read statistics during the DMA, wait
 	 * up to 10ms for it to finish (typically takes <500us)
 	 */


-- 
Ben Hutchings, Staff Engineer, Solarflare
Not speaking for my employer; that's the marketing department's job.
They asked us to note that Solarflare product names are trademarked.

^ permalink raw reply related

* [PATCH net 2/6] sfc: Fix internal indices of ethtool stats for EF10
From: Ben Hutchings @ 2013-10-08 17:43 UTC (permalink / raw)
  To: David Miller; +Cc: netdev, linux-net-drivers
In-Reply-To: <1381254091.1530.9.camel@bwh-desktop.uk.level5networks.com>

From: Edward Cree <ecree@solarflare.com>

The indices in nic_data->stats need to match the EF10_STAT_whatever
enum values.  In efx_nic_update_stats, only mask; gaps are removed in
efx_ef10_update_stats.

Signed-off-by: Ben Hutchings <bhutchings@solarflare.com>
---
 drivers/net/ethernet/sfc/nic.c | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/drivers/net/ethernet/sfc/nic.c b/drivers/net/ethernet/sfc/nic.c
index e7dbd2d..9826594 100644
--- a/drivers/net/ethernet/sfc/nic.c
+++ b/drivers/net/ethernet/sfc/nic.c
@@ -469,8 +469,7 @@ size_t efx_nic_describe_stats(const struct efx_hw_stat_desc *desc, size_t count,
  * @count: Length of the @desc array
  * @mask: Bitmask of which elements of @desc are enabled
  * @stats: Buffer to update with the converted statistics.  The length
- *	of this array must be at least the number of set bits in the
- *	first @count bits of @mask.
+ *	of this array must be at least @count.
  * @dma_buf: DMA buffer containing hardware statistics
  * @accumulate: If set, the converted values will be added rather than
  *	directly stored to the corresponding elements of @stats
@@ -503,11 +502,9 @@ void efx_nic_update_stats(const struct efx_hw_stat_desc *desc, size_t count,
 			}
 
 			if (accumulate)
-				*stats += val;
+				stats[index] += val;
 			else
-				*stats = val;
+				stats[index] = val;
 		}
-
-		++stats;
 	}
 }


-- 
Ben Hutchings, Staff Engineer, Solarflare
Not speaking for my employer; that's the marketing department's job.
They asked us to note that Solarflare product names are trademarked.

^ permalink raw reply related

* [PATCH net 1/6] sfc: Add rmb() between reading stats and generation count to ensure consistency
From: Ben Hutchings @ 2013-10-08 17:43 UTC (permalink / raw)
  To: David Miller; +Cc: netdev, linux-net-drivers
In-Reply-To: <1381254091.1530.9.camel@bwh-desktop.uk.level5networks.com>

From: Jon Cooper <jcooper@solarflare.com>

Signed-off-by: Ben Hutchings <bhutchings@solarflare.com>
---
 drivers/net/ethernet/sfc/ef10.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/net/ethernet/sfc/ef10.c b/drivers/net/ethernet/sfc/ef10.c
index 9f18ae9..a4fbb38 100644
--- a/drivers/net/ethernet/sfc/ef10.c
+++ b/drivers/net/ethernet/sfc/ef10.c
@@ -545,6 +545,7 @@ static int efx_ef10_try_update_nic_stats(struct efx_nic *efx)
 	rmb();
 	efx_nic_update_stats(efx_ef10_stat_desc, EF10_STAT_COUNT, stats_mask,
 			     stats, efx->stats_buffer.addr, false);
+	rmb();
 	generation_start = dma_stats[MC_CMD_MAC_GENERATION_START];
 	if (generation_end != generation_start)
 		return -EAGAIN;


-- 
Ben Hutchings, Staff Engineer, Solarflare
Not speaking for my employer; that's the marketing department's job.
They asked us to note that Solarflare product names are trademarked.

^ permalink raw reply related

* [PATCH net-next 1/6] sfc: Add rmb() between reading stats and generation count to ensure consistency
From: Ben Hutchings @ 2013-10-08 17:42 UTC (permalink / raw)
  To: David Miller; +Cc: netdev, linux-net-drivers
In-Reply-To: <1381254091.1530.9.camel@bwh-desktop.uk.level5networks.com>

From: Jon Cooper <jcooper@solarflare.com>

Signed-off-by: Ben Hutchings <bhutchings@solarflare.com>
---
 drivers/net/ethernet/sfc/ef10.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/net/ethernet/sfc/ef10.c b/drivers/net/ethernet/sfc/ef10.c
index 9f18ae9..a4fbb38 100644
--- a/drivers/net/ethernet/sfc/ef10.c
+++ b/drivers/net/ethernet/sfc/ef10.c
@@ -545,6 +545,7 @@ static int efx_ef10_try_update_nic_stats(struct efx_nic *efx)
 	rmb();
 	efx_nic_update_stats(efx_ef10_stat_desc, EF10_STAT_COUNT, stats_mask,
 			     stats, efx->stats_buffer.addr, false);
+	rmb();
 	generation_start = dma_stats[MC_CMD_MAC_GENERATION_START];
 	if (generation_end != generation_start)
 		return -EAGAIN;


-- 
Ben Hutchings, Staff Engineer, Solarflare
Not speaking for my employer; that's the marketing department's job.
They asked us to note that Solarflare product names are trademarked.

^ permalink raw reply related

* Pull request: sfc 2013-10-08
From: Ben Hutchings @ 2013-10-08 17:41 UTC (permalink / raw)
  To: David Miller; +Cc: netdev, linux-net-drivers

[-- Attachment #1: Type: text/plain, Size: 1982 bytes --]

The following changes since commit 5e8a402f831dbe7ee831340a91439e46f0d38acd:

  tcp: do not forget FIN in tcp_shifted_skb() (2013-10-04 14:16:36 -0400)

are available in the git repository at:

  git://git.kernel.org/pub/scm/linux/kernel/git/bwh/sfc.git sfc-3.12

for you to fetch changes up to ecb1c9cc215cb5a4390b714d8b09de637f54fa3f:

  sfc: Only bind to EF10 functions with the LinkCtrl and Trusted flags (2013-10-07 20:11:19 +0100)

Some more fixes for EF10 support; hopefully the last lot:

1. Fixes for reading statistics, from Edward Cree and Jon Cooper.
2. Addition of ethtool statistics for packets dropped by the hardware
before they were associated with a specific function, from Edward Cree.
3. Only bind to functions that are in control of their associated port,
as the driver currently assumes this is the case.

Ben.

----------------------------------------------------------------
Ben Hutchings (1):
      sfc: Only bind to EF10 functions with the LinkCtrl and Trusted flags

Edward Cree (3):
      sfc: Fix internal indices of ethtool stats for EF10
      sfc: Refactor EF10 stat mask code to allow for more conditional stats
      sfc: Add PM and RXDP drop counters to ethtool stats

Jon Cooper (1):
      sfc: Add rmb() between reading stats and generation count to ensure consistency

Matthew Slattery (1):
      sfc: Add definitions for new stats counters and capability flag

 drivers/net/ethernet/sfc/ef10.c      | 87 +++++++++++++++++++++++++++---------
 drivers/net/ethernet/sfc/mcdi.c      | 18 +++++++-
 drivers/net/ethernet/sfc/mcdi_pcol.h | 56 ++++++++++++++++++++++-
 drivers/net/ethernet/sfc/nic.c       |  9 ++--
 drivers/net/ethernet/sfc/nic.h       | 12 +++++
 5 files changed, 151 insertions(+), 31 deletions(-)

-- 
Ben Hutchings, Staff Engineer, Solarflare
Not speaking for my employer; that's the marketing department's job.
They asked us to note that Solarflare product names are trademarked.

[-- Attachment #2: This is a digitally signed message part --]
[-- Type: application/pgp-signature, Size: 490 bytes --]

^ permalink raw reply

* RE: [PATCH net] vti: get rid of nf mark rule in prerouting
From: Saurabh Mohan @ 2013-10-08 16:48 UTC (permalink / raw)
  To: Christophe Gouault, netdev@vger.kernel.org; +Cc: David S. Miller, Cong Wang
In-Reply-To: <1381245682-15523-1-git-send-email-christophe.gouault@6wind.com>



> -----Original Message-----
> From: Christophe Gouault [mailto:christophe.gouault@6wind.com]
> Sent: Tuesday, October 08, 2013 8:21 AM
> To: netdev@vger.kernel.org
> Cc: David S. Miller; Cong Wang; Saurabh Mohan; Christophe Gouault
> Subject: [PATCH net] vti: get rid of nf mark rule in prerouting
> 
> This patch fixes and improves the use of vti interfaces (while lightly changing
> the way of configuring them).
> 

Good fix. Thanks for doing this!

^ permalink raw reply

* Re: bug in passing file descriptors
From: Steve Rago @ 2013-10-08 16:51 UTC (permalink / raw)
  To: Andi Kleen
  Cc: Andy Lutomirski, David Miller, Network Development,
	Michael Kerrisk-manpages, Eric W. Biederman
In-Reply-To: <20131008164108.GO6882@two.firstfloor.org>

On 10/08/2013 12:41 PM, Andi Kleen wrote:
>> I just want the semantics to be consistent.  If you want Linux to
>> always require applications that call recvmsg to provide a buffer
>> size of CMSG_SPACE bytes long to retrieve control information, then
>> fail the system call when the buffer is smaller.  But if you do
>> this, you risk breaking applications that work with FreeBSD, Mac OS
>> X, Solaris, and probably a few others.
>
> The primary concern is to be binary compatible with Linux.
>
> But not being compatible between 32bit and 64bit Linux processes on the same
> host would seem like a serious problem to me.
>
>> Regardless, copying 20 bytes and telling me you copied 24 is misleading and wrong.
>
> The question is could it break existing Linux applications to change it?
> And would it help with the 32/64bit compatibility?
>
> If not some other way to fix the compat layer would need to be found.
>
> -Andi
>

I'm not sure if a 64-bit process and a 32-bit process exchange file descriptors on the same system has a problem.  It 
certainly looks like the compat code does the right thing.  I can test this tonight if you want.  The discrepancy arises 
because file descriptors are 4-byte quantities and treated differently (for example, when more than one is specified, 
each one isn't padded to an 8-byte boundary).

The way I discovered the problem is that I had an example program in APUE that was validating that msg_controllen had 
the correct value after calling recvmsg().  It worked on my 32-bit platform, but when I recompiled it and ran it on my 
64-bit platform, the test failed, because msg_controllen was larger than the size that was sent via sendmsg().

Steve

^ permalink raw reply

* Re: [PATCH net-next] net: gro: allow to build full sized skb
From: Eric Dumazet @ 2013-10-08 16:50 UTC (permalink / raw)
  To: Ben Greear; +Cc: David Miller, netdev
In-Reply-To: <52542F77.2050308@candelatech.com>

On Tue, 2013-10-08 at 09:14 -0700, Ben Greear wrote:

> On multi-homed boxes you could easily have paths that would never need linearize
> and other paths that always need it, for whatever reason.
> 
> Maybe a per-netdev check for needs linearize instead of something global
> would be better...and maybe let users over-ride the default behaviour
> regardless?

Sure. Note that this has yet to be discussed.

I mentioned this in the changelog for completeness.

I know people just disabling GRO in forwarding setups, probably because
we lack control of latencies / bursts.

^ permalink raw reply

* Re: bug in passing file descriptors
From: Andi Kleen @ 2013-10-08 16:41 UTC (permalink / raw)
  To: Steve Rago
  Cc: Andy Lutomirski, Andi Kleen, David Miller, Network Development,
	Michael Kerrisk-manpages, Eric W. Biederman
In-Reply-To: <52543065.7000209@nec-labs.com>

> I just want the semantics to be consistent.  If you want Linux to
> always require applications that call recvmsg to provide a buffer
> size of CMSG_SPACE bytes long to retrieve control information, then
> fail the system call when the buffer is smaller.  But if you do
> this, you risk breaking applications that work with FreeBSD, Mac OS
> X, Solaris, and probably a few others.

The primary concern is to be binary compatible with Linux.

But not being compatible between 32bit and 64bit Linux processes on the same
host would seem like a serious problem to me.

> Regardless, copying 20 bytes and telling me you copied 24 is misleading and wrong.

The question is could it break existing Linux applications to change it?
And would it help with the 32/64bit compatibility?

If not some other way to fix the compat layer would need to be found.

-Andi

-- 
ak@linux.intel.com -- Speaking for myself only.

^ permalink raw reply

* Re: [PATCH net-next v2 2/5] xen-netback: Add support for IPv6 checksum offload from guest
From: Wei Liu @ 2013-10-08 16:19 UTC (permalink / raw)
  To: Paul Durrant
  Cc: Wei Liu, xen-devel@lists.xen.org, netdev@vger.kernel.org,
	David Vrabel, Ian Campbell
In-Reply-To: <9AAE0902D5BC7E449B7C8E4E778ABCD012F345@AMSPEX01CL01.citrite.net>

On Tue, Oct 08, 2013 at 02:50:27PM +0100, Paul Durrant wrote:
[...]
> > > -#define PKT_PROT_LEN    (ETH_HLEN + \
> > > -			 VLAN_HLEN + \
> > > -			 sizeof(struct iphdr) + MAX_IPOPTLEN + \
> > > -			 sizeof(struct tcphdr) + MAX_TCP_OPTION_SPACE)
> > > +#define PKT_PROT_LEN 128
> > >
> > 
> > Where does 128 come from?
> > 
> 
> It's just an arbitrary power of 2 that was chosen because it seems to
> cover most likely v6 headers and all v4 headers.
> 

Hmm... How about using the value of MAX_TCP_HEADER? I guess that can
cover all V4 / V6 headers.

MAX_TCP_HEADER varies, depending on configuration. To make sure we can
accommodate all guests packet we might need to use its maximum value
which can be as big as 128 + 128 + 48.

> > >  		if (recalculate_partial_csum) {
> > >  			struct tcphdr *tcph = tcp_hdr(skb);
> > > +
> > > +			header_size = skb->network_header +
> > > +				off +
> > > +				sizeof(struct tcphdr) +
> > > +				MAX_TCP_OPTION_SPACE;
> > > +			maybe_pull_tail(skb, header_size);
> > > +
> > 
> > I presume this function is checksum_setup stripped down to handle IPv4
> > packet. What's the purpose of changing its behaviour? Why the pull_tail
> > here?
> > 
> 
> We have to make sure that the TCP header is in the linear area as we
> are about to write to the checksum field. In practice, the 128 byte
> pull should guarantee this but in case that is varied later I wanted
> to make sure things did not start to fail in an add way.
> 

If you already set the pull size to maximum possible value then this
will not be necessary anymore, right?

> > > +	while ((off <= sizeof(struct ipv6hdr) + ntohs(ipv6h->payload_len))
> > &&
> > > +	       !done) {
> > > +		/* We only access at most the first 2 bytes of any option
> > header
> > > +		 * to determine the next one.
> > > +		 */
> > > +		header_size = skb->network_header + off + 2;
> > > +		maybe_pull_tail(skb, header_size);
> > > +
> > 
> > Will this cause performance problem? Is it possible that you pull too
> > many times?
> > 
> 
> I guess it means we may get two pulls for the TCP/UDP headers rather
> than one so could push the pulls into the individual cases if you
> think it will affect performance that badly.

Hmm... Not sure I get what you mean here. The main problem I'm seeing is
that maybe_pull_tail is called in every loop.

I would like to see as few pulls as possible because __pskb_pull_tail
can be expensive and only expected to use in "exceptional cases" (quoted
from the comment above that function).

Is it possible to pull TCP_MAX_HEADER bytes once to eliminate all other
pulls in checksum_setup{,_ipv4,_ipv6}?

> 
[...]
> > 
> > Can you make the logic explicit here?
> > 
> >    case IPPROTO_TCP:
> >    case IPPROTO_UDP:
> >         done = true;
> > 	break;
> >    default:
> >         break;
> > 
> > Another minor suggestion is that change "done" to "found" because you're
> > trying to find the two type of headers.
> > 
> 
> Yes, I could code it that way if you prefer.
> 

Explicity is better than implicity IMHO. After this change could you
also move the default branch (netdev_err) in the following "switch" to
the first "switch".

Wei.

^ permalink raw reply

* Re: bug in passing file descriptors
From: Steve Rago @ 2013-10-08 16:18 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Andi Kleen, David Miller, Network Development,
	Michael Kerrisk-manpages, Eric W. Biederman
In-Reply-To: <CALCETrVoUrcuzEx_TURoGE8_MJtZLOz-UDz6m69Vo6MCWG0zjg@mail.gmail.com>

On 10/08/2013 12:02 PM, Andy Lutomirski wrote:
> On Tue, Oct 8, 2013 at 7:32 AM, Steve Rago <sar@nec-labs.com> wrote:
>> On 10/07/2013 06:55 PM, Andi Kleen wrote:
>>>
>>> David Miller <davem@davemloft.net> writes:
>>>
>>>> From: Steve Rago <sar@nec-labs.com>
>>>> Date: Mon, 7 Oct 2013 16:29:15 -0400
>>>>
>>>>> On 10/07/2013 03:42 PM, David Miller wrote:
>>>>>>
>>>>>> There is no compatability issue.
>>>>>>
>>>>>> 32-bit tasks will always see the 4-byte align/length.
>>>>>> 64-bit tasks will always see the 8-byte align/length.
>>>>>>
>>>>>
>>>>> Really?  So when I compile my application on a 32-bit Linux box and
>>>>> then try to run it on a 64-bit Linux box, you're not going to overrun
>>>>> my buffer when CMSG_SPACE led me to allocate an insufficient amount of
>>>>> memory needed to account for padding on the 64-bit platform?
>>>>
>>>>
>>>> We have a compatability layer that gives 32-bit applications the
>>>> same behavior as if they had run on a 32-bit machine.
>>>>
>>>> Search around for the MSG_MSG_COMPAT flag and how that is used in
>>>> net/socket.c
>>>
>>>
>>> But it seems the compat layer doesn't handle this correctly,
>>> otherwise Steve's original test case would work.
>>>
>>> Must be a bug somewhere in the compat layer.
>>>
>>> -Andi
>>>
>>
>> I did some research last night and I think the problem stems from an
>> underspecified standard.  CMSG_LEN and CMSG_SPACE seem to have originated
>> with RFC 2292, which has since been obsoleted by RFC 3542.  The difference
>> is that CMSG_SPACE accounts for padding at the end, which is needed when you
>> stuff multiple cmsghdr objects in the same buffer.  CMSG_LEN is required to
>> be used to initialize the cmsg_len member of the structure.  When you only
>> have one cmsghdr object in your call to recvmsg, it is unclear whether you
>> need to have a buffer as large as CMSG_SPACE or CMSG_LEN.  Historically,
>> BSD-based platforms never had these macros and didn't return the ancillary
>> data if the space provided by the application wasn't big enough.  Linux,
>> *which has a bug*, won't copy more bytes than cmsg_len specifies when the
>> application uses CMSG_LEN instead of CMSG_SPACE, but then lies to the
>> application by overwriting msg_controllen.  Look in put_cmsg() in
>> net/core/scm.c: "cmlen" is calculated using CM_LEN, and then msg_controllen
>> is checked against it to make sure you don't overwrite the user's buffer.
>> However, in recvmsg(), msg_controllen is overwritten by "msg_sys.msg_control
>> - cmsg_ptr".  msg_control was incremented by CMSG_SPACE at the end of
>> scm_detach_fds().
>>
>> The Linux kernel code seems to copy the right amount of data, even in the
>> compat case, as far as I can tell.  The only bug I see is that
>> msg_controllen returns from recvmsg() with the wrong value.
>
> I don't see how the other behavior would be any less surprising.
> Suppose you had a control message with CMSG_LEN=12 and CMSG_SPACE=16.
> Then, with the semantics you want, one of them takes 12 bytes, two of
> them take 28 bytes, three take 44 bytes, etc.
>
> That seems considerably more surprising than having then take 16, 32,
> and 48 bytes respectively.
>
> --Andy
>

I just want the semantics to be consistent.  If you want Linux to always require applications that call recvmsg to 
provide a buffer size of CMSG_SPACE bytes long to retrieve control information, then fail the system call when the 
buffer is smaller.  But if you do this, you risk breaking applications that work with FreeBSD, Mac OS X, Solaris, and 
probably a few others.

No wonder the Single UNIX Specification didn't standardize these two macros.

Regardless, copying 20 bytes and telling me you copied 24 is misleading and wrong.

Steve

^ permalink raw reply

* Re: [PATCH net-next] net: gro: allow to build full sized skb
From: Ben Greear @ 2013-10-08 16:14 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: David Miller, netdev
In-Reply-To: <1381248143.12191.53.camel@edumazet-glaptop.roam.corp.google.com>

On 10/08/2013 09:02 AM, Eric Dumazet wrote:
> From: Eric Dumazet <edumazet@google.com>
>
> skb_gro_receive() is currently limited to 16 or 17 MSS per GRO skb,
> typically 24616 bytes, because it fills up to MAX_SKB_FRAGS frags.
>
> It's relatively easy to extend the skb using frag_list to allow
> more frags to be appended into the last sk_buff.
>
> This still builds very efficient skbs, and allows reaching 45 MSS per
> skb.
>
> (45 MSS GRO packet uses one skb plus a frag_list containing 2 additional
> sk_buff)
>
> High speed TCP flows benefit from this extension by lowering TCP stack
> cpu usage (less packets stored in receive queue, less ACK packets
> processed)
>
> Forwarding setups could be hurt, as such skbs will need to be
> linearized, although its not a new problem, as GRO could already
> provide skbs with a frag_list.
>
> We could make the 65536 bytes threshold a tunable to mitigate this.
>
> (First time we need to linearize skb in skb_needs_linearize(), we could
> lower the tunable to ~16*1460 so that following skb_gro_receive() calls
> build smaller skbs)

On multi-homed boxes you could easily have paths that would never need linearize
and other paths that always need it, for whatever reason.

Maybe a per-netdev check for needs linearize instead of something global
would be better...and maybe let users over-ride the default behaviour
regardless?

Thanks,
Ben

-- 
Ben Greear <greearb@candelatech.com>
Candela Technologies Inc  http://www.candelatech.com

^ permalink raw reply

* Re: bug in passing file descriptors
From: Andy Lutomirski @ 2013-10-08 16:02 UTC (permalink / raw)
  To: Steve Rago
  Cc: Andi Kleen, David Miller, Network Development,
	Michael Kerrisk-manpages, Eric W. Biederman
In-Reply-To: <52541769.1000306@nec-labs.com>

On Tue, Oct 8, 2013 at 7:32 AM, Steve Rago <sar@nec-labs.com> wrote:
> On 10/07/2013 06:55 PM, Andi Kleen wrote:
>>
>> David Miller <davem@davemloft.net> writes:
>>
>>> From: Steve Rago <sar@nec-labs.com>
>>> Date: Mon, 7 Oct 2013 16:29:15 -0400
>>>
>>>> On 10/07/2013 03:42 PM, David Miller wrote:
>>>>>
>>>>> There is no compatability issue.
>>>>>
>>>>> 32-bit tasks will always see the 4-byte align/length.
>>>>> 64-bit tasks will always see the 8-byte align/length.
>>>>>
>>>>
>>>> Really?  So when I compile my application on a 32-bit Linux box and
>>>> then try to run it on a 64-bit Linux box, you're not going to overrun
>>>> my buffer when CMSG_SPACE led me to allocate an insufficient amount of
>>>> memory needed to account for padding on the 64-bit platform?
>>>
>>>
>>> We have a compatability layer that gives 32-bit applications the
>>> same behavior as if they had run on a 32-bit machine.
>>>
>>> Search around for the MSG_MSG_COMPAT flag and how that is used in
>>> net/socket.c
>>
>>
>> But it seems the compat layer doesn't handle this correctly,
>> otherwise Steve's original test case would work.
>>
>> Must be a bug somewhere in the compat layer.
>>
>> -Andi
>>
>
> I did some research last night and I think the problem stems from an
> underspecified standard.  CMSG_LEN and CMSG_SPACE seem to have originated
> with RFC 2292, which has since been obsoleted by RFC 3542.  The difference
> is that CMSG_SPACE accounts for padding at the end, which is needed when you
> stuff multiple cmsghdr objects in the same buffer.  CMSG_LEN is required to
> be used to initialize the cmsg_len member of the structure.  When you only
> have one cmsghdr object in your call to recvmsg, it is unclear whether you
> need to have a buffer as large as CMSG_SPACE or CMSG_LEN.  Historically,
> BSD-based platforms never had these macros and didn't return the ancillary
> data if the space provided by the application wasn't big enough.  Linux,
> *which has a bug*, won't copy more bytes than cmsg_len specifies when the
> application uses CMSG_LEN instead of CMSG_SPACE, but then lies to the
> application by overwriting msg_controllen.  Look in put_cmsg() in
> net/core/scm.c: "cmlen" is calculated using CM_LEN, and then msg_controllen
> is checked against it to make sure you don't overwrite the user's buffer.
> However, in recvmsg(), msg_controllen is overwritten by "msg_sys.msg_control
> - cmsg_ptr".  msg_control was incremented by CMSG_SPACE at the end of
> scm_detach_fds().
>
> The Linux kernel code seems to copy the right amount of data, even in the
> compat case, as far as I can tell.  The only bug I see is that
> msg_controllen returns from recvmsg() with the wrong value.

I don't see how the other behavior would be any less surprising.
Suppose you had a control message with CMSG_LEN=12 and CMSG_SPACE=16.
Then, with the semantics you want, one of them takes 12 bytes, two of
them take 28 bytes, three take 44 bytes, etc.

That seems considerably more surprising than having then take 16, 32,
and 48 bytes respectively.

--Andy

^ permalink raw reply

* [PATCH net-next] net: gro: allow to build full sized skb
From: Eric Dumazet @ 2013-10-08 16:02 UTC (permalink / raw)
  To: David Miller; +Cc: netdev

From: Eric Dumazet <edumazet@google.com>

skb_gro_receive() is currently limited to 16 or 17 MSS per GRO skb,
typically 24616 bytes, because it fills up to MAX_SKB_FRAGS frags.

It's relatively easy to extend the skb using frag_list to allow
more frags to be appended into the last sk_buff.

This still builds very efficient skbs, and allows reaching 45 MSS per
skb.

(45 MSS GRO packet uses one skb plus a frag_list containing 2 additional
sk_buff)

High speed TCP flows benefit from this extension by lowering TCP stack
cpu usage (less packets stored in receive queue, less ACK packets
processed)

Forwarding setups could be hurt, as such skbs will need to be
linearized, although its not a new problem, as GRO could already
provide skbs with a frag_list.

We could make the 65536 bytes threshold a tunable to mitigate this.

(First time we need to linearize skb in skb_needs_linearize(), we could
lower the tunable to ~16*1460 so that following skb_gro_receive() calls
build smaller skbs)

Signed-off-by: Eric Dumazet <edumazet@google.com>
---
 net/core/skbuff.c |   43 ++++++++++++++++++++++++++-----------------
 1 file changed, 26 insertions(+), 17 deletions(-)

diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index d81cff1..8ead744 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -2936,32 +2936,30 @@ EXPORT_SYMBOL_GPL(skb_segment);
 
 int skb_gro_receive(struct sk_buff **head, struct sk_buff *skb)
 {
-	struct sk_buff *p = *head;
-	struct sk_buff *nskb;
-	struct skb_shared_info *skbinfo = skb_shinfo(skb);
-	struct skb_shared_info *pinfo = skb_shinfo(p);
-	unsigned int headroom;
-	unsigned int len = skb_gro_len(skb);
+	struct skb_shared_info *pinfo, *skbinfo = skb_shinfo(skb);
 	unsigned int offset = skb_gro_offset(skb);
 	unsigned int headlen = skb_headlen(skb);
+	struct sk_buff *nskb, *lp, *p = *head;
+	unsigned int len = skb_gro_len(skb);
 	unsigned int delta_truesize;
+	unsigned int headroom;
 
-	if (p->len + len >= 65536)
+	if (unlikely(p->len + len >= 65536))
 		return -E2BIG;
 
-	if (pinfo->frag_list)
-		goto merge;
-	else if (headlen <= offset) {
+	lp = NAPI_GRO_CB(p)->last ?: p;
+	pinfo = skb_shinfo(lp);
+
+	if (headlen <= offset) {
 		skb_frag_t *frag;
 		skb_frag_t *frag2;
 		int i = skbinfo->nr_frags;
 		int nr_frags = pinfo->nr_frags + i;
 
-		offset -= headlen;
-
 		if (nr_frags > MAX_SKB_FRAGS)
-			return -E2BIG;
+			goto merge;
 
+		offset -= headlen;
 		pinfo->nr_frags = nr_frags;
 		skbinfo->nr_frags = 0;
 
@@ -2992,7 +2990,7 @@ int skb_gro_receive(struct sk_buff **head, struct sk_buff *skb)
 		unsigned int first_offset;
 
 		if (nr_frags + 1 + skbinfo->nr_frags > MAX_SKB_FRAGS)
-			return -E2BIG;
+			goto merge;
 
 		first_offset = skb->data -
 			       (unsigned char *)page_address(page) +
@@ -3010,7 +3008,10 @@ int skb_gro_receive(struct sk_buff **head, struct sk_buff *skb)
 		delta_truesize = skb->truesize - SKB_DATA_ALIGN(sizeof(struct sk_buff));
 		NAPI_GRO_CB(skb)->free = NAPI_GRO_FREE_STOLEN_HEAD;
 		goto done;
-	} else if (skb_gro_len(p) != pinfo->gso_size)
+	}
+	if (pinfo->frag_list)
+		goto merge;
+	if (skb_gro_len(p) != pinfo->gso_size)
 		return -E2BIG;
 
 	headroom = skb_headroom(p);
@@ -3062,16 +3063,24 @@ merge:
 
 	__skb_pull(skb, offset);
 
-	NAPI_GRO_CB(p)->last->next = skb;
+	if (!NAPI_GRO_CB(p)->last)
+		skb_shinfo(p)->frag_list = skb;
+	else
+		NAPI_GRO_CB(p)->last->next = skb;
 	NAPI_GRO_CB(p)->last = skb;
 	skb_header_release(skb);
+	lp = p;
 
 done:
 	NAPI_GRO_CB(p)->count++;
 	p->data_len += len;
 	p->truesize += delta_truesize;
 	p->len += len;
-
+	if (lp != p) {
+		lp->data_len += len;
+		lp->truesize += delta_truesize;
+		lp->len += len;
+	}
 	NAPI_GRO_CB(skb)->same_flow = 1;
 	return 0;
 }

^ permalink raw reply related

* [PATCH net] vti: get rid of nf mark rule in prerouting
From: Christophe Gouault @ 2013-10-08 15:21 UTC (permalink / raw)
  To: netdev; +Cc: David S. Miller, Cong Wang, Saurabh Mohan, Christophe Gouault

This patch fixes and improves the use of vti interfaces (while
lightly changing the way of configuring them).

Currently:

- it is necessary to identify and mark inbound IPsec
  packets destined to each vti interface, via netfilter rules in
  the mangle table at prerouting hook.

- the vti module cannot retrieve the right tunnel in input since
  commit b9959fd3: vti tunnels all have an i_key, but the tunnel lookup
  is done with flag TUNNEL_NO_KEY, so there no chance to retrieve them.

- the i_key is used by the outbound processing as a mark to lookup
  for the right SP and SA bundle.

This patch uses the o_key to store the vti mark (instead of i_key) and
enables:

- to avoid the need for previously marking the inbound skbuffs via a
  netfilter rule.
- to properly retrieve the right tunnel in input, only based on the IPsec
  packet outer addresses.
- to properly perform an inbound policy check (using the tunnel o_key
  as a mark).
- to properly perform an outbound SPD and SAD lookup (using the tunnel
  o_key as a mark).
- to keep the current mark of the skbuff. The skbuff mark is neither
  used nor changed by the vti interface. Only the vti interface o_key
  is used.

SAs have a wildcard mark.
SPs have a mark equal to the vti interface o_key.

The vti interface must be created as follows (i_key = 0, o_key = mark):

   ip link add vti1 mode vti local 1.1.1.1 remote 2.2.2.2 okey 1

The SPs attached to vti1 must be created as follows (mark = vti1 o_key):

   ip xfrm policy add dir out mark 1 tmpl src 1.1.1.1 dst 2.2.2.2 \
      proto esp mode tunnel
   ip xfrm policy add dir in  mark 1 tmpl src 2.2.2.2 dst 1.1.1.1 \
      proto esp mode tunnel

The SAs are created with the default wildcard mark. There is no
distinction between global vs. vti SAs. Just their addresses will
possibly link them to a vti interface:

   ip xfrm state add src 1.1.1.1 dst 2.2.2.2 proto esp spi 1000 mode tunnel \
                 enc "cbc(aes)" "azertyuiopqsdfgh"

   ip xfrm state add src 2.2.2.2 dst 1.1.1.1 proto esp spi 2000 mode tunnel \
                 enc "cbc(aes)" "sqbdhgqsdjqjsdfh"

To avoid matching "global" (not vti) SPs in vti interfaces, global SPs
should no use the default wildcard mark, but explicitly match mark 0.

To avoid a double SPD lookup in input and output (in global and vti SPDs),
the NOPOLICY and NOXFRM options should be set on the vti interfaces:

   echo 1 > /proc/sys/net/ipv4/conf/vti1/disable_policy
   echo 1 > /proc/sys/net/ipv4/conf/vti1/disable_xfrm

The outgoing traffic is steered to vti1 by a route via the vti interface:

   ip route add 192.168.0.0/16 dev vti1

The incoming IPsec traffic is steered to vti1 because its outer addresses
match the vti1 tunnel configuration.

Signed-off-by: Christophe Gouault <christophe.gouault@6wind.com>
---
This is is both a fix and enhancement patch. However, there are 2 ways
of fixing the inbound processing bug:
- either keep the current configuration model (ikey + netfilter rule)
  and change the tunnel lookup method. This patch would then be reverted
  by the enhancement (this sounds counterproductive).
- or directly change the configuration model (okey, no netfilter rule) and keep
  the current tunnel lookup method.

 net/ipv4/ip_vti.c |   14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c
index e805e7b..6e87f85 100644
--- a/net/ipv4/ip_vti.c
+++ b/net/ipv4/ip_vti.c
@@ -125,8 +125,17 @@ static int vti_rcv(struct sk_buff *skb)
 				  iph->saddr, iph->daddr, 0);
 	if (tunnel != NULL) {
 		struct pcpu_tstats *tstats;
+		u32 oldmark = skb->mark;
+		int ret;
 
-		if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
+
+		/* temporarily mark the skb with the tunnel o_key, to
+		 * only match policies with this mark.
+		 */
+		skb->mark = be32_to_cpu(tunnel->parms.o_key);
+		ret = xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb);
+		skb->mark = oldmark;
+		if (!ret)
 			return -1;
 
 		tstats = this_cpu_ptr(tunnel->dev->tstats);
@@ -135,7 +144,6 @@ static int vti_rcv(struct sk_buff *skb)
 		tstats->rx_bytes += skb->len;
 		u64_stats_update_end(&tstats->syncp);
 
-		skb->mark = 0;
 		secpath_reset(skb);
 		skb->dev = tunnel->dev;
 		return 1;
@@ -167,7 +175,7 @@ static netdev_tx_t vti_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
 
 	memset(&fl4, 0, sizeof(fl4));
 	flowi4_init_output(&fl4, tunnel->parms.link,
-			   be32_to_cpu(tunnel->parms.i_key), RT_TOS(tos),
+			   be32_to_cpu(tunnel->parms.o_key), RT_TOS(tos),
 			   RT_SCOPE_UNIVERSE,
 			   IPPROTO_IPIP, 0,
 			   dst, tiph->saddr, 0, 0);
-- 
1.7.10.4

^ permalink raw reply related

* [PATCH] rtlwifi: rtl8192cu: Fix error in pointer arithmetic
From: Larry Finger @ 2013-10-08 15:18 UTC (permalink / raw)
  To: linville; +Cc: linux-wireless, Larry Finger, netdev, Mark Cave-Ayland, Stable

From: Mark Cave-Ayland <mark.cave-ayland@ilande.co.uk>

An error in calculating the offset in an skb causes the driver to read
essential device info from the wrong locations. The main effect is that
automatic gain calculations are nonsense.

Signed-off-by: Mark Cave-Ayland <mark.cave-ayland@ilande.co.uk>
Signed-off-by: Larry Finger <Larry.Finger@lwfinger.net>
Cc: Stable <stable@vger.kernel.org> [2.6.39+]
---
 drivers/net/wireless/rtlwifi/rtl8192cu/trx.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/net/wireless/rtlwifi/rtl8192cu/trx.c b/drivers/net/wireless/rtlwifi/rtl8192cu/trx.c
index 04c7e57..25e50ff 100644
--- a/drivers/net/wireless/rtlwifi/rtl8192cu/trx.c
+++ b/drivers/net/wireless/rtlwifi/rtl8192cu/trx.c
@@ -343,7 +343,8 @@ bool rtl92cu_rx_query_desc(struct ieee80211_hw *hw,
 					(bool)GET_RX_DESC_PAGGR(pdesc));
 	rx_status->mactime = GET_RX_DESC_TSFL(pdesc);
 	if (phystatus) {
-		p_drvinfo = (struct rx_fwinfo_92c *)(pdesc + RTL_RX_DESC_SIZE);
+		p_drvinfo = (struct rx_fwinfo_92c *)(skb->data +
+						     stats->rx_bufshift);
 		rtl92c_translate_rx_signal_stuff(hw, skb, stats, pdesc,
 						 p_drvinfo);
 	}
-- 
1.8.1.4

^ permalink raw reply related

* [RFC] net: stmmac: keep RXC running in LPI mode to avoid system overload
From: Romain Baeriswyl @ 2013-10-08 15:06 UTC (permalink / raw)
  To: Giuseppe Cavallaro
  Cc: David S. Miller, netdev, linux-kernel, Romain Baeriswyl

In order to avoid system overload, the clock RXC from the Phy should not be 
stopped when in LPI mode.

With the RTL8211E PHY which support EEE mode and with Apple Airport Extreme that 
supports it also, the kernel get frozen as soon as some Ethernet transfers are 
on-going. System seems to be overloaded with too many interrupts. The 'top' 
command reports often around ~80% irq.

By letting the RXC clock running even in LPI mode as proposed below, the issue 
seems solved. Is it the right way to proceed?  

What is the power impact to not stop RXC in LPI mode?

---
 drivers/net/ethernet/stmicro/stmmac/stmmac_main.c |    6 ++++--
 1 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
index e9eab29..d40d26b 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
@@ -314,7 +314,8 @@ bool stmmac_eee_init(struct stmmac_priv *priv)
 	/* MAC core supports the EEE feature. */
 	if (priv->dma_cap.eee) {
 		/* Check if the PHY supports EEE */
-		if (phy_init_eee(priv->phydev, 1))
+		/* Keeps RXC running in LPI mode to avoid stability issue */
+		if (phy_init_eee(priv->phydev, 0))
 			goto out;
 
 		if (!priv->eee_active) {
@@ -770,7 +771,8 @@ static void stmmac_adjust_link(struct net_device *dev)
 	/* At this stage, it could be needed to setup the EEE or adjust some
 	 * MAC related HW registers.
 	 */
-	priv->eee_enabled = stmmac_eee_init(priv);
+	if (new_state)
+		priv->eee_enabled = stmmac_eee_init(priv);
 
 	spin_unlock_irqrestore(&priv->lock, flags);
 }
-- 
1.7.1

^ permalink raw reply related

* [PATCH iproute2] iplink: update available type list
From: Nicolas Dichtel @ 2013-10-08 14:59 UTC (permalink / raw)
  To: shemminger; +Cc: netdev, Nicolas Dichtel

macvtap and vti were missing.

Signed-off-by: Nicolas Dichtel <nicolas.dichtel@6wind.com>
---
 ip/iplink.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/ip/iplink.c b/ip/iplink.c
index 16cb6fed9fcf..6cde731ac328 100644
--- a/ip/iplink.c
+++ b/ip/iplink.c
@@ -84,9 +84,9 @@ void iplink_usage(void)
 
 	if (iplink_have_newlink()) {
 		fprintf(stderr, "\n");
-		fprintf(stderr, "TYPE := { vlan | veth | vcan | dummy | ifb | macvlan | can |\n");
-		fprintf(stderr, "          bridge | ipoib | ip6tnl | ipip | sit | vxlan |\n");
-		fprintf(stderr, "          gre | gretap | ip6gre | ip6gretap }\n");
+		fprintf(stderr, "TYPE := { vlan | veth | vcan | dummy | ifb | macvlan | macvtap |\n");
+		fprintf(stderr, "          can | bridge | ipoib | ip6tnl | ipip | sit | vxlan |\n");
+		fprintf(stderr, "          gre | gretap | ip6gre | ip6gretap | vti }\n");
 	}
 	exit(-1);
 }
-- 
1.8.4.1

^ permalink raw reply related

* Re: Neterion and UFO handling [was: Re: [PATCH] ipv6: udp packets following an UFO enqueued packet need also be handled by UFO]
From: Hannes Frederic Sowa @ 2013-10-08 14:53 UTC (permalink / raw)
  To: Jon Mason
  Cc: Jiri Pirko, netdev, yoshfuji, David Miller, kuznet, jmorris,
	kaber, herbert, Eric Dumazet
In-Reply-To: <CAPoiz9wxxvHfmmMOCvPstzT51BafGfU1u5umxBQ8kqqCE=OzbQ@mail.gmail.com>

On Tue, Oct 08, 2013 at 01:07:29AM -0700, Jon Mason wrote:
> On Wed, Oct 2, 2013 at 9:27 AM, Hannes Frederic Sowa
> <hannes@stressinduktion.org> wrote:
> > Hi!
> >
> > I have a question regarding UFO and the neterion driver, which as the only one
> > advertises hardware UFO support:
> >
> > The patch discusses in this thread
> > http://thread.gmane.org/gmane.linux.network/284348/focus=285405 could change
> > some semantics how packets are constructed before submitted to the driver.
> >
> > We currently guarantee that we have the MAC/IP/UDP header in skb->data and the
> > payload is attached in the skb's frags. With the changes discussed in this
> > thread it is possible that we also append to skb->data some amount of data
> > which is not targeted for the header. From reading the driver sources it seems
> > the hardware interprets the skb->data to skb_headlen as the header, so we
> > could include some data in the fragments more than once.
> 
> From my reading of the HW Spec and a quick look at the driver, it
> appears that the driver is using one entry in the TX ring for the
> header and another for the body of the packet to be fragmented (which
> is what the hardware wants).  I don't understand what you are saying,
> but if you are asking if simply appending a new header & data to the
> end of skb->data will get it out on the wire correct, I don't believe
> it will.

No this is not what I tried to say. I'll try to be more clear this
time. ;)

We start with an UDP socket which is corked. As soon as we write the
first few bytes (smaller than the mtu) onto this socket we put the
header in place and the rest of the data is just appended behind the
header directly in skb->data via plain ip_append_data.

Now a second write with a length > mtu happens: The ip(6)_append_data
will branch to ufo_append. This will fetch the first skb and append
to skb->frags.  gso_type and gso_size will be updated on this skb (this
currently does not happen but will with the patches discussed in this
thread).

If this packet is transmitted down to the device driver we have the udp
header in skb->data *and* also the payload from the first write. The
payload from the second write is appended as a frag and gso_type and
gso_size are set. This header+payload seem to be mapped just after the
ufo_in_band_v descriptor as the header in the first tx descriptor:

   4174         txdp->Buffer_Pointer = pci_map_single(sp->pdev, skb->data,
   4175                                               frg_len, PCI_DMA_TODEVICE);

frg_len is set to skb_headlen(skb). This happens right after setting up
the descriptor for the in-band ufo data.

My guess is that this data isn't split currently by the neterion driver
(at least I could not find it in the driver as Eric showed it for bnx2x)
so it might reappear in the packets when the hardware fragments the
packet and places the first tx ring in front of every packet.

Before these changes we never updated the gso_type and gso_size even when
we did append via UFO. So we never had payload in an UFO marked skb->data,
only the headers. Now we could also end up with a some payload in the
first TX ring, which you said is only for the header.

> I do have hardware that I can try the patch on, if you can walk me
> through the use case (unless it is as easy as setup an IPv6 connection
> and ping).

Ok, testing this should not be that complicated:

We can test this with plain IPv4/UDP sockets. I would suggest a net-next kernel
with this patch from Jiri applied: http://patchwork.ozlabs.org/patch/279691/

--- >8 ---
#include <sys/types.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <linux/udp.h>
#include <stdio.h>

int test(int mtu)
{
        int fd;
        const int one = 1;
        const int off = 0;
        struct sockaddr_in addr = {.sin_family = AF_INET, .sin_port = htons(53) };
        unsigned char buffer[3701];

        inet_pton(AF_INET, "127.0.0.1", &addr.sin_addr);

        fd = socket(AF_INET, SOCK_DGRAM, 0);
        connect(fd, (struct sockaddr *) &addr, sizeof(addr));

        setsockopt(fd, IPPROTO_UDP, UDP_CORK, &one, sizeof(one));

        write(fd, "    ", 4);
        write(fd, buffer, sizeof(buffer));
        write(fd, " ", 1);

        setsockopt(fd, IPPROTO_UDP, UDP_CORK, &off, sizeof(off));

        close(fd);
}

int main() {
        test(1280);
}
--- >8 ---

I left out error handling so it is better observed with strace if
something went wrong.

You should change the port number and ip address to something reasonable
for your network. My guess would be that the spaces (0x20) of the first
write is now placed between UDP header and payload of every packet
fragmented by the hardware. Would be nice to hear that I am wrong. ;)

Be aware that the above program can cause memory corruption in the kernel
if you did not apply Jiri's patch.

Thanks for helping!

  Hannes

^ permalink raw reply

* Re: [PATCH] mac80211: port CCMP to cryptoapi's CCM driver
From: Ard Biesheuvel @ 2013-10-08 14:52 UTC (permalink / raw)
  To: Johannes Berg
  Cc: David Laight,
	<linux-wireless-u79uwXL29TY76Z2rM5mHXA@public.gmane.org>,
	<netdev-u79uwXL29TY76Z2rM5mHXA@public.gmane.org>,
	<patches-QSEj5FYQhm4dnm+yROfE0A@public.gmane.org>,
	<linville-2XuSBdqkA4R54TAoqtyWWQ@public.gmane.org>
In-Reply-To: <1381239951.13359.13.camel-8Nb76shvtaUJvtFkdXX2HixXY32XiHfO@public.gmane.org>

On 8 October 2013 15:45, Johannes Berg <johannes-cdvu00un1VgdHxzADdlk8Q@public.gmane.org> wrote:
> On Tue, 2013-10-08 at 15:45 +0200, Johannes Berg wrote:
>> On Tue, 2013-10-08 at 15:41 +0200, Ard Biesheuvel wrote:
>>
>> > Actually, as the size is always the same, it should be feasible to
>> > alloc a couple of request structs at init time. would one for rx and
>> > one for tx be sufficient? or is this code more reentrant than that?
>>
>> TX can run concurrently on multiple (four) queues using the same key.
>
> And maybe even more with injection ... I wouldn't go there.
>

OK, clear.

If you think this patch has any merit at all, I am happy to modify it
so that it kmalloc()s the request struct with GFP_ATOMIC at every
invocation.
However, personally I don't think this should be necessary and in fact
my patch removes a stack allocation of u8[48] (from
ieee80211_crypto_ccmp_decrypt() and from ccmp_encrypt_skb() in wpa.c)
so it does even out a bit.

regards,
Ard.
--
To unsubscribe from this list: send the line "unsubscribe linux-wireless" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* RE: [PATCH] xen-netback: Don't destroy the netdev until the vif is shut
From: Paul Durrant @ 2013-10-08 14:39 UTC (permalink / raw)
  To: David Miller; +Cc: netdev@vger.kernel.org, Wei Liu
In-Reply-To: <20131004.002248.789391156050342547.davem@davemloft.net>

> -----Original Message-----
> From: David Miller [mailto:davem@davemloft.net]
> Sent: 04 October 2013 05:23
> To: Paul Durrant
> Cc: netdev@vger.kernel.org; Wei Liu
> Subject: Re: [PATCH] xen-netback: Don't destroy the netdev until the vif is
> shut
> 
> 
> Can one of you do -stable backports of this change for v3.11 and any
> other active -stable branch where this fix is relevant?
> 
> I gave it a shot, and it got outside my realm of expertise.
> 
> Thanks!

Ok. Here's a backport to 3.11.

  Paul

--8<--
>From 158011889a79ad7cebdd30d551adcfeae80b8989 Mon Sep 17 00:00:00 2001
From: Paul Durrant <paul.durrant@citrix.com>
Date: Tue, 8 Oct 2013 14:56:44 +0100
Subject: [PATCH] xen-netback: Don't destroy the netdev until the vif is shut
 down

[ upstream commit id: 279f438e36c0a70b23b86d2090aeec50155034a9 ]

Without this patch, if a frontend cycles through states Closing
and Closed (which Windows frontends need to do) then the netdev
will be destroyed and requires re-invocation of hotplug scripts
to restore state before the frontend can move to Connected. Thus
when udev is not in use the backend gets stuck in InitWait.

With this patch, the netdev is left alone whilst the backend is
still online and is only de-registered and freed just prior to
destroying the vif (which is also nicely symmetrical with the
netdev allocation and registration being done during probe) so
no re-invocation of hotplug scripts is required.

Signed-off-by: Paul Durrant <paul.durrant@citrix.com>
Cc: David Vrabel <david.vrabel@citrix.com>
Cc: Wei Liu <wei.liu2@citrix.com>
Cc: Ian Campbell <ian.campbell@citrix.com>
---
 drivers/net/xen-netback/common.h    |    1 +
 drivers/net/xen-netback/interface.c |   23 +++++++++--------------
 drivers/net/xen-netback/xenbus.c    |   17 ++++++++++++-----
 3 files changed, 22 insertions(+), 19 deletions(-)

diff --git a/drivers/net/xen-netback/common.h b/drivers/net/xen-netback/common.h
index 8a4d77e..4d9a5e7 100644
--- a/drivers/net/xen-netback/common.h
+++ b/drivers/net/xen-netback/common.h
@@ -120,6 +120,7 @@ int xenvif_connect(struct xenvif *vif, unsigned long tx_ring_ref,
 		   unsigned long rx_ring_ref, unsigned int tx_evtchn,
 		   unsigned int rx_evtchn);
 void xenvif_disconnect(struct xenvif *vif);
+void xenvif_free(struct xenvif *vif);
 
 void xenvif_get(struct xenvif *vif);
 void xenvif_put(struct xenvif *vif);
diff --git a/drivers/net/xen-netback/interface.c b/drivers/net/xen-netback/interface.c
index 087d2db..73336c1 100644
--- a/drivers/net/xen-netback/interface.c
+++ b/drivers/net/xen-netback/interface.c
@@ -326,6 +326,9 @@ struct xenvif *xenvif_alloc(struct device *parent, domid_t domid,
 	}
 
 	netdev_dbg(dev, "Successfully created xenvif\n");
+
+	__module_get(THIS_MODULE);
+
 	return vif;
 }
 
@@ -413,12 +416,6 @@ void xenvif_carrier_off(struct xenvif *vif)
 
 void xenvif_disconnect(struct xenvif *vif)
 {
-	/* Disconnect funtion might get called by generic framework
-	 * even before vif connects, so we need to check if we really
-	 * need to do a module_put.
-	 */
-	int need_module_put = 0;
-
 	if (netif_carrier_ok(vif->dev))
 		xenvif_carrier_off(vif);
 
@@ -432,18 +429,16 @@ void xenvif_disconnect(struct xenvif *vif)
 			unbind_from_irqhandler(vif->tx_irq, vif);
 			unbind_from_irqhandler(vif->rx_irq, vif);
 		}
-		/* vif->irq is valid, we had a module_get in
-		 * xenvif_connect.
-		 */
-		need_module_put = 1;
 	}
 
-	unregister_netdev(vif->dev);
-
 	xen_netbk_unmap_frontend_rings(vif);
+}
+
+void xenvif_free(struct xenvif *vif)
+{
+	unregister_netdev(vif->dev);
 
 	free_netdev(vif->dev);
 
-	if (need_module_put)
-		module_put(THIS_MODULE);
+	module_put(THIS_MODULE);
 }
diff --git a/drivers/net/xen-netback/xenbus.c b/drivers/net/xen-netback/xenbus.c
index 1fe48fe3..a53782e 100644
--- a/drivers/net/xen-netback/xenbus.c
+++ b/drivers/net/xen-netback/xenbus.c
@@ -42,7 +42,7 @@ static int netback_remove(struct xenbus_device *dev)
 	if (be->vif) {
 		kobject_uevent(&dev->dev.kobj, KOBJ_OFFLINE);
 		xenbus_rm(XBT_NIL, dev->nodename, "hotplug-status");
-		xenvif_disconnect(be->vif);
+		xenvif_free(be->vif);
 		be->vif = NULL;
 	}
 	kfree(be);
@@ -213,9 +213,18 @@ static void disconnect_backend(struct xenbus_device *dev)
 {
 	struct backend_info *be = dev_get_drvdata(&dev->dev);
 
+	if (be->vif)
+		xenvif_disconnect(be->vif);
+}
+
+static void destroy_backend(struct xenbus_device *dev)
+{
+	struct backend_info *be = dev_get_drvdata(&dev->dev);
+
 	if (be->vif) {
+		kobject_uevent(&dev->dev.kobj, KOBJ_OFFLINE);
 		xenbus_rm(XBT_NIL, dev->nodename, "hotplug-status");
-		xenvif_disconnect(be->vif);
+		xenvif_free(be->vif);
 		be->vif = NULL;
 	}
 }
@@ -246,14 +255,11 @@ static void frontend_changed(struct xenbus_device *dev,
 	case XenbusStateConnected:
 		if (dev->state == XenbusStateConnected)
 			break;
-		backend_create_xenvif(be);
 		if (be->vif)
 			connect(be);
 		break;
 
 	case XenbusStateClosing:
-		if (be->vif)
-			kobject_uevent(&dev->dev.kobj, KOBJ_OFFLINE);
 		disconnect_backend(dev);
 		xenbus_switch_state(dev, XenbusStateClosing);
 		break;
@@ -262,6 +268,7 @@ static void frontend_changed(struct xenbus_device *dev,
 		xenbus_switch_state(dev, XenbusStateClosed);
 		if (xenbus_dev_is_online(dev))
 			break;
+		destroy_backend(dev);
 		/* fall through if not online */
 	case XenbusStateUnknown:
 		device_unregister(&dev->dev);
-- 
1.7.10.4

^ permalink raw reply related

* RE: [PATCH] xen-netback: Don't destroy the netdev until the vif is shut
From: Paul Durrant @ 2013-10-08 14:41 UTC (permalink / raw)
  To: David Miller; +Cc: netdev@vger.kernel.org, Wei Liu
In-Reply-To: <20131004.002248.789391156050342547.davem@davemloft.net>

> -----Original Message-----
> From: David Miller [mailto:davem@davemloft.net]
> Sent: 04 October 2013 05:23
> To: Paul Durrant
> Cc: netdev@vger.kernel.org; Wei Liu
> Subject: Re: [PATCH] xen-netback: Don't destroy the netdev until the vif is
> shut
> 
> 
> Can one of you do -stable backports of this change for v3.11 and any
> other active -stable branch where this fix is relevant?
> 
> I gave it a shot, and it got outside my realm of expertise.
> 
> Thanks!

Here is a patch for 3.10.

--8<--
>From e626fc73f445f05d976756d022614f06461cab74 Mon Sep 17 00:00:00 2001
From: Paul Durrant <paul.durrant@citrix.com>
Date: Tue, 8 Oct 2013 14:22:56 +0100
Subject: [PATCH] xen-netback: Don't destroy the netdev until the vif is shut
 down

[ upstream commit id: 279f438e36c0a70b23b86d2090aeec50155034a9 ]

Without this patch, if a frontend cycles through states Closing
and Closed (which Windows frontends need to do) then the netdev
will be destroyed and requires re-invocation of hotplug scripts
to restore state before the frontend can move to Connected. Thus
when udev is not in use the backend gets stuck in InitWait.

With this patch, the netdev is left alone whilst the backend is
still online and is only de-registered and freed just prior to
destroying the vif (which is also nicely symmetrical with the
netdev allocation and registration being done during probe) so
no re-invocation of hotplug scripts is required.

Signed-off-by: Paul Durrant <paul.durrant@citrix.com>
Cc: David Vrabel <david.vrabel@citrix.com>
Cc: Wei Liu <wei.liu2@citrix.com>
Cc: Ian Campbell <ian.campbell@citrix.com>
---
 drivers/net/xen-netback/common.h    |    1 +
 drivers/net/xen-netback/interface.c |   12 ++++++++++--
 drivers/net/xen-netback/xenbus.c    |   17 ++++++++++++-----
 3 files changed, 23 insertions(+), 7 deletions(-)

diff --git a/drivers/net/xen-netback/common.h b/drivers/net/xen-netback/common.h
index 9d7f172..1a28508 100644
--- a/drivers/net/xen-netback/common.h
+++ b/drivers/net/xen-netback/common.h
@@ -115,6 +115,7 @@ struct xenvif *xenvif_alloc(struct device *parent,
 int xenvif_connect(struct xenvif *vif, unsigned long tx_ring_ref,
 		   unsigned long rx_ring_ref, unsigned int evtchn);
 void xenvif_disconnect(struct xenvif *vif);
+void xenvif_free(struct xenvif *vif);
 
 void xenvif_get(struct xenvif *vif);
 void xenvif_put(struct xenvif *vif);
diff --git a/drivers/net/xen-netback/interface.c b/drivers/net/xen-netback/interface.c
index d984141..3a294c2 100644
--- a/drivers/net/xen-netback/interface.c
+++ b/drivers/net/xen-netback/interface.c
@@ -304,6 +304,9 @@ struct xenvif *xenvif_alloc(struct device *parent, domid_t domid,
 	}
 
 	netdev_dbg(dev, "Successfully created xenvif\n");
+
+	__module_get(THIS_MODULE);
+
 	return vif;
 }
 
@@ -369,9 +372,14 @@ void xenvif_disconnect(struct xenvif *vif)
 	if (vif->irq)
 		unbind_from_irqhandler(vif->irq, vif);
 
-	unregister_netdev(vif->dev);
-
 	xen_netbk_unmap_frontend_rings(vif);
+}
+
+void xenvif_free(struct xenvif *vif)
+{
+	unregister_netdev(vif->dev);
 
 	free_netdev(vif->dev);
+
+	module_put(THIS_MODULE);
 }
diff --git a/drivers/net/xen-netback/xenbus.c b/drivers/net/xen-netback/xenbus.c
index 410018c..abe24ff 100644
--- a/drivers/net/xen-netback/xenbus.c
+++ b/drivers/net/xen-netback/xenbus.c
@@ -42,7 +42,7 @@ static int netback_remove(struct xenbus_device *dev)
 	if (be->vif) {
 		kobject_uevent(&dev->dev.kobj, KOBJ_OFFLINE);
 		xenbus_rm(XBT_NIL, dev->nodename, "hotplug-status");
-		xenvif_disconnect(be->vif);
+		xenvif_free(be->vif);
 		be->vif = NULL;
 	}
 	kfree(be);
@@ -203,9 +203,18 @@ static void disconnect_backend(struct xenbus_device *dev)
 {
 	struct backend_info *be = dev_get_drvdata(&dev->dev);
 
+	if (be->vif)
+		xenvif_disconnect(be->vif);
+}
+
+static void destroy_backend(struct xenbus_device *dev)
+{
+	struct backend_info *be = dev_get_drvdata(&dev->dev);
+
 	if (be->vif) {
+		kobject_uevent(&dev->dev.kobj, KOBJ_OFFLINE);
 		xenbus_rm(XBT_NIL, dev->nodename, "hotplug-status");
-		xenvif_disconnect(be->vif);
+		xenvif_free(be->vif);
 		be->vif = NULL;
 	}
 }
@@ -237,14 +246,11 @@ static void frontend_changed(struct xenbus_device *dev,
 	case XenbusStateConnected:
 		if (dev->state == XenbusStateConnected)
 			break;
-		backend_create_xenvif(be);
 		if (be->vif)
 			connect(be);
 		break;
 
 	case XenbusStateClosing:
-		if (be->vif)
-			kobject_uevent(&dev->dev.kobj, KOBJ_OFFLINE);
 		disconnect_backend(dev);
 		xenbus_switch_state(dev, XenbusStateClosing);
 		break;
@@ -253,6 +259,7 @@ static void frontend_changed(struct xenbus_device *dev,
 		xenbus_switch_state(dev, XenbusStateClosed);
 		if (xenbus_dev_is_online(dev))
 			break;
+		destroy_backend(dev);
 		/* fall through if not online */
 	case XenbusStateUnknown:
 		device_unregister(&dev->dev);
-- 
1.7.10.4

^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox