netdev.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH net-next] ixgbe: fix stats handling
@ 2010-10-11 12:17 Eric Dumazet
  2010-10-12  0:13 ` Jeff Kirsher
  2010-10-27 22:35 ` Tantilov, Emil S
  0 siblings, 2 replies; 10+ messages in thread
From: Eric Dumazet @ 2010-10-11 12:17 UTC (permalink / raw)
  To: David Miller, Peter Waskiewicz, Emil Tantilov, Jeff Kirsher; +Cc: netdev

Hi

I am sending this patch for Intel people review/test and acknowledge.

Thanks !

[PATCH net-next] ixgbe: fix stats handling

Current ixgbe stats have following problems :

- Not 64 bit safe (on 32bit arches)

- Not safe in ixgbe_clean_rx_irq() :
   All cpus dirty a common location (netdev->stats.rx_bytes &
netdev->stats.rx_packets) without proper synchronization.
   This slow down a bit multiqueue operations, and possibly miss some
updates.

Fixes :

Implement ndo_get_stats64() method to provide accurate 64bit rx|tx
bytes/packets counters, using 64bit safe infrastructure.

ixgbe_get_ethtool_stats() also use this infrastructure to provide 64bit
safe counters.

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
CC: Peter Waskiewicz <peter.p.waskiewicz.jr@intel.com>
CC: Emil Tantilov <emil.s.tantilov@intel.com>
CC: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ixgbe/ixgbe.h         |    3 +-
 drivers/net/ixgbe/ixgbe_ethtool.c |   29 +++++++++++---------
 drivers/net/ixgbe/ixgbe_main.c    |   40 +++++++++++++++++++++++++---
 3 files changed, 56 insertions(+), 16 deletions(-)

diff --git a/drivers/net/ixgbe/ixgbe.h b/drivers/net/ixgbe/ixgbe.h
index a8c47b0..944d9e2 100644
--- a/drivers/net/ixgbe/ixgbe.h
+++ b/drivers/net/ixgbe/ixgbe.h
@@ -180,8 +180,9 @@ struct ixgbe_ring {
 					 */
 
 	struct ixgbe_queue_stats stats;
-	unsigned long reinit_state;
+	struct u64_stats_sync syncp;
 	int numa_node;
+	unsigned long reinit_state;
 	u64 rsc_count;			/* stat for coalesced packets */
 	u64 rsc_flush;			/* stats for flushed packets */
 	u32 restart_queue;		/* track tx queue restarts */
diff --git a/drivers/net/ixgbe/ixgbe_ethtool.c b/drivers/net/ixgbe/ixgbe_ethtool.c
index d4ac943..3c7f15d 100644
--- a/drivers/net/ixgbe/ixgbe_ethtool.c
+++ b/drivers/net/ixgbe/ixgbe_ethtool.c
@@ -999,12 +999,11 @@ static void ixgbe_get_ethtool_stats(struct net_device *netdev,
                                     struct ethtool_stats *stats, u64 *data)
 {
 	struct ixgbe_adapter *adapter = netdev_priv(netdev);
-	u64 *queue_stat;
-	int stat_count = sizeof(struct ixgbe_queue_stats) / sizeof(u64);
 	struct rtnl_link_stats64 temp;
 	const struct rtnl_link_stats64 *net_stats;
-	int j, k;
-	int i;
+	unsigned int start;
+	struct ixgbe_ring *ring;
+	int i, j;
 	char *p = NULL;
 
 	ixgbe_update_stats(adapter);
@@ -1025,16 +1024,22 @@ static void ixgbe_get_ethtool_stats(struct net_device *netdev,
 		           sizeof(u64)) ? *(u64 *)p : *(u32 *)p;
 	}
 	for (j = 0; j < adapter->num_tx_queues; j++) {
-		queue_stat = (u64 *)&adapter->tx_ring[j]->stats;
-		for (k = 0; k < stat_count; k++)
-			data[i + k] = queue_stat[k];
-		i += k;
+		ring = adapter->tx_ring[j];
+		do {
+			start = u64_stats_fetch_begin_bh(&ring->syncp);
+			data[i]   = ring->stats.packets;
+			data[i+1] = ring->stats.bytes;
+		} while (u64_stats_fetch_retry_bh(&ring->syncp, start));
+		i += 2;
 	}
 	for (j = 0; j < adapter->num_rx_queues; j++) {
-		queue_stat = (u64 *)&adapter->rx_ring[j]->stats;
-		for (k = 0; k < stat_count; k++)
-			data[i + k] = queue_stat[k];
-		i += k;
+		ring = adapter->rx_ring[j];
+		do {
+			start = u64_stats_fetch_begin_bh(&ring->syncp);
+			data[i]   = ring->stats.packets;
+			data[i+1] = ring->stats.bytes;
+		} while (u64_stats_fetch_retry_bh(&ring->syncp, start));
+		i += 2;
 	}
 	if (adapter->flags & IXGBE_FLAG_DCB_ENABLED) {
 		for (j = 0; j < MAX_TX_PACKET_BUFFERS; j++) {
diff --git a/drivers/net/ixgbe/ixgbe_main.c b/drivers/net/ixgbe/ixgbe_main.c
index 95dbf60..1efbcde 100644
--- a/drivers/net/ixgbe/ixgbe_main.c
+++ b/drivers/net/ixgbe/ixgbe_main.c
@@ -824,8 +824,10 @@ static bool ixgbe_clean_tx_irq(struct ixgbe_q_vector *q_vector,
 
 	tx_ring->total_bytes += total_bytes;
 	tx_ring->total_packets += total_packets;
+	u64_stats_update_begin(&tx_ring->syncp);
 	tx_ring->stats.packets += total_packets;
 	tx_ring->stats.bytes += total_bytes;
+	u64_stats_update_end(&tx_ring->syncp);
 	return count < tx_ring->work_limit;
 }
 
@@ -1172,7 +1174,6 @@ static bool ixgbe_clean_rx_irq(struct ixgbe_q_vector *q_vector,
 			       int *work_done, int work_to_do)
 {
 	struct ixgbe_adapter *adapter = q_vector->adapter;
-	struct net_device *netdev = adapter->netdev;
 	struct pci_dev *pdev = adapter->pdev;
 	union ixgbe_adv_rx_desc *rx_desc, *next_rxd;
 	struct ixgbe_rx_buffer *rx_buffer_info, *next_buffer;
@@ -1298,8 +1299,10 @@ static bool ixgbe_clean_rx_irq(struct ixgbe_q_vector *q_vector,
 					rx_ring->rsc_count++;
 				rx_ring->rsc_flush++;
 			}
+			u64_stats_update_begin(&rx_ring->syncp);
 			rx_ring->stats.packets++;
 			rx_ring->stats.bytes += skb->len;
+			u64_stats_update_end(&rx_ring->syncp);
 		} else {
 			if (rx_ring->flags & IXGBE_RING_RX_PS_ENABLED) {
 				rx_buffer_info->skb = next_buffer->skb;
@@ -1375,8 +1378,6 @@ next_desc:
 
 	rx_ring->total_packets += total_rx_packets;
 	rx_ring->total_bytes += total_rx_bytes;
-	netdev->stats.rx_bytes += total_rx_bytes;
-	netdev->stats.rx_packets += total_rx_packets;
 
 	return cleaned;
 }
@@ -6559,6 +6560,38 @@ static void ixgbe_netpoll(struct net_device *netdev)
 }
 #endif
 
+static struct rtnl_link_stats64 *ixgbe_get_stats64(struct net_device *netdev,
+						   struct rtnl_link_stats64 *stats)
+{
+	struct ixgbe_adapter *adapter = netdev_priv(netdev);
+	int i;
+
+	/* accurate rx/tx bytes/packets stats */
+	dev_txq_stats_fold(netdev, stats);
+	for (i = 0; i < adapter->num_rx_queues; i++) {
+		struct ixgbe_ring *ring = adapter->rx_ring[i];
+		u64 bytes, packets;
+		unsigned int start;
+
+		do {
+			start = u64_stats_fetch_begin_bh(&ring->syncp);
+			packets = ring->stats.packets;
+			bytes   = ring->stats.bytes;
+		} while (u64_stats_fetch_retry_bh(&ring->syncp, start));
+		stats->rx_packets += packets;
+		stats->rx_bytes   += bytes;
+	}
+
+	/* following stats updated by ixgbe_watchdog_task() */
+	stats->multicast	= netdev->stats.multicast;
+	stats->rx_errors	= netdev->stats.rx_errors;
+	stats->rx_length_errors	= netdev->stats.rx_length_errors;
+	stats->rx_crc_errors	= netdev->stats.rx_crc_errors;
+	stats->rx_missed_errors	= netdev->stats.rx_missed_errors;
+	return stats;
+}
+
+
 static const struct net_device_ops ixgbe_netdev_ops = {
 	.ndo_open		= ixgbe_open,
 	.ndo_stop		= ixgbe_close,
@@ -6578,6 +6611,7 @@ static const struct net_device_ops ixgbe_netdev_ops = {
 	.ndo_set_vf_vlan	= ixgbe_ndo_set_vf_vlan,
 	.ndo_set_vf_tx_rate	= ixgbe_ndo_set_vf_bw,
 	.ndo_get_vf_config	= ixgbe_ndo_get_vf_config,
+	.ndo_get_stats64	= ixgbe_get_stats64,
 #ifdef CONFIG_NET_POLL_CONTROLLER
 	.ndo_poll_controller	= ixgbe_netpoll,
 #endif



^ permalink raw reply related	[flat|nested] 10+ messages in thread

* Re: [PATCH net-next] ixgbe: fix stats handling
  2010-10-11 12:17 [PATCH net-next] ixgbe: fix stats handling Eric Dumazet
@ 2010-10-12  0:13 ` Jeff Kirsher
  2010-10-27 22:35 ` Tantilov, Emil S
  1 sibling, 0 replies; 10+ messages in thread
From: Jeff Kirsher @ 2010-10-12  0:13 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: David Miller, Peter Waskiewicz, Emil Tantilov, netdev

On Mon, Oct 11, 2010 at 05:17, Eric Dumazet <eric.dumazet@gmail.com> wrote:
> Hi
>
> I am sending this patch for Intel people review/test and acknowledge.
>
> Thanks !
>
> [PATCH net-next] ixgbe: fix stats handling
>
> Current ixgbe stats have following problems :
>
> - Not 64 bit safe (on 32bit arches)
>
> - Not safe in ixgbe_clean_rx_irq() :
>   All cpus dirty a common location (netdev->stats.rx_bytes &
> netdev->stats.rx_packets) without proper synchronization.
>   This slow down a bit multiqueue operations, and possibly miss some
> updates.
>
> Fixes :
>
> Implement ndo_get_stats64() method to provide accurate 64bit rx|tx
> bytes/packets counters, using 64bit safe infrastructure.
>
> ixgbe_get_ethtool_stats() also use this infrastructure to provide 64bit
> safe counters.
>
> Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
> CC: Peter Waskiewicz <peter.p.waskiewicz.jr@intel.com>
> CC: Emil Tantilov <emil.s.tantilov@intel.com>
> CC: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
> ---
>  drivers/net/ixgbe/ixgbe.h         |    3 +-
>  drivers/net/ixgbe/ixgbe_ethtool.c |   29 +++++++++++---------
>  drivers/net/ixgbe/ixgbe_main.c    |   40 +++++++++++++++++++++++++---
>  3 files changed, 56 insertions(+), 16 deletions(-)
>

Thanks Eric, I have added the patch to my queue.

-- 
Cheers,
Jeff

^ permalink raw reply	[flat|nested] 10+ messages in thread

* RE: [PATCH net-next] ixgbe: fix stats handling
  2010-10-11 12:17 [PATCH net-next] ixgbe: fix stats handling Eric Dumazet
  2010-10-12  0:13 ` Jeff Kirsher
@ 2010-10-27 22:35 ` Tantilov, Emil S
  2010-10-28  4:24   ` Eric Dumazet
  1 sibling, 1 reply; 10+ messages in thread
From: Tantilov, Emil S @ 2010-10-27 22:35 UTC (permalink / raw)
  To: Eric Dumazet, David Miller, Waskiewicz Jr, Peter P,
	Kirsher, Jeffrey T
  Cc: netdev

>-----Original Message-----
>From: Eric Dumazet [mailto:eric.dumazet@gmail.com]
>Sent: Monday, October 11, 2010 5:17 AM
>To: David Miller; Waskiewicz Jr, Peter P; Tantilov, Emil S; Kirsher,
>Jeffrey T
>Cc: netdev
>Subject: [PATCH net-next] ixgbe: fix stats handling
>
>Hi
>
>I am sending this patch for Intel people review/test and acknowledge.
>
>Thanks !
>
>[PATCH net-next] ixgbe: fix stats handling
>
>Current ixgbe stats have following problems :
>
>- Not 64 bit safe (on 32bit arches)
>
>- Not safe in ixgbe_clean_rx_irq() :
>   All cpus dirty a common location (netdev->stats.rx_bytes &
>netdev->stats.rx_packets) without proper synchronization.
>   This slow down a bit multiqueue operations, and possibly miss some
>updates.
>
>Fixes :
>
>Implement ndo_get_stats64() method to provide accurate 64bit rx|tx
>bytes/packets counters, using 64bit safe infrastructure.
>
>ixgbe_get_ethtool_stats() also use this infrastructure to provide 64bit
>safe counters.
>
>Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
>CC: Peter Waskiewicz <peter.p.waskiewicz.jr@intel.com>
>CC: Emil Tantilov <emil.s.tantilov@intel.com>
>CC: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
>---
> drivers/net/ixgbe/ixgbe.h         |    3 +-
> drivers/net/ixgbe/ixgbe_ethtool.c |   29 +++++++++++---------
> drivers/net/ixgbe/ixgbe_main.c    |   40 +++++++++++++++++++++++++---
> 3 files changed, 56 insertions(+), 16 deletions(-)
>
>diff --git a/drivers/net/ixgbe/ixgbe.h b/drivers/net/ixgbe/ixgbe.h
>index a8c47b0..944d9e2 100644
>--- a/drivers/net/ixgbe/ixgbe.h
>+++ b/drivers/net/ixgbe/ixgbe.h
>@@ -180,8 +180,9 @@ struct ixgbe_ring {
> 					 */
>
> 	struct ixgbe_queue_stats stats;
>-	unsigned long reinit_state;
>+	struct u64_stats_sync syncp;
> 	int numa_node;
>+	unsigned long reinit_state;
> 	u64 rsc_count;			/* stat for coalesced packets */
> 	u64 rsc_flush;			/* stats for flushed packets */
> 	u32 restart_queue;		/* track tx queue restarts */
>diff --git a/drivers/net/ixgbe/ixgbe_ethtool.c
>b/drivers/net/ixgbe/ixgbe_ethtool.c
>index d4ac943..3c7f15d 100644
>--- a/drivers/net/ixgbe/ixgbe_ethtool.c
>+++ b/drivers/net/ixgbe/ixgbe_ethtool.c
>@@ -999,12 +999,11 @@ static void ixgbe_get_ethtool_stats(struct net_device
>*netdev,
>                                     struct ethtool_stats *stats, u64
>*data)
> {
> 	struct ixgbe_adapter *adapter = netdev_priv(netdev);
>-	u64 *queue_stat;
>-	int stat_count = sizeof(struct ixgbe_queue_stats) / sizeof(u64);
> 	struct rtnl_link_stats64 temp;
> 	const struct rtnl_link_stats64 *net_stats;
>-	int j, k;
>-	int i;
>+	unsigned int start;
>+	struct ixgbe_ring *ring;
>+	int i, j;
> 	char *p = NULL;
>
> 	ixgbe_update_stats(adapter);
>@@ -1025,16 +1024,22 @@ static void ixgbe_get_ethtool_stats(struct
>net_device *netdev,
> 		           sizeof(u64)) ? *(u64 *)p : *(u32 *)p;
> 	}
> 	for (j = 0; j < adapter->num_tx_queues; j++) {
>-		queue_stat = (u64 *)&adapter->tx_ring[j]->stats;
>-		for (k = 0; k < stat_count; k++)
>-			data[i + k] = queue_stat[k];
>-		i += k;
>+		ring = adapter->tx_ring[j];
>+		do {
>+			start = u64_stats_fetch_begin_bh(&ring->syncp);
>+			data[i]   = ring->stats.packets;
>+			data[i+1] = ring->stats.bytes;
>+		} while (u64_stats_fetch_retry_bh(&ring->syncp, start));
>+		i += 2;
> 	}
> 	for (j = 0; j < adapter->num_rx_queues; j++) {
>-		queue_stat = (u64 *)&adapter->rx_ring[j]->stats;
>-		for (k = 0; k < stat_count; k++)
>-			data[i + k] = queue_stat[k];
>-		i += k;
>+		ring = adapter->rx_ring[j];
>+		do {
>+			start = u64_stats_fetch_begin_bh(&ring->syncp);
>+			data[i]   = ring->stats.packets;
>+			data[i+1] = ring->stats.bytes;
>+		} while (u64_stats_fetch_retry_bh(&ring->syncp, start));
>+		i += 2;
> 	}
> 	if (adapter->flags & IXGBE_FLAG_DCB_ENABLED) {
> 		for (j = 0; j < MAX_TX_PACKET_BUFFERS; j++) {
>diff --git a/drivers/net/ixgbe/ixgbe_main.c
>b/drivers/net/ixgbe/ixgbe_main.c
>index 95dbf60..1efbcde 100644
>--- a/drivers/net/ixgbe/ixgbe_main.c
>+++ b/drivers/net/ixgbe/ixgbe_main.c
>@@ -824,8 +824,10 @@ static bool ixgbe_clean_tx_irq(struct ixgbe_q_vector
>*q_vector,
>
> 	tx_ring->total_bytes += total_bytes;
> 	tx_ring->total_packets += total_packets;
>+	u64_stats_update_begin(&tx_ring->syncp);
> 	tx_ring->stats.packets += total_packets;
> 	tx_ring->stats.bytes += total_bytes;
>+	u64_stats_update_end(&tx_ring->syncp);
> 	return count < tx_ring->work_limit;
> }
>
>@@ -1172,7 +1174,6 @@ static bool ixgbe_clean_rx_irq(struct ixgbe_q_vector
>*q_vector,
> 			       int *work_done, int work_to_do)
> {
> 	struct ixgbe_adapter *adapter = q_vector->adapter;
>-	struct net_device *netdev = adapter->netdev;
> 	struct pci_dev *pdev = adapter->pdev;
> 	union ixgbe_adv_rx_desc *rx_desc, *next_rxd;
> 	struct ixgbe_rx_buffer *rx_buffer_info, *next_buffer;
>@@ -1298,8 +1299,10 @@ static bool ixgbe_clean_rx_irq(struct ixgbe_q_vector
>*q_vector,
> 					rx_ring->rsc_count++;
> 				rx_ring->rsc_flush++;
> 			}
>+			u64_stats_update_begin(&rx_ring->syncp);
> 			rx_ring->stats.packets++;
> 			rx_ring->stats.bytes += skb->len;
>+			u64_stats_update_end(&rx_ring->syncp);
> 		} else {
> 			if (rx_ring->flags & IXGBE_RING_RX_PS_ENABLED) {
> 				rx_buffer_info->skb = next_buffer->skb;
>@@ -1375,8 +1378,6 @@ next_desc:
>
> 	rx_ring->total_packets += total_rx_packets;
> 	rx_ring->total_bytes += total_rx_bytes;
>-	netdev->stats.rx_bytes += total_rx_bytes;
>-	netdev->stats.rx_packets += total_rx_packets;
>
> 	return cleaned;
> }
>@@ -6559,6 +6560,38 @@ static void ixgbe_netpoll(struct net_device *netdev)
> }
> #endif
>
>+static struct rtnl_link_stats64 *ixgbe_get_stats64(struct net_device
>*netdev,
>+						   struct rtnl_link_stats64 *stats)
>+{
>+	struct ixgbe_adapter *adapter = netdev_priv(netdev);
>+	int i;
>+
>+	/* accurate rx/tx bytes/packets stats */
>+	dev_txq_stats_fold(netdev, stats);
>+	for (i = 0; i < adapter->num_rx_queues; i++) {
>+		struct ixgbe_ring *ring = adapter->rx_ring[i];
>+		u64 bytes, packets;
>+		unsigned int start;
>+
>+		do {
>+			start = u64_stats_fetch_begin_bh(&ring->syncp);
>+			packets = ring->stats.packets;
>+			bytes   = ring->stats.bytes;
>+		} while (u64_stats_fetch_retry_bh(&ring->syncp, start));
>+		stats->rx_packets += packets;
>+		stats->rx_bytes   += bytes;
>+	}
>+
>+	/* following stats updated by ixgbe_watchdog_task() */
>+	stats->multicast	= netdev->stats.multicast;
>+	stats->rx_errors	= netdev->stats.rx_errors;
>+	stats->rx_length_errors	= netdev->stats.rx_length_errors;
>+	stats->rx_crc_errors	= netdev->stats.rx_crc_errors;
>+	stats->rx_missed_errors	= netdev->stats.rx_missed_errors;
>+	return stats;
>+}
>+
>+
> static const struct net_device_ops ixgbe_netdev_ops = {
> 	.ndo_open		= ixgbe_open,
> 	.ndo_stop		= ixgbe_close,
>@@ -6578,6 +6611,7 @@ static const struct net_device_ops ixgbe_netdev_ops =
>{
> 	.ndo_set_vf_vlan	= ixgbe_ndo_set_vf_vlan,
> 	.ndo_set_vf_tx_rate	= ixgbe_ndo_set_vf_bw,
> 	.ndo_get_vf_config	= ixgbe_ndo_get_vf_config,
>+	.ndo_get_stats64	= ixgbe_get_stats64,
> #ifdef CONFIG_NET_POLL_CONTROLLER
> 	.ndo_poll_controller	= ixgbe_netpoll,
> #endif
>

Eric,

We are seeing intermittent hangs on ia32 arch which seem to point to this patch:

BUG: unable to handle kernel NULL pointer dereference at 00000040
 IP: [<f7f6b537>] ixgbe_get_stats64+0x47/0x120 [ixgbe]
 *pdpt = 000000002dc83001 *pde = 000000032d7e5067
 Oops: 0000 [#2] SMP
 last sysfs file: /sys/devices/system/cpu/cpu23/cache/index2/shared_cpu_map
 Modules linked in: act_skbedit cls_u32 sch_multiq ixgbe mdio netconsole configfs autofs4 8021q garp stp llc sunrpc ipv6 e
xt3 jbd dm_mirror dm_region_hash dm_log dm_round_robin dm_multipath power_meter hwmon sg ses enclosure dcdbas pcspkr serio_raw iTCO_wdt iTCO_vendor_support io
atdma dca i7core_edac edac_core bnx2 ext4 mbcache jbd2 sr_mod cdrom sd_mod crc_t10dif pata_acpi ata_generic ata_piix megaraid_sas dm_mod [last unloaded: mperf
]

 Pid: 1939, comm: irqbalance Tainted: G      D W   2.6.36-rc7-upstream-net-next-2.6-ixgbe-queue-i386-g55e1a84 #1 09CGW2/Po
werEdge T610
 EIP: 0060:[<f7f6b537>] EFLAGS: 00010206 CPU: 0
 EIP is at ixgbe_get_stats64+0x47/0x120 [ixgbe]
 EAX: 00000000 EBX: ecc45e4c ECX: ebea0400 EDX: 00000000
 ESI: ebea0000 EDI: 00000018 EBP: f7f846a0 ESP: ecc45d88
 DS: 007b ES: 007b FS: 00d8 GS: 00e0 SS: 0068
 Process irqbalance (pid: 1939, ti=ecc44000 task=efc63a50 task.ti=ecc44000)
 Stack:
 ecc45e4c 00000000 ebea0400 ebea0000 ecc45e4c ebea0000 ecc45f04 f7f846a0
 <0> c0750593 ebea0000 edfec340 ebea0000 000002b5 c075063a edfec340 c0993eec
 <0> ebe78000 0000021c 00000000 00000009 00000000 00000000 00000000 00000000
 Call Trace:
 [<c0750593>] ? dev_get_stats+0x33/0xc0
 [<c075063a>] ? dev_seq_printf_stats+0x1a/0x180
 [<c07507aa>] ? dev_seq_show+0xa/0x20
 [<c052398f>] ? seq_read+0x22f/0x3d0
 [<c0523760>] ? seq_read+0x0/0x3d0
 [<c054fdde>] ? proc_reg_read+0x5e/0x90
 [<c054fd80>] ? proc_reg_read+0x0/0x90
 [<c050a1dd>] ? vfs_read+0x9d/0x160
 [<c049d4ef>] ? audit_syscall_entry+0x20f/0x230
 [<c050a971>] ? sys_read+0x41/0x70
 [<c0409cdf>] ? sysenter_do_call+0x12/0x28
 Code: 60 4f 7e c8 8b 44 24 08 8b b8 20 06 00 00 85 ff 7e 63 c7 44 24 04 00 00 00 00 66 90 8b 54 24 04 8b 4c 24 08 8b 84 9
1 00 05 00 00 <8b> 50 40 eb 06 8d 74 26 00 89 ca f6 c2 01 0f 85 ae 00 00 00 8b
 EIP: [<f7f6b537>] ixgbe_get_stats64+0x47/0x120 [ixgbe] SS:ESP 0068:ecc45d88
 CR2: 0000000000000040
 ---[ end trace 51ea89f4e57f54f1 ]---

Emil

^ permalink raw reply	[flat|nested] 10+ messages in thread

* RE: [PATCH net-next] ixgbe: fix stats handling
  2010-10-27 22:35 ` Tantilov, Emil S
@ 2010-10-28  4:24   ` Eric Dumazet
  2010-10-28 12:02     ` [PATCH] ixgbe: delay rx_ring freeing Eric Dumazet
  0 siblings, 1 reply; 10+ messages in thread
From: Eric Dumazet @ 2010-10-28  4:24 UTC (permalink / raw)
  To: Tantilov, Emil S
  Cc: David Miller, Waskiewicz Jr, Peter P, Kirsher, Jeffrey T,
	Brattain, Ross B, netdev

Le mercredi 27 octobre 2010 à 16:35 -0600, Tantilov, Emil S a écrit :
> >-----Original Message-----
> >From: Eric Dumazet [mailto:eric.dumazet@gmail.com]
> >Sent: Monday, October 11, 2010 5:17 AM
> >To: David Miller; Waskiewicz Jr, Peter P; Tantilov, Emil S; Kirsher,
> >Jeffrey T
> >Cc: netdev
> >Subject: [PATCH net-next] ixgbe: fix stats handling
> >
> >Hi
> >
> >I am sending this patch for Intel people review/test and acknowledge.
> >
> >Thanks !
> >
> >[PATCH net-next] ixgbe: fix stats handling
> >
> >Current ixgbe stats have following problems :
> >
> >- Not 64 bit safe (on 32bit arches)
> >
> >- Not safe in ixgbe_clean_rx_irq() :
> >   All cpus dirty a common location (netdev->stats.rx_bytes &
> >netdev->stats.rx_packets) without proper synchronization.
> >   This slow down a bit multiqueue operations, and possibly miss some
> >updates.
> >
> >Fixes :
> >
> >Implement ndo_get_stats64() method to provide accurate 64bit rx|tx
> >bytes/packets counters, using 64bit safe infrastructure.
> >
> >ixgbe_get_ethtool_stats() also use this infrastructure to provide 64bit
> >safe counters.
> >
> >Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
> >CC: Peter Waskiewicz <peter.p.waskiewicz.jr@intel.com>
> >CC: Emil Tantilov <emil.s.tantilov@intel.com>
> >CC: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
> >---
> > drivers/net/ixgbe/ixgbe.h         |    3 +-
> > drivers/net/ixgbe/ixgbe_ethtool.c |   29 +++++++++++---------
> > drivers/net/ixgbe/ixgbe_main.c    |   40 +++++++++++++++++++++++++---
> > 3 files changed, 56 insertions(+), 16 deletions(-)
> >
> >diff --git a/drivers/net/ixgbe/ixgbe.h b/drivers/net/ixgbe/ixgbe.h
> >index a8c47b0..944d9e2 100644
> >--- a/drivers/net/ixgbe/ixgbe.h
> >+++ b/drivers/net/ixgbe/ixgbe.h
> >@@ -180,8 +180,9 @@ struct ixgbe_ring {
> > 					 */
> >
> > 	struct ixgbe_queue_stats stats;
> >-	unsigned long reinit_state;
> >+	struct u64_stats_sync syncp;
> > 	int numa_node;
> >+	unsigned long reinit_state;
> > 	u64 rsc_count;			/* stat for coalesced packets */
> > 	u64 rsc_flush;			/* stats for flushed packets */
> > 	u32 restart_queue;		/* track tx queue restarts */
> >diff --git a/drivers/net/ixgbe/ixgbe_ethtool.c
> >b/drivers/net/ixgbe/ixgbe_ethtool.c
> >index d4ac943..3c7f15d 100644
> >--- a/drivers/net/ixgbe/ixgbe_ethtool.c
> >+++ b/drivers/net/ixgbe/ixgbe_ethtool.c
> >@@ -999,12 +999,11 @@ static void ixgbe_get_ethtool_stats(struct net_device
> >*netdev,
> >                                     struct ethtool_stats *stats, u64
> >*data)
> > {
> > 	struct ixgbe_adapter *adapter = netdev_priv(netdev);
> >-	u64 *queue_stat;
> >-	int stat_count = sizeof(struct ixgbe_queue_stats) / sizeof(u64);
> > 	struct rtnl_link_stats64 temp;
> > 	const struct rtnl_link_stats64 *net_stats;
> >-	int j, k;
> >-	int i;
> >+	unsigned int start;
> >+	struct ixgbe_ring *ring;
> >+	int i, j;
> > 	char *p = NULL;
> >
> > 	ixgbe_update_stats(adapter);
> >@@ -1025,16 +1024,22 @@ static void ixgbe_get_ethtool_stats(struct
> >net_device *netdev,
> > 		           sizeof(u64)) ? *(u64 *)p : *(u32 *)p;
> > 	}
> > 	for (j = 0; j < adapter->num_tx_queues; j++) {
> >-		queue_stat = (u64 *)&adapter->tx_ring[j]->stats;
> >-		for (k = 0; k < stat_count; k++)
> >-			data[i + k] = queue_stat[k];
> >-		i += k;
> >+		ring = adapter->tx_ring[j];
> >+		do {
> >+			start = u64_stats_fetch_begin_bh(&ring->syncp);
> >+			data[i]   = ring->stats.packets;
> >+			data[i+1] = ring->stats.bytes;
> >+		} while (u64_stats_fetch_retry_bh(&ring->syncp, start));
> >+		i += 2;
> > 	}
> > 	for (j = 0; j < adapter->num_rx_queues; j++) {
> >-		queue_stat = (u64 *)&adapter->rx_ring[j]->stats;
> >-		for (k = 0; k < stat_count; k++)
> >-			data[i + k] = queue_stat[k];
> >-		i += k;
> >+		ring = adapter->rx_ring[j];
> >+		do {
> >+			start = u64_stats_fetch_begin_bh(&ring->syncp);
> >+			data[i]   = ring->stats.packets;
> >+			data[i+1] = ring->stats.bytes;
> >+		} while (u64_stats_fetch_retry_bh(&ring->syncp, start));
> >+		i += 2;
> > 	}
> > 	if (adapter->flags & IXGBE_FLAG_DCB_ENABLED) {
> > 		for (j = 0; j < MAX_TX_PACKET_BUFFERS; j++) {
> >diff --git a/drivers/net/ixgbe/ixgbe_main.c
> >b/drivers/net/ixgbe/ixgbe_main.c
> >index 95dbf60..1efbcde 100644
> >--- a/drivers/net/ixgbe/ixgbe_main.c
> >+++ b/drivers/net/ixgbe/ixgbe_main.c
> >@@ -824,8 +824,10 @@ static bool ixgbe_clean_tx_irq(struct ixgbe_q_vector
> >*q_vector,
> >
> > 	tx_ring->total_bytes += total_bytes;
> > 	tx_ring->total_packets += total_packets;
> >+	u64_stats_update_begin(&tx_ring->syncp);
> > 	tx_ring->stats.packets += total_packets;
> > 	tx_ring->stats.bytes += total_bytes;
> >+	u64_stats_update_end(&tx_ring->syncp);
> > 	return count < tx_ring->work_limit;
> > }
> >
> >@@ -1172,7 +1174,6 @@ static bool ixgbe_clean_rx_irq(struct ixgbe_q_vector
> >*q_vector,
> > 			       int *work_done, int work_to_do)
> > {
> > 	struct ixgbe_adapter *adapter = q_vector->adapter;
> >-	struct net_device *netdev = adapter->netdev;
> > 	struct pci_dev *pdev = adapter->pdev;
> > 	union ixgbe_adv_rx_desc *rx_desc, *next_rxd;
> > 	struct ixgbe_rx_buffer *rx_buffer_info, *next_buffer;
> >@@ -1298,8 +1299,10 @@ static bool ixgbe_clean_rx_irq(struct ixgbe_q_vector
> >*q_vector,
> > 					rx_ring->rsc_count++;
> > 				rx_ring->rsc_flush++;
> > 			}
> >+			u64_stats_update_begin(&rx_ring->syncp);
> > 			rx_ring->stats.packets++;
> > 			rx_ring->stats.bytes += skb->len;
> >+			u64_stats_update_end(&rx_ring->syncp);
> > 		} else {
> > 			if (rx_ring->flags & IXGBE_RING_RX_PS_ENABLED) {
> > 				rx_buffer_info->skb = next_buffer->skb;
> >@@ -1375,8 +1378,6 @@ next_desc:
> >
> > 	rx_ring->total_packets += total_rx_packets;
> > 	rx_ring->total_bytes += total_rx_bytes;
> >-	netdev->stats.rx_bytes += total_rx_bytes;
> >-	netdev->stats.rx_packets += total_rx_packets;
> >
> > 	return cleaned;
> > }
> >@@ -6559,6 +6560,38 @@ static void ixgbe_netpoll(struct net_device *netdev)
> > }
> > #endif
> >
> >+static struct rtnl_link_stats64 *ixgbe_get_stats64(struct net_device
> >*netdev,
> >+						   struct rtnl_link_stats64 *stats)
> >+{
> >+	struct ixgbe_adapter *adapter = netdev_priv(netdev);
> >+	int i;
> >+
> >+	/* accurate rx/tx bytes/packets stats */
> >+	dev_txq_stats_fold(netdev, stats);
> >+	for (i = 0; i < adapter->num_rx_queues; i++) {
> >+		struct ixgbe_ring *ring = adapter->rx_ring[i];
> >+		u64 bytes, packets;
> >+		unsigned int start;
> >+
> >+		do {
> >+			start = u64_stats_fetch_begin_bh(&ring->syncp);
> >+			packets = ring->stats.packets;
> >+			bytes   = ring->stats.bytes;
> >+		} while (u64_stats_fetch_retry_bh(&ring->syncp, start));
> >+		stats->rx_packets += packets;
> >+		stats->rx_bytes   += bytes;
> >+	}
> >+
> >+	/* following stats updated by ixgbe_watchdog_task() */
> >+	stats->multicast	= netdev->stats.multicast;
> >+	stats->rx_errors	= netdev->stats.rx_errors;
> >+	stats->rx_length_errors	= netdev->stats.rx_length_errors;
> >+	stats->rx_crc_errors	= netdev->stats.rx_crc_errors;
> >+	stats->rx_missed_errors	= netdev->stats.rx_missed_errors;
> >+	return stats;
> >+}
> >+
> >+
> > static const struct net_device_ops ixgbe_netdev_ops = {
> > 	.ndo_open		= ixgbe_open,
> > 	.ndo_stop		= ixgbe_close,
> >@@ -6578,6 +6611,7 @@ static const struct net_device_ops ixgbe_netdev_ops =
> >{
> > 	.ndo_set_vf_vlan	= ixgbe_ndo_set_vf_vlan,
> > 	.ndo_set_vf_tx_rate	= ixgbe_ndo_set_vf_bw,
> > 	.ndo_get_vf_config	= ixgbe_ndo_get_vf_config,
> >+	.ndo_get_stats64	= ixgbe_get_stats64,
> > #ifdef CONFIG_NET_POLL_CONTROLLER
> > 	.ndo_poll_controller	= ixgbe_netpoll,
> > #endif
> >
> 
> Eric,
> 
> We are seeing intermittent hangs on ia32 arch which seem to point to this patch:
> 
> BUG: unable to handle kernel NULL pointer dereference at 00000040
>  IP: [<f7f6b537>] ixgbe_get_stats64+0x47/0x120 [ixgbe]
>  *pdpt = 000000002dc83001 *pde = 000000032d7e5067
>  Oops: 0000 [#2] SMP
>  last sysfs file: /sys/devices/system/cpu/cpu23/cache/index2/shared_cpu_map
>  Modules linked in: act_skbedit cls_u32 sch_multiq ixgbe mdio netconsole configfs autofs4 8021q garp stp llc sunrpc ipv6 e
> xt3 jbd dm_mirror dm_region_hash dm_log dm_round_robin dm_multipath power_meter hwmon sg ses enclosure dcdbas pcspkr serio_raw iTCO_wdt iTCO_vendor_support io
> atdma dca i7core_edac edac_core bnx2 ext4 mbcache jbd2 sr_mod cdrom sd_mod crc_t10dif pata_acpi ata_generic ata_piix megaraid_sas dm_mod [last unloaded: mperf
> ]
> 
>  Pid: 1939, comm: irqbalance Tainted: G      D W   2.6.36-rc7-upstream-net-next-2.6-ixgbe-queue-i386-g55e1a84 #1 09CGW2/Po
> werEdge T610
>  EIP: 0060:[<f7f6b537>] EFLAGS: 00010206 CPU: 0
>  EIP is at ixgbe_get_stats64+0x47/0x120 [ixgbe]
>  EAX: 00000000 EBX: ecc45e4c ECX: ebea0400 EDX: 00000000
>  ESI: ebea0000 EDI: 00000018 EBP: f7f846a0 ESP: ecc45d88
>  DS: 007b ES: 007b FS: 00d8 GS: 00e0 SS: 0068
>  Process irqbalance (pid: 1939, ti=ecc44000 task=efc63a50 task.ti=ecc44000)
>  Stack:
>  ecc45e4c 00000000 ebea0400 ebea0000 ecc45e4c ebea0000 ecc45f04 f7f846a0
>  <0> c0750593 ebea0000 edfec340 ebea0000 000002b5 c075063a edfec340 c0993eec
>  <0> ebe78000 0000021c 00000000 00000009 00000000 00000000 00000000 00000000
>  Call Trace:
>  [<c0750593>] ? dev_get_stats+0x33/0xc0
>  [<c075063a>] ? dev_seq_printf_stats+0x1a/0x180
>  [<c07507aa>] ? dev_seq_show+0xa/0x20
>  [<c052398f>] ? seq_read+0x22f/0x3d0
>  [<c0523760>] ? seq_read+0x0/0x3d0
>  [<c054fdde>] ? proc_reg_read+0x5e/0x90
>  [<c054fd80>] ? proc_reg_read+0x0/0x90
>  [<c050a1dd>] ? vfs_read+0x9d/0x160
>  [<c049d4ef>] ? audit_syscall_entry+0x20f/0x230
>  [<c050a971>] ? sys_read+0x41/0x70
>  [<c0409cdf>] ? sysenter_do_call+0x12/0x28
>  Code: 60 4f 7e c8 8b 44 24 08 8b b8 20 06 00 00 85 ff 7e 63 c7 44 24 04 00 00 00 00 66 90 8b 54 24 04 8b 4c 24 08 8b 84 9
> 1 00 05 00 00 <8b> 50 40 eb 06 8d 74 26 00 89 ca f6 c2 01 0f 85 ae 00 00 00 8b
>  EIP: [<f7f6b537>] ixgbe_get_stats64+0x47/0x120 [ixgbe] SS:ESP 0068:ecc45d88
>  CR2: 0000000000000040
>  ---[ end trace 51ea89f4e57f54f1 ]---
> 
> Emil

Code disassembly makes no sense to me : this doesnt match C code at all

60 4f 7e c8 
8b 44 24 08             mov    0x08(%esp),%eax 
8b b8 20 06 00 00       mov    0x620(%eax),%edi
85 ff                   test   %edi,%edi
7e 63                   jle    +0x63
c7 44 24 04 00 00 00 00 movl   $0,0x4(%esp)
66 90                   xchg   %ax,%ax
8b 54 24 04             mov    0x04(%esp),%edx   (0 in your report)
8b 4c 24 08             mov    0x08(%esp),%ecx   (ebea0400 in your rep)
8b 84 91 00 05 00 00    mov    0x500(,),eax

<8b> 50 40              mov    0x40(%eax),%edx    NULL pointer deref

eb 06                   jmp    +6
8d 74 26 00             lea    0x0(%esi,%eiz,1),%esi
89 ca                   mov    %ecx,%edx
f6 c2 01                test   $0x1,%dl 
0f 85 ae 00 00 00       jne 
8b

Could you disassemble your ixgbe_get_stats64() function ?

Thanks

Mine is :

00000e29 <ixgbe_get_stats64>:
     e29:       55                      push   %ebp
     e2a:       89 e5                   mov    %esp,%ebp
     e2c:       57                      push   %edi
     e2d:       56                      push   %esi
     e2e:       89 c6                   mov    %eax,%esi
     e30:       53                      push   %ebx
     e31:       89 d3                   mov    %edx,%ebx
     e33:       83 ec 18                sub    $0x18,%esp
     e36:       8d b8 40 03 00 00       lea    0x340(%eax),%edi
     e3c:       e8 fc ff ff ff          call   dev_txq_stats_fold
     e41:       31 c0                   xor    %eax,%eax
     e43:       eb 54                   jmp    e99 <ixgbe_get_stats64+0x70>
     e45:       8b 94 87 40 0d 00 00    mov    0xd40(%edi,%eax,4),%edx
     e4c:       89 75 e0                mov    %esi,-0x20(%ebp)
     e4f:       89 5d dc                mov    %ebx,-0x24(%ebp)
     e52:       8b 4a 34                mov    0x34(%edx),%ecx
     e55:       f6 c1 01                test   $0x1,%cl
     e58:       74 04                   je     e5e <ixgbe_get_stats64+0x35>
     e5a:       f3 90                   pause
     e5c:       eb f4                   jmp    e52 <ixgbe_get_stats64+0x29>
     e5e:       8b 5a 24                mov    0x24(%edx),%ebx
     e61:       8b 72 28                mov    0x28(%edx),%esi
     e64:       89 5d ec                mov    %ebx,-0x14(%ebp)
     e67:       8b 5a 2c                mov    0x2c(%edx),%ebx
     e6a:       89 75 f0                mov    %esi,-0x10(%ebp)
     e6d:       8b 72 30                mov    0x30(%edx),%esi
     e70:       89 5d e4                mov    %ebx,-0x1c(%ebp)
     e73:       89 75 e8                mov    %esi,-0x18(%ebp)
     e76:       39 4a 34                cmp    %ecx,0x34(%edx)
     e79:       75 d7                   jne    e52 <ixgbe_get_stats64+0x29>
     e7b:       8b 5d dc                mov    -0x24(%ebp),%ebx
     e7e:       8b 55 ec                mov    -0x14(%ebp),%edx
     e81:       8b 75 e0                mov    -0x20(%ebp),%esi
     e84:       8b 4d f0                mov    -0x10(%ebp),%ecx
     e87:       01 13                   add    %edx,(%ebx)
     e89:       8b 55 e4                mov    -0x1c(%ebp),%edx
     e8c:       11 4b 04                adc    %ecx,0x4(%ebx)
     e8f:       01 53 10                add    %edx,0x10(%ebx)
     e92:       8b 4d e8                mov    -0x18(%ebp),%ecx
     e95:       11 4b 14                adc    %ecx,0x14(%ebx)
     e98:       40                      inc    %eax
     e99:       3b 87 40 0e 00 00       cmp    0xe40(%edi),%eax
     e9f:       7c a4                   jl     e45 <ixgbe_get_stats64+0x1c>
     ea1:       8b 8e 90 00 00 00       mov    0x90(%esi),%ecx
     ea7:       c7 43 44 00 00 00 00    movl   $0x0,0x44(%ebx)
     eae:       89 4b 40                mov    %ecx,0x40(%ebx)
     eb1:       8b 86 80 00 00 00       mov    0x80(%esi),%eax
     eb7:       c7 43 24 00 00 00 00    movl   $0x0,0x24(%ebx)
     ebe:       89 43 20                mov    %eax,0x20(%ebx)
     ec1:       8b 96 98 00 00 00       mov    0x98(%esi),%edx
     ec7:       c7 43 54 00 00 00 00    movl   $0x0,0x54(%ebx)
     ece:       89 53 50                mov    %edx,0x50(%ebx)
     ed1:       8b 8e a0 00 00 00       mov    0xa0(%esi),%ecx
     ed7:       c7 43 64 00 00 00 00    movl   $0x0,0x64(%ebx)
     ede:       89 4b 60                mov    %ecx,0x60(%ebx)
     ee1:       8b 86 ac 00 00 00       mov    0xac(%esi),%eax
     ee7:       c7 43 7c 00 00 00 00    movl   $0x0,0x7c(%ebx)
     eee:       89 43 78                mov    %eax,0x78(%ebx)
     ef1:       83 c4 18                add    $0x18,%esp
     ef4:       89 d8                   mov    %ebx,%eax
     ef6:       5b                      pop    %ebx
     ef7:       5e                      pop    %esi
     ef8:       5f                      pop    %edi
     ef9:       5d                      pop    %ebp
     efa:       c3                      ret

No 0x40(ptr) deref for me

Could adapter->rx_ring[i] becomes NULL sometime in your driver ?

If yes, a lock/protection is needed (same for ethtool stats I guess)



^ permalink raw reply	[flat|nested] 10+ messages in thread

* [PATCH] ixgbe: delay rx_ring freeing
  2010-10-28  4:24   ` Eric Dumazet
@ 2010-10-28 12:02     ` Eric Dumazet
  2010-10-28 12:21       ` Jeff Kirsher
  0 siblings, 1 reply; 10+ messages in thread
From: Eric Dumazet @ 2010-10-28 12:02 UTC (permalink / raw)
  To: Tantilov, Emil S
  Cc: David Miller, Waskiewicz Jr, Peter P, Kirsher, Jeffrey T,
	Brattain, Ross B, netdev

Le jeudi 28 octobre 2010 à 06:24 +0200, Eric Dumazet a écrit :
> Le mercredi 27 octobre 2010 à 16:35 -0600, Tantilov, Emil S a écrit :

> > Eric,
> > 
> > We are seeing intermittent hangs on ia32 arch which seem to point to this patch:
> > 
> > BUG: unable to handle kernel NULL pointer dereference at 00000040
> >  IP: [<f7f6b537>] ixgbe_get_stats64+0x47/0x120 [ixgbe]
> >  *pdpt = 000000002dc83001 *pde = 000000032d7e5067
> >  Oops: 0000 [#2] SMP
> >  last sysfs file: /sys/devices/system/cpu/cpu23/cache/index2/shared_cpu_map
> >  Modules linked in: act_skbedit cls_u32 sch_multiq ixgbe mdio netconsole configfs autofs4 8021q garp stp llc sunrpc ipv6 e
> > xt3 jbd dm_mirror dm_region_hash dm_log dm_round_robin dm_multipath power_meter hwmon sg ses enclosure dcdbas pcspkr serio_raw iTCO_wdt iTCO_vendor_support io
> > atdma dca i7core_edac edac_core bnx2 ext4 mbcache jbd2 sr_mod cdrom sd_mod crc_t10dif pata_acpi ata_generic ata_piix megaraid_sas dm_mod [last unloaded: mperf
> > ]
> > 
> >  Pid: 1939, comm: irqbalance Tainted: G      D W   2.6.36-rc7-upstream-net-next-2.6-ixgbe-queue-i386-g55e1a84 #1 09CGW2/Po
> > werEdge T610
> >  EIP: 0060:[<f7f6b537>] EFLAGS: 00010206 CPU: 0
> >  EIP is at ixgbe_get_stats64+0x47/0x120 [ixgbe]
> >  EAX: 00000000 EBX: ecc45e4c ECX: ebea0400 EDX: 00000000
> >  ESI: ebea0000 EDI: 00000018 EBP: f7f846a0 ESP: ecc45d88
> >  DS: 007b ES: 007b FS: 00d8 GS: 00e0 SS: 0068
> >  Process irqbalance (pid: 1939, ti=ecc44000 task=efc63a50 task.ti=ecc44000)
> >  Stack:
> >  ecc45e4c 00000000 ebea0400 ebea0000 ecc45e4c ebea0000 ecc45f04 f7f846a0
> >  <0> c0750593 ebea0000 edfec340 ebea0000 000002b5 c075063a edfec340 c0993eec
> >  <0> ebe78000 0000021c 00000000 00000009 00000000 00000000 00000000 00000000
> >  Call Trace:
> >  [<c0750593>] ? dev_get_stats+0x33/0xc0
> >  [<c075063a>] ? dev_seq_printf_stats+0x1a/0x180
> >  [<c07507aa>] ? dev_seq_show+0xa/0x20
> >  [<c052398f>] ? seq_read+0x22f/0x3d0
> >  [<c0523760>] ? seq_read+0x0/0x3d0
> >  [<c054fdde>] ? proc_reg_read+0x5e/0x90
> >  [<c054fd80>] ? proc_reg_read+0x0/0x90
> >  [<c050a1dd>] ? vfs_read+0x9d/0x160
> >  [<c049d4ef>] ? audit_syscall_entry+0x20f/0x230
> >  [<c050a971>] ? sys_read+0x41/0x70
> >  [<c0409cdf>] ? sysenter_do_call+0x12/0x28
> >  Code: 60 4f 7e c8 8b 44 24 08 8b b8 20 06 00 00 85 ff 7e 63 c7 44 24 04 00 00 00 00 66 90 8b 54 24 04 8b 4c 24 08 8b 84 9
> > 1 00 05 00 00 <8b> 50 40 eb 06 8d 74 26 00 89 ca f6 c2 01 0f 85 ae 00 00 00 8b
> >  EIP: [<f7f6b537>] ixgbe_get_stats64+0x47/0x120 [ixgbe] SS:ESP 0068:ecc45d88
> >  CR2: 0000000000000040
> >  ---[ end trace 51ea89f4e57f54f1 ]---
> > 
> > Emil

I believe I now understand the problem, please try following patch.

Thanks



[PATCH] ixgbe: delay rx_ring freeing

"cat /proc/net/dev" uses RCU protection only.

Its quite possible we call a driver get_stats() method while device is
dismantling and freeing its data structures.

So get_stats() methods must be very careful not accessing driver private
data without appropriate locking.

In ixgbe case, we access rx_ring pointers. These pointers are freed in
ixgbe_clear_interrupt_scheme() and set to NULL, this can trigger NULL
dereference in ixgbe_get_stats64()

A possible fix is to use RCU locking in ixgbe_get_stats64() and defer
rx_ring freeing after a grace period in ixgbe_clear_interrupt_scheme()

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Reported-by: Tantilov, Emil S <emil.s.tantilov@intel.com>
---
 drivers/net/ixgbe/ixgbe.h      |    1 
 drivers/net/ixgbe/ixgbe_main.c |   34 +++++++++++++++++++++----------
 2 files changed, 25 insertions(+), 10 deletions(-)

diff --git a/drivers/net/ixgbe/ixgbe.h b/drivers/net/ixgbe/ixgbe.h
index ed8703c..018e143 100644
--- a/drivers/net/ixgbe/ixgbe.h
+++ b/drivers/net/ixgbe/ixgbe.h
@@ -192,6 +192,7 @@ struct ixgbe_ring {
 
 	unsigned int size;		/* length in bytes */
 	dma_addr_t dma;			/* phys. address of descriptor ring */
+	struct rcu_head rcu;
 } ____cacheline_internodealigned_in_smp;
 
 enum ixgbe_ring_f_enum {
diff --git a/drivers/net/ixgbe/ixgbe_main.c b/drivers/net/ixgbe/ixgbe_main.c
index f856312..1b16292 100644
--- a/drivers/net/ixgbe/ixgbe_main.c
+++ b/drivers/net/ixgbe/ixgbe_main.c
@@ -4742,6 +4742,11 @@ err_set_interrupt:
 	return err;
 }
 
+static void ring_free_rcu(struct rcu_head *head)
+{
+	kfree(container_of(head, struct ixgbe_ring, rcu));
+}
+
 /**
  * ixgbe_clear_interrupt_scheme - Clear the current interrupt scheme settings
  * @adapter: board private structure to clear interrupt scheme on
@@ -4758,7 +4763,12 @@ void ixgbe_clear_interrupt_scheme(struct ixgbe_adapter *adapter)
 		adapter->tx_ring[i] = NULL;
 	}
 	for (i = 0; i < adapter->num_rx_queues; i++) {
-		kfree(adapter->rx_ring[i]);
+		struct ixgbe_ring *ring = adapter->rx_ring[i];
+		
+		/* ixgbe_get_stats64() might access this ring, we must wait
+		 * a grace period before freeing it.
+		 */
+		call_rcu(&ring->rcu, ring_free_rcu);
 		adapter->rx_ring[i] = NULL;
 	}
 
@@ -6551,20 +6561,23 @@ static struct rtnl_link_stats64 *ixgbe_get_stats64(struct net_device *netdev,
 
 	/* accurate rx/tx bytes/packets stats */
 	dev_txq_stats_fold(netdev, stats);
+	rcu_read_lock();
 	for (i = 0; i < adapter->num_rx_queues; i++) {
-		struct ixgbe_ring *ring = adapter->rx_ring[i];
+		struct ixgbe_ring *ring = ACCESS_ONCE(adapter->rx_ring[i]);
 		u64 bytes, packets;
 		unsigned int start;
 
-		do {
-			start = u64_stats_fetch_begin_bh(&ring->syncp);
-			packets = ring->stats.packets;
-			bytes   = ring->stats.bytes;
-		} while (u64_stats_fetch_retry_bh(&ring->syncp, start));
-		stats->rx_packets += packets;
-		stats->rx_bytes   += bytes;
+		if (ring) {
+			do {
+				start = u64_stats_fetch_begin_bh(&ring->syncp);
+				packets = ring->stats.packets;
+				bytes   = ring->stats.bytes;
+			} while (u64_stats_fetch_retry_bh(&ring->syncp, start));
+			stats->rx_packets += packets;
+			stats->rx_bytes   += bytes;
+		}
 	}
-
+	rcu_read_unlock();
 	/* following stats updated by ixgbe_watchdog_task() */
 	stats->multicast	= netdev->stats.multicast;
 	stats->rx_errors	= netdev->stats.rx_errors;
@@ -7270,6 +7283,7 @@ static void __exit ixgbe_exit_module(void)
 	dca_unregister_notify(&dca_notifier);
 #endif
 	pci_unregister_driver(&ixgbe_driver);
+	rcu_barrier(); /* Wait for completion of call_rcu()'s */
 }
 
 #ifdef CONFIG_IXGBE_DCA



^ permalink raw reply related	[flat|nested] 10+ messages in thread

* Re: [PATCH] ixgbe: delay rx_ring freeing
  2010-10-28 12:02     ` [PATCH] ixgbe: delay rx_ring freeing Eric Dumazet
@ 2010-10-28 12:21       ` Jeff Kirsher
  2010-10-29  3:45         ` [PATCH] ixgbe: refactor ixgbe_alloc_queues() Eric Dumazet
  0 siblings, 1 reply; 10+ messages in thread
From: Jeff Kirsher @ 2010-10-28 12:21 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Tantilov, Emil S, David Miller, Waskiewicz Jr, Peter P,
	Brattain, Ross B, netdev

[-- Attachment #1: Type: text/plain, Size: 3956 bytes --]

On Thu, 2010-10-28 at 05:02 -0700, Eric Dumazet wrote:
> Le jeudi 28 octobre 2010 à 06:24 +0200, Eric Dumazet a écrit :
> > Le mercredi 27 octobre 2010 à 16:35 -0600, Tantilov, Emil S a écrit :
> 
> > > Eric,
> > > 
> > > We are seeing intermittent hangs on ia32 arch which seem to point to this patch:
> > > 
> > > BUG: unable to handle kernel NULL pointer dereference at 00000040
> > >  IP: [<f7f6b537>] ixgbe_get_stats64+0x47/0x120 [ixgbe]
> > >  *pdpt = 000000002dc83001 *pde = 000000032d7e5067
> > >  Oops: 0000 [#2] SMP
> > >  last sysfs file: /sys/devices/system/cpu/cpu23/cache/index2/shared_cpu_map
> > >  Modules linked in: act_skbedit cls_u32 sch_multiq ixgbe mdio netconsole configfs autofs4 8021q garp stp llc sunrpc ipv6 e
> > > xt3 jbd dm_mirror dm_region_hash dm_log dm_round_robin dm_multipath power_meter hwmon sg ses enclosure dcdbas pcspkr serio_raw iTCO_wdt iTCO_vendor_support io
> > > atdma dca i7core_edac edac_core bnx2 ext4 mbcache jbd2 sr_mod cdrom sd_mod crc_t10dif pata_acpi ata_generic ata_piix megaraid_sas dm_mod [last unloaded: mperf
> > > ]
> > > 
> > >  Pid: 1939, comm: irqbalance Tainted: G      D W   2.6.36-rc7-upstream-net-next-2.6-ixgbe-queue-i386-g55e1a84 #1 09CGW2/Po
> > > werEdge T610
> > >  EIP: 0060:[<f7f6b537>] EFLAGS: 00010206 CPU: 0
> > >  EIP is at ixgbe_get_stats64+0x47/0x120 [ixgbe]
> > >  EAX: 00000000 EBX: ecc45e4c ECX: ebea0400 EDX: 00000000
> > >  ESI: ebea0000 EDI: 00000018 EBP: f7f846a0 ESP: ecc45d88
> > >  DS: 007b ES: 007b FS: 00d8 GS: 00e0 SS: 0068
> > >  Process irqbalance (pid: 1939, ti=ecc44000 task=efc63a50 task.ti=ecc44000)
> > >  Stack:
> > >  ecc45e4c 00000000 ebea0400 ebea0000 ecc45e4c ebea0000 ecc45f04 f7f846a0
> > >  <0> c0750593 ebea0000 edfec340 ebea0000 000002b5 c075063a edfec340 c0993eec
> > >  <0> ebe78000 0000021c 00000000 00000009 00000000 00000000 00000000 00000000
> > >  Call Trace:
> > >  [<c0750593>] ? dev_get_stats+0x33/0xc0
> > >  [<c075063a>] ? dev_seq_printf_stats+0x1a/0x180
> > >  [<c07507aa>] ? dev_seq_show+0xa/0x20
> > >  [<c052398f>] ? seq_read+0x22f/0x3d0
> > >  [<c0523760>] ? seq_read+0x0/0x3d0
> > >  [<c054fdde>] ? proc_reg_read+0x5e/0x90
> > >  [<c054fd80>] ? proc_reg_read+0x0/0x90
> > >  [<c050a1dd>] ? vfs_read+0x9d/0x160
> > >  [<c049d4ef>] ? audit_syscall_entry+0x20f/0x230
> > >  [<c050a971>] ? sys_read+0x41/0x70
> > >  [<c0409cdf>] ? sysenter_do_call+0x12/0x28
> > >  Code: 60 4f 7e c8 8b 44 24 08 8b b8 20 06 00 00 85 ff 7e 63 c7 44 24 04 00 00 00 00 66 90 8b 54 24 04 8b 4c 24 08 8b 84 9
> > > 1 00 05 00 00 <8b> 50 40 eb 06 8d 74 26 00 89 ca f6 c2 01 0f 85 ae 00 00 00 8b
> > >  EIP: [<f7f6b537>] ixgbe_get_stats64+0x47/0x120 [ixgbe] SS:ESP 0068:ecc45d88
> > >  CR2: 0000000000000040
> > >  ---[ end trace 51ea89f4e57f54f1 ]---
> > > 
> > > Emil
> 
> I believe I now understand the problem, please try following patch.
> 
> Thanks
> 
> 
> 
> [PATCH] ixgbe: delay rx_ring freeing
> 
> "cat /proc/net/dev" uses RCU protection only.
> 
> Its quite possible we call a driver get_stats() method while device is
> dismantling and freeing its data structures.
> 
> So get_stats() methods must be very careful not accessing driver private
> data without appropriate locking.
> 
> In ixgbe case, we access rx_ring pointers. These pointers are freed in
> ixgbe_clear_interrupt_scheme() and set to NULL, this can trigger NULL
> dereference in ixgbe_get_stats64()
> 
> A possible fix is to use RCU locking in ixgbe_get_stats64() and defer
> rx_ring freeing after a grace period in ixgbe_clear_interrupt_scheme()
> 
> Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
> Reported-by: Tantilov, Emil S <emil.s.tantilov@intel.com>
> ---
>  drivers/net/ixgbe/ixgbe.h      |    1 
>  drivers/net/ixgbe/ixgbe_main.c |   34 +++++++++++++++++++++----------
>  2 files changed, 25 insertions(+), 10 deletions(-)
> 

Thanks Eric, I have added this patch to my queue.

[-- Attachment #2: This is a digitally signed message part --]
[-- Type: application/pgp-signature, Size: 490 bytes --]

^ permalink raw reply	[flat|nested] 10+ messages in thread

* [PATCH] ixgbe: refactor ixgbe_alloc_queues()
  2010-10-28 12:21       ` Jeff Kirsher
@ 2010-10-29  3:45         ` Eric Dumazet
  2010-10-29  7:46           ` Jeff Kirsher
  2010-10-29  8:38           ` Jeff Kirsher
  0 siblings, 2 replies; 10+ messages in thread
From: Eric Dumazet @ 2010-10-29  3:45 UTC (permalink / raw)
  To: jeffrey.t.kirsher
  Cc: Tantilov, Emil S, David Miller, Waskiewicz Jr, Peter P,
	Brattain, Ross B, netdev

Note : compiled only patch, not tested.

Thanks !

[PATCH] ixgbe: refactor ixgbe_alloc_queues()

I noticed ring variable was initialized before allocations, and that
memory node management was a bit ugly. We also leak memory in case of
ring allocations error.

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
---
 drivers/net/ixgbe/ixgbe_main.c |   69 ++++++++++++-------------------
 1 file changed, 28 insertions(+), 41 deletions(-)

diff --git a/drivers/net/ixgbe/ixgbe_main.c b/drivers/net/ixgbe/ixgbe_main.c
index 2bd3eb4..e89da9a 100644
--- a/drivers/net/ixgbe/ixgbe_main.c
+++ b/drivers/net/ixgbe/ixgbe_main.c
@@ -4470,65 +4470,52 @@ static void ixgbe_cache_ring_register(struct ixgbe_adapter *adapter)
  **/
 static int ixgbe_alloc_queues(struct ixgbe_adapter *adapter)
 {
-	int i;
-	int orig_node = adapter->node;
+	int rx = 0, tx = 0, nid = adapter->node;
 
-	for (i = 0; i < adapter->num_tx_queues; i++) {
-		struct ixgbe_ring *ring = adapter->tx_ring[i];
-		if (orig_node == -1) {
-			int cur_node = next_online_node(adapter->node);
-			if (cur_node == MAX_NUMNODES)
-				cur_node = first_online_node;
-			adapter->node = cur_node;
-		}
-		ring = kzalloc_node(sizeof(struct ixgbe_ring), GFP_KERNEL,
-				    adapter->node);
+	if (nid < 0 || !node_online(nid))
+		nid = first_online_node;
+
+	for (; tx < adapter->num_tx_queues; tx++) {
+		struct ixgbe_ring *ring;
+
+		ring = kzalloc_node(sizeof(*ring), GFP_KERNEL, nid);
 		if (!ring)
-			ring = kzalloc(sizeof(struct ixgbe_ring), GFP_KERNEL);
+			ring = kzalloc(sizeof(*ring), GFP_KERNEL);
 		if (!ring)
-			goto err_tx_ring_allocation;
+			goto err_allocation;
 		ring->count = adapter->tx_ring_count;
-		ring->queue_index = i;
-		ring->numa_node = adapter->node;
+		ring->queue_index = tx;
+		ring->numa_node = nid;
 
-		adapter->tx_ring[i] = ring;
+		adapter->tx_ring[tx] = ring;
 	}
 
-	/* Restore the adapter's original node */
-	adapter->node = orig_node;
+	for (; rx < adapter->num_rx_queues; rx++) {
+		struct ixgbe_ring *ring;
 
-	for (i = 0; i < adapter->num_rx_queues; i++) {
-		struct ixgbe_ring *ring = adapter->rx_ring[i];
-		if (orig_node == -1) {
-			int cur_node = next_online_node(adapter->node);
-			if (cur_node == MAX_NUMNODES)
-				cur_node = first_online_node;
-			adapter->node = cur_node;
-		}
-		ring = kzalloc_node(sizeof(struct ixgbe_ring), GFP_KERNEL,
-				    adapter->node);
+		ring = kzalloc_node(sizeof(*ring), GFP_KERNEL, nid);
 		if (!ring)
-			ring = kzalloc(sizeof(struct ixgbe_ring), GFP_KERNEL);
+			ring = kzalloc(sizeof(*ring), GFP_KERNEL);
 		if (!ring)
-			goto err_rx_ring_allocation;
+			goto err_allocation;
 		ring->count = adapter->rx_ring_count;
-		ring->queue_index = i;
-		ring->numa_node = adapter->node;
+		ring->queue_index = rx;
+		ring->numa_node = nid;
 
-		adapter->rx_ring[i] = ring;
+		adapter->rx_ring[rx] = ring;
 	}
 
-	/* Restore the adapter's original node */
-	adapter->node = orig_node;
-
 	ixgbe_cache_ring_register(adapter);
 
 	return 0;
 
-err_rx_ring_allocation:
-	for (i = 0; i < adapter->num_tx_queues; i++)
-		kfree(adapter->tx_ring[i]);
-err_tx_ring_allocation:
+err_allocation:
+	while (tx)
+		kfree(adapter->tx_ring[--tx]);
+
+	while (rx)
+		kfree(adapter->tx_ring[--rx]);
+
 	return -ENOMEM;
 }
 



^ permalink raw reply related	[flat|nested] 10+ messages in thread

* Re: [PATCH] ixgbe: refactor ixgbe_alloc_queues()
  2010-10-29  3:45         ` [PATCH] ixgbe: refactor ixgbe_alloc_queues() Eric Dumazet
@ 2010-10-29  7:46           ` Jeff Kirsher
  2010-10-29  8:38           ` Jeff Kirsher
  1 sibling, 0 replies; 10+ messages in thread
From: Jeff Kirsher @ 2010-10-29  7:46 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Tantilov, Emil S, David Miller, Waskiewicz Jr, Peter P,
	Brattain, Ross B, netdev

On Thu, Oct 28, 2010 at 20:45, Eric Dumazet <eric.dumazet@gmail.com> wrote:
> Note : compiled only patch, not tested.
>
> Thanks !
>
> [PATCH] ixgbe: refactor ixgbe_alloc_queues()
>
> I noticed ring variable was initialized before allocations, and that
> memory node management was a bit ugly. We also leak memory in case of
> ring allocations error.
>
> Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
> ---
>  drivers/net/ixgbe/ixgbe_main.c |   69 ++++++++++++-------------------
>  1 file changed, 28 insertions(+), 41 deletions(-)
>

Thanks Eric.  I will add this to my queue of patches.

-- 
Cheers,
Jeff

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH] ixgbe: refactor ixgbe_alloc_queues()
  2010-10-29  3:45         ` [PATCH] ixgbe: refactor ixgbe_alloc_queues() Eric Dumazet
  2010-10-29  7:46           ` Jeff Kirsher
@ 2010-10-29  8:38           ` Jeff Kirsher
  2010-10-29  8:48             ` Eric Dumazet
  1 sibling, 1 reply; 10+ messages in thread
From: Jeff Kirsher @ 2010-10-29  8:38 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Tantilov, Emil S, David Miller, Waskiewicz Jr, Peter P,
	Brattain, Ross B, netdev

On Thu, Oct 28, 2010 at 20:45, Eric Dumazet <eric.dumazet@gmail.com> wrote:
> Note : compiled only patch, not tested.
>
> Thanks !
>
> [PATCH] ixgbe: refactor ixgbe_alloc_queues()
>
> I noticed ring variable was initialized before allocations, and that
> memory node management was a bit ugly. We also leak memory in case of
> ring allocations error.
>
> Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
> ---
>  drivers/net/ixgbe/ixgbe_main.c |   69 ++++++++++++-------------------
>  1 file changed, 28 insertions(+), 41 deletions(-)
>


I noticed a typo when applying the patch and I fixed it in the patch
in my queue.

The following code:
> +       while (rx)
> +               kfree(adapter->tx_ring[--rx]);
> +

should be:
> +       while (rx)
> +               kfree(adapter->rx_ring[--rx]);
> +

-- 
Cheers,
Jeff

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH] ixgbe: refactor ixgbe_alloc_queues()
  2010-10-29  8:38           ` Jeff Kirsher
@ 2010-10-29  8:48             ` Eric Dumazet
  0 siblings, 0 replies; 10+ messages in thread
From: Eric Dumazet @ 2010-10-29  8:48 UTC (permalink / raw)
  To: Jeff Kirsher
  Cc: Tantilov, Emil S, David Miller, Waskiewicz Jr, Peter P,
	Brattain, Ross B, netdev

Le vendredi 29 octobre 2010 à 01:38 -0700, Jeff Kirsher a écrit :

> 
> I noticed a typo when applying the patch and I fixed it in the patch
> in my queue.
> 
> The following code:
> > +       while (rx)
> > +               kfree(adapter->tx_ring[--rx]);
> > +
> 
> should be:
> > +       while (rx)
> > +               kfree(adapter->rx_ring[--rx]);
> > +
> 

Good catch, thanks !



^ permalink raw reply	[flat|nested] 10+ messages in thread

end of thread, other threads:[~2010-10-29  8:49 UTC | newest]

Thread overview: 10+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2010-10-11 12:17 [PATCH net-next] ixgbe: fix stats handling Eric Dumazet
2010-10-12  0:13 ` Jeff Kirsher
2010-10-27 22:35 ` Tantilov, Emil S
2010-10-28  4:24   ` Eric Dumazet
2010-10-28 12:02     ` [PATCH] ixgbe: delay rx_ring freeing Eric Dumazet
2010-10-28 12:21       ` Jeff Kirsher
2010-10-29  3:45         ` [PATCH] ixgbe: refactor ixgbe_alloc_queues() Eric Dumazet
2010-10-29  7:46           ` Jeff Kirsher
2010-10-29  8:38           ` Jeff Kirsher
2010-10-29  8:48             ` Eric Dumazet

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).