* [PATCH net-next] ixgbe: fix stats handling
@ 2010-10-11 12:17 Eric Dumazet
2010-10-12 0:13 ` Jeff Kirsher
2010-10-27 22:35 ` Tantilov, Emil S
0 siblings, 2 replies; 10+ messages in thread
From: Eric Dumazet @ 2010-10-11 12:17 UTC (permalink / raw)
To: David Miller, Peter Waskiewicz, Emil Tantilov, Jeff Kirsher; +Cc: netdev
Hi
I am sending this patch for Intel people review/test and acknowledge.
Thanks !
[PATCH net-next] ixgbe: fix stats handling
Current ixgbe stats have following problems :
- Not 64 bit safe (on 32bit arches)
- Not safe in ixgbe_clean_rx_irq() :
All cpus dirty a common location (netdev->stats.rx_bytes &
netdev->stats.rx_packets) without proper synchronization.
This slow down a bit multiqueue operations, and possibly miss some
updates.
Fixes :
Implement ndo_get_stats64() method to provide accurate 64bit rx|tx
bytes/packets counters, using 64bit safe infrastructure.
ixgbe_get_ethtool_stats() also use this infrastructure to provide 64bit
safe counters.
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
CC: Peter Waskiewicz <peter.p.waskiewicz.jr@intel.com>
CC: Emil Tantilov <emil.s.tantilov@intel.com>
CC: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
drivers/net/ixgbe/ixgbe.h | 3 +-
drivers/net/ixgbe/ixgbe_ethtool.c | 29 +++++++++++---------
drivers/net/ixgbe/ixgbe_main.c | 40 +++++++++++++++++++++++++---
3 files changed, 56 insertions(+), 16 deletions(-)
diff --git a/drivers/net/ixgbe/ixgbe.h b/drivers/net/ixgbe/ixgbe.h
index a8c47b0..944d9e2 100644
--- a/drivers/net/ixgbe/ixgbe.h
+++ b/drivers/net/ixgbe/ixgbe.h
@@ -180,8 +180,9 @@ struct ixgbe_ring {
*/
struct ixgbe_queue_stats stats;
- unsigned long reinit_state;
+ struct u64_stats_sync syncp;
int numa_node;
+ unsigned long reinit_state;
u64 rsc_count; /* stat for coalesced packets */
u64 rsc_flush; /* stats for flushed packets */
u32 restart_queue; /* track tx queue restarts */
diff --git a/drivers/net/ixgbe/ixgbe_ethtool.c b/drivers/net/ixgbe/ixgbe_ethtool.c
index d4ac943..3c7f15d 100644
--- a/drivers/net/ixgbe/ixgbe_ethtool.c
+++ b/drivers/net/ixgbe/ixgbe_ethtool.c
@@ -999,12 +999,11 @@ static void ixgbe_get_ethtool_stats(struct net_device *netdev,
struct ethtool_stats *stats, u64 *data)
{
struct ixgbe_adapter *adapter = netdev_priv(netdev);
- u64 *queue_stat;
- int stat_count = sizeof(struct ixgbe_queue_stats) / sizeof(u64);
struct rtnl_link_stats64 temp;
const struct rtnl_link_stats64 *net_stats;
- int j, k;
- int i;
+ unsigned int start;
+ struct ixgbe_ring *ring;
+ int i, j;
char *p = NULL;
ixgbe_update_stats(adapter);
@@ -1025,16 +1024,22 @@ static void ixgbe_get_ethtool_stats(struct net_device *netdev,
sizeof(u64)) ? *(u64 *)p : *(u32 *)p;
}
for (j = 0; j < adapter->num_tx_queues; j++) {
- queue_stat = (u64 *)&adapter->tx_ring[j]->stats;
- for (k = 0; k < stat_count; k++)
- data[i + k] = queue_stat[k];
- i += k;
+ ring = adapter->tx_ring[j];
+ do {
+ start = u64_stats_fetch_begin_bh(&ring->syncp);
+ data[i] = ring->stats.packets;
+ data[i+1] = ring->stats.bytes;
+ } while (u64_stats_fetch_retry_bh(&ring->syncp, start));
+ i += 2;
}
for (j = 0; j < adapter->num_rx_queues; j++) {
- queue_stat = (u64 *)&adapter->rx_ring[j]->stats;
- for (k = 0; k < stat_count; k++)
- data[i + k] = queue_stat[k];
- i += k;
+ ring = adapter->rx_ring[j];
+ do {
+ start = u64_stats_fetch_begin_bh(&ring->syncp);
+ data[i] = ring->stats.packets;
+ data[i+1] = ring->stats.bytes;
+ } while (u64_stats_fetch_retry_bh(&ring->syncp, start));
+ i += 2;
}
if (adapter->flags & IXGBE_FLAG_DCB_ENABLED) {
for (j = 0; j < MAX_TX_PACKET_BUFFERS; j++) {
diff --git a/drivers/net/ixgbe/ixgbe_main.c b/drivers/net/ixgbe/ixgbe_main.c
index 95dbf60..1efbcde 100644
--- a/drivers/net/ixgbe/ixgbe_main.c
+++ b/drivers/net/ixgbe/ixgbe_main.c
@@ -824,8 +824,10 @@ static bool ixgbe_clean_tx_irq(struct ixgbe_q_vector *q_vector,
tx_ring->total_bytes += total_bytes;
tx_ring->total_packets += total_packets;
+ u64_stats_update_begin(&tx_ring->syncp);
tx_ring->stats.packets += total_packets;
tx_ring->stats.bytes += total_bytes;
+ u64_stats_update_end(&tx_ring->syncp);
return count < tx_ring->work_limit;
}
@@ -1172,7 +1174,6 @@ static bool ixgbe_clean_rx_irq(struct ixgbe_q_vector *q_vector,
int *work_done, int work_to_do)
{
struct ixgbe_adapter *adapter = q_vector->adapter;
- struct net_device *netdev = adapter->netdev;
struct pci_dev *pdev = adapter->pdev;
union ixgbe_adv_rx_desc *rx_desc, *next_rxd;
struct ixgbe_rx_buffer *rx_buffer_info, *next_buffer;
@@ -1298,8 +1299,10 @@ static bool ixgbe_clean_rx_irq(struct ixgbe_q_vector *q_vector,
rx_ring->rsc_count++;
rx_ring->rsc_flush++;
}
+ u64_stats_update_begin(&rx_ring->syncp);
rx_ring->stats.packets++;
rx_ring->stats.bytes += skb->len;
+ u64_stats_update_end(&rx_ring->syncp);
} else {
if (rx_ring->flags & IXGBE_RING_RX_PS_ENABLED) {
rx_buffer_info->skb = next_buffer->skb;
@@ -1375,8 +1378,6 @@ next_desc:
rx_ring->total_packets += total_rx_packets;
rx_ring->total_bytes += total_rx_bytes;
- netdev->stats.rx_bytes += total_rx_bytes;
- netdev->stats.rx_packets += total_rx_packets;
return cleaned;
}
@@ -6559,6 +6560,38 @@ static void ixgbe_netpoll(struct net_device *netdev)
}
#endif
+static struct rtnl_link_stats64 *ixgbe_get_stats64(struct net_device *netdev,
+ struct rtnl_link_stats64 *stats)
+{
+ struct ixgbe_adapter *adapter = netdev_priv(netdev);
+ int i;
+
+ /* accurate rx/tx bytes/packets stats */
+ dev_txq_stats_fold(netdev, stats);
+ for (i = 0; i < adapter->num_rx_queues; i++) {
+ struct ixgbe_ring *ring = adapter->rx_ring[i];
+ u64 bytes, packets;
+ unsigned int start;
+
+ do {
+ start = u64_stats_fetch_begin_bh(&ring->syncp);
+ packets = ring->stats.packets;
+ bytes = ring->stats.bytes;
+ } while (u64_stats_fetch_retry_bh(&ring->syncp, start));
+ stats->rx_packets += packets;
+ stats->rx_bytes += bytes;
+ }
+
+ /* following stats updated by ixgbe_watchdog_task() */
+ stats->multicast = netdev->stats.multicast;
+ stats->rx_errors = netdev->stats.rx_errors;
+ stats->rx_length_errors = netdev->stats.rx_length_errors;
+ stats->rx_crc_errors = netdev->stats.rx_crc_errors;
+ stats->rx_missed_errors = netdev->stats.rx_missed_errors;
+ return stats;
+}
+
+
static const struct net_device_ops ixgbe_netdev_ops = {
.ndo_open = ixgbe_open,
.ndo_stop = ixgbe_close,
@@ -6578,6 +6611,7 @@ static const struct net_device_ops ixgbe_netdev_ops = {
.ndo_set_vf_vlan = ixgbe_ndo_set_vf_vlan,
.ndo_set_vf_tx_rate = ixgbe_ndo_set_vf_bw,
.ndo_get_vf_config = ixgbe_ndo_get_vf_config,
+ .ndo_get_stats64 = ixgbe_get_stats64,
#ifdef CONFIG_NET_POLL_CONTROLLER
.ndo_poll_controller = ixgbe_netpoll,
#endif
^ permalink raw reply related [flat|nested] 10+ messages in thread
* Re: [PATCH net-next] ixgbe: fix stats handling
2010-10-11 12:17 [PATCH net-next] ixgbe: fix stats handling Eric Dumazet
@ 2010-10-12 0:13 ` Jeff Kirsher
2010-10-27 22:35 ` Tantilov, Emil S
1 sibling, 0 replies; 10+ messages in thread
From: Jeff Kirsher @ 2010-10-12 0:13 UTC (permalink / raw)
To: Eric Dumazet; +Cc: David Miller, Peter Waskiewicz, Emil Tantilov, netdev
On Mon, Oct 11, 2010 at 05:17, Eric Dumazet <eric.dumazet@gmail.com> wrote:
> Hi
>
> I am sending this patch for Intel people review/test and acknowledge.
>
> Thanks !
>
> [PATCH net-next] ixgbe: fix stats handling
>
> Current ixgbe stats have following problems :
>
> - Not 64 bit safe (on 32bit arches)
>
> - Not safe in ixgbe_clean_rx_irq() :
> All cpus dirty a common location (netdev->stats.rx_bytes &
> netdev->stats.rx_packets) without proper synchronization.
> This slow down a bit multiqueue operations, and possibly miss some
> updates.
>
> Fixes :
>
> Implement ndo_get_stats64() method to provide accurate 64bit rx|tx
> bytes/packets counters, using 64bit safe infrastructure.
>
> ixgbe_get_ethtool_stats() also use this infrastructure to provide 64bit
> safe counters.
>
> Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
> CC: Peter Waskiewicz <peter.p.waskiewicz.jr@intel.com>
> CC: Emil Tantilov <emil.s.tantilov@intel.com>
> CC: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
> ---
> drivers/net/ixgbe/ixgbe.h | 3 +-
> drivers/net/ixgbe/ixgbe_ethtool.c | 29 +++++++++++---------
> drivers/net/ixgbe/ixgbe_main.c | 40 +++++++++++++++++++++++++---
> 3 files changed, 56 insertions(+), 16 deletions(-)
>
Thanks Eric, I have added the patch to my queue.
--
Cheers,
Jeff
^ permalink raw reply [flat|nested] 10+ messages in thread
* RE: [PATCH net-next] ixgbe: fix stats handling
2010-10-11 12:17 [PATCH net-next] ixgbe: fix stats handling Eric Dumazet
2010-10-12 0:13 ` Jeff Kirsher
@ 2010-10-27 22:35 ` Tantilov, Emil S
2010-10-28 4:24 ` Eric Dumazet
1 sibling, 1 reply; 10+ messages in thread
From: Tantilov, Emil S @ 2010-10-27 22:35 UTC (permalink / raw)
To: Eric Dumazet, David Miller, Waskiewicz Jr, Peter P,
Kirsher, Jeffrey T
Cc: netdev
>-----Original Message-----
>From: Eric Dumazet [mailto:eric.dumazet@gmail.com]
>Sent: Monday, October 11, 2010 5:17 AM
>To: David Miller; Waskiewicz Jr, Peter P; Tantilov, Emil S; Kirsher,
>Jeffrey T
>Cc: netdev
>Subject: [PATCH net-next] ixgbe: fix stats handling
>
>Hi
>
>I am sending this patch for Intel people review/test and acknowledge.
>
>Thanks !
>
>[PATCH net-next] ixgbe: fix stats handling
>
>Current ixgbe stats have following problems :
>
>- Not 64 bit safe (on 32bit arches)
>
>- Not safe in ixgbe_clean_rx_irq() :
> All cpus dirty a common location (netdev->stats.rx_bytes &
>netdev->stats.rx_packets) without proper synchronization.
> This slow down a bit multiqueue operations, and possibly miss some
>updates.
>
>Fixes :
>
>Implement ndo_get_stats64() method to provide accurate 64bit rx|tx
>bytes/packets counters, using 64bit safe infrastructure.
>
>ixgbe_get_ethtool_stats() also use this infrastructure to provide 64bit
>safe counters.
>
>Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
>CC: Peter Waskiewicz <peter.p.waskiewicz.jr@intel.com>
>CC: Emil Tantilov <emil.s.tantilov@intel.com>
>CC: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
>---
> drivers/net/ixgbe/ixgbe.h | 3 +-
> drivers/net/ixgbe/ixgbe_ethtool.c | 29 +++++++++++---------
> drivers/net/ixgbe/ixgbe_main.c | 40 +++++++++++++++++++++++++---
> 3 files changed, 56 insertions(+), 16 deletions(-)
>
>diff --git a/drivers/net/ixgbe/ixgbe.h b/drivers/net/ixgbe/ixgbe.h
>index a8c47b0..944d9e2 100644
>--- a/drivers/net/ixgbe/ixgbe.h
>+++ b/drivers/net/ixgbe/ixgbe.h
>@@ -180,8 +180,9 @@ struct ixgbe_ring {
> */
>
> struct ixgbe_queue_stats stats;
>- unsigned long reinit_state;
>+ struct u64_stats_sync syncp;
> int numa_node;
>+ unsigned long reinit_state;
> u64 rsc_count; /* stat for coalesced packets */
> u64 rsc_flush; /* stats for flushed packets */
> u32 restart_queue; /* track tx queue restarts */
>diff --git a/drivers/net/ixgbe/ixgbe_ethtool.c
>b/drivers/net/ixgbe/ixgbe_ethtool.c
>index d4ac943..3c7f15d 100644
>--- a/drivers/net/ixgbe/ixgbe_ethtool.c
>+++ b/drivers/net/ixgbe/ixgbe_ethtool.c
>@@ -999,12 +999,11 @@ static void ixgbe_get_ethtool_stats(struct net_device
>*netdev,
> struct ethtool_stats *stats, u64
>*data)
> {
> struct ixgbe_adapter *adapter = netdev_priv(netdev);
>- u64 *queue_stat;
>- int stat_count = sizeof(struct ixgbe_queue_stats) / sizeof(u64);
> struct rtnl_link_stats64 temp;
> const struct rtnl_link_stats64 *net_stats;
>- int j, k;
>- int i;
>+ unsigned int start;
>+ struct ixgbe_ring *ring;
>+ int i, j;
> char *p = NULL;
>
> ixgbe_update_stats(adapter);
>@@ -1025,16 +1024,22 @@ static void ixgbe_get_ethtool_stats(struct
>net_device *netdev,
> sizeof(u64)) ? *(u64 *)p : *(u32 *)p;
> }
> for (j = 0; j < adapter->num_tx_queues; j++) {
>- queue_stat = (u64 *)&adapter->tx_ring[j]->stats;
>- for (k = 0; k < stat_count; k++)
>- data[i + k] = queue_stat[k];
>- i += k;
>+ ring = adapter->tx_ring[j];
>+ do {
>+ start = u64_stats_fetch_begin_bh(&ring->syncp);
>+ data[i] = ring->stats.packets;
>+ data[i+1] = ring->stats.bytes;
>+ } while (u64_stats_fetch_retry_bh(&ring->syncp, start));
>+ i += 2;
> }
> for (j = 0; j < adapter->num_rx_queues; j++) {
>- queue_stat = (u64 *)&adapter->rx_ring[j]->stats;
>- for (k = 0; k < stat_count; k++)
>- data[i + k] = queue_stat[k];
>- i += k;
>+ ring = adapter->rx_ring[j];
>+ do {
>+ start = u64_stats_fetch_begin_bh(&ring->syncp);
>+ data[i] = ring->stats.packets;
>+ data[i+1] = ring->stats.bytes;
>+ } while (u64_stats_fetch_retry_bh(&ring->syncp, start));
>+ i += 2;
> }
> if (adapter->flags & IXGBE_FLAG_DCB_ENABLED) {
> for (j = 0; j < MAX_TX_PACKET_BUFFERS; j++) {
>diff --git a/drivers/net/ixgbe/ixgbe_main.c
>b/drivers/net/ixgbe/ixgbe_main.c
>index 95dbf60..1efbcde 100644
>--- a/drivers/net/ixgbe/ixgbe_main.c
>+++ b/drivers/net/ixgbe/ixgbe_main.c
>@@ -824,8 +824,10 @@ static bool ixgbe_clean_tx_irq(struct ixgbe_q_vector
>*q_vector,
>
> tx_ring->total_bytes += total_bytes;
> tx_ring->total_packets += total_packets;
>+ u64_stats_update_begin(&tx_ring->syncp);
> tx_ring->stats.packets += total_packets;
> tx_ring->stats.bytes += total_bytes;
>+ u64_stats_update_end(&tx_ring->syncp);
> return count < tx_ring->work_limit;
> }
>
>@@ -1172,7 +1174,6 @@ static bool ixgbe_clean_rx_irq(struct ixgbe_q_vector
>*q_vector,
> int *work_done, int work_to_do)
> {
> struct ixgbe_adapter *adapter = q_vector->adapter;
>- struct net_device *netdev = adapter->netdev;
> struct pci_dev *pdev = adapter->pdev;
> union ixgbe_adv_rx_desc *rx_desc, *next_rxd;
> struct ixgbe_rx_buffer *rx_buffer_info, *next_buffer;
>@@ -1298,8 +1299,10 @@ static bool ixgbe_clean_rx_irq(struct ixgbe_q_vector
>*q_vector,
> rx_ring->rsc_count++;
> rx_ring->rsc_flush++;
> }
>+ u64_stats_update_begin(&rx_ring->syncp);
> rx_ring->stats.packets++;
> rx_ring->stats.bytes += skb->len;
>+ u64_stats_update_end(&rx_ring->syncp);
> } else {
> if (rx_ring->flags & IXGBE_RING_RX_PS_ENABLED) {
> rx_buffer_info->skb = next_buffer->skb;
>@@ -1375,8 +1378,6 @@ next_desc:
>
> rx_ring->total_packets += total_rx_packets;
> rx_ring->total_bytes += total_rx_bytes;
>- netdev->stats.rx_bytes += total_rx_bytes;
>- netdev->stats.rx_packets += total_rx_packets;
>
> return cleaned;
> }
>@@ -6559,6 +6560,38 @@ static void ixgbe_netpoll(struct net_device *netdev)
> }
> #endif
>
>+static struct rtnl_link_stats64 *ixgbe_get_stats64(struct net_device
>*netdev,
>+ struct rtnl_link_stats64 *stats)
>+{
>+ struct ixgbe_adapter *adapter = netdev_priv(netdev);
>+ int i;
>+
>+ /* accurate rx/tx bytes/packets stats */
>+ dev_txq_stats_fold(netdev, stats);
>+ for (i = 0; i < adapter->num_rx_queues; i++) {
>+ struct ixgbe_ring *ring = adapter->rx_ring[i];
>+ u64 bytes, packets;
>+ unsigned int start;
>+
>+ do {
>+ start = u64_stats_fetch_begin_bh(&ring->syncp);
>+ packets = ring->stats.packets;
>+ bytes = ring->stats.bytes;
>+ } while (u64_stats_fetch_retry_bh(&ring->syncp, start));
>+ stats->rx_packets += packets;
>+ stats->rx_bytes += bytes;
>+ }
>+
>+ /* following stats updated by ixgbe_watchdog_task() */
>+ stats->multicast = netdev->stats.multicast;
>+ stats->rx_errors = netdev->stats.rx_errors;
>+ stats->rx_length_errors = netdev->stats.rx_length_errors;
>+ stats->rx_crc_errors = netdev->stats.rx_crc_errors;
>+ stats->rx_missed_errors = netdev->stats.rx_missed_errors;
>+ return stats;
>+}
>+
>+
> static const struct net_device_ops ixgbe_netdev_ops = {
> .ndo_open = ixgbe_open,
> .ndo_stop = ixgbe_close,
>@@ -6578,6 +6611,7 @@ static const struct net_device_ops ixgbe_netdev_ops =
>{
> .ndo_set_vf_vlan = ixgbe_ndo_set_vf_vlan,
> .ndo_set_vf_tx_rate = ixgbe_ndo_set_vf_bw,
> .ndo_get_vf_config = ixgbe_ndo_get_vf_config,
>+ .ndo_get_stats64 = ixgbe_get_stats64,
> #ifdef CONFIG_NET_POLL_CONTROLLER
> .ndo_poll_controller = ixgbe_netpoll,
> #endif
>
Eric,
We are seeing intermittent hangs on ia32 arch which seem to point to this patch:
BUG: unable to handle kernel NULL pointer dereference at 00000040
IP: [<f7f6b537>] ixgbe_get_stats64+0x47/0x120 [ixgbe]
*pdpt = 000000002dc83001 *pde = 000000032d7e5067
Oops: 0000 [#2] SMP
last sysfs file: /sys/devices/system/cpu/cpu23/cache/index2/shared_cpu_map
Modules linked in: act_skbedit cls_u32 sch_multiq ixgbe mdio netconsole configfs autofs4 8021q garp stp llc sunrpc ipv6 e
xt3 jbd dm_mirror dm_region_hash dm_log dm_round_robin dm_multipath power_meter hwmon sg ses enclosure dcdbas pcspkr serio_raw iTCO_wdt iTCO_vendor_support io
atdma dca i7core_edac edac_core bnx2 ext4 mbcache jbd2 sr_mod cdrom sd_mod crc_t10dif pata_acpi ata_generic ata_piix megaraid_sas dm_mod [last unloaded: mperf
]
Pid: 1939, comm: irqbalance Tainted: G D W 2.6.36-rc7-upstream-net-next-2.6-ixgbe-queue-i386-g55e1a84 #1 09CGW2/Po
werEdge T610
EIP: 0060:[<f7f6b537>] EFLAGS: 00010206 CPU: 0
EIP is at ixgbe_get_stats64+0x47/0x120 [ixgbe]
EAX: 00000000 EBX: ecc45e4c ECX: ebea0400 EDX: 00000000
ESI: ebea0000 EDI: 00000018 EBP: f7f846a0 ESP: ecc45d88
DS: 007b ES: 007b FS: 00d8 GS: 00e0 SS: 0068
Process irqbalance (pid: 1939, ti=ecc44000 task=efc63a50 task.ti=ecc44000)
Stack:
ecc45e4c 00000000 ebea0400 ebea0000 ecc45e4c ebea0000 ecc45f04 f7f846a0
<0> c0750593 ebea0000 edfec340 ebea0000 000002b5 c075063a edfec340 c0993eec
<0> ebe78000 0000021c 00000000 00000009 00000000 00000000 00000000 00000000
Call Trace:
[<c0750593>] ? dev_get_stats+0x33/0xc0
[<c075063a>] ? dev_seq_printf_stats+0x1a/0x180
[<c07507aa>] ? dev_seq_show+0xa/0x20
[<c052398f>] ? seq_read+0x22f/0x3d0
[<c0523760>] ? seq_read+0x0/0x3d0
[<c054fdde>] ? proc_reg_read+0x5e/0x90
[<c054fd80>] ? proc_reg_read+0x0/0x90
[<c050a1dd>] ? vfs_read+0x9d/0x160
[<c049d4ef>] ? audit_syscall_entry+0x20f/0x230
[<c050a971>] ? sys_read+0x41/0x70
[<c0409cdf>] ? sysenter_do_call+0x12/0x28
Code: 60 4f 7e c8 8b 44 24 08 8b b8 20 06 00 00 85 ff 7e 63 c7 44 24 04 00 00 00 00 66 90 8b 54 24 04 8b 4c 24 08 8b 84 9
1 00 05 00 00 <8b> 50 40 eb 06 8d 74 26 00 89 ca f6 c2 01 0f 85 ae 00 00 00 8b
EIP: [<f7f6b537>] ixgbe_get_stats64+0x47/0x120 [ixgbe] SS:ESP 0068:ecc45d88
CR2: 0000000000000040
---[ end trace 51ea89f4e57f54f1 ]---
Emil
^ permalink raw reply [flat|nested] 10+ messages in thread
* RE: [PATCH net-next] ixgbe: fix stats handling
2010-10-27 22:35 ` Tantilov, Emil S
@ 2010-10-28 4:24 ` Eric Dumazet
2010-10-28 12:02 ` [PATCH] ixgbe: delay rx_ring freeing Eric Dumazet
0 siblings, 1 reply; 10+ messages in thread
From: Eric Dumazet @ 2010-10-28 4:24 UTC (permalink / raw)
To: Tantilov, Emil S
Cc: David Miller, Waskiewicz Jr, Peter P, Kirsher, Jeffrey T,
Brattain, Ross B, netdev
Le mercredi 27 octobre 2010 à 16:35 -0600, Tantilov, Emil S a écrit :
> >-----Original Message-----
> >From: Eric Dumazet [mailto:eric.dumazet@gmail.com]
> >Sent: Monday, October 11, 2010 5:17 AM
> >To: David Miller; Waskiewicz Jr, Peter P; Tantilov, Emil S; Kirsher,
> >Jeffrey T
> >Cc: netdev
> >Subject: [PATCH net-next] ixgbe: fix stats handling
> >
> >Hi
> >
> >I am sending this patch for Intel people review/test and acknowledge.
> >
> >Thanks !
> >
> >[PATCH net-next] ixgbe: fix stats handling
> >
> >Current ixgbe stats have following problems :
> >
> >- Not 64 bit safe (on 32bit arches)
> >
> >- Not safe in ixgbe_clean_rx_irq() :
> > All cpus dirty a common location (netdev->stats.rx_bytes &
> >netdev->stats.rx_packets) without proper synchronization.
> > This slow down a bit multiqueue operations, and possibly miss some
> >updates.
> >
> >Fixes :
> >
> >Implement ndo_get_stats64() method to provide accurate 64bit rx|tx
> >bytes/packets counters, using 64bit safe infrastructure.
> >
> >ixgbe_get_ethtool_stats() also use this infrastructure to provide 64bit
> >safe counters.
> >
> >Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
> >CC: Peter Waskiewicz <peter.p.waskiewicz.jr@intel.com>
> >CC: Emil Tantilov <emil.s.tantilov@intel.com>
> >CC: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
> >---
> > drivers/net/ixgbe/ixgbe.h | 3 +-
> > drivers/net/ixgbe/ixgbe_ethtool.c | 29 +++++++++++---------
> > drivers/net/ixgbe/ixgbe_main.c | 40 +++++++++++++++++++++++++---
> > 3 files changed, 56 insertions(+), 16 deletions(-)
> >
> >diff --git a/drivers/net/ixgbe/ixgbe.h b/drivers/net/ixgbe/ixgbe.h
> >index a8c47b0..944d9e2 100644
> >--- a/drivers/net/ixgbe/ixgbe.h
> >+++ b/drivers/net/ixgbe/ixgbe.h
> >@@ -180,8 +180,9 @@ struct ixgbe_ring {
> > */
> >
> > struct ixgbe_queue_stats stats;
> >- unsigned long reinit_state;
> >+ struct u64_stats_sync syncp;
> > int numa_node;
> >+ unsigned long reinit_state;
> > u64 rsc_count; /* stat for coalesced packets */
> > u64 rsc_flush; /* stats for flushed packets */
> > u32 restart_queue; /* track tx queue restarts */
> >diff --git a/drivers/net/ixgbe/ixgbe_ethtool.c
> >b/drivers/net/ixgbe/ixgbe_ethtool.c
> >index d4ac943..3c7f15d 100644
> >--- a/drivers/net/ixgbe/ixgbe_ethtool.c
> >+++ b/drivers/net/ixgbe/ixgbe_ethtool.c
> >@@ -999,12 +999,11 @@ static void ixgbe_get_ethtool_stats(struct net_device
> >*netdev,
> > struct ethtool_stats *stats, u64
> >*data)
> > {
> > struct ixgbe_adapter *adapter = netdev_priv(netdev);
> >- u64 *queue_stat;
> >- int stat_count = sizeof(struct ixgbe_queue_stats) / sizeof(u64);
> > struct rtnl_link_stats64 temp;
> > const struct rtnl_link_stats64 *net_stats;
> >- int j, k;
> >- int i;
> >+ unsigned int start;
> >+ struct ixgbe_ring *ring;
> >+ int i, j;
> > char *p = NULL;
> >
> > ixgbe_update_stats(adapter);
> >@@ -1025,16 +1024,22 @@ static void ixgbe_get_ethtool_stats(struct
> >net_device *netdev,
> > sizeof(u64)) ? *(u64 *)p : *(u32 *)p;
> > }
> > for (j = 0; j < adapter->num_tx_queues; j++) {
> >- queue_stat = (u64 *)&adapter->tx_ring[j]->stats;
> >- for (k = 0; k < stat_count; k++)
> >- data[i + k] = queue_stat[k];
> >- i += k;
> >+ ring = adapter->tx_ring[j];
> >+ do {
> >+ start = u64_stats_fetch_begin_bh(&ring->syncp);
> >+ data[i] = ring->stats.packets;
> >+ data[i+1] = ring->stats.bytes;
> >+ } while (u64_stats_fetch_retry_bh(&ring->syncp, start));
> >+ i += 2;
> > }
> > for (j = 0; j < adapter->num_rx_queues; j++) {
> >- queue_stat = (u64 *)&adapter->rx_ring[j]->stats;
> >- for (k = 0; k < stat_count; k++)
> >- data[i + k] = queue_stat[k];
> >- i += k;
> >+ ring = adapter->rx_ring[j];
> >+ do {
> >+ start = u64_stats_fetch_begin_bh(&ring->syncp);
> >+ data[i] = ring->stats.packets;
> >+ data[i+1] = ring->stats.bytes;
> >+ } while (u64_stats_fetch_retry_bh(&ring->syncp, start));
> >+ i += 2;
> > }
> > if (adapter->flags & IXGBE_FLAG_DCB_ENABLED) {
> > for (j = 0; j < MAX_TX_PACKET_BUFFERS; j++) {
> >diff --git a/drivers/net/ixgbe/ixgbe_main.c
> >b/drivers/net/ixgbe/ixgbe_main.c
> >index 95dbf60..1efbcde 100644
> >--- a/drivers/net/ixgbe/ixgbe_main.c
> >+++ b/drivers/net/ixgbe/ixgbe_main.c
> >@@ -824,8 +824,10 @@ static bool ixgbe_clean_tx_irq(struct ixgbe_q_vector
> >*q_vector,
> >
> > tx_ring->total_bytes += total_bytes;
> > tx_ring->total_packets += total_packets;
> >+ u64_stats_update_begin(&tx_ring->syncp);
> > tx_ring->stats.packets += total_packets;
> > tx_ring->stats.bytes += total_bytes;
> >+ u64_stats_update_end(&tx_ring->syncp);
> > return count < tx_ring->work_limit;
> > }
> >
> >@@ -1172,7 +1174,6 @@ static bool ixgbe_clean_rx_irq(struct ixgbe_q_vector
> >*q_vector,
> > int *work_done, int work_to_do)
> > {
> > struct ixgbe_adapter *adapter = q_vector->adapter;
> >- struct net_device *netdev = adapter->netdev;
> > struct pci_dev *pdev = adapter->pdev;
> > union ixgbe_adv_rx_desc *rx_desc, *next_rxd;
> > struct ixgbe_rx_buffer *rx_buffer_info, *next_buffer;
> >@@ -1298,8 +1299,10 @@ static bool ixgbe_clean_rx_irq(struct ixgbe_q_vector
> >*q_vector,
> > rx_ring->rsc_count++;
> > rx_ring->rsc_flush++;
> > }
> >+ u64_stats_update_begin(&rx_ring->syncp);
> > rx_ring->stats.packets++;
> > rx_ring->stats.bytes += skb->len;
> >+ u64_stats_update_end(&rx_ring->syncp);
> > } else {
> > if (rx_ring->flags & IXGBE_RING_RX_PS_ENABLED) {
> > rx_buffer_info->skb = next_buffer->skb;
> >@@ -1375,8 +1378,6 @@ next_desc:
> >
> > rx_ring->total_packets += total_rx_packets;
> > rx_ring->total_bytes += total_rx_bytes;
> >- netdev->stats.rx_bytes += total_rx_bytes;
> >- netdev->stats.rx_packets += total_rx_packets;
> >
> > return cleaned;
> > }
> >@@ -6559,6 +6560,38 @@ static void ixgbe_netpoll(struct net_device *netdev)
> > }
> > #endif
> >
> >+static struct rtnl_link_stats64 *ixgbe_get_stats64(struct net_device
> >*netdev,
> >+ struct rtnl_link_stats64 *stats)
> >+{
> >+ struct ixgbe_adapter *adapter = netdev_priv(netdev);
> >+ int i;
> >+
> >+ /* accurate rx/tx bytes/packets stats */
> >+ dev_txq_stats_fold(netdev, stats);
> >+ for (i = 0; i < adapter->num_rx_queues; i++) {
> >+ struct ixgbe_ring *ring = adapter->rx_ring[i];
> >+ u64 bytes, packets;
> >+ unsigned int start;
> >+
> >+ do {
> >+ start = u64_stats_fetch_begin_bh(&ring->syncp);
> >+ packets = ring->stats.packets;
> >+ bytes = ring->stats.bytes;
> >+ } while (u64_stats_fetch_retry_bh(&ring->syncp, start));
> >+ stats->rx_packets += packets;
> >+ stats->rx_bytes += bytes;
> >+ }
> >+
> >+ /* following stats updated by ixgbe_watchdog_task() */
> >+ stats->multicast = netdev->stats.multicast;
> >+ stats->rx_errors = netdev->stats.rx_errors;
> >+ stats->rx_length_errors = netdev->stats.rx_length_errors;
> >+ stats->rx_crc_errors = netdev->stats.rx_crc_errors;
> >+ stats->rx_missed_errors = netdev->stats.rx_missed_errors;
> >+ return stats;
> >+}
> >+
> >+
> > static const struct net_device_ops ixgbe_netdev_ops = {
> > .ndo_open = ixgbe_open,
> > .ndo_stop = ixgbe_close,
> >@@ -6578,6 +6611,7 @@ static const struct net_device_ops ixgbe_netdev_ops =
> >{
> > .ndo_set_vf_vlan = ixgbe_ndo_set_vf_vlan,
> > .ndo_set_vf_tx_rate = ixgbe_ndo_set_vf_bw,
> > .ndo_get_vf_config = ixgbe_ndo_get_vf_config,
> >+ .ndo_get_stats64 = ixgbe_get_stats64,
> > #ifdef CONFIG_NET_POLL_CONTROLLER
> > .ndo_poll_controller = ixgbe_netpoll,
> > #endif
> >
>
> Eric,
>
> We are seeing intermittent hangs on ia32 arch which seem to point to this patch:
>
> BUG: unable to handle kernel NULL pointer dereference at 00000040
> IP: [<f7f6b537>] ixgbe_get_stats64+0x47/0x120 [ixgbe]
> *pdpt = 000000002dc83001 *pde = 000000032d7e5067
> Oops: 0000 [#2] SMP
> last sysfs file: /sys/devices/system/cpu/cpu23/cache/index2/shared_cpu_map
> Modules linked in: act_skbedit cls_u32 sch_multiq ixgbe mdio netconsole configfs autofs4 8021q garp stp llc sunrpc ipv6 e
> xt3 jbd dm_mirror dm_region_hash dm_log dm_round_robin dm_multipath power_meter hwmon sg ses enclosure dcdbas pcspkr serio_raw iTCO_wdt iTCO_vendor_support io
> atdma dca i7core_edac edac_core bnx2 ext4 mbcache jbd2 sr_mod cdrom sd_mod crc_t10dif pata_acpi ata_generic ata_piix megaraid_sas dm_mod [last unloaded: mperf
> ]
>
> Pid: 1939, comm: irqbalance Tainted: G D W 2.6.36-rc7-upstream-net-next-2.6-ixgbe-queue-i386-g55e1a84 #1 09CGW2/Po
> werEdge T610
> EIP: 0060:[<f7f6b537>] EFLAGS: 00010206 CPU: 0
> EIP is at ixgbe_get_stats64+0x47/0x120 [ixgbe]
> EAX: 00000000 EBX: ecc45e4c ECX: ebea0400 EDX: 00000000
> ESI: ebea0000 EDI: 00000018 EBP: f7f846a0 ESP: ecc45d88
> DS: 007b ES: 007b FS: 00d8 GS: 00e0 SS: 0068
> Process irqbalance (pid: 1939, ti=ecc44000 task=efc63a50 task.ti=ecc44000)
> Stack:
> ecc45e4c 00000000 ebea0400 ebea0000 ecc45e4c ebea0000 ecc45f04 f7f846a0
> <0> c0750593 ebea0000 edfec340 ebea0000 000002b5 c075063a edfec340 c0993eec
> <0> ebe78000 0000021c 00000000 00000009 00000000 00000000 00000000 00000000
> Call Trace:
> [<c0750593>] ? dev_get_stats+0x33/0xc0
> [<c075063a>] ? dev_seq_printf_stats+0x1a/0x180
> [<c07507aa>] ? dev_seq_show+0xa/0x20
> [<c052398f>] ? seq_read+0x22f/0x3d0
> [<c0523760>] ? seq_read+0x0/0x3d0
> [<c054fdde>] ? proc_reg_read+0x5e/0x90
> [<c054fd80>] ? proc_reg_read+0x0/0x90
> [<c050a1dd>] ? vfs_read+0x9d/0x160
> [<c049d4ef>] ? audit_syscall_entry+0x20f/0x230
> [<c050a971>] ? sys_read+0x41/0x70
> [<c0409cdf>] ? sysenter_do_call+0x12/0x28
> Code: 60 4f 7e c8 8b 44 24 08 8b b8 20 06 00 00 85 ff 7e 63 c7 44 24 04 00 00 00 00 66 90 8b 54 24 04 8b 4c 24 08 8b 84 9
> 1 00 05 00 00 <8b> 50 40 eb 06 8d 74 26 00 89 ca f6 c2 01 0f 85 ae 00 00 00 8b
> EIP: [<f7f6b537>] ixgbe_get_stats64+0x47/0x120 [ixgbe] SS:ESP 0068:ecc45d88
> CR2: 0000000000000040
> ---[ end trace 51ea89f4e57f54f1 ]---
>
> Emil
Code disassembly makes no sense to me : this doesnt match C code at all
60 4f 7e c8
8b 44 24 08 mov 0x08(%esp),%eax
8b b8 20 06 00 00 mov 0x620(%eax),%edi
85 ff test %edi,%edi
7e 63 jle +0x63
c7 44 24 04 00 00 00 00 movl $0,0x4(%esp)
66 90 xchg %ax,%ax
8b 54 24 04 mov 0x04(%esp),%edx (0 in your report)
8b 4c 24 08 mov 0x08(%esp),%ecx (ebea0400 in your rep)
8b 84 91 00 05 00 00 mov 0x500(,),eax
<8b> 50 40 mov 0x40(%eax),%edx NULL pointer deref
eb 06 jmp +6
8d 74 26 00 lea 0x0(%esi,%eiz,1),%esi
89 ca mov %ecx,%edx
f6 c2 01 test $0x1,%dl
0f 85 ae 00 00 00 jne
8b
Could you disassemble your ixgbe_get_stats64() function ?
Thanks
Mine is :
00000e29 <ixgbe_get_stats64>:
e29: 55 push %ebp
e2a: 89 e5 mov %esp,%ebp
e2c: 57 push %edi
e2d: 56 push %esi
e2e: 89 c6 mov %eax,%esi
e30: 53 push %ebx
e31: 89 d3 mov %edx,%ebx
e33: 83 ec 18 sub $0x18,%esp
e36: 8d b8 40 03 00 00 lea 0x340(%eax),%edi
e3c: e8 fc ff ff ff call dev_txq_stats_fold
e41: 31 c0 xor %eax,%eax
e43: eb 54 jmp e99 <ixgbe_get_stats64+0x70>
e45: 8b 94 87 40 0d 00 00 mov 0xd40(%edi,%eax,4),%edx
e4c: 89 75 e0 mov %esi,-0x20(%ebp)
e4f: 89 5d dc mov %ebx,-0x24(%ebp)
e52: 8b 4a 34 mov 0x34(%edx),%ecx
e55: f6 c1 01 test $0x1,%cl
e58: 74 04 je e5e <ixgbe_get_stats64+0x35>
e5a: f3 90 pause
e5c: eb f4 jmp e52 <ixgbe_get_stats64+0x29>
e5e: 8b 5a 24 mov 0x24(%edx),%ebx
e61: 8b 72 28 mov 0x28(%edx),%esi
e64: 89 5d ec mov %ebx,-0x14(%ebp)
e67: 8b 5a 2c mov 0x2c(%edx),%ebx
e6a: 89 75 f0 mov %esi,-0x10(%ebp)
e6d: 8b 72 30 mov 0x30(%edx),%esi
e70: 89 5d e4 mov %ebx,-0x1c(%ebp)
e73: 89 75 e8 mov %esi,-0x18(%ebp)
e76: 39 4a 34 cmp %ecx,0x34(%edx)
e79: 75 d7 jne e52 <ixgbe_get_stats64+0x29>
e7b: 8b 5d dc mov -0x24(%ebp),%ebx
e7e: 8b 55 ec mov -0x14(%ebp),%edx
e81: 8b 75 e0 mov -0x20(%ebp),%esi
e84: 8b 4d f0 mov -0x10(%ebp),%ecx
e87: 01 13 add %edx,(%ebx)
e89: 8b 55 e4 mov -0x1c(%ebp),%edx
e8c: 11 4b 04 adc %ecx,0x4(%ebx)
e8f: 01 53 10 add %edx,0x10(%ebx)
e92: 8b 4d e8 mov -0x18(%ebp),%ecx
e95: 11 4b 14 adc %ecx,0x14(%ebx)
e98: 40 inc %eax
e99: 3b 87 40 0e 00 00 cmp 0xe40(%edi),%eax
e9f: 7c a4 jl e45 <ixgbe_get_stats64+0x1c>
ea1: 8b 8e 90 00 00 00 mov 0x90(%esi),%ecx
ea7: c7 43 44 00 00 00 00 movl $0x0,0x44(%ebx)
eae: 89 4b 40 mov %ecx,0x40(%ebx)
eb1: 8b 86 80 00 00 00 mov 0x80(%esi),%eax
eb7: c7 43 24 00 00 00 00 movl $0x0,0x24(%ebx)
ebe: 89 43 20 mov %eax,0x20(%ebx)
ec1: 8b 96 98 00 00 00 mov 0x98(%esi),%edx
ec7: c7 43 54 00 00 00 00 movl $0x0,0x54(%ebx)
ece: 89 53 50 mov %edx,0x50(%ebx)
ed1: 8b 8e a0 00 00 00 mov 0xa0(%esi),%ecx
ed7: c7 43 64 00 00 00 00 movl $0x0,0x64(%ebx)
ede: 89 4b 60 mov %ecx,0x60(%ebx)
ee1: 8b 86 ac 00 00 00 mov 0xac(%esi),%eax
ee7: c7 43 7c 00 00 00 00 movl $0x0,0x7c(%ebx)
eee: 89 43 78 mov %eax,0x78(%ebx)
ef1: 83 c4 18 add $0x18,%esp
ef4: 89 d8 mov %ebx,%eax
ef6: 5b pop %ebx
ef7: 5e pop %esi
ef8: 5f pop %edi
ef9: 5d pop %ebp
efa: c3 ret
No 0x40(ptr) deref for me
Could adapter->rx_ring[i] becomes NULL sometime in your driver ?
If yes, a lock/protection is needed (same for ethtool stats I guess)
^ permalink raw reply [flat|nested] 10+ messages in thread
* [PATCH] ixgbe: delay rx_ring freeing
2010-10-28 4:24 ` Eric Dumazet
@ 2010-10-28 12:02 ` Eric Dumazet
2010-10-28 12:21 ` Jeff Kirsher
0 siblings, 1 reply; 10+ messages in thread
From: Eric Dumazet @ 2010-10-28 12:02 UTC (permalink / raw)
To: Tantilov, Emil S
Cc: David Miller, Waskiewicz Jr, Peter P, Kirsher, Jeffrey T,
Brattain, Ross B, netdev
Le jeudi 28 octobre 2010 à 06:24 +0200, Eric Dumazet a écrit :
> Le mercredi 27 octobre 2010 à 16:35 -0600, Tantilov, Emil S a écrit :
> > Eric,
> >
> > We are seeing intermittent hangs on ia32 arch which seem to point to this patch:
> >
> > BUG: unable to handle kernel NULL pointer dereference at 00000040
> > IP: [<f7f6b537>] ixgbe_get_stats64+0x47/0x120 [ixgbe]
> > *pdpt = 000000002dc83001 *pde = 000000032d7e5067
> > Oops: 0000 [#2] SMP
> > last sysfs file: /sys/devices/system/cpu/cpu23/cache/index2/shared_cpu_map
> > Modules linked in: act_skbedit cls_u32 sch_multiq ixgbe mdio netconsole configfs autofs4 8021q garp stp llc sunrpc ipv6 e
> > xt3 jbd dm_mirror dm_region_hash dm_log dm_round_robin dm_multipath power_meter hwmon sg ses enclosure dcdbas pcspkr serio_raw iTCO_wdt iTCO_vendor_support io
> > atdma dca i7core_edac edac_core bnx2 ext4 mbcache jbd2 sr_mod cdrom sd_mod crc_t10dif pata_acpi ata_generic ata_piix megaraid_sas dm_mod [last unloaded: mperf
> > ]
> >
> > Pid: 1939, comm: irqbalance Tainted: G D W 2.6.36-rc7-upstream-net-next-2.6-ixgbe-queue-i386-g55e1a84 #1 09CGW2/Po
> > werEdge T610
> > EIP: 0060:[<f7f6b537>] EFLAGS: 00010206 CPU: 0
> > EIP is at ixgbe_get_stats64+0x47/0x120 [ixgbe]
> > EAX: 00000000 EBX: ecc45e4c ECX: ebea0400 EDX: 00000000
> > ESI: ebea0000 EDI: 00000018 EBP: f7f846a0 ESP: ecc45d88
> > DS: 007b ES: 007b FS: 00d8 GS: 00e0 SS: 0068
> > Process irqbalance (pid: 1939, ti=ecc44000 task=efc63a50 task.ti=ecc44000)
> > Stack:
> > ecc45e4c 00000000 ebea0400 ebea0000 ecc45e4c ebea0000 ecc45f04 f7f846a0
> > <0> c0750593 ebea0000 edfec340 ebea0000 000002b5 c075063a edfec340 c0993eec
> > <0> ebe78000 0000021c 00000000 00000009 00000000 00000000 00000000 00000000
> > Call Trace:
> > [<c0750593>] ? dev_get_stats+0x33/0xc0
> > [<c075063a>] ? dev_seq_printf_stats+0x1a/0x180
> > [<c07507aa>] ? dev_seq_show+0xa/0x20
> > [<c052398f>] ? seq_read+0x22f/0x3d0
> > [<c0523760>] ? seq_read+0x0/0x3d0
> > [<c054fdde>] ? proc_reg_read+0x5e/0x90
> > [<c054fd80>] ? proc_reg_read+0x0/0x90
> > [<c050a1dd>] ? vfs_read+0x9d/0x160
> > [<c049d4ef>] ? audit_syscall_entry+0x20f/0x230
> > [<c050a971>] ? sys_read+0x41/0x70
> > [<c0409cdf>] ? sysenter_do_call+0x12/0x28
> > Code: 60 4f 7e c8 8b 44 24 08 8b b8 20 06 00 00 85 ff 7e 63 c7 44 24 04 00 00 00 00 66 90 8b 54 24 04 8b 4c 24 08 8b 84 9
> > 1 00 05 00 00 <8b> 50 40 eb 06 8d 74 26 00 89 ca f6 c2 01 0f 85 ae 00 00 00 8b
> > EIP: [<f7f6b537>] ixgbe_get_stats64+0x47/0x120 [ixgbe] SS:ESP 0068:ecc45d88
> > CR2: 0000000000000040
> > ---[ end trace 51ea89f4e57f54f1 ]---
> >
> > Emil
I believe I now understand the problem, please try following patch.
Thanks
[PATCH] ixgbe: delay rx_ring freeing
"cat /proc/net/dev" uses RCU protection only.
Its quite possible we call a driver get_stats() method while device is
dismantling and freeing its data structures.
So get_stats() methods must be very careful not accessing driver private
data without appropriate locking.
In ixgbe case, we access rx_ring pointers. These pointers are freed in
ixgbe_clear_interrupt_scheme() and set to NULL, this can trigger NULL
dereference in ixgbe_get_stats64()
A possible fix is to use RCU locking in ixgbe_get_stats64() and defer
rx_ring freeing after a grace period in ixgbe_clear_interrupt_scheme()
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Reported-by: Tantilov, Emil S <emil.s.tantilov@intel.com>
---
drivers/net/ixgbe/ixgbe.h | 1
drivers/net/ixgbe/ixgbe_main.c | 34 +++++++++++++++++++++----------
2 files changed, 25 insertions(+), 10 deletions(-)
diff --git a/drivers/net/ixgbe/ixgbe.h b/drivers/net/ixgbe/ixgbe.h
index ed8703c..018e143 100644
--- a/drivers/net/ixgbe/ixgbe.h
+++ b/drivers/net/ixgbe/ixgbe.h
@@ -192,6 +192,7 @@ struct ixgbe_ring {
unsigned int size; /* length in bytes */
dma_addr_t dma; /* phys. address of descriptor ring */
+ struct rcu_head rcu;
} ____cacheline_internodealigned_in_smp;
enum ixgbe_ring_f_enum {
diff --git a/drivers/net/ixgbe/ixgbe_main.c b/drivers/net/ixgbe/ixgbe_main.c
index f856312..1b16292 100644
--- a/drivers/net/ixgbe/ixgbe_main.c
+++ b/drivers/net/ixgbe/ixgbe_main.c
@@ -4742,6 +4742,11 @@ err_set_interrupt:
return err;
}
+static void ring_free_rcu(struct rcu_head *head)
+{
+ kfree(container_of(head, struct ixgbe_ring, rcu));
+}
+
/**
* ixgbe_clear_interrupt_scheme - Clear the current interrupt scheme settings
* @adapter: board private structure to clear interrupt scheme on
@@ -4758,7 +4763,12 @@ void ixgbe_clear_interrupt_scheme(struct ixgbe_adapter *adapter)
adapter->tx_ring[i] = NULL;
}
for (i = 0; i < adapter->num_rx_queues; i++) {
- kfree(adapter->rx_ring[i]);
+ struct ixgbe_ring *ring = adapter->rx_ring[i];
+
+ /* ixgbe_get_stats64() might access this ring, we must wait
+ * a grace period before freeing it.
+ */
+ call_rcu(&ring->rcu, ring_free_rcu);
adapter->rx_ring[i] = NULL;
}
@@ -6551,20 +6561,23 @@ static struct rtnl_link_stats64 *ixgbe_get_stats64(struct net_device *netdev,
/* accurate rx/tx bytes/packets stats */
dev_txq_stats_fold(netdev, stats);
+ rcu_read_lock();
for (i = 0; i < adapter->num_rx_queues; i++) {
- struct ixgbe_ring *ring = adapter->rx_ring[i];
+ struct ixgbe_ring *ring = ACCESS_ONCE(adapter->rx_ring[i]);
u64 bytes, packets;
unsigned int start;
- do {
- start = u64_stats_fetch_begin_bh(&ring->syncp);
- packets = ring->stats.packets;
- bytes = ring->stats.bytes;
- } while (u64_stats_fetch_retry_bh(&ring->syncp, start));
- stats->rx_packets += packets;
- stats->rx_bytes += bytes;
+ if (ring) {
+ do {
+ start = u64_stats_fetch_begin_bh(&ring->syncp);
+ packets = ring->stats.packets;
+ bytes = ring->stats.bytes;
+ } while (u64_stats_fetch_retry_bh(&ring->syncp, start));
+ stats->rx_packets += packets;
+ stats->rx_bytes += bytes;
+ }
}
-
+ rcu_read_unlock();
/* following stats updated by ixgbe_watchdog_task() */
stats->multicast = netdev->stats.multicast;
stats->rx_errors = netdev->stats.rx_errors;
@@ -7270,6 +7283,7 @@ static void __exit ixgbe_exit_module(void)
dca_unregister_notify(&dca_notifier);
#endif
pci_unregister_driver(&ixgbe_driver);
+ rcu_barrier(); /* Wait for completion of call_rcu()'s */
}
#ifdef CONFIG_IXGBE_DCA
^ permalink raw reply related [flat|nested] 10+ messages in thread
* Re: [PATCH] ixgbe: delay rx_ring freeing
2010-10-28 12:02 ` [PATCH] ixgbe: delay rx_ring freeing Eric Dumazet
@ 2010-10-28 12:21 ` Jeff Kirsher
2010-10-29 3:45 ` [PATCH] ixgbe: refactor ixgbe_alloc_queues() Eric Dumazet
0 siblings, 1 reply; 10+ messages in thread
From: Jeff Kirsher @ 2010-10-28 12:21 UTC (permalink / raw)
To: Eric Dumazet
Cc: Tantilov, Emil S, David Miller, Waskiewicz Jr, Peter P,
Brattain, Ross B, netdev
[-- Attachment #1: Type: text/plain, Size: 3956 bytes --]
On Thu, 2010-10-28 at 05:02 -0700, Eric Dumazet wrote:
> Le jeudi 28 octobre 2010 à 06:24 +0200, Eric Dumazet a écrit :
> > Le mercredi 27 octobre 2010 à 16:35 -0600, Tantilov, Emil S a écrit :
>
> > > Eric,
> > >
> > > We are seeing intermittent hangs on ia32 arch which seem to point to this patch:
> > >
> > > BUG: unable to handle kernel NULL pointer dereference at 00000040
> > > IP: [<f7f6b537>] ixgbe_get_stats64+0x47/0x120 [ixgbe]
> > > *pdpt = 000000002dc83001 *pde = 000000032d7e5067
> > > Oops: 0000 [#2] SMP
> > > last sysfs file: /sys/devices/system/cpu/cpu23/cache/index2/shared_cpu_map
> > > Modules linked in: act_skbedit cls_u32 sch_multiq ixgbe mdio netconsole configfs autofs4 8021q garp stp llc sunrpc ipv6 e
> > > xt3 jbd dm_mirror dm_region_hash dm_log dm_round_robin dm_multipath power_meter hwmon sg ses enclosure dcdbas pcspkr serio_raw iTCO_wdt iTCO_vendor_support io
> > > atdma dca i7core_edac edac_core bnx2 ext4 mbcache jbd2 sr_mod cdrom sd_mod crc_t10dif pata_acpi ata_generic ata_piix megaraid_sas dm_mod [last unloaded: mperf
> > > ]
> > >
> > > Pid: 1939, comm: irqbalance Tainted: G D W 2.6.36-rc7-upstream-net-next-2.6-ixgbe-queue-i386-g55e1a84 #1 09CGW2/Po
> > > werEdge T610
> > > EIP: 0060:[<f7f6b537>] EFLAGS: 00010206 CPU: 0
> > > EIP is at ixgbe_get_stats64+0x47/0x120 [ixgbe]
> > > EAX: 00000000 EBX: ecc45e4c ECX: ebea0400 EDX: 00000000
> > > ESI: ebea0000 EDI: 00000018 EBP: f7f846a0 ESP: ecc45d88
> > > DS: 007b ES: 007b FS: 00d8 GS: 00e0 SS: 0068
> > > Process irqbalance (pid: 1939, ti=ecc44000 task=efc63a50 task.ti=ecc44000)
> > > Stack:
> > > ecc45e4c 00000000 ebea0400 ebea0000 ecc45e4c ebea0000 ecc45f04 f7f846a0
> > > <0> c0750593 ebea0000 edfec340 ebea0000 000002b5 c075063a edfec340 c0993eec
> > > <0> ebe78000 0000021c 00000000 00000009 00000000 00000000 00000000 00000000
> > > Call Trace:
> > > [<c0750593>] ? dev_get_stats+0x33/0xc0
> > > [<c075063a>] ? dev_seq_printf_stats+0x1a/0x180
> > > [<c07507aa>] ? dev_seq_show+0xa/0x20
> > > [<c052398f>] ? seq_read+0x22f/0x3d0
> > > [<c0523760>] ? seq_read+0x0/0x3d0
> > > [<c054fdde>] ? proc_reg_read+0x5e/0x90
> > > [<c054fd80>] ? proc_reg_read+0x0/0x90
> > > [<c050a1dd>] ? vfs_read+0x9d/0x160
> > > [<c049d4ef>] ? audit_syscall_entry+0x20f/0x230
> > > [<c050a971>] ? sys_read+0x41/0x70
> > > [<c0409cdf>] ? sysenter_do_call+0x12/0x28
> > > Code: 60 4f 7e c8 8b 44 24 08 8b b8 20 06 00 00 85 ff 7e 63 c7 44 24 04 00 00 00 00 66 90 8b 54 24 04 8b 4c 24 08 8b 84 9
> > > 1 00 05 00 00 <8b> 50 40 eb 06 8d 74 26 00 89 ca f6 c2 01 0f 85 ae 00 00 00 8b
> > > EIP: [<f7f6b537>] ixgbe_get_stats64+0x47/0x120 [ixgbe] SS:ESP 0068:ecc45d88
> > > CR2: 0000000000000040
> > > ---[ end trace 51ea89f4e57f54f1 ]---
> > >
> > > Emil
>
> I believe I now understand the problem, please try following patch.
>
> Thanks
>
>
>
> [PATCH] ixgbe: delay rx_ring freeing
>
> "cat /proc/net/dev" uses RCU protection only.
>
> Its quite possible we call a driver get_stats() method while device is
> dismantling and freeing its data structures.
>
> So get_stats() methods must be very careful not accessing driver private
> data without appropriate locking.
>
> In ixgbe case, we access rx_ring pointers. These pointers are freed in
> ixgbe_clear_interrupt_scheme() and set to NULL, this can trigger NULL
> dereference in ixgbe_get_stats64()
>
> A possible fix is to use RCU locking in ixgbe_get_stats64() and defer
> rx_ring freeing after a grace period in ixgbe_clear_interrupt_scheme()
>
> Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
> Reported-by: Tantilov, Emil S <emil.s.tantilov@intel.com>
> ---
> drivers/net/ixgbe/ixgbe.h | 1
> drivers/net/ixgbe/ixgbe_main.c | 34 +++++++++++++++++++++----------
> 2 files changed, 25 insertions(+), 10 deletions(-)
>
Thanks Eric, I have added this patch to my queue.
[-- Attachment #2: This is a digitally signed message part --]
[-- Type: application/pgp-signature, Size: 490 bytes --]
^ permalink raw reply [flat|nested] 10+ messages in thread
* [PATCH] ixgbe: refactor ixgbe_alloc_queues()
2010-10-28 12:21 ` Jeff Kirsher
@ 2010-10-29 3:45 ` Eric Dumazet
2010-10-29 7:46 ` Jeff Kirsher
2010-10-29 8:38 ` Jeff Kirsher
0 siblings, 2 replies; 10+ messages in thread
From: Eric Dumazet @ 2010-10-29 3:45 UTC (permalink / raw)
To: jeffrey.t.kirsher
Cc: Tantilov, Emil S, David Miller, Waskiewicz Jr, Peter P,
Brattain, Ross B, netdev
Note : compiled only patch, not tested.
Thanks !
[PATCH] ixgbe: refactor ixgbe_alloc_queues()
I noticed ring variable was initialized before allocations, and that
memory node management was a bit ugly. We also leak memory in case of
ring allocations error.
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
---
drivers/net/ixgbe/ixgbe_main.c | 69 ++++++++++++-------------------
1 file changed, 28 insertions(+), 41 deletions(-)
diff --git a/drivers/net/ixgbe/ixgbe_main.c b/drivers/net/ixgbe/ixgbe_main.c
index 2bd3eb4..e89da9a 100644
--- a/drivers/net/ixgbe/ixgbe_main.c
+++ b/drivers/net/ixgbe/ixgbe_main.c
@@ -4470,65 +4470,52 @@ static void ixgbe_cache_ring_register(struct ixgbe_adapter *adapter)
**/
static int ixgbe_alloc_queues(struct ixgbe_adapter *adapter)
{
- int i;
- int orig_node = adapter->node;
+ int rx = 0, tx = 0, nid = adapter->node;
- for (i = 0; i < adapter->num_tx_queues; i++) {
- struct ixgbe_ring *ring = adapter->tx_ring[i];
- if (orig_node == -1) {
- int cur_node = next_online_node(adapter->node);
- if (cur_node == MAX_NUMNODES)
- cur_node = first_online_node;
- adapter->node = cur_node;
- }
- ring = kzalloc_node(sizeof(struct ixgbe_ring), GFP_KERNEL,
- adapter->node);
+ if (nid < 0 || !node_online(nid))
+ nid = first_online_node;
+
+ for (; tx < adapter->num_tx_queues; tx++) {
+ struct ixgbe_ring *ring;
+
+ ring = kzalloc_node(sizeof(*ring), GFP_KERNEL, nid);
if (!ring)
- ring = kzalloc(sizeof(struct ixgbe_ring), GFP_KERNEL);
+ ring = kzalloc(sizeof(*ring), GFP_KERNEL);
if (!ring)
- goto err_tx_ring_allocation;
+ goto err_allocation;
ring->count = adapter->tx_ring_count;
- ring->queue_index = i;
- ring->numa_node = adapter->node;
+ ring->queue_index = tx;
+ ring->numa_node = nid;
- adapter->tx_ring[i] = ring;
+ adapter->tx_ring[tx] = ring;
}
- /* Restore the adapter's original node */
- adapter->node = orig_node;
+ for (; rx < adapter->num_rx_queues; rx++) {
+ struct ixgbe_ring *ring;
- for (i = 0; i < adapter->num_rx_queues; i++) {
- struct ixgbe_ring *ring = adapter->rx_ring[i];
- if (orig_node == -1) {
- int cur_node = next_online_node(adapter->node);
- if (cur_node == MAX_NUMNODES)
- cur_node = first_online_node;
- adapter->node = cur_node;
- }
- ring = kzalloc_node(sizeof(struct ixgbe_ring), GFP_KERNEL,
- adapter->node);
+ ring = kzalloc_node(sizeof(*ring), GFP_KERNEL, nid);
if (!ring)
- ring = kzalloc(sizeof(struct ixgbe_ring), GFP_KERNEL);
+ ring = kzalloc(sizeof(*ring), GFP_KERNEL);
if (!ring)
- goto err_rx_ring_allocation;
+ goto err_allocation;
ring->count = adapter->rx_ring_count;
- ring->queue_index = i;
- ring->numa_node = adapter->node;
+ ring->queue_index = rx;
+ ring->numa_node = nid;
- adapter->rx_ring[i] = ring;
+ adapter->rx_ring[rx] = ring;
}
- /* Restore the adapter's original node */
- adapter->node = orig_node;
-
ixgbe_cache_ring_register(adapter);
return 0;
-err_rx_ring_allocation:
- for (i = 0; i < adapter->num_tx_queues; i++)
- kfree(adapter->tx_ring[i]);
-err_tx_ring_allocation:
+err_allocation:
+ while (tx)
+ kfree(adapter->tx_ring[--tx]);
+
+ while (rx)
+ kfree(adapter->tx_ring[--rx]);
+
return -ENOMEM;
}
^ permalink raw reply related [flat|nested] 10+ messages in thread
* Re: [PATCH] ixgbe: refactor ixgbe_alloc_queues()
2010-10-29 3:45 ` [PATCH] ixgbe: refactor ixgbe_alloc_queues() Eric Dumazet
@ 2010-10-29 7:46 ` Jeff Kirsher
2010-10-29 8:38 ` Jeff Kirsher
1 sibling, 0 replies; 10+ messages in thread
From: Jeff Kirsher @ 2010-10-29 7:46 UTC (permalink / raw)
To: Eric Dumazet
Cc: Tantilov, Emil S, David Miller, Waskiewicz Jr, Peter P,
Brattain, Ross B, netdev
On Thu, Oct 28, 2010 at 20:45, Eric Dumazet <eric.dumazet@gmail.com> wrote:
> Note : compiled only patch, not tested.
>
> Thanks !
>
> [PATCH] ixgbe: refactor ixgbe_alloc_queues()
>
> I noticed ring variable was initialized before allocations, and that
> memory node management was a bit ugly. We also leak memory in case of
> ring allocations error.
>
> Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
> ---
> drivers/net/ixgbe/ixgbe_main.c | 69 ++++++++++++-------------------
> 1 file changed, 28 insertions(+), 41 deletions(-)
>
Thanks Eric. I will add this to my queue of patches.
--
Cheers,
Jeff
^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [PATCH] ixgbe: refactor ixgbe_alloc_queues()
2010-10-29 3:45 ` [PATCH] ixgbe: refactor ixgbe_alloc_queues() Eric Dumazet
2010-10-29 7:46 ` Jeff Kirsher
@ 2010-10-29 8:38 ` Jeff Kirsher
2010-10-29 8:48 ` Eric Dumazet
1 sibling, 1 reply; 10+ messages in thread
From: Jeff Kirsher @ 2010-10-29 8:38 UTC (permalink / raw)
To: Eric Dumazet
Cc: Tantilov, Emil S, David Miller, Waskiewicz Jr, Peter P,
Brattain, Ross B, netdev
On Thu, Oct 28, 2010 at 20:45, Eric Dumazet <eric.dumazet@gmail.com> wrote:
> Note : compiled only patch, not tested.
>
> Thanks !
>
> [PATCH] ixgbe: refactor ixgbe_alloc_queues()
>
> I noticed ring variable was initialized before allocations, and that
> memory node management was a bit ugly. We also leak memory in case of
> ring allocations error.
>
> Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
> ---
> drivers/net/ixgbe/ixgbe_main.c | 69 ++++++++++++-------------------
> 1 file changed, 28 insertions(+), 41 deletions(-)
>
I noticed a typo when applying the patch and I fixed it in the patch
in my queue.
The following code:
> + while (rx)
> + kfree(adapter->tx_ring[--rx]);
> +
should be:
> + while (rx)
> + kfree(adapter->rx_ring[--rx]);
> +
--
Cheers,
Jeff
^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [PATCH] ixgbe: refactor ixgbe_alloc_queues()
2010-10-29 8:38 ` Jeff Kirsher
@ 2010-10-29 8:48 ` Eric Dumazet
0 siblings, 0 replies; 10+ messages in thread
From: Eric Dumazet @ 2010-10-29 8:48 UTC (permalink / raw)
To: Jeff Kirsher
Cc: Tantilov, Emil S, David Miller, Waskiewicz Jr, Peter P,
Brattain, Ross B, netdev
Le vendredi 29 octobre 2010 à 01:38 -0700, Jeff Kirsher a écrit :
>
> I noticed a typo when applying the patch and I fixed it in the patch
> in my queue.
>
> The following code:
> > + while (rx)
> > + kfree(adapter->tx_ring[--rx]);
> > +
>
> should be:
> > + while (rx)
> > + kfree(adapter->rx_ring[--rx]);
> > +
>
Good catch, thanks !
^ permalink raw reply [flat|nested] 10+ messages in thread
end of thread, other threads:[~2010-10-29 8:49 UTC | newest]
Thread overview: 10+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2010-10-11 12:17 [PATCH net-next] ixgbe: fix stats handling Eric Dumazet
2010-10-12 0:13 ` Jeff Kirsher
2010-10-27 22:35 ` Tantilov, Emil S
2010-10-28 4:24 ` Eric Dumazet
2010-10-28 12:02 ` [PATCH] ixgbe: delay rx_ring freeing Eric Dumazet
2010-10-28 12:21 ` Jeff Kirsher
2010-10-29 3:45 ` [PATCH] ixgbe: refactor ixgbe_alloc_queues() Eric Dumazet
2010-10-29 7:46 ` Jeff Kirsher
2010-10-29 8:38 ` Jeff Kirsher
2010-10-29 8:48 ` Eric Dumazet
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).