Netdev List
 help / color / mirror / Atom feed
From: <netanel@amazon.com>
To: <davem@davemloft.net>, <netdev@vger.kernel.org>
Cc: Netanel Belgazal <netanel@amazon.com>, <dwmw@amazon.com>,
	<zorik@amazon.com>, <matua@amazon.com>, <saeedb@amazon.com>,
	<msw@amazon.com>, <aliguori@amazon.com>, <nafea@amazon.com>,
	<evgenys@amazon.com>, <gtzalik@amazon.com>
Subject: [PATCH net-next 1/2] net: ena: add detection and recovery mechanism for handling missed/misrouted MSI-X
Date: Thu, 28 Dec 2017 21:31:30 +0000	[thread overview]
Message-ID: <1514496691-70940-2-git-send-email-netanel@amazon.com> (raw)
In-Reply-To: <1514496691-70940-1-git-send-email-netanel@amazon.com>

From: Netanel Belgazal <netanel@amazon.com>

A mechanism for detection of stuck Rx/Tx rings due to missed or
misrouted interrupts.
Check if there are unhandled completion descriptors before the first
MSI-X interrupt arrived.
The check is per queue and per interrupt vector.
Once such condition is detected, driver and device reset is scheduled.

Signed-off-by: Netanel Belgazal <netanel@amazon.com>
---
 drivers/net/ethernet/amazon/ena/ena_eth_com.c   | 12 +++++
 drivers/net/ethernet/amazon/ena/ena_eth_com.h   |  2 +
 drivers/net/ethernet/amazon/ena/ena_netdev.c    | 68 ++++++++++++++++++++++---
 drivers/net/ethernet/amazon/ena/ena_netdev.h    |  4 ++
 drivers/net/ethernet/amazon/ena/ena_regs_defs.h |  2 +
 5 files changed, 81 insertions(+), 7 deletions(-)

diff --git a/drivers/net/ethernet/amazon/ena/ena_eth_com.c b/drivers/net/ethernet/amazon/ena/ena_eth_com.c
index b11e573ad57a..04ed2b57ff20 100644
--- a/drivers/net/ethernet/amazon/ena/ena_eth_com.c
+++ b/drivers/net/ethernet/amazon/ena/ena_eth_com.c
@@ -504,3 +504,15 @@ int ena_com_tx_comp_req_id_get(struct ena_com_io_cq *io_cq, u16 *req_id)
 
 	return 0;
 }
+
+bool ena_com_cq_empty(struct ena_com_io_cq *io_cq)
+{
+	struct ena_eth_io_rx_cdesc_base *cdesc;
+
+	cdesc = ena_com_get_next_rx_cdesc(io_cq);
+	if (cdesc)
+		return false;
+	else
+		return true;
+}
+
diff --git a/drivers/net/ethernet/amazon/ena/ena_eth_com.h b/drivers/net/ethernet/amazon/ena/ena_eth_com.h
index bb53c3a4f8e9..2f7657227cfe 100644
--- a/drivers/net/ethernet/amazon/ena/ena_eth_com.h
+++ b/drivers/net/ethernet/amazon/ena/ena_eth_com.h
@@ -88,6 +88,8 @@ int ena_com_add_single_rx_desc(struct ena_com_io_sq *io_sq,
 
 int ena_com_tx_comp_req_id_get(struct ena_com_io_cq *io_cq, u16 *req_id);
 
+bool ena_com_cq_empty(struct ena_com_io_cq *io_cq);
+
 static inline void ena_com_unmask_intr(struct ena_com_io_cq *io_cq,
 				       struct ena_eth_io_intr_reg *intr_reg)
 {
diff --git a/drivers/net/ethernet/amazon/ena/ena_netdev.c b/drivers/net/ethernet/amazon/ena/ena_netdev.c
index 97c5a89a9cf7..a6f283232cb7 100644
--- a/drivers/net/ethernet/amazon/ena/ena_netdev.c
+++ b/drivers/net/ethernet/amazon/ena/ena_netdev.c
@@ -158,6 +158,8 @@ static void ena_init_io_rings_common(struct ena_adapter *adapter,
 	ring->per_napi_packets = 0;
 	ring->per_napi_bytes = 0;
 	ring->cpu = 0;
+	ring->first_interrupt = false;
+	ring->no_interrupt_event_cnt = 0;
 	u64_stats_init(&ring->syncp);
 }
 
@@ -1274,6 +1276,9 @@ static irqreturn_t ena_intr_msix_io(int irq, void *data)
 {
 	struct ena_napi *ena_napi = data;
 
+	ena_napi->tx_ring->first_interrupt = true;
+	ena_napi->rx_ring->first_interrupt = true;
+
 	napi_schedule_irqoff(&ena_napi->napi);
 
 	return IRQ_HANDLED;
@@ -2648,8 +2653,32 @@ static void ena_fw_reset_device(struct work_struct *work)
 	rtnl_unlock();
 }
 
-static int check_missing_comp_in_queue(struct ena_adapter *adapter,
-				       struct ena_ring *tx_ring)
+static int check_for_rx_interrupt_queue(struct ena_adapter *adapter,
+					struct ena_ring *rx_ring)
+{
+	if (likely(rx_ring->first_interrupt))
+		return 0;
+
+	if (ena_com_cq_empty(rx_ring->ena_com_io_cq))
+		return 0;
+
+	rx_ring->no_interrupt_event_cnt++;
+
+	if (rx_ring->no_interrupt_event_cnt == ENA_MAX_NO_INTERRUPT_ITERATIONS) {
+		netif_err(adapter, rx_err, adapter->netdev,
+			  "Potential MSIX issue on Rx side Queue = %d. Reset the device\n",
+			  rx_ring->qid);
+		adapter->reset_reason = ENA_REGS_RESET_MISS_INTERRUPT;
+		smp_mb__before_atomic();
+		set_bit(ENA_FLAG_TRIGGER_RESET, &adapter->flags);
+		return -EIO;
+	}
+
+	return 0;
+}
+
+static int check_missing_comp_in_tx_queue(struct ena_adapter *adapter,
+					  struct ena_ring *tx_ring)
 {
 	struct ena_tx_buffer *tx_buf;
 	unsigned long last_jiffies;
@@ -2659,8 +2688,27 @@ static int check_missing_comp_in_queue(struct ena_adapter *adapter,
 	for (i = 0; i < tx_ring->ring_size; i++) {
 		tx_buf = &tx_ring->tx_buffer_info[i];
 		last_jiffies = tx_buf->last_jiffies;
-		if (unlikely(last_jiffies &&
-			     time_is_before_jiffies(last_jiffies + adapter->missing_tx_completion_to))) {
+
+		if (last_jiffies == 0)
+			/* no pending Tx at this location */
+			continue;
+
+		if (unlikely(!tx_ring->first_interrupt && time_is_before_jiffies(last_jiffies +
+			     2 * adapter->missing_tx_completion_to))) {
+			/* If after graceful period interrupt is still not
+			 * received, we schedule a reset
+			 */
+			netif_err(adapter, tx_err, adapter->netdev,
+				  "Potential MSIX issue on Tx side Queue = %d. Reset the device\n",
+				  tx_ring->qid);
+			adapter->reset_reason = ENA_REGS_RESET_MISS_INTERRUPT;
+			smp_mb__before_atomic();
+			set_bit(ENA_FLAG_TRIGGER_RESET, &adapter->flags);
+			return -EIO;
+		}
+
+		if (unlikely(time_is_before_jiffies(last_jiffies +
+				adapter->missing_tx_completion_to))) {
 			if (!tx_buf->print_once)
 				netif_notice(adapter, tx_err, adapter->netdev,
 					     "Found a Tx that wasn't completed on time, qid %d, index %d.\n",
@@ -2689,9 +2737,10 @@ static int check_missing_comp_in_queue(struct ena_adapter *adapter,
 	return rc;
 }
 
-static void check_for_missing_tx_completions(struct ena_adapter *adapter)
+static void check_for_missing_completions(struct ena_adapter *adapter)
 {
 	struct ena_ring *tx_ring;
+	struct ena_ring *rx_ring;
 	int i, budget, rc;
 
 	/* Make sure the driver doesn't turn the device in other process */
@@ -2710,8 +2759,13 @@ static void check_for_missing_tx_completions(struct ena_adapter *adapter)
 
 	for (i = adapter->last_monitored_tx_qid; i < adapter->num_queues; i++) {
 		tx_ring = &adapter->tx_ring[i];
+		rx_ring = &adapter->rx_ring[i];
+
+		rc = check_missing_comp_in_tx_queue(adapter, tx_ring);
+		if (unlikely(rc))
+			return;
 
-		rc = check_missing_comp_in_queue(adapter, tx_ring);
+		rc = check_for_rx_interrupt_queue(adapter, rx_ring);
 		if (unlikely(rc))
 			return;
 
@@ -2870,7 +2924,7 @@ static void ena_timer_service(struct timer_list *t)
 
 	check_for_admin_com_state(adapter);
 
-	check_for_missing_tx_completions(adapter);
+	check_for_missing_completions(adapter);
 
 	check_for_empty_rx_ring(adapter);
 
diff --git a/drivers/net/ethernet/amazon/ena/ena_netdev.h b/drivers/net/ethernet/amazon/ena/ena_netdev.h
index 3bbc003871de..734ff2e84494 100644
--- a/drivers/net/ethernet/amazon/ena/ena_netdev.h
+++ b/drivers/net/ethernet/amazon/ena/ena_netdev.h
@@ -122,6 +122,7 @@
  * We wait for 6 sec just to be on the safe side.
  */
 #define ENA_DEVICE_KALIVE_TIMEOUT	(6 * HZ)
+#define ENA_MAX_NO_INTERRUPT_ITERATIONS 3
 
 #define ENA_MMIO_DISABLE_REG_READ	BIT(0)
 
@@ -236,6 +237,9 @@ struct ena_ring {
 	/* The maximum header length the device can handle */
 	u8 tx_max_header_size;
 
+	bool first_interrupt;
+	u16 no_interrupt_event_cnt;
+
 	/* cpu for TPH */
 	int cpu;
 	 /* number of tx/rx_buffer_info's entries */
diff --git a/drivers/net/ethernet/amazon/ena/ena_regs_defs.h b/drivers/net/ethernet/amazon/ena/ena_regs_defs.h
index 9aec43c5bba8..48ca97fbe7bc 100644
--- a/drivers/net/ethernet/amazon/ena/ena_regs_defs.h
+++ b/drivers/net/ethernet/amazon/ena/ena_regs_defs.h
@@ -60,6 +60,8 @@ enum ena_regs_reset_reason_types {
 	ENA_REGS_RESET_USER_TRIGGER		= 12,
 
 	ENA_REGS_RESET_GENERIC			= 13,
+
+	ENA_REGS_RESET_MISS_INTERRUPT		= 14,
 };
 
 /* ena_registers offsets */
-- 
2.7.3.AMZN

  reply	other threads:[~2017-12-28 21:31 UTC|newest]

Thread overview: 4+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2017-12-28 21:31 [PATCH net-next 1/2] update ENA driver to version 1.5.0 netanel
2017-12-28 21:31 ` netanel [this message]
2017-12-28 21:31 ` [PATCH net-next 2/2] net: ena: increase ena driver version to 1.5.0 netanel
2018-01-02 19:35 ` [PATCH net-next 1/2] update ENA driver to version 1.5.0 David Miller

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=1514496691-70940-2-git-send-email-netanel@amazon.com \
    --to=netanel@amazon.com \
    --cc=aliguori@amazon.com \
    --cc=davem@davemloft.net \
    --cc=dwmw@amazon.com \
    --cc=evgenys@amazon.com \
    --cc=gtzalik@amazon.com \
    --cc=matua@amazon.com \
    --cc=msw@amazon.com \
    --cc=nafea@amazon.com \
    --cc=netdev@vger.kernel.org \
    --cc=saeedb@amazon.com \
    --cc=zorik@amazon.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox