* [PATCH net] atlantic: fix deadlock at aq_nic_stop
@ 2022-10-14 10:34 Íñigo Huguet
2022-10-14 12:13 ` Andrew Lunn
2022-10-20 7:53 ` [PATCH v2 " Íñigo Huguet
0 siblings, 2 replies; 20+ messages in thread
From: Íñigo Huguet @ 2022-10-14 10:34 UTC (permalink / raw)
To: irusskikh, dbogdanov
Cc: davem, edumazet, kuba, pabeni, netdev, Íñigo Huguet,
Li Liang
NIC is stopped with rtnl_lock held, and during the stop it cancels the
'service_task' work and free irqs.
However, if CONFIG_MACSEC is set, rtnl_lock is acquired both from
aq_nic_service_task and aq_linkstate_threaded_isr. Then a deadlock
happens if aq_nic_stop tries to cancel/disable them when they've already
started their execution.
As the deadlock is caused by rtnl_lock, it causes many other processes
to stall, not only atlantic related stuff.
Fix trying to acquire rtnl_lock at the beginning of those functions, and
returning if NIC closing is ongoing. Also do the "linkstate" stuff in a
workqueue instead than in a threaded irq, where sleeping or waiting a
mutex for a long time is discouraged.
The issue appeared repeteadly attaching and deattaching the NIC to a
bond interface. Doing that after this patch I cannot reproduce the bug.
Fixes: 62c1c2e606f6 ("net: atlantic: MACSec offload skeleton")
Reported-by: Li Liang <liali@redhat.com>
Signed-off-by: Íñigo Huguet <ihuguet@redhat.com>
---
.../ethernet/aquantia/atlantic/aq_macsec.c | 7 +--
.../net/ethernet/aquantia/atlantic/aq_nic.c | 59 ++++++++++++++++---
.../net/ethernet/aquantia/atlantic/aq_nic.h | 1 +
3 files changed, 55 insertions(+), 12 deletions(-)
diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_macsec.c b/drivers/net/ethernet/aquantia/atlantic/aq_macsec.c
index 3d0e16791e1c..5759eba89db9 100644
--- a/drivers/net/ethernet/aquantia/atlantic/aq_macsec.c
+++ b/drivers/net/ethernet/aquantia/atlantic/aq_macsec.c
@@ -1458,7 +1458,7 @@ int aq_macsec_enable(struct aq_nic_s *nic)
if (!nic->macsec_cfg)
return 0;
- rtnl_lock();
+ ASSERT_RTNL();
if (nic->aq_fw_ops->send_macsec_req) {
struct macsec_cfg_request cfg = { 0 };
@@ -1507,7 +1507,6 @@ int aq_macsec_enable(struct aq_nic_s *nic)
ret = aq_apply_macsec_cfg(nic);
unlock:
- rtnl_unlock();
return ret;
}
@@ -1519,9 +1518,9 @@ void aq_macsec_work(struct aq_nic_s *nic)
if (!netif_carrier_ok(nic->ndev))
return;
- rtnl_lock();
+ ASSERT_RTNL();
+
aq_check_txsa_expiration(nic);
- rtnl_unlock();
}
int aq_macsec_rx_sa_cnt(struct aq_nic_s *nic)
diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_nic.c b/drivers/net/ethernet/aquantia/atlantic/aq_nic.c
index 06508eebb585..5cb7d165dd21 100644
--- a/drivers/net/ethernet/aquantia/atlantic/aq_nic.c
+++ b/drivers/net/ethernet/aquantia/atlantic/aq_nic.c
@@ -40,6 +40,8 @@ static unsigned int aq_itr_rx;
module_param_named(aq_itr_rx, aq_itr_rx, uint, 0644);
MODULE_PARM_DESC(aq_itr_rx, "RX interrupt throttle rate");
+#define AQ_TASK_RETRY_MS 50
+
static void aq_nic_update_ndev_stats(struct aq_nic_s *self);
static void aq_nic_rss_init(struct aq_nic_s *self, unsigned int num_rss_queues)
@@ -210,19 +212,41 @@ static int aq_nic_update_link_status(struct aq_nic_s *self)
return 0;
}
-static irqreturn_t aq_linkstate_threaded_isr(int irq, void *private)
+static irqreturn_t aq_linkstate_isr(int irq, void *private)
{
struct aq_nic_s *self = private;
if (!self)
return IRQ_NONE;
+ if (!aq_utils_obj_test(&self->flags, AQ_NIC_FLAG_CLOSING))
+ aq_ndev_schedule_work(&self->linkstate_task);
+
+ return IRQ_HANDLED;
+}
+
+static void aq_nic_linkstate_task(struct work_struct *work)
+{
+ struct aq_nic_s *self = container_of(work, struct aq_nic_s,
+ linkstate_task);
+
+#if IS_ENABLED(CONFIG_MACSEC)
+ /* avoid deadlock at aq_nic_stop -> cancel_work_sync */
+ while (!rtnl_trylock()) {
+ if (aq_utils_obj_test(&self->flags, AQ_NIC_FLAG_CLOSING))
+ return;
+ msleep(AQ_TASK_RETRY_MS);
+ }
+#endif
+
aq_nic_update_link_status(self);
+#if IS_ENABLED(CONFIG_MACSEC)
+ rtnl_unlock();
+#endif
+
self->aq_hw_ops->hw_irq_enable(self->aq_hw,
BIT(self->aq_nic_cfg.link_irq_vec));
-
- return IRQ_HANDLED;
}
static void aq_nic_service_task(struct work_struct *work)
@@ -236,12 +260,23 @@ static void aq_nic_service_task(struct work_struct *work)
if (aq_utils_obj_test(&self->flags, AQ_NIC_FLAGS_IS_NOT_READY))
return;
+#if IS_ENABLED(CONFIG_MACSEC)
+ /* avoid deadlock at aq_nic_stop -> cancel_work_sync */
+ while (!rtnl_trylock()) {
+ if (aq_utils_obj_test(&self->flags, AQ_NIC_FLAG_CLOSING))
+ return;
+ msleep(AQ_TASK_RETRY_MS);
+ }
+#endif
+
err = aq_nic_update_link_status(self);
if (err)
return;
#if IS_ENABLED(CONFIG_MACSEC)
aq_macsec_work(self);
+
+ rtnl_unlock();
#endif
mutex_lock(&self->fwreq_mutex);
@@ -505,6 +540,7 @@ int aq_nic_start(struct aq_nic_s *self)
if (err)
goto err_exit;
+ INIT_WORK(&self->linkstate_task, aq_nic_linkstate_task);
INIT_WORK(&self->service_task, aq_nic_service_task);
timer_setup(&self->service_timer, aq_nic_service_timer_cb, 0);
@@ -531,10 +567,9 @@ int aq_nic_start(struct aq_nic_s *self)
if (cfg->link_irq_vec) {
int irqvec = pci_irq_vector(self->pdev,
cfg->link_irq_vec);
- err = request_threaded_irq(irqvec, NULL,
- aq_linkstate_threaded_isr,
- IRQF_SHARED | IRQF_ONESHOT,
- self->ndev->name, self);
+ err = request_irq(irqvec, aq_linkstate_isr,
+ IRQF_SHARED | IRQF_ONESHOT,
+ self->ndev->name, self);
if (err < 0)
goto err_exit;
self->msix_entry_mask |= (1 << cfg->link_irq_vec);
@@ -1380,11 +1415,15 @@ int aq_nic_set_loopback(struct aq_nic_s *self)
int aq_nic_stop(struct aq_nic_s *self)
{
unsigned int i = 0U;
+ int ret;
+
+ aq_utils_obj_set(&self->flags, AQ_NIC_FLAG_CLOSING);
netif_tx_disable(self->ndev);
netif_carrier_off(self->ndev);
del_timer_sync(&self->service_timer);
+ cancel_work_sync(&self->linkstate_task);
cancel_work_sync(&self->service_task);
self->aq_hw_ops->hw_irq_disable(self->aq_hw, AQ_CFG_IRQ_MASK);
@@ -1401,7 +1440,11 @@ int aq_nic_stop(struct aq_nic_s *self)
aq_ptp_ring_stop(self);
- return self->aq_hw_ops->hw_stop(self->aq_hw);
+ ret = self->aq_hw_ops->hw_stop(self->aq_hw);
+
+ aq_utils_obj_clear(&self->flags, AQ_NIC_FLAG_CLOSING);
+
+ return ret;
}
void aq_nic_set_power(struct aq_nic_s *self)
diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_nic.h b/drivers/net/ethernet/aquantia/atlantic/aq_nic.h
index 935ba889bd9a..a114b66990a9 100644
--- a/drivers/net/ethernet/aquantia/atlantic/aq_nic.h
+++ b/drivers/net/ethernet/aquantia/atlantic/aq_nic.h
@@ -140,6 +140,7 @@ struct aq_nic_s {
const struct aq_fw_ops *aq_fw_ops;
struct aq_nic_cfg_s aq_nic_cfg;
struct timer_list service_timer;
+ struct work_struct linkstate_task;
struct work_struct service_task;
struct timer_list polling_timer;
struct aq_hw_link_status_s link_status;
--
2.34.1
^ permalink raw reply related [flat|nested] 20+ messages in thread* Re: [PATCH net] atlantic: fix deadlock at aq_nic_stop 2022-10-14 10:34 [PATCH net] atlantic: fix deadlock at aq_nic_stop Íñigo Huguet @ 2022-10-14 12:13 ` Andrew Lunn 2022-10-14 12:43 ` Íñigo Huguet 2022-10-20 7:53 ` [PATCH v2 " Íñigo Huguet 1 sibling, 1 reply; 20+ messages in thread From: Andrew Lunn @ 2022-10-14 12:13 UTC (permalink / raw) To: Íñigo Huguet Cc: irusskikh, dbogdanov, davem, edumazet, kuba, pabeni, netdev, Li Liang > Fix trying to acquire rtnl_lock at the beginning of those functions, and > returning if NIC closing is ongoing. Also do the "linkstate" stuff in a > workqueue instead than in a threaded irq, where sleeping or waiting a > mutex for a long time is discouraged. What happens when the same interrupt fires again, while the work queue is still active? The advantage of the threaded interrupt handler is that the interrupt will be kept disabled, and should not fire again until the threaded interrupt handler exits. > +static void aq_nic_linkstate_task(struct work_struct *work) > +{ > + struct aq_nic_s *self = container_of(work, struct aq_nic_s, > + linkstate_task); > + > +#if IS_ENABLED(CONFIG_MACSEC) > + /* avoid deadlock at aq_nic_stop -> cancel_work_sync */ > + while (!rtnl_trylock()) { > + if (aq_utils_obj_test(&self->flags, AQ_NIC_FLAG_CLOSING)) > + return; > + msleep(AQ_TASK_RETRY_MS); > + } > +#endif > + > aq_nic_update_link_status(self); > > +#if IS_ENABLED(CONFIG_MACSEC) > + rtnl_unlock(); > +#endif > + If MACSEC is enabled, aq_nic_update_link_status() is called with RTNL held. If it is not enabled, RTNL is not held. This sort of inconsistency could lead to further locking bugs, since it is not obvious. Please try to make this consistent. Andrew ^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH net] atlantic: fix deadlock at aq_nic_stop 2022-10-14 12:13 ` Andrew Lunn @ 2022-10-14 12:43 ` Íñigo Huguet 2022-10-14 13:35 ` Andrew Lunn 0 siblings, 1 reply; 20+ messages in thread From: Íñigo Huguet @ 2022-10-14 12:43 UTC (permalink / raw) To: Andrew Lunn Cc: irusskikh, dbogdanov, davem, edumazet, kuba, pabeni, netdev, Li Liang On Fri, Oct 14, 2022 at 2:14 PM Andrew Lunn <andrew@lunn.ch> wrote: > > > Fix trying to acquire rtnl_lock at the beginning of those functions, and > > returning if NIC closing is ongoing. Also do the "linkstate" stuff in a > > workqueue instead than in a threaded irq, where sleeping or waiting a > > mutex for a long time is discouraged. > > What happens when the same interrupt fires again, while the work queue > is still active? The advantage of the threaded interrupt handler is > that the interrupt will be kept disabled, and should not fire again > until the threaded interrupt handler exits. Nothing happens, if it's already queued, it won't be queued again, and when it runs it will evaluate the last link state. And in the worst case, it will be enqueued to run again, and if linkstate has changed it will be evaluated again. This will rarely happen and it's harmless. Also, I haven't checked it but these lines suggest that the IRQ is auto-disabled in the hw until you enable it again. I didn't rely on this, anyway. self->aq_hw_ops->hw_irq_enable(self->aq_hw, BIT(self->aq_nic_cfg.link_irq_vec)); Honestly I was a bit in doubt on doing this, with the threaded irq it would also work. I'd like to hear more opinions about this and I can change it back. > > > +static void aq_nic_linkstate_task(struct work_struct *work) > > +{ > > + struct aq_nic_s *self = container_of(work, struct aq_nic_s, > > + linkstate_task); > > + > > +#if IS_ENABLED(CONFIG_MACSEC) > > + /* avoid deadlock at aq_nic_stop -> cancel_work_sync */ > > + while (!rtnl_trylock()) { > > + if (aq_utils_obj_test(&self->flags, AQ_NIC_FLAG_CLOSING)) > > + return; > > + msleep(AQ_TASK_RETRY_MS); > > + } > > +#endif > > + > > aq_nic_update_link_status(self); > > > > +#if IS_ENABLED(CONFIG_MACSEC) > > + rtnl_unlock(); > > +#endif > > + > > If MACSEC is enabled, aq_nic_update_link_status() is called with RTNL > held. If it is not enabled, RTNL is not held. This sort of > inconsistency could lead to further locking bugs, since it is not > obvious. Please try to make this consistent. This is not new in these patches, that's what was already happening, I just moved it to get the lock a bit earlier. In my opinion, this is as it should be: why acquire a mutex if you don't have anything to protect with it? And it's worse with rtnl_lock which is held by many processes, and can be held for quite long times... > > Andrew > -- Íñigo Huguet ^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH net] atlantic: fix deadlock at aq_nic_stop 2022-10-14 12:43 ` Íñigo Huguet @ 2022-10-14 13:35 ` Andrew Lunn 2022-10-14 13:44 ` Íñigo Huguet 0 siblings, 1 reply; 20+ messages in thread From: Andrew Lunn @ 2022-10-14 13:35 UTC (permalink / raw) To: Íñigo Huguet Cc: irusskikh, dbogdanov, davem, edumazet, kuba, pabeni, netdev, Li Liang On Fri, Oct 14, 2022 at 02:43:47PM +0200, Íñigo Huguet wrote: > On Fri, Oct 14, 2022 at 2:14 PM Andrew Lunn <andrew@lunn.ch> wrote: > > > > > Fix trying to acquire rtnl_lock at the beginning of those functions, and > > > returning if NIC closing is ongoing. Also do the "linkstate" stuff in a > > > workqueue instead than in a threaded irq, where sleeping or waiting a > > > mutex for a long time is discouraged. > > > > What happens when the same interrupt fires again, while the work queue > > is still active? The advantage of the threaded interrupt handler is > > that the interrupt will be kept disabled, and should not fire again > > until the threaded interrupt handler exits. > > Nothing happens, if it's already queued, it won't be queued again, and > when it runs it will evaluate the last link state. And in the worst > case, it will be enqueued to run again, and if linkstate has changed > it will be evaluated again. This will rarely happen and it's harmless. > > Also, I haven't checked it but these lines suggest that the IRQ is > auto-disabled in the hw until you enable it again. I didn't rely on > this, anyway. > self->aq_hw_ops->hw_irq_enable(self->aq_hw, > BIT(self->aq_nic_cfg.link_irq_vec)); > > Honestly I was a bit in doubt on doing this, with the threaded irq it > would also work. I'd like to hear more opinions about this and I can > change it back. Ethernet PHYs do all there interrupt handling in threaded IRQs. That can require a number of MDIO transactions. So we can be talking about 64 bits at 2.5MHz, so 25uS or more. We have not seen issues with that. > > If MACSEC is enabled, aq_nic_update_link_status() is called with RTNL > > held. If it is not enabled, RTNL is not held. This sort of > > inconsistency could lead to further locking bugs, since it is not > > obvious. Please try to make this consistent. > > This is not new in these patches, that's what was already happening, I > just moved it to get the lock a bit earlier. In my opinion, this is as > it should be: why acquire a mutex if you don't have anything to > protect with it? And it's worse with rtnl_lock which is held by many > processes, and can be held for quite long times... Maybe the lock needs to be moved closer to what actually needs to be protect? What is it protecting? Andrew ^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH net] atlantic: fix deadlock at aq_nic_stop 2022-10-14 13:35 ` Andrew Lunn @ 2022-10-14 13:44 ` Íñigo Huguet 2022-10-15 15:09 ` Andrew Lunn 0 siblings, 1 reply; 20+ messages in thread From: Íñigo Huguet @ 2022-10-14 13:44 UTC (permalink / raw) To: Andrew Lunn Cc: irusskikh, dbogdanov, davem, edumazet, kuba, pabeni, netdev, Li Liang On Fri, Oct 14, 2022 at 3:35 PM Andrew Lunn <andrew@lunn.ch> wrote: > > On Fri, Oct 14, 2022 at 02:43:47PM +0200, Íñigo Huguet wrote: > > On Fri, Oct 14, 2022 at 2:14 PM Andrew Lunn <andrew@lunn.ch> wrote: > > > > > > > Fix trying to acquire rtnl_lock at the beginning of those functions, and > > > > returning if NIC closing is ongoing. Also do the "linkstate" stuff in a > > > > workqueue instead than in a threaded irq, where sleeping or waiting a > > > > mutex for a long time is discouraged. > > > > > > What happens when the same interrupt fires again, while the work queue > > > is still active? The advantage of the threaded interrupt handler is > > > that the interrupt will be kept disabled, and should not fire again > > > until the threaded interrupt handler exits. > > > > Nothing happens, if it's already queued, it won't be queued again, and > > when it runs it will evaluate the last link state. And in the worst > > case, it will be enqueued to run again, and if linkstate has changed > > it will be evaluated again. This will rarely happen and it's harmless. > > > > Also, I haven't checked it but these lines suggest that the IRQ is > > auto-disabled in the hw until you enable it again. I didn't rely on > > this, anyway. > > self->aq_hw_ops->hw_irq_enable(self->aq_hw, > > BIT(self->aq_nic_cfg.link_irq_vec)); > > > > Honestly I was a bit in doubt on doing this, with the threaded irq it > > would also work. I'd like to hear more opinions about this and I can > > change it back. > > Ethernet PHYs do all there interrupt handling in threaded IRQs. That > can require a number of MDIO transactions. So we can be talking about > 64 bits at 2.5MHz, so 25uS or more. We have not seen issues with that. > > > > If MACSEC is enabled, aq_nic_update_link_status() is called with RTNL > > > held. If it is not enabled, RTNL is not held. This sort of > > > inconsistency could lead to further locking bugs, since it is not > > > obvious. Please try to make this consistent. > > > > This is not new in these patches, that's what was already happening, I > > just moved it to get the lock a bit earlier. In my opinion, this is as > > it should be: why acquire a mutex if you don't have anything to > > protect with it? And it's worse with rtnl_lock which is held by many > > processes, and can be held for quite long times... > > Maybe the lock needs to be moved closer to what actually needs to be > protect? What is it protecting? It's protecting the operations of aq_macsec_enable and aq_macsec_work. The locking was closer to them, but the idea of this patch is to move the locking to an earlier moment so, in the case we need to abort, do it before changing anything. > > Andrew > -- Íñigo Huguet ^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH net] atlantic: fix deadlock at aq_nic_stop 2022-10-14 13:44 ` Íñigo Huguet @ 2022-10-15 15:09 ` Andrew Lunn 2022-10-17 7:22 ` Íñigo Huguet 0 siblings, 1 reply; 20+ messages in thread From: Andrew Lunn @ 2022-10-15 15:09 UTC (permalink / raw) To: Íñigo Huguet Cc: irusskikh, dbogdanov, davem, edumazet, kuba, pabeni, netdev, Li Liang > > Maybe the lock needs to be moved closer to what actually needs to be > > protect? What is it protecting? > > It's protecting the operations of aq_macsec_enable and aq_macsec_work. > The locking was closer to them, but the idea of this patch is to move > the locking to an earlier moment so, in the case we need to abort, do > it before changing anything. aq_check_txsa_expiration() seems to be one of the issues? At least, the lock is taken before and released afterwards. So what in aq_check_txsa_expiration() requires the lock? I don't like the use of rtnl_trylock(). It suggests the basic design is wrong, or overly complex, and so probably not working correctly. https://blog.ffwll.ch/2022/07/locking-engineering.html Please try to identify what is being protected. If it is driver internal state, could it be replaced with a driver mutex, rather than RTNL? Or is it network stack as a whole state, which really does require RTNL? If so, how do other drivers deal with this problem? Is it specific to MACSEC? Does MACSEC have a design problem? Andrew ^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH net] atlantic: fix deadlock at aq_nic_stop 2022-10-15 15:09 ` Andrew Lunn @ 2022-10-17 7:22 ` Íñigo Huguet 2022-10-18 0:27 ` Andrew Lunn 0 siblings, 1 reply; 20+ messages in thread From: Íñigo Huguet @ 2022-10-17 7:22 UTC (permalink / raw) To: Andrew Lunn Cc: irusskikh, dbogdanov, davem, edumazet, kuba, pabeni, netdev, Li Liang On Sat, Oct 15, 2022 at 5:10 PM Andrew Lunn <andrew@lunn.ch> wrote: > > > > Maybe the lock needs to be moved closer to what actually needs to be > > > protect? What is it protecting? > > > > It's protecting the operations of aq_macsec_enable and aq_macsec_work. > > The locking was closer to them, but the idea of this patch is to move > > the locking to an earlier moment so, in the case we need to abort, do > > it before changing anything. > > aq_check_txsa_expiration() seems to be one of the issues? At least, > the lock is taken before and released afterwards. So what in > aq_check_txsa_expiration() requires the lock? Basically everything in the file aq_macsec.c seems to be implicitly protected by rtnl_lock. One group of functions are all callbacks of the `struct macsec_ops aq_macsec_ops`, which are responsible for configuring macsec offload, all called under rtnl_lock. The rest of the functions in the file are called from ethtool, also protected by rtnl_lock. And part of the problem is that many of these operations are firmware and/or phy configurations which I don't have documentation about how they work. Despite this, it seems reasonable to think that they need to be lock protected. > I don't like the use of rtnl_trylock(). It suggests the basic design is > wrong, or overly complex, and so probably not working correctly. > > https://blog.ffwll.ch/2022/07/locking-engineering.html > > Please try to identify what is being protected. If it is driver > internal state, could it be replaced with a driver mutex, rather than > RTNL? Or is it network stack as a whole state, which really does > require RTNL? If so, how do other drivers deal with this problem? Is > it specific to MACSEC? Does MACSEC have a design problem? I already considered this possibility but discarded it because, as I say above, everything else is already legitimately protected by rtnl_lock. The only alternative I can think of is to add a driver only mutex (let's call it aq_macsec_mutex), as you say, and everytime that macsec offload is to be changed both rtnl_lock and aq_macsec_mutex would be taken. It's true that aq_macsec_mutex wouldn't be much contended because almost always rtnl_lock needs to be acquired first. From the workqueue and the threaded irq there wouldn't be any deadlock because they only hold aq_macsec_mutex and ndo_stop only holds rtnl_lock. I would also allow to put the locking close to what they protect. I thought that this solution would be a bit overkill, but maybe it's less overkill than the one I chose. If you're OK with this, I can prepare an v2. -- Íñigo Huguet ^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH net] atlantic: fix deadlock at aq_nic_stop 2022-10-17 7:22 ` Íñigo Huguet @ 2022-10-18 0:27 ` Andrew Lunn 2022-10-18 2:44 ` Jakub Kicinski 2022-10-18 6:11 ` Íñigo Huguet 0 siblings, 2 replies; 20+ messages in thread From: Andrew Lunn @ 2022-10-18 0:27 UTC (permalink / raw) To: Íñigo Huguet Cc: irusskikh, dbogdanov, davem, edumazet, kuba, pabeni, netdev, Li Liang > > Please try to identify what is being protected. If it is driver > > internal state, could it be replaced with a driver mutex, rather than > > RTNL? Or is it network stack as a whole state, which really does > > require RTNL? If so, how do other drivers deal with this problem? Is > > it specific to MACSEC? Does MACSEC have a design problem? > > I already considered this possibility but discarded it because, as I > say above, everything else is already legitimately protected by > rtnl_lock. Did you look at other drivers using MACSEC offload? Is this driver unique in having stuff run in a work queue which you need to cancel? In fact, it is not limited to MACSEC, it could be any work queue which holds RTNL and needs to be cancelled. Andrew ^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH net] atlantic: fix deadlock at aq_nic_stop 2022-10-18 0:27 ` Andrew Lunn @ 2022-10-18 2:44 ` Jakub Kicinski 2022-10-18 6:15 ` Íñigo Huguet 2022-10-18 6:11 ` Íñigo Huguet 1 sibling, 1 reply; 20+ messages in thread From: Jakub Kicinski @ 2022-10-18 2:44 UTC (permalink / raw) To: Andrew Lunn, Íñigo Huguet Cc: irusskikh, dbogdanov, davem, edumazet, pabeni, netdev, Li Liang On Tue, 18 Oct 2022 02:27:40 +0200 Andrew Lunn wrote: > > > Please try to identify what is being protected. If it is driver > > > internal state, could it be replaced with a driver mutex, rather than > > > RTNL? Or is it network stack as a whole state, which really does > > > require RTNL? If so, how do other drivers deal with this problem? Is > > > it specific to MACSEC? Does MACSEC have a design problem? > > > > I already considered this possibility but discarded it because, as I > > say above, everything else is already legitimately protected by > > rtnl_lock. > > Did you look at other drivers using MACSEC offload? Is this driver > unique in having stuff run in a work queue which you need to cancel? > In fact, it is not limited to MACSEC, it could be any work queue which > holds RTNL and needs to be cancelled. FWIW the work APIs return a boolean to tell you if the work was actually scheduled / canceled, and you can pair that with a reference count of the netdev to avoid the typical _sync issues. trigger() ASSERT_RTNL(); if (schedule_work(netdev_priv->bla)) netdev_hold(); work() rtnl_lock(); if (netif_running()) do_ya_thing(); netdev_put(); rtnl_unlock(); stop() ASSERT_RTNL(); if (cancel_work(bla)) netdev_put(); I think. ^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH net] atlantic: fix deadlock at aq_nic_stop 2022-10-18 2:44 ` Jakub Kicinski @ 2022-10-18 6:15 ` Íñigo Huguet 2022-10-18 15:59 ` Jakub Kicinski 0 siblings, 1 reply; 20+ messages in thread From: Íñigo Huguet @ 2022-10-18 6:15 UTC (permalink / raw) To: Jakub Kicinski Cc: Andrew Lunn, irusskikh, dbogdanov, davem, edumazet, pabeni, netdev, Li Liang On Tue, Oct 18, 2022 at 4:44 AM Jakub Kicinski <kuba@kernel.org> wrote: > FWIW the work APIs return a boolean to tell you if the work was > actually scheduled / canceled, and you can pair that with a reference > count of the netdev to avoid the typical _sync issues. > > trigger() > ASSERT_RTNL(); > if (schedule_work(netdev_priv->bla)) > netdev_hold(); > > work() > rtnl_lock(); > if (netif_running()) > do_ya_thing(); > netdev_put(); > rtnl_unlock(); > > stop() > ASSERT_RTNL(); > if (cancel_work(bla)) > netdev_put(); > > I think. > Interesting solution, I didn't even think of something like this. However, despite not being 100% sure, I think that it's not valid in this case because the work's task communicates with fw and uses resources that are deinitialized at ndo_stop. That's why I think that just holding a reference to the device is not enough. -- Íñigo Huguet ^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH net] atlantic: fix deadlock at aq_nic_stop 2022-10-18 6:15 ` Íñigo Huguet @ 2022-10-18 15:59 ` Jakub Kicinski 2022-10-19 6:18 ` Íñigo Huguet 0 siblings, 1 reply; 20+ messages in thread From: Jakub Kicinski @ 2022-10-18 15:59 UTC (permalink / raw) To: Íñigo Huguet Cc: Andrew Lunn, irusskikh, dbogdanov, davem, edumazet, pabeni, netdev, Li Liang On Tue, 18 Oct 2022 08:15:38 +0200 Íñigo Huguet wrote: > Interesting solution, I didn't even think of something like this. > However, despite not being 100% sure, I think that it's not valid in > this case because the work's task communicates with fw and uses > resources that are deinitialized at ndo_stop. That's why I think that > just holding a reference to the device is not enough. You hold a reference to the netdev just to be able to take rtnl_lock() and check if it's still running. If it is UP you're protected from it going down due to rtnl_lock you took. If it's DOWN, as you say, it's not safe to access all the bits so just unlock and return. But because you're holding the reference it's safe to cancel_work() without _sync on down, because the work itself will check if it should have been canceled. Dunno if that's a good explanation :S ^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH net] atlantic: fix deadlock at aq_nic_stop 2022-10-18 15:59 ` Jakub Kicinski @ 2022-10-19 6:18 ` Íñigo Huguet 2022-10-19 15:39 ` Jakub Kicinski 0 siblings, 1 reply; 20+ messages in thread From: Íñigo Huguet @ 2022-10-19 6:18 UTC (permalink / raw) To: Jakub Kicinski Cc: Andrew Lunn, irusskikh, dbogdanov, davem, edumazet, pabeni, netdev, Li Liang On Tue, Oct 18, 2022 at 5:59 PM Jakub Kicinski <kuba@kernel.org> wrote: > > On Tue, 18 Oct 2022 08:15:38 +0200 Íñigo Huguet wrote: > > Interesting solution, I didn't even think of something like this. > > However, despite not being 100% sure, I think that it's not valid in > > this case because the work's task communicates with fw and uses > > resources that are deinitialized at ndo_stop. That's why I think that > > just holding a reference to the device is not enough. > > You hold a reference to the netdev just to be able to take rtnl_lock() > and check if it's still running. If it is UP you're protected from it > going down due to rtnl_lock you took. If it's DOWN, as you say, it's not > safe to access all the bits so just unlock and return. > > But because you're holding the reference it's safe to cancel_work() > without _sync on down, because the work itself will check if it should > have been canceled. > > Dunno if that's a good explanation :S Yes, now I get it. However, I think I won't use this strategy this time: rtnl_lock is only needed in the work task if IS_ENABLED(CONFIG_MACSEC). Acquiring rtnl_lock every time if macsec is not enabled wouldn't be protecting anything, so it would be a waste. I think that the strategy suggested by Andrew of adding a dedicated mutex to protect atlantic's macsec operations makes more sense in this case. Do you agree? -- Íñigo Huguet ^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH net] atlantic: fix deadlock at aq_nic_stop 2022-10-19 6:18 ` Íñigo Huguet @ 2022-10-19 15:39 ` Jakub Kicinski 2022-10-20 7:46 ` Íñigo Huguet 0 siblings, 1 reply; 20+ messages in thread From: Jakub Kicinski @ 2022-10-19 15:39 UTC (permalink / raw) To: Íñigo Huguet Cc: Andrew Lunn, irusskikh, dbogdanov, davem, edumazet, pabeni, netdev, Li Liang On Wed, 19 Oct 2022 08:18:29 +0200 Íñigo Huguet wrote: > Yes, now I get it. > > However, I think I won't use this strategy this time: rtnl_lock is > only needed in the work task if IS_ENABLED(CONFIG_MACSEC). Acquiring > rtnl_lock every time if macsec is not enabled wouldn't be protecting > anything, so it would be a waste. I think that the strategy suggested > by Andrew of adding a dedicated mutex to protect atlantic's macsec > operations makes more sense in this case. Do you agree? Dunno, locks don't protect operations, they protect state (as the link Andrew sent probably explains?), so it's hard to say how easily you can inject a new lock into this driver covering relevant bits. My gut feeling is that refcounting would be less error prone. But I don't feel strongly enough to force one choice over the other. ^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH net] atlantic: fix deadlock at aq_nic_stop 2022-10-19 15:39 ` Jakub Kicinski @ 2022-10-20 7:46 ` Íñigo Huguet 0 siblings, 0 replies; 20+ messages in thread From: Íñigo Huguet @ 2022-10-20 7:46 UTC (permalink / raw) To: Jakub Kicinski Cc: Andrew Lunn, irusskikh, dbogdanov, davem, edumazet, pabeni, netdev, Li Liang On Wed, Oct 19, 2022 at 5:39 PM Jakub Kicinski <kuba@kernel.org> wrote: > Dunno, locks don't protect operations, they protect state (as the link > Andrew sent probably explains?), Yes, you're completely right. I understood that well, and Andrew's link (which is very good, thanks Andrew) explains it too. I wrongly said "operations" because in this case the lock/unlock must be done mainly inside the macsec_ops (operations), and thus my confusion. I'm about to send my patch. If you still feel that it's not the best solution, feel free to insist on the refcount or any other approach you think is better. Thanks and sorry for the noise -- Íñigo Huguet ^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH net] atlantic: fix deadlock at aq_nic_stop 2022-10-18 0:27 ` Andrew Lunn 2022-10-18 2:44 ` Jakub Kicinski @ 2022-10-18 6:11 ` Íñigo Huguet 1 sibling, 0 replies; 20+ messages in thread From: Íñigo Huguet @ 2022-10-18 6:11 UTC (permalink / raw) To: Andrew Lunn Cc: irusskikh, dbogdanov, davem, edumazet, kuba, pabeni, netdev, Li Liang On Tue, Oct 18, 2022 at 2:28 AM Andrew Lunn <andrew@lunn.ch> wrote: > Did you look at other drivers using MACSEC offload? Is this driver > unique in having stuff run in a work queue which you need to cancel? > In fact, it is not limited to MACSEC, it could be any work queue which > holds RTNL and needs to be cancelled. Yes, I did. About other drivers using MACSEC offload (which are only 2): they don't have work or anything similar related to macsec where they need to take rtnl_lock or any other lock. But in this driver the need of a lock seems justified, at least as far as I can understand. About other drivers having works that need to take rtnl_lock: they cancel it in PCI shutdown/remove functions, then call unregister_netdev, acquiring rtnl_lock. I considered doing the same, but I didn't for 2 reasons: 1. There is no need to have a periodic task running for a stopped NIC 2. The task uses some resources that are deinitialized at NIC stop and try to communicate with NIC's firmware. If it's stopped in a way different to PCI shutdown/remove (i.e. ip link set down) the task would continue to be executed and try to use these deinitialized resources and communicate with a stopped hw. Of course that point 2 possibly can be fixed to avoid doing it if NIC has been stopped, but it still remains point 1. I didn't research if other drivers really need to have the task running periodically even with the NIC stopped, but I certainly know that this one doesn't need it, looking at what the task does. I do appreciate feedback, suggestions and changes requests (actually I happily accepted to send v2 according to them, right?). But I'd rather if they contained more specific proposals and examples of what I can do to improve my patches, instead of just suggesting that I should do some research before sending them, because I already did. Best regards -- Íñigo Huguet ^ permalink raw reply [flat|nested] 20+ messages in thread
* [PATCH v2 net] atlantic: fix deadlock at aq_nic_stop 2022-10-14 10:34 [PATCH net] atlantic: fix deadlock at aq_nic_stop Íñigo Huguet 2022-10-14 12:13 ` Andrew Lunn @ 2022-10-20 7:53 ` Íñigo Huguet 2022-10-20 8:55 ` Igor Russkikh 2022-10-24 9:30 ` patchwork-bot+netdevbpf 1 sibling, 2 replies; 20+ messages in thread From: Íñigo Huguet @ 2022-10-20 7:53 UTC (permalink / raw) To: irusskikh, kuba, andrew Cc: davem, edumazet, pabeni, dbogdanov, mstarovo, netdev, Íñigo Huguet, Li Liang NIC is stopped with rtnl_lock held, and during the stop it cancels the 'service_task' work and free irqs. However, if CONFIG_MACSEC is set, rtnl_lock is acquired both from aq_nic_service_task and aq_linkstate_threaded_isr. Then a deadlock happens if aq_nic_stop tries to cancel/disable them when they've already started their execution. As the deadlock is caused by rtnl_lock, it causes many other processes to stall, not only atlantic related stuff. Fix it by introducing a mutex that protects each NIC's macsec related data, and locking it instead of the rtnl_lock from the service task and the threaded IRQ. Before this patch, all macsec data was protected with rtnl_lock, but maybe not all of it needs to be protected. With this new mutex, further efforts can be made to limit the protected data only to that which requires it. However, probably it doesn't worth it because all macsec's data accesses are infrequent, and almost all are done from macsec_ops or ethtool callbacks, called holding rtnl_lock, so macsec_mutex won't never be much contended. The issue appeared repeteadly attaching and deattaching the NIC to a bond interface. Doing that after this patch I cannot reproduce the bug. Fixes: 62c1c2e606f6 ("net: atlantic: MACSec offload skeleton") Reported-by: Li Liang <liali@redhat.com> Suggested-by: Andrew Lunn <andrew@lunn.ch> Signed-off-by: Íñigo Huguet <ihuguet@redhat.com> --- v2: per Andrew Lunn's suggestion, avoid rtnl_trylock approach and use a dedicated mutex instead. --- .../ethernet/aquantia/atlantic/aq_macsec.c | 96 ++++++++++++++----- .../net/ethernet/aquantia/atlantic/aq_nic.h | 2 + 2 files changed, 74 insertions(+), 24 deletions(-) diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_macsec.c b/drivers/net/ethernet/aquantia/atlantic/aq_macsec.c index 3d0e16791e1c..a0180811305d 100644 --- a/drivers/net/ethernet/aquantia/atlantic/aq_macsec.c +++ b/drivers/net/ethernet/aquantia/atlantic/aq_macsec.c @@ -1394,26 +1394,57 @@ static void aq_check_txsa_expiration(struct aq_nic_s *nic) egress_sa_threshold_expired); } +#define AQ_LOCKED_MDO_DEF(mdo) \ +static int aq_locked_mdo_##mdo(struct macsec_context *ctx) \ +{ \ + struct aq_nic_s *nic = netdev_priv(ctx->netdev); \ + int ret; \ + mutex_lock(&nic->macsec_mutex); \ + ret = aq_mdo_##mdo(ctx); \ + mutex_unlock(&nic->macsec_mutex); \ + return ret; \ +} + +AQ_LOCKED_MDO_DEF(dev_open) +AQ_LOCKED_MDO_DEF(dev_stop) +AQ_LOCKED_MDO_DEF(add_secy) +AQ_LOCKED_MDO_DEF(upd_secy) +AQ_LOCKED_MDO_DEF(del_secy) +AQ_LOCKED_MDO_DEF(add_rxsc) +AQ_LOCKED_MDO_DEF(upd_rxsc) +AQ_LOCKED_MDO_DEF(del_rxsc) +AQ_LOCKED_MDO_DEF(add_rxsa) +AQ_LOCKED_MDO_DEF(upd_rxsa) +AQ_LOCKED_MDO_DEF(del_rxsa) +AQ_LOCKED_MDO_DEF(add_txsa) +AQ_LOCKED_MDO_DEF(upd_txsa) +AQ_LOCKED_MDO_DEF(del_txsa) +AQ_LOCKED_MDO_DEF(get_dev_stats) +AQ_LOCKED_MDO_DEF(get_tx_sc_stats) +AQ_LOCKED_MDO_DEF(get_tx_sa_stats) +AQ_LOCKED_MDO_DEF(get_rx_sc_stats) +AQ_LOCKED_MDO_DEF(get_rx_sa_stats) + const struct macsec_ops aq_macsec_ops = { - .mdo_dev_open = aq_mdo_dev_open, - .mdo_dev_stop = aq_mdo_dev_stop, - .mdo_add_secy = aq_mdo_add_secy, - .mdo_upd_secy = aq_mdo_upd_secy, - .mdo_del_secy = aq_mdo_del_secy, - .mdo_add_rxsc = aq_mdo_add_rxsc, - .mdo_upd_rxsc = aq_mdo_upd_rxsc, - .mdo_del_rxsc = aq_mdo_del_rxsc, - .mdo_add_rxsa = aq_mdo_add_rxsa, - .mdo_upd_rxsa = aq_mdo_upd_rxsa, - .mdo_del_rxsa = aq_mdo_del_rxsa, - .mdo_add_txsa = aq_mdo_add_txsa, - .mdo_upd_txsa = aq_mdo_upd_txsa, - .mdo_del_txsa = aq_mdo_del_txsa, - .mdo_get_dev_stats = aq_mdo_get_dev_stats, - .mdo_get_tx_sc_stats = aq_mdo_get_tx_sc_stats, - .mdo_get_tx_sa_stats = aq_mdo_get_tx_sa_stats, - .mdo_get_rx_sc_stats = aq_mdo_get_rx_sc_stats, - .mdo_get_rx_sa_stats = aq_mdo_get_rx_sa_stats, + .mdo_dev_open = aq_locked_mdo_dev_open, + .mdo_dev_stop = aq_locked_mdo_dev_stop, + .mdo_add_secy = aq_locked_mdo_add_secy, + .mdo_upd_secy = aq_locked_mdo_upd_secy, + .mdo_del_secy = aq_locked_mdo_del_secy, + .mdo_add_rxsc = aq_locked_mdo_add_rxsc, + .mdo_upd_rxsc = aq_locked_mdo_upd_rxsc, + .mdo_del_rxsc = aq_locked_mdo_del_rxsc, + .mdo_add_rxsa = aq_locked_mdo_add_rxsa, + .mdo_upd_rxsa = aq_locked_mdo_upd_rxsa, + .mdo_del_rxsa = aq_locked_mdo_del_rxsa, + .mdo_add_txsa = aq_locked_mdo_add_txsa, + .mdo_upd_txsa = aq_locked_mdo_upd_txsa, + .mdo_del_txsa = aq_locked_mdo_del_txsa, + .mdo_get_dev_stats = aq_locked_mdo_get_dev_stats, + .mdo_get_tx_sc_stats = aq_locked_mdo_get_tx_sc_stats, + .mdo_get_tx_sa_stats = aq_locked_mdo_get_tx_sa_stats, + .mdo_get_rx_sc_stats = aq_locked_mdo_get_rx_sc_stats, + .mdo_get_rx_sa_stats = aq_locked_mdo_get_rx_sa_stats, }; int aq_macsec_init(struct aq_nic_s *nic) @@ -1435,6 +1466,7 @@ int aq_macsec_init(struct aq_nic_s *nic) nic->ndev->features |= NETIF_F_HW_MACSEC; nic->ndev->macsec_ops = &aq_macsec_ops; + mutex_init(&nic->macsec_mutex); return 0; } @@ -1458,7 +1490,7 @@ int aq_macsec_enable(struct aq_nic_s *nic) if (!nic->macsec_cfg) return 0; - rtnl_lock(); + mutex_lock(&nic->macsec_mutex); if (nic->aq_fw_ops->send_macsec_req) { struct macsec_cfg_request cfg = { 0 }; @@ -1507,7 +1539,7 @@ int aq_macsec_enable(struct aq_nic_s *nic) ret = aq_apply_macsec_cfg(nic); unlock: - rtnl_unlock(); + mutex_unlock(&nic->macsec_mutex); return ret; } @@ -1519,9 +1551,9 @@ void aq_macsec_work(struct aq_nic_s *nic) if (!netif_carrier_ok(nic->ndev)) return; - rtnl_lock(); + mutex_lock(&nic->macsec_mutex); aq_check_txsa_expiration(nic); - rtnl_unlock(); + mutex_unlock(&nic->macsec_mutex); } int aq_macsec_rx_sa_cnt(struct aq_nic_s *nic) @@ -1532,21 +1564,30 @@ int aq_macsec_rx_sa_cnt(struct aq_nic_s *nic) if (!cfg) return 0; + mutex_lock(&nic->macsec_mutex); + for (i = 0; i < AQ_MACSEC_MAX_SC; i++) { if (!test_bit(i, &cfg->rxsc_idx_busy)) continue; cnt += hweight_long(cfg->aq_rxsc[i].rx_sa_idx_busy); } + mutex_unlock(&nic->macsec_mutex); return cnt; } int aq_macsec_tx_sc_cnt(struct aq_nic_s *nic) { + int cnt; + if (!nic->macsec_cfg) return 0; - return hweight_long(nic->macsec_cfg->txsc_idx_busy); + mutex_lock(&nic->macsec_mutex); + cnt = hweight_long(nic->macsec_cfg->txsc_idx_busy); + mutex_unlock(&nic->macsec_mutex); + + return cnt; } int aq_macsec_tx_sa_cnt(struct aq_nic_s *nic) @@ -1557,12 +1598,15 @@ int aq_macsec_tx_sa_cnt(struct aq_nic_s *nic) if (!cfg) return 0; + mutex_lock(&nic->macsec_mutex); + for (i = 0; i < AQ_MACSEC_MAX_SC; i++) { if (!test_bit(i, &cfg->txsc_idx_busy)) continue; cnt += hweight_long(cfg->aq_txsc[i].tx_sa_idx_busy); } + mutex_unlock(&nic->macsec_mutex); return cnt; } @@ -1634,6 +1678,8 @@ u64 *aq_macsec_get_stats(struct aq_nic_s *nic, u64 *data) if (!cfg) return data; + mutex_lock(&nic->macsec_mutex); + aq_macsec_update_stats(nic); common_stats = &cfg->stats; @@ -1716,5 +1762,7 @@ u64 *aq_macsec_get_stats(struct aq_nic_s *nic, u64 *data) data += i; + mutex_unlock(&nic->macsec_mutex); + return data; } diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_nic.h b/drivers/net/ethernet/aquantia/atlantic/aq_nic.h index 935ba889bd9a..ad33f8586532 100644 --- a/drivers/net/ethernet/aquantia/atlantic/aq_nic.h +++ b/drivers/net/ethernet/aquantia/atlantic/aq_nic.h @@ -157,6 +157,8 @@ struct aq_nic_s { struct mutex fwreq_mutex; #if IS_ENABLED(CONFIG_MACSEC) struct aq_macsec_cfg *macsec_cfg; + /* mutex to protect data in macsec_cfg */ + struct mutex macsec_mutex; #endif /* PTP support */ struct aq_ptp_s *aq_ptp; -- 2.34.1 ^ permalink raw reply related [flat|nested] 20+ messages in thread
* Re: [PATCH v2 net] atlantic: fix deadlock at aq_nic_stop 2022-10-20 7:53 ` [PATCH v2 " Íñigo Huguet @ 2022-10-20 8:55 ` Igor Russkikh 2022-10-20 16:17 ` Íñigo Huguet 2022-10-21 19:01 ` Íñigo Huguet 2022-10-24 9:30 ` patchwork-bot+netdevbpf 1 sibling, 2 replies; 20+ messages in thread From: Igor Russkikh @ 2022-10-20 8:55 UTC (permalink / raw) To: Íñigo Huguet, kuba, andrew Cc: davem, edumazet, pabeni, mstarovo, netdev, Li Liang On 10/20/2022 9:53 AM, Íñigo Huguet wrote: > NIC is stopped with rtnl_lock held, and during the stop it cancels the > 'service_task' work and free irqs. Hi Íñigo, thanks for taking care of this. Just reviewed, overall looks reasonable for me. Unfortunately I don't recall now why RTNL lock was used originally, most probably we've tried to secure parallel "ip macsec configure something" commands execution. But the model with internal mutex looks safe for me. Unfortunately I now have no ability to verify your patch, edge usecase here would be to try stress running in parallel: "ethtool -S <iface>" "ip macsec show" "ip macsec <change something>" Plus ideal would be link flipping. Have you tried something like that? Reviewed-by: Igor Russkikh <irusskikh@marvell.com> Regards, Igor ^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH v2 net] atlantic: fix deadlock at aq_nic_stop 2022-10-20 8:55 ` Igor Russkikh @ 2022-10-20 16:17 ` Íñigo Huguet 2022-10-21 19:01 ` Íñigo Huguet 1 sibling, 0 replies; 20+ messages in thread From: Íñigo Huguet @ 2022-10-20 16:17 UTC (permalink / raw) To: Igor Russkikh Cc: kuba, andrew, davem, edumazet, pabeni, mstarovo, netdev, Li Liang On Thu, Oct 20, 2022 at 10:56 AM Igor Russkikh <irusskikh@marvell.com> wrote: > > > > On 10/20/2022 9:53 AM, Íñigo Huguet wrote: > > NIC is stopped with rtnl_lock held, and during the stop it cancels the > > 'service_task' work and free irqs. > > Hi Íñigo, thanks for taking care of this. > > Just reviewed, overall looks reasonable for me. Unfortunately I don't recall > now why RTNL lock was used originally, most probably we've tried to secure > parallel "ip macsec configure something" commands execution. > > But the model with internal mutex looks safe for me. > > Unfortunately I now have no ability to verify your patch, edge usecase here > would be to try stress running in parallel: > "ethtool -S <iface>" > "ip macsec show" > "ip macsec <change something>" > Plus ideal would be link flipping. > > Have you tried something like that? I will try to run that tomorrow, thanks! > > Reviewed-by: Igor Russkikh <irusskikh@marvell.com> > > Regards, > Igor > -- Íñigo Huguet ^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH v2 net] atlantic: fix deadlock at aq_nic_stop 2022-10-20 8:55 ` Igor Russkikh 2022-10-20 16:17 ` Íñigo Huguet @ 2022-10-21 19:01 ` Íñigo Huguet 1 sibling, 0 replies; 20+ messages in thread From: Íñigo Huguet @ 2022-10-21 19:01 UTC (permalink / raw) To: Igor Russkikh Cc: kuba, andrew, davem, edumazet, pabeni, mstarovo, netdev, Li Liang On Thu, Oct 20, 2022 at 10:56 AM Igor Russkikh <irusskikh@marvell.com> wrote: > > > > On 10/20/2022 9:53 AM, Íñigo Huguet wrote: > > NIC is stopped with rtnl_lock held, and during the stop it cancels the > > 'service_task' work and free irqs. > > Hi Íñigo, thanks for taking care of this. > > Just reviewed, overall looks reasonable for me. Unfortunately I don't recall > now why RTNL lock was used originally, most probably we've tried to secure > parallel "ip macsec configure something" commands execution. > > But the model with internal mutex looks safe for me. > > Unfortunately I now have no ability to verify your patch, edge usecase here > would be to try stress running in parallel: > "ethtool -S <iface>" > "ip macsec show" > "ip macsec <change something>" > Plus ideal would be link flipping. I've been running all this in parallel for some hours without issue. > > Have you tried something like that? > > Reviewed-by: Igor Russkikh <irusskikh@marvell.com> > > Regards, > Igor > -- Íñigo Huguet ^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH v2 net] atlantic: fix deadlock at aq_nic_stop 2022-10-20 7:53 ` [PATCH v2 " Íñigo Huguet 2022-10-20 8:55 ` Igor Russkikh @ 2022-10-24 9:30 ` patchwork-bot+netdevbpf 1 sibling, 0 replies; 20+ messages in thread From: patchwork-bot+netdevbpf @ 2022-10-24 9:30 UTC (permalink / raw) To: =?utf-8?b?w43DsWlnbyBIdWd1ZXQgPGlodWd1ZXRAcmVkaGF0LmNvbT4=?= Cc: irusskikh, kuba, andrew, davem, edumazet, pabeni, dbogdanov, mstarovo, netdev, liali Hello: This patch was applied to netdev/net.git (master) by David S. Miller <davem@davemloft.net>: On Thu, 20 Oct 2022 09:53:10 +0200 you wrote: > NIC is stopped with rtnl_lock held, and during the stop it cancels the > 'service_task' work and free irqs. > > However, if CONFIG_MACSEC is set, rtnl_lock is acquired both from > aq_nic_service_task and aq_linkstate_threaded_isr. Then a deadlock > happens if aq_nic_stop tries to cancel/disable them when they've already > started their execution. > > [...] Here is the summary with links: - [v2,net] atlantic: fix deadlock at aq_nic_stop https://git.kernel.org/netdev/net/c/6960d133f66e You are awesome, thank you! -- Deet-doot-dot, I am a bot. https://korg.docs.kernel.org/patchwork/pwbot.html ^ permalink raw reply [flat|nested] 20+ messages in thread
end of thread, other threads:[~2022-10-24 9:30 UTC | newest] Thread overview: 20+ messages (download: mbox.gz follow: Atom feed -- links below jump to the message on this page -- 2022-10-14 10:34 [PATCH net] atlantic: fix deadlock at aq_nic_stop Íñigo Huguet 2022-10-14 12:13 ` Andrew Lunn 2022-10-14 12:43 ` Íñigo Huguet 2022-10-14 13:35 ` Andrew Lunn 2022-10-14 13:44 ` Íñigo Huguet 2022-10-15 15:09 ` Andrew Lunn 2022-10-17 7:22 ` Íñigo Huguet 2022-10-18 0:27 ` Andrew Lunn 2022-10-18 2:44 ` Jakub Kicinski 2022-10-18 6:15 ` Íñigo Huguet 2022-10-18 15:59 ` Jakub Kicinski 2022-10-19 6:18 ` Íñigo Huguet 2022-10-19 15:39 ` Jakub Kicinski 2022-10-20 7:46 ` Íñigo Huguet 2022-10-18 6:11 ` Íñigo Huguet 2022-10-20 7:53 ` [PATCH v2 " Íñigo Huguet 2022-10-20 8:55 ` Igor Russkikh 2022-10-20 16:17 ` Íñigo Huguet 2022-10-21 19:01 ` Íñigo Huguet 2022-10-24 9:30 ` patchwork-bot+netdevbpf
This is a public inbox, see mirroring instructions for how to clone and mirror all data and code used for this inbox; as well as URLs for NNTP newsgroup(s).