All of lore.kernel.org
 help / color / mirror / Atom feed
From: "Russell King (Oracle)" <linux@armlinux.org.uk>
To: Jijie Shao <shaojijie@huawei.com>
Cc: f.fainelli@gmail.com, Andrew Lunn <andrew@lunn.ch>,
	davem@davemloft.net, edumazet@google.com, hkallweit1@gmail.com,
	kuba@kernel.org, netdev@vger.kernel.org, pabeni@redhat.com,
	"shenjian15@huawei.com" <shenjian15@huawei.com>,
	"liuyonglong@huawei.com" <liuyonglong@huawei.com>,
	wangjie125@huawei.com, chenhao418@huawei.com,
	Hao Lan <lanhao@huawei.com>,
	"wangpeiyang1@huawei.com" <wangpeiyang1@huawei.com>
Subject: Re: [PATCH net-next] net: phy: avoid kernel warning dump when stopping an errored PHY
Date: Mon, 4 Sep 2023 15:42:50 +0100	[thread overview]
Message-ID: <ZPXs6i2S8GSCpVOV@shell.armlinux.org.uk> (raw)
In-Reply-To: <8e7e02d8-2b2a-8619-e607-fbac50706252@huawei.com>

On Mon, Sep 04, 2023 at 05:50:32PM +0800, Jijie Shao wrote:
> Hi all,
> We encountered an issue when resetting our netdevice recently, it seems
> related to this patch.
> 
> During our process, we stop phy first and call phy_start() later.
> phy_check_link_status returns error because it read mdio failed. The
> reason why it happened is that the cmdq is unusable when we reset and we
> can't access to mdio.

Are you suggesting that the sequence is:

phy_stop();
reset netdev
phy_start();

?

Is the reason for doing this because you've already detected an issue
with the hardware, and you're trying to recover it - and before you've
called phy_stop() the hardware is already dead?

If that is the case, I'm not really sure what you expect to happen
here. You've identified a race where the state machine is running in
unison with phy_stop(), but in this circumstance it is also possible
that the state machine could complete executing and have called
phy_error_precise() before phy_stop() has even been called. In that
case, you'll still get a warning-splat on the console from
phy_error_precise().

The only difference is that phy_stop() won't warn.

That all said, this is obviously buggy, because phy_stop() has set
the phydev state to PHY_HALTED and the state machine has unexpectedly
changed its state.

I wonder whether we should be tracking the phy_start/stop state
separately, since we've had issues with phy_stop() warning when an
error has occurred (see commit 59088b5a946e).

Maybe something like this (untested)?

diff --git a/drivers/net/phy/phy.c b/drivers/net/phy/phy.c
index df54c137c5f5..d57f6de8a562 100644
--- a/drivers/net/phy/phy.c
+++ b/drivers/net/phy/phy.c
@@ -810,7 +810,8 @@ int phy_start_cable_test(struct phy_device *phydev,
 		goto out;
 	}
 
-	if (phydev->state < PHY_UP ||
+	if (phydev->oper_state != PHY_OPER_STARTED ||
+	    phydev->state < PHY_UP ||
 	    phydev->state > PHY_CABLETEST) {
 		NL_SET_ERR_MSG(extack,
 			       "PHY not configured. Try setting interface up");
@@ -881,7 +882,8 @@ int phy_start_cable_test_tdr(struct phy_device *phydev,
 		goto out;
 	}
 
-	if (phydev->state < PHY_UP ||
+	if (phydev->oper_state != PHY_OPER_STARTED ||
+	    phydev->state < PHY_UP ||
 	    phydev->state > PHY_CABLETEST) {
 		NL_SET_ERR_MSG(extack,
 			       "PHY not configured. Try setting interface up");
@@ -1364,10 +1366,8 @@ void phy_stop(struct phy_device *phydev)
 	struct net_device *dev = phydev->attached_dev;
 	enum phy_state old_state;
 
-	if (!phy_is_started(phydev) && phydev->state != PHY_DOWN &&
-	    phydev->state != PHY_ERROR) {
-		WARN(1, "called from state %s\n",
-		     phy_state_to_str(phydev->state));
+	if (phydev->oper_state != PHY_OPER_STARTED) {
+		WARN(1, "called when not started\n");
 		return;
 	}
 
@@ -1382,6 +1382,7 @@ void phy_stop(struct phy_device *phydev)
 	if (phydev->sfp_bus)
 		sfp_upstream_stop(phydev->sfp_bus);
 
+	phydev->oper_state = PHY_OPER_STOPPED;
 	phydev->state = PHY_HALTED;
 	phy_process_state_change(phydev, old_state);
 
@@ -1411,9 +1412,8 @@ void phy_start(struct phy_device *phydev)
 {
 	mutex_lock(&phydev->lock);
 
-	if (phydev->state != PHY_READY && phydev->state != PHY_HALTED) {
-		WARN(1, "called from state %s\n",
-		     phy_state_to_str(phydev->state));
+	if (phydev->oper_state != PHY_OPER_STOPPED) {
+		WARN(1, "called when not stopped\n");
 		goto out;
 	}
 
@@ -1423,6 +1423,7 @@ void phy_start(struct phy_device *phydev)
 	/* if phy was suspended, bring the physical link up again */
 	__phy_resume(phydev);
 
+	phydev->oper_state = PHY_OPER_STARTED;
 	phydev->state = PHY_UP;
 
 	phy_start_machine(phydev);
@@ -1442,14 +1443,18 @@ void phy_state_machine(struct work_struct *work)
 			container_of(dwork, struct phy_device, state_queue);
 	struct net_device *dev = phydev->attached_dev;
 	bool needs_aneg = false, do_suspend = false;
-	enum phy_state old_state;
+	enum phy_state old_state, state;
 	const void *func = NULL;
 	bool finished = false;
 	int err = 0;
 
 	mutex_lock(&phydev->lock);
 
-	old_state = phydev->state;
+	state = old_state = phydev->state;
+
+	/* If the PHY is stopped, then force state to halted. */
+	if (phydev->oper_state == PHY_OPER_STOPPED)
+		state = PHY_HALTED;
 
 	switch (phydev->state) {
 	case PHY_DOWN:
diff --git a/include/linux/phy.h b/include/linux/phy.h
index 5dcab361a220..b128d903adb3 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -519,6 +519,11 @@ enum phy_state {
 	PHY_CABLETEST,
 };
 
+enum phy_oper_state {
+	PHY_OPER_STOPPED,
+	PHY_OPER_STARTED,
+};
+
 #define MDIO_MMD_NUM 32
 
 /**
@@ -670,6 +675,7 @@ struct phy_device {
 	int rate_matching;
 
 	enum phy_state state;
+	enum phy_oper_state oper_state;
 
 	u32 dev_flags;
 
@@ -1221,7 +1227,8 @@ int phy_speed_down_core(struct phy_device *phydev);
  */
 static inline bool phy_is_started(struct phy_device *phydev)
 {
-	return phydev->state >= PHY_UP;
+	return phydev->oper_state == PHY_OPER_STARTED &&
+	       phydev->state >= PHY_UP;
 }
 
 void phy_resolve_aneg_pause(struct phy_device *phydev);
-- 
RMK's Patch system: https://www.armlinux.org.uk/developer/patches/
FTTP is here! 80Mbps down 10Mbps up. Decent connectivity at last!

  parent reply	other threads:[~2023-09-04 14:43 UTC|newest]

Thread overview: 14+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2023-05-22 15:58 [PATCH net-next] net: phy: avoid kernel warning dump when stopping an errored PHY Russell King (Oracle)
2023-05-22 16:06 ` Russell King (Oracle)
2023-05-22 19:03 ` Florian Fainelli
2023-09-04  9:50   ` Jijie Shao
2023-09-04 13:43     ` Andrew Lunn
2023-09-05  8:49       ` Jijie Shao
2023-09-05 12:09         ` Andrew Lunn
2023-09-05 14:00           ` Russell King (Oracle)
2023-09-05 13:48         ` Russell King (Oracle)
2023-09-05 15:24           ` Russell King (Oracle)
2023-09-06 12:59             ` Andrew Lunn
2023-09-04 14:42     ` Russell King (Oracle) [this message]
2023-09-05  8:59       ` Jijie Shao
2023-05-24  7:30 ` patchwork-bot+netdevbpf

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=ZPXs6i2S8GSCpVOV@shell.armlinux.org.uk \
    --to=linux@armlinux.org.uk \
    --cc=andrew@lunn.ch \
    --cc=chenhao418@huawei.com \
    --cc=davem@davemloft.net \
    --cc=edumazet@google.com \
    --cc=f.fainelli@gmail.com \
    --cc=hkallweit1@gmail.com \
    --cc=kuba@kernel.org \
    --cc=lanhao@huawei.com \
    --cc=liuyonglong@huawei.com \
    --cc=netdev@vger.kernel.org \
    --cc=pabeni@redhat.com \
    --cc=shaojijie@huawei.com \
    --cc=shenjian15@huawei.com \
    --cc=wangjie125@huawei.com \
    --cc=wangpeiyang1@huawei.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.