public inbox for iwd@lists.linux.dev
 help / color / mirror / Atom feed
* [PATCH 1/8] station: handle netconfig after roaming for FW roams
@ 2024-01-03 18:46 James Prestwood
  2024-01-03 18:46 ` [PATCH 2/8] station: add additional internal state, STATION_STATE_NETCONFIG James Prestwood
                   ` (7 more replies)
  0 siblings, 8 replies; 18+ messages in thread
From: James Prestwood @ 2024-01-03 18:46 UTC (permalink / raw)
  To: iwd; +Cc: James Prestwood

This was not taken into account for FW roams and would result in the
station state being set to connected regardless of netconfig's result.
---
 src/station.c | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/src/station.c b/src/station.c
index 73de26bb..5c9ede8b 100644
--- a/src/station.c
+++ b/src/station.c
@@ -3406,6 +3406,16 @@ static void station_beacon_lost(struct station *station)
 	station_roam_timeout_rearm(station, LOSS_ROAM_RATE_LIMIT);
 }
 
+static void station_event_roaming(struct station *station)
+{
+	if (station->netconfig && station->state != STATION_STATE_CONNECTED) {
+		netconfig_reset(station->netconfig);
+		station->netconfig_after_roam = true;
+	}
+
+	station_enter_state(station, STATION_STATE_FW_ROAMING);
+}
+
 static void station_netdev_event(struct netdev *netdev, enum netdev_event event,
 					void *event_data, void *user_data)
 {
@@ -3433,7 +3443,7 @@ static void station_netdev_event(struct netdev *netdev, enum netdev_event event,
 			station_signal_agent_notify(station);
 		break;
 	case NETDEV_EVENT_ROAMING:
-		station_enter_state(station, STATION_STATE_FW_ROAMING);
+		station_event_roaming(station);
 		break;
 	case NETDEV_EVENT_ROAMED:
 		station_event_roamed(station, (struct scan_bss *) event_data);
-- 
2.34.1


^ permalink raw reply related	[flat|nested] 18+ messages in thread

* [PATCH 2/8] station: add additional internal state, STATION_STATE_NETCONFIG
  2024-01-03 18:46 [PATCH 1/8] station: handle netconfig after roaming for FW roams James Prestwood
@ 2024-01-03 18:46 ` James Prestwood
  2024-01-03 18:46 ` [PATCH 3/8] station: add handling for new NETCONFIG state James Prestwood
                   ` (6 subsequent siblings)
  7 siblings, 0 replies; 18+ messages in thread
From: James Prestwood @ 2024-01-03 18:46 UTC (permalink / raw)
  To: iwd; +Cc: James Prestwood

This is still treated as "connecting" from a DBus perspective but
will allow for better handling internally for some roaming corner
cases.
---
 src/dpp.c     | 1 +
 src/station.c | 5 +++++
 src/station.h | 1 +
 src/wsc.c     | 1 +
 4 files changed, 8 insertions(+)

diff --git a/src/dpp.c b/src/dpp.c
index af6574fb..74b4c2bc 100644
--- a/src/dpp.c
+++ b/src/dpp.c
@@ -3761,6 +3761,7 @@ static void dpp_station_state_watch(enum station_state state, void *user_data)
 	case STATION_STATE_CONNECTING:
 	case STATION_STATE_CONNECTED:
 	case STATION_STATE_CONNECTING_AUTO:
+	case STATION_STATE_NETCONFIG:
 		if (L_WARN_ON(dpp->role == DPP_CAPABILITY_CONFIGURATOR))
 			dpp_reset(dpp);
 
diff --git a/src/station.c b/src/station.c
index 5c9ede8b..57d22e91 100644
--- a/src/station.c
+++ b/src/station.c
@@ -1497,6 +1497,8 @@ static const char *station_state_to_string(enum station_state state)
 		return "ft-roaming";
 	case STATION_STATE_FW_ROAMING:
 		return "fw-roaming";
+	case STATION_STATE_NETCONFIG:
+		return "connecting (netconfig)";
 	}
 
 	return "invalid";
@@ -1633,6 +1635,7 @@ static void station_enter_state(struct station *station,
 		station_set_drop_unicast_l2_multicast(station, false);
 		break;
 	case STATION_STATE_DISCONNECTING:
+	case STATION_STATE_NETCONFIG:
 		break;
 	case STATION_STATE_ROAMING:
 	case STATION_STATE_FT_ROAMING:
@@ -3344,6 +3347,7 @@ static void station_disconnect_event(struct station *station, void *event_data)
 	case STATION_STATE_CONNECTED:
 	case STATION_STATE_FT_ROAMING:
 	case STATION_STATE_FW_ROAMING:
+	case STATION_STATE_NETCONFIG:
 		station_disassociated(station);
 		return;
 	default:
@@ -4274,6 +4278,7 @@ static bool station_property_get_state(struct l_dbus *dbus,
 		break;
 	case STATION_STATE_CONNECTING:
 	case STATION_STATE_CONNECTING_AUTO:
+	case STATION_STATE_NETCONFIG:
 		statestr = "connecting";
 		break;
 	case STATION_STATE_CONNECTED:
diff --git a/src/station.h b/src/station.h
index 0d502a08..a38327e4 100644
--- a/src/station.h
+++ b/src/station.h
@@ -45,6 +45,7 @@ enum station_state {
 	STATION_STATE_ROAMING,		/* Reassociation */
 	STATION_STATE_FT_ROAMING,	/* Fast transition */
 	STATION_STATE_FW_ROAMING,	/* Firmware roamed by itself */
+	STATION_STATE_NETCONFIG,
 };
 
 enum station_event {
diff --git a/src/wsc.c b/src/wsc.c
index cda877cf..f88f5deb 100644
--- a/src/wsc.c
+++ b/src/wsc.c
@@ -654,6 +654,7 @@ static void wsc_check_can_connect(struct wsc_station_dbus *wsc,
 	case STATION_STATE_CONNECTING:
 	case STATION_STATE_CONNECTING_AUTO:
 	case STATION_STATE_CONNECTED:
+	case STATION_STATE_NETCONFIG:
 		if (station_disconnect(wsc->station) < 0)
 			goto error;
 
-- 
2.34.1


^ permalink raw reply related	[flat|nested] 18+ messages in thread

* [PATCH 3/8] station: add handling for new NETCONFIG state
  2024-01-03 18:46 [PATCH 1/8] station: handle netconfig after roaming for FW roams James Prestwood
  2024-01-03 18:46 ` [PATCH 2/8] station: add additional internal state, STATION_STATE_NETCONFIG James Prestwood
@ 2024-01-03 18:46 ` James Prestwood
  2024-01-04 18:14   ` Denis Kenzior
  2024-01-03 18:46 ` [PATCH 4/8] station: add debug events for internal states James Prestwood
                   ` (5 subsequent siblings)
  7 siblings, 1 reply; 18+ messages in thread
From: James Prestwood @ 2024-01-03 18:46 UTC (permalink / raw)
  To: iwd; +Cc: James Prestwood

There was an unhandled corner case if netconfig was running and
multiple roam conditions happened in sequence, all before netconfig
had completed. A single roam before netconfig was already handled
(23f0f5717c) but this did not take into account any additional roam
conditions.

If IWD is in this state, having started netconfig, then roamed, and
again restarted netconfig it is still in a roaming state which will
prevent any further roams. IWD will remain "stuck" on the current
BSS until netconfig completes or gets disconnected.

To fix this a new internal station state was added (no changes to
the DBus API) to distinguish between a purely WiFi connecting state
(STATION_STATE_CONNECTING/AUTO) and netconfig
(STATION_STATE_NETCONFIG). This allows IWD roam as needed if
netconfig is still running.

The change is mainly just adding STATION_STATE_NETCONFIG anywhere
that STATION_STATE_CONNECTING is to maintain the same behavior,
except within the netconfig event handler. In this case we should
never get here without being in either a NETCONFIG or ROAMING
state.

For some background this scenario happens if the DHCP server goes
down for an extended period, e.g. if its being upgraded/serviced.
---
 src/station.c | 23 +++++++++++++++++------
 1 file changed, 17 insertions(+), 6 deletions(-)

diff --git a/src/station.c b/src/station.c
index 57d22e91..8f310ec8 100644
--- a/src/station.c
+++ b/src/station.c
@@ -1768,6 +1768,7 @@ static void station_reset_connection_state(struct station *station)
 	if (station->state == STATION_STATE_CONNECTED ||
 			station->state == STATION_STATE_CONNECTING ||
 			station->state == STATION_STATE_CONNECTING_AUTO ||
+			station->state == STATION_STATE_NETCONFIG ||
 			station_is_roaming(station))
 		network_disconnected(network);
 }
@@ -2043,8 +2044,9 @@ static void station_netconfig_event_handler(enum netconfig_event event,
 			dbus_pending_reply(&station->connect_pending, reply);
 		}
 
-		if (L_IN_SET(station->state, STATION_STATE_CONNECTING,
-				STATION_STATE_CONNECTING_AUTO))
+		if (L_IN_SET(station->state, STATION_STATE_NETCONFIG,
+				STATION_STATE_ROAMING, STATION_STATE_FT_ROAMING,
+				STATION_STATE_FW_ROAMING))
 			network_connect_failed(station->connected_network,
 						false);
 
@@ -2070,9 +2072,14 @@ static bool netconfig_after_roam(struct station *station)
 					network_get_settings(network)))
 		return false;
 
-	return netconfig_configure(station->netconfig,
+	if (L_WARN_ON(!netconfig_configure(station->netconfig,
 					station_netconfig_event_handler,
-					station);
+					station)))
+		return false;
+
+	station_enter_state(station, STATION_STATE_NETCONFIG);
+
+	return true;
 }
 
 static void station_roamed(struct station *station)
@@ -3255,6 +3262,8 @@ static void station_connect_ok(struct station *station)
 						station_netconfig_event_handler,
 						station)))
 			return;
+
+		station_enter_state(station, STATION_STATE_NETCONFIG);
 	} else
 		station_enter_state(station, STATION_STATE_CONNECTED);
 }
@@ -4067,7 +4076,8 @@ static struct l_dbus_message *station_dbus_scan(struct l_dbus *dbus,
 		return dbus_error_busy(message);
 
 	if (station->state == STATION_STATE_CONNECTING ||
-			station->state == STATION_STATE_CONNECTING_AUTO)
+			station->state == STATION_STATE_CONNECTING_AUTO ||
+			station->state == STATION_STATE_NETCONFIG)
 		return dbus_error_busy(message);
 
 	station->dbus_scan_subset_idx = 0;
@@ -5025,7 +5035,8 @@ static struct l_dbus_message *station_debug_scan(struct l_dbus *dbus,
 		return dbus_error_busy(message);
 
 	if (station->state == STATION_STATE_CONNECTING ||
-			station->state == STATION_STATE_CONNECTING_AUTO)
+			station->state == STATION_STATE_CONNECTING_AUTO ||
+			station->state == STATION_STATE_NETCONFIG)
 		return dbus_error_busy(message);
 
 	if (!l_dbus_message_get_arguments(message, "aq", &iter))
-- 
2.34.1


^ permalink raw reply related	[flat|nested] 18+ messages in thread

* [PATCH 4/8] station: add debug events for internal states
  2024-01-03 18:46 [PATCH 1/8] station: handle netconfig after roaming for FW roams James Prestwood
  2024-01-03 18:46 ` [PATCH 2/8] station: add additional internal state, STATION_STATE_NETCONFIG James Prestwood
  2024-01-03 18:46 ` [PATCH 3/8] station: add handling for new NETCONFIG state James Prestwood
@ 2024-01-03 18:46 ` James Prestwood
  2024-01-04 17:57   ` Denis Kenzior
  2024-01-03 18:46 ` [PATCH 5/8] auto-t: update roam test to use new debug events James Prestwood
                   ` (4 subsequent siblings)
  7 siblings, 1 reply; 18+ messages in thread
From: James Prestwood @ 2024-01-03 18:46 UTC (permalink / raw)
  To: iwd; +Cc: James Prestwood

This gives the tests a lot more fine-tune control to wait for
specific state transitions rather than only what is exposed over
DBus.

The additional events for "ft-roam" and "reassoc-roam" were removed
since these are now covered by the more generic state change events
("ft-roaming" and "roaming" respectively).
---
 src/station.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/src/station.c b/src/station.c
index 8f310ec8..79f25d73 100644
--- a/src/station.c
+++ b/src/station.c
@@ -1556,6 +1556,8 @@ static void station_enter_state(struct station *station,
 			station_state_to_string(station->state),
 			station_state_to_string(state));
 
+	station_debug_event(station, station_state_to_string(state));
+
 	disconnected = !station_is_busy(station);
 
 	if ((disconnected && state > STATION_STATE_AUTOCONNECT_FULL) ||
@@ -2222,8 +2224,6 @@ static int station_transition_reassociate(struct station *station,
 	station->preparing_roam = false;
 	station_enter_state(station, STATION_STATE_ROAMING);
 
-	station_debug_event(station, "reassoc-roam");
-
 	return 0;
 }
 
@@ -2366,8 +2366,6 @@ try_next:
 		station->preparing_roam = false;
 		station_enter_state(station, STATION_STATE_FT_ROAMING);
 
-		station_debug_event(station, "ft-roam");
-
 		break;
 	case -EINVAL:
 		/*
-- 
2.34.1


^ permalink raw reply related	[flat|nested] 18+ messages in thread

* [PATCH 5/8] auto-t: update roam test to use new debug events
  2024-01-03 18:46 [PATCH 1/8] station: handle netconfig after roaming for FW roams James Prestwood
                   ` (2 preceding siblings ...)
  2024-01-03 18:46 ` [PATCH 4/8] station: add debug events for internal states James Prestwood
@ 2024-01-03 18:46 ` James Prestwood
  2024-01-04 17:58   ` Denis Kenzior
  2024-01-03 18:46 ` [PATCH 6/8] auto-t: add test for roaming + netconfig James Prestwood
                   ` (3 subsequent siblings)
  7 siblings, 1 reply; 18+ messages in thread
From: James Prestwood @ 2024-01-03 18:46 UTC (permalink / raw)
  To: iwd; +Cc: James Prestwood

---
 autotests/testPSK-roam/failed_roam_test.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/autotests/testPSK-roam/failed_roam_test.py b/autotests/testPSK-roam/failed_roam_test.py
index d42002d4..ccdec2c6 100644
--- a/autotests/testPSK-roam/failed_roam_test.py
+++ b/autotests/testPSK-roam/failed_roam_test.py
@@ -60,7 +60,7 @@ class Test(unittest.TestCase):
         self.rule0.enabled = False
 
         # IWD should then try BSS 2, and succeed
-        device.wait_for_event('ft-roam', timeout=60)
+        device.wait_for_event('ft-roaming', timeout=60)
         self.verify_roam(self.wd, device, self.bss_hostapd[0], self.bss_hostapd[2])
 
         self.bss_hostapd[2].deauthenticate(device.address)
@@ -75,7 +75,7 @@ class Test(unittest.TestCase):
 
         self.connect(self.wd, device, self.bss_hostapd[0])
 
-        device.wait_for_event('ft-roam', timeout=60)
+        device.wait_for_event('ft-roaming', timeout=60)
 
         condition = 'obj.state == DeviceState.disconnected'
         self.wd.wait_for_object_condition(device, condition)
@@ -101,14 +101,14 @@ class Test(unittest.TestCase):
         # IWD should connect, then attempt a roam to BSS 1, which should
         # fail and cause a fallback to reassociation
         device.wait_for_event('ft-fallback-to-reassoc', timeout=60)
-        device.wait_for_event('reassoc-roam', timeout=60)
+        device.wait_for_event('roaming', timeout=60)
 
         self.verify_roam(self.wd, device, self.bss_hostapd[0], self.bss_hostapd[2])
 
         # Trigger another roam
         self.rule_bss2.signal = -8000
 
-        device.wait_for_event('ft-roam', timeout=60)
+        device.wait_for_event('ft-roaming', timeout=60)
 
         # Ensure an FT roam back to a properly configured AP works.
         self.verify_roam(self.wd, device, self.bss_hostapd[2], self.bss_hostapd[1])
@@ -141,7 +141,7 @@ class Test(unittest.TestCase):
         # fail and cause the rank to be re-computed. This should then put
         # bss 1 as the next candidate (since the FT factor is removed)
         device.wait_for_event('ft-fallback-to-reassoc', timeout=60)
-        device.wait_for_event('ft-roam', timeout=60)
+        device.wait_for_event('ft-roaming', timeout=60)
 
         self.verify_roam(self.wd, device, self.bss_hostapd[0], self.bss_hostapd[1])
 
-- 
2.34.1


^ permalink raw reply related	[flat|nested] 18+ messages in thread

* [PATCH 6/8] auto-t: add test for roaming + netconfig
  2024-01-03 18:46 [PATCH 1/8] station: handle netconfig after roaming for FW roams James Prestwood
                   ` (3 preceding siblings ...)
  2024-01-03 18:46 ` [PATCH 5/8] auto-t: update roam test to use new debug events James Prestwood
@ 2024-01-03 18:46 ` James Prestwood
  2024-01-03 18:46 ` [PATCH 7/8] auto-t: improve failure handling in testPSK-roam James Prestwood
                   ` (2 subsequent siblings)
  7 siblings, 0 replies; 18+ messages in thread
From: James Prestwood @ 2024-01-03 18:46 UTC (permalink / raw)
  To: iwd; +Cc: James Prestwood

This test ensures IWD will continue to roam and restart netconfig if
roam conditions are met prior to netconfig finishing.
---
 autotests/testNetconfigRoam/TestFT.psk        |  2 +
 .../testNetconfigRoam/ft-psk-ccmp-1.conf      | 41 +++++++++
 .../testNetconfigRoam/ft-psk-ccmp-2.conf      | 41 +++++++++
 autotests/testNetconfigRoam/hw.conf           |  8 ++
 autotests/testNetconfigRoam/main.conf         |  3 +
 .../testNetconfigRoam/netconfig_roam_test.py  | 88 +++++++++++++++++++
 6 files changed, 183 insertions(+)
 create mode 100644 autotests/testNetconfigRoam/TestFT.psk
 create mode 100644 autotests/testNetconfigRoam/ft-psk-ccmp-1.conf
 create mode 100644 autotests/testNetconfigRoam/ft-psk-ccmp-2.conf
 create mode 100644 autotests/testNetconfigRoam/hw.conf
 create mode 100644 autotests/testNetconfigRoam/main.conf
 create mode 100644 autotests/testNetconfigRoam/netconfig_roam_test.py

diff --git a/autotests/testNetconfigRoam/TestFT.psk b/autotests/testNetconfigRoam/TestFT.psk
new file mode 100644
index 00000000..e82d1295
--- /dev/null
+++ b/autotests/testNetconfigRoam/TestFT.psk
@@ -0,0 +1,2 @@
+[Security]
+Passphrase=EasilyGuessedPassword
diff --git a/autotests/testNetconfigRoam/ft-psk-ccmp-1.conf b/autotests/testNetconfigRoam/ft-psk-ccmp-1.conf
new file mode 100644
index 00000000..b46d1f27
--- /dev/null
+++ b/autotests/testNetconfigRoam/ft-psk-ccmp-1.conf
@@ -0,0 +1,41 @@
+hw_mode=g
+channel=1
+ssid=TestFT
+utf8_ssid=1
+ctrl_interface=/var/run/hostapd
+
+r1_key_holder=120000000001
+nas_identifier=dummy1
+
+wpa=2
+# Can support WPA-PSK and FT-PSK (space separated list) and/or EAP at the same
+# time but we want to force FT
+wpa_key_mgmt=FT-PSK
+wpa_pairwise=CCMP
+wpa_passphrase=EasilyGuessedPassword
+ieee80211w=1
+rsn_preauth=1
+rsn_preauth_interfaces=lo
+disable_pmksa_caching=0
+# Allow PMK cache to be shared opportunistically among configured interfaces
+# and BSSes (i.e., all configurations within a single hostapd process).
+okc=1
+mobility_domain=1234
+reassociation_deadline=60000
+r0kh=12:00:00:00:00:01 dummy1 000102030405060708090a0b0c0d0e0f
+r0kh=12:00:00:00:00:02 dummy2 000102030405060708090a0b0c0d0e0f
+r1kh=12:00:00:00:00:01 00:00:00:00:00:01 000102030405060708090a0b0c0d0e0f
+r1kh=12:00:00:00:00:02 00:00:00:00:00:02 000102030405060708090a0b0c0d0e0f
+# Push mode only needed for 8021x, not PSK mode since msk already known
+pmk_r1_push=0
+# Allow locally generated FT response so we don't have to configure push/pull
+# between BSSes running as separate hostapd processes as in the test-runner
+# case.  Only works with FT-PSK, otherwise brctl needs to be installed and
+# CONFIG_BRIDGE enabled in the kernel.
+ft_psk_generate_local=1
+rkh_pull_timeout=50
+ft_over_ds=0
+ap_table_expiration_time=36000
+ap_table_max_size=10
+rrm_neighbor_report=1
+ocv=1
diff --git a/autotests/testNetconfigRoam/ft-psk-ccmp-2.conf b/autotests/testNetconfigRoam/ft-psk-ccmp-2.conf
new file mode 100644
index 00000000..3e215457
--- /dev/null
+++ b/autotests/testNetconfigRoam/ft-psk-ccmp-2.conf
@@ -0,0 +1,41 @@
+hw_mode=g
+channel=2
+ssid=TestFT
+utf8_ssid=1
+ctrl_interface=/var/run/hostapd
+
+r1_key_holder=120000000002
+nas_identifier=dummy2
+
+wpa=2
+# Can support WPA-PSK and FT-PSK (space separated list) and/or EAP at the same
+# time but we want to force FT
+wpa_key_mgmt=FT-PSK
+wpa_pairwise=CCMP
+wpa_passphrase=EasilyGuessedPassword
+ieee80211w=1
+rsn_preauth=1
+rsn_preauth_interfaces=lo
+disable_pmksa_caching=0
+# Allow PMK cache to be shared opportunistically among configured interfaces
+# and BSSes (i.e., all configurations within a single hostapd process).
+okc=1
+mobility_domain=1234
+reassociation_deadline=60000
+r0kh=12:00:00:00:00:01 dummy1 000102030405060708090a0b0c0d0e0f
+r0kh=12:00:00:00:00:02 dummy2 000102030405060708090a0b0c0d0e0f
+r1kh=12:00:00:00:00:01 00:00:00:00:00:01 000102030405060708090a0b0c0d0e0f
+r1kh=12:00:00:00:00:02 00:00:00:00:00:02 000102030405060708090a0b0c0d0e0f
+# Push mode only needed for 8021x, not PSK mode since msk already known
+pmk_r1_push=0
+# Allow locally generated FT response so we don't have to configure push/pull
+# between BSSes running as separate hostapd processes as in the test-runner
+# case.  Only works with FT-PSK, otherwise brctl needs to be installed and
+# CONFIG_BRIDGE enabled in the kernel.
+ft_psk_generate_local=1
+rkh_pull_timeout=50
+ft_over_ds=0
+ap_table_expiration_time=36000
+ap_table_max_size=10
+rrm_neighbor_report=1
+ocv=1
diff --git a/autotests/testNetconfigRoam/hw.conf b/autotests/testNetconfigRoam/hw.conf
new file mode 100644
index 00000000..c2b35d6e
--- /dev/null
+++ b/autotests/testNetconfigRoam/hw.conf
@@ -0,0 +1,8 @@
+[SETUP]
+num_radios=3
+start_iwd=0
+hwsim_medium=yes
+
+[HOSTAPD]
+rad0=ft-psk-ccmp-1.conf
+rad1=ft-psk-ccmp-2.conf
diff --git a/autotests/testNetconfigRoam/main.conf b/autotests/testNetconfigRoam/main.conf
new file mode 100644
index 00000000..aedd4ebd
--- /dev/null
+++ b/autotests/testNetconfigRoam/main.conf
@@ -0,0 +1,3 @@
+[General]
+EnableNetworkConfiguration=true
+RoamRetryInterval=1
diff --git a/autotests/testNetconfigRoam/netconfig_roam_test.py b/autotests/testNetconfigRoam/netconfig_roam_test.py
new file mode 100644
index 00000000..c66fc107
--- /dev/null
+++ b/autotests/testNetconfigRoam/netconfig_roam_test.py
@@ -0,0 +1,88 @@
+#! /usr/bin/python3
+
+import unittest
+import sys, os
+
+sys.path.append('../util')
+import iwd
+from iwd import IWD
+from iwd import PSKAgent
+from iwd import NetworkType
+from iwd import DeviceState
+from hwsim import Hwsim
+from hostapd import HostapdCLI
+import testutil
+
+class Test(unittest.TestCase):
+    def test_roam_before_netconfig(self):
+        wd = IWD(True)
+
+        device = wd.list_devices(1)[0]
+        device.autoconnect = True
+
+        self.bss_hostapd[1].wait_for_event(f'AP-STA-CONNECTED {device.address}')
+        device.wait_for_event("connecting (netconfig)")
+
+        roam_to = 0
+        roam_from = 1
+
+        # Roam back and forth, ensuring that the state transitions between
+        # roaming and connecting (netconfig).
+        for _ in range(0, 5):
+            self.rules[roam_to].signal = -2000
+            self.rules[roam_from].signal = -8000
+
+            condition = 'obj.state == DeviceState.roaming'
+            wd.wait_for_object_condition(device, condition)
+
+            self.bss_hostapd[roam_to].wait_for_event(f'AP-STA-CONNECTED {device.address}')
+            device.wait_for_event("connecting (netconfig)")
+
+            tmp = roam_from
+            roam_from = roam_to
+            roam_to = tmp
+
+        self.bss_hostapd[roam_from].deauthenticate(device.address)
+        condition = 'obj.state == DeviceState.disconnected'
+        wd.wait_for_object_condition(device, condition)
+
+    def tearDown(self):
+        os.system('ip link set "' + self.bss_hostapd[0].ifname + '" down')
+        os.system('ip link set "' + self.bss_hostapd[1].ifname + '" down')
+        os.system('ip link set "' + self.bss_hostapd[0].ifname + '" up')
+        os.system('ip link set "' + self.bss_hostapd[1].ifname + '" up')
+
+    @classmethod
+    def setUpClass(cls):
+        hwsim = Hwsim()
+
+        IWD.copy_to_storage('TestFT.psk')
+
+        cls.bss_hostapd = [ HostapdCLI(config='ft-psk-ccmp-1.conf'),
+                            HostapdCLI(config='ft-psk-ccmp-2.conf') ]
+
+        rule0 = hwsim.rules.create()
+        rule0.source = cls.bss_hostapd[0].bssid
+        rule0.signal = -6000
+        rule0.enabled = True
+
+        rule1 = hwsim.rules.create()
+        rule1.source = cls.bss_hostapd[1].bssid
+        rule1.signal = -2000
+        rule1.enabled = True
+
+        cls.rules = [rule0, rule1]
+
+        cls.bss_hostapd[0].set_address('12:00:00:00:00:01')
+        cls.bss_hostapd[1].set_address('12:00:00:00:00:02')
+
+        HostapdCLI.group_neighbors(*cls.bss_hostapd)
+
+    @classmethod
+    def tearDownClass(cls):
+        IWD.clear_storage()
+        cls.bss_hostapd = None
+        cls.rules = None
+
+if __name__ == '__main__':
+    unittest.main(exit=True)
-- 
2.34.1


^ permalink raw reply related	[flat|nested] 18+ messages in thread

* [PATCH 7/8] auto-t: improve failure handling in testPSK-roam
  2024-01-03 18:46 [PATCH 1/8] station: handle netconfig after roaming for FW roams James Prestwood
                   ` (4 preceding siblings ...)
  2024-01-03 18:46 ` [PATCH 6/8] auto-t: add test for roaming + netconfig James Prestwood
@ 2024-01-03 18:46 ` James Prestwood
  2024-01-04 18:00   ` Denis Kenzior
  2024-01-03 18:46 ` [PATCH 8/8] auto-t: fix random testPSK-roam failure James Prestwood
  2024-01-04 17:56 ` [PATCH 1/8] station: handle netconfig after roaming for FW roams Denis Kenzior
  7 siblings, 1 reply; 18+ messages in thread
From: James Prestwood @ 2024-01-03 18:46 UTC (permalink / raw)
  To: iwd; +Cc: James Prestwood

This really needs to be done to many more autotests but since this
one seems to have random failures ensure that all the tests still
run if one fails. In addition add better cleanup for hwsim rules.
---
 autotests/testPSK-roam/connection_test.py     | 25 +++++++++----------
 autotests/testPSK-roam/failed_roam_test.py    |  3 +++
 .../testPSK-roam/roam_ap_disconnect_test.py   | 11 +++++---
 3 files changed, 22 insertions(+), 17 deletions(-)

diff --git a/autotests/testPSK-roam/connection_test.py b/autotests/testPSK-roam/connection_test.py
index 8db4e4cb..2dbf1473 100644
--- a/autotests/testPSK-roam/connection_test.py
+++ b/autotests/testPSK-roam/connection_test.py
@@ -91,13 +91,9 @@ class Test(unittest.TestCase):
         wd.wait_for_object_condition(device, condition)
 
     def test_ft_psk(self):
-        wd = IWD(True)
-
-        self.validate_connection(wd)
+        self.validate_connection(self.wd)
 
     def test_ft_psk_over_ds(self):
-        wd = IWD(True)
-
         self.bss_hostapd[0].set_value('ft_over_ds', '1')
         self.bss_hostapd[0].reload()
         self.bss_hostapd[0].wait_for_event("AP-ENABLED")
@@ -106,11 +102,9 @@ class Test(unittest.TestCase):
         self.bss_hostapd[1].reload()
         self.bss_hostapd[1].wait_for_event("AP-ENABLED")
 
-        self.validate_connection(wd, over_ds=True)
+        self.validate_connection(self.wd, over_ds=True)
 
     def test_reassociate_psk(self):
-        wd = IWD(True)
-
         self.bss_hostapd[0].set_value('wpa_key_mgmt', 'WPA-PSK')
         self.bss_hostapd[0].reload()
         self.bss_hostapd[0].wait_for_event("AP-ENABLED")
@@ -119,12 +113,10 @@ class Test(unittest.TestCase):
         self.bss_hostapd[1].reload()
         self.bss_hostapd[1].wait_for_event("AP-ENABLED")
 
-        self.validate_connection(wd)
+        self.validate_connection(self.wd)
 
     def test_roam_packet_loss(self):
-        wd = IWD(True)
-
-        self.validate_connection(wd, pkt_loss=True)
+        self.validate_connection(self.wd, pkt_loss=True)
 
     def tearDown(self):
         os.system('ip link set "' + self.bss_hostapd[0].ifname + '" down')
@@ -139,6 +131,12 @@ class Test(unittest.TestCase):
         for hapd in self.bss_hostapd:
             hapd.default()
 
+        self.wd.stop()
+        self.wd = None
+
+    def setUp(self):
+        self.wd = IWD(True)
+
     @classmethod
     def setUpClass(cls):
         hwsim = Hwsim()
@@ -178,8 +176,9 @@ class Test(unittest.TestCase):
     def tearDownClass(cls):
         IWD.clear_storage()
         cls.bss_hostapd = None
-        cls.rule0.enabled = False
         cls.rule0.remove()
+        cls.rule1.remove()
+        cls.rule2.remove()
 
 if __name__ == '__main__':
     unittest.main(exit=True)
diff --git a/autotests/testPSK-roam/failed_roam_test.py b/autotests/testPSK-roam/failed_roam_test.py
index ccdec2c6..12e439f6 100644
--- a/autotests/testPSK-roam/failed_roam_test.py
+++ b/autotests/testPSK-roam/failed_roam_test.py
@@ -227,6 +227,9 @@ class Test(unittest.TestCase):
         cls.bss_hostapd = None
         cls.rule0.remove()
         cls.rule1.remove()
+        cls.rule2.remove()
+        cls.rule3.remove()
+        cls.assoc_rule.remove()
 
 if __name__ == '__main__':
     unittest.main(exit=True)
diff --git a/autotests/testPSK-roam/roam_ap_disconnect_test.py b/autotests/testPSK-roam/roam_ap_disconnect_test.py
index b5775990..76c62ea0 100644
--- a/autotests/testPSK-roam/roam_ap_disconnect_test.py
+++ b/autotests/testPSK-roam/roam_ap_disconnect_test.py
@@ -14,8 +14,7 @@ class Test(unittest.TestCase):
     # Tests a crash reported where multiple roam scans combined with an AP
     # disconnect result in a crash getting scan results.
     #
-    def validate(self):
-        wd = IWD(True)
+    def validate(self, wd):
         device = wd.list_devices(1)[0]
 
         ordered_network = device.get_ordered_network('TestFT', full_scan=True)
@@ -43,14 +42,15 @@ class Test(unittest.TestCase):
         wd.wait_for_object_condition(device, condition)
 
     def test_ap_disconnect_no_neighbors(self):
-        self.validate()
+        self.validate(self.wd)
 
     def test_ap_disconnect_neighbors(self):
         HostapdCLI.group_neighbors(*self.bss_hostapd)
 
-        self.validate()
+        self.validate(self.wd)
 
     def setUp(self):
+        self.wd = IWD(True)
         self.bss_hostapd[0].reload()
         self.bss_hostapd[0].wait_for_event("AP-ENABLED")
 
@@ -63,6 +63,9 @@ class Test(unittest.TestCase):
         self.bss_hostapd[0].remove_neighbor(self.bss_hostapd[1].bssid)
         self.bss_hostapd[1].remove_neighbor(self.bss_hostapd[0].bssid)
 
+        self.wd.stop()
+        self.wd = None
+
     @classmethod
     def setUpClass(cls):
         hwsim = Hwsim()
-- 
2.34.1


^ permalink raw reply related	[flat|nested] 18+ messages in thread

* [PATCH 8/8] auto-t: fix random testPSK-roam failure
  2024-01-03 18:46 [PATCH 1/8] station: handle netconfig after roaming for FW roams James Prestwood
                   ` (5 preceding siblings ...)
  2024-01-03 18:46 ` [PATCH 7/8] auto-t: improve failure handling in testPSK-roam James Prestwood
@ 2024-01-03 18:46 ` James Prestwood
  2024-01-04 18:00   ` Denis Kenzior
  2024-01-04 17:56 ` [PATCH 1/8] station: handle netconfig after roaming for FW roams Denis Kenzior
  7 siblings, 1 reply; 18+ messages in thread
From: James Prestwood @ 2024-01-03 18:46 UTC (permalink / raw)
  To: iwd; +Cc: James Prestwood

This was caused by the unused hostapd instance running after being
re-enabled by mistake. This cause an additional scan result with the
same rank to be seen which would then be connected to by luck of the
draw.
---
 autotests/testPSK-roam/connection_test.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/autotests/testPSK-roam/connection_test.py b/autotests/testPSK-roam/connection_test.py
index 2dbf1473..8d1658a8 100644
--- a/autotests/testPSK-roam/connection_test.py
+++ b/autotests/testPSK-roam/connection_test.py
@@ -144,9 +144,10 @@ class Test(unittest.TestCase):
         IWD.copy_to_storage('TestFT.psk')
 
         cls.bss_hostapd = [ HostapdCLI(config='ft-psk-ccmp-1.conf'),
-                            HostapdCLI(config='ft-psk-ccmp-2.conf'),
-                            HostapdCLI(config='ft-psk-ccmp-3.conf') ]
-        cls.bss_hostapd[2].disable()
+                            HostapdCLI(config='ft-psk-ccmp-2.conf') ]
+
+        unused = HostapdCLI(config='ft-psk-ccmp-3.conf')
+        unused.disable()
 
         rad0 = hwsim.get_radio('rad0')
         rad3 = hwsim.get_radio('rad3')
-- 
2.34.1


^ permalink raw reply related	[flat|nested] 18+ messages in thread

* Re: [PATCH 1/8] station: handle netconfig after roaming for FW roams
  2024-01-03 18:46 [PATCH 1/8] station: handle netconfig after roaming for FW roams James Prestwood
                   ` (6 preceding siblings ...)
  2024-01-03 18:46 ` [PATCH 8/8] auto-t: fix random testPSK-roam failure James Prestwood
@ 2024-01-04 17:56 ` Denis Kenzior
  7 siblings, 0 replies; 18+ messages in thread
From: Denis Kenzior @ 2024-01-04 17:56 UTC (permalink / raw)
  To: James Prestwood, iwd

Hi James,

On 1/3/24 12:46, James Prestwood wrote:
> This was not taken into account for FW roams and would result in the
> station state being set to connected regardless of netconfig's result.
> ---
>   src/station.c | 12 +++++++++++-
>   1 file changed, 11 insertions(+), 1 deletion(-)
> 

Applied, thanks.

Regards,
-Denis


^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH 4/8] station: add debug events for internal states
  2024-01-03 18:46 ` [PATCH 4/8] station: add debug events for internal states James Prestwood
@ 2024-01-04 17:57   ` Denis Kenzior
  0 siblings, 0 replies; 18+ messages in thread
From: Denis Kenzior @ 2024-01-04 17:57 UTC (permalink / raw)
  To: James Prestwood, iwd

Hi James,

On 1/3/24 12:46, James Prestwood wrote:
> This gives the tests a lot more fine-tune control to wait for
> specific state transitions rather than only what is exposed over
> DBus.
> 
> The additional events for "ft-roam" and "reassoc-roam" were removed
> since these are now covered by the more generic state change events
> ("ft-roaming" and "roaming" respectively).
> ---
>   src/station.c | 6 ++----
>   1 file changed, 2 insertions(+), 4 deletions(-)
> 

Applied, thanks.

Regards,
-Denis


^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH 5/8] auto-t: update roam test to use new debug events
  2024-01-03 18:46 ` [PATCH 5/8] auto-t: update roam test to use new debug events James Prestwood
@ 2024-01-04 17:58   ` Denis Kenzior
  0 siblings, 0 replies; 18+ messages in thread
From: Denis Kenzior @ 2024-01-04 17:58 UTC (permalink / raw)
  To: James Prestwood, iwd

Hi James,

On 1/3/24 12:46, James Prestwood wrote:
> ---
>   autotests/testPSK-roam/failed_roam_test.py | 10 +++++-----
>   1 file changed, 5 insertions(+), 5 deletions(-)
> 

Applied, thanks.

Regards,
-Denis


^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH 7/8] auto-t: improve failure handling in testPSK-roam
  2024-01-03 18:46 ` [PATCH 7/8] auto-t: improve failure handling in testPSK-roam James Prestwood
@ 2024-01-04 18:00   ` Denis Kenzior
  0 siblings, 0 replies; 18+ messages in thread
From: Denis Kenzior @ 2024-01-04 18:00 UTC (permalink / raw)
  To: James Prestwood, iwd

Hi James,

On 1/3/24 12:46, James Prestwood wrote:
> This really needs to be done to many more autotests but since this
> one seems to have random failures ensure that all the tests still
> run if one fails. In addition add better cleanup for hwsim rules.
> ---
>   autotests/testPSK-roam/connection_test.py     | 25 +++++++++----------
>   autotests/testPSK-roam/failed_roam_test.py    |  3 +++
>   .../testPSK-roam/roam_ap_disconnect_test.py   | 11 +++++---
>   3 files changed, 22 insertions(+), 17 deletions(-)
> 

Applied, thanks.

Regards,
-Denis


^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH 8/8] auto-t: fix random testPSK-roam failure
  2024-01-03 18:46 ` [PATCH 8/8] auto-t: fix random testPSK-roam failure James Prestwood
@ 2024-01-04 18:00   ` Denis Kenzior
  0 siblings, 0 replies; 18+ messages in thread
From: Denis Kenzior @ 2024-01-04 18:00 UTC (permalink / raw)
  To: James Prestwood, iwd

Hi James,

On 1/3/24 12:46, James Prestwood wrote:
> This was caused by the unused hostapd instance running after being
> re-enabled by mistake. This cause an additional scan result with the
> same rank to be seen which would then be connected to by luck of the
> draw.
> ---
>   autotests/testPSK-roam/connection_test.py | 7 ++++---
>   1 file changed, 4 insertions(+), 3 deletions(-)
> 

Applied, thanks.

Regards,
-Denis


^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH 3/8] station: add handling for new NETCONFIG state
  2024-01-03 18:46 ` [PATCH 3/8] station: add handling for new NETCONFIG state James Prestwood
@ 2024-01-04 18:14   ` Denis Kenzior
  2024-01-04 18:31     ` James Prestwood
  0 siblings, 1 reply; 18+ messages in thread
From: Denis Kenzior @ 2024-01-04 18:14 UTC (permalink / raw)
  To: James Prestwood, iwd

Hi James,

On 1/3/24 12:46, James Prestwood wrote:
> There was an unhandled corner case if netconfig was running and
> multiple roam conditions happened in sequence, all before netconfig
> had completed. A single roam before netconfig was already handled
> (23f0f5717c) but this did not take into account any additional roam
> conditions.

So if netconfig hasn't completed, we're in the 'connecting' state.  Any 
subsequent roams should still be treated as if we are in 'connecting' state. 
Are we transitioning from connecting -> roaming at the D-Bus API level? We 
shouldn't be.

Another weirdness is that I think we're sending the d-bus reply after connecting 
to the AP, but before netconfig runs.

> 
> If IWD is in this state, having started netconfig, then roamed, and
> again restarted netconfig it is still in a roaming state which will
> prevent any further roams. IWD will remain "stuck" on the current
> BSS until netconfig completes or gets disconnected.

Makes sense, since roaming means netconfig isn't really doing anything.

> 
> To fix this a new internal station state was added (no changes to
> the DBus API) to distinguish between a purely WiFi connecting state
> (STATION_STATE_CONNECTING/AUTO) and netconfig
> (STATION_STATE_NETCONFIG). This allows IWD roam as needed if
> netconfig is still running.

Okay, but how would you distinguish between connecting -> netconfig and 
netconfig->roaming->netconfig?

> 
> The change is mainly just adding STATION_STATE_NETCONFIG anywhere
> that STATION_STATE_CONNECTING is to maintain the same behavior,
> except within the netconfig event handler. In this case we should
> never get here without being in either a NETCONFIG or ROAMING
> state.
> 
> For some background this scenario happens if the DHCP server goes
> down for an extended period, e.g. if its being upgraded/serviced.
> ---
>   src/station.c | 23 +++++++++++++++++------
>   1 file changed, 17 insertions(+), 6 deletions(-)
> 
> diff --git a/src/station.c b/src/station.c
> index 57d22e91..8f310ec8 100644
> --- a/src/station.c
> +++ b/src/station.c
> @@ -1768,6 +1768,7 @@ static void station_reset_connection_state(struct station *station)
>   	if (station->state == STATION_STATE_CONNECTED ||
>   			station->state == STATION_STATE_CONNECTING ||
>   			station->state == STATION_STATE_CONNECTING_AUTO ||
> +			station->state == STATION_STATE_NETCONFIG ||
>   			station_is_roaming(station))
>   		network_disconnected(network);
>   }
> @@ -2043,8 +2044,9 @@ static void station_netconfig_event_handler(enum netconfig_event event,
>   			dbus_pending_reply(&station->connect_pending, reply);
>   		}
>   
> -		if (L_IN_SET(station->state, STATION_STATE_CONNECTING,
> -				STATION_STATE_CONNECTING_AUTO))
> +		if (L_IN_SET(station->state, STATION_STATE_NETCONFIG,
> +				STATION_STATE_ROAMING, STATION_STATE_FT_ROAMING,
> +				STATION_STATE_FW_ROAMING))

I understand why NETCONFIG state is in the set, but why the others?

>   			network_connect_failed(station->connected_network,
>   						false);
>   
> @@ -2070,9 +2072,14 @@ static bool netconfig_after_roam(struct station *station)
>   					network_get_settings(network)))
>   		return false;
>   
> -	return netconfig_configure(station->netconfig,
> +	if (L_WARN_ON(!netconfig_configure(station->netconfig,
>   					station_netconfig_event_handler,
> -					station);
> +					station)))

You already have an L_WARN_ON in the single call site of netconfig_after_roam?

> +		return false;
> +
> +	station_enter_state(station, STATION_STATE_NETCONFIG);
> +
> +	return true;
>   }
>   
>   static void station_roamed(struct station *station)
> @@ -3255,6 +3262,8 @@ static void station_connect_ok(struct station *station)
>   						station_netconfig_event_handler,
>   						station)))
>   			return;
> +
> +		station_enter_state(station, STATION_STATE_NETCONFIG);
>   	} else
>   		station_enter_state(station, STATION_STATE_CONNECTED);
>   }
> @@ -4067,7 +4076,8 @@ static struct l_dbus_message *station_dbus_scan(struct l_dbus *dbus,
>   		return dbus_error_busy(message);
>   
>   	if (station->state == STATION_STATE_CONNECTING ||
> -			station->state == STATION_STATE_CONNECTING_AUTO)
> +			station->state == STATION_STATE_CONNECTING_AUTO ||
> +			station->state == STATION_STATE_NETCONFIG)

Might as well use L_IN_SET here

>   		return dbus_error_busy(message);
>   
>   	station->dbus_scan_subset_idx = 0;
> @@ -5025,7 +5035,8 @@ static struct l_dbus_message *station_debug_scan(struct l_dbus *dbus,
>   		return dbus_error_busy(message);
>   
>   	if (station->state == STATION_STATE_CONNECTING ||
> -			station->state == STATION_STATE_CONNECTING_AUTO)
> +			station->state == STATION_STATE_CONNECTING_AUTO ||
> +			station->state == STATION_STATE_NETCONFIG)

Also, shouldn't this also cover the roaming states?  And do we still need this 
check given wiphy_work?  Does netconfig use wiphy work to make sure nothing 
tries to scan or go off-channel?

>   		return dbus_error_busy(message);
>   
>   	if (!l_dbus_message_get_arguments(message, "aq", &iter))

Regards,
-Denis

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH 3/8] station: add handling for new NETCONFIG state
  2024-01-04 18:14   ` Denis Kenzior
@ 2024-01-04 18:31     ` James Prestwood
  2024-01-04 18:55       ` Denis Kenzior
  0 siblings, 1 reply; 18+ messages in thread
From: James Prestwood @ 2024-01-04 18:31 UTC (permalink / raw)
  To: Denis Kenzior, iwd


On 1/4/24 10:14 AM, Denis Kenzior wrote:
> Hi James,
>
> On 1/3/24 12:46, James Prestwood wrote:
>> There was an unhandled corner case if netconfig was running and
>> multiple roam conditions happened in sequence, all before netconfig
>> had completed. A single roam before netconfig was already handled
>> (23f0f5717c) but this did not take into account any additional roam
>> conditions.
>
> So if netconfig hasn't completed, we're in the 'connecting' state.  
> Any subsequent roams should still be treated as if we are in 
> 'connecting' state. Are we transitioning from connecting -> roaming at 
> the D-Bus API level? We shouldn't be.
Yes we are as it is today.
>
> Another weirdness is that I think we're sending the d-bus reply after 
> connecting to the AP, but before netconfig runs.
Probably this as well... But I'm not sure we can really send it after 
netconfig unless we want to add a timeout error or something. If 
netconfig takes a long time DBus will be unhappy. IMO netconfig is 
somewhat of a special case in this regard, and consumers of the API 
should be waiting on the connected state, not only the DBus method return.
>
>>
>> If IWD is in this state, having started netconfig, then roamed, and
>> again restarted netconfig it is still in a roaming state which will
>> prevent any further roams. IWD will remain "stuck" on the current
>> BSS until netconfig completes or gets disconnected.
>
> Makes sense, since roaming means netconfig isn't really doing anything.
>
>>
>> To fix this a new internal station state was added (no changes to
>> the DBus API) to distinguish between a purely WiFi connecting state
>> (STATION_STATE_CONNECTING/AUTO) and netconfig
>> (STATION_STATE_NETCONFIG). This allows IWD roam as needed if
>> netconfig is still running.
>
> Okay, but how would you distinguish between connecting -> netconfig 
> and netconfig->roaming->netconfig?
I hadn't had a need to distinguish, but given the above of wanting to 
remain the connecting state I think we'll need to.
>
>>
>> The change is mainly just adding STATION_STATE_NETCONFIG anywhere
>> that STATION_STATE_CONNECTING is to maintain the same behavior,
>> except within the netconfig event handler. In this case we should
>> never get here without being in either a NETCONFIG or ROAMING
>> state.
>>
>> For some background this scenario happens if the DHCP server goes
>> down for an extended period, e.g. if its being upgraded/serviced.
>> ---
>>   src/station.c | 23 +++++++++++++++++------
>>   1 file changed, 17 insertions(+), 6 deletions(-)
>>
>> diff --git a/src/station.c b/src/station.c
>> index 57d22e91..8f310ec8 100644
>> --- a/src/station.c
>> +++ b/src/station.c
>> @@ -1768,6 +1768,7 @@ static void 
>> station_reset_connection_state(struct station *station)
>>       if (station->state == STATION_STATE_CONNECTED ||
>>               station->state == STATION_STATE_CONNECTING ||
>>               station->state == STATION_STATE_CONNECTING_AUTO ||
>> +            station->state == STATION_STATE_NETCONFIG ||
>>               station_is_roaming(station))
>>           network_disconnected(network);
>>   }
>> @@ -2043,8 +2044,9 @@ static void 
>> station_netconfig_event_handler(enum netconfig_event event,
>> dbus_pending_reply(&station->connect_pending, reply);
>>           }
>>   -        if (L_IN_SET(station->state, STATION_STATE_CONNECTING,
>> -                STATION_STATE_CONNECTING_AUTO))
>> +        if (L_IN_SET(station->state, STATION_STATE_NETCONFIG,
>> +                STATION_STATE_ROAMING, STATION_STATE_FT_ROAMING,
>> +                STATION_STATE_FW_ROAMING))
>
> I understand why NETCONFIG state is in the set, but why the others?
This was because if we are roaming and netconfig fails for some reason 
we want to disconnect, right?
>
>> network_connect_failed(station->connected_network,
>>                           false);
>>   @@ -2070,9 +2072,14 @@ static bool netconfig_after_roam(struct 
>> station *station)
>>                       network_get_settings(network)))
>>           return false;
>>   -    return netconfig_configure(station->netconfig,
>> +    if (L_WARN_ON(!netconfig_configure(station->netconfig,
>>                       station_netconfig_event_handler,
>> -                    station);
>> +                    station)))
>
> You already have an L_WARN_ON in the single call site of 
> netconfig_after_roam?
>
>> +        return false;
>> +
>> +    station_enter_state(station, STATION_STATE_NETCONFIG);
>> +
>> +    return true;
>>   }
>>     static void station_roamed(struct station *station)
>> @@ -3255,6 +3262,8 @@ static void station_connect_ok(struct station 
>> *station)
>>                           station_netconfig_event_handler,
>>                           station)))
>>               return;
>> +
>> +        station_enter_state(station, STATION_STATE_NETCONFIG);
>>       } else
>>           station_enter_state(station, STATION_STATE_CONNECTED);
>>   }
>> @@ -4067,7 +4076,8 @@ static struct l_dbus_message 
>> *station_dbus_scan(struct l_dbus *dbus,
>>           return dbus_error_busy(message);
>>         if (station->state == STATION_STATE_CONNECTING ||
>> -            station->state == STATION_STATE_CONNECTING_AUTO)
>> +            station->state == STATION_STATE_CONNECTING_AUTO ||
>> +            station->state == STATION_STATE_NETCONFIG)
>
> Might as well use L_IN_SET here
>
>>           return dbus_error_busy(message);
>>         station->dbus_scan_subset_idx = 0;
>> @@ -5025,7 +5035,8 @@ static struct l_dbus_message 
>> *station_debug_scan(struct l_dbus *dbus,
>>           return dbus_error_busy(message);
>>         if (station->state == STATION_STATE_CONNECTING ||
>> -            station->state == STATION_STATE_CONNECTING_AUTO)
>> +            station->state == STATION_STATE_CONNECTING_AUTO ||
>> +            station->state == STATION_STATE_NETCONFIG)
>
> Also, shouldn't this also cover the roaming states?  And do we still 
> need this check given wiphy_work?  Does netconfig use wiphy work to 
> make sure nothing tries to scan or go off-channel?
Yes, we shouldn't scan/offchannel while roaming. And no netconfig 
doesn't uses the wiphy queue, but it really should.
>
>>           return dbus_error_busy(message);
>>         if (!l_dbus_message_get_arguments(message, "aq", &iter))
So sounds like I opened up a can of worms here :) We only noticed these 
issues cropping up recently because of extended server upgrade times, 
i.e. the DHCP server was down for a long time. Clients roamed and if 
they didn't have the recent changes to restart netconfig they'd be stuck 
connected without an IP. I then noticed some of these other limitations 
now, but at least currently being unable to roam is better than having 
no IP and requiring physical attention.
> Regards,
> -Denis

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH 3/8] station: add handling for new NETCONFIG state
  2024-01-04 18:31     ` James Prestwood
@ 2024-01-04 18:55       ` Denis Kenzior
  2024-01-04 19:55         ` James Prestwood
  0 siblings, 1 reply; 18+ messages in thread
From: Denis Kenzior @ 2024-01-04 18:55 UTC (permalink / raw)
  To: James Prestwood, iwd

Hi James,

>> Another weirdness is that I think we're sending the d-bus reply after 
>> connecting to the AP, but before netconfig runs.
> Probably this as well... But I'm not sure we can really send it after netconfig 
> unless we want to add a timeout error or something. If netconfig takes a long 

We already should be disconnecting if netconfig fails.  It is part of the 
NETCONFIG_EVENT_FAILED handling this patch was touching actually.  I'm not sure 
how this was tested exactly, but Andrew was pretty adamant that we should be 
doing so and I thought his arguments were compelling enough.

> time DBus will be unhappy. IMO netconfig is somewhat of a special case in this 
> regard, and consumers of the API should be waiting on the connected state, not 
> only the DBus method return.

netconfig should take a max of a few seconds under normal circumstances. 
Capping the DHCP time might make sense anyway and I think there are even some 
TODO items inside ell/netconfig.c about this.

Anyhow, this is a smaller issue compared to the one above.

>>> @@ -2043,8 +2044,9 @@ static void station_netconfig_event_handler(enum 
>>> netconfig_event event,
>>> dbus_pending_reply(&station->connect_pending, reply);
>>>           }
>>>   -        if (L_IN_SET(station->state, STATION_STATE_CONNECTING,
>>> -                STATION_STATE_CONNECTING_AUTO))
>>> +        if (L_IN_SET(station->state, STATION_STATE_NETCONFIG,
>>> +                STATION_STATE_ROAMING, STATION_STATE_FT_ROAMING,
>>> +                STATION_STATE_FW_ROAMING))
>>
>> I understand why NETCONFIG state is in the set, but why the others?
> This was because if we are roaming and netconfig fails for some reason we want 
> to disconnect, right?

Well, AFAIK, the original intent of this piece was to discard secrets if the 
initial connection failed.  This is now covered by the NETCONFIG state which is 
roughly equivalent to CONNECTING.

The addition of roaming states modifies the original intent.  Hence why I'm 
wondering why this is being added?

>> Also, shouldn't this also cover the roaming states?  And do we still need this 
>> check given wiphy_work?  Does netconfig use wiphy work to make sure nothing 
>> tries to scan or go off-channel?
> Yes, we shouldn't scan/offchannel while roaming. And no netconfig doesn't uses 
> the wiphy queue, but it really should.
>>
>>>           return dbus_error_busy(message);
>>>         if (!l_dbus_message_get_arguments(message, "aq", &iter))
> So sounds like I opened up a can of worms here :) We only noticed these issues 
> cropping up recently because of extended server upgrade times, i.e. the DHCP 

Yep, there's always going to be nasty corner cases to deal with.  I suspect your 
earlier change to re-start netconfig exposed some of these inconsistencies as well.

> server was down for a long time. Clients roamed and if they didn't have the 
> recent changes to restart netconfig they'd be stuck connected without an IP. I 

Hence why I think it is better to treat this as an initial connection failure 
rather than pretending that we roamed and dealing with the consequences.

> then noticed some of these other limitations now, but at least currently being 
> unable to roam is better than having no IP and requiring physical attention.

Regards,
-Denis

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH 3/8] station: add handling for new NETCONFIG state
  2024-01-04 18:55       ` Denis Kenzior
@ 2024-01-04 19:55         ` James Prestwood
  2024-01-04 21:01           ` Denis Kenzior
  0 siblings, 1 reply; 18+ messages in thread
From: James Prestwood @ 2024-01-04 19:55 UTC (permalink / raw)
  To: Denis Kenzior, iwd

Hi Denis,

On 1/4/24 10:55 AM, Denis Kenzior wrote:
> Hi James,
>
>>> Another weirdness is that I think we're sending the d-bus reply 
>>> after connecting to the AP, but before netconfig runs.
>> Probably this as well... But I'm not sure we can really send it after 
>> netconfig unless we want to add a timeout error or something. If 
>> netconfig takes a long 
>
> We already should be disconnecting if netconfig fails.  It is part of 
> the NETCONFIG_EVENT_FAILED handling this patch was touching actually.  
> I'm not sure how this was tested exactly, but Andrew was pretty 
> adamant that we should be doing so and I thought his arguments were 
> compelling enough.
>
>> time DBus will be unhappy. IMO netconfig is somewhat of a special 
>> case in this regard, and consumers of the API should be waiting on 
>> the connected state, not only the DBus method return.
>
> netconfig should take a max of a few seconds under normal 
> circumstances. Capping the DHCP time might make sense anyway and I 
> think there are even some TODO items inside ell/netconfig.c about this.
This seems much simpler to implement and maintain...
>
> Anyhow, this is a smaller issue compared to the one above.
>
>>>> @@ -2043,8 +2044,9 @@ static void 
>>>> station_netconfig_event_handler(enum netconfig_event event,
>>>> dbus_pending_reply(&station->connect_pending, reply);
>>>>           }
>>>>   -        if (L_IN_SET(station->state, STATION_STATE_CONNECTING,
>>>> -                STATION_STATE_CONNECTING_AUTO))
>>>> +        if (L_IN_SET(station->state, STATION_STATE_NETCONFIG,
>>>> +                STATION_STATE_ROAMING, STATION_STATE_FT_ROAMING,
>>>> +                STATION_STATE_FW_ROAMING))
>>>
>>> I understand why NETCONFIG state is in the set, but why the others?
>> This was because if we are roaming and netconfig fails for some 
>> reason we want to disconnect, right?
>
> Well, AFAIK, the original intent of this piece was to discard secrets 
> if the initial connection failed.  This is now covered by the 
> NETCONFIG state which is roughly equivalent to CONNECTING.
>
> The addition of roaming states modifies the original intent. Hence why 
> I'm wondering why this is being added?
>
>>> Also, shouldn't this also cover the roaming states?  And do we still 
>>> need this check given wiphy_work?  Does netconfig use wiphy work to 
>>> make sure nothing tries to scan or go off-channel?
>> Yes, we shouldn't scan/offchannel while roaming. And no netconfig 
>> doesn't uses the wiphy queue, but it really should.
>>>
>>>>           return dbus_error_busy(message);
>>>>         if (!l_dbus_message_get_arguments(message, "aq", &iter))
>> So sounds like I opened up a can of worms here :) We only noticed 
>> these issues cropping up recently because of extended server upgrade 
>> times, i.e. the DHCP 
>
> Yep, there's always going to be nasty corner cases to deal with. I 
> suspect your earlier change to re-start netconfig exposed some of 
> these inconsistencies as well.
>
>> server was down for a long time. Clients roamed and if they didn't 
>> have the recent changes to restart netconfig they'd be stuck 
>> connected without an IP. I 
>
> Hence why I think it is better to treat this as an initial connection 
> failure rather than pretending that we roamed and dealing with the 
> consequences.
So when I originally sent the netconfig after roam change I wanted to 
actually prevent any roaming until netconfig finished, but instead we 
allowed the roam and restarted netconfig. So yes, I feel like this 
wasn't the right direction to go. I'm thinking instead we should prevent 
any roaming, cap the netconfig time (e.g. 15 seconds) and fail the 
connection if netconfig doesn't finish. Then we avoid all these nasty 
details of state changes and trying to manage roams but appear as 
"connecting" on DBus etc.
>> then noticed some of these other limitations now, but at least 
>> currently being unable to roam is better than having no IP and 
>> requiring physical attention.
>
> Regards,
> -Denis

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH 3/8] station: add handling for new NETCONFIG state
  2024-01-04 19:55         ` James Prestwood
@ 2024-01-04 21:01           ` Denis Kenzior
  0 siblings, 0 replies; 18+ messages in thread
From: Denis Kenzior @ 2024-01-04 21:01 UTC (permalink / raw)
  To: James Prestwood, iwd

Hi James,

>> Hence why I think it is better to treat this as an initial connection failure 
>> rather than pretending that we roamed and dealing with the consequences.
> So when I originally sent the netconfig after roam change I wanted to actually 
> prevent any roaming until netconfig finished, but instead we allowed the roam 
> and restarted netconfig. So yes, I feel like this wasn't the right direction to 
> go. I'm thinking instead we should prevent any roaming, cap the netconfig time 
> (e.g. 15 seconds) and fail the connection if netconfig doesn't finish. Then we 

Preventing roaming would certainly be easier to implement, but the fact that you 
had roams during the initial connection (the original problem which prompted 
this change), probably means that we do want to allow roaming eventually.

6G in particular would benefit, since attaching to 2.4/5GHz frequencies first, 
then asking for neighbor reports and roaming to 6G is probably the best strategy 
compared to passively scanning on the 6 GHz band.

Regards,
-Denis

^ permalink raw reply	[flat|nested] 18+ messages in thread

end of thread, other threads:[~2024-01-04 21:01 UTC | newest]

Thread overview: 18+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2024-01-03 18:46 [PATCH 1/8] station: handle netconfig after roaming for FW roams James Prestwood
2024-01-03 18:46 ` [PATCH 2/8] station: add additional internal state, STATION_STATE_NETCONFIG James Prestwood
2024-01-03 18:46 ` [PATCH 3/8] station: add handling for new NETCONFIG state James Prestwood
2024-01-04 18:14   ` Denis Kenzior
2024-01-04 18:31     ` James Prestwood
2024-01-04 18:55       ` Denis Kenzior
2024-01-04 19:55         ` James Prestwood
2024-01-04 21:01           ` Denis Kenzior
2024-01-03 18:46 ` [PATCH 4/8] station: add debug events for internal states James Prestwood
2024-01-04 17:57   ` Denis Kenzior
2024-01-03 18:46 ` [PATCH 5/8] auto-t: update roam test to use new debug events James Prestwood
2024-01-04 17:58   ` Denis Kenzior
2024-01-03 18:46 ` [PATCH 6/8] auto-t: add test for roaming + netconfig James Prestwood
2024-01-03 18:46 ` [PATCH 7/8] auto-t: improve failure handling in testPSK-roam James Prestwood
2024-01-04 18:00   ` Denis Kenzior
2024-01-03 18:46 ` [PATCH 8/8] auto-t: fix random testPSK-roam failure James Prestwood
2024-01-04 18:00   ` Denis Kenzior
2024-01-04 17:56 ` [PATCH 1/8] station: handle netconfig after roaming for FW roams Denis Kenzior

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox