* [PATCH v2 1/2] drivers/hv: introduce vmbus_channel_set_cpu()
@ 2025-01-10 21:59 Hamza Mahfooz
2025-01-10 21:59 ` [PATCH v2 2/2] drivers/hv: add CPU offlining support Hamza Mahfooz
` (2 more replies)
0 siblings, 3 replies; 7+ messages in thread
From: Hamza Mahfooz @ 2025-01-10 21:59 UTC (permalink / raw)
To: linux-hyperv
Cc: Hamza Mahfooz, Boqun Feng, Wei Liu, K. Y. Srinivasan,
Haiyang Zhang, Dexuan Cui, linux-kernel
I would like to reuse target_cpu_store() within the driver. So, move
most of its body into vmbus_channel_set_cpu().
Cc: Boqun Feng <boqun.feng@gmail.com>
Cc: Wei Liu <wei.liu@kernel.org>
Signed-off-by: Hamza Mahfooz <hamzamahfooz@linux.microsoft.com>
---
v2: separate vmbus_channel_set_cpu() changes from
cpu offlining changes.
---
drivers/hv/vmbus_drv.c | 52 +++++++++++++++++++++++++-----------------
include/linux/hyperv.h | 1 +
2 files changed, 32 insertions(+), 21 deletions(-)
diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c
index 2892b8da20a5..001e64fb8d43 100644
--- a/drivers/hv/vmbus_drv.c
+++ b/drivers/hv/vmbus_drv.c
@@ -1611,16 +1611,16 @@ static ssize_t target_cpu_show(struct vmbus_channel *channel, char *buf)
{
return sprintf(buf, "%u\n", channel->target_cpu);
}
-static ssize_t target_cpu_store(struct vmbus_channel *channel,
- const char *buf, size_t count)
+
+int vmbus_channel_set_cpu(struct vmbus_channel *channel, u32 target_cpu)
{
- u32 target_cpu, origin_cpu;
- ssize_t ret = count;
+ u32 origin_cpu;
+ int ret = 0;
- if (vmbus_proto_version < VERSION_WIN10_V4_1)
- return -EIO;
+ lockdep_assert_cpus_held();
+ lockdep_assert_held(&vmbus_connection.channel_mutex);
- if (sscanf(buf, "%uu", &target_cpu) != 1)
+ if (vmbus_proto_version < VERSION_WIN10_V4_1)
return -EIO;
/* Validate target_cpu for the cpumask_test_cpu() operation below. */
@@ -1630,22 +1630,17 @@ static ssize_t target_cpu_store(struct vmbus_channel *channel,
if (!cpumask_test_cpu(target_cpu, housekeeping_cpumask(HK_TYPE_MANAGED_IRQ)))
return -EINVAL;
- /* No CPUs should come up or down during this. */
- cpus_read_lock();
-
- if (!cpu_online(target_cpu)) {
- cpus_read_unlock();
+ if (!cpu_online(target_cpu))
return -EINVAL;
- }
/*
- * Synchronizes target_cpu_store() and channel closure:
+ * Synchronizes vmbus_channel_set_cpu() and channel closure:
*
* { Initially: state = CHANNEL_OPENED }
*
* CPU1 CPU2
*
- * [target_cpu_store()] [vmbus_disconnect_ring()]
+ * [vmbus_channel_set_cpu()] [vmbus_disconnect_ring()]
*
* LOCK channel_mutex LOCK channel_mutex
* LOAD r1 = state LOAD r2 = state
@@ -1660,7 +1655,6 @@ static ssize_t target_cpu_store(struct vmbus_channel *channel,
* Note. The host processes the channel messages "sequentially", in
* the order in which they are received on a per-partition basis.
*/
- mutex_lock(&vmbus_connection.channel_mutex);
/*
* Hyper-V will ignore MODIFYCHANNEL messages for "non-open" channels;
@@ -1668,17 +1662,17 @@ static ssize_t target_cpu_store(struct vmbus_channel *channel,
*/
if (channel->state != CHANNEL_OPENED_STATE) {
ret = -EIO;
- goto cpu_store_unlock;
+ goto end;
}
origin_cpu = channel->target_cpu;
if (target_cpu == origin_cpu)
- goto cpu_store_unlock;
+ goto end;
if (vmbus_send_modifychannel(channel,
hv_cpu_number_to_vp_number(target_cpu))) {
ret = -EIO;
- goto cpu_store_unlock;
+ goto end;
}
/*
@@ -1708,9 +1702,25 @@ static ssize_t target_cpu_store(struct vmbus_channel *channel,
origin_cpu, target_cpu);
}
-cpu_store_unlock:
- mutex_unlock(&vmbus_connection.channel_mutex);
+end:
+ return ret;
+}
+
+static ssize_t target_cpu_store(struct vmbus_channel *channel,
+ const char *buf, size_t count)
+{
+ ssize_t ret = count;
+ u32 target_cpu;
+
+ if (sscanf(buf, "%uu", &target_cpu) != 1)
+ return -EIO;
+
+ mutex_lock(&vmbus_connection.channel_mutex);
+ cpus_read_lock();
+ ret = vmbus_channel_set_cpu(channel, target_cpu);
cpus_read_unlock();
+ mutex_unlock(&vmbus_connection.channel_mutex);
+
return ret;
}
static VMBUS_CHAN_ATTR(cpu, 0644, target_cpu_show, target_cpu_store);
diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h
index 02a226bcf0ed..25e9e982f1b0 100644
--- a/include/linux/hyperv.h
+++ b/include/linux/hyperv.h
@@ -1670,6 +1670,7 @@ int vmbus_send_tl_connect_request(const guid_t *shv_guest_servie_id,
const guid_t *shv_host_servie_id);
int vmbus_send_modifychannel(struct vmbus_channel *channel, u32 target_vp);
void vmbus_set_event(struct vmbus_channel *channel);
+int vmbus_channel_set_cpu(struct vmbus_channel *channel, u32 target_cpu);
/* Get the start of the ring buffer. */
static inline void *
--
2.47.1
* [PATCH v2 2/2] drivers/hv: add CPU offlining support
2025-01-10 21:59 [PATCH v2 1/2] drivers/hv: introduce vmbus_channel_set_cpu() Hamza Mahfooz
@ 2025-01-10 21:59 ` Hamza Mahfooz
2025-01-14 2:43 ` Michael Kelley
2025-01-13 13:01 ` [PATCH v2 1/2] drivers/hv: introduce vmbus_channel_set_cpu() kernel test robot
2025-01-14 2:43 ` Michael Kelley
2 siblings, 1 reply; 7+ messages in thread
From: Hamza Mahfooz @ 2025-01-10 21:59 UTC (permalink / raw)
To: linux-hyperv
Cc: Hamza Mahfooz, Boqun Feng, Wei Liu, K. Y. Srinivasan,
Haiyang Zhang, Dexuan Cui, linux-kernel
Currently, it is effectively impossible to offline CPUs, since most
CPUs will have vmbus channels attached to them. So, as mentioned in
commit d570aec0f2154 ("Drivers: hv: vmbus: Synchronize init_vp_index()
vs. CPU hotplug"), rebind channels associated with CPUs that a user is
trying to offline to a new "randomly" selected CPU.
Cc: Boqun Feng <boqun.feng@gmail.com>
Cc: Wei Liu <wei.liu@kernel.org>
Signed-off-by: Hamza Mahfooz <hamzamahfooz@linux.microsoft.com>
---
v2: remove cpus_read_{un,}lock() from hv_pick_new_cpu() and add
lockdep_assert_cpus_held().
---
drivers/hv/hv.c | 56 ++++++++++++++++++++++++++++++++++++-------------
1 file changed, 41 insertions(+), 15 deletions(-)
diff --git a/drivers/hv/hv.c b/drivers/hv/hv.c
index 36d9ba097ff5..9fef71403c86 100644
--- a/drivers/hv/hv.c
+++ b/drivers/hv/hv.c
@@ -433,13 +433,39 @@ static bool hv_synic_event_pending(void)
return pending;
}
+static int hv_pick_new_cpu(struct vmbus_channel *channel,
+ unsigned int current_cpu)
+{
+ int ret = 0;
+ int cpu;
+
+ lockdep_assert_cpus_held();
+ lockdep_assert_held(&vmbus_connection.channel_mutex);
+
+ /*
+ * We can't assume that the relevant interrupts will be sent before
+ * the cpu is offlined on older versions of hyperv.
+ */
+ if (vmbus_proto_version < VERSION_WIN10_V5_3)
+ return -EBUSY;
+
+ cpu = cpumask_next(get_random_u32_below(nr_cpu_ids), cpu_online_mask);
+
+ if (cpu >= nr_cpu_ids || cpu == current_cpu)
+ cpu = VMBUS_CONNECT_CPU;
+
+ ret = vmbus_channel_set_cpu(channel, cpu);
+
+ return ret;
+}
+
/*
* hv_synic_cleanup - Cleanup routine for hv_synic_init().
*/
int hv_synic_cleanup(unsigned int cpu)
{
struct vmbus_channel *channel, *sc;
- bool channel_found = false;
+ int ret = 0;
if (vmbus_connection.conn_state != CONNECTED)
goto always_cleanup;
@@ -456,31 +482,31 @@ int hv_synic_cleanup(unsigned int cpu)
/*
* Search for channels which are bound to the CPU we're about to
- * cleanup. In case we find one and vmbus is still connected, we
- * fail; this will effectively prevent CPU offlining.
- *
- * TODO: Re-bind the channels to different CPUs.
+ * cleanup.
*/
mutex_lock(&vmbus_connection.channel_mutex);
list_for_each_entry(channel, &vmbus_connection.chn_list, listentry) {
if (channel->target_cpu == cpu) {
- channel_found = true;
- break;
+ ret = hv_pick_new_cpu(channel, cpu);
+
+ if (ret) {
+ mutex_unlock(&vmbus_connection.channel_mutex);
+ return ret;
+ }
}
list_for_each_entry(sc, &channel->sc_list, sc_list) {
if (sc->target_cpu == cpu) {
- channel_found = true;
- break;
+ ret = hv_pick_new_cpu(channel, cpu);
+
+ if (ret) {
+ mutex_unlock(&vmbus_connection.channel_mutex);
+ return ret;
+ }
}
}
- if (channel_found)
- break;
}
mutex_unlock(&vmbus_connection.channel_mutex);
- if (channel_found)
- return -EBUSY;
-
/*
* channel_found == false means that any channels that were previously
* assigned to the CPU have been reassigned elsewhere with a call of
@@ -497,5 +523,5 @@ int hv_synic_cleanup(unsigned int cpu)
hv_synic_disable_regs(cpu);
- return 0;
+ return ret;
}
--
2.47.1
* Re: [PATCH v2 1/2] drivers/hv: introduce vmbus_channel_set_cpu()
2025-01-10 21:59 [PATCH v2 1/2] drivers/hv: introduce vmbus_channel_set_cpu() Hamza Mahfooz
2025-01-10 21:59 ` [PATCH v2 2/2] drivers/hv: add CPU offlining support Hamza Mahfooz
@ 2025-01-13 13:01 ` kernel test robot
2025-01-14 2:43 ` Michael Kelley
2 siblings, 0 replies; 7+ messages in thread
From: kernel test robot @ 2025-01-13 13:01 UTC (permalink / raw)
To: Hamza Mahfooz, linux-hyperv
Cc: oe-kbuild-all, Hamza Mahfooz, Boqun Feng, Wei Liu,
K. Y. Srinivasan, Haiyang Zhang, Dexuan Cui, linux-kernel
Hi Hamza,
kernel test robot noticed the following build errors:
[auto build test ERROR on linus/master]
[also build test ERROR on v6.13-rc7 next-20250113]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]
url: https://github.com/intel-lab-lkp/linux/commits/Hamza-Mahfooz/drivers-hv-add-CPU-offlining-support/20250111-060127
base: linus/master
patch link: https://lore.kernel.org/r/20250110215951.175514-1-hamzamahfooz%40linux.microsoft.com
patch subject: [PATCH v2 1/2] drivers/hv: introduce vmbus_channel_set_cpu()
compiler: gcc-12 (Debian 12.2.0-14) 12.2.0
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20250113/202501132015.M5PAMVtb-lkp@intel.com/reproduce)
If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202501132015.M5PAMVtb-lkp@intel.com/
All errors (new ones prefixed by >>):
Makefile:60: warning: overriding recipe for target 'emit_tests'
../lib.mk:182: warning: ignoring old recipe for target 'emit_tests'
make[1]: *** No targets. Stop.
>> Makefile:47: *** Cannot find a vmlinux for VMLINUX_BTF at any of " ../../../../vmlinux /sys/kernel/btf/vmlinux /boot/vmlinux-5.9.0-2-amd64". Stop.
make[1]: *** No targets. Stop.
make[1]: *** No targets. Stop.
vim +47 Makefile
3812b8c5c5d527 Masahiro Yamada 2019-02-22 46
3812b8c5c5d527 Masahiro Yamada 2019-02-22 @47 # Do not use make's built-in rules and variables
3812b8c5c5d527 Masahiro Yamada 2019-02-22 48 # (this increases performance and avoids hard-to-debug behaviour)
3812b8c5c5d527 Masahiro Yamada 2019-02-22 49 MAKEFLAGS += -rR
3812b8c5c5d527 Masahiro Yamada 2019-02-22 50
--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki
* RE: [PATCH v2 1/2] drivers/hv: introduce vmbus_channel_set_cpu()
2025-01-10 21:59 [PATCH v2 1/2] drivers/hv: introduce vmbus_channel_set_cpu() Hamza Mahfooz
2025-01-10 21:59 ` [PATCH v2 2/2] drivers/hv: add CPU offlining support Hamza Mahfooz
2025-01-13 13:01 ` [PATCH v2 1/2] drivers/hv: introduce vmbus_channel_set_cpu() kernel test robot
@ 2025-01-14 2:43 ` Michael Kelley
2 siblings, 0 replies; 7+ messages in thread
From: Michael Kelley @ 2025-01-14 2:43 UTC (permalink / raw)
To: Hamza Mahfooz, linux-hyperv@vger.kernel.org
Cc: Boqun Feng, Wei Liu, K. Y. Srinivasan, Haiyang Zhang, Dexuan Cui,
linux-kernel@vger.kernel.org
From: Hamza Mahfooz <hamzamahfooz@linux.microsoft.com> Sent: Friday, January 10, 2025 2:00 PM
>
> I would like to reuse target_cpu_store() within the driver. So, move
> most of it's body into vmbus_channel_set_cpu().
Need a more focused commit message. Don't use personal pronouns
like "I". I would suggest something like:
The core functionality in target_cpu_store() is also needed in a
subsequent patch for automatically changing the CPU when taking
a CPU offline. As such, factor out the body of target_cpu_store()
into new function vmbus_channel_set_cpu() that can also be used
elsewhere.
No functional change is intended.
>
> Cc: Boqun Feng <boqun.feng@gmail.com>
> Cc: Wei Liu <wei.liu@kernel.org>
> Signed-off-by: Hamza Mahfooz <hamzamahfooz@linux.microsoft.com>
> ---
> v2: separate vmbus_channel_set_cpu() changes from
> cpu offlining changes.
> ---
> drivers/hv/vmbus_drv.c | 52 +++++++++++++++++++++++++-----------------
> include/linux/hyperv.h | 1 +
> 2 files changed, 32 insertions(+), 21 deletions(-)
>
> diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c
> index 2892b8da20a5..001e64fb8d43 100644
> --- a/drivers/hv/vmbus_drv.c
> +++ b/drivers/hv/vmbus_drv.c
> @@ -1611,16 +1611,16 @@ static ssize_t target_cpu_show(struct vmbus_channel *channel, char *buf)
> {
> return sprintf(buf, "%u\n", channel->target_cpu);
> }
> -static ssize_t target_cpu_store(struct vmbus_channel *channel,
> - const char *buf, size_t count)
> +
> +int vmbus_channel_set_cpu(struct vmbus_channel *channel, u32 target_cpu)
> {
> - u32 target_cpu, origin_cpu;
> - ssize_t ret = count;
> + u32 origin_cpu;
> + int ret = 0;
>
> - if (vmbus_proto_version < VERSION_WIN10_V4_1)
> - return -EIO;
> + lockdep_assert_cpus_held();
> + lockdep_assert_held(&vmbus_connection.channel_mutex);
>
> - if (sscanf(buf, "%uu", &target_cpu) != 1)
> + if (vmbus_proto_version < VERSION_WIN10_V4_1)
> return -EIO;
>
> /* Validate target_cpu for the cpumask_test_cpu() operation below. */
> @@ -1630,22 +1630,17 @@ static ssize_t target_cpu_store(struct vmbus_channel *channel,
> if (!cpumask_test_cpu(target_cpu, housekeeping_cpumask(HK_TYPE_MANAGED_IRQ)))
> return -EINVAL;
>
> - /* No CPUs should come up or down during this. */
> - cpus_read_lock();
> -
> - if (!cpu_online(target_cpu)) {
> - cpus_read_unlock();
> + if (!cpu_online(target_cpu))
> return -EINVAL;
> - }
>
> /*
> - * Synchronizes target_cpu_store() and channel closure:
> + * Synchronizes vmbus_channel_set_cpu() and channel closure:
> *
> * { Initially: state = CHANNEL_OPENED }
> *
> * CPU1 CPU2
> *
> - * [target_cpu_store()] [vmbus_disconnect_ring()]
> + * [vmbus_channel_set_cpu()] [vmbus_disconnect_ring()]
> *
> * LOCK channel_mutex LOCK channel_mutex
> * LOAD r1 = state LOAD r2 = state
> @@ -1660,7 +1655,6 @@ static ssize_t target_cpu_store(struct vmbus_channel *channel,
> * Note. The host processes the channel messages "sequentially", in
> * the order in which they are received on a per-partition basis.
> */
> - mutex_lock(&vmbus_connection.channel_mutex);
>
> /*
> * Hyper-V will ignore MODIFYCHANNEL messages for "non-open" channels;
> @@ -1668,17 +1662,17 @@ static ssize_t target_cpu_store(struct vmbus_channel *channel,
> */
> if (channel->state != CHANNEL_OPENED_STATE) {
> ret = -EIO;
> - goto cpu_store_unlock;
> + goto end;
> }
>
> origin_cpu = channel->target_cpu;
> if (target_cpu == origin_cpu)
> - goto cpu_store_unlock;
> + goto end;
>
> if (vmbus_send_modifychannel(channel,
> hv_cpu_number_to_vp_number(target_cpu))) {
> ret = -EIO;
> - goto cpu_store_unlock;
> + goto end;
> }
>
> /*
> @@ -1708,9 +1702,25 @@ static ssize_t target_cpu_store(struct vmbus_channel *channel,
> origin_cpu, target_cpu);
> }
>
> -cpu_store_unlock:
> - mutex_unlock(&vmbus_connection.channel_mutex);
> +end:
> + return ret;
> +}
> +
> +static ssize_t target_cpu_store(struct vmbus_channel *channel,
> + const char *buf, size_t count)
> +{
> + ssize_t ret = count;
> + u32 target_cpu;
> +
> + if (sscanf(buf, "%uu", &target_cpu) != 1)
> + return -EIO;
> +
> + mutex_lock(&vmbus_connection.channel_mutex);
> + cpus_read_lock();
These locks are in the wrong order. Must do cpus_read_lock()
first, then lock the channel_mutex, to be consistent with the
pattern in hv_synic_cleanup(), which is entered with
cpus_read_lock() already held.
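I.e., something like this in target_cpu_store() (untested, just to
illustrate the intended ordering):

	cpus_read_lock();
	mutex_lock(&vmbus_connection.channel_mutex);
	ret = vmbus_channel_set_cpu(channel, target_cpu);
	mutex_unlock(&vmbus_connection.channel_mutex);
	cpus_read_unlock();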
Michael
> + ret = vmbus_channel_set_cpu(channel, target_cpu);
> cpus_read_unlock();
> + mutex_unlock(&vmbus_connection.channel_mutex);
> +
> return ret;
> }
> static VMBUS_CHAN_ATTR(cpu, 0644, target_cpu_show, target_cpu_store);
> diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h
> index 02a226bcf0ed..25e9e982f1b0 100644
> --- a/include/linux/hyperv.h
> +++ b/include/linux/hyperv.h
> @@ -1670,6 +1670,7 @@ int vmbus_send_tl_connect_request(const guid_t *shv_guest_servie_id,
> const guid_t *shv_host_servie_id);
> int vmbus_send_modifychannel(struct vmbus_channel *channel, u32 target_vp);
> void vmbus_set_event(struct vmbus_channel *channel);
> +int vmbus_channel_set_cpu(struct vmbus_channel *channel, u32 target_cpu);
>
> /* Get the start of the ring buffer. */
> static inline void *
> --
> 2.47.1
>
* RE: [PATCH v2 2/2] drivers/hv: add CPU offlining support
2025-01-10 21:59 ` [PATCH v2 2/2] drivers/hv: add CPU offlining support Hamza Mahfooz
@ 2025-01-14 2:43 ` Michael Kelley
2025-01-15 17:22 ` Hamza Mahfooz
0 siblings, 1 reply; 7+ messages in thread
From: Michael Kelley @ 2025-01-14 2:43 UTC (permalink / raw)
To: Hamza Mahfooz, linux-hyperv@vger.kernel.org
Cc: Boqun Feng, Wei Liu, K. Y. Srinivasan, Haiyang Zhang, Dexuan Cui,
linux-kernel@vger.kernel.org
From: Hamza Mahfooz <hamzamahfooz@linux.microsoft.com> Sent: Friday, January 10, 2025 2:00 PM
>
> Currently, it is effectively impossible to offline CPUs. Since, most
> CPUs will have vmbus channels attached to them. So, as made mention of
> in commit d570aec0f2154 ("Drivers: hv: vmbus: Synchronize
> init_vp_index() vs. CPU hotplug"), rebind channels associated with CPUs
> that a user is trying to offline to a new "randomly" selected CPU.
Let me provide some additional context and thoughts about the new
functionality proposed in this patch set.
1. I would somewhat challenge the commit message statement that
"it is effectively impossible to offline CPUs". VMBus device interrupts
can be assigned to a different CPU via a /sys interface and the code
in target_cpu_store(). So a CPU *can* be taken offline by first reassigning
any VMBus device interrupts, and then the offlining operation will succeed.
That reassigning requires manual sysadmin actions or some scripting,
which isn't super easy or automatic, but it's not "effectively impossible".
2. As background, when a CPU goes offline, the Linux kernel already has
functionality to reassign unmanaged IRQs that are assigned to the CPU
going offline. (Managed IRQs are just shut down.) See fixup_irqs().
Unfortunately, VMBus device interrupts are not modelled as Linux IRQs,
so the existing mechanism is not applied to VMBus devices.
3. In light of #2 and for other reasons, I submitted a patch set in June 2024
that models VMBus device interrupts as Linux IRQs. See [1]. This patch set
got feedback from Thomas Gleixner about how to demultiplex the IRQs, but
no one from Microsoft gave feedback on the overall idea. I think it would
be worthwhile to pursue these patches, but I would like to get some
macro-level thoughts from the Microsoft folks. There are implications for
things such as irqbalance.
4. As the cover letter in my patch set notes, there's still a problem with
the automatic Linux IRQ reassignment mechanism for the new VMBus IRQs.
The cover letter doesn't give full details, but the problem is ultimately due
to needing to get an ack from Hyper-V that the change in VMBus device
interrupt assignment has been completed. I have investigated alternatives
for making it work, but they are all somewhat convoluted. Nevertheless,
if we want to move forward with the patch set, further work on these
alternatives would be warranted.
5. In May 2020, Andrea Parri worked on a patch set that does what this
patch set does -- automatically reassign VMBus device interrupts when
a CPU tries to go offline. That patch set took a broader focus on making a
smart decision about the CPU to which to assign the interrupt in several
different circumstances, one of which was offlining a CPU. It was
somewhat complex and posted as an RFC [2]. I think Andrea ended up
having to work on some other things, and the patch set was not pursued
after the initial posting. It might be worthwhile to review it for comparison
purposes, or maybe it's worth reviving.
All of that is to say that I think there are two paths forward:
A. The quicker fix is to take the approach of this patch set and continue
handling VMBus device interrupts outside of the Linux IRQ mechanism.
Do the automatic reassignment when taking a CPU offline, as coded
in this patch. Andrea Parri's old patch set might have something to add
to this approach, if just for comparison purposes.
B. Take a broader look at the problem by going back to my patch set
that models VMBus device interrupts as Linux IRQs. Work to get
the existing Linux IRQ reassignment mechanism to work for the new
VMBus IRQs. This approach will probably take longer than (A).
I lean toward (B) because it converges with standard Linux IRQs, but I
don't know what's driving doing (A). If there's need to do (A) sooner,
see my comments in the code below. I'm less inclined to add the
complexity of Andrea Parri's old patch set because I think it takes
us even further down the path of doing custom VMBus-related
work when we would do better to converge toward existing Linux
IRQ mechanisms.
[1] https://lore.kernel.org/linux-hyperv/20240604050940.859909-1-mhklinux@outlook.com/
[2] https://lore.kernel.org/linux-hyperv/20200526223218.184057-1-parri.andrea@gmail.com/
>
> Cc: Boqun Feng <boqun.feng@gmail.com>
> Cc: Wei Liu <wei.liu@kernel.org>
> Signed-off-by: Hamza Mahfooz <hamzamahfooz@linux.microsoft.com>
> ---
> v2: remove cpus_read_{un,}lock() from hv_pick_new_cpu() and add
> lockdep_assert_cpus_held().
> ---
> drivers/hv/hv.c | 56 ++++++++++++++++++++++++++++++++++++-------------
> 1 file changed, 41 insertions(+), 15 deletions(-)
>
> diff --git a/drivers/hv/hv.c b/drivers/hv/hv.c
> index 36d9ba097ff5..9fef71403c86 100644
> --- a/drivers/hv/hv.c
> +++ b/drivers/hv/hv.c
> @@ -433,13 +433,39 @@ static bool hv_synic_event_pending(void)
> return pending;
> }
>
> +static int hv_pick_new_cpu(struct vmbus_channel *channel,
> + unsigned int current_cpu)
> +{
> + int ret = 0;
> + int cpu;
> +
> + lockdep_assert_cpus_held();
> + lockdep_assert_held(&vmbus_connection.channel_mutex);
> +
> + /*
> + * We can't assume that the relevant interrupts will be sent before
> + * the cpu is offlined on older versions of hyperv.
> + */
> + if (vmbus_proto_version < VERSION_WIN10_V5_3)
> + return -EBUSY;
I'm not sure why this test is here. The function vmbus_set_channel_cpu()
tests the vmbus_proto_version against V4_1 and returns an appropriate
error. Do we *need* to filter against V5_3 instead of V4_1?
> +
> + cpu = cpumask_next(get_random_u32_below(nr_cpu_ids), cpu_online_mask);
> +
> + if (cpu >= nr_cpu_ids || cpu == current_cpu)
> + cpu = VMBUS_CONNECT_CPU;
Picking a random CPU like this seems to have some problems:
1. The selected CPU might be an isolated CPU, in which case the
call to vmbus_channel_set_cpu() will return an error, and the
attempt to take the CPU offline will eventually fail. But if you try
again to take the CPU offline, a different random CPU may be
chosen that isn't an isolated CPU, and taking the CPU offline
will succeed. Such inconsistent behavior should be avoided.
2. I wonder if we should try to choose a CPU in the same NUMA node
as "current_cpu". The Linux IRQ mechanism has the concept of CPU
affinity for an IRQ, which can express the NUMA affinity. The normal
Linux reassignment mechanism obeys the IRQ's affinity if possible,
and so would do the right thing for NUMA. So we need to consider
whether to do that here as well.
3. The handling of the current_cpu feels a bit hacky. There's
also no wrap-around in the mask search. Together, I think that
creates a small bias toward choosing the VMBUS_CONNECT_CPU,
which is arguably already somewhat overloaded because all the
low-speed devices use it. I haven't tried to look for alternative
approaches to suggest.
Michael
> +
> + ret = vmbus_channel_set_cpu(channel, cpu);
> +
> + return ret;
> +}
> +
> /*
> * hv_synic_cleanup - Cleanup routine for hv_synic_init().
> */
> int hv_synic_cleanup(unsigned int cpu)
> {
> struct vmbus_channel *channel, *sc;
> - bool channel_found = false;
> + int ret = 0;
>
> if (vmbus_connection.conn_state != CONNECTED)
> goto always_cleanup;
> @@ -456,31 +482,31 @@ int hv_synic_cleanup(unsigned int cpu)
>
> /*
> * Search for channels which are bound to the CPU we're about to
> - * cleanup. In case we find one and vmbus is still connected, we
> - * fail; this will effectively prevent CPU offlining.
> - *
> - * TODO: Re-bind the channels to different CPUs.
> + * cleanup.
> */
> mutex_lock(&vmbus_connection.channel_mutex);
> list_for_each_entry(channel, &vmbus_connection.chn_list, listentry) {
> if (channel->target_cpu == cpu) {
> - channel_found = true;
> - break;
> + ret = hv_pick_new_cpu(channel, cpu);
> +
> + if (ret) {
> + mutex_unlock(&vmbus_connection.channel_mutex);
> + return ret;
> + }
> }
> list_for_each_entry(sc, &channel->sc_list, sc_list) {
> if (sc->target_cpu == cpu) {
> - channel_found = true;
> - break;
> + ret = hv_pick_new_cpu(channel, cpu);
> +
> + if (ret) {
> + mutex_unlock(&vmbus_connection.channel_mutex);
> + return ret;
> + }
> }
> }
> - if (channel_found)
> - break;
> }
> mutex_unlock(&vmbus_connection.channel_mutex);
>
> - if (channel_found)
> - return -EBUSY;
> -
> /*
> * channel_found == false means that any channels that were previously
> * assigned to the CPU have been reassigned elsewhere with a call of
> @@ -497,5 +523,5 @@ int hv_synic_cleanup(unsigned int cpu)
>
> hv_synic_disable_regs(cpu);
>
> - return 0;
> + return ret;
> }
> --
> 2.47.1
>
* Re: [PATCH v2 2/2] drivers/hv: add CPU offlining support
2025-01-14 2:43 ` Michael Kelley
@ 2025-01-15 17:22 ` Hamza Mahfooz
2025-01-15 21:02 ` Michael Kelley
0 siblings, 1 reply; 7+ messages in thread
From: Hamza Mahfooz @ 2025-01-15 17:22 UTC (permalink / raw)
To: Michael Kelley
Cc: linux-hyperv@vger.kernel.org, Boqun Feng, Wei Liu,
K. Y. Srinivasan, Haiyang Zhang, Dexuan Cui,
linux-kernel@vger.kernel.org
On Tue, Jan 14, 2025 at 02:43:33AM +0000, Michael Kelley wrote:
> From: Hamza Mahfooz <hamzamahfooz@linux.microsoft.com> Sent: Friday, January 10, 2025 2:00 PM
> >
> > Currently, it is effectively impossible to offline CPUs. Since, most
> > CPUs will have vmbus channels attached to them. So, as made mention of
> > in commit d570aec0f2154 ("Drivers: hv: vmbus: Synchronize
> > init_vp_index() vs. CPU hotplug"), rebind channels associated with CPUs
> > that a user is trying to offline to a new "randomly" selected CPU.
>
> Let me provide some additional context and thoughts about the new
> functionality proposed in this patch set.
>
> 1. I would somewhat challenge the commit message statement that
> "it is effectively impossible to offline CPUs". VMBus device interrupts
> can be assigned to a different CPU via a /sys interface and the code
> in target_cpu_store(). So a CPU *can* be taken offline by first reassigning
> any VMBus device interrupts, and then the offlining operation will succeed.
> That reassigning requires manual sysadmin actions or some scripting,
> which isn't super easy or automatic, but it's not "effectively impossible".
Fair enough.
>
> 2. As background, when a CPU goes offline, the Linux kernel already has
> functionality to reassign unmanaged IRQs that are assigned to the CPU
> going offline. (Managed IRQs are just shut down.) See fixup_irqs().
> Unfortunately, VMBus device interrupts are not modelled as Linux IRQs,
> so the existing mechanism is not applied to VMBus devices.
>
> 3. In light of #2 and for other reasons, I submitted a patch set in June 2024
> that models VMBus device interrupts as Linux IRQs. See [1]. This patch set
> got feedback from Thomas Gleixner about how to demultiplex the IRQs, but
> no one from Microsoft gave feedback on the overall idea. I think it would
> be worthwhile to pursue these patches, but I would like to get some
> macro-level thoughts from the Microsoft folks. There are implications for
> things such as irqbalance.
>
> 4. As the cover letter in my patch set notes, there's still a problem with
> the automatic Linux IRQ reassignment mechanism for the new VMBus IRQs.
> The cover letter doesn't give full details, but the problem is ultimately due
> to needing to get an ack from Hyper-V that the change in VMBus device
> interrupt assignment has been completed. I have investigated alternatives
> for making it work, but they are all somewhat convoluted. Nevertheless,
> if we want to move forward with the patch set, further work on these
> alternatives would be warranted.
>
> 5. In May 2020, Andrea Parri worked on a patch set that does what this
> patch set does -- automatically reassign VMBus device interrupts when
> a CPU tries to go offline. That patch set took a broader focus on making a
> smart decision about the CPU to which to assign the interrupt in several
> different circumstances, one of which was offlining a CPU. It was
> somewhat complex and posted as an RFC [2]. I think Andrea ended up
> having to work on some other things, and the patch set was not pursued
> after the initial posting. It might be worthwhile to review it for comparison
> purposes, or maybe it's worth reviving.
>
> All of that is to say that I think there are two paths forward:
>
> A. The quicker fix is to take the approach of this patch set and continue
> handling VMBus device interrupts outside of the Linux IRQ mechanism.
> Do the automatic reassignment when taking a CPU offline, as coded
> in this patch. Andrea Parri's old patch set might have something to add
> to this approach, if just for comparison purposes.
>
> B. Take a broader look at the problem by going back to my patch set
> that models VMBus device interrupts as Linux IRQs. Work to get
> the existing Linux IRQ reassignment mechanism to work for the new
> VMBus IRQs. This approach will probably take longer than (A).
>
> I lean toward (B) because it converges with standard Linux IRQs, but I
> don't know what's driving doing (A). If there's need to do (A) sooner,
> see my comments in the code below. I'm less inclined to add the
> complexity of Andrea Parri's old patch set because I think it takes
> us even further down the path of doing custom VMBus-related
> work when we would do better to converge toward existing Linux
> IRQ mechanisms.
I would prefer (B) as well, though as you said it will take longer.
So, I think my series is a reasonable stopgap until we get there.
>
> [1] https://lore.kernel.org/linux-hyperv/20240604050940.859909-1-mhklinux@outlook.com/
> [2] https://lore.kernel.org/linux-hyperv/20200526223218.184057-1-parri.andrea@gmail.com/
>
> >
> > Cc: Boqun Feng <boqun.feng@gmail.com>
> > Cc: Wei Liu <wei.liu@kernel.org>
> > Signed-off-by: Hamza Mahfooz <hamzamahfooz@linux.microsoft.com>
> > ---
> > v2: remove cpus_read_{un,}lock() from hv_pick_new_cpu() and add
> > lockdep_assert_cpus_held().
> > ---
> > drivers/hv/hv.c | 56 ++++++++++++++++++++++++++++++++++++-------------
> > 1 file changed, 41 insertions(+), 15 deletions(-)
> >
> > diff --git a/drivers/hv/hv.c b/drivers/hv/hv.c
> > index 36d9ba097ff5..9fef71403c86 100644
> > --- a/drivers/hv/hv.c
> > +++ b/drivers/hv/hv.c
> > @@ -433,13 +433,39 @@ static bool hv_synic_event_pending(void)
> > return pending;
> > }
> >
> > +static int hv_pick_new_cpu(struct vmbus_channel *channel,
> > + unsigned int current_cpu)
> > +{
> > + int ret = 0;
> > + int cpu;
> > +
> > + lockdep_assert_cpus_held();
> > + lockdep_assert_held(&vmbus_connection.channel_mutex);
> > +
> > + /*
> > + * We can't assume that the relevant interrupts will be sent before
> > + * the cpu is offlined on older versions of hyperv.
> > + */
> > + if (vmbus_proto_version < VERSION_WIN10_V5_3)
> > + return -EBUSY;
>
> I'm not sure why this test is here. The function vmbus_set_channel_cpu()
> tests the vmbus_proto_version against V4_1 and returns an appropriate
> error. Do we *need* to filter against V5_3 instead of V4_1?
Yes, please see:
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/drivers/hv/vmbus_drv.c#n1685
>
> > +
> > + cpu = cpumask_next(get_random_u32_below(nr_cpu_ids), cpu_online_mask);
> > +
> > + if (cpu >= nr_cpu_ids || cpu == current_cpu)
> > + cpu = VMBUS_CONNECT_CPU;
>
> Picking a random CPU like this seems to have some problems:
>
> 1. The selected CPU might be an isolated CPU, in which case the
> call to vmbus_channel_set_cpu() will return an error, and the
> attempt to take the CPU offline will eventually fail. But if you try
> again to take the CPU offline, a different random CPU may be
> chosen that isn't an isolated CPU, and taking the CPU offline
> will succeed. Such inconsistent behavior should be avoided.
>
> 2. I wonder if we should try to choose a CPU in the same NUMA node
> as "current_cpu". The Linux IRQ mechanism has the concept of CPU
> affinity for an IRQ, which can express the NUMA affinity. The normal
> Linux reassignment mechanism obeys the IRQ's affinity if possible,
> and so would do the right thing for NUMA. So we need to consider
> whether to do that here as well.
That sounds good to me.
>
> 3. The handling of the current_cpu feels a bit hacky. There's
> also no wrap-around in the mask search. Together, I think that
> creates a small bias toward choosing the VMBUS_CONNECT_CPU,
> which is arguably already somewhat overloaded because all the
> low-speed devices use it. I haven't tried to look for alternative
> approaches to suggest.
Yeah, I noticed that as well, but I didn't want to overcomplicate the
selection heuristic. Though I guess having it wrap around isn't too
involved.
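Maybe something along these lines (untested sketch; it wraps around and
skips isolated CPUs per your point 1, but doesn't attempt the NUMA
awareness from point 2; it also assumes hv.c can use
housekeeping_cpumask(), i.e. <linux/sched/isolation.h> is available):

	unsigned int i, cpu;

	cpu = get_random_u32_below(nr_cpu_ids);
	for (i = 0; i < nr_cpu_ids; i++) {
		cpu = cpumask_next(cpu, cpu_online_mask);
		if (cpu >= nr_cpu_ids) {
			/* wrap around to the first online CPU */
			cpu = cpumask_first(cpu_online_mask);
			if (cpu >= nr_cpu_ids)
				break;
		}
		if (cpu != current_cpu &&
		    cpumask_test_cpu(cpu, housekeeping_cpumask(HK_TYPE_MANAGED_IRQ)))
			return vmbus_channel_set_cpu(channel, cpu);
	}

	/* No suitable CPU found; fall back to the VMBus connect CPU. */
	return vmbus_channel_set_cpu(channel, VMBUS_CONNECT_CPU);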
Hamza
>
> Michael
>
> > +
> > + ret = vmbus_channel_set_cpu(channel, cpu);
> > +
> > + return ret;
> > +}
> > +
> > /*
> > * hv_synic_cleanup - Cleanup routine for hv_synic_init().
> > */
> > int hv_synic_cleanup(unsigned int cpu)
> > {
> > struct vmbus_channel *channel, *sc;
> > - bool channel_found = false;
> > + int ret = 0;
> >
> > if (vmbus_connection.conn_state != CONNECTED)
> > goto always_cleanup;
> > @@ -456,31 +482,31 @@ int hv_synic_cleanup(unsigned int cpu)
> >
> > /*
> > * Search for channels which are bound to the CPU we're about to
> > - * cleanup. In case we find one and vmbus is still connected, we
> > - * fail; this will effectively prevent CPU offlining.
> > - *
> > - * TODO: Re-bind the channels to different CPUs.
> > + * cleanup.
> > */
> > mutex_lock(&vmbus_connection.channel_mutex);
> > list_for_each_entry(channel, &vmbus_connection.chn_list, listentry) {
> > if (channel->target_cpu == cpu) {
> > - channel_found = true;
> > - break;
> > + ret = hv_pick_new_cpu(channel, cpu);
> > +
> > + if (ret) {
> > + mutex_unlock(&vmbus_connection.channel_mutex);
> > + return ret;
> > + }
> > }
> > list_for_each_entry(sc, &channel->sc_list, sc_list) {
> > if (sc->target_cpu == cpu) {
> > - channel_found = true;
> > - break;
> > + ret = hv_pick_new_cpu(channel, cpu);
> > +
> > + if (ret) {
> > + mutex_unlock(&vmbus_connection.channel_mutex);
> > + return ret;
> > + }
> > }
> > }
> > - if (channel_found)
> > - break;
> > }
> > mutex_unlock(&vmbus_connection.channel_mutex);
> >
> > - if (channel_found)
> > - return -EBUSY;
> > -
> > /*
> > * channel_found == false means that any channels that were previously
> > * assigned to the CPU have been reassigned elsewhere with a call of
> > @@ -497,5 +523,5 @@ int hv_synic_cleanup(unsigned int cpu)
> >
> > hv_synic_disable_regs(cpu);
> >
> > - return 0;
> > + return ret;
> > }
> > --
> > 2.47.1
> >
* RE: [PATCH v2 2/2] drivers/hv: add CPU offlining support
2025-01-15 17:22 ` Hamza Mahfooz
@ 2025-01-15 21:02 ` Michael Kelley
0 siblings, 0 replies; 7+ messages in thread
From: Michael Kelley @ 2025-01-15 21:02 UTC (permalink / raw)
To: Hamza Mahfooz
Cc: linux-hyperv@vger.kernel.org, Boqun Feng, Wei Liu,
K. Y. Srinivasan, Haiyang Zhang, Dexuan Cui,
linux-kernel@vger.kernel.org
From: Hamza Mahfooz <hamzamahfooz@linux.microsoft.com> Sent: Wednesday, January 15, 2025 9:23 AM
>
> On Tue, Jan 14, 2025 at 02:43:33AM +0000, Michael Kelley wrote:
> > From: Hamza Mahfooz <hamzamahfooz@linux.microsoft.com> Sent: Friday, January 10, 2025 2:00 PM
> > >
> > > Currently, it is effectively impossible to offline CPUs. Since, most
> > > CPUs will have vmbus channels attached to them. So, as made mention of
> > > in commit d570aec0f2154 ("Drivers: hv: vmbus: Synchronize
> > > init_vp_index() vs. CPU hotplug"), rebind channels associated with CPUs
> > > that a user is trying to offline to a new "randomly" selected CPU.
> >
> > Let me provide some additional context and thoughts about the new
> > functionality proposed in this patch set.
> >
> > 1. I would somewhat challenge the commit message statement that
> > "it is effectively impossible to offline CPUs". VMBus device interrupts
> > can be assigned to a different CPU via a /sys interface and the code
> > in target_cpu_store(). So a CPU *can* be taken offline by first reassigning
> > any VMBus device interrupts, and then the offlining operation will succeed.
> > That reassigning requires manual sysadmin actions or some scripting,
> > which isn't super easy or automatic, but it's not "effectively impossible".
>
> Fair enough.
>
> >
> > 2. As background, when a CPU goes offline, the Linux kernel already has
> > functionality to reassign unmanaged IRQs that are assigned to the CPU
> > going offline. (Managed IRQs are just shut down.) See fixup_irqs().
> > Unfortunately, VMBus device interrupts are not modelled as Linux IRQs,
> > so the existing mechanism is not applied to VMBus devices.
> >
> > 3. In light of #2 and for other reasons, I submitted a patch set in June 2024
> > that models VMBus device interrupts as Linux IRQs. See [1]. This patch set
> > got feedback from Thomas Gleixner about how to demultiplex the IRQs, but
> > no one from Microsoft gave feedback on the overall idea. I think it would
> > be worthwhile to pursue these patches, but I would like to get some
> > macro-level thoughts from the Microsoft folks. There are implications for
> > things such as irqbalance.
> >
> > 4. As the cover letter in my patch set notes, there's still a problem with
> > the automatic Linux IRQ reassignment mechanism for the new VMBus IRQs.
> > The cover letter doesn't give full details, but the problem is ultimately due
> > to needing to get an ack from Hyper-V that the change in VMBus device
> > interrupt assignment has been completed. I have investigated alternatives
> > for making it work, but they are all somewhat convoluted. Nevertheless,
> > if we want to move forward with the patch set, further work on these
> > alternatives would be warranted.
> >
> > 5. In May 2020, Andrea Parri worked on a patch set that does what this
> > patch set does -- automatically reassign VMBus device interrupts when
> > a CPU tries to go offline. That patch set took a broader focus on making a
> > smart decision about the CPU to which to assign the interrupt in several
> > different circumstances, one of which was offlining a CPU. It was
> > somewhat complex and posted as an RFC [2]. I think Andrea ended up
> > having to work on some other things, and the patch set was not pursued
> > after the initial posting. It might be worthwhile to review it for comparison
> > purposes, or maybe it's worth reviving.
> >
> > All of that is to say that I think there are two paths forward:
> >
> > A. The quicker fix is to take the approach of this patch set and continue
> > handling VMBus device interrupts outside of the Linux IRQ mechanism.
> > Do the automatic reassignment when taking a CPU offline, as coded
> > in this patch. Andrea Parri's old patch set might have something to add
> > to this approach, if just for comparison purposes.
> >
> > B. Take a broader look at the problem by going back to my patch set
> > that models VMBus device interrupts as Linux IRQs. Work to get
> > the existing Linux IRQ reassignment mechanism to work for the new
> > VMBus IRQs. This approach will probably take longer than (A).
> >
> > I lean toward (B) because it converges with standard Linux IRQs, but I
> > don't know what's driving doing (A). If there's need to do (A) sooner,
> > see my comments in the code below. I'm less inclined to add the
> > complexity of Andrea Parri's old patch set because I think it takes
> > us even further down the path of doing custom VMBus-related
> > work when we would do better to converge toward existing Linux
> > IRQ mechanisms.
>
> I would prefer (B) as well, though as you said it will take longer.
> So, I think my series is a reasonable stopgap until we get there.
Yes, and I don't fundamentally object to your series on that basis.
Somewhat separately, I'd still like to see if there is consensus to converge
on Linux IRQs as a longer-term change that's worth the trouble.
I was reviewing my notes from 6 months ago on making the existing
fixup_irqs() mechanism work for the putative VMBus device IRQs, and
it will be challenging because of needing to get an ack from Hyper-V.
There's still a possibility that it just won't work in any reasonable way,
in which case your patch set is all we'll have. But I'll remain optimistic
and think that there's a way. :-)
> >
> > [1] https://lore.kernel.org/linux-hyperv/20240604050940.859909-1-mhklinux@outlook.com/
> > [2] https://lore.kernel.org/linux-hyperv/20200526223218.184057-1-parri.andrea@gmail.com/
> >
> > >
> > > Cc: Boqun Feng <boqun.feng@gmail.com>
> > > Cc: Wei Liu <wei.liu@kernel.org>
> > > Signed-off-by: Hamza Mahfooz <hamzamahfooz@linux.microsoft.com>
> > > ---
> > > v2: remove cpus_read_{un,}lock() from hv_pick_new_cpu() and add
> > > lockdep_assert_cpus_held().
> > > ---
> > > drivers/hv/hv.c | 56 ++++++++++++++++++++++++++++++++++++-------------
> > > 1 file changed, 41 insertions(+), 15 deletions(-)
> > >
> > > diff --git a/drivers/hv/hv.c b/drivers/hv/hv.c
> > > index 36d9ba097ff5..9fef71403c86 100644
> > > --- a/drivers/hv/hv.c
> > > +++ b/drivers/hv/hv.c
> > > @@ -433,13 +433,39 @@ static bool hv_synic_event_pending(void)
> > > return pending;
> > > }
> > >
> > > +static int hv_pick_new_cpu(struct vmbus_channel *channel,
> > > + unsigned int current_cpu)
> > > +{
> > > + int ret = 0;
> > > + int cpu;
> > > +
> > > + lockdep_assert_cpus_held();
> > > + lockdep_assert_held(&vmbus_connection.channel_mutex);
> > > +
> > > + /*
> > > + * We can't assume that the relevant interrupts will be sent before
> > > + * the cpu is offlined on older versions of hyperv.
> > > + */
> > > + if (vmbus_proto_version < VERSION_WIN10_V5_3)
> > > + return -EBUSY;
> >
> > I'm not sure why this test is here. The function vmbus_set_channel_cpu()
> > tests the vmbus_proto_version against V4_1 and returns an appropriate
> > error. Do we *need* to filter against V5_3 instead of V4_1?
>
> Yes, please see: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/drivers/hv/vmbus_drv.c#n1685
I knew about that distinction between v4.1+ vs. v5.3+. I was
thinking that explicitly changing the assigned CPU via /sys and
target_cpu_store() should not be any different from changing the
assigned CPU implicitly by taking a CPU offline. But upon further
reflection, that thinking is flawed. Changing the assigned CPU
implicitly by taking a CPU offline is by definition immediately
going to fall into the problem documented in the comment,
whereas changing it explicitly via /sys will not. At least in the
latter case, taking the CPU offline is a separate operation.
But the distinction does produce a functional incongruence.
If running with VMBus version between 4.1 and 5.2 inclusive,
you can change the assigned CPU explicitly via /sys, but not
implicitly by just taking the CPU offline, which is a little weird.
But perhaps that incongruence isn't worth worrying about.
Most users will be on newer versions of Hyper-V and running
with VMBus 5.3 or later.
>
> >
> > > +
> > > + cpu = cpumask_next(get_random_u32_below(nr_cpu_ids), cpu_online_mask);
> > > +
> > > + if (cpu >= nr_cpu_ids || cpu == current_cpu)
> > > + cpu = VMBUS_CONNECT_CPU;
> >
> > Picking a random CPU like this seems to have some problems:
> >
> > 1. The selected CPU might be an isolated CPU, in which case the
> > call to vmbus_channel_set_cpu() will return an error, and the
> > attempt to take the CPU offline will eventually fail. But if you try
> > again to take the CPU offline, a different random CPU may be
> > chosen that isn't an isolated CPU, and taking the CPU offline
> > will succeed. Such inconsistent behavior should be avoided.
> >
> > 2. I wonder if we should try to choose a CPU in the same NUMA node
> > as "current_cpu". The Linux IRQ mechanism has the concept of CPU
> > affinity for an IRQ, which can express the NUMA affinity. The normal
> > Linux reassignment mechanism obeys the IRQ's affinity if possible,
> > and so would do the right thing for NUMA. So we need to consider
> > whether to do that here as well.
>
> That sounds good to me.
>
> >
> > 3. The handling of the current_cpu feels a bit hacky. There's
> > also no wrap-around in the mask search. Together, I think that
> > creates a small bias toward choosing the VMBUS_CONNECT_CPU,
> > which is arguably already somewhat overloaded because all the
> > low-speed devices use it. I haven't tried to look for alternative
> > approaches to suggest.
>
> Ya, I noticed that as well but I didn't want to overcomplicate the
> selection heuristic. Though I guess having it wrap-around isn't too
> involved.
This is the "trap" of having a separate mechanism for VMBus
device interrupts. The complexity can grow in order to handle
isolated CPUs, NUMA topologies, etc. But at least at the
moment that complexity isn't too bad and can be handled.
Michael
>
> Hamza
>
> >
> > Michael
> >
> > > +
> > > + ret = vmbus_channel_set_cpu(channel, cpu);
> > > +
> > > + return ret;
> > > +}
> > > +
> > > /*
> > > * hv_synic_cleanup - Cleanup routine for hv_synic_init().
> > > */
> > > int hv_synic_cleanup(unsigned int cpu)
> > > {
> > > struct vmbus_channel *channel, *sc;
> > > - bool channel_found = false;
> > > + int ret = 0;
> > >
> > > if (vmbus_connection.conn_state != CONNECTED)
> > > goto always_cleanup;
> > > @@ -456,31 +482,31 @@ int hv_synic_cleanup(unsigned int cpu)
> > >
> > > /*
> > > * Search for channels which are bound to the CPU we're about to
> > > - * cleanup. In case we find one and vmbus is still connected, we
> > > - * fail; this will effectively prevent CPU offlining.
> > > - *
> > > - * TODO: Re-bind the channels to different CPUs.
> > > + * cleanup.
> > > */
> > > mutex_lock(&vmbus_connection.channel_mutex);
> > > list_for_each_entry(channel, &vmbus_connection.chn_list, listentry) {
> > > if (channel->target_cpu == cpu) {
> > > - channel_found = true;
> > > - break;
> > > + ret = hv_pick_new_cpu(channel, cpu);
> > > +
> > > + if (ret) {
> > > + mutex_unlock(&vmbus_connection.channel_mutex);
> > > + return ret;
> > > + }
> > > }
> > > list_for_each_entry(sc, &channel->sc_list, sc_list) {
> > > if (sc->target_cpu == cpu) {
> > > - channel_found = true;
> > > - break;
> > > + ret = hv_pick_new_cpu(channel, cpu);
> > > +
> > > + if (ret) {
> > > + mutex_unlock(&vmbus_connection.channel_mutex);
> > > + return ret;
> > > + }
> > > }
> > > }
> > > - if (channel_found)
> > > - break;
> > > }
> > > mutex_unlock(&vmbus_connection.channel_mutex);
> > >
> > > - if (channel_found)
> > > - return -EBUSY;
> > > -
> > > /*
> > > * channel_found == false means that any channels that were previously
> > > * assigned to the CPU have been reassigned elsewhere with a call of
> > > @@ -497,5 +523,5 @@ int hv_synic_cleanup(unsigned int cpu)
> > >
> > > hv_synic_disable_regs(cpu);
> > >
> > > - return 0;
> > > + return ret;
> > > }
> > > --
> > > 2.47.1
> > >