Linux Power Management development

Linux Power Management development
 help / color / mirror / Atom feed

* [RFC] cpuidle : Add support for pseudo-cpuidle driver
From: Abhishek Goel @ 2020-07-23  6:13 UTC (permalink / raw)
  To: linux-pm, linux-kernel; +Cc: rjw, daniel.lezcano, mpe, ego, Abhishek Goel

This option adds support for a testing cpuidle driver, which allows
user to define custom idle states with their respective latencies and
residencies. This is useful for testing the behaviour of governors on
customized set of idle states.

This can be used as of now by hard-coding the customized set of cpuidle
states in the driver. Will add the capability of this driver to be used
as a module in subsequent patches.

Original idea and discussion for this patch can be found at:
https://lkml.org/lkml/2019/12/17/655

Signed-off-by: Abhishek Goel <huntbag@linux.vnet.ibm.com>
---
 drivers/cpuidle/Kconfig        |   9 ++
 drivers/cpuidle/Makefile       |   1 +
 drivers/cpuidle/cpuidle-test.c | 276 +++++++++++++++++++++++++++++++++
 3 files changed, 286 insertions(+)
 create mode 100644 drivers/cpuidle/cpuidle-test.c

diff --git a/drivers/cpuidle/Kconfig b/drivers/cpuidle/Kconfig
index c0aeedd66f02..1d73153a0e35 100644
--- a/drivers/cpuidle/Kconfig
+++ b/drivers/cpuidle/Kconfig
@@ -71,6 +71,15 @@ config HALTPOLL_CPUIDLE
 	 before halting in the guest (more efficient than polling in the
 	 host via halt_poll_ns for some scenarios).
 
+config TEST_CPUIDLE
+	tristate "cpuidle test driver"
+	default m
+	help
+	 This option enables a testing cpuidle driver, which allows to user
+	 to define custom idle states with their respective latencies and residencies.
+	 This is useful for testing the behaviour of governors on different
+	 set of idle states.
+
 endif
 
 config ARCH_NEEDS_CPU_IDLE_COUPLED
diff --git a/drivers/cpuidle/Makefile b/drivers/cpuidle/Makefile
index f07800cbb43f..68ea7dc257b5 100644
--- a/drivers/cpuidle/Makefile
+++ b/drivers/cpuidle/Makefile
@@ -8,6 +8,7 @@ obj-$(CONFIG_ARCH_NEEDS_CPU_IDLE_COUPLED) += coupled.o
 obj-$(CONFIG_DT_IDLE_STATES)		  += dt_idle_states.o
 obj-$(CONFIG_ARCH_HAS_CPU_RELAX)	  += poll_state.o
 obj-$(CONFIG_HALTPOLL_CPUIDLE)		  += cpuidle-haltpoll.o
+obj-$(CONFIG_TEST_CPUIDLE)		  += cpuidle-test.o
 
 ##################################################################################
 # ARM SoC drivers
diff --git a/drivers/cpuidle/cpuidle-test.c b/drivers/cpuidle/cpuidle-test.c
new file mode 100644
index 000000000000..399729440569
--- /dev/null
+++ b/drivers/cpuidle/cpuidle-test.c
@@ -0,0 +1,276 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ *  cpuidle-test - Test driver for cpuidle.
+ *
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/moduleparam.h>
+#include <linux/cpuidle.h>
+#include <linux/cpu.h>
+#include <linux/module.h>
+#include <linux/sched/idle.h>
+#include <linux/sched/clock.h>
+#include <linux/sched/idle.h>
+
+#define CPUIDLE_STATE_MAX	10
+#define MAX_PARAM_LENGTH	100
+
+static unsigned int nr_states = 4;
+static unsigned int sim_type = 1;
+static char name[MAX_PARAM_LENGTH];
+static char latency_us[MAX_PARAM_LENGTH];
+static char residency_us[MAX_PARAM_LENGTH];
+
+
+module_param(nr_states, uint, 0644);
+module_param(sim_type, uint, 0644);
+module_param_string(name, name, MAX_PARAM_LENGTH, 0644);
+module_param_string(latency_us, latency_us, MAX_PARAM_LENGTH, 0644);
+module_param_string(residency_us, residency_us, MAX_PARAM_LENGTH, 0644);
+
+static struct cpuidle_driver test_cpuidle_driver = {
+	.name		= "test_cpuidle",
+	.owner		= THIS_MODULE,
+};
+
+static struct cpuidle_state *cpuidle_state_table __read_mostly;
+
+static struct cpuidle_device __percpu *test_cpuidle_devices;
+static enum cpuhp_state test_hp_idlestate;
+
+
+static int __cpuidle idle_loop(struct cpuidle_device *dev,
+				struct cpuidle_driver *drv,
+				int index)
+{
+	u64 time_start;
+
+	local_irq_enable();
+	if (!current_set_polling_and_test()) {
+		while (!need_resched())
+			cpu_relax();
+	}
+
+	time_start = local_clock();
+
+	while (local_clock() - time_start < drv->states[index].exit_latency)
+
+	current_clr_polling();
+
+	return index;
+}
+
+static struct cpuidle_state cpuidle_states[CPUIDLE_STATE_MAX] = {
+	{ /* Snooze */
+		.name = "snooze",
+		.exit_latency = 0,
+		.target_residency = 0,
+		.enter = idle_loop },
+};
+
+static struct cpuidle_state cpuidle_states_ppc[] = {
+	{	.name = "snooze",
+		.exit_latency = 0,
+		.target_residency = 0,
+		.enter = idle_loop },
+	{
+		.name = "stop0",
+		.exit_latency = 2,
+		.target_residency = 20,
+		.enter = idle_loop },
+	{
+		.name = "stop1",
+		.exit_latency = 5,
+		.target_residency = 50,
+		.enter = idle_loop },
+	{
+		.name = "stop2",
+		.exit_latency = 10,
+		.target_residency = 100,
+		.enter = idle_loop },
+};
+
+static struct cpuidle_state cpuidle_states_intel[] = {
+	{	.name = "poll",
+		.exit_latency = 0,
+		.target_residency = 0,
+		.enter = idle_loop },
+	{
+		.name = "c1",
+		.exit_latency = 2,
+		.target_residency = 2,
+		.enter = idle_loop },
+	{
+		.name = "c1e",
+		.exit_latency = 10,
+		.target_residency = 20,
+		.enter = idle_loop },
+	{
+		.name = "c3",
+		.exit_latency = 80,
+		.target_residency = 211,
+		.enter = idle_loop },
+};
+
+int cpuidle_cpu_online(unsigned int cpu)
+{
+	struct cpuidle_device *dev;
+
+	dev = per_cpu_ptr(test_cpuidle_devices, cpu);
+	if (!dev->registered) {
+		dev->cpu = cpu;
+		if (cpuidle_register_device(dev)) {
+			pr_notice("cpuidle_register_device %d failed!\n", cpu);
+			return -EIO;
+		}
+	}
+
+	return 0;
+}
+
+int cpuidle_cpu_dead(unsigned int cpu)
+{
+	struct cpuidle_device *dev;
+
+	dev = per_cpu_ptr(test_cpuidle_devices, cpu);
+	if (dev->registered)
+		cpuidle_unregister_device(dev);
+
+	return 0;
+}
+
+int cpuidle_driver_init(void)
+{
+	int idle_state;
+	struct cpuidle_driver *drv = &test_cpuidle_driver;
+
+	drv->state_count = 0;
+
+	for (idle_state = 0; idle_state < nr_states; ++idle_state) {
+		/* Is the state not enabled? */
+		if (cpuidle_state_table[idle_state].enter == NULL)
+			continue;
+
+		drv->states[drv->state_count] =	/* structure copy */
+			cpuidle_state_table[idle_state];
+
+		drv->state_count += 1;
+	}
+
+	return 0;
+}
+
+int add_cpuidle_states(void)
+{
+	/* Parse the module param and initialize the idle states here
+	 * in cpuidle_state_table.
+	 */
+	char *this_param;
+	char *input_name = name;
+	char *input_res = residency_us;
+	char *input_lat = latency_us;
+	int index = 1;
+	long temp;
+	int rc;
+
+	switch (sim_type) {
+	case 1:
+		cpuidle_state_table = cpuidle_states_ppc;
+		return 0;
+	case 2:
+		cpuidle_state_table = cpuidle_states_intel;
+		return 0;
+	case 3:
+		break;
+	default:
+		pr_warn("Sim value out of bound\n");
+		break;
+	}
+
+	if (strnlen(input_name, MAX_PARAM_LENGTH)) {
+		while ((this_param = strsep(&input_name, ",")) && index <= nr_states) {
+			strcpy(cpuidle_states[index].name, this_param);
+			cpuidle_states[index].enter = idle_loop;
+			index++;
+		}
+	}
+
+	if (strnlen(input_res, MAX_PARAM_LENGTH)) {
+		index = 1;
+		while ((this_param = strsep(&input_res, ",")) && index <= nr_states) {
+			rc = kstrtol(this_param, 10, &temp);
+			cpuidle_states[index].target_residency = temp;
+			index++;
+		}
+	}
+
+	if (strnlen(input_lat, MAX_PARAM_LENGTH)) {
+		index = 1;
+		while ((this_param = strsep(&input_lat, ",")) && index <= nr_states) {
+			rc = kstrtol(this_param, 10, &temp);
+			cpuidle_states[index].exit_latency = temp;
+			index++;
+		}
+	}
+
+	cpuidle_state_table = cpuidle_states;
+	return nr_states;
+}
+
+void test_cpuidle_uninit(void)
+{
+	if (test_hp_idlestate)
+		cpuhp_remove_state(test_hp_idlestate);
+	cpuidle_unregister_driver(&test_cpuidle_driver);
+
+	free_percpu(test_cpuidle_devices);
+	test_cpuidle_devices = NULL;
+}
+
+int __init test_cpuidle_init(void)
+{
+	int retval;
+
+	add_cpuidle_states();
+	cpuidle_driver_init();
+	retval = cpuidle_register(&test_cpuidle_driver, NULL);
+	if (retval) {
+		printk(KERN_DEBUG "Registration of test driver failed.\n");
+		return retval;
+	}
+
+	test_cpuidle_devices = alloc_percpu(struct cpuidle_device);
+	if (test_cpuidle_devices == NULL) {
+		cpuidle_unregister_driver(&test_cpuidle_driver);
+		return -ENOMEM;
+	}
+
+	retval = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN,
+					   "cpuidle/powernv:online",
+					   cpuidle_cpu_online,
+					   cpuidle_cpu_dead);
+
+	if (retval < 0) {
+		test_cpuidle_uninit();
+	} else {
+		test_hp_idlestate = retval;
+		retval = 0;
+	}
+
+	return retval;
+}
+
+void __exit test_cpuidle_exit(void)
+{
+	test_cpuidle_uninit();
+}
+
+module_init(test_cpuidle_init);
+module_exit(test_cpuidle_exit);
+MODULE_DESCRIPTION("Test Cpuidle Driver");
+MODULE_AUTHOR("Abhishek Goel");
+MODULE_LICENSE("GPL");
+
-- 
2.17.1


^ permalink raw reply related

* Re: [PATCH v2 01/11] xen/manage: keep track of the on-going suspend mode
From: Stefano Stabellini @ 2020-07-22 23:49 UTC (permalink / raw)
  To: Anchal Agarwal
  Cc: Stefano Stabellini, Boris Ostrovsky, tglx, mingo, bp, hpa, x86,
	jgross, linux-pm, linux-mm, kamatam, konrad.wilk, roger.pau,
	axboe, davem, rjw, len.brown, pavel, peterz, eduval, sblbir,
	xen-devel, vkuznets, netdev, linux-kernel, dwmw, benh
In-Reply-To: <20200722180229.GA32316@dev-dsk-anchalag-2a-9c2d1d96.us-west-2.amazon.com>

[-- Attachment #1: Type: text/plain, Size: 5299 bytes --]

On Wed, 22 Jul 2020, Anchal Agarwal wrote:
> On Tue, Jul 21, 2020 at 05:18:34PM -0700, Stefano Stabellini wrote:
> > On Tue, 21 Jul 2020, Boris Ostrovsky wrote:
> > > >>>>>> +static int xen_setup_pm_notifier(void)
> > > >>>>>> +{
> > > >>>>>> +     if (!xen_hvm_domain())
> > > >>>>>> +             return -ENODEV;
> > > >>>>>>
> > > >>>>>> I forgot --- what did we decide about non-x86 (i.e. ARM)?
> > > >>>>> It would be great to support that however, its  out of
> > > >>>>> scope for this patch set.
> > > >>>>> I’ll be happy to discuss it separately.
> > > >>>>
> > > >>>> I wasn't implying that this *should* work on ARM but rather whether this
> > > >>>> will break ARM somehow (because xen_hvm_domain() is true there).
> > > >>>>
> > > >>>>
> > > >>> Ok makes sense. TBH, I haven't tested this part of code on ARM and the series
> > > >>> was only support x86 guests hibernation.
> > > >>> Moreover, this notifier is there to distinguish between 2 PM
> > > >>> events PM SUSPEND and PM hibernation. Now since we only care about PM
> > > >>> HIBERNATION I may just remove this code and rely on "SHUTDOWN_SUSPEND" state.
> > > >>> However, I may have to fix other patches in the series where this check may
> > > >>> appear and cater it only for x86 right?
> > > >>
> > > >>
> > > >> I don't know what would happen if ARM guest tries to handle hibernation
> > > >> callbacks. The only ones that you are introducing are in block and net
> > > >> fronts and that's arch-independent.
> > > >>
> > > >>
> > > >> You do add a bunch of x86-specific code though (syscore ops), would
> > > >> something similar be needed for ARM?
> > > >>
> > > >>
> > > > I don't expect this to work out of the box on ARM. To start with something
> > > > similar will be needed for ARM too.
> > > > We may still want to keep the driver code as-is.
> > > >
> > > > I understand the concern here wrt ARM, however, currently the support is only
> > > > proposed for x86 guests here and similar work could be carried out for ARM.
> > > > Also, if regular hibernation works correctly on arm, then all is needed is to
> > > > fix Xen side of things.
> > > >
> > > > I am not sure what could be done to achieve any assurances on arm side as far as
> > > > this series is concerned.
> > 
> > Just to clarify: new features don't need to work on ARM or cause any
> > addition efforts to you to make them work on ARM. The patch series only
> > needs not to break existing code paths (on ARM and any other platforms).
> > It should also not make it overly difficult to implement the ARM side of
> > things (if there is one) at some point in the future.
> > 
> > FYI drivers/xen/manage.c is compiled and working on ARM today, however
> > Xen suspend/resume is not supported. I don't know for sure if
> > guest-initiated hibernation works because I have not tested it.
> > 
> > 
> > 
> > > If you are not sure what the effects are (or sure that it won't work) on
> > > ARM then I'd add IS_ENABLED(CONFIG_X86) check, i.e.
> > >
> > >
> > > if (!IS_ENABLED(CONFIG_X86) || !xen_hvm_domain())
> > >       return -ENODEV;
> > 
> > That is a good principle to have and thanks for suggesting it. However,
> > in this specific case there is nothing in this patch that doesn't work
> > on ARM. From an ARM perspective I think we should enable it and
> > &xen_pm_notifier_block should be registered.
> > 
> This question is for Boris, I think you we decided to get rid of the notifier
> in V3 as all we need  to check is SHUTDOWN_SUSPEND state which sounds plausible
> to me. So this check may go away. It may still be needed for sycore_ops
> callbacks registration.
> > Given that all guests are HVM guests on ARM, it should work fine as is.
> > 
> > 
> > I gave a quick look at the rest of the series and everything looks fine
> > to me from an ARM perspective. I cannot imaging that the new freeze,
> > thaw, and restore callbacks for net and block are going to cause any
> > trouble on ARM. The two main x86-specific functions are
> > xen_syscore_suspend/resume and they look trivial to implement on ARM (in
> > the sense that they are likely going to look exactly the same.)
> > 
> Yes but for now since things are not tested I will put this
> !IS_ENABLED(CONFIG_X86) on syscore_ops calls registration part just to be safe
> and not break anything.
> > 
> > One question for Anchal: what's going to happen if you trigger a
> > hibernation, you have the new callbacks, but you are missing
> > xen_syscore_suspend/resume?
> > 
> > Is it any worse than not having the new freeze, thaw and restore
> > callbacks at all and try to do a hibernation?
> If callbacks are not there, I don't expect hibernation to work correctly.
> These callbacks takes care of xen primitives like shared_info_page,
> grant table, sched clock, runstate time which are important to save the correct
> state of the guest and bring it back up. Other patches in the series, adds all
> the logic to these syscore callbacks. Freeze/thaw/restore are just there for at driver
> level.

I meant the other way around :-)  Let me rephrase the question.

Do you think that implementing freeze/thaw/restore at the driver level
without having xen_syscore_suspend/resume can potentially make things
worse compared to not having freeze/thaw/restore at the driver level at
all?

^ permalink raw reply

* Re: [PATCH v2 01/11] xen/manage: keep track of the on-going suspend mode
From: Boris Ostrovsky @ 2020-07-22 22:45 UTC (permalink / raw)
  To: Anchal Agarwal, Stefano Stabellini
  Cc: tglx, mingo, bp, hpa, x86, jgross, linux-pm, linux-mm, kamatam,
	konrad.wilk, roger.pau, axboe, davem, rjw, len.brown, pavel,
	peterz, eduval, sblbir, xen-devel, vkuznets, netdev, linux-kernel,
	dwmw, benh
In-Reply-To: <20200722180229.GA32316@dev-dsk-anchalag-2a-9c2d1d96.us-west-2.amazon.com>

On 7/22/20 2:02 PM, Anchal Agarwal wrote:
> On Tue, Jul 21, 2020 at 05:18:34PM -0700, Stefano Stabellini wrote:
>>
>>
>>> If you are not sure what the effects are (or sure that it won't work) on
>>> ARM then I'd add IS_ENABLED(CONFIG_X86) check, i.e.
>>>
>>>
>>> if (!IS_ENABLED(CONFIG_X86) || !xen_hvm_domain())
>>>       return -ENODEV;
>> That is a good principle to have and thanks for suggesting it. However,
>> in this specific case there is nothing in this patch that doesn't work
>> on ARM. From an ARM perspective I think we should enable it and
>> &xen_pm_notifier_block should be registered.
>>
> This question is for Boris, I think you we decided to get rid of the notifier
> in V3 as all we need  to check is SHUTDOWN_SUSPEND state which sounds plausible
> to me. So this check may go away. It may still be needed for sycore_ops
> callbacks registration.


If this check is going away then I guess there is nothing to do here.


My concern isn't about this particular notifier but rather whether this
feature may affect existing functionality (ARM and PVH dom0). If Stefano
feels this should be fine for ARM then so be it.


-boris


>> Given that all guests are HVM guests on ARM, it should work fine as is.
>>
>>
>> I gave a quick look at the rest of the series and everything looks fine
>> to me from an ARM perspective. I cannot imaging that the new freeze,
>> thaw, and restore callbacks for net and block are going to cause any
>> trouble on ARM. The two main x86-specific functions are
>> xen_syscore_suspend/resume and they look trivial to implement on ARM (in
>> the sense that they are likely going to look exactly the same.)
>>
> Yes but for now since things are not tested I will put this
> !IS_ENABLED(CONFIG_X86) on syscore_ops calls registration part just to be safe
> and not break anything.
>> One question for Anchal: what's going to happen if you trigger a
>> hibernation, you have the new callbacks, but you are missing
>> xen_syscore_suspend/resume?
>>
>> Is it any worse than not having the new freeze, thaw and restore
>> callbacks at all and try to do a hibernation?
> If callbacks are not there, I don't expect hibernation to work correctly.
> These callbacks takes care of xen primitives like shared_info_page,
> grant table, sched clock, runstate time which are important to save the correct
> state of the guest and bring it back up. Other patches in the series, adds all
> the logic to these syscore callbacks. Freeze/thaw/restore are just there for at driver
> level.
>
> Thanks,
> Anchal




^ permalink raw reply

* Re: [PATCH v2 01/11] xen/manage: keep track of the on-going suspend mode
From: Anchal Agarwal @ 2020-07-22 18:02 UTC (permalink / raw)
  To: Stefano Stabellini
  Cc: Boris Ostrovsky, tglx, mingo, bp, hpa, x86, jgross, linux-pm,
	linux-mm, kamatam, konrad.wilk, roger.pau, axboe, davem, rjw,
	len.brown, pavel, peterz, eduval, sblbir, xen-devel, vkuznets,
	netdev, linux-kernel, dwmw, benh
In-Reply-To: <alpine.DEB.2.21.2007211640500.17562@sstabellini-ThinkPad-T480s>

On Tue, Jul 21, 2020 at 05:18:34PM -0700, Stefano Stabellini wrote:
> CAUTION: This email originated from outside of the organization. Do not click links or open attachments unless you can confirm the sender and know the content is safe.
> 
> 
> 
> On Tue, 21 Jul 2020, Boris Ostrovsky wrote:
> > >>>>>> +static int xen_setup_pm_notifier(void)
> > >>>>>> +{
> > >>>>>> +     if (!xen_hvm_domain())
> > >>>>>> +             return -ENODEV;
> > >>>>>>
> > >>>>>> I forgot --- what did we decide about non-x86 (i.e. ARM)?
> > >>>>> It would be great to support that however, its  out of
> > >>>>> scope for this patch set.
> > >>>>> I’ll be happy to discuss it separately.
> > >>>>
> > >>>> I wasn't implying that this *should* work on ARM but rather whether this
> > >>>> will break ARM somehow (because xen_hvm_domain() is true there).
> > >>>>
> > >>>>
> > >>> Ok makes sense. TBH, I haven't tested this part of code on ARM and the series
> > >>> was only support x86 guests hibernation.
> > >>> Moreover, this notifier is there to distinguish between 2 PM
> > >>> events PM SUSPEND and PM hibernation. Now since we only care about PM
> > >>> HIBERNATION I may just remove this code and rely on "SHUTDOWN_SUSPEND" state.
> > >>> However, I may have to fix other patches in the series where this check may
> > >>> appear and cater it only for x86 right?
> > >>
> > >>
> > >> I don't know what would happen if ARM guest tries to handle hibernation
> > >> callbacks. The only ones that you are introducing are in block and net
> > >> fronts and that's arch-independent.
> > >>
> > >>
> > >> You do add a bunch of x86-specific code though (syscore ops), would
> > >> something similar be needed for ARM?
> > >>
> > >>
> > > I don't expect this to work out of the box on ARM. To start with something
> > > similar will be needed for ARM too.
> > > We may still want to keep the driver code as-is.
> > >
> > > I understand the concern here wrt ARM, however, currently the support is only
> > > proposed for x86 guests here and similar work could be carried out for ARM.
> > > Also, if regular hibernation works correctly on arm, then all is needed is to
> > > fix Xen side of things.
> > >
> > > I am not sure what could be done to achieve any assurances on arm side as far as
> > > this series is concerned.
> 
> Just to clarify: new features don't need to work on ARM or cause any
> addition efforts to you to make them work on ARM. The patch series only
> needs not to break existing code paths (on ARM and any other platforms).
> It should also not make it overly difficult to implement the ARM side of
> things (if there is one) at some point in the future.
> 
> FYI drivers/xen/manage.c is compiled and working on ARM today, however
> Xen suspend/resume is not supported. I don't know for sure if
> guest-initiated hibernation works because I have not tested it.
> 
> 
> 
> > If you are not sure what the effects are (or sure that it won't work) on
> > ARM then I'd add IS_ENABLED(CONFIG_X86) check, i.e.
> >
> >
> > if (!IS_ENABLED(CONFIG_X86) || !xen_hvm_domain())
> >       return -ENODEV;
> 
> That is a good principle to have and thanks for suggesting it. However,
> in this specific case there is nothing in this patch that doesn't work
> on ARM. From an ARM perspective I think we should enable it and
> &xen_pm_notifier_block should be registered.
> 
This question is for Boris, I think you we decided to get rid of the notifier
in V3 as all we need  to check is SHUTDOWN_SUSPEND state which sounds plausible
to me. So this check may go away. It may still be needed for sycore_ops
callbacks registration.
> Given that all guests are HVM guests on ARM, it should work fine as is.
> 
> 
> I gave a quick look at the rest of the series and everything looks fine
> to me from an ARM perspective. I cannot imaging that the new freeze,
> thaw, and restore callbacks for net and block are going to cause any
> trouble on ARM. The two main x86-specific functions are
> xen_syscore_suspend/resume and they look trivial to implement on ARM (in
> the sense that they are likely going to look exactly the same.)
> 
Yes but for now since things are not tested I will put this
!IS_ENABLED(CONFIG_X86) on syscore_ops calls registration part just to be safe
and not break anything.
> 
> One question for Anchal: what's going to happen if you trigger a
> hibernation, you have the new callbacks, but you are missing
> xen_syscore_suspend/resume?
> 
> Is it any worse than not having the new freeze, thaw and restore
> callbacks at all and try to do a hibernation?
If callbacks are not there, I don't expect hibernation to work correctly.
These callbacks takes care of xen primitives like shared_info_page,
grant table, sched clock, runstate time which are important to save the correct
state of the guest and bring it back up. Other patches in the series, adds all
the logic to these syscore callbacks. Freeze/thaw/restore are just there for at driver
level.

Thanks,
Anchal

^ permalink raw reply

* Re: [PATCH v2 1/2] interconnect: Add sync state support
From: Saravana Kannan @ 2020-07-22 17:07 UTC (permalink / raw)
  To: Georgi Djakov
  Cc: Linux PM, Mike Tipton, okukatla, Bjorn Andersson, Vincent Guittot,
	linux-arm-msm, LKML
In-Reply-To: <20200722110139.24778-2-georgi.djakov@linaro.org>

On Wed, Jul 22, 2020 at 4:01 AM Georgi Djakov <georgi.djakov@linaro.org> wrote:
>
> The bootloaders often do some initial configuration of the interconnects
> in the system and we want to keep this configuration until all consumers
> have probed and expressed their bandwidth needs. This is because we don't
> want to change the configuration by starting to disable unused paths until
> every user had a chance to request the amount of bandwidth it needs.
>
> To accomplish this we will implement an interconnect specific sync_state
> callback which will synchronize (aggregate and set) the current bandwidth
> settings when all consumers have been probed.
>
> Signed-off-by: Georgi Djakov <georgi.djakov@linaro.org>
> ---
>  drivers/interconnect/core.c           | 61 +++++++++++++++++++++++++++
>  include/linux/interconnect-provider.h |  5 +++
>  2 files changed, 66 insertions(+)
>
> diff --git a/drivers/interconnect/core.c b/drivers/interconnect/core.c
> index e5f998744501..0c4e38d9f1fa 100644
> --- a/drivers/interconnect/core.c
> +++ b/drivers/interconnect/core.c
> @@ -26,6 +26,8 @@
>
>  static DEFINE_IDR(icc_idr);
>  static LIST_HEAD(icc_providers);
> +static int providers_count;
> +static bool synced_state;
>  static DEFINE_MUTEX(icc_lock);
>  static struct dentry *icc_debugfs_dir;
>
> @@ -255,6 +257,12 @@ static int aggregate_requests(struct icc_node *node)
>                         continue;
>                 p->aggregate(node, r->tag, r->avg_bw, r->peak_bw,
>                              &node->avg_bw, &node->peak_bw);
> +
> +               /* during boot use the initial bandwidth as a floor value */
> +               if (!synced_state) {
> +                       node->avg_bw = max(node->avg_bw, node->init_avg);
> +                       node->peak_bw = max(node->peak_bw, node->init_peak);
> +               }

Sorry I didn't reply earlier.

I liked your previous approach with the get_bw ops. The v2 approach
forces every interconnect provider driver to set up these values even
if they are okay with just maxing out the bandwidth. Also, if they can
actually query their hardware, this adds additional steps for them.

I think the default should be:
1. Query the current bandwidth at boot and use that.
2. If that's not available, max out the bandwidth.

The interconnect providers that don't like maxing out and don't have
real get_bw() capability can just cache and return the last set_bw()
values. And they start off with those cached values matching whatever
init_bw they need.

That way, the default case (can get bw or don't care about maxing out)
would be easy and the extra work would be limited to drivers that want
neither.

-Saravana

^ permalink raw reply

* Re: [PATCH v3 0/2] Selftest for cpuidle latency measurement
From: Pratik Sampat @ 2020-07-22 15:33 UTC (permalink / raw)
  To: Daniel Lezcano, rjw, mpe, benh, paulus, srivatsa, shuah, npiggin,
	ego, svaidy, pratik.r.sampat, linux-pm, linuxppc-dev,
	linux-kernel, linux-kselftest
In-Reply-To: <17e884b8-09d8-98a8-3890-bf506d2cdfca@linaro.org>

Hello Daniel,

On 21/07/20 8:27 pm, Daniel Lezcano wrote:
> On 21/07/2020 14:42, Pratik Rajesh Sampat wrote:
>> v2: https://lkml.org/lkml/2020/7/17/369
>> Changelog v2-->v3
>> Based on comments from Gautham R. Shenoy adding the following in the
>> selftest,
>> 1. Grepping modules to determine if already loaded
>> 2. Wrapper to enable/disable states
>> 3. Preventing any operation/test on offlined CPUs
>> ---
>>
>> The patch series introduces a mechanism to measure wakeup latency for
>> IPI and timer based interrupts
>> The motivation behind this series is to find significant deviations
>> behind advertised latency and resisdency values
> Why do you want to measure for the timer and the IPI ? Whatever the
> source of the wakeup, the exit latency remains the same, no ?
>
> Is all this kernel-ish code really needed ?
>
> What about using a highres periodic timer and make it expires every eg.
> 50ms x 2400, so it is 120 secondes and measure the deviation. Repeat the
> operation for each idle states.
>
> And in order to make it as much accurate as possible, set the program
> affinity on a CPU and isolate this one by preventing other processes to
> be scheduled on and migrate the interrupts on the other CPUs.
>
> That will be all userspace code, no?
>
>
The kernel module may not needed now that you mention it.
IPI latencies could be measured using pipes and threads using
pthread_attr_setaffinity_np to control the experiment, as you
suggested. This should internally fire a smp_call_function_single.

The original idea was to essentially measure it as closely as possible
in the kernel without involving the kernel-->userspace overhead.
However, the user-space approach may not be too much of a problem as
we are collecting a baseline and the delta of the latency is what we
would be concerned about anyways!

With respect to measuring both timers and IPI latencies: In principle
yes, the exit latency should remain the same but if there is a
deviation in reality we may want to measure it.

I'll implement this experiment in the userspace and get back with the
numbers to confirm.

Thanks for your comments!
Best,
Pratik

>
>


^ permalink raw reply

* Re: [PATCH v2 10/13] cpufreq: powernow-k8: Mark 'hi' and 'lo' dummy variables as __always_unused
From: Pavel Machek @ 2020-07-22 12:50 UTC (permalink / raw)
  To: Lee Jones
  Cc: rjw, viresh.kumar, linux-arm-kernel, linux-kernel, linux-pm,
	Andreas Herrmann, Dominik Brodowski, Paul Devriendt,
	Mark Langsdorf
In-Reply-To: <20200715082634.3024816-11-lee.jones@linaro.org>

[-- Attachment #1: Type: text/plain, Size: 1055 bytes --]

On Wed 2020-07-15 09:26:31, Lee Jones wrote:
> if we fail to use a variable, even a dummy ones, then the compiler
> complains that it is set but not used.  We know this is fine, so we
> set them as __always_unused here to let the compiler know.
> 
> Fixes the following W=1 kernel build warning(s):
> 
>  drivers/cpufreq/powernow-k8.c: In function ‘pending_bit_stuck’:
>  drivers/cpufreq/powernow-k8.c:89:10: warning: variable ‘hi’ set but not used [-Wunused-but-set-variable]
>  89 | u32 lo, hi;
>  | ^~
>  drivers/cpufreq/powernow-k8.c: In function ‘core_voltage_pre_transition’:
>  drivers/cpufreq/powernow-k8.c:285:14: warning: variable ‘lo’ set but not used [-Wunused-but-set-variable]
>  285 | u32 maxvid, lo, rvomult = 1;
>  | ^~
> 
> Cc: Andreas Herrmann <herrmann.der.user@googlemail.com>
> Cc: Dominik Brodowski <linux@brodo.de>

Acked-by: Pavel Machek <pavel@ucw.cz>

-- 
(english) http://www.livejournal.com/~pavelmachek
(cesky, pictures) http://atrey.karlin.mff.cuni.cz/~pavel/picture/horses/blog.html

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 195 bytes --]

^ permalink raw reply

* [PATCH v2 0/2] Add interconnect sync state support
From: Georgi Djakov @ 2020-07-22 11:01 UTC (permalink / raw)
  To: linux-pm
  Cc: saravanak, mdtipton, okukatla, bjorn.andersson, vincent.guittot,
	georgi.djakov, linux-arm-msm, linux-kernel

Bootloaders often leave some system resources enabled such as clocks,
regulators, interconnects etc. We want to keep these resources enabled
until all their consumers are probed. These resources are often shared,
so we must wait for all the consumers to come up, before deciding
whether to turn them off or change the configuration. This patchset is
trying to solve the above problem just for the on-chip interconnects.

The problem is solved by allowing the interconnect providers to specify
an initial bandwidth values, which are enforced during boot as floor
values, while the requests from all consumers are being collected.
Then the sync_state() callback is used to signal when all consumers have
been probed, meaning that the floor bandwidth is not needed anymore and
the framework is ready to re-aggregate and process all requests.

v2:
* Support initial values for both average and peak bandwidth (Mike)
* Skip aggregating/setting for nodes that don't specify initial bw (Mike)
* Drop patch 2/4: Add get_bw() callback (Mike)
* Squash patches 3 and 4.

v1: https://lore.kernel.org/lkml/20200709110705.30359-1-georgi.djakov@linaro.org/

Georgi Djakov (2):
  interconnect: Add sync state support
  interconnect: qcom: Use icc_sync_state in sdm845 and osm-3l drivers

 drivers/interconnect/core.c           | 61 +++++++++++++++++++++++++++
 drivers/interconnect/qcom/osm-l3.c    |  3 ++
 drivers/interconnect/qcom/sdm845.c    |  3 ++
 include/linux/interconnect-provider.h |  5 +++
 4 files changed, 72 insertions(+)

^ permalink raw reply

* [PATCH v2 2/2] interconnect: qcom: Use icc_sync_state in sdm845 and osm-3l drivers
From: Georgi Djakov @ 2020-07-22 11:01 UTC (permalink / raw)
  To: linux-pm
  Cc: saravanak, mdtipton, okukatla, bjorn.andersson, vincent.guittot,
	georgi.djakov, linux-arm-msm, linux-kernel
In-Reply-To: <20200722110139.24778-1-georgi.djakov@linaro.org>

Lowering the bandwidth on the bus might have negative consequences if
it's done before all consumers had a chance to cast their vote. Let's
return the maximum amount of bandwidth as initial value. This bandwidth
level would be maintained until all consumers have probed.

Signed-off-by: Georgi Djakov <georgi.djakov@linaro.org>
---
 drivers/interconnect/qcom/osm-l3.c | 3 +++
 drivers/interconnect/qcom/sdm845.c | 3 +++
 2 files changed, 6 insertions(+)

diff --git a/drivers/interconnect/qcom/osm-l3.c b/drivers/interconnect/qcom/osm-l3.c
index 96fb9ff5ff2e..54aff6273af1 100644
--- a/drivers/interconnect/qcom/osm-l3.c
+++ b/drivers/interconnect/qcom/osm-l3.c
@@ -236,6 +236,8 @@ static int qcom_osm_l3_probe(struct platform_device *pdev)
 
 		node->name = qnodes[i]->name;
 		node->data = qnodes[i];
+		node->init_avg = INT_MAX;
+		node->init_peak = INT_MAX;
 		icc_node_add(node, provider);
 
 		for (j = 0; j < qnodes[i]->num_links; j++)
@@ -268,6 +270,7 @@ static struct platform_driver osm_l3_driver = {
 	.driver = {
 		.name = "osm-l3",
 		.of_match_table = osm_l3_of_match,
+		.sync_state = icc_sync_state,
 	},
 };
 module_platform_driver(osm_l3_driver);
diff --git a/drivers/interconnect/qcom/sdm845.c b/drivers/interconnect/qcom/sdm845.c
index f6c7b969520d..c04775820f15 100644
--- a/drivers/interconnect/qcom/sdm845.c
+++ b/drivers/interconnect/qcom/sdm845.c
@@ -503,6 +503,8 @@ static int qnoc_probe(struct platform_device *pdev)
 
 		node->name = qnodes[i]->name;
 		node->data = qnodes[i];
+		node->init_avg = INT_MAX;
+		node->init_peak = INT_MAX;
 		icc_node_add(node, provider);
 
 		for (j = 0; j < qnodes[i]->num_links; j++)
@@ -559,6 +561,7 @@ static struct platform_driver qnoc_driver = {
 	.driver = {
 		.name = "qnoc-sdm845",
 		.of_match_table = qnoc_of_match,
+		.sync_state = icc_sync_state,
 	},
 };
 module_platform_driver(qnoc_driver);

^ permalink raw reply related

* [PATCH v2 1/2] interconnect: Add sync state support
From: Georgi Djakov @ 2020-07-22 11:01 UTC (permalink / raw)
  To: linux-pm
  Cc: saravanak, mdtipton, okukatla, bjorn.andersson, vincent.guittot,
	georgi.djakov, linux-arm-msm, linux-kernel
In-Reply-To: <20200722110139.24778-1-georgi.djakov@linaro.org>

The bootloaders often do some initial configuration of the interconnects
in the system and we want to keep this configuration until all consumers
have probed and expressed their bandwidth needs. This is because we don't
want to change the configuration by starting to disable unused paths until
every user had a chance to request the amount of bandwidth it needs.

To accomplish this we will implement an interconnect specific sync_state
callback which will synchronize (aggregate and set) the current bandwidth
settings when all consumers have been probed.

Signed-off-by: Georgi Djakov <georgi.djakov@linaro.org>
---
 drivers/interconnect/core.c           | 61 +++++++++++++++++++++++++++
 include/linux/interconnect-provider.h |  5 +++
 2 files changed, 66 insertions(+)

diff --git a/drivers/interconnect/core.c b/drivers/interconnect/core.c
index e5f998744501..0c4e38d9f1fa 100644
--- a/drivers/interconnect/core.c
+++ b/drivers/interconnect/core.c
@@ -26,6 +26,8 @@
 
 static DEFINE_IDR(icc_idr);
 static LIST_HEAD(icc_providers);
+static int providers_count;
+static bool synced_state;
 static DEFINE_MUTEX(icc_lock);
 static struct dentry *icc_debugfs_dir;
 
@@ -255,6 +257,12 @@ static int aggregate_requests(struct icc_node *node)
 			continue;
 		p->aggregate(node, r->tag, r->avg_bw, r->peak_bw,
 			     &node->avg_bw, &node->peak_bw);
+
+		/* during boot use the initial bandwidth as a floor value */
+		if (!synced_state) {
+			node->avg_bw = max(node->avg_bw, node->init_avg);
+			node->peak_bw = max(node->peak_bw, node->init_peak);
+		}
 	}
 
 	return 0;
@@ -919,6 +927,13 @@ void icc_node_add(struct icc_node *node, struct icc_provider *provider)
 	node->provider = provider;
 	list_add_tail(&node->node_list, &provider->nodes);
 
+	/* get the initial bandwidth values and sync them with hardware */
+	if ((node->init_avg || node->init_peak) && provider->set) {
+		node->avg_bw = node->init_avg;
+		node->peak_bw = node->init_peak;
+		provider->set(node, node);
+	}
+
 	mutex_unlock(&icc_lock);
 }
 EXPORT_SYMBOL_GPL(icc_node_add);
@@ -1014,8 +1029,54 @@ int icc_provider_del(struct icc_provider *provider)
 }
 EXPORT_SYMBOL_GPL(icc_provider_del);
 
+static int of_count_icc_providers(struct device_node *np)
+{
+	struct device_node *child;
+	int count = 0;
+
+	for_each_available_child_of_node(np, child) {
+		if (of_property_read_bool(child, "#interconnect-cells"))
+			count++;
+		count += of_count_icc_providers(child);
+	}
+	of_node_put(np);
+
+	return count;
+}
+
+void icc_sync_state(struct device *dev)
+{
+	struct icc_provider *p;
+	struct icc_node *n;
+	static int count;
+
+	count++;
+
+	if (count < providers_count)
+		return;
+
+	mutex_lock(&icc_lock);
+	synced_state = true;
+	list_for_each_entry(p, &icc_providers, provider_list) {
+		dev_dbg(p->dev, "interconnect provider is in synced state\n");
+		list_for_each_entry(n, &p->nodes, node_list) {
+			if (n->init_avg || n->init_peak) {
+				aggregate_requests(n);
+				p->set(n, n);
+			}
+		}
+	}
+	mutex_unlock(&icc_lock);
+}
+EXPORT_SYMBOL_GPL(icc_sync_state);
+
 static int __init icc_init(void)
 {
+	struct device_node *root = of_find_node_by_path("/");
+
+	providers_count = of_count_icc_providers(root);
+	of_node_put(root);
+
 	icc_debugfs_dir = debugfs_create_dir("interconnect", NULL);
 	debugfs_create_file("interconnect_summary", 0444,
 			    icc_debugfs_dir, NULL, &icc_summary_fops);
diff --git a/include/linux/interconnect-provider.h b/include/linux/interconnect-provider.h
index 0c494534b4d3..aa85b96296c4 100644
--- a/include/linux/interconnect-provider.h
+++ b/include/linux/interconnect-provider.h
@@ -71,6 +71,8 @@ struct icc_provider {
  * @req_list: a list of QoS constraint requests associated with this node
  * @avg_bw: aggregated value of average bandwidth requests from all consumers
  * @peak_bw: aggregated value of peak bandwidth requests from all consumers
+ * @init_avg: average bandwidth value that is read from the hardware during init
+ * @init_peak: peak bandwidth value that is read from the hardware during init
  * @data: pointer to private data
  */
 struct icc_node {
@@ -87,6 +89,8 @@ struct icc_node {
 	struct hlist_head	req_list;
 	u32			avg_bw;
 	u32			peak_bw;
+	u32			init_avg;
+	u32			init_peak;
 	void			*data;
 };
 
@@ -103,6 +107,7 @@ void icc_node_del(struct icc_node *node);
 int icc_nodes_remove(struct icc_provider *provider);
 int icc_provider_add(struct icc_provider *provider);
 int icc_provider_del(struct icc_provider *provider);
+void icc_sync_state(struct device *dev);
 
 #else
 

^ permalink raw reply related

* [PATCH v2 7/7] cpufreq: make schedutil the default for arm and arm64
From: Ionela Voinescu @ 2020-07-22  9:37 UTC (permalink / raw)
  To: rjw, viresh.kumar, dietmar.eggemann, catalin.marinas,
	sudeep.holla, will, linux
  Cc: mingo, peterz, linux-pm, linux-arm-kernel, linux-kernel,
	ionela.voinescu, Valentin Schneider
In-Reply-To: <20200722093732.14297-1-ionela.voinescu@arm.com>

From: Valentin Schneider <valentin.schneider@arm.com>

schedutil is already a hard-requirement for EAS, which has lead to making
it default on arm (when CONFIG_BIG_LITTLE), see:

  commit 8fdcca8e254a ("cpufreq: Select schedutil when using big.LITTLE")

One thing worth pointing out is that schedutil isn't only relevant for
asymmetric CPU capacity systems; for instance, schedutil is the only
governor that honours util-clamp performance requests. Another good example
of this is x86 switching to using it by default in:

  commit a00ec3874e7d ("cpufreq: intel_pstate: Select schedutil as the default governor")

Arguably it should be made the default for all architectures, but it seems
better to wait for them to also gain frequency invariance powers. Make it
the default for arm && arm64 for now.

Signed-off-by: Valentin Schneider <valentin.schneider@arm.com>
Signed-off-by: Ionela Voinescu <ionela.voinescu@arm.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Will Deacon <will@kernel.org>
Cc: Russell King <linux@armlinux.org.uk>
Cc: Rafael J. Wysocki <rjw@rjwysocki.net>
Cc: Viresh Kumar <viresh.kumar@linaro.org>
---
 drivers/cpufreq/Kconfig | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/cpufreq/Kconfig b/drivers/cpufreq/Kconfig
index e91750132552..2c7171e0b001 100644
--- a/drivers/cpufreq/Kconfig
+++ b/drivers/cpufreq/Kconfig
@@ -37,7 +37,7 @@ config CPU_FREQ_STAT
 choice
 	prompt "Default CPUFreq governor"
 	default CPU_FREQ_DEFAULT_GOV_USERSPACE if ARM_SA1100_CPUFREQ || ARM_SA1110_CPUFREQ
-	default CPU_FREQ_DEFAULT_GOV_SCHEDUTIL if BIG_LITTLE
+	default CPU_FREQ_DEFAULT_GOV_SCHEDUTIL if ARM64 || ARM
 	default CPU_FREQ_DEFAULT_GOV_SCHEDUTIL if X86_INTEL_PSTATE && SMP
 	default CPU_FREQ_DEFAULT_GOV_PERFORMANCE
 	help
-- 
2.17.1


^ permalink raw reply related

* [PATCH v2 6/7] arch_topology,arm,arm64: define arch_scale_freq_invariant()
From: Ionela Voinescu @ 2020-07-22  9:37 UTC (permalink / raw)
  To: rjw, viresh.kumar, dietmar.eggemann, catalin.marinas,
	sudeep.holla, will, linux
  Cc: mingo, peterz, linux-pm, linux-arm-kernel, linux-kernel,
	ionela.voinescu, Valentin Schneider
In-Reply-To: <20200722093732.14297-1-ionela.voinescu@arm.com>

From: Valentin Schneider <valentin.schneider@arm.com>

arch_scale_freq_invariant() is used by schedutil to determine whether
the scheduler's load-tracking signals are frequency invariant. Its
definition is overridable, though by default it is hardcoded to 'true'
if arch_scale_freq_capacity() is defined ('false' otherwise).

This behaviour is not overridden on arm, arm64 and other users of the
generic arch topology driver, which is somewhat precarious:
arch_scale_freq_capacity() will always be defined, yet not all cpufreq
drivers are guaranteed to drive the frequency invariance scale factor
setting. In other words, the load-tracking signals may very well *not*
be frequency invariant.

Now that cpufreq can be queried on whether the current driver is driving
the Frequency Invariance (FI) scale setting, the current situation can
be improved. This combines the query of whether cpufreq supports the
setting of the frequency scale factor, with whether all online CPUs are
counter-based FI enabled.

While cpufreq FI enablement applies at system level, for all CPUs,
counter-based FI support could also be used for only a subset of CPUs to
set the invariance scale factor. Therefore, if cpufreq-based FI support
is present, we consider the system to be invariant. If missing, we
require all online CPUs to be counter-based FI enabled in order for the
full system to be considered invariant.

If the system ends up not being invariant, a new condition is needed in
the counter initialization code that disables all scale factor setting
based on counters.

Precedence of counters over cpufreq use is not important here. The
invariant status is only given to the system if all CPUs have at least
one method of setting the frequency scale factor.

Signed-off-by: Valentin Schneider <valentin.schneider@arm.com>
Signed-off-by: Ionela Voinescu <ionela.voinescu@arm.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Will Deacon <will@kernel.org>
Cc: Sudeep Holla <sudeep.holla@arm.com>
---
 arch/arm/include/asm/topology.h   | 1 +
 arch/arm64/include/asm/topology.h | 1 +
 arch/arm64/kernel/topology.c      | 7 +++++++
 drivers/base/arch_topology.c      | 6 ++++++
 include/linux/arch_topology.h     | 2 ++
 5 files changed, 17 insertions(+)

diff --git a/arch/arm/include/asm/topology.h b/arch/arm/include/asm/topology.h
index 435aba289fc5..d59458e97c6d 100644
--- a/arch/arm/include/asm/topology.h
+++ b/arch/arm/include/asm/topology.h
@@ -9,6 +9,7 @@

 /* Replace task scheduler's default frequency-invariant accounting */
 #define arch_scale_freq_capacity topology_get_freq_scale
+#define arch_scale_freq_invariant topology_scale_freq_invariant

 /* Replace task scheduler's default cpu-invariant accounting */
 #define arch_scale_cpu_capacity topology_get_cpu_scale
diff --git a/arch/arm64/include/asm/topology.h b/arch/arm64/include/asm/topology.h
index 0cc835ddfcd1..b67081c0dcfb 100644
--- a/arch/arm64/include/asm/topology.h
+++ b/arch/arm64/include/asm/topology.h
@@ -27,6 +27,7 @@ void topology_scale_freq_tick(void);

 /* Replace task scheduler's default frequency-invariant accounting */
 #define arch_scale_freq_capacity topology_get_freq_scale
+#define arch_scale_freq_invariant topology_scale_freq_invariant

 /* Replace task scheduler's default cpu-invariant accounting */
 #define arch_scale_cpu_capacity topology_get_cpu_scale
diff --git a/arch/arm64/kernel/topology.c b/arch/arm64/kernel/topology.c
index 9a9f2b8dedf5..4064d39bb66d 100644
--- a/arch/arm64/kernel/topology.c
+++ b/arch/arm64/kernel/topology.c
@@ -246,6 +246,13 @@ static int __init init_amu_fie(void)
 		static_branch_enable(&amu_fie_key);
 	}

+	/*
+	 * If the system is not fully invariant after AMU init, disable
+	 * partial use of counters for frequency invariance.
+	 */
+	if (!topology_scale_freq_invariant())
+		static_branch_disable(&amu_fie_key);
+
 free_valid_mask:
 	free_cpumask_var(valid_cpus);

diff --git a/drivers/base/arch_topology.c b/drivers/base/arch_topology.c
index 3ad59e38f3f3..30b67bb820c5 100644
--- a/drivers/base/arch_topology.c
+++ b/drivers/base/arch_topology.c
@@ -21,6 +21,12 @@
 #include <linux/sched.h>
 #include <linux/smp.h>

+bool topology_scale_freq_invariant(void)
+{
+	return cpufreq_sets_freq_scale() ||
+	       arch_freq_counters_available(cpu_online_mask);
+}
+
 __weak bool arch_freq_counters_available(const struct cpumask *cpus)
 {
 	return false;
diff --git a/include/linux/arch_topology.h b/include/linux/arch_topology.h
index 4be0315700cb..5bc55cfc9399 100644
--- a/include/linux/arch_topology.h
+++ b/include/linux/arch_topology.h
@@ -30,6 +30,8 @@ static inline unsigned long topology_get_freq_scale(int cpu)
 	return per_cpu(freq_scale, cpu);
 }

+bool topology_scale_freq_invariant(void);
+
 bool arch_freq_counters_available(const struct cpumask *cpus);

 DECLARE_PER_CPU(unsigned long, thermal_pressure);
-- 
2.17.1

^ permalink raw reply related

* [PATCH v2 5/7] arch_topology,cpufreq,sched/core: constify arch_* cpumasks
From: Ionela Voinescu @ 2020-07-22  9:37 UTC (permalink / raw)
  To: rjw, viresh.kumar, dietmar.eggemann, catalin.marinas,
	sudeep.holla, will, linux
  Cc: mingo, peterz, linux-pm, linux-arm-kernel, linux-kernel,
	ionela.voinescu, Valentin Schneider
In-Reply-To: <20200722093732.14297-1-ionela.voinescu@arm.com>

From: Valentin Schneider <valentin.schneider@arm.com>

The passed cpumask arguments to:
 - arch_set_freq_scale(),
 - arch_set_thermal_pressure(), and
 - arch_freq_counters_available()

are only iterated over, so reflect this in the prototype. This also
allows to pass system cpumasks like cpu_online_mask without getting
a warning.

Signed-off-by: Valentin Schneider <valentin.schneider@arm.com>
Signed-off-by: Ionela Voinescu <ionela.voinescu@arm.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Will Deacon <will@kernel.org>
Cc: Sudeep Holla <sudeep.holla@arm.com>
Cc: Rafael J. Wysocki <rjw@rjwysocki.net>
Cc: Viresh Kumar <viresh.kumar@linaro.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
---
 arch/arm64/kernel/topology.c  | 2 +-
 drivers/base/arch_topology.c  | 4 ++--
 drivers/cpufreq/cpufreq.c     | 5 +++--
 include/linux/arch_topology.h | 4 ++--
 include/linux/cpufreq.h       | 3 ++-
 kernel/sched/core.c           | 2 +-
 6 files changed, 11 insertions(+), 9 deletions(-)

diff --git a/arch/arm64/kernel/topology.c b/arch/arm64/kernel/topology.c
index 0801a0f3c156..9a9f2b8dedf5 100644
--- a/arch/arm64/kernel/topology.c
+++ b/arch/arm64/kernel/topology.c
@@ -253,7 +253,7 @@ static int __init init_amu_fie(void)
 }
 late_initcall_sync(init_amu_fie);
 
-bool arch_freq_counters_available(struct cpumask *cpus)
+bool arch_freq_counters_available(const struct cpumask *cpus)
 {
 	return amu_freq_invariant() &&
 	       cpumask_subset(cpus, amu_fie_cpus);
diff --git a/drivers/base/arch_topology.c b/drivers/base/arch_topology.c
index 708768f528dc..3ad59e38f3f3 100644
--- a/drivers/base/arch_topology.c
+++ b/drivers/base/arch_topology.c
@@ -21,14 +21,14 @@
 #include <linux/sched.h>
 #include <linux/smp.h>
 
-__weak bool arch_freq_counters_available(struct cpumask *cpus)
+__weak bool arch_freq_counters_available(const struct cpumask *cpus)
 {
 	return false;
 }
 DEFINE_PER_CPU(unsigned long, freq_scale) = SCHED_CAPACITY_SCALE;
 
 #ifndef CONFIG_BL_SWITCHER
-void arch_set_freq_scale(struct cpumask *cpus, unsigned long cur_freq,
+void arch_set_freq_scale(const struct cpumask *cpus, unsigned long cur_freq,
 			 unsigned long max_freq)
 {
 	unsigned long scale;
diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
index 1d0b046fe8e9..7e6452143654 100644
--- a/drivers/cpufreq/cpufreq.c
+++ b/drivers/cpufreq/cpufreq.c
@@ -178,8 +178,9 @@ u64 get_cpu_idle_time(unsigned int cpu, u64 *wall, int io_busy)
 }
 EXPORT_SYMBOL_GPL(get_cpu_idle_time);
 
-__weak void arch_set_freq_scale(struct cpumask *cpus, unsigned long cur_freq,
-		unsigned long max_freq)
+__weak void arch_set_freq_scale(const struct cpumask *cpus,
+				unsigned long cur_freq,
+				unsigned long max_freq)
 {
 	if (cpufreq_sets_freq_scale())
 		static_branch_disable_cpuslocked(&cpufreq_set_freq_scale);
diff --git a/include/linux/arch_topology.h b/include/linux/arch_topology.h
index 0566cb3314ef..4be0315700cb 100644
--- a/include/linux/arch_topology.h
+++ b/include/linux/arch_topology.h
@@ -30,7 +30,7 @@ static inline unsigned long topology_get_freq_scale(int cpu)
 	return per_cpu(freq_scale, cpu);
 }
 
-bool arch_freq_counters_available(struct cpumask *cpus);
+bool arch_freq_counters_available(const struct cpumask *cpus);
 
 DECLARE_PER_CPU(unsigned long, thermal_pressure);
 
@@ -39,7 +39,7 @@ static inline unsigned long topology_get_thermal_pressure(int cpu)
 	return per_cpu(thermal_pressure, cpu);
 }
 
-void arch_set_thermal_pressure(struct cpumask *cpus,
+void arch_set_thermal_pressure(const struct cpumask *cpus,
 			       unsigned long th_pressure);
 
 struct cpu_topology {
diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h
index f81215ad76f1..781f1c5957ff 100644
--- a/include/linux/cpufreq.h
+++ b/include/linux/cpufreq.h
@@ -1009,7 +1009,8 @@ static inline void sched_cpufreq_governor_change(struct cpufreq_policy *policy,
 extern void arch_freq_prepare_all(void);
 extern unsigned int arch_freq_get_on_cpu(int cpu);
 
-extern void arch_set_freq_scale(struct cpumask *cpus, unsigned long cur_freq,
+extern void arch_set_freq_scale(const struct cpumask *cpus,
+				unsigned long cur_freq,
 				unsigned long max_freq);
 
 /* the following are really really optional */
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index f518af52d0fb..b44a42b1236c 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3645,7 +3645,7 @@ unsigned long long task_sched_runtime(struct task_struct *p)
 
 DEFINE_PER_CPU(unsigned long, thermal_pressure);
 
-void arch_set_thermal_pressure(struct cpumask *cpus,
+void arch_set_thermal_pressure(const struct cpumask *cpus,
 			       unsigned long th_pressure)
 {
 	int cpu;
-- 
2.17.1


^ permalink raw reply related

* [PATCH v2 4/7] cpufreq: report whether cpufreq supports Frequency Invariance (FI)
From: Ionela Voinescu @ 2020-07-22  9:37 UTC (permalink / raw)
  To: rjw, viresh.kumar, dietmar.eggemann, catalin.marinas,
	sudeep.holla, will, linux
  Cc: mingo, peterz, linux-pm, linux-arm-kernel, linux-kernel,
	ionela.voinescu
In-Reply-To: <20200722093732.14297-1-ionela.voinescu@arm.com>

Now that the update of the FI scale factor is done in cpufreq core for
selected functions - target(), target_index() and fast_switch(),
we can provide feedback to the task scheduler and architecture code
on whether cpufreq supports FI.

For this purpose, provide error and debug messages, together with an
external function to expose whether the cpufreq drivers support FI, by
using a static key.

The logic behind the enablement of cpufreq-based invariance is as
follows:
 - cpufreq-based invariance is disabled by default
 - cpufreq-based invariance is enabled if any of the callbacks
   above is implemented while the unsupported setpolicy() is not
 - if enabled, cpufreq-based invariance will be disabled during the
   call of the default arch_set_freq_scale() function which does
   not set a scale factor.

Signed-off-by: Ionela Voinescu <ionela.voinescu@arm.com>
Cc: Rafael J. Wysocki <rjw@rjwysocki.net>
Cc: Viresh Kumar <viresh.kumar@linaro.org>
---
 drivers/cpufreq/cpufreq.c | 28 ++++++++++++++++++++++++++++
 include/linux/cpufreq.h   |  5 +++++
 2 files changed, 33 insertions(+)

diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
index 3497c1cd6818..1d0b046fe8e9 100644
--- a/drivers/cpufreq/cpufreq.c
+++ b/drivers/cpufreq/cpufreq.c
@@ -61,6 +61,9 @@ static struct cpufreq_driver *cpufreq_driver;
 static DEFINE_PER_CPU(struct cpufreq_policy *, cpufreq_cpu_data);
 static DEFINE_RWLOCK(cpufreq_driver_lock);
 
+/* Mark support for the scheduler's frequency invariance engine */
+static DEFINE_STATIC_KEY_FALSE(cpufreq_set_freq_scale);
+
 /* Flag to suspend/resume CPUFreq governors */
 static bool cpufreq_suspended;
 
@@ -69,6 +72,25 @@ static inline bool has_target(void)
 	return cpufreq_driver->target_index || cpufreq_driver->target;
 }
 
+static inline
+void enable_cpufreq_freq_invariance(struct cpufreq_driver *driver)
+{
+	if ((driver->target || driver->target_index || driver->fast_switch) &&
+	    !driver->setpolicy) {
+
+		static_branch_enable_cpuslocked(&cpufreq_set_freq_scale);
+		pr_debug("%s: Driver %s can provide frequency invariance.",
+			 __func__, driver->name);
+	} else
+		pr_err("%s: Driver %s cannot provide frequency invariance.",
+		__func__, driver->name);
+}
+
+bool cpufreq_sets_freq_scale(void)
+{
+	return static_branch_likely(&cpufreq_set_freq_scale);
+}
+
 /* internal prototypes */
 static unsigned int __cpufreq_get(struct cpufreq_policy *policy);
 static int cpufreq_init_governor(struct cpufreq_policy *policy);
@@ -159,6 +181,9 @@ EXPORT_SYMBOL_GPL(get_cpu_idle_time);
 __weak void arch_set_freq_scale(struct cpumask *cpus, unsigned long cur_freq,
 		unsigned long max_freq)
 {
+	if (cpufreq_sets_freq_scale())
+		static_branch_disable_cpuslocked(&cpufreq_set_freq_scale);
+
 }
 EXPORT_SYMBOL_GPL(arch_set_freq_scale);
 
@@ -2722,6 +2747,8 @@ int cpufreq_register_driver(struct cpufreq_driver *driver_data)
 	cpufreq_driver = driver_data;
 	write_unlock_irqrestore(&cpufreq_driver_lock, flags);
 
+	enable_cpufreq_freq_invariance(cpufreq_driver);
+
 	if (driver_data->setpolicy)
 		driver_data->flags |= CPUFREQ_CONST_LOOPS;
 
@@ -2791,6 +2818,7 @@ int cpufreq_unregister_driver(struct cpufreq_driver *driver)
 	cpus_read_lock();
 	subsys_interface_unregister(&cpufreq_interface);
 	remove_boost_sysfs_file();
+	static_branch_disable_cpuslocked(&cpufreq_set_freq_scale);
 	cpuhp_remove_state_nocalls_cpuslocked(hp_online);
 
 	write_lock_irqsave(&cpufreq_driver_lock, flags);
diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h
index e62b022cb07e..f81215ad76f1 100644
--- a/include/linux/cpufreq.h
+++ b/include/linux/cpufreq.h
@@ -217,6 +217,7 @@ void refresh_frequency_limits(struct cpufreq_policy *policy);
 void cpufreq_update_policy(unsigned int cpu);
 void cpufreq_update_limits(unsigned int cpu);
 bool have_governor_per_policy(void);
+bool cpufreq_sets_freq_scale(void);
 struct kobject *get_governor_parent_kobj(struct cpufreq_policy *policy);
 void cpufreq_enable_fast_switch(struct cpufreq_policy *policy);
 void cpufreq_disable_fast_switch(struct cpufreq_policy *policy);
@@ -237,6 +238,10 @@ static inline unsigned int cpufreq_get_hw_max_freq(unsigned int cpu)
 {
 	return 0;
 }
+static inline bool cpufreq_sets_freq_scale(void)
+{
+	return false;
+}
 static inline void disable_cpufreq(void) { }
 #endif
 
-- 
2.17.1


^ permalink raw reply related

* [PATCH v2 3/7] arch_topology: disable frequency invariance for CONFIG_BL_SWITCHER
From: Ionela Voinescu @ 2020-07-22  9:37 UTC (permalink / raw)
  To: rjw, viresh.kumar, dietmar.eggemann, catalin.marinas,
	sudeep.holla, will, linux
  Cc: mingo, peterz, linux-pm, linux-arm-kernel, linux-kernel,
	ionela.voinescu
In-Reply-To: <20200722093732.14297-1-ionela.voinescu@arm.com>

big.LITTLE switching complicates the setting of a correct cpufreq-based
frequency invariance scale factor due to (as observed in
drivers/cpufreq/vexpress-spc-cpufreq.c):
 - Incorrect current and maximum frequencies as a result of the
   exposure of a virtual frequency table to the cpufreq core,
 - Missed updates as a result of asynchronous frequency adjustments
   caused by frequency changes in other CPU pairs.

Given that its functionality is atypical in regards to frequency
invariance and this is an old technology, disable frequency
invariance for when big.LITTLE switching is configured in to prevent
incorrect scale setting.

Signed-off-by: Ionela Voinescu <ionela.voinescu@arm.com>
Suggested-by: Dietmar Eggemann <dietmar.eggemann@arm.com>
Cc: Sudeep Holla <sudeep.holla@arm.com>
Cc: Rafael J. Wysocki <rjw@rjwysocki.net>
---
 drivers/base/arch_topology.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/base/arch_topology.c b/drivers/base/arch_topology.c
index 4d0a0038b476..708768f528dc 100644
--- a/drivers/base/arch_topology.c
+++ b/drivers/base/arch_topology.c
@@ -27,6 +27,7 @@ __weak bool arch_freq_counters_available(struct cpumask *cpus)
 }
 DEFINE_PER_CPU(unsigned long, freq_scale) = SCHED_CAPACITY_SCALE;
 
+#ifndef CONFIG_BL_SWITCHER
 void arch_set_freq_scale(struct cpumask *cpus, unsigned long cur_freq,
 			 unsigned long max_freq)
 {
@@ -46,6 +47,7 @@ void arch_set_freq_scale(struct cpumask *cpus, unsigned long cur_freq,
 	for_each_cpu(i, cpus)
 		per_cpu(freq_scale, i) = scale;
 }
+#endif
 
 DEFINE_PER_CPU(unsigned long, cpu_scale) = SCHED_CAPACITY_SCALE;
 
-- 
2.17.1


^ permalink raw reply related

* [PATCH v2 2/7] cpufreq: set invariance scale factor on transition end
From: Ionela Voinescu @ 2020-07-22  9:37 UTC (permalink / raw)
  To: rjw, viresh.kumar, dietmar.eggemann, catalin.marinas,
	sudeep.holla, will, linux
  Cc: mingo, peterz, linux-pm, linux-arm-kernel, linux-kernel,
	ionela.voinescu
In-Reply-To: <20200722093732.14297-1-ionela.voinescu@arm.com>

While the move of the invariance setter calls (arch_set_freq_scale())
from cpufreq drivers to cpufreq core maintained the previous
functionality for existing drivers that use target_index() and
fast_switch() for frequency switching, it also gives the possibility
of adding support for users of the target() callback, which is exploited
here.

To be noted that the target() callback has been flagged as deprecated
since:

commit 9c0ebcf78fde ("cpufreq: Implement light weight ->target_index() routine")

It also doesn't have that many users:

  cpufreq-nforce2.c:371:2:      .target = nforce2_target,
  cppc_cpufreq.c:416:2:         .target = cppc_cpufreq_set_target,
  gx-suspmod.c:439:2:           .target = cpufreq_gx_target,
  pcc-cpufreq.c:573:2:          .target = pcc_cpufreq_target,

Similarly to the path taken for target_index() calls in the cpufreq core
during a frequency change, all of the drivers above will mark the end of a
frequency change by a call to cpufreq_freq_transition_end().

Therefore, cpufreq_freq_transition_end() can be used as the location for
the arch_set_freq_scale() call to potentially inform the scheduler of the
frequency change.

This change maintains the previous functionality for the drivers that
implement the target_index() callback, while also adding support for the
few drivers that implement the deprecated target() callback.

Two notes are worthwhile here:
 - In __target_index(), cpufreq_freq_transition_end() is called only for
   drivers that have synchronous notifications enabled. There is only one
   driver that disables them,

   drivers/cpufreq/powernow-k8.c:1142: .flags = CPUFREQ_ASYNC_NOTIFICATION,

   which is deprecated.

 - Despite marking a successful frequency change, many cpufreq drivers
   will populate the new policy->cur with the new requested frequency,
   although this might not be the one granted by the hardware.

   Therefore, the call to arch_set_freq_scale() is a "best effort" one,
   and it is up to the architecture if the new frequency is used in the
   new frequency scale factor setting or eventually used by the scheduler.
   The architecture is in a better position to decide if it has better
   methods to obtain more accurate information regarding the current
   frequency (for example the use of counters).

Signed-off-by: Ionela Voinescu <ionela.voinescu@arm.com>
Cc: Rafael J. Wysocki <rjw@rjwysocki.net>
Cc: Viresh Kumar <viresh.kumar@linaro.org>
---
 drivers/cpufreq/cpufreq.c | 15 ++++++---------
 1 file changed, 6 insertions(+), 9 deletions(-)

diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
index bac4101546db..3497c1cd6818 100644
--- a/drivers/cpufreq/cpufreq.c
+++ b/drivers/cpufreq/cpufreq.c
@@ -448,6 +448,10 @@ void cpufreq_freq_transition_end(struct cpufreq_policy *policy,

 	cpufreq_notify_post_transition(policy, freqs, transition_failed);

+	arch_set_freq_scale(policy->related_cpus,
+			    policy->cur,
+			    policy->cpuinfo.max_freq);
+
 	policy->transition_ongoing = false;
 	policy->transition_task = NULL;

@@ -2159,7 +2163,7 @@ int __cpufreq_driver_target(struct cpufreq_policy *policy,
 			    unsigned int relation)
 {
 	unsigned int old_target_freq = target_freq;
-	int index, retval;
+	int index;

 	if (cpufreq_disabled())
 		return -ENODEV;
@@ -2190,14 +2194,7 @@ int __cpufreq_driver_target(struct cpufreq_policy *policy,

 	index = cpufreq_frequency_table_target(policy, target_freq, relation);

-	retval = __target_index(policy, index);
-
-	if (!retval)
-		arch_set_freq_scale(policy->related_cpus,
-				    policy->freq_table[index].frequency,
-				    policy->cpuinfo.max_freq);
-
-	return retval;
+	return __target_index(policy, index);
 }
 EXPORT_SYMBOL_GPL(__cpufreq_driver_target);

-- 
2.17.1

^ permalink raw reply related

* [PATCH v2 1/7] cpufreq: move invariance setter calls in cpufreq core
From: Ionela Voinescu @ 2020-07-22  9:37 UTC (permalink / raw)
  To: rjw, viresh.kumar, dietmar.eggemann, catalin.marinas,
	sudeep.holla, will, linux
  Cc: mingo, peterz, linux-pm, linux-arm-kernel, linux-kernel,
	ionela.voinescu, Valentin Schneider
In-Reply-To: <20200722093732.14297-1-ionela.voinescu@arm.com>

From: Valentin Schneider <valentin.schneider@arm.com>

To properly scale its per-entity load-tracking signals, the task scheduler
needs to be given a frequency scale factor, i.e. some image of the current
frequency the CPU is running at. Currently, this scale can be computed
either by using counters (APERF/MPERF on x86, AMU on arm64), or by
piggy-backing on the frequency selection done by cpufreq.

For the latter, drivers have to explicitly set the scale factor
themselves, despite it being purely boiler-plate code: the required
information depends entirely on the kind of frequency switch callback
implemented by the driver, i.e. either of: target_index(), target(),
fast_switch() and setpolicy().

The fitness of those callbacks with regard to driving the Frequency
Invariance Engine (FIE) is studied below:

target_index()
==============
Documentation states that the chosen frequency "must be determined by
freq_table[index].frequency". It isn't clear if it *has* to be that
frequency, or if it can use that frequency value to do some computation
that ultimately leads to a different frequency selection. All drivers
go for the former, while the vexpress-spc-cpufreq has an atypical
implementation which is handled separately.

Therefore, the hook works on the assumption the core can use
freq_table[index].frequency.

target()
=======
This has been flagged as deprecated since:

  commit 9c0ebcf78fde ("cpufreq: Implement light weight ->target_index() routine")

It also doesn't have that many users:

  cpufreq-nforce2.c:371:2:	.target = nforce2_target,
  cppc_cpufreq.c:416:2:		.target = cppc_cpufreq_set_target,
  gx-suspmod.c:439:2:		.target = cpufreq_gx_target,
  pcc-cpufreq.c:573:2:		.target = pcc_cpufreq_target,

Should we care about drivers using this hook, we may be able to exploit
cpufreq_freq_transition_{being, end}(). This is handled in a separate
patch.

fast_switch()
=============
This callback *has* to return the frequency that was selected.

setpolicy()
===========
This callback does not have any designated way of informing what was the
end choice. But there are only two drivers using setpolicy(), and none
of them have current FIE support:

  drivers/cpufreq/longrun.c:281:	.setpolicy	= longrun_set_policy,
  drivers/cpufreq/intel_pstate.c:2215:	.setpolicy	= intel_pstate_set_policy,

The intel_pstate is known to use counter-driven frequency invariance.

Conclusion
==========

Given that the significant majority of current FIE enabled drivers use
callbacks that lend themselves to triggering the setting of the FIE scale
factor in a generic way, move the invariance setter calls to cpufreq core.

As a result of setting the frequency scale factor in cpufreq core, after
callbacks that lend themselves to trigger it, remove this functionality
from the driver side.

To be noted that despite marking a successful frequency change, many
cpufreq drivers will consider the new frequency as the requested
frequency, although this is might not be the one granted by the hardware.

Therefore, the call to arch_set_freq_scale() is a "best effort" one, and
it is up to the architecture if the new frequency is used in the new
frequency scale factor setting (determined by the implementation of
arch_set_freq_scale()) or eventually used by the scheduler (determined
by the implementation of arch_scale_freq_capacity()). The architecture
is in a better position to decide if it has better methods to obtain
more accurate information regarding the current frequency and use that
information instead (for example, the use of counters).

Signed-off-by: Valentin Schneider <valentin.schneider@arm.com>
Signed-off-by: Ionela Voinescu <ionela.voinescu@arm.com>
Cc: Rafael J. Wysocki <rjw@rjwysocki.net>
Cc: Viresh Kumar <viresh.kumar@linaro.org>
---
 drivers/cpufreq/cpufreq-dt.c           | 10 +---------
 drivers/cpufreq/cpufreq.c              | 20 +++++++++++++++++---
 drivers/cpufreq/qcom-cpufreq-hw.c      |  9 +--------
 drivers/cpufreq/scmi-cpufreq.c         | 12 ++----------
 drivers/cpufreq/scpi-cpufreq.c         |  6 +-----
 drivers/cpufreq/vexpress-spc-cpufreq.c |  5 -----
 6 files changed, 22 insertions(+), 40 deletions(-)

diff --git a/drivers/cpufreq/cpufreq-dt.c b/drivers/cpufreq/cpufreq-dt.c
index 944d7b45afe9..9fd4ce774f12 100644
--- a/drivers/cpufreq/cpufreq-dt.c
+++ b/drivers/cpufreq/cpufreq-dt.c
@@ -40,16 +40,8 @@ static int set_target(struct cpufreq_policy *policy, unsigned int index)
 {
 	struct private_data *priv = policy->driver_data;
 	unsigned long freq = policy->freq_table[index].frequency;
-	int ret;
-
-	ret = dev_pm_opp_set_rate(priv->cpu_dev, freq * 1000);
 
-	if (!ret) {
-		arch_set_freq_scale(policy->related_cpus, freq,
-				    policy->cpuinfo.max_freq);
-	}
-
-	return ret;
+	return dev_pm_opp_set_rate(priv->cpu_dev, freq * 1000);
 }
 
 /*
diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
index 036f4cc42ede..bac4101546db 100644
--- a/drivers/cpufreq/cpufreq.c
+++ b/drivers/cpufreq/cpufreq.c
@@ -2058,9 +2058,16 @@ EXPORT_SYMBOL(cpufreq_unregister_notifier);
 unsigned int cpufreq_driver_fast_switch(struct cpufreq_policy *policy,
 					unsigned int target_freq)
 {
+	unsigned int freq;
+
 	target_freq = clamp_val(target_freq, policy->min, policy->max);
+	freq = cpufreq_driver->fast_switch(policy, target_freq);
+
+	if (freq)
+		arch_set_freq_scale(policy->related_cpus, freq,
+				    policy->cpuinfo.max_freq);
 
-	return cpufreq_driver->fast_switch(policy, target_freq);
+	return freq;
 }
 EXPORT_SYMBOL_GPL(cpufreq_driver_fast_switch);
 
@@ -2152,7 +2159,7 @@ int __cpufreq_driver_target(struct cpufreq_policy *policy,
 			    unsigned int relation)
 {
 	unsigned int old_target_freq = target_freq;
-	int index;
+	int index, retval;
 
 	if (cpufreq_disabled())
 		return -ENODEV;
@@ -2183,7 +2190,14 @@ int __cpufreq_driver_target(struct cpufreq_policy *policy,
 
 	index = cpufreq_frequency_table_target(policy, target_freq, relation);
 
-	return __target_index(policy, index);
+	retval = __target_index(policy, index);
+
+	if (!retval)
+		arch_set_freq_scale(policy->related_cpus,
+				    policy->freq_table[index].frequency,
+				    policy->cpuinfo.max_freq);
+
+	return retval;
 }
 EXPORT_SYMBOL_GPL(__cpufreq_driver_target);
 
diff --git a/drivers/cpufreq/qcom-cpufreq-hw.c b/drivers/cpufreq/qcom-cpufreq-hw.c
index 573630c23aca..e5d1ee7746a4 100644
--- a/drivers/cpufreq/qcom-cpufreq-hw.c
+++ b/drivers/cpufreq/qcom-cpufreq-hw.c
@@ -85,8 +85,6 @@ static int qcom_cpufreq_hw_target_index(struct cpufreq_policy *policy,
 	if (icc_scaling_enabled)
 		qcom_cpufreq_set_bw(policy, freq);
 
-	arch_set_freq_scale(policy->related_cpus, freq,
-			    policy->cpuinfo.max_freq);
 	return 0;
 }
 
@@ -113,7 +111,6 @@ static unsigned int qcom_cpufreq_hw_fast_switch(struct cpufreq_policy *policy,
 {
 	void __iomem *perf_state_reg = policy->driver_data;
 	int index;
-	unsigned long freq;
 
 	index = policy->cached_resolved_idx;
 	if (index < 0)
@@ -121,11 +118,7 @@ static unsigned int qcom_cpufreq_hw_fast_switch(struct cpufreq_policy *policy,
 
 	writel_relaxed(index, perf_state_reg);
 
-	freq = policy->freq_table[index].frequency;
-	arch_set_freq_scale(policy->related_cpus, freq,
-			    policy->cpuinfo.max_freq);
-
-	return freq;
+	return policy->freq_table[index].frequency;
 }
 
 static int qcom_cpufreq_hw_read_lut(struct device *cpu_dev,
diff --git a/drivers/cpufreq/scmi-cpufreq.c b/drivers/cpufreq/scmi-cpufreq.c
index fb42e3390377..6dd1311660b5 100644
--- a/drivers/cpufreq/scmi-cpufreq.c
+++ b/drivers/cpufreq/scmi-cpufreq.c
@@ -48,16 +48,11 @@ static unsigned int scmi_cpufreq_get_rate(unsigned int cpu)
 static int
 scmi_cpufreq_set_target(struct cpufreq_policy *policy, unsigned int index)
 {
-	int ret;
 	struct scmi_data *priv = policy->driver_data;
 	struct scmi_perf_ops *perf_ops = handle->perf_ops;
 	u64 freq = policy->freq_table[index].frequency;
 
-	ret = perf_ops->freq_set(handle, priv->domain_id, freq * 1000, false);
-	if (!ret)
-		arch_set_freq_scale(policy->related_cpus, freq,
-				    policy->cpuinfo.max_freq);
-	return ret;
+	return perf_ops->freq_set(handle, priv->domain_id, freq * 1000, false);
 }
 
 static unsigned int scmi_cpufreq_fast_switch(struct cpufreq_policy *policy,
@@ -67,11 +62,8 @@ static unsigned int scmi_cpufreq_fast_switch(struct cpufreq_policy *policy,
 	struct scmi_perf_ops *perf_ops = handle->perf_ops;
 
 	if (!perf_ops->freq_set(handle, priv->domain_id,
-				target_freq * 1000, true)) {
-		arch_set_freq_scale(policy->related_cpus, target_freq,
-				    policy->cpuinfo.max_freq);
+				target_freq * 1000, true))
 		return target_freq;
-	}
 
 	return 0;
 }
diff --git a/drivers/cpufreq/scpi-cpufreq.c b/drivers/cpufreq/scpi-cpufreq.c
index b0f5388b8854..43db05b949d9 100644
--- a/drivers/cpufreq/scpi-cpufreq.c
+++ b/drivers/cpufreq/scpi-cpufreq.c
@@ -47,9 +47,8 @@ static unsigned int scpi_cpufreq_get_rate(unsigned int cpu)
 static int
 scpi_cpufreq_set_target(struct cpufreq_policy *policy, unsigned int index)
 {
-	unsigned long freq = policy->freq_table[index].frequency;
+	u64 rate = policy->freq_table[index].frequency * 1000;
 	struct scpi_data *priv = policy->driver_data;
-	u64 rate = freq * 1000;
 	int ret;
 
 	ret = clk_set_rate(priv->clk, rate);
@@ -60,9 +59,6 @@ scpi_cpufreq_set_target(struct cpufreq_policy *policy, unsigned int index)
 	if (clk_get_rate(priv->clk) != rate)
 		return -EIO;
 
-	arch_set_freq_scale(policy->related_cpus, freq,
-			    policy->cpuinfo.max_freq);
-
 	return 0;
 }
 
diff --git a/drivers/cpufreq/vexpress-spc-cpufreq.c b/drivers/cpufreq/vexpress-spc-cpufreq.c
index 4e8b1dee7c9a..313bb9db369b 100644
--- a/drivers/cpufreq/vexpress-spc-cpufreq.c
+++ b/drivers/cpufreq/vexpress-spc-cpufreq.c
@@ -200,11 +200,6 @@ static int ve_spc_cpufreq_set_target(struct cpufreq_policy *policy,
 	ret = ve_spc_cpufreq_set_rate(cpu, actual_cluster, new_cluster,
 				      freqs_new);
 
-	if (!ret) {
-		arch_set_freq_scale(policy->related_cpus, freqs_new,
-				    policy->cpuinfo.max_freq);
-	}
-
 	return ret;
 }
 
-- 
2.17.1


^ permalink raw reply related

* [PATCH v2 0/7] cpufreq: improve frequency invariance support
From: Ionela Voinescu @ 2020-07-22  9:37 UTC (permalink / raw)
  To: rjw, viresh.kumar, dietmar.eggemann, catalin.marinas,
	sudeep.holla, will, linux
  Cc: mingo, peterz, linux-pm, linux-arm-kernel, linux-kernel,
	ionela.voinescu

Hi guys,

Please find below the changes to this series:

v1 -> v2:
 - v1 can be found at [1]
 - No cpufreq flags are introduced
 - Previous patches 2/8 and 3/8 were squashed in this series under 1/7,
   to ensure bisection.
 - 2/7 was introduced as a proposal for Viresh's suggestion to use
   policy->cur in the call to arch_set_freq_scale() and is extended to
   support drivers that implement the target() callback as well
 - Additional commit message changes are added to 1/7 and 2/7, to
   clarify that the definition of arch_set_freq_scale() will filter 
   settings of the scale factor, if unwanted
 - 3/7 disables setting of the scale factor for
   CONFIG_BL_SWITCHER, as Dietmar suggested
 - Small change introduced in 4/7 to disable cpufreq-based frequency
   invariance for the users of the default arch_set_freq_scale() call
   which will not actually set a scale factor
 - build issue solved (reported by 0day test)
 - v2 is based on linux-next 20200716
 - all functional tests in v1 were repeated for v2

[1] https://lore.kernel.org/lkml/20200701090751.7543-1-ionela.voinescu@arm.com/


Let me know what you think!

Thank you,
Ionela.

Ionela Voinescu (3):
  cpufreq: set invariance scale factor on transition end
  arch_topology: disable frequency invariance for CONFIG_BL_SWITCHER
  cpufreq: report whether cpufreq supports Frequency Invariance (FI)

Valentin Schneider (4):
  cpufreq: move invariance setter calls in cpufreq core
  arch_topology,cpufreq,sched/core: constify arch_* cpumasks
  arch_topology,arm,arm64: define arch_scale_freq_invariant()
  cpufreq: make schedutil the default for arm and arm64

 arch/arm/include/asm/topology.h        |  1 +
 arch/arm64/include/asm/topology.h      |  1 +
 arch/arm64/kernel/topology.c           |  9 ++++-
 drivers/base/arch_topology.c           | 12 +++++--
 drivers/cpufreq/Kconfig                |  2 +-
 drivers/cpufreq/cpufreq-dt.c           | 10 +-----
 drivers/cpufreq/cpufreq.c              | 46 ++++++++++++++++++++++++--
 drivers/cpufreq/qcom-cpufreq-hw.c      |  9 +----
 drivers/cpufreq/scmi-cpufreq.c         | 12 ++-----
 drivers/cpufreq/scpi-cpufreq.c         |  6 +---
 drivers/cpufreq/vexpress-spc-cpufreq.c |  5 ---
 include/linux/arch_topology.h          |  6 ++--
 include/linux/cpufreq.h                |  8 ++++-
 kernel/sched/core.c                    |  2 +-
 14 files changed, 81 insertions(+), 48 deletions(-)


base-commit: 4c43049f19a280329c1d01699f3cc8ad6910cbbe
-- 
2.17.1


^ permalink raw reply

* Re: [PATCH v2 04/11] x86/xen: add system core suspend and resume callbacks
From: Julien Grall @ 2020-07-22  9:08 UTC (permalink / raw)
  To: Anchal Agarwal, tglx, mingo, bp, hpa, x86, boris.ostrovsky,
	jgross, linux-pm, linux-mm, kamatam, sstabellini, konrad.wilk,
	roger.pau, axboe, davem, rjw, len.brown, pavel, peterz, eduval,
	sblbir, xen-devel, vkuznets, netdev, linux-kernel, dwmw, benh
In-Reply-To: <20200702182205.GA3531@dev-dsk-anchalag-2a-9c2d1d96.us-west-2.amazon.com>

Hi,

On 02/07/2020 19:22, Anchal Agarwal wrote:
> diff --git a/include/xen/xen-ops.h b/include/xen/xen-ops.h
> index 2521d6a306cd..9fa8a4082d68 100644
> --- a/include/xen/xen-ops.h
> +++ b/include/xen/xen-ops.h
> @@ -41,6 +41,8 @@ u64 xen_steal_clock(int cpu);
>   int xen_setup_shutdown_event(void);
>   
>   bool xen_is_xen_suspend(void);
> +void xen_setup_syscore_ops(void);

The function is only implemented and used by x86. So shouldn't this be 
declared in an x86 header?

Cheers,

-- 
Julien Grall

^ permalink raw reply

* Re: [PATCH 2/2] thermal: cpufreq_cooling: Reuse effective_cpu_util()
From: Viresh Kumar @ 2020-07-22  9:13 UTC (permalink / raw)
  To: Quentin Perret
  Cc: Ingo Molnar, Peter Zijlstra, Vincent Guittot, Zhang Rui,
	Daniel Lezcano, Amit Daniel Kachhap, Javi Merino, Amit Kucheria,
	linux-kernel, Rafael Wysocki, linux-pm
In-Reply-To: <20200717104314.GB2385870@google.com>

On 17-07-20, 11:43, Quentin Perret wrote:
> On Friday 17 Jul 2020 at 11:33:05 (+0100), Quentin Perret wrote:
> > On Friday 17 Jul 2020 at 11:14:38 (+0100), Quentin Perret wrote:
> > > On Tuesday 14 Jul 2020 at 12:06:53 (+0530), Viresh Kumar wrote:
> > > >  /**
> > > > - * get_load() - get load for a cpu since last updated
> > > > + * get_load() - get current load for a cpu
> > > >   * @cpufreq_cdev:	&struct cpufreq_cooling_device for this cpu
> > > >   * @cpu:	cpu number
> > > > - * @cpu_idx:	index of the cpu in time_in_idle*
> > > > + * @cpu_idx:	index of the cpu
> > > >   *
> > > > - * Return: The average load of cpu @cpu in percentage since this
> > > > - * function was last called.
> > > > + * Return: The current load of cpu @cpu in percentage.
> > > >   */
> > > >  static u32 get_load(struct cpufreq_cooling_device *cpufreq_cdev, int cpu,
> > > >  		    int cpu_idx)
> > > >  {
> > > > -	u32 load;
> > > > -	u64 now, now_idle, delta_time, delta_idle;
> > > > -	struct time_in_idle *idle_time = &cpufreq_cdev->idle_time[cpu_idx];
> > > > -
> > > > -	now_idle = get_cpu_idle_time(cpu, &now, 0);
> > > > -	delta_idle = now_idle - idle_time->time;
> > > > -	delta_time = now - idle_time->timestamp;
> > > > +	unsigned long util = cpu_util_cfs(cpu_rq(cpu));
> > > > +	unsigned long max = arch_scale_cpu_capacity(cpu);
> > > 
> > > Should we subtract the thermal PELT signal from max?

Makes sense to me, but I am not really good with this util and load
stuff and so would leave that to you guys to decide :)

-- 
viresh

^ permalink raw reply

* Re: [PATCH v2 01/11] xen/manage: keep track of the on-going suspend mode
From: Roger Pau Monné @ 2020-07-22  8:27 UTC (permalink / raw)
  To: Anchal Agarwal
  Cc: marmarek, Boris Ostrovsky, tglx, mingo, bp, hpa, x86, jgross,
	linux-pm, linux-mm, kamatam, sstabellini, konrad.wilk, axboe,
	davem, rjw, len.brown, pavel, peterz, eduval, sblbir, xen-devel,
	vkuznets, netdev, linux-kernel, dwmw, benh
In-Reply-To: <20200721195509.GA14682@dev-dsk-anchalag-2a-9c2d1d96.us-west-2.amazon.com>

On Tue, Jul 21, 2020 at 07:55:09PM +0000, Anchal Agarwal wrote:
> On Tue, Jul 21, 2020 at 10:30:18AM +0200, Roger Pau Monné wrote:
> > CAUTION: This email originated from outside of the organization. Do not click links or open attachments unless you can confirm the sender and know the content is safe.
> > 
> > 
> > 
> > Marek: I'm adding you in case you could be able to give this a try and
> > make sure it doesn't break suspend for dom0.
> > 
> > On Tue, Jul 21, 2020 at 12:17:36AM +0000, Anchal Agarwal wrote:
> > > On Mon, Jul 20, 2020 at 11:37:05AM +0200, Roger Pau Monné wrote:
> > > > CAUTION: This email originated from outside of the organization. Do not click links or open attachments unless you can confirm the sender and know the content is safe.
> > > >
> > > >
> > > >
> > > > On Sat, Jul 18, 2020 at 09:47:04PM -0400, Boris Ostrovsky wrote:
> > > > > (Roger, question for you at the very end)
> > > > >
> > > > > On 7/17/20 3:10 PM, Anchal Agarwal wrote:
> > > > > > On Wed, Jul 15, 2020 at 05:18:08PM -0400, Boris Ostrovsky wrote:
> > > > > >> CAUTION: This email originated from outside of the organization. Do not click links or open attachments unless you can confirm the sender and know the content is safe.
> > > > > >>
> > > > > >>
> > > > > >>
> > > > > >> On 7/15/20 4:49 PM, Anchal Agarwal wrote:
> > > > > >>> On Mon, Jul 13, 2020 at 11:52:01AM -0400, Boris Ostrovsky wrote:
> > > > > >>>> CAUTION: This email originated from outside of the organization. Do not click links or open attachments unless you can confirm the sender and know the content is safe.
> > > > > >>>>
> > > > > >>>>
> > > > > >>>>
> > > > > >>>> On 7/2/20 2:21 PM, Anchal Agarwal wrote:
> > > > > >>>> And PVH dom0.
> > > > > >>> That's another good use case to make it work with however, I still
> > > > > >>> think that should be tested/worked upon separately as the feature itself
> > > > > >>> (PVH Dom0) is very new.
> > > > > >>
> > > > > >> Same question here --- will this break PVH dom0?
> > > > > >>
> > > > > > I haven't tested it as a part of this series. Is that a blocker here?
> > > > >
> > > > >
> > > > > I suspect dom0 will not do well now as far as hibernation goes, in which
> > > > > case you are not breaking anything.
> > > > >
> > > > >
> > > > > Roger?
> > > >
> > > > I sadly don't have any box ATM that supports hibernation where I
> > > > could test it. We have hibernation support for PV dom0, so while I
> > > > haven't done anything specific to support or test hibernation on PVH
> > > > dom0 I would at least aim to not make this any worse, and hence the
> > > > check should at least also fail for a PVH dom0?
> > > >
> > > > if (!xen_hvm_domain() || xen_initial_domain())
> > > >     return -ENODEV;
> > > >
> > > > Ie: none of this should be applied to a PVH dom0, as it doesn't have
> > > > PV devices and hence should follow the bare metal device suspend.
> > > >
> > > So from what I understand you meant for any guest running on pvh dom0 should not
> > > hibernate if hibernation is triggered from within the guest or should they?
> > 
> > Er no to both I think. What I meant is that a PVH dom0 should be able
> > to properly suspend, and we should make sure this work doesn't make
> > this any harder (or breaks it if it's currently working).
> > 
> > Or at least that's how I understood the question raised by Boris.
> > 
> > You are adding code to the generic suspend path that's also used by dom0
> > in order to perform bare metal suspension. This is fine now for a PV
> > dom0 because the code is gated on xen_hvm_domain, but you should also
> > take into account that a PVH dom0 is considered a HVM domain, and
> > hence will get the notifier registered.
> >
> Ok that makes sense now. This is good to be safe, but my patch series is only to
> support domU hibernation, so I am not sure if this will affect pvh dom0.
> However, since I do not have a good way of testing sure I will add the check.
> 
> Moreover, in Patch-0004, I do register suspend/resume syscore_ops specifically for domU
> hibernation only if its xen_hvm_domain.

So if the hooks are only registered for domU, do you still need this
xen_hvm_domain check here?

I have to admit I'm not familiar with Linux PM suspend.

> I don't see any reason that should not
> be registered for domU's running on pvh dom0.

To be clear: it should be registered for all HVM domUs, regardless of
whether they are running on a PV or a PVH dom0. My intention was never
to suggest otherwise. It should be enabled for all HVM domUs, but
shouldn't be enabled for HVM dom0.

> Those suspend/resume callbacks will
> only be invoked in case hibernation and will be skipped if generic suspend path
> is in progress. Do you see any issue with that?

No, I think it's fine.

Roger.

^ permalink raw reply

* Re: [PATCH v2 01/11] xen/manage: keep track of the on-going suspend mode
From: Stefano Stabellini @ 2020-07-22  0:18 UTC (permalink / raw)
  To: Boris Ostrovsky
  Cc: Anchal Agarwal, tglx, mingo, bp, hpa, x86, jgross, linux-pm,
	linux-mm, kamatam, sstabellini, konrad.wilk, roger.pau, axboe,
	davem, rjw, len.brown, pavel, peterz, eduval, sblbir, xen-devel,
	vkuznets, netdev, linux-kernel, dwmw, benh
In-Reply-To: <408d3ce9-2510-2950-d28d-fdfe8ee41a54@oracle.com>

[-- Attachment #1: Type: text/plain, Size: 3720 bytes --]

On Tue, 21 Jul 2020, Boris Ostrovsky wrote:
> >>>>>> +static int xen_setup_pm_notifier(void)
> >>>>>> +{
> >>>>>> +     if (!xen_hvm_domain())
> >>>>>> +             return -ENODEV;
> >>>>>>
> >>>>>> I forgot --- what did we decide about non-x86 (i.e. ARM)?
> >>>>> It would be great to support that however, its  out of
> >>>>> scope for this patch set.
> >>>>> I’ll be happy to discuss it separately.
> >>>>
> >>>> I wasn't implying that this *should* work on ARM but rather whether this
> >>>> will break ARM somehow (because xen_hvm_domain() is true there).
> >>>>
> >>>>
> >>> Ok makes sense. TBH, I haven't tested this part of code on ARM and the series
> >>> was only support x86 guests hibernation.
> >>> Moreover, this notifier is there to distinguish between 2 PM
> >>> events PM SUSPEND and PM hibernation. Now since we only care about PM
> >>> HIBERNATION I may just remove this code and rely on "SHUTDOWN_SUSPEND" state.
> >>> However, I may have to fix other patches in the series where this check may
> >>> appear and cater it only for x86 right?
> >>
> >>
> >> I don't know what would happen if ARM guest tries to handle hibernation
> >> callbacks. The only ones that you are introducing are in block and net
> >> fronts and that's arch-independent.
> >>
> >>
> >> You do add a bunch of x86-specific code though (syscore ops), would
> >> something similar be needed for ARM?
> >>
> >>
> > I don't expect this to work out of the box on ARM. To start with something
> > similar will be needed for ARM too.
> > We may still want to keep the driver code as-is.
> > 
> > I understand the concern here wrt ARM, however, currently the support is only
> > proposed for x86 guests here and similar work could be carried out for ARM.
> > Also, if regular hibernation works correctly on arm, then all is needed is to
> > fix Xen side of things.
> > 
> > I am not sure what could be done to achieve any assurances on arm side as far as
> > this series is concerned.

Just to clarify: new features don't need to work on ARM or cause any
addition efforts to you to make them work on ARM. The patch series only
needs not to break existing code paths (on ARM and any other platforms).
It should also not make it overly difficult to implement the ARM side of
things (if there is one) at some point in the future.

FYI drivers/xen/manage.c is compiled and working on ARM today, however
Xen suspend/resume is not supported. I don't know for sure if
guest-initiated hibernation works because I have not tested it.

> If you are not sure what the effects are (or sure that it won't work) on
> ARM then I'd add IS_ENABLED(CONFIG_X86) check, i.e.
> 
> 
> if (!IS_ENABLED(CONFIG_X86) || !xen_hvm_domain())
> 	return -ENODEV;

That is a good principle to have and thanks for suggesting it. However,
in this specific case there is nothing in this patch that doesn't work
on ARM. From an ARM perspective I think we should enable it and
&xen_pm_notifier_block should be registered.

Given that all guests are HVM guests on ARM, it should work fine as is.

I gave a quick look at the rest of the series and everything looks fine
to me from an ARM perspective. I cannot imaging that the new freeze,
thaw, and restore callbacks for net and block are going to cause any
trouble on ARM. The two main x86-specific functions are
xen_syscore_suspend/resume and they look trivial to implement on ARM (in
the sense that they are likely going to look exactly the same.)

One question for Anchal: what's going to happen if you trigger a
hibernation, you have the new callbacks, but you are missing
xen_syscore_suspend/resume?

Is it any worse than not having the new freeze, thaw and restore
callbacks at all and try to do a hibernation?

^ permalink raw reply

* Re: [PATCH] cpufreq: intel_pstate: Implement passive mode with HWP enabled
From: Francisco Jerez @ 2020-07-21 23:14 UTC (permalink / raw)
  To: Srinivas Pandruvada, Rafael J. Wysocki
  Cc: Rafael J. Wysocki, Linux PM, Linux Documentation, LKML,
	Peter Zijlstra, Giovanni Gherdovich, Doug Smythies
In-Reply-To: <babeff29a60d3fadb5515eaf57f7bb42a1c9c792.camel@linux.intel.com>


[-- Attachment #1.1: Type: text/plain, Size: 2910 bytes --]

Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com> writes:

> On Mon, 2020-07-20 at 16:20 -0700, Francisco Jerez wrote:
>> "Rafael J. Wysocki" <rafael@kernel.org> writes:
>> 
>> > On Fri, Jul 17, 2020 at 2:21 AM Francisco Jerez <
>> > currojerez@riseup.net> wrote:
>> > > "Rafael J. Wysocki" <rafael@kernel.org> writes:
>> > > 
> {...]
>
>> > Overall, so far, I'm seeing a claim that the CPU subsystem can be
>> > made
>> > use less energy and do as much work as before (which is what
>> > improving
>> > the energy-efficiency means in general) if the maximum frequency of
>> > CPUs is limited in a clever way.
>> > 
>> > I'm failing to see what that clever way is, though.
>> Hopefully the clarifications above help some.
>
> To simplify:
>
> Suppose I called a function numpy.multiply() to multiply two big arrays
> and thread is a pegged to a CPU. Let's say it is causing CPU to
> finish the job in 10ms and it is using a P-State of 0x20. But the same
> job could have been done in 10ms even if it was using P-state of 0x16.
> So we are not energy efficient. To really know where is the bottle neck
> there are numbers of perf counters, may be cache was the issue, we
> could rather raise the uncore frequency a little. A simple APRF,MPERF
> counters are not enough. 

Yes, that's right, APERF and MPERF aren't sufficient to identify every
kind of possible bottleneck, some visibility of the utilization of other
subsystems is necessary in addition -- Like e.g the instrumentation
introduced in my series to detect a GPU bottleneck.  A bottleneck
condition in an IO device can be communicated to CPUFREQ by adjusting a
PM QoS latency request (link [2] in my previous reply) that effectively
gives the governor permission to rearrange CPU work arbitrarily within
the specified time frame (which should be of the order of the natural
latency of the IO device -- e.g. at least the rendering time of a frame
for a GPU) in order to minimize energy usage.

> or we characterize the workload at different P-states and set limits.
> I think this is not you want to say for energy efficiency with your
> changes. 
>
> The way you are trying to improve "performance" is by caller (device
> driver) to say how important my job at hand. Here device driver suppose
> offload this calculations to some GPU and can wait up to 10 ms, you
> want to tell CPU to be slow. But the p-state driver at a movement
> observes that there is a chance of overshoot of latency, it will
> immediately ask for higher P-state. So you want P-state limits based on
> the latency requirements of the caller. Since caller has more knowledge
> of latency requirement, this allows other devices sharing the power
> budget to get more or less power, and improve overall energy efficiency
> as the combined performance of system is improved.
> Is this correct?

Yes, pretty much.

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 227 bytes --]

^ permalink raw reply

* Re: [PATCH v2 01/11] xen/manage: keep track of the on-going suspend mode
From: Boris Ostrovsky @ 2020-07-21 21:48 UTC (permalink / raw)
  To: Anchal Agarwal
  Cc: tglx, mingo, bp, hpa, x86, jgross, linux-pm, linux-mm, kamatam,
	sstabellini, konrad.wilk, roger.pau, axboe, davem, rjw, len.brown,
	pavel, peterz, eduval, sblbir, xen-devel, vkuznets, netdev,
	linux-kernel, dwmw, benh
In-Reply-To: <20200721000348.GA19610@dev-dsk-anchalag-2a-9c2d1d96.us-west-2.amazon.com>


>>>>>> +static int xen_setup_pm_notifier(void)
>>>>>> +{
>>>>>> +     if (!xen_hvm_domain())
>>>>>> +             return -ENODEV;
>>>>>>
>>>>>> I forgot --- what did we decide about non-x86 (i.e. ARM)?
>>>>> It would be great to support that however, its  out of
>>>>> scope for this patch set.
>>>>> I’ll be happy to discuss it separately.
>>>>
>>>> I wasn't implying that this *should* work on ARM but rather whether this
>>>> will break ARM somehow (because xen_hvm_domain() is true there).
>>>>
>>>>
>>> Ok makes sense. TBH, I haven't tested this part of code on ARM and the series
>>> was only support x86 guests hibernation.
>>> Moreover, this notifier is there to distinguish between 2 PM
>>> events PM SUSPEND and PM hibernation. Now since we only care about PM
>>> HIBERNATION I may just remove this code and rely on "SHUTDOWN_SUSPEND" state.
>>> However, I may have to fix other patches in the series where this check may
>>> appear and cater it only for x86 right?
>>
>>
>> I don't know what would happen if ARM guest tries to handle hibernation
>> callbacks. The only ones that you are introducing are in block and net
>> fronts and that's arch-independent.
>>
>>
>> You do add a bunch of x86-specific code though (syscore ops), would
>> something similar be needed for ARM?
>>
>>
> I don't expect this to work out of the box on ARM. To start with something
> similar will be needed for ARM too.
> We may still want to keep the driver code as-is.
> 
> I understand the concern here wrt ARM, however, currently the support is only
> proposed for x86 guests here and similar work could be carried out for ARM.
> Also, if regular hibernation works correctly on arm, then all is needed is to
> fix Xen side of things.
> 
> I am not sure what could be done to achieve any assurances on arm side as far as
> this series is concerned.


If you are not sure what the effects are (or sure that it won't work) on
ARM then I'd add IS_ENABLED(CONFIG_X86) check, i.e.


if (!IS_ENABLED(CONFIG_X86) || !xen_hvm_domain())
	return -ENODEV;


(plus '|| xen_initial_domain()' for PVH dom0 as Roger suggested.)

-boris

^ permalink raw reply

* Re: [PATCH v2 01/11] xen/manage: keep track of the on-going suspend mode
From: Anchal Agarwal @ 2020-07-21 19:55 UTC (permalink / raw)
  To: Roger Pau Monné
  Cc: marmarek, Boris Ostrovsky, tglx, mingo, bp, hpa, x86, jgross,
	linux-pm, linux-mm, kamatam, sstabellini, konrad.wilk, axboe,
	davem, rjw, len.brown, pavel, peterz, eduval, sblbir, xen-devel,
	vkuznets, netdev, linux-kernel, dwmw, benh, anchalag
In-Reply-To: <20200721083018.GM7191@Air-de-Roger>

On Tue, Jul 21, 2020 at 10:30:18AM +0200, Roger Pau Monné wrote:
> CAUTION: This email originated from outside of the organization. Do not click links or open attachments unless you can confirm the sender and know the content is safe.
> 
> 
> 
> Marek: I'm adding you in case you could be able to give this a try and
> make sure it doesn't break suspend for dom0.
> 
> On Tue, Jul 21, 2020 at 12:17:36AM +0000, Anchal Agarwal wrote:
> > On Mon, Jul 20, 2020 at 11:37:05AM +0200, Roger Pau Monné wrote:
> > > CAUTION: This email originated from outside of the organization. Do not click links or open attachments unless you can confirm the sender and know the content is safe.
> > >
> > >
> > >
> > > On Sat, Jul 18, 2020 at 09:47:04PM -0400, Boris Ostrovsky wrote:
> > > > (Roger, question for you at the very end)
> > > >
> > > > On 7/17/20 3:10 PM, Anchal Agarwal wrote:
> > > > > On Wed, Jul 15, 2020 at 05:18:08PM -0400, Boris Ostrovsky wrote:
> > > > >> CAUTION: This email originated from outside of the organization. Do not click links or open attachments unless you can confirm the sender and know the content is safe.
> > > > >>
> > > > >>
> > > > >>
> > > > >> On 7/15/20 4:49 PM, Anchal Agarwal wrote:
> > > > >>> On Mon, Jul 13, 2020 at 11:52:01AM -0400, Boris Ostrovsky wrote:
> > > > >>>> CAUTION: This email originated from outside of the organization. Do not click links or open attachments unless you can confirm the sender and know the content is safe.
> > > > >>>>
> > > > >>>>
> > > > >>>>
> > > > >>>> On 7/2/20 2:21 PM, Anchal Agarwal wrote:
> > > > >>>> And PVH dom0.
> > > > >>> That's another good use case to make it work with however, I still
> > > > >>> think that should be tested/worked upon separately as the feature itself
> > > > >>> (PVH Dom0) is very new.
> > > > >>
> > > > >> Same question here --- will this break PVH dom0?
> > > > >>
> > > > > I haven't tested it as a part of this series. Is that a blocker here?
> > > >
> > > >
> > > > I suspect dom0 will not do well now as far as hibernation goes, in which
> > > > case you are not breaking anything.
> > > >
> > > >
> > > > Roger?
> > >
> > > I sadly don't have any box ATM that supports hibernation where I
> > > could test it. We have hibernation support for PV dom0, so while I
> > > haven't done anything specific to support or test hibernation on PVH
> > > dom0 I would at least aim to not make this any worse, and hence the
> > > check should at least also fail for a PVH dom0?
> > >
> > > if (!xen_hvm_domain() || xen_initial_domain())
> > >     return -ENODEV;
> > >
> > > Ie: none of this should be applied to a PVH dom0, as it doesn't have
> > > PV devices and hence should follow the bare metal device suspend.
> > >
> > So from what I understand you meant for any guest running on pvh dom0 should not
> > hibernate if hibernation is triggered from within the guest or should they?
> 
> Er no to both I think. What I meant is that a PVH dom0 should be able
> to properly suspend, and we should make sure this work doesn't make
> this any harder (or breaks it if it's currently working).
> 
> Or at least that's how I understood the question raised by Boris.
> 
> You are adding code to the generic suspend path that's also used by dom0
> in order to perform bare metal suspension. This is fine now for a PV
> dom0 because the code is gated on xen_hvm_domain, but you should also
> take into account that a PVH dom0 is considered a HVM domain, and
> hence will get the notifier registered.
>
Ok that makes sense now. This is good to be safe, but my patch series is only to
support domU hibernation, so I am not sure if this will affect pvh dom0.
However, since I do not have a good way of testing sure I will add the check.

Moreover, in Patch-0004, I do register suspend/resume syscore_ops specifically for domU
hibernation only if its xen_hvm_domain. I don't see any reason that should not
be registered for domU's running on pvh dom0. Those suspend/resume callbacks will
only be invoked in case hibernation and will be skipped if generic suspend path
is in progress. Do you see any issue with that?

> > > Also I would contact the QubesOS guys, they rely heavily on the
> > > suspend feature for dom0, and that's something not currently tested by
> > > osstest so any breakages there go unnoticed.
> > >
> > Was this for me or Boris? If its the former then I have no idea how to?
> 
> I've now added Marek.
> 
> Roger.
Anchal

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox