LinuxPPC-Dev Archive on lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH v2 1/2] cpuidle: Trace IPI based and timer based wakeup latency from idle states
From: Pratik Rajesh Sampat @ 2020-07-17  9:18 UTC (permalink / raw)
  To: rjw, daniel.lezcano, mpe, benh, paulus, srivatsa, shuah, npiggin,
	ego, svaidy, linux-pm, linuxppc-dev, linux-kernel,
	linux-kselftest
In-Reply-To: <20200717091801.29289-1-psampat@linux.ibm.com>

Fire directed smp_call_function_single IPIs from a specified source
CPU to the specified target CPU to reduce the noise we have to wade
through in the trace log.
The module is based on the idea written by Srivatsa Bhat and maintained
by Vaidyanathan Srinivasan internally.

Queue HR timer and measure jitter. Wakeup latency measurement for idle
states using hrtimer.  Echo a value in ns to timer_test_function and
watch trace. A HRtimer will be queued and when it fires the expected
wakeup vs actual wakeup is computes and delay printed in ns.

Implemented as a module which utilizes debugfs so that it can be
integrated with selftests.

To include the module, check option and include as module
kernel hacking -> Cpuidle latency selftests

[srivatsa.bhat@linux.vnet.ibm.com: Initial implementation in
 cpidle/sysfs]

[svaidy@linux.vnet.ibm.com: wakeup latency measurements using hrtimer
 and fix some of the time calculation]

[ego@linux.vnet.ibm.com: Fix some whitespace and tab errors and
 increase the resolution of IPI wakeup]

Signed-off-by: Pratik Rajesh Sampat <psampat@linux.ibm.com>
---
 drivers/cpuidle/Makefile               |   1 +
 drivers/cpuidle/test-cpuidle_latency.c | 150 +++++++++++++++++++++++++
 lib/Kconfig.debug                      |  10 ++
 3 files changed, 161 insertions(+)
 create mode 100644 drivers/cpuidle/test-cpuidle_latency.c

diff --git a/drivers/cpuidle/Makefile b/drivers/cpuidle/Makefile
index f07800cbb43f..2ae05968078c 100644
--- a/drivers/cpuidle/Makefile
+++ b/drivers/cpuidle/Makefile
@@ -8,6 +8,7 @@ obj-$(CONFIG_ARCH_NEEDS_CPU_IDLE_COUPLED) += coupled.o
 obj-$(CONFIG_DT_IDLE_STATES)		  += dt_idle_states.o
 obj-$(CONFIG_ARCH_HAS_CPU_RELAX)	  += poll_state.o
 obj-$(CONFIG_HALTPOLL_CPUIDLE)		  += cpuidle-haltpoll.o
+obj-$(CONFIG_IDLE_LATENCY_SELFTEST)	  += test-cpuidle_latency.o
 
 ##################################################################################
 # ARM SoC drivers
diff --git a/drivers/cpuidle/test-cpuidle_latency.c b/drivers/cpuidle/test-cpuidle_latency.c
new file mode 100644
index 000000000000..61574665e972
--- /dev/null
+++ b/drivers/cpuidle/test-cpuidle_latency.c
@@ -0,0 +1,150 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Module-based API test facility for cpuidle latency using IPIs and timers
+ */
+
+#include <linux/debugfs.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+
+/* IPI based wakeup latencies */
+struct latency {
+	unsigned int src_cpu;
+	unsigned int dest_cpu;
+	ktime_t time_start;
+	ktime_t time_end;
+	u64 latency_ns;
+} ipi_wakeup;
+
+static void measure_latency(void *info)
+{
+	struct latency *v;
+	ktime_t time_diff;
+
+	v = (struct latency *)info;
+	v->time_end = ktime_get();
+	time_diff = ktime_sub(v->time_end, v->time_start);
+	v->latency_ns = ktime_to_ns(time_diff);
+}
+
+void run_smp_call_function_test(unsigned int cpu)
+{
+	ipi_wakeup.src_cpu = smp_processor_id();
+	ipi_wakeup.dest_cpu = cpu;
+	ipi_wakeup.time_start = ktime_get();
+	smp_call_function_single(cpu, measure_latency, &ipi_wakeup, 1);
+}
+
+/* Timer based wakeup latencies */
+struct timer_data {
+	unsigned int src_cpu;
+	u64 timeout;
+	ktime_t time_start;
+	ktime_t time_end;
+	struct hrtimer timer;
+	u64 timeout_diff_ns;
+} timer_wakeup;
+
+static enum hrtimer_restart timer_called(struct hrtimer *hrtimer)
+{
+	struct timer_data *w;
+	ktime_t time_diff;
+
+	w = container_of(hrtimer, struct timer_data, timer);
+	w->time_end = ktime_get();
+
+	time_diff = ktime_sub(w->time_end, w->time_start);
+	time_diff = ktime_sub(time_diff, ns_to_ktime(w->timeout));
+	w->timeout_diff_ns = ktime_to_ns(time_diff);
+	return HRTIMER_NORESTART;
+}
+
+static void run_timer_test(unsigned int ns)
+{
+	hrtimer_init(&timer_wakeup.timer, CLOCK_MONOTONIC,
+		     HRTIMER_MODE_REL);
+	timer_wakeup.timer.function = timer_called;
+	timer_wakeup.time_start = ktime_get();
+	timer_wakeup.src_cpu = smp_processor_id();
+	timer_wakeup.timeout = ns;
+
+	hrtimer_start(&timer_wakeup.timer, ns_to_ktime(ns),
+		      HRTIMER_MODE_REL_PINNED);
+}
+
+static struct dentry *dir;
+
+static int cpu_read_op(void *data, u64 *value)
+{
+	*value = ipi_wakeup.dest_cpu;
+	return 0;
+}
+
+static int cpu_write_op(void *data, u64 value)
+{
+	run_smp_call_function_test(value);
+	return 0;
+}
+DEFINE_SIMPLE_ATTRIBUTE(ipi_ops, cpu_read_op, cpu_write_op, "%llu\n");
+
+static int timeout_read_op(void *data, u64 *value)
+{
+	*value = timer_wakeup.timeout;
+	return 0;
+}
+
+static int timeout_write_op(void *data, u64 value)
+{
+	run_timer_test(value);
+	return 0;
+}
+DEFINE_SIMPLE_ATTRIBUTE(timeout_ops, timeout_read_op, timeout_write_op, "%llu\n");
+
+static int __init latency_init(void)
+{
+	struct dentry *temp;
+
+	dir = debugfs_create_dir("latency_test", 0);
+	if (!dir) {
+		pr_alert("latency_test: failed to create /sys/kernel/debug/latency_test\n");
+		return -1;
+	}
+	temp = debugfs_create_file("ipi_cpu_dest",
+				   0666,
+				   dir,
+				   NULL,
+				   &ipi_ops);
+	if (!temp) {
+		pr_alert("latency_test: failed to create /sys/kernel/debug/ipi_cpu_dest\n");
+		return -1;
+	}
+	debugfs_create_u64("ipi_latency_ns", 0444, dir, &ipi_wakeup.latency_ns);
+	debugfs_create_u32("ipi_cpu_src", 0444, dir, &ipi_wakeup.src_cpu);
+
+	temp = debugfs_create_file("timeout_expected_ns",
+				   0666,
+				   dir,
+				   NULL,
+				   &timeout_ops);
+	if (!temp) {
+		pr_alert("latency_test: failed to create /sys/kernel/debug/timeout_expected_ns\n");
+		return -1;
+	}
+	debugfs_create_u64("timeout_diff_ns", 0444, dir, &timer_wakeup.timeout_diff_ns);
+	debugfs_create_u32("timeout_cpu_src", 0444, dir, &timer_wakeup.src_cpu);
+	pr_info("Latency Test module loaded\n");
+	return 0;
+}
+
+static void __exit latency_cleanup(void)
+{
+	pr_info("Cleaning up Latency Test module.\n");
+	debugfs_remove_recursive(dir);
+}
+
+module_init(latency_init);
+module_exit(latency_cleanup);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("IBM Corporation");
+MODULE_DESCRIPTION("Measuring idle latency for IPIs and Timers");
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index d74ac0fd6b2d..e2283790245a 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -1375,6 +1375,16 @@ config DEBUG_KOBJECT
 	  If you say Y here, some extra kobject debugging messages will be sent
 	  to the syslog.
 
+config IDLE_LATENCY_SELFTEST
+	tristate "Cpuidle latency selftests"
+	depends on CPU_IDLE
+	help
+	  This option provides a kernel module that runs tests using the IPI and
+	  timers to measure latency.
+
+	  Say M if you want these self tests to build as a module.
+	  Say N if you are unsure.
+
 config DEBUG_KOBJECT_RELEASE
 	bool "kobject release debugging"
 	depends on DEBUG_OBJECTS_TIMERS
-- 
2.25.4


^ permalink raw reply related

* [PATCH v2 0/2] Selftest for cpuidle latency measurement
From: Pratik Rajesh Sampat @ 2020-07-17  9:17 UTC (permalink / raw)
  To: rjw, daniel.lezcano, mpe, benh, paulus, srivatsa, shuah, npiggin,
	ego, svaidy, linux-pm, linuxppc-dev, linux-kernel,
	linux-kselftest

v1: https://lkml.org/lkml/2020/7/7/1036
Changelog v1 --> v2
1. Based on Shuah Khan's comment, changed exit code to ksft_skip to
   indicate the test is being skipped
2. Change the busy workload for baseline measurement from
   "yes > /dev/null" to "cat /dev/random to /dev/null", based on
   observed CPU utilization for "yes" consuming ~60% CPU while the
   latter consumes 100% of CPUs, giving more accurate baseline numbers
---

The patch series introduces a mechanism to measure wakeup latency for
IPI and timer based interrupts
The motivation behind this series is to find significant deviations
behind advertised latency and resisdency values

To achieve this, we introduce a kernel module and expose its control
knobs through the debugfs interface that the selftests can engage with.

The kernel module provides the following interfaces within
/sys/kernel/debug/latency_test/ for,
1. IPI test:
  ipi_cpu_dest   # Destination CPU for the IPI
  ipi_cpu_src    # Origin of the IPI
  ipi_latency_ns # Measured latency time in ns
2. Timeout test:
  timeout_cpu_src     # CPU on which the timer to be queued
  timeout_expected_ns # Timer duration
  timeout_diff_ns     # Difference of actual duration vs expected timer
To include the module, check option and include as module
kernel hacking -> Cpuidle latency selftests

The selftest inserts the module, disables all the idle states and
enables them one by one testing the following:
1. Keeping source CPU constant, iterates through all the CPUS measuring
   IPI latency for baseline (CPU is busy with
   "cat /dev/random > /dev/null" workload) and the when the CPU is
   allowed to be at rest
2. Iterating through all the CPUs, sending expected timer durations to
   be equivalent to the residency of the the deepest idle state
   enabled and extracting the difference in time between the time of
   wakeup and the expected timer duration

Usage
-----
Can be used in conjuction to the rest of the selftests.
Default Output location in: tools/testing/cpuidle/cpuidle.log

To run this test specifically:
$ make -C tools/testing/selftests TARGETS="cpuidle" run_tests

There are a few optinal arguments too that the script can take
	[-h <help>]
	[-m <location of the module>]
	[-o <location of the output>]

Sample output snippet
---------------------
--IPI Latency Test---
--Baseline IPI Latency measurement: CPU Busy--
SRC_CPU   DEST_CPU IPI_Latency(ns)
...
0            8         1996
0            9         2125
0           10         1264
0           11         1788
0           12         2045
Baseline Average IPI latency(ns): 1843
---Enabling state: 5---
SRC_CPU   DEST_CPU IPI_Latency(ns)
0            8       621719
0            9       624752
0           10       622218
0           11       623968
0           12       621303
Expected IPI latency(ns): 100000
Observed Average IPI latency(ns): 622792

--Timeout Latency Test--
--Baseline Timeout Latency measurement: CPU Busy--
Wakeup_src Baseline_delay(ns) 
...
8            2249
9            2226
10           2211
11           2183
12           2263
Baseline Average timeout diff(ns): 2226
---Enabling state: 5---
8           10749                   
9           10911                   
10          10912                   
11          12100                   
12          73276                   
Expected timeout(ns): 10000200
Observed Average timeout diff(ns): 23589

Pratik Rajesh Sampat (2):
  cpuidle: Trace IPI based and timer based wakeup latency from idle
    states
  selftest/cpuidle: Add support for cpuidle latency measurement

 drivers/cpuidle/Makefile                   |   1 +
 drivers/cpuidle/test-cpuidle_latency.c     | 150 ++++++++++++
 lib/Kconfig.debug                          |  10 +
 tools/testing/selftests/Makefile           |   1 +
 tools/testing/selftests/cpuidle/Makefile   |   6 +
 tools/testing/selftests/cpuidle/cpuidle.sh | 257 +++++++++++++++++++++
 tools/testing/selftests/cpuidle/settings   |   1 +
 7 files changed, 426 insertions(+)
 create mode 100644 drivers/cpuidle/test-cpuidle_latency.c
 create mode 100644 tools/testing/selftests/cpuidle/Makefile
 create mode 100755 tools/testing/selftests/cpuidle/cpuidle.sh
 create mode 100644 tools/testing/selftests/cpuidle/settings

-- 
2.25.4


^ permalink raw reply

* linux-next: manual merge of the set_fs tree with the powerpc tree
From: Stephen Rothwell @ 2020-07-17  9:09 UTC (permalink / raw)
  To: Christoph Hellwig, Michael Ellerman, PowerPC
  Cc: Nathan Lynch, Linux Next Mailing List, Linux Kernel Mailing List

[-- Attachment #1: Type: text/plain, Size: 819 bytes --]

Hi all,

Today's linux-next merge of the set_fs tree got a conflict in:

  arch/powerpc/mm/numa.c

between commit:

  c30f931e891e ("powerpc/numa: remove ability to enable topology updates")

from the powerpc tree and commit:

  16a04bde8169 ("proc: switch over direct seq_read method calls to seq_read_iter")

from the set_fs tree.

I fixed it up (the former removed the code updated by the latter, so I
just did that) and can carry the fix as necessary. This is now fixed as
far as linux-next is concerned, but any non trivial conflicts should be
mentioned to your upstream maintainer when your tree is submitted for
merging.  You may also want to consider cooperating with the maintainer
of the conflicting tree to minimise any particularly complex conflicts.

-- 
Cheers,
Stephen Rothwell

[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 488 bytes --]

^ permalink raw reply

* Re: [PATCH v2 2/5] powerpc/lib: Initialize a temporary mm for code patching
From: kernel test robot @ 2020-07-17  8:57 UTC (permalink / raw)
  To: Christopher M. Riedl, linuxppc-dev
  Cc: clang-built-linux, kbuild-all, kernel-hardening
In-Reply-To: <20200709040316.12789-3-cmr@informatik.wtf>

[-- Attachment #1: Type: text/plain, Size: 2895 bytes --]

Hi "Christopher,

Thank you for the patch! Perhaps something to improve:

[auto build test WARNING on powerpc/next]
[also build test WARNING on char-misc/char-misc-testing v5.8-rc5 next-20200716]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch]

url:    https://github.com/0day-ci/linux/commits/Christopher-M-Riedl/Use-per-CPU-temporary-mappings-for-patching/20200709-123827
base:   https://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux.git next
config: powerpc-randconfig-r013-20200717 (attached as .config)
compiler: clang version 12.0.0 (https://github.com/llvm/llvm-project ed6b578040a85977026c93bf4188f996148f3218)
reproduce (this is a W=1 build):
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # install powerpc cross compiling tool for clang build
        # apt-get install binutils-powerpc-linux-gnu
        # save the attached .config to linux build tree
        COMPILER_INSTALL_PATH=$HOME/0day COMPILER=clang make.cross ARCH=powerpc 

If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot <lkp@intel.com>

All warnings (new ones prefixed by >>):

>> arch/powerpc/lib/code-patching.c:53:13: warning: no previous prototype for function 'poking_init' [-Wmissing-prototypes]
   void __init poking_init(void)
               ^
   arch/powerpc/lib/code-patching.c:53:1: note: declare 'static' if the function is not intended to be used outside of this translation unit
   void __init poking_init(void)
   ^
   static 
   1 warning generated.

vim +/poking_init +53 arch/powerpc/lib/code-patching.c

    52	
  > 53	void __init poking_init(void)
    54	{
    55		spinlock_t *ptl; /* for protecting pte table */
    56		pte_t *ptep;
    57	
    58		/*
    59		 * Some parts of the kernel (static keys for example) depend on
    60		 * successful code patching. Code patching under STRICT_KERNEL_RWX
    61		 * requires this setup - otherwise we cannot patch at all. We use
    62		 * BUG_ON() here and later since an early failure is preferred to
    63		 * buggy behavior and/or strange crashes later.
    64		 */
    65		patching_mm = copy_init_mm();
    66		BUG_ON(!patching_mm);
    67	
    68		/*
    69		 * In hash we cannot go above DEFAULT_MAP_WINDOW easily.
    70		 * XXX: Do we want additional bits of entropy for radix?
    71		 */
    72		patching_addr = (get_random_long() & PAGE_MASK) %
    73			(DEFAULT_MAP_WINDOW - PAGE_SIZE);
    74	
    75		ptep = get_locked_pte(patching_mm, patching_addr, &ptl);
    76		BUG_ON(!ptep);
    77		pte_unmap_unlock(ptep, ptl);
    78	}
    79	

---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/kbuild-all@lists.01.org

[-- Attachment #2: .config.gz --]
[-- Type: application/gzip, Size: 30733 bytes --]

^ permalink raw reply

* Re: [PATCH v3] powerpc/pseries: detect secure and trusted boot state of the system.
From: Michal Suchánek @ 2020-07-17  8:35 UTC (permalink / raw)
  To: Daniel Axtens; +Cc: linuxppc-dev, Nayna Jain, linux-kernel, Mimi Zohar
In-Reply-To: <87pn8uu1hy.fsf@dja-thinkpad.axtens.net>

On Fri, Jul 17, 2020 at 03:58:01PM +1000, Daniel Axtens wrote:
> Michal Suchánek <msuchanek@suse.de> writes:
> 
> > On Wed, Jul 15, 2020 at 07:52:01AM -0400, Nayna Jain wrote:
> >> The device-tree property to check secure and trusted boot state is
> >> different for guests(pseries) compared to baremetal(powernv).
> >> 
> >> This patch updates the existing is_ppc_secureboot_enabled() and
> >> is_ppc_trustedboot_enabled() functions to add support for pseries.
> >> 
> >> The secureboot and trustedboot state are exposed via device-tree property:
> >> /proc/device-tree/ibm,secure-boot and /proc/device-tree/ibm,trusted-boot
> >> 
> >> The values of ibm,secure-boot under pseries are interpreted as:
> >                                       ^^^
> >> 
> >> 0 - Disabled
> >> 1 - Enabled in Log-only mode. This patch interprets this value as
> >> disabled, since audit mode is currently not supported for Linux.
> >> 2 - Enabled and enforced.
> >> 3-9 - Enabled and enforcing; requirements are at the discretion of the
> >> operating system.
> >> 
> >> The values of ibm,trusted-boot under pseries are interpreted as:
> >                                        ^^^
> > These two should be different I suppose?
> 
> I'm not quite sure what you mean? They'll be documented in a future
> revision of the PAPR, once I get my act together and submit the
> relevant internal paperwork.

Nevermind, one talks about secure boot, the other about trusted boot.

Thanks

Michal

^ permalink raw reply

* Re: [PATCH 11/11] powerpc/smp: Provide an ability to disable coregroup
From: Gautham R Shenoy @ 2020-07-17  8:28 UTC (permalink / raw)
  To: Srikar Dronamraju
  Cc: Nathan Lynch, Gautham R Shenoy, Oliver OHalloran, Michael Neuling,
	Michael Ellerman, Anton Blanchard, linuxppc-dev, Nick Piggin
In-Reply-To: <20200714043624.5648-12-srikar@linux.vnet.ibm.com>

On Tue, Jul 14, 2020 at 10:06:24AM +0530, Srikar Dronamraju wrote:
> If user wants to enable coregroup sched_domain then they can boot with
> kernel parameter "coregroup_support=on"
> 
> Cc: linuxppc-dev <linuxppc-dev@lists.ozlabs.org>
> Cc: Michael Ellerman <michaele@au1.ibm.com>
> Cc: Nick Piggin <npiggin@au1.ibm.com>
> Cc: Oliver OHalloran <oliveroh@au1.ibm.com>
> Cc: Nathan Lynch <nathanl@linux.ibm.com>
> Cc: Michael Neuling <mikey@linux.ibm.com>
> Cc: Anton Blanchard <anton@au1.ibm.com>
> Cc: Gautham R Shenoy <ego@linux.vnet.ibm.com>
> Cc: Vaidyanathan Srinivasan <svaidy@linux.ibm.com>
> Signed-off-by: Srikar Dronamraju <srikar@linux.vnet.ibm.com>


We need this documented in the
Documentation/admin-guide/kernel-parameters.txt

Other than that, the patch looks good to me.
Reviewed-by: Gautham R. Shenoy <ego@linux.vnet.ibm.com>

> ---
>  arch/powerpc/kernel/smp.c | 19 ++++++++++++++++++-
>  1 file changed, 18 insertions(+), 1 deletion(-)
> 
> diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
> index bb25c13bbb79..c43909e6e8e9 100644
> --- a/arch/powerpc/kernel/smp.c
> +++ b/arch/powerpc/kernel/smp.c
> @@ -118,6 +118,23 @@ struct smp_ops_t *smp_ops;
>  volatile unsigned int cpu_callin_map[NR_CPUS];
> 
>  int smt_enabled_at_boot = 1;
> +int coregroup_support;
> +
> +static int __init early_coregroup(char *p)
> +{
> +	if (!p)
> +		return 0;
> +
> +	if (strstr(p, "on"))
> +		coregroup_support = 1;
> +
> +	if (strstr(p, "1"))
> +		coregroup_support = 1;
> +
> +	return 0;
> +}
> +
> +early_param("coregroup_support", early_coregroup);
> 
>  /*
>   * Returns 1 if the specified cpu should be brought up during boot.
> @@ -878,7 +895,7 @@ static struct cpumask *cpu_coregroup_mask(int cpu)
> 
>  static bool has_coregroup_support(void)
>  {
> -	return coregroup_enabled;
> +	return coregroup_enabled && coregroup_support;
>  }
> 
>  static const struct cpumask *cpu_mc_mask(int cpu)
> -- 
> 2.17.1
> 

^ permalink raw reply

* Re: [PATCH 10/11] powerpc/smp: Implement cpu_to_coregroup_id
From: Gautham R Shenoy @ 2020-07-17  8:26 UTC (permalink / raw)
  To: Srikar Dronamraju
  Cc: Nathan Lynch, Gautham R Shenoy, Oliver OHalloran, Michael Neuling,
	Michael Ellerman, Anton Blanchard, linuxppc-dev, Nick Piggin
In-Reply-To: <20200714043624.5648-11-srikar@linux.vnet.ibm.com>

On Tue, Jul 14, 2020 at 10:06:23AM +0530, Srikar Dronamraju wrote:
> Lookup the coregroup id from the associativity array.
> 
> If unable to detect the coregroup id, fallback on the core id.
> This way, ensure sched_domain degenerates and an extra sched domain is
> not created.
> 
> Ideally this function should have been implemented in
> arch/powerpc/kernel/smp.c. However if its implemented in mm/numa.c, we
> don't need to find the primary domain again.
> 
> If the device-tree mentions more than one coregroup, then kernel
> implements only the last or the smallest coregroup, which currently
> corresponds to the penultimate domain in the device-tree.
> 
> Cc: linuxppc-dev <linuxppc-dev@lists.ozlabs.org>
> Cc: Michael Ellerman <michaele@au1.ibm.com>
> Cc: Nick Piggin <npiggin@au1.ibm.com>
> Cc: Oliver OHalloran <oliveroh@au1.ibm.com>
> Cc: Nathan Lynch <nathanl@linux.ibm.com>
> Cc: Michael Neuling <mikey@linux.ibm.com>
> Cc: Anton Blanchard <anton@au1.ibm.com>
> Cc: Gautham R Shenoy <ego@linux.vnet.ibm.com>
> Cc: Vaidyanathan Srinivasan <svaidy@linux.ibm.com>
> Signed-off-by: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
> ---
>  arch/powerpc/mm/numa.c | 17 +++++++++++++++++
>  1 file changed, 17 insertions(+)
> 
> diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
> index d9ab9da85eab..4e85564ef62a 100644
> --- a/arch/powerpc/mm/numa.c
> +++ b/arch/powerpc/mm/numa.c
> @@ -1697,6 +1697,23 @@ static const struct proc_ops topology_proc_ops = {
> 
>  int cpu_to_coregroup_id(int cpu)
>  {
> +	__be32 associativity[VPHN_ASSOC_BUFSIZE] = {0};
> +	int index;
> +

It would be good to have an assert here to ensure that we are calling
this function only when coregroups are enabled.

Else, we may end up returning the penultimate index which maps to the
chip-id.



> +	if (cpu < 0 || cpu > nr_cpu_ids)
> +		return -1;
> +
> +	if (!firmware_has_feature(FW_FEATURE_VPHN))
> +		goto out;
> +
> +	if (vphn_get_associativity(cpu, associativity))
> +		goto out;
> +
> +	index = of_read_number(associativity, 1);
> +	if ((index > min_common_depth + 1) && coregroup_enabled)
> +		return of_read_number(&associativity[index - 1], 1);
> +
> +out:
>  	return cpu_to_core_id(cpu);
>  }
> 
> -- 
> 2.17.1
> 

^ permalink raw reply

* Re: [PATCH 09/11] Powerpc/smp: Create coregroup domain
From: Gautham R Shenoy @ 2020-07-17  8:23 UTC (permalink / raw)
  To: Gautham R Shenoy
  Cc: Nathan Lynch, Oliver OHalloran, Michael Neuling,
	Srikar Dronamraju, Michael Ellerman, Anton Blanchard,
	linuxppc-dev, Nick Piggin
In-Reply-To: <20200717081926.GD32531@in.ibm.com>

On Fri, Jul 17, 2020 at 01:49:26PM +0530, Gautham R Shenoy wrote:

> > +int cpu_to_coregroup_id(int cpu)
> > +{
> > +	return cpu_to_core_id(cpu);
> > +}
> 
> 
> So, if has_coregroup_support() returns true, then since the core_group
> identification is currently done through the core-id, the
> coregroup_mask is going to be the same as the
> cpu_core_mask/cpu_cpu_mask. Thus, we will be degenerating the DIE
> domain. Right ? Instead we could have kept the core-group to be a
> single bigcore by default, so that those domains can get degenerated
> preserving the legacy SMT, DIE, NUMA hierarchy.

Never mind, got confused with core_id and cpu_core_mask (which
corresponds to the cores of a chip).  cpu_to_core_id() does exactly
what we have described above. Sorry for the noise.

I am ok with this patch, except that I would request if all the
changes to the topology structure be done within a single function for
ease of tracking it instead of distributing it all over the code.


> 
> 
> 
> 
> > +
> >  static int topology_update_init(void)
> >  {
> >  	start_topology_update();
> > -- 
> > 2.17.1
> > 

^ permalink raw reply

* Re: [PATCH 09/11] Powerpc/smp: Create coregroup domain
From: Gautham R Shenoy @ 2020-07-17  8:19 UTC (permalink / raw)
  To: Srikar Dronamraju
  Cc: Nathan Lynch, Gautham R Shenoy, Oliver OHalloran, Michael Neuling,
	Michael Ellerman, Anton Blanchard, linuxppc-dev, Nick Piggin
In-Reply-To: <20200714043624.5648-10-srikar@linux.vnet.ibm.com>

On Tue, Jul 14, 2020 at 10:06:22AM +0530, Srikar Dronamraju wrote:
> Add percpu coregroup maps and masks to create coregroup domain.
> If a coregroup doesn't exist, the coregroup domain will be degenerated
> in favour of SMT/CACHE domain.
> 
> Cc: linuxppc-dev <linuxppc-dev@lists.ozlabs.org>
> Cc: Michael Ellerman <michaele@au1.ibm.com>
> Cc: Nick Piggin <npiggin@au1.ibm.com>
> Cc: Oliver OHalloran <oliveroh@au1.ibm.com>
> Cc: Nathan Lynch <nathanl@linux.ibm.com>
> Cc: Michael Neuling <mikey@linux.ibm.com>
> Cc: Anton Blanchard <anton@au1.ibm.com>
> Cc: Gautham R Shenoy <ego@linux.vnet.ibm.com>
> Cc: Vaidyanathan Srinivasan <svaidy@linux.ibm.com>
> Signed-off-by: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
> ---
>  arch/powerpc/include/asm/topology.h | 10 ++++++++
>  arch/powerpc/kernel/smp.c           | 37 +++++++++++++++++++++++++++++
>  arch/powerpc/mm/numa.c              |  5 ++++
>  3 files changed, 52 insertions(+)
> 
> diff --git a/arch/powerpc/include/asm/topology.h b/arch/powerpc/include/asm/topology.h
> index 2db7ba789720..34812c35018e 100644
> --- a/arch/powerpc/include/asm/topology.h
> +++ b/arch/powerpc/include/asm/topology.h
> @@ -98,6 +98,7 @@ extern int stop_topology_update(void);
>  extern int prrn_is_enabled(void);
>  extern int find_and_online_cpu_nid(int cpu);
>  extern int timed_topology_update(int nsecs);
> +extern int cpu_to_coregroup_id(int cpu);
>  #else
>  static inline int start_topology_update(void)
>  {
> @@ -120,6 +121,15 @@ static inline int timed_topology_update(int nsecs)
>  	return 0;
>  }
> 
> +static inline int cpu_to_coregroup_id(int cpu)
> +{
> +#ifdef CONFIG_SMP
> +	return cpu_to_core_id(cpu);
> +#else
> +	return 0;
> +#endif
> +}
> +
>  #endif /* CONFIG_NUMA && CONFIG_PPC_SPLPAR */
> 
>  #include <asm-generic/topology.h>
> diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
> index ef19eeccd21e..bb25c13bbb79 100644
> --- a/arch/powerpc/kernel/smp.c
> +++ b/arch/powerpc/kernel/smp.c
> @@ -80,6 +80,7 @@ DEFINE_PER_CPU(cpumask_var_t, cpu_sibling_map);
>  DEFINE_PER_CPU(cpumask_var_t, cpu_smallcore_map);
>  DEFINE_PER_CPU(cpumask_var_t, cpu_l2_cache_map);
>  DEFINE_PER_CPU(cpumask_var_t, cpu_core_map);
> +DEFINE_PER_CPU(cpumask_var_t, cpu_coregroup_map);
> 
>  EXPORT_PER_CPU_SYMBOL(cpu_sibling_map);
>  EXPORT_PER_CPU_SYMBOL(cpu_l2_cache_map);
> @@ -91,6 +92,7 @@ enum {
>  	smt_idx,
>  #endif
>  	bigcore_idx,
> +	mc_idx,
>  	die_idx,
>  };
> 
> @@ -869,6 +871,21 @@ static const struct cpumask *smallcore_smt_mask(int cpu)
>  }
>  #endif
> 
> +static struct cpumask *cpu_coregroup_mask(int cpu)
> +{
> +	return per_cpu(cpu_coregroup_map, cpu);
> +}
> +
> +static bool has_coregroup_support(void)
> +{
> +	return coregroup_enabled;
> +}
> +
> +static const struct cpumask *cpu_mc_mask(int cpu)
> +{
> +	return cpu_coregroup_mask(cpu);
> +}
> +
>  static const struct cpumask *cpu_bigcore_mask(int cpu)
>  {
>  	return cpu_core_mask(cpu);
> @@ -879,6 +896,7 @@ static struct sched_domain_topology_level powerpc_topology[] = {
>  	{ cpu_smt_mask, powerpc_smt_flags, SD_INIT_NAME(SMT) },
>  #endif
>  	{ cpu_bigcore_mask, SD_INIT_NAME(BIGCORE) },
> +	{ cpu_mc_mask, SD_INIT_NAME(MC) },
>  	{ cpu_cpu_mask, SD_INIT_NAME(DIE) },
>  	{ NULL, },
>  };
> @@ -933,6 +951,10 @@ void __init smp_prepare_cpus(unsigned int max_cpus)
>  					GFP_KERNEL, node);
>  		zalloc_cpumask_var_node(&per_cpu(cpu_core_map, cpu),
>  					GFP_KERNEL, node);
> +		if (has_coregroup_support())
> +			zalloc_cpumask_var_node(&per_cpu(cpu_coregroup_map, cpu),
> +						GFP_KERNEL, node);
> +
>  #ifdef CONFIG_NEED_MULTIPLE_NODES
>  		/*
>  		 * numa_node_id() works after this.
> @@ -950,6 +972,11 @@ void __init smp_prepare_cpus(unsigned int max_cpus)
>  	cpumask_set_cpu(boot_cpuid, cpu_l2_cache_mask(boot_cpuid));
>  	cpumask_set_cpu(boot_cpuid, cpu_core_mask(boot_cpuid));
> 
> +	if (has_coregroup_support())
> +		cpumask_set_cpu(boot_cpuid, cpu_coregroup_mask(boot_cpuid));
> +	else
> +		powerpc_topology[mc_idx].mask = cpu_bigcore_mask;
> +

The else part could be moved to the common function where we are
modifying the other attributes of the topology array.


>  	init_big_cores();
>  	if (has_big_cores) {
>  		cpumask_set_cpu(boot_cpuid,
> @@ -1241,6 +1268,8 @@ static void remove_cpu_from_masks(int cpu)
>  		set_cpus_unrelated(cpu, i, cpu_sibling_mask);
>  		if (has_big_cores)
>  			set_cpus_unrelated(cpu, i, cpu_smallcore_mask);
> +		if (has_coregroup_support())
> +			set_cpus_unrelated(cpu, i, cpu_coregroup_mask);
>  	}
>  }
>  #endif
> @@ -1301,6 +1330,14 @@ static void add_cpu_to_masks(int cpu)
>  	add_cpu_to_smallcore_masks(cpu);
>  	update_mask_by_l2(cpu, cpu_l2_cache_mask);
> 
> +	if (has_coregroup_support()) {
> +		cpumask_set_cpu(cpu, cpu_coregroup_mask(cpu));
> +		for_each_cpu(i, cpu_online_mask) {
> +			if (cpu_to_coregroup_id(cpu) == cpu_to_coregroup_id(i))
> +				set_cpus_related(cpu, i, cpu_coregroup_mask);
> +		}
> +	}
> +
>  	if (pkg_id == -1) {
>  		struct cpumask *(*mask)(int) = cpu_sibling_mask;
> 
> diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
> index a43eab455be4..d9ab9da85eab 100644
> --- a/arch/powerpc/mm/numa.c
> +++ b/arch/powerpc/mm/numa.c
> @@ -1695,6 +1695,11 @@ static const struct proc_ops topology_proc_ops = {
>  	.proc_release	= single_release,
>  };
> 
> +int cpu_to_coregroup_id(int cpu)
> +{
> +	return cpu_to_core_id(cpu);
> +}


So, if has_coregroup_support() returns true, then since the core_group
identification is currently done through the core-id, the
coregroup_mask is going to be the same as the
cpu_core_mask/cpu_cpu_mask. Thus, we will be degenerating the DIE
domain. Right ? Instead we could have kept the core-group to be a
single bigcore by default, so that those domains can get degenerated
preserving the legacy SMT, DIE, NUMA hierarchy.




> +
>  static int topology_update_init(void)
>  {
>  	start_topology_update();
> -- 
> 2.17.1
> 

^ permalink raw reply

* Re: [PATCH 08/11] powerpc/smp: Allocate cpumask only after searching thread group
From: Gautham R Shenoy @ 2020-07-17  8:08 UTC (permalink / raw)
  To: Srikar Dronamraju
  Cc: Nathan Lynch, Gautham R Shenoy, Oliver OHalloran, Michael Neuling,
	Michael Ellerman, Anton Blanchard, linuxppc-dev, Nick Piggin
In-Reply-To: <20200714043624.5648-9-srikar@linux.vnet.ibm.com>

On Tue, Jul 14, 2020 at 10:06:21AM +0530, Srikar Dronamraju wrote:
> If allocated earlier and the search fails, then cpumask need to be
> freed. However cpu_l1_cache_map can be allocated after we search thread
> group.
> 
> Cc: linuxppc-dev <linuxppc-dev@lists.ozlabs.org>
> Cc: Michael Ellerman <michaele@au1.ibm.com>
> Cc: Nick Piggin <npiggin@au1.ibm.com>
> Cc: Oliver OHalloran <oliveroh@au1.ibm.com>
> Cc: Nathan Lynch <nathanl@linux.ibm.com>
> Cc: Michael Neuling <mikey@linux.ibm.com>
> Cc: Anton Blanchard <anton@au1.ibm.com>
> Cc: Gautham R Shenoy <ego@linux.vnet.ibm.com>
> Cc: Vaidyanathan Srinivasan <svaidy@linux.ibm.com>
> Signed-off-by: Srikar Dronamraju <srikar@linux.vnet.ibm.com>

Good fix.

Reviewed-by: Gautham R. Shenoy <ego@linux.vnet.ibm.com>

> ---
>  arch/powerpc/kernel/smp.c | 7 +++----
>  1 file changed, 3 insertions(+), 4 deletions(-)
> 
> diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
> index 96e47450d9b3..ef19eeccd21e 100644
> --- a/arch/powerpc/kernel/smp.c
> +++ b/arch/powerpc/kernel/smp.c
> @@ -797,10 +797,6 @@ static int init_cpu_l1_cache_map(int cpu)
>  	if (err)
>  		goto out;
> 
> -	zalloc_cpumask_var_node(&per_cpu(cpu_l1_cache_map, cpu),
> -				GFP_KERNEL,
> -				cpu_to_node(cpu));
> -
>  	cpu_group_start = get_cpu_thread_group_start(cpu, &tg);
> 
>  	if (unlikely(cpu_group_start == -1)) {
> @@ -809,6 +805,9 @@ static int init_cpu_l1_cache_map(int cpu)
>  		goto out;
>  	}
> 
> +	zalloc_cpumask_var_node(&per_cpu(cpu_l1_cache_map, cpu),
> +				GFP_KERNEL, cpu_to_node(cpu));
> +
>  	for (i = first_thread; i < first_thread + threads_per_core; i++) {
>  		int i_group_start = get_cpu_thread_group_start(i, &tg);
> 
> -- 
> 2.17.1
> 

^ permalink raw reply

* Re: [PATCH 07/11] Powerpc/numa: Detect support for coregroup
From: Gautham R Shenoy @ 2020-07-17  8:08 UTC (permalink / raw)
  To: Srikar Dronamraju
  Cc: Nathan Lynch, Gautham R Shenoy, Oliver OHalloran, Michael Neuling,
	Michael Ellerman, Anton Blanchard, linuxppc-dev, Nick Piggin
In-Reply-To: <20200714043624.5648-8-srikar@linux.vnet.ibm.com>

On Tue, Jul 14, 2020 at 10:06:20AM +0530, Srikar Dronamraju wrote:
> Add support for grouping cores based on the device-tree classification.
> - The last domain in the associativity domains always refers to the
> core.
> - If primary reference domain happens to be the penultimate domain in
> the associativity domains device-tree property, then there are no
> coregroups. However if its not a penultimate domain, then there are
> coregroups. There can be more than one coregroup. For now we would be
> interested in the last or the smallest coregroups.
> 
> Cc: linuxppc-dev <linuxppc-dev@lists.ozlabs.org>
> Cc: Michael Ellerman <michaele@au1.ibm.com>
> Cc: Nick Piggin <npiggin@au1.ibm.com>
> Cc: Oliver OHalloran <oliveroh@au1.ibm.com>
> Cc: Nathan Lynch <nathanl@linux.ibm.com>
> Cc: Michael Neuling <mikey@linux.ibm.com>
> Cc: Anton Blanchard <anton@au1.ibm.com>
> Cc: Gautham R Shenoy <ego@linux.vnet.ibm.com>
> Cc: Vaidyanathan Srinivasan <svaidy@linux.ibm.com>
> Signed-off-by: Srikar Dronamraju <srikar@linux.vnet.ibm.com>

This looks good to me from the discovery of the core-group point of
view.

Reviewed-by: Gautham R. Shenoy <ego@linux.vnet.ibm.com>

> ---
>  arch/powerpc/include/asm/smp.h |  1 +
>  arch/powerpc/kernel/smp.c      |  1 +
>  arch/powerpc/mm/numa.c         | 34 +++++++++++++++++++++-------------
>  3 files changed, 23 insertions(+), 13 deletions(-)
> 
> diff --git a/arch/powerpc/include/asm/smp.h b/arch/powerpc/include/asm/smp.h
> index 49a25e2400f2..5bdc17a7049f 100644
> --- a/arch/powerpc/include/asm/smp.h
> +++ b/arch/powerpc/include/asm/smp.h
> @@ -28,6 +28,7 @@
>  extern int boot_cpuid;
>  extern int spinning_secondaries;
>  extern u32 *cpu_to_phys_id;
> +extern bool coregroup_enabled;
> 
>  extern void cpu_die(void);
>  extern int cpu_to_chip_id(int cpu);
> diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
> index f8faf75135af..96e47450d9b3 100644
> --- a/arch/powerpc/kernel/smp.c
> +++ b/arch/powerpc/kernel/smp.c
> @@ -74,6 +74,7 @@ static DEFINE_PER_CPU(int, cpu_state) = { 0 };
> 
>  struct task_struct *secondary_current;
>  bool has_big_cores;
> +bool coregroup_enabled;
> 
>  DEFINE_PER_CPU(cpumask_var_t, cpu_sibling_map);
>  DEFINE_PER_CPU(cpumask_var_t, cpu_smallcore_map);
> diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
> index fc7b0505bdd8..a43eab455be4 100644
> --- a/arch/powerpc/mm/numa.c
> +++ b/arch/powerpc/mm/numa.c
> @@ -887,7 +887,9 @@ static void __init setup_node_data(int nid, u64 start_pfn, u64 end_pfn)
>  static void __init find_possible_nodes(void)
>  {
>  	struct device_node *rtas;
> -	u32 numnodes, i;
> +	const __be32 *domains;
> +	int prop_length, max_nodes;
> +	u32 i;
> 
>  	if (!numa_enabled)
>  		return;
> @@ -896,25 +898,31 @@ static void __init find_possible_nodes(void)
>  	if (!rtas)
>  		return;
> 
> -	if (of_property_read_u32_index(rtas, "ibm,current-associativity-domains",
> -				min_common_depth, &numnodes)) {
> -		/*
> -		 * ibm,current-associativity-domains is a fairly recent
> -		 * property. If it doesn't exist, then fallback on
> -		 * ibm,max-associativity-domains. Current denotes what the
> -		 * platform can support compared to max which denotes what the
> -		 * Hypervisor can support.
> -		 */
> -		if (of_property_read_u32_index(rtas, "ibm,max-associativity-domains",
> -				min_common_depth, &numnodes))
> +	/*
> +	 * ibm,current-associativity-domains is a fairly recent property. If
> +	 * it doesn't exist, then fallback on ibm,max-associativity-domains.
> +	 * Current denotes what the platform can support compared to max
> +	 * which denotes what the Hypervisor can support.
> +	 */
> +	domains = of_get_property(rtas, "ibm,current-associativity-domains",
> +					&prop_length);
> +	if (!domains) {
> +		domains = of_get_property(rtas, "ibm,max-associativity-domains",
> +					&prop_length);
> +		if (!domains)
>  			goto out;
>  	}
> 
> -	for (i = 0; i < numnodes; i++) {
> +	max_nodes = of_read_number(&domains[min_common_depth], 1);
> +	for (i = 0; i < max_nodes; i++) {
>  		if (!node_possible(i))
>  			node_set(i, node_possible_map);
>  	}
> 
> +	prop_length /= sizeof(int);
> +	if (prop_length > min_common_depth + 2)
> +		coregroup_enabled = 1;
> +
>  out:
>  	of_node_put(rtas);
>  }
> -- 
> 2.17.1
> 

^ permalink raw reply

* [v4 5/5] KVM: PPC: Book3S HV: migrate hot plugged memory
From: Ram Pai @ 2020-07-17  8:00 UTC (permalink / raw)
  To: kvm-ppc, linuxppc-dev
  Cc: ldufour, linuxram, cclaudio, bharata, sathnaga, aneesh.kumar,
	sukadev, bauerman, david
In-Reply-To: <1594972827-13928-1-git-send-email-linuxram@us.ibm.com>

From: Laurent Dufour <ldufour@linux.ibm.com>

When a memory slot is hot plugged to a SVM, PFNs associated with the
GFNs in that slot must be migrated to secure-PFNs, aka device-PFNs.

Call kvmppc_uv_migrate_mem_slot() to accomplish this.
Disable page-merge for all pages in the memory slot.

Signed-off-by: Ram Pai <linuxram@us.ibm.com>
[rearranged the code, and modified the commit log]
Signed-off-by: Laurent Dufour <ldufour@linux.ibm.com>
---
 arch/powerpc/include/asm/kvm_book3s_uvmem.h | 10 ++++++++++
 arch/powerpc/kvm/book3s_hv.c                | 10 ++--------
 arch/powerpc/kvm/book3s_hv_uvmem.c          | 22 ++++++++++++++++++++++
 3 files changed, 34 insertions(+), 8 deletions(-)

diff --git a/arch/powerpc/include/asm/kvm_book3s_uvmem.h b/arch/powerpc/include/asm/kvm_book3s_uvmem.h
index f229ab5..6f7da00 100644
--- a/arch/powerpc/include/asm/kvm_book3s_uvmem.h
+++ b/arch/powerpc/include/asm/kvm_book3s_uvmem.h
@@ -25,6 +25,9 @@ void kvmppc_uvmem_drop_pages(const struct kvm_memory_slot *free,
 			     struct kvm *kvm, bool skip_page_out);
 int kvmppc_uv_migrate_mem_slot(struct kvm *kvm,
 			const struct kvm_memory_slot *memslot);
+void kvmppc_memslot_create(struct kvm *kvm, const struct kvm_memory_slot *new);
+void kvmppc_memslot_delete(struct kvm *kvm, const struct kvm_memory_slot *old);
+
 #else
 static inline int kvmppc_uvmem_init(void)
 {
@@ -84,5 +87,12 @@ static inline int kvmppc_send_page_to_uv(struct kvm *kvm, unsigned long gfn)
 static inline void
 kvmppc_uvmem_drop_pages(const struct kvm_memory_slot *free,
 			struct kvm *kvm, bool skip_page_out) { }
+
+static inline void  kvmppc_memslot_create(struct kvm *kvm,
+		const struct kvm_memory_slot *new) { }
+
+static inline void  kvmppc_memslot_delete(struct kvm *kvm,
+		const struct kvm_memory_slot *old) { }
+
 #endif /* CONFIG_PPC_UV */
 #endif /* __ASM_KVM_BOOK3S_UVMEM_H__ */
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index d331b46..bf3be3b 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -4515,16 +4515,10 @@ static void kvmppc_core_commit_memory_region_hv(struct kvm *kvm,
 
 	switch (change) {
 	case KVM_MR_CREATE:
-		if (kvmppc_uvmem_slot_init(kvm, new))
-			return;
-		uv_register_mem_slot(kvm->arch.lpid,
-				     new->base_gfn << PAGE_SHIFT,
-				     new->npages * PAGE_SIZE,
-				     0, new->id);
+		kvmppc_memslot_create(kvm, new);
 		break;
 	case KVM_MR_DELETE:
-		uv_unregister_mem_slot(kvm->arch.lpid, old->id);
-		kvmppc_uvmem_slot_free(kvm, old);
+		kvmppc_memslot_delete(kvm, old);
 		break;
 	default:
 		/* TODO: Handle KVM_MR_MOVE */
diff --git a/arch/powerpc/kvm/book3s_hv_uvmem.c b/arch/powerpc/kvm/book3s_hv_uvmem.c
index a206984..a2b4d25 100644
--- a/arch/powerpc/kvm/book3s_hv_uvmem.c
+++ b/arch/powerpc/kvm/book3s_hv_uvmem.c
@@ -1089,6 +1089,28 @@ int kvmppc_send_page_to_uv(struct kvm *kvm, unsigned long gfn)
 	return (ret == U_SUCCESS) ? RESUME_GUEST : -EFAULT;
 }
 
+void kvmppc_memslot_create(struct kvm *kvm, const struct kvm_memory_slot *new)
+{
+	if (kvmppc_uvmem_slot_init(kvm, new))
+		return;
+
+	if (kvmppc_memslot_page_merge(kvm, new, false))
+		return;
+
+	if (uv_register_mem_slot(kvm->arch.lpid, new->base_gfn << PAGE_SHIFT,
+			new->npages * PAGE_SIZE, 0, new->id))
+		return;
+
+	kvmppc_uv_migrate_mem_slot(kvm, new);
+}
+
+void kvmppc_memslot_delete(struct kvm *kvm, const struct kvm_memory_slot *old)
+{
+	uv_unregister_mem_slot(kvm->arch.lpid, old->id);
+	kvmppc_memslot_page_merge(kvm, old, true);
+	kvmppc_uvmem_slot_free(kvm, old);
+}
+
 static u64 kvmppc_get_secmem_size(void)
 {
 	struct device_node *np;
-- 
1.8.3.1


^ permalink raw reply related

* [v4 4/5] KVM: PPC: Book3S HV: retry page migration before erroring-out
From: Ram Pai @ 2020-07-17  8:00 UTC (permalink / raw)
  To: kvm-ppc, linuxppc-dev
  Cc: ldufour, linuxram, cclaudio, bharata, sathnaga, aneesh.kumar,
	sukadev, bauerman, david
In-Reply-To: <1594972827-13928-1-git-send-email-linuxram@us.ibm.com>

The page requested for page-in; sometimes, can have transient
references, and hence cannot migrate immediately. Retry a few times
before returning error.

The same is true for non-migrated pages that are migrated in
H_SVM_INIT_DONE hanlder.  Retry a few times before returning error.

H_SVM_PAGE_IN interface is enhanced to return H_BUSY if the page is
not in a migratable state.

Cc: Paul Mackerras <paulus@ozlabs.org>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Bharata B Rao <bharata@linux.ibm.com>
Cc: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
Cc: Sukadev Bhattiprolu <sukadev@linux.vnet.ibm.com>
Cc: Laurent Dufour <ldufour@linux.ibm.com>
Cc: Thiago Jung Bauermann <bauerman@linux.ibm.com>
Cc: David Gibson <david@gibson.dropbear.id.au>
Cc: Claudio Carvalho <cclaudio@linux.ibm.com>
Cc: kvm-ppc@vger.kernel.org
Cc: linuxppc-dev@lists.ozlabs.org

Signed-off-by: Ram Pai <linuxram@us.ibm.com>
---
 Documentation/powerpc/ultravisor.rst |   1 +
 arch/powerpc/kvm/book3s_hv_uvmem.c   | 106 ++++++++++++++++++++++++-----------
 2 files changed, 74 insertions(+), 33 deletions(-)

diff --git a/Documentation/powerpc/ultravisor.rst b/Documentation/powerpc/ultravisor.rst
index ba6b1bf..fe533ad 100644
--- a/Documentation/powerpc/ultravisor.rst
+++ b/Documentation/powerpc/ultravisor.rst
@@ -1035,6 +1035,7 @@ Return values
 	* H_PARAMETER	if ``guest_pa`` is invalid.
 	* H_P2		if ``flags`` is invalid.
 	* H_P3		if ``order`` of page is invalid.
+	* H_BUSY	if ``page`` is not in a state to pagein
 
 Description
 ~~~~~~~~~~~
diff --git a/arch/powerpc/kvm/book3s_hv_uvmem.c b/arch/powerpc/kvm/book3s_hv_uvmem.c
index 3274663..a206984 100644
--- a/arch/powerpc/kvm/book3s_hv_uvmem.c
+++ b/arch/powerpc/kvm/book3s_hv_uvmem.c
@@ -672,7 +672,7 @@ static int kvmppc_svm_migrate_page(struct vm_area_struct *vma,
 		return ret;
 
 	if (!(*mig.src & MIGRATE_PFN_MIGRATE)) {
-		ret = -1;
+		ret = -2;
 		goto out_finalize;
 	}
 
@@ -700,43 +700,73 @@ static int kvmppc_svm_migrate_page(struct vm_area_struct *vma,
 	return ret;
 }
 
-int kvmppc_uv_migrate_mem_slot(struct kvm *kvm,
-		const struct kvm_memory_slot *memslot)
+/*
+ * return 1, if some page migration failed because of transient error,
+ * while the remaining pages migrated successfully.
+ * The caller can use this as a hint to retry.
+ *
+ * return 0 otherwise. *ret indicates the success status
+ * of this call.
+ */
+static bool __kvmppc_uv_migrate_mem_slot(struct kvm *kvm,
+		const struct kvm_memory_slot *memslot, int *ret)
 {
 	unsigned long gfn = memslot->base_gfn;
 	struct vm_area_struct *vma;
 	unsigned long start, end;
-	int ret = 0;
+	bool retry = false;
 
+	*ret = 0;
 	while (kvmppc_next_nontransitioned_gfn(memslot, kvm, &gfn)) {
 
 		mmap_read_lock(kvm->mm);
 		start = gfn_to_hva(kvm, gfn);
 		if (kvm_is_error_hva(start)) {
-			ret = H_STATE;
+			*ret = H_STATE;
 			goto next;
 		}
 
 		end = start + (1UL << PAGE_SHIFT);
 		vma = find_vma_intersection(kvm->mm, start, end);
 		if (!vma || vma->vm_start > start || vma->vm_end < end) {
-			ret = H_STATE;
+			*ret = H_STATE;
 			goto next;
 		}
 
 		mutex_lock(&kvm->arch.uvmem_lock);
-		ret = kvmppc_svm_migrate_page(vma, start, end,
+		*ret = kvmppc_svm_migrate_page(vma, start, end,
 				(gfn << PAGE_SHIFT), kvm, PAGE_SHIFT, false);
 		mutex_unlock(&kvm->arch.uvmem_lock);
-		if (ret)
-			ret = H_STATE;
 
 next:
 		mmap_read_unlock(kvm->mm);
+		if (*ret == -2) {
+			retry = true;
+			continue;
+		}
 
-		if (ret)
-			break;
+		if (*ret)
+			return false;
 	}
+	return retry;
+}
+
+#define REPEAT_COUNT 10
+
+int kvmppc_uv_migrate_mem_slot(struct kvm *kvm,
+		const struct kvm_memory_slot *memslot)
+{
+	int ret = 0, repeat_count = REPEAT_COUNT;
+
+	/*
+	 * try migration of pages in the memslot 'repeat_count' number of
+	 * times, provided each time migration fails because of transient
+	 * errors only.
+	 */
+	while (__kvmppc_uv_migrate_mem_slot(kvm, memslot, &ret) &&
+		repeat_count--)
+		;
+
 	return ret;
 }
 
@@ -812,7 +842,7 @@ unsigned long kvmppc_h_svm_page_in(struct kvm *kvm, unsigned long gpa,
 	struct vm_area_struct *vma;
 	int srcu_idx;
 	unsigned long gfn = gpa >> page_shift;
-	int ret;
+	int ret, repeat_count = REPEAT_COUNT;
 
 	if (!(kvm->arch.secure_guest & KVMPPC_SECURE_INIT_START))
 		return H_UNSUPPORTED;
@@ -826,34 +856,44 @@ unsigned long kvmppc_h_svm_page_in(struct kvm *kvm, unsigned long gpa,
 	if (flags & H_PAGE_IN_SHARED)
 		return kvmppc_share_page(kvm, gpa, page_shift);
 
-	ret = H_PARAMETER;
 	srcu_idx = srcu_read_lock(&kvm->srcu);
-	mmap_read_lock(kvm->mm);
 
-	start = gfn_to_hva(kvm, gfn);
-	if (kvm_is_error_hva(start))
-		goto out;
-
-	mutex_lock(&kvm->arch.uvmem_lock);
 	/* Fail the page-in request of an already paged-in page */
-	if (kvmppc_gfn_is_uvmem_pfn(gfn, kvm, NULL))
-		goto out_unlock;
+	mutex_lock(&kvm->arch.uvmem_lock);
+	ret = kvmppc_gfn_is_uvmem_pfn(gfn, kvm, NULL);
+	mutex_unlock(&kvm->arch.uvmem_lock);
+	if (ret) {
+		srcu_read_unlock(&kvm->srcu, srcu_idx);
+		return H_PARAMETER;
+	}
 
-	end = start + (1UL << page_shift);
-	vma = find_vma_intersection(kvm->mm, start, end);
-	if (!vma || vma->vm_start > start || vma->vm_end < end)
-		goto out_unlock;
+	do {
+		ret = H_PARAMETER;
+		mmap_read_lock(kvm->mm);
 
-	if (kvmppc_svm_migrate_page(vma, start, end, gpa, kvm, page_shift,
-				true))
-		goto out_unlock;
+		start = gfn_to_hva(kvm, gfn);
+		if (kvm_is_error_hva(start)) {
+			mmap_read_unlock(kvm->mm);
+			break;
+		}
 
-	ret = H_SUCCESS;
+		end = start + (1UL << page_shift);
+		vma = find_vma_intersection(kvm->mm, start, end);
+		if (!vma || vma->vm_start > start || vma->vm_end < end) {
+			mmap_read_unlock(kvm->mm);
+			break;
+		}
+
+		mutex_lock(&kvm->arch.uvmem_lock);
+		ret = kvmppc_svm_migrate_page(vma, start, end, gpa, kvm, page_shift, true);
+		mutex_unlock(&kvm->arch.uvmem_lock);
+
+		mmap_read_unlock(kvm->mm);
+	} while (ret == -2 && repeat_count--);
+
+	if (ret == -2)
+		ret = H_BUSY;
 
-out_unlock:
-	mutex_unlock(&kvm->arch.uvmem_lock);
-out:
-	mmap_read_unlock(kvm->mm);
 	srcu_read_unlock(&kvm->srcu, srcu_idx);
 	return ret;
 }
-- 
1.8.3.1


^ permalink raw reply related

* [v4 3/5] KVM: PPC: Book3S HV: in H_SVM_INIT_DONE, migrate remaining normal-GFNs to secure-GFNs.
From: Ram Pai @ 2020-07-17  8:00 UTC (permalink / raw)
  To: kvm-ppc, linuxppc-dev
  Cc: ldufour, linuxram, cclaudio, bharata, sathnaga, aneesh.kumar,
	sukadev, bauerman, david
In-Reply-To: <1594972827-13928-1-git-send-email-linuxram@us.ibm.com>

The Ultravisor is expected to explicitly call H_SVM_PAGE_IN for all the
pages of the SVM before calling H_SVM_INIT_DONE. This causes a huge
delay in tranistioning the VM to SVM. The Ultravisor is only interested
in the pages that contain the kernel, initrd and other important data
structures. The rest contain throw-away content.

However if not all pages are requested by the Ultravisor, the Hypervisor
continues to consider the GFNs corresponding to the non-requested pages
as normal GFNs. This can lead to data-corruption and undefined behavior.

In H_SVM_INIT_DONE handler, move all the PFNs associated with the SVM's
GFNs to secure-PFNs. Skip the GFNs that are already Paged-in or Shared
or Paged-in followed by a Paged-out.

Cc: Paul Mackerras <paulus@ozlabs.org>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Bharata B Rao <bharata@linux.ibm.com>
Cc: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
Cc: Sukadev Bhattiprolu <sukadev@linux.vnet.ibm.com>
Cc: Laurent Dufour <ldufour@linux.ibm.com>
Cc: Thiago Jung Bauermann <bauerman@linux.ibm.com>
Cc: David Gibson <david@gibson.dropbear.id.au>
Cc: Claudio Carvalho <cclaudio@linux.ibm.com>
Cc: kvm-ppc@vger.kernel.org
Cc: linuxppc-dev@lists.ozlabs.org
Signed-off-by: Ram Pai <linuxram@us.ibm.com>
---
 Documentation/powerpc/ultravisor.rst        |   2 +
 arch/powerpc/include/asm/kvm_book3s_uvmem.h |   2 +
 arch/powerpc/kvm/book3s_hv_uvmem.c          | 135 +++++++++++++++++++++++++---
 3 files changed, 125 insertions(+), 14 deletions(-)

diff --git a/Documentation/powerpc/ultravisor.rst b/Documentation/powerpc/ultravisor.rst
index a1c8c37..ba6b1bf 100644
--- a/Documentation/powerpc/ultravisor.rst
+++ b/Documentation/powerpc/ultravisor.rst
@@ -934,6 +934,8 @@ Return values
 	* H_UNSUPPORTED		if called from the wrong context (e.g.
 				from an SVM or before an H_SVM_INIT_START
 				hypercall).
+	* H_STATE		if the hypervisor could not successfully
+                                transition the VM to Secure VM.
 
 Description
 ~~~~~~~~~~~
diff --git a/arch/powerpc/include/asm/kvm_book3s_uvmem.h b/arch/powerpc/include/asm/kvm_book3s_uvmem.h
index 9cb7d8b..f229ab5 100644
--- a/arch/powerpc/include/asm/kvm_book3s_uvmem.h
+++ b/arch/powerpc/include/asm/kvm_book3s_uvmem.h
@@ -23,6 +23,8 @@ unsigned long kvmppc_h_svm_page_out(struct kvm *kvm,
 unsigned long kvmppc_h_svm_init_abort(struct kvm *kvm);
 void kvmppc_uvmem_drop_pages(const struct kvm_memory_slot *free,
 			     struct kvm *kvm, bool skip_page_out);
+int kvmppc_uv_migrate_mem_slot(struct kvm *kvm,
+			const struct kvm_memory_slot *memslot);
 #else
 static inline int kvmppc_uvmem_init(void)
 {
diff --git a/arch/powerpc/kvm/book3s_hv_uvmem.c b/arch/powerpc/kvm/book3s_hv_uvmem.c
index df2e272..3274663 100644
--- a/arch/powerpc/kvm/book3s_hv_uvmem.c
+++ b/arch/powerpc/kvm/book3s_hv_uvmem.c
@@ -93,6 +93,7 @@
 #include <asm/ultravisor.h>
 #include <asm/mman.h>
 #include <asm/kvm_ppc.h>
+#include <asm/kvm_book3s_uvmem.h>
 
 static struct dev_pagemap kvmppc_uvmem_pgmap;
 static unsigned long *kvmppc_uvmem_bitmap;
@@ -348,8 +349,45 @@ static bool kvmppc_gfn_is_uvmem_pfn(unsigned long gfn, struct kvm *kvm,
 	return false;
 }
 
+/*
+ * starting from *gfn search for the next available GFN that is not yet
+ * transitioned to a secure GFN.  return the value of that GFN in *gfn.  If a
+ * GFN is found, return true, else return false
+ */
+static bool kvmppc_next_nontransitioned_gfn(const struct kvm_memory_slot *memslot,
+		struct kvm *kvm, unsigned long *gfn)
+{
+	struct kvmppc_uvmem_slot *p;
+	bool ret = false;
+	unsigned long i;
+
+	mutex_lock(&kvm->arch.uvmem_lock);
+
+	list_for_each_entry(p, &kvm->arch.uvmem_pfns, list)
+		if (*gfn >= p->base_pfn && *gfn < p->base_pfn + p->nr_pfns)
+			break;
+	if (!p)
+		goto out;
+	/*
+	 * The code below assumes, one to one correspondence between
+	 * kvmppc_uvmem_slot and memslot.
+	 */
+	for (i = *gfn; i < p->base_pfn + p->nr_pfns; i++) {
+		unsigned long index = i - p->base_pfn;
+
+		if (!(p->pfns[index] & KVMPPC_GFN_FLAG_MASK)) {
+			*gfn = i;
+			ret = true;
+			break;
+		}
+	}
+out:
+	mutex_unlock(&kvm->arch.uvmem_lock);
+	return ret;
+}
+
 static int kvmppc_memslot_page_merge(struct kvm *kvm,
-		struct kvm_memory_slot *memslot, bool merge)
+		const struct kvm_memory_slot *memslot, bool merge)
 {
 	unsigned long gfn = memslot->base_gfn;
 	unsigned long end, start = gfn_to_hva(kvm, gfn);
@@ -461,12 +499,31 @@ unsigned long kvmppc_h_svm_init_start(struct kvm *kvm)
 
 unsigned long kvmppc_h_svm_init_done(struct kvm *kvm)
 {
+	struct kvm_memslots *slots;
+	struct kvm_memory_slot *memslot;
+	int srcu_idx;
+	long ret = H_SUCCESS;
+
 	if (!(kvm->arch.secure_guest & KVMPPC_SECURE_INIT_START))
 		return H_UNSUPPORTED;
 
+	/* migrate any unmoved normal pfn to device pfns*/
+	srcu_idx = srcu_read_lock(&kvm->srcu);
+	slots = kvm_memslots(kvm);
+	kvm_for_each_memslot(memslot, slots) {
+		ret = kvmppc_uv_migrate_mem_slot(kvm, memslot);
+		if (ret) {
+			ret = H_STATE;
+			goto out;
+		}
+	}
+
 	kvm->arch.secure_guest |= KVMPPC_SECURE_INIT_DONE;
 	pr_info("LPID %d went secure\n", kvm->arch.lpid);
-	return H_SUCCESS;
+
+out:
+	srcu_read_unlock(&kvm->srcu, srcu_idx);
+	return ret;
 }
 
 /*
@@ -587,12 +644,14 @@ static struct page *kvmppc_uvmem_get_page(unsigned long gpa, struct kvm *kvm)
 }
 
 /*
- * Alloc a PFN from private device memory pool and copy page from normal
- * memory to secure memory using UV_PAGE_IN uvcall.
+ * Alloc a PFN from private device memory pool. If @pagein is true,
+ * copy page from normal memory to secure memory using UV_PAGE_IN uvcall.
  */
-static int kvmppc_svm_page_in(struct vm_area_struct *vma, unsigned long start,
-		   unsigned long end, unsigned long gpa, struct kvm *kvm,
-		   unsigned long page_shift)
+static int kvmppc_svm_migrate_page(struct vm_area_struct *vma,
+		unsigned long start,
+		unsigned long end, unsigned long gpa, struct kvm *kvm,
+		unsigned long page_shift,
+		bool pagein)
 {
 	unsigned long src_pfn, dst_pfn = 0;
 	struct migrate_vma mig;
@@ -623,11 +682,16 @@ static int kvmppc_svm_page_in(struct vm_area_struct *vma, unsigned long start,
 		goto out_finalize;
 	}
 
-	pfn = *mig.src >> MIGRATE_PFN_SHIFT;
-	spage = migrate_pfn_to_page(*mig.src);
-	if (spage)
-		uv_page_in(kvm->arch.lpid, pfn << page_shift, gpa, 0,
-			   page_shift);
+	if (pagein) {
+		pfn = *mig.src >> MIGRATE_PFN_SHIFT;
+		spage = migrate_pfn_to_page(*mig.src);
+		if (spage) {
+			ret = uv_page_in(kvm->arch.lpid, pfn << page_shift,
+					gpa, 0, page_shift);
+			if (ret)
+				goto out_finalize;
+		}
+	}
 
 	*mig.dst = migrate_pfn(page_to_pfn(dpage)) | MIGRATE_PFN_LOCKED;
 	migrate_vma_pages(&mig);
@@ -636,6 +700,46 @@ static int kvmppc_svm_page_in(struct vm_area_struct *vma, unsigned long start,
 	return ret;
 }
 
+int kvmppc_uv_migrate_mem_slot(struct kvm *kvm,
+		const struct kvm_memory_slot *memslot)
+{
+	unsigned long gfn = memslot->base_gfn;
+	struct vm_area_struct *vma;
+	unsigned long start, end;
+	int ret = 0;
+
+	while (kvmppc_next_nontransitioned_gfn(memslot, kvm, &gfn)) {
+
+		mmap_read_lock(kvm->mm);
+		start = gfn_to_hva(kvm, gfn);
+		if (kvm_is_error_hva(start)) {
+			ret = H_STATE;
+			goto next;
+		}
+
+		end = start + (1UL << PAGE_SHIFT);
+		vma = find_vma_intersection(kvm->mm, start, end);
+		if (!vma || vma->vm_start > start || vma->vm_end < end) {
+			ret = H_STATE;
+			goto next;
+		}
+
+		mutex_lock(&kvm->arch.uvmem_lock);
+		ret = kvmppc_svm_migrate_page(vma, start, end,
+				(gfn << PAGE_SHIFT), kvm, PAGE_SHIFT, false);
+		mutex_unlock(&kvm->arch.uvmem_lock);
+		if (ret)
+			ret = H_STATE;
+
+next:
+		mmap_read_unlock(kvm->mm);
+
+		if (ret)
+			break;
+	}
+	return ret;
+}
+
 /*
  * Shares the page with HV, thus making it a normal page.
  *
@@ -740,8 +844,11 @@ unsigned long kvmppc_h_svm_page_in(struct kvm *kvm, unsigned long gpa,
 	if (!vma || vma->vm_start > start || vma->vm_end < end)
 		goto out_unlock;
 
-	if (!kvmppc_svm_page_in(vma, start, end, gpa, kvm, page_shift))
-		ret = H_SUCCESS;
+	if (kvmppc_svm_migrate_page(vma, start, end, gpa, kvm, page_shift,
+				true))
+		goto out_unlock;
+
+	ret = H_SUCCESS;
 
 out_unlock:
 	mutex_unlock(&kvm->arch.uvmem_lock);
-- 
1.8.3.1


^ permalink raw reply related

* [v4 1/5] KVM: PPC: Book3S HV: Disable page merging in H_SVM_INIT_START
From: Ram Pai @ 2020-07-17  8:00 UTC (permalink / raw)
  To: kvm-ppc, linuxppc-dev
  Cc: ldufour, linuxram, cclaudio, bharata, sathnaga, aneesh.kumar,
	sukadev, bauerman, david
In-Reply-To: <1594972827-13928-1-git-send-email-linuxram@us.ibm.com>

Page-merging of pages in memory-slots associated with a Secure VM,
is disabled in H_SVM_PAGE_IN handler.

This operation should have been done much earlier; the moment the VM
is initiated for secure-transition. Delaying this operation, increases
the probability for those pages to acquire new references , making it
impossible to migrate those pages.

Disable page-migration in H_SVM_INIT_START handling.

Signed-off-by: Ram Pai <linuxram@us.ibm.com>
---
 Documentation/powerpc/ultravisor.rst |  1 +
 arch/powerpc/kvm/book3s_hv_uvmem.c   | 98 +++++++++++++++++++++++++++---------
 2 files changed, 76 insertions(+), 23 deletions(-)

diff --git a/Documentation/powerpc/ultravisor.rst b/Documentation/powerpc/ultravisor.rst
index df136c8..a1c8c37 100644
--- a/Documentation/powerpc/ultravisor.rst
+++ b/Documentation/powerpc/ultravisor.rst
@@ -895,6 +895,7 @@ Return values
     One of the following values:
 
 	* H_SUCCESS	 on success.
+        * H_STATE        if the VM is not in a position to switch to secure.
 
 Description
 ~~~~~~~~~~~
diff --git a/arch/powerpc/kvm/book3s_hv_uvmem.c b/arch/powerpc/kvm/book3s_hv_uvmem.c
index e6f76bc..0baa293 100644
--- a/arch/powerpc/kvm/book3s_hv_uvmem.c
+++ b/arch/powerpc/kvm/book3s_hv_uvmem.c
@@ -211,6 +211,65 @@ static bool kvmppc_gfn_is_uvmem_pfn(unsigned long gfn, struct kvm *kvm,
 	return false;
 }
 
+static int kvmppc_memslot_page_merge(struct kvm *kvm,
+		struct kvm_memory_slot *memslot, bool merge)
+{
+	unsigned long gfn = memslot->base_gfn;
+	unsigned long end, start = gfn_to_hva(kvm, gfn);
+	int ret = 0;
+	struct vm_area_struct *vma;
+	int merge_flag = (merge) ? MADV_MERGEABLE : MADV_UNMERGEABLE;
+
+	if (kvm_is_error_hva(start))
+		return H_STATE;
+
+	end = start + (memslot->npages << PAGE_SHIFT);
+
+	mmap_write_lock(kvm->mm);
+	do {
+		vma = find_vma_intersection(kvm->mm, start, end);
+		if (!vma) {
+			ret = H_STATE;
+			break;
+		}
+		ret = ksm_madvise(vma, vma->vm_start, vma->vm_end,
+			  merge_flag, &vma->vm_flags);
+		if (ret) {
+			ret = H_STATE;
+			break;
+		}
+		start = vma->vm_end + 1;
+	} while (end > vma->vm_end);
+
+	mmap_write_unlock(kvm->mm);
+	return ret;
+}
+
+static int __kvmppc_page_merge(struct kvm *kvm, bool merge)
+{
+	struct kvm_memslots *slots;
+	struct kvm_memory_slot *memslot;
+	int ret = 0;
+
+	slots = kvm_memslots(kvm);
+	kvm_for_each_memslot(memslot, slots) {
+		ret = kvmppc_memslot_page_merge(kvm, memslot, merge);
+		if (ret)
+			break;
+	}
+	return ret;
+}
+
+static inline int kvmppc_disable_page_merge(struct kvm *kvm)
+{
+	return __kvmppc_page_merge(kvm, false);
+}
+
+static inline int kvmppc_enable_page_merge(struct kvm *kvm)
+{
+	return __kvmppc_page_merge(kvm, true);
+}
+
 unsigned long kvmppc_h_svm_init_start(struct kvm *kvm)
 {
 	struct kvm_memslots *slots;
@@ -232,11 +291,18 @@ unsigned long kvmppc_h_svm_init_start(struct kvm *kvm)
 		return H_AUTHORITY;
 
 	srcu_idx = srcu_read_lock(&kvm->srcu);
+
+	/* disable page-merging for all memslot */
+	ret = kvmppc_disable_page_merge(kvm);
+	if (ret)
+		goto out;
+
+	/* register the memslot */
 	slots = kvm_memslots(kvm);
 	kvm_for_each_memslot(memslot, slots) {
 		if (kvmppc_uvmem_slot_init(kvm, memslot)) {
 			ret = H_PARAMETER;
-			goto out;
+			break;
 		}
 		ret = uv_register_mem_slot(kvm->arch.lpid,
 					   memslot->base_gfn << PAGE_SHIFT,
@@ -245,9 +311,12 @@ unsigned long kvmppc_h_svm_init_start(struct kvm *kvm)
 		if (ret < 0) {
 			kvmppc_uvmem_slot_free(kvm, memslot);
 			ret = H_PARAMETER;
-			goto out;
+			break;
 		}
 	}
+
+	if (ret)
+		kvmppc_enable_page_merge(kvm);
 out:
 	srcu_read_unlock(&kvm->srcu, srcu_idx);
 	return ret;
@@ -384,7 +453,7 @@ static struct page *kvmppc_uvmem_get_page(unsigned long gpa, struct kvm *kvm)
  */
 static int kvmppc_svm_page_in(struct vm_area_struct *vma, unsigned long start,
 		   unsigned long end, unsigned long gpa, struct kvm *kvm,
-		   unsigned long page_shift, bool *downgrade)
+		   unsigned long page_shift)
 {
 	unsigned long src_pfn, dst_pfn = 0;
 	struct migrate_vma mig;
@@ -400,18 +469,6 @@ static int kvmppc_svm_page_in(struct vm_area_struct *vma, unsigned long start,
 	mig.src = &src_pfn;
 	mig.dst = &dst_pfn;
 
-	/*
-	 * We come here with mmap_lock write lock held just for
-	 * ksm_madvise(), otherwise we only need read mmap_lock.
-	 * Hence downgrade to read lock once ksm_madvise() is done.
-	 */
-	ret = ksm_madvise(vma, vma->vm_start, vma->vm_end,
-			  MADV_UNMERGEABLE, &vma->vm_flags);
-	mmap_write_downgrade(kvm->mm);
-	*downgrade = true;
-	if (ret)
-		return ret;
-
 	ret = migrate_vma_setup(&mig);
 	if (ret)
 		return ret;
@@ -503,7 +560,6 @@ unsigned long kvmppc_h_svm_page_in(struct kvm *kvm, unsigned long gpa,
 		unsigned long flags,
 		unsigned long page_shift)
 {
-	bool downgrade = false;
 	unsigned long start, end;
 	struct vm_area_struct *vma;
 	int srcu_idx;
@@ -524,7 +580,7 @@ unsigned long kvmppc_h_svm_page_in(struct kvm *kvm, unsigned long gpa,
 
 	ret = H_PARAMETER;
 	srcu_idx = srcu_read_lock(&kvm->srcu);
-	mmap_write_lock(kvm->mm);
+	mmap_read_lock(kvm->mm);
 
 	start = gfn_to_hva(kvm, gfn);
 	if (kvm_is_error_hva(start))
@@ -540,16 +596,12 @@ unsigned long kvmppc_h_svm_page_in(struct kvm *kvm, unsigned long gpa,
 	if (!vma || vma->vm_start > start || vma->vm_end < end)
 		goto out_unlock;
 
-	if (!kvmppc_svm_page_in(vma, start, end, gpa, kvm, page_shift,
-				&downgrade))
+	if (!kvmppc_svm_page_in(vma, start, end, gpa, kvm, page_shift))
 		ret = H_SUCCESS;
 out_unlock:
 	mutex_unlock(&kvm->arch.uvmem_lock);
 out:
-	if (downgrade)
-		mmap_read_unlock(kvm->mm);
-	else
-		mmap_write_unlock(kvm->mm);
+	mmap_read_unlock(kvm->mm);
 	srcu_read_unlock(&kvm->srcu, srcu_idx);
 	return ret;
 }
-- 
1.8.3.1


^ permalink raw reply related

* [v4 2/5] KVM: PPC: Book3S HV: track the state GFNs associated with secure VMs
From: Ram Pai @ 2020-07-17  8:00 UTC (permalink / raw)
  To: kvm-ppc, linuxppc-dev
  Cc: ldufour, linuxram, cclaudio, bharata, sathnaga, aneesh.kumar,
	sukadev, bauerman, david
In-Reply-To: <1594972827-13928-1-git-send-email-linuxram@us.ibm.com>

During the life of SVM, its GFNs transition through normal, secure and
shared states. Since the kernel does not track GFNs that are shared, it
is not possible to disambiguate a shared GFN from a GFN whose PFN has
not yet been migrated to a secure-PFN. Also it is not possible to
disambiguate a secure-GFN from a GFN whose GFN has been pagedout from
the ultravisor.

The ability to identify the state of a GFN is needed to skip migration
of its PFN to secure-PFN during ESM transition.

The code is re-organized to track the states of a GFN as explained
below.

************************************************************************
 1. States of a GFN
    ---------------
 The GFN can be in one of the following states.

 (a) Secure - The GFN is secure. The GFN is associated with
 	a Secure VM, the contents of the GFN is not accessible
 	to the Hypervisor.  This GFN can be backed by a secure-PFN,
 	or can be backed by a normal-PFN with contents encrypted.
 	The former is true when the GFN is paged-in into the
 	ultravisor. The latter is true when the GFN is paged-out
 	of the ultravisor.

 (b) Shared - The GFN is shared. The GFN is associated with a
 	a secure VM. The contents of the GFN is accessible to
 	Hypervisor. This GFN is backed by a normal-PFN and its
 	content is un-encrypted.

 (c) Normal - The GFN is a normal. The GFN is associated with
 	a normal VM. The contents of the GFN is accesible to
 	the Hypervisor. Its content is never encrypted.

 2. States of a VM.
    ---------------

 (a) Normal VM:  A VM whose contents are always accessible to
 	the hypervisor.  All its GFNs are normal-GFNs.

 (b) Secure VM: A VM whose contents are not accessible to the
 	hypervisor without the VM's consent.  Its GFNs are
 	either Shared-GFN or Secure-GFNs.

 (c) Transient VM: A Normal VM that is transitioning to secure VM.
 	The transition starts on successful return of
 	H_SVM_INIT_START, and ends on successful return
 	of H_SVM_INIT_DONE. This transient VM, can have GFNs
 	in any of the three states; i.e Secure-GFN, Shared-GFN,
 	and Normal-GFN.	The VM never executes in this state
 	in supervisor-mode.

 3. Memory slot State.
    ------------------
  	The state of a memory slot mirrors the state of the
  	VM the memory slot is associated with.

 4. VM State transition.
    --------------------

  A VM always starts in Normal Mode.

  H_SVM_INIT_START moves the VM into transient state. During this
  time the Ultravisor may request some of its GFNs to be shared or
  secured. So its GFNs can be in one of the three GFN states.

  H_SVM_INIT_DONE moves the VM entirely from transient state to
  secure-state. At this point any left-over normal-GFNs are
  transitioned to Secure-GFN.

  H_SVM_INIT_ABORT moves the transient VM back to normal VM.
  All its GFNs are moved to Normal-GFNs.

  UV_TERMINATE transitions the secure-VM back to normal-VM. All
  the secure-GFN and shared-GFNs are tranistioned to normal-GFN
  Note: The contents of the normal-GFN is undefined at this point.

 5. GFN state implementation:
    -------------------------

 Secure GFN is associated with a secure-PFN; also called uvmem_pfn,
 when the GFN is paged-in. Its pfn[] has KVMPPC_GFN_UVMEM_PFN flag
 set, and contains the value of the secure-PFN.
 It is associated with a normal-PFN; also called mem_pfn, when
 the GFN is pagedout. Its pfn[] has KVMPPC_GFN_MEM_PFN flag set.
 The value of the normal-PFN is not tracked.

 Shared GFN is associated with a normal-PFN. Its pfn[] has
 KVMPPC_UVMEM_SHARED_PFN flag set. The value of the normal-PFN
 is not tracked.

 Normal GFN is associated with normal-PFN. Its pfn[] has
 no flag set. The value of the normal-PFN is not tracked.

 6. Life cycle of a GFN
    --------------------
 --------------------------------------------------------------
 |        |     Share  |  Unshare | SVM       |H_SVM_INIT_DONE|
 |        |operation   |operation | abort/    |               |
 |        |            |          | terminate |               |
 -------------------------------------------------------------
 |        |            |          |           |               |
 | Secure |     Shared | Secure   |Normal     |Secure         |
 |        |            |          |           |               |
 | Shared |     Shared | Secure   |Normal     |Shared         |
 |        |            |          |           |               |
 | Normal |     Shared | Secure   |Normal     |Secure         |
 --------------------------------------------------------------

 7. Life cycle of a VM
    --------------------
 --------------------------------------------------------------------
 |         |  start    |  H_SVM_  |H_SVM_   |H_SVM_     |UV_SVM_    |
 |         |  VM       |INIT_START|INIT_DONE|INIT_ABORT |TERMINATE  |
 |         |           |          |         |           |           |
 --------- ----------------------------------------------------------
 |         |           |          |         |           |           |
 | Normal  | Normal    | Transient|Error    |Error      |Normal     |
 |         |           |          |         |           |           |
 | Secure  |   Error   | Error    |Error    |Error      |Normal     |
 |         |           |          |         |           |           |
 |Transient|   N/A     | Error    |Secure   |Normal     |Normal     |
 --------------------------------------------------------------------

************************************************************************

Cc: Paul Mackerras <paulus@ozlabs.org>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Bharata B Rao <bharata@linux.ibm.com>
Cc: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
Cc: Sukadev Bhattiprolu <sukadev@linux.vnet.ibm.com>
Cc: Laurent Dufour <ldufour@linux.ibm.com>
Cc: Thiago Jung Bauermann <bauerman@linux.ibm.com>
Cc: David Gibson <david@gibson.dropbear.id.au>
Cc: Claudio Carvalho <cclaudio@linux.ibm.com>
Cc: kvm-ppc@vger.kernel.org
Cc: linuxppc-dev@lists.ozlabs.org
Reviewed-by: Thiago Jung Bauermann <bauerman@linux.ibm.com>
Signed-off-by: Ram Pai <linuxram@us.ibm.com>
---
 arch/powerpc/kvm/book3s_hv_uvmem.c | 187 +++++++++++++++++++++++++++++++++----
 1 file changed, 168 insertions(+), 19 deletions(-)

diff --git a/arch/powerpc/kvm/book3s_hv_uvmem.c b/arch/powerpc/kvm/book3s_hv_uvmem.c
index 0baa293..df2e272 100644
--- a/arch/powerpc/kvm/book3s_hv_uvmem.c
+++ b/arch/powerpc/kvm/book3s_hv_uvmem.c
@@ -98,7 +98,127 @@
 static unsigned long *kvmppc_uvmem_bitmap;
 static DEFINE_SPINLOCK(kvmppc_uvmem_bitmap_lock);
 
-#define KVMPPC_UVMEM_PFN	(1UL << 63)
+/*
+ * States of a GFN
+ * ---------------
+ * The GFN can be in one of the following states.
+ *
+ * (a) Secure - The GFN is secure. The GFN is associated with
+ *	a Secure VM, the contents of the GFN is not accessible
+ *	to the Hypervisor.  This GFN can be backed by a secure-PFN,
+ *	or can be backed by a normal-PFN with contents encrypted.
+ *	The former is true when the GFN is paged-in into the
+ *	ultravisor. The latter is true when the GFN is paged-out
+ *	of the ultravisor.
+ *
+ * (b) Shared - The GFN is shared. The GFN is associated with a
+ *	a secure VM. The contents of the GFN is accessible to
+ *	Hypervisor. This GFN is backed by a normal-PFN and its
+ *	content is un-encrypted.
+ *
+ * (c) Normal - The GFN is a normal. The GFN is associated with
+ *	a normal VM. The contents of the GFN is accesible to
+ *	the Hypervisor. Its content is never encrypted.
+ *
+ * States of a VM.
+ * ---------------
+ *
+ * Normal VM:  A VM whose contents are always accessible to
+ *	the hypervisor.  All its GFNs are normal-GFNs.
+ *
+ * Secure VM: A VM whose contents are not accessible to the
+ *	hypervisor without the VM's consent.  Its GFNs are
+ *	either Shared-GFN or Secure-GFNs.
+ *
+ * Transient VM: A Normal VM that is transitioning to secure VM.
+ *	The transition starts on successful return of
+ *	H_SVM_INIT_START, and ends on successful return
+ *	of H_SVM_INIT_DONE. This transient VM, can have GFNs
+ *	in any of the three states; i.e Secure-GFN, Shared-GFN,
+ *	and Normal-GFN.	The VM never executes in this state
+ *	in supervisor-mode.
+ *
+ * Memory slot State.
+ * -----------------------------
+ *	The state of a memory slot mirrors the state of the
+ *	VM the memory slot is associated with.
+ *
+ * VM State transition.
+ * --------------------
+ *
+ *  A VM always starts in Normal Mode.
+ *
+ *  H_SVM_INIT_START moves the VM into transient state. During this
+ *  time the Ultravisor may request some of its GFNs to be shared or
+ *  secured. So its GFNs can be in one of the three GFN states.
+ *
+ *  H_SVM_INIT_DONE moves the VM entirely from transient state to
+ *  secure-state. At this point any left-over normal-GFNs are
+ *  transitioned to Secure-GFN.
+ *
+ *  H_SVM_INIT_ABORT moves the transient VM back to normal VM.
+ *  All its GFNs are moved to Normal-GFNs.
+ *
+ *  UV_TERMINATE transitions the secure-VM back to normal-VM. All
+ *  the secure-GFN and shared-GFNs are tranistioned to normal-GFN
+ *  Note: The contents of the normal-GFN is undefined at this point.
+ *
+ * GFN state implementation:
+ * -------------------------
+ *
+ * Secure GFN is associated with a secure-PFN; also called uvmem_pfn,
+ * when the GFN is paged-in. Its pfn[] has KVMPPC_GFN_UVMEM_PFN flag
+ * set, and contains the value of the secure-PFN.
+ * It is associated with a normal-PFN; also called mem_pfn, when
+ * the GFN is pagedout. Its pfn[] has KVMPPC_GFN_MEM_PFN flag set.
+ * The value of the normal-PFN is not tracked.
+ *
+ * Shared GFN is associated with a normal-PFN. Its pfn[] has
+ * KVMPPC_UVMEM_SHARED_PFN flag set. The value of the normal-PFN
+ * is not tracked.
+ *
+ * Normal GFN is associated with normal-PFN. Its pfn[] has
+ * no flag set. The value of the normal-PFN is not tracked.
+ *
+ * Life cycle of a GFN
+ * --------------------
+ *
+ * --------------------------------------------------------------
+ * |        |     Share  |  Unshare | SVM       |H_SVM_INIT_DONE|
+ * |        |operation   |operation | abort/    |               |
+ * |        |            |          | terminate |               |
+ * -------------------------------------------------------------
+ * |        |            |          |           |               |
+ * | Secure |     Shared | Secure   |Normal     |Secure         |
+ * |        |            |          |           |               |
+ * | Shared |     Shared | Secure   |Normal     |Shared         |
+ * |        |            |          |           |               |
+ * | Normal |     Shared | Secure   |Normal     |Secure         |
+ * --------------------------------------------------------------
+ *
+ * Life cycle of a VM
+ * --------------------
+ *
+ * --------------------------------------------------------------------
+ * |         |  start    |  H_SVM_  |H_SVM_   |H_SVM_     |UV_SVM_    |
+ * |         |  VM       |INIT_START|INIT_DONE|INIT_ABORT |TERMINATE  |
+ * |         |           |          |         |           |           |
+ * --------- ----------------------------------------------------------
+ * |         |           |          |         |           |           |
+ * | Normal  | Normal    | Transient|Error    |Error      |Normal     |
+ * |         |           |          |         |           |           |
+ * | Secure  |   Error   | Error    |Error    |Error      |Normal     |
+ * |         |           |          |         |           |           |
+ * |Transient|   N/A     | Error    |Secure   |Normal     |Normal     |
+ * --------------------------------------------------------------------
+ */
+
+#define KVMPPC_GFN_UVMEM_PFN	(1UL << 63)
+#define KVMPPC_GFN_MEM_PFN	(1UL << 62)
+#define KVMPPC_GFN_SHARED	(1UL << 61)
+#define KVMPPC_GFN_SECURE	(KVMPPC_GFN_UVMEM_PFN | KVMPPC_GFN_MEM_PFN)
+#define KVMPPC_GFN_FLAG_MASK	(KVMPPC_GFN_SECURE | KVMPPC_GFN_SHARED)
+#define KVMPPC_GFN_PFN_MASK	(~KVMPPC_GFN_FLAG_MASK)
 
 struct kvmppc_uvmem_slot {
 	struct list_head list;
@@ -106,11 +226,11 @@ struct kvmppc_uvmem_slot {
 	unsigned long base_pfn;
 	unsigned long *pfns;
 };
-
 struct kvmppc_uvmem_page_pvt {
 	struct kvm *kvm;
 	unsigned long gpa;
 	bool skip_page_out;
+	bool remove_gfn;
 };
 
 bool kvmppc_uvmem_available(void)
@@ -163,8 +283,8 @@ void kvmppc_uvmem_slot_free(struct kvm *kvm, const struct kvm_memory_slot *slot)
 	mutex_unlock(&kvm->arch.uvmem_lock);
 }
 
-static void kvmppc_uvmem_pfn_insert(unsigned long gfn, unsigned long uvmem_pfn,
-				    struct kvm *kvm)
+static void kvmppc_mark_gfn(unsigned long gfn, struct kvm *kvm,
+			unsigned long flag, unsigned long uvmem_pfn)
 {
 	struct kvmppc_uvmem_slot *p;
 
@@ -172,24 +292,41 @@ static void kvmppc_uvmem_pfn_insert(unsigned long gfn, unsigned long uvmem_pfn,
 		if (gfn >= p->base_pfn && gfn < p->base_pfn + p->nr_pfns) {
 			unsigned long index = gfn - p->base_pfn;
 
-			p->pfns[index] = uvmem_pfn | KVMPPC_UVMEM_PFN;
+			if (flag == KVMPPC_GFN_UVMEM_PFN)
+				p->pfns[index] = uvmem_pfn | flag;
+			else
+				p->pfns[index] = flag;
 			return;
 		}
 	}
 }
 
-static void kvmppc_uvmem_pfn_remove(unsigned long gfn, struct kvm *kvm)
+/* mark the GFN as secure-GFN associated with @uvmem pfn device-PFN. */
+static void kvmppc_gfn_secure_uvmem_pfn(unsigned long gfn,
+			unsigned long uvmem_pfn, struct kvm *kvm)
 {
-	struct kvmppc_uvmem_slot *p;
+	kvmppc_mark_gfn(gfn, kvm, KVMPPC_GFN_UVMEM_PFN, uvmem_pfn);
+}
 
-	list_for_each_entry(p, &kvm->arch.uvmem_pfns, list) {
-		if (gfn >= p->base_pfn && gfn < p->base_pfn + p->nr_pfns) {
-			p->pfns[gfn - p->base_pfn] = 0;
-			return;
-		}
-	}
+/* mark the GFN as secure-GFN associated with a memory-PFN. */
+static void kvmppc_gfn_secure_mem_pfn(unsigned long gfn, struct kvm *kvm)
+{
+	kvmppc_mark_gfn(gfn, kvm, KVMPPC_GFN_MEM_PFN, 0);
+}
+
+/* mark the GFN as a shared GFN. */
+static void kvmppc_gfn_shared(unsigned long gfn, struct kvm *kvm)
+{
+	kvmppc_mark_gfn(gfn, kvm, KVMPPC_GFN_SHARED, 0);
+}
+
+/* mark the GFN as a non-existent GFN. */
+static void kvmppc_gfn_remove(unsigned long gfn, struct kvm *kvm)
+{
+	kvmppc_mark_gfn(gfn, kvm, 0, 0);
 }
 
+/* return true, if the GFN is a secure-GFN backed by a secure-PFN */
 static bool kvmppc_gfn_is_uvmem_pfn(unsigned long gfn, struct kvm *kvm,
 				    unsigned long *uvmem_pfn)
 {
@@ -199,10 +336,10 @@ static bool kvmppc_gfn_is_uvmem_pfn(unsigned long gfn, struct kvm *kvm,
 		if (gfn >= p->base_pfn && gfn < p->base_pfn + p->nr_pfns) {
 			unsigned long index = gfn - p->base_pfn;
 
-			if (p->pfns[index] & KVMPPC_UVMEM_PFN) {
+			if (p->pfns[index] & KVMPPC_GFN_UVMEM_PFN) {
 				if (uvmem_pfn)
 					*uvmem_pfn = p->pfns[index] &
-						     ~KVMPPC_UVMEM_PFN;
+						     KVMPPC_GFN_PFN_MASK;
 				return true;
 			} else
 				return false;
@@ -353,6 +490,7 @@ void kvmppc_uvmem_drop_pages(const struct kvm_memory_slot *free,
 
 		mutex_lock(&kvm->arch.uvmem_lock);
 		if (!kvmppc_gfn_is_uvmem_pfn(gfn, kvm, &uvmem_pfn)) {
+			kvmppc_gfn_remove(gfn, kvm);
 			mutex_unlock(&kvm->arch.uvmem_lock);
 			continue;
 		}
@@ -360,6 +498,7 @@ void kvmppc_uvmem_drop_pages(const struct kvm_memory_slot *free,
 		uvmem_page = pfn_to_page(uvmem_pfn);
 		pvt = uvmem_page->zone_device_data;
 		pvt->skip_page_out = skip_page_out;
+		pvt->remove_gfn = true;
 		mutex_unlock(&kvm->arch.uvmem_lock);
 
 		pfn = gfn_to_pfn(kvm, gfn);
@@ -429,7 +568,7 @@ static struct page *kvmppc_uvmem_get_page(unsigned long gpa, struct kvm *kvm)
 		goto out_clear;
 
 	uvmem_pfn = bit + pfn_first;
-	kvmppc_uvmem_pfn_insert(gpa >> PAGE_SHIFT, uvmem_pfn, kvm);
+	kvmppc_gfn_secure_uvmem_pfn(gpa >> PAGE_SHIFT, uvmem_pfn, kvm);
 
 	pvt->gpa = gpa;
 	pvt->kvm = kvm;
@@ -524,6 +663,7 @@ static unsigned long kvmppc_share_page(struct kvm *kvm, unsigned long gpa,
 		uvmem_page = pfn_to_page(uvmem_pfn);
 		pvt = uvmem_page->zone_device_data;
 		pvt->skip_page_out = true;
+		pvt->remove_gfn = false;
 	}
 
 retry:
@@ -537,12 +677,16 @@ static unsigned long kvmppc_share_page(struct kvm *kvm, unsigned long gpa,
 		uvmem_page = pfn_to_page(uvmem_pfn);
 		pvt = uvmem_page->zone_device_data;
 		pvt->skip_page_out = true;
+		pvt->remove_gfn = false;
 		kvm_release_pfn_clean(pfn);
 		goto retry;
 	}
 
-	if (!uv_page_in(kvm->arch.lpid, pfn << page_shift, gpa, 0, page_shift))
+	if (!uv_page_in(kvm->arch.lpid, pfn << page_shift, gpa, 0,
+				page_shift)) {
+		kvmppc_gfn_shared(gfn, kvm);
 		ret = H_SUCCESS;
+	}
 	kvm_release_pfn_clean(pfn);
 	mutex_unlock(&kvm->arch.uvmem_lock);
 out:
@@ -598,6 +742,7 @@ unsigned long kvmppc_h_svm_page_in(struct kvm *kvm, unsigned long gpa,
 
 	if (!kvmppc_svm_page_in(vma, start, end, gpa, kvm, page_shift))
 		ret = H_SUCCESS;
+
 out_unlock:
 	mutex_unlock(&kvm->arch.uvmem_lock);
 out:
@@ -706,7 +851,8 @@ static vm_fault_t kvmppc_uvmem_migrate_to_ram(struct vm_fault *vmf)
 /*
  * Release the device PFN back to the pool
  *
- * Gets called when secure page becomes a normal page during H_SVM_PAGE_OUT.
+ * Gets called when secure GFN tranistions from a secure-PFN
+ * to a normal PFN during H_SVM_PAGE_OUT.
  * Gets called with kvm->arch.uvmem_lock held.
  */
 static void kvmppc_uvmem_page_free(struct page *page)
@@ -721,7 +867,10 @@ static void kvmppc_uvmem_page_free(struct page *page)
 
 	pvt = page->zone_device_data;
 	page->zone_device_data = NULL;
-	kvmppc_uvmem_pfn_remove(pvt->gpa >> PAGE_SHIFT, pvt->kvm);
+	if (pvt->remove_gfn)
+		kvmppc_gfn_remove(pvt->gpa >> PAGE_SHIFT, pvt->kvm);
+	else
+		kvmppc_gfn_secure_mem_pfn(pvt->gpa >> PAGE_SHIFT, pvt->kvm);
 	kfree(pvt);
 }
 
-- 
1.8.3.1


^ permalink raw reply related

* [v4 0/5] Migrate non-migrated pages of a SVM.
From: Ram Pai @ 2020-07-17  8:00 UTC (permalink / raw)
  To: kvm-ppc, linuxppc-dev
  Cc: ldufour, linuxram, cclaudio, bharata, sathnaga, aneesh.kumar,
	sukadev, bauerman, david

The time to switch a VM to Secure-VM, increases by the size of the VM.
A 100GB VM takes about 7minutes. This is unacceptable.  This linear
increase is caused by a suboptimal behavior by the Ultravisor and the
Hypervisor.  The Ultravisor unnecessarily migrates all the GFN of the
VM from normal-memory to secure-memory. It has to just migrate the
necessary and sufficient GFNs.

However when the optimization is incorporated in the Ultravisor, the
Hypervisor starts misbehaving. The Hypervisor has a inbuilt assumption
that the Ultravisor will explicitly request to migrate, each and every
GFN of the VM. If only necessary and sufficient GFNs are requested for
migration, the Hypervisor continues to manage the remaining GFNs as
normal GFNs. This leads to memory corruption; manifested
consistently when the SVM reboots.

The same is true, when a memory slot is hotplugged into a SVM. The
Hypervisor expects the ultravisor to request migration of all GFNs to
secure-GFN.  But the hypervisor cannot handle any H_SVM_PAGE_IN
requests from the Ultravisor, done in the context of
UV_REGISTER_MEM_SLOT ucall.  This problem manifests as random errors
in the SVM, when a memory-slot is hotplugged.

This patch series automatically migrates the non-migrated pages of a
SVM, and thus solves the problem.

Testing: Passed rigorous testing using various sized SVMs.

Changelog:

v4:  .  Incorported Bharata's comments:
	- Optimization -- replace write mmap semaphore with read mmap semphore.
	- disable page-merge during memory hotplug.
	- rearranged the patches. consolidated the page-migration-retry logic
		in a single patch.

v3: . Optimized the page-migration retry-logic. 
    . Relax and relinquish the cpu regularly while bulk migrating
    	the non-migrated pages. This issue was causing soft-lockups.
	Fixed it.
    . Added a new patch, to retry page-migration a couple of times
    	before returning H_BUSY in H_SVM_PAGE_IN. This issue was
	seen a few times in a 24hour continuous reboot test of the SVMs.

v2: . fixed a bug observed by Laurent. The state of the GFN's associated
	with Secure-VMs were not reset during memslot flush.
    . Re-organized the code, for easier review.
    . Better description of the patch series.

v1: fixed a bug observed by Bharata. Pages that where paged-in and later
paged-out must also be skipped from migration during H_SVM_INIT_DONE.


Laurent Dufour (1):
  KVM: PPC: Book3S HV: migrate hot plugged memory

Ram Pai (4):
  KVM: PPC: Book3S HV: Disable page merging in H_SVM_INIT_START
  KVM: PPC: Book3S HV: track the state GFNs associated with secure VMs
  KVM: PPC: Book3S HV: in H_SVM_INIT_DONE, migrate remaining normal-GFNs
    to secure-GFNs.
  KVM: PPC: Book3S HV: retry page migration before erroring-out

 Documentation/powerpc/ultravisor.rst        |   4 +
 arch/powerpc/include/asm/kvm_book3s_uvmem.h |  12 +
 arch/powerpc/kvm/book3s_hv.c                |  10 +-
 arch/powerpc/kvm/book3s_hv_uvmem.c          | 508 ++++++++++++++++++++++++----
 4 files changed, 457 insertions(+), 77 deletions(-)

-- 
1.8.3.1


^ permalink raw reply

* Re: [PATCH 06/11] powerpc/smp: Generalize 2nd sched domain
From: Gautham R Shenoy @ 2020-07-17  6:37 UTC (permalink / raw)
  To: Srikar Dronamraju
  Cc: Nathan Lynch, Gautham R Shenoy, Oliver OHalloran, Michael Neuling,
	Michael Ellerman, Anton Blanchard, linuxppc-dev, Nick Piggin
In-Reply-To: <20200714043624.5648-7-srikar@linux.vnet.ibm.com>

On Tue, Jul 14, 2020 at 10:06:19AM +0530, Srikar Dronamraju wrote:
> Currently "CACHE" domain happens to be the 2nd sched domain as per
> powerpc_topology. This domain will collapse if cpumask of l2-cache is
> same as SMT domain. However we could generalize this domain such that it
> could mean either be a "CACHE" domain or a "BIGCORE" domain.
> 
> While setting up the "CACHE" domain, check if shared_cache is already
> set.
> 
> Cc: linuxppc-dev <linuxppc-dev@lists.ozlabs.org>
> Cc: Michael Ellerman <michaele@au1.ibm.com>
> Cc: Nick Piggin <npiggin@au1.ibm.com>
> Cc: Oliver OHalloran <oliveroh@au1.ibm.com>
> Cc: Nathan Lynch <nathanl@linux.ibm.com>
> Cc: Michael Neuling <mikey@linux.ibm.com>
> Cc: Anton Blanchard <anton@au1.ibm.com>
> Cc: Gautham R Shenoy <ego@linux.vnet.ibm.com>
> Cc: Vaidyanathan Srinivasan <svaidy@linux.ibm.com>
> Signed-off-by: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
> ---
>  arch/powerpc/kernel/smp.c | 48 +++++++++++++++++++++++++++------------
>  1 file changed, 34 insertions(+), 14 deletions(-)
> 
> diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
> index 875f57e41355..f8faf75135af 100644
> --- a/arch/powerpc/kernel/smp.c
> +++ b/arch/powerpc/kernel/smp.c
> @@ -85,6 +85,14 @@ EXPORT_PER_CPU_SYMBOL(cpu_l2_cache_map);
>  EXPORT_PER_CPU_SYMBOL(cpu_core_map);
>  EXPORT_SYMBOL_GPL(has_big_cores);
> 
> +enum {
> +#ifdef CONFIG_SCHED_SMT
> +	smt_idx,
> +#endif
> +	bigcore_idx,
> +	die_idx,
> +};
> +
>  #define MAX_THREAD_LIST_SIZE	8
>  #define THREAD_GROUP_SHARE_L1   1
>  struct thread_groups {
> @@ -851,13 +859,7 @@ static int powerpc_shared_cache_flags(void)
>   */
>  static const struct cpumask *shared_cache_mask(int cpu)
>  {
> -	if (shared_caches)
> -		return cpu_l2_cache_mask(cpu);
> -
> -	if (has_big_cores)
> -		return cpu_smallcore_mask(cpu);
> -
> -	return cpu_smt_mask(cpu);
> +	return per_cpu(cpu_l2_cache_map, cpu);
>  }
> 
>  #ifdef CONFIG_SCHED_SMT
> @@ -867,11 +869,16 @@ static const struct cpumask *smallcore_smt_mask(int cpu)
>  }
>  #endif
> 
> +static const struct cpumask *cpu_bigcore_mask(int cpu)
> +{
> +	return cpu_core_mask(cpu);

It should be cpu_smt_mask() if we want the redundant big-core to be
degenerated in favour of the SMT level on P8, no? Because
cpu_core_mask refers to all the CPUs that are in the same chip.


> +}
> +
>  static struct sched_domain_topology_level powerpc_topology[] = {
>  #ifdef CONFIG_SCHED_SMT
>  	{ cpu_smt_mask, powerpc_smt_flags, SD_INIT_NAME(SMT) },
>  #endif
> -	{ shared_cache_mask, powerpc_shared_cache_flags, SD_INIT_NAME(CACHE) },
> +	{ cpu_bigcore_mask, SD_INIT_NAME(BIGCORE) },
>  	{ cpu_cpu_mask, SD_INIT_NAME(DIE) },
>  	{ NULL, },
>  };
> @@ -895,7 +902,7 @@ static int init_big_cores(void)
> 
>  #ifdef CONFIG_SCHED_SMT
>  	pr_info("Big cores detected. Using small core scheduling\n");
> -	powerpc_topology[0].mask = smallcore_smt_mask;
> +	powerpc_topology[smt_idx].mask = smallcore_smt_mask;
>  #endif
> 
>  	return 0;
> @@ -1319,7 +1326,6 @@ static void add_cpu_to_masks(int cpu)
>  void start_secondary(void *unused)
>  {
>  	unsigned int cpu = smp_processor_id();
> -	struct cpumask *(*sibling_mask)(int) = cpu_sibling_mask;
> 
>  	mmgrab(&init_mm);
>  	current->active_mm = &init_mm;
> @@ -1345,14 +1351,20 @@ void start_secondary(void *unused)
>  	/* Update topology CPU masks */
>  	add_cpu_to_masks(cpu);
> 
> -	if (has_big_cores)
> -		sibling_mask = cpu_smallcore_mask;
>  	/*
>  	 * Check for any shared caches. Note that this must be done on a
>  	 * per-core basis because one core in the pair might be disabled.
>  	 */
> -	if (!cpumask_equal(cpu_l2_cache_mask(cpu), sibling_mask(cpu)))
> -		shared_caches = true;
> +	if (!shared_caches) {
> +		struct cpumask *(*sibling_mask)(int) = cpu_sibling_mask;
> +		struct cpumask *mask = cpu_l2_cache_mask(cpu);
> +
> +		if (has_big_cores)
> +			sibling_mask = cpu_smallcore_mask;
> +
> +		if (cpumask_weight(mask) > cpumask_weight(sibling_mask(cpu)))
> +			shared_caches = true;

Shouldn't we use cpumask_subset() here ?

  			
> +	}
> 
>  	set_numa_node(numa_cpu_lookup_table[cpu]);
>  	set_numa_mem(local_memory_node(numa_cpu_lookup_table[cpu]));
> @@ -1390,6 +1402,14 @@ void __init smp_cpus_done(unsigned int max_cpus)
>  		smp_ops->bringup_done();
> 
>  	dump_numa_cpu_topology();
> +	if (shared_caches) {
> +		pr_info("Using shared cache scheduler topology\n");
> +		powerpc_topology[bigcore_idx].mask = shared_cache_mask;
> +#ifdef CONFIG_SCHED_DEBUG
> +		powerpc_topology[bigcore_idx].name = "CACHE";
> +#endif
> +		powerpc_topology[bigcore_idx].sd_flags = powerpc_shared_cache_flags;
> +	}


I would much rather that we have all the topology-fixups done in one
function.

fixup_topology(void) {
     if (has_big_core)
        powerpc_topology[smt_idx].mask = smallcore_smt_mask;

    if (shared_caches) {
       const char *name = "CACHE";
       powerpc_topology[bigcore_idx].mask = shared_cache_mask;
       strlcpy(powerpc_topology[bigcore_idx].name, name,
       				strlen(name));
       powerpc_topology[bigcore_idx].sd_flags = powerpc_shared_cache_flags;
    }

    /* Any other changes to the topology structure here */

And also as an optimization, get rid of degenerate structures here
itself so that we don't pay additional penalty while building the
sched-domains each time.

}

> 
>  	set_sched_topology(powerpc_topology);
>  }
> -- 
> 2.17.1
> 

^ permalink raw reply

* Re: ASMedia USB 3.x host controllers triggering EEH on POWER9
From: Oliver O'Halloran @ 2020-07-17  6:10 UTC (permalink / raw)
  To: Forest Crossman; +Cc: linux-pci, linux-usb, linuxppc-dev
In-Reply-To: <CAO3ALPy8_pxVyFROZUWNafEH1vUCP6LVpNmBBuMDSewGitzdLw@mail.gmail.com>

On Fri, Jul 17, 2020 at 2:14 PM Forest Crossman <cyrozap@gmail.com> wrote:
>
> Hi, all,
>
> I have several ASMedia USB 3.x host controllers (ASM2142 and ASM3142,
> both share the same Vendor ID/Device ID pair) that I'd like to use
> with a POWER9 system (a Raptor Computing Systems Talos II).
> Unfortunately, while the kernel recognizes the controllers just fine,
> as soon as I plug in a device, an EEH error occurs and the host
> controller gets repeatedly reset until it eventually gets disabled. An
> example of one of these errors can be seen here:
> https://paste.debian.net/hidden/e39698eb
>
> Based on the "PHB4 Diag-data" reported by the kernel, it seems that
> LEM_WOF_R bit 35, PHB_FESR bit 20, and RXE_ARB_FESR bit 28 have been
> set. According to the PHB4 specification
> (https://ibm.ent.box.com/s/jftnfhceul07qjh9jtn91xwjmclabc71), they
> respectively mean the following:
>  - ARB: IODA TVT Errors - "TCE Validation Table error occurred. The
> entry is invalid, or the PCI Address was out of range as defined by
> the TTA bounds in the TVE entry."
>  - RXE_ARB OR Error Status - "RXE_ARB error bits, ... OR of all error
> status bits."
>  - IODA TVT Address Range Error - "IODA Error: The PCI Address was out
> of range as defined by the TTA bounds in the TVE entry."

Welcome to my world!

In the future you can use this script to automate some of the tedium
of parsing the eeh dumps:
https://patchwork.ozlabs.org/project/skiboot/patch/20200717044243.1195833-1-oohall@gmail.com/

Anyway, for background the way PHB3 and PHB4 handle incoming DMAs goes
as follows:

1. Map the 16bit <bus><devfn> number to an IOMMU context, we call
those PEs. PE means "partitionable endpoint", but for the purpose of
processing DMAs you can ignore that and just treat it as an IOMMU
context ID.
2. Use the PE number and some of the upper bits of the DMA address to
form the index into the Translation Validation Table.
3. Use the table entry to validate the DMA address is within bounds
and whether it should be translated by the IOMMU or used as-is.

If the table entry says the DMA needs to be translated by the IOMMU we'll also:
4. Walk the IOMMU table to get the relevant IOMMU table entry.
5. Validate the device has permission to read/write to that address.

The "TVT Address Range Error" you're seeing means that the bounds
checks done in 3) is failing. OPAL configures the PHB so there's two
TVT entries (TVEs for short) assigned to each PE. Bit 59 of the DMA
address is used to select which TVE to use. We typically configure
TVE#0 to map 0x0...0x8000_0000 so there's a 2GB 32bit DMA window.
TVE#1 is configured for no-translate (bypass) mode so you can convert
from a system physical address to a DMA address by ORing in bit 59.

From word 2 of the PEST entry the faulting DMA address is:
0x0000203974c00000. That address is interesting since it looks a lot
like a memory address on the 2nd chip, but it doesn't have bit 59 set
so TVE#0 is used to validate it. Obviously that address is above 2GB
so we get the error.

What's probably happening is that the ASmedia controller doesn't
actually implement all 64 address bits and truncates the upper bits of
the DMA address. Doing that is a blatant violation of the PCIe (and
probably the xHCI) spec, but it's also pretty common since "it works
on x86." Something to try would be booting with the iommu=nobypass in
the kernel command line. That'll disable TVE#1 and force all DMAs to
go through TVE#0.

> In other words, the ASMedia USB controllers seem to be trying to write
> to addresses they're not supposed to, and thankfully the PHB4 is
> catching these bad writes before they can cause any corruption of my
> system's memory. Of course, this has the unfortunate side-effect that
> these devices are completely unable to operate with my computer, and
> since it seems to be possible to use these controllers on x86 systems
> (presumably because of the less-strict/disabled-by-default IOMMU), I
> wonder if maybe it would be possible to work around these errors in
> either the kernel or the OPAL firmware?

It's not easy to work around at the platform level since we have no
way to know what an arbitrary device can and can't do. There's various
tricks we can pull like putting the TVE#0 into bypass mode, but that
causes problems if there are any devices in the PE that are actually
limited to 32bit DMA. Some GPUs have a secondary audio function which
only supports 32bit DMA so it's not a completely academic concern
either.

> Now, I'm a novice at kernel hacking, so I don't really know what I'm
> doing, but just for fun I did try to paper over the issue by adding an
> EEH handler to the xhci driver
> (https://paste.debian.net/hidden/16081515), but as you might expect,
> that didn't do anything but prevent further communication with the
> device.

Assuming the nobypass trick above works, what you really need to do is
have the driver report that it can't address all 64bits by setting its
DMA mask accordingly. For the xhci driver it looks like this is done
in xhci_gen_setup(), there might be a quirks-style interface for
working around bugs in specific controllers that you can use. Have a
poke around and see what you can find :)

Oliver

^ permalink raw reply

* Re: [PATCH 05/11] powerpc/smp: Dont assume l2-cache to be superset of sibling
From: Gautham R Shenoy @ 2020-07-17  6:00 UTC (permalink / raw)
  To: Srikar Dronamraju
  Cc: Nathan Lynch, Gautham R Shenoy, Oliver OHalloran, Michael Neuling,
	Michael Ellerman, Anton Blanchard, linuxppc-dev, Nick Piggin
In-Reply-To: <20200714043624.5648-6-srikar@linux.vnet.ibm.com>

Hi Srikar,

On Tue, Jul 14, 2020 at 10:06:18AM +0530, Srikar Dronamraju wrote:
> Current code assumes that cpumask of cpus sharing a l2-cache mask will
> always be a superset of cpu_sibling_mask.
> 
> Lets stop that assumption.
> 
> Cc: linuxppc-dev <linuxppc-dev@lists.ozlabs.org>
> Cc: Michael Ellerman <michaele@au1.ibm.com>
> Cc: Nick Piggin <npiggin@au1.ibm.com>
> Cc: Oliver OHalloran <oliveroh@au1.ibm.com>
> Cc: Nathan Lynch <nathanl@linux.ibm.com>
> Cc: Michael Neuling <mikey@linux.ibm.com>
> Cc: Anton Blanchard <anton@au1.ibm.com>
> Cc: Gautham R Shenoy <ego@linux.vnet.ibm.com>
> Cc: Vaidyanathan Srinivasan <svaidy@linux.ibm.com>
> Signed-off-by: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
> ---
>  arch/powerpc/kernel/smp.c | 28 +++++++++++++++-------------
>  1 file changed, 15 insertions(+), 13 deletions(-)
> 
> diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
> index 7d430fc536cc..875f57e41355 100644
> --- a/arch/powerpc/kernel/smp.c
> +++ b/arch/powerpc/kernel/smp.c
> @@ -1198,6 +1198,7 @@ static bool update_mask_by_l2(int cpu, struct cpumask *(*mask_fn)(int))
>  	struct device_node *l2_cache, *np;
>  	int i;
> 
> +	cpumask_set_cpu(cpu, mask_fn(cpu));

It would be good to comment why do we need to do set the CPU in the
l2-mask if we don't have a l2cache domain.

>  	l2_cache = cpu_to_l2cache(cpu);
>  	if (!l2_cache)
>  		return false;
> @@ -1284,29 +1285,30 @@ static void add_cpu_to_masks(int cpu)
>  	 * add it to it's own thread sibling mask.
>  	 */
>  	cpumask_set_cpu(cpu, cpu_sibling_mask(cpu));
> +	cpumask_set_cpu(cpu, cpu_core_mask(cpu));
> 
>  	for (i = first_thread; i < first_thread + threads_per_core; i++)
>  		if (cpu_online(i))
>  			set_cpus_related(i, cpu, cpu_sibling_mask);
> 
>  	add_cpu_to_smallcore_masks(cpu);
> -	/*
> -	 * Copy the thread sibling mask into the cache sibling mask
> -	 * and mark any CPUs that share an L2 with this CPU.
> -	 */
> -	for_each_cpu(i, cpu_sibling_mask(cpu))
> -		set_cpus_related(cpu, i, cpu_l2_cache_mask);
>  	update_mask_by_l2(cpu, cpu_l2_cache_mask);
> 
> -	/*
> -	 * Copy the cache sibling mask into core sibling mask and mark
> -	 * any CPUs on the same chip as this CPU.
> -	 */
> -	for_each_cpu(i, cpu_l2_cache_mask(cpu))
> -		set_cpus_related(cpu, i, cpu_core_mask);
> +	if (pkg_id == -1) {
> +		struct cpumask *(*mask)(int) = cpu_sibling_mask;
> +
> +		/*
> +		 * Copy the sibling mask into core sibling mask and
> +		 * mark any CPUs on the same chip as this CPU.
> +		 */
> +		if (shared_caches)
> +			mask = cpu_l2_cache_mask;
> +


Now that we decoupling the containment relationship between
sibling_mask and l2-cache mask, should we set all the CPUs that are
both in cpu_sibling_mask(cpu) as well as cpu_l2_mask(cpu) in
cpu_core_mask ? 

> +		for_each_cpu(i, mask(cpu))
> +			set_cpus_related(cpu, i, cpu_core_mask);
> 
> -	if (pkg_id == -1)
>  		return;
> +	}
> 
>  	for_each_cpu(i, cpu_online_mask)
>  		if (get_physical_package_id(i) == pkg_id)
> -- 
> 2.17.1
> 
--
Thanks and Regards
gautham.

^ permalink raw reply

* Re: [PATCH v3] powerpc/pseries: detect secure and trusted boot state of the system.
From: Daniel Axtens @ 2020-07-17  5:58 UTC (permalink / raw)
  To: Michal Suchánek, Nayna Jain; +Cc: linuxppc-dev, linux-kernel, Mimi Zohar
In-Reply-To: <20200716081337.GB32107@kitsune.suse.cz>

Michal Suchánek <msuchanek@suse.de> writes:

> On Wed, Jul 15, 2020 at 07:52:01AM -0400, Nayna Jain wrote:
>> The device-tree property to check secure and trusted boot state is
>> different for guests(pseries) compared to baremetal(powernv).
>> 
>> This patch updates the existing is_ppc_secureboot_enabled() and
>> is_ppc_trustedboot_enabled() functions to add support for pseries.
>> 
>> The secureboot and trustedboot state are exposed via device-tree property:
>> /proc/device-tree/ibm,secure-boot and /proc/device-tree/ibm,trusted-boot
>> 
>> The values of ibm,secure-boot under pseries are interpreted as:
>                                       ^^^
>> 
>> 0 - Disabled
>> 1 - Enabled in Log-only mode. This patch interprets this value as
>> disabled, since audit mode is currently not supported for Linux.
>> 2 - Enabled and enforced.
>> 3-9 - Enabled and enforcing; requirements are at the discretion of the
>> operating system.
>> 
>> The values of ibm,trusted-boot under pseries are interpreted as:
>                                        ^^^
> These two should be different I suppose?

I'm not quite sure what you mean? They'll be documented in a future
revision of the PAPR, once I get my act together and submit the
relevant internal paperwork.

Daniel
>
> Thanks
>
> Michal
>> 0 - Disabled
>> 1 - Enabled
>> 
>> Signed-off-by: Nayna Jain <nayna@linux.ibm.com>
>> Reviewed-by: Daniel Axtens <dja@axtens.net>
>> ---
>> v3:
>> * fixed double check. Thanks Daniel for noticing it.
>> * updated patch description.
>> 
>> v2:
>> * included Michael Ellerman's feedback.
>> * added Daniel Axtens's Reviewed-by.
>> 
>>  arch/powerpc/kernel/secure_boot.c | 19 +++++++++++++++++--
>>  1 file changed, 17 insertions(+), 2 deletions(-)
>> 
>> diff --git a/arch/powerpc/kernel/secure_boot.c b/arch/powerpc/kernel/secure_boot.c
>> index 4b982324d368..118bcb5f79c4 100644
>> --- a/arch/powerpc/kernel/secure_boot.c
>> +++ b/arch/powerpc/kernel/secure_boot.c
>> @@ -6,6 +6,7 @@
>>  #include <linux/types.h>
>>  #include <linux/of.h>
>>  #include <asm/secure_boot.h>
>> +#include <asm/machdep.h>
>>  
>>  static struct device_node *get_ppc_fw_sb_node(void)
>>  {
>> @@ -23,12 +24,19 @@ bool is_ppc_secureboot_enabled(void)
>>  {
>>  	struct device_node *node;
>>  	bool enabled = false;
>> +	u32 secureboot;
>>  
>>  	node = get_ppc_fw_sb_node();
>>  	enabled = of_property_read_bool(node, "os-secureboot-enforcing");
>> -
>>  	of_node_put(node);
>>  
>> +	if (enabled)
>> +		goto out;
>> +
>> +	if (!of_property_read_u32(of_root, "ibm,secure-boot", &secureboot))
>> +		enabled = (secureboot > 1);
>> +
>> +out:
>>  	pr_info("Secure boot mode %s\n", enabled ? "enabled" : "disabled");
>>  
>>  	return enabled;
>> @@ -38,12 +46,19 @@ bool is_ppc_trustedboot_enabled(void)
>>  {
>>  	struct device_node *node;
>>  	bool enabled = false;
>> +	u32 trustedboot;
>>  
>>  	node = get_ppc_fw_sb_node();
>>  	enabled = of_property_read_bool(node, "trusted-enabled");
>> -
>>  	of_node_put(node);
>>  
>> +	if (enabled)
>> +		goto out;
>> +
>> +	if (!of_property_read_u32(of_root, "ibm,trusted-boot", &trustedboot))
>> +		enabled = (trustedboot > 0);
>> +
>> +out:
>>  	pr_info("Trusted boot mode %s\n", enabled ? "enabled" : "disabled");
>>  
>>  	return enabled;
>> -- 
>> 2.26.2
>> 

^ permalink raw reply

* Re: [PATCH 04/11] powerpc/smp: Enable small core scheduling sooner
From: Gautham R Shenoy @ 2020-07-17  5:48 UTC (permalink / raw)
  To: Srikar Dronamraju
  Cc: Nathan Lynch, Gautham R Shenoy, Oliver OHalloran, Michael Neuling,
	Michael Ellerman, Anton Blanchard, linuxppc-dev, Nick Piggin
In-Reply-To: <20200714043624.5648-5-srikar@linux.vnet.ibm.com>

On Tue, Jul 14, 2020 at 10:06:17AM +0530, Srikar Dronamraju wrote:
> Enable small core scheduling as soon as we detect that we are in a
> system that supports thread group. Doing so would avoid a redundant
> check.
> 
> Cc: linuxppc-dev <linuxppc-dev@lists.ozlabs.org>
> Cc: Michael Ellerman <michaele@au1.ibm.com>
> Cc: Nick Piggin <npiggin@au1.ibm.com>
> Cc: Oliver OHalloran <oliveroh@au1.ibm.com>
> Cc: Nathan Lynch <nathanl@linux.ibm.com>
> Cc: Michael Neuling <mikey@linux.ibm.com>
> Cc: Anton Blanchard <anton@au1.ibm.com>
> Cc: Gautham R Shenoy <ego@linux.vnet.ibm.com>
> Cc: Vaidyanathan Srinivasan <svaidy@linux.ibm.com>
> Signed-off-by: Srikar Dronamraju <srikar@linux.vnet.ibm.com>

I don't see a problem with this.

However, since we are now going to be maintaining a single topology
structure, wouldn't it be better to collate all the changes being made
to the mask_functions/flags/names of this structure within a single
function so that it becomes easier to keep track of what all changes
are going into the topology and why are we doing it?


> ---
>  arch/powerpc/kernel/smp.c | 12 ++++++------
>  1 file changed, 6 insertions(+), 6 deletions(-)
> 
> diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
> index 24529f6134aa..7d430fc536cc 100644
> --- a/arch/powerpc/kernel/smp.c
> +++ b/arch/powerpc/kernel/smp.c
> @@ -892,6 +892,12 @@ static int init_big_cores(void)
>  	}
> 
>  	has_big_cores = true;
> +
> +#ifdef CONFIG_SCHED_SMT
> +	pr_info("Big cores detected. Using small core scheduling\n");
> +	powerpc_topology[0].mask = smallcore_smt_mask;
> +#endif
> +
>  	return 0;
>  }
> 
> @@ -1383,12 +1389,6 @@ void __init smp_cpus_done(unsigned int max_cpus)
> 
>  	dump_numa_cpu_topology();
> 
> -#ifdef CONFIG_SCHED_SMT
> -	if (has_big_cores) {
> -		pr_info("Big cores detected but using small core scheduling\n");
> -		powerpc_topology[0].mask = smallcore_smt_mask;
> -	}
> -#endif
>  	set_sched_topology(powerpc_topology);
>  }
> 
> -- 
> 2.17.1
> 

^ permalink raw reply

* Re: [PATCH 03/11] powerpc/smp: Move powerpc_topology above
From: Gautham R Shenoy @ 2020-07-17  5:45 UTC (permalink / raw)
  To: Srikar Dronamraju
  Cc: Nathan Lynch, Gautham R Shenoy, Oliver OHalloran, Michael Neuling,
	Michael Ellerman, Anton Blanchard, linuxppc-dev, Nick Piggin
In-Reply-To: <20200714043624.5648-4-srikar@linux.vnet.ibm.com>

On Tue, Jul 14, 2020 at 10:06:16AM +0530, Srikar Dronamraju wrote:
> Just moving the powerpc_topology description above.
> This will help in using functions in this file and avoid declarations.
> 
> No other functional changes
> 
> Cc: linuxppc-dev <linuxppc-dev@lists.ozlabs.org>
> Cc: Michael Ellerman <michaele@au1.ibm.com>
> Cc: Nick Piggin <npiggin@au1.ibm.com>
> Cc: Oliver OHalloran <oliveroh@au1.ibm.com>
> Cc: Nathan Lynch <nathanl@linux.ibm.com>
> Cc: Michael Neuling <mikey@linux.ibm.com>
> Cc: Anton Blanchard <anton@au1.ibm.com>
> Cc: Gautham R Shenoy <ego@linux.vnet.ibm.com>
> Cc: Vaidyanathan Srinivasan <svaidy@linux.ibm.com>
> Signed-off-by: Srikar Dronamraju <srikar@linux.vnet.ibm.com>

Reviewed-by: Gautham R. Shenoy <ego@linux.vnet.ibm.com>

> ---
>  arch/powerpc/kernel/smp.c | 116 +++++++++++++++++++-------------------
>  1 file changed, 58 insertions(+), 58 deletions(-)
> 
> diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
> index 069ea4b21c6d..24529f6134aa 100644
> --- a/arch/powerpc/kernel/smp.c
> +++ b/arch/powerpc/kernel/smp.c
> @@ -818,6 +818,64 @@ static int init_cpu_l1_cache_map(int cpu)
>  	return err;
>  }
> 
> +static bool shared_caches;
> +
> +#ifdef CONFIG_SCHED_SMT
> +/* cpumask of CPUs with asymmetric SMT dependency */
> +static int powerpc_smt_flags(void)
> +{
> +	int flags = SD_SHARE_CPUCAPACITY | SD_SHARE_PKG_RESOURCES;
> +
> +	if (cpu_has_feature(CPU_FTR_ASYM_SMT)) {
> +		printk_once(KERN_INFO "Enabling Asymmetric SMT scheduling\n");
> +		flags |= SD_ASYM_PACKING;
> +	}
> +	return flags;
> +}
> +#endif
> +
> +/*
> + * P9 has a slightly odd architecture where pairs of cores share an L2 cache.
> + * This topology makes it *much* cheaper to migrate tasks between adjacent cores
> + * since the migrated task remains cache hot. We want to take advantage of this
> + * at the scheduler level so an extra topology level is required.
> + */
> +static int powerpc_shared_cache_flags(void)
> +{
> +	return SD_SHARE_PKG_RESOURCES;
> +}
> +
> +/*
> + * We can't just pass cpu_l2_cache_mask() directly because
> + * returns a non-const pointer and the compiler barfs on that.
> + */
> +static const struct cpumask *shared_cache_mask(int cpu)
> +{
> +	if (shared_caches)
> +		return cpu_l2_cache_mask(cpu);
> +
> +	if (has_big_cores)
> +		return cpu_smallcore_mask(cpu);
> +
> +	return cpu_smt_mask(cpu);
> +}
> +
> +#ifdef CONFIG_SCHED_SMT
> +static const struct cpumask *smallcore_smt_mask(int cpu)
> +{
> +	return cpu_smallcore_mask(cpu);
> +}
> +#endif
> +
> +static struct sched_domain_topology_level powerpc_topology[] = {
> +#ifdef CONFIG_SCHED_SMT
> +	{ cpu_smt_mask, powerpc_smt_flags, SD_INIT_NAME(SMT) },
> +#endif
> +	{ shared_cache_mask, powerpc_shared_cache_flags, SD_INIT_NAME(CACHE) },
> +	{ cpu_cpu_mask, SD_INIT_NAME(DIE) },
> +	{ NULL, },
> +};
> +
>  static int init_big_cores(void)
>  {
>  	int cpu;
> @@ -1249,8 +1307,6 @@ static void add_cpu_to_masks(int cpu)
>  			set_cpus_related(cpu, i, cpu_core_mask);
>  }
> 
> -static bool shared_caches;
> -
>  /* Activate a secondary processor. */
>  void start_secondary(void *unused)
>  {
> @@ -1314,62 +1370,6 @@ int setup_profiling_timer(unsigned int multiplier)
>  	return 0;
>  }
> 
> -#ifdef CONFIG_SCHED_SMT
> -/* cpumask of CPUs with asymmetric SMT dependency */
> -static int powerpc_smt_flags(void)
> -{
> -	int flags = SD_SHARE_CPUCAPACITY | SD_SHARE_PKG_RESOURCES;
> -
> -	if (cpu_has_feature(CPU_FTR_ASYM_SMT)) {
> -		printk_once(KERN_INFO "Enabling Asymmetric SMT scheduling\n");
> -		flags |= SD_ASYM_PACKING;
> -	}
> -	return flags;
> -}
> -#endif
> -
> -/*
> - * P9 has a slightly odd architecture where pairs of cores share an L2 cache.
> - * This topology makes it *much* cheaper to migrate tasks between adjacent cores
> - * since the migrated task remains cache hot. We want to take advantage of this
> - * at the scheduler level so an extra topology level is required.
> - */
> -static int powerpc_shared_cache_flags(void)
> -{
> -	return SD_SHARE_PKG_RESOURCES;
> -}
> -
> -/*
> - * We can't just pass cpu_l2_cache_mask() directly because
> - * returns a non-const pointer and the compiler barfs on that.
> - */
> -static const struct cpumask *shared_cache_mask(int cpu)
> -{
> -	if (shared_caches)
> -		return cpu_l2_cache_mask(cpu);
> -
> -	if (has_big_cores)
> -		return cpu_smallcore_mask(cpu);
> -
> -	return cpu_smt_mask(cpu);
> -}
> -
> -#ifdef CONFIG_SCHED_SMT
> -static const struct cpumask *smallcore_smt_mask(int cpu)
> -{
> -	return cpu_smallcore_mask(cpu);
> -}
> -#endif
> -
> -static struct sched_domain_topology_level powerpc_topology[] = {
> -#ifdef CONFIG_SCHED_SMT
> -	{ cpu_smt_mask, powerpc_smt_flags, SD_INIT_NAME(SMT) },
> -#endif
> -	{ shared_cache_mask, powerpc_shared_cache_flags, SD_INIT_NAME(CACHE) },
> -	{ cpu_cpu_mask, SD_INIT_NAME(DIE) },
> -	{ NULL, },
> -};
> -
>  void __init smp_cpus_done(unsigned int max_cpus)
>  {
>  	/*
> -- 
> 2.17.1
> 

^ permalink raw reply

* Re: [PATCH v4 05/10] powerpc/dt_cpu_ftrs: Add feature for 2nd DAWR
From: Jordan Niethe @ 2020-07-17  5:44 UTC (permalink / raw)
  To: Ravi Bangoria
  Cc: Christophe Leroy, apopple, mikey, miltonm, peterz, oleg,
	Nicholas Piggin, linux-kernel, Paul Mackerras, jolsa, fweisbec,
	pedromfc, naveen.n.rao, linuxppc-dev, mingo
In-Reply-To: <20200717040958.70561-6-ravi.bangoria@linux.ibm.com>

On Fri, Jul 17, 2020 at 2:10 PM Ravi Bangoria
<ravi.bangoria@linux.ibm.com> wrote:
>
> Add new device-tree feature for 2nd DAWR. If this feature is present,
> 2nd DAWR is supported, otherwise not.
>
> Signed-off-by: Ravi Bangoria <ravi.bangoria@linux.ibm.com>
> ---
>  arch/powerpc/include/asm/cputable.h | 7 +++++--
>  arch/powerpc/kernel/dt_cpu_ftrs.c   | 7 +++++++
>  2 files changed, 12 insertions(+), 2 deletions(-)
>
> diff --git a/arch/powerpc/include/asm/cputable.h b/arch/powerpc/include/asm/cputable.h
> index e506d429b1af..3445c86e1f6f 100644
> --- a/arch/powerpc/include/asm/cputable.h
> +++ b/arch/powerpc/include/asm/cputable.h
> @@ -214,6 +214,7 @@ static inline void cpu_feature_keys_init(void) { }
>  #define CPU_FTR_P9_TLBIE_ERAT_BUG      LONG_ASM_CONST(0x0001000000000000)
>  #define CPU_FTR_P9_RADIX_PREFETCH_BUG  LONG_ASM_CONST(0x0002000000000000)
>  #define CPU_FTR_ARCH_31                        LONG_ASM_CONST(0x0004000000000000)
> +#define CPU_FTR_DAWR1                  LONG_ASM_CONST(0x0008000000000000)
>
>  #ifndef __ASSEMBLY__
>
> @@ -497,14 +498,16 @@ static inline void cpu_feature_keys_init(void) { }
>  #define CPU_FTRS_POSSIBLE      \
>             (CPU_FTRS_POWER7 | CPU_FTRS_POWER8E | CPU_FTRS_POWER8 | \
>              CPU_FTR_ALTIVEC_COMP | CPU_FTR_VSX_COMP | CPU_FTRS_POWER9 | \
> -            CPU_FTRS_POWER9_DD2_1 | CPU_FTRS_POWER9_DD2_2 | CPU_FTRS_POWER10)
> +            CPU_FTRS_POWER9_DD2_1 | CPU_FTRS_POWER9_DD2_2 | CPU_FTRS_POWER10 | \
> +            CPU_FTR_DAWR1)
>  #else
>  #define CPU_FTRS_POSSIBLE      \
>             (CPU_FTRS_PPC970 | CPU_FTRS_POWER5 | \
>              CPU_FTRS_POWER6 | CPU_FTRS_POWER7 | CPU_FTRS_POWER8E | \
>              CPU_FTRS_POWER8 | CPU_FTRS_CELL | CPU_FTRS_PA6T | \
>              CPU_FTR_VSX_COMP | CPU_FTR_ALTIVEC_COMP | CPU_FTRS_POWER9 | \
> -            CPU_FTRS_POWER9_DD2_1 | CPU_FTRS_POWER9_DD2_2 | CPU_FTRS_POWER10)
> +            CPU_FTRS_POWER9_DD2_1 | CPU_FTRS_POWER9_DD2_2 | CPU_FTRS_POWER10 | \
> +            CPU_FTR_DAWR1)
Instead of putting CPU_FTR_DAWR1 into CPU_FTRS_POSSIBLE should it go
into CPU_FTRS_POWER10?
Then it will be picked up by CPU_FTRS_POSSIBLE.
>  #endif /* CONFIG_CPU_LITTLE_ENDIAN */
>  #endif
>  #else
> diff --git a/arch/powerpc/kernel/dt_cpu_ftrs.c b/arch/powerpc/kernel/dt_cpu_ftrs.c
> index ac650c233cd9..c78cd3596ec4 100644
> --- a/arch/powerpc/kernel/dt_cpu_ftrs.c
> +++ b/arch/powerpc/kernel/dt_cpu_ftrs.c
> @@ -574,6 +574,12 @@ static int __init feat_enable_mma(struct dt_cpu_feature *f)
>         return 1;
>  }
>
> +static int __init feat_enable_debug_facilities_v31(struct dt_cpu_feature *f)
> +{
> +       cur_cpu_spec->cpu_features |= CPU_FTR_DAWR1;
> +       return 1;
> +}
> +
>  struct dt_cpu_feature_match {
>         const char *name;
>         int (*enable)(struct dt_cpu_feature *f);
> @@ -649,6 +655,7 @@ static struct dt_cpu_feature_match __initdata
>         {"wait-v3", feat_enable, 0},
>         {"prefix-instructions", feat_enable, 0},
>         {"matrix-multiply-assist", feat_enable_mma, 0},
> +       {"debug-facilities-v31", feat_enable_debug_facilities_v31, 0},
Since all feat_enable_debug_facilities_v31() does is set
CPU_FTR_DAWR1, if you just have:
{"debug-facilities-v31", feat_enable, CPU_FTR_DAWR1},
I think cpufeatures_process_feature() should set it in for you at this point:
            if (m->enable(f)) {
                cur_cpu_spec->cpu_features |= m->cpu_ftr_bit_mask;
                break;
            }

>  };
>
>  static bool __initdata using_dt_cpu_ftrs;
> --
> 2.26.2
>

^ permalink raw reply

* Re: [PATCH 02/11] powerpc/smp: Merge Power9 topology with Power topology
From: Gautham R Shenoy @ 2020-07-17  5:44 UTC (permalink / raw)
  To: Srikar Dronamraju
  Cc: Nathan Lynch, Gautham R Shenoy, Oliver OHalloran, Michael Neuling,
	Michael Ellerman, Anton Blanchard, linuxppc-dev, Nick Piggin
In-Reply-To: <20200714043624.5648-3-srikar@linux.vnet.ibm.com>

Hi Srikar,

On Tue, Jul 14, 2020 at 10:06:15AM +0530, Srikar Dronamraju wrote:
> A new sched_domain_topology_level was added just for Power9. However the
> same can be achieved by merging powerpc_topology with power9_topology
> and makes the code more simpler especially when adding a new sched
> domain.
> 
> Cc: linuxppc-dev <linuxppc-dev@lists.ozlabs.org>
> Cc: Michael Ellerman <michaele@au1.ibm.com>
> Cc: Nick Piggin <npiggin@au1.ibm.com>
> Cc: Oliver OHalloran <oliveroh@au1.ibm.com>
> Cc: Nathan Lynch <nathanl@linux.ibm.com>
> Cc: Michael Neuling <mikey@linux.ibm.com>
> Cc: Anton Blanchard <anton@au1.ibm.com>
> Cc: Gautham R Shenoy <ego@linux.vnet.ibm.com>
> Cc: Vaidyanathan Srinivasan <svaidy@linux.ibm.com>
> Signed-off-by: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
> ---
>  arch/powerpc/kernel/smp.c | 33 ++++++++++-----------------------
>  1 file changed, 10 insertions(+), 23 deletions(-)
> 
> diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
> index 680c0edcc59d..069ea4b21c6d 100644
> --- a/arch/powerpc/kernel/smp.c
> +++ b/arch/powerpc/kernel/smp.c
> @@ -1315,7 +1315,7 @@ int setup_profiling_timer(unsigned int multiplier)
>  }
> 
>  #ifdef CONFIG_SCHED_SMT
> -/* cpumask of CPUs with asymetric SMT dependancy */
> +/* cpumask of CPUs with asymmetric SMT dependency */
>  static int powerpc_smt_flags(void)
>  {
>  	int flags = SD_SHARE_CPUCAPACITY | SD_SHARE_PKG_RESOURCES;
> @@ -1328,14 +1328,6 @@ static int powerpc_smt_flags(void)
>  }
>  #endif
> 
> -static struct sched_domain_topology_level powerpc_topology[] = {
> -#ifdef CONFIG_SCHED_SMT
> -	{ cpu_smt_mask, powerpc_smt_flags, SD_INIT_NAME(SMT) },
> -#endif
> -	{ cpu_cpu_mask, SD_INIT_NAME(DIE) },
> -	{ NULL, },
> -};
> -
>  /*
>   * P9 has a slightly odd architecture where pairs of cores share an L2 cache.
>   * This topology makes it *much* cheaper to migrate tasks between adjacent cores
> @@ -1353,7 +1345,13 @@ static int powerpc_shared_cache_flags(void)
>   */
>  static const struct cpumask *shared_cache_mask(int cpu)
>  {
> -	return cpu_l2_cache_mask(cpu);
> +	if (shared_caches)
> +		return cpu_l2_cache_mask(cpu);
> +
> +	if (has_big_cores)
> +		return cpu_smallcore_mask(cpu);
> +
> +	return cpu_smt_mask(cpu);
>  }
> 
>  #ifdef CONFIG_SCHED_SMT
> @@ -1363,7 +1361,7 @@ static const struct cpumask *smallcore_smt_mask(int cpu)
>  }
>  #endif
> 
> -static struct sched_domain_topology_level power9_topology[] = {
> +static struct sched_domain_topology_level powerpc_topology[] = {


>  #ifdef CONFIG_SCHED_SMT
>  	{ cpu_smt_mask, powerpc_smt_flags, SD_INIT_NAME(SMT) },
>  #endif
> @@ -1388,21 +1386,10 @@ void __init smp_cpus_done(unsigned int max_cpus)
>  #ifdef CONFIG_SCHED_SMT
>  	if (has_big_cores) {
>  		pr_info("Big cores detected but using small core scheduling\n");
I> -		power9_topology[0].mask = smallcore_smt_mask;
>  		powerpc_topology[0].mask = smallcore_smt_mask;
>  	}
>  #endif
> -	/*
> -	 * If any CPU detects that it's sharing a cache with another CPU then
> -	 * use the deeper topology that is aware of this sharing.
> -	 */
> -	if (shared_caches) {
> -		pr_info("Using shared cache scheduler topology\n");
> -		set_sched_topology(power9_topology);
> -	} else {
> -		pr_info("Using standard scheduler topology\n");
> -		set_sched_topology(powerpc_topology);


Ok, so we will go with the three level topology by default (SMT,
CACHE, DIE) and will rely on the sched-domain creation code to
degenerate CACHE domain in case SMT and CACHE have the same set of
CPUs (POWER8 for eg).

From a cleanup perspective this is better, since we won't have to
worry about defining multiple topology structures, but from a
performance point of view, wouldn't we now pay an extra penalty of
degenerating the CACHE domains on POWER8 kind of systems, each time
when a CPU comes online ?

Do we know how bad it is ? If the degeneration takes a few extra
microseconds, that should be ok I suppose.

--
Thanks and Regards
gautham.

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox