LinuxPPC-Dev Archive on lore.kernel.org
 help / color / mirror / Atom feed
* Re: [PATCH v3 1/8] powerpc/uaccess: Add unsafe_copy_from_user
From: Christophe Leroy @ 2021-01-19  7:29 UTC (permalink / raw)
  To: Christopher M. Riedl, linuxppc-dev
In-Reply-To: <C8LLSM3UC598.L4E2VMGJOI06@geist>



Le 17/01/2021 à 18:19, Christopher M. Riedl a écrit :
> On Mon Jan 11, 2021 at 7:22 AM CST, Christophe Leroy wrote:
>>
>>
>> Le 09/01/2021 à 04:25, Christopher M. Riedl a écrit :
>>> Implement raw_copy_from_user_allowed() which assumes that userspace read
>>> access is open. Use this new function to implement raw_copy_from_user().
>>> Finally, wrap the new function to follow the usual "unsafe_" convention
>>> of taking a label argument.
>>
>> I think there is no point implementing raw_copy_from_user_allowed(), see
>> https://github.com/linuxppc/linux/commit/4b842e4e25b1 and
>> https://patchwork.ozlabs.org/project/linuxppc-dev/patch/8c74fc9ce8131cabb10b3e95dc0e430f396ee83e.1610369143.git.christophe.leroy@csgroup.eu/
>>
>> You should simply do:
>>
>> #define unsafe_copy_from_user(d, s, l, e) \
>> unsafe_op_wrap(__copy_tofrom_user((__force void __user *)d, s, l), e)
>>
> 
> I gave this a try and the signal ops decreased by ~8K. Now, to be
> honest, I am not sure what an "acceptable" benchmark number here
> actually is - so maybe this is ok? Same loss with both radix and hash:

I don't think this is ok, but it probably means that you are using unsafe_copy_from_user() to copy 
small constant size data that should be copied with unsafe_get_user() instead.

> 
> 	|                                      | hash   | radix  |
> 	| ------------------------------------ | ------ | ------ |
> 	| linuxppc/next                        | 118693 | 133296 |
> 	| linuxppc/next w/o KUAP+KUEP          | 228911 | 228654 |
> 	| unsafe-signal64                      | 200480 | 234067 |
> 	| unsafe-signal64 (__copy_tofrom_user) | 192467 | 225119 |
> 
> To put this into perspective, prior to KUAP and uaccess flush, signal
> performance in this benchmark was ~290K on hash.
> 
>>
>> Christophe
>>
>>>
>>> The new raw_copy_from_user_allowed() calls non-inline __copy_tofrom_user()
>>> internally. This is still safe to call inside user access blocks formed
>>> with user_*_access_begin()/user_*_access_end() since asm functions are not
>>> instrumented for tracing.
>>>
>>> Signed-off-by: Christopher M. Riedl <cmr@codefail.de>
>>> ---
>>>    arch/powerpc/include/asm/uaccess.h | 28 +++++++++++++++++++---------
>>>    1 file changed, 19 insertions(+), 9 deletions(-)
>>>
>>> diff --git a/arch/powerpc/include/asm/uaccess.h b/arch/powerpc/include/asm/uaccess.h
>>> index 501c9a79038c..698f3a6d6ae5 100644
>>> --- a/arch/powerpc/include/asm/uaccess.h
>>> +++ b/arch/powerpc/include/asm/uaccess.h
>>> @@ -403,38 +403,45 @@ raw_copy_in_user(void __user *to, const void __user *from, unsigned long n)
>>>    }
>>>    #endif /* __powerpc64__ */
>>>    
>>> -static inline unsigned long raw_copy_from_user(void *to,
>>> -		const void __user *from, unsigned long n)
>>> +static inline unsigned long
>>> +raw_copy_from_user_allowed(void *to, const void __user *from, unsigned long n)
>>>    {
>>> -	unsigned long ret;
>>>    	if (__builtin_constant_p(n) && (n <= 8)) {
>>> -		ret = 1;
>>> +		unsigned long ret = 1;
>>>    
>>>    		switch (n) {
>>>    		case 1:
>>>    			barrier_nospec();
>>> -			__get_user_size(*(u8 *)to, from, 1, ret);
>>> +			__get_user_size_allowed(*(u8 *)to, from, 1, ret);
>>>    			break;
>>>    		case 2:
>>>    			barrier_nospec();
>>> -			__get_user_size(*(u16 *)to, from, 2, ret);
>>> +			__get_user_size_allowed(*(u16 *)to, from, 2, ret);
>>>    			break;
>>>    		case 4:
>>>    			barrier_nospec();
>>> -			__get_user_size(*(u32 *)to, from, 4, ret);
>>> +			__get_user_size_allowed(*(u32 *)to, from, 4, ret);
>>>    			break;
>>>    		case 8:
>>>    			barrier_nospec();
>>> -			__get_user_size(*(u64 *)to, from, 8, ret);
>>> +			__get_user_size_allowed(*(u64 *)to, from, 8, ret);
>>>    			break;
>>>    		}
>>>    		if (ret == 0)
>>>    			return 0;
>>>    	}
>>>    
>>> +	return __copy_tofrom_user((__force void __user *)to, from, n);
>>> +}
>>> +
>>> +static inline unsigned long
>>> +raw_copy_from_user(void *to, const void __user *from, unsigned long n)
>>> +{
>>> +	unsigned long ret;
>>> +
>>>    	barrier_nospec();
>>>    	allow_read_from_user(from, n);
>>> -	ret = __copy_tofrom_user((__force void __user *)to, from, n);
>>> +	ret = raw_copy_from_user_allowed(to, from, n);
>>>    	prevent_read_from_user(from, n);
>>>    	return ret;
>>>    }
>>> @@ -542,6 +549,9 @@ user_write_access_begin(const void __user *ptr, size_t len)
>>>    #define unsafe_get_user(x, p, e) unsafe_op_wrap(__get_user_allowed(x, p), e)
>>>    #define unsafe_put_user(x, p, e) __put_user_goto(x, p, e)
>>>    
>>> +#define unsafe_copy_from_user(d, s, l, e) \
>>> +	unsafe_op_wrap(raw_copy_from_user_allowed(d, s, l), e)
>>> +
>>>    #define unsafe_copy_to_user(d, s, l, e) \
>>>    do {									\
>>>    	u8 __user *_dst = (u8 __user *)(d);				\
>>>

^ permalink raw reply

* [PATCH 0/2] powerpc/cacheinfo: Add "ibm,thread-groups" awareness
From: Gautham R. Shenoy @ 2021-01-19  7:36 UTC (permalink / raw)
  To: Michael Ellerman, Nathan Lynch, Srikar Dronamraju
  Cc: Gautham R. Shenoy, linuxppc-dev, linux-kernel

From: "Gautham R. Shenoy" <ego@linux.vnet.ibm.com>

Hi,

Currently the cacheinfo code on powerpc indexes the "cache" objects
(modelling the L1/L2/L3 caches) where the key is device-tree node
corresponding to that cache. On some of the POWER server platforms
thread-groups within the core share different sets of caches (Eg: On
SMT8 POWER9 systems, threads 0,2,4,6 of a core share L1 cache and
threads 1,3,5,7 of the same core share another L1 cache). On such
platforms, there is a single device-tree node corresponding to that
cache and the cache-configuration within the threads of the core is
indicated via "ibm,thread-groups" device-tree property.

Since the current code is not aware of the "ibm,thread-groups"
property, on the aforementoined systems, cacheinfo code still treats
all the threads in the core to be sharing the cache because of the
single device-tree node (In the earlier example, the cacheinfo code
would says CPUs 0-7 share L1 cache).

In this patch series, we make the powerpc cacheinfo code aware of the
"ibm,thread-groups" property. We indexe the "cache" objects by the
key-pair (device-tree node, thread-group id). For any CPUX, for a
given level of cache, the thread-group id is defined to be the first
CPU in the "ibm,thread-groups" cache-group containing CPUX. For levels
of cache which are not represented in "ibm,thread-groups" property,
the thread-group id is -1.

We can now remove the helper function get_shared_cpu_map() and
index_dir_to_cpu() since the cache->shared_cpu_map contains the
correct satate of the thread-siblings sharing the cache.

This has been tested on a SMT8 POWER9 system where L1 cache is split
between groups of threads in the core and on an SMT8 POWER10 system
where L1 and L2 caches are split between groups of threads in a core.

With this patch series, on POWER10 SMT8 system, we see the following
reported via sysfs:

$ grep . /sys/devices/system/cpu/cpu[89]/cache/index[0123]/shared_cpu_list
/sys/devices/system/cpu/cpu8/cache/index0/shared_cpu_list:8,10,12,14
/sys/devices/system/cpu/cpu8/cache/index1/shared_cpu_list:8,10,12,14
/sys/devices/system/cpu/cpu8/cache/index2/shared_cpu_list:8,10,12,14
/sys/devices/system/cpu/cpu8/cache/index3/shared_cpu_list:8-15
/sys/devices/system/cpu/cpu9/cache/index0/shared_cpu_list:9,11,13,15
/sys/devices/system/cpu/cpu9/cache/index1/shared_cpu_list:9,11,13,15
/sys/devices/system/cpu/cpu9/cache/index2/shared_cpu_list:9,11,13,15
/sys/devices/system/cpu/cpu9/cache/index3/shared_cpu_list:8-15

$ ppc64_cpu --smt=4
$ grep . /sys/devices/system/cpu/cpu[89]/cache/index[0123]/shared_cpu_list
/sys/devices/system/cpu/cpu8/cache/index0/shared_cpu_list:8,10
/sys/devices/system/cpu/cpu8/cache/index1/shared_cpu_list:8,10
/sys/devices/system/cpu/cpu8/cache/index2/shared_cpu_list:8,10
/sys/devices/system/cpu/cpu8/cache/index3/shared_cpu_list:8-11
/sys/devices/system/cpu/cpu9/cache/index0/shared_cpu_list:9,11
/sys/devices/system/cpu/cpu9/cache/index1/shared_cpu_list:9,11
/sys/devices/system/cpu/cpu9/cache/index2/shared_cpu_list:9,11
/sys/devices/system/cpu/cpu9/cache/index3/shared_cpu_list:8-11

$ ppc64_cpu --smt=2
$ grep . /sys/devices/system/cpu/cpu[89]/cache/index[0123]/shared_cpu_list
/sys/devices/system/cpu/cpu8/cache/index0/shared_cpu_list:8
/sys/devices/system/cpu/cpu8/cache/index1/shared_cpu_list:8
/sys/devices/system/cpu/cpu8/cache/index2/shared_cpu_list:8
/sys/devices/system/cpu/cpu8/cache/index3/shared_cpu_list:8-9
/sys/devices/system/cpu/cpu9/cache/index0/shared_cpu_list:9
/sys/devices/system/cpu/cpu9/cache/index1/shared_cpu_list:9
/sys/devices/system/cpu/cpu9/cache/index2/shared_cpu_list:9
/sys/devices/system/cpu/cpu9/cache/index3/shared_cpu_list:8-9

$ ppc64_cpu --smt=1
$ grep . /sys/devices/system/cpu/cpu[89]/cache/index[0123]/shared_cpu_list
/sys/devices/system/cpu/cpu8/cache/index0/shared_cpu_list:8
/sys/devices/system/cpu/cpu8/cache/index1/shared_cpu_list:8
/sys/devices/system/cpu/cpu8/cache/index2/shared_cpu_list:8
/sys/devices/system/cpu/cpu8/cache/index3/shared_cpu_list:8

Gautham R. Shenoy (2):
  powerpc/cacheinfo: Lookup cache by dt node and thread-group id
  powerpc/cacheinfo: Remove the redundant get_shared_cpu_map()

 arch/powerpc/include/asm/smp.h  |   3 +
 arch/powerpc/kernel/cacheinfo.c | 121 ++++++++++++++++++++--------------------
 2 files changed, 62 insertions(+), 62 deletions(-)

-- 
1.9.4


^ permalink raw reply

* [PATCH 2/2] powerpc/cacheinfo: Remove the redundant get_shared_cpu_map()
From: Gautham R. Shenoy @ 2021-01-19  7:36 UTC (permalink / raw)
  To: Michael Ellerman, Nathan Lynch, Srikar Dronamraju
  Cc: Gautham R. Shenoy, linuxppc-dev, linux-kernel
In-Reply-To: <1611041780-8640-1-git-send-email-ego@linux.vnet.ibm.com>

From: "Gautham R. Shenoy" <ego@linux.vnet.ibm.com>

The helper function get_shared_cpu_map() was added in

'commit 500fe5f550ec ("powerpc/cacheinfo: Report the correct
shared_cpu_map on big-cores")'

and subsequently expanded upon in

'commit 0be47634db0b ("powerpc/cacheinfo: Print correct cache-sibling
map/list for L2 cache")'

in order to help report the correct groups of threads sharing these caches
on big-core systems where groups of threads within a core can share
different sets of caches.

Now that powerpc/cacheinfo is aware of "ibm,thread-groups" property,
cache->shared_cpu_map contains the correct set of thread-siblings
sharing the cache. Hence we no longer need the functions
get_shared_cpu_map(). This patch removes this function. We also remove
the helper function index_dir_to_cpu() which was only called by
get_shared_cpu_map().

With these functions removed, we can still see the correct
cache-sibling map/list for L1 and L2 caches on systems with L1 and L2
caches distributed among groups of threads in a core.

With this patch, on a SMT8 POWER10 system where the L1 and L2 caches
are split between the two groups of threads in a core, for CPUs 8,9,
the L1-Data, L1-Instruction, L2, L3 cache CPU sibling list is as
follows:

$ grep . /sys/devices/system/cpu/cpu[89]/cache/index[0123]/shared_cpu_list
/sys/devices/system/cpu/cpu8/cache/index0/shared_cpu_list:8,10,12,14
/sys/devices/system/cpu/cpu8/cache/index1/shared_cpu_list:8,10,12,14
/sys/devices/system/cpu/cpu8/cache/index2/shared_cpu_list:8,10,12,14
/sys/devices/system/cpu/cpu8/cache/index3/shared_cpu_list:8-15
/sys/devices/system/cpu/cpu9/cache/index0/shared_cpu_list:9,11,13,15
/sys/devices/system/cpu/cpu9/cache/index1/shared_cpu_list:9,11,13,15
/sys/devices/system/cpu/cpu9/cache/index2/shared_cpu_list:9,11,13,15
/sys/devices/system/cpu/cpu9/cache/index3/shared_cpu_list:8-15

$ ppc64_cpu --smt=4
$ grep . /sys/devices/system/cpu/cpu[89]/cache/index[0123]/shared_cpu_list
/sys/devices/system/cpu/cpu8/cache/index0/shared_cpu_list:8,10
/sys/devices/system/cpu/cpu8/cache/index1/shared_cpu_list:8,10
/sys/devices/system/cpu/cpu8/cache/index2/shared_cpu_list:8,10
/sys/devices/system/cpu/cpu8/cache/index3/shared_cpu_list:8-11
/sys/devices/system/cpu/cpu9/cache/index0/shared_cpu_list:9,11
/sys/devices/system/cpu/cpu9/cache/index1/shared_cpu_list:9,11
/sys/devices/system/cpu/cpu9/cache/index2/shared_cpu_list:9,11
/sys/devices/system/cpu/cpu9/cache/index3/shared_cpu_list:8-11

$ ppc64_cpu --smt=2
$ grep . /sys/devices/system/cpu/cpu[89]/cache/index[0123]/shared_cpu_list
/sys/devices/system/cpu/cpu8/cache/index0/shared_cpu_list:8
/sys/devices/system/cpu/cpu8/cache/index1/shared_cpu_list:8
/sys/devices/system/cpu/cpu8/cache/index2/shared_cpu_list:8
/sys/devices/system/cpu/cpu8/cache/index3/shared_cpu_list:8-9
/sys/devices/system/cpu/cpu9/cache/index0/shared_cpu_list:9
/sys/devices/system/cpu/cpu9/cache/index1/shared_cpu_list:9
/sys/devices/system/cpu/cpu9/cache/index2/shared_cpu_list:9
/sys/devices/system/cpu/cpu9/cache/index3/shared_cpu_list:8-9

$ ppc64_cpu --smt=1
$ grep . /sys/devices/system/cpu/cpu[89]/cache/index[0123]/shared_cpu_list
/sys/devices/system/cpu/cpu8/cache/index0/shared_cpu_list:8
/sys/devices/system/cpu/cpu8/cache/index1/shared_cpu_list:8
/sys/devices/system/cpu/cpu8/cache/index2/shared_cpu_list:8
/sys/devices/system/cpu/cpu8/cache/index3/shared_cpu_list:8

Signed-off-by: Gautham R. Shenoy <ego@linux.vnet.ibm.com>
---
 arch/powerpc/kernel/cacheinfo.c | 41 +----------------------------------------
 1 file changed, 1 insertion(+), 40 deletions(-)

diff --git a/arch/powerpc/kernel/cacheinfo.c b/arch/powerpc/kernel/cacheinfo.c
index 5a6925d..20d9169 100644
--- a/arch/powerpc/kernel/cacheinfo.c
+++ b/arch/powerpc/kernel/cacheinfo.c
@@ -675,45 +675,6 @@ static ssize_t level_show(struct kobject *k, struct kobj_attribute *attr, char *
 static struct kobj_attribute cache_level_attr =
 	__ATTR(level, 0444, level_show, NULL);
 
-static unsigned int index_dir_to_cpu(struct cache_index_dir *index)
-{
-	struct kobject *index_dir_kobj = &index->kobj;
-	struct kobject *cache_dir_kobj = index_dir_kobj->parent;
-	struct kobject *cpu_dev_kobj = cache_dir_kobj->parent;
-	struct device *dev = kobj_to_dev(cpu_dev_kobj);
-
-	return dev->id;
-}
-
-/*
- * On big-core systems, each core has two groups of CPUs each of which
- * has its own L1-cache. The thread-siblings which share l1-cache with
- * @cpu can be obtained via cpu_smallcore_mask().
- *
- * On some big-core systems, the L2 cache is shared only between some
- * groups of siblings. This is already parsed and encoded in
- * cpu_l2_cache_mask().
- *
- * TODO: cache_lookup_or_instantiate() needs to be made aware of the
- *       "ibm,thread-groups" property so that cache->shared_cpu_map
- *       reflects the correct siblings on platforms that have this
- *       device-tree property. This helper function is only a stop-gap
- *       solution so that we report the correct siblings to the
- *       userspace via sysfs.
- */
-static const struct cpumask *get_shared_cpu_map(struct cache_index_dir *index, struct cache *cache)
-{
-	if (has_big_cores) {
-		int cpu = index_dir_to_cpu(index);
-		if (cache->level == 1)
-			return cpu_smallcore_mask(cpu);
-		if (cache->level == 2 && thread_group_shares_l2)
-			return cpu_l2_cache_mask(cpu);
-	}
-
-	return &cache->shared_cpu_map;
-}
-
 static ssize_t
 show_shared_cpumap(struct kobject *k, struct kobj_attribute *attr, char *buf, bool list)
 {
@@ -724,7 +685,7 @@ static const struct cpumask *get_shared_cpu_map(struct cache_index_dir *index, s
 	index = kobj_to_cache_index_dir(k);
 	cache = index->cache;
 
-	mask = get_shared_cpu_map(index, cache);
+	mask = &cache->shared_cpu_map;
 
 	return cpumap_print_to_pagebuf(list, buf, mask);
 }
-- 
1.9.4


^ permalink raw reply related

* [PATCH 1/2] powerpc/cacheinfo: Lookup cache by dt node and thread-group id
From: Gautham R. Shenoy @ 2021-01-19  7:36 UTC (permalink / raw)
  To: Michael Ellerman, Nathan Lynch, Srikar Dronamraju
  Cc: Gautham R. Shenoy, linuxppc-dev, linux-kernel
In-Reply-To: <1611041780-8640-1-git-send-email-ego@linux.vnet.ibm.com>

From: "Gautham R. Shenoy" <ego@linux.vnet.ibm.com>

Currently the cacheinfo code on powerpc indexes the "cache" objects
(modelling the L1/L2/L3 caches) where the key is device-tree node
corresponding to that cache. On some of the POWER server platforms
thread-groups within the core share different sets of caches (Eg: On
SMT8 POWER9 systems, threads 0,2,4,6 of a core share L1 cache and
threads 1,3,5,7 of the same core share another L1 cache). On such
platforms, there is a single device-tree node corresponding to that
cache and the cache-configuration within the threads of the core is
indicated via "ibm,thread-groups" device-tree property.

Since the current code is not aware of the "ibm,thread-groups"
property, on the aforementoined systems, cacheinfo code still treats
all the threads in the core to be sharing the cache because of the
single device-tree node (In the earlier example, the cacheinfo code
would says CPUs 0-7 share L1 cache).

In this patch, we make the powerpc cacheinfo code aware of the
"ibm,thread-groups" property. We indexe the "cache" objects by the
key-pair (device-tree node, thread-group id). For any CPUX, for a
given level of cache, the thread-group id is defined to be the first
CPU in the "ibm,thread-groups" cache-group containing CPUX. For levels
of cache which are not represented in "ibm,thread-groups" property,
the thread-group id is -1.

Signed-off-by: Gautham R. Shenoy <ego@linux.vnet.ibm.com>
---
 arch/powerpc/include/asm/smp.h  |  3 ++
 arch/powerpc/kernel/cacheinfo.c | 80 +++++++++++++++++++++++++++++------------
 2 files changed, 61 insertions(+), 22 deletions(-)

diff --git a/arch/powerpc/include/asm/smp.h b/arch/powerpc/include/asm/smp.h
index c4e2d53..39de24b 100644
--- a/arch/powerpc/include/asm/smp.h
+++ b/arch/powerpc/include/asm/smp.h
@@ -32,6 +32,9 @@
 
 extern int cpu_to_chip_id(int cpu);
 
+DECLARE_PER_CPU(cpumask_var_t, thread_group_l1_cache_map);
+DECLARE_PER_CPU(cpumask_var_t, thread_group_l2_cache_map);
+
 #ifdef CONFIG_SMP
 
 struct smp_ops_t {
diff --git a/arch/powerpc/kernel/cacheinfo.c b/arch/powerpc/kernel/cacheinfo.c
index 6f903e9a..5a6925d 100644
--- a/arch/powerpc/kernel/cacheinfo.c
+++ b/arch/powerpc/kernel/cacheinfo.c
@@ -120,6 +120,7 @@ struct cache {
 	struct cpumask shared_cpu_map; /* online CPUs using this cache */
 	int type;                      /* split cache disambiguation */
 	int level;                     /* level not explicit in device tree */
+	int group_id;                  /* id of the group of threads that share this cache */
 	struct list_head list;         /* global list of cache objects */
 	struct cache *next_local;      /* next cache of >= level */
 };
@@ -142,22 +143,24 @@ static const char *cache_type_string(const struct cache *cache)
 }
 
 static void cache_init(struct cache *cache, int type, int level,
-		       struct device_node *ofnode)
+		       struct device_node *ofnode, int group_id)
 {
 	cache->type = type;
 	cache->level = level;
 	cache->ofnode = of_node_get(ofnode);
+	cache->group_id = group_id;
 	INIT_LIST_HEAD(&cache->list);
 	list_add(&cache->list, &cache_list);
 }
 
-static struct cache *new_cache(int type, int level, struct device_node *ofnode)
+static struct cache *new_cache(int type, int level,
+			       struct device_node *ofnode, int group_id)
 {
 	struct cache *cache;
 
 	cache = kzalloc(sizeof(*cache), GFP_KERNEL);
 	if (cache)
-		cache_init(cache, type, level, ofnode);
+		cache_init(cache, type, level, ofnode, group_id);
 
 	return cache;
 }
@@ -309,20 +312,24 @@ static struct cache *cache_find_first_sibling(struct cache *cache)
 		return cache;
 
 	list_for_each_entry(iter, &cache_list, list)
-		if (iter->ofnode == cache->ofnode && iter->next_local == cache)
+		if (iter->ofnode == cache->ofnode &&
+		    iter->group_id == cache->group_id &&
+		    iter->next_local == cache)
 			return iter;
 
 	return cache;
 }
 
-/* return the first cache on a local list matching node */
-static struct cache *cache_lookup_by_node(const struct device_node *node)
+/* return the first cache on a local list matching node and thread-group id */
+static struct cache *cache_lookup_by_node_group(const struct device_node *node,
+						int group_id)
 {
 	struct cache *cache = NULL;
 	struct cache *iter;
 
 	list_for_each_entry(iter, &cache_list, list) {
-		if (iter->ofnode != node)
+		if (iter->ofnode != node ||
+		    iter->group_id != group_id)
 			continue;
 		cache = cache_find_first_sibling(iter);
 		break;
@@ -352,14 +359,15 @@ static int cache_is_unified_d(const struct device_node *np)
 		CACHE_TYPE_UNIFIED_D : CACHE_TYPE_UNIFIED;
 }
 
-static struct cache *cache_do_one_devnode_unified(struct device_node *node, int level)
+static struct cache *cache_do_one_devnode_unified(struct device_node *node, int group_id,
+						  int level)
 {
 	pr_debug("creating L%d ucache for %pOFP\n", level, node);
 
-	return new_cache(cache_is_unified_d(node), level, node);
+	return new_cache(cache_is_unified_d(node), level, node, group_id);
 }
 
-static struct cache *cache_do_one_devnode_split(struct device_node *node,
+static struct cache *cache_do_one_devnode_split(struct device_node *node, int group_id,
 						int level)
 {
 	struct cache *dcache, *icache;
@@ -367,8 +375,8 @@ static struct cache *cache_do_one_devnode_split(struct device_node *node,
 	pr_debug("creating L%d dcache and icache for %pOFP\n", level,
 		 node);
 
-	dcache = new_cache(CACHE_TYPE_DATA, level, node);
-	icache = new_cache(CACHE_TYPE_INSTRUCTION, level, node);
+	dcache = new_cache(CACHE_TYPE_DATA, level, node, group_id);
+	icache = new_cache(CACHE_TYPE_INSTRUCTION, level, node, group_id);
 
 	if (!dcache || !icache)
 		goto err;
@@ -382,31 +390,32 @@ static struct cache *cache_do_one_devnode_split(struct device_node *node,
 	return NULL;
 }
 
-static struct cache *cache_do_one_devnode(struct device_node *node, int level)
+static struct cache *cache_do_one_devnode(struct device_node *node, int group_id, int level)
 {
 	struct cache *cache;
 
 	if (cache_node_is_unified(node))
-		cache = cache_do_one_devnode_unified(node, level);
+		cache = cache_do_one_devnode_unified(node, group_id, level);
 	else
-		cache = cache_do_one_devnode_split(node, level);
+		cache = cache_do_one_devnode_split(node, group_id, level);
 
 	return cache;
 }
 
 static struct cache *cache_lookup_or_instantiate(struct device_node *node,
+						 int group_id,
 						 int level)
 {
 	struct cache *cache;
 
-	cache = cache_lookup_by_node(node);
+	cache = cache_lookup_by_node_group(node, group_id);
 
 	WARN_ONCE(cache && cache->level != level,
 		  "cache level mismatch on lookup (got %d, expected %d)\n",
 		  cache->level, level);
 
 	if (!cache)
-		cache = cache_do_one_devnode(node, level);
+		cache = cache_do_one_devnode(node, group_id, level);
 
 	return cache;
 }
@@ -443,7 +452,27 @@ static void do_subsidiary_caches_debugcheck(struct cache *cache)
 		  of_node_get_device_type(cache->ofnode));
 }
 
-static void do_subsidiary_caches(struct cache *cache)
+/*
+ * If sub-groups of threads in a core containing @cpu_id share the
+ * L@level-cache (information obtained via "ibm,thread-groups"
+ * device-tree property), then we identify the group by the first
+ * thread-sibling in the group. We define this to be the group-id.
+ *
+ * In the absence of any thread-group information for L@level-cache,
+ * this function returns -1.
+ */
+static int get_group_id(unsigned int cpu_id, int level)
+{
+	if (has_big_cores && level == 1)
+		return cpumask_first(per_cpu(thread_group_l1_cache_map,
+					     cpu_id));
+	else if (thread_group_shares_l2 && level == 2)
+		return cpumask_first(per_cpu(thread_group_l2_cache_map,
+					     cpu_id));
+	return -1;
+}
+
+static void do_subsidiary_caches(struct cache *cache, unsigned int cpu_id)
 {
 	struct device_node *subcache_node;
 	int level = cache->level;
@@ -452,9 +481,11 @@ static void do_subsidiary_caches(struct cache *cache)
 
 	while ((subcache_node = of_find_next_cache_node(cache->ofnode))) {
 		struct cache *subcache;
+		int group_id;
 
 		level++;
-		subcache = cache_lookup_or_instantiate(subcache_node, level);
+		group_id = get_group_id(cpu_id, level);
+		subcache = cache_lookup_or_instantiate(subcache_node, group_id, level);
 		of_node_put(subcache_node);
 		if (!subcache)
 			break;
@@ -468,6 +499,7 @@ static struct cache *cache_chain_instantiate(unsigned int cpu_id)
 {
 	struct device_node *cpu_node;
 	struct cache *cpu_cache = NULL;
+	int group_id;
 
 	pr_debug("creating cache object(s) for CPU %i\n", cpu_id);
 
@@ -476,11 +508,13 @@ static struct cache *cache_chain_instantiate(unsigned int cpu_id)
 	if (!cpu_node)
 		goto out;
 
-	cpu_cache = cache_lookup_or_instantiate(cpu_node, 1);
+	group_id = get_group_id(cpu_id, 1);
+
+	cpu_cache = cache_lookup_or_instantiate(cpu_node, group_id, 1);
 	if (!cpu_cache)
 		goto out;
 
-	do_subsidiary_caches(cpu_cache);
+	do_subsidiary_caches(cpu_cache, cpu_id);
 
 	cache_cpu_set(cpu_cache, cpu_id);
 out:
@@ -848,13 +882,15 @@ static struct cache *cache_lookup_by_cpu(unsigned int cpu_id)
 {
 	struct device_node *cpu_node;
 	struct cache *cache;
+	int group_id;
 
 	cpu_node = of_get_cpu_node(cpu_id, NULL);
 	WARN_ONCE(!cpu_node, "no OF node found for CPU %i\n", cpu_id);
 	if (!cpu_node)
 		return NULL;
 
-	cache = cache_lookup_by_node(cpu_node);
+	group_id = get_group_id(cpu_id, 1);
+	cache = cache_lookup_by_node_group(cpu_node, group_id);
 	of_node_put(cpu_node);
 
 	return cache;
-- 
1.9.4


^ permalink raw reply related

* Re: [RFC Qemu PATCH v2 1/2] spapr: drc: Add support for async hcalls at the drc level
From: Shivaprasad G Bhat @ 2021-01-19  7:10 UTC (permalink / raw)
  To: David Gibson, Greg Kurz
  Cc: xiaoguangrong.eric, linux-nvdimm, aneesh.kumar, mst, qemu-devel,
	kvm-ppc, shivaprasadbhat, qemu-ppc, bharata, imammedo,
	linuxppc-dev
In-Reply-To: <20201228083800.GN6952@yekko.fritz.box>

Thanks for the comments!


On 12/28/20 2:08 PM, David Gibson wrote:

> On Mon, Dec 21, 2020 at 01:08:53PM +0100, Greg Kurz wrote:
...
>> The overall idea looks good but I think you should consider using
>> a thread pool to implement it. See below.
> I am not convinced, however.  Specifically, attaching this to the DRC
> doesn't make sense to me.  We're adding exactly one DRC related async
> hcall, and I can't really see much call for another one.  We could
> have other async hcalls - indeed we already have one for HPT resizing
> - but attaching this to DRCs doesn't help for those.

The semantics of the hcall made me think, if this is going to be

re-usable for future if implemented at DRC level. Other option

is to move the async-hcall-state/list into the NVDIMMState structure

in include/hw/mem/nvdimm.h and handle it with machine->nvdimms_state

at a global level.


Hope you are okay with using the pool based approach that Greg

suggested.


Please let me know.


Thanks,

Shivaprasad



^ permalink raw reply

* Re: [PATCH] selftests/powerpc: Fix exit status of pkey tests
From: Aneesh Kumar K.V @ 2021-01-19  4:35 UTC (permalink / raw)
  To: Sandipan Das, mpe; +Cc: harish, efuller, linuxppc-dev
In-Reply-To: <20210118093145.10134-1-sandipan@linux.ibm.com>

Sandipan Das <sandipan@linux.ibm.com> writes:

> Since main() does not return a value explicitly, the
> return values from FAIL_IF() conditions are ignored
> and the tests can still pass irrespective of failures.
> This makes sure that we always explicitly return the
> correct test exit status.
>

Reviewed-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>

> Reported-by: Eirik Fuller <efuller@redhat.com>
> Fixes: 1addb6444791 ("selftests/powerpc: Add test for execute-disabled pkeys")
> Fixes: c27f2fd1705a ("selftests/powerpc: Add test for pkey siginfo verification")
> Signed-off-by: Sandipan Das <sandipan@linux.ibm.com>
> ---
>  tools/testing/selftests/powerpc/mm/pkey_exec_prot.c | 2 +-
>  tools/testing/selftests/powerpc/mm/pkey_siginfo.c   | 2 +-
>  2 files changed, 2 insertions(+), 2 deletions(-)
>
> diff --git a/tools/testing/selftests/powerpc/mm/pkey_exec_prot.c b/tools/testing/selftests/powerpc/mm/pkey_exec_prot.c
> index 9e5c7f3f498a..0af4f02669a1 100644
> --- a/tools/testing/selftests/powerpc/mm/pkey_exec_prot.c
> +++ b/tools/testing/selftests/powerpc/mm/pkey_exec_prot.c
> @@ -290,5 +290,5 @@ static int test(void)
>  
>  int main(void)
>  {
> -	test_harness(test, "pkey_exec_prot");
> +	return test_harness(test, "pkey_exec_prot");
>  }
> diff --git a/tools/testing/selftests/powerpc/mm/pkey_siginfo.c b/tools/testing/selftests/powerpc/mm/pkey_siginfo.c
> index 4f815d7c1214..2db76e56d4cb 100644
> --- a/tools/testing/selftests/powerpc/mm/pkey_siginfo.c
> +++ b/tools/testing/selftests/powerpc/mm/pkey_siginfo.c
> @@ -329,5 +329,5 @@ static int test(void)
>  
>  int main(void)
>  {
> -	test_harness(test, "pkey_siginfo");
> +	return test_harness(test, "pkey_siginfo");
>  }
> -- 
> 2.25.1

^ permalink raw reply

* [PATCH] selftests/powerpc: Only test lwm/stmw on big endian
From: Michael Ellerman @ 2021-01-19  4:18 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: msuchanek, lpechacek

Newer binutils (>= 2.36) refuse to assemble lmw/stmw when building in
little endian mode. That breaks compilation of our alignment handler
test:

  /tmp/cco4l14N.s: Assembler messages:
  /tmp/cco4l14N.s:1440: Error: `lmw' invalid when little-endian
  /tmp/cco4l14N.s:1814: Error: `stmw' invalid when little-endian
  make[2]: *** [../../lib.mk:139: /output/kselftest/powerpc/alignment/alignment_handler] Error 1

These tests do pass on little endian machines, as the kernel will
still emulate those instructions even when running little
endian (which is arguably a kernel bug).

But we don't really need to test that case, so ifdef those
instructions out to get the alignment test building again.

Reported-by: Libor Pechacek <lpechacek@suse.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 .../testing/selftests/powerpc/alignment/alignment_handler.c  | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/tools/testing/selftests/powerpc/alignment/alignment_handler.c b/tools/testing/selftests/powerpc/alignment/alignment_handler.c
index cb53a8b777e6..c25cf7cd45e9 100644
--- a/tools/testing/selftests/powerpc/alignment/alignment_handler.c
+++ b/tools/testing/selftests/powerpc/alignment/alignment_handler.c
@@ -443,7 +443,6 @@ int test_alignment_handler_integer(void)
 	LOAD_DFORM_TEST(ldu);
 	LOAD_XFORM_TEST(ldx);
 	LOAD_XFORM_TEST(ldux);
-	LOAD_DFORM_TEST(lmw);
 	STORE_DFORM_TEST(stb);
 	STORE_XFORM_TEST(stbx);
 	STORE_DFORM_TEST(stbu);
@@ -462,7 +461,11 @@ int test_alignment_handler_integer(void)
 	STORE_XFORM_TEST(stdx);
 	STORE_DFORM_TEST(stdu);
 	STORE_XFORM_TEST(stdux);
+
+#ifdef __BIG_ENDIAN__
+	LOAD_DFORM_TEST(lmw);
 	STORE_DFORM_TEST(stmw);
+#endif
 
 	return rc;
 }
-- 
2.25.1


^ permalink raw reply related

* Re: [PATCH v3] powerpc/mce: Remove per cpu variables from MCE handlers
From: Nicholas Piggin @ 2021-01-19  3:58 UTC (permalink / raw)
  To: Ganesh Goudar, linuxppc-dev, mpe; +Cc: mahesh
In-Reply-To: <20210115125845.28224-1-ganeshgr@linux.ibm.com>

Excerpts from Ganesh Goudar's message of January 15, 2021 10:58 pm:
> Access to per-cpu variables requires translation to be enabled on
> pseries machine running in hash mmu mode, Since part of MCE handler
> runs in realmode and part of MCE handling code is shared between ppc
> architectures pseries and powernv, it becomes difficult to manage
> these variables differently on different architectures, So have
> these variables in paca instead of having them as per-cpu variables
> to avoid complications.

Seems okay.

> 
> Maximum recursive depth of MCE is 4, Considering the maximum depth
> allowed reduce the size of event to 10 from 100.

Could you make this a separate patch, with memory saving numbers?
"Delayed" MCEs are not necessarily the same as recursive (several 
sequential MCEs can occur before the first event is processed).
But I agree 100 is pretty overboard (as is 4 recursive MCEs really).

> 
> Signed-off-by: Ganesh Goudar <ganeshgr@linux.ibm.com>
> ---
> v2: Dynamically allocate memory for machine check event info
> 
> v3: Remove check for hash mmu lpar, use memblock_alloc_try_nid
>     to allocate memory.
> ---
>  arch/powerpc/include/asm/mce.h     | 21 ++++++++-
>  arch/powerpc/include/asm/paca.h    |  4 ++
>  arch/powerpc/kernel/mce.c          | 76 +++++++++++++++++-------------
>  arch/powerpc/kernel/setup-common.c |  2 +-
>  4 files changed, 69 insertions(+), 34 deletions(-)
> 
> diff --git a/arch/powerpc/include/asm/mce.h b/arch/powerpc/include/asm/mce.h
> index e6c27ae843dc..8d6e3a7a9f37 100644
> --- a/arch/powerpc/include/asm/mce.h
> +++ b/arch/powerpc/include/asm/mce.h
> @@ -204,7 +204,18 @@ struct mce_error_info {
>  	bool			ignore_event;
>  };
>  
> -#define MAX_MC_EVT	100
> +#define MAX_MC_EVT	10

> +
> +struct mce_info {
> +	int mce_nest_count;
> +	struct machine_check_event mce_event[MAX_MC_EVT];
> +	/* Queue for delayed MCE events. */
> +	int mce_queue_count;
> +	struct machine_check_event mce_event_queue[MAX_MC_EVT];
> +	/* Queue for delayed MCE UE events. */
> +	int mce_ue_count;
> +	struct machine_check_event  mce_ue_event_queue[MAX_MC_EVT];
> +};
>  
>  /* Release flags for get_mce_event() */
>  #define MCE_EVENT_RELEASE	true
> @@ -233,5 +244,13 @@ long __machine_check_early_realmode_p7(struct pt_regs *regs);
>  long __machine_check_early_realmode_p8(struct pt_regs *regs);
>  long __machine_check_early_realmode_p9(struct pt_regs *regs);
>  long __machine_check_early_realmode_p10(struct pt_regs *regs);
> +#define get_mce_info() local_paca->mce_info

I don't think this adds anything. Could you open code it?

Thanks,
Nick

^ permalink raw reply

* Re: [PATCH 1/4] KVM: PPC: Book3S HV: Remove support for running HPT guest on RPT host without mixed mode support
From: Nicholas Piggin @ 2021-01-19  3:27 UTC (permalink / raw)
  To: Fabiano Rosas, kvm-ppc; +Cc: linuxppc-dev
In-Reply-To: <87czy1bsvz.fsf@linux.ibm.com>

Excerpts from Fabiano Rosas's message of January 19, 2021 11:46 am:
> Resending because the previous got spam-filtered:
> 
> Nicholas Piggin <npiggin@gmail.com> writes:
> 
>> This reverts much of commit c01015091a770 ("KVM: PPC: Book3S HV: Run HPT
>> guests on POWER9 radix hosts"), which was required to run HPT guests on
>> RPT hosts on early POWER9 CPUs without support for "mixed mode", which
>> meant the host could not run with MMU on while guests were running.
>>
>> This code has some corner case bugs, e.g., when the guest hits a machine
>> check or HMI the primary locks up waiting for secondaries to switch LPCR
>> to host, which they never do. This could all be fixed in software, but
>> most CPUs in production have mixed mode support, and those that don't
>> are believed to be all in installations that don't use this capability.
>> So simplify things and remove support.
> 
> With this patch in a DD2.1 machine + indep_threads_mode=N +
> disable_radix, QEMU aborts and dumps registers, is that intended?

Yes. That configuration is hanging handling MCEs in the guest with some 
threads waiting forever to synchronize. Paul suggested it was never a
supported configuration so we might just remove it.

> Could we use the 'no_mixing_hpt_and_radix' logic in check_extension to
> advertise only KVM_CAP_PPC_MMU_RADIX to the guest via OV5 so it doesn't
> try to run hash?
> 
> For instance, if I hack QEMU's 'spapr_dt_ov5_platform_support' from
> OV5_MMU_BOTH to OV5_MMU_RADIX_300 then it boots succesfuly, but the
> guest turns into radix, due to this code in prom_init:
> 
> prom_parse_mmu_model:
> 
> case OV5_FEAT(OV5_MMU_RADIX): /* Only Radix */
> 	prom_debug("MMU - radix only\n");
> 	if (prom_radix_disable) {
> 		/*
> 		 * If we __have__ to do radix, we're better off ignoring
> 		 * the command line rather than not booting.
> 		 */
> 		prom_printf("WARNING: Ignoring cmdline option disable_radix\n");
> 	}
> 	support->radix_mmu = true;
> 	break;
> 
> It seems we could explicitly say that the host does not support hash and
> that would align with the above code.

I'm not sure, sounds like you could, on the other hand these aborts seem 
like the prefered failure mode for these kinds of configuration issues, 
I don't know what the policy is, is reverting back to radix acceptable?

Thanks,
Nick


^ permalink raw reply

* Re: [PATCH v3 1/8] powerpc/uaccess: Add unsafe_copy_from_user
From: Michael Ellerman @ 2021-01-19  2:11 UTC (permalink / raw)
  To: Christopher M. Riedl, Christophe Leroy, linuxppc-dev
In-Reply-To: <C8LLSM3UC598.L4E2VMGJOI06@geist>

"Christopher M. Riedl" <cmr@codefail.de> writes:
> On Mon Jan 11, 2021 at 7:22 AM CST, Christophe Leroy wrote:
>> Le 09/01/2021 à 04:25, Christopher M. Riedl a écrit :
>> > Implement raw_copy_from_user_allowed() which assumes that userspace read
>> > access is open. Use this new function to implement raw_copy_from_user().
>> > Finally, wrap the new function to follow the usual "unsafe_" convention
>> > of taking a label argument.
>>
>> I think there is no point implementing raw_copy_from_user_allowed(), see
>> https://github.com/linuxppc/linux/commit/4b842e4e25b1 and
>> https://patchwork.ozlabs.org/project/linuxppc-dev/patch/8c74fc9ce8131cabb10b3e95dc0e430f396ee83e.1610369143.git.christophe.leroy@csgroup.eu/
>>
>> You should simply do:
>>
>> #define unsafe_copy_from_user(d, s, l, e) \
>> unsafe_op_wrap(__copy_tofrom_user((__force void __user *)d, s, l), e)
>>
>
> I gave this a try and the signal ops decreased by ~8K. Now, to be
> honest, I am not sure what an "acceptable" benchmark number here
> actually is - so maybe this is ok? Same loss with both radix and hash:
>
> 	|                                      | hash   | radix  |
> 	| ------------------------------------ | ------ | ------ |
> 	| linuxppc/next                        | 118693 | 133296 |
> 	| linuxppc/next w/o KUAP+KUEP          | 228911 | 228654 |
> 	| unsafe-signal64                      | 200480 | 234067 |
> 	| unsafe-signal64 (__copy_tofrom_user) | 192467 | 225119 |
>
> To put this into perspective, prior to KUAP and uaccess flush, signal
> performance in this benchmark was ~290K on hash.

If I'm doing the math right 8K is ~4% of the best number.

It seems like 4% is worth a few lines of code to handle these constant
sizes. It's not like we have performance to throw away.

Or, we should chase down where the call sites are that are doing small
constant copies with copy_to/from_user() and change them to use
get/put_user().

cheers

^ permalink raw reply

* Re: [PATCH 1/4] KVM: PPC: Book3S HV: Remove support for running HPT guest on RPT host without mixed mode support
From: Fabiano Rosas @ 2021-01-19  1:46 UTC (permalink / raw)
  To: Nicholas Piggin, kvm-ppc; +Cc: linuxppc-dev, Nicholas Piggin
In-Reply-To: <20210118062809.1430920-2-npiggin@gmail.com>

Resending because the previous got spam-filtered:

Nicholas Piggin <npiggin@gmail.com> writes:

> This reverts much of commit c01015091a770 ("KVM: PPC: Book3S HV: Run HPT
> guests on POWER9 radix hosts"), which was required to run HPT guests on
> RPT hosts on early POWER9 CPUs without support for "mixed mode", which
> meant the host could not run with MMU on while guests were running.
>
> This code has some corner case bugs, e.g., when the guest hits a machine
> check or HMI the primary locks up waiting for secondaries to switch LPCR
> to host, which they never do. This could all be fixed in software, but
> most CPUs in production have mixed mode support, and those that don't
> are believed to be all in installations that don't use this capability.
> So simplify things and remove support.

With this patch in a DD2.1 machine + indep_threads_mode=N +
disable_radix, QEMU aborts and dumps registers, is that intended?

Could we use the 'no_mixing_hpt_and_radix' logic in check_extension to
advertise only KVM_CAP_PPC_MMU_RADIX to the guest via OV5 so it doesn't
try to run hash?

For instance, if I hack QEMU's 'spapr_dt_ov5_platform_support' from
OV5_MMU_BOTH to OV5_MMU_RADIX_300 then it boots succesfuly, but the
guest turns into radix, due to this code in prom_init:

prom_parse_mmu_model:

case OV5_FEAT(OV5_MMU_RADIX): /* Only Radix */
	prom_debug("MMU - radix only\n");
	if (prom_radix_disable) {
		/*
		 * If we __have__ to do radix, we're better off ignoring
		 * the command line rather than not booting.
		 */
		prom_printf("WARNING: Ignoring cmdline option disable_radix\n");
	}
	support->radix_mmu = true;
	break;

It seems we could explicitly say that the host does not support hash and
that would align with the above code.

>
> Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
> ---

^ permalink raw reply

* Re: [PATCH 2/4] KVM: PPC: Book3S HV: Fix radix guest SLB side channel
From: Fabiano Rosas @ 2021-01-19  1:39 UTC (permalink / raw)
  To: Nicholas Piggin, kvm-ppc; +Cc: linuxppc-dev, Nicholas Piggin
In-Reply-To: <20210118062809.1430920-3-npiggin@gmail.com>

Nicholas Piggin <npiggin@gmail.com> writes:

> The slbmte instruction is legal in radix mode, including radix guest
> mode. This means radix guests can load the SLB with arbitrary data.
>
> KVM host does not clear the SLB when exiting a guest if it was a
> radix guest, which would allow a rogue radix guest to use the SLB as
> a side channel to communicate with other guests.
>
> Fix this by ensuring the SLB is cleared when coming out of a radix
> guest. Only the first 4 entries are a concern, because radix guests
> always run with LPCR[UPRT]=1, which limits the reach of slbmte. slbia
> is not used (except in a non-performance-critical path) because it
> can clear cached translations.
>
> Signed-off-by: Nicholas Piggin <npiggin@gmail.com>

Reviewed-by: Fabiano Rosas <farosas@linux.ibm.com>

> ---
>  arch/powerpc/kvm/book3s_hv_rmhandlers.S | 39 ++++++++++++++++++++-----
>  1 file changed, 31 insertions(+), 8 deletions(-)
>
> diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
> index d5a9b57ec129..0e1f5bf168a1 100644
> --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
> +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
> @@ -1157,6 +1157,20 @@ EXPORT_SYMBOL_GPL(__kvmhv_vcpu_entry_p9)
>  	mr	r4, r3
>  	b	fast_guest_entry_c
>  guest_exit_short_path:
> +	/*
> +	 * Malicious or buggy radix guests may have inserted SLB entries
> +	 * (only 0..3 because radix always runs with UPRT=1), so these must
> +	 * be cleared here to avoid side-channels. slbmte is used rather
> +	 * than slbia, as it won't clear cached translations.
> +	 */
> +	li	r0,0
> +	slbmte	r0,r0
> +	li	r4,1
> +	slbmte	r0,r4
> +	li	r4,2
> +	slbmte	r0,r4
> +	li	r4,3
> +	slbmte	r0,r4
>
>  	li	r0, KVM_GUEST_MODE_NONE
>  	stb	r0, HSTATE_IN_GUEST(r13)
> @@ -1469,7 +1483,7 @@ guest_exit_cont:		/* r9 = vcpu, r12 = trap, r13 = paca */
>  	lbz	r0, KVM_RADIX(r5)
>  	li	r5, 0
>  	cmpwi	r0, 0
> -	bne	3f			/* for radix, save 0 entries */
> +	bne	0f			/* for radix, save 0 entries */
>  	lwz	r0,VCPU_SLB_NR(r9)	/* number of entries in SLB */
>  	mtctr	r0
>  	li	r6,0
> @@ -1490,12 +1504,9 @@ guest_exit_cont:		/* r9 = vcpu, r12 = trap, r13 = paca */
>  	slbmte	r0,r0
>  	slbia
>  	ptesync
> -3:	stw	r5,VCPU_SLB_MAX(r9)
> +	stw	r5,VCPU_SLB_MAX(r9)
>
>  	/* load host SLB entries */
> -BEGIN_MMU_FTR_SECTION
> -	b	0f
> -END_MMU_FTR_SECTION_IFSET(MMU_FTR_TYPE_RADIX)
>  	ld	r8,PACA_SLBSHADOWPTR(r13)
>
>  	.rept	SLB_NUM_BOLTED
> @@ -1508,7 +1519,17 @@ END_MMU_FTR_SECTION_IFSET(MMU_FTR_TYPE_RADIX)
>  	slbmte	r6,r5
>  1:	addi	r8,r8,16
>  	.endr
> -0:
> +	b	guest_bypass
> +
> +0:	/* Sanitise radix guest SLB, see guest_exit_short_path comment. */
> +	li	r0,0
> +	slbmte	r0,r0
> +	li	r4,1
> +	slbmte	r0,r4
> +	li	r4,2
> +	slbmte	r0,r4
> +	li	r4,3
> +	slbmte	r0,r4
>
>  guest_bypass:
>  	stw	r12, STACK_SLOT_TRAP(r1)
> @@ -3302,12 +3323,14 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300)
>  	mtspr	SPRN_CIABR, r0
>  	mtspr	SPRN_DAWRX0, r0
>
> +	/* Clear hash and radix guest SLB, see guest_exit_short_path comment. */
> +	slbmte	r0, r0
> +	slbia
> +
>  BEGIN_MMU_FTR_SECTION
>  	b	4f
>  END_MMU_FTR_SECTION_IFSET(MMU_FTR_TYPE_RADIX)
>
> -	slbmte	r0, r0
> -	slbia
>  	ptesync
>  	ld	r8, PACA_SLBSHADOWPTR(r13)
>  	.rept	SLB_NUM_BOLTED

^ permalink raw reply

* [PATCH v2] powerpc: always enable queued spinlocks for 64s, disable for others
From: Nicholas Piggin @ 2021-01-18 12:34 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: Nicholas Piggin

Queued spinlocks have shown to have good performance and fairness
properties even on smaller (2 socket) POWER systems. This selects
them automatically for 64s. For other platforms they are de-selected,
the standard spinlock is far simpler and smaller code, and single
chips with a handful of cores is unlikely to show any improvement.

CONFIG_EXPERT still allows this to be changed, e.g., to help debug
performance or correctness issues.

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
Since v1, made the condition simpler as suggested by Christophe.
Verified this works as expected for a bunch of old configs
(e.g., updates old !EXPERT configs to PPC_QUEUED_SPINLOCKS=y if they
were previously =n).

Thanks,
Nick

 arch/powerpc/Kconfig | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 107bb4319e0e..eebb4a8c156c 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -505,18 +505,14 @@ config HOTPLUG_CPU
 	  Say N if you are unsure.
 
 config PPC_QUEUED_SPINLOCKS
-	bool "Queued spinlocks"
+	bool "Queued spinlocks" if EXPERT
 	depends on SMP
+	default PPC_BOOK3S_64
 	help
 	  Say Y here to use queued spinlocks which give better scalability and
 	  fairness on large SMP and NUMA systems without harming single threaded
 	  performance.
 
-	  This option is currently experimental, the code is more complex and
-	  less tested so it defaults to "N" for the moment.
-
-	  If unsure, say "N".
-
 config ARCH_CPU_PROBE_RELEASE
 	def_bool y
 	depends on HOTPLUG_CPU
-- 
2.23.0


^ permalink raw reply related

* [PATCH][next] scsi: ibmvfc: Fix spelling mistake "succeded" -> "succeeded"
From: Colin King @ 2021-01-18 11:13 UTC (permalink / raw)
  To: Tyrel Datwyler, Michael Ellerman, Benjamin Herrenschmidt,
	Paul Mackerras, James E . J . Bottomley, Martin K . Petersen,
	linux-scsi, linuxppc-dev
  Cc: kernel-janitors, linux-kernel

From: Colin Ian King <colin.king@canonical.com>

There is a spelling mistake in a ibmvfc_dbg debug message. Fix it.

Signed-off-by: Colin Ian King <colin.king@canonical.com>
---
 drivers/scsi/ibmvscsi/ibmvfc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/scsi/ibmvscsi/ibmvfc.c b/drivers/scsi/ibmvscsi/ibmvfc.c
index 1db9e04f1ad3..755313b766b9 100644
--- a/drivers/scsi/ibmvscsi/ibmvfc.c
+++ b/drivers/scsi/ibmvscsi/ibmvfc.c
@@ -4808,7 +4808,7 @@ static void ibmvfc_channel_setup_done(struct ibmvfc_event *evt)
 
 	switch (mad_status) {
 	case IBMVFC_MAD_SUCCESS:
-		ibmvfc_dbg(vhost, "Channel Setup succeded\n");
+		ibmvfc_dbg(vhost, "Channel Setup succeeded\n");
 		flags = be32_to_cpu(setup->flags);
 		vhost->do_enquiry = 0;
 		active_queues = be32_to_cpu(setup->num_scsi_subq_channels);
-- 
2.29.2


^ permalink raw reply related

* Re: [PATCH] tools/perf: Fix powerpc gap between kernel end and module start
From: kajoljain @ 2021-01-18 10:21 UTC (permalink / raw)
  To: Jiri Olsa, Athira Rajeev; +Cc: maddy, linuxppc-dev, jolsa, acme
In-Reply-To: <20210112093811.GA1272772@krava>



On 1/12/21 3:08 PM, Jiri Olsa wrote:
> On Mon, Dec 28, 2020 at 09:14:14PM -0500, Athira Rajeev wrote:
> 
> SNIP
> 
>> c000000002799370 b backtrace_flag
>> c000000002799378 B radix_tree_node_cachep
>> c000000002799380 B __bss_stop
>> c0000000027a0000 B _end
>> c008000003890000 t icmp_checkentry      [ip_tables]
>> c008000003890038 t ipt_alloc_initial_table      [ip_tables]
>> c008000003890468 T ipt_do_table [ip_tables]
>> c008000003890de8 T ipt_unregister_table_pre_exit        [ip_tables]
>> ...
>>
>> Perf calls function symbols__fixup_end() which sets the end of symbol
>> to 0xc008000003890000, which is the next address and this is the start
>> address of first module (icmp_checkentry in above) which will make the
>> huge symbol size of 0x80000010f0000.
>>
>> After symbols__fixup_end:
>> symbols__fixup_end: sym->name: _end, sym->start: 0xc0000000027a0000,
>> sym->end: 0xc008000003890000
>>
>> On powerpc, kernel text segment is located at 0xc000000000000000
>> whereas the modules are located at very high memory addresses,
>> 0xc00800000xxxxxxx. Since the gap between end of kernel text segment
>> and beginning of first module's address is high, histogram allocation
>> using calloc fails.
>>
>> Fix this by detecting the kernel's last symbol and limiting
>> the range of last kernel symbol to pagesize.

Patch looks good to me.

Tested-By: Kajol Jain<kjain@linux.ibm.com>

Thanks,
Kajol Jain
>>
>> Signed-off-by: Athira Rajeev<atrajeev@linux.vnet.ibm.com>
> 
> I can't test, but since the same approach works for arm and s390,
> this also looks ok
> 
> Acked-by: Jiri Olsa <jolsa@redhat.com>
> 
> thanks,
> jirka
> 
>> ---
>>  tools/perf/arch/powerpc/util/Build     |  1 +
>>  tools/perf/arch/powerpc/util/machine.c | 24 ++++++++++++++++++++++++
>>  2 files changed, 25 insertions(+)
>>  create mode 100644 tools/perf/arch/powerpc/util/machine.c
>>
>> diff --git a/tools/perf/arch/powerpc/util/Build b/tools/perf/arch/powerpc/util/Build
>> index e86e210bf514..b7945e5a543b 100644
>> --- a/tools/perf/arch/powerpc/util/Build
>> +++ b/tools/perf/arch/powerpc/util/Build
>> @@ -1,4 +1,5 @@
>>  perf-y += header.o
>> +perf-y += machine.o
>>  perf-y += kvm-stat.o
>>  perf-y += perf_regs.o
>>  perf-y += mem-events.o
>> diff --git a/tools/perf/arch/powerpc/util/machine.c b/tools/perf/arch/powerpc/util/machine.c
>> new file mode 100644
>> index 000000000000..c30e5cc88c16
>> --- /dev/null
>> +++ b/tools/perf/arch/powerpc/util/machine.c
>> @@ -0,0 +1,24 @@
>> +// SPDX-License-Identifier: GPL-2.0
>> +
>> +#include <stdio.h>
>> +#include <string.h>
>> +#include <internal/lib.h> // page_size
>> +#include "debug.h"
>> +#include "symbol.h"
>> +
>> +/* On powerpc kernel text segment start at memory addresses, 0xc000000000000000
>> + * whereas the modules are located at very high memory addresses,
>> + * for example 0xc00800000xxxxxxx. The gap between end of kernel text segment
>> + * and beginning of first module's text segment is very high.
>> + * Therefore do not fill this gap and do not assign it to the kernel dso map.
>> + */
>> +
>> +void arch__symbols__fixup_end(struct symbol *p, struct symbol *c)
>> +{
>> +	if (strchr(p->name, '[') == NULL && strchr(c->name, '['))
>> +		/* Limit the range of last kernel symbol */
>> +		p->end += page_size;
>> +	else
>> +		p->end = c->start;
>> +	pr_debug4("%s sym:%s end:%#lx\n", __func__, p->name, p->end);
>> +}
>> -- 
>> 1.8.3.1
>>
> 

^ permalink raw reply

* [PATCH] selftests/powerpc: Fix exit status of pkey tests
From: Sandipan Das @ 2021-01-18  9:31 UTC (permalink / raw)
  To: mpe; +Cc: harish, aneesh.kumar, efuller, linuxppc-dev

Since main() does not return a value explicitly, the
return values from FAIL_IF() conditions are ignored
and the tests can still pass irrespective of failures.
This makes sure that we always explicitly return the
correct test exit status.

Reported-by: Eirik Fuller <efuller@redhat.com>
Fixes: 1addb6444791 ("selftests/powerpc: Add test for execute-disabled pkeys")
Fixes: c27f2fd1705a ("selftests/powerpc: Add test for pkey siginfo verification")
Signed-off-by: Sandipan Das <sandipan@linux.ibm.com>
---
 tools/testing/selftests/powerpc/mm/pkey_exec_prot.c | 2 +-
 tools/testing/selftests/powerpc/mm/pkey_siginfo.c   | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/tools/testing/selftests/powerpc/mm/pkey_exec_prot.c b/tools/testing/selftests/powerpc/mm/pkey_exec_prot.c
index 9e5c7f3f498a..0af4f02669a1 100644
--- a/tools/testing/selftests/powerpc/mm/pkey_exec_prot.c
+++ b/tools/testing/selftests/powerpc/mm/pkey_exec_prot.c
@@ -290,5 +290,5 @@ static int test(void)
 
 int main(void)
 {
-	test_harness(test, "pkey_exec_prot");
+	return test_harness(test, "pkey_exec_prot");
 }
diff --git a/tools/testing/selftests/powerpc/mm/pkey_siginfo.c b/tools/testing/selftests/powerpc/mm/pkey_siginfo.c
index 4f815d7c1214..2db76e56d4cb 100644
--- a/tools/testing/selftests/powerpc/mm/pkey_siginfo.c
+++ b/tools/testing/selftests/powerpc/mm/pkey_siginfo.c
@@ -329,5 +329,5 @@ static int test(void)
 
 int main(void)
 {
-	test_harness(test, "pkey_siginfo");
+	return test_harness(test, "pkey_siginfo");
 }
-- 
2.25.1


^ permalink raw reply related

* [PATCH 4/4] KVM: PPC: Book3S HV: Use POWER9 SLBIA IH=6 variant to clear SLB
From: Nicholas Piggin @ 2021-01-18  6:28 UTC (permalink / raw)
  To: kvm-ppc; +Cc: linuxppc-dev, Nicholas Piggin
In-Reply-To: <20210118062809.1430920-1-npiggin@gmail.com>

IH=6 may preserve hypervisor real-mode ERAT entries and is the
recommended SLBIA hint for switching partitions.

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
 arch/powerpc/kvm/book3s_hv_rmhandlers.S | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
index 9f0fdbae4b44..8cf1f69f442e 100644
--- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
+++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
@@ -898,7 +898,7 @@ BEGIN_MMU_FTR_SECTION
 	/* Radix host won't have populated the SLB, so no need to clear */
 	li	r6, 0
 	slbmte	r6, r6
-	slbia
+	PPC_SLBIA(6)
 	ptesync
 END_MMU_FTR_SECTION_IFCLR(MMU_FTR_TYPE_RADIX)
 
@@ -1506,7 +1506,7 @@ guest_exit_cont:		/* r9 = vcpu, r12 = trap, r13 = paca */
 	/* Finally clear out the SLB */
 	li	r0,0
 	slbmte	r0,r0
-	slbia
+	PPC_SLBIA(6)
 	ptesync
 	stw	r5,VCPU_SLB_MAX(r9)
 
@@ -3329,7 +3329,7 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300)
 
 	/* Clear hash and radix guest SLB, see guest_exit_short_path comment. */
 	slbmte	r0, r0
-	slbia
+	PPC_SLBIA(6)
 
 BEGIN_MMU_FTR_SECTION
 	b	4f
-- 
2.23.0


^ permalink raw reply related

* [PATCH 3/4] KVM: PPC: Book3S HV: No need to clear radix host SLB before loading guest
From: Nicholas Piggin @ 2021-01-18  6:28 UTC (permalink / raw)
  To: kvm-ppc; +Cc: linuxppc-dev, Nicholas Piggin
In-Reply-To: <20210118062809.1430920-1-npiggin@gmail.com>

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
 arch/powerpc/kvm/book3s_hv_rmhandlers.S | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
index 0e1f5bf168a1..9f0fdbae4b44 100644
--- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
+++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
@@ -888,15 +888,19 @@ ALT_FTR_SECTION_END_IFCLR(CPU_FTR_ARCH_300)
 	cmpdi	r3, 512		/* 1 microsecond */
 	blt	hdec_soon
 
-	/* For hash guest, clear out and reload the SLB */
 	ld	r6, VCPU_KVM(r4)
 	lbz	r0, KVM_RADIX(r6)
 	cmpwi	r0, 0
 	bne	9f
+
+	/* For hash guest, clear out and reload the SLB */
+BEGIN_MMU_FTR_SECTION
+	/* Radix host won't have populated the SLB, so no need to clear */
 	li	r6, 0
 	slbmte	r6, r6
 	slbia
 	ptesync
+END_MMU_FTR_SECTION_IFCLR(MMU_FTR_TYPE_RADIX)
 
 	/* Load up guest SLB entries (N.B. slb_max will be 0 for radix) */
 	lwz	r5,VCPU_SLB_MAX(r4)
-- 
2.23.0


^ permalink raw reply related

* [PATCH 2/4] KVM: PPC: Book3S HV: Fix radix guest SLB side channel
From: Nicholas Piggin @ 2021-01-18  6:28 UTC (permalink / raw)
  To: kvm-ppc; +Cc: linuxppc-dev, Nicholas Piggin
In-Reply-To: <20210118062809.1430920-1-npiggin@gmail.com>

The slbmte instruction is legal in radix mode, including radix guest
mode. This means radix guests can load the SLB with arbitrary data.

KVM host does not clear the SLB when exiting a guest if it was a
radix guest, which would allow a rogue radix guest to use the SLB as
a side channel to communicate with other guests.

Fix this by ensuring the SLB is cleared when coming out of a radix
guest. Only the first 4 entries are a concern, because radix guests
always run with LPCR[UPRT]=1, which limits the reach of slbmte. slbia
is not used (except in a non-performance-critical path) because it
can clear cached translations.

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
 arch/powerpc/kvm/book3s_hv_rmhandlers.S | 39 ++++++++++++++++++++-----
 1 file changed, 31 insertions(+), 8 deletions(-)

diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
index d5a9b57ec129..0e1f5bf168a1 100644
--- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
+++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
@@ -1157,6 +1157,20 @@ EXPORT_SYMBOL_GPL(__kvmhv_vcpu_entry_p9)
 	mr	r4, r3
 	b	fast_guest_entry_c
 guest_exit_short_path:
+	/*
+	 * Malicious or buggy radix guests may have inserted SLB entries
+	 * (only 0..3 because radix always runs with UPRT=1), so these must
+	 * be cleared here to avoid side-channels. slbmte is used rather
+	 * than slbia, as it won't clear cached translations.
+	 */
+	li	r0,0
+	slbmte	r0,r0
+	li	r4,1
+	slbmte	r0,r4
+	li	r4,2
+	slbmte	r0,r4
+	li	r4,3
+	slbmte	r0,r4
 
 	li	r0, KVM_GUEST_MODE_NONE
 	stb	r0, HSTATE_IN_GUEST(r13)
@@ -1469,7 +1483,7 @@ guest_exit_cont:		/* r9 = vcpu, r12 = trap, r13 = paca */
 	lbz	r0, KVM_RADIX(r5)
 	li	r5, 0
 	cmpwi	r0, 0
-	bne	3f			/* for radix, save 0 entries */
+	bne	0f			/* for radix, save 0 entries */
 	lwz	r0,VCPU_SLB_NR(r9)	/* number of entries in SLB */
 	mtctr	r0
 	li	r6,0
@@ -1490,12 +1504,9 @@ guest_exit_cont:		/* r9 = vcpu, r12 = trap, r13 = paca */
 	slbmte	r0,r0
 	slbia
 	ptesync
-3:	stw	r5,VCPU_SLB_MAX(r9)
+	stw	r5,VCPU_SLB_MAX(r9)
 
 	/* load host SLB entries */
-BEGIN_MMU_FTR_SECTION
-	b	0f
-END_MMU_FTR_SECTION_IFSET(MMU_FTR_TYPE_RADIX)
 	ld	r8,PACA_SLBSHADOWPTR(r13)
 
 	.rept	SLB_NUM_BOLTED
@@ -1508,7 +1519,17 @@ END_MMU_FTR_SECTION_IFSET(MMU_FTR_TYPE_RADIX)
 	slbmte	r6,r5
 1:	addi	r8,r8,16
 	.endr
-0:
+	b	guest_bypass
+
+0:	/* Sanitise radix guest SLB, see guest_exit_short_path comment. */
+	li	r0,0
+	slbmte	r0,r0
+	li	r4,1
+	slbmte	r0,r4
+	li	r4,2
+	slbmte	r0,r4
+	li	r4,3
+	slbmte	r0,r4
 
 guest_bypass:
 	stw	r12, STACK_SLOT_TRAP(r1)
@@ -3302,12 +3323,14 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300)
 	mtspr	SPRN_CIABR, r0
 	mtspr	SPRN_DAWRX0, r0
 
+	/* Clear hash and radix guest SLB, see guest_exit_short_path comment. */
+	slbmte	r0, r0
+	slbia
+
 BEGIN_MMU_FTR_SECTION
 	b	4f
 END_MMU_FTR_SECTION_IFSET(MMU_FTR_TYPE_RADIX)
 
-	slbmte	r0, r0
-	slbia
 	ptesync
 	ld	r8, PACA_SLBSHADOWPTR(r13)
 	.rept	SLB_NUM_BOLTED
-- 
2.23.0


^ permalink raw reply related

* [PATCH 0/4] a few KVM patches
From: Nicholas Piggin @ 2021-01-18  6:28 UTC (permalink / raw)
  To: kvm-ppc; +Cc: linuxppc-dev, Nicholas Piggin

These patches are unrelated except touching some of the same code.
The first two fix actual guest exploitable issues, the other two try
to tidy up SLB management slightly.

Thanks,
Nick

Nicholas Piggin (4):
  KVM: PPC: Book3S HV: Remove support for running HPT guest on RPT host
    without mixed mode support
  KVM: PPC: Book3S HV: Fix radix guest SLB side channel
  KVM: PPC: Book3S HV: No need to clear radix host SLB before loading
    guest
  KVM: PPC: Book3S HV: Use POWER9 SLBIA IH=6 variant to clear SLB

 arch/powerpc/include/asm/kvm_book3s_asm.h |  11 --
 arch/powerpc/kernel/asm-offsets.c         |   3 -
 arch/powerpc/kvm/book3s_hv.c              |  56 ++--------
 arch/powerpc/kvm/book3s_hv_builtin.c      | 108 +-----------------
 arch/powerpc/kvm/book3s_hv_rmhandlers.S   | 129 ++++++++++------------
 5 files changed, 70 insertions(+), 237 deletions(-)

-- 
2.23.0


^ permalink raw reply

* [PATCH 1/4] KVM: PPC: Book3S HV: Remove support for running HPT guest on RPT host without mixed mode support
From: Nicholas Piggin @ 2021-01-18  6:28 UTC (permalink / raw)
  To: kvm-ppc; +Cc: linuxppc-dev, Nicholas Piggin
In-Reply-To: <20210118062809.1430920-1-npiggin@gmail.com>

This reverts much of commit c01015091a770 ("KVM: PPC: Book3S HV: Run HPT
guests on POWER9 radix hosts"), which was required to run HPT guests on
RPT hosts on early POWER9 CPUs without support for "mixed mode", which
meant the host could not run with MMU on while guests were running.

This code has some corner case bugs, e.g., when the guest hits a machine
check or HMI the primary locks up waiting for secondaries to switch LPCR
to host, which they never do. This could all be fixed in software, but
most CPUs in production have mixed mode support, and those that don't
are believed to be all in installations that don't use this capability.
So simplify things and remove support.

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
 arch/powerpc/include/asm/kvm_book3s_asm.h |  11 ---
 arch/powerpc/kernel/asm-offsets.c         |   3 -
 arch/powerpc/kvm/book3s_hv.c              |  56 +++--------
 arch/powerpc/kvm/book3s_hv_builtin.c      | 108 +---------------------
 arch/powerpc/kvm/book3s_hv_rmhandlers.S   |  80 ++++------------
 5 files changed, 32 insertions(+), 226 deletions(-)

diff --git a/arch/powerpc/include/asm/kvm_book3s_asm.h b/arch/powerpc/include/asm/kvm_book3s_asm.h
index 078f4648ea27..b6d31bff5209 100644
--- a/arch/powerpc/include/asm/kvm_book3s_asm.h
+++ b/arch/powerpc/include/asm/kvm_book3s_asm.h
@@ -74,16 +74,6 @@ struct kvm_split_mode {
 	u8		do_nap;
 	u8		napped[MAX_SMT_THREADS];
 	struct kvmppc_vcore *vc[MAX_SUBCORES];
-	/* Bits for changing lpcr on P9 */
-	unsigned long	lpcr_req;
-	unsigned long	lpidr_req;
-	unsigned long	host_lpcr;
-	u32		do_set;
-	u32		do_restore;
-	union {
-		u32	allphases;
-		u8	phase[4];
-	} lpcr_sync;
 };
 
 /*
@@ -110,7 +100,6 @@ struct kvmppc_host_state {
 	u8 hwthread_state;
 	u8 host_ipi;
 	u8 ptid;		/* thread number within subcore when split */
-	u8 tid;			/* thread number within whole core */
 	u8 fake_suspend;
 	struct kvm_vcpu *kvm_vcpu;
 	struct kvmppc_vcore *kvm_vcore;
diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c
index b12d7c049bfe..489a22cf1a92 100644
--- a/arch/powerpc/kernel/asm-offsets.c
+++ b/arch/powerpc/kernel/asm-offsets.c
@@ -668,7 +668,6 @@ int main(void)
 	HSTATE_FIELD(HSTATE_SAVED_XIRR, saved_xirr);
 	HSTATE_FIELD(HSTATE_HOST_IPI, host_ipi);
 	HSTATE_FIELD(HSTATE_PTID, ptid);
-	HSTATE_FIELD(HSTATE_TID, tid);
 	HSTATE_FIELD(HSTATE_FAKE_SUSPEND, fake_suspend);
 	HSTATE_FIELD(HSTATE_MMCR0, host_mmcr[0]);
 	HSTATE_FIELD(HSTATE_MMCR1, host_mmcr[1]);
@@ -698,8 +697,6 @@ int main(void)
 	OFFSET(KVM_SPLIT_LDBAR, kvm_split_mode, ldbar);
 	OFFSET(KVM_SPLIT_DO_NAP, kvm_split_mode, do_nap);
 	OFFSET(KVM_SPLIT_NAPPED, kvm_split_mode, napped);
-	OFFSET(KVM_SPLIT_DO_SET, kvm_split_mode, do_set);
-	OFFSET(KVM_SPLIT_DO_RESTORE, kvm_split_mode, do_restore);
 #endif /* CONFIG_KVM_BOOK3S_HV_POSSIBLE */
 
 #ifdef CONFIG_PPC_BOOK3S_64
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 6f612d240392..2d8627dbd9f6 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -134,7 +134,7 @@ static inline bool nesting_enabled(struct kvm *kvm)
 }
 
 /* If set, the threads on each CPU core have to be in the same MMU mode */
-static bool no_mixing_hpt_and_radix;
+static bool no_mixing_hpt_and_radix __read_mostly;
 
 static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu);
 
@@ -2862,11 +2862,6 @@ static bool can_dynamic_split(struct kvmppc_vcore *vc, struct core_info *cip)
 	if (one_vm_per_core && vc->kvm != cip->vc[0]->kvm)
 		return false;
 
-	/* Some POWER9 chips require all threads to be in the same MMU mode */
-	if (no_mixing_hpt_and_radix &&
-	    kvm_is_radix(vc->kvm) != kvm_is_radix(cip->vc[0]->kvm))
-		return false;
-
 	if (n_threads < cip->max_subcore_threads)
 		n_threads = cip->max_subcore_threads;
 	if (!subcore_config_ok(cip->n_subcores + 1, n_threads))
@@ -2905,6 +2900,9 @@ static void prepare_threads(struct kvmppc_vcore *vc)
 	for_each_runnable_thread(i, vcpu, vc) {
 		if (signal_pending(vcpu->arch.run_task))
 			vcpu->arch.ret = -EINTR;
+		else if (no_mixing_hpt_and_radix &&
+			 kvm_is_radix(vc->kvm) != radix_enabled())
+			vcpu->arch.ret = -EINVAL;
 		else if (vcpu->arch.vpa.update_pending ||
 			 vcpu->arch.slb_shadow.update_pending ||
 			 vcpu->arch.dtl.update_pending)
@@ -3110,7 +3108,6 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
 	int controlled_threads;
 	int trap;
 	bool is_power8;
-	bool hpt_on_radix;
 
 	/*
 	 * Remove from the list any threads that have a signal pending
@@ -3143,11 +3140,8 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
 	 * this is a HPT guest on a radix host machine where the
 	 * CPU threads may not be in different MMU modes.
 	 */
-	hpt_on_radix = no_mixing_hpt_and_radix && radix_enabled() &&
-		!kvm_is_radix(vc->kvm);
-	if (((controlled_threads > 1) &&
-	     ((vc->num_threads > threads_per_subcore) || !on_primary_thread())) ||
-	    (hpt_on_radix && vc->kvm->arch.threads_indep)) {
+	if ((controlled_threads > 1) &&
+	    ((vc->num_threads > threads_per_subcore) || !on_primary_thread())) {
 		for_each_runnable_thread(i, vcpu, vc) {
 			vcpu->arch.ret = -EBUSY;
 			kvmppc_remove_runnable(vc, vcpu);
@@ -3215,7 +3209,7 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
 	is_power8 = cpu_has_feature(CPU_FTR_ARCH_207S)
 		&& !cpu_has_feature(CPU_FTR_ARCH_300);
 
-	if (split > 1 || hpt_on_radix) {
+	if (split > 1) {
 		sip = &split_info;
 		memset(&split_info, 0, sizeof(split_info));
 		for (sub = 0; sub < core_info.n_subcores; ++sub)
@@ -3237,13 +3231,6 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
 			split_info.subcore_size = subcore_size;
 		} else {
 			split_info.subcore_size = 1;
-			if (hpt_on_radix) {
-				/* Use the split_info for LPCR/LPIDR changes */
-				split_info.lpcr_req = vc->lpcr;
-				split_info.lpidr_req = vc->kvm->arch.lpid;
-				split_info.host_lpcr = vc->kvm->arch.host_lpcr;
-				split_info.do_set = 1;
-			}
 		}
 
 		/* order writes to split_info before kvm_split_mode pointer */
@@ -3253,7 +3240,6 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
 	for (thr = 0; thr < controlled_threads; ++thr) {
 		struct paca_struct *paca = paca_ptrs[pcpu + thr];
 
-		paca->kvm_hstate.tid = thr;
 		paca->kvm_hstate.napping = 0;
 		paca->kvm_hstate.kvm_split_mode = sip;
 	}
@@ -3327,10 +3313,8 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
 	 * When doing micro-threading, poke the inactive threads as well.
 	 * This gets them to the nap instruction after kvm_do_nap,
 	 * which reduces the time taken to unsplit later.
-	 * For POWER9 HPT guest on radix host, we need all the secondary
-	 * threads woken up so they can do the LPCR/LPIDR change.
 	 */
-	if (cmd_bit || hpt_on_radix) {
+	if (cmd_bit) {
 		split_info.do_nap = 1;	/* ask secondaries to nap when done */
 		for (thr = 1; thr < threads_per_subcore; ++thr)
 			if (!(active & (1 << thr)))
@@ -3391,19 +3375,8 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
 			cpu_relax();
 			++loops;
 		}
-	} else if (hpt_on_radix) {
-		/* Wait for all threads to have seen final sync */
-		for (thr = 1; thr < controlled_threads; ++thr) {
-			struct paca_struct *paca = paca_ptrs[pcpu + thr];
-
-			while (paca->kvm_hstate.kvm_split_mode) {
-				HMT_low();
-				barrier();
-			}
-			HMT_medium();
-		}
+		split_info.do_nap = 0;
 	}
-	split_info.do_nap = 0;
 
 	kvmppc_set_host_core(pcpu);
 
@@ -4173,7 +4146,6 @@ int kvmhv_run_single_vcpu(struct kvm_vcpu *vcpu, u64 time_limit,
 
 	kvmppc_clear_host_core(pcpu);
 
-	local_paca->kvm_hstate.tid = 0;
 	local_paca->kvm_hstate.napping = 0;
 	local_paca->kvm_hstate.kvm_split_mode = NULL;
 	kvmppc_start_thread(vcpu, vc);
@@ -4358,15 +4330,11 @@ static int kvmppc_vcpu_run_hv(struct kvm_vcpu *vcpu)
 
 	do {
 		/*
-		 * The early POWER9 chips that can't mix radix and HPT threads
-		 * on the same core also need the workaround for the problem
-		 * where the TLB would prefetch entries in the guest exit path
-		 * for radix guests using the guest PIDR value and LPID 0.
-		 * The workaround is in the old path (kvmppc_run_vcpu())
-		 * but not the new path (kvmhv_run_single_vcpu()).
+		 * The TLB prefetch bug fixup is only in the kvmppc_run_vcpu
+		 * path, which also handles hash and dependent threads mode.
 		 */
 		if (kvm->arch.threads_indep && kvm_is_radix(kvm) &&
-		    !no_mixing_hpt_and_radix)
+		    !cpu_has_feature(CPU_FTR_P9_RADIX_PREFETCH_BUG))
 			r = kvmhv_run_single_vcpu(vcpu, ~(u64)0,
 						  vcpu->arch.vcore->lpcr);
 		else
diff --git a/arch/powerpc/kvm/book3s_hv_builtin.c b/arch/powerpc/kvm/book3s_hv_builtin.c
index 8053efdf7ea7..f3d3183249fe 100644
--- a/arch/powerpc/kvm/book3s_hv_builtin.c
+++ b/arch/powerpc/kvm/book3s_hv_builtin.c
@@ -277,8 +277,7 @@ void kvmhv_commence_exit(int trap)
 	struct kvmppc_vcore *vc = local_paca->kvm_hstate.kvm_vcore;
 	int ptid = local_paca->kvm_hstate.ptid;
 	struct kvm_split_mode *sip = local_paca->kvm_hstate.kvm_split_mode;
-	int me, ee, i, t;
-	int cpu0;
+	int me, ee, i;
 
 	/* Set our bit in the threads-exiting-guest map in the 0xff00
 	   bits of vcore->entry_exit_map */
@@ -320,22 +319,6 @@ void kvmhv_commence_exit(int trap)
 		if ((ee >> 8) == 0)
 			kvmhv_interrupt_vcore(vc, ee);
 	}
-
-	/*
-	 * On POWER9 when running a HPT guest on a radix host (sip != NULL),
-	 * we have to interrupt inactive CPU threads to get them to
-	 * restore the host LPCR value.
-	 */
-	if (sip->lpcr_req) {
-		if (cmpxchg(&sip->do_restore, 0, 1) == 0) {
-			vc = local_paca->kvm_hstate.kvm_vcore;
-			cpu0 = vc->pcpu + ptid - local_paca->kvm_hstate.tid;
-			for (t = 1; t < threads_per_core; ++t) {
-				if (sip->napped[t])
-					kvmhv_rm_send_ipi(cpu0 + t);
-			}
-		}
-	}
 }
 
 struct kvmppc_host_rm_ops *kvmppc_host_rm_ops_hv;
@@ -667,95 +650,6 @@ void kvmppc_bad_interrupt(struct pt_regs *regs)
 	panic("Bad KVM trap");
 }
 
-/*
- * Functions used to switch LPCR HR and UPRT bits on all threads
- * when entering and exiting HPT guests on a radix host.
- */
-
-#define PHASE_REALMODE		1	/* in real mode */
-#define PHASE_SET_LPCR		2	/* have set LPCR */
-#define PHASE_OUT_OF_GUEST	4	/* have finished executing in guest */
-#define PHASE_RESET_LPCR	8	/* have reset LPCR to host value */
-
-#define ALL(p)		(((p) << 24) | ((p) << 16) | ((p) << 8) | (p))
-
-static void wait_for_sync(struct kvm_split_mode *sip, int phase)
-{
-	int thr = local_paca->kvm_hstate.tid;
-
-	sip->lpcr_sync.phase[thr] |= phase;
-	phase = ALL(phase);
-	while ((sip->lpcr_sync.allphases & phase) != phase) {
-		HMT_low();
-		barrier();
-	}
-	HMT_medium();
-}
-
-void kvmhv_p9_set_lpcr(struct kvm_split_mode *sip)
-{
-	int num_sets;
-	unsigned long rb, set;
-
-	/* wait for every other thread to get to real mode */
-	wait_for_sync(sip, PHASE_REALMODE);
-
-	/* Set LPCR and LPIDR */
-	mtspr(SPRN_LPCR, sip->lpcr_req);
-	mtspr(SPRN_LPID, sip->lpidr_req);
-	isync();
-
-	/*
-	 * P10 will flush all the congruence class with a single tlbiel
-	 */
-	if (cpu_has_feature(CPU_FTR_ARCH_31))
-		num_sets =  1;
-	else
-		num_sets = POWER9_TLB_SETS_RADIX;
-
-	/* Invalidate the TLB on thread 0 */
-	if (local_paca->kvm_hstate.tid == 0) {
-		sip->do_set = 0;
-		asm volatile("ptesync" : : : "memory");
-		for (set = 0; set < num_sets; ++set) {
-			rb = TLBIEL_INVAL_SET_LPID +
-				(set << TLBIEL_INVAL_SET_SHIFT);
-			asm volatile(PPC_TLBIEL(%0, %1, 0, 0, 0) : :
-				     "r" (rb), "r" (0));
-		}
-		asm volatile("ptesync" : : : "memory");
-	}
-
-	/* indicate that we have done so and wait for others */
-	wait_for_sync(sip, PHASE_SET_LPCR);
-	/* order read of sip->lpcr_sync.allphases vs. sip->do_set */
-	smp_rmb();
-}
-
-/*
- * Called when a thread that has been in the guest needs
- * to reload the host LPCR value - but only on POWER9 when
- * running a HPT guest on a radix host.
- */
-void kvmhv_p9_restore_lpcr(struct kvm_split_mode *sip)
-{
-	/* we're out of the guest... */
-	wait_for_sync(sip, PHASE_OUT_OF_GUEST);
-
-	mtspr(SPRN_LPID, 0);
-	mtspr(SPRN_LPCR, sip->host_lpcr);
-	isync();
-
-	if (local_paca->kvm_hstate.tid == 0) {
-		sip->do_restore = 0;
-		smp_wmb();	/* order store of do_restore vs. phase */
-	}
-
-	wait_for_sync(sip, PHASE_RESET_LPCR);
-	smp_mb();
-	local_paca->kvm_hstate.kvm_split_mode = NULL;
-}
-
 static void kvmppc_end_cede(struct kvm_vcpu *vcpu)
 {
 	vcpu->arch.ceded = 0;
diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
index cd9995ee8441..d5a9b57ec129 100644
--- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
+++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
@@ -85,19 +85,6 @@ _GLOBAL_TOC(kvmppc_hv_entry_trampoline)
 	RFI_TO_KERNEL
 
 kvmppc_call_hv_entry:
-BEGIN_FTR_SECTION
-	/* On P9, do LPCR setting, if necessary */
-	ld	r3, HSTATE_SPLIT_MODE(r13)
-	cmpdi	r3, 0
-	beq	46f
-	lwz	r4, KVM_SPLIT_DO_SET(r3)
-	cmpwi	r4, 0
-	beq	46f
-	bl	kvmhv_p9_set_lpcr
-	nop
-46:
-END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
-
 	ld	r4, HSTATE_KVM_VCPU(r13)
 	bl	kvmppc_hv_entry
 
@@ -361,11 +348,11 @@ kvm_secondary_got_guest:
 	LOAD_REG_ADDR(r6, decrementer_max)
 	ld	r6, 0(r6)
 	mtspr	SPRN_HDEC, r6
+BEGIN_FTR_SECTION
 	/* and set per-LPAR registers, if doing dynamic micro-threading */
 	ld	r6, HSTATE_SPLIT_MODE(r13)
 	cmpdi	r6, 0
 	beq	63f
-BEGIN_FTR_SECTION
 	ld	r0, KVM_SPLIT_RPR(r6)
 	mtspr	SPRN_RPR, r0
 	ld	r0, KVM_SPLIT_PMMAR(r6)
@@ -373,16 +360,7 @@ BEGIN_FTR_SECTION
 	ld	r0, KVM_SPLIT_LDBAR(r6)
 	mtspr	SPRN_LDBAR, r0
 	isync
-FTR_SECTION_ELSE
-	/* On P9 we use the split_info for coordinating LPCR changes */
-	lwz	r4, KVM_SPLIT_DO_SET(r6)
-	cmpwi	r4, 0
-	beq	1f
-	mr	r3, r6
-	bl	kvmhv_p9_set_lpcr
-	nop
-1:
-ALT_FTR_SECTION_END_IFCLR(CPU_FTR_ARCH_300)
+END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300)
 63:
 	/* Order load of vcpu after load of vcore */
 	lwsync
@@ -452,19 +430,15 @@ kvm_no_guest:
 	mtcr	r5
 	blr
 
-53:	HMT_LOW
+53:
+BEGIN_FTR_SECTION
+	HMT_LOW
 	ld	r5, HSTATE_KVM_VCORE(r13)
 	cmpdi	r5, 0
 	bne	60f
 	ld	r3, HSTATE_SPLIT_MODE(r13)
 	cmpdi	r3, 0
 	beq	kvm_no_guest
-	lwz	r0, KVM_SPLIT_DO_SET(r3)
-	cmpwi	r0, 0
-	bne	kvmhv_do_set
-	lwz	r0, KVM_SPLIT_DO_RESTORE(r3)
-	cmpwi	r0, 0
-	bne	kvmhv_do_restore
 	lbz	r0, KVM_SPLIT_DO_NAP(r3)
 	cmpwi	r0, 0
 	beq	kvm_no_guest
@@ -472,24 +446,19 @@ kvm_no_guest:
 	b	kvm_unsplit_nap
 60:	HMT_MEDIUM
 	b	kvm_secondary_got_guest
+FTR_SECTION_ELSE
+	HMT_LOW
+	ld	r5, HSTATE_KVM_VCORE(r13)
+	cmpdi	r5, 0
+	beq	kvm_no_guest
+	HMT_MEDIUM
+	b	kvm_secondary_got_guest
+ALT_FTR_SECTION_END_IFCLR(CPU_FTR_ARCH_300)
 
 54:	li	r0, KVM_HWTHREAD_IN_KVM
 	stb	r0, HSTATE_HWTHREAD_STATE(r13)
 	b	kvm_no_guest
 
-kvmhv_do_set:
-	/* Set LPCR, LPIDR etc. on P9 */
-	HMT_MEDIUM
-	bl	kvmhv_p9_set_lpcr
-	nop
-	b	kvm_no_guest
-
-kvmhv_do_restore:
-	HMT_MEDIUM
-	bl	kvmhv_p9_restore_lpcr
-	nop
-	b	kvm_no_guest
-
 /*
  * Here the primary thread is trying to return the core to
  * whole-core mode, so we need to nap.
@@ -527,7 +496,8 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
 	/* Set kvm_split_mode.napped[tid] = 1 */
 	ld	r3, HSTATE_SPLIT_MODE(r13)
 	li	r0, 1
-	lbz	r4, HSTATE_TID(r13)
+	lhz	r4, PACAPACAINDEX(r13)
+	clrldi	r4, r4, 61	/* micro-threading => P8 => 8 threads/core */
 	addi	r4, r4, KVM_SPLIT_NAPPED
 	stbx	r0, r3, r4
 	/* Check the do_nap flag again after setting napped[] */
@@ -1938,24 +1908,10 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
 19:	lis	r8,0x7fff		/* MAX_INT@h */
 	mtspr	SPRN_HDEC,r8
 
-16:
-BEGIN_FTR_SECTION
-	/* On POWER9 with HPT-on-radix we need to wait for all other threads */
-	ld	r3, HSTATE_SPLIT_MODE(r13)
-	cmpdi	r3, 0
-	beq	47f
-	lwz	r8, KVM_SPLIT_DO_RESTORE(r3)
-	cmpwi	r8, 0
-	beq	47f
-	bl	kvmhv_p9_restore_lpcr
-	nop
-	b	48f
-47:
-END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
-	ld	r8,KVM_HOST_LPCR(r4)
+16:	ld	r8,KVM_HOST_LPCR(r4)
 	mtspr	SPRN_LPCR,r8
 	isync
-48:
+
 #ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING
 	/* Finish timing, if we have a vcpu */
 	ld	r4, HSTATE_KVM_VCPU(r13)
@@ -2779,8 +2735,10 @@ ALT_FTR_SECTION_END_IFSET(CPU_FTR_ARCH_300)
 	beq	kvm_end_cede
 	cmpwi	r0, NAPPING_NOVCPU
 	beq	kvm_novcpu_wakeup
+BEGIN_FTR_SECTION
 	cmpwi	r0, NAPPING_UNSPLIT
 	beq	kvm_unsplit_wakeup
+END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300)
 	twi	31,0,0 /* Nap state must not be zero */
 
 33:	mr	r4, r3
-- 
2.23.0


^ permalink raw reply related

* Re: [PATCH 10/18] arch: powerpc: Stop building and using oprofile
From: Viresh Kumar @ 2021-01-18  4:47 UTC (permalink / raw)
  To: Linus Torvalds, Robert Richter, Michael Ellerman,
	Benjamin Herrenschmidt, Paul Mackerras, Arnd Bergmann,
	Jeremy Kerr
  Cc: Arnd Bergmann, Vincent Guittot, Christoph Hellwig, linux-kernel,
	anmar.oueja, oprofile-list, Alexander Viro, William Cohen,
	linuxppc-dev
In-Reply-To: <fd155a0a1e52650ddc9ec57a1d211fdc777beddb.1610622251.git.viresh.kumar@linaro.org>

On 14-01-21, 17:05, Viresh Kumar wrote:
> The "oprofile" user-space tools don't use the kernel OPROFILE support
> any more, and haven't in a long time. User-space has been converted to
> the perf interfaces.
> 
> This commits stops building oprofile for powerpc and removes any
> reference to it from directories in arch/powerpc/ apart from
> arch/powerpc/oprofile, which will be removed in the next commit (this is
> broken into two commits as the size of the commit became very big, ~5k
> lines).
> 
> Note that the member "oprofile_cpu_type" in "struct cpu_spec" isn't
> removed as it was also used by other parts of the code.
> 
> Suggested-by: Christoph Hellwig <hch@infradead.org>
> Suggested-by: Linus Torvalds <torvalds@linux-foundation.org>
> Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
> ---
>  arch/powerpc/Kconfig                          |   1 -
>  arch/powerpc/Makefile                         |   2 -
>  arch/powerpc/configs/44x/akebono_defconfig    |   1 -
>  arch/powerpc/configs/44x/currituck_defconfig  |   1 -
>  arch/powerpc/configs/44x/fsp2_defconfig       |   1 -
>  arch/powerpc/configs/44x/iss476-smp_defconfig |   1 -
>  arch/powerpc/configs/cell_defconfig           |   1 -
>  arch/powerpc/configs/g5_defconfig             |   1 -
>  arch/powerpc/configs/maple_defconfig          |   1 -
>  arch/powerpc/configs/pasemi_defconfig         |   1 -
>  arch/powerpc/configs/pmac32_defconfig         |   1 -
>  arch/powerpc/configs/powernv_defconfig        |   1 -
>  arch/powerpc/configs/ppc64_defconfig          |   1 -
>  arch/powerpc/configs/ppc64e_defconfig         |   1 -
>  arch/powerpc/configs/ppc6xx_defconfig         |   1 -
>  arch/powerpc/configs/ps3_defconfig            |   1 -
>  arch/powerpc/configs/pseries_defconfig        |   1 -
>  arch/powerpc/include/asm/cputable.h           |  20 ---
>  arch/powerpc/include/asm/oprofile_impl.h      | 135 ------------------
>  arch/powerpc/include/asm/spu.h                |  33 -----
>  arch/powerpc/kernel/cputable.c                |  67 ---------
>  arch/powerpc/kernel/dt_cpu_ftrs.c             |   2 -
>  arch/powerpc/platforms/cell/Kconfig           |   5 -
>  arch/powerpc/platforms/cell/spu_notify.c      |  55 -------

+ this..

diff --git a/arch/powerpc/platforms/cell/Makefile b/arch/powerpc/platforms/cell/Makefile
index 10064a33ca96..7ea6692f67e2 100644
--- a/arch/powerpc/platforms/cell/Makefile
+++ b/arch/powerpc/platforms/cell/Makefile
@@ -19,7 +19,6 @@ spu-priv1-$(CONFIG_PPC_CELL_COMMON)   += spu_priv1_mmio.o
 spu-manage-$(CONFIG_PPC_CELL_COMMON)   += spu_manage.o
 
 obj-$(CONFIG_SPU_BASE)                 += spu_callbacks.o spu_base.o \
-                                          spu_notify.o \
                                           spu_syscalls.o \
                                           $(spu-priv1-y) \
                                           $(spu-manage-y) \

-- 
viresh

^ permalink raw reply related

* Re: [PATCH 5/6] powerpc/rtas: rename RTAS_RMOBUF_MAX to RTAS_USER_REGION_SIZE
From: Alexey Kardashevskiy @ 2021-01-18  4:15 UTC (permalink / raw)
  To: Nathan Lynch, linuxppc-dev; +Cc: tyreld, brking, ajd, aneesh.kumar
In-Reply-To: <87a6taxkgf.fsf@linux.ibm.com>



On 16/01/2021 02:56, Nathan Lynch wrote:
> Alexey Kardashevskiy <aik@ozlabs.ru> writes:
>> On 15/01/2021 09:00, Nathan Lynch wrote:
>>> diff --git a/arch/powerpc/include/asm/rtas.h b/arch/powerpc/include/asm/rtas.h
>>> index 332e1000ca0f..1aa7ab1cbc84 100644
>>> --- a/arch/powerpc/include/asm/rtas.h
>>> +++ b/arch/powerpc/include/asm/rtas.h
>>> @@ -19,8 +19,11 @@
>>>    #define RTAS_UNKNOWN_SERVICE (-1)
>>>    #define RTAS_INSTANTIATE_MAX (1ULL<<30) /* Don't instantiate rtas at/above this value */
>>>    
>>> -/* Buffer size for ppc_rtas system call. */
>>> -#define RTAS_RMOBUF_MAX (64 * 1024)
>>> +/* Work areas shared with RTAS must be 4K, naturally aligned. */
>>
>> Why exactly 4K and not (for example) PAGE_SIZE?
> 
> 4K is a platform requirement and isn't related to Linux's configured
> page size. See the PAPR specification for RTAS functions such as
> ibm,configure-connector, ibm,update-nodes, ibm,update-properties.

Good, since we are documenting things here - add to the comment ("per 
PAPR")?


> There are other calls with work area parameters where alignment isn't
> specified (e.g. ibm,get-system-parameter) but 4KB alignment is a safe
> choice for those.
> 
>>> +#define RTAS_WORK_AREA_SIZE   4096
>>> +
>>> +/* Work areas allocated for user space access. */
>>> +#define RTAS_USER_REGION_SIZE (RTAS_WORK_AREA_SIZE * 16)
>>
>> This is still 64K but no clarity why. There is 16 of something, what
>> is it?
> 
> There are 16 4KB work areas in the region. I can name it
> RTAS_NR_USER_WORK_AREAS or similar.


Why 16? PAPR (then add "per PAPR") or we just like 16 ("should be enough")?


-- 
Alexey

^ permalink raw reply

* Re: [PATCH 6/6] powerpc/rtas: constrain user region allocation to RMA
From: Alexey Kardashevskiy @ 2021-01-18  4:12 UTC (permalink / raw)
  To: Nathan Lynch, linuxppc-dev; +Cc: tyreld, brking, ajd, aneesh.kumar
In-Reply-To: <87czy6xlap.fsf@linux.ibm.com>



On 16/01/2021 02:38, Nathan Lynch wrote:
> Alexey Kardashevskiy <aik@ozlabs.ru> writes:
>> On 15/01/2021 09:00, Nathan Lynch wrote:
>>> Memory locations passed as arguments from the OS to RTAS usually need
>>> to be addressable in 32-bit mode and must reside in the Real Mode
>>> Area. On PAPR guests, the RMA starts at logical address 0 and is the
>>> first logical memory block reported in the LPAR’s device tree.
>>>
>>> On powerpc targets with RTAS, Linux makes available to user space a
>>> region of memory suitable for arguments to be passed to RTAS via
>>> sys_rtas(). This region (rtas_rmo_buf) is allocated via the memblock
>>> API during boot in order to ensure that it satisfies the requirements
>>> described above.
>>>
>>> With radix MMU, the upper limit supplied to the memblock allocation
>>> can exceed the bounds of the first logical memory block, since
>>> ppc64_rma_size is ULONG_MAX and RTAS_INSTANTIATE_MAX is 1GB. (512MB is
>>> a common size of the first memory block according to a small sample of
>>> LPARs I have checked.) This leads to failures when user space invokes
>>> an RTAS function that uses a work area, such as
>>> ibm,configure-connector.
>>>
>>> Alter the determination of the upper limit for rtas_rmo_buf's
>>> allocation to consult the device tree directly, ensuring placement
>>> within the RMA regardless of the MMU in use.
>>
>> Can we tie this with RTAS (which also needs to be in RMA) and simply add
>> extra 64K in prom_instantiate_rtas() and advertise this address
>> (ALIGH_UP(rtas-base + rtas-size, PAGE_SIZE)) to the user space? We do
>> not need this RMO area before that point.
> 
> Can you explain more about what advantage that would bring? I'm not
> seeing it. It's a more significant change than what I've written
> here.


We already allocate space for RTAS and (like RMO) it needs to be in RMA, 
and RMO is useless without RTAS. We can reuse RTAS allocation code for 
RMO like this:

===
diff --git a/arch/powerpc/kernel/prom_init.c 
b/arch/powerpc/kernel/prom_init.c
index e9d4eb6144e1..d9527d3e01d2 100644
--- a/arch/powerpc/kernel/prom_init.c
+++ b/arch/powerpc/kernel/prom_init.c
@@ -1821,7 +1821,8 @@ static void __init prom_instantiate_rtas(void)
         if (size == 0)
                 return;

-       base = alloc_down(size, PAGE_SIZE, 0);
+       /* One page for RTAS, one for RMO */
+       base = alloc_down(size, PAGE_SIZE + PAGE_SIZE, 0);
         if (base == 0)
                 prom_panic("Could not allocate memory for RTAS\n");

diff --git a/arch/powerpc/kernel/rtas.c b/arch/powerpc/kernel/rtas.c
index d126d71ea5bd..885d95cf4ed3 100644
--- a/arch/powerpc/kernel/rtas.c
+++ b/arch/powerpc/kernel/rtas.c
@@ -1186,6 +1186,7 @@ void __init rtas_initialize(void)
         rtas.size = size;
         no_entry = of_property_read_u32(rtas.dev, "linux,rtas-entry", 
&entry);
         rtas.entry = no_entry ? rtas.base : entry;
+       rtas_rmo_buf = rtas.base + PAGE_SIZE;

         /* If RTAS was found, allocate the RMO buffer for it and look for
          * the stop-self token if any
@@ -1196,11 +1197,6 @@ void __init rtas_initialize(void)
                 ibm_suspend_me_token = rtas_token("ibm,suspend-me");
         }
  #endif
-       rtas_rmo_buf = memblock_phys_alloc_range(RTAS_RMOBUF_MAX, PAGE_SIZE,
-                                                0, rtas_region);
-       if (!rtas_rmo_buf)
-               panic("ERROR: RTAS: Failed to allocate %lx bytes below 
%pa\n",
-                     PAGE_SIZE, &rtas_region);
===


May be store in the FDT as "linux,rmo-base" next to "linux,rtas-base", 
for clarity, as sharing symbols between prom and main kernel is a bit 
tricky.

The benefit is that we do not do the same thing   (== find 64K in RMA) 
in 2 different ways and if the RMO allocated my way is broken - we'll 
know it much sooner as RTAS itself will break too.


> Would it interact well with kexec?


Good point. For this, the easiest will be setting rtas-size in the FDT 
to the allocated RTAS space (PAGE_SIZE*2 with the hunk above applied). 
Probably.


>> And probably do the same with per-cpu RTAS argument structures mentioned
>> in the cover letter?
> 
> I don't think so, since those need to be allocated with the pacas and
> limited to the maximum possible CPUs, which is discovered by the kernel
> much later.


The first cell of /proc/device-tree/cpus/ibm,drc-indexes is the number 
of cores, it is there when RTAS is instantiated, we know SMT after 
"ibm,client-architecture-support" (if I remember correctly).


> But maybe I misunderstand what you're suggesting.


Usually it is me missing the bigger picture :)


-- 
Alexey

^ permalink raw reply related

* [powerpc:merge] BUILD SUCCESS 41d8cb7ece7c81e4eb897ed7ec7d3c3d72fd0af4
From: kernel test robot @ 2021-01-17 23:05 UTC (permalink / raw)
  To: Michael Ellerman; +Cc: linuxppc-dev

tree/branch: https://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux.git merge
branch HEAD: 41d8cb7ece7c81e4eb897ed7ec7d3c3d72fd0af4  Automatic merge of 'master' into merge (2021-01-17 21:16)

elapsed time: 758m

configs tested: 112
configs skipped: 2

The following configs have been built successfully.
More configs may be tested in the coming days.

gcc tested configs:
arm                                 defconfig
arm64                            allyesconfig
arm64                               defconfig
arm                              allyesconfig
arm                              allmodconfig
arc                          axs101_defconfig
arc                      axs103_smp_defconfig
mips                     loongson1b_defconfig
c6x                         dsk6455_defconfig
mips                       rbtx49xx_defconfig
sh                           se7343_defconfig
powerpc                    adder875_defconfig
powerpc                 linkstation_defconfig
mips                       capcella_defconfig
powerpc                      katmai_defconfig
h8300                            allyesconfig
powerpc                       holly_defconfig
powerpc                     stx_gp3_defconfig
arm                      integrator_defconfig
mips                  maltasmvp_eva_defconfig
parisc                generic-32bit_defconfig
sh                           sh2007_defconfig
m68k                        m5272c3_defconfig
sh                   rts7751r2dplus_defconfig
powerpc                         ps3_defconfig
arm                        clps711x_defconfig
arc                        vdk_hs38_defconfig
arm                           sama5_defconfig
powerpc                  iss476-smp_defconfig
nds32                               defconfig
sh                         ap325rxa_defconfig
parisc                              defconfig
powerpc                  storcenter_defconfig
xtensa                       common_defconfig
powerpc                  mpc885_ads_defconfig
alpha                            alldefconfig
mips                           rs90_defconfig
arm                          pxa910_defconfig
sh                         ecovec24_defconfig
mips                malta_kvm_guest_defconfig
nios2                            alldefconfig
powerpc                    klondike_defconfig
mips                        nlm_xlr_defconfig
arm                        multi_v7_defconfig
ia64                             allmodconfig
ia64                                defconfig
ia64                             allyesconfig
m68k                             allmodconfig
m68k                                defconfig
m68k                             allyesconfig
nios2                               defconfig
arc                              allyesconfig
nds32                             allnoconfig
c6x                              allyesconfig
nios2                            allyesconfig
csky                                defconfig
alpha                               defconfig
alpha                            allyesconfig
xtensa                           allyesconfig
arc                                 defconfig
sh                               allmodconfig
s390                             allyesconfig
parisc                           allyesconfig
s390                                defconfig
i386                             allyesconfig
sparc                            allyesconfig
sparc                               defconfig
i386                               tinyconfig
i386                                defconfig
mips                             allyesconfig
mips                             allmodconfig
powerpc                          allyesconfig
powerpc                          allmodconfig
powerpc                           allnoconfig
x86_64               randconfig-a006-20210117
x86_64               randconfig-a004-20210117
x86_64               randconfig-a001-20210117
x86_64               randconfig-a005-20210117
x86_64               randconfig-a003-20210117
x86_64               randconfig-a002-20210117
i386                 randconfig-a005-20210117
i386                 randconfig-a006-20210117
i386                 randconfig-a004-20210117
i386                 randconfig-a002-20210117
i386                 randconfig-a003-20210117
i386                 randconfig-a001-20210117
i386                 randconfig-a012-20210117
i386                 randconfig-a011-20210117
i386                 randconfig-a016-20210117
i386                 randconfig-a013-20210117
i386                 randconfig-a015-20210117
i386                 randconfig-a014-20210117
riscv                    nommu_k210_defconfig
riscv                            allyesconfig
riscv                    nommu_virt_defconfig
riscv                             allnoconfig
riscv                               defconfig
riscv                          rv32_defconfig
riscv                            allmodconfig
x86_64                                   rhel
x86_64                           allyesconfig
x86_64                    rhel-7.6-kselftests
x86_64                              defconfig
x86_64                               rhel-8.3
x86_64                      rhel-8.3-kbuiltin
x86_64                                  kexec

clang tested configs:
x86_64               randconfig-a015-20210117
x86_64               randconfig-a016-20210117
x86_64               randconfig-a014-20210117
x86_64               randconfig-a012-20210117
x86_64               randconfig-a013-20210117
x86_64               randconfig-a011-20210117

---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/kbuild-all@lists.01.org

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox