LinuxPPC-Dev Archive on lore.kernel.org

LinuxPPC-Dev Archive on lore.kernel.org
 help / color / mirror / Atom feed

* [PATCH v5 1/2] tty: hvc: pass DMA capable memory to put_chars()
From: Xianting Tian @ 2021-08-12  8:51 UTC (permalink / raw)
  To: gregkh, jirislaby, amit, arnd, osandov
  Cc: Xianting Tian, guoren, linuxppc-dev, linux-kernel, virtualization
In-Reply-To: <20210812085112.145265-1-xianting.tian@linux.alibaba.com>

As well known, hvc backend can register its opertions to hvc backend.
the opertions contain put_chars(), get_chars() and so on.

Some hvc backend may do dma in its opertions. eg, put_chars() of
virtio-console. But in the code of hvc framework, it may pass DMA
incapable memory to put_chars() under a specific configuration, which
is explained in commit c4baad5029(virtio-console: avoid DMA from stack):
1, c[] is on stack,
   hvc_console_print():
	char c[N_OUTBUF] __ALIGNED__;
	cons_ops[index]->put_chars(vtermnos[index], c, i);
2, ch is on stack,
   static void hvc_poll_put_char(,,char ch)
   {
	struct tty_struct *tty = driver->ttys[0];
	struct hvc_struct *hp = tty->driver_data;
	int n;

	do {
		n = hp->ops->put_chars(hp->vtermno, &ch, 1);
	} while (n <= 0);
   }

Commit c4baad5029 is just the fix to avoid DMA from stack memory, which
is passed to virtio-console by hvc framework in above code. But I think
the fix is aggressive, it directly uses kmemdup() to alloc new buffer
from kmalloc area and do memcpy no matter the memory is in kmalloc area
or not. But most importantly, it should better be fixed in the hvc
framework, by changing it to never pass stack memory to the put_chars()
function in the first place. Otherwise, we still face the same issue if
a new hvc backend using dma added in the furture.

We make 'char c[N_OUTBUF]' part of 'struct hvc_struct', so hp->c is no
longer the stack memory. we can use it in above two cases.

Other cleanup is use ARCH_DMA_MINALIGN for align, and make 'hp->outbuf'
aligned and use struct_size() for the size parameter of kzalloc().

With the patch, we can remove the fix c4baad5029.

Signed-off-by: Xianting Tian <xianting.tian@linux.alibaba.com>
Tested-by: Xianting Tian <xianting.tian@linux.alibaba.com>
---
 drivers/tty/hvc/hvc_console.c | 40 +++++++++++++++++++++--------------
 drivers/tty/hvc/hvc_console.h | 16 ++++++++++++--
 2 files changed, 38 insertions(+), 18 deletions(-)

diff --git a/drivers/tty/hvc/hvc_console.c b/drivers/tty/hvc/hvc_console.c
index 5bb8c4e44..c56564eb7 100644
--- a/drivers/tty/hvc/hvc_console.c
+++ b/drivers/tty/hvc/hvc_console.c
@@ -41,16 +41,6 @@
  */
 #define HVC_CLOSE_WAIT (HZ/100) /* 1/10 of a second */
 
-/*
- * These sizes are most efficient for vio, because they are the
- * native transfer size. We could make them selectable in the
- * future to better deal with backends that want other buffer sizes.
- */
-#define N_OUTBUF	16
-#define N_INBUF		16
-
-#define __ALIGNED__ __attribute__((__aligned__(sizeof(long))))
-
 static struct tty_driver *hvc_driver;
 static struct task_struct *hvc_task;
 
@@ -142,6 +132,7 @@ static int hvc_flush(struct hvc_struct *hp)
 static const struct hv_ops *cons_ops[MAX_NR_HVC_CONSOLES];
 static uint32_t vtermnos[MAX_NR_HVC_CONSOLES] =
 	{[0 ... MAX_NR_HVC_CONSOLES - 1] = -1};
+static char *cons_outbuf[MAX_NR_HVC_CONSOLES];
 
 /*
  * Console APIs, NOT TTY.  These APIs are available immediately when
@@ -151,18 +142,23 @@ static uint32_t vtermnos[MAX_NR_HVC_CONSOLES] =
 static void hvc_console_print(struct console *co, const char *b,
 			      unsigned count)
 {
-	char c[N_OUTBUF] __ALIGNED__;
+	char *c;
 	unsigned i = 0, n = 0;
 	int r, donecr = 0, index = co->index;
+	unsigned long flags;
+	struct hvc_struct *hp;
 
 	/* Console access attempt outside of acceptable console range. */
 	if (index >= MAX_NR_HVC_CONSOLES)
 		return;
 
 	/* This console adapter was removed so it is not usable. */
-	if (vtermnos[index] == -1)
+	if (vtermnos[index] == -1 || !cons_outbuf[index])
 		return;
 
+	c = cons_outbuf[index];
+
+	spin_lock_irqsave(&hp->c_lock, flags);
 	while (count > 0 || i > 0) {
 		if (count > 0 && i < sizeof(c)) {
 			if (b[n] == '\n' && !donecr) {
@@ -191,6 +187,7 @@ static void hvc_console_print(struct console *co, const char *b,
 			}
 		}
 	}
+	spin_unlock_irqrestore(&hp->c_lock, flags);
 	hvc_console_flush(cons_ops[index], vtermnos[index]);
 }
 
@@ -878,9 +875,19 @@ static void hvc_poll_put_char(struct tty_driver *driver, int line, char ch)
 	struct tty_struct *tty = driver->ttys[0];
 	struct hvc_struct *hp = tty->driver_data;
 	int n;
+	unsigned long flags;
+	char *c;
+
+	if (!hp || !cons_outbuf[hp->index])
+		return;
+
+	c = cons_outbuf[hp->index];
 
 	do {
-		n = hp->ops->put_chars(hp->vtermno, &ch, 1);
+		spin_lock_irqsave(&hp->c_lock, flags);
+		c[0] = ch;
+		n = hp->ops->put_chars(hp->vtermno, c, 1);
+		spin_unlock_irqrestore(&hp->c_lock, flags);
 	} while (n <= 0);
 }
 #endif
@@ -922,8 +929,7 @@ struct hvc_struct *hvc_alloc(uint32_t vtermno, int data,
 			return ERR_PTR(err);
 	}
 
-	hp = kzalloc(ALIGN(sizeof(*hp), sizeof(long)) + outbuf_size,
-			GFP_KERNEL);
+	hp = kzalloc(struct_size(hp, outbuf, outbuf_size), GFP_KERNEL);
 	if (!hp)
 		return ERR_PTR(-ENOMEM);
 
@@ -931,13 +937,13 @@ struct hvc_struct *hvc_alloc(uint32_t vtermno, int data,
 	hp->data = data;
 	hp->ops = ops;
 	hp->outbuf_size = outbuf_size;
-	hp->outbuf = &((char *)hp)[ALIGN(sizeof(*hp), sizeof(long))];
 
 	tty_port_init(&hp->port);
 	hp->port.ops = &hvc_port_ops;
 
 	INIT_WORK(&hp->tty_resize, hvc_set_winsz);
 	spin_lock_init(&hp->lock);
+	spin_lock_init(&hp->c_lock);
 	mutex_lock(&hvc_structs_mutex);
 
 	/*
@@ -964,6 +970,7 @@ struct hvc_struct *hvc_alloc(uint32_t vtermno, int data,
 	if (i < MAX_NR_HVC_CONSOLES) {
 		cons_ops[i] = ops;
 		vtermnos[i] = vtermno;
+		cons_outbuf[i] = hp->c;
 	}
 
 	list_add_tail(&(hp->next), &hvc_structs);
@@ -988,6 +995,7 @@ int hvc_remove(struct hvc_struct *hp)
 	if (hp->index < MAX_NR_HVC_CONSOLES) {
 		vtermnos[hp->index] = -1;
 		cons_ops[hp->index] = NULL;
+		cons_outbuf[hp->index] = NULL;
 	}
 
 	/* Don't whack hp->irq because tty_hangup() will need to free the irq. */
diff --git a/drivers/tty/hvc/hvc_console.h b/drivers/tty/hvc/hvc_console.h
index 18d005814..52374e2da 100644
--- a/drivers/tty/hvc/hvc_console.h
+++ b/drivers/tty/hvc/hvc_console.h
@@ -32,13 +32,21 @@
  */
 #define HVC_ALLOC_TTY_ADAPTERS	8
 
+/*
+ * These sizes are most efficient for vio, because they are the
+ * native transfer size. We could make them selectable in the
+ * future to better deal with backends that want other buffer sizes.
+ */
+#define N_OUTBUF	16
+#define N_INBUF		16
+
+#define __ALIGNED__ __attribute__((__aligned__(sizeof(long))))
+
 struct hvc_struct {
 	struct tty_port port;
 	spinlock_t lock;
 	int index;
 	int do_wakeup;
-	char *outbuf;
-	int outbuf_size;
 	int n_outbuf;
 	uint32_t vtermno;
 	const struct hv_ops *ops;
@@ -48,6 +56,10 @@ struct hvc_struct {
 	struct work_struct tty_resize;
 	struct list_head next;
 	unsigned long flags;
+	spinlock_t c_lock;
+	char c[N_OUTBUF] __ALIGNED__;
+	int outbuf_size;
+	char outbuf[0] __ALIGNED__;
 };
 
 /* implemented by a low level driver */
-- 
2.17.1


^ permalink raw reply related

* Re: [PATCH v3 4/8] PCI: replace pci_dev::driver usage that gets the driver name
From: Uwe Kleine-König @ 2021-08-12  8:14 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Alexander Duyck, oss-drivers, Paul Mackerras, Herbert Xu,
	Ido Schimmel, Rafa?? Mi??ecki, Jesse Brandeburg, Bjorn Helgaas,
	linux-pci, Jakub Kicinski, Yisen Zhuang, Vadym Kochan,
	Michael Buesch, Jiri Pirko, Salil Mehta, Greg Kroah-Hartman,
	linux-wireless, linux-kernel, Taras Chornyi, Zhou Wang,
	linux-crypto, kernel, netdev, Simon Horman, Oliver O'Halloran,
	linuxppc-dev, David S. Miller
In-Reply-To: <YRTIqGm5Dr8du7a7@infradead.org>

[-- Attachment #1: Type: text/plain, Size: 2565 bytes --]

On Thu, Aug 12, 2021 at 08:07:20AM +0100, Christoph Hellwig wrote:
> On Wed, Aug 11, 2021 at 10:06:33AM +0200, Uwe Kleine-K??nig wrote:
> >  static inline const char *eeh_driver_name(struct pci_dev *pdev)
> >  {
> > -	return (pdev && pdev->driver) ? pdev->driver->name : "<null>";
> > +	const char *drvstr = pdev ? dev_driver_string(&pdev->dev) : "";
> > +
> > +	if (*drvstr == '\0')
> > +		return "<null>";
> > +
> > +	return drvstr;
> 
> This looks rather obsfucated due to the fact that dev_driver_string
> never returns '\0', and due to the strange mix of a tenary operation
> and the if on a related condition.

dev_driver_string() might return "" (via dev_bus_name()). If that happens
*drvstr == '\0' becomes true.

Would the following be better?:

	const char *drvstr;

	if (pdev)
		return "<null>";

	drvstr = dev_driver_string(&pdev->dev);

	if (!strcmp(drvstr, ""))
		return "<null>";

	return drvstr;

When I thought about this hunk I considered it ugly to have "<null>" in
it twice.

> >  }
> >  
> >  #endif /* CONFIG_EEH */
> > diff --git a/drivers/bcma/host_pci.c b/drivers/bcma/host_pci.c
> > index 69c10a7b7c61..dc2ffa686964 100644
> > --- a/drivers/bcma/host_pci.c
> > +++ b/drivers/bcma/host_pci.c
> > @@ -175,9 +175,10 @@ static int bcma_host_pci_probe(struct pci_dev *dev,
> >  	if (err)
> >  		goto err_kfree_bus;
> >  
> > -	name = dev_name(&dev->dev);
> > -	if (dev->driver && dev->driver->name)
> > -		name = dev->driver->name;
> > +	name = dev_driver_string(&dev->dev);
> > +	if (*name == '\0')
> > +		name = dev_name(&dev->dev);
> 
> Where does this '\0' check come from?

The original code is equivalent to

	if (dev->driver && dev->driver->name)
		name = dev->driver->name;
	else:
		name = dev_name(...);

As dev_driver_string() implements something like:

	if (dev->driver && dev->driver->name)
		return dev->driver->name;
	else
		return "";

the change looks fine to me. (One could wonder if it's sensible to fall
back to the device name if the driver has no nice name, but this isn't
new with my change.)

> > +	name = dev_driver_string(&dev->dev);
> > +	if (*name == '\0')
> > +		name = dev_name(&dev->dev);
> > +
> 
> More of this weirdness.

I admit it's not pretty. Would it help to use !strcmp(name, "")
instead of *name == '\0'? Any other constructive suggestion?

Best regards
Uwe

-- 
Pengutronix e.K.                           | Uwe Kleine-König            |
Industrial Linux Solutions                 | https://www.pengutronix.de/ |

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 488 bytes --]

^ permalink raw reply

* Re: [PATCH] powerpc/book3s64/radix: make tlb_single_page_flush_ceiling a debugfs entry
From: Aneesh Kumar K.V @ 2021-08-12  8:13 UTC (permalink / raw)
  To: Michael Ellerman, Christophe Leroy, linuxppc-dev
In-Reply-To: <87o8a3m89i.fsf@mpe.ellerman.id.au>

On 8/12/21 12:58 PM, Michael Ellerman wrote:
> Christophe Leroy <christophe.leroy@csgroup.eu> writes:
>> Le 10/08/2021 à 06:53, Aneesh Kumar K.V a écrit :
>>> Similar to x86/s390 add a debugfs file to tune tlb_single_page_flush_ceiling.
>>>
>>> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
>>> ---
>>>    arch/powerpc/mm/book3s64/radix_tlb.c | 48 ++++++++++++++++++++++++++++
>>>    1 file changed, 48 insertions(+)
>>>
>>> diff --git a/arch/powerpc/mm/book3s64/radix_tlb.c b/arch/powerpc/mm/book3s64/radix_tlb.c
>>> index aefc100d79a7..5cca0fe130e7 100644
>>> --- a/arch/powerpc/mm/book3s64/radix_tlb.c
>>> +++ b/arch/powerpc/mm/book3s64/radix_tlb.c
>>> @@ -17,6 +17,7 @@
> ...
>>> +
>>> +static int __init create_tlb_single_page_flush_ceiling(void)
>>> +{
>>> +	debugfs_create_file("tlb_single_page_flush_ceiling", S_IRUSR | S_IWUSR,
>>> +			    powerpc_debugfs_root, NULL, &fops_tlbflush);
>>
>> Could you just use debugfs_create_u32() instead of re-implementing simple read and write ?
> 
> Yeah AFAICS that should work fine.
> 
> It could probably even be a u16?
> 

I was looking at switching all that to u64. Should i fallback to u16, 
considering a tlb_signle_page_flush_ceiling value larger that 2**16 
doesn't make sense?

-aneesh


^ permalink raw reply

* Re: [PATCH v4 1/2] tty: hvc: pass DMA capable memory to put_chars()
From: Xianting TIan @ 2021-08-12  8:07 UTC (permalink / raw)
  To: Arnd Bergmann
  Cc: Jiri Slaby, Amit Shah, gregkh, Linux Kernel Mailing List,
	open list:DRM DRIVER FOR QEMU'S CIRRUS DEVICE, Guo Ren,
	linuxppc-dev, Omar Sandoval
In-Reply-To: <CAK8P3a2=BmVv0tvUKaca+LYxuAussAJtAJW9O3fRN2CbV2-9aw@mail.gmail.com>


在 2021/8/6 下午10:51, Arnd Bergmann 写道:
> On Fri, Aug 6, 2021 at 5:01 AM Xianting Tian
> <xianting.tian@linux.alibaba.com> wrote:
>> @@ -163,6 +155,13 @@ static void hvc_console_print(struct console *co, const char *b,
>>          if (vtermnos[index] == -1)
>>                  return;
>>
>> +       list_for_each_entry(hp, &hvc_structs, next)
>> +               if (hp->vtermno == vtermnos[index])
>> +                       break;
>> +
>> +       c = hp->c;
>> +
>> +       spin_lock_irqsave(&hp->c_lock, flags);
> The loop looks like it might race against changes to the list. It seems strange
> that the print function has to actually search for the structure here.
>
> It may be better to have yet another array for the buffer pointers next to
> the cons_ops[] and vtermnos[] arrays.
>
>> +/*
>> + * These sizes are most efficient for vio, because they are the
>> + * native transfer size. We could make them selectable in the
>> + * future to better deal with backends that want other buffer sizes.
>> + */
>> +#define N_OUTBUF       16
>> +#define N_INBUF                16
>> +
>> +#define __ALIGNED__ __attribute__((__aligned__(sizeof(long))))
> I think you need a higher alignment for DMA buffers, instead of sizeof(long),
> I would suggest ARCH_DMA_MINALIGN.

As some ARCH(eg, x86, riscv) doesn't define ARCH_DMA_MINALIG, so i think 
it 's better remain the code unchanged,

I will send v5 patch soon.

>
>         Arnd

^ permalink raw reply

* Re: [PATCH] powerpc/book3s64/radix: make tlb_single_page_flush_ceiling a debugfs entry
From: Michael Ellerman @ 2021-08-12  7:28 UTC (permalink / raw)
  To: Christophe Leroy, Aneesh Kumar K.V, linuxppc-dev
In-Reply-To: <22c7c194-d75d-bf90-c8e5-83c995c7130c@csgroup.eu>

Christophe Leroy <christophe.leroy@csgroup.eu> writes:
> Le 10/08/2021 à 06:53, Aneesh Kumar K.V a écrit :
>> Similar to x86/s390 add a debugfs file to tune tlb_single_page_flush_ceiling.
>> 
>> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
>> ---
>>   arch/powerpc/mm/book3s64/radix_tlb.c | 48 ++++++++++++++++++++++++++++
>>   1 file changed, 48 insertions(+)
>> 
>> diff --git a/arch/powerpc/mm/book3s64/radix_tlb.c b/arch/powerpc/mm/book3s64/radix_tlb.c
>> index aefc100d79a7..5cca0fe130e7 100644
>> --- a/arch/powerpc/mm/book3s64/radix_tlb.c
>> +++ b/arch/powerpc/mm/book3s64/radix_tlb.c
>> @@ -17,6 +17,7 @@
...
>> +
>> +static int __init create_tlb_single_page_flush_ceiling(void)
>> +{
>> +	debugfs_create_file("tlb_single_page_flush_ceiling", S_IRUSR | S_IWUSR,
>> +			    powerpc_debugfs_root, NULL, &fops_tlbflush);
>
> Could you just use debugfs_create_u32() instead of re-implementing simple read and write ?

Yeah AFAICS that should work fine.

It could probably even be a u16?

cheers

^ permalink raw reply

* Re: [PATCH v3 4/8] PCI: replace pci_dev::driver usage that gets the driver name
From: Christoph Hellwig @ 2021-08-12  7:07 UTC (permalink / raw)
  To: Uwe Kleine-K??nig
  Cc: linux-pci, Alexander Duyck, oss-drivers, Paul Mackerras,
	Herbert Xu, Rafa?? Mi??ecki, Jesse Brandeburg, Bjorn Helgaas,
	Ido Schimmel, Jakub Kicinski, Yisen Zhuang, Vadym Kochan,
	Michael Buesch, Jiri Pirko, Salil Mehta, Greg Kroah-Hartman,
	linux-wireless, linux-kernel, Taras Chornyi, Zhou Wang,
	linux-crypto, kernel, netdev, Simon Horman, Oliver O'Halloran,
	linuxppc-dev, David S. Miller
In-Reply-To: <20210811080637.2596434-5-u.kleine-koenig@pengutronix.de>

On Wed, Aug 11, 2021 at 10:06:33AM +0200, Uwe Kleine-K??nig wrote:
>  static inline const char *eeh_driver_name(struct pci_dev *pdev)
>  {
> -	return (pdev && pdev->driver) ? pdev->driver->name : "<null>";
> +	const char *drvstr = pdev ? dev_driver_string(&pdev->dev) : "";
> +
> +	if (*drvstr == '\0')
> +		return "<null>";
> +
> +	return drvstr;

This looks rather obsfucated due to the fact that dev_driver_string
never returns '\0', and due to the strange mix of a tenary operation
and the if on a related condition.


>  }
>  
>  #endif /* CONFIG_EEH */
> diff --git a/drivers/bcma/host_pci.c b/drivers/bcma/host_pci.c
> index 69c10a7b7c61..dc2ffa686964 100644
> --- a/drivers/bcma/host_pci.c
> +++ b/drivers/bcma/host_pci.c
> @@ -175,9 +175,10 @@ static int bcma_host_pci_probe(struct pci_dev *dev,
>  	if (err)
>  		goto err_kfree_bus;
>  
> -	name = dev_name(&dev->dev);
> -	if (dev->driver && dev->driver->name)
> -		name = dev->driver->name;
> +	name = dev_driver_string(&dev->dev);
> +	if (*name == '\0')
> +		name = dev_name(&dev->dev);

Where does this '\0' check come from?

> +
> +	name = dev_driver_string(&dev->dev);
> +	if (*name == '\0')
> +		name = dev_name(&dev->dev);
> +

More of this weirdness.

^ permalink raw reply

* Re: clang/ld.lld build fails with `can't create dynamic relocation R_PPC64_ADDR64 against local symbol in readonly segment`
From: Michael Ellerman @ 2021-08-12  5:46 UTC (permalink / raw)
  To: Paul Menzel, Benjamin Herrenschmidt, Paul Mackerras
  Cc: Derek Parker, Dmitrii Okunev, linuxppc-dev
In-Reply-To: <77a69755-5291-285c-45be-c1e42423fddc@molgen.mpg.de>

Paul Menzel <pmenzel@molgen.mpg.de> writes:
> Am 29.07.21 um 10:23 schrieb Paul Menzel:
>
>> I just wanted to make you aware that building Linux for ppc64le with 
>> clang/lld.ld fails with [1]:
>> 
>>      ld.lld: error: can't create dynamic relocation R_PPC64_ADDR64 
>> against symbol: empty_zero_page in readonly segment; recompile object 
>> files with -fPIC or pass '-Wl,-z,notext' to allow text relocations in 
>> the output
>>      >>> defined in arch/powerpc/kernel/head_64.o
>>      >>> referenced by 
>> arch/powerpc/kernel/head_64.o:(___ksymtab+empty_zero_page+0x0)
>> 
>> The patch below from one of the comments [2] fixes it.
>> 
>> --- i/arch/powerpc/Makefile
>> +++ w/arch/powerpc/Makefile
>> @@ -122,7 +122,7 @@ cflags-$(CONFIG_STACKPROTECTOR)     += 
>> -mstack-protector-guard-reg=r2
>>   endif
>> 
>>   LDFLAGS_vmlinux-y := -Bstatic
>> -LDFLAGS_vmlinux-$(CONFIG_RELOCATABLE) := -pie
>> +LDFLAGS_vmlinux-$(CONFIG_RELOCATABLE) := -pie -z notext
>>   LDFLAGS_vmlinux        := $(LDFLAGS_vmlinux-y)
>>   LDFLAGS_vmlinux += $(call ld-option,--orphan-handling=warn)
>
> Any comments, if this is the right fix? Current Linux master branch 
> still fails to build with `LLVM=1` on Ubuntu 21.04 without this change.

Sorry but I have no idea if it's the right fix. What I need is the
author (or someone else) to send a patch with a change log explaining
the change, what it does, why it's right for llvm, and why it's right
for binutils.

cheers

^ permalink raw reply

* Re: [PATCH v2 7/9] usb: phy: fsl-usb: add IRQ check
From: Felipe Balbi @ 2021-08-12  5:38 UTC (permalink / raw)
  To: Sergey Shtylyov; +Cc: Greg Kroah-Hartman, linux-usb, linuxppc-dev, Ran Wang
In-Reply-To: <b0a86089-8b8b-122e-fd6d-73e8c2304964@omp.ru>


Sergey Shtylyov <s.shtylyov@omp.ru> writes:

> The driver neglects to check the result of platform_get_irq()'s call and
> blithely passes the negative error codes to request_irq() (which takes
> *unsigned* IRQ #), causing it to fail with -EINVAL, overriding an original
> error code. Stop calling request_irq() with the invalid IRQ #s.
>
> Fixes: 0807c500a1a6 ("USB: add Freescale USB OTG Transceiver driver")
> Signed-off-by: Sergey Shtylyov <s.shtylyov@omp.ru>

Acked-by: Felipe Balbi <balbi@kernel.org>

-- 
balbi

^ permalink raw reply

* [PATCH v2 1/2] KVM: Refactor kvm_arch_vcpu_fault() to return a struct page pointer
From: Hou Wenlong @ 2021-08-12  4:02 UTC (permalink / raw)
  To: kvm
  Cc: x86, Wanpeng Li, David Hildenbrand, linux-mips, H. Peter Anvin,
	Claudio Imbrenda, Will Deacon, kvmarm, linux-s390, Janosch Frank,
	Marc Zyngier, Joerg Roedel, Huacai Chen, Christian Borntraeger,
	Aleksandar Markovic, Ingo Molnar, Catalin Marinas, Vasily Gorbik,
	Suzuki K Poulose, Heiko Carstens, kvm-ppc, Borislav Petkov,
	Thomas Gleixner, Alexandru Elisei, linux-arm-kernel, Jim Mattson,
	Thomas Bogendoerfer, Sean Christopherson, Cornelia Huck,
	linux-kernel, James Morse, Paolo Bonzini, Vitaly Kuznetsov,
	linuxppc-dev
In-Reply-To: <YRQcZqCWwVH8bCGc@google.com>

From: Sean Christopherson <seanjc@google.com>

Refactor kvm_arch_vcpu_fault() to return 'struct page *' instead of
'vm_fault_t' to simplify architecture specific implementations that do
more than return SIGBUS.  Currently this only applies to s390, but a
future patch will move x86's pio_data handling into x86 where it belongs.

No functional changed intended.

Cc: Hou Wenlong <houwenlong93@linux.alibaba.com>
Signed-off-by: Sean Christopherson <seanjc@google.com>
Signed-off-by: Hou Wenlong <houwenlong93@linux.alibaba.com>
---
 arch/arm64/kvm/arm.c       |  4 ++--
 arch/mips/kvm/mips.c       |  4 ++--
 arch/powerpc/kvm/powerpc.c |  4 ++--
 arch/s390/kvm/kvm-s390.c   | 12 ++++--------
 arch/x86/kvm/x86.c         |  4 ++--
 include/linux/kvm_host.h   |  2 +-
 virt/kvm/kvm_main.c        |  5 ++++-
 7 files changed, 17 insertions(+), 18 deletions(-)

diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
index e9a2b8f27792..83f4ffe3e4f2 100644
--- a/arch/arm64/kvm/arm.c
+++ b/arch/arm64/kvm/arm.c
@@ -161,9 +161,9 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
 	return ret;
 }
 
-vm_fault_t kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf)
+struct page *kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf)
 {
-	return VM_FAULT_SIGBUS;
+	return NULL;
 }
 
 
diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c
index af9dd029a4e1..ae79874e6fd2 100644
--- a/arch/mips/kvm/mips.c
+++ b/arch/mips/kvm/mips.c
@@ -1053,9 +1053,9 @@ int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
 	return -ENOIOCTLCMD;
 }
 
-vm_fault_t kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf)
+struct page *kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf)
 {
-	return VM_FAULT_SIGBUS;
+	return NULL;
 }
 
 int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index be33b5321a76..b9c21f9ab784 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -2090,9 +2090,9 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
 	return r;
 }
 
-vm_fault_t kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf)
+struct page *kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf)
 {
-	return VM_FAULT_SIGBUS;
+	return NULL;
 }
 
 static int kvm_vm_ioctl_get_pvinfo(struct kvm_ppc_pvinfo *pvinfo)
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index 02574d7b3612..e1b69833e228 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -4979,17 +4979,13 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
 	return r;
 }
 
-vm_fault_t kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf)
+struct page *kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf)
 {
 #ifdef CONFIG_KVM_S390_UCONTROL
-	if ((vmf->pgoff == KVM_S390_SIE_PAGE_OFFSET)
-		 && (kvm_is_ucontrol(vcpu->kvm))) {
-		vmf->page = virt_to_page(vcpu->arch.sie_block);
-		get_page(vmf->page);
-		return 0;
-	}
+	if (vmf->pgoff == KVM_S390_SIE_PAGE_OFFSET && kvm_is_ucontrol(vcpu->kvm))
+		return virt_to_page(vcpu->arch.sie_block);
 #endif
-	return VM_FAULT_SIGBUS;
+	return NULL;
 }
 
 /* Section: memory related */
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 3cedc7cc132a..1e3bbe5cd33a 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -5347,9 +5347,9 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
 	return r;
 }
 
-vm_fault_t kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf)
+struct page *kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf)
 {
-	return VM_FAULT_SIGBUS;
+	return NULL;
 }
 
 static int kvm_vm_ioctl_set_tss_addr(struct kvm *kvm, unsigned long addr)
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 492d183dd7d0..a949de534722 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -995,7 +995,7 @@ long kvm_arch_dev_ioctl(struct file *filp,
 			unsigned int ioctl, unsigned long arg);
 long kvm_arch_vcpu_ioctl(struct file *filp,
 			 unsigned int ioctl, unsigned long arg);
-vm_fault_t kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf);
+struct page *kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf);
 
 int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext);
 
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 30d322519253..f7d21418971b 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -3448,7 +3448,10 @@ static vm_fault_t kvm_vcpu_fault(struct vm_fault *vmf)
 		    &vcpu->dirty_ring,
 		    vmf->pgoff - KVM_DIRTY_LOG_PAGE_OFFSET);
 	else
-		return kvm_arch_vcpu_fault(vcpu, vmf);
+		page = kvm_arch_vcpu_fault(vcpu, vmf);
+	if (!page)
+		return VM_FAULT_SIGBUS;
+
 	get_page(page);
 	vmf->page = page;
 	return 0;
-- 
2.31.1


^ permalink raw reply related

* Re: [PATCH v7 5/6] powerpc/pseries: Add support for FORM2 associativity
From: Aneesh Kumar K.V @ 2021-08-12  3:36 UTC (permalink / raw)
  To: David Gibson; +Cc: Nathan Lynch, Daniel Henrique Barboza, linuxppc-dev
In-Reply-To: <YRR8Z0EhlXgEKtY8@yekko>

On 8/12/21 7:11 AM, David Gibson wrote:
> On Wed, Aug 11, 2021 at 09:39:32AM +0530, Aneesh Kumar K.V wrote:
>> David Gibson <david@gibson.dropbear.id.au> writes:
>>
>>> On Mon, Aug 09, 2021 at 10:54:33AM +0530, Aneesh Kumar K.V wrote:
>>>> PAPR interface currently supports two different ways of communicating resource
>>>> grouping details to the OS. These are referred to as Form 0 and Form 1
>>>> associativity grouping. Form 0 is the older format and is now considered
>>>> deprecated. This patch adds another resource grouping named FORM2.
>>>>
>>>> Signed-off-by: Daniel Henrique Barboza <danielhb413@gmail.com>
>>>> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
>>>
>>> LGTM, with the exception of some minor nits noted below.
>> ...
>>
>>> +
>>>> +	for (i = 0; i < max_numa_index; i++)
>>>> +		/* +1 skip the max_numa_index in the property */
>>>> +		numa_id_index_table[i] = of_read_number(&numa_lookup_index[i + 1], 1);
>>>> +
>>>> +
>>>> +	if (numa_dist_table_length != max_numa_index * max_numa_index) {
>>>> +
>>>
>>> Stray extra whitespace line here.
>>>
>>>> +		WARN(1, "Wrong NUMA distance information\n");
>>>> +		/* consider everybody else just remote. */
>>>> +		for (i = 0;  i < max_numa_index; i++) {
>>>> +			for (j = 0; j < max_numa_index; j++) {
>>>> +				int nodeA = numa_id_index_table[i];
>>>> +				int nodeB = numa_id_index_table[j];
>>>> +
>>>> +				if (nodeA == nodeB)
>>>> +					numa_distance_table[nodeA][nodeB] = LOCAL_DISTANCE;
>>>> +				else
>>>> +					numa_distance_table[nodeA][nodeB] = REMOTE_DISTANCE;
>>>> +			}
>>>> +		}
>>>
>>> I don't think it's necessarily a problem, but something to consider is
>>> that this fallback will initialize distance for *all* node IDs,
>>> whereas the normal path will only initialize it for nodes that are in
>>> the index table.  Since some later error checks key off whether
>>> certain fields in the distance table are initialized, is that the
>>> outcome you want?
>>>
>>
>> With the device tree details not correct, one of the possible way to
>> make progress is to consider everybody remote. With new node hotplug
>> support we used to check whether the distance table entry is
>> initialized. With the updated spec, we expect all possible numa node
>> distance to be available during boot.
> 
> Sure.  But my main point here is that the fallback behaviour in this
> clause is different from the fallback behaviour if the table is there
> and parseable, but incomplete - which is also not expected.
> 

With FORM2 fallback with bad device tree details is to consider 
everybody REMOTE. With Form1, we leave the distance table not populated 
as it was with the current kernel versions.

-aneesh

^ permalink raw reply

* Re: [PATCH v7 5/6] powerpc/pseries: Add support for FORM2 associativity
From: David Gibson @ 2021-08-12  1:41 UTC (permalink / raw)
  To: Aneesh Kumar K.V; +Cc: Nathan Lynch, Daniel Henrique Barboza, linuxppc-dev
In-Reply-To: <87a6loaagz.fsf@linux.ibm.com>

[-- Attachment #1: Type: text/plain, Size: 2691 bytes --]

On Wed, Aug 11, 2021 at 09:39:32AM +0530, Aneesh Kumar K.V wrote:
> David Gibson <david@gibson.dropbear.id.au> writes:
> 
> > On Mon, Aug 09, 2021 at 10:54:33AM +0530, Aneesh Kumar K.V wrote:
> >> PAPR interface currently supports two different ways of communicating resource
> >> grouping details to the OS. These are referred to as Form 0 and Form 1
> >> associativity grouping. Form 0 is the older format and is now considered
> >> deprecated. This patch adds another resource grouping named FORM2.
> >> 
> >> Signed-off-by: Daniel Henrique Barboza <danielhb413@gmail.com>
> >> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
> >
> > LGTM, with the exception of some minor nits noted below.
> ...
> 
> > +
> >> +	for (i = 0; i < max_numa_index; i++)
> >> +		/* +1 skip the max_numa_index in the property */
> >> +		numa_id_index_table[i] = of_read_number(&numa_lookup_index[i + 1], 1);
> >> +
> >> +
> >> +	if (numa_dist_table_length != max_numa_index * max_numa_index) {
> >> +
> >
> > Stray extra whitespace line here.
> >
> >> +		WARN(1, "Wrong NUMA distance information\n");
> >> +		/* consider everybody else just remote. */
> >> +		for (i = 0;  i < max_numa_index; i++) {
> >> +			for (j = 0; j < max_numa_index; j++) {
> >> +				int nodeA = numa_id_index_table[i];
> >> +				int nodeB = numa_id_index_table[j];
> >> +
> >> +				if (nodeA == nodeB)
> >> +					numa_distance_table[nodeA][nodeB] = LOCAL_DISTANCE;
> >> +				else
> >> +					numa_distance_table[nodeA][nodeB] = REMOTE_DISTANCE;
> >> +			}
> >> +		}
> >
> > I don't think it's necessarily a problem, but something to consider is
> > that this fallback will initialize distance for *all* node IDs,
> > whereas the normal path will only initialize it for nodes that are in
> > the index table.  Since some later error checks key off whether
> > certain fields in the distance table are initialized, is that the
> > outcome you want?
> >
> 
> With the device tree details not correct, one of the possible way to
> make progress is to consider everybody remote. With new node hotplug
> support we used to check whether the distance table entry is
> initialized. With the updated spec, we expect all possible numa node
> distance to be available during boot.

Sure.  But my main point here is that the fallback behaviour in this
clause is different from the fallback behaviour if the table is there
and parseable, but incomplete - which is also not expected.

-- 
David Gibson			| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au	| minimalist, thank you.  NOT _the_ _other_
				| _way_ _around_!
http://www.ozlabs.org/~dgibson

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

^ permalink raw reply

* Re: clang/ld.lld build fails with `can't create dynamic relocation R_PPC64_ADDR64 against local symbol in readonly segment`
From: Paul Menzel @ 2021-08-11 23:09 UTC (permalink / raw)
  To: Christophe Leroy, Michael Ellerman, Benjamin Herrenschmidt,
	Paul Mackerras
  Cc: Derek Parker, Dmitrii Okunev, linuxppc-dev
In-Reply-To: <886ac4a5-ad30-b27f-0b80-ec233c571e81@csgroup.eu>

Dear Christophe,


Am 11.08.21 um 16:10 schrieb Christophe Leroy:

> Le 10/08/2021 à 20:38, Paul Menzel a écrit :

>> Am 29.07.21 um 10:23 schrieb Paul Menzel:
>>
>>> I just wanted to make you aware that building Linux for ppc64le with 
>>> clang/lld.ld fails with [1]:
>>>
>>>      ld.lld: error: can't create dynamic relocation R_PPC64_ADDR64 
>>> against symbol: empty_zero_page in readonly segment; recompile object 
>>> files with -fPIC or pass '-Wl,-z,notext' to allow text relocations in 
>>> the output
>>>      >>> defined in arch/powerpc/kernel/head_64.o
>>>      >>> referenced by 
>>> arch/powerpc/kernel/head_64.o:(___ksymtab+empty_zero_page+0x0)
>>>
>>> The patch below from one of the comments [2] fixes it.
>>>
>>> --- i/arch/powerpc/Makefile
>>> +++ w/arch/powerpc/Makefile
>>> @@ -122,7 +122,7 @@ cflags-$(CONFIG_STACKPROTECTOR)     += 
>>> -mstack-protector-guard-reg=r2
>>>   endif
>>>
>>>   LDFLAGS_vmlinux-y := -Bstatic
>>> -LDFLAGS_vmlinux-$(CONFIG_RELOCATABLE) := -pie
>>> +LDFLAGS_vmlinux-$(CONFIG_RELOCATABLE) := -pie -z notext
>>>   LDFLAGS_vmlinux        := $(LDFLAGS_vmlinux-y)
>>>   LDFLAGS_vmlinux += $(call ld-option,--orphan-handling=warn)
>>
>> Any comments, if this is the right fix? Current Linux master branch 
>> still fails to build with `LLVM=1` on Ubuntu 21.04 without this change.
> 
> Which kernel version are you building ?
> 
> Since 
> https://github.com/linuxppc/linux/commit/45b30fafe528601f1a4449c9d68d8ebe7bbc39ad 
> , empty_zero_page[] is in arch/powerpc/mm/mem.c not in 
> arch/powerpc/kernel/head_64.o
> 
> Do you still have the issue with kernel 5.14 ?

Yes, before sending the message, I reproduced it with

     $ git describe
     v5.14-rc5-2-g9a73fa375d58

containing the commit you mentioned.


Kind regards,

Paul

^ permalink raw reply

* Re: [PATCH v2 40/60] KVM: PPC: Book3S HV P9: Implement TM fastpath for guest entry/exit
From: kernel test robot @ 2021-08-11 20:40 UTC (permalink / raw)
  To: Nicholas Piggin, kvm-ppc; +Cc: linuxppc-dev, kbuild-all, Nicholas Piggin
In-Reply-To: <20210811160134.904987-41-npiggin@gmail.com>

[-- Attachment #1: Type: text/plain, Size: 5369 bytes --]

Hi Nicholas,

I love your patch! Yet something to improve:

[auto build test ERROR on powerpc/next]
[also build test ERROR on linus/master v5.14-rc5 next-20210811]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch]

url:    https://github.com/0day-ci/linux/commits/Nicholas-Piggin/KVM-PPC-Book3S-HV-P9-entry-exit-optimisations/20210812-000748
base:   https://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux.git next
config: powerpc64-randconfig-r024-20210811 (attached as .config)
compiler: powerpc-linux-gcc (GCC) 10.3.0
reproduce (this is a W=1 build):
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # https://github.com/0day-ci/linux/commit/30a3a9ae99f124a863c41f268c68b647d7116b65
        git remote add linux-review https://github.com/0day-ci/linux
        git fetch --no-tags linux-review Nicholas-Piggin/KVM-PPC-Book3S-HV-P9-entry-exit-optimisations/20210812-000748
        git checkout 30a3a9ae99f124a863c41f268c68b647d7116b65
        # save the attached .config to linux build tree
        mkdir build_dir
        COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-10.3.0 make.cross O=build_dir ARCH=powerpc SHELL=/bin/bash arch/powerpc/kvm/

If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot <lkp@intel.com>

All errors (new ones prefixed by >>):

   In file included from arch/powerpc/include/asm/processor.h:11,
                    from arch/powerpc/include/asm/thread_info.h:40,
                    from include/linux/thread_info.h:60,
                    from include/asm-generic/preempt.h:5,
                    from ./arch/powerpc/include/generated/asm/preempt.h:1,
                    from include/linux/preempt.h:78,
                    from include/linux/percpu.h:6,
                    from include/linux/context_tracking_state.h:5,
                    from include/linux/hardirq.h:5,
                    from include/linux/kvm_host.h:7,
                    from arch/powerpc/kvm/book3s_hv_p9_entry.c:3:
   arch/powerpc/kvm/book3s_hv_p9_entry.c: In function 'load_vcpu_state':
>> arch/powerpc/kvm/book3s_hv_p9_entry.c:297:33: error: 'struct kvm_vcpu_arch' has no member named 'texasr'
     297 |    mtspr(SPRN_TEXASR, vcpu->arch.texasr);
         |                                 ^
   arch/powerpc/include/asm/reg.h:1396:33: note: in definition of macro 'mtspr'
    1396 |          : "r" ((unsigned long)(v)) \
         |                                 ^
>> arch/powerpc/kvm/book3s_hv_p9_entry.c:298:33: error: 'struct kvm_vcpu_arch' has no member named 'tfhar'; did you mean 'tar'?
     298 |    mtspr(SPRN_TFHAR, vcpu->arch.tfhar);
         |                                 ^~~~~
   arch/powerpc/include/asm/reg.h:1396:33: note: in definition of macro 'mtspr'
    1396 |          : "r" ((unsigned long)(v)) \
         |                                 ^
>> arch/powerpc/kvm/book3s_hv_p9_entry.c:299:33: error: 'struct kvm_vcpu_arch' has no member named 'tfiar'; did you mean 'tar'?
     299 |    mtspr(SPRN_TFIAR, vcpu->arch.tfiar);
         |                                 ^~~~~
   arch/powerpc/include/asm/reg.h:1396:33: note: in definition of macro 'mtspr'
    1396 |          : "r" ((unsigned long)(v)) \
         |                                 ^
   arch/powerpc/kvm/book3s_hv_p9_entry.c: In function 'store_vcpu_state':
   arch/powerpc/kvm/book3s_hv_p9_entry.c:331:14: error: 'struct kvm_vcpu_arch' has no member named 'texasr'
     331 |    vcpu->arch.texasr = mfspr(SPRN_TEXASR);
         |              ^
   arch/powerpc/kvm/book3s_hv_p9_entry.c:332:15: error: 'struct kvm_vcpu_arch' has no member named 'tfhar'; did you mean 'tar'?
     332 |    vcpu->arch.tfhar = mfspr(SPRN_TFHAR);
         |               ^~~~~
         |               tar
   arch/powerpc/kvm/book3s_hv_p9_entry.c:333:15: error: 'struct kvm_vcpu_arch' has no member named 'tfiar'; did you mean 'tar'?
     333 |    vcpu->arch.tfiar = mfspr(SPRN_TFIAR);
         |               ^~~~~
         |               tar


vim +297 arch/powerpc/kvm/book3s_hv_p9_entry.c

   283	
   284	/* Returns true if current MSR and/or guest MSR may have changed */
   285	bool load_vcpu_state(struct kvm_vcpu *vcpu,
   286			     struct p9_host_os_sprs *host_os_sprs)
   287	{
   288		bool ret = false;
   289	
   290		if (cpu_has_feature(CPU_FTR_TM) ||
   291		    cpu_has_feature(CPU_FTR_P9_TM_HV_ASSIST)) {
   292			unsigned long guest_msr = vcpu->arch.shregs.msr;
   293			if (MSR_TM_ACTIVE(guest_msr)) {
   294				kvmppc_restore_tm_hv(vcpu, guest_msr, true);
   295				ret = true;
   296			} else {
 > 297				mtspr(SPRN_TEXASR, vcpu->arch.texasr);
 > 298				mtspr(SPRN_TFHAR, vcpu->arch.tfhar);
 > 299				mtspr(SPRN_TFIAR, vcpu->arch.tfiar);
   300			}
   301		}
   302	
   303		load_spr_state(vcpu, host_os_sprs);
   304	
   305		load_fp_state(&vcpu->arch.fp);
   306	#ifdef CONFIG_ALTIVEC
   307		load_vr_state(&vcpu->arch.vr);
   308	#endif
   309		mtspr(SPRN_VRSAVE, vcpu->arch.vrsave);
   310	
   311		return ret;
   312	}
   313	EXPORT_SYMBOL_GPL(load_vcpu_state);
   314	

---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/kbuild-all@lists.01.org

[-- Attachment #2: .config.gz --]
[-- Type: application/gzip, Size: 43470 bytes --]

^ permalink raw reply

* Re: [PATCH v2 27/60] KVM: PPC: Book3S HV P9: Reduce mtmsrd instructions required to save host SPRs
From: kernel test robot @ 2021-08-11 19:49 UTC (permalink / raw)
  To: Nicholas Piggin, kvm-ppc; +Cc: linuxppc-dev, kbuild-all, Nicholas Piggin
In-Reply-To: <20210811160134.904987-28-npiggin@gmail.com>

[-- Attachment #1: Type: text/plain, Size: 2235 bytes --]

Hi Nicholas,

I love your patch! Yet something to improve:

[auto build test ERROR on powerpc/next]
[also build test ERROR on linus/master v5.14-rc5 next-20210811]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch]

url:    https://github.com/0day-ci/linux/commits/Nicholas-Piggin/KVM-PPC-Book3S-HV-P9-entry-exit-optimisations/20210812-000748
base:   https://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux.git next
config: powerpc64-randconfig-r024-20210811 (attached as .config)
compiler: powerpc-linux-gcc (GCC) 10.3.0
reproduce (this is a W=1 build):
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # https://github.com/0day-ci/linux/commit/b5149d8c735b6802aa0433a0cecc73e4d943e795
        git remote add linux-review https://github.com/0day-ci/linux
        git fetch --no-tags linux-review Nicholas-Piggin/KVM-PPC-Book3S-HV-P9-entry-exit-optimisations/20210812-000748
        git checkout b5149d8c735b6802aa0433a0cecc73e4d943e795
        # save the attached .config to linux build tree
        COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-10.3.0 make.cross ARCH=powerpc64 

If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot <lkp@intel.com>

All errors (new ones prefixed by >>):

>> arch/powerpc/kernel/process.c:596:6: error: no previous prototype for 'save_user_regs_kvm' [-Werror=missing-prototypes]
     596 | void save_user_regs_kvm(void)
         |      ^~~~~~~~~~~~~~~~~~
   cc1: all warnings being treated as errors


vim +/save_user_regs_kvm +596 arch/powerpc/kernel/process.c

   595	
 > 596	void save_user_regs_kvm(void)
   597	{
   598		unsigned long usermsr;
   599	
   600		if (!current->thread.regs)
   601			return;
   602	
   603		usermsr = current->thread.regs->msr;
   604	
   605		if (usermsr & MSR_FP)
   606			save_fpu(current);
   607	
   608		if (usermsr & MSR_VEC)
   609			save_altivec(current);
   610	

---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/kbuild-all@lists.01.org

[-- Attachment #2: .config.gz --]
[-- Type: application/gzip, Size: 43470 bytes --]

^ permalink raw reply

* Re: [PATCH v2 27/60] KVM: PPC: Book3S HV P9: Reduce mtmsrd instructions required to save host SPRs
From: kernel test robot @ 2021-08-11 19:18 UTC (permalink / raw)
  To: Nicholas Piggin, kvm-ppc; +Cc: linuxppc-dev, kbuild-all, Nicholas Piggin
In-Reply-To: <20210811160134.904987-28-npiggin@gmail.com>

[-- Attachment #1: Type: text/plain, Size: 2260 bytes --]

Hi Nicholas,

I love your patch! Perhaps something to improve:

[auto build test WARNING on powerpc/next]
[also build test WARNING on linus/master v5.14-rc5 next-20210811]
[cannot apply to scottwood/next kvm-ppc/kvm-ppc-next]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch]

url:    https://github.com/0day-ci/linux/commits/Nicholas-Piggin/KVM-PPC-Book3S-HV-P9-entry-exit-optimisations/20210812-000748
base:   https://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux.git next
config: powerpc64-buildonly-randconfig-r001-20210810 (attached as .config)
compiler: powerpc-linux-gcc (GCC) 10.3.0
reproduce (this is a W=1 build):
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # https://github.com/0day-ci/linux/commit/b5149d8c735b6802aa0433a0cecc73e4d943e795
        git remote add linux-review https://github.com/0day-ci/linux
        git fetch --no-tags linux-review Nicholas-Piggin/KVM-PPC-Book3S-HV-P9-entry-exit-optimisations/20210812-000748
        git checkout b5149d8c735b6802aa0433a0cecc73e4d943e795
        # save the attached .config to linux build tree
        COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-10.3.0 make.cross ARCH=powerpc64 

If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot <lkp@intel.com>

All warnings (new ones prefixed by >>):

>> arch/powerpc/kernel/process.c:596:6: warning: no previous prototype for 'save_user_regs_kvm' [-Wmissing-prototypes]
     596 | void save_user_regs_kvm(void)
         |      ^~~~~~~~~~~~~~~~~~


vim +/save_user_regs_kvm +596 arch/powerpc/kernel/process.c

   595	
 > 596	void save_user_regs_kvm(void)
   597	{
   598		unsigned long usermsr;
   599	
   600		if (!current->thread.regs)
   601			return;
   602	
   603		usermsr = current->thread.regs->msr;
   604	
   605		if (usermsr & MSR_FP)
   606			save_fpu(current);
   607	
   608		if (usermsr & MSR_VEC)
   609			save_altivec(current);
   610	

---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/kbuild-all@lists.01.org

[-- Attachment #2: .config.gz --]
[-- Type: application/gzip, Size: 31911 bytes --]

^ permalink raw reply

* Re: [PATCH v5 7/8] powerpc/64s: Initialize and use a temporary mm for patching on Radix
From: Christopher M. Riedl @ 2021-08-11 18:28 UTC (permalink / raw)
  To: Christophe Leroy, linuxppc-dev
  Cc: keescook, peterz, x86, npiggin, linux-hardening, tglx, dja
In-Reply-To: <9cc03303-ca54-94b8-7d0b-42647ff4d5a7@csgroup.eu>

On Thu Aug 5, 2021 at 4:48 AM CDT, Christophe Leroy wrote:
>
>
> Le 13/07/2021 à 07:31, Christopher M. Riedl a écrit :
> > When code patching a STRICT_KERNEL_RWX kernel the page containing the
> > address to be patched is temporarily mapped as writeable. Currently, a
> > per-cpu vmalloc patch area is used for this purpose. While the patch
> > area is per-cpu, the temporary page mapping is inserted into the kernel
> > page tables for the duration of patching. The mapping is exposed to CPUs
> > other than the patching CPU - this is undesirable from a hardening
> > perspective. Use a temporary mm instead which keeps the mapping local to
> > the CPU doing the patching.
> > 
> > Use the `poking_init` init hook to prepare a temporary mm and patching
> > address. Initialize the temporary mm by copying the init mm. Choose a
> > randomized patching address inside the temporary mm userspace address
> > space. The patching address is randomized between PAGE_SIZE and
> > DEFAULT_MAP_WINDOW-PAGE_SIZE.
> > 
> > Bits of entropy with 64K page size on BOOK3S_64:
> > 
> >          bits of entropy = log2(DEFAULT_MAP_WINDOW_USER64 / PAGE_SIZE)
> > 
> >          PAGE_SIZE=64K, DEFAULT_MAP_WINDOW_USER64=128TB
> >          bits of entropy = log2(128TB / 64K)
> > 	bits of entropy = 31
> > 
> > The upper limit is DEFAULT_MAP_WINDOW due to how the Book3s64 Hash MMU
> > operates - by default the space above DEFAULT_MAP_WINDOW is not
> > available. Currently the Hash MMU does not use a temporary mm so
> > technically this upper limit isn't necessary; however, a larger
> > randomization range does not further "harden" this overall approach and
> > future work may introduce patching with a temporary mm on Hash as well.
> > 
> > Randomization occurs only once during initialization at boot for each
> > possible CPU in the system.
> > 
> > Introduce two new functions, map_patch() and unmap_patch(), to
> > respectively create and remove the temporary mapping with write
> > permissions at patching_addr. Map the page with PAGE_KERNEL to set
> > EAA[0] for the PTE which ignores the AMR (so no need to unlock/lock
> > KUAP) according to PowerISA v3.0b Figure 35 on Radix.
> > 
> > Based on x86 implementation:
> > 
> > commit 4fc19708b165
> > ("x86/alternatives: Initialize temporary mm for patching")
> > 
> > and:
> > 
> > commit b3fd8e83ada0
> > ("x86/alternatives: Use temporary mm for text poking")
> > 
> > Signed-off-by: Christopher M. Riedl <cmr@linux.ibm.com>
> > 
> > ---
> > 
> > v5:  * Only support Book3s64 Radix MMU for now.
> >       * Use a per-cpu datastructure to hold the patching_addr and
> >         patching_mm to avoid the need for a synchronization lock/mutex.
> > 
> > v4:  * In the previous series this was two separate patches: one to init
> >         the temporary mm in poking_init() (unused in powerpc at the time)
> >         and the other to use it for patching (which removed all the
> >         per-cpu vmalloc code). Now that we use poking_init() in the
> >         existing per-cpu vmalloc approach, that separation doesn't work
> >         as nicely anymore so I just merged the two patches into one.
> >       * Preload the SLB entry and hash the page for the patching_addr
> >         when using Hash on book3s64 to avoid taking an SLB and Hash fault
> >         during patching. The previous implementation was a hack which
> >         changed current->mm to allow the SLB and Hash fault handlers to
> >         work with the temporary mm since both of those code-paths always
> >         assume mm == current->mm.
> >       * Also (hmm - seeing a trend here) with the book3s64 Hash MMU we
> >         have to manage the mm->context.active_cpus counter and mm cpumask
> >         since they determine (via mm_is_thread_local()) if the TLB flush
> >         in pte_clear() is local or not - it should always be local when
> >         we're using the temporary mm. On book3s64's Radix MMU we can
> >         just call local_flush_tlb_mm().
> >       * Use HPTE_USE_KERNEL_KEY on Hash to avoid costly lock/unlock of
> >         KUAP.
> > ---
> >   arch/powerpc/lib/code-patching.c | 132 +++++++++++++++++++++++++++++--
> >   1 file changed, 125 insertions(+), 7 deletions(-)
> > 
> > diff --git a/arch/powerpc/lib/code-patching.c b/arch/powerpc/lib/code-patching.c
> > index 9f2eba9b70ee4..027dabd42b8dd 100644
> > --- a/arch/powerpc/lib/code-patching.c
> > +++ b/arch/powerpc/lib/code-patching.c
> > @@ -11,6 +11,7 @@
> >   #include <linux/cpuhotplug.h>
> >   #include <linux/slab.h>
> >   #include <linux/uaccess.h>
> > +#include <linux/random.h>
> >   
> >   #include <asm/tlbflush.h>
> >   #include <asm/page.h>
> > @@ -103,6 +104,7 @@ static inline void unuse_temporary_mm(struct temp_mm *temp_mm)
> >   
> >   static DEFINE_PER_CPU(struct vm_struct *, text_poke_area);
> >   static DEFINE_PER_CPU(unsigned long, cpu_patching_addr);
> > +static DEFINE_PER_CPU(struct mm_struct *, cpu_patching_mm);
> >   
> >   #if IS_BUILTIN(CONFIG_LKDTM)
> >   unsigned long read_cpu_patching_addr(unsigned int cpu)
> > @@ -133,6 +135,51 @@ static int text_area_cpu_down(unsigned int cpu)
> >   	return 0;
> >   }
> >   
> > +static __always_inline void __poking_init_temp_mm(void)
> > +{
> > +	int cpu;
> > +	spinlock_t *ptl; /* for protecting pte table */
> > +	pte_t *ptep;
> > +	struct mm_struct *patching_mm;
> > +	unsigned long patching_addr;
> > +
> > +	for_each_possible_cpu(cpu) {
> > +		/*
> > +		 * Some parts of the kernel (static keys for example) depend on
> > +		 * successful code patching. Code patching under
> > +		 * STRICT_KERNEL_RWX requires this setup - otherwise we cannot
> > +		 * patch at all. We use BUG_ON() here and later since an early
> > +		 * failure is preferred to buggy behavior and/or strange
> > +		 * crashes later.
> > +		 */
> > +		patching_mm = copy_init_mm();
> > +		BUG_ON(!patching_mm);
>
> Read
> https://www.kernel.org/doc/html/latest/process/deprecated.html#bug-and-bug-on
> and
> https://github.com/linuxppc/issues/issues/88
>
> Avoid BUG_ON()s thanks.
>

Fine, @mpe's reply on the GH issue says the check is probably redundant:

"In general we don't need to BUG_ON(!ptr), the MMU will catch NULL
pointer dereferences for us."

> > +
> > +		per_cpu(cpu_patching_mm, cpu) = patching_mm;
> > +
> > +		/*
> > +		 * Choose a randomized, page-aligned address from the range:
> > +		 * [PAGE_SIZE, DEFAULT_MAP_WINDOW - PAGE_SIZE] The lower
> > +		 * address bound is PAGE_SIZE to avoid the zero-page.  The
> > +		 * upper address bound is DEFAULT_MAP_WINDOW - PAGE_SIZE to
> > +		 * stay under DEFAULT_MAP_WINDOW with the Book3s64 Hash MMU.
> > +		 */
> > +		patching_addr = PAGE_SIZE + ((get_random_long() & PAGE_MASK)
> > +				% (DEFAULT_MAP_WINDOW - 2 * PAGE_SIZE));
>
> % should be at the end of first line and the second line alignment
> should match open parenthesis in
> first line.

Ok - thanks!

>
> > +
> > +		per_cpu(cpu_patching_addr, cpu) = patching_addr;
> > +
> > +		/*
> > +		 * PTE allocation uses GFP_KERNEL which means we need to
> > +		 * pre-allocate the PTE here because we cannot do the
> > +		 * allocation during patching when IRQs are disabled.
> > +		 */
> > +		ptep = get_locked_pte(patching_mm, patching_addr, &ptl);
> > +		BUG_ON(!ptep);
>
> Avoid BUG_ON() please
>

Yup, I'll remove these in the next spin.

>
> > +		pte_unmap_unlock(ptep, ptl);
> > +	}
> > +}
> > +
> >   /*
> >    * Although BUG_ON() is rude, in this case it should only happen if ENOMEM, and
> >    * we judge it as being preferable to a kernel that will crash later when
> > @@ -140,6 +187,11 @@ static int text_area_cpu_down(unsigned int cpu)
> >    */
> >   void __init poking_init(void)
> >   {
> > +	if (radix_enabled()) {
> > +		__poking_init_temp_mm();
> > +		return;
> > +	}
> > +
> >   	BUG_ON(!cpuhp_setup_state(CPUHP_AP_ONLINE_DYN,
> >   		"powerpc/text_poke:online", text_area_cpu_up,
> >   		text_area_cpu_down));
> > @@ -213,30 +265,96 @@ static inline int unmap_patch_area(void)
> >   	return -EINVAL;
> >   }
> >   
> > +struct patch_mapping {
> > +	spinlock_t *ptl; /* for protecting pte table */
> > +	pte_t *ptep;
> > +	struct temp_mm temp_mm;
> > +};
> > +
> > +/*
> > + * This can be called for kernel text or a module.
> > + */
> > +static int map_patch(const void *addr, struct patch_mapping *patch_mapping)
> > +{
> > +	struct page *page;
> > +	pte_t pte;
> > +	pgprot_t pgprot;
> > +	struct mm_struct *patching_mm = __this_cpu_read(cpu_patching_mm);
> > +	unsigned long patching_addr = __this_cpu_read(cpu_patching_addr);
> > +
> > +	if (is_vmalloc_or_module_addr(addr))
> > +		page = vmalloc_to_page(addr);
> > +	else
> > +		page = virt_to_page(addr);
> > +
> > +	patch_mapping->ptep = get_locked_pte(patching_mm, patching_addr,
> > +					     &patch_mapping->ptl);
>
> Not sure you need to split this line, checkpatch now allows 100 chars
> per line.
>

I prefer sticking to 80 columns unless readability *really* improves by
going over that limit.

>
> > +	if (unlikely(!patch_mapping->ptep)) {
> > +		pr_warn("map patch: failed to allocate pte for patching\n");
>
> That's a lot better than all above BUG_ONs
>
>
> > +		return -1;
> > +	}
> > +
> > +	pgprot = PAGE_KERNEL;
> > +	pte = mk_pte(page, pgprot);
> > +	pte = pte_mkdirty(pte);
>
> I'm sure you can do
>
> pte = pte_mkdirty(mk_pte(page, PAGE_KERNEL));
>
> And indeed PAGE_KERNEL already includes _PAGE_DIRTY, so all you should
> need is
>
> pte = mk_pte(page, PAGE_KERNEL);
>
> Or even just
>
> set_pte_at(patching_mm, patching_addr, patch_mapping->ptep, mk_pte(page,
> PAGE_KERNEL));
>

Ok, I'll consolidate this in the next spin. Thanks!

>
> > +	set_pte_at(patching_mm, patching_addr, patch_mapping->ptep, pte);
> > +
> > +	init_temp_mm(&patch_mapping->temp_mm, patching_mm);
> > +	use_temporary_mm(&patch_mapping->temp_mm);
> > +
> > +	return 0;
> > +}
> > +
> > +static void unmap_patch(struct patch_mapping *patch_mapping)
> > +{
> > +	struct mm_struct *patching_mm = __this_cpu_read(cpu_patching_mm);
> > +	unsigned long patching_addr = __this_cpu_read(cpu_patching_addr);
> > +
> > +	pte_clear(patching_mm, patching_addr, patch_mapping->ptep);
> > +
> > +	local_flush_tlb_mm(patching_mm);
> > +
> > +	pte_unmap_unlock(patch_mapping->ptep, patch_mapping->ptl);
> > +
> > +	unuse_temporary_mm(&patch_mapping->temp_mm);
>
> Shouldn't you stop using it before unmapping/unlocking it ?
>

Yes I think you're right - IIRC I had to do this for the Hash MMU (which
we don't support w/ this verion of the series anymore anyways). I'll
revisit this for the next spin.

>
> > +}
> > +
> >   static int do_patch_instruction(u32 *addr, struct ppc_inst instr)
> >   {
> >   	int err, rc = 0;
> >   	u32 *patch_addr = NULL;
> >   	unsigned long flags;
> > +	struct patch_mapping patch_mapping;
> >   
> >   	/*
> > -	 * During early early boot patch_instruction is called
> > -	 * when text_poke_area is not ready, but we still need
> > -	 * to allow patching. We just do the plain old patching
> > +	 * During early early boot patch_instruction is called when the
> > +	 * patching_mm/text_poke_area is not ready, but we still need to allow
> > +	 * patching. We just do the plain old patching.
> >   	 */
> > -	if (!this_cpu_read(text_poke_area))
> > -		return raw_patch_instruction(addr, instr);
> > +	if (radix_enabled()) {
> > +		if (!this_cpu_read(cpu_patching_mm))
> > +			return raw_patch_instruction(addr, instr);
> > +	} else {
> > +		if (!this_cpu_read(text_poke_area))
> > +			return raw_patch_instruction(addr, instr);
> > +	}
> >   
> >   	local_irq_save(flags);
> >   
> > -	err = map_patch_area(addr);
> > +	if (radix_enabled())
> > +		err = map_patch(addr, &patch_mapping);
>
> Maybe call it map_patch_mm() or map_patch_mapping() ?

Yes that does sound better.

>
> > +	else
> > +		err = map_patch_area(addr);
> >   	if (err)
> >   		goto out;
> >   
> >   	patch_addr = (u32 *)(__this_cpu_read(cpu_patching_addr) | offset_in_page(addr));
> >   	rc = __patch_instruction(addr, instr, patch_addr);
> >   
> > -	err = unmap_patch_area();
> > +	if (radix_enabled())
> > +		unmap_patch(&patch_mapping);
>
> No err ? Would be better if it could return something, allthough always
> 0.

Ok I'll do that.

>
> And same comment about naming.
>

Yes I'll use your suggested names.

> > +	else
> > +		err = unmap_patch_area();
> >   
> >   out:
> >   	local_irq_restore(flags);
> > 


^ permalink raw reply

* Re: [PATCH v5 6/8] powerpc: Rework and improve STRICT_KERNEL_RWX patching
From: Christopher M. Riedl @ 2021-08-11 18:10 UTC (permalink / raw)
  To: Christophe Leroy, linuxppc-dev
  Cc: keescook, peterz, x86, npiggin, linux-hardening, tglx, dja
In-Reply-To: <9c53e997-3609-20f8-74c0-7776c867ce6c@csgroup.eu>

On Thu Aug 5, 2021 at 4:34 AM CDT, Christophe Leroy wrote:
>
>
> Le 13/07/2021 à 07:31, Christopher M. Riedl a écrit :
> > Rework code-patching with STRICT_KERNEL_RWX to prepare for the next
> > patch which uses a temporary mm for patching under the Book3s64 Radix
> > MMU. Make improvements by adding a WARN_ON when the patchsite doesn't
> > match after patching and return the error from __patch_instruction()
> > properly.
> > 
> > Signed-off-by: Christopher M. Riedl <cmr@linux.ibm.com>
> > 
> > ---
> > 
> > v5:  * New to series.
> > ---
> >   arch/powerpc/lib/code-patching.c | 51 +++++++++++++++++---------------
> >   1 file changed, 27 insertions(+), 24 deletions(-)
> > 
> > diff --git a/arch/powerpc/lib/code-patching.c b/arch/powerpc/lib/code-patching.c
> > index 3122d8e4cc013..9f2eba9b70ee4 100644
> > --- a/arch/powerpc/lib/code-patching.c
> > +++ b/arch/powerpc/lib/code-patching.c
> > @@ -102,11 +102,12 @@ static inline void unuse_temporary_mm(struct temp_mm *temp_mm)
> >   }
> >   
> >   static DEFINE_PER_CPU(struct vm_struct *, text_poke_area);
> > +static DEFINE_PER_CPU(unsigned long, cpu_patching_addr);
> >   
> >   #if IS_BUILTIN(CONFIG_LKDTM)
> >   unsigned long read_cpu_patching_addr(unsigned int cpu)
> >   {
> > -	return (unsigned long)(per_cpu(text_poke_area, cpu))->addr;
> > +	return per_cpu(cpu_patching_addr, cpu);
> >   }
> >   #endif
> >   
> > @@ -121,6 +122,7 @@ static int text_area_cpu_up(unsigned int cpu)
> >   		return -1;
> >   	}
> >   	this_cpu_write(text_poke_area, area);
> > +	this_cpu_write(cpu_patching_addr, (unsigned long)area->addr);
> >   
> >   	return 0;
> >   }
> > @@ -146,7 +148,7 @@ void __init poking_init(void)
> >   /*
> >    * This can be called for kernel text or a module.
> >    */
> > -static int map_patch_area(void *addr, unsigned long text_poke_addr)
> > +static int map_patch_area(void *addr)
> >   {
> >   	unsigned long pfn;
> >   	int err;
> > @@ -156,17 +158,20 @@ static int map_patch_area(void *addr, unsigned long text_poke_addr)
> >   	else
> >   		pfn = __pa_symbol(addr) >> PAGE_SHIFT;
> >   
> > -	err = map_kernel_page(text_poke_addr, (pfn << PAGE_SHIFT), PAGE_KERNEL);
> > +	err = map_kernel_page(__this_cpu_read(cpu_patching_addr),
> > +			      (pfn << PAGE_SHIFT), PAGE_KERNEL);
> >   
> > -	pr_devel("Mapped addr %lx with pfn %lx:%d\n", text_poke_addr, pfn, err);
> > +	pr_devel("Mapped addr %lx with pfn %lx:%d\n",
> > +		 __this_cpu_read(cpu_patching_addr), pfn, err);
> >   	if (err)
> >   		return -1;
> >   
> >   	return 0;
> >   }
> >   
> > -static inline int unmap_patch_area(unsigned long addr)
> > +static inline int unmap_patch_area(void)
> >   {
> > +	unsigned long addr = __this_cpu_read(cpu_patching_addr);
> >   	pte_t *ptep;
> >   	pmd_t *pmdp;
> >   	pud_t *pudp;
> > @@ -175,23 +180,23 @@ static inline int unmap_patch_area(unsigned long addr)
> >   
> >   	pgdp = pgd_offset_k(addr);
> >   	if (unlikely(!pgdp))
> > -		return -EINVAL;
> > +		goto out_err;
> >   
> >   	p4dp = p4d_offset(pgdp, addr);
> >   	if (unlikely(!p4dp))
> > -		return -EINVAL;
> > +		goto out_err;
> >   
> >   	pudp = pud_offset(p4dp, addr);
> >   	if (unlikely(!pudp))
> > -		return -EINVAL;
> > +		goto out_err;
> >   
> >   	pmdp = pmd_offset(pudp, addr);
> >   	if (unlikely(!pmdp))
> > -		return -EINVAL;
> > +		goto out_err;
> >   
> >   	ptep = pte_offset_kernel(pmdp, addr);
> >   	if (unlikely(!ptep))
> > -		return -EINVAL;
> > +		goto out_err;
> >   
> >   	pr_devel("clearing mm %p, pte %p, addr %lx\n", &init_mm, ptep, addr);
> >   
> > @@ -202,15 +207,17 @@ static inline int unmap_patch_area(unsigned long addr)
> >   	flush_tlb_kernel_range(addr, addr + PAGE_SIZE);
> >   
> >   	return 0;
> > +
> > +out_err:
> > +	pr_warn("failed to unmap %lx\n", addr);
> > +	return -EINVAL;
>
> Can you keep that in the caller of unmap_patch_area() instead of all
> those goto stuff ?
>

Yeah I think that's fair. I'll do this in the next spin.

> >   }
> >   
> >   static int do_patch_instruction(u32 *addr, struct ppc_inst instr)
> >   {
> > -	int err;
> > +	int err, rc = 0;
> >   	u32 *patch_addr = NULL;
> >   	unsigned long flags;
> > -	unsigned long text_poke_addr;
> > -	unsigned long kaddr = (unsigned long)addr;
> >   
> >   	/*
> >   	 * During early early boot patch_instruction is called
> > @@ -222,24 +229,20 @@ static int do_patch_instruction(u32 *addr, struct ppc_inst instr)
> >   
> >   	local_irq_save(flags);
> >   
> > -	text_poke_addr = (unsigned long)__this_cpu_read(text_poke_area)->addr;
> > -	if (map_patch_area(addr, text_poke_addr)) {
> > -		err = -1;
> > +	err = map_patch_area(addr);
> > +	if (err)
> >   		goto out;
> > -	}
> > -
> > -	patch_addr = (u32 *)(text_poke_addr + (kaddr & ~PAGE_MASK));
> >   
> > -	__patch_instruction(addr, instr, patch_addr);
> > +	patch_addr = (u32 *)(__this_cpu_read(cpu_patching_addr) | offset_in_page(addr));
> > +	rc = __patch_instruction(addr, instr, patch_addr);
> >   
> > -	err = unmap_patch_area(text_poke_addr);
> > -	if (err)
> > -		pr_warn("failed to unmap %lx\n", text_poke_addr);
> > +	err = unmap_patch_area();
> >   
> >   out:
> >   	local_irq_restore(flags);
> > +	WARN_ON(!ppc_inst_equal(ppc_inst_read(addr), instr));
>
> Why adding that WARN_ON(), what could make that happen that is worth a
> WARN_ON() ?

Failing to patch something could cause very strange issues later, so
explicitly calling out a failure when it happens is warranted IMO.

>
> Patching is quite a critical fast path, I'm not sure we want to afford
> too many checks during
> patching, we want it quick at first.

Hmm, I'd prefer to measure the impact first - if it's a huge degradation
then sure we can drop the WARN_ON()... I'll add some data with the next
spin.

>
> >   
> > -	return err;
> > +	return rc ? rc : err;
> >   }
> >   #else /* !CONFIG_STRICT_KERNEL_RWX */
> >   
> > 


^ permalink raw reply

* Re: [PATCH v5 2/8] lkdtm/powerpc: Add test to hijack a patch mapping
From: Kees Cook @ 2021-08-11 18:07 UTC (permalink / raw)
  To: Christopher M. Riedl
  Cc: peterz, x86, npiggin, tglx, dja, linuxppc-dev, linux-hardening
In-Reply-To: <CDGVLP8OS8N9.13R0RIGJ1WJ8R@oc8246131445.ibm.com>

On Wed, Aug 11, 2021 at 12:57:00PM -0500, Christopher M. Riedl wrote:
> On Thu Aug 5, 2021 at 4:13 AM CDT, Christophe Leroy wrote:
> >
> >
> > Le 13/07/2021 à 07:31, Christopher M. Riedl a écrit :
> > > When live patching with STRICT_KERNEL_RWX the CPU doing the patching
> > > must temporarily remap the page(s) containing the patch site with +W
> > > permissions. While this temporary mapping is in use, another CPU could
> > > write to the same mapping and maliciously alter kernel text. Implement a
> > > LKDTM test to attempt to exploit such an opening during code patching.
> > > The test is implemented on powerpc and requires LKDTM built into the
> > > kernel (building LKDTM as a module is insufficient).
> > > 
> > > The LKDTM "hijack" test works as follows:
> > > 
> > >    1. A CPU executes an infinite loop to patch an instruction. This is
> > >       the "patching" CPU.
> > >    2. Another CPU attempts to write to the address of the temporary
> > >       mapping used by the "patching" CPU. This other CPU is the
> > >       "hijacker" CPU. The hijack either fails with a fault/error or
> > >       succeeds, in which case some kernel text is now overwritten.
> > > [...]
> > > +#if (IS_BUILTIN(CONFIG_LKDTM) && defined(CONFIG_STRICT_KERNEL_RWX) && \
> > > +	defined(CONFIG_PPC))
> >
> > I think this test shouldn't be limited to CONFIG_PPC and shouldn't be
> > limited to CONFIG_STRICT_KERNEL_RWX. It should be there all the time.

Agreed: if the machinery exists to provide this defense on even one
arch/config/whatever combo, I'd like LKDTM to test for it. This lets use
compare defenses across different combinations more easily, and means
folks must answer questions like "why doesn't $combination provide
$defense?"

> > Also why limiting it to IS_BUILTIN(CONFIG_LKDTM) ?
> 
> The test needs read_cpu_patching_addr() which definitely cannot be
> exposed outside of the kernel (ie. builtin).

FWIW, I'm okay with this. There isn't a solution that feels entirely
"right", so either a build-time requirement like this, or using an
exception for modules like this:

arch/x86/kernel/cpu/common.c:#if IS_MODULE(CONFIG_LKDTM)
arch/x86/kernel/cpu/common.c-EXPORT_SYMBOL_GPL(native_write_cr4);
arch/x86/kernel/cpu/common.c-#endif

I think neither is great. Another idea is maybe using a name-spaced
export, like:

EXPORT_SYMBOL_NS_GPL(native_write_cr4, LKDTM);

But that still means it gets exposed to malicious discovery, so probably
not.

I suspect the best is to just do the BUILTIN check, since building LKDTM
as a module on a _production_ kernel is rare if it exists at all. The
only downside is needing to completely reboot to perform updated tests,
but then, I frequently find myself breaking the kernel badly on bad
tests, so I have to reboot anyway. ;)

-Kees

-- 
Kees Cook

^ permalink raw reply

* Re: [PATCH v5 5/8] powerpc/64s: Introduce temporary mm for Radix MMU
From: Christopher M. Riedl @ 2021-08-11 18:02 UTC (permalink / raw)
  To: Christophe Leroy, linuxppc-dev
  Cc: keescook, peterz, x86, npiggin, linux-hardening, tglx, dja
In-Reply-To: <c8b2291e-57f9-6d9a-583e-4ec65b2c9bcb@csgroup.eu>

On Thu Aug 5, 2021 at 4:27 AM CDT, Christophe Leroy wrote:
>
>
> Le 13/07/2021 à 07:31, Christopher M. Riedl a écrit :
> > x86 supports the notion of a temporary mm which restricts access to
> > temporary PTEs to a single CPU. A temporary mm is useful for situations
> > where a CPU needs to perform sensitive operations (such as patching a
> > STRICT_KERNEL_RWX kernel) requiring temporary mappings without exposing
> > said mappings to other CPUs. Another benefit is that other CPU TLBs do
> > not need to be flushed when the temporary mm is torn down.
> > 
> > Mappings in the temporary mm can be set in the userspace portion of the
> > address-space.
> > 
> > Interrupts must be disabled while the temporary mm is in use. HW
> > breakpoints, which may have been set by userspace as watchpoints on
> > addresses now within the temporary mm, are saved and disabled when
> > loading the temporary mm. The HW breakpoints are restored when unloading
> > the temporary mm. All HW breakpoints are indiscriminately disabled while
> > the temporary mm is in use.
>
> Can you explain more about that breakpoint stuff ? Why is it a special
> case here at all ? Isn't it
> the same when you switch from one user task to another one ? x86 commit
> doesn't say anythink about
> breakpoints.
>

We do not check if the breakpoint is on a kernel address (perf can do
this IIUC) and just disable all of them. I had to dig, but x86 has a
comment with their implementation at arch/x86/kernel/alternative.c:743.

I can reword that part of the commit message if it's unclear.

> > 
> > Based on x86 implementation:
> > 
> > commit cefa929c034e
> > ("x86/mm: Introduce temporary mm structs")
> > 
> > Signed-off-by: Christopher M. Riedl <cmr@linux.ibm.com>
> > 
> > ---
> > 
> > v5:  * Drop support for using a temporary mm on Book3s64 Hash MMU.
> > 
> > v4:  * Pass the prev mm instead of NULL to switch_mm_irqs_off() when
> >         using/unusing the temp mm as suggested by Jann Horn to keep
> >         the context.active counter in-sync on mm/nohash.
> >       * Disable SLB preload in the temporary mm when initializing the
> >         temp_mm struct.
> >       * Include asm/debug.h header to fix build issue with
> >         ppc44x_defconfig.
> > ---
> >   arch/powerpc/include/asm/debug.h |  1 +
> >   arch/powerpc/kernel/process.c    |  5 +++
> >   arch/powerpc/lib/code-patching.c | 56 ++++++++++++++++++++++++++++++++
> >   3 files changed, 62 insertions(+)
> > 
> > diff --git a/arch/powerpc/include/asm/debug.h b/arch/powerpc/include/asm/debug.h
> > index 86a14736c76c3..dfd82635ea8b3 100644
> > --- a/arch/powerpc/include/asm/debug.h
> > +++ b/arch/powerpc/include/asm/debug.h
> > @@ -46,6 +46,7 @@ static inline int debugger_fault_handler(struct pt_regs *regs) { return 0; }
> >   #endif
> >   
> >   void __set_breakpoint(int nr, struct arch_hw_breakpoint *brk);
> > +void __get_breakpoint(int nr, struct arch_hw_breakpoint *brk);
> >   bool ppc_breakpoint_available(void);
> >   #ifdef CONFIG_PPC_ADV_DEBUG_REGS
> >   extern void do_send_trap(struct pt_regs *regs, unsigned long address,
> > diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c
> > index 185beb2905801..a0776200772e8 100644
> > --- a/arch/powerpc/kernel/process.c
> > +++ b/arch/powerpc/kernel/process.c
> > @@ -865,6 +865,11 @@ static inline int set_breakpoint_8xx(struct arch_hw_breakpoint *brk)
> >   	return 0;
> >   }
> >   
> > +void __get_breakpoint(int nr, struct arch_hw_breakpoint *brk)
> > +{
> > +	memcpy(brk, this_cpu_ptr(&current_brk[nr]), sizeof(*brk));
> > +}
> > +
> >   void __set_breakpoint(int nr, struct arch_hw_breakpoint *brk)
> >   {
> >   	memcpy(this_cpu_ptr(&current_brk[nr]), brk, sizeof(*brk));
> > diff --git a/arch/powerpc/lib/code-patching.c b/arch/powerpc/lib/code-patching.c
> > index 54b6157d44e95..3122d8e4cc013 100644
> > --- a/arch/powerpc/lib/code-patching.c
> > +++ b/arch/powerpc/lib/code-patching.c
> > @@ -17,6 +17,9 @@
> >   #include <asm/code-patching.h>
> >   #include <asm/setup.h>
> >   #include <asm/inst.h>
> > +#include <asm/mmu_context.h>
> > +#include <asm/debug.h>
> > +#include <asm/tlb.h>
> >   
> >   static int __patch_instruction(u32 *exec_addr, struct ppc_inst instr, u32 *patch_addr)
> >   {
> > @@ -45,6 +48,59 @@ int raw_patch_instruction(u32 *addr, struct ppc_inst instr)
> >   }
> >   
> >   #ifdef CONFIG_STRICT_KERNEL_RWX
> > +
> > +struct temp_mm {
> > +	struct mm_struct *temp;
> > +	struct mm_struct *prev;
> > +	struct arch_hw_breakpoint brk[HBP_NUM_MAX];
> > +};
> > +
> > +static inline void init_temp_mm(struct temp_mm *temp_mm, struct mm_struct *mm)
> > +{
> > +	/* We currently only support temporary mm on the Book3s64 Radix MMU */
> > +	WARN_ON(!radix_enabled());
> > +
> > +	temp_mm->temp = mm;
> > +	temp_mm->prev = NULL;
> > +	memset(&temp_mm->brk, 0, sizeof(temp_mm->brk));
> > +}
> > +
> > +static inline void use_temporary_mm(struct temp_mm *temp_mm)
> > +{
> > +	lockdep_assert_irqs_disabled();
> > +
> > +	temp_mm->prev = current->active_mm;
> > +	switch_mm_irqs_off(temp_mm->prev, temp_mm->temp, current);
> > +
> > +	WARN_ON(!mm_is_thread_local(temp_mm->temp));
> > +
> > +	if (ppc_breakpoint_available()) {
> > +		struct arch_hw_breakpoint null_brk = {0};
> > +		int i = 0;
> > +
> > +		for (; i < nr_wp_slots(); ++i) {
> > +			__get_breakpoint(i, &temp_mm->brk[i]);
> > +			if (temp_mm->brk[i].type != 0)
> > +				__set_breakpoint(i, &null_brk);
> > +		}
> > +	}
> > +}
> > +
> > +static inline void unuse_temporary_mm(struct temp_mm *temp_mm)
>
> not sure about the naming.
>
> Maybe start_using_temp_mm() and stop_using_temp_mm() would be more
> explicit.
>

Hehe I think we've discussed this before - naming things is hard :) I'll
take your suggestions for the next spin.

>
> > +{
> > +	lockdep_assert_irqs_disabled();
> > +
> > +	switch_mm_irqs_off(temp_mm->temp, temp_mm->prev, current);
> > +
> > +	if (ppc_breakpoint_available()) {
> > +		int i = 0;
> > +
> > +		for (; i < nr_wp_slots(); ++i)
> > +			if (temp_mm->brk[i].type != 0)
> > +				__set_breakpoint(i, &temp_mm->brk[i]);
> > +	}
> > +}
> > +
> >   static DEFINE_PER_CPU(struct vm_struct *, text_poke_area);
> >   
> >   #if IS_BUILTIN(CONFIG_LKDTM)
> > 
>
> You'll probably get a bisecting hasard with those unused 'static inline'
> functions in a .c file
> because that patch alone probably fails build.

I just built the patch without any issue. The compiler only complains
for unused 'static' (non-inline) functions right?

^ permalink raw reply

* Re: [PATCH v5 8/8] lkdtm/powerpc: Fix code patching hijack test
From: Christopher M. Riedl @ 2021-08-11 17:57 UTC (permalink / raw)
  To: Christophe Leroy, linuxppc-dev
  Cc: keescook, peterz, x86, npiggin, linux-hardening, tglx, dja
In-Reply-To: <7a6c97ed-815b-49fc-5568-ab4420f53122@csgroup.eu>

On Thu Aug 5, 2021 at 4:18 AM CDT, Christophe Leroy wrote:
>
>
> Le 13/07/2021 à 07:31, Christopher M. Riedl a écrit :
> > Code patching on powerpc with a STRICT_KERNEL_RWX uses a userspace
> > address in a temporary mm on Radix now. Use __put_user() to avoid write
> > failures due to KUAP when attempting a "hijack" on the patching address.
> > __put_user() also works with the non-userspace, vmalloc-based patching
> > address on non-Radix MMUs.
>
> It is not really clean to use __put_user() on non user address,
> allthought it works by change.
>
> I think it would be better to do something like
>
> if (is_kernel_addr(addr))
> copy_to_kernel_nofault(...);
> else
> copy_to_user_nofault(...);
>

Yes that looks much better. I'll pick this up and try it for the next
spin. Thanks!

>
>
> > 
> > Signed-off-by: Christopher M. Riedl <cmr@linux.ibm.com>
> > ---
> >   drivers/misc/lkdtm/perms.c | 9 ---------
> >   1 file changed, 9 deletions(-)
> > 
> > diff --git a/drivers/misc/lkdtm/perms.c b/drivers/misc/lkdtm/perms.c
> > index 41e87e5f9cc86..da6a34a0a49fb 100644
> > --- a/drivers/misc/lkdtm/perms.c
> > +++ b/drivers/misc/lkdtm/perms.c
> > @@ -262,16 +262,7 @@ static inline u32 lkdtm_read_patch_site(void)
> >   /* Returns True if the write succeeds */
> >   static inline bool lkdtm_try_write(u32 data, u32 *addr)
> >   {
> > -#ifdef CONFIG_PPC
> > -	__put_kernel_nofault(addr, &data, u32, err);
> > -	return true;
> > -
> > -err:
> > -	return false;
> > -#endif
> > -#ifdef CONFIG_X86_64
> >   	return !__put_user(data, addr);
> > -#endif
> >   }
> >   
> >   static int lkdtm_patching_cpu(void *data)
> > 


^ permalink raw reply

* Re: [PATCH v5 2/8] lkdtm/powerpc: Add test to hijack a patch mapping
From: Christopher M. Riedl @ 2021-08-11 17:57 UTC (permalink / raw)
  To: Christophe Leroy, linuxppc-dev
  Cc: keescook, peterz, x86, npiggin, linux-hardening, tglx, dja
In-Reply-To: <f933e9de-ff3b-aa5a-bb6e-55770d5ab868@csgroup.eu>

On Thu Aug 5, 2021 at 4:13 AM CDT, Christophe Leroy wrote:
>
>
> Le 13/07/2021 à 07:31, Christopher M. Riedl a écrit :
> > When live patching with STRICT_KERNEL_RWX the CPU doing the patching
> > must temporarily remap the page(s) containing the patch site with +W
> > permissions. While this temporary mapping is in use, another CPU could
> > write to the same mapping and maliciously alter kernel text. Implement a
> > LKDTM test to attempt to exploit such an opening during code patching.
> > The test is implemented on powerpc and requires LKDTM built into the
> > kernel (building LKDTM as a module is insufficient).
> > 
> > The LKDTM "hijack" test works as follows:
> > 
> >    1. A CPU executes an infinite loop to patch an instruction. This is
> >       the "patching" CPU.
> >    2. Another CPU attempts to write to the address of the temporary
> >       mapping used by the "patching" CPU. This other CPU is the
> >       "hijacker" CPU. The hijack either fails with a fault/error or
> >       succeeds, in which case some kernel text is now overwritten.
> > 
> > The virtual address of the temporary patch mapping is provided via an
> > LKDTM-specific accessor to the hijacker CPU. This test assumes a
> > hypothetical situation where this address was leaked previously.
> > 
> > How to run the test:
> > 
> > 	mount -t debugfs none /sys/kernel/debug
> > 	(echo HIJACK_PATCH > /sys/kernel/debug/provoke-crash/DIRECT)
> > 
> > A passing test indicates that it is not possible to overwrite kernel
> > text from another CPU by using the temporary mapping established by
> > a CPU for patching.
> > 
> > Signed-off-by: Christopher M. Riedl <cmr@linux.ibm.com>
> > 
> > ---
> > 
> > v5:  * Use `u32*` instead of `struct ppc_inst*` based on new series in
> >         upstream.
> > 
> > v4:  * Separate the powerpc and x86_64 bits into individual patches.
> >       * Use __put_kernel_nofault() when attempting to hijack the mapping
> >       * Use raw_smp_processor_id() to avoid triggering the BUG() when
> >         calling smp_processor_id() in preemptible code - the only thing
> >         that matters is that one of the threads is bound to a different
> >         CPU - we are not using smp_processor_id() to access any per-cpu
> >         data or similar where preemption should be disabled.
> >       * Rework the patching_cpu() kthread stop condition to avoid:
> >         https://lwn.net/Articles/628628/
> > ---
> >   drivers/misc/lkdtm/core.c  |   1 +
> >   drivers/misc/lkdtm/lkdtm.h |   1 +
> >   drivers/misc/lkdtm/perms.c | 134 +++++++++++++++++++++++++++++++++++++
> >   3 files changed, 136 insertions(+)
> > 
> > diff --git a/drivers/misc/lkdtm/core.c b/drivers/misc/lkdtm/core.c
> > index 8024b6a5cc7fc..fbcb95eda337b 100644
> > --- a/drivers/misc/lkdtm/core.c
> > +++ b/drivers/misc/lkdtm/core.c
> > @@ -147,6 +147,7 @@ static const struct crashtype crashtypes[] = {
> >   	CRASHTYPE(WRITE_RO),
> >   	CRASHTYPE(WRITE_RO_AFTER_INIT),
> >   	CRASHTYPE(WRITE_KERN),
> > +	CRASHTYPE(HIJACK_PATCH),
> >   	CRASHTYPE(REFCOUNT_INC_OVERFLOW),
> >   	CRASHTYPE(REFCOUNT_ADD_OVERFLOW),
> >   	CRASHTYPE(REFCOUNT_INC_NOT_ZERO_OVERFLOW),
> > diff --git a/drivers/misc/lkdtm/lkdtm.h b/drivers/misc/lkdtm/lkdtm.h
> > index 99f90d3e5e9cb..87e7e6136d962 100644
> > --- a/drivers/misc/lkdtm/lkdtm.h
> > +++ b/drivers/misc/lkdtm/lkdtm.h
> > @@ -62,6 +62,7 @@ void lkdtm_EXEC_USERSPACE(void);
> >   void lkdtm_EXEC_NULL(void);
> >   void lkdtm_ACCESS_USERSPACE(void);
> >   void lkdtm_ACCESS_NULL(void);
> > +void lkdtm_HIJACK_PATCH(void);
> >   
> >   /* refcount.c */
> >   void lkdtm_REFCOUNT_INC_OVERFLOW(void);
> > diff --git a/drivers/misc/lkdtm/perms.c b/drivers/misc/lkdtm/perms.c
> > index 2dede2ef658f3..39e7456852229 100644
> > --- a/drivers/misc/lkdtm/perms.c
> > +++ b/drivers/misc/lkdtm/perms.c
> > @@ -9,6 +9,7 @@
> >   #include <linux/vmalloc.h>
> >   #include <linux/mman.h>
> >   #include <linux/uaccess.h>
> > +#include <linux/kthread.h>
> >   #include <asm/cacheflush.h>
> >   
> >   /* Whether or not to fill the target memory area with do_nothing(). */
> > @@ -222,6 +223,139 @@ void lkdtm_ACCESS_NULL(void)
> >   	pr_err("FAIL: survived bad write\n");
> >   }
> >   
> > +#if (IS_BUILTIN(CONFIG_LKDTM) && defined(CONFIG_STRICT_KERNEL_RWX) && \
> > +	defined(CONFIG_PPC))
>
>
> I think this test shouldn't be limited to CONFIG_PPC and shouldn't be
> limited to
> CONFIG_STRICT_KERNEL_RWX. It should be there all the time.
>
> Also why limiting it to IS_BUILTIN(CONFIG_LKDTM) ?
>

The test needs read_cpu_patching_addr() which definitely cannot be
exposed outside of the kernel (ie. builtin).

> > +/*
> > + * This is just a dummy location to patch-over.
> > + */
> > +static void patching_target(void)
> > +{
> > +	return;
> > +}
> > +
> > +#include <asm/code-patching.h>
> > +const u32 *patch_site = (const u32 *)&patching_target;
> > +
> > +static inline int lkdtm_do_patch(u32 data)
> > +{
> > +	return patch_instruction((u32 *)patch_site, ppc_inst(data));
> > +}
> > +
> > +static inline u32 lkdtm_read_patch_site(void)
> > +{
> > +	return READ_ONCE(*patch_site);
> > +}
> > +
> > +/* Returns True if the write succeeds */
> > +static inline bool lkdtm_try_write(u32 data, u32 *addr)
> > +{
> > +	__put_kernel_nofault(addr, &data, u32, err);
> > +	return true;
> > +
> > +err:
> > +	return false;
> > +}
> > +
> > +static int lkdtm_patching_cpu(void *data)
> > +{
> > +	int err = 0;
> > +	u32 val = 0xdeadbeef;
> > +
> > +	pr_info("starting patching_cpu=%d\n", raw_smp_processor_id());
> > +
> > +	do {
> > +		err = lkdtm_do_patch(val);
> > +	} while (lkdtm_read_patch_site() == val && !err && !kthread_should_stop());
> > +
> > +	if (err)
> > +		pr_warn("XFAIL: patch_instruction returned error: %d\n", err);
> > +
> > +	while (!kthread_should_stop()) {
> > +		set_current_state(TASK_INTERRUPTIBLE);
> > +		schedule();
> > +	}
> > +
> > +	return err;
> > +}
> > +
> > +void lkdtm_HIJACK_PATCH(void)
> > +{
> > +	struct task_struct *patching_kthrd;
> > +	int patching_cpu, hijacker_cpu, attempts;
> > +	unsigned long addr;
> > +	bool hijacked;
> > +	const u32 bad_data = 0xbad00bad;
> > +	const u32 original_insn = lkdtm_read_patch_site();
> > +
> > +	if (!IS_ENABLED(CONFIG_SMP)) {
> > +		pr_err("XFAIL: this test requires CONFIG_SMP\n");
> > +		return;
> > +	}
> > +
> > +	if (num_online_cpus() < 2) {
> > +		pr_warn("XFAIL: this test requires at least two cpus\n");
> > +		return;
> > +	}
> > +
> > +	hijacker_cpu = raw_smp_processor_id();
> > +	patching_cpu = cpumask_any_but(cpu_online_mask, hijacker_cpu);
> > +
> > +	patching_kthrd = kthread_create_on_node(&lkdtm_patching_cpu, NULL,
> > +						cpu_to_node(patching_cpu),
> > +						"lkdtm_patching_cpu");
> > +	kthread_bind(patching_kthrd, patching_cpu);
> > +	wake_up_process(patching_kthrd);
> > +
> > +	addr = offset_in_page(patch_site) | read_cpu_patching_addr(patching_cpu);
> > +
> > +	pr_info("starting hijacker_cpu=%d\n", hijacker_cpu);
> > +	for (attempts = 0; attempts < 100000; ++attempts) {
> > +		/* Try to write to the other CPU's temp patch mapping */
> > +		hijacked = lkdtm_try_write(bad_data, (u32 *)addr);
> > +
> > +		if (hijacked) {
> > +			if (kthread_stop(patching_kthrd)) {
> > +				pr_info("hijack attempts: %d\n", attempts);
> > +				pr_err("XFAIL: error stopping patching cpu\n");
> > +				return;
> > +			}
> > +			break;
> > +		}
> > +	}
> > +	pr_info("hijack attempts: %d\n", attempts);
> > +
> > +	if (hijacked) {
> > +		if (lkdtm_read_patch_site() == bad_data)
> > +			pr_err("overwrote kernel text\n");
> > +		/*
> > +		 * There are window conditions where the hijacker cpu manages to
> > +		 * write to the patch site but the site gets overwritten again by
> > +		 * the patching cpu. We still consider that a "successful" hijack
> > +		 * since the hijacker cpu did not fault on the write.
> > +		 */
> > +		pr_err("FAIL: wrote to another cpu's patching area\n");
> > +	} else {
> > +		kthread_stop(patching_kthrd);
> > +	}
> > +
> > +	/* Restore the original data to be able to run the test again */
> > +	lkdtm_do_patch(original_insn);
> > +}
> > +
> > +#else
> > +
> > +void lkdtm_HIJACK_PATCH(void)
> > +{
> > +	if (!IS_ENABLED(CONFIG_PPC))
> > +		pr_err("XFAIL: this test only runs on powerpc\n");
> > +	if (!IS_ENABLED(CONFIG_STRICT_KERNEL_RWX))
> > +		pr_err("XFAIL: this test requires CONFIG_STRICT_KERNEL_RWX\n");
> > +	if (!IS_BUILTIN(CONFIG_LKDTM))
> > +		pr_err("XFAIL: this test requires CONFIG_LKDTM=y (not =m!)\n");
> > +}
> > +
> > +#endif
> > +
> >   void __init lkdtm_perms_init(void)
> >   {
> >   	/* Make sure we can write to __ro_after_init values during __init */
> > 


^ permalink raw reply

* Re: [PATCH v5 4/8] lkdtm/x86_64: Add test to hijack a patch mapping
From: Christopher M. Riedl @ 2021-08-11 17:53 UTC (permalink / raw)
  To: Christophe Leroy, linuxppc-dev
  Cc: keescook, peterz, x86, npiggin, linux-hardening, tglx, dja
In-Reply-To: <95de5ec5-8d48-c969-3c9f-966561f9f58e@csgroup.eu>

On Thu Aug 5, 2021 at 4:09 AM CDT, Christophe Leroy wrote:
>
>
> Le 13/07/2021 à 07:31, Christopher M. Riedl a écrit :
> > A previous commit implemented an LKDTM test on powerpc to exploit the
> > temporary mapping established when patching code with STRICT_KERNEL_RWX
> > enabled. Extend the test to work on x86_64 as well.
> > 
> > Signed-off-by: Christopher M. Riedl <cmr@linux.ibm.com>
> > ---
> >   drivers/misc/lkdtm/perms.c | 26 ++++++++++++++++++++++----
> >   1 file changed, 22 insertions(+), 4 deletions(-)
> > 
> > diff --git a/drivers/misc/lkdtm/perms.c b/drivers/misc/lkdtm/perms.c
> > index 39e7456852229..41e87e5f9cc86 100644
> > --- a/drivers/misc/lkdtm/perms.c
> > +++ b/drivers/misc/lkdtm/perms.c
> > @@ -224,7 +224,7 @@ void lkdtm_ACCESS_NULL(void)
> >   }
> >   
> >   #if (IS_BUILTIN(CONFIG_LKDTM) && defined(CONFIG_STRICT_KERNEL_RWX) && \
> > -	defined(CONFIG_PPC))
> > +	(defined(CONFIG_PPC) || defined(CONFIG_X86_64)))
> >   /*
> >    * This is just a dummy location to patch-over.
> >    */
> > @@ -233,12 +233,25 @@ static void patching_target(void)
> >   	return;
> >   }
> >   
> > -#include <asm/code-patching.h>
> >   const u32 *patch_site = (const u32 *)&patching_target;
> >   
> > +#ifdef CONFIG_PPC
> > +#include <asm/code-patching.h>
> > +#endif
> > +
> > +#ifdef CONFIG_X86_64
> > +#include <asm/text-patching.h>
> > +#endif
> > +
> >   static inline int lkdtm_do_patch(u32 data)
> >   {
> > +#ifdef CONFIG_PPC
> >   	return patch_instruction((u32 *)patch_site, ppc_inst(data));
> > +#endif
> > +#ifdef CONFIG_X86_64
> > +	text_poke((void *)patch_site, &data, sizeof(u32));
> > +	return 0;
> > +#endif
> >   }
> >   
> >   static inline u32 lkdtm_read_patch_site(void)
> > @@ -249,11 +262,16 @@ static inline u32 lkdtm_read_patch_site(void)
> >   /* Returns True if the write succeeds */
> >   static inline bool lkdtm_try_write(u32 data, u32 *addr)
> >   {
> > +#ifdef CONFIG_PPC
> >   	__put_kernel_nofault(addr, &data, u32, err);
> >   	return true;
> >   
> >   err:
> >   	return false;
> > +#endif
> > +#ifdef CONFIG_X86_64
> > +	return !__put_user(data, addr);
> > +#endif
> >   }
> >   
> >   static int lkdtm_patching_cpu(void *data)
> > @@ -346,8 +364,8 @@ void lkdtm_HIJACK_PATCH(void)
> >   
> >   void lkdtm_HIJACK_PATCH(void)
> >   {
> > -	if (!IS_ENABLED(CONFIG_PPC))
> > -		pr_err("XFAIL: this test only runs on powerpc\n");
> > +	if (!IS_ENABLED(CONFIG_PPC) && !IS_ENABLED(CONFIG_X86_64))
> > +		pr_err("XFAIL: this test only runs on powerpc and x86_64\n");
> >   	if (!IS_ENABLED(CONFIG_STRICT_KERNEL_RWX))
> >   		pr_err("XFAIL: this test requires CONFIG_STRICT_KERNEL_RWX\n");
> >   	if (!IS_BUILTIN(CONFIG_LKDTM))
> > 
>
> Instead of spreading arch specific stuff into LKDTM, wouldn't it make
> sence to define common a
> common API ? Because the day another arch like arm64 implements it own
> approach, do we add specific
> functions again and again into LKDTM ?

Hmm a common patch/poke kernel API is probably out of scope for this
series? I do agree though - since you suggested splitting the series
maybe that's something I can add along with the LKDTM patches.

>
> Also, I find it odd to define tests only when they can succeed. For
> other tests like
> ACCESS_USERSPACE, they are there all the time, regardless of whether we
> have selected
> CONFIG_PPC_KUAP or not. I think it should be the same here, have it all
> there time, if
> CONFIG_STRICT_KERNEL_RWX is selected the test succeeds otherwise it
> fails, but it is always there.

I followed the approach in lkdtm_DOUBLE_FAULT and others in
drivers/misc/lkdtm/bugs.c. I suppose it doesn't hurt to always build the
test irrespective of CONFIG_STRICT_KERNEL_RWX.

>
> Christophe


^ permalink raw reply

* Re: [PATCH v5 0/8] Use per-CPU temporary mappings for patching on Radix MMU
From: Christopher M. Riedl @ 2021-08-11 17:49 UTC (permalink / raw)
  To: Christophe Leroy, linuxppc-dev
  Cc: keescook, peterz, x86, npiggin, linux-hardening, tglx, dja
In-Reply-To: <0c3cb140-23ad-0eb6-7df9-633aa51a097c@csgroup.eu>

On Thu Aug 5, 2021 at 4:03 AM CDT, Christophe Leroy wrote:
>
>
> Le 13/07/2021 à 07:31, Christopher M. Riedl a écrit :
> > When compiled with CONFIG_STRICT_KERNEL_RWX, the kernel must create
> > temporary mappings when patching itself. These mappings temporarily
> > override the strict RWX text protections to permit a write. Currently,
> > powerpc allocates a per-CPU VM area for patching. Patching occurs as
> > follows:
> > 
> > 	1. Map page in per-CPU VM area w/ PAGE_KERNEL protection
> > 	2. Patch text
> > 	3. Remove the temporary mapping
> > 
> > While the VM area is per-CPU, the mapping is actually inserted into the
> > kernel page tables. Presumably, this could allow another CPU to access
> > the normally write-protected text - either malicously or accidentally -
> > via this same mapping if the address of the VM area is known. Ideally,
> > the mapping should be kept local to the CPU doing the patching [0].
> > 
> > x86 introduced "temporary mm" structs which allow the creation of mappings
> > local to a particular CPU [1]. This series intends to bring the notion of a
> > temporary mm to powerpc's Book3s64 Radix MMU and harden it by using such a
> > mapping for patching a kernel with strict RWX permissions.
> > 
> > The first four patches implement an LKDTM test "proof-of-concept" which
> > exploits the potential vulnerability (ie. the temporary mapping during patching
> > is exposed in the kernel page tables and accessible by other CPUs) using a
> > simple brute-force approach. This test is implemented for both powerpc and
> > x86_64. The test passes on powerpc Radix with this new series, fails on
> > upstream powerpc, passes on upstream x86_64, and fails on an older (ancient)
> > x86_64 tree without the x86_64 temporary mm patches. The remaining patches add
> > support for and use a temporary mm for code patching on powerpc with the Radix
> > MMU.
>
> I think four first patches (together with last one) are quite
> independent from the heart of the
> series itself which is patches 5, 6, 7. Maybe you should split that
> series it two series ? After all
> those selftests are nice to have but are not absolutely necessary, that
> would help getting forward I
> think.
>

Hmm you're probably right. The selftest at least proves there is a
potential attack which I think is necessary for any hardening related
series/patch. I'll split the series into separate powerpc temp mm and
LKDTM series for the next spin.

> > 
> > Tested boot, ftrace, and repeated LKDTM "hijack":
> > 	- QEMU+KVM (host: POWER9 Blackbird): Radix MMU w/ KUAP
> > 	- QEMU+KVM (host: POWER9 Blackbird): Hash MMU
> > 
> > Tested repeated LKDTM "hijack":
> > 	- QEMU+KVM (host: AMD desktop): x86_64 upstream
> > 	- QEMU+KVM (host: AMD desktop): x86_64 w/o percpu temp mm to
> > 	  verify the LKDTM "hijack" test fails
> > 
> > Tested boot and ftrace:
> > 	- QEMU+TCG: ppc44x (bamboo)
> > 	- QEMU+TCG: g5 (mac99)
> > 
> > I also tested with various extra config options enabled as suggested in
> > section 12) in Documentation/process/submit-checklist.rst.
> > 
> > v5:	* Only support Book3s64 Radix MMU for now. There are some issues with
> > 	  the previous implementation on the Hash MMU as pointed out by Nick
> > 	  Piggin. Fixing these is not trivial so we only support the Radix MMU
> > 	  for now. I tried using realmode (no data translation) to patch with
> > 	  Hash to at least avoid exposing the patch mapping to other CPUs but
> > 	  this doesn't reliably work either since we cannot access vmalloc'd
> > 	  space in realmode.
>
> So you now accept to have two different mode depending on the platform ?

By necessity yes.

> As far as I remember I commented some time ago that non SMP didn't need
> that feature and you were
> reluctant to have two different implementations. What made you change
> your mind ? (just curious).
>

The book3s64 hash mmu support is a pain ;) Supporting both the temp-mm
and vmalloc implementations turned out to be relatively simple - I
initially thought this would be messier. For now we will support both;
however, in the future I'd still like to implement the percpu temp-mm
support for the Hash MMU as well. I suppose we could re-evaluate then if
we want/need both implementations (I know you're in favor of keeping the
vmalloc-based approach for performance reasons on non-SMP).

>
> > 	* Use percpu variables for the patching_mm and patching_addr. This
> > 	  avoids the need for synchronization mechanisms entirely. Thanks to
> > 	  Peter Zijlstra for pointing out text_mutex which unfortunately didn't
> > 	  work out without larger code changes in powerpc. Also thanks to Daniel
> > 	  Axtens for comments about using percpu variables for the *percpu* temp
> > 	  mm things off list.
> > 
> > v4:	* It's time to revisit this series again since @jpn and @mpe fixed
> > 	  our known STRICT_*_RWX bugs on powerpc/64s.
> > 	* Rebase on linuxppc/next:
> >            commit ee1bc694fbaec ("powerpc/kvm: Fix build error when PPC_MEM_KEYS/PPC_PSERIES=n")
> > 	* Completely rework how map_patch() works on book3s64 Hash MMU
> > 	* Split the LKDTM x86_64 and powerpc bits into separate patches
> > 	* Annotate commit messages with changes from v3 instead of
> > 	  listing them here completely out-of context...
> > 
> > v3:	* Rebase on linuxppc/next: commit 9123e3a74ec7 ("Linux 5.9-rc1")
> > 	* Move temporary mm implementation into code-patching.c where it
> > 	  belongs
> > 	* Implement LKDTM hijacker test on x86_64 (on IBM time oof) Do
> > 	* not use address zero for the patching address in the
> > 	  temporary mm (thanks @dja for pointing this out!)
> > 	* Wrap the LKDTM test w/ CONFIG_SMP as suggested by Christophe
> > 	  Leroy
> > 	* Comments to clarify PTE pre-allocation and patching addr
> > 	  selection
> > 
> > v2:	* Rebase on linuxppc/next:
> > 	  commit 105fb38124a4 ("powerpc/8xx: Modify ptep_get()")
> > 	* Always dirty pte when mapping patch
> > 	* Use `ppc_inst_len` instead of `sizeof` on instructions
> > 	* Declare LKDTM patching addr accessor in header where it belongs	
> > 
> > v1:	* Rebase on linuxppc/next (4336b9337824)
> > 	* Save and restore second hw watchpoint
> > 	* Use new ppc_inst_* functions for patching check and in LKDTM test
> > 
> > rfc-v2:	* Many fixes and improvements mostly based on extensive feedback
> >            and testing by Christophe Leroy (thanks!).
> > 	* Make patching_mm and patching_addr static and move
> > 	  '__ro_after_init' to after the variable name (more common in
> > 	  other parts of the kernel)
> > 	* Use 'asm/debug.h' header instead of 'asm/hw_breakpoint.h' to
> > 	  fix PPC64e compile
> > 	* Add comment explaining why we use BUG_ON() during the init
> > 	  call to setup for patching later
> > 	* Move ptep into patch_mapping to avoid walking page tables a
> > 	  second time when unmapping the temporary mapping
> > 	* Use KUAP under non-radix, also manually dirty the PTE for patch
> > 	  mapping on non-BOOK3S_64 platforms
> > 	* Properly return any error from __patch_instruction
> >          * Do not use 'memcmp' where a simple comparison is appropriate
> > 	* Simplify expression for patch address by removing pointer maths
> > 	* Add LKDTM test
> > 
> > [0]: https://github.com/linuxppc/issues/issues/224
> > [1]: https://lore.kernel.org/kernel-hardening/20190426232303.28381-1-nadav.amit@gmail.com/
> > 
> > Christopher M. Riedl (8):
> >    powerpc: Add LKDTM accessor for patching addr
> >    lkdtm/powerpc: Add test to hijack a patch mapping
> >    x86_64: Add LKDTM accessor for patching addr
> >    lkdtm/x86_64: Add test to hijack a patch mapping
> >    powerpc/64s: Introduce temporary mm for Radix MMU
> >    powerpc: Rework and improve STRICT_KERNEL_RWX patching
> >    powerpc/64s: Initialize and use a temporary mm for patching on Radix
> >    lkdtm/powerpc: Fix code patching hijack test
> > 
> >   arch/powerpc/include/asm/code-patching.h |   4 +
> >   arch/powerpc/include/asm/debug.h         |   1 +
> >   arch/powerpc/kernel/process.c            |   5 +
> >   arch/powerpc/lib/code-patching.c         | 240 ++++++++++++++++++++---
> >   arch/x86/include/asm/text-patching.h     |   4 +
> >   arch/x86/kernel/alternative.c            |   7 +
> >   drivers/misc/lkdtm/core.c                |   1 +
> >   drivers/misc/lkdtm/lkdtm.h               |   1 +
> >   drivers/misc/lkdtm/perms.c               | 143 ++++++++++++++
> >   9 files changed, 378 insertions(+), 28 deletions(-)
> > 


^ permalink raw reply

* [PATCH v2 60/60] KVM: PPC: Book3S HV P9: Remove subcore HMI handling
From: Nicholas Piggin @ 2021-08-11 16:01 UTC (permalink / raw)
  To: kvm-ppc; +Cc: linuxppc-dev, Nicholas Piggin
In-Reply-To: <20210811160134.904987-1-npiggin@gmail.com>

On POWER9 and newer, rather than the complex HMI synchronisation and
subcore state, have each thread un-apply the guest TB offset before
calling into the early HMI handler.

This allows the subcore state to be avoided, including subcore enter
/ exit guest, which includes an expensive divide that shows up
slightly in profiles.

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
 arch/powerpc/include/asm/kvm_ppc.h    |  1 +
 arch/powerpc/kvm/book3s_hv.c          | 12 +++---
 arch/powerpc/kvm/book3s_hv_hmi.c      |  7 +++-
 arch/powerpc/kvm/book3s_hv_p9_entry.c |  2 +-
 arch/powerpc/kvm/book3s_hv_ras.c      | 54 +++++++++++++++++++++++++++
 5 files changed, 67 insertions(+), 9 deletions(-)

diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h
index 2d88944f9f34..6355a6980ccf 100644
--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@@ -760,6 +760,7 @@ void kvmppc_realmode_machine_check(struct kvm_vcpu *vcpu);
 void kvmppc_subcore_enter_guest(void);
 void kvmppc_subcore_exit_guest(void);
 long kvmppc_realmode_hmi_handler(void);
+long kvmppc_p9_realmode_hmi_handler(struct kvm_vcpu *vcpu);
 long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags,
                     long pte_index, unsigned long pteh, unsigned long ptel);
 long kvmppc_h_remove(struct kvm_vcpu *vcpu, unsigned long flags,
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index e7f4525f2a74..f1f343307578 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -4017,8 +4017,6 @@ static int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, u64 time_limit,
 
 	vcpu->arch.ceded = 0;
 
-	kvmppc_subcore_enter_guest();
-
 	vcpu_vpa_increment_dispatch(vcpu);
 
 	if (kvmhv_on_pseries()) {
@@ -4071,8 +4069,6 @@ static int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, u64 time_limit,
 
 	vcpu_vpa_increment_dispatch(vcpu);
 
-	kvmppc_subcore_exit_guest();
-
 	return trap;
 }
 
@@ -6054,9 +6050,11 @@ static int kvmppc_book3s_init_hv(void)
 	if (r)
 		return r;
 
-	r = kvm_init_subcore_bitmap();
-	if (r)
-		return r;
+	if (!cpu_has_feature(CPU_FTR_ARCH_300)) {
+		r = kvm_init_subcore_bitmap();
+		if (r)
+			return r;
+	}
 
 	/*
 	 * We need a way of accessing the XICS interrupt controller,
diff --git a/arch/powerpc/kvm/book3s_hv_hmi.c b/arch/powerpc/kvm/book3s_hv_hmi.c
index 9af660476314..1ec50c69678b 100644
--- a/arch/powerpc/kvm/book3s_hv_hmi.c
+++ b/arch/powerpc/kvm/book3s_hv_hmi.c
@@ -20,10 +20,15 @@ void wait_for_subcore_guest_exit(void)
 
 	/*
 	 * NULL bitmap pointer indicates that KVM module hasn't
-	 * been loaded yet and hence no guests are running.
+	 * been loaded yet and hence no guests are running, or running
+	 * on POWER9 or newer CPU.
+	 *
 	 * If no KVM is in use, no need to co-ordinate among threads
 	 * as all of them will always be in host and no one is going
 	 * to modify TB other than the opal hmi handler.
+	 *
+	 * POWER9 and newer don't need this synchronisation.
+	 *
 	 * Hence, just return from here.
 	 */
 	if (!local_paca->sibling_subcore_state)
diff --git a/arch/powerpc/kvm/book3s_hv_p9_entry.c b/arch/powerpc/kvm/book3s_hv_p9_entry.c
index 1e18c089478e..7d31ad3de723 100644
--- a/arch/powerpc/kvm/book3s_hv_p9_entry.c
+++ b/arch/powerpc/kvm/book3s_hv_p9_entry.c
@@ -934,7 +934,7 @@ int kvmhv_vcpu_entry_p9(struct kvm_vcpu *vcpu, u64 time_limit, unsigned long lpc
 		kvmppc_realmode_machine_check(vcpu);
 
 	} else if (unlikely(trap == BOOK3S_INTERRUPT_HMI)) {
-		kvmppc_realmode_hmi_handler();
+		kvmppc_p9_realmode_hmi_handler(vcpu);
 
 	} else if (trap == BOOK3S_INTERRUPT_H_EMUL_ASSIST) {
 		vcpu->arch.emul_inst = mfspr(SPRN_HEIR);
diff --git a/arch/powerpc/kvm/book3s_hv_ras.c b/arch/powerpc/kvm/book3s_hv_ras.c
index d4bca93b79f6..3f94f4080d04 100644
--- a/arch/powerpc/kvm/book3s_hv_ras.c
+++ b/arch/powerpc/kvm/book3s_hv_ras.c
@@ -136,6 +136,60 @@ void kvmppc_realmode_machine_check(struct kvm_vcpu *vcpu)
 	vcpu->arch.mce_evt = mce_evt;
 }
 
+
+long kvmppc_p9_realmode_hmi_handler(struct kvm_vcpu *vcpu)
+{
+	struct kvmppc_vcore *vc = vcpu->arch.vcore;
+	long ret = 0;
+
+	/*
+	 * Unapply and clear the offset first. That way, if the TB was not
+	 * resynced then it will remain in host-offset, and if it was resynced
+	 * then it is brought into host-offset. Then the tb offset is
+	 * re-applied before continuing with the KVM exit.
+	 *
+	 * This way, we don't need to actualy know whether not OPAL resynced
+	 * the timebase or do any of the complicated dance that the P7/8
+	 * path requires.
+	 */
+	if (vc->tb_offset_applied) {
+		u64 new_tb = mftb() - vc->tb_offset_applied;
+		mtspr(SPRN_TBU40, new_tb);
+		if ((mftb() & 0xffffff) < (new_tb & 0xffffff)) {
+			new_tb += 0x1000000;
+			mtspr(SPRN_TBU40, new_tb);
+		}
+		vc->tb_offset_applied = 0;
+	}
+
+	local_paca->hmi_irqs++;
+
+	if (hmi_handle_debugtrig(NULL) >= 0) {
+		ret = 1;
+		goto out;
+	}
+
+	if (ppc_md.hmi_exception_early)
+		ppc_md.hmi_exception_early(NULL);
+
+out:
+	if (vc->tb_offset) {
+		u64 new_tb = mftb() + vc->tb_offset;
+		mtspr(SPRN_TBU40, new_tb);
+		if ((mftb() & 0xffffff) < (new_tb & 0xffffff)) {
+			new_tb += 0x1000000;
+			mtspr(SPRN_TBU40, new_tb);
+		}
+		vc->tb_offset_applied = vc->tb_offset;
+	}
+
+	return ret;
+}
+
+/*
+ * The following subcore HMI handling is all only for pre-POWER9 CPUs.
+ */
+
 /* Check if dynamic split is in force and return subcore size accordingly. */
 static inline int kvmppc_cur_subcore_size(void)
 {
-- 
2.23.0


^ permalink raw reply related

* [PATCH v2 59/60] KVM: PPC: Book3S HV P9: Stop using vc->dpdes
From: Nicholas Piggin @ 2021-08-11 16:01 UTC (permalink / raw)
  To: kvm-ppc; +Cc: linuxppc-dev, Nicholas Piggin
In-Reply-To: <20210811160134.904987-1-npiggin@gmail.com>

The P9 path uses vc->dpdes only for msgsndp / SMT emulation. This adds
an ordering requirement between vcpu->doorbell_request and vc->dpdes for
no real benefit. Use vcpu->doorbell_request directly.

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
 arch/powerpc/kvm/book3s_hv.c          | 18 ++++++++++--------
 arch/powerpc/kvm/book3s_hv_builtin.c  |  2 ++
 arch/powerpc/kvm/book3s_hv_p9_entry.c | 14 ++++++++++----
 3 files changed, 22 insertions(+), 12 deletions(-)

diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 1dc98e553997..e7f4525f2a74 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -761,6 +761,8 @@ static bool kvmppc_doorbell_pending(struct kvm_vcpu *vcpu)
 
 	if (vcpu->arch.doorbell_request)
 		return true;
+	if (cpu_has_feature(CPU_FTR_ARCH_300))
+		return false;
 	/*
 	 * Ensure that the read of vcore->dpdes comes after the read
 	 * of vcpu->doorbell_request.  This barrier matches the
@@ -2188,8 +2190,10 @@ static int kvmppc_get_one_reg_hv(struct kvm_vcpu *vcpu, u64 id,
 		 * either vcore->dpdes or doorbell_request.
 		 * On POWER8, doorbell_request is 0.
 		 */
-		*val = get_reg_val(id, vcpu->arch.vcore->dpdes |
-				   vcpu->arch.doorbell_request);
+		if (cpu_has_feature(CPU_FTR_ARCH_300))
+			*val = get_reg_val(id, vcpu->arch.doorbell_request);
+		else
+			*val = get_reg_val(id, vcpu->arch.vcore->dpdes);
 		break;
 	case KVM_REG_PPC_VTB:
 		*val = get_reg_val(id, vcpu->arch.vcore->vtb);
@@ -2426,7 +2430,10 @@ static int kvmppc_set_one_reg_hv(struct kvm_vcpu *vcpu, u64 id,
 		vcpu->arch.pspb = set_reg_val(id, *val);
 		break;
 	case KVM_REG_PPC_DPDES:
-		vcpu->arch.vcore->dpdes = set_reg_val(id, *val);
+		if (cpu_has_feature(CPU_FTR_ARCH_300))
+			vcpu->arch.doorbell_request = set_reg_val(id, *val) & 1;
+		else
+			vcpu->arch.vcore->dpdes = set_reg_val(id, *val);
 		break;
 	case KVM_REG_PPC_VTB:
 		vcpu->arch.vcore->vtb = set_reg_val(id, *val);
@@ -4463,11 +4470,6 @@ int kvmhv_run_single_vcpu(struct kvm_vcpu *vcpu, u64 time_limit,
 
 	if (!nested) {
 		kvmppc_core_prepare_to_enter(vcpu);
-		if (vcpu->arch.doorbell_request) {
-			vc->dpdes = 1;
-			smp_wmb();
-			vcpu->arch.doorbell_request = 0;
-		}
 		if (test_bit(BOOK3S_IRQPRIO_EXTERNAL,
 			     &vcpu->arch.pending_exceptions))
 			lpcr |= LPCR_MER;
diff --git a/arch/powerpc/kvm/book3s_hv_builtin.c b/arch/powerpc/kvm/book3s_hv_builtin.c
index a10bf93054ca..3ed90149ed2e 100644
--- a/arch/powerpc/kvm/book3s_hv_builtin.c
+++ b/arch/powerpc/kvm/book3s_hv_builtin.c
@@ -660,6 +660,8 @@ void kvmppc_guest_entry_inject_int(struct kvm_vcpu *vcpu)
 	int ext;
 	unsigned long lpcr;
 
+	WARN_ON_ONCE(cpu_has_feature(CPU_FTR_ARCH_300));
+
 	/* Insert EXTERNAL bit into LPCR at the MER bit position */
 	ext = (vcpu->arch.pending_exceptions >> BOOK3S_IRQPRIO_EXTERNAL) & 1;
 	lpcr = mfspr(SPRN_LPCR);
diff --git a/arch/powerpc/kvm/book3s_hv_p9_entry.c b/arch/powerpc/kvm/book3s_hv_p9_entry.c
index 5745a49021c3..1e18c089478e 100644
--- a/arch/powerpc/kvm/book3s_hv_p9_entry.c
+++ b/arch/powerpc/kvm/book3s_hv_p9_entry.c
@@ -701,6 +701,7 @@ int kvmhv_vcpu_entry_p9(struct kvm_vcpu *vcpu, u64 time_limit, unsigned long lpc
 	unsigned long host_pidr;
 	unsigned long host_dawr1;
 	unsigned long host_dawrx1;
+	unsigned long dpdes;
 
 	hdec = time_limit - *tb;
 	if (hdec < 0)
@@ -763,8 +764,10 @@ int kvmhv_vcpu_entry_p9(struct kvm_vcpu *vcpu, u64 time_limit, unsigned long lpc
 
 	if (vc->pcr)
 		mtspr(SPRN_PCR, vc->pcr | PCR_MASK);
-	if (vc->dpdes)
-		mtspr(SPRN_DPDES, vc->dpdes);
+	if (vcpu->arch.doorbell_request) {
+		vcpu->arch.doorbell_request = 0;
+		mtspr(SPRN_DPDES, 1);
+	}
 
 	if (dawr_enabled()) {
 		if (vcpu->arch.dawr0 != host_dawr0)
@@ -995,7 +998,10 @@ int kvmhv_vcpu_entry_p9(struct kvm_vcpu *vcpu, u64 time_limit, unsigned long lpc
 	vcpu->arch.shregs.sprg2 = mfspr(SPRN_SPRG2);
 	vcpu->arch.shregs.sprg3 = mfspr(SPRN_SPRG3);
 
-	vc->dpdes = mfspr(SPRN_DPDES);
+	dpdes = mfspr(SPRN_DPDES);
+	if (dpdes)
+		vcpu->arch.doorbell_request = 1;
+
 	vc->vtb = mfspr(SPRN_VTB);
 
 	dec = mfspr(SPRN_DEC);
@@ -1057,7 +1063,7 @@ int kvmhv_vcpu_entry_p9(struct kvm_vcpu *vcpu, u64 time_limit, unsigned long lpc
 		}
 	}
 
-	if (vc->dpdes)
+	if (dpdes)
 		mtspr(SPRN_DPDES, 0);
 	if (vc->pcr)
 		mtspr(SPRN_PCR, PCR_MASK);
-- 
2.23.0


^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox