LinuxPPC-Dev Archive on lore.kernel.org

LinuxPPC-Dev Archive on lore.kernel.org
 help / color / mirror / Atom feed

* Re: [PATCH] ocxl: Mmio invalidation support
From: Frederic Barrat @ 2020-11-16 17:51 UTC (permalink / raw)
  To: Christophe Lombard, linuxppc-dev, fbarrat, Andrew Donnellan
In-Reply-To: <20201113153333.290505-1-clombard@linux.vnet.ibm.com>



On 13/11/2020 16:33, Christophe Lombard wrote:
> OpenCAPI 4.0/5.0 with TLBI/SLBI Snooping, is not used due to performance
> problems caused by the PAU having to process all incoming TLBI/SLBI
> commands which will cause them to back up on the PowerBus.
> 
> When the Address Translation Mode requires TLB and SLB Invalidate
> operations to be initiated using MMIO registers, a set of registers like
> the following is used:
> • XTS MMIO ATSD0 LPARID register
> • XTS MMIO ATSD0 AVA register
> • XTS MMIO ATSD0 launch register, write access initiates a shoot down
> • XTS MMIO ATSD0 status register
> 
> The MMIO based mechanism also blocks the NPU/PAU from snooping TLBIE
> commands from the PowerBus.
> 
> The Shootdown commands (ATSD) will be generated using MMIO registers
> in the NPU/PAU and sent to the device.
> 
> Signed-off-by: Christophe Lombard <clombard@linux.vnet.ibm.com>
> ---
>   arch/powerpc/include/asm/pnv-ocxl.h   |   2 +
>   arch/powerpc/platforms/powernv/ocxl.c |  19 +++
>   drivers/misc/ocxl/link.c              | 180 ++++++++++++++++++++++----
>   drivers/misc/ocxl/ocxl_internal.h     |  46 ++++++-
>   drivers/misc/ocxl/trace.h             | 125 ++++++++++++++++++
>   5 files changed, 348 insertions(+), 24 deletions(-)
> 
> diff --git a/arch/powerpc/include/asm/pnv-ocxl.h b/arch/powerpc/include/asm/pnv-ocxl.h
> index d37ededca3ee..4a23abcc347b 100644
> --- a/arch/powerpc/include/asm/pnv-ocxl.h
> +++ b/arch/powerpc/include/asm/pnv-ocxl.h
> @@ -28,4 +28,6 @@ int pnv_ocxl_spa_setup(struct pci_dev *dev, void *spa_mem, int PE_mask, void **p
>   void pnv_ocxl_spa_release(void *platform_data);
>   int pnv_ocxl_spa_remove_pe_from_cache(void *platform_data, int pe_handle);
> 
> +extern int pnv_ocxl_map_lpar(struct pci_dev *dev, uint64_t lparid,
> +			     uint64_t lpcr);


"extern" is useless



>   #endif /* _ASM_PNV_OCXL_H */
> diff --git a/arch/powerpc/platforms/powernv/ocxl.c b/arch/powerpc/platforms/powernv/ocxl.c
> index ecdad219d704..100546ea635f 100644
> --- a/arch/powerpc/platforms/powernv/ocxl.c
> +++ b/arch/powerpc/platforms/powernv/ocxl.c
> @@ -483,3 +483,22 @@ int pnv_ocxl_spa_remove_pe_from_cache(void *platform_data, int pe_handle)
>   	return rc;
>   }
>   EXPORT_SYMBOL_GPL(pnv_ocxl_spa_remove_pe_from_cache);
> +
> +int pnv_ocxl_map_lpar(struct pci_dev *dev, uint64_t lparid,
> +		      uint64_t lpcr)
> +{
> +	struct pci_controller *hose = pci_bus_to_host(dev->bus);
> +	struct pnv_phb *phb = hose->private_data;
> +	u32 bdfn;
> +	int rc;
> +
> +	bdfn = (dev->bus->number << 8) | dev->devfn;


I was told a bit too late that pci_dev_id() exists, so we should 
probably use from now on.



> +	rc = opal_npu_map_lpar(phb->opal_id, bdfn, lparid, lpcr);
> +	if (rc) {
> +		dev_err(&dev->dev, "Error mapping device to LPAR: %d\n", rc);
> +		return -EINVAL;
> +	}
> +
> +	return 0;
> +}
> +EXPORT_SYMBOL_GPL(pnv_ocxl_map_lpar);
> diff --git a/drivers/misc/ocxl/link.c b/drivers/misc/ocxl/link.c
> index fd73d3bc0eb6..9b5b77d40734 100644
> --- a/drivers/misc/ocxl/link.c
> +++ b/drivers/misc/ocxl/link.c



Overall, there are many changes in that file and it would help the 
review if it could be broken up in a set of smaller patches.



> @@ -4,6 +4,8 @@
>   #include <linux/mutex.h>
>   #include <linux/mm_types.h>
>   #include <linux/mmu_context.h>
> +#include <linux/mm.h>
> +#include <linux/mmu_notifier.h>
>   #include <asm/copro.h>
>   #include <asm/pnv-ocxl.h>
>   #include <asm/xive.h>
> @@ -33,6 +35,31 @@
> 
>   #define SPA_PE_VALID		0x80000000
> 
> +struct spa;
> +
> +/*
> + * A opencapi link can be used be by several PCI functions. We have
> + * one link per device slot.
> + *
> + * A linked list of opencapi links should suffice, as there's a
> + * limited number of opencapi slots on a system and lookup is only
> + * done when the device is probed
> + */
> +struct ocxl_link {
> +	struct list_head list;
> +	struct kref ref;
> +	int domain;
> +	int bus;
> +	int dev;
> +	u64 mmio_atsd; /* ATSD physical address */
> +	void __iomem *base;    /* ATSD register virtual address */
> +	spinlock_t atsd_lock; // to serialize shootdowns
> +	atomic_t irq_available;
> +	struct spa *spa;
> +	void *platform_data;
> +};
> +static struct list_head links_list = LIST_HEAD_INIT(links_list);
> +static DEFINE_MUTEX(links_list_lock);
> 
>   struct pe_data {
>   	struct mm_struct *mm;
> @@ -41,6 +68,8 @@ struct pe_data {
>   	/* opaque pointer to be passed to the above callback */
>   	void *xsl_err_data;
>   	struct rcu_head rcu;
> +	struct ocxl_link *link;
> +	struct mmu_notifier mmu_notifier;
>   };
> 
>   struct spa {
> @@ -69,27 +98,6 @@ struct spa {
>   	} xsl_fault;
>   };
> 
> -/*
> - * A opencapi link can be used be by several PCI functions. We have
> - * one link per device slot.
> - *
> - * A linked list of opencapi links should suffice, as there's a
> - * limited number of opencapi slots on a system and lookup is only
> - * done when the device is probed
> - */
> -struct ocxl_link {
> -	struct list_head list;
> -	struct kref ref;
> -	int domain;
> -	int bus;
> -	int dev;
> -	atomic_t irq_available;
> -	struct spa *spa;
> -	void *platform_data;
> -};
> -static struct list_head links_list = LIST_HEAD_INIT(links_list);
> -static DEFINE_MUTEX(links_list_lock);
> -
>   enum xsl_response {
>   	CONTINUE,
>   	ADDRESS_ERROR,
> @@ -126,6 +134,8 @@ static void ack_irq(struct spa *spa, enum xsl_response r)
>   	}
>   }
> 
> +static const struct mmu_notifier_ops ocxl_mmu_notifier_ops;
> +
>   static void xsl_fault_handler_bh(struct work_struct *fault_work)
>   {
>   	vm_fault_t flt = 0;
> @@ -376,6 +386,7 @@ static void free_spa(struct ocxl_link *link)
> 
>   static int alloc_link(struct pci_dev *dev, int PE_mask, struct ocxl_link **out_link)
>   {
> +	struct pci_controller *hose = pci_bus_to_host(dev->bus);
>   	struct ocxl_link *link;
>   	int rc;
> 
> @@ -403,6 +414,22 @@ static int alloc_link(struct pci_dev *dev, int PE_mask, struct ocxl_link **out_l
>   	if (rc)
>   		goto err_xsl_irq;
> 
> +	/* Since OpenCAPI 5.0, Address Translation Mode requires TLB
> +	 * and SLB Invalidate operations to be initiated using MMIO
> +	 * registers
> +	 */
> +	if (of_property_read_u64_index(hose->dn, "ibm,mmio-atsd",
> +				       0, &link->mmio_atsd)) {
> +		dev_info(&dev->dev, "No available ATSD found\n");
> +	}
> +	if (link->mmio_atsd) {
> +		link->base = ioremap(link->mmio_atsd, 24);
> +		if (!link->base)
> +			dev_warn(&dev->dev, "ioremap failed - mmio_atsd: %#llx\n", link->mmio_atsd);
> +		else
> +			pnv_ocxl_map_lpar(dev, mfspr(SPRN_LPID), 0);
> +	}



To stay coherent with how we've done things like interrupts, the
device tree parsing and ioremap() for the shootdown page should be
done in a platform-specific file (powernv)

Also, could we get a more descriptive name than link->base?


> +
>   	*out_link = link;
>   	return 0;
> 
> @@ -464,12 +491,101 @@ void ocxl_link_release(struct pci_dev *dev, void *link_handle)
>   {
>   	struct ocxl_link *link = (struct ocxl_link *) link_handle;
> 
> +	if (link->base) {
> +		iounmap(link->base);
> +		link->base = NULL;
> +	}


What's allocated in alloc_link(), i.e. the shootdown page, should be
released in release_xsl() and not in ocxl_link_release() (it's too
early in case of several functions).


>   	mutex_lock(&links_list_lock);
>   	kref_put(&link->ref, release_xsl);
>   	mutex_unlock(&links_list_lock);
>   }
>   EXPORT_SYMBOL_GPL(ocxl_link_release);
> 
> +static void tlb_invalidate(struct ocxl_link *link,
> +			   unsigned long pid,
> +			   unsigned long addr)
> +{
> +	unsigned long timeout = jiffies + (HZ * OCXL_ATSD_TIMEOUT);
> +	uint64_t val;
> +	int pend;
> +
> +	if (!link->base)
> +		return;
> +
> +	spin_lock(&link->atsd_lock);


Looks like atsd_lock is not initialized.


> +	if (addr) {
> +		/* load Abbreviated Virtual Address register with
> +		 * the necessary value
> +		 */
> +		val = SETFIELD(XTS_ATSD_AVA_AVA, 0ull, addr >> (63-51));
> +		out_be64(link->base + XTS_ATSD_AVA, val);
> +		eieio();


That eieio() needs a comment.


> +		trace_ocxl_mmu_notifier_mmio_atsd_ava(val, pid);
> +	}
> +
> +	/* Write access initiates a shoot down to initiate the
> +	 * TLB Invalidate command
> +	 */
> +	val = XTS_ATSD_LNCH_R;


All that code should be in a platform-specific file.


> +	if (addr) {
> +		val = SETFIELD(XTS_ATSD_LNCH_RIC, val, 0b00);
> +		val = SETFIELD(XTS_ATSD_LNCH_IS, val, 0b00);
> +	} else {
> +		val = SETFIELD(XTS_ATSD_LNCH_RIC, val, 0b10);
> +		val = SETFIELD(XTS_ATSD_LNCH_IS, val, 0b01);
> +		val |= XTS_ATSD_LNCH_OCAPI_SINGLETON;
> +	}
> +	val |= XTS_ATSD_LNCH_PRS;
> +	val = SETFIELD(XTS_ATSD_LNCH_AP, val, 0b101);
> +	val = SETFIELD(XTS_ATSD_LNCH_PID, val, pid);
> +	out_be64(link->base + XTS_ATSD_LNCH, val);
> +	trace_ocxl_mmu_notifier_mmio_atsd_lnch(val, addr, pid);
> +
> +	/* Poll the ATSD status register to determine when the
> +	* TLB Invalidate has been completed.
> +	*/
> +	val = in_be64(link->base + XTS_ATSD_STAT);
> +	pend = val >> 63;
> +	trace_ocxl_mmu_notifier_mmio_atsd_stat(val, addr, pid);
> +
> +	while (pend) {
> +		if (time_after_eq(jiffies, timeout)) {
> +			pr_err("%s - Timeout while reading XTS MMIO ATSD status register (val=%#llx, pidr=0x%lx)\n",
> +			       __func__, val, pid);
> +			spin_unlock(&link->atsd_lock);
> +			return;
> +		}
> +		cpu_relax();
> +		val = in_be64(link->base + XTS_ATSD_STAT);
> +		pend = val >> 63;
> +	}
> +	spin_unlock(&link->atsd_lock);
> +	trace_ocxl_mmu_notifier_mmio_atsd_stat(val, addr, pid);
> +}
> +
> +static void invalidate_range_end(struct mmu_notifier *mn,
> +				 const struct mmu_notifier_range *range)
> +{
> +	struct pe_data *pe_data = container_of(mn, struct pe_data, mmu_notifier);
> +	struct ocxl_link *link = pe_data->link;
> +	struct mm_struct *mm = mn->mm;
> +	unsigned long addr, pid, page_size = PAGE_SIZE;
> +
> +	pid = mm->context.id;
> +	trace_ocxl_mmu_notifier_range(range->start, range->end, pid);
> +
> +	for (addr = range->start; addr < range->end; addr += page_size)
> +		tlb_invalidate(link, pid, addr);
> +}
> +
> +static const struct mmu_notifier_ops ocxl_mmu_notifier_ops = {
> +	/* invalidate_range_end() is called when all pages in the
> +	 * range have been unmapped and the pages have been freed by
> +	 * the VM
> +	 */
> +	.invalidate_range_end = invalidate_range_end,
> +};
> +


See comments in mmu_notifier.h. Looks like we shouldn't be using 
invalidate_range_end(), it's too late

Did we test abrupt death of the host process? Do we need to also use 
.release()?



>   static u64 calculate_cfg_state(bool kernel)
>   {
>   	u64 state;
> @@ -517,7 +633,7 @@ int ocxl_link_add_pe(void *link_handle, int pasid, u32 pidr, u32 tidr,
>   		goto unlock;
>   	}
> 
> -	pe_data = kmalloc(sizeof(*pe_data), GFP_KERNEL);
> +	pe_data = kzalloc(sizeof(*pe_data), GFP_KERNEL);
>   	if (!pe_data) {
>   		rc = -ENOMEM;
>   		goto unlock;
> @@ -526,9 +642,13 @@ int ocxl_link_add_pe(void *link_handle, int pasid, u32 pidr, u32 tidr,
>   	pe_data->mm = mm;
>   	pe_data->xsl_err_cb = xsl_err_cb;
>   	pe_data->xsl_err_data = xsl_err_data;
> +	pe_data->link = link;
> +	pe_data->mmu_notifier.ops = &ocxl_mmu_notifier_ops;
> 
>   	memset(pe, 0, sizeof(struct ocxl_process_element));
>   	pe->config_state = cpu_to_be64(calculate_cfg_state(pidr == 0));
> +	pe->pasid = cpu_to_be32(pasid << (31 - 19));
> +	pe->bdf = cpu_to_be32(1 << (31 - 15));


why do we hard-code bdf 1?



>   	pe->lpid = cpu_to_be32(mfspr(SPRN_LPID));
>   	pe->pid = cpu_to_be32(pidr);
>   	pe->tid = cpu_to_be32(tidr);
> @@ -540,8 +660,17 @@ int ocxl_link_add_pe(void *link_handle, int pasid, u32 pidr, u32 tidr,
>   	 * by the nest MMU. If we have a kernel context, TLBIs are
>   	 * already global.
>   	 */
> -	if (mm)
> +	if (mm) {
>   		mm_context_add_copro(mm);
> +		if (link->base) {
> +			/* Use MMIO registers for the TLB and SLB
> +			 * Invalidate operations.
> +			 */
> +			trace_init_mmu_notifier(pasid, mm->context.id);
> +			mmu_notifier_register(&pe_data->mmu_notifier, mm);
> +		}
> +	}
> +
>   	/*
>   	 * Barrier is to make sure PE is visible in the SPA before it
>   	 * is used by the device. It also helps with the global TLBI
> @@ -672,6 +801,11 @@ int ocxl_link_remove_pe(void *link_handle, int pasid)
>   		WARN(1, "Couldn't find pe data when removing PE\n");
>   	} else {
>   		if (pe_data->mm) {
> +			if (link->base) {
> +				trace_release_mmu_notifier(pasid, pe_data->mm->context.id);
> +				mmu_notifier_unregister(&pe_data->mmu_notifier, pe_data->mm);
> +				tlb_invalidate(link, pe_data->mm->context.id, 0ull);
> +			}
>   			mm_context_remove_copro(pe_data->mm);
>   			mmdrop(pe_data->mm);
>   		}
> diff --git a/drivers/misc/ocxl/ocxl_internal.h b/drivers/misc/ocxl/ocxl_internal.h
> index 0bad0a123af6..35d8be3cd270 100644
> --- a/drivers/misc/ocxl/ocxl_internal.h
> +++ b/drivers/misc/ocxl/ocxl_internal.h
> @@ -8,6 +8,48 @@
>   #include <linux/list.h>
>   #include <misc/ocxl.h>
> 
> +/* Find left shift from first set bit in mask */
> +#define MASK_TO_LSH(m)		(__builtin_ffsl(m) - 1)
> +
> +/* Set field fname of oval to fval
> + * NOTE: oval isn't modified, the combined result is returned
> + */
> +#define SETFIELD(m, v, val)				\
> +	(((v) & ~(m)) |	((((typeof(v))(val)) << MASK_TO_LSH(m)) & (m)))
> +
> +#define OCXL_ATSD_TIMEOUT		1
> +
> +/* 5.9.3.3 TLB Management Instructions - PowerISA tags workbook */
> +#define XTS_ATSD_LNCH		0x00
> +#define   XTS_ATSD_LNCH_R	PPC_BIT(0)		/* Radix Invalidate */
> +#define   XTS_ATSD_LNCH_RIC	PPC_BITMASK(1,2)	/* Radix Invalidation Control
> +							 * 0b00 Just invalidate TLB.
> +							 * 0b01 Invalidate just Page Walk Cache.
> +							 * 0b10 Invalidate TLB, Page Walk Cache, and any
> +							 * caching of Partition and Process Table Entries.
> +							 */
> +#define   XTS_ATSD_LNCH_LP	PPC_BITMASK(3, 10)	/* Number and Page Size of translations to be invalidated (HPT only ?) */
> +#define   XTS_ATSD_LNCH_IS	PPC_BITMASK(11, 12)	/* Invalidation Criteria
> +							 * 0b00 Invalidate just the target VA.
> +							 * 0b01 Invalidate matching PID.
> +							 */
> +#define   XTS_ATSD_LNCH_PRS	PPC_BIT(13)		/* 0b1: Process Scope, 0b0: Partition Scope */
> +#define   XTS_ATSD_LNCH_B	PPC_BIT(14)		/* Invalidation Flag */
> +#define   XTS_ATSD_LNCH_AP	PPC_BITMASK(15, 17)	/* Actual Page Size to be invalidated
> +							 * 000 4KB
> +							 * 101 64KB
> +							 * 001 2MB
> +							 * 010 1GB
> +							 */
> +#define   XTS_ATSD_LNCH_L	PPC_BIT(18)		/* Defines the large page select (L=0b0 for 4KB pages, L=0b1 for large pages) */
> +#define   XTS_ATSD_LNCH_PID	PPC_BITMASK(19, 38)	/* Process ID */
> +#define   XTS_ATSD_LNCH_F	PPC_BIT(39)		/* NoFlush – Assumed to be 0b0 */
> +#define   XTS_ATSD_LNCH_OCAPI_SLBI	PPC_BIT(40)
> +#define   XTS_ATSD_LNCH_OCAPI_SINGLETON	PPC_BIT(41)
> +#define XTS_ATSD_AVA		0x08
> +#define   XTS_ATSD_AVA_AVA	PPC_BITMASK(0, 51) /* au lieu de 35*/
> +#define XTS_ATSD_STAT		0x10
> +
>   #define MAX_IRQ_PER_LINK	2000
>   #define MAX_IRQ_PER_CONTEXT	MAX_IRQ_PER_LINK
> 
> @@ -84,7 +126,9 @@ struct ocxl_context {
> 
>   struct ocxl_process_element {
>   	__be64 config_state;
> -	__be32 reserved1[11];
> +	__be32 pasid;
> +	__be32 bdf;
> +	__be32 reserved1[9];
>   	__be32 lpid;
>   	__be32 tid;
>   	__be32 pid;
> diff --git a/drivers/misc/ocxl/trace.h b/drivers/misc/ocxl/trace.h
> index 17e21cb2addd..6171069d071a 100644
> --- a/drivers/misc/ocxl/trace.h
> +++ b/drivers/misc/ocxl/trace.h


I haven't looked at this yet. It should probably be in a separate patch.

   Fred


> @@ -8,6 +8,131 @@
> 
>   #include <linux/tracepoint.h>
> 
> +
> +TRACE_EVENT(ocxl_mmu_notifier_range,
> +	TP_PROTO(unsigned long start, unsigned long end, unsigned long pidr),
> +	TP_ARGS(start, end, pidr),
> +
> +	TP_STRUCT__entry(
> +		__field(unsigned long, start)
> +		__field(unsigned long, end)
> +		__field(unsigned long, pidr)
> +	),
> +
> +	TP_fast_assign(
> +		__entry->start = start;
> +		__entry->end = end;
> +		__entry->pidr = pidr;
> +	),
> +
> +	TP_printk("start=0x%lx end=0x%lx pidr=0x%lx",
> +		__entry->start,
> +		__entry->end,
> +		__entry->pidr
> +	)
> +);
> +
> +TRACE_EVENT(ocxl_mmu_notifier_mmio_atsd_ava,
> +	TP_PROTO(u64 val, unsigned long pidr),
> +	TP_ARGS(val, pidr),
> +
> +	TP_STRUCT__entry(
> +		__field(u64, val)
> +		__field(unsigned long, pidr)
> +	),
> +
> +	TP_fast_assign(
> +		__entry->val = val;
> +		__entry->pidr = pidr;
> +	),
> +
> +	TP_printk("ATSD AVA: 0x%llx pidr=0x%lx",
> +		__entry->val, __entry->pidr
> +	)
> +);
> +
> +TRACE_EVENT(ocxl_mmu_notifier_mmio_atsd_lnch,
> +	TP_PROTO(u64 val, unsigned long addr, unsigned long pidr),
> +	TP_ARGS(val, addr, pidr),
> +
> +	TP_STRUCT__entry(
> +		__field(u64, val)
> +		__field(unsigned long, addr)
> +		__field(unsigned long, pidr)
> +	),
> +
> +	TP_fast_assign(
> +		__entry->val = val;
> +		__entry->addr = addr;
> +		__entry->pidr = pidr;
> +	),
> +
> +	TP_printk("ATSD LNCH: 0x%llx addr=0x%lx pidr=0x%lx",
> +		__entry->val, __entry->addr, __entry->pidr
> +	)
> +);
> +
> +TRACE_EVENT(ocxl_mmu_notifier_mmio_atsd_stat,
> +	TP_PROTO(u64 val, unsigned long addr, unsigned long pidr),
> +	TP_ARGS(val, addr, pidr),
> +
> +	TP_STRUCT__entry(
> +		__field(u64, val)
> +		__field(unsigned long, addr)
> +		__field(unsigned long, pidr)
> +	),
> +
> +	TP_fast_assign(
> +		__entry->val = val;
> +		__entry->addr = addr;
> +		__entry->pidr = pidr;
> +	),
> +
> +	TP_printk("ATSD STAT: 0x%llx addr=0x%lx pidr=0x%lx",
> +		__entry->val, __entry->addr, __entry->pidr
> +	)
> +);
> +
> +TRACE_EVENT(init_mmu_notifier,
> +	TP_PROTO(int pasid, unsigned long pidr),
> +	TP_ARGS(pasid, pidr),
> +
> +	TP_STRUCT__entry(
> +		__field(int, pasid)
> +		__field(unsigned long, pidr)
> +	),
> +
> +	TP_fast_assign(
> +		__entry->pasid = pasid;
> +		__entry->pidr = pidr;
> +	),
> +
> +	TP_printk("pasid=%d, pidr=0x%lx",
> +		__entry->pasid,
> +		__entry->pidr
> +	)
> +);
> +
> +TRACE_EVENT(release_mmu_notifier,
> +	TP_PROTO(int pasid, unsigned long pidr),
> +	TP_ARGS(pasid, pidr),
> +
> +	TP_STRUCT__entry(
> +		__field(int, pasid)
> +		__field(unsigned long, pidr)
> +	),
> +
> +	TP_fast_assign(
> +		__entry->pasid = pasid;
> +		__entry->pidr = pidr;
> +	),
> +
> +	TP_printk("pasid=%d, pidr=0x%lx",
> +		__entry->pasid,
> +		__entry->pidr
> +	)
> +);
> +
>   DECLARE_EVENT_CLASS(ocxl_context,
>   	TP_PROTO(pid_t pid, void *spa, int pasid, u32 pidr, u32 tidr),
>   	TP_ARGS(pid, spa, pasid, pidr, tidr),
> 

^ permalink raw reply

* Re: [PATCH 0/5] perf/mm: Fix PERF_SAMPLE_*_PAGE_SIZE
From: Peter Zijlstra @ 2020-11-16 16:57 UTC (permalink / raw)
  To: Dave Hansen
  Cc: mark.rutland, alexander.shishkin, catalin.marinas, eranian,
	sparclinux, will, mingo, kan.liang, linux-arch, ak, aneesh.kumar,
	Matthew Wilcox, jolsa, npiggin, acme, Kirill A. Shutemov,
	linux-kernel, linuxppc-dev, davem, kirill.shutemov
In-Reply-To: <3f2239fe-367a-16de-fcb5-543d39f34c22@intel.com>

On Mon, Nov 16, 2020 at 08:36:36AM -0800, Dave Hansen wrote:
> On 11/16/20 8:32 AM, Matthew Wilcox wrote:
> >>
> >> That's really the best we can do from software without digging into
> >> microarchitecture-specific events.
> > I mean this is perf.  Digging into microarch specific events is what it
> > does ;-)
> 
> Yeah, totally.

Sure, but the automatic promotion/demotion of TLB sizes is not visible
if you don't know what you startd out with.

> But, if we see a bunch of 4k TLB hit events, it's still handy to know
> that those 4k TLB hits originated from a 2M page table entry.  This
> series just makes sure that perf has the data about the page table
> mapping sizes regardless of what the microarchitecture does with it.

This. 

^ permalink raw reply

* Re: [PATCH 0/5] perf/mm: Fix PERF_SAMPLE_*_PAGE_SIZE
From: Peter Zijlstra @ 2020-11-16 16:55 UTC (permalink / raw)
  To: Dave Hansen
  Cc: mark.rutland, alexander.shishkin, catalin.marinas, eranian,
	sparclinux, will, mingo, kan.liang, linux-arch, ak, aneesh.kumar,
	Matthew Wilcox, jolsa, npiggin, acme, Kirill A. Shutemov,
	linux-kernel, linuxppc-dev, davem, kirill.shutemov
In-Reply-To: <eeec67f6-ea05-1115-f249-b6cdcf2c5e2c@intel.com>

On Mon, Nov 16, 2020 at 08:28:23AM -0800, Dave Hansen wrote:
> On 11/16/20 7:54 AM, Matthew Wilcox wrote:
> > It gets even more complicated with CPUs with multiple levels of TLB
> > which support different TLB entry sizes.  My CPU reports:
> > 
> > TLB info
> >  Instruction TLB: 2M/4M pages, fully associative, 8 entries
> >  Instruction TLB: 4K pages, 8-way associative, 64 entries
> >  Data TLB: 1GB pages, 4-way set associative, 4 entries
> >  Data TLB: 4KB pages, 4-way associative, 64 entries
> >  Shared L2 TLB: 4KB/2MB pages, 6-way associative, 1536 entries
> 
> It's even "worse" on recent AMD systems.  Those will coalesce multiple
> adjacent PTEs into a single TLB entry.  I think Alphas did something
> like this back in the day with an opt-in.
> 
> Anyway, the changelog should probably replace:

ARM64 does too.

> > This enables PERF_SAMPLE_{DATA,CODE}_PAGE_SIZE to report accurate TLB
> > page sizes.
> 
> with something more like:
> 
> This enables PERF_SAMPLE_{DATA,CODE}_PAGE_SIZE to report accurate page
> table mapping sizes.

Sure.

^ permalink raw reply

* Re: [PATCH 0/5] perf/mm: Fix PERF_SAMPLE_*_PAGE_SIZE
From: Dave Hansen @ 2020-11-16 16:36 UTC (permalink / raw)
  To: Matthew Wilcox
  Cc: mark.rutland, Peter Zijlstra, catalin.marinas, eranian,
	sparclinux, will, mingo, kan.liang, linux-arch, ak, aneesh.kumar,
	alexander.shishkin, jolsa, npiggin, acme, Kirill A. Shutemov,
	linux-kernel, linuxppc-dev, davem, kirill.shutemov
In-Reply-To: <20201116163213.GG29991@casper.infradead.org>

On 11/16/20 8:32 AM, Matthew Wilcox wrote:
>>
>> That's really the best we can do from software without digging into
>> microarchitecture-specific events.
> I mean this is perf.  Digging into microarch specific events is what it
> does ;-)

Yeah, totally.

But, if we see a bunch of 4k TLB hit events, it's still handy to know
that those 4k TLB hits originated from a 2M page table entry.  This
series just makes sure that perf has the data about the page table
mapping sizes regardless of what the microarchitecture does with it.

I'm just saying we need to make the descriptions in this perf feature
specifically about the page tables, not the TLB.

^ permalink raw reply

* Re: [PATCH 0/5] perf/mm: Fix PERF_SAMPLE_*_PAGE_SIZE
From: Matthew Wilcox @ 2020-11-16 16:32 UTC (permalink / raw)
  To: Dave Hansen
  Cc: mark.rutland, Peter Zijlstra, catalin.marinas, eranian,
	sparclinux, will, mingo, kan.liang, linux-arch, ak, aneesh.kumar,
	alexander.shishkin, jolsa, npiggin, acme, Kirill A. Shutemov,
	linux-kernel, linuxppc-dev, davem, kirill.shutemov
In-Reply-To: <eeec67f6-ea05-1115-f249-b6cdcf2c5e2c@intel.com>

On Mon, Nov 16, 2020 at 08:28:23AM -0800, Dave Hansen wrote:
> On 11/16/20 7:54 AM, Matthew Wilcox wrote:
> > It gets even more complicated with CPUs with multiple levels of TLB
> > which support different TLB entry sizes.  My CPU reports:
> > 
> > TLB info
> >  Instruction TLB: 2M/4M pages, fully associative, 8 entries
> >  Instruction TLB: 4K pages, 8-way associative, 64 entries
> >  Data TLB: 1GB pages, 4-way set associative, 4 entries
> >  Data TLB: 4KB pages, 4-way associative, 64 entries
> >  Shared L2 TLB: 4KB/2MB pages, 6-way associative, 1536 entries
> 
> It's even "worse" on recent AMD systems.  Those will coalesce multiple
> adjacent PTEs into a single TLB entry.  I think Alphas did something
> like this back in the day with an opt-in.

I debated mentioning that ;-)  We can detect in software whether that's
_possible_, but we can't detect whether it's *done* it.  I heard it
sometimes takes several faults on the 4kB entries for the CPU to decide
that it's beneficial to use a 32kB TLB entry.  But this is all rumour.

> Anyway, the changelog should probably replace:
> 
> > This enables PERF_SAMPLE_{DATA,CODE}_PAGE_SIZE to report accurate TLB
> > page sizes.
> 
> with something more like:
> 
> This enables PERF_SAMPLE_{DATA,CODE}_PAGE_SIZE to report accurate page
> table mapping sizes.
> 
> That's really the best we can do from software without digging into
> microarchitecture-specific events.

I mean this is perf.  Digging into microarch specific events is what it
does ;-)

^ permalink raw reply

* Re: [PATCH 0/5] perf/mm: Fix PERF_SAMPLE_*_PAGE_SIZE
From: Dave Hansen @ 2020-11-16 16:28 UTC (permalink / raw)
  To: Matthew Wilcox, Kirill A. Shutemov
  Cc: mark.rutland, aneesh.kumar, linux-arch, catalin.marinas, will,
	Peter Zijlstra, linuxppc-dev, npiggin, linux-kernel, acme, davem,
	alexander.shishkin, ak, eranian, sparclinux, jolsa, mingo,
	kirill.shutemov, kan.liang
In-Reply-To: <20201116155404.GD29991@casper.infradead.org>

On 11/16/20 7:54 AM, Matthew Wilcox wrote:
> It gets even more complicated with CPUs with multiple levels of TLB
> which support different TLB entry sizes.  My CPU reports:
> 
> TLB info
>  Instruction TLB: 2M/4M pages, fully associative, 8 entries
>  Instruction TLB: 4K pages, 8-way associative, 64 entries
>  Data TLB: 1GB pages, 4-way set associative, 4 entries
>  Data TLB: 4KB pages, 4-way associative, 64 entries
>  Shared L2 TLB: 4KB/2MB pages, 6-way associative, 1536 entries

It's even "worse" on recent AMD systems.  Those will coalesce multiple
adjacent PTEs into a single TLB entry.  I think Alphas did something
like this back in the day with an opt-in.

Anyway, the changelog should probably replace:

> This enables PERF_SAMPLE_{DATA,CODE}_PAGE_SIZE to report accurate TLB
> page sizes.

with something more like:

This enables PERF_SAMPLE_{DATA,CODE}_PAGE_SIZE to report accurate page
table mapping sizes.

That's really the best we can do from software without digging into
microarchitecture-specific events.

^ permalink raw reply

* [PATCH] powerpc/32s: Handle PROTFAULT in hash_page() also for CONFIG_PPC_KUAP
From: Christophe Leroy @ 2020-11-16 16:09 UTC (permalink / raw)
  To: Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman
  Cc: linuxppc-dev, linux-kernel

On hash 32 bits, handling minor protection faults like unsetting
dirty flag is heavy if done from the normal page_fault processing,
because it implies hash table software lookup for flushing the entry
and then a DSI is taken anyway to add the entry back.

When KUAP was implemented, as explained in commit a68c31fc01ef
("powerpc/32s: Implement Kernel Userspace Access Protection"),
protection faults has been diverted from hash_page() because
hash_page() was not able to identify a KUAP fault.

Implement KUAP verification in hash_page(), by clearing write
permission when the access is a kernel access and Ks is 1.
This works regardless of the address because kernel segments always
have Ks set to 0 while user segments have Ks set to 0 only
when kernel write to userspace is granted.

Then protection faults can be handled by hash_page() even for KUAP.

Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
---
 arch/powerpc/kernel/head_book3s_32.S |  8 --------
 arch/powerpc/mm/book3s32/hash_low.S  | 13 +++++++++++--
 2 files changed, 11 insertions(+), 10 deletions(-)

diff --git a/arch/powerpc/kernel/head_book3s_32.S b/arch/powerpc/kernel/head_book3s_32.S
index a0dda2a1f2df..a4b811044f97 100644
--- a/arch/powerpc/kernel/head_book3s_32.S
+++ b/arch/powerpc/kernel/head_book3s_32.S
@@ -294,11 +294,7 @@ BEGIN_MMU_FTR_SECTION
 	stw	r11, THR11(r10)
 	mfspr	r10, SPRN_DSISR
 	mfcr	r11
-#ifdef CONFIG_PPC_KUAP
-	andis.	r10, r10, (DSISR_BAD_FAULT_32S | DSISR_DABRMATCH | DSISR_PROTFAULT)@h
-#else
 	andis.	r10, r10, (DSISR_BAD_FAULT_32S | DSISR_DABRMATCH)@h
-#endif
 	mfspr	r10, SPRN_SPRG_THREAD
 	beq	hash_page_dsi
 .Lhash_page_dsi_cont:
@@ -323,11 +319,7 @@ END_MMU_FTR_SECTION_IFSET(MMU_FTR_HPTE_TABLE)
 	EXCEPTION_PROLOG handle_dar_dsisr=1
 	get_and_save_dar_dsisr_on_stack	r4, r5, r11
 BEGIN_MMU_FTR_SECTION
-#ifdef CONFIG_PPC_KUAP
-	andis.	r0, r5, (DSISR_BAD_FAULT_32S | DSISR_DABRMATCH | DSISR_PROTFAULT)@h
-#else
 	andis.	r0, r5, (DSISR_BAD_FAULT_32S | DSISR_DABRMATCH)@h
-#endif
 	bne	handle_page_fault_tramp_2	/* if not, try to put a PTE */
 	rlwinm	r3, r5, 32 - 15, 21, 21		/* DSISR_STORE -> _PAGE_RW */
 	bl	hash_page
diff --git a/arch/powerpc/mm/book3s32/hash_low.S b/arch/powerpc/mm/book3s32/hash_low.S
index b2c912e517b9..9a56ba4f68f2 100644
--- a/arch/powerpc/mm/book3s32/hash_low.S
+++ b/arch/powerpc/mm/book3s32/hash_low.S
@@ -95,8 +95,6 @@ _GLOBAL(hash_page)
 #else
 	rlwimi	r8,r4,23,20,28		/* compute pte address */
 #endif
-	rlwinm	r0,r3,32-3,24,24	/* _PAGE_RW access -> _PAGE_DIRTY */
-	ori	r0,r0,_PAGE_ACCESSED|_PAGE_HASHPTE
 
 	/*
 	 * Update the linux PTE atomically.  We do the lwarx up-front
@@ -112,7 +110,18 @@ _GLOBAL(hash_page)
 #endif
 .Lretry:
 	lwarx	r6,0,r8			/* get linux-style pte, flag word */
+#ifdef CONFIG_PPC_KUAP
+	mfsrin	r5,r4
+	rlwinm	r0,r9,28,_PAGE_RW	/* MSR[PR] => _PAGE_RW */
+	rlwinm	r5,r5,12,_PAGE_RW	/* Ks => _PAGE_RW */
+	andc	r5,r5,r0		/* Ks & ~MSR[PR] */
+	andc	r5,r6,r5		/* Clear _PAGE_RW when Ks = 1 && MSR[PR] = 0 */
+	andc.	r5,r3,r5		/* check access & ~permission */
+#else
 	andc.	r5,r3,r6		/* check access & ~permission */
+#endif
+	rlwinm	r0,r3,32-3,24,24	/* _PAGE_RW access -> _PAGE_DIRTY */
+	ori	r0,r0,_PAGE_ACCESSED|_PAGE_HASHPTE
 #ifdef CONFIG_SMP
 	bne-	.Lhash_page_out		/* return if access not permitted */
 #else
-- 
2.25.0


^ permalink raw reply related

* Re: [PATCH 0/5] perf/mm: Fix PERF_SAMPLE_*_PAGE_SIZE
From: Matthew Wilcox @ 2020-11-16 15:54 UTC (permalink / raw)
  To: Kirill A. Shutemov
  Cc: mark.rutland, Peter Zijlstra, catalin.marinas, eranian,
	dave.hansen, sparclinux, will, mingo, kan.liang, linux-arch, ak,
	aneesh.kumar, alexander.shishkin, jolsa, npiggin, acme,
	linux-kernel, linuxppc-dev, davem, kirill.shutemov
In-Reply-To: <20201116154357.bw64c5ie2kiu5l4x@box>

On Mon, Nov 16, 2020 at 06:43:57PM +0300, Kirill A. Shutemov wrote:
> On Fri, Nov 13, 2020 at 12:19:01PM +0100, Peter Zijlstra wrote:
> > Hi,
> > 
> > These patches provide generic infrastructure to determine TLB page size from
> > page table entries alone. Perf will use this (for either data or code address)
> > to aid in profiling TLB issues.
> 
> I'm not sure it's an issue, but strictly speaking, size of page according
> to page table tree doesn't mean pagewalk would fill TLB entry of the size.
> CPU may support 1G pages in page table tree without 1G TLB at all.
> 
> IIRC, current Intel CPU still don't have any 1G iTLB entries and fill 2M
> iTLB instead.

It gets even more complicated with CPUs with multiple levels of TLB
which support different TLB entry sizes.  My CPU reports:

TLB info
 Instruction TLB: 2M/4M pages, fully associative, 8 entries
 Instruction TLB: 4K pages, 8-way associative, 64 entries
 Data TLB: 1GB pages, 4-way set associative, 4 entries
 Data TLB: 4KB pages, 4-way associative, 64 entries
 Shared L2 TLB: 4KB/2MB pages, 6-way associative, 1536 entries

I'm not quite sure what the rules are for evicting a 1GB entry in the
dTLB into the s2TLB.  I've read them for so many different processors,
I get quite confused.  Some CPUs fracture them; others ditch them entirely
and will look them up again if needed.

I think the architecture here is fine, but it'll need a little bit of
finagling to maybe pass i-vs-d to the pXd_leaf_size() routines, and x86
will need an implementation of pud_leaf_size() which interrogates the
TLB info to find out what size TLB entry will actually be used.

^ permalink raw reply

* [PATCH v2 3/5] powerpc/fault: Avoid heavy search_exception_tables() verification
From: Christophe Leroy @ 2020-11-16 15:53 UTC (permalink / raw)
  To: Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman
  Cc: linuxppc-dev, linux-kernel
In-Reply-To: <40ae19c2bf013e3815a6ae0d6016963fdb0f51b7.1605541983.git.christophe.leroy@csgroup.eu>

search_exception_tables() is an heavy operation, we have to avoid it.
When KUAP is selected, we'll know the fault has been blocked by KUAP.
Otherwise, it behaves just as if the address was already in the TLBs
and no fault was generated.

Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
Reviewed-by: Nicholas Piggin <npiggin@gmail.com>
---
v2: Squashed with the preceeding patch which was re-ordering tests that get removed in this patch.
---
 arch/powerpc/mm/fault.c | 23 +++++++----------------
 1 file changed, 7 insertions(+), 16 deletions(-)

diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c
index 17665ff97469..1770b41e4730 100644
--- a/arch/powerpc/mm/fault.c
+++ b/arch/powerpc/mm/fault.c
@@ -210,28 +210,19 @@ static bool bad_kernel_fault(struct pt_regs *regs, unsigned long error_code,
 		return true;
 	}
 
-	if (!is_exec && address < TASK_SIZE && (error_code & DSISR_PROTFAULT) &&
-	    !search_exception_tables(regs->nip)) {
-		pr_crit_ratelimited("Kernel attempted to access user page (%lx) - exploit attempt? (uid: %d)\n",
-				    address,
-				    from_kuid(&init_user_ns, current_uid()));
-	}
-
 	// Kernel fault on kernel address is bad
 	if (address >= TASK_SIZE)
 		return true;
 
-	// Fault on user outside of certain regions (eg. copy_tofrom_user()) is bad
-	if (!search_exception_tables(regs->nip))
-		return true;
-
-	// Read/write fault in a valid region (the exception table search passed
-	// above), but blocked by KUAP is bad, it can never succeed.
-	if (bad_kuap_fault(regs, address, is_write))
+	// Read/write fault blocked by KUAP is bad, it can never succeed.
+	if (bad_kuap_fault(regs, address, is_write)) {
+		pr_crit_ratelimited("Kernel attempted to %s user page (%lx) - exploit attempt? (uid: %d)\n",
+				    is_write ? "write" : "read", address,
+				    from_kuid(&init_user_ns, current_uid()));
 		return true;
+	}
 
-	// What's left? Kernel fault on user in well defined regions (extable
-	// matched), and allowed by KUAP in the faulting context.
+	// What's left? Kernel fault on user and allowed by KUAP in the faulting context.
 	return false;
 }
 
-- 
2.25.0


^ permalink raw reply related

* [PATCH v2 4/5] powerpc/fault: Perform exception fixup in do_page_fault()
From: Christophe Leroy @ 2020-11-16 15:53 UTC (permalink / raw)
  To: Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman
  Cc: linuxppc-dev, linux-kernel
In-Reply-To: <40ae19c2bf013e3815a6ae0d6016963fdb0f51b7.1605541983.git.christophe.leroy@csgroup.eu>

Exception fixup doesn't require the heady full regs saving,
do it from do_page_fault() directly.

For that, split bad_page_fault() in two parts.

As bad_page_fault() can also be called from other places than
handle_page_fault(), it will still perform exception fixup and
fallback on __bad_page_fault().

handle_page_fault() directly calls __bad_page_fault() as the
exception fixup will now be done by do_page_fault()

Reviewed-by: Nicholas Piggin <npiggin@gmail.com>
Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
---
v2: Add prototype of __bad_page_fault() in asm/bug.h
---
 arch/powerpc/include/asm/bug.h       |  1 +
 arch/powerpc/kernel/entry_32.S       |  2 +-
 arch/powerpc/kernel/exceptions-64e.S |  2 +-
 arch/powerpc/kernel/exceptions-64s.S |  2 +-
 arch/powerpc/mm/fault.c              | 33 ++++++++++++++++++++--------
 5 files changed, 28 insertions(+), 12 deletions(-)

diff --git a/arch/powerpc/include/asm/bug.h b/arch/powerpc/include/asm/bug.h
index 338f36cd9934..919a31840e51 100644
--- a/arch/powerpc/include/asm/bug.h
+++ b/arch/powerpc/include/asm/bug.h
@@ -113,6 +113,7 @@
 struct pt_regs;
 extern int do_page_fault(struct pt_regs *, unsigned long, unsigned long);
 extern void bad_page_fault(struct pt_regs *, unsigned long, int);
+void __bad_page_fault(struct pt_regs *regs, unsigned long address, int sig);
 extern void _exception(int, struct pt_regs *, int, unsigned long);
 extern void _exception_pkey(struct pt_regs *, unsigned long, int);
 extern void die(const char *, struct pt_regs *, long);
diff --git a/arch/powerpc/kernel/entry_32.S b/arch/powerpc/kernel/entry_32.S
index 8cdc8bcde703..eafcf43e3613 100644
--- a/arch/powerpc/kernel/entry_32.S
+++ b/arch/powerpc/kernel/entry_32.S
@@ -671,7 +671,7 @@ handle_page_fault:
 	mr	r5,r3
 	addi	r3,r1,STACK_FRAME_OVERHEAD
 	lwz	r4,_DAR(r1)
-	bl	bad_page_fault
+	bl	__bad_page_fault
 	b	ret_from_except_full
 
 #ifdef CONFIG_PPC_BOOK3S_32
diff --git a/arch/powerpc/kernel/exceptions-64e.S b/arch/powerpc/kernel/exceptions-64e.S
index f579ce46eef2..74d07dc0bb48 100644
--- a/arch/powerpc/kernel/exceptions-64e.S
+++ b/arch/powerpc/kernel/exceptions-64e.S
@@ -1023,7 +1023,7 @@ storage_fault_common:
 	mr	r5,r3
 	addi	r3,r1,STACK_FRAME_OVERHEAD
 	ld	r4,_DAR(r1)
-	bl	bad_page_fault
+	bl	__bad_page_fault
 	b	ret_from_except
 
 /*
diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S
index f7d748b88705..2cb3bcfb896d 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -3254,7 +3254,7 @@ handle_page_fault:
 	mr	r5,r3
 	addi	r3,r1,STACK_FRAME_OVERHEAD
 	ld	r4,_DAR(r1)
-	bl	bad_page_fault
+	bl	__bad_page_fault
 	b	interrupt_return
 
 /* We have a data breakpoint exception - handle it */
diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c
index 1770b41e4730..2e50bc1c3783 100644
--- a/arch/powerpc/mm/fault.c
+++ b/arch/powerpc/mm/fault.c
@@ -538,10 +538,20 @@ NOKPROBE_SYMBOL(__do_page_fault);
 int do_page_fault(struct pt_regs *regs, unsigned long address,
 		  unsigned long error_code)
 {
+	const struct exception_table_entry *entry;
 	enum ctx_state prev_state = exception_enter();
 	int rc = __do_page_fault(regs, address, error_code);
 	exception_exit(prev_state);
-	return rc;
+	if (likely(!rc))
+		return 0;
+
+	entry = search_exception_tables(regs->nip);
+	if (unlikely(!entry))
+		return rc;
+
+	instruction_pointer_set(regs, extable_fixup(entry));
+
+	return 0;
 }
 NOKPROBE_SYMBOL(do_page_fault);
 
@@ -550,17 +560,10 @@ NOKPROBE_SYMBOL(do_page_fault);
  * It is called from the DSI and ISI handlers in head.S and from some
  * of the procedures in traps.c.
  */
-void bad_page_fault(struct pt_regs *regs, unsigned long address, int sig)
+void __bad_page_fault(struct pt_regs *regs, unsigned long address, int sig)
 {
-	const struct exception_table_entry *entry;
 	int is_write = page_fault_is_write(regs->dsisr);
 
-	/* Are we prepared to handle this fault?  */
-	if ((entry = search_exception_tables(regs->nip)) != NULL) {
-		regs->nip = extable_fixup(entry);
-		return;
-	}
-
 	/* kernel has accessed a bad area */
 
 	switch (TRAP(regs)) {
@@ -594,3 +597,15 @@ void bad_page_fault(struct pt_regs *regs, unsigned long address, int sig)
 
 	die("Kernel access of bad area", regs, sig);
 }
+
+void bad_page_fault(struct pt_regs *regs, unsigned long address, int sig)
+{
+	const struct exception_table_entry *entry;
+
+	/* Are we prepared to handle this fault?  */
+	entry = search_exception_tables(instruction_pointer(regs));
+	if (entry)
+		instruction_pointer_set(regs, extable_fixup(entry));
+	else
+		__bad_page_fault(regs, address, sig);
+}
-- 
2.25.0


^ permalink raw reply related

* [PATCH v2 5/5] powerpc/mm: Don't WARN() on KUAP fault
From: Christophe Leroy @ 2020-11-16 15:53 UTC (permalink / raw)
  To: Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman
  Cc: linuxppc-dev, linux-kernel
In-Reply-To: <40ae19c2bf013e3815a6ae0d6016963fdb0f51b7.1605541983.git.christophe.leroy@csgroup.eu>

The WARN() in do_page_fault() is useless the problem is not in
do_page_fault() but on the place which generated the DSI exception.

We already have a dump from the Oops, no need of a WARN() in addition
The warning emitted by bad_kernel_fault() is good enough.

Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
---
v2: New (Partly taken from patch "powerpc/mm: Kill the task on KUAP fault")
---
 arch/powerpc/include/asm/book3s/32/kup.h       | 6 +-----
 arch/powerpc/include/asm/book3s/64/kup-radix.h | 7 ++++---
 arch/powerpc/include/asm/nohash/32/kup-8xx.h   | 3 +--
 3 files changed, 6 insertions(+), 10 deletions(-)

diff --git a/arch/powerpc/include/asm/book3s/32/kup.h b/arch/powerpc/include/asm/book3s/32/kup.h
index 32fd4452e960..a0117a9d5b06 100644
--- a/arch/powerpc/include/asm/book3s/32/kup.h
+++ b/arch/powerpc/include/asm/book3s/32/kup.h
@@ -183,11 +183,7 @@ bad_kuap_fault(struct pt_regs *regs, unsigned long address, bool is_write)
 	unsigned long begin = regs->kuap & 0xf0000000;
 	unsigned long end = regs->kuap << 28;
 
-	if (!is_write)
-		return false;
-
-	return WARN(address < begin || address >= end,
-		    "Bug: write fault blocked by segment registers !");
+	return is_write && (address < begin || address >= end);
 }
 
 #endif /* CONFIG_PPC_KUAP */
diff --git a/arch/powerpc/include/asm/book3s/64/kup-radix.h b/arch/powerpc/include/asm/book3s/64/kup-radix.h
index 3ee1ec60be84..8bdf559a4b32 100644
--- a/arch/powerpc/include/asm/book3s/64/kup-radix.h
+++ b/arch/powerpc/include/asm/book3s/64/kup-radix.h
@@ -161,9 +161,10 @@ static inline void restore_user_access(unsigned long flags)
 static inline bool
 bad_kuap_fault(struct pt_regs *regs, unsigned long address, bool is_write)
 {
-	return WARN(mmu_has_feature(MMU_FTR_RADIX_KUAP) &&
-		    (regs->kuap & (is_write ? AMR_KUAP_BLOCK_WRITE : AMR_KUAP_BLOCK_READ)),
-		    "Bug: %s fault blocked by AMR!", is_write ? "Write" : "Read");
+	if (!mmu_has_feature(MMU_FTR_RADIX_KUAP))
+		return false;
+
+	return !!(regs->kuap & (is_write ? AMR_KUAP_BLOCK_WRITE : AMR_KUAP_BLOCK_READ));
 }
 #else /* CONFIG_PPC_KUAP */
 static inline void kuap_restore_amr(struct pt_regs *regs, unsigned long amr)
diff --git a/arch/powerpc/include/asm/nohash/32/kup-8xx.h b/arch/powerpc/include/asm/nohash/32/kup-8xx.h
index 567cdc557402..17a4a616436f 100644
--- a/arch/powerpc/include/asm/nohash/32/kup-8xx.h
+++ b/arch/powerpc/include/asm/nohash/32/kup-8xx.h
@@ -63,8 +63,7 @@ static inline void restore_user_access(unsigned long flags)
 static inline bool
 bad_kuap_fault(struct pt_regs *regs, unsigned long address, bool is_write)
 {
-	return WARN(!((regs->kuap ^ MD_APG_KUAP) & 0xff000000),
-		    "Bug: fault blocked by AP register !");
+	return !((regs->kuap ^ MD_APG_KUAP) & 0xff000000);
 }
 
 #endif /* !__ASSEMBLY__ */
-- 
2.25.0


^ permalink raw reply related

* [PATCH v2 1/5] powerpc/mm: sanity_check_fault() should work for all,  not only BOOK3S
From: Christophe Leroy @ 2020-11-16 15:53 UTC (permalink / raw)
  To: Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman
  Cc: linuxppc-dev, linux-kernel

The verification and message introduced by commit 374f3f5979f9
("powerpc/mm/hash: Handle user access of kernel address gracefully")
applies to all platforms, it should not be limited to BOOK3S.

Make the BOOK3S version of sanity_check_fault() the one for all,
and bail out earlier if not BOOK3S.

Fixes: 374f3f5979f9 ("powerpc/mm/hash: Handle user access of kernel address gracefully")
Reviewed-by: Nicholas Piggin <npiggin@gmail.com>
Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
---
 arch/powerpc/mm/fault.c | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c
index 0add963a849b..72e1b51beb10 100644
--- a/arch/powerpc/mm/fault.c
+++ b/arch/powerpc/mm/fault.c
@@ -303,7 +303,6 @@ static inline void cmo_account_page_fault(void)
 static inline void cmo_account_page_fault(void) { }
 #endif /* CONFIG_PPC_SMLPAR */
 
-#ifdef CONFIG_PPC_BOOK3S
 static void sanity_check_fault(bool is_write, bool is_user,
 			       unsigned long error_code, unsigned long address)
 {
@@ -320,6 +319,9 @@ static void sanity_check_fault(bool is_write, bool is_user,
 		return;
 	}
 
+	if (!IS_ENABLED(CONFIG_PPC_BOOK3S))
+		return;
+
 	/*
 	 * For hash translation mode, we should never get a
 	 * PROTFAULT. Any update to pte to reduce access will result in us
@@ -354,10 +356,6 @@ static void sanity_check_fault(bool is_write, bool is_user,
 
 	WARN_ON_ONCE(error_code & DSISR_PROTFAULT);
 }
-#else
-static void sanity_check_fault(bool is_write, bool is_user,
-			       unsigned long error_code, unsigned long address) { }
-#endif /* CONFIG_PPC_BOOK3S */
 
 /*
  * Define the correct "is_write" bit in error_code based
-- 
2.25.0


^ permalink raw reply related

* [PATCH v2 2/5] powerpc/fault: Unnest definition of page_fault_is_write() and page_fault_is_bad()
From: Christophe Leroy @ 2020-11-16 15:53 UTC (permalink / raw)
  To: Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman
  Cc: linuxppc-dev, linux-kernel
In-Reply-To: <40ae19c2bf013e3815a6ae0d6016963fdb0f51b7.1605541983.git.christophe.leroy@csgroup.eu>

To make it more readable, separate page_fault_is_write() and page_fault_is_bad()
to avoir several levels of #ifdefs

Reviewed-by: Nicholas Piggin <npiggin@gmail.com>
Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
---
 arch/powerpc/mm/fault.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c
index 72e1b51beb10..17665ff97469 100644
--- a/arch/powerpc/mm/fault.c
+++ b/arch/powerpc/mm/fault.c
@@ -363,17 +363,19 @@ static void sanity_check_fault(bool is_write, bool is_user,
  */
 #if (defined(CONFIG_4xx) || defined(CONFIG_BOOKE))
 #define page_fault_is_write(__err)	((__err) & ESR_DST)
-#define page_fault_is_bad(__err)	(0)
 #else
 #define page_fault_is_write(__err)	((__err) & DSISR_ISSTORE)
-#if defined(CONFIG_PPC_8xx)
+#endif
+
+#if defined(CONFIG_4xx) || defined(CONFIG_BOOKE)
+#define page_fault_is_bad(__err)	(0)
+#elif defined(CONFIG_PPC_8xx)
 #define page_fault_is_bad(__err)	((__err) & DSISR_NOEXEC_OR_G)
 #elif defined(CONFIG_PPC64)
 #define page_fault_is_bad(__err)	((__err) & DSISR_BAD_FAULT_64S)
 #else
 #define page_fault_is_bad(__err)	((__err) & DSISR_BAD_FAULT_32S)
 #endif
-#endif
 
 /*
  * For 600- and 800-family processors, the error_code parameter is DSISR
-- 
2.25.0


^ permalink raw reply related

* Re: [PATCH 0/5] perf/mm: Fix PERF_SAMPLE_*_PAGE_SIZE
From: Kirill A. Shutemov @ 2020-11-16 15:43 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: mark.rutland, alexander.shishkin, catalin.marinas, eranian,
	dave.hansen, sparclinux, will, mingo, kan.liang, linux-arch, ak,
	aneesh.kumar, willy, jolsa, npiggin, acme, linux-kernel,
	linuxppc-dev, davem, kirill.shutemov
In-Reply-To: <20201113111901.743573013@infradead.org>

On Fri, Nov 13, 2020 at 12:19:01PM +0100, Peter Zijlstra wrote:
> Hi,
> 
> These patches provide generic infrastructure to determine TLB page size from
> page table entries alone. Perf will use this (for either data or code address)
> to aid in profiling TLB issues.

I'm not sure it's an issue, but strictly speaking, size of page according
to page table tree doesn't mean pagewalk would fill TLB entry of the size.
CPU may support 1G pages in page table tree without 1G TLB at all.

IIRC, current Intel CPU still don't have any 1G iTLB entries and fill 2M
iTLB instead.

-- 
 Kirill A. Shutemov

^ permalink raw reply

* Re: Error: invalid switch -me200
From: Christophe Leroy @ 2020-11-16 15:26 UTC (permalink / raw)
  To: Michael Ellerman
  Cc: Arnd Bergmann, kbuild-all, Brian Cain, linuxppc-dev,
	Masahiro Yamada, Nick Desaulniers, LKML, clang-built-linux,
	Fāng-ruì Sòng, mihai.caraman, Nathan Chancellor,
	Linus Torvalds, kernel test robot
In-Reply-To: <87h7pp4yzm.fsf@mpe.ellerman.id.au>


Quoting Michael Ellerman <mpe@ellerman.id.au>:

> Christophe Leroy <christophe.leroy@csgroup.eu> writes:
>> Le 14/11/2020 à 01:20, Segher Boessenkool a écrit :
>>> On Fri, Nov 13, 2020 at 12:14:18PM -0800, Nick Desaulniers wrote:
>>>>>>> Error: invalid switch -me200
>>>>>>> Error: unrecognized option -me200
>>>>>>
>>>>>> 251 cpu-as-$(CONFIG_E200)   += -Wa,-me200
>>>>>>
>>>>>> Are those all broken configs, or is Kconfig messed up such that
>>>>>> randconfig can select these when it should not?
>>>>>
>>>>> Hmmm, looks like this flag does not exist in mainline binutils? There is
>>>>> a thread in 2010 about this that Segher commented on:
>>>>>
>>>>> https://lore.kernel.org/linuxppc-dev/9859E645-954D-4D07-8003-FFCD2391AB6E@kernel.crashing.org/
>>>>>
>>>>> Guess this config should be eliminated?
>>>
>>> The help text for this config options says that e200 is used in 55xx,
>>> and there *is* an -me5500 GAS flag (which probably does this same
>>> thing, too).  But is any of this tested, or useful, or wanted?
>>>
>>> Maybe Christophe knows, cc:ed.
>>>
>>
>> I don't have much clue on this.
>
> Me either.
>
>> But I see on wikipedia that e5500 is a 64 bits powerpc  
>> (https://en.wikipedia.org/wiki/PowerPC_e5500)
>>
>> What I see is that NXP seems to provide a GCC version that includes  
>> aditionnal cpu (e200z0 e200z2
>> e200z3 e200z4 e200z6 e200z7):
>>
>> valid arguments to '-mcpu=' are: 401 403 405 405fp 440 440fp 464  
>> 464fp 476 476fp 505 601 602 603
>> 603e 604 604e 620 630 740 7400 7450 750 801 821 823 8540 8548 860  
>> 970 G3 G4 G5 a2 cell e200z0 e200z2
>> e200z3 e200z4 e200z6 e200z7 e300c2 e300c3 e500mc e500mc64 e5500  
>> e6500 ec603e native power3 power4
>> power5 power5+ power6 power6x power7 power8 powerpc powerpc64  
>> powerpc64le rs64 titan "
>>
>> https://community.nxp.com/t5/MPC5xxx/GCC-generating-not-implemented-instructions/m-p/845049
>>
>> Apparently based on binutils 2.28
>>
>> https://www.nxp.com/docs/en/release-note/S32DS-POWER-v1-2-RN.pdf
>>
>> But that's not exactly -me200 though.
>>
>> Now, I can't see any defconfig that selects CONFIG_E200, so is that  
>> worth keeping it in the kernel
>> at all ?
>
> There was a commit in 2014 that suggests it worked at least to some
> extent then:
>
>   3477e71d5319 ("powerpc/booke: Restrict SPE exception handlers to  
> e200/e500 cores")

Not sure, that patch seems to be focussed on the new e500mc

>
>
> Presumably there was a non-upstream toolchain where it was supported?
>
> AFAICS the kernel builds OK with just the cpu-as modification removed:
>
> diff --git a/arch/powerpc/Makefile b/arch/powerpc/Makefile
> index a4d56f0a41d9..16b8336f91dd 100644
> --- a/arch/powerpc/Makefile
> +++ b/arch/powerpc/Makefile
> @@ -248,7 +248,6 @@ KBUILD_CFLAGS               += $(call  
> cc-option,-mno-string)
>  cpu-as-$(CONFIG_40x)           += -Wa,-m405
>  cpu-as-$(CONFIG_44x)           += -Wa,-m440
>  cpu-as-$(CONFIG_ALTIVEC)       += $(call as-option,-Wa$(comma)-maltivec)
> -cpu-as-$(CONFIG_E200)          += -Wa,-me200
>  cpu-as-$(CONFIG_E500)          += -Wa,-me500
>
>  # When using '-many -mpower4' gas will first try and find a matching power4
>
>
> So that seems like the obvious fix for now.

Or we could do

diff --git a/arch/powerpc/platforms/Kconfig.cputype  
b/arch/powerpc/platforms/Kconfig.cputype
index c194c4ae8bc7..a11cf9431e1e 100644
--- a/arch/powerpc/platforms/Kconfig.cputype
+++ b/arch/powerpc/platforms/Kconfig.cputype
@@ -67,6 +67,7 @@ config 44x
  	select PHYS_64BIT

  config E200
+	depends on $(cc-option,-me200)
  	bool "Freescale e200"

  endchoice
---
Christophe


^ permalink raw reply related

* [PATCH -next] powerpc/powernv/sriov: Fix unsigned comparison to zero
From: Zou Wei @ 2020-11-16 12:41 UTC (permalink / raw)
  To: mpe, benh, paulus; +Cc: linuxppc-dev, Zou Wei, oohall, linux-kernel

Fixes coccicheck warnings:

./arch/powerpc/platforms/powernv/pci-sriov.c:443:7-10: WARNING: Unsigned expression compared with zero: win < 0
./arch/powerpc/platforms/powernv/pci-sriov.c:462:7-10: WARNING: Unsigned expression compared with zero: win < 0

Signed-off-by: Zou Wei <zou_wei@huawei.com>
---
 arch/powerpc/platforms/powernv/pci-sriov.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/powerpc/platforms/powernv/pci-sriov.c b/arch/powerpc/platforms/powernv/pci-sriov.c
index c4434f2..92fc861 100644
--- a/arch/powerpc/platforms/powernv/pci-sriov.c
+++ b/arch/powerpc/platforms/powernv/pci-sriov.c
@@ -422,7 +422,7 @@ static int pnv_pci_vf_assign_m64(struct pci_dev *pdev, u16 num_vfs)
 {
 	struct pnv_iov_data   *iov;
 	struct pnv_phb        *phb;
-	unsigned int           win;
+	int		       win;
 	struct resource       *res;
 	int                    i, j;
 	int64_t                rc;
-- 
2.6.2


^ permalink raw reply related

* Re: [PATCH V3] sched/rt, powerpc: Prepare for PREEMPT_RT
From: Michael Ellerman @ 2020-11-16 12:46 UTC (permalink / raw)
  To: Wang Qing, Benjamin Herrenschmidt, Paul Mackerras,
	Christophe Leroy, Nicholas Piggin, Jordan Niethe, Alistair Popple,
	Wang Qing, Aneesh Kumar K.V, Peter Zijlstra, Greg Kroah-Hartman,
	linuxppc-dev, linux-kernel
In-Reply-To: <1604998411-16116-1-git-send-email-wangqing@vivo.com>

Wang Qing <wangqing@vivo.com> writes:
> PREEMPT_RT is a separate preemption model, CONFIG_PREEMPT will
>  be disabled when CONFIG_PREEMPT_RT is enabled,  so we need
> to add CONFIG_PREEMPT_RT output to __die().
>
> Signed-off-by: Wang Qing <wangqing@vivo.com>

Something fairly similar was posted previously.

That time I said:

  I don't think there's any point adding the "_RT" to the __die() output
  until/if we ever start supporting PREEMPT_RT.

  https://lore.kernel.org/linuxppc-dev/87d0ext4q3.fsf@mpe.ellerman.id.au/
   

And I think I still feel the same way. It's not clear powerpc will ever
support PREEMPT_RT, so this would just be confusing to people. And
potentially someone will then send a patch to remove it as dead code.

cheers



> diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c
> index 5006dcb..dec7b81
> --- a/arch/powerpc/kernel/traps.c
> +++ b/arch/powerpc/kernel/traps.c
> @@ -262,10 +262,11 @@ static int __die(const char *str, struct pt_regs *regs, long err)
>  {
>  	printk("Oops: %s, sig: %ld [#%d]\n", str, err, ++die_counter);
>  
> -	printk("%s PAGE_SIZE=%luK%s%s%s%s%s%s %s\n",
> +	printk("%s PAGE_SIZE=%luK%s%s%s%s%s%s%s %s\n",
>  	       IS_ENABLED(CONFIG_CPU_LITTLE_ENDIAN) ? "LE" : "BE",
>  	       PAGE_SIZE / 1024, get_mmu_str(),
>  	       IS_ENABLED(CONFIG_PREEMPT) ? " PREEMPT" : "",
> +	       IS_ENABLED(CONFIG_PREEMPT_RT) ? " PREEMPT_RT" : "",
>  	       IS_ENABLED(CONFIG_SMP) ? " SMP" : "",
>  	       IS_ENABLED(CONFIG_SMP) ? (" NR_CPUS=" __stringify(NR_CPUS)) : "",
>  	       debug_pagealloc_enabled() ? " DEBUG_PAGEALLOC" : "",
> -- 
> 2.7.4

^ permalink raw reply

* Re: [PATCH] KVM: PPC: Book3S HV: XIVE: Fix possible oops when accessing ESB page
From: Michael Ellerman @ 2020-11-16 12:29 UTC (permalink / raw)
  To: Cédric Le Goater, Paul Mackerras
  Cc: kvm, Gustavo Romero, Greg Kurz, kvm-ppc, linuxppc-dev,
	David Gibson
In-Reply-To: <1270ada4-e2a9-6a1a-52a9-b5c3479c05ea@kaod.org>

Cédric Le Goater <clg@kaod.org> writes:
> On 11/6/20 4:19 AM, Michael Ellerman wrote:
>> Cédric Le Goater <clg@kaod.org> writes:
>>> When accessing the ESB page of a source interrupt, the fault handler
>>> will retrieve the page address from the XIVE interrupt 'xive_irq_data'
>>> structure. If the associated KVM XIVE interrupt is not valid, that is
>>> not allocated at the HW level for some reason, the fault handler will
>>> dereference a NULL pointer leading to the oops below :
>>>
>>>     WARNING: CPU: 40 PID: 59101 at arch/powerpc/kvm/book3s_xive_native.c:259 xive_native_esb_fault+0xe4/0x240 [kvm]
>>>     CPU: 40 PID: 59101 Comm: qemu-system-ppc Kdump: loaded Tainted: G        W        --------- -  - 4.18.0-240.el8.ppc64le #1
>>>     NIP:  c00800000e949fac LR: c00000000044b164 CTR: c00800000e949ec8
>>>     REGS: c000001f69617840 TRAP: 0700   Tainted: G        W        --------- -  -  (4.18.0-240.el8.ppc64le)
>>>     MSR:  9000000000029033 <SF,HV,EE,ME,IR,DR,RI,LE>  CR: 44044282  XER: 00000000
>>>     CFAR: c00000000044b160 IRQMASK: 0
>>>     GPR00: c00000000044b164 c000001f69617ac0 c00800000e96e000 c000001f69617c10
>>>     GPR04: 05faa2b21e000080 0000000000000000 0000000000000005 ffffffffffffffff
>>>     GPR08: 0000000000000000 0000000000000001 0000000000000000 0000000000000001
>>>     GPR12: c00800000e949ec8 c000001ffffd3400 0000000000000000 0000000000000000
>>>     GPR16: 0000000000000000 0000000000000000 0000000000000000 0000000000000000
>>>     GPR20: 0000000000000000 0000000000000000 c000001f5c065160 c000000001c76f90
>>>     GPR24: c000001f06f20000 c000001f5c065100 0000000000000008 c000001f0eb98c78
>>>     GPR28: c000001dcab40000 c000001dcab403d8 c000001f69617c10 0000000000000011
>>>     NIP [c00800000e949fac] xive_native_esb_fault+0xe4/0x240 [kvm]
>>>     LR [c00000000044b164] __do_fault+0x64/0x220
>>>     Call Trace:
>>>     [c000001f69617ac0] [0000000137a5dc20] 0x137a5dc20 (unreliable)
>>>     [c000001f69617b50] [c00000000044b164] __do_fault+0x64/0x220
>>>     [c000001f69617b90] [c000000000453838] do_fault+0x218/0x930
>>>     [c000001f69617bf0] [c000000000456f50] __handle_mm_fault+0x350/0xdf0
>>>     [c000001f69617cd0] [c000000000457b1c] handle_mm_fault+0x12c/0x310
>>>     [c000001f69617d10] [c00000000007ef44] __do_page_fault+0x264/0xbb0
>>>     [c000001f69617df0] [c00000000007f8c8] do_page_fault+0x38/0xd0
>>>     [c000001f69617e30] [c00000000000a714] handle_page_fault+0x18/0x38
>>>     Instruction dump:
>>>     40c2fff0 7c2004ac 2fa90000 409e0118 73e90001 41820080 e8bd0008 7c2004ac
>>>     7ca90074 39400000 915c0000 7929d182 <0b090000> 2fa50000 419e0080 e89e0018
>>>     ---[ end trace 66c6ff034c53f64f ]---
>>>     xive-kvm: xive_native_esb_fault: accessing invalid ESB page for source 8 !
>>>
>>> Fix that by checking the validity of the KVM XIVE interrupt structure.
>>>
>>> Reported-by: Greg Kurz <groug@kaod.org>
>>> Signed-off-by: Cédric Le Goater <clg@kaod.org>
>> 
>> Fixes ?
>
> Ah yes :/  
>
> Cc: stable@vger.kernel.org # v5.2+
> Fixes: 6520ca64cde7 ("KVM: PPC: Book3S HV: XIVE: Add a mapping for the source ESB pages")
>
> Since my provider changed its imap servers, my email filters are really screwed 
> up and I miss emails. 
>
> Sorry about that,

No worries.

It doesn't look like Paul has grabbed this, so I'll take it.

cheers

^ permalink raw reply

* Re: [PATCH] powerpc/pseries/hotplug-cpu: Fix memleak when cpus node not exist
From: Michael Ellerman @ 2020-11-16 12:23 UTC (permalink / raw)
  To: Tyrel Datwyler, Nathan Lynch, Zhang Xiaoxu; +Cc: linuxppc-dev, paulus, groug
In-Reply-To: <a00f4a44-6f38-306f-a24f-4e3565deefc2@linux.ibm.com>

Tyrel Datwyler <tyreld@linux.ibm.com> writes:
> On 11/10/20 6:08 AM, Nathan Lynch wrote:
>> Zhang Xiaoxu <zhangxiaoxu5@huawei.com> writes:
>>> From: zhangxiaoxu <zhangxiaoxu5@huawei.com>
>>>
>>> If the cpus nodes not exist, we lost to free the 'cpu_drcs', which
>>> will leak memory.
>>>
>>> Fixes: a0ff72f9f5a7 ("powerpc/pseries/hotplug-cpu: Remove double free in error path")
>>> Reported-by: Hulk Robot <hulkci@huawei.com>
>>> Signed-off-by: zhangxiaoxu <zhangxiaoxu5@huawei.com>
>>> ---
>>>  arch/powerpc/platforms/pseries/hotplug-cpu.c | 1 +
>>>  1 file changed, 1 insertion(+)
>>>
>>> diff --git a/arch/powerpc/platforms/pseries/hotplug-cpu.c b/arch/powerpc/platforms/pseries/hotplug-cpu.c
>>> index f2837e33bf5d..4bb1c9f2bb11 100644
>>> --- a/arch/powerpc/platforms/pseries/hotplug-cpu.c
>>> +++ b/arch/powerpc/platforms/pseries/hotplug-cpu.c
>>> @@ -743,6 +743,7 @@ static int dlpar_cpu_add_by_count(u32 cpus_to_add)
>>>  	parent = of_find_node_by_path("/cpus");
>>>  	if (!parent) {
>>>  		pr_warn("Could not find CPU root node in device tree\n");
>>> +		kfree(cpu_drcs);
>>>  		return -1;
>>>  	}
>> 
>> Thanks for finding this.
>> 
>> a0ff72f9f5a7 ("powerpc/pseries/hotplug-cpu: Remove double free in error
>> path") was posted in Sept 2019 but was not applied until July 2020:
>> 
>> https://lore.kernel.org/linuxppc-dev/20190919231633.1344-1-nathanl@linux.ibm.com/
>> 
>> Here is that change as posted; note the function context is
>> find_dlpar_cpus_to_add(), not dlpar_cpu_add_by_count():
>> 
>> --- a/arch/powerpc/platforms/pseries/hotplug-cpu.c
>> +++ b/arch/powerpc/platforms/pseries/hotplug-cpu.c
>> @@ -726,7 +726,6 @@ static int find_dlpar_cpus_to_add(u32 *cpu_drcs, u32 cpus_to_add)
>>  	parent = of_find_node_by_path("/cpus");
>>  	if (!parent) {
>>  		pr_warn("Could not find CPU root node in device tree\n");
>> -		kfree(cpu_drcs);
>>  		return -1;
>>  	}
>> 
>> Meanwhile b015f6bc9547dbc056edde7177c7868ca8629c4c ("powerpc/pseries: Add
>> cpu DLPAR support for drc-info property") was posted in Nov 2019 and
>> committed a few days later:
>> 
>> https://lore.kernel.org/linux-pci/1573449697-5448-4-git-send-email-tyreld@linux.ibm.com/
>> 
>> This change reorganized the same code, removing
>> find_dlpar_cpus_to_add(), and it had the effect of fixing the same
>> issue.
>> 
>> However git apparently allowed the older change to still apply on top of
>> this (changing a function different from the one in the original
>> patch!), leading to a real bug.
>
> Yikes, not sure how that happened without either the committer massaging the
> patch to apply, or the line location and context matching exactly.

git-am won't apply it, but patch does. I often have to fall back to
using patch when things don't apply, so that's presumably what happened
here. I try to manually check the result is correct but I obviously
didn't do a good job here.

cheers

^ permalink raw reply

* [PATCH] powerpc: Drop -me200 addition to build flags
From: Michael Ellerman @ 2020-11-16 12:09 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: ndesaulniers, linux-kernel, oss, natechancellor

Currently a build with CONFIG_E200=y will fail with:

  Error: invalid switch -me200
  Error: unrecognized option -me200

Upstream binutils has never supported an -me200 option. Presumably it
was supported at some point by either a fork or Freescale internal
binutils.

We can't support code that we can't even build test, so drop the
addition of -me200 to the build flags, so we can at least build with
CONFIG_E200=y.

Reported-by: Németh Márton <nm127@freemail.hu>
Reported-by: kernel test robot <lkp@intel.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---

More discussion: https://lore.kernel.org/lkml/202011131146.g8dPLQDD-lkp@intel.com
---
 arch/powerpc/Makefile | 1 -
 1 file changed, 1 deletion(-)

diff --git a/arch/powerpc/Makefile b/arch/powerpc/Makefile
index a4d56f0a41d9..16b8336f91dd 100644
--- a/arch/powerpc/Makefile
+++ b/arch/powerpc/Makefile
@@ -248,7 +248,6 @@ KBUILD_CFLAGS		+= $(call cc-option,-mno-string)
 cpu-as-$(CONFIG_40x)		+= -Wa,-m405
 cpu-as-$(CONFIG_44x)		+= -Wa,-m440
 cpu-as-$(CONFIG_ALTIVEC)	+= $(call as-option,-Wa$(comma)-maltivec)
-cpu-as-$(CONFIG_E200)		+= -Wa,-me200
 cpu-as-$(CONFIG_E500)		+= -Wa,-me500
 
 # When using '-many -mpower4' gas will first try and find a matching power4
-- 
2.25.1


^ permalink raw reply related

* Re: [PATCH] powerpc/powernv/memtrace: Fake non-memblock aligned sized traces
From: Michael Ellerman @ 2020-11-16 12:02 UTC (permalink / raw)
  To: Jordan Niethe, linuxppc-dev; +Cc: Jordan Niethe, mikey
In-Reply-To: <20201111055524.2458-1-jniethe5@gmail.com>

Jordan Niethe <jniethe5@gmail.com> writes:
> The hardware trace macros which use the memory provided by memtrace are
> able to use trace sizes as small as 16MB. Only memblock aligned values
> can be removed from each NUMA node by writing that value to
> memtrace/enable in debugfs.  This means setting up, say, a 16MB trace is
> not possible.  To allow such a trace size, instead align whatever value
> is written to memtrace/enable to the memblock size for the purpose of
> removing it from each NUMA node but report the written value from
> memtrace/enable and memtrace/x/size in debugfs.

Why does it matter if the size that's removed is larger than the size
that was requested?

Is it about constraining the size of the trace? If so that seems like it
should be the job of the tracing tools, not the kernel.

cheers

^ permalink raw reply

* Re: [PATCH 2/3] Revert "lib: Revert use of fallthrough pseudo-keyword in lib/"
From: kernel test robot @ 2020-11-16 11:49 UTC (permalink / raw)
  To: Nick Desaulniers, Gustavo A . R . Silva, Nathan Chancellor,
	Miguel Ojeda, Michael Ellerman
  Cc: kbuild-all, Nick Desaulniers, linux-kernel, clang-built-linux,
	Paul Mackerras, linuxppc-dev
In-Reply-To: <20201116043532.4032932-3-ndesaulniers@google.com>

[-- Attachment #1: Type: text/plain, Size: 2966 bytes --]

Hi Nick,

I love your patch! Perhaps something to improve:

[auto build test WARNING on powerpc/next]
[also build test WARNING on linus/master v5.10-rc4 next-20201116]
[cannot apply to pmladek/for-next]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch]

url:    https://github.com/0day-ci/linux/commits/Nick-Desaulniers/PPC-Fix-Wimplicit-fallthrough-for-clang/20201116-123803
base:   https://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux.git next
config: x86_64-randconfig-m001-20201115 (attached as .config)
compiler: gcc-9 (Debian 9.3.0-15) 9.3.0

If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot <lkp@intel.com>

smatch warnings:
lib/zstd/huf_compress.c:559 HUF_compress1X_usingCTable() warn: inconsistent indenting

vim +559 lib/zstd/huf_compress.c

   529	
   530	#define HUF_FLUSHBITS_1(stream)                                            \
   531		if (sizeof((stream)->bitContainer) * 8 < HUF_TABLELOG_MAX * 2 + 7) \
   532		HUF_FLUSHBITS(stream)
   533	
   534	#define HUF_FLUSHBITS_2(stream)                                            \
   535		if (sizeof((stream)->bitContainer) * 8 < HUF_TABLELOG_MAX * 4 + 7) \
   536		HUF_FLUSHBITS(stream)
   537	
   538	size_t HUF_compress1X_usingCTable(void *dst, size_t dstSize, const void *src, size_t srcSize, const HUF_CElt *CTable)
   539	{
   540		const BYTE *ip = (const BYTE *)src;
   541		BYTE *const ostart = (BYTE *)dst;
   542		BYTE *const oend = ostart + dstSize;
   543		BYTE *op = ostart;
   544		size_t n;
   545		BIT_CStream_t bitC;
   546	
   547		/* init */
   548		if (dstSize < 8)
   549			return 0; /* not enough space to compress */
   550		{
   551			size_t const initErr = BIT_initCStream(&bitC, op, oend - op);
   552			if (HUF_isError(initErr))
   553				return 0;
   554		}
   555	
   556		n = srcSize & ~3; /* join to mod 4 */
   557		switch (srcSize & 3) {
   558		case 3: HUF_encodeSymbol(&bitC, ip[n + 2], CTable); HUF_FLUSHBITS_2(&bitC);
 > 559			fallthrough;
   560		case 2: HUF_encodeSymbol(&bitC, ip[n + 1], CTable); HUF_FLUSHBITS_1(&bitC);
   561			fallthrough;
   562		case 1: HUF_encodeSymbol(&bitC, ip[n + 0], CTable); HUF_FLUSHBITS(&bitC);
   563		case 0:
   564		default:;
   565		}
   566	
   567		for (; n > 0; n -= 4) { /* note : n&3==0 at this stage */
   568			HUF_encodeSymbol(&bitC, ip[n - 1], CTable);
   569			HUF_FLUSHBITS_1(&bitC);
   570			HUF_encodeSymbol(&bitC, ip[n - 2], CTable);
   571			HUF_FLUSHBITS_2(&bitC);
   572			HUF_encodeSymbol(&bitC, ip[n - 3], CTable);
   573			HUF_FLUSHBITS_1(&bitC);
   574			HUF_encodeSymbol(&bitC, ip[n - 4], CTable);
   575			HUF_FLUSHBITS(&bitC);
   576		}
   577	
   578		return BIT_closeCStream(&bitC);
   579	}
   580	

---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/kbuild-all@lists.01.org

[-- Attachment #2: .config.gz --]
[-- Type: application/gzip, Size: 31321 bytes --]

^ permalink raw reply

* Re: [PATCH 3/3] powerpc: fix -Wimplicit-fallthrough
From: Miguel Ojeda @ 2020-11-16 11:33 UTC (permalink / raw)
  To: Nick Desaulniers
  Cc: Gustavo A . R . Silva, linux-kernel, clang-built-linux,
	Paul Mackerras, Nathan Chancellor, linuxppc-dev
In-Reply-To: <20201116043532.4032932-4-ndesaulniers@google.com>

On Mon, Nov 16, 2020 at 5:35 AM Nick Desaulniers
<ndesaulniers@google.com> wrote:
>
> The "fallthrough" pseudo-keyword was added as a portable way to denote
> intentional fallthrough. Clang will still warn on cases where there is a
> fallthrough to an immediate break. Add explicit breaks for those cases.
>
> Link: https://github.com/ClangBuiltLinux/linux/issues/236
> Signed-off-by: Nick Desaulniers <ndesaulniers@google.com>

It makes things clearer having a `break` added, so I like that warning.

Reviewed-by: Miguel Ojeda <ojeda@kernel.org>

Cheers,
Miguel

^ permalink raw reply

* Re: [PATCH 2/3] Revert "lib: Revert use of fallthrough pseudo-keyword in lib/"
From: Miguel Ojeda @ 2020-11-16 11:26 UTC (permalink / raw)
  To: Nick Desaulniers
  Cc: Gustavo A . R . Silva, linux-kernel, clang-built-linux,
	Paul Mackerras, Nathan Chancellor, linuxppc-dev
In-Reply-To: <20201116043532.4032932-3-ndesaulniers@google.com>

On Mon, Nov 16, 2020 at 5:35 AM Nick Desaulniers
<ndesaulniers@google.com> wrote:
>
> This reverts commit 6a9dc5fd6170 ("lib: Revert use of fallthrough
> pseudo-keyword in lib/")
>
> Now that we can build arch/powerpc/boot/ free of -Wimplicit-fallthrough,
> re-enable these fixes for lib/.
>
> Link: https://github.com/ClangBuiltLinux/linux/issues/236
> Signed-off-by: Nick Desaulniers <ndesaulniers@google.com>

Looks fine on visual inspection:

Reviewed-by: Miguel Ojeda <ojeda@kernel.org>

Cheers,
Miguel

^ permalink raw reply

* Re: [PATCH 1/3] powerpc: boot: include compiler_attributes.h
From: Miguel Ojeda @ 2020-11-16 11:24 UTC (permalink / raw)
  To: Nick Desaulniers
  Cc: Gustavo A . R . Silva, linux-kernel, clang-built-linux,
	Paul Mackerras, Nathan Chancellor, linuxppc-dev
In-Reply-To: <20201116043532.4032932-2-ndesaulniers@google.com>

On Mon, Nov 16, 2020 at 5:35 AM Nick Desaulniers
<ndesaulniers@google.com> wrote:
>
> It was also noted in 6a9dc5fd6170 that we could -D__KERNEL__ and
> -include compiler_types.h like the main kernel does, though testing that
> produces a whole sea of warnings to cleanup. This approach is minimally
> invasive.

I would add a comment noting this as a reminder -- it also helps to
entice a cleanup.

Acked-by: Miguel Ojeda <ojeda@kernel.org>

Cheers,
Miguel

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox