LinuxPPC-Dev Archive on lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 6/9] virtio_balloon: remove the balloon-kvm file system
From: Christoph Hellwig @ 2021-03-09 15:53 UTC (permalink / raw)
  To: Al Viro
  Cc: Jason Gunthorpe, David Hildenbrand, VMware, Inc.,
	Michael S. Tsirkin, linux-kernel, dri-devel, virtualization,
	linux-mm, Minchan Kim, Alex Williamson, Nadav Amit, Daniel Vetter,
	linux-fsdevel, Andrew Morton, linuxppc-dev, Nitin Gupta
In-Reply-To: <20210309155348.974875-1-hch@lst.de>

Just use the generic anon_inode file system.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/virtio/virtio_balloon.c | 30 +++---------------------------
 1 file changed, 3 insertions(+), 27 deletions(-)

diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c
index cae76ee5bdd688..1efb890cd3ff09 100644
--- a/drivers/virtio/virtio_balloon.c
+++ b/drivers/virtio/virtio_balloon.c
@@ -6,6 +6,7 @@
  *  Copyright 2008 Rusty Russell IBM Corporation
  */
 
+#include <linux/anon_inodes.h>
 #include <linux/virtio.h>
 #include <linux/virtio_balloon.h>
 #include <linux/swap.h>
@@ -42,10 +43,6 @@
 	(1 << (VIRTIO_BALLOON_HINT_BLOCK_ORDER + PAGE_SHIFT))
 #define VIRTIO_BALLOON_HINT_BLOCK_PAGES (1 << VIRTIO_BALLOON_HINT_BLOCK_ORDER)
 
-#ifdef CONFIG_BALLOON_COMPACTION
-static struct vfsmount *balloon_mnt;
-#endif
-
 enum virtio_balloon_vq {
 	VIRTIO_BALLOON_VQ_INFLATE,
 	VIRTIO_BALLOON_VQ_DEFLATE,
@@ -805,18 +802,6 @@ static int virtballoon_migratepage(struct balloon_dev_info *vb_dev_info,
 
 	return MIGRATEPAGE_SUCCESS;
 }
-
-static int balloon_init_fs_context(struct fs_context *fc)
-{
-	return init_pseudo(fc, BALLOON_KVM_MAGIC) ? 0 : -ENOMEM;
-}
-
-static struct file_system_type balloon_fs = {
-	.name           = "balloon-kvm",
-	.init_fs_context = balloon_init_fs_context,
-	.kill_sb        = kill_anon_super,
-};
-
 #endif /* CONFIG_BALLOON_COMPACTION */
 
 static unsigned long shrink_free_pages(struct virtio_balloon *vb,
@@ -909,17 +894,11 @@ static int virtballoon_probe(struct virtio_device *vdev)
 		goto out_free_vb;
 
 #ifdef CONFIG_BALLOON_COMPACTION
-	balloon_mnt = kern_mount(&balloon_fs);
-	if (IS_ERR(balloon_mnt)) {
-		err = PTR_ERR(balloon_mnt);
-		goto out_del_vqs;
-	}
-
 	vb->vb_dev_info.migratepage = virtballoon_migratepage;
-	vb->vb_dev_info.inode = alloc_anon_inode_sb(balloon_mnt->mnt_sb);
+	vb->vb_dev_info.inode = alloc_anon_inode();
 	if (IS_ERR(vb->vb_dev_info.inode)) {
 		err = PTR_ERR(vb->vb_dev_info.inode);
-		goto out_kern_unmount;
+		goto out_del_vqs;
 	}
 	vb->vb_dev_info.inode->i_mapping->a_ops = &balloon_aops;
 #endif
@@ -1016,8 +995,6 @@ static int virtballoon_probe(struct virtio_device *vdev)
 out_iput:
 #ifdef CONFIG_BALLOON_COMPACTION
 	iput(vb->vb_dev_info.inode);
-out_kern_unmount:
-	kern_unmount(balloon_mnt);
 out_del_vqs:
 #endif
 	vdev->config->del_vqs(vdev);
@@ -1070,7 +1047,6 @@ static void virtballoon_remove(struct virtio_device *vdev)
 	if (vb->vb_dev_info.inode)
 		iput(vb->vb_dev_info.inode);
 
-	kern_unmount(balloon_mnt);
 #endif
 	kfree(vb);
 }
-- 
2.30.1


^ permalink raw reply related

* [PATCH 5/9] vmw_balloon: remove the balloon-vmware file system
From: Christoph Hellwig @ 2021-03-09 15:53 UTC (permalink / raw)
  To: Al Viro
  Cc: Jason Gunthorpe, David Hildenbrand, VMware, Inc.,
	Michael S. Tsirkin, linux-kernel, dri-devel, virtualization,
	linux-mm, Minchan Kim, Alex Williamson, Nadav Amit, Daniel Vetter,
	linux-fsdevel, Andrew Morton, linuxppc-dev, Nitin Gupta
In-Reply-To: <20210309155348.974875-1-hch@lst.de>

Just use the generic anon_inode file system.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/misc/vmw_balloon.c | 24 ++----------------------
 1 file changed, 2 insertions(+), 22 deletions(-)

diff --git a/drivers/misc/vmw_balloon.c b/drivers/misc/vmw_balloon.c
index 5d057a05ddbee8..be4be32f858253 100644
--- a/drivers/misc/vmw_balloon.c
+++ b/drivers/misc/vmw_balloon.c
@@ -16,6 +16,7 @@
 //#define DEBUG
 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
 
+#include <linux/anon_inodes.h>
 #include <linux/types.h>
 #include <linux/io.h>
 #include <linux/kernel.h>
@@ -1735,20 +1736,6 @@ static inline void vmballoon_debugfs_exit(struct vmballoon *b)
 
 
 #ifdef CONFIG_BALLOON_COMPACTION
-
-static int vmballoon_init_fs_context(struct fs_context *fc)
-{
-	return init_pseudo(fc, BALLOON_VMW_MAGIC) ? 0 : -ENOMEM;
-}
-
-static struct file_system_type vmballoon_fs = {
-	.name           	= "balloon-vmware",
-	.init_fs_context	= vmballoon_init_fs_context,
-	.kill_sb        	= kill_anon_super,
-};
-
-static struct vfsmount *vmballoon_mnt;
-
 /**
  * vmballoon_migratepage() - migrates a balloon page.
  * @b_dev_info: balloon device information descriptor.
@@ -1878,8 +1865,6 @@ static void vmballoon_compaction_deinit(struct vmballoon *b)
 		iput(b->b_dev_info.inode);
 
 	b->b_dev_info.inode = NULL;
-	kern_unmount(vmballoon_mnt);
-	vmballoon_mnt = NULL;
 }
 
 /**
@@ -1895,13 +1880,8 @@ static void vmballoon_compaction_deinit(struct vmballoon *b)
  */
 static __init int vmballoon_compaction_init(struct vmballoon *b)
 {
-	vmballoon_mnt = kern_mount(&vmballoon_fs);
-	if (IS_ERR(vmballoon_mnt))
-		return PTR_ERR(vmballoon_mnt);
-
 	b->b_dev_info.migratepage = vmballoon_migratepage;
-	b->b_dev_info.inode = alloc_anon_inode_sb(vmballoon_mnt->mnt_sb);
-
+	b->b_dev_info.inode = alloc_anon_inode();
 	if (IS_ERR(b->b_dev_info.inode))
 		return PTR_ERR(b->b_dev_info.inode);
 
-- 
2.30.1


^ permalink raw reply related

* [PATCH 4/9] drm: remove the drm file system
From: Christoph Hellwig @ 2021-03-09 15:53 UTC (permalink / raw)
  To: Al Viro
  Cc: Jason Gunthorpe, David Hildenbrand, VMware, Inc.,
	Michael S. Tsirkin, linux-kernel, dri-devel, virtualization,
	linux-mm, Minchan Kim, Alex Williamson, Nadav Amit, Daniel Vetter,
	linux-fsdevel, Andrew Morton, linuxppc-dev, Nitin Gupta
In-Reply-To: <20210309155348.974875-1-hch@lst.de>

Just use the generic anon_inode file system.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/gpu/drm/drm_drv.c | 64 ++-------------------------------------
 1 file changed, 3 insertions(+), 61 deletions(-)

diff --git a/drivers/gpu/drm/drm_drv.c b/drivers/gpu/drm/drm_drv.c
index 87e7214a8e3565..af293d76f979e5 100644
--- a/drivers/gpu/drm/drm_drv.c
+++ b/drivers/gpu/drm/drm_drv.c
@@ -26,6 +26,7 @@
  * DEALINGS IN THE SOFTWARE.
  */
 
+#include <linux/anon_inodes.h>
 #include <linux/debugfs.h>
 #include <linux/fs.h>
 #include <linux/module.h>
@@ -475,65 +476,6 @@ void drm_dev_unplug(struct drm_device *dev)
 }
 EXPORT_SYMBOL(drm_dev_unplug);
 
-/*
- * DRM internal mount
- * We want to be able to allocate our own "struct address_space" to control
- * memory-mappings in VRAM (or stolen RAM, ...). However, core MM does not allow
- * stand-alone address_space objects, so we need an underlying inode. As there
- * is no way to allocate an independent inode easily, we need a fake internal
- * VFS mount-point.
- *
- * The drm_fs_inode_new() function allocates a new inode, drm_fs_inode_free()
- * frees it again. You are allowed to use iget() and iput() to get references to
- * the inode. But each drm_fs_inode_new() call must be paired with exactly one
- * drm_fs_inode_free() call (which does not have to be the last iput()).
- * We use drm_fs_inode_*() to manage our internal VFS mount-point and share it
- * between multiple inode-users. You could, technically, call
- * iget() + drm_fs_inode_free() directly after alloc and sometime later do an
- * iput(), but this way you'd end up with a new vfsmount for each inode.
- */
-
-static int drm_fs_cnt;
-static struct vfsmount *drm_fs_mnt;
-
-static int drm_fs_init_fs_context(struct fs_context *fc)
-{
-	return init_pseudo(fc, 0x010203ff) ? 0 : -ENOMEM;
-}
-
-static struct file_system_type drm_fs_type = {
-	.name		= "drm",
-	.owner		= THIS_MODULE,
-	.init_fs_context = drm_fs_init_fs_context,
-	.kill_sb	= kill_anon_super,
-};
-
-static struct inode *drm_fs_inode_new(void)
-{
-	struct inode *inode;
-	int r;
-
-	r = simple_pin_fs(&drm_fs_type, &drm_fs_mnt, &drm_fs_cnt);
-	if (r < 0) {
-		DRM_ERROR("Cannot mount pseudo fs: %d\n", r);
-		return ERR_PTR(r);
-	}
-
-	inode = alloc_anon_inode_sb(drm_fs_mnt->mnt_sb);
-	if (IS_ERR(inode))
-		simple_release_fs(&drm_fs_mnt, &drm_fs_cnt);
-
-	return inode;
-}
-
-static void drm_fs_inode_free(struct inode *inode)
-{
-	if (inode) {
-		iput(inode);
-		simple_release_fs(&drm_fs_mnt, &drm_fs_cnt);
-	}
-}
-
 /**
  * DOC: component helper usage recommendations
  *
@@ -563,7 +505,7 @@ static void drm_dev_init_release(struct drm_device *dev, void *res)
 {
 	drm_legacy_ctxbitmap_cleanup(dev);
 	drm_legacy_remove_map_hash(dev);
-	drm_fs_inode_free(dev->anon_inode);
+	iput(dev->anon_inode);
 
 	put_device(dev->dev);
 	/* Prevent use-after-free in drm_managed_release when debugging is
@@ -616,7 +558,7 @@ static int drm_dev_init(struct drm_device *dev,
 	if (ret)
 		return ret;
 
-	dev->anon_inode = drm_fs_inode_new();
+	dev->anon_inode = alloc_anon_inode();
 	if (IS_ERR(dev->anon_inode)) {
 		ret = PTR_ERR(dev->anon_inode);
 		DRM_ERROR("Cannot allocate anonymous inode: %d\n", ret);
-- 
2.30.1


^ permalink raw reply related

* [PATCH 3/9] powerpc/pseries: remove the ppc-cmm file system
From: Christoph Hellwig @ 2021-03-09 15:53 UTC (permalink / raw)
  To: Al Viro
  Cc: Jason Gunthorpe, David Hildenbrand, VMware, Inc.,
	Michael S. Tsirkin, linux-kernel, dri-devel, virtualization,
	linux-mm, Minchan Kim, Alex Williamson, Nadav Amit, Daniel Vetter,
	linux-fsdevel, Andrew Morton, linuxppc-dev, Nitin Gupta
In-Reply-To: <20210309155348.974875-1-hch@lst.de>

Just use the generic anon_inode file system.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 arch/powerpc/platforms/pseries/cmm.c | 27 ++-------------------------
 1 file changed, 2 insertions(+), 25 deletions(-)

diff --git a/arch/powerpc/platforms/pseries/cmm.c b/arch/powerpc/platforms/pseries/cmm.c
index 6d36b858b14df1..9d07e6bea7126c 100644
--- a/arch/powerpc/platforms/pseries/cmm.c
+++ b/arch/powerpc/platforms/pseries/cmm.c
@@ -6,6 +6,7 @@
  * Author(s): Brian King (brking@linux.vnet.ibm.com),
  */
 
+#include <linux/anon_inodes.h>
 #include <linux/ctype.h>
 #include <linux/delay.h>
 #include <linux/errno.h>
@@ -502,19 +503,6 @@ static struct notifier_block cmm_mem_nb = {
 };
 
 #ifdef CONFIG_BALLOON_COMPACTION
-static struct vfsmount *balloon_mnt;
-
-static int cmm_init_fs_context(struct fs_context *fc)
-{
-	return init_pseudo(fc, PPC_CMM_MAGIC) ? 0 : -ENOMEM;
-}
-
-static struct file_system_type balloon_fs = {
-	.name = "ppc-cmm",
-	.init_fs_context = cmm_init_fs_context,
-	.kill_sb = kill_anon_super,
-};
-
 static int cmm_migratepage(struct balloon_dev_info *b_dev_info,
 			   struct page *newpage, struct page *page,
 			   enum migrate_mode mode)
@@ -573,19 +561,10 @@ static int cmm_balloon_compaction_init(void)
 	balloon_devinfo_init(&b_dev_info);
 	b_dev_info.migratepage = cmm_migratepage;
 
-	balloon_mnt = kern_mount(&balloon_fs);
-	if (IS_ERR(balloon_mnt)) {
-		rc = PTR_ERR(balloon_mnt);
-		balloon_mnt = NULL;
-		return rc;
-	}
-
-	b_dev_info.inode = alloc_anon_inode_sb(balloon_mnt->mnt_sb);
+	b_dev_info.inode = alloc_anon_inode();
 	if (IS_ERR(b_dev_info.inode)) {
 		rc = PTR_ERR(b_dev_info.inode);
 		b_dev_info.inode = NULL;
-		kern_unmount(balloon_mnt);
-		balloon_mnt = NULL;
 		return rc;
 	}
 
@@ -597,8 +576,6 @@ static void cmm_balloon_compaction_deinit(void)
 	if (b_dev_info.inode)
 		iput(b_dev_info.inode);
 	b_dev_info.inode = NULL;
-	kern_unmount(balloon_mnt);
-	balloon_mnt = NULL;
 }
 #else /* CONFIG_BALLOON_COMPACTION */
 static int cmm_balloon_compaction_init(void)
-- 
2.30.1


^ permalink raw reply related

* [PATCH 2/9] fs: add an argument-less alloc_anon_inode
From: Christoph Hellwig @ 2021-03-09 15:53 UTC (permalink / raw)
  To: Al Viro
  Cc: Jason Gunthorpe, David Hildenbrand, VMware, Inc.,
	Michael S. Tsirkin, linux-kernel, dri-devel, virtualization,
	linux-mm, Minchan Kim, Alex Williamson, Nadav Amit, Daniel Vetter,
	linux-fsdevel, Andrew Morton, linuxppc-dev, Nitin Gupta
In-Reply-To: <20210309155348.974875-1-hch@lst.de>

Add a new alloc_anon_inode helper that allocates an inode on
the anon_inode file system.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 fs/anon_inodes.c            | 15 +++++++++++++--
 include/linux/anon_inodes.h |  1 +
 2 files changed, 14 insertions(+), 2 deletions(-)

diff --git a/fs/anon_inodes.c b/fs/anon_inodes.c
index 4745fc37014332..b6a8ea71920bc3 100644
--- a/fs/anon_inodes.c
+++ b/fs/anon_inodes.c
@@ -63,7 +63,7 @@ static struct inode *anon_inode_make_secure_inode(
 	const struct qstr qname = QSTR_INIT(name, strlen(name));
 	int error;
 
-	inode = alloc_anon_inode_sb(anon_inode_mnt->mnt_sb);
+	inode = alloc_anon_inode();
 	if (IS_ERR(inode))
 		return inode;
 	inode->i_flags &= ~S_PRIVATE;
@@ -225,13 +225,24 @@ int anon_inode_getfd_secure(const char *name, const struct file_operations *fops
 }
 EXPORT_SYMBOL_GPL(anon_inode_getfd_secure);
 
+/**
+ * alloc_anon_inode - create a new anonymous inode
+ *
+ * Create an inode on the anon_inode file system and return it.
+ */
+struct inode *alloc_anon_inode(void)
+{
+	return alloc_anon_inode_sb(anon_inode_mnt->mnt_sb);
+}
+EXPORT_SYMBOL_GPL(alloc_anon_inode);
+
 static int __init anon_inode_init(void)
 {
 	anon_inode_mnt = kern_mount(&anon_inode_fs_type);
 	if (IS_ERR(anon_inode_mnt))
 		panic("anon_inode_init() kernel mount failed (%ld)\n", PTR_ERR(anon_inode_mnt));
 
-	anon_inode_inode = alloc_anon_inode_sb(anon_inode_mnt->mnt_sb);
+	anon_inode_inode = alloc_anon_inode();
 	if (IS_ERR(anon_inode_inode))
 		panic("anon_inode_init() inode allocation failed (%ld)\n", PTR_ERR(anon_inode_inode));
 
diff --git a/include/linux/anon_inodes.h b/include/linux/anon_inodes.h
index 71881a2b6f7860..b5ae9a6eda9923 100644
--- a/include/linux/anon_inodes.h
+++ b/include/linux/anon_inodes.h
@@ -21,6 +21,7 @@ int anon_inode_getfd_secure(const char *name,
 			    const struct file_operations *fops,
 			    void *priv, int flags,
 			    const struct inode *context_inode);
+struct inode *alloc_anon_inode(void);
 
 #endif /* _LINUX_ANON_INODES_H */
 
-- 
2.30.1


^ permalink raw reply related

* make alloc_anon_inode more useful
From: Christoph Hellwig @ 2021-03-09 15:53 UTC (permalink / raw)
  To: Al Viro
  Cc: Jason Gunthorpe, David Hildenbrand, VMware, Inc.,
	Michael S. Tsirkin, linux-kernel, dri-devel, virtualization,
	linux-mm, Minchan Kim, Alex Williamson, Nadav Amit, Daniel Vetter,
	linux-fsdevel, Andrew Morton, linuxppc-dev, Nitin Gupta

Hi all,

this series first renames the existing alloc_anon_inode to
alloc_anon_inode_sb to clearly mark it as requiring a superblock.

It then adds a new alloc_anon_inode that works on the anon_inode
file system super block, thus removing tons of boilerplate code.

The few remainig callers of alloc_anon_inode_sb all use alloc_file_pseudo
later, but might also be ripe for some cleanup.

Diffstat:
 arch/powerpc/platforms/pseries/cmm.c |   27 +-------------
 drivers/dma-buf/dma-buf.c            |    2 -
 drivers/gpu/drm/drm_drv.c            |   64 +----------------------------------
 drivers/misc/cxl/api.c               |    2 -
 drivers/misc/vmw_balloon.c           |   24 +------------
 drivers/scsi/cxlflash/ocxl_hw.c      |    2 -
 drivers/virtio/virtio_balloon.c      |   30 +---------------
 fs/aio.c                             |    2 -
 fs/anon_inodes.c                     |   15 +++++++-
 fs/libfs.c                           |    2 -
 include/linux/anon_inodes.h          |    1 
 include/linux/fs.h                   |    2 -
 kernel/resource.c                    |   30 ++--------------
 mm/z3fold.c                          |   38 +-------------------
 mm/zsmalloc.c                        |   48 +-------------------------
 15 files changed, 39 insertions(+), 250 deletions(-)

^ permalink raw reply

* [PATCH 1/9] fs: rename alloc_anon_inode to alloc_anon_inode_sb
From: Christoph Hellwig @ 2021-03-09 15:53 UTC (permalink / raw)
  To: Al Viro
  Cc: Jason Gunthorpe, David Hildenbrand, VMware, Inc.,
	Michael S. Tsirkin, linux-kernel, dri-devel, virtualization,
	linux-mm, Minchan Kim, Alex Williamson, Nadav Amit, Daniel Vetter,
	linux-fsdevel, Andrew Morton, linuxppc-dev, Nitin Gupta
In-Reply-To: <20210309155348.974875-1-hch@lst.de>

Rename alloc_inode to free the name for a new variant that does not
need boilerplate to create a super_block first.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 arch/powerpc/platforms/pseries/cmm.c | 2 +-
 drivers/dma-buf/dma-buf.c            | 2 +-
 drivers/gpu/drm/drm_drv.c            | 2 +-
 drivers/misc/cxl/api.c               | 2 +-
 drivers/misc/vmw_balloon.c           | 2 +-
 drivers/scsi/cxlflash/ocxl_hw.c      | 2 +-
 drivers/virtio/virtio_balloon.c      | 2 +-
 fs/aio.c                             | 2 +-
 fs/anon_inodes.c                     | 4 ++--
 fs/libfs.c                           | 2 +-
 include/linux/fs.h                   | 2 +-
 kernel/resource.c                    | 2 +-
 mm/z3fold.c                          | 2 +-
 mm/zsmalloc.c                        | 2 +-
 14 files changed, 15 insertions(+), 15 deletions(-)

diff --git a/arch/powerpc/platforms/pseries/cmm.c b/arch/powerpc/platforms/pseries/cmm.c
index 45a3a3022a85c9..6d36b858b14df1 100644
--- a/arch/powerpc/platforms/pseries/cmm.c
+++ b/arch/powerpc/platforms/pseries/cmm.c
@@ -580,7 +580,7 @@ static int cmm_balloon_compaction_init(void)
 		return rc;
 	}
 
-	b_dev_info.inode = alloc_anon_inode(balloon_mnt->mnt_sb);
+	b_dev_info.inode = alloc_anon_inode_sb(balloon_mnt->mnt_sb);
 	if (IS_ERR(b_dev_info.inode)) {
 		rc = PTR_ERR(b_dev_info.inode);
 		b_dev_info.inode = NULL;
diff --git a/drivers/dma-buf/dma-buf.c b/drivers/dma-buf/dma-buf.c
index f264b70c383eb4..dedcc9483352dc 100644
--- a/drivers/dma-buf/dma-buf.c
+++ b/drivers/dma-buf/dma-buf.c
@@ -445,7 +445,7 @@ static inline int is_dma_buf_file(struct file *file)
 static struct file *dma_buf_getfile(struct dma_buf *dmabuf, int flags)
 {
 	struct file *file;
-	struct inode *inode = alloc_anon_inode(dma_buf_mnt->mnt_sb);
+	struct inode *inode = alloc_anon_inode_sb(dma_buf_mnt->mnt_sb);
 
 	if (IS_ERR(inode))
 		return ERR_CAST(inode);
diff --git a/drivers/gpu/drm/drm_drv.c b/drivers/gpu/drm/drm_drv.c
index 20d22e41d7ce74..87e7214a8e3565 100644
--- a/drivers/gpu/drm/drm_drv.c
+++ b/drivers/gpu/drm/drm_drv.c
@@ -519,7 +519,7 @@ static struct inode *drm_fs_inode_new(void)
 		return ERR_PTR(r);
 	}
 
-	inode = alloc_anon_inode(drm_fs_mnt->mnt_sb);
+	inode = alloc_anon_inode_sb(drm_fs_mnt->mnt_sb);
 	if (IS_ERR(inode))
 		simple_release_fs(&drm_fs_mnt, &drm_fs_cnt);
 
diff --git a/drivers/misc/cxl/api.c b/drivers/misc/cxl/api.c
index b493de962153ba..2efbf6c98028ef 100644
--- a/drivers/misc/cxl/api.c
+++ b/drivers/misc/cxl/api.c
@@ -73,7 +73,7 @@ static struct file *cxl_getfile(const char *name,
 		goto err_module;
 	}
 
-	inode = alloc_anon_inode(cxl_vfs_mount->mnt_sb);
+	inode = alloc_anon_inode_sb(cxl_vfs_mount->mnt_sb);
 	if (IS_ERR(inode)) {
 		file = ERR_CAST(inode);
 		goto err_fs;
diff --git a/drivers/misc/vmw_balloon.c b/drivers/misc/vmw_balloon.c
index b837e7eba5f7dc..5d057a05ddbee8 100644
--- a/drivers/misc/vmw_balloon.c
+++ b/drivers/misc/vmw_balloon.c
@@ -1900,7 +1900,7 @@ static __init int vmballoon_compaction_init(struct vmballoon *b)
 		return PTR_ERR(vmballoon_mnt);
 
 	b->b_dev_info.migratepage = vmballoon_migratepage;
-	b->b_dev_info.inode = alloc_anon_inode(vmballoon_mnt->mnt_sb);
+	b->b_dev_info.inode = alloc_anon_inode_sb(vmballoon_mnt->mnt_sb);
 
 	if (IS_ERR(b->b_dev_info.inode))
 		return PTR_ERR(b->b_dev_info.inode);
diff --git a/drivers/scsi/cxlflash/ocxl_hw.c b/drivers/scsi/cxlflash/ocxl_hw.c
index 244fc27215dc79..40184ed926b557 100644
--- a/drivers/scsi/cxlflash/ocxl_hw.c
+++ b/drivers/scsi/cxlflash/ocxl_hw.c
@@ -88,7 +88,7 @@ static struct file *ocxlflash_getfile(struct device *dev, const char *name,
 		goto err2;
 	}
 
-	inode = alloc_anon_inode(ocxlflash_vfs_mount->mnt_sb);
+	inode = alloc_anon_inode_sb(ocxlflash_vfs_mount->mnt_sb);
 	if (IS_ERR(inode)) {
 		rc = PTR_ERR(inode);
 		dev_err(dev, "%s: alloc_anon_inode failed rc=%d\n",
diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c
index 8985fc2cea8615..cae76ee5bdd688 100644
--- a/drivers/virtio/virtio_balloon.c
+++ b/drivers/virtio/virtio_balloon.c
@@ -916,7 +916,7 @@ static int virtballoon_probe(struct virtio_device *vdev)
 	}
 
 	vb->vb_dev_info.migratepage = virtballoon_migratepage;
-	vb->vb_dev_info.inode = alloc_anon_inode(balloon_mnt->mnt_sb);
+	vb->vb_dev_info.inode = alloc_anon_inode_sb(balloon_mnt->mnt_sb);
 	if (IS_ERR(vb->vb_dev_info.inode)) {
 		err = PTR_ERR(vb->vb_dev_info.inode);
 		goto out_kern_unmount;
diff --git a/fs/aio.c b/fs/aio.c
index 1f32da13d39ee6..d1c2aa7fd6de7c 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -234,7 +234,7 @@ static const struct address_space_operations aio_ctx_aops;
 static struct file *aio_private_file(struct kioctx *ctx, loff_t nr_pages)
 {
 	struct file *file;
-	struct inode *inode = alloc_anon_inode(aio_mnt->mnt_sb);
+	struct inode *inode = alloc_anon_inode_sb(aio_mnt->mnt_sb);
 	if (IS_ERR(inode))
 		return ERR_CAST(inode);
 
diff --git a/fs/anon_inodes.c b/fs/anon_inodes.c
index a280156138ed89..4745fc37014332 100644
--- a/fs/anon_inodes.c
+++ b/fs/anon_inodes.c
@@ -63,7 +63,7 @@ static struct inode *anon_inode_make_secure_inode(
 	const struct qstr qname = QSTR_INIT(name, strlen(name));
 	int error;
 
-	inode = alloc_anon_inode(anon_inode_mnt->mnt_sb);
+	inode = alloc_anon_inode_sb(anon_inode_mnt->mnt_sb);
 	if (IS_ERR(inode))
 		return inode;
 	inode->i_flags &= ~S_PRIVATE;
@@ -231,7 +231,7 @@ static int __init anon_inode_init(void)
 	if (IS_ERR(anon_inode_mnt))
 		panic("anon_inode_init() kernel mount failed (%ld)\n", PTR_ERR(anon_inode_mnt));
 
-	anon_inode_inode = alloc_anon_inode(anon_inode_mnt->mnt_sb);
+	anon_inode_inode = alloc_anon_inode_sb(anon_inode_mnt->mnt_sb);
 	if (IS_ERR(anon_inode_inode))
 		panic("anon_inode_init() inode allocation failed (%ld)\n", PTR_ERR(anon_inode_inode));
 
diff --git a/fs/libfs.c b/fs/libfs.c
index e2de5401abca5a..600bebc1cd847f 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -1216,7 +1216,7 @@ static int anon_set_page_dirty(struct page *page)
 	return 0;
 };
 
-struct inode *alloc_anon_inode(struct super_block *s)
+struct inode *alloc_anon_inode_sb(struct super_block *s)
 {
 	static const struct address_space_operations anon_aops = {
 		.set_page_dirty = anon_set_page_dirty,
diff --git a/include/linux/fs.h b/include/linux/fs.h
index ec8f3ddf4a6aa8..52387368af3c00 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -3286,7 +3286,7 @@ extern int simple_write_end(struct file *file, struct address_space *mapping,
 			loff_t pos, unsigned len, unsigned copied,
 			struct page *page, void *fsdata);
 extern int always_delete_dentry(const struct dentry *);
-extern struct inode *alloc_anon_inode(struct super_block *);
+extern struct inode *alloc_anon_inode_sb(struct super_block *);
 extern int simple_nosetlease(struct file *, long, struct file_lock **, void **);
 extern const struct dentry_operations simple_dentry_operations;
 
diff --git a/kernel/resource.c b/kernel/resource.c
index 627e61b0c12418..0fd091a3f2fc66 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -1863,7 +1863,7 @@ static int __init iomem_init_inode(void)
 		return rc;
 	}
 
-	inode = alloc_anon_inode(iomem_vfs_mount->mnt_sb);
+	inode = alloc_anon_inode_sb(iomem_vfs_mount->mnt_sb);
 	if (IS_ERR(inode)) {
 		rc = PTR_ERR(inode);
 		pr_err("Cannot allocate inode for iomem: %d\n", rc);
diff --git a/mm/z3fold.c b/mm/z3fold.c
index b5dafa7e44e429..e7cd9298b221f5 100644
--- a/mm/z3fold.c
+++ b/mm/z3fold.c
@@ -376,7 +376,7 @@ static void z3fold_unmount(void)
 static const struct address_space_operations z3fold_aops;
 static int z3fold_register_migration(struct z3fold_pool *pool)
 {
-	pool->inode = alloc_anon_inode(z3fold_mnt->mnt_sb);
+	pool->inode = alloc_anon_inode_sb(z3fold_mnt->mnt_sb);
 	if (IS_ERR(pool->inode)) {
 		pool->inode = NULL;
 		return 1;
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index 30c358b7202510..a6449a2ad861de 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -2086,7 +2086,7 @@ static const struct address_space_operations zsmalloc_aops = {
 
 static int zs_register_migration(struct zs_pool *pool)
 {
-	pool->inode = alloc_anon_inode(zsmalloc_mnt->mnt_sb);
+	pool->inode = alloc_anon_inode_sb(zsmalloc_mnt->mnt_sb);
 	if (IS_ERR(pool->inode)) {
 		pool->inode = NULL;
 		return 1;
-- 
2.30.1


^ permalink raw reply related

* Re: [PATCH v2 7/8] powerpc/xive: Fix xmon command "dxi"
From: Cédric Le Goater @ 2021-03-09 15:49 UTC (permalink / raw)
  To: Greg Kurz; +Cc: linuxppc-dev, kernel test robot, Dan Carpenter
In-Reply-To: <20210309112325.7b161cc7@bahia.lan>

On 3/9/21 11:23 AM, Greg Kurz wrote:
> On Wed, 3 Mar 2021 18:48:56 +0100
> Cédric Le Goater <clg@kaod.org> wrote:
> 
>> When under xmon, the "dxi" command dumps the state of the XIVE
>> interrupts. If an interrupt number is specified, only the state of
>> the associated XIVE interrupt is dumped. This form of the command
>> lacks an irq_data parameter which is nevertheless used by
>> xmon_xive_get_irq_config(), leading to an xmon crash.
>>
>> Fix that by doing a lookup in the system IRQ mapping to query the IRQ
>> descriptor data. Invalid interrupt numbers, or not belonging to the
>> XIVE IRQ domain, OPAL event interrupt number for instance, should be
>> caught by the previous query done at the firmware level.
>>
>> Reported-by: kernel test robot <lkp@intel.com>
>> Reported-by: Dan Carpenter <dan.carpenter@oracle.com>
>> Fixes: 97ef27507793 ("powerpc/xive: Fix xmon support on the PowerNV platform")
>> Signed-off-by: Cédric Le Goater <clg@kaod.org>
>> ---
> 
> I've tested this in a KVM guest and it seems to do the job.
> 
> 6:mon> dxi 1201
> IRQ 0x00001201 : target=0xfffffc00 prio=ff lirq=0x0 flags= LH PQ=-Q
> 
> Bad HW irq numbers are filtered by the hypervisor:
> 
> 6:mon> dxi bad
> [  696.390577] xive: H_INT_GET_SOURCE_CONFIG lisn=2989 failed -55
> IRQ 0x00000bad : no config rc=-6
> 
> Note that this also allows to show IPIs:
> 
> 6:mon> dxi 0
> IRQ 0x00000000 : target=0x0 prio=06 lirq=0x10 
> 
> This is a bit inconsistent with output of the 0-argument form of "dxi",

It's an hidden feature ! :) 

Yes. You can query at the FW level the configuration of any valid HW 
interrupt number where as "dxi" without an argument only loops on the 
XIVE IRQ domain which does not include the XIVE CPU IPIs which are 
special. You should "dxa" for these. 

> which filters them out for a reason that isn't obvious to me. 

For historical reason. XIVE support for PowerNV was the first to reach 
Linux. If you run the same xmon commands on a PowerNV machine (you could 
use QEMU), the ouput is different. it has more low level details.

> No big deal though, this should be addressed in another patch anyway.

We could simplify the xmon helpers to be sync with the debugfs one
and the QEMU/KVM "info pic" command. I agree.

Thanks,

C. 


> Reviewed-and-tested-by: Greg Kurz <groug@kaod.org>
> 
>>  arch/powerpc/sysdev/xive/common.c | 14 ++++++++++----
>>  1 file changed, 10 insertions(+), 4 deletions(-)
>>
>> diff --git a/arch/powerpc/sysdev/xive/common.c b/arch/powerpc/sysdev/xive/common.c
>> index f6b7b15bbb3a..8eefd152b947 100644
>> --- a/arch/powerpc/sysdev/xive/common.c
>> +++ b/arch/powerpc/sysdev/xive/common.c
>> @@ -255,17 +255,20 @@ notrace void xmon_xive_do_dump(int cpu)
>>  	xmon_printf("\n");
>>  }
>>  
>> +static struct irq_data *xive_get_irq_data(u32 hw_irq)
>> +{
>> +	unsigned int irq = irq_find_mapping(xive_irq_domain, hw_irq);
>> +
>> +	return irq ? irq_get_irq_data(irq) : NULL;
>> +}
>> +
>>  int xmon_xive_get_irq_config(u32 hw_irq, struct irq_data *d)
>>  {
>> -	struct irq_chip *chip = irq_data_get_irq_chip(d);
>>  	int rc;
>>  	u32 target;
>>  	u8 prio;
>>  	u32 lirq;
>>  
>> -	if (!is_xive_irq(chip))
>> -		return -EINVAL;
>> -
>>  	rc = xive_ops->get_irq_config(hw_irq, &target, &prio, &lirq);
>>  	if (rc) {
>>  		xmon_printf("IRQ 0x%08x : no config rc=%d\n", hw_irq, rc);
>> @@ -275,6 +278,9 @@ int xmon_xive_get_irq_config(u32 hw_irq, struct irq_data *d)
>>  	xmon_printf("IRQ 0x%08x : target=0x%x prio=%02x lirq=0x%x ",
>>  		    hw_irq, target, prio, lirq);
>>  
>> +	if (!d)
>> +		d = xive_get_irq_data(hw_irq);
>> +
>>  	if (d) {
>>  		struct xive_irq_data *xd = irq_data_get_irq_handler_data(d);
>>  		u64 val = xive_esb_read(xd, XIVE_ESB_GET);
> 


^ permalink raw reply

* Re: [PATCH 3/3] powerpc/qspinlock: Use generic smp_cond_load_relaxed
From: Davidlohr Bueso @ 2021-03-09 15:46 UTC (permalink / raw)
  To: Michal Such�nek
  Cc: Davidlohr Bueso, peterz, will, linux-kernel, npiggin, mingo,
	paulus, longman, linuxppc-dev
In-Reply-To: <20210309093912.GW6564@kitsune.suse.cz>

On Tue, 09 Mar 2021, Michal Such�nek wrote:

>On Mon, Mar 08, 2021 at 05:59:50PM -0800, Davidlohr Bueso wrote:
>> 49a7d46a06c3 (powerpc: Implement smp_cond_load_relaxed()) added
>> busy-waiting pausing with a preferred SMT priority pattern, lowering
>> the priority (reducing decode cycles) during the whole loop slowpath.
>>
>> However, data shows that while this pattern works well with simple
>                                              ^^^^^^^^^^^^^^^^^^^^^^
>> spinlocks, queued spinlocks benefit more being kept in medium priority,
>> with a cpu_relax() instead, being a low+medium combo on powerpc.
>...
>>
>> diff --git a/arch/powerpc/include/asm/barrier.h b/arch/powerpc/include/asm/barrier.h
>> index aecfde829d5d..7ae29cfb06c0 100644
>> --- a/arch/powerpc/include/asm/barrier.h
>> +++ b/arch/powerpc/include/asm/barrier.h
>> @@ -80,22 +80,6 @@ do {									\
>>	___p1;								\
>>  })
>>
>> -#ifdef CONFIG_PPC64
>Maybe it should be kept for the simple spinlock case then?

It is kept, note that simple spinlocks don't use smp_cond_load_relaxed,
but instead deal with the priorities in arch_spin_lock(), so it will
spin in low priority until it sees a chance to take the lock, where
it switches back to medium.

Thanks,
Davidlohr

^ permalink raw reply

* Re: [PATCH v2 4/8] powerpc/xive: Simplify xive_core_debug_show()
From: Cédric Le Goater @ 2021-03-09 15:39 UTC (permalink / raw)
  To: Greg Kurz; +Cc: linuxppc-dev
In-Reply-To: <20210309104220.663840a2@bahia.lan>

On 3/9/21 10:42 AM, Greg Kurz wrote:
> On Tue, 9 Mar 2021 10:13:39 +0100
> Greg Kurz <groug@kaod.org> wrote:
> 
>> On Mon, 8 Mar 2021 19:11:11 +0100
>> Cédric Le Goater <clg@kaod.org> wrote:
>>
>>> On 3/8/21 7:07 PM, Greg Kurz wrote:
>>>> On Wed, 3 Mar 2021 18:48:53 +0100
>>>> Cédric Le Goater <clg@kaod.org> wrote:
>>>>
>>>>> Now that the IPI interrupt has its own domain, the checks on the HW
>>>>> interrupt number XIVE_IPI_HW_IRQ and on the chip can be replaced by a
>>>>> check on the domain.
>>>>>
>>>>> Signed-off-by: Cédric Le Goater <clg@kaod.org>
>>>>> ---
>>>>
>>>> Shouldn't this have the following tags ?
>>>>
>>>> Reported-by: kernel test robot <lkp@intel.com>
>>>> Reported-by: Dan Carpenter <dan.carpenter@oracle.com>
>>>> Fixes: 930914b7d528 ("powerpc/xive: Add a debugfs file to dump internal XIVE state")
>>>
>>> The next patch has because it removes the useless check on irq_data.
>>>  
>>
>> Ok I get it. This report isn't about an actual crash. Just a false
>> positive because of the not needed check in the caller.
>>
> 
> Hrm... I meant because of the check in xive_debug_show_irq(). On the
> contrary, the check removed by this patch in xive_core_debug_show()
> was rather an explicit hint that xive_debug_show_irq() couldn't be
> called with d being NULL.

yes. irq_desc_get_irq_data() does not return a NULL value and 
xive_debug_show_irq() is only called from the for_each_irq_desc()
loop. 


C.


> 
>>> C.
>>>
>>>>
>>>> Anyway,
>>>>
>>>> Reviewed-by: Greg Kurz <groug@kaod.org>
>>>>
>>>>>  arch/powerpc/sysdev/xive/common.c | 18 ++++--------------
>>>>>  1 file changed, 4 insertions(+), 14 deletions(-)
>>>>>
>>>>> diff --git a/arch/powerpc/sysdev/xive/common.c b/arch/powerpc/sysdev/xive/common.c
>>>>> index 678680531d26..7581cb12bb53 100644
>>>>> --- a/arch/powerpc/sysdev/xive/common.c
>>>>> +++ b/arch/powerpc/sysdev/xive/common.c
>>>>> @@ -1579,17 +1579,14 @@ static void xive_debug_show_cpu(struct seq_file *m, int cpu)
>>>>>  	seq_puts(m, "\n");
>>>>>  }
>>>>>  
>>>>> -static void xive_debug_show_irq(struct seq_file *m, u32 hw_irq, struct irq_data *d)
>>>>> +static void xive_debug_show_irq(struct seq_file *m, struct irq_data *d)
>>>>>  {
>>>>> -	struct irq_chip *chip = irq_data_get_irq_chip(d);
>>>>> +	unsigned int hw_irq = (unsigned int)irqd_to_hwirq(d);
>>>>>  	int rc;
>>>>>  	u32 target;
>>>>>  	u8 prio;
>>>>>  	u32 lirq;
>>>>>  
>>>>> -	if (!is_xive_irq(chip))
>>>>> -		return;
>>>>> -
>>>>>  	rc = xive_ops->get_irq_config(hw_irq, &target, &prio, &lirq);
>>>>>  	if (rc) {
>>>>>  		seq_printf(m, "IRQ 0x%08x : no config rc=%d\n", hw_irq, rc);
>>>>> @@ -1627,16 +1624,9 @@ static int xive_core_debug_show(struct seq_file *m, void *private)
>>>>>  
>>>>>  	for_each_irq_desc(i, desc) {
>>>>>  		struct irq_data *d = irq_desc_get_irq_data(desc);
>>>>> -		unsigned int hw_irq;
>>>>> -
>>>>> -		if (!d)
>>>>> -			continue;
>>>>> -
>>>>> -		hw_irq = (unsigned int)irqd_to_hwirq(d);
>>>>>  
>>>>> -		/* IPIs are special (HW number 0) */
>>>>> -		if (hw_irq != XIVE_IPI_HW_IRQ)
>>>>> -			xive_debug_show_irq(m, hw_irq, d);
>>>>> +		if (d->domain == xive_irq_domain)
>>>>> +			xive_debug_show_irq(m, d);
>>>>>  	}
>>>>>  	return 0;
>>>>>  }
>>>>
>>>
>>
> 


^ permalink raw reply

* Re: [PATCH v2 1/8] powerpc/xive: Use cpu_to_node() instead of ibm,chip-id property
From: Cédric Le Goater @ 2021-03-09 15:33 UTC (permalink / raw)
  To: Greg Kurz; +Cc: Daniel Henrique Barboza, linuxppc-dev
In-Reply-To: <20210308181359.789c143b@bahia.lan>

On 3/8/21 6:13 PM, Greg Kurz wrote:
> On Wed, 3 Mar 2021 18:48:50 +0100
> Cédric Le Goater <clg@kaod.org> wrote:
> 
>> The 'chip_id' field of the XIVE CPU structure is used to choose a
>> target for a source located on the same chip when possible. This field
>> is assigned on the PowerNV platform using the "ibm,chip-id" property
>> on pSeries under KVM when NUMA nodes are defined but it is undefined
> 
> This sentence seems to have a syntax problem... like it is missing an
> 'and' before 'on pSeries'.

ah yes, or simply a comma.

>> under PowerVM. The XIVE source structure has a similar field
>> 'src_chip' which is only assigned on the PowerNV platform.
>>
>> cpu_to_node() returns a compatible value on all platforms, 0 being the
>> default node. It will also give us the opportunity to set the affinity
>> of a source on pSeries when we can localize them.
>>
> 
> IIUC this relies on the fact that the NUMA node id is == to chip id
> on PowerNV, i.e. xc->chip_id which is passed to OPAL remain stable
> with this change.

Linux sets the NUMA node in numa_setup_cpu(). On pseries, the hcall 
H_HOME_NODE_ASSOCIATIVITY returns the node id if I am correct (Daniel
in Cc:)

On PowerNV, Linux uses "ibm,associativity" property of the CPU to find
the node id. This value is built from the chip id in OPAL, so the 
value returned by cpu_to_node(cpu) and the value of the "ibm,chip-id"
property are unlikely to be different.

cpu_to_node(cpu) is used in many places to allocate the structures 
locally to the owning node. XIVE is not an exception (see below in the 
same patch), it is better to be consistent and get the same information 
(node id) using the same routine.


In Linux, "ibm,chip-id" is only used in low level PowerNV drivers : 
LPC, XSCOM, RNG, VAS, NX. XIVE should be in that list also but skiboot
unifies the controllers of the system to only expose one the OS. This
is problematic and should be changed but it's another topic.


> On the other hand, you have the pSeries case under PowerVM that
> doesn't xc->chip_id, which isn't passed to any hcall AFAICT.

yes "ibm,chip-id" is an OPAL concept unfortunately and it has no meaning 
under PAPR. xc->chip_id on pseries (PowerVM) will contains an invalid 
chip id. 

QEMU/KVM exposes "ibm,chip-id" but it's not used. (its value is not
always correct btw)

> It looks like the chip id is only used for localization purpose in
> this case, right ?

Yes and PAPR sources are not localized. So it's not used. MSI sources 
could be if we rewrote the MSI driver.

> In this case, what about doing this change for pSeries only,
> somewhere in spapr.c ?

The IPI code is common to all platforms and all have the same issue. 
I rather not.

Thanks,

C.
 
>> Signed-off-by: Cédric Le Goater <clg@kaod.org>
>> ---
>>  arch/powerpc/sysdev/xive/common.c | 7 +------
>>  1 file changed, 1 insertion(+), 6 deletions(-)
>>
>> diff --git a/arch/powerpc/sysdev/xive/common.c b/arch/powerpc/sysdev/xive/common.c
>> index 595310e056f4..b8e456da28aa 100644
>> --- a/arch/powerpc/sysdev/xive/common.c
>> +++ b/arch/powerpc/sysdev/xive/common.c
>> @@ -1335,16 +1335,11 @@ static int xive_prepare_cpu(unsigned int cpu)
>>  
>>  	xc = per_cpu(xive_cpu, cpu);
>>  	if (!xc) {
>> -		struct device_node *np;
>> -
>>  		xc = kzalloc_node(sizeof(struct xive_cpu),
>>  				  GFP_KERNEL, cpu_to_node(cpu));
>>  		if (!xc)
>>  			return -ENOMEM;
>> -		np = of_get_cpu_node(cpu, NULL);
>> -		if (np)
>> -			xc->chip_id = of_get_ibm_chip_id(np);
>> -		of_node_put(np);
>> +		xc->chip_id = cpu_to_node(cpu);
>>  		xc->hw_ipi = XIVE_BAD_IRQ;
>>  
>>  		per_cpu(xive_cpu, cpu) = xc;
> 


^ permalink raw reply

* [PATCH 4/4] tools/perf: Support pipeline stage cycles for powerpc
From: Athira Rajeev @ 2021-03-09 14:04 UTC (permalink / raw)
  To: linuxppc-dev, linux-kernel, linux-perf-users, mpe, acme, jolsa
  Cc: kan.liang, ravi.bangoria, peterz, maddy, kjain
In-Reply-To: <1615298640-1529-1-git-send-email-atrajeev@linux.vnet.ibm.com>

The pipeline stage cycles details can be recorded on powerpc from
the contents of Performance Monitor Unit (PMU) registers. On
ISA v3.1 platform, sampling registers exposes the cycles spent in
different pipeline stages. Patch adds perf tools support to present
two of the cycle counter information along with memory latency (weight).

Re-use the field 'ins_lat' for storing the first pipeline stage cycle.
This is stored in 'var2_w' field of 'perf_sample_weight'.

Add a new field 'p_stage_cyc' to store the second pipeline stage cycle
which is stored in 'var3_w' field of perf_sample_weight.

Add new sort function 'Pipeline Stage Cycle' and include this in
default_mem_sort_order[]. This new sort function may be used to denote
some other pipeline stage in another architecture. So add this to
list of sort entries that can have dynamic header string.

Signed-off-by: Athira Rajeev <atrajeev@linux.vnet.ibm.com>
---
 tools/perf/Documentation/perf-report.txt |  1 +
 tools/perf/arch/powerpc/util/event.c     | 18 ++++++++++++++++--
 tools/perf/util/event.h                  |  1 +
 tools/perf/util/hist.c                   | 11 ++++++++---
 tools/perf/util/hist.h                   |  1 +
 tools/perf/util/session.c                |  4 +++-
 tools/perf/util/sort.c                   | 24 ++++++++++++++++++++++--
 tools/perf/util/sort.h                   |  2 ++
 8 files changed, 54 insertions(+), 8 deletions(-)

diff --git a/tools/perf/Documentation/perf-report.txt b/tools/perf/Documentation/perf-report.txt
index f546b5e9db05..9691d9c227ba 100644
--- a/tools/perf/Documentation/perf-report.txt
+++ b/tools/perf/Documentation/perf-report.txt
@@ -112,6 +112,7 @@ OPTIONS
 	- ins_lat: Instruction latency in core cycles. This is the global instruction
 	  latency
 	- local_ins_lat: Local instruction latency version
+	- p_stage_cyc: Number of cycles spent in a pipeline stage.
 
 	By default, comm, dso and symbol keys are used.
 	(i.e. --sort comm,dso,symbol)
diff --git a/tools/perf/arch/powerpc/util/event.c b/tools/perf/arch/powerpc/util/event.c
index f49d32c2c8ae..b80fbee83b6e 100644
--- a/tools/perf/arch/powerpc/util/event.c
+++ b/tools/perf/arch/powerpc/util/event.c
@@ -18,8 +18,11 @@ void arch_perf_parse_sample_weight(struct perf_sample *data,
 	weight.full = *array;
 	if (type & PERF_SAMPLE_WEIGHT)
 		data->weight = weight.full;
-	else
+	else {
 		data->weight = weight.var1_dw;
+		data->ins_lat = weight.var2_w;
+		data->p_stage_cyc = weight.var3_w;
+	}
 }
 
 void arch_perf_synthesize_sample_weight(const struct perf_sample *data,
@@ -27,6 +30,17 @@ void arch_perf_synthesize_sample_weight(const struct perf_sample *data,
 {
 	*array = data->weight;
 
-	if (type & PERF_SAMPLE_WEIGHT_STRUCT)
+	if (type & PERF_SAMPLE_WEIGHT_STRUCT) {
 		*array &= 0xffffffff;
+		*array |= ((u64)data->ins_lat << 32);
+	}
+}
+
+const char *arch_perf_header_entry__add(const char *se_header)
+{
+	if (!strcmp(se_header, "Local INSTR Latency"))
+		return "Finish Cyc";
+	else if (!strcmp(se_header, "Pipeline Stage Cycle"))
+		return "Dispatch Cyc";
+	return se_header;
 }
diff --git a/tools/perf/util/event.h b/tools/perf/util/event.h
index 89b149e2e70a..65f89e80916f 100644
--- a/tools/perf/util/event.h
+++ b/tools/perf/util/event.h
@@ -147,6 +147,7 @@ struct perf_sample {
 	u8  cpumode;
 	u16 misc;
 	u16 ins_lat;
+	u16 p_stage_cyc;
 	bool no_hw_idx;		/* No hw_idx collected in branch_stack */
 	char insn[MAX_INSN];
 	void *raw_data;
diff --git a/tools/perf/util/hist.c b/tools/perf/util/hist.c
index c82f5fc26af8..9299ee535518 100644
--- a/tools/perf/util/hist.c
+++ b/tools/perf/util/hist.c
@@ -211,6 +211,7 @@ void hists__calc_col_len(struct hists *hists, struct hist_entry *h)
 	hists__new_col_len(hists, HISTC_MEM_BLOCKED, 10);
 	hists__new_col_len(hists, HISTC_LOCAL_INS_LAT, 13);
 	hists__new_col_len(hists, HISTC_GLOBAL_INS_LAT, 13);
+	hists__new_col_len(hists, HISTC_P_STAGE_CYC, 13);
 	if (symbol_conf.nanosecs)
 		hists__new_col_len(hists, HISTC_TIME, 16);
 	else
@@ -289,13 +290,14 @@ static long hist_time(unsigned long htime)
 }
 
 static void he_stat__add_period(struct he_stat *he_stat, u64 period,
-				u64 weight, u64 ins_lat)
+				u64 weight, u64 ins_lat, u64 p_stage_cyc)
 {
 
 	he_stat->period		+= period;
 	he_stat->weight		+= weight;
 	he_stat->nr_events	+= 1;
 	he_stat->ins_lat	+= ins_lat;
+	he_stat->p_stage_cyc	+= p_stage_cyc;
 }
 
 static void he_stat__add_stat(struct he_stat *dest, struct he_stat *src)
@@ -308,6 +310,7 @@ static void he_stat__add_stat(struct he_stat *dest, struct he_stat *src)
 	dest->nr_events		+= src->nr_events;
 	dest->weight		+= src->weight;
 	dest->ins_lat		+= src->ins_lat;
+	dest->p_stage_cyc		+= src->p_stage_cyc;
 }
 
 static void he_stat__decay(struct he_stat *he_stat)
@@ -597,6 +600,7 @@ static struct hist_entry *hists__findnew_entry(struct hists *hists,
 	u64 period = entry->stat.period;
 	u64 weight = entry->stat.weight;
 	u64 ins_lat = entry->stat.ins_lat;
+	u64 p_stage_cyc = entry->stat.p_stage_cyc;
 	bool leftmost = true;
 
 	p = &hists->entries_in->rb_root.rb_node;
@@ -615,11 +619,11 @@ static struct hist_entry *hists__findnew_entry(struct hists *hists,
 
 		if (!cmp) {
 			if (sample_self) {
-				he_stat__add_period(&he->stat, period, weight, ins_lat);
+				he_stat__add_period(&he->stat, period, weight, ins_lat, p_stage_cyc);
 				hist_entry__add_callchain_period(he, period);
 			}
 			if (symbol_conf.cumulate_callchain)
-				he_stat__add_period(he->stat_acc, period, weight, ins_lat);
+				he_stat__add_period(he->stat_acc, period, weight, ins_lat, p_stage_cyc);
 
 			/*
 			 * This mem info was allocated from sample__resolve_mem
@@ -731,6 +735,7 @@ static void hists__res_sample(struct hist_entry *he, struct perf_sample *sample)
 			.period	= sample->period,
 			.weight = sample->weight,
 			.ins_lat = sample->ins_lat,
+			.p_stage_cyc = sample->p_stage_cyc,
 		},
 		.parent = sym_parent,
 		.filtered = symbol__parent_filter(sym_parent) | al->filtered,
diff --git a/tools/perf/util/hist.h b/tools/perf/util/hist.h
index 3c537232294b..e2faa745c8d6 100644
--- a/tools/perf/util/hist.h
+++ b/tools/perf/util/hist.h
@@ -75,6 +75,7 @@ enum hist_column {
 	HISTC_MEM_BLOCKED,
 	HISTC_LOCAL_INS_LAT,
 	HISTC_GLOBAL_INS_LAT,
+	HISTC_P_STAGE_CYC,
 	HISTC_NR_COLS, /* Last entry */
 };
 
diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c
index 859832a82496..a6fed96d783d 100644
--- a/tools/perf/util/session.c
+++ b/tools/perf/util/session.c
@@ -1302,8 +1302,10 @@ static void dump_sample(struct evsel *evsel, union perf_event *event,
 
 	if (sample_type & PERF_SAMPLE_WEIGHT_TYPE) {
 		printf("... weight: %" PRIu64 "", sample->weight);
-			if (sample_type & PERF_SAMPLE_WEIGHT_STRUCT)
+			if (sample_type & PERF_SAMPLE_WEIGHT_STRUCT) {
 				printf(",0x%"PRIx16"", sample->ins_lat);
+				printf(",0x%"PRIx16"", sample->p_stage_cyc);
+			}
 		printf("\n");
 	}
 
diff --git a/tools/perf/util/sort.c b/tools/perf/util/sort.c
index 741a6df29fa0..cbb3899e7eca 100644
--- a/tools/perf/util/sort.c
+++ b/tools/perf/util/sort.c
@@ -37,7 +37,7 @@
 const char	*parent_pattern = default_parent_pattern;
 const char	*default_sort_order = "comm,dso,symbol";
 const char	default_branch_sort_order[] = "comm,dso_from,symbol_from,symbol_to,cycles";
-const char	default_mem_sort_order[] = "local_weight,mem,sym,dso,symbol_daddr,dso_daddr,snoop,tlb,locked,blocked,local_ins_lat";
+const char	default_mem_sort_order[] = "local_weight,mem,sym,dso,symbol_daddr,dso_daddr,snoop,tlb,locked,blocked,local_ins_lat,p_stage_cyc";
 const char	default_top_sort_order[] = "dso,symbol";
 const char	default_diff_sort_order[] = "dso,symbol";
 const char	default_tracepoint_sort_order[] = "trace";
@@ -46,7 +46,7 @@
 regex_t		ignore_callees_regex;
 int		have_ignore_callees = 0;
 enum sort_mode	sort__mode = SORT_MODE__NORMAL;
-const char	*dynamic_headers[] = {"local_ins_lat"};
+const char	*dynamic_headers[] = {"local_ins_lat", "p_stage_cyc"};
 
 /*
  * Replaces all occurrences of a char used with the:
@@ -1410,6 +1410,25 @@ struct sort_entry sort_global_ins_lat = {
 	.se_width_idx	= HISTC_GLOBAL_INS_LAT,
 };
 
+static int64_t
+sort__global_p_stage_cyc_cmp(struct hist_entry *left, struct hist_entry *right)
+{
+	return left->stat.p_stage_cyc - right->stat.p_stage_cyc;
+}
+
+static int hist_entry__p_stage_cyc_snprintf(struct hist_entry *he, char *bf,
+					size_t size, unsigned int width)
+{
+	return repsep_snprintf(bf, size, "%-*u", width, he->stat.p_stage_cyc);
+}
+
+struct sort_entry sort_p_stage_cyc = {
+	.se_header      = "Pipeline Stage Cycle",
+	.se_cmp         = sort__global_p_stage_cyc_cmp,
+	.se_snprintf	= hist_entry__p_stage_cyc_snprintf,
+	.se_width_idx	= HISTC_P_STAGE_CYC,
+};
+
 struct sort_entry sort_mem_daddr_sym = {
 	.se_header	= "Data Symbol",
 	.se_cmp		= sort__daddr_cmp,
@@ -1853,6 +1872,7 @@ static void sort_dimension_add_dynamic_header(struct sort_dimension *sd)
 	DIM(SORT_CODE_PAGE_SIZE, "code_page_size", sort_code_page_size),
 	DIM(SORT_LOCAL_INS_LAT, "local_ins_lat", sort_local_ins_lat),
 	DIM(SORT_GLOBAL_INS_LAT, "ins_lat", sort_global_ins_lat),
+	DIM(SORT_P_STAGE_CYC, "p_stage_cyc", sort_p_stage_cyc),
 };
 
 #undef DIM
diff --git a/tools/perf/util/sort.h b/tools/perf/util/sort.h
index 63f67a3f3630..23b20cbbc846 100644
--- a/tools/perf/util/sort.h
+++ b/tools/perf/util/sort.h
@@ -51,6 +51,7 @@ struct he_stat {
 	u64			period_guest_us;
 	u64			weight;
 	u64			ins_lat;
+	u64			p_stage_cyc;
 	u32			nr_events;
 };
 
@@ -234,6 +235,7 @@ enum sort_type {
 	SORT_CODE_PAGE_SIZE,
 	SORT_LOCAL_INS_LAT,
 	SORT_GLOBAL_INS_LAT,
+	SORT_P_STAGE_CYC,
 
 	/* branch stack specific sort keys */
 	__SORT_BRANCH_STACK,
-- 
1.8.3.1


^ permalink raw reply related

* [PATCH 3/4] tools/perf: Add powerpc support for PERF_SAMPLE_WEIGHT_STRUCT
From: Athira Rajeev @ 2021-03-09 14:03 UTC (permalink / raw)
  To: linuxppc-dev, linux-kernel, linux-perf-users, mpe, acme, jolsa
  Cc: kan.liang, ravi.bangoria, peterz, maddy, kjain
In-Reply-To: <1615298640-1529-1-git-send-email-atrajeev@linux.vnet.ibm.com>

Add arch specific arch_evsel__set_sample_weight() to set the new
sample type for powerpc.

Add arch specific arch_perf_parse_sample_weight() to store the
sample->weight values depending on the sample type applied.
if the new sample type (PERF_SAMPLE_WEIGHT_STRUCT) is applied,
store only the lower 32 bits to sample->weight. If sample type
is 'PERF_SAMPLE_WEIGHT', store the full 64-bit to sample->weight.

Signed-off-by: Athira Rajeev <atrajeev@linux.vnet.ibm.com>
---
 tools/perf/arch/powerpc/util/Build   |  2 ++
 tools/perf/arch/powerpc/util/event.c | 32 ++++++++++++++++++++++++++++++++
 tools/perf/arch/powerpc/util/evsel.c |  8 ++++++++
 3 files changed, 42 insertions(+)
 create mode 100644 tools/perf/arch/powerpc/util/event.c
 create mode 100644 tools/perf/arch/powerpc/util/evsel.c

diff --git a/tools/perf/arch/powerpc/util/Build b/tools/perf/arch/powerpc/util/Build
index b7945e5a543b..8a79c4126e5b 100644
--- a/tools/perf/arch/powerpc/util/Build
+++ b/tools/perf/arch/powerpc/util/Build
@@ -4,6 +4,8 @@ perf-y += kvm-stat.o
 perf-y += perf_regs.o
 perf-y += mem-events.o
 perf-y += sym-handling.o
+perf-y += evsel.o
+perf-y += event.o
 
 perf-$(CONFIG_DWARF) += dwarf-regs.o
 perf-$(CONFIG_DWARF) += skip-callchain-idx.o
diff --git a/tools/perf/arch/powerpc/util/event.c b/tools/perf/arch/powerpc/util/event.c
new file mode 100644
index 000000000000..f49d32c2c8ae
--- /dev/null
+++ b/tools/perf/arch/powerpc/util/event.c
@@ -0,0 +1,32 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/types.h>
+#include <linux/string.h>
+#include <linux/zalloc.h>
+
+#include "../../../util/event.h"
+#include "../../../util/synthetic-events.h"
+#include "../../../util/machine.h"
+#include "../../../util/tool.h"
+#include "../../../util/map.h"
+#include "../../../util/debug.h"
+
+void arch_perf_parse_sample_weight(struct perf_sample *data,
+				   const __u64 *array, u64 type)
+{
+	union perf_sample_weight weight;
+
+	weight.full = *array;
+	if (type & PERF_SAMPLE_WEIGHT)
+		data->weight = weight.full;
+	else
+		data->weight = weight.var1_dw;
+}
+
+void arch_perf_synthesize_sample_weight(const struct perf_sample *data,
+					__u64 *array, u64 type)
+{
+	*array = data->weight;
+
+	if (type & PERF_SAMPLE_WEIGHT_STRUCT)
+		*array &= 0xffffffff;
+}
diff --git a/tools/perf/arch/powerpc/util/evsel.c b/tools/perf/arch/powerpc/util/evsel.c
new file mode 100644
index 000000000000..2f733cdc8dbb
--- /dev/null
+++ b/tools/perf/arch/powerpc/util/evsel.c
@@ -0,0 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <stdio.h>
+#include "util/evsel.h"
+
+void arch_evsel__set_sample_weight(struct evsel *evsel)
+{
+	evsel__set_sample_bit(evsel, WEIGHT_STRUCT);
+}
-- 
1.8.3.1


^ permalink raw reply related

* [PATCH 2/4] tools/perf: Add dynamic headers for perf report columns
From: Athira Rajeev @ 2021-03-09 14:03 UTC (permalink / raw)
  To: linuxppc-dev, linux-kernel, linux-perf-users, mpe, acme, jolsa
  Cc: kan.liang, ravi.bangoria, peterz, maddy, kjain
In-Reply-To: <1615298640-1529-1-git-send-email-atrajeev@linux.vnet.ibm.com>

Currently the header string for different columns in perf report
is fixed. Some fields of perf sample could have different meaning
for different architectures than the meaning conveyed by the header
string. An example is the new field 'var2_w' of perf_sample_weight
structure. This is presently captured as 'Local INSTR Latency' in
perf mem report. But this could be used to denote a different latency
cycle in another architecture.

Introduce a weak function arch_perf_header_entry__add() to set
the arch specific header string for the fields which can contain dynamic
header. If the architecture do not have this function, fall back to the
default header string value.

Signed-off-by: Athira Rajeev <atrajeev@linux.vnet.ibm.com>
---
 tools/perf/util/event.h |  1 +
 tools/perf/util/sort.c  | 19 ++++++++++++++++++-
 2 files changed, 19 insertions(+), 1 deletion(-)

diff --git a/tools/perf/util/event.h b/tools/perf/util/event.h
index f603edbbbc6f..89b149e2e70a 100644
--- a/tools/perf/util/event.h
+++ b/tools/perf/util/event.h
@@ -427,5 +427,6 @@ void  cpu_map_data__synthesize(struct perf_record_cpu_map_data *data, struct per
 
 void arch_perf_parse_sample_weight(struct perf_sample *data, const __u64 *array, u64 type);
 void arch_perf_synthesize_sample_weight(const struct perf_sample *data, __u64 *array, u64 type);
+const char *arch_perf_header_entry__add(const char *se_header);
 
 #endif /* __PERF_RECORD_H */
diff --git a/tools/perf/util/sort.c b/tools/perf/util/sort.c
index 0d5ad42812b9..741a6df29fa0 100644
--- a/tools/perf/util/sort.c
+++ b/tools/perf/util/sort.c
@@ -25,6 +25,7 @@
 #include <traceevent/event-parse.h>
 #include "mem-events.h"
 #include "annotate.h"
+#include "event.h"
 #include "time-utils.h"
 #include "cgroup.h"
 #include "machine.h"
@@ -45,6 +46,7 @@
 regex_t		ignore_callees_regex;
 int		have_ignore_callees = 0;
 enum sort_mode	sort__mode = SORT_MODE__NORMAL;
+const char	*dynamic_headers[] = {"local_ins_lat"};
 
 /*
  * Replaces all occurrences of a char used with the:
@@ -1816,6 +1818,16 @@ struct sort_dimension {
 	int			taken;
 };
 
+const char * __weak arch_perf_header_entry__add(const char *se_header)
+{
+	return se_header;
+}
+
+static void sort_dimension_add_dynamic_header(struct sort_dimension *sd)
+{
+	sd->entry->se_header = arch_perf_header_entry__add(sd->entry->se_header);
+}
+
 #define DIM(d, n, func) [d] = { .name = n, .entry = &(func) }
 
 static struct sort_dimension common_sort_dimensions[] = {
@@ -2739,11 +2751,16 @@ int sort_dimension__add(struct perf_hpp_list *list, const char *tok,
 			struct evlist *evlist,
 			int level)
 {
-	unsigned int i;
+	unsigned int i, j;
 
 	for (i = 0; i < ARRAY_SIZE(common_sort_dimensions); i++) {
 		struct sort_dimension *sd = &common_sort_dimensions[i];
 
+		for (j = 0; j < ARRAY_SIZE(dynamic_headers); j++) {
+			if (!strcmp(dynamic_headers[j], sd->name))
+				sort_dimension_add_dynamic_header(sd);
+		}
+
 		if (strncasecmp(tok, sd->name, strlen(tok)))
 			continue;
 
-- 
1.8.3.1


^ permalink raw reply related

* [PATCH 1/4] powerpc/perf: Expose processor pipeline stage cycles using PERF_SAMPLE_WEIGHT_STRUCT
From: Athira Rajeev @ 2021-03-09 14:03 UTC (permalink / raw)
  To: linuxppc-dev, linux-kernel, linux-perf-users, mpe, acme, jolsa
  Cc: kan.liang, ravi.bangoria, peterz, maddy, kjain
In-Reply-To: <1615298640-1529-1-git-send-email-atrajeev@linux.vnet.ibm.com>

Performance Monitoring Unit (PMU) registers in powerpc provides
information on cycles elapsed between different stages in the
pipeline. This can be used for application tuning. On ISA v3.1
platform, this information is exposed by sampling registers.
Patch adds kernel support to capture two of the cycle counters
as part of perf sample using the sample type:
PERF_SAMPLE_WEIGHT_STRUCT.

The power PMU function 'get_mem_weight' currently uses 64 bit weight
field of perf_sample_data to capture memory latency. But following the
introduction of PERF_SAMPLE_WEIGHT_TYPE, weight field could contain
64-bit or 32-bit value depending on the architexture support for
PERF_SAMPLE_WEIGHT_STRUCT. Patches uses WEIGHT_STRUCT to expose the
pipeline stage cycles info. Hence update the ppmu functions to work for
64-bit and 32-bit weight values.

If the sample type is PERF_SAMPLE_WEIGHT, use the 64-bit weight field.
if the sample type is PERF_SAMPLE_WEIGHT_STRUCT, memory subsystem
latency is stored in the low 32bits of perf_sample_weight structure.
Also for CPU_FTR_ARCH_31, capture the two cycle counter information in
two 16 bit fields of perf_sample_weight structure.

Signed-off-by: Athira Rajeev <atrajeev@linux.vnet.ibm.com>
---
 arch/powerpc/include/asm/perf_event_server.h |  2 +-
 arch/powerpc/perf/core-book3s.c              |  4 ++--
 arch/powerpc/perf/isa207-common.c            | 29 +++++++++++++++++++++++++---
 arch/powerpc/perf/isa207-common.h            |  6 +++++-
 4 files changed, 34 insertions(+), 7 deletions(-)

diff --git a/arch/powerpc/include/asm/perf_event_server.h b/arch/powerpc/include/asm/perf_event_server.h
index 00e7e671bb4b..112cf092d7b3 100644
--- a/arch/powerpc/include/asm/perf_event_server.h
+++ b/arch/powerpc/include/asm/perf_event_server.h
@@ -43,7 +43,7 @@ struct power_pmu {
 				u64 alt[]);
 	void		(*get_mem_data_src)(union perf_mem_data_src *dsrc,
 				u32 flags, struct pt_regs *regs);
-	void		(*get_mem_weight)(u64 *weight);
+	void		(*get_mem_weight)(u64 *weight, u64 type);
 	unsigned long	group_constraint_mask;
 	unsigned long	group_constraint_val;
 	u64             (*bhrb_filter_map)(u64 branch_sample_type);
diff --git a/arch/powerpc/perf/core-book3s.c b/arch/powerpc/perf/core-book3s.c
index 6817331e22ff..57ff2494880c 100644
--- a/arch/powerpc/perf/core-book3s.c
+++ b/arch/powerpc/perf/core-book3s.c
@@ -2206,9 +2206,9 @@ static void record_and_restart(struct perf_event *event, unsigned long val,
 						ppmu->get_mem_data_src)
 			ppmu->get_mem_data_src(&data.data_src, ppmu->flags, regs);
 
-		if (event->attr.sample_type & PERF_SAMPLE_WEIGHT &&
+		if (event->attr.sample_type & PERF_SAMPLE_WEIGHT_TYPE &&
 						ppmu->get_mem_weight)
-			ppmu->get_mem_weight(&data.weight.full);
+			ppmu->get_mem_weight(&data.weight.full, event->attr.sample_type);
 
 		if (perf_event_overflow(event, &data, regs))
 			power_pmu_stop(event, 0);
diff --git a/arch/powerpc/perf/isa207-common.c b/arch/powerpc/perf/isa207-common.c
index e4f577da33d8..5dcbdbd54598 100644
--- a/arch/powerpc/perf/isa207-common.c
+++ b/arch/powerpc/perf/isa207-common.c
@@ -284,8 +284,10 @@ void isa207_get_mem_data_src(union perf_mem_data_src *dsrc, u32 flags,
 	}
 }
 
-void isa207_get_mem_weight(u64 *weight)
+void isa207_get_mem_weight(u64 *weight, u64 type)
 {
+	union perf_sample_weight *weight_fields;
+	u64 weight_lat;
 	u64 mmcra = mfspr(SPRN_MMCRA);
 	u64 exp = MMCRA_THR_CTR_EXP(mmcra);
 	u64 mantissa = MMCRA_THR_CTR_MANT(mmcra);
@@ -296,9 +298,30 @@ void isa207_get_mem_weight(u64 *weight)
 		mantissa = P10_MMCRA_THR_CTR_MANT(mmcra);
 
 	if (val == 0 || val == 7)
-		*weight = 0;
+		weight_lat = 0;
 	else
-		*weight = mantissa << (2 * exp);
+		weight_lat = mantissa << (2 * exp);
+
+	/*
+	 * Use 64 bit weight field (full) if sample type is
+	 * WEIGHT.
+	 *
+	 * if sample type is WEIGHT_STRUCT:
+	 * - store memory latency in the lower 32 bits.
+	 * - For ISA v3.1, use remaining two 16 bit fields of
+	 *   perf_sample_weight to store cycle counter values
+	 *   from sier2.
+	 */
+	weight_fields = (union perf_sample_weight *)weight;
+	if (type & PERF_SAMPLE_WEIGHT)
+		weight_fields->full = weight_lat;
+	else {
+		weight_fields->var1_dw = (u32)weight_lat;
+		if (cpu_has_feature(CPU_FTR_ARCH_31)) {
+			weight_fields->var2_w = P10_SIER2_FINISH_CYC(mfspr(SPRN_SIER2));
+			weight_fields->var3_w = P10_SIER2_DISPATCH_CYC(mfspr(SPRN_SIER2));
+		}
+	}
 }
 
 int isa207_get_constraint(u64 event, unsigned long *maskp, unsigned long *valp, u64 event_config1)
diff --git a/arch/powerpc/perf/isa207-common.h b/arch/powerpc/perf/isa207-common.h
index 1af0e8c97ac7..fc30d43c4d0c 100644
--- a/arch/powerpc/perf/isa207-common.h
+++ b/arch/powerpc/perf/isa207-common.h
@@ -265,6 +265,10 @@
 #define ISA207_SIER_DATA_SRC_SHIFT	53
 #define ISA207_SIER_DATA_SRC_MASK	(0x7ull << ISA207_SIER_DATA_SRC_SHIFT)
 
+/* Bits in SIER2/SIER3 for Power10 */
+#define P10_SIER2_FINISH_CYC(sier2)	(((sier2) >> (63 - 37)) & 0x7fful)
+#define P10_SIER2_DISPATCH_CYC(sier2)	(((sier2) >> (63 - 13)) & 0x7fful)
+
 #define P(a, b)				PERF_MEM_S(a, b)
 #define PH(a, b)			(P(LVL, HIT) | P(a, b))
 #define PM(a, b)			(P(LVL, MISS) | P(a, b))
@@ -278,6 +282,6 @@ int isa207_get_alternatives(u64 event, u64 alt[], int size, unsigned int flags,
 					const unsigned int ev_alt[][MAX_ALT]);
 void isa207_get_mem_data_src(union perf_mem_data_src *dsrc, u32 flags,
 							struct pt_regs *regs);
-void isa207_get_mem_weight(u64 *weight);
+void isa207_get_mem_weight(u64 *weight, u64 type);
 
 #endif
-- 
1.8.3.1


^ permalink raw reply related

* [PATCH 0/4] powerpc/perf: Export processor pipeline stage cycles information
From: Athira Rajeev @ 2021-03-09 14:03 UTC (permalink / raw)
  To: linuxppc-dev, linux-kernel, linux-perf-users, mpe, acme, jolsa
  Cc: kan.liang, ravi.bangoria, peterz, maddy, kjain

Performance Monitoring Unit (PMU) registers in powerpc exports
number of cycles elapsed between different stages in the pipeline.
Example, sampling registers in ISA v3.1.

This patchset implements kernel and perf tools support to expose
these pipeline stage cycles using the sample type PERF_SAMPLE_WEIGHT_TYPE.

Patch 1/4 adds kernel side support to store the cycle counter
values as part of 'var2_w' and 'var3_w' fields of perf_sample_weight
structure.

Patch 2/4 adds support to make the perf report column header
strings as dynamic.
Patch 3/4 adds powerpc support in perf tools for PERF_SAMPLE_WEIGHT_STRUCT
in sample type: PERF_SAMPLE_WEIGHT_TYPE.
Patch 4/4 adds support to present pipeline stage cycles as part of
mem-mode.

Sample output on powerpc:

# perf mem record ls
# perf mem report

# To display the perf.data header info, please use --header/--header-only options.
#
#
# Total Lost Samples: 0
#
# Samples: 11  of event 'cpu/mem-loads/'
# Total weight : 1332
# Sort order   : local_weight,mem,sym,dso,symbol_daddr,dso_daddr,snoop,tlb,locked,blocked,local_ins_lat,stall_cyc
#
# Overhead       Samples  Local Weight  Memory access             Symbol                              Shared Object     Data Symbol                                    Data Object            Snoop         TLB access              Locked  Blocked     Finish Cyc     Dispatch Cyc 
# ........  ............  ............  ........................  ..................................  ................  .............................................  .....................  ............  ......................  ......  ..........  .............  .............
#
    44.14%             1  588           L1 hit                    [k] rcu_nmi_exit                    [kernel.vmlinux]  [k] 0xc0000007ffdd21b0                         [unknown]              N/A           N/A                     No       N/A        7              5            
    22.22%             1  296           L1 hit                    [k] copypage_power7                 [kernel.vmlinux]  [k] 0xc0000000ff6a1780                         [unknown]              N/A           N/A                     No       N/A        293            3            
     6.98%             1  93            L1 hit                    [.] _dl_addr                        libc-2.31.so      [.] 0x00007fff86fa5058                         libc-2.31.so           N/A           N/A                     No       N/A        7              1            
     6.61%             1  88            L2 hit                    [.] new_do_write                    libc-2.31.so      [.] _IO_2_1_stdout_+0x0                        libc-2.31.so           N/A           N/A                     No       N/A        84             1            
     5.93%             1  79            L1 hit                    [k] printk_nmi_exit                 [kernel.vmlinux]  [k] 0xc0000006085df6b0                         [unknown]              N/A           N/A                     No       N/A        7              1            
     4.05%             1  54            L2 hit                    [.] __alloc_dir                     libc-2.31.so      [.] 0x00007fffdb70a640                         [stack]                N/A           N/A                     No       N/A        18             1            
     3.60%             1  48            L1 hit                    [.] _init                           ls                [.] 0x000000016ca82118                         [heap]                 N/A           N/A                     No       N/A        7              6            
     2.40%             1  32            L1 hit                    [k] desc_read                       [kernel.vmlinux]  [k] _printk_rb_static_descs+0x1ea10            [kernel.vmlinux].data  N/A           N/A                     No       N/A        7              1            
     1.65%             1  22            L2 hit                    [k] perf_iterate_ctx.constprop.139  [kernel.vmlinux]  [k] 0xc00000064d79e8a8                         [unknown]              N/A           N/A                     No       N/A        16             1            
     1.58%             1  21            L1 hit                    [k] perf_event_interrupt            [kernel.vmlinux]  [k] 0xc0000006085df6b0                         [unknown]              N/A           N/A                     No       N/A        7              1            
     0.83%             1  11            L1 hit                    [k] perf_event_exec                 [kernel.vmlinux]  [k] 0xc0000007ffdd3288                         [unknown]              N/A           N/A                     No       N/A        7              4            


Athira Rajeev (4):
  powerpc/perf: Expose processor pipeline stage cycles using
    PERF_SAMPLE_WEIGHT_STRUCT
  tools/perf: Add dynamic headers for perf report columns
  tools/perf: Add powerpc support for PERF_SAMPLE_WEIGHT_STRUCT
  tools/perf: Support pipeline stage cycles for powerpc

 arch/powerpc/include/asm/perf_event_server.h |  2 +-
 arch/powerpc/perf/core-book3s.c              |  4 +--
 arch/powerpc/perf/isa207-common.c            | 29 ++++++++++++++++--
 arch/powerpc/perf/isa207-common.h            |  6 +++-
 tools/perf/Documentation/perf-report.txt     |  1 +
 tools/perf/arch/powerpc/util/Build           |  2 ++
 tools/perf/arch/powerpc/util/event.c         | 46 ++++++++++++++++++++++++++++
 tools/perf/arch/powerpc/util/evsel.c         |  8 +++++
 tools/perf/util/event.h                      |  2 ++
 tools/perf/util/hist.c                       | 11 +++++--
 tools/perf/util/hist.h                       |  1 +
 tools/perf/util/session.c                    |  4 ++-
 tools/perf/util/sort.c                       | 41 +++++++++++++++++++++++--
 tools/perf/util/sort.h                       |  2 ++
 14 files changed, 146 insertions(+), 13 deletions(-)
 create mode 100644 tools/perf/arch/powerpc/util/event.c
 create mode 100644 tools/perf/arch/powerpc/util/evsel.c

-- 
1.8.3.1


^ permalink raw reply

* Re: [PATCH v2 8/8] powerpc/xive: Map one IPI interrupt per node
From: Greg Kurz @ 2021-03-09 13:23 UTC (permalink / raw)
  To: Cédric Le Goater; +Cc: linuxppc-dev
In-Reply-To: <20210303174857.1760393-9-clg@kaod.org>

On Wed, 3 Mar 2021 18:48:57 +0100
Cédric Le Goater <clg@kaod.org> wrote:

> ipistorm [*] can be used to benchmark the raw interrupt rate of an
> interrupt controller by measuring the number of IPIs a system can
> sustain. When applied to the XIVE interrupt controller of POWER9 and
> POWER10 systems, a significant drop of the interrupt rate can be
> observed when crossing the second node boundary.
> 
> This is due to the fact that a single IPI interrupt is used for all
> CPUs of the system. The structure is shared and the cache line updates
> impact greatly the traffic between nodes and the overall IPI
> performance.
> 
> As a workaround, the impact can be reduced by deactivating the IRQ
> lockup detector ("noirqdebug") which does a lot of accounting in the
> Linux IRQ descriptor structure and is responsible for most of the
> performance penalty.
> 
> As a fix, this proposal allocates an IPI interrupt per node, to be
> shared by all CPUs of that node. It solves the scaling issue, the IRQ
> lockup detector still has an impact but the XIVE interrupt rate scales
> linearly. It also improves the "noirqdebug" case as showed in the
> tables below.
> 
>  * P9 DD2.2 - 2s * 64 threads
> 
>                                                "noirqdebug"
>                         Mint/s                    Mint/s
>  chips  cpus      IPI/sys   IPI/chip       IPI/chip    IPI/sys
>  --------------------------------------------------------------
>  1      0-15     4.984023   4.875405       4.996536   5.048892
>         0-31    10.879164  10.544040      10.757632  11.037859
>         0-47    15.345301  14.688764      14.926520  15.310053
>         0-63    17.064907  17.066812      17.613416  17.874511
>  2      0-79    11.768764  21.650749      22.689120  22.566508
>         0-95    10.616812  26.878789      28.434703  28.320324
>         0-111   10.151693  31.397803      31.771773  32.388122
>         0-127    9.948502  33.139336      34.875716  35.224548
> 
>  * P10 DD1 - 4s (not homogeneous) 352 threads
> 
>                                                "noirqdebug"
>                         Mint/s                    Mint/s
>  chips  cpus      IPI/sys   IPI/chip       IPI/chip    IPI/sys
>  --------------------------------------------------------------
>  1      0-15     2.409402   2.364108       2.383303   2.395091
>         0-31     6.028325   6.046075       6.089999   6.073750
>         0-47     8.655178   8.644531       8.712830   8.724702
>         0-63    11.629652  11.735953      12.088203  12.055979
>         0-79    14.392321  14.729959      14.986701  14.973073
>         0-95    12.604158  13.004034      17.528748  17.568095
>  2      0-111    9.767753  13.719831      19.968606  20.024218
>         0-127    6.744566  16.418854      22.898066  22.995110
>         0-143    6.005699  19.174421      25.425622  25.417541
>         0-159    5.649719  21.938836      27.952662  28.059603
>         0-175    5.441410  24.109484      31.133915  31.127996
>  3      0-191    5.318341  24.405322      33.999221  33.775354
>         0-207    5.191382  26.449769      36.050161  35.867307
>         0-223    5.102790  29.356943      39.544135  39.508169
>         0-239    5.035295  31.933051      42.135075  42.071975
>         0-255    4.969209  34.477367      44.655395  44.757074
>  4      0-271    4.907652  35.887016      47.080545  47.318537
>         0-287    4.839581  38.076137      50.464307  50.636219
>         0-303    4.786031  40.881319      53.478684  53.310759
>         0-319    4.743750  43.448424      56.388102  55.973969
>         0-335    4.709936  45.623532      59.400930  58.926857
>         0-351    4.681413  45.646151      62.035804  61.830057
> 
> [*] https://github.com/antonblanchard/ipistorm
> 
> Signed-off-by: Cédric Le Goater <clg@kaod.org>
> ---
>  arch/powerpc/sysdev/xive/xive-internal.h |  2 --
>  arch/powerpc/sysdev/xive/common.c        | 39 ++++++++++++++++++------
>  2 files changed, 30 insertions(+), 11 deletions(-)
> 
> diff --git a/arch/powerpc/sysdev/xive/xive-internal.h b/arch/powerpc/sysdev/xive/xive-internal.h
> index 9cf57c722faa..b3a456fdd3a5 100644
> --- a/arch/powerpc/sysdev/xive/xive-internal.h
> +++ b/arch/powerpc/sysdev/xive/xive-internal.h
> @@ -5,8 +5,6 @@
>  #ifndef __XIVE_INTERNAL_H
>  #define __XIVE_INTERNAL_H
>  
> -#define XIVE_IPI_HW_IRQ		0 /* interrupt source # for IPIs */
> -
>  /*
>   * A "disabled" interrupt should never fire, to catch problems
>   * we set its logical number to this
> diff --git a/arch/powerpc/sysdev/xive/common.c b/arch/powerpc/sysdev/xive/common.c
> index 8eefd152b947..c27f7bb0494b 100644
> --- a/arch/powerpc/sysdev/xive/common.c
> +++ b/arch/powerpc/sysdev/xive/common.c
> @@ -65,8 +65,16 @@ static struct irq_domain *xive_irq_domain;
>  #ifdef CONFIG_SMP
>  static struct irq_domain *xive_ipi_irq_domain;
>  
> -/* The IPIs all use the same logical irq number */
> -static u32 xive_ipi_irq;
> +/* The IPIs use the same logical irq number when on the same chip */
> +static struct xive_ipi_desc {
> +	unsigned int irq;
> +	char name[8]; /* enough bytes to fit IPI-XXX */

So this assumes that the node number that node is <= 999 ? This
is certainly the case for now since CONFIG_NODES_SHIFT is 8
on ppc64 but starting with 10, you'd have truncated names.
What about deriving the size of name[] from CONFIG_NODES_SHIFT ?

Apart from that, LGTM. Probably not worth to respin just for
this.

I also could give a try in a KVM guest.

Topology passed to QEMU:

  -smp 8,maxcpus=8,cores=2,threads=2,sockets=2 \
  -numa node,nodeid=0,cpus=0-4 \
  -numa node,nodeid=1,cpus=4-7

Topology observed in guest with lstopo :

  Package L#0
    NUMANode L#0 (P#0 30GB)
    L1d L#0 (32KB) + L1i L#0 (32KB) + Core L#0
      PU L#0 (P#0)
      PU L#1 (P#1)
    L1d L#1 (32KB) + L1i L#1 (32KB) + Core L#1
      PU L#2 (P#2)
      PU L#3 (P#3)
  Package L#1
    NUMANode L#1 (P#1 32GB)
    L1d L#2 (32KB) + L1i L#2 (32KB) + Core L#2
      PU L#4 (P#4)
      PU L#5 (P#5)
    L1d L#3 (32KB) + L1i L#3 (32KB) + Core L#3
      PU L#6 (P#6)
      PU L#7 (P#7)

Interrupts in guest:

$ cat /proc/interrupts 
           CPU0       CPU1       CPU2       CPU3       CPU4       CPU5       CPU6       CPU7       
 16:       1023        871       1042        749          0          0          0          0  XIVE-IPI   0 Edge      IPI-0
 17:          0          0          0          0       2123       1019       1263       1288  XIVE-IPI   1 Edge      IPI-1

IPIs are mapped to the appropriate nodes, and the numbers indicate
that everything is working as expected.

Reviewed-and-tested-by: Greg Kurz <groug@kaod.org>

> +} *xive_ipis;
> +
> +static unsigned int xive_ipi_cpu_to_irq(unsigned int cpu)
> +{
> +	return xive_ipis[cpu_to_node(cpu)].irq;
> +}
>  #endif
>  
>  /* Xive state for each CPU */
> @@ -1106,25 +1114,36 @@ static const struct irq_domain_ops xive_ipi_irq_domain_ops = {
>  
>  static void __init xive_request_ipi(void)
>  {
> -	unsigned int virq;
> +	unsigned int node;
>  
> -	xive_ipi_irq_domain = irq_domain_add_linear(NULL, 1,
> +	xive_ipi_irq_domain = irq_domain_add_linear(NULL, nr_node_ids,
>  						    &xive_ipi_irq_domain_ops, NULL);
>  	if (WARN_ON(xive_ipi_irq_domain == NULL))
>  		return;
>  
> -	/* Initialize it */
> -	virq = irq_create_mapping(xive_ipi_irq_domain, XIVE_IPI_HW_IRQ);
> -	xive_ipi_irq = virq;
> +	xive_ipis = kcalloc(nr_node_ids, sizeof(*xive_ipis), GFP_KERNEL | __GFP_NOFAIL);
> +	for_each_node(node) {
> +		struct xive_ipi_desc *xid = &xive_ipis[node];
> +		irq_hw_number_t node_ipi_hwirq = node;
> +
> +		/*
> +		 * Map one IPI interrupt per node for all cpus of that node.
> +		 * Since the HW interrupt number doesn't have any meaning,
> +		 * simply use the node number.
> +		 */
> +		xid->irq = irq_create_mapping(xive_ipi_irq_domain, node_ipi_hwirq);
> +		snprintf(xid->name, sizeof(xid->name), "IPI-%d", node);
>  
> -	WARN_ON(request_irq(virq, xive_muxed_ipi_action,
> -			    IRQF_PERCPU | IRQF_NO_THREAD, "IPI", NULL));
> +		WARN_ON(request_irq(xid->irq, xive_muxed_ipi_action,
> +				    IRQF_PERCPU | IRQF_NO_THREAD, xid->name, NULL));
> +	}
>  }
>  
>  static int xive_setup_cpu_ipi(unsigned int cpu)
>  {
>  	struct xive_cpu *xc;
>  	int rc;
> +	unsigned int xive_ipi_irq = xive_ipi_cpu_to_irq(cpu);
>  
>  	pr_debug("Setting up IPI for CPU %d\n", cpu);
>  
> @@ -1165,6 +1184,8 @@ static int xive_setup_cpu_ipi(unsigned int cpu)
>  
>  static void xive_cleanup_cpu_ipi(unsigned int cpu, struct xive_cpu *xc)
>  {
> +	unsigned int xive_ipi_irq = xive_ipi_cpu_to_irq(cpu);
> +
>  	/* Disable the IPI and free the IRQ data */
>  
>  	/* Already cleaned up ? */


^ permalink raw reply

* Re: [PATCH v4] powerpc/uprobes: Validation for prefixed instruction
From: Ravi Bangoria @ 2021-03-09 12:58 UTC (permalink / raw)
  To: Naveen N. Rao, Michael Ellerman
  Cc: Ravi Bangoria, jniethe5, oleg, rostedt, linux-kernel, paulus,
	sandipan, naveen.n.rao, linuxppc-dev
In-Reply-To: <20210309112115.GG145@DESKTOP-TDPLP67.localdomain>



On 3/9/21 4:51 PM, Naveen N. Rao wrote:
> On 2021/03/09 08:54PM, Michael Ellerman wrote:
>> Ravi Bangoria <ravi.bangoria@linux.ibm.com> writes:
>>> As per ISA 3.1, prefixed instruction should not cross 64-byte
>>> boundary. So don't allow Uprobe on such prefixed instruction.
>>>
>>> There are two ways probed instruction is changed in mapped pages.
>>> First, when Uprobe is activated, it searches for all the relevant
>>> pages and replace instruction in them. In this case, if that probe
>>> is on the 64-byte unaligned prefixed instruction, error out
>>> directly. Second, when Uprobe is already active and user maps a
>>> relevant page via mmap(), instruction is replaced via mmap() code
>>> path. But because Uprobe is invalid, entire mmap() operation can
>>> not be stopped. In this case just print an error and continue.
>>>
>>> Signed-off-by: Ravi Bangoria <ravi.bangoria@linux.ibm.com>
>>> Acked-by: Naveen N. Rao <naveen.n.rao@linux.vnet.ibm.com>
>>
>> Do we have a Fixes: tag for this?
> 
> Since this is an additional check we are adding, I don't think we should
> add a Fixes: tag. Nothing is broken per-se -- we're just adding more
> checks to catch simple mistakes. Also, like Oleg pointed out, there are
> still many other ways for users to shoot themselves in the foot with
> uprobes and prefixed instructions, if they so desire.
> 
> However, if you still think we should add a Fixes: tag, we can perhaps
> use the below commit since I didn't see any specific commit adding
> support for prefixed instructions for uprobes:
> 
> Fixes: 650b55b707fdfa ("powerpc: Add prefixed instructions to
> instruction data type")

True. IMO, It doesn't really need any Fixes tag.

> 
>>
>>> ---
>>> v3: https://lore.kernel.org/r/20210304050529.59391-1-ravi.bangoria@linux.ibm.com
>>> v3->v4:
>>>    - CONFIG_PPC64 check was not required, remove it.
>>>    - Use SZ_ macros instead of hardcoded numbers.
>>>
>>>   arch/powerpc/kernel/uprobes.c | 7 +++++++
>>>   1 file changed, 7 insertions(+)
>>>
>>> diff --git a/arch/powerpc/kernel/uprobes.c b/arch/powerpc/kernel/uprobes.c
>>> index e8a63713e655..4cbfff6e94a3 100644
>>> --- a/arch/powerpc/kernel/uprobes.c
>>> +++ b/arch/powerpc/kernel/uprobes.c
>>> @@ -41,6 +41,13 @@ int arch_uprobe_analyze_insn(struct arch_uprobe *auprobe,
>>>   	if (addr & 0x03)
>>>   		return -EINVAL;
>>>   
>>> +	if (cpu_has_feature(CPU_FTR_ARCH_31) &&
>>> +	    ppc_inst_prefixed(auprobe->insn) &&
>>> +	    (addr & (SZ_64 - 4)) == SZ_64 - 4) {
>>> +		pr_info_ratelimited("Cannot register a uprobe on 64 byte unaligned prefixed instruction\n");
>>> +		return -EINVAL;
>>
>> I realise we already did the 0x03 check above, but I still think this
>> would be clearer simply as:
>>
>> 	    (addr & 0x3f == 60)
> 
> Indeed, I like the use of `60' there -- hex is overrated ;)

Sure. Will resend.

Ravi

^ permalink raw reply

* [PATCH v2 43/43] powerpc/32: Manage KUAP in C
From: Christophe Leroy @ 2021-03-09 12:10 UTC (permalink / raw)
  To: Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman, npiggin
  Cc: linuxppc-dev, linux-kernel
In-Reply-To: <cover.1615291471.git.christophe.leroy@csgroup.eu>

Move all KUAP management in C.

Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
---
 arch/powerpc/include/asm/book3s/32/kup.h     | 50 +-------------------
 arch/powerpc/include/asm/interrupt.h         |  2 +
 arch/powerpc/include/asm/kup.h               |  9 ----
 arch/powerpc/include/asm/nohash/32/kup-8xx.h | 25 +---------
 arch/powerpc/kernel/entry_32.S               |  6 ---
 arch/powerpc/kernel/interrupt.c              | 19 ++------
 arch/powerpc/kernel/process.c                |  3 ++
 7 files changed, 11 insertions(+), 103 deletions(-)

diff --git a/arch/powerpc/include/asm/book3s/32/kup.h b/arch/powerpc/include/asm/book3s/32/kup.h
index c9d6c28bcd10..27991e0d2cf9 100644
--- a/arch/powerpc/include/asm/book3s/32/kup.h
+++ b/arch/powerpc/include/asm/book3s/32/kup.h
@@ -5,55 +5,7 @@
 #include <asm/bug.h>
 #include <asm/book3s/32/mmu-hash.h>
 
-#ifdef __ASSEMBLY__
-
-#ifdef CONFIG_PPC_KUAP
-
-.macro kuap_update_sr	gpr1, gpr2, gpr3	/* NEVER use r0 as gpr2 due to addis */
-101:	mtsrin	\gpr1, \gpr2
-	addi	\gpr1, \gpr1, 0x111		/* next VSID */
-	rlwinm	\gpr1, \gpr1, 0, 0xf0ffffff	/* clear VSID overflow */
-	addis	\gpr2, \gpr2, 0x1000		/* address of next segment */
-	cmplw	\gpr2, \gpr3
-	blt-	101b
-	isync
-.endm
-
-.macro kuap_save_and_lock	sp, thread, gpr1, gpr2, gpr3
-	lwz	\gpr2, KUAP(\thread)
-	rlwinm.	\gpr3, \gpr2, 28, 0xf0000000
-	stw	\gpr2, STACK_REGS_KUAP(\sp)
-	beq+	102f
-	li	\gpr1, 0
-	stw	\gpr1, KUAP(\thread)
-	mfsrin	\gpr1, \gpr2
-	oris	\gpr1, \gpr1, SR_KS@h	/* set Ks */
-	kuap_update_sr	\gpr1, \gpr2, \gpr3
-102:
-.endm
-
-.macro kuap_restore	sp, current, gpr1, gpr2, gpr3
-	lwz	\gpr2, STACK_REGS_KUAP(\sp)
-	rlwinm.	\gpr3, \gpr2, 28, 0xf0000000
-	stw	\gpr2, THREAD + KUAP(\current)
-	beq+	102f
-	mfsrin	\gpr1, \gpr2
-	rlwinm	\gpr1, \gpr1, 0, ~SR_KS	/* Clear Ks */
-	kuap_update_sr	\gpr1, \gpr2, \gpr3
-102:
-.endm
-
-.macro kuap_check	current, gpr
-#ifdef CONFIG_PPC_KUAP_DEBUG
-	lwz	\gpr, THREAD + KUAP(\current)
-999:	twnei	\gpr, 0
-	EMIT_BUG_ENTRY 999b, __FILE__, __LINE__, (BUGFLAG_WARNING | BUGFLAG_ONCE)
-#endif
-.endm
-
-#endif /* CONFIG_PPC_KUAP */
-
-#else /* !__ASSEMBLY__ */
+#ifndef __ASSEMBLY__
 
 #ifdef CONFIG_PPC_KUAP
 
diff --git a/arch/powerpc/include/asm/interrupt.h b/arch/powerpc/include/asm/interrupt.h
index d4bfe94b4a68..b41cb4e014b2 100644
--- a/arch/powerpc/include/asm/interrupt.h
+++ b/arch/powerpc/include/asm/interrupt.h
@@ -37,6 +37,8 @@ static inline void interrupt_enter_prepare(struct pt_regs *regs, struct interrup
 		kuep_lock();
 		current->thread.regs = regs;
 		account_cpu_user_entry();
+	} else {
+		kuap_save_and_lock(regs);
 	}
 #endif
 	/*
diff --git a/arch/powerpc/include/asm/kup.h b/arch/powerpc/include/asm/kup.h
index b7efa46b3109..5bbe8f28d26b 100644
--- a/arch/powerpc/include/asm/kup.h
+++ b/arch/powerpc/include/asm/kup.h
@@ -28,15 +28,6 @@
 
 #ifdef __ASSEMBLY__
 #ifndef CONFIG_PPC_KUAP
-.macro kuap_save_and_lock	sp, thread, gpr1, gpr2, gpr3
-.endm
-
-.macro kuap_restore	sp, current, gpr1, gpr2, gpr3
-.endm
-
-.macro kuap_check	current, gpr
-.endm
-
 .macro kuap_check_amr	gpr1, gpr2
 .endm
 
diff --git a/arch/powerpc/include/asm/nohash/32/kup-8xx.h b/arch/powerpc/include/asm/nohash/32/kup-8xx.h
index c74f5704bc47..fb294dbca102 100644
--- a/arch/powerpc/include/asm/nohash/32/kup-8xx.h
+++ b/arch/powerpc/include/asm/nohash/32/kup-8xx.h
@@ -7,30 +7,7 @@
 
 #ifdef CONFIG_PPC_KUAP
 
-#ifdef __ASSEMBLY__
-
-.macro kuap_save_and_lock	sp, thread, gpr1, gpr2, gpr3
-	lis	\gpr2, MD_APG_KUAP@h	/* only APG0 and APG1 are used */
-	mfspr	\gpr1, SPRN_MD_AP
-	mtspr	SPRN_MD_AP, \gpr2
-	stw	\gpr1, STACK_REGS_KUAP(\sp)
-.endm
-
-.macro kuap_restore	sp, current, gpr1, gpr2, gpr3
-	lwz	\gpr1, STACK_REGS_KUAP(\sp)
-	mtspr	SPRN_MD_AP, \gpr1
-.endm
-
-.macro kuap_check	current, gpr
-#ifdef CONFIG_PPC_KUAP_DEBUG
-	mfspr	\gpr, SPRN_MD_AP
-	rlwinm	\gpr, \gpr, 16, 0xffff
-999:	twnei	\gpr, MD_APG_KUAP@h
-	EMIT_BUG_ENTRY 999b, __FILE__, __LINE__, (BUGFLAG_WARNING | BUGFLAG_ONCE)
-#endif
-.endm
-
-#else /* !__ASSEMBLY__ */
+#ifndef __ASSEMBLY__
 
 #include <asm/reg.h>
 
diff --git a/arch/powerpc/kernel/entry_32.S b/arch/powerpc/kernel/entry_32.S
index 850cb17a937f..f5ac021ff9ed 100644
--- a/arch/powerpc/kernel/entry_32.S
+++ b/arch/powerpc/kernel/entry_32.S
@@ -52,11 +52,9 @@
 	.globl	prepare_transfer_to_handler
 prepare_transfer_to_handler:
 	andi.	r0,r9,MSR_PR
-	addi	r12, r2, THREAD
 	bnelr
 
 	/* if from kernel, check interrupted DOZE/NAP mode */
-	kuap_save_and_lock r11, r12, r9, r5, r6
 	lwz	r12,TI_LOCAL_FLAGS(r2)
 	mtcrf	0x01,r12
 	bt-	31-TLF_NAPPING,4f
@@ -96,7 +94,6 @@ ret_from_syscall:
 	cmplwi	cr0,r5,0
 	bne-	2f
 #endif /* CONFIG_PPC_47x */
-	kuap_check r2, r4
 	lwz	r4,_LINK(r1)
 	lwz	r5,_CCR(r1)
 	mtlr	r4
@@ -208,7 +205,6 @@ END_FTR_SECTION_IFSET(CPU_FTR_SPE)
 	stw	r10,_CCR(r1)
 	stw	r1,KSP(r3)	/* Set old stack pointer */
 
-	kuap_check r2, r0
 #ifdef CONFIG_SMP
 	/* We need a sync somewhere here to make sure that if the
 	 * previous task gets rescheduled on another CPU, it sees all
@@ -276,7 +272,6 @@ interrupt_return:
 	bne-	.Lrestore_nvgprs
 
 .Lfast_user_interrupt_return:
-	kuap_check r2, r4
 	lwz	r11,_NIP(r1)
 	lwz	r12,_MSR(r1)
 	mtspr	SPRN_SRR0,r11
@@ -326,7 +321,6 @@ ALT_FTR_SECTION_END_IFCLR(CPU_FTR_STCX_CHECKS_ADDRESS)
 
 .Lfast_kernel_interrupt_return:
 	cmpwi	cr1,r3,0
-	kuap_restore r1, r2, r3, r4, r5
 	lwz	r11,_NIP(r1)
 	lwz	r12,_MSR(r1)
 	mtspr	SPRN_SRR0,r11
diff --git a/arch/powerpc/kernel/interrupt.c b/arch/powerpc/kernel/interrupt.c
index 40ed55064e54..864cf47b629a 100644
--- a/arch/powerpc/kernel/interrupt.c
+++ b/arch/powerpc/kernel/interrupt.c
@@ -34,6 +34,9 @@ notrace long system_call_exception(long r3, long r4, long r5,
 	syscall_fn f;
 
 	kuep_lock();
+#ifdef CONFIG_PPC32
+	kuap_save_and_lock(regs);
+#endif
 
 	regs->orig_gpr3 = r3;
 
@@ -75,9 +78,7 @@ notrace long system_call_exception(long r3, long r4, long r5,
 			isync();
 	} else
 #endif
-#ifdef CONFIG_PPC64
 		kuap_check();
-#endif
 
 	booke_restore_dbcr0();
 
@@ -253,9 +254,7 @@ notrace unsigned long syscall_exit_prepare(unsigned long r3,
 
 	CT_WARN_ON(ct_state() == CONTEXT_USER);
 
-#ifdef CONFIG_PPC64
 	kuap_check();
-#endif
 
 	regs->result = r3;
 
@@ -350,7 +349,7 @@ notrace unsigned long syscall_exit_prepare(unsigned long r3,
 
 	account_cpu_user_exit();
 
-#ifdef CONFIG_PPC_BOOK3S_64 /* BOOK3E and ppc32 not using this */
+#ifndef CONFIG_PPC_BOOK3E_64 /* BOOK3E not using this */
 	/*
 	 * We do this at the end so that we do context switch with KERNEL AMR
 	 */
@@ -379,9 +378,7 @@ notrace unsigned long interrupt_exit_user_prepare(struct pt_regs *regs, unsigned
 	 * We don't need to restore AMR on the way back to userspace for KUAP.
 	 * AMR can only have been unlocked if we interrupted the kernel.
 	 */
-#ifdef CONFIG_PPC64
 	kuap_check();
-#endif
 
 	local_irq_save(flags);
 
@@ -438,9 +435,7 @@ notrace unsigned long interrupt_exit_user_prepare(struct pt_regs *regs, unsigned
 	/*
 	 * We do this at the end so that we do context switch with KERNEL AMR
 	 */
-#ifdef CONFIG_PPC64
 	kuap_user_restore(regs);
-#endif
 	return ret;
 }
 
@@ -450,9 +445,7 @@ notrace unsigned long interrupt_exit_kernel_prepare(struct pt_regs *regs, unsign
 {
 	unsigned long flags;
 	unsigned long ret = 0;
-#ifdef CONFIG_PPC64
 	unsigned long kuap;
-#endif
 
 	if (!IS_ENABLED(CONFIG_BOOKE) && !IS_ENABLED(CONFIG_40x) &&
 	    unlikely(!(regs->msr & MSR_RI)))
@@ -466,9 +459,7 @@ notrace unsigned long interrupt_exit_kernel_prepare(struct pt_regs *regs, unsign
 	if (TRAP(regs) != 0x700)
 		CT_WARN_ON(ct_state() == CONTEXT_USER);
 
-#ifdef CONFIG_PPC64
 	kuap = kuap_get_and_check();
-#endif
 
 	if (unlikely(current_thread_info()->flags & _TIF_EMULATE_STACK_STORE)) {
 		clear_bits(_TIF_EMULATE_STACK_STORE, &current_thread_info()->flags);
@@ -510,9 +501,7 @@ notrace unsigned long interrupt_exit_kernel_prepare(struct pt_regs *regs, unsign
 	 * which would cause Read-After-Write stalls. Hence, we take the AMR
 	 * value from the check above.
 	 */
-#ifdef CONFIG_PPC64
 	kuap_kernel_restore(regs, kuap);
-#endif
 
 	return ret;
 }
diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c
index 5d5d64be2679..fd4da71d92d1 100644
--- a/arch/powerpc/kernel/process.c
+++ b/arch/powerpc/kernel/process.c
@@ -1255,6 +1255,9 @@ struct task_struct *__switch_to(struct task_struct *prev,
 	 */
 	restore_sprs(old_thread, new_thread);
 
+#ifdef CONFIG_PPC32
+	kuap_check();
+#endif
 	last = _switch(old_thread, new_thread);
 
 #ifdef CONFIG_PPC_BOOK3S_64
-- 
2.25.0


^ permalink raw reply related

* [PATCH v2 41/43] powerpc/32s: Create C version of kuap save/restore/check helpers
From: Christophe Leroy @ 2021-03-09 12:10 UTC (permalink / raw)
  To: Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman, npiggin
  Cc: linuxppc-dev, linux-kernel
In-Reply-To: <cover.1615291471.git.christophe.leroy@csgroup.eu>

In preparation of porting PPC32 to C syscall entry/exit,
create C version of kuap_save_and_lock() and kuap_user_restore() and
kuap_kernel_restore() and kuap_check() and kuap_get_and_check()
on book3s/32.

Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
---
 arch/powerpc/include/asm/book3s/32/kup.h | 45 ++++++++++++++++++++++++
 1 file changed, 45 insertions(+)

diff --git a/arch/powerpc/include/asm/book3s/32/kup.h b/arch/powerpc/include/asm/book3s/32/kup.h
index b97ea60f6fa3..c9d6c28bcd10 100644
--- a/arch/powerpc/include/asm/book3s/32/kup.h
+++ b/arch/powerpc/include/asm/book3s/32/kup.h
@@ -72,6 +72,51 @@ static inline void kuap_update_sr(u32 sr, u32 addr, u32 end)
 	isync();	/* Context sync required after mtsr() */
 }
 
+static inline void kuap_save_and_lock(struct pt_regs *regs)
+{
+	unsigned long kuap = current->thread.kuap;
+	u32 addr = kuap & 0xf0000000;
+	u32 end = kuap << 28;
+
+	regs->kuap = kuap;
+	if (unlikely(!kuap))
+		return;
+
+	current->thread.kuap = 0;
+	kuap_update_sr(mfsr(addr) | SR_KS, addr, end);	/* Set Ks */
+}
+
+static inline void kuap_user_restore(struct pt_regs *regs)
+{
+}
+
+static inline void kuap_kernel_restore(struct pt_regs *regs, unsigned long kuap)
+{
+	u32 addr = regs->kuap & 0xf0000000;
+	u32 end = regs->kuap << 28;
+
+	current->thread.kuap = regs->kuap;
+
+	if (unlikely(regs->kuap == kuap))
+		return;
+
+	kuap_update_sr(mfsr(addr) & ~SR_KS, addr, end);	/* Clear Ks */
+}
+
+static inline unsigned long kuap_get_and_check(void)
+{
+	unsigned long kuap = current->thread.kuap;
+
+	WARN_ON_ONCE(IS_ENABLED(CONFIG_PPC_KUAP_DEBUG) && kuap != 0);
+
+	return kuap;
+}
+
+static inline void kuap_check(void)
+{
+	kuap_get_and_check();
+}
+
 static __always_inline void allow_user_access(void __user *to, const void __user *from,
 					      u32 size, unsigned long dir)
 {
-- 
2.25.0


^ permalink raw reply related

* [PATCH v2 42/43] powerpc/8xx: Create C version of kuap save/restore/check helpers
From: Christophe Leroy @ 2021-03-09 12:10 UTC (permalink / raw)
  To: Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman, npiggin
  Cc: linuxppc-dev, linux-kernel
In-Reply-To: <cover.1615291471.git.christophe.leroy@csgroup.eu>

In preparation of porting PPC32 to C syscall entry/exit,
create C version of kuap_save_and_lock() and kuap_user_restore() and
kuap_kernel_restore() and kuap_check() and kuap_get_and_check() on 8xx.

Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
---
 arch/powerpc/include/asm/nohash/32/kup-8xx.h | 31 ++++++++++++++++++++
 1 file changed, 31 insertions(+)

diff --git a/arch/powerpc/include/asm/nohash/32/kup-8xx.h b/arch/powerpc/include/asm/nohash/32/kup-8xx.h
index 17a4a616436f..c74f5704bc47 100644
--- a/arch/powerpc/include/asm/nohash/32/kup-8xx.h
+++ b/arch/powerpc/include/asm/nohash/32/kup-8xx.h
@@ -34,6 +34,37 @@
 
 #include <asm/reg.h>
 
+static inline void kuap_save_and_lock(struct pt_regs *regs)
+{
+	regs->kuap = mfspr(SPRN_MD_AP);
+	mtspr(SPRN_MD_AP, MD_APG_KUAP);
+}
+
+static inline void kuap_user_restore(struct pt_regs *regs)
+{
+}
+
+static inline void kuap_kernel_restore(struct pt_regs *regs, unsigned long kuap)
+{
+	mtspr(SPRN_MD_AP, regs->kuap);
+}
+
+static inline unsigned long kuap_get_and_check(void)
+{
+	unsigned long kuap = mfspr(SPRN_MD_AP);
+
+	if (IS_ENABLED(CONFIG_PPC_KUAP_DEBUG))
+		WARN_ON_ONCE(kuap >> 16 != MD_APG_KUAP >> 16);
+
+	return kuap;
+}
+
+static inline void kuap_check(void)
+{
+	if (IS_ENABLED(CONFIG_PPC_KUAP_DEBUG))
+		kuap_get_and_check();
+}
+
 static inline void allow_user_access(void __user *to, const void __user *from,
 				     unsigned long size, unsigned long dir)
 {
-- 
2.25.0


^ permalink raw reply related

* [PATCH v2 40/43] powerpc/64s: Make kuap_check_amr() and kuap_get_and_check_amr() generic
From: Christophe Leroy @ 2021-03-09 12:10 UTC (permalink / raw)
  To: Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman, npiggin
  Cc: linuxppc-dev, linux-kernel
In-Reply-To: <cover.1615291471.git.christophe.leroy@csgroup.eu>

In preparation of porting powerpc32 to C syscall entry/exit,
rename kuap_check_amr() and kuap_get_and_check_amr() as kuap_check()
and kuap_get_and_check(), and move in the generic asm/kup.h the stub
for when CONFIG_PPC_KUAP is not selected.

Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
---
 arch/powerpc/include/asm/book3s/64/kup.h | 24 ++----------------------
 arch/powerpc/include/asm/kup.h           | 10 +++++++++-
 arch/powerpc/kernel/interrupt.c          | 12 ++++++------
 arch/powerpc/kernel/irq.c                |  2 +-
 4 files changed, 18 insertions(+), 30 deletions(-)

diff --git a/arch/powerpc/include/asm/book3s/64/kup.h b/arch/powerpc/include/asm/book3s/64/kup.h
index 8bd905050896..d9b07e9998be 100644
--- a/arch/powerpc/include/asm/book3s/64/kup.h
+++ b/arch/powerpc/include/asm/book3s/64/kup.h
@@ -287,7 +287,7 @@ static inline void kuap_kernel_restore(struct pt_regs *regs,
 	 */
 }
 
-static inline unsigned long kuap_get_and_check_amr(void)
+static inline unsigned long kuap_get_and_check(void)
 {
 	if (mmu_has_feature(MMU_FTR_BOOK3S_KUAP)) {
 		unsigned long amr = mfspr(SPRN_AMR);
@@ -298,27 +298,7 @@ static inline unsigned long kuap_get_and_check_amr(void)
 	return 0;
 }
 
-#else /* CONFIG_PPC_PKEY */
-
-static inline void kuap_user_restore(struct pt_regs *regs)
-{
-}
-
-static inline void kuap_kernel_restore(struct pt_regs *regs, unsigned long amr)
-{
-}
-
-static inline unsigned long kuap_get_and_check_amr(void)
-{
-	return 0;
-}
-
-#endif /* CONFIG_PPC_PKEY */
-
-
-#ifdef CONFIG_PPC_KUAP
-
-static inline void kuap_check_amr(void)
+static inline void kuap_check(void)
 {
 	if (IS_ENABLED(CONFIG_PPC_KUAP_DEBUG) && mmu_has_feature(MMU_FTR_BOOK3S_KUAP))
 		WARN_ON_ONCE(mfspr(SPRN_AMR) != AMR_KUAP_BLOCKED);
diff --git a/arch/powerpc/include/asm/kup.h b/arch/powerpc/include/asm/kup.h
index 25671f711ec2..b7efa46b3109 100644
--- a/arch/powerpc/include/asm/kup.h
+++ b/arch/powerpc/include/asm/kup.h
@@ -74,7 +74,15 @@ bad_kuap_fault(struct pt_regs *regs, unsigned long address, bool is_write)
 	return false;
 }
 
-static inline void kuap_check_amr(void) { }
+static inline void kuap_check(void) { }
+static inline void kuap_save_and_lock(struct pt_regs *regs) { }
+static inline void kuap_user_restore(struct pt_regs *regs) { }
+static inline void kuap_kernel_restore(struct pt_regs *regs, unsigned long amr) { }
+
+static inline unsigned long kuap_get_and_check(void)
+{
+	return 0;
+}
 
 /*
  * book3s/64/kup-radix.h defines these functions for the !KUAP case to flush
diff --git a/arch/powerpc/kernel/interrupt.c b/arch/powerpc/kernel/interrupt.c
index 727b7848c9cc..40ed55064e54 100644
--- a/arch/powerpc/kernel/interrupt.c
+++ b/arch/powerpc/kernel/interrupt.c
@@ -76,7 +76,7 @@ notrace long system_call_exception(long r3, long r4, long r5,
 	} else
 #endif
 #ifdef CONFIG_PPC64
-		kuap_check_amr();
+		kuap_check();
 #endif
 
 	booke_restore_dbcr0();
@@ -254,7 +254,7 @@ notrace unsigned long syscall_exit_prepare(unsigned long r3,
 	CT_WARN_ON(ct_state() == CONTEXT_USER);
 
 #ifdef CONFIG_PPC64
-	kuap_check_amr();
+	kuap_check();
 #endif
 
 	regs->result = r3;
@@ -380,7 +380,7 @@ notrace unsigned long interrupt_exit_user_prepare(struct pt_regs *regs, unsigned
 	 * AMR can only have been unlocked if we interrupted the kernel.
 	 */
 #ifdef CONFIG_PPC64
-	kuap_check_amr();
+	kuap_check();
 #endif
 
 	local_irq_save(flags);
@@ -451,7 +451,7 @@ notrace unsigned long interrupt_exit_kernel_prepare(struct pt_regs *regs, unsign
 	unsigned long flags;
 	unsigned long ret = 0;
 #ifdef CONFIG_PPC64
-	unsigned long amr;
+	unsigned long kuap;
 #endif
 
 	if (!IS_ENABLED(CONFIG_BOOKE) && !IS_ENABLED(CONFIG_40x) &&
@@ -467,7 +467,7 @@ notrace unsigned long interrupt_exit_kernel_prepare(struct pt_regs *regs, unsign
 		CT_WARN_ON(ct_state() == CONTEXT_USER);
 
 #ifdef CONFIG_PPC64
-	amr = kuap_get_and_check_amr();
+	kuap = kuap_get_and_check();
 #endif
 
 	if (unlikely(current_thread_info()->flags & _TIF_EMULATE_STACK_STORE)) {
@@ -511,7 +511,7 @@ notrace unsigned long interrupt_exit_kernel_prepare(struct pt_regs *regs, unsign
 	 * value from the check above.
 	 */
 #ifdef CONFIG_PPC64
-	kuap_kernel_restore(regs, amr);
+	kuap_kernel_restore(regs, kuap);
 #endif
 
 	return ret;
diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c
index d71fd10a1dd4..3b18d2b2c702 100644
--- a/arch/powerpc/kernel/irq.c
+++ b/arch/powerpc/kernel/irq.c
@@ -282,7 +282,7 @@ static inline void replay_soft_interrupts_irqrestore(void)
 	 * and re-locking AMR but we shouldn't get here in the first place,
 	 * hence the warning.
 	 */
-	kuap_check_amr();
+	kuap_check();
 
 	if (kuap_state != AMR_KUAP_BLOCKED)
 		set_kuap(AMR_KUAP_BLOCKED);
-- 
2.25.0


^ permalink raw reply related

* [PATCH v2 39/43] powerpc/32s: Move KUEP locking/unlocking in C
From: Christophe Leroy @ 2021-03-09 12:10 UTC (permalink / raw)
  To: Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman, npiggin
  Cc: linuxppc-dev, linux-kernel
In-Reply-To: <cover.1615291471.git.christophe.leroy@csgroup.eu>

This can be done in C, do it.

Unrolling the loop gains approx. 15% performance.

From now on, prepare_transfer_to_handler() is only for
interrupts from kernel.

Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
---
 arch/powerpc/include/asm/book3s/32/kup.h | 31 -------------------
 arch/powerpc/include/asm/interrupt.h     |  3 ++
 arch/powerpc/include/asm/kup.h           |  8 +++++
 arch/powerpc/kernel/entry_32.S           | 16 +---------
 arch/powerpc/kernel/interrupt.c          |  4 +++
 arch/powerpc/mm/book3s32/Makefile        |  1 +
 arch/powerpc/mm/book3s32/kuep.c          | 38 ++++++++++++++++++++++++
 7 files changed, 55 insertions(+), 46 deletions(-)
 create mode 100644 arch/powerpc/mm/book3s32/kuep.c

diff --git a/arch/powerpc/include/asm/book3s/32/kup.h b/arch/powerpc/include/asm/book3s/32/kup.h
index 73bc5d2c431d..b97ea60f6fa3 100644
--- a/arch/powerpc/include/asm/book3s/32/kup.h
+++ b/arch/powerpc/include/asm/book3s/32/kup.h
@@ -7,37 +7,6 @@
 
 #ifdef __ASSEMBLY__
 
-.macro kuep_update_sr	gpr1, gpr2		/* NEVER use r0 as gpr2 due to addis */
-101:	mtsrin	\gpr1, \gpr2
-	addi	\gpr1, \gpr1, 0x111		/* next VSID */
-	rlwinm	\gpr1, \gpr1, 0, 0xf0ffffff	/* clear VSID overflow */
-	addis	\gpr2, \gpr2, 0x1000		/* address of next segment */
-	bdnz	101b
-	isync
-.endm
-
-.macro kuep_lock	gpr1, gpr2
-#ifdef CONFIG_PPC_KUEP
-	li	\gpr1, NUM_USER_SEGMENTS
-	li	\gpr2, 0
-	mtctr	\gpr1
-	mfsrin	\gpr1, \gpr2
-	oris	\gpr1, \gpr1, SR_NX@h		/* set Nx */
-	kuep_update_sr \gpr1, \gpr2
-#endif
-.endm
-
-.macro kuep_unlock	gpr1, gpr2
-#ifdef CONFIG_PPC_KUEP
-	li	\gpr1, NUM_USER_SEGMENTS
-	li	\gpr2, 0
-	mtctr	\gpr1
-	mfsrin	\gpr1, \gpr2
-	rlwinm	\gpr1, \gpr1, 0, ~SR_NX		/* Clear Nx */
-	kuep_update_sr \gpr1, \gpr2
-#endif
-.endm
-
 #ifdef CONFIG_PPC_KUAP
 
 .macro kuap_update_sr	gpr1, gpr2, gpr3	/* NEVER use r0 as gpr2 due to addis */
diff --git a/arch/powerpc/include/asm/interrupt.h b/arch/powerpc/include/asm/interrupt.h
index e6d71c2e3aa2..d4bfe94b4a68 100644
--- a/arch/powerpc/include/asm/interrupt.h
+++ b/arch/powerpc/include/asm/interrupt.h
@@ -34,6 +34,7 @@ static inline void interrupt_enter_prepare(struct pt_regs *regs, struct interrup
 		trace_hardirqs_off();
 
 	if (user_mode(regs)) {
+		kuep_lock();
 		current->thread.regs = regs;
 		account_cpu_user_entry();
 	}
@@ -91,6 +92,8 @@ static inline void interrupt_exit_prepare(struct pt_regs *regs, struct interrupt
 	exception_exit(state->ctx_state);
 #endif
 
+	if (user_mode(regs))
+		kuep_unlock();
 	/*
 	 * Book3S exits to user via interrupt_exit_user_prepare(), which does
 	 * context tracking, which is a cleaner way to handle PREEMPT=y
diff --git a/arch/powerpc/include/asm/kup.h b/arch/powerpc/include/asm/kup.h
index 7ec21af49a45..25671f711ec2 100644
--- a/arch/powerpc/include/asm/kup.h
+++ b/arch/powerpc/include/asm/kup.h
@@ -55,6 +55,14 @@ void setup_kuep(bool disabled);
 static inline void setup_kuep(bool disabled) { }
 #endif /* CONFIG_PPC_KUEP */
 
+#if defined(CONFIG_PPC_KUEP) && defined(CONFIG_PPC_BOOK3S_32)
+void kuep_lock(void);
+void kuep_unlock(void);
+#else
+static inline void kuep_lock(void) { }
+static inline void kuep_unlock(void) { }
+#endif
+
 #ifdef CONFIG_PPC_KUAP
 void setup_kuap(bool disabled);
 #else
diff --git a/arch/powerpc/kernel/entry_32.S b/arch/powerpc/kernel/entry_32.S
index 9c333e6db5fa..850cb17a937f 100644
--- a/arch/powerpc/kernel/entry_32.S
+++ b/arch/powerpc/kernel/entry_32.S
@@ -53,14 +53,9 @@
 prepare_transfer_to_handler:
 	andi.	r0,r9,MSR_PR
 	addi	r12, r2, THREAD
-	beq	2f
-#ifdef CONFIG_PPC_BOOK3S_32
-	kuep_lock r11, r12
-#endif
-	blr
+	bnelr
 
 	/* if from kernel, check interrupted DOZE/NAP mode */
-2:
 	kuap_save_and_lock r11, r12, r9, r5, r6
 	lwz	r12,TI_LOCAL_FLAGS(r2)
 	mtcrf	0x01,r12
@@ -84,9 +79,6 @@ _ASM_NOKPROBE_SYMBOL(prepare_transfer_to_handler)
 	.globl	transfer_to_syscall
 transfer_to_syscall:
 	SAVE_NVGPRS(r1)
-#ifdef CONFIG_PPC_BOOK3S_32
-	kuep_lock r11, r12
-#endif
 
 	/* Calling convention has r9 = orig r0, r10 = regs */
 	addi	r10,r1,STACK_FRAME_OVERHEAD
@@ -104,9 +96,6 @@ ret_from_syscall:
 	cmplwi	cr0,r5,0
 	bne-	2f
 #endif /* CONFIG_PPC_47x */
-#ifdef CONFIG_PPC_BOOK3S_32
-	kuep_unlock r5, r7
-#endif
 	kuap_check r2, r4
 	lwz	r4,_LINK(r1)
 	lwz	r5,_CCR(r1)
@@ -287,9 +276,6 @@ interrupt_return:
 	bne-	.Lrestore_nvgprs
 
 .Lfast_user_interrupt_return:
-#ifdef CONFIG_PPC_BOOK3S_32
-	kuep_unlock	r10, r11
-#endif
 	kuap_check r2, r4
 	lwz	r11,_NIP(r1)
 	lwz	r12,_MSR(r1)
diff --git a/arch/powerpc/kernel/interrupt.c b/arch/powerpc/kernel/interrupt.c
index 7082e8ee825e..727b7848c9cc 100644
--- a/arch/powerpc/kernel/interrupt.c
+++ b/arch/powerpc/kernel/interrupt.c
@@ -33,6 +33,8 @@ notrace long system_call_exception(long r3, long r4, long r5,
 {
 	syscall_fn f;
 
+	kuep_lock();
+
 	regs->orig_gpr3 = r3;
 
 	if (IS_ENABLED(CONFIG_PPC_IRQ_SOFT_MASK_DEBUG))
@@ -354,6 +356,8 @@ notrace unsigned long syscall_exit_prepare(unsigned long r3,
 	 */
 	kuap_user_restore(regs);
 #endif
+	kuep_unlock();
+
 	return ret;
 }
 
diff --git a/arch/powerpc/mm/book3s32/Makefile b/arch/powerpc/mm/book3s32/Makefile
index 446d9de88ce4..7f0c8a78ba0c 100644
--- a/arch/powerpc/mm/book3s32/Makefile
+++ b/arch/powerpc/mm/book3s32/Makefile
@@ -9,3 +9,4 @@ endif
 obj-y += mmu.o mmu_context.o
 obj-$(CONFIG_PPC_BOOK3S_603) += nohash_low.o
 obj-$(CONFIG_PPC_BOOK3S_604) += hash_low.o tlb.o
+obj-$(CONFIG_PPC_KUEP) += kuep.o
diff --git a/arch/powerpc/mm/book3s32/kuep.c b/arch/powerpc/mm/book3s32/kuep.c
new file mode 100644
index 000000000000..c70532568a28
--- /dev/null
+++ b/arch/powerpc/mm/book3s32/kuep.c
@@ -0,0 +1,38 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+#include <asm/reg.h>
+#include <asm/task_size_32.h>
+#include <asm/mmu.h>
+
+#define KUEP_UPDATE_TWO_USER_SEGMENTS(n) do {		\
+	if (TASK_SIZE > ((n) << 28))			\
+		mtsr(val1, (n) << 28);			\
+	if (TASK_SIZE > (((n) + 1) << 28))		\
+		mtsr(val2, ((n) + 1) << 28);		\
+	val1 = (val1 + 0x222) & 0xf0ffffff;		\
+	val2 = (val2 + 0x222) & 0xf0ffffff;		\
+} while (0)
+
+static __always_inline void kuep_update(u32 val)
+{
+	int val1 = val;
+	int val2 = (val + 0x111) & 0xf0ffffff;
+
+	KUEP_UPDATE_TWO_USER_SEGMENTS(0);
+	KUEP_UPDATE_TWO_USER_SEGMENTS(2);
+	KUEP_UPDATE_TWO_USER_SEGMENTS(4);
+	KUEP_UPDATE_TWO_USER_SEGMENTS(6);
+	KUEP_UPDATE_TWO_USER_SEGMENTS(8);
+	KUEP_UPDATE_TWO_USER_SEGMENTS(10);
+	KUEP_UPDATE_TWO_USER_SEGMENTS(12);
+	KUEP_UPDATE_TWO_USER_SEGMENTS(14);
+}
+
+void kuep_lock(void)
+{
+	kuep_update(mfsr(0) | SR_NX);
+}
+
+void kuep_unlock(void)
+{
+	kuep_update(mfsr(0) & ~SR_NX);
+}
-- 
2.25.0


^ permalink raw reply related

* [PATCH v2 38/43] powerpc/32: Only use prepare_transfer_to_handler function on book3s/32 and e500
From: Christophe Leroy @ 2021-03-09 12:10 UTC (permalink / raw)
  To: Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman, npiggin
  Cc: linuxppc-dev, linux-kernel
In-Reply-To: <cover.1615291471.git.christophe.leroy@csgroup.eu>

Only book3s/32 and e500 have significative work to do in
prepare_transfer_to_handler.

Other 32 bit have nothing to do at all.

Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
---
 arch/powerpc/kernel/entry_32.S   | 6 ++----
 arch/powerpc/kernel/head_32.h    | 2 ++
 arch/powerpc/kernel/head_booke.h | 2 ++
 3 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/kernel/entry_32.S b/arch/powerpc/kernel/entry_32.S
index 5cfa10816261..9c333e6db5fa 100644
--- a/arch/powerpc/kernel/entry_32.S
+++ b/arch/powerpc/kernel/entry_32.S
@@ -48,6 +48,7 @@
  */
 	.align	12
 
+#if defined(CONFIG_PPC_BOOK3S_32) || defined(CONFIG_E500)
 	.globl	prepare_transfer_to_handler
 prepare_transfer_to_handler:
 	andi.	r0,r9,MSR_PR
@@ -61,15 +62,12 @@ prepare_transfer_to_handler:
 	/* if from kernel, check interrupted DOZE/NAP mode */
 2:
 	kuap_save_and_lock r11, r12, r9, r5, r6
-#if defined(CONFIG_PPC_BOOK3S_32) || defined(CONFIG_E500)
 	lwz	r12,TI_LOCAL_FLAGS(r2)
 	mtcrf	0x01,r12
 	bt-	31-TLF_NAPPING,4f
 	bt-	31-TLF_SLEEPING,7f
-#endif /* CONFIG_PPC_BOOK3S_32 || CONFIG_E500 */
 	blr
 
-#if defined (CONFIG_PPC_BOOK3S_32) || defined(CONFIG_E500)
 4:	rlwinm	r12,r12,0,~_TLF_NAPPING
 	stw	r12,TI_LOCAL_FLAGS(r2)
 	b	power_save_ppc32_restore
@@ -80,8 +78,8 @@ prepare_transfer_to_handler:
 	rlwinm	r9,r9,0,~MSR_EE
 	stw	r9,_MSR(r11)
 	b	fast_exception_return
-#endif
 _ASM_NOKPROBE_SYMBOL(prepare_transfer_to_handler)
+#endif /* CONFIG_PPC_BOOK3S_32 || CONFIG_E500 */
 
 	.globl	transfer_to_syscall
 transfer_to_syscall:
diff --git a/arch/powerpc/kernel/head_32.h b/arch/powerpc/kernel/head_32.h
index 267479072495..ca303762d8cc 100644
--- a/arch/powerpc/kernel/head_32.h
+++ b/arch/powerpc/kernel/head_32.h
@@ -132,7 +132,9 @@ _ASM_NOKPROBE_SYMBOL(\name\()_virt)
 .endm
 
 .macro prepare_transfer_to_handler
+#ifdef CONFIG_PPC_BOOK3S_32
 	bl	prepare_transfer_to_handler
+#endif
 .endm
 
 .macro SYSCALL_ENTRY trapno
diff --git a/arch/powerpc/kernel/head_booke.h b/arch/powerpc/kernel/head_booke.h
index 4d583fbef0b6..a2565023d2d0 100644
--- a/arch/powerpc/kernel/head_booke.h
+++ b/arch/powerpc/kernel/head_booke.h
@@ -92,7 +92,9 @@ END_BTB_FLUSH_SECTION
 .endm
 
 .macro prepare_transfer_to_handler
+#ifdef CONFIG_E500
 	bl	prepare_transfer_to_handler
+#endif
 .endm
 
 .macro SYSCALL_ENTRY trapno intno srr1
-- 
2.25.0


^ permalink raw reply related

* [PATCH v2 37/43] powerpc/32: Return directly from power_save_ppc32_restore()
From: Christophe Leroy @ 2021-03-09 12:10 UTC (permalink / raw)
  To: Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman, npiggin
  Cc: linuxppc-dev, linux-kernel
In-Reply-To: <cover.1615291471.git.christophe.leroy@csgroup.eu>

transfer_to_handler_cont: is now just a blr.

Directly perform blr in power_save_ppc32_restore().

Also remove useless setting of r11 in e500 version of
power_save_ppc32_restore().

Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
---
 arch/powerpc/kernel/entry_32.S  |  3 ---
 arch/powerpc/kernel/idle_6xx.S  |  2 +-
 arch/powerpc/kernel/idle_e500.S | 10 +---------
 3 files changed, 2 insertions(+), 13 deletions(-)

diff --git a/arch/powerpc/kernel/entry_32.S b/arch/powerpc/kernel/entry_32.S
index 815a4ff1ba76..5cfa10816261 100644
--- a/arch/powerpc/kernel/entry_32.S
+++ b/arch/powerpc/kernel/entry_32.S
@@ -67,8 +67,6 @@ prepare_transfer_to_handler:
 	bt-	31-TLF_NAPPING,4f
 	bt-	31-TLF_SLEEPING,7f
 #endif /* CONFIG_PPC_BOOK3S_32 || CONFIG_E500 */
-	.globl transfer_to_handler_cont
-transfer_to_handler_cont:
 	blr
 
 #if defined (CONFIG_PPC_BOOK3S_32) || defined(CONFIG_E500)
@@ -84,7 +82,6 @@ transfer_to_handler_cont:
 	b	fast_exception_return
 #endif
 _ASM_NOKPROBE_SYMBOL(prepare_transfer_to_handler)
-_ASM_NOKPROBE_SYMBOL(transfer_to_handler_cont)
 
 	.globl	transfer_to_syscall
 transfer_to_syscall:
diff --git a/arch/powerpc/kernel/idle_6xx.S b/arch/powerpc/kernel/idle_6xx.S
index 153366e178c4..13cad9297d82 100644
--- a/arch/powerpc/kernel/idle_6xx.S
+++ b/arch/powerpc/kernel/idle_6xx.S
@@ -176,7 +176,7 @@ BEGIN_FTR_SECTION
 	lwz	r9,nap_save_hid1@l(r9)
 	mtspr	SPRN_HID1, r9
 END_FTR_SECTION_IFSET(CPU_FTR_DUAL_PLL_750FX)
-	b	transfer_to_handler_cont
+	blr
 _ASM_NOKPROBE_SYMBOL(power_save_ppc32_restore)
 
 	.data
diff --git a/arch/powerpc/kernel/idle_e500.S b/arch/powerpc/kernel/idle_e500.S
index 7795727e7f08..9e1bc4502c50 100644
--- a/arch/powerpc/kernel/idle_e500.S
+++ b/arch/powerpc/kernel/idle_e500.S
@@ -81,13 +81,5 @@ END_FTR_SECTION_IFSET(CPU_FTR_CAN_NAP)
 _GLOBAL(power_save_ppc32_restore)
 	lwz	r9,_LINK(r11)		/* interrupted in e500_idle */
 	stw	r9,_NIP(r11)		/* make it do a blr */
-
-#ifdef CONFIG_SMP
-	lwz	r11,TASK_CPU(r2)		/* get cpu number * 4 */
-	slwi	r11,r11,2
-#else
-	li	r11,0
-#endif
-
-	b	transfer_to_handler_cont
+	blr
 _ASM_NOKPROBE_SYMBOL(power_save_ppc32_restore)
-- 
2.25.0


^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox