* [RFC][PATCH 0/2] Add support for using reserved memory for ima buffer pass
From: Prakhar Srivastava @ 2020-05-04 20:38 UTC (permalink / raw)
To: linux-arm-kernel, linux-kernel, linuxppc-dev, devicetree,
linux-integrity, linux-security-module
Cc: kstewart, mark.rutland, catalin.marinas, bhsharma, tao.li, zohar,
paulus, vincenzo.frascino, frowand.list, nramas, masahiroy,
jmorris, takahiro.akashi, serge, pasha.tatashin, will, prsriva,
robh+dt, hsinyi, tusharsu, tglx, allison, mbrugger, balajib,
dmitry.kasatkin, james.morse, gregkh
IMA during kexec(kexec file load) verifies the kernel signature and measures
the signature of the kernel. The signature in the logs can be used to verfiy the
authenticity of the kernel. The logs don not get carried over kexec and thus
remote attesation cannot verify the signature of the running kernel.
Introduce an ABI to carry forward the ima logs over kexec.
Memory reserved via device tree reservation can be used to store and read
via the of_* functions.
Reserved memory stores the size(sizeof(size_t)) of the buffer in the starting
address, followed by the IMA log contents.
Tested on:
arm64 with Uboot
Prakhar Srivastava (2):
Add a layer of abstraction to use the memory reserved by device tree
for ima buffer pass.
Add support for ima buffer pass using reserved memory for arm64 kexec.
Update the arch sepcific code path in kexec file load to store the
ima buffer in the reserved memory. The same reserved memory is read
on kexec or cold boot.
arch/arm64/Kconfig | 1 +
arch/arm64/include/asm/ima.h | 22 ++++
arch/arm64/include/asm/kexec.h | 5 +
arch/arm64/kernel/Makefile | 1 +
arch/arm64/kernel/ima_kexec.c | 64 ++++++++++
arch/arm64/kernel/machine_kexec_file.c | 1 +
arch/powerpc/include/asm/ima.h | 3 +-
arch/powerpc/kexec/ima.c | 14 ++-
drivers/of/Kconfig | 6 +
drivers/of/Makefile | 1 +
drivers/of/of_ima.c | 165 +++++++++++++++++++++++++
include/linux/of.h | 34 +++++
security/integrity/ima/ima_kexec.c | 15 ++-
13 files changed, 325 insertions(+), 7 deletions(-)
create mode 100644 arch/arm64/include/asm/ima.h
create mode 100644 arch/arm64/kernel/ima_kexec.c
create mode 100644 drivers/of/of_ima.c
--
2.25.1
^ permalink raw reply
* [RFC][PATCH 1/2] Add a layer of abstraction to use the memory reserved by device tree for ima buffer pass.
From: Prakhar Srivastava @ 2020-05-04 20:38 UTC (permalink / raw)
To: linux-arm-kernel, linux-kernel, linuxppc-dev, devicetree,
linux-integrity, linux-security-module
Cc: kstewart, mark.rutland, catalin.marinas, bhsharma, tao.li, zohar,
paulus, vincenzo.frascino, frowand.list, nramas, masahiroy,
jmorris, takahiro.akashi, serge, pasha.tatashin, will, prsriva,
robh+dt, hsinyi, tusharsu, tglx, allison, mbrugger, balajib,
dmitry.kasatkin, james.morse, gregkh
In-Reply-To: <20200504203829.6330-1-prsriva@linux.microsoft.com>
Introduce a device tree layer for to read and store ima buffer
from the reserved memory section of a device tree.
Signed-off-by: Prakhar Srivastava <prsriva@linux.microsoft.com>
---
drivers/of/Kconfig | 6 ++
drivers/of/Makefile | 1 +
drivers/of/of_ima.c | 165 ++++++++++++++++++++++++++++++++++++++++++++
include/linux/of.h | 34 +++++++++
4 files changed, 206 insertions(+)
create mode 100644 drivers/of/of_ima.c
diff --git a/drivers/of/Kconfig b/drivers/of/Kconfig
index d91618641be6..edb3c39740fb 100644
--- a/drivers/of/Kconfig
+++ b/drivers/of/Kconfig
@@ -107,4 +107,10 @@ config OF_DMA_DEFAULT_COHERENT
# arches should select this if DMA is coherent by default for OF devices
bool
+config OF_IMA
+ def_bool y
+ help
+ IMA related wrapper functions to add/remove ima measurement logs during
+ kexec_file_load call.
+
endif # OF
diff --git a/drivers/of/Makefile b/drivers/of/Makefile
index 663a4af0cccd..b4caf083df4e 100644
--- a/drivers/of/Makefile
+++ b/drivers/of/Makefile
@@ -14,5 +14,6 @@ obj-$(CONFIG_OF_RESERVED_MEM) += of_reserved_mem.o
obj-$(CONFIG_OF_RESOLVE) += resolver.o
obj-$(CONFIG_OF_OVERLAY) += overlay.o
obj-$(CONFIG_OF_NUMA) += of_numa.o
+obj-$(CONFIG_OF_IMA) += of_ima.o
obj-$(CONFIG_OF_UNITTEST) += unittest-data/
diff --git a/drivers/of/of_ima.c b/drivers/of/of_ima.c
new file mode 100644
index 000000000000..131f68d81e2e
--- /dev/null
+++ b/drivers/of/of_ima.c
@@ -0,0 +1,165 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2020 Microsoft Corporation.
+ */
+
+#include <linux/slab.h>
+#include <linux/kexec.h>
+#include <linux/of.h>
+#include <linux/memblock.h>
+#include <linux/libfdt.h>
+#include <linux/of_address.h>
+
+static bool dtb_status_enabled;
+static struct resource mem_res;
+static void *vaddr;
+
+
+/**
+ * of_is_ima_memory_reserved - check if memory is reserved via device
+ * tree.
+ * Return: zero when memory is not reserved.
+ * positive number on success.
+ *
+ */
+int of_is_ima_memory_reserved(void)
+{
+ return dtb_status_enabled;
+}
+
+/**
+ * of_ima_write_buffer - Write the ima buffer into the reserved memory.
+ *
+ * ima_buffer - buffer starting address.
+ * ima_buffer_size - size of segment.
+ *
+ * Return: 0 on success, negative errno on error.
+ */
+int of_ima_write_buffer(void *ima_buffer, size_t ima_buffer_size)
+{
+ void *addr;
+
+ if (!dtb_status_enabled)
+ return -EOPNOTSUPP;
+
+ vaddr = memremap(mem_res.start, resource_size(&mem_res), MEMREMAP_WB);
+ pr_info("Mapped reserved memory, vaddr: 0x%0llX, paddr: 0x%0llX\n , size : %lld",
+ (u64)vaddr, mem_res.start, resource_size(&mem_res));
+
+ if (vaddr) {
+ memcpy(vaddr, &ima_buffer_size, sizeof(size_t));
+ addr = vaddr + sizeof(size_t);
+ memcpy(addr, ima_buffer, ima_buffer_size);
+ memunmap(vaddr);
+ vaddr = NULL;
+ }
+
+ return 0;
+}
+
+/**
+ * of_remove_ima_buffer - Write 0(Zero length buffer to read)to the
+ * size location of the buffer.
+ *
+ * Return: 0 on success, negative errno on error.
+ */
+int of_remove_ima_buffer(void)
+{
+ size_t empty_buffer_size = 0;
+
+ if (!dtb_status_enabled)
+ return -ENOTSUPP;
+
+ if (vaddr) {
+ memcpy(vaddr, &empty_buffer_size, sizeof(size_t));
+ memunmap(vaddr);
+ vaddr = NULL;
+ }
+
+ return 0;
+}
+
+/**
+ * of_ima_get_size_allocated - Get the usable buffer size thats allocated in
+ * the device-tree.
+ *
+ * Return: 0 on unavailable node, size of the memory block - (size_t)
+ */
+size_t of_ima_get_size_allocated(void)
+{
+ size_t size = 0;
+
+ if (!dtb_status_enabled)
+ return size;
+
+ size = resource_size(&mem_res) - sizeof(size_t);
+ return size;
+}
+
+/**
+ * of_get_ima_buffer - Get IMA buffer address.
+ *
+ * @addr: On successful return, set to point to the buffer contents.
+ * @size: On successful return, set to the buffer size.
+ *
+ * Return: 0 on success, negative errno on error.
+ */
+int of_get_ima_buffer(void **addr, size_t *size)
+{
+ if (!dtb_status_enabled)
+ return -ENOTSUPP;
+
+ vaddr = memremap(mem_res.start, resource_size(&mem_res), MEMREMAP_WB);
+ pr_info("Mapped reserved memory, vaddr: 0x%0llX, paddr: 0x%0llX,\n allocated size : %lld, ima_buffer_size: %ld ",
+ (u64)vaddr, mem_res.start, resource_size(&mem_res), *(size_t *)vaddr);
+
+ *size = *(size_t *)vaddr;
+ *addr = vaddr + sizeof(size_t);
+ return 0;
+}
+
+static const struct of_device_id ima_buffer_pass_ids[] = {
+ {
+ .compatible = "linux,ima_buffer_pass",
+ },
+ {}
+};
+
+static const struct of_device_id ima_buffer_pass_match[] = {
+ {
+ .name = "ima_buffer_pass",
+ },
+};
+MODULE_DEVICE_TABLE(of, ima_buffer_pass_match);
+
+static int __init ima_buffer_pass_init(void)
+{
+ int ret = 0;
+ struct device_node *memnp;
+ struct device_node *ima_buffer_pass_node;
+
+ ima_buffer_pass_node = of_find_matching_node(NULL, ima_buffer_pass_ids);
+ if (!ima_buffer_pass_node)
+ return -ENOENT;
+
+ memnp = of_parse_phandle(ima_buffer_pass_node, "memory-region", 0);
+ if (!memnp)
+ return -ENXIO;
+
+ ret = of_address_to_resource(memnp, 0, &mem_res);
+ if (ret < 0)
+ return -ENOENT;
+
+ of_node_put(memnp);
+ dtb_status_enabled = true;
+
+ return ret;
+}
+
+static void __exit ima_buffer_pass_exit(void)
+{
+ pr_info("trying to exit the ima driver\n");
+}
+
+module_init(ima_buffer_pass_init);
+module_exit(ima_buffer_pass_exit);
diff --git a/include/linux/of.h b/include/linux/of.h
index c669c0a4732f..85ce2f24024f 100644
--- a/include/linux/of.h
+++ b/include/linux/of.h
@@ -1485,4 +1485,38 @@ static inline int of_overlay_notifier_unregister(struct notifier_block *nb)
#endif
+#ifdef CONFIG_OF_IMA
+int of_is_ima_memory_reserved(void);
+int of_remove_ima_buffer(void);
+int of_get_ima_buffer(void **addr, size_t *size);
+size_t of_ima_get_size_allocated(void);
+int of_ima_write_buffer(void *ima_buffer,
+ size_t ima_buffer_size);
+#else
+static inline int of_is_ima_memory_reserved(void)
+{
+ return -ENOTSUPP;
+};
+static inline int of_remove_ima_buffer(void)
+{
+ return -ENOTSUPP;
+};
+
+static inline int of_get_ima_buffer(void **addr, size_t *size)
+{
+ return -ENOTSUPP;
+};
+
+static inline size_t of_ima_get_size_allocated(void)
+{
+ return 0;
+};
+
+static inline int of_ima_write_buffer(void *ima_buffer,
+ size_t ima_buffer_size)
+{
+ return -ENOTSUPP;
+};
+#endif
+
#endif /* _LINUX_OF_H */
--
2.25.1
^ permalink raw reply related
* Re: [PATCH 1/3] ASoC: fsl_esai: introduce SoC specific data
From: Nicolin Chen @ 2020-05-04 21:39 UTC (permalink / raw)
To: Shengjiu Wang
Cc: devicetree, alsa-devel, timur, Xiubo.Lee, lgirdwood, linuxppc-dev,
tiwai, robh+dt, perex, broonie, festevam, linux-kernel
In-Reply-To: <27af074e47bf2b81e2dce67ea66a9f7301dfcb07.1588320656.git.shengjiu.wang@nxp.com>
On Fri, May 01, 2020 at 04:12:04PM +0800, Shengjiu Wang wrote:
> Introduce a SoC specific data structure which contains the
> differences between the different SoCs.
> This makes it easier to support more differences without having
> to introduce a new if/else each time.
>
> Signed-off-by: Shengjiu Wang <shengjiu.wang@nxp.com>
Though the 2nd patch is having comments to address, this one
looks fine to me and should be able to merge as long as Mark
is okay with this too:
Acked-by: Nicolin Chen <nicoleotsuka@gmail.com>
^ permalink raw reply
* Re: [PATCH v2 0/5] Statsfs: a new ram-based file sytem for Linux kernel statistics
From: David Rientjes @ 2020-05-04 21:37 UTC (permalink / raw)
To: Emanuele Giuseppe Esposito, Jonathan Adams
Cc: linux-s390, kvm, David Hildenbrand, Cornelia Huck,
Emanuele Giuseppe Esposito, linux-kernel, kvm-ppc, linux-mips,
Christian Borntraeger, Alexander Viro, linux-fsdevel,
Paolo Bonzini, Vitaly Kuznetsov, linuxppc-dev, Jim Mattson
In-Reply-To: <20200504110344.17560-1-eesposit@redhat.com>
[-- Attachment #1: Type: text/plain, Size: 8664 bytes --]
On Mon, 4 May 2020, Emanuele Giuseppe Esposito wrote:
> There is currently no common way for Linux kernel subsystems to expose
> statistics to userspace shared throughout the Linux kernel; subsystems
> have to take care of gathering and displaying statistics by themselves,
> for example in the form of files in debugfs. For example KVM has its own
> code section that takes care of this in virt/kvm/kvm_main.c, where it sets
> up debugfs handlers for displaying values and aggregating them from
> various subfolders to obtain information about the system state (i.e.
> displaying the total number of exits, calculated by summing all exits of
> all cpus of all running virtual machines).
>
> Allowing each section of the kernel to do so has two disadvantages. First,
> it will introduce redundant code. Second, debugfs is anyway not the right
> place for statistics (for example it is affected by lockdown)
>
> In this patch series I introduce statsfs, a synthetic ram-based virtual
> filesystem that takes care of gathering and displaying statistics for the
> Linux kernel subsystems.
>
This is exciting, we have been looking in the same area recently. Adding
Jonathan Adams <jwadams@google.com>.
In your diffstat, one thing I notice that is omitted: an update to
Documentation/* :) Any chance of getting some proposed Documentation/
updates with structure of the fs, the per subsystem breakdown, and best
practices for managing the stats from the kernel level?
> The file system is mounted on /sys/kernel/stats and would be already used
> by kvm. Statsfs was initially introduced by Paolo Bonzini [1].
>
> Statsfs offers a generic and stable API, allowing any kind of
> directory/file organization and supporting multiple kind of aggregations
> (not only sum, but also average, max, min and count_zero) and data types
> (all unsigned and signed types plus boolean). The implementation, which is
> a generalization of KVM’s debugfs statistics code, takes care of gathering
> and displaying information at run time; users only need to specify the
> values to be included in each source.
>
> Statsfs would also be a different mountpoint from debugfs, and would not
> suffer from limited access due to the security lock down patches. Its main
> function is to display each statistics as a file in the desired folder
> hierarchy defined through the API. Statsfs files can be read, and possibly
> cleared if their file mode allows it.
>
> Statsfs has two main components: the public API defined by
> include/linux/statsfs.h, and the virtual file system which should end up
> in /sys/kernel/stats.
>
> The API has two main elements, values and sources. Kernel subsystems like
> KVM can use the API to create a source, add child
> sources/values/aggregates and register it to the root source (that on the
> virtual fs would be /sys/kernel/statsfs).
>
> Sources are created via statsfs_source_create(), and each source becomes a
> directory in the file system. Sources form a parent-child relationship;
> root sources are added to the file system via statsfs_source_register().
> Every other source is added to or removed from a parent through the
> statsfs_source_add_subordinate and statsfs_source_remote_subordinate APIs.
> Once a source is created and added to the tree (via add_subordinate), it
> will be used to compute aggregate values in the parent source.
>
> Values represent quantites that are gathered by the statsfs user. Examples
> of values include the number of vm exits of a given kind, the amount of
> memory used by some data structure, the length of the longest hash table
> chain, or anything like that. Values are defined with the
> statsfs_source_add_values function. Each value is defined by a struct
> statsfs_value; the same statsfs_value can be added to many different
> sources. A value can be considered "simple" if it fetches data from a
> user-provided location, or "aggregate" if it groups all values in the
> subordinates sources that include the same statsfs_value.
>
This seems like it could have a lot of overhead if we wanted to
periodically track the totality of subsystem stats as a form of telemetry
gathering from userspace. To collect telemetry for 1,000 different stats,
do we need to issue lseek()+read() syscalls for each of them individually
(or, worse, open()+read()+close())?
Any thoughts on how that can be optimized? A couple of ideas:
- an interface that allows gathering of all stats for a particular
interface through a single file that would likely be encoded in binary
and the responsibility of userspace to disseminate, or
- an interface that extends beyond this proposal and allows the reader to
specify which stats they are interested in collecting and then the
kernel will only provide these stats in a well formed structure and
also be binary encoded.
We've found that the one-file-per-stat method is pretty much a show
stopper from the performance view and we always must execute at least two
syscalls to obtain a single stat.
Since this is becoming a generic API (good!!), maybe we can discuss
possible ways to optimize gathering of stats in mass?
> For more information, please consult the kerneldoc documentation in patch
> 2 and the sample uses in the kunit tests and in KVM.
>
> This series of patches is based on my previous series "libfs: group and
> simplify linux fs code" and the single patch sent to kvm "kvm_host: unify
> VM_STAT and VCPU_STAT definitions in a single place". The former
> simplifies code duplicated in debugfs and tracefs (from which statsfs is
> based on), the latter groups all macros definition for statistics in kvm
> in a single common file shared by all architectures.
>
> Patch 1 adds a new refcount and kref destructor wrappers that take a
> semaphore, as those are used later by statsfs. Patch 2 introduces the
> statsfs API, patch 3 provides extensive tests that can also be used as
> example on how to use the API and patch 4 adds the file system support.
> Finally, patch 5 provides a real-life example of statsfs usage in KVM.
>
> [1] https://lore.kernel.org/kvm/5d6cdcb1-d8ad-7ae6-7351-3544e2fa366d@redhat.com/?fbclid=IwAR18LHJ0PBcXcDaLzILFhHsl3qpT3z2vlG60RnqgbpGYhDv7L43n0ZXJY8M
>
> Signed-off-by: Emanuele Giuseppe Esposito <eesposit@redhat.com>
>
> v1->v2 remove unnecessary list_foreach_safe loops, fix wrong indentation,
> change statsfs in stats_fs
>
> Emanuele Giuseppe Esposito (5):
> refcount, kref: add dec-and-test wrappers for rw_semaphores
> stats_fs API: create, add and remove stats_fs sources and values
> kunit: tests for stats_fs API
> stats_fs fs: virtual fs to show stats to the end-user
> kvm_main: replace debugfs with stats_fs
>
> MAINTAINERS | 7 +
> arch/arm64/kvm/Kconfig | 1 +
> arch/arm64/kvm/guest.c | 2 +-
> arch/mips/kvm/Kconfig | 1 +
> arch/mips/kvm/mips.c | 2 +-
> arch/powerpc/kvm/Kconfig | 1 +
> arch/powerpc/kvm/book3s.c | 6 +-
> arch/powerpc/kvm/booke.c | 8 +-
> arch/s390/kvm/Kconfig | 1 +
> arch/s390/kvm/kvm-s390.c | 16 +-
> arch/x86/include/asm/kvm_host.h | 2 +-
> arch/x86/kvm/Kconfig | 1 +
> arch/x86/kvm/Makefile | 2 +-
> arch/x86/kvm/debugfs.c | 64 --
> arch/x86/kvm/stats_fs.c | 56 ++
> arch/x86/kvm/x86.c | 6 +-
> fs/Kconfig | 12 +
> fs/Makefile | 1 +
> fs/stats_fs/Makefile | 6 +
> fs/stats_fs/inode.c | 337 ++++++++++
> fs/stats_fs/internal.h | 35 +
> fs/stats_fs/stats_fs-tests.c | 1088 +++++++++++++++++++++++++++++++
> fs/stats_fs/stats_fs.c | 773 ++++++++++++++++++++++
> include/linux/kref.h | 11 +
> include/linux/kvm_host.h | 39 +-
> include/linux/refcount.h | 2 +
> include/linux/stats_fs.h | 304 +++++++++
> include/uapi/linux/magic.h | 1 +
> lib/refcount.c | 32 +
> tools/lib/api/fs/fs.c | 21 +
> virt/kvm/arm/arm.c | 2 +-
> virt/kvm/kvm_main.c | 314 ++-------
> 32 files changed, 2772 insertions(+), 382 deletions(-)
> delete mode 100644 arch/x86/kvm/debugfs.c
> create mode 100644 arch/x86/kvm/stats_fs.c
> create mode 100644 fs/stats_fs/Makefile
> create mode 100644 fs/stats_fs/inode.c
> create mode 100644 fs/stats_fs/internal.h
> create mode 100644 fs/stats_fs/stats_fs-tests.c
> create mode 100644 fs/stats_fs/stats_fs.c
> create mode 100644 include/linux/stats_fs.h
>
> --
> 2.25.2
>
>
^ permalink raw reply
* Re: [PATCH V2 00/11] Subject: Remove duplicated kmap code
From: Al Viro @ 2020-05-04 21:02 UTC (permalink / raw)
To: Ira Weiny
Cc: Peter Zijlstra, Dave Hansen, dri-devel, linux-mips,
James E.J. Bottomley, Max Filippov, Huang Rui, Paul Mackerras,
H. Peter Anvin, sparclinux, Dan Williams, Helge Deller, x86,
linux-csky, Ingo Molnar, linux-snps-arc, linux-xtensa,
Borislav Petkov, Andy Lutomirski, Thomas Gleixner,
linux-arm-kernel, Chris Zankel, Thomas Bogendoerfer, linux-parisc,
linux-kernel, Christian Koenig, Andrew Morton, linuxppc-dev,
David S. Miller
In-Reply-To: <20200504201740.GA985739@iweiny-DESK2.sc.intel.com>
On Mon, May 04, 2020 at 01:17:41PM -0700, Ira Weiny wrote:
> > || * arm: much, much worse. We have several files that pull linux/highmem.h:
> > || arch/arm/mm/cache-feroceon-l2.c, arch/arm/mm/cache-xsc3l2.c,
> > || arch/arm/mm/copypage-*.c, arch/arm/mm/dma-mapping.c, arch/arm/mm/flush.c,
> > || arch/arm/mm/highmem.c, arch/arm/probes/uprobes/core.c,
> > || arch/arm/include/asm/kvm_mmu.h (kmap_atomic_pfn()).
> > || Those are fine, but we also have this:
> > || arch/arm/include/asm/pgtable.h:200:#define __pte_map(pmd) (pte_t *)kmap_atomic(pmd_page(*(pmd)))
> > || arch/arm/include/asm/pgtable.h:208:#define pte_offset_map(pmd,addr) (__pte_map(pmd) + pte_index(addr))
> > || and sure as hell, asm/pgtable.h does *NOT* pull linux/highmem.h.
>
> It does not pull asm/highmem.h either...
No, but the users of those macros need to be considered.
> > || #define pte_offset_map(dir, addr) \
> > || ((pte_t *) kmap_atomic(pmd_page(*(dir))) + pte_index(addr))
> > || One pte_offset_map user in arch/microblaze:
> > || arch/microblaze/kernel/signal.c:207: ptep = pte_offset_map(pmdp, address);
> > || Messy, but doesn't require any changes (we have asm/pgalloc.h included
> > || there, and that pull linux/highmem.h).
>
> AFAICS asm/pgtable.h does not include asm/highmem.h here...
>
> So looks like arch/microblaze/kernel/signal.c will need linux/highmem.h
See above - line 39 in there is
#include <asm/pgalloc.h>
and line 14 in arch/microblaze/include/asm/pgalloc.h is
#include <linux/highmem.h>
It's conditional upon CONFIG_MMU in there, but so's the use of
pte_offset_map() in arch/microblaze/kernel/signal.c
So it shouldn't be a problem.
> > || * xtensa: users in arch/xtensa/kernel/pci-dma.c, arch/xtensa/mm/highmem.c,
> > || arch/xtensa/mm/cache.c and arch/xtensa/platforms/iss/simdisk.c (all pull
> > || linux/highmem.h).
>
> Actually
>
> arch/xtensa/mm/cache.c gets linux/highmem.h from linux/pagemap.h
>
> arch/xtensa/platforms/iss/simdisk.c may have an issue?
> linux/blkdev.h -> CONFIG_BLOCK -> linux/pagemap.h -> linux/highmem.h
> But simdisk.c requires BLK_DEV_SIMDISK -> CONFIG_BLOCK...
> <sigh>
Yep - see above re major chain of indirect includes conditional upon CONFIG_BLOCK
and its uses in places that only build with such configs. There's a plenty of
similar considerations outside of arch/*, unfortunately...
^ permalink raw reply
* Re: [PATCH V2 11/11] drm: Remove drm specific kmap_atomic code
From: Ira Weiny @ 2020-05-04 20:24 UTC (permalink / raw)
To: Daniel Vetter
Cc: Peter Zijlstra, Dave Hansen, dri-devel, linux-mips,
James E.J. Bottomley, Max Filippov, Huang Rui, Paul Mackerras,
H. Peter Anvin, sparclinux, Christoph Hellwig, Helge Deller,
X86 ML, linux-csky, Ingo Molnar, arcml, linux-xtensa,
Dan Williams, Borislav Petkov, Andy Lutomirski, Thomas Gleixner,
Linux ARM, Chris Zankel, Thomas Bogendoerfer, linux-parisc,
Linux Kernel Mailing List, David S. Miller, Andrew Morton,
linuxppc-dev, Christian Koenig
In-Reply-To: <CAKMK7uF4fd3upBYSQEzs==Nx7osn=wZPnxoKLKm9HTxwU_sZ+w@mail.gmail.com>
On Mon, May 04, 2020 at 01:18:51PM +0200, Daniel Vetter wrote:
> On Mon, May 4, 2020 at 3:09 AM <ira.weiny@intel.com> wrote:
> >
> > From: Ira Weiny <ira.weiny@intel.com>
> >
> > kmap_atomic_prot() is now exported by all architectures. Use this
> > function rather than open coding a driver specific kmap_atomic.
> >
> > Reviewed-by: Christian König <christian.koenig@amd.com>
> > Reviewed-by: Christoph Hellwig <hch@lst.de>
> > Signed-off-by: Ira Weiny <ira.weiny@intel.com>
>
> I'm assuming this lands through some other tree or a topic branch or whatever.
Yes I think Andrew queued this up before and so I hope he will continue to do
so with the subsequent versions.
Andrew, LMK if this is an issue.
Thanks,
Ira
>
> Acked-by: Daniel Vetter <daniel.vetter@ffwll.ch>
>
> Cheers, Daniel
>
> > ---
> > drivers/gpu/drm/ttm/ttm_bo_util.c | 56 ++--------------------------
> > drivers/gpu/drm/vmwgfx/vmwgfx_blit.c | 16 ++++----
> > include/drm/ttm/ttm_bo_api.h | 4 --
> > 3 files changed, 12 insertions(+), 64 deletions(-)
> >
> > diff --git a/drivers/gpu/drm/ttm/ttm_bo_util.c b/drivers/gpu/drm/ttm/ttm_bo_util.c
> > index 52d2b71f1588..f09b096ba4fd 100644
> > --- a/drivers/gpu/drm/ttm/ttm_bo_util.c
> > +++ b/drivers/gpu/drm/ttm/ttm_bo_util.c
> > @@ -257,54 +257,6 @@ static int ttm_copy_io_page(void *dst, void *src, unsigned long page)
> > return 0;
> > }
> >
> > -#ifdef CONFIG_X86
> > -#define __ttm_kmap_atomic_prot(__page, __prot) kmap_atomic_prot(__page, __prot)
> > -#define __ttm_kunmap_atomic(__addr) kunmap_atomic(__addr)
> > -#else
> > -#define __ttm_kmap_atomic_prot(__page, __prot) vmap(&__page, 1, 0, __prot)
> > -#define __ttm_kunmap_atomic(__addr) vunmap(__addr)
> > -#endif
> > -
> > -
> > -/**
> > - * ttm_kmap_atomic_prot - Efficient kernel map of a single page with
> > - * specified page protection.
> > - *
> > - * @page: The page to map.
> > - * @prot: The page protection.
> > - *
> > - * This function maps a TTM page using the kmap_atomic api if available,
> > - * otherwise falls back to vmap. The user must make sure that the
> > - * specified page does not have an aliased mapping with a different caching
> > - * policy unless the architecture explicitly allows it. Also mapping and
> > - * unmapping using this api must be correctly nested. Unmapping should
> > - * occur in the reverse order of mapping.
> > - */
> > -void *ttm_kmap_atomic_prot(struct page *page, pgprot_t prot)
> > -{
> > - if (pgprot_val(prot) == pgprot_val(PAGE_KERNEL))
> > - return kmap_atomic(page);
> > - else
> > - return __ttm_kmap_atomic_prot(page, prot);
> > -}
> > -EXPORT_SYMBOL(ttm_kmap_atomic_prot);
> > -
> > -/**
> > - * ttm_kunmap_atomic_prot - Unmap a page that was mapped using
> > - * ttm_kmap_atomic_prot.
> > - *
> > - * @addr: The virtual address from the map.
> > - * @prot: The page protection.
> > - */
> > -void ttm_kunmap_atomic_prot(void *addr, pgprot_t prot)
> > -{
> > - if (pgprot_val(prot) == pgprot_val(PAGE_KERNEL))
> > - kunmap_atomic(addr);
> > - else
> > - __ttm_kunmap_atomic(addr);
> > -}
> > -EXPORT_SYMBOL(ttm_kunmap_atomic_prot);
> > -
> > static int ttm_copy_io_ttm_page(struct ttm_tt *ttm, void *src,
> > unsigned long page,
> > pgprot_t prot)
> > @@ -316,13 +268,13 @@ static int ttm_copy_io_ttm_page(struct ttm_tt *ttm, void *src,
> > return -ENOMEM;
> >
> > src = (void *)((unsigned long)src + (page << PAGE_SHIFT));
> > - dst = ttm_kmap_atomic_prot(d, prot);
> > + dst = kmap_atomic_prot(d, prot);
> > if (!dst)
> > return -ENOMEM;
> >
> > memcpy_fromio(dst, src, PAGE_SIZE);
> >
> > - ttm_kunmap_atomic_prot(dst, prot);
> > + kunmap_atomic(dst);
> >
> > return 0;
> > }
> > @@ -338,13 +290,13 @@ static int ttm_copy_ttm_io_page(struct ttm_tt *ttm, void *dst,
> > return -ENOMEM;
> >
> > dst = (void *)((unsigned long)dst + (page << PAGE_SHIFT));
> > - src = ttm_kmap_atomic_prot(s, prot);
> > + src = kmap_atomic_prot(s, prot);
> > if (!src)
> > return -ENOMEM;
> >
> > memcpy_toio(dst, src, PAGE_SIZE);
> >
> > - ttm_kunmap_atomic_prot(src, prot);
> > + kunmap_atomic(src);
> >
> > return 0;
> > }
> > diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_blit.c b/drivers/gpu/drm/vmwgfx/vmwgfx_blit.c
> > index bb46ca0c458f..94d456a1d1a9 100644
> > --- a/drivers/gpu/drm/vmwgfx/vmwgfx_blit.c
> > +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_blit.c
> > @@ -374,12 +374,12 @@ static int vmw_bo_cpu_blit_line(struct vmw_bo_blit_line_data *d,
> > copy_size = min_t(u32, copy_size, PAGE_SIZE - src_page_offset);
> >
> > if (unmap_src) {
> > - ttm_kunmap_atomic_prot(d->src_addr, d->src_prot);
> > + kunmap_atomic(d->src_addr);
> > d->src_addr = NULL;
> > }
> >
> > if (unmap_dst) {
> > - ttm_kunmap_atomic_prot(d->dst_addr, d->dst_prot);
> > + kunmap_atomic(d->dst_addr);
> > d->dst_addr = NULL;
> > }
> >
> > @@ -388,8 +388,8 @@ static int vmw_bo_cpu_blit_line(struct vmw_bo_blit_line_data *d,
> > return -EINVAL;
> >
> > d->dst_addr =
> > - ttm_kmap_atomic_prot(d->dst_pages[dst_page],
> > - d->dst_prot);
> > + kmap_atomic_prot(d->dst_pages[dst_page],
> > + d->dst_prot);
> > if (!d->dst_addr)
> > return -ENOMEM;
> >
> > @@ -401,8 +401,8 @@ static int vmw_bo_cpu_blit_line(struct vmw_bo_blit_line_data *d,
> > return -EINVAL;
> >
> > d->src_addr =
> > - ttm_kmap_atomic_prot(d->src_pages[src_page],
> > - d->src_prot);
> > + kmap_atomic_prot(d->src_pages[src_page],
> > + d->src_prot);
> > if (!d->src_addr)
> > return -ENOMEM;
> >
> > @@ -499,9 +499,9 @@ int vmw_bo_cpu_blit(struct ttm_buffer_object *dst,
> > }
> > out:
> > if (d.src_addr)
> > - ttm_kunmap_atomic_prot(d.src_addr, d.src_prot);
> > + kunmap_atomic(d.src_addr);
> > if (d.dst_addr)
> > - ttm_kunmap_atomic_prot(d.dst_addr, d.dst_prot);
> > + kunmap_atomic(d.dst_addr);
> >
> > return ret;
> > }
> > diff --git a/include/drm/ttm/ttm_bo_api.h b/include/drm/ttm/ttm_bo_api.h
> > index 0a9d042e075a..de1ccdcd5703 100644
> > --- a/include/drm/ttm/ttm_bo_api.h
> > +++ b/include/drm/ttm/ttm_bo_api.h
> > @@ -668,10 +668,6 @@ int ttm_bo_mmap_obj(struct vm_area_struct *vma, struct ttm_buffer_object *bo);
> > int ttm_bo_mmap(struct file *filp, struct vm_area_struct *vma,
> > struct ttm_bo_device *bdev);
> >
> > -void *ttm_kmap_atomic_prot(struct page *page, pgprot_t prot);
> > -
> > -void ttm_kunmap_atomic_prot(void *addr, pgprot_t prot);
> > -
> > /**
> > * ttm_bo_io
> > *
> > --
> > 2.25.1
> >
> > _______________________________________________
> > dri-devel mailing list
> > dri-devel@lists.freedesktop.org
> > https://lists.freedesktop.org/mailman/listinfo/dri-devel
>
>
>
> --
> Daniel Vetter
> Software Engineer, Intel Corporation
> +41 (0) 79 365 57 48 - http://blog.ffwll.ch
^ permalink raw reply
* Re: [PATCH V2 00/11] Subject: Remove duplicated kmap code
From: Ira Weiny @ 2020-05-04 20:17 UTC (permalink / raw)
To: Al Viro
Cc: Peter Zijlstra, Dave Hansen, dri-devel, linux-mips,
James E.J. Bottomley, Max Filippov, Huang Rui, Paul Mackerras,
H. Peter Anvin, sparclinux, Dan Williams, Helge Deller, x86,
linux-csky, Ingo Molnar, linux-snps-arc, linux-xtensa,
Borislav Petkov, Andy Lutomirski, Thomas Gleixner,
linux-arm-kernel, Chris Zankel, Thomas Bogendoerfer, linux-parisc,
linux-kernel, Christian Koenig, Andrew Morton, linuxppc-dev,
David S. Miller
In-Reply-To: <20200504053357.GV23230@ZenIV.linux.org.uk>
On Mon, May 04, 2020 at 06:33:57AM +0100, Al Viro wrote:
> On Sun, May 03, 2020 at 10:04:47PM -0700, Ira Weiny wrote:
>
> > Grepping for 'asm/highmem.h' and investigations don't reveal any issues... But
> > you do have me worried. That said 0-day has been crunching on multiple
> > versions of this series without issues such as this (save the mips issue
> > above).
> >
> > I have to say it would be nice if the relation between linux/highmem.h and
> > asm/highmem.h was more straightforward.
>
> IIRC, the headache was due to asm/pgtable.h on several architectures and
> asm/cacheflush.h on parisc.
>
> <digs the notes out>
>
> || IOW, there's one in linux/highmem.h (conditional on !CONFIG_HIGHMEM,
> || !ARCH_HAS_KMAP) and several per-architecture variants, usually declared in
> || their asm/highmem.h. In three of those (microblaze, parisc and powerpc) these
> || are inlines (parisc one identical to linux/highmem.h, lives in asm/cacheflush.h,
> || powerpc and microblaze ones calling kmap_atomic_prot() which is defined in
> || arch/$ARCH/mm/highmem.c).
> ||
> || parisc case is weird - essentially, they want to call
> || flush_kernel_dcache_page_addr() at the places where kunmap/kunmap_atomic
> || is done. And they do so despite not selecting HIGHMEM, with definitions
> || in usual place. They do have ARCH_HAS_KMAP defined, which prevents
> || getting buggered in linux/highmem.h. ARCH_HAS_KMAP is parisc-unique,
> || BTW, and checked only in linux/highmem.h.
> ||
> || All genuine arch-specific variants are defined in (or call functions
> || defined in) translation units that are only included CONFIG_HIGHMEM builds.
I agree with this statement. But IMO additional confusion is caused by the
fact that some arch's condition the declarations on CONFIG_HIGHMEM within
asm/highmem.h (arc, arm, nds32) while others depend on linux/highmem.h (and
elsewhere) to do so (csky, microblaze, mips, powerpc, sparc, x86, xtensa).
Why?
I think (perhaps naive) over time asm/highmem.h needs to be isolated to being
included in linux/highmem.h. But as you point out below that is not so easy.
I think that this series helps toward that goal.
> ||
> || It would be tempting to consolidate those, e.g. by adding __kmap_atomic()
> || and __kmap_atomic_prot() without that boilerplate, with universal kmap_atomic()
> || and kmap_atomic_prot() in linux/highmem.h. Potential problem with that would
> || be something that pulls ash/highmem.h (or asm/cacheflush.h in case of parisc)
> || directly and uses kmap_atomic/kmap_atomic_prot. There's not a lot places
> || pulling asm/highmem.h, and many of them are not even in includes:
> ||
> || arch/arm/include/asm/efi.h:13:#include <asm/highmem.h>
> || arch/arm/mm/dma-mapping.c:31:#include <asm/highmem.h>
> || arch/arm/mm/flush.c:14:#include <asm/highmem.h>
> || arch/arm/mm/mmu.c:27:#include <asm/highmem.h>
> || arch/mips/include/asm/pgtable-32.h:22:#include <asm/highmem.h>
> || arch/mips/mm/cache.c:19:#include <asm/highmem.h>
> || arch/mips/mm/fault.c:28:#include <asm/highmem.h> /* For VMALLOC_END */
> || arch/nds32/include/asm/pgtable.h:60:#include <asm/highmem.h>
> || arch/x86/kernel/setup_percpu.c:20:#include <asm/highmem.h>
> || include/linux/highmem.h:35:#include <asm/highmem.h>
> ||
> || Users of asm/cacheflush.h are rather more numerous; however, anything
> || outside of parisc-specific code has to pull linux/highmem.h, or it won't see
> || the definitions of kmap_atomic/kmap_atomic_prot at all. arch/parisc itself
> || has no callers of those.
> ||
> || Outside of arch/* there is a plenty of callers. However, the following is
> || true: all instances of kmap_atomic or kmap_atomic_prot outside of arch/*
> || are either inside the linux/highmem.h or are preceded by include of
> || linux/highmem.h on any build that sees them (there is a common include
> || chain that is conditional upon CONFIG_BLOCK, but it's only needed in
> || drivers that are BLOCK-dependent). It was not fun to verify, to put
> || it mildly...
> ||
> || So for parisc we have no problem - leaving __kmap_atomic()/__kmap_atomic_prot()
> || in asm/cachefile.h and adding universal wrappers in linux/highmem.h will be
> || fine. For other architectures the things might be trickier.
And the follow up series removes kmap_* from asm/cachefile.h in parisc which
should be cleaner going forward.
> ||
> || * arc: all users in arch/arc/ are within arch/arc/mm/{cache,highmem}.c;
> || both pull linux/highmem.h. We are fine.
Still fine.
> ||
> || * arm: much, much worse. We have several files that pull linux/highmem.h:
> || arch/arm/mm/cache-feroceon-l2.c, arch/arm/mm/cache-xsc3l2.c,
> || arch/arm/mm/copypage-*.c, arch/arm/mm/dma-mapping.c, arch/arm/mm/flush.c,
> || arch/arm/mm/highmem.c, arch/arm/probes/uprobes/core.c,
> || arch/arm/include/asm/kvm_mmu.h (kmap_atomic_pfn()).
> || Those are fine, but we also have this:
> || arch/arm/include/asm/pgtable.h:200:#define __pte_map(pmd) (pte_t *)kmap_atomic(pmd_page(*(pmd)))
> || arch/arm/include/asm/pgtable.h:208:#define pte_offset_map(pmd,addr) (__pte_map(pmd) + pte_index(addr))
> || and sure as hell, asm/pgtable.h does *NOT* pull linux/highmem.h.
It does not pull asm/highmem.h either...
> ||
> || Fortunately, the users of pte_offset_map() (__pte_map() has no other users)
> || are few, both in arch/arm and outside of arch. All arm ones are pulling
> || linux/highmem (arch/arm/mm/{pgd,fault*}.c). Outside of arch we have several
> || that pull highmem.h (by way of rmap.h or pagemap.h, usually):
> || fs/userfaultfd.c, mm/gup.c, mm/hmm.c, mm/huge_memory.c,
> || mm/khugepaged.c, mm/memory-failure.c, mm/memory.c, mm/migrate.c,
> || mm/mremap.c, mm/page_vma_mapped.c, mm/swap_state.c, mm/swapfile.c,
> || mm/vmalloc.c
> || and then there are these in linux/mm.h:
> ||
> || #define pte_offset_map_lock(mm, pmd, address, ptlp) \
> || ({ \
> || spinlock_t *__ptl = pte_lockptr(mm, pmd); \
> || pte_t *__pte = pte_offset_map(pmd, address); \
> || *(ptlp) = __ptl; \
> || spin_lock(__ptl); \
> || __pte; \
> || })
> || #define pte_alloc_map(mm, pmd, address) \
> || (pte_alloc(mm, pmd) ? NULL : pte_offset_map(pmd, address))
> || #define pte_alloc_map_lock(mm, pmd, address, ptlp) \
> || (pte_alloc(mm, pmd) ? \
> || NULL : pte_offset_map_lock(mm, pmd, address, ptlp))
> ||
> || These have two users in arch/arm (arch/arm/mm/pgd.c and
> || arch/arm/lib/uaccess_with_memcpy.c, both pulling highmem.h). Outside of
> || arch there are several new files (plus a lot of what we'd already seen
> || in mm/*.c, unsurprisingly):
> || fs/proc/task_mmu.c, mm/ksm.c, mm/madvise.c, mm/memcontrol.c,
> || mm/mempolicy.c, mm/mincore.c, mm/mprotect.c, mm/pagewalk.c,
> || mm/shmem.c, mm/userfaultfd.c,
> || all pulling linux/highmem.h, as pretty much all core VM does. So we are
> || still fine.
This all seems the same now.
> ||
> || * csky: users in arch/csky/abiv2/cacheflush.c, arch/csky/mm/dma-mapping.c,
> || arch/csky/mm/highmem.c, all pulling linux/highmem.h
Yes still are.
> ||
> || * microblaze: users in arch/microblaze/mm/highmem.c (pulls linux/highmem.h) and,
> || arch/microblaze/include/asm/pgtable.h, this:
> || #define pte_offset_map(dir, addr) \
> || ((pte_t *) kmap_atomic(pmd_page(*(dir))) + pte_index(addr))
> || One pte_offset_map user in arch/microblaze:
> || arch/microblaze/kernel/signal.c:207: ptep = pte_offset_map(pmdp, address);
> || Messy, but doesn't require any changes (we have asm/pgalloc.h included
> || there, and that pull linux/highmem.h).
AFAICS asm/pgtable.h does not include asm/highmem.h here...
So looks like arch/microblaze/kernel/signal.c will need linux/highmem.h
> || Outside of arch we'd already sorted it out when looking at arm.
> ||
> || * mips: users in arch/mips/kernel/crash_dump.c, arch/mips/kernel/uprobes.c,
> || arch/mips/mm/c-r4k.c, arch/mips/mm/dma-noncoherent.c, arch/mips/mm/highmem.c,
> || and arch/mips/mm/init.c (all pulling linux/highmem.h) plus this
> || arch/mips/mm/cache.c, which relies upon asm/highmem.h. This can be switched
> || to linux/highmem.h. On !CONFIG_HIGHMEM builds the call of kmap_atomic() in
> || there is eliminated, since it's conditional upon PageHighMem(). IOW, even
> || though we get a call of (inexistent) out-of-line version, it's not going to
> || survive into object file. With linux/highmem.h use it will be an equally
> || eliminated call of inlined version.
> || XXX: arch/mips/mm/cache.c
Fixed as part of this series.
> ||
> || * nds32: users in arch/nds32/kernel/dma.c, arch/nds32/mm/cacheflush.c and
> || arch/nds32/mm/highmem.c, all pulling linux/highmem.h
Still looks ok.
> ||
> || * powerpc: users in arch/powerpc/kvm/book3s_pr.c,
> || arch/powerpc/kvm/e500_mmu_host.c, arch/powerpc/mm/dma-noncoherent.c,
> || arch/powerpc/mm/highmem.c and arch/powerpc/mm/mem.c, all pulling
> || linux/highmem.h,
still good
> a user in arch/powerpc/mm/hugetlbpage.c pulling it
> || via asm/tlb.h -> linux/pagemap.h -> linux/highmem.h
good
> and
> || macros for pte_offset_map in arch/powerpc/include/asm/*/32/pgtable.h.
> || Users of that within arch/powerpc are either 64bit-only or
> || pull linux/highmem.h (arch/powerpc/mm/pgtable_32.c and
> || arch/powerpc/xmon/xmon.c).
>
Looks ok.
> || Users outside of arch - same as for arm.
> ||
> || * sparc: users in arch/sparc/kernel/uprobes.c and arch/sparc/mm/highmem.c
> || (both pulling linux/highmem.h directly) + arch/sparc/mm/init_64.c pulling
> || it via linux/pagemap.h.
Looks ok.
> Strangely, arch/sparc/mm/io-unit.c and
> || arch/sparc/mm/iommu.c both include linux/highmem.h with odd comment
> || that seems to indicate that once upon a time pte_offset_map() used to
> || requite kmap_atomic() there... Right, it used to - until 2002.
> || These includes are pointless, then...
Looks like it...
I'll throw in a patch for that.
> ||
> || * x86: users in arch/x86/kernel/crash_dump_32.c, arch/x86/kvm/svm.c,
> || arch/x86/lib/usercopy_64.c, arch/x86/mm/highmem_32.c and arch/x86/mm/iomap_32.c,
> || all pulling linux/highmem.h, users in paging_tmpl.h (included from
> || arch/x86/kvm/mmu/mmu.c, which has pulled linux/highmem.h prior to that)
> || and definition of pte_offset_map() (in asm/pgtable_32.h)
> || Users of pte_offset_map() and friends in arch/x86 are in
> || arch/x86/kernel/vm86_32.c and arch/x86/mm/dump_pagetables.c (both
> || pulling linux/highmem.h), in arch/x86/mm/mem_encrypt_identity.c
> || (64bit-only, pte_offset_map() doesn't use kmap_atomic() there) and
> || arch/x86/kernel/tboot.c (pulls linux/highmem.h via asm/pgalloc.h
> || and linux/pagemap.h)
I've built these and they seem fine.
> ||
> || * xtensa: users in arch/xtensa/kernel/pci-dma.c, arch/xtensa/mm/highmem.c,
> || arch/xtensa/mm/cache.c and arch/xtensa/platforms/iss/simdisk.c (all pull
> || linux/highmem.h).
Actually
arch/xtensa/mm/cache.c gets linux/highmem.h from linux/pagemap.h
arch/xtensa/platforms/iss/simdisk.c may have an issue?
linux/blkdev.h -> CONFIG_BLOCK -> linux/pagemap.h -> linux/highmem.h
But simdisk.c requires BLK_DEV_SIMDISK -> CONFIG_BLOCK...
<sigh>
So xtensa still seems good AFAICS.
In summary it looks like the use of kmap_atomic() in pte_offset_map() is a
potential issue in microblaze. I've fixed that in my local tree.
Ira
^ permalink raw reply
* Re: [PATCH v2 17/20] mm: free_area_init: allow defining max_zone_pfn in descending order
From: Mike Rapoport @ 2020-05-04 15:39 UTC (permalink / raw)
To: Guenter Roeck
Cc: Rich Felker, linux-ia64, linux-doc, Catalin Marinas,
Heiko Carstens, x86, Michal Hocko, James E.J. Bottomley,
Max Filippov, Guo Ren, Ley Foon Tan, sparclinux, linux-riscv,
Greg Ungerer, linux-arch, linux-s390, linux-c6x-dev, Baoquan He,
Jonathan Corbet, linux-hexagon, Helge Deller, linux-sh,
Russell King, linux-csky, Mike Rapoport, Geert Uytterhoeven,
Hoan Tran, Mark Salter, Matt Turner, linux-snps-arc,
uclinux-h8-devel, linux-xtensa, Nick Hu, linux-alpha, linux-um,
linux-mips, Richard Weinberger, linux-m68k, Thomas Bogendoerfer,
Qian Cai, Greentime Hu, Paul Walmsley, Stafford Horne,
Guan Xuetao, linux-arm-kernel, Michal Simek, Tony Luck,
Yoshinori Sato, linux-parisc, linux-mm, Vineet Gupta, Brian Cain,
linux-kernel, openrisc, Andrew Morton, linuxppc-dev,
David S. Miller
In-Reply-To: <20200503184300.GA154219@roeck-us.net>
On Sun, May 03, 2020 at 11:43:00AM -0700, Guenter Roeck wrote:
> On Sun, May 03, 2020 at 10:41:38AM -0700, Guenter Roeck wrote:
> > Hi,
> >
> > On Wed, Apr 29, 2020 at 03:11:23PM +0300, Mike Rapoport wrote:
> > > From: Mike Rapoport <rppt@linux.ibm.com>
> > >
> > > Some architectures (e.g. ARC) have the ZONE_HIGHMEM zone below the
> > > ZONE_NORMAL. Allowing free_area_init() parse max_zone_pfn array even it is
> > > sorted in descending order allows using free_area_init() on such
> > > architectures.
> > >
> > > Add top -> down traversal of max_zone_pfn array in free_area_init() and use
> > > the latter in ARC node/zone initialization.
> > >
> > > Signed-off-by: Mike Rapoport <rppt@linux.ibm.com>
> >
> > This patch causes my microblazeel qemu boot test in linux-next to fail.
> > Reverting it fixes the problem.
> >
> The same problem is seen with s390 emulations.
Yeah, this patch breaks some others as well :(
My assumption that max_zone_pfn defines architectural limit for maximal
PFN that can belong to a zone was over-optimistic. Several arches
actually do that, but others do
max_zone_pfn[ZONE_DMA] = MAX_DMA_PFN;
max_zone_pfn[ZONE_NORMAL] = max_pfn;
where MAX_DMA_PFN is build-time constrain and max_pfn is run time limit
for the current system.
So, when max_pfn is lower than MAX_DMA_PFN, the free_init_area() will
consider max_zone_pfn as descending and will wrongly calculate zone
extents.
That said, instead of trying to create a generic way to special case
ARC, I suggest to simply use the below patch instead.
diff --git a/arch/arc/mm/init.c b/arch/arc/mm/init.c
index 41eb9be1653c..386959bac3d2 100644
--- a/arch/arc/mm/init.c
+++ b/arch/arc/mm/init.c
@@ -77,6 +77,11 @@ void __init early_init_dt_add_memory_arch(u64 base, u64 size)
base, TO_MB(size), !in_use ? "Not used":"");
}
+bool arch_has_descending_max_zone_pfns(void)
+{
+ return true;
+}
+
/*
* First memory setup routine called from setup_arch()
* 1. setup swapper's mm @init_mm
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index b990e9734474..114f0e027144 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -7307,6 +7307,15 @@ static void check_for_memory(pg_data_t *pgdat, int nid)
}
}
+/*
+ * Some architecturs, e.g. ARC may have ZONE_HIGHMEM below ZONE_NORMAL. For
+ * such cases we allow max_zone_pfn sorted in the descending order
+ */
+bool __weak arch_has_descending_max_zone_pfns(void)
+{
+ return false;
+}
+
/**
* free_area_init - Initialise all pg_data_t and zone data
* @max_zone_pfn: an array of max PFNs for each zone
@@ -7324,7 +7333,7 @@ void __init free_area_init(unsigned long *max_zone_pfn)
{
unsigned long start_pfn, end_pfn;
int i, nid, zone;
- bool descending = false;
+ bool descending;
/* Record where the zone boundaries are */
memset(arch_zone_lowest_possible_pfn, 0,
@@ -7333,14 +7342,7 @@ void __init free_area_init(unsigned long *max_zone_pfn)
sizeof(arch_zone_highest_possible_pfn));
start_pfn = find_min_pfn_with_active_regions();
-
- /*
- * Some architecturs, e.g. ARC may have ZONE_HIGHMEM below
- * ZONE_NORMAL. For such cases we allow max_zone_pfn sorted in the
- * descending order
- */
- if (MAX_NR_ZONES > 1 && max_zone_pfn[0] > max_zone_pfn[1])
- descending = true;
+ descending = arch_has_descending_max_zone_pfns();
for (i = 0; i < MAX_NR_ZONES; i++) {
if (descending)
> Guenter
>
> > qemu command line:
> >
> > qemu-system-microblazeel -M petalogix-ml605 -m 256 \
> > -kernel arch/microblaze/boot/linux.bin -no-reboot \
> > -initrd rootfs.cpio \
> > -append 'panic=-1 slub_debug=FZPUA rdinit=/sbin/init console=ttyS0,115200' \
> > -monitor none -serial stdio -nographic
> >
> > initrd:
> > https://github.com/groeck/linux-build-test/blob/master/rootfs/microblazeel/rootfs.cpio.gz
> > configuration:
> > https://github.com/groeck/linux-build-test/blob/master/rootfs/microblazeel/qemu_microblazeel_ml605_defconfig
> >
> > Bisect log is below.
> >
> > Guenter
> >
> > ---
> > # bad: [fb9d670f57e3f6478602328bbbf71138be06ca4f] Add linux-next specific files for 20200501
> > # good: [6a8b55ed4056ea5559ebe4f6a4b247f627870d4c] Linux 5.7-rc3
> > git bisect start 'HEAD' 'v5.7-rc3'
> > # good: [068b80b68a670f0b17288c8a3d1ee751f35162ab] Merge remote-tracking branch 'drm/drm-next'
> > git bisect good 068b80b68a670f0b17288c8a3d1ee751f35162ab
> > # good: [46c70fc6a3ac35cd72ddad248dcbe4eee716d2a5] Merge remote-tracking branch 'drivers-x86/for-next'
> > git bisect good 46c70fc6a3ac35cd72ddad248dcbe4eee716d2a5
> > # good: [f39c4ad479a2f005f972a2b941b40efa6b9c9349] Merge remote-tracking branch 'rpmsg/for-next'
> > git bisect good f39c4ad479a2f005f972a2b941b40efa6b9c9349
> > # bad: [165d3ee0162fe28efc2c8180176633e33515df15] ipc-convert-ipcs_idr-to-xarray-update
> > git bisect bad 165d3ee0162fe28efc2c8180176633e33515df15
> > # good: [001f1d211ed2ed0f005838dc4390993930bbbd69] mm: remove early_pfn_in_nid() and CONFIG_NODES_SPAN_OTHER_NODES
> > git bisect good 001f1d211ed2ed0f005838dc4390993930bbbd69
> > # bad: [aaad7401bd32f10c1d591dd886b3a9b9595c6d77] mm/vmsan: fix some typos in comment
> > git bisect bad aaad7401bd32f10c1d591dd886b3a9b9595c6d77
> > # bad: [09f9d0ab1fbed85623b283995aa7a7d78daa1611] khugepaged: allow to collapse PTE-mapped compound pages
> > git bisect bad 09f9d0ab1fbed85623b283995aa7a7d78daa1611
> > # bad: [c942fc8a3e5088407bc32d94f554bab205175f8a] mm/vmstat.c: do not show lowmem reserve protection information of empty zone
> > git bisect bad c942fc8a3e5088407bc32d94f554bab205175f8a
> > # bad: [b29358d269ace3826d8521bea842fc2984cfc11b] mm/page_alloc.c: rename free_pages_check() to check_free_page()
> > git bisect bad b29358d269ace3826d8521bea842fc2984cfc11b
> > # bad: [be0fb591a1f1df20a00c8f023f9ca4891f177b0d] mm: simplify find_min_pfn_with_active_regions()
> > git bisect bad be0fb591a1f1df20a00c8f023f9ca4891f177b0d
> > # bad: [c17422a008d36dcf3e9f51469758c5762716cb0a] mm: rename free_area_init_node() to free_area_init_memoryless_node()
> > git bisect bad c17422a008d36dcf3e9f51469758c5762716cb0a
> > # bad: [51a2f644fd020d5f090044825c388444d11029d5] mm: free_area_init: allow defining max_zone_pfn in descending order
> > git bisect bad 51a2f644fd020d5f090044825c388444d11029d5
> > # first bad commit: [51a2f644fd020d5f090044825c388444d11029d5] mm: free_area_init: allow defining max_zone_pfn in descending order
--
Sincerely yours,
Mike.
^ permalink raw reply related
* Re: [PATCH v2] misc: new driver sram_uapi for user level SRAM access
From: kbuild test robot @ 2020-05-04 14:51 UTC (permalink / raw)
To: Wang Wenhu, gregkh, arnd, linux-kernel, linuxppc-dev, kernel,
robh, Christophe Leroy, Scott Wood, Michael Ellerman,
Randy Dunlap
Cc: robh, kbuild-all, Scott Wood, kernel, Wang Wenhu
In-Reply-To: <20200419123011.86765-1-wenhu.wang@vivo.com>
[-- Attachment #1: Type: text/plain, Size: 2329 bytes --]
Hi Wang,
Thank you for the patch! Perhaps something to improve:
[auto build test WARNING on char-misc/char-misc-testing]
[also build test WARNING on arm-soc/for-next linus/master linux/master v5.7-rc3 next-20200501]
[if your patch is applied to the wrong git tree, please drop us a note to help
improve the system. BTW, we also suggest to use '--base' option to specify the
base tree in git format-patch, please see https://stackoverflow.com/a/37406982]
url: https://github.com/0day-ci/linux/commits/Wang-Wenhu/misc-new-driver-sram_uapi-for-user-level-SRAM-access/20200421-105427
base: https://git.kernel.org/pub/scm/linux/kernel/git/gregkh/char-misc.git ae83d0b416db002fe95601e7f97f64b59514d936
config: microblaze-allyesconfig (attached as .config)
compiler: microblaze-linux-gcc (GCC) 9.3.0
reproduce:
wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
chmod +x ~/bin/make.cross
# save the attached .config to linux build tree
COMPILER_INSTALL_PATH=$HOME/0day GCC_VERSION=9.3.0 make.cross ARCH=microblaze
If you fix the issue, kindly add following tag as appropriate
Reported-by: kbuild test robot <lkp@intel.com>
Note: it may well be a FALSE warning. FWIW you are at least aware of it now.
http://gcc.gnu.org/wiki/Better_Uninitialized_Warnings
All warnings (new ones prefixed by >>):
drivers/misc/sram_uapi.c: In function 'sram_uapi_ioctl':
>> drivers/misc/sram_uapi.c:100:6: warning: 'type' may be used uninitialized in this function [-Wmaybe-uninitialized]
100 | if (cur->type == type) {
| ^
drivers/misc/sram_uapi.c:188:8: note: 'type' was declared here
188 | __u32 type;
| ^~~~
vim +/type +100 drivers/misc/sram_uapi.c
93
94 static struct sram_api *get_sram_api_from_type(__u32 type)
95 {
96 struct sram_api *cur;
97
98 mutex_lock(&sram_list_lock);
99 list_for_each_entry(cur, &sram_list, list) {
> 100 if (cur->type == type) {
101 kref_get(&cur->kref);
102 mutex_unlock(&sram_list_lock);
103 return cur;
104 }
105 }
106 mutex_unlock(&sram_list_lock);
107
108 return NULL;
109 }
110
---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/kbuild-all@lists.01.org
[-- Attachment #2: .config.gz --]
[-- Type: application/gzip, Size: 62734 bytes --]
^ permalink raw reply
* [powerpc:merge] BUILD SUCCESS 1bc92fe3175eb26ff37e580c0383d7a9abe06835
From: kbuild test robot @ 2020-05-04 14:39 UTC (permalink / raw)
To: Michael Ellerman; +Cc: linuxppc-dev
tree/branch: https://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux.git merge
branch HEAD: 1bc92fe3175eb26ff37e580c0383d7a9abe06835 Automatic merge of branches 'master', 'next' and 'fixes' into merge
elapsed time: 5894m
configs tested: 248
configs skipped: 0
The following configs have been built successfully.
More configs may be tested in the coming days.
arm efm32_defconfig
arm at91_dt_defconfig
arm shmobile_defconfig
arm64 defconfig
arm multi_v5_defconfig
arm sunxi_defconfig
arm multi_v7_defconfig
arm exynos_defconfig
arm64 allyesconfig
arm allyesconfig
arm64 allmodconfig
arm allmodconfig
arm64 allnoconfig
arm allnoconfig
sparc allyesconfig
powerpc defconfig
ia64 defconfig
powerpc chrp32_defconfig
c6x evmc6678_defconfig
xtensa common_defconfig
ia64 alldefconfig
nds32 allnoconfig
mips capcella_defconfig
riscv nommu_virt_defconfig
mips allmodconfig
s390 zfcpdump_defconfig
powerpc mpc512x_defconfig
sh sh7785lcr_32bit_defconfig
xtensa iss_defconfig
um defconfig
sh rsk7269_defconfig
mips allnoconfig
csky defconfig
i386 allnoconfig
i386 allyesconfig
i386 alldefconfig
i386 defconfig
i386 debian-10.3
ia64 allmodconfig
ia64 allnoconfig
ia64 generic_defconfig
ia64 tiger_defconfig
ia64 bigsur_defconfig
ia64 allyesconfig
m68k m5475evb_defconfig
m68k allmodconfig
m68k bvme6000_defconfig
m68k sun3_defconfig
m68k multi_defconfig
nios2 3c120_defconfig
nios2 10m50_defconfig
c6x allyesconfig
openrisc simple_smp_defconfig
openrisc or1ksim_defconfig
nds32 defconfig
alpha defconfig
h8300 h8s-sim_defconfig
h8300 edosk2674_defconfig
h8300 h8300h-sim_defconfig
arc defconfig
arc allyesconfig
microblaze mmu_defconfig
microblaze nommu_defconfig
mips fuloong2e_defconfig
mips malta_kvm_defconfig
mips ar7_defconfig
mips allyesconfig
mips 64r6el_defconfig
mips 32r2_defconfig
mips malta_kvm_guest_defconfig
mips tb0287_defconfig
mips ip32_defconfig
mips decstation_64_defconfig
mips loongson3_defconfig
mips ath79_defconfig
mips bcm63xx_defconfig
parisc allnoconfig
parisc generic-64bit_defconfig
parisc generic-32bit_defconfig
parisc allyesconfig
parisc allmodconfig
powerpc holly_defconfig
powerpc ppc64_defconfig
powerpc rhel-kconfig
powerpc allnoconfig
powerpc mpc866_ads_defconfig
powerpc amigaone_defconfig
powerpc adder875_defconfig
powerpc ep8248e_defconfig
powerpc g5_defconfig
parisc randconfig-a001-20200430
mips randconfig-a001-20200430
m68k randconfig-a001-20200430
riscv randconfig-a001-20200430
alpha randconfig-a001-20200430
nds32 randconfig-a001-20200430
m68k randconfig-a001-20200502
mips randconfig-a001-20200502
nds32 randconfig-a001-20200502
alpha randconfig-a001-20200502
parisc randconfig-a001-20200502
riscv randconfig-a001-20200502
microblaze randconfig-a001-20200430
nios2 randconfig-a001-20200430
h8300 randconfig-a001-20200430
c6x randconfig-a001-20200430
sparc64 randconfig-a001-20200430
h8300 randconfig-a001-20200502
nios2 randconfig-a001-20200502
microblaze randconfig-a001-20200502
c6x randconfig-a001-20200502
sparc64 randconfig-a001-20200502
s390 randconfig-a001-20200430
xtensa randconfig-a001-20200430
csky randconfig-a001-20200430
openrisc randconfig-a001-20200430
sh randconfig-a001-20200430
s390 randconfig-a001-20200502
xtensa randconfig-a001-20200502
sh randconfig-a001-20200502
openrisc randconfig-a001-20200502
csky randconfig-a001-20200502
s390 randconfig-a001-20200501
xtensa randconfig-a001-20200501
sh randconfig-a001-20200501
openrisc randconfig-a001-20200501
csky randconfig-a001-20200501
x86_64 randconfig-a003-20200501
x86_64 randconfig-a001-20200501
i386 randconfig-a003-20200501
i386 randconfig-a002-20200501
i386 randconfig-a001-20200501
i386 randconfig-b003-20200501
x86_64 randconfig-b002-20200501
i386 randconfig-b001-20200501
x86_64 randconfig-b003-20200501
x86_64 randconfig-b001-20200501
i386 randconfig-b002-20200501
i386 randconfig-b001-20200430
i386 randconfig-b002-20200430
x86_64 randconfig-b001-20200430
i386 randconfig-b003-20200430
x86_64 randconfig-b002-20200430
x86_64 randconfig-b003-20200430
i386 randconfig-b003-20200502
i386 randconfig-b001-20200502
x86_64 randconfig-b003-20200502
x86_64 randconfig-b001-20200502
i386 randconfig-b002-20200502
x86_64 randconfig-d001-20200501
i386 randconfig-d003-20200501
x86_64 randconfig-d003-20200501
i386 randconfig-d001-20200501
x86_64 randconfig-d002-20200501
i386 randconfig-d002-20200501
x86_64 randconfig-e003-20200503
x86_64 randconfig-e002-20200503
i386 randconfig-e003-20200503
x86_64 randconfig-e001-20200503
i386 randconfig-e002-20200503
i386 randconfig-e001-20200503
x86_64 randconfig-e002-20200430
i386 randconfig-e003-20200430
x86_64 randconfig-e003-20200430
i386 randconfig-e002-20200430
x86_64 randconfig-e001-20200430
i386 randconfig-e001-20200430
i386 randconfig-f003-20200501
x86_64 randconfig-f001-20200501
x86_64 randconfig-f003-20200501
i386 randconfig-f001-20200501
i386 randconfig-f002-20200501
x86_64 randconfig-a003-20200502
x86_64 randconfig-a001-20200502
x86_64 randconfig-a002-20200502
i386 randconfig-a002-20200502
i386 randconfig-a003-20200502
i386 randconfig-a001-20200502
x86_64 randconfig-g003-20200502
i386 randconfig-g003-20200502
i386 randconfig-g002-20200502
x86_64 randconfig-g001-20200502
x86_64 randconfig-g002-20200502
i386 randconfig-g001-20200502
i386 randconfig-g003-20200501
i386 randconfig-g002-20200501
x86_64 randconfig-g002-20200501
i386 randconfig-g001-20200501
i386 randconfig-h002-20200430
i386 randconfig-h003-20200430
x86_64 randconfig-h001-20200430
x86_64 randconfig-h003-20200430
i386 randconfig-h001-20200430
i386 randconfig-h001-20200501
i386 randconfig-h002-20200501
i386 randconfig-h003-20200501
x86_64 randconfig-h001-20200501
x86_64 randconfig-h003-20200501
i386 randconfig-h001-20200502
i386 randconfig-h002-20200502
i386 randconfig-h003-20200502
x86_64 randconfig-h002-20200502
x86_64 randconfig-h001-20200502
x86_64 randconfig-h003-20200502
ia64 randconfig-a001-20200502
arm64 randconfig-a001-20200502
arc randconfig-a001-20200502
powerpc randconfig-a001-20200502
arm randconfig-a001-20200502
sparc randconfig-a001-20200502
ia64 randconfig-a001-20200501
arc randconfig-a001-20200501
powerpc randconfig-a001-20200501
arm randconfig-a001-20200501
sparc randconfig-a001-20200501
sparc randconfig-a001-20200430
arc randconfig-a001-20200430
ia64 randconfig-a001-20200430
powerpc randconfig-a001-20200430
arm randconfig-a001-20200430
riscv allyesconfig
riscv allnoconfig
riscv defconfig
riscv rv32_defconfig
riscv allmodconfig
s390 debug_defconfig
s390 allyesconfig
s390 allnoconfig
s390 allmodconfig
s390 alldefconfig
s390 defconfig
sh allmodconfig
sh titan_defconfig
sh allnoconfig
sparc defconfig
sparc64 defconfig
sparc64 allnoconfig
sparc64 allyesconfig
sparc64 allmodconfig
um x86_64_defconfig
um i386_defconfig
x86_64 rhel
x86_64 rhel-7.6
x86_64 rhel-7.6-kselftests
x86_64 rhel-7.2-clear
x86_64 lkp
x86_64 fedora-25
x86_64 kexec
---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/kbuild-all@lists.01.org
^ permalink raw reply
* [PATCH] powerpc/64/signal: balance return predictor stack in signal trampoline
From: Nicholas Piggin @ 2020-05-04 14:20 UTC (permalink / raw)
To: linuxppc-dev; +Cc: Nicholas Piggin, Alan Modra
Returning from an interrupt or syscall to a signal handler currently
begins execution directly at the handler's entry point, with LR set to
the address of the sigreturn trampoline. When the signal handler
function returns, it runs the trampoline. It looks like this:
# interrupt at user address xyz
# kernel stuff... signal is raised
rfid
# void handler(int sig)
addis 2,12,.TOC.-.LCF0@ha
addi 2,2,.TOC.-.LCF0@l
mflr 0
std 0,16(1)
stdu 1,-96(1)
# handler stuff
ld 0,16(1)
mtlr 0
blr
# __kernel_sigtramp_rt64
addi r1,r1,__SIGNAL_FRAMESIZE
li r0,__NR_rt_sigreturn
sc
# kernel executes rt_sigreturn
rfid
# back to user address xyz
Note the blr with no matching bl. This can corrupt the return predictor.
Solve this by instead resuming execution at the signal trampoline which
then calls the signal handler. qtrace-tools link_stack checker confirms
the entire user/kernel/vdso cycle is balanced after this patch, whereas
it's not upstream.
Alan confirms the dwarf unwind info still looks good. gdb still
recognises the signal frame and can step into parent frames if it break
inside a signal handler.
Performance is pretty noisy, not a very significant change on a POWER9
here, but branch misses are consistently a lot lower on a microbenchmark:
Performance counter stats for './signal':
13,085.72 msec task-clock # 1.000 CPUs utilized
45,024,760,101 cycles # 3.441 GHz
65,102,895,542 instructions # 1.45 insn per cycle
11,271,673,787 branches # 861.372 M/sec
59,468,979 branch-misses # 0.53% of all branches
12,989.09 msec task-clock # 1.000 CPUs utilized
44,692,719,559 cycles # 3.441 GHz
65,109,984,964 instructions # 1.46 insn per cycle
11,282,136,057 branches # 868.585 M/sec
39,786,942 branch-misses # 0.35% of all branches
Cc: Alan Modra <amodra@gmail.com>
Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
arch/powerpc/include/asm/ppc-opcode.h | 1 +
arch/powerpc/kernel/signal_64.c | 20 +++++++++++---------
arch/powerpc/kernel/vdso64/sigtramp.S | 13 +++++--------
3 files changed, 17 insertions(+), 17 deletions(-)
diff --git a/arch/powerpc/include/asm/ppc-opcode.h b/arch/powerpc/include/asm/ppc-opcode.h
index c1df75edde44..747b37f1ce09 100644
--- a/arch/powerpc/include/asm/ppc-opcode.h
+++ b/arch/powerpc/include/asm/ppc-opcode.h
@@ -329,6 +329,7 @@
#define PPC_INST_BLR 0x4e800020
#define PPC_INST_BLRL 0x4e800021
#define PPC_INST_BCTR 0x4e800420
+#define PPC_INST_BCTRL 0x4e800421
#define PPC_INST_MULLD 0x7c0001d2
#define PPC_INST_MULLW 0x7c0001d6
#define PPC_INST_MULHWU 0x7c000016
diff --git a/arch/powerpc/kernel/signal_64.c b/arch/powerpc/kernel/signal_64.c
index adfde59cf4ba..6c17e2456ccc 100644
--- a/arch/powerpc/kernel/signal_64.c
+++ b/arch/powerpc/kernel/signal_64.c
@@ -41,7 +41,7 @@
#define FP_REGS_SIZE sizeof(elf_fpregset_t)
#define TRAMP_TRACEBACK 3
-#define TRAMP_SIZE 6
+#define TRAMP_SIZE 7
/*
* When we have signals to deliver, we set up on the user stack,
@@ -603,13 +603,15 @@ static long setup_trampoline(unsigned int syscall, unsigned int __user *tramp)
int i;
long err = 0;
+ /* bctrl # call the handler */
+ err |= __put_user(PPC_INST_BCTRL, &tramp[0]);
/* addi r1, r1, __SIGNAL_FRAMESIZE # Pop the dummy stackframe */
err |= __put_user(PPC_INST_ADDI | __PPC_RT(R1) | __PPC_RA(R1) |
- (__SIGNAL_FRAMESIZE & 0xffff), &tramp[0]);
+ (__SIGNAL_FRAMESIZE & 0xffff), &tramp[1]);
/* li r0, __NR_[rt_]sigreturn| */
- err |= __put_user(PPC_INST_ADDI | (syscall & 0xffff), &tramp[1]);
+ err |= __put_user(PPC_INST_ADDI | (syscall & 0xffff), &tramp[2]);
/* sc */
- err |= __put_user(PPC_INST_SC, &tramp[2]);
+ err |= __put_user(PPC_INST_SC, &tramp[3]);
/* Minimal traceback info */
for (i=TRAMP_TRACEBACK; i < TRAMP_SIZE ;i++)
@@ -867,12 +869,12 @@ int handle_rt_signal64(struct ksignal *ksig, sigset_t *set,
/* Set up to return from userspace. */
if (vdso64_rt_sigtramp && tsk->mm->context.vdso_base) {
- regs->link = tsk->mm->context.vdso_base + vdso64_rt_sigtramp;
+ regs->nip = tsk->mm->context.vdso_base + vdso64_rt_sigtramp;
} else {
err |= setup_trampoline(__NR_rt_sigreturn, &frame->tramp[0]);
if (err)
goto badframe;
- regs->link = (unsigned long) &frame->tramp[0];
+ regs->nip = (unsigned long) &frame->tramp[0];
}
/* Allocate a dummy caller frame for the signal handler. */
@@ -881,8 +883,8 @@ int handle_rt_signal64(struct ksignal *ksig, sigset_t *set,
/* Set up "regs" so we "return" to the signal handler. */
if (is_elf2_task()) {
- regs->nip = (unsigned long) ksig->ka.sa.sa_handler;
- regs->gpr[12] = regs->nip;
+ regs->ctr = (unsigned long) ksig->ka.sa.sa_handler;
+ regs->gpr[12] = regs->ctr;
} else {
/* Handler is *really* a pointer to the function descriptor for
* the signal routine. The first entry in the function
@@ -892,7 +894,7 @@ int handle_rt_signal64(struct ksignal *ksig, sigset_t *set,
func_descr_t __user *funct_desc_ptr =
(func_descr_t __user *) ksig->ka.sa.sa_handler;
- err |= get_user(regs->nip, &funct_desc_ptr->entry);
+ err |= get_user(regs->ctr, &funct_desc_ptr->entry);
err |= get_user(regs->gpr[2], &funct_desc_ptr->toc);
}
diff --git a/arch/powerpc/kernel/vdso64/sigtramp.S b/arch/powerpc/kernel/vdso64/sigtramp.S
index a8cc0409d7d2..bbf68cd01088 100644
--- a/arch/powerpc/kernel/vdso64/sigtramp.S
+++ b/arch/powerpc/kernel/vdso64/sigtramp.S
@@ -6,6 +6,7 @@
* Copyright (C) 2004 Benjamin Herrenschmuidt (benh@kernel.crashing.org), IBM Corp.
* Copyright (C) 2004 Alan Modra (amodra@au.ibm.com)), IBM Corp.
*/
+#include <asm/cache.h> /* IFETCH_ALIGN_BYTES */
#include <asm/processor.h>
#include <asm/ppc_asm.h>
#include <asm/unistd.h>
@@ -14,21 +15,17 @@
.text
-/* The nop here is a hack. The dwarf2 unwind routines subtract 1 from
- the return address to get an address in the middle of the presumed
- call instruction. Since we don't have a call here, we artificially
- extend the range covered by the unwind info by padding before the
- real start. */
- nop
.balign 8
+ .balign IFETCH_ALIGN_BYTES
V_FUNCTION_BEGIN(__kernel_sigtramp_rt64)
-.Lsigrt_start = . - 4
+.Lsigrt_start:
+ bctrl /* call the handler */
addi r1, r1, __SIGNAL_FRAMESIZE
li r0,__NR_rt_sigreturn
sc
.Lsigrt_end:
V_FUNCTION_END(__kernel_sigtramp_rt64)
-/* The ".balign 8" above and the following zeros mimic the old stack
+/* The .balign 8 above and the following zeros mimic the old stack
trampoline layout. The last magic value is the ucontext pointer,
chosen in such a way that older libgcc unwind code returns a zero
for a sigcontext pointer. */
--
2.23.0
^ permalink raw reply related
* Re: [PATCH 1/2] powerpc/spufs: fix copy_to_user while atomic
From: Christoph Hellwig @ 2020-05-04 13:04 UTC (permalink / raw)
To: Jeremy Kerr; +Cc: linuxppc-dev, Christoph Hellwig, Arnd Bergmann
In-Reply-To: <20200429070303.17599-1-jk@ozlabs.org>
powerpc mantainers,
are you going to pick this up for the next -rc1? I'm waiting for it to
hit upstream before resending the coredump series.
On Wed, Apr 29, 2020 at 03:03:02PM +0800, Jeremy Kerr wrote:
> Currently, we may perform a copy_to_user (through
> simple_read_from_buffer()) while holding a context's register_lock,
> while accessing the context save area.
>
> This change uses a temporary buffer for the context save area data,
> which we then pass to simple_read_from_buffer.
>
> Includes changes from Christoph Hellwig <hch@lst.de>.
>
> Fixes: bf1ab978be23 ("[POWERPC] coredump: Add SPU elf notes to coredump.")
> Signed-off-by: Jeremy Kerr <jk@ozlabs.org>
> Reviewed-by: Arnd Bergmann <arnd@arndb.de>
> Reviewed-by: Christoph Hellwig <hch@lst.de>
> ---
> arch/powerpc/platforms/cell/spufs/file.c | 113 +++++++++++++++--------
> 1 file changed, 75 insertions(+), 38 deletions(-)
>
> diff --git a/arch/powerpc/platforms/cell/spufs/file.c b/arch/powerpc/platforms/cell/spufs/file.c
> index c0f950a3f4e1..b4e1ef650b40 100644
> --- a/arch/powerpc/platforms/cell/spufs/file.c
> +++ b/arch/powerpc/platforms/cell/spufs/file.c
> @@ -1978,8 +1978,9 @@ static ssize_t __spufs_mbox_info_read(struct spu_context *ctx,
> static ssize_t spufs_mbox_info_read(struct file *file, char __user *buf,
> size_t len, loff_t *pos)
> {
> - int ret;
> struct spu_context *ctx = file->private_data;
> + u32 stat, data;
> + int ret;
>
> if (!access_ok(buf, len))
> return -EFAULT;
> @@ -1988,11 +1989,16 @@ static ssize_t spufs_mbox_info_read(struct file *file, char __user *buf,
> if (ret)
> return ret;
> spin_lock(&ctx->csa.register_lock);
> - ret = __spufs_mbox_info_read(ctx, buf, len, pos);
> + stat = ctx->csa.prob.mb_stat_R;
> + data = ctx->csa.prob.pu_mb_R;
> spin_unlock(&ctx->csa.register_lock);
> spu_release_saved(ctx);
>
> - return ret;
> + /* EOF if there's no entry in the mbox */
> + if (!(stat & 0x0000ff))
> + return 0;
> +
> + return simple_read_from_buffer(buf, len, pos, &data, sizeof(data));
> }
>
> static const struct file_operations spufs_mbox_info_fops = {
> @@ -2019,6 +2025,7 @@ static ssize_t spufs_ibox_info_read(struct file *file, char __user *buf,
> size_t len, loff_t *pos)
> {
> struct spu_context *ctx = file->private_data;
> + u32 stat, data;
> int ret;
>
> if (!access_ok(buf, len))
> @@ -2028,11 +2035,16 @@ static ssize_t spufs_ibox_info_read(struct file *file, char __user *buf,
> if (ret)
> return ret;
> spin_lock(&ctx->csa.register_lock);
> - ret = __spufs_ibox_info_read(ctx, buf, len, pos);
> + stat = ctx->csa.prob.mb_stat_R;
> + data = ctx->csa.priv2.puint_mb_R;
> spin_unlock(&ctx->csa.register_lock);
> spu_release_saved(ctx);
>
> - return ret;
> + /* EOF if there's no entry in the ibox */
> + if (!(stat & 0xff0000))
> + return 0;
> +
> + return simple_read_from_buffer(buf, len, pos, &data, sizeof(data));
> }
>
> static const struct file_operations spufs_ibox_info_fops = {
> @@ -2041,6 +2053,11 @@ static const struct file_operations spufs_ibox_info_fops = {
> .llseek = generic_file_llseek,
> };
>
> +static size_t spufs_wbox_info_cnt(struct spu_context *ctx)
> +{
> + return (4 - ((ctx->csa.prob.mb_stat_R & 0x00ff00) >> 8)) * sizeof(u32);
> +}
> +
> static ssize_t __spufs_wbox_info_read(struct spu_context *ctx,
> char __user *buf, size_t len, loff_t *pos)
> {
> @@ -2049,7 +2066,7 @@ static ssize_t __spufs_wbox_info_read(struct spu_context *ctx,
> u32 wbox_stat;
>
> wbox_stat = ctx->csa.prob.mb_stat_R;
> - cnt = 4 - ((wbox_stat & 0x00ff00) >> 8);
> + cnt = spufs_wbox_info_cnt(ctx);
> for (i = 0; i < cnt; i++) {
> data[i] = ctx->csa.spu_mailbox_data[i];
> }
> @@ -2062,7 +2079,8 @@ static ssize_t spufs_wbox_info_read(struct file *file, char __user *buf,
> size_t len, loff_t *pos)
> {
> struct spu_context *ctx = file->private_data;
> - int ret;
> + u32 data[ARRAY_SIZE(ctx->csa.spu_mailbox_data)];
> + int ret, count;
>
> if (!access_ok(buf, len))
> return -EFAULT;
> @@ -2071,11 +2089,13 @@ static ssize_t spufs_wbox_info_read(struct file *file, char __user *buf,
> if (ret)
> return ret;
> spin_lock(&ctx->csa.register_lock);
> - ret = __spufs_wbox_info_read(ctx, buf, len, pos);
> + count = spufs_wbox_info_cnt(ctx);
> + memcpy(&data, &ctx->csa.spu_mailbox_data, sizeof(data));
> spin_unlock(&ctx->csa.register_lock);
> spu_release_saved(ctx);
>
> - return ret;
> + return simple_read_from_buffer(buf, len, pos, &data,
> + count * sizeof(u32));
> }
>
> static const struct file_operations spufs_wbox_info_fops = {
> @@ -2084,27 +2104,33 @@ static const struct file_operations spufs_wbox_info_fops = {
> .llseek = generic_file_llseek,
> };
>
> -static ssize_t __spufs_dma_info_read(struct spu_context *ctx,
> - char __user *buf, size_t len, loff_t *pos)
> +static void ___spufs_dma_info_read(struct spu_context *ctx,
> + struct spu_dma_info *info)
> {
> - struct spu_dma_info info;
> - struct mfc_cq_sr *qp, *spuqp;
> int i;
>
> - info.dma_info_type = ctx->csa.priv2.spu_tag_status_query_RW;
> - info.dma_info_mask = ctx->csa.lscsa->tag_mask.slot[0];
> - info.dma_info_status = ctx->csa.spu_chnldata_RW[24];
> - info.dma_info_stall_and_notify = ctx->csa.spu_chnldata_RW[25];
> - info.dma_info_atomic_command_status = ctx->csa.spu_chnldata_RW[27];
> + info->dma_info_type = ctx->csa.priv2.spu_tag_status_query_RW;
> + info->dma_info_mask = ctx->csa.lscsa->tag_mask.slot[0];
> + info->dma_info_status = ctx->csa.spu_chnldata_RW[24];
> + info->dma_info_stall_and_notify = ctx->csa.spu_chnldata_RW[25];
> + info->dma_info_atomic_command_status = ctx->csa.spu_chnldata_RW[27];
> for (i = 0; i < 16; i++) {
> - qp = &info.dma_info_command_data[i];
> - spuqp = &ctx->csa.priv2.spuq[i];
> + struct mfc_cq_sr *qp = &info->dma_info_command_data[i];
> + struct mfc_cq_sr *spuqp = &ctx->csa.priv2.spuq[i];
>
> qp->mfc_cq_data0_RW = spuqp->mfc_cq_data0_RW;
> qp->mfc_cq_data1_RW = spuqp->mfc_cq_data1_RW;
> qp->mfc_cq_data2_RW = spuqp->mfc_cq_data2_RW;
> qp->mfc_cq_data3_RW = spuqp->mfc_cq_data3_RW;
> }
> +}
> +
> +static ssize_t __spufs_dma_info_read(struct spu_context *ctx,
> + char __user *buf, size_t len, loff_t *pos)
> +{
> + struct spu_dma_info info;
> +
> + ___spufs_dma_info_read(ctx, &info);
>
> return simple_read_from_buffer(buf, len, pos, &info,
> sizeof info);
> @@ -2114,6 +2140,7 @@ static ssize_t spufs_dma_info_read(struct file *file, char __user *buf,
> size_t len, loff_t *pos)
> {
> struct spu_context *ctx = file->private_data;
> + struct spu_dma_info info;
> int ret;
>
> if (!access_ok(buf, len))
> @@ -2123,11 +2150,12 @@ static ssize_t spufs_dma_info_read(struct file *file, char __user *buf,
> if (ret)
> return ret;
> spin_lock(&ctx->csa.register_lock);
> - ret = __spufs_dma_info_read(ctx, buf, len, pos);
> + ___spufs_dma_info_read(ctx, &info);
> spin_unlock(&ctx->csa.register_lock);
> spu_release_saved(ctx);
>
> - return ret;
> + return simple_read_from_buffer(buf, len, pos, &info,
> + sizeof(info));
> }
>
> static const struct file_operations spufs_dma_info_fops = {
> @@ -2136,13 +2164,31 @@ static const struct file_operations spufs_dma_info_fops = {
> .llseek = no_llseek,
> };
>
> +static void ___spufs_proxydma_info_read(struct spu_context *ctx,
> + struct spu_proxydma_info *info)
> +{
> + int i;
> +
> + info->proxydma_info_type = ctx->csa.prob.dma_querytype_RW;
> + info->proxydma_info_mask = ctx->csa.prob.dma_querymask_RW;
> + info->proxydma_info_status = ctx->csa.prob.dma_tagstatus_R;
> +
> + for (i = 0; i < 8; i++) {
> + struct mfc_cq_sr *qp = &info->proxydma_info_command_data[i];
> + struct mfc_cq_sr *puqp = &ctx->csa.priv2.puq[i];
> +
> + qp->mfc_cq_data0_RW = puqp->mfc_cq_data0_RW;
> + qp->mfc_cq_data1_RW = puqp->mfc_cq_data1_RW;
> + qp->mfc_cq_data2_RW = puqp->mfc_cq_data2_RW;
> + qp->mfc_cq_data3_RW = puqp->mfc_cq_data3_RW;
> + }
> +}
> +
> static ssize_t __spufs_proxydma_info_read(struct spu_context *ctx,
> char __user *buf, size_t len, loff_t *pos)
> {
> struct spu_proxydma_info info;
> - struct mfc_cq_sr *qp, *puqp;
> int ret = sizeof info;
> - int i;
>
> if (len < ret)
> return -EINVAL;
> @@ -2150,18 +2196,7 @@ static ssize_t __spufs_proxydma_info_read(struct spu_context *ctx,
> if (!access_ok(buf, len))
> return -EFAULT;
>
> - info.proxydma_info_type = ctx->csa.prob.dma_querytype_RW;
> - info.proxydma_info_mask = ctx->csa.prob.dma_querymask_RW;
> - info.proxydma_info_status = ctx->csa.prob.dma_tagstatus_R;
> - for (i = 0; i < 8; i++) {
> - qp = &info.proxydma_info_command_data[i];
> - puqp = &ctx->csa.priv2.puq[i];
> -
> - qp->mfc_cq_data0_RW = puqp->mfc_cq_data0_RW;
> - qp->mfc_cq_data1_RW = puqp->mfc_cq_data1_RW;
> - qp->mfc_cq_data2_RW = puqp->mfc_cq_data2_RW;
> - qp->mfc_cq_data3_RW = puqp->mfc_cq_data3_RW;
> - }
> + ___spufs_proxydma_info_read(ctx, &info);
>
> return simple_read_from_buffer(buf, len, pos, &info,
> sizeof info);
> @@ -2171,17 +2206,19 @@ static ssize_t spufs_proxydma_info_read(struct file *file, char __user *buf,
> size_t len, loff_t *pos)
> {
> struct spu_context *ctx = file->private_data;
> + struct spu_proxydma_info info;
> int ret;
>
> ret = spu_acquire_saved(ctx);
> if (ret)
> return ret;
> spin_lock(&ctx->csa.register_lock);
> - ret = __spufs_proxydma_info_read(ctx, buf, len, pos);
> + ___spufs_proxydma_info_read(ctx, &info);
> spin_unlock(&ctx->csa.register_lock);
> spu_release_saved(ctx);
>
> - return ret;
> + return simple_read_from_buffer(buf, len, pos, &info,
> + sizeof(info));
> }
>
> static const struct file_operations spufs_proxydma_info_fops = {
> --
> 2.17.1
---end quoted text---
^ permalink raw reply
* [PATCH] powerpc/64s/radix: Don't prefetch DAR in update_mmu_cache
From: Nicholas Piggin @ 2020-05-04 12:29 UTC (permalink / raw)
To: linuxppc-dev; +Cc: Nicholas Piggin
The idea behind this prefetch was to kick off a page table walk before
returning from the fault, getting some pipelining advantage.
But this never showed up any noticable performance advantage, and in
fact with KUAP the prefetches are actually blocked and cause some
kind of micro-architectural fault. Removing this improves page fault
microbenchmark performance by about 9%.
Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
arch/powerpc/include/asm/book3s/64/pgtable.h | 7 +++++--
arch/powerpc/mm/book3s64/hash_utils.c | 5 -----
arch/powerpc/mm/book3s64/pgtable.c | 13 -------------
3 files changed, 5 insertions(+), 20 deletions(-)
diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h
index 368b136517e0..59ed15e43e89 100644
--- a/arch/powerpc/include/asm/book3s/64/pgtable.h
+++ b/arch/powerpc/include/asm/book3s/64/pgtable.h
@@ -1139,8 +1139,11 @@ extern pmd_t mk_pmd(struct page *page, pgprot_t pgprot);
extern pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot);
extern void set_pmd_at(struct mm_struct *mm, unsigned long addr,
pmd_t *pmdp, pmd_t pmd);
-extern void update_mmu_cache_pmd(struct vm_area_struct *vma, unsigned long addr,
- pmd_t *pmd);
+static inline void update_mmu_cache_pmd(struct vm_area_struct *vma,
+ unsigned long addr, pmd_t *pmd)
+{
+}
+
extern int hash__has_transparent_hugepage(void);
static inline int has_transparent_hugepage(void)
{
diff --git a/arch/powerpc/mm/book3s64/hash_utils.c b/arch/powerpc/mm/book3s64/hash_utils.c
index 845da1e8ca4f..2458615805ee 100644
--- a/arch/powerpc/mm/book3s64/hash_utils.c
+++ b/arch/powerpc/mm/book3s64/hash_utils.c
@@ -1672,11 +1672,6 @@ void update_mmu_cache(struct vm_area_struct *vma, unsigned long address,
unsigned long trap;
bool is_exec;
- if (radix_enabled()) {
- prefetch((void *)address);
- return;
- }
-
/* We only want HPTEs for linux PTEs that have _PAGE_ACCESSED set */
if (!pte_young(*ptep) || address >= TASK_SIZE)
return;
diff --git a/arch/powerpc/mm/book3s64/pgtable.c b/arch/powerpc/mm/book3s64/pgtable.c
index e0bb69c616e4..821b483a5ac3 100644
--- a/arch/powerpc/mm/book3s64/pgtable.c
+++ b/arch/powerpc/mm/book3s64/pgtable.c
@@ -146,19 +146,6 @@ pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot)
pmdv &= _HPAGE_CHG_MASK;
return pmd_set_protbits(__pmd(pmdv), newprot);
}
-
-/*
- * This is called at the end of handling a user page fault, when the
- * fault has been handled by updating a HUGE PMD entry in the linux page tables.
- * We use it to preload an HPTE into the hash table corresponding to
- * the updated linux HUGE PMD entry.
- */
-void update_mmu_cache_pmd(struct vm_area_struct *vma, unsigned long addr,
- pmd_t *pmd)
-{
- if (radix_enabled())
- prefetch((void *)addr);
-}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
/* For use by kexec */
--
2.23.0
^ permalink raw reply related
* Re: [PATCH V2 11/11] drm: Remove drm specific kmap_atomic code
From: Daniel Vetter @ 2020-05-04 11:18 UTC (permalink / raw)
To: Ira Weiny
Cc: Peter Zijlstra, Dave Hansen, dri-devel, linux-mips,
James E.J. Bottomley, Max Filippov, Huang Rui, Paul Mackerras,
H. Peter Anvin, sparclinux, Christoph Hellwig, Helge Deller,
X86 ML, linux-csky, Ingo Molnar, arcml, linux-xtensa,
Dan Williams, Borislav Petkov, Andy Lutomirski, Thomas Gleixner,
Linux ARM, Chris Zankel, Thomas Bogendoerfer, linux-parisc,
Linux Kernel Mailing List, David S. Miller, Andrew Morton,
linuxppc-dev, Christian Koenig
In-Reply-To: <20200504010912.982044-12-ira.weiny@intel.com>
On Mon, May 4, 2020 at 3:09 AM <ira.weiny@intel.com> wrote:
>
> From: Ira Weiny <ira.weiny@intel.com>
>
> kmap_atomic_prot() is now exported by all architectures. Use this
> function rather than open coding a driver specific kmap_atomic.
>
> Reviewed-by: Christian König <christian.koenig@amd.com>
> Reviewed-by: Christoph Hellwig <hch@lst.de>
> Signed-off-by: Ira Weiny <ira.weiny@intel.com>
I'm assuming this lands through some other tree or a topic branch or whatever.
Acked-by: Daniel Vetter <daniel.vetter@ffwll.ch>
Cheers, Daniel
> ---
> drivers/gpu/drm/ttm/ttm_bo_util.c | 56 ++--------------------------
> drivers/gpu/drm/vmwgfx/vmwgfx_blit.c | 16 ++++----
> include/drm/ttm/ttm_bo_api.h | 4 --
> 3 files changed, 12 insertions(+), 64 deletions(-)
>
> diff --git a/drivers/gpu/drm/ttm/ttm_bo_util.c b/drivers/gpu/drm/ttm/ttm_bo_util.c
> index 52d2b71f1588..f09b096ba4fd 100644
> --- a/drivers/gpu/drm/ttm/ttm_bo_util.c
> +++ b/drivers/gpu/drm/ttm/ttm_bo_util.c
> @@ -257,54 +257,6 @@ static int ttm_copy_io_page(void *dst, void *src, unsigned long page)
> return 0;
> }
>
> -#ifdef CONFIG_X86
> -#define __ttm_kmap_atomic_prot(__page, __prot) kmap_atomic_prot(__page, __prot)
> -#define __ttm_kunmap_atomic(__addr) kunmap_atomic(__addr)
> -#else
> -#define __ttm_kmap_atomic_prot(__page, __prot) vmap(&__page, 1, 0, __prot)
> -#define __ttm_kunmap_atomic(__addr) vunmap(__addr)
> -#endif
> -
> -
> -/**
> - * ttm_kmap_atomic_prot - Efficient kernel map of a single page with
> - * specified page protection.
> - *
> - * @page: The page to map.
> - * @prot: The page protection.
> - *
> - * This function maps a TTM page using the kmap_atomic api if available,
> - * otherwise falls back to vmap. The user must make sure that the
> - * specified page does not have an aliased mapping with a different caching
> - * policy unless the architecture explicitly allows it. Also mapping and
> - * unmapping using this api must be correctly nested. Unmapping should
> - * occur in the reverse order of mapping.
> - */
> -void *ttm_kmap_atomic_prot(struct page *page, pgprot_t prot)
> -{
> - if (pgprot_val(prot) == pgprot_val(PAGE_KERNEL))
> - return kmap_atomic(page);
> - else
> - return __ttm_kmap_atomic_prot(page, prot);
> -}
> -EXPORT_SYMBOL(ttm_kmap_atomic_prot);
> -
> -/**
> - * ttm_kunmap_atomic_prot - Unmap a page that was mapped using
> - * ttm_kmap_atomic_prot.
> - *
> - * @addr: The virtual address from the map.
> - * @prot: The page protection.
> - */
> -void ttm_kunmap_atomic_prot(void *addr, pgprot_t prot)
> -{
> - if (pgprot_val(prot) == pgprot_val(PAGE_KERNEL))
> - kunmap_atomic(addr);
> - else
> - __ttm_kunmap_atomic(addr);
> -}
> -EXPORT_SYMBOL(ttm_kunmap_atomic_prot);
> -
> static int ttm_copy_io_ttm_page(struct ttm_tt *ttm, void *src,
> unsigned long page,
> pgprot_t prot)
> @@ -316,13 +268,13 @@ static int ttm_copy_io_ttm_page(struct ttm_tt *ttm, void *src,
> return -ENOMEM;
>
> src = (void *)((unsigned long)src + (page << PAGE_SHIFT));
> - dst = ttm_kmap_atomic_prot(d, prot);
> + dst = kmap_atomic_prot(d, prot);
> if (!dst)
> return -ENOMEM;
>
> memcpy_fromio(dst, src, PAGE_SIZE);
>
> - ttm_kunmap_atomic_prot(dst, prot);
> + kunmap_atomic(dst);
>
> return 0;
> }
> @@ -338,13 +290,13 @@ static int ttm_copy_ttm_io_page(struct ttm_tt *ttm, void *dst,
> return -ENOMEM;
>
> dst = (void *)((unsigned long)dst + (page << PAGE_SHIFT));
> - src = ttm_kmap_atomic_prot(s, prot);
> + src = kmap_atomic_prot(s, prot);
> if (!src)
> return -ENOMEM;
>
> memcpy_toio(dst, src, PAGE_SIZE);
>
> - ttm_kunmap_atomic_prot(src, prot);
> + kunmap_atomic(src);
>
> return 0;
> }
> diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_blit.c b/drivers/gpu/drm/vmwgfx/vmwgfx_blit.c
> index bb46ca0c458f..94d456a1d1a9 100644
> --- a/drivers/gpu/drm/vmwgfx/vmwgfx_blit.c
> +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_blit.c
> @@ -374,12 +374,12 @@ static int vmw_bo_cpu_blit_line(struct vmw_bo_blit_line_data *d,
> copy_size = min_t(u32, copy_size, PAGE_SIZE - src_page_offset);
>
> if (unmap_src) {
> - ttm_kunmap_atomic_prot(d->src_addr, d->src_prot);
> + kunmap_atomic(d->src_addr);
> d->src_addr = NULL;
> }
>
> if (unmap_dst) {
> - ttm_kunmap_atomic_prot(d->dst_addr, d->dst_prot);
> + kunmap_atomic(d->dst_addr);
> d->dst_addr = NULL;
> }
>
> @@ -388,8 +388,8 @@ static int vmw_bo_cpu_blit_line(struct vmw_bo_blit_line_data *d,
> return -EINVAL;
>
> d->dst_addr =
> - ttm_kmap_atomic_prot(d->dst_pages[dst_page],
> - d->dst_prot);
> + kmap_atomic_prot(d->dst_pages[dst_page],
> + d->dst_prot);
> if (!d->dst_addr)
> return -ENOMEM;
>
> @@ -401,8 +401,8 @@ static int vmw_bo_cpu_blit_line(struct vmw_bo_blit_line_data *d,
> return -EINVAL;
>
> d->src_addr =
> - ttm_kmap_atomic_prot(d->src_pages[src_page],
> - d->src_prot);
> + kmap_atomic_prot(d->src_pages[src_page],
> + d->src_prot);
> if (!d->src_addr)
> return -ENOMEM;
>
> @@ -499,9 +499,9 @@ int vmw_bo_cpu_blit(struct ttm_buffer_object *dst,
> }
> out:
> if (d.src_addr)
> - ttm_kunmap_atomic_prot(d.src_addr, d.src_prot);
> + kunmap_atomic(d.src_addr);
> if (d.dst_addr)
> - ttm_kunmap_atomic_prot(d.dst_addr, d.dst_prot);
> + kunmap_atomic(d.dst_addr);
>
> return ret;
> }
> diff --git a/include/drm/ttm/ttm_bo_api.h b/include/drm/ttm/ttm_bo_api.h
> index 0a9d042e075a..de1ccdcd5703 100644
> --- a/include/drm/ttm/ttm_bo_api.h
> +++ b/include/drm/ttm/ttm_bo_api.h
> @@ -668,10 +668,6 @@ int ttm_bo_mmap_obj(struct vm_area_struct *vma, struct ttm_buffer_object *bo);
> int ttm_bo_mmap(struct file *filp, struct vm_area_struct *vma,
> struct ttm_bo_device *bdev);
>
> -void *ttm_kmap_atomic_prot(struct page *page, pgprot_t prot);
> -
> -void ttm_kunmap_atomic_prot(void *addr, pgprot_t prot);
> -
> /**
> * ttm_bo_io
> *
> --
> 2.25.1
>
> _______________________________________________
> dri-devel mailing list
> dri-devel@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/dri-devel
--
Daniel Vetter
Software Engineer, Intel Corporation
+41 (0) 79 365 57 48 - http://blog.ffwll.ch
^ permalink raw reply
* [PATCH v2 5/5] kvm_main: replace debugfs with stats_fs
From: Emanuele Giuseppe Esposito @ 2020-05-04 11:03 UTC (permalink / raw)
To: kvm
Cc: Emanuele Giuseppe Esposito, linux-s390, David Hildenbrand,
Cornelia Huck, Emanuele Giuseppe Esposito, linux-kernel, kvm-ppc,
linux-mips, Christian Borntraeger, Alexander Viro, linux-fsdevel,
Paolo Bonzini, Vitaly Kuznetsov, linuxppc-dev, Jim Mattson
In-Reply-To: <20200504110344.17560-1-eesposit@redhat.com>
Use stats_fs API instead of debugfs to create sources and add values.
This also requires to change all architecture files to replace the old
debugfs_entries with stats_fs_vcpu_entries and statsfs_vm_entries.
The files/folders name and organization is kept unchanged, and a symlink
in sys/kernel/debugfs/kvm is left for backward compatibility.
Signed-off-by: Emanuele Giuseppe Esposito <eesposit@redhat.com>
---
arch/arm64/kvm/Kconfig | 1 +
arch/arm64/kvm/guest.c | 2 +-
arch/mips/kvm/Kconfig | 1 +
arch/mips/kvm/mips.c | 2 +-
arch/powerpc/kvm/Kconfig | 1 +
arch/powerpc/kvm/book3s.c | 6 +-
arch/powerpc/kvm/booke.c | 8 +-
arch/s390/kvm/Kconfig | 1 +
arch/s390/kvm/kvm-s390.c | 16 +-
arch/x86/include/asm/kvm_host.h | 2 +-
arch/x86/kvm/Kconfig | 1 +
arch/x86/kvm/Makefile | 2 +-
arch/x86/kvm/debugfs.c | 64 -------
arch/x86/kvm/stats_fs.c | 56 ++++++
arch/x86/kvm/x86.c | 6 +-
include/linux/kvm_host.h | 39 +---
virt/kvm/arm/arm.c | 2 +-
virt/kvm/kvm_main.c | 314 ++++----------------------------
18 files changed, 142 insertions(+), 382 deletions(-)
delete mode 100644 arch/x86/kvm/debugfs.c
create mode 100644 arch/x86/kvm/stats_fs.c
diff --git a/arch/arm64/kvm/Kconfig b/arch/arm64/kvm/Kconfig
index 449386d76441..8c125387b673 100644
--- a/arch/arm64/kvm/Kconfig
+++ b/arch/arm64/kvm/Kconfig
@@ -23,6 +23,7 @@ config KVM
depends on OF
# for TASKSTATS/TASK_DELAY_ACCT:
depends on NET && MULTIUSER
+ imply STATS_FS
select MMU_NOTIFIER
select PREEMPT_NOTIFIERS
select HAVE_KVM_CPU_RELAX_INTERCEPT
diff --git a/arch/arm64/kvm/guest.c b/arch/arm64/kvm/guest.c
index 8417b200bec9..235ed44e4353 100644
--- a/arch/arm64/kvm/guest.c
+++ b/arch/arm64/kvm/guest.c
@@ -29,7 +29,7 @@
#include "trace.h"
-struct kvm_stats_debugfs_item debugfs_entries[] = {
+struct stats_fs_value stats_fs_vcpu_entries[] = {
VCPU_STAT("halt_successful_poll", halt_successful_poll),
VCPU_STAT("halt_attempted_poll", halt_attempted_poll),
VCPU_STAT("halt_poll_invalid", halt_poll_invalid),
diff --git a/arch/mips/kvm/Kconfig b/arch/mips/kvm/Kconfig
index b91d145aa2d5..19d14e979e5f 100644
--- a/arch/mips/kvm/Kconfig
+++ b/arch/mips/kvm/Kconfig
@@ -19,6 +19,7 @@ config KVM
tristate "Kernel-based Virtual Machine (KVM) support"
depends on HAVE_KVM
depends on MIPS_FP_SUPPORT
+ imply STATS_FS
select EXPORT_UASM
select PREEMPT_NOTIFIERS
select KVM_GENERIC_DIRTYLOG_READ_PROTECT
diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c
index fdf1c14d9205..a47d21f35444 100644
--- a/arch/mips/kvm/mips.c
+++ b/arch/mips/kvm/mips.c
@@ -39,7 +39,7 @@
#define VECTORSPACING 0x100 /* for EI/VI mode */
#endif
-struct kvm_stats_debugfs_item debugfs_entries[] = {
+struct stats_fs_value stats_fs_vcpu_entries[] = {
VCPU_STAT("wait", wait_exits),
VCPU_STAT("cache", cache_exits),
VCPU_STAT("signal", signal_exits),
diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig
index 12885eda324e..feb5e110ebb0 100644
--- a/arch/powerpc/kvm/Kconfig
+++ b/arch/powerpc/kvm/Kconfig
@@ -19,6 +19,7 @@ if VIRTUALIZATION
config KVM
bool
+ imply STATS_FS
select PREEMPT_NOTIFIERS
select HAVE_KVM_EVENTFD
select HAVE_KVM_VCPU_ASYNC_IOCTL
diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c
index 37508a356f28..76222ab148da 100644
--- a/arch/powerpc/kvm/book3s.c
+++ b/arch/powerpc/kvm/book3s.c
@@ -38,7 +38,7 @@
/* #define EXIT_DEBUG */
-struct kvm_stats_debugfs_item debugfs_entries[] = {
+struct stats_fs_value stats_fs_vcpu_entries[] = {
VCPU_STAT("exits", sum_exits),
VCPU_STAT("mmio", mmio_exits),
VCPU_STAT("sig", signal_exits),
@@ -66,6 +66,10 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
VCPU_STAT("pthru_all", pthru_all),
VCPU_STAT("pthru_host", pthru_host),
VCPU_STAT("pthru_bad_aff", pthru_bad_aff),
+ { NULL }
+};
+
+struct stats_fs_value stats_fs_vm_entries[] = {
VM_STAT("largepages_2M", num_2M_pages, .mode = 0444),
VM_STAT("largepages_1G", num_1G_pages, .mode = 0444),
{ NULL }
diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c
index c2984cb6dfa7..b14c07786cc8 100644
--- a/arch/powerpc/kvm/booke.c
+++ b/arch/powerpc/kvm/booke.c
@@ -35,7 +35,12 @@
unsigned long kvmppc_booke_handlers;
-struct kvm_stats_debugfs_item debugfs_entries[] = {
+struct stats_fs_value stats_fs_vm_entries[] = {
+ VM_STAT("remote_tlb_flush", remote_tlb_flush),
+ { NULL }
+};
+
+struct stats_fs_value stats_fs_vcpu_entries[] = {
VCPU_STAT("mmio", mmio_exits),
VCPU_STAT("sig", signal_exits),
VCPU_STAT("itlb_r", itlb_real_miss_exits),
@@ -54,7 +59,6 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
VCPU_STAT("halt_wakeup", halt_wakeup),
VCPU_STAT("doorbell", dbell_exits),
VCPU_STAT("guest doorbell", gdbell_exits),
- VM_STAT("remote_tlb_flush", remote_tlb_flush),
{ NULL }
};
diff --git a/arch/s390/kvm/Kconfig b/arch/s390/kvm/Kconfig
index def3b60f1fe8..4e912ffcde78 100644
--- a/arch/s390/kvm/Kconfig
+++ b/arch/s390/kvm/Kconfig
@@ -20,6 +20,7 @@ config KVM
def_tristate y
prompt "Kernel-based Virtual Machine (KVM) support"
depends on HAVE_KVM
+ imply STATS_FS
select PREEMPT_NOTIFIERS
select HAVE_KVM_CPU_RELAX_INTERCEPT
select HAVE_KVM_VCPU_ASYNC_IOCTL
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index dbeb7da07f18..f2f090b78529 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -57,7 +57,16 @@
#define VCPU_IRQS_MAX_BUF (sizeof(struct kvm_s390_irq) * \
(KVM_MAX_VCPUS + LOCAL_IRQS))
-struct kvm_stats_debugfs_item debugfs_entries[] = {
+struct stats_fs_value stats_fs_vm_entries[] = {
+ VM_STAT("inject_float_mchk", inject_float_mchk),
+ VM_STAT("inject_io", inject_io),
+ VM_STAT("inject_pfault_done", inject_pfault_done),
+ VM_STAT("inject_service_signal", inject_service_signal),
+ VM_STAT("inject_virtio", inject_virtio),
+ { NULL }
+};
+
+struct stats_fs_value stats_fs_vcpu_entries[] = {
VCPU_STAT("userspace_handled", exit_userspace),
VCPU_STAT("exit_null", exit_null),
VCPU_STAT("exit_validity", exit_validity),
@@ -95,18 +104,13 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
VCPU_STAT("inject_ckc", inject_ckc),
VCPU_STAT("inject_cputm", inject_cputm),
VCPU_STAT("inject_external_call", inject_external_call),
- VM_STAT("inject_float_mchk", inject_float_mchk),
VCPU_STAT("inject_emergency_signal", inject_emergency_signal),
- VM_STAT("inject_io", inject_io),
VCPU_STAT("inject_mchk", inject_mchk),
- VM_STAT("inject_pfault_done", inject_pfault_done),
VCPU_STAT("inject_program", inject_program),
VCPU_STAT("inject_restart", inject_restart),
- VM_STAT("inject_service_signal", inject_service_signal),
VCPU_STAT("inject_set_prefix", inject_set_prefix),
VCPU_STAT("inject_stop_signal", inject_stop_signal),
VCPU_STAT("inject_pfault_init", inject_pfault_init),
- VM_STAT("inject_virtio", inject_virtio),
VCPU_STAT("instruction_epsw", instruction_epsw),
VCPU_STAT("instruction_gs", instruction_gs),
VCPU_STAT("instruction_io_other", instruction_io_other),
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 42a2d0d3984a..6a04f590963f 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -35,7 +35,7 @@
#include <asm/kvm_vcpu_regs.h>
#include <asm/hyperv-tlfs.h>
-#define __KVM_HAVE_ARCH_VCPU_DEBUGFS
+#define __KVM_HAVE_ARCH_VCPU_STATS_FS
#define KVM_MAX_VCPUS 288
#define KVM_SOFT_MAX_VCPUS 240
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index d8154e0684b6..834b8f4790a7 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -25,6 +25,7 @@ config KVM
# for TASKSTATS/TASK_DELAY_ACCT:
depends on NET && MULTIUSER
depends on X86_LOCAL_APIC
+ imply STATS_FS
select PREEMPT_NOTIFIERS
select MMU_NOTIFIER
select HAVE_KVM_IRQCHIP
diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
index a789759b7261..18285a382eba 100644
--- a/arch/x86/kvm/Makefile
+++ b/arch/x86/kvm/Makefile
@@ -11,7 +11,7 @@ kvm-$(CONFIG_KVM_ASYNC_PF) += $(KVM)/async_pf.o
kvm-y += x86.o emulate.o i8259.o irq.o lapic.o \
i8254.o ioapic.o irq_comm.o cpuid.o pmu.o mtrr.o \
- hyperv.o debugfs.o mmu/mmu.o mmu/page_track.o
+ hyperv.o stats_fs.o mmu/mmu.o mmu/page_track.o
kvm-intel-y += vmx/vmx.o vmx/vmenter.o vmx/pmu_intel.o vmx/vmcs12.o vmx/evmcs.o vmx/nested.o
kvm-amd-y += svm/svm.o svm/vmenter.o svm/pmu.o svm/nested.o svm/avic.o svm/sev.o
diff --git a/arch/x86/kvm/debugfs.c b/arch/x86/kvm/debugfs.c
deleted file mode 100644
index 018aebce33ff..000000000000
--- a/arch/x86/kvm/debugfs.c
+++ /dev/null
@@ -1,64 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * Kernel-based Virtual Machine driver for Linux
- *
- * Copyright 2016 Red Hat, Inc. and/or its affiliates.
- */
-#include <linux/kvm_host.h>
-#include <linux/debugfs.h>
-#include "lapic.h"
-
-static int vcpu_get_timer_advance_ns(void *data, u64 *val)
-{
- struct kvm_vcpu *vcpu = (struct kvm_vcpu *) data;
- *val = vcpu->arch.apic->lapic_timer.timer_advance_ns;
- return 0;
-}
-
-DEFINE_SIMPLE_ATTRIBUTE(vcpu_timer_advance_ns_fops, vcpu_get_timer_advance_ns, NULL, "%llu\n");
-
-static int vcpu_get_tsc_offset(void *data, u64 *val)
-{
- struct kvm_vcpu *vcpu = (struct kvm_vcpu *) data;
- *val = vcpu->arch.tsc_offset;
- return 0;
-}
-
-DEFINE_SIMPLE_ATTRIBUTE(vcpu_tsc_offset_fops, vcpu_get_tsc_offset, NULL, "%lld\n");
-
-static int vcpu_get_tsc_scaling_ratio(void *data, u64 *val)
-{
- struct kvm_vcpu *vcpu = (struct kvm_vcpu *) data;
- *val = vcpu->arch.tsc_scaling_ratio;
- return 0;
-}
-
-DEFINE_SIMPLE_ATTRIBUTE(vcpu_tsc_scaling_fops, vcpu_get_tsc_scaling_ratio, NULL, "%llu\n");
-
-static int vcpu_get_tsc_scaling_frac_bits(void *data, u64 *val)
-{
- *val = kvm_tsc_scaling_ratio_frac_bits;
- return 0;
-}
-
-DEFINE_SIMPLE_ATTRIBUTE(vcpu_tsc_scaling_frac_fops, vcpu_get_tsc_scaling_frac_bits, NULL, "%llu\n");
-
-void kvm_arch_create_vcpu_debugfs(struct kvm_vcpu *vcpu)
-{
- debugfs_create_file("tsc-offset", 0444, vcpu->debugfs_dentry, vcpu,
- &vcpu_tsc_offset_fops);
-
- if (lapic_in_kernel(vcpu))
- debugfs_create_file("lapic_timer_advance_ns", 0444,
- vcpu->debugfs_dentry, vcpu,
- &vcpu_timer_advance_ns_fops);
-
- if (kvm_has_tsc_control) {
- debugfs_create_file("tsc-scaling-ratio", 0444,
- vcpu->debugfs_dentry, vcpu,
- &vcpu_tsc_scaling_fops);
- debugfs_create_file("tsc-scaling-ratio-frac-bits", 0444,
- vcpu->debugfs_dentry, vcpu,
- &vcpu_tsc_scaling_frac_fops);
- }
-}
diff --git a/arch/x86/kvm/stats_fs.c b/arch/x86/kvm/stats_fs.c
new file mode 100644
index 000000000000..f2ac6ed8b01b
--- /dev/null
+++ b/arch/x86/kvm/stats_fs.c
@@ -0,0 +1,56 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Kernel-based Virtual Machine driver for Linux
+ *
+ * Copyright 2016 Red Hat, Inc. and/or its affiliates.
+ */
+#include <linux/kvm_host.h>
+#include <linux/stats_fs.h>
+#include "lapic.h"
+
+#define VCPU_ARCH_STATS_FS(n, s, x, ...) \
+ { n, offsetof(struct s, x), .aggr_kind = STATS_FS_SUM, \
+ ##__VA_ARGS__ }
+
+struct stats_fs_value stats_fs_vcpu_tsc_offset[] = {
+ VCPU_ARCH_STATS_FS("tsc-offset", kvm_vcpu_arch, tsc_offset,
+ .type = STATS_FS_S64, .mode = 0444),
+ { NULL }
+};
+
+struct stats_fs_value stats_fs_vcpu_arch_lapic_timer[] = {
+ VCPU_ARCH_STATS_FS("lapic_timer_advance_ns", kvm_timer, timer_advance_ns,
+ .type = STATS_FS_U64, .mode = 0444),
+ { NULL }
+};
+
+struct stats_fs_value stats_fs_vcpu_arch_tsc_ratio[] = {
+ VCPU_ARCH_STATS_FS("tsc-scaling-ratio", kvm_vcpu_arch, tsc_scaling_ratio,
+ .type = STATS_FS_U64, .mode = 0444),
+ { NULL }
+};
+
+struct stats_fs_value stats_fs_vcpu_arch_tsc_frac[] = {
+ { "tsc-scaling-ratio-frac-bits", 0, .type = STATS_FS_U64, .mode = 0444 },
+ { NULL } /* base is &kvm_tsc_scaling_ratio_frac_bits */
+};
+
+void kvm_arch_create_vcpu_stats_fs(struct kvm_vcpu *vcpu)
+{
+ stats_fs_source_add_values(vcpu->stats_fs_src, stats_fs_vcpu_tsc_offset,
+ &vcpu->arch);
+
+ if (lapic_in_kernel(vcpu))
+ stats_fs_source_add_values(vcpu->stats_fs_src,
+ stats_fs_vcpu_arch_lapic_timer,
+ &vcpu->arch.apic->lapic_timer);
+
+ if (kvm_has_tsc_control) {
+ stats_fs_source_add_values(vcpu->stats_fs_src,
+ stats_fs_vcpu_arch_tsc_ratio,
+ &vcpu->arch);
+ stats_fs_source_add_values(vcpu->stats_fs_src,
+ stats_fs_vcpu_arch_tsc_frac,
+ &kvm_tsc_scaling_ratio_frac_bits);
+ }
+}
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 35723dafedeb..01fed7ac2e49 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -190,7 +190,7 @@ static u64 __read_mostly host_xss;
u64 __read_mostly supported_xss;
EXPORT_SYMBOL_GPL(supported_xss);
-struct kvm_stats_debugfs_item debugfs_entries[] = {
+struct stats_fs_value stats_fs_vcpu_entries[] = {
VCPU_STAT("pf_fixed", pf_fixed),
VCPU_STAT("pf_guest", pf_guest),
VCPU_STAT("tlb_flush", tlb_flush),
@@ -217,6 +217,10 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
VCPU_STAT("nmi_injections", nmi_injections),
VCPU_STAT("req_event", req_event),
VCPU_STAT("l1d_flush", l1d_flush),
+ { NULL }
+};
+
+struct stats_fs_value stats_fs_vm_entries[] = {
VM_STAT("mmu_shadow_zapped", mmu_shadow_zapped),
VM_STAT("mmu_pte_write", mmu_pte_write),
VM_STAT("mmu_pte_updated", mmu_pte_updated),
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 3845f857ef7b..1282799aa46e 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -27,6 +27,7 @@
#include <linux/refcount.h>
#include <linux/nospec.h>
#include <asm/signal.h>
+#include <linux/stats_fs.h>
#include <linux/kvm.h>
#include <linux/kvm_para.h>
@@ -318,7 +319,7 @@ struct kvm_vcpu {
bool preempted;
bool ready;
struct kvm_vcpu_arch arch;
- struct dentry *debugfs_dentry;
+ struct stats_fs_source *stats_fs_src;
};
static inline int kvm_vcpu_exiting_guest_mode(struct kvm_vcpu *vcpu)
@@ -498,8 +499,7 @@ struct kvm {
long tlbs_dirty;
struct list_head devices;
u64 manual_dirty_log_protect;
- struct dentry *debugfs_dentry;
- struct kvm_stat_data **debugfs_stat_data;
+ struct stats_fs_source *stats_fs_src;
struct srcu_struct srcu;
struct srcu_struct irq_srcu;
pid_t userspace_pid;
@@ -880,8 +880,8 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu);
void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu);
void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu);
-#ifdef __KVM_HAVE_ARCH_VCPU_DEBUGFS
-void kvm_arch_create_vcpu_debugfs(struct kvm_vcpu *vcpu);
+#ifdef __KVM_HAVE_ARCH_VCPU_STATS_FS
+void kvm_arch_create_vcpu_stats_fs(struct kvm_vcpu *vcpu);
#endif
int kvm_arch_hardware_enable(void);
@@ -1110,33 +1110,14 @@ static inline bool kvm_is_error_gpa(struct kvm *kvm, gpa_t gpa)
return kvm_is_error_hva(hva);
}
-enum kvm_stat_kind {
- KVM_STAT_VM,
- KVM_STAT_VCPU,
-};
-
-struct kvm_stat_data {
- struct kvm *kvm;
- struct kvm_stats_debugfs_item *dbgfs_item;
-};
-
-struct kvm_stats_debugfs_item {
- const char *name;
- int offset;
- enum kvm_stat_kind kind;
- int mode;
-};
-
-#define KVM_DBGFS_GET_MODE(dbgfs_item) \
- ((dbgfs_item)->mode ? (dbgfs_item)->mode : 0644)
-
#define VM_STAT(n, x, ...) \
- { n, offsetof(struct kvm, stat.x), KVM_STAT_VM, ## __VA_ARGS__ }
+ { n, offsetof(struct kvm, stat.x), STATS_FS_U64, STATS_FS_SUM, ## __VA_ARGS__ }
#define VCPU_STAT(n, x, ...) \
- { n, offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU, ## __VA_ARGS__ }
+ { n, offsetof(struct kvm_vcpu, stat.x), STATS_FS_U64, STATS_FS_SUM, ## __VA_ARGS__ }
-extern struct kvm_stats_debugfs_item debugfs_entries[];
-extern struct dentry *kvm_debugfs_dir;
+extern struct stats_fs_value stats_fs_vcpu_entries[];
+extern struct stats_fs_value stats_fs_vm_entries[];
+extern struct stats_fs_source *kvm_stats_fs_dir;
#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
static inline int mmu_notifier_retry(struct kvm *kvm, unsigned long mmu_seq)
diff --git a/virt/kvm/arm/arm.c b/virt/kvm/arm/arm.c
index 48d0ec44ad77..4171f92fa473 100644
--- a/virt/kvm/arm/arm.c
+++ b/virt/kvm/arm/arm.c
@@ -140,7 +140,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
return ret;
}
-int kvm_arch_create_vcpu_debugfs(struct kvm_vcpu *vcpu)
+int kvm_arch_create_vcpu_stats_fs(struct kvm_vcpu *vcpu)
{
return 0;
}
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 74bdb7bf3295..093150125bc2 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -25,6 +25,7 @@
#include <linux/vmalloc.h>
#include <linux/reboot.h>
#include <linux/debugfs.h>
+#include <linux/stats_fs.h>
#include <linux/highmem.h>
#include <linux/file.h>
#include <linux/syscore_ops.h>
@@ -109,11 +110,8 @@ static struct kmem_cache *kvm_vcpu_cache;
static __read_mostly struct preempt_ops kvm_preempt_ops;
static DEFINE_PER_CPU(struct kvm_vcpu *, kvm_running_vcpu);
-struct dentry *kvm_debugfs_dir;
-EXPORT_SYMBOL_GPL(kvm_debugfs_dir);
-
-static int kvm_debugfs_num_entries;
-static const struct file_operations stat_fops_per_vm;
+struct stats_fs_source *kvm_stats_fs_dir;
+EXPORT_SYMBOL_GPL(kvm_stats_fs_dir);
static long kvm_vcpu_ioctl(struct file *file, unsigned int ioctl,
unsigned long arg);
@@ -356,6 +354,8 @@ static void kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
void kvm_vcpu_destroy(struct kvm_vcpu *vcpu)
{
+ stats_fs_source_revoke(vcpu->stats_fs_src);
+ stats_fs_source_put(vcpu->stats_fs_src);
kvm_arch_vcpu_destroy(vcpu);
/*
@@ -601,52 +601,27 @@ static void kvm_free_memslots(struct kvm *kvm, struct kvm_memslots *slots)
kvfree(slots);
}
-static void kvm_destroy_vm_debugfs(struct kvm *kvm)
+static void kvm_destroy_vm_stats_fs(struct kvm *kvm)
{
- int i;
-
- if (!kvm->debugfs_dentry)
- return;
-
- debugfs_remove_recursive(kvm->debugfs_dentry);
-
- if (kvm->debugfs_stat_data) {
- for (i = 0; i < kvm_debugfs_num_entries; i++)
- kfree(kvm->debugfs_stat_data[i]);
- kfree(kvm->debugfs_stat_data);
- }
+ stats_fs_source_remove_subordinate(kvm_stats_fs_dir, kvm->stats_fs_src);
+ stats_fs_source_revoke(kvm->stats_fs_src);
+ stats_fs_source_put(kvm->stats_fs_src);
}
-static int kvm_create_vm_debugfs(struct kvm *kvm, int fd)
+static int kvm_create_vm_stats_fs(struct kvm *kvm, int fd)
{
char dir_name[ITOA_MAX_LEN * 2];
- struct kvm_stat_data *stat_data;
- struct kvm_stats_debugfs_item *p;
- if (!debugfs_initialized())
+ if (!stats_fs_initialized())
return 0;
snprintf(dir_name, sizeof(dir_name), "%d-%d", task_pid_nr(current), fd);
- kvm->debugfs_dentry = debugfs_create_dir(dir_name, kvm_debugfs_dir);
+ kvm->stats_fs_src = stats_fs_source_create(dir_name);
+ stats_fs_source_add_subordinate(kvm_stats_fs_dir, kvm->stats_fs_src);
- kvm->debugfs_stat_data = kcalloc(kvm_debugfs_num_entries,
- sizeof(*kvm->debugfs_stat_data),
- GFP_KERNEL_ACCOUNT);
- if (!kvm->debugfs_stat_data)
- return -ENOMEM;
+ stats_fs_source_add_values(kvm->stats_fs_src, stats_fs_vm_entries, kvm);
- for (p = debugfs_entries; p->name; p++) {
- stat_data = kzalloc(sizeof(*stat_data), GFP_KERNEL_ACCOUNT);
- if (!stat_data)
- return -ENOMEM;
-
- stat_data->kvm = kvm;
- stat_data->dbgfs_item = p;
- kvm->debugfs_stat_data[p - debugfs_entries] = stat_data;
- debugfs_create_file(p->name, KVM_DBGFS_GET_MODE(p),
- kvm->debugfs_dentry, stat_data,
- &stat_fops_per_vm);
- }
+ stats_fs_source_add_values(kvm->stats_fs_src, stats_fs_vcpu_entries, NULL);
return 0;
}
@@ -783,7 +758,7 @@ static void kvm_destroy_vm(struct kvm *kvm)
struct mm_struct *mm = kvm->mm;
kvm_uevent_notify_change(KVM_EVENT_DESTROY_VM, kvm);
- kvm_destroy_vm_debugfs(kvm);
+ kvm_destroy_vm_stats_fs(kvm);
kvm_arch_sync_events(kvm);
mutex_lock(&kvm_lock);
list_del(&kvm->vm_list);
@@ -2946,7 +2921,6 @@ static int kvm_vcpu_release(struct inode *inode, struct file *filp)
{
struct kvm_vcpu *vcpu = filp->private_data;
- debugfs_remove_recursive(vcpu->debugfs_dentry);
kvm_put_kvm(vcpu->kvm);
return 0;
}
@@ -2970,19 +2944,22 @@ static int create_vcpu_fd(struct kvm_vcpu *vcpu)
return anon_inode_getfd(name, &kvm_vcpu_fops, vcpu, O_RDWR | O_CLOEXEC);
}
-static void kvm_create_vcpu_debugfs(struct kvm_vcpu *vcpu)
+static void kvm_create_vcpu_stats_fs(struct kvm_vcpu *vcpu)
{
-#ifdef __KVM_HAVE_ARCH_VCPU_DEBUGFS
char dir_name[ITOA_MAX_LEN * 2];
- if (!debugfs_initialized())
+ if (!stats_fs_initialized())
return;
snprintf(dir_name, sizeof(dir_name), "vcpu%d", vcpu->vcpu_id);
- vcpu->debugfs_dentry = debugfs_create_dir(dir_name,
- vcpu->kvm->debugfs_dentry);
- kvm_arch_create_vcpu_debugfs(vcpu);
+ vcpu->stats_fs_src = stats_fs_source_create(dir_name);
+ stats_fs_source_add_subordinate(vcpu->kvm->stats_fs_src, vcpu->stats_fs_src);
+
+ stats_fs_source_add_values(vcpu->stats_fs_src, stats_fs_vcpu_entries, vcpu);
+
+#ifdef __KVM_HAVE_ARCH_VCPU_STATS_FS
+ kvm_arch_create_vcpu_stats_fs(vcpu);
#endif
}
@@ -3031,8 +3008,6 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id)
if (r)
goto vcpu_free_run_page;
- kvm_create_vcpu_debugfs(vcpu);
-
mutex_lock(&kvm->lock);
if (kvm_get_vcpu_by_id(kvm, id)) {
r = -EEXIST;
@@ -3061,11 +3036,11 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id)
mutex_unlock(&kvm->lock);
kvm_arch_vcpu_postcreate(vcpu);
+ kvm_create_vcpu_stats_fs(vcpu);
return r;
unlock_vcpu_destroy:
mutex_unlock(&kvm->lock);
- debugfs_remove_recursive(vcpu->debugfs_dentry);
kvm_arch_vcpu_destroy(vcpu);
vcpu_free_run_page:
free_page((unsigned long)vcpu->run);
@@ -3839,7 +3814,7 @@ static int kvm_dev_ioctl_create_vm(unsigned long type)
* cases it will be called by the final fput(file) and will take
* care of doing kvm_put_kvm(kvm).
*/
- if (kvm_create_vm_debugfs(kvm, r) < 0) {
+ if (kvm_create_vm_stats_fs(kvm, r) < 0) {
put_unused_fd(r);
fput(file);
return -ENOMEM;
@@ -4295,214 +4270,6 @@ struct kvm_io_device *kvm_io_bus_get_dev(struct kvm *kvm, enum kvm_bus bus_idx,
}
EXPORT_SYMBOL_GPL(kvm_io_bus_get_dev);
-static int kvm_debugfs_open(struct inode *inode, struct file *file,
- int (*get)(void *, u64 *), int (*set)(void *, u64),
- const char *fmt)
-{
- struct kvm_stat_data *stat_data = (struct kvm_stat_data *)
- inode->i_private;
-
- /* The debugfs files are a reference to the kvm struct which
- * is still valid when kvm_destroy_vm is called.
- * To avoid the race between open and the removal of the debugfs
- * directory we test against the users count.
- */
- if (!refcount_inc_not_zero(&stat_data->kvm->users_count))
- return -ENOENT;
-
- if (simple_attr_open(inode, file, get,
- KVM_DBGFS_GET_MODE(stat_data->dbgfs_item) & 0222
- ? set : NULL,
- fmt)) {
- kvm_put_kvm(stat_data->kvm);
- return -ENOMEM;
- }
-
- return 0;
-}
-
-static int kvm_debugfs_release(struct inode *inode, struct file *file)
-{
- struct kvm_stat_data *stat_data = (struct kvm_stat_data *)
- inode->i_private;
-
- simple_attr_release(inode, file);
- kvm_put_kvm(stat_data->kvm);
-
- return 0;
-}
-
-static int kvm_get_stat_per_vm(struct kvm *kvm, size_t offset, u64 *val)
-{
- *val = *(ulong *)((void *)kvm + offset);
-
- return 0;
-}
-
-static int kvm_clear_stat_per_vm(struct kvm *kvm, size_t offset)
-{
- *(ulong *)((void *)kvm + offset) = 0;
-
- return 0;
-}
-
-static int kvm_get_stat_per_vcpu(struct kvm *kvm, size_t offset, u64 *val)
-{
- int i;
- struct kvm_vcpu *vcpu;
-
- *val = 0;
-
- kvm_for_each_vcpu(i, vcpu, kvm)
- *val += *(u64 *)((void *)vcpu + offset);
-
- return 0;
-}
-
-static int kvm_clear_stat_per_vcpu(struct kvm *kvm, size_t offset)
-{
- int i;
- struct kvm_vcpu *vcpu;
-
- kvm_for_each_vcpu(i, vcpu, kvm)
- *(u64 *)((void *)vcpu + offset) = 0;
-
- return 0;
-}
-
-static int kvm_stat_data_get(void *data, u64 *val)
-{
- int r = -EFAULT;
- struct kvm_stat_data *stat_data = (struct kvm_stat_data *)data;
-
- switch (stat_data->dbgfs_item->kind) {
- case KVM_STAT_VM:
- r = kvm_get_stat_per_vm(stat_data->kvm,
- stat_data->dbgfs_item->offset, val);
- break;
- case KVM_STAT_VCPU:
- r = kvm_get_stat_per_vcpu(stat_data->kvm,
- stat_data->dbgfs_item->offset, val);
- break;
- }
-
- return r;
-}
-
-static int kvm_stat_data_clear(void *data, u64 val)
-{
- int r = -EFAULT;
- struct kvm_stat_data *stat_data = (struct kvm_stat_data *)data;
-
- if (val)
- return -EINVAL;
-
- switch (stat_data->dbgfs_item->kind) {
- case KVM_STAT_VM:
- r = kvm_clear_stat_per_vm(stat_data->kvm,
- stat_data->dbgfs_item->offset);
- break;
- case KVM_STAT_VCPU:
- r = kvm_clear_stat_per_vcpu(stat_data->kvm,
- stat_data->dbgfs_item->offset);
- break;
- }
-
- return r;
-}
-
-static int kvm_stat_data_open(struct inode *inode, struct file *file)
-{
- __simple_attr_check_format("%llu\n", 0ull);
- return kvm_debugfs_open(inode, file, kvm_stat_data_get,
- kvm_stat_data_clear, "%llu\n");
-}
-
-static const struct file_operations stat_fops_per_vm = {
- .owner = THIS_MODULE,
- .open = kvm_stat_data_open,
- .release = kvm_debugfs_release,
- .read = simple_attr_read,
- .write = simple_attr_write,
- .llseek = no_llseek,
-};
-
-static int vm_stat_get(void *_offset, u64 *val)
-{
- unsigned offset = (long)_offset;
- struct kvm *kvm;
- u64 tmp_val;
-
- *val = 0;
- mutex_lock(&kvm_lock);
- list_for_each_entry(kvm, &vm_list, vm_list) {
- kvm_get_stat_per_vm(kvm, offset, &tmp_val);
- *val += tmp_val;
- }
- mutex_unlock(&kvm_lock);
- return 0;
-}
-
-static int vm_stat_clear(void *_offset, u64 val)
-{
- unsigned offset = (long)_offset;
- struct kvm *kvm;
-
- if (val)
- return -EINVAL;
-
- mutex_lock(&kvm_lock);
- list_for_each_entry(kvm, &vm_list, vm_list) {
- kvm_clear_stat_per_vm(kvm, offset);
- }
- mutex_unlock(&kvm_lock);
-
- return 0;
-}
-
-DEFINE_SIMPLE_ATTRIBUTE(vm_stat_fops, vm_stat_get, vm_stat_clear, "%llu\n");
-
-static int vcpu_stat_get(void *_offset, u64 *val)
-{
- unsigned offset = (long)_offset;
- struct kvm *kvm;
- u64 tmp_val;
-
- *val = 0;
- mutex_lock(&kvm_lock);
- list_for_each_entry(kvm, &vm_list, vm_list) {
- kvm_get_stat_per_vcpu(kvm, offset, &tmp_val);
- *val += tmp_val;
- }
- mutex_unlock(&kvm_lock);
- return 0;
-}
-
-static int vcpu_stat_clear(void *_offset, u64 val)
-{
- unsigned offset = (long)_offset;
- struct kvm *kvm;
-
- if (val)
- return -EINVAL;
-
- mutex_lock(&kvm_lock);
- list_for_each_entry(kvm, &vm_list, vm_list) {
- kvm_clear_stat_per_vcpu(kvm, offset);
- }
- mutex_unlock(&kvm_lock);
-
- return 0;
-}
-
-DEFINE_SIMPLE_ATTRIBUTE(vcpu_stat_fops, vcpu_stat_get, vcpu_stat_clear,
- "%llu\n");
-
-static const struct file_operations *stat_fops[] = {
- [KVM_STAT_VCPU] = &vcpu_stat_fops,
- [KVM_STAT_VM] = &vm_stat_fops,
-};
-
static void kvm_uevent_notify_change(unsigned int type, struct kvm *kvm)
{
struct kobj_uevent_env *env;
@@ -4537,34 +4304,32 @@ static void kvm_uevent_notify_change(unsigned int type, struct kvm *kvm)
}
add_uevent_var(env, "PID=%d", kvm->userspace_pid);
- if (!IS_ERR_OR_NULL(kvm->debugfs_dentry)) {
+ if (!IS_ERR_OR_NULL(kvm->stats_fs_src->source_dentry)) {
char *tmp, *p = kmalloc(PATH_MAX, GFP_KERNEL_ACCOUNT);
if (p) {
- tmp = dentry_path_raw(kvm->debugfs_dentry, p, PATH_MAX);
+ tmp = dentry_path_raw(kvm->stats_fs_src->source_dentry, p, PATH_MAX);
if (!IS_ERR(tmp))
add_uevent_var(env, "STATS_PATH=%s", tmp);
kfree(p);
}
}
+
/* no need for checks, since we are adding at most only 5 keys */
env->envp[env->envp_idx++] = NULL;
kobject_uevent_env(&kvm_dev.this_device->kobj, KOBJ_CHANGE, env->envp);
kfree(env);
}
-static void kvm_init_debug(void)
+static void kvm_init_stats_fs(void)
{
- struct kvm_stats_debugfs_item *p;
+ kvm_stats_fs_dir = stats_fs_source_create("kvm");
+ /* symlink to debugfs */
+ debugfs_create_symlink("kvm", NULL, "/sys/kernel/stats_fs/kvm");
+ stats_fs_source_register(kvm_stats_fs_dir);
- kvm_debugfs_dir = debugfs_create_dir("kvm", NULL);
-
- kvm_debugfs_num_entries = 0;
- for (p = debugfs_entries; p->name; ++p, kvm_debugfs_num_entries++) {
- debugfs_create_file(p->name, KVM_DBGFS_GET_MODE(p),
- kvm_debugfs_dir, (void *)(long)p->offset,
- stat_fops[p->kind]);
- }
+ stats_fs_source_add_values(kvm_stats_fs_dir, stats_fs_vcpu_entries, NULL);
+ stats_fs_source_add_values(kvm_stats_fs_dir, stats_fs_vm_entries, NULL);
}
static int kvm_suspend(void)
@@ -4738,7 +4503,7 @@ int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align,
kvm_preempt_ops.sched_in = kvm_sched_in;
kvm_preempt_ops.sched_out = kvm_sched_out;
- kvm_init_debug();
+ kvm_init_stats_fs();
r = kvm_vfio_ops_init();
WARN_ON(r);
@@ -4767,7 +4532,8 @@ EXPORT_SYMBOL_GPL(kvm_init);
void kvm_exit(void)
{
- debugfs_remove_recursive(kvm_debugfs_dir);
+ stats_fs_source_revoke(kvm_stats_fs_dir);
+ stats_fs_source_put(kvm_stats_fs_dir);
misc_deregister(&kvm_dev);
kmem_cache_destroy(kvm_vcpu_cache);
kvm_async_pf_deinit();
--
2.25.2
^ permalink raw reply related
* [PATCH v2 4/5] stats_fs fs: virtual fs to show stats to the end-user
From: Emanuele Giuseppe Esposito @ 2020-05-04 11:03 UTC (permalink / raw)
To: kvm
Cc: Emanuele Giuseppe Esposito, linux-s390, David Hildenbrand,
Cornelia Huck, Emanuele Giuseppe Esposito, linux-kernel, kvm-ppc,
linux-mips, Christian Borntraeger, Alexander Viro, linux-fsdevel,
Paolo Bonzini, Vitaly Kuznetsov, linuxppc-dev, Jim Mattson
In-Reply-To: <20200504110344.17560-1-eesposit@redhat.com>
Add virtual fs that maps stats_fs sources with directories, and values
(simple or aggregates) to files.
Every time a file is read/cleared, the fs internally invokes the stats_fs
API to get/set the requested value.
fs/stats_fs/inode.c is pretty much similar to what is done in
fs/debugfs/inode.c, with the exception that the API is only
composed by stats_fs_create_file, stats_fs_create_dir and stats_fs_remove.
Signed-off-by: Emanuele Giuseppe Esposito <eesposit@redhat.com>
---
fs/stats_fs/Makefile | 2 +-
fs/stats_fs/inode.c | 337 +++++++++++++++++++++++++++++++++++++
fs/stats_fs/internal.h | 15 ++
fs/stats_fs/stats_fs.c | 163 ++++++++++++++++++
include/linux/stats_fs.h | 15 ++
include/uapi/linux/magic.h | 1 +
tools/lib/api/fs/fs.c | 21 +++
7 files changed, 553 insertions(+), 1 deletion(-)
create mode 100644 fs/stats_fs/inode.c
diff --git a/fs/stats_fs/Makefile b/fs/stats_fs/Makefile
index 9db130fac6b6..ac12c27545f6 100644
--- a/fs/stats_fs/Makefile
+++ b/fs/stats_fs/Makefile
@@ -1,5 +1,5 @@
# SPDX-License-Identifier: GPL-2.0-only
-stats_fs-objs := stats_fs.o
+stats_fs-objs := inode.o stats_fs.o
stats_fs-tests-objs := stats_fs-tests.o
obj-$(CONFIG_STATS_FS) += stats_fs.o
diff --git a/fs/stats_fs/inode.c b/fs/stats_fs/inode.c
new file mode 100644
index 000000000000..865ee91656ba
--- /dev/null
+++ b/fs/stats_fs/inode.c
@@ -0,0 +1,337 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * inode.c - part of stats_fs, a tiny little stats_fs file system
+ *
+ * Copyright (C) 2020 Emanuele Giuseppe Esposito <eesposit@redhat.com>
+ * Copyright (C) 2020 Redhat
+ */
+#define pr_fmt(fmt) "stats_fs: " fmt
+
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/mount.h>
+#include <linux/init.h>
+#include <linux/stats_fs.h>
+#include <linux/string.h>
+#include <linux/seq_file.h>
+#include <linux/parser.h>
+#include <linux/magic.h>
+#include <linux/slab.h>
+
+#include "internal.h"
+
+#define STATS_FS_DEFAULT_MODE 0700
+
+static struct simple_fs stats_fs;
+static bool stats_fs_registered;
+
+struct stats_fs_mount_opts {
+ kuid_t uid;
+ kgid_t gid;
+ umode_t mode;
+};
+
+enum {
+ Opt_uid,
+ Opt_gid,
+ Opt_mode,
+ Opt_err
+};
+
+static const match_table_t tokens = {
+ {Opt_uid, "uid=%u"},
+ {Opt_gid, "gid=%u"},
+ {Opt_mode, "mode=%o"},
+ {Opt_err, NULL}
+};
+
+struct stats_fs_fs_info {
+ struct stats_fs_mount_opts mount_opts;
+};
+
+static int stats_fs_parse_options(char *data, struct stats_fs_mount_opts *opts)
+{
+ substring_t args[MAX_OPT_ARGS];
+ int option;
+ int token;
+ kuid_t uid;
+ kgid_t gid;
+ char *p;
+
+ opts->mode = STATS_FS_DEFAULT_MODE;
+
+ while ((p = strsep(&data, ",")) != NULL) {
+ if (!*p)
+ continue;
+
+ token = match_token(p, tokens, args);
+ switch (token) {
+ case Opt_uid:
+ if (match_int(&args[0], &option))
+ return -EINVAL;
+ uid = make_kuid(current_user_ns(), option);
+ if (!uid_valid(uid))
+ return -EINVAL;
+ opts->uid = uid;
+ break;
+ case Opt_gid:
+ if (match_int(&args[0], &option))
+ return -EINVAL;
+ gid = make_kgid(current_user_ns(), option);
+ if (!gid_valid(gid))
+ return -EINVAL;
+ opts->gid = gid;
+ break;
+ case Opt_mode:
+ if (match_octal(&args[0], &option))
+ return -EINVAL;
+ opts->mode = option & S_IALLUGO;
+ break;
+ /*
+ * We might like to report bad mount options here;
+ * but traditionally stats_fs has ignored all mount options
+ */
+ }
+ }
+
+ return 0;
+}
+
+static int stats_fs_apply_options(struct super_block *sb)
+{
+ struct stats_fs_fs_info *fsi = sb->s_fs_info;
+ struct inode *inode = d_inode(sb->s_root);
+ struct stats_fs_mount_opts *opts = &fsi->mount_opts;
+
+ inode->i_mode &= ~S_IALLUGO;
+ inode->i_mode |= opts->mode;
+
+ inode->i_uid = opts->uid;
+ inode->i_gid = opts->gid;
+
+ return 0;
+}
+
+static int stats_fs_remount(struct super_block *sb, int *flags, char *data)
+{
+ int err;
+ struct stats_fs_fs_info *fsi = sb->s_fs_info;
+
+ sync_filesystem(sb);
+ err = stats_fs_parse_options(data, &fsi->mount_opts);
+ if (err)
+ goto fail;
+
+ stats_fs_apply_options(sb);
+
+fail:
+ return err;
+}
+
+static int stats_fs_show_options(struct seq_file *m, struct dentry *root)
+{
+ struct stats_fs_fs_info *fsi = root->d_sb->s_fs_info;
+ struct stats_fs_mount_opts *opts = &fsi->mount_opts;
+
+ if (!uid_eq(opts->uid, GLOBAL_ROOT_UID))
+ seq_printf(m, ",uid=%u",
+ from_kuid_munged(&init_user_ns, opts->uid));
+ if (!gid_eq(opts->gid, GLOBAL_ROOT_GID))
+ seq_printf(m, ",gid=%u",
+ from_kgid_munged(&init_user_ns, opts->gid));
+ if (opts->mode != STATS_FS_DEFAULT_MODE)
+ seq_printf(m, ",mode=%o", opts->mode);
+
+ return 0;
+}
+
+
+static void stats_fs_free_inode(struct inode *inode)
+{
+ kfree(inode->i_private);
+ free_inode_nonrcu(inode);
+}
+
+static const struct super_operations stats_fs_super_operations = {
+ .statfs = simple_statfs,
+ .remount_fs = stats_fs_remount,
+ .show_options = stats_fs_show_options,
+ .free_inode = stats_fs_free_inode,
+};
+
+static int stats_fs_fill_super(struct super_block *sb, void *data, int silent)
+{
+ static const struct tree_descr stats_fs_files[] = {{""}};
+ struct stats_fs_fs_info *fsi;
+ int err;
+
+ fsi = kzalloc(sizeof(struct stats_fs_fs_info), GFP_KERNEL);
+ sb->s_fs_info = fsi;
+ if (!fsi) {
+ err = -ENOMEM;
+ goto fail;
+ }
+
+ err = stats_fs_parse_options(data, &fsi->mount_opts);
+ if (err)
+ goto fail;
+
+ err = simple_fill_super(sb, STATSFS_MAGIC, stats_fs_files);
+ if (err)
+ goto fail;
+
+ sb->s_op = &stats_fs_super_operations;
+
+ stats_fs_apply_options(sb);
+
+ return 0;
+
+fail:
+ kfree(fsi);
+ sb->s_fs_info = NULL;
+ return err;
+}
+
+static struct dentry *stats_fs_mount(struct file_system_type *fs_type,
+ int flags, const char *dev_name,
+ void *data)
+{
+ return mount_single(fs_type, flags, data, stats_fs_fill_super);
+}
+
+static struct file_system_type stats_fs_fs_type = {
+ .owner = THIS_MODULE,
+ .name = "statsfs",
+ .mount = stats_fs_mount,
+ .kill_sb = kill_litter_super,
+};
+MODULE_ALIAS_FS("statsfs");
+
+
+/**
+ * stats_fs_create_file - create a file in the stats_fs filesystem
+ * @val: a pointer to a stats_fs_value containing all the infos of
+ * the file to create (name, permission)
+ * @src: a pointer to a stats_fs_source containing the dentry of where
+ * to add this file
+ *
+ * This function will return a pointer to a dentry if it succeeds. This
+ * pointer must be passed to the stats_fs_remove() function when the file is
+ * to be removed (no automatic cleanup happens if your module is unloaded,
+ * you are responsible here.) If an error occurs, ERR_PTR(-ERROR) will be
+ * returned.
+ *
+ * Val and src will be also inglobated in a ststsfs_data_inode struct
+ * that will be internally stored as inode->i_private and used in the
+ * get/set attribute functions (see stats_fs_ops in stats_fs.c).
+ */
+struct dentry *stats_fs_create_file(struct stats_fs_value *val, struct stats_fs_source *src)
+{
+ struct dentry *dentry;
+ struct inode *inode;
+ struct stats_fs_data_inode *val_inode;
+
+ val_inode = kzalloc(sizeof(struct stats_fs_data_inode), GFP_KERNEL);
+ if (!val_inode) {
+ printk(KERN_ERR
+ "Kzalloc failure in stats_fs_create_files (ENOMEM)\n");
+ return ERR_PTR(-ENOMEM);
+ }
+
+ val_inode->src = src;
+ val_inode->val = val;
+
+
+ dentry = simplefs_create_file(&stats_fs, &stats_fs_fs_type,
+ val->name, stats_fs_val_get_mode(val),
+ src->source_dentry, val_inode, &inode);
+ if (IS_ERR(dentry))
+ return dentry;
+
+ inode->i_fop = &stats_fs_ops;
+
+ return simplefs_finish_dentry(dentry, inode);
+}
+/**
+ * stats_fs_create_dir - create a directory in the stats_fs filesystem
+ * @name: a pointer to a string containing the name of the directory to
+ * create.
+ * @parent: a pointer to the parent dentry for this file. This should be a
+ * directory dentry if set. If this parameter is NULL, then the
+ * directory will be created in the root of the stats_fs filesystem.
+ *
+ * This function creates a directory in stats_fs with the given name.
+ *
+ * This function will return a pointer to a dentry if it succeeds. This
+ * pointer must be passed to the stats_fs_remove() function when the file is
+ * to be removed (no automatic cleanup happens if your module is unloaded,
+ * you are responsible here.) If an error occurs, ERR_PTR(-ERROR) will be
+ * returned.
+ */
+struct dentry *stats_fs_create_dir(const char *name, struct dentry *parent)
+{
+ struct dentry *dentry;
+ struct inode *inode;
+
+ dentry = simplefs_create_dir(&stats_fs, &stats_fs_fs_type,
+ name, 0755, parent, &inode);
+ if (IS_ERR(dentry))
+ return dentry;
+
+ inode->i_op = &simple_dir_inode_operations;
+ return simplefs_finish_dentry(dentry, inode);
+}
+
+static void remove_one(struct dentry *victim)
+{
+ simple_release_fs(&stats_fs);
+}
+
+/**
+ * stats_fs_remove - recursively removes a directory
+ * @dentry: a pointer to a the dentry of the directory to be removed. If this
+ * parameter is NULL or an error value, nothing will be done.
+ *
+ * This function recursively removes a directory tree in stats_fs that
+ * was previously created with a call to another stats_fs function
+ * (like stats_fs_create_file() or variants thereof.)
+ *
+ * This function is required to be called in order for the file to be
+ * removed, no automatic cleanup of files will happen when a module is
+ * removed, you are responsible here.
+ */
+void stats_fs_remove(struct dentry *dentry)
+{
+ if (IS_ERR_OR_NULL(dentry))
+ return;
+
+ simple_pin_fs(&stats_fs, &stats_fs_fs_type);
+ simple_recursive_removal(dentry, remove_one);
+ simple_release_fs(&stats_fs);
+}
+/**
+ * stats_fs_initialized - Tells whether stats_fs has been registered
+ */
+bool stats_fs_initialized(void)
+{
+ return stats_fs_registered;
+}
+EXPORT_SYMBOL_GPL(stats_fs_initialized);
+
+static int __init stats_fs_init(void)
+{
+ int retval;
+
+ retval = sysfs_create_mount_point(kernel_kobj, "statsfs");
+ if (retval)
+ return retval;
+
+ retval = register_filesystem(&stats_fs_fs_type);
+ if (retval)
+ sysfs_remove_mount_point(kernel_kobj, "statsfs");
+ else
+ stats_fs_registered = true;
+
+ return retval;
+}
+core_initcall(stats_fs_init);
diff --git a/fs/stats_fs/internal.h b/fs/stats_fs/internal.h
index ddf262a60736..1f7bb1da6c3c 100644
--- a/fs/stats_fs/internal.h
+++ b/fs/stats_fs/internal.h
@@ -15,6 +15,21 @@ struct stats_fs_value_source {
struct list_head list_element;
};
+struct stats_fs_data_inode {
+ struct stats_fs_source *src;
+ struct stats_fs_value *val;
+};
+
+extern const struct file_operations stats_fs_ops;
+
+struct dentry *stats_fs_create_file(struct stats_fs_value *val,
+ struct stats_fs_source *src);
+
+struct dentry *stats_fs_create_dir(const char *name, struct dentry *parent);
+
+void stats_fs_remove(struct dentry *dentry);
+#define stats_fs_remove_recursive stats_fs_remove
+
int stats_fs_val_get_mode(struct stats_fs_value *val);
#endif /* _STATS_FS_INTERNAL_H_ */
diff --git a/fs/stats_fs/stats_fs.c b/fs/stats_fs/stats_fs.c
index b63de12769e2..4ac6fe1ec62e 100644
--- a/fs/stats_fs/stats_fs.c
+++ b/fs/stats_fs/stats_fs.c
@@ -17,16 +17,114 @@ struct stats_fs_aggregate_value {
uint32_t count, count_zero;
};
+static void stats_fs_source_remove_files(struct stats_fs_source *src);
+
static int is_val_signed(struct stats_fs_value *val)
{
return val->type & STATS_FS_SIGN;
}
+static int stats_fs_attr_get(void *data, u64 *val)
+{
+ int r = -EFAULT;
+ struct stats_fs_data_inode *val_inode =
+ (struct stats_fs_data_inode *)data;
+
+ r = stats_fs_source_get_value(val_inode->src, val_inode->val, val);
+ return r;
+}
+
+static int stats_fs_attr_clear(void *data, u64 val)
+{
+ int r = -EFAULT;
+ struct stats_fs_data_inode *val_inode =
+ (struct stats_fs_data_inode *)data;
+
+ if (val)
+ return -EINVAL;
+
+ r = stats_fs_source_clear(val_inode->src, val_inode->val);
+ return r;
+}
+
int stats_fs_val_get_mode(struct stats_fs_value *val)
{
return val->mode ? val->mode : 0644;
}
+static int stats_fs_attr_data_open(struct inode *inode, struct file *file)
+{
+ struct stats_fs_data_inode *val_inode;
+ char *fmt;
+
+ val_inode = (struct stats_fs_data_inode *)inode->i_private;
+
+ /* Inodes hold a pointer to the source which is not included in the
+ * refcount, so they files be opened while destroy is running, but
+ * values are removed (base_addr = NULL) before the source is destroyed.
+ */
+ if (!kref_get_unless_zero(&val_inode->src->refcount))
+ return -ENOENT;
+
+ if (is_val_signed(val_inode->val))
+ fmt = "%lld\n";
+ else
+ fmt = "%llu\n";
+
+ if (simple_attr_open(inode, file, stats_fs_attr_get,
+ stats_fs_val_get_mode(val_inode->val) & 0222 ?
+ stats_fs_attr_clear :
+ NULL,
+ fmt)) {
+ stats_fs_source_put(val_inode->src);
+ return -ENOMEM;
+ }
+ return 0;
+}
+
+static int stats_fs_attr_release(struct inode *inode, struct file *file)
+{
+ struct stats_fs_data_inode *val_inode;
+
+ val_inode = (struct stats_fs_data_inode *)inode->i_private;
+
+ simple_attr_release(inode, file);
+ stats_fs_source_put(val_inode->src);
+
+ return 0;
+}
+
+const struct file_operations stats_fs_ops = {
+ .owner = THIS_MODULE,
+ .open = stats_fs_attr_data_open,
+ .release = stats_fs_attr_release,
+ .read = simple_attr_read,
+ .write = simple_attr_write,
+ .llseek = no_llseek,
+};
+
+/* Called with rwsem held for writing */
+static void stats_fs_source_remove_files_locked(struct stats_fs_source *src)
+{
+ struct stats_fs_source *child;
+
+ if (src->source_dentry == NULL)
+ return;
+
+ list_for_each_entry (child, &src->subordinates_head, list_element)
+ stats_fs_source_remove_files(child);
+
+ stats_fs_remove_recursive(src->source_dentry);
+ src->source_dentry = NULL;
+}
+
+static void stats_fs_source_remove_files(struct stats_fs_source *src)
+{
+ down_write(&src->rwsem);
+ stats_fs_source_remove_files_locked(src);
+ up_write(&src->rwsem);
+}
+
static struct stats_fs_value *find_value(struct stats_fs_value_source *src,
struct stats_fs_value *val)
{
@@ -57,6 +155,62 @@ search_value_in_source(struct stats_fs_source *src, struct stats_fs_value *arg,
return NULL;
}
+/* Called with rwsem held for writing */
+static void stats_fs_create_files_locked(struct stats_fs_source *source)
+{
+ struct stats_fs_value_source *val_src;
+ struct stats_fs_value *val;
+
+ if (!source->source_dentry)
+ return;
+
+ list_for_each_entry (val_src, &source->values_head, list_element) {
+ if (val_src->files_created)
+ continue;
+
+ for (val = val_src->values; val->name; val++)
+ stats_fs_create_file(val, source);
+
+ val_src->files_created = true;
+ }
+}
+
+/* Called with rwsem held for writing */
+static void
+stats_fs_create_files_recursive_locked(struct stats_fs_source *source,
+ struct dentry *parent_dentry)
+{
+ struct stats_fs_source *child;
+
+ /* first check values in this folder, since it might be new */
+ if (!source->source_dentry) {
+ source->source_dentry =
+ stats_fs_create_dir(source->name, parent_dentry);
+ }
+
+ stats_fs_create_files_locked(source);
+
+ list_for_each_entry (child, &source->subordinates_head, list_element) {
+ if (child->source_dentry == NULL) {
+ /* assume that if child has a folder,
+ * also the sub-child have that.
+ */
+ down_write(&child->rwsem);
+ stats_fs_create_files_recursive_locked(
+ child, source->source_dentry);
+ up_write(&child->rwsem);
+ }
+ }
+}
+
+void stats_fs_source_register(struct stats_fs_source *source)
+{
+ down_write(&source->rwsem);
+ stats_fs_create_files_recursive_locked(source, NULL);
+ up_write(&source->rwsem);
+}
+EXPORT_SYMBOL_GPL(stats_fs_source_register);
+
/* Called with rwsem held for writing */
static struct stats_fs_value_source *create_value_source(void *base)
{
@@ -93,6 +247,9 @@ int stats_fs_source_add_values(struct stats_fs_source *source,
/* add the val_src to the source list */
list_add(&val_src->list_element, &source->values_head);
+ /* create child if it's the case */
+ stats_fs_create_files_locked(source);
+
up_write(&source->rwsem);
return 0;
@@ -106,6 +263,9 @@ void stats_fs_source_add_subordinate(struct stats_fs_source *source,
stats_fs_source_get(sub);
list_add(&sub->list_element, &source->subordinates_head);
+ if (source->source_dentry)
+ stats_fs_create_files_recursive_locked(sub,
+ source->source_dentry);
up_write(&source->rwsem);
}
@@ -122,6 +282,7 @@ stats_fs_source_remove_subordinate_locked(struct stats_fs_source *source,
list_element) {
if (src_entry == sub) {
list_del_init(&src_entry->list_element);
+ stats_fs_source_remove_files(src_entry);
stats_fs_source_put(src_entry);
return;
}
@@ -565,6 +726,8 @@ static void stats_fs_source_destroy(struct kref *kref_source)
stats_fs_source_remove_subordinate_locked(source, child);
}
+ stats_fs_source_remove_files_locked(source);
+
up_write(&source->rwsem);
kfree(source->name);
kfree(source);
diff --git a/include/linux/stats_fs.h b/include/linux/stats_fs.h
index dc2d2e11f5ea..b04c42d827cf 100644
--- a/include/linux/stats_fs.h
+++ b/include/linux/stats_fs.h
@@ -87,6 +87,18 @@ struct stats_fs_source {
*/
struct stats_fs_source *stats_fs_source_create(const char *fmt, ...);
+/**
+ * stats_fs_source_register - register a source in the stats_fs filesystem
+ * @source: a pointer to the source that will be registered
+ *
+ * Add the given folder as direct child of /sys/kernel/statsfs.
+ * It also starts to recursively search its own child and create all folders
+ * and files if they weren't already. All subsequent add_subordinate calls
+ * on the same source that is used in this function will create corresponding
+ * files and directories.
+ */
+void stats_fs_source_register(struct stats_fs_source *source);
+
/**
* stats_fs_source_add_values - adds values to the given source
* @source: a pointer to the source that will receive the values
@@ -235,6 +247,9 @@ static inline struct stats_fs_source *stats_fs_source_create(const char *fmt,
return ERR_PTR(-ENODEV);
}
+static inline void stats_fs_source_register(struct stats_fs_source *source)
+{ }
+
static inline int stats_fs_source_add_values(struct stats_fs_source *source,
struct stats_fs_value *val,
void *base_ptr)
diff --git a/include/uapi/linux/magic.h b/include/uapi/linux/magic.h
index d78064007b17..46c66ea3fc9e 100644
--- a/include/uapi/linux/magic.h
+++ b/include/uapi/linux/magic.h
@@ -10,6 +10,7 @@
#define CRAMFS_MAGIC 0x28cd3d45 /* some random number */
#define CRAMFS_MAGIC_WEND 0x453dcd28 /* magic number with the wrong endianess */
#define DEBUGFS_MAGIC 0x64626720
+#define STATSFS_MAGIC 0x73746174
#define SECURITYFS_MAGIC 0x73636673
#define SELINUX_MAGIC 0xf97cff8c
#define SMACK_MAGIC 0x43415d53 /* "SMAC" */
diff --git a/tools/lib/api/fs/fs.c b/tools/lib/api/fs/fs.c
index 027b18f7ed8c..6fe306206dfb 100644
--- a/tools/lib/api/fs/fs.c
+++ b/tools/lib/api/fs/fs.c
@@ -35,6 +35,10 @@
#define TRACEFS_MAGIC 0x74726163
#endif
+#ifndef STATSFS_MAGIC
+#define STATSFS_MAGIC 0x73746174
+#endif
+
#ifndef HUGETLBFS_MAGIC
#define HUGETLBFS_MAGIC 0x958458f6
#endif
@@ -76,6 +80,16 @@ static const char * const tracefs__known_mountpoints[] = {
0,
};
+#ifndef STATSFS_DEFAULT_PATH
+#define STATSFS_DEFAULT_PATH "/sys/kernel/statsfs"
+#endif
+
+static const char * const statsfs__known_mountpoints[] = {
+ STATSFS_DEFAULT_PATH,
+ "/statsfs",
+ 0,
+};
+
static const char * const hugetlbfs__known_mountpoints[] = {
0,
};
@@ -100,6 +114,7 @@ enum {
FS__TRACEFS = 3,
FS__HUGETLBFS = 4,
FS__BPF_FS = 5,
+ FS__STATSFS = 6,
};
#ifndef TRACEFS_MAGIC
@@ -127,6 +142,11 @@ static struct fs fs__entries[] = {
.mounts = tracefs__known_mountpoints,
.magic = TRACEFS_MAGIC,
},
+ [FS__STATSFS] = {
+ .name = "statsfs",
+ .mounts = statsfs__known_mountpoints,
+ .magic = STATSFS_MAGIC,
+ },
[FS__HUGETLBFS] = {
.name = "hugetlbfs",
.mounts = hugetlbfs__known_mountpoints,
@@ -297,6 +317,7 @@ FS(sysfs, FS__SYSFS);
FS(procfs, FS__PROCFS);
FS(debugfs, FS__DEBUGFS);
FS(tracefs, FS__TRACEFS);
+FS(statsfs, FS__STATSFS);
FS(hugetlbfs, FS__HUGETLBFS);
FS(bpf_fs, FS__BPF_FS);
--
2.25.2
^ permalink raw reply related
* [PATCH v2 3/5] kunit: tests for stats_fs API
From: Emanuele Giuseppe Esposito @ 2020-05-04 11:03 UTC (permalink / raw)
To: kvm
Cc: Emanuele Giuseppe Esposito, linux-s390, David Hildenbrand,
Cornelia Huck, Emanuele Giuseppe Esposito, linux-kernel, kvm-ppc,
linux-mips, Christian Borntraeger, Alexander Viro, linux-fsdevel,
Paolo Bonzini, Vitaly Kuznetsov, linuxppc-dev, Jim Mattson
In-Reply-To: <20200504110344.17560-1-eesposit@redhat.com>
Add kunit tests to extensively test the stats_fs API functionality.
In order to run them, the kernel .config must set CONFIG_KUNIT=y
and a new .kunitconfig file must be created with CONFIG_STATS_FS=y
and CONFIG_STATS_FS_TEST=y
Tests can be then started by running the following command from the root
directory of the linux kernel source tree:
./tools/testing/kunit/kunit.py run --timeout=30 --jobs=`nproc --all`
Signed-off-by: Emanuele Giuseppe Esposito <eesposit@redhat.com>
---
fs/Kconfig | 6 +
fs/stats_fs/Makefile | 2 +
fs/stats_fs/stats_fs-tests.c | 1088 ++++++++++++++++++++++++++++++++++
3 files changed, 1096 insertions(+)
create mode 100644 fs/stats_fs/stats_fs-tests.c
diff --git a/fs/Kconfig b/fs/Kconfig
index 1b0de0f19e96..0844e8defd22 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -334,4 +334,10 @@ config STATS_FS
stats_fs is a virtual file system that provides counters and
other statistics about the running kernel.
+config STATS_FS_TEST
+ bool "Tests for stats_fs"
+ depends on STATS_FS && KUNIT
+ help
+ tests for the stats_fs API.
+
endmenu
diff --git a/fs/stats_fs/Makefile b/fs/stats_fs/Makefile
index 94fe52d590d5..9db130fac6b6 100644
--- a/fs/stats_fs/Makefile
+++ b/fs/stats_fs/Makefile
@@ -1,4 +1,6 @@
# SPDX-License-Identifier: GPL-2.0-only
stats_fs-objs := stats_fs.o
+stats_fs-tests-objs := stats_fs-tests.o
obj-$(CONFIG_STATS_FS) += stats_fs.o
+obj-$(CONFIG_STATS_FS_TEST) += stats_fs-tests.o
diff --git a/fs/stats_fs/stats_fs-tests.c b/fs/stats_fs/stats_fs-tests.c
new file mode 100644
index 000000000000..46c3fb510ee9
--- /dev/null
+++ b/fs/stats_fs/stats_fs-tests.c
@@ -0,0 +1,1088 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/module.h>
+#include <linux/errno.h>
+#include <linux/file.h>
+#include <linux/fs.h>
+#include <linux/slab.h>
+#include <linux/anon_inodes.h>
+#include <linux/spinlock.h>
+#include <linux/uaccess.h>
+#include <linux/rwsem.h>
+#include <linux/list.h>
+#include <linux/kref.h>
+
+#include <linux/limits.h>
+#include <linux/stats_fs.h>
+#include <kunit/test.h>
+#include "internal.h"
+
+#define STATS_FS_STAT(el, x, ...) \
+ { \
+ .name = #x, .offset = offsetof(struct container, el.x), \
+ ##__VA_ARGS__ \
+ }
+
+#define ARR_SIZE(el) ((int)(sizeof(el) / sizeof(struct stats_fs_value) - 1))
+
+struct test_values_struct {
+ uint64_t u64;
+ int32_t s32;
+ bool bo;
+ uint8_t u8;
+ int16_t s16;
+};
+
+struct container {
+ struct test_values_struct vals;
+};
+
+struct stats_fs_value test_values[6] = {
+ STATS_FS_STAT(vals, u64, .type = STATS_FS_U64,
+ .aggr_kind = STATS_FS_NONE, .mode = 0),
+ STATS_FS_STAT(vals, s32, .type = STATS_FS_S32,
+ .aggr_kind = STATS_FS_NONE, .mode = 0),
+ STATS_FS_STAT(vals, bo, .type = STATS_FS_BOOL,
+ .aggr_kind = STATS_FS_NONE, .mode = 0),
+ STATS_FS_STAT(vals, u8, .type = STATS_FS_U8, .aggr_kind = STATS_FS_NONE,
+ .mode = 0),
+ STATS_FS_STAT(vals, s16, .type = STATS_FS_S16,
+ .aggr_kind = STATS_FS_NONE, .mode = 0),
+ { NULL },
+};
+
+struct stats_fs_value test_aggr[4] = {
+ STATS_FS_STAT(vals, s32, .type = STATS_FS_S32,
+ .aggr_kind = STATS_FS_MIN, .mode = 0),
+ STATS_FS_STAT(vals, bo, .type = STATS_FS_BOOL,
+ .aggr_kind = STATS_FS_MAX, .mode = 0),
+ STATS_FS_STAT(vals, u64, .type = STATS_FS_U64,
+ .aggr_kind = STATS_FS_SUM, .mode = 0),
+ { NULL },
+};
+
+struct stats_fs_value test_same_name[3] = {
+ STATS_FS_STAT(vals, s32, .type = STATS_FS_S32,
+ .aggr_kind = STATS_FS_NONE, .mode = 0),
+ STATS_FS_STAT(vals, s32, .type = STATS_FS_S32,
+ .aggr_kind = STATS_FS_MIN, .mode = 0),
+ { NULL },
+};
+
+struct stats_fs_value test_all_aggr[6] = {
+ STATS_FS_STAT(vals, s32, .type = STATS_FS_S32,
+ .aggr_kind = STATS_FS_MIN, .mode = 0),
+ STATS_FS_STAT(vals, bo, .type = STATS_FS_BOOL,
+ .aggr_kind = STATS_FS_COUNT_ZERO, .mode = 0),
+ STATS_FS_STAT(vals, u64, .type = STATS_FS_U64,
+ .aggr_kind = STATS_FS_SUM, .mode = 0),
+ STATS_FS_STAT(vals, u8, .type = STATS_FS_U8, .aggr_kind = STATS_FS_AVG,
+ .mode = 0),
+ STATS_FS_STAT(vals, s16, .type = STATS_FS_S16,
+ .aggr_kind = STATS_FS_MAX, .mode = 0),
+ { NULL },
+};
+
+#define def_u64 ((uint64_t)64)
+
+#define def_val_s32 ((int32_t)S32_MIN)
+#define def_val_bool ((bool)true)
+#define def_val_u8 ((uint8_t)127)
+#define def_val_s16 ((int16_t)10000)
+
+#define def_val2_s32 ((int32_t)S16_MAX)
+#define def_val2_bool ((bool)false)
+#define def_val2_u8 ((uint8_t)255)
+#define def_val2_s16 ((int16_t)-20000)
+
+struct container cont = {
+ .vals = {
+ .u64 = def_u64,
+ .s32 = def_val_s32,
+ .bo = def_val_bool,
+ .u8 = def_val_u8,
+ .s16 = def_val_s16,
+ },
+};
+
+struct container cont2 = {
+ .vals = {
+ .u64 = def_u64,
+ .s32 = def_val2_s32,
+ .bo = def_val2_bool,
+ .u8 = def_val2_u8,
+ .s16 = def_val2_s16,
+ },
+};
+
+static void get_stats_at_addr(struct stats_fs_source *src, void *addr,
+ int *aggr, int *val, int use_addr)
+{
+ struct stats_fs_value *entry;
+ struct stats_fs_value_source *src_entry;
+ int counter_val = 0, counter_aggr = 0;
+
+ list_for_each_entry (src_entry, &src->values_head, list_element) {
+ if (use_addr && src_entry->base_addr != addr)
+ continue;
+
+ for (entry = src_entry->values; entry->name; entry++) {
+ if (entry->aggr_kind == STATS_FS_NONE)
+ counter_val++;
+ else
+ counter_aggr++;
+ }
+ }
+
+ if (aggr)
+ *aggr = counter_aggr;
+
+ if (val)
+ *val = counter_val;
+}
+
+int source_has_subsource(struct stats_fs_source *src,
+ struct stats_fs_source *sub)
+{
+ struct stats_fs_source *entry;
+
+ list_for_each_entry (entry, &src->subordinates_head, list_element) {
+ if (entry == sub)
+ return 1;
+ }
+ return 0;
+}
+
+int get_number_subsources(struct stats_fs_source *src)
+{
+ struct stats_fs_source *entry;
+ int counter = 0;
+
+ list_for_each_entry (entry, &src->subordinates_head, list_element) {
+ counter++;
+ }
+ return counter;
+}
+
+int get_number_values(struct stats_fs_source *src)
+{
+ int counter = 0;
+
+ get_stats_at_addr(src, NULL, NULL, &counter, 0);
+ return counter;
+}
+
+int get_total_number_values(struct stats_fs_source *src)
+{
+ struct stats_fs_source *sub_entry;
+ int counter = 0;
+
+ get_stats_at_addr(src, NULL, NULL, &counter, 0);
+
+ list_for_each_entry (sub_entry, &src->subordinates_head, list_element) {
+ counter += get_total_number_values(sub_entry);
+ }
+
+ return counter;
+}
+
+int get_number_aggregates(struct stats_fs_source *src)
+{
+ int counter = 0;
+
+ get_stats_at_addr(src, NULL, &counter, NULL, 1);
+ return counter;
+}
+
+int get_number_values_with_base(struct stats_fs_source *src, void *addr)
+{
+ int counter = 0;
+
+ get_stats_at_addr(src, addr, NULL, &counter, 1);
+ return counter;
+}
+
+int get_number_aggr_with_base(struct stats_fs_source *src, void *addr)
+{
+ int counter = 0;
+
+ get_stats_at_addr(src, addr, &counter, NULL, 1);
+ return counter;
+}
+
+static void test_empty_folder(struct kunit *test)
+{
+ struct stats_fs_source *src;
+
+ src = stats_fs_source_create("kvm_%d", 123);
+ KUNIT_EXPECT_EQ(test, strcmp(src->name, "kvm_123"), 0);
+ KUNIT_EXPECT_EQ(test, get_number_subsources(src), 0);
+ KUNIT_EXPECT_EQ(test, get_number_values(src), 0);
+ KUNIT_EXPECT_EQ(test, get_number_aggregates(src), 0);
+ stats_fs_source_put(src);
+}
+
+static void test_add_subfolder(struct kunit *test)
+{
+ struct stats_fs_source *src, *sub;
+
+ src = stats_fs_source_create("parent");
+ sub = stats_fs_source_create("child");
+ stats_fs_source_add_subordinate(src, sub);
+ KUNIT_EXPECT_EQ(test, source_has_subsource(src, sub), true);
+ KUNIT_EXPECT_EQ(test, get_number_subsources(src), 1);
+ KUNIT_EXPECT_EQ(test, get_number_values(src), 0);
+ KUNIT_EXPECT_EQ(test, get_number_aggregates(src), 0);
+ KUNIT_EXPECT_EQ(test, get_number_values(sub), 0);
+ KUNIT_EXPECT_EQ(test, get_number_aggregates(sub), 0);
+ KUNIT_EXPECT_EQ(test, get_total_number_values(src), 0);
+
+ stats_fs_source_put(sub);
+ sub = stats_fs_source_create("not a child");
+ KUNIT_EXPECT_EQ(test, source_has_subsource(src, sub), false);
+ KUNIT_EXPECT_EQ(test, get_number_subsources(src), 1);
+
+ stats_fs_source_put(sub);
+ stats_fs_source_put(src);
+}
+
+static void test_add_value(struct kunit *test)
+{
+ struct stats_fs_source *src;
+ int n;
+
+ src = stats_fs_source_create("parent");
+
+ // add values
+ n = stats_fs_source_add_values(src, test_values, &cont);
+ KUNIT_EXPECT_EQ(test, n, 0);
+ n = get_number_values_with_base(src, &cont);
+ KUNIT_EXPECT_EQ(test, n, ARR_SIZE(test_values));
+
+ // add same values, nothing happens
+ n = stats_fs_source_add_values(src, test_values, &cont);
+ KUNIT_EXPECT_EQ(test, n, -EEXIST);
+ n = get_number_values_with_base(src, &cont);
+ KUNIT_EXPECT_EQ(test, n, ARR_SIZE(test_values));
+
+ // size is invaried
+ KUNIT_EXPECT_EQ(test, get_number_values(src), ARR_SIZE(test_values));
+
+ // no aggregates
+ n = get_number_aggr_with_base(src, &cont);
+ KUNIT_EXPECT_EQ(test, n, 0);
+ KUNIT_EXPECT_EQ(test, get_number_values(src), ARR_SIZE(test_values));
+ KUNIT_EXPECT_EQ(test, get_number_aggregates(src), 0);
+
+ stats_fs_source_put(src);
+}
+
+static void test_add_value_in_subfolder(struct kunit *test)
+{
+ struct stats_fs_source *src, *sub, *sub_not;
+ int n;
+
+ src = stats_fs_source_create("parent");
+ sub = stats_fs_source_create("child");
+
+ // src -> sub
+ stats_fs_source_add_subordinate(src, sub);
+
+ // add values
+ n = stats_fs_source_add_values(sub, test_values, &cont);
+ KUNIT_EXPECT_EQ(test, n, 0);
+ n = get_number_values_with_base(sub, &cont);
+ KUNIT_EXPECT_EQ(test, n, ARR_SIZE(test_values));
+ KUNIT_EXPECT_EQ(test, get_number_values(src), 0);
+ KUNIT_EXPECT_EQ(test, get_number_aggregates(src), 0);
+ KUNIT_EXPECT_EQ(test, get_total_number_values(src),
+ ARR_SIZE(test_values));
+
+ KUNIT_EXPECT_EQ(test, get_number_values(sub), ARR_SIZE(test_values));
+ // no values in sub
+ KUNIT_EXPECT_EQ(test, get_number_aggregates(sub), 0);
+
+ // different folder
+ sub_not = stats_fs_source_create("not a child");
+
+ // add values
+ n = stats_fs_source_add_values(sub_not, test_values, &cont);
+ KUNIT_EXPECT_EQ(test, n, 0);
+ n = get_number_values_with_base(sub_not, &cont);
+ KUNIT_EXPECT_EQ(test, n, ARR_SIZE(test_values));
+ KUNIT_EXPECT_EQ(test, get_number_values(src), 0);
+ KUNIT_EXPECT_EQ(test, get_number_aggregates(src), 0);
+ KUNIT_EXPECT_EQ(test, get_total_number_values(src),
+ ARR_SIZE(test_values));
+
+ // remove sub, check values is 0
+ stats_fs_source_remove_subordinate(src, sub);
+ KUNIT_EXPECT_EQ(test, get_total_number_values(src), 0);
+
+ // re-add sub, check value are added
+ stats_fs_source_add_subordinate(src, sub);
+ KUNIT_EXPECT_EQ(test, get_total_number_values(src),
+ ARR_SIZE(test_values));
+
+ // add sub_not, check value are twice as many
+ stats_fs_source_add_subordinate(src, sub_not);
+ KUNIT_EXPECT_EQ(test, get_total_number_values(src),
+ ARR_SIZE(test_values) * 2);
+
+ KUNIT_EXPECT_EQ(test, get_number_values(sub_not),
+ ARR_SIZE(test_values));
+ KUNIT_EXPECT_EQ(test, get_number_aggregates(sub_not), 0);
+
+ stats_fs_source_put(sub);
+ stats_fs_source_put(sub_not);
+ stats_fs_source_put(src);
+}
+
+static void test_search_value(struct kunit *test)
+{
+ struct stats_fs_source *src;
+ uint64_t ret;
+ int n;
+
+ src = stats_fs_source_create("parent");
+
+ // add values
+ n = stats_fs_source_add_values(src, test_values, &cont);
+ KUNIT_EXPECT_EQ(test, n, 0);
+ n = get_number_values_with_base(src, &cont);
+ KUNIT_EXPECT_EQ(test, n, ARR_SIZE(test_values));
+
+ // get u64
+ n = stats_fs_source_get_value_by_name(src, "u64", &ret);
+ KUNIT_EXPECT_EQ(test, ret, def_u64);
+ KUNIT_EXPECT_EQ(test, n, 0);
+
+ n = stats_fs_source_get_value_by_name(src, "s32", &ret);
+ KUNIT_EXPECT_EQ(test, ((int32_t)ret), def_val_s32);
+ KUNIT_EXPECT_EQ(test, n, 0);
+
+ n = stats_fs_source_get_value_by_name(src, "bo", &ret);
+ KUNIT_EXPECT_EQ(test, ((bool)ret), def_val_bool);
+ KUNIT_EXPECT_EQ(test, n, 0);
+
+ // get a non-added value
+ n = stats_fs_source_get_value_by_name(src, "does not exist", &ret);
+ KUNIT_EXPECT_EQ(test, ret, 0ull);
+ KUNIT_EXPECT_EQ(test, n, -ENOENT);
+
+ stats_fs_source_put(src);
+}
+
+static void test_search_value_in_subfolder(struct kunit *test)
+{
+ struct stats_fs_source *src, *sub;
+ uint64_t ret;
+ int n;
+
+ src = stats_fs_source_create("parent");
+ sub = stats_fs_source_create("child");
+
+ // src -> sub
+ stats_fs_source_add_subordinate(src, sub);
+
+ // add values to sub
+ n = stats_fs_source_add_values(sub, test_values, &cont);
+ KUNIT_EXPECT_EQ(test, n, 0);
+ n = get_number_values_with_base(sub, &cont);
+ KUNIT_EXPECT_EQ(test, n, ARR_SIZE(test_values));
+
+ n = stats_fs_source_get_value_by_name(sub, "u64", &ret);
+ KUNIT_EXPECT_EQ(test, ret, def_u64);
+ KUNIT_EXPECT_EQ(test, n, 0);
+ n = stats_fs_source_get_value_by_name(src, "u64", &ret);
+ KUNIT_EXPECT_EQ(test, ret, 0ull);
+ KUNIT_EXPECT_EQ(test, n, -ENOENT);
+
+ n = stats_fs_source_get_value_by_name(sub, "s32", &ret);
+ KUNIT_EXPECT_EQ(test, ((int32_t)ret), def_val_s32);
+ KUNIT_EXPECT_EQ(test, n, 0);
+ n = stats_fs_source_get_value_by_name(src, "s32", &ret);
+ KUNIT_EXPECT_EQ(test, ret, 0ull);
+ KUNIT_EXPECT_EQ(test, n, -ENOENT);
+
+ n = stats_fs_source_get_value_by_name(sub, "bo", &ret);
+ KUNIT_EXPECT_EQ(test, ((bool)ret), def_val_bool);
+ KUNIT_EXPECT_EQ(test, n, 0);
+ n = stats_fs_source_get_value_by_name(src, "bo", &ret);
+ KUNIT_EXPECT_EQ(test, ret, 0ull);
+ KUNIT_EXPECT_EQ(test, n, -ENOENT);
+
+ n = stats_fs_source_get_value_by_name(sub, "does not exist", &ret);
+ KUNIT_EXPECT_EQ(test, ret, 0ull);
+ KUNIT_EXPECT_EQ(test, n, -ENOENT);
+ n = stats_fs_source_get_value_by_name(src, "does not exist", &ret);
+ KUNIT_EXPECT_EQ(test, ret, 0ull);
+ KUNIT_EXPECT_EQ(test, n, -ENOENT);
+
+ stats_fs_source_put(sub);
+ stats_fs_source_put(src);
+}
+
+static void test_search_value_in_empty_folder(struct kunit *test)
+{
+ struct stats_fs_source *src;
+ uint64_t ret;
+ int n;
+
+ src = stats_fs_source_create("empty folder");
+ KUNIT_EXPECT_EQ(test, get_number_aggregates(src), 0);
+ KUNIT_EXPECT_EQ(test, get_number_subsources(src), 0);
+ KUNIT_EXPECT_EQ(test, get_number_values(src), 0);
+
+ n = stats_fs_source_get_value_by_name(src, "u64", &ret);
+ KUNIT_EXPECT_EQ(test, ret, 0ull);
+ KUNIT_EXPECT_EQ(test, n, -ENOENT);
+
+ n = stats_fs_source_get_value_by_name(src, "s32", &ret);
+ KUNIT_EXPECT_EQ(test, ret, 0ull);
+ KUNIT_EXPECT_EQ(test, n, -ENOENT);
+
+ n = stats_fs_source_get_value_by_name(src, "bo", &ret);
+ KUNIT_EXPECT_EQ(test, ret, 0ull);
+ KUNIT_EXPECT_EQ(test, n, -ENOENT);
+
+ n = stats_fs_source_get_value_by_name(src, "does not exist", &ret);
+ KUNIT_EXPECT_EQ(test, ret, 0ull);
+ KUNIT_EXPECT_EQ(test, n, -ENOENT);
+
+ stats_fs_source_put(src);
+}
+
+static void test_add_aggregate(struct kunit *test)
+{
+ struct stats_fs_source *src;
+ int n;
+
+ src = stats_fs_source_create("parent");
+
+ // add aggr to src, no values
+ n = stats_fs_source_add_values(src, test_aggr, NULL);
+ KUNIT_EXPECT_EQ(test, n, 0);
+ n = get_number_values_with_base(src, NULL);
+ KUNIT_EXPECT_EQ(test, n, 0);
+
+ // count values
+ n = get_number_aggr_with_base(src, NULL);
+ KUNIT_EXPECT_EQ(test, n, ARR_SIZE(test_aggr));
+
+ // add same array again, should not be added
+ n = stats_fs_source_add_values(src, test_aggr, NULL);
+ KUNIT_EXPECT_EQ(test, n, -EEXIST);
+ n = get_number_aggr_with_base(src, NULL);
+ KUNIT_EXPECT_EQ(test, n, ARR_SIZE(test_aggr));
+
+ KUNIT_EXPECT_EQ(test, get_number_values(src), 0);
+ KUNIT_EXPECT_EQ(test, get_number_aggregates(src), ARR_SIZE(test_aggr));
+
+ stats_fs_source_put(src);
+}
+
+static void test_add_aggregate_in_subfolder(struct kunit *test)
+{
+ struct stats_fs_source *src, *sub, *sub_not;
+ int n;
+
+ src = stats_fs_source_create("parent");
+ sub = stats_fs_source_create("child");
+ // src->sub
+ stats_fs_source_add_subordinate(src, sub);
+
+ // add aggr to sub
+ n = stats_fs_source_add_values(sub, test_aggr, NULL);
+ KUNIT_EXPECT_EQ(test, n, 0);
+ n = get_number_aggr_with_base(sub, NULL);
+ KUNIT_EXPECT_EQ(test, n, ARR_SIZE(test_aggr));
+ KUNIT_EXPECT_EQ(test, get_number_values(src), 0);
+ KUNIT_EXPECT_EQ(test, get_number_aggregates(src), 0);
+ KUNIT_EXPECT_EQ(test, get_total_number_values(src), 0);
+
+ KUNIT_EXPECT_EQ(test, get_number_values(sub), 0);
+ KUNIT_EXPECT_EQ(test, get_number_aggregates(sub), ARR_SIZE(test_aggr));
+
+ // not a child
+ sub_not = stats_fs_source_create("not a child");
+
+ // add aggr to "not a child"
+ n = stats_fs_source_add_values(sub_not, test_aggr, NULL);
+ KUNIT_EXPECT_EQ(test, n, 0);
+ n = get_number_aggr_with_base(sub_not, NULL);
+ KUNIT_EXPECT_EQ(test, n, ARR_SIZE(test_aggr));
+ KUNIT_EXPECT_EQ(test, get_number_values(src), 0);
+ KUNIT_EXPECT_EQ(test, get_number_aggregates(src), 0);
+ KUNIT_EXPECT_EQ(test, get_total_number_values(src), 0);
+
+ // remove sub
+ stats_fs_source_remove_subordinate(src, sub);
+ KUNIT_EXPECT_EQ(test, get_total_number_values(src), 0);
+
+ // re-add both
+ stats_fs_source_add_subordinate(src, sub);
+ KUNIT_EXPECT_EQ(test, get_total_number_values(src), 0);
+ stats_fs_source_add_subordinate(src, sub_not);
+ KUNIT_EXPECT_EQ(test, get_total_number_values(src), 0);
+
+ KUNIT_EXPECT_EQ(test, get_number_values(sub_not), 0);
+ KUNIT_EXPECT_EQ(test, get_number_aggregates(sub_not),
+ ARR_SIZE(test_aggr));
+
+ stats_fs_source_put(sub);
+ stats_fs_source_put(sub_not);
+ stats_fs_source_put(src);
+}
+
+static void test_search_aggregate(struct kunit *test)
+{
+ struct stats_fs_source *src;
+ uint64_t ret;
+ int n;
+
+ src = stats_fs_source_create("parent");
+ n = stats_fs_source_add_values(src, test_aggr, NULL);
+ KUNIT_EXPECT_EQ(test, n, 0);
+ n = get_number_aggr_with_base(src, NULL);
+ KUNIT_EXPECT_EQ(test, n, ARR_SIZE(test_aggr));
+ n = get_number_aggr_with_base(src, &cont);
+ KUNIT_EXPECT_EQ(test, n, 0);
+ n = stats_fs_source_get_value_by_name(src, "u64", &ret);
+ KUNIT_EXPECT_EQ(test, ret, 0ull);
+ KUNIT_EXPECT_EQ(test, n, 0);
+
+ n = stats_fs_source_get_value_by_name(src, "s32", &ret);
+ KUNIT_EXPECT_EQ(test, (int64_t)ret, S64_MAX);
+ KUNIT_EXPECT_EQ(test, n, 0);
+
+ n = stats_fs_source_get_value_by_name(src, "bo", &ret);
+ KUNIT_EXPECT_EQ(test, ret, 0ull);
+ KUNIT_EXPECT_EQ(test, n, 0);
+
+ n = stats_fs_source_get_value_by_name(src, "does not exist", &ret);
+ KUNIT_EXPECT_EQ(test, ret, 0ull);
+ KUNIT_EXPECT_EQ(test, n, -ENOENT);
+ stats_fs_source_put(src);
+}
+
+static void test_search_aggregate_in_subfolder(struct kunit *test)
+{
+ struct stats_fs_source *src, *sub;
+ uint64_t ret;
+ int n;
+
+ src = stats_fs_source_create("parent");
+ sub = stats_fs_source_create("child");
+
+ stats_fs_source_add_subordinate(src, sub);
+
+ n = stats_fs_source_add_values(sub, test_aggr, NULL);
+ KUNIT_EXPECT_EQ(test, n, 0);
+ n = get_number_aggr_with_base(sub, NULL);
+ KUNIT_EXPECT_EQ(test, n, ARR_SIZE(test_aggr));
+ n = get_number_aggr_with_base(sub, &cont);
+ KUNIT_EXPECT_EQ(test, n, 0);
+
+ // no u64 in test_aggr
+ n = stats_fs_source_get_value_by_name(sub, "u64", &ret);
+ KUNIT_EXPECT_EQ(test, ret, 0ull);
+ KUNIT_EXPECT_EQ(test, n, 0);
+ n = stats_fs_source_get_value_by_name(src, "u64", &ret);
+ KUNIT_EXPECT_EQ(test, ret, 0ull);
+ KUNIT_EXPECT_EQ(test, n, -ENOENT);
+
+ n = stats_fs_source_get_value_by_name(sub, "s32", &ret);
+ KUNIT_EXPECT_EQ(test, (int64_t)ret, S64_MAX);
+ KUNIT_EXPECT_EQ(test, n, 0);
+ n = stats_fs_source_get_value_by_name(src, "s32", &ret);
+ KUNIT_EXPECT_EQ(test, ret, 0ull);
+ KUNIT_EXPECT_EQ(test, n, -ENOENT);
+
+ n = stats_fs_source_get_value_by_name(sub, "bo", &ret);
+ KUNIT_EXPECT_EQ(test, ret, 0ull);
+ KUNIT_EXPECT_EQ(test, n, 0);
+ n = stats_fs_source_get_value_by_name(src, "bo", &ret);
+ KUNIT_EXPECT_EQ(test, ret, 0ull);
+ KUNIT_EXPECT_EQ(test, n, -ENOENT);
+
+ n = stats_fs_source_get_value_by_name(sub, "does not exist", &ret);
+ KUNIT_EXPECT_EQ(test, ret, 0ull);
+ KUNIT_EXPECT_EQ(test, n, -ENOENT);
+ n = stats_fs_source_get_value_by_name(src, "does not exist", &ret);
+ KUNIT_EXPECT_EQ(test, ret, 0ull);
+ KUNIT_EXPECT_EQ(test, n, -ENOENT);
+
+ stats_fs_source_put(sub);
+ stats_fs_source_put(src);
+}
+
+void test_search_same(struct kunit *test)
+{
+ struct stats_fs_source *src;
+ uint64_t ret;
+ int n;
+
+ src = stats_fs_source_create("parent");
+ n = stats_fs_source_add_values(src, test_same_name, &cont);
+ KUNIT_EXPECT_EQ(test, n, 0);
+ n = get_number_values_with_base(src, &cont);
+ KUNIT_EXPECT_EQ(test, n, 1);
+ n = get_number_aggr_with_base(src, &cont);
+ KUNIT_EXPECT_EQ(test, n, 1);
+
+ n = stats_fs_source_add_values(src, test_same_name, &cont);
+ KUNIT_EXPECT_EQ(test, n, -EEXIST);
+ n = get_number_values_with_base(src, &cont);
+ KUNIT_EXPECT_EQ(test, n, 1);
+ n = get_number_aggr_with_base(src, &cont);
+ KUNIT_EXPECT_EQ(test, n, 1);
+
+ // returns first the value
+ n = stats_fs_source_get_value_by_name(src, "s32", &ret);
+ KUNIT_EXPECT_EQ(test, ((int32_t)ret), def_val_s32);
+ KUNIT_EXPECT_EQ(test, n, 0);
+
+ stats_fs_source_put(src);
+}
+
+static void test_add_mixed(struct kunit *test)
+{
+ struct stats_fs_source *src;
+ int n;
+
+ src = stats_fs_source_create("parent");
+
+ n = stats_fs_source_add_values(src, test_aggr, NULL);
+ KUNIT_EXPECT_EQ(test, n, 0);
+ n = get_number_values_with_base(src, NULL);
+ KUNIT_EXPECT_EQ(test, n, 0);
+ n = stats_fs_source_add_values(src, test_values, &cont);
+ KUNIT_EXPECT_EQ(test, n, 0);
+ n = get_number_aggr_with_base(src, NULL);
+ KUNIT_EXPECT_EQ(test, n, ARR_SIZE(test_aggr));
+
+ n = stats_fs_source_add_values(src, test_values, &cont);
+ KUNIT_EXPECT_EQ(test, n, -EEXIST);
+ n = get_number_values_with_base(src, &cont);
+ KUNIT_EXPECT_EQ(test, n, ARR_SIZE(test_values));
+ n = stats_fs_source_add_values(src, test_aggr, NULL);
+ KUNIT_EXPECT_EQ(test, n, -EEXIST);
+ n = get_number_aggr_with_base(src, NULL);
+ KUNIT_EXPECT_EQ(test, n, ARR_SIZE(test_aggr));
+
+ KUNIT_EXPECT_EQ(test, get_number_values(src), ARR_SIZE(test_values));
+ KUNIT_EXPECT_EQ(test, get_number_aggregates(src), ARR_SIZE(test_aggr));
+ stats_fs_source_put(src);
+}
+
+static void test_search_mixed(struct kunit *test)
+{
+ struct stats_fs_source *src, *sub;
+ uint64_t ret;
+ int n;
+
+ src = stats_fs_source_create("parent");
+ sub = stats_fs_source_create("child");
+ stats_fs_source_add_subordinate(src, sub);
+
+ // src has the aggregates, sub the values. Just search
+ n = stats_fs_source_add_values(sub, test_values, &cont);
+ KUNIT_EXPECT_EQ(test, n, 0);
+ n = get_number_values_with_base(sub, &cont);
+ KUNIT_EXPECT_EQ(test, n, ARR_SIZE(test_values));
+ n = stats_fs_source_add_values(src, test_aggr, &cont);
+ KUNIT_EXPECT_EQ(test, n, 0);
+ n = get_number_aggr_with_base(src, &cont);
+ KUNIT_EXPECT_EQ(test, n, ARR_SIZE(test_aggr));
+
+ // u64 is sum so again same value
+ n = stats_fs_source_get_value_by_name(sub, "u64", &ret);
+ KUNIT_EXPECT_EQ(test, ret, def_u64);
+ KUNIT_EXPECT_EQ(test, n, 0);
+ n = stats_fs_source_get_value_by_name(src, "u64", &ret);
+ KUNIT_EXPECT_EQ(test, ret, def_u64);
+ KUNIT_EXPECT_EQ(test, n, 0);
+
+ // s32 is min so return the value also in the aggregate
+ n = stats_fs_source_get_value_by_name(sub, "s32", &ret);
+ KUNIT_EXPECT_EQ(test, ((int32_t)ret), def_val_s32);
+ KUNIT_EXPECT_EQ(test, n, 0);
+ n = stats_fs_source_get_value_by_name(src, "s32", &ret);
+ KUNIT_EXPECT_EQ(test, ((int32_t)ret), def_val_s32);
+ KUNIT_EXPECT_EQ(test, n, 0);
+
+ // bo is max
+ n = stats_fs_source_get_value_by_name(sub, "bo", &ret);
+ KUNIT_EXPECT_EQ(test, (bool)ret, def_val_bool);
+ KUNIT_EXPECT_EQ(test, n, 0);
+ n = stats_fs_source_get_value_by_name(src, "bo", &ret);
+ KUNIT_EXPECT_EQ(test, (bool)ret, def_val_bool);
+ KUNIT_EXPECT_EQ(test, n, 0);
+
+ n = stats_fs_source_get_value_by_name(sub, "does not exist", &ret);
+ KUNIT_EXPECT_EQ(test, ret, 0ull);
+ KUNIT_EXPECT_EQ(test, n, -ENOENT);
+ n = stats_fs_source_get_value_by_name(src, "does not exist", &ret);
+ KUNIT_EXPECT_EQ(test, ret, 0ull);
+ KUNIT_EXPECT_EQ(test, n, -ENOENT);
+
+ stats_fs_source_put(sub);
+ stats_fs_source_put(src);
+}
+
+static void test_all_aggregations_agg_val_val(struct kunit *test)
+{
+ struct stats_fs_source *src, *sub1, *sub2;
+ uint64_t ret;
+ int n;
+
+ src = stats_fs_source_create("parent");
+ sub1 = stats_fs_source_create("child1");
+ sub2 = stats_fs_source_create("child2");
+ stats_fs_source_add_subordinate(src, sub1);
+ stats_fs_source_add_subordinate(src, sub2);
+
+ n = stats_fs_source_add_values(sub1, test_all_aggr, &cont);
+ KUNIT_EXPECT_EQ(test, n, 0);
+ n = get_number_aggr_with_base(sub1, &cont);
+ KUNIT_EXPECT_EQ(test, n, ARR_SIZE(test_all_aggr));
+ n = stats_fs_source_add_values(sub2, test_all_aggr, &cont2);
+ KUNIT_EXPECT_EQ(test, n, 0);
+ n = get_number_aggr_with_base(sub2, &cont2);
+ KUNIT_EXPECT_EQ(test, n, ARR_SIZE(test_all_aggr));
+
+ n = stats_fs_source_add_values(src, test_all_aggr, NULL);
+ KUNIT_EXPECT_EQ(test, n, 0);
+ n = get_number_aggr_with_base(src, NULL);
+ KUNIT_EXPECT_EQ(test, n, ARR_SIZE(test_all_aggr));
+
+ // sum
+ n = stats_fs_source_get_value_by_name(src, "u64", &ret);
+ KUNIT_EXPECT_EQ(test, n, 0);
+ KUNIT_EXPECT_EQ(test, ret, def_u64 * 2);
+
+ // min
+ n = stats_fs_source_get_value_by_name(src, "s32", &ret);
+ KUNIT_EXPECT_EQ(test, n, 0);
+ KUNIT_EXPECT_EQ(test, ((int32_t)ret), def_val_s32);
+
+ // count_0
+ n = stats_fs_source_get_value_by_name(src, "bo", &ret);
+ KUNIT_EXPECT_EQ(test, n, 0);
+ KUNIT_EXPECT_EQ(test, ret, 1ull);
+
+ // avg
+ n = stats_fs_source_get_value_by_name(src, "u8", &ret);
+ KUNIT_EXPECT_EQ(test, n, 0);
+ KUNIT_EXPECT_EQ(test, ret, 191ull);
+
+ // max
+ n = stats_fs_source_get_value_by_name(src, "s16", &ret);
+ KUNIT_EXPECT_EQ(test, n, 0);
+ KUNIT_EXPECT_EQ(test, (int16_t)ret, def_val_s16);
+
+ stats_fs_source_put(sub1);
+ stats_fs_source_put(sub2);
+ stats_fs_source_put(src);
+}
+
+static void test_all_aggregations_val_agg_val(struct kunit *test)
+{
+ struct stats_fs_source *src, *sub1, *sub2;
+ uint64_t ret;
+ int n;
+
+ src = stats_fs_source_create("parent");
+ sub1 = stats_fs_source_create("child1");
+ sub2 = stats_fs_source_create("child2");
+ stats_fs_source_add_subordinate(src, sub1);
+ stats_fs_source_add_subordinate(src, sub2);
+
+ n = stats_fs_source_add_values(src, test_all_aggr, &cont);
+ KUNIT_EXPECT_EQ(test, n, 0);
+ n = get_number_aggr_with_base(src, &cont);
+ KUNIT_EXPECT_EQ(test, n, ARR_SIZE(test_all_aggr));
+ n = stats_fs_source_add_values(sub2, test_all_aggr, &cont2);
+ KUNIT_EXPECT_EQ(test, n, 0);
+ n = get_number_aggr_with_base(sub2, &cont2);
+ KUNIT_EXPECT_EQ(test, n, ARR_SIZE(test_all_aggr));
+
+ n = stats_fs_source_add_values(sub1, test_all_aggr, NULL);
+ KUNIT_EXPECT_EQ(test, n, 0);
+ n = get_number_aggr_with_base(sub1, NULL);
+ KUNIT_EXPECT_EQ(test, n, ARR_SIZE(test_all_aggr));
+
+ n = stats_fs_source_get_value_by_name(src, "u64", &ret);
+ KUNIT_EXPECT_EQ(test, n, 0);
+ KUNIT_EXPECT_EQ(test, ret, def_u64);
+ n = stats_fs_source_get_value_by_name(sub1, "u64", &ret);
+ KUNIT_EXPECT_EQ(test, n, 0);
+ KUNIT_EXPECT_EQ(test, ret, 0ull);
+ n = stats_fs_source_get_value_by_name(sub2, "u64", &ret);
+ KUNIT_EXPECT_EQ(test, n, 0);
+ KUNIT_EXPECT_EQ(test, ret, def_u64);
+
+ n = stats_fs_source_get_value_by_name(src, "s32", &ret);
+ KUNIT_EXPECT_EQ(test, n, 0);
+ KUNIT_EXPECT_EQ(test, ((int32_t)ret), def_val_s32);
+ n = stats_fs_source_get_value_by_name(sub1, "s32", &ret);
+ KUNIT_EXPECT_EQ(test, n, 0);
+ KUNIT_EXPECT_EQ(test, (int64_t)ret, S64_MAX); // MIN
+ n = stats_fs_source_get_value_by_name(sub2, "s32", &ret);
+ KUNIT_EXPECT_EQ(test, n, 0);
+ KUNIT_EXPECT_EQ(test, ((int32_t)ret), def_val2_s32);
+
+ n = stats_fs_source_get_value_by_name(src, "bo", &ret);
+ KUNIT_EXPECT_EQ(test, n, 0);
+ KUNIT_EXPECT_EQ(test, (bool)ret, def_val_bool);
+ n = stats_fs_source_get_value_by_name(sub1, "bo", &ret);
+ KUNIT_EXPECT_EQ(test, n, 0);
+ KUNIT_EXPECT_EQ(test, ret, 0ull);
+ n = stats_fs_source_get_value_by_name(sub2, "bo", &ret);
+ KUNIT_EXPECT_EQ(test, n, 0);
+ KUNIT_EXPECT_EQ(test, (bool)ret, def_val2_bool);
+
+ n = stats_fs_source_get_value_by_name(src, "u8", &ret);
+ KUNIT_EXPECT_EQ(test, n, 0);
+ KUNIT_EXPECT_EQ(test, (uint8_t)ret, def_val_u8);
+ n = stats_fs_source_get_value_by_name(sub1, "u8", &ret);
+ KUNIT_EXPECT_EQ(test, n, 0);
+ KUNIT_EXPECT_EQ(test, ret, 0ull);
+ n = stats_fs_source_get_value_by_name(sub2, "u8", &ret);
+ KUNIT_EXPECT_EQ(test, n, 0);
+ KUNIT_EXPECT_EQ(test, (uint8_t)ret, def_val2_u8);
+
+ n = stats_fs_source_get_value_by_name(src, "s16", &ret);
+ KUNIT_EXPECT_EQ(test, n, 0);
+ KUNIT_EXPECT_EQ(test, (int16_t)ret, def_val_s16);
+ n = stats_fs_source_get_value_by_name(sub1, "s16", &ret);
+ KUNIT_EXPECT_EQ(test, n, 0);
+ KUNIT_EXPECT_EQ(test, (int64_t)ret, S64_MIN); // MAX
+ n = stats_fs_source_get_value_by_name(sub2, "s16", &ret);
+ KUNIT_EXPECT_EQ(test, n, 0);
+ KUNIT_EXPECT_EQ(test, (int16_t)ret, def_val2_s16);
+
+ stats_fs_source_put(sub1);
+ stats_fs_source_put(sub2);
+ stats_fs_source_put(src);
+}
+
+static void test_all_aggregations_agg_val_val_sub(struct kunit *test)
+{
+ struct stats_fs_source *src, *sub1, *sub11;
+ uint64_t ret;
+ int n;
+
+ src = stats_fs_source_create("parent");
+ sub1 = stats_fs_source_create("child1");
+ sub11 = stats_fs_source_create("child11");
+ stats_fs_source_add_subordinate(src, sub1);
+ stats_fs_source_add_subordinate(sub1, sub11); // changes here!
+
+ n = stats_fs_source_add_values(sub1, test_values, &cont);
+ KUNIT_EXPECT_EQ(test, n, 0);
+ n = get_number_values_with_base(sub1, &cont);
+ KUNIT_EXPECT_EQ(test, n, ARR_SIZE(test_values));
+ n = stats_fs_source_add_values(sub11, test_values, &cont2);
+ KUNIT_EXPECT_EQ(test, n, 0);
+ n = get_number_values_with_base(sub11, &cont2);
+ KUNIT_EXPECT_EQ(test, n, ARR_SIZE(test_values));
+
+ KUNIT_EXPECT_EQ(test, get_total_number_values(src),
+ ARR_SIZE(test_values) * 2);
+
+ n = stats_fs_source_add_values(sub1, test_all_aggr, &cont);
+ KUNIT_EXPECT_EQ(test, n, 0);
+ n = get_number_aggr_with_base(sub1, &cont);
+ KUNIT_EXPECT_EQ(test, n, ARR_SIZE(test_all_aggr));
+ n = stats_fs_source_add_values(sub11, test_all_aggr, &cont2);
+ KUNIT_EXPECT_EQ(test, n, 0);
+ n = get_number_aggr_with_base(sub11, &cont2);
+ KUNIT_EXPECT_EQ(test, n, ARR_SIZE(test_all_aggr));
+
+ n = stats_fs_source_add_values(src, test_all_aggr, NULL);
+ KUNIT_EXPECT_EQ(test, n, 0);
+ n = get_number_aggr_with_base(src, NULL);
+ KUNIT_EXPECT_EQ(test, n, ARR_SIZE(test_all_aggr));
+
+ // sum
+ n = stats_fs_source_get_value_by_name(src, "u64", &ret);
+ KUNIT_EXPECT_EQ(test, n, 0);
+ KUNIT_EXPECT_EQ(test, ret, def_u64 * 2);
+
+ // min
+ n = stats_fs_source_get_value_by_name(src, "s32", &ret);
+ KUNIT_EXPECT_EQ(test, n, 0);
+ KUNIT_EXPECT_EQ(test, ((int32_t)ret), def_val_s32);
+
+ // count_0
+ n = stats_fs_source_get_value_by_name(src, "bo", &ret);
+ KUNIT_EXPECT_EQ(test, n, 0);
+ KUNIT_EXPECT_EQ(test, ret, 1ull);
+
+ // avg
+ n = stats_fs_source_get_value_by_name(src, "u8", &ret);
+ KUNIT_EXPECT_EQ(test, n, 0);
+ KUNIT_EXPECT_EQ(test, ret, 191ull);
+
+ // max
+ n = stats_fs_source_get_value_by_name(src, "s16", &ret);
+ KUNIT_EXPECT_EQ(test, n, 0);
+ KUNIT_EXPECT_EQ(test, (int16_t)ret, def_val_s16);
+
+ stats_fs_source_put(sub1);
+ stats_fs_source_put(sub11);
+ stats_fs_source_put(src);
+}
+
+static void test_all_aggregations_agg_no_val_sub(struct kunit *test)
+{
+ struct stats_fs_source *src, *sub1, *sub11;
+ uint64_t ret;
+ int n;
+
+ src = stats_fs_source_create("parent");
+ sub1 = stats_fs_source_create("child1");
+ sub11 = stats_fs_source_create("child11");
+ stats_fs_source_add_subordinate(src, sub1);
+ stats_fs_source_add_subordinate(sub1, sub11);
+
+ n = stats_fs_source_add_values(sub11, test_all_aggr, &cont2);
+ KUNIT_EXPECT_EQ(test, n, 0);
+ n = get_number_aggr_with_base(sub11, &cont2);
+ KUNIT_EXPECT_EQ(test, n, ARR_SIZE(test_all_aggr));
+
+ KUNIT_EXPECT_EQ(test, get_total_number_values(src), 0);
+
+ n = stats_fs_source_add_values(src, test_all_aggr, NULL);
+ KUNIT_EXPECT_EQ(test, n, 0);
+ n = get_number_aggr_with_base(src, NULL);
+ KUNIT_EXPECT_EQ(test, n, ARR_SIZE(test_all_aggr));
+
+ // sum
+ n = stats_fs_source_get_value_by_name(src, "u64", &ret);
+ KUNIT_EXPECT_EQ(test, n, 0);
+ KUNIT_EXPECT_EQ(test, ret, def_u64);
+
+ // min
+ n = stats_fs_source_get_value_by_name(src, "s32", &ret);
+ KUNIT_EXPECT_EQ(test, n, 0);
+ KUNIT_EXPECT_EQ(test, ((int32_t)ret), def_val2_s32);
+
+ // count_0
+ n = stats_fs_source_get_value_by_name(src, "bo", &ret);
+ KUNIT_EXPECT_EQ(test, n, 0);
+ KUNIT_EXPECT_EQ(test, ret, 1ull);
+
+ // avg
+ n = stats_fs_source_get_value_by_name(src, "u8", &ret);
+ KUNIT_EXPECT_EQ(test, n, 0);
+ KUNIT_EXPECT_EQ(test, (uint8_t)ret, def_val2_u8);
+
+ // max
+ n = stats_fs_source_get_value_by_name(src, "s16", &ret);
+ KUNIT_EXPECT_EQ(test, n, 0);
+ KUNIT_EXPECT_EQ(test, (int16_t)ret, def_val2_s16);
+
+ stats_fs_source_put(sub1);
+ stats_fs_source_put(sub11);
+ stats_fs_source_put(src);
+}
+
+static void test_all_aggregations_agg_agg_val_sub(struct kunit *test)
+{
+ struct stats_fs_source *src, *sub1, *sub11, *sub12;
+ uint64_t ret;
+ int n;
+
+ src = stats_fs_source_create("parent");
+ sub1 = stats_fs_source_create("child1");
+ sub11 = stats_fs_source_create("child11");
+ sub12 = stats_fs_source_create("child12");
+ stats_fs_source_add_subordinate(src, sub1);
+ stats_fs_source_add_subordinate(sub1, sub11);
+ stats_fs_source_add_subordinate(sub1, sub12);
+
+ n = stats_fs_source_add_values(sub11, test_all_aggr, &cont2);
+ KUNIT_EXPECT_EQ(test, n, 0);
+ n = get_number_aggr_with_base(sub11, &cont2);
+ KUNIT_EXPECT_EQ(test, n, ARR_SIZE(test_all_aggr));
+
+ n = stats_fs_source_add_values(sub12, test_all_aggr, &cont);
+ KUNIT_EXPECT_EQ(test, n, 0);
+ n = get_number_aggr_with_base(sub12, &cont);
+ KUNIT_EXPECT_EQ(test, n, ARR_SIZE(test_all_aggr));
+
+ KUNIT_EXPECT_EQ(test, get_total_number_values(src), 0);
+
+ n = stats_fs_source_add_values(src, test_all_aggr, NULL);
+ KUNIT_EXPECT_EQ(test, n, 0);
+ n = get_number_aggr_with_base(src, NULL);
+ KUNIT_EXPECT_EQ(test, n, ARR_SIZE(test_all_aggr));
+
+ n = stats_fs_source_add_values(sub1, test_all_aggr, NULL);
+ KUNIT_EXPECT_EQ(test, n, 0);
+ n = get_number_aggr_with_base(sub1, NULL);
+ KUNIT_EXPECT_EQ(test, n, ARR_SIZE(test_all_aggr));
+
+ // sum
+ n = stats_fs_source_get_value_by_name(src, "u64", &ret);
+ KUNIT_EXPECT_EQ(test, n, 0);
+ KUNIT_EXPECT_EQ(test, ret, def_u64 * 2);
+
+ // min
+ n = stats_fs_source_get_value_by_name(src, "s32", &ret);
+ KUNIT_EXPECT_EQ(test, n, 0);
+ KUNIT_EXPECT_EQ(test, ((int32_t)ret), def_val_s32);
+
+ // count_0
+ n = stats_fs_source_get_value_by_name(src, "bo", &ret);
+ KUNIT_EXPECT_EQ(test, n, 0);
+ KUNIT_EXPECT_EQ(test, ret, 1ull);
+
+ // avg
+ n = stats_fs_source_get_value_by_name(src, "u8", &ret);
+ KUNIT_EXPECT_EQ(test, n, 0);
+ KUNIT_EXPECT_EQ(test, (uint8_t)ret,
+ (uint8_t)((def_val2_u8 + def_val_u8) / 2));
+
+ // max
+ n = stats_fs_source_get_value_by_name(src, "s16", &ret);
+ KUNIT_EXPECT_EQ(test, n, 0);
+ KUNIT_EXPECT_EQ(test, (int16_t)ret, def_val_s16);
+
+ stats_fs_source_put(sub1);
+ stats_fs_source_put(sub11);
+ stats_fs_source_put(sub12);
+ stats_fs_source_put(src);
+}
+
+static struct kunit_case stats_fs_test_cases[] = {
+ KUNIT_CASE(test_empty_folder),
+ KUNIT_CASE(test_add_subfolder),
+ KUNIT_CASE(test_add_value),
+ KUNIT_CASE(test_add_value_in_subfolder),
+ KUNIT_CASE(test_search_value),
+ KUNIT_CASE(test_search_value_in_subfolder),
+ KUNIT_CASE(test_search_value_in_empty_folder),
+ KUNIT_CASE(test_add_aggregate),
+ KUNIT_CASE(test_add_aggregate_in_subfolder),
+ KUNIT_CASE(test_search_aggregate),
+ KUNIT_CASE(test_search_aggregate_in_subfolder),
+ KUNIT_CASE(test_search_same),
+ KUNIT_CASE(test_add_mixed),
+ KUNIT_CASE(test_search_mixed),
+ KUNIT_CASE(test_all_aggregations_agg_val_val),
+ KUNIT_CASE(test_all_aggregations_val_agg_val),
+ KUNIT_CASE(test_all_aggregations_agg_val_val_sub),
+ KUNIT_CASE(test_all_aggregations_agg_no_val_sub),
+ KUNIT_CASE(test_all_aggregations_agg_agg_val_sub),
+ {}
+};
+
+static struct kunit_suite stats_fs_test_suite = {
+ .name = "stats_fs",
+ .test_cases = stats_fs_test_cases,
+};
+
+kunit_test_suite(stats_fs_test_suite);
--
2.25.2
^ permalink raw reply related
* [PATCH v2 0/5] Statsfs: a new ram-based file sytem for Linux kernel statistics
From: Emanuele Giuseppe Esposito @ 2020-05-04 11:03 UTC (permalink / raw)
To: kvm
Cc: Emanuele Giuseppe Esposito, linux-s390, David Hildenbrand,
Cornelia Huck, Emanuele Giuseppe Esposito, linux-kernel, kvm-ppc,
linux-mips, Christian Borntraeger, Alexander Viro, linux-fsdevel,
Paolo Bonzini, Vitaly Kuznetsov, linuxppc-dev, Jim Mattson
There is currently no common way for Linux kernel subsystems to expose
statistics to userspace shared throughout the Linux kernel; subsystems
have to take care of gathering and displaying statistics by themselves,
for example in the form of files in debugfs. For example KVM has its own
code section that takes care of this in virt/kvm/kvm_main.c, where it sets
up debugfs handlers for displaying values and aggregating them from
various subfolders to obtain information about the system state (i.e.
displaying the total number of exits, calculated by summing all exits of
all cpus of all running virtual machines).
Allowing each section of the kernel to do so has two disadvantages. First,
it will introduce redundant code. Second, debugfs is anyway not the right
place for statistics (for example it is affected by lockdown)
In this patch series I introduce statsfs, a synthetic ram-based virtual
filesystem that takes care of gathering and displaying statistics for the
Linux kernel subsystems.
The file system is mounted on /sys/kernel/stats and would be already used
by kvm. Statsfs was initially introduced by Paolo Bonzini [1].
Statsfs offers a generic and stable API, allowing any kind of
directory/file organization and supporting multiple kind of aggregations
(not only sum, but also average, max, min and count_zero) and data types
(all unsigned and signed types plus boolean). The implementation, which is
a generalization of KVM’s debugfs statistics code, takes care of gathering
and displaying information at run time; users only need to specify the
values to be included in each source.
Statsfs would also be a different mountpoint from debugfs, and would not
suffer from limited access due to the security lock down patches. Its main
function is to display each statistics as a file in the desired folder
hierarchy defined through the API. Statsfs files can be read, and possibly
cleared if their file mode allows it.
Statsfs has two main components: the public API defined by
include/linux/statsfs.h, and the virtual file system which should end up
in /sys/kernel/stats.
The API has two main elements, values and sources. Kernel subsystems like
KVM can use the API to create a source, add child
sources/values/aggregates and register it to the root source (that on the
virtual fs would be /sys/kernel/statsfs).
Sources are created via statsfs_source_create(), and each source becomes a
directory in the file system. Sources form a parent-child relationship;
root sources are added to the file system via statsfs_source_register().
Every other source is added to or removed from a parent through the
statsfs_source_add_subordinate and statsfs_source_remote_subordinate APIs.
Once a source is created and added to the tree (via add_subordinate), it
will be used to compute aggregate values in the parent source.
Values represent quantites that are gathered by the statsfs user. Examples
of values include the number of vm exits of a given kind, the amount of
memory used by some data structure, the length of the longest hash table
chain, or anything like that. Values are defined with the
statsfs_source_add_values function. Each value is defined by a struct
statsfs_value; the same statsfs_value can be added to many different
sources. A value can be considered "simple" if it fetches data from a
user-provided location, or "aggregate" if it groups all values in the
subordinates sources that include the same statsfs_value.
For more information, please consult the kerneldoc documentation in patch
2 and the sample uses in the kunit tests and in KVM.
This series of patches is based on my previous series "libfs: group and
simplify linux fs code" and the single patch sent to kvm "kvm_host: unify
VM_STAT and VCPU_STAT definitions in a single place". The former
simplifies code duplicated in debugfs and tracefs (from which statsfs is
based on), the latter groups all macros definition for statistics in kvm
in a single common file shared by all architectures.
Patch 1 adds a new refcount and kref destructor wrappers that take a
semaphore, as those are used later by statsfs. Patch 2 introduces the
statsfs API, patch 3 provides extensive tests that can also be used as
example on how to use the API and patch 4 adds the file system support.
Finally, patch 5 provides a real-life example of statsfs usage in KVM.
[1] https://lore.kernel.org/kvm/5d6cdcb1-d8ad-7ae6-7351-3544e2fa366d@redhat.com/?fbclid=IwAR18LHJ0PBcXcDaLzILFhHsl3qpT3z2vlG60RnqgbpGYhDv7L43n0ZXJY8M
Signed-off-by: Emanuele Giuseppe Esposito <eesposit@redhat.com>
v1->v2 remove unnecessary list_foreach_safe loops, fix wrong indentation,
change statsfs in stats_fs
Emanuele Giuseppe Esposito (5):
refcount, kref: add dec-and-test wrappers for rw_semaphores
stats_fs API: create, add and remove stats_fs sources and values
kunit: tests for stats_fs API
stats_fs fs: virtual fs to show stats to the end-user
kvm_main: replace debugfs with stats_fs
MAINTAINERS | 7 +
arch/arm64/kvm/Kconfig | 1 +
arch/arm64/kvm/guest.c | 2 +-
arch/mips/kvm/Kconfig | 1 +
arch/mips/kvm/mips.c | 2 +-
arch/powerpc/kvm/Kconfig | 1 +
arch/powerpc/kvm/book3s.c | 6 +-
arch/powerpc/kvm/booke.c | 8 +-
arch/s390/kvm/Kconfig | 1 +
arch/s390/kvm/kvm-s390.c | 16 +-
arch/x86/include/asm/kvm_host.h | 2 +-
arch/x86/kvm/Kconfig | 1 +
arch/x86/kvm/Makefile | 2 +-
arch/x86/kvm/debugfs.c | 64 --
arch/x86/kvm/stats_fs.c | 56 ++
arch/x86/kvm/x86.c | 6 +-
fs/Kconfig | 12 +
fs/Makefile | 1 +
fs/stats_fs/Makefile | 6 +
fs/stats_fs/inode.c | 337 ++++++++++
fs/stats_fs/internal.h | 35 +
fs/stats_fs/stats_fs-tests.c | 1088 +++++++++++++++++++++++++++++++
fs/stats_fs/stats_fs.c | 773 ++++++++++++++++++++++
include/linux/kref.h | 11 +
include/linux/kvm_host.h | 39 +-
include/linux/refcount.h | 2 +
include/linux/stats_fs.h | 304 +++++++++
include/uapi/linux/magic.h | 1 +
lib/refcount.c | 32 +
tools/lib/api/fs/fs.c | 21 +
virt/kvm/arm/arm.c | 2 +-
virt/kvm/kvm_main.c | 314 ++-------
32 files changed, 2772 insertions(+), 382 deletions(-)
delete mode 100644 arch/x86/kvm/debugfs.c
create mode 100644 arch/x86/kvm/stats_fs.c
create mode 100644 fs/stats_fs/Makefile
create mode 100644 fs/stats_fs/inode.c
create mode 100644 fs/stats_fs/internal.h
create mode 100644 fs/stats_fs/stats_fs-tests.c
create mode 100644 fs/stats_fs/stats_fs.c
create mode 100644 include/linux/stats_fs.h
--
2.25.2
^ permalink raw reply
* [PATCH v2 2/5] stats_fs API: create, add and remove stats_fs sources and values
From: Emanuele Giuseppe Esposito @ 2020-05-04 11:03 UTC (permalink / raw)
To: kvm
Cc: Emanuele Giuseppe Esposito, linux-s390, David Hildenbrand,
Cornelia Huck, Emanuele Giuseppe Esposito, linux-kernel, kvm-ppc,
linux-mips, Christian Borntraeger, Alexander Viro, linux-fsdevel,
Paolo Bonzini, Vitaly Kuznetsov, linuxppc-dev, Jim Mattson
In-Reply-To: <20200504110344.17560-1-eesposit@redhat.com>
Introduction to the stats_fs API, that allows to easily create, add
and remove stats_fs sources and values. The API allows to easily building
the statistics directory tree to automatically gather them for the linux
kernel. The main functionalities are: create a source, add child
sources/values/aggregates, register it to the root source (that on
the virtual fs would be /sys/kernel/statsfs), ad perform a search for
a value/aggregate.
This allows creating any kind of source tree, making it more flexible
also to future readjustments.
The API representation is only logical and will be backed up
by a virtual file system in patch 4.
Its usage will be shared between the stats_fs file system
and the end-users like kvm, the former calling it when it needs to
display and clear statistics, the latter to add values and sources.
Signed-off-by: Emanuele Giuseppe Esposito <eesposit@redhat.com>
---
MAINTAINERS | 7 +
fs/Kconfig | 6 +
fs/Makefile | 1 +
fs/stats_fs/Makefile | 4 +
fs/stats_fs/internal.h | 20 ++
fs/stats_fs/stats_fs.c | 610 +++++++++++++++++++++++++++++++++++++++
include/linux/stats_fs.h | 289 +++++++++++++++++++
7 files changed, 937 insertions(+)
create mode 100644 fs/stats_fs/Makefile
create mode 100644 fs/stats_fs/internal.h
create mode 100644 fs/stats_fs/stats_fs.c
create mode 100644 include/linux/stats_fs.h
diff --git a/MAINTAINERS b/MAINTAINERS
index b816a453b10e..a8403d07cee5 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -5229,6 +5229,13 @@ F: include/linux/debugfs.h
F: include/linux/kobj*
F: lib/kobj*
+STATS_FS
+M: Paolo Bonzini <pbonzini@redhat.com>
+R: Emanuele Giuseppe Esposito <e.emanuelegiuseppe@gmail.com>
+S: Supported
+F: include/linux/stats_fs.h
+F: fs/stats_fs
+
DRIVERS FOR ADAPTIVE VOLTAGE SCALING (AVS)
M: Kevin Hilman <khilman@kernel.org>
M: Nishanth Menon <nm@ti.com>
diff --git a/fs/Kconfig b/fs/Kconfig
index f08fbbfafd9a..1b0de0f19e96 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -328,4 +328,10 @@ source "fs/unicode/Kconfig"
config IO_WQ
bool
+config STATS_FS
+ bool "Statistics Filesystem"
+ help
+ stats_fs is a virtual file system that provides counters and
+ other statistics about the running kernel.
+
endmenu
diff --git a/fs/Makefile b/fs/Makefile
index 2ce5112b02c8..91558eca0cf7 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -125,6 +125,7 @@ obj-$(CONFIG_BEFS_FS) += befs/
obj-$(CONFIG_HOSTFS) += hostfs/
obj-$(CONFIG_CACHEFILES) += cachefiles/
obj-$(CONFIG_DEBUG_FS) += debugfs/
+obj-$(CONFIG_STATS_FS) += stats_fs/
obj-$(CONFIG_TRACING) += tracefs/
obj-$(CONFIG_OCFS2_FS) += ocfs2/
obj-$(CONFIG_BTRFS_FS) += btrfs/
diff --git a/fs/stats_fs/Makefile b/fs/stats_fs/Makefile
new file mode 100644
index 000000000000..94fe52d590d5
--- /dev/null
+++ b/fs/stats_fs/Makefile
@@ -0,0 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
+stats_fs-objs := stats_fs.o
+
+obj-$(CONFIG_STATS_FS) += stats_fs.o
diff --git a/fs/stats_fs/internal.h b/fs/stats_fs/internal.h
new file mode 100644
index 000000000000..ddf262a60736
--- /dev/null
+++ b/fs/stats_fs/internal.h
@@ -0,0 +1,20 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _STATS_FS_INTERNAL_H_
+#define _STATS_FS_INTERNAL_H_
+
+#include <linux/list.h>
+#include <linux/kref.h>
+#include <linux/rwsem.h>
+#include <linux/stats_fs.h>
+
+/* values, grouped by base */
+struct stats_fs_value_source {
+ void *base_addr;
+ bool files_created;
+ struct stats_fs_value *values;
+ struct list_head list_element;
+};
+
+int stats_fs_val_get_mode(struct stats_fs_value *val);
+
+#endif /* _STATS_FS_INTERNAL_H_ */
diff --git a/fs/stats_fs/stats_fs.c b/fs/stats_fs/stats_fs.c
new file mode 100644
index 000000000000..b63de12769e2
--- /dev/null
+++ b/fs/stats_fs/stats_fs.c
@@ -0,0 +1,610 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/module.h>
+#include <linux/errno.h>
+#include <linux/file.h>
+#include <linux/fs.h>
+#include <linux/slab.h>
+#include <linux/rwsem.h>
+#include <linux/list.h>
+#include <linux/kref.h>
+#include <linux/limits.h>
+#include <linux/stats_fs.h>
+
+#include "internal.h"
+
+struct stats_fs_aggregate_value {
+ uint64_t sum, min, max;
+ uint32_t count, count_zero;
+};
+
+static int is_val_signed(struct stats_fs_value *val)
+{
+ return val->type & STATS_FS_SIGN;
+}
+
+int stats_fs_val_get_mode(struct stats_fs_value *val)
+{
+ return val->mode ? val->mode : 0644;
+}
+
+static struct stats_fs_value *find_value(struct stats_fs_value_source *src,
+ struct stats_fs_value *val)
+{
+ struct stats_fs_value *entry;
+
+ for (entry = src->values; entry->name; entry++) {
+ if (entry == val)
+ return entry;
+ }
+ return NULL;
+}
+
+static struct stats_fs_value *
+search_value_in_source(struct stats_fs_source *src, struct stats_fs_value *arg,
+ struct stats_fs_value_source **val_src)
+{
+ struct stats_fs_value *entry;
+ struct stats_fs_value_source *src_entry;
+
+ list_for_each_entry (src_entry, &src->values_head, list_element) {
+ entry = find_value(src_entry, arg);
+ if (entry) {
+ *val_src = src_entry;
+ return entry;
+ }
+ }
+
+ return NULL;
+}
+
+/* Called with rwsem held for writing */
+static struct stats_fs_value_source *create_value_source(void *base)
+{
+ struct stats_fs_value_source *val_src;
+
+ val_src = kzalloc(sizeof(struct stats_fs_value_source), GFP_KERNEL);
+ if (!val_src)
+ return ERR_PTR(-ENOMEM);
+
+ val_src->base_addr = base;
+ INIT_LIST_HEAD(&val_src->list_element);
+
+ return val_src;
+}
+
+int stats_fs_source_add_values(struct stats_fs_source *source,
+ struct stats_fs_value *stat, void *ptr)
+{
+ struct stats_fs_value_source *val_src;
+ struct stats_fs_value_source *entry;
+
+ down_write(&source->rwsem);
+
+ list_for_each_entry (entry, &source->values_head, list_element) {
+ if (entry->base_addr == ptr && entry->values == stat) {
+ up_write(&source->rwsem);
+ return -EEXIST;
+ }
+ }
+
+ val_src = create_value_source(ptr);
+ val_src->values = (struct stats_fs_value *)stat;
+
+ /* add the val_src to the source list */
+ list_add(&val_src->list_element, &source->values_head);
+
+ up_write(&source->rwsem);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(stats_fs_source_add_values);
+
+void stats_fs_source_add_subordinate(struct stats_fs_source *source,
+ struct stats_fs_source *sub)
+{
+ down_write(&source->rwsem);
+
+ stats_fs_source_get(sub);
+ list_add(&sub->list_element, &source->subordinates_head);
+
+ up_write(&source->rwsem);
+}
+EXPORT_SYMBOL_GPL(stats_fs_source_add_subordinate);
+
+/* Called with rwsem held for writing */
+static void
+stats_fs_source_remove_subordinate_locked(struct stats_fs_source *source,
+ struct stats_fs_source *sub)
+{
+ struct stats_fs_source *src_entry;
+
+ list_for_each_entry (src_entry, &source->subordinates_head,
+ list_element) {
+ if (src_entry == sub) {
+ list_del_init(&src_entry->list_element);
+ stats_fs_source_put(src_entry);
+ return;
+ }
+ }
+}
+
+void stats_fs_source_remove_subordinate(struct stats_fs_source *source,
+ struct stats_fs_source *sub)
+{
+ down_write(&source->rwsem);
+ stats_fs_source_remove_subordinate_locked(source, sub);
+ up_write(&source->rwsem);
+}
+EXPORT_SYMBOL_GPL(stats_fs_source_remove_subordinate);
+
+/* Called with rwsem held for reading */
+static uint64_t get_simple_value(struct stats_fs_value_source *src,
+ struct stats_fs_value *val)
+{
+ uint64_t value_found;
+ void *address;
+
+ address = src->base_addr + val->offset;
+
+ switch (val->type) {
+ case STATS_FS_U8:
+ value_found = *((uint8_t *)address);
+ break;
+ case STATS_FS_U8 | STATS_FS_SIGN:
+ value_found = *((int8_t *)address);
+ break;
+ case STATS_FS_U16:
+ value_found = *((uint16_t *)address);
+ break;
+ case STATS_FS_U16 | STATS_FS_SIGN:
+ value_found = *((int16_t *)address);
+ break;
+ case STATS_FS_U32:
+ value_found = *((uint32_t *)address);
+ break;
+ case STATS_FS_U32 | STATS_FS_SIGN:
+ value_found = *((int32_t *)address);
+ break;
+ case STATS_FS_U64:
+ value_found = *((uint64_t *)address);
+ break;
+ case STATS_FS_U64 | STATS_FS_SIGN:
+ value_found = *((int64_t *)address);
+ break;
+ case STATS_FS_BOOL:
+ value_found = *((uint8_t *)address);
+ break;
+ default:
+ value_found = 0;
+ break;
+ }
+
+ return value_found;
+}
+
+/* Called with rwsem held for reading */
+static void clear_simple_value(struct stats_fs_value_source *src,
+ struct stats_fs_value *val)
+{
+ void *address;
+
+ address = src->base_addr + val->offset;
+
+ switch (val->type) {
+ case STATS_FS_U8:
+ *((uint8_t *)address) = 0;
+ break;
+ case STATS_FS_U8 | STATS_FS_SIGN:
+ *((int8_t *)address) = 0;
+ break;
+ case STATS_FS_U16:
+ *((uint16_t *)address) = 0;
+ break;
+ case STATS_FS_U16 | STATS_FS_SIGN:
+ *((int16_t *)address) = 0;
+ break;
+ case STATS_FS_U32:
+ *((uint32_t *)address) = 0;
+ break;
+ case STATS_FS_U32 | STATS_FS_SIGN:
+ *((int32_t *)address) = 0;
+ break;
+ case STATS_FS_U64:
+ *((uint64_t *)address) = 0;
+ break;
+ case STATS_FS_U64 | STATS_FS_SIGN:
+ *((int64_t *)address) = 0;
+ break;
+ case STATS_FS_BOOL:
+ *((uint8_t *)address) = 0;
+ break;
+ default:
+ break;
+ }
+}
+
+/* Called with rwsem held for reading */
+static void
+search_all_simple_values(struct stats_fs_source *src,
+ struct stats_fs_value_source *ref_src_entry,
+ struct stats_fs_value *val,
+ struct stats_fs_aggregate_value *agg)
+{
+ struct stats_fs_value_source *src_entry;
+ uint64_t value_found;
+
+ list_for_each_entry (src_entry, &src->values_head, list_element) {
+ /* skip aggregates */
+ if (src_entry->base_addr == NULL)
+ continue;
+
+ /* useless to search here */
+ if (src_entry->values != ref_src_entry->values)
+ continue;
+
+ /* must be here */
+ value_found = get_simple_value(src_entry, val);
+ agg->sum += value_found;
+ agg->count++;
+ agg->count_zero += (value_found == 0);
+
+ if (is_val_signed(val)) {
+ agg->max = (((int64_t)value_found) >=
+ ((int64_t)agg->max)) ?
+ value_found :
+ agg->max;
+ agg->min = (((int64_t)value_found) <=
+ ((int64_t)agg->min)) ?
+ value_found :
+ agg->min;
+ } else {
+ agg->max = (value_found >= agg->max) ? value_found :
+ agg->max;
+ agg->min = (value_found <= agg->min) ? value_found :
+ agg->min;
+ }
+ }
+}
+
+/* Called with rwsem held for reading */
+static void
+do_recursive_aggregation(struct stats_fs_source *root,
+ struct stats_fs_value_source *ref_src_entry,
+ struct stats_fs_value *val,
+ struct stats_fs_aggregate_value *agg)
+{
+ struct stats_fs_source *subordinate;
+
+ /* search all simple values in this folder */
+ search_all_simple_values(root, ref_src_entry, val, agg);
+
+ /* recursively search in all subfolders */
+ list_for_each_entry (subordinate, &root->subordinates_head,
+ list_element) {
+ down_read(&subordinate->rwsem);
+ do_recursive_aggregation(subordinate, ref_src_entry, val, agg);
+ up_read(&subordinate->rwsem);
+ }
+}
+
+/* Called with rwsem held for reading */
+static void init_aggregate_value(struct stats_fs_aggregate_value *agg,
+ struct stats_fs_value *val)
+{
+ agg->count = agg->count_zero = agg->sum = 0;
+ if (is_val_signed(val)) {
+ agg->max = S64_MIN;
+ agg->min = S64_MAX;
+ } else {
+ agg->max = 0;
+ agg->min = U64_MAX;
+ }
+}
+
+/* Called with rwsem held for reading */
+static void store_final_value(struct stats_fs_aggregate_value *agg,
+ struct stats_fs_value *val, uint64_t *ret)
+{
+ int operation;
+
+ operation = val->aggr_kind | is_val_signed(val);
+
+ switch (operation) {
+ case STATS_FS_AVG:
+ *ret = agg->count ? agg->sum / agg->count : 0;
+ break;
+ case STATS_FS_AVG | STATS_FS_SIGN:
+ *ret = agg->count ? ((int64_t)agg->sum) / agg->count : 0;
+ break;
+ case STATS_FS_SUM:
+ case STATS_FS_SUM | STATS_FS_SIGN:
+ *ret = agg->sum;
+ break;
+ case STATS_FS_MIN:
+ case STATS_FS_MIN | STATS_FS_SIGN:
+ *ret = agg->min;
+ break;
+ case STATS_FS_MAX:
+ case STATS_FS_MAX | STATS_FS_SIGN:
+ *ret = agg->max;
+ break;
+ case STATS_FS_COUNT_ZERO:
+ case STATS_FS_COUNT_ZERO | STATS_FS_SIGN:
+ *ret = agg->count_zero;
+ break;
+ default:
+ break;
+ }
+}
+
+/* Called with rwsem held for reading */
+static int stats_fs_source_get_value_locked(struct stats_fs_source *source,
+ struct stats_fs_value *arg,
+ uint64_t *ret)
+{
+ struct stats_fs_value_source *src_entry;
+ struct stats_fs_value *found;
+ struct stats_fs_aggregate_value aggr;
+
+ *ret = 0;
+
+ if (!arg)
+ return -ENOENT;
+
+ /* look in simple values */
+ found = search_value_in_source(source, arg, &src_entry);
+
+ if (!found) {
+ printk(KERN_ERR "Stats_fs: Value in source \"%s\" not found!\n",
+ source->name);
+ return -ENOENT;
+ }
+
+ if (src_entry->base_addr != NULL) {
+ *ret = get_simple_value(src_entry, found);
+ return 0;
+ }
+
+ /* look in aggregates */
+ init_aggregate_value(&aggr, found);
+ do_recursive_aggregation(source, src_entry, found, &aggr);
+ store_final_value(&aggr, found, ret);
+
+ return 0;
+}
+
+int stats_fs_source_get_value(struct stats_fs_source *source,
+ struct stats_fs_value *arg, uint64_t *ret)
+{
+ int retval;
+
+ down_read(&source->rwsem);
+ retval = stats_fs_source_get_value_locked(source, arg, ret);
+ up_read(&source->rwsem);
+
+ return retval;
+}
+EXPORT_SYMBOL_GPL(stats_fs_source_get_value);
+
+/* Called with rwsem held for reading */
+static void set_all_simple_values(struct stats_fs_source *src,
+ struct stats_fs_value_source *ref_src_entry,
+ struct stats_fs_value *val)
+{
+ struct stats_fs_value_source *src_entry;
+
+ list_for_each_entry (src_entry, &src->values_head, list_element) {
+ /* skip aggregates */
+ if (src_entry->base_addr == NULL)
+ continue;
+
+ /* wrong to search here */
+ if (src_entry->values != ref_src_entry->values)
+ continue;
+
+ if (src_entry->base_addr &&
+ src_entry->values == ref_src_entry->values)
+ clear_simple_value(src_entry, val);
+ }
+}
+
+/* Called with rwsem held for reading */
+static void do_recursive_clean(struct stats_fs_source *root,
+ struct stats_fs_value_source *ref_src_entry,
+ struct stats_fs_value *val)
+{
+ struct stats_fs_source *subordinate;
+
+ /* search all simple values in this folder */
+ set_all_simple_values(root, ref_src_entry, val);
+
+ /* recursively search in all subfolders */
+ list_for_each_entry (subordinate, &root->subordinates_head,
+ list_element) {
+ down_read(&subordinate->rwsem);
+ do_recursive_clean(subordinate, ref_src_entry, val);
+ up_read(&subordinate->rwsem);
+ }
+}
+
+/* Called with rwsem held for reading */
+static int stats_fs_source_clear_locked(struct stats_fs_source *source,
+ struct stats_fs_value *val)
+{
+ struct stats_fs_value_source *src_entry;
+ struct stats_fs_value *found;
+
+ if (!val)
+ return -ENOENT;
+
+ /* look in simple values */
+ found = search_value_in_source(source, val, &src_entry);
+
+ if (!found) {
+ printk(KERN_ERR "Stats_fs: Value in source \"%s\" not found!\n",
+ source->name);
+ return -ENOENT;
+ }
+
+ if (src_entry->base_addr != NULL) {
+ clear_simple_value(src_entry, found);
+ return 0;
+ }
+
+ /* look in aggregates */
+ do_recursive_clean(source, src_entry, found);
+
+ return 0;
+}
+
+int stats_fs_source_clear(struct stats_fs_source *source,
+ struct stats_fs_value *val)
+{
+ int retval;
+
+ down_read(&source->rwsem);
+ retval = stats_fs_source_clear_locked(source, val);
+ up_read(&source->rwsem);
+
+ return retval;
+}
+
+/* Called with rwsem held for reading */
+static struct stats_fs_value *
+find_value_by_name(struct stats_fs_value_source *src, char *val)
+{
+ struct stats_fs_value *entry;
+
+ for (entry = src->values; entry->name; entry++)
+ if (!strcmp(entry->name, val))
+ return entry;
+
+ return NULL;
+}
+
+/* Called with rwsem held for reading */
+static struct stats_fs_value *
+search_in_source_by_name(struct stats_fs_source *src, char *name)
+{
+ struct stats_fs_value *entry;
+ struct stats_fs_value_source *src_entry;
+
+ list_for_each_entry (src_entry, &src->values_head, list_element) {
+ entry = find_value_by_name(src_entry, name);
+ if (entry)
+ return entry;
+ }
+
+ return NULL;
+}
+
+int stats_fs_source_get_value_by_name(struct stats_fs_source *source,
+ char *name, uint64_t *ret)
+{
+ struct stats_fs_value *val;
+ int retval;
+
+ down_read(&source->rwsem);
+ val = search_in_source_by_name(source, name);
+
+ if (!val) {
+ *ret = 0;
+ up_read(&source->rwsem);
+ return -ENOENT;
+ }
+
+ retval = stats_fs_source_get_value_locked(source, val, ret);
+ up_read(&source->rwsem);
+
+ return retval;
+}
+EXPORT_SYMBOL_GPL(stats_fs_source_get_value_by_name);
+
+void stats_fs_source_get(struct stats_fs_source *source)
+{
+ kref_get(&source->refcount);
+}
+EXPORT_SYMBOL_GPL(stats_fs_source_get);
+
+void stats_fs_source_revoke(struct stats_fs_source *source)
+{
+ struct stats_fs_value_source *val_src_entry;
+
+ down_write(&source->rwsem);
+
+ list_for_each_entry (val_src_entry, &source->values_head, list_element)
+ val_src_entry->base_addr = NULL;
+
+ up_write(&source->rwsem);
+}
+EXPORT_SYMBOL_GPL(stats_fs_source_revoke);
+
+/* Called with rwsem held for writing
+ *
+ * The refcount is 0 and the lock was taken before refcount
+ * went from 1 to 0
+ */
+static void stats_fs_source_destroy(struct kref *kref_source)
+{
+ struct stats_fs_value_source *val_src_entry;
+ struct list_head *it, *safe;
+ struct stats_fs_source *child, *source;
+
+ source = container_of(kref_source, struct stats_fs_source, refcount);
+
+ /* iterate through the values and delete them */
+ list_for_each_safe (it, safe, &source->values_head) {
+ val_src_entry = list_entry(it, struct stats_fs_value_source,
+ list_element);
+ kfree(val_src_entry);
+ }
+
+ /* iterate through the subordinates and delete them */
+ list_for_each_safe (it, safe, &source->subordinates_head) {
+ child = list_entry(it, struct stats_fs_source, list_element);
+ stats_fs_source_remove_subordinate_locked(source, child);
+ }
+
+ up_write(&source->rwsem);
+ kfree(source->name);
+ kfree(source);
+}
+
+void stats_fs_source_put(struct stats_fs_source *source)
+{
+ kref_put_rwsem(&source->refcount, stats_fs_source_destroy,
+ &source->rwsem);
+}
+EXPORT_SYMBOL_GPL(stats_fs_source_put);
+
+struct stats_fs_source *stats_fs_source_create(const char *fmt, ...)
+{
+ va_list ap;
+ char buf[100];
+ struct stats_fs_source *ret;
+ int char_needed;
+
+ va_start(ap, fmt);
+ char_needed = vsnprintf(buf, 100, fmt, ap);
+ va_end(ap);
+
+ ret = kzalloc(sizeof(struct stats_fs_source), GFP_KERNEL);
+ if (!ret)
+ return ERR_PTR(-ENOMEM);
+
+ ret->name = kstrdup(buf, GFP_KERNEL);
+ if (!ret->name) {
+ kfree(ret);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ kref_init(&ret->refcount);
+ init_rwsem(&ret->rwsem);
+
+ INIT_LIST_HEAD(&ret->values_head);
+ INIT_LIST_HEAD(&ret->subordinates_head);
+ INIT_LIST_HEAD(&ret->list_element);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(stats_fs_source_create);
diff --git a/include/linux/stats_fs.h b/include/linux/stats_fs.h
new file mode 100644
index 000000000000..dc2d2e11f5ea
--- /dev/null
+++ b/include/linux/stats_fs.h
@@ -0,0 +1,289 @@
+/* SPDX-License-Identifier: GPL-2.0
+ *
+ * stats_fs.h - a tiny little statistics file system
+ *
+ * Copyright (C) 2020 Emanuele Giuseppe Esposito
+ * Copyright (C) 2020 Redhat.
+ *
+ */
+
+#ifndef _STATS_FS_H_
+#define _STATS_FS_H_
+
+#include <linux/list.h>
+
+/* Used to distinguish signed types */
+#define STATS_FS_SIGN 0x8000
+
+enum stat_type {
+ STATS_FS_U8 = 0,
+ STATS_FS_U16 = 1,
+ STATS_FS_U32 = 2,
+ STATS_FS_U64 = 3,
+ STATS_FS_BOOL = 4,
+ STATS_FS_S8 = STATS_FS_U8 | STATS_FS_SIGN,
+ STATS_FS_S16 = STATS_FS_U16 | STATS_FS_SIGN,
+ STATS_FS_S32 = STATS_FS_U32 | STATS_FS_SIGN,
+ STATS_FS_S64 = STATS_FS_U64 | STATS_FS_SIGN,
+};
+
+enum stat_aggr {
+ STATS_FS_NONE = 0,
+ STATS_FS_SUM,
+ STATS_FS_MIN,
+ STATS_FS_MAX,
+ STATS_FS_COUNT_ZERO,
+ STATS_FS_AVG,
+};
+
+struct stats_fs_value {
+ /* Name of the stat */
+ char *name;
+
+ /* Offset from base address to field containing the value */
+ int offset;
+
+ /* Type of the stat BOOL,U64,... */
+ enum stat_type type;
+
+ /* Aggregate type: MIN, MAX, SUM,... */
+ enum stat_aggr aggr_kind;
+
+ /* File mode */
+ uint16_t mode;
+};
+
+struct stats_fs_source {
+ struct kref refcount;
+
+ char *name;
+
+ /* list of source stats_fs_value_source*/
+ struct list_head values_head;
+
+ /* list of struct stats_fs_source for subordinate sources */
+ struct list_head subordinates_head;
+
+ struct list_head list_element;
+
+ struct rw_semaphore rwsem;
+
+ struct dentry *source_dentry;
+};
+
+#if defined(CONFIG_STATS_FS)
+
+/**
+ * stats_fs_source_create - create a stats_fs_source
+ * Creates a stats_fs_source with the given name. This
+ * does not mean it will be backed by the filesystem yet, it will only
+ * be visible to the user once one of its parents (or itself) are
+ * registered in stats_fs.
+ *
+ * Returns a pointer to a stats_fs_source if it succeeds.
+ * This or one of the parents' pointer must be passed to the stats_fs_put()
+ * function when the file is to be removed. If an error occurs,
+ * ERR_PTR(-ERROR) will be returned.
+ */
+struct stats_fs_source *stats_fs_source_create(const char *fmt, ...);
+
+/**
+ * stats_fs_source_add_values - adds values to the given source
+ * @source: a pointer to the source that will receive the values
+ * @val: a pointer to the NULL terminated stats_fs_value array to add
+ * @base_ptr: a pointer to the base pointer used by these values
+ *
+ * In addition to adding values to the source, also create the
+ * files in the filesystem if the source already is backed up by a directory.
+ *
+ * Returns 0 it succeeds. If the value are already in the
+ * source and have the same base_ptr, -EEXIST is returned.
+ */
+int stats_fs_source_add_values(struct stats_fs_source *source,
+ struct stats_fs_value *val, void *base_ptr);
+
+/**
+ * stats_fs_source_add_subordinate - adds a child to the given source
+ * @parent: a pointer to the parent source
+ * @child: a pointer to child source to add
+ *
+ * Recursively create all files in the stats_fs filesystem
+ * only if the parent has already a dentry (created with
+ * stats_fs_source_register).
+ * This avoids the case where this function is called before register.
+ */
+void stats_fs_source_add_subordinate(struct stats_fs_source *parent,
+ struct stats_fs_source *child);
+
+/**
+ * stats_fs_source_remove_subordinate - removes a child from the given source
+ * @parent: a pointer to the parent source
+ * @child: a pointer to child source to remove
+ *
+ * Look if there is such child in the parent. If so,
+ * it will remove all its files and call stats_fs_put on the child.
+ */
+void stats_fs_source_remove_subordinate(struct stats_fs_source *parent,
+ struct stats_fs_source *child);
+
+/**
+ * stats_fs_source_get_value - search a value in the source (and
+ * subordinates)
+ * @source: a pointer to the source that will be searched
+ * @val: a pointer to the stats_fs_value to search
+ * @ret: a pointer to the uint64_t that will hold the found value
+ *
+ * Look up in the source if a value with same value pointer
+ * exists.
+ * If not, it will return -ENOENT. If it exists and it's a simple value
+ * (not an aggregate), the value that it points to will be returned.
+ * If it exists and it's an aggregate (aggr_type != STATS_FS_NONE), all
+ * subordinates will be recursively searched and every simple value match
+ * will be used to aggregate the final result. For example if it's a sum,
+ * all suboordinates having the same value will be sum together.
+ *
+ * This function will return 0 it succeeds.
+ */
+int stats_fs_source_get_value(struct stats_fs_source *source,
+ struct stats_fs_value *val, uint64_t *ret);
+
+/**
+ * stats_fs_source_get_value_by_name - search a value in the source (and
+ * subordinates)
+ * @source: a pointer to the source that will be searched
+ * @name: a pointer to the string representing the value to search
+ * (for example "exits")
+ * @ret: a pointer to the uint64_t that will hold the found value
+ *
+ * Same as stats_fs_source_get_value, but initially the name is used
+ * to search in the given source if there is a value with a matching
+ * name. If so, stats_fs_source_get_value will be called with the found
+ * value, otherwise -ENOENT will be returned.
+ */
+int stats_fs_source_get_value_by_name(struct stats_fs_source *source,
+ char *name, uint64_t *ret);
+
+/**
+ * stats_fs_source_clear - search and clears a value in the source (and
+ * subordinates)
+ * @source: a pointer to the source that will be searched
+ * @val: a pointer to the stats_fs_value to search
+ *
+ * Look up in the source if a value with same value pointer
+ * exists.
+ * If not, it will return -ENOENT. If it exists and it's a simple value
+ * (not an aggregate), the value that it points to will be set to 0.
+ * If it exists and it's an aggregate (aggr_type != STATS_FS_NONE), all
+ * subordinates will be recursively searched and every simple value match
+ * will be set to 0.
+ *
+ * This function will return 0 it succeeds.
+ */
+int stats_fs_source_clear(struct stats_fs_source *source,
+ struct stats_fs_value *val);
+
+/**
+ * stats_fs_source_revoke - disconnect the source from its backing data
+ * @source: a pointer to the source that will be revoked
+ *
+ * Ensure that stats_fs will not access the data that were passed to
+ * stats_fs_source_add_value for this source.
+ *
+ * Because open files increase the reference count for a stats_fs_source,
+ * the source can end up living longer than the data that provides the
+ * values for the source. Calling stats_fs_source_revoke just before the
+ * backing data is freed avoids accesses to freed data structures. The
+ * sources will return 0.
+ */
+void stats_fs_source_revoke(struct stats_fs_source *source);
+
+/**
+ * stats_fs_source_get - increases refcount of source
+ * @source: a pointer to the source whose refcount will be increased
+ */
+void stats_fs_source_get(struct stats_fs_source *source);
+
+/**
+ * stats_fs_source_put - decreases refcount of source and deletes if needed
+ * @source: a pointer to the source whose refcount will be decreased
+ *
+ * If refcount arrives to zero, take care of deleting
+ * and free the source resources and files, by firstly recursively calling
+ * stats_fs_source_remove_subordinate to the child and then deleting
+ * its own files and allocations.
+ */
+void stats_fs_source_put(struct stats_fs_source *source);
+
+/**
+ * stats_fs_initialized - returns true if stats_fs fs has been registered
+ */
+bool stats_fs_initialized(void);
+
+#else
+
+#include <linux/err.h>
+
+/*
+ * We do not return NULL from these functions if CONFIG_STATS_FS is not enabled
+ * so users have a chance to detect if there was a real error or not. We don't
+ * want to duplicate the design decision mistakes of procfs and devfs again.
+ */
+
+static inline struct stats_fs_source *stats_fs_source_create(const char *fmt,
+ ...)
+{
+ return ERR_PTR(-ENODEV);
+}
+
+static inline int stats_fs_source_add_values(struct stats_fs_source *source,
+ struct stats_fs_value *val,
+ void *base_ptr)
+{
+ return -ENODEV;
+}
+
+static inline void
+stats_fs_source_add_subordinate(struct stats_fs_source *parent,
+ struct stats_fs_source *child)
+{ }
+
+static inline void
+stats_fs_source_remove_subordinate(struct stats_fs_source *parent,
+ struct stats_fs_source *child)
+{ }
+
+static inline int stats_fs_source_get_value(struct stats_fs_source *source,
+ struct stats_fs_value *val,
+ uint64_t *ret)
+{
+ return -ENODEV;
+}
+
+static inline int
+stats_fs_source_get_value_by_name(struct stats_fs_source *source, char *name,
+ uint64_t *ret)
+{
+ return -ENODEV;
+}
+
+static inline int stats_fs_source_clear(struct stats_fs_source *source,
+ struct stats_fs_value *val)
+{
+ return -ENODEV;
+}
+
+static inline void stats_fs_source_revoke(struct stats_fs_source *source)
+{ }
+
+static inline void stats_fs_source_get(struct stats_fs_source *source)
+{ }
+
+static inline void stats_fs_source_put(struct stats_fs_source *source)
+{ }
+
+static inline bool stats_fs_initialized(void)
+{ }
+
+#endif
+
+#endif
--
2.25.2
^ permalink raw reply related
* [PATCH v2 1/5] refcount, kref: add dec-and-test wrappers for rw_semaphores
From: Emanuele Giuseppe Esposito @ 2020-05-04 11:03 UTC (permalink / raw)
To: kvm
Cc: Emanuele Giuseppe Esposito, linux-s390, David Hildenbrand,
Cornelia Huck, Emanuele Giuseppe Esposito, linux-kernel, kvm-ppc,
linux-mips, Christian Borntraeger, Alexander Viro, linux-fsdevel,
Paolo Bonzini, Vitaly Kuznetsov, linuxppc-dev, Jim Mattson
In-Reply-To: <20200504110344.17560-1-eesposit@redhat.com>
Similar to the existing functions that take a mutex or spinlock if and
only if a reference count is decremented to zero, these new function
take an rwsem for writing just before the refcount reaches 0 (and
call a user-provided function in the case of kref_put_rwsem).
These will be used for stats_fs_source data structures, which are
protected by an rw_semaphore to allow concurrent sysfs reads.
Signed-off-by: Emanuele Giuseppe Esposito <eesposit@redhat.com>
---
include/linux/kref.h | 11 +++++++++++
include/linux/refcount.h | 2 ++
lib/refcount.c | 32 ++++++++++++++++++++++++++++++++
3 files changed, 45 insertions(+)
diff --git a/include/linux/kref.h b/include/linux/kref.h
index d32e21a2538c..2dc935445f45 100644
--- a/include/linux/kref.h
+++ b/include/linux/kref.h
@@ -79,6 +79,17 @@ static inline int kref_put_mutex(struct kref *kref,
return 0;
}
+static inline int kref_put_rwsem(struct kref *kref,
+ void (*release)(struct kref *kref),
+ struct rw_semaphore *rwsem)
+{
+ if (refcount_dec_and_down_write(&kref->refcount, rwsem)) {
+ release(kref);
+ return 1;
+ }
+ return 0;
+}
+
static inline int kref_put_lock(struct kref *kref,
void (*release)(struct kref *kref),
spinlock_t *lock)
diff --git a/include/linux/refcount.h b/include/linux/refcount.h
index 0e3ee25eb156..a9d5038aec9a 100644
--- a/include/linux/refcount.h
+++ b/include/linux/refcount.h
@@ -99,6 +99,7 @@
#include <linux/spinlock_types.h>
struct mutex;
+struct rw_semaphore;
/**
* struct refcount_t - variant of atomic_t specialized for reference counts
@@ -313,6 +314,7 @@ static inline void refcount_dec(refcount_t *r)
extern __must_check bool refcount_dec_if_one(refcount_t *r);
extern __must_check bool refcount_dec_not_one(refcount_t *r);
extern __must_check bool refcount_dec_and_mutex_lock(refcount_t *r, struct mutex *lock);
+extern __must_check bool refcount_dec_and_down_write(refcount_t *r, struct rw_semaphore *rwsem);
extern __must_check bool refcount_dec_and_lock(refcount_t *r, spinlock_t *lock);
extern __must_check bool refcount_dec_and_lock_irqsave(refcount_t *r,
spinlock_t *lock,
diff --git a/lib/refcount.c b/lib/refcount.c
index ebac8b7d15a7..03e113e1b43a 100644
--- a/lib/refcount.c
+++ b/lib/refcount.c
@@ -4,6 +4,7 @@
*/
#include <linux/mutex.h>
+#include <linux/rwsem.h>
#include <linux/refcount.h>
#include <linux/spinlock.h>
#include <linux/bug.h>
@@ -94,6 +95,37 @@ bool refcount_dec_not_one(refcount_t *r)
}
EXPORT_SYMBOL(refcount_dec_not_one);
+/**
+ * refcount_dec_and_down_write - return holding rwsem for writing if able to decrement
+ * refcount to 0
+ * @r: the refcount
+ * @lock: the mutex to be locked
+ *
+ * Similar to atomic_dec_and_mutex_lock(), it will WARN on underflow and fail
+ * to decrement when saturated at REFCOUNT_SATURATED.
+ *
+ * Provides release memory ordering, such that prior loads and stores are done
+ * before, and provides a control dependency such that free() must come after.
+ * See the comment on top.
+ *
+ * Return: true and hold rwsem for writing if able to decrement refcount to 0, false
+ * otherwise
+ */
+bool refcount_dec_and_down_write(refcount_t *r, struct rw_semaphore *lock)
+{
+ if (refcount_dec_not_one(r))
+ return false;
+
+ down_write(lock);
+ if (!refcount_dec_and_test(r)) {
+ up_write(lock);
+ return false;
+ }
+
+ return true;
+}
+EXPORT_SYMBOL(refcount_dec_and_down_write);
+
/**
* refcount_dec_and_mutex_lock - return holding mutex if able to decrement
* refcount to 0
--
2.25.2
^ permalink raw reply related
* Re: [PATCH] powerpc/5200: update contact email
From: Michael Ellerman @ 2020-05-04 10:55 UTC (permalink / raw)
To: Wolfram Sang, linux-kernel
Cc: devicetree, Wolfram Sang, Rob Herring, Paul Mackerras, kernel,
linuxppc-dev
In-Reply-To: <20200502142642.18979-1-wsa@kernel.org>
Wolfram Sang <wsa@kernel.org> writes:
> My 'pengutronix' address is defunct for years. Merge the entries and use
> the proper contact address.
Is there any point adding the new address? It's just likely to bit-rot
one day too.
I figure the git history is a better source for more up-to-date emails.
cheers
> diff --git a/arch/powerpc/boot/dts/pcm032.dts b/arch/powerpc/boot/dts/pcm032.dts
> index c259c6b3ac5a..780e13d99e7b 100644
> --- a/arch/powerpc/boot/dts/pcm032.dts
> +++ b/arch/powerpc/boot/dts/pcm032.dts
> @@ -3,9 +3,7 @@
> * phyCORE-MPC5200B-IO (pcm032) board Device Tree Source
> *
> * Copyright (C) 2006-2009 Pengutronix
> - * Sascha Hauer <s.hauer@pengutronix.de>
> - * Juergen Beisert <j.beisert@pengutronix.de>
> - * Wolfram Sang <w.sang@pengutronix.de>
> + * Sascha Hauer, Juergen Beisert, Wolfram Sang <kernel@pengutronix.de>
> */
>
> /include/ "mpc5200b.dtsi"
> --
> 2.20.1
^ permalink raw reply
* Re: [PATCH] powerpc/64s: Fix unrecoverable SLB crashes due to preemption check
From: Michael Ellerman @ 2020-05-04 10:53 UTC (permalink / raw)
To: Hugh Dickins; +Cc: linuxppc-dev, hughd, npiggin
In-Reply-To: <alpine.LSU.2.11.2005030008400.1557@eggly.anvils>
Hugh Dickins <hughd@google.com> writes:
> On Sun, 3 May 2020, Michael Ellerman wrote:
>
>> Hugh reported that his trusty G5 crashed after a few hours under load
>> with an "Unrecoverable exception 380".
>>
>> The crash is in interrupt_return() where we check lazy_irq_pending(),
>> which calls get_paca() and with CONFIG_DEBUG_PREEMPT=y that goes to
>> check_preemption_disabled() via debug_smp_processor_id().
>>
>> As Nick explained on the list:
>>
>> Problem is MSR[RI] is cleared here, ready to do the last few things
>> for interrupt return where we're not allowed to take any other
>> interrupts.
>>
>> SLB interrupts can happen just about anywhere aside from kernel
>> text, global variables, and stack. When that hits, it appears to be
>> unrecoverable due to RI=0.
>>
>> The problematic access is in preempt_count() which is:
>>
>> return READ_ONCE(current_thread_info()->preempt_count);
>>
>> Because of THREAD_INFO_IN_TASK, current_thread_info() just points to
>> current, so the access is to somewhere in kernel memory, but not on
>> the stack or in .data, which means it can cause an SLB miss. If we
>> take an SLB miss with RI=0 it is fatal.
>>
>> The easiest solution is to add a version of lazy_irq_pending() that
>> doesn't do the preemption check and call it from the interrupt return
>> path.
>>
>> Fixes: 68b34588e202 ("powerpc/64/sycall: Implement syscall entry/exit logic in C")
>> Reported-by: Hugh Dickins <hughd@google.com>
>> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
>
> Thank you, Michael and Nick: this has been running fine all day for me.
Thanks Hugh.
cheers
^ permalink raw reply
* Re: [PATCH 2/7] crypto: powerpc/sha1 - remove unused temporary workspace
From: Michael Ellerman @ 2020-05-04 10:27 UTC (permalink / raw)
To: Eric Biggers, linux-crypto
Cc: Jason A . Donenfeld, Theodore Ts'o, linux-kernel,
Paul Mackerras, linuxppc-dev
In-Reply-To: <20200502182427.104383-3-ebiggers@kernel.org>
Eric Biggers <ebiggers@kernel.org> writes:
> From: Eric Biggers <ebiggers@google.com>
>
> The PowerPC implementation of SHA-1 doesn't actually use the 16-word
> temporary array that's passed to the assembly code. This was probably
> meant to correspond to the 'W' array that lib/sha1.c uses. However, in
> sha1-powerpc-asm.S these values are actually stored in GPRs 16-31.
>
> Referencing SHA_WORKSPACE_WORDS from this code also isn't appropriate,
> since it's an implementation detail of lib/sha1.c.
>
> Therefore, just remove this unneeded array.
>
> Tested with:
>
> export ARCH=powerpc CROSS_COMPILE=powerpc-linux-gnu-
> make mpc85xx_defconfig
> cat >> .config << EOF
> # CONFIG_MODULES is not set
> # CONFIG_CRYPTO_MANAGER_DISABLE_TESTS is not set
> CONFIG_DEBUG_KERNEL=y
> CONFIG_CRYPTO_MANAGER_EXTRA_TESTS=y
> CONFIG_CRYPTO_SHA1_PPC=y
> EOF
> make olddefconfig
> make -j32
> qemu-system-ppc -M mpc8544ds -cpu e500 -nographic \
> -kernel arch/powerpc/boot/zImage \
> -append "cryptomgr.fuzz_iterations=1000 cryptomgr.panic_on_fail=1"
Thanks for testing.
I gave it a quick spin on a Power9 and it showed no issues.
Acked-by: Michael Ellerman <mpe@ellerman.id.au> (powerpc)
cheers
^ permalink raw reply
* Re: [PATCH v2 3/3] mm/page_alloc: Keep memoryless cpuless node 0 offline
From: Michal Hocko @ 2020-05-04 9:37 UTC (permalink / raw)
To: Srikar Dronamraju
Cc: Linus Torvalds, linux-kernel, linux-mm, Mel Gorman,
Kirill A. Shutemov, Andrew Morton, linuxppc-dev,
Christopher Lameter, Vlastimil Babka
In-Reply-To: <20200430071820.GF19958@linux.vnet.ibm.com>
On Thu 30-04-20 12:48:20, Srikar Dronamraju wrote:
> * Michal Hocko <mhocko@kernel.org> [2020-04-29 14:22:11]:
>
> > On Wed 29-04-20 07:11:45, Srikar Dronamraju wrote:
> > > > >
> > > > > By marking, N_ONLINE as NODE_MASK_NONE, lets stop assuming that Node 0 is
> > > > > always online.
> > > > >
> > > > > ...
> > > > >
> > > > > --- a/mm/page_alloc.c
> > > > > +++ b/mm/page_alloc.c
> > > > > @@ -116,8 +116,10 @@ EXPORT_SYMBOL(latent_entropy);
> > > > > */
> > > > > nodemask_t node_states[NR_NODE_STATES] __read_mostly = {
> > > > > [N_POSSIBLE] = NODE_MASK_ALL,
> > > > > +#ifdef CONFIG_NUMA
> > > > > + [N_ONLINE] = NODE_MASK_NONE,
> > > > > +#else
> > > > > [N_ONLINE] = { { [0] = 1UL } },
> > > > > -#ifndef CONFIG_NUMA
> > > > > [N_NORMAL_MEMORY] = { { [0] = 1UL } },
> > > > > #ifdef CONFIG_HIGHMEM
> > > > > [N_HIGH_MEMORY] = { { [0] = 1UL } },
> > > >
> > > > So on all other NUMA machines, when does node 0 get marked online?
> > > >
> > > > This change means that for some time during boot, such machines will
> > > > now be running with node 0 marked as offline. What are the
> > > > implications of this? Will something break?
> > >
> > > Till the nodes are detected, marking Node 0 as online tends to be redundant.
> > > Because the system doesn't know if its a NUMA or a non-NUMA system.
> > > Once we detect the nodes, we online them immediately. Hence I don't see any
> > > side-effects or negative implications of this change.
> > >
> > > However if I am missing anything, please do let me know.
> > >
> > > >From my part, I have tested this on
> > > 1. Non-NUMA Single node but CPUs and memory coming from zero node.
> > > 2. Non-NUMA Single node but CPUs and memory coming from non-zero node.
> > > 3. NUMA Multi node but with CPUs and memory from node 0.
> > > 4. NUMA Multi node but with no CPUs and memory from node 0.
> >
> > Have you tested on something else than ppc? Each arch does the NUMA
> > setup separately and this is a big mess. E.g. x86 marks even memory less
> > nodes (see init_memory_less_node) as online.
> >
>
> while I have predominantly tested on ppc, I did test on X86 with CONFIG_NUMA
> enabled/disabled on both single node and multi node machines.
> However, I dont have a cpuless/memoryless x86 system.
This should be able to emulate inside kvm, I believe.
> > Honestly I have hard time to evaluate the effect of this patch. It makes
> > some sense to assume all nodes offline before they get online but this
> > is a land mine territory.
> >
> > I am also not sure what kind of problem this is going to address. You
> > have mentioned numa balancing without many details.
>
> 1. On a machine with just one node with node number not being 0,
> the current setup will end up showing 2 online nodes. And when there are
> more than one online nodes, numa_balancing gets enabled.
>
> Without patch
> $ grep numa /proc/vmstat
> numa_hit 95179
> numa_miss 0
> numa_foreign 0
> numa_interleave 3764
> numa_local 95179
> numa_other 0
> numa_pte_updates 1206973 <----------
> numa_huge_pte_updates 4654 <----------
> numa_hint_faults 19560 <----------
> numa_hint_faults_local 19560 <----------
> numa_pages_migrated 0
>
>
> With patch
> $ grep numa /proc/vmstat
> numa_hit 322338756
> numa_miss 0
> numa_foreign 0
> numa_interleave 3790
> numa_local 322338756
> numa_other 0
> numa_pte_updates 0 <----------
> numa_huge_pte_updates 0 <----------
> numa_hint_faults 0 <----------
> numa_hint_faults_local 0 <----------
> numa_pages_migrated 0
>
> So we have a redundant page hinting numa faults which we can avoid.
interesting. Does this lead to any observable differences? Btw. it would
be really great to describe how the online state influences the numa
balancing.
> 2. Few people have complained about existence of this dummy node when
> parsing lscpu and numactl o/p. They somehow start to think that the tools
> are reporting incorrectly or the kernel is not able to recognize resources
> connected to the node.
Please be more specific.
--
Michal Hocko
SUSE Labs
^ permalink raw reply
* Re: [PATCH v7 14/28] powerpc: Add a probe_kernel_read_inst() function
From: Alistair Popple @ 2020-05-04 9:24 UTC (permalink / raw)
To: Jordan Niethe; +Cc: npiggin, bala24, naveen.n.rao, linuxppc-dev, dja
In-Reply-To: <20200501034220.8982-15-jniethe5@gmail.com>
> @@ -524,7 +524,10 @@ __ftrace_make_call(struct dyn_ftrace *rec, unsigned
> long addr) struct module *mod = rec->arch.mod;
>
> /* read where this goes */
> - if (probe_kernel_read(op, ip, sizeof(op)))
> + if (probe_kernel_read_inst(op, ip))
> + return -EFAULT;
> +
> + if (probe_kernel_read_inst(op + 1, ip + 4))
> return -EFAULT;
I had to double check the above for what happens when we introduce prefix
instructions but it looks mostly correct. There does however look to be a
corner case that could alter the behaviour once prefix instructions are
introduced.
With prefix instructions probe_kernel_read_inst() will read 8 bytes if the first
4 bytes are a valid prefix. Therefore the above could end up trying to read 12
bytes in total if the ip is a normal instruction and ip+4 is prefixed.
Obviously this is never going to match the expected nop sequence, and prefixed
instructions shouldn't cross page boundaries so the extra 4 bytes should never
be the cause of a fault either. The only difference we might see is
ftrace_make_call() incorrectly returning -EFAULT instead of -EINVAL for an
invalid (ie. crossing a 64 byte boundary) prefix instruction sequence.
In practice this doesn't seem like it would cause any real issues and the rest
of the patch does not appear to change any existing behaviour.
Reviewed-by: Alistair Popple <alistair@popple.id.au>
>
> if (!expected_nop_sequence(ip, op[0], op[1])) {
> @@ -587,7 +590,7 @@ __ftrace_make_call(struct dyn_ftrace *rec, unsigned long
> addr) unsigned long ip = rec->ip;
>
> /* read where this goes */
> - if (probe_kernel_read(&op, (void *)ip, MCOUNT_INSN_SIZE))
> + if (probe_kernel_read_inst(&op, (void *)ip))
> return -EFAULT;
>
> /* It should be pointing to a nop */
> @@ -643,7 +646,7 @@ static int __ftrace_make_call_kernel(struct dyn_ftrace
> *rec, unsigned long addr) }
>
> /* Make sure we have a nop */
> - if (probe_kernel_read(&op, ip, sizeof(op))) {
> + if (probe_kernel_read_inst(&op, ip)) {
> pr_err("Unable to read ftrace location %p\n", ip);
> return -EFAULT;
> }
> @@ -721,7 +724,7 @@ __ftrace_modify_call(struct dyn_ftrace *rec, unsigned
> long old_addr, }
>
> /* read where this goes */
> - if (probe_kernel_read(&op, (void *)ip, sizeof(int))) {
> + if (probe_kernel_read_inst(&op, (void *)ip)) {
> pr_err("Fetching opcode failed.\n");
> return -EFAULT;
> }
> diff --git a/arch/powerpc/lib/inst.c b/arch/powerpc/lib/inst.c
> index eaf786afad2b..08dedd927268 100644
> --- a/arch/powerpc/lib/inst.c
> +++ b/arch/powerpc/lib/inst.c
> @@ -16,3 +16,14 @@ int probe_user_read_inst(struct ppc_inst *inst,
> *inst = ppc_inst(val);
> return err;
> }
> +
> +int probe_kernel_read_inst(struct ppc_inst *inst,
> + struct ppc_inst *src)
> +{
> + unsigned int val;
> + int err;
> +
> + err = probe_kernel_read(&val, src, sizeof(val));
> + *inst = ppc_inst(val);
> + return err;
> +}
^ permalink raw reply
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox