* [RFC PATCH 3/3] mm/nvdimm: Use correct #defines instead of opencoding
From: Aneesh Kumar K.V @ 2019-05-22 6:20 UTC (permalink / raw)
To: dan.j.williams; +Cc: linux-mm, linuxppc-dev, Aneesh Kumar K.V, linux-nvdimm
In-Reply-To: <20190522062057.26581-1-aneesh.kumar@linux.ibm.com>
The npfns-related change is needed to fix the kernel message
"number of pfns truncated from 2617344 to 163584"
The change makes sure the npfns value stored in the superblock is the right value.
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
---
drivers/nvdimm/label.c | 2 +-
drivers/nvdimm/pfn_devs.c | 6 +++---
drivers/nvdimm/region_devs.c | 8 ++++----
3 files changed, 8 insertions(+), 8 deletions(-)
diff --git a/drivers/nvdimm/label.c b/drivers/nvdimm/label.c
index f3d753d3169c..bc6de8fb0153 100644
--- a/drivers/nvdimm/label.c
+++ b/drivers/nvdimm/label.c
@@ -361,7 +361,7 @@ static bool slot_valid(struct nvdimm_drvdata *ndd,
/* check that DPA allocations are page aligned */
if ((__le64_to_cpu(nd_label->dpa)
- | __le64_to_cpu(nd_label->rawsize)) % SZ_4K)
+ | __le64_to_cpu(nd_label->rawsize)) % PAGE_SIZE)
return false;
/* check checksum */
diff --git a/drivers/nvdimm/pfn_devs.c b/drivers/nvdimm/pfn_devs.c
index 94918a4e6e73..f549bddc680c 100644
--- a/drivers/nvdimm/pfn_devs.c
+++ b/drivers/nvdimm/pfn_devs.c
@@ -765,8 +765,8 @@ static int nd_pfn_init(struct nd_pfn *nd_pfn)
* when populating the vmemmap. This *should* be equal to
* PMD_SIZE for most architectures.
*/
- offset = ALIGN(start + reserve + 64 * npfns,
- max(nd_pfn->align, PMD_SIZE)) - start;
+ offset = ALIGN(start + reserve + sizeof(struct page) * npfns,
+ max(nd_pfn->align, PMD_SIZE)) - start;
} else if (nd_pfn->mode == PFN_MODE_RAM)
offset = ALIGN(start + reserve, nd_pfn->align) - start;
else
@@ -778,7 +778,7 @@ static int nd_pfn_init(struct nd_pfn *nd_pfn)
return -ENXIO;
}
- npfns = (size - offset - start_pad - end_trunc) / SZ_4K;
+ npfns = (size - offset - start_pad - end_trunc) / PAGE_SIZE;
pfn_sb->mode = cpu_to_le32(nd_pfn->mode);
pfn_sb->dataoff = cpu_to_le64(offset);
pfn_sb->npfns = cpu_to_le64(npfns);
diff --git a/drivers/nvdimm/region_devs.c b/drivers/nvdimm/region_devs.c
index b4ef7d9ff22e..2d8facea5a03 100644
--- a/drivers/nvdimm/region_devs.c
+++ b/drivers/nvdimm/region_devs.c
@@ -994,10 +994,10 @@ static struct nd_region *nd_region_create(struct nvdimm_bus *nvdimm_bus,
struct nd_mapping_desc *mapping = &ndr_desc->mapping[i];
struct nvdimm *nvdimm = mapping->nvdimm;
- if ((mapping->start | mapping->size) % SZ_4K) {
- dev_err(&nvdimm_bus->dev, "%s: %s mapping%d is not 4K aligned\n",
- caller, dev_name(&nvdimm->dev), i);
-
+ if ((mapping->start | mapping->size) % PAGE_SIZE) {
+ dev_err(&nvdimm_bus->dev,
+ "%s: %s mapping%d is not 4K aligned\n",
+ caller, dev_name(&nvdimm->dev), i);
return NULL;
}
--
2.21.0
^ permalink raw reply related
* Re: [RFC PATCH 1/3] mm/nvdimm: Add PFN_MIN_VERSION support
From: Aneesh Kumar K.V @ 2019-05-22 6:35 UTC (permalink / raw)
To: dan.j.williams; +Cc: linux-mm, linuxppc-dev, linux-nvdimm
In-Reply-To: <20190522062057.26581-1-aneesh.kumar@linux.ibm.com>
On 5/22/19 11:50 AM, Aneesh Kumar K.V wrote:
> This allows us to make changes in a backward incompatible way. I have
> kept the PFN_MIN_VERSION in this patch '0' because we are not introducing
> any incompatible changes in this patch. We also may want to backport this
> to older kernels.
>
> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
> ---
> drivers/nvdimm/pfn.h | 9 ++++++++-
> drivers/nvdimm/pfn_devs.c | 4 ++++
> drivers/nvdimm/pmem.c | 26 ++++++++++++++++++++++----
> 3 files changed, 34 insertions(+), 5 deletions(-)
>
> diff --git a/drivers/nvdimm/pfn.h b/drivers/nvdimm/pfn.h
> index dde9853453d3..1b10ae5773b6 100644
> --- a/drivers/nvdimm/pfn.h
> +++ b/drivers/nvdimm/pfn.h
> @@ -20,6 +20,12 @@
> #define PFN_SIG_LEN 16
> #define PFN_SIG "NVDIMM_PFN_INFO\0"
> #define DAX_SIG "NVDIMM_DAX_INFO\0"
> +/*
> + * increment this when we are making changes such that older
> + * kernel should fail to initialize that namespace.
> + */
> +
> +#define PFN_MIN_VERSION 0
>
> struct nd_pfn_sb {
> u8 signature[PFN_SIG_LEN];
> @@ -36,7 +42,8 @@ struct nd_pfn_sb {
> __le32 end_trunc;
> /* minor-version-2 record the base alignment of the mapping */
> __le32 align;
> - u8 padding[4000];
> + __le16 min_verison;
> + u8 padding[3998];
> __le64 checksum;
> };
>
> diff --git a/drivers/nvdimm/pfn_devs.c b/drivers/nvdimm/pfn_devs.c
> index 01f40672507f..3250de70a7b3 100644
> --- a/drivers/nvdimm/pfn_devs.c
> +++ b/drivers/nvdimm/pfn_devs.c
> @@ -439,6 +439,9 @@ int nd_pfn_validate(struct nd_pfn *nd_pfn, const char *sig)
> if (nvdimm_read_bytes(ndns, SZ_4K, pfn_sb, sizeof(*pfn_sb), 0))
> return -ENXIO;
>
> + if (le16_to_cpu(pfn_sb->min_version > PFN_MIN_VERSION))
> + return -EOPNOTSUPP;
+ if (le16_to_cpu(pfn_sb->min_version) > PFN_MIN_VERSION)
+ return -EOPNOTSUPP;
-aneesh
^ permalink raw reply
* Re: [PATCH] powerpc/powernv: Return for invalid IMC domain
From: Anju T Sudhakar @ 2019-05-22 5:40 UTC (permalink / raw)
To: Michael Ellerman; +Cc: pavsubra, maddy, linuxppc-dev
In-Reply-To: <87sgt86na9.fsf@concordia.ellerman.id.au>
[-- Attachment #1: Type: text/plain, Size: 1733 bytes --]
Hi,
On 5/21/19 5:18 PM, Michael Ellerman wrote:
> Anju T Sudhakar <anju@linux.vnet.ibm.com> writes:
>> Currently init_imc_pmu() can be failed either because
>> an IMC unit with invalid domain(i.e an IMC node not
>> supported by the kernel) is attempted a pmu-registration
>> or something went wrong while registering a valid IMC unit.
>> In both the cases kernel provides a 'Registration failed'
>> error message.
>>
>> Example:
>> Log message, when trace-imc node is not supported by the kernel, and the
>> skiboot supports trace-imc node.
>>
>> So for kernel, trace-imc node is now an unknown domain.
>>
>> [ 1.731870] nest_phb5_imc performance monitor hardware support registered
>> [ 1.731944] nest_powerbus0_imc performance monitor hardware support registered
>> [ 1.734458] thread_imc performance monitor hardware support registered
>> [ 1.734460] IMC Unknown Device type
>> [ 1.734462] IMC PMU (null) Register failed
>> [ 1.734558] nest_xlink0_imc performance monitor hardware support registered
>> [ 1.734614] nest_xlink1_imc performance monitor hardware support registered
>> [ 1.734670] nest_xlink2_imc performance monitor hardware support registered
>> [ 1.747043] Initialise system trusted keyrings
>> [ 1.747054] Key type blacklist registered
>>
>>
>> To avoid ambiguity on the error message, return for invalid domain
>> before attempting a pmu registration.
> What do we print once the patch is applied?
Once the patch is applied, we return for invalid domains, so only the
`/IMC Unknown Device type/` message will be printed for *unknown domains*.
The `/IMC PMU (null) Register failed/` message will appear only if the
registration fails for a *known domain*.
Thanks,
Anju
[-- Attachment #2: Type: text/html, Size: 2463 bytes --]
^ permalink raw reply
* Re: [BISECTED] kexec regression on PowerBook G4
From: Christophe Leroy @ 2019-05-22 7:44 UTC (permalink / raw)
To: Aaro Koskinen, Michael Ellerman; +Cc: linuxppc-dev
In-Reply-To: <90f3557b-400b-60b5-9ff8-d5605adeee79@c-s.fr>
Hi Again,
On 05/22/2019 06:14 AM, Christophe Leroy wrote:
> Hi Aaro,
>
> Le 22/05/2019 à 00:18, Aaro Koskinen a écrit :
>> Hi,
>>
>> I was trying to upgrade from v5.0 -> v5.1 on PowerBook G4, but when
>> trying
>> to kexec a kernel the system gets stuck (no errors seen on the console).
>
> Do you mean you are trying to kexec a v5.1 kernel from a v5.0 kernel, or
> do you have a working v5.1 kernel, but kexec doesn't work with it ?
>
>>
>> Bisected to: 93c4a162b014 ("powerpc/6xx: Store PGDIR physical address
>> in a SPRG"). This commit doesn't revert cleanly anymore but I tested
>> that the one before works OK.
>
> Not sure that's the problem. There was a problem with that commit, but
> it was fixed by 4622a2d43101 ("powerpc/6xx: fix setup and use of
> SPRN_SPRG_PGDIR for hash32").
> You probably hit some commit between those two during bisect, that's
> likely the reason why you ended here.
>
> Can you restart your bisect from 4622a2d43101 ?
>
> If you have CONFIG_SMP, maybe you should also consider taking
> 397d2300b08c ("powerpc/32s: fix flush_hash_pages() on SMP"). Stable
> 5.1.4 includes it.
>
>>
>> With current Linus HEAD (9c7db5004280), it gets a bit further but still
>> doesn't work: now I get an error on the console after kexec "Starting
>> new kernel! ... Bye!":
>>
>> kernel tried to execute exec-protected page (...) - exploit attempt?
>
> Interesting.
>
> Do you have CONFIG_STRICT_KERNEL_RWX=y in your .config ? If so, can you
> retry without it ?
After looking at the code, I don't think CONFIG_STRICT_KERNEL_RWX will
make any difference. Can you try the patch below?
From 8c1039da0d0f26cdf995156a905fc97fe7bda36c Mon Sep 17 00:00:00 2001
From: Christophe Leroy <christophe.leroy@c-s.fr>
Date: Wed, 22 May 2019 07:28:42 +0000
Subject: [PATCH] Fix Kexec
---
arch/powerpc/include/asm/pgtable.h | 2 ++
arch/powerpc/kernel/machine_kexec_32.c | 4 ++++
arch/powerpc/mm/pgtable_32.c | 2 +-
3 files changed, 7 insertions(+), 1 deletion(-)
diff --git a/arch/powerpc/include/asm/pgtable.h
b/arch/powerpc/include/asm/pgtable.h
index 3f53be60fb01..642eea937229 100644
--- a/arch/powerpc/include/asm/pgtable.h
+++ b/arch/powerpc/include/asm/pgtable.h
@@ -140,6 +140,8 @@ static inline void pte_frag_set(mm_context_t *ctx,
void *p)
}
#endif
+int change_page_attr(struct page *page, int numpages, pgprot_t prot);
+
#endif /* __ASSEMBLY__ */
#endif /* _ASM_POWERPC_PGTABLE_H */
diff --git a/arch/powerpc/kernel/machine_kexec_32.c
b/arch/powerpc/kernel/machine_kexec_32.c
index affe5dcce7f4..4f719501e6ae 100644
--- a/arch/powerpc/kernel/machine_kexec_32.c
+++ b/arch/powerpc/kernel/machine_kexec_32.c
@@ -54,6 +54,10 @@ void default_machine_kexec(struct kimage *image)
memcpy((void *)reboot_code_buffer, relocate_new_kernel,
relocate_new_kernel_size);
+ change_page_attr(image->control_code_page,
+ ALIGN(KEXEC_CONTROL_PAGE_SIZE, PAGE_SIZE) >> PAGE_SHIFT,
+ PAGE_KERNEL_TEXT);
+
flush_icache_range(reboot_code_buffer,
reboot_code_buffer + KEXEC_CONTROL_PAGE_SIZE);
printk(KERN_INFO "Bye!\n");
diff --git a/arch/powerpc/mm/pgtable_32.c b/arch/powerpc/mm/pgtable_32.c
index 16ada373b32b..0e4651d803fc 100644
--- a/arch/powerpc/mm/pgtable_32.c
+++ b/arch/powerpc/mm/pgtable_32.c
@@ -340,7 +340,7 @@ static int __change_page_attr_noflush(struct page
*page, pgprot_t prot)
*
* THIS DOES NOTHING WITH BAT MAPPINGS, DEBUG USE ONLY
*/
-static int change_page_attr(struct page *page, int numpages, pgprot_t prot)
+int change_page_attr(struct page *page, int numpages, pgprot_t prot)
{
int i, err = 0;
unsigned long flags;
--
2.13.3
^ permalink raw reply related
* Re: [PATCH 1/2] open: add close_range()
From: Christian Brauner @ 2019-05-22 8:12 UTC (permalink / raw)
To: Linus Torvalds
Cc: linux-ia64, Linux-sh list, Oleg Nesterov, David Howells,
open list:KERNEL SELFTEST FRAMEWORK, sparclinux, Shuah Khan,
linux-arch, linux-s390, Miklos Szeredi, the arch/x86 maintainers,
linux-mips, linux-xtensa, Todd Kjos, Arnd Bergmann, Jann Horn,
linux-m68k, Al Viro, Thomas Gleixner, Dmitry V. Levin, Linux ARM,
Florian Weimer, Parisc List, Linux API, Linux List Kernel Mailing,
alpha, linux-fsdevel, linuxppc-dev
In-Reply-To: <CAHk-=wgtHm4t71oKbykE=awiVv2H2wCy8yH0L_FsyhHQ5OSO+Q@mail.gmail.com>
On Tue, May 21, 2019 at 10:23 PM Linus Torvalds
<torvalds@linux-foundation.org> wrote:
>
> On Tue, May 21, 2019 at 9:41 AM Christian Brauner <christian@brauner.io> wrote:
> >
> > Yeah, you mentioned this before. I do like being able to specify an
> > upper bound to have the ability to place fds strategically after said
> > upper bound.
>
> I suspect that's the case.
>
> And if somebody really wants to just close everything and uses a large
> upper bound, we can - if we really want to - just compare the upper
> bound to the file table size, and do an optimized case for that. We do
> that upper bound comparison anyway to limit the size of the walk, so
> *if* it's a big deal, that case could then do the whole "shrink
> fdtable" case too.
Makes sense.
>
> But I don't believe it's worth optimizing for unless somebody really
> has a load where that is shown to be a big deal. Just do the silly
> and simple loop, and add a cond_resched() in the loop, like
> close_files() does for the "we have a _lot_ of files open" case.
Ok. I will resend a v1 later with the cond_resched() logic you and Al
suggested added.
Thanks!
Christian
^ permalink raw reply
* [RFC PATCH V2 2/3] mm/nvdimm: Add page size and struct page size to pfn superblock
From: Aneesh Kumar K.V @ 2019-05-22 8:27 UTC (permalink / raw)
To: dan.j.williams; +Cc: linux-mm, linuxppc-dev, Aneesh Kumar K.V, linux-nvdimm
In-Reply-To: <20190522082701.6817-1-aneesh.kumar@linux.ibm.com>
This is needed so that we don't wrongly initialize a namespace
which doesn't have enough space reserved for holding struct pages
with the current kernel.
We also increment PFN_MIN_VERSION to make sure that older kernels
won't initialize a namespace created with a newer kernel.
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
---
drivers/nvdimm/pfn.h | 7 +++++--
drivers/nvdimm/pfn_devs.c | 19 ++++++++++++++++++-
2 files changed, 23 insertions(+), 3 deletions(-)
diff --git a/drivers/nvdimm/pfn.h b/drivers/nvdimm/pfn.h
index 5fd29242745a..ba11738ca8a2 100644
--- a/drivers/nvdimm/pfn.h
+++ b/drivers/nvdimm/pfn.h
@@ -25,7 +25,7 @@
* kernel should fail to initialize that namespace.
*/
-#define PFN_MIN_VERSION 0
+#define PFN_MIN_VERSION 1
struct nd_pfn_sb {
u8 signature[PFN_SIG_LEN];
@@ -43,7 +43,10 @@ struct nd_pfn_sb {
/* minor-version-2 record the base alignment of the mapping */
__le32 align;
__le16 min_version;
- u8 padding[3998];
+ /* minor-version-3 record the page size and struct page size */
+ __le16 page_struct_size;
+ __le32 page_size;
+ u8 padding[3992];
__le64 checksum;
};
diff --git a/drivers/nvdimm/pfn_devs.c b/drivers/nvdimm/pfn_devs.c
index a2268cf262f5..39fa8cf8ef58 100644
--- a/drivers/nvdimm/pfn_devs.c
+++ b/drivers/nvdimm/pfn_devs.c
@@ -466,6 +466,15 @@ int nd_pfn_validate(struct nd_pfn *nd_pfn, const char *sig)
if (__le16_to_cpu(pfn_sb->version_minor) < 2)
pfn_sb->align = 0;
+ if (__le16_to_cpu(pfn_sb->version_minor) < 3) {
+ /*
+ * For a large part we use PAGE_SIZE. But we
+ * do have some accounting code using SZ_4K.
+ */
+ pfn_sb->page_struct_size = cpu_to_le16(64);
+ pfn_sb->page_size = cpu_to_le32(SZ_4K);
+ }
+
switch (le32_to_cpu(pfn_sb->mode)) {
case PFN_MODE_RAM:
case PFN_MODE_PMEM:
@@ -481,6 +490,12 @@ int nd_pfn_validate(struct nd_pfn *nd_pfn, const char *sig)
align = 1UL << ilog2(offset);
mode = le32_to_cpu(pfn_sb->mode);
+ if (le32_to_cpu(pfn_sb->page_size) != PAGE_SIZE)
+ return -EOPNOTSUPP;
+
+ if (le16_to_cpu(pfn_sb->page_struct_size) != sizeof(struct page))
+ return -EOPNOTSUPP;
+
if (!nd_pfn->uuid) {
/*
* When probing a namepace via nd_pfn_probe() the uuid
@@ -775,11 +790,13 @@ static int nd_pfn_init(struct nd_pfn *nd_pfn)
memcpy(pfn_sb->uuid, nd_pfn->uuid, 16);
memcpy(pfn_sb->parent_uuid, nd_dev_to_uuid(&ndns->dev), 16);
pfn_sb->version_major = cpu_to_le16(1);
- pfn_sb->version_minor = cpu_to_le16(2);
+ pfn_sb->version_minor = cpu_to_le16(3);
pfn_sb->min_version = cpu_to_le16(PFN_MIN_VERSION);
pfn_sb->start_pad = cpu_to_le32(start_pad);
pfn_sb->end_trunc = cpu_to_le32(end_trunc);
pfn_sb->align = cpu_to_le32(nd_pfn->align);
+ pfn_sb->page_struct_size = cpu_to_le16(sizeof(struct page));
+ pfn_sb->page_size = cpu_to_le32(PAGE_SIZE);
checksum = nd_sb_checksum((struct nd_gen_sb *) pfn_sb);
pfn_sb->checksum = cpu_to_le64(checksum);
--
2.21.0
^ permalink raw reply related
* [RFC PATCH V2 3/3] mm/nvdimm: Use correct #defines instead of opencoding
From: Aneesh Kumar K.V @ 2019-05-22 8:27 UTC (permalink / raw)
To: dan.j.williams; +Cc: linux-mm, linuxppc-dev, Aneesh Kumar K.V, linux-nvdimm
In-Reply-To: <20190522082701.6817-1-aneesh.kumar@linux.ibm.com>
The npfns-related change is needed to fix the kernel message
"number of pfns truncated from 2617344 to 163584"
The change makes sure the npfns value stored in the superblock is the right value.
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
---
drivers/nvdimm/label.c | 2 +-
drivers/nvdimm/pfn_devs.c | 6 +++---
drivers/nvdimm/region_devs.c | 8 ++++----
3 files changed, 8 insertions(+), 8 deletions(-)
diff --git a/drivers/nvdimm/label.c b/drivers/nvdimm/label.c
index f3d753d3169c..bc6de8fb0153 100644
--- a/drivers/nvdimm/label.c
+++ b/drivers/nvdimm/label.c
@@ -361,7 +361,7 @@ static bool slot_valid(struct nvdimm_drvdata *ndd,
/* check that DPA allocations are page aligned */
if ((__le64_to_cpu(nd_label->dpa)
- | __le64_to_cpu(nd_label->rawsize)) % SZ_4K)
+ | __le64_to_cpu(nd_label->rawsize)) % PAGE_SIZE)
return false;
/* check checksum */
diff --git a/drivers/nvdimm/pfn_devs.c b/drivers/nvdimm/pfn_devs.c
index 39fa8cf8ef58..9fc2e514e28a 100644
--- a/drivers/nvdimm/pfn_devs.c
+++ b/drivers/nvdimm/pfn_devs.c
@@ -769,8 +769,8 @@ static int nd_pfn_init(struct nd_pfn *nd_pfn)
* when populating the vmemmap. This *should* be equal to
* PMD_SIZE for most architectures.
*/
- offset = ALIGN(start + reserve + 64 * npfns,
- max(nd_pfn->align, PMD_SIZE)) - start;
+ offset = ALIGN(start + reserve + sizeof(struct page) * npfns,
+ max(nd_pfn->align, PMD_SIZE)) - start;
} else if (nd_pfn->mode == PFN_MODE_RAM)
offset = ALIGN(start + reserve, nd_pfn->align) - start;
else
@@ -782,7 +782,7 @@ static int nd_pfn_init(struct nd_pfn *nd_pfn)
return -ENXIO;
}
- npfns = (size - offset - start_pad - end_trunc) / SZ_4K;
+ npfns = (size - offset - start_pad - end_trunc) / PAGE_SIZE;
pfn_sb->mode = cpu_to_le32(nd_pfn->mode);
pfn_sb->dataoff = cpu_to_le64(offset);
pfn_sb->npfns = cpu_to_le64(npfns);
diff --git a/drivers/nvdimm/region_devs.c b/drivers/nvdimm/region_devs.c
index b4ef7d9ff22e..2d8facea5a03 100644
--- a/drivers/nvdimm/region_devs.c
+++ b/drivers/nvdimm/region_devs.c
@@ -994,10 +994,10 @@ static struct nd_region *nd_region_create(struct nvdimm_bus *nvdimm_bus,
struct nd_mapping_desc *mapping = &ndr_desc->mapping[i];
struct nvdimm *nvdimm = mapping->nvdimm;
- if ((mapping->start | mapping->size) % SZ_4K) {
- dev_err(&nvdimm_bus->dev, "%s: %s mapping%d is not 4K aligned\n",
- caller, dev_name(&nvdimm->dev), i);
-
+ if ((mapping->start | mapping->size) % PAGE_SIZE) {
+ dev_err(&nvdimm_bus->dev,
+ "%s: %s mapping%d is not 4K aligned\n",
+ caller, dev_name(&nvdimm->dev), i);
return NULL;
}
--
2.21.0
^ permalink raw reply related
* [RFC PATCH V2 1/3] mm/nvdimm: Add PFN_MIN_VERSION support
From: Aneesh Kumar K.V @ 2019-05-22 8:26 UTC (permalink / raw)
To: dan.j.williams; +Cc: linux-mm, linuxppc-dev, Aneesh Kumar K.V, linux-nvdimm
This allows us to make changes in a backward incompatible way. I have
kept the PFN_MIN_VERSION in this patch '0' because we are not introducing
any incompatible changes in this patch. We also may want to backport this
to older kernels.
The error looks like
dax0.1: init failed, superblock min version 1, kernel support version 0
and the namespace is marked disabled
$ndctl list -Ni
[
{
"dev":"namespace0.0",
"mode":"fsdax",
"map":"mem",
"size":10737418240,
"uuid":"9605de6d-cefa-4a87-99cd-dec28b02cffe",
"state":"disabled"
}
]
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
---
drivers/nvdimm/pfn.h | 9 ++++++++-
drivers/nvdimm/pfn_devs.c | 8 ++++++++
drivers/nvdimm/pmem.c | 26 ++++++++++++++++++++++----
3 files changed, 38 insertions(+), 5 deletions(-)
diff --git a/drivers/nvdimm/pfn.h b/drivers/nvdimm/pfn.h
index dde9853453d3..5fd29242745a 100644
--- a/drivers/nvdimm/pfn.h
+++ b/drivers/nvdimm/pfn.h
@@ -20,6 +20,12 @@
#define PFN_SIG_LEN 16
#define PFN_SIG "NVDIMM_PFN_INFO\0"
#define DAX_SIG "NVDIMM_DAX_INFO\0"
+/*
+ * increment this when we are making changes such that older
+ * kernel should fail to initialize that namespace.
+ */
+
+#define PFN_MIN_VERSION 0
struct nd_pfn_sb {
u8 signature[PFN_SIG_LEN];
@@ -36,7 +42,8 @@ struct nd_pfn_sb {
__le32 end_trunc;
/* minor-version-2 record the base alignment of the mapping */
__le32 align;
- u8 padding[4000];
+ __le16 min_version;
+ u8 padding[3998];
__le64 checksum;
};
diff --git a/drivers/nvdimm/pfn_devs.c b/drivers/nvdimm/pfn_devs.c
index 01f40672507f..a2268cf262f5 100644
--- a/drivers/nvdimm/pfn_devs.c
+++ b/drivers/nvdimm/pfn_devs.c
@@ -439,6 +439,13 @@ int nd_pfn_validate(struct nd_pfn *nd_pfn, const char *sig)
if (nvdimm_read_bytes(ndns, SZ_4K, pfn_sb, sizeof(*pfn_sb), 0))
return -ENXIO;
+ if (le16_to_cpu(pfn_sb->min_version) > PFN_MIN_VERSION) {
+ dev_err(&nd_pfn->dev,
+ "init failed, superblock min version %ld kernel support version %ld\n",
+ le16_to_cpu(pfn_sb->min_version), PFN_MIN_VERSION);
+ return -EOPNOTSUPP;
+ }
+
if (memcmp(pfn_sb->signature, sig, PFN_SIG_LEN) != 0)
return -ENODEV;
@@ -769,6 +776,7 @@ static int nd_pfn_init(struct nd_pfn *nd_pfn)
memcpy(pfn_sb->parent_uuid, nd_dev_to_uuid(&ndns->dev), 16);
pfn_sb->version_major = cpu_to_le16(1);
pfn_sb->version_minor = cpu_to_le16(2);
+ pfn_sb->min_version = cpu_to_le16(PFN_MIN_VERSION);
pfn_sb->start_pad = cpu_to_le32(start_pad);
pfn_sb->end_trunc = cpu_to_le32(end_trunc);
pfn_sb->align = cpu_to_le32(nd_pfn->align);
diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c
index 845c5b430cdd..406427c064d9 100644
--- a/drivers/nvdimm/pmem.c
+++ b/drivers/nvdimm/pmem.c
@@ -490,6 +490,7 @@ static int pmem_attach_disk(struct device *dev,
static int nd_pmem_probe(struct device *dev)
{
+ int ret;
struct nd_namespace_common *ndns;
ndns = nvdimm_namespace_common_probe(dev);
@@ -505,12 +506,29 @@ static int nd_pmem_probe(struct device *dev)
if (is_nd_pfn(dev))
return pmem_attach_disk(dev, ndns);
- /* if we find a valid info-block we'll come back as that personality */
- if (nd_btt_probe(dev, ndns) == 0 || nd_pfn_probe(dev, ndns) == 0
- || nd_dax_probe(dev, ndns) == 0)
+ ret = nd_btt_probe(dev, ndns);
+ if (ret == 0)
return -ENXIO;
+ else if (ret == -EOPNOTSUPP)
+ return ret;
- /* ...otherwise we're just a raw pmem device */
+ ret = nd_pfn_probe(dev, ndns);
+ if (ret == 0)
+ return -ENXIO;
+ else if (ret == -EOPNOTSUPP)
+ return ret;
+
+ ret = nd_dax_probe(dev, ndns);
+ if (ret == 0)
+ return -ENXIO;
+ else if (ret == -EOPNOTSUPP)
+ return ret;
+ /*
+ * We have two failure conditions here: either there is no
+ * info reserve block, or we found a valid info reserve block
+ * but failed to initialize the pfn superblock.
+ * Don't create a raw pmem disk for the second case.
+ */
return pmem_attach_disk(dev, ndns);
}
--
2.21.0
^ permalink raw reply related
* Re: [RFC PATCH V2 3/3] mm/nvdimm: Use correct #defines instead of opencoding
From: Satheesh Rajendran @ 2019-05-22 9:16 UTC (permalink / raw)
To: Aneesh Kumar K.V; +Cc: linux-mm, dan.j.williams, linuxppc-dev, linux-nvdimm
In-Reply-To: <20190522082701.6817-3-aneesh.kumar@linux.ibm.com>
On Wed, May 22, 2019 at 01:57:01PM +0530, Aneesh Kumar K.V wrote:
> The nfpn related change is needed to fix the kernel message
>
> "number of pfns truncated from 2617344 to 163584"
>
> The change makes sure the nfpns stored in the superblock is right value.
>
> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
> ---
> drivers/nvdimm/label.c | 2 +-
> drivers/nvdimm/pfn_devs.c | 6 +++---
> drivers/nvdimm/region_devs.c | 8 ++++----
> 3 files changed, 8 insertions(+), 8 deletions(-)
>
> diff --git a/drivers/nvdimm/label.c b/drivers/nvdimm/label.c
> index f3d753d3169c..bc6de8fb0153 100644
> --- a/drivers/nvdimm/label.c
> +++ b/drivers/nvdimm/label.c
> @@ -361,7 +361,7 @@ static bool slot_valid(struct nvdimm_drvdata *ndd,
>
> /* check that DPA allocations are page aligned */
> if ((__le64_to_cpu(nd_label->dpa)
> - | __le64_to_cpu(nd_label->rawsize)) % SZ_4K)
> + | __le64_to_cpu(nd_label->rawsize)) % PAGE_SIZE)
> return false;
>
> /* check checksum */
> diff --git a/drivers/nvdimm/pfn_devs.c b/drivers/nvdimm/pfn_devs.c
> index 39fa8cf8ef58..9fc2e514e28a 100644
> --- a/drivers/nvdimm/pfn_devs.c
> +++ b/drivers/nvdimm/pfn_devs.c
> @@ -769,8 +769,8 @@ static int nd_pfn_init(struct nd_pfn *nd_pfn)
> * when populating the vmemmap. This *should* be equal to
> * PMD_SIZE for most architectures.
> */
> - offset = ALIGN(start + reserve + 64 * npfns,
> - max(nd_pfn->align, PMD_SIZE)) - start;
> + offset = ALIGN(start + reserve + sizeof(struct page) * npfns,
> + max(nd_pfn->align, PMD_SIZE)) - start;
> } else if (nd_pfn->mode == PFN_MODE_RAM)
> offset = ALIGN(start + reserve, nd_pfn->align) - start;
> else
> @@ -782,7 +782,7 @@ static int nd_pfn_init(struct nd_pfn *nd_pfn)
> return -ENXIO;
> }
>
> - npfns = (size - offset - start_pad - end_trunc) / SZ_4K;
> + npfns = (size - offset - start_pad - end_trunc) / PAGE_SIZE;
> pfn_sb->mode = cpu_to_le32(nd_pfn->mode);
> pfn_sb->dataoff = cpu_to_le64(offset);
> pfn_sb->npfns = cpu_to_le64(npfns);
> diff --git a/drivers/nvdimm/region_devs.c b/drivers/nvdimm/region_devs.c
> index b4ef7d9ff22e..2d8facea5a03 100644
> --- a/drivers/nvdimm/region_devs.c
> +++ b/drivers/nvdimm/region_devs.c
> @@ -994,10 +994,10 @@ static struct nd_region *nd_region_create(struct nvdimm_bus *nvdimm_bus,
> struct nd_mapping_desc *mapping = &ndr_desc->mapping[i];
> struct nvdimm *nvdimm = mapping->nvdimm;
>
> - if ((mapping->start | mapping->size) % SZ_4K) {
> - dev_err(&nvdimm_bus->dev, "%s: %s mapping%d is not 4K aligned\n",
> - caller, dev_name(&nvdimm->dev), i);
> -
> + if ((mapping->start | mapping->size) % PAGE_SIZE) {
> + dev_err(&nvdimm_bus->dev,
> + "%s: %s mapping%d is not 4K aligned\n",
s/not 4K aligned/not PAGE_SIZE aligned/ ?
I think the error message needs to be changed as well.
Regards,
-Satheesh.
> + caller, dev_name(&nvdimm->dev), i);
> return NULL;
> }
>
> --
> 2.21.0
>
^ permalink raw reply
* [PATCH] spi: spi-fsl-spi: call spi_finalize_current_message() at the end
From: Christophe Leroy @ 2019-05-22 11:00 UTC (permalink / raw)
To: Mark Brown; +Cc: linuxppc-dev, linux-kernel, linux-spi
spi_finalize_current_message() shall be called once all
actions are finished, otherwise the last actions might
step over a newly started transfer.
Fixes: c592becbe704 ("spi: fsl-(e)spi: migrate to generic master queueing")
Signed-off-by: Christophe Leroy <christophe.leroy@c-s.fr>
---
drivers/spi/spi-fsl-spi.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/drivers/spi/spi-fsl-spi.c b/drivers/spi/spi-fsl-spi.c
index b36ac6aa3b1f..7fbdaf066719 100644
--- a/drivers/spi/spi-fsl-spi.c
+++ b/drivers/spi/spi-fsl-spi.c
@@ -432,7 +432,6 @@ static int fsl_spi_do_one_msg(struct spi_master *master,
}
m->status = status;
- spi_finalize_current_message(master);
if (status || !cs_change) {
ndelay(nsecs);
@@ -440,6 +439,7 @@ static int fsl_spi_do_one_msg(struct spi_master *master,
}
fsl_spi_setup_transfer(spi, NULL);
+ spi_finalize_current_message(master);
return 0;
}
--
2.13.3
^ permalink raw reply related
* [PATCH v3 0/3] Fix vDSO clock_getres()
From: Vincenzo Frascino @ 2019-05-22 11:07 UTC (permalink / raw)
To: linux-arch, linuxppc-dev, linux-s390, linux-kselftest
Cc: Arnd Bergmann, Heiko Carstens, Paul Mackerras, Martin Schwidefsky,
Thomas Gleixner, Shuah Khan
clock_getres in the vDSO library has to preserve the same behaviour
of posix_get_hrtimer_res().
In particular, posix_get_hrtimer_res() does:
sec = 0;
ns = hrtimer_resolution;
and hrtimer_resolution depends on the enablement of the high
resolution timers that can happen either at compile or at run time.
A possible fix is to change the vdso implementation of clock_getres,
keeping a copy of hrtimer_resolution in vdso data and using that
directly [1].
This patchset implements the proposed fix for arm64, powerpc, s390,
nds32 and adds a test to verify that the syscall and the vdso library
implementation of clock_getres return the same values.
Even though these patches are unified by the same topic, there is no
dependency between them, hence they can be merged individually by each
arch maintainer.
Note: arm64 and nds32 respective fixes have been merged in 5.2-rc1,
hence they have been removed from this series.
[1] https://marc.info/?l=linux-arm-kernel&m=155110381930196&w=2
Changes:
--------
v3:
- Rebased on 5.2-rc1.
- Addressed review comments.
v2:
- Rebased on 5.1-rc5.
- Addressed review comments.
Cc: Christophe Leroy <christophe.leroy@c-s.fr>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: Shuah Khan <shuah@kernel.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Vincenzo Frascino <vincenzo.frascino@arm.com>
Vincenzo Frascino (3):
powerpc: Fix vDSO clock_getres()
s390: Fix vDSO clock_getres()
kselftest: Extend vDSO selftest to clock_getres
arch/powerpc/include/asm/vdso_datapage.h | 2 +
arch/powerpc/kernel/asm-offsets.c | 2 +-
arch/powerpc/kernel/time.c | 1 +
arch/powerpc/kernel/vdso32/gettimeofday.S | 7 +-
arch/powerpc/kernel/vdso64/gettimeofday.S | 7 +-
arch/s390/include/asm/vdso.h | 1 +
arch/s390/kernel/asm-offsets.c | 2 +-
arch/s390/kernel/time.c | 1 +
arch/s390/kernel/vdso32/clock_getres.S | 12 +-
arch/s390/kernel/vdso64/clock_getres.S | 10 +-
tools/testing/selftests/vDSO/Makefile | 2 +
.../selftests/vDSO/vdso_clock_getres.c | 137 ++++++++++++++++++
12 files changed, 168 insertions(+), 16 deletions(-)
create mode 100644 tools/testing/selftests/vDSO/vdso_clock_getres.c
--
2.21.0
^ permalink raw reply
* [PATCH v3 1/3] powerpc: Fix vDSO clock_getres()
From: Vincenzo Frascino @ 2019-05-22 11:07 UTC (permalink / raw)
To: linux-arch, linuxppc-dev, linux-s390, linux-kselftest
Cc: Arnd Bergmann, Heiko Carstens, stable, Paul Mackerras,
Martin Schwidefsky, Thomas Gleixner, Shuah Khan
In-Reply-To: <20190522110722.28094-1-vincenzo.frascino@arm.com>
clock_getres in the vDSO library has to preserve the same behaviour
of posix_get_hrtimer_res().
In particular, posix_get_hrtimer_res() does:
sec = 0;
ns = hrtimer_resolution;
and hrtimer_resolution depends on the enablement of the high
resolution timers that can happen either at compile or at run time.
Fix the powerpc vdso implementation of clock_getres keeping a copy of
hrtimer_resolution in vdso data and using that directly.
Fixes: a7f290dad32e ("[PATCH] powerpc: Merge vdso's and add vdso support to 32 bits kernel")
Cc: stable@vger.kernel.org
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Signed-off-by: Vincenzo Frascino <vincenzo.frascino@arm.com>
Reviewed-by: Christophe Leroy <christophe.leroy@c-s.fr>
---
Note: This patch is independent from the others in this series, hence it
can be merged singularly by the powerpc maintainers.
arch/powerpc/include/asm/vdso_datapage.h | 2 ++
arch/powerpc/kernel/asm-offsets.c | 2 +-
arch/powerpc/kernel/time.c | 1 +
arch/powerpc/kernel/vdso32/gettimeofday.S | 7 +++++--
arch/powerpc/kernel/vdso64/gettimeofday.S | 7 +++++--
5 files changed, 14 insertions(+), 5 deletions(-)
diff --git a/arch/powerpc/include/asm/vdso_datapage.h b/arch/powerpc/include/asm/vdso_datapage.h
index bbc06bd72b1f..4333b9a473dc 100644
--- a/arch/powerpc/include/asm/vdso_datapage.h
+++ b/arch/powerpc/include/asm/vdso_datapage.h
@@ -86,6 +86,7 @@ struct vdso_data {
__s32 wtom_clock_nsec; /* Wall to monotonic clock nsec */
__s64 wtom_clock_sec; /* Wall to monotonic clock sec */
struct timespec stamp_xtime; /* xtime as at tb_orig_stamp */
+ __u32 hrtimer_res; /* hrtimer resolution */
__u32 syscall_map_64[SYSCALL_MAP_SIZE]; /* map of syscalls */
__u32 syscall_map_32[SYSCALL_MAP_SIZE]; /* map of syscalls */
};
@@ -107,6 +108,7 @@ struct vdso_data {
__s32 wtom_clock_nsec;
struct timespec stamp_xtime; /* xtime as at tb_orig_stamp */
__u32 stamp_sec_fraction; /* fractional seconds of stamp_xtime */
+ __u32 hrtimer_res; /* hrtimer resolution */
__u32 syscall_map_32[SYSCALL_MAP_SIZE]; /* map of syscalls */
__u32 dcache_block_size; /* L1 d-cache block size */
__u32 icache_block_size; /* L1 i-cache block size */
diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c
index 8e02444e9d3d..dfc40f29f2b9 100644
--- a/arch/powerpc/kernel/asm-offsets.c
+++ b/arch/powerpc/kernel/asm-offsets.c
@@ -389,6 +389,7 @@ int main(void)
OFFSET(WTOM_CLOCK_NSEC, vdso_data, wtom_clock_nsec);
OFFSET(STAMP_XTIME, vdso_data, stamp_xtime);
OFFSET(STAMP_SEC_FRAC, vdso_data, stamp_sec_fraction);
+ OFFSET(CLOCK_REALTIME_RES, vdso_data, hrtimer_res);
OFFSET(CFG_ICACHE_BLOCKSZ, vdso_data, icache_block_size);
OFFSET(CFG_DCACHE_BLOCKSZ, vdso_data, dcache_block_size);
OFFSET(CFG_ICACHE_LOGBLOCKSZ, vdso_data, icache_log_block_size);
@@ -419,7 +420,6 @@ int main(void)
DEFINE(CLOCK_REALTIME_COARSE, CLOCK_REALTIME_COARSE);
DEFINE(CLOCK_MONOTONIC_COARSE, CLOCK_MONOTONIC_COARSE);
DEFINE(NSEC_PER_SEC, NSEC_PER_SEC);
- DEFINE(CLOCK_REALTIME_RES, MONOTONIC_RES_NSEC);
#ifdef CONFIG_BUG
DEFINE(BUG_ENTRY_SIZE, sizeof(struct bug_entry));
diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c
index 325d60633dfa..4ea4e9d7a58e 100644
--- a/arch/powerpc/kernel/time.c
+++ b/arch/powerpc/kernel/time.c
@@ -963,6 +963,7 @@ void update_vsyscall(struct timekeeper *tk)
vdso_data->wtom_clock_nsec = tk->wall_to_monotonic.tv_nsec;
vdso_data->stamp_xtime = xt;
vdso_data->stamp_sec_fraction = frac_sec;
+ vdso_data->hrtimer_res = hrtimer_resolution;
smp_wmb();
++(vdso_data->tb_update_count);
}
diff --git a/arch/powerpc/kernel/vdso32/gettimeofday.S b/arch/powerpc/kernel/vdso32/gettimeofday.S
index afd516b572f8..2b5f9e83c610 100644
--- a/arch/powerpc/kernel/vdso32/gettimeofday.S
+++ b/arch/powerpc/kernel/vdso32/gettimeofday.S
@@ -160,12 +160,15 @@ V_FUNCTION_BEGIN(__kernel_clock_getres)
cror cr0*4+eq,cr0*4+eq,cr1*4+eq
bne cr0,99f
+ mflr r12
+ .cfi_register lr,r12
+ bl __get_datapage@local
+ lwz r5,CLOCK_REALTIME_RES(r3)
+ mtlr r12
li r3,0
cmpli cr0,r4,0
crclr cr0*4+so
beqlr
- lis r5,CLOCK_REALTIME_RES@h
- ori r5,r5,CLOCK_REALTIME_RES@l
stw r3,TSPC32_TV_SEC(r4)
stw r5,TSPC32_TV_NSEC(r4)
blr
diff --git a/arch/powerpc/kernel/vdso64/gettimeofday.S b/arch/powerpc/kernel/vdso64/gettimeofday.S
index 1f324c28705b..f07730f73d5e 100644
--- a/arch/powerpc/kernel/vdso64/gettimeofday.S
+++ b/arch/powerpc/kernel/vdso64/gettimeofday.S
@@ -190,12 +190,15 @@ V_FUNCTION_BEGIN(__kernel_clock_getres)
cror cr0*4+eq,cr0*4+eq,cr1*4+eq
bne cr0,99f
+ mflr r12
+ .cfi_register lr,r12
+ bl V_LOCAL_FUNC(__get_datapage)
+ lwz r5,CLOCK_REALTIME_RES(r3)
+ mtlr r12
li r3,0
cmpldi cr0,r4,0
crclr cr0*4+so
beqlr
- lis r5,CLOCK_REALTIME_RES@h
- ori r5,r5,CLOCK_REALTIME_RES@l
std r3,TSPC64_TV_SEC(r4)
std r5,TSPC64_TV_NSEC(r4)
blr
--
2.21.0
^ permalink raw reply related
* [PATCH v3 3/3] kselftest: Extend vDSO selftest to clock_getres
From: Vincenzo Frascino @ 2019-05-22 11:07 UTC (permalink / raw)
To: linux-arch, linuxppc-dev, linux-s390, linux-kselftest
Cc: Arnd Bergmann, Heiko Carstens, Paul Mackerras, Martin Schwidefsky,
Thomas Gleixner, Shuah Khan
In-Reply-To: <20190522110722.28094-1-vincenzo.frascino@arm.com>
The current version of the multiarch vDSO selftest verifies only
gettimeofday.
Extend the vDSO selftest to clock_getres, to verify that the
syscall and the vDSO library function return the same information.
The extension has been used to verify the hrtimer_resolution fix.
Cc: Shuah Khan <shuah@kernel.org>
Signed-off-by: Vincenzo Frascino <vincenzo.frascino@arm.com>
---
Note: This patch is independent of the others in this series, hence it
can be merged on its own by the kselftest maintainers.
tools/testing/selftests/vDSO/Makefile | 2 +
.../selftests/vDSO/vdso_clock_getres.c | 137 ++++++++++++++++++
2 files changed, 139 insertions(+)
create mode 100644 tools/testing/selftests/vDSO/vdso_clock_getres.c
diff --git a/tools/testing/selftests/vDSO/Makefile b/tools/testing/selftests/vDSO/Makefile
index 9e03d61f52fd..d5c5bfdf1ac1 100644
--- a/tools/testing/selftests/vDSO/Makefile
+++ b/tools/testing/selftests/vDSO/Makefile
@@ -5,6 +5,7 @@ uname_M := $(shell uname -m 2>/dev/null || echo not)
ARCH ?= $(shell echo $(uname_M) | sed -e s/i.86/x86/ -e s/x86_64/x86/)
TEST_GEN_PROGS := $(OUTPUT)/vdso_test
+TEST_GEN_PROGS += $(OUTPUT)/vdso_clock_getres
ifeq ($(ARCH),x86)
TEST_GEN_PROGS += $(OUTPUT)/vdso_standalone_test_x86
endif
@@ -18,6 +19,7 @@ endif
all: $(TEST_GEN_PROGS)
$(OUTPUT)/vdso_test: parse_vdso.c vdso_test.c
+$(OUTPUT)/vdso_clock_getres: vdso_clock_getres.c
$(OUTPUT)/vdso_standalone_test_x86: vdso_standalone_test_x86.c parse_vdso.c
$(CC) $(CFLAGS) $(CFLAGS_vdso_standalone_test_x86) \
vdso_standalone_test_x86.c parse_vdso.c \
diff --git a/tools/testing/selftests/vDSO/vdso_clock_getres.c b/tools/testing/selftests/vDSO/vdso_clock_getres.c
new file mode 100644
index 000000000000..341a9bc34ffc
--- /dev/null
+++ b/tools/testing/selftests/vDSO/vdso_clock_getres.c
@@ -0,0 +1,137 @@
+// SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note
+/*
+ * vdso_clock_getres.c: Sample code to test clock_getres.
+ * Copyright (c) 2019 Arm Ltd.
+ *
+ * Compile with:
+ * gcc -std=gnu99 vdso_clock_getres.c
+ *
+ * Tested on ARM, ARM64, MIPS32, x86 (32-bit and 64-bit),
+ * Power (32-bit and 64-bit), S390x (32-bit and 64-bit).
+ * Might work on other architectures.
+ */
+
+#define _GNU_SOURCE
+#include <elf.h>
+#include <err.h>
+#include <fcntl.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <time.h>
+#include <sys/auxv.h>
+#include <sys/mman.h>
+#include <sys/time.h>
+#include <unistd.h>
+#include <sys/syscall.h>
+
+#include "../kselftest.h"
+
+static long syscall_clock_getres(clockid_t _clkid, struct timespec *_ts)
+{
+ long ret;
+
+ ret = syscall(SYS_clock_getres, _clkid, _ts);
+
+ return ret;
+}
+
+const char *vdso_clock_name[12] = {
+ "CLOCK_REALTIME",
+ "CLOCK_MONOTONIC",
+ "CLOCK_PROCESS_CPUTIME_ID",
+ "CLOCK_THREAD_CPUTIME_ID",
+ "CLOCK_MONOTONIC_RAW",
+ "CLOCK_REALTIME_COARSE",
+ "CLOCK_MONOTONIC_COARSE",
+ "CLOCK_BOOTTIME",
+ "CLOCK_REALTIME_ALARM",
+ "CLOCK_BOOTTIME_ALARM",
+ "CLOCK_SGI_CYCLE",
+ "CLOCK_TAI",
+};
+
+/*
+ * Compare the clock_getres() result from the C library (expected to be
+ * served by the vDSO) with the raw system call for clock_id. Both tv_sec
+ * and tv_nsec must match. Returns 0 on match, KSFT_FAIL on mismatch.
+ *
+ * Example of output:
+ * clock_id: CLOCK_REALTIME [PASS]
+ * clock_id: CLOCK_BOOTTIME [PASS]
+ * clock_id: CLOCK_TAI [PASS]
+ * clock_id: CLOCK_REALTIME_COARSE [PASS]
+ * clock_id: CLOCK_MONOTONIC [PASS]
+ * clock_id: CLOCK_MONOTONIC_RAW [PASS]
+ * clock_id: CLOCK_MONOTONIC_COARSE [PASS]
+ */
+static inline int vdso_test_clock(unsigned int clock_id)
+{
+ struct timespec x, y;
+
+ printf("clock_id: %s", vdso_clock_name[clock_id]);
+ clock_getres(clock_id, &x);
+ syscall_clock_getres(clock_id, &y);
+
+ if ((x.tv_sec != y.tv_sec) || (x.tv_nsec != y.tv_nsec)) {
+ printf(" [FAIL]\n");
+ return KSFT_FAIL;
+ }
+
+ printf(" [PASS]\n");
+ return 0;
+}
+
+int main(int argc, char **argv)
+{
+ int ret = 0; /* returned as-is when no clock below is compiled in */
+
+#if _POSIX_TIMERS > 0
+ /* Check every clock the headers define; stop at the first mismatch. */
+#ifdef CLOCK_REALTIME
+ ret = vdso_test_clock(CLOCK_REALTIME);
+ if (ret)
+ goto out;
+#endif
+
+#ifdef CLOCK_BOOTTIME
+ ret = vdso_test_clock(CLOCK_BOOTTIME);
+ if (ret)
+ goto out;
+#endif
+
+#ifdef CLOCK_TAI
+ ret = vdso_test_clock(CLOCK_TAI);
+ if (ret)
+ goto out;
+#endif
+
+#ifdef CLOCK_REALTIME_COARSE
+ ret = vdso_test_clock(CLOCK_REALTIME_COARSE);
+ if (ret)
+ goto out;
+#endif
+
+#ifdef CLOCK_MONOTONIC
+ ret = vdso_test_clock(CLOCK_MONOTONIC);
+ if (ret)
+ goto out;
+#endif
+
+#ifdef CLOCK_MONOTONIC_RAW
+ ret = vdso_test_clock(CLOCK_MONOTONIC_RAW);
+ if (ret)
+ goto out;
+#endif
+
+#ifdef CLOCK_MONOTONIC_COARSE
+ ret = vdso_test_clock(CLOCK_MONOTONIC_COARSE);
+ if (ret)
+ goto out;
+#endif
+
+#endif
+
+out:
+ return ret;
+}
--
2.21.0
^ permalink raw reply related
* [PATCH v3 2/3] s390: Fix vDSO clock_getres()
From: Vincenzo Frascino @ 2019-05-22 11:07 UTC (permalink / raw)
To: linux-arch, linuxppc-dev, linux-s390, linux-kselftest
Cc: Arnd Bergmann, Heiko Carstens, Paul Mackerras, Martin Schwidefsky,
Thomas Gleixner, Shuah Khan
In-Reply-To: <20190522110722.28094-1-vincenzo.frascino@arm.com>
clock_getres in the vDSO library has to preserve the same behaviour
of posix_get_hrtimer_res().
In particular, posix_get_hrtimer_res() does:
sec = 0;
ns = hrtimer_resolution;
and hrtimer_resolution depends on the enablement of the high
resolution timers that can happen either at compile or at run time.
Fix the s390 vdso implementation of clock_getres keeping a copy of
hrtimer_resolution in vdso data and using that directly.
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Signed-off-by: Vincenzo Frascino <vincenzo.frascino@arm.com>
Acked-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
---
Note: This patch is independent of the others in this series, hence it
can be merged on its own by the s390 maintainers.
arch/s390/include/asm/vdso.h | 1 +
arch/s390/kernel/asm-offsets.c | 2 +-
arch/s390/kernel/time.c | 1 +
arch/s390/kernel/vdso32/clock_getres.S | 12 +++++++-----
arch/s390/kernel/vdso64/clock_getres.S | 10 +++++-----
5 files changed, 15 insertions(+), 11 deletions(-)
diff --git a/arch/s390/include/asm/vdso.h b/arch/s390/include/asm/vdso.h
index 169d7604eb80..f3ba84fa9bd1 100644
--- a/arch/s390/include/asm/vdso.h
+++ b/arch/s390/include/asm/vdso.h
@@ -36,6 +36,7 @@ struct vdso_data {
__u32 tk_shift; /* Shift used for xtime_nsec 0x60 */
__u32 ts_dir; /* TOD steering direction 0x64 */
__u64 ts_end; /* TOD steering end 0x68 */
+ __u32 hrtimer_res; /* hrtimer resolution 0x70 */
};
struct vdso_per_cpu_data {
diff --git a/arch/s390/kernel/asm-offsets.c b/arch/s390/kernel/asm-offsets.c
index 41ac4ad21311..4a229a60b24a 100644
--- a/arch/s390/kernel/asm-offsets.c
+++ b/arch/s390/kernel/asm-offsets.c
@@ -76,6 +76,7 @@ int main(void)
OFFSET(__VDSO_TK_SHIFT, vdso_data, tk_shift);
OFFSET(__VDSO_TS_DIR, vdso_data, ts_dir);
OFFSET(__VDSO_TS_END, vdso_data, ts_end);
+ OFFSET(__VDSO_CLOCK_REALTIME_RES, vdso_data, hrtimer_res);
OFFSET(__VDSO_ECTG_BASE, vdso_per_cpu_data, ectg_timer_base);
OFFSET(__VDSO_ECTG_USER, vdso_per_cpu_data, ectg_user_time);
OFFSET(__VDSO_CPU_NR, vdso_per_cpu_data, cpu_nr);
@@ -87,7 +88,6 @@ int main(void)
DEFINE(__CLOCK_REALTIME_COARSE, CLOCK_REALTIME_COARSE);
DEFINE(__CLOCK_MONOTONIC_COARSE, CLOCK_MONOTONIC_COARSE);
DEFINE(__CLOCK_THREAD_CPUTIME_ID, CLOCK_THREAD_CPUTIME_ID);
- DEFINE(__CLOCK_REALTIME_RES, MONOTONIC_RES_NSEC);
DEFINE(__CLOCK_COARSE_RES, LOW_RES_NSEC);
BLANK();
/* idle data offsets */
diff --git a/arch/s390/kernel/time.c b/arch/s390/kernel/time.c
index e8766beee5ad..8ea9db599d38 100644
--- a/arch/s390/kernel/time.c
+++ b/arch/s390/kernel/time.c
@@ -310,6 +310,7 @@ void update_vsyscall(struct timekeeper *tk)
vdso_data->tk_mult = tk->tkr_mono.mult;
vdso_data->tk_shift = tk->tkr_mono.shift;
+ vdso_data->hrtimer_res = hrtimer_resolution;
smp_wmb();
++vdso_data->tb_update_count;
}
diff --git a/arch/s390/kernel/vdso32/clock_getres.S b/arch/s390/kernel/vdso32/clock_getres.S
index eaf9cf1417f6..fecd7684c645 100644
--- a/arch/s390/kernel/vdso32/clock_getres.S
+++ b/arch/s390/kernel/vdso32/clock_getres.S
@@ -18,20 +18,22 @@
__kernel_clock_getres:
CFI_STARTPROC
basr %r1,0
- la %r1,4f-.(%r1)
+10: al %r1,4f-10b(%r1)
+ l %r0,__VDSO_CLOCK_REALTIME_RES(%r1)
chi %r2,__CLOCK_REALTIME
je 0f
chi %r2,__CLOCK_MONOTONIC
je 0f
- la %r1,5f-4f(%r1)
+ basr %r1,0
+ la %r1,5f-.(%r1)
+ l %r0,0(%r1)
chi %r2,__CLOCK_REALTIME_COARSE
je 0f
chi %r2,__CLOCK_MONOTONIC_COARSE
jne 3f
0: ltr %r3,%r3
jz 2f /* res == NULL */
-1: l %r0,0(%r1)
- xc 0(4,%r3),0(%r3) /* set tp->tv_sec to zero */
+1: xc 0(4,%r3),0(%r3) /* set tp->tv_sec to zero */
st %r0,4(%r3) /* store tp->tv_usec */
2: lhi %r2,0
br %r14
@@ -39,6 +41,6 @@ __kernel_clock_getres:
svc 0
br %r14
CFI_ENDPROC
-4: .long __CLOCK_REALTIME_RES
+4: .long _vdso_data - 10b
5: .long __CLOCK_COARSE_RES
.size __kernel_clock_getres,.-__kernel_clock_getres
diff --git a/arch/s390/kernel/vdso64/clock_getres.S b/arch/s390/kernel/vdso64/clock_getres.S
index 081435398e0a..022b58c980db 100644
--- a/arch/s390/kernel/vdso64/clock_getres.S
+++ b/arch/s390/kernel/vdso64/clock_getres.S
@@ -17,12 +17,14 @@
.type __kernel_clock_getres,@function
__kernel_clock_getres:
CFI_STARTPROC
- larl %r1,4f
+ larl %r1,3f
+ lg %r0,0(%r1)
cghi %r2,__CLOCK_REALTIME_COARSE
je 0f
cghi %r2,__CLOCK_MONOTONIC_COARSE
je 0f
- larl %r1,3f
+ larl %r1,_vdso_data
+ l %r0,__VDSO_CLOCK_REALTIME_RES(%r1)
cghi %r2,__CLOCK_REALTIME
je 0f
cghi %r2,__CLOCK_MONOTONIC
@@ -36,7 +38,6 @@ __kernel_clock_getres:
jz 2f
0: ltgr %r3,%r3
jz 1f /* res == NULL */
- lg %r0,0(%r1)
xc 0(8,%r3),0(%r3) /* set tp->tv_sec to zero */
stg %r0,8(%r3) /* store tp->tv_usec */
1: lghi %r2,0
@@ -45,6 +46,5 @@ __kernel_clock_getres:
svc 0
br %r14
CFI_ENDPROC
-3: .quad __CLOCK_REALTIME_RES
-4: .quad __CLOCK_COARSE_RES
+3: .quad __CLOCK_COARSE_RES
.size __kernel_clock_getres,.-__kernel_clock_getres
--
2.21.0
^ permalink raw reply related
* Re: [PATCH v3 3/3] kselftest: Extend vDSO selftest to clock_getres
From: Christophe Leroy @ 2019-05-22 11:50 UTC (permalink / raw)
To: Vincenzo Frascino, linux-arch, linuxppc-dev, linux-s390,
linux-kselftest
Cc: Arnd Bergmann, Heiko Carstens, Paul Mackerras, Martin Schwidefsky,
Thomas Gleixner, Shuah Khan
In-Reply-To: <20190522110722.28094-4-vincenzo.frascino@arm.com>
Le 22/05/2019 à 13:07, Vincenzo Frascino a écrit :
> The current version of the multiarch vDSO selftest verifies only
> gettimeofday.
>
> Extend the vDSO selftest to clock_getres, to verify that the
> syscall and the vDSO library function return the same information.
>
> The extension has been used to verify the hrtimer_resoltion fix.
>
> Cc: Shuah Khan <shuah@kernel.org>
> Signed-off-by: Vincenzo Frascino <vincenzo.frascino@arm.com>
> ---
>
> Note: This patch is independent from the others in this series, hence it
> can be merged singularly by the kselftest maintainers.
>
> tools/testing/selftests/vDSO/Makefile | 2 +
> .../selftests/vDSO/vdso_clock_getres.c | 137 ++++++++++++++++++
> 2 files changed, 139 insertions(+)
> create mode 100644 tools/testing/selftests/vDSO/vdso_clock_getres.c
>
> diff --git a/tools/testing/selftests/vDSO/Makefile b/tools/testing/selftests/vDSO/Makefile
> index 9e03d61f52fd..d5c5bfdf1ac1 100644
> --- a/tools/testing/selftests/vDSO/Makefile
> +++ b/tools/testing/selftests/vDSO/Makefile
> @@ -5,6 +5,7 @@ uname_M := $(shell uname -m 2>/dev/null || echo not)
> ARCH ?= $(shell echo $(uname_M) | sed -e s/i.86/x86/ -e s/x86_64/x86/)
>
> TEST_GEN_PROGS := $(OUTPUT)/vdso_test
> +TEST_GEN_PROGS += $(OUTPUT)/vdso_clock_getres
> ifeq ($(ARCH),x86)
> TEST_GEN_PROGS += $(OUTPUT)/vdso_standalone_test_x86
> endif
> @@ -18,6 +19,7 @@ endif
>
> all: $(TEST_GEN_PROGS)
> $(OUTPUT)/vdso_test: parse_vdso.c vdso_test.c
> +$(OUTPUT)/vdso_clock_getres: vdso_clock_getres.c
> $(OUTPUT)/vdso_standalone_test_x86: vdso_standalone_test_x86.c parse_vdso.c
> $(CC) $(CFLAGS) $(CFLAGS_vdso_standalone_test_x86) \
> vdso_standalone_test_x86.c parse_vdso.c \
> diff --git a/tools/testing/selftests/vDSO/vdso_clock_getres.c b/tools/testing/selftests/vDSO/vdso_clock_getres.c
> new file mode 100644
> index 000000000000..341a9bc34ffc
> --- /dev/null
> +++ b/tools/testing/selftests/vDSO/vdso_clock_getres.c
> @@ -0,0 +1,137 @@
> +// SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note
> +/*
> + * vdso_clock_getres.c: Sample code to test clock_getres.
> + * Copyright (c) 2019 Arm Ltd.
> + *
> + * Compile with:
> + * gcc -std=gnu99 vdso_clock_getres.c
> + *
> + * Tested on ARM, ARM64, MIPS32, x86 (32-bit and 64-bit),
> + * Power (32-bit and 64-bit), S390x (32-bit and 64-bit).
> + * Might work on other architectures.
> + */
> +
> +#define _GNU_SOURCE
> +#include <elf.h>
> +#include <err.h>
> +#include <fcntl.h>
> +#include <stdint.h>
> +#include <stdio.h>
> +#include <stdlib.h>
> +#include <time.h>
> +#include <sys/auxv.h>
> +#include <sys/mman.h>
> +#include <sys/time.h>
> +#include <unistd.h>
> +#include <sys/syscall.h>
> +
> +#include "../kselftest.h"
> +
> +static long syscall_clock_getres(clockid_t _clkid, struct timespec *_ts)
> +{
> + long ret;
> +
> + ret = syscall(SYS_clock_getres, _clkid, _ts);
> +
> + return ret;
> +}
> +
> +const char *vdso_clock_name[12] = {
> + "CLOCK_REALTIME",
> + "CLOCK_MONOTONIC",
> + "CLOCK_PROCESS_CPUTIME_ID",
> + "CLOCK_THREAD_CPUTIME_ID",
> + "CLOCK_MONOTONIC_RAW",
> + "CLOCK_REALTIME_COARSE",
> + "CLOCK_MONOTONIC_COARSE",
> + "CLOCK_BOOTTIME",
> + "CLOCK_REALTIME_ALARM",
> + "CLOCK_BOOTTIME_ALARM",
> + "CLOCK_SGI_CYCLE",
> + "CLOCK_TAI",
> +};
> +
> +/*
> + * This function calls clock_getres in vdso and by system call
> + * with different values for clock_id.
> + *
> + * Example of output:
> + *
> + * clock_id: CLOCK_REALTIME [PASS]
> + * clock_id: CLOCK_BOOTTIME [PASS]
> + * clock_id: CLOCK_TAI [PASS]
> + * clock_id: CLOCK_REALTIME_COARSE [PASS]
> + * clock_id: CLOCK_MONOTONIC [PASS]
> + * clock_id: CLOCK_MONOTONIC_RAW [PASS]
> + * clock_id: CLOCK_MONOTONIC_COARSE [PASS]
> + */
> +static inline int vdso_test_clock(unsigned int clock_id)
> +{
> + struct timespec x, y;
> +
> + printf("clock_id: %s", vdso_clock_name[clock_id]);
> + clock_getres(clock_id, &x);
> + syscall_clock_getres(clock_id, &y);
> +
> + if ((x.tv_sec != y.tv_sec) || (x.tv_sec != y.tv_sec)) {
> + printf(" [FAIL]\n");
> + return KSFT_FAIL;
> + }
> +
> + printf(" [PASS]\n");
> + return 0;
> +}
> +
> +int main(int argc, char **argv)
> +{
> + int ret;
> +
> +#if _POSIX_TIMERS > 0
> +
> +#ifdef CLOCK_REALTIME
Why do you need that #ifdef and all the ones below ?
CLOCK_REALTIME (and others) is defined in include/uapi/linux/time.h, so
it should be there when you build the test, shouldn't it ?
> + ret = vdso_test_clock(CLOCK_REALTIME);
> + if (ret)
> + goto out;
Why that goto ? Nothing is done at out, so a 'return ret' would be
better I think.
And do we really want to stop at first failure ? Wouldn't it be better
to run all the tests regardless ?
Christophe
> +#endif
> +
> +#ifdef CLOCK_BOOTTIME
> + ret = vdso_test_clock(CLOCK_BOOTTIME);
> + if (ret)
> + goto out;
> +#endif
> +
> +#ifdef CLOCK_TAI
> + ret = vdso_test_clock(CLOCK_TAI);
> + if (ret)
> + goto out;
> +#endif
> +
> +#ifdef CLOCK_REALTIME_COARSE
> + ret = vdso_test_clock(CLOCK_REALTIME_COARSE);
> + if (ret)
> + goto out;
> +#endif
> +
> +#ifdef CLOCK_MONOTONIC
> + ret = vdso_test_clock(CLOCK_MONOTONIC);
> + if (ret)
> + goto out;
> +#endif
> +
> +#ifdef CLOCK_MONOTONIC_RAW
> + ret = vdso_test_clock(CLOCK_MONOTONIC_RAW);
> + if (ret)
> + goto out;
> +#endif
> +
> +#ifdef CLOCK_MONOTONIC_COARSE
> + ret = vdso_test_clock(CLOCK_MONOTONIC_COARSE);
> + if (ret)
> + goto out;
> +#endif
> +
> +#endif
> +
> +out:
> + return ret;
> +}
>
^ permalink raw reply
* Failure to boot G4: dt_headr_start=0x01501000
From: Mathieu Malaterre @ 2019-05-22 12:15 UTC (permalink / raw)
To: linuxppc-dev
Hi all,
I have not booted my G4 in a while; today, using master, here is what I see:
done
Setting btext !
W=640 H=488 LB=768 addr=0x9c008000
copying OF device tree...
starting device tree allocs at 01401000
otloc_up(00100000, 0013d948)
trying: 0x01401000
trying: 0x01501000
-> 01501000
alloc_bottom : 01601000
alloc_top : 20000000
alloc_top_hi : 20000000
nmo_top : 20000000
ram_top : 20000000
Building dt strings...
Building dt structure...
reserved memory map:
00d40000 - 006c1000
Device tree strings 0x01502000 -> 0x00000007
Device tree struct 0x01503000 -> 0x00000007
Quiescing Open Firmware ...
Booting Linux via __start() @ 0x001400000
->dt_headr_start=0x01501000
Any suggestions before I start a bisect ?
Thanks
^ permalink raw reply
* [PATCH] tty: serial: cpm_uart - fix init when SMC is relocated
From: Christophe Leroy @ 2019-05-22 12:17 UTC (permalink / raw)
To: Greg Kroah-Hartman, Jiri Slaby; +Cc: linuxppc-dev, linux-kernel, linux-serial
SMC relocation can also be activated earlier by the bootloader,
so the driver's behaviour cannot rely on selected kernel config.
When the SMC is relocated, CPM_CR_INIT_TRX cannot be used.
But the only thing CPM_CR_INIT_TRX does is to clear the
rstate and tstate registers, so this can be done manually,
even when SMC is not relocated.
Signed-off-by: Christophe Leroy <christophe.leroy@c-s.fr>
Fixes: 9ab921201444 ("cpm_uart: fix non-console port startup bug")
---
drivers/tty/serial/cpm_uart/cpm_uart_core.c | 17 +++++++++++------
1 file changed, 11 insertions(+), 6 deletions(-)
diff --git a/drivers/tty/serial/cpm_uart/cpm_uart_core.c b/drivers/tty/serial/cpm_uart/cpm_uart_core.c
index b929c7ae3a27..7bab9a3eda92 100644
--- a/drivers/tty/serial/cpm_uart/cpm_uart_core.c
+++ b/drivers/tty/serial/cpm_uart/cpm_uart_core.c
@@ -407,7 +407,16 @@ static int cpm_uart_startup(struct uart_port *port)
clrbits16(&pinfo->sccp->scc_sccm, UART_SCCM_RX);
}
cpm_uart_initbd(pinfo);
- cpm_line_cr_cmd(pinfo, CPM_CR_INIT_TRX);
+ if (IS_SMC(pinfo)) {
+ out_be32(&pinfo->smcup->smc_rstate, 0);
+ out_be32(&pinfo->smcup->smc_tstate, 0);
+ out_be16(&pinfo->smcup->smc_rbptr,
+ in_be16(&pinfo->smcup->smc_rbase));
+ out_be16(&pinfo->smcup->smc_tbptr,
+ in_be16(&pinfo->smcup->smc_tbase));
+ } else {
+ cpm_line_cr_cmd(pinfo, CPM_CR_INIT_TRX);
+ }
}
/* Install interrupt handler. */
retval = request_irq(port->irq, cpm_uart_int, 0, "cpm_uart", port);
@@ -861,16 +870,14 @@ static void cpm_uart_init_smc(struct uart_cpm_port *pinfo)
(u8 __iomem *)pinfo->tx_bd_base - DPRAM_BASE);
/*
- * In case SMC1 is being relocated...
+ * In case SMC is being relocated...
*/
-#if defined (CONFIG_I2C_SPI_SMC1_UCODE_PATCH)
out_be16(&up->smc_rbptr, in_be16(&pinfo->smcup->smc_rbase));
out_be16(&up->smc_tbptr, in_be16(&pinfo->smcup->smc_tbase));
out_be32(&up->smc_rstate, 0);
out_be32(&up->smc_tstate, 0);
out_be16(&up->smc_brkcr, 1); /* number of break chars */
out_be16(&up->smc_brkec, 0);
-#endif
/* Set up the uart parameters in the
* parameter ram.
@@ -884,8 +891,6 @@ static void cpm_uart_init_smc(struct uart_cpm_port *pinfo)
out_be16(&up->smc_brkec, 0);
out_be16(&up->smc_brkcr, 1);
- cpm_line_cr_cmd(pinfo, CPM_CR_INIT_TRX);
-
/* Set UART mode, 8 bit, no parity, one stop.
* Enable receive and transmit.
*/
--
2.13.3
^ permalink raw reply related
* Re: Failure to boot G4: dt_headr_start=0x01501000
From: Christophe Leroy @ 2019-05-22 12:20 UTC (permalink / raw)
To: Mathieu Malaterre, linuxppc-dev
In-Reply-To: <CA+7wUszwugJeS_x_ExaHPUb8p23D7Zo2f2qqXfLQwr8EiLsk2g@mail.gmail.com>
Le 22/05/2019 à 14:15, Mathieu Malaterre a écrit :
> Hi all,
>
> I have not boot my G4 in a while, today using master here is what I see:
>
> done
> Setting btext !
> W=640 H=488 LB=768 addr=0x9c008000
> copying OF device tree...
> starting device tree allocs at 01401000
> otloc_up(00100000, 0013d948)
> trying: 0x01401000
> trying: 0x01501000
> -> 01501000
> alloc_bottom : 01601000
> alloc_top : 20000000
> alloc_top_hi : 20000000
> nmo_top : 20000000
> ram_top : 20000000
> Building dt strings...
> Building dt structure...
> reserved memory map:
> 00d40000 - 006c1000
> Device tree strings 0x01502000 -> 0x00000007
> Device tree struct 0x01503000 -> 0x00000007
> Quiescing Open Firmware ...
> Booting Linux via __start() @ 0x001400000
> ->dt_headr_start=0x01501000
>
> Any suggestions before I start a bisect ?
>
Have you tried without CONFIG_PPC_KUEP and CONFIG_PPC_KUAP ?
Christophe
^ permalink raw reply
* Re: [RFC PATCH] mm/nvdimm: Fix kernel crash on devm_mremap_pages_release
From: Aneesh Kumar K.V @ 2019-05-22 13:12 UTC (permalink / raw)
To: Dan Williams, Keith Busch; +Cc: Linux MM, linuxppc-dev, linux-nvdimm
In-Reply-To: <b775d65b-30e3-aceb-f2f8-f2413b129f52@linux.ibm.com>
"Aneesh Kumar K.V" <aneesh.kumar@linux.ibm.com> writes:
> On 5/14/19 9:45 AM, Dan Williams wrote:
>> [ add Keith who was looking at something similar ]
>>
...
>>
>> If it's reserved then we should not be accessing, even if the above
>> works in practice. Isn't the fix something more like this to fix up
>> the assumptions at release time?
>>
>> diff --git a/kernel/memremap.c b/kernel/memremap.c
>> index a856cb5ff192..9074ba14572c 100644
>> --- a/kernel/memremap.c
>> +++ b/kernel/memremap.c
>> @@ -90,6 +90,7 @@ static void devm_memremap_pages_release(void *data)
>> struct device *dev = pgmap->dev;
>> struct resource *res = &pgmap->res;
>> resource_size_t align_start, align_size;
>> + struct vmem_altmap *altmap = pgmap->altmap_valid ? &pgmap->altmap : NULL;
>> unsigned long pfn;
>> int nid;
>>
>> @@ -102,7 +103,10 @@ static void devm_memremap_pages_release(void *data)
>> align_size = ALIGN(res->start + resource_size(res), SECTION_SIZE)
>> - align_start;
>>
>> - nid = page_to_nid(pfn_to_page(align_start >> PAGE_SHIFT));
>> + pfn = align_start >> PAGE_SHIFT;
>> + if (altmap)
>> + pfn += vmem_altmap_offset(altmap);
>> + nid = page_to_nid(pfn_to_page(pfn));
>>
>> mem_hotplug_begin();
>> if (pgmap->type == MEMORY_DEVICE_PRIVATE) {
>> @@ -110,8 +114,7 @@ static void devm_memremap_pages_release(void *data)
>> __remove_pages(page_zone(pfn_to_page(pfn)), pfn,
>> align_size >> PAGE_SHIFT, NULL);
>> } else {
>> - arch_remove_memory(nid, align_start, align_size,
>> - pgmap->altmap_valid ? &pgmap->altmap : NULL);
>> + arch_remove_memory(nid, align_start, align_size, altmap);
>> kasan_remove_zero_shadow(__va(align_start), align_size);
>> }
>> mem_hotplug_done();
>>
> I did try that first. I was not sure about that. From the memory add vs
> remove perspective.
>
> devm_memremap_pages:
>
> align_start = res->start & ~(SECTION_SIZE - 1);
> align_size = ALIGN(res->start + resource_size(res), SECTION_SIZE)
> - align_start;
> align_end = align_start + align_size - 1;
>
> error = arch_add_memory(nid, align_start, align_size, altmap,
> false);
>
>
> devm_memremap_pages_release:
>
> /* pages are dead and unused, undo the arch mapping */
> align_start = res->start & ~(SECTION_SIZE - 1);
> align_size = ALIGN(res->start + resource_size(res), SECTION_SIZE)
> - align_start;
>
> arch_remove_memory(nid, align_start, align_size,
> pgmap->altmap_valid ? &pgmap->altmap : NULL);
>
>
> Now if we are fixing the memremap_pages_release, shouldn't we adjust
> alig_start w.r.t memremap_pages too? and I was not sure what that means
> w.r.t add/remove alignment requirements.
>
> What is the intended usage of reserve area? I guess we want that part to
> be added? if so shouldn't we remove them?
We need to initialize the struct page backing the reserve area too, right?
Where should we do that?
-aneesh
^ permalink raw reply
* Re: [PATCH v3 3/3] kselftest: Extend vDSO selftest to clock_getres
From: Vincenzo Frascino @ 2019-05-22 14:55 UTC (permalink / raw)
To: Christophe Leroy, linux-arch, linuxppc-dev, linux-s390,
linux-kselftest
Cc: Arnd Bergmann, Heiko Carstens, Paul Mackerras, Martin Schwidefsky,
Thomas Gleixner, Shuah Khan
In-Reply-To: <3a6d9b99-0026-6743-9e73-4880f3cd6b1c@c-s.fr>
Hi Christophe,
thank you for your review.
On 22/05/2019 12:50, Christophe Leroy wrote:
>
>
> Le 22/05/2019 à 13:07, Vincenzo Frascino a écrit :
>> The current version of the multiarch vDSO selftest verifies only
>> gettimeofday.
>>
>> Extend the vDSO selftest to clock_getres, to verify that the
>> syscall and the vDSO library function return the same information.
>>
>> The extension has been used to verify the hrtimer_resoltion fix.
>>
>> Cc: Shuah Khan <shuah@kernel.org>
>> Signed-off-by: Vincenzo Frascino <vincenzo.frascino@arm.com>
>> ---
>>
>> Note: This patch is independent from the others in this series, hence it
>> can be merged singularly by the kselftest maintainers.
>>
>> tools/testing/selftests/vDSO/Makefile | 2 +
>> .../selftests/vDSO/vdso_clock_getres.c | 137 ++++++++++++++++++
>> 2 files changed, 139 insertions(+)
>> create mode 100644 tools/testing/selftests/vDSO/vdso_clock_getres.c
>>
>> diff --git a/tools/testing/selftests/vDSO/Makefile b/tools/testing/selftests/vDSO/Makefile
>> index 9e03d61f52fd..d5c5bfdf1ac1 100644
>> --- a/tools/testing/selftests/vDSO/Makefile
>> +++ b/tools/testing/selftests/vDSO/Makefile
>> @@ -5,6 +5,7 @@ uname_M := $(shell uname -m 2>/dev/null || echo not)
>> ARCH ?= $(shell echo $(uname_M) | sed -e s/i.86/x86/ -e s/x86_64/x86/)
>>
>> TEST_GEN_PROGS := $(OUTPUT)/vdso_test
>> +TEST_GEN_PROGS += $(OUTPUT)/vdso_clock_getres
>> ifeq ($(ARCH),x86)
>> TEST_GEN_PROGS += $(OUTPUT)/vdso_standalone_test_x86
>> endif
>> @@ -18,6 +19,7 @@ endif
>>
>> all: $(TEST_GEN_PROGS)
>> $(OUTPUT)/vdso_test: parse_vdso.c vdso_test.c
>> +$(OUTPUT)/vdso_clock_getres: vdso_clock_getres.c
>> $(OUTPUT)/vdso_standalone_test_x86: vdso_standalone_test_x86.c parse_vdso.c
>> $(CC) $(CFLAGS) $(CFLAGS_vdso_standalone_test_x86) \
>> vdso_standalone_test_x86.c parse_vdso.c \
>> diff --git a/tools/testing/selftests/vDSO/vdso_clock_getres.c b/tools/testing/selftests/vDSO/vdso_clock_getres.c
>> new file mode 100644
>> index 000000000000..341a9bc34ffc
>> --- /dev/null
>> +++ b/tools/testing/selftests/vDSO/vdso_clock_getres.c
>> @@ -0,0 +1,137 @@
>> +// SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note
>> +/*
>> + * vdso_clock_getres.c: Sample code to test clock_getres.
>> + * Copyright (c) 2019 Arm Ltd.
>> + *
>> + * Compile with:
>> + * gcc -std=gnu99 vdso_clock_getres.c
>> + *
>> + * Tested on ARM, ARM64, MIPS32, x86 (32-bit and 64-bit),
>> + * Power (32-bit and 64-bit), S390x (32-bit and 64-bit).
>> + * Might work on other architectures.
>> + */
>> +
>> +#define _GNU_SOURCE
>> +#include <elf.h>
>> +#include <err.h>
>> +#include <fcntl.h>
>> +#include <stdint.h>
>> +#include <stdio.h>
>> +#include <stdlib.h>
>> +#include <time.h>
>> +#include <sys/auxv.h>
>> +#include <sys/mman.h>
>> +#include <sys/time.h>
>> +#include <unistd.h>
>> +#include <sys/syscall.h>
>> +
>> +#include "../kselftest.h"
>> +
>> +static long syscall_clock_getres(clockid_t _clkid, struct timespec *_ts)
>> +{
>> + long ret;
>> +
>> + ret = syscall(SYS_clock_getres, _clkid, _ts);
>> +
>> + return ret;
>> +}
>> +
>> +const char *vdso_clock_name[12] = {
>> + "CLOCK_REALTIME",
>> + "CLOCK_MONOTONIC",
>> + "CLOCK_PROCESS_CPUTIME_ID",
>> + "CLOCK_THREAD_CPUTIME_ID",
>> + "CLOCK_MONOTONIC_RAW",
>> + "CLOCK_REALTIME_COARSE",
>> + "CLOCK_MONOTONIC_COARSE",
>> + "CLOCK_BOOTTIME",
>> + "CLOCK_REALTIME_ALARM",
>> + "CLOCK_BOOTTIME_ALARM",
>> + "CLOCK_SGI_CYCLE",
>> + "CLOCK_TAI",
>> +};
>> +
>> +/*
>> + * This function calls clock_getres in vdso and by system call
>> + * with different values for clock_id.
>> + *
>> + * Example of output:
>> + *
>> + * clock_id: CLOCK_REALTIME [PASS]
>> + * clock_id: CLOCK_BOOTTIME [PASS]
>> + * clock_id: CLOCK_TAI [PASS]
>> + * clock_id: CLOCK_REALTIME_COARSE [PASS]
>> + * clock_id: CLOCK_MONOTONIC [PASS]
>> + * clock_id: CLOCK_MONOTONIC_RAW [PASS]
>> + * clock_id: CLOCK_MONOTONIC_COARSE [PASS]
>> + */
>> +static inline int vdso_test_clock(unsigned int clock_id)
>> +{
>> + struct timespec x, y;
>> +
>> + printf("clock_id: %s", vdso_clock_name[clock_id]);
>> + clock_getres(clock_id, &x);
>> + syscall_clock_getres(clock_id, &y);
>> +
>> + if ((x.tv_sec != y.tv_sec) || (x.tv_sec != y.tv_sec)) {
>> + printf(" [FAIL]\n");
>> + return KSFT_FAIL;
>> + }
>> +
>> + printf(" [PASS]\n");
>> + return 0;
>> +}
>> +
>> +int main(int argc, char **argv)
>> +{
>> + int ret;
>> +
>> +#if _POSIX_TIMERS > 0
>> +
>> +#ifdef CLOCK_REALTIME
>
> Why do you need that #ifdef and all the ones below ?
>
> CLOCK_REALTIME (and others) is defined in include/uapi/linux/time.h, so
> it should be there when you build the test, shouldn't it ?
>
In implementing this test I followed what the man page for clock_gettime(2)
defines in terms of availability of the timers. Since I do not know how old
the userspace headers are, I think it is a good idea to check that the clocks
are defined before trying to use them.
>> + ret = vdso_test_clock(CLOCK_REALTIME);
>> + if (ret)
>> + goto out;
>
> Why that goto ? Nothing is done at out, so a 'return ret' would be
> better I think.
>
Agree, thanks for pointing this out. Will fix in v4.
> And do we really want to stop at first failure ? Wouldn't it be better
> to run all the tests regardless ?
>
The test is supposed to fail if one of the sub-tests fails; hence, once the
first one fails, it does not seem useful to run the others, because we already
know the result.
> Christophe
>
>> +#endif
>> +
>> +#ifdef CLOCK_BOOTTIME
>> + ret = vdso_test_clock(CLOCK_BOOTTIME);
>> + if (ret)
>> + goto out;
>> +#endif
>> +
>> +#ifdef CLOCK_TAI
>> + ret = vdso_test_clock(CLOCK_TAI);
>> + if (ret)
>> + goto out;
>> +#endif
>> +
>> +#ifdef CLOCK_REALTIME_COARSE
>> + ret = vdso_test_clock(CLOCK_REALTIME_COARSE);
>> + if (ret)
>> + goto out;
>> +#endif
>> +
>> +#ifdef CLOCK_MONOTONIC
>> + ret = vdso_test_clock(CLOCK_MONOTONIC);
>> + if (ret)
>> + goto out;
>> +#endif
>> +
>> +#ifdef CLOCK_MONOTONIC_RAW
>> + ret = vdso_test_clock(CLOCK_MONOTONIC_RAW);
>> + if (ret)
>> + goto out;
>> +#endif
>> +
>> +#ifdef CLOCK_MONOTONIC_COARSE
>> + ret = vdso_test_clock(CLOCK_MONOTONIC_COARSE);
>> + if (ret)
>> + goto out;
>> +#endif
>> +
>> +#endif
>> +
>> +out:
>> + return ret;
>> +}
>>
--
Regards,
Vincenzo
^ permalink raw reply
* Re: [PATCH v3 3/3] kselftest: Extend vDSO selftest to clock_getres
From: Vincenzo Frascino @ 2019-05-22 15:03 UTC (permalink / raw)
To: Christophe Leroy, linux-arch, linuxppc-dev, linux-s390,
linux-kselftest
Cc: Arnd Bergmann, Heiko Carstens, Paul Mackerras, Martin Schwidefsky,
Thomas Gleixner, Shuah Khan
In-Reply-To: <3a6d9b99-0026-6743-9e73-4880f3cd6b1c@c-s.fr>
Hi Christophe,
thank you for your review.
On 22/05/2019 12:50, Christophe Leroy wrote:
>
>
> Le 22/05/2019 à 13:07, Vincenzo Frascino a écrit :
>> The current version of the multiarch vDSO selftest verifies only
>> gettimeofday.
>>
>> Extend the vDSO selftest to clock_getres, to verify that the
>> syscall and the vDSO library function return the same information.
>>
>> The extension has been used to verify the hrtimer_resolution fix.
>>
>> Cc: Shuah Khan <shuah@kernel.org>
>> Signed-off-by: Vincenzo Frascino <vincenzo.frascino@arm.com>
>> ---
>>
>> Note: This patch is independent from the others in this series, hence it
>> can be merged singularly by the kselftest maintainers.
>>
>> tools/testing/selftests/vDSO/Makefile | 2 +
>> .../selftests/vDSO/vdso_clock_getres.c | 137 ++++++++++++++++++
>> 2 files changed, 139 insertions(+)
>> create mode 100644 tools/testing/selftests/vDSO/vdso_clock_getres.c
>>
>> diff --git a/tools/testing/selftests/vDSO/Makefile b/tools/testing/selftests/vDSO/Makefile
>> index 9e03d61f52fd..d5c5bfdf1ac1 100644
>> --- a/tools/testing/selftests/vDSO/Makefile
>> +++ b/tools/testing/selftests/vDSO/Makefile
>> @@ -5,6 +5,7 @@ uname_M := $(shell uname -m 2>/dev/null || echo not)
>> ARCH ?= $(shell echo $(uname_M) | sed -e s/i.86/x86/ -e s/x86_64/x86/)
>>
>> TEST_GEN_PROGS := $(OUTPUT)/vdso_test
>> +TEST_GEN_PROGS += $(OUTPUT)/vdso_clock_getres
>> ifeq ($(ARCH),x86)
>> TEST_GEN_PROGS += $(OUTPUT)/vdso_standalone_test_x86
>> endif
>> @@ -18,6 +19,7 @@ endif
>>
>> all: $(TEST_GEN_PROGS)
>> $(OUTPUT)/vdso_test: parse_vdso.c vdso_test.c
>> +$(OUTPUT)/vdso_clock_getres: vdso_clock_getres.c
>> $(OUTPUT)/vdso_standalone_test_x86: vdso_standalone_test_x86.c parse_vdso.c
>> $(CC) $(CFLAGS) $(CFLAGS_vdso_standalone_test_x86) \
>> vdso_standalone_test_x86.c parse_vdso.c \
>> diff --git a/tools/testing/selftests/vDSO/vdso_clock_getres.c b/tools/testing/selftests/vDSO/vdso_clock_getres.c
>> new file mode 100644
>> index 000000000000..341a9bc34ffc
>> --- /dev/null
>> +++ b/tools/testing/selftests/vDSO/vdso_clock_getres.c
>> @@ -0,0 +1,137 @@
>> +// SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note
>> +/*
>> + * vdso_clock_getres.c: Sample code to test clock_getres.
>> + * Copyright (c) 2019 Arm Ltd.
>> + *
>> + * Compile with:
>> + * gcc -std=gnu99 vdso_clock_getres.c
>> + *
>> + * Tested on ARM, ARM64, MIPS32, x86 (32-bit and 64-bit),
>> + * Power (32-bit and 64-bit), S390x (32-bit and 64-bit).
>> + * Might work on other architectures.
>> + */
>> +
>> +#define _GNU_SOURCE
>> +#include <elf.h>
>> +#include <err.h>
>> +#include <fcntl.h>
>> +#include <stdint.h>
>> +#include <stdio.h>
>> +#include <stdlib.h>
>> +#include <time.h>
>> +#include <sys/auxv.h>
>> +#include <sys/mman.h>
>> +#include <sys/time.h>
>> +#include <unistd.h>
>> +#include <sys/syscall.h>
>> +
>> +#include "../kselftest.h"
>> +
>> +static long syscall_clock_getres(clockid_t _clkid, struct timespec *_ts)
>> +{
>> + long ret;
>> +
>> + ret = syscall(SYS_clock_getres, _clkid, _ts);
>> +
>> + return ret;
>> +}
>> +
>> +const char *vdso_clock_name[12] = {
>> + "CLOCK_REALTIME",
>> + "CLOCK_MONOTONIC",
>> + "CLOCK_PROCESS_CPUTIME_ID",
>> + "CLOCK_THREAD_CPUTIME_ID",
>> + "CLOCK_MONOTONIC_RAW",
>> + "CLOCK_REALTIME_COARSE",
>> + "CLOCK_MONOTONIC_COARSE",
>> + "CLOCK_BOOTTIME",
>> + "CLOCK_REALTIME_ALARM",
>> + "CLOCK_BOOTTIME_ALARM",
>> + "CLOCK_SGI_CYCLE",
>> + "CLOCK_TAI",
>> +};
>> +
>> +/*
>> + * This function calls clock_getres in vdso and by system call
>> + * with different values for clock_id.
>> + *
>> + * Example of output:
>> + *
>> + * clock_id: CLOCK_REALTIME [PASS]
>> + * clock_id: CLOCK_BOOTTIME [PASS]
>> + * clock_id: CLOCK_TAI [PASS]
>> + * clock_id: CLOCK_REALTIME_COARSE [PASS]
>> + * clock_id: CLOCK_MONOTONIC [PASS]
>> + * clock_id: CLOCK_MONOTONIC_RAW [PASS]
>> + * clock_id: CLOCK_MONOTONIC_COARSE [PASS]
>> + */
>> +static inline int vdso_test_clock(unsigned int clock_id)
>> +{
>> + struct timespec x, y;
>> +
>> + printf("clock_id: %s", vdso_clock_name[clock_id]);
>> + clock_getres(clock_id, &x);
>> + syscall_clock_getres(clock_id, &y);
>> +
>> + if ((x.tv_sec != y.tv_sec) || (x.tv_sec != y.tv_sec)) {
>> + printf(" [FAIL]\n");
>> + return KSFT_FAIL;
>> + }
>> +
>> + printf(" [PASS]\n");
>> + return 0;
>> +}
>> +
>> +int main(int argc, char **argv)
>> +{
>> + int ret;
>> +
>> +#if _POSIX_TIMERS > 0
>> +
>> +#ifdef CLOCK_REALTIME
>
> Why do you need that #ifdef and all the ones below ?
>
> CLOCK_REALTIME (and others) is defined in include/uapi/linux/time.h, so
> it should be there when you build the test, shouldn't it ?
>
In implementing this test I tried to follow what the man page for
clock_gettime(2) defines in terms of availability of the timers. Since I do not
know how old the userspace headers are, I think it is a good idea to check that
the clocks are defined before trying to use them.
>> + ret = vdso_test_clock(CLOCK_REALTIME);
>> + if (ret)
>> + goto out;
>
> Why that goto ? Nothing is done at out, so a 'return ret' would be
> better I think.
>
Agree, thanks for pointing this out. Will fix in v4.
> And do we really want to stop at first failure ? Wouldn't it be better
> to run all the tests regardless ?
>
The test is supposed to fail if one of the sub-tests fails, hence once the first
one fails it does not seem useful to run the others, because we already know the
result.
> Christophe
>
>> +#endif
>> +
>> +#ifdef CLOCK_BOOTTIME
>> + ret = vdso_test_clock(CLOCK_BOOTTIME);
>> + if (ret)
>> + goto out;
>> +#endif
>> +
>> +#ifdef CLOCK_TAI
>> + ret = vdso_test_clock(CLOCK_TAI);
>> + if (ret)
>> + goto out;
>> +#endif
>> +
>> +#ifdef CLOCK_REALTIME_COARSE
>> + ret = vdso_test_clock(CLOCK_REALTIME_COARSE);
>> + if (ret)
>> + goto out;
>> +#endif
>> +
>> +#ifdef CLOCK_MONOTONIC
>> + ret = vdso_test_clock(CLOCK_MONOTONIC);
>> + if (ret)
>> + goto out;
>> +#endif
>> +
>> +#ifdef CLOCK_MONOTONIC_RAW
>> + ret = vdso_test_clock(CLOCK_MONOTONIC_RAW);
>> + if (ret)
>> + goto out;
>> +#endif
>> +
>> +#ifdef CLOCK_MONOTONIC_COARSE
>> + ret = vdso_test_clock(CLOCK_MONOTONIC_COARSE);
>> + if (ret)
>> + goto out;
>> +#endif
>> +
>> +#endif
>> +
>> +out:
>> + return ret;
>> +}
>>
--
Regards,
Vincenzo
^ permalink raw reply
* [PATCH v1 1/2] open: add close_range()
From: Christian Brauner @ 2019-05-22 15:52 UTC (permalink / raw)
To: viro, linux-kernel, linux-fsdevel, linux-api, torvalds, fweimer
Cc: linux-ia64, linux-sh, ldv, dhowells, linux-kselftest, sparclinux,
shuah, linux-arch, linux-s390, miklos, x86, Christian Brauner,
linux-mips, linux-xtensa, tkjos, arnd, jannh, linux-m68k, tglx,
linux-arm-kernel, linux-parisc, oleg, linux-alpha, linuxppc-dev
This adds the close_range() syscall. It allows to efficiently close a range
of file descriptors up to all file descriptors of a calling task.
The syscall came up in a recent discussion around the new mount API and
making new file descriptor types cloexec by default. During this
discussion, Al suggested the close_range() syscall (cf. [1]). Note, a
syscall in this manner has been requested by various people over time.
First, it helps to close all file descriptors of an exec()ing task. This
can be done safely via (quoting Al's example from [1] verbatim):
/* that exec is sensitive */
unshare(CLONE_FILES);
/* we don't want anything past stderr here */
close_range(3, ~0U);
execve(....);
The code snippet above is one way of working around the problem that file
descriptors are not cloexec by default. This is aggravated by the fact that
we can't just switch them over without massively regressing userspace. For
a whole class of programs having an in-kernel method of closing all file
descriptors is very helpful (e.g. daemons, service managers, programming
language standard libraries, container managers etc.).
(Please note, unshare(CLONE_FILES) should only be needed if the calling
task is multi-threaded and shares the file descriptor table with another
thread in which case two threads could race with one thread allocating
file descriptors and the other one closing them via close_range(). For the
general case close_range() before the execve() is sufficient.)
Second, it allows userspace to avoid implementing closing all file
descriptors by parsing through /proc/<pid>/fd/* and calling close() on each
file descriptor. From looking at various large(ish) userspace code bases
this or similar patterns are very common in:
- service managers (cf. [4])
- libcs (cf. [6])
- container runtimes (cf. [5])
- programming language runtimes/standard libraries
- Python (cf. [2])
- Rust (cf. [7], [8])
As Dmitry pointed out there's even a long-standing glibc bug about missing
kernel support for this task (cf. [3]).
In addition, the syscall will also work for tasks that do not have procfs
mounted and on kernels that do not have procfs support compiled in. In such
situations the only way to make sure that all file descriptors are closed
is to call close() on each file descriptor up to UINT_MAX or RLIMIT_NOFILE,
OPEN_MAX trickery (cf. comment [8] on Rust).
The performance is striking. For good measure, comparing the following
simple close_all_fds() userspace implementation that is essentially just
glibc's version in [6]:
static int close_all_fds(void)
{
int dir_fd;
DIR *dir;
struct dirent *direntp;
dir = opendir("/proc/self/fd");
if (!dir)
return -1;
dir_fd = dirfd(dir);
while ((direntp = readdir(dir))) {
int fd;
if (strcmp(direntp->d_name, ".") == 0)
continue;
if (strcmp(direntp->d_name, "..") == 0)
continue;
fd = atoi(direntp->d_name);
if (fd == dir_fd || fd == 0 || fd == 1 || fd == 2)
continue;
close(fd);
}
closedir(dir);
return 0;
}
to close_range() yields:
1. closing 4 open files:
- close_all_fds(): ~280 us
- close_range(): ~24 us
2. closing 1000 open files:
- close_all_fds(): ~5000 us
- close_range(): ~800 us
close_range() is designed to allow for some flexibility. Specifically, it
does not simply always close all open file descriptors of a task. Instead,
callers can specify an upper bound.
This is e.g. useful for scenarios where specific file descriptors are
created with well-known numbers that are supposed to be excluded from
getting closed.
For extra paranoia close_range() comes with a flags argument. This can e.g.
be used to implement extensions. One can imagine userspace wanting to stop
at the first error instead of ignoring errors under certain circumstances.
There might be other valid ideas in the future. In any case, a flag
argument doesn't hurt and keeps us on the safe side.
From an implementation side this is kept rather dumb. It saw some input
from David and Jann but all nonsense is obviously my own!
- Errors to close file descriptors are currently ignored. (Could be changed
by setting a flag in the future if needed.)
- __close_range() is a rather simplistic wrapper around __close_fd().
My reasoning behind this is based on the nature of how __close_fd() needs
to release an fd. But maybe I misunderstood specifics:
We take the files_lock and rcu-dereference the fdtable of the calling
task, we find the entry in the fdtable, get the file and need to release
files_lock before calling filp_close().
In the meantime the fdtable might have been altered so we can't just
retake the spinlock and keep the old rcu-reference of the fdtable
around. Instead we need to grab a fresh reference to the fdtable.
If my reasoning is correct then there's really no point in fancyfying
__close_range(): We just need to rcu-dereference the fdtable of the
calling task once to cap the max_fd value correctly and then go on
calling __close_fd() in a loop.
/* References */
[1]: https://lore.kernel.org/lkml/20190516165021.GD17978@ZenIV.linux.org.uk/
[2]: https://github.com/python/cpython/blob/9e4f2f3a6b8ee995c365e86d976937c141d867f8/Modules/_posixsubprocess.c#L220
[3]: https://sourceware.org/bugzilla/show_bug.cgi?id=10353#c7
[4]: https://github.com/systemd/systemd/blob/5238e9575906297608ff802a27e2ff9effa3b338/src/basic/fd-util.c#L217
[5]: https://github.com/lxc/lxc/blob/ddf4b77e11a4d08f09b7b9cd13e593f8c047edc5/src/lxc/start.c#L236
[6]: https://sourceware.org/git/?p=glibc.git;a=blob;f=sysdeps/unix/sysv/linux/grantpt.c;h=2030e07fa6e652aac32c775b8c6e005844c3c4eb;hb=HEAD#l17
Note that this is an internal implementation that is not exported.
Currently, libc seems to not provide an exported version of this
because of missing kernel support to do this.
[7]: https://github.com/rust-lang/rust/issues/12148
[8]: https://github.com/rust-lang/rust/blob/5f47c0613ed4eb46fca3633c1297364c09e5e451/src/libstd/sys/unix/process2.rs#L303-L308
Rust's solution is slightly different but is equally unperformant.
Rust calls getdtablesize() which is a glibc library function that
simply returns the current RLIMIT_NOFILE or OPEN_MAX values. Rust then
goes on to call close() on each fd. That's obviously overkill for most
tasks. Rarely, tasks - especially non-daemons - hit RLIMIT_NOFILE or
OPEN_MAX.
Let's be nice and assume an unprivileged user with RLIMIT_NOFILE set
to 1024. Even in this case, there's a very high chance that in the
common case Rust is calling the close() syscall 1021 times pointlessly
if the task just has 0, 1, and 2 open.
Suggested-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Christian Brauner <christian@brauner.io>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Jann Horn <jannh@google.com>
Cc: David Howells <dhowells@redhat.com>
Cc: Dmitry V. Levin <ldv@altlinux.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Florian Weimer <fweimer@redhat.com>
Cc: linux-api@vger.kernel.org
---
v1:
- Linus Torvalds <torvalds@linux-foundation.org>:
- add cond_resched() to yield cpu when closing a lot of file descriptors
- Al Viro <viro@zeniv.linux.org.uk>:
- add cond_resched() to yield cpu when closing a lot of file descriptors
---
arch/alpha/kernel/syscalls/syscall.tbl | 1 +
arch/arm/tools/syscall.tbl | 1 +
arch/arm64/include/asm/unistd32.h | 2 +
arch/ia64/kernel/syscalls/syscall.tbl | 1 +
arch/m68k/kernel/syscalls/syscall.tbl | 1 +
arch/microblaze/kernel/syscalls/syscall.tbl | 1 +
arch/mips/kernel/syscalls/syscall_n32.tbl | 1 +
arch/mips/kernel/syscalls/syscall_n64.tbl | 1 +
arch/mips/kernel/syscalls/syscall_o32.tbl | 1 +
arch/parisc/kernel/syscalls/syscall.tbl | 1 +
arch/powerpc/kernel/syscalls/syscall.tbl | 1 +
arch/s390/kernel/syscalls/syscall.tbl | 1 +
arch/sh/kernel/syscalls/syscall.tbl | 1 +
arch/sparc/kernel/syscalls/syscall.tbl | 1 +
arch/x86/entry/syscalls/syscall_32.tbl | 1 +
arch/x86/entry/syscalls/syscall_64.tbl | 1 +
arch/xtensa/kernel/syscalls/syscall.tbl | 1 +
fs/file.c | 63 ++++++++++++++++++---
fs/open.c | 20 +++++++
include/linux/fdtable.h | 2 +
include/linux/syscalls.h | 2 +
include/uapi/asm-generic/unistd.h | 4 +-
22 files changed, 100 insertions(+), 9 deletions(-)
diff --git a/arch/alpha/kernel/syscalls/syscall.tbl b/arch/alpha/kernel/syscalls/syscall.tbl
index 9e7704e44f6d..b55d93af8096 100644
--- a/arch/alpha/kernel/syscalls/syscall.tbl
+++ b/arch/alpha/kernel/syscalls/syscall.tbl
@@ -473,3 +473,4 @@
541 common fsconfig sys_fsconfig
542 common fsmount sys_fsmount
543 common fspick sys_fspick
+545 common close_range sys_close_range
diff --git a/arch/arm/tools/syscall.tbl b/arch/arm/tools/syscall.tbl
index aaf479a9e92d..0125c97c75dd 100644
--- a/arch/arm/tools/syscall.tbl
+++ b/arch/arm/tools/syscall.tbl
@@ -447,3 +447,4 @@
431 common fsconfig sys_fsconfig
432 common fsmount sys_fsmount
433 common fspick sys_fspick
+435 common close_range sys_close_range
diff --git a/arch/arm64/include/asm/unistd32.h b/arch/arm64/include/asm/unistd32.h
index c39e90600bb3..9a3270d29b42 100644
--- a/arch/arm64/include/asm/unistd32.h
+++ b/arch/arm64/include/asm/unistd32.h
@@ -886,6 +886,8 @@ __SYSCALL(__NR_fsconfig, sys_fsconfig)
__SYSCALL(__NR_fsmount, sys_fsmount)
#define __NR_fspick 433
__SYSCALL(__NR_fspick, sys_fspick)
+#define __NR_close_range 435
+__SYSCALL(__NR_close_range, sys_close_range)
/*
* Please add new compat syscalls above this comment and update
diff --git a/arch/ia64/kernel/syscalls/syscall.tbl b/arch/ia64/kernel/syscalls/syscall.tbl
index e01df3f2f80d..1a90b464e96f 100644
--- a/arch/ia64/kernel/syscalls/syscall.tbl
+++ b/arch/ia64/kernel/syscalls/syscall.tbl
@@ -354,3 +354,4 @@
431 common fsconfig sys_fsconfig
432 common fsmount sys_fsmount
433 common fspick sys_fspick
+435 common close_range sys_close_range
diff --git a/arch/m68k/kernel/syscalls/syscall.tbl b/arch/m68k/kernel/syscalls/syscall.tbl
index 7e3d0734b2f3..2dee2050f9ef 100644
--- a/arch/m68k/kernel/syscalls/syscall.tbl
+++ b/arch/m68k/kernel/syscalls/syscall.tbl
@@ -433,3 +433,4 @@
431 common fsconfig sys_fsconfig
432 common fsmount sys_fsmount
433 common fspick sys_fspick
+435 common close_range sys_close_range
diff --git a/arch/microblaze/kernel/syscalls/syscall.tbl b/arch/microblaze/kernel/syscalls/syscall.tbl
index 26339e417695..923ef69e5a76 100644
--- a/arch/microblaze/kernel/syscalls/syscall.tbl
+++ b/arch/microblaze/kernel/syscalls/syscall.tbl
@@ -439,3 +439,4 @@
431 common fsconfig sys_fsconfig
432 common fsmount sys_fsmount
433 common fspick sys_fspick
+435 common close_range sys_close_range
diff --git a/arch/mips/kernel/syscalls/syscall_n32.tbl b/arch/mips/kernel/syscalls/syscall_n32.tbl
index 0e2dd68ade57..967ed9de51cd 100644
--- a/arch/mips/kernel/syscalls/syscall_n32.tbl
+++ b/arch/mips/kernel/syscalls/syscall_n32.tbl
@@ -372,3 +372,4 @@
431 n32 fsconfig sys_fsconfig
432 n32 fsmount sys_fsmount
433 n32 fspick sys_fspick
+435 n32 close_range sys_close_range
diff --git a/arch/mips/kernel/syscalls/syscall_n64.tbl b/arch/mips/kernel/syscalls/syscall_n64.tbl
index 5eebfa0d155c..71de731102b1 100644
--- a/arch/mips/kernel/syscalls/syscall_n64.tbl
+++ b/arch/mips/kernel/syscalls/syscall_n64.tbl
@@ -348,3 +348,4 @@
431 n64 fsconfig sys_fsconfig
432 n64 fsmount sys_fsmount
433 n64 fspick sys_fspick
+435 n64 close_range sys_close_range
diff --git a/arch/mips/kernel/syscalls/syscall_o32.tbl b/arch/mips/kernel/syscalls/syscall_o32.tbl
index 3cc1374e02d0..5a325ab29f88 100644
--- a/arch/mips/kernel/syscalls/syscall_o32.tbl
+++ b/arch/mips/kernel/syscalls/syscall_o32.tbl
@@ -421,3 +421,4 @@
431 o32 fsconfig sys_fsconfig
432 o32 fsmount sys_fsmount
433 o32 fspick sys_fspick
+435 o32 close_range sys_close_range
diff --git a/arch/parisc/kernel/syscalls/syscall.tbl b/arch/parisc/kernel/syscalls/syscall.tbl
index c9e377d59232..dcc0a0879139 100644
--- a/arch/parisc/kernel/syscalls/syscall.tbl
+++ b/arch/parisc/kernel/syscalls/syscall.tbl
@@ -430,3 +430,4 @@
431 common fsconfig sys_fsconfig
432 common fsmount sys_fsmount
433 common fspick sys_fspick
+435 common close_range sys_close_range
diff --git a/arch/powerpc/kernel/syscalls/syscall.tbl b/arch/powerpc/kernel/syscalls/syscall.tbl
index 103655d84b4b..ba2c1f078cbd 100644
--- a/arch/powerpc/kernel/syscalls/syscall.tbl
+++ b/arch/powerpc/kernel/syscalls/syscall.tbl
@@ -515,3 +515,4 @@
431 common fsconfig sys_fsconfig
432 common fsmount sys_fsmount
433 common fspick sys_fspick
+435 common close_range sys_close_range
diff --git a/arch/s390/kernel/syscalls/syscall.tbl b/arch/s390/kernel/syscalls/syscall.tbl
index e822b2964a83..d7c9043d2902 100644
--- a/arch/s390/kernel/syscalls/syscall.tbl
+++ b/arch/s390/kernel/syscalls/syscall.tbl
@@ -436,3 +436,4 @@
431 common fsconfig sys_fsconfig sys_fsconfig
432 common fsmount sys_fsmount sys_fsmount
433 common fspick sys_fspick sys_fspick
+435 common close_range sys_close_range sys_close_range
diff --git a/arch/sh/kernel/syscalls/syscall.tbl b/arch/sh/kernel/syscalls/syscall.tbl
index 016a727d4357..9b5e6bf0ce32 100644
--- a/arch/sh/kernel/syscalls/syscall.tbl
+++ b/arch/sh/kernel/syscalls/syscall.tbl
@@ -436,3 +436,4 @@
431 common fsconfig sys_fsconfig
432 common fsmount sys_fsmount
433 common fspick sys_fspick
+435 common close_range sys_close_range
diff --git a/arch/sparc/kernel/syscalls/syscall.tbl b/arch/sparc/kernel/syscalls/syscall.tbl
index e047480b1605..8c674a1e0072 100644
--- a/arch/sparc/kernel/syscalls/syscall.tbl
+++ b/arch/sparc/kernel/syscalls/syscall.tbl
@@ -479,3 +479,4 @@
431 common fsconfig sys_fsconfig
432 common fsmount sys_fsmount
433 common fspick sys_fspick
+435 common close_range sys_close_range
diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
index ad968b7bac72..7f7a89a96707 100644
--- a/arch/x86/entry/syscalls/syscall_32.tbl
+++ b/arch/x86/entry/syscalls/syscall_32.tbl
@@ -438,3 +438,4 @@
431 i386 fsconfig sys_fsconfig __ia32_sys_fsconfig
432 i386 fsmount sys_fsmount __ia32_sys_fsmount
433 i386 fspick sys_fspick __ia32_sys_fspick
+435 i386 close_range sys_close_range __ia32_sys_close_range
diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
index b4e6f9e6204a..0f7d47ae921c 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -355,6 +355,7 @@
431 common fsconfig __x64_sys_fsconfig
432 common fsmount __x64_sys_fsmount
433 common fspick __x64_sys_fspick
+435 common close_range __x64_sys_close_range
#
# x32-specific system call numbers start at 512 to avoid cache impact
diff --git a/arch/xtensa/kernel/syscalls/syscall.tbl b/arch/xtensa/kernel/syscalls/syscall.tbl
index 5fa0ee1c8e00..b489532265d0 100644
--- a/arch/xtensa/kernel/syscalls/syscall.tbl
+++ b/arch/xtensa/kernel/syscalls/syscall.tbl
@@ -404,3 +404,4 @@
431 common fsconfig sys_fsconfig
432 common fsmount sys_fsmount
433 common fspick sys_fspick
+435 common close_range sys_close_range
diff --git a/fs/file.c b/fs/file.c
index 3da91a112bab..54945efa046e 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -615,12 +615,9 @@ void fd_install(unsigned int fd, struct file *file)
EXPORT_SYMBOL(fd_install);
-/*
- * The same warnings as for __alloc_fd()/__fd_install() apply here...
- */
-int __close_fd(struct files_struct *files, unsigned fd)
+static struct file *pick_file(struct files_struct *files, unsigned fd)
{
- struct file *file;
+ struct file *file = NULL;
struct fdtable *fdt;
spin_lock(&files->file_lock);
@@ -632,15 +629,65 @@ int __close_fd(struct files_struct *files, unsigned fd)
goto out_unlock;
rcu_assign_pointer(fdt->fd[fd], NULL);
__put_unused_fd(files, fd);
- spin_unlock(&files->file_lock);
- return filp_close(file, files);
out_unlock:
spin_unlock(&files->file_lock);
- return -EBADF;
+ return file;
+}
+
+/*
+ * The same warnings as for __alloc_fd()/__fd_install() apply here...
+ */
+int __close_fd(struct files_struct *files, unsigned fd)
+{
+ struct file *file;
+
+ file = pick_file(files, fd);
+ if (!file)
+ return -EBADF;
+
+ return filp_close(file, files);
}
EXPORT_SYMBOL(__close_fd); /* for ksys_close() */
+/**
+ * __close_range() - Close all file descriptors in a given range.
+ *
+ * @fd: starting file descriptor to close
+ * @max_fd: last file descriptor to close
+ *
+ * This closes a range of file descriptors. All file descriptors
+ * from @fd up to and including @max_fd are closed.
+ */
+int __close_range(struct files_struct *files, unsigned fd, unsigned max_fd)
+{
+ unsigned int cur_max;
+
+ if (fd > max_fd)
+ return -EINVAL;
+
+ rcu_read_lock();
+ cur_max = files_fdtable(files)->max_fds;
+ rcu_read_unlock();
+
+ /* cap to last valid index into fdtable */
+ if (max_fd >= cur_max)
+ max_fd = cur_max - 1;
+
+ while (fd <= max_fd) {
+ struct file *file;
+
+ file = pick_file(files, fd++);
+ if (!file)
+ continue;
+
+ filp_close(file, files);
+ cond_resched();
+ }
+
+ return 0;
+}
+
/*
* variant of __close_fd that gets a ref on the file for later fput
*/
diff --git a/fs/open.c b/fs/open.c
index 9c7d724a6f67..c7baaee7aa47 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -1174,6 +1174,26 @@ SYSCALL_DEFINE1(close, unsigned int, fd)
return retval;
}
+/**
+ * close_range() - Close all file descriptors in a given range.
+ *
+ * @fd: starting file descriptor to close
+ * @max_fd: last file descriptor to close
+ * @flags: reserved for future extensions
+ *
+ * This closes a range of file descriptors. All file descriptors
+ * from @fd up to and including @max_fd are closed.
+ * Currently, errors to close a given file descriptor are ignored.
+ */
+SYSCALL_DEFINE3(close_range, unsigned int, fd, unsigned int, max_fd,
+ unsigned int, flags)
+{
+ if (flags)
+ return -EINVAL;
+
+ return __close_range(current->files, fd, max_fd);
+}
+
/*
* This routine simulates a hangup on the tty, to arrange that users
* are given clean terminals at login time.
diff --git a/include/linux/fdtable.h b/include/linux/fdtable.h
index f07c55ea0c22..fcd07181a365 100644
--- a/include/linux/fdtable.h
+++ b/include/linux/fdtable.h
@@ -121,6 +121,8 @@ extern void __fd_install(struct files_struct *files,
unsigned int fd, struct file *file);
extern int __close_fd(struct files_struct *files,
unsigned int fd);
+extern int __close_range(struct files_struct *files, unsigned int fd,
+ unsigned int max_fd);
extern int __close_fd_get_file(unsigned int fd, struct file **res);
extern struct kmem_cache *files_cachep;
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index e2870fe1be5b..c0189e223255 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -441,6 +441,8 @@ asmlinkage long sys_fchown(unsigned int fd, uid_t user, gid_t group);
asmlinkage long sys_openat(int dfd, const char __user *filename, int flags,
umode_t mode);
asmlinkage long sys_close(unsigned int fd);
+asmlinkage long sys_close_range(unsigned int fd, unsigned int max_fd,
+ unsigned int flags);
asmlinkage long sys_vhangup(void);
/* fs/pipe.c */
diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h
index a87904daf103..3f36c8745d24 100644
--- a/include/uapi/asm-generic/unistd.h
+++ b/include/uapi/asm-generic/unistd.h
@@ -844,9 +844,11 @@ __SYSCALL(__NR_fsconfig, sys_fsconfig)
__SYSCALL(__NR_fsmount, sys_fsmount)
#define __NR_fspick 433
__SYSCALL(__NR_fspick, sys_fspick)
+#define __NR_close_range 435
+__SYSCALL(__NR_close_range, sys_close_range)
#undef __NR_syscalls
-#define __NR_syscalls 434
+#define __NR_syscalls 436
/*
* 32 bit systems traditionally used different
--
2.21.0
^ permalink raw reply related
* [PATCH v1 2/2] tests: add close_range() tests
From: Christian Brauner @ 2019-05-22 15:52 UTC (permalink / raw)
To: viro, linux-kernel, linux-fsdevel, linux-api, torvalds, fweimer
Cc: linux-ia64, linux-sh, ldv, dhowells, linux-kselftest, sparclinux,
shuah, linux-arch, linux-s390, miklos, x86, Christian Brauner,
linux-mips, linux-xtensa, tkjos, arnd, jannh, linux-m68k, tglx,
linux-arm-kernel, linux-parisc, oleg, linux-alpha, linuxppc-dev
In-Reply-To: <20190522155259.11174-1-christian@brauner.io>
This adds basic tests for the new close_range() syscall.
- test that no invalid flags can be passed
- test that a range of file descriptors is correctly closed
- test that a range of file descriptors is correctly closed if there
are already closed file descriptors in the range
- test that max_fd is correctly capped to the current fdtable maximum
Signed-off-by: Christian Brauner <christian@brauner.io>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Jann Horn <jannh@google.com>
Cc: David Howells <dhowells@redhat.com>
Cc: Dmitry V. Levin <ldv@altlinux.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Florian Weimer <fweimer@redhat.com>
Cc: linux-api@vger.kernel.org
---
v1: unchanged
---
tools/testing/selftests/Makefile | 1 +
tools/testing/selftests/core/.gitignore | 1 +
tools/testing/selftests/core/Makefile | 6 +
.../testing/selftests/core/close_range_test.c | 128 ++++++++++++++++++
4 files changed, 136 insertions(+)
create mode 100644 tools/testing/selftests/core/.gitignore
create mode 100644 tools/testing/selftests/core/Makefile
create mode 100644 tools/testing/selftests/core/close_range_test.c
diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile
index 9781ca79794a..06e57fabbff9 100644
--- a/tools/testing/selftests/Makefile
+++ b/tools/testing/selftests/Makefile
@@ -4,6 +4,7 @@ TARGETS += bpf
TARGETS += breakpoints
TARGETS += capabilities
TARGETS += cgroup
+TARGETS += core
TARGETS += cpufreq
TARGETS += cpu-hotplug
TARGETS += drivers/dma-buf
diff --git a/tools/testing/selftests/core/.gitignore b/tools/testing/selftests/core/.gitignore
new file mode 100644
index 000000000000..6e6712ce5817
--- /dev/null
+++ b/tools/testing/selftests/core/.gitignore
@@ -0,0 +1 @@
+close_range_test
diff --git a/tools/testing/selftests/core/Makefile b/tools/testing/selftests/core/Makefile
new file mode 100644
index 000000000000..de3ae68aa345
--- /dev/null
+++ b/tools/testing/selftests/core/Makefile
@@ -0,0 +1,6 @@
+CFLAGS += -g -I../../../../usr/include/ -I../../../../include
+
+TEST_GEN_PROGS := close_range_test
+
+include ../lib.mk
+
diff --git a/tools/testing/selftests/core/close_range_test.c b/tools/testing/selftests/core/close_range_test.c
new file mode 100644
index 000000000000..ab10cd205ab9
--- /dev/null
+++ b/tools/testing/selftests/core/close_range_test.c
@@ -0,0 +1,128 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#define _GNU_SOURCE
+#include <errno.h>
+#include <fcntl.h>
+#include <linux/kernel.h>
+#include <limits.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <syscall.h>
+#include <unistd.h>
+
+#include "../kselftest.h"
+
+static inline int sys_close_range(unsigned int fd, unsigned int max_fd,
+ unsigned int flags)
+{
+ return syscall(__NR_close_range, fd, max_fd, flags);
+}
+
+#ifndef ARRAY_SIZE
+#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
+#endif
+
+int main(int argc, char **argv)
+{
+ const char *test_name = "close_range";
+ int i, ret;
+ int open_fds[100];
+ int fd_max, fd_mid, fd_min;
+
+ ksft_set_plan(7);
+
+ for (i = 0; i < ARRAY_SIZE(open_fds); i++) {
+ int fd;
+
+ fd = open("/dev/null", O_RDONLY | O_CLOEXEC);
+ if (fd < 0) {
+ if (errno == ENOENT)
+ ksft_exit_skip(
+ "%s test: skipping test since /dev/null does not exist\n",
+ test_name);
+
+ ksft_exit_fail_msg(
+ "%s test: %s - failed to open /dev/null\n",
+ strerror(errno), test_name);
+ }
+
+ open_fds[i] = fd;
+ }
+
+ fd_min = open_fds[0];
+ fd_max = open_fds[99];
+
+ ret = sys_close_range(fd_min, fd_max, 1);
+ if (!ret)
+ ksft_exit_fail_msg(
+ "%s test: managed to pass invalid flag value\n",
+ test_name);
+ ksft_test_result_pass("do not allow invalid flag values for close_range()\n");
+
+ fd_mid = open_fds[50];
+ ret = sys_close_range(fd_min, fd_mid, 0);
+ if (ret < 0)
+ ksft_exit_fail_msg(
+ "%s test: Failed to close range of file descriptors from 4 to 50\n",
+ test_name);
+ ksft_test_result_pass("close_range() from %d to %d\n", fd_min, fd_mid);
+
+ for (i = 0; i <= 50; i++) {
+ ret = fcntl(open_fds[i], F_GETFL);
+ if (ret >= 0)
+ ksft_exit_fail_msg(
+ "%s test: Failed to close range of file descriptors from 4 to 50\n",
+ test_name);
+ }
+ ksft_test_result_pass("fcntl() verify closed range from %d to %d\n", fd_min, fd_mid);
+
+ /* create a couple of gaps */
+ close(57);
+ close(78);
+ close(81);
+ close(82);
+ close(84);
+ close(90);
+
+ fd_mid = open_fds[51];
+ /* Choose slightly lower limit and leave some fds for a later test */
+ fd_max = open_fds[92];
+ ret = sys_close_range(fd_mid, fd_max, 0);
+ if (ret < 0)
+ ksft_exit_fail_msg(
+ "%s test: Failed to close range of file descriptors from 51 to 100\n",
+ test_name);
+ ksft_test_result_pass("close_range() from %d to %d\n", fd_mid, fd_max);
+
+ for (i = 51; i <= 92; i++) {
+ ret = fcntl(open_fds[i], F_GETFL);
+ if (ret >= 0)
+ ksft_exit_fail_msg(
+ "%s test: Failed to close range of file descriptors from 51 to 100\n",
+ test_name);
+ }
+ ksft_test_result_pass("fcntl() verify closed range from %d to %d\n", fd_mid, fd_max);
+
+ fd_mid = open_fds[93];
+ fd_max = open_fds[99];
+ /* test that the kernel caps and still closes all fds */
+ ret = sys_close_range(fd_mid, UINT_MAX, 0);
+ if (ret < 0)
+ ksft_exit_fail_msg(
+ "%s test: Failed to close range of file descriptors from 51 to 100\n",
+ test_name);
+ ksft_test_result_pass("close_range() from %d to %d\n", fd_mid, fd_max);
+
+ for (i = 93; i < 100; i++) {
+ ret = fcntl(open_fds[i], F_GETFL);
+ if (ret >= 0)
+ ksft_exit_fail_msg(
+ "%s test: Failed to close range of file descriptors from 51 to 100\n",
+ test_name);
+ }
+ ksft_test_result_pass("fcntl() verify closed range from %d to %d\n", fd_mid, fd_max);
+
+ return ksft_exit_pass();
+}
--
2.21.0
^ permalink raw reply related
* [PATCH] powerpc/powernv: fix a W=1 compilation warning
From: Qian Cai @ 2019-05-22 16:09 UTC (permalink / raw)
To: benh, paulus, mpe; +Cc: aik, Qian Cai, linuxppc-dev, linux-kernel
The commit b575c731fe58 ("powerpc/powernv/npu: Add set/unset window
helpers") called pnv_npu_set_window() in a void function
pnv_npu_dma_set_32(), but the return code from pnv_npu_set_window() has
no use there as all the error logging happens in pnv_npu_set_window(),
so just remove the unused variable to avoid a compilation warning,
arch/powerpc/platforms/powernv/npu-dma.c: In function
'pnv_npu_dma_set_32':
arch/powerpc/platforms/powernv/npu-dma.c:198:10: warning: variable ‘rc’
set but not used [-Wunused-but-set-variable]
Signed-off-by: Qian Cai <cai@lca.pw>
---
arch/powerpc/platforms/powernv/npu-dma.c | 5 ++---
1 file changed, 2 insertions(+), 3 deletions(-)
diff --git a/arch/powerpc/platforms/powernv/npu-dma.c b/arch/powerpc/platforms/powernv/npu-dma.c
index 495550432f3d..035208ed591f 100644
--- a/arch/powerpc/platforms/powernv/npu-dma.c
+++ b/arch/powerpc/platforms/powernv/npu-dma.c
@@ -195,7 +195,6 @@ static void pnv_npu_dma_set_32(struct pnv_ioda_pe *npe)
{
struct pci_dev *gpdev;
struct pnv_ioda_pe *gpe;
- int64_t rc;
/*
* Find the assoicated PCI devices and get the dma window
@@ -208,8 +207,8 @@ static void pnv_npu_dma_set_32(struct pnv_ioda_pe *npe)
if (!gpe)
return;
- rc = pnv_npu_set_window(&npe->table_group, 0,
- gpe->table_group.tables[0]);
+ pnv_npu_set_window(&npe->table_group, 0,
+ gpe->table_group.tables[0]);
/*
* NVLink devices use the same TCE table configuration as
--
1.8.3.1
^ permalink raw reply related
* Re: [PATCH v1 1/2] open: add close_range()
From: Oleg Nesterov @ 2019-05-22 16:57 UTC (permalink / raw)
To: Christian Brauner
Cc: linux-ia64, linux-sh, ldv, dhowells, linux-kselftest, sparclinux,
shuah, linux-arch, linux-s390, miklos, x86, torvalds, linux-mips,
linux-xtensa, tkjos, arnd, jannh, linux-m68k, viro, tglx,
linux-arm-kernel, fweimer, linux-parisc, linux-api, linux-kernel,
linux-alpha, linux-fsdevel, linuxppc-dev
In-Reply-To: <20190522155259.11174-1-christian@brauner.io>
On 05/22, Christian Brauner wrote:
>
> +static struct file *pick_file(struct files_struct *files, unsigned fd)
> {
> - struct file *file;
> + struct file *file = NULL;
> struct fdtable *fdt;
>
> spin_lock(&files->file_lock);
> @@ -632,15 +629,65 @@ int __close_fd(struct files_struct *files, unsigned fd)
> goto out_unlock;
> rcu_assign_pointer(fdt->fd[fd], NULL);
> __put_unused_fd(files, fd);
> - spin_unlock(&files->file_lock);
> - return filp_close(file, files);
>
> out_unlock:
> spin_unlock(&files->file_lock);
> - return -EBADF;
> + return file;
...
> +int __close_range(struct files_struct *files, unsigned fd, unsigned max_fd)
> +{
> + unsigned int cur_max;
> +
> + if (fd > max_fd)
> + return -EINVAL;
> +
> + rcu_read_lock();
> + cur_max = files_fdtable(files)->max_fds;
> + rcu_read_unlock();
> +
> + /* cap to last valid index into fdtable */
> + if (max_fd >= cur_max)
> + max_fd = cur_max - 1;
> +
> + while (fd <= max_fd) {
> + struct file *file;
> +
> + file = pick_file(files, fd++);
Well, how about something like
/*
 * Return the lowest open fd in @fdt that is >= @start.
 *
 * If there is no such fd (including when @start is already past the end
 * of the table) fdt->max_fds is returned, so callers can test the result
 * with "fd >= fdt->max_fds".
 *
 * Only the open_fds bitmap is consulted.  full_fds_bits cannot be used to
 * skip ahead here: a bit in it is set only when the corresponding word of
 * open_fds is completely full, so searching it for set bits would jump
 * over every open fd that lives in a partially populated word.
 */
static unsigned int find_next_opened_fd(struct fdtable *fdt, unsigned start)
{
	return find_next_bit(fdt->open_fds, fdt->max_fds, start);
}
/*
 * Close the lowest open fd of @files in the range [@start, @maxfd].
 *
 * The fd table is searched and the slot is cleared under files->file_lock;
 * the file itself is closed after the lock has been dropped.
 *
 * Returns the fd following the one that was examined, so the caller can
 * resume the scan from there, or -1u when no open fd is left in the range.
 */
unsigned close_next_fd(struct files_struct *files, unsigned start, unsigned maxfd)
{
	unsigned fd;
	struct file *file = NULL;
	struct fdtable *fdt;

	spin_lock(&files->file_lock);
	fdt = files_fdtable(files);
	fd = find_next_opened_fd(fdt, start);
	if (fd >= fdt->max_fds || fd > maxfd) {
		fd = -1;
		goto out;
	}

	/*
	 * A bit set in open_fds does not guarantee a file in the slot (e.g.
	 * an fd that has been reserved but not installed yet).  Leave such
	 * an fd alone, as __close_fd() does, instead of releasing it and
	 * handing a NULL file to filp_close().
	 */
	file = fdt->fd[fd];
	if (file) {
		rcu_assign_pointer(fdt->fd[fd], NULL);
		__put_unused_fd(files, fd);
	}
out:
	spin_unlock(&files->file_lock);

	if (fd == -1u)
		return fd;

	if (file)
		filp_close(file, files);

	return fd + 1;
}
?
Then close_range() can do
while (fd < max_fd)
fd = close_next_fd(fd, maxfd);
Oleg.
^ permalink raw reply
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox