LinuxPPC-Dev Archive on lore.kernel.org

LinuxPPC-Dev Archive on lore.kernel.org
 help / color / mirror / Atom feed

* Re: [PATCH] powerpc/powernv: Show checkstop reason for NPU2 HMIs
From: Michael Ellerman @ 2019-05-23 13:45 UTC (permalink / raw)
  To: Frederic Barrat, linuxppc-dev, ajd, arbab, aik; +Cc: clombard, groug
In-Reply-To: <20190523122804.4801-1-fbarrat@linux.ibm.com>

Frederic Barrat <fbarrat@linux.ibm.com> writes:

> If the kernel is notified of an HMI caused by the NPU2, it's currently
> not being recognized and it logs the default message:
>
>     Unknown Malfunction Alert of type 3
>
> The NPU on Power 9 has 3 Fault Isolation Registers, so that's a lot of
> possible causes, but we should at least log that it's an NPU problem
> and report which FIR and which bit were raised if opal gave us the
> information.
>
> Signed-off-by: Frederic Barrat <fbarrat@linux.ibm.com>
> ---
>
> Could be merged independently from (the opal-api.h change is already
> in the skiboot tree), but works better with, the matching skiboot
> change:
> http://patchwork.ozlabs.org/patch/1104076/

Well it *must* work with or without the skiboot change, because old/new
kernels will run on old/new skiboots.

It looks like it will work fine, we just won't get any extra information
in xstop_reason, right?

cheers

> diff --git a/arch/powerpc/include/asm/opal-api.h b/arch/powerpc/include/asm/opal-api.h
> index e1577cfa7186..2492fe248e1e 100644
> --- a/arch/powerpc/include/asm/opal-api.h
> +++ b/arch/powerpc/include/asm/opal-api.h
> @@ -568,6 +568,7 @@ enum OpalHMI_XstopType {
>  	CHECKSTOP_TYPE_UNKNOWN	=	0,
>  	CHECKSTOP_TYPE_CORE	=	1,
>  	CHECKSTOP_TYPE_NX	=	2,
> +	CHECKSTOP_TYPE_NPU	=	3
>  };
>  
>  enum OpalHMI_CoreXstopReason {
> diff --git a/arch/powerpc/platforms/powernv/opal-hmi.c b/arch/powerpc/platforms/powernv/opal-hmi.c
> index 586ec71a4e17..de12a240b477 100644
> --- a/arch/powerpc/platforms/powernv/opal-hmi.c
> +++ b/arch/powerpc/platforms/powernv/opal-hmi.c
> @@ -149,6 +149,43 @@ static void print_nx_checkstop_reason(const char *level,
>  					xstop_reason[i].description);
>  }
>  
> +static void print_npu_checkstop_reason(const char *level,
> +					struct OpalHMIEvent *hmi_evt)
> +{
> +	uint8_t reason, reason_count, i;
> +
> +	/*
> +	 * We may not have a checkstop reason on some combination of
> +	 * hardware and/or skiboot version
> +	 */
> +	if (!hmi_evt->u.xstop_error.xstop_reason) {
> +		printk("%s	NPU checkstop on chip %x\n", level,
> +			be32_to_cpu(hmi_evt->u.xstop_error.u.chip_id));
> +		return;
> +	}
> +
> +	/*
> +	 * NPU2 has 3 FIRs. Reason encoded on a byte as:
> +	 *   2 bits for the FIR number
> +	 *   6 bits for the bit number
> +	 * It may be possible to find several reasons.
> +	 *
> +	 * We don't display a specific message per FIR bit as there
> +	 * are too many and most are meaningless without the workbook
> +	 * and/or hw team help anyway.
> +	 */
> +	reason_count = sizeof(hmi_evt->u.xstop_error.xstop_reason) /
> +		sizeof(reason);
> +	for (i = 0; i < reason_count; i++) {
> +		reason = (hmi_evt->u.xstop_error.xstop_reason >> (8 * i)) & 0xFF;
> +		if (reason)
> +			printk("%s	NPU checkstop on chip %x: FIR%d bit %d is set\n",
> +				level,
> +				be32_to_cpu(hmi_evt->u.xstop_error.u.chip_id),
> +				reason >> 6, reason & 0x3F);
> +	}
> +}
> +
>  static void print_checkstop_reason(const char *level,
>  					struct OpalHMIEvent *hmi_evt)
>  {
> @@ -160,6 +197,9 @@ static void print_checkstop_reason(const char *level,
>  	case CHECKSTOP_TYPE_NX:
>  		print_nx_checkstop_reason(level, hmi_evt);
>  		break;
> +	case CHECKSTOP_TYPE_NPU:
> +		print_npu_checkstop_reason(level, hmi_evt);
> +		break;
>  	default:
>  		printk("%s	Unknown Malfunction Alert of type %d\n",
>  		       level, type);
> -- 
> 2.21.0

^ permalink raw reply

* Applied "spi: spi-fsl-spi: call spi_finalize_current_message() at the end" to the spi tree
From: Mark Brown @ 2019-05-23 13:49 UTC (permalink / raw)
  To: Christophe Leroy; +Cc: Mark Brown, linuxppc-dev, linux-kernel, linux-spi
In-Reply-To: <9c1b9545e6683666a795cd89070fa9491f33b9da.1558522754.git.christophe.leroy@c-s.fr>

The patch

   spi: spi-fsl-spi: call spi_finalize_current_message() at the end

has been applied to the spi tree at

   https://git.kernel.org/pub/scm/linux/kernel/git/broonie/spi.git for-5.2

All being well this means that it will be integrated into the linux-next
tree (usually sometime in the next 24 hours) and sent to Linus during
the next merge window (or sooner if it is a bug fix), however if
problems are discovered then the patch may be dropped or reverted.  

You may get further e-mails resulting from automated or manual testing
and review of the tree, please engage with people reporting problems and
send followup patches addressing any issues that are reported if needed.

If any updates are required or you are submitting further changes they
should be sent as incremental updates against current git, existing
patches will not be replaced.

Please add any relevant lists and maintainers to the CCs when replying
to this mail.

Thanks,
Mark

From 44a042182cb1e9f7916e015c836967bf638b33c4 Mon Sep 17 00:00:00 2001
From: Christophe Leroy <christophe.leroy@c-s.fr>
Date: Wed, 22 May 2019 11:00:36 +0000
Subject: [PATCH] spi: spi-fsl-spi: call spi_finalize_current_message() at the
 end

spi_finalize_current_message() shall be called once all
actions are finished, otherwise the last actions might
step over a newly started transfer.

Fixes: c592becbe704 ("spi: fsl-(e)spi: migrate to generic master queueing")
Signed-off-by: Christophe Leroy <christophe.leroy@c-s.fr>
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 drivers/spi/spi-fsl-spi.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/spi/spi-fsl-spi.c b/drivers/spi/spi-fsl-spi.c
index b36ac6aa3b1f..7fbdaf066719 100644
--- a/drivers/spi/spi-fsl-spi.c
+++ b/drivers/spi/spi-fsl-spi.c
@@ -432,7 +432,6 @@ static int fsl_spi_do_one_msg(struct spi_master *master,
 	}

 	m->status = status;
-	spi_finalize_current_message(master);

 	if (status || !cs_change) {
 		ndelay(nsecs);
@@ -440,6 +439,7 @@ static int fsl_spi_do_one_msg(struct spi_master *master,
 	}

 	fsl_spi_setup_transfer(spi, NULL);
+	spi_finalize_current_message(master);
 	return 0;
 }

-- 
2.20.1

^ permalink raw reply related

* Re: [PATCH v1 1/2] open: add close_range()
From: Christian Brauner @ 2019-05-23 14:14 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: linux-ia64, linux-sh, ldv, dhowells, linux-kselftest, sparclinux,
	shuah, linux-arch, linux-s390, miklos, x86, torvalds, linux-mips,
	linux-xtensa, tkjos, arnd, jannh, linux-m68k, viro, tglx,
	linux-arm-kernel, fweimer, linux-parisc, linux-api, linux-kernel,
	linux-alpha, linux-fsdevel, linuxppc-dev
In-Reply-To: <20190523115118.pmscbd6kaqy37dym@brauner.io>

On Thu, May 23, 2019 at 01:51:18PM +0200, Christian Brauner wrote:
> On Wed, May 22, 2019 at 06:57:37PM +0200, Oleg Nesterov wrote:
> > On 05/22, Christian Brauner wrote:
> > >
> > > +static struct file *pick_file(struct files_struct *files, unsigned fd)
> > >  {
> > > -	struct file *file;
> > > +	struct file *file = NULL;
> > >  	struct fdtable *fdt;
> > >  
> > >  	spin_lock(&files->file_lock);
> > > @@ -632,15 +629,65 @@ int __close_fd(struct files_struct *files, unsigned fd)
> > >  		goto out_unlock;
> > >  	rcu_assign_pointer(fdt->fd[fd], NULL);
> > >  	__put_unused_fd(files, fd);
> > > -	spin_unlock(&files->file_lock);
> > > -	return filp_close(file, files);
> > >  
> > >  out_unlock:
> > >  	spin_unlock(&files->file_lock);
> > > -	return -EBADF;
> > > +	return file;
> > 
> > ...
> > 
> > > +int __close_range(struct files_struct *files, unsigned fd, unsigned max_fd)
> > > +{
> > > +	unsigned int cur_max;
> > > +
> > > +	if (fd > max_fd)
> > > +		return -EINVAL;
> > > +
> > > +	rcu_read_lock();
> > > +	cur_max = files_fdtable(files)->max_fds;
> > > +	rcu_read_unlock();
> > > +
> > > +	/* cap to last valid index into fdtable */
> > > +	if (max_fd >= cur_max)
> > > +		max_fd = cur_max - 1;
> > > +
> > > +	while (fd <= max_fd) {
> > > +		struct file *file;
> > > +
> > > +		file = pick_file(files, fd++);
> > 
> > Well, how about something like
> > 
> > 	static unsigned int find_next_opened_fd(struct fdtable *fdt, unsigned start)
> > 	{
> > 		unsigned int maxfd = fdt->max_fds;
> > 		unsigned int maxbit = maxfd / BITS_PER_LONG;
> > 		unsigned int bitbit = start / BITS_PER_LONG;
> > 
> > 		bitbit = find_next_bit(fdt->full_fds_bits, maxbit, bitbit) * BITS_PER_LONG;
> > 		if (bitbit > maxfd)
> > 			return maxfd;
> > 		if (bitbit > start)
> > 			start = bitbit;
> > 		return find_next_bit(fdt->open_fds, maxfd, start);
> > 	}
> 
> > 
> > 	unsigned close_next_fd(struct files_struct *files, unsigned start, unsigned maxfd)
> > 	{
> > 		unsigned fd;
> > 		struct file *file;
> > 		struct fdtable *fdt;
> > 	
> > 		spin_lock(&files->file_lock);
> > 		fdt = files_fdtable(files);
> > 		fd = find_next_opened_fd(fdt, start);
> > 		if (fd >= fdt->max_fds || fd > maxfd) {
> > 			fd = -1;
> > 			goto out;
> > 		}
> > 
> > 		file = fdt->fd[fd];
> > 		rcu_assign_pointer(fdt->fd[fd], NULL);
> > 		__put_unused_fd(files, fd);
> > 	out:
> > 		spin_unlock(&files->file_lock);
> > 
> > 		if (fd == -1u)
> > 			return fd;
> > 
> > 		filp_close(file, files);
> > 		return fd + 1;
> > 	}
> 
> Thanks, Oleg!
> 
> I kept it dumb and was about to reply that your solution introduces more
> code when it seemed we wanted to keep this very simple for now.
> But then I saw that find_next_opened_fd() already exists as
> find_next_fd(). So it's actually not bad compared to what I sent in v1.
> So - with some small tweaks (need to test it and all now) - how do we
> feel about?:

That's obviously not correct atm but I'll send out a tweaked version in
a bit.

Christian

^ permalink raw reply

* Re: [PATCH v1 1/2] open: add close_range()
From: Christian Brauner @ 2019-05-23 14:28 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: linux-ia64, linux-sh, ldv, dhowells, linux-kselftest, sparclinux,
	shuah, linux-arch, linux-s390, miklos, x86, torvalds, linux-mips,
	linux-xtensa, tkjos, arnd, jannh, linux-m68k, viro, tglx,
	linux-arm-kernel, fweimer, linux-parisc, linux-api, linux-kernel,
	linux-alpha, linux-fsdevel, linuxppc-dev
In-Reply-To: <20190523141447.34s3kc3fuwmoeq7n@brauner.io>

On Thu, May 23, 2019 at 04:14:47PM +0200, Christian Brauner wrote:
> On Thu, May 23, 2019 at 01:51:18PM +0200, Christian Brauner wrote:
> > On Wed, May 22, 2019 at 06:57:37PM +0200, Oleg Nesterov wrote:
> > > On 05/22, Christian Brauner wrote:
> > > >
> > > > +static struct file *pick_file(struct files_struct *files, unsigned fd)
> > > >  {
> > > > -	struct file *file;
> > > > +	struct file *file = NULL;
> > > >  	struct fdtable *fdt;
> > > >  
> > > >  	spin_lock(&files->file_lock);
> > > > @@ -632,15 +629,65 @@ int __close_fd(struct files_struct *files, unsigned fd)
> > > >  		goto out_unlock;
> > > >  	rcu_assign_pointer(fdt->fd[fd], NULL);
> > > >  	__put_unused_fd(files, fd);
> > > > -	spin_unlock(&files->file_lock);
> > > > -	return filp_close(file, files);
> > > >  
> > > >  out_unlock:
> > > >  	spin_unlock(&files->file_lock);
> > > > -	return -EBADF;
> > > > +	return file;
> > > 
> > > ...
> > > 
> > > > +int __close_range(struct files_struct *files, unsigned fd, unsigned max_fd)
> > > > +{
> > > > +	unsigned int cur_max;
> > > > +
> > > > +	if (fd > max_fd)
> > > > +		return -EINVAL;
> > > > +
> > > > +	rcu_read_lock();
> > > > +	cur_max = files_fdtable(files)->max_fds;
> > > > +	rcu_read_unlock();
> > > > +
> > > > +	/* cap to last valid index into fdtable */
> > > > +	if (max_fd >= cur_max)
> > > > +		max_fd = cur_max - 1;
> > > > +
> > > > +	while (fd <= max_fd) {
> > > > +		struct file *file;
> > > > +
> > > > +		file = pick_file(files, fd++);
> > > 
> > > Well, how about something like
> > > 
> > > 	static unsigned int find_next_opened_fd(struct fdtable *fdt, unsigned start)
> > > 	{
> > > 		unsigned int maxfd = fdt->max_fds;
> > > 		unsigned int maxbit = maxfd / BITS_PER_LONG;
> > > 		unsigned int bitbit = start / BITS_PER_LONG;
> > > 
> > > 		bitbit = find_next_bit(fdt->full_fds_bits, maxbit, bitbit) * BITS_PER_LONG;
> > > 		if (bitbit > maxfd)
> > > 			return maxfd;
> > > 		if (bitbit > start)
> > > 			start = bitbit;
> > > 		return find_next_bit(fdt->open_fds, maxfd, start);
> > > 	}
> > 
> > > 
> > > 	unsigned close_next_fd(struct files_struct *files, unsigned start, unsigned maxfd)
> > > 	{
> > > 		unsigned fd;
> > > 		struct file *file;
> > > 		struct fdtable *fdt;
> > > 	
> > > 		spin_lock(&files->file_lock);
> > > 		fdt = files_fdtable(files);
> > > 		fd = find_next_opened_fd(fdt, start);
> > > 		if (fd >= fdt->max_fds || fd > maxfd) {
> > > 			fd = -1;
> > > 			goto out;
> > > 		}
> > > 
> > > 		file = fdt->fd[fd];
> > > 		rcu_assign_pointer(fdt->fd[fd], NULL);
> > > 		__put_unused_fd(files, fd);
> > > 	out:
> > > 		spin_unlock(&files->file_lock);
> > > 
> > > 		if (fd == -1u)
> > > 			return fd;
> > > 
> > > 		filp_close(file, files);
> > > 		return fd + 1;
> > > 	}
> > 
> > Thanks, Oleg!
> > 
> > I kept it dumb and was about to reply that your solution introduces more
> > code when it seemed we wanted to keep this very simple for now.
> > But then I saw that find_next_opened_fd() already exists as
> > find_next_fd(). So it's actually not bad compared to what I sent in v1.
> > So - with some small tweaks (need to test it and all now) - how do we
> > feel about?:
> 
> That's obviously not correct atm but I'll send out a tweaked version in
> a bit.

So given that we would really need another find_next_open_fd() I think
sticking to the simple cond_resched() version I sent before is better
for now until we see real-world performance issues.
I was however missing a test for close_range(fd, fd, 0) anyway so I'll
need to send a v2 with this test added.

Christian

^ permalink raw reply

* Re: [PATCH v1 1/2] open: add close_range()
From: Jann Horn @ 2019-05-23 14:32 UTC (permalink / raw)
  To: Christian Brauner
  Cc: linux-ia64, linux-sh, kernel list, David Howells,
	open list:KERNEL SELFTEST FRAMEWORK, sparclinux, Shuah Khan,
	linux-arch, linux-s390, Miklos Szeredi, the arch/x86 maintainers,
	Linus Torvalds, linux-xtensa, Todd Kjos, Arnd Bergmann,
	linux-m68k, Al Viro, Oleg Nesterov, Thomas Gleixner,
	Dmitry V. Levin, linux-arm-kernel, Florian Weimer, linux-parisc,
	Linux API, linux-mips, linux-alpha, linux-fsdevel, linuxppc-dev
In-Reply-To: <20190523115118.pmscbd6kaqy37dym@brauner.io>

On Thu, May 23, 2019 at 1:51 PM Christian Brauner <christian@brauner.io> wrote:
[...]
> I kept it dumb and was about to reply that your solution introduces more
> code when it seemed we wanted to keep this very simple for now.
> But then I saw that find_next_opened_fd() already exists as
> find_next_fd(). So it's actually not bad compared to what I sent in v1.
> So - with some small tweaks (need to test it and all now) - how do we
> feel about?:
[...]
> static int __close_next_open_fd(struct files_struct *files, unsigned *curfd, unsigned maxfd)
> {
>         struct file *file = NULL;
>         unsigned fd;
>         struct fdtable *fdt;
>
>         spin_lock(&files->file_lock);
>         fdt = files_fdtable(files);
>         fd = find_next_fd(fdt, *curfd);

find_next_fd() finds free fds, not used ones.

>         if (fd >= fdt->max_fds || fd > maxfd)
>                 goto out_unlock;
>
>         file = fdt->fd[fd];
>         rcu_assign_pointer(fdt->fd[fd], NULL);
>         __put_unused_fd(files, fd);

You can't do __put_unused_fd() if the old pointer in fdt->fd[fd] was
NULL - because that means that the fd has been reserved by another
thread that is about to put a file pointer in there, and if you
release the fd here, that messes up the refcounting (or hits the
BUG_ON() in __fd_install()).

> out_unlock:
>         spin_unlock(&files->file_lock);
>
>         if (!file)
>                 return -EBADF;
>
>         *curfd = fd;
>         filp_close(file, files);
>         return 0;
> }
>
> int __close_range(struct files_struct *files, unsigned fd, unsigned max_fd)
> {
>         if (fd > max_fd)
>                 return -EINVAL;
>
>         while (fd <= max_fd) {

Note that with a pattern like this, you have to be careful about what
happens if someone gives you max_fd==0xffffffff - then this condition
is always true and the loop can not terminate this way.

>                 if (__close_next_fd(files, &fd, maxfd))
>                         break;

(obviously it can still terminate this way)

>                 cond_resched();
>                 fd++;
>         }
>
>         return 0;
> }

^ permalink raw reply

* Re: [PATCH v1 1/2] open: add close_range()
From: Christian Brauner @ 2019-05-23 14:37 UTC (permalink / raw)
  To: Jann Horn
  Cc: linux-ia64, linux-sh, kernel list, David Howells,
	open list:KERNEL SELFTEST FRAMEWORK, sparclinux, Shuah Khan,
	linux-arch, linux-s390, Miklos Szeredi, the arch/x86 maintainers,
	Linus Torvalds, linux-xtensa, Todd Kjos, Arnd Bergmann,
	linux-m68k, Al Viro, Oleg Nesterov, Thomas Gleixner,
	Dmitry V. Levin, linux-arm-kernel, Florian Weimer, linux-parisc,
	Linux API, linux-mips, linux-alpha, linux-fsdevel, linuxppc-dev
In-Reply-To: <CAG48ez0Uq2GQnQsuPkNrDdJVku_6GPeZ_5F_-5J3iy2CULr0_Q@mail.gmail.com>

On Thu, May 23, 2019 at 04:32:14PM +0200, Jann Horn wrote:
> On Thu, May 23, 2019 at 1:51 PM Christian Brauner <christian@brauner.io> wrote:
> [...]
> > I kept it dumb and was about to reply that your solution introduces more
> > code when it seemed we wanted to keep this very simple for now.
> > But then I saw that find_next_opened_fd() already exists as
> > find_next_fd(). So it's actually not bad compared to what I sent in v1.
> > So - with some small tweaks (need to test it and all now) - how do we
> > feel about?:
> [...]
> > static int __close_next_open_fd(struct files_struct *files, unsigned *curfd, unsigned maxfd)
> > {
> >         struct file *file = NULL;
> >         unsigned fd;
> >         struct fdtable *fdt;
> >
> >         spin_lock(&files->file_lock);
> >         fdt = files_fdtable(files);
> >         fd = find_next_fd(fdt, *curfd);
> 
> find_next_fd() finds free fds, not used ones.
> 
> >         if (fd >= fdt->max_fds || fd > maxfd)
> >                 goto out_unlock;
> >
> >         file = fdt->fd[fd];
> >         rcu_assign_pointer(fdt->fd[fd], NULL);
> >         __put_unused_fd(files, fd);
> 
> You can't do __put_unused_fd() if the old pointer in fdt->fd[fd] was
> NULL - because that means that the fd has been reserved by another
> thread that is about to put a file pointer in there, and if you
> release the fd here, that messes up the refcounting (or hits the
> BUG_ON() in __fd_install()).
> 
> > out_unlock:
> >         spin_unlock(&files->file_lock);
> >
> >         if (!file)
> >                 return -EBADF;
> >
> >         *curfd = fd;
> >         filp_close(file, files);
> >         return 0;
> > }
> >
> > int __close_range(struct files_struct *files, unsigned fd, unsigned max_fd)
> > {
> >         if (fd > max_fd)
> >                 return -EINVAL;
> >
> >         while (fd <= max_fd) {
> 
> Note that with a pattern like this, you have to be careful about what
> happens if someone gives you max_fd==0xffffffff - then this condition
> is always true and the loop can not terminate this way.
> 
> >                 if (__close_next_fd(files, &fd, maxfd))
> >                         break;
> 
> (obviously it can still terminate this way)

Yup, this was only a quick draft.
I think the dumb simple thing that I did before was the best way to do
it for now.
I first thought that the find_next_open_fd() function already exists but
when I went to write a POC for testing realized it doesn't anyway.

^ permalink raw reply

* Re: [PATCH] powerpc/powernv: Show checkstop reason for NPU2 HMIs
From: Frederic Barrat @ 2019-05-23 14:52 UTC (permalink / raw)
  To: Michael Ellerman, linuxppc-dev, ajd, arbab, aik; +Cc: clombard, groug
In-Reply-To: <87blztqo7u.fsf@concordia.ellerman.id.au>



Le 23/05/2019 à 15:45, Michael Ellerman a écrit :
> Frederic Barrat <fbarrat@linux.ibm.com> writes:
> 
>> If the kernel is notified of an HMI caused by the NPU2, it's currently
>> not being recognized and it logs the default message:
>>
>>      Unknown Malfunction Alert of type 3
>>
>> The NPU on Power 9 has 3 Fault Isolation Registers, so that's a lot of
>> possible causes, but we should at least log that it's an NPU problem
>> and report which FIR and which bit were raised if opal gave us the
>> information.
>>
>> Signed-off-by: Frederic Barrat <fbarrat@linux.ibm.com>
>> ---
>>
>> Could be merged independently from (the opal-api.h change is already
>> in the skiboot tree), but works better with, the matching skiboot
>> change:
>> http://patchwork.ozlabs.org/patch/1104076/
> 
> Well it *must* work with or without the skiboot change, because old/new
> kernels will run on old/new skiboots.
> 
> It looks like it will work fine, we just won't get any extra information
> in xstop_reason, right?


Yes, that's understood, and it was tested. On an old skiboot, we're now 
printing that we got an NPU checkstop (instead of the "unknown 
malfunction alert"), we just won't have the extra FIR info. That's what 
I meant by "works better with the skiboot patch".

   Fred



> cheers
> 
>> diff --git a/arch/powerpc/include/asm/opal-api.h b/arch/powerpc/include/asm/opal-api.h
>> index e1577cfa7186..2492fe248e1e 100644
>> --- a/arch/powerpc/include/asm/opal-api.h
>> +++ b/arch/powerpc/include/asm/opal-api.h
>> @@ -568,6 +568,7 @@ enum OpalHMI_XstopType {
>>   	CHECKSTOP_TYPE_UNKNOWN	=	0,
>>   	CHECKSTOP_TYPE_CORE	=	1,
>>   	CHECKSTOP_TYPE_NX	=	2,
>> +	CHECKSTOP_TYPE_NPU	=	3
>>   };
>>   
>>   enum OpalHMI_CoreXstopReason {
>> diff --git a/arch/powerpc/platforms/powernv/opal-hmi.c b/arch/powerpc/platforms/powernv/opal-hmi.c
>> index 586ec71a4e17..de12a240b477 100644
>> --- a/arch/powerpc/platforms/powernv/opal-hmi.c
>> +++ b/arch/powerpc/platforms/powernv/opal-hmi.c
>> @@ -149,6 +149,43 @@ static void print_nx_checkstop_reason(const char *level,
>>   					xstop_reason[i].description);
>>   }
>>   
>> +static void print_npu_checkstop_reason(const char *level,
>> +					struct OpalHMIEvent *hmi_evt)
>> +{
>> +	uint8_t reason, reason_count, i;
>> +
>> +	/*
>> +	 * We may not have a checkstop reason on some combination of
>> +	 * hardware and/or skiboot version
>> +	 */
>> +	if (!hmi_evt->u.xstop_error.xstop_reason) {
>> +		printk("%s	NPU checkstop on chip %x\n", level,
>> +			be32_to_cpu(hmi_evt->u.xstop_error.u.chip_id));
>> +		return;
>> +	}
>> +
>> +	/*
>> +	 * NPU2 has 3 FIRs. Reason encoded on a byte as:
>> +	 *   2 bits for the FIR number
>> +	 *   6 bits for the bit number
>> +	 * It may be possible to find several reasons.
>> +	 *
>> +	 * We don't display a specific message per FIR bit as there
>> +	 * are too many and most are meaningless without the workbook
>> +	 * and/or hw team help anyway.
>> +	 */
>> +	reason_count = sizeof(hmi_evt->u.xstop_error.xstop_reason) /
>> +		sizeof(reason);
>> +	for (i = 0; i < reason_count; i++) {
>> +		reason = (hmi_evt->u.xstop_error.xstop_reason >> (8 * i)) & 0xFF;
>> +		if (reason)
>> +			printk("%s	NPU checkstop on chip %x: FIR%d bit %d is set\n",
>> +				level,
>> +				be32_to_cpu(hmi_evt->u.xstop_error.u.chip_id),
>> +				reason >> 6, reason & 0x3F);
>> +	}
>> +}
>> +
>>   static void print_checkstop_reason(const char *level,
>>   					struct OpalHMIEvent *hmi_evt)
>>   {
>> @@ -160,6 +197,9 @@ static void print_checkstop_reason(const char *level,
>>   	case CHECKSTOP_TYPE_NX:
>>   		print_nx_checkstop_reason(level, hmi_evt);
>>   		break;
>> +	case CHECKSTOP_TYPE_NPU:
>> +		print_npu_checkstop_reason(level, hmi_evt);
>> +		break;
>>   	default:
>>   		printk("%s	Unknown Malfunction Alert of type %d\n",
>>   		       level, type);
>> -- 
>> 2.21.0
> 


^ permalink raw reply

* [PATCH v3 3/4] mm: introduce ARCH_HAS_PTE_DEVMAP
From: Robin Murphy @ 2019-05-23 15:03 UTC (permalink / raw)
  To: linux-mm
  Cc: Ira Weiny, anshuman.khandual, catalin.marinas, x86, will.deacon,
	linux-kernel, Oliver O'Halloran, akpm, linuxppc-dev,
	Dan Williams, linux-arm-kernel
In-Reply-To: <cover.1558547956.git.robin.murphy@arm.com>

ARCH_HAS_ZONE_DEVICE is somewhat meaningless in itself, and combined
with the long-out-of-date comment can lead to the impression than an
architecture may just enable it (since __add_pages() now "comprehends
device memory" for itself) and expect things to work.

In practice, however, ZONE_DEVICE users have little chance of
functioning correctly without __HAVE_ARCH_PTE_DEVMAP, so let's clean
that up the same way as ARCH_HAS_PTE_SPECIAL and make it the proper
dependency so the real situation is clearer.

Cc: x86@kernel.org
Cc: linuxppc-dev@lists.ozlabs.org
Cc: Michael Ellerman <mpe@ellerman.id.au>
Acked-by: Dan Williams <dan.j.williams@intel.com>
Reviewed-by: Ira Weiny <ira.weiny@intel.com>
Acked-by: Oliver O'Halloran <oohall@gmail.com>
Reviewed-by: Anshuman Khandual <anshuman.khandual@arm.com>
Signed-off-by: Robin Murphy <robin.murphy@arm.com>
---
 arch/powerpc/Kconfig                         | 2 +-
 arch/powerpc/include/asm/book3s/64/pgtable.h | 1 -
 arch/x86/Kconfig                             | 2 +-
 arch/x86/include/asm/pgtable.h               | 4 ++--
 arch/x86/include/asm/pgtable_types.h         | 1 -
 include/linux/mm.h                           | 4 ++--
 include/linux/pfn_t.h                        | 4 ++--
 mm/Kconfig                                   | 5 ++---
 mm/gup.c                                     | 2 +-
 9 files changed, 11 insertions(+), 14 deletions(-)

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 8c1c636308c8..1120ff8ac715 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -128,6 +128,7 @@ config PPC
 	select ARCH_HAS_MMIOWB			if PPC64
 	select ARCH_HAS_PHYS_TO_DMA
 	select ARCH_HAS_PMEM_API                if PPC64
+	select ARCH_HAS_PTE_DEVMAP		if PPC_BOOK3S_64
 	select ARCH_HAS_PTE_SPECIAL
 	select ARCH_HAS_MEMBARRIER_CALLBACKS
 	select ARCH_HAS_SCALED_CPUTIME		if VIRT_CPU_ACCOUNTING_NATIVE && PPC64
@@ -135,7 +136,6 @@ config PPC
 	select ARCH_HAS_TICK_BROADCAST		if GENERIC_CLOCKEVENTS_BROADCAST
 	select ARCH_HAS_UACCESS_FLUSHCACHE	if PPC64
 	select ARCH_HAS_UBSAN_SANITIZE_ALL
-	select ARCH_HAS_ZONE_DEVICE		if PPC_BOOK3S_64
 	select ARCH_HAVE_NMI_SAFE_CMPXCHG
 	select ARCH_KEEP_MEMBLOCK
 	select ARCH_MIGHT_HAVE_PC_PARPORT
diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h
index 7dede2e34b70..c6c2bdfb369b 100644
--- a/arch/powerpc/include/asm/book3s/64/pgtable.h
+++ b/arch/powerpc/include/asm/book3s/64/pgtable.h
@@ -90,7 +90,6 @@
 #define _PAGE_SOFT_DIRTY	_RPAGE_SW3 /* software: software dirty tracking */
 #define _PAGE_SPECIAL		_RPAGE_SW2 /* software: special page */
 #define _PAGE_DEVMAP		_RPAGE_SW1 /* software: ZONE_DEVICE page */
-#define __HAVE_ARCH_PTE_DEVMAP
 
 /*
  * Drivers request for cache inhibited pte mapping using _PAGE_NO_CACHE
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 2bbbd4d1ba31..57c4e80bd368 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -69,6 +69,7 @@ config X86
 	select ARCH_HAS_KCOV			if X86_64
 	select ARCH_HAS_MEMBARRIER_SYNC_CORE
 	select ARCH_HAS_PMEM_API		if X86_64
+	select ARCH_HAS_PTE_DEVMAP		if X86_64
 	select ARCH_HAS_PTE_SPECIAL
 	select ARCH_HAS_REFCOUNT
 	select ARCH_HAS_UACCESS_FLUSHCACHE	if X86_64
@@ -79,7 +80,6 @@ config X86
 	select ARCH_HAS_STRICT_MODULE_RWX
 	select ARCH_HAS_SYNC_CORE_BEFORE_USERMODE
 	select ARCH_HAS_UBSAN_SANITIZE_ALL
-	select ARCH_HAS_ZONE_DEVICE		if X86_64
 	select ARCH_HAVE_NMI_SAFE_CMPXCHG
 	select ARCH_MIGHT_HAVE_ACPI_PDC		if ACPI
 	select ARCH_MIGHT_HAVE_PC_PARPORT
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 5e0509b41986..0bc530c4eb13 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -271,7 +271,7 @@ static inline int has_transparent_hugepage(void)
 	return boot_cpu_has(X86_FEATURE_PSE);
 }
 
-#ifdef __HAVE_ARCH_PTE_DEVMAP
+#ifdef CONFIG_ARCH_HAS_PTE_DEVMAP
 static inline int pmd_devmap(pmd_t pmd)
 {
 	return !!(pmd_val(pmd) & _PAGE_DEVMAP);
@@ -732,7 +732,7 @@ static inline int pte_present(pte_t a)
 	return pte_flags(a) & (_PAGE_PRESENT | _PAGE_PROTNONE);
 }
 
-#ifdef __HAVE_ARCH_PTE_DEVMAP
+#ifdef CONFIG_ARCH_HAS_PTE_DEVMAP
 static inline int pte_devmap(pte_t a)
 {
 	return (pte_flags(a) & _PAGE_DEVMAP) == _PAGE_DEVMAP;
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
index d6ff0bbdb394..b5e49e6bac63 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -103,7 +103,6 @@
 #if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
 #define _PAGE_NX	(_AT(pteval_t, 1) << _PAGE_BIT_NX)
 #define _PAGE_DEVMAP	(_AT(u64, 1) << _PAGE_BIT_DEVMAP)
-#define __HAVE_ARCH_PTE_DEVMAP
 #else
 #define _PAGE_NX	(_AT(pteval_t, 0))
 #define _PAGE_DEVMAP	(_AT(pteval_t, 0))
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 9cd613a7f67b..f61c016de005 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -536,7 +536,7 @@ static inline void vma_set_anonymous(struct vm_area_struct *vma)
 struct mmu_gather;
 struct inode;
 
-#if !defined(__HAVE_ARCH_PTE_DEVMAP) || !defined(CONFIG_TRANSPARENT_HUGEPAGE)
+#if !defined(CONFIG_ARCH_HAS_PTE_DEVMAP) || !defined(CONFIG_TRANSPARENT_HUGEPAGE)
 static inline int pmd_devmap(pmd_t pmd)
 {
 	return 0;
@@ -1754,7 +1754,7 @@ static inline void sync_mm_rss(struct mm_struct *mm)
 }
 #endif
 
-#ifndef __HAVE_ARCH_PTE_DEVMAP
+#ifndef CONFIG_ARCH_HAS_PTE_DEVMAP
 static inline int pte_devmap(pte_t pte)
 {
 	return 0;
diff --git a/include/linux/pfn_t.h b/include/linux/pfn_t.h
index 7bb77850c65a..de8bc66b10a4 100644
--- a/include/linux/pfn_t.h
+++ b/include/linux/pfn_t.h
@@ -104,7 +104,7 @@ static inline pud_t pfn_t_pud(pfn_t pfn, pgprot_t pgprot)
 #endif
 #endif
 
-#ifdef __HAVE_ARCH_PTE_DEVMAP
+#ifdef CONFIG_ARCH_HAS_PTE_DEVMAP
 static inline bool pfn_t_devmap(pfn_t pfn)
 {
 	const u64 flags = PFN_DEV|PFN_MAP;
@@ -122,7 +122,7 @@ pmd_t pmd_mkdevmap(pmd_t pmd);
 	defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)
 pud_t pud_mkdevmap(pud_t pud);
 #endif
-#endif /* __HAVE_ARCH_PTE_DEVMAP */
+#endif /* CONFIG_ARCH_HAS_PTE_DEVMAP */
 
 #ifdef CONFIG_ARCH_HAS_PTE_SPECIAL
 static inline bool pfn_t_special(pfn_t pfn)
diff --git a/mm/Kconfig b/mm/Kconfig
index ee8d1f311858..3aeef0442d03 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -647,8 +647,7 @@ config IDLE_PAGE_TRACKING
 	  See Documentation/admin-guide/mm/idle_page_tracking.rst for
 	  more details.
 
-# arch_add_memory() comprehends device memory
-config ARCH_HAS_ZONE_DEVICE
+config ARCH_HAS_PTE_DEVMAP
 	bool
 
 config ZONE_DEVICE
@@ -656,7 +655,7 @@ config ZONE_DEVICE
 	depends on MEMORY_HOTPLUG
 	depends on MEMORY_HOTREMOVE
 	depends on SPARSEMEM_VMEMMAP
-	depends on ARCH_HAS_ZONE_DEVICE
+	depends on ARCH_HAS_PTE_DEVMAP
 	select XARRAY_MULTI
 
 	help
diff --git a/mm/gup.c b/mm/gup.c
index 2c08248d4fa2..777010ca3bf0 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -1799,7 +1799,7 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
 }
 #endif /* CONFIG_ARCH_HAS_PTE_SPECIAL */
 
-#if defined(__HAVE_ARCH_PTE_DEVMAP) && defined(CONFIG_TRANSPARENT_HUGEPAGE)
+#if defined(CONFIG_ARCH_HAS_PTE_DEVMAP) && defined(CONFIG_TRANSPARENT_HUGEPAGE)
 static int __gup_device_huge(unsigned long pfn, unsigned long addr,
 		unsigned long end, struct page **pages, int *nr)
 {
-- 
2.21.0.dirty


^ permalink raw reply related

* [PATCH v2 0/2] close_range()
From: Christian Brauner @ 2019-05-23 15:47 UTC (permalink / raw)
  To: viro, linux-kernel, linux-fsdevel, linux-api, torvalds, fweimer
  Cc: linux-ia64, linux-sh, ldv, dhowells, linux-kselftest, sparclinux,
	shuah, linux-arch, linux-s390, miklos, x86, Christian Brauner,
	linux-mips, linux-xtensa, tkjos, arnd, jannh, linux-m68k, tglx,
	linux-arm-kernel, linux-parisc, oleg, linux-alpha, linuxppc-dev

Hey,

This is v2 of this patchset.

In accordance with some comments There's a cond_resched() added to the
close loop similar to what is done for close_files().
A common helper pick_file() for __close_fd() and __close_range() has
been split out. This allows to only make a cond_resched() call when
filp_close() has been called similar to what is done in close_files().
Maybe that's not worth it. Jann mentioned that cond_resched() looks
rather cheap.
So it maybe that we could simply do:

while (fd <= max_fd) {
       __close(files, fd++);
       cond_resched();
}

I also added a missing test for close_range(fd, fd, 0).

Thanks!
Christian

Christian Brauner (2):
  open: add close_range()
  tests: add close_range() tests

 arch/alpha/kernel/syscalls/syscall.tbl        |   1 +
 arch/arm/tools/syscall.tbl                    |   1 +
 arch/arm64/include/asm/unistd32.h             |   2 +
 arch/ia64/kernel/syscalls/syscall.tbl         |   1 +
 arch/m68k/kernel/syscalls/syscall.tbl         |   1 +
 arch/microblaze/kernel/syscalls/syscall.tbl   |   1 +
 arch/mips/kernel/syscalls/syscall_n32.tbl     |   1 +
 arch/mips/kernel/syscalls/syscall_n64.tbl     |   1 +
 arch/mips/kernel/syscalls/syscall_o32.tbl     |   1 +
 arch/parisc/kernel/syscalls/syscall.tbl       |   1 +
 arch/powerpc/kernel/syscalls/syscall.tbl      |   1 +
 arch/s390/kernel/syscalls/syscall.tbl         |   1 +
 arch/sh/kernel/syscalls/syscall.tbl           |   1 +
 arch/sparc/kernel/syscalls/syscall.tbl        |   1 +
 arch/x86/entry/syscalls/syscall_32.tbl        |   1 +
 arch/x86/entry/syscalls/syscall_64.tbl        |   1 +
 arch/xtensa/kernel/syscalls/syscall.tbl       |   1 +
 fs/file.c                                     |  62 +++++++-
 fs/open.c                                     |  20 +++
 include/linux/fdtable.h                       |   2 +
 include/linux/syscalls.h                      |   2 +
 include/uapi/asm-generic/unistd.h             |   4 +-
 tools/testing/selftests/Makefile              |   1 +
 tools/testing/selftests/core/.gitignore       |   1 +
 tools/testing/selftests/core/Makefile         |   6 +
 .../testing/selftests/core/close_range_test.c | 142 ++++++++++++++++++
 26 files changed, 249 insertions(+), 9 deletions(-)
 create mode 100644 tools/testing/selftests/core/.gitignore
 create mode 100644 tools/testing/selftests/core/Makefile
 create mode 100644 tools/testing/selftests/core/close_range_test.c

-- 
2.21.0


^ permalink raw reply

* [PATCH v2 1/2] open: add close_range()
From: Christian Brauner @ 2019-05-23 15:47 UTC (permalink / raw)
  To: viro, linux-kernel, linux-fsdevel, linux-api, torvalds, fweimer
  Cc: linux-ia64, linux-sh, ldv, dhowells, linux-kselftest, sparclinux,
	shuah, linux-arch, linux-s390, miklos, x86, Christian Brauner,
	linux-mips, linux-xtensa, tkjos, arnd, jannh, linux-m68k, tglx,
	linux-arm-kernel, linux-parisc, oleg, linux-alpha, linuxppc-dev
In-Reply-To: <20190523154747.15162-1-christian@brauner.io>

This adds the close_range() syscall. It allows to efficiently close a range
of file descriptors up to all file descriptors of a calling task.

The syscall came up in a recent discussion around the new mount API and
making new file descriptor types cloexec by default. During this
discussion, Al suggested the close_range() syscall (cf. [1]). Note, a
syscall in this manner has been requested by various people over time.

First, it helps to close all file descriptors of an exec()ing task. This
can be done safely via (quoting Al's example from [1] verbatim):

        /* that exec is sensitive */
        unshare(CLONE_FILES);
        /* we don't want anything past stderr here */
        close_range(3, ~0U);
        execve(....);

The code snippet above is one way of working around the problem that file
descriptors are not cloexec by default. This is aggravated by the fact that
we can't just switch them over without massively regressing userspace. For
a whole class of programs having an in-kernel method of closing all file
descriptors is very helpful (e.g. demons, service managers, programming
language standard libraries, container managers etc.).
(Please note, unshare(CLONE_FILES) should only be needed if the calling
 task is multi-threaded and shares the file descriptor table with another
 thread in which case two threads could race with one thread allocating
 file descriptors and the other one closing them via close_range(). For the
 general case close_range() before the execve() is sufficient.)

Second, it allows userspace to avoid implementing closing all file
descriptors by parsing through /proc/<pid>/fd/* and calling close() on each
file descriptor. From looking at various large(ish) userspace code bases
this or similar patterns are very common in:
- service managers (cf. [4])
- libcs (cf. [6])
- container runtimes (cf. [5])
- programming language runtimes/standard libraries
  - Python (cf. [2])
  - Rust (cf. [7], [8])
As Dmitry pointed out there's even a long-standing glibc bug about missing
kernel support for this task (cf. [3]).
In addition, the syscall will also work for tasks that do not have procfs
mounted and on kernels that do not have procfs support compiled in. In such
situations the only way to make sure that all file descriptors are closed
is to call close() on each file descriptor up to UINT_MAX or RLIMIT_NOFILE,
OPEN_MAX trickery (cf. comment [8] on Rust).

The performance is striking. For good measure, comparing the following
simple close_all_fds() userspace implementation that is essentially just
glibc's version in [6]:

static int close_all_fds(void)
{
        int dir_fd;
        DIR *dir;
        struct dirent *direntp;

        dir = opendir("/proc/self/fd");
        if (!dir)
                return -1;
        dir_fd = dirfd(dir);
        while ((direntp = readdir(dir))) {
                int fd;
                if (strcmp(direntp->d_name, ".") == 0)
                        continue;
                if (strcmp(direntp->d_name, "..") == 0)
                        continue;
                fd = atoi(direntp->d_name);
                if (fd == dir_fd || fd == 0 || fd == 1 || fd == 2)
                        continue;
                close(fd);
        }
        closedir(dir);
        return 0;
}

to close_range() yields:
1. closing 4 open files:
   - close_all_fds(): ~280 us
   - close_range():    ~24 us

2. closing 1000 open files:
   - close_all_fds(): ~5000 us
   - close_range():   ~800 us

close_range() is designed to allow for some flexibility. Specifically, it
does not simply always close all open file descriptors of a task. Instead,
callers can specify an upper bound.
This is e.g. useful for scenarios where specific file descriptors are
created with well-known numbers that are supposed to be excluded from
getting closed.
For extra paranoia close_range() comes with a flags argument. This can e.g.
be used to implement extension. Once can imagine userspace wanting to stop
at the first error instead of ignoring errors under certain circumstances.
There might be other valid ideas in the future. In any case, a flag
argument doesn't hurt and keeps us on the safe side.

From an implementation side this is kept rather dumb. It saw some input
from David and Jann but all nonsense is obviously my own!
- Errors to close file descriptors are currently ignored. (Could be changed
  by setting a flag in the future if needed.)
- __close_range() is a rather simplistic wrapper around __close_fd().
  My reasoning behind this is based on the nature of how __close_fd() needs
  to release an fd. But maybe I misunderstood specifics:
  We take the files_lock and rcu-dereference the fdtable of the calling
  task, we find the entry in the fdtable, get the file and need to release
  files_lock before calling filp_close().
  In the meantime the fdtable might have been altered so we can't just
  retake the spinlock and keep the old rcu-reference of the fdtable
  around. Instead we need to grab a fresh reference to the fdtable.
  If my reasoning is correct then there's really no point in fancyfying
  __close_range(): We just need to rcu-dereference the fdtable of the
  calling task once to cap the max_fd value correctly and then go on
  calling __close_fd() in a loop.

/* References */
[1]: https://lore.kernel.org/lkml/20190516165021.GD17978@ZenIV.linux.org.uk/
[2]: https://github.com/python/cpython/blob/9e4f2f3a6b8ee995c365e86d976937c141d867f8/Modules/_posixsubprocess.c#L220
[3]: https://sourceware.org/bugzilla/show_bug.cgi?id=10353#c7
[4]: https://github.com/systemd/systemd/blob/5238e9575906297608ff802a27e2ff9effa3b338/src/basic/fd-util.c#L217
[5]: https://github.com/lxc/lxc/blob/ddf4b77e11a4d08f09b7b9cd13e593f8c047edc5/src/lxc/start.c#L236
[6]: https://sourceware.org/git/?p=glibc.git;a=blob;f=sysdeps/unix/sysv/linux/grantpt.c;h=2030e07fa6e652aac32c775b8c6e005844c3c4eb;hb=HEAD#l17
     Note that this is an internal implementation that is not exported.
     Currently, libc seems to not provide an exported version of this
     because of missing kernel support to do this.
[7]: https://github.com/rust-lang/rust/issues/12148
[8]: https://github.com/rust-lang/rust/blob/5f47c0613ed4eb46fca3633c1297364c09e5e451/src/libstd/sys/unix/process2.rs#L303-L308
     Rust's solution is slightly different but is equally unperformant.
     Rust calls getdtablesize() which is a glibc library function that
     simply returns the current RLIMIT_NOFILE or OPEN_MAX values. Rust then
     goes on to call close() on each fd. That's obviously overkill for most
     tasks. Rarely, tasks - especially non-demons - hit RLIMIT_NOFILE or
     OPEN_MAX.
     Let's be nice and assume an unprivileged user with RLIMIT_NOFILE set
     to 1024. Even in this case, there's a very high chance that in the
     common case Rust is calling the close() syscall 1021 times pointlessly
     if the task just has 0, 1, and 2 open.

Suggested-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Christian Brauner <christian@brauner.io>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Jann Horn <jannh@google.com>
Cc: David Howells <dhowells@redhat.com>
Cc: Dmitry V. Levin <ldv@altlinux.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Florian Weimer <fweimer@redhat.com>
Cc: linux-api@vger.kernel.org
---
v1:
- Linus Torvalds <torvalds@linux-foundation.org>:
  - add cond_resched() to yield cpu when closing a lot of file descriptors
- Al Viro <viro@zeniv.linux.org.uk>:
  - add cond_resched() to yield cpu when closing a lot of file descriptors
v2: unchanged
---
 arch/alpha/kernel/syscalls/syscall.tbl      |  1 +
 arch/arm/tools/syscall.tbl                  |  1 +
 arch/arm64/include/asm/unistd32.h           |  2 +
 arch/ia64/kernel/syscalls/syscall.tbl       |  1 +
 arch/m68k/kernel/syscalls/syscall.tbl       |  1 +
 arch/microblaze/kernel/syscalls/syscall.tbl |  1 +
 arch/mips/kernel/syscalls/syscall_n32.tbl   |  1 +
 arch/mips/kernel/syscalls/syscall_n64.tbl   |  1 +
 arch/mips/kernel/syscalls/syscall_o32.tbl   |  1 +
 arch/parisc/kernel/syscalls/syscall.tbl     |  1 +
 arch/powerpc/kernel/syscalls/syscall.tbl    |  1 +
 arch/s390/kernel/syscalls/syscall.tbl       |  1 +
 arch/sh/kernel/syscalls/syscall.tbl         |  1 +
 arch/sparc/kernel/syscalls/syscall.tbl      |  1 +
 arch/x86/entry/syscalls/syscall_32.tbl      |  1 +
 arch/x86/entry/syscalls/syscall_64.tbl      |  1 +
 arch/xtensa/kernel/syscalls/syscall.tbl     |  1 +
 fs/file.c                                   | 62 ++++++++++++++++++---
 fs/open.c                                   | 20 +++++++
 include/linux/fdtable.h                     |  2 +
 include/linux/syscalls.h                    |  2 +
 include/uapi/asm-generic/unistd.h           |  4 +-
 22 files changed, 99 insertions(+), 9 deletions(-)

diff --git a/arch/alpha/kernel/syscalls/syscall.tbl b/arch/alpha/kernel/syscalls/syscall.tbl
index 9e7704e44f6d..b55d93af8096 100644
--- a/arch/alpha/kernel/syscalls/syscall.tbl
+++ b/arch/alpha/kernel/syscalls/syscall.tbl
@@ -473,3 +473,4 @@
 541	common	fsconfig			sys_fsconfig
 542	common	fsmount				sys_fsmount
 543	common	fspick				sys_fspick
+545	common	close_range			sys_close_range
diff --git a/arch/arm/tools/syscall.tbl b/arch/arm/tools/syscall.tbl
index aaf479a9e92d..0125c97c75dd 100644
--- a/arch/arm/tools/syscall.tbl
+++ b/arch/arm/tools/syscall.tbl
@@ -447,3 +447,4 @@
 431	common	fsconfig			sys_fsconfig
 432	common	fsmount				sys_fsmount
 433	common	fspick				sys_fspick
+435	common	close_range			sys_close_range
diff --git a/arch/arm64/include/asm/unistd32.h b/arch/arm64/include/asm/unistd32.h
index c39e90600bb3..9a3270d29b42 100644
--- a/arch/arm64/include/asm/unistd32.h
+++ b/arch/arm64/include/asm/unistd32.h
@@ -886,6 +886,8 @@ __SYSCALL(__NR_fsconfig, sys_fsconfig)
 __SYSCALL(__NR_fsmount, sys_fsmount)
 #define __NR_fspick 433
 __SYSCALL(__NR_fspick, sys_fspick)
+#define __NR_close_range 435
+__SYSCALL(__NR_close_range, sys_close_range)
 
 /*
  * Please add new compat syscalls above this comment and update
diff --git a/arch/ia64/kernel/syscalls/syscall.tbl b/arch/ia64/kernel/syscalls/syscall.tbl
index e01df3f2f80d..1a90b464e96f 100644
--- a/arch/ia64/kernel/syscalls/syscall.tbl
+++ b/arch/ia64/kernel/syscalls/syscall.tbl
@@ -354,3 +354,4 @@
 431	common	fsconfig			sys_fsconfig
 432	common	fsmount				sys_fsmount
 433	common	fspick				sys_fspick
+435	common	close_range			sys_close_range
diff --git a/arch/m68k/kernel/syscalls/syscall.tbl b/arch/m68k/kernel/syscalls/syscall.tbl
index 7e3d0734b2f3..2dee2050f9ef 100644
--- a/arch/m68k/kernel/syscalls/syscall.tbl
+++ b/arch/m68k/kernel/syscalls/syscall.tbl
@@ -433,3 +433,4 @@
 431	common	fsconfig			sys_fsconfig
 432	common	fsmount				sys_fsmount
 433	common	fspick				sys_fspick
+435	common	close_range			sys_close_range
diff --git a/arch/microblaze/kernel/syscalls/syscall.tbl b/arch/microblaze/kernel/syscalls/syscall.tbl
index 26339e417695..923ef69e5a76 100644
--- a/arch/microblaze/kernel/syscalls/syscall.tbl
+++ b/arch/microblaze/kernel/syscalls/syscall.tbl
@@ -439,3 +439,4 @@
 431	common	fsconfig			sys_fsconfig
 432	common	fsmount				sys_fsmount
 433	common	fspick				sys_fspick
+435	common	close_range			sys_close_range
diff --git a/arch/mips/kernel/syscalls/syscall_n32.tbl b/arch/mips/kernel/syscalls/syscall_n32.tbl
index 0e2dd68ade57..967ed9de51cd 100644
--- a/arch/mips/kernel/syscalls/syscall_n32.tbl
+++ b/arch/mips/kernel/syscalls/syscall_n32.tbl
@@ -372,3 +372,4 @@
 431	n32	fsconfig			sys_fsconfig
 432	n32	fsmount				sys_fsmount
 433	n32	fspick				sys_fspick
+435	n32	close_range			sys_close_range
diff --git a/arch/mips/kernel/syscalls/syscall_n64.tbl b/arch/mips/kernel/syscalls/syscall_n64.tbl
index 5eebfa0d155c..71de731102b1 100644
--- a/arch/mips/kernel/syscalls/syscall_n64.tbl
+++ b/arch/mips/kernel/syscalls/syscall_n64.tbl
@@ -348,3 +348,4 @@
 431	n64	fsconfig			sys_fsconfig
 432	n64	fsmount				sys_fsmount
 433	n64	fspick				sys_fspick
+435	n64	close_range			sys_close_range
diff --git a/arch/mips/kernel/syscalls/syscall_o32.tbl b/arch/mips/kernel/syscalls/syscall_o32.tbl
index 3cc1374e02d0..5a325ab29f88 100644
--- a/arch/mips/kernel/syscalls/syscall_o32.tbl
+++ b/arch/mips/kernel/syscalls/syscall_o32.tbl
@@ -421,3 +421,4 @@
 431	o32	fsconfig			sys_fsconfig
 432	o32	fsmount				sys_fsmount
 433	o32	fspick				sys_fspick
+435	o32	close_range			sys_close_range
diff --git a/arch/parisc/kernel/syscalls/syscall.tbl b/arch/parisc/kernel/syscalls/syscall.tbl
index c9e377d59232..dcc0a0879139 100644
--- a/arch/parisc/kernel/syscalls/syscall.tbl
+++ b/arch/parisc/kernel/syscalls/syscall.tbl
@@ -430,3 +430,4 @@
 431	common	fsconfig			sys_fsconfig
 432	common	fsmount				sys_fsmount
 433	common	fspick				sys_fspick
+435	common	close_range			sys_close_range
diff --git a/arch/powerpc/kernel/syscalls/syscall.tbl b/arch/powerpc/kernel/syscalls/syscall.tbl
index 103655d84b4b..ba2c1f078cbd 100644
--- a/arch/powerpc/kernel/syscalls/syscall.tbl
+++ b/arch/powerpc/kernel/syscalls/syscall.tbl
@@ -515,3 +515,4 @@
 431	common	fsconfig			sys_fsconfig
 432	common	fsmount				sys_fsmount
 433	common	fspick				sys_fspick
+435	common	close_range			sys_close_range
diff --git a/arch/s390/kernel/syscalls/syscall.tbl b/arch/s390/kernel/syscalls/syscall.tbl
index e822b2964a83..d7c9043d2902 100644
--- a/arch/s390/kernel/syscalls/syscall.tbl
+++ b/arch/s390/kernel/syscalls/syscall.tbl
@@ -436,3 +436,4 @@
 431  common	fsconfig		sys_fsconfig			sys_fsconfig
 432  common	fsmount			sys_fsmount			sys_fsmount
 433  common	fspick			sys_fspick			sys_fspick
+435  common	close_range		sys_close_range			sys_close_range
diff --git a/arch/sh/kernel/syscalls/syscall.tbl b/arch/sh/kernel/syscalls/syscall.tbl
index 016a727d4357..9b5e6bf0ce32 100644
--- a/arch/sh/kernel/syscalls/syscall.tbl
+++ b/arch/sh/kernel/syscalls/syscall.tbl
@@ -436,3 +436,4 @@
 431	common	fsconfig			sys_fsconfig
 432	common	fsmount				sys_fsmount
 433	common	fspick				sys_fspick
+435	common	close_range			sys_close_range
diff --git a/arch/sparc/kernel/syscalls/syscall.tbl b/arch/sparc/kernel/syscalls/syscall.tbl
index e047480b1605..8c674a1e0072 100644
--- a/arch/sparc/kernel/syscalls/syscall.tbl
+++ b/arch/sparc/kernel/syscalls/syscall.tbl
@@ -479,3 +479,4 @@
 431	common	fsconfig			sys_fsconfig
 432	common	fsmount				sys_fsmount
 433	common	fspick				sys_fspick
+435	common	close_range			sys_close_range
diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
index ad968b7bac72..7f7a89a96707 100644
--- a/arch/x86/entry/syscalls/syscall_32.tbl
+++ b/arch/x86/entry/syscalls/syscall_32.tbl
@@ -438,3 +438,4 @@
 431	i386	fsconfig		sys_fsconfig			__ia32_sys_fsconfig
 432	i386	fsmount			sys_fsmount			__ia32_sys_fsmount
 433	i386	fspick			sys_fspick			__ia32_sys_fspick
+435	i386	close_range		sys_close_range			__ia32_sys_close_range
diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
index b4e6f9e6204a..0f7d47ae921c 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -355,6 +355,7 @@
 431	common	fsconfig		__x64_sys_fsconfig
 432	common	fsmount			__x64_sys_fsmount
 433	common	fspick			__x64_sys_fspick
+435	common	close_range		__x64_sys_close_range
 
 #
 # x32-specific system call numbers start at 512 to avoid cache impact
diff --git a/arch/xtensa/kernel/syscalls/syscall.tbl b/arch/xtensa/kernel/syscalls/syscall.tbl
index 5fa0ee1c8e00..b489532265d0 100644
--- a/arch/xtensa/kernel/syscalls/syscall.tbl
+++ b/arch/xtensa/kernel/syscalls/syscall.tbl
@@ -404,3 +404,4 @@
 431	common	fsconfig			sys_fsconfig
 432	common	fsmount				sys_fsmount
 433	common	fspick				sys_fspick
+435	common	close_range			sys_close_range
diff --git a/fs/file.c b/fs/file.c
index 3da91a112bab..53430d68bbd7 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -10,6 +10,7 @@
 #include <linux/syscalls.h>
 #include <linux/export.h>
 #include <linux/fs.h>
+#include <linux/kernel.h>
 #include <linux/mm.h>
 #include <linux/sched/signal.h>
 #include <linux/slab.h>
@@ -615,12 +616,9 @@ void fd_install(unsigned int fd, struct file *file)
 
 EXPORT_SYMBOL(fd_install);
 
-/*
- * The same warnings as for __alloc_fd()/__fd_install() apply here...
- */
-int __close_fd(struct files_struct *files, unsigned fd)
+static struct file *pick_file(struct files_struct *files, unsigned fd)
 {
-	struct file *file;
+	struct file *file = NULL;
 	struct fdtable *fdt;
 
 	spin_lock(&files->file_lock);
@@ -632,15 +630,63 @@ int __close_fd(struct files_struct *files, unsigned fd)
 		goto out_unlock;
 	rcu_assign_pointer(fdt->fd[fd], NULL);
 	__put_unused_fd(files, fd);
-	spin_unlock(&files->file_lock);
-	return filp_close(file, files);
 
 out_unlock:
 	spin_unlock(&files->file_lock);
-	return -EBADF;
+	return file;
+}
+
+/*
+ * The same warnings as for __alloc_fd()/__fd_install() apply here...
+ */
+int __close_fd(struct files_struct *files, unsigned fd)
+{
+	struct file *file;
+
+	file = pick_file(files, fd);
+	if (!file)
+		return -EBADF;
+
+	return filp_close(file, files);
 }
 EXPORT_SYMBOL(__close_fd); /* for ksys_close() */
 
+/**
+ * __close_range() - Close all file descriptors in a given range.
+ *
+ * @fd:     starting file descriptor to close
+ * @max_fd: last file descriptor to close
+ *
+ * This closes a range of file descriptors. All file descriptors
+ * from @fd up to and including @max_fd are closed.
+ */
+int __close_range(struct files_struct *files, unsigned fd, unsigned max_fd)
+{
+	unsigned int cur_max;
+
+	if (fd > max_fd)
+		return -EINVAL;
+
+	rcu_read_lock();
+	cur_max = files_fdtable(files)->max_fds;
+	rcu_read_unlock();
+
+	/* cap to last valid index into fdtable */
+	max_fd = max(max_fd, (cur_max - 1));
+	while (fd <= max_fd) {
+		struct file *file;
+
+		file = pick_file(files, fd++);
+		if (!file)
+			continue;
+
+		filp_close(file, files);
+		cond_resched();
+	}
+
+	return 0;
+}
+
 /*
  * variant of __close_fd that gets a ref on the file for later fput
  */
diff --git a/fs/open.c b/fs/open.c
index 9c7d724a6f67..c7baaee7aa47 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -1174,6 +1174,26 @@ SYSCALL_DEFINE1(close, unsigned int, fd)
 	return retval;
 }
 
+/**
+ * close_range() - Close all file descriptors in a given range.
+ *
+ * @fd:     starting file descriptor to close
+ * @max_fd: last file descriptor to close
+ * @flags:  reserved for future extensions
+ *
+ * This closes a range of file descriptors. All file descriptors
+ * from @fd up to and including @max_fd are closed.
+ * Currently, errors to close a given file descriptor are ignored.
+ */
+SYSCALL_DEFINE3(close_range, unsigned int, fd, unsigned int, max_fd,
+		unsigned int, flags)
+{
+	if (flags)
+		return -EINVAL;
+
+	return __close_range(current->files, fd, max_fd);
+}
+
 /*
  * This routine simulates a hangup on the tty, to arrange that users
  * are given clean terminals at login time.
diff --git a/include/linux/fdtable.h b/include/linux/fdtable.h
index f07c55ea0c22..fcd07181a365 100644
--- a/include/linux/fdtable.h
+++ b/include/linux/fdtable.h
@@ -121,6 +121,8 @@ extern void __fd_install(struct files_struct *files,
 		      unsigned int fd, struct file *file);
 extern int __close_fd(struct files_struct *files,
 		      unsigned int fd);
+extern int __close_range(struct files_struct *files, unsigned int fd,
+			 unsigned int max_fd);
 extern int __close_fd_get_file(unsigned int fd, struct file **res);
 
 extern struct kmem_cache *files_cachep;
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index e2870fe1be5b..c0189e223255 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -441,6 +441,8 @@ asmlinkage long sys_fchown(unsigned int fd, uid_t user, gid_t group);
 asmlinkage long sys_openat(int dfd, const char __user *filename, int flags,
 			   umode_t mode);
 asmlinkage long sys_close(unsigned int fd);
+asmlinkage long sys_close_range(unsigned int fd, unsigned int max_fd,
+				unsigned int flags);
 asmlinkage long sys_vhangup(void);
 
 /* fs/pipe.c */
diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h
index a87904daf103..3f36c8745d24 100644
--- a/include/uapi/asm-generic/unistd.h
+++ b/include/uapi/asm-generic/unistd.h
@@ -844,9 +844,11 @@ __SYSCALL(__NR_fsconfig, sys_fsconfig)
 __SYSCALL(__NR_fsmount, sys_fsmount)
 #define __NR_fspick 433
 __SYSCALL(__NR_fspick, sys_fspick)
+#define __NR_close_range 435
+__SYSCALL(__NR_close_range, sys_close_range)
 
 #undef __NR_syscalls
-#define __NR_syscalls 434
+#define __NR_syscalls 436
 
 /*
  * 32 bit systems traditionally used different
-- 
2.21.0


^ permalink raw reply related

* [PATCH v2 2/2] tests: add close_range() tests
From: Christian Brauner @ 2019-05-23 15:47 UTC (permalink / raw)
  To: viro, linux-kernel, linux-fsdevel, linux-api, torvalds, fweimer
  Cc: linux-ia64, linux-sh, ldv, dhowells, linux-kselftest, sparclinux,
	shuah, linux-arch, linux-s390, miklos, x86, Christian Brauner,
	linux-mips, linux-xtensa, tkjos, arnd, jannh, linux-m68k, tglx,
	linux-arm-kernel, linux-parisc, oleg, linux-alpha, linuxppc-dev
In-Reply-To: <20190523154747.15162-1-christian@brauner.io>

This adds basic tests for the new close_range() syscall.
- test that no invalid flags can be passed
- test that a range of file descriptors is correctly closed
- test that a range of file descriptors is correctly closed if there there
  are already closed file descriptors in the range
- test that max_fd is correctly capped to the current fdtable maximum

Signed-off-by: Christian Brauner <christian@brauner.io>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Jann Horn <jannh@google.com>
Cc: David Howells <dhowells@redhat.com>
Cc: Dmitry V. Levin <ldv@altlinux.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Florian Weimer <fweimer@redhat.com>
Cc: linux-api@vger.kernel.org
---
v1: unchanged
v2:
- Christian Brauner <christian@brauner.io>:
  - verify that close_range() correctly closes a single file descriptor
---
 tools/testing/selftests/Makefile              |   1 +
 tools/testing/selftests/core/.gitignore       |   1 +
 tools/testing/selftests/core/Makefile         |   6 +
 .../testing/selftests/core/close_range_test.c | 142 ++++++++++++++++++
 4 files changed, 150 insertions(+)
 create mode 100644 tools/testing/selftests/core/.gitignore
 create mode 100644 tools/testing/selftests/core/Makefile
 create mode 100644 tools/testing/selftests/core/close_range_test.c

diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile
index 9781ca79794a..06e57fabbff9 100644
--- a/tools/testing/selftests/Makefile
+++ b/tools/testing/selftests/Makefile
@@ -4,6 +4,7 @@ TARGETS += bpf
 TARGETS += breakpoints
 TARGETS += capabilities
 TARGETS += cgroup
+TARGETS += core
 TARGETS += cpufreq
 TARGETS += cpu-hotplug
 TARGETS += drivers/dma-buf
diff --git a/tools/testing/selftests/core/.gitignore b/tools/testing/selftests/core/.gitignore
new file mode 100644
index 000000000000..6e6712ce5817
--- /dev/null
+++ b/tools/testing/selftests/core/.gitignore
@@ -0,0 +1 @@
+close_range_test
diff --git a/tools/testing/selftests/core/Makefile b/tools/testing/selftests/core/Makefile
new file mode 100644
index 000000000000..de3ae68aa345
--- /dev/null
+++ b/tools/testing/selftests/core/Makefile
@@ -0,0 +1,6 @@
+CFLAGS += -g -I../../../../usr/include/ -I../../../../include
+
+TEST_GEN_PROGS := close_range_test
+
+include ../lib.mk
+
diff --git a/tools/testing/selftests/core/close_range_test.c b/tools/testing/selftests/core/close_range_test.c
new file mode 100644
index 000000000000..d6e6079d3d53
--- /dev/null
+++ b/tools/testing/selftests/core/close_range_test.c
@@ -0,0 +1,142 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#define _GNU_SOURCE
+#include <errno.h>
+#include <fcntl.h>
+#include <linux/kernel.h>
+#include <limits.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <syscall.h>
+#include <unistd.h>
+
+#include "../kselftest.h"
+
+static inline int sys_close_range(unsigned int fd, unsigned int max_fd,
+				  unsigned int flags)
+{
+	return syscall(__NR_close_range, fd, max_fd, flags);
+}
+
+#ifndef ARRAY_SIZE
+#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
+#endif
+
+int main(int argc, char **argv)
+{
+	const char *test_name = "close_range";
+	int i, ret;
+	int open_fds[101];
+	int fd_max, fd_mid, fd_min;
+
+	ksft_set_plan(9);
+
+	for (i = 0; i < ARRAY_SIZE(open_fds); i++) {
+		int fd;
+
+		fd = open("/dev/null", O_RDONLY | O_CLOEXEC);
+		if (fd < 0) {
+			if (errno == ENOENT)
+				ksft_exit_skip(
+					"%s test: skipping test since /dev/null does not exist\n",
+					test_name);
+
+			ksft_exit_fail_msg(
+				"%s test: %s - failed to open /dev/null\n",
+				strerror(errno), test_name);
+		}
+
+		open_fds[i] = fd;
+	}
+
+	fd_min = open_fds[0];
+	fd_max = open_fds[99];
+
+	ret = sys_close_range(fd_min, fd_max, 1);
+	if (!ret)
+		ksft_exit_fail_msg(
+			"%s test: managed to pass invalid flag value\n",
+			test_name);
+	ksft_test_result_pass("do not allow invalid flag values for close_range()\n");
+
+	fd_mid = open_fds[50];
+	ret = sys_close_range(fd_min, fd_mid, 0);
+	if (ret < 0)
+		ksft_exit_fail_msg(
+			"%s test: Failed to close range of file descriptors from %d to %d\n",
+			test_name, fd_min, fd_mid);
+	ksft_test_result_pass("close_range() from %d to %d\n", fd_min, fd_mid);
+
+	for (i = 0; i <= 50; i++) {
+		ret = fcntl(open_fds[i], F_GETFL);
+		if (ret >= 0)
+			ksft_exit_fail_msg(
+				"%s test: Failed to close range of file descriptors from %d to %d\n",
+				test_name, fd_min, fd_mid);
+	}
+	ksft_test_result_pass("fcntl() verify closed range from %d to %d\n", fd_min, fd_mid);
+
+	/* create a couple of gaps */
+	close(57);
+	close(78);
+	close(81);
+	close(82);
+	close(84);
+	close(90);
+
+	fd_mid = open_fds[51];
+	/* Choose slightly lower limit and leave some fds for a later test */
+	fd_max = open_fds[92];
+	ret = sys_close_range(fd_mid, fd_max, 0);
+	if (ret < 0)
+		ksft_exit_fail_msg(
+			"%s test: Failed to close range of file descriptors from 51 to 100\n",
+			test_name);
+	ksft_test_result_pass("close_range() from %d to %d\n", fd_mid, fd_max);
+
+	for (i = 51; i <= 92; i++) {
+		ret = fcntl(open_fds[i], F_GETFL);
+		if (ret >= 0)
+			ksft_exit_fail_msg(
+				"%s test: Failed to close range of file descriptors from 51 to 100\n",
+				test_name);
+	}
+	ksft_test_result_pass("fcntl() verify closed range from %d to %d\n", fd_mid, fd_max);
+
+	fd_mid = open_fds[93];
+	fd_max = open_fds[99];
+	/* test that the kernel caps and still closes all fds */
+	ret = sys_close_range(fd_mid, UINT_MAX, 0);
+	if (ret < 0)
+		ksft_exit_fail_msg(
+			"%s test: Failed to close range of file descriptors from 51 to 100\n",
+			test_name);
+	ksft_test_result_pass("close_range() from %d to %d\n", fd_mid, fd_max);
+
+	for (i = 93; i < 100; i++) {
+		ret = fcntl(open_fds[i], F_GETFL);
+		if (ret >= 0)
+			ksft_exit_fail_msg(
+				"%s test: Failed to close range of file descriptors from 51 to 100\n",
+				test_name);
+	}
+	ksft_test_result_pass("fcntl() verify closed range from %d to %d\n", fd_mid, fd_max);
+
+	ret = sys_close_range(open_fds[100], open_fds[100], 0);
+	if (ret < 0)
+		ksft_exit_fail_msg(
+			"%s test: Failed to close single file descriptor\n",
+			test_name);
+	ksft_test_result_pass("close_range() closed single file descriptor\n");
+
+	ret = fcntl(open_fds[100], F_GETFL);
+	if (ret >= 0)
+		ksft_exit_fail_msg(
+			"%s test: Failed to close single file descriptor\n",
+			test_name);
+	ksft_test_result_pass("fcntl() verify closed single file descriptor\n");
+
+	return ksft_exit_pass();
+}
-- 
2.21.0


^ permalink raw reply related

* Re: [PATCH v2 1/2] open: add close_range()
From: Oleg Nesterov @ 2019-05-23 16:20 UTC (permalink / raw)
  To: Christian Brauner
  Cc: linux-ia64, linux-sh, ldv, dhowells, linux-kselftest, sparclinux,
	shuah, linux-arch, linux-s390, miklos, x86, torvalds, linux-mips,
	linux-xtensa, tkjos, arnd, jannh, linux-m68k, viro, tglx,
	linux-arm-kernel, fweimer, linux-parisc, linux-api, linux-kernel,
	linux-alpha, linux-fsdevel, linuxppc-dev
In-Reply-To: <20190523154747.15162-2-christian@brauner.io>

On 05/23, Christian Brauner wrote:
>
> +int __close_range(struct files_struct *files, unsigned fd, unsigned max_fd)
> +{
> +	unsigned int cur_max;
> +
> +	if (fd > max_fd)
> +		return -EINVAL;
> +
> +	rcu_read_lock();
> +	cur_max = files_fdtable(files)->max_fds;
> +	rcu_read_unlock();
> +
> +	/* cap to last valid index into fdtable */
> +	max_fd = max(max_fd, (cur_max - 1));
                 ^^^

Hmm. min() ?

Oleg.


^ permalink raw reply

* Re: [PATCH v1 1/2] open: add close_range()
From: Oleg Nesterov @ 2019-05-23 16:21 UTC (permalink / raw)
  To: Christian Brauner
  Cc: linux-ia64, linux-sh, ldv, dhowells, linux-kselftest, sparclinux,
	shuah, linux-arch, linux-s390, miklos, x86, torvalds, linux-mips,
	linux-xtensa, tkjos, arnd, jannh, linux-m68k, viro, tglx,
	linux-arm-kernel, fweimer, linux-parisc, linux-api, linux-kernel,
	linux-alpha, linux-fsdevel, linuxppc-dev
In-Reply-To: <20190523142826.omb7vgygudifmveq@brauner.io>

On 05/23, Christian Brauner wrote:
>
> So given that we would really need another find_next_open_fd() I think
> sticking to the simple cond_resched() version I sent before is better
> for now until we see real-world performance issues.

OK, agreed.

Oleg.


^ permalink raw reply

* Re: [PATCH v1 1/2] open: add close_range()
From: Konstantin Khlebnikov @ 2019-05-23 16:22 UTC (permalink / raw)
  To: Christian Brauner, viro, linux-kernel, linux-fsdevel, linux-api,
	torvalds, fweimer
  Cc: linux-ia64, linux-sh, ldv, dhowells, linux-kselftest, sparclinux,
	shuah, linux-arch, linux-s390, miklos, x86, linux-mips,
	linux-xtensa, tkjos, arnd, jannh, linux-m68k, tglx,
	linux-arm-kernel, linux-parisc, oleg, linux-alpha, linuxppc-dev
In-Reply-To: <20190522155259.11174-1-christian@brauner.io>

On 22.05.2019 18:52, Christian Brauner wrote:> This adds the close_range() syscall. It allows to efficiently close a range
 > of file descriptors up to all file descriptors of a calling task.
 >
 > The syscall came up in a recent discussion around the new mount API and
 > making new file descriptor types cloexec by default. During this
 > discussion, Al suggested the close_range() syscall (cf. [1]). Note, a
 > syscall in this manner has been requested by various people over time.
 >
 > First, it helps to close all file descriptors of an exec()ing task. This
 > can be done safely via (quoting Al's example from [1] verbatim):
 >
 >          /* that exec is sensitive */
 >          unshare(CLONE_FILES);
 >          /* we don't want anything past stderr here */
 >          close_range(3, ~0U);
 >          execve(....);
 >
 > The code snippet above is one way of working around the problem that file
 > descriptors are not cloexec by default. This is aggravated by the fact that
 > we can't just switch them over without massively regressing userspace. For
 > a whole class of programs having an in-kernel method of closing all file
 > descriptors is very helpful (e.g. demons, service managers, programming
 > language standard libraries, container managers etc.).
 > (Please note, unshare(CLONE_FILES) should only be needed if the calling
 >   task is multi-threaded and shares the file descriptor table with another
 >   thread in which case two threads could race with one thread allocating
 >   file descriptors and the other one closing them via close_range(). For the
 >   general case close_range() before the execve() is sufficient.)
 >
 > Second, it allows userspace to avoid implementing closing all file
 > descriptors by parsing through /proc/<pid>/fd/* and calling close() on each
 > file descriptor. From looking at various large(ish) userspace code bases
 > this or similar patterns are very common in:
 > - service managers (cf. [4])
 > - libcs (cf. [6])
 > - container runtimes (cf. [5])
 > - programming language runtimes/standard libraries
 >    - Python (cf. [2])
 >    - Rust (cf. [7], [8])
 > As Dmitry pointed out there's even a long-standing glibc bug about missing
 > kernel support for this task (cf. [3]).
 > In addition, the syscall will also work for tasks that do not have procfs
 > mounted and on kernels that do not have procfs support compiled in. In such
 > situations the only way to make sure that all file descriptors are closed
 > is to call close() on each file descriptor up to UINT_MAX or RLIMIT_NOFILE,
 > OPEN_MAX trickery (cf. comment [8] on Rust).
 >
 > The performance is striking. For good measure, comparing the following
 > simple close_all_fds() userspace implementation that is essentially just
 > glibc's version in [6]:
 >
 > static int close_all_fds(void)
 > {
 >          int dir_fd;
 >          DIR *dir;
 >          struct dirent *direntp;
 >
 >          dir = opendir("/proc/self/fd");
 >          if (!dir)
 >                  return -1;
 >          dir_fd = dirfd(dir);
 >          while ((direntp = readdir(dir))) {
 >                  int fd;
 >                  if (strcmp(direntp->d_name, ".") == 0)
 >                          continue;
 >                  if (strcmp(direntp->d_name, "..") == 0)
 >                          continue;
 >                  fd = atoi(direntp->d_name);
 >                  if (fd == dir_fd || fd == 0 || fd == 1 || fd == 2)
 >                          continue;
 >                  close(fd);
 >          }
 >          closedir(dir);
 >          return 0;
 > }
 >
 > to close_range() yields:
 > 1. closing 4 open files:
 >     - close_all_fds(): ~280 us
 >     - close_range():    ~24 us
 >
 > 2. closing 1000 open files:
 >     - close_all_fds(): ~5000 us
 >     - close_range():   ~800 us
 >
 > close_range() is designed to allow for some flexibility. Specifically, it
 > does not simply always close all open file descriptors of a task. Instead,
 > callers can specify an upper bound.
 > This is e.g. useful for scenarios where specific file descriptors are
 > created with well-known numbers that are supposed to be excluded from
 > getting closed.
 > For extra paranoia close_range() comes with a flags argument. This can e.g.
 > be used to implement extension. Once can imagine userspace wanting to stop
 > at the first error instead of ignoring errors under certain circumstances.

 > There might be other valid ideas in the future. In any case, a flag
 > argument doesn't hurt and keeps us on the safe side.

Here is another strange but real-live scenario: crash handler for dumping core.

If applications has network connections it would be better to close them all,
otherwise clients will wait until end of dumping process or timeout.
Also closing normal files might be a good idea for releasing locks.

But simple closing might race with other threads - closed fd will be reused
while some code still thinks it refers to original file.

Our solution closes files without freeing fd: it opens /dev/null and
replaces all opened descriptors using dup2.

So, special flag for close_range() could close files without clearing bitmap.
Effect should be the same - fd wouldn't be reused.

Actually two flags for two phases: closing files and releasing fd.

 >
 >  From an implementation side this is kept rather dumb. It saw some input
 > from David and Jann but all nonsense is obviously my own!
 > - Errors to close file descriptors are currently ignored. (Could be changed
 >    by setting a flag in the future if needed.)
 > - __close_range() is a rather simplistic wrapper around __close_fd().
 >    My reasoning behind this is based on the nature of how __close_fd() needs
 >    to release an fd. But maybe I misunderstood specifics:
 >    We take the files_lock and rcu-dereference the fdtable of the calling
 >    task, we find the entry in the fdtable, get the file and need to release
 >    files_lock before calling filp_close().
 >    In the meantime the fdtable might have been altered so we can't just
 >    retake the spinlock and keep the old rcu-reference of the fdtable
 >    around. Instead we need to grab a fresh reference to the fdtable.
 >    If my reasoning is correct then there's really no point in fancyfying
 >    __close_range(): We just need to rcu-dereference the fdtable of the
 >    calling task once to cap the max_fd value correctly and then go on
 >    calling __close_fd() in a loop.
 >
 > /* References */
 > [1]: https://lore.kernel.org/lkml/20190516165021.GD17978@ZenIV.linux.org.uk/
 > [2]: https://github.com/python/cpython/blob/9e4f2f3a6b8ee995c365e86d976937c141d867f8/Modules/_posixsubprocess.c#L220
 > [3]: https://sourceware.org/bugzilla/show_bug.cgi?id=10353#c7
 > [4]: https://github.com/systemd/systemd/blob/5238e9575906297608ff802a27e2ff9effa3b338/src/basic/fd-util.c#L217
 > [5]: https://github.com/lxc/lxc/blob/ddf4b77e11a4d08f09b7b9cd13e593f8c047edc5/src/lxc/start.c#L236
 > [6]: 
https://sourceware.org/git/?p=glibc.git;a=blob;f=sysdeps/unix/sysv/linux/grantpt.c;h=2030e07fa6e652aac32c775b8c6e005844c3c4eb;hb=HEAD#l17
 >       Note that this is an internal implementation that is not exported.
 >       Currently, libc seems to not provide an exported version of this
 >       because of missing kernel support to do this.
 > [7]: https://github.com/rust-lang/rust/issues/12148
 > [8]: https://github.com/rust-lang/rust/blob/5f47c0613ed4eb46fca3633c1297364c09e5e451/src/libstd/sys/unix/process2.rs#L303-L308
 >       Rust's solution is slightly different but is equally unperformant.
 >       Rust calls getdtablesize() which is a glibc library function that
 >       simply returns the current RLIMIT_NOFILE or OPEN_MAX values. Rust then
 >       goes on to call close() on each fd. That's obviously overkill for most
 >       tasks. Rarely, tasks - especially non-demons - hit RLIMIT_NOFILE or
 >       OPEN_MAX.
 >       Let's be nice and assume an unprivileged user with RLIMIT_NOFILE set
 >       to 1024. Even in this case, there's a very high chance that in the
 >       common case Rust is calling the close() syscall 1021 times pointlessly
 >       if the task just has 0, 1, and 2 open.
 >
 > Suggested-by: Al Viro <viro@zeniv.linux.org.uk>
 > Signed-off-by: Christian Brauner <christian@brauner.io>
 > Cc: Arnd Bergmann <arnd@arndb.de>
 > Cc: Jann Horn <jannh@google.com>
 > Cc: David Howells <dhowells@redhat.com>
 > Cc: Dmitry V. Levin <ldv@altlinux.org>
 > Cc: Oleg Nesterov <oleg@redhat.com>
 > Cc: Linus Torvalds <torvalds@linux-foundation.org>
 > Cc: Florian Weimer <fweimer@redhat.com>
 > Cc: linux-api@vger.kernel.org
 > ---
 > v1:
 > - Linus Torvalds <torvalds@linux-foundation.org>:
 >    - add cond_resched() to yield cpu when closing a lot of file descriptors
 > - Al Viro <viro@zeniv.linux.org.uk>:
 >    - add cond_resched() to yield cpu when closing a lot of file descriptors
 > ---
 >   arch/alpha/kernel/syscalls/syscall.tbl      |  1 +
 >   arch/arm/tools/syscall.tbl                  |  1 +
 >   arch/arm64/include/asm/unistd32.h           |  2 +
 >   arch/ia64/kernel/syscalls/syscall.tbl       |  1 +
 >   arch/m68k/kernel/syscalls/syscall.tbl       |  1 +
 >   arch/microblaze/kernel/syscalls/syscall.tbl |  1 +
 >   arch/mips/kernel/syscalls/syscall_n32.tbl   |  1 +
 >   arch/mips/kernel/syscalls/syscall_n64.tbl   |  1 +
 >   arch/mips/kernel/syscalls/syscall_o32.tbl   |  1 +
 >   arch/parisc/kernel/syscalls/syscall.tbl     |  1 +
 >   arch/powerpc/kernel/syscalls/syscall.tbl    |  1 +
 >   arch/s390/kernel/syscalls/syscall.tbl       |  1 +
 >   arch/sh/kernel/syscalls/syscall.tbl         |  1 +
 >   arch/sparc/kernel/syscalls/syscall.tbl      |  1 +
 >   arch/x86/entry/syscalls/syscall_32.tbl      |  1 +
 >   arch/x86/entry/syscalls/syscall_64.tbl      |  1 +
 >   arch/xtensa/kernel/syscalls/syscall.tbl     |  1 +
 >   fs/file.c                                   | 63 ++++++++++++++++++---
 >   fs/open.c                                   | 20 +++++++
 >   include/linux/fdtable.h                     |  2 +
 >   include/linux/syscalls.h                    |  2 +
 >   include/uapi/asm-generic/unistd.h           |  4 +-
 >   22 files changed, 100 insertions(+), 9 deletions(-)
 >

It would be better to split arch/ wiring into separate patch for better readability.


^ permalink raw reply

* RE: [PATCH v1 1/2] open: add close_range()
From: David Laight @ 2019-05-23 16:29 UTC (permalink / raw)
  To: 'Konstantin Khlebnikov', Christian Brauner,
	viro@zeniv.linux.org.uk, linux-kernel@vger.kernel.org,
	linux-fsdevel@vger.kernel.org, linux-api@vger.kernel.org,
	torvalds@linux-foundation.org, fweimer@redhat.com
  Cc: linux-ia64@vger.kernel.org, linux-sh@vger.kernel.org,
	ldv@altlinux.org, dhowells@redhat.com,
	linux-kselftest@vger.kernel.org, sparclinux@vger.kernel.org,
	shuah@kernel.org, linux-arch@vger.kernel.org,
	linux-s390@vger.kernel.org, miklos@szeredi.hu, x86@kernel.org,
	linux-mips@vger.kernel.org, linux-xtensa@linux-xtensa.org,
	tkjos@android.com, arnd@arndb.de, jannh@google.com,
	linux-m68k@lists.linux-m68k.org, tglx@linutronix.de,
	linux-arm-kernel@lists.infradead.org,
	linux-parisc@vger.kernel.org, oleg@redhat.com,
	linux-alpha@vger.kernel.org, linuxppc-dev@lists.ozlabs.org
In-Reply-To: <67e4458a-9cc4-d1aa-608c-73ebe9e2f7a3@yandex-team.ru>

From:  Konstantin Khlebnikov
> Sent: 23 May 2019 17:22
....
>  > In addition, the syscall will also work for tasks that do not have procfs
>  > mounted and on kernels that do not have procfs support compiled in. In such
>  > situations the only way to make sure that all file descriptors are closed
>  > is to call close() on each file descriptor up to UINT_MAX or RLIMIT_NOFILE,
>  > OPEN_MAX trickery (cf. comment [8] on Rust).

Code using RLIMIT_NOFILE is broken.
It is easy to reduce the hard limit below that of an open fd.

	David

-
Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK
Registration No: 1397386 (Wales)

^ permalink raw reply

* Re: [PATCH v1 1/2] open: add close_range()
From: Christian Brauner @ 2019-05-23 16:33 UTC (permalink / raw)
  To: Konstantin Khlebnikov
  Cc: linux-ia64, linux-sh, oleg, dhowells, linux-kselftest, sparclinux,
	shuah, linux-arch, linux-s390, miklos, x86, torvalds, linux-mips,
	linux-xtensa, tkjos, arnd, jannh, linux-m68k, viro, tglx, ldv,
	linux-arm-kernel, fweimer, linux-parisc, linux-api, linux-kernel,
	linux-alpha, linux-fsdevel, linuxppc-dev
In-Reply-To: <67e4458a-9cc4-d1aa-608c-73ebe9e2f7a3@yandex-team.ru>

On Thu, May 23, 2019 at 07:22:17PM +0300, Konstantin Khlebnikov wrote:
> On 22.05.2019 18:52, Christian Brauner wrote:> This adds the close_range() syscall. It allows to efficiently close a range
> > of file descriptors up to all file descriptors of a calling task.
> >
> > The syscall came up in a recent discussion around the new mount API and
> > making new file descriptor types cloexec by default. During this
> > discussion, Al suggested the close_range() syscall (cf. [1]). Note, a
> > syscall in this manner has been requested by various people over time.
> >
> > First, it helps to close all file descriptors of an exec()ing task. This
> > can be done safely via (quoting Al's example from [1] verbatim):
> >
> >          /* that exec is sensitive */
> >          unshare(CLONE_FILES);
> >          /* we don't want anything past stderr here */
> >          close_range(3, ~0U);
> >          execve(....);
> >
> > The code snippet above is one way of working around the problem that file
> > descriptors are not cloexec by default. This is aggravated by the fact that
> > we can't just switch them over without massively regressing userspace. For
> > a whole class of programs having an in-kernel method of closing all file
> > descriptors is very helpful (e.g. demons, service managers, programming
> > language standard libraries, container managers etc.).
> > (Please note, unshare(CLONE_FILES) should only be needed if the calling
> >   task is multi-threaded and shares the file descriptor table with another
> >   thread in which case two threads could race with one thread allocating
> >   file descriptors and the other one closing them via close_range(). For the
> >   general case close_range() before the execve() is sufficient.)
> >
> > Second, it allows userspace to avoid implementing closing all file
> > descriptors by parsing through /proc/<pid>/fd/* and calling close() on each
> > file descriptor. From looking at various large(ish) userspace code bases
> > this or similar patterns are very common in:
> > - service managers (cf. [4])
> > - libcs (cf. [6])
> > - container runtimes (cf. [5])
> > - programming language runtimes/standard libraries
> >    - Python (cf. [2])
> >    - Rust (cf. [7], [8])
> > As Dmitry pointed out there's even a long-standing glibc bug about missing
> > kernel support for this task (cf. [3]).
> > In addition, the syscall will also work for tasks that do not have procfs
> > mounted and on kernels that do not have procfs support compiled in. In such
> > situations the only way to make sure that all file descriptors are closed
> > is to call close() on each file descriptor up to UINT_MAX or RLIMIT_NOFILE,
> > OPEN_MAX trickery (cf. comment [8] on Rust).
> >
> > The performance is striking. For good measure, comparing the following
> > simple close_all_fds() userspace implementation that is essentially just
> > glibc's version in [6]:
> >
> > static int close_all_fds(void)
> > {
> >          int dir_fd;
> >          DIR *dir;
> >          struct dirent *direntp;
> >
> >          dir = opendir("/proc/self/fd");
> >          if (!dir)
> >                  return -1;
> >          dir_fd = dirfd(dir);
> >          while ((direntp = readdir(dir))) {
> >                  int fd;
> >                  if (strcmp(direntp->d_name, ".") == 0)
> >                          continue;
> >                  if (strcmp(direntp->d_name, "..") == 0)
> >                          continue;
> >                  fd = atoi(direntp->d_name);
> >                  if (fd == dir_fd || fd == 0 || fd == 1 || fd == 2)
> >                          continue;
> >                  close(fd);
> >          }
> >          closedir(dir);
> >          return 0;
> > }
> >
> > to close_range() yields:
> > 1. closing 4 open files:
> >     - close_all_fds(): ~280 us
> >     - close_range():    ~24 us
> >
> > 2. closing 1000 open files:
> >     - close_all_fds(): ~5000 us
> >     - close_range():   ~800 us
> >
> > close_range() is designed to allow for some flexibility. Specifically, it
> > does not simply always close all open file descriptors of a task. Instead,
> > callers can specify an upper bound.
> > This is e.g. useful for scenarios where specific file descriptors are
> > created with well-known numbers that are supposed to be excluded from
> > getting closed.
> > For extra paranoia close_range() comes with a flags argument. This can e.g.
> > be used to implement extension. Once can imagine userspace wanting to stop
> > at the first error instead of ignoring errors under certain circumstances.
> 
> > There might be other valid ideas in the future. In any case, a flag
> > argument doesn't hurt and keeps us on the safe side.
> 
> Here is another strange but real-live scenario: crash handler for dumping core.
> 
> If applications has network connections it would be better to close them all,
> otherwise clients will wait until end of dumping process or timeout.
> Also closing normal files might be a good idea for releasing locks.
> 
> But simple closing might race with other threads - closed fd will be reused
> while some code still thinks it refers to original file.
> 
> Our solution closes files without freeing fd: it opens /dev/null and
> replaces all opened descriptors using dup2.
> 
> So, special flag for close_range() could close files without clearing bitmap.
> Effect should be the same - fd wouldn't be reused.
> 
> Actually two flags for two phases: closing files and releasing fd.
> 
> >
> >  From an implementation side this is kept rather dumb. It saw some input
> > from David and Jann but all nonsense is obviously my own!
> > - Errors to close file descriptors are currently ignored. (Could be changed
> >    by setting a flag in the future if needed.)
> > - __close_range() is a rather simplistic wrapper around __close_fd().
> >    My reasoning behind this is based on the nature of how __close_fd() needs
> >    to release an fd. But maybe I misunderstood specifics:
> >    We take the files_lock and rcu-dereference the fdtable of the calling
> >    task, we find the entry in the fdtable, get the file and need to release
> >    files_lock before calling filp_close().
> >    In the meantime the fdtable might have been altered so we can't just
> >    retake the spinlock and keep the old rcu-reference of the fdtable
> >    around. Instead we need to grab a fresh reference to the fdtable.
> >    If my reasoning is correct then there's really no point in fancyfying
> >    __close_range(): We just need to rcu-dereference the fdtable of the
> >    calling task once to cap the max_fd value correctly and then go on
> >    calling __close_fd() in a loop.
> >
> > /* References */
> > [1]: https://lore.kernel.org/lkml/20190516165021.GD17978@ZenIV.linux.org.uk/
> > [2]: https://github.com/python/cpython/blob/9e4f2f3a6b8ee995c365e86d976937c141d867f8/Modules/_posixsubprocess.c#L220
> > [3]: https://sourceware.org/bugzilla/show_bug.cgi?id=10353#c7
> > [4]: https://github.com/systemd/systemd/blob/5238e9575906297608ff802a27e2ff9effa3b338/src/basic/fd-util.c#L217
> > [5]: https://github.com/lxc/lxc/blob/ddf4b77e11a4d08f09b7b9cd13e593f8c047edc5/src/lxc/start.c#L236
> > [6]: https://sourceware.org/git/?p=glibc.git;a=blob;f=sysdeps/unix/sysv/linux/grantpt.c;h=2030e07fa6e652aac32c775b8c6e005844c3c4eb;hb=HEAD#l17
> >       Note that this is an internal implementation that is not exported.
> >       Currently, libc seems to not provide an exported version of this
> >       because of missing kernel support to do this.
> > [7]: https://github.com/rust-lang/rust/issues/12148
> > [8]: https://github.com/rust-lang/rust/blob/5f47c0613ed4eb46fca3633c1297364c09e5e451/src/libstd/sys/unix/process2.rs#L303-L308
> >       Rust's solution is slightly different but is equally unperformant.
> >       Rust calls getdtablesize() which is a glibc library function that
> >       simply returns the current RLIMIT_NOFILE or OPEN_MAX values. Rust then
> >       goes on to call close() on each fd. That's obviously overkill for most
> >       tasks. Rarely, tasks - especially non-demons - hit RLIMIT_NOFILE or
> >       OPEN_MAX.
> >       Let's be nice and assume an unprivileged user with RLIMIT_NOFILE set
> >       to 1024. Even in this case, there's a very high chance that in the
> >       common case Rust is calling the close() syscall 1021 times pointlessly
> >       if the task just has 0, 1, and 2 open.
> >
> > Suggested-by: Al Viro <viro@zeniv.linux.org.uk>
> > Signed-off-by: Christian Brauner <christian@brauner.io>
> > Cc: Arnd Bergmann <arnd@arndb.de>
> > Cc: Jann Horn <jannh@google.com>
> > Cc: David Howells <dhowells@redhat.com>
> > Cc: Dmitry V. Levin <ldv@altlinux.org>
> > Cc: Oleg Nesterov <oleg@redhat.com>
> > Cc: Linus Torvalds <torvalds@linux-foundation.org>
> > Cc: Florian Weimer <fweimer@redhat.com>
> > Cc: linux-api@vger.kernel.org
> > ---
> > v1:
> > - Linus Torvalds <torvalds@linux-foundation.org>:
> >    - add cond_resched() to yield cpu when closing a lot of file descriptors
> > - Al Viro <viro@zeniv.linux.org.uk>:
> >    - add cond_resched() to yield cpu when closing a lot of file descriptors
> > ---
> >   arch/alpha/kernel/syscalls/syscall.tbl      |  1 +
> >   arch/arm/tools/syscall.tbl                  |  1 +
> >   arch/arm64/include/asm/unistd32.h           |  2 +
> >   arch/ia64/kernel/syscalls/syscall.tbl       |  1 +
> >   arch/m68k/kernel/syscalls/syscall.tbl       |  1 +
> >   arch/microblaze/kernel/syscalls/syscall.tbl |  1 +
> >   arch/mips/kernel/syscalls/syscall_n32.tbl   |  1 +
> >   arch/mips/kernel/syscalls/syscall_n64.tbl   |  1 +
> >   arch/mips/kernel/syscalls/syscall_o32.tbl   |  1 +
> >   arch/parisc/kernel/syscalls/syscall.tbl     |  1 +
> >   arch/powerpc/kernel/syscalls/syscall.tbl    |  1 +
> >   arch/s390/kernel/syscalls/syscall.tbl       |  1 +
> >   arch/sh/kernel/syscalls/syscall.tbl         |  1 +
> >   arch/sparc/kernel/syscalls/syscall.tbl      |  1 +
> >   arch/x86/entry/syscalls/syscall_32.tbl      |  1 +
> >   arch/x86/entry/syscalls/syscall_64.tbl      |  1 +
> >   arch/xtensa/kernel/syscalls/syscall.tbl     |  1 +
> >   fs/file.c                                   | 63 ++++++++++++++++++---
> >   fs/open.c                                   | 20 +++++++
> >   include/linux/fdtable.h                     |  2 +
> >   include/linux/syscalls.h                    |  2 +
> >   include/uapi/asm-generic/unistd.h           |  4 +-
> >   22 files changed, 100 insertions(+), 9 deletions(-)
> >
> 
> It would be better to split arch/ wiring into separate patch for better readability.

Ok. You mean only do x86 - seems to be the standard - and then move the
others into a separate patch? Doesn't seem worth to have a patch
per-arch, I'd think.

^ permalink raw reply

* Re: [PATCH v2 1/2] open: add close_range()
From: Christian Brauner @ 2019-05-23 16:34 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: linux-ia64, linux-sh, ldv, dhowells, linux-kselftest, sparclinux,
	shuah, linux-arch, linux-s390, miklos, x86, torvalds, linux-mips,
	linux-xtensa, tkjos, arnd, jannh, linux-m68k, viro, tglx,
	linux-arm-kernel, fweimer, linux-parisc, linux-api, linux-kernel,
	linux-alpha, linux-fsdevel, linuxppc-dev
In-Reply-To: <20190523162004.GC23070@redhat.com>

On Thu, May 23, 2019 at 06:20:05PM +0200, Oleg Nesterov wrote:
> On 05/23, Christian Brauner wrote:
> >
> > +int __close_range(struct files_struct *files, unsigned fd, unsigned max_fd)
> > +{
> > +	unsigned int cur_max;
> > +
> > +	if (fd > max_fd)
> > +		return -EINVAL;
> > +
> > +	rcu_read_lock();
> > +	cur_max = files_fdtable(files)->max_fds;
> > +	rcu_read_unlock();
> > +
> > +	/* cap to last valid index into fdtable */
> > +	max_fd = max(max_fd, (cur_max - 1));
>                  ^^^
> 
> Hmm. min() ?

Yes, thanks! Massive brainf*rt on my end, sorry.

Christian

^ permalink raw reply

* Re: [BISECTED] kexec regression on PowerBook G4
From: Aaro Koskinen @ 2019-05-23 17:27 UTC (permalink / raw)
  To: Christophe Leroy; +Cc: linuxppc-dev
In-Reply-To: <a3619327-14ba-ff34-913f-cf3384284c9a@c-s.fr>

Hi,

On Thu, May 23, 2019 at 07:33:38AM +0200, Christophe Leroy wrote:
> Ok, the Oops confirms that the error is due to executing the kexec control
> code which is located outside the kernel text area.
> 
> My yesterday's proposed change doesn't work because on book3S/32, NX
> protection is based on setting segments to NX, and using IBATs for kernel
> text.
> 
> Can you try the patch I sent out a few minutes ago ?
> (https://patchwork.ozlabs.org/patch/1103827/)

It now crashes with "BUG: Unable to handle kernel instruction fetch"
and the faulting address is 0xef13a000.

A.

^ permalink raw reply

* Re: Failure to boot G4: dt_headr_start=0x01501000
From: Christophe Leroy @ 2019-05-23 18:08 UTC (permalink / raw)
  To: Mathieu Malaterre; +Cc: linuxppc-dev
In-Reply-To: <CA+7wUsxoCrqw5MH+8QrT4kVVC0AcwdgUu3Zuy+-GGU=bU1_ezg@mail.gmail.com>

On 05/23/2019 10:16 AM, Mathieu Malaterre wrote:
> On Thu, May 23, 2019 at 11:45 AM Christophe Leroy
> <christophe.leroy@c-s.fr> wrote:
>>
>>
>>
>> Le 23/05/2019 à 10:53, Mathieu Malaterre a écrit :
>>>
>>> I confirm powerpc/merge does not boot for me (same config). Commit id:
>>>
>>> a27eaa62326d (powerpc/merge) Automatic merge of branches 'master',
>>> 'next' and 'fixes' into merge
>>
>> I see in the config you sent me that you have selected CONFIG_KASAN,
>> which is a big new stuff.
>>
>> Can you try without it ?
> 
> With same config but CONFIG_KASAN=n (on top of a27eaa62326d), I can
> reproduce the boot failure (no change).
> 
> Time for bisect ?
> 

I found the issue. In order to be able to support KASAN, the setup of 
segments have moved earlier in the boot. Your problem is a side effect 
of this change.
Function setup_disp_bat() is supposed to setup BAT3 for btext data.
But setup_disp_bat() rely on someone setting in disp_BAT the values to 
be loaded into BATs. This is done by btext_prepare_BAT() which is called 
by bootx_init().
The problem is that bootx_init() is never called, so setup_disp_bat() 
does nothing and the access to btext data is possible because the 
bootloader has set an entry for it in the hash table.

But by setting up the segment earlier, we break the bootloader hash 
table, which shouldn't be an issue if the BATs had been set properly as 
expected.

The problematic commit is 215b823707ce ("powerpc/32s: set up an early 
static hash table for KASAN)"

Here is a dirty fix that works for me when CONFIG_KASAN is NOT set.
Of course, the real fix has to be to setup the BATs properly, but I 
won't have time to look at that before June. Maybe you can ?

diff --git a/arch/powerpc/kernel/head_32.S b/arch/powerpc/kernel/head_32.S
index 755fab9641d6..fba16970c028 100644
--- a/arch/powerpc/kernel/head_32.S
+++ b/arch/powerpc/kernel/head_32.S
@@ -162,7 +162,6 @@ __after_mmu_off:
  	bl	flush_tlbs

  	bl	initial_bats
-	bl	load_segment_registers
  #ifdef CONFIG_KASAN
  	bl	early_hash_table
  #endif
@@ -920,6 +919,7 @@ END_MMU_FTR_SECTION_IFSET(MMU_FTR_HPTE_TABLE)
  	RFI
  /* Load up the kernel context */
  2:	bl	load_up_mmu
+	bl	load_segment_registers

  #ifdef CONFIG_BDI_SWITCH
  	/* Add helper information for the Abatron bdiGDB debugger.

Christophe

^ permalink raw reply related

* Re: [BISECTED] kexec regression on PowerBook G4
From: Christophe Leroy @ 2019-05-23 18:58 UTC (permalink / raw)
  To: Aaro Koskinen; +Cc: linuxppc-dev
In-Reply-To: <20190523172717.GA5234@darkstar.musicnaut.iki.fi>



Le 23/05/2019 à 19:27, Aaro Koskinen a écrit :
> Hi,
> 
> On Thu, May 23, 2019 at 07:33:38AM +0200, Christophe Leroy wrote:
>> Ok, the Oops confirms that the error is due to executing the kexec control
>> code which is located outside the kernel text area.
>>
>> My yesterday's proposed change doesn't work because on book3S/32, NX
>> protection is based on setting segments to NX, and using IBATs for kernel
>> text.
>>
>> Can you try the patch I sent out a few minutes ago ?
>> (https://patchwork.ozlabs.org/patch/1103827/)
> 
> It now crashes with "BUG: Unable to handle kernel instruction fetch"
> and the faulting address is 0xef13a000.
> 

Ok.

Can you try with both changes at the same time, ie the mtsrin(...) and 
the change_page_attr() ?

I suspect that allthough the HW is not able to check EXEC flag, the SW 
will check it before loading the hash entry.

Christophe

^ permalink raw reply

* [PATCH 5.0 075/139] x86/mpx, mm/core: Fix recursive munmap() corruption
From: Greg Kroah-Hartman @ 2019-05-23 19:06 UTC (permalink / raw)
  To: linux-kernel
  Cc: Michal Hocko, Peter Zijlstra, Dave Hansen, linux-mm,
	Paul Mackerras, H. Peter Anvin, Ingo Molnar, Anton Ivanov,
	linux-arch, Richard Weinberger, H.J. Lu, Rik van Riel, Jeff Dike,
	linux-um, linuxppc-dev, Borislav Petkov, Andy Lutomirski,
	Yang Shi, Guan Xuetao, Vlastimil Babka, Richard Biener,
	Greg Kroah-Hartman, stable, Andrew Morton, Linus Torvalds
In-Reply-To: <20190523181720.120897565@linuxfoundation.org>

From: Dave Hansen <dave.hansen@linux.intel.com>

commit 5a28fc94c9143db766d1ba5480cae82d856ad080 upstream.

This is a bit of a mess, to put it mildly.  But, it's a bug
that only seems to have showed up in 4.20 but wasn't noticed
until now, because nobody uses MPX.

MPX has the arch_unmap() hook inside of munmap() because MPX
uses bounds tables that protect other areas of memory.  When
memory is unmapped, there is also a need to unmap the MPX
bounds tables.  Barring this, unused bounds tables can eat 80%
of the address space.

But, the recursive do_munmap() that gets called vi arch_unmap()
wreaks havoc with __do_munmap()'s state.  It can result in
freeing populated page tables, accessing bogus VMA state,
double-freed VMAs and more.

See the "long story" further below for the gory details.

To fix this, call arch_unmap() before __do_unmap() has a chance
to do anything meaningful.  Also, remove the 'vma' argument
and force the MPX code to do its own, independent VMA lookup.

== UML / unicore32 impact ==

Remove unused 'vma' argument to arch_unmap().  No functional
change.

I compile tested this on UML but not unicore32.

== powerpc impact ==

powerpc uses arch_unmap() well to watch for munmap() on the
VDSO and zeroes out 'current->mm->context.vdso_base'.  Moving
arch_unmap() makes this happen earlier in __do_munmap().  But,
'vdso_base' seems to only be used in perf and in the signal
delivery that happens near the return to userspace.  I can not
find any likely impact to powerpc, other than the zeroing
happening a little earlier.

powerpc does not use the 'vma' argument and is unaffected by
its removal.

I compile-tested a 64-bit powerpc defconfig.

== x86 impact ==

For the common success case this is functionally identical to
what was there before.  For the munmap() failure case, it's
possible that some MPX tables will be zapped for memory that
continues to be in use.  But, this is an extraordinarily
unlikely scenario and the harm would be that MPX provides no
protection since the bounds table got reset (zeroed).

I can't imagine anyone doing this:

	ptr = mmap();
	// use ptr
	ret = munmap(ptr);
	if (ret)
		// oh, there was an error, I'll
		// keep using ptr.

Because if you're doing munmap(), you are *done* with the
memory.  There's probably no good data in there _anyway_.

This passes the original reproducer from Richard Biener as
well as the existing mpx selftests/.

The long story:

munmap() has a couple of pieces:

 1. Find the affected VMA(s)
 2. Split the start/end one(s) if neceesary
 3. Pull the VMAs out of the rbtree
 4. Actually zap the memory via unmap_region(), including
    freeing page tables (or queueing them to be freed).
 5. Fix up some of the accounting (like fput()) and actually
    free the VMA itself.

This specific ordering was actually introduced by:

  dd2283f2605e ("mm: mmap: zap pages with read mmap_sem in munmap")

during the 4.20 merge window.  The previous __do_munmap() code
was actually safe because the only thing after arch_unmap() was
remove_vma_list().  arch_unmap() could not see 'vma' in the
rbtree because it was detached, so it is not even capable of
doing operations unsafe for remove_vma_list()'s use of 'vma'.

Richard Biener reported a test that shows this in dmesg:

  [1216548.787498] BUG: Bad rss-counter state mm:0000000017ce560b idx:1 val:551
  [1216548.787500] BUG: non-zero pgtables_bytes on freeing mm: 24576

What triggered this was the recursive do_munmap() called via
arch_unmap().  It was freeing page tables that has not been
properly zapped.

But, the problem was bigger than this.  For one, arch_unmap()
can free VMAs.  But, the calling __do_munmap() has variables
that *point* to VMAs and obviously can't handle them just
getting freed while the pointer is still in use.

I tried a couple of things here.  First, I tried to fix the page
table freeing problem in isolation, but I then found the VMA
issue.  I also tried having the MPX code return a flag if it
modified the rbtree which would force __do_munmap() to re-walk
to restart.  That spiralled out of control in complexity pretty
fast.

Just moving arch_unmap() and accepting that the bonkers failure
case might eat some bounds tables seems like the simplest viable
fix.

This was also reported in the following kernel bugzilla entry:

  https://bugzilla.kernel.org/show_bug.cgi?id=203123

There are some reports that this commit triggered this bug:

  dd2283f2605 ("mm: mmap: zap pages with read mmap_sem in munmap")

While that commit certainly made the issues easier to hit, I believe
the fundamental issue has been with us as long as MPX itself, thus
the Fixes: tag below is for one of the original MPX commits.

[ mingo: Minor edits to the changelog and the patch. ]

Reported-by: Richard Biener <rguenther@suse.de>
Reported-by: H.J. Lu <hjl.tools@gmail.com>
Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
Reviewed-by Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Yang Shi <yang.shi@linux.alibaba.com>
Acked-by: Michael Ellerman <mpe@ellerman.id.au>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Anton Ivanov <anton.ivanov@cambridgegreys.com>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Guan Xuetao <gxt@pku.edu.cn>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Jeff Dike <jdike@addtoit.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Richard Weinberger <richard@nod.at>
Cc: Rik van Riel <riel@surriel.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: linux-arch@vger.kernel.org
Cc: linux-mm@kvack.org
Cc: linux-um@lists.infradead.org
Cc: linuxppc-dev@lists.ozlabs.org
Cc: stable@vger.kernel.org
Fixes: dd2283f2605e ("mm: mmap: zap pages with read mmap_sem in munmap")
Link: http://lkml.kernel.org/r/20190419194747.5E1AD6DC@viggo.jf.intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>

---
 arch/powerpc/include/asm/mmu_context.h   |    1 -
 arch/um/include/asm/mmu_context.h        |    1 -
 arch/unicore32/include/asm/mmu_context.h |    1 -
 arch/x86/include/asm/mmu_context.h       |    6 +++---
 arch/x86/include/asm/mpx.h               |   15 ++++++++-------
 arch/x86/mm/mpx.c                        |   10 ++++++----
 include/asm-generic/mm_hooks.h           |    1 -
 mm/mmap.c                                |   15 ++++++++-------
 8 files changed, 25 insertions(+), 25 deletions(-)

--- a/arch/powerpc/include/asm/mmu_context.h
+++ b/arch/powerpc/include/asm/mmu_context.h
@@ -237,7 +237,6 @@ extern void arch_exit_mmap(struct mm_str
 #endif

 static inline void arch_unmap(struct mm_struct *mm,
-			      struct vm_area_struct *vma,
 			      unsigned long start, unsigned long end)
 {
 	if (start <= mm->context.vdso_base && mm->context.vdso_base < end)
--- a/arch/um/include/asm/mmu_context.h
+++ b/arch/um/include/asm/mmu_context.h
@@ -22,7 +22,6 @@ static inline int arch_dup_mmap(struct m
 }
 extern void arch_exit_mmap(struct mm_struct *mm);
 static inline void arch_unmap(struct mm_struct *mm,
-			struct vm_area_struct *vma,
 			unsigned long start, unsigned long end)
 {
 }
--- a/arch/unicore32/include/asm/mmu_context.h
+++ b/arch/unicore32/include/asm/mmu_context.h
@@ -88,7 +88,6 @@ static inline int arch_dup_mmap(struct m
 }

 static inline void arch_unmap(struct mm_struct *mm,
-			struct vm_area_struct *vma,
 			unsigned long start, unsigned long end)
 {
 }
--- a/arch/x86/include/asm/mmu_context.h
+++ b/arch/x86/include/asm/mmu_context.h
@@ -277,8 +277,8 @@ static inline void arch_bprm_mm_init(str
 	mpx_mm_init(mm);
 }

-static inline void arch_unmap(struct mm_struct *mm, struct vm_area_struct *vma,
-			      unsigned long start, unsigned long end)
+static inline void arch_unmap(struct mm_struct *mm, unsigned long start,
+			      unsigned long end)
 {
 	/*
 	 * mpx_notify_unmap() goes and reads a rarely-hot
@@ -298,7 +298,7 @@ static inline void arch_unmap(struct mm_
 	 * consistently wrong.
 	 */
 	if (unlikely(cpu_feature_enabled(X86_FEATURE_MPX)))
-		mpx_notify_unmap(mm, vma, start, end);
+		mpx_notify_unmap(mm, start, end);
 }

 /*
--- a/arch/x86/include/asm/mpx.h
+++ b/arch/x86/include/asm/mpx.h
@@ -64,12 +64,15 @@ struct mpx_fault_info {
 };

 #ifdef CONFIG_X86_INTEL_MPX
-int mpx_fault_info(struct mpx_fault_info *info, struct pt_regs *regs);
-int mpx_handle_bd_fault(void);
+
+extern int mpx_fault_info(struct mpx_fault_info *info, struct pt_regs *regs);
+extern int mpx_handle_bd_fault(void);
+
 static inline int kernel_managing_mpx_tables(struct mm_struct *mm)
 {
 	return (mm->context.bd_addr != MPX_INVALID_BOUNDS_DIR);
 }
+
 static inline void mpx_mm_init(struct mm_struct *mm)
 {
 	/*
@@ -78,11 +81,10 @@ static inline void mpx_mm_init(struct mm
 	 */
 	mm->context.bd_addr = MPX_INVALID_BOUNDS_DIR;
 }
-void mpx_notify_unmap(struct mm_struct *mm, struct vm_area_struct *vma,
-		      unsigned long start, unsigned long end);

-unsigned long mpx_unmapped_area_check(unsigned long addr, unsigned long len,
-		unsigned long flags);
+extern void mpx_notify_unmap(struct mm_struct *mm, unsigned long start, unsigned long end);
+extern unsigned long mpx_unmapped_area_check(unsigned long addr, unsigned long len, unsigned long flags);
+
 #else
 static inline int mpx_fault_info(struct mpx_fault_info *info, struct pt_regs *regs)
 {
@@ -100,7 +102,6 @@ static inline void mpx_mm_init(struct mm
 {
 }
 static inline void mpx_notify_unmap(struct mm_struct *mm,
-				    struct vm_area_struct *vma,
 				    unsigned long start, unsigned long end)
 {
 }
--- a/arch/x86/mm/mpx.c
+++ b/arch/x86/mm/mpx.c
@@ -881,9 +881,10 @@ static int mpx_unmap_tables(struct mm_st
  * the virtual address region start...end have already been split if
  * necessary, and the 'vma' is the first vma in this range (start -> end).
  */
-void mpx_notify_unmap(struct mm_struct *mm, struct vm_area_struct *vma,
-		unsigned long start, unsigned long end)
+void mpx_notify_unmap(struct mm_struct *mm, unsigned long start,
+		      unsigned long end)
 {
+	struct vm_area_struct *vma;
 	int ret;

 	/*
@@ -902,11 +903,12 @@ void mpx_notify_unmap(struct mm_struct *
 	 * which should not occur normally. Being strict about it here
 	 * helps ensure that we do not have an exploitable stack overflow.
 	 */
-	do {
+	vma = find_vma(mm, start);
+	while (vma && vma->vm_start < end) {
 		if (vma->vm_flags & VM_MPX)
 			return;
 		vma = vma->vm_next;
-	} while (vma && vma->vm_start < end);
+	}

 	ret = mpx_unmap_tables(mm, start, end);
 	if (ret)
--- a/include/asm-generic/mm_hooks.h
+++ b/include/asm-generic/mm_hooks.h
@@ -18,7 +18,6 @@ static inline void arch_exit_mmap(struct
 }

 static inline void arch_unmap(struct mm_struct *mm,
-			struct vm_area_struct *vma,
 			unsigned long start, unsigned long end)
 {
 }
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -2736,9 +2736,17 @@ int __do_munmap(struct mm_struct *mm, un
 		return -EINVAL;

 	len = PAGE_ALIGN(len);
+	end = start + len;
 	if (len == 0)
 		return -EINVAL;

+	/*
+	 * arch_unmap() might do unmaps itself.  It must be called
+	 * and finish any rbtree manipulation before this code
+	 * runs and also starts to manipulate the rbtree.
+	 */
+	arch_unmap(mm, start, end);
+
 	/* Find the first overlapping VMA */
 	vma = find_vma(mm, start);
 	if (!vma)
@@ -2747,7 +2755,6 @@ int __do_munmap(struct mm_struct *mm, un
 	/* we have  start < vma->vm_end  */

 	/* if it doesn't overlap, we have nothing.. */
-	end = start + len;
 	if (vma->vm_start >= end)
 		return 0;

@@ -2817,12 +2824,6 @@ int __do_munmap(struct mm_struct *mm, un
 	/* Detach vmas from rbtree */
 	detach_vmas_to_be_unmapped(mm, vma, prev, end);

-	/*
-	 * mpx unmap needs to be called with mmap_sem held for write.
-	 * It is safe to call it before unmap_region().
-	 */
-	arch_unmap(mm, vma, start, end);
-
 	if (downgrade)
 		downgrade_write(&mm->mmap_sem);

^ permalink raw reply

* [PATCH 5.1 084/122] x86/mpx, mm/core: Fix recursive munmap() corruption
From: Greg Kroah-Hartman @ 2019-05-23 19:06 UTC (permalink / raw)
  To: linux-kernel
  Cc: Michal Hocko, Peter Zijlstra, Dave Hansen, linux-mm,
	Paul Mackerras, H. Peter Anvin, Ingo Molnar, Anton Ivanov,
	linux-arch, Richard Weinberger, H.J. Lu, Rik van Riel, Jeff Dike,
	linux-um, linuxppc-dev, Borislav Petkov, Andy Lutomirski,
	Yang Shi, Guan Xuetao, Vlastimil Babka, Richard Biener,
	Greg Kroah-Hartman, stable, Andrew Morton, Linus Torvalds
In-Reply-To: <20190523181705.091418060@linuxfoundation.org>

From: Dave Hansen <dave.hansen@linux.intel.com>

commit 5a28fc94c9143db766d1ba5480cae82d856ad080 upstream.

This is a bit of a mess, to put it mildly.  But, it's a bug
that only seems to have showed up in 4.20 but wasn't noticed
until now, because nobody uses MPX.

MPX has the arch_unmap() hook inside of munmap() because MPX
uses bounds tables that protect other areas of memory.  When
memory is unmapped, there is also a need to unmap the MPX
bounds tables.  Barring this, unused bounds tables can eat 80%
of the address space.

But, the recursive do_munmap() that gets called vi arch_unmap()
wreaks havoc with __do_munmap()'s state.  It can result in
freeing populated page tables, accessing bogus VMA state,
double-freed VMAs and more.

See the "long story" further below for the gory details.

To fix this, call arch_unmap() before __do_unmap() has a chance
to do anything meaningful.  Also, remove the 'vma' argument
and force the MPX code to do its own, independent VMA lookup.

== UML / unicore32 impact ==

Remove unused 'vma' argument to arch_unmap().  No functional
change.

I compile tested this on UML but not unicore32.

== powerpc impact ==

powerpc uses arch_unmap() well to watch for munmap() on the
VDSO and zeroes out 'current->mm->context.vdso_base'.  Moving
arch_unmap() makes this happen earlier in __do_munmap().  But,
'vdso_base' seems to only be used in perf and in the signal
delivery that happens near the return to userspace.  I can not
find any likely impact to powerpc, other than the zeroing
happening a little earlier.

powerpc does not use the 'vma' argument and is unaffected by
its removal.

I compile-tested a 64-bit powerpc defconfig.

== x86 impact ==

For the common success case this is functionally identical to
what was there before.  For the munmap() failure case, it's
possible that some MPX tables will be zapped for memory that
continues to be in use.  But, this is an extraordinarily
unlikely scenario and the harm would be that MPX provides no
protection since the bounds table got reset (zeroed).

I can't imagine anyone doing this:

	ptr = mmap();
	// use ptr
	ret = munmap(ptr);
	if (ret)
		// oh, there was an error, I'll
		// keep using ptr.

Because if you're doing munmap(), you are *done* with the
memory.  There's probably no good data in there _anyway_.

This passes the original reproducer from Richard Biener as
well as the existing mpx selftests/.

The long story:

munmap() has a couple of pieces:

 1. Find the affected VMA(s)
 2. Split the start/end one(s) if neceesary
 3. Pull the VMAs out of the rbtree
 4. Actually zap the memory via unmap_region(), including
    freeing page tables (or queueing them to be freed).
 5. Fix up some of the accounting (like fput()) and actually
    free the VMA itself.

This specific ordering was actually introduced by:

  dd2283f2605e ("mm: mmap: zap pages with read mmap_sem in munmap")

during the 4.20 merge window.  The previous __do_munmap() code
was actually safe because the only thing after arch_unmap() was
remove_vma_list().  arch_unmap() could not see 'vma' in the
rbtree because it was detached, so it is not even capable of
doing operations unsafe for remove_vma_list()'s use of 'vma'.

Richard Biener reported a test that shows this in dmesg:

  [1216548.787498] BUG: Bad rss-counter state mm:0000000017ce560b idx:1 val:551
  [1216548.787500] BUG: non-zero pgtables_bytes on freeing mm: 24576

What triggered this was the recursive do_munmap() called via
arch_unmap().  It was freeing page tables that has not been
properly zapped.

But, the problem was bigger than this.  For one, arch_unmap()
can free VMAs.  But, the calling __do_munmap() has variables
that *point* to VMAs and obviously can't handle them just
getting freed while the pointer is still in use.

I tried a couple of things here.  First, I tried to fix the page
table freeing problem in isolation, but I then found the VMA
issue.  I also tried having the MPX code return a flag if it
modified the rbtree which would force __do_munmap() to re-walk
to restart.  That spiralled out of control in complexity pretty
fast.

Just moving arch_unmap() and accepting that the bonkers failure
case might eat some bounds tables seems like the simplest viable
fix.

This was also reported in the following kernel bugzilla entry:

  https://bugzilla.kernel.org/show_bug.cgi?id=203123

There are some reports that this commit triggered this bug:

  dd2283f2605 ("mm: mmap: zap pages with read mmap_sem in munmap")

While that commit certainly made the issues easier to hit, I believe
the fundamental issue has been with us as long as MPX itself, thus
the Fixes: tag below is for one of the original MPX commits.

[ mingo: Minor edits to the changelog and the patch. ]

Reported-by: Richard Biener <rguenther@suse.de>
Reported-by: H.J. Lu <hjl.tools@gmail.com>
Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
Reviewed-by Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Yang Shi <yang.shi@linux.alibaba.com>
Acked-by: Michael Ellerman <mpe@ellerman.id.au>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Anton Ivanov <anton.ivanov@cambridgegreys.com>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Guan Xuetao <gxt@pku.edu.cn>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Jeff Dike <jdike@addtoit.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Richard Weinberger <richard@nod.at>
Cc: Rik van Riel <riel@surriel.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: linux-arch@vger.kernel.org
Cc: linux-mm@kvack.org
Cc: linux-um@lists.infradead.org
Cc: linuxppc-dev@lists.ozlabs.org
Cc: stable@vger.kernel.org
Fixes: dd2283f2605e ("mm: mmap: zap pages with read mmap_sem in munmap")
Link: http://lkml.kernel.org/r/20190419194747.5E1AD6DC@viggo.jf.intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>

---
 arch/powerpc/include/asm/mmu_context.h   |    1 -
 arch/um/include/asm/mmu_context.h        |    1 -
 arch/unicore32/include/asm/mmu_context.h |    1 -
 arch/x86/include/asm/mmu_context.h       |    6 +++---
 arch/x86/include/asm/mpx.h               |   15 ++++++++-------
 arch/x86/mm/mpx.c                        |   10 ++++++----
 include/asm-generic/mm_hooks.h           |    1 -
 mm/mmap.c                                |   15 ++++++++-------
 8 files changed, 25 insertions(+), 25 deletions(-)

--- a/arch/powerpc/include/asm/mmu_context.h
+++ b/arch/powerpc/include/asm/mmu_context.h
@@ -237,7 +237,6 @@ extern void arch_exit_mmap(struct mm_str
 #endif

 static inline void arch_unmap(struct mm_struct *mm,
-			      struct vm_area_struct *vma,
 			      unsigned long start, unsigned long end)
 {
 	if (start <= mm->context.vdso_base && mm->context.vdso_base < end)
--- a/arch/um/include/asm/mmu_context.h
+++ b/arch/um/include/asm/mmu_context.h
@@ -22,7 +22,6 @@ static inline int arch_dup_mmap(struct m
 }
 extern void arch_exit_mmap(struct mm_struct *mm);
 static inline void arch_unmap(struct mm_struct *mm,
-			struct vm_area_struct *vma,
 			unsigned long start, unsigned long end)
 {
 }
--- a/arch/unicore32/include/asm/mmu_context.h
+++ b/arch/unicore32/include/asm/mmu_context.h
@@ -88,7 +88,6 @@ static inline int arch_dup_mmap(struct m
 }

 static inline void arch_unmap(struct mm_struct *mm,
-			struct vm_area_struct *vma,
 			unsigned long start, unsigned long end)
 {
 }
--- a/arch/x86/include/asm/mmu_context.h
+++ b/arch/x86/include/asm/mmu_context.h
@@ -277,8 +277,8 @@ static inline void arch_bprm_mm_init(str
 	mpx_mm_init(mm);
 }

-static inline void arch_unmap(struct mm_struct *mm, struct vm_area_struct *vma,
-			      unsigned long start, unsigned long end)
+static inline void arch_unmap(struct mm_struct *mm, unsigned long start,
+			      unsigned long end)
 {
 	/*
 	 * mpx_notify_unmap() goes and reads a rarely-hot
@@ -298,7 +298,7 @@ static inline void arch_unmap(struct mm_
 	 * consistently wrong.
 	 */
 	if (unlikely(cpu_feature_enabled(X86_FEATURE_MPX)))
-		mpx_notify_unmap(mm, vma, start, end);
+		mpx_notify_unmap(mm, start, end);
 }

 /*
--- a/arch/x86/include/asm/mpx.h
+++ b/arch/x86/include/asm/mpx.h
@@ -64,12 +64,15 @@ struct mpx_fault_info {
 };

 #ifdef CONFIG_X86_INTEL_MPX
-int mpx_fault_info(struct mpx_fault_info *info, struct pt_regs *regs);
-int mpx_handle_bd_fault(void);
+
+extern int mpx_fault_info(struct mpx_fault_info *info, struct pt_regs *regs);
+extern int mpx_handle_bd_fault(void);
+
 static inline int kernel_managing_mpx_tables(struct mm_struct *mm)
 {
 	return (mm->context.bd_addr != MPX_INVALID_BOUNDS_DIR);
 }
+
 static inline void mpx_mm_init(struct mm_struct *mm)
 {
 	/*
@@ -78,11 +81,10 @@ static inline void mpx_mm_init(struct mm
 	 */
 	mm->context.bd_addr = MPX_INVALID_BOUNDS_DIR;
 }
-void mpx_notify_unmap(struct mm_struct *mm, struct vm_area_struct *vma,
-		      unsigned long start, unsigned long end);

-unsigned long mpx_unmapped_area_check(unsigned long addr, unsigned long len,
-		unsigned long flags);
+extern void mpx_notify_unmap(struct mm_struct *mm, unsigned long start, unsigned long end);
+extern unsigned long mpx_unmapped_area_check(unsigned long addr, unsigned long len, unsigned long flags);
+
 #else
 static inline int mpx_fault_info(struct mpx_fault_info *info, struct pt_regs *regs)
 {
@@ -100,7 +102,6 @@ static inline void mpx_mm_init(struct mm
 {
 }
 static inline void mpx_notify_unmap(struct mm_struct *mm,
-				    struct vm_area_struct *vma,
 				    unsigned long start, unsigned long end)
 {
 }
--- a/arch/x86/mm/mpx.c
+++ b/arch/x86/mm/mpx.c
@@ -881,9 +881,10 @@ static int mpx_unmap_tables(struct mm_st
  * the virtual address region start...end have already been split if
  * necessary, and the 'vma' is the first vma in this range (start -> end).
  */
-void mpx_notify_unmap(struct mm_struct *mm, struct vm_area_struct *vma,
-		unsigned long start, unsigned long end)
+void mpx_notify_unmap(struct mm_struct *mm, unsigned long start,
+		      unsigned long end)
 {
+	struct vm_area_struct *vma;
 	int ret;

 	/*
@@ -902,11 +903,12 @@ void mpx_notify_unmap(struct mm_struct *
 	 * which should not occur normally. Being strict about it here
 	 * helps ensure that we do not have an exploitable stack overflow.
 	 */
-	do {
+	vma = find_vma(mm, start);
+	while (vma && vma->vm_start < end) {
 		if (vma->vm_flags & VM_MPX)
 			return;
 		vma = vma->vm_next;
-	} while (vma && vma->vm_start < end);
+	}

 	ret = mpx_unmap_tables(mm, start, end);
 	if (ret)
--- a/include/asm-generic/mm_hooks.h
+++ b/include/asm-generic/mm_hooks.h
@@ -18,7 +18,6 @@ static inline void arch_exit_mmap(struct
 }

 static inline void arch_unmap(struct mm_struct *mm,
-			struct vm_area_struct *vma,
 			unsigned long start, unsigned long end)
 {
 }
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -2735,9 +2735,17 @@ int __do_munmap(struct mm_struct *mm, un
 		return -EINVAL;

 	len = PAGE_ALIGN(len);
+	end = start + len;
 	if (len == 0)
 		return -EINVAL;

+	/*
+	 * arch_unmap() might do unmaps itself.  It must be called
+	 * and finish any rbtree manipulation before this code
+	 * runs and also starts to manipulate the rbtree.
+	 */
+	arch_unmap(mm, start, end);
+
 	/* Find the first overlapping VMA */
 	vma = find_vma(mm, start);
 	if (!vma)
@@ -2746,7 +2754,6 @@ int __do_munmap(struct mm_struct *mm, un
 	/* we have  start < vma->vm_end  */

 	/* if it doesn't overlap, we have nothing.. */
-	end = start + len;
 	if (vma->vm_start >= end)
 		return 0;

@@ -2816,12 +2823,6 @@ int __do_munmap(struct mm_struct *mm, un
 	/* Detach vmas from rbtree */
 	detach_vmas_to_be_unmapped(mm, vma, prev, end);

-	/*
-	 * mpx unmap needs to be called with mmap_sem held for write.
-	 * It is safe to call it before unmap_region().
-	 */
-	arch_unmap(mm, vma, start, end);
-
 	if (downgrade)
 		downgrade_write(&mm->mmap_sem);

^ permalink raw reply

* Re: [BISECTED] kexec regression on PowerBook G4
From: Aaro Koskinen @ 2019-05-23 22:23 UTC (permalink / raw)
  To: Christophe Leroy; +Cc: linuxppc-dev
In-Reply-To: <dc24cfa0-cefa-3245-a9aa-5493b094ffac@c-s.fr>

Hi,

On Thu, May 23, 2019 at 08:58:11PM +0200, Christophe Leroy wrote:
> Le 23/05/2019 à 19:27, Aaro Koskinen a écrit :
> >On Thu, May 23, 2019 at 07:33:38AM +0200, Christophe Leroy wrote:
> >>Ok, the Oops confirms that the error is due to executing the kexec control
> >>code which is located outside the kernel text area.
> >>
> >>My yesterday's proposed change doesn't work because on book3S/32, NX
> >>protection is based on setting segments to NX, and using IBATs for kernel
> >>text.
> >>
> >>Can you try the patch I sent out a few minutes ago ?
> >>(https://patchwork.ozlabs.org/patch/1103827/)
> >
> >It now crashes with "BUG: Unable to handle kernel instruction fetch"
> >and the faulting address is 0xef13a000.
> 
> Ok.
> 
> Can you try with both changes at the same time, ie the mtsrin(...) and the
> change_page_attr() ?
> 
> I suspect that allthough the HW is not able to check EXEC flag, the SW will
> check it before loading the hash entry.

Unfortunately still no luck... The crash is pretty much the same with both
changes.

A.

^ permalink raw reply

* Re: [PATCH] ASoC: fsl_esai: fix the channel swap issue after xrun
From: Nicolin Chen @ 2019-05-23 22:50 UTC (permalink / raw)
  To: broonie@kernel.org, S.j. Wang
  Cc: alsa-devel@alsa-project.org, timur@kernel.org,
	Xiubo.Lee@gmail.com, festevam@gmail.com,
	linux-kernel@vger.kernel.org, linuxppc-dev@lists.ozlabs.org
In-Reply-To: <VE1PR04MB6479FF8E1B55E9BE67E7B0ECE3010@VE1PR04MB6479.eurprd04.prod.outlook.com>

On Thu, May 23, 2019 at 11:04:03AM +0000, S.j. Wang wrote:
> > On Thu, May 23, 2019 at 09:53:42AM +0000, S.j. Wang wrote:
> > > > > +     /*
> > > > > +      * Add fifo reset here, because the regcache_sync will
> > > > > +      * write one more data to ETDR.
> > > > > +      * Which will cause channel shift.
> > > >
> > > > Sounds like a bug to me...should fix it first by marking the data
> > > > registers as volatile.
> > > >
> > > The ETDR is a writable register, it is not volatile. Even we change it
> > > to Volatile, I don't think we can't avoid this issue. for the
> > > regcache_sync Just to write this register, it is correct behavior.
> > 
> > Is that so? Quoting the comments of regcache_sync():
> > "* regcache_sync - Sync the register cache with the hardware.
> >  *
> >  * @map: map to configure.
> >  *
> >  * Any registers that should not be synced should be marked as
> >  * volatile."
> > 
> > If regcache_sync() does sync volatile registers too as you said, I don't mind
> > having this FIFO reset WAR for now, though I think this mismatch between
> > the comments and the actual behavior then should get people's attention.
> > 
> > Thank you
> 
> ETDR is not volatile,  if we mark it is volatile, is it correct?

Well, you have a point -- it might not be ideally true, but it sounds
like a correct fix to me according to this comments.

We can wait for Mark's comments or just send a patch to the mail list 
for review.

Thanks you

^ permalink raw reply

* [RFC PATCH v4 07/21] watchdog/hardlockup: Define a generic function to detect hardlockups
From: Ricardo Neri @ 2019-05-24  1:16 UTC (permalink / raw)
  To: Thomas Gleixner, Ingo Molnar, Borislav Petkov
  Cc: Peter Zijlstra, Ricardo Neri, Stephane Eranian, Paul Mackerras,
	H. Peter Anvin, sparclinux, Ashok Raj, Joerg Roedel, x86,
	Luis R. Rodriguez, Andi Kleen, Don Zickus, Ravi V. Shankar,
	Ricardo Neri, Frederic Weisbecker, Nicholas Piggin, Babu Moger,
	Mathieu Desnoyers, Tony Luck, Randy Dunlap, linux-kernel, iommu,
	Masami Hiramatsu, Suravee Suthikulpanit, Philippe Ombredanne,
	Colin Ian King, Andrew Morton, linuxppc-dev, David S. Miller
In-Reply-To: <1558660583-28561-1-git-send-email-ricardo.neri-calderon@linux.intel.com>

The procedure to detect hardlockups is independent of the underlying
mechanism that generates the non-maskable interrupt used to drive the
detector. Thus, it can be put in a separate, generic function. In this
manner, it can be invoked by various implementations of the NMI watchdog.

For this purpose, move the bulk of watchdog_overflow_callback() to the
new function inspect_for_hardlockups(). This function can then be called
from the applicable NMI handlers.

Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Ashok Raj <ashok.raj@intel.com>
Cc: Andi Kleen <andi.kleen@intel.com>
Cc: Tony Luck <tony.luck@intel.com>
Cc: Don Zickus <dzickus@redhat.com>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Frederic Weisbecker <frederic@kernel.org>
Cc: Babu Moger <Babu.Moger@amd.com>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Philippe Ombredanne <pombredanne@nexb.com>
Cc: Colin Ian King <colin.king@canonical.com>
Cc: "Luis R. Rodriguez" <mcgrof@kernel.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Suravee Suthikulpanit <Suravee.Suthikulpanit@amd.com>
Cc: "Ravi V. Shankar" <ravi.v.shankar@intel.com>
Cc: x86@kernel.org
Cc: sparclinux@vger.kernel.org
Cc: linuxppc-dev@lists.ozlabs.org
Signed-off-by: Ricardo Neri <ricardo.neri-calderon@linux.intel.com>
---
 include/linux/nmi.h   |  1 +
 kernel/watchdog_hld.c | 18 +++++++++++-------
 2 files changed, 12 insertions(+), 7 deletions(-)

diff --git a/include/linux/nmi.h b/include/linux/nmi.h
index 9003e29cde46..5a8b19749769 100644
--- a/include/linux/nmi.h
+++ b/include/linux/nmi.h
@@ -212,6 +212,7 @@ extern int proc_watchdog_thresh(struct ctl_table *, int ,
 				void __user *, size_t *, loff_t *);
 extern int proc_watchdog_cpumask(struct ctl_table *, int,
 				 void __user *, size_t *, loff_t *);
+void inspect_for_hardlockups(struct pt_regs *regs);
 
 #ifdef CONFIG_HAVE_ACPI_APEI_NMI
 #include <asm/nmi.h>
diff --git a/kernel/watchdog_hld.c b/kernel/watchdog_hld.c
index 247bf0b1582c..b352e507b17f 100644
--- a/kernel/watchdog_hld.c
+++ b/kernel/watchdog_hld.c
@@ -106,14 +106,8 @@ static struct perf_event_attr wd_hw_attr = {
 	.disabled	= 1,
 };
 
-/* Callback function for perf event subsystem */
-static void watchdog_overflow_callback(struct perf_event *event,
-				       struct perf_sample_data *data,
-				       struct pt_regs *regs)
+void inspect_for_hardlockups(struct pt_regs *regs)
 {
-	/* Ensure the watchdog never gets throttled */
-	event->hw.interrupts = 0;
-
 	if (__this_cpu_read(watchdog_nmi_touch) == true) {
 		__this_cpu_write(watchdog_nmi_touch, false);
 		return;
@@ -163,6 +157,16 @@ static void watchdog_overflow_callback(struct perf_event *event,
 	return;
 }
 
+/* Callback function for perf event subsystem */
+static void watchdog_overflow_callback(struct perf_event *event,
+				       struct perf_sample_data *data,
+				       struct pt_regs *regs)
+{
+	/* Ensure the watchdog never gets throttled */
+	event->hw.interrupts = 0;
+	inspect_for_hardlockups(regs);
+}
+
 static int hardlockup_detector_event_create(void)
 {
 	unsigned int cpu = smp_processor_id();
-- 
2.17.1


^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox