Linux userland API discussions

Linux userland API discussions
 help / color / mirror / Atom feed

* Re: [RFCv2 4/6] mm: factor out madvise's core functionality
From: Minchan Kim @ 2019-05-31 23:29 UTC (permalink / raw)
  To: Oleksandr Natalenko
  Cc: Andrew Morton, linux-mm, LKML, linux-api, Michal Hocko,
	Johannes Weiner, Tim Murray, Joel Fernandes, Suren Baghdasaryan,
	Daniel Colascione, Shakeel Butt, Sonny Rao, Brian Geffon, jannh,
	oleg, christian, hdanton
In-Reply-To: <20190531143545.jwmgzaigd4rbw2wy@butterfly.localdomain>

On Fri, May 31, 2019 at 04:35:45PM +0200, Oleksandr Natalenko wrote:
> On Fri, May 31, 2019 at 10:12:26PM +0900, Minchan Kim wrote:
> > On Fri, May 31, 2019 at 09:04:20AM +0200, Oleksandr Natalenko wrote:
> > > On Fri, May 31, 2019 at 03:43:11PM +0900, Minchan Kim wrote:
> > > > This patch factor out madvise's core functionality so that upcoming
> > > > patch can reuse it without duplication. It shouldn't change any behavior.
> > > > 
> > > > Signed-off-by: Minchan Kim <minchan@kernel.org>
> > > > ---
> > > >  mm/madvise.c | 188 +++++++++++++++++++++++++++------------------------
> > > >  1 file changed, 101 insertions(+), 87 deletions(-)
> > > > 
> > > > diff --git a/mm/madvise.c b/mm/madvise.c
> > > > index 9d749a1420b4..466623ea8c36 100644
> > > > --- a/mm/madvise.c
> > > > +++ b/mm/madvise.c
> > > > @@ -425,9 +425,10 @@ static int madvise_pageout_pte_range(pmd_t *pmd, unsigned long addr,
> > > >  	struct page *page;
> > > >  	int isolated = 0;
> > > >  	struct vm_area_struct *vma = walk->vma;
> > > > +	struct task_struct *task = walk->private;
> > > >  	unsigned long next;
> > > >  
> > > > -	if (fatal_signal_pending(current))
> > > > +	if (fatal_signal_pending(task))
> > > >  		return -EINTR;
> > > >  
> > > >  	next = pmd_addr_end(addr, end);
> > > > @@ -505,12 +506,14 @@ static int madvise_pageout_pte_range(pmd_t *pmd, unsigned long addr,
> > > >  }
> > > >  
> > > >  static void madvise_pageout_page_range(struct mmu_gather *tlb,
> > > > -			     struct vm_area_struct *vma,
> > > > -			     unsigned long addr, unsigned long end)
> > > > +				struct task_struct *task,
> > > > +				struct vm_area_struct *vma,
> > > > +				unsigned long addr, unsigned long end)
> > > >  {
> > > >  	struct mm_walk warm_walk = {
> > > >  		.pmd_entry = madvise_pageout_pte_range,
> > > >  		.mm = vma->vm_mm,
> > > > +		.private = task,
> > > >  	};
> > > >  
> > > >  	tlb_start_vma(tlb, vma);
> > > > @@ -519,9 +522,9 @@ static void madvise_pageout_page_range(struct mmu_gather *tlb,
> > > >  }
> > > >  
> > > >  
> > > > -static long madvise_pageout(struct vm_area_struct *vma,
> > > > -			struct vm_area_struct **prev,
> > > > -			unsigned long start_addr, unsigned long end_addr)
> > > > +static long madvise_pageout(struct task_struct *task,
> > > > +		struct vm_area_struct *vma, struct vm_area_struct **prev,
> > > > +		unsigned long start_addr, unsigned long end_addr)
> > > >  {
> > > >  	struct mm_struct *mm = vma->vm_mm;
> > > >  	struct mmu_gather tlb;
> > > > @@ -532,7 +535,7 @@ static long madvise_pageout(struct vm_area_struct *vma,
> > > >  
> > > >  	lru_add_drain();
> > > >  	tlb_gather_mmu(&tlb, mm, start_addr, end_addr);
> > > > -	madvise_pageout_page_range(&tlb, vma, start_addr, end_addr);
> > > > +	madvise_pageout_page_range(&tlb, task, vma, start_addr, end_addr);
> > > >  	tlb_finish_mmu(&tlb, start_addr, end_addr);
> > > >  
> > > >  	return 0;
> > > > @@ -744,7 +747,8 @@ static long madvise_dontneed_single_vma(struct vm_area_struct *vma,
> > > >  	return 0;
> > > >  }
> > > >  
> > > > -static long madvise_dontneed_free(struct vm_area_struct *vma,
> > > > +static long madvise_dontneed_free(struct mm_struct *mm,
> > > > +				  struct vm_area_struct *vma,
> > > >  				  struct vm_area_struct **prev,
> > > >  				  unsigned long start, unsigned long end,
> > > >  				  int behavior)
> > > > @@ -756,8 +760,8 @@ static long madvise_dontneed_free(struct vm_area_struct *vma,
> > > >  	if (!userfaultfd_remove(vma, start, end)) {
> > > >  		*prev = NULL; /* mmap_sem has been dropped, prev is stale */
> > > >  
> > > > -		down_read(&current->mm->mmap_sem);
> > > > -		vma = find_vma(current->mm, start);
> > > > +		down_read(&mm->mmap_sem);
> > > > +		vma = find_vma(mm, start);
> > > >  		if (!vma)
> > > >  			return -ENOMEM;
> > > >  		if (start < vma->vm_start) {
> > > > @@ -804,7 +808,8 @@ static long madvise_dontneed_free(struct vm_area_struct *vma,
> > > >   * Application wants to free up the pages and associated backing store.
> > > >   * This is effectively punching a hole into the middle of a file.
> > > >   */
> > > > -static long madvise_remove(struct vm_area_struct *vma,
> > > > +static long madvise_remove(struct mm_struct *mm,
> > > > +				struct vm_area_struct *vma,
> > > >  				struct vm_area_struct **prev,
> > > >  				unsigned long start, unsigned long end)
> > > >  {
> > > > @@ -838,13 +843,13 @@ static long madvise_remove(struct vm_area_struct *vma,
> > > >  	get_file(f);
> > > >  	if (userfaultfd_remove(vma, start, end)) {
> > > >  		/* mmap_sem was not released by userfaultfd_remove() */
> > > > -		up_read(&current->mm->mmap_sem);
> > > > +		up_read(&mm->mmap_sem);
> > > >  	}
> > > >  	error = vfs_fallocate(f,
> > > >  				FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
> > > >  				offset, end - start);
> > > >  	fput(f);
> > > > -	down_read(&current->mm->mmap_sem);
> > > > +	down_read(&mm->mmap_sem);
> > > >  	return error;
> > > >  }
> > > >  
> > > > @@ -918,21 +923,23 @@ static int madvise_inject_error(int behavior,
> > > >  #endif
> > > >  
> > > >  static long
> > > > -madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
> > > > +madvise_vma(struct task_struct *task, struct mm_struct *mm,
> > > > +		struct vm_area_struct *vma, struct vm_area_struct **prev,
> > > >  		unsigned long start, unsigned long end, int behavior)
> > > >  {
> > > >  	switch (behavior) {
> > > >  	case MADV_REMOVE:
> > > > -		return madvise_remove(vma, prev, start, end);
> > > > +		return madvise_remove(mm, vma, prev, start, end);
> > > >  	case MADV_WILLNEED:
> > > >  		return madvise_willneed(vma, prev, start, end);
> > > >  	case MADV_COLD:
> > > >  		return madvise_cold(vma, prev, start, end);
> > > >  	case MADV_PAGEOUT:
> > > > -		return madvise_pageout(vma, prev, start, end);
> > > > +		return madvise_pageout(task, vma, prev, start, end);
> > > >  	case MADV_FREE:
> > > >  	case MADV_DONTNEED:
> > > > -		return madvise_dontneed_free(vma, prev, start, end, behavior);
> > > > +		return madvise_dontneed_free(mm, vma, prev, start,
> > > > +						end, behavior);
> > > >  	default:
> > > >  		return madvise_behavior(vma, prev, start, end, behavior);
> > > >  	}
> > > > @@ -976,68 +983,8 @@ madvise_behavior_valid(int behavior)
> > > >  	}
> > > >  }
> > > >  
> > > > -/*
> > > > - * The madvise(2) system call.
> > > > - *
> > > > - * Applications can use madvise() to advise the kernel how it should
> > > > - * handle paging I/O in this VM area.  The idea is to help the kernel
> > > > - * use appropriate read-ahead and caching techniques.  The information
> > > > - * provided is advisory only, and can be safely disregarded by the
> > > > - * kernel without affecting the correct operation of the application.
> > > > - *
> > > > - * behavior values:
> > > > - *  MADV_NORMAL - the default behavior is to read clusters.  This
> > > > - *		results in some read-ahead and read-behind.
> > > > - *  MADV_RANDOM - the system should read the minimum amount of data
> > > > - *		on any access, since it is unlikely that the appli-
> > > > - *		cation will need more than what it asks for.
> > > > - *  MADV_SEQUENTIAL - pages in the given range will probably be accessed
> > > > - *		once, so they can be aggressively read ahead, and
> > > > - *		can be freed soon after they are accessed.
> > > > - *  MADV_WILLNEED - the application is notifying the system to read
> > > > - *		some pages ahead.
> > > > - *  MADV_DONTNEED - the application is finished with the given range,
> > > > - *		so the kernel can free resources associated with it.
> > > > - *  MADV_FREE - the application marks pages in the given range as lazy free,
> > > > - *		where actual purges are postponed until memory pressure happens.
> > > > - *  MADV_REMOVE - the application wants to free up the given range of
> > > > - *		pages and associated backing store.
> > > > - *  MADV_DONTFORK - omit this area from child's address space when forking:
> > > > - *		typically, to avoid COWing pages pinned by get_user_pages().
> > > > - *  MADV_DOFORK - cancel MADV_DONTFORK: no longer omit this area when forking.
> > > > - *  MADV_WIPEONFORK - present the child process with zero-filled memory in this
> > > > - *              range after a fork.
> > > > - *  MADV_KEEPONFORK - undo the effect of MADV_WIPEONFORK
> > > > - *  MADV_HWPOISON - trigger memory error handler as if the given memory range
> > > > - *		were corrupted by unrecoverable hardware memory failure.
> > > > - *  MADV_SOFT_OFFLINE - try to soft-offline the given range of memory.
> > > > - *  MADV_MERGEABLE - the application recommends that KSM try to merge pages in
> > > > - *		this area with pages of identical content from other such areas.
> > > > - *  MADV_UNMERGEABLE- cancel MADV_MERGEABLE: no longer merge pages with others.
> > > > - *  MADV_HUGEPAGE - the application wants to back the given range by transparent
> > > > - *		huge pages in the future. Existing pages might be coalesced and
> > > > - *		new pages might be allocated as THP.
> > > > - *  MADV_NOHUGEPAGE - mark the given range as not worth being backed by
> > > > - *		transparent huge pages so the existing pages will not be
> > > > - *		coalesced into THP and new pages will not be allocated as THP.
> > > > - *  MADV_DONTDUMP - the application wants to prevent pages in the given range
> > > > - *		from being included in its core dump.
> > > > - *  MADV_DODUMP - cancel MADV_DONTDUMP: no longer exclude from core dump.
> > > > - *
> > > > - * return values:
> > > > - *  zero    - success
> > > > - *  -EINVAL - start + len < 0, start is not page-aligned,
> > > > - *		"behavior" is not a valid value, or application
> > > > - *		is attempting to release locked or shared pages,
> > > > - *		or the specified address range includes file, Huge TLB,
> > > > - *		MAP_SHARED or VMPFNMAP range.
> > > > - *  -ENOMEM - addresses in the specified range are not currently
> > > > - *		mapped, or are outside the AS of the process.
> > > > - *  -EIO    - an I/O error occurred while paging in data.
> > > > - *  -EBADF  - map exists, but area maps something that isn't a file.
> > > > - *  -EAGAIN - a kernel resource was temporarily unavailable.
> > > > - */
> > > > -SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
> > > > +static int madvise_core(struct task_struct *task, struct mm_struct *mm,
> > > > +			unsigned long start, size_t len_in, int behavior)
> > > 
> > > Just a minor nitpick, but can we please have it named madvise_common,
> > > not madvise_core? This would follow a usual naming scheme, when some
> > > common functionality is factored out (like, for mutexes, semaphores
> > > etc), and within the kernel "core" usually means something completely
> > > different.
> > 
> > Sure.
> > 
> > > 
> > > >  {
> > > >  	unsigned long end, tmp;
> > > >  	struct vm_area_struct *vma, *prev;
> > > > @@ -1068,15 +1015,16 @@ SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
> > > >  
> > > >  #ifdef CONFIG_MEMORY_FAILURE
> > > >  	if (behavior == MADV_HWPOISON || behavior == MADV_SOFT_OFFLINE)
> > > > -		return madvise_inject_error(behavior, start, start + len_in);
> > > > +		return madvise_inject_error(behavior,
> > > > +					start, start + len_in);
> > > 
> > > Not sure what this change is about except changing the line length.
> > > Note, madvise_inject_error() still operates on "current" through
> > > get_user_pages_fast() and gup_pgd_range(), but that was not changed
> > > here. I Know you've filtered out this hint later, so technically this
> > > is not an issue, but, maybe, this needs some attention too since we've
> > > already spotted it?
> > 
> > It is leftover I had done. I actually modified it to handle remote
> > task but changed my mind not to fix it because process_madvise
> > will not support it at this moment. I'm not sure it's a good idea
> > to change it for *might-be-done-in-future* at this moment even though
> > we have spotted.
> 
> I'd expect to have at least some comments in code on why other hints
> are disabled, so if we already know some shortcomings, this information
> would not be lost.

Okay, I will add some comment but do not want to fix code piece until
someone want to expose the poisoning to external process.

> 
> Of course, I don't care much about memory poisoning, but if it can be
> addressed now, let's address it now.
> 
> > 
> > > 
> > > >  #endif
> > > >  
> > > >  	write = madvise_need_mmap_write(behavior);
> > > >  	if (write) {
> > > > -		if (down_write_killable(&current->mm->mmap_sem))
> > > > +		if (down_write_killable(&mm->mmap_sem))
> > > >  			return -EINTR;
> > > 
> > > Do you still need that trick with mmget_still_valid() here?
> > > Something like:
> > 
> > Since MADV_COLD|PAGEOUT doesn't change address space layout or
> > vma->vm_flags, technically, we don't need it if I understand
> > correctly. Right?
> 
> I'd expect so, yes. But.
> 
> Since we want this interface to be universal and to be able to cover
> various needs, and since my initial intention with working in this
> direction involved KSM, I'd ask you to enable KSM hints too, and once
> (and if) that happens, the work there is done under write lock, and
> you'll need this trick to be applied.
> 
> Of course, I can do that myself later in a subsequent patch series once
> (and, again, if) your series is merged, but, maybe, we can cover this
> already especially given the fact that KSM hinting is a relatively easy
> task in this pile. I did some preliminary tests with it, and so far no
> dragons have started to roar.

Then, do you mind sending a patch based upon this series to expose
MADV_MERGEABLE to process_madvise? It will have the right description
why you want to have such feature which I couldn't provide since I don't
have enough material to write the motivation. And the patch also could
include the logic to prevent coredump race, which is more proper since
finally we need to hold mmap_sem write-side lock, finally.
I will pick it up and will rebase since then.

Thanks.

^ permalink raw reply

* Re: [PATCH 2/5] Add fchmodat4(), a new syscall
From: Palmer Dabbelt @ 2019-05-31 23:36 UTC (permalink / raw)
  To: Arnd Bergmann
  Cc: viro, linux-kernel, linux-fsdevel, linux-api, linux-arch, x86,
	luto, tglx, mingo, bp, hpa
In-Reply-To: <CAK8P3a3HPeVq29k3Zk5rSk4bddiUQFrdEgDZUgdNnYZK+8QpGw@mail.gmail.com>

On Fri, 31 May 2019 12:51:00 PDT (-0700), Arnd Bergmann wrote:
> On Fri, May 31, 2019 at 9:23 PM Palmer Dabbelt <palmer@sifive.com> wrote:
>>
>> man 3p says that fchmodat() takes a flags argument, but the Linux
>> syscall does not.  There doesn't appear to be a good userspace
>> workaround for this issue but the implementation in the kernel is pretty
>> straight-forward.  The specific use case where the missing flags came up
>> was WRT a fuse filesystem implemenation, but the functionality is pretty
>> generic so I'm assuming there would be other use cases.
>>
>> Signed-off-by: Palmer Dabbelt <palmer@sifive.com>
>> ---
>>  fs/open.c                | 21 +++++++++++++++++++--
>>  include/linux/syscalls.h |  5 +++++
>>  2 files changed, 24 insertions(+), 2 deletions(-)
>>
>> diff --git a/fs/open.c b/fs/open.c
>> index a00350018a47..cfad7684e8d3 100644
>> --- a/fs/open.c
>> +++ b/fs/open.c
>> @@ -568,11 +568,17 @@ SYSCALL_DEFINE2(fchmod, unsigned int, fd, umode_t, mode)
>>         return ksys_fchmod(fd, mode);
>>  }
>>
>> -int do_fchmodat(int dfd, const char __user *filename, umode_t mode)
>> +int do_fchmodat4(int dfd, const char __user *filename, umode_t mode, int flags)
> ...
>> +
>> +int do_fchmodat(int dfd, const char __user *filename, umode_t mode)
>> +{
>> +       return do_fchmodat4(dfd, filename, mode, 0);
>> +}
>> +
>
> There is only one external caller of do_fchmodat(), so just change that
> to pass the extra argument here, and keep a single do_fchmodat()
> function used by the sys_fchmod(), sys_fchmod4(), sys_chmod()
> and ksys_chmod().

OK, I'll roll that up into a v2.

^ permalink raw reply

* Re: [PATCH 3/5] asm-generic: Register fchmodat4 as syscall 428
From: Palmer Dabbelt @ 2019-05-31 23:57 UTC (permalink / raw)
  To: Arnd Bergmann
  Cc: viro, linux-kernel, linux-fsdevel, linux-api, linux-arch, x86,
	luto, tglx, mingo, bp, hpa
In-Reply-To: <CAK8P3a2=xko56LbwV4tyhyyyX+tw+EV-NGavYEYj0q61t=mnwg@mail.gmail.com>

On Fri, 31 May 2019 12:56:39 PDT (-0700), Arnd Bergmann wrote:
> On Fri, May 31, 2019 at 9:23 PM Palmer Dabbelt <palmer@sifive.com> wrote:
>>
>> Signed-off-by: Palmer Dabbelt <palmer@sifive.com>
>
> As usual, each patch needs a changelog text. I would prefer having a single
> patch here that changes /all/ system call tables at once, rather than doing one
> at a time like we used to.

Works for me.  That also gives me something to write about it the text :)

> In linux-next, we are at number 434 now, and there are a couple of other
> new system calls being proposed right now (more than usual), so you may
> have to change the number a few times.

OK, no problem.  It'll be a bit easier to handle the number that way.

> Note: most architectures use .tbl files now, the exceptions are
> include/uapi/asm-generic/unistd.h and arch/arm64/include/asm/unistd32.h,
> and the latter also requires changing __NR_compat_syscalls in asm/unistd.h.
>
> Numbers should now be the same across architectures, except for alpha,
> which has a +110 offset. We have discussed ways to have a single
> file to modify for a new call on all architectures, but no patches yet.

OK, thanks.  I'll wait a bit for feedback, but unless there's anything else
I'll go ahead and finish this.

^ permalink raw reply

* Re: [RFC][PATCH] link.2: AT_ATOMIC_DATA and AT_ATOMIC_METADATA
From: Amir Goldstein @ 2019-06-01  7:21 UTC (permalink / raw)
  To: Dave Chinner
  Cc: Theodore Ts'o, Jan Kara, Darrick J . Wong, Chris Mason,
	Al Viro, linux-fsdevel, linux-xfs, Ext4, Linux Btrfs, Linux API,
	Ric Wheeler
In-Reply-To: <20190531224549.GF29573@dread.disaster.area>

On Sat, Jun 1, 2019 at 1:46 AM Dave Chinner <david@fromorbit.com> wrote:
>
> On Fri, May 31, 2019 at 12:41:36PM -0400, Theodore Ts'o wrote:
> > On Fri, May 31, 2019 at 06:21:45PM +0300, Amir Goldstein wrote:
> > > What do you think of:
> > >
> > > "AT_ATOMIC_DATA (since Linux 5.x)
> > > A filesystem which accepts this flag will guarantee that if the linked file
> > > name exists after a system crash, then all of the data written to the file
> > > and all of the file's metadata at the time of the linkat(2) call will be
> > > visible.
> >
> > ".... will be visible after the the file system is remounted".  (Never
> > hurts to be explicit.)
> >
> > > The way to achieve this guarantee on old kernels is to call fsync (2)
> > > before linking the file, but doing so will also results in flushing of
> > > volatile disk caches.
> > >
> > > A filesystem which accepts this flag does NOT
> > > guarantee that any of the file hardlinks will exist after a system crash,
> > > nor that the last observed value of st_nlink (see stat (2)) will persist."
> > >
> >
> > This is I think more precise:
> >
> >     This guarantee can be achieved by calling fsync(2) before linking
> >     the file, but there may be more performant ways to provide these
> >     semantics.  In particular, note that the use of the AT_ATOMIC_DATA
> >     flag does *not* guarantee that the new link created by linkat(2)
> >     will be persisted after a crash.
>
> So here's the *implementation* problem I see with this definition of
> AT_ATOMIC_DATA. After linkat(dirfd, name, AT_ATOMIC_DATA), there is
> no guarantee that the data is on disk or that the link is present.
>
> However:
>
>         linkat(dirfd, name, AT_ATOMIC_DATA);
>         fsync(dirfd);
>
> Suddenly changes all that.
>
> i.e. when we fsync(dirfd) we guarantee that "name" is present in the
> directory and because we used AT_ATOMIC_DATA it implies that the
> data pointed to by "name" must be present on disk. IOWs, what was
> once a pure directory sync operation now *must* fsync all the child
> inodes that have been linkat(AT_ATOMIC_DATA) since the last time the
> direct has been made stable.
>
> IOWs, the described AT_ATOMIC_DATA "we don't have to write the data
> during linkat() go-fast-get-out-of-gaol-free" behaviour isn't worth
> the pixels it is written on - it just moves all the complexity to
> directory fsync, and that's /already/ a behavioural minefield.

Where does it say we don't have to write the data during linkat()?
I was only talking about avoid FLUSH/FUA caused by fsync().
I wrote in commit message:
"First implementation of AT_ATOMIC_DATA is expected to be
filemap_write_and_wait() for xfs/ext4 and probably fdatasync for btrfs."

I failed to convey the reasoning for this flag to you.
It is *not* about the latency of the "atomic link" for the calling thread
It is about not interfering with other workloads running at the same time.

>
> IMO, the "work-around" of forcing filesystems to write back
> destination inodes during a link() operation is just nasty and will
> just end up with really weird performance anomalies occurring in
> production systems. That's not really a solution, either, especially
> as it is far, far faster for applications to use AIO_FSYNC and then
> on the completion callback run a normal linkat() operation...
>
> Hence, if I've understood these correctly, then I'll be recommending
> that XFS follows this:
>
> > We should also document that a file system which does not implement
> > this flag MUST return EINVAL if it is passed this flag to linkat(2).
>
> and returns -EINVAL to these flags because we do not have the change
> tracking infrastructure to handle these directory fsync semantics.
> I also suspect that, even if we could track this efficiently, we
> can't do the flushing atomically because of locking order
> constraints between directories, regular files, pages in the page
> cache, etc.

That is not at all what I had in mind for XFS with the flag.

>
> Given that we can already use AIO to provide this sort of ordering,
> and AIO is vastly faster than synchronous IO, I don't see any point
> in adding complex barrier interfaces that can be /easily implemented
> in userspace/ using existing AIO primitives. You should start
> thinking about expanding libaio with stuff like
> "link_after_fdatasync()" and suddenly the whole problem of
> filesystem data vs metadata ordering goes away because the
> application directly controls all ordering without blocking and
> doesn't need to care what the filesystem under it does....
>

OK. I can work with that. It's not that simple, but I will reply on
your next email, where you wrote more about this alternative.

Thanks,
Amir.

^ permalink raw reply

* Re: [RFC][PATCH] link.2: AT_ATOMIC_DATA and AT_ATOMIC_METADATA
From: Amir Goldstein @ 2019-06-01  8:01 UTC (permalink / raw)
  To: Dave Chinner
  Cc: Theodore Ts'o, Jan Kara, Darrick J . Wong, Chris Mason,
	Al Viro, linux-fsdevel, linux-xfs, Ext4, Linux Btrfs, Linux API
In-Reply-To: <20190531232852.GG29573@dread.disaster.area>

On Sat, Jun 1, 2019 at 2:28 AM Dave Chinner <david@fromorbit.com> wrote:
>
> On Sat, Jun 01, 2019 at 08:45:49AM +1000, Dave Chinner wrote:
> > Given that we can already use AIO to provide this sort of ordering,
> > and AIO is vastly faster than synchronous IO, I don't see any point
> > in adding complex barrier interfaces that can be /easily implemented
> > in userspace/ using existing AIO primitives. You should start
> > thinking about expanding libaio with stuff like
> > "link_after_fdatasync()" and suddenly the whole problem of
> > filesystem data vs metadata ordering goes away because the
> > application directly controls all ordering without blocking and
> > doesn't need to care what the filesystem under it does....
>
> And let me point out that this is also how userspace can do an
> efficient atomic rename - rename_after_fdatasync(). i.e. on
> completion of the AIO_FSYNC, run the rename. This guarantees that
> the application will see either the old file of the complete new
> file, and it *doesn't have to wait for the operation to complete*.
> Once it is in flight, the file will contain the old data until some
> point in the near future when will it contain the new data....

What I am looking for is a way to isolate the effects of "atomic rename/link"
from the rest of the users. Sure there is I/O bandwidth and queued
bios, but at least isolate other threads working on other files or metadata
from contending with the "atomic rename" thread of journal flushes and
the like. Actually, one of my use cases is "atomic rename" of files with
no data (looking for atomicity w.r.t xattr and mtime), so this "atomic rename"
thread should not be interfering with other workloads at all.

>
> Seriously, sit down and work out all the "atomic" data vs metadata
> behaviours you want, and then tell me how many of them cannot be
> implemented as "AIO_FSYNC w/ completion callback function" in
> userspace. This mechanism /guarantees ordering/ at the application
> level, the application does not block waiting for these data
> integrity operations to complete, and you don't need any new kernel
> side functionality to implement this.

So I think what I could have used is AIO_BATCH_FSYNC, an interface
that was proposed by Ric Wheeler and discussed on LSF:
https://lwn.net/Articles/789024/
Ric was looking for a way to efficiently fsync a "bunch of files".
Submitting several AIO_FSYNC calls is not the efficient way of doing that.
So it is either a new AIO_BATCH_FSYNC and a kernel implementation
that flushes the inodes and then calls ->sync_fs(), or a new AIO operation
that just does the ->sync_fs() bit and using sync_file_range() for the inodes.

To be more accurate, the AIO operation that would emulate my
proposed API more closely is AIO_WAIT_FOR_SYNCFS, as I do not wish
to impose excessive journal flushes, I just need a completion callback
when they happened to perform the rename/link.

>
> Fundamentally, the assertion that disk cache flushes are not what
> causes fsync "to be slow" is incorrect. It's the synchronous

Too many double negatives. I am not sure I parsed this correctly.
But I think by now you understand that I don't care that fsync is "slow".
I care about frequent fsyncs making the entire system slow down.

Heck, xfs even has a mitigation in place to improve performance
of too frequent fsyncs, but that mitigation is partly gone since
47c7d0b19502 xfs: fix incorrect log_flushed on fsync

The situation with frequent fsync on ext4 at the moment is probably
worse.

I am trying to reduce the number of fsyncs from applications
and converting fsync to AIO_FSYNC is not going to help with that.

Thanks,
Amir.

^ permalink raw reply

* Re: [PATCH v3 16/16] fpga: dfl: fme: add performance reporting support
From: Wu Hao @ 2019-06-01  8:04 UTC (permalink / raw)
  To: Alan Tull
  Cc: Moritz Fischer, linux-fpga, linux-kernel, linux-api, Luwei Kang,
	Xu Yilun
In-Reply-To: <CANk1AXTNzzq6LM+iC05nBkh3zydtYmD6REWN2sJ1BBLjoe8zyA@mail.gmail.com>

On Thu, May 30, 2019 at 01:53:23PM -0500, Alan Tull wrote:
> On Mon, May 27, 2019 at 12:39 AM Wu Hao <hao.wu@intel.com> wrote:
> 
> Hi Hao,
> 
> Just one correction that I saw below, sorry I didn't catch it last time.

Hi Alan

Thanks for the review. I will remove the code below which is not used.

Hao

> 
> >
> > This patch adds support for performance reporting private feature
> > for FPGA Management Engine (FME). Actually it supports 4 categories
> > performance counters, 'clock', 'cache', 'iommu' and 'fabric', user
> > could read the performance counter via exposed sysfs interfaces.
> > Please refer to sysfs doc for more details.
> >
> > Signed-off-by: Luwei Kang <luwei.kang@intel.com>
> > Signed-off-by: Xu Yilun <yilun.xu@intel.com>
> > Signed-off-by: Wu Hao <hao.wu@intel.com>
> 
> Acked-by: Alan Tull <atull@kernel.org>
> 
> > ---
> > v3: replace scnprintf with sprintf in sysfs interfaces.
> >     update sysfs doc kernel version and date.
> >     fix sysfs doc issue for fabric counter.
> >     refine PERF_OBJ_ATTR_* macro, doesn't count on __ATTR anymore.
> >     introduce PERF_OBJ_ATTR_F_* macro, as it needs to use different
> >     filenames for some of the sysfs attributes.
> >     remove kobject_del when destroy kobject, kobject_put is enough.
> >     do sysfs_remove_groups first when destroying perf_obj.
> >     WARN_ON_ONCE in case internal parms are wrong in read_*_count().
> > ---
> >  Documentation/ABI/testing/sysfs-platform-dfl-fme |  93 +++
> >  drivers/fpga/Makefile                            |   1 +
> >  drivers/fpga/dfl-fme-main.c                      |   4 +
> >  drivers/fpga/dfl-fme-perf.c                      | 962 +++++++++++++++++++++++
> >  drivers/fpga/dfl-fme.h                           |   2 +
> >  drivers/fpga/dfl.c                               |   1 +
> >  drivers/fpga/dfl.h                               |   2 +
> >  7 files changed, 1065 insertions(+)
> >  create mode 100644 drivers/fpga/dfl-fme-perf.c
> >
> > diff --git a/Documentation/ABI/testing/sysfs-platform-dfl-fme b/Documentation/ABI/testing/sysfs-platform-dfl-fme
> > index 5a8938f..63a02cb 100644
> > --- a/Documentation/ABI/testing/sysfs-platform-dfl-fme
> > +++ b/Documentation/ABI/testing/sysfs-platform-dfl-fme
> > @@ -119,3 +119,96 @@ Description:       Write-only. Write error code to this file to clear all errors
> >                 logged in errors, first_error and next_error. Write fails with
> >                 -EINVAL if input parsing fails or input error code doesn't
> >                 match.
> > +
> > +What:          /sys/bus/platform/devices/dfl-fme.0/perf/clock
> > +Date:          May 2019
> > +KernelVersion:  5.3
> > +Contact:       Wu Hao <hao.wu@intel.com>
> > +Description:   Read-Only. Read for Accelerator Function Unit (AFU) clock
> > +               counter.
> > +
> > +What:          /sys/bus/platform/devices/dfl-fme.0/perf/cache/freeze
> > +Date:          May 2019
> > +KernelVersion:  5.3
> > +Contact:       Wu Hao <hao.wu@intel.com>
> > +Description:   Read-Write. Read this file for the current status of 'cache'
> > +               category performance counters, and Write '1' or '0' to freeze
> > +               or unfreeze 'cache' performance counters.
> > +
> > +What:          /sys/bus/platform/devices/dfl-fme.0/perf/cache/<counter>
> > +Date:          May 2019
> > +KernelVersion:  5.3
> > +Contact:       Wu Hao <hao.wu@intel.com>
> > +Description:   Read-Only. Read 'cache' category performance counters:
> > +               read_hit, read_miss, write_hit, write_miss, hold_request,
> > +               data_write_port_contention, tag_write_port_contention,
> > +               tx_req_stall, rx_req_stall and rx_eviction.
> > +
> > +What:          /sys/bus/platform/devices/dfl-fme.0/perf/iommu/freeze
> > +Date:          May 2019
> > +KernelVersion:  5.3
> > +Contact:       Wu Hao <hao.wu@intel.com>
> > +Description:   Read-Write. Read this file for the current status of 'iommu'
> > +               category performance counters, and Write '1' or '0' to freeze
> > +               or unfreeze 'iommu' performance counters.
> > +
> > +What:          /sys/bus/platform/devices/dfl-fme.0/perf/iommu/<sip_counter>
> > +Date:          May 2019
> > +KernelVersion:  5.3
> > +Contact:       Wu Hao <hao.wu@intel.com>
> > +Description:   Read-Only. Read 'iommu' category 'sip' sub category
> > +               performance counters: iotlb_4k_hit, iotlb_2m_hit,
> > +               iotlb_1g_hit, slpwc_l3_hit, slpwc_l4_hit, rcc_hit,
> > +               rcc_miss, iotlb_4k_miss, iotlb_2m_miss, iotlb_1g_miss,
> > +               slpwc_l3_miss and slpwc_l4_miss.
> > +
> > +What:          /sys/bus/platform/devices/dfl-fme.0/perf/iommu/afu0/<counter>
> > +Date:          May 2019
> > +KernelVersion:  5.3
> > +Contact:       Wu Hao <hao.wu@intel.com>
> > +Description:   Read-Only. Read 'iommu' category 'afuX' sub category
> > +               performance counters: read_transaction, write_transaction,
> > +               devtlb_read_hit, devtlb_write_hit, devtlb_4k_fill,
> > +               devtlb_2m_fill and devtlb_1g_fill.
> > +
> > +What:          /sys/bus/platform/devices/dfl-fme.0/perf/fabric/freeze
> > +Date:          May 2019
> > +KernelVersion:  5.3
> > +Contact:       Wu Hao <hao.wu@intel.com>
> > +Description:   Read-Write. Read this file for the current status of 'fabric'
> > +               category performance counters, and Write '1' or '0' to freeze
> > +               or unfreeze 'fabric' performance counters.
> > +
> > +What:          /sys/bus/platform/devices/dfl-fme.0/perf/fabric/<counter>
> > +Date:          May 2019
> > +KernelVersion:  5.3
> > +Contact:       Wu Hao <hao.wu@intel.com>
> > +Description:   Read-Only. Read 'fabric' category performance counters:
> > +               pcie0_read, pcie0_write, pcie1_read, pcie1_write,
> > +               upi_read, upi_write, mmio_read and mmio_write.
> > +
> > +What:          /sys/bus/platform/devices/dfl-fme.0/perf/fabric/enable
> > +Date:          May 2019
> > +KernelVersion:  5.3
> > +Contact:       Wu Hao <hao.wu@intel.com>
> > +Description:   Read-Write. Read this file for current status of device level
> > +               fabric counters. Write "1" to enable device level fabric
> > +               counters. Once device level fabric counters are enabled, port
> > +               level fabric counters will be disabled automatically.
> > +
> > +What:          /sys/bus/platform/devices/dfl-fme.0/perf/fabric/port0/<counter>
> > +Date:          May 2019
> > +KernelVersion:  5.3
> > +Contact:       Wu Hao <hao.wu@intel.com>
> > +Description:   Read-Only. Read 'fabric' category "portX" sub category
> > +               performance counters: pcie0_read, pcie0_write, pcie1_read,
> > +               pcie1_write, upi_read, upi_write and mmio_read.
> > +
> > +What:          /sys/bus/platform/devices/dfl-fme.0/perf/fabric/port0/enable
> > +Date:          May 2019
> > +KernelVersion:  5.3
> > +Contact:       Wu Hao <hao.wu@intel.com>
> > +Description:   Read-Write. Read this file for current status of port level
> > +               fabric counters. Write "1" to enable port level fabric counters.
> > +               Once port level fabric counters are enabled, device level fabric
> > +               counters will be disabled automatically.
> > diff --git a/drivers/fpga/Makefile b/drivers/fpga/Makefile
> > index 4865b74..d8e21df 100644
> > --- a/drivers/fpga/Makefile
> > +++ b/drivers/fpga/Makefile
> > @@ -40,6 +40,7 @@ obj-$(CONFIG_FPGA_DFL_FME_REGION)     += dfl-fme-region.o
> >  obj-$(CONFIG_FPGA_DFL_AFU)             += dfl-afu.o
> >
> >  dfl-fme-objs := dfl-fme-main.o dfl-fme-pr.o dfl-fme-error.o
> > +dfl-fme-objs += dfl-fme-perf.o
> >  dfl-afu-objs := dfl-afu-main.o dfl-afu-region.o dfl-afu-dma-region.o
> >  dfl-afu-objs += dfl-afu-error.o
> >
> > diff --git a/drivers/fpga/dfl-fme-main.c b/drivers/fpga/dfl-fme-main.c
> > index 4490cf4..4a5b25d 100644
> > --- a/drivers/fpga/dfl-fme-main.c
> > +++ b/drivers/fpga/dfl-fme-main.c
> > @@ -231,6 +231,10 @@ static long fme_hdr_ioctl(struct platform_device *pdev,
> >                 .ops = &fme_global_err_ops,
> >         },
> >         {
> > +               .id_table = fme_perf_id_table,
> > +               .ops = &fme_perf_ops,
> > +       },
> > +       {
> >                 .ops = NULL,
> >         },
> >  };
> > diff --git a/drivers/fpga/dfl-fme-perf.c b/drivers/fpga/dfl-fme-perf.c
> > new file mode 100644
> > index 0000000..6eb6c89
> > --- /dev/null
> > +++ b/drivers/fpga/dfl-fme-perf.c
> > @@ -0,0 +1,962 @@
> > +// SPDX-License-Identifier: GPL-2.0
> > +/*
> > + * Driver for FPGA Management Engine (FME) Global Performance Reporting
> > + *
> > + * Copyright 2019 Intel Corporation, Inc.
> > + *
> > + * Authors:
> > + *   Kang Luwei <luwei.kang@intel.com>
> > + *   Xiao Guangrong <guangrong.xiao@linux.intel.com>
> > + *   Wu Hao <hao.wu@intel.com>
> > + *   Joseph Grecco <joe.grecco@intel.com>
> > + *   Enno Luebbers <enno.luebbers@intel.com>
> > + *   Tim Whisonant <tim.whisonant@intel.com>
> > + *   Ananda Ravuri <ananda.ravuri@intel.com>
> > + *   Mitchel, Henry <henry.mitchel@intel.com>
> > + */
> > +
> > +#include "dfl.h"
> > +#include "dfl-fme.h"
> > +
> > +/*
> > + * Performance Counter Registers for Cache.
> > + *
> > + * Cache Events are listed below as CACHE_EVNT_*.
> > + */
> > +#define CACHE_CTRL                     0x8
> > +#define CACHE_RESET_CNTR               BIT_ULL(0)
> > +#define CACHE_FREEZE_CNTR              BIT_ULL(8)
> > +#define CACHE_CTRL_EVNT                        GENMASK_ULL(19, 16)
> > +#define CACHE_EVNT_RD_HIT              0x0
> > +#define CACHE_EVNT_WR_HIT              0x1
> > +#define CACHE_EVNT_RD_MISS             0x2
> > +#define CACHE_EVNT_WR_MISS             0x3
> > +#define CACHE_EVNT_RSVD                        0x4
> > +#define CACHE_EVNT_HOLD_REQ            0x5
> > +#define CACHE_EVNT_DATA_WR_PORT_CONTEN 0x6
> > +#define CACHE_EVNT_TAG_WR_PORT_CONTEN  0x7
> > +#define CACHE_EVNT_TX_REQ_STALL                0x8
> > +#define CACHE_EVNT_RX_REQ_STALL                0x9
> > +#define CACHE_EVNT_EVICTIONS           0xa
> > +#define CACHE_EVNT_MAX                 CACHE_EVNT_EVICTIONS
> > +#define CACHE_CHANNEL_SEL              BIT_ULL(20)
> > +#define CACHE_CHANNEL_RD               0
> > +#define CACHE_CHANNEL_WR               1
> > +#define CACHE_CHANNEL_MAX              2
> > +#define CACHE_CNTR0                    0x10
> > +#define CACHE_CNTR1                    0x18
> > +#define CACHE_CNTR_EVNT_CNTR           GENMASK_ULL(47, 0)
> > +#define CACHE_CNTR_EVNT                        GENMASK_ULL(63, 60)
> > +
> > +/*
> > + * Performance Counter Registers for Fabric.
> > + *
> > + * Fabric Events are listed below as FAB_EVNT_*
> > + */
> > +#define FAB_CTRL                       0x20
> > +#define FAB_RESET_CNTR                 BIT_ULL(0)
> > +#define FAB_FREEZE_CNTR                        BIT_ULL(8)
> > +#define FAB_CTRL_EVNT                  GENMASK_ULL(19, 16)
> > +#define FAB_EVNT_PCIE0_RD              0x0
> > +#define FAB_EVNT_PCIE0_WR              0x1
> > +#define FAB_EVNT_PCIE1_RD              0x2
> > +#define FAB_EVNT_PCIE1_WR              0x3
> > +#define FAB_EVNT_UPI_RD                        0x4
> > +#define FAB_EVNT_UPI_WR                        0x5
> > +#define FAB_EVNT_MMIO_RD               0x6
> > +#define FAB_EVNT_MMIO_WR               0x7
> > +#define FAB_EVNT_MAX                   FAB_EVNT_MMIO_WR
> > +#define FAB_PORT_ID                    GENMASK_ULL(21, 20)
> > +#define FAB_PORT_FILTER                        BIT_ULL(23)
> > +#define FAB_PORT_FILTER_DISABLE                0
> > +#define FAB_PORT_FILTER_ENABLE         1
> > +#define FAB_CNTR                       0x28
> > +#define FAB_CNTR_EVNT_CNTR             GENMASK_ULL(59, 0)
> > +#define FAB_CNTR_EVNT                  GENMASK_ULL(63, 60)
> > +
> > +/*
> > + * Performance Counter Registers for Clock.
> > + *
> > + * Clock Counter can't be reset or frozen by SW.
> > + */
> > +#define CLK_CNTR                       0x30
> > +
> > +/*
> > + * Performance Counter Registers for IOMMU / VT-D.
> > + *
> > + * VT-D Events are listed below as VTD_EVNT_* and VTD_SIP_EVNT_*
> > + */
> > +#define VTD_CTRL                       0x38
> > +#define VTD_RESET_CNTR                 BIT_ULL(0)
> > +#define VTD_FREEZE_CNTR                        BIT_ULL(8)
> > +#define VTD_CTRL_EVNT                  GENMASK_ULL(19, 16)
> > +#define VTD_EVNT_AFU_MEM_RD_TRANS      0x0
> > +#define VTD_EVNT_AFU_MEM_WR_TRANS      0x1
> > +#define VTD_EVNT_AFU_DEVTLB_RD_HIT     0x2
> > +#define VTD_EVNT_AFU_DEVTLB_WR_HIT     0x3
> > +#define VTD_EVNT_DEVTLB_4K_FILL                0x4
> > +#define VTD_EVNT_DEVTLB_2M_FILL                0x5
> > +#define VTD_EVNT_DEVTLB_1G_FILL                0x6
> > +#define VTD_EVNT_MAX                   VTD_EVNT_DEVTLB_1G_FILL
> > +#define VTD_CNTR                       0x40
> > +#define VTD_CNTR_EVNT                  GENMASK_ULL(63, 60)
> > +#define VTD_CNTR_EVNT_CNTR             GENMASK_ULL(47, 0)
> > +#define VTD_SIP_CTRL                   0x48
> > +#define VTD_SIP_RESET_CNTR             BIT_ULL(0)
> > +#define VTD_SIP_FREEZE_CNTR            BIT_ULL(8)
> > +#define VTD_SIP_CTRL_EVNT              GENMASK_ULL(19, 16)
> > +#define VTD_SIP_EVNT_IOTLB_4K_HIT      0x0
> > +#define VTD_SIP_EVNT_IOTLB_2M_HIT      0x1
> > +#define VTD_SIP_EVNT_IOTLB_1G_HIT      0x2
> > +#define VTD_SIP_EVNT_SLPWC_L3_HIT      0x3
> > +#define VTD_SIP_EVNT_SLPWC_L4_HIT      0x4
> > +#define VTD_SIP_EVNT_RCC_HIT           0x5
> > +#define VTD_SIP_EVNT_IOTLB_4K_MISS     0x6
> > +#define VTD_SIP_EVNT_IOTLB_2M_MISS     0x7
> > +#define VTD_SIP_EVNT_IOTLB_1G_MISS     0x8
> > +#define VTD_SIP_EVNT_SLPWC_L3_MISS     0x9
> > +#define VTD_SIP_EVNT_SLPWC_L4_MISS     0xa
> > +#define VTD_SIP_EVNT_RCC_MISS          0xb
> > +#define VTD_SIP_EVNT_MAX               VTD_SIP_EVNT_RCC_MISS
> > +#define VTD_SIP_CNTR                   0X50
> > +#define VTD_SIP_CNTR_EVNT              GENMASK_ULL(63, 60)
> > +#define VTD_SIP_CNTR_EVNT_CNTR         GENMASK_ULL(47, 0)
> > +
> > +#define PERF_OBJ_ROOT_ID               (~0)
> > +
> > +#define PERF_TIMEOUT                   30
> > +
> > +/**
> > + * struct perf_object - object of performance counter
> > + *
> > + * @id: instance id. PERF_OBJ_ROOT_ID indicates it is a parent object which
> > + *      counts performance counters for all instances.
> > + * @attr_groups: the sysfs files are associated with this object.
> > + * @feature: pointer to related private feature.
> > + * @node: used to link itself to parent's children list.
> > + * @children: used to link its children objects together.
> > + * @kobj: generic kobject interface.
> > + *
> > + * 'node' and 'children' are used to construct parent-children hierarchy.
> > + */
> > +struct perf_object {
> > +       int id;
> > +       const struct attribute_group **attr_groups;
> > +       struct dfl_feature *feature;
> > +
> > +       struct list_head node;
> > +       struct list_head children;
> > +       struct kobject kobj;
> > +};
> > +
> > +/**
> > + * struct perf_obj_attribute - attribute of perf object
> > + *
> > + * @attr: attribute of this perf object.
> > + * @show: show callback for sysfs attribute.
> > + * @store: store callback for sysfs attribute.
> > + */
> > +struct perf_obj_attribute {
> > +       struct attribute attr;
> > +       ssize_t (*show)(struct perf_object *pobj, char *buf);
> > +       ssize_t (*store)(struct perf_object *pobj,
> > +                        const char *buf, size_t n);
> > +};
> > +
> > +#define to_perf_obj_attr(_attr)                                        \
> > +               container_of(_attr, struct perf_obj_attribute, attr)
> > +#define to_perf_obj(_kobj)                                     \
> > +               container_of(_kobj, struct perf_object, kobj)
> > +
> > +#define __POBJ_ATTR(_name, _mode, _show, _store) {                     \
> > +       .attr = {.name = __stringify(_name),                            \
> > +                .mode = VERIFY_OCTAL_PERMISSIONS(_mode) },             \
> > +       .show   = _show,                                                \
> > +       .store  = _store,                                               \
> > +}
> > +
> > +#define PERF_OBJ_ATTR_F_RO(_name, _filename)                           \
> > +struct perf_obj_attribute perf_obj_attr_##_name =                      \
> > +       __POBJ_ATTR(_filename, 0444, _name##_show, NULL)
> > +
> > +#define PERF_OBJ_ATTR_F_WO(_name, _filename)                           \
> > +struct perf_obj_attribute perf_obj_attr_##_name =                      \
> > +       __POBJ_ATTR(_filename, 0200, NULL, _name##_store)
> 
> Please remove the macros that aren't actually used.  I think that
> includes PERF_OBJ_ATTR_F_RO, PERF_OBJ_ATTR_F_WO, PERF_OBJ_ATTR_WO, and
> PERF_OBJ_ATTR_RW.
> 
> > +
> > +#define PERF_OBJ_ATTR_F_RW(_name, _filename)                           \
> > +struct perf_obj_attribute perf_obj_attr_##_name =                      \
> > +       __POBJ_ATTR(_filename, 0644, _name##_show, _name##_store)
> > +
> > +#define PERF_OBJ_ATTR_RO(_name)                                                \
> > +struct perf_obj_attribute perf_obj_attr_##_name =                      \
> > +       __POBJ_ATTR(_name, 0444, _name##_show, NULL)
> > +
> > +#define PERF_OBJ_ATTR_WO(_name)                                                \
> > +struct perf_obj_attribute perf_obj_attr_##_name =                      \
> > +       __POBJ_ATTR(_name, 0200, NULL, _name##_store)
> > +
> > +#define PERF_OBJ_ATTR_RW(_name)                                                \
> > +struct perf_obj_attribute perf_obj_attr_##_name =                      \
> > +       __POBJ_ATTR(_name, 0644, _name##_show, _name##_store)
> > +
> > +static ssize_t perf_obj_attr_show(struct kobject *kobj,
> > +                                 struct attribute *__attr, char *buf)
> > +{
> > +       struct perf_obj_attribute *attr = to_perf_obj_attr(__attr);
> > +       struct perf_object *pobj = to_perf_obj(kobj);
> > +       ssize_t ret = -EIO;
> > +
> > +       if (attr->show)
> > +               ret = attr->show(pobj, buf);
> > +       return ret;
> > +}
> > +
> > +static ssize_t perf_obj_attr_store(struct kobject *kobj,
> > +                                  struct attribute *__attr,
> > +                                  const char *buf, size_t n)
> > +{
> > +       struct perf_obj_attribute *attr = to_perf_obj_attr(__attr);
> > +       struct perf_object *pobj = to_perf_obj(kobj);
> > +       ssize_t ret = -EIO;
> > +
> > +       if (attr->store)
> > +               ret = attr->store(pobj, buf, n);
> > +       return ret;
> > +}
> > +
> > +static const struct sysfs_ops perf_obj_sysfs_ops = {
> > +       .show = perf_obj_attr_show,
> > +       .store = perf_obj_attr_store,
> > +};
> > +
> > +static void perf_obj_release(struct kobject *kobj)
> > +{
> > +       kfree(to_perf_obj(kobj));
> > +}
> > +
> > +static struct kobj_type perf_obj_ktype = {
> > +       .sysfs_ops = &perf_obj_sysfs_ops,
> > +       .release = perf_obj_release,
> > +};
> > +
> > +static struct perf_object *
> > +create_perf_obj(struct dfl_feature *feature, struct kobject *parent, int id,
> > +               const struct attribute_group **groups, const char *name)
> > +{
> > +       struct perf_object *pobj;
> > +       int ret;
> > +
> > +       pobj = kzalloc(sizeof(*pobj), GFP_KERNEL);
> > +       if (!pobj)
> > +               return ERR_PTR(-ENOMEM);
> > +
> > +       pobj->id = id;
> > +       pobj->feature = feature;
> > +       pobj->attr_groups = groups;
> > +       INIT_LIST_HEAD(&pobj->node);
> > +       INIT_LIST_HEAD(&pobj->children);
> > +
> > +       if (id != PERF_OBJ_ROOT_ID)
> > +               ret = kobject_init_and_add(&pobj->kobj, &perf_obj_ktype,
> > +                                          parent, "%s%d", name, id);
> > +       else
> > +               ret = kobject_init_and_add(&pobj->kobj, &perf_obj_ktype,
> > +                                          parent, "%s", name);
> > +       if (ret)
> > +               goto put_exit;
> > +
> > +       if (pobj->attr_groups) {
> > +               ret = sysfs_create_groups(&pobj->kobj, pobj->attr_groups);
> > +               if (ret)
> > +                       goto put_exit;
> > +       }
> > +
> > +       return pobj;
> > +
> > +put_exit:
> > +       kobject_put(&pobj->kobj);
> > +       return ERR_PTR(ret);
> > +}
> > +
> > +/*
> > + * Counter Sysfs Interface for Clock.
> > + */
> > +static ssize_t clock_show(struct perf_object *pobj, char *buf)
> > +{
> > +       void __iomem *base = pobj->feature->ioaddr;
> > +
> > +       return sprintf(buf, "0x%llx\n",
> > +                      (unsigned long long)readq(base + CLK_CNTR));
> > +}
> > +static PERF_OBJ_ATTR_RO(clock);
> > +
> > +static struct attribute *clock_attrs[] = {
> > +       &perf_obj_attr_clock.attr,
> > +       NULL,
> > +};
> > +
> > +static struct attribute_group clock_attr_group = {
> > +       .attrs = clock_attrs,
> > +};
> > +
> > +static const struct attribute_group *perf_dev_attr_groups[] = {
> > +       &clock_attr_group,
> > +       NULL,
> > +};
> > +
> > +static void destroy_perf_obj(struct perf_object *pobj)
> > +{
> > +       struct perf_object *obj, *obj_tmp;
> > +
> > +       if (pobj->attr_groups)
> > +               sysfs_remove_groups(&pobj->kobj, pobj->attr_groups);
> > +
> > +       list_for_each_entry_safe(obj, obj_tmp, &pobj->children, node)
> > +               destroy_perf_obj(obj);
> > +
> > +       list_del(&pobj->node);
> > +       kobject_put(&pobj->kobj);
> > +}
> > +
> > +static struct perf_object *create_perf_dev(struct dfl_feature *feature)
> > +{
> > +       struct platform_device *pdev = feature->pdev;
> > +
> > +       return create_perf_obj(feature, &pdev->dev.kobj, PERF_OBJ_ROOT_ID,
> > +                              perf_dev_attr_groups, "perf");
> > +}
> > +
> > +/*
> > + * Counter Sysfs Interfaces for Cache.
> > + */
> > +static ssize_t cache_freeze_show(struct perf_object *pobj, char *buf)
> > +{
> > +       void __iomem *base = pobj->feature->ioaddr;
> > +       u64 v;
> > +
> > +       v = readq(base + CACHE_CTRL);
> > +
> > +       return sprintf(buf, "%u\n",
> > +                      (unsigned int)FIELD_GET(CACHE_FREEZE_CNTR, v));
> > +}
> > +
> > +static ssize_t cache_freeze_store(struct perf_object *pobj,
> > +                                 const char *buf, size_t n)
> > +{
> > +       struct dfl_feature *feature = pobj->feature;
> > +       struct dfl_feature_platform_data *pdata;
> > +       void __iomem *base = feature->ioaddr;
> > +       bool state;
> > +       u64 v;
> > +
> > +       if (strtobool(buf, &state))
> > +               return -EINVAL;
> > +
> > +       pdata = dev_get_platdata(&feature->pdev->dev);
> > +
> > +       mutex_lock(&pdata->lock);
> > +       v = readq(base + CACHE_CTRL);
> > +       v &= ~CACHE_FREEZE_CNTR;
> > +       v |= FIELD_PREP(CACHE_FREEZE_CNTR, state ? 1 : 0);
> > +       writeq(v, base + CACHE_CTRL);
> > +       mutex_unlock(&pdata->lock);
> > +
> > +       return n;
> > +}
> > +static PERF_OBJ_ATTR_F_RW(cache_freeze, freeze);
> > +
> > +static ssize_t read_cache_counter(struct perf_object *pobj, char *buf,
> > +                                 u8 channel, u8 event)
> > +{
> > +       struct dfl_feature *feature = pobj->feature;
> > +       struct dfl_feature_platform_data *pdata;
> > +       void __iomem *base = feature->ioaddr;
> > +       u64 v, count;
> > +
> > +       WARN_ON_ONCE(event > CACHE_EVNT_MAX || channel > CACHE_CHANNEL_MAX);
> > +
> > +       pdata = dev_get_platdata(&feature->pdev->dev);
> > +
> > +       mutex_lock(&pdata->lock);
> > +       /* set channel access type and cache event code. */
> > +       v = readq(base + CACHE_CTRL);
> > +       v &= ~(CACHE_CHANNEL_SEL | CACHE_CTRL_EVNT);
> > +       v |= FIELD_PREP(CACHE_CHANNEL_SEL, channel);
> > +       v |= FIELD_PREP(CACHE_CTRL_EVNT, event);
> > +       writeq(v, base + CACHE_CTRL);
> > +
> > +       if (readq_poll_timeout(base + CACHE_CNTR0, v,
> > +                              FIELD_GET(CACHE_CNTR_EVNT, v) == event,
> > +                              1, PERF_TIMEOUT)) {
> > +               dev_err(&feature->pdev->dev, "timeout, unmatched cache event type in counter registers.\n");
> > +               mutex_unlock(&pdata->lock);
> > +               return -ETIMEDOUT;
> > +       }
> > +
> > +       v = readq(base + CACHE_CNTR0);
> > +       count = FIELD_GET(CACHE_CNTR_EVNT_CNTR, v);
> > +       v = readq(base + CACHE_CNTR1);
> > +       count += FIELD_GET(CACHE_CNTR_EVNT_CNTR, v);
> > +       mutex_unlock(&pdata->lock);
> > +
> > +       return sprintf(buf, "0x%llx\n", (unsigned long long)count);
> > +}
> > +
> > +#define CACHE_SHOW(name, channel, event)                               \
> > +static ssize_t name##_show(struct perf_object *pobj, char *buf)                \
> > +{                                                                      \
> > +       return read_cache_counter(pobj, buf, channel, event);           \
> > +}                                                                      \
> > +static PERF_OBJ_ATTR_RO(name)
> > +
> > +CACHE_SHOW(read_hit, CACHE_CHANNEL_RD, CACHE_EVNT_RD_HIT);
> > +CACHE_SHOW(read_miss, CACHE_CHANNEL_RD, CACHE_EVNT_RD_MISS);
> > +CACHE_SHOW(write_hit, CACHE_CHANNEL_WR, CACHE_EVNT_WR_HIT);
> > +CACHE_SHOW(write_miss, CACHE_CHANNEL_WR, CACHE_EVNT_WR_MISS);
> > +CACHE_SHOW(hold_request, CACHE_CHANNEL_RD, CACHE_EVNT_HOLD_REQ);
> > +CACHE_SHOW(tx_req_stall, CACHE_CHANNEL_RD, CACHE_EVNT_TX_REQ_STALL);
> > +CACHE_SHOW(rx_req_stall, CACHE_CHANNEL_RD, CACHE_EVNT_RX_REQ_STALL);
> > +CACHE_SHOW(rx_eviction, CACHE_CHANNEL_RD, CACHE_EVNT_EVICTIONS);
> > +CACHE_SHOW(data_write_port_contention, CACHE_CHANNEL_WR,
> > +          CACHE_EVNT_DATA_WR_PORT_CONTEN);
> > +CACHE_SHOW(tag_write_port_contention, CACHE_CHANNEL_WR,
> > +          CACHE_EVNT_TAG_WR_PORT_CONTEN);
> > +
> > +static struct attribute *cache_attrs[] = {
> > +       &perf_obj_attr_read_hit.attr,
> > +       &perf_obj_attr_read_miss.attr,
> > +       &perf_obj_attr_write_hit.attr,
> > +       &perf_obj_attr_write_miss.attr,
> > +       &perf_obj_attr_hold_request.attr,
> > +       &perf_obj_attr_data_write_port_contention.attr,
> > +       &perf_obj_attr_tag_write_port_contention.attr,
> > +       &perf_obj_attr_tx_req_stall.attr,
> > +       &perf_obj_attr_rx_req_stall.attr,
> > +       &perf_obj_attr_rx_eviction.attr,
> > +       &perf_obj_attr_cache_freeze.attr,
> > +       NULL,
> > +};
> > +
> > +static struct attribute_group cache_attr_group = {
> > +       .attrs = cache_attrs,
> > +};
> > +
> > +static const struct attribute_group *cache_attr_groups[] = {
> > +       &cache_attr_group,
> > +       NULL,
> > +};
> > +
> > +static int create_perf_cache_obj(struct perf_object *perf_dev)
> > +{
> > +       struct perf_object *pobj;
> > +
> > +       pobj = create_perf_obj(perf_dev->feature, &perf_dev->kobj,
> > +                              PERF_OBJ_ROOT_ID, cache_attr_groups, "cache");
> > +       if (IS_ERR(pobj))
> > +               return PTR_ERR(pobj);
> > +
> > +       list_add(&pobj->node, &perf_dev->children);
> > +
> > +       return 0;
> > +}
> > +
> > +/*
> > + * Counter Sysfs Interfaces for VT-D / IOMMU.
> > + */
> > +static ssize_t vtd_freeze_show(struct perf_object *pobj, char *buf)
> > +{
> > +       void __iomem *base = pobj->feature->ioaddr;
> > +       u64 v;
> > +
> > +       v = readq(base + VTD_CTRL);
> > +
> > +       return sprintf(buf, "%u\n",
> > +                      (unsigned int)FIELD_GET(VTD_FREEZE_CNTR, v));
> > +}
> > +
> > +static ssize_t vtd_freeze_store(struct perf_object *pobj,
> > +                               const char *buf, size_t n)
> > +{
> > +       struct dfl_feature *feature = pobj->feature;
> > +       struct dfl_feature_platform_data *pdata;
> > +       void __iomem *base = feature->ioaddr;
> > +       bool state;
> > +       u64 v;
> > +
> > +       if (strtobool(buf, &state))
> > +               return -EINVAL;
> > +
> > +       pdata = dev_get_platdata(&feature->pdev->dev);
> > +
> > +       mutex_lock(&pdata->lock);
> > +       v = readq(base + VTD_CTRL);
> > +       v &= ~VTD_FREEZE_CNTR;
> > +       v |= FIELD_PREP(VTD_FREEZE_CNTR, state ? 1 : 0);
> > +       writeq(v, base + VTD_CTRL);
> > +       mutex_unlock(&pdata->lock);
> > +
> > +       return n;
> > +}
> > +static PERF_OBJ_ATTR_F_RW(vtd_freeze, freeze);
> > +
> > +static struct attribute *iommu_top_attrs[] = {
> > +       &perf_obj_attr_vtd_freeze.attr,
> > +       NULL,
> > +};
> > +
> > +static struct attribute_group iommu_top_attr_group = {
> > +       .attrs = iommu_top_attrs,
> > +};
> > +
> > +static ssize_t read_iommu_sip_counter(struct perf_object *pobj,
> > +                                     u8 event, char *buf)
> > +{
> > +       struct dfl_feature *feature = pobj->feature;
> > +       struct dfl_feature_platform_data *pdata;
> > +       void __iomem *base = feature->ioaddr;
> > +       u64 v, count;
> > +
> > +       WARN_ON_ONCE(event > VTD_SIP_EVNT_MAX);
> > +
> > +       pdata = dev_get_platdata(&feature->pdev->dev);
> > +
> > +       mutex_lock(&pdata->lock);
> > +       v = readq(base + VTD_SIP_CTRL);
> > +       v &= ~VTD_SIP_CTRL_EVNT;
> > +       v |= FIELD_PREP(VTD_SIP_CTRL_EVNT, event);
> > +       writeq(v, base + VTD_SIP_CTRL);
> > +
> > +       if (readq_poll_timeout(base + VTD_SIP_CNTR, v,
> > +                              FIELD_GET(VTD_SIP_CNTR_EVNT, v) == event,
> > +                              1, PERF_TIMEOUT)) {
> > +               dev_err(&feature->pdev->dev, "timeout, unmatched VTd SIP event type in counter registers\n");
> > +               mutex_unlock(&pdata->lock);
> > +               return -ETIMEDOUT;
> > +       }
> > +
> > +       v = readq(base + VTD_SIP_CNTR);
> > +       count = FIELD_GET(VTD_SIP_CNTR_EVNT_CNTR, v);
> > +       mutex_unlock(&pdata->lock);
> > +
> > +       return sprintf(buf, "0x%llx\n", (unsigned long long)count);
> > +}
> > +
> > +#define VTD_SIP_SHOW(name, event)                                      \
> > +static ssize_t name##_show(struct perf_object *pobj, char *buf)                \
> > +{                                                                      \
> > +       return read_iommu_sip_counter(pobj, event, buf);                \
> > +}                                                                      \
> > +static PERF_OBJ_ATTR_RO(name)
> > +
> > +VTD_SIP_SHOW(iotlb_4k_hit, VTD_SIP_EVNT_IOTLB_4K_HIT);
> > +VTD_SIP_SHOW(iotlb_2m_hit, VTD_SIP_EVNT_IOTLB_2M_HIT);
> > +VTD_SIP_SHOW(iotlb_1g_hit, VTD_SIP_EVNT_IOTLB_1G_HIT);
> > +VTD_SIP_SHOW(slpwc_l3_hit, VTD_SIP_EVNT_SLPWC_L3_HIT);
> > +VTD_SIP_SHOW(slpwc_l4_hit, VTD_SIP_EVNT_SLPWC_L4_HIT);
> > +VTD_SIP_SHOW(rcc_hit, VTD_SIP_EVNT_RCC_HIT);
> > +VTD_SIP_SHOW(iotlb_4k_miss, VTD_SIP_EVNT_IOTLB_4K_MISS);
> > +VTD_SIP_SHOW(iotlb_2m_miss, VTD_SIP_EVNT_IOTLB_2M_MISS);
> > +VTD_SIP_SHOW(iotlb_1g_miss, VTD_SIP_EVNT_IOTLB_1G_MISS);
> > +VTD_SIP_SHOW(slpwc_l3_miss, VTD_SIP_EVNT_SLPWC_L3_MISS);
> > +VTD_SIP_SHOW(slpwc_l4_miss, VTD_SIP_EVNT_SLPWC_L4_MISS);
> > +VTD_SIP_SHOW(rcc_miss, VTD_SIP_EVNT_RCC_MISS);
> > +
> > +static struct attribute *iommu_sip_attrs[] = {
> > +       &perf_obj_attr_iotlb_4k_hit.attr,
> > +       &perf_obj_attr_iotlb_2m_hit.attr,
> > +       &perf_obj_attr_iotlb_1g_hit.attr,
> > +       &perf_obj_attr_slpwc_l3_hit.attr,
> > +       &perf_obj_attr_slpwc_l4_hit.attr,
> > +       &perf_obj_attr_rcc_hit.attr,
> > +       &perf_obj_attr_iotlb_4k_miss.attr,
> > +       &perf_obj_attr_iotlb_2m_miss.attr,
> > +       &perf_obj_attr_iotlb_1g_miss.attr,
> > +       &perf_obj_attr_slpwc_l3_miss.attr,
> > +       &perf_obj_attr_slpwc_l4_miss.attr,
> > +       &perf_obj_attr_rcc_miss.attr,
> > +       NULL,
> > +};
> > +
> > +static struct attribute_group iommu_sip_attr_group = {
> > +       .attrs = iommu_sip_attrs,
> > +};
> > +
> > +static const struct attribute_group *iommu_top_attr_groups[] = {
> > +       &iommu_top_attr_group,
> > +       &iommu_sip_attr_group,
> > +       NULL,
> > +};
> > +
> > +static ssize_t read_iommu_counter(struct perf_object *pobj, u8 event, char *buf)
> > +{
> > +       struct dfl_feature *feature = pobj->feature;
> > +       struct dfl_feature_platform_data *pdata;
> > +       void __iomem *base = feature->ioaddr;
> > +       u64 v, count;
> > +
> > +       WARN_ON_ONCE(event > VTD_EVNT_MAX);
> > +
> > +       event += pobj->id;
> > +       pdata = dev_get_platdata(&feature->pdev->dev);
> > +
> > +       mutex_lock(&pdata->lock);
> > +       v = readq(base + VTD_CTRL);
> > +       v &= ~VTD_CTRL_EVNT;
> > +       v |= FIELD_PREP(VTD_CTRL_EVNT, event);
> > +       writeq(v, base + VTD_CTRL);
> > +
> > +       if (readq_poll_timeout(base + VTD_CNTR, v,
> > +                              FIELD_GET(VTD_CNTR_EVNT, v) == event, 1,
> > +                              PERF_TIMEOUT)) {
> > +               dev_err(&feature->pdev->dev, "timeout, unmatched VTd event type in counter registers\n");
> > +               mutex_unlock(&pdata->lock);
> > +               return -ETIMEDOUT;
> > +       }
> > +
> > +       v = readq(base + VTD_CNTR);
> > +       count = FIELD_GET(VTD_CNTR_EVNT_CNTR, v);
> > +       mutex_unlock(&pdata->lock);
> > +
> > +       return sprintf(buf, "0x%llx\n", (unsigned long long)count);
> > +}
> > +
> > +#define VTD_SHOW(name, base_event)                                     \
> > +static ssize_t name##_show(struct perf_object *pobj, char *buf)                \
> > +{                                                                      \
> > +       return read_iommu_counter(pobj, base_event, buf);               \
> > +}                                                                      \
> > +static PERF_OBJ_ATTR_RO(name)
> > +
> > +VTD_SHOW(read_transaction, VTD_EVNT_AFU_MEM_RD_TRANS);
> > +VTD_SHOW(write_transaction, VTD_EVNT_AFU_MEM_WR_TRANS);
> > +VTD_SHOW(devtlb_read_hit, VTD_EVNT_AFU_DEVTLB_RD_HIT);
> > +VTD_SHOW(devtlb_write_hit, VTD_EVNT_AFU_DEVTLB_WR_HIT);
> > +VTD_SHOW(devtlb_4k_fill, VTD_EVNT_DEVTLB_4K_FILL);
> > +VTD_SHOW(devtlb_2m_fill, VTD_EVNT_DEVTLB_2M_FILL);
> > +VTD_SHOW(devtlb_1g_fill, VTD_EVNT_DEVTLB_1G_FILL);
> > +
> > +static struct attribute *iommu_attrs[] = {
> > +       &perf_obj_attr_read_transaction.attr,
> > +       &perf_obj_attr_write_transaction.attr,
> > +       &perf_obj_attr_devtlb_read_hit.attr,
> > +       &perf_obj_attr_devtlb_write_hit.attr,
> > +       &perf_obj_attr_devtlb_4k_fill.attr,
> > +       &perf_obj_attr_devtlb_2m_fill.attr,
> > +       &perf_obj_attr_devtlb_1g_fill.attr,
> > +       NULL,
> > +};
> > +
> > +static struct attribute_group iommu_attr_group = {
> > +       .attrs = iommu_attrs,
> > +};
> > +
> > +static const struct attribute_group *iommu_attr_groups[] = {
> > +       &iommu_attr_group,
> > +       NULL,
> > +};
> > +
> > +#define PERF_MAX_PORT_NUM      1
> > +
> > +static int create_perf_iommu_obj(struct perf_object *perf_dev)
> > +{
> > +       struct dfl_feature *feature = perf_dev->feature;
> > +       struct device *dev = &feature->pdev->dev;
> > +       struct perf_object *pobj, *obj;
> > +       void __iomem *base;
> > +       u64 v;
> > +       int i;
> > +
> > +       /* check if iommu is not supported on this device. */
> > +       base = dfl_get_feature_ioaddr_by_id(dev, FME_FEATURE_ID_HEADER);
> > +       v = readq(base + FME_HDR_CAP);
> > +       if (!FIELD_GET(FME_CAP_IOMMU_AVL, v))
> > +               return 0;
> > +
> > +       pobj = create_perf_obj(feature, &perf_dev->kobj, PERF_OBJ_ROOT_ID,
> > +                              iommu_top_attr_groups, "iommu");
> > +       if (IS_ERR(pobj))
> > +               return PTR_ERR(pobj);
> > +
> > +       list_add(&pobj->node, &perf_dev->children);
> > +
> > +       for (i = 0; i < PERF_MAX_PORT_NUM; i++) {
> > +               obj = create_perf_obj(feature, &pobj->kobj, i,
> > +                                     iommu_attr_groups, "afu");
> > +               if (IS_ERR(obj))
> > +                       return PTR_ERR(obj);
> > +
> > +               list_add(&obj->node, &pobj->children);
> > +       }
> > +
> > +       return 0;
> > +}
> > +
> > +/*
> > + * Counter Sysfs Interfaces for Fabric
> > + */
> > +static bool fabric_pobj_is_enabled(struct perf_object *pobj)
> > +{
> > +       struct dfl_feature *feature = pobj->feature;
> > +       void __iomem *base = feature->ioaddr;
> > +       u64 v;
> > +
> > +       v = readq(base + FAB_CTRL);
> > +
> > +       if (FIELD_GET(FAB_PORT_FILTER, v) == FAB_PORT_FILTER_DISABLE)
> > +               return pobj->id == PERF_OBJ_ROOT_ID;
> > +
> > +       return pobj->id == FIELD_GET(FAB_PORT_ID, v);
> > +}
> > +
> > +static ssize_t read_fabric_counter(struct perf_object *pobj,
> > +                                  u8 event, char *buf)
> > +{
> > +       struct dfl_feature *feature = pobj->feature;
> > +       struct dfl_feature_platform_data *pdata;
> > +       void __iomem *base = feature->ioaddr;
> > +       u64 v, count = 0;
> > +
> > +       WARN_ON_ONCE(event > FAB_EVNT_MAX);
> > +
> > +       pdata = dev_get_platdata(&feature->pdev->dev);
> > +
> > +       mutex_lock(&pdata->lock);
> > +       /* if it is disabled, force the counter to return zero. */
> > +       if (!fabric_pobj_is_enabled(pobj))
> > +               goto exit;
> > +
> > +       v = readq(base + FAB_CTRL);
> > +       v &= ~FAB_CTRL_EVNT;
> > +       v |= FIELD_PREP(FAB_CTRL_EVNT, event);
> > +       writeq(v, base + FAB_CTRL);
> > +
> > +       if (readq_poll_timeout(base + FAB_CNTR, v,
> > +                              FIELD_GET(FAB_CNTR_EVNT, v) == event,
> > +                              1, PERF_TIMEOUT)) {
> > +               dev_err(&feature->pdev->dev, "timeout, unmatched fab event type in counter registers.\n");
> > +               mutex_unlock(&pdata->lock);
> > +               return -ETIMEDOUT;
> > +       }
> > +
> > +       v = readq(base + FAB_CNTR);
> > +       count = FIELD_GET(FAB_CNTR_EVNT_CNTR, v);
> > +exit:
> > +       mutex_unlock(&pdata->lock);
> > +
> > +       return sprintf(buf, "0x%llx\n", (unsigned long long)count);
> > +}
> > +
> > +#define FAB_SHOW(name, event)                                          \
> > +static ssize_t name##_show(struct perf_object *pobj, char *buf)                \
> > +{                                                                      \
> > +       return read_fabric_counter(pobj, event, buf);                   \
> > +}                                                                      \
> > +static PERF_OBJ_ATTR_RO(name)
> > +
> > +FAB_SHOW(pcie0_read, FAB_EVNT_PCIE0_RD);
> > +FAB_SHOW(pcie0_write, FAB_EVNT_PCIE0_WR);
> > +FAB_SHOW(pcie1_read, FAB_EVNT_PCIE1_RD);
> > +FAB_SHOW(pcie1_write, FAB_EVNT_PCIE1_WR);
> > +FAB_SHOW(upi_read, FAB_EVNT_UPI_RD);
> > +FAB_SHOW(upi_write, FAB_EVNT_UPI_WR);
> > +FAB_SHOW(mmio_read, FAB_EVNT_MMIO_RD);
> > +FAB_SHOW(mmio_write, FAB_EVNT_MMIO_WR);
> > +
> > +static ssize_t fab_enable_show(struct perf_object *pobj, char *buf)
> > +{
> > +       return sprintf(buf, "%u\n",
> > +                      (unsigned int)!!fabric_pobj_is_enabled(pobj));
> > +}
> > +
> > +/*
> > + * If enable one port or all port event counter in fabric, other
> > + * fabric event counter originally enabled will be disable automatically.
> > + */
> > +static ssize_t fab_enable_store(struct perf_object *pobj,
> > +                               const char *buf, size_t n)
> > +{
> > +       struct dfl_feature *feature = pobj->feature;
> > +       struct dfl_feature_platform_data *pdata;
> > +       void __iomem *base = feature->ioaddr;
> > +       bool state;
> > +       u64 v;
> > +
> > +       if (strtobool(buf, &state) || !state)
> > +               return -EINVAL;
> > +
> > +       pdata = dev_get_platdata(&feature->pdev->dev);
> > +
> > +       /* if it is already enabled. */
> > +       if (fabric_pobj_is_enabled(pobj))
> > +               return n;
> > +
> > +       mutex_lock(&pdata->lock);
> > +       v = readq(base + FAB_CTRL);
> > +       v &= ~(FAB_PORT_FILTER | FAB_PORT_ID);
> > +
> > +       if (pobj->id == PERF_OBJ_ROOT_ID) {
> > +               v |= FIELD_PREP(FAB_PORT_FILTER, FAB_PORT_FILTER_DISABLE);
> > +       } else {
> > +               v |= FIELD_PREP(FAB_PORT_FILTER, FAB_PORT_FILTER_ENABLE);
> > +               v |= FIELD_PREP(FAB_PORT_ID, pobj->id);
> > +       }
> > +       writeq(v, base + FAB_CTRL);
> > +       mutex_unlock(&pdata->lock);
> > +
> > +       return n;
> > +}
> > +static PERF_OBJ_ATTR_F_RW(fab_enable, enable);
> > +
> > +static struct attribute *fabric_attrs[] = {
> > +       &perf_obj_attr_pcie0_read.attr,
> > +       &perf_obj_attr_pcie0_write.attr,
> > +       &perf_obj_attr_pcie1_read.attr,
> > +       &perf_obj_attr_pcie1_write.attr,
> > +       &perf_obj_attr_upi_read.attr,
> > +       &perf_obj_attr_upi_write.attr,
> > +       &perf_obj_attr_mmio_read.attr,
> > +       &perf_obj_attr_mmio_write.attr,
> > +       &perf_obj_attr_fab_enable.attr,
> > +       NULL,
> > +};
> > +
> > +static struct attribute_group fabric_attr_group = {
> > +       .attrs = fabric_attrs,
> > +};
> > +
> > +static const struct attribute_group *fabric_attr_groups[] = {
> > +       &fabric_attr_group,
> > +       NULL,
> > +};
> > +
> > +static ssize_t fab_freeze_show(struct perf_object *pobj, char *buf)
> > +{
> > +       void __iomem *base = pobj->feature->ioaddr;
> > +       u64 v;
> > +
> > +       v = readq(base + FAB_CTRL);
> > +
> > +       return sprintf(buf, "%u\n",
> > +                      (unsigned int)FIELD_GET(FAB_FREEZE_CNTR, v));
> > +}
> > +
> > +static ssize_t fab_freeze_store(struct perf_object *pobj,
> > +                               const char *buf, size_t n)
> > +{
> > +       struct dfl_feature *feature = pobj->feature;
> > +       struct dfl_feature_platform_data *pdata;
> > +       void __iomem *base = feature->ioaddr;
> > +       bool state;
> > +       u64 v;
> > +
> > +       if (strtobool(buf, &state))
> > +               return -EINVAL;
> > +
> > +       pdata = dev_get_platdata(&feature->pdev->dev);
> > +
> > +       mutex_lock(&pdata->lock);
> > +       v = readq(base + FAB_CTRL);
> > +       v &= ~FAB_FREEZE_CNTR;
> > +       v |= FIELD_PREP(FAB_FREEZE_CNTR, state ? 1 : 0);
> > +       writeq(v, base + FAB_CTRL);
> > +       mutex_unlock(&pdata->lock);
> > +
> > +       return n;
> > +}
> > +static PERF_OBJ_ATTR_F_RW(fab_freeze, freeze);
> > +
> > +static struct attribute *fabric_top_attrs[] = {
> > +       &perf_obj_attr_fab_freeze.attr,
> > +       NULL,
> > +};
> > +
> > +static struct attribute_group fabric_top_attr_group = {
> > +       .attrs = fabric_top_attrs,
> > +};
> > +
> > +static const struct attribute_group *fabric_top_attr_groups[] = {
> > +       &fabric_attr_group,
> > +       &fabric_top_attr_group,
> > +       NULL,
> > +};
> > +
> > +static int create_perf_fabric_obj(struct perf_object *perf_dev)
> > +{
> > +       struct perf_object *pobj, *obj;
> > +       int i;
> > +
> > +       pobj = create_perf_obj(perf_dev->feature, &perf_dev->kobj,
> > +                              PERF_OBJ_ROOT_ID, fabric_top_attr_groups,
> > +                              "fabric");
> > +       if (IS_ERR(pobj))
> > +               return PTR_ERR(pobj);
> > +
> > +       list_add(&pobj->node, &perf_dev->children);
> > +
> > +       for (i = 0; i < PERF_MAX_PORT_NUM; i++) {
> > +               obj = create_perf_obj(perf_dev->feature, &pobj->kobj, i,
> > +                                     fabric_attr_groups, "port");
> > +               if (IS_ERR(obj))
> > +                       return PTR_ERR(obj);
> > +
> > +               list_add(&obj->node, &pobj->children);
> > +       }
> > +
> > +       return 0;
> > +}
> > +
> > +static int fme_perf_init(struct platform_device *pdev,
> > +                        struct dfl_feature *feature)
> > +{
> > +       struct perf_object *perf_dev;
> > +       int ret;
> > +
> > +       perf_dev = create_perf_dev(feature);
> > +       if (IS_ERR(perf_dev))
> > +               return PTR_ERR(perf_dev);
> > +
> > +       ret = create_perf_fabric_obj(perf_dev);
> > +       if (ret)
> > +               goto done;
> > +
> > +       if (feature->id == FME_FEATURE_ID_GLOBAL_IPERF) {
> > +               /*
> > +                * Cache and IOMMU(VT-D) performance counters are not supported
> > +                * on discreted solutions e.g. Intel Programmable Acceleration
> > +                * Card based on PCIe.
> > +                */
> > +               ret = create_perf_cache_obj(perf_dev);
> > +               if (ret)
> > +                       goto done;
> > +
> > +               ret = create_perf_iommu_obj(perf_dev);
> > +               if (ret)
> > +                       goto done;
> > +       }
> > +
> > +       feature->priv = perf_dev;
> > +       return 0;
> > +
> > +done:
> > +       destroy_perf_obj(perf_dev);
> > +       return ret;
> > +}
> > +
> > +static void fme_perf_uinit(struct platform_device *pdev,
> > +                          struct dfl_feature *feature)
> > +{
> > +       struct perf_object *perf_dev = feature->priv;
> > +
> > +       destroy_perf_obj(perf_dev);
> > +}
> > +
> > +const struct dfl_feature_id fme_perf_id_table[] = {
> > +       {.id = FME_FEATURE_ID_GLOBAL_IPERF,},
> > +       {.id = FME_FEATURE_ID_GLOBAL_DPERF,},
> > +       {0,}
> > +};
> > +
> > +const struct dfl_feature_ops fme_perf_ops = {
> > +       .init = fme_perf_init,
> > +       .uinit = fme_perf_uinit,
> > +};
> > diff --git a/drivers/fpga/dfl-fme.h b/drivers/fpga/dfl-fme.h
> > index 5fbe3f5..dc71048 100644
> > --- a/drivers/fpga/dfl-fme.h
> > +++ b/drivers/fpga/dfl-fme.h
> > @@ -39,5 +39,7 @@ struct dfl_fme {
> >  extern const struct dfl_feature_id fme_pr_mgmt_id_table[];
> >  extern const struct dfl_feature_ops fme_global_err_ops;
> >  extern const struct dfl_feature_id fme_global_err_id_table[];
> > +extern const struct dfl_feature_ops fme_perf_ops;
> > +extern const struct dfl_feature_id fme_perf_id_table[];
> >
> >  #endif /* __DFL_FME_H */
> > diff --git a/drivers/fpga/dfl.c b/drivers/fpga/dfl.c
> > index 1bb2b58..050b18b 100644
> > --- a/drivers/fpga/dfl.c
> > +++ b/drivers/fpga/dfl.c
> > @@ -521,6 +521,7 @@ static int build_info_commit_dev(struct build_feature_devs_info *binfo)
> >                 struct dfl_feature *feature = &pdata->features[index];
> >
> >                 /* save resource information for each feature */
> > +               feature->pdev = fdev;
> >                 feature->id = finfo->fid;
> >                 feature->resource_index = index;
> >                 feature->ioaddr = finfo->ioaddr;
> > diff --git a/drivers/fpga/dfl.h b/drivers/fpga/dfl.h
> > index 6c32080..bf23436 100644
> > --- a/drivers/fpga/dfl.h
> > +++ b/drivers/fpga/dfl.h
> > @@ -191,6 +191,7 @@ struct dfl_feature_driver {
> >  /**
> >   * struct dfl_feature - sub feature of the feature devices
> >   *
> > + * @pdev: parent platform device.
> >   * @id: sub feature id.
> >   * @resource_index: each sub feature has one mmio resource for its registers.
> >   *                 this index is used to find its mmio resource from the
> > @@ -200,6 +201,7 @@ struct dfl_feature_driver {
> >   * @priv: priv data of this feature.
> >   */
> >  struct dfl_feature {
> > +       struct platform_device *pdev;
> >         u64 id;
> >         int resource_index;
> >         void __iomem *ioaddr;
> > --
> > 1.8.3.1
> >

^ permalink raw reply

* Re: [PATCH v1 1/2] fork: add clone3
From: Yann Droneaud @ 2019-06-01  8:49 UTC (permalink / raw)
  To: Christian Brauner
  Cc: viro, linux-kernel, torvalds, jannh, fweimer, oleg, arnd,
	dhowells, Pavel Emelyanov, Andrew Morton, Adrian Reber,
	Andrei Vagin, linux-api
In-Reply-To: <20190531220815.owrc5kbbdemmwdhs@brauner.io>

Hi,

Le samedi 01 juin 2019 à 00:08 +0200, Christian Brauner a écrit :
> On Wed, May 29, 2019 at 05:42:14PM +0200, Yann Droneaud wrote:
> > Le mercredi 29 mai 2019 à 17:22 +0200, Christian Brauner a écrit :
> > > This adds the clone3 system call.
> > > 
> > > 
> > > diff --git a/kernel/fork.c b/kernel/fork.c
> > > index b4cba953040a..6bc3e3d17150 100644
> > > --- a/kernel/fork.c
> > > +++ b/kernel/fork.c
> > > @@ -2472,7 +2475,96 @@ SYSCALL_DEFINE5(clone, unsigned long,
> > > clone_flags, unsigned long, newsp,
> > >  		 unsigned long, tls)
> > >  #endif
> > >  {
> > > -	return _do_fork(clone_flags, newsp, 0, parent_tidptr,
> > > child_tidptr, tls);
> > > +	struct kernel_clone_args args = {
> > > +		.flags = clone_flags,
> > > +		.stack = newsp,
> > > +		.pidfd = parent_tidptr,
> > > +		.parent_tidptr = parent_tidptr,
> > > +		.tls = tls,
> > > +		.child_tidptr = child_tidptr,
> > > +	};
> > > +
> > > +	/* clone(CLONE_PIDFD) uses parent_tidptr to return a pidfd */
> > > +	if ((clone_flags & CLONE_PIDFD) && (clone_flags &
> > > CLONE_PARENT_SETTID))
> > > +		return -EINVAL;
> > > +
> > > +	return _do_fork(&args);
> > > +}
> > > +
> > > +static bool clone3_flags_valid(u64 flags)
> > > +{
> > > +	if (flags & CLONE_DETACHED)
> > > +		return false;
> > > +
> > > +	if (flags & ~CLONE_MAX)
> > > +		return false;
> > > +
> > > +	return true;
> > > +}
> > > +
> > > +static int copy_clone_args_from_user(struct kernel_clone_args
> > > *kargs,
> > > +				     struct clone_args __user *uargs,
> > > +				     size_t size)
> > > +{
> > > +	struct clone_args args;
> > > +
> > > +	if (unlikely(size > PAGE_SIZE))
> > > +		return -E2BIG;
> > > +
> > > +	if (unlikely(size < sizeof(struct clone_args)))
> > > +		return -EINVAL;
> > > +
> > > +	if (unlikely(!access_ok(uargs, size)))
> > > +		return -EFAULT;
> > > +
> > > +	if (size > sizeof(struct clone_args)) {
> > > +		unsigned char __user *addr;
> > > +		unsigned char __user *end;
> > > +		unsigned char val;
> > > +
> > > +		addr = (void __user *)uargs + sizeof(struct
> > > clone_args);
> > > +		end = (void __user *)uargs + size;
> > > +
> > > +		for (; addr < end; addr++) {
> > > +			if (get_user(val, addr))
> > > +				return -EFAULT;
> > > +			if (val)
> > > +				return -E2BIG;
> > 
> > Should be -EINVAL: having something after the structure should be
> > handled just like an invalid flags, while still allowing future
> > userspace program to probe for support for newer feature.
> 
> (Traveling until Monday, so sorry for delayed responses.)
> 
> This copies what:
> 
> kernel/sched/core.c:sched_copy_attr()
> kernel/event/core.c:perf_copy_attr()
> 
> are already doing. Consistency might be good here but, I think.
> 

I would have prefer all the above to returns -EINVAL for consistency
with the unknown flags check ... 

"Designing the API: Planning for Extension" [1] doesn't mandate return
-EINVAL for that case, but does make perf_event_open() and
perf_copy_attr() the example to follow ... so you're right.

[1] 
https://www.kernel.org/doc/html/v5.1/process/adding-syscalls.html#designing-the-api-planning-for-extension

Regards.

-- 
Yann Droneaud
OPTEYA

^ permalink raw reply

* Re: [PATCH v3 16/16] fpga: dfl: fme: add performance reporting support
From: Wu Hao @ 2019-06-01  9:11 UTC (permalink / raw)
  To: Greg KH
  Cc: atull, mdf, linux-fpga, linux-kernel, linux-api, Luwei Kang,
	Xu Yilun
In-Reply-To: <20190530190305.GA2909@kroah.com>

On Thu, May 30, 2019 at 12:03:05PM -0700, Greg KH wrote:
> On Mon, May 27, 2019 at 01:22:26PM +0800, Wu Hao wrote:
> > --- /dev/null
> > +++ b/drivers/fpga/dfl-fme-perf.c
> > @@ -0,0 +1,962 @@
> > +// SPDX-License-Identifier: GPL-2.0
> > +/*
> > + * Driver for FPGA Management Engine (FME) Global Performance Reporting
> > + *
> > + * Copyright 2019 Intel Corporation, Inc.
> > + *
> > + * Authors:
> > + *   Kang Luwei <luwei.kang@intel.com>
> > + *   Xiao Guangrong <guangrong.xiao@linux.intel.com>
> > + *   Wu Hao <hao.wu@intel.com>
> > + *   Joseph Grecco <joe.grecco@intel.com>
> > + *   Enno Luebbers <enno.luebbers@intel.com>
> > + *   Tim Whisonant <tim.whisonant@intel.com>
> > + *   Ananda Ravuri <ananda.ravuri@intel.com>
> > + *   Mitchel, Henry <henry.mitchel@intel.com>
> > + */
> > +
> > +#include "dfl.h"
> > +#include "dfl-fme.h"
> > +
> > +/*
> > + * Performance Counter Registers for Cache.
> > + *
> > + * Cache Events are listed below as CACHE_EVNT_*.
> > + */
> > +#define CACHE_CTRL			0x8
> > +#define CACHE_RESET_CNTR		BIT_ULL(0)
> > +#define CACHE_FREEZE_CNTR		BIT_ULL(8)
> > +#define CACHE_CTRL_EVNT			GENMASK_ULL(19, 16)
> > +#define CACHE_EVNT_RD_HIT		0x0
> > +#define CACHE_EVNT_WR_HIT		0x1
> > +#define CACHE_EVNT_RD_MISS		0x2
> > +#define CACHE_EVNT_WR_MISS		0x3
> > +#define CACHE_EVNT_RSVD			0x4
> > +#define CACHE_EVNT_HOLD_REQ		0x5
> > +#define CACHE_EVNT_DATA_WR_PORT_CONTEN	0x6
> > +#define CACHE_EVNT_TAG_WR_PORT_CONTEN	0x7
> > +#define CACHE_EVNT_TX_REQ_STALL		0x8
> > +#define CACHE_EVNT_RX_REQ_STALL		0x9
> > +#define CACHE_EVNT_EVICTIONS		0xa
> > +#define CACHE_EVNT_MAX			CACHE_EVNT_EVICTIONS
> > +#define CACHE_CHANNEL_SEL		BIT_ULL(20)
> > +#define CACHE_CHANNEL_RD		0
> > +#define CACHE_CHANNEL_WR		1
> > +#define CACHE_CHANNEL_MAX		2
> > +#define CACHE_CNTR0			0x10
> > +#define CACHE_CNTR1			0x18
> > +#define CACHE_CNTR_EVNT_CNTR		GENMASK_ULL(47, 0)
> > +#define CACHE_CNTR_EVNT			GENMASK_ULL(63, 60)
> > +
> > +/*
> > + * Performance Counter Registers for Fabric.
> > + *
> > + * Fabric Events are listed below as FAB_EVNT_*
> > + */
> > +#define FAB_CTRL			0x20
> > +#define FAB_RESET_CNTR			BIT_ULL(0)
> > +#define FAB_FREEZE_CNTR			BIT_ULL(8)
> > +#define FAB_CTRL_EVNT			GENMASK_ULL(19, 16)
> > +#define FAB_EVNT_PCIE0_RD		0x0
> > +#define FAB_EVNT_PCIE0_WR		0x1
> > +#define FAB_EVNT_PCIE1_RD		0x2
> > +#define FAB_EVNT_PCIE1_WR		0x3
> > +#define FAB_EVNT_UPI_RD			0x4
> > +#define FAB_EVNT_UPI_WR			0x5
> > +#define FAB_EVNT_MMIO_RD		0x6
> > +#define FAB_EVNT_MMIO_WR		0x7
> > +#define FAB_EVNT_MAX			FAB_EVNT_MMIO_WR
> > +#define FAB_PORT_ID			GENMASK_ULL(21, 20)
> > +#define FAB_PORT_FILTER			BIT_ULL(23)
> > +#define FAB_PORT_FILTER_DISABLE		0
> > +#define FAB_PORT_FILTER_ENABLE		1
> > +#define FAB_CNTR			0x28
> > +#define FAB_CNTR_EVNT_CNTR		GENMASK_ULL(59, 0)
> > +#define FAB_CNTR_EVNT			GENMASK_ULL(63, 60)
> > +
> > +/*
> > + * Performance Counter Registers for Clock.
> > + *
> > + * Clock Counter can't be reset or frozen by SW.
> > + */
> > +#define CLK_CNTR			0x30
> > +
> > +/*
> > + * Performance Counter Registers for IOMMU / VT-D.
> > + *
> > + * VT-D Events are listed below as VTD_EVNT_* and VTD_SIP_EVNT_*
> > + */
> > +#define VTD_CTRL			0x38
> > +#define VTD_RESET_CNTR			BIT_ULL(0)
> > +#define VTD_FREEZE_CNTR			BIT_ULL(8)
> > +#define VTD_CTRL_EVNT			GENMASK_ULL(19, 16)
> > +#define VTD_EVNT_AFU_MEM_RD_TRANS	0x0
> > +#define VTD_EVNT_AFU_MEM_WR_TRANS	0x1
> > +#define VTD_EVNT_AFU_DEVTLB_RD_HIT	0x2
> > +#define VTD_EVNT_AFU_DEVTLB_WR_HIT	0x3
> > +#define VTD_EVNT_DEVTLB_4K_FILL		0x4
> > +#define VTD_EVNT_DEVTLB_2M_FILL		0x5
> > +#define VTD_EVNT_DEVTLB_1G_FILL		0x6
> > +#define VTD_EVNT_MAX			VTD_EVNT_DEVTLB_1G_FILL
> > +#define VTD_CNTR			0x40
> > +#define VTD_CNTR_EVNT			GENMASK_ULL(63, 60)
> > +#define VTD_CNTR_EVNT_CNTR		GENMASK_ULL(47, 0)
> > +#define VTD_SIP_CTRL			0x48
> > +#define VTD_SIP_RESET_CNTR		BIT_ULL(0)
> > +#define VTD_SIP_FREEZE_CNTR		BIT_ULL(8)
> > +#define VTD_SIP_CTRL_EVNT		GENMASK_ULL(19, 16)
> > +#define VTD_SIP_EVNT_IOTLB_4K_HIT	0x0
> > +#define VTD_SIP_EVNT_IOTLB_2M_HIT	0x1
> > +#define VTD_SIP_EVNT_IOTLB_1G_HIT	0x2
> > +#define VTD_SIP_EVNT_SLPWC_L3_HIT	0x3
> > +#define VTD_SIP_EVNT_SLPWC_L4_HIT	0x4
> > +#define VTD_SIP_EVNT_RCC_HIT		0x5
> > +#define VTD_SIP_EVNT_IOTLB_4K_MISS	0x6
> > +#define VTD_SIP_EVNT_IOTLB_2M_MISS	0x7
> > +#define VTD_SIP_EVNT_IOTLB_1G_MISS	0x8
> > +#define VTD_SIP_EVNT_SLPWC_L3_MISS	0x9
> > +#define VTD_SIP_EVNT_SLPWC_L4_MISS	0xa
> > +#define VTD_SIP_EVNT_RCC_MISS		0xb
> > +#define VTD_SIP_EVNT_MAX		VTD_SIP_EVNT_RCC_MISS
> > +#define VTD_SIP_CNTR			0X50
> > +#define VTD_SIP_CNTR_EVNT		GENMASK_ULL(63, 60)
> > +#define VTD_SIP_CNTR_EVNT_CNTR		GENMASK_ULL(47, 0)
> > +
> > +#define PERF_OBJ_ROOT_ID		(~0)
> > +
> > +#define PERF_TIMEOUT			30
> > +
> > +/**
> > + * struct perf_object - object of performance counter
> > + *
> > + * @id: instance id. PERF_OBJ_ROOT_ID indicates it is a parent object which
> > + *      counts performance counters for all instances.
> > + * @attr_groups: the sysfs files are associated with this object.
> > + * @feature: pointer to related private feature.
> > + * @node: used to link itself to parent's children list.
> > + * @children: used to link its children objects together.
> > + * @kobj: generic kobject interface.
> > + *
> > + * 'node' and 'children' are used to construct parent-children hierarchy.
> > + */
> > +struct perf_object {
> > +	int id;
> > +	const struct attribute_group **attr_groups;
> > +	struct dfl_feature *feature;
> > +
> > +	struct list_head node;
> > +	struct list_head children;
> > +	struct kobject kobj;
> 
> Woah, why are you using a "raw" kobject and not a 'struct device' here?
> You just broke userspace and no libraries will see your kobject's
> properties as the "chain" of struct devices is not happening anymore.
> 
> Why can this not just be a 'struct device'?

Hi Greg,

Many thanks for the review and comments.

Actually we are just trying to create sysfs hierarchy for performance
counters using these data structures.

If we use 'struct device' instead of kobject, then we have to let userspace
code to deal with device's sysfs (e.g. ignore 'uevent' below). This is the
only concern from my side now, as I know that using 'struct device'
saves code as we don't need to introduce a new perf_obj_attribute then.

dfl-fme.0/perf/
 ├── iommu
 │   ├── afu0
 │   │   ├── devtlb_1g_fill
 │   │   ├── devtlb_2m_fill
 │   │   ├── devtlb_4k_fill
 │   │   ├── devtlb_read_hit
 │   │   ├── devtlb_write_hit
 │   │   ├── read_transaction
 │   │   ├── uevent
 │   │   └── write_transaction
 │   ├── freeze
 │   ├── iotlb_1g_hit
 │   ├── iotlb_1g_miss
	 ...
     └── uevent
 ...

Do you think if we could keep it or it's better to use 'struct device'?

> 
> 
> > +};
> > +
> > +/**
> > + * struct perf_obj_attribute - attribute of perf object
> > + *
> > + * @attr: attribute of this perf object.
> > + * @show: show callback for sysfs attribute.
> > + * @store: store callback for sysfs attribute.
> > + */
> > +struct perf_obj_attribute {
> > +	struct attribute attr;
> > +	ssize_t (*show)(struct perf_object *pobj, char *buf);
> > +	ssize_t (*store)(struct perf_object *pobj,
> > +			 const char *buf, size_t n);
> > +};
> > +
> > +#define to_perf_obj_attr(_attr)					\
> > +		container_of(_attr, struct perf_obj_attribute, attr)
> > +#define to_perf_obj(_kobj)					\
> > +		container_of(_kobj, struct perf_object, kobj)
> > +
> > +#define __POBJ_ATTR(_name, _mode, _show, _store) {			\
> > +	.attr = {.name = __stringify(_name),				\
> > +		 .mode = VERIFY_OCTAL_PERMISSIONS(_mode) },		\
> > +	.show   = _show,						\
> > +	.store  = _store,						\
> > +}
> > +
> > +#define PERF_OBJ_ATTR_F_RO(_name, _filename)				\
> > +struct perf_obj_attribute perf_obj_attr_##_name =			\
> > +	__POBJ_ATTR(_filename, 0444, _name##_show, NULL)
> > +
> > +#define PERF_OBJ_ATTR_F_WO(_name, _filename)				\
> > +struct perf_obj_attribute perf_obj_attr_##_name =			\
> > +	__POBJ_ATTR(_filename, 0200, NULL, _name##_store)
> > +
> > +#define PERF_OBJ_ATTR_F_RW(_name, _filename)				\
> > +struct perf_obj_attribute perf_obj_attr_##_name =			\
> > +	__POBJ_ATTR(_filename, 0644, _name##_show, _name##_store)
> > +
> > +#define PERF_OBJ_ATTR_RO(_name)						\
> > +struct perf_obj_attribute perf_obj_attr_##_name =			\
> > +	__POBJ_ATTR(_name, 0444, _name##_show, NULL)
> > +
> > +#define PERF_OBJ_ATTR_WO(_name)						\
> > +struct perf_obj_attribute perf_obj_attr_##_name =			\
> > +	__POBJ_ATTR(_name, 0200, NULL, _name##_store)
> > +
> > +#define PERF_OBJ_ATTR_RW(_name)						\
> > +struct perf_obj_attribute perf_obj_attr_##_name =			\
> > +	__POBJ_ATTR(_name, 0644, _name##_show, _name##_store)
> 
> When you have to roll your own sysfs attributes for a single driver,
> that is a HUGE hint you are doing something wrong.  No driver for an
> individual device should EVER have to do this.
> 
> Please use the driver core properly and do not route around it.

Sure. Thanks!

Hao

> 
> thanks,
> 
> greg k-h

^ permalink raw reply

* Re: [PATCH v3 16/16] fpga: dfl: fme: add performance reporting support
From: Greg KH @ 2019-06-01  9:42 UTC (permalink / raw)
  To: Wu Hao
  Cc: atull, mdf, linux-fpga, linux-kernel, linux-api, Luwei Kang,
	Xu Yilun
In-Reply-To: <20190601091147.GB3743@hao-dev>

On Sat, Jun 01, 2019 at 05:11:47PM +0800, Wu Hao wrote:
> On Thu, May 30, 2019 at 12:03:05PM -0700, Greg KH wrote:
> > On Mon, May 27, 2019 at 01:22:26PM +0800, Wu Hao wrote:
> > > --- /dev/null
> > > +++ b/drivers/fpga/dfl-fme-perf.c
> > > @@ -0,0 +1,962 @@
> > > +// SPDX-License-Identifier: GPL-2.0
> > > +/*
> > > + * Driver for FPGA Management Engine (FME) Global Performance Reporting
> > > + *
> > > + * Copyright 2019 Intel Corporation, Inc.
> > > + *
> > > + * Authors:
> > > + *   Kang Luwei <luwei.kang@intel.com>
> > > + *   Xiao Guangrong <guangrong.xiao@linux.intel.com>
> > > + *   Wu Hao <hao.wu@intel.com>
> > > + *   Joseph Grecco <joe.grecco@intel.com>
> > > + *   Enno Luebbers <enno.luebbers@intel.com>
> > > + *   Tim Whisonant <tim.whisonant@intel.com>
> > > + *   Ananda Ravuri <ananda.ravuri@intel.com>
> > > + *   Mitchel, Henry <henry.mitchel@intel.com>
> > > + */
> > > +
> > > +#include "dfl.h"
> > > +#include "dfl-fme.h"
> > > +
> > > +/*
> > > + * Performance Counter Registers for Cache.
> > > + *
> > > + * Cache Events are listed below as CACHE_EVNT_*.
> > > + */
> > > +#define CACHE_CTRL			0x8
> > > +#define CACHE_RESET_CNTR		BIT_ULL(0)
> > > +#define CACHE_FREEZE_CNTR		BIT_ULL(8)
> > > +#define CACHE_CTRL_EVNT			GENMASK_ULL(19, 16)
> > > +#define CACHE_EVNT_RD_HIT		0x0
> > > +#define CACHE_EVNT_WR_HIT		0x1
> > > +#define CACHE_EVNT_RD_MISS		0x2
> > > +#define CACHE_EVNT_WR_MISS		0x3
> > > +#define CACHE_EVNT_RSVD			0x4
> > > +#define CACHE_EVNT_HOLD_REQ		0x5
> > > +#define CACHE_EVNT_DATA_WR_PORT_CONTEN	0x6
> > > +#define CACHE_EVNT_TAG_WR_PORT_CONTEN	0x7
> > > +#define CACHE_EVNT_TX_REQ_STALL		0x8
> > > +#define CACHE_EVNT_RX_REQ_STALL		0x9
> > > +#define CACHE_EVNT_EVICTIONS		0xa
> > > +#define CACHE_EVNT_MAX			CACHE_EVNT_EVICTIONS
> > > +#define CACHE_CHANNEL_SEL		BIT_ULL(20)
> > > +#define CACHE_CHANNEL_RD		0
> > > +#define CACHE_CHANNEL_WR		1
> > > +#define CACHE_CHANNEL_MAX		2
> > > +#define CACHE_CNTR0			0x10
> > > +#define CACHE_CNTR1			0x18
> > > +#define CACHE_CNTR_EVNT_CNTR		GENMASK_ULL(47, 0)
> > > +#define CACHE_CNTR_EVNT			GENMASK_ULL(63, 60)
> > > +
> > > +/*
> > > + * Performance Counter Registers for Fabric.
> > > + *
> > > + * Fabric Events are listed below as FAB_EVNT_*
> > > + */
> > > +#define FAB_CTRL			0x20
> > > +#define FAB_RESET_CNTR			BIT_ULL(0)
> > > +#define FAB_FREEZE_CNTR			BIT_ULL(8)
> > > +#define FAB_CTRL_EVNT			GENMASK_ULL(19, 16)
> > > +#define FAB_EVNT_PCIE0_RD		0x0
> > > +#define FAB_EVNT_PCIE0_WR		0x1
> > > +#define FAB_EVNT_PCIE1_RD		0x2
> > > +#define FAB_EVNT_PCIE1_WR		0x3
> > > +#define FAB_EVNT_UPI_RD			0x4
> > > +#define FAB_EVNT_UPI_WR			0x5
> > > +#define FAB_EVNT_MMIO_RD		0x6
> > > +#define FAB_EVNT_MMIO_WR		0x7
> > > +#define FAB_EVNT_MAX			FAB_EVNT_MMIO_WR
> > > +#define FAB_PORT_ID			GENMASK_ULL(21, 20)
> > > +#define FAB_PORT_FILTER			BIT_ULL(23)
> > > +#define FAB_PORT_FILTER_DISABLE		0
> > > +#define FAB_PORT_FILTER_ENABLE		1
> > > +#define FAB_CNTR			0x28
> > > +#define FAB_CNTR_EVNT_CNTR		GENMASK_ULL(59, 0)
> > > +#define FAB_CNTR_EVNT			GENMASK_ULL(63, 60)
> > > +
> > > +/*
> > > + * Performance Counter Registers for Clock.
> > > + *
> > > + * Clock Counter can't be reset or frozen by SW.
> > > + */
> > > +#define CLK_CNTR			0x30
> > > +
> > > +/*
> > > + * Performance Counter Registers for IOMMU / VT-D.
> > > + *
> > > + * VT-D Events are listed below as VTD_EVNT_* and VTD_SIP_EVNT_*
> > > + */
> > > +#define VTD_CTRL			0x38
> > > +#define VTD_RESET_CNTR			BIT_ULL(0)
> > > +#define VTD_FREEZE_CNTR			BIT_ULL(8)
> > > +#define VTD_CTRL_EVNT			GENMASK_ULL(19, 16)
> > > +#define VTD_EVNT_AFU_MEM_RD_TRANS	0x0
> > > +#define VTD_EVNT_AFU_MEM_WR_TRANS	0x1
> > > +#define VTD_EVNT_AFU_DEVTLB_RD_HIT	0x2
> > > +#define VTD_EVNT_AFU_DEVTLB_WR_HIT	0x3
> > > +#define VTD_EVNT_DEVTLB_4K_FILL		0x4
> > > +#define VTD_EVNT_DEVTLB_2M_FILL		0x5
> > > +#define VTD_EVNT_DEVTLB_1G_FILL		0x6
> > > +#define VTD_EVNT_MAX			VTD_EVNT_DEVTLB_1G_FILL
> > > +#define VTD_CNTR			0x40
> > > +#define VTD_CNTR_EVNT			GENMASK_ULL(63, 60)
> > > +#define VTD_CNTR_EVNT_CNTR		GENMASK_ULL(47, 0)
> > > +#define VTD_SIP_CTRL			0x48
> > > +#define VTD_SIP_RESET_CNTR		BIT_ULL(0)
> > > +#define VTD_SIP_FREEZE_CNTR		BIT_ULL(8)
> > > +#define VTD_SIP_CTRL_EVNT		GENMASK_ULL(19, 16)
> > > +#define VTD_SIP_EVNT_IOTLB_4K_HIT	0x0
> > > +#define VTD_SIP_EVNT_IOTLB_2M_HIT	0x1
> > > +#define VTD_SIP_EVNT_IOTLB_1G_HIT	0x2
> > > +#define VTD_SIP_EVNT_SLPWC_L3_HIT	0x3
> > > +#define VTD_SIP_EVNT_SLPWC_L4_HIT	0x4
> > > +#define VTD_SIP_EVNT_RCC_HIT		0x5
> > > +#define VTD_SIP_EVNT_IOTLB_4K_MISS	0x6
> > > +#define VTD_SIP_EVNT_IOTLB_2M_MISS	0x7
> > > +#define VTD_SIP_EVNT_IOTLB_1G_MISS	0x8
> > > +#define VTD_SIP_EVNT_SLPWC_L3_MISS	0x9
> > > +#define VTD_SIP_EVNT_SLPWC_L4_MISS	0xa
> > > +#define VTD_SIP_EVNT_RCC_MISS		0xb
> > > +#define VTD_SIP_EVNT_MAX		VTD_SIP_EVNT_RCC_MISS
> > > +#define VTD_SIP_CNTR			0X50
> > > +#define VTD_SIP_CNTR_EVNT		GENMASK_ULL(63, 60)
> > > +#define VTD_SIP_CNTR_EVNT_CNTR		GENMASK_ULL(47, 0)
> > > +
> > > +#define PERF_OBJ_ROOT_ID		(~0)
> > > +
> > > +#define PERF_TIMEOUT			30
> > > +
> > > +/**
> > > + * struct perf_object - object of performance counter
> > > + *
> > > + * @id: instance id. PERF_OBJ_ROOT_ID indicates it is a parent object which
> > > + *      counts performance counters for all instances.
> > > + * @attr_groups: the sysfs files are associated with this object.
> > > + * @feature: pointer to related private feature.
> > > + * @node: used to link itself to parent's children list.
> > > + * @children: used to link its children objects together.
> > > + * @kobj: generic kobject interface.
> > > + *
> > > + * 'node' and 'children' are used to construct parent-children hierarchy.
> > > + */
> > > +struct perf_object {
> > > +	int id;
> > > +	const struct attribute_group **attr_groups;
> > > +	struct dfl_feature *feature;
> > > +
> > > +	struct list_head node;
> > > +	struct list_head children;
> > > +	struct kobject kobj;
> > 
> > Woah, why are you using a "raw" kobject and not a 'struct device' here?
> > You just broke userspace and no libraries will see your kobject's
> > properties as the "chain" of struct devices is not happening anymore.
> > 
> > Why can this not just be a 'struct device'?
> 
> Hi Greg,
> 
> Many thanks for the review and comments.
> 
> Actually we are just trying to create sysfs hierarchy for performance
> counters using these data structures.
> 
> If we use 'struct device' instead of kobject, then we have to let userspace
> code to deal with device's sysfs (e.g. ignore 'uevent' below). This is the
> only concern from my side now, as I know that using 'struct device'
> saves code as we don't need to introduce a new perf_obj_attribute then.
> 
> dfl-fme.0/perf/
>  ├── iommu
>  │   ├── afu0
>  │   │   ├── devtlb_1g_fill
>  │   │   ├── devtlb_2m_fill
>  │   │   ├── devtlb_4k_fill
>  │   │   ├── devtlb_read_hit
>  │   │   ├── devtlb_write_hit
>  │   │   ├── read_transaction
>  │   │   ├── uevent
>  │   │   └── write_transaction
>  │   ├── freeze
>  │   ├── iotlb_1g_hit
>  │   ├── iotlb_1g_miss
> 	 ...
>      └── uevent
>  ...
> 
> Do you think if we could keep it or it's better to use 'struct device'?

What about using the attribute group name?  That gives you a subdir for
free.  Doing anything "deeper" than one level means that you really have
a child device, and yes, you need to use a 'struct device'.  Make it
part of your bus and just have it be a different "type" and all should
be good.

Again, NEVER use a raw kobject as a child of a 'struct device', that
will break things.

And please cc: me on this series from now on, as you are obviously
trying to do complex things with the driver model and sysfs and it is
easy to get very wrong.

But wait, step back, why does this one driver have such a "special"
user/kernel api that unique to it and nothing else?  That's also a big
red flag, why not just use the normal perf api that everyone else uses?

thanks,

greg k-h

^ permalink raw reply

* Re: [PATCH 09/25] vfs: Allow mount information to be queried by fsinfo() [ver #13]
From: Joel Fernandes @ 2019-06-01 16:08 UTC (permalink / raw)
  To: David Howells
  Cc: viro, raven, linux-api, linux-fsdevel, linux-kernel, mszeredi
In-Reply-To: <155905633578.1662.8087594848892366318.stgit@warthog.procyon.org.uk>

On Tue, May 28, 2019 at 04:12:15PM +0100, David Howells wrote:
[snip]
> +
> +/*
> + * Store a mount record into the fsinfo buffer.
> + */
> +static void store_mount_fsinfo(struct fsinfo_kparams *params,
> +			       struct fsinfo_mount_child *child)
> +{
> +	unsigned int usage = params->usage;
> +	unsigned int total = sizeof(*child);
> +
> +	if (params->usage >= INT_MAX)
> +		return;
> +	params->usage = usage + total;
> +	if (params->buffer && params->usage <= params->buf_size)
> +		memcpy(params->buffer + usage, child, total);
> +}
> +
> +/*
> + * Return information about the submounts relative to path.
> + */
> +int fsinfo_generic_mount_children(struct path *path, struct fsinfo_kparams *params)
> +{
> +	struct fsinfo_mount_child record;
> +	struct mount *m, *child;
> +
> +	if (!path->mnt)
> +		return -ENODATA;
> +
> +	rcu_read_lock();
> +
> +	m = real_mount(path->mnt);
> +	list_for_each_entry_rcu(child, &m->mnt_mounts, mnt_child) {
> +		if (child->mnt_parent != m)
> +			continue;
> +		record.mnt_id = child->mnt_id;
> +		record.notify_counter = atomic_read(&child->mnt_notify_counter);
> +		store_mount_fsinfo(params, &record);
> +	}
> +
> +	record.mnt_id = m->mnt_id;
> +	record.notify_counter = atomic_read(&m->mnt_notify_counter);
> +	store_mount_fsinfo(params, &record);
> +
> +	rcu_read_unlock();

Not super familiar with this code, but wanted to check with you:

Here, if the rcu_read_lock is supposed to protect the RCU list, can
rcu_read_lock() scope be reduced to just wrapping around the
list_for_each_entry_rcu?

  rcu_read_lock();
  list_for_each_entry_rcu(..) {
  	...
  }
  rcu_read_unlock();

(and similarly to other similar parts of this patch).

thanks,

  - Joel

> +	return params->usage;
> +}
> +
> +/*
> + * Return the path of the Nth submount relative to path.  This is derived from
> + * d_path(), but the root determination is more complicated.
> + */
> +int fsinfo_generic_mount_submount(struct path *path, struct fsinfo_kparams *params)
> +{
> +	struct mountpoint *mp;
> +	struct mount *m, *child;
> +	struct path mountpoint, root;
> +	unsigned int n = params->Nth;
> +	size_t len;
> +	void *p;
> +
> +	if (!path->mnt)
> +		return -ENODATA;
> +
> +	rcu_read_lock();
> +
> +	m = real_mount(path->mnt);
> +	list_for_each_entry_rcu(child, &m->mnt_mounts, mnt_child) {
> +		mp = READ_ONCE(child->mnt_mp);
> +		if (child->mnt_parent != m || !mp)
> +			continue;
> +		if (n-- == 0)
> +			goto found;
> +	}
> +	rcu_read_unlock();
> +	return -ENODATA;
> +
> +found:
> +	mountpoint.mnt = path->mnt;
> +	mountpoint.dentry = READ_ONCE(mp->m_dentry);
> +
> +	get_fs_root_rcu(current->fs, &root);
> +	if (root.mnt != path->mnt) {
> +		root.mnt = path->mnt;
> +		root.dentry = path->mnt->mnt_root;
> +	}
> +
> +	p = __d_path(&mountpoint, &root, params->buffer, params->buf_size);
> +	rcu_read_unlock();
> +
> +	if (IS_ERR(p))
> +		return PTR_ERR(p);
> +	if (!p)
> +		return -EPERM;
> +
> +	len = (params->buffer + params->buf_size) - p;
> +	memmove(params->buffer, p, len);
> +	return len;
> +}
> +#endif /* CONFIG_FSINFO */
> diff --git a/include/uapi/linux/fsinfo.h b/include/uapi/linux/fsinfo.h
> index dae2e8dd757e..7f7a75e9758a 100644
> --- a/include/uapi/linux/fsinfo.h
> +++ b/include/uapi/linux/fsinfo.h
> @@ -32,6 +32,10 @@ enum fsinfo_attribute {
>  	FSINFO_ATTR_PARAM_ENUM		= 14,	/* Nth enum-to-val */
>  	FSINFO_ATTR_PARAMETERS		= 15,	/* Mount parameters (large string) */
>  	FSINFO_ATTR_LSM_PARAMETERS	= 16,	/* LSM Mount parameters (large string) */
> +	FSINFO_ATTR_MOUNT_INFO		= 17,	/* Mount object information */
> +	FSINFO_ATTR_MOUNT_DEVNAME	= 18,	/* Mount object device name (string) */
> +	FSINFO_ATTR_MOUNT_CHILDREN	= 19,	/* Submount list (array) */
> +	FSINFO_ATTR_MOUNT_SUBMOUNT	= 20,	/* Relative path of Nth submount (string) */
>  	FSINFO_ATTR__NR
>  };
>  
> @@ -268,4 +272,28 @@ struct fsinfo_param_enum {
>  	char		name[252];	/* Name of the enum value */
>  };
>  
> +/*
> + * Information struct for fsinfo(FSINFO_ATTR_MOUNT_INFO).
> + */
> +struct fsinfo_mount_info {
> +	__u64		f_sb_id;	/* Superblock ID */
> +	__u32		mnt_id;		/* Mount identifier (use with AT_FSINFO_MOUNTID_PATH) */
> +	__u32		parent_id;	/* Parent mount identifier */
> +	__u32		group_id;	/* Mount group ID */
> +	__u32		master_id;	/* Slave master group ID */
> +	__u32		from_id;	/* Slave propogated from ID */
> +	__u32		attr;		/* MOUNT_ATTR_* flags */
> +	__u32		notify_counter;	/* Number of notifications generated. */
> +	__u32		__reserved[1];
> +};
> +
> +/*
> + * Information struct element for fsinfo(FSINFO_ATTR_MOUNT_CHILDREN).
> + * - An extra element is placed on the end representing the parent mount.
> + */
> +struct fsinfo_mount_child {
> +	__u32		mnt_id;		/* Mount identifier (use with AT_FSINFO_MOUNTID_PATH) */
> +	__u32		notify_counter;	/* Number of notifications generated on mount. */
> +};
> +
>  #endif /* _UAPI_LINUX_FSINFO_H */
> diff --git a/samples/vfs/test-fsinfo.c b/samples/vfs/test-fsinfo.c
> index 90926024e1c5..a838adcdca9e 100644
> --- a/samples/vfs/test-fsinfo.c
> +++ b/samples/vfs/test-fsinfo.c
> @@ -21,10 +21,10 @@
>  #include <errno.h>
>  #include <time.h>
>  #include <math.h>
> -#include <fcntl.h>
>  #include <sys/syscall.h>
>  #include <linux/fsinfo.h>
>  #include <linux/socket.h>
> +#include <linux/fcntl.h>
>  #include <sys/stat.h>
>  #include <arpa/inet.h>
>  
> @@ -83,6 +83,10 @@ static const struct fsinfo_attr_info fsinfo_buffer_info[FSINFO_ATTR__NR] = {
>  	FSINFO_STRUCT_N		(PARAM_ENUM,		param_enum),
>  	FSINFO_OVERLARGE	(PARAMETERS,		-),
>  	FSINFO_OVERLARGE	(LSM_PARAMETERS,	-),
> +	FSINFO_STRUCT		(MOUNT_INFO,		mount_info),
> +	FSINFO_STRING		(MOUNT_DEVNAME,		mount_devname),
> +	FSINFO_STRUCT_ARRAY	(MOUNT_CHILDREN,	mount_child),
> +	FSINFO_STRING_N		(MOUNT_SUBMOUNT,	mount_submount),
>  };
>  
>  #define FSINFO_NAME(X,Y) [FSINFO_ATTR_##X] = #Y
> @@ -104,6 +108,10 @@ static const char *fsinfo_attr_names[FSINFO_ATTR__NR] = {
>  	FSINFO_NAME		(PARAM_ENUM,		param_enum),
>  	FSINFO_NAME		(PARAMETERS,		parameters),
>  	FSINFO_NAME		(LSM_PARAMETERS,	lsm_parameters),
> +	FSINFO_NAME		(MOUNT_INFO,		mount_info),
> +	FSINFO_NAME		(MOUNT_DEVNAME,		mount_devname),
> +	FSINFO_NAME		(MOUNT_CHILDREN,	mount_children),
> +	FSINFO_NAME		(MOUNT_SUBMOUNT,	mount_submount),
>  };
>  
>  union reply {
> @@ -116,6 +124,8 @@ union reply {
>  	struct fsinfo_capabilities caps;
>  	struct fsinfo_timestamp_info timestamps;
>  	struct fsinfo_volume_uuid uuid;
> +	struct fsinfo_mount_info mount_info;
> +	struct fsinfo_mount_child mount_children[1];
>  };
>  
>  static void dump_hex(unsigned int *data, int from, int to)
> @@ -312,6 +322,29 @@ static void dump_attr_VOLUME_UUID(union reply *r, int size)
>  	       f->uuid[14], f->uuid[15]);
>  }
>  
> +static void dump_attr_MOUNT_INFO(union reply *r, int size)
> +{
> +	struct fsinfo_mount_info *f = &r->mount_info;
> +
> +	printf("\n");
> +	printf("\tsb_id   : %llx\n", (unsigned long long)f->f_sb_id);
> +	printf("\tmnt_id  : %x\n", f->mnt_id);
> +	printf("\tparent  : %x\n", f->parent_id);
> +	printf("\tgroup   : %x\n", f->group_id);
> +	printf("\tattr    : %x\n", f->attr);
> +	printf("\tnotifs  : %x\n", f->notify_counter);
> +}
> +
> +static void dump_attr_MOUNT_CHILDREN(union reply *r, int size)
> +{
> +	struct fsinfo_mount_child *f = r->mount_children;
> +	int i = 0;
> +
> +	printf("\n");
> +	for (; size >= sizeof(*f); size -= sizeof(*f), f++)
> +		printf("\t[%u] %8x %8x\n", i++, f->mnt_id, f->notify_counter);
> +}
> +
>  /*
>   *
>   */
> @@ -327,6 +360,8 @@ static const dumper_t fsinfo_attr_dumper[FSINFO_ATTR__NR] = {
>  	FSINFO_DUMPER(CAPABILITIES),
>  	FSINFO_DUMPER(TIMESTAMP_INFO),
>  	FSINFO_DUMPER(VOLUME_UUID),
> +	FSINFO_DUMPER(MOUNT_INFO),
> +	FSINFO_DUMPER(MOUNT_CHILDREN),
>  };
>  
>  static void dump_fsinfo(enum fsinfo_attribute attr,
> @@ -529,16 +564,21 @@ int main(int argc, char **argv)
>  	unsigned int attr;
>  	int raw = 0, opt, Nth, Mth;
>  
> -	while ((opt = getopt(argc, argv, "adlr"))) {
> +	while ((opt = getopt(argc, argv, "Madlr"))) {
>  		switch (opt) {
> +		case 'M':
> +			params.at_flags = AT_FSINFO_MOUNTID_PATH;
> +			continue;
>  		case 'a':
>  			params.at_flags |= AT_NO_AUTOMOUNT;
> +			params.at_flags &= ~AT_FSINFO_MOUNTID_PATH;
>  			continue;
>  		case 'd':
>  			debug = true;
>  			continue;
>  		case 'l':
>  			params.at_flags &= ~AT_SYMLINK_NOFOLLOW;
> +			params.at_flags &= ~AT_FSINFO_MOUNTID_PATH;
>  			continue;
>  		case 'r':
>  			raw = 1;
> @@ -551,7 +591,8 @@ int main(int argc, char **argv)
>  	argv += optind;
>  
>  	if (argc != 1) {
> -		printf("Format: test-fsinfo [-alr] <file>\n");
> +		printf("Format: test-fsinfo [-adlr] <file>\n");
> +		printf("Format: test-fsinfo [-dr] -M <mnt_id>\n");
>  		exit(2);
>  	}
>  
> 

^ permalink raw reply

* Re: [RFC][PATCH] link.2: AT_ATOMIC_DATA and AT_ATOMIC_METADATA
From: Dave Chinner @ 2019-06-03  4:25 UTC (permalink / raw)
  To: Amir Goldstein
  Cc: Theodore Ts'o, Jan Kara, Darrick J . Wong, Chris Mason,
	Al Viro, linux-fsdevel, linux-xfs, Ext4, Linux Btrfs, Linux API
In-Reply-To: <CAOQ4uxi99NDYMrz-Q7xKta4beQiYFX3-MipZ_RxFNktFTA=vMA@mail.gmail.com>

On Sat, Jun 01, 2019 at 11:01:42AM +0300, Amir Goldstein wrote:
> On Sat, Jun 1, 2019 at 2:28 AM Dave Chinner <david@fromorbit.com> wrote:
> >
> > On Sat, Jun 01, 2019 at 08:45:49AM +1000, Dave Chinner wrote:
> > > Given that we can already use AIO to provide this sort of ordering,
> > > and AIO is vastly faster than synchronous IO, I don't see any point
> > > in adding complex barrier interfaces that can be /easily implemented
> > > in userspace/ using existing AIO primitives. You should start
> > > thinking about expanding libaio with stuff like
> > > "link_after_fdatasync()" and suddenly the whole problem of
> > > filesystem data vs metadata ordering goes away because the
> > > application directly controls all ordering without blocking and
> > > doesn't need to care what the filesystem under it does....
> >
> > And let me point out that this is also how userspace can do an
> > efficient atomic rename - rename_after_fdatasync(). i.e. on
> > completion of the AIO_FSYNC, run the rename. This guarantees that
> > the application will see either the old file of the complete new
> > file, and it *doesn't have to wait for the operation to complete*.
> > Once it is in flight, the file will contain the old data until some
> > point in the near future when will it contain the new data....
> 
> What I am looking for is a way to isolate the effects of "atomic rename/link"
> from the rest of the users.  Sure there is I/O bandwidth and queued
> bios, but at least isolate other threads working on other files or metadata
> from contending with the "atomic rename" thread of journal flushes and
> the like.

That's not a function of the kernel API. That's a function of the
implementation behind the kernel API. i.e. The API requires data to
be written before the rename/link is committed, how that is achieved
is up to the filesystem. And some filesystems will not be able to
isolate the API behavioural requirement from other users....

> Actually, one of my use cases is "atomic rename" of files with
> no data (looking for atomicity w.r.t xattr and mtime), so this "atomic rename"
> thread should not be interfering with other workloads at all.

Which should already guaranteed because a) rename is supposed to be
atomic, and b) metadata ordering requirements in journalled
filesystems. If they lose xattrs across rename, there's something
seriously wrong with the filesystem implementation.  I'm really not
sure what you think filesystems are actually doing with metadata
across rename operations....

> > Seriously, sit down and work out all the "atomic" data vs metadata
> > behaviours you want, and then tell me how many of them cannot be
> > implemented as "AIO_FSYNC w/ completion callback function" in
> > userspace. This mechanism /guarantees ordering/ at the application
> > level, the application does not block waiting for these data
> > integrity operations to complete, and you don't need any new kernel
> > side functionality to implement this.
> 
> So I think what I could have used is AIO_BATCH_FSYNC, an interface
> that was proposed by Ric Wheeler and discussed on LSF:
> https://lwn.net/Articles/789024/
> Ric was looking for a way to efficiently fsync a "bunch of files".
> Submitting several AIO_FSYNC calls is not the efficient way of doing that.

/me sighs.

That's not what I just suggested, and I've already addressed this
"AIO_FSYNC sucks" FUD in multiple separate threads.  You do realise
you can submit multiple AIO operations with a single io_submit()
call, right?

	struct iocb	ioc[10];
	struct io_event ev[10];

	for (i = 0; i < 10; i++) {
		io_prep_fsync(&ioc[i], fd[i]);
		ioc[i]->data = callback_arg[i];
	}

	io_submit(aio_ctx, 10, &ioc);
	io_getevents(aio_ctx, 10, 10, ev, NULL);

	for (i = 0; i < 10; i++)
		post_fsync_callback(&ev[i]);

There's your single syscall AIO_BATCH_FSYNC functionality, and it
implements a per-fd post-fsync callback function. This isn't rocket
science....

[snip]

> I am trying to reduce the number of fsyncs from applications
> and converting fsync to AIO_FSYNC is not going to help with that.

Your whole argument is "fsync is inefficient because cache flushes,
therefore AIO_FSYNC must be inefficient." IOWs, you've already
decided what is wrong, how it can and can't be fixed and the
solution you want regardless of whether your assertions are correct
or not. You haven't provided any evidence that a new kernel API is
the only viable solution, nor that the existing ones cannot provide
the functionality you require.

So, in the interests of /informed debate/, please implement what you
want using batched AIO_FSYNC + rename/linkat completion callback and
measure what it acheives. Then implement a sync_file_range/linkat
thread pool that provides the same functionality to the application
(i.e. writeback concurrency in userspace) and measure it. Then we
can discuss what the relative overhead is with numbers and can
perform analysis to determine what the cause of the performance
differential actually is.

Neither of these things require kernel modifications, but you need
to provide the evidence that existing APIs are insufficient.
Indeed, we now have the new async ioring stuff that can run async
sync_file_range calls, so you probably need to benchmark replacing
AIO_FSYNC with that interface as well. This new API likely does
exactly what you want without the journal/device cache flush
overhead of AIO_FSYNC....

Cheers,

Dave.
-- 
Dave Chinner
david@fromorbit.com

^ permalink raw reply

* [PATCH v1 0/4] Introduce MADV_COLD and MADV_PAGEOUT
From: Minchan Kim @ 2019-06-03  5:36 UTC (permalink / raw)
  To: Andrew Morton
  Cc: linux-mm, LKML, linux-api, Michal Hocko, Johannes Weiner,
	Tim Murray, Joel Fernandes, Suren Baghdasaryan, Daniel Colascione,
	Shakeel Butt, Sonny Rao, Brian Geffon, jannh, oleg, christian,
	oleksandr, hdanton, Minchan Kim

This patch is part of previous series:
https://lore.kernel.org/lkml/20190531064313.193437-1-minchan@kernel.org/T/#u
Originally, it was created for external madvise hinting feature.

https://lkml.org/lkml/2019/5/31/463
Michal wanted to separte the discussion from external hinting interface
so this patchset includes only first part of my entire patchset
  - introduce MADV_COLD and MADV_PAGEOUT hint to madvise.

However, I keep entire description for others for easier understanding
why this kinds of hint was born.

Thanks.

This patchset is against on next-20190530.

Below is description of previous entire patchset.
================= &< =====================

- Background

The Android terminology used for forking a new process and starting an app
from scratch is a cold start, while resuming an existing app is a hot start.
While we continually try to improve the performance of cold starts, hot
starts will always be significantly less power hungry as well as faster so
we are trying to make hot start more likely than cold start.

To increase hot start, Android userspace manages the order that apps should
be killed in a process called ActivityManagerService. ActivityManagerService
tracks every Android app or service that the user could be interacting with
at any time and translates that into a ranked list for lmkd(low memory
killer daemon). They are likely to be killed by lmkd if the system has to
reclaim memory. In that sense they are similar to entries in any other cache.
Those apps are kept alive for opportunistic performance improvements but
those performance improvements will vary based on the memory requirements of
individual workloads.

- Problem

Naturally, cached apps were dominant consumers of memory on the system.
However, they were not significant consumers of swap even though they are
good candidate for swap. Under investigation, swapping out only begins
once the low zone watermark is hit and kswapd wakes up, but the overall
allocation rate in the system might trip lmkd thresholds and cause a cached
process to be killed(we measured performance swapping out vs. zapping the
memory by killing a process. Unsurprisingly, zapping is 10x times faster
even though we use zram which is much faster than real storage) so kill
from lmkd will often satisfy the high zone watermark, resulting in very
few pages actually being moved to swap.

- Approach

The approach we chose was to use a new interface to allow userspace to
proactively reclaim entire processes by leveraging platform information.
This allowed us to bypass the inaccuracy of the kernel’s LRUs for pages
that are known to be cold from userspace and to avoid races with lmkd
by reclaiming apps as soon as they entered the cached state. Additionally,
it could provide many chances for platform to use much information to
optimize memory efficiency.

To achieve the goal, the patchset introduce two new options for madvise.
One is MADV_COLD which will deactivate activated pages and the other is
MADV_PAGEOUT which will reclaim private pages instantly. These new options
complement MADV_DONTNEED and MADV_FREE by adding non-destructive ways to
gain some free memory space. MADV_PAGEOUT is similar to MADV_DONTNEED in a way
that it hints the kernel that memory region is not currently needed and
should be reclaimed immediately; MADV_COLD is similar to MADV_FREE in a way
that it hints the kernel that memory region is not currently needed and
should be reclaimed when memory pressure rises.

This approach is similar in spirit to madvise(MADV_WONTNEED), but the
information required to make the reclaim decision is not known to the app.
Instead, it is known to a centralized userspace daemon, and that daemon
must be able to initiate reclaim on its own without any app involvement.
To solve the concern, this patch introduces new syscall -

    struct pr_madvise_param {
            int size;               /* the size of this structure */
            int cookie;             /* reserved to support atomicity */
            int nr_elem;            /* count of below arrary fields */
            int __user *hints;      /* hints for each range */
            /* to store result of each operation */
            const struct iovec __user *results;
            /* input address ranges */
            const struct iovec __user *ranges;
    };

    int process_madvise(int pidfd, struct pr_madvise_param *u_param,
                            unsigned long flags);

The syscall get pidfd to give hints to external process and provides
pair of result/ranges vector arguments so that it could give several
hints to each address range all at once. It also has cookie variable
to support atomicity of the API for address ranges operations. IOW, if
target process changes address space since monitor process has parsed
address ranges via map_files or maps, the API can detect the race so
could cancel entire address space operation. It's not implemented yet.
Daniel Colascione suggested a idea(Please read description in patch[6/6])
and this patchset adds cookie a variable for the future.

- Experiment

We did bunch of testing with several hundreds of real users, not artificial
benchmark on android. We saw about 17% cold start decreasement without any
significant battery/app startup latency issues. And with artificial benchmark
which launches and switching apps, we saw average 7% app launching improvement,
18% less lmkd kill and good stat from vmstat.

A is vanilla and B is process_madvise.

                                       A          B      delta   ratio(%)
               allocstall_dma          0          0          0       0.00
           allocstall_movable       1464        457      -1007     -69.00
            allocstall_normal     263210     190763     -72447     -28.00
             allocstall_total     264674     191220     -73454     -28.00
          compact_daemon_wake      26912      25294      -1618      -7.00
                 compact_fail      17885      14151      -3734     -21.00
         compact_free_scanned 4204766409 3835994922 -368771487      -9.00
             compact_isolated    3446484    2967618    -478866     -14.00
      compact_migrate_scanned 1621336411 1324695710 -296640701     -19.00
                compact_stall      19387      15343      -4044     -21.00
              compact_success       1502       1192       -310     -21.00
kswapd_high_wmark_hit_quickly        234        184        -50     -22.00
            kswapd_inodesteal     221635     233093      11458       5.00
 kswapd_low_wmark_hit_quickly      66065      54009     -12056     -19.00
                   nr_dirtied     259934     296476      36542      14.00
  nr_vmscan_immediate_reclaim       2587       2356       -231      -9.00
              nr_vmscan_write    1274232    2661733    1387501     108.00
                   nr_written    1514060    2937560    1423500      94.00
                   pageoutrun      67561      55133     -12428     -19.00
                   pgactivate    2335060    1984882    -350178     -15.00
                  pgalloc_dma   13743011   14096463     353452       2.00
              pgalloc_movable          0          0          0       0.00
               pgalloc_normal   18742440   16802065   -1940375     -11.00
                pgalloc_total   32485451   30898528   -1586923      -5.00
                 pgdeactivate    4262210    2930670   -1331540     -32.00
                      pgfault   30812334   31085065     272731       0.00
                       pgfree   33553970   31765164   -1788806      -6.00
                 pginodesteal      33411      15084     -18327     -55.00
                  pglazyfreed          0          0          0       0.00
                   pgmajfault     551312    1508299     956987     173.00
               pgmigrate_fail      43927      29330     -14597     -34.00
            pgmigrate_success    1399851    1203922    -195929     -14.00
                       pgpgin   24141776   19032156   -5109620     -22.00
                      pgpgout     959344    1103316     143972      15.00
                 pgpgoutclean    4639732    3765868    -873864     -19.00
                     pgrefill    4884560    3006938   -1877622     -39.00
                    pgrotated      37828      25897     -11931     -32.00
                pgscan_direct    1456037     957567    -498470     -35.00
       pgscan_direct_throttle          0          0          0       0.00
                pgscan_kswapd    6667767    5047360   -1620407     -25.00
                 pgscan_total    8123804    6004927   -2118877     -27.00
                   pgskip_dma          0          0          0       0.00
               pgskip_movable          0          0          0       0.00
                pgskip_normal      14907      25382      10475      70.00
                 pgskip_total      14907      25382      10475      70.00
               pgsteal_direct    1118986     690215    -428771     -39.00
               pgsteal_kswapd    4750223    3657107   -1093116     -24.00
                pgsteal_total    5869209    4347322   -1521887     -26.00
                       pswpin     417613    1392647     975034     233.00
                      pswpout    1274224    2661731    1387507     108.00
                slabs_scanned   13686905   10807200   -2879705     -22.00
          workingset_activate     668966     569444     -99522     -15.00
       workingset_nodereclaim      38957      32621      -6336     -17.00
           workingset_refault    2816795    2179782    -637013     -23.00
           workingset_restore     294320     168601    -125719     -43.00

pgmajfault is increased by 173% because swapin is increased by 200% by
process_madvise hint. However, swap read based on zram is much cheaper
than file IO in performance point of view and app hot start by swapin is
also cheaper than cold start from the beginning of app which needs many IO
from storage and initialization steps.

Brian Geffon in ChromeOS team had an experiment with process_madvise(2)
Quote form him:
"What I found is that by using process_madvise after a tab has been back
grounded for more than 45 seconds reduced the average tab switch times by
25%! This is a huge result and very obvious validation that process_madvise
hints works well for the ChromeOS use case."

This patchset is against on next-20190530.

Minchan Kim (4):
  mm: introduce MADV_COLD
  mm: change PAGEREF_RECLAIM_CLEAN with PAGE_REFRECLAIM
  mm: account nr_isolated_xxx in [isolate|putback]_lru_page
  mm: introduce MADV_PAGEOUT

 include/linux/page-flags.h             |   1 +
 include/linux/page_idle.h              |  15 ++
 include/linux/swap.h                   |   2 +
 include/uapi/asm-generic/mman-common.h |   2 +
 mm/compaction.c                        |   2 -
 mm/gup.c                               |   7 +-
 mm/internal.h                          |   2 +-
 mm/khugepaged.c                        |   3 -
 mm/madvise.c                           | 241 ++++++++++++++++++++++++-
 mm/memory-failure.c                    |   3 -
 mm/memory_hotplug.c                    |   4 -
 mm/mempolicy.c                         |   6 +-
 mm/migrate.c                           |  37 +---
 mm/oom_kill.c                          |   2 +-
 mm/swap.c                              |  43 +++++
 mm/vmscan.c                            |  62 ++++++-
 16 files changed, 367 insertions(+), 65 deletions(-)

-- 
2.22.0.rc1.311.g5d7573a151-goog

^ permalink raw reply

* [PATCH v1 1/4] mm: introduce MADV_COLD
From: Minchan Kim @ 2019-06-03  5:36 UTC (permalink / raw)
  To: Andrew Morton
  Cc: linux-mm, LKML, linux-api, Michal Hocko, Johannes Weiner,
	Tim Murray, Joel Fernandes, Suren Baghdasaryan, Daniel Colascione,
	Shakeel Butt, Sonny Rao, Brian Geffon, jannh, oleg, christian,
	oleksandr, hdanton, Minchan Kim
In-Reply-To: <20190603053655.127730-1-minchan@kernel.org>

When a process expects no accesses to a certain memory range, it could
give a hint to kernel that the pages can be reclaimed when memory pressure
happens but data should be preserved for future use.  This could reduce
workingset eviction so it ends up increasing performance.

This patch introduces the new MADV_COLD hint to madvise(2) syscall.
MADV_COLD can be used by a process to mark a memory range as not expected
to be used in the near future. The hint can help kernel in deciding which
pages to evict early during memory pressure.

It works for every LRU pages like MADV_[DONTNEED|FREE]. IOW, It moves

	active file page -> inactive file LRU
	active anon page -> inacdtive anon LRU

Unlike MADV_FREE, it doesn't move active anonymous pages to inactive
files's head because MADV_COLD is a little bit different symantic.
MADV_FREE means it's okay to discard when the memory pressure because
the content of the page is *garbage* so freeing such pages is almost zero
overhead since we don't need to swap out and access afterward causes just
minor fault. Thus, it would make sense to put those freeable pages in
inactive file LRU to compete other used-once pages. Even, it could
give a bonus to make them be reclaimed on swapless system. However,
MADV_COLD doesn't mean garbage so reclaiming them requires swap-out/in
in the end. So it's better to move inactive anon's LRU list, not file LRU.
Furthermore, it would help to avoid unnecessary scanning of cold anonymous
if system doesn't have a swap device.

All of error rule is same with MADV_DONTNEED.

Note:
This hint works with only private pages(IOW, page_mapcount(page) < 2)
because shared page could have more chance to be accessed from other
processes sharing the page although the caller reset the reference bits.
It ends up preventing the reclaim of the page and wastes CPU cycle.

* RFCv2
 * add more description - mhocko

* RFCv1
 * renaming from MADV_COOL to MADV_COLD - hannes

* internal review
 * use clear_page_youn in deactivate_page - joelaf
 * Revise the description - surenb
 * Renaming from MADV_WARM to MADV_COOL - surenb

Signed-off-by: Minchan Kim <minchan@kernel.org>
---
 include/linux/page-flags.h             |   1 +
 include/linux/page_idle.h              |  15 ++++
 include/linux/swap.h                   |   1 +
 include/uapi/asm-generic/mman-common.h |   1 +
 mm/internal.h                          |   2 +-
 mm/madvise.c                           | 115 ++++++++++++++++++++++++-
 mm/oom_kill.c                          |   2 +-
 mm/swap.c                              |  43 +++++++++
 8 files changed, 176 insertions(+), 4 deletions(-)

diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index 9f8712a4b1a5..58b06654c8dd 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -424,6 +424,7 @@ static inline bool set_hwpoison_free_buddy_page(struct page *page)
 TESTPAGEFLAG(Young, young, PF_ANY)
 SETPAGEFLAG(Young, young, PF_ANY)
 TESTCLEARFLAG(Young, young, PF_ANY)
+CLEARPAGEFLAG(Young, young, PF_ANY)
 PAGEFLAG(Idle, idle, PF_ANY)
 #endif
 
diff --git a/include/linux/page_idle.h b/include/linux/page_idle.h
index 1e894d34bdce..f3f43b317150 100644
--- a/include/linux/page_idle.h
+++ b/include/linux/page_idle.h
@@ -19,6 +19,11 @@ static inline void set_page_young(struct page *page)
 	SetPageYoung(page);
 }
 
+static inline void clear_page_young(struct page *page)
+{
+	ClearPageYoung(page);
+}
+
 static inline bool test_and_clear_page_young(struct page *page)
 {
 	return TestClearPageYoung(page);
@@ -65,6 +70,16 @@ static inline void set_page_young(struct page *page)
 	set_bit(PAGE_EXT_YOUNG, &page_ext->flags);
 }
 
+static void clear_page_young(struct page *page)
+{
+	struct page_ext *page_ext = lookup_page_ext(page);
+
+	if (unlikely(!page_ext))
+		return;
+
+	clear_bit(PAGE_EXT_YOUNG, &page_ext->flags);
+}
+
 static inline bool test_and_clear_page_young(struct page *page)
 {
 	struct page_ext *page_ext = lookup_page_ext(page);
diff --git a/include/linux/swap.h b/include/linux/swap.h
index de2c67a33b7e..0ce997edb8bb 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -340,6 +340,7 @@ extern void lru_add_drain_cpu(int cpu);
 extern void lru_add_drain_all(void);
 extern void rotate_reclaimable_page(struct page *page);
 extern void deactivate_file_page(struct page *page);
+extern void deactivate_page(struct page *page);
 extern void mark_page_lazyfree(struct page *page);
 extern void swap_setup(void);
 
diff --git a/include/uapi/asm-generic/mman-common.h b/include/uapi/asm-generic/mman-common.h
index bea0278f65ab..1190f4e7f7b9 100644
--- a/include/uapi/asm-generic/mman-common.h
+++ b/include/uapi/asm-generic/mman-common.h
@@ -43,6 +43,7 @@
 #define MADV_SEQUENTIAL	2		/* expect sequential page references */
 #define MADV_WILLNEED	3		/* will need these pages */
 #define MADV_DONTNEED	4		/* don't need these pages */
+#define MADV_COLD	5		/* deactivatie these pages */
 
 /* common parameters: try to keep these consistent across architectures */
 #define MADV_FREE	8		/* free pages only if memory pressure */
diff --git a/mm/internal.h b/mm/internal.h
index 9eeaf2b95166..75a4f96ec0fb 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -43,7 +43,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf);
 void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma,
 		unsigned long floor, unsigned long ceiling);
 
-static inline bool can_madv_dontneed_vma(struct vm_area_struct *vma)
+static inline bool can_madv_lru_vma(struct vm_area_struct *vma)
 {
 	return !(vma->vm_flags & (VM_LOCKED|VM_HUGETLB|VM_PFNMAP));
 }
diff --git a/mm/madvise.c b/mm/madvise.c
index 628022e674a7..ab158766858a 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -40,6 +40,7 @@ static int madvise_need_mmap_write(int behavior)
 	case MADV_REMOVE:
 	case MADV_WILLNEED:
 	case MADV_DONTNEED:
+	case MADV_COLD:
 	case MADV_FREE:
 		return 0;
 	default:
@@ -307,6 +308,113 @@ static long madvise_willneed(struct vm_area_struct *vma,
 	return 0;
 }
 
+static int madvise_cold_pte_range(pmd_t *pmd, unsigned long addr,
+				unsigned long end, struct mm_walk *walk)
+{
+	pte_t *orig_pte, *pte, ptent;
+	spinlock_t *ptl;
+	struct page *page;
+	struct vm_area_struct *vma = walk->vma;
+	unsigned long next;
+
+	next = pmd_addr_end(addr, end);
+	if (pmd_trans_huge(*pmd)) {
+		ptl = pmd_trans_huge_lock(pmd, vma);
+		if (!ptl)
+			return 0;
+
+		if (is_huge_zero_pmd(*pmd))
+			goto huge_unlock;
+
+		page = pmd_page(*pmd);
+		if (page_mapcount(page) > 1)
+			goto huge_unlock;
+
+		if (next - addr != HPAGE_PMD_SIZE) {
+			int err;
+
+			get_page(page);
+			spin_unlock(ptl);
+			lock_page(page);
+			err = split_huge_page(page);
+			unlock_page(page);
+			put_page(page);
+			if (!err)
+				goto regular_page;
+			return 0;
+		}
+
+		pmdp_test_and_clear_young(vma, addr, pmd);
+		deactivate_page(page);
+huge_unlock:
+		spin_unlock(ptl);
+		return 0;
+	}
+
+	if (pmd_trans_unstable(pmd))
+		return 0;
+
+regular_page:
+	orig_pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
+	for (pte = orig_pte; addr < end; pte++, addr += PAGE_SIZE) {
+		ptent = *pte;
+
+		if (pte_none(ptent))
+			continue;
+
+		if (!pte_present(ptent))
+			continue;
+
+		page = vm_normal_page(vma, addr, ptent);
+		if (!page)
+			continue;
+
+		if (page_mapcount(page) > 1)
+			continue;
+
+		ptep_test_and_clear_young(vma, addr, pte);
+		deactivate_page(page);
+	}
+
+	pte_unmap_unlock(orig_pte, ptl);
+	cond_resched();
+
+	return 0;
+}
+
+static void madvise_cold_page_range(struct mmu_gather *tlb,
+			     struct vm_area_struct *vma,
+			     unsigned long addr, unsigned long end)
+{
+	struct mm_walk cool_walk = {
+		.pmd_entry = madvise_cold_pte_range,
+		.mm = vma->vm_mm,
+	};
+
+	tlb_start_vma(tlb, vma);
+	walk_page_range(addr, end, &cool_walk);
+	tlb_end_vma(tlb, vma);
+}
+
+static long madvise_cold(struct vm_area_struct *vma,
+			struct vm_area_struct **prev,
+			unsigned long start_addr, unsigned long end_addr)
+{
+	struct mm_struct *mm = vma->vm_mm;
+	struct mmu_gather tlb;
+
+	*prev = vma;
+	if (!can_madv_lru_vma(vma))
+		return -EINVAL;
+
+	lru_add_drain();
+	tlb_gather_mmu(&tlb, mm, start_addr, end_addr);
+	madvise_cold_page_range(&tlb, vma, start_addr, end_addr);
+	tlb_finish_mmu(&tlb, start_addr, end_addr);
+
+	return 0;
+}
+
 static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
 				unsigned long end, struct mm_walk *walk)
 
@@ -519,7 +627,7 @@ static long madvise_dontneed_free(struct vm_area_struct *vma,
 				  int behavior)
 {
 	*prev = vma;
-	if (!can_madv_dontneed_vma(vma))
+	if (!can_madv_lru_vma(vma))
 		return -EINVAL;
 
 	if (!userfaultfd_remove(vma, start, end)) {
@@ -541,7 +649,7 @@ static long madvise_dontneed_free(struct vm_area_struct *vma,
 			 */
 			return -ENOMEM;
 		}
-		if (!can_madv_dontneed_vma(vma))
+		if (!can_madv_lru_vma(vma))
 			return -EINVAL;
 		if (end > vma->vm_end) {
 			/*
@@ -695,6 +803,8 @@ madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
 		return madvise_remove(vma, prev, start, end);
 	case MADV_WILLNEED:
 		return madvise_willneed(vma, prev, start, end);
+	case MADV_COLD:
+		return madvise_cold(vma, prev, start, end);
 	case MADV_FREE:
 	case MADV_DONTNEED:
 		return madvise_dontneed_free(vma, prev, start, end, behavior);
@@ -716,6 +826,7 @@ madvise_behavior_valid(int behavior)
 	case MADV_WILLNEED:
 	case MADV_DONTNEED:
 	case MADV_FREE:
+	case MADV_COLD:
 #ifdef CONFIG_KSM
 	case MADV_MERGEABLE:
 	case MADV_UNMERGEABLE:
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 5a58778c91d4..f73d5f5145f0 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -515,7 +515,7 @@ bool __oom_reap_task_mm(struct mm_struct *mm)
 	set_bit(MMF_UNSTABLE, &mm->flags);
 
 	for (vma = mm->mmap ; vma; vma = vma->vm_next) {
-		if (!can_madv_dontneed_vma(vma))
+		if (!can_madv_lru_vma(vma))
 			continue;
 
 		/*
diff --git a/mm/swap.c b/mm/swap.c
index 7b079976cbec..cebedab15aa2 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -47,6 +47,7 @@ int page_cluster;
 static DEFINE_PER_CPU(struct pagevec, lru_add_pvec);
 static DEFINE_PER_CPU(struct pagevec, lru_rotate_pvecs);
 static DEFINE_PER_CPU(struct pagevec, lru_deactivate_file_pvecs);
+static DEFINE_PER_CPU(struct pagevec, lru_deactivate_pvecs);
 static DEFINE_PER_CPU(struct pagevec, lru_lazyfree_pvecs);
 #ifdef CONFIG_SMP
 static DEFINE_PER_CPU(struct pagevec, activate_page_pvecs);
@@ -538,6 +539,23 @@ static void lru_deactivate_file_fn(struct page *page, struct lruvec *lruvec,
 	update_page_reclaim_stat(lruvec, file, 0);
 }
 
+static void lru_deactivate_fn(struct page *page, struct lruvec *lruvec,
+			    void *arg)
+{
+	if (PageLRU(page) && PageActive(page) && !PageUnevictable(page)) {
+		int file = page_is_file_cache(page);
+		int lru = page_lru_base_type(page);
+
+		del_page_from_lru_list(page, lruvec, lru + LRU_ACTIVE);
+		ClearPageActive(page);
+		ClearPageReferenced(page);
+		clear_page_young(page);
+		add_page_to_lru_list(page, lruvec, lru);
+
+		__count_vm_events(PGDEACTIVATE, hpage_nr_pages(page));
+		update_page_reclaim_stat(lruvec, file, 0);
+	}
+}
 
 static void lru_lazyfree_fn(struct page *page, struct lruvec *lruvec,
 			    void *arg)
@@ -590,6 +608,10 @@ void lru_add_drain_cpu(int cpu)
 	if (pagevec_count(pvec))
 		pagevec_lru_move_fn(pvec, lru_deactivate_file_fn, NULL);
 
+	pvec = &per_cpu(lru_deactivate_pvecs, cpu);
+	if (pagevec_count(pvec))
+		pagevec_lru_move_fn(pvec, lru_deactivate_fn, NULL);
+
 	pvec = &per_cpu(lru_lazyfree_pvecs, cpu);
 	if (pagevec_count(pvec))
 		pagevec_lru_move_fn(pvec, lru_lazyfree_fn, NULL);
@@ -623,6 +645,26 @@ void deactivate_file_page(struct page *page)
 	}
 }
 
+/*
+ * deactivate_page - deactivate a page
+ * @page: page to deactivate
+ *
+ * deactivate_page() moves @page to the inactive list if @page was on the active
+ * list and was not an unevictable page.  This is done to accelerate the reclaim
+ * of @page.
+ */
+void deactivate_page(struct page *page)
+{
+	if (PageLRU(page) && PageActive(page) && !PageUnevictable(page)) {
+		struct pagevec *pvec = &get_cpu_var(lru_deactivate_pvecs);
+
+		get_page(page);
+		if (!pagevec_add(pvec, page) || PageCompound(page))
+			pagevec_lru_move_fn(pvec, lru_deactivate_fn, NULL);
+		put_cpu_var(lru_deactivate_pvecs);
+	}
+}
+
 /**
  * mark_page_lazyfree - make an anon page lazyfree
  * @page: page to deactivate
@@ -687,6 +729,7 @@ void lru_add_drain_all(void)
 		if (pagevec_count(&per_cpu(lru_add_pvec, cpu)) ||
 		    pagevec_count(&per_cpu(lru_rotate_pvecs, cpu)) ||
 		    pagevec_count(&per_cpu(lru_deactivate_file_pvecs, cpu)) ||
+		    pagevec_count(&per_cpu(lru_deactivate_pvecs, cpu)) ||
 		    pagevec_count(&per_cpu(lru_lazyfree_pvecs, cpu)) ||
 		    need_activate_page_drain(cpu)) {
 			INIT_WORK(work, lru_add_drain_per_cpu);
-- 
2.22.0.rc1.311.g5d7573a151-goog

^ permalink raw reply related

* [PATCH v1 2/4] mm: change PAGEREF_RECLAIM_CLEAN with PAGE_REFRECLAIM
From: Minchan Kim @ 2019-06-03  5:36 UTC (permalink / raw)
  To: Andrew Morton
  Cc: linux-mm, LKML, linux-api, Michal Hocko, Johannes Weiner,
	Tim Murray, Joel Fernandes, Suren Baghdasaryan, Daniel Colascione,
	Shakeel Butt, Sonny Rao, Brian Geffon, jannh, oleg, christian,
	oleksandr, hdanton, Minchan Kim
In-Reply-To: <20190603053655.127730-1-minchan@kernel.org>

The local variable references in shrink_page_list is PAGEREF_RECLAIM_CLEAN
as default. It is for preventing to reclaim dirty pages when CMA try to
migrate pages. Strictly speaking, we don't need it because CMA didn't allow
to write out by .may_writepage = 0 in reclaim_clean_pages_from_list.

Moreover, it has a problem to prevent anonymous pages's swap out even
though force_reclaim = true in shrink_page_list on upcoming patch.
So this patch makes references's default value to PAGEREF_RECLAIM and
rename force_reclaim with ignore_references to make it more clear.

This is a preparatory work for next patch.

* RFCv1
 * use ignore_referecnes as parameter name - hannes

Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Minchan Kim <minchan@kernel.org>
---
 mm/vmscan.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/mm/vmscan.c b/mm/vmscan.c
index 84dcb651d05c..0973a46a0472 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1102,7 +1102,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 				      struct scan_control *sc,
 				      enum ttu_flags ttu_flags,
 				      struct reclaim_stat *stat,
-				      bool force_reclaim)
+				      bool ignore_references)
 {
 	LIST_HEAD(ret_pages);
 	LIST_HEAD(free_pages);
@@ -1116,7 +1116,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 		struct address_space *mapping;
 		struct page *page;
 		int may_enter_fs;
-		enum page_references references = PAGEREF_RECLAIM_CLEAN;
+		enum page_references references = PAGEREF_RECLAIM;
 		bool dirty, writeback;
 		unsigned int nr_pages;
 
@@ -1247,7 +1247,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 			}
 		}
 
-		if (!force_reclaim)
+		if (!ignore_references)
 			references = page_check_references(page, sc);
 
 		switch (references) {
-- 
2.22.0.rc1.311.g5d7573a151-goog

^ permalink raw reply related

* [PATCH v1 3/4] mm: account nr_isolated_xxx in [isolate|putback]_lru_page
From: Minchan Kim @ 2019-06-03  5:36 UTC (permalink / raw)
  To: Andrew Morton
  Cc: linux-mm, LKML, linux-api, Michal Hocko, Johannes Weiner,
	Tim Murray, Joel Fernandes, Suren Baghdasaryan, Daniel Colascione,
	Shakeel Butt, Sonny Rao, Brian Geffon, jannh, oleg, christian,
	oleksandr, hdanton, Minchan Kim
In-Reply-To: <20190603053655.127730-1-minchan@kernel.org>

The isolate counting is pecpu counter so it would be not huge gain
to work them by batch. Rather than complicating to make them batch,
let's make it more stright-foward via adding the counting logic
into [isolate|putback]_lru_page API.

Link: http://lkml.kernel.org/r/20190531165927.GA20067@cmpxchg.org
Suggested-by: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Minchan Kim <minchan@kernel.org>
---
 mm/compaction.c     |  2 --
 mm/gup.c            |  7 +------
 mm/khugepaged.c     |  3 ---
 mm/memory-failure.c |  3 ---
 mm/memory_hotplug.c |  4 ----
 mm/mempolicy.c      |  6 +-----
 mm/migrate.c        | 37 ++++++++-----------------------------
 mm/vmscan.c         | 22 ++++++++++++++++------
 8 files changed, 26 insertions(+), 58 deletions(-)

diff --git a/mm/compaction.c b/mm/compaction.c
index 9e1b9acb116b..c6591682deda 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -982,8 +982,6 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
 
 		/* Successfully isolated */
 		del_page_from_lru_list(page, lruvec, page_lru(page));
-		inc_node_page_state(page,
-				NR_ISOLATED_ANON + page_is_file_cache(page));
 
 isolate_success:
 		list_add(&page->lru, &cc->migratepages);
diff --git a/mm/gup.c b/mm/gup.c
index 63ac50e48072..2d9a9bc358c7 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -1360,13 +1360,8 @@ static long check_and_migrate_cma_pages(struct task_struct *tsk,
 					drain_allow = false;
 				}
 
-				if (!isolate_lru_page(head)) {
+				if (!isolate_lru_page(head))
 					list_add_tail(&head->lru, &cma_page_list);
-					mod_node_page_state(page_pgdat(head),
-							    NR_ISOLATED_ANON +
-							    page_is_file_cache(head),
-							    hpage_nr_pages(head));
-				}
 			}
 		}
 	}
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index a335f7c1fac4..3359df994fb4 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -503,7 +503,6 @@ void __khugepaged_exit(struct mm_struct *mm)
 
 static void release_pte_page(struct page *page)
 {
-	dec_node_page_state(page, NR_ISOLATED_ANON + page_is_file_cache(page));
 	unlock_page(page);
 	putback_lru_page(page);
 }
@@ -602,8 +601,6 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
 			result = SCAN_DEL_PAGE_LRU;
 			goto out;
 		}
-		inc_node_page_state(page,
-				NR_ISOLATED_ANON + page_is_file_cache(page));
 		VM_BUG_ON_PAGE(!PageLocked(page), page);
 		VM_BUG_ON_PAGE(PageLRU(page), page);
 
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index bc749265a8f3..2187bad7ceff 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -1796,9 +1796,6 @@ static int __soft_offline_page(struct page *page, int flags)
 		 * so use !__PageMovable instead for LRU page's mapping
 		 * cannot have PAGE_MAPPING_MOVABLE.
 		 */
-		if (!__PageMovable(page))
-			inc_node_page_state(page, NR_ISOLATED_ANON +
-						page_is_file_cache(page));
 		list_add(&page->lru, &pagelist);
 		ret = migrate_pages(&pagelist, new_page, NULL, MPOL_MF_MOVE_ALL,
 					MIGRATE_SYNC, MR_MEMORY_FAILURE);
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index a88c5f334e5a..a41bea24d0c9 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -1390,10 +1390,6 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
 			ret = isolate_movable_page(page, ISOLATE_UNEVICTABLE);
 		if (!ret) { /* Success */
 			list_add_tail(&page->lru, &source);
-			if (!__PageMovable(page))
-				inc_node_page_state(page, NR_ISOLATED_ANON +
-						    page_is_file_cache(page));
-
 		} else {
 			pr_warn("failed to isolate pfn %lx\n", pfn);
 			dump_page(page, "isolation failed");
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 5b3bf1747c19..cfb0590f69bb 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -948,12 +948,8 @@ static void migrate_page_add(struct page *page, struct list_head *pagelist,
 	 * Avoid migrating a page that is shared with others.
 	 */
 	if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(head) == 1) {
-		if (!isolate_lru_page(head)) {
+		if (!isolate_lru_page(head))
 			list_add_tail(&head->lru, pagelist);
-			mod_node_page_state(page_pgdat(head),
-				NR_ISOLATED_ANON + page_is_file_cache(head),
-				hpage_nr_pages(head));
-		}
 	}
 }
 
diff --git a/mm/migrate.c b/mm/migrate.c
index 572b4bc85d76..39b95ba04d3e 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -190,8 +190,6 @@ void putback_movable_pages(struct list_head *l)
 			unlock_page(page);
 			put_page(page);
 		} else {
-			mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON +
-					page_is_file_cache(page), -hpage_nr_pages(page));
 			putback_lru_page(page);
 		}
 	}
@@ -1181,10 +1179,17 @@ static ICE_noinline int unmap_and_move(new_page_t get_new_page,
 		return -ENOMEM;
 
 	if (page_count(page) == 1) {
+		bool is_lru = !__PageMovable(page);
+
 		/* page was freed from under us. So we are done. */
 		ClearPageActive(page);
 		ClearPageUnevictable(page);
-		if (unlikely(__PageMovable(page))) {
+		if (likely(is_lru))
+			mod_node_page_state(page_pgdat(page),
+						NR_ISOLATED_ANON +
+						page_is_file_cache(page),
+						hpage_nr_pages(page));
+		else {
 			lock_page(page);
 			if (!PageMovable(page))
 				__ClearPageIsolated(page);
@@ -1210,15 +1215,6 @@ static ICE_noinline int unmap_and_move(new_page_t get_new_page,
 		 * restored.
 		 */
 		list_del(&page->lru);
-
-		/*
-		 * Compaction can migrate also non-LRU pages which are
-		 * not accounted to NR_ISOLATED_*. They can be recognized
-		 * as __PageMovable
-		 */
-		if (likely(!__PageMovable(page)))
-			mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON +
-					page_is_file_cache(page), -hpage_nr_pages(page));
 	}
 
 	/*
@@ -1572,9 +1568,6 @@ static int add_page_for_migration(struct mm_struct *mm, unsigned long addr,
 
 		err = 0;
 		list_add_tail(&head->lru, pagelist);
-		mod_node_page_state(page_pgdat(head),
-			NR_ISOLATED_ANON + page_is_file_cache(head),
-			hpage_nr_pages(head));
 	}
 out_putpage:
 	/*
@@ -1890,8 +1883,6 @@ static struct page *alloc_misplaced_dst_page(struct page *page,
 
 static int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page)
 {
-	int page_lru;
-
 	VM_BUG_ON_PAGE(compound_order(page) && !PageTransHuge(page), page);
 
 	/* Avoid migrating to a node that is nearly full */
@@ -1913,10 +1904,6 @@ static int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page)
 		return 0;
 	}
 
-	page_lru = page_is_file_cache(page);
-	mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON + page_lru,
-				hpage_nr_pages(page));
-
 	/*
 	 * Isolating the page has taken another reference, so the
 	 * caller's reference can be safely dropped without the page
@@ -1971,8 +1958,6 @@ int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma,
 	if (nr_remaining) {
 		if (!list_empty(&migratepages)) {
 			list_del(&page->lru);
-			dec_node_page_state(page, NR_ISOLATED_ANON +
-					page_is_file_cache(page));
 			putback_lru_page(page);
 		}
 		isolated = 0;
@@ -2002,7 +1987,6 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
 	pg_data_t *pgdat = NODE_DATA(node);
 	int isolated = 0;
 	struct page *new_page = NULL;
-	int page_lru = page_is_file_cache(page);
 	unsigned long start = address & HPAGE_PMD_MASK;
 
 	new_page = alloc_pages_node(node,
@@ -2048,8 +2032,6 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
 		/* Retake the callers reference and putback on LRU */
 		get_page(page);
 		putback_lru_page(page);
-		mod_node_page_state(page_pgdat(page),
-			 NR_ISOLATED_ANON + page_lru, -HPAGE_PMD_NR);
 
 		goto out_unlock;
 	}
@@ -2099,9 +2081,6 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
 	count_vm_events(PGMIGRATE_SUCCESS, HPAGE_PMD_NR);
 	count_vm_numa_events(NUMA_PAGE_MIGRATE, HPAGE_PMD_NR);
 
-	mod_node_page_state(page_pgdat(page),
-			NR_ISOLATED_ANON + page_lru,
-			-HPAGE_PMD_NR);
 	return isolated;
 
 out_fail:
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 0973a46a0472..56df55e8afcd 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -999,6 +999,9 @@ int remove_mapping(struct address_space *mapping, struct page *page)
 void putback_lru_page(struct page *page)
 {
 	lru_cache_add(page);
+	mod_node_page_state(page_pgdat(page),
+				NR_ISOLATED_ANON + page_is_file_cache(page),
+				-hpage_nr_pages(page));
 	put_page(page);		/* drop ref from isolate */
 }
 
@@ -1464,6 +1467,9 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 		 */
 		nr_reclaimed += nr_pages;
 
+		mod_node_page_state(pgdat, NR_ISOLATED_ANON +
+						page_is_file_cache(page),
+						-nr_pages);
 		/*
 		 * Is there need to periodically free_page_list? It would
 		 * appear not as the counts should be low
@@ -1539,7 +1545,6 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone,
 	ret = shrink_page_list(&clean_pages, zone->zone_pgdat, &sc,
 			TTU_IGNORE_ACCESS, &dummy_stat, true);
 	list_splice(&clean_pages, page_list);
-	mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_FILE, -ret);
 	return ret;
 }
 
@@ -1615,6 +1620,9 @@ int __isolate_lru_page(struct page *page, isolate_mode_t mode)
 		 */
 		ClearPageLRU(page);
 		ret = 0;
+		__mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON +
+						page_is_file_cache(page),
+						hpage_nr_pages(page));
 	}
 
 	return ret;
@@ -1746,6 +1754,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
 	trace_mm_vmscan_lru_isolate(sc->reclaim_idx, sc->order, nr_to_scan,
 				    total_scan, skipped, nr_taken, mode, lru);
 	update_lru_sizes(lruvec, lru, nr_zone_taken);
+
 	return nr_taken;
 }
 
@@ -1794,6 +1803,9 @@ int isolate_lru_page(struct page *page)
 			ClearPageLRU(page);
 			del_page_from_lru_list(page, lruvec, lru);
 			ret = 0;
+			mod_node_page_state(pgdat, NR_ISOLATED_ANON +
+						page_is_file_cache(page),
+						hpage_nr_pages(page));
 		}
 		spin_unlock_irq(&pgdat->lru_lock);
 	}
@@ -1885,6 +1897,9 @@ static unsigned noinline_for_stack move_pages_to_lru(struct lruvec *lruvec,
 		update_lru_size(lruvec, lru, page_zonenum(page), nr_pages);
 		list_move(&page->lru, &lruvec->lists[lru]);
 
+		__mod_node_page_state(pgdat, NR_ISOLATED_ANON +
+						page_is_file_cache(page),
+						-hpage_nr_pages(page));
 		if (put_page_testzero(page)) {
 			__ClearPageLRU(page);
 			__ClearPageActive(page);
@@ -1962,7 +1977,6 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
 	nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &page_list,
 				     &nr_scanned, sc, lru);
 
-	__mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken);
 	reclaim_stat->recent_scanned[file] += nr_taken;
 
 	item = current_is_kswapd() ? PGSCAN_KSWAPD : PGSCAN_DIRECT;
@@ -1988,8 +2002,6 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
 
 	move_pages_to_lru(lruvec, &page_list);
 
-	__mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken);
-
 	spin_unlock_irq(&pgdat->lru_lock);
 
 	mem_cgroup_uncharge_list(&page_list);
@@ -2048,7 +2060,6 @@ static void shrink_active_list(unsigned long nr_to_scan,
 	nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &l_hold,
 				     &nr_scanned, sc, lru);
 
-	__mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken);
 	reclaim_stat->recent_scanned[file] += nr_taken;
 
 	__count_vm_events(PGREFILL, nr_scanned);
@@ -2117,7 +2128,6 @@ static void shrink_active_list(unsigned long nr_to_scan,
 	__count_vm_events(PGDEACTIVATE, nr_deactivate);
 	__count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE, nr_deactivate);
 
-	__mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken);
 	spin_unlock_irq(&pgdat->lru_lock);
 
 	mem_cgroup_uncharge_list(&l_active);
-- 
2.22.0.rc1.311.g5d7573a151-goog

^ permalink raw reply related

* [PATCH v1 4/4] mm: introduce MADV_PAGEOUT
From: Minchan Kim @ 2019-06-03  5:36 UTC (permalink / raw)
  To: Andrew Morton
  Cc: linux-mm, LKML, linux-api, Michal Hocko, Johannes Weiner,
	Tim Murray, Joel Fernandes, Suren Baghdasaryan, Daniel Colascione,
	Shakeel Butt, Sonny Rao, Brian Geffon, jannh, oleg, christian,
	oleksandr, hdanton, Minchan Kim
In-Reply-To: <20190603053655.127730-1-minchan@kernel.org>

When a process expects no accesses to a certain memory range
for a long time, it could hint kernel that the pages can be
reclaimed instantly but data should be preserved for future use.
This could reduce workingset eviction so it ends up increasing
performance.

This patch introduces the new MADV_PAGEOUT hint to madvise(2)
syscall. MADV_PAGEOUT can be used by a process to mark a memory
range as not expected to be used for a long time so that kernel
reclaims *any LRU* pages instantly. The hint can help kernel in deciding
which pages to evict proactively.

All of error rule is same with MADV_DONTNEED.

Note:
    This hint works with only private pages(IOW, page_mapcount(page) < 2)
    because shared page could have more chance to be accessed from other
    processes sharing the page so that it could cause major fault soon,
    which is inefficient.

* RFC v2
 * make reclaim_pages simple via factoring out isolate logic - hannes

* RFCv1
 * rename from MADV_COLD to MADV_PAGEOUT - hannes
 * bail out if process is being killed - Hillf
 * fix reclaim_pages bugs - Hillf

Signed-off-by: Minchan Kim <minchan@kernel.org>
---
 include/linux/swap.h                   |   1 +
 include/uapi/asm-generic/mman-common.h |   1 +
 mm/madvise.c                           | 126 +++++++++++++++++++++++++
 mm/vmscan.c                            |  34 +++++++
 4 files changed, 162 insertions(+)

diff --git a/include/linux/swap.h b/include/linux/swap.h
index 0ce997edb8bb..063c0c1e112b 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -365,6 +365,7 @@ extern int vm_swappiness;
 extern int remove_mapping(struct address_space *mapping, struct page *page);
 extern unsigned long vm_total_pages;
 
+extern unsigned long reclaim_pages(struct list_head *page_list);
 #ifdef CONFIG_NUMA
 extern int node_reclaim_mode;
 extern int sysctl_min_unmapped_ratio;
diff --git a/include/uapi/asm-generic/mman-common.h b/include/uapi/asm-generic/mman-common.h
index 1190f4e7f7b9..92e347a89ddc 100644
--- a/include/uapi/asm-generic/mman-common.h
+++ b/include/uapi/asm-generic/mman-common.h
@@ -44,6 +44,7 @@
 #define MADV_WILLNEED	3		/* will need these pages */
 #define MADV_DONTNEED	4		/* don't need these pages */
 #define MADV_COLD	5		/* deactivatie these pages */
+#define MADV_PAGEOUT	6		/* reclaim these pages */
 
 /* common parameters: try to keep these consistent across architectures */
 #define MADV_FREE	8		/* free pages only if memory pressure */
diff --git a/mm/madvise.c b/mm/madvise.c
index ab158766858a..b010249cb8b6 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -41,6 +41,7 @@ static int madvise_need_mmap_write(int behavior)
 	case MADV_WILLNEED:
 	case MADV_DONTNEED:
 	case MADV_COLD:
+	case MADV_PAGEOUT:
 	case MADV_FREE:
 		return 0;
 	default:
@@ -415,6 +416,128 @@ static long madvise_cold(struct vm_area_struct *vma,
 	return 0;
 }
 
+static int madvise_pageout_pte_range(pmd_t *pmd, unsigned long addr,
+				unsigned long end, struct mm_walk *walk)
+{
+	pte_t *orig_pte, *pte, ptent;
+	spinlock_t *ptl;
+	LIST_HEAD(page_list);
+	struct page *page;
+	int isolated = 0;
+	struct vm_area_struct *vma = walk->vma;
+	unsigned long next;
+
+	if (fatal_signal_pending(current))
+		return -EINTR;
+
+	next = pmd_addr_end(addr, end);
+	if (pmd_trans_huge(*pmd)) {
+		ptl = pmd_trans_huge_lock(pmd, vma);
+		if (!ptl)
+			return 0;
+
+		if (is_huge_zero_pmd(*pmd))
+			goto huge_unlock;
+
+		page = pmd_page(*pmd);
+		if (page_mapcount(page) > 1)
+			goto huge_unlock;
+
+		if (next - addr != HPAGE_PMD_SIZE) {
+			int err;
+
+			get_page(page);
+			spin_unlock(ptl);
+			lock_page(page);
+			err = split_huge_page(page);
+			unlock_page(page);
+			put_page(page);
+			if (!err)
+				goto regular_page;
+			return 0;
+		}
+
+		if (isolate_lru_page(page))
+			goto huge_unlock;
+
+		list_add(&page->lru, &page_list);
+huge_unlock:
+		spin_unlock(ptl);
+		reclaim_pages(&page_list);
+		return 0;
+	}
+
+	if (pmd_trans_unstable(pmd))
+		return 0;
+regular_page:
+	orig_pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
+	for (pte = orig_pte; addr < end; pte++, addr += PAGE_SIZE) {
+		ptent = *pte;
+		if (!pte_present(ptent))
+			continue;
+
+		page = vm_normal_page(vma, addr, ptent);
+		if (!page)
+			continue;
+
+		if (page_mapcount(page) > 1)
+			continue;
+
+		if (isolate_lru_page(page))
+			continue;
+
+		isolated++;
+		list_add(&page->lru, &page_list);
+		if (isolated >= SWAP_CLUSTER_MAX) {
+			pte_unmap_unlock(orig_pte, ptl);
+			reclaim_pages(&page_list);
+			isolated = 0;
+			pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
+			orig_pte = pte;
+		}
+	}
+
+	pte_unmap_unlock(orig_pte, ptl);
+	reclaim_pages(&page_list);
+	cond_resched();
+
+	return 0;
+}
+
+static void madvise_pageout_page_range(struct mmu_gather *tlb,
+			     struct vm_area_struct *vma,
+			     unsigned long addr, unsigned long end)
+{
+	struct mm_walk warm_walk = {
+		.pmd_entry = madvise_pageout_pte_range,
+		.mm = vma->vm_mm,
+	};
+
+	tlb_start_vma(tlb, vma);
+	walk_page_range(addr, end, &warm_walk);
+	tlb_end_vma(tlb, vma);
+}
+
+
+static long madvise_pageout(struct vm_area_struct *vma,
+			struct vm_area_struct **prev,
+			unsigned long start_addr, unsigned long end_addr)
+{
+	struct mm_struct *mm = vma->vm_mm;
+	struct mmu_gather tlb;
+
+	*prev = vma;
+	if (!can_madv_lru_vma(vma))
+		return -EINVAL;
+
+	lru_add_drain();
+	tlb_gather_mmu(&tlb, mm, start_addr, end_addr);
+	madvise_pageout_page_range(&tlb, vma, start_addr, end_addr);
+	tlb_finish_mmu(&tlb, start_addr, end_addr);
+
+	return 0;
+}
+
 static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
 				unsigned long end, struct mm_walk *walk)
 
@@ -805,6 +928,8 @@ madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
 		return madvise_willneed(vma, prev, start, end);
 	case MADV_COLD:
 		return madvise_cold(vma, prev, start, end);
+	case MADV_PAGEOUT:
+		return madvise_pageout(vma, prev, start, end);
 	case MADV_FREE:
 	case MADV_DONTNEED:
 		return madvise_dontneed_free(vma, prev, start, end, behavior);
@@ -827,6 +952,7 @@ madvise_behavior_valid(int behavior)
 	case MADV_DONTNEED:
 	case MADV_FREE:
 	case MADV_COLD:
+	case MADV_PAGEOUT:
 #ifdef CONFIG_KSM
 	case MADV_MERGEABLE:
 	case MADV_UNMERGEABLE:
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 56df55e8afcd..2c2cf442db58 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2136,6 +2136,40 @@ static void shrink_active_list(unsigned long nr_to_scan,
 			nr_deactivate, nr_rotated, sc->priority, file);
 }
 
+unsigned long reclaim_pages(struct list_head *page_list)
+{
+	unsigned long nr_reclaimed = 0;
+	LIST_HEAD(node_page_list);
+	struct reclaim_stat dummy_stat;
+	struct scan_control sc = {
+		.gfp_mask = GFP_KERNEL,
+		.priority = DEF_PRIORITY,
+		.may_writepage = 1,
+		.may_unmap = 1,
+		.may_swap = 1,
+	};
+
+	while (!list_empty(page_list)) {
+		struct page *page;
+
+		page = lru_to_page(page_list);
+		list_move(&page->lru, &node_page_list);
+		nr_reclaimed += shrink_page_list(&node_page_list,
+						page_pgdat(page),
+						&sc, TTU_IGNORE_ACCESS,
+						&dummy_stat, true);
+		if (!list_empty(&node_page_list)) {
+			struct page *page = lru_to_page(&node_page_list);
+
+			list_del(&page->lru);
+			putback_lru_page(page);
+
+		}
+	}
+
+	return nr_reclaimed;
+}
+
 /*
  * The inactive anon list should be small enough that the VM never has
  * to do too much work.
-- 
2.22.0.rc1.311.g5d7573a151-goog

^ permalink raw reply related

* Re: [RFC][PATCH] link.2: AT_ATOMIC_DATA and AT_ATOMIC_METADATA
From: Amir Goldstein @ 2019-06-03  6:17 UTC (permalink / raw)
  To: Dave Chinner
  Cc: Theodore Ts'o, Jan Kara, Darrick J . Wong, Chris Mason,
	Al Viro, linux-fsdevel, linux-xfs, Ext4, Linux Btrfs, Linux API
In-Reply-To: <20190603042540.GH29573@dread.disaster.area>

> > Actually, one of my use cases is "atomic rename" of files with
> > no data (looking for atomicity w.r.t xattr and mtime), so this "atomic rename"
> > thread should not be interfering with other workloads at all.
>
> Which should already guaranteed because a) rename is supposed to be
> atomic, and b) metadata ordering requirements in journalled
> filesystems. If they lose xattrs across rename, there's something
> seriously wrong with the filesystem implementation.  I'm really not
> sure what you think filesystems are actually doing with metadata
> across rename operations....
>

Dave,

We are going in circles so much that my head is spinning.
I don't blame anyone for having a hard time to keep up with the plot, because
it spans many threads and subjects, so let me re-iterate:

- I *do* know that rename provides me the needed "metadata barrier"
  w.r.t. xattr on xfs/ext4 today.
- I *do* know the sync_file_range()+rename() callback provides the
"data barrier"
  I need on xfs/ext4 today.
- I *do* use this internal fs knowledge in my applications
- I even fixed up sync_file_range() per your suggestion, so I won't need to use
  the FIEMAP_FLAG_SYNC hack
- At attempt from CrashMonkey developers to document this behavior was
  "shot down" for many justified reasons
- Without any documentation nor explicit API with a clean guarantee, users
  cannot write efficient applications without being aware of the filesystem
  underneath and follow that filesystem development to make sure behavior
  has not changed
- The most recent proposal I have made in LSF, based on Jan's suggestion is
  to change nothing in filesystem implementation, but use a new *explicit* verb
  to communicate the expectation of the application, so that
filesystems are free
  the change behavior in the future in the absence of the new verb

Once again, ATOMIC_METADATA is a noop in preset xfs/ext4.
ATOMIC_DATA is sync_file_range() in present xfs/ext4.
The APIs I *need* from the kernel *do* exist, but the filesystem developers
(except xfs) are not willing to document the guarantee that the existing
interfaces provide in the present.

[...]
> So, in the interests of /informed debate/, please implement what you
> want using batched AIO_FSYNC + rename/linkat completion callback and
> measure what it acheives. Then implement a sync_file_range/linkat
> thread pool that provides the same functionality to the application
> (i.e. writeback concurrency in userspace) and measure it. Then we
> can discuss what the relative overhead is with numbers and can
> perform analysis to determine what the cause of the performance
> differential actually is.
>

Fare enough.

> Neither of these things require kernel modifications, but you need
> to provide the evidence that existing APIs are insufficient.

APIs are sufficient if I know which filesystem I am running on.
btrfs needs a different set of syscalls to get the same thing done.

> Indeed, we now have the new async ioring stuff that can run async
> sync_file_range calls, so you probably need to benchmark replacing
> AIO_FSYNC with that interface as well. This new API likely does
> exactly what you want without the journal/device cache flush
> overhead of AIO_FSYNC....
>

Indeed, I am keeping a close watch on io_uring.

Thanks,
Amir.

^ permalink raw reply

* Re: [RFCv2 1/6] mm: introduce MADV_COLD
From: Michal Hocko @ 2019-06-03  7:16 UTC (permalink / raw)
  To: Minchan Kim
  Cc: Andrew Morton, linux-mm, LKML, linux-api, Johannes Weiner,
	Tim Murray, Joel Fernandes, Suren Baghdasaryan, Daniel Colascione,
	Shakeel Butt, Sonny Rao, Brian Geffon, jannh, oleg, christian,
	oleksandr, hdanton
In-Reply-To: <20190531143407.GB216592@google.com>

On Fri 31-05-19 23:34:07, Minchan Kim wrote:
> On Fri, May 31, 2019 at 04:03:32PM +0200, Michal Hocko wrote:
> > On Fri 31-05-19 22:39:04, Minchan Kim wrote:
> > > On Fri, May 31, 2019 at 10:47:52AM +0200, Michal Hocko wrote:
> > > > On Fri 31-05-19 15:43:08, Minchan Kim wrote:
> > > > > When a process expects no accesses to a certain memory range, it could
> > > > > give a hint to kernel that the pages can be reclaimed when memory pressure
> > > > > happens but data should be preserved for future use.  This could reduce
> > > > > workingset eviction so it ends up increasing performance.
> > > > > 
> > > > > This patch introduces the new MADV_COLD hint to madvise(2) syscall.
> > > > > MADV_COLD can be used by a process to mark a memory range as not expected
> > > > > to be used in the near future. The hint can help kernel in deciding which
> > > > > pages to evict early during memory pressure.
> > > > > 
> > > > > Internally, it works via deactivating pages from active list to inactive's
> > > > > head if the page is private because inactive list could be full of
> > > > > used-once pages which are first candidate for the reclaiming and that's a
> > > > > reason why MADV_FREE move pages to head of inactive LRU list. Therefore,
> > > > > if the memory pressure happens, they will be reclaimed earlier than other
> > > > > active pages unless there is no access until the time.
> > > > 
> > > > [I am intentionally not looking at the implementation because below
> > > > points should be clear from the changelog - sorry about nagging ;)]
> > > > 
> > > > What kind of pages can be deactivated? Anonymous/File backed.
> > > > Private/shared? If shared, are there any restrictions?
> > > 
> > > Both file and private pages could be deactived from each active LRU
> > > to each inactive LRU if the page has one map_count. In other words,
> > > 
> > >     if (page_mapcount(page) <= 1)
> > >         deactivate_page(page);
> > 
> > Why do we restrict to pages that are single mapped?
> 
> Because page table in one of process shared the page would have access bit
> so finally we couldn't reclaim the page. The more process it is shared,
> the more fail to reclaim.

So what? In other words why should it be restricted solely based on the
map count. I can see a reason to restrict based on the access
permissions because we do not want to simplify all sorts of side channel
attacks but memory reclaim is capable of reclaiming shared pages and so
far I haven't heard any sound argument why madvise should skip those.
Again if there are any reasons, then document them in the changelog.
 
[...]

> > Please document this, if this is really a desirable semantic because
> > then you have the same set of problems as we've had with the early
> > MADV_FREE implementation mentioned above.
> 
> IIRC, the problem of MADV_FREE was that we couldn't discard freeable
> pages because VM never scan anonymous LRU with swapless system.
> However, it's not the our case because we should reclaim them, not
> discarding.

Right. But there is still the page cache reclaim. Is it expected that
an explicitly cold memory doesn't get reclaimed because we have a
sufficient amount of page cache (a very common case) and we never age
anonymous memory because of that?

-- 
Michal Hocko
SUSE Labs

^ permalink raw reply

* Re: [PATCH v4 0/3] initramfs: add support for xattrs in the initial ram disk
From: Roberto Sassu @ 2019-06-03  9:31 UTC (permalink / raw)
  To: viro
  Cc: linux-security-module, linux-integrity, initramfs, linux-api,
	linux-fsdevel, linux-kernel, bug-cpio, zohar, silviu.vlasceanu,
	dmitry.kasatkin, takondra, kamensky, hpa, arnd, rob,
	james.w.mcmechan, niveditas98
In-Reply-To: <20190523121803.21638-1-roberto.sassu@huawei.com>

Any opinion on this patch set?

Thanks

Roberto


On 5/23/2019 2:18 PM, Roberto Sassu wrote:
> This patch set aims at solving the following use case: appraise files from
> the initial ram disk. To do that, IMA checks the signature/hash from the
> security.ima xattr. Unfortunately, this use case cannot be implemented
> currently, as the CPIO format does not support xattrs.
> 
> This proposal consists in including file metadata as additional files named
> METADATA!!!, for each file added to the ram disk. The CPIO parser in the
> kernel recognizes these special files from the file name, and calls the
> appropriate parser to add metadata to the previously extracted file. It has
> been proposed to use bit 17:16 of the file mode as a way to recognize files
> with metadata, but both the kernel and the cpio tool declare the file mode
> as unsigned short.
> 
> The difference from v2, v3 (https://lkml.org/lkml/2019/5/9/230,
> https://lkml.org/lkml/2019/5/17/466) is that file metadata are stored in
> separate files instead of a single file. Given that files with metadata
> must immediately follow the files metadata will be added to, image
> generators have to be modified in this version.
> 
> The difference from v1 (https://lkml.org/lkml/2018/11/22/1182) is that
> all files have the same name. The file metadata are added to is always the
> previous one, and the image generator in user space will make sure that
> files are in the correct sequence.
> 
> The difference with another proposal
> (https://lore.kernel.org/patchwork/cover/888071/) is that xattrs can be
> included in an image without changing the image format. Files with metadata
> will appear as regular files. It will be task of the parser in the kernel
> to process them.
> 
> This patch set extends the format of data defined in patch 9/15 of the last
> proposal. It adds header version and type, so that new formats can be
> defined and arbitrary metadata types can be processed.
> 
> The changes introduced by this patch set don't cause any compatibility
> issue: kernels without the metadata parser simply extract the special files
> and don't process metadata; kernels with the metadata parser don't process
> metadata if the special files are not included in the image.
> 
>  From the kernel space perspective, backporting this functionality to older
> kernels should be very easy. It is sufficient to add two calls to the new
> function do_process_metadata() in do_copy(), and to check the file name in
> do_name(). From the user space perspective, unlike the previous version of
> the patch set, it is required to modify the image generators in order to
> include metadata as separate files.
> 
> Changelog
> 
> v3:
> - include file metadata as separate files named METADATA!!!
> - add the possibility to include in the ram disk arbitrary metadata types
> 
> v2:
> - replace ksys_lsetxattr() with kern_path() and vfs_setxattr()
>    (suggested by Jann Horn)
> - replace ksys_open()/ksys_read()/ksys_close() with
>    filp_open()/kernel_read()/fput()
>    (suggested by Jann Horn)
> - use path variable instead of name_buf in do_readxattrs()
> - set last byte of str to 0 in do_readxattrs()
> - call do_readxattrs() in do_name() before replacing an existing
>    .xattr-list
> - pass pathname to do_setxattrs()
> 
> v1:
> - move xattr unmarshaling to CPIO parser
> 
> 
> Mimi Zohar (1):
>    initramfs: add file metadata
> 
> Roberto Sassu (2):
>    initramfs: read metadata from special file METADATA!!!
>    gen_init_cpio: add support for file metadata
> 
>   include/linux/initramfs.h |  21 ++++++
>   init/initramfs.c          | 137 +++++++++++++++++++++++++++++++++++++-
>   usr/Kconfig               |   8 +++
>   usr/Makefile              |   4 +-
>   usr/gen_init_cpio.c       | 137 ++++++++++++++++++++++++++++++++++++--
>   usr/gen_initramfs_list.sh |  10 ++-
>   6 files changed, 305 insertions(+), 12 deletions(-)
>   create mode 100644 include/linux/initramfs.h
> 

-- 
HUAWEI TECHNOLOGIES Duesseldorf GmbH, HRB 56063
Managing Director: Bo PENG, Jian LI, Yanli SHI

^ permalink raw reply

* Re: [PATCH v9 12/16] sched/core: uclamp: Extend CPU's cgroup controller
From: Patrick Bellasi @ 2019-06-03 12:24 UTC (permalink / raw)
  To: Tejun Heo, Peter Zijlstra
  Cc: linux-kernel, linux-pm, linux-api, Ingo Molnar,
	Rafael J . Wysocki, Vincent Guittot, Viresh Kumar, Paul Turner,
	Quentin Perret, Dietmar Eggemann, Morten Rasmussen, Juri Lelli,
	Todd Kjos, Joel Fernandes, Steve Muckle, Suren Baghdasaryan
In-Reply-To: <20190531153545.GE374014@devbig004.ftw2.facebook.com>

On 31-May 08:35, Tejun Heo wrote:
> Hello, Patrick.

Hi Tejun!

> On Wed, May 15, 2019 at 10:44:55AM +0100, Patrick Bellasi wrote:
> > Extend the CPU controller with a couple of new attributes util.{min,max}
> > which allows to enforce utilization boosting and capping for all the
> > tasks in a group. Specifically:
> > 
> > - util.min: defines the minimum utilization which should be considered
> > 	    i.e. the RUNNABLE tasks of this group will run at least at a
> > 		 minimum frequency which corresponds to the util.min
> > 		 utilization
> > 
> > - util.max: defines the maximum utilization which should be considered
> > 	    i.e. the RUNNABLE tasks of this group will run up to a
> > 		 maximum frequency which corresponds to the util.max
> > 		 utilization
> 
> Let's please use a prefix which is more specific.  It's clamping the
> utilization estimates of the member tasks which in turn affect
> scheduling / frequency decisions but cpu.util.max reads like it's
> gonna limit the cpu utilization directly.  Maybe just use uclamp?

Being too specific does not risk to expose implementation details?

If that's not a problem and Peter likes:

   cpu.uclamp.{min,max}

that's ok with me.

-- 
#include <best/regards.h>

Patrick Bellasi

^ permalink raw reply

* Re: [PATCH v9 12/16] sched/core: uclamp: Extend CPU's cgroup controller
From: Patrick Bellasi @ 2019-06-03 12:27 UTC (permalink / raw)
  To: Tejun Heo
  Cc: linux-kernel, linux-pm, linux-api, Ingo Molnar, Peter Zijlstra,
	Rafael J . Wysocki, Vincent Guittot, Viresh Kumar, Paul Turner,
	Quentin Perret, Dietmar Eggemann, Morten Rasmussen, Juri Lelli,
	Todd Kjos, Joel Fernandes, Steve Muckle, Suren Baghdasaryan
In-Reply-To: <20190531153545.GE374014@devbig004.ftw2.facebook.com>

On 31-May 08:35, Tejun Heo wrote:

[...]

> > These attributes:
> > 
> > a) are available only for non-root nodes, both on default and legacy
> >    hierarchies, while system wide clamps are defined by a generic
> >    interface which does not depends on cgroups. This system wide
> >    interface enforces constraints on tasks in the root node.
> 
> I'd much prefer if they weren't entangled this way.  The system wide
> limits should work the same regardless of cgroup's existence.  cgroup
> can put further restriction on top but mere creation of cgroups with
> cpu controller enabled shouldn't take them out of the system-wide
> limits.

That's correct and what you describe matches, at least on its
intents, the current implementation provided in:

   [PATCH v9 14/16] sched/core: uclamp: Propagate system defaults to root group
   https://lore.kernel.org/lkml/20190515094459.10317-15-patrick.bellasi@arm.com/

System clamps always work the same way, independently from cgroups:
they define the upper bound for both min and max clamps.

When cgroups are not available, tasks specific clamps are always
capped by system clamps.

When cgroups are available, the root task group clamps are capped by
the system clamps, which affects its "effective" clamps and propagate
them down the hierarchy to child's "effective" clamps.
That's done in:

   [PATCH v9 13/16] sched/core: uclamp: Propagate parent clamps
   https://lore.kernel.org/lkml/20190515094459.10317-14-patrick.bellasi@arm.com/


Example 1
---------

Here is an example of system and groups clamps aggregation:

        	        min                        max
system defaults         400                        600

cg_name        	        min       min.effective	   max	     max.effective
 /uclamp               1024       400              500       500
 /uclamp/app              512       400	             512       500
 /uclamp/app/pid_smalls	    100	      100              200       200
 /uclamp/app/pid_bigs       500	      400              700       500


The ".effective" clamps are used to define the actual clamp value to
apply to tasks, according to the aggregation rules defined in:

   [PATCH v9 15/16] sched/core: uclamp: Use TG's clamps to restrict TASK's clamps
   https://lore.kernel.org/lkml/20190515094459.10317-16-patrick.bellasi@arm.com/

All the above, to me it means that:
 - cgroups are always capped by system clamps
 - cgroups can further restrict system clamps

Does that match with your view?

> > b) enforce effective constraints at each level of the hierarchy which
> >    are a restriction of the group requests considering its parent's
> >    effective constraints. Root group effective constraints are defined
> >    by the system wide interface.
> >    This mechanism allows each (non-root) level of the hierarchy to:
> >    - request whatever clamp values it would like to get
> >    - effectively get only up to the maximum amount allowed by its parent
> 
> I'll come back to this later.
> 
> > c) have higher priority than task-specific clamps, defined via
> >    sched_setattr(), thus allowing to control and restrict task requests
> 
> This sounds good.
> 
> > Add two new attributes to the cpu controller to collect "requested"
> > clamp values. Allow that at each non-root level of the hierarchy.
> > Validate local consistency by enforcing util.min < util.max.
> > Keep it simple by do not caring now about "effective" values computation
> > and propagation along the hierarchy.
> 
> So, the followings are what we're doing for hierarchical protection
> and limit propgations.
> 
> * Limits (high / max) default to max.  Protections (low / min) 0.  A
>   new cgroup by default doesn't constrain itself further and doesn't
>   have any protection.

Example 2
---------

Let say we have:

  /tg1:
        util_min=200 (as a protection)
        util_max=800 (as a limit)

the moment we create a subgroup /tg1/tg11, in v9 it is initialized
with the same limits _and protections_ of its father:

  /tg1/tg11:
        util_min=200 (protection inherited from /tg1)
        util_max=800 (limit inherited from /tg1)

Do you mean that we should have instead:

  /tg1/tg11:
        util_min=0   (no protection by default at creation time)
        util_max=800 (limit inherited from /tg1)


i.e. we need to reset the protection of a newly created subgroup?


> * A limit defines the upper ceiling for the subtree.  If an ancestor
>   has a limit of X, none of its descendants can have more than X.

That's correct, however we distinguish between "requested" and
"effective" values.


Example 3
---------

We can have:

cg_name        	        max       max.effective
 /uclamp/app            400                 400
 /uclamp/app/pid_bigs     500	              400


Which means that a subgroup can "request" a limit (max=500) higher
then its father (max=400), while still getting only up to what its
father allows (max.effective = 400).


Example 4
---------

Tracking the actual requested limit (max=500) it's useful to enforce
it once the father limit should be relaxed, for example we will have:

cg_name        	        max       max.effective
 /uclamp/app            600                 600
 /uclamp/app/pid_bigs     500	              500

where a subgroup gets not more than what it has been configured for.

This is the logic implemented by cpu_util_update_eff() in:

   [PATCH v9 13/16] sched/core: uclamp: Propagate parent clamps
   https://lore.kernel.org/lkml/20190515094459.10317-14-patrick.bellasi@arm.com/


> * A protection defines the upper ceiling of protections for the
>   subtree.  If an andester has a protection of X, none of its
>   descendants can have more protection than X.

Right, that's the current behavior in v9.

> Note that there's no way for an ancestor to enforce protection its
> descendants.  It can only allow them to claim some.  This is
> intentional as the other end of the spectrum is either descendants
> losing the ability to further distribute protections as they see fit.

Ok, that means I need to update in v10 the initialization of subgroups
min clamps to be none by default as discussed in the above Example 2,
right?

[...]

Cheers,
Patrick

-- 
#include <best/regards.h>

Patrick Bellasi

^ permalink raw reply

* Re: [PATCH v9 12/16] sched/core: uclamp: Extend CPU's cgroup controller
From: Patrick Bellasi @ 2019-06-03 12:29 UTC (permalink / raw)
  To: Tejun Heo, Peter Zijlstra
  Cc: linux-kernel, linux-pm, linux-api, Ingo Molnar,
	Rafael J . Wysocki, Vincent Guittot, Viresh Kumar, Paul Turner,
	Quentin Perret, Dietmar Eggemann, Morten Rasmussen, Juri Lelli,
	Todd Kjos, Joel Fernandes, Steve Muckle, Suren Baghdasaryan
In-Reply-To: <20190531153545.GE374014@devbig004.ftw2.facebook.com>

On 31-May 08:35, Tejun Heo wrote:
> Hello, Patrick.
> 
> On Wed, May 15, 2019 at 10:44:55AM +0100, Patrick Bellasi wrote:

[...]

> For proportions (as opposed to weights), we use percentage rational
> numbers - e.g. 38.44 for 38.44%.  I have parser and doc update commits
> pending.  I'll put them on cgroup/for-5.3.

That's a point worth discussing with Peter, we already changed one
time from percentages to 1024 scale.

Utilization clamps are expressed as percentages by definition,
they are just expressed in a convenient 1024 scale which should not be
alien to people using those knobs.

If we wanna use a "more specific" name like uclamp.{min,max} then we
should probably also accept to use a "more specific" metric, don't we?

I personally like the [0..1024] range, but I guess that's really up to
you and Peter to agree upon.

> Thanks.
> 
> --
> tejun

Cheers,
Patrick

-- 
#include <best/regards.h>

Patrick Bellasi

^ permalink raw reply

* Re: [PATCH v2 0/7] mm: process_vm_mmap() -- syscall for duplication a process mapping
From: Kirill Tkhai @ 2019-06-03 14:38 UTC (permalink / raw)
  To: Kirill A. Shutemov
  Cc: akpm, dan.j.williams, mhocko, keith.busch, kirill.shutemov,
	alexander.h.duyck, ira.weiny, andreyknvl, arunks, vbabka, cl,
	riel, keescook, hannes, npiggin, mathieu.desnoyers, shakeelb,
	guro, aarcange, hughd, jglisse, mgorman, daniel.m.jordan, jannh,
	kilobyte, linux-api, linux-kernel, linux-mm
In-Reply-To: <20190522152254.5cyxhjizuwuojlix@box>

On 22.05.2019 18:22, Kirill A. Shutemov wrote:
> On Mon, May 20, 2019 at 05:00:01PM +0300, Kirill Tkhai wrote:
>> This patchset adds a new syscall, which makes possible
>> to clone a VMA from a process to current process.
>> The syscall supplements the functionality provided
>> by process_vm_writev() and process_vm_readv() syscalls,
>> and it may be useful in many situation.
> 
> Kirill, could you explain how the change affects rmap and how it is safe.
> 
> My concern is that the patchset allows to map the same page multiple times
> within one process or even map page allocated by child to the parrent.

Speaking honestly, we already support this model, since ZERO_PAGE() may
be mapped multiply times in any number of mappings.

Kirill

^ permalink raw reply

* [PATCH v2 1/2] fork: add clone3
From: Christian Brauner @ 2019-06-03 14:43 UTC (permalink / raw)
  To: viro, linux-kernel, torvalds, jannh
  Cc: keescook, fweimer, oleg, arnd, dhowells, Christian Brauner,
	Pavel Emelyanov, Andrew Morton, Adrian Reber, Andrei Vagin,
	linux-api

This adds the clone3 system call.

As mentioned several times already (cf. [7], [8]) here's the promised
patchset for clone3().

We recently merged the CLONE_PIDFD patchset (cf. [1]). It took the last
free flag from clone().

Independent of the CLONE_PIDFD patchset a time namespace has been discussed
at Linux Plumber Conference last year and has been sent out and reviewed
(cf. [5]). It is expected that it will go upstream in the not too distant
future. However, it relies on the addition of the CLONE_NEWTIME flag to
clone(). The only other good candidate - CLONE_DETACHED - is currently not
recyclable as we have identified at least two large or widely used
codebases that currently pass this flag (cf. [2], [3], and [4]). Given that
CLONE_PIDFD grabbed the last clone() flag the time namespace is effectively
blocked. clone3() has the advantage that it will unblock this patchset
again.

The idea is to keep clone3() very simple and close to the original clone(),
specifically, to keep on supporting old clone()-based workloads.
We know there have been various creative proposals how a new process
creation syscall or even api is supposed to look like. Some people even
going so far as to argue that the traditional fork()+exec() split should be
abandoned in favor of an in-kernel version of spawn(). Independent of
whether or not we personally think spawn() is a good idea this patchset has
and does not want to have anything to do with this.
One stance we take is that there's no real good alternative to
clone()+exec() and we need and want to support this model going forward;
independent of spawn().
The following requirements guided clone3():
- bump the number of available flags
- move arguments that are currently passed as separate arguments
  in clone() into a dedicated struct clone_args
  - choose a struct layout that is easy to handle on 32 and on 64 bit
  - choose a struct layout that is extensible
  - give new flags that currently need to abuse another flag's dedicated
    return argument in clone() their own dedicated return argument
    (e.g. CLONE_PIDFD)
  - use a separate kernel internal struct kernel_clone_args that is
    properly typed according to current kernel conventions in fork.c and is
    different from  the uapi struct clone_args
- port _do_fork() to use kernel_clone_args so that all process creation
  syscalls such as fork(), vfork(), clone(), and clone3() behave identical
  (Arnd suggested, that we can probably also port do_fork() itself in a
   separate patchset.)
- ease of transition for userspace from clone() to clone3()
  This very much means that we do *not* remove functionality that userspace
  currently relies on as the latter is a good way of creating a syscall
  that won't be adopted.
- do not try to be clever or complex: keep clone3() as dumb as possible

In accordance with Linus suggestions (cf. [11]), clone3() has the following
signature:

/* uapi */
struct clone_args {
        __aligned_u64 flags;
        __aligned_u64 pidfd;
        __aligned_u64 child_tid;
        __aligned_u64 parent_tid;
        __aligned_u64 exit_signal;
        __aligned_u64 stack;
        __aligned_u64 stack_size;
        __aligned_u64 tls;
};

/* kernel internal */
struct kernel_clone_args {
        u64 flags;
        int __user *pidfd;
        int __user *child_tid;
        int __user *parent_tid;
        int exit_signal;
        unsigned long stack;
        unsigned long stack_size;
        unsigned long tls;
};

long sys_clone3(struct clone_args __user *uargs, size_t size)

clone3() cleanly supports all of the supported flags from clone() and thus
all legacy workloads.
The advantage of sticking close to the old clone() is the low cost for
userspace to switch to this new api. Quite a lot of userspace apis (e.g.
pthreads) are based on the clone() syscall. With the new clone3() syscall
supporting all of the old workloads and opening up the ability to add new
features should make switching to it for userspace more appealing. In
essence, glibc can just write a simple wrapper to switch from clone() to
clone3().

There has been some interest in this patchset already. We have received a
patch from the CRIU corner for clone3() that would set the PID/TID of a
restored process without /proc/sys/kernel/ns_last_pid to eliminate a race.

/* User visible differences to legacy clone() */
- CLONE_DETACHED will cause EINVAL with clone3()
- CSIGNAL is deprecated
  It is superseeded by a dedicated "exit_signal" argument in struct
  clone_args freeing up space for additional flags.
  This is based on a suggestion from Andrei and Linus (cf. [9] and [10])

/* References */
[1]: b3e5838252665ee4cfa76b82bdf1198dca81e5be
[2]: https://dxr.mozilla.org/mozilla-central/source/security/sandbox/linux/SandboxFilter.cpp#343
[3]: https://git.musl-libc.org/cgit/musl/tree/src/thread/pthread_create.c#n233
[4]: https://sources.debian.org/src/blcr/0.8.5-2.3/cr_module/cr_dump_self.c/?hl=740#L740
[5]: https://lore.kernel.org/lkml/20190425161416.26600-1-dima@arista.com/
[6]: https://lore.kernel.org/lkml/20190425161416.26600-2-dima@arista.com/
[7]: https://lore.kernel.org/lkml/CAHrFyr5HxpGXA2YrKza-oB-GGwJCqwPfyhD-Y5wbktWZdt0sGQ@mail.gmail.com/
[8]: https://lore.kernel.org/lkml/20190524102756.qjsjxukuq2f4t6bo@brauner.io/
[9]: https://lore.kernel.org/lkml/20190529222414.GA6492@gmail.com/
[10]: https://lore.kernel.org/lkml/CAHk-=whQP-Ykxi=zSYaV9iXsHsENa+2fdj-zYKwyeyed63Lsfw@mail.gmail.com/
[11]: https://lore.kernel.org/lkml/CAHk-=wieuV4hGwznPsX-8E0G2FKhx3NjZ9X3dTKh5zKd+iqOBw@mail.gmail.com/

Suggested-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Christian Brauner <christian@brauner.io>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Kees Cook <keescook@chromium.org>
Cc: Pavel Emelyanov <xemul@virtuozzo.com>
Cc: Jann Horn <jannh@google.com>
Cc: David Howells <dhowells@redhat.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Adrian Reber <adrian@lisas.de>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Andrei Vagin <avagin@gmail.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Florian Weimer <fweimer@redhat.com>
Cc: linux-api@vger.kernel.org
---
v1:
- Linus Torvalds <torvalds@linux-foundation.org>:
  - redesign based on Linus proposal
  - switch from arg-based to revision-based naming scheme: s/clone6/clone3/
- Arnd Bergmann <arnd@arndb.de>:
  - use a single copy_from_user() instead of multiple get_user() calls
    since the latter have a constant overhead on some architectures
  - a range of other tweaks and suggestions
v2:
- Linus Torvalds <torvalds@linux-foundation.org>,
  Andrei Vagin <avagin@gmail.com>:
  - replace CSIGNAL flag with dedicated exit_signal argument in struct
    clone_args
- Christian Brauner <christian@brauner.io>:
  - improve naming for some struct clone_args members
---
 arch/x86/ia32/sys_ia32.c   |  12 ++-
 include/linux/sched/task.h |  14 ++-
 include/linux/syscalls.h   |   6 ++
 include/uapi/linux/sched.h |  17 ++++
 kernel/fork.c              | 188 ++++++++++++++++++++++++++++---------
 5 files changed, 190 insertions(+), 47 deletions(-)

diff --git a/arch/x86/ia32/sys_ia32.c b/arch/x86/ia32/sys_ia32.c
index a43212036257..26525c0cd5e6 100644
--- a/arch/x86/ia32/sys_ia32.c
+++ b/arch/x86/ia32/sys_ia32.c
@@ -237,6 +237,14 @@ COMPAT_SYSCALL_DEFINE5(x86_clone, unsigned long, clone_flags,
 		       unsigned long, newsp, int __user *, parent_tidptr,
 		       unsigned long, tls_val, int __user *, child_tidptr)
 {
-	return _do_fork(clone_flags, newsp, 0, parent_tidptr, child_tidptr,
-			tls_val);
+	struct kernel_clone_args args = {
+		.flags = (clone_flags & ~CSIGNAL),
+		.child_tid = child_tidptr,
+		.parent_tid = parent_tidptr,
+		.exit_signal = (clone_flags & CSIGNAL),
+		.stack = newsp,
+		.tls = tls_val,
+	};
+
+	return _do_fork(&args);
 }
diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h
index f1227f2c38a4..9444d4f2bd83 100644
--- a/include/linux/sched/task.h
+++ b/include/linux/sched/task.h
@@ -8,11 +8,23 @@
  */
 
 #include <linux/sched.h>
+#include <linux/compiler_types.h>
 
 struct task_struct;
 struct rusage;
 union thread_union;
 
+struct kernel_clone_args {
+	u64 flags;
+	int __user *pidfd;
+	int __user *child_tid;
+	int __user *parent_tid;
+	int exit_signal;
+	unsigned long stack;
+	unsigned long stack_size;
+	unsigned long tls;
+};
+
 /*
  * This serializes "schedule()" and also protects
  * the run-queue from deletions/modifications (but
@@ -73,7 +85,7 @@ extern void do_group_exit(int);
 extern void exit_files(struct task_struct *);
 extern void exit_itimers(struct signal_struct *);
 
-extern long _do_fork(unsigned long, unsigned long, unsigned long, int __user *, int __user *, unsigned long);
+extern long _do_fork(struct kernel_clone_args *kargs);
 extern long do_fork(unsigned long, unsigned long, unsigned long, int __user *, int __user *);
 struct task_struct *fork_idle(int);
 struct mm_struct *copy_init_mm(void);
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index e2870fe1be5b..254db24af0cd 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -70,6 +70,7 @@ struct sigaltstack;
 struct rseq;
 union bpf_attr;
 struct io_uring_params;
+struct clone_args;
 
 #include <linux/types.h>
 #include <linux/aio_abi.h>
@@ -852,6 +853,11 @@ asmlinkage long sys_clone(unsigned long, unsigned long, int __user *,
 	       int __user *, unsigned long);
 #endif
 #endif
+
+#ifdef __ARCH_WANT_SYS_CLONE
+asmlinkage long sys_clone3(struct clone_args __user *uargs, size_t size);
+#endif
+
 asmlinkage long sys_execve(const char __user *filename,
 		const char __user *const __user *argv,
 		const char __user *const __user *envp);
diff --git a/include/uapi/linux/sched.h b/include/uapi/linux/sched.h
index ed4ee170bee2..634af5ec07b5 100644
--- a/include/uapi/linux/sched.h
+++ b/include/uapi/linux/sched.h
@@ -2,6 +2,8 @@
 #ifndef _UAPI_LINUX_SCHED_H
 #define _UAPI_LINUX_SCHED_H
 
+#include <linux/types.h>
+
 /*
  * cloning flags:
  */
@@ -30,6 +32,21 @@
 #define CLONE_NEWPID		0x20000000	/* New pid namespace */
 #define CLONE_NEWNET		0x40000000	/* New network namespace */
 #define CLONE_IO		0x80000000	/* Clone io context */
+#define CLONE_MAX ~0U
+
+/*
+ * Arguments for the clone3 syscall
+ */
+struct clone_args {
+	__aligned_u64 flags;
+	__aligned_u64 pidfd;
+	__aligned_u64 child_tid;
+	__aligned_u64 parent_tid;
+	__aligned_u64 exit_signal;
+	__aligned_u64 stack;
+	__aligned_u64 stack_size;
+	__aligned_u64 tls;
+};
 
 /*
  * Scheduling policies
diff --git a/kernel/fork.c b/kernel/fork.c
index b4cba953040a..32f27e1d99da 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1760,19 +1760,19 @@ static __always_inline void delayed_free_task(struct task_struct *tsk)
  * flags). The actual kick-off is left to the caller.
  */
 static __latent_entropy struct task_struct *copy_process(
-					unsigned long clone_flags,
-					unsigned long stack_start,
-					unsigned long stack_size,
-					int __user *parent_tidptr,
-					int __user *child_tidptr,
 					struct pid *pid,
 					int trace,
-					unsigned long tls,
-					int node)
+					int node,
+					struct kernel_clone_args *args)
 {
 	int pidfd = -1, retval;
 	struct task_struct *p;
 	struct multiprocess_signals delayed;
+	u64 clone_flags = args->flags;
+	int __user *child_tidptr = args->child_tid;
+	unsigned long tls = args->tls;
+	unsigned long stack_start = args->stack;
+	unsigned long stack_size = args->stack_size;
 
 	/*
 	 * Don't allow sharing the root directory with processes in a different
@@ -1821,27 +1821,12 @@ static __latent_entropy struct task_struct *copy_process(
 	}
 
 	if (clone_flags & CLONE_PIDFD) {
-		int reserved;
-
 		/*
-		 * - CLONE_PARENT_SETTID is useless for pidfds and also
-		 *   parent_tidptr is used to return pidfds.
 		 * - CLONE_DETACHED is blocked so that we can potentially
 		 *   reuse it later for CLONE_PIDFD.
 		 * - CLONE_THREAD is blocked until someone really needs it.
 		 */
-		if (clone_flags &
-		    (CLONE_DETACHED | CLONE_PARENT_SETTID | CLONE_THREAD))
-			return ERR_PTR(-EINVAL);
-
-		/*
-		 * Verify that parent_tidptr is sane so we can potentially
-		 * reuse it later.
-		 */
-		if (get_user(reserved, parent_tidptr))
-			return ERR_PTR(-EFAULT);
-
-		if (reserved != 0)
+		if (clone_flags & (CLONE_DETACHED | CLONE_THREAD))
 			return ERR_PTR(-EINVAL);
 	}
 
@@ -2062,7 +2047,7 @@ static __latent_entropy struct task_struct *copy_process(
 			goto bad_fork_free_pid;
 
 		pidfd = retval;
-		retval = put_user(pidfd, parent_tidptr);
+		retval = put_user(pidfd, args->pidfd);
 		if (retval)
 			goto bad_fork_put_pidfd;
 	}
@@ -2105,7 +2090,7 @@ static __latent_entropy struct task_struct *copy_process(
 		if (clone_flags & CLONE_PARENT)
 			p->exit_signal = current->group_leader->exit_signal;
 		else
-			p->exit_signal = (clone_flags & CSIGNAL);
+			p->exit_signal = args->exit_signal;
 		p->group_leader = p;
 		p->tgid = p->pid;
 	}
@@ -2313,8 +2298,11 @@ static inline void init_idle_pids(struct task_struct *idle)
 struct task_struct *fork_idle(int cpu)
 {
 	struct task_struct *task;
-	task = copy_process(CLONE_VM, 0, 0, NULL, NULL, &init_struct_pid, 0, 0,
-			    cpu_to_node(cpu));
+	struct kernel_clone_args args = {
+		.flags = CLONE_VM,
+	};
+
+	task = copy_process(&init_struct_pid, 0, cpu_to_node(cpu), &args);
 	if (!IS_ERR(task)) {
 		init_idle_pids(task);
 		init_idle(task, cpu);
@@ -2334,18 +2322,15 @@ struct mm_struct *copy_init_mm(void)
  * It copies the process, and if successful kick-starts
  * it and waits for it to finish using the VM if required.
  */
-long _do_fork(unsigned long clone_flags,
-	      unsigned long stack_start,
-	      unsigned long stack_size,
-	      int __user *parent_tidptr,
-	      int __user *child_tidptr,
-	      unsigned long tls)
+long _do_fork(struct kernel_clone_args *args)
 {
+	u64 clone_flags = args->flags;
 	struct completion vfork;
 	struct pid *pid;
 	struct task_struct *p;
 	int trace = 0;
 	long nr;
+	int __user *parent_tidptr = args->parent_tid;
 
 	/*
 	 * Determine whether and which event to report to ptracer.  When
@@ -2356,7 +2341,7 @@ long _do_fork(unsigned long clone_flags,
 	if (!(clone_flags & CLONE_UNTRACED)) {
 		if (clone_flags & CLONE_VFORK)
 			trace = PTRACE_EVENT_VFORK;
-		else if ((clone_flags & CSIGNAL) != SIGCHLD)
+		else if (args->exit_signal != SIGCHLD)
 			trace = PTRACE_EVENT_CLONE;
 		else
 			trace = PTRACE_EVENT_FORK;
@@ -2365,8 +2350,7 @@ long _do_fork(unsigned long clone_flags,
 			trace = 0;
 	}
 
-	p = copy_process(clone_flags, stack_start, stack_size, parent_tidptr,
-			 child_tidptr, NULL, trace, tls, NUMA_NO_NODE);
+	p = copy_process(NULL, trace, NUMA_NO_NODE, args);
 	add_latent_entropy();
 
 	if (IS_ERR(p))
@@ -2414,8 +2398,16 @@ long do_fork(unsigned long clone_flags,
 	      int __user *parent_tidptr,
 	      int __user *child_tidptr)
 {
-	return _do_fork(clone_flags, stack_start, stack_size,
-			parent_tidptr, child_tidptr, 0);
+	struct kernel_clone_args args = {
+		.flags = (clone_flags & ~CSIGNAL),
+		.child_tid = child_tidptr,
+		.parent_tid = parent_tidptr,
+		.exit_signal = (clone_flags & CSIGNAL),
+		.stack = stack_start,
+		.stack_size = stack_size,
+	};
+
+	return _do_fork(&args);
 }
 #endif
 
@@ -2424,15 +2416,25 @@ long do_fork(unsigned long clone_flags,
  */
 pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags)
 {
-	return _do_fork(flags|CLONE_VM|CLONE_UNTRACED, (unsigned long)fn,
-		(unsigned long)arg, NULL, NULL, 0);
+	struct kernel_clone_args args = {
+		.flags = ((flags | CLONE_VM | CLONE_UNTRACED) & ~CSIGNAL),
+		.exit_signal = (flags & CSIGNAL),
+		.stack = (unsigned long)fn,
+		.stack_size = (unsigned long)arg,
+	};
+
+	return _do_fork(&args);
 }
 
 #ifdef __ARCH_WANT_SYS_FORK
 SYSCALL_DEFINE0(fork)
 {
 #ifdef CONFIG_MMU
-	return _do_fork(SIGCHLD, 0, 0, NULL, NULL, 0);
+	struct kernel_clone_args args = {
+		.exit_signal = SIGCHLD,
+	};
+
+	return _do_fork(&args);
 #else
 	/* can not support in nommu mode */
 	return -EINVAL;
@@ -2443,8 +2445,12 @@ SYSCALL_DEFINE0(fork)
 #ifdef __ARCH_WANT_SYS_VFORK
 SYSCALL_DEFINE0(vfork)
 {
-	return _do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, 0,
-			0, NULL, NULL, 0);
+	struct kernel_clone_args args = {
+		.flags = CLONE_VFORK | CLONE_VM,
+		.exit_signal = SIGCHLD,
+	};
+
+	return _do_fork(&args);
 }
 #endif
 
@@ -2472,7 +2478,101 @@ SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp,
 		 unsigned long, tls)
 #endif
 {
-	return _do_fork(clone_flags, newsp, 0, parent_tidptr, child_tidptr, tls);
+	struct kernel_clone_args args = {
+		.flags = (clone_flags & ~CSIGNAL),
+		.pidfd = parent_tidptr,
+		.child_tid = child_tidptr,
+		.parent_tid = parent_tidptr,
+		.exit_signal = (clone_flags & CSIGNAL),
+		.stack = newsp,
+		.tls = tls,
+	};
+
+	/* clone(CLONE_PIDFD) uses parent_tidptr to return a pidfd */
+	if ((clone_flags & CLONE_PIDFD) && (clone_flags & CLONE_PARENT_SETTID))
+		return -EINVAL;
+
+	return _do_fork(&args);
+}
+
+static bool clone3_flags_valid(u64 flags)
+{
+	if (flags & ~CLONE_MAX)
+		return false;
+
+	if (flags & (CLONE_DETACHED | CSIGNAL))
+		return false;
+
+	return true;
+}
+
+static int copy_clone_args_from_user(struct kernel_clone_args *kargs,
+				     struct clone_args __user *uargs,
+				     size_t size)
+{
+	struct clone_args args;
+
+	if (unlikely(size > PAGE_SIZE))
+		return -E2BIG;
+
+	if (unlikely(size < sizeof(struct clone_args)))
+		return -EINVAL;
+
+	if (unlikely(!access_ok(uargs, size)))
+		return -EFAULT;
+
+	if (size > sizeof(struct clone_args)) {
+		unsigned char __user *addr;
+		unsigned char __user *end;
+		unsigned char val;
+
+		addr = (void __user *)uargs + sizeof(struct clone_args);
+		end = (void __user *)uargs + size;
+
+		for (; addr < end; addr++) {
+			if (get_user(val, addr))
+				return -EFAULT;
+			if (val)
+				return -E2BIG;
+		}
+
+		size = sizeof(struct clone_args);
+	}
+
+	if (copy_from_user(&args, uargs, size))
+		return -EFAULT;
+
+	if (!clone3_flags_valid(args.flags))
+		return -EINVAL;
+
+	if ((args.flags & (CLONE_THREAD | CLONE_PARENT)) && args.exit_signal)
+		return -EINVAL;
+
+	memset(kargs, 0, sizeof(*kargs));
+
+	kargs->flags = args.flags;
+	kargs->exit_signal = args.exit_signal;
+	kargs->child_tid = u64_to_user_ptr(args.child_tid);
+	kargs->parent_tid = u64_to_user_ptr(args.parent_tid);
+	kargs->pidfd = u64_to_user_ptr(args.pidfd);
+	kargs->stack = args.stack;
+	kargs->stack_size = args.stack_size;
+	kargs->tls = args.tls;
+
+	return 0;
+}
+
+SYSCALL_DEFINE2(clone3, struct clone_args __user *, uargs, size_t, size)
+{
+	int err;
+
+	struct kernel_clone_args kargs;
+
+	err = copy_clone_args_from_user(&kargs, uargs, size);
+	if (err)
+		return err;
+
+	return _do_fork(&kargs);
 }
 #endif
 
-- 
2.21.0

^ permalink raw reply related

* [PATCH v2 2/2] arch: wire-up clone3() syscall on x86
From: Christian Brauner @ 2019-06-03 14:43 UTC (permalink / raw)
  To: viro, linux-kernel, torvalds, jannh
  Cc: keescook, fweimer, oleg, arnd, dhowells, Christian Brauner,
	Andrew Morton, Adrian Reber, linux-api, linux-arch, x86
In-Reply-To: <20190603144331.16760-1-christian@brauner.io>

Wire up the clone3() call on x86.

This patch only wires up clone3() on x86. Some of the arches look like they
need special assembly massaging and it is probably smarter if the
appropriate arch maintainers would do the actual wiring.

Signed-off-by: Christian Brauner <christian@brauner.io>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Kees Cook <keescook@chromium.org>
Cc: David Howells <dhowells@redhat.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Adrian Reber <adrian@lisas.de>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Florian Weimer <fweimer@redhat.com>
Cc: linux-api@vger.kernel.org
Cc: linux-arch@vger.kernel.org
Cc: x86@kernel.org
---
v1: unchanged
v2: unchanged
---
 arch/x86/entry/syscalls/syscall_32.tbl | 1 +
 arch/x86/entry/syscalls/syscall_64.tbl | 1 +
 include/uapi/asm-generic/unistd.h      | 4 +++-
 3 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
index ad968b7bac72..80e26211feff 100644
--- a/arch/x86/entry/syscalls/syscall_32.tbl
+++ b/arch/x86/entry/syscalls/syscall_32.tbl
@@ -438,3 +438,4 @@
 431	i386	fsconfig		sys_fsconfig			__ia32_sys_fsconfig
 432	i386	fsmount			sys_fsmount			__ia32_sys_fsmount
 433	i386	fspick			sys_fspick			__ia32_sys_fspick
+436	i386	clone3			sys_clone3			__ia32_sys_clone3
diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
index b4e6f9e6204a..7968f0b5b5e8 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -355,6 +355,7 @@
 431	common	fsconfig		__x64_sys_fsconfig
 432	common	fsmount			__x64_sys_fsmount
 433	common	fspick			__x64_sys_fspick
+436	common	clone3			__x64_sys_clone3/ptregs
 
 #
 # x32-specific system call numbers start at 512 to avoid cache impact
diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h
index a87904daf103..45bc87687c47 100644
--- a/include/uapi/asm-generic/unistd.h
+++ b/include/uapi/asm-generic/unistd.h
@@ -844,9 +844,11 @@ __SYSCALL(__NR_fsconfig, sys_fsconfig)
 __SYSCALL(__NR_fsmount, sys_fsmount)
 #define __NR_fspick 433
 __SYSCALL(__NR_fspick, sys_fspick)
+#define __NR_clone3 436
+__SYSCALL(__NR_clone3, sys_clone3)
 
 #undef __NR_syscalls
-#define __NR_syscalls 434
+#define __NR_syscalls 437
 
 /*
  * 32 bit systems traditionally used different
-- 
2.21.0

^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox