Linux userland API discussions
 help / color / mirror / Atom feed
* Re: [RFCv2 4/6] mm: factor out madvise's core functionality
From: Minchan Kim @ 2019-05-31 23:29 UTC (permalink / raw)
  To: Oleksandr Natalenko
  Cc: Andrew Morton, linux-mm, LKML, linux-api, Michal Hocko,
	Johannes Weiner, Tim Murray, Joel Fernandes, Suren Baghdasaryan,
	Daniel Colascione, Shakeel Butt, Sonny Rao, Brian Geffon, jannh,
	oleg, christian, hdanton
In-Reply-To: <20190531143545.jwmgzaigd4rbw2wy@butterfly.localdomain>

On Fri, May 31, 2019 at 04:35:45PM +0200, Oleksandr Natalenko wrote:
> On Fri, May 31, 2019 at 10:12:26PM +0900, Minchan Kim wrote:
> > On Fri, May 31, 2019 at 09:04:20AM +0200, Oleksandr Natalenko wrote:
> > > On Fri, May 31, 2019 at 03:43:11PM +0900, Minchan Kim wrote:
> > > > This patch factor out madvise's core functionality so that upcoming
> > > > patch can reuse it without duplication. It shouldn't change any behavior.
> > > > 
> > > > Signed-off-by: Minchan Kim <minchan@kernel.org>
> > > > ---
> > > >  mm/madvise.c | 188 +++++++++++++++++++++++++++------------------------
> > > >  1 file changed, 101 insertions(+), 87 deletions(-)
> > > > 
> > > > diff --git a/mm/madvise.c b/mm/madvise.c
> > > > index 9d749a1420b4..466623ea8c36 100644
> > > > --- a/mm/madvise.c
> > > > +++ b/mm/madvise.c
> > > > @@ -425,9 +425,10 @@ static int madvise_pageout_pte_range(pmd_t *pmd, unsigned long addr,
> > > >  	struct page *page;
> > > >  	int isolated = 0;
> > > >  	struct vm_area_struct *vma = walk->vma;
> > > > +	struct task_struct *task = walk->private;
> > > >  	unsigned long next;
> > > >  
> > > > -	if (fatal_signal_pending(current))
> > > > +	if (fatal_signal_pending(task))
> > > >  		return -EINTR;
> > > >  
> > > >  	next = pmd_addr_end(addr, end);
> > > > @@ -505,12 +506,14 @@ static int madvise_pageout_pte_range(pmd_t *pmd, unsigned long addr,
> > > >  }
> > > >  
> > > >  static void madvise_pageout_page_range(struct mmu_gather *tlb,
> > > > -			     struct vm_area_struct *vma,
> > > > -			     unsigned long addr, unsigned long end)
> > > > +				struct task_struct *task,
> > > > +				struct vm_area_struct *vma,
> > > > +				unsigned long addr, unsigned long end)
> > > >  {
> > > >  	struct mm_walk warm_walk = {
> > > >  		.pmd_entry = madvise_pageout_pte_range,
> > > >  		.mm = vma->vm_mm,
> > > > +		.private = task,
> > > >  	};
> > > >  
> > > >  	tlb_start_vma(tlb, vma);
> > > > @@ -519,9 +522,9 @@ static void madvise_pageout_page_range(struct mmu_gather *tlb,
> > > >  }
> > > >  
> > > >  
> > > > -static long madvise_pageout(struct vm_area_struct *vma,
> > > > -			struct vm_area_struct **prev,
> > > > -			unsigned long start_addr, unsigned long end_addr)
> > > > +static long madvise_pageout(struct task_struct *task,
> > > > +		struct vm_area_struct *vma, struct vm_area_struct **prev,
> > > > +		unsigned long start_addr, unsigned long end_addr)
> > > >  {
> > > >  	struct mm_struct *mm = vma->vm_mm;
> > > >  	struct mmu_gather tlb;
> > > > @@ -532,7 +535,7 @@ static long madvise_pageout(struct vm_area_struct *vma,
> > > >  
> > > >  	lru_add_drain();
> > > >  	tlb_gather_mmu(&tlb, mm, start_addr, end_addr);
> > > > -	madvise_pageout_page_range(&tlb, vma, start_addr, end_addr);
> > > > +	madvise_pageout_page_range(&tlb, task, vma, start_addr, end_addr);
> > > >  	tlb_finish_mmu(&tlb, start_addr, end_addr);
> > > >  
> > > >  	return 0;
> > > > @@ -744,7 +747,8 @@ static long madvise_dontneed_single_vma(struct vm_area_struct *vma,
> > > >  	return 0;
> > > >  }
> > > >  
> > > > -static long madvise_dontneed_free(struct vm_area_struct *vma,
> > > > +static long madvise_dontneed_free(struct mm_struct *mm,
> > > > +				  struct vm_area_struct *vma,
> > > >  				  struct vm_area_struct **prev,
> > > >  				  unsigned long start, unsigned long end,
> > > >  				  int behavior)
> > > > @@ -756,8 +760,8 @@ static long madvise_dontneed_free(struct vm_area_struct *vma,
> > > >  	if (!userfaultfd_remove(vma, start, end)) {
> > > >  		*prev = NULL; /* mmap_sem has been dropped, prev is stale */
> > > >  
> > > > -		down_read(&current->mm->mmap_sem);
> > > > -		vma = find_vma(current->mm, start);
> > > > +		down_read(&mm->mmap_sem);
> > > > +		vma = find_vma(mm, start);
> > > >  		if (!vma)
> > > >  			return -ENOMEM;
> > > >  		if (start < vma->vm_start) {
> > > > @@ -804,7 +808,8 @@ static long madvise_dontneed_free(struct vm_area_struct *vma,
> > > >   * Application wants to free up the pages and associated backing store.
> > > >   * This is effectively punching a hole into the middle of a file.
> > > >   */
> > > > -static long madvise_remove(struct vm_area_struct *vma,
> > > > +static long madvise_remove(struct mm_struct *mm,
> > > > +				struct vm_area_struct *vma,
> > > >  				struct vm_area_struct **prev,
> > > >  				unsigned long start, unsigned long end)
> > > >  {
> > > > @@ -838,13 +843,13 @@ static long madvise_remove(struct vm_area_struct *vma,
> > > >  	get_file(f);
> > > >  	if (userfaultfd_remove(vma, start, end)) {
> > > >  		/* mmap_sem was not released by userfaultfd_remove() */
> > > > -		up_read(&current->mm->mmap_sem);
> > > > +		up_read(&mm->mmap_sem);
> > > >  	}
> > > >  	error = vfs_fallocate(f,
> > > >  				FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
> > > >  				offset, end - start);
> > > >  	fput(f);
> > > > -	down_read(&current->mm->mmap_sem);
> > > > +	down_read(&mm->mmap_sem);
> > > >  	return error;
> > > >  }
> > > >  
> > > > @@ -918,21 +923,23 @@ static int madvise_inject_error(int behavior,
> > > >  #endif
> > > >  
> > > >  static long
> > > > -madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
> > > > +madvise_vma(struct task_struct *task, struct mm_struct *mm,
> > > > +		struct vm_area_struct *vma, struct vm_area_struct **prev,
> > > >  		unsigned long start, unsigned long end, int behavior)
> > > >  {
> > > >  	switch (behavior) {
> > > >  	case MADV_REMOVE:
> > > > -		return madvise_remove(vma, prev, start, end);
> > > > +		return madvise_remove(mm, vma, prev, start, end);
> > > >  	case MADV_WILLNEED:
> > > >  		return madvise_willneed(vma, prev, start, end);
> > > >  	case MADV_COLD:
> > > >  		return madvise_cold(vma, prev, start, end);
> > > >  	case MADV_PAGEOUT:
> > > > -		return madvise_pageout(vma, prev, start, end);
> > > > +		return madvise_pageout(task, vma, prev, start, end);
> > > >  	case MADV_FREE:
> > > >  	case MADV_DONTNEED:
> > > > -		return madvise_dontneed_free(vma, prev, start, end, behavior);
> > > > +		return madvise_dontneed_free(mm, vma, prev, start,
> > > > +						end, behavior);
> > > >  	default:
> > > >  		return madvise_behavior(vma, prev, start, end, behavior);
> > > >  	}
> > > > @@ -976,68 +983,8 @@ madvise_behavior_valid(int behavior)
> > > >  	}
> > > >  }
> > > >  
> > > > -/*
> > > > - * The madvise(2) system call.
> > > > - *
> > > > - * Applications can use madvise() to advise the kernel how it should
> > > > - * handle paging I/O in this VM area.  The idea is to help the kernel
> > > > - * use appropriate read-ahead and caching techniques.  The information
> > > > - * provided is advisory only, and can be safely disregarded by the
> > > > - * kernel without affecting the correct operation of the application.
> > > > - *
> > > > - * behavior values:
> > > > - *  MADV_NORMAL - the default behavior is to read clusters.  This
> > > > - *		results in some read-ahead and read-behind.
> > > > - *  MADV_RANDOM - the system should read the minimum amount of data
> > > > - *		on any access, since it is unlikely that the appli-
> > > > - *		cation will need more than what it asks for.
> > > > - *  MADV_SEQUENTIAL - pages in the given range will probably be accessed
> > > > - *		once, so they can be aggressively read ahead, and
> > > > - *		can be freed soon after they are accessed.
> > > > - *  MADV_WILLNEED - the application is notifying the system to read
> > > > - *		some pages ahead.
> > > > - *  MADV_DONTNEED - the application is finished with the given range,
> > > > - *		so the kernel can free resources associated with it.
> > > > - *  MADV_FREE - the application marks pages in the given range as lazy free,
> > > > - *		where actual purges are postponed until memory pressure happens.
> > > > - *  MADV_REMOVE - the application wants to free up the given range of
> > > > - *		pages and associated backing store.
> > > > - *  MADV_DONTFORK - omit this area from child's address space when forking:
> > > > - *		typically, to avoid COWing pages pinned by get_user_pages().
> > > > - *  MADV_DOFORK - cancel MADV_DONTFORK: no longer omit this area when forking.
> > > > - *  MADV_WIPEONFORK - present the child process with zero-filled memory in this
> > > > - *              range after a fork.
> > > > - *  MADV_KEEPONFORK - undo the effect of MADV_WIPEONFORK
> > > > - *  MADV_HWPOISON - trigger memory error handler as if the given memory range
> > > > - *		were corrupted by unrecoverable hardware memory failure.
> > > > - *  MADV_SOFT_OFFLINE - try to soft-offline the given range of memory.
> > > > - *  MADV_MERGEABLE - the application recommends that KSM try to merge pages in
> > > > - *		this area with pages of identical content from other such areas.
> > > > - *  MADV_UNMERGEABLE- cancel MADV_MERGEABLE: no longer merge pages with others.
> > > > - *  MADV_HUGEPAGE - the application wants to back the given range by transparent
> > > > - *		huge pages in the future. Existing pages might be coalesced and
> > > > - *		new pages might be allocated as THP.
> > > > - *  MADV_NOHUGEPAGE - mark the given range as not worth being backed by
> > > > - *		transparent huge pages so the existing pages will not be
> > > > - *		coalesced into THP and new pages will not be allocated as THP.
> > > > - *  MADV_DONTDUMP - the application wants to prevent pages in the given range
> > > > - *		from being included in its core dump.
> > > > - *  MADV_DODUMP - cancel MADV_DONTDUMP: no longer exclude from core dump.
> > > > - *
> > > > - * return values:
> > > > - *  zero    - success
> > > > - *  -EINVAL - start + len < 0, start is not page-aligned,
> > > > - *		"behavior" is not a valid value, or application
> > > > - *		is attempting to release locked or shared pages,
> > > > - *		or the specified address range includes file, Huge TLB,
> > > > - *		MAP_SHARED or VMPFNMAP range.
> > > > - *  -ENOMEM - addresses in the specified range are not currently
> > > > - *		mapped, or are outside the AS of the process.
> > > > - *  -EIO    - an I/O error occurred while paging in data.
> > > > - *  -EBADF  - map exists, but area maps something that isn't a file.
> > > > - *  -EAGAIN - a kernel resource was temporarily unavailable.
> > > > - */
> > > > -SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
> > > > +static int madvise_core(struct task_struct *task, struct mm_struct *mm,
> > > > +			unsigned long start, size_t len_in, int behavior)
> > > 
> > > Just a minor nitpick, but can we please have it named madvise_common,
> > > not madvise_core? This would follow a usual naming scheme, when some
> > > common functionality is factored out (like, for mutexes, semaphores
> > > etc), and within the kernel "core" usually means something completely
> > > different.
> > 
> > Sure.
> > 
> > > 
> > > >  {
> > > >  	unsigned long end, tmp;
> > > >  	struct vm_area_struct *vma, *prev;
> > > > @@ -1068,15 +1015,16 @@ SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
> > > >  
> > > >  #ifdef CONFIG_MEMORY_FAILURE
> > > >  	if (behavior == MADV_HWPOISON || behavior == MADV_SOFT_OFFLINE)
> > > > -		return madvise_inject_error(behavior, start, start + len_in);
> > > > +		return madvise_inject_error(behavior,
> > > > +					start, start + len_in);
> > > 
> > > Not sure what this change is about except changing the line length.
> > > Note, madvise_inject_error() still operates on "current" through
> > > get_user_pages_fast() and gup_pgd_range(), but that was not changed
> > > here. I Know you've filtered out this hint later, so technically this
> > > is not an issue, but, maybe, this needs some attention too since we've
> > > already spotted it?
> > 
> > It is leftover I had done. I actually modified it to handle remote
> > task but changed my mind not to fix it because process_madvise
> > will not support it at this moment. I'm not sure it's a good idea
> > to change it for *might-be-done-in-future* at this moment even though
> > we have spotted.
> 
> I'd expect to have at least some comments in code on why other hints
> are disabled, so if we already know some shortcomings, this information
> would not be lost.

Okay, I will add some comment but do not want to fix code piece until
someone want to expose the poisoning to external process.

> 
> Of course, I don't care much about memory poisoning, but if it can be
> addressed now, let's address it now.
> 
> > 
> > > 
> > > >  #endif
> > > >  
> > > >  	write = madvise_need_mmap_write(behavior);
> > > >  	if (write) {
> > > > -		if (down_write_killable(&current->mm->mmap_sem))
> > > > +		if (down_write_killable(&mm->mmap_sem))
> > > >  			return -EINTR;
> > > 
> > > Do you still need that trick with mmget_still_valid() here?
> > > Something like:
> > 
> > Since MADV_COLD|PAGEOUT doesn't change address space layout or
> > vma->vm_flags, technically, we don't need it if I understand
> > correctly. Right?
> 
> I'd expect so, yes. But.
> 
> Since we want this interface to be universal and to be able to cover
> various needs, and since my initial intention with working in this
> direction involved KSM, I'd ask you to enable KSM hints too, and once
> (and if) that happens, the work there is done under write lock, and
> you'll need this trick to be applied.
> 
> Of course, I can do that myself later in a subsequent patch series once
> (and, again, if) your series is merged, but, maybe, we can cover this
> already especially given the fact that KSM hinting is a relatively easy
> task in this pile. I did some preliminary tests with it, and so far no
> dragons have started to roar.

Then, do you mind sending a patch based upon this series to expose
MADV_MERGEABLE to process_madvise? It will have the right description
why you want to have such feature which I couldn't provide since I don't
have enough material to write the motivation. And the patch also could
include the logic to prevent coredump race, which is more proper since
finally we need to hold mmap_sem write-side lock, finally.
I will pick it up and will rebase since then.

Thanks.

^ permalink raw reply

* Re: [RFC][PATCH] link.2: AT_ATOMIC_DATA and AT_ATOMIC_METADATA
From: Dave Chinner @ 2019-05-31 23:28 UTC (permalink / raw)
  To: Theodore Ts'o
  Cc: Amir Goldstein, Jan Kara, Darrick J . Wong, Chris Mason, Al Viro,
	linux-fsdevel, linux-xfs, Ext4, Linux Btrfs, Linux API
In-Reply-To: <20190531224549.GF29573@dread.disaster.area>

On Sat, Jun 01, 2019 at 08:45:49AM +1000, Dave Chinner wrote:
> Given that we can already use AIO to provide this sort of ordering,
> and AIO is vastly faster than synchronous IO, I don't see any point
> in adding complex barrier interfaces that can be /easily implemented
> in userspace/ using existing AIO primitives. You should start
> thinking about expanding libaio with stuff like
> "link_after_fdatasync()" and suddenly the whole problem of
> filesystem data vs metadata ordering goes away because the
> application directly controls all ordering without blocking and
> doesn't need to care what the filesystem under it does....

And let me point out that this is also how userspace can do an
efficient atomic rename - rename_after_fdatasync(). i.e. on
completion of the AIO_FSYNC, run the rename. This guarantees that
the application will see either the old file of the complete new
file, and it *doesn't have to wait for the operation to complete*.
Once it is in flight, the file will contain the old data until some
point in the near future when will it contain the new data....

Seriously, sit down and work out all the "atomic" data vs metadata
behaviours you want, and then tell me how many of them cannot be
implemented as "AIO_FSYNC w/ completion callback function" in
userspace. This mechanism /guarantees ordering/ at the application
level, the application does not block waiting for these data
integrity operations to complete, and you don't need any new kernel
side functionality to implement this.

Fundamentally, the assertion that disk cache flushes are not what
causes fsync "to be slow" is incorrect. It's the synchronous
"waiting for IO completion" that makes fsync "slow". AIO_FSYNC
avoids needing to wait for IO completion, allowing the application
to do useful work (like issue more DI ops) while data integrity
operations are in flight. At this point, fsync is no longer a "slow"
operation - it's just another background async data flush operation
like the BDI flusher thread...

Cheers,

Dave.
-- 
Dave Chinner
david@fromorbit.com

^ permalink raw reply

* Re: [RFCv2 6/6] mm: extend process_madvise syscall to support vector arrary
From: Minchan Kim @ 2019-05-31 23:18 UTC (permalink / raw)
  To: Yann Droneaud
  Cc: Andrew Morton, linux-mm, LKML, linux-api, Michal Hocko,
	Johannes Weiner, Tim Murray, Joel Fernandes, Suren Baghdasaryan,
	Daniel Colascione, Shakeel Butt, Sonny Rao, Brian Geffon, jannh,
	oleg, christian, oleksandr, hdanton
In-Reply-To: <2fd5d462449f24b04adad2bbdf0e272647e62247.camel@opteya.com>

Hi Yann,

On Fri, May 31, 2019 at 12:06:52PM +0200, Yann Droneaud wrote:
> Hi,
> 
> Le vendredi 31 mai 2019 à 15:43 +0900, Minchan Kim a écrit :
> > 
> > diff --git a/include/uapi/asm-generic/mman-common.h
> > b/include/uapi/asm-generic/mman-common.h
> > index 92e347a89ddc..220c2b5eb961 100644
> > --- a/include/uapi/asm-generic/mman-common.h
> > +++ b/include/uapi/asm-generic/mman-common.h
> > @@ -75,4 +75,15 @@
> >  #define PKEY_ACCESS_MASK	(PKEY_DISABLE_ACCESS |\
> >  				 PKEY_DISABLE_WRITE)
> >  
> > +struct pr_madvise_param {
> > +	int size;		/* the size of this structure */
> > +	int cookie;		/* reserved to support atomicity */
> > +	int nr_elem;		/* count of below arrary fields */
> 
> Those should be unsigned.
> 
> There's an implicit hole here on ABI with 64bits aligned pointers
> 
> > +	int __user *hints;	/* hints for each range */
> > +	/* to store result of each operation */
> > +	const struct iovec __user *results;
> > +	/* input address ranges */
> > +	const struct iovec __user *ranges;
> 
> Using pointer type in uAPI structure require a 'compat' version of the
> syscall need to be provided.
> 
> If using iovec too.

I will fix them when I submit next revision.
Thanks for the review.

^ permalink raw reply

* Re: [RFCv2 3/6] mm: introduce MADV_PAGEOUT
From: Minchan Kim @ 2019-05-31 23:14 UTC (permalink / raw)
  To: Johannes Weiner
  Cc: Andrew Morton, linux-mm, LKML, linux-api, Michal Hocko,
	Tim Murray, Joel Fernandes, Suren Baghdasaryan, Daniel Colascione,
	Shakeel Butt, Sonny Rao, Brian Geffon, jannh, oleg, christian,
	oleksandr, hdanton
In-Reply-To: <20190531165927.GA20067@cmpxchg.org>

Hey Johannes,

On Fri, May 31, 2019 at 12:59:27PM -0400, Johannes Weiner wrote:
> Hi Michan,
> 
> this looks pretty straight-forward to me, only one kink:
> 
> On Fri, May 31, 2019 at 03:43:10PM +0900, Minchan Kim wrote:
> > --- a/mm/vmscan.c
> > +++ b/mm/vmscan.c
> > @@ -2126,6 +2126,83 @@ static void shrink_active_list(unsigned long nr_to_scan,
> >  			nr_deactivate, nr_rotated, sc->priority, file);
> >  }
> >  
> > +unsigned long reclaim_pages(struct list_head *page_list)
> > +{
> > +	int nid = -1;
> > +	unsigned long nr_isolated[2] = {0, };
> > +	unsigned long nr_reclaimed = 0;
> > +	LIST_HEAD(node_page_list);
> > +	struct reclaim_stat dummy_stat;
> > +	struct scan_control sc = {
> > +		.gfp_mask = GFP_KERNEL,
> > +		.priority = DEF_PRIORITY,
> > +		.may_writepage = 1,
> > +		.may_unmap = 1,
> > +		.may_swap = 1,
> > +	};
> > +
> > +	while (!list_empty(page_list)) {
> > +		struct page *page;
> > +
> > +		page = lru_to_page(page_list);
> > +		if (nid == -1) {
> > +			nid = page_to_nid(page);
> > +			INIT_LIST_HEAD(&node_page_list);
> > +			nr_isolated[0] = nr_isolated[1] = 0;
> > +		}
> > +
> > +		if (nid == page_to_nid(page)) {
> > +			list_move(&page->lru, &node_page_list);
> > +			nr_isolated[!!page_is_file_cache(page)] +=
> > +						hpage_nr_pages(page);
> > +			continue;
> > +		}
> > +
> > +		mod_node_page_state(NODE_DATA(nid), NR_ISOLATED_ANON,
> > +					nr_isolated[0]);
> > +		mod_node_page_state(NODE_DATA(nid), NR_ISOLATED_FILE,
> > +					nr_isolated[1]);
> > +		nr_reclaimed += shrink_page_list(&node_page_list,
> > +				NODE_DATA(nid), &sc, TTU_IGNORE_ACCESS,
> > +				&dummy_stat, true);
> > +		while (!list_empty(&node_page_list)) {
> > +			struct page *page = lru_to_page(&node_page_list);
> > +
> > +			list_del(&page->lru);
> > +			putback_lru_page(page);
> > +		}
> > +		mod_node_page_state(NODE_DATA(nid), NR_ISOLATED_ANON,
> > +					-nr_isolated[0]);
> > +		mod_node_page_state(NODE_DATA(nid), NR_ISOLATED_FILE,
> > +					-nr_isolated[1]);
> > +		nid = -1;
> > +	}
> > +
> > +	if (!list_empty(&node_page_list)) {
> > +		mod_node_page_state(NODE_DATA(nid), NR_ISOLATED_ANON,
> > +					nr_isolated[0]);
> > +		mod_node_page_state(NODE_DATA(nid), NR_ISOLATED_FILE,
> > +					nr_isolated[1]);
> > +		nr_reclaimed += shrink_page_list(&node_page_list,
> > +				NODE_DATA(nid), &sc, TTU_IGNORE_ACCESS,
> > +				&dummy_stat, true);
> > +		mod_node_page_state(NODE_DATA(nid), NR_ISOLATED_ANON,
> > +					-nr_isolated[0]);
> > +		mod_node_page_state(NODE_DATA(nid), NR_ISOLATED_FILE,
> > +					-nr_isolated[1]);
> > +
> > +		while (!list_empty(&node_page_list)) {
> > +			struct page *page = lru_to_page(&node_page_list);
> > +
> > +			list_del(&page->lru);
> > +			putback_lru_page(page);
> > +		}
> > +
> > +	}
> 
> The NR_ISOLATED accounting, nid parsing etc. is really awkward and
> makes it hard to see what the function actually does.
> 
> Can you please make those ISOLATED counters part of the isolation API?
> Your patch really shows this is an overdue cleanup.

Yeah, that was very painful.

> 
> These are fast local percpu counters, we don't need the sprawling
> batching we do all over vmscan.c, migrate.c, khugepaged.c,
> compaction.c etc. Isolation can increase the counter page by page, and
> reclaim or putback can likewise decrease them one by one.
> 
> It looks like mlock is the only user of the isolation api that does
> not participate in the NR_ISOLATED_* counters protocol, but I don't
> see why it wouldn't, or why doing so would hurt.
> 
> There are also seem to be quite a few callsites that use the atomic
> versions of the counter API when they're clearly under the irqsafe
> lru_lock. That would be fixed automatically by this work as well.

I agree all points so will prepare clean up patch.

^ permalink raw reply

* Re: [RFC][PATCH] link.2: AT_ATOMIC_DATA and AT_ATOMIC_METADATA
From: Dave Chinner @ 2019-05-31 22:45 UTC (permalink / raw)
  To: Theodore Ts'o
  Cc: Amir Goldstein, Jan Kara, Darrick J . Wong, Chris Mason, Al Viro,
	linux-fsdevel, linux-xfs, Ext4, Linux Btrfs, Linux API
In-Reply-To: <20190531164136.GA3066@mit.edu>

On Fri, May 31, 2019 at 12:41:36PM -0400, Theodore Ts'o wrote:
> On Fri, May 31, 2019 at 06:21:45PM +0300, Amir Goldstein wrote:
> > What do you think of:
> > 
> > "AT_ATOMIC_DATA (since Linux 5.x)
> > A filesystem which accepts this flag will guarantee that if the linked file
> > name exists after a system crash, then all of the data written to the file
> > and all of the file's metadata at the time of the linkat(2) call will be
> > visible.
> 
> ".... will be visible after the the file system is remounted".  (Never
> hurts to be explicit.)
> 
> > The way to achieve this guarantee on old kernels is to call fsync (2)
> > before linking the file, but doing so will also results in flushing of
> > volatile disk caches.
> >
> > A filesystem which accepts this flag does NOT
> > guarantee that any of the file hardlinks will exist after a system crash,
> > nor that the last observed value of st_nlink (see stat (2)) will persist."
> > 
> 
> This is I think more precise:
> 
>     This guarantee can be achieved by calling fsync(2) before linking
>     the file, but there may be more performant ways to provide these
>     semantics.  In particular, note that the use of the AT_ATOMIC_DATA
>     flag does *not* guarantee that the new link created by linkat(2)
>     will be persisted after a crash.

So here's the *implementation* problem I see with this definition of
AT_ATOMIC_DATA. After linkat(dirfd, name, AT_ATOMIC_DATA), there is
no guarantee that the data is on disk or that the link is present.

However:

	linkat(dirfd, name, AT_ATOMIC_DATA);
	fsync(dirfd);

Suddenly changes all that.

i.e. when we fsync(dirfd) we guarantee that "name" is present in the
directory and because we used AT_ATOMIC_DATA it implies that the
data pointed to by "name" must be present on disk. IOWs, what was
once a pure directory sync operation now *must* fsync all the child
inodes that have been linkat(AT_ATOMIC_DATA) since the last time the
direct has been made stable. 

IOWs, the described AT_ATOMIC_DATA "we don't have to write the data
during linkat() go-fast-get-out-of-gaol-free" behaviour isn't worth
the pixels it is written on - it just moves all the complexity to
directory fsync, and that's /already/ a behavioural minefield.

IMO, the "work-around" of forcing filesystems to write back
destination inodes during a link() operation is just nasty and will
just end up with really weird performance anomalies occurring in
production systems. That's not really a solution, either, especially
as it is far, far faster for applications to use AIO_FSYNC and then
on the completion callback run a normal linkat() operation...

Hence, if I've understood these correctly, then I'll be recommending
that XFS follows this:

> We should also document that a file system which does not implement
> this flag MUST return EINVAL if it is passed this flag to linkat(2).

and returns -EINVAL to these flags because we do not have the change
tracking infrastructure to handle these directory fsync semantics.
I also suspect that, even if we could track this efficiently, we
can't do the flushing atomically because of locking order
constraints between directories, regular files, pages in the page
cache, etc.

Given that we can already use AIO to provide this sort of ordering,
and AIO is vastly faster than synchronous IO, I don't see any point
in adding complex barrier interfaces that can be /easily implemented
in userspace/ using existing AIO primitives. You should start
thinking about expanding libaio with stuff like
"link_after_fdatasync()" and suddenly the whole problem of
filesystem data vs metadata ordering goes away because the
application directly controls all ordering without blocking and
doesn't need to care what the filesystem under it does....

Cheers,

Dave.
-- 
Dave Chinner
david@fromorbit.com

^ permalink raw reply

* Re: [PATCH v1 1/2] fork: add clone3
From: Christian Brauner @ 2019-05-31 22:13 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Andrei Vagin, Al Viro, Linux List Kernel Mailing, Jann Horn,
	Florian Weimer, Oleg Nesterov, Arnd Bergmann, David Howells,
	Pavel Emelyanov, Andrew Morton, Adrian Reber, Linux API
In-Reply-To: <CAHk-=whQP-Ykxi=zSYaV9iXsHsENa+2fdj-zYKwyeyed63Lsfw@mail.gmail.com>

On Fri, May 31, 2019 at 01:38:29PM -0700, Linus Torvalds wrote:
> On Wed, May 29, 2019 at 3:24 PM Andrei Vagin <avagin@gmail.com> wrote:
> >
> > Thank you for thinking about time namespaces. I looked at this patch
> > quickly and I would suggest to move a termination signal out of flags. I
> > think we can add a separate field (exit_signal) into clone_args. Does it
> > make sense? For me, exit_signal in flags always looked weird...
> 
> I agree - the child signal in flags was always just a "it fits" kind
> of thing, and that was obviously true back then, but is not true now.

(Traveling until Monday, so sorry for delayed responses.)

Yip, I agree that this is a good idea (answered Andrei's mail just now
saying the same thing). I'll send out a new version of the patch with
these changes added next week.

Christian

^ permalink raw reply

* Re: [PATCH v1 1/2] fork: add clone3
From: Christian Brauner @ 2019-05-31 22:08 UTC (permalink / raw)
  To: Andrei Vagin
  Cc: viro, linux-kernel, torvalds, jannh, fweimer, oleg, arnd,
	dhowells, Pavel Emelyanov, Andrew Morton, Adrian Reber, linux-api
In-Reply-To: <20190529222414.GA6492@gmail.com>

On Wed, May 29, 2019 at 03:24:15PM -0700, Andrei Vagin wrote:
> On Wed, May 29, 2019 at 05:22:36PM +0200, Christian Brauner wrote:
> > This adds the clone3 system call.
> > 
> > As mentioned several times already (cf. [7], [8]) here's the promised
> > patchset for clone3().
> > 
> > We recently merged the CLONE_PIDFD patchset (cf. [1]). It took the last
> > free flag from clone().
> > 
> > Independent of the CLONE_PIDFD patchset a time namespace has been discussed
> > at Linux Plumber Conference last year and has been sent out and reviewed
> > (cf. [5]). It is expected that it will go upstream in the not too distant
> > future. However, it relies on the addition of the CLONE_NEWTIME flag to
> > clone(). The only other good candidate - CLONE_DETACHED - is currently not
> > recyclable as we have identified at least two large or widely used
> > codebases that currently pass this flag (cf. [2], [3], and [4]). Given that
> > CLONE_PIDFD grabbed the last clone() flag the time namespace is effectively
> > blocked. clone3() has the advantage that it will unblock this patchset
> > again.
> 
> Hi Christian,

Hi Andrei,

(Traveling until Monday, so sorry for delayed responses.)

> 
> Thank you for thinking about time namespaces. I looked at this patch
> quickly and I would suggest to move a termination signal out of flags. I
> think we can add a separate field (exit_signal) into clone_args. Does it
> make sense? For me, exit_signal in flags always looked weird...

Yup, that does sound good to me.

> 
> I will look at this patch more detailed later this week. Thanks.

Excellent!

Christian

^ permalink raw reply

* Re: [PATCH v1 1/2] fork: add clone3
From: Christian Brauner @ 2019-05-31 22:08 UTC (permalink / raw)
  To: Yann Droneaud
  Cc: viro, linux-kernel, torvalds, jannh, fweimer, oleg, arnd,
	dhowells, Pavel Emelyanov, Andrew Morton, Adrian Reber,
	Andrei Vagin, linux-api
In-Reply-To: <1058006e0df4b52b3e53c7b3202c04140899aeb5.camel@opteya.com>

On Wed, May 29, 2019 at 05:42:14PM +0200, Yann Droneaud wrote:
> Le mercredi 29 mai 2019 à 17:22 +0200, Christian Brauner a écrit :
> > This adds the clone3 system call.
> > 
> > 
> > diff --git a/kernel/fork.c b/kernel/fork.c
> > index b4cba953040a..6bc3e3d17150 100644
> > --- a/kernel/fork.c
> > +++ b/kernel/fork.c
> > @@ -2472,7 +2475,96 @@ SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp,
> >  		 unsigned long, tls)
> >  #endif
> >  {
> > -	return _do_fork(clone_flags, newsp, 0, parent_tidptr, child_tidptr, tls);
> > +	struct kernel_clone_args args = {
> > +		.flags = clone_flags,
> > +		.stack = newsp,
> > +		.pidfd = parent_tidptr,
> > +		.parent_tidptr = parent_tidptr,
> > +		.tls = tls,
> > +		.child_tidptr = child_tidptr,
> > +	};
> > +
> > +	/* clone(CLONE_PIDFD) uses parent_tidptr to return a pidfd */
> > +	if ((clone_flags & CLONE_PIDFD) && (clone_flags & CLONE_PARENT_SETTID))
> > +		return -EINVAL;
> > +
> > +	return _do_fork(&args);
> > +}
> > +
> > +static bool clone3_flags_valid(u64 flags)
> > +{
> > +	if (flags & CLONE_DETACHED)
> > +		return false;
> > +
> > +	if (flags & ~CLONE_MAX)
> > +		return false;
> > +
> > +	return true;
> > +}
> > +
> > +static int copy_clone_args_from_user(struct kernel_clone_args *kargs,
> > +				     struct clone_args __user *uargs,
> > +				     size_t size)
> > +{
> > +	struct clone_args args;
> > +
> > +	if (unlikely(size > PAGE_SIZE))
> > +		return -E2BIG;
> > +
> > +	if (unlikely(size < sizeof(struct clone_args)))
> > +		return -EINVAL;
> > +
> > +	if (unlikely(!access_ok(uargs, size)))
> > +		return -EFAULT;
> > +
> > +	if (size > sizeof(struct clone_args)) {
> > +		unsigned char __user *addr;
> > +		unsigned char __user *end;
> > +		unsigned char val;
> > +
> > +		addr = (void __user *)uargs + sizeof(struct clone_args);
> > +		end = (void __user *)uargs + size;
> > +
> > +		for (; addr < end; addr++) {
> > +			if (get_user(val, addr))
> > +				return -EFAULT;
> > +			if (val)
> > +				return -E2BIG;
> 
> Should be -EINVAL: having something after the structure should be
> handled just like an invalid flags, while still allowing future
> userspace program to probe for support for newer feature.

(Traveling until Monday, so sorry for delayed responses.)

This copies what:

kernel/sched/core.c:sched_copy_attr()
kernel/event/core.c:perf_copy_attr()

are already doing. Consistency might be good here but, I think.

Christian

^ permalink raw reply

* Re: [PATCH v1 1/2] fork: add clone3
From: Linus Torvalds @ 2019-05-31 20:38 UTC (permalink / raw)
  To: Andrei Vagin
  Cc: Christian Brauner, Al Viro, Linux List Kernel Mailing, Jann Horn,
	Florian Weimer, Oleg Nesterov, Arnd Bergmann, David Howells,
	Pavel Emelyanov, Andrew Morton, Adrian Reber, Linux API
In-Reply-To: <20190529222414.GA6492@gmail.com>

On Wed, May 29, 2019 at 3:24 PM Andrei Vagin <avagin@gmail.com> wrote:
>
> Thank you for thinking about time namespaces. I looked at this patch
> quickly and I would suggest to move a termination signal out of flags. I
> think we can add a separate field (exit_signal) into clone_args. Does it
> make sense? For me, exit_signal in flags always looked weird...

I agree - the child signal in flags was always just a "it fits" kind
of thing, and that was obviously true back then, but is not true now.

In fact, if we move it out of flags, we'd open up new flag values for
the new interface, in that now the low 8 bits are freed up for more
useful things.

In fact, even for the old ones, we might just say that instead of the
full 8-bit CSIGNAL field, nobody ever *used* more than 6 bits, because
_NSIG is 64, and we've never actually had named signals > 31.

(Yeah, yeah, somebody could use signal 64, and yes, mips has _NSIG
128, but realistically we could get two new clone signals in the old
interface and I think nobody would even notice).

In fact, all of the CSIGNAL bits are ignored if CLONE_THREAD or
CLONE_PARENT is set.

                   Linus

^ permalink raw reply

* Re: [PATCH 3/5] asm-generic: Register fchmodat4 as syscall 428
From: Arnd Bergmann @ 2019-05-31 19:56 UTC (permalink / raw)
  To: Palmer Dabbelt
  Cc: Al Viro, Linux Kernel Mailing List, Linux FS-devel Mailing List,
	Linux API, linux-arch, the arch/x86 maintainers, Andy Lutomirski,
	Thomas Gleixner, Ingo Molnar, Borislav Petkov, H. Peter Anvin
In-Reply-To: <20190531191204.4044-4-palmer@sifive.com>

On Fri, May 31, 2019 at 9:23 PM Palmer Dabbelt <palmer@sifive.com> wrote:
>
> Signed-off-by: Palmer Dabbelt <palmer@sifive.com>

As usual, each patch needs a changelog text. I would prefer having a single
patch here that changes /all/ system call tables at once, rather than doing one
at a time like we used to.

In linux-next, we are at number 434 now, and there are a couple of other
new system calls being proposed right now (more than usual), so you may
have to change the number a few times.

Note: most architectures use .tbl files now, the exceptions are
include/uapi/asm-generic/unistd.h and arch/arm64/include/asm/unistd32.h,
and the latter also requires changing __NR_compat_syscalls in asm/unistd.h.

Numbers should now be the same across architectures, except for alpha,
which has a +110 offset. We have discussed ways to have a single
file to modify for a new call on all architectures, but no patches yet.

     Arnd

^ permalink raw reply

* Re: [PATCH 2/5] Add fchmodat4(), a new syscall
From: Arnd Bergmann @ 2019-05-31 19:51 UTC (permalink / raw)
  To: Palmer Dabbelt
  Cc: Al Viro, Linux Kernel Mailing List, Linux FS-devel Mailing List,
	Linux API, linux-arch, the arch/x86 maintainers, Andy Lutomirski,
	Thomas Gleixner, Ingo Molnar, Borislav Petkov, H. Peter Anvin
In-Reply-To: <20190531191204.4044-3-palmer@sifive.com>

On Fri, May 31, 2019 at 9:23 PM Palmer Dabbelt <palmer@sifive.com> wrote:
>
> man 3p says that fchmodat() takes a flags argument, but the Linux
> syscall does not.  There doesn't appear to be a good userspace
> workaround for this issue but the implementation in the kernel is pretty
> straight-forward.  The specific use case where the missing flags came up
> was WRT a fuse filesystem implemenation, but the functionality is pretty
> generic so I'm assuming there would be other use cases.
>
> Signed-off-by: Palmer Dabbelt <palmer@sifive.com>
> ---
>  fs/open.c                | 21 +++++++++++++++++++--
>  include/linux/syscalls.h |  5 +++++
>  2 files changed, 24 insertions(+), 2 deletions(-)
>
> diff --git a/fs/open.c b/fs/open.c
> index a00350018a47..cfad7684e8d3 100644
> --- a/fs/open.c
> +++ b/fs/open.c
> @@ -568,11 +568,17 @@ SYSCALL_DEFINE2(fchmod, unsigned int, fd, umode_t, mode)
>         return ksys_fchmod(fd, mode);
>  }
>
> -int do_fchmodat(int dfd, const char __user *filename, umode_t mode)
> +int do_fchmodat4(int dfd, const char __user *filename, umode_t mode, int flags)
...
> +
> +int do_fchmodat(int dfd, const char __user *filename, umode_t mode)
> +{
> +       return do_fchmodat4(dfd, filename, mode, 0);
> +}
> +

There is only one external caller of do_fchmodat(), so just change that
to pass the extra argument here, and keep a single do_fchmodat()
function used by the sys_fchmod(), sys_fchmod4(), sys_chmod()
and ksys_chmod().

        Arnd

^ permalink raw reply

* Re: [RFC][PATCH] link.2: AT_ATOMIC_DATA and AT_ATOMIC_METADATA
From: Theodore Ts'o @ 2019-05-31 19:21 UTC (permalink / raw)
  To: Amir Goldstein
  Cc: Jan Kara, Darrick J . Wong, Dave Chinner, Chris Mason, Al Viro,
	linux-fsdevel, linux-xfs, Ext4, Linux Btrfs, Linux API
In-Reply-To: <CAOQ4uxjp5psDBLXBu+26xRLpV50txqksVFe6ZhUo0io8kgoH4A@mail.gmail.com>

On Fri, May 31, 2019 at 08:22:06PM +0300, Amir Goldstein wrote:
> >
> > This is I think more precise:
> >
> >     This guarantee can be achieved by calling fsync(2) before linking
> >     the file, but there may be more performant ways to provide these
> >     semantics.  In particular, note that the use of the AT_ATOMIC_DATA
> >     flag does *not* guarantee that the new link created by linkat(2)
> >     will be persisted after a crash.
> 
> OK. Just to be clear, mentioning hardlinks and st_link is not needed
> in your opinion?

Your previous text stated that it was undefined what would happen to
all hardlinks belonging to the file, and that would imply that if a
file had N hard links, some in the directory which we are modifying,
and some in other directories, that somehow any of them might not be
present after the crash.  And that's not the case.  Suppose the file
currently has hardlinks test1/foo, test1/quux, and test2/baz --- and
we've called syncfs(2) on the file system so everything is persisted,
and then linkat(2) is used to create a new hardlink, test1/bar.

After a crash, the existence of test1/foo, test1/quux, and test2/baz
are not in question.  It's only unclear whether or not test1/bar
exists after the crash.

As far as st_nlink is concerned, the presumption is that the file
system itself will be consistent after the crash.  So if the hard link
has been persisted, then st_nlink will be incremented, if it has not,
it won't be.

Finally, one thing which gets hard about trying to state these sorts
of things as guarantees.  Sometimes, the file system won't *know*
whether or not it can make these guarantees.  For example what should
we do if the file system is mounted with nobarrier?  If the overall
hardware design includes UPS's or some other kind of battery backup,
the guarantee may very well exist.  But the file system code can't
know whether or not that is the case.  So my inclination is to allow
the file system to accept the flag even if the mount option nobarrier
is in play --- but in that case, the guarantee is only if the rest of
the system is designed appropriately.

(For that matter, it used to be that there existed hard drives that
lied about whether they had a writeback cache, and/or made the CACHE
FLUSH a no-op so they could win the Winbench benchmarketing wars,
which was worth millions and millions of dollars in sales.  So we can
only assume that the hardware isn't lying to us when we use words like
"guarantee".)

						- Ted

^ permalink raw reply

* [PATCH 5/5] x86: Add fchmod4 to syscall_32.tbl
From: Palmer Dabbelt @ 2019-05-31 19:12 UTC (permalink / raw)
  To: viro, linux-kernel, linux-fsdevel, linux-api
  Cc: linux-arch, x86, luto, tglx, mingo, bp, hpa, Arnd Bergmann,
	Palmer Dabbelt
In-Reply-To: <20190531191204.4044-1-palmer@sifive.com>

Signed-off-by: Palmer Dabbelt <palmer@sifive.com>
---
 arch/x86/entry/syscalls/syscall_32.tbl | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
index 1f9607ed087c..319c7a6d3f02 100644
--- a/arch/x86/entry/syscalls/syscall_32.tbl
+++ b/arch/x86/entry/syscalls/syscall_32.tbl
@@ -433,3 +433,4 @@
 425	i386	io_uring_setup		sys_io_uring_setup		__ia32_sys_io_uring_setup
 426	i386	io_uring_enter		sys_io_uring_enter		__ia32_sys_io_uring_enter
 427	i386	io_uring_register	sys_io_uring_register		__ia32_sys_io_uring_register
+428	i386	fchmodat4		sys_fchmodat4			__ia32_sys_fchmodat4
-- 
2.21.0

^ permalink raw reply related

* [PATCH 4/5] x86: Add fchmodat4 to syscall_64.tbl
From: Palmer Dabbelt @ 2019-05-31 19:12 UTC (permalink / raw)
  To: viro, linux-kernel, linux-fsdevel, linux-api
  Cc: linux-arch, x86, luto, tglx, mingo, bp, hpa, Arnd Bergmann,
	Palmer Dabbelt
In-Reply-To: <20190531191204.4044-1-palmer@sifive.com>

Signed-off-by: Palmer Dabbelt <palmer@sifive.com>
---
 arch/x86/entry/syscalls/syscall_64.tbl | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
index 92ee0b4378d4..998aa3eb09e2 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -349,6 +349,7 @@
 425	common	io_uring_setup		__x64_sys_io_uring_setup
 426	common	io_uring_enter		__x64_sys_io_uring_enter
 427	common	io_uring_register	__x64_sys_io_uring_register
+428	common	fchmodat4		__x64_sys_fchmodat4
 
 #
 # x32-specific system call numbers start at 512 to avoid cache impact
-- 
2.21.0

^ permalink raw reply related

* [PATCH 3/5] asm-generic: Register fchmodat4 as syscall 428
From: Palmer Dabbelt @ 2019-05-31 19:12 UTC (permalink / raw)
  To: viro, linux-kernel, linux-fsdevel, linux-api
  Cc: linux-arch, x86, luto, tglx, mingo, bp, hpa, Arnd Bergmann,
	Palmer Dabbelt
In-Reply-To: <20190531191204.4044-1-palmer@sifive.com>

Signed-off-by: Palmer Dabbelt <palmer@sifive.com>
---
 include/uapi/asm-generic/unistd.h | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h
index dee7292e1df6..f0f4cad4c416 100644
--- a/include/uapi/asm-generic/unistd.h
+++ b/include/uapi/asm-generic/unistd.h
@@ -833,8 +833,11 @@ __SYSCALL(__NR_io_uring_enter, sys_io_uring_enter)
 #define __NR_io_uring_register 427
 __SYSCALL(__NR_io_uring_register, sys_io_uring_register)
 
+#define __NR_fchmodat4 428
+__SYSCALL(__NR_fchmodat4, sys_fchmodat4)
+
 #undef __NR_syscalls
-#define __NR_syscalls 428
+#define __NR_syscalls 429
 
 /*
  * 32 bit systems traditionally used different
-- 
2.21.0

^ permalink raw reply related

* [PATCH 2/5] Add fchmodat4(), a new syscall
From: Palmer Dabbelt @ 2019-05-31 19:12 UTC (permalink / raw)
  To: viro, linux-kernel, linux-fsdevel, linux-api
  Cc: linux-arch, x86, luto, tglx, mingo, bp, hpa, Arnd Bergmann,
	Palmer Dabbelt
In-Reply-To: <20190531191204.4044-1-palmer@sifive.com>

man 3p says that fchmodat() takes a flags argument, but the Linux
syscall does not.  There doesn't appear to be a good userspace
workaround for this issue but the implementation in the kernel is pretty
straight-forward.  The specific use case where the missing flags came up
was WRT a fuse filesystem implemenation, but the functionality is pretty
generic so I'm assuming there would be other use cases.

Signed-off-by: Palmer Dabbelt <palmer@sifive.com>
---
 fs/open.c                | 21 +++++++++++++++++++--
 include/linux/syscalls.h |  5 +++++
 2 files changed, 24 insertions(+), 2 deletions(-)

diff --git a/fs/open.c b/fs/open.c
index a00350018a47..cfad7684e8d3 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -568,11 +568,17 @@ SYSCALL_DEFINE2(fchmod, unsigned int, fd, umode_t, mode)
 	return ksys_fchmod(fd, mode);
 }
 
-int do_fchmodat(int dfd, const char __user *filename, umode_t mode)
+int do_fchmodat4(int dfd, const char __user *filename, umode_t mode, int flags)
 {
 	struct path path;
 	int error;
-	unsigned int lookup_flags = LOOKUP_FOLLOW;
+	unsigned int lookup_flags;
+
+	if (unlikely(flags & ~AT_SYMLINK_NOFOLLOW))
+		return -EINVAL;
+
+	lookup_flags = flags & AT_SYMLINK_NOFOLLOW ? 0 : LOOKUP_FOLLOW;
+
 retry:
 	error = user_path_at(dfd, filename, lookup_flags, &path);
 	if (!error) {
@@ -586,6 +592,17 @@ int do_fchmodat(int dfd, const char __user *filename, umode_t mode)
 	return error;
 }
 
+SYSCALL_DEFINE4(fchmodat4, int, dfd, const char __user *, filename,
+		umode_t, mode, int, flags)
+{
+	return do_fchmodat4(dfd, filename, mode, flags);
+}
+
+int do_fchmodat(int dfd, const char __user *filename, umode_t mode)
+{
+	return do_fchmodat4(dfd, filename, mode, 0);
+}
+
 SYSCALL_DEFINE3(fchmodat, int, dfd, const char __user *, filename,
 		umode_t, mode)
 {
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 396871b218f4..cb040a412a4c 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -435,6 +435,8 @@ asmlinkage long sys_chroot(const char __user *filename);
 asmlinkage long sys_fchmod(unsigned int fd, umode_t mode);
 asmlinkage long sys_fchmodat(int dfd, const char __user *filename,
 			     umode_t mode);
+asmlinkage long sys_fchmodat4(int dfd, const char __user *filename,
+			     umode_t mode, int flags);
 asmlinkage long sys_fchownat(int dfd, const char __user *filename, uid_t user,
 			     gid_t group, int flag);
 asmlinkage long sys_fchown(unsigned int fd, uid_t user, gid_t group);
@@ -1315,6 +1317,9 @@ static inline long ksys_link(const char __user *oldname,
 
 extern int do_fchmodat(int dfd, const char __user *filename, umode_t mode);
 
+extern int do_fchmodat4(int dfd, const char __user *filename, umode_t mode,
+			int flags);
+
 static inline int ksys_chmod(const char __user *filename, umode_t mode)
 {
 	return do_fchmodat(AT_FDCWD, filename, mode);
-- 
2.21.0

^ permalink raw reply related

* [PATCH 1/5] Non-functional cleanup of a "__user * filename"
From: Palmer Dabbelt @ 2019-05-31 19:12 UTC (permalink / raw)
  To: viro, linux-kernel, linux-fsdevel, linux-api
  Cc: linux-arch, x86, luto, tglx, mingo, bp, hpa, Arnd Bergmann,
	Palmer Dabbelt
In-Reply-To: <20190531191204.4044-1-palmer@sifive.com>

The next patch defines a very similar interface, which I copied from
this definition.  Since I'm touching it anyway I don't see any reason
not to just go fix this one up.

Signed-off-by: Palmer Dabbelt <palmer@sifive.com>
---
 include/linux/syscalls.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index e446806a561f..396871b218f4 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -433,7 +433,7 @@ asmlinkage long sys_chdir(const char __user *filename);
 asmlinkage long sys_fchdir(unsigned int fd);
 asmlinkage long sys_chroot(const char __user *filename);
 asmlinkage long sys_fchmod(unsigned int fd, umode_t mode);
-asmlinkage long sys_fchmodat(int dfd, const char __user * filename,
+asmlinkage long sys_fchmodat(int dfd, const char __user *filename,
 			     umode_t mode);
 asmlinkage long sys_fchownat(int dfd, const char __user *filename, uid_t user,
 			     gid_t group, int flag);
-- 
2.21.0

^ permalink raw reply related

* Add a new fchmodat4() syscall
From: Palmer Dabbelt @ 2019-05-31 19:11 UTC (permalink / raw)
  To: viro, linux-kernel, linux-fsdevel, linux-api
  Cc: linux-arch, x86, luto, tglx, mingo, bp, hpa, Arnd Bergmann

I spent half of dinner last night being complained to by one of our
hardware engineers about Linux's lack of support for the flags argument
to fchmodat().  This all came about because of a FUSE filesystem
implementation, and while there are some application-specific
workarounds for the issue it seemed to me like the cleanest bet was to
just go add another fchmodat() that supports flags to the kernel.

The actual implementation is super simple: essentially it's just
the same as fchmodat(), but LOOKUP_FOLLOW is conditionally set based on
the flags.  I've attempted to make this match "man 2 fchmodat" as
closely as possible, which says EINVAL is returned for invalid flags (as
opposed to ENOTSUPP, which is currently returned by glibc for
AT_SYMLINK_NOFOLLOW).  I have a sketch of a glibc patch that I haven't
even compiled yet, but seems fairly straight-forward:

    diff --git a/sysdeps/unix/sysv/linux/fchmodat.c b/sysdeps/unix/sysv/linux/fchmodat.c
    index 6d9cbc1ce9e0..b1beab76d56c 100644
    --- a/sysdeps/unix/sysv/linux/fchmodat.c
    +++ b/sysdeps/unix/sysv/linux/fchmodat.c
    @@ -29,12 +29,36 @@
     int
     fchmodat (int fd, const char *file, mode_t mode, int flag)
     {
    -  if (flag & ~AT_SYMLINK_NOFOLLOW)
    -    return INLINE_SYSCALL_ERROR_RETURN_VALUE (EINVAL);
    -#ifndef __NR_lchmod		/* Linux so far has no lchmod syscall.  */
    +  /* There are four paths through this code:
    +      - The flags are zero.  In this case it's fine to call fchmodat.
    +      - The flags are non-zero and glibc doesn't have access to
    +	__NR_fchmodat4.  In this case all we can do is emulate the error codes
    +	defined by the glibc interface from userspace.
    +      - The flags are non-zero, glibc has __NR_fchmodat4, and the kernel has
    +	fchmodat4.  This is the simplest case, as the fchmodat4 syscall exactly
    +	matches glibc's library interface so it can be called directly.
    +      - The flags are non-zero, glibc has __NR_fchmodat4, but the kernel does
    +	not.  In this case we must respect the error codes defined by the glibc
    +	interface instead of returning ENOSYS.
    +    The intent here is to ensure that the kernel is called at most once per
    +    library call, and that the error types defined by glibc are always
    +    respected.  */
    +
    +#ifdef __NR_fchmodat4
    +  long result;
    +#endif
    +
    +  if (flag == 0)
    +    return INLINE_SYSCALL (fchmodat, 3, fd, file, mode);
    +
    +#ifdef __NR_fchmodat4
    +  result = INLINE_SYSCALL (fchmodat4, 4, fd, file, mode, flag);
    +  if (result == 0 || errno != ENOSYS)
    +    return result;
    +#endif
    +
       if (flag & AT_SYMLINK_NOFOLLOW)
         return INLINE_SYSCALL_ERROR_RETURN_VALUE (ENOTSUP);
    -#endif
     
    -  return INLINE_SYSCALL (fchmodat, 3, fd, file, mode);
    +  return INLINE_SYSCALL_ERROR_RETURN_VALUE (EINVAL);
     }

I've never added a new syscall before so I'm not really sure what the
proper procedure to follow is.  I'm assuming any new syscall will
involve fairly significant discussion, so I've just done the minimum of
an implementation for this patch set.  Specifically, I've:

* Defined a new syscall that looks like fchmodat but includes a flag
  argument, which I'm calling fchmodat4 because it has 4 arguments.  I
  don't know if that's the correct naming convention, and don't really
  have any skin in that game.
* Implemented that syscall by extending the fchmod code to handle flags,
  which is pretty straight-forward.  I think it's sane, but given that
  it's so simple I'm not sure if I'm missing something -- specifically,
  I didn't go check to make sure the semantics of AT_SYMLINK_NOFOLLOW
  match !LOOKUP_FOLLOW.  I'm assuming the do, but sometimes when I look
  at something and say "that's so simple, how is it broken" I'm actually
  just missing something entirely.
* Added an asm-generic syscall number for this, which I assume I'm
  supposed to do this first as it looks like we're trying to keep the
  numbers in sync everywhere.
* Added x86 syscalls for this so I could test it.

I also cleaned up a checkpatch issue in fchmodat().  I only found this
because I copied the fchmodat() interface for fchmodat4() and it threw
the warning, I don't personally care either way as to whether or not the
space is in there.

I've given this fairly minimal testing.  Essentially all I've done is
booted up 5.1.6 with this patch set on my local development box and run

    $ touch test-file
    $ ln -s test-file test-link
    $ cat > test.c
    #include <fcntl.h>
    #include <stdio.h>
    #include <unistd.h>
    
    int main(int argc, char **argv)
    {
            long out;
    
            out = syscall(428, AT_FDCWD, "test-file", 0x888, AT_SYMLINK_NOFOLLOW);
            printf("fchmodat4(AT_FDCWD, \"test-file\", 0x888, AT_SYMLINK_NOFOLLOW): %ld\n", out);
    
            out = syscall(428, AT_FDCWD, "test-file", 0x888, 0);
            printf("fchmodat4(AT_FDCWD, \"test-file\", 0x888, 0): %ld\n", out);
    
            out = syscall(268, AT_FDCWD, "test-file", 0x888);
            printf("fchmodat(AT_FDCWD, \"test-file\", 0x888): %ld\n", out);
    
            out = syscall(428, AT_FDCWD, "test-link", 0x888, AT_SYMLINK_NOFOLLOW);
            printf("fchmodat4(AT_FDCWD, \"test-link\", 0x888, AT_SYMLINK_NOFOLLOW): %ld\n", out);
    
            out = syscall(428, AT_FDCWD, "test-link", 0x888, 0);
            printf("fchmodat4(AT_FDCWD, \"test-link\", 0x888, 0): %ld\n", out);
    
            out = syscall(268, AT_FDCWD, "test-link", 0x888);
            printf("fchmodat(AT_FDCWD, \"test-link\", 0x888): %ld\n", out);
    
            return 0;
    }
    $ gcc test.c -o test
    $ ./test
    fchmodat4(AT_FDCWD, "test-file", 0x888, AT_SYMLINK_NOFOLLOW): 0
    fchmodat4(AT_FDCWD, "test-file", 0x888, 0): 0
    fchmodat(AT_FDCWD, "test-file", 0x888): 0
    fchmodat4(AT_FDCWD, "test-link", 0x888, AT_SYMLINK_NOFOLLOW): -1
    fchmodat4(AT_FDCWD, "test-link", 0x888, 0): 0
    fchmodat(AT_FDCWD, "test-link", 0x888): 0

While I don't think there's any reason what's there is unacceptable, I
don't really consider this finished.  I couldn't find a cookbook for
"here's how you add a system call", but all I really did was "git grep
add | grep syscall" so if there's something out there then please let me
know and I'll follow it.  Specifically, I haven't:

* Added any sort of documentation.  I don't find anything with a "git
  grep fchmodat", so I'm assuming it's just the man pages that are
  relevant here.
* Fixed any of the other architectures.  I'm assuming this is just the
  mechanical process of fixing all these in the same way I did for x86.

      arch/alpha/kernel/syscalls/syscall.tbl:461      common  fchmodat                        sys_fchmodat
      arch/arm/tools/syscall.tbl:333  common  fchmodat                sys_fchmodat
      arch/arm64/include/asm/unistd32.h:#define __NR_fchmodat 333
      arch/arm64/include/asm/unistd32.h:__SYSCALL(__NR_fchmodat, sys_fchmodat)
      arch/ia64/kernel/fsys.S:        data8 0                         // fchmodat
      arch/ia64/kernel/syscalls/syscall.tbl:268       common  fchmodat                        sys_fchmodat
      arch/m68k/kernel/syscalls/syscall.tbl:299       common  fchmodat                        sys_fchmodat
      arch/microblaze/kernel/syscalls/syscall.tbl:306 common  fchmodat                        sys_fchmodat
      arch/mips/kernel/syscalls/syscall_n32.tbl:262   n32     fchmodat                        sys_fchmodat
      arch/mips/kernel/syscalls/syscall_n64.tbl:258   n64     fchmodat                        sys_fchmodat
      arch/mips/kernel/syscalls/syscall_o32.tbl:299   o32     fchmodat                        sys_fchmodat
      arch/parisc/kernel/syscalls/syscall.tbl:286     common  fchmodat                sys_fchmodat
      arch/powerpc/kernel/syscalls/syscall.tbl:297    common  fchmodat                        sys_fchmodat
      arch/s390/kernel/syscalls/syscall.tbl:299  common       fchmodat                sys_fchmodat                    sys_fchmodat
      arch/sh/include/uapi/asm/unistd_64.h:#define __NR_fchmodat              334
      arch/sh/kernel/syscalls/syscall.tbl:306 common  fchmodat                        sys_fchmodat
      arch/sh/kernel/syscalls_64.S:   .long sys_fchmodat
      arch/sparc/kernel/syscalls/syscall.tbl:295      common  fchmodat                sys_fchmodat
      arch/xtensa/kernel/syscalls/syscall.tbl:300     common  fchmodat                        sys_fchmodat
* Looked at anything in tools.  Again, I'm assuming it's just a
  mechanical process of looking at all of these and adding fchmodat4.

      tools/include/nolibc/nolibc.h:#ifdef __NR_fchmodat
      tools/include/nolibc/nolibc.h:  return my_syscall4(__NR_fchmodat, AT_FDCWD, path, mode, 0);
      tools/include/uapi/asm-generic/unistd.h:#define __NR_fchmodat 53
      tools/include/uapi/asm-generic/unistd.h:__SYSCALL(__NR_fchmodat, sys_fchmodat)
      tools/perf/arch/powerpc/entry/syscalls/syscall.tbl:297  common  fchmodat                        sys_fchmodat
      tools/perf/arch/s390/entry/syscalls/syscall.tbl:299  common     fchmodat                sys_fchmodat                    compat_sys_fchmodat
      tools/perf/arch/x86/entry/syscalls/syscall_64.tbl:268   common  fchmodat                __x64_sys_fchmodat
      tools/perf/builtin-trace.c:     { .name     = "fchmodat",

* Done anything with userspace, aside from thinking about the glibc code
  above.  I'd assume that I'm meant to bring in libc-alpha to the
  discussion, but I didn't want to do so this early in case this was
  just a non-starter.

I'm happy dealing with all of that, but given that I'm assuming there's
going to be some discussion I wanted to send out the proof-of-concept
first to see if this has any legs.  Aside from the glibc side the
remaining work smells pretty mechanical, so I figured I'd wait on that
until I knew it wasn't going to be a waste of time -- partially because
I'm lazy, but mostly because I just realized I blew my whole morning
working on this when all I really wanted to do was avoid discussing
fchmodat in the first place :)
    

^ permalink raw reply

* Re: [PATCH 1/5] glibc: Perform rseq(2) registration at C startup and thread creation (v10)
From: Mathieu Desnoyers @ 2019-05-31 18:10 UTC (permalink / raw)
  To: Florian Weimer
  Cc: carlos, Joseph Myers, Szabolcs Nagy, libc-alpha, Thomas Gleixner,
	Ben Maurer, Peter Zijlstra, Paul E. McKenney, Boqun Feng,
	Will Deacon, Dave Watson, Paul Turner, Rich Felker, linux-kernel,
	linux-api
In-Reply-To: <87muj2k4ov.fsf@oldenburg2.str.redhat.com>



----- On May 31, 2019, at 11:46 AM, Florian Weimer fweimer@redhat.com wrote:

> * Mathieu Desnoyers:
> 
>> Let's break this down into the various sub-issues involved:
>>
>> 1) How early do we need to setup rseq ? Should it be setup before:
>>    - LD_PRELOAD .so constructors ?
>>      - Without circular dependency,
>>      - With circular dependency,
>>    - audit libraries initialization ?
>>    - IFUNC resolvers ?
>>    - other callbacks ?
>>    - memory allocator calls ?
>>
>> We may end up in a situation where we need memory allocation to be setup
>> in order to initialize TLS before rseq can be registered for the main
>> thread. I suspect we will end up needing a fallbacks which always work
>> for the few cases that would try to use rseq too early in dl/libc startup.
> 
> I think the answer to that depends on whether it's okay to have an
> observable transition from “no rseq kernel support” to “kernel supports
> rseq”.

As far as my own use-cases are concerned, I only care that rseq is initialized
before LD_PRELOAD .so constructors are executed.

There appears to be some amount of documented limitations for what can be
done by the IFUNC resolvers. It might be acceptable to document that rseq
might not be initialized yet when those are executed.

I'd like to hear what others think about whether we should care about IFUNC
resolvers and audit libraries using restartable sequences TLS ?

[...]

> 
>> 4) Inability to touch a TLS variable (__rseq_abi) from ld-linux-*.so.2
>>    - Should we extend the dynamic linker to allow such TLS variable to be
>>      accessed ? If so, how much effort is required ?
>>    - Can we find an alternative way to initialize rseq early during
>>      dl init stages while still performing the TLS access from a function
>>      implemented within libc.so ?
> 
> This is again related to the answer for (1).  There are various hacks we
> could implement to make the initialization invisible (e.g., computing
> the address of the variable using the equivalent of dlsym, after loading
> all the initial objects and before starting relocation).  If it's not
> too hard to add TLS support to ld.so, we can consider that as well.
> (The allocation side should be pretty easy, relocation support it could
> be more tricky.)
> 
>> So far, I got rseq to be initialized before LD_PRELOADed library
>> constructors by doing the initialization in a constructor within
>> libc.so. I don't particularly like this approach, because the
>> constructor order is not guaranteed.
> 
> Right.

One question related to use of constructors: AFAIU, if a library depends
on glibc, ELF guarantees that the glibc constructor will be executed first,
before the other library.

Which leaves us with the execution order of constructors within libc.so,
which is not guaranteed if we just use __attribute__ ((constructor)).
However, all gcc versions that are required to build recent glibc
seem to support a constructor with a "priority" value (lower gets
executed first, and those are executed before constructors without
priority).

Could we do e.g.:

--- a/include/libc-internal.h
+++ b/include/libc-internal.h
@@ -21,6 +21,12 @@
 
 #include <hp-timing.h>
 
+/* Libc constructor priority order. Lower is executed first.  */
+enum libc_constructor_prio {
+       /* Priorities between 0 and 100 are reserved.  */
+       LIBC_CONSTRUCTOR_PRIO_RSEQ_INIT = 1000,
+};
+
 /* Initialize the `__libc_enable_secure' flag.  */
 extern void __libc_init_secure (void);
 
and

csu/libc-start.c:

static
__attribute__ ((constructor (LIBC_CONSTRUCTOR_PRIO_RSEQ_INIT)))
void __rseq_libc_init (void)
{
  rseq_init ();
  /* Register rseq ABI to the kernel.   */
  (void) rseq_register_current_thread ();
}

[...]

Thanks,

Mathieu




-- 
Mathieu Desnoyers
EfficiOS Inc.
http://www.efficios.com

^ permalink raw reply

* [PATCH ghak90 V6] fixup! audit: add containerid filtering
From: Richard Guy Briggs @ 2019-05-31 17:53 UTC (permalink / raw)
  To: containers, linux-api, Linux-Audit Mailing List, linux-fsdevel,
	LKML, netdev, netfilter-devel
  Cc: Paul Moore, sgrubb, omosnace, dhowells, simo, eparis, serge,
	ebiederm, nhorman, Richard Guy Briggs

Remove the BUG() call since we will never have an invalid op value as
audit_data_to_entry()/audit_to_op() ensure that the op value is a a
known good value.

Signed-off-by: Richard Guy Briggs <rgb@redhat.com>
---
 kernel/auditfilter.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index 407b5bb3b4c6..385a114a1254 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -1244,7 +1244,6 @@ int audit_comparator64(u64 left, u32 op, u64 right)
 	case Audit_bittest:
 		return ((left & right) == right);
 	default:
-		BUG();
 		return 0;
 	}
 }
-- 
1.8.3.1

^ permalink raw reply related

* Re: [RFCv2 5/6] mm: introduce external memory hinting API
From: Daniel Colascione @ 2019-05-31 17:35 UTC (permalink / raw)
  To: Minchan Kim
  Cc: Andrew Morton, linux-mm, LKML, Linux API, Michal Hocko,
	Johannes Weiner, Tim Murray, Joel Fernandes, Suren Baghdasaryan,
	Shakeel Butt, Sonny Rao, Brian Geffon, Jann Horn, Oleg Nesterov,
	Christian Brauner, oleksandr, hdanton
In-Reply-To: <20190531064313.193437-6-minchan@kernel.org>

On Thu, May 30, 2019 at 11:43 PM Minchan Kim <minchan@kernel.org> wrote:
>
> There is some usecase that centralized userspace daemon want to give
> a memory hint like MADV_[COLD|PAGEEOUT] to other process. Android's
> ActivityManagerService is one of them.
>
> It's similar in spirit to madvise(MADV_WONTNEED), but the information
> required to make the reclaim decision is not known to the app. Instead,
> it is known to the centralized userspace daemon(ActivityManagerService),
> and that daemon must be able to initiate reclaim on its own without
> any app involvement.
>
> To solve the issue, this patch introduces new syscall process_madvise(2).
> It could give a hint to the exeternal process of pidfd.
>
>  int process_madvise(int pidfd, void *addr, size_t length, int advise,
>                         unsigned long cookie, unsigned long flag);
>
> Since it could affect other process's address range, only privileged
> process(CAP_SYS_PTRACE) or something else(e.g., being the same UID)
> gives it the right to ptrace the process could use it successfully.
>
> The syscall has a cookie argument to privode atomicity(i.e., detect
> target process's address space change since monitor process has parsed
> the address range of target process so the operaion could fail in case
> of happening race). Although there is no interface to get a cookie
> at this moment, it could be useful to consider it as argument to avoid
> introducing another new syscall in future. It could support *atomicity*
> for disruptive hint(e.g., MADV_DONTNEED|FREE).
> flag argument is reserved for future use if we need to extend the API.

How about a compromise? Let's allow all madvise hints if the process
is calling process_madvise *on itself* (which will be useful once we
wire up the atomicity cookie) and restrict the cross-process case to
the hints you've mentioned. This way, the restriction on madvise hints
isn't tied to the specific API, but to the relationship between hinter
and hintee.

^ permalink raw reply

* Re: [RFC][PATCH] link.2: AT_ATOMIC_DATA and AT_ATOMIC_METADATA
From: Amir Goldstein @ 2019-05-31 17:22 UTC (permalink / raw)
  To: Theodore Ts'o
  Cc: Jan Kara, Darrick J . Wong, Dave Chinner, Chris Mason, Al Viro,
	linux-fsdevel, linux-xfs, Ext4, Linux Btrfs, Linux API
In-Reply-To: <20190531164136.GA3066@mit.edu>

On Fri, May 31, 2019 at 7:41 PM Theodore Ts'o <tytso@mit.edu> wrote:
>
> On Fri, May 31, 2019 at 06:21:45PM +0300, Amir Goldstein wrote:
> > What do you think of:
> >
> > "AT_ATOMIC_DATA (since Linux 5.x)
> > A filesystem which accepts this flag will guarantee that if the linked file
> > name exists after a system crash, then all of the data written to the file
> > and all of the file's metadata at the time of the linkat(2) call will be
> > visible.
>
> ".... will be visible after the the file system is remounted".  (Never
> hurts to be explicit.)
>
> > The way to achieve this guarantee on old kernels is to call fsync (2)
> > before linking the file, but doing so will also results in flushing of
> > volatile disk caches.
> >
> > A filesystem which accepts this flag does NOT
> > guarantee that any of the file hardlinks will exist after a system crash,
> > nor that the last observed value of st_nlink (see stat (2)) will persist."
> >
>
> This is I think more precise:
>
>     This guarantee can be achieved by calling fsync(2) before linking
>     the file, but there may be more performant ways to provide these
>     semantics.  In particular, note that the use of the AT_ATOMIC_DATA
>     flag does *not* guarantee that the new link created by linkat(2)
>     will be persisted after a crash.

OK. Just to be clear, mentioning hardlinks and st_link is not needed
in your opinion?

>
> We should also document that a file system which does not implement
> this flag MUST return EINVAL if it is passed this flag to linkat(2).
>

OK. I think this part can be documented as possible reason for EINVAL
As in renameat(2) man page:
       EINVAL The filesystem does not support one of the flags in flags.

Thanks,
Amir.

^ permalink raw reply

* Re: [PATCH 1/7] General notification queue with user mmap()'able ring buffer
From: David Howells @ 2019-05-31 17:12 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: dhowells, Jann Horn, Greg KH, Al Viro, raven, linux-fsdevel,
	Linux API, linux-block, keyrings, linux-security-module,
	kernel list, Kees Cook, Kernel Hardening
In-Reply-To: <20190531164444.GD2606@hirez.programming.kicks-ass.net>

Peter Zijlstra <peterz@infradead.org> wrote:

> > > (and it has already been established that refcount_t doesn't work for
> > > usage count scenarios)
> > 
> > ?
> > 
> > Does that mean struct kref doesn't either?
> 
> Indeed, since kref is just a pointless wrapper around refcount_t it does
> not either.
> 
> The main distinction between a reference count and a usage count is that
> 0 means different things. For a refcount 0 means dead. For a usage count
> 0 is merely unused but valid.

Ah - I consider the terms interchangeable.

Take Documentation/filesystems/vfs.txt for instance:

  dget: open a new handle for an existing dentry (this just increments
	the usage count)

  dput: close a handle for a dentry (decrements the usage count). ...

  ...

  d_lookup: look up a dentry given its parent and path name component
	It looks up the child of that given name from the dcache
	hash table. If it is found, the reference count is incremented
	and the dentry is returned. The caller must use dput()
	to free the dentry when it finishes using it.

Here we interchange the terms.

Or https://www.kernel.org/doc/gorman/html/understand/understand013.html
which seems to interchange the terms in reference to struct page.

David

^ permalink raw reply

* Re: [RFCv2 3/6] mm: introduce MADV_PAGEOUT
From: Johannes Weiner @ 2019-05-31 16:59 UTC (permalink / raw)
  To: Minchan Kim
  Cc: Andrew Morton, linux-mm, LKML, linux-api, Michal Hocko,
	Tim Murray, Joel Fernandes, Suren Baghdasaryan, Daniel Colascione,
	Shakeel Butt, Sonny Rao, Brian Geffon, jannh, oleg, christian,
	oleksandr, hdanton
In-Reply-To: <20190531064313.193437-4-minchan@kernel.org>

Hi Michan,

this looks pretty straight-forward to me, only one kink:

On Fri, May 31, 2019 at 03:43:10PM +0900, Minchan Kim wrote:
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -2126,6 +2126,83 @@ static void shrink_active_list(unsigned long nr_to_scan,
>  			nr_deactivate, nr_rotated, sc->priority, file);
>  }
>  
> +unsigned long reclaim_pages(struct list_head *page_list)
> +{
> +	int nid = -1;
> +	unsigned long nr_isolated[2] = {0, };
> +	unsigned long nr_reclaimed = 0;
> +	LIST_HEAD(node_page_list);
> +	struct reclaim_stat dummy_stat;
> +	struct scan_control sc = {
> +		.gfp_mask = GFP_KERNEL,
> +		.priority = DEF_PRIORITY,
> +		.may_writepage = 1,
> +		.may_unmap = 1,
> +		.may_swap = 1,
> +	};
> +
> +	while (!list_empty(page_list)) {
> +		struct page *page;
> +
> +		page = lru_to_page(page_list);
> +		if (nid == -1) {
> +			nid = page_to_nid(page);
> +			INIT_LIST_HEAD(&node_page_list);
> +			nr_isolated[0] = nr_isolated[1] = 0;
> +		}
> +
> +		if (nid == page_to_nid(page)) {
> +			list_move(&page->lru, &node_page_list);
> +			nr_isolated[!!page_is_file_cache(page)] +=
> +						hpage_nr_pages(page);
> +			continue;
> +		}
> +
> +		mod_node_page_state(NODE_DATA(nid), NR_ISOLATED_ANON,
> +					nr_isolated[0]);
> +		mod_node_page_state(NODE_DATA(nid), NR_ISOLATED_FILE,
> +					nr_isolated[1]);
> +		nr_reclaimed += shrink_page_list(&node_page_list,
> +				NODE_DATA(nid), &sc, TTU_IGNORE_ACCESS,
> +				&dummy_stat, true);
> +		while (!list_empty(&node_page_list)) {
> +			struct page *page = lru_to_page(&node_page_list);
> +
> +			list_del(&page->lru);
> +			putback_lru_page(page);
> +		}
> +		mod_node_page_state(NODE_DATA(nid), NR_ISOLATED_ANON,
> +					-nr_isolated[0]);
> +		mod_node_page_state(NODE_DATA(nid), NR_ISOLATED_FILE,
> +					-nr_isolated[1]);
> +		nid = -1;
> +	}
> +
> +	if (!list_empty(&node_page_list)) {
> +		mod_node_page_state(NODE_DATA(nid), NR_ISOLATED_ANON,
> +					nr_isolated[0]);
> +		mod_node_page_state(NODE_DATA(nid), NR_ISOLATED_FILE,
> +					nr_isolated[1]);
> +		nr_reclaimed += shrink_page_list(&node_page_list,
> +				NODE_DATA(nid), &sc, TTU_IGNORE_ACCESS,
> +				&dummy_stat, true);
> +		mod_node_page_state(NODE_DATA(nid), NR_ISOLATED_ANON,
> +					-nr_isolated[0]);
> +		mod_node_page_state(NODE_DATA(nid), NR_ISOLATED_FILE,
> +					-nr_isolated[1]);
> +
> +		while (!list_empty(&node_page_list)) {
> +			struct page *page = lru_to_page(&node_page_list);
> +
> +			list_del(&page->lru);
> +			putback_lru_page(page);
> +		}
> +
> +	}

The NR_ISOLATED accounting, nid parsing etc. is really awkward and
makes it hard to see what the function actually does.

Can you please make those ISOLATED counters part of the isolation API?
Your patch really shows this is an overdue cleanup.

These are fast local percpu counters, we don't need the sprawling
batching we do all over vmscan.c, migrate.c, khugepaged.c,
compaction.c etc. Isolation can increase the counter page by page, and
reclaim or putback can likewise decrease them one by one.

It looks like mlock is the only user of the isolation api that does
not participate in the NR_ISOLATED_* counters protocol, but I don't
see why it wouldn't, or why doing so would hurt.

There are also seem to be quite a few callsites that use the atomic
versions of the counter API when they're clearly under the irqsafe
lru_lock. That would be fixed automatically by this work as well.

^ permalink raw reply

* Re: [PATCH 1/7] General notification queue with user mmap()'able ring buffer
From: Peter Zijlstra @ 2019-05-31 16:44 UTC (permalink / raw)
  To: David Howells
  Cc: Jann Horn, Greg KH, Al Viro, raven, linux-fsdevel, Linux API,
	linux-block, keyrings, linux-security-module, kernel list,
	Kees Cook, Kernel Hardening
In-Reply-To: <606.1559312412@warthog.procyon.org.uk>

On Fri, May 31, 2019 at 03:20:12PM +0100, David Howells wrote:
> Peter Zijlstra <peterz@infradead.org> wrote:

> > (and it has already been established that refcount_t doesn't work for
> > usage count scenarios)
> 
> ?
> 
> Does that mean struct kref doesn't either?

Indeed, since kref is just a pointless wrapper around refcount_t it does
not either.

The main distinction between a reference count and a usage count is that
0 means different things. For a refcount 0 means dead. For a usage count
0 is merely unused but valid.

Incrementing a 0 refcount is a serious bug -- use-after-free (and hence
refcount_t will refuse this and splat), for a usage count this is no
problem.

Now, it is sort-of possible to merge the two, by basically stating
something like: usage = refcount - 1. But that can get tricky and people
have not really liked the result much for the few times I tried.

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox