linux-ext4.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
From: "Kirill A. Shutemov" <kirill@shutemov.name>
To: Matthew Wilcox <willy@infradead.org>
Cc: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>,
	Theodore Ts'o <tytso@mit.edu>,
	Andreas Dilger <adilger.kernel@dilger.ca>,
	Jan Kara <jack@suse.com>,
	Andrew Morton <akpm@linux-foundation.org>,
	Alexander Viro <viro@zeniv.linux.org.uk>,
	Hugh Dickins <hughd@google.com>,
	Andrea Arcangeli <aarcange@redhat.com>,
	Dave Hansen <dave.hansen@intel.com>,
	Vlastimil Babka <vbabka@suse.cz>,
	Ross Zwisler <ross.zwisler@linux.intel.com>,
	linux-ext4@vger.kernel.org, linux-fsdevel@vger.kernel.org,
	linux-kernel@vger.kernel.org, linux-mm@kvack.org,
	linux-block@vger.kernel.org
Subject: Re: [PATCHv6 01/37] mm, shmem: swich huge tmpfs to multi-order radix-tree entries
Date: Thu, 9 Feb 2017 19:58:20 +0300	[thread overview]
Message-ID: <20170209165820.GA12768@node.shutemov.name> (raw)
In-Reply-To: <20170209035727.GQ2267@bombadil.infradead.org>

On Wed, Feb 08, 2017 at 07:57:27PM -0800, Matthew Wilcox wrote:
> On Thu, Jan 26, 2017 at 02:57:43PM +0300, Kirill A. Shutemov wrote:
> > +++ b/include/linux/pagemap.h
> > @@ -332,6 +332,15 @@ static inline struct page *grab_cache_page_nowait(struct address_space *mapping,
> >  			mapping_gfp_mask(mapping));
> >  }
> >  
> > +static inline struct page *find_subpage(struct page *page, pgoff_t offset)
> > +{
> > +	VM_BUG_ON_PAGE(PageTail(page), page);
> > +	VM_BUG_ON_PAGE(page->index > offset, page);
> > +	VM_BUG_ON_PAGE(page->index + (1 << compound_order(page)) < offset,
> > +			page);
> > +	return page - page->index + offset;
> > +}
> 
> What would you think to:
> 
> static inline void check_page_index(struct page *page, pgoff_t offset)
> {
> 	VM_BUG_ON_PAGE(PageTail(page), page);
> 	VM_BUG_ON_PAGE(page->index > offset, page);
> 	VM_BUG_ON_PAGE(page->index + (1 << compound_order(page)) <= offset,
> 			page);
> }
> 
> (I think I fixed an off-by-one error up there ...  if
> index + (1 << order) == offset, this is also a bug, right?
> because offset would then refer to the next page, not this page)

Right, thanks.

> 
> static inline struct page *find_subpage(struct page *page, pgoff_t offset)
> {
> 	check_page_index(page, offset);
> 	return page + (offset - page->index);
> }
> 
> ... then you can use check_page_index down ...

Okay, makes sense.

> 
> > @@ -1250,7 +1233,6 @@ struct page *find_lock_entry(struct address_space *mapping, pgoff_t offset)
> >  			put_page(page);
> >  			goto repeat;
> >  		}
> > -		VM_BUG_ON_PAGE(page_to_pgoff(page) != offset, page);
> 
> ... here?

Ok.

> > @@ -1472,25 +1451,35 @@ unsigned find_get_pages(struct address_space *mapping, pgoff_t start,
> >  			goto repeat;
> >  		}
> >  
> > +		/* For multi-order entries, find relevant subpage */
> > +		if (PageTransHuge(page)) {
> > +			VM_BUG_ON(index - page->index < 0);
> > +			VM_BUG_ON(index - page->index >= HPAGE_PMD_NR);
> > +			page += index - page->index;
> > +		}
> 
> Use find_subpage() here?

Ok.

> >  		pages[ret] = page;
> >  		if (++ret == nr_pages)
> >  			break;
> > +		if (!PageTransCompound(page))
> > +			continue;
> > +		for (refs = 0; ret < nr_pages &&
> > +				(index + 1) % HPAGE_PMD_NR;
> > +				ret++, refs++, index++)
> > +			pages[ret] = ++page;
> > +		if (refs)
> > +			page_ref_add(compound_head(page), refs);
> > +		if (ret == nr_pages)
> > +			break;
> 
> Can we avoid referencing huge pages specifically in the page cache?  I'd
> like us to get to the point where we can put arbitrary compound pages into
> the page cache.  For example, I think this can be written as:
> 
> 		if (!PageCompound(page))
> 			continue;
> 		for (refs = 0; ret < nr_pages; refs++, index++) {
> 			if (index > page->index + (1 << compound_order(page)))
> 				break;
> 			pages[ret++] = ++page;
> 		}
> 		if (refs)
> 			page_ref_add(compound_head(page), refs);
> 		if (ret == nr_pages)
> 			break;

That's slightly more costly, but I guess that's fine.

> > @@ -1541,19 +1533,12 @@ unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index,
> >  
> > +		/* For multi-order entries, find relevant subpage */
> > +		if (PageTransHuge(page)) {
> > +			VM_BUG_ON(index - page->index < 0);
> > +			VM_BUG_ON(index - page->index >= HPAGE_PMD_NR);
> > +			page += index - page->index;
> > +		}
> > +
> >  		pages[ret] = page;
> >  		if (++ret == nr_pages)
> >  			break;
> > +		if (!PageTransCompound(page))
> > +			continue;
> > +		for (refs = 0; ret < nr_pages &&
> > +				(index + 1) % HPAGE_PMD_NR;
> > +				ret++, refs++, index++)
> > +			pages[ret] = ++page;
> > +		if (refs)
> > +			page_ref_add(compound_head(page), refs);
> > +		if (ret == nr_pages)
> > +			break;
> >  	}
> >  	rcu_read_unlock();
> >  	return ret;
> 
> Ugh, the same code again.  Hmm ... we only need to modify 'ret' as a result
> of this ... so could we split it out like this?
> 
> static unsigned populate_pages(struct page **pages, unsigned i, unsigned max,
>                                 struct page *page)
> {
>         unsigned refs = 0;
>         for (;;) {
>                 pages[i++] = page;
>                 if (i == max)
>                         break;
>                 if (PageHead(page + 1))
>                         break;

Hm? PageHead()? No. The next page can head or small.

I *guess* we can get away with !PageTail(page + 1)...

>                 page++;
>                 refs++;
>         }
>         if (refs)
>                 page_ref_add(compound_head(page), refs);
>         return i;
> }
> 
> > +unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *indexp,
> >  			int tag, unsigned int nr_pages, struct page **pages)
> 
> >  			break;
> > +		if (!PageTransCompound(page))
> > +			continue;
> > +		for (refs = 0; ret < nr_pages &&
> > +				(index + 1) % HPAGE_PMD_NR;
> > +				ret++, refs++, index++)
> > +			pages[ret] = ++page;
> > +		if (refs)
> > +			page_ref_add(compound_head(page), refs);
> > +		if (ret == nr_pages)
> > +			break;
> >  	}
> 
> ... and again!
> 
> > @@ -2326,25 +2337,26 @@ void filemap_map_pages(struct vm_fault *vmf,
> > +		/* For multi-order entries, find relevant subpage */
> > +		if (PageTransHuge(page)) {
> > +			VM_BUG_ON(index - page->index < 0);
> > +			VM_BUG_ON(index - page->index >= HPAGE_PMD_NR);
> > +			page += index - page->index;
> > +		}
> > +
> >  		if (!PageUptodate(page) ||
> >  				PageReadahead(page) ||
> 
> Readahead is PF_NO_COMPOUND ... so I don't see why this works?

We wouldn't see pages with readahead flag set/cleared until huge pages
support in ext4 is enabled at very end of the patchset.

> 
> > @@ -2378,8 +2390,14 @@ void filemap_map_pages(struct vm_fault *vmf,
> >  		/* Huge page is mapped? No need to proceed. */
> >  		if (pmd_trans_huge(*vmf->pmd))
> >  			break;
> > -		if (iter.index == end_pgoff)
> > +		if (index == end_pgoff)
> >  			break;
> > +		if (page && PageTransCompound(page) &&
> > +				(index & (HPAGE_PMD_NR - 1)) !=
> > +				HPAGE_PMD_NR - 1) {
> > +			index++;
> > +			goto repeat;
> 
> Do we really have to go all the way back to the beginning of the loop?  It'd
> be nice to be able to insert all of the relevant PTEs in a shorter loop here.
> We'd need to bump the reference count N more times, and I think we do need
> to check HWPoison for each subpage.

I'll look into it.

Thanks for review.

-- 
 Kirill A. Shutemov

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

  reply	other threads:[~2017-02-09 16:58 UTC|newest]

Thread overview: 67+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2017-01-26 11:57 [PATCHv6 00/37] ext4: support of huge pages Kirill A. Shutemov
2017-01-26 11:57 ` [PATCHv6 01/37] mm, shmem: swich huge tmpfs to multi-order radix-tree entries Kirill A. Shutemov
2017-02-09  3:57   ` Matthew Wilcox
2017-02-09 16:58     ` Kirill A. Shutemov [this message]
2017-02-13 13:43       ` Kirill A. Shutemov
2017-01-26 11:57 ` [PATCHv6 02/37] Revert "radix-tree: implement radix_tree_maybe_preload_order()" Kirill A. Shutemov
2017-01-26 15:38   ` Matthew Wilcox
2017-01-26 11:57 ` [PATCHv6 03/37] page-flags: relax page flag policy for few flags Kirill A. Shutemov
2017-02-09  4:01   ` Matthew Wilcox
2017-02-13 13:59     ` Kirill A. Shutemov
2017-01-26 11:57 ` [PATCHv6 04/37] mm, rmap: account file thp pages Kirill A. Shutemov
2017-02-09 20:17   ` Matthew Wilcox
2017-01-26 11:57 ` [PATCHv6 05/37] thp: try to free page's buffers before attempt split Kirill A. Shutemov
2017-02-09 20:14   ` Matthew Wilcox
2017-02-13 14:32     ` Kirill A. Shutemov
2017-01-26 11:57 ` [PATCHv6 06/37] thp: handle write-protection faults for file THP Kirill A. Shutemov
2017-01-26 15:44   ` Matthew Wilcox
2017-01-26 15:57     ` Kirill A. Shutemov
2017-02-09 20:19   ` Matthew Wilcox
2017-01-26 11:57 ` [PATCHv6 07/37] filemap: allocate huge page in page_cache_read(), if allowed Kirill A. Shutemov
2017-02-09 21:18   ` Matthew Wilcox
2017-02-13 15:17     ` Kirill A. Shutemov
2017-01-26 11:57 ` [PATCHv6 08/37] filemap: handle huge pages in do_generic_file_read() Kirill A. Shutemov
2017-02-09 21:55   ` Matthew Wilcox
2017-02-13 15:33     ` Kirill A. Shutemov
2017-02-13 16:01       ` Matthew Wilcox
2017-02-13 16:09         ` Matthew Wilcox
2017-02-13 16:28   ` Matthew Wilcox
2017-01-26 11:57 ` [PATCHv6 09/37] filemap: allocate huge page in pagecache_get_page(), if allowed Kirill A. Shutemov
2017-02-09 21:59   ` Matthew Wilcox
2017-01-26 11:57 ` [PATCHv6 10/37] filemap: handle huge pages in filemap_fdatawait_range() Kirill A. Shutemov
2017-02-09 23:03   ` Matthew Wilcox
2017-01-26 11:57 ` [PATCHv6 11/37] HACK: readahead: alloc huge pages, if allowed Kirill A. Shutemov
2017-02-09 23:34   ` Matthew Wilcox
2017-02-10  0:23     ` Andreas Dilger
2017-02-10 14:51       ` Matthew Wilcox
2017-01-26 11:57 ` [PATCHv6 12/37] brd: make it handle huge pages Kirill A. Shutemov
2017-02-10 17:24   ` Matthew Wilcox
2017-01-26 11:57 ` [PATCHv6 13/37] mm: make write_cache_pages() work on " Kirill A. Shutemov
2017-02-10 17:46   ` Matthew Wilcox
2017-01-26 11:57 ` [PATCHv6 14/37] thp: introduce hpage_size() and hpage_mask() Kirill A. Shutemov
2017-01-26 11:57 ` [PATCHv6 15/37] thp: do not threat slab pages as huge in hpage_{nr_pages,size,mask} Kirill A. Shutemov
2017-02-10 22:13   ` Matthew Wilcox
2017-01-26 11:57 ` [PATCHv6 16/37] thp: make thp_get_unmapped_area() respect S_HUGE_MODE Kirill A. Shutemov
2017-02-10 17:50   ` Matthew Wilcox
2017-01-26 11:57 ` [PATCHv6 17/37] fs: make block_read_full_page() be able to read huge page Kirill A. Shutemov
2017-02-10 17:58   ` Matthew Wilcox
2017-01-26 11:58 ` [PATCHv6 18/37] fs: make block_write_{begin,end}() be able to handle huge pages Kirill A. Shutemov
2017-01-26 11:58 ` [PATCHv6 19/37] fs: make block_page_mkwrite() aware about " Kirill A. Shutemov
2017-01-26 11:58 ` [PATCHv6 20/37] truncate: make truncate_inode_pages_range() " Kirill A. Shutemov
2017-01-26 11:58 ` [PATCHv6 21/37] truncate: make invalidate_inode_pages2_range() " Kirill A. Shutemov
2017-01-26 11:58 ` [PATCHv6 22/37] mm, hugetlb: switch hugetlbfs to multi-order radix-tree entries Kirill A. Shutemov
2017-01-26 11:58 ` [PATCHv6 23/37] mm: account huge pages to dirty, writaback, reclaimable, etc Kirill A. Shutemov
2017-01-26 11:58 ` [PATCHv6 24/37] ext4: make ext4_mpage_readpages() hugepage-aware Kirill A. Shutemov
2017-01-26 11:58 ` [PATCHv6 25/37] ext4: make ext4_writepage() work on huge pages Kirill A. Shutemov
2017-01-26 11:58 ` [PATCHv6 26/37] ext4: handle huge pages in ext4_page_mkwrite() Kirill A. Shutemov
2017-01-26 11:58 ` [PATCHv6 27/37] ext4: handle huge pages in __ext4_block_zero_page_range() Kirill A. Shutemov
2017-01-26 11:58 ` [PATCHv6 28/37] ext4: make ext4_block_write_begin() aware about huge pages Kirill A. Shutemov
2017-01-26 11:58 ` [PATCHv6 29/37] ext4: handle huge pages in ext4_da_write_end() Kirill A. Shutemov
2017-01-26 11:58 ` [PATCHv6 30/37] ext4: make ext4_da_page_release_reservation() aware about huge pages Kirill A. Shutemov
2017-01-26 11:58 ` [PATCHv6 31/37] ext4: handle writeback with " Kirill A. Shutemov
2017-01-26 11:58 ` [PATCHv6 32/37] ext4: make EXT4_IOC_MOVE_EXT work " Kirill A. Shutemov
2017-01-26 11:58 ` [PATCHv6 33/37] ext4: fix SEEK_DATA/SEEK_HOLE for " Kirill A. Shutemov
2017-01-26 11:58 ` [PATCHv6 34/37] ext4: make fallocate() operations work with " Kirill A. Shutemov
2017-01-26 11:58 ` [PATCHv6 35/37] ext4: reserve larger jounral transaction for " Kirill A. Shutemov
2017-01-26 11:58 ` [PATCHv6 36/37] mm, fs, ext4: expand use of page_mapping() and page_to_pgoff() Kirill A. Shutemov
2017-01-26 11:58 ` [PATCHv6 37/37] ext4, vfs: add huge= mount option Kirill A. Shutemov

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20170209165820.GA12768@node.shutemov.name \
    --to=kirill@shutemov.name \
    --cc=aarcange@redhat.com \
    --cc=adilger.kernel@dilger.ca \
    --cc=akpm@linux-foundation.org \
    --cc=dave.hansen@intel.com \
    --cc=hughd@google.com \
    --cc=jack@suse.com \
    --cc=kirill.shutemov@linux.intel.com \
    --cc=linux-block@vger.kernel.org \
    --cc=linux-ext4@vger.kernel.org \
    --cc=linux-fsdevel@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-mm@kvack.org \
    --cc=ross.zwisler@linux.intel.com \
    --cc=tytso@mit.edu \
    --cc=vbabka@suse.cz \
    --cc=viro@zeniv.linux.org.uk \
    --cc=willy@infradead.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).