From: j.glisse@gmail.com
Subject: [PATCH 09/11] fs/ext4: add support for hmm migration to remote memory of pagecache.
Date: Fri, 2 May 2014 09:52:08 -0400
Message-ID: <1399038730-25641-10-git-send-email-j.glisse@gmail.com>
In-Reply-To: <1399038730-25641-1-git-send-email-j.glisse@gmail.com>
To: linux-mm@kvack.org, linux-kernel@vger.kernel.org, linux-fsdevel@vger.kernel.org
Cc: Jérôme Glisse

From: Jérôme Glisse

This adds support for migrating pages of the ext4 filesystem to remote
device memory using the hmm infrastructure. Writeback needs special
handling, as we want to keep the content in remote memory.

Signed-off-by: Jérôme Glisse
---
 fs/ext4/file.c  |  20 +++++++
 fs/ext4/inode.c | 175 +++++++++++++++++++++++++++++++++++++++++++++++++-------
 2 files changed, 174 insertions(+), 21 deletions(-)

diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 708aad7..7c787d5 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -26,6 +26,7 @@
 #include
 #include
 #include
+#include
 #include "ext4.h"
 #include "ext4_jbd2.h"
 #include "xattr.h"
@@ -304,6 +305,7 @@ static int ext4_find_unwritten_pgoff(struct inode *inode,
 		unsigned long nr_pages;
 
 		num = min_t(pgoff_t, end - index, PAGEVEC_SIZE);
+retry:
 		nr_pages = pagevec_lookup(&pvec, inode->i_mapping, index,
 					  (pgoff_t)num);
 		if (nr_pages == 0) {
@@ -321,6 +323,31 @@ static int ext4_find_unwritten_pgoff(struct inode *inode,
 			break;
 		}
 
+		for (i = 0; i < nr_pages; i++) {
+			struct page *page = pvec.pages[i];
+
+			if (radix_tree_exception(page)) {
+				swp_entry_t swap = radix_to_swp_entry(page);
+
+				/* FIXME How to handle hmm migration failure? */
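+				/*
+				 * An hmm entry: migrate the page back to
+				 * system memory, clear the remaining
+				 * exceptional entries so pagevec_release()
+				 * only drops real pages, then retry the
+				 * lookup from the top.
+				 */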
+				hmm_pagecache_migrate(inode->i_mapping, swap);
+				for (; i < nr_pages; i++) {
+					if (radix_tree_exception(pvec.pages[i])) {
+						pvec.pages[i] = NULL;
+					}
+				}
+				pagevec_release(&pvec);
+				goto retry;
+			}
+		}
+
 		/*
 		 * If this is the first time to go into the loop and
 		 * offset is smaller than the first page offset, it will be a
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index b1dc334..f2558e2 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -39,6 +39,7 @@
 #include
 #include
 #include
+#include
 
 #include "ext4_jbd2.h"
 #include "xattr.h"
@@ -1462,16 +1463,37 @@ static void mpage_release_unused_pages(struct mpage_da_data *mpd,
 			break;
 		for (i = 0; i < nr_pages; i++) {
 			struct page *page = pvec.pages[i];
-			if (page->index > end)
-				break;
-			BUG_ON(!PageLocked(page));
-			BUG_ON(PageWriteback(page));
-			if (invalidate) {
-				block_invalidatepage(page, 0, PAGE_CACHE_SIZE);
-				ClearPageUptodate(page);
+
+			if (radix_tree_exceptional_entry(page)) {
+				swp_entry_t swap = radix_to_swp_entry(page);
+
+				page = hmm_pagecache_page(mapping, swap);
+				pvec.pages[i] = page;
+				if (page->index > end)
+					break;
+			} else {
+				if (page->index > end)
+					break;
+				BUG_ON(!PageLocked(page));
+				BUG_ON(PageWriteback(page));
+				if (invalidate) {
+					block_invalidatepage(page, 0, PAGE_CACHE_SIZE);
+					ClearPageUptodate(page);
+				}
 			}
 			unlock_page(page);
 		}
+		for (; i < nr_pages; i++) {
+			struct page *page = pvec.pages[i];
+
+			if (radix_tree_exceptional_entry(page)) {
+				swp_entry_t swap = radix_to_swp_entry(page);
+
+				page = hmm_pagecache_page(mapping, swap);
+				unlock_page(page);
+				pvec.pages[i] = page;
+			}
+		}
 		index = pvec.pages[nr_pages - 1]->index + 1;
 		pagevec_release(&pvec);
 	}
@@ -2060,6 +2082,20 @@ static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd)
 					  PAGEVEC_SIZE);
 		if (nr_pages == 0)
 			break;
+
+		/* Replace hmm entry with the page backing it. At this point
+		 * they are uptodate and locked.
+		 */
+		for (i = 0; i < nr_pages; i++) {
+			struct page *page = pvec.pages[i];
+
+			if (radix_tree_exceptional_entry(page)) {
+				swp_entry_t swap = radix_to_swp_entry(page);
+
+				pvec.pages[i] = hmm_pagecache_page(inode->i_mapping, swap);
+			}
+		}
+
 		for (i = 0; i < nr_pages; i++) {
 			struct page *page = pvec.pages[i];
 
@@ -2331,13 +2367,61 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd)
 	mpd->map.m_len = 0;
 	mpd->next_page = index;
 	while (index <= end) {
+		pgoff_t save_index = index;
+		bool migrated;
+
 		nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag,
 			      min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1);
 		if (nr_pages == 0)
 			goto out;
 
+		for (i = 0, migrated = false; i < nr_pages; i++) {
+			struct page *page = pvec.pages[i];
+
+			if (radix_tree_exceptional_entry(page)) {
+				swp_entry_t swap = radix_to_swp_entry(page);
+
+				/* This cannot happen! */
+				VM_BUG_ON(!is_hmm_entry(swap));
+				page = hmm_pagecache_writeback(mapping, swap);
+				if (page == NULL) {
+					migrated = true;
+					pvec.pages[i] = NULL;
+				}
+			}
+		}
+
+		/* Some rmem was migrated, we need to redo the page cache lookup. */
+		if (migrated) {
+			for (i = 0; i < nr_pages; i++) {
+				struct page *page = pvec.pages[i];
+
+				if (page && radix_tree_exceptional_entry(page)) {
+					swp_entry_t swap = radix_to_swp_entry(page);
+
+					page = hmm_pagecache_page(mapping, swap);
+					unlock_page(page);
+					page_cache_release(page);
+					pvec.pages[i] = page;
+				}
+			}
+			pagevec_release(&pvec);
+			cond_resched();
+			index = save_index;
+			continue;
+		}
+
 		for (i = 0; i < nr_pages; i++) {
 			struct page *page = pvec.pages[i];
+			struct page *hmm_page = NULL;
+
+			if (radix_tree_exceptional_entry(page)) {
+				swp_entry_t swap = radix_to_swp_entry(page);
+
+				pvec.pages[i] = hmm_pagecache_page(mapping, swap);
+				hmm_page = page = pvec.pages[i];
+				page_cache_release(hmm_page);
+			}
 
 			/*
 			 * At this point, the page may be truncated or
@@ -2364,20 +2448,24 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd)
 			if (mpd->map.m_len > 0 && mpd->next_page != page->index)
 				goto out;
 
-			lock_page(page);
-			/*
-			 * If the page is no longer dirty, or its mapping no
-			 * longer corresponds to inode we are writing (which
-			 * means it has been truncated or invalidated), or the
-			 * page is already under writeback and we are not doing
-			 * a data integrity writeback, skip the page
-			 */
-			if (!PageDirty(page) ||
-			    (PageWriteback(page) &&
-			     (mpd->wbc->sync_mode == WB_SYNC_NONE)) ||
-			    unlikely(page->mapping != mapping)) {
-				unlock_page(page);
-				continue;
+			if (!hmm_page) {
+				lock_page(page);
+
+				/* If the page is no longer dirty, or its
+				 * mapping no longer corresponds to inode
+				 * we are writing (which means it has been
+				 * truncated or invalidated), or the page
+				 * is already under writeback and we are
+				 * not doing a data integrity writeback,
+				 * skip the page
+				 */
+				if (!PageDirty(page) ||
+				    (PageWriteback(page) &&
+				     (mpd->wbc->sync_mode == WB_SYNC_NONE)) ||
+				    unlikely(page->mapping != mapping)) {
+					unlock_page(page);
+					continue;
+				}
 			}
 
 			wait_on_page_writeback(page);
@@ -2396,11 +2484,37 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd)
 			err = 0;
 			left--;
 		}
+		/* Some entries of pvec might still be exceptional! */
+		for (i = 0; i < nr_pages; i++) {
+			struct page *page = pvec.pages[i];
+
+			if (radix_tree_exceptional_entry(page)) {
+				swp_entry_t swap = radix_to_swp_entry(page);
+
+				page = hmm_pagecache_page(mapping, swap);
+				unlock_page(page);
+				page_cache_release(page);
+				pvec.pages[i] = page;
+			}
+		}
 		pagevec_release(&pvec);
 		cond_resched();
 	}
 	return 0;
out:
+	/* Some entries of pvec might still be exceptional! */
+	for (i = 0; i < nr_pages; i++) {
+		struct page *page = pvec.pages[i];
+
+		if (radix_tree_exceptional_entry(page)) {
+			swp_entry_t swap = radix_to_swp_entry(page);
+
+			page = hmm_pagecache_page(mapping, swap);
+			unlock_page(page);
+			page_cache_release(page);
+			pvec.pages[i] = page;
+		}
+	}
 	pagevec_release(&pvec);
 	return err;
 }
@@ -3281,6 +3395,7 @@ static const struct address_space_operations ext4_aops = {
 	.migratepage		= buffer_migrate_page,
 	.is_partially_uptodate	= block_is_partially_uptodate,
 	.error_remove_page	= generic_error_remove_page,
+	.features		= AOPS_FEATURE_HMM,
 };
 
 static const struct address_space_operations ext4_journalled_aops = {
@@ -3297,6 +3412,7 @@ static const struct address_space_operations ext4_journalled_aops = {
 	.direct_IO		= ext4_direct_IO,
 	.is_partially_uptodate	= block_is_partially_uptodate,
 	.error_remove_page	= generic_error_remove_page,
+	.features		= AOPS_FEATURE_HMM,
 };
 
 static const struct address_space_operations ext4_da_aops = {
@@ -3313,6 +3429,7 @@ static const struct address_space_operations ext4_da_aops = {
 	.migratepage		= buffer_migrate_page,
 	.is_partially_uptodate	= block_is_partially_uptodate,
 	.error_remove_page	= generic_error_remove_page,
+	.features		= AOPS_FEATURE_HMM,
 };
 
 void ext4_set_aops(struct inode *inode)
@@ -3355,11 +3472,20 @@ static int ext4_block_zero_page_range(handle_t *handle,
 	struct page *page;
 	int err = 0;
 
+retry:
 	page = find_or_create_page(mapping, from >> PAGE_CACHE_SHIFT,
 				   mapping_gfp_mask(mapping) & ~__GFP_FS);
 	if (!page)
 		return -ENOMEM;
 
+	if (radix_tree_exception(page)) {
+		swp_entry_t swap = radix_to_swp_entry(page);
+
+		/* FIXME How to handle hmm migration failure? */
+		hmm_pagecache_migrate(mapping, swap);
+		goto retry;
+	}
+
 	blocksize = inode->i_sb->s_blocksize;
 	max = blocksize - (offset & (blocksize - 1));
 
@@ -4529,6 +4655,13 @@ static void ext4_wait_for_tail_page_commit(struct inode *inode)
 				      inode->i_size >> PAGE_CACHE_SHIFT);
 		if (!page)
 			return;
+		if (radix_tree_exception(page)) {
+			swp_entry_t swap = radix_to_swp_entry(page);
+
+			/* FIXME How to handle hmm migration failure? */
+			hmm_pagecache_migrate(inode->i_mapping, swap);
+			continue;
+		}
 		ret = __ext4_journalled_invalidatepage(page, offset,
 						       PAGE_CACHE_SIZE - offset);
 		unlock_page(page);
-- 
1.9.0
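Every hunk above repeats one lookup pattern: a page migrated to remote memory
leaves a radix tree exceptional entry encoding a swp_entry_t in its page cache
slot, and a path that cannot work on remote memory migrates the page back and
retries the lookup. The sketch below condenses that pattern as it appears in
ext4_block_zero_page_range(). It is illustrative only, not part of the patch:
example_get_system_page() is a hypothetical name, and hmm_pagecache_migrate(),
the exceptional-entry encoding, and the assumption that find_or_create_page()
can return such an entry all come from the hmm core patches earlier in this
series.

/*
 * Illustrative sketch, not part of the patch.  A slot holding an hmm
 * exceptional entry is decoded into a swp_entry_t and the page is
 * migrated back to system memory before the lookup is retried.
 */
static struct page *example_get_system_page(struct address_space *mapping,
					    pgoff_t index)
{
	struct page *page;

retry:
	page = find_or_create_page(mapping, index,
				   mapping_gfp_mask(mapping) & ~__GFP_FS);
	if (!page)
		return NULL;

	if (radix_tree_exception(page)) {
		/* Not a struct page but an encoded hmm swap entry. */
		swp_entry_t swap = radix_to_swp_entry(page);

		/* FIXME in the patch: migration failure is unhandled. */
		hmm_pagecache_migrate(mapping, swap);
		goto retry;
	}
	return page;
}

The writeback path is the one deliberate exception to this pattern:
mpage_prepare_extent_to_map() first tries hmm_pagecache_writeback() so the
content can stay in remote memory, and only redoes the lookup when that
returns NULL, i.e. when the remote copy was migrated back.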