Subject: [PATCH 7/7] mm: Batch unmapping of pages that are in swap cache
From: Tim Chen
To: Andrew Morton, Vladimir Davydov, Johannes Weiner, Michal Hocko,
	Minchan Kim, Hugh Dickins
Cc: "Kirill A.Shutemov", Andi Kleen, Aaron Lu, Huang Ying, linux-mm,
	linux-kernel@vger.kernel.org
Date: Tue, 03 May 2016 14:03:36 -0700
Message-ID: <1462309416.21143.14.camel@linux.intel.com>

We created a new function __remove_swap_mapping_batch that allows all
pages under the same swap partition to be removed from the swap cache's
mapping in a single acquisition of the mapping's tree lock.  This
reduces contention on the lock when multiple threads are reclaiming
memory by swapping to the same swap partition.

The handle_pgout_batch function is updated so that all the pages under
the same swap partition are unmapped together once they have been paged
out.

Signed-off-by: Tim Chen
---
 mm/vmscan.c | 426 ++++++++++++++++++++++++++++++++++++++++--------------------
 1 file changed, 286 insertions(+), 140 deletions(-)

diff --git a/mm/vmscan.c b/mm/vmscan.c
index 9fc04e1..5e4b8ce 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -690,6 +690,103 @@ cannot_free:
 	return 0;
 }
 
+/* use this only for swap mapped pages */
+static void __remove_swap_mapping_batch(struct page *pages[],
+					bool reclaimed, short ret[], int nr)
+{
+	unsigned long flags;
+	struct page *page;
+	swp_entry_t swap[SWAP_BATCH];
+	struct address_space *mapping;
+
+	int i, batch_size;
+
+	if (nr <= 0)
+		return;
+
+	while (nr) {
+		mapping = page_mapping(pages[0]);
+		BUG_ON(!mapping);
+
+		batch_size = min(nr, SWAP_BATCH);
+
+		spin_lock_irqsave(&mapping->tree_lock, flags);
+		for (i = 0; i < batch_size; ++i) {
+			page = pages[i];
+
+			BUG_ON(!PageLocked(page));
+			BUG_ON(!PageSwapCache(page));
+			BUG_ON(mapping != page_mapping(page));
+
+			/* stop batching if mapping changes */
+			if (mapping != page_mapping(page)) {
+				batch_size = i;
+				break;
+			}
+			/*
+			 * The non racy check for a busy page.
+			 *
+			 * Must be careful with the order of the tests. When someone has
+			 * a ref to the page, it may be possible that they dirty it then
+			 * drop the reference. So if PageDirty is tested before page_count
+			 * here, then the following race may occur:
+			 *
+			 * get_user_pages(&page);
+			 * [user mapping goes away]
+			 * write_to(page);
+			 *				!PageDirty(page)    [good]
+			 * SetPageDirty(page);
+			 * put_page(page);
+			 *				!page_count(page)   [good, discard it]
+			 *
+			 * [oops, our write_to data is lost]
+			 *
+			 * Reversing the order of the tests ensures such a situation cannot
+			 * escape unnoticed. The smp_rmb is needed to ensure the page->flags
+			 * load is not satisfied before that of page->_count.
+			 *
+			 * Note that if SetPageDirty is always performed via set_page_dirty,
+			 * and thus under tree_lock, then this ordering is not required.
+			 */
+			if (!page_ref_freeze(page, 2))
+				goto cannot_free;
+			/* note: atomic_cmpxchg in page_freeze_refs provides the smp_rmb */
+			if (unlikely(PageDirty(page))) {
+				page_ref_unfreeze(page, 2);
+				goto cannot_free;
+			}
+
+			swap[i].val = page_private(page);
+			__delete_from_swap_cache(page);
+
+			ret[i] = 1;
+			continue;
+
+cannot_free:
+			ret[i] = 0;
+		}
+		spin_unlock_irqrestore(&mapping->tree_lock, flags);
+
+		/* need to keep irq off for mem_cgroup accounting, don't restore flags yet  */
+		local_irq_disable();
+		for (i = 0; i < batch_size; ++i) {
+			if (ret[i]) {
+				page = pages[i];
+				mem_cgroup_swapout(page, swap[i]);
+			}
+		}
+		local_irq_enable();
+
+		for (i = 0; i < batch_size; ++i) {
+			if (ret[i])
+				swapcache_free(swap[i]);
+		}
+		/* advance to next batch */
+		pages += batch_size;
+		ret += batch_size;
+		nr -= batch_size;
+	}
+}
 /*
  * Attempt to detach a locked page from its ->mapping.  If it is dirty or if
  * someone else has a ref on the page, abort and return 0.  If it was
@@ -897,177 +994,226 @@ static void handle_pgout_batch(struct list_head *page_list,
 	int nr)
 {
 	struct address_space *mapping;
+	struct page *umap_pages[SWAP_BATCH];
 	struct page *page;
-	int i;
-
-	for (i = 0; i < nr; ++i) {
-		page = pages[i];
-		mapping =  page_mapping(page);
+	int i, j, batch_size;
+	short umap_ret[SWAP_BATCH], idx[SWAP_BATCH];
+
+	while (nr) {
+		j = 0;
+		batch_size = min(nr, SWAP_BATCH);
+		mapping = NULL;
+
+		for (i = 0; i < batch_size; ++i) {
+			page = pages[i];
+
+			if (mapping) {
+				if (mapping != page_mapping(page)) {
+					/* mapping change, stop batch here */
+					batch_size = i;
+					break;
+				}
+			} else
+				mapping =  page_mapping(page);
 
-		/* check outcome of cache addition */
-		if (!ret[i]) {
-			ret[i] = PG_ACTIVATE_LOCKED;
-			continue;
-		}
-		/*
-		 * The page is mapped into the page tables of one or more
-		 * processes. Try to unmap it here.
-		 */
-		if (page_mapped(page) && mapping) {
-			switch (swap_ret[i] = try_to_unmap(page, lazyfree ?
-				(ttu_flags | TTU_BATCH_FLUSH | TTU_LZFREE) :
-				(ttu_flags | TTU_BATCH_FLUSH))) {
-			case SWAP_FAIL:
+			/* check outcome of cache addition */
+			if (!ret[i]) {
 				ret[i] = PG_ACTIVATE_LOCKED;
 				continue;
-			case SWAP_AGAIN:
-				ret[i] = PG_KEEP_LOCKED;
-				continue;
-			case SWAP_MLOCK:
-				ret[i] = PG_MLOCKED;
-				continue;
-			case SWAP_LZFREE:
-				goto lazyfree;
-			case SWAP_SUCCESS:
-				; /* try to free the page below */
 			}
-		}
-
-		if (PageDirty(page)) {
 			/*
-			 * Only kswapd can writeback filesystem pages to
-			 * avoid risk of stack overflow but only writeback
-			 * if many dirty pages have been encountered.
+			 * The page is mapped into the page tables of one or more
+			 * processes. Try to unmap it here.
 			 */
-			if (page_is_file_cache(page) &&
-			    (!current_is_kswapd() ||
-			     !test_bit(ZONE_DIRTY, &zone->flags))) {
+			if (page_mapped(page) && mapping) {
+				switch (swap_ret[i] = try_to_unmap(page, lazyfree ?
+					(ttu_flags | TTU_BATCH_FLUSH | TTU_LZFREE) :
+					(ttu_flags | TTU_BATCH_FLUSH))) {
+				case SWAP_FAIL:
+					ret[i] = PG_ACTIVATE_LOCKED;
+					continue;
+				case SWAP_AGAIN:
+					ret[i] = PG_KEEP_LOCKED;
+					continue;
+				case SWAP_MLOCK:
+					ret[i] = PG_MLOCKED;
+					continue;
+				case SWAP_LZFREE:
+					goto lazyfree;
+				case SWAP_SUCCESS:
+					; /* try to free the page below */
+				}
+			}
+
+			if (PageDirty(page)) {
 				/*
-				 * Immediately reclaim when written back.
-				 * Similar in principal to deactivate_page()
-				 * except we already have the page isolated
-				 * and know it's dirty
+				 * Only kswapd can writeback filesystem pages to
+				 * avoid risk of stack overflow but only writeback
+				 * if many dirty pages have been encountered.
 				 */
-				inc_zone_page_state(page, NR_VMSCAN_IMMEDIATE);
-				SetPageReclaim(page);
-
-				ret[i] = PG_KEEP_LOCKED;
-				continue;
-			}
+				if (page_is_file_cache(page) &&
+				    (!current_is_kswapd() ||
+				     !test_bit(ZONE_DIRTY, &zone->flags))) {
+					/*
+					 * Immediately reclaim when written back.
+					 * Similar in principal to deactivate_page()
+					 * except we already have the page isolated
+					 * and know it's dirty
+					 */
+					inc_zone_page_state(page, NR_VMSCAN_IMMEDIATE);
+					SetPageReclaim(page);
 
-			if (references == PAGEREF_RECLAIM_CLEAN) {
-				ret[i] = PG_KEEP_LOCKED;
-				continue;
-			}
-			if (!may_enter_fs) {
-				ret[i] = PG_KEEP_LOCKED;
-				continue;
-			}
-			if (!sc->may_writepage) {
-				ret[i] = PG_KEEP_LOCKED;
-				continue;
-			}
+					ret[i] = PG_KEEP_LOCKED;
+					continue;
+				}
 
-			/*
-			 * Page is dirty. Flush the TLB if a writable entry
-			 * potentially exists to avoid CPU writes after IO
-			 * starts and then write it out here.
-			 */
-			try_to_unmap_flush_dirty();
-			switch (pageout(page, mapping, sc)) {
-			case PAGE_KEEP:
-				ret[i] = PG_KEEP_LOCKED;
-				continue;
-			case PAGE_ACTIVATE:
-				ret[i] = PG_ACTIVATE_LOCKED;
-				continue;
-			case PAGE_SUCCESS:
-				if (PageWriteback(page)) {
-					ret[i] = PG_KEEP;
+				if (references == PAGEREF_RECLAIM_CLEAN) {
+					ret[i] = PG_KEEP_LOCKED;
+					continue;
+				}
+				if (!may_enter_fs) {
+					ret[i] = PG_KEEP_LOCKED;
 					continue;
 				}
-				if (PageDirty(page)) {
-					ret[i] = PG_KEEP;
+				if (!sc->may_writepage) {
+					ret[i] = PG_KEEP_LOCKED;
 					continue;
 				}
 
 				/*
-				 * A synchronous write - probably a ramdisk.  Go
-				 * ahead and try to reclaim the page.
+				 * Page is dirty. Flush the TLB if a writable entry
+				 * potentially exists to avoid CPU writes after IO
+				 * starts and then write it out here.
 				 */
-				if (!trylock_page(page)) {
-					ret[i] = PG_KEEP;
-					continue;
-				}
-				if (PageDirty(page) || PageWriteback(page)) {
+				try_to_unmap_flush_dirty();
+				switch (pageout(page, mapping, sc)) {
+				case PAGE_KEEP:
 					ret[i] = PG_KEEP_LOCKED;
 					continue;
+				case PAGE_ACTIVATE:
+					ret[i] = PG_ACTIVATE_LOCKED;
+					continue;
+				case PAGE_SUCCESS:
+					if (PageWriteback(page)) {
+						ret[i] = PG_KEEP;
+						continue;
+					}
+					if (PageDirty(page)) {
+						ret[i] = PG_KEEP;
+						continue;
+					}
+
+					/*
+					 * A synchronous write - probably a ramdisk.  Go
+					 * ahead and try to reclaim the page.
+					 */
+					if (!trylock_page(page)) {
+						ret[i] = PG_KEEP;
+						continue;
+					}
+					if (PageDirty(page) || PageWriteback(page)) {
+						ret[i] = PG_KEEP_LOCKED;
+						continue;
+					}
+					mapping = page_mapping(page);
+				case PAGE_CLEAN:
+					; /* try to free the page below */
 				}
-				mapping = page_mapping(page);
-			case PAGE_CLEAN:
-				; /* try to free the page below */
 			}
-		}
 
-		/*
-		 * If the page has buffers, try to free the buffer mappings
-		 * associated with this page. If we succeed we try to free
-		 * the page as well.
-		 *
-		 * We do this even if the page is PageDirty().
-		 * try_to_release_page() does not perform I/O, but it is
-		 * possible for a page to have PageDirty set, but it is actually
-		 * clean (all its buffers are clean).  This happens if the
-		 * buffers were written out directly, with submit_bh(). ext3
-		 * will do this, as well as the blockdev mapping.
-		 * try_to_release_page() will discover that cleanness and will
-		 * drop the buffers and mark the page clean - it can be freed.
-		 *
-		 * Rarely, pages can have buffers and no ->mapping.  These are
-		 * the pages which were not successfully invalidated in
-		 * truncate_complete_page().  We try to drop those buffers here
-		 * and if that worked, and the page is no longer mapped into
-		 * process address space (page_count == 1) it can be freed.
-		 * Otherwise, leave the page on the LRU so it is swappable.
-		 */
-		if (page_has_private(page)) {
-			if (!try_to_release_page(page, sc->gfp_mask)) {
-				ret[i] = PG_ACTIVATE_LOCKED;
+			/*
+			 * If the page has buffers, try to free the buffer mappings
+			 * associated with this page. If we succeed we try to free
+			 * the page as well.
+			 *
+			 * We do this even if the page is PageDirty().
+			 * try_to_release_page() does not perform I/O, but it is
+			 * possible for a page to have PageDirty set, but it is actually
+			 * clean (all its buffers are clean).  This happens if the
+			 * buffers were written out directly, with submit_bh(). ext3
+			 * will do this, as well as the blockdev mapping.
+			 * try_to_release_page() will discover that cleanness and will
+			 * drop the buffers and mark the page clean - it can be freed.
+			 *
+			 * Rarely, pages can have buffers and no ->mapping.  These are
+			 * the pages which were not successfully invalidated in
+			 * truncate_complete_page().  We try to drop those buffers here
+			 * and if that worked, and the page is no longer mapped into
+			 * process address space (page_count == 1) it can be freed.
+			 * Otherwise, leave the page on the LRU so it is swappable.
+			 */
+			if (page_has_private(page)) {
+				if (!try_to_release_page(page, sc->gfp_mask)) {
+					ret[i] = PG_ACTIVATE_LOCKED;
+					continue;
+				}
+				if (!mapping && page_count(page) == 1) {
+					unlock_page(page);
+					if (put_page_testzero(page)) {
+						ret[i] = PG_FREE;
+						continue;
+					} else {
+						/*
+						 * rare race with speculative reference.
+						 * the speculative reference will free
+						 * this page shortly, so we may
+						 * increment nr_reclaimed (and
+						 * leave it off the LRU).
+						 */
+						ret[i] = PG_SPECULATIVE_REF;
+						continue;
+					}
+				}
+			}
+lazyfree:
+			if (!mapping) {
+				ret[i] = PG_KEEP_LOCKED;
 				continue;
 			}
-			if (!mapping && page_count(page) == 1) {
-				unlock_page(page);
-				if (put_page_testzero(page)) {
-					ret[i] = PG_FREE;
-					continue;
-				} else {
-					/*
-					 * rare race with speculative reference.
-					 * the speculative reference will free
-					 * this page shortly, so we may
-					 * increment nr_reclaimed (and
-					 * leave it off the LRU).
-					 */
-					ret[i] = PG_SPECULATIVE_REF;
+			if (!PageSwapCache(page)) {
+				if (!__remove_mapping(mapping, page, true)) {
+					ret[i] = PG_KEEP_LOCKED;
 					continue;
 				}
+				__ClearPageLocked(page);
+				ret[i] = PG_FREE;
+				continue;
 			}
+
+			/* note pages to be unmapped */
+			ret[i] = PG_UNKNOWN;
+			idx[j] = i;
+			umap_pages[j] = page;
+			++j;
 		}
-lazyfree:
-		if (!mapping || !__remove_mapping(mapping, page, true)) {
-			ret[i] = PG_KEEP_LOCKED;
-			continue;
+
+		/* handle remaining pages that need to be unmapped */
+		__remove_swap_mapping_batch(umap_pages, true, umap_ret, j);
+
+		for (i = 0; i < j; ++i) {
+			if (!umap_ret[i]) {
+				/* unmap failed */
+				ret[idx[i]] = PG_KEEP_LOCKED;
+				continue;
+			}
+
+			page = umap_pages[i];
+			/*
+			 * At this point, we have no other references and there is
+			 * no way to pick any more up (removed from LRU, removed
+			 * from pagecache). Can use non-atomic bitops now (and
+			 * we obviously don't have to worry about waking up a process
+			 * waiting on the page lock, because there are no references.
+			 */
+			__ClearPageLocked(page);
+			ret[idx[i]] = PG_FREE;
 		}
 
-		/*
-		 * At this point, we have no other references and there is
-		 * no way to pick any more up (removed from LRU, removed
-		 * from pagecache). Can use non-atomic bitops now (and
-		 * we obviously don't have to worry about waking up a process
-		 * waiting on the page lock, because there are no references.
-		 */
-		__ClearPageLocked(page);
-		ret[i] = PG_FREE;
+		/* advance pointers to next batch and remaining page count */
+		nr = nr - batch_size;
+		pages += batch_size;
+		ret += batch_size;
+		swap_ret += batch_size;
 	}
 }
 
-- 
2.5.5
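
For readers who want to see the locking pattern in isolation, here is a
minimal, self-contained userspace sketch (plain C with pthreads, not
kernel code) of the idea in the commit message: take the tree lock once
per batch of up to SWAP_BATCH pages instead of once per page.  All names
in the sketch (demo_mapping, nr_entries, remove_one, remove_batch) are
hypothetical stand-ins, and the SWAP_BATCH value of 64 is an assumption
for the sketch, not taken from this series.

/*
 * Illustrative sketch only: hypothetical names, pthread_mutex_t stands
 * in for mapping->tree_lock.
 */
#include <pthread.h>
#include <stdio.h>

#define SWAP_BATCH	64	/* assumed batch size for the sketch */

struct demo_mapping {
	pthread_mutex_t	tree_lock;	/* stand-in for mapping->tree_lock */
	long		nr_entries;	/* stand-in for swap cache entries */
};

/* Unbatched: one lock acquisition per removed entry. */
static void remove_one(struct demo_mapping *m)
{
	pthread_mutex_lock(&m->tree_lock);
	m->nr_entries--;
	pthread_mutex_unlock(&m->tree_lock);
}

/*
 * Batched: one lock acquisition per group of up to SWAP_BATCH entries,
 * mirroring the loop structure of __remove_swap_mapping_batch above.
 */
static void remove_batch(struct demo_mapping *m, int nr)
{
	while (nr) {
		int batch_size = nr < SWAP_BATCH ? nr : SWAP_BATCH;
		int i;

		pthread_mutex_lock(&m->tree_lock);
		for (i = 0; i < batch_size; ++i)
			m->nr_entries--;
		pthread_mutex_unlock(&m->tree_lock);

		nr -= batch_size;
	}
}

int main(void)
{
	struct demo_mapping m = {
		.tree_lock  = PTHREAD_MUTEX_INITIALIZER,
		.nr_entries = 1024,
	};
	int i;

	for (i = 0; i < 512; i++)
		remove_one(&m);		/* 512 lock acquisitions */
	remove_batch(&m, 512);		/* 8 lock acquisitions (512/64) */

	printf("entries left: %ld\n", m.nr_entries);	/* prints 0 */
	return 0;
}

With several reclaim threads swapping to the same partition, the batched
form acquires the contended lock roughly SWAP_BATCH times less often for
the same amount of work, which is the contention reduction the patch is
after.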