* [patch 2/3 v2]swap: make each swap partition have one address_space @ 2013-01-22 2:29 Shaohua Li 2013-01-22 19:49 ` Rik van Riel 2013-01-23 6:16 ` Minchan Kim 0 siblings, 2 replies; 15+ messages in thread From: Shaohua Li @ 2013-01-22 2:29 UTC (permalink / raw) To: linux-mm; +Cc: akpm, hughd, riel, minchan When I use several fast SSD to do swap, swapper_space.tree_lock is heavily contended. This makes each swap partition have one address_space to reduce the lock contention. There is an array of address_space for swap. The swap entry type is the index to the array. In my test with 3 SSD, this increases the swapout throughput 20%. V1->V2: simplify code Signed-off-by: Shaohua Li <shli@fusionio.com> --- fs/proc/meminfo.c | 4 +-- include/linux/swap.h | 9 ++++---- mm/memcontrol.c | 4 +-- mm/mincore.c | 5 ++-- mm/swap.c | 9 ++++++-- mm/swap_state.c | 57 ++++++++++++++++++++++++++++++++++----------------- mm/swapfile.c | 5 ++-- mm/util.c | 10 ++++++-- 8 files changed, 68 insertions(+), 35 deletions(-) Index: linux/include/linux/swap.h =================================================================== --- linux.orig/include/linux/swap.h 2013-01-22 09:13:14.000000000 +0800 +++ linux/include/linux/swap.h 2013-01-22 09:34:44.923011706 +0800 @@ -8,7 +8,7 @@ #include <linux/memcontrol.h> #include <linux/sched.h> #include <linux/node.h> - +#include <linux/fs.h> #include <linux/atomic.h> #include <asm/page.h> @@ -330,8 +330,9 @@ int generic_swapfile_activate(struct swa sector_t *); /* linux/mm/swap_state.c */ -extern struct address_space swapper_space; -#define total_swapcache_pages swapper_space.nrpages +extern struct address_space swapper_spaces[]; +#define swap_address_space(entry) (&swapper_spaces[swp_type(entry)]) +extern unsigned long total_swapcache_pages(void); extern void show_swap_cache_info(void); extern int add_to_swap(struct page *); extern int add_to_swap_cache(struct page *, swp_entry_t, gfp_t); @@ -382,7 +383,7 @@ mem_cgroup_uncharge_swapcache(struct pag #define 
nr_swap_pages 0L #define total_swap_pages 0L -#define total_swapcache_pages 0UL +#define total_swapcache_pages() 0UL #define si_swapinfo(val) \ do { (val)->freeswap = (val)->totalswap = 0; } while (0) Index: linux/mm/memcontrol.c =================================================================== --- linux.orig/mm/memcontrol.c 2013-01-22 09:13:14.000000000 +0800 +++ linux/mm/memcontrol.c 2013-01-22 09:29:29.374977700 +0800 @@ -6279,7 +6279,7 @@ static struct page *mc_handle_swap_pte(s * Because lookup_swap_cache() updates some statistics counter, * we call find_get_page() with swapper_space directly. */ - page = find_get_page(&swapper_space, ent.val); + page = find_get_page(swap_address_space(ent), ent.val); if (do_swap_account) entry->val = ent.val; @@ -6320,7 +6320,7 @@ static struct page *mc_handle_file_pte(s swp_entry_t swap = radix_to_swp_entry(page); if (do_swap_account) *entry = swap; - page = find_get_page(&swapper_space, swap.val); + page = find_get_page(swap_address_space(swap), swap.val); } #endif return page; Index: linux/mm/mincore.c =================================================================== --- linux.orig/mm/mincore.c 2013-01-22 09:13:14.000000000 +0800 +++ linux/mm/mincore.c 2013-01-22 09:29:29.378977649 +0800 @@ -75,7 +75,7 @@ static unsigned char mincore_page(struct /* shmem/tmpfs may return swap: account for swapcache page too. 
*/ if (radix_tree_exceptional_entry(page)) { swp_entry_t swap = radix_to_swp_entry(page); - page = find_get_page(&swapper_space, swap.val); + page = find_get_page(swap_address_space(swap), swap.val); } #endif if (page) { @@ -135,7 +135,8 @@ static void mincore_pte_range(struct vm_ } else { #ifdef CONFIG_SWAP pgoff = entry.val; - *vec = mincore_page(&swapper_space, pgoff); + *vec = mincore_page(swap_address_space(entry), + pgoff); #else WARN_ON(1); *vec = 1; Index: linux/mm/swap.c =================================================================== --- linux.orig/mm/swap.c 2013-01-22 09:13:14.000000000 +0800 +++ linux/mm/swap.c 2013-01-22 09:29:29.378977649 +0800 @@ -855,9 +855,14 @@ EXPORT_SYMBOL(pagevec_lookup_tag); void __init swap_setup(void) { unsigned long megs = totalram_pages >> (20 - PAGE_SHIFT); - #ifdef CONFIG_SWAP - bdi_init(swapper_space.backing_dev_info); + int i; + + for (i = 0; i < MAX_SWAPFILES; i++) { + bdi_init(swapper_spaces[i].backing_dev_info); + spin_lock_init(&swapper_spaces[i].tree_lock); + INIT_LIST_HEAD(&swapper_spaces[i].i_mmap_nonlinear); + } #endif /* Use a smaller cluster for small-memory machines */ Index: linux/mm/swap_state.c =================================================================== --- linux.orig/mm/swap_state.c 2013-01-22 09:13:14.000000000 +0800 +++ linux/mm/swap_state.c 2013-01-22 09:29:29.378977649 +0800 @@ -36,12 +36,12 @@ static struct backing_dev_info swap_back .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK | BDI_CAP_SWAP_BACKED, }; -struct address_space swapper_space = { - .page_tree = RADIX_TREE_INIT(GFP_ATOMIC|__GFP_NOWARN), - .tree_lock = __SPIN_LOCK_UNLOCKED(swapper_space.tree_lock), - .a_ops = &swap_aops, - .i_mmap_nonlinear = LIST_HEAD_INIT(swapper_space.i_mmap_nonlinear), - .backing_dev_info = &swap_backing_dev_info, +struct address_space swapper_spaces[MAX_SWAPFILES] = { + [0 ... 
MAX_SWAPFILES - 1] = { + .page_tree = RADIX_TREE_INIT(GFP_ATOMIC|__GFP_NOWARN), + .a_ops = &swap_aops, + .backing_dev_info = &swap_backing_dev_info, + } }; #define INC_CACHE_INFO(x) do { swap_cache_info.x++; } while (0) @@ -53,9 +53,19 @@ static struct { unsigned long find_total; } swap_cache_info; +unsigned long total_swapcache_pages(void) +{ + int i; + unsigned long ret = 0; + + for (i = 0; i < MAX_SWAPFILES; i++) + ret += swapper_spaces[i].nrpages; + return ret; +} + void show_swap_cache_info(void) { - printk("%lu pages in swap cache\n", total_swapcache_pages); + printk("%lu pages in swap cache\n", total_swapcache_pages()); printk("Swap cache stats: add %lu, delete %lu, find %lu/%lu\n", swap_cache_info.add_total, swap_cache_info.del_total, swap_cache_info.find_success, swap_cache_info.find_total); @@ -70,23 +80,26 @@ void show_swap_cache_info(void) static int __add_to_swap_cache(struct page *page, swp_entry_t entry) { int error; + struct address_space *address_space; VM_BUG_ON(!PageLocked(page)); VM_BUG_ON(PageSwapCache(page)); VM_BUG_ON(!PageSwapBacked(page)); page_cache_get(page); - SetPageSwapCache(page); set_page_private(page, entry.val); + SetPageSwapCache(page); - spin_lock_irq(&swapper_space.tree_lock); - error = radix_tree_insert(&swapper_space.page_tree, entry.val, page); + address_space = swap_address_space(entry); + spin_lock_irq(&address_space->tree_lock); + error = radix_tree_insert(&address_space->page_tree, + entry.val, page); if (likely(!error)) { - total_swapcache_pages++; + address_space->nrpages++; __inc_zone_page_state(page, NR_FILE_PAGES); INC_CACHE_INFO(add_total); } - spin_unlock_irq(&swapper_space.tree_lock); + spin_unlock_irq(&address_space->tree_lock); if (unlikely(error)) { /* @@ -122,14 +135,19 @@ int add_to_swap_cache(struct page *page, */ void __delete_from_swap_cache(struct page *page) { + swp_entry_t entry; + struct address_space *address_space; + VM_BUG_ON(!PageLocked(page)); VM_BUG_ON(!PageSwapCache(page)); 
VM_BUG_ON(PageWriteback(page)); - radix_tree_delete(&swapper_space.page_tree, page_private(page)); + entry.val = page_private(page); + address_space = swap_address_space(entry); + radix_tree_delete(&address_space->page_tree, page_private(page)); set_page_private(page, 0); ClearPageSwapCache(page); - total_swapcache_pages--; + address_space->nrpages--; __dec_zone_page_state(page, NR_FILE_PAGES); INC_CACHE_INFO(del_total); } @@ -195,12 +213,14 @@ int add_to_swap(struct page *page) void delete_from_swap_cache(struct page *page) { swp_entry_t entry; + struct address_space *address_space; entry.val = page_private(page); - spin_lock_irq(&swapper_space.tree_lock); + address_space = swap_address_space(entry); + spin_lock_irq(&address_space->tree_lock); __delete_from_swap_cache(page); - spin_unlock_irq(&swapper_space.tree_lock); + spin_unlock_irq(&address_space->tree_lock); swapcache_free(entry, page); page_cache_release(page); @@ -263,7 +283,7 @@ struct page * lookup_swap_cache(swp_entr { struct page *page; - page = find_get_page(&swapper_space, entry.val); + page = find_get_page(swap_address_space(entry), entry.val); if (page) INC_CACHE_INFO(find_success); @@ -290,7 +310,8 @@ struct page *read_swap_cache_async(swp_e * called after lookup_swap_cache() failed, re-calling * that would confuse statistics. 
*/ - found_page = find_get_page(&swapper_space, entry.val); + found_page = find_get_page(swap_address_space(entry), + entry.val); if (found_page) break; Index: linux/mm/swapfile.c =================================================================== --- linux.orig/mm/swapfile.c 2013-01-22 09:13:14.000000000 +0800 +++ linux/mm/swapfile.c 2013-01-22 09:29:29.378977649 +0800 @@ -79,7 +79,7 @@ __try_to_reclaim_swap(struct swap_info_s struct page *page; int ret = 0; - page = find_get_page(&swapper_space, entry.val); + page = find_get_page(swap_address_space(entry), entry.val); if (!page) return 0; /* @@ -699,7 +699,8 @@ int free_swap_and_cache(swp_entry_t entr p = swap_info_get(entry); if (p) { if (swap_entry_free(p, entry, 1) == SWAP_HAS_CACHE) { - page = find_get_page(&swapper_space, entry.val); + page = find_get_page(swap_address_space(entry), + entry.val); if (page && !trylock_page(page)) { page_cache_release(page); page = NULL; Index: linux/fs/proc/meminfo.c =================================================================== --- linux.orig/fs/proc/meminfo.c 2013-01-22 09:13:14.000000000 +0800 +++ linux/fs/proc/meminfo.c 2013-01-22 09:29:29.378977649 +0800 @@ -40,7 +40,7 @@ static int meminfo_proc_show(struct seq_ * sysctl_overcommit_ratio / 100) + total_swap_pages; cached = global_page_state(NR_FILE_PAGES) - - total_swapcache_pages - i.bufferram; + total_swapcache_pages() - i.bufferram; if (cached < 0) cached = 0; @@ -109,7 +109,7 @@ static int meminfo_proc_show(struct seq_ K(i.freeram), K(i.bufferram), K(cached), - K(total_swapcache_pages), + K(total_swapcache_pages()), K(pages[LRU_ACTIVE_ANON] + pages[LRU_ACTIVE_FILE]), K(pages[LRU_INACTIVE_ANON] + pages[LRU_INACTIVE_FILE]), K(pages[LRU_ACTIVE_ANON]), Index: linux/mm/util.c =================================================================== --- linux.orig/mm/util.c 2013-01-22 09:21:51.000000000 +0800 +++ linux/mm/util.c 2013-01-22 09:30:24.218287858 +0800 @@ -6,6 +6,7 @@ #include <linux/sched.h> #include 
<linux/security.h> #include <linux/swap.h> +#include <linux/swapops.h> #include <asm/uaccess.h> #include "internal.h" @@ -385,9 +386,12 @@ struct address_space *page_mapping(struc VM_BUG_ON(PageSlab(page)); #ifdef CONFIG_SWAP - if (unlikely(PageSwapCache(page))) - mapping = &swapper_space; - else + if (unlikely(PageSwapCache(page))) { + swp_entry_t entry; + + entry.val = page_private(page); + mapping = swap_address_space(entry); + } else #endif if ((unsigned long)mapping & PAGE_MAPPING_ANON) mapping = NULL; -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [patch 2/3 v2]swap: make each swap partition have one address_space 2013-01-22 2:29 [patch 2/3 v2]swap: make each swap partition have one address_space Shaohua Li @ 2013-01-22 19:49 ` Rik van Riel 2013-01-23 6:16 ` Minchan Kim 1 sibling, 0 replies; 15+ messages in thread From: Rik van Riel @ 2013-01-22 19:49 UTC (permalink / raw) To: Shaohua Li; +Cc: linux-mm, akpm, hughd, minchan On 01/21/2013 09:29 PM, Shaohua Li wrote: > > When I use several fast SSD to do swap, swapper_space.tree_lock is heavily > contended. This makes each swap partition have one address_space to reduce the > lock contention. There is an array of address_space for swap. The swap entry > type is the index to the array. > > In my test with 3 SSD, this increases the swapout throughput 20%. > > V1->V2: simplify code > > Signed-off-by: Shaohua Li <shli@fusionio.com> Acked-by: Rik van Riel <riel@redhat.com> -- All rights reversed -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [patch 2/3 v2]swap: make each swap partition have one address_space 2013-01-22 2:29 [patch 2/3 v2]swap: make each swap partition have one address_space Shaohua Li 2013-01-22 19:49 ` Rik van Riel @ 2013-01-23 6:16 ` Minchan Kim 2013-01-23 7:36 ` Shaohua Li 1 sibling, 1 reply; 15+ messages in thread From: Minchan Kim @ 2013-01-23 6:16 UTC (permalink / raw) To: Shaohua Li; +Cc: linux-mm, akpm, hughd, riel Looks good to me. Below just nitpicks. I saw Andrew already took this into mmotm so I'm not sure he or you will do next spin but anyway, review goes. Just nitpicks and a question. On Tue, Jan 22, 2013 at 10:29:51AM +0800, Shaohua Li wrote: > > When I use several fast SSD to do swap, swapper_space.tree_lock is heavily > contended. This makes each swap partition have one address_space to reduce the > lock contention. There is an array of address_space for swap. The swap entry > type is the index to the array. > > In my test with 3 SSD, this increases the swapout throughput 20%. > > V1->V2: simplify code > > Signed-off-by: Shaohua Li <shli@fusionio.com> Acked-by: Minchan Kim <minchan@kernel.org> > --- > fs/proc/meminfo.c | 4 +-- > include/linux/swap.h | 9 ++++---- > mm/memcontrol.c | 4 +-- > mm/mincore.c | 5 ++-- > mm/swap.c | 9 ++++++-- > mm/swap_state.c | 57 ++++++++++++++++++++++++++++++++++----------------- > mm/swapfile.c | 5 ++-- > mm/util.c | 10 ++++++-- > 8 files changed, 68 insertions(+), 35 deletions(-) > > Index: linux/include/linux/swap.h > =================================================================== > --- linux.orig/include/linux/swap.h 2013-01-22 09:13:14.000000000 +0800 > +++ linux/include/linux/swap.h 2013-01-22 09:34:44.923011706 +0800 > @@ -8,7 +8,7 @@ > #include <linux/memcontrol.h> > #include <linux/sched.h> > #include <linux/node.h> > - > +#include <linux/fs.h> > #include <linux/atomic.h> > #include <asm/page.h> > > @@ -330,8 +330,9 @@ int generic_swapfile_activate(struct swa > sector_t *); > > /* linux/mm/swap_state.c */ > -extern 
struct address_space swapper_space; > -#define total_swapcache_pages swapper_space.nrpages > +extern struct address_space swapper_spaces[]; > +#define swap_address_space(entry) (&swapper_spaces[swp_type(entry)]) How about this naming? #define swapper_space(entry) (&swapper_spaces[swp_type(entry)]) > +extern unsigned long total_swapcache_pages(void); > extern void show_swap_cache_info(void); > extern int add_to_swap(struct page *); > extern int add_to_swap_cache(struct page *, swp_entry_t, gfp_t); > @@ -382,7 +383,7 @@ mem_cgroup_uncharge_swapcache(struct pag > > #define nr_swap_pages 0L > #define total_swap_pages 0L > -#define total_swapcache_pages 0UL > +#define total_swapcache_pages() 0UL > > #define si_swapinfo(val) \ > do { (val)->freeswap = (val)->totalswap = 0; } while (0) > Index: linux/mm/memcontrol.c > =================================================================== > --- linux.orig/mm/memcontrol.c 2013-01-22 09:13:14.000000000 +0800 Acked-by: Minchan Kim <minchan@kernel.org> > +++ linux/mm/memcontrol.c 2013-01-22 09:29:29.374977700 +0800 > @@ -6279,7 +6279,7 @@ static struct page *mc_handle_swap_pte(s > * Because lookup_swap_cache() updates some statistics counter, > * we call find_get_page() with swapper_space directly. 
> */ > - page = find_get_page(&swapper_space, ent.val); > + page = find_get_page(swap_address_space(ent), ent.val); > if (do_swap_account) > entry->val = ent.val; > > @@ -6320,7 +6320,7 @@ static struct page *mc_handle_file_pte(s > swp_entry_t swap = radix_to_swp_entry(page); > if (do_swap_account) > *entry = swap; > - page = find_get_page(&swapper_space, swap.val); > + page = find_get_page(swap_address_space(swap), swap.val); > } > #endif > return page; > Index: linux/mm/mincore.c > =================================================================== > --- linux.orig/mm/mincore.c 2013-01-22 09:13:14.000000000 +0800 > +++ linux/mm/mincore.c 2013-01-22 09:29:29.378977649 +0800 > @@ -75,7 +75,7 @@ static unsigned char mincore_page(struct > /* shmem/tmpfs may return swap: account for swapcache page too. */ > if (radix_tree_exceptional_entry(page)) { > swp_entry_t swap = radix_to_swp_entry(page); > - page = find_get_page(&swapper_space, swap.val); > + page = find_get_page(swap_address_space(swap), swap.val); > } > #endif > if (page) { > @@ -135,7 +135,8 @@ static void mincore_pte_range(struct vm_ > } else { > #ifdef CONFIG_SWAP > pgoff = entry.val; > - *vec = mincore_page(&swapper_space, pgoff); > + *vec = mincore_page(swap_address_space(entry), > + pgoff); > #else > WARN_ON(1); > *vec = 1; > Index: linux/mm/swap.c > =================================================================== > --- linux.orig/mm/swap.c 2013-01-22 09:13:14.000000000 +0800 > +++ linux/mm/swap.c 2013-01-22 09:29:29.378977649 +0800 > @@ -855,9 +855,14 @@ EXPORT_SYMBOL(pagevec_lookup_tag); > void __init swap_setup(void) > { > unsigned long megs = totalram_pages >> (20 - PAGE_SHIFT); > - > #ifdef CONFIG_SWAP > - bdi_init(swapper_space.backing_dev_info); > + int i; > + > + for (i = 0; i < MAX_SWAPFILES; i++) { > + bdi_init(swapper_spaces[i].backing_dev_info); > + spin_lock_init(&swapper_spaces[i].tree_lock); > + INIT_LIST_HEAD(&swapper_spaces[i].i_mmap_nonlinear); > + } > #endif > > /* Use a smaller 
cluster for small-memory machines */ > Index: linux/mm/swap_state.c > =================================================================== > --- linux.orig/mm/swap_state.c 2013-01-22 09:13:14.000000000 +0800 > +++ linux/mm/swap_state.c 2013-01-22 09:29:29.378977649 +0800 > @@ -36,12 +36,12 @@ static struct backing_dev_info swap_back > .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK | BDI_CAP_SWAP_BACKED, > }; > > -struct address_space swapper_space = { > - .page_tree = RADIX_TREE_INIT(GFP_ATOMIC|__GFP_NOWARN), > - .tree_lock = __SPIN_LOCK_UNLOCKED(swapper_space.tree_lock), > - .a_ops = &swap_aops, > - .i_mmap_nonlinear = LIST_HEAD_INIT(swapper_space.i_mmap_nonlinear), > - .backing_dev_info = &swap_backing_dev_info, > +struct address_space swapper_spaces[MAX_SWAPFILES] = { > + [0 ... MAX_SWAPFILES - 1] = { > + .page_tree = RADIX_TREE_INIT(GFP_ATOMIC|__GFP_NOWARN), > + .a_ops = &swap_aops, > + .backing_dev_info = &swap_backing_dev_info, > + } > }; > > #define INC_CACHE_INFO(x) do { swap_cache_info.x++; } while (0) > @@ -53,9 +53,19 @@ static struct { > unsigned long find_total; > } swap_cache_info; > > +unsigned long total_swapcache_pages(void) > +{ > + int i; > + unsigned long ret = 0; > + > + for (i = 0; i < MAX_SWAPFILES; i++) > + ret += swapper_spaces[i].nrpages; > + return ret; > +} > + > void show_swap_cache_info(void) > { > - printk("%lu pages in swap cache\n", total_swapcache_pages); > + printk("%lu pages in swap cache\n", total_swapcache_pages()); > printk("Swap cache stats: add %lu, delete %lu, find %lu/%lu\n", > swap_cache_info.add_total, swap_cache_info.del_total, > swap_cache_info.find_success, swap_cache_info.find_total); > @@ -70,23 +80,26 @@ void show_swap_cache_info(void) > static int __add_to_swap_cache(struct page *page, swp_entry_t entry) > { > int error; > + struct address_space *address_space; > > VM_BUG_ON(!PageLocked(page)); > VM_BUG_ON(PageSwapCache(page)); > VM_BUG_ON(!PageSwapBacked(page)); > > page_cache_get(page); > - SetPageSwapCache(page); 
> set_page_private(page, entry.val); > + SetPageSwapCache(page); Why did you move this line? Is there any special reason? > > - spin_lock_irq(&swapper_space.tree_lock); > - error = radix_tree_insert(&swapper_space.page_tree, entry.val, page); > + address_space = swap_address_space(entry); > + spin_lock_irq(&address_space->tree_lock); > + error = radix_tree_insert(&address_space->page_tree, > + entry.val, page); How about introducing utility functions to hold a lock from entry? lock_swapper_space(entry); unlock_swapper_space(entry); -- Kind regards, Minchan Kim -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [patch 2/3 v2]swap: make each swap partition have one address_space 2013-01-23 6:16 ` Minchan Kim @ 2013-01-23 7:36 ` Shaohua Li 2013-01-23 8:04 ` Minchan Kim 0 siblings, 1 reply; 15+ messages in thread From: Shaohua Li @ 2013-01-23 7:36 UTC (permalink / raw) To: Minchan Kim; +Cc: linux-mm, akpm, hughd, riel On Wed, Jan 23, 2013 at 03:16:45PM +0900, Minchan Kim wrote: > Looks good to me. Below just nitpicks. > I saw Andrew already took this into mmotm so I'm not sure he or you will do > next spin but anyway, review goes. Just nitpicks and a question. > > On Tue, Jan 22, 2013 at 10:29:51AM +0800, Shaohua Li wrote: > > > > When I use several fast SSD to do swap, swapper_space.tree_lock is heavily > > contended. This makes each swap partition have one address_space to reduce the > > lock contention. There is an array of address_space for swap. The swap entry > > type is the index to the array. > > > > In my test with 3 SSD, this increases the swapout throughput 20%. > > > > V1->V2: simplify code > > > > Signed-off-by: Shaohua Li <shli@fusionio.com> > > Acked-by: Minchan Kim <minchan@kernel.org> > > > --- > > fs/proc/meminfo.c | 4 +-- > > include/linux/swap.h | 9 ++++---- > > mm/memcontrol.c | 4 +-- > > mm/mincore.c | 5 ++-- > > mm/swap.c | 9 ++++++-- > > mm/swap_state.c | 57 ++++++++++++++++++++++++++++++++++----------------- > > mm/swapfile.c | 5 ++-- > > mm/util.c | 10 ++++++-- > > 8 files changed, 68 insertions(+), 35 deletions(-) > > > > Index: linux/include/linux/swap.h > > =================================================================== > > --- linux.orig/include/linux/swap.h 2013-01-22 09:13:14.000000000 +0800 > > +++ linux/include/linux/swap.h 2013-01-22 09:34:44.923011706 +0800 > > @@ -8,7 +8,7 @@ > > #include <linux/memcontrol.h> > > #include <linux/sched.h> > > #include <linux/node.h> > > - > > +#include <linux/fs.h> > > #include <linux/atomic.h> > > #include <asm/page.h> > > > > @@ -330,8 +330,9 @@ int generic_swapfile_activate(struct swa > > 
sector_t *); > > > > /* linux/mm/swap_state.c */ > > -extern struct address_space swapper_space; > > -#define total_swapcache_pages swapper_space.nrpages > > +extern struct address_space swapper_spaces[]; > > +#define swap_address_space(entry) (&swapper_spaces[swp_type(entry)]) > > How about this naming? > > #define swapper_space(entry) (&swapper_spaces[swp_type(entry)]) > > > +extern unsigned long total_swapcache_pages(void); > > extern void show_swap_cache_info(void); > > extern int add_to_swap(struct page *); > > extern int add_to_swap_cache(struct page *, swp_entry_t, gfp_t); > > @@ -382,7 +383,7 @@ mem_cgroup_uncharge_swapcache(struct pag > > > > #define nr_swap_pages 0L > > #define total_swap_pages 0L > > -#define total_swapcache_pages 0UL > > +#define total_swapcache_pages() 0UL > > > > #define si_swapinfo(val) \ > > do { (val)->freeswap = (val)->totalswap = 0; } while (0) > > Index: linux/mm/memcontrol.c > > =================================================================== > > --- linux.orig/mm/memcontrol.c 2013-01-22 09:13:14.000000000 +0800 > Acked-by: Minchan Kim <minchan@kernel.org> > > > +++ linux/mm/memcontrol.c 2013-01-22 09:29:29.374977700 +0800 > > @@ -6279,7 +6279,7 @@ static struct page *mc_handle_swap_pte(s > > * Because lookup_swap_cache() updates some statistics counter, > > * we call find_get_page() with swapper_space directly. 
> > */ > > - page = find_get_page(&swapper_space, ent.val); > > + page = find_get_page(swap_address_space(ent), ent.val); > > if (do_swap_account) > > entry->val = ent.val; > > > > @@ -6320,7 +6320,7 @@ static struct page *mc_handle_file_pte(s > > swp_entry_t swap = radix_to_swp_entry(page); > > if (do_swap_account) > > *entry = swap; > > - page = find_get_page(&swapper_space, swap.val); > > + page = find_get_page(swap_address_space(swap), swap.val); > > } > > #endif > > return page; > > Index: linux/mm/mincore.c > > =================================================================== > > --- linux.orig/mm/mincore.c 2013-01-22 09:13:14.000000000 +0800 > > +++ linux/mm/mincore.c 2013-01-22 09:29:29.378977649 +0800 > > @@ -75,7 +75,7 @@ static unsigned char mincore_page(struct > > /* shmem/tmpfs may return swap: account for swapcache page too. */ > > if (radix_tree_exceptional_entry(page)) { > > swp_entry_t swap = radix_to_swp_entry(page); > > - page = find_get_page(&swapper_space, swap.val); > > + page = find_get_page(swap_address_space(swap), swap.val); > > } > > #endif > > if (page) { > > @@ -135,7 +135,8 @@ static void mincore_pte_range(struct vm_ > > } else { > > #ifdef CONFIG_SWAP > > pgoff = entry.val; > > - *vec = mincore_page(&swapper_space, pgoff); > > + *vec = mincore_page(swap_address_space(entry), > > + pgoff); > > #else > > WARN_ON(1); > > *vec = 1; > > Index: linux/mm/swap.c > > =================================================================== > > --- linux.orig/mm/swap.c 2013-01-22 09:13:14.000000000 +0800 > > +++ linux/mm/swap.c 2013-01-22 09:29:29.378977649 +0800 > > @@ -855,9 +855,14 @@ EXPORT_SYMBOL(pagevec_lookup_tag); > > void __init swap_setup(void) > > { > > unsigned long megs = totalram_pages >> (20 - PAGE_SHIFT); > > - > > #ifdef CONFIG_SWAP > > - bdi_init(swapper_space.backing_dev_info); > > + int i; > > + > > + for (i = 0; i < MAX_SWAPFILES; i++) { > > + bdi_init(swapper_spaces[i].backing_dev_info); > > + 
spin_lock_init(&swapper_spaces[i].tree_lock); > > + INIT_LIST_HEAD(&swapper_spaces[i].i_mmap_nonlinear); > > + } > > #endif > > > > /* Use a smaller cluster for small-memory machines */ > > Index: linux/mm/swap_state.c > > =================================================================== > > --- linux.orig/mm/swap_state.c 2013-01-22 09:13:14.000000000 +0800 > > +++ linux/mm/swap_state.c 2013-01-22 09:29:29.378977649 +0800 > > @@ -36,12 +36,12 @@ static struct backing_dev_info swap_back > > .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK | BDI_CAP_SWAP_BACKED, > > }; > > > > -struct address_space swapper_space = { > > - .page_tree = RADIX_TREE_INIT(GFP_ATOMIC|__GFP_NOWARN), > > - .tree_lock = __SPIN_LOCK_UNLOCKED(swapper_space.tree_lock), > > - .a_ops = &swap_aops, > > - .i_mmap_nonlinear = LIST_HEAD_INIT(swapper_space.i_mmap_nonlinear), > > - .backing_dev_info = &swap_backing_dev_info, > > +struct address_space swapper_spaces[MAX_SWAPFILES] = { > > + [0 ... MAX_SWAPFILES - 1] = { > > + .page_tree = RADIX_TREE_INIT(GFP_ATOMIC|__GFP_NOWARN), > > + .a_ops = &swap_aops, > > + .backing_dev_info = &swap_backing_dev_info, > > + } > > }; > > > > #define INC_CACHE_INFO(x) do { swap_cache_info.x++; } while (0) > > @@ -53,9 +53,19 @@ static struct { > > unsigned long find_total; > > } swap_cache_info; > > > > +unsigned long total_swapcache_pages(void) > > +{ > > + int i; > > + unsigned long ret = 0; > > + > > + for (i = 0; i < MAX_SWAPFILES; i++) > > + ret += swapper_spaces[i].nrpages; > > + return ret; > > +} > > + > > void show_swap_cache_info(void) > > { > > - printk("%lu pages in swap cache\n", total_swapcache_pages); > > + printk("%lu pages in swap cache\n", total_swapcache_pages()); > > printk("Swap cache stats: add %lu, delete %lu, find %lu/%lu\n", > > swap_cache_info.add_total, swap_cache_info.del_total, > > swap_cache_info.find_success, swap_cache_info.find_total); > > @@ -70,23 +80,26 @@ void show_swap_cache_info(void) > > static int __add_to_swap_cache(struct page 
*page, swp_entry_t entry) > > { > > int error; > > + struct address_space *address_space; > > > > VM_BUG_ON(!PageLocked(page)); > > VM_BUG_ON(PageSwapCache(page)); > > VM_BUG_ON(!PageSwapBacked(page)); > > > > page_cache_get(page); > > - SetPageSwapCache(page); > > set_page_private(page, entry.val); > > + SetPageSwapCache(page); > > Why did you move this line? Is there any special reason? Originally I'm afraid page_mapping() gets invalid page_private(), but I then realized we hold page lock. There are some places we don't hold page lock. either such page isn't swap page or the caller can tolerate race. I forgot removing this change in the patch. But I certainly can be wrong. We can add memory barrier if required. Thanks, Shaohua -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [patch 2/3 v2]swap: make each swap partition have one address_space 2013-01-23 7:36 ` Shaohua Li @ 2013-01-23 8:04 ` Minchan Kim 2013-01-24 1:39 ` Simon Jeons 0 siblings, 1 reply; 15+ messages in thread From: Minchan Kim @ 2013-01-23 8:04 UTC (permalink / raw) To: Shaohua Li; +Cc: linux-mm, akpm, hughd, riel On Wed, Jan 23, 2013 at 03:36:55PM +0800, Shaohua Li wrote: > On Wed, Jan 23, 2013 at 03:16:45PM +0900, Minchan Kim wrote: > > Looks good to me. Below just nitpicks. > > I saw Andrew already took this into mmotm so I'm not sure he or you will do > > next spin but anyway, review goes. Just nitpicks and a question. > > > > On Tue, Jan 22, 2013 at 10:29:51AM +0800, Shaohua Li wrote: > > > > > > When I use several fast SSD to do swap, swapper_space.tree_lock is heavily > > > contended. This makes each swap partition have one address_space to reduce the > > > lock contention. There is an array of address_space for swap. The swap entry > > > type is the index to the array. > > > > > > In my test with 3 SSD, this increases the swapout throughput 20%. 
> > > > > > V1->V2: simplify code > > > > > > Signed-off-by: Shaohua Li <shli@fusionio.com> > > > > Acked-by: Minchan Kim <minchan@kernel.org> > > > > > --- > > > fs/proc/meminfo.c | 4 +-- > > > include/linux/swap.h | 9 ++++---- > > > mm/memcontrol.c | 4 +-- > > > mm/mincore.c | 5 ++-- > > > mm/swap.c | 9 ++++++-- > > > mm/swap_state.c | 57 ++++++++++++++++++++++++++++++++++----------------- > > > mm/swapfile.c | 5 ++-- > > > mm/util.c | 10 ++++++-- > > > 8 files changed, 68 insertions(+), 35 deletions(-) > > > > > > Index: linux/include/linux/swap.h > > > =================================================================== > > > --- linux.orig/include/linux/swap.h 2013-01-22 09:13:14.000000000 +0800 > > > +++ linux/include/linux/swap.h 2013-01-22 09:34:44.923011706 +0800 > > > @@ -8,7 +8,7 @@ > > > #include <linux/memcontrol.h> > > > #include <linux/sched.h> > > > #include <linux/node.h> > > > - > > > +#include <linux/fs.h> > > > #include <linux/atomic.h> > > > #include <asm/page.h> > > > > > > @@ -330,8 +330,9 @@ int generic_swapfile_activate(struct swa > > > sector_t *); > > > > > > /* linux/mm/swap_state.c */ > > > -extern struct address_space swapper_space; > > > -#define total_swapcache_pages swapper_space.nrpages > > > +extern struct address_space swapper_spaces[]; > > > +#define swap_address_space(entry) (&swapper_spaces[swp_type(entry)]) > > > > How about this naming? 
> > > > #define swapper_space(entry) (&swapper_spaces[swp_type(entry)]) > > > > > +extern unsigned long total_swapcache_pages(void); > > > extern void show_swap_cache_info(void); > > > extern int add_to_swap(struct page *); > > > extern int add_to_swap_cache(struct page *, swp_entry_t, gfp_t); > > > @@ -382,7 +383,7 @@ mem_cgroup_uncharge_swapcache(struct pag > > > > > > #define nr_swap_pages 0L > > > #define total_swap_pages 0L > > > -#define total_swapcache_pages 0UL > > > +#define total_swapcache_pages() 0UL > > > > > > #define si_swapinfo(val) \ > > > do { (val)->freeswap = (val)->totalswap = 0; } while (0) > > > Index: linux/mm/memcontrol.c > > > =================================================================== > > > --- linux.orig/mm/memcontrol.c 2013-01-22 09:13:14.000000000 +0800 > > Acked-by: Minchan Kim <minchan@kernel.org> > > > > > +++ linux/mm/memcontrol.c 2013-01-22 09:29:29.374977700 +0800 > > > @@ -6279,7 +6279,7 @@ static struct page *mc_handle_swap_pte(s > > > * Because lookup_swap_cache() updates some statistics counter, > > > * we call find_get_page() with swapper_space directly. 
> > > */ > > > - page = find_get_page(&swapper_space, ent.val); > > > + page = find_get_page(swap_address_space(ent), ent.val); > > > if (do_swap_account) > > > entry->val = ent.val; > > > > > > @@ -6320,7 +6320,7 @@ static struct page *mc_handle_file_pte(s > > > swp_entry_t swap = radix_to_swp_entry(page); > > > if (do_swap_account) > > > *entry = swap; > > > - page = find_get_page(&swapper_space, swap.val); > > > + page = find_get_page(swap_address_space(swap), swap.val); > > > } > > > #endif > > > return page; > > > Index: linux/mm/mincore.c > > > =================================================================== > > > --- linux.orig/mm/mincore.c 2013-01-22 09:13:14.000000000 +0800 > > > +++ linux/mm/mincore.c 2013-01-22 09:29:29.378977649 +0800 > > > @@ -75,7 +75,7 @@ static unsigned char mincore_page(struct > > > /* shmem/tmpfs may return swap: account for swapcache page too. */ > > > if (radix_tree_exceptional_entry(page)) { > > > swp_entry_t swap = radix_to_swp_entry(page); > > > - page = find_get_page(&swapper_space, swap.val); > > > + page = find_get_page(swap_address_space(swap), swap.val); > > > } > > > #endif > > > if (page) { > > > @@ -135,7 +135,8 @@ static void mincore_pte_range(struct vm_ > > > } else { > > > #ifdef CONFIG_SWAP > > > pgoff = entry.val; > > > - *vec = mincore_page(&swapper_space, pgoff); > > > + *vec = mincore_page(swap_address_space(entry), > > > + pgoff); > > > #else > > > WARN_ON(1); > > > *vec = 1; > > > Index: linux/mm/swap.c > > > =================================================================== > > > --- linux.orig/mm/swap.c 2013-01-22 09:13:14.000000000 +0800 > > > +++ linux/mm/swap.c 2013-01-22 09:29:29.378977649 +0800 > > > @@ -855,9 +855,14 @@ EXPORT_SYMBOL(pagevec_lookup_tag); > > > void __init swap_setup(void) > > > { > > > unsigned long megs = totalram_pages >> (20 - PAGE_SHIFT); > > > - > > > #ifdef CONFIG_SWAP > > > - bdi_init(swapper_space.backing_dev_info); > > > + int i; > > > + > > > + for (i = 0; i < 
MAX_SWAPFILES; i++) { > > > + bdi_init(swapper_spaces[i].backing_dev_info); > > > + spin_lock_init(&swapper_spaces[i].tree_lock); > > > + INIT_LIST_HEAD(&swapper_spaces[i].i_mmap_nonlinear); > > > + } > > > #endif > > > > > > /* Use a smaller cluster for small-memory machines */ > > > Index: linux/mm/swap_state.c > > > =================================================================== > > > --- linux.orig/mm/swap_state.c 2013-01-22 09:13:14.000000000 +0800 > > > +++ linux/mm/swap_state.c 2013-01-22 09:29:29.378977649 +0800 > > > @@ -36,12 +36,12 @@ static struct backing_dev_info swap_back > > > .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK | BDI_CAP_SWAP_BACKED, > > > }; > > > > > > -struct address_space swapper_space = { > > > - .page_tree = RADIX_TREE_INIT(GFP_ATOMIC|__GFP_NOWARN), > > > - .tree_lock = __SPIN_LOCK_UNLOCKED(swapper_space.tree_lock), > > > - .a_ops = &swap_aops, > > > - .i_mmap_nonlinear = LIST_HEAD_INIT(swapper_space.i_mmap_nonlinear), > > > - .backing_dev_info = &swap_backing_dev_info, > > > +struct address_space swapper_spaces[MAX_SWAPFILES] = { > > > + [0 ... 
MAX_SWAPFILES - 1] = { > > > + .page_tree = RADIX_TREE_INIT(GFP_ATOMIC|__GFP_NOWARN), > > > + .a_ops = &swap_aops, > > > + .backing_dev_info = &swap_backing_dev_info, > > > + } > > > }; > > > > > > #define INC_CACHE_INFO(x) do { swap_cache_info.x++; } while (0) > > > @@ -53,9 +53,19 @@ static struct { > > > unsigned long find_total; > > > } swap_cache_info; > > > > > > +unsigned long total_swapcache_pages(void) > > > +{ > > > + int i; > > > + unsigned long ret = 0; > > > + > > > + for (i = 0; i < MAX_SWAPFILES; i++) > > > + ret += swapper_spaces[i].nrpages; > > > + return ret; > > > +} > > > + > > > void show_swap_cache_info(void) > > > { > > > - printk("%lu pages in swap cache\n", total_swapcache_pages); > > > + printk("%lu pages in swap cache\n", total_swapcache_pages()); > > > printk("Swap cache stats: add %lu, delete %lu, find %lu/%lu\n", > > > swap_cache_info.add_total, swap_cache_info.del_total, > > > swap_cache_info.find_success, swap_cache_info.find_total); > > > @@ -70,23 +80,26 @@ void show_swap_cache_info(void) > > > static int __add_to_swap_cache(struct page *page, swp_entry_t entry) > > > { > > > int error; > > > + struct address_space *address_space; > > > > > > VM_BUG_ON(!PageLocked(page)); > > > VM_BUG_ON(PageSwapCache(page)); > > > VM_BUG_ON(!PageSwapBacked(page)); > > > > > > page_cache_get(page); > > > - SetPageSwapCache(page); > > > set_page_private(page, entry.val); > > > + SetPageSwapCache(page); > > > > Why did you move this line? Is there any special reason? > > Originally I'm afraid page_mapping() gets invalid page_private(), but I then > realized we hold page lock. There are some places we don't hold page lock. > either such page isn't swap page or the caller can tolerate race. I forgot > removing this change in the patch. But I certainly can be wrong. We can add > memory barrier if required. Yeb. While I reviewed the patch, I was concern about that but I fail to find a problem, too. 
But maybe it's valuable to add comment about that race (!PageSwapCache(page) but page_mapping could return swapper_space) in page_mapping. So if some problem happens in the future, people could help to find culprit. Could you respin this with adding the comment and removing above unnecessary moving? > > Thanks, > Shaohua > > -- > To unsubscribe, send a message with 'unsubscribe linux-mm' in > the body to majordomo@kvack.org. For more info on Linux MM, > see: http://www.linux-mm.org/ . > Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> -- Kind regards, Minchan Kim -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [patch 2/3 v2]swap: make each swap partition have one address_space 2013-01-23 8:04 ` Minchan Kim @ 2013-01-24 1:39 ` Simon Jeons 2013-01-24 2:22 ` Minchan Kim 0 siblings, 1 reply; 15+ messages in thread From: Simon Jeons @ 2013-01-24 1:39 UTC (permalink / raw) To: Minchan Kim; +Cc: Shaohua Li, linux-mm, akpm, hughd, riel On Wed, 2013-01-23 at 17:04 +0900, Minchan Kim wrote: > On Wed, Jan 23, 2013 at 03:36:55PM +0800, Shaohua Li wrote: > > On Wed, Jan 23, 2013 at 03:16:45PM +0900, Minchan Kim wrote: > > > Looks good to me. Below just nitpicks. > > > I saw Andrew already took this into mmotm so I'm not sure he or you will do > > > next spin but anyway, review goes. Just nitpicks and a question. > > > > > > On Tue, Jan 22, 2013 at 10:29:51AM +0800, Shaohua Li wrote: > > > > > > > > When I use several fast SSD to do swap, swapper_space.tree_lock is heavily > > > > contended. This makes each swap partition have one address_space to reduce the > > > > lock contention. There is an array of address_space for swap. The swap entry > > > > type is the index to the array. > > > > > > > > In my test with 3 SSD, this increases the swapout throughput 20%. 
> > > > > > > > V1->V2: simplify code > > > > > > > > Signed-off-by: Shaohua Li <shli@fusionio.com> > > > > > > Acked-by: Minchan Kim <minchan@kernel.org> > > > > > > > --- > > > > fs/proc/meminfo.c | 4 +-- > > > > include/linux/swap.h | 9 ++++---- > > > > mm/memcontrol.c | 4 +-- > > > > mm/mincore.c | 5 ++-- > > > > mm/swap.c | 9 ++++++-- > > > > mm/swap_state.c | 57 ++++++++++++++++++++++++++++++++++----------------- > > > > mm/swapfile.c | 5 ++-- > > > > mm/util.c | 10 ++++++-- > > > > 8 files changed, 68 insertions(+), 35 deletions(-) > > > > > > > > Index: linux/include/linux/swap.h > > > > =================================================================== > > > > --- linux.orig/include/linux/swap.h 2013-01-22 09:13:14.000000000 +0800 > > > > +++ linux/include/linux/swap.h 2013-01-22 09:34:44.923011706 +0800 > > > > @@ -8,7 +8,7 @@ > > > > #include <linux/memcontrol.h> > > > > #include <linux/sched.h> > > > > #include <linux/node.h> > > > > - > > > > +#include <linux/fs.h> > > > > #include <linux/atomic.h> > > > > #include <asm/page.h> > > > > > > > > @@ -330,8 +330,9 @@ int generic_swapfile_activate(struct swa > > > > sector_t *); > > > > > > > > /* linux/mm/swap_state.c */ > > > > -extern struct address_space swapper_space; > > > > -#define total_swapcache_pages swapper_space.nrpages > > > > +extern struct address_space swapper_spaces[]; > > > > +#define swap_address_space(entry) (&swapper_spaces[swp_type(entry)]) > > > > > > How about this naming? 
> > > > > > #define swapper_space(entry) (&swapper_spaces[swp_type(entry)]) > > > > > > > +extern unsigned long total_swapcache_pages(void); > > > > extern void show_swap_cache_info(void); > > > > extern int add_to_swap(struct page *); > > > > extern int add_to_swap_cache(struct page *, swp_entry_t, gfp_t); > > > > @@ -382,7 +383,7 @@ mem_cgroup_uncharge_swapcache(struct pag > > > > > > > > #define nr_swap_pages 0L > > > > #define total_swap_pages 0L > > > > -#define total_swapcache_pages 0UL > > > > +#define total_swapcache_pages() 0UL > > > > > > > > #define si_swapinfo(val) \ > > > > do { (val)->freeswap = (val)->totalswap = 0; } while (0) > > > > Index: linux/mm/memcontrol.c > > > > =================================================================== > > > > --- linux.orig/mm/memcontrol.c 2013-01-22 09:13:14.000000000 +0800 > > > Acked-by: Minchan Kim <minchan@kernel.org> > > > > > > > +++ linux/mm/memcontrol.c 2013-01-22 09:29:29.374977700 +0800 > > > > @@ -6279,7 +6279,7 @@ static struct page *mc_handle_swap_pte(s > > > > * Because lookup_swap_cache() updates some statistics counter, > > > > * we call find_get_page() with swapper_space directly. 
> > > > */ > > > > - page = find_get_page(&swapper_space, ent.val); > > > > + page = find_get_page(swap_address_space(ent), ent.val); > > > > if (do_swap_account) > > > > entry->val = ent.val; > > > > > > > > @@ -6320,7 +6320,7 @@ static struct page *mc_handle_file_pte(s > > > > swp_entry_t swap = radix_to_swp_entry(page); > > > > if (do_swap_account) > > > > *entry = swap; > > > > - page = find_get_page(&swapper_space, swap.val); > > > > + page = find_get_page(swap_address_space(swap), swap.val); > > > > } > > > > #endif > > > > return page; > > > > Index: linux/mm/mincore.c > > > > =================================================================== > > > > --- linux.orig/mm/mincore.c 2013-01-22 09:13:14.000000000 +0800 > > > > +++ linux/mm/mincore.c 2013-01-22 09:29:29.378977649 +0800 > > > > @@ -75,7 +75,7 @@ static unsigned char mincore_page(struct > > > > /* shmem/tmpfs may return swap: account for swapcache page too. */ > > > > if (radix_tree_exceptional_entry(page)) { > > > > swp_entry_t swap = radix_to_swp_entry(page); > > > > - page = find_get_page(&swapper_space, swap.val); > > > > + page = find_get_page(swap_address_space(swap), swap.val); > > > > } > > > > #endif > > > > if (page) { > > > > @@ -135,7 +135,8 @@ static void mincore_pte_range(struct vm_ > > > > } else { > > > > #ifdef CONFIG_SWAP > > > > pgoff = entry.val; > > > > - *vec = mincore_page(&swapper_space, pgoff); > > > > + *vec = mincore_page(swap_address_space(entry), > > > > + pgoff); > > > > #else > > > > WARN_ON(1); > > > > *vec = 1; > > > > Index: linux/mm/swap.c > > > > =================================================================== > > > > --- linux.orig/mm/swap.c 2013-01-22 09:13:14.000000000 +0800 > > > > +++ linux/mm/swap.c 2013-01-22 09:29:29.378977649 +0800 > > > > @@ -855,9 +855,14 @@ EXPORT_SYMBOL(pagevec_lookup_tag); > > > > void __init swap_setup(void) > > > > { > > > > unsigned long megs = totalram_pages >> (20 - PAGE_SHIFT); > > > > - > > > > #ifdef CONFIG_SWAP > > > > - 
bdi_init(swapper_space.backing_dev_info); > > > > + int i; > > > > + > > > > + for (i = 0; i < MAX_SWAPFILES; i++) { > > > > + bdi_init(swapper_spaces[i].backing_dev_info); > > > > + spin_lock_init(&swapper_spaces[i].tree_lock); > > > > + INIT_LIST_HEAD(&swapper_spaces[i].i_mmap_nonlinear); > > > > + } > > > > #endif > > > > > > > > /* Use a smaller cluster for small-memory machines */ > > > > Index: linux/mm/swap_state.c > > > > =================================================================== > > > > --- linux.orig/mm/swap_state.c 2013-01-22 09:13:14.000000000 +0800 > > > > +++ linux/mm/swap_state.c 2013-01-22 09:29:29.378977649 +0800 > > > > @@ -36,12 +36,12 @@ static struct backing_dev_info swap_back > > > > .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK | BDI_CAP_SWAP_BACKED, > > > > }; > > > > > > > > -struct address_space swapper_space = { > > > > - .page_tree = RADIX_TREE_INIT(GFP_ATOMIC|__GFP_NOWARN), > > > > - .tree_lock = __SPIN_LOCK_UNLOCKED(swapper_space.tree_lock), > > > > - .a_ops = &swap_aops, > > > > - .i_mmap_nonlinear = LIST_HEAD_INIT(swapper_space.i_mmap_nonlinear), > > > > - .backing_dev_info = &swap_backing_dev_info, > > > > +struct address_space swapper_spaces[MAX_SWAPFILES] = { > > > > + [0 ... 
MAX_SWAPFILES - 1] = { > > > > + .page_tree = RADIX_TREE_INIT(GFP_ATOMIC|__GFP_NOWARN), > > > > + .a_ops = &swap_aops, > > > > + .backing_dev_info = &swap_backing_dev_info, > > > > + } > > > > }; > > > > > > > > #define INC_CACHE_INFO(x) do { swap_cache_info.x++; } while (0) > > > > @@ -53,9 +53,19 @@ static struct { > > > > unsigned long find_total; > > > > } swap_cache_info; > > > > > > > > +unsigned long total_swapcache_pages(void) > > > > +{ > > > > + int i; > > > > + unsigned long ret = 0; > > > > + > > > > + for (i = 0; i < MAX_SWAPFILES; i++) > > > > + ret += swapper_spaces[i].nrpages; > > > > + return ret; > > > > +} > > > > + > > > > void show_swap_cache_info(void) > > > > { > > > > - printk("%lu pages in swap cache\n", total_swapcache_pages); > > > > + printk("%lu pages in swap cache\n", total_swapcache_pages()); > > > > printk("Swap cache stats: add %lu, delete %lu, find %lu/%lu\n", > > > > swap_cache_info.add_total, swap_cache_info.del_total, > > > > swap_cache_info.find_success, swap_cache_info.find_total); > > > > @@ -70,23 +80,26 @@ void show_swap_cache_info(void) > > > > static int __add_to_swap_cache(struct page *page, swp_entry_t entry) > > > > { > > > > int error; > > > > + struct address_space *address_space; > > > > > > > > VM_BUG_ON(!PageLocked(page)); > > > > VM_BUG_ON(PageSwapCache(page)); > > > > VM_BUG_ON(!PageSwapBacked(page)); > > > > > > > > page_cache_get(page); > > > > - SetPageSwapCache(page); > > > > set_page_private(page, entry.val); > > > > + SetPageSwapCache(page); > > > > > > Why did you move this line? Is there any special reason? > > > > Originally I'm afraid page_mapping() gets invalid page_private(), but I then > > realized we hold page lock. There are some places we don't hold page lock. > > either such page isn't swap page or the caller can tolerate race. I forgot > > removing this change in the patch. But I certainly can be wrong. We can add > > memory barrier if required. > > Yeb. 
> > While I reviewed the patch, I was concern about that but I fail to find > a problem, too. But maybe it's valuable to add comment about that race > (!PageSwapCache(page) but page_mapping could return swapper_space) in page_mapping. Hi Minchan, I think the race Shaohua mentioned is the opposite: PageSwapCache(page) is set, but page_mapping() couldn't return swapper_space, since page_mapping() gets an invalid page_private(). > So if some problem happens in the future, people could help to find culprit. > > Could you respin this with adding the comment and removing above unnecessary moving? > > > > > Thanks, > > Shaohua > > > > -- > > To unsubscribe, send a message with 'unsubscribe linux-mm' in > > the body to majordomo@kvack.org. For more info on Linux MM, > > see: http://www.linux-mm.org/ . > > Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> > -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [patch 2/3 v2]swap: make each swap partition have one address_space 2013-01-24 1:39 ` Simon Jeons @ 2013-01-24 2:22 ` Minchan Kim 2013-01-24 2:43 ` Shaohua Li 0 siblings, 1 reply; 15+ messages in thread From: Minchan Kim @ 2013-01-24 2:22 UTC (permalink / raw) To: Simon Jeons; +Cc: Shaohua Li, linux-mm, akpm, hughd, riel Hi Simon, On Wed, Jan 23, 2013 at 07:39:56PM -0600, Simon Jeons wrote: > On Wed, 2013-01-23 at 17:04 +0900, Minchan Kim wrote: > > On Wed, Jan 23, 2013 at 03:36:55PM +0800, Shaohua Li wrote: > > > On Wed, Jan 23, 2013 at 03:16:45PM +0900, Minchan Kim wrote: > > > > Looks good to me. Below just nitpicks. > > > > I saw Andrew already took this into mmotm so I'm not sure he or you will do > > > > next spin but anyway, review goes. Just nitpicks and a question. > > > > > > > > On Tue, Jan 22, 2013 at 10:29:51AM +0800, Shaohua Li wrote: > > > > > > > > > > When I use several fast SSD to do swap, swapper_space.tree_lock is heavily > > > > > contended. This makes each swap partition have one address_space to reduce the > > > > > lock contention. There is an array of address_space for swap. The swap entry > > > > > type is the index to the array. > > > > > > > > > > In my test with 3 SSD, this increases the swapout throughput 20%. 
> > > > > > > > > > V1->V2: simplify code > > > > > > > > > > Signed-off-by: Shaohua Li <shli@fusionio.com> > > > > > > > > Acked-by: Minchan Kim <minchan@kernel.org> > > > > > > > > > --- > > > > > fs/proc/meminfo.c | 4 +-- > > > > > include/linux/swap.h | 9 ++++---- > > > > > mm/memcontrol.c | 4 +-- > > > > > mm/mincore.c | 5 ++-- > > > > > mm/swap.c | 9 ++++++-- > > > > > mm/swap_state.c | 57 ++++++++++++++++++++++++++++++++++----------------- > > > > > mm/swapfile.c | 5 ++-- > > > > > mm/util.c | 10 ++++++-- > > > > > 8 files changed, 68 insertions(+), 35 deletions(-) > > > > > > > > > > Index: linux/include/linux/swap.h > > > > > =================================================================== > > > > > --- linux.orig/include/linux/swap.h 2013-01-22 09:13:14.000000000 +0800 > > > > > +++ linux/include/linux/swap.h 2013-01-22 09:34:44.923011706 +0800 > > > > > @@ -8,7 +8,7 @@ > > > > > #include <linux/memcontrol.h> > > > > > #include <linux/sched.h> > > > > > #include <linux/node.h> > > > > > - > > > > > +#include <linux/fs.h> > > > > > #include <linux/atomic.h> > > > > > #include <asm/page.h> > > > > > > > > > > @@ -330,8 +330,9 @@ int generic_swapfile_activate(struct swa > > > > > sector_t *); > > > > > > > > > > /* linux/mm/swap_state.c */ > > > > > -extern struct address_space swapper_space; > > > > > -#define total_swapcache_pages swapper_space.nrpages > > > > > +extern struct address_space swapper_spaces[]; > > > > > +#define swap_address_space(entry) (&swapper_spaces[swp_type(entry)]) > > > > > > > > How about this naming? 
> > > > > > > > #define swapper_space(entry) (&swapper_spaces[swp_type(entry)]) > > > > > > > > > +extern unsigned long total_swapcache_pages(void); > > > > > extern void show_swap_cache_info(void); > > > > > extern int add_to_swap(struct page *); > > > > > extern int add_to_swap_cache(struct page *, swp_entry_t, gfp_t); > > > > > @@ -382,7 +383,7 @@ mem_cgroup_uncharge_swapcache(struct pag > > > > > > > > > > #define nr_swap_pages 0L > > > > > #define total_swap_pages 0L > > > > > -#define total_swapcache_pages 0UL > > > > > +#define total_swapcache_pages() 0UL > > > > > > > > > > #define si_swapinfo(val) \ > > > > > do { (val)->freeswap = (val)->totalswap = 0; } while (0) > > > > > Index: linux/mm/memcontrol.c > > > > > =================================================================== > > > > > --- linux.orig/mm/memcontrol.c 2013-01-22 09:13:14.000000000 +0800 > > > > Acked-by: Minchan Kim <minchan@kernel.org> > > > > > > > > > +++ linux/mm/memcontrol.c 2013-01-22 09:29:29.374977700 +0800 > > > > > @@ -6279,7 +6279,7 @@ static struct page *mc_handle_swap_pte(s > > > > > * Because lookup_swap_cache() updates some statistics counter, > > > > > * we call find_get_page() with swapper_space directly. 
> > > > > */ > > > > > - page = find_get_page(&swapper_space, ent.val); > > > > > + page = find_get_page(swap_address_space(ent), ent.val); > > > > > if (do_swap_account) > > > > > entry->val = ent.val; > > > > > > > > > > @@ -6320,7 +6320,7 @@ static struct page *mc_handle_file_pte(s > > > > > swp_entry_t swap = radix_to_swp_entry(page); > > > > > if (do_swap_account) > > > > > *entry = swap; > > > > > - page = find_get_page(&swapper_space, swap.val); > > > > > + page = find_get_page(swap_address_space(swap), swap.val); > > > > > } > > > > > #endif > > > > > return page; > > > > > Index: linux/mm/mincore.c > > > > > =================================================================== > > > > > --- linux.orig/mm/mincore.c 2013-01-22 09:13:14.000000000 +0800 > > > > > +++ linux/mm/mincore.c 2013-01-22 09:29:29.378977649 +0800 > > > > > @@ -75,7 +75,7 @@ static unsigned char mincore_page(struct > > > > > /* shmem/tmpfs may return swap: account for swapcache page too. */ > > > > > if (radix_tree_exceptional_entry(page)) { > > > > > swp_entry_t swap = radix_to_swp_entry(page); > > > > > - page = find_get_page(&swapper_space, swap.val); > > > > > + page = find_get_page(swap_address_space(swap), swap.val); > > > > > } > > > > > #endif > > > > > if (page) { > > > > > @@ -135,7 +135,8 @@ static void mincore_pte_range(struct vm_ > > > > > } else { > > > > > #ifdef CONFIG_SWAP > > > > > pgoff = entry.val; > > > > > - *vec = mincore_page(&swapper_space, pgoff); > > > > > + *vec = mincore_page(swap_address_space(entry), > > > > > + pgoff); > > > > > #else > > > > > WARN_ON(1); > > > > > *vec = 1; > > > > > Index: linux/mm/swap.c > > > > > =================================================================== > > > > > --- linux.orig/mm/swap.c 2013-01-22 09:13:14.000000000 +0800 > > > > > +++ linux/mm/swap.c 2013-01-22 09:29:29.378977649 +0800 > > > > > @@ -855,9 +855,14 @@ EXPORT_SYMBOL(pagevec_lookup_tag); > > > > > void __init swap_setup(void) > > > > > { > > > > > unsigned long 
megs = totalram_pages >> (20 - PAGE_SHIFT); > > > > > - > > > > > #ifdef CONFIG_SWAP > > > > > - bdi_init(swapper_space.backing_dev_info); > > > > > + int i; > > > > > + > > > > > + for (i = 0; i < MAX_SWAPFILES; i++) { > > > > > + bdi_init(swapper_spaces[i].backing_dev_info); > > > > > + spin_lock_init(&swapper_spaces[i].tree_lock); > > > > > + INIT_LIST_HEAD(&swapper_spaces[i].i_mmap_nonlinear); > > > > > + } > > > > > #endif > > > > > > > > > > /* Use a smaller cluster for small-memory machines */ > > > > > Index: linux/mm/swap_state.c > > > > > =================================================================== > > > > > --- linux.orig/mm/swap_state.c 2013-01-22 09:13:14.000000000 +0800 > > > > > +++ linux/mm/swap_state.c 2013-01-22 09:29:29.378977649 +0800 > > > > > @@ -36,12 +36,12 @@ static struct backing_dev_info swap_back > > > > > .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK | BDI_CAP_SWAP_BACKED, > > > > > }; > > > > > > > > > > -struct address_space swapper_space = { > > > > > - .page_tree = RADIX_TREE_INIT(GFP_ATOMIC|__GFP_NOWARN), > > > > > - .tree_lock = __SPIN_LOCK_UNLOCKED(swapper_space.tree_lock), > > > > > - .a_ops = &swap_aops, > > > > > - .i_mmap_nonlinear = LIST_HEAD_INIT(swapper_space.i_mmap_nonlinear), > > > > > - .backing_dev_info = &swap_backing_dev_info, > > > > > +struct address_space swapper_spaces[MAX_SWAPFILES] = { > > > > > + [0 ... 
MAX_SWAPFILES - 1] = { > > > > > + .page_tree = RADIX_TREE_INIT(GFP_ATOMIC|__GFP_NOWARN), > > > > > + .a_ops = &swap_aops, > > > > > + .backing_dev_info = &swap_backing_dev_info, > > > > > + } > > > > > }; > > > > > > > > > > #define INC_CACHE_INFO(x) do { swap_cache_info.x++; } while (0) > > > > > @@ -53,9 +53,19 @@ static struct { > > > > > unsigned long find_total; > > > > > } swap_cache_info; > > > > > > > > > > +unsigned long total_swapcache_pages(void) > > > > > +{ > > > > > + int i; > > > > > + unsigned long ret = 0; > > > > > + > > > > > + for (i = 0; i < MAX_SWAPFILES; i++) > > > > > + ret += swapper_spaces[i].nrpages; > > > > > + return ret; > > > > > +} > > > > > + > > > > > void show_swap_cache_info(void) > > > > > { > > > > > - printk("%lu pages in swap cache\n", total_swapcache_pages); > > > > > + printk("%lu pages in swap cache\n", total_swapcache_pages()); > > > > > printk("Swap cache stats: add %lu, delete %lu, find %lu/%lu\n", > > > > > swap_cache_info.add_total, swap_cache_info.del_total, > > > > > swap_cache_info.find_success, swap_cache_info.find_total); > > > > > @@ -70,23 +80,26 @@ void show_swap_cache_info(void) > > > > > static int __add_to_swap_cache(struct page *page, swp_entry_t entry) > > > > > { > > > > > int error; > > > > > + struct address_space *address_space; > > > > > > > > > > VM_BUG_ON(!PageLocked(page)); > > > > > VM_BUG_ON(PageSwapCache(page)); > > > > > VM_BUG_ON(!PageSwapBacked(page)); > > > > > > > > > > page_cache_get(page); > > > > > - SetPageSwapCache(page); > > > > > set_page_private(page, entry.val); > > > > > + SetPageSwapCache(page); > > > > > > > > Why did you move this line? Is there any special reason? > > > > > > Originally I'm afraid page_mapping() gets invalid page_private(), but I then > > > realized we hold page lock. There are some places we don't hold page lock. > > > either such page isn't swap page or the caller can tolerate race. I forgot > > > removing this change in the patch. 
But I certainly can be wrong. We can add > > > memory barrier if required. > > > > Yeb. While I reviewed the patch, I was concern about that but I fail to find > > a problem, too. But maybe it's valuable to add comment about that race > > (!PageSwapCache(page) but page_mapping could return swapper_space) in page_mapping. > > Hi Minchan, > > If the race Shaohua mentioned should be PageSwapCache(page) but > page_mapping couldn't return swapper_space since page_mapping() gets > invalid page_private(). Right you are. In such a case, it could be a problem. Let's see the following case. isolate_migratepages_range __isolate_lru_page __add_to_swap_cache set_page_private(page, entry.val) SetPageSwapCache(page) PageSwapCache entry.val = page_private(page); mapping = swap_address_space(entry); In this case, if memory reordering happens, mapping would be a dangling pointer. One of the problems caused by a dangling mapping is that the page could be passed into shrink_page_list and tried for pageout with the dangling mapping. Of course, we have many locks until reaching the page_mapping in shrink_page_list, so the problem will not happen, but we shouldn't depend on such implicit locks instead of an explicit memory barrier, because we could remove all locks in the reclaim path if a super hero comes in, or a new user of page_mapping might do something new that makes a problem. :) I will send a patch if anyone doesn't oppose. > > > So if some problem happens in the future, people could help to find culprit. > > > > Could you respin this with adding the comment and removing above unnecessary moving? > > > > > > > > Thanks, > > > Shaohua > > > > > > -- > > > To unsubscribe, send a message with 'unsubscribe linux-mm' in > > > the body to majordomo@kvack.org. For more info on Linux MM, > > > see: http://www.linux-mm.org/ . > > > Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> > > > > > -- > To unsubscribe, send a message with 'unsubscribe linux-mm' in > the body to majordomo@kvack.org. 
For more info on Linux MM, > see: http://www.linux-mm.org/ . > Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> -- Kind regards, Minchan Kim -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [patch 2/3 v2]swap: make each swap partition have one address_space 2013-01-24 2:22 ` Minchan Kim @ 2013-01-24 2:43 ` Shaohua Li 2013-01-24 3:25 ` Simon Jeons 2013-01-24 5:19 ` Minchan Kim 0 siblings, 2 replies; 15+ messages in thread From: Shaohua Li @ 2013-01-24 2:43 UTC (permalink / raw) To: Minchan Kim; +Cc: Simon Jeons, linux-mm, akpm, hughd, riel On Thu, Jan 24, 2013 at 11:22:41AM +0900, Minchan Kim wrote: > Hi Simon, > > On Wed, Jan 23, 2013 at 07:39:56PM -0600, Simon Jeons wrote: > > On Wed, 2013-01-23 at 17:04 +0900, Minchan Kim wrote: > > > On Wed, Jan 23, 2013 at 03:36:55PM +0800, Shaohua Li wrote: > > > > On Wed, Jan 23, 2013 at 03:16:45PM +0900, Minchan Kim wrote: > > > > > Looks good to me. Below just nitpicks. > > > > > I saw Andrew already took this into mmotm so I'm not sure he or you will do > > > > > next spin but anyway, review goes. Just nitpicks and a question. > > > > > > > > > > On Tue, Jan 22, 2013 at 10:29:51AM +0800, Shaohua Li wrote: > > > > > > > > > > > > When I use several fast SSD to do swap, swapper_space.tree_lock is heavily > > > > > > contended. This makes each swap partition have one address_space to reduce the > > > > > > lock contention. There is an array of address_space for swap. The swap entry > > > > > > type is the index to the array. > > > > > > > > > > > > In my test with 3 SSD, this increases the swapout throughput 20%. 
> > > > > > > > > > > > V1->V2: simplify code > > > > > > > > > > > > Signed-off-by: Shaohua Li <shli@fusionio.com> > > > > > > > > > > Acked-by: Minchan Kim <minchan@kernel.org> > > > > > > > > > > > --- > > > > > > fs/proc/meminfo.c | 4 +-- > > > > > > include/linux/swap.h | 9 ++++---- > > > > > > mm/memcontrol.c | 4 +-- > > > > > > mm/mincore.c | 5 ++-- > > > > > > mm/swap.c | 9 ++++++-- > > > > > > mm/swap_state.c | 57 ++++++++++++++++++++++++++++++++++----------------- > > > > > > mm/swapfile.c | 5 ++-- > > > > > > mm/util.c | 10 ++++++-- > > > > > > 8 files changed, 68 insertions(+), 35 deletions(-) > > > > > > > > > > > > Index: linux/include/linux/swap.h > > > > > > =================================================================== > > > > > > --- linux.orig/include/linux/swap.h 2013-01-22 09:13:14.000000000 +0800 > > > > > > +++ linux/include/linux/swap.h 2013-01-22 09:34:44.923011706 +0800 > > > > > > @@ -8,7 +8,7 @@ > > > > > > #include <linux/memcontrol.h> > > > > > > #include <linux/sched.h> > > > > > > #include <linux/node.h> > > > > > > - > > > > > > +#include <linux/fs.h> > > > > > > #include <linux/atomic.h> > > > > > > #include <asm/page.h> > > > > > > > > > > > > @@ -330,8 +330,9 @@ int generic_swapfile_activate(struct swa > > > > > > sector_t *); > > > > > > > > > > > > /* linux/mm/swap_state.c */ > > > > > > -extern struct address_space swapper_space; > > > > > > -#define total_swapcache_pages swapper_space.nrpages > > > > > > +extern struct address_space swapper_spaces[]; > > > > > > +#define swap_address_space(entry) (&swapper_spaces[swp_type(entry)]) > > > > > > > > > > How about this naming? 
> > > > > > > > > > #define swapper_space(entry) (&swapper_spaces[swp_type(entry)]) > > > > > > > > > > > +extern unsigned long total_swapcache_pages(void); > > > > > > extern void show_swap_cache_info(void); > > > > > > extern int add_to_swap(struct page *); > > > > > > extern int add_to_swap_cache(struct page *, swp_entry_t, gfp_t); > > > > > > @@ -382,7 +383,7 @@ mem_cgroup_uncharge_swapcache(struct pag > > > > > > > > > > > > #define nr_swap_pages 0L > > > > > > #define total_swap_pages 0L > > > > > > -#define total_swapcache_pages 0UL > > > > > > +#define total_swapcache_pages() 0UL > > > > > > > > > > > > #define si_swapinfo(val) \ > > > > > > do { (val)->freeswap = (val)->totalswap = 0; } while (0) > > > > > > Index: linux/mm/memcontrol.c > > > > > > =================================================================== > > > > > > --- linux.orig/mm/memcontrol.c 2013-01-22 09:13:14.000000000 +0800 > > > > > Acked-by: Minchan Kim <minchan@kernel.org> > > > > > > > > > > > +++ linux/mm/memcontrol.c 2013-01-22 09:29:29.374977700 +0800 > > > > > > @@ -6279,7 +6279,7 @@ static struct page *mc_handle_swap_pte(s > > > > > > * Because lookup_swap_cache() updates some statistics counter, > > > > > > * we call find_get_page() with swapper_space directly. 
> > > > > > */ > > > > > > - page = find_get_page(&swapper_space, ent.val); > > > > > > + page = find_get_page(swap_address_space(ent), ent.val); > > > > > > if (do_swap_account) > > > > > > entry->val = ent.val; > > > > > > > > > > > > @@ -6320,7 +6320,7 @@ static struct page *mc_handle_file_pte(s > > > > > > swp_entry_t swap = radix_to_swp_entry(page); > > > > > > if (do_swap_account) > > > > > > *entry = swap; > > > > > > - page = find_get_page(&swapper_space, swap.val); > > > > > > + page = find_get_page(swap_address_space(swap), swap.val); > > > > > > } > > > > > > #endif > > > > > > return page; > > > > > > Index: linux/mm/mincore.c > > > > > > =================================================================== > > > > > > --- linux.orig/mm/mincore.c 2013-01-22 09:13:14.000000000 +0800 > > > > > > +++ linux/mm/mincore.c 2013-01-22 09:29:29.378977649 +0800 > > > > > > @@ -75,7 +75,7 @@ static unsigned char mincore_page(struct > > > > > > /* shmem/tmpfs may return swap: account for swapcache page too. 
*/ > > > > > > if (radix_tree_exceptional_entry(page)) { > > > > > > swp_entry_t swap = radix_to_swp_entry(page); > > > > > > - page = find_get_page(&swapper_space, swap.val); > > > > > > + page = find_get_page(swap_address_space(swap), swap.val); > > > > > > } > > > > > > #endif > > > > > > if (page) { > > > > > > @@ -135,7 +135,8 @@ static void mincore_pte_range(struct vm_ > > > > > > } else { > > > > > > #ifdef CONFIG_SWAP > > > > > > pgoff = entry.val; > > > > > > - *vec = mincore_page(&swapper_space, pgoff); > > > > > > + *vec = mincore_page(swap_address_space(entry), > > > > > > + pgoff); > > > > > > #else > > > > > > WARN_ON(1); > > > > > > *vec = 1; > > > > > > Index: linux/mm/swap.c > > > > > > =================================================================== > > > > > > --- linux.orig/mm/swap.c 2013-01-22 09:13:14.000000000 +0800 > > > > > > +++ linux/mm/swap.c 2013-01-22 09:29:29.378977649 +0800 > > > > > > @@ -855,9 +855,14 @@ EXPORT_SYMBOL(pagevec_lookup_tag); > > > > > > void __init swap_setup(void) > > > > > > { > > > > > > unsigned long megs = totalram_pages >> (20 - PAGE_SHIFT); > > > > > > - > > > > > > #ifdef CONFIG_SWAP > > > > > > - bdi_init(swapper_space.backing_dev_info); > > > > > > + int i; > > > > > > + > > > > > > + for (i = 0; i < MAX_SWAPFILES; i++) { > > > > > > + bdi_init(swapper_spaces[i].backing_dev_info); > > > > > > + spin_lock_init(&swapper_spaces[i].tree_lock); > > > > > > + INIT_LIST_HEAD(&swapper_spaces[i].i_mmap_nonlinear); > > > > > > + } > > > > > > #endif > > > > > > > > > > > > /* Use a smaller cluster for small-memory machines */ > > > > > > Index: linux/mm/swap_state.c > > > > > > =================================================================== > > > > > > --- linux.orig/mm/swap_state.c 2013-01-22 09:13:14.000000000 +0800 > > > > > > +++ linux/mm/swap_state.c 2013-01-22 09:29:29.378977649 +0800 > > > > > > @@ -36,12 +36,12 @@ static struct backing_dev_info swap_back > > > > > > .capabilities = 
BDI_CAP_NO_ACCT_AND_WRITEBACK | BDI_CAP_SWAP_BACKED, > > > > > > }; > > > > > > > > > > > > -struct address_space swapper_space = { > > > > > > - .page_tree = RADIX_TREE_INIT(GFP_ATOMIC|__GFP_NOWARN), > > > > > > - .tree_lock = __SPIN_LOCK_UNLOCKED(swapper_space.tree_lock), > > > > > > - .a_ops = &swap_aops, > > > > > > - .i_mmap_nonlinear = LIST_HEAD_INIT(swapper_space.i_mmap_nonlinear), > > > > > > - .backing_dev_info = &swap_backing_dev_info, > > > > > > +struct address_space swapper_spaces[MAX_SWAPFILES] = { > > > > > > + [0 ... MAX_SWAPFILES - 1] = { > > > > > > + .page_tree = RADIX_TREE_INIT(GFP_ATOMIC|__GFP_NOWARN), > > > > > > + .a_ops = &swap_aops, > > > > > > + .backing_dev_info = &swap_backing_dev_info, > > > > > > + } > > > > > > }; > > > > > > > > > > > > #define INC_CACHE_INFO(x) do { swap_cache_info.x++; } while (0) > > > > > > @@ -53,9 +53,19 @@ static struct { > > > > > > unsigned long find_total; > > > > > > } swap_cache_info; > > > > > > > > > > > > +unsigned long total_swapcache_pages(void) > > > > > > +{ > > > > > > + int i; > > > > > > + unsigned long ret = 0; > > > > > > + > > > > > > + for (i = 0; i < MAX_SWAPFILES; i++) > > > > > > + ret += swapper_spaces[i].nrpages; > > > > > > + return ret; > > > > > > +} > > > > > > + > > > > > > void show_swap_cache_info(void) > > > > > > { > > > > > > - printk("%lu pages in swap cache\n", total_swapcache_pages); > > > > > > + printk("%lu pages in swap cache\n", total_swapcache_pages()); > > > > > > printk("Swap cache stats: add %lu, delete %lu, find %lu/%lu\n", > > > > > > swap_cache_info.add_total, swap_cache_info.del_total, > > > > > > swap_cache_info.find_success, swap_cache_info.find_total); > > > > > > @@ -70,23 +80,26 @@ void show_swap_cache_info(void) > > > > > > static int __add_to_swap_cache(struct page *page, swp_entry_t entry) > > > > > > { > > > > > > int error; > > > > > > + struct address_space *address_space; > > > > > > > > > > > > VM_BUG_ON(!PageLocked(page)); > > > > > > 
VM_BUG_ON(PageSwapCache(page)); > > > > > > VM_BUG_ON(!PageSwapBacked(page)); > > > > > > > > > > > > page_cache_get(page); > > > > > > - SetPageSwapCache(page); > > > > > > set_page_private(page, entry.val); > > > > > > + SetPageSwapCache(page); > > > > > > > > > > Why did you move this line? Is there any special reason? > > > > > > > > Originally I'm afraid page_mapping() gets invalid page_private(), but I then > > > > realized we hold page lock. There are some places we don't hold page lock. > > > > either such page isn't swap page or the caller can tolerate race. I forgot > > > > removing this change in the patch. But I certainly can be wrong. We can add > > > > memory barrier if required. > > > > > > Yeb. While I reviewed the patch, I was concern about that but I fail to find > > > a problem, too. But maybe it's valuable to add comment about that race > > > (!PageSwapCache(page) but page_mapping could return swapper_space) in page_mapping. > > > > Hi Minchan, > > > > If the race Shaohua mentioned should be PageSwapCache(page) but > > page_mapping couldn't return swapper_space since page_mapping() gets > > invalid page_private(). > > Right you are. In such case, it could be a problem. > Let's see following case. > > isolate_migratepages_range > __isolate_lru_page > __add_to_swap_cache > set_page_private(page, entry.val) > SetPageSwapCache(page) > PageSwapCache > entry.val = page_private(page); > mapping = swap_address_space(entry); > > In this case, if memory ordering happens, mapping would be dangling pointer, > One of the problem by dangling mapping is the page could be passed into > shrink_page_list and try to pageout with dangling mapping. 
Of course, > we have many locks until reaching the page_mapping in shrink_page_list so > the problem will not happen but we shouldn't depends on such implicit locks > instead of explicit memory barrier because we could remove all locks in reclaim > path if super hero comes in or new user of page_mapping would do new something > to make a problem. :) > > I will send a patch if anyone doesn't oppose. That's fine in this case, because page private 0 still returns a valid mapping, and we only check mapping here. But I agree this is too subtle. Adding memory fence is safer. I had this yesterday if you don't mind: Subject: mm: add memory barrier to prevent SwapCache bit and page private out of order page_mapping() checks the SwapCache bit first and then reads page private. Add a memory barrier so page private has the correct value before the SwapCache bit is set. Signed-off-by: Shaohua Li <shli@fusionio.com> --- mm/swap_state.c | 7 +++++-- mm/util.c | 6 ++++++ 2 files changed, 11 insertions(+), 2 deletions(-) Index: linux/mm/swap_state.c =================================================================== --- linux.orig/mm/swap_state.c 2013-01-22 10:12:33.514490665 +0800 +++ linux/mm/swap_state.c 2013-01-23 15:42:00.737459987 +0800 @@ -89,6 +89,7 @@ static int __add_to_swap_cache(struct pa page_cache_get(page); set_page_private(page, entry.val); + smp_wmb(); SetPageSwapCache(page); address_space = swap_address_space(entry); @@ -109,8 +110,9 @@ static int __add_to_swap_cache(struct pa * So add_to_swap_cache() doesn't returns -EEXIST. 
*/ VM_BUG_ON(error == -EEXIST); - set_page_private(page, 0UL); ClearPageSwapCache(page); + smp_mb__after_clear_bit(); + set_page_private(page, 0UL); page_cache_release(page); } @@ -146,8 +148,9 @@ void __delete_from_swap_cache(struct pag entry.val = page_private(page); address_space = swap_address_space(entry); radix_tree_delete(&address_space->page_tree, page_private(page)); - set_page_private(page, 0); ClearPageSwapCache(page); + smp_mb__after_clear_bit(); + set_page_private(page, 0); address_space->nrpages--; __dec_zone_page_state(page, NR_FILE_PAGES); INC_CACHE_INFO(del_total); Index: linux/mm/util.c =================================================================== --- linux.orig/mm/util.c 2013-01-22 10:11:58.310933234 +0800 +++ linux/mm/util.c 2013-01-23 15:50:22.147155111 +0800 @@ -389,6 +389,12 @@ struct address_space *page_mapping(struc if (unlikely(PageSwapCache(page))) { swp_entry_t entry; + /* + * set page_private() first then set SwapCache bit. clearing + * the bit first then zero page private. This memory barrier + * matches the rule. + */ + smp_rmb(); entry.val = page_private(page); mapping = swap_address_space(entry); } else -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [patch 2/3 v2]swap: make each swap partition have one address_space 2013-01-24 2:43 ` Shaohua Li @ 2013-01-24 3:25 ` Simon Jeons 2013-01-24 10:35 ` Shaohua Li 2013-01-24 5:19 ` Minchan Kim 1 sibling, 1 reply; 15+ messages in thread From: Simon Jeons @ 2013-01-24 3:25 UTC (permalink / raw) To: Shaohua Li; +Cc: Minchan Kim, linux-mm, akpm, hughd, riel Hi Shaohua, On Thu, 2013-01-24 at 10:43 +0800, Shaohua Li wrote: > On Thu, Jan 24, 2013 at 11:22:41AM +0900, Minchan Kim wrote: > > Hi Simon, > > > > On Wed, Jan 23, 2013 at 07:39:56PM -0600, Simon Jeons wrote: > > > On Wed, 2013-01-23 at 17:04 +0900, Minchan Kim wrote: > > > > On Wed, Jan 23, 2013 at 03:36:55PM +0800, Shaohua Li wrote: > > > > > On Wed, Jan 23, 2013 at 03:16:45PM +0900, Minchan Kim wrote: > > > > > > Looks good to me. Below just nitpicks. > > > > > > I saw Andrew already took this into mmotm so I'm not sure he or you will do > > > > > > next spin but anyway, review goes. Just nitpicks and a question. > > > > > > > > > > > > On Tue, Jan 22, 2013 at 10:29:51AM +0800, Shaohua Li wrote: > > > > > > > > > > > > > > When I use several fast SSD to do swap, swapper_space.tree_lock is heavily > > > > > > > contended. This makes each swap partition have one address_space to reduce the > > > > > > > lock contention. There is an array of address_space for swap. The swap entry > > > > > > > type is the index to the array. > > > > > > > > > > > > > > In my test with 3 SSD, this increases the swapout throughput 20%. 
> > > > > > > > > > > > > > V1->V2: simplify code > > > > > > > > > > > > > > Signed-off-by: Shaohua Li <shli@fusionio.com> > > > > > > > > > > > > Acked-by: Minchan Kim <minchan@kernel.org> > > > > > > > > > > > > > --- > > > > > > > fs/proc/meminfo.c | 4 +-- > > > > > > > include/linux/swap.h | 9 ++++---- > > > > > > > mm/memcontrol.c | 4 +-- > > > > > > > mm/mincore.c | 5 ++-- > > > > > > > mm/swap.c | 9 ++++++-- > > > > > > > mm/swap_state.c | 57 ++++++++++++++++++++++++++++++++++----------------- > > > > > > > mm/swapfile.c | 5 ++-- > > > > > > > mm/util.c | 10 ++++++-- > > > > > > > 8 files changed, 68 insertions(+), 35 deletions(-) > > > > > > > > > > > > > > Index: linux/include/linux/swap.h > > > > > > > =================================================================== > > > > > > > --- linux.orig/include/linux/swap.h 2013-01-22 09:13:14.000000000 +0800 > > > > > > > +++ linux/include/linux/swap.h 2013-01-22 09:34:44.923011706 +0800 > > > > > > > @@ -8,7 +8,7 @@ > > > > > > > #include <linux/memcontrol.h> > > > > > > > #include <linux/sched.h> > > > > > > > #include <linux/node.h> > > > > > > > - > > > > > > > +#include <linux/fs.h> > > > > > > > #include <linux/atomic.h> > > > > > > > #include <asm/page.h> > > > > > > > > > > > > > > @@ -330,8 +330,9 @@ int generic_swapfile_activate(struct swa > > > > > > > sector_t *); > > > > > > > > > > > > > > /* linux/mm/swap_state.c */ > > > > > > > -extern struct address_space swapper_space; > > > > > > > -#define total_swapcache_pages swapper_space.nrpages > > > > > > > +extern struct address_space swapper_spaces[]; > > > > > > > +#define swap_address_space(entry) (&swapper_spaces[swp_type(entry)]) > > > > > > > > > > > > How about this naming? 
> > > > > > > > > > > > #define swapper_space(entry) (&swapper_spaces[swp_type(entry)]) > > > > > > > > > > > > > +extern unsigned long total_swapcache_pages(void); > > > > > > > extern void show_swap_cache_info(void); > > > > > > > extern int add_to_swap(struct page *); > > > > > > > extern int add_to_swap_cache(struct page *, swp_entry_t, gfp_t); > > > > > > > @@ -382,7 +383,7 @@ mem_cgroup_uncharge_swapcache(struct pag > > > > > > > > > > > > > > #define nr_swap_pages 0L > > > > > > > #define total_swap_pages 0L > > > > > > > -#define total_swapcache_pages 0UL > > > > > > > +#define total_swapcache_pages() 0UL > > > > > > > > > > > > > > #define si_swapinfo(val) \ > > > > > > > do { (val)->freeswap = (val)->totalswap = 0; } while (0) > > > > > > > Index: linux/mm/memcontrol.c > > > > > > > =================================================================== > > > > > > > --- linux.orig/mm/memcontrol.c 2013-01-22 09:13:14.000000000 +0800 > > > > > > Acked-by: Minchan Kim <minchan@kernel.org> > > > > > > > > > > > > > +++ linux/mm/memcontrol.c 2013-01-22 09:29:29.374977700 +0800 > > > > > > > @@ -6279,7 +6279,7 @@ static struct page *mc_handle_swap_pte(s > > > > > > > * Because lookup_swap_cache() updates some statistics counter, > > > > > > > * we call find_get_page() with swapper_space directly. 
> > > > > > > */ > > > > > > > - page = find_get_page(&swapper_space, ent.val); > > > > > > > + page = find_get_page(swap_address_space(ent), ent.val); > > > > > > > if (do_swap_account) > > > > > > > entry->val = ent.val; > > > > > > > > > > > > > > @@ -6320,7 +6320,7 @@ static struct page *mc_handle_file_pte(s > > > > > > > swp_entry_t swap = radix_to_swp_entry(page); > > > > > > > if (do_swap_account) > > > > > > > *entry = swap; > > > > > > > - page = find_get_page(&swapper_space, swap.val); > > > > > > > + page = find_get_page(swap_address_space(swap), swap.val); > > > > > > > } > > > > > > > #endif > > > > > > > return page; > > > > > > > Index: linux/mm/mincore.c > > > > > > > =================================================================== > > > > > > > --- linux.orig/mm/mincore.c 2013-01-22 09:13:14.000000000 +0800 > > > > > > > +++ linux/mm/mincore.c 2013-01-22 09:29:29.378977649 +0800 > > > > > > > @@ -75,7 +75,7 @@ static unsigned char mincore_page(struct > > > > > > > /* shmem/tmpfs may return swap: account for swapcache page too. 
*/ > > > > > > > if (radix_tree_exceptional_entry(page)) { > > > > > > > swp_entry_t swap = radix_to_swp_entry(page); > > > > > > > - page = find_get_page(&swapper_space, swap.val); > > > > > > > + page = find_get_page(swap_address_space(swap), swap.val); > > > > > > > } > > > > > > > #endif > > > > > > > if (page) { > > > > > > > @@ -135,7 +135,8 @@ static void mincore_pte_range(struct vm_ > > > > > > > } else { > > > > > > > #ifdef CONFIG_SWAP > > > > > > > pgoff = entry.val; > > > > > > > - *vec = mincore_page(&swapper_space, pgoff); > > > > > > > + *vec = mincore_page(swap_address_space(entry), > > > > > > > + pgoff); > > > > > > > #else > > > > > > > WARN_ON(1); > > > > > > > *vec = 1; > > > > > > > Index: linux/mm/swap.c > > > > > > > =================================================================== > > > > > > > --- linux.orig/mm/swap.c 2013-01-22 09:13:14.000000000 +0800 > > > > > > > +++ linux/mm/swap.c 2013-01-22 09:29:29.378977649 +0800 > > > > > > > @@ -855,9 +855,14 @@ EXPORT_SYMBOL(pagevec_lookup_tag); > > > > > > > void __init swap_setup(void) > > > > > > > { > > > > > > > unsigned long megs = totalram_pages >> (20 - PAGE_SHIFT); > > > > > > > - > > > > > > > #ifdef CONFIG_SWAP > > > > > > > - bdi_init(swapper_space.backing_dev_info); > > > > > > > + int i; > > > > > > > + > > > > > > > + for (i = 0; i < MAX_SWAPFILES; i++) { > > > > > > > + bdi_init(swapper_spaces[i].backing_dev_info); > > > > > > > + spin_lock_init(&swapper_spaces[i].tree_lock); > > > > > > > + INIT_LIST_HEAD(&swapper_spaces[i].i_mmap_nonlinear); > > > > > > > + } > > > > > > > #endif > > > > > > > > > > > > > > /* Use a smaller cluster for small-memory machines */ > > > > > > > Index: linux/mm/swap_state.c > > > > > > > =================================================================== > > > > > > > --- linux.orig/mm/swap_state.c 2013-01-22 09:13:14.000000000 +0800 > > > > > > > +++ linux/mm/swap_state.c 2013-01-22 09:29:29.378977649 +0800 > > > > > > > @@ -36,12 +36,12 @@ 
static struct backing_dev_info swap_back > > > > > > > .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK | BDI_CAP_SWAP_BACKED, > > > > > > > }; > > > > > > > > > > > > > > -struct address_space swapper_space = { > > > > > > > - .page_tree = RADIX_TREE_INIT(GFP_ATOMIC|__GFP_NOWARN), > > > > > > > - .tree_lock = __SPIN_LOCK_UNLOCKED(swapper_space.tree_lock), > > > > > > > - .a_ops = &swap_aops, > > > > > > > - .i_mmap_nonlinear = LIST_HEAD_INIT(swapper_space.i_mmap_nonlinear), > > > > > > > - .backing_dev_info = &swap_backing_dev_info, > > > > > > > +struct address_space swapper_spaces[MAX_SWAPFILES] = { > > > > > > > + [0 ... MAX_SWAPFILES - 1] = { > > > > > > > + .page_tree = RADIX_TREE_INIT(GFP_ATOMIC|__GFP_NOWARN), > > > > > > > + .a_ops = &swap_aops, > > > > > > > + .backing_dev_info = &swap_backing_dev_info, > > > > > > > + } > > > > > > > }; > > > > > > > > > > > > > > #define INC_CACHE_INFO(x) do { swap_cache_info.x++; } while (0) > > > > > > > @@ -53,9 +53,19 @@ static struct { > > > > > > > unsigned long find_total; > > > > > > > } swap_cache_info; > > > > > > > > > > > > > > +unsigned long total_swapcache_pages(void) > > > > > > > +{ > > > > > > > + int i; > > > > > > > + unsigned long ret = 0; > > > > > > > + > > > > > > > + for (i = 0; i < MAX_SWAPFILES; i++) > > > > > > > + ret += swapper_spaces[i].nrpages; > > > > > > > + return ret; > > > > > > > +} > > > > > > > + > > > > > > > void show_swap_cache_info(void) > > > > > > > { > > > > > > > - printk("%lu pages in swap cache\n", total_swapcache_pages); > > > > > > > + printk("%lu pages in swap cache\n", total_swapcache_pages()); > > > > > > > printk("Swap cache stats: add %lu, delete %lu, find %lu/%lu\n", > > > > > > > swap_cache_info.add_total, swap_cache_info.del_total, > > > > > > > swap_cache_info.find_success, swap_cache_info.find_total); > > > > > > > @@ -70,23 +80,26 @@ void show_swap_cache_info(void) > > > > > > > static int __add_to_swap_cache(struct page *page, swp_entry_t entry) > > > > > > > { 
> > > > > > > int error; > > > > > > > + struct address_space *address_space; > > > > > > > > > > > > > > VM_BUG_ON(!PageLocked(page)); > > > > > > > VM_BUG_ON(PageSwapCache(page)); > > > > > > > VM_BUG_ON(!PageSwapBacked(page)); > > > > > > > > > > > > > > page_cache_get(page); > > > > > > > - SetPageSwapCache(page); > > > > > > > set_page_private(page, entry.val); > > > > > > > + SetPageSwapCache(page); > > > > > > > > > > > > Why did you move this line? Is there any special reason? > > > > > > > > > > Originally I'm afraid page_mapping() gets invalid page_private(), but I then > > > > > realized we hold page lock. There are some places we don't hold page lock. > > > > > either such page isn't swap page or the caller can tolerate race. I forgot > > > > > removing this change in the patch. But I certainly can be wrong. We can add > > > > > memory barrier if required. > > > > > > > > Yeb. While I reviewed the patch, I was concern about that but I fail to find > > > > a problem, too. But maybe it's valuable to add comment about that race > > > > (!PageSwapCache(page) but page_mapping could return swapper_space) in page_mapping. > > > > > > Hi Minchan, > > > > > > If the race Shaohua mentioned should be PageSwapCache(page) but > > > page_mapping couldn't return swapper_space since page_mapping() gets > > > invalid page_private(). > > > > Right you are. In such case, it could be a problem. > > Let's see following case. > > > > isolate_migratepages_range > > __isolate_lru_page > > __add_to_swap_cache > > set_page_private(page, entry.val) > > SetPageSwapCache(page) > > PageSwapCache > > entry.val = page_private(page); > > mapping = swap_address_space(entry); > > > > In this case, if memory ordering happens, mapping would be dangling pointer, > > One of the problem by dangling mapping is the page could be passed into > > shrink_page_list and try to pageout with dangling mapping. 
Of course, > > we have many locks until reaching the page_mapping in shrink_page_list so > > the problem will not happen but we shouldn't depends on such implicit locks > > instead of explicit memory barrier because we could remove all locks in reclaim > > path if super hero comes in or new user of page_mapping would do new something > > to make a problem. :) > > > > I will send a patch if anyone doesn't oppose. > > That't fine in the case, because page private 0 still return a valide mapping, > and we only check mapping here. But I agree this is too subtle. Adding memory > fence is safer. I had this yesterday if you don't mind: > > > Subject: mm: add memory to prevent SwapCache bit and page private out of order > > page_mapping() checks SwapCache bit first and then read page private. Adding > memory barrier so page private has correct value before SwapCache bit set. > > Signed-off-by: Shaohua Li <shli@fusionio.com> > --- > mm/swap_state.c | 7 +++++-- > mm/util.c | 6 ++++++ > 2 files changed, 11 insertions(+), 2 deletions(-) > > Index: linux/mm/swap_state.c > =================================================================== > --- linux.orig/mm/swap_state.c 2013-01-22 10:12:33.514490665 +0800 > +++ linux/mm/swap_state.c 2013-01-23 15:42:00.737459987 +0800 > @@ -89,6 +89,7 @@ static int __add_to_swap_cache(struct pa > > page_cache_get(page); > set_page_private(page, entry.val); > + smp_wmb(); > SetPageSwapCache(page); > > address_space = swap_address_space(entry); > @@ -109,8 +110,9 @@ static int __add_to_swap_cache(struct pa > * So add_to_swap_cache() doesn't returns -EEXIST. > */ > VM_BUG_ON(error == -EEXIST); > - set_page_private(page, 0UL); > ClearPageSwapCache(page); If race happen after clear bit, page_mapping should return NULL against anonymous page, But here will return &swapper_space. 
> + smp_mb__after_clear_bit(); > + set_page_private(page, 0UL); > page_cache_release(page); > } > > @@ -146,8 +148,9 @@ void __delete_from_swap_cache(struct pag > entry.val = page_private(page); > address_space = swap_address_space(entry); > radix_tree_delete(&address_space->page_tree, page_private(page)); > - set_page_private(page, 0); > ClearPageSwapCache(page); > + smp_mb__after_clear_bit(); > + set_page_private(page, 0); > address_space->nrpages--; > __dec_zone_page_state(page, NR_FILE_PAGES); > INC_CACHE_INFO(del_total); > Index: linux/mm/util.c > =================================================================== > --- linux.orig/mm/util.c 2013-01-22 10:11:58.310933234 +0800 > +++ linux/mm/util.c 2013-01-23 15:50:22.147155111 +0800 > @@ -389,6 +389,12 @@ struct address_space *page_mapping(struc > if (unlikely(PageSwapCache(page))) { > swp_entry_t entry; > > + /* > + * set page_private() first then set SwapCache bit. clearing > + * the bit first then zero page private. This memory barrier > + * matches the rule. > + */ > + smp_rmb(); Why need this? > entry.val = page_private(page); > mapping = swap_address_space(entry); > } else -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [patch 2/3 v2]swap: make each swap partition have one address_space 2013-01-24 3:25 ` Simon Jeons @ 2013-01-24 10:35 ` Shaohua Li 0 siblings, 0 replies; 15+ messages in thread From: Shaohua Li @ 2013-01-24 10:35 UTC (permalink / raw) To: Simon Jeons; +Cc: Minchan Kim, linux-mm, akpm, hughd, riel On Wed, Jan 23, 2013 at 09:25:38PM -0600, Simon Jeons wrote: > Hi Shaohua, > > On Thu, 2013-01-24 at 10:43 +0800, Shaohua Li wrote: > > On Thu, Jan 24, 2013 at 11:22:41AM +0900, Minchan Kim wrote: > > > Hi Simon, > > > > > > On Wed, Jan 23, 2013 at 07:39:56PM -0600, Simon Jeons wrote: > > > > On Wed, 2013-01-23 at 17:04 +0900, Minchan Kim wrote: > > > > > On Wed, Jan 23, 2013 at 03:36:55PM +0800, Shaohua Li wrote: > > > > > > On Wed, Jan 23, 2013 at 03:16:45PM +0900, Minchan Kim wrote: > > > > > > > Looks good to me. Below just nitpicks. > > > > > > > I saw Andrew already took this into mmotm so I'm not sure he or you will do > > > > > > > next spin but anyway, review goes. Just nitpicks and a question. > > > > > > > > > > > > > > On Tue, Jan 22, 2013 at 10:29:51AM +0800, Shaohua Li wrote: > > > > > > > > > > > > > > > > When I use several fast SSD to do swap, swapper_space.tree_lock is heavily > > > > > > > > contended. This makes each swap partition have one address_space to reduce the > > > > > > > > lock contention. There is an array of address_space for swap. The swap entry > > > > > > > > type is the index to the array. > > > > > > > > > > > > > > > > In my test with 3 SSD, this increases the swapout throughput 20%. 
> > > > > > > > > > > > > > > > V1->V2: simplify code > > > > > > > > > > > > > > > > Signed-off-by: Shaohua Li <shli@fusionio.com> > > > > > > > > > > > > > > Acked-by: Minchan Kim <minchan@kernel.org> > > > > > > > > > > > > > > > --- > > > > > > > > fs/proc/meminfo.c | 4 +-- > > > > > > > > include/linux/swap.h | 9 ++++---- > > > > > > > > mm/memcontrol.c | 4 +-- > > > > > > > > mm/mincore.c | 5 ++-- > > > > > > > > mm/swap.c | 9 ++++++-- > > > > > > > > mm/swap_state.c | 57 ++++++++++++++++++++++++++++++++++----------------- > > > > > > > > mm/swapfile.c | 5 ++-- > > > > > > > > mm/util.c | 10 ++++++-- > > > > > > > > 8 files changed, 68 insertions(+), 35 deletions(-) > > > > > > > > > > > > > > > > Index: linux/include/linux/swap.h > > > > > > > > =================================================================== > > > > > > > > --- linux.orig/include/linux/swap.h 2013-01-22 09:13:14.000000000 +0800 > > > > > > > > +++ linux/include/linux/swap.h 2013-01-22 09:34:44.923011706 +0800 > > > > > > > > @@ -8,7 +8,7 @@ > > > > > > > > #include <linux/memcontrol.h> > > > > > > > > #include <linux/sched.h> > > > > > > > > #include <linux/node.h> > > > > > > > > - > > > > > > > > +#include <linux/fs.h> > > > > > > > > #include <linux/atomic.h> > > > > > > > > #include <asm/page.h> > > > > > > > > > > > > > > > > @@ -330,8 +330,9 @@ int generic_swapfile_activate(struct swa > > > > > > > > sector_t *); > > > > > > > > > > > > > > > > /* linux/mm/swap_state.c */ > > > > > > > > -extern struct address_space swapper_space; > > > > > > > > -#define total_swapcache_pages swapper_space.nrpages > > > > > > > > +extern struct address_space swapper_spaces[]; > > > > > > > > +#define swap_address_space(entry) (&swapper_spaces[swp_type(entry)]) > > > > > > > > > > > > > > How about this naming? 
> > > > > > > > > > > > > > #define swapper_space(entry) (&swapper_spaces[swp_type(entry)]) > > > > > > > > > > > > > > > +extern unsigned long total_swapcache_pages(void); > > > > > > > > extern void show_swap_cache_info(void); > > > > > > > > extern int add_to_swap(struct page *); > > > > > > > > extern int add_to_swap_cache(struct page *, swp_entry_t, gfp_t); > > > > > > > > @@ -382,7 +383,7 @@ mem_cgroup_uncharge_swapcache(struct pag > > > > > > > > > > > > > > > > #define nr_swap_pages 0L > > > > > > > > #define total_swap_pages 0L > > > > > > > > -#define total_swapcache_pages 0UL > > > > > > > > +#define total_swapcache_pages() 0UL > > > > > > > > > > > > > > > > #define si_swapinfo(val) \ > > > > > > > > do { (val)->freeswap = (val)->totalswap = 0; } while (0) > > > > > > > > Index: linux/mm/memcontrol.c > > > > > > > > =================================================================== > > > > > > > > --- linux.orig/mm/memcontrol.c 2013-01-22 09:13:14.000000000 +0800 > > > > > > > Acked-by: Minchan Kim <minchan@kernel.org> > > > > > > > > > > > > > > > +++ linux/mm/memcontrol.c 2013-01-22 09:29:29.374977700 +0800 > > > > > > > > @@ -6279,7 +6279,7 @@ static struct page *mc_handle_swap_pte(s > > > > > > > > * Because lookup_swap_cache() updates some statistics counter, > > > > > > > > * we call find_get_page() with swapper_space directly. 
> > > > > > > > */ > > > > > > > > - page = find_get_page(&swapper_space, ent.val); > > > > > > > > + page = find_get_page(swap_address_space(ent), ent.val); > > > > > > > > if (do_swap_account) > > > > > > > > entry->val = ent.val; > > > > > > > > > > > > > > > > @@ -6320,7 +6320,7 @@ static struct page *mc_handle_file_pte(s > > > > > > > > swp_entry_t swap = radix_to_swp_entry(page); > > > > > > > > if (do_swap_account) > > > > > > > > *entry = swap; > > > > > > > > - page = find_get_page(&swapper_space, swap.val); > > > > > > > > + page = find_get_page(swap_address_space(swap), swap.val); > > > > > > > > } > > > > > > > > #endif > > > > > > > > return page; > > > > > > > > Index: linux/mm/mincore.c > > > > > > > > =================================================================== > > > > > > > > --- linux.orig/mm/mincore.c 2013-01-22 09:13:14.000000000 +0800 > > > > > > > > +++ linux/mm/mincore.c 2013-01-22 09:29:29.378977649 +0800 > > > > > > > > @@ -75,7 +75,7 @@ static unsigned char mincore_page(struct > > > > > > > > /* shmem/tmpfs may return swap: account for swapcache page too. 
*/ > > > > > > > > if (radix_tree_exceptional_entry(page)) { > > > > > > > > swp_entry_t swap = radix_to_swp_entry(page); > > > > > > > > - page = find_get_page(&swapper_space, swap.val); > > > > > > > > + page = find_get_page(swap_address_space(swap), swap.val); > > > > > > > > } > > > > > > > > #endif > > > > > > > > if (page) { > > > > > > > > @@ -135,7 +135,8 @@ static void mincore_pte_range(struct vm_ > > > > > > > > } else { > > > > > > > > #ifdef CONFIG_SWAP > > > > > > > > pgoff = entry.val; > > > > > > > > - *vec = mincore_page(&swapper_space, pgoff); > > > > > > > > + *vec = mincore_page(swap_address_space(entry), > > > > > > > > + pgoff); > > > > > > > > #else > > > > > > > > WARN_ON(1); > > > > > > > > *vec = 1; > > > > > > > > Index: linux/mm/swap.c > > > > > > > > =================================================================== > > > > > > > > --- linux.orig/mm/swap.c 2013-01-22 09:13:14.000000000 +0800 > > > > > > > > +++ linux/mm/swap.c 2013-01-22 09:29:29.378977649 +0800 > > > > > > > > @@ -855,9 +855,14 @@ EXPORT_SYMBOL(pagevec_lookup_tag); > > > > > > > > void __init swap_setup(void) > > > > > > > > { > > > > > > > > unsigned long megs = totalram_pages >> (20 - PAGE_SHIFT); > > > > > > > > - > > > > > > > > #ifdef CONFIG_SWAP > > > > > > > > - bdi_init(swapper_space.backing_dev_info); > > > > > > > > + int i; > > > > > > > > + > > > > > > > > + for (i = 0; i < MAX_SWAPFILES; i++) { > > > > > > > > + bdi_init(swapper_spaces[i].backing_dev_info); > > > > > > > > + spin_lock_init(&swapper_spaces[i].tree_lock); > > > > > > > > + INIT_LIST_HEAD(&swapper_spaces[i].i_mmap_nonlinear); > > > > > > > > + } > > > > > > > > #endif > > > > > > > > > > > > > > > > /* Use a smaller cluster for small-memory machines */ > > > > > > > > Index: linux/mm/swap_state.c > > > > > > > > =================================================================== > > > > > > > > --- linux.orig/mm/swap_state.c 2013-01-22 09:13:14.000000000 +0800 > > > > > > > > +++ 
linux/mm/swap_state.c 2013-01-22 09:29:29.378977649 +0800 > > > > > > > > @@ -36,12 +36,12 @@ static struct backing_dev_info swap_back > > > > > > > > .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK | BDI_CAP_SWAP_BACKED, > > > > > > > > }; > > > > > > > > > > > > > > > > -struct address_space swapper_space = { > > > > > > > > - .page_tree = RADIX_TREE_INIT(GFP_ATOMIC|__GFP_NOWARN), > > > > > > > > - .tree_lock = __SPIN_LOCK_UNLOCKED(swapper_space.tree_lock), > > > > > > > > - .a_ops = &swap_aops, > > > > > > > > - .i_mmap_nonlinear = LIST_HEAD_INIT(swapper_space.i_mmap_nonlinear), > > > > > > > > - .backing_dev_info = &swap_backing_dev_info, > > > > > > > > +struct address_space swapper_spaces[MAX_SWAPFILES] = { > > > > > > > > + [0 ... MAX_SWAPFILES - 1] = { > > > > > > > > + .page_tree = RADIX_TREE_INIT(GFP_ATOMIC|__GFP_NOWARN), > > > > > > > > + .a_ops = &swap_aops, > > > > > > > > + .backing_dev_info = &swap_backing_dev_info, > > > > > > > > + } > > > > > > > > }; > > > > > > > > > > > > > > > > #define INC_CACHE_INFO(x) do { swap_cache_info.x++; } while (0) > > > > > > > > @@ -53,9 +53,19 @@ static struct { > > > > > > > > unsigned long find_total; > > > > > > > > } swap_cache_info; > > > > > > > > > > > > > > > > +unsigned long total_swapcache_pages(void) > > > > > > > > +{ > > > > > > > > + int i; > > > > > > > > + unsigned long ret = 0; > > > > > > > > + > > > > > > > > + for (i = 0; i < MAX_SWAPFILES; i++) > > > > > > > > + ret += swapper_spaces[i].nrpages; > > > > > > > > + return ret; > > > > > > > > +} > > > > > > > > + > > > > > > > > void show_swap_cache_info(void) > > > > > > > > { > > > > > > > > - printk("%lu pages in swap cache\n", total_swapcache_pages); > > > > > > > > + printk("%lu pages in swap cache\n", total_swapcache_pages()); > > > > > > > > printk("Swap cache stats: add %lu, delete %lu, find %lu/%lu\n", > > > > > > > > swap_cache_info.add_total, swap_cache_info.del_total, > > > > > > > > swap_cache_info.find_success, 
swap_cache_info.find_total); > > > > > > > > @@ -70,23 +80,26 @@ void show_swap_cache_info(void) > > > > > > > > static int __add_to_swap_cache(struct page *page, swp_entry_t entry) > > > > > > > > { > > > > > > > > int error; > > > > > > > > + struct address_space *address_space; > > > > > > > > > > > > > > > > VM_BUG_ON(!PageLocked(page)); > > > > > > > > VM_BUG_ON(PageSwapCache(page)); > > > > > > > > VM_BUG_ON(!PageSwapBacked(page)); > > > > > > > > > > > > > > > > page_cache_get(page); > > > > > > > > - SetPageSwapCache(page); > > > > > > > > set_page_private(page, entry.val); > > > > > > > > + SetPageSwapCache(page); > > > > > > > > > > > > > > Why did you move this line? Is there any special reason? > > > > > > > > > > > > Originally I'm afraid page_mapping() gets invalid page_private(), but I then > > > > > > realized we hold page lock. There are some places we don't hold page lock. > > > > > > either such page isn't swap page or the caller can tolerate race. I forgot > > > > > > removing this change in the patch. But I certainly can be wrong. We can add > > > > > > memory barrier if required. > > > > > > > > > > Yeb. While I reviewed the patch, I was concern about that but I fail to find > > > > > a problem, too. But maybe it's valuable to add comment about that race > > > > > (!PageSwapCache(page) but page_mapping could return swapper_space) in page_mapping. > > > > > > > > Hi Minchan, > > > > > > > > If the race Shaohua mentioned should be PageSwapCache(page) but > > > > page_mapping couldn't return swapper_space since page_mapping() gets > > > > invalid page_private(). > > > > > > Right you are. In such case, it could be a problem. > > > Let's see following case. 
> > > > > > isolate_migratepages_range > > > __isolate_lru_page > > > __add_to_swap_cache > > > set_page_private(page, entry.val) > > > SetPageSwapCache(page) > > > PageSwapCache > > > entry.val = page_private(page); > > > mapping = swap_address_space(entry); > > > > > > In this case, if memory ordering happens, mapping would be dangling pointer, > > > One of the problem by dangling mapping is the page could be passed into > > > shrink_page_list and try to pageout with dangling mapping. Of course, > > > we have many locks until reaching the page_mapping in shrink_page_list so > > > the problem will not happen but we shouldn't depend on such implicit locks > > > instead of explicit memory barrier because we could remove all locks in reclaim > > > path if super hero comes in or new user of page_mapping would do new something > > > to make a problem. :) > > > > > > I will send a patch if anyone doesn't oppose. > > > > That's fine in the case, because page private 0 still returns a valid mapping, > > and we only check mapping here. But I agree this is too subtle. Adding memory > > fence is safer. I had this yesterday if you don't mind: > > > > > > Subject: mm: add memory barrier to prevent SwapCache bit and page private out of order > > > > page_mapping() checks SwapCache bit first and then read page private. Adding > > memory barrier so page private has correct value before SwapCache bit set. 
> > > > Signed-off-by: Shaohua Li <shli@fusionio.com> > > --- > > mm/swap_state.c | 7 +++++-- > > mm/util.c | 6 ++++++ > > 2 files changed, 11 insertions(+), 2 deletions(-) > > > > Index: linux/mm/swap_state.c > > =================================================================== > > --- linux.orig/mm/swap_state.c 2013-01-22 10:12:33.514490665 +0800 > > +++ linux/mm/swap_state.c 2013-01-23 15:42:00.737459987 +0800 > > @@ -89,6 +89,7 @@ static int __add_to_swap_cache(struct pa > > > > page_cache_get(page); > > set_page_private(page, entry.val); > > + smp_wmb(); > > SetPageSwapCache(page); > > > > address_space = swap_address_space(entry); > > @@ -109,8 +110,9 @@ static int __add_to_swap_cache(struct pa > > * So add_to_swap_cache() doesn't returns -EEXIST. > > */ > > VM_BUG_ON(error == -EEXIST); > > - set_page_private(page, 0UL); > > ClearPageSwapCache(page); > > If race happen after clear bit, page_mapping should return NULL against > anonymous page, But here will return &swapper_space. This race always exists (even in upstream). The caller can tolerate it. 
> > + smp_mb__after_clear_bit(); > > + set_page_private(page, 0UL); > > page_cache_release(page); > > } > > > > @@ -146,8 +148,9 @@ void __delete_from_swap_cache(struct pag > > entry.val = page_private(page); > > address_space = swap_address_space(entry); > > radix_tree_delete(&address_space->page_tree, page_private(page)); > > - set_page_private(page, 0); > > ClearPageSwapCache(page); > > + smp_mb__after_clear_bit(); > > + set_page_private(page, 0); > > address_space->nrpages--; > > __dec_zone_page_state(page, NR_FILE_PAGES); > > INC_CACHE_INFO(del_total); > > Index: linux/mm/util.c > > =================================================================== > > --- linux.orig/mm/util.c 2013-01-22 10:11:58.310933234 +0800 > > +++ linux/mm/util.c 2013-01-23 15:50:22.147155111 +0800 > > @@ -389,6 +389,12 @@ struct address_space *page_mapping(struc > > if (unlikely(PageSwapCache(page))) { > > swp_entry_t entry; > > > > + /* > > + * set page_private() first then set SwapCache bit. clearing > > + * the bit first then zero page private. This memory barrier > > + * matches the rule. > > + */ > > + smp_rmb(); > > Why need this? To prevent read SwapCache and page private reorder. -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [patch 2/3 v2]swap: make each swap partition have one address_space 2013-01-24 2:43 ` Shaohua Li 2013-01-24 3:25 ` Simon Jeons @ 2013-01-24 5:19 ` Minchan Kim 2013-01-24 6:14 ` Simon Jeons 2013-01-24 10:24 ` Shaohua Li 1 sibling, 2 replies; 15+ messages in thread From: Minchan Kim @ 2013-01-24 5:19 UTC (permalink / raw) To: Shaohua Li; +Cc: Simon Jeons, linux-mm, akpm, hughd, riel On Thu, Jan 24, 2013 at 10:43:11AM +0800, Shaohua Li wrote: > On Thu, Jan 24, 2013 at 11:22:41AM +0900, Minchan Kim wrote: > > Hi Simon, > > > > On Wed, Jan 23, 2013 at 07:39:56PM -0600, Simon Jeons wrote: > > > On Wed, 2013-01-23 at 17:04 +0900, Minchan Kim wrote: > > > > On Wed, Jan 23, 2013 at 03:36:55PM +0800, Shaohua Li wrote: > > > > > On Wed, Jan 23, 2013 at 03:16:45PM +0900, Minchan Kim wrote: > > > > > > Looks good to me. Below just nitpicks. > > > > > > I saw Andrew already took this into mmotm so I'm not sure he or you will do > > > > > > next spin but anyway, review goes. Just nitpicks and a question. > > > > > > > > > > > > On Tue, Jan 22, 2013 at 10:29:51AM +0800, Shaohua Li wrote: > > > > > > > > > > > > > > When I use several fast SSD to do swap, swapper_space.tree_lock is heavily > > > > > > > contended. This makes each swap partition have one address_space to reduce the > > > > > > > lock contention. There is an array of address_space for swap. The swap entry > > > > > > > type is the index to the array. > > > > > > > > > > > > > > In my test with 3 SSD, this increases the swapout throughput 20%. 
> > > > > > > > > > > > > > V1->V2: simplify code > > > > > > > > > > > > > > Signed-off-by: Shaohua Li <shli@fusionio.com> > > > > > > > > > > > > Acked-by: Minchan Kim <minchan@kernel.org> > > > > > > > > > > > > > --- > > > > > > > fs/proc/meminfo.c | 4 +-- > > > > > > > include/linux/swap.h | 9 ++++---- > > > > > > > mm/memcontrol.c | 4 +-- > > > > > > > mm/mincore.c | 5 ++-- > > > > > > > mm/swap.c | 9 ++++++-- > > > > > > > mm/swap_state.c | 57 ++++++++++++++++++++++++++++++++++----------------- > > > > > > > mm/swapfile.c | 5 ++-- > > > > > > > mm/util.c | 10 ++++++-- > > > > > > > 8 files changed, 68 insertions(+), 35 deletions(-) > > > > > > > > > > > > > > Index: linux/include/linux/swap.h > > > > > > > =================================================================== > > > > > > > --- linux.orig/include/linux/swap.h 2013-01-22 09:13:14.000000000 +0800 > > > > > > > +++ linux/include/linux/swap.h 2013-01-22 09:34:44.923011706 +0800 > > > > > > > @@ -8,7 +8,7 @@ > > > > > > > #include <linux/memcontrol.h> > > > > > > > #include <linux/sched.h> > > > > > > > #include <linux/node.h> > > > > > > > - > > > > > > > +#include <linux/fs.h> > > > > > > > #include <linux/atomic.h> > > > > > > > #include <asm/page.h> > > > > > > > > > > > > > > @@ -330,8 +330,9 @@ int generic_swapfile_activate(struct swa > > > > > > > sector_t *); > > > > > > > > > > > > > > /* linux/mm/swap_state.c */ > > > > > > > -extern struct address_space swapper_space; > > > > > > > -#define total_swapcache_pages swapper_space.nrpages > > > > > > > +extern struct address_space swapper_spaces[]; > > > > > > > +#define swap_address_space(entry) (&swapper_spaces[swp_type(entry)]) > > > > > > > > > > > > How about this naming? 
> > > > > > > > > > > > #define swapper_space(entry) (&swapper_spaces[swp_type(entry)]) > > > > > > > > > > > > > +extern unsigned long total_swapcache_pages(void); > > > > > > > extern void show_swap_cache_info(void); > > > > > > > extern int add_to_swap(struct page *); > > > > > > > extern int add_to_swap_cache(struct page *, swp_entry_t, gfp_t); > > > > > > > @@ -382,7 +383,7 @@ mem_cgroup_uncharge_swapcache(struct pag > > > > > > > > > > > > > > #define nr_swap_pages 0L > > > > > > > #define total_swap_pages 0L > > > > > > > -#define total_swapcache_pages 0UL > > > > > > > +#define total_swapcache_pages() 0UL > > > > > > > > > > > > > > #define si_swapinfo(val) \ > > > > > > > do { (val)->freeswap = (val)->totalswap = 0; } while (0) > > > > > > > Index: linux/mm/memcontrol.c > > > > > > > =================================================================== > > > > > > > --- linux.orig/mm/memcontrol.c 2013-01-22 09:13:14.000000000 +0800 > > > > > > Acked-by: Minchan Kim <minchan@kernel.org> > > > > > > > > > > > > > +++ linux/mm/memcontrol.c 2013-01-22 09:29:29.374977700 +0800 > > > > > > > @@ -6279,7 +6279,7 @@ static struct page *mc_handle_swap_pte(s > > > > > > > * Because lookup_swap_cache() updates some statistics counter, > > > > > > > * we call find_get_page() with swapper_space directly. 
> > > > > > > */ > > > > > > > - page = find_get_page(&swapper_space, ent.val); > > > > > > > + page = find_get_page(swap_address_space(ent), ent.val); > > > > > > > if (do_swap_account) > > > > > > > entry->val = ent.val; > > > > > > > > > > > > > > @@ -6320,7 +6320,7 @@ static struct page *mc_handle_file_pte(s > > > > > > > swp_entry_t swap = radix_to_swp_entry(page); > > > > > > > if (do_swap_account) > > > > > > > *entry = swap; > > > > > > > - page = find_get_page(&swapper_space, swap.val); > > > > > > > + page = find_get_page(swap_address_space(swap), swap.val); > > > > > > > } > > > > > > > #endif > > > > > > > return page; > > > > > > > Index: linux/mm/mincore.c > > > > > > > =================================================================== > > > > > > > --- linux.orig/mm/mincore.c 2013-01-22 09:13:14.000000000 +0800 > > > > > > > +++ linux/mm/mincore.c 2013-01-22 09:29:29.378977649 +0800 > > > > > > > @@ -75,7 +75,7 @@ static unsigned char mincore_page(struct > > > > > > > /* shmem/tmpfs may return swap: account for swapcache page too. 
*/ > > > > > > > if (radix_tree_exceptional_entry(page)) { > > > > > > > swp_entry_t swap = radix_to_swp_entry(page); > > > > > > > - page = find_get_page(&swapper_space, swap.val); > > > > > > > + page = find_get_page(swap_address_space(swap), swap.val); > > > > > > > } > > > > > > > #endif > > > > > > > if (page) { > > > > > > > @@ -135,7 +135,8 @@ static void mincore_pte_range(struct vm_ > > > > > > > } else { > > > > > > > #ifdef CONFIG_SWAP > > > > > > > pgoff = entry.val; > > > > > > > - *vec = mincore_page(&swapper_space, pgoff); > > > > > > > + *vec = mincore_page(swap_address_space(entry), > > > > > > > + pgoff); > > > > > > > #else > > > > > > > WARN_ON(1); > > > > > > > *vec = 1; > > > > > > > Index: linux/mm/swap.c > > > > > > > =================================================================== > > > > > > > --- linux.orig/mm/swap.c 2013-01-22 09:13:14.000000000 +0800 > > > > > > > +++ linux/mm/swap.c 2013-01-22 09:29:29.378977649 +0800 > > > > > > > @@ -855,9 +855,14 @@ EXPORT_SYMBOL(pagevec_lookup_tag); > > > > > > > void __init swap_setup(void) > > > > > > > { > > > > > > > unsigned long megs = totalram_pages >> (20 - PAGE_SHIFT); > > > > > > > - > > > > > > > #ifdef CONFIG_SWAP > > > > > > > - bdi_init(swapper_space.backing_dev_info); > > > > > > > + int i; > > > > > > > + > > > > > > > + for (i = 0; i < MAX_SWAPFILES; i++) { > > > > > > > + bdi_init(swapper_spaces[i].backing_dev_info); > > > > > > > + spin_lock_init(&swapper_spaces[i].tree_lock); > > > > > > > + INIT_LIST_HEAD(&swapper_spaces[i].i_mmap_nonlinear); > > > > > > > + } > > > > > > > #endif > > > > > > > > > > > > > > /* Use a smaller cluster for small-memory machines */ > > > > > > > Index: linux/mm/swap_state.c > > > > > > > =================================================================== > > > > > > > --- linux.orig/mm/swap_state.c 2013-01-22 09:13:14.000000000 +0800 > > > > > > > +++ linux/mm/swap_state.c 2013-01-22 09:29:29.378977649 +0800 > > > > > > > @@ -36,12 +36,12 @@ 
static struct backing_dev_info swap_back > > > > > > > .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK | BDI_CAP_SWAP_BACKED, > > > > > > > }; > > > > > > > > > > > > > > -struct address_space swapper_space = { > > > > > > > - .page_tree = RADIX_TREE_INIT(GFP_ATOMIC|__GFP_NOWARN), > > > > > > > - .tree_lock = __SPIN_LOCK_UNLOCKED(swapper_space.tree_lock), > > > > > > > - .a_ops = &swap_aops, > > > > > > > - .i_mmap_nonlinear = LIST_HEAD_INIT(swapper_space.i_mmap_nonlinear), > > > > > > > - .backing_dev_info = &swap_backing_dev_info, > > > > > > > +struct address_space swapper_spaces[MAX_SWAPFILES] = { > > > > > > > + [0 ... MAX_SWAPFILES - 1] = { > > > > > > > + .page_tree = RADIX_TREE_INIT(GFP_ATOMIC|__GFP_NOWARN), > > > > > > > + .a_ops = &swap_aops, > > > > > > > + .backing_dev_info = &swap_backing_dev_info, > > > > > > > + } > > > > > > > }; > > > > > > > > > > > > > > #define INC_CACHE_INFO(x) do { swap_cache_info.x++; } while (0) > > > > > > > @@ -53,9 +53,19 @@ static struct { > > > > > > > unsigned long find_total; > > > > > > > } swap_cache_info; > > > > > > > > > > > > > > +unsigned long total_swapcache_pages(void) > > > > > > > +{ > > > > > > > + int i; > > > > > > > + unsigned long ret = 0; > > > > > > > + > > > > > > > + for (i = 0; i < MAX_SWAPFILES; i++) > > > > > > > + ret += swapper_spaces[i].nrpages; > > > > > > > + return ret; > > > > > > > +} > > > > > > > + > > > > > > > void show_swap_cache_info(void) > > > > > > > { > > > > > > > - printk("%lu pages in swap cache\n", total_swapcache_pages); > > > > > > > + printk("%lu pages in swap cache\n", total_swapcache_pages()); > > > > > > > printk("Swap cache stats: add %lu, delete %lu, find %lu/%lu\n", > > > > > > > swap_cache_info.add_total, swap_cache_info.del_total, > > > > > > > swap_cache_info.find_success, swap_cache_info.find_total); > > > > > > > @@ -70,23 +80,26 @@ void show_swap_cache_info(void) > > > > > > > static int __add_to_swap_cache(struct page *page, swp_entry_t entry) > > > > > > > { 
> > > > > > > int error; > > > > > > > + struct address_space *address_space; > > > > > > > > > > > > > > VM_BUG_ON(!PageLocked(page)); > > > > > > > VM_BUG_ON(PageSwapCache(page)); > > > > > > > VM_BUG_ON(!PageSwapBacked(page)); > > > > > > > > > > > > > > page_cache_get(page); > > > > > > > - SetPageSwapCache(page); > > > > > > > set_page_private(page, entry.val); > > > > > > > + SetPageSwapCache(page); > > > > > > > > > > > > Why did you move this line? Is there any special reason? > > > > > > > > > > Originally I'm afraid page_mapping() gets invalid page_private(), but I then > > > > > realized we hold page lock. There are some places we don't hold page lock. > > > > > either such page isn't swap page or the caller can tolerate race. I forgot > > > > > removing this change in the patch. But I certainly can be wrong. We can add > > > > > memory barrier if required. > > > > > > > > Yeb. While I reviewed the patch, I was concern about that but I fail to find > > > > a problem, too. But maybe it's valuable to add comment about that race > > > > (!PageSwapCache(page) but page_mapping could return swapper_space) in page_mapping. > > > > > > Hi Minchan, > > > > > > If the race Shaohua mentioned should be PageSwapCache(page) but > > > page_mapping couldn't return swapper_space since page_mapping() gets > > > invalid page_private(). > > > > Right you are. In such case, it could be a problem. > > Let's see following case. > > > > isolate_migratepages_range > > __isolate_lru_page > > __add_to_swap_cache > > set_page_private(page, entry.val) > > SetPageSwapCache(page) > > PageSwapCache > > entry.val = page_private(page); > > mapping = swap_address_space(entry); > > > > In this case, if memory ordering happens, mapping would be dangling pointer, > > One of the problem by dangling mapping is the page could be passed into > > shrink_page_list and try to pageout with dangling mapping. 
Of course, > > we have many locks until reaching the page_mapping in shrink_page_list so > > the problem will not happen but we shouldn't depends on such implicit locks > > instead of explicit memory barrier because we could remove all locks in reclaim > > path if super hero comes in or new user of page_mapping would do new something > > to make a problem. :) > > > > I will send a patch if anyone doesn't oppose. > > That't fine in the case, because page private 0 still return a valide mapping, > and we only check mapping here. But I agree this is too subtle. Adding memory It could try to do many things with wrong address_space so it is very error-prone. > fence is safer. I had this yesterday if you don't mind: Of course. Below just some nitpicks. > > > Subject: mm: add memory to prevent SwapCache bit and page private out of order mm: Get rid of memory reordering of SwapCache and PagePrivate > > page_mapping() checks SwapCache bit first and then read page private. Adding > memory barrier so page private has correct value before SwapCache bit set. Please write down the problem if we don't apply the patch. > > Signed-off-by: Shaohua Li <shli@fusionio.com> Acked-by: Minchan Kim <minchan@kernel.org> > --- > mm/swap_state.c | 7 +++++-- > mm/util.c | 6 ++++++ > 2 files changed, 11 insertions(+), 2 deletions(-) > > Index: linux/mm/swap_state.c > =================================================================== > --- linux.orig/mm/swap_state.c 2013-01-22 10:12:33.514490665 +0800 > +++ linux/mm/swap_state.c 2013-01-23 15:42:00.737459987 +0800 > @@ -89,6 +89,7 @@ static int __add_to_swap_cache(struct pa > > page_cache_get(page); > set_page_private(page, entry.val); > + smp_wmb(); > SetPageSwapCache(page); > > address_space = swap_address_space(entry); > @@ -109,8 +110,9 @@ static int __add_to_swap_cache(struct pa > * So add_to_swap_cache() doesn't returns -EEXIST. 
> */ > VM_BUG_ON(error == -EEXIST); > - set_page_private(page, 0UL); > ClearPageSwapCache(page); > + smp_mb__after_clear_bit(); Could you use smp_wmb() instead of smp_mb__after_clear_bit? Yes. ClearPageSwapCache does clear_bit so it's okay now but we might change it in future so let's not make unnecessary dependency. It's not really hot path. > + set_page_private(page, 0UL); > page_cache_release(page); > } > > @@ -146,8 +148,9 @@ void __delete_from_swap_cache(struct pag > entry.val = page_private(page); > address_space = swap_address_space(entry); > radix_tree_delete(&address_space->page_tree, page_private(page)); > - set_page_private(page, 0); > ClearPageSwapCache(page); > + smp_mb__after_clear_bit(); > + set_page_private(page, 0); > address_space->nrpages--; > __dec_zone_page_state(page, NR_FILE_PAGES); > INC_CACHE_INFO(del_total); > Index: linux/mm/util.c > =================================================================== > --- linux.orig/mm/util.c 2013-01-22 10:11:58.310933234 +0800 > +++ linux/mm/util.c 2013-01-23 15:50:22.147155111 +0800 > @@ -389,6 +389,12 @@ struct address_space *page_mapping(struc > if (unlikely(PageSwapCache(page))) { > swp_entry_t entry; > > + /* > + * set page_private() first then set SwapCache bit. clearing > + * the bit first then zero page private. This memory barrier > + * matches the rule. > + */ > + smp_rmb(); > entry.val = page_private(page); > mapping = swap_address_space(entry); > } else > > -- > To unsubscribe, send a message with 'unsubscribe linux-mm' in > the body to majordomo@kvack.org. For more info on Linux MM, > see: http://www.linux-mm.org/ . > Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> -- Kind regards, Minchan Kim -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [patch 2/3 v2]swap: make each swap partition have one address_space 2013-01-24 5:19 ` Minchan Kim @ 2013-01-24 6:14 ` Simon Jeons 2013-01-24 10:24 ` Shaohua Li 1 sibling, 0 replies; 15+ messages in thread From: Simon Jeons @ 2013-01-24 6:14 UTC (permalink / raw) To: Minchan Kim; +Cc: Shaohua Li, linux-mm, akpm, hughd, riel On Thu, 2013-01-24 at 14:19 +0900, Minchan Kim wrote: > On Thu, Jan 24, 2013 at 10:43:11AM +0800, Shaohua Li wrote: > > On Thu, Jan 24, 2013 at 11:22:41AM +0900, Minchan Kim wrote: > > > Hi Simon, > > > > > > On Wed, Jan 23, 2013 at 07:39:56PM -0600, Simon Jeons wrote: > > > > On Wed, 2013-01-23 at 17:04 +0900, Minchan Kim wrote: > > > > > On Wed, Jan 23, 2013 at 03:36:55PM +0800, Shaohua Li wrote: > > > > > > On Wed, Jan 23, 2013 at 03:16:45PM +0900, Minchan Kim wrote: > > > > > > > Looks good to me. Below just nitpicks. > > > > > > > I saw Andrew already took this into mmotm so I'm not sure he or you will do > > > > > > > next spin but anyway, review goes. Just nitpicks and a question. > > > > > > > > > > > > > > On Tue, Jan 22, 2013 at 10:29:51AM +0800, Shaohua Li wrote: > > > > > > > > > > > > > > > > When I use several fast SSD to do swap, swapper_space.tree_lock is heavily > > > > > > > > contended. This makes each swap partition have one address_space to reduce the > > > > > > > > lock contention. There is an array of address_space for swap. The swap entry > > > > > > > > type is the index to the array. > > > > > > > > > > > > > > > > In my test with 3 SSD, this increases the swapout throughput 20%. 
> > > > > > > > > > > > > > > > V1->V2: simplify code > > > > > > > > > > > > > > > > Signed-off-by: Shaohua Li <shli@fusionio.com> > > > > > > > > > > > > > > Acked-by: Minchan Kim <minchan@kernel.org> > > > > > > > > > > > > > > > --- > > > > > > > > fs/proc/meminfo.c | 4 +-- > > > > > > > > include/linux/swap.h | 9 ++++---- > > > > > > > > mm/memcontrol.c | 4 +-- > > > > > > > > mm/mincore.c | 5 ++-- > > > > > > > > mm/swap.c | 9 ++++++-- > > > > > > > > mm/swap_state.c | 57 ++++++++++++++++++++++++++++++++++----------------- > > > > > > > > mm/swapfile.c | 5 ++-- > > > > > > > > mm/util.c | 10 ++++++-- > > > > > > > > 8 files changed, 68 insertions(+), 35 deletions(-) > > > > > > > > > > > > > > > > Index: linux/include/linux/swap.h > > > > > > > > =================================================================== > > > > > > > > --- linux.orig/include/linux/swap.h 2013-01-22 09:13:14.000000000 +0800 > > > > > > > > +++ linux/include/linux/swap.h 2013-01-22 09:34:44.923011706 +0800 > > > > > > > > @@ -8,7 +8,7 @@ > > > > > > > > #include <linux/memcontrol.h> > > > > > > > > #include <linux/sched.h> > > > > > > > > #include <linux/node.h> > > > > > > > > - > > > > > > > > +#include <linux/fs.h> > > > > > > > > #include <linux/atomic.h> > > > > > > > > #include <asm/page.h> > > > > > > > > > > > > > > > > @@ -330,8 +330,9 @@ int generic_swapfile_activate(struct swa > > > > > > > > sector_t *); > > > > > > > > > > > > > > > > /* linux/mm/swap_state.c */ > > > > > > > > -extern struct address_space swapper_space; > > > > > > > > -#define total_swapcache_pages swapper_space.nrpages > > > > > > > > +extern struct address_space swapper_spaces[]; > > > > > > > > +#define swap_address_space(entry) (&swapper_spaces[swp_type(entry)]) > > > > > > > > > > > > > > How about this naming? 
> > > > > > > > > > > > > > #define swapper_space(entry) (&swapper_spaces[swp_type(entry)]) > > > > > > > > > > > > > > > +extern unsigned long total_swapcache_pages(void); > > > > > > > > extern void show_swap_cache_info(void); > > > > > > > > extern int add_to_swap(struct page *); > > > > > > > > extern int add_to_swap_cache(struct page *, swp_entry_t, gfp_t); > > > > > > > > @@ -382,7 +383,7 @@ mem_cgroup_uncharge_swapcache(struct pag > > > > > > > > > > > > > > > > #define nr_swap_pages 0L > > > > > > > > #define total_swap_pages 0L > > > > > > > > -#define total_swapcache_pages 0UL > > > > > > > > +#define total_swapcache_pages() 0UL > > > > > > > > > > > > > > > > #define si_swapinfo(val) \ > > > > > > > > do { (val)->freeswap = (val)->totalswap = 0; } while (0) > > > > > > > > Index: linux/mm/memcontrol.c > > > > > > > > =================================================================== > > > > > > > > --- linux.orig/mm/memcontrol.c 2013-01-22 09:13:14.000000000 +0800 > > > > > > > Acked-by: Minchan Kim <minchan@kernel.org> > > > > > > > > > > > > > > > +++ linux/mm/memcontrol.c 2013-01-22 09:29:29.374977700 +0800 > > > > > > > > @@ -6279,7 +6279,7 @@ static struct page *mc_handle_swap_pte(s > > > > > > > > * Because lookup_swap_cache() updates some statistics counter, > > > > > > > > * we call find_get_page() with swapper_space directly. 
> > > > > > > > */ > > > > > > > > - page = find_get_page(&swapper_space, ent.val); > > > > > > > > + page = find_get_page(swap_address_space(ent), ent.val); > > > > > > > > if (do_swap_account) > > > > > > > > entry->val = ent.val; > > > > > > > > > > > > > > > > @@ -6320,7 +6320,7 @@ static struct page *mc_handle_file_pte(s > > > > > > > > swp_entry_t swap = radix_to_swp_entry(page); > > > > > > > > if (do_swap_account) > > > > > > > > *entry = swap; > > > > > > > > - page = find_get_page(&swapper_space, swap.val); > > > > > > > > + page = find_get_page(swap_address_space(swap), swap.val); > > > > > > > > } > > > > > > > > #endif > > > > > > > > return page; > > > > > > > > Index: linux/mm/mincore.c > > > > > > > > =================================================================== > > > > > > > > --- linux.orig/mm/mincore.c 2013-01-22 09:13:14.000000000 +0800 > > > > > > > > +++ linux/mm/mincore.c 2013-01-22 09:29:29.378977649 +0800 > > > > > > > > @@ -75,7 +75,7 @@ static unsigned char mincore_page(struct > > > > > > > > /* shmem/tmpfs may return swap: account for swapcache page too. 
*/ > > > > > > > > if (radix_tree_exceptional_entry(page)) { > > > > > > > > swp_entry_t swap = radix_to_swp_entry(page); > > > > > > > > - page = find_get_page(&swapper_space, swap.val); > > > > > > > > + page = find_get_page(swap_address_space(swap), swap.val); > > > > > > > > } > > > > > > > > #endif > > > > > > > > if (page) { > > > > > > > > @@ -135,7 +135,8 @@ static void mincore_pte_range(struct vm_ > > > > > > > > } else { > > > > > > > > #ifdef CONFIG_SWAP > > > > > > > > pgoff = entry.val; > > > > > > > > - *vec = mincore_page(&swapper_space, pgoff); > > > > > > > > + *vec = mincore_page(swap_address_space(entry), > > > > > > > > + pgoff); > > > > > > > > #else > > > > > > > > WARN_ON(1); > > > > > > > > *vec = 1; > > > > > > > > Index: linux/mm/swap.c > > > > > > > > =================================================================== > > > > > > > > --- linux.orig/mm/swap.c 2013-01-22 09:13:14.000000000 +0800 > > > > > > > > +++ linux/mm/swap.c 2013-01-22 09:29:29.378977649 +0800 > > > > > > > > @@ -855,9 +855,14 @@ EXPORT_SYMBOL(pagevec_lookup_tag); > > > > > > > > void __init swap_setup(void) > > > > > > > > { > > > > > > > > unsigned long megs = totalram_pages >> (20 - PAGE_SHIFT); > > > > > > > > - > > > > > > > > #ifdef CONFIG_SWAP > > > > > > > > - bdi_init(swapper_space.backing_dev_info); > > > > > > > > + int i; > > > > > > > > + > > > > > > > > + for (i = 0; i < MAX_SWAPFILES; i++) { > > > > > > > > + bdi_init(swapper_spaces[i].backing_dev_info); > > > > > > > > + spin_lock_init(&swapper_spaces[i].tree_lock); > > > > > > > > + INIT_LIST_HEAD(&swapper_spaces[i].i_mmap_nonlinear); > > > > > > > > + } > > > > > > > > #endif > > > > > > > > > > > > > > > > /* Use a smaller cluster for small-memory machines */ > > > > > > > > Index: linux/mm/swap_state.c > > > > > > > > =================================================================== > > > > > > > > --- linux.orig/mm/swap_state.c 2013-01-22 09:13:14.000000000 +0800 > > > > > > > > +++ 
linux/mm/swap_state.c 2013-01-22 09:29:29.378977649 +0800 > > > > > > > > @@ -36,12 +36,12 @@ static struct backing_dev_info swap_back > > > > > > > > .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK | BDI_CAP_SWAP_BACKED, > > > > > > > > }; > > > > > > > > > > > > > > > > -struct address_space swapper_space = { > > > > > > > > - .page_tree = RADIX_TREE_INIT(GFP_ATOMIC|__GFP_NOWARN), > > > > > > > > - .tree_lock = __SPIN_LOCK_UNLOCKED(swapper_space.tree_lock), > > > > > > > > - .a_ops = &swap_aops, > > > > > > > > - .i_mmap_nonlinear = LIST_HEAD_INIT(swapper_space.i_mmap_nonlinear), > > > > > > > > - .backing_dev_info = &swap_backing_dev_info, > > > > > > > > +struct address_space swapper_spaces[MAX_SWAPFILES] = { > > > > > > > > + [0 ... MAX_SWAPFILES - 1] = { > > > > > > > > + .page_tree = RADIX_TREE_INIT(GFP_ATOMIC|__GFP_NOWARN), > > > > > > > > + .a_ops = &swap_aops, > > > > > > > > + .backing_dev_info = &swap_backing_dev_info, > > > > > > > > + } > > > > > > > > }; > > > > > > > > > > > > > > > > #define INC_CACHE_INFO(x) do { swap_cache_info.x++; } while (0) > > > > > > > > @@ -53,9 +53,19 @@ static struct { > > > > > > > > unsigned long find_total; > > > > > > > > } swap_cache_info; > > > > > > > > > > > > > > > > +unsigned long total_swapcache_pages(void) > > > > > > > > +{ > > > > > > > > + int i; > > > > > > > > + unsigned long ret = 0; > > > > > > > > + > > > > > > > > + for (i = 0; i < MAX_SWAPFILES; i++) > > > > > > > > + ret += swapper_spaces[i].nrpages; > > > > > > > > + return ret; > > > > > > > > +} > > > > > > > > + > > > > > > > > void show_swap_cache_info(void) > > > > > > > > { > > > > > > > > - printk("%lu pages in swap cache\n", total_swapcache_pages); > > > > > > > > + printk("%lu pages in swap cache\n", total_swapcache_pages()); > > > > > > > > printk("Swap cache stats: add %lu, delete %lu, find %lu/%lu\n", > > > > > > > > swap_cache_info.add_total, swap_cache_info.del_total, > > > > > > > > swap_cache_info.find_success, 
swap_cache_info.find_total); > > > > > > > > @@ -70,23 +80,26 @@ void show_swap_cache_info(void) > > > > > > > > static int __add_to_swap_cache(struct page *page, swp_entry_t entry) > > > > > > > > { > > > > > > > > int error; > > > > > > > > + struct address_space *address_space; > > > > > > > > > > > > > > > > VM_BUG_ON(!PageLocked(page)); > > > > > > > > VM_BUG_ON(PageSwapCache(page)); > > > > > > > > VM_BUG_ON(!PageSwapBacked(page)); > > > > > > > > > > > > > > > > page_cache_get(page); > > > > > > > > - SetPageSwapCache(page); > > > > > > > > set_page_private(page, entry.val); > > > > > > > > + SetPageSwapCache(page); > > > > > > > > > > > > > > Why did you move this line? Is there any special reason? > > > > > > > > > > > > Originally I'm afraid page_mapping() gets invalid page_private(), but I then > > > > > > realized we hold page lock. There are some places we don't hold page lock. > > > > > > either such page isn't swap page or the caller can tolerate race. I forgot > > > > > > removing this change in the patch. But I certainly can be wrong. We can add > > > > > > memory barrier if required. > > > > > > > > > > Yeb. While I reviewed the patch, I was concern about that but I fail to find > > > > > a problem, too. But maybe it's valuable to add comment about that race > > > > > (!PageSwapCache(page) but page_mapping could return swapper_space) in page_mapping. > > > > > > > > Hi Minchan, > > > > > > > > If the race Shaohua mentioned should be PageSwapCache(page) but > > > > page_mapping couldn't return swapper_space since page_mapping() gets > > > > invalid page_private(). > > > > > > Right you are. In such case, it could be a problem. > > > Let's see following case. 
> > > > > > isolate_migratepages_range > > > __isolate_lru_page > > > __add_to_swap_cache > > > set_page_private(page, entry.val) > > > SetPageSwapCache(page) > > > PageSwapCache > > > entry.val = page_private(page); > > > mapping = swap_address_space(entry); > > > > > > In this case, if memory ordering happens, mapping would be dangling pointer, > > > One of the problem by dangling mapping is the page could be passed into > > > shrink_page_list and try to pageout with dangling mapping. Of course, > > > we have many locks until reaching the page_mapping in shrink_page_list so > > > the problem will not happen but we shouldn't depends on such implicit locks > > > instead of explicit memory barrier because we could remove all locks in reclaim > > > path if super hero comes in or new user of page_mapping would do new something > > > to make a problem. :) > > > > > > I will send a patch if anyone doesn't oppose. > > > > That't fine in the case, because page private 0 still return a valide mapping, > > and we only check mapping here. But I agree this is too subtle. Adding memory > > It could try to do many things with wrong address_space so it is very error-prone. > > > fence is safer. I had this yesterday if you don't mind: > > Of course. Below just some nitpicks. > > > > > > > Subject: mm: add memory to prevent SwapCache bit and page private out of order > > mm: Get rid of memory reordering of SwapCache and PagePrivate > > > > page_mapping() checks SwapCache bit first and then read page private. Adding > > memory barrier so page private has correct value before SwapCache bit set. > > Please write down the problem if we don't apply the patch. 
> > > > > Signed-off-by: Shaohua Li <shli@fusionio.com> > Acked-by: Minchan Kim <minchan@kernel.org> > > > --- > > mm/swap_state.c | 7 +++++-- > > mm/util.c | 6 ++++++ > > 2 files changed, 11 insertions(+), 2 deletions(-) > > > > Index: linux/mm/swap_state.c > > =================================================================== > > --- linux.orig/mm/swap_state.c 2013-01-22 10:12:33.514490665 +0800 > > +++ linux/mm/swap_state.c 2013-01-23 15:42:00.737459987 +0800 > > @@ -89,6 +89,7 @@ static int __add_to_swap_cache(struct pa > > > > page_cache_get(page); > > set_page_private(page, entry.val); > > + smp_wmb(); > > SetPageSwapCache(page); > > > > address_space = swap_address_space(entry); > > @@ -109,8 +110,9 @@ static int __add_to_swap_cache(struct pa > > * So add_to_swap_cache() doesn't returns -EEXIST. > > */ > > VM_BUG_ON(error == -EEXIST); > > - set_page_private(page, 0UL); > > ClearPageSwapCache(page); > > + smp_mb__after_clear_bit(); > > Could you use smp_wmb() instead of smp_mb__after_clear_bit? > Yes. ClearPageSwapCache does clear_bit so it's okay now but we might change it > in future so let's not make unnecessary dependency. It's not really hot path. > I confuse! Since smp_wmb() equal to smp_mb_after_clear_bit(), then why introduce smp_mb_after_clear_bit(), just hope people know barrier is after clear bit? 
> > > + set_page_private(page, 0UL); > > page_cache_release(page); > > } > > > > @@ -146,8 +148,9 @@ void __delete_from_swap_cache(struct pag > > entry.val = page_private(page); > > address_space = swap_address_space(entry); > > radix_tree_delete(&address_space->page_tree, page_private(page)); > > - set_page_private(page, 0); > > ClearPageSwapCache(page); > > + smp_mb__after_clear_bit(); > > + set_page_private(page, 0); > > address_space->nrpages--; > > __dec_zone_page_state(page, NR_FILE_PAGES); > > INC_CACHE_INFO(del_total); > > Index: linux/mm/util.c > > =================================================================== > > --- linux.orig/mm/util.c 2013-01-22 10:11:58.310933234 +0800 > > +++ linux/mm/util.c 2013-01-23 15:50:22.147155111 +0800 > > @@ -389,6 +389,12 @@ struct address_space *page_mapping(struc > > if (unlikely(PageSwapCache(page))) { > > swp_entry_t entry; > > > > + /* > > + * set page_private() first then set SwapCache bit. clearing > > + * the bit first then zero page private. This memory barrier > > + * matches the rule. > > + */ > > + smp_rmb(); > > entry.val = page_private(page); > > mapping = swap_address_space(entry); > > } else > > > > -- > > To unsubscribe, send a message with 'unsubscribe linux-mm' in > > the body to majordomo@kvack.org. For more info on Linux MM, > > see: http://www.linux-mm.org/ . > > Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> > -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [patch 2/3 v2]swap: make each swap partition have one address_space 2013-01-24 5:19 ` Minchan Kim 2013-01-24 6:14 ` Simon Jeons @ 2013-01-24 10:24 ` Shaohua Li 2013-01-31 21:50 ` Andrew Morton 1 sibling, 1 reply; 15+ messages in thread From: Shaohua Li @ 2013-01-24 10:24 UTC (permalink / raw) To: Minchan Kim; +Cc: Simon Jeons, linux-mm, akpm, hughd, riel On Thu, Jan 24, 2013 at 02:19:10PM +0900, Minchan Kim wrote: > On Thu, Jan 24, 2013 at 10:43:11AM +0800, Shaohua Li wrote: > > On Thu, Jan 24, 2013 at 11:22:41AM +0900, Minchan Kim wrote: > > > Hi Simon, > > > > > > On Wed, Jan 23, 2013 at 07:39:56PM -0600, Simon Jeons wrote: > > > > On Wed, 2013-01-23 at 17:04 +0900, Minchan Kim wrote: > > > > > On Wed, Jan 23, 2013 at 03:36:55PM +0800, Shaohua Li wrote: > > > > > > On Wed, Jan 23, 2013 at 03:16:45PM +0900, Minchan Kim wrote: > > > > > > > Looks good to me. Below just nitpicks. > > > > > > > I saw Andrew already took this into mmotm so I'm not sure he or you will do > > > > > > > next spin but anyway, review goes. Just nitpicks and a question. > > > > > > > > > > > > > > On Tue, Jan 22, 2013 at 10:29:51AM +0800, Shaohua Li wrote: > > > > > > > > > > > > > > > > When I use several fast SSD to do swap, swapper_space.tree_lock is heavily > > > > > > > > contended. This makes each swap partition have one address_space to reduce the > > > > > > > > lock contention. There is an array of address_space for swap. The swap entry > > > > > > > > type is the index to the array. > > > > > > > > > > > > > > > > In my test with 3 SSD, this increases the swapout throughput 20%. 
> > > > > > > > > > > > > > > > V1->V2: simplify code > > > > > > > > > > > > > > > > Signed-off-by: Shaohua Li <shli@fusionio.com> > > > > > > > > > > > > > > Acked-by: Minchan Kim <minchan@kernel.org> > > > > > > > > > > > > > > > --- > > > > > > > > fs/proc/meminfo.c | 4 +-- > > > > > > > > include/linux/swap.h | 9 ++++---- > > > > > > > > mm/memcontrol.c | 4 +-- > > > > > > > > mm/mincore.c | 5 ++-- > > > > > > > > mm/swap.c | 9 ++++++-- > > > > > > > > mm/swap_state.c | 57 ++++++++++++++++++++++++++++++++++----------------- > > > > > > > > mm/swapfile.c | 5 ++-- > > > > > > > > mm/util.c | 10 ++++++-- > > > > > > > > 8 files changed, 68 insertions(+), 35 deletions(-) > > > > > > > > > > > > > > > > Index: linux/include/linux/swap.h > > > > > > > > =================================================================== > > > > > > > > --- linux.orig/include/linux/swap.h 2013-01-22 09:13:14.000000000 +0800 > > > > > > > > +++ linux/include/linux/swap.h 2013-01-22 09:34:44.923011706 +0800 > > > > > > > > @@ -8,7 +8,7 @@ > > > > > > > > #include <linux/memcontrol.h> > > > > > > > > #include <linux/sched.h> > > > > > > > > #include <linux/node.h> > > > > > > > > - > > > > > > > > +#include <linux/fs.h> > > > > > > > > #include <linux/atomic.h> > > > > > > > > #include <asm/page.h> > > > > > > > > > > > > > > > > @@ -330,8 +330,9 @@ int generic_swapfile_activate(struct swa > > > > > > > > sector_t *); > > > > > > > > > > > > > > > > /* linux/mm/swap_state.c */ > > > > > > > > -extern struct address_space swapper_space; > > > > > > > > -#define total_swapcache_pages swapper_space.nrpages > > > > > > > > +extern struct address_space swapper_spaces[]; > > > > > > > > +#define swap_address_space(entry) (&swapper_spaces[swp_type(entry)]) > > > > > > > > > > > > > > How about this naming? 
> > > > > > > > > > > > > > #define swapper_space(entry) (&swapper_spaces[swp_type(entry)]) > > > > > > > > > > > > > > > +extern unsigned long total_swapcache_pages(void); > > > > > > > > extern void show_swap_cache_info(void); > > > > > > > > extern int add_to_swap(struct page *); > > > > > > > > extern int add_to_swap_cache(struct page *, swp_entry_t, gfp_t); > > > > > > > > @@ -382,7 +383,7 @@ mem_cgroup_uncharge_swapcache(struct pag > > > > > > > > > > > > > > > > #define nr_swap_pages 0L > > > > > > > > #define total_swap_pages 0L > > > > > > > > -#define total_swapcache_pages 0UL > > > > > > > > +#define total_swapcache_pages() 0UL > > > > > > > > > > > > > > > > #define si_swapinfo(val) \ > > > > > > > > do { (val)->freeswap = (val)->totalswap = 0; } while (0) > > > > > > > > Index: linux/mm/memcontrol.c > > > > > > > > =================================================================== > > > > > > > > --- linux.orig/mm/memcontrol.c 2013-01-22 09:13:14.000000000 +0800 > > > > > > > Acked-by: Minchan Kim <minchan@kernel.org> > > > > > > > > > > > > > > > +++ linux/mm/memcontrol.c 2013-01-22 09:29:29.374977700 +0800 > > > > > > > > @@ -6279,7 +6279,7 @@ static struct page *mc_handle_swap_pte(s > > > > > > > > * Because lookup_swap_cache() updates some statistics counter, > > > > > > > > * we call find_get_page() with swapper_space directly. 
> > > > > > > > */ > > > > > > > > - page = find_get_page(&swapper_space, ent.val); > > > > > > > > + page = find_get_page(swap_address_space(ent), ent.val); > > > > > > > > if (do_swap_account) > > > > > > > > entry->val = ent.val; > > > > > > > > > > > > > > > > @@ -6320,7 +6320,7 @@ static struct page *mc_handle_file_pte(s > > > > > > > > swp_entry_t swap = radix_to_swp_entry(page); > > > > > > > > if (do_swap_account) > > > > > > > > *entry = swap; > > > > > > > > - page = find_get_page(&swapper_space, swap.val); > > > > > > > > + page = find_get_page(swap_address_space(swap), swap.val); > > > > > > > > } > > > > > > > > #endif > > > > > > > > return page; > > > > > > > > Index: linux/mm/mincore.c > > > > > > > > =================================================================== > > > > > > > > --- linux.orig/mm/mincore.c 2013-01-22 09:13:14.000000000 +0800 > > > > > > > > +++ linux/mm/mincore.c 2013-01-22 09:29:29.378977649 +0800 > > > > > > > > @@ -75,7 +75,7 @@ static unsigned char mincore_page(struct > > > > > > > > /* shmem/tmpfs may return swap: account for swapcache page too. 
*/ > > > > > > > > if (radix_tree_exceptional_entry(page)) { > > > > > > > > swp_entry_t swap = radix_to_swp_entry(page); > > > > > > > > - page = find_get_page(&swapper_space, swap.val); > > > > > > > > + page = find_get_page(swap_address_space(swap), swap.val); > > > > > > > > } > > > > > > > > #endif > > > > > > > > if (page) { > > > > > > > > @@ -135,7 +135,8 @@ static void mincore_pte_range(struct vm_ > > > > > > > > } else { > > > > > > > > #ifdef CONFIG_SWAP > > > > > > > > pgoff = entry.val; > > > > > > > > - *vec = mincore_page(&swapper_space, pgoff); > > > > > > > > + *vec = mincore_page(swap_address_space(entry), > > > > > > > > + pgoff); > > > > > > > > #else > > > > > > > > WARN_ON(1); > > > > > > > > *vec = 1; > > > > > > > > Index: linux/mm/swap.c > > > > > > > > =================================================================== > > > > > > > > --- linux.orig/mm/swap.c 2013-01-22 09:13:14.000000000 +0800 > > > > > > > > +++ linux/mm/swap.c 2013-01-22 09:29:29.378977649 +0800 > > > > > > > > @@ -855,9 +855,14 @@ EXPORT_SYMBOL(pagevec_lookup_tag); > > > > > > > > void __init swap_setup(void) > > > > > > > > { > > > > > > > > unsigned long megs = totalram_pages >> (20 - PAGE_SHIFT); > > > > > > > > - > > > > > > > > #ifdef CONFIG_SWAP > > > > > > > > - bdi_init(swapper_space.backing_dev_info); > > > > > > > > + int i; > > > > > > > > + > > > > > > > > + for (i = 0; i < MAX_SWAPFILES; i++) { > > > > > > > > + bdi_init(swapper_spaces[i].backing_dev_info); > > > > > > > > + spin_lock_init(&swapper_spaces[i].tree_lock); > > > > > > > > + INIT_LIST_HEAD(&swapper_spaces[i].i_mmap_nonlinear); > > > > > > > > + } > > > > > > > > #endif > > > > > > > > > > > > > > > > /* Use a smaller cluster for small-memory machines */ > > > > > > > > Index: linux/mm/swap_state.c > > > > > > > > =================================================================== > > > > > > > > --- linux.orig/mm/swap_state.c 2013-01-22 09:13:14.000000000 +0800 > > > > > > > > +++ 
linux/mm/swap_state.c 2013-01-22 09:29:29.378977649 +0800 > > > > > > > > @@ -36,12 +36,12 @@ static struct backing_dev_info swap_back > > > > > > > > .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK | BDI_CAP_SWAP_BACKED, > > > > > > > > }; > > > > > > > > > > > > > > > > -struct address_space swapper_space = { > > > > > > > > - .page_tree = RADIX_TREE_INIT(GFP_ATOMIC|__GFP_NOWARN), > > > > > > > > - .tree_lock = __SPIN_LOCK_UNLOCKED(swapper_space.tree_lock), > > > > > > > > - .a_ops = &swap_aops, > > > > > > > > - .i_mmap_nonlinear = LIST_HEAD_INIT(swapper_space.i_mmap_nonlinear), > > > > > > > > - .backing_dev_info = &swap_backing_dev_info, > > > > > > > > +struct address_space swapper_spaces[MAX_SWAPFILES] = { > > > > > > > > + [0 ... MAX_SWAPFILES - 1] = { > > > > > > > > + .page_tree = RADIX_TREE_INIT(GFP_ATOMIC|__GFP_NOWARN), > > > > > > > > + .a_ops = &swap_aops, > > > > > > > > + .backing_dev_info = &swap_backing_dev_info, > > > > > > > > + } > > > > > > > > }; > > > > > > > > > > > > > > > > #define INC_CACHE_INFO(x) do { swap_cache_info.x++; } while (0) > > > > > > > > @@ -53,9 +53,19 @@ static struct { > > > > > > > > unsigned long find_total; > > > > > > > > } swap_cache_info; > > > > > > > > > > > > > > > > +unsigned long total_swapcache_pages(void) > > > > > > > > +{ > > > > > > > > + int i; > > > > > > > > + unsigned long ret = 0; > > > > > > > > + > > > > > > > > + for (i = 0; i < MAX_SWAPFILES; i++) > > > > > > > > + ret += swapper_spaces[i].nrpages; > > > > > > > > + return ret; > > > > > > > > +} > > > > > > > > + > > > > > > > > void show_swap_cache_info(void) > > > > > > > > { > > > > > > > > - printk("%lu pages in swap cache\n", total_swapcache_pages); > > > > > > > > + printk("%lu pages in swap cache\n", total_swapcache_pages()); > > > > > > > > printk("Swap cache stats: add %lu, delete %lu, find %lu/%lu\n", > > > > > > > > swap_cache_info.add_total, swap_cache_info.del_total, > > > > > > > > swap_cache_info.find_success, 
swap_cache_info.find_total); > > > > > > > > @@ -70,23 +80,26 @@ void show_swap_cache_info(void) > > > > > > > > static int __add_to_swap_cache(struct page *page, swp_entry_t entry) > > > > > > > > { > > > > > > > > int error; > > > > > > > > + struct address_space *address_space; > > > > > > > > > > > > > > > > VM_BUG_ON(!PageLocked(page)); > > > > > > > > VM_BUG_ON(PageSwapCache(page)); > > > > > > > > VM_BUG_ON(!PageSwapBacked(page)); > > > > > > > > > > > > > > > > page_cache_get(page); > > > > > > > > - SetPageSwapCache(page); > > > > > > > > set_page_private(page, entry.val); > > > > > > > > + SetPageSwapCache(page); > > > > > > > > > > > > > > Why did you move this line? Is there any special reason? > > > > > > > > > > > > Originally I'm afraid page_mapping() gets invalid page_private(), but I then > > > > > > realized we hold page lock. There are some places we don't hold page lock. > > > > > > either such page isn't swap page or the caller can tolerate race. I forgot > > > > > > removing this change in the patch. But I certainly can be wrong. We can add > > > > > > memory barrier if required. > > > > > > > > > > Yeb. While I reviewed the patch, I was concern about that but I fail to find > > > > > a problem, too. But maybe it's valuable to add comment about that race > > > > > (!PageSwapCache(page) but page_mapping could return swapper_space) in page_mapping. > > > > > > > > Hi Minchan, > > > > > > > > If the race Shaohua mentioned should be PageSwapCache(page) but > > > > page_mapping couldn't return swapper_space since page_mapping() gets > > > > invalid page_private(). > > > > > > Right you are. In such case, it could be a problem. > > > Let's see following case. 
> > > > > > isolate_migratepages_range > > > __isolate_lru_page > > > __add_to_swap_cache > > > set_page_private(page, entry.val) > > > SetPageSwapCache(page) > > > PageSwapCache > > > entry.val = page_private(page); > > > mapping = swap_address_space(entry); > > > > > > In this case, if memory ordering happens, mapping would be dangling pointer, > > > One of the problem by dangling mapping is the page could be passed into > > > shrink_page_list and try to pageout with dangling mapping. Of course, > > > we have many locks until reaching the page_mapping in shrink_page_list so > > > the problem will not happen but we shouldn't depends on such implicit locks > > > instead of explicit memory barrier because we could remove all locks in reclaim > > > path if super hero comes in or new user of page_mapping would do new something > > > to make a problem. :) > > > > > > I will send a patch if anyone doesn't oppose. > > > > That't fine in the case, because page private 0 still return a valide mapping, > > and we only check mapping here. But I agree this is too subtle. Adding memory > > It could try to do many things with wrong address_space so it is very error-prone. > > > fence is safer. I had this yesterday if you don't mind: > > Of course. Below just some nitpicks. > > > > > > > Subject: mm: add memory to prevent SwapCache bit and page private out of order > > mm: Get rid of memory reordering of SwapCache and PagePrivate > > > > page_mapping() checks SwapCache bit first and then read page private. Adding > > memory barrier so page private has correct value before SwapCache bit set. > > Please write down the problem if we don't apply the patch. 
> > > > > Signed-off-by: Shaohua Li <shli@fusionio.com> > Acked-by: Minchan Kim <minchan@kernel.org> > > > --- > > mm/swap_state.c | 7 +++++-- > > mm/util.c | 6 ++++++ > > 2 files changed, 11 insertions(+), 2 deletions(-) > > > > Index: linux/mm/swap_state.c > > =================================================================== > > --- linux.orig/mm/swap_state.c 2013-01-22 10:12:33.514490665 +0800 > > +++ linux/mm/swap_state.c 2013-01-23 15:42:00.737459987 +0800 > > @@ -89,6 +89,7 @@ static int __add_to_swap_cache(struct pa > > > > page_cache_get(page); > > set_page_private(page, entry.val); > > + smp_wmb(); > > SetPageSwapCache(page); > > > > address_space = swap_address_space(entry); > > @@ -109,8 +110,9 @@ static int __add_to_swap_cache(struct pa > > * So add_to_swap_cache() doesn't returns -EEXIST. > > */ > > VM_BUG_ON(error == -EEXIST); > > - set_page_private(page, 0UL); > > ClearPageSwapCache(page); > > + smp_mb__after_clear_bit(); > > Could you use smp_wmb() instead of smp_mb__after_clear_bit? > Yes. ClearPageSwapCache does clear_bit so it's okay now but we might change it > in future so let's not make unnecessary dependency. It's not really hot path. Ok, that's fine, not big deal anyway. Updated the patch. Subject: mm: add memory barrier to prevent SwapCache bit and page private out of order page_mapping() checks SwapCache bit first and then read page private. Adding memory barrier so page private has correct value before SwapCache bit set. In some cases, page_mapping() isn't called with page locked. Without doing this, we might get a wrong swap address space with SwapCache bit set. Though I didn't found a problem with this so far (such code typically only checks if the page has mapping or the mapping can be dirty or migrated), this is too subtle and error-prone, so we want to avoid it. 
Signed-off-by: Shaohua Li <shli@fusionio.com> Acked-by: Minchan Kim <minchan@kernel.org> --- mm/swap_state.c | 7 +++++-- mm/util.c | 6 ++++++ 2 files changed, 11 insertions(+), 2 deletions(-) Index: linux/mm/swap_state.c =================================================================== --- linux.orig/mm/swap_state.c 2013-01-22 10:12:33.514490665 +0800 +++ linux/mm/swap_state.c 2013-01-24 18:08:05.149390977 +0800 @@ -89,6 +89,7 @@ static int __add_to_swap_cache(struct pa page_cache_get(page); set_page_private(page, entry.val); + smp_wmb(); SetPageSwapCache(page); address_space = swap_address_space(entry); @@ -109,8 +110,9 @@ static int __add_to_swap_cache(struct pa * So add_to_swap_cache() doesn't returns -EEXIST. */ VM_BUG_ON(error == -EEXIST); - set_page_private(page, 0UL); ClearPageSwapCache(page); + smp_wmb(); + set_page_private(page, 0UL); page_cache_release(page); } @@ -146,8 +148,9 @@ void __delete_from_swap_cache(struct pag entry.val = page_private(page); address_space = swap_address_space(entry); radix_tree_delete(&address_space->page_tree, page_private(page)); - set_page_private(page, 0); ClearPageSwapCache(page); + smp_wmb(); + set_page_private(page, 0); address_space->nrpages--; __dec_zone_page_state(page, NR_FILE_PAGES); INC_CACHE_INFO(del_total); Index: linux/mm/util.c =================================================================== --- linux.orig/mm/util.c 2013-01-22 10:11:58.310933234 +0800 +++ linux/mm/util.c 2013-01-23 15:50:22.147155111 +0800 @@ -389,6 +389,12 @@ struct address_space *page_mapping(struc if (unlikely(PageSwapCache(page))) { swp_entry_t entry; + /* + * set page_private() first then set SwapCache bit. clearing + * the bit first then zero page private. This memory barrier + * matches the rule. + */ + smp_rmb(); entry.val = page_private(page); mapping = swap_address_space(entry); } else -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. 
For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [patch 2/3 v2]swap: make each swap partition have one address_space 2013-01-24 10:24 ` Shaohua Li @ 2013-01-31 21:50 ` Andrew Morton 2013-01-31 23:29 ` Minchan Kim 0 siblings, 1 reply; 15+ messages in thread From: Andrew Morton @ 2013-01-31 21:50 UTC (permalink / raw) To: Shaohua Li; +Cc: Minchan Kim, Simon Jeons, linux-mm, hughd, riel On Thu, 24 Jan 2013 18:24:14 +0800 Shaohua Li <shli@kernel.org> wrote: > Subject: mm: add memory barrier to prevent SwapCache bit and page private out of order > > page_mapping() checks SwapCache bit first and then read page private. Adding > memory barrier so page private has correct value before SwapCache bit set. > > In some cases, page_mapping() isn't called with page locked. Without doing > this, we might get a wrong swap address space with SwapCache bit set. Though I > didn't found a problem with this so far (such code typically only checks if the > page has mapping or the mapping can be dirty or migrated), this is too subtle > and error-prone, so we want to avoid it. > > ... > > --- linux.orig/mm/swap_state.c 2013-01-22 10:12:33.514490665 +0800 > +++ linux/mm/swap_state.c 2013-01-24 18:08:05.149390977 +0800 > @@ -89,6 +89,7 @@ static int __add_to_swap_cache(struct pa > > page_cache_get(page); > set_page_private(page, entry.val); > + smp_wmb(); > SetPageSwapCache(page); SetPageSwapCache() uses set_bit() and arch/x86/include/asm/bitops.h says "This function is atomic and may not be reordered". -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [patch 2/3 v2]swap: make each swap partition have one address_space 2013-01-31 21:50 ` Andrew Morton @ 2013-01-31 23:29 ` Minchan Kim 0 siblings, 0 replies; 15+ messages in thread From: Minchan Kim @ 2013-01-31 23:29 UTC (permalink / raw) To: Andrew Morton; +Cc: Shaohua Li, Simon Jeons, linux-mm, hughd, riel Hi Andrew, On Thu, Jan 31, 2013 at 01:50:42PM -0800, Andrew Morton wrote: > On Thu, 24 Jan 2013 18:24:14 +0800 > Shaohua Li <shli@kernel.org> wrote: > > > Subject: mm: add memory barrier to prevent SwapCache bit and page private out of order > > > > page_mapping() checks SwapCache bit first and then read page private. Adding > > memory barrier so page private has correct value before SwapCache bit set. > > > > In some cases, page_mapping() isn't called with page locked. Without doing > > this, we might get a wrong swap address space with SwapCache bit set. Though I > > didn't found a problem with this so far (such code typically only checks if the > > page has mapping or the mapping can be dirty or migrated), this is too subtle > > and error-prone, so we want to avoid it. > > > > ... > > > > --- linux.orig/mm/swap_state.c 2013-01-22 10:12:33.514490665 +0800 > > +++ linux/mm/swap_state.c 2013-01-24 18:08:05.149390977 +0800 > > @@ -89,6 +89,7 @@ static int __add_to_swap_cache(struct pa > > > > page_cache_get(page); > > set_page_private(page, entry.val); > > + smp_wmb(); > > SetPageSwapCache(page); > > SetPageSwapCache() uses set_bit() and arch/x86/include/asm/bitops.h > says "This function is atomic and may not be reordered". And says below. * Note: there are no guarantees that this function will not be reordered * on non x86 architectures, so if you are writing portable code, * make sure not to rely on its reordering guarantees. -- Kind regards, Minchan Kim -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . 
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 15+ messages in thread
end of thread, other threads:[~2013-01-31 23:29 UTC | newest] Thread overview: 15+ messages (download: mbox.gz follow: Atom feed -- links below jump to the message on this page -- 2013-01-22 2:29 [patch 2/3 v2]swap: make each swap partition have one address_space Shaohua Li 2013-01-22 19:49 ` Rik van Riel 2013-01-23 6:16 ` Minchan Kim 2013-01-23 7:36 ` Shaohua Li 2013-01-23 8:04 ` Minchan Kim 2013-01-24 1:39 ` Simon Jeons 2013-01-24 2:22 ` Minchan Kim 2013-01-24 2:43 ` Shaohua Li 2013-01-24 3:25 ` Simon Jeons 2013-01-24 10:35 ` Shaohua Li 2013-01-24 5:19 ` Minchan Kim 2013-01-24 6:14 ` Simon Jeons 2013-01-24 10:24 ` Shaohua Li 2013-01-31 21:50 ` Andrew Morton 2013-01-31 23:29 ` Minchan Kim
This is a public inbox; see the mirroring instructions for how to clone and mirror all data and code used for this inbox, as well as URLs for NNTP newsgroup(s).