Linux-mm Archive on lore.kernel.org

Linux-mm Archive on lore.kernel.org
 help / color / mirror / Atom feed

* [PATCH] mm: drop "wait" parameter from write_one_page
From: Jeff Layton @ 2017-05-25 10:33 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Ross Zwisler, Jan Kara, Matthew Wilcox, Christoph Hellwig,
	linux-mm, linux-fsdevel

The callers all set it to 1.

Also, make it clear that this function will not set any sort of AS_*
error, and that the caller must do so if necessary. No existing caller
uses this on normal files, so none of them need it.

Also, add __must_check here since, in general, the callers need to
handle an error here in some fashion.

Signed-off-by: Jeff Layton <jlayton@redhat.com>
Reviewed-by: Ross Zwisler <ross.zwisler@linux.intel.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Reviewed-by: Matthew Wilcox <mawilcox@microsoft.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
---
 fs/exofs/dir.c        |  2 +-
 fs/ext2/dir.c         |  2 +-
 fs/jfs/jfs_metapage.c |  4 ++--
 fs/minix/dir.c        |  2 +-
 fs/sysv/dir.c         |  2 +-
 fs/ufs/dir.c          |  2 +-
 include/linux/mm.h    |  2 +-
 mm/page-writeback.c   | 14 +++++++-------
 8 files changed, 15 insertions(+), 15 deletions(-)

diff --git a/fs/exofs/dir.c b/fs/exofs/dir.c
index 8eeb694332fe..98233a97b7b8 100644
--- a/fs/exofs/dir.c
+++ b/fs/exofs/dir.c
@@ -72,7 +72,7 @@ static int exofs_commit_chunk(struct page *page, loff_t pos, unsigned len)
 	set_page_dirty(page);
 
 	if (IS_DIRSYNC(dir))
-		err = write_one_page(page, 1);
+		err = write_one_page(page);
 	else
 		unlock_page(page);
 
diff --git a/fs/ext2/dir.c b/fs/ext2/dir.c
index d9650c9508e4..e2709695b177 100644
--- a/fs/ext2/dir.c
+++ b/fs/ext2/dir.c
@@ -100,7 +100,7 @@ static int ext2_commit_chunk(struct page *page, loff_t pos, unsigned len)
 	}
 
 	if (IS_DIRSYNC(dir)) {
-		err = write_one_page(page, 1);
+		err = write_one_page(page);
 		if (!err)
 			err = sync_inode_metadata(dir, 1);
 	} else {
diff --git a/fs/jfs/jfs_metapage.c b/fs/jfs/jfs_metapage.c
index 489aaa1403e5..744fa3c079e6 100644
--- a/fs/jfs/jfs_metapage.c
+++ b/fs/jfs/jfs_metapage.c
@@ -711,7 +711,7 @@ void force_metapage(struct metapage *mp)
 	get_page(page);
 	lock_page(page);
 	set_page_dirty(page);
-	write_one_page(page, 1);
+	write_one_page(page);
 	clear_bit(META_forcewrite, &mp->flag);
 	put_page(page);
 }
@@ -756,7 +756,7 @@ void release_metapage(struct metapage * mp)
 		set_page_dirty(page);
 		if (test_bit(META_sync, &mp->flag)) {
 			clear_bit(META_sync, &mp->flag);
-			write_one_page(page, 1);
+			write_one_page(page);
 			lock_page(page); /* write_one_page unlocks the page */
 		}
 	} else if (mp->lsn)	/* discard_metapage doesn't remove it */
diff --git a/fs/minix/dir.c b/fs/minix/dir.c
index 7edc9b395700..baa9721f1299 100644
--- a/fs/minix/dir.c
+++ b/fs/minix/dir.c
@@ -57,7 +57,7 @@ static int dir_commit_chunk(struct page *page, loff_t pos, unsigned len)
 		mark_inode_dirty(dir);
 	}
 	if (IS_DIRSYNC(dir))
-		err = write_one_page(page, 1);
+		err = write_one_page(page);
 	else
 		unlock_page(page);
 	return err;
diff --git a/fs/sysv/dir.c b/fs/sysv/dir.c
index 5bdae85ceef7..f5191cb2c947 100644
--- a/fs/sysv/dir.c
+++ b/fs/sysv/dir.c
@@ -45,7 +45,7 @@ static int dir_commit_chunk(struct page *page, loff_t pos, unsigned len)
 		mark_inode_dirty(dir);
 	}
 	if (IS_DIRSYNC(dir))
-		err = write_one_page(page, 1);
+		err = write_one_page(page);
 	else
 		unlock_page(page);
 	return err;
diff --git a/fs/ufs/dir.c b/fs/ufs/dir.c
index de01b8f2aa78..48609f1d9580 100644
--- a/fs/ufs/dir.c
+++ b/fs/ufs/dir.c
@@ -53,7 +53,7 @@ static int ufs_commit_chunk(struct page *page, loff_t pos, unsigned len)
 		mark_inode_dirty(dir);
 	}
 	if (IS_DIRSYNC(dir))
-		err = write_one_page(page, 1);
+		err = write_one_page(page);
 	else
 		unlock_page(page);
 	return err;
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 7cb17c6b97de..ca9c8b27cecb 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2199,7 +2199,7 @@ extern void filemap_map_pages(struct vm_fault *vmf,
 extern int filemap_page_mkwrite(struct vm_fault *vmf);
 
 /* mm/page-writeback.c */
-int write_one_page(struct page *page, int wait);
+int __must_check write_one_page(struct page *page);
 void task_dirty_inc(struct task_struct *tsk);
 
 /* readahead.c */
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 143c1c25d680..b901fe52b153 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -2366,15 +2366,16 @@ int do_writepages(struct address_space *mapping, struct writeback_control *wbc)
 }
 
 /**
- * write_one_page - write out a single page and optionally wait on I/O
+ * write_one_page - write out a single page and wait on I/O
  * @page: the page to write
- * @wait: if true, wait on writeout
  *
  * The page must be locked by the caller and will be unlocked upon return.
  *
- * write_one_page() returns a negative error code if I/O failed.
+ * write_one_page() returns a negative error code if I/O failed. Note that
+ * the address_space is not marked for error. The caller must do this if
+ * needed.
  */
-int write_one_page(struct page *page, int wait)
+int write_one_page(struct page *page)
 {
 	struct address_space *mapping = page->mapping;
 	int ret = 0;
@@ -2385,13 +2386,12 @@ int write_one_page(struct page *page, int wait)
 
 	BUG_ON(!PageLocked(page));
 
-	if (wait)
-		wait_on_page_writeback(page);
+	wait_on_page_writeback(page);
 
 	if (clear_page_dirty_for_io(page)) {
 		get_page(page);
 		ret = mapping->a_ops->writepage(page, &wbc);
-		if (ret == 0 && wait) {
+		if (ret == 0) {
 			wait_on_page_writeback(page);
 			if (PageError(page))
 				ret = -EIO;
-- 
2.9.4

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply related

* [PATCH v2] mm/oom_kill: count global and memory cgroup oom kills
From: Konstantin Khlebnikov @ 2017-05-25 10:28 UTC (permalink / raw)
  To: linux-mm, linux-kernel, Michal Hocko
  Cc: Tetsuo Handa, Andrew Morton, Roman Guschin, David Rientjes

Show count of oom killer invocations in /proc/vmstat and count of
processes killed in memory cgroup in knob "memory.events"
(in memory.oom_control for v1 cgroup).

Also describe difference between "oom" and "oom_kill" in memory
cgroup documentation. Currently oom in memory cgroup kills tasks
iff shortage has happened inside page fault.

These counters help in monitoring oom kills - for now
the only way is grepping for magic words in the kernel log.

Signed-off-by: Konstantin Khlebnikov <khlebnikov@yandex-team.ru>

---

v1: https://lkml.kernel.org/r/149520375057.74196.2843113275800730971.stgit@buzz

v2:
* count all oom kills in /proc/vmstat
* update counter for the cgroup which the task belongs to
---
 Documentation/cgroup-v2.txt   |   20 ++++++++++++++++----
 include/linux/memcontrol.h    |    5 ++++-
 include/linux/vm_event_item.h |    1 +
 mm/memcontrol.c               |    2 ++
 mm/oom_kill.c                 |    5 +++++
 mm/vmstat.c                   |    1 +
 6 files changed, 29 insertions(+), 5 deletions(-)

diff --git a/Documentation/cgroup-v2.txt b/Documentation/cgroup-v2.txt
index dc5e2dcdbef4..738b1c7023ad 100644
--- a/Documentation/cgroup-v2.txt
+++ b/Documentation/cgroup-v2.txt
@@ -826,13 +826,25 @@ PAGE_SIZE multiple when read back.
 
 		The number of times the cgroup's memory usage was
 		about to go over the max boundary.  If direct reclaim
-		fails to bring it down, the OOM killer is invoked.
+		fails to bring it down, the cgroup goes to OOM state.
 
 	  oom
 
-		The number of times the OOM killer has been invoked in
-		the cgroup.  This may not exactly match the number of
-		processes killed but should generally be close.
+		The number of times the cgroup's memory usage
+		reached the limit and allocation was about to fail.
+
+		Depending on context, the result could be invocation of
+		the OOM killer and retrying allocation, or failing allocation.
+
+		A failed allocation could in turn be returned to
+		userspace as -ENOMEM or silently ignored in cases like
+		disk readahead.	 For now OOM in memory cgroup kills
+		tasks iff shortage has happened inside page fault.
+
+	  oom_kill
+
+		The number of processes belonging to this cgroup
+		killed by any kind of OOM killer.
 
   memory.stat
 
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 899949bbb2f9..42296f7001da 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -556,8 +556,11 @@ static inline void mem_cgroup_count_vm_event(struct mm_struct *mm,
 
 	rcu_read_lock();
 	memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
-	if (likely(memcg))
+	if (likely(memcg)) {
 		this_cpu_inc(memcg->stat->events[idx]);
+		if (idx == OOM_KILL)
+			cgroup_file_notify(&memcg->events_file);
+	}
 	rcu_read_unlock();
 }
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h
index d84ae90ccd5c..1707e0a7d943 100644
--- a/include/linux/vm_event_item.h
+++ b/include/linux/vm_event_item.h
@@ -41,6 +41,7 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
 		KSWAPD_LOW_WMARK_HIT_QUICKLY, KSWAPD_HIGH_WMARK_HIT_QUICKLY,
 		PAGEOUTRUN, PGROTATED,
 		DROP_PAGECACHE, DROP_SLAB,
+		OOM_KILL,
 #ifdef CONFIG_NUMA_BALANCING
 		NUMA_PTE_UPDATES,
 		NUMA_HUGE_PTE_UPDATES,
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 94172089f52f..7011ebf2b90e 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -3574,6 +3574,7 @@ static int mem_cgroup_oom_control_read(struct seq_file *sf, void *v)
 
 	seq_printf(sf, "oom_kill_disable %d\n", memcg->oom_kill_disable);
 	seq_printf(sf, "under_oom %d\n", (bool)memcg->under_oom);
+	seq_printf(sf, "oom_kill %lu\n", memcg_sum_events(memcg, OOM_KILL));
 	return 0;
 }
 
@@ -5165,6 +5166,7 @@ static int memory_events_show(struct seq_file *m, void *v)
 	seq_printf(m, "high %lu\n", memcg_sum_events(memcg, MEMCG_HIGH));
 	seq_printf(m, "max %lu\n", memcg_sum_events(memcg, MEMCG_MAX));
 	seq_printf(m, "oom %lu\n", memcg_sum_events(memcg, MEMCG_OOM));
+	seq_printf(m, "oom_kill %lu\n", memcg_sum_events(memcg, OOM_KILL));
 
 	return 0;
 }
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 04c9143a8625..dd30a045ef5b 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -876,6 +876,11 @@ static void oom_kill_process(struct oom_control *oc, const char *message)
 	/* Get a reference to safely compare mm after task_unlock(victim) */
 	mm = victim->mm;
 	mmgrab(mm);
+
+	/* Raise event before sending signal: reaper must see this */
+	count_vm_event(OOM_KILL);
+	mem_cgroup_count_vm_event(mm, OOM_KILL);
+
 	/*
 	 * We should send SIGKILL before setting TIF_MEMDIE in order to prevent
 	 * the OOM victim from depleting the memory reserves from the user
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 76f73670200a..fe80b81a86e0 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1018,6 +1018,7 @@ const char * const vmstat_text[] = {
 
 	"drop_pagecache",
 	"drop_slab",
+	"oom_kill",
 
 #ifdef CONFIG_NUMA_BALANCING
 	"numa_pte_updates",

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply related

* Re: [PATCH] mm/migrate: Fix ref-count handling when !hugepage_migration_supported()
From: Punit Agrawal @ 2017-05-25 10:18 UTC (permalink / raw)
  To: Naoya Horiguchi
  Cc: Andrew Morton, will.deacon@arm.com, catalin.marinas@arm.com,
	manoj.iyer@arm.com, linux-kernel@vger.kernel.org,
	linux-arm-kernel@lists.infradead.org, linux-mm@kvack.org,
	tbaicar@codeaurora.org, timur@qti.qualcomm.com, Joonsoo Kim,
	Wanpeng Li, Christoph Lameter
In-Reply-To: <20170525015927.GA26520@hori1.linux.bs1.fc.nec.co.jp>

Naoya Horiguchi <n-horiguchi@ah.jp.nec.com> writes:

> On Wed, May 24, 2017 at 04:47:28PM +0100, Punit Agrawal wrote:
>> On failing to migrate a page, soft_offline_huge_page() performs the
>> necessary update to the hugepage ref-count. When
>> !hugepage_migration_supported() , unmap_and_move_hugepage() also
>> decrements the page ref-count for the hugepage. The combined behaviour
>> leaves the ref-count in an inconsistent state.
>> 
>> This leads to soft lockups when running the overcommitted hugepage test
>> from mce-tests suite.
>> 
>> Soft offlining pfn 0x83ed600 at process virtual address 0x400000000000
>> soft offline: 0x83ed600: migration failed 1, type
>> 1fffc00000008008 (uptodate|head)
>> INFO: rcu_preempt detected stalls on CPUs/tasks:
>>  Tasks blocked on level-0 rcu_node (CPUs 0-7): P2715
>>   (detected by 7, t=5254 jiffies, g=963, c=962, q=321)
>>   thugetlb_overco R  running task        0  2715   2685 0x00000008
>>   Call trace:
>>   [<ffff000008089f90>] dump_backtrace+0x0/0x268
>>   [<ffff00000808a2d4>] show_stack+0x24/0x30
>>   [<ffff000008100d34>] sched_show_task+0x134/0x180
>>   [<ffff0000081c90fc>] rcu_print_detail_task_stall_rnp+0x54/0x7c
>>   [<ffff00000813cfd4>] rcu_check_callbacks+0xa74/0xb08
>>   [<ffff000008143a3c>] update_process_times+0x34/0x60
>>   [<ffff0000081550e8>] tick_sched_handle.isra.7+0x38/0x70
>>   [<ffff00000815516c>] tick_sched_timer+0x4c/0x98
>>   [<ffff0000081442e0>] __hrtimer_run_queues+0xc0/0x300
>>   [<ffff000008144fa4>] hrtimer_interrupt+0xac/0x228
>>   [<ffff0000089a56d4>] arch_timer_handler_phys+0x3c/0x50
>>   [<ffff00000812f1bc>] handle_percpu_devid_irq+0x8c/0x290
>>   [<ffff0000081297fc>] generic_handle_irq+0x34/0x50
>>   [<ffff000008129f00>] __handle_domain_irq+0x68/0xc0
>>   [<ffff0000080816b4>] gic_handle_irq+0x5c/0xb0
>> 
>> Fix this by dropping the ref-count decrement in
>> unmap_and_move_hugepage() when !hugepage_migration_supported().
>> 
>> Fixes: 32665f2bbfed ("mm/migrate: correct failure handling if !hugepage_migration_support()")
>> Reported-by: Manoj Iyer <manoj.iyer@canonical.com>
>> Signed-off-by: Punit Agrawal <punit.agrawal@arm.com>
>> Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
>> Cc: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
>> Cc: Wanpeng Li <liwanp@linux.vnet.ibm.com>
>> Cc: Christoph Lameter <cl@linux.com>
>> 
>> --
>> Hi Andrew,
>> 
>> We ran into this bug when working towards enabling memory corruption
>> on arm64. The patch was tested on an arm64 platform running v4.12-rc2
>> with the series to enable memory corruption handling[0].
>> 
>> Please consider merging as a fix for the 4.12 release.
>> 
>> Thanks,
>> Punit
>> 
>> [0] https://www.spinics.net/lists/arm-kernel/msg581657.html
>> ---
>>  mm/migrate.c | 4 +---
>>  1 file changed, 1 insertion(+), 3 deletions(-)
>> 
>> diff --git a/mm/migrate.c b/mm/migrate.c
>> index 89a0a1707f4c..187abd1526df 100644
>> --- a/mm/migrate.c
>> +++ b/mm/migrate.c
>> @@ -1201,10 +1201,8 @@ static int unmap_and_move_huge_page(new_page_t get_new_page,
>>  	 * tables or check whether the hugepage is pmd-based or not before
>>  	 * kicking migration.
>>  	 */
>> -	if (!hugepage_migration_supported(page_hstate(hpage))) {
>> -		putback_active_hugepage(hpage);
>
> Thank you for reporting and suggestion, Punit, Manoj.
>
> Simply dropping this putback_active_hugepage() may resume the failure
> counting issue addressed in 32665f2bbfed, so I would recommend to call
> putback_movable_pages() in failure path in soft_offline_huge_page().
>
> @@ -1600,7 +1600,8 @@ static int soft_offline_huge_page(struct page *page, int flags)
>  		 * only one hugepage pointed to by hpage, so we need not
>  		 * run through the pagelist here.
>  		 */
> -		putback_active_hugepage(hpage);
> +		if (!list_empty(&pagelist))
> +			putback_movable_pages(&pagelist);
>  		if (ret > 0)
>  			ret = -EIO;
>  	} else {
>
> Could you check this works for you?

Using this sequence works as well. I'll send out an update shortly.

Thanks

>
> Thanks,
> Naoya Horiguchi
> --
> To unsubscribe, send a message with 'unsubscribe linux-mm' in
> the body to majordomo@kvack.org.  For more info on Linux MM,
> see: http://www.linux-mm.org/ .
> Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply

* Re: [PATCH] mm/migrate: Fix ref-count handling when !hugepage_migration_supported()
From: Punit Agrawal @ 2017-05-25  9:49 UTC (permalink / raw)
  To: Andrew Morton
  Cc: will.deacon, catalin.marinas, manoj.iyer, linux-kernel,
	linux-arm-kernel, linux-mm, tbaicar, timur, Joonsoo Kim,
	Naoya Horiguchi, Wanpeng Li, Christoph Lameter
In-Reply-To: <20170524125610.8fbc644f8fa1cf8175b7757b@linux-foundation.org>

Andrew Morton <akpm@linux-foundation.org> writes:

> On Wed, 24 May 2017 16:47:28 +0100 Punit Agrawal <punit.agrawal@arm.com> wrote:
>
>> On failing to migrate a page, soft_offline_huge_page() performs the
>> necessary update to the hugepage ref-count. When
>> !hugepage_migration_supported() , unmap_and_move_hugepage() also
>> decrements the page ref-count for the hugepage. The combined behaviour
>> leaves the ref-count in an inconsistent state.
>> 
>> This leads to soft lockups when running the overcommitted hugepage test
>> from mce-tests suite.
>> 
>> Soft offlining pfn 0x83ed600 at process virtual address 0x400000000000
>> soft offline: 0x83ed600: migration failed 1, type
>> 1fffc00000008008 (uptodate|head)
>> INFO: rcu_preempt detected stalls on CPUs/tasks:
>>  Tasks blocked on level-0 rcu_node (CPUs 0-7): P2715
>>   (detected by 7, t=5254 jiffies, g=963, c=962, q=321)
>>   thugetlb_overco R  running task        0  2715   2685 0x00000008
>>   Call trace:
>>   [<ffff000008089f90>] dump_backtrace+0x0/0x268
>>   [<ffff00000808a2d4>] show_stack+0x24/0x30
>>   [<ffff000008100d34>] sched_show_task+0x134/0x180
>>   [<ffff0000081c90fc>] rcu_print_detail_task_stall_rnp+0x54/0x7c
>>   [<ffff00000813cfd4>] rcu_check_callbacks+0xa74/0xb08
>>   [<ffff000008143a3c>] update_process_times+0x34/0x60
>>   [<ffff0000081550e8>] tick_sched_handle.isra.7+0x38/0x70
>>   [<ffff00000815516c>] tick_sched_timer+0x4c/0x98
>>   [<ffff0000081442e0>] __hrtimer_run_queues+0xc0/0x300
>>   [<ffff000008144fa4>] hrtimer_interrupt+0xac/0x228
>>   [<ffff0000089a56d4>] arch_timer_handler_phys+0x3c/0x50
>>   [<ffff00000812f1bc>] handle_percpu_devid_irq+0x8c/0x290
>>   [<ffff0000081297fc>] generic_handle_irq+0x34/0x50
>>   [<ffff000008129f00>] __handle_domain_irq+0x68/0xc0
>>   [<ffff0000080816b4>] gic_handle_irq+0x5c/0xb0
>> 
>> Fix this by dropping the ref-count decrement in
>> unmap_and_move_hugepage() when !hugepage_migration_supported().
>> 
>> Fixes: 32665f2bbfed ("mm/migrate: correct failure handling if !hugepage_migration_support()")
>> Reported-by: Manoj Iyer <manoj.iyer@canonical.com>
>> Signed-off-by: Punit Agrawal <punit.agrawal@arm.com>
>> Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
>> Cc: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
>> Cc: Wanpeng Li <liwanp@linux.vnet.ibm.com>
>> Cc: Christoph Lameter <cl@linux.com>
>
> 32665f2bbfed was three years ago.  Do you have any theory as to why
> this took so long to be detected?

This only triggers on systems that enable memory failure handling
(ARCH_SUPPORTS_MEMORY_FAILURE) but not hugepage migration
(!ARCH_ENABLE_HUGEPAGE_MIGRATION).

I imagine this wasn't triggered as there aren't many systems running
this configuration.

> And do you believe a -stable backport is warranted?

I'll defer to Horiguchi-san's judgement here.

>
> --
> To unsubscribe, send a message with 'unsubscribe linux-mm' in
> the body to majordomo@kvack.org.  For more info on Linux MM,
> see: http://www.linux-mm.org/ .
> Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply

* Re: dm ioctl: Restore __GFP_HIGH in copy_params()
From: Michal Hocko @ 2017-05-25  8:58 UTC (permalink / raw)
  To: Mikulas Patocka
  Cc: David Rientjes, Mike Snitzer, Junaid Shahid, Alasdair Kergon,
	Andrew Morton, linux-mm, andreslc, gthelen, vbabka, linux-kernel
In-Reply-To: <alpine.LRH.2.02.1705231236210.20039@file01.intranet.prod.int.rdu2.redhat.com>

On Tue 23-05-17 12:44:18, Mikulas Patocka wrote:
> 
> 
> On Tue, 23 May 2017, Michal Hocko wrote:
> 
> > On Mon 22-05-17 13:35:41, David Rientjes wrote:
> > > On Mon, 22 May 2017, Mike Snitzer wrote:
> > [...]
> > > > While adding the __GFP_NOFAIL flag would serve to document expectations
> > > > I'm left unconvinced that the memory allocator will _not fail_ for an
> > > > order-0 page -- as Mikulas said most ioctls don't need more than 4K.
> > > 
> > > __GFP_NOFAIL would make no sense in kvmalloc() calls, ever, it would never 
> > > fallback to vmalloc :)
> > 
> > Sorry, I could have been more specific. You would have to opencode
> > kvmalloc obviously. It is documented to not support this flag for the
> > reasons you have mentioned above.
> > 
> > > I'm hoping this can get merged during the 4.12 window to fix the broken 
> > > commit d224e9381897.
> > 
> > I obviously disagree. Relying on memory reserves for _correctness_ is
> > clearly broken by design, full stop. But it is dm code and
> > it is the responsibility of the respective maintainers to support this code.
> 
> Block loop device is broken in the same way - it converts block requests 
> to filesystem reads and writes and those FS reads and writes allocate 
> memory.

I do not see how those would depend on __GFP_HIGH. Also, writes are throttled
so the memory shouldn't get full of dirty pages.

> Network block device needs an userspace daemon to perform I/O.

which makes it pretty much unreliable for any forward progress. AFAIR
swap over NBD accesses full memory reserves to overcome this. But that is
merely an exception.

> iSCSI also needs to allocate memory to perform I/O.

Shouldn't it use mempools? I am sorry but I am not familiar with this
area at all.
 
> NFS and other networking filesystems are also broken in the same way (they 
> need to receive a packet to acknowledge a write and packet reception needs 
> to allocate memory).
> 
> So - what should these *broken* drivers do to reduce the possibility of 
> the deadlock?

The IO path has traditionally used mempools to guarantee forward
progress. If this is not an option then the choice is not all that
great. We are throttling memory writers (or dropping packets when memory
is too low) and finally have the OOM killer to free up some memory.
-- 
Michal Hocko
SUSE Labs

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply

* Re: [PATCH] mm/oom_kill: count global and memory cgroup oom kills
From: Konstantin Khlebnikov @ 2017-05-25  8:44 UTC (permalink / raw)
  To: David Rientjes
  Cc: Roman Guschin, linux-mm, Andrew Morton, Tejun Heo, cgroups,
	linux-kernel, Vlastimil Babka, Michal Hocko, hannes
In-Reply-To: <alpine.DEB.2.10.1705241338120.49680@chino.kir.corp.google.com>



On 24.05.2017 23:43, David Rientjes wrote:
> On Tue, 23 May 2017, Konstantin Khlebnikov wrote:
> 
>> This is worth addition. Let's call it "oom_victim" for short.
>>
>> It allows to locate leaky part if they are spread over sub-containers within
>> common limit.
>> But doesn't tell which limit caused this kill. For hierarchical limits this
>> might be not so easy.
>>
>> I think oom_kill better suits for automatic actions - restart affected
>> hierarchy, increase limits, e.t.c.
>> But oom_victim allows to determine container affected by global oom killer.
>>
>> So, probably it's worth to merge them together and increment oom_kill by
>> global killer for victim memcg:
>>
>> 	if (!is_memcg_oom(oc)) {
>> 		count_vm_event(OOM_KILL);
>> 		mem_cgroup_count_vm_event(mm, OOM_KILL);
>> 	} else
>> 		mem_cgroup_event(oc->memcg, OOM_KILL);
>>
> 
> Our complete solution is that we have a complementary
> memory.oom_kill_control that allows users to register for eventfd(2)
> notification when the kernel oom killer kills a victim, but this is
> because we have had complete support for userspace oom handling for years.
> When read, it exports three classes of information:
> 
>   - the "total" (hierarchical) and "local" (memcg specific) number of oom
>     kills for system oom conditions (overcommit),
> 
>   - the "total" and "local" number of oom kills for memcg oom conditions,
>     and
>   
>   - the total number of processes in the hierarchy where an oom victim was
>     reaped successfully and unsuccessfully.
> 
> One benefit of this is that it prevents us from having to scrape the
> kernel log for oom events which has been troublesome in the past, but
> userspace can easily do so when the eventfd triggers for the kill
> notification.
> 

OK. I've decided to simplify this and count kills against the cgroup where the task lived,
like page faults, and to show in vmstat the total count of all kinds of kills.

Simply:
	count_vm_event(OOM_KILL);
	mem_cgroup_count_vm_event(mm, OOM_KILL);

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply

* Re: [PATCH -mm 06/13] block: Increase BIO_MAX_PAGES to PMD size if THP_SWAP enabled
From: Ming Lei @ 2017-05-25  8:42 UTC (permalink / raw)
  To: Huang, Ying
  Cc: Andrew Morton, linux-mm, linux-kernel, Johannes Weiner,
	Minchan Kim, Jens Axboe, Ming Lei, Shaohua Li, linux-block
In-Reply-To: <20170525064635.2832-7-ying.huang@intel.com>

On Thu, May 25, 2017 at 02:46:28PM +0800, Huang, Ying wrote:
> From: Huang Ying <ying.huang@intel.com>
> 
> In this patch, BIO_MAX_PAGES is changed from 256 to HPAGE_PMD_NR if
> CONFIG_THP_SWAP is enabled and HPAGE_PMD_NR > 256.  This is to support
> THP (Transparent Huge Page) swap optimization, where the THP will be
> written to disk as a whole instead of HPAGE_PMD_NR normal pages to batch
> the various operations during swap.  And the page is likely to be
> written to disk to free memory when system memory goes really low, the
> memory pool need to be used to avoid deadlock.
> 
> Signed-off-by: "Huang, Ying" <ying.huang@intel.com>
> Cc: Johannes Weiner <hannes@cmpxchg.org>
> Cc: Minchan Kim <minchan@kernel.org>
> Cc: Jens Axboe <axboe@kernel.dk>
> Cc: Ming Lei <tom.leiming@gmail.com>
> Cc: Shaohua Li <shli@fb.com>
> Cc: linux-block@vger.kernel.org
> ---
>  include/linux/bio.h | 8 ++++++++
>  1 file changed, 8 insertions(+)
> 
> diff --git a/include/linux/bio.h b/include/linux/bio.h
> index d1b04b0e99cf..314796486507 100644
> --- a/include/linux/bio.h
> +++ b/include/linux/bio.h
> @@ -38,7 +38,15 @@
>  #define BIO_BUG_ON
>  #endif
>  
> +#ifdef CONFIG_THP_SWAP
> +#if HPAGE_PMD_NR > 256
> +#define BIO_MAX_PAGES		HPAGE_PMD_NR
> +#else
>  #define BIO_MAX_PAGES		256
> +#endif
> +#else
> +#define BIO_MAX_PAGES		256
> +#endif
>  
>  #define bio_prio(bio)			(bio)->bi_ioprio
>  #define bio_set_prio(bio, prio)		((bio)->bi_ioprio = prio)

Last time we discussed we should use multipage bvec for this usage.

I will rebase the last post on v4.12-rc and kick it off again since
the raid cleanup was just done in v4.11.

	http://marc.info/?t=148453679000002&r=1&w=2

Thanks,
Ming

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply

* Re: [RFC PATCH 2/2] mm, memory_hotplug: drop CONFIG_MOVABLE_NODE
From: Vlastimil Babka @ 2017-05-25  8:41 UTC (permalink / raw)
  To: Michal Hocko
  Cc: linux-mm, Andrew Morton, Mel Gorman, Andrea Arcangeli,
	Jerome Glisse, Reza Arbab, Yasuaki Ishimatsu, qiuxishi,
	Kani Toshimitsu, slaoub, Joonsoo Kim, Andi Kleen, David Rientjes,
	Daniel Kiper, Igor Mammedov, Vitaly Kuznetsov, LKML
In-Reply-To: <20170525062722.GD12721@dhcp22.suse.cz>

On 05/25/2017 08:27 AM, Michal Hocko wrote:
> On Wed 24-05-17 17:17:08, Vlastimil Babka wrote:
>> On 05/24/2017 03:42 PM, Michal Hocko wrote:
> [...]
>>
>> I'd expect stuff might compile and work (run without crash), just in
>> some cases the boot option could be effectively ignored? In that case
>> it's just a matter of documenting the option, possibly also some warning
>> when used, e.g. "node_movable was ignored because CONFIG_FOO is not
>> enabled"?
> 
> Hmm, I can make the cmd parameter available only when
> CONFIG_HAVE_MEMBLOCK_NODE_MAP but I am not sure how helpful it would be.
> AFAIR unrecognized options are just ignored. On the other hand debugging
> why the parameter doesn't do anything might be really frustrating. Here
> is the patch I will put on top of the two posted. Strictly speaking it
> breaks the bisection but switching the order would be kind of pointless
> ifdefery game and I do not see it would matter all that much. I can
> rework if you guys think otherwise though.

Sounds good, thanks!

> ---
> From 4ed5cca9399f9b1e616478160ed5320d3951ec29 Mon Sep 17 00:00:00 2001
> From: Michal Hocko <mhocko@suse.com>
> Date: Wed, 24 May 2017 15:43:49 +0200
> Subject: [PATCH] mm, memory_hotplug: move movable_node to the hotplug proper
> 
> movable_node_is_enabled is defined in memblock proper while it
> is initialized from the memory hotplug proper. This is quite messy
> and it makes a dependency between the two so move movable_node along
> with the helper functions to memory_hotplug.
> 
> To make it more entertaining the kernel parameter is ignored unless
> CONFIG_HAVE_MEMBLOCK_NODE_MAP=y because we do not have the node
> information for each memblock otherwise. So let's warn when the option
> is disabled.
> 
> Signed-off-by: Michal Hocko <mhocko@suse.com>

Acked-by: Vlastimil Babka <vbabka@suse.cz>

> ---
>  include/linux/memblock.h       |  7 -------
>  include/linux/memory_hotplug.h | 10 ++++++++++
>  mm/memblock.c                  |  1 -
>  mm/memory_hotplug.c            |  6 ++++++
>  4 files changed, 16 insertions(+), 8 deletions(-)
> 
> diff --git a/include/linux/memblock.h b/include/linux/memblock.h
> index 9622fb8c101b..071692894254 100644
> --- a/include/linux/memblock.h
> +++ b/include/linux/memblock.h
> @@ -57,8 +57,6 @@ struct memblock {
>  
>  extern struct memblock memblock;
>  extern int memblock_debug;
> -/* If movable_node boot option specified */
> -extern bool movable_node_enabled;
>  
>  #ifdef CONFIG_ARCH_DISCARD_MEMBLOCK
>  #define __init_memblock __meminit
> @@ -171,11 +169,6 @@ static inline bool memblock_is_hotpluggable(struct memblock_region *m)
>  	return m->flags & MEMBLOCK_HOTPLUG;
>  }
>  
> -static inline bool __init_memblock movable_node_is_enabled(void)
> -{
> -	return movable_node_enabled;
> -}
> -
>  static inline bool memblock_is_mirror(struct memblock_region *m)
>  {
>  	return m->flags & MEMBLOCK_MIRROR;
> diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
> index 9e0249d0f5e4..d6e5e63b31d5 100644
> --- a/include/linux/memory_hotplug.h
> +++ b/include/linux/memory_hotplug.h
> @@ -115,6 +115,12 @@ extern void __online_page_free(struct page *page);
>  extern int try_online_node(int nid);
>  
>  extern bool memhp_auto_online;
> +/* If movable_node boot option specified */
> +extern bool movable_node_enabled;
> +static inline bool movable_node_is_enabled(void)
> +{
> +	return movable_node_enabled;
> +}
>  
>  #ifdef CONFIG_MEMORY_HOTREMOVE
>  extern bool is_pageblock_removable_nolock(struct page *page);
> @@ -266,6 +272,10 @@ static inline void put_online_mems(void) {}
>  static inline void mem_hotplug_begin(void) {}
>  static inline void mem_hotplug_done(void) {}
>  
> +static inline bool movable_node_is_enabled(void)
> +{
> +	return false;
> +}
>  #endif /* ! CONFIG_MEMORY_HOTPLUG */
>  
>  #ifdef CONFIG_MEMORY_HOTREMOVE
> diff --git a/mm/memblock.c b/mm/memblock.c
> index 4895f5a6cf7e..8c52fb11510c 100644
> --- a/mm/memblock.c
> +++ b/mm/memblock.c
> @@ -54,7 +54,6 @@ struct memblock memblock __initdata_memblock = {
>  };
>  
>  int memblock_debug __initdata_memblock;
> -bool movable_node_enabled __initdata_memblock = false;
>  static bool system_has_some_mirror __initdata_memblock = false;
>  static int memblock_can_resize __initdata_memblock;
>  static int memblock_memory_in_slab __initdata_memblock = 0;
> diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
> index 2a14f8c18a22..1a148b35e8a3 100644
> --- a/mm/memory_hotplug.c
> +++ b/mm/memory_hotplug.c
> @@ -79,6 +79,8 @@ static struct {
>  #define memhp_lock_acquire()      lock_map_acquire(&mem_hotplug.dep_map)
>  #define memhp_lock_release()      lock_map_release(&mem_hotplug.dep_map)
>  
> +bool movable_node_enabled = false;
> +
>  #ifndef CONFIG_MEMORY_HOTPLUG_DEFAULT_ONLINE
>  bool memhp_auto_online;
>  #else
> @@ -1561,7 +1563,11 @@ check_pages_isolated(unsigned long start_pfn, unsigned long end_pfn)
>  
>  static int __init cmdline_parse_movable_node(char *p)
>  {
> +#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
>  	movable_node_enabled = true;
> +#else
> +	pr_warn("movable_node parameter depends on CONFIG_HAVE_MEMBLOCK_NODE_MAP to work properly\n");
> +#endif
>  	return 0;
>  }
>  early_param("movable_node", cmdline_parse_movable_node);
> 

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply

* Re: [RFC PATCH] mm: fix mlock incorrent event account
From: Michal Hocko @ 2017-05-25  8:13 UTC (permalink / raw)
  To: zhongjiang; +Cc: akpm, vbabka, qiuxishi, linux-mm
In-Reply-To: <1495699179-7566-1-git-send-email-zhongjiang@huawei.com>

On Thu 25-05-17 15:59:39, zhongjiang wrote:
> From: zhong jiang <zhongjiang@huawei.com>
> 
> when clear_page_mlock call, we had finish the page isolate successfully,
> but it fails to increase the UNEVICTABLE_PGMUNLOCKED account.
> 
> The patch add the event account when successful page isolation.

Could you describe _what_ is the problem, how it can be _triggered_
and _how_ serious it is. Is it something that can be triggered from
userspace? The mlock code is really tricky and it is far from trivial
to see whether this is obviously right or a wrong assumption on your
side. Before people go and spend time reviewing it is fair to introduce
them to the problem.

I believe this is not the first time I am giving you this feedback
so I would _really_ appreciate it if you tried harder with the changelog.
It is much simpler to write a patch than review it in many cases.

> Signed-off-by: zhong jiang <zhongjiang@huawei.com>
> ---
>  mm/mlock.c | 1 +
>  1 file changed, 1 insertion(+)
> 
> diff --git a/mm/mlock.c b/mm/mlock.c
> index c483c5c..941930b 100644
> --- a/mm/mlock.c
> +++ b/mm/mlock.c
> @@ -64,6 +64,7 @@ void clear_page_mlock(struct page *page)
>  			    -hpage_nr_pages(page));
>  	count_vm_event(UNEVICTABLE_PGCLEARED);
>  	if (!isolate_lru_page(page)) {
> +		count_vm_event(UNEVICTABLE_PGMUNLOCKED);
>  		putback_lru_page(page);
>  	} else {
>  		/*
> -- 
> 1.8.3.1

-- 
Michal Hocko
SUSE Labs

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply

* [RFC PATCH] mm: fix mlock incorrent event account
From: zhongjiang @ 2017-05-25  7:59 UTC (permalink / raw)
  To: akpm; +Cc: vbabka, mhocko, qiuxishi, linux-mm, zhongjiang

From: zhong jiang <zhongjiang@huawei.com>

When clear_page_mlock() is called and the page is isolated successfully,
the UNEVICTABLE_PGMUNLOCKED event count is not incremented.

This patch adds the event accounting on successful page isolation.

Signed-off-by: zhong jiang <zhongjiang@huawei.com>
---
 mm/mlock.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/mm/mlock.c b/mm/mlock.c
index c483c5c..941930b 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -64,6 +64,7 @@ void clear_page_mlock(struct page *page)
 			    -hpage_nr_pages(page));
 	count_vm_event(UNEVICTABLE_PGCLEARED);
 	if (!isolate_lru_page(page)) {
+		count_vm_event(UNEVICTABLE_PGMUNLOCKED);
 		putback_lru_page(page);
 	} else {
 		/*
-- 
1.8.3.1

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply related

* [PATCH v3] mlock: fix mlock count can not decrease in race condition
From: Yisheng Xie @ 2017-05-25  7:07 UTC (permalink / raw)
  To: akpm
  Cc: vbabka, joern, mgorman, walken, hughd, riel, hannes, mhocko,
	qiuxishi, zhongjiang, guohanjun, wangkefeng.wang, stable,
	linux-kernel, linux-mm

Kefeng reported that when running the following test, the mlock count in
meminfo increases permanently:

 [1] testcase
 linux:~ # cat test_mlockal
 grep Mlocked /proc/meminfo
  for j in `seq 0 10`
  do
 	for i in `seq 4 15`
 	do
 		./p_mlockall >> log &
 	done
 	sleep 0.2
 done
 # wait some time to let mlock counter decrease and 5s may not enough
 sleep 5
 grep Mlocked /proc/meminfo

 linux:~ # cat p_mlockall.c
 #include <sys/mman.h>
 #include <stdlib.h>
 #include <stdio.h>

 #define SPACE_LEN	4096

 int main(int argc, char ** argv)
 {
 	int ret;
 	void *adr = malloc(SPACE_LEN);
 	if (!adr)
 		return -1;

 	ret = mlockall(MCL_CURRENT | MCL_FUTURE);
 	printf("mlcokall ret = %d\n", ret);

 	ret = munlockall();
 	printf("munlcokall ret = %d\n", ret);

 	free(adr);
 	return 0;
 }

In __munlock_pagevec() we should decrement NR_MLOCK for each page where
we clear the PageMlocked flag. Commit 1ebb7cc6a583 ("mm: munlock: batch
NR_MLOCK zone state updates") has introduced a bug where we don't
decrement NR_MLOCK for pages where we clear the flag, but fail to
isolate them from the lru list (e.g. when the pages are on some other
cpu's percpu pagevec). Since PageMlocked stays cleared, the NR_MLOCK
accounting gets permanently disrupted by this.

Fix it by counting the number of pages whose PageMlocked flag is cleared.

Fixes: 1ebb7cc6a583 ("mm: munlock: batch NR_MLOCK zone state updates")
Signed-off-by: Yisheng Xie <xieyisheng1@huawei.com>
Reported-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Tested-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Suggested-by: Vlastimil Babka <vbabka@suse.cz>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Joern Engel <joern@logfs.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Michel Lespinasse <walken@google.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: Xishi Qiu <qiuxishi@huawei.com>
CC: zhongjiang <zhongjiang@huawei.com>
Cc: Hanjun Guo <guohanjun@huawei.com>
Cc: <stable@vger.kernel.org>
---
v2:
 - use delta_munlocked for it doesn't do the increment in fastpath - Vlastimil

v3:
 - change the changelog to make it more clear - Vlastimil

Hi Andrew:
Could you please help to fold this?

Thanks
Yisheng Xie

 mm/mlock.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/mm/mlock.c b/mm/mlock.c
index c483c5c..b562b55 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -284,7 +284,7 @@ static void __munlock_pagevec(struct pagevec *pvec, struct zone *zone)
 {
 	int i;
 	int nr = pagevec_count(pvec);
-	int delta_munlocked;
+	int delta_munlocked = -nr;
 	struct pagevec pvec_putback;
 	int pgrescued = 0;
 
@@ -304,6 +304,8 @@ static void __munlock_pagevec(struct pagevec *pvec, struct zone *zone)
 				continue;
 			else
 				__munlock_isolation_failed(page);
+		} else {
+			delta_munlocked++;
 		}
 
 		/*
@@ -315,7 +317,6 @@ static void __munlock_pagevec(struct pagevec *pvec, struct zone *zone)
 		pagevec_add(&pvec_putback, pvec->pages[i]);
 		pvec->pages[i] = NULL;
 	}
-	delta_munlocked = -nr + pagevec_count(&pvec_putback);
 	__mod_zone_page_state(zone, NR_MLOCK, delta_munlocked);
 	spin_unlock_irq(zone_lru_lock(zone));
 
-- 
1.7.12.4

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply related

* Re: [PATCH 0/6] refine and rename slub sysfs
From: Michal Hocko @ 2017-05-25  6:49 UTC (permalink / raw)
  To: Wei Yang; +Cc: cl, penberg, rientjes, akpm, linux-mm, linux-kernel
In-Reply-To: <20170524152124.GB8445@WeideMacBook-Pro.local>

On Wed 24-05-17 23:21:24, Wei Yang wrote:
> On Wed, May 24, 2017 at 02:03:18PM +0200, Michal Hocko wrote:
> >On Wed 24-05-17 17:54:50, Wei Yang wrote:
> >> On Tue, May 23, 2017 at 08:39:11AM +0200, Michal Hocko wrote:
> >[...]
> >> >Is this worth risking breakage of the userspace which consume this data
> >> >now? Do you have any user space code which will greatly benefit from the
> >> >new data and which couldn't do the same with the current format/output?
> >> >
> >> >If yes this all should be in the changelog.
> >> 
> >> The answer is no.
> >> 
> >> I have the same concern as yours. So this patch set could be divided into two
> >> parts: 1. add some new entry with current name convention, 2. change the name
> >> convention.
> >
> >Who is going to use those new entries and for what purpose? Why do we
> >want to expose even more details of the slab allocator to the userspace.
> >Is the missing information something fundamental that some user space
> >cannot work without it? Seriously these are essential questions you
> >should have answer for _before_ posting the patch and mention all those
> >reasons in the changelog.
> 
> It is me who wants to get more details of the slub behavior.  
> AFAIK, no one else is expecting this.

My point is that whatever the reason is, it _should_ be described
properly. This is a user visible change and we will have a hard time
changing it in the future once there is userspace depending on it. So ask
yourself, is this so useful that the future maintenance will be still
reasonable? Also doesn't this export too much of the internal
implementation details that would make future development harder?
Also make sure to CC linux-api mailing list for future posts which
involve user API visible changes.

Thanks!
-- 
Michal Hocko
SUSE Labs

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply

* Re: [PATCH v2] mlock: fix mlock count can not decrease in race condition
From: Yisheng Xie @ 2017-05-25  6:48 UTC (permalink / raw)
  To: Vlastimil Babka, akpm
  Cc: joern, mgorman, walken, hughd, riel, hannes, mhocko, qiuxishi,
	zhongjiang, guohanjun, wangkefeng.wang, stable, linux-kernel,
	linux-mm
In-Reply-To: <6c19fa2f-36b6-d36b-3b51-7fdfc22e1a5c@suse.cz>

Hi Vlastimil,
Thanks for comment!

On 2017/5/25 14:32, Vlastimil Babka wrote:
> On 05/25/2017 04:13 AM, Yisheng Xie wrote:
>> Kefeng reported that when run the follow test the mlock count
> 
>> in meminfo
>> cannot be decreased:
> 
> "increases permanently."?
Yes, if I am not misunderstanding what you mean.

> 
>>  [1] testcase
>>  linux:~ # cat test_mlockal
>>  grep Mlocked /proc/meminfo
>>   for j in `seq 0 10`
>>   do
>>  	for i in `seq 4 15`
>>  	do
>>  		./p_mlockall >> log &
>>  	done
>>  	sleep 0.2
>>  done
>>  # wait some time to let mlock counter decrease and 5s may not enough
>>  sleep 5
>>  grep Mlocked /proc/meminfo
>>
>>  linux:~ # cat p_mlockall.c
>>  #include <sys/mman.h>
>>  #include <stdlib.h>
>>  #include <stdio.h>
>>
>>  #define SPACE_LEN	4096
>>
>>  int main(int argc, char ** argv)
>>  {
>>  	int ret;
>>  	void *adr = malloc(SPACE_LEN);
>>  	if (!adr)
>>  		return -1;
>>
>>  	ret = mlockall(MCL_CURRENT | MCL_FUTURE);
>>  	printf("mlcokall ret = %d\n", ret);
>>
>>  	ret = munlockall();
>>  	printf("munlcokall ret = %d\n", ret);
>>
>>  	free(adr);
>>  	return 0;
>>  }
>>
>> When __munlock_pagevec, we ClearPageMlock but isolation_failed in race
>> condition, and we do not count these page into delta_munlocked, which cause
>> mlock counter incorrect for we had Clear the PageMlock and cannot count down
>> the number in the feture.
> 
> Can I suggest the following instead:
> 
> In __munlock_pagevec() we should decrement NR_MLOCK for each page where
> we clear the PageMlocked flag. Commit 1ebb7cc6a583 ("mm: munlock: batch
> NR_MLOCK zone state updates") has introduced a bug where we don't
> decrement NR_MLOCK for pages where we clear the flag, but fail to
> isolate them from the lru list (e.g. when the pages are on some other
> cpu's percpu pagevec). Since PageMlocked stays cleared, the NR_MLOCK
> accounting gets permanently disrupted by this.
That's much better and clearer. Should I send another version?

Thanks
Yisheng Xie

> 
>> Fix it by count the number of page whoes PageMlock flag is cleared.
>>
>> Fixes: 1ebb7cc6a583 (" mm: munlock: batch NR_MLOCK zone state updates")
>> Signed-off-by: Yisheng Xie <xieyisheng1@huawei.com>
>> Reported-by: Kefeng Wang <wangkefeng.wang@huawei.com>
>> Tested-by: Kefeng Wang <wangkefeng.wang@huawei.com>
>> Cc: Vlastimil Babka <vbabka@suse.cz>
> 
> Acked-by: Vlastimil Babka <vbabka@suse.cz>
> 
> Thanks!
> 
>> Cc: Joern Engel <joern@logfs.org>
>> Cc: Mel Gorman <mgorman@suse.de>
>> Cc: Michel Lespinasse <walken@google.com>
>> Cc: Hugh Dickins <hughd@google.com>
>> Cc: Rik van Riel <riel@redhat.com>
>> Cc: Johannes Weiner <hannes@cmpxchg.org>
>> Cc: Michal Hocko <mhocko@suse.cz>
>> Cc: Xishi Qiu <qiuxishi@huawei.com>
>> CC: zhongjiang <zhongjiang@huawei.com>
>> Cc: Hanjun Guo <guohanjun@huawei.com>
>> Cc: <stable@vger.kernel.org>
>> ---
>> v2:
>>  - use delta_munlocked for it doesn't do the increment in fastpath - Vlastimil
>>
>> Hi Andrew:
>> Could you please help to fold this?
>>
>> Thanks
>> Yisheng Xie
>>
>>  mm/mlock.c | 5 +++--
>>  1 file changed, 3 insertions(+), 2 deletions(-)
>>
>> diff --git a/mm/mlock.c b/mm/mlock.c
>> index c483c5c..b562b55 100644
>> --- a/mm/mlock.c
>> +++ b/mm/mlock.c
>> @@ -284,7 +284,7 @@ static void __munlock_pagevec(struct pagevec *pvec, struct zone *zone)
>>  {
>>  	int i;
>>  	int nr = pagevec_count(pvec);
>> -	int delta_munlocked;
>> +	int delta_munlocked = -nr;
>>  	struct pagevec pvec_putback;
>>  	int pgrescued = 0;
>>  
>> @@ -304,6 +304,8 @@ static void __munlock_pagevec(struct pagevec *pvec, struct zone *zone)
>>  				continue;
>>  			else
>>  				__munlock_isolation_failed(page);
>> +		} else {
>> +			delta_munlocked++;
>>  		}
>>  
>>  		/*
>> @@ -315,7 +317,6 @@ static void __munlock_pagevec(struct pagevec *pvec, struct zone *zone)
>>  		pagevec_add(&pvec_putback, pvec->pages[i]);
>>  		pvec->pages[i] = NULL;
>>  	}
>> -	delta_munlocked = -nr + pagevec_count(&pvec_putback);
>>  	__mod_zone_page_state(zone, NR_MLOCK, delta_munlocked);
>>  	spin_unlock_irq(zone_lru_lock(zone));
>>  
>>
> 
> 
> .
> 

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply

* [PATCH -mm 13/13] mm, THP, swap: Add THP swapping out fallback counting
From: Huang, Ying @ 2017-05-25  6:46 UTC (permalink / raw)
  To: Andrew Morton
  Cc: linux-mm, linux-kernel, Huang Ying, Johannes Weiner, Minchan Kim,
	Hugh Dickins, Shaohua Li, Rik van Riel, Andrea Arcangeli,
	Kirill A . Shutemov, Michal Hocko
In-Reply-To: <20170525064635.2832-1-ying.huang@intel.com>

From: Huang Ying <ying.huang@intel.com>

When swapping out THP (Transparent Huge Page), instead of swapping out
the THP as a whole, sometimes we have to fall back to splitting the THP
into normal pages before swapping, because no free swap clusters are
available, or the cgroup limit is exceeded, etc.  To count the number of
such fallbacks, a new VM event THP_SWPOUT_FALLBACK is added, and counted
when we fall back to splitting the THP.

Signed-off-by: "Huang, Ying" <ying.huang@intel.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Hugh Dickins <hughd@google.com>
Cc: Shaohua Li <shli@kernel.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: "Kirill A . Shutemov" <kirill.shutemov@linux.intel.com>
Cc: Michal Hocko <mhocko@kernel.org>
---
 include/linux/vm_event_item.h | 1 +
 mm/vmscan.c                   | 3 +++
 mm/vmstat.c                   | 1 +
 3 files changed, 5 insertions(+)

diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h
index 5b5b0f094060..66effbadc9b8 100644
--- a/include/linux/vm_event_item.h
+++ b/include/linux/vm_event_item.h
@@ -85,6 +85,7 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
 		THP_ZERO_PAGE_ALLOC,
 		THP_ZERO_PAGE_ALLOC_FAILED,
 		THP_SWPOUT,
+		THP_SWPOUT_FALLBACK,
 #endif
 #ifdef CONFIG_MEMORY_BALLOON
 		BALLOON_INFLATE,
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 510e709aecd4..0f5a6bfc5e65 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1153,6 +1153,9 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 					if (split_huge_page_to_list(page,
 								    page_list))
 						goto activate_locked;
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+					count_vm_event(THP_SWPOUT_FALLBACK);
+#endif
 					if (!add_to_swap(page))
 						goto activate_locked;
 				}
diff --git a/mm/vmstat.c b/mm/vmstat.c
index ebfd79df1008..9400c915e9a2 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1071,6 +1071,7 @@ const char * const vmstat_text[] = {
 	"thp_zero_page_alloc",
 	"thp_zero_page_alloc_failed",
 	"thp_swpout",
+	"thp_swpout_fallback",
 #endif
 #ifdef CONFIG_MEMORY_BALLOON
 	"balloon_inflate",
-- 
2.11.0

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply related

* [PATCH -mm 12/13] mm, THP, swap: Delay splitting THP after swapped out
From: Huang, Ying @ 2017-05-25  6:46 UTC (permalink / raw)
  To: Andrew Morton
  Cc: linux-mm, linux-kernel, Huang Ying, Johannes Weiner, Minchan Kim,
	Hugh Dickins, Shaohua Li, Rik van Riel, Andrea Arcangeli,
	Kirill A . Shutemov, Michal Hocko
In-Reply-To: <20170525064635.2832-1-ying.huang@intel.com>

From: Huang Ying <ying.huang@intel.com>

In this patch, splitting transparent huge page (THP) during swapping
out is delayed from after adding the THP into the swap cache to after
swapping out finishes.  After the patch, more operations for the
anonymous THP reclaiming, such as writing the THP to the swap device,
removing the THP from the swap cache could be batched.  So that the
performance of anonymous THP swapping out could be improved.

This is the second step for the THP swap support.  The plan is to
delay splitting the THP step by step and avoid splitting the THP
finally.

With the patchset, the swap out throughput improves 42% (from about
5.81GB/s to about 8.25GB/s) in the vm-scalability swap-w-seq test case
with 16 processes.  At the same time, the number of IPIs (which
reflects TLB flushing) is reduced by about 78.9%.  The test is done on
a Xeon E5 v3 system.  The
swap device used is a RAM simulated PMEM (persistent memory) device.
To test the sequential swapping out, the test case creates 8
processes, which sequentially allocate and write to the anonymous
pages until the RAM and part of the swap device is used up.

Signed-off-by: "Huang, Ying" <ying.huang@intel.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Hugh Dickins <hughd@google.com>
Cc: Shaohua Li <shli@kernel.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: "Kirill A . Shutemov" <kirill.shutemov@linux.intel.com>
Cc: Michal Hocko <mhocko@kernel.org>
---
 mm/vmscan.c | 95 +++++++++++++++++++++++++++++++++----------------------------
 1 file changed, 52 insertions(+), 43 deletions(-)

diff --git a/mm/vmscan.c b/mm/vmscan.c
index f7e949ac9756..510e709aecd4 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -535,7 +535,9 @@ static inline int is_page_cache_freeable(struct page *page)
 	 * that isolated the page, the page cache radix tree and
 	 * optional buffer heads at page->private.
 	 */
-	return page_count(page) - page_has_private(page) == 2;
+	int radix_pins = PageTransHuge(page) && PageSwapCache(page) ?
+		HPAGE_PMD_NR : 1;
+	return page_count(page) - page_has_private(page) == 1 + radix_pins;
 }
 
 static int may_write_to_inode(struct inode *inode, struct scan_control *sc)
@@ -665,6 +667,7 @@ static int __remove_mapping(struct address_space *mapping, struct page *page,
 			    bool reclaimed)
 {
 	unsigned long flags;
+	int refcount;
 
 	BUG_ON(!PageLocked(page));
 	BUG_ON(mapping != page_mapping(page));
@@ -695,11 +698,15 @@ static int __remove_mapping(struct address_space *mapping, struct page *page,
 	 * Note that if SetPageDirty is always performed via set_page_dirty,
 	 * and thus under tree_lock, then this ordering is not required.
 	 */
-	if (!page_ref_freeze(page, 2))
+	if (unlikely(PageTransHuge(page)) && PageSwapCache(page))
+		refcount = 1 + HPAGE_PMD_NR;
+	else
+		refcount = 2;
+	if (!page_ref_freeze(page, refcount))
 		goto cannot_free;
 	/* note: atomic_cmpxchg in page_freeze_refs provides the smp_rmb */
 	if (unlikely(PageDirty(page))) {
-		page_ref_unfreeze(page, 2);
+		page_ref_unfreeze(page, refcount);
 		goto cannot_free;
 	}
 
@@ -1121,58 +1128,56 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 		 * Try to allocate it some swap space here.
 		 * Lazyfree page could be freed directly
 		 */
-		if (PageAnon(page) && PageSwapBacked(page) &&
-		    !PageSwapCache(page)) {
-			if (!(sc->gfp_mask & __GFP_IO))
-				goto keep_locked;
-			if (PageTransHuge(page)) {
-				/* cannot split THP, skip it */
-				if (!can_split_huge_page(page, NULL))
-					goto activate_locked;
-				/*
-				 * Split pages without a PMD map right
-				 * away. Chances are some or all of the
-				 * tail pages can be freed without IO.
-				 */
-				if (!compound_mapcount(page) &&
-				    split_huge_page_to_list(page, page_list))
-					goto activate_locked;
-			}
-			if (!add_to_swap(page)) {
-				if (!PageTransHuge(page))
-					goto activate_locked;
-				/* Split THP and swap individual base pages */
-				if (split_huge_page_to_list(page, page_list))
-					goto activate_locked;
-				if (!add_to_swap(page))
-					goto activate_locked;
-			}
-
-			/* XXX: We don't support THP writes */
-			if (PageTransHuge(page) &&
-				  split_huge_page_to_list(page, page_list)) {
-				delete_from_swap_cache(page);
-				goto activate_locked;
-			}
+		if (PageAnon(page) && PageSwapBacked(page)) {
+			if (!PageSwapCache(page)) {
+				if (!(sc->gfp_mask & __GFP_IO))
+					goto keep_locked;
+				if (PageTransHuge(page)) {
+					/* cannot split THP, skip it */
+					if (!can_split_huge_page(page, NULL))
+						goto activate_locked;
+					/*
+					 * Split pages without a PMD map right
+					 * away. Chances are some or all of the
+					 * tail pages can be freed without IO.
+					 */
+					if (!compound_mapcount(page) &&
+					    split_huge_page_to_list(page,
+								    page_list))
+						goto activate_locked;
+				}
+				if (!add_to_swap(page)) {
+					if (!PageTransHuge(page))
+						goto activate_locked;
+					/* Fallback to swap normal pages */
+					if (split_huge_page_to_list(page,
+								    page_list))
+						goto activate_locked;
+					if (!add_to_swap(page))
+						goto activate_locked;
+				}
 
-			may_enter_fs = 1;
+				may_enter_fs = 1;
 
-			/* Adding to swap updated mapping */
-			mapping = page_mapping(page);
+				/* Adding to swap updated mapping */
+				mapping = page_mapping(page);
+			}
 		} else if (unlikely(PageTransHuge(page))) {
 			/* Split file THP */
 			if (split_huge_page_to_list(page, page_list))
 				goto keep_locked;
 		}
 
-		VM_BUG_ON_PAGE(PageTransHuge(page), page);
-
 		/*
 		 * The page is mapped into the page tables of one or more
 		 * processes. Try to unmap it here.
 		 */
 		if (page_mapped(page)) {
-			if (!try_to_unmap(page, ttu_flags | TTU_BATCH_FLUSH)) {
+			enum ttu_flags flags = ttu_flags | TTU_BATCH_FLUSH;
+
+			if (unlikely(PageTransHuge(page)))
+				flags |= TTU_SPLIT_HUGE_PMD;
+			if (!try_to_unmap(page, flags)) {
 				nr_unmap_fail++;
 				goto activate_locked;
 			}
@@ -1311,7 +1316,11 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 		 * Is there need to periodically free_page_list? It would
 		 * appear not as the counts should be low
 		 */
-		list_add(&page->lru, &free_pages);
+		if (unlikely(PageTransHuge(page))) {
+			mem_cgroup_uncharge(page);
+			(*get_compound_page_dtor(page))(page);
+		} else
+			list_add(&page->lru, &free_pages);
 		continue;
 
 activate_locked:
-- 
2.11.0

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply related

* [PATCH -mm 11/13] memcg, THP, swap: Make mem_cgroup_swapout() support THP
From: Huang, Ying @ 2017-05-25  6:46 UTC (permalink / raw)
  To: Andrew Morton
  Cc: linux-mm, linux-kernel, Huang Ying, Johannes Weiner, Minchan Kim,
	Michal Hocko, Andrea Arcangeli, Kirill A . Shutemov
In-Reply-To: <20170525064635.2832-1-ying.huang@intel.com>

From: Huang Ying <ying.huang@intel.com>

This patch makes mem_cgroup_swapout() work for a transparent huge
page (THP), moving the memory cgroup charge from memory to swap for
the whole THP.

This will be used for the THP swap support, where a THP may be
swapped out as a whole to a set of (HPAGE_PMD_NR) contiguous swap
slots on the swap device.

Signed-off-by: "Huang, Ying" <ying.huang@intel.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: "Kirill A . Shutemov" <kirill.shutemov@linux.intel.com>
---
 mm/memcontrol.c | 23 +++++++++++++++--------
 1 file changed, 15 insertions(+), 8 deletions(-)

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 7de1fa07f77d..f520dcadabb5 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -4621,8 +4621,8 @@ static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma,
 
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 /*
- * We don't consider swapping or file mapped pages because THP does not
- * support them for now.
+ * We don't consider PMD mapped swapping or file mapped pages because THP does
+ * not support them for now.
  * Caller should make sure that pmd_trans_huge(pmd) is true.
  */
 static enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma,
@@ -5855,6 +5855,7 @@ static struct mem_cgroup *mem_cgroup_id_get_online(struct mem_cgroup *memcg)
 void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
 {
 	struct mem_cgroup *memcg, *swap_memcg;
+	unsigned int nr_entries;
 	unsigned short oldid;
 
 	VM_BUG_ON_PAGE(PageLRU(page), page);
@@ -5875,19 +5876,24 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
 	 * ancestor for the swap instead and transfer the memory+swap charge.
 	 */
 	swap_memcg = mem_cgroup_id_get_online(memcg);
-	oldid = swap_cgroup_record(entry, mem_cgroup_id(swap_memcg), 1);
+	nr_entries = hpage_nr_pages(page);
+	/* Get references for the tail pages, too */
+	if (nr_entries > 1)
+		mem_cgroup_id_get_many(swap_memcg, nr_entries - 1);
+	oldid = swap_cgroup_record(entry, mem_cgroup_id(swap_memcg),
+				   nr_entries);
 	VM_BUG_ON_PAGE(oldid, page);
-	mem_cgroup_swap_statistics(swap_memcg, 1);
+	mem_cgroup_swap_statistics(swap_memcg, nr_entries);
 
 	page->mem_cgroup = NULL;
 
 	if (!mem_cgroup_is_root(memcg))
-		page_counter_uncharge(&memcg->memory, 1);
+		page_counter_uncharge(&memcg->memory, nr_entries);
 
 	if (memcg != swap_memcg) {
 		if (!mem_cgroup_is_root(swap_memcg))
-			page_counter_charge(&swap_memcg->memsw, 1);
-		page_counter_uncharge(&memcg->memsw, 1);
+			page_counter_charge(&swap_memcg->memsw, nr_entries);
+		page_counter_uncharge(&memcg->memsw, nr_entries);
 	}
 
 	/*
@@ -5897,7 +5903,8 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
 	 * only synchronisation we have for udpating the per-CPU variables.
 	 */
 	VM_BUG_ON(!irqs_disabled());
-	mem_cgroup_charge_statistics(memcg, page, false, -1);
+	mem_cgroup_charge_statistics(memcg, page, PageTransHuge(page),
+				     -nr_entries);
 	memcg_check_events(memcg, page);
 
 	if (!mem_cgroup_is_root(memcg))
-- 
2.11.0

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply related

* [PATCH -mm 10/13] memcg, THP, swap: Avoid to duplicated charge THP in swap cache
From: Huang, Ying @ 2017-05-25  6:46 UTC (permalink / raw)
  To: Andrew Morton
  Cc: linux-mm, linux-kernel, Huang Ying, Johannes Weiner, Minchan Kim,
	Michal Hocko, Andrea Arcangeli, Kirill A . Shutemov
In-Reply-To: <20170525064635.2832-1-ying.huang@intel.com>

From: Huang Ying <ying.huang@intel.com>

For a THP (Transparent Huge Page), tail_page->mem_cgroup is NULL.  So
to check whether the page is charged already, we need to check the
head page.  This was not an issue before because it was impossible for
a THP to be in the swap cache.  But now that splitting the THP is
delayed until after it is swapped out, it is possible.

Signed-off-by: "Huang, Ying" <ying.huang@intel.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: "Kirill A . Shutemov" <kirill.shutemov@linux.intel.com>
---
 mm/memcontrol.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 1f36bb61a6de..7de1fa07f77d 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -5372,7 +5372,7 @@ int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm,
 		 * in turn serializes uncharging.
 		 */
 		VM_BUG_ON_PAGE(!PageLocked(page), page);
-		if (page->mem_cgroup)
+		if (compound_head(page)->mem_cgroup)
 			goto out;
 
 		if (do_swap_account) {
-- 
2.11.0

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply related

* [PATCH -mm 09/13] memcg, THP, swap: Support move mem cgroup charge for THP swapped out
From: Huang, Ying @ 2017-05-25  6:46 UTC (permalink / raw)
  To: Andrew Morton
  Cc: linux-mm, linux-kernel, Huang Ying, Johannes Weiner, Minchan Kim,
	Michal Hocko, Andrea Arcangeli, Kirill A . Shutemov
In-Reply-To: <20170525064635.2832-1-ying.huang@intel.com>

From: Huang Ying <ying.huang@intel.com>

PTE mapped THP (Transparent Huge Page) will be ignored when moving
memory cgroup charge.  But for THP which is in the swap cache, the
memory cgroup charge for the swap of a tail-page may be moved in
current implementation.  That isn't correct, because the swap charge
for all sub-pages of a THP should be moved together.  Following the
processing of the PTE mapped THP, the mem cgroup charge moving for the
swap entry for a tail-page of a THP is ignored too.

Signed-off-by: "Huang, Ying" <ying.huang@intel.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: "Kirill A . Shutemov" <kirill.shutemov@linux.intel.com>
---
 mm/memcontrol.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index c131f7e5ecd1..1f36bb61a6de 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -4606,8 +4606,11 @@ static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma,
 		if (!ret || !target)
 			put_page(page);
 	}
-	/* There is a swap entry and a page doesn't exist or isn't charged */
-	if (ent.val && !ret &&
+	/*
+	 * There is a swap entry and a page doesn't exist or isn't charged.
+	 * But we cannot move a tail-page in a THP.
+	 */
+	if (ent.val && !ret && (!page || !PageTransCompound(page)) &&
 	    mem_cgroup_id(mc.from) == lookup_swap_cgroup_id(ent)) {
 		ret = MC_TARGET_SWAP;
 		if (target)
-- 
2.11.0

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply related

* [PATCH -mm 08/13] mm, THP, swap: Support to split THP for THP swapped out
From: Huang, Ying @ 2017-05-25  6:46 UTC (permalink / raw)
  To: Andrew Morton
  Cc: linux-mm, linux-kernel, Huang Ying, Johannes Weiner, Minchan Kim,
	Hugh Dickins, Shaohua Li, Rik van Riel, Andrea Arcangeli,
	Kirill A . Shutemov
In-Reply-To: <20170525064635.2832-1-ying.huang@intel.com>

From: Huang Ying <ying.huang@intel.com>

After adding swap-out support for THP (Transparent Huge Page), it is
possible that a THP in the swap cache (partly swapped out) needs to be
split.  To split such a THP, the swap cluster backing the THP needs to
be split too, that is, the CLUSTER_FLAG_HUGE flag needs to be cleared
for the swap cluster.  This patch implements that.

Because THP swap writing needs the THP to be kept as a huge page
during writing, the PageWriteback flag is checked before splitting.

Signed-off-by: "Huang, Ying" <ying.huang@intel.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Hugh Dickins <hughd@google.com>
Cc: Shaohua Li <shli@kernel.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: "Kirill A . Shutemov" <kirill.shutemov@linux.intel.com>
---
 include/linux/swap.h |  9 +++++++++
 mm/huge_memory.c     | 10 +++++++++-
 mm/swapfile.c        | 15 +++++++++++++++
 3 files changed, 33 insertions(+), 1 deletion(-)

diff --git a/include/linux/swap.h b/include/linux/swap.h
index ed51d5e699e0..fbe75245971e 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -525,6 +525,15 @@ static inline swp_entry_t get_swap_page(struct page *page)
 
 #endif /* CONFIG_SWAP */
 
+#ifdef CONFIG_THP_SWAP
+extern int split_swap_cluster(swp_entry_t entry);
+#else
+static inline int split_swap_cluster(swp_entry_t entry)
+{
+	return 0;
+}
+#endif
+
 #ifdef CONFIG_MEMCG
 static inline int mem_cgroup_swappiness(struct mem_cgroup *memcg)
 {
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 0eb1251f924a..0aefc90c6573 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2446,6 +2446,9 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
 	VM_BUG_ON_PAGE(!PageLocked(page), page);
 	VM_BUG_ON_PAGE(!PageCompound(page), page);
 
+	if (PageWriteback(page))
+		return -EBUSY;
+
 	if (PageAnon(head)) {
 		/*
 		 * The caller does not necessarily hold an mmap_sem that would
@@ -2523,7 +2526,12 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
 			__dec_node_page_state(page, NR_SHMEM_THPS);
 		spin_unlock(&pgdata->split_queue_lock);
 		__split_huge_page(page, list, flags);
-		ret = 0;
+		if (PageSwapCache(head)) {
+			swp_entry_t entry = { .val = page_private(head) };
+
+			ret = split_swap_cluster(entry);
+		} else
+			ret = 0;
 	} else {
 		if (IS_ENABLED(CONFIG_DEBUG_VM) && mapcount) {
 			pr_alert("total_mapcount: %u, page_count(): %u\n",
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 2a2f5d08f0a9..d4fd80be2e2d 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -1215,6 +1215,21 @@ static void swapcache_free_cluster(swp_entry_t entry)
 		}
 	}
 }
+
+int split_swap_cluster(swp_entry_t entry)
+{
+	struct swap_info_struct *si;
+	struct swap_cluster_info *ci;
+	unsigned long offset = swp_offset(entry);
+
+	si = _swap_info_get(entry);
+	if (!si)
+		return -EBUSY;
+	ci = lock_cluster(si, offset);
+	cluster_clear_huge(ci);
+	unlock_cluster(ci);
+	return 0;
+}
 #else
 static inline void swapcache_free_cluster(swp_entry_t entry)
 {
-- 
2.11.0

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply related

* [PATCH -mm 07/13] mm, THP, swap: Support to write THP to swap device as a whole
From: Huang, Ying @ 2017-05-25  6:46 UTC (permalink / raw)
  To: Andrew Morton
  Cc: linux-mm, linux-kernel, Huang Ying, Johannes Weiner, Minchan Kim,
	Hugh Dickins, Shaohua Li, Rik van Riel, Jens Axboe
In-Reply-To: <20170525064635.2832-1-ying.huang@intel.com>

From: Huang Ying <ying.huang@intel.com>

In this patch, swap writing is enhanced to support writing a THP
(Transparent Huge Page) as a whole.  This is part of the THP swap
optimization and will improve swap write IO performance thanks to
larger, contiguous IOs.

Signed-off-by: "Huang, Ying" <ying.huang@intel.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Hugh Dickins <hughd@google.com>
Cc: Shaohua Li <shli@kernel.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Jens Axboe <axboe@fb.com>
---
 include/linux/page-flags.h    |  4 ++--
 include/linux/vm_event_item.h |  1 +
 mm/page_io.c                  | 21 ++++++++++++++++-----
 mm/vmstat.c                   |  1 +
 4 files changed, 20 insertions(+), 7 deletions(-)

diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index d33e3280c8ad..ba2d470d2d0a 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -303,8 +303,8 @@ PAGEFLAG(OwnerPriv1, owner_priv_1, PF_ANY)
  * Only test-and-set exist for PG_writeback.  The unconditional operators are
  * risky: they bypass page accounting.
  */
-TESTPAGEFLAG(Writeback, writeback, PF_NO_COMPOUND)
-	TESTSCFLAG(Writeback, writeback, PF_NO_COMPOUND)
+TESTPAGEFLAG(Writeback, writeback, PF_NO_TAIL)
+	TESTSCFLAG(Writeback, writeback, PF_NO_TAIL)
 PAGEFLAG(MappedToDisk, mappedtodisk, PF_NO_TAIL)
 
 /* PG_readahead is only used for reads; PG_reclaim is only for writes */
diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h
index d84ae90ccd5c..5b5b0f094060 100644
--- a/include/linux/vm_event_item.h
+++ b/include/linux/vm_event_item.h
@@ -84,6 +84,7 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
 #endif
 		THP_ZERO_PAGE_ALLOC,
 		THP_ZERO_PAGE_ALLOC_FAILED,
+		THP_SWPOUT,
 #endif
 #ifdef CONFIG_MEMORY_BALLOON
 		BALLOON_INFLATE,
diff --git a/mm/page_io.c b/mm/page_io.c
index 23f6d0d3470f..ec5229fb3607 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -27,16 +27,18 @@
 static struct bio *get_swap_bio(gfp_t gfp_flags,
 				struct page *page, bio_end_io_t end_io)
 {
+	int i, nr = hpage_nr_pages(page);
 	struct bio *bio;
 
-	bio = bio_alloc(gfp_flags, 1);
+	bio = bio_alloc(gfp_flags, nr);
 	if (bio) {
 		bio->bi_iter.bi_sector = map_swap_page(page, &bio->bi_bdev);
 		bio->bi_iter.bi_sector <<= PAGE_SHIFT - 9;
 		bio->bi_end_io = end_io;
 
-		bio_add_page(bio, page, PAGE_SIZE, 0);
-		BUG_ON(bio->bi_iter.bi_size != PAGE_SIZE);
+		for (i = 0; i < nr; i++)
+			bio_add_page(bio, page + i, PAGE_SIZE, 0);
+		VM_BUG_ON(bio->bi_iter.bi_size != PAGE_SIZE * nr);
 	}
 	return bio;
 }
@@ -257,6 +259,15 @@ static sector_t swap_page_sector(struct page *page)
 	return (sector_t)__page_file_index(page) << (PAGE_SHIFT - 9);
 }
 
+static inline void count_swpout_vm_event(struct page *page)
+{
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+	if (unlikely(PageTransHuge(page)))
+		count_vm_event(THP_SWPOUT);
+#endif
+	count_vm_events(PSWPOUT, hpage_nr_pages(page));
+}
+
 int __swap_writepage(struct page *page, struct writeback_control *wbc,
 		bio_end_io_t end_write_func)
 {
@@ -308,7 +319,7 @@ int __swap_writepage(struct page *page, struct writeback_control *wbc,
 
 	ret = bdev_write_page(sis->bdev, swap_page_sector(page), page, wbc);
 	if (!ret) {
-		count_vm_event(PSWPOUT);
+		count_swpout_vm_event(page);
 		return 0;
 	}
 
@@ -321,7 +332,7 @@ int __swap_writepage(struct page *page, struct writeback_control *wbc,
 		goto out;
 	}
 	bio->bi_opf = REQ_OP_WRITE | wbc_to_write_flags(wbc);
-	count_vm_event(PSWPOUT);
+	count_swpout_vm_event(page);
 	set_page_writeback(page);
 	unlock_page(page);
 	submit_bio(bio);
diff --git a/mm/vmstat.c b/mm/vmstat.c
index c432e581f9a9..ebfd79df1008 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1070,6 +1070,7 @@ const char * const vmstat_text[] = {
 #endif
 	"thp_zero_page_alloc",
 	"thp_zero_page_alloc_failed",
+	"thp_swpout",
 #endif
 #ifdef CONFIG_MEMORY_BALLOON
 	"balloon_inflate",
-- 
2.11.0

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply related

* [PATCH -mm 06/13] block: Increase BIO_MAX_PAGES to PMD size if THP_SWAP enabled
From: Huang, Ying @ 2017-05-25  6:46 UTC (permalink / raw)
  To: Andrew Morton
  Cc: linux-mm, linux-kernel, Huang Ying, Johannes Weiner, Minchan Kim,
	Jens Axboe, Ming Lei, Shaohua Li, linux-block
In-Reply-To: <20170525064635.2832-1-ying.huang@intel.com>

From: Huang Ying <ying.huang@intel.com>

In this patch, BIO_MAX_PAGES is changed from 256 to HPAGE_PMD_NR if
CONFIG_THP_SWAP is enabled and HPAGE_PMD_NR > 256.  This is to support
the THP (Transparent Huge Page) swap optimization, where the THP will
be written to disk as a whole instead of as HPAGE_PMD_NR normal pages,
to batch the various operations during swap.  And because the page is
likely to be written to disk to free memory when system memory goes
really low, the memory pool needs to be used to avoid deadlock.

Signed-off-by: "Huang, Ying" <ying.huang@intel.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Ming Lei <tom.leiming@gmail.com>
Cc: Shaohua Li <shli@fb.com>
Cc: linux-block@vger.kernel.org
---
 include/linux/bio.h | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/include/linux/bio.h b/include/linux/bio.h
index d1b04b0e99cf..314796486507 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -38,7 +38,15 @@
 #define BIO_BUG_ON
 #endif
 
+#ifdef CONFIG_THP_SWAP
+#if HPAGE_PMD_NR > 256
+#define BIO_MAX_PAGES		HPAGE_PMD_NR
+#else
 #define BIO_MAX_PAGES		256
+#endif
+#else
+#define BIO_MAX_PAGES		256
+#endif
 
 #define bio_prio(bio)			(bio)->bi_ioprio
 #define bio_set_prio(bio, prio)		((bio)->bi_ioprio = prio)
-- 
2.11.0

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply related

* [PATCH -mm 05/13] block, THP: Make block_device_operations.rw_page support THP
From: Huang, Ying @ 2017-05-25  6:46 UTC (permalink / raw)
  To: Andrew Morton
  Cc: linux-mm, linux-kernel, Huang Ying, Johannes Weiner, Minchan Kim,
	Dan Williams, Ross Zwisler, Vishal L Verma, Jens Axboe,
	linux-nvdimm
In-Reply-To: <20170525064635.2832-1-ying.huang@intel.com>

From: Huang Ying <ying.huang@intel.com>

The .rw_page in struct block_device_operations is used by the swap
subsystem to read/write the page contents from/into the corresponding
swap slot in the swap device.  To support the THP (Transparent Huge
Page) swap optimization, .rw_page is enhanced to support
reading/writing a THP if possible.

Signed-off-by: "Huang, Ying" <ying.huang@intel.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Ross Zwisler <ross.zwisler@intel.com>
Cc: Vishal L Verma <vishal.l.verma@intel.com>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: linux-nvdimm@lists.01.org
---
 drivers/block/brd.c           |  6 +++++-
 drivers/block/zram/zram_drv.c |  2 ++
 drivers/nvdimm/btt.c          |  4 +++-
 drivers/nvdimm/pmem.c         | 42 +++++++++++++++++++++++++++++++-----------
 4 files changed, 41 insertions(+), 13 deletions(-)

diff --git a/drivers/block/brd.c b/drivers/block/brd.c
index 57b574f2f66a..4240d2a9dcf9 100644
--- a/drivers/block/brd.c
+++ b/drivers/block/brd.c
@@ -324,7 +324,11 @@ static int brd_rw_page(struct block_device *bdev, sector_t sector,
 		       struct page *page, bool is_write)
 {
 	struct brd_device *brd = bdev->bd_disk->private_data;
-	int err = brd_do_bvec(brd, page, PAGE_SIZE, 0, is_write, sector);
+	int err;
+
+	if (PageTransHuge(page))
+		return -ENOTSUPP;
+	err = brd_do_bvec(brd, page, PAGE_SIZE, 0, is_write, sector);
 	page_endio(page, is_write, err);
 	return err;
 }
diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
index 5f2a862d8e31..09b11286c927 100644
--- a/drivers/block/zram/zram_drv.c
+++ b/drivers/block/zram/zram_drv.c
@@ -1049,6 +1049,8 @@ static int zram_rw_page(struct block_device *bdev, sector_t sector,
 	struct zram *zram;
 	struct bio_vec bv;
 
+	if (PageTransHuge(page))
+		return -ENOTSUPP;
 	zram = bdev->bd_disk->private_data;
 
 	if (!valid_io_request(zram, sector, PAGE_SIZE)) {
diff --git a/drivers/nvdimm/btt.c b/drivers/nvdimm/btt.c
index 983718b8fd9b..46d4a0bd2ae6 100644
--- a/drivers/nvdimm/btt.c
+++ b/drivers/nvdimm/btt.c
@@ -1248,8 +1248,10 @@ static int btt_rw_page(struct block_device *bdev, sector_t sector,
 		struct page *page, bool is_write)
 {
 	struct btt *btt = bdev->bd_disk->private_data;
+	unsigned int len;
 
-	btt_do_bvec(btt, NULL, page, PAGE_SIZE, 0, is_write, sector);
+	len = hpage_nr_pages(page) * PAGE_SIZE;
+	btt_do_bvec(btt, NULL, page, len, 0, is_write, sector);
 	page_endio(page, is_write, 0);
 	return 0;
 }
diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c
index c544d466ea51..e644115d56a7 100644
--- a/drivers/nvdimm/pmem.c
+++ b/drivers/nvdimm/pmem.c
@@ -78,22 +78,40 @@ static int pmem_clear_poison(struct pmem_device *pmem, phys_addr_t offset,
 static void write_pmem(void *pmem_addr, struct page *page,
 		unsigned int off, unsigned int len)
 {
-	void *mem = kmap_atomic(page);
-
-	memcpy_to_pmem(pmem_addr, mem + off, len);
-	kunmap_atomic(mem);
+	unsigned int chunk;
+	void *mem;
+
+	while (len) {
+		mem = kmap_atomic(page);
+		chunk = min_t(unsigned int, len, PAGE_SIZE);
+		memcpy_to_pmem(pmem_addr, mem + off, chunk);
+		kunmap_atomic(mem);
+		len -= chunk;
+		off = 0;
+		page++;
+		pmem_addr += PAGE_SIZE;
+	}
 }
 
 static int read_pmem(struct page *page, unsigned int off,
 		void *pmem_addr, unsigned int len)
 {
+	unsigned int chunk;
 	int rc;
-	void *mem = kmap_atomic(page);
-
-	rc = memcpy_mcsafe(mem + off, pmem_addr, len);
-	kunmap_atomic(mem);
-	if (rc)
-		return -EIO;
+	void *mem;
+
+	while (len) {
+		mem = kmap_atomic(page);
+		chunk = min_t(unsigned int, len, PAGE_SIZE);
+		rc = memcpy_mcsafe(mem + off, pmem_addr, chunk);
+		kunmap_atomic(mem);
+		if (rc)
+			return -EIO;
+		len -= chunk;
+		off = 0;
+		page++;
+		pmem_addr += PAGE_SIZE;
+	}
 	return 0;
 }
 
@@ -184,9 +202,11 @@ static int pmem_rw_page(struct block_device *bdev, sector_t sector,
 		       struct page *page, bool is_write)
 {
 	struct pmem_device *pmem = bdev->bd_queue->queuedata;
+	unsigned int len;
 	int rc;
 
-	rc = pmem_do_bvec(pmem, page, PAGE_SIZE, 0, is_write, sector);
+	len = hpage_nr_pages(page) * PAGE_SIZE;
+	rc = pmem_do_bvec(pmem, page, len, 0, is_write, sector);
 
 	/*
 	 * The ->rw_page interface is subtle and tricky.  The core
-- 
2.11.0

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply related

* [PATCH -mm 04/13] mm, THP, swap: Don't allocate huge cluster for file backed swap device
From: Huang, Ying @ 2017-05-25  6:46 UTC (permalink / raw)
  To: Andrew Morton
  Cc: linux-mm, linux-kernel, Huang Ying, Johannes Weiner, Minchan Kim,
	Hugh Dickins, Shaohua Li, Rik van Riel
In-Reply-To: <20170525064635.2832-1-ying.huang@intel.com>

From: Huang Ying <ying.huang@intel.com>

It's hard to write a whole transparent huge page (THP) to a file
backed swap device during swapping out and the file backed swap device
isn't very popular.  So the huge cluster allocation for the file
backed swap device is disabled.

Signed-off-by: "Huang, Ying" <ying.huang@intel.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Hugh Dickins <hughd@google.com>
Cc: Shaohua Li <shli@kernel.org>
Cc: Rik van Riel <riel@redhat.com>
---
 mm/swapfile.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/mm/swapfile.c b/mm/swapfile.c
index bd0f38f31d3d..2a2f5d08f0a9 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -947,9 +947,10 @@ int get_swap_pages(int n_goal, bool cluster, swp_entry_t swp_entries[])
 			spin_unlock(&si->lock);
 			goto nextsi;
 		}
-		if (cluster)
-			n_ret = swap_alloc_cluster(si, swp_entries);
-		else
+		if (cluster) {
+			if (!(si->flags & SWP_FILE))
+				n_ret = swap_alloc_cluster(si, swp_entries);
+		} else
 			n_ret = scan_swap_map_slots(si, SWAP_HAS_CACHE,
 						    n_goal, swp_entries);
 		spin_unlock(&si->lock);
-- 
2.11.0

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply related

* [PATCH -mm 03/13] mm, THP, swap: Make reuse_swap_page() works for THP swapped out
From: Huang, Ying @ 2017-05-25  6:46 UTC (permalink / raw)
  To: Andrew Morton
  Cc: linux-mm, linux-kernel, Huang Ying, Johannes Weiner, Minchan Kim,
	Hugh Dickins, Shaohua Li, Rik van Riel, Andrea Arcangeli,
	Kirill A . Shutemov
In-Reply-To: <20170525064635.2832-1-ying.huang@intel.com>

From: Huang Ying <ying.huang@intel.com>

After supporting delayed THP (Transparent Huge Page) splitting after
swap-out, it is possible that some page table mappings of the THP
are turned into swap entries.  So reuse_swap_page() needs to check the
swap count in addition to the map count as before.  This patch does
that.

In the huge PMD write protect fault handler, in addition to the page
map count, the swap count needs to be checked too, so the page lock
needs to be acquired too when calling reuse_swap_page() in addition to
the page table lock.

Signed-off-by: "Huang, Ying" <ying.huang@intel.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Hugh Dickins <hughd@google.com>
Cc: Shaohua Li <shli@kernel.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: "Kirill A . Shutemov" <kirill.shutemov@linux.intel.com>
---
 include/linux/swap.h |   4 +-
 mm/huge_memory.c     |  16 +++++++-
 mm/memory.c          |   6 +--
 mm/swapfile.c        | 102 ++++++++++++++++++++++++++++++++++++++++++++++-----
 4 files changed, 113 insertions(+), 15 deletions(-)

diff --git a/include/linux/swap.h b/include/linux/swap.h
index c563c45b30b4..ed51d5e699e0 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -508,8 +508,8 @@ static inline int swp_swapcount(swp_entry_t entry)
 	return 0;
 }
 
-#define reuse_swap_page(page, total_mapcount) \
-	(page_trans_huge_mapcount(page, total_mapcount) == 1)
+#define reuse_swap_page(page, total_map_swapcount) \
+	(page_trans_huge_mapcount(page, total_map_swapcount) == 1)
 
 static inline int try_to_free_swap(struct page *page)
 {
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 3a14c77fcce7..0eb1251f924a 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1226,15 +1226,29 @@ int do_huge_pmd_wp_page(struct vm_fault *vmf, pmd_t orig_pmd)
 	 * We can only reuse the page if nobody else maps the huge page or it's
 	 * part.
 	 */
-	if (page_trans_huge_mapcount(page, NULL) == 1) {
+	if (!trylock_page(page)) {
+		get_page(page);
+		spin_unlock(vmf->ptl);
+		lock_page(page);
+		spin_lock(vmf->ptl);
+		if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) {
+			unlock_page(page);
+			put_page(page);
+			goto out_unlock;
+		}
+		put_page(page);
+	}
+	if (reuse_swap_page(page, NULL)) {
 		pmd_t entry;
 		entry = pmd_mkyoung(orig_pmd);
 		entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
 		if (pmdp_set_access_flags(vma, haddr, vmf->pmd, entry,  1))
 			update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
 		ret |= VM_FAULT_WRITE;
+		unlock_page(page);
 		goto out_unlock;
 	}
+	unlock_page(page);
 	get_page(page);
 	spin_unlock(vmf->ptl);
 alloc:
diff --git a/mm/memory.c b/mm/memory.c
index d320b4e16826..ac780fc619cd 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2541,7 +2541,7 @@ static int do_wp_page(struct vm_fault *vmf)
 	 * not dirty accountable.
 	 */
 	if (PageAnon(vmf->page) && !PageKsm(vmf->page)) {
-		int total_mapcount;
+		int total_map_swapcount;
 		if (!trylock_page(vmf->page)) {
 			get_page(vmf->page);
 			pte_unmap_unlock(vmf->pte, vmf->ptl);
@@ -2556,8 +2556,8 @@ static int do_wp_page(struct vm_fault *vmf)
 			}
 			put_page(vmf->page);
 		}
-		if (reuse_swap_page(vmf->page, &total_mapcount)) {
-			if (total_mapcount == 1) {
+		if (reuse_swap_page(vmf->page, &total_map_swapcount)) {
+			if (total_map_swapcount == 1) {
 				/*
 				 * The page is all ours. Move it to
 				 * our anon_vma so the rmap code will
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 675afc235de1..bd0f38f31d3d 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -1389,9 +1389,89 @@ static bool page_swapped(struct page *page)
 		return swap_page_trans_huge_swapped(si, entry);
 	return false;
 }
+
+static int page_trans_huge_map_swapcount(struct page *page, int *total_mapcount,
+					 int *total_swapcount)
+{
+	int i, map_swapcount, _total_mapcount, _total_swapcount;
+	unsigned long offset;
+	struct swap_info_struct *si;
+	struct swap_cluster_info *ci = NULL;
+	unsigned char *map = NULL;
+	int mapcount, swapcount = 0;
+
+	/* hugetlbfs shouldn't call it */
+	VM_BUG_ON_PAGE(PageHuge(page), page);
+
+	if (likely(!PageTransCompound(page))) {
+		mapcount = atomic_read(&page->_mapcount) + 1;
+		if (total_mapcount)
+			*total_mapcount = mapcount;
+		if (PageSwapCache(page))
+			swapcount = page_swapcount(page);
+		if (total_swapcount)
+			*total_swapcount = swapcount;
+		return mapcount + swapcount;
+	}
+
+	page = compound_head(page);
+
+	_total_mapcount = _total_swapcount = map_swapcount = 0;
+	if (PageSwapCache(page)) {
+		swp_entry_t entry;
+
+		entry.val = page_private(page);
+		si = _swap_info_get(entry);
+		if (si) {
+			map = si->swap_map;
+			offset = swp_offset(entry);
+		}
+	}
+	if (map)
+		ci = lock_cluster(si, offset);
+	for (i = 0; i < HPAGE_PMD_NR; i++) {
+		mapcount = atomic_read(&page[i]._mapcount) + 1;
+		_total_mapcount += mapcount;
+		if (map) {
+			swapcount = swap_count(map[offset + i]);
+			_total_swapcount += swapcount;
+		}
+		map_swapcount = max(map_swapcount, mapcount + swapcount);
+	}
+	unlock_cluster(ci);
+	if (PageDoubleMap(page)) {
+		map_swapcount -= 1;
+		_total_mapcount -= HPAGE_PMD_NR;
+	}
+	mapcount = compound_mapcount(page);
+	map_swapcount += mapcount;
+	_total_mapcount += mapcount;
+	if (total_mapcount)
+		*total_mapcount = _total_mapcount;
+	if (total_swapcount)
+		*total_swapcount = _total_swapcount;
+
+	return map_swapcount;
+}
 #else
 #define swap_page_trans_huge_swapped(si, entry)	swap_swapcount(si, entry)
 #define page_swapped(page)			(page_swapcount(page) != 0)
+
+static int page_trans_huge_map_swapcount(struct page *page, int *total_mapcount,
+					 int *total_swapcount)
+{
+	int mapcount, swapcount = 0;
+
+	/* hugetlbfs shouldn't call it */
+	VM_BUG_ON_PAGE(PageHuge(page), page);
+
+	mapcount = page_trans_huge_mapcount(page, total_mapcount);
+	if (PageSwapCache(page))
+		swapcount = page_swapcount(page);
+	if (total_swapcount)
+		*total_swapcount = swapcount;
+	return mapcount + swapcount;
+}
 #endif
 
 /*
@@ -1400,23 +1480,27 @@ static bool page_swapped(struct page *page)
  * on disk will never be read, and seeking back there to write new content
  * later would only waste time away from clustering.
  *
- * NOTE: total_mapcount should not be relied upon by the caller if
+ * NOTE: total_map_swapcount should not be relied upon by the caller if
  * reuse_swap_page() returns false, but it may be always overwritten
  * (see the other implementation for CONFIG_SWAP=n).
  */
-bool reuse_swap_page(struct page *page, int *total_mapcount)
+bool reuse_swap_page(struct page *page, int *total_map_swapcount)
 {
-	int count;
+	int count, total_mapcount, total_swapcount;
 
 	VM_BUG_ON_PAGE(!PageLocked(page), page);
 	if (unlikely(PageKsm(page)))
 		return false;
-	count = page_trans_huge_mapcount(page, total_mapcount);
-	if (count <= 1 && PageSwapCache(page)) {
-		count += page_swapcount(page);
-		if (count != 1)
-			goto out;
+	count = page_trans_huge_map_swapcount(page, &total_mapcount,
+					      &total_swapcount);
+	if (total_map_swapcount)
+		*total_map_swapcount = total_mapcount + total_swapcount;
+	if (count == 1 && PageSwapCache(page) &&
+	    (likely(!PageTransCompound(page)) ||
+	     /* The remaining swap count will be freed soon */
+	     total_swapcount == page_swapcount(page))) {
 		if (!PageWriteback(page)) {
+			page = compound_head(page);
 			delete_from_swap_cache(page);
 			SetPageDirty(page);
 		} else {
@@ -1432,7 +1516,7 @@ bool reuse_swap_page(struct page *page, int *total_mapcount)
 			spin_unlock(&p->lock);
 		}
 	}
-out:
+
 	return count <= 1;
 }
 
-- 
2.11.0

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply related

* [PATCH -mm 02/13] mm, THP, swap: Support to reclaim swap space for THP swapped out
From: Huang, Ying @ 2017-05-25  6:46 UTC (permalink / raw)
  To: Andrew Morton
  Cc: linux-mm, linux-kernel, Huang Ying, Johannes Weiner, Minchan Kim,
	Hugh Dickins, Shaohua Li, Rik van Riel
In-Reply-To: <20170525064635.2832-1-ying.huang@intel.com>

From: Huang Ying <ying.huang@intel.com>

The normal swap slot reclaiming can be done when the swap count
reaches SWAP_HAS_CACHE.  But for the swap slot which is backing a THP,
all swap slots backing one THP must be reclaimed together, because the
swap slot may be used again when the THP is swapped out again later.
So the swap slots backing one THP can be reclaimed together when the
swap count for all swap slots for the THP reached SWAP_HAS_CACHE.  In
the patch, the functions to check whether the swap count for all swap
slots backing one THP reached SWAP_HAS_CACHE are implemented and used
when checking whether a swap slot can be reclaimed.

To make it easier to determine whether a swap slot is backing a THP, a
new swap cluster flag named CLUSTER_FLAG_HUGE is added to mark a swap
cluster which is backing a THP (Transparent Huge Page).  Because
swapping in a THP as a whole isn't supported now, the CLUSTER_FLAG_HUGE
flag is cleared after the THP is deleted from the swap cache (for
example, when swapping out has finished), so that the normal pages
inside the THP can be swapped in individually.

Signed-off-by: "Huang, Ying" <ying.huang@intel.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Hugh Dickins <hughd@google.com>
Cc: Shaohua Li <shli@kernel.org>
Cc: Rik van Riel <riel@redhat.com>
---
 include/linux/swap.h |  1 +
 mm/swapfile.c        | 78 +++++++++++++++++++++++++++++++++++++++++++++++-----
 2 files changed, 72 insertions(+), 7 deletions(-)

diff --git a/include/linux/swap.h b/include/linux/swap.h
index 5ab1c98c7d27..c563c45b30b4 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -188,6 +188,7 @@ struct swap_cluster_info {
 };
 #define CLUSTER_FLAG_FREE 1 /* This cluster is free */
 #define CLUSTER_FLAG_NEXT_NULL 2 /* This cluster has no next cluster */
+#define CLUSTER_FLAG_HUGE 4 /* This cluster is backing a transparent huge page */
 
 /*
  * We assign a cluster to each CPU, so each CPU can allocate swap entry from
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 4cd02dec6894..675afc235de1 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -264,6 +264,16 @@ static inline void cluster_set_null(struct swap_cluster_info *info)
 	info->data = 0;
 }
 
+static inline bool cluster_is_huge(struct swap_cluster_info *info)
+{
+	return info->flags & CLUSTER_FLAG_HUGE;
+}
+
+static inline void cluster_clear_huge(struct swap_cluster_info *info)
+{
+	info->flags &= ~CLUSTER_FLAG_HUGE;
+}
+
 static inline struct swap_cluster_info *lock_cluster(struct swap_info_struct *si,
 						     unsigned long offset)
 {
@@ -845,7 +855,7 @@ static int swap_alloc_cluster(struct swap_info_struct *si, swp_entry_t *slot)
 	offset = idx * SWAPFILE_CLUSTER;
 	ci = lock_cluster(si, offset);
 	alloc_cluster(si, idx);
-	cluster_set_count_flag(ci, SWAPFILE_CLUSTER, 0);
+	cluster_set_count_flag(ci, SWAPFILE_CLUSTER, CLUSTER_FLAG_HUGE);
 
 	map = si->swap_map + offset;
 	for (i = 0; i < SWAPFILE_CLUSTER; i++)
@@ -1175,6 +1185,7 @@ static void swapcache_free_cluster(swp_entry_t entry)
 		return;
 
 	ci = lock_cluster(si, offset);
+	VM_BUG_ON(!cluster_is_huge(ci));
 	map = si->swap_map + offset;
 	for (i = 0; i < SWAPFILE_CLUSTER; i++) {
 		val = map[i];
@@ -1186,6 +1197,7 @@ static void swapcache_free_cluster(swp_entry_t entry)
 		for (i = 0; i < SWAPFILE_CLUSTER; i++)
 			map[i] &= ~SWAP_HAS_CACHE;
 	}
+	cluster_clear_huge(ci);
 	unlock_cluster(ci);
 	if (free_entries == SWAPFILE_CLUSTER) {
 		spin_lock(&si->lock);
@@ -1334,6 +1346,54 @@ int swp_swapcount(swp_entry_t entry)
 	return count;
 }
 
+#ifdef CONFIG_THP_SWAP
+static bool swap_page_trans_huge_swapped(struct swap_info_struct *si,
+					 swp_entry_t entry)
+{
+	struct swap_cluster_info *ci;
+	unsigned char *map = si->swap_map;
+	unsigned long roffset = swp_offset(entry);
+	unsigned long offset = round_down(roffset, SWAPFILE_CLUSTER);
+	int i;
+	bool ret = false;
+
+	ci = lock_cluster_or_swap_info(si, offset);
+	if (!cluster_is_huge(ci)) {
+		if (map[roffset] != SWAP_HAS_CACHE)
+			ret = true;
+		goto unlock_out;
+	}
+	for (i = 0; i < SWAPFILE_CLUSTER; i++) {
+		if (map[offset + i] != SWAP_HAS_CACHE) {
+			ret = true;
+			break;
+		}
+	}
+unlock_out:
+	unlock_cluster_or_swap_info(si, ci);
+	return ret;
+}
+
+static bool page_swapped(struct page *page)
+{
+	swp_entry_t entry;
+	struct swap_info_struct *si;
+
+	if (likely(!PageTransCompound(page)))
+		return page_swapcount(page) != 0;
+
+	page = compound_head(page);
+	entry.val = page_private(page);
+	si = _swap_info_get(entry);
+	if (si)
+		return swap_page_trans_huge_swapped(si, entry);
+	return false;
+}
+#else
+#define swap_page_trans_huge_swapped(si, entry)	swap_swapcount(si, entry)
+#define page_swapped(page)			(page_swapcount(page) != 0)
+#endif
+
 /*
  * We can write to an anon page without COW if there are no other references
  * to it.  And as a side-effect, free up its swap: because the old content
@@ -1388,7 +1448,7 @@ int try_to_free_swap(struct page *page)
 		return 0;
 	if (PageWriteback(page))
 		return 0;
-	if (page_swapcount(page))
+	if (page_swapped(page))
 		return 0;
 
 	/*
@@ -1409,6 +1469,7 @@ int try_to_free_swap(struct page *page)
 	if (pm_suspended_storage())
 		return 0;
 
+	page = compound_head(page);
 	delete_from_swap_cache(page);
 	SetPageDirty(page);
 	return 1;
@@ -1430,7 +1491,8 @@ int free_swap_and_cache(swp_entry_t entry)
 	p = _swap_info_get(entry);
 	if (p) {
 		count = __swap_entry_free(p, entry, 1);
-		if (count == SWAP_HAS_CACHE) {
+		if (count == SWAP_HAS_CACHE &&
+		    !swap_page_trans_huge_swapped(p, entry)) {
 			page = find_get_page(swap_address_space(entry),
 					     swp_offset(entry));
 			if (page && !trylock_page(page)) {
@@ -1447,7 +1509,8 @@ int free_swap_and_cache(swp_entry_t entry)
 		 */
 		if (PageSwapCache(page) && !PageWriteback(page) &&
 		    (!page_mapped(page) || mem_cgroup_swap_full(page)) &&
-		    !swap_swapcount(p, entry)) {
+		    !swap_page_trans_huge_swapped(p, entry)) {
+			page = compound_head(page);
 			delete_from_swap_cache(page);
 			SetPageDirty(page);
 		}
@@ -2001,7 +2064,7 @@ int try_to_unuse(unsigned int type, bool frontswap,
 				.sync_mode = WB_SYNC_NONE,
 			};
 
-			swap_writepage(page, &wbc);
+			swap_writepage(compound_head(page), &wbc);
 			lock_page(page);
 			wait_on_page_writeback(page);
 		}
@@ -2014,8 +2077,9 @@ int try_to_unuse(unsigned int type, bool frontswap,
 		 * delete, since it may not have been written out to swap yet.
 		 */
 		if (PageSwapCache(page) &&
-		    likely(page_private(page) == entry.val))
-			delete_from_swap_cache(page);
+		    likely(page_private(page) == entry.val) &&
+		    !page_swapped(page))
+			delete_from_swap_cache(compound_head(page));
 
 		/*
 		 * So we could skip searching mms once swap count went
-- 
2.11.0

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox