Linux userland API discussions
 help / color / mirror / Atom feed
* Re: [PATCH 2/9] security: Add hooks to rule on setting a watch [ver #5]
From: Stephen Smalley @ 2019-07-08 18:46 UTC (permalink / raw)
  To: David Howells, viro
  Cc: Casey Schaufler, Greg Kroah-Hartman, nicolas.dichtel, raven,
	Christian Brauner, keyrings, linux-usb, linux-security-module,
	linux-fsdevel, linux-api, linux-block, linux-kernel
In-Reply-To: <156173692760.15137.9636883182556029747.stgit@warthog.procyon.org.uk>

On 6/28/19 11:48 AM, David Howells wrote:
> Add security hooks that will allow an LSM to rule on whether or not a watch
> may be set.  More than one hook is required as the watches watch different
> types of object.
> 
> Signed-off-by: David Howells <dhowells@redhat.com>
> cc: Casey Schaufler <casey@schaufler-ca.com>
> cc: Stephen Smalley <sds@tycho.nsa.gov>
> cc: linux-security-module@vger.kernel.org
> ---
> 
>   include/linux/lsm_hooks.h |   22 ++++++++++++++++++++++
>   include/linux/security.h  |   15 +++++++++++++++
>   security/security.c       |   13 +++++++++++++
>   3 files changed, 50 insertions(+)
> 
> diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h
> index 47f58cfb6a19..f9d31f6445e4 100644
> --- a/include/linux/lsm_hooks.h
> +++ b/include/linux/lsm_hooks.h
> @@ -1413,6 +1413,20 @@
>    *	@ctx is a pointer in which to place the allocated security context.
>    *	@ctxlen points to the place to put the length of @ctx.
>    *
> + * Security hooks for the general notification queue:
> + *
> + * @watch_key:
> + *	Check to see if a process is allowed to watch for event notifications
> + *	from a key or keyring.
> + *	@watch: The watch object
> + *	@key: The key to watch.
> + *
> + * @watch_devices:
> + *	Check to see if a process is allowed to watch for event notifications
> + *	from devices (as a global set).
> + *	@watch: The watch object

It is difficult to evaluate these without at least one implementation of 
each hook.  I am unclear as to how any security module would use the 
watch argument, since it has no security field/blob and does not appear 
to contain any information that would be relevant to deciding whether or 
not to permit the watch to be set.

> + *
> + *
>    * Security hooks for using the eBPF maps and programs functionalities through
>    * eBPF syscalls.
>    *
> @@ -1688,6 +1702,10 @@ union security_list_options {
>   	int (*inode_notifysecctx)(struct inode *inode, void *ctx, u32 ctxlen);
>   	int (*inode_setsecctx)(struct dentry *dentry, void *ctx, u32 ctxlen);
>   	int (*inode_getsecctx)(struct inode *inode, void **ctx, u32 *ctxlen);
> +#ifdef CONFIG_WATCH_QUEUE
> +	int (*watch_key)(struct watch *watch, struct key *key);
> +	int (*watch_devices)(struct watch *watch);
> +#endif /* CONFIG_WATCH_QUEUE */
>   
>   #ifdef CONFIG_SECURITY_NETWORK
>   	int (*unix_stream_connect)(struct sock *sock, struct sock *other,
> @@ -1964,6 +1982,10 @@ struct security_hook_heads {
>   	struct hlist_head inode_notifysecctx;
>   	struct hlist_head inode_setsecctx;
>   	struct hlist_head inode_getsecctx;
> +#ifdef CONFIG_WATCH_QUEUE
> +	struct hlist_head watch_key;
> +	struct hlist_head watch_devices;
> +#endif /* CONFIG_WATCH_QUEUE */
>   #ifdef CONFIG_SECURITY_NETWORK
>   	struct hlist_head unix_stream_connect;
>   	struct hlist_head unix_may_send;
> diff --git a/include/linux/security.h b/include/linux/security.h
> index 659071c2e57c..540863678355 100644
> --- a/include/linux/security.h
> +++ b/include/linux/security.h
> @@ -57,6 +57,7 @@ struct mm_struct;
>   struct fs_context;
>   struct fs_parameter;
>   enum fs_value_type;
> +struct watch;
>   
>   /* Default (no) options for the capable function */
>   #define CAP_OPT_NONE 0x0
> @@ -392,6 +393,10 @@ void security_inode_invalidate_secctx(struct inode *inode);
>   int security_inode_notifysecctx(struct inode *inode, void *ctx, u32 ctxlen);
>   int security_inode_setsecctx(struct dentry *dentry, void *ctx, u32 ctxlen);
>   int security_inode_getsecctx(struct inode *inode, void **ctx, u32 *ctxlen);
> +#ifdef CONFIG_WATCH_QUEUE
> +int security_watch_key(struct watch *watch, struct key *key);
> +int security_watch_devices(struct watch *watch);
> +#endif /* CONFIG_WATCH_QUEUE */
>   #else /* CONFIG_SECURITY */
>   
>   static inline int call_lsm_notifier(enum lsm_event event, void *data)
> @@ -1204,6 +1209,16 @@ static inline int security_inode_getsecctx(struct inode *inode, void **ctx, u32
>   {
>   	return -EOPNOTSUPP;
>   }
> +#ifdef CONFIG_WATCH_QUEUE
> +static inline int security_watch_key(struct watch *watch, struct key *key)
> +{
> +	return 0;
> +}
> +static inline int security_watch_devices(struct watch *watch)
> +{
> +	return 0;
> +}
> +#endif /* CONFIG_WATCH_QUEUE */
>   #endif	/* CONFIG_SECURITY */
>   
>   #ifdef CONFIG_SECURITY_NETWORK
> diff --git a/security/security.c b/security/security.c
> index 613a5c00e602..2c9919226ad1 100644
> --- a/security/security.c
> +++ b/security/security.c
> @@ -1917,6 +1917,19 @@ int security_inode_getsecctx(struct inode *inode, void **ctx, u32 *ctxlen)
>   }
>   EXPORT_SYMBOL(security_inode_getsecctx);
>   
> +#ifdef CONFIG_WATCH_QUEUE
> +int security_watch_key(struct watch *watch, struct key *key)
> +{
> +	return call_int_hook(watch_key, 0, watch, key);
> +}
> +
> +int security_watch_devices(struct watch *watch)
> +{
> +	return call_int_hook(watch_devices, 0, watch);
> +}
> +
> +#endif /* CONFIG_WATCH_QUEUE */
> +
>   #ifdef CONFIG_SECURITY_NETWORK
>   
>   int security_unix_stream_connect(struct sock *sock, struct sock *other, struct sock *newsk)
> 

^ permalink raw reply

* Re: [PATCH 3/9] security: Add a hook for the point of notification insertion [ver #5]
From: Stephen Smalley @ 2019-07-08 19:13 UTC (permalink / raw)
  To: David Howells, viro
  Cc: Casey Schaufler, Greg Kroah-Hartman, nicolas.dichtel, raven,
	Christian Brauner, keyrings, linux-usb, linux-security-module,
	linux-fsdevel, linux-api, linux-block, linux-kernel
In-Reply-To: <156173694190.15137.8939274212328721351.stgit@warthog.procyon.org.uk>

On 6/28/19 11:49 AM, David Howells wrote:
> Add a security hook that allows an LSM to rule on whether a notification
> message is allowed to be inserted into a particular watch queue.
> 
> The hook is given the following information:
> 
>   (1) The credentials of the triggerer (which may be init_cred for a system
>       notification, eg. a hardware error).
> 
>   (2) The credentials of the whoever set the watch.
> 
>   (3) The notification message.

As with the other proposed hooks, it is difficult to evaluate this hook 
without at least one implementation of the hook.  Since Casey is the 
only one requesting this hook, a Smack implementation would be the 
natural choice; I do not intend to implement this hook for SELinux. 
However, by providing this hook, you are in effect taking a position wrt 
the earlier controversy over it, i.e. that application developers must 
deal with the possibility that notifications can be dropped if a 
security module does not permit the triggerer to post the notification 
to the watcher, without any indication to either the triggerer or the 
watcher.  This is a choice you are making by providing this hook.  The 
alternative is to require that permission to set a watch imply the 
ability to receive all notifications for the watched object.  Aside from 
friendliness to application developers, the latter also yields stable, 
sane policy and better performance.

> 
> Signed-off-by: David Howells <dhowells@redhat.com>
> cc: Casey Schaufler <casey@schaufler-ca.com>
> cc: Stephen Smalley <sds@tycho.nsa.gov>
> cc: linux-security-module@vger.kernel.org
> ---
> 
>   include/linux/lsm_hooks.h |   10 ++++++++++
>   include/linux/security.h  |   10 ++++++++++
>   security/security.c       |    6 ++++++
>   3 files changed, 26 insertions(+)
> 
> diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h
> index f9d31f6445e4..fd4b2b14e7d0 100644
> --- a/include/linux/lsm_hooks.h
> +++ b/include/linux/lsm_hooks.h
> @@ -1426,6 +1426,12 @@
>    *	from devices (as a global set).
>    *	@watch: The watch object
>    *
> + * @post_notification:
> + *	Check to see if a watch notification can be posted to a particular
> + *	queue.
> + *	@w_cred: The credentials of the whoever set the watch.
> + *	@cred: The event-triggerer's credentials
> + *	@n: The notification being posted
>    *
>    * Security hooks for using the eBPF maps and programs functionalities through
>    * eBPF syscalls.
> @@ -1705,6 +1711,9 @@ union security_list_options {
>   #ifdef CONFIG_WATCH_QUEUE
>   	int (*watch_key)(struct watch *watch, struct key *key);
>   	int (*watch_devices)(struct watch *watch);
> +	int (*post_notification)(const struct cred *w_cred,
> +				 const struct cred *cred,
> +				 struct watch_notification *n);
>   #endif /* CONFIG_WATCH_QUEUE */
>   
>   #ifdef CONFIG_SECURITY_NETWORK
> @@ -1985,6 +1994,7 @@ struct security_hook_heads {
>   #ifdef CONFIG_WATCH_QUEUE
>   	struct hlist_head watch_key;
>   	struct hlist_head watch_devices;
> +	struct hlist_head post_notification;
>   #endif /* CONFIG_WATCH_QUEUE */
>   #ifdef CONFIG_SECURITY_NETWORK
>   	struct hlist_head unix_stream_connect;
> diff --git a/include/linux/security.h b/include/linux/security.h
> index 540863678355..5c074bf18bea 100644
> --- a/include/linux/security.h
> +++ b/include/linux/security.h
> @@ -58,6 +58,7 @@ struct fs_context;
>   struct fs_parameter;
>   enum fs_value_type;
>   struct watch;
> +struct watch_notification;
>   
>   /* Default (no) options for the capable function */
>   #define CAP_OPT_NONE 0x0
> @@ -396,6 +397,9 @@ int security_inode_getsecctx(struct inode *inode, void **ctx, u32 *ctxlen);
>   #ifdef CONFIG_WATCH_QUEUE
>   int security_watch_key(struct watch *watch, struct key *key);
>   int security_watch_devices(struct watch *watch);
> +int security_post_notification(const struct cred *w_cred,
> +			       const struct cred *cred,
> +			       struct watch_notification *n);
>   #endif /* CONFIG_WATCH_QUEUE */
>   #else /* CONFIG_SECURITY */
>   
> @@ -1218,6 +1222,12 @@ static inline int security_watch_devices(struct watch *watch)
>   {
>   	return 0;
>   }
> +static inline int security_post_notification(const struct cred *w_cred,
> +					     const struct cred *cred,
> +					     struct watch_notification *n)
> +{
> +	return 0;
> +}
>   #endif /* CONFIG_WATCH_QUEUE */
>   #endif	/* CONFIG_SECURITY */
>   
> diff --git a/security/security.c b/security/security.c
> index 2c9919226ad1..459e87d55ac9 100644
> --- a/security/security.c
> +++ b/security/security.c
> @@ -1928,6 +1928,12 @@ int security_watch_devices(struct watch *watch)
>   	return call_int_hook(watch_devices, 0, watch);
>   }
>   
> +int security_post_notification(const struct cred *w_cred,
> +			       const struct cred *cred,
> +			       struct watch_notification *n)
> +{
> +	return call_int_hook(post_notification, 0, w_cred, cred, n);
> +}
>   #endif /* CONFIG_WATCH_QUEUE */
>   
>   #ifdef CONFIG_SECURITY_NETWORK
> 

^ permalink raw reply

* Re: [PATCH ghak90 V6 02/10] audit: add container id
From: Paul Moore @ 2019-07-08 20:43 UTC (permalink / raw)
  To: Richard Guy Briggs
  Cc: Tycho Andersen, Serge E. Hallyn, containers, linux-api,
	Linux-Audit Mailing List, linux-fsdevel, LKML, netdev,
	netfilter-devel, sgrubb, omosnace, dhowells, simo, Eric Paris,
	ebiederm, nhorman
In-Reply-To: <20190708181237.5poheliito7zpvmc@madcap2.tricolour.ca>

On July 8, 2019 8:12:56 PM Richard Guy Briggs <rgb@redhat.com> wrote:

> On 2019-05-30 19:26, Paul Moore wrote:
>> On Thu, May 30, 2019 at 5:29 PM Tycho Andersen <tycho@tycho.ws> wrote:
>>> On Thu, May 30, 2019 at 03:29:32PM -0400, Paul Moore wrote:
>>>>
>>>>
>>>> [REMINDER: It is an "*audit* container ID" and not a general
>>>> "container ID" ;)  Smiley aside, I'm not kidding about that part.]
>>>
>>> This sort of seems like a distinction without a difference; presumably
>>> audit is going to want to differentiate between everything that people
>>> in userspace call a container. So you'll have to support all this
>>> insanity anyway, even if it's "not a container ID".
>>
>> That's not quite right.  Audit doesn't care about what a container is,
>> or is not, it also doesn't care if the "audit container ID" actually
>> matches the ID used by the container engine in userspace and I think
>> that is a very important line to draw.  Audit is simply given a value
>> which it calls the "audit container ID", it ensures that the value is
>> inherited appropriately (e.g. children inherit their parent's audit
>> container ID), and it uses the value in audit records to provide some
>> additional context for log analysis.  The distinction isn't limited to
>> the value itself, but also to how it is used; it is an "audit
>> container ID" and not a "container ID" because this value is
>> exclusively for use by the audit subsystem.  We are very intentionally
>> not adding a generic container ID to the kernel.  If the kernel does
>> ever grow a general purpose container ID we will be one of the first
>> ones in line to make use of it, but we are not going to be the ones to
>> generically add containers to the kernel.  Enough people already hate
>> audit ;)
>>
>>>> I'm not interested in supporting/merging something that isn't useful;
>>>> if this doesn't work for your use case then we need to figure out what
>>>> would work.  It sounds like nested containers are much more common in
>>>> the lxc world, can you elaborate a bit more on this?
>>>>
>>>>
>>>> As far as the possible solutions you mention above, I'm not sure I
>>>> like the per-userns audit container IDs, I'd much rather just emit the
>>>> necessary tracking information via the audit record stream and let the
>>>> log analysis tools figure it out.  However, the bigger question is how
>>>> to limit (re)setting the audit container ID when you are in a non-init
>>>> userns.  For reasons already mentioned, using capable() is a non
>>>> starter for everything but the initial userns, and using ns_capable()
>>>> is equally poor as it essentially allows any userns the ability to
>>>> munge it's audit container ID (obviously not good).  It appears we
>>>> need a different method for controlling access to the audit container
>>>> ID.
>>>
>>> One option would be to make it a string, and have it be append only.
>>> That should be safe with no checks.
>>>
>>> I know there was a long thread about what type to make this thing. I
>>> think you could accomplish the append-only-ness with a u64 if you had
>>> some rule about only allowing setting lower order bits than those that
>>> are already set. With 4 bits for simplicity:
>>>
>>> 1100         # initial container id
>>> 1100 -> 1011 # not allowed
>>> 1100 -> 1101 # allowed, but now 1101 is set in stone since there are
>>>       # no lower order bits left
>>>
>>> There are probably fancier ways to do it if you actually understand
>>> math :)
>>
>> ;)
>>
>>> Since userns nesting is limited to 32 levels (right now, IIRC), and
>>> you have 64 bits, this might be reasonable. You could just teach
>>> container engines to use the first say N bits for themselves, with a 1
>>> bit for the barrier at the end.
>>
>> I like the creativity, but I worry that at some point these
>> limitations are going to be raised (limits have a funny way of doing
>> that over time) and we will be in trouble.  I say "trouble" because I
>> want to be able to quickly do an audit container ID comparison and
>> we're going to pay a penalty for these larger values (we'll need this
>> when we add multiple auditd support and the requisite record routing).
>>
>> Thinking about this makes me also realize we probably need to think a
>> bit longer about audit container ID conflicts between orchestrators.
>> Right now we just take the value that is given to us by the
>> orchestrator, but if we want to allow multiple container orchestrators
>> to work without some form of cooperation in userspace (I think we have
>> to assume the orchestrators will not talk to each other) we likely
>> need to have some way to block reuse of an audit container ID.  We
>> would either need to prevent the orchestrator from explicitly setting
>> an audit container ID to a currently in use value, or instead generate
>> the audit container ID in the kernel upon an event triggered by the
>> orchestrator (e.g. a write to a /proc file).  I suspect we should
>> start looking at the idr code, I think we will need to make use of it.
>
> To address this, I'd suggest that it is enforced to only allow the
> setting of descendants and to maintain a master list of audit container
> identifiers (with a hash table if necessary later) that includes the
> container owner.
>
> This also allows the orchestrator/engine to inject processes into
> existing containers by checking that the audit container identifier is
> only used again by the same owner.
>
> I have working code for both.

Just a quick note that due to some holiday travel I'm not going to be able to adequately respond to your latest messages on this thread for at least another week, likely a bit more.  I'm only checking mail to put out fires, and the audit container ID work tends to be something that starts them ;)

^ permalink raw reply

* Re: [PATCH v3 1/5] mm: introduce MADV_COLD
From: Michal Hocko @ 2019-07-09  9:19 UTC (permalink / raw)
  To: Minchan Kim
  Cc: Dave Hansen, Andrew Morton, linux-mm, LKML, linux-api,
	Johannes Weiner, Tim Murray, Joel Fernandes, Suren Baghdasaryan,
	Daniel Colascione, Shakeel Butt, Sonny Rao, oleksandr, hdanton,
	lizeb, Kirill A . Shutemov
In-Reply-To: <20190701073500.GA136163@google.com>

On Mon 01-07-19 16:35:00, Minchan Kim wrote:
> >From 39df9f94e6204b8893f3f3feb692745657392657 Mon Sep 17 00:00:00 2001
> From: Minchan Kim <minchan@kernel.org>
> Date: Fri, 24 May 2019 13:47:54 +0900
> Subject: [PATCH v3 1/5] mm: introduce MADV_COLD
> 
> When a process expects no accesses to a certain memory range, it could
> give a hint to kernel that the pages can be reclaimed when memory pressure
> happens but data should be preserved for future use.  This could reduce
> workingset eviction so it ends up increasing performance.
> 
> This patch introduces the new MADV_COLD hint to madvise(2) syscall.
> MADV_COLD can be used by a process to mark a memory range as not expected
> to be used in the near future. The hint can help kernel in deciding which
> pages to evict early during memory pressure.
> 
> It works for every LRU pages like MADV_[DONTNEED|FREE]. IOW, It moves
> 
> 	active file page -> inactive file LRU
> 	active anon page -> inacdtive anon LRU
> 
> Unlike MADV_FREE, it doesn't move active anonymous pages to inactive
> file LRU's head because MADV_COLD is a little bit different symantic.
> MADV_FREE means it's okay to discard when the memory pressure because
> the content of the page is *garbage* so freeing such pages is almost zero
> overhead since we don't need to swap out and access afterward causes just
> minor fault. Thus, it would make sense to put those freeable pages in
> inactive file LRU to compete other used-once pages. It makes sense for
> implmentaion point of view, too because it's not swapbacked memory any
> longer until it would be re-dirtied. Even, it could give a bonus to make
> them be reclaimed on swapless system. However, MADV_COLD doesn't mean
> garbage so reclaiming them requires swap-out/in in the end so it's bigger
> cost. Since we have designed VM LRU aging based on cost-model, anonymous
> cold pages would be better to position inactive anon's LRU list, not file
> LRU. Furthermore, it would help to avoid unnecessary scanning if system
> doesn't have a swap device. Let's start simpler way without adding
> complexity at this moment. However, keep in mind, too that it's a caveat
> that workloads with a lot of pages cache are likely to ignore MADV_COLD
> on anonymous memory because we rarely age anonymous LRU lists.
> 
> * man-page material
> 
> MADV_COLD (since Linux x.x)
> 
> Pages in the specified regions will be treated as less-recently-accessed
> compared to pages in the system with similar access frequencies.
> In contrast to MADV_FREE, the contents of the region are preserved
> regardless of subsequent writes to pages.
> 
> MADV_COLD cannot be applied to locked pages, Huge TLB pages, or VM_PFNMAP
> pages.
> 
> * v2
>  * add up the warn with lots of page cache workload - mhocko
>  * add man page stuff - dave
> 
> * v1
>  * remove page_mapcount filter - hannes, mhocko
>  * remove idle page handling - joelaf
> 
> * RFCv2
>  * add more description - mhocko
> 
> * RFCv1
>  * renaming from MADV_COOL to MADV_COLD - hannes
> 
> * internal review
>  * use clear_page_youn in deactivate_page - joelaf
>  * Revise the description - surenb
>  * Renaming from MADV_WARM to MADV_COOL - surenb
> 
> Signed-off-by: Minchan Kim <minchan@kernel.org>

OK, looks reasonable to me. THP part still gives me a head spin but it
is consistent with madv_free part so I will trust that all weird corner
cases are already caught there.

Acked-by: Michal Hocko <mhocko@suse.com>

Thanks!
-- 
Michal Hocko
SUSE Labs

^ permalink raw reply

* Re: [PATCH v3 3/5] mm: account nr_isolated_xxx in [isolate|putback]_lru_page
From: Michal Hocko @ 2019-07-09  9:38 UTC (permalink / raw)
  To: Minchan Kim
  Cc: Andrew Morton, linux-mm, LKML, linux-api, Johannes Weiner,
	Tim Murray, Joel Fernandes, Suren Baghdasaryan, Daniel Colascione,
	Shakeel Butt, Sonny Rao, oleksandr, hdanton, lizeb, Dave Hansen,
	Kirill A . Shutemov
In-Reply-To: <20190627115405.255259-4-minchan@kernel.org>

On Thu 27-06-19 20:54:03, Minchan Kim wrote:
> The isolate counting is pecpu counter so it would be not huge gain
> to work them by batch. Rather than complicating to make them batch,
> let's make it more stright-foward via adding the counting logic
> into [isolate|putback]_lru_page API.
> 
> * v1
>  * fix accounting bug - Hillf
> 
> Link: http://lkml.kernel.org/r/20190531165927.GA20067@cmpxchg.org
> Suggested-by: Johannes Weiner <hannes@cmpxchg.org>
> Signed-off-by: Minchan Kim <minchan@kernel.org>

I like that the NR_ISOLATED_$FOO handling gets out of any code except
for vmscan and migration. This is definitely an improvement.

I haven't spotted any imbalance so I hope I haven't really missed any
path.

Acked-by: Michal Hocko <mhocko@suse.com>

Thanks!

> ---
>  mm/compaction.c     |  2 --
>  mm/gup.c            |  7 +------
>  mm/khugepaged.c     |  3 ---
>  mm/memory-failure.c |  3 ---
>  mm/memory_hotplug.c |  4 ----
>  mm/mempolicy.c      |  6 +-----
>  mm/migrate.c        | 37 ++++++++-----------------------------
>  mm/vmscan.c         | 22 ++++++++++++++++------
>  8 files changed, 26 insertions(+), 58 deletions(-)
> 
> diff --git a/mm/compaction.c b/mm/compaction.c
> index 9e1b9acb116b..c6591682deda 100644
> --- a/mm/compaction.c
> +++ b/mm/compaction.c
> @@ -982,8 +982,6 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
>  
>  		/* Successfully isolated */
>  		del_page_from_lru_list(page, lruvec, page_lru(page));
> -		inc_node_page_state(page,
> -				NR_ISOLATED_ANON + page_is_file_cache(page));
>  
>  isolate_success:
>  		list_add(&page->lru, &cc->migratepages);
> diff --git a/mm/gup.c b/mm/gup.c
> index 7dde2e3a1963..aec3a2b7e61b 100644
> --- a/mm/gup.c
> +++ b/mm/gup.c
> @@ -1473,13 +1473,8 @@ static long check_and_migrate_cma_pages(struct task_struct *tsk,
>  					drain_allow = false;
>  				}
>  
> -				if (!isolate_lru_page(head)) {
> +				if (!isolate_lru_page(head))
>  					list_add_tail(&head->lru, &cma_page_list);
> -					mod_node_page_state(page_pgdat(head),
> -							    NR_ISOLATED_ANON +
> -							    page_is_file_cache(head),
> -							    hpage_nr_pages(head));
> -				}
>  			}
>  		}
>  	}
> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> index 0f7419938008..7da34e198ec5 100644
> --- a/mm/khugepaged.c
> +++ b/mm/khugepaged.c
> @@ -503,7 +503,6 @@ void __khugepaged_exit(struct mm_struct *mm)
>  
>  static void release_pte_page(struct page *page)
>  {
> -	dec_node_page_state(page, NR_ISOLATED_ANON + page_is_file_cache(page));
>  	unlock_page(page);
>  	putback_lru_page(page);
>  }
> @@ -602,8 +601,6 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
>  			result = SCAN_DEL_PAGE_LRU;
>  			goto out;
>  		}
> -		inc_node_page_state(page,
> -				NR_ISOLATED_ANON + page_is_file_cache(page));
>  		VM_BUG_ON_PAGE(!PageLocked(page), page);
>  		VM_BUG_ON_PAGE(PageLRU(page), page);
>  
> diff --git a/mm/memory-failure.c b/mm/memory-failure.c
> index 7e08cbf3ba49..3586e8226e4e 100644
> --- a/mm/memory-failure.c
> +++ b/mm/memory-failure.c
> @@ -1795,9 +1795,6 @@ static int __soft_offline_page(struct page *page, int flags)
>  		 * so use !__PageMovable instead for LRU page's mapping
>  		 * cannot have PAGE_MAPPING_MOVABLE.
>  		 */
> -		if (!__PageMovable(page))
> -			inc_node_page_state(page, NR_ISOLATED_ANON +
> -						page_is_file_cache(page));
>  		list_add(&page->lru, &pagelist);
>  		ret = migrate_pages(&pagelist, new_page, NULL, MPOL_MF_MOVE_ALL,
>  					MIGRATE_SYNC, MR_MEMORY_FAILURE);
> diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
> index dfab21dc33dc..68577c677b46 100644
> --- a/mm/memory_hotplug.c
> +++ b/mm/memory_hotplug.c
> @@ -1384,10 +1384,6 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
>  			ret = isolate_movable_page(page, ISOLATE_UNEVICTABLE);
>  		if (!ret) { /* Success */
>  			list_add_tail(&page->lru, &source);
> -			if (!__PageMovable(page))
> -				inc_node_page_state(page, NR_ISOLATED_ANON +
> -						    page_is_file_cache(page));
> -
>  		} else {
>  			pr_warn("failed to isolate pfn %lx\n", pfn);
>  			dump_page(page, "isolation failed");
> diff --git a/mm/mempolicy.c b/mm/mempolicy.c
> index 64562809bf3b..03081f3404ca 100644
> --- a/mm/mempolicy.c
> +++ b/mm/mempolicy.c
> @@ -994,12 +994,8 @@ static int migrate_page_add(struct page *page, struct list_head *pagelist,
>  	 * Avoid migrating a page that is shared with others.
>  	 */
>  	if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(head) == 1) {
> -		if (!isolate_lru_page(head)) {
> +		if (!isolate_lru_page(head))
>  			list_add_tail(&head->lru, pagelist);
> -			mod_node_page_state(page_pgdat(head),
> -				NR_ISOLATED_ANON + page_is_file_cache(head),
> -				hpage_nr_pages(head));
> -		}
>  	}
>  
>  	return 0;
> diff --git a/mm/migrate.c b/mm/migrate.c
> index 572b4bc85d76..5583324c01e7 100644
> --- a/mm/migrate.c
> +++ b/mm/migrate.c
> @@ -190,8 +190,6 @@ void putback_movable_pages(struct list_head *l)
>  			unlock_page(page);
>  			put_page(page);
>  		} else {
> -			mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON +
> -					page_is_file_cache(page), -hpage_nr_pages(page));
>  			putback_lru_page(page);
>  		}
>  	}
> @@ -1181,10 +1179,17 @@ static ICE_noinline int unmap_and_move(new_page_t get_new_page,
>  		return -ENOMEM;
>  
>  	if (page_count(page) == 1) {
> +		bool is_lru = !__PageMovable(page);
> +
>  		/* page was freed from under us. So we are done. */
>  		ClearPageActive(page);
>  		ClearPageUnevictable(page);
> -		if (unlikely(__PageMovable(page))) {
> +		if (likely(is_lru))
> +			mod_node_page_state(page_pgdat(page),
> +						NR_ISOLATED_ANON +
> +						page_is_file_cache(page),
> +						-hpage_nr_pages(page));
> +		else {
>  			lock_page(page);
>  			if (!PageMovable(page))
>  				__ClearPageIsolated(page);
> @@ -1210,15 +1215,6 @@ static ICE_noinline int unmap_and_move(new_page_t get_new_page,
>  		 * restored.
>  		 */
>  		list_del(&page->lru);
> -
> -		/*
> -		 * Compaction can migrate also non-LRU pages which are
> -		 * not accounted to NR_ISOLATED_*. They can be recognized
> -		 * as __PageMovable
> -		 */
> -		if (likely(!__PageMovable(page)))
> -			mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON +
> -					page_is_file_cache(page), -hpage_nr_pages(page));
>  	}
>  
>  	/*
> @@ -1572,9 +1568,6 @@ static int add_page_for_migration(struct mm_struct *mm, unsigned long addr,
>  
>  		err = 0;
>  		list_add_tail(&head->lru, pagelist);
> -		mod_node_page_state(page_pgdat(head),
> -			NR_ISOLATED_ANON + page_is_file_cache(head),
> -			hpage_nr_pages(head));
>  	}
>  out_putpage:
>  	/*
> @@ -1890,8 +1883,6 @@ static struct page *alloc_misplaced_dst_page(struct page *page,
>  
>  static int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page)
>  {
> -	int page_lru;
> -
>  	VM_BUG_ON_PAGE(compound_order(page) && !PageTransHuge(page), page);
>  
>  	/* Avoid migrating to a node that is nearly full */
> @@ -1913,10 +1904,6 @@ static int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page)
>  		return 0;
>  	}
>  
> -	page_lru = page_is_file_cache(page);
> -	mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON + page_lru,
> -				hpage_nr_pages(page));
> -
>  	/*
>  	 * Isolating the page has taken another reference, so the
>  	 * caller's reference can be safely dropped without the page
> @@ -1971,8 +1958,6 @@ int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma,
>  	if (nr_remaining) {
>  		if (!list_empty(&migratepages)) {
>  			list_del(&page->lru);
> -			dec_node_page_state(page, NR_ISOLATED_ANON +
> -					page_is_file_cache(page));
>  			putback_lru_page(page);
>  		}
>  		isolated = 0;
> @@ -2002,7 +1987,6 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
>  	pg_data_t *pgdat = NODE_DATA(node);
>  	int isolated = 0;
>  	struct page *new_page = NULL;
> -	int page_lru = page_is_file_cache(page);
>  	unsigned long start = address & HPAGE_PMD_MASK;
>  
>  	new_page = alloc_pages_node(node,
> @@ -2048,8 +2032,6 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
>  		/* Retake the callers reference and putback on LRU */
>  		get_page(page);
>  		putback_lru_page(page);
> -		mod_node_page_state(page_pgdat(page),
> -			 NR_ISOLATED_ANON + page_lru, -HPAGE_PMD_NR);
>  
>  		goto out_unlock;
>  	}
> @@ -2099,9 +2081,6 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
>  	count_vm_events(PGMIGRATE_SUCCESS, HPAGE_PMD_NR);
>  	count_vm_numa_events(NUMA_PAGE_MIGRATE, HPAGE_PMD_NR);
>  
> -	mod_node_page_state(page_pgdat(page),
> -			NR_ISOLATED_ANON + page_lru,
> -			-HPAGE_PMD_NR);
>  	return isolated;
>  
>  out_fail:
> diff --git a/mm/vmscan.c b/mm/vmscan.c
> index 49e9ee4d771d..223ce5da08f0 100644
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -1014,6 +1014,9 @@ int remove_mapping(struct address_space *mapping, struct page *page)
>  void putback_lru_page(struct page *page)
>  {
>  	lru_cache_add(page);
> +	mod_node_page_state(page_pgdat(page),
> +				NR_ISOLATED_ANON + page_is_file_cache(page),
> +				-hpage_nr_pages(page));
>  	put_page(page);		/* drop ref from isolate */
>  }
>  
> @@ -1479,6 +1482,9 @@ static unsigned long shrink_page_list(struct list_head *page_list,
>  		 */
>  		nr_reclaimed += nr_pages;
>  
> +		mod_node_page_state(pgdat, NR_ISOLATED_ANON +
> +						page_is_file_cache(page),
> +						-nr_pages);
>  		/*
>  		 * Is there need to periodically free_page_list? It would
>  		 * appear not as the counts should be low
> @@ -1554,7 +1560,6 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone,
>  	ret = shrink_page_list(&clean_pages, zone->zone_pgdat, &sc,
>  			TTU_IGNORE_ACCESS, &dummy_stat, true);
>  	list_splice(&clean_pages, page_list);
> -	mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_FILE, -ret);
>  	return ret;
>  }
>  
> @@ -1630,6 +1635,9 @@ int __isolate_lru_page(struct page *page, isolate_mode_t mode)
>  		 */
>  		ClearPageLRU(page);
>  		ret = 0;
> +		__mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON +
> +						page_is_file_cache(page),
> +						hpage_nr_pages(page));
>  	}
>  
>  	return ret;
> @@ -1761,6 +1769,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
>  	trace_mm_vmscan_lru_isolate(sc->reclaim_idx, sc->order, nr_to_scan,
>  				    total_scan, skipped, nr_taken, mode, lru);
>  	update_lru_sizes(lruvec, lru, nr_zone_taken);
> +
>  	return nr_taken;
>  }
>  
> @@ -1809,6 +1818,9 @@ int isolate_lru_page(struct page *page)
>  			ClearPageLRU(page);
>  			del_page_from_lru_list(page, lruvec, lru);
>  			ret = 0;
> +			mod_node_page_state(pgdat, NR_ISOLATED_ANON +
> +						page_is_file_cache(page),
> +						hpage_nr_pages(page));
>  		}
>  		spin_unlock_irq(&pgdat->lru_lock);
>  	}
> @@ -1900,6 +1912,9 @@ static unsigned noinline_for_stack move_pages_to_lru(struct lruvec *lruvec,
>  		update_lru_size(lruvec, lru, page_zonenum(page), nr_pages);
>  		list_move(&page->lru, &lruvec->lists[lru]);
>  
> +		__mod_node_page_state(pgdat, NR_ISOLATED_ANON +
> +						page_is_file_cache(page),
> +						-hpage_nr_pages(page));
>  		if (put_page_testzero(page)) {
>  			__ClearPageLRU(page);
>  			__ClearPageActive(page);
> @@ -1977,7 +1992,6 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
>  	nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &page_list,
>  				     &nr_scanned, sc, lru);
>  
> -	__mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken);
>  	reclaim_stat->recent_scanned[file] += nr_taken;
>  
>  	item = current_is_kswapd() ? PGSCAN_KSWAPD : PGSCAN_DIRECT;
> @@ -2003,8 +2017,6 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
>  
>  	move_pages_to_lru(lruvec, &page_list);
>  
> -	__mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken);
> -
>  	spin_unlock_irq(&pgdat->lru_lock);
>  
>  	mem_cgroup_uncharge_list(&page_list);
> @@ -2063,7 +2075,6 @@ static void shrink_active_list(unsigned long nr_to_scan,
>  	nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &l_hold,
>  				     &nr_scanned, sc, lru);
>  
> -	__mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken);
>  	reclaim_stat->recent_scanned[file] += nr_taken;
>  
>  	__count_vm_events(PGREFILL, nr_scanned);
> @@ -2132,7 +2143,6 @@ static void shrink_active_list(unsigned long nr_to_scan,
>  	__count_vm_events(PGDEACTIVATE, nr_deactivate);
>  	__count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE, nr_deactivate);
>  
> -	__mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken);
>  	spin_unlock_irq(&pgdat->lru_lock);
>  
>  	mem_cgroup_uncharge_list(&l_active);
> -- 
> 2.22.0.410.gd8fdbe21b5-goog

-- 
Michal Hocko
SUSE Labs

^ permalink raw reply

* Re: [PATCH v3 4/5] mm: introduce MADV_PAGEOUT
From: Michal Hocko @ 2019-07-09  9:55 UTC (permalink / raw)
  To: Minchan Kim
  Cc: Andrew Morton, linux-mm, LKML, linux-api, Johannes Weiner,
	Tim Murray, Joel Fernandes, Suren Baghdasaryan, Daniel Colascione,
	Shakeel Butt, Sonny Rao, oleksandr, hdanton, lizeb, Dave Hansen,
	Kirill A . Shutemov
In-Reply-To: <20190627115405.255259-5-minchan@kernel.org>

On Thu 27-06-19 20:54:04, Minchan Kim wrote:
> When a process expects no accesses to a certain memory range
> for a long time, it could hint kernel that the pages can be
> reclaimed instantly but data should be preserved for future use.
> This could reduce workingset eviction so it ends up increasing
> performance.
> 
> This patch introduces the new MADV_PAGEOUT hint to madvise(2)
> syscall. MADV_PAGEOUT can be used by a process to mark a memory
> range as not expected to be used for a long time so that kernel
> reclaims *any LRU* pages instantly. The hint can help kernel in
> deciding which pages to evict proactively.
> 
> - man-page material
> 
> MADV_PAGEOUT (since Linux x.x)
> 
> Do not expect access in the near future so pages in the specified
> regions could be reclaimed instantly regardless of memory pressure.
> Thus, access in the range after successful operation could cause
> major page fault but never lose the up-to-date contents unlike
> MADV_DONTNEED.

> It works for only private anonymous mappings and
> non-anonymous mappings that belong to files that the calling process
> could successfully open for writing; otherwise, it could be used for
> sidechannel attack.

I would rephrase this way:
"
Pages belonging to a shared mapping are only processed if a write access
is allowed for the calling process.
"

I wouldn't really mention side channel attacks for a man page. You can
mention can_do_mincore check and the side channel prevention in the
changelog that is not aimed for the man page.

> MADV_PAGEOUT cannot be applied to locked pages, Huge TLB pages, or
> VM_PFNMAP pages.
> 
> * v2
>  * add comment about SWAP_CLUSTER_MAX - mhocko
>  * add permission check to prevent sidechannel attack - mhocko
>  * add man page stuff - dave
> 
> * v1
>  * change pte to old and rely on the other's reference - hannes
>  * remove page_mapcount to check shared page - mhocko
> 
> * RFC v2
>  * make reclaim_pages simple via factoring out isolate logic - hannes
> 
> * RFCv1
>  * rename from MADV_COLD to MADV_PAGEOUT - hannes
>  * bail out if process is being killed - Hillf
>  * fix reclaim_pages bugs - Hillf
> 
> Signed-off-by: Minchan Kim <minchan@kernel.org>
> ---


I am still not convinced about the SWAP_CLUSTER_MAX batching and the
udnerlying OOM argument. Is one pmd worth of pages really an OOM risk?
Sure you can have many invocations in parallel and that would add on
but the same might happen with SWAP_CLUSTER_MAX. So I would just remove
the batching for now and think of it only if we really see this being a
problem for real. Unless you feel really strong about this, of course.

Anyway the patch looks ok to me otherwise.

Acked-by: Michal Hocko <mhocko@suse.co>
-- 
Michal Hocko
SUSE Labs

^ permalink raw reply

* Re: [PATCH v3 5/5] mm: factor out pmd young/dirty bit handling and THP split
From: Michal Hocko @ 2019-07-09 14:10 UTC (permalink / raw)
  To: Minchan Kim
  Cc: Andrew Morton, linux-mm, LKML, linux-api, Johannes Weiner,
	Tim Murray, Joel Fernandes, Suren Baghdasaryan, Daniel Colascione,
	Shakeel Butt, Sonny Rao, oleksandr, hdanton, lizeb, Dave Hansen,
	Kirill A . Shutemov
In-Reply-To: <20190627115405.255259-6-minchan@kernel.org>

On Thu 27-06-19 20:54:05, Minchan Kim wrote:
> Now, there are common part among MADV_COLD|PAGEOUT|FREE to reset
> access/dirty bit resetting or split the THP page to handle part
> of subpages in the THP page. This patch factor out the common part.

While this reduces the code duplication to some degree I suspect it only
goes half way. I haven't tried that myself due to lack of time but I
believe this has a potential to reduce even more. All those madvise
calls are doing the same thing essentially. What page tables and apply
an operation on ptes and/or a page that is mapped. And that suggests
that the specific operation should be good with defining two - pte and
page operations on each entry. All the rest should be a common code.

That being said, I do not feel strongly about this patch. The rest of
the series should be good enough even without it and I wouldn't delay it
just by discussing a potential of the cleanup.

> Signed-off-by: Minchan Kim <minchan@kernel.org>
> ---
>  include/linux/huge_mm.h |   3 -
>  mm/huge_memory.c        |  74 -------------
>  mm/madvise.c            | 234 +++++++++++++++++++++++-----------------
>  3 files changed, 135 insertions(+), 176 deletions(-)
> 
> diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
> index 7cd5c150c21d..2667e1aa3ce5 100644
> --- a/include/linux/huge_mm.h
> +++ b/include/linux/huge_mm.h
> @@ -29,9 +29,6 @@ extern struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
>  					  unsigned long addr,
>  					  pmd_t *pmd,
>  					  unsigned int flags);
> -extern bool madvise_free_huge_pmd(struct mmu_gather *tlb,
> -			struct vm_area_struct *vma,
> -			pmd_t *pmd, unsigned long addr, unsigned long next);
>  extern int zap_huge_pmd(struct mmu_gather *tlb,
>  			struct vm_area_struct *vma,
>  			pmd_t *pmd, unsigned long addr);
> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> index 93f531b63a45..e4b9a06788f3 100644
> --- a/mm/huge_memory.c
> +++ b/mm/huge_memory.c
> @@ -1671,80 +1671,6 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t pmd)
>  	return 0;
>  }
>  
> -/*
> - * Return true if we do MADV_FREE successfully on entire pmd page.
> - * Otherwise, return false.
> - */
> -bool madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
> -		pmd_t *pmd, unsigned long addr, unsigned long next)
> -{
> -	spinlock_t *ptl;
> -	pmd_t orig_pmd;
> -	struct page *page;
> -	struct mm_struct *mm = tlb->mm;
> -	bool ret = false;
> -
> -	tlb_change_page_size(tlb, HPAGE_PMD_SIZE);
> -
> -	ptl = pmd_trans_huge_lock(pmd, vma);
> -	if (!ptl)
> -		goto out_unlocked;
> -
> -	orig_pmd = *pmd;
> -	if (is_huge_zero_pmd(orig_pmd))
> -		goto out;
> -
> -	if (unlikely(!pmd_present(orig_pmd))) {
> -		VM_BUG_ON(thp_migration_supported() &&
> -				  !is_pmd_migration_entry(orig_pmd));
> -		goto out;
> -	}
> -
> -	page = pmd_page(orig_pmd);
> -	/*
> -	 * If other processes are mapping this page, we couldn't discard
> -	 * the page unless they all do MADV_FREE so let's skip the page.
> -	 */
> -	if (page_mapcount(page) != 1)
> -		goto out;
> -
> -	if (!trylock_page(page))
> -		goto out;
> -
> -	/*
> -	 * If user want to discard part-pages of THP, split it so MADV_FREE
> -	 * will deactivate only them.
> -	 */
> -	if (next - addr != HPAGE_PMD_SIZE) {
> -		get_page(page);
> -		spin_unlock(ptl);
> -		split_huge_page(page);
> -		unlock_page(page);
> -		put_page(page);
> -		goto out_unlocked;
> -	}
> -
> -	if (PageDirty(page))
> -		ClearPageDirty(page);
> -	unlock_page(page);
> -
> -	if (pmd_young(orig_pmd) || pmd_dirty(orig_pmd)) {
> -		pmdp_invalidate(vma, addr, pmd);
> -		orig_pmd = pmd_mkold(orig_pmd);
> -		orig_pmd = pmd_mkclean(orig_pmd);
> -
> -		set_pmd_at(mm, addr, pmd, orig_pmd);
> -		tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
> -	}
> -
> -	mark_page_lazyfree(page);
> -	ret = true;
> -out:
> -	spin_unlock(ptl);
> -out_unlocked:
> -	return ret;
> -}
> -
>  static inline void zap_deposited_table(struct mm_struct *mm, pmd_t *pmd)
>  {
>  	pgtable_t pgtable;
> diff --git a/mm/madvise.c b/mm/madvise.c
> index ee210473f639..13b06dc8d402 100644
> --- a/mm/madvise.c
> +++ b/mm/madvise.c
> @@ -310,6 +310,91 @@ static long madvise_willneed(struct vm_area_struct *vma,
>  	return 0;
>  }
>  
> +enum madv_pmdp_reset_t {
> +	MADV_PMDP_RESET,	/* pmd was reset successfully */
> +	MADV_PMDP_SPLIT,	/* pmd was split */
> +	MADV_PMDP_ERROR,
> +};
> +
> +static enum madv_pmdp_reset_t madvise_pmdp_reset_or_split(struct mm_walk *walk,
> +				pmd_t *pmd, spinlock_t *ptl,
> +				unsigned long addr, unsigned long end,
> +				bool young, bool dirty)
> +{
> +	pmd_t orig_pmd;
> +	unsigned long next;
> +	struct page *page;
> +	struct mmu_gather *tlb = walk->private;
> +	struct mm_struct *mm = walk->mm;
> +	struct vm_area_struct *vma = walk->vma;
> +	bool reset_young = false;
> +	bool reset_dirty = false;
> +	enum madv_pmdp_reset_t ret = MADV_PMDP_ERROR;
> +
> +	orig_pmd = *pmd;
> +	if (is_huge_zero_pmd(orig_pmd))
> +		return ret;
> +
> +	if (unlikely(!pmd_present(orig_pmd))) {
> +		VM_BUG_ON(thp_migration_supported() &&
> +				!is_pmd_migration_entry(orig_pmd));
> +		return ret;
> +	}
> +
> +	next = pmd_addr_end(addr, end);
> +	page = pmd_page(orig_pmd);
> +	if (next - addr != HPAGE_PMD_SIZE) {
> +		/*
> +		 * THP collapsing is not cheap so only split the page is
> +		 * private to the this process.
> +		 */
> +		if (page_mapcount(page) != 1)
> +			return ret;
> +		get_page(page);
> +		spin_unlock(ptl);
> +		lock_page(page);
> +		if (!split_huge_page(page))
> +			ret = MADV_PMDP_SPLIT;
> +		unlock_page(page);
> +		put_page(page);
> +		return ret;
> +	}
> +
> +	if (young && pmd_young(orig_pmd))
> +		reset_young = true;
> +	if (dirty && pmd_dirty(orig_pmd))
> +		reset_dirty = true;
> +
> +	/*
> +	 * Other process could rely on the PG_dirty for data consistency,
> +	 * not pte_dirty so we could reset PG_dirty only when we are owner
> +	 * of the page.
> +	 */
> +	if (reset_dirty) {
> +		if (page_mapcount(page) != 1)
> +			goto out;
> +		if (!trylock_page(page))
> +			goto out;
> +		if (PageDirty(page))
> +			ClearPageDirty(page);
> +		unlock_page(page);
> +	}
> +
> +	ret = MADV_PMDP_RESET;
> +	if (reset_young || reset_dirty) {
> +		tlb_change_page_size(tlb, HPAGE_PMD_SIZE);
> +		pmdp_invalidate(vma, addr, pmd);
> +		if (reset_young)
> +			orig_pmd = pmd_mkold(orig_pmd);
> +		if (reset_dirty)
> +			orig_pmd = pmd_mkclean(orig_pmd);
> +		set_pmd_at(mm, addr, pmd, orig_pmd);
> +		tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
> +	}
> +out:
> +	return ret;
> +}
> +
>  static int madvise_cold_pte_range(pmd_t *pmd, unsigned long addr,
>  				unsigned long end, struct mm_walk *walk)
>  {
> @@ -319,64 +404,31 @@ static int madvise_cold_pte_range(pmd_t *pmd, unsigned long addr,
>  	pte_t *orig_pte, *pte, ptent;
>  	spinlock_t *ptl;
>  	struct page *page;
> -	unsigned long next;
>  
> -	next = pmd_addr_end(addr, end);
>  	if (pmd_trans_huge(*pmd)) {
> -		pmd_t orig_pmd;
> -
> -		tlb_change_page_size(tlb, HPAGE_PMD_SIZE);
>  		ptl = pmd_trans_huge_lock(pmd, vma);
>  		if (!ptl)
>  			return 0;
>  
> -		orig_pmd = *pmd;
> -		if (is_huge_zero_pmd(orig_pmd))
> -			goto huge_unlock;
> -
> -		if (unlikely(!pmd_present(orig_pmd))) {
> -			VM_BUG_ON(thp_migration_supported() &&
> -					!is_pmd_migration_entry(orig_pmd));
> -			goto huge_unlock;
> -		}
> -
> -		page = pmd_page(orig_pmd);
> -		if (next - addr != HPAGE_PMD_SIZE) {
> -			int err;
> -
> -			if (page_mapcount(page) != 1)
> -				goto huge_unlock;
> -
> -			get_page(page);
> +		switch (madvise_pmdp_reset_or_split(walk, pmd, ptl, addr, end,
> +							true, false)) {
> +		case MADV_PMDP_RESET:
>  			spin_unlock(ptl);
> -			lock_page(page);
> -			err = split_huge_page(page);
> -			unlock_page(page);
> -			put_page(page);
> -			if (!err)
> -				goto regular_page;
> -			return 0;
> -		}
> -
> -		if (pmd_young(orig_pmd)) {
> -			pmdp_invalidate(vma, addr, pmd);
> -			orig_pmd = pmd_mkold(orig_pmd);
> -
> -			set_pmd_at(mm, addr, pmd, orig_pmd);
> -			tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
> +			page = pmd_page(*pmd);
> +			test_and_clear_page_young(page);
> +			deactivate_page(page);
> +			goto next;
> +		case MADV_PMDP_ERROR:
> +			spin_unlock(ptl);
> +			goto next;
> +		case MADV_PMDP_SPLIT:
> +			; /* go through */
>  		}
> -
> -		test_and_clear_page_young(page);
> -		deactivate_page(page);
> -huge_unlock:
> -		spin_unlock(ptl);
> -		return 0;
>  	}
>  
>  	if (pmd_trans_unstable(pmd))
>  		return 0;
>  
> -regular_page:
>  	tlb_change_page_size(tlb, PAGE_SIZE);
>  	orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
>  	flush_tlb_batched_pending(mm);
> @@ -443,6 +495,7 @@ static int madvise_cold_pte_range(pmd_t *pmd, unsigned long addr,
>  
>  	arch_enter_lazy_mmu_mode();
>  	pte_unmap_unlock(orig_pte, ptl);
> +next:
>  	cond_resched();
>  
>  	return 0;
> @@ -493,70 +546,38 @@ static int madvise_pageout_pte_range(pmd_t *pmd, unsigned long addr,
>  	LIST_HEAD(page_list);
>  	struct page *page;
>  	int isolated = 0;
> -	unsigned long next;
>  
>  	if (fatal_signal_pending(current))
>  		return -EINTR;
>  
> -	next = pmd_addr_end(addr, end);
>  	if (pmd_trans_huge(*pmd)) {
> -		pmd_t orig_pmd;
> -
> -		tlb_change_page_size(tlb, HPAGE_PMD_SIZE);
>  		ptl = pmd_trans_huge_lock(pmd, vma);
>  		if (!ptl)
>  			return 0;
>  
> -		orig_pmd = *pmd;
> -		if (is_huge_zero_pmd(orig_pmd))
> -			goto huge_unlock;
> -
> -		if (unlikely(!pmd_present(orig_pmd))) {
> -			VM_BUG_ON(thp_migration_supported() &&
> -					!is_pmd_migration_entry(orig_pmd));
> -			goto huge_unlock;
> -		}
> -
> -		page = pmd_page(orig_pmd);
> -		if (next - addr != HPAGE_PMD_SIZE) {
> -			int err;
> -
> -			if (page_mapcount(page) != 1)
> -				goto huge_unlock;
> -			get_page(page);
> +		switch (madvise_pmdp_reset_or_split(walk, pmd, ptl, addr, end,
> +							true, false)) {
> +		case MADV_PMDP_RESET:
> +			page = pmd_page(*pmd);
>  			spin_unlock(ptl);
> -			lock_page(page);
> -			err = split_huge_page(page);
> -			unlock_page(page);
> -			put_page(page);
> -			if (!err)
> -				goto regular_page;
> -			return 0;
> -		}
> -
> -		if (isolate_lru_page(page))
> -			goto huge_unlock;
> -
> -		if (pmd_young(orig_pmd)) {
> -			pmdp_invalidate(vma, addr, pmd);
> -			orig_pmd = pmd_mkold(orig_pmd);
> -
> -			set_pmd_at(mm, addr, pmd, orig_pmd);
> -			tlb_remove_tlb_entry(tlb, pmd, addr);
> +			if (isolate_lru_page(page))
> +				return 0;
> +			ClearPageReferenced(page);
> +			test_and_clear_page_young(page);
> +			list_add(&page->lru, &page_list);
> +			reclaim_pages(&page_list);
> +			goto next;
> +		case MADV_PMDP_ERROR:
> +			spin_unlock(ptl);
> +			goto next;
> +		case MADV_PMDP_SPLIT:
> +			; /* go through */
>  		}
> -
> -		ClearPageReferenced(page);
> -		test_and_clear_page_young(page);
> -		list_add(&page->lru, &page_list);
> -huge_unlock:
> -		spin_unlock(ptl);
> -		reclaim_pages(&page_list);
> -		return 0;
>  	}
>  
>  	if (pmd_trans_unstable(pmd))
>  		return 0;
> -regular_page:
> +
>  	tlb_change_page_size(tlb, PAGE_SIZE);
>  	orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
>  	flush_tlb_batched_pending(mm);
> @@ -631,6 +652,7 @@ static int madvise_pageout_pte_range(pmd_t *pmd, unsigned long addr,
>  	arch_leave_lazy_mmu_mode();
>  	pte_unmap_unlock(orig_pte, ptl);
>  	reclaim_pages(&page_list);
> +next:
>  	cond_resched();
>  
>  	return 0;
> @@ -700,12 +722,26 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
>  	pte_t *orig_pte, *pte, ptent;
>  	struct page *page;
>  	int nr_swap = 0;
> -	unsigned long next;
>  
> -	next = pmd_addr_end(addr, end);
> -	if (pmd_trans_huge(*pmd))
> -		if (madvise_free_huge_pmd(tlb, vma, pmd, addr, next))
> +	if (pmd_trans_huge(*pmd)) {
> +		ptl = pmd_trans_huge_lock(pmd, vma);
> +		if (!ptl)
> +			return 0;
> +
> +		switch (madvise_pmdp_reset_or_split(walk, pmd, ptl, addr, end,
> +							true, true)) {
> +		case MADV_PMDP_RESET:
> +			page = pmd_page(*pmd);
> +			spin_unlock(ptl);
> +			mark_page_lazyfree(page);
>  			goto next;
> +		case MADV_PMDP_ERROR:
> +			spin_unlock(ptl);
> +			goto next;
> +		case MADV_PMDP_SPLIT:
> +			; /* go through */
> +		}
> +	}
>  
>  	if (pmd_trans_unstable(pmd))
>  		return 0;
> @@ -817,8 +853,8 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
>  	}
>  	arch_leave_lazy_mmu_mode();
>  	pte_unmap_unlock(orig_pte, ptl);
> -	cond_resched();
>  next:
> +	cond_resched();
>  	return 0;
>  }
>  
> -- 
> 2.22.0.410.gd8fdbe21b5-goog

-- 
Michal Hocko
SUSE Labs

^ permalink raw reply

* Re: [PATCH v9 00/24] ILP32 for ARM64
From: Yury Norov @ 2019-07-09 22:42 UTC (permalink / raw)
  To: Yury Norov
  Cc: Catalin Marinas, Arnd Bergmann, linux-arm-kernel, linux-kernel,
	linux-doc, linux-arch, linux-api, Adam Borowski, Alexander Graf,
	Alexey Klimov, Andreas Schwab, Andrew Pinski, Bamvor Zhangjian,
	Chris Metcalf, Christoph Muellner, Dave Martin, David S . Miller,
	Florian Weimer, Geert Uytterhoeven, Heiko Carstens,
	James Hogan <jame>
In-Reply-To: <20180516081910.10067-1-ynorov@caviumnetworks.com>

Hi all,

On Wed, May 16, 2018 at 11:18:45AM +0300, Yury Norov wrote:
> This series enables AARCH64 with ILP32 mode.
> 
> As supporting work, it introduces ARCH_32BIT_OFF_T configuration
> option that is enabled for existing 32-bit architectures but disabled
> for new arches (so 64-bit off_t userspace type is used by new userspace).
> Also it deprecates getrlimit and setrlimit syscalls prior to prlimit64.
> 
> Based on kernel v4.16. Tested with LTP, glibc testsuite, trinity, lmbench,
> CPUSpec.
> 
> This series on github: 
> https://github.com/norov/linux/tree/ilp32-4.16
> Linaro toolchain:
> http://snapshots.linaro.org/components/toolchain/binaries/7.3-2018.04-rc1/aarch64-linux-gnu_ilp32/
> Debian repo:
> http://people.linaro.org/~wookey/ilp32/
> OpenSUSE repo:
> https://build.opensuse.org/project/show/devel:ARM:Factory:Contrib:ILP32
> 
> Changes:
> v3: https://lkml.org/lkml/2014/9/3/704
> v4: https://lkml.org/lkml/2015/4/13/691
> v5: https://lkml.org/lkml/2015/9/29/911
> v6: https://lkml.org/lkml/2016/5/23/661
> v7: https://lkml.org/lkml/2017/1/9/213
> v8: https://lkml.org/lkml/2017/6/19/624
> v9: - rebased on top of v4.16;
>     - signal subsystem reworked to avoid code duplication, as requested
>       by Dave Martin (patches 18 and 20);
>     - new files introduced in series use SPDX notation for license;
>     - linux-api and linux-arch CCed as the series changes kernel ABI;
>     - checkpatch and other minor fixes.
>     - Zhou Chengming's reported-by for patch 2 and signed-off-by for
>       patch 21 removed because his email became invalid. Zhou, please
>       share your new email.

This is a 5.2-based version of series.
https://github.com/norov/linux/tree/ilp32-5.2

Thanks,
Yury

^ permalink raw reply

* Re: [PATCH v2 00/11] FPGA DFL updates
From: Wu Hao @ 2019-07-10  5:07 UTC (permalink / raw)
  To: gregkh, mdf, linux-fpga; +Cc: linux-kernel, linux-api, atull
In-Reply-To: <1562286238-11413-1-git-send-email-hao.wu@intel.com>

On Fri, Jul 05, 2019 at 08:23:47AM +0800, Wu Hao wrote:
> Hi Greg / Moritz
> 
> This is v2 patchset which adds more features to FPGA DFL. This patchset
> is made on top of patch[1] and char-misc-next tree. Documentation patch
> for DFL is dropped from this patchset, and will resubmit it later to
> avoid conflict.

Hi Greg,

Did you get a chance to take a look at this new version to see if these
patches are good to take?

Hope we can catch up with the merge window.

Thanks
Hao

> 
> Main changes from v1:
>   - remove DRV/MODULE_VERSION modifications. (patch #1, #3, #4, #6)
>   - remove argsz from new ioctls. (patch #2)
>   - replace sysfs_create/remove_* with device_add/remove_* for sysfs entries.
>     (patch #5, #8, #11)
> 
> [1] [PATCH] fpga: dfl: use driver core functions, not sysfs ones.
>     https://lkml.org/lkml/2019/7/4/36
> 
> Wu Hao (11):
>   fpga: dfl: fme: support 512bit data width PR
>   fpga: dfl: fme: add DFL_FPGA_FME_PORT_RELEASE/ASSIGN ioctl support.
>   fpga: dfl: pci: enable SRIOV support.
>   fpga: dfl: afu: add AFU state related sysfs interfaces
>   fpga: dfl: afu: add userclock sysfs interfaces.
>   fpga: dfl: add id_table for dfl private feature driver
>   fpga: dfl: afu: export __port_enable/disable function.
>   fpga: dfl: afu: add error reporting support.
>   fpga: dfl: afu: add STP (SignalTap) support
>   fpga: dfl: fme: add capability sysfs interfaces
>   fpga: dfl: fme: add global error reporting support
> 
>  Documentation/ABI/testing/sysfs-platform-dfl-fme  |  98 ++++++
>  Documentation/ABI/testing/sysfs-platform-dfl-port | 104 ++++++
>  drivers/fpga/Makefile                             |   3 +-
>  drivers/fpga/dfl-afu-error.c                      | 225 +++++++++++++
>  drivers/fpga/dfl-afu-main.c                       | 328 +++++++++++++++++-
>  drivers/fpga/dfl-afu.h                            |   7 +
>  drivers/fpga/dfl-fme-error.c                      | 385 ++++++++++++++++++++++
>  drivers/fpga/dfl-fme-main.c                       |  93 +++++-
>  drivers/fpga/dfl-fme-mgr.c                        | 110 ++++++-
>  drivers/fpga/dfl-fme-pr.c                         |  50 ++-
>  drivers/fpga/dfl-fme.h                            |   7 +-
>  drivers/fpga/dfl-pci.c                            |  39 +++
>  drivers/fpga/dfl.c                                | 166 +++++++++-
>  drivers/fpga/dfl.h                                |  54 ++-
>  include/uapi/linux/fpga-dfl.h                     |  19 ++
>  15 files changed, 1617 insertions(+), 71 deletions(-)
>  create mode 100644 drivers/fpga/dfl-afu-error.c
>  create mode 100644 drivers/fpga/dfl-fme-error.c
> 
> -- 
> 1.8.3.1

^ permalink raw reply

* Re: [PATCH v2 00/11] FPGA DFL updates
From: Greg KH @ 2019-07-10  5:54 UTC (permalink / raw)
  To: Wu Hao; +Cc: mdf, linux-fpga, linux-kernel, linux-api, atull
In-Reply-To: <20190710050746.GA28620@hao-dev>

On Wed, Jul 10, 2019 at 01:07:46PM +0800, Wu Hao wrote:
> On Fri, Jul 05, 2019 at 08:23:47AM +0800, Wu Hao wrote:
> > Hi Greg / Moritz
> > 
> > This is v2 patchset which adds more features to FPGA DFL. This patchset
> > is made on top of patch[1] and char-misc-next tree. Documentation patch
> > for DFL is dropped from this patchset, and will resubmit it later to
> > avoid conflict.
> 
> Hi Greg,
> 
> Did you get a chance to take a look at this new version to see if these
> patches are good to take?
> 
> Hope we can catch up with the merge window.

You sent them last Friday.  I actually tried to not do kernel work last
weekend.  The merge window opened up on Sunday.  My trees should have
been closed last week, so no, this missed this merge window, I'll
review these patches once 5.3-rc1 is out.

thanks

greg k-h

^ permalink raw reply

* Re: [PATCH v2 00/11] FPGA DFL updates
From: Wu Hao @ 2019-07-10  7:22 UTC (permalink / raw)
  To: Greg KH; +Cc: mdf, linux-fpga, linux-kernel, linux-api, atull
In-Reply-To: <20190710055417.GA5778@kroah.com>

On Wed, Jul 10, 2019 at 07:54:17AM +0200, Greg KH wrote:
> On Wed, Jul 10, 2019 at 01:07:46PM +0800, Wu Hao wrote:
> > On Fri, Jul 05, 2019 at 08:23:47AM +0800, Wu Hao wrote:
> > > Hi Greg / Moritz
> > > 
> > > This is v2 patchset which adds more features to FPGA DFL. This patchset
> > > is made on top of patch[1] and char-misc-next tree. Documentation patch
> > > for DFL is dropped from this patchset, and will resubmit it later to
> > > avoid conflict.
> > 
> > Hi Greg,
> > 
> > Did you get a chance to take a look at this new version to see if these
> > patches are good to take?
> > 
> > Hope we can catch up with the merge window.
> 
> You sent them last Friday.  I actually tried to not do kernel work last
> weekend.  The merge window opened up on Sunday.  My trees should have
> been closed last week, so no, this missed this merge window, I'll
> review these patches once 5.3-rc1 is out.

Oh..... I see...

Ok. Then let me resend it with documentation patch added back once 5.3-rc1
is out.

Thanks
Hao

> 
> thanks
> 
> greg k-h

^ permalink raw reply

* Re: [PATCH v3 4/5] mm: introduce MADV_PAGEOUT
From: Minchan Kim @ 2019-07-10 10:48 UTC (permalink / raw)
  To: Michal Hocko
  Cc: Andrew Morton, linux-mm, LKML, linux-api, Johannes Weiner,
	Tim Murray, Joel Fernandes, Suren Baghdasaryan, Daniel Colascione,
	Shakeel Butt, Sonny Rao, oleksandr, hdanton, lizeb, Dave Hansen,
	Kirill A . Shutemov
In-Reply-To: <20190709095518.GF26380@dhcp22.suse.cz>

On Tue, Jul 09, 2019 at 11:55:19AM +0200, Michal Hocko wrote:
> On Thu 27-06-19 20:54:04, Minchan Kim wrote:
> > When a process expects no accesses to a certain memory range
> > for a long time, it could hint kernel that the pages can be
> > reclaimed instantly but data should be preserved for future use.
> > This could reduce workingset eviction so it ends up increasing
> > performance.
> > 
> > This patch introduces the new MADV_PAGEOUT hint to madvise(2)
> > syscall. MADV_PAGEOUT can be used by a process to mark a memory
> > range as not expected to be used for a long time so that kernel
> > reclaims *any LRU* pages instantly. The hint can help kernel in
> > deciding which pages to evict proactively.
> > 
> > - man-page material
> > 
> > MADV_PAGEOUT (since Linux x.x)
> > 
> > Do not expect access in the near future so pages in the specified
> > regions could be reclaimed instantly regardless of memory pressure.
> > Thus, access in the range after successful operation could cause
> > major page fault but never lose the up-to-date contents unlike
> > MADV_DONTNEED.
> 
> > It works for only private anonymous mappings and
> > non-anonymous mappings that belong to files that the calling process
> > could successfully open for writing; otherwise, it could be used for
> > sidechannel attack.
> 
> I would rephrase this way:
> "
> Pages belonging to a shared mapping are only processed if a write access
> is allowed for the calling process.
> "
> 
> I wouldn't really mention side channel attacks for a man page. You can
> mention can_do_mincore check and the side channel prevention in the
> changelog that is not aimed for the man page.

Agree. I will rephrase with one you suggested.
Thanks for the suggestion.

> 
> > MADV_PAGEOUT cannot be applied to locked pages, Huge TLB pages, or
> > VM_PFNMAP pages.
> > 
> > * v2
> >  * add comment about SWAP_CLUSTER_MAX - mhocko
> >  * add permission check to prevent sidechannel attack - mhocko
> >  * add man page stuff - dave
> > 
> > * v1
> >  * change pte to old and rely on the other's reference - hannes
> >  * remove page_mapcount to check shared page - mhocko
> > 
> > * RFC v2
> >  * make reclaim_pages simple via factoring out isolate logic - hannes
> > 
> > * RFCv1
> >  * rename from MADV_COLD to MADV_PAGEOUT - hannes
> >  * bail out if process is being killed - Hillf
> >  * fix reclaim_pages bugs - Hillf
> > 
> > Signed-off-by: Minchan Kim <minchan@kernel.org>
> > ---
> 
> 
> I am still not convinced about the SWAP_CLUSTER_MAX batching and the
> udnerlying OOM argument. Is one pmd worth of pages really an OOM risk?
> Sure you can have many invocations in parallel and that would add on
> but the same might happen with SWAP_CLUSTER_MAX. So I would just remove
> the batching for now and think of it only if we really see this being a
> problem for real. Unless you feel really strong about this, of course.

I don't have the number to support SWAP_CLUSTER_MAX batching for hinting
operations. However, I wanted to be consistent with other LRU batching
logic so that it could affect altogether if someone try to increase
SWAP_CLUSTER_MAX which is more efficienty for batching operation, later.
(AFAIK, someone tried it a few years ago but rollback soon, I couldn't
rebemeber what was the reason at that time, anyway).

> 
> Anyway the patch looks ok to me otherwise.
> 
> Acked-by: Michal Hocko <mhocko@suse.co>

Thanks!

^ permalink raw reply

* Re: [PATCH v3 5/5] mm: factor out pmd young/dirty bit handling and THP split
From: Minchan Kim @ 2019-07-10 10:56 UTC (permalink / raw)
  To: Michal Hocko
  Cc: Andrew Morton, linux-mm, LKML, linux-api, Johannes Weiner,
	Tim Murray, Joel Fernandes, Suren Baghdasaryan, Daniel Colascione,
	Shakeel Butt, Sonny Rao, oleksandr, hdanton, lizeb, Dave Hansen,
	Kirill A . Shutemov
In-Reply-To: <20190709141019.GN26380@dhcp22.suse.cz>

On Tue, Jul 09, 2019 at 04:10:19PM +0200, Michal Hocko wrote:
> On Thu 27-06-19 20:54:05, Minchan Kim wrote:
> > Now, there are common part among MADV_COLD|PAGEOUT|FREE to reset
> > access/dirty bit resetting or split the THP page to handle part
> > of subpages in the THP page. This patch factor out the common part.
> 
> While this reduces the code duplication to some degree I suspect it only
> goes half way. I haven't tried that myself due to lack of time but I
> believe this has a potential to reduce even more. All those madvise
> calls are doing the same thing essentially. What page tables and apply
> an operation on ptes and/or a page that is mapped. And that suggests
> that the specific operation should be good with defining two - pte and
> page operations on each entry. All the rest should be a common code.
> 
> That being said, I do not feel strongly about this patch. The rest of
> the series should be good enough even without it and I wouldn't delay it
> just by discussing a potential of the cleanup.

I totally agree with you. For several cycles, some people asked me to
factor common part out. I understand them why they wanted it. However,
when I tried it, it's not trivial to clean it out due to subtle
difference of them. If I couldn't make it clean at this moment, I want to
keep them without factoing out since it's more readable, at least.

I will drop this patch next submit unless someone pop with better idea.

> 
> > Signed-off-by: Minchan Kim <minchan@kernel.org>
> > ---
> >  include/linux/huge_mm.h |   3 -
> >  mm/huge_memory.c        |  74 -------------
> >  mm/madvise.c            | 234 +++++++++++++++++++++++-----------------
> >  3 files changed, 135 insertions(+), 176 deletions(-)
> > 
> > diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
> > index 7cd5c150c21d..2667e1aa3ce5 100644
> > --- a/include/linux/huge_mm.h
> > +++ b/include/linux/huge_mm.h
> > @@ -29,9 +29,6 @@ extern struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
> >  					  unsigned long addr,
> >  					  pmd_t *pmd,
> >  					  unsigned int flags);
> > -extern bool madvise_free_huge_pmd(struct mmu_gather *tlb,
> > -			struct vm_area_struct *vma,
> > -			pmd_t *pmd, unsigned long addr, unsigned long next);
> >  extern int zap_huge_pmd(struct mmu_gather *tlb,
> >  			struct vm_area_struct *vma,
> >  			pmd_t *pmd, unsigned long addr);
> > diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> > index 93f531b63a45..e4b9a06788f3 100644
> > --- a/mm/huge_memory.c
> > +++ b/mm/huge_memory.c
> > @@ -1671,80 +1671,6 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t pmd)
> >  	return 0;
> >  }
> >  
> > -/*
> > - * Return true if we do MADV_FREE successfully on entire pmd page.
> > - * Otherwise, return false.
> > - */
> > -bool madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
> > -		pmd_t *pmd, unsigned long addr, unsigned long next)
> > -{
> > -	spinlock_t *ptl;
> > -	pmd_t orig_pmd;
> > -	struct page *page;
> > -	struct mm_struct *mm = tlb->mm;
> > -	bool ret = false;
> > -
> > -	tlb_change_page_size(tlb, HPAGE_PMD_SIZE);
> > -
> > -	ptl = pmd_trans_huge_lock(pmd, vma);
> > -	if (!ptl)
> > -		goto out_unlocked;
> > -
> > -	orig_pmd = *pmd;
> > -	if (is_huge_zero_pmd(orig_pmd))
> > -		goto out;
> > -
> > -	if (unlikely(!pmd_present(orig_pmd))) {
> > -		VM_BUG_ON(thp_migration_supported() &&
> > -				  !is_pmd_migration_entry(orig_pmd));
> > -		goto out;
> > -	}
> > -
> > -	page = pmd_page(orig_pmd);
> > -	/*
> > -	 * If other processes are mapping this page, we couldn't discard
> > -	 * the page unless they all do MADV_FREE so let's skip the page.
> > -	 */
> > -	if (page_mapcount(page) != 1)
> > -		goto out;
> > -
> > -	if (!trylock_page(page))
> > -		goto out;
> > -
> > -	/*
> > -	 * If user want to discard part-pages of THP, split it so MADV_FREE
> > -	 * will deactivate only them.
> > -	 */
> > -	if (next - addr != HPAGE_PMD_SIZE) {
> > -		get_page(page);
> > -		spin_unlock(ptl);
> > -		split_huge_page(page);
> > -		unlock_page(page);
> > -		put_page(page);
> > -		goto out_unlocked;
> > -	}
> > -
> > -	if (PageDirty(page))
> > -		ClearPageDirty(page);
> > -	unlock_page(page);
> > -
> > -	if (pmd_young(orig_pmd) || pmd_dirty(orig_pmd)) {
> > -		pmdp_invalidate(vma, addr, pmd);
> > -		orig_pmd = pmd_mkold(orig_pmd);
> > -		orig_pmd = pmd_mkclean(orig_pmd);
> > -
> > -		set_pmd_at(mm, addr, pmd, orig_pmd);
> > -		tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
> > -	}
> > -
> > -	mark_page_lazyfree(page);
> > -	ret = true;
> > -out:
> > -	spin_unlock(ptl);
> > -out_unlocked:
> > -	return ret;
> > -}
> > -
> >  static inline void zap_deposited_table(struct mm_struct *mm, pmd_t *pmd)
> >  {
> >  	pgtable_t pgtable;
> > diff --git a/mm/madvise.c b/mm/madvise.c
> > index ee210473f639..13b06dc8d402 100644
> > --- a/mm/madvise.c
> > +++ b/mm/madvise.c
> > @@ -310,6 +310,91 @@ static long madvise_willneed(struct vm_area_struct *vma,
> >  	return 0;
> >  }
> >  
> > +enum madv_pmdp_reset_t {
> > +	MADV_PMDP_RESET,	/* pmd was reset successfully */
> > +	MADV_PMDP_SPLIT,	/* pmd was split */
> > +	MADV_PMDP_ERROR,
> > +};
> > +
> > +static enum madv_pmdp_reset_t madvise_pmdp_reset_or_split(struct mm_walk *walk,
> > +				pmd_t *pmd, spinlock_t *ptl,
> > +				unsigned long addr, unsigned long end,
> > +				bool young, bool dirty)
> > +{
> > +	pmd_t orig_pmd;
> > +	unsigned long next;
> > +	struct page *page;
> > +	struct mmu_gather *tlb = walk->private;
> > +	struct mm_struct *mm = walk->mm;
> > +	struct vm_area_struct *vma = walk->vma;
> > +	bool reset_young = false;
> > +	bool reset_dirty = false;
> > +	enum madv_pmdp_reset_t ret = MADV_PMDP_ERROR;
> > +
> > +	orig_pmd = *pmd;
> > +	if (is_huge_zero_pmd(orig_pmd))
> > +		return ret;
> > +
> > +	if (unlikely(!pmd_present(orig_pmd))) {
> > +		VM_BUG_ON(thp_migration_supported() &&
> > +				!is_pmd_migration_entry(orig_pmd));
> > +		return ret;
> > +	}
> > +
> > +	next = pmd_addr_end(addr, end);
> > +	page = pmd_page(orig_pmd);
> > +	if (next - addr != HPAGE_PMD_SIZE) {
> > +		/*
> > +		 * THP collapsing is not cheap so only split the page is
> > +		 * private to the this process.
> > +		 */
> > +		if (page_mapcount(page) != 1)
> > +			return ret;
> > +		get_page(page);
> > +		spin_unlock(ptl);
> > +		lock_page(page);
> > +		if (!split_huge_page(page))
> > +			ret = MADV_PMDP_SPLIT;
> > +		unlock_page(page);
> > +		put_page(page);
> > +		return ret;
> > +	}
> > +
> > +	if (young && pmd_young(orig_pmd))
> > +		reset_young = true;
> > +	if (dirty && pmd_dirty(orig_pmd))
> > +		reset_dirty = true;
> > +
> > +	/*
> > +	 * Other process could rely on the PG_dirty for data consistency,
> > +	 * not pte_dirty so we could reset PG_dirty only when we are owner
> > +	 * of the page.
> > +	 */
> > +	if (reset_dirty) {
> > +		if (page_mapcount(page) != 1)
> > +			goto out;
> > +		if (!trylock_page(page))
> > +			goto out;
> > +		if (PageDirty(page))
> > +			ClearPageDirty(page);
> > +		unlock_page(page);
> > +	}
> > +
> > +	ret = MADV_PMDP_RESET;
> > +	if (reset_young || reset_dirty) {
> > +		tlb_change_page_size(tlb, HPAGE_PMD_SIZE);
> > +		pmdp_invalidate(vma, addr, pmd);
> > +		if (reset_young)
> > +			orig_pmd = pmd_mkold(orig_pmd);
> > +		if (reset_dirty)
> > +			orig_pmd = pmd_mkclean(orig_pmd);
> > +		set_pmd_at(mm, addr, pmd, orig_pmd);
> > +		tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
> > +	}
> > +out:
> > +	return ret;
> > +}
> > +
> >  static int madvise_cold_pte_range(pmd_t *pmd, unsigned long addr,
> >  				unsigned long end, struct mm_walk *walk)
> >  {
> > @@ -319,64 +404,31 @@ static int madvise_cold_pte_range(pmd_t *pmd, unsigned long addr,
> >  	pte_t *orig_pte, *pte, ptent;
> >  	spinlock_t *ptl;
> >  	struct page *page;
> > -	unsigned long next;
> >  
> > -	next = pmd_addr_end(addr, end);
> >  	if (pmd_trans_huge(*pmd)) {
> > -		pmd_t orig_pmd;
> > -
> > -		tlb_change_page_size(tlb, HPAGE_PMD_SIZE);
> >  		ptl = pmd_trans_huge_lock(pmd, vma);
> >  		if (!ptl)
> >  			return 0;
> >  
> > -		orig_pmd = *pmd;
> > -		if (is_huge_zero_pmd(orig_pmd))
> > -			goto huge_unlock;
> > -
> > -		if (unlikely(!pmd_present(orig_pmd))) {
> > -			VM_BUG_ON(thp_migration_supported() &&
> > -					!is_pmd_migration_entry(orig_pmd));
> > -			goto huge_unlock;
> > -		}
> > -
> > -		page = pmd_page(orig_pmd);
> > -		if (next - addr != HPAGE_PMD_SIZE) {
> > -			int err;
> > -
> > -			if (page_mapcount(page) != 1)
> > -				goto huge_unlock;
> > -
> > -			get_page(page);
> > +		switch (madvise_pmdp_reset_or_split(walk, pmd, ptl, addr, end,
> > +							true, false)) {
> > +		case MADV_PMDP_RESET:
> >  			spin_unlock(ptl);
> > -			lock_page(page);
> > -			err = split_huge_page(page);
> > -			unlock_page(page);
> > -			put_page(page);
> > -			if (!err)
> > -				goto regular_page;
> > -			return 0;
> > -		}
> > -
> > -		if (pmd_young(orig_pmd)) {
> > -			pmdp_invalidate(vma, addr, pmd);
> > -			orig_pmd = pmd_mkold(orig_pmd);
> > -
> > -			set_pmd_at(mm, addr, pmd, orig_pmd);
> > -			tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
> > +			page = pmd_page(*pmd);
> > +			test_and_clear_page_young(page);
> > +			deactivate_page(page);
> > +			goto next;
> > +		case MADV_PMDP_ERROR:
> > +			spin_unlock(ptl);
> > +			goto next;
> > +		case MADV_PMDP_SPLIT:
> > +			; /* go through */
> >  		}
> > -
> > -		test_and_clear_page_young(page);
> > -		deactivate_page(page);
> > -huge_unlock:
> > -		spin_unlock(ptl);
> > -		return 0;
> >  	}
> >  
> >  	if (pmd_trans_unstable(pmd))
> >  		return 0;
> >  
> > -regular_page:
> >  	tlb_change_page_size(tlb, PAGE_SIZE);
> >  	orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
> >  	flush_tlb_batched_pending(mm);
> > @@ -443,6 +495,7 @@ static int madvise_cold_pte_range(pmd_t *pmd, unsigned long addr,
> >  
> >  	arch_enter_lazy_mmu_mode();
> >  	pte_unmap_unlock(orig_pte, ptl);
> > +next:
> >  	cond_resched();
> >  
> >  	return 0;
> > @@ -493,70 +546,38 @@ static int madvise_pageout_pte_range(pmd_t *pmd, unsigned long addr,
> >  	LIST_HEAD(page_list);
> >  	struct page *page;
> >  	int isolated = 0;
> > -	unsigned long next;
> >  
> >  	if (fatal_signal_pending(current))
> >  		return -EINTR;
> >  
> > -	next = pmd_addr_end(addr, end);
> >  	if (pmd_trans_huge(*pmd)) {
> > -		pmd_t orig_pmd;
> > -
> > -		tlb_change_page_size(tlb, HPAGE_PMD_SIZE);
> >  		ptl = pmd_trans_huge_lock(pmd, vma);
> >  		if (!ptl)
> >  			return 0;
> >  
> > -		orig_pmd = *pmd;
> > -		if (is_huge_zero_pmd(orig_pmd))
> > -			goto huge_unlock;
> > -
> > -		if (unlikely(!pmd_present(orig_pmd))) {
> > -			VM_BUG_ON(thp_migration_supported() &&
> > -					!is_pmd_migration_entry(orig_pmd));
> > -			goto huge_unlock;
> > -		}
> > -
> > -		page = pmd_page(orig_pmd);
> > -		if (next - addr != HPAGE_PMD_SIZE) {
> > -			int err;
> > -
> > -			if (page_mapcount(page) != 1)
> > -				goto huge_unlock;
> > -			get_page(page);
> > +		switch (madvise_pmdp_reset_or_split(walk, pmd, ptl, addr, end,
> > +							true, false)) {
> > +		case MADV_PMDP_RESET:
> > +			page = pmd_page(*pmd);
> >  			spin_unlock(ptl);
> > -			lock_page(page);
> > -			err = split_huge_page(page);
> > -			unlock_page(page);
> > -			put_page(page);
> > -			if (!err)
> > -				goto regular_page;
> > -			return 0;
> > -		}
> > -
> > -		if (isolate_lru_page(page))
> > -			goto huge_unlock;
> > -
> > -		if (pmd_young(orig_pmd)) {
> > -			pmdp_invalidate(vma, addr, pmd);
> > -			orig_pmd = pmd_mkold(orig_pmd);
> > -
> > -			set_pmd_at(mm, addr, pmd, orig_pmd);
> > -			tlb_remove_tlb_entry(tlb, pmd, addr);
> > +			if (isolate_lru_page(page))
> > +				return 0;
> > +			ClearPageReferenced(page);
> > +			test_and_clear_page_young(page);
> > +			list_add(&page->lru, &page_list);
> > +			reclaim_pages(&page_list);
> > +			goto next;
> > +		case MADV_PMDP_ERROR:
> > +			spin_unlock(ptl);
> > +			goto next;
> > +		case MADV_PMDP_SPLIT:
> > +			; /* go through */
> >  		}
> > -
> > -		ClearPageReferenced(page);
> > -		test_and_clear_page_young(page);
> > -		list_add(&page->lru, &page_list);
> > -huge_unlock:
> > -		spin_unlock(ptl);
> > -		reclaim_pages(&page_list);
> > -		return 0;
> >  	}
> >  
> >  	if (pmd_trans_unstable(pmd))
> >  		return 0;
> > -regular_page:
> > +
> >  	tlb_change_page_size(tlb, PAGE_SIZE);
> >  	orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
> >  	flush_tlb_batched_pending(mm);
> > @@ -631,6 +652,7 @@ static int madvise_pageout_pte_range(pmd_t *pmd, unsigned long addr,
> >  	arch_leave_lazy_mmu_mode();
> >  	pte_unmap_unlock(orig_pte, ptl);
> >  	reclaim_pages(&page_list);
> > +next:
> >  	cond_resched();
> >  
> >  	return 0;
> > @@ -700,12 +722,26 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
> >  	pte_t *orig_pte, *pte, ptent;
> >  	struct page *page;
> >  	int nr_swap = 0;
> > -	unsigned long next;
> >  
> > -	next = pmd_addr_end(addr, end);
> > -	if (pmd_trans_huge(*pmd))
> > -		if (madvise_free_huge_pmd(tlb, vma, pmd, addr, next))
> > +	if (pmd_trans_huge(*pmd)) {
> > +		ptl = pmd_trans_huge_lock(pmd, vma);
> > +		if (!ptl)
> > +			return 0;
> > +
> > +		switch (madvise_pmdp_reset_or_split(walk, pmd, ptl, addr, end,
> > +							true, true)) {
> > +		case MADV_PMDP_RESET:
> > +			page = pmd_page(*pmd);
> > +			spin_unlock(ptl);
> > +			mark_page_lazyfree(page);
> >  			goto next;
> > +		case MADV_PMDP_ERROR:
> > +			spin_unlock(ptl);
> > +			goto next;
> > +		case MADV_PMDP_SPLIT:
> > +			; /* go through */
> > +		}
> > +	}
> >  
> >  	if (pmd_trans_unstable(pmd))
> >  		return 0;
> > @@ -817,8 +853,8 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
> >  	}
> >  	arch_leave_lazy_mmu_mode();
> >  	pte_unmap_unlock(orig_pte, ptl);
> > -	cond_resched();
> >  next:
> > +	cond_resched();
> >  	return 0;
> >  }
> >  
> > -- 
> > 2.22.0.410.gd8fdbe21b5-goog
> 
> -- 
> Michal Hocko
> SUSE Labs

^ permalink raw reply

* Re: [PATCH v3 4/5] mm: introduce MADV_PAGEOUT
From: Michal Hocko @ 2019-07-10 11:16 UTC (permalink / raw)
  To: Minchan Kim
  Cc: Andrew Morton, linux-mm, LKML, linux-api, Johannes Weiner,
	Tim Murray, Joel Fernandes, Suren Baghdasaryan, Daniel Colascione,
	Shakeel Butt, Sonny Rao, oleksandr, hdanton, lizeb, Dave Hansen,
	Kirill A . Shutemov
In-Reply-To: <20190710104809.GA186559@google.com>

On Wed 10-07-19 19:48:09, Minchan Kim wrote:
> On Tue, Jul 09, 2019 at 11:55:19AM +0200, Michal Hocko wrote:
[...]
> > I am still not convinced about the SWAP_CLUSTER_MAX batching and the
> > udnerlying OOM argument. Is one pmd worth of pages really an OOM risk?
> > Sure you can have many invocations in parallel and that would add on
> > but the same might happen with SWAP_CLUSTER_MAX. So I would just remove
> > the batching for now and think of it only if we really see this being a
> > problem for real. Unless you feel really strong about this, of course.
> 
> I don't have the number to support SWAP_CLUSTER_MAX batching for hinting
> operations. However, I wanted to be consistent with other LRU batching
> logic so that it could affect altogether if someone try to increase
> SWAP_CLUSTER_MAX which is more efficienty for batching operation, later.
> (AFAIK, someone tried it a few years ago but rollback soon, I couldn't
> rebemeber what was the reason at that time, anyway).

Then please drop this part. It makes the code more complex while any
benefit is not demonstrated.

> > Anyway the patch looks ok to me otherwise.
> > 
> > Acked-by: Michal Hocko <mhocko@suse.co>
> 
> Thanks!

-- 
Michal Hocko
SUSE Labs

^ permalink raw reply

* Re: [PATCH v3 4/5] mm: introduce MADV_PAGEOUT
From: Minchan Kim @ 2019-07-10 11:53 UTC (permalink / raw)
  To: Michal Hocko
  Cc: Andrew Morton, linux-mm, LKML, linux-api, Johannes Weiner,
	Tim Murray, Joel Fernandes, Suren Baghdasaryan, Daniel Colascione,
	Shakeel Butt, Sonny Rao, oleksandr, hdanton, lizeb, Dave Hansen,
	Kirill A . Shutemov
In-Reply-To: <20190710111622.GI29695@dhcp22.suse.cz>

On Wed, Jul 10, 2019 at 01:16:22PM +0200, Michal Hocko wrote:
> On Wed 10-07-19 19:48:09, Minchan Kim wrote:
> > On Tue, Jul 09, 2019 at 11:55:19AM +0200, Michal Hocko wrote:
> [...]
> > > I am still not convinced about the SWAP_CLUSTER_MAX batching and the
> > > udnerlying OOM argument. Is one pmd worth of pages really an OOM risk?
> > > Sure you can have many invocations in parallel and that would add on
> > > but the same might happen with SWAP_CLUSTER_MAX. So I would just remove
> > > the batching for now and think of it only if we really see this being a
> > > problem for real. Unless you feel really strong about this, of course.
> > 
> > I don't have the number to support SWAP_CLUSTER_MAX batching for hinting
> > operations. However, I wanted to be consistent with other LRU batching
> > logic so that it could affect altogether if someone try to increase
> > SWAP_CLUSTER_MAX which is more efficienty for batching operation, later.
> > (AFAIK, someone tried it a few years ago but rollback soon, I couldn't
> > rebemeber what was the reason at that time, anyway).
> 
> Then please drop this part. It makes the code more complex while any
> benefit is not demonstrated.

The history says the benefit.
https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/patch/?id=d37dd5dcb955dd8c2cdd4eaef1f15d1b7ecbc379
With the history, rather than proving it's worth for upcoming new code,
need to try to prove no harmful any longer if we want to remove(or not
consistent with other reclaim path). It's not the goal of this patch.

^ permalink raw reply

* Re: [PATCH V34 10/29] hibernate: Disable when the kernel is locked down
From: Joey Lee @ 2019-07-10 15:26 UTC (permalink / raw)
  To: Jiri Kosina
  Cc: Pavel Machek, Matthew Garrett, jmorris@namei.org,
	linux-security-module@vger.kernel.org,
	linux-kernel@vger.kernel.org, linux-api@vger.kernel.org,
	Josh Boyer, David Howells, Matthew Garrett, rjw@rjwysocki.net,
	linux-pm@vger.kernel.org
In-Reply-To: <nycvar.YFH.7.76.1906241520500.27227@cbobk.fhfr.pm>

Hi,

On Mon, Jun 24, 2019 at 03:21:23PM +0200, Jiri Kosina wrote:
> On Sat, 22 Jun 2019, Pavel Machek wrote:
> 
> > > There is currently no way to verify the resume image when returning
> > > from hibernate.  This might compromise the signed modules trust model,
> > > so until we can work with signed hibernate images we disable it when the
> > > kernel is locked down.
> > 
> > I keep getting these...
> > 
> > IIRC suse has patches to verify the images.
> 
> Yeah, Joey Lee is taking care of those. CCing.
>

The last time that I sent for hibernation encryption and authentication is
here:
    https://lkml.org/lkml/2019/1/3/281 

It needs some big changes after review:
 - Simplify the design: remove keyring dependency and trampoline. 
 - Encrypted whole snapshot image instead of only data pages.
 - Using TPM:
	- Direct use TPM API in hibernation instead of keyring
	- Localities (suggested by James Bottomley)

I am still finding enough time to implement those changes, especial TPM
parts.

Thanks
Joey Lee

^ permalink raw reply

* Re: [PATCH v3 4/5] mm: introduce MADV_PAGEOUT
From: Michal Hocko @ 2019-07-10 19:47 UTC (permalink / raw)
  To: Minchan Kim
  Cc: Andrew Morton, linux-mm, LKML, linux-api, Johannes Weiner,
	Tim Murray, Joel Fernandes, Suren Baghdasaryan, Daniel Colascione,
	Shakeel Butt, Sonny Rao, oleksandr, hdanton, lizeb, Dave Hansen,
	Kirill A . Shutemov
In-Reply-To: <20190710115356.GC186559@google.com>

On Wed 10-07-19 20:53:56, Minchan Kim wrote:
> On Wed, Jul 10, 2019 at 01:16:22PM +0200, Michal Hocko wrote:
> > On Wed 10-07-19 19:48:09, Minchan Kim wrote:
> > > On Tue, Jul 09, 2019 at 11:55:19AM +0200, Michal Hocko wrote:
> > [...]
> > > > I am still not convinced about the SWAP_CLUSTER_MAX batching and the
> > > > udnerlying OOM argument. Is one pmd worth of pages really an OOM risk?
> > > > Sure you can have many invocations in parallel and that would add on
> > > > but the same might happen with SWAP_CLUSTER_MAX. So I would just remove
> > > > the batching for now and think of it only if we really see this being a
> > > > problem for real. Unless you feel really strong about this, of course.
> > > 
> > > I don't have the number to support SWAP_CLUSTER_MAX batching for hinting
> > > operations. However, I wanted to be consistent with other LRU batching
> > > logic so that it could affect altogether if someone try to increase
> > > SWAP_CLUSTER_MAX which is more efficienty for batching operation, later.
> > > (AFAIK, someone tried it a few years ago but rollback soon, I couldn't
> > > rebemeber what was the reason at that time, anyway).
> > 
> > Then please drop this part. It makes the code more complex while any
> > benefit is not demonstrated.
> 
> The history says the benefit.
> https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/patch/?id=d37dd5dcb955dd8c2cdd4eaef1f15d1b7ecbc379

Limiting the number of isolated pages is fine. All I am saying is that
SWAP_CLUSTER_MAX is an arbitrary number same as 512 pages for one PMD as
a unit of work. Both can lead to the same effect if there are too many
parallel tasks doing the same thing.

I do not want you to change that in the reclaim path. All I am asking
for is to add a bathing without any actual data to back that because
that makes the code more complex without any gains.
-- 
Michal Hocko
SUSE Labs

^ permalink raw reply

* Re: [PATCH v3 4/5] mm: introduce MADV_PAGEOUT
From: Minchan Kim @ 2019-07-11  0:25 UTC (permalink / raw)
  To: Michal Hocko
  Cc: Andrew Morton, linux-mm, LKML, linux-api, Johannes Weiner,
	Tim Murray, Joel Fernandes, Suren Baghdasaryan, Daniel Colascione,
	Shakeel Butt, Sonny Rao, oleksandr, hdanton, lizeb, Dave Hansen,
	Kirill A . Shutemov
In-Reply-To: <20190710194719.GS29695@dhcp22.suse.cz>

On Wed, Jul 10, 2019 at 09:47:19PM +0200, Michal Hocko wrote:
> On Wed 10-07-19 20:53:56, Minchan Kim wrote:
> > On Wed, Jul 10, 2019 at 01:16:22PM +0200, Michal Hocko wrote:
> > > On Wed 10-07-19 19:48:09, Minchan Kim wrote:
> > > > On Tue, Jul 09, 2019 at 11:55:19AM +0200, Michal Hocko wrote:
> > > [...]
> > > > > I am still not convinced about the SWAP_CLUSTER_MAX batching and the
> > > > > udnerlying OOM argument. Is one pmd worth of pages really an OOM risk?
> > > > > Sure you can have many invocations in parallel and that would add on
> > > > > but the same might happen with SWAP_CLUSTER_MAX. So I would just remove
> > > > > the batching for now and think of it only if we really see this being a
> > > > > problem for real. Unless you feel really strong about this, of course.
> > > > 
> > > > I don't have the number to support SWAP_CLUSTER_MAX batching for hinting
> > > > operations. However, I wanted to be consistent with other LRU batching
> > > > logic so that it could affect altogether if someone try to increase
> > > > SWAP_CLUSTER_MAX which is more efficienty for batching operation, later.
> > > > (AFAIK, someone tried it a few years ago but rollback soon, I couldn't
> > > > rebemeber what was the reason at that time, anyway).
> > > 
> > > Then please drop this part. It makes the code more complex while any
> > > benefit is not demonstrated.
> > 
> > The history says the benefit.
> > https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/patch/?id=d37dd5dcb955dd8c2cdd4eaef1f15d1b7ecbc379
> 
> Limiting the number of isolated pages is fine. All I am saying is that
> SWAP_CLUSTER_MAX is an arbitrary number same as 512 pages for one PMD as
> a unit of work. Both can lead to the same effect if there are too many
> parallel tasks doing the same thing.
> 
> I do not want you to change that in the reclaim path. All I am asking
> for is to add a bathing without any actual data to back that because
> that makes the code more complex without any gains.

I understand what you meant and I'm really one to make code simple.
However, my concern was that we have isolated by SWAP_CLUSTER_MAX(32 pages)
for other path(reclaim/compaction) so I want to be consistent with others.
If you think that the consistency(IOW, others are 32 limit but here 256
limit) is no helpful this case, I don't have any strong opinion.
Let's drop the part. I will add it into description, then.

Thanks.

> -- 
> Michal Hocko
> SUSE Labs

^ permalink raw reply

* [PATCH v4 0/4] Introduce MADV_COLD and MADV_PAGEOUT
From: Minchan Kim @ 2019-07-11  1:25 UTC (permalink / raw)
  To: Andrew Morton
  Cc: linux-mm, LKML, linux-api, Michal Hocko, Johannes Weiner,
	Tim Murray, Joel Fernandes, Suren Baghdasaryan, Daniel Colascione,
	Shakeel Butt, Sonny Rao, oleksandr, hdanton, lizeb, Dave Hansen,
	Kirill A . Shutemov, Minchan Kim

This patch is part of previous series:
https://lore.kernel.org/lkml/20190531064313.193437-1-minchan@kernel.org/
Originally, it was created for external madvise hinting feature.

https://lkml.org/lkml/2019/5/31/463
Michal wanted to separte the discussion from external hinting interface
so this patchset includes only first part of my entire patchset

  - introduce MADV_COLD and MADV_PAGEOUT hint to madvise.

However, I keep entire description for others for easier understanding
why this kinds of hint was born.

Thanks.

This patchset is against on next-20190710.

Below is description of previous entire patchset.

================= &< =====================

- Background

The Android terminology used for forking a new process and starting an app
from scratch is a cold start, while resuming an existing app is a hot start.
While we continually try to improve the performance of cold starts, hot
starts will always be significantly less power hungry as well as faster so
we are trying to make hot start more likely than cold start.

To increase hot start, Android userspace manages the order that apps should
be killed in a process called ActivityManagerService. ActivityManagerService
tracks every Android app or service that the user could be interacting with
at any time and translates that into a ranked list for lmkd(low memory
killer daemon). They are likely to be killed by lmkd if the system has to
reclaim memory. In that sense they are similar to entries in any other cache.
Those apps are kept alive for opportunistic performance improvements but
those performance improvements will vary based on the memory requirements of
individual workloads.

- Problem

Naturally, cached apps were dominant consumers of memory on the system.
However, they were not significant consumers of swap even though they are
good candidate for swap. Under investigation, swapping out only begins
once the low zone watermark is hit and kswapd wakes up, but the overall
allocation rate in the system might trip lmkd thresholds and cause a cached
process to be killed(we measured performance swapping out vs. zapping the
memory by killing a process. Unsurprisingly, zapping is 10x times faster
even though we use zram which is much faster than real storage) so kill
from lmkd will often satisfy the high zone watermark, resulting in very
few pages actually being moved to swap.

- Approach

The approach we chose was to use a new interface to allow userspace to
proactively reclaim entire processes by leveraging platform information.
This allowed us to bypass the inaccuracy of the kernel’s LRUs for pages
that are known to be cold from userspace and to avoid races with lmkd
by reclaiming apps as soon as they entered the cached state. Additionally,
it could provide many chances for platform to use much information to
optimize memory efficiency.

To achieve the goal, the patchset introduce two new options for madvise.
One is MADV_COLD which will deactivate activated pages and the other is
MADV_PAGEOUT which will reclaim private pages instantly. These new options
complement MADV_DONTNEED and MADV_FREE by adding non-destructive ways to
gain some free memory space. MADV_PAGEOUT is similar to MADV_DONTNEED in a way
that it hints the kernel that memory region is not currently needed and
should be reclaimed immediately; MADV_COLD is similar to MADV_FREE in a way
that it hints the kernel that memory region is not currently needed and
should be reclaimed when memory pressure rises.

* v3 - http://lore.kernel.org/lkml/20190627115405.255259-1-minchan@kernel.org
* v2 - http://lore.kernel.org/lkml/20190610111252.239156-1-minchan@kernel.org
* v1 - http://lore.kernel.org/lkml/20190603053655.127730-1-minchan@kernel.org

Minchan Kim (4):
  mm: introduce MADV_COLD
  mm: change PAGEREF_RECLAIM_CLEAN with PAGE_REFRECLAIM
  mm: account nr_isolated_xxx in [isolate|putback]_lru_page
  mm: introduce MADV_PAGEOUT

 include/linux/swap.h                   |   2 +
 include/uapi/asm-generic/mman-common.h |   2 +
 mm/compaction.c                        |   2 -
 mm/gup.c                               |   7 +-
 mm/internal.h                          |   2 +-
 mm/khugepaged.c                        |   3 -
 mm/madvise.c                           | 377 ++++++++++++++++++++++++-
 mm/memory-failure.c                    |   3 -
 mm/memory_hotplug.c                    |   4 -
 mm/mempolicy.c                         |   6 +-
 mm/migrate.c                           |  37 +--
 mm/oom_kill.c                          |   2 +-
 mm/swap.c                              |  42 +++
 mm/vmscan.c                            |  83 +++++-
 14 files changed, 507 insertions(+), 65 deletions(-)

-- 
2.22.0.410.gd8fdbe21b5-goog

^ permalink raw reply

* [PATCH v4 1/4] mm: introduce MADV_COLD
From: Minchan Kim @ 2019-07-11  1:25 UTC (permalink / raw)
  To: Andrew Morton
  Cc: linux-mm, LKML, linux-api, Michal Hocko, Johannes Weiner,
	Tim Murray, Joel Fernandes, Suren Baghdasaryan, Daniel Colascione,
	Shakeel Butt, Sonny Rao, oleksandr, hdanton, lizeb, Dave Hansen,
	Kirill A . Shutemov, Minchan Kim
In-Reply-To: <20190711012528.176050-1-minchan@kernel.org>

When a process expects no accesses to a certain memory range, it could
give a hint to kernel that the pages can be reclaimed when memory pressure
happens but data should be preserved for future use.  This could reduce
workingset eviction so it ends up increasing performance.

This patch introduces the new MADV_COLD hint to madvise(2) syscall.
MADV_COLD can be used by a process to mark a memory range as not expected
to be used in the near future. The hint can help kernel in deciding which
pages to evict early during memory pressure.

It works for every LRU pages like MADV_[DONTNEED|FREE]. IOW, It moves

	active file page -> inactive file LRU
	active anon page -> inacdtive anon LRU

Unlike MADV_FREE, it doesn't move active anonymous pages to inactive
file LRU's head because MADV_COLD is a little bit different symantic.
MADV_FREE means it's okay to discard when the memory pressure because
the content of the page is *garbage* so freeing such pages is almost zero
overhead since we don't need to swap out and access afterward causes just
minor fault. Thus, it would make sense to put those freeable pages in
inactive file LRU to compete other used-once pages. It makes sense for
implmentaion point of view, too because it's not swapbacked memory any
longer until it would be re-dirtied. Even, it could give a bonus to make
them be reclaimed on swapless system. However, MADV_COLD doesn't mean
garbage so reclaiming them requires swap-out/in in the end so it's bigger
cost. Since we have designed VM LRU aging based on cost-model, anonymous
cold pages would be better to position inactive anon's LRU list, not file
LRU. Furthermore, it would help to avoid unnecessary scanning if system
doesn't have a swap device. Let's start simpler way without adding
complexity at this moment. However, keep in mind, too that it's a caveat
that workloads with a lot of pages cache are likely to ignore MADV_COLD
on anonymous memory because we rarely age anonymous LRU lists.

* man-page material

MADV_COLD (since Linux x.x)

Pages in the specified regions will be treated as less-recently-accessed
compared to pages in the system with similar access frequencies.
In contrast to MADV_FREE, the contents of the region are preserved
regardless of subsequent writes to pages.

MADV_COLD cannot be applied to locked pages, Huge TLB pages, or VM_PFNMAP
pages.

* v2
 * add up the warn with lots of page cache workload - mhocko
 * add man page stuff - dave

* v1
 * remove page_mapcount filter - hannes, mhocko
 * remove idle page handling - joelaf

* RFCv2
 * add more description - mhocko

* RFCv1
 * renaming from MADV_COOL to MADV_COLD - hannes

* internal review
 * use clear_page_youn in deactivate_page - joelaf
 * Revise the description - surenb
 * Renaming from MADV_WARM to MADV_COOL - surenb

Acked-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Minchan Kim <minchan@kernel.org>
---
 include/linux/swap.h                   |   1 +
 include/uapi/asm-generic/mman-common.h |   1 +
 mm/internal.h                          |   2 +-
 mm/madvise.c                           | 180 ++++++++++++++++++++++++-
 mm/oom_kill.c                          |   2 +-
 mm/swap.c                              |  42 ++++++
 6 files changed, 224 insertions(+), 4 deletions(-)

diff --git a/include/linux/swap.h b/include/linux/swap.h
index de2c67a33b7e..0ce997edb8bb 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -340,6 +340,7 @@ extern void lru_add_drain_cpu(int cpu);
 extern void lru_add_drain_all(void);
 extern void rotate_reclaimable_page(struct page *page);
 extern void deactivate_file_page(struct page *page);
+extern void deactivate_page(struct page *page);
 extern void mark_page_lazyfree(struct page *page);
 extern void swap_setup(void);
 
diff --git a/include/uapi/asm-generic/mman-common.h b/include/uapi/asm-generic/mman-common.h
index 63b1f506ea67..ef8a56927b12 100644
--- a/include/uapi/asm-generic/mman-common.h
+++ b/include/uapi/asm-generic/mman-common.h
@@ -45,6 +45,7 @@
 #define MADV_SEQUENTIAL	2		/* expect sequential page references */
 #define MADV_WILLNEED	3		/* will need these pages */
 #define MADV_DONTNEED	4		/* don't need these pages */
+#define MADV_COLD	5		/* deactivatie these pages */
 
 /* common parameters: try to keep these consistent across architectures */
 #define MADV_FREE	8		/* free pages only if memory pressure */
diff --git a/mm/internal.h b/mm/internal.h
index f53a14d67538..c61b215ff265 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -39,7 +39,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf);
 void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma,
 		unsigned long floor, unsigned long ceiling);
 
-static inline bool can_madv_dontneed_vma(struct vm_area_struct *vma)
+static inline bool can_madv_lru_vma(struct vm_area_struct *vma)
 {
 	return !(vma->vm_flags & (VM_LOCKED|VM_HUGETLB|VM_PFNMAP));
 }
diff --git a/mm/madvise.c b/mm/madvise.c
index 968df3aa069f..bae0055f9724 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -40,6 +40,7 @@ static int madvise_need_mmap_write(int behavior)
 	case MADV_REMOVE:
 	case MADV_WILLNEED:
 	case MADV_DONTNEED:
+	case MADV_COLD:
 	case MADV_FREE:
 		return 0;
 	default:
@@ -307,6 +308,178 @@ static long madvise_willneed(struct vm_area_struct *vma,
 	return 0;
 }
 
+static int madvise_cold_pte_range(pmd_t *pmd, unsigned long addr,
+				unsigned long end, struct mm_walk *walk)
+{
+	struct mmu_gather *tlb = walk->private;
+	struct mm_struct *mm = tlb->mm;
+	struct vm_area_struct *vma = walk->vma;
+	pte_t *orig_pte, *pte, ptent;
+	spinlock_t *ptl;
+	struct page *page;
+	unsigned long next;
+
+	next = pmd_addr_end(addr, end);
+	if (pmd_trans_huge(*pmd)) {
+		pmd_t orig_pmd;
+
+		tlb_change_page_size(tlb, HPAGE_PMD_SIZE);
+		ptl = pmd_trans_huge_lock(pmd, vma);
+		if (!ptl)
+			return 0;
+
+		orig_pmd = *pmd;
+		if (is_huge_zero_pmd(orig_pmd))
+			goto huge_unlock;
+
+		if (unlikely(!pmd_present(orig_pmd))) {
+			VM_BUG_ON(thp_migration_supported() &&
+					!is_pmd_migration_entry(orig_pmd));
+			goto huge_unlock;
+		}
+
+		page = pmd_page(orig_pmd);
+		if (next - addr != HPAGE_PMD_SIZE) {
+			int err;
+
+			if (page_mapcount(page) != 1)
+				goto huge_unlock;
+
+			get_page(page);
+			spin_unlock(ptl);
+			lock_page(page);
+			err = split_huge_page(page);
+			unlock_page(page);
+			put_page(page);
+			if (!err)
+				goto regular_page;
+			return 0;
+		}
+
+		if (pmd_young(orig_pmd)) {
+			pmdp_invalidate(vma, addr, pmd);
+			orig_pmd = pmd_mkold(orig_pmd);
+
+			set_pmd_at(mm, addr, pmd, orig_pmd);
+			tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
+		}
+
+		test_and_clear_page_young(page);
+		deactivate_page(page);
+huge_unlock:
+		spin_unlock(ptl);
+		return 0;
+	}
+
+	if (pmd_trans_unstable(pmd))
+		return 0;
+
+regular_page:
+	tlb_change_page_size(tlb, PAGE_SIZE);
+	orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
+	flush_tlb_batched_pending(mm);
+	arch_enter_lazy_mmu_mode();
+	for (; addr < end; pte++, addr += PAGE_SIZE) {
+		ptent = *pte;
+
+		if (pte_none(ptent))
+			continue;
+
+		if (!pte_present(ptent))
+			continue;
+
+		page = vm_normal_page(vma, addr, ptent);
+		if (!page)
+			continue;
+
+		/*
+		 * Creating a THP page is expensive so split it only if we
+		 * are sure it's worth. Split it if we are only owner.
+		 */
+		if (PageTransCompound(page)) {
+			if (page_mapcount(page) != 1)
+				break;
+			get_page(page);
+			if (!trylock_page(page)) {
+				put_page(page);
+				break;
+			}
+			pte_unmap_unlock(orig_pte, ptl);
+			if (split_huge_page(page)) {
+				unlock_page(page);
+				put_page(page);
+				pte_offset_map_lock(mm, pmd, addr, &ptl);
+				break;
+			}
+			unlock_page(page);
+			put_page(page);
+			pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
+			pte--;
+			addr -= PAGE_SIZE;
+			continue;
+		}
+
+		VM_BUG_ON_PAGE(PageTransCompound(page), page);
+
+		if (pte_young(ptent)) {
+			ptent = ptep_get_and_clear_full(mm, addr, pte,
+							tlb->fullmm);
+			ptent = pte_mkold(ptent);
+			set_pte_at(mm, addr, pte, ptent);
+			tlb_remove_tlb_entry(tlb, pte, addr);
+		}
+
+		/*
+		 * We are deactivating a page for accelerating reclaiming.
+		 * VM couldn't reclaim the page unless we clear PG_young.
+		 * As a side effect, it makes confuse idle-page tracking
+		 * because they will miss recent referenced history.
+		 */
+		test_and_clear_page_young(page);
+		deactivate_page(page);
+	}
+
+	arch_enter_lazy_mmu_mode();
+	pte_unmap_unlock(orig_pte, ptl);
+	cond_resched();
+
+	return 0;
+}
+
+static void madvise_cold_page_range(struct mmu_gather *tlb,
+			     struct vm_area_struct *vma,
+			     unsigned long addr, unsigned long end)
+{
+	struct mm_walk cold_walk = {
+		.pmd_entry = madvise_cold_pte_range,
+		.mm = vma->vm_mm,
+		.private = tlb,
+	};
+
+	tlb_start_vma(tlb, vma);
+	walk_page_range(addr, end, &cold_walk);
+	tlb_end_vma(tlb, vma);
+}
+
+static long madvise_cold(struct vm_area_struct *vma,
+			struct vm_area_struct **prev,
+			unsigned long start_addr, unsigned long end_addr)
+{
+	struct mm_struct *mm = vma->vm_mm;
+	struct mmu_gather tlb;
+
+	*prev = vma;
+	if (!can_madv_lru_vma(vma))
+		return -EINVAL;
+
+	lru_add_drain();
+	tlb_gather_mmu(&tlb, mm, start_addr, end_addr);
+	madvise_cold_page_range(&tlb, vma, start_addr, end_addr);
+	tlb_finish_mmu(&tlb, start_addr, end_addr);
+
+	return 0;
+}
+
 static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
 				unsigned long end, struct mm_walk *walk)
 
@@ -519,7 +692,7 @@ static long madvise_dontneed_free(struct vm_area_struct *vma,
 				  int behavior)
 {
 	*prev = vma;
-	if (!can_madv_dontneed_vma(vma))
+	if (!can_madv_lru_vma(vma))
 		return -EINVAL;
 
 	if (!userfaultfd_remove(vma, start, end)) {
@@ -541,7 +714,7 @@ static long madvise_dontneed_free(struct vm_area_struct *vma,
 			 */
 			return -ENOMEM;
 		}
-		if (!can_madv_dontneed_vma(vma))
+		if (!can_madv_lru_vma(vma))
 			return -EINVAL;
 		if (end > vma->vm_end) {
 			/*
@@ -695,6 +868,8 @@ madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
 		return madvise_remove(vma, prev, start, end);
 	case MADV_WILLNEED:
 		return madvise_willneed(vma, prev, start, end);
+	case MADV_COLD:
+		return madvise_cold(vma, prev, start, end);
 	case MADV_FREE:
 	case MADV_DONTNEED:
 		return madvise_dontneed_free(vma, prev, start, end, behavior);
@@ -716,6 +891,7 @@ madvise_behavior_valid(int behavior)
 	case MADV_WILLNEED:
 	case MADV_DONTNEED:
 	case MADV_FREE:
+	case MADV_COLD:
 #ifdef CONFIG_KSM
 	case MADV_MERGEABLE:
 	case MADV_UNMERGEABLE:
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 95872bdfec4e..c8f0ec6b0e80 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -523,7 +523,7 @@ bool __oom_reap_task_mm(struct mm_struct *mm)
 	set_bit(MMF_UNSTABLE, &mm->flags);
 
 	for (vma = mm->mmap ; vma; vma = vma->vm_next) {
-		if (!can_madv_dontneed_vma(vma))
+		if (!can_madv_lru_vma(vma))
 			continue;
 
 		/*
diff --git a/mm/swap.c b/mm/swap.c
index 55899c1f54af..b501ad6eb091 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -47,6 +47,7 @@ int page_cluster;
 static DEFINE_PER_CPU(struct pagevec, lru_add_pvec);
 static DEFINE_PER_CPU(struct pagevec, lru_rotate_pvecs);
 static DEFINE_PER_CPU(struct pagevec, lru_deactivate_file_pvecs);
+static DEFINE_PER_CPU(struct pagevec, lru_deactivate_pvecs);
 static DEFINE_PER_CPU(struct pagevec, lru_lazyfree_pvecs);
 #ifdef CONFIG_SMP
 static DEFINE_PER_CPU(struct pagevec, activate_page_pvecs);
@@ -538,6 +539,22 @@ static void lru_deactivate_file_fn(struct page *page, struct lruvec *lruvec,
 	update_page_reclaim_stat(lruvec, file, 0);
 }
 
+static void lru_deactivate_fn(struct page *page, struct lruvec *lruvec,
+			    void *arg)
+{
+	if (PageLRU(page) && PageActive(page) && !PageUnevictable(page)) {
+		int file = page_is_file_cache(page);
+		int lru = page_lru_base_type(page);
+
+		del_page_from_lru_list(page, lruvec, lru + LRU_ACTIVE);
+		ClearPageActive(page);
+		ClearPageReferenced(page);
+		add_page_to_lru_list(page, lruvec, lru);
+
+		__count_vm_events(PGDEACTIVATE, hpage_nr_pages(page));
+		update_page_reclaim_stat(lruvec, file, 0);
+	}
+}
 
 static void lru_lazyfree_fn(struct page *page, struct lruvec *lruvec,
 			    void *arg)
@@ -590,6 +607,10 @@ void lru_add_drain_cpu(int cpu)
 	if (pagevec_count(pvec))
 		pagevec_lru_move_fn(pvec, lru_deactivate_file_fn, NULL);
 
+	pvec = &per_cpu(lru_deactivate_pvecs, cpu);
+	if (pagevec_count(pvec))
+		pagevec_lru_move_fn(pvec, lru_deactivate_fn, NULL);
+
 	pvec = &per_cpu(lru_lazyfree_pvecs, cpu);
 	if (pagevec_count(pvec))
 		pagevec_lru_move_fn(pvec, lru_lazyfree_fn, NULL);
@@ -623,6 +644,26 @@ void deactivate_file_page(struct page *page)
 	}
 }
 
+/*
+ * deactivate_page - deactivate a page
+ * @page: page to deactivate
+ *
+ * deactivate_page() moves @page to the inactive list if @page was on the active
+ * list and was not an unevictable page.  This is done to accelerate the reclaim
+ * of @page.
+ */
+void deactivate_page(struct page *page)
+{
+	if (PageLRU(page) && PageActive(page) && !PageUnevictable(page)) {
+		struct pagevec *pvec = &get_cpu_var(lru_deactivate_pvecs);
+
+		get_page(page);
+		if (!pagevec_add(pvec, page) || PageCompound(page))
+			pagevec_lru_move_fn(pvec, lru_deactivate_fn, NULL);
+		put_cpu_var(lru_deactivate_pvecs);
+	}
+}
+
 /**
  * mark_page_lazyfree - make an anon page lazyfree
  * @page: page to deactivate
@@ -687,6 +728,7 @@ void lru_add_drain_all(void)
 		if (pagevec_count(&per_cpu(lru_add_pvec, cpu)) ||
 		    pagevec_count(&per_cpu(lru_rotate_pvecs, cpu)) ||
 		    pagevec_count(&per_cpu(lru_deactivate_file_pvecs, cpu)) ||
+		    pagevec_count(&per_cpu(lru_deactivate_pvecs, cpu)) ||
 		    pagevec_count(&per_cpu(lru_lazyfree_pvecs, cpu)) ||
 		    need_activate_page_drain(cpu)) {
 			INIT_WORK(work, lru_add_drain_per_cpu);
-- 
2.22.0.410.gd8fdbe21b5-goog

^ permalink raw reply related

* [PATCH v4 2/4] mm: change PAGEREF_RECLAIM_CLEAN with PAGE_REFRECLAIM
From: Minchan Kim @ 2019-07-11  1:25 UTC (permalink / raw)
  To: Andrew Morton
  Cc: linux-mm, LKML, linux-api, Michal Hocko, Johannes Weiner,
	Tim Murray, Joel Fernandes, Suren Baghdasaryan, Daniel Colascione,
	Shakeel Butt, Sonny Rao, oleksandr, hdanton, lizeb, Dave Hansen,
	Kirill A . Shutemov, Minchan Kim
In-Reply-To: <20190711012528.176050-1-minchan@kernel.org>

The local variable references in shrink_page_list is PAGEREF_RECLAIM_CLEAN
as default. It is for preventing to reclaim dirty pages when CMA try to
migrate pages. Strictly speaking, we don't need it because CMA didn't allow
to write out by .may_writepage = 0 in reclaim_clean_pages_from_list.

Moreover, it has a problem to prevent anonymous pages's swap out even
though force_reclaim = true in shrink_page_list on upcoming patch.
So this patch makes references's default value to PAGEREF_RECLAIM and
rename force_reclaim with ignore_references to make it more clear.

This is a preparatory work for next patch.

* RFCv1
 * use ignore_referecnes as parameter name - hannes

Acked-by: Michal Hocko <mhocko@suse.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Minchan Kim <minchan@kernel.org>
---
 mm/vmscan.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/mm/vmscan.c b/mm/vmscan.c
index a0301edd8d03..b4fa04d10ba6 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1119,7 +1119,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 				      struct scan_control *sc,
 				      enum ttu_flags ttu_flags,
 				      struct reclaim_stat *stat,
-				      bool force_reclaim)
+				      bool ignore_references)
 {
 	LIST_HEAD(ret_pages);
 	LIST_HEAD(free_pages);
@@ -1133,7 +1133,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 		struct address_space *mapping;
 		struct page *page;
 		int may_enter_fs;
-		enum page_references references = PAGEREF_RECLAIM_CLEAN;
+		enum page_references references = PAGEREF_RECLAIM;
 		bool dirty, writeback;
 		unsigned int nr_pages;
 
@@ -1264,7 +1264,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 			}
 		}
 
-		if (!force_reclaim)
+		if (!ignore_references)
 			references = page_check_references(page, sc);
 
 		switch (references) {
-- 
2.22.0.410.gd8fdbe21b5-goog

^ permalink raw reply related

* [PATCH v4 3/4] mm: account nr_isolated_xxx in [isolate|putback]_lru_page
From: Minchan Kim @ 2019-07-11  1:25 UTC (permalink / raw)
  To: Andrew Morton
  Cc: linux-mm, LKML, linux-api, Michal Hocko, Johannes Weiner,
	Tim Murray, Joel Fernandes, Suren Baghdasaryan, Daniel Colascione,
	Shakeel Butt, Sonny Rao, oleksandr, hdanton, lizeb, Dave Hansen,
	Kirill A . Shutemov, Minchan Kim
In-Reply-To: <20190711012528.176050-1-minchan@kernel.org>

The isolate counting is pecpu counter so it would be not huge gain
to work them by batch. Rather than complicating to make them batch,
let's make it more stright-foward via adding the counting logic
into [isolate|putback]_lru_page API.

* v1
 * fix accounting bug - Hillf

Link: http://lkml.kernel.org/r/20190531165927.GA20067@cmpxchg.org
Acked-by: Michal Hocko <mhocko@suse.com>
Suggested-by: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Minchan Kim <minchan@kernel.org>
---
 mm/compaction.c     |  2 --
 mm/gup.c            |  7 +------
 mm/khugepaged.c     |  3 ---
 mm/memory-failure.c |  3 ---
 mm/memory_hotplug.c |  4 ----
 mm/mempolicy.c      |  6 +-----
 mm/migrate.c        | 37 ++++++++-----------------------------
 mm/vmscan.c         | 22 ++++++++++++++++------
 8 files changed, 26 insertions(+), 58 deletions(-)

diff --git a/mm/compaction.c b/mm/compaction.c
index 9e1b9acb116b..c6591682deda 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -982,8 +982,6 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
 
 		/* Successfully isolated */
 		del_page_from_lru_list(page, lruvec, page_lru(page));
-		inc_node_page_state(page,
-				NR_ISOLATED_ANON + page_is_file_cache(page));
 
 isolate_success:
 		list_add(&page->lru, &cc->migratepages);
diff --git a/mm/gup.c b/mm/gup.c
index 98f13ab37bac..11d0634ce613 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -1475,13 +1475,8 @@ static long check_and_migrate_cma_pages(struct task_struct *tsk,
 					drain_allow = false;
 				}
 
-				if (!isolate_lru_page(head)) {
+				if (!isolate_lru_page(head))
 					list_add_tail(&head->lru, &cma_page_list);
-					mod_node_page_state(page_pgdat(head),
-							    NR_ISOLATED_ANON +
-							    page_is_file_cache(head),
-							    hpage_nr_pages(head));
-				}
 			}
 		}
 
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index eaaa21b23215..a8b517d6df4a 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -503,7 +503,6 @@ void __khugepaged_exit(struct mm_struct *mm)
 
 static void release_pte_page(struct page *page)
 {
-	dec_node_page_state(page, NR_ISOLATED_ANON + page_is_file_cache(page));
 	unlock_page(page);
 	putback_lru_page(page);
 }
@@ -602,8 +601,6 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
 			result = SCAN_DEL_PAGE_LRU;
 			goto out;
 		}
-		inc_node_page_state(page,
-				NR_ISOLATED_ANON + page_is_file_cache(page));
 		VM_BUG_ON_PAGE(!PageLocked(page), page);
 		VM_BUG_ON_PAGE(PageLRU(page), page);
 
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 7ef849da8278..9900bb95d774 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -1791,9 +1791,6 @@ static int __soft_offline_page(struct page *page, int flags)
 		 * so use !__PageMovable instead for LRU page's mapping
 		 * cannot have PAGE_MAPPING_MOVABLE.
 		 */
-		if (!__PageMovable(page))
-			inc_node_page_state(page, NR_ISOLATED_ANON +
-						page_is_file_cache(page));
 		list_add(&page->lru, &pagelist);
 		ret = migrate_pages(&pagelist, new_page, NULL, MPOL_MF_MOVE_ALL,
 					MIGRATE_SYNC, MR_MEMORY_FAILURE);
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index b9ba5b85f9f7..15bad2043b41 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -1383,10 +1383,6 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
 			ret = isolate_movable_page(page, ISOLATE_UNEVICTABLE);
 		if (!ret) { /* Success */
 			list_add_tail(&page->lru, &source);
-			if (!__PageMovable(page))
-				inc_node_page_state(page, NR_ISOLATED_ANON +
-						    page_is_file_cache(page));
-
 		} else {
 			pr_warn("failed to isolate pfn %lx\n", pfn);
 			dump_page(page, "isolation failed");
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 4acc2d14bc77..a5685eee6d1d 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -994,12 +994,8 @@ static int migrate_page_add(struct page *page, struct list_head *pagelist,
 	 * Avoid migrating a page that is shared with others.
 	 */
 	if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(head) == 1) {
-		if (!isolate_lru_page(head)) {
+		if (!isolate_lru_page(head))
 			list_add_tail(&head->lru, pagelist);
-			mod_node_page_state(page_pgdat(head),
-				NR_ISOLATED_ANON + page_is_file_cache(head),
-				hpage_nr_pages(head));
-		}
 	}
 
 	return 0;
diff --git a/mm/migrate.c b/mm/migrate.c
index 8992741f10aa..f54f449a99f5 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -190,8 +190,6 @@ void putback_movable_pages(struct list_head *l)
 			unlock_page(page);
 			put_page(page);
 		} else {
-			mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON +
-					page_is_file_cache(page), -hpage_nr_pages(page));
 			putback_lru_page(page);
 		}
 	}
@@ -1175,10 +1173,17 @@ static ICE_noinline int unmap_and_move(new_page_t get_new_page,
 		return -ENOMEM;
 
 	if (page_count(page) == 1) {
+		bool is_lru = !__PageMovable(page);
+
 		/* page was freed from under us. So we are done. */
 		ClearPageActive(page);
 		ClearPageUnevictable(page);
-		if (unlikely(__PageMovable(page))) {
+		if (likely(is_lru))
+			mod_node_page_state(page_pgdat(page),
+						NR_ISOLATED_ANON +
+						page_is_file_cache(page),
+						-hpage_nr_pages(page));
+		else {
 			lock_page(page);
 			if (!PageMovable(page))
 				__ClearPageIsolated(page);
@@ -1204,15 +1209,6 @@ static ICE_noinline int unmap_and_move(new_page_t get_new_page,
 		 * restored.
 		 */
 		list_del(&page->lru);
-
-		/*
-		 * Compaction can migrate also non-LRU pages which are
-		 * not accounted to NR_ISOLATED_*. They can be recognized
-		 * as __PageMovable
-		 */
-		if (likely(!__PageMovable(page)))
-			mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON +
-					page_is_file_cache(page), -hpage_nr_pages(page));
 	}
 
 	/*
@@ -1566,9 +1562,6 @@ static int add_page_for_migration(struct mm_struct *mm, unsigned long addr,
 
 		err = 0;
 		list_add_tail(&head->lru, pagelist);
-		mod_node_page_state(page_pgdat(head),
-			NR_ISOLATED_ANON + page_is_file_cache(head),
-			hpage_nr_pages(head));
 	}
 out_putpage:
 	/*
@@ -1884,8 +1877,6 @@ static struct page *alloc_misplaced_dst_page(struct page *page,
 
 static int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page)
 {
-	int page_lru;
-
 	VM_BUG_ON_PAGE(compound_order(page) && !PageTransHuge(page), page);
 
 	/* Avoid migrating to a node that is nearly full */
@@ -1907,10 +1898,6 @@ static int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page)
 		return 0;
 	}
 
-	page_lru = page_is_file_cache(page);
-	mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON + page_lru,
-				hpage_nr_pages(page));
-
 	/*
 	 * Isolating the page has taken another reference, so the
 	 * caller's reference can be safely dropped without the page
@@ -1965,8 +1952,6 @@ int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma,
 	if (nr_remaining) {
 		if (!list_empty(&migratepages)) {
 			list_del(&page->lru);
-			dec_node_page_state(page, NR_ISOLATED_ANON +
-					page_is_file_cache(page));
 			putback_lru_page(page);
 		}
 		isolated = 0;
@@ -1996,7 +1981,6 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
 	pg_data_t *pgdat = NODE_DATA(node);
 	int isolated = 0;
 	struct page *new_page = NULL;
-	int page_lru = page_is_file_cache(page);
 	unsigned long start = address & HPAGE_PMD_MASK;
 
 	new_page = alloc_pages_node(node,
@@ -2042,8 +2026,6 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
 		/* Retake the callers reference and putback on LRU */
 		get_page(page);
 		putback_lru_page(page);
-		mod_node_page_state(page_pgdat(page),
-			 NR_ISOLATED_ANON + page_lru, -HPAGE_PMD_NR);
 
 		goto out_unlock;
 	}
@@ -2093,9 +2075,6 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
 	count_vm_events(PGMIGRATE_SUCCESS, HPAGE_PMD_NR);
 	count_vm_numa_events(NUMA_PAGE_MIGRATE, HPAGE_PMD_NR);
 
-	mod_node_page_state(page_pgdat(page),
-			NR_ISOLATED_ANON + page_lru,
-			-HPAGE_PMD_NR);
 	return isolated;
 
 out_fail:
diff --git a/mm/vmscan.c b/mm/vmscan.c
index b4fa04d10ba6..ca192b792d4f 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1016,6 +1016,9 @@ int remove_mapping(struct address_space *mapping, struct page *page)
 void putback_lru_page(struct page *page)
 {
 	lru_cache_add(page);
+	mod_node_page_state(page_pgdat(page),
+				NR_ISOLATED_ANON + page_is_file_cache(page),
+				-hpage_nr_pages(page));
 	put_page(page);		/* drop ref from isolate */
 }
 
@@ -1481,6 +1484,9 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 		 */
 		nr_reclaimed += nr_pages;
 
+		mod_node_page_state(pgdat, NR_ISOLATED_ANON +
+						page_is_file_cache(page),
+						-nr_pages);
 		/*
 		 * Is there need to periodically free_page_list? It would
 		 * appear not as the counts should be low
@@ -1556,7 +1562,6 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone,
 	ret = shrink_page_list(&clean_pages, zone->zone_pgdat, &sc,
 			TTU_IGNORE_ACCESS, &dummy_stat, true);
 	list_splice(&clean_pages, page_list);
-	mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_FILE, -ret);
 	return ret;
 }
 
@@ -1632,6 +1637,9 @@ int __isolate_lru_page(struct page *page, isolate_mode_t mode)
 		 */
 		ClearPageLRU(page);
 		ret = 0;
+		__mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON +
+						page_is_file_cache(page),
+						hpage_nr_pages(page));
 	}
 
 	return ret;
@@ -1763,6 +1771,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
 	trace_mm_vmscan_lru_isolate(sc->reclaim_idx, sc->order, nr_to_scan,
 				    total_scan, skipped, nr_taken, mode, lru);
 	update_lru_sizes(lruvec, lru, nr_zone_taken);
+
 	return nr_taken;
 }
 
@@ -1811,6 +1820,9 @@ int isolate_lru_page(struct page *page)
 			ClearPageLRU(page);
 			del_page_from_lru_list(page, lruvec, lru);
 			ret = 0;
+			mod_node_page_state(pgdat, NR_ISOLATED_ANON +
+						page_is_file_cache(page),
+						hpage_nr_pages(page));
 		}
 		spin_unlock_irq(&pgdat->lru_lock);
 	}
@@ -1902,6 +1914,9 @@ static unsigned noinline_for_stack move_pages_to_lru(struct lruvec *lruvec,
 		update_lru_size(lruvec, lru, page_zonenum(page), nr_pages);
 		list_move(&page->lru, &lruvec->lists[lru]);
 
+		__mod_node_page_state(pgdat, NR_ISOLATED_ANON +
+						page_is_file_cache(page),
+						-hpage_nr_pages(page));
 		if (put_page_testzero(page)) {
 			__ClearPageLRU(page);
 			__ClearPageActive(page);
@@ -1979,7 +1994,6 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
 	nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &page_list,
 				     &nr_scanned, sc, lru);
 
-	__mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken);
 	reclaim_stat->recent_scanned[file] += nr_taken;
 
 	item = current_is_kswapd() ? PGSCAN_KSWAPD : PGSCAN_DIRECT;
@@ -2005,8 +2019,6 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
 
 	move_pages_to_lru(lruvec, &page_list);
 
-	__mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken);
-
 	spin_unlock_irq(&pgdat->lru_lock);
 
 	mem_cgroup_uncharge_list(&page_list);
@@ -2065,7 +2077,6 @@ static void shrink_active_list(unsigned long nr_to_scan,
 	nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &l_hold,
 				     &nr_scanned, sc, lru);
 
-	__mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken);
 	reclaim_stat->recent_scanned[file] += nr_taken;
 
 	__count_vm_events(PGREFILL, nr_scanned);
@@ -2134,7 +2145,6 @@ static void shrink_active_list(unsigned long nr_to_scan,
 	__count_vm_events(PGDEACTIVATE, nr_deactivate);
 	__count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE, nr_deactivate);
 
-	__mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken);
 	spin_unlock_irq(&pgdat->lru_lock);
 
 	mem_cgroup_uncharge_list(&l_active);
-- 
2.22.0.410.gd8fdbe21b5-goog

^ permalink raw reply related

* [PATCH v4 4/4] mm: introduce MADV_PAGEOUT
From: Minchan Kim @ 2019-07-11  1:25 UTC (permalink / raw)
  To: Andrew Morton
  Cc: linux-mm, LKML, linux-api, Michal Hocko, Johannes Weiner,
	Tim Murray, Joel Fernandes, Suren Baghdasaryan, Daniel Colascione,
	Shakeel Butt, Sonny Rao, oleksandr, hdanton, lizeb, Dave Hansen,
	Kirill A . Shutemov, Minchan Kim
In-Reply-To: <20190711012528.176050-1-minchan@kernel.org>

When a process expects no accesses to a certain memory range
for a long time, it could hint kernel that the pages can be
reclaimed instantly but data should be preserved for future use.
This could reduce workingset eviction so it ends up increasing
performance.

This patch introduces the new MADV_PAGEOUT hint to madvise(2)
syscall. MADV_PAGEOUT can be used by a process to mark a memory
range as not expected to be used for a long time so that kernel
reclaims *any LRU* pages instantly. The hint can help kernel in
deciding which pages to evict proactively.

A note: It doesn't apply SWAP_CLUSTER_MAX LRU page isolation limit
intentionally because it's automatically bounded by PMD size.
If PMD size(e.g., 256) makes some trouble, we could fix it later
by limit it to SWAP_CLUSTER_MAX[1].

- man-page material

MADV_PAGEOUT (since Linux x.x)

Do not expect access in the near future so pages in the specified
regions could be reclaimed instantly regardless of memory pressure.
Thus, access in the range after successful operation could cause
major page fault but never lose the up-to-date contents unlike
MADV_DONTNEED. Pages belonging to a shared mapping are only processed
if a write access is allowed for the calling process.

MADV_PAGEOUT cannot be applied to locked pages, Huge TLB pages, or
VM_PFNMAP pages.

* v3
 * man page material modification - mhocko
 * remove using SWAP_CLUSTER_MAX - mhocko

* v2
 * add comment about SWAP_CLUSTER_MAX - mhocko
 * add permission check to prevent sidechannel attack - mhocko
 * add man page stuff - dave

* v1
 * change pte to old and rely on the other's reference - hannes
 * remove page_mapcount to check shared page - mhocko

* RFC v2
 * make reclaim_pages simple via factoring out isolate logic - hannes

* RFCv1
 * rename from MADV_COLD to MADV_PAGEOUT - hannes
 * bail out if process is being killed - Hillf
 * fix reclaim_pages bugs - Hillf

[1] https://lore.kernel.org/lkml/20190710194719.GS29695@dhcp22.suse.cz/
Acked-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Minchan Kim <minchan@kernel.org>
---
 include/linux/swap.h                   |   1 +
 include/uapi/asm-generic/mman-common.h |   1 +
 mm/madvise.c                           | 197 +++++++++++++++++++++++++
 mm/vmscan.c                            |  55 +++++++
 4 files changed, 254 insertions(+)

diff --git a/include/linux/swap.h b/include/linux/swap.h
index 0ce997edb8bb..063c0c1e112b 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -365,6 +365,7 @@ extern int vm_swappiness;
 extern int remove_mapping(struct address_space *mapping, struct page *page);
 extern unsigned long vm_total_pages;
 
+extern unsigned long reclaim_pages(struct list_head *page_list);
 #ifdef CONFIG_NUMA
 extern int node_reclaim_mode;
 extern int sysctl_min_unmapped_ratio;
diff --git a/include/uapi/asm-generic/mman-common.h b/include/uapi/asm-generic/mman-common.h
index ef8a56927b12..c613abdb7284 100644
--- a/include/uapi/asm-generic/mman-common.h
+++ b/include/uapi/asm-generic/mman-common.h
@@ -46,6 +46,7 @@
 #define MADV_WILLNEED	3		/* will need these pages */
 #define MADV_DONTNEED	4		/* don't need these pages */
 #define MADV_COLD	5		/* deactivatie these pages */
+#define MADV_PAGEOUT	6		/* reclaim these pages */
 
 /* common parameters: try to keep these consistent across architectures */
 #define MADV_FREE	8		/* free pages only if memory pressure */
diff --git a/mm/madvise.c b/mm/madvise.c
index bae0055f9724..bc2f0138982e 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -11,6 +11,7 @@
 #include <linux/syscalls.h>
 #include <linux/mempolicy.h>
 #include <linux/page-isolation.h>
+#include <linux/page_idle.h>
 #include <linux/userfaultfd_k.h>
 #include <linux/hugetlb.h>
 #include <linux/falloc.h>
@@ -41,6 +42,7 @@ static int madvise_need_mmap_write(int behavior)
 	case MADV_WILLNEED:
 	case MADV_DONTNEED:
 	case MADV_COLD:
+	case MADV_PAGEOUT:
 	case MADV_FREE:
 		return 0;
 	default:
@@ -480,6 +482,198 @@ static long madvise_cold(struct vm_area_struct *vma,
 	return 0;
 }
 
+static int madvise_pageout_pte_range(pmd_t *pmd, unsigned long addr,
+				unsigned long end, struct mm_walk *walk)
+{
+	struct mmu_gather *tlb = walk->private;
+	struct mm_struct *mm = tlb->mm;
+	struct vm_area_struct *vma = walk->vma;
+	pte_t *orig_pte, *pte, ptent;
+	spinlock_t *ptl;
+	LIST_HEAD(page_list);
+	struct page *page;
+	unsigned long next;
+
+	if (fatal_signal_pending(current))
+		return -EINTR;
+
+	next = pmd_addr_end(addr, end);
+	if (pmd_trans_huge(*pmd)) {
+		pmd_t orig_pmd;
+
+		tlb_change_page_size(tlb, HPAGE_PMD_SIZE);
+		ptl = pmd_trans_huge_lock(pmd, vma);
+		if (!ptl)
+			return 0;
+
+		orig_pmd = *pmd;
+		if (is_huge_zero_pmd(orig_pmd))
+			goto huge_unlock;
+
+		if (unlikely(!pmd_present(orig_pmd))) {
+			VM_BUG_ON(thp_migration_supported() &&
+					!is_pmd_migration_entry(orig_pmd));
+			goto huge_unlock;
+		}
+
+		page = pmd_page(orig_pmd);
+		if (next - addr != HPAGE_PMD_SIZE) {
+			int err;
+
+			if (page_mapcount(page) != 1)
+				goto huge_unlock;
+			get_page(page);
+			spin_unlock(ptl);
+			lock_page(page);
+			err = split_huge_page(page);
+			unlock_page(page);
+			put_page(page);
+			if (!err)
+				goto regular_page;
+			return 0;
+		}
+
+		if (isolate_lru_page(page))
+			goto huge_unlock;
+
+		if (pmd_young(orig_pmd)) {
+			pmdp_invalidate(vma, addr, pmd);
+			orig_pmd = pmd_mkold(orig_pmd);
+
+			set_pmd_at(mm, addr, pmd, orig_pmd);
+			tlb_remove_tlb_entry(tlb, pmd, addr);
+		}
+
+		ClearPageReferenced(page);
+		test_and_clear_page_young(page);
+		list_add(&page->lru, &page_list);
+huge_unlock:
+		spin_unlock(ptl);
+		reclaim_pages(&page_list);
+		return 0;
+	}
+
+	if (pmd_trans_unstable(pmd))
+		return 0;
+regular_page:
+	tlb_change_page_size(tlb, PAGE_SIZE);
+	orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
+	flush_tlb_batched_pending(mm);
+	arch_enter_lazy_mmu_mode();
+	for (; addr < end; pte++, addr += PAGE_SIZE) {
+		ptent = *pte;
+		if (!pte_present(ptent))
+			continue;
+
+		page = vm_normal_page(vma, addr, ptent);
+		if (!page)
+			continue;
+
+		/*
+		 * creating a THP page is expensive so split it only if we
+		 * are sure it's worth. Split it if we are only owner.
+		 */
+		if (PageTransCompound(page)) {
+			if (page_mapcount(page) != 1)
+				break;
+			get_page(page);
+			if (!trylock_page(page)) {
+				put_page(page);
+				break;
+			}
+			pte_unmap_unlock(orig_pte, ptl);
+			if (split_huge_page(page)) {
+				unlock_page(page);
+				put_page(page);
+				pte_offset_map_lock(mm, pmd, addr, &ptl);
+				break;
+			}
+			unlock_page(page);
+			put_page(page);
+			pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
+			pte--;
+			addr -= PAGE_SIZE;
+			continue;
+		}
+
+		VM_BUG_ON_PAGE(PageTransCompound(page), page);
+
+		if (isolate_lru_page(page))
+			continue;
+
+		if (pte_young(ptent)) {
+			ptent = ptep_get_and_clear_full(mm, addr, pte,
+							tlb->fullmm);
+			ptent = pte_mkold(ptent);
+			set_pte_at(mm, addr, pte, ptent);
+			tlb_remove_tlb_entry(tlb, pte, addr);
+		}
+		ClearPageReferenced(page);
+		test_and_clear_page_young(page);
+		list_add(&page->lru, &page_list);
+	}
+
+	arch_leave_lazy_mmu_mode();
+	pte_unmap_unlock(orig_pte, ptl);
+	reclaim_pages(&page_list);
+	cond_resched();
+
+	return 0;
+}
+
+static void madvise_pageout_page_range(struct mmu_gather *tlb,
+			     struct vm_area_struct *vma,
+			     unsigned long addr, unsigned long end)
+{
+	struct mm_walk pageout_walk = {
+		.pmd_entry = madvise_pageout_pte_range,
+		.mm = vma->vm_mm,
+		.private = tlb,
+	};
+
+	tlb_start_vma(tlb, vma);
+	walk_page_range(addr, end, &pageout_walk);
+	tlb_end_vma(tlb, vma);
+}
+
+static inline bool can_do_pageout(struct vm_area_struct *vma)
+{
+	if (vma_is_anonymous(vma))
+		return true;
+	if (!vma->vm_file)
+		return false;
+	/*
+	 * paging out pagecache only for non-anonymous mappings that correspond
+	 * to the files the calling process could (if tried) open for writing;
+	 * otherwise we'd be including shared non-exclusive mappings, which
+	 * opens a side channel.
+	 */
+	return inode_owner_or_capable(file_inode(vma->vm_file)) ||
+		inode_permission(file_inode(vma->vm_file), MAY_WRITE) == 0;
+}
+
+static long madvise_pageout(struct vm_area_struct *vma,
+			struct vm_area_struct **prev,
+			unsigned long start_addr, unsigned long end_addr)
+{
+	struct mm_struct *mm = vma->vm_mm;
+	struct mmu_gather tlb;
+
+	*prev = vma;
+	if (!can_madv_lru_vma(vma))
+		return -EINVAL;
+
+	if (!can_do_pageout(vma))
+		return 0;
+
+	lru_add_drain();
+	tlb_gather_mmu(&tlb, mm, start_addr, end_addr);
+	madvise_pageout_page_range(&tlb, vma, start_addr, end_addr);
+	tlb_finish_mmu(&tlb, start_addr, end_addr);
+
+	return 0;
+}
+
 static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
 				unsigned long end, struct mm_walk *walk)
 
@@ -870,6 +1064,8 @@ madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
 		return madvise_willneed(vma, prev, start, end);
 	case MADV_COLD:
 		return madvise_cold(vma, prev, start, end);
+	case MADV_PAGEOUT:
+		return madvise_pageout(vma, prev, start, end);
 	case MADV_FREE:
 	case MADV_DONTNEED:
 		return madvise_dontneed_free(vma, prev, start, end, behavior);
@@ -892,6 +1088,7 @@ madvise_behavior_valid(int behavior)
 	case MADV_DONTNEED:
 	case MADV_FREE:
 	case MADV_COLD:
+	case MADV_PAGEOUT:
 #ifdef CONFIG_KSM
 	case MADV_MERGEABLE:
 	case MADV_UNMERGEABLE:
diff --git a/mm/vmscan.c b/mm/vmscan.c
index ca192b792d4f..bda3c41de767 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2153,6 +2153,61 @@ static void shrink_active_list(unsigned long nr_to_scan,
 			nr_deactivate, nr_rotated, sc->priority, file);
 }
 
+unsigned long reclaim_pages(struct list_head *page_list)
+{
+	int nid = -1;
+	unsigned long nr_reclaimed = 0;
+	LIST_HEAD(node_page_list);
+	struct reclaim_stat dummy_stat;
+	struct page *page;
+	struct scan_control sc = {
+		.gfp_mask = GFP_KERNEL,
+		.priority = DEF_PRIORITY,
+		.may_writepage = 1,
+		.may_unmap = 1,
+		.may_swap = 1,
+	};
+
+	while (!list_empty(page_list)) {
+		page = lru_to_page(page_list);
+		if (nid == -1) {
+			nid = page_to_nid(page);
+			INIT_LIST_HEAD(&node_page_list);
+		}
+
+		if (nid == page_to_nid(page)) {
+			list_move(&page->lru, &node_page_list);
+			continue;
+		}
+
+		nr_reclaimed += shrink_page_list(&node_page_list,
+						NODE_DATA(nid),
+						&sc, 0,
+						&dummy_stat, false);
+		while (!list_empty(&node_page_list)) {
+			page = lru_to_page(&node_page_list);
+			list_del(&page->lru);
+			putback_lru_page(page);
+		}
+
+		nid = -1;
+	}
+
+	if (!list_empty(&node_page_list)) {
+		nr_reclaimed += shrink_page_list(&node_page_list,
+						NODE_DATA(nid),
+						&sc, 0,
+						&dummy_stat, false);
+		while (!list_empty(&node_page_list)) {
+			page = lru_to_page(&node_page_list);
+			list_del(&page->lru);
+			putback_lru_page(page);
+		}
+	}
+
+	return nr_reclaimed;
+}
+
 /*
  * The inactive anon list should be small enough that the VM never has
  * to do too much work.
-- 
2.22.0.410.gd8fdbe21b5-goog

^ permalink raw reply related

* Re: [PATCH V34 10/29] hibernate: Disable when the kernel is locked down
From: joeyli @ 2019-07-11  4:11 UTC (permalink / raw)
  To: Jiri Kosina
  Cc: Pavel Machek, Matthew Garrett, jmorris, linux-security-module,
	linux-kernel, linux-api, Josh Boyer, David Howells,
	Matthew Garrett, rjw, linux-pm
In-Reply-To: <nycvar.YFH.7.76.1906241520500.27227@cbobk.fhfr.pm>

Hi experts,

On Mon, Jun 24, 2019 at 03:21:23PM +0200, Jiri Kosina wrote:
> On Sat, 22 Jun 2019, Pavel Machek wrote:
> 
> > > There is currently no way to verify the resume image when returning
> > > from hibernate.  This might compromise the signed modules trust model,
> > > so until we can work with signed hibernate images we disable it when the
> > > kernel is locked down.
> > 
> > I keep getting these...
> > 
> > IIRC suse has patches to verify the images.
> 
> Yeah, Joey Lee is taking care of those. CCing.
>

The last time that I sent for hibernation encryption and authentication is
here:
    https://lkml.org/lkml/2019/1/3/281

It needs some big changes after review:
 - Simplify the design: remove keyring dependency and trampoline.
 - Encrypted whole snapshot image instead of only data pages.
 - Using TPM:
        - Direct use TPM API in hibernation instead of keyring
        - Localities (suggested by James Bottomley)

I am still finding enough time to implement those changes, especial TPM
parts.

Thanks
Joey Lee

^ permalink raw reply

* Re: [PATCH v4 1/4] mm: introduce MADV_COLD
From: Johannes Weiner @ 2019-07-11 15:25 UTC (permalink / raw)
  To: Minchan Kim
  Cc: Andrew Morton, linux-mm, LKML, linux-api, Michal Hocko,
	Tim Murray, Joel Fernandes, Suren Baghdasaryan, Daniel Colascione,
	Shakeel Butt, Sonny Rao, oleksandr, hdanton, lizeb, Dave Hansen,
	Kirill A . Shutemov
In-Reply-To: <20190711012528.176050-2-minchan@kernel.org>

On Thu, Jul 11, 2019 at 10:25:25AM +0900, Minchan Kim wrote:
> When a process expects no accesses to a certain memory range, it could
> give a hint to kernel that the pages can be reclaimed when memory pressure
> happens but data should be preserved for future use.  This could reduce
> workingset eviction so it ends up increasing performance.
> 
> This patch introduces the new MADV_COLD hint to madvise(2) syscall.
> MADV_COLD can be used by a process to mark a memory range as not expected
> to be used in the near future. The hint can help kernel in deciding which
> pages to evict early during memory pressure.
> 
> It works for every LRU pages like MADV_[DONTNEED|FREE]. IOW, It moves
> 
> 	active file page -> inactive file LRU
> 	active anon page -> inacdtive anon LRU
> 
> Unlike MADV_FREE, it doesn't move active anonymous pages to inactive
> file LRU's head because MADV_COLD is a little bit different symantic.
> MADV_FREE means it's okay to discard when the memory pressure because
> the content of the page is *garbage* so freeing such pages is almost zero
> overhead since we don't need to swap out and access afterward causes just
> minor fault. Thus, it would make sense to put those freeable pages in
> inactive file LRU to compete other used-once pages. It makes sense for
> implmentaion point of view, too because it's not swapbacked memory any
> longer until it would be re-dirtied. Even, it could give a bonus to make
> them be reclaimed on swapless system. However, MADV_COLD doesn't mean
> garbage so reclaiming them requires swap-out/in in the end so it's bigger
> cost. Since we have designed VM LRU aging based on cost-model, anonymous
> cold pages would be better to position inactive anon's LRU list, not file
> LRU. Furthermore, it would help to avoid unnecessary scanning if system
> doesn't have a swap device. Let's start simpler way without adding
> complexity at this moment. However, keep in mind, too that it's a caveat
> that workloads with a lot of pages cache are likely to ignore MADV_COLD
> on anonymous memory because we rarely age anonymous LRU lists.
> 
> * man-page material
> 
> MADV_COLD (since Linux x.x)
> 
> Pages in the specified regions will be treated as less-recently-accessed
> compared to pages in the system with similar access frequencies.
> In contrast to MADV_FREE, the contents of the region are preserved
> regardless of subsequent writes to pages.
> 
> MADV_COLD cannot be applied to locked pages, Huge TLB pages, or VM_PFNMAP
> pages.
> 
> * v2
>  * add up the warn with lots of page cache workload - mhocko
>  * add man page stuff - dave
> 
> * v1
>  * remove page_mapcount filter - hannes, mhocko
>  * remove idle page handling - joelaf
> 
> * RFCv2
>  * add more description - mhocko
> 
> * RFCv1
>  * renaming from MADV_COOL to MADV_COLD - hannes
> 
> * internal review
>  * use clear_page_youn in deactivate_page - joelaf
>  * Revise the description - surenb
>  * Renaming from MADV_WARM to MADV_COOL - surenb
> 
> Acked-by: Michal Hocko <mhocko@suse.com>
> Signed-off-by: Minchan Kim <minchan@kernel.org>

Acked-by: Johannes Weiner <hannes@cmpxchg.org>

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox