Linux userland API discussions
 help / color / mirror / Atom feed
* [PATCH V36 01/29] security: Support early LSMs
From: Matthew Garrett @ 2019-07-18 19:43 UTC (permalink / raw)
  To: jmorris
  Cc: linux-security-module, linux-kernel, linux-api, Matthew Garrett,
	Matthew Garrett, Kees Cook
In-Reply-To: <20190718194415.108476-1-matthewgarrett@google.com>

The lockdown module is intended to allow for kernels to be locked down
early in boot - sufficiently early that we don't have the ability to
kmalloc() yet. Add support for early initialisation of some LSMs, and
then add them to the list of names when we do full initialisation later.
Early LSMs are initialised in link order and cannot be overridden via
boot parameters, and cannot make use of kmalloc() (since the allocator
isn't initialised yet).

Signed-off-by: Matthew Garrett <mjg59@google.com>
Acked-by: Kees Cook <keescook@chromium.org>
---
 include/asm-generic/vmlinux.lds.h |  8 ++++-
 include/linux/lsm_hooks.h         |  6 ++++
 include/linux/security.h          |  1 +
 init/main.c                       |  1 +
 security/security.c               | 50 ++++++++++++++++++++++++++-----
 5 files changed, 57 insertions(+), 9 deletions(-)

diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h
index ca42182992a5..6cc6174a2a4c 100644
--- a/include/asm-generic/vmlinux.lds.h
+++ b/include/asm-generic/vmlinux.lds.h
@@ -215,8 +215,13 @@
 			__start_lsm_info = .;				\
 			KEEP(*(.lsm_info.init))				\
 			__end_lsm_info = .;
+#define EARLY_LSM_TABLE()	. = ALIGN(8);				\
+			__start_early_lsm_info = .;			\
+			KEEP(*(.early_lsm_info.init))			\
+			__end_early_lsm_info = .;
 #else
 #define LSM_TABLE()
+#define EARLY_LSM_TABLE()
 #endif
 
 #define ___OF_TABLE(cfg, name)	_OF_TABLE_##cfg(name)
@@ -616,7 +621,8 @@
 	ACPI_PROBE_TABLE(irqchip)					\
 	ACPI_PROBE_TABLE(timer)						\
 	EARLYCON_TABLE()						\
-	LSM_TABLE()
+	LSM_TABLE()							\
+	EARLY_LSM_TABLE()
 
 #define INIT_TEXT							\
 	*(.init.text .init.text.*)					\
diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h
index df1318d85f7d..aebb0e032072 100644
--- a/include/linux/lsm_hooks.h
+++ b/include/linux/lsm_hooks.h
@@ -2104,12 +2104,18 @@ struct lsm_info {
 };
 
 extern struct lsm_info __start_lsm_info[], __end_lsm_info[];
+extern struct lsm_info __start_early_lsm_info[], __end_early_lsm_info[];
 
 #define DEFINE_LSM(lsm)							\
 	static struct lsm_info __lsm_##lsm				\
 		__used __section(.lsm_info.init)			\
 		__aligned(sizeof(unsigned long))
 
+#define DEFINE_EARLY_LSM(lsm)						\
+	static struct lsm_info __early_lsm_##lsm			\
+		__used __section(.early_lsm_info.init)			\
+		__aligned(sizeof(unsigned long))
+
 #ifdef CONFIG_SECURITY_SELINUX_DISABLE
 /*
  * Assuring the safety of deleting a security module is up to
diff --git a/include/linux/security.h b/include/linux/security.h
index 5f7441abbf42..66a2fcbe6ab0 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -195,6 +195,7 @@ int unregister_blocking_lsm_notifier(struct notifier_block *nb);
 
 /* prototypes */
 extern int security_init(void);
+extern int early_security_init(void);
 
 /* Security operations */
 int security_binder_set_context_mgr(struct task_struct *mgr);
diff --git a/init/main.c b/init/main.c
index ff5803b0841c..0fefca3fd43c 100644
--- a/init/main.c
+++ b/init/main.c
@@ -593,6 +593,7 @@ asmlinkage __visible void __init start_kernel(void)
 	boot_cpu_init();
 	page_address_init();
 	pr_notice("%s", linux_banner);
+	early_security_init();
 	setup_arch(&command_line);
 	mm_init_cpumask(&init_mm);
 	setup_command_line(command_line);
diff --git a/security/security.c b/security/security.c
index 250ee2d76406..90f1e291c800 100644
--- a/security/security.c
+++ b/security/security.c
@@ -33,6 +33,7 @@
 
 /* How many LSMs were built into the kernel? */
 #define LSM_COUNT (__end_lsm_info - __start_lsm_info)
+#define EARLY_LSM_COUNT (__end_early_lsm_info - __start_early_lsm_info)
 
 struct security_hook_heads security_hook_heads __lsm_ro_after_init;
 static BLOCKING_NOTIFIER_HEAD(blocking_lsm_notifier_chain);
@@ -277,6 +278,8 @@ static void __init ordered_lsm_parse(const char *order, const char *origin)
 static void __init lsm_early_cred(struct cred *cred);
 static void __init lsm_early_task(struct task_struct *task);
 
+static int lsm_append(const char *new, char **result);
+
 static void __init ordered_lsm_init(void)
 {
 	struct lsm_info **lsm;
@@ -323,6 +326,26 @@ static void __init ordered_lsm_init(void)
 	kfree(ordered_lsms);
 }
 
+int __init early_security_init(void)
+{
+	int i;
+	struct hlist_head *list = (struct hlist_head *) &security_hook_heads;
+	struct lsm_info *lsm;
+
+	for (i = 0; i < sizeof(security_hook_heads) / sizeof(struct hlist_head);
+	     i++)
+		INIT_HLIST_HEAD(&list[i]);
+
+	for (lsm = __start_early_lsm_info; lsm < __end_early_lsm_info; lsm++) {
+		if (!lsm->enabled)
+			lsm->enabled = &lsm_enabled_true;
+		prepare_lsm(lsm);
+		initialize_lsm(lsm);
+	}
+
+	return 0;
+}
+
 /**
  * security_init - initializes the security framework
  *
@@ -330,14 +353,18 @@ static void __init ordered_lsm_init(void)
  */
 int __init security_init(void)
 {
-	int i;
-	struct hlist_head *list = (struct hlist_head *) &security_hook_heads;
+	struct lsm_info *lsm;
 
 	pr_info("Security Framework initializing\n");
 
-	for (i = 0; i < sizeof(security_hook_heads) / sizeof(struct hlist_head);
-	     i++)
-		INIT_HLIST_HEAD(&list[i]);
+	/*
+	 * Append the names of the early LSM modules now that kmalloc() is
+	 * available
+	 */
+	for (lsm = __start_early_lsm_info; lsm < __end_early_lsm_info; lsm++) {
+		if (lsm->enabled)
+			lsm_append(lsm->name, &lsm_names);
+	}
 
 	/* Load LSMs in specified order. */
 	ordered_lsm_init();
@@ -384,7 +411,7 @@ static bool match_last_lsm(const char *list, const char *lsm)
 	return !strcmp(last, lsm);
 }
 
-static int lsm_append(char *new, char **result)
+static int lsm_append(const char *new, char **result)
 {
 	char *cp;
 
@@ -422,8 +449,15 @@ void __init security_add_hooks(struct security_hook_list *hooks, int count,
 		hooks[i].lsm = lsm;
 		hlist_add_tail_rcu(&hooks[i].list, hooks[i].head);
 	}
-	if (lsm_append(lsm, &lsm_names) < 0)
-		panic("%s - Cannot get early memory.\n", __func__);
+
+	/*
+	 * Don't try to append during early_security_init(), we'll come back
+	 * and fix this up afterwards.
+	 */
+	if (slab_is_available()) {
+		if (lsm_append(lsm, &lsm_names) < 0)
+			panic("%s - Cannot get early memory.\n", __func__);
+	}
 }
 
 int call_blocking_lsm_notifier(enum lsm_event event, void *data)
-- 
2.22.0.510.g264f2c817a-goog

^ permalink raw reply related

* [PATCH V36 00/29] security: Add kernel lockdown functionality
From: Matthew Garrett @ 2019-07-18 19:43 UTC (permalink / raw)
  To: jmorris; +Cc: linux-security-module, linux-kernel, linux-api

Minor changes to the previous set, other than a significant rework of
the "Ignore acpi_rsdp kernel param" patch to deal with the early parsing
of that parameter under certain circumstances.

^ permalink raw reply

* [PATCH v12 6/6] sched/core: uclamp: always use enum uclamp_id for clamp_id values
From: Patrick Bellasi @ 2019-07-18 18:17 UTC (permalink / raw)
  To: linux-kernel, linux-pm, linux-api, cgroups
  Cc: Ingo Molnar, Peter Zijlstra, Tejun Heo, Rafael J . Wysocki,
	Vincent Guittot, Viresh Kumar, Paul Turner, Michal Koutny,
	Quentin Perret, Dietmar Eggemann, Morten Rasmussen, Juri Lelli,
	Todd Kjos, Joel Fernandes, Steve Muckle, Suren Baghdasaryan,
	Alessio Balsini
In-Reply-To: <20190718181748.28446-1-patrick.bellasi@arm.com>

The supported clamp indexes are defined in enum clamp_id however, because
of the code logic in some of the first utilization clamping series version,
sometimes we needed to use unsigned int to represent indexes.

This is not more required since the final version of the uclamp_* APIs can
always use the proper enum uclamp_id type.

Fix it with a bulk rename now that we have all the bits merged.

Signed-off-by: Patrick Bellasi <patrick.bellasi@arm.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>

---
Changes in v12:
 Message-ID: <20190716140319.hdmgcuevnpwdqobl@e110439-lin>
 - added in this series
---
 kernel/sched/core.c  | 38 +++++++++++++++++++-------------------
 kernel/sched/sched.h |  2 +-
 2 files changed, 20 insertions(+), 20 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 26ac1cbec0be..1e6cf9499d49 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -810,7 +810,7 @@ static inline unsigned int uclamp_bucket_base_value(unsigned int clamp_value)
 	return UCLAMP_BUCKET_DELTA * uclamp_bucket_id(clamp_value);
 }
 
-static inline unsigned int uclamp_none(int clamp_id)
+static inline enum uclamp_id uclamp_none(enum uclamp_id clamp_id)
 {
 	if (clamp_id == UCLAMP_MIN)
 		return 0;
@@ -826,7 +826,7 @@ static inline void uclamp_se_set(struct uclamp_se *uc_se,
 }
 
 static inline unsigned int
-uclamp_idle_value(struct rq *rq, unsigned int clamp_id,
+uclamp_idle_value(struct rq *rq, enum uclamp_id clamp_id,
 		  unsigned int clamp_value)
 {
 	/*
@@ -842,7 +842,7 @@ uclamp_idle_value(struct rq *rq, unsigned int clamp_id,
 	return uclamp_none(UCLAMP_MIN);
 }
 
-static inline void uclamp_idle_reset(struct rq *rq, unsigned int clamp_id,
+static inline void uclamp_idle_reset(struct rq *rq, enum uclamp_id clamp_id,
 				     unsigned int clamp_value)
 {
 	/* Reset max-clamp retention only on idle exit */
@@ -853,8 +853,8 @@ static inline void uclamp_idle_reset(struct rq *rq, unsigned int clamp_id,
 }
 
 static inline
-unsigned int uclamp_rq_max_value(struct rq *rq, unsigned int clamp_id,
-				 unsigned int clamp_value)
+enum uclamp_id uclamp_rq_max_value(struct rq *rq, enum uclamp_id clamp_id,
+				   unsigned int clamp_value)
 {
 	struct uclamp_bucket *bucket = rq->uclamp[clamp_id].bucket;
 	int bucket_id = UCLAMP_BUCKETS - 1;
@@ -874,7 +874,7 @@ unsigned int uclamp_rq_max_value(struct rq *rq, unsigned int clamp_id,
 }
 
 static inline struct uclamp_se
-uclamp_tg_restrict(struct task_struct *p, unsigned int clamp_id)
+uclamp_tg_restrict(struct task_struct *p, enum uclamp_id clamp_id)
 {
 	struct uclamp_se uc_req = p->uclamp_req[clamp_id];
 #ifdef CONFIG_UCLAMP_TASK_GROUP
@@ -906,7 +906,7 @@ uclamp_tg_restrict(struct task_struct *p, unsigned int clamp_id)
  * - the system default clamp value, defined by the sysadmin
  */
 static inline struct uclamp_se
-uclamp_eff_get(struct task_struct *p, unsigned int clamp_id)
+uclamp_eff_get(struct task_struct *p, enum uclamp_id clamp_id)
 {
 	struct uclamp_se uc_req = uclamp_tg_restrict(p, clamp_id);
 	struct uclamp_se uc_max = uclamp_default[clamp_id];
@@ -918,7 +918,7 @@ uclamp_eff_get(struct task_struct *p, unsigned int clamp_id)
 	return uc_req;
 }
 
-unsigned int uclamp_eff_value(struct task_struct *p, unsigned int clamp_id)
+enum uclamp_id uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp_id)
 {
 	struct uclamp_se uc_eff;
 
@@ -942,7 +942,7 @@ unsigned int uclamp_eff_value(struct task_struct *p, unsigned int clamp_id)
  * for each bucket when all its RUNNABLE tasks require the same clamp.
  */
 static inline void uclamp_rq_inc_id(struct rq *rq, struct task_struct *p,
-				    unsigned int clamp_id)
+				    enum uclamp_id clamp_id)
 {
 	struct uclamp_rq *uc_rq = &rq->uclamp[clamp_id];
 	struct uclamp_se *uc_se = &p->uclamp[clamp_id];
@@ -980,7 +980,7 @@ static inline void uclamp_rq_inc_id(struct rq *rq, struct task_struct *p,
  * enforce the expected state and warn.
  */
 static inline void uclamp_rq_dec_id(struct rq *rq, struct task_struct *p,
-				    unsigned int clamp_id)
+				    enum uclamp_id clamp_id)
 {
 	struct uclamp_rq *uc_rq = &rq->uclamp[clamp_id];
 	struct uclamp_se *uc_se = &p->uclamp[clamp_id];
@@ -1019,7 +1019,7 @@ static inline void uclamp_rq_dec_id(struct rq *rq, struct task_struct *p,
 
 static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p)
 {
-	unsigned int clamp_id;
+	enum uclamp_id clamp_id;
 
 	if (unlikely(!p->sched_class->uclamp_enabled))
 		return;
@@ -1034,7 +1034,7 @@ static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p)
 
 static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p)
 {
-	unsigned int clamp_id;
+	enum uclamp_id clamp_id;
 
 	if (unlikely(!p->sched_class->uclamp_enabled))
 		return;
@@ -1044,7 +1044,7 @@ static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p)
 }
 
 static inline void
-uclamp_update_active(struct task_struct *p, unsigned int clamp_id)
+uclamp_update_active(struct task_struct *p, enum uclamp_id clamp_id)
 {
 	struct rq_flags rf;
 	struct rq *rq;
@@ -1080,9 +1080,9 @@ static inline void
 uclamp_update_active_tasks(struct cgroup_subsys_state *css,
 			   unsigned int clamps)
 {
+	enum uclamp_id clamp_id;
 	struct css_task_iter it;
 	struct task_struct *p;
-	unsigned int clamp_id;
 
 	css_task_iter_start(css, 0, &it);
 	while ((p = css_task_iter_next(&it))) {
@@ -1188,7 +1188,7 @@ static int uclamp_validate(struct task_struct *p,
 static void __setscheduler_uclamp(struct task_struct *p,
 				  const struct sched_attr *attr)
 {
-	unsigned int clamp_id;
+	enum uclamp_id clamp_id;
 
 	/*
 	 * On scheduling class change, reset to default clamps for tasks
@@ -1225,7 +1225,7 @@ static void __setscheduler_uclamp(struct task_struct *p,
 
 static void uclamp_fork(struct task_struct *p)
 {
-	unsigned int clamp_id;
+	enum uclamp_id clamp_id;
 
 	for_each_clamp_id(clamp_id)
 		p->uclamp[clamp_id].active = false;
@@ -1247,7 +1247,7 @@ static void uclamp_fork(struct task_struct *p)
 static void __init init_uclamp(void)
 {
 	struct uclamp_se uc_max = {};
-	unsigned int clamp_id;
+	enum uclamp_id clamp_id;
 	int cpu;
 
 	mutex_init(&uclamp_mutex);
@@ -6851,7 +6851,7 @@ static inline void alloc_uclamp_sched_group(struct task_group *tg,
 					    struct task_group *parent)
 {
 #ifdef CONFIG_UCLAMP_TASK_GROUP
-	int clamp_id;
+	enum uclamp_id clamp_id;
 
 	for_each_clamp_id(clamp_id) {
 		uclamp_se_set(&tg->uclamp_req[clamp_id],
@@ -7113,7 +7113,7 @@ static void cpu_util_update_eff(struct cgroup_subsys_state *css)
 	struct uclamp_se *uc_parent = NULL;
 	struct uclamp_se *uc_se = NULL;
 	unsigned int eff[UCLAMP_CNT];
-	unsigned int clamp_id;
+	enum uclamp_id clamp_id;
 	unsigned int clamps;
 
 	css_for_each_descendant_pre(css, top_css) {
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 93a030321210..e230474c1548 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2276,7 +2276,7 @@ static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {}
 #endif /* CONFIG_CPU_FREQ */
 
 #ifdef CONFIG_UCLAMP_TASK
-unsigned int uclamp_eff_value(struct task_struct *p, unsigned int clamp_id);
+enum uclamp_id uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp_id);
 
 static __always_inline
 unsigned int uclamp_util_with(struct rq *rq, unsigned int util,
-- 
2.22.0

^ permalink raw reply related

* [PATCH v12 5/6] sched/core: uclamp: Update CPU's refcount on TG's clamp changes
From: Patrick Bellasi @ 2019-07-18 18:17 UTC (permalink / raw)
  To: linux-kernel, linux-pm, linux-api, cgroups
  Cc: Ingo Molnar, Peter Zijlstra, Tejun Heo, Rafael J . Wysocki,
	Vincent Guittot, Viresh Kumar, Paul Turner, Michal Koutny,
	Quentin Perret, Dietmar Eggemann, Morten Rasmussen, Juri Lelli,
	Todd Kjos, Joel Fernandes, Steve Muckle, Suren Baghdasaryan,
	Alessio Balsini
In-Reply-To: <20190718181748.28446-1-patrick.bellasi@arm.com>

On updates of task group (TG) clamp values, ensure that these new values
are enforced on all RUNNABLE tasks of the task group, i.e. all RUNNABLE
tasks are immediately boosted and/or capped as requested.

Do that each time we update effective clamps from cpu_util_update_eff().
Use the *cgroup_subsys_state (css) to walk the list of tasks in each
affected TG and update their RUNNABLE tasks.
Update each task by using the same mechanism used for cpu affinity masks
updates, i.e. by taking the rq lock.

Signed-off-by: Patrick Bellasi <patrick.bellasi@arm.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Tejun Heo <tj@kernel.org>
---
 kernel/sched/core.c | 58 ++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 57 insertions(+), 1 deletion(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 426736b2c4d7..26ac1cbec0be 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1043,6 +1043,57 @@ static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p)
 		uclamp_rq_dec_id(rq, p, clamp_id);
 }
 
+static inline void
+uclamp_update_active(struct task_struct *p, unsigned int clamp_id)
+{
+	struct rq_flags rf;
+	struct rq *rq;
+
+	/*
+	 * Lock the task and the rq where the task is (or was) queued.
+	 *
+	 * We might lock the (previous) rq of a !RUNNABLE task, but that's the
+	 * price to pay to safely serialize util_{min,max} updates with
+	 * enqueues, dequeues and migration operations.
+	 * This is the same locking schema used by __set_cpus_allowed_ptr().
+	 */
+	rq = task_rq_lock(p, &rf);
+
+	/*
+	 * Setting the clamp bucket is serialized by task_rq_lock().
+	 * If the task is not yet RUNNABLE and its task_struct is not
+	 * affecting a valid clamp bucket, the next time it's enqueued,
+	 * it will already see the updated clamp bucket value.
+	 */
+	if (!p->uclamp[clamp_id].active)
+		goto done;
+
+	uclamp_rq_dec_id(rq, p, clamp_id);
+	uclamp_rq_inc_id(rq, p, clamp_id);
+
+done:
+
+	task_rq_unlock(rq, p, &rf);
+}
+
+static inline void
+uclamp_update_active_tasks(struct cgroup_subsys_state *css,
+			   unsigned int clamps)
+{
+	struct css_task_iter it;
+	struct task_struct *p;
+	unsigned int clamp_id;
+
+	css_task_iter_start(css, 0, &it);
+	while ((p = css_task_iter_next(&it))) {
+		for_each_clamp_id(clamp_id) {
+			if ((0x1 << clamp_id) & clamps)
+				uclamp_update_active(p, clamp_id);
+		}
+	}
+	css_task_iter_end(&it);
+}
+
 #ifdef CONFIG_UCLAMP_TASK_GROUP
 static void cpu_util_update_eff(struct cgroup_subsys_state *css);
 static void uclamp_update_root_tg(void)
@@ -7091,8 +7142,13 @@ static void cpu_util_update_eff(struct cgroup_subsys_state *css)
 			uc_se[clamp_id].bucket_id = uclamp_bucket_id(eff[clamp_id]);
 			clamps |= (0x1 << clamp_id);
 		}
-		if (!clamps)
+		if (!clamps) {
 			css = css_rightmost_descendant(css);
+			continue;
+		}
+
+		/* Immediately update descendants RUNNABLE tasks */
+		uclamp_update_active_tasks(css, clamps);
 	}
 }
 
-- 
2.22.0

^ permalink raw reply related

* [PATCH v12 4/6] sched/core: uclamp: Use TG's clamps to restrict TASK's clamps
From: Patrick Bellasi @ 2019-07-18 18:17 UTC (permalink / raw)
  To: linux-kernel, linux-pm, linux-api, cgroups
  Cc: Ingo Molnar, Peter Zijlstra, Tejun Heo, Rafael J . Wysocki,
	Vincent Guittot, Viresh Kumar, Paul Turner, Michal Koutny,
	Quentin Perret, Dietmar Eggemann, Morten Rasmussen, Juri Lelli,
	Todd Kjos, Joel Fernandes, Steve Muckle, Suren Baghdasaryan,
	Alessio Balsini
In-Reply-To: <20190718181748.28446-1-patrick.bellasi@arm.com>

When a task specific clamp value is configured via sched_setattr(2), this
value is accounted in the corresponding clamp bucket every time the task is
{en,de}qeued. However, when cgroups are also in use, the task specific
clamp values could be restricted by the task_group (TG) clamp values.

Update uclamp_cpu_inc() to aggregate task and TG clamp values. Every time a
task is enqueued, it's accounted in the clamp bucket tracking the smaller
clamp between the task specific value and its TG effective value. This
allows to:

1. ensure cgroup clamps are always used to restrict task specific requests,
   i.e. boosted not more than its TG effective protection and capped at
   least as its TG effective limit.

2. implement a "nice-like" policy, where tasks are still allowed to request
   less than what enforced by their TG effective limits and protections

Do this by exploiting the concept of "effective" clamp, which is already
used by a TG to track parent enforced restrictions.

Apply task group clamp restrictions only to tasks belonging to a child
group. While, for tasks in the root group or in an autogroup, system
defaults are still enforced.

Signed-off-by: Patrick Bellasi <patrick.bellasi@arm.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Tejun Heo <tj@kernel.org>

---
Changes in v12:
 Message-ID: <20190716143435.iwwd6fjr3udlqol4@e110439-lin>
 - remove not required and confusing sentence from the above changelog
---
 kernel/sched/core.c | 28 +++++++++++++++++++++++++++-
 1 file changed, 27 insertions(+), 1 deletion(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index e9231b089d5c..426736b2c4d7 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -873,16 +873,42 @@ unsigned int uclamp_rq_max_value(struct rq *rq, unsigned int clamp_id,
 	return uclamp_idle_value(rq, clamp_id, clamp_value);
 }
 
+static inline struct uclamp_se
+uclamp_tg_restrict(struct task_struct *p, unsigned int clamp_id)
+{
+	struct uclamp_se uc_req = p->uclamp_req[clamp_id];
+#ifdef CONFIG_UCLAMP_TASK_GROUP
+	struct uclamp_se uc_max;
+
+	/*
+	 * Tasks in autogroups or root task group will be
+	 * restricted by system defaults.
+	 */
+	if (task_group_is_autogroup(task_group(p)))
+		return uc_req;
+	if (task_group(p) == &root_task_group)
+		return uc_req;
+
+	uc_max = task_group(p)->uclamp[clamp_id];
+	if (uc_req.value > uc_max.value || !uc_req.user_defined)
+		return uc_max;
+#endif
+
+	return uc_req;
+}
+
 /*
  * The effective clamp bucket index of a task depends on, by increasing
  * priority:
  * - the task specific clamp value, when explicitly requested from userspace
+ * - the task group effective clamp value, for tasks not either in the root
+ *   group or in an autogroup
  * - the system default clamp value, defined by the sysadmin
  */
 static inline struct uclamp_se
 uclamp_eff_get(struct task_struct *p, unsigned int clamp_id)
 {
-	struct uclamp_se uc_req = p->uclamp_req[clamp_id];
+	struct uclamp_se uc_req = uclamp_tg_restrict(p, clamp_id);
 	struct uclamp_se uc_max = uclamp_default[clamp_id];
 
 	/* System default restrictions always apply */
-- 
2.22.0

^ permalink raw reply related

* [PATCH v12 3/6] sched/core: uclamp: Propagate system defaults to root group
From: Patrick Bellasi @ 2019-07-18 18:17 UTC (permalink / raw)
  To: linux-kernel, linux-pm, linux-api, cgroups
  Cc: Ingo Molnar, Peter Zijlstra, Tejun Heo, Rafael J . Wysocki,
	Vincent Guittot, Viresh Kumar, Paul Turner, Michal Koutny,
	Quentin Perret, Dietmar Eggemann, Morten Rasmussen, Juri Lelli,
	Todd Kjos, Joel Fernandes, Steve Muckle, Suren Baghdasaryan,
	Alessio Balsini
In-Reply-To: <20190718181748.28446-1-patrick.bellasi@arm.com>

The clamp values are not tunable at the level of the root task group.
That's for two main reasons:

 - the root group represents "system resources" which are always
   entirely available from the cgroup standpoint.

 - when tuning/restricting "system resources" makes sense, tuning must
   be done using a system wide API which should also be available when
   control groups are not.

When a system wide restriction is available, cgroups should be aware of
its value in order to know exactly how much "system resources" are
available for the subgroups.

Utilization clamping supports already the concepts of:

 - system defaults: which define the maximum possible clamp values
   usable by tasks.

 - effective clamps: which allows a parent cgroup to constraint (maybe
   temporarily) its descendants without losing the information related
   to the values "requested" from them.

Exploit these two concepts and bind them together in such a way that,
whenever system default are tuned, the new values are propagated to
(possibly) restrict or relax the "effective" value of nested cgroups.

Signed-off-by: Patrick Bellasi <patrick.bellasi@arm.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Tejun Heo <tj@kernel.org>

---
Changes in v12:
 Message-ID: <20190716143417.us3xhksrsaxsl2ok@e110439-lin>
 - add missing RCU read locks across cpu_util_update_eff() call from
   uclamp_update_root_tg()
---
 kernel/sched/core.c | 25 +++++++++++++++++++++++++
 1 file changed, 25 insertions(+)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 08f5a0c205c6..e9231b089d5c 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1017,10 +1017,30 @@ static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p)
 		uclamp_rq_dec_id(rq, p, clamp_id);
 }
 
+#ifdef CONFIG_UCLAMP_TASK_GROUP
+static void cpu_util_update_eff(struct cgroup_subsys_state *css);
+static void uclamp_update_root_tg(void)
+{
+	struct task_group *tg = &root_task_group;
+
+	uclamp_se_set(&tg->uclamp_req[UCLAMP_MIN],
+		      sysctl_sched_uclamp_util_min, false);
+	uclamp_se_set(&tg->uclamp_req[UCLAMP_MAX],
+		      sysctl_sched_uclamp_util_max, false);
+
+	rcu_read_lock();
+	cpu_util_update_eff(&root_task_group.css);
+	rcu_read_unlock();
+}
+#else
+static void uclamp_update_root_tg(void) { }
+#endif
+
 int sysctl_sched_uclamp_handler(struct ctl_table *table, int write,
 				void __user *buffer, size_t *lenp,
 				loff_t *ppos)
 {
+	bool update_root_tg = false;
 	int old_min, old_max;
 	int result;
 
@@ -1043,12 +1063,17 @@ int sysctl_sched_uclamp_handler(struct ctl_table *table, int write,
 	if (old_min != sysctl_sched_uclamp_util_min) {
 		uclamp_se_set(&uclamp_default[UCLAMP_MIN],
 			      sysctl_sched_uclamp_util_min, false);
+		update_root_tg = true;
 	}
 	if (old_max != sysctl_sched_uclamp_util_max) {
 		uclamp_se_set(&uclamp_default[UCLAMP_MAX],
 			      sysctl_sched_uclamp_util_max, false);
+		update_root_tg = true;
 	}
 
+	if (update_root_tg)
+		uclamp_update_root_tg();
+
 	/*
 	 * Updating all the RUNNABLE task is expensive, keep it simple and do
 	 * just a lazy update at each next enqueue time.
-- 
2.22.0

^ permalink raw reply related

* [PATCH v12 2/6] sched/core: uclamp: Propagate parent clamps
From: Patrick Bellasi @ 2019-07-18 18:17 UTC (permalink / raw)
  To: linux-kernel, linux-pm, linux-api, cgroups
  Cc: Ingo Molnar, Peter Zijlstra, Tejun Heo, Rafael J . Wysocki,
	Vincent Guittot, Viresh Kumar, Paul Turner, Michal Koutny,
	Quentin Perret, Dietmar Eggemann, Morten Rasmussen, Juri Lelli,
	Todd Kjos, Joel Fernandes, Steve Muckle, Suren Baghdasaryan,
	Alessio Balsini
In-Reply-To: <20190718181748.28446-1-patrick.bellasi@arm.com>

In order to properly support hierarchical resources control, the cgroup
delegation model requires that attribute writes from a child group never
fail but still are locally consistent and constrained based on parent's
assigned resources. This requires to properly propagate and aggregate
parent attributes down to its descendants.

Implement this mechanism by adding a new "effective" clamp value for each
task group. The effective clamp value is defined as the smaller value
between the clamp value of a group and the effective clamp value of its
parent. This is the actual clamp value enforced on tasks in a task group.

Since it's possible for a cpu.uclamp.min value to be bigger than the
cpu.uclamp.max value, ensure local consistency by restricting each
"protection" (i.e. min utilization) with the corresponding "limit"
(i.e. max utilization).

Do that at effective clamps propagation to ensure all user-space write
never fails while still always tracking the most restrictive values.

Update sysctl_sched_uclamp_handler() to use the newly introduced
uclamp_mutex so that we serialize system default updates with cgroup
relate updates.

Signed-off-by: Patrick Bellasi <patrick.bellasi@arm.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Tejun Heo <tj@kernel.org>

---
Changes in v12:
 Message-ID: <20190716140706.vuggfigjlys44lkp@e110439-lin>
 - use a dedicated variable for parent restrictions
 - make more explicit in the documentation that the requested "protection" is
   always capped by the requested "limit"
 Message-ID: <20190716175542.p7vs2muslyuez6lq@e110439-lin>
 - use the newly added uclamp_mutex to serialize the sysfs write callback
---
 kernel/sched/core.c  | 70 ++++++++++++++++++++++++++++++++++++++++++--
 kernel/sched/sched.h |  2 ++
 2 files changed, 69 insertions(+), 3 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index fcc32afe53cb..08f5a0c205c6 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -773,6 +773,18 @@ static void set_load_weight(struct task_struct *p, bool update_load)
 }
 
 #ifdef CONFIG_UCLAMP_TASK
+/*
+ * Serializes updates of utilization clamp values
+ *
+ * The (slow-path) user-space triggers utilization clamp value updates which
+ * can require updates on (fast-path) scheduler's data structures used to
+ * support enqueue/dequeue operations.
+ * While the per-CPU rq lock protects fast-path update operations, user-space
+ * requests are serialized using a mutex to reduce the risk of conflicting
+ * updates or API abuses.
+ */
+static DEFINE_MUTEX(uclamp_mutex);
+
 /* Max allowed minimum utilization */
 unsigned int sysctl_sched_uclamp_util_min = SCHED_CAPACITY_SCALE;
 
@@ -1010,10 +1022,9 @@ int sysctl_sched_uclamp_handler(struct ctl_table *table, int write,
 				loff_t *ppos)
 {
 	int old_min, old_max;
-	static DEFINE_MUTEX(mutex);
 	int result;
 
-	mutex_lock(&mutex);
+	mutex_lock(&uclamp_mutex);
 	old_min = sysctl_sched_uclamp_util_min;
 	old_max = sysctl_sched_uclamp_util_max;
 
@@ -1048,7 +1059,7 @@ int sysctl_sched_uclamp_handler(struct ctl_table *table, int write,
 	sysctl_sched_uclamp_util_min = old_min;
 	sysctl_sched_uclamp_util_max = old_max;
 done:
-	mutex_unlock(&mutex);
+	mutex_unlock(&uclamp_mutex);
 
 	return result;
 }
@@ -1137,6 +1148,8 @@ static void __init init_uclamp(void)
 	unsigned int clamp_id;
 	int cpu;
 
+	mutex_init(&uclamp_mutex);
+
 	for_each_possible_cpu(cpu) {
 		memset(&cpu_rq(cpu)->uclamp, 0, sizeof(struct uclamp_rq));
 		cpu_rq(cpu)->uclamp_flags = 0;
@@ -1153,6 +1166,7 @@ static void __init init_uclamp(void)
 		uclamp_default[clamp_id] = uc_max;
 #ifdef CONFIG_UCLAMP_TASK_GROUP
 		root_task_group.uclamp_req[clamp_id] = uc_max;
+		root_task_group.uclamp[clamp_id] = uc_max;
 #endif
 	}
 }
@@ -6740,6 +6754,7 @@ static inline void alloc_uclamp_sched_group(struct task_group *tg,
 	for_each_clamp_id(clamp_id) {
 		uclamp_se_set(&tg->uclamp_req[clamp_id],
 			      uclamp_none(clamp_id), false);
+		tg->uclamp[clamp_id] = parent->uclamp[clamp_id];
 	}
 #endif
 }
@@ -6990,6 +7005,45 @@ static void cpu_cgroup_attach(struct cgroup_taskset *tset)
 }
 
 #ifdef CONFIG_UCLAMP_TASK_GROUP
+static void cpu_util_update_eff(struct cgroup_subsys_state *css)
+{
+	struct cgroup_subsys_state *top_css = css;
+	struct uclamp_se *uc_parent = NULL;
+	struct uclamp_se *uc_se = NULL;
+	unsigned int eff[UCLAMP_CNT];
+	unsigned int clamp_id;
+	unsigned int clamps;
+
+	css_for_each_descendant_pre(css, top_css) {
+		uc_parent = css_tg(css)->parent
+			? css_tg(css)->parent->uclamp : NULL;
+
+		for_each_clamp_id(clamp_id) {
+			/* Assume effective clamps matches requested clamps */
+			eff[clamp_id] = css_tg(css)->uclamp_req[clamp_id].value;
+			/* Cap effective clamps with parent's effective clamps */
+			if (uc_parent &&
+			    eff[clamp_id] > uc_parent[clamp_id].value) {
+				eff[clamp_id] = uc_parent[clamp_id].value;
+			}
+		}
+		/* Ensure protection is always capped by limit */
+		eff[UCLAMP_MIN] = min(eff[UCLAMP_MIN], eff[UCLAMP_MAX]);
+
+		/* Propagate most restrictive effective clamps */
+		clamps = 0x0;
+		uc_se = css_tg(css)->uclamp;
+		for_each_clamp_id(clamp_id) {
+			if (eff[clamp_id] == uc_se[clamp_id].value)
+				continue;
+			uc_se[clamp_id].value = eff[clamp_id];
+			uc_se[clamp_id].bucket_id = uclamp_bucket_id(eff[clamp_id]);
+			clamps |= (0x1 << clamp_id);
+		}
+		if (!clamps)
+			css = css_rightmost_descendant(css);
+	}
+}
 
 #define _POW10(exp) ((unsigned int)1e##exp)
 #define POW10(exp) _POW10(exp)
@@ -7040,6 +7094,7 @@ static ssize_t cpu_uclamp_min_write(struct kernfs_open_file *of,
 	if (req.ret)
 		return req.ret;
 
+	mutex_lock(&uclamp_mutex);
 	rcu_read_lock();
 
 	tg = css_tg(of_css(of));
@@ -7049,7 +7104,11 @@ static ssize_t cpu_uclamp_min_write(struct kernfs_open_file *of,
 	/* Keep track of the actual requested value */
 	tg->uclamp_pct[UCLAMP_MIN] = req.percent;
 
+	/* Update effective clamps to track the most restrictive value */
+	cpu_util_update_eff(of_css(of));
+
 	rcu_read_unlock();
+	mutex_unlock(&uclamp_mutex);
 
 	return nbytes;
 }
@@ -7065,6 +7124,7 @@ static ssize_t cpu_uclamp_max_write(struct kernfs_open_file *of,
 	if (req.ret)
 		return req.ret;
 
+	mutex_lock(&uclamp_mutex);
 	rcu_read_lock();
 
 	tg = css_tg(of_css(of));
@@ -7074,7 +7134,11 @@ static ssize_t cpu_uclamp_max_write(struct kernfs_open_file *of,
 	/* Keep track of the actual requested value */
 	tg->uclamp_pct[UCLAMP_MAX] = req.percent;
 
+	/* Update effective clamps to track the most restrictive value */
+	cpu_util_update_eff(of_css(of));
+
 	rcu_read_unlock();
+	mutex_unlock(&uclamp_mutex);
 
 	return nbytes;
 }
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index f10557a2dea7..93a030321210 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -399,6 +399,8 @@ struct task_group {
 	unsigned int		uclamp_pct[UCLAMP_CNT];
 	/* Clamp values requested for a task group */
 	struct uclamp_se	uclamp_req[UCLAMP_CNT];
+	/* Effective clamp values used for a task group */
+	struct uclamp_se	uclamp[UCLAMP_CNT];
 #endif
 
 };
-- 
2.22.0

^ permalink raw reply related

* [PATCH v12 1/6] sched/core: uclamp: Extend CPU's cgroup controller
From: Patrick Bellasi @ 2019-07-18 18:17 UTC (permalink / raw)
  To: linux-kernel, linux-pm, linux-api, cgroups
  Cc: Ingo Molnar, Peter Zijlstra, Tejun Heo, Rafael J . Wysocki,
	Vincent Guittot, Viresh Kumar, Paul Turner, Michal Koutny,
	Quentin Perret, Dietmar Eggemann, Morten Rasmussen, Juri Lelli,
	Todd Kjos, Joel Fernandes, Steve Muckle, Suren Baghdasaryan,
	Alessio Balsini
In-Reply-To: <20190718181748.28446-1-patrick.bellasi@arm.com>

The cgroup CPU bandwidth controller allows to assign a specified
(maximum) bandwidth to the tasks of a group. However this bandwidth is
defined and enforced only on a temporal base, without considering the
actual frequency a CPU is running on. Thus, the amount of computation
completed by a task within an allocated bandwidth can be very different
depending on the actual frequency the CPU is running that task.
The amount of computation can be affected also by the specific CPU a
task is running on, especially when running on asymmetric capacity
systems like Arm's big.LITTLE.

With the availability of schedutil, the scheduler is now able
to drive frequency selections based on actual task utilization.
Moreover, the utilization clamping support provides a mechanism to
bias the frequency selection operated by schedutil depending on
constraints assigned to the tasks currently RUNNABLE on a CPU.

Giving the mechanisms described above, it is now possible to extend the
cpu controller to specify the minimum (or maximum) utilization which
should be considered for tasks RUNNABLE on a cpu.
This makes it possible to better defined the actual computational
power assigned to task groups, thus improving the cgroup CPU bandwidth
controller which is currently based just on time constraints.

Extend the CPU controller with a couple of new attributes uclamp.{min,max}
which allow to enforce utilization boosting and capping for all the
tasks in a group.

Specifically:

- uclamp.min: defines the minimum utilization which should be considered
	      i.e. the RUNNABLE tasks of this group will run at least at a
	      	 minimum frequency which corresponds to the uclamp.min
	      	 utilization

- uclamp.max: defines the maximum utilization which should be considered
	      i.e. the RUNNABLE tasks of this group will run up to a
	      	 maximum frequency which corresponds to the uclamp.max
	      	 utilization

These attributes:

a) are available only for non-root nodes, both on default and legacy
   hierarchies, while system wide clamps are defined by a generic
   interface which does not depends on cgroups. This system wide
   interface enforces constraints on tasks in the root node.

b) enforce effective constraints at each level of the hierarchy which
   are a restriction of the group requests considering its parent's
   effective constraints. Root group effective constraints are defined
   by the system wide interface.
   This mechanism allows each (non-root) level of the hierarchy to:
   - request whatever clamp values it would like to get
   - effectively get only up to the maximum amount allowed by its parent

c) have higher priority than task-specific clamps, defined via
   sched_setattr(), thus allowing to control and restrict task requests.

Add two new attributes to the cpu controller to collect "requested"
clamp values. Allow that at each non-root level of the hierarchy.
Validate local consistency by enforcing uclamp.min < uclamp.max.
Keep it simple by not caring now about "effective" values computation
and propagation along the hierarchy.

Signed-off-by: Patrick Bellasi <patrick.bellasi@arm.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Tejun Heo <tj@kernel.org>

---
Changes in v12:
 Message-ID: <20190715133801.yohhd2hywzsv3uyf@e110439-lin>
 - track requested cgroup's percentage to mask conversion rounding to userspace
 - introduce UCLAMP_PERCENT_{SHIFT,SCALE} to avoid hardcoded constants
 - s/uclamp_scale_from_percent()/capacity_from_percent()/
 - move range check from cpu_uclamp_{min,max}_write() to capacity_from_percent()
 Message-ID: <20190718152327.vmnds3kpagh2xz2r@e110439-lin>
 - fix percentage's decimals format string
---
 Documentation/admin-guide/cgroup-v2.rst |  34 +++++
 init/Kconfig                            |  22 +++
 kernel/sched/core.c                     | 175 +++++++++++++++++++++++-
 kernel/sched/sched.h                    |   8 ++
 4 files changed, 238 insertions(+), 1 deletion(-)

diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst
index 3b29005aa981..5f1c266131b0 100644
--- a/Documentation/admin-guide/cgroup-v2.rst
+++ b/Documentation/admin-guide/cgroup-v2.rst
@@ -951,6 +951,13 @@ controller implements weight and absolute bandwidth limit models for
 normal scheduling policy and absolute bandwidth allocation model for
 realtime scheduling policy.
 
+In all the above models, cycles distribution is defined only on a temporal
+base and it does not account for the frequency at which tasks are executed.
+The (optional) utilization clamping support allows to hint the schedutil
+cpufreq governor about the minimum desired frequency which should always be
+provided by a CPU, as well as the maximum desired frequency, which should not
+be exceeded by a CPU.
+
 WARNING: cgroup2 doesn't yet support control of realtime processes and
 the cpu controller can only be enabled when all RT processes are in
 the root cgroup.  Be aware that system management software may already
@@ -1016,6 +1023,33 @@ All time durations are in microseconds.
 	Shows pressure stall information for CPU. See
 	Documentation/accounting/psi.rst for details.
 
+  cpu.uclamp.min
+        A read-write single value file which exists on non-root cgroups.
+        The default is "0", i.e. no utilization boosting.
+
+        The requested minimum utilization (protection) as a percentage
+        rational number, e.g. 12.34 for 12.34%.
+
+        This interface allows reading and setting minimum utilization clamp
+        values similar to the sched_setattr(2). This minimum utilization
+        value is used to clamp the task specific minimum utilization clamp.
+
+        The requested minimum utilization (protection) is always capped by
+        the current value for the maximum utilization (limit), i.e.
+        `cpu.uclamp.max`.
+
+  cpu.uclamp.max
+        A read-write single value file which exists on non-root cgroups.
+        The default is "max". i.e. no utilization capping
+
+        The requested maximum utilization (limit) as a percentage rational
+        number, e.g. 98.76 for 98.76%.
+
+        This interface allows reading and setting maximum utilization clamp
+        values similar to the sched_setattr(2). This maximum utilization
+        value is used to clamp the task specific maximum utilization clamp.
+
+
 
 Memory
 ------
diff --git a/init/Kconfig b/init/Kconfig
index bd7d650d4a99..ac285cfa78b6 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -928,6 +928,28 @@ config RT_GROUP_SCHED
 
 endif #CGROUP_SCHED
 
+config UCLAMP_TASK_GROUP
+	bool "Utilization clamping per group of tasks"
+	depends on CGROUP_SCHED
+	depends on UCLAMP_TASK
+	default n
+	help
+	  This feature enables the scheduler to track the clamped utilization
+	  of each CPU based on RUNNABLE tasks currently scheduled on that CPU.
+
+	  When this option is enabled, the user can specify a min and max
+	  CPU bandwidth which is allowed for each single task in a group.
+	  The max bandwidth allows to clamp the maximum frequency a task
+	  can use, while the min bandwidth allows to define a minimum
+	  frequency a task will always use.
+
+	  When task group based utilization clamping is enabled, an eventually
+	  specified task-specific clamp value is constrained by the cgroup
+	  specified clamp value. Both minimum and maximum task clamping cannot
+	  be bigger than the corresponding clamping defined at task group level.
+
+	  If in doubt, say N.
+
 config CGROUP_PIDS
 	bool "PIDs controller"
 	help
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 2b037f195473..fcc32afe53cb 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1149,8 +1149,12 @@ static void __init init_uclamp(void)
 
 	/* System defaults allow max clamp values for both indexes */
 	uclamp_se_set(&uc_max, uclamp_none(UCLAMP_MAX), false);
-	for_each_clamp_id(clamp_id)
+	for_each_clamp_id(clamp_id) {
 		uclamp_default[clamp_id] = uc_max;
+#ifdef CONFIG_UCLAMP_TASK_GROUP
+		root_task_group.uclamp_req[clamp_id] = uc_max;
+#endif
+	}
 }
 
 #else /* CONFIG_UCLAMP_TASK */
@@ -6727,6 +6731,19 @@ void ia64_set_curr_task(int cpu, struct task_struct *p)
 /* task_group_lock serializes the addition/removal of task groups */
 static DEFINE_SPINLOCK(task_group_lock);
 
+static inline void alloc_uclamp_sched_group(struct task_group *tg,
+					    struct task_group *parent)
+{
+#ifdef CONFIG_UCLAMP_TASK_GROUP
+	int clamp_id;
+
+	for_each_clamp_id(clamp_id) {
+		uclamp_se_set(&tg->uclamp_req[clamp_id],
+			      uclamp_none(clamp_id), false);
+	}
+#endif
+}
+
 static void sched_free_group(struct task_group *tg)
 {
 	free_fair_sched_group(tg);
@@ -6750,6 +6767,8 @@ struct task_group *sched_create_group(struct task_group *parent)
 	if (!alloc_rt_sched_group(tg, parent))
 		goto err;
 
+	alloc_uclamp_sched_group(tg, parent);
+
 	return tg;
 
 err:
@@ -6970,6 +6989,132 @@ static void cpu_cgroup_attach(struct cgroup_taskset *tset)
 		sched_move_task(task);
 }
 
+#ifdef CONFIG_UCLAMP_TASK_GROUP
+
+#define _POW10(exp) ((unsigned int)1e##exp)
+#define POW10(exp) _POW10(exp)
+
+struct uclamp_request {
+#define UCLAMP_PERCENT_SHIFT	2
+#define UCLAMP_PERCENT_SCALE	(100 * POW10(UCLAMP_PERCENT_SHIFT))
+	s64 percent;
+	u64 util;
+	int ret;
+};
+
+static inline struct uclamp_request
+capacity_from_percent(char *buf)
+{
+	struct uclamp_request req = {
+		.percent = UCLAMP_PERCENT_SCALE,
+		.util = SCHED_CAPACITY_SCALE,
+		.ret = 0,
+	};
+
+	buf = strim(buf);
+	if (strncmp("max", buf, 4)) {
+		req.ret = cgroup_parse_float(buf, UCLAMP_PERCENT_SHIFT,
+					     &req.percent);
+		if (req.ret)
+			return req;
+		if (req.percent > UCLAMP_PERCENT_SCALE) {
+			req.ret = -ERANGE;
+			return req;
+		}
+
+		req.util = req.percent << SCHED_CAPACITY_SHIFT;
+		req.util = DIV_ROUND_CLOSEST_ULL(req.util, UCLAMP_PERCENT_SCALE);
+	}
+
+	return req;
+}
+
+static ssize_t cpu_uclamp_min_write(struct kernfs_open_file *of,
+				    char *buf, size_t nbytes,
+				    loff_t off)
+{
+	struct uclamp_request req;
+	struct task_group *tg;
+
+	req = capacity_from_percent(buf);
+	if (req.ret)
+		return req.ret;
+
+	rcu_read_lock();
+
+	tg = css_tg(of_css(of));
+	if (tg->uclamp_req[UCLAMP_MIN].value != req.util)
+		uclamp_se_set(&tg->uclamp_req[UCLAMP_MIN], req.util, false);
+
+	/* Keep track of the actual requested value */
+	tg->uclamp_pct[UCLAMP_MIN] = req.percent;
+
+	rcu_read_unlock();
+
+	return nbytes;
+}
+
+static ssize_t cpu_uclamp_max_write(struct kernfs_open_file *of,
+				    char *buf, size_t nbytes,
+				    loff_t off)
+{
+	struct uclamp_request req;
+	struct task_group *tg;
+
+	req = capacity_from_percent(buf);
+	if (req.ret)
+		return req.ret;
+
+	rcu_read_lock();
+
+	tg = css_tg(of_css(of));
+	if (tg->uclamp_req[UCLAMP_MAX].value != req.util)
+		uclamp_se_set(&tg->uclamp_req[UCLAMP_MAX], req.util, false);
+
+	/* Keep track of the actual requested value */
+	tg->uclamp_pct[UCLAMP_MAX] = req.percent;
+
+	rcu_read_unlock();
+
+	return nbytes;
+}
+
+static inline void cpu_uclamp_print(struct seq_file *sf,
+				    enum uclamp_id clamp_id)
+{
+	struct task_group *tg;
+	u64 util_clamp;
+	u64 percent;
+	u32 rem;
+
+	rcu_read_lock();
+	tg = css_tg(seq_css(sf));
+	util_clamp = tg->uclamp_req[clamp_id].value;
+	rcu_read_unlock();
+
+	if (util_clamp == SCHED_CAPACITY_SCALE) {
+		seq_puts(sf, "max\n");
+		return;
+	}
+
+	percent = tg->uclamp_pct[clamp_id];
+	percent = div_u64_rem(percent, POW10(UCLAMP_PERCENT_SHIFT), &rem);
+	seq_printf(sf, "%llu.%0*u\n", percent, UCLAMP_PERCENT_SHIFT, rem);
+}
+
+static int cpu_uclamp_min_show(struct seq_file *sf, void *v)
+{
+	cpu_uclamp_print(sf, UCLAMP_MIN);
+	return 0;
+}
+
+static int cpu_uclamp_max_show(struct seq_file *sf, void *v)
+{
+	cpu_uclamp_print(sf, UCLAMP_MAX);
+	return 0;
+}
+#endif /* CONFIG_UCLAMP_TASK_GROUP */
+
 #ifdef CONFIG_FAIR_GROUP_SCHED
 static int cpu_shares_write_u64(struct cgroup_subsys_state *css,
 				struct cftype *cftype, u64 shareval)
@@ -7314,6 +7459,20 @@ static struct cftype cpu_legacy_files[] = {
 		.read_u64 = cpu_rt_period_read_uint,
 		.write_u64 = cpu_rt_period_write_uint,
 	},
+#endif
+#ifdef CONFIG_UCLAMP_TASK_GROUP
+	{
+		.name = "uclamp.min",
+		.flags = CFTYPE_NOT_ON_ROOT,
+		.seq_show = cpu_uclamp_min_show,
+		.write = cpu_uclamp_min_write,
+	},
+	{
+		.name = "uclamp.max",
+		.flags = CFTYPE_NOT_ON_ROOT,
+		.seq_show = cpu_uclamp_max_show,
+		.write = cpu_uclamp_max_write,
+	},
 #endif
 	{ }	/* Terminate */
 };
@@ -7481,6 +7640,20 @@ static struct cftype cpu_files[] = {
 		.seq_show = cpu_max_show,
 		.write = cpu_max_write,
 	},
+#endif
+#ifdef CONFIG_UCLAMP_TASK_GROUP
+	{
+		.name = "uclamp.min",
+		.flags = CFTYPE_NOT_ON_ROOT,
+		.seq_show = cpu_uclamp_min_show,
+		.write = cpu_uclamp_min_write,
+	},
+	{
+		.name = "uclamp.max",
+		.flags = CFTYPE_NOT_ON_ROOT,
+		.seq_show = cpu_uclamp_max_show,
+		.write = cpu_uclamp_max_write,
+	},
 #endif
 	{ }	/* terminate */
 };
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 802b1f3405f2..f10557a2dea7 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -393,6 +393,14 @@ struct task_group {
 #endif
 
 	struct cfs_bandwidth	cfs_bandwidth;
+
+#ifdef CONFIG_UCLAMP_TASK_GROUP
+	/* The two decimal precision [%] value requested from user-space */
+	unsigned int		uclamp_pct[UCLAMP_CNT];
+	/* Clamp values requested for a task group */
+	struct uclamp_se	uclamp_req[UCLAMP_CNT];
+#endif
+
 };
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
-- 
2.22.0

^ permalink raw reply related

* [PATCH v12 0/6] Add utilization clamping support (CGroups API)
From: Patrick Bellasi @ 2019-07-18 18:17 UTC (permalink / raw)
  To: linux-kernel, linux-pm, linux-api, cgroups
  Cc: Ingo Molnar, Peter Zijlstra, Tejun Heo, Rafael J . Wysocki,
	Vincent Guittot, Viresh Kumar, Paul Turner, Michal Koutny,
	Quentin Perret, Dietmar Eggemann, Morten Rasmussen, Juri Lelli,
	Todd Kjos, Joel Fernandes, Steve Muckle, Suren Baghdasaryan,
	Alessio Balsini

Hi all, this is a respin of:

  https://lore.kernel.org/lkml/20190708084357.12944-1-patrick.bellasi@arm.com/

which addresses all the comments collected so far:

- track requested cgroup's percentage to mask conversion rounding to userspace
- use a dedicated variable for parent restrictions
- make more explicit in the documentation that the requested "protection" is
  always capped by the requested "limit"
- use the newly added uclamp_mutex to serialize the sysfs write callback
- add missing RCU read locks across cpu_util_update_eff() call from
  uclamp_update_root_tg()
- remove not required and confusing sentence from the above changelog
- add a new patch to always use enum uclamp_id for clamp_id values
- fix percentage's decimals format string

as well as adds some small modifications:

- introduce UCLAMP_PERCENT_{SHIFT,SCALE} to avoid hardcoded constants
- s/uclamp_scale_from_percent()/capacity_from_percent()/
- move range check from cpu_uclamp_{min,max}_write() to capacity_from_percent()

The series is based on top of today's Linus master branch (wip for 5.3-rc1):

  commit 22051d9c4a57 ("Merge tag 'platform-drivers-x86-v5.3-2' of git://git.infradead.org/linux-platform-drivers-x86")

Thanks Quentin, Michal and Tejun for your review comments!

This has been the first code review targeting specifically the cgroups bits and
the series is now hopefully in a better shape.

Looking forward for any additional comments! ;)

Cheers,
Patrick

Series Organization
===================

The full tree is available here:

   git://linux-arm.org/linux-pb.git   lkml/utilclamp_v12
   http://www.linux-arm.org/git?p=linux-pb.git;a=shortlog;h=refs/heads/lkml/utilclamp_v12


Newcomer's Short Abstract
=========================

The Linux scheduler tracks a "utilization" signal for each scheduling entity
(SE), e.g. tasks, to know how much CPU time they use. This signal allows the
scheduler to know how "big" a task is and, in principle, it can support
advanced task placement strategies by selecting the best CPU to run a task.
Some of these strategies are represented by the Energy Aware Scheduler [1].

When the schedutil cpufreq governor is in use, the utilization signal allows
the Linux scheduler to also drive frequency selection. The CPU utilization
signal, which represents the aggregated utilization of tasks scheduled on that
CPU, is used to select the frequency which best fits the workload generated by
the tasks.

The current translation of utilization values into a frequency selection is
simple: we go to max for RT tasks or to the minimum frequency which can
accommodate the utilization of DL+FAIR tasks.
However, utilization values by themselves cannot convey the desired
power/performance behaviors of each task as intended by user-space.
As such they are not ideally suited for task placement decisions.

Task placement and frequency selection policies in the kernel can be improved
by taking into consideration hints coming from authorized user-space elements,
like for example the Android middleware or more generally any "System
Management Software" (SMS) framework.

Utilization clamping is a mechanism which allows to "clamp" (i.e. filter) the
utilization generated by RT and FAIR tasks within a range defined by user-space.
The clamped utilization value can then be used, for example, to enforce a
minimum and/or maximum frequency depending on which tasks are active on a CPU.

The main use-cases for utilization clamping are:

 - boosting: better interactive response for small tasks which
   are affecting the user experience.

   Consider for example the case of a small control thread for an external
   accelerator (e.g. GPU, DSP, other devices). Here, from the task utilization
   the scheduler does not have a complete view of what the task's requirements
   are and, if it's a small utilization task, it keeps selecting a more energy
   efficient CPU, with smaller capacity and lower frequency, thus negatively
   impacting the overall time required to complete task activations.

 - capping: increase energy efficiency for background tasks not affecting the
   user experience.

   Since running on a lower capacity CPU at a lower frequency is more energy
   efficient, when the completion time is not a main goal, then capping the
   utilization considered for certain (maybe big) tasks can have positive
   effects, both on energy consumption and thermal headroom.
   This feature allows also to make RT tasks more energy friendly on mobile
   systems where running them on high capacity CPUs and at the maximum
   frequency is not required.

>From these two use-cases, it's worth noticing that frequency selection
biasing, introduced by patches 9 and 10 of this series, is just one possible
usage of utilization clamping. Another compelling extension of utilization
clamping is in helping the scheduler in making tasks placement decisions.

Utilization is (also) a task specific property the scheduler uses to know
how much CPU bandwidth a task requires, at least as long as there is idle time.
Thus, the utilization clamp values, defined either per-task or per-task_group,
can represent tasks to the scheduler as being bigger (or smaller) than what
they actually are.

Utilization clamping thus enables interesting additional optimizations, for
example on asymmetric capacity systems like Arm big.LITTLE and DynamIQ CPUs,
where:

 - boosting: try to run small/foreground tasks on higher-capacity CPUs to
   complete them faster despite being less energy efficient.

 - capping: try to run big/background tasks on low-capacity CPUs to save power
   and thermal headroom for more important tasks

This series does not present this additional usage of utilization clamping but
it's an integral part of the EAS feature set, where [2] is one of its main
components.

Android kernels use SchedTune, a solution similar to utilization clamping, to
bias both 'frequency selection' and 'task placement'. This series provides the
foundation to add similar features to mainline while focusing, for the
time being, just on schedutil integration.


References
==========

[1] Energy Aware Scheduling
    https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/Documentation/scheduler/sched-energy.txt?h=v5.1

[2] Expressing per-task/per-cgroup performance hints
    Linux Plumbers Conference 2018
    https://linuxplumbersconf.org/event/2/contributions/128/


Patrick Bellasi (6):
  sched/core: uclamp: Extend CPU's cgroup controller
  sched/core: uclamp: Propagate parent clamps
  sched/core: uclamp: Propagate system defaults to root group
  sched/core: uclamp: Use TG's clamps to restrict TASK's clamps
  sched/core: uclamp: Update CPU's refcount on TG's clamp changes
  sched/core: uclamp: always use enum uclamp_id for clamp_id values

 Documentation/admin-guide/cgroup-v2.rst |  34 +++
 init/Kconfig                            |  22 ++
 kernel/sched/core.c                     | 382 ++++++++++++++++++++++--
 kernel/sched/sched.h                    |  12 +-
 4 files changed, 430 insertions(+), 20 deletions(-)

-- 
2.22.0

^ permalink raw reply

* [v3 PATCH 2/2] mm: mempolicy: handle vma with unmovable pages mapped correctly in mbind
From: Yang Shi @ 2019-07-18 17:17 UTC (permalink / raw)
  To: vbabka, mhocko, mgorman, akpm; +Cc: yang.shi, linux-mm, linux-kernel, linux-api
In-Reply-To: <1563470274-52126-1-git-send-email-yang.shi@linux.alibaba.com>

When running syzkaller internally, we ran into the below bug on 4.9.x
kernel:

kernel BUG at mm/huge_memory.c:2124!
invalid opcode: 0000 [#1] SMP KASAN
Dumping ftrace buffer:
   (ftrace buffer empty)
Modules linked in:
CPU: 0 PID: 1518 Comm: syz-executor107 Not tainted 4.9.168+ #2
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 0.5.1 01/01/2011
task: ffff880067b34900 task.stack: ffff880068998000
RIP: 0010:[<ffffffff81895d6b>]  [<ffffffff81895d6b>] split_huge_page_to_list+0x8fb/0x1030 mm/huge_memory.c:2124
RSP: 0018:ffff88006899f980  EFLAGS: 00010286
RAX: 0000000000000000 RBX: ffffea00018f1700 RCX: 0000000000000000
RDX: 1ffffd400031e2e7 RSI: 0000000000000001 RDI: ffffea00018f1738
RBP: ffff88006899f9e8 R08: 0000000000000001 R09: 0000000000000000
R10: 0000000000000000 R11: fffffbfff0d8b13e R12: ffffea00018f1400
R13: ffffea00018f1400 R14: ffffea00018f1720 R15: ffffea00018f1401
FS:  00007fa333996740(0000) GS:ffff88006c600000(0000) knlGS:0000000000000000
CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 0000000020000040 CR3: 0000000066b9c000 CR4: 00000000000606f0
Stack:
 0000000000000246 ffff880067b34900 0000000000000000 ffff88007ffdc000
 0000000000000000 ffff88006899f9e8 ffffffff812b4015 ffff880064c64e18
 ffffea00018f1401 dffffc0000000000 ffffea00018f1700 0000000020ffd000
Call Trace:
 [<ffffffff818490f1>] split_huge_page include/linux/huge_mm.h:100 [inline]
 [<ffffffff818490f1>] queue_pages_pte_range+0x7e1/0x1480 mm/mempolicy.c:538
 [<ffffffff817ed0da>] walk_pmd_range mm/pagewalk.c:50 [inline]
 [<ffffffff817ed0da>] walk_pud_range mm/pagewalk.c:90 [inline]
 [<ffffffff817ed0da>] walk_pgd_range mm/pagewalk.c:116 [inline]
 [<ffffffff817ed0da>] __walk_page_range+0x44a/0xdb0 mm/pagewalk.c:208
 [<ffffffff817edb94>] walk_page_range+0x154/0x370 mm/pagewalk.c:285
 [<ffffffff81844515>] queue_pages_range+0x115/0x150 mm/mempolicy.c:694
 [<ffffffff8184f493>] do_mbind mm/mempolicy.c:1241 [inline]
 [<ffffffff8184f493>] SYSC_mbind+0x3c3/0x1030 mm/mempolicy.c:1370
 [<ffffffff81850146>] SyS_mbind+0x46/0x60 mm/mempolicy.c:1352
 [<ffffffff810097e2>] do_syscall_64+0x1d2/0x600 arch/x86/entry/common.c:282
 [<ffffffff82ff6f93>] entry_SYSCALL_64_after_swapgs+0x5d/0xdb
Code: c7 80 1c 02 00 e8 26 0a 76 01 <0f> 0b 48 c7 c7 40 46 45 84 e8 4c
RIP  [<ffffffff81895d6b>] split_huge_page_to_list+0x8fb/0x1030 mm/huge_memory.c:2124
 RSP <ffff88006899f980>

with the below test:

---8<---

uint64_t r[1] = {0xffffffffffffffff};

int main(void)
{
        syscall(__NR_mmap, 0x20000000, 0x1000000, 3, 0x32, -1, 0);
                                intptr_t res = 0;
        res = syscall(__NR_socket, 0x11, 3, 0x300);
        if (res != -1)
                r[0] = res;
*(uint32_t*)0x20000040 = 0x10000;
*(uint32_t*)0x20000044 = 1;
*(uint32_t*)0x20000048 = 0xc520;
*(uint32_t*)0x2000004c = 1;
        syscall(__NR_setsockopt, r[0], 0x107, 0xd, 0x20000040, 0x10);
        syscall(__NR_mmap, 0x20fed000, 0x10000, 0, 0x8811, r[0], 0);
*(uint64_t*)0x20000340 = 2;
        syscall(__NR_mbind, 0x20ff9000, 0x4000, 0x4002, 0x20000340,
0x45d4, 3);
        return 0;
}

---8<---

Actually the test does:

mmap(0x20000000, 16777216, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_ANONYMOUS, -1, 0) = 0x20000000
socket(AF_PACKET, SOCK_RAW, 768)        = 3
setsockopt(3, SOL_PACKET, PACKET_TX_RING, {block_size=65536, block_nr=1, frame_size=50464, frame_nr=1}, 16) = 0
mmap(0x20fed000, 65536, PROT_NONE, MAP_SHARED|MAP_FIXED|MAP_POPULATE|MAP_DENYWRITE, 3, 0) = 0x20fed000
mbind(..., MPOL_MF_STRICT|MPOL_MF_MOVE) = 0

The setsockopt() would allocate compound pages (16 pages in this test)
for packet tx ring, then the mmap() would call packet_mmap() to map the
pages into the user address space specified by the mmap() call.

When calling mbind(), it would scan the vma to queue the pages for
migration to the new node.  It would split any huge page since 4.9
doesn't support THP migration, however, the packet tx ring compound
pages are not THP and even not movable.  So, the above bug is triggered.

However, the later kernel is not hit by this issue due to the
commit d44d363f65780f2ac2 ("mm: don't assume anonymous pages have
SwapBacked flag"), which just removes the PageSwapBacked check for a
different reason.

But, there is a deeper issue.  According to the semantic of mbind(), it
should return -EIO if MPOL_MF_MOVE or MPOL_MF_MOVE_ALL was specified and
MPOL_MF_STRICT was also specified, but the kernel was unable to move
all existing pages in the range.  The tx ring of the packet socket is
definitely not movable, however, mbind() returns success for this case.

Although the most socket file associates with non-movable pages, but XDP
may have movable pages from gup.  So, it sounds not fine to just check
the underlying file type of vma in vma_migratable().

Change migrate_page_add() to check if the page is movable or not, if it
is unmovable, just return -EIO.  But do not abort pte walk immediately,
since there may be pages off LRU temporarily.  We should migrate other
pages if MPOL_MF_MOVE* is specified.  Set has_unmovable flag if some
paged could not be not moved, then return -EIO for mbind() eventually.

With this change the above test would return -EIO as expected.

Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mel Gorman <mgorman@techsingularity.net>
Signed-off-by: Yang Shi <yang.shi@linux.alibaba.com>
---
 mm/mempolicy.c | 34 +++++++++++++++++++++++++++-------
 1 file changed, 27 insertions(+), 7 deletions(-)

diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 0e73cc7..1d689fb 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -403,7 +403,7 @@ void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
 	},
 };
 
-static void migrate_page_add(struct page *page, struct list_head *pagelist,
+static int migrate_page_add(struct page *page, struct list_head *pagelist,
 				unsigned long flags);
 
 struct queue_pages {
@@ -463,12 +463,11 @@ static int queue_pages_pmd(pmd_t *pmd, spinlock_t *ptl, unsigned long addr,
 	flags = qp->flags;
 	/* go to thp migration */
 	if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
-		if (!vma_migratable(walk->vma)) {
+		if (!vma_migratable(walk->vma) ||
+		    migrate_page_add(page, qp->pagelist, flags)) {
 			ret = 1;
 			goto unlock;
 		}
-
-		migrate_page_add(page, qp->pagelist, flags);
 	} else
 		ret = -EIO;
 unlock:
@@ -532,7 +531,14 @@ static int queue_pages_pte_range(pmd_t *pmd, unsigned long addr,
 				has_unmovable |= true;
 				break;
 			}
-			migrate_page_add(page, qp->pagelist, flags);
+
+			/*
+			 * Do not abort immediately since there may be
+			 * temporary off LRU pages in the range.  Still
+			 * need migrate other LRU pages.
+			 */
+			if (migrate_page_add(page, qp->pagelist, flags))
+				has_unmovable |= true;
 		} else
 			break;
 	}
@@ -961,10 +967,21 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask,
 /*
  * page migration, thp tail pages can be passed.
  */
-static void migrate_page_add(struct page *page, struct list_head *pagelist,
+static int migrate_page_add(struct page *page, struct list_head *pagelist,
 				unsigned long flags)
 {
 	struct page *head = compound_head(page);
+
+	/*
+	 * Non-movable page may reach here.  And, there may be
+	 * temporary off LRU pages or non-LRU movable pages.
+	 * Treat them as unmovable pages since they can't be
+	 * isolated, so they can't be moved at the moment.  It
+	 * should return -EIO for this case too.
+	 */
+	if (!PageLRU(head) && (flags & MPOL_MF_STRICT))
+		return -EIO;
+
 	/*
 	 * Avoid migrating a page that is shared with others.
 	 */
@@ -976,6 +993,8 @@ static void migrate_page_add(struct page *page, struct list_head *pagelist,
 				hpage_nr_pages(head));
 		}
 	}
+
+	return 0;
 }
 
 /* page allocation callback for NUMA node migration */
@@ -1178,9 +1197,10 @@ static struct page *new_page(struct page *page, unsigned long start)
 }
 #else
 
-static void migrate_page_add(struct page *page, struct list_head *pagelist,
+static int migrate_page_add(struct page *page, struct list_head *pagelist,
 				unsigned long flags)
 {
+	return -EIO;
 }
 
 int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
-- 
1.8.3.1

^ permalink raw reply related

* [v3 PATCH 1/2] mm: mempolicy: make the behavior consistent when MPOL_MF_MOVE* and MPOL_MF_STRICT were specified
From: Yang Shi @ 2019-07-18 17:17 UTC (permalink / raw)
  To: vbabka, mhocko, mgorman, akpm; +Cc: yang.shi, linux-mm, linux-kernel, linux-api
In-Reply-To: <1563470274-52126-1-git-send-email-yang.shi@linux.alibaba.com>

When both MPOL_MF_MOVE* and MPOL_MF_STRICT was specified, mbind() should
try best to migrate misplaced pages, if some of the pages could not be
migrated, then return -EIO.

There are three different sub-cases:
1. vma is not migratable
2. vma is migratable, but there are unmovable pages
3. vma is migratable, pages are movable, but migrate_pages() fails

If #1 happens, kernel would just abort immediately, then return -EIO,
after the commit a7f40cfe3b7ada57af9b62fd28430eeb4a7cfcb7 ("mm:
mempolicy: make mbind() return -EIO when MPOL_MF_STRICT is specified").

If #3 happens, kernel would set policy and migrate pages with best-effort,
but won't rollback the migrated pages and reset the policy back.

Before that commit, they behaves in the same way.  It'd better to keep
their behavior consistent.  But, rolling back the migrated pages and
resetting the policy back sounds not feasible, so just make #1 behave as
same as #3.

Userspace will know that not everything was successfully migrated (via
-EIO), and can take whatever steps it deems necessary - attempt rollback,
determine which exact page(s) are violating the policy, etc.

Make queue_pages_range() return 1 to indicate there are unmovable pages
or vma is not migratable.

The #2 is not handled correctly in the current kernel, the following
patch will fix it.

Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mel Gorman <mgorman@techsingularity.net>
Signed-off-by: Yang Shi <yang.shi@linux.alibaba.com>
---
 mm/mempolicy.c | 66 +++++++++++++++++++++++++++++++++++++++++-----------------
 1 file changed, 47 insertions(+), 19 deletions(-)

diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index f48693f..0e73cc7 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -429,11 +429,14 @@ static inline bool queue_pages_required(struct page *page,
 }
 
 /*
- * queue_pages_pmd() has three possible return values:
- * 1 - pages are placed on the right node or queued successfully.
- * 0 - THP was split.
- * -EIO - is migration entry or MPOL_MF_STRICT was specified and an existing
- *        page was already on a node that does not follow the policy.
+ * queue_pages_pmd() has four possible return values:
+ * 0 - pages are placed on the right node or queued successfully.
+ * 1 - there is unmovable page, and MPOL_MF_MOVE* & MPOL_MF_STRICT were
+ *     specified.
+ * 2 - THP was split.
+ * -EIO - is migration entry or only MPOL_MF_STRICT was specified and an
+ *        existing page was already on a node that does not follow the
+ *        policy.
  */
 static int queue_pages_pmd(pmd_t *pmd, spinlock_t *ptl, unsigned long addr,
 				unsigned long end, struct mm_walk *walk)
@@ -451,19 +454,17 @@ static int queue_pages_pmd(pmd_t *pmd, spinlock_t *ptl, unsigned long addr,
 	if (is_huge_zero_page(page)) {
 		spin_unlock(ptl);
 		__split_huge_pmd(walk->vma, pmd, addr, false, NULL);
+		ret = 2;
 		goto out;
 	}
-	if (!queue_pages_required(page, qp)) {
-		ret = 1;
+	if (!queue_pages_required(page, qp))
 		goto unlock;
-	}
 
-	ret = 1;
 	flags = qp->flags;
 	/* go to thp migration */
 	if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
 		if (!vma_migratable(walk->vma)) {
-			ret = -EIO;
+			ret = 1;
 			goto unlock;
 		}
 
@@ -479,6 +480,13 @@ static int queue_pages_pmd(pmd_t *pmd, spinlock_t *ptl, unsigned long addr,
 /*
  * Scan through pages checking if pages follow certain conditions,
  * and move them to the pagelist if they do.
+ *
+ * queue_pages_pte_range() has three possible return values:
+ * 0 - pages are placed on the right node or queued successfully.
+ * 1 - there is unmovable page, and MPOL_MF_MOVE* & MPOL_MF_STRICT were
+ *     specified.
+ * -EIO - only MPOL_MF_STRICT was specified and an existing page was already
+ *        on a node that does not follow the policy.
  */
 static int queue_pages_pte_range(pmd_t *pmd, unsigned long addr,
 			unsigned long end, struct mm_walk *walk)
@@ -488,15 +496,15 @@ static int queue_pages_pte_range(pmd_t *pmd, unsigned long addr,
 	struct queue_pages *qp = walk->private;
 	unsigned long flags = qp->flags;
 	int ret;
+	bool has_unmovable = false;
 	pte_t *pte;
 	spinlock_t *ptl;
 
 	ptl = pmd_trans_huge_lock(pmd, vma);
 	if (ptl) {
 		ret = queue_pages_pmd(pmd, ptl, addr, end, walk);
-		if (ret > 0)
-			return 0;
-		else if (ret < 0)
+		/* THP was split, fall through to pte walk */
+		if (ret != 2)
 			return ret;
 	}
 
@@ -519,14 +527,21 @@ static int queue_pages_pte_range(pmd_t *pmd, unsigned long addr,
 		if (!queue_pages_required(page, qp))
 			continue;
 		if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
-			if (!vma_migratable(vma))
+			/* MPOL_MF_STRICT must be specified if we get here */
+			if (!vma_migratable(vma)) {
+				has_unmovable |= true;
 				break;
+			}
 			migrate_page_add(page, qp->pagelist, flags);
 		} else
 			break;
 	}
 	pte_unmap_unlock(pte - 1, ptl);
 	cond_resched();
+
+	if (has_unmovable)
+		return 1;
+
 	return addr != end ? -EIO : 0;
 }
 
@@ -639,7 +654,13 @@ static int queue_pages_test_walk(unsigned long start, unsigned long end,
  *
  * If pages found in a given range are on a set of nodes (determined by
  * @nodes and @flags,) it's isolated and queued to the pagelist which is
- * passed via @private.)
+ * passed via @private.
+ *
+ * queue_pages_range() has three possible return values:
+ * 1 - there is unmovable page, but MPOL_MF_MOVE* & MPOL_MF_STRICT were
+ *     specified.
+ * 0 - queue pages successfully or no misplaced page.
+ * -EIO - there is misplaced page and only MPOL_MF_STRICT was specified.
  */
 static int
 queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end,
@@ -1182,6 +1203,7 @@ static long do_mbind(unsigned long start, unsigned long len,
 	struct mempolicy *new;
 	unsigned long end;
 	int err;
+	int ret;
 	LIST_HEAD(pagelist);
 
 	if (flags & ~(unsigned long)MPOL_MF_VALID)
@@ -1243,10 +1265,15 @@ static long do_mbind(unsigned long start, unsigned long len,
 	if (err)
 		goto mpol_out;
 
-	err = queue_pages_range(mm, start, end, nmask,
+	ret = queue_pages_range(mm, start, end, nmask,
 			  flags | MPOL_MF_INVERT, &pagelist);
-	if (!err)
-		err = mbind_range(mm, start, end, new);
+
+	if (ret < 0) {
+		err = -EIO;
+		goto up_out;
+	}
+
+	err = mbind_range(mm, start, end, new);
 
 	if (!err) {
 		int nr_failed = 0;
@@ -1259,11 +1286,12 @@ static long do_mbind(unsigned long start, unsigned long len,
 				putback_movable_pages(&pagelist);
 		}
 
-		if (nr_failed && (flags & MPOL_MF_STRICT))
+		if ((ret > 0) || (nr_failed && (flags & MPOL_MF_STRICT)))
 			err = -EIO;
 	} else
 		putback_movable_pages(&pagelist);
 
+up_out:
 	up_write(&mm->mmap_sem);
  mpol_out:
 	mpol_put(new);
-- 
1.8.3.1

^ permalink raw reply related

* [v3 PATCH 0/2] mm: mempolicy: fix mbind()'s inconsistent behavior for unmovable pages
From: Yang Shi @ 2019-07-18 17:17 UTC (permalink / raw)
  To: vbabka, mhocko, mgorman, akpm; +Cc: yang.shi, linux-mm, linux-kernel, linux-api


Changelog
v3: * Adopted the suggestions from Vastimil.  Saved another 20 lines.
      Using flag in struct queue_pages looks not outperform renumbering retval
      too much since we still have to return 1 to tell the caller there are
      unmovable pages.  So just renumber the retval.
    * Manpage is not very clear about shared pages when MPOL_MF_MOVE is
      specified, just leave it as it is for now till it gets clarified.
v2: * Fixed the inconsistent behavior by not aborting !vma_migratable()
      immediately by a separate patch (patch 1/2), and this is also the
      preparation for patch 2/2. For the details please see the commit
      log.  Per Vlastimil.
    * Not abort immediately if unmovable page is met. This should handle
      non-LRU movable pages and temporary off-LRU pages more friendly.
      Per Vlastimil and Michal Hocko.


Yang Shi (2):
      mm: mempolicy: make the behavior consistent when MPOL_MF_MOVE* and MPOL_MF_STRICT were specified
      mm: mempolicy: handle vma with unmovable pages mapped correctly in mbind

 mm/mempolicy.c | 100 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++--------------------
 1 file changed, 74 insertions(+), 26 deletions(-)

^ permalink raw reply

* Re: [PATCH v9 08/10] open: openat2(2) syscall
From: Aleksa Sarai @ 2019-07-18 16:12 UTC (permalink / raw)
  To: Arnd Bergmann
  Cc: Al Viro, Jeff Layton, J. Bruce Fields, David Howells, Shuah Khan,
	Shuah Khan, Christian Brauner, Eric Biederman, Andy Lutomirski,
	Andrew Morton, Alexei Starovoitov, Kees Cook, Jann Horn,
	Tycho Andersen, David Drysdale, Chanho Min, Oleg Nesterov,
	Aleksa Sarai, Linus Torvalds, containers, alpha <linux-alp>
In-Reply-To: <CAK8P3a33rGhPDFfRBAQyLTMG_WoEgX_toDgWR2O7rSwxKsZG+w@mail.gmail.com>

[-- Attachment #1: Type: text/plain, Size: 5083 bytes --]

On 2019-07-18, Arnd Bergmann <arnd@arndb.de> wrote:
> On Sat, Jul 6, 2019 at 5:00 PM Aleksa Sarai <cyphar@cyphar.com> wrote:
> 
> > diff --git a/arch/alpha/kernel/syscalls/syscall.tbl b/arch/alpha/kernel/syscalls/syscall.tbl
> > index 9e7704e44f6d..1703d048c141 100644
> > --- a/arch/alpha/kernel/syscalls/syscall.tbl
> > +++ b/arch/alpha/kernel/syscalls/syscall.tbl
> > @@ -461,6 +461,7 @@
> >  530    common  getegid                         sys_getegid
> >  531    common  geteuid                         sys_geteuid
> >  532    common  getppid                         sys_getppid
> > +533    common  openat2                         sys_openat2
> >  # all other architectures have common numbers for new syscall, alpha
> >  # is the exception.
> >  534    common  pidfd_send_signal               sys_pidfd_send_signal
> 
> My plan here was to add new syscalls in the same order as everwhere else,
> just with the number 110 higher. In the long run, I hope we can automate
> this.

Alright, I will adjust this.

> > diff --git a/arch/arm/tools/syscall.tbl b/arch/arm/tools/syscall.tbl
> > index aaf479a9e92d..4ad262698396 100644
> > --- a/arch/arm/tools/syscall.tbl
> > +++ b/arch/arm/tools/syscall.tbl
> > @@ -447,3 +447,4 @@
> >  431    common  fsconfig                        sys_fsconfig
> >  432    common  fsmount                         sys_fsmount
> >  433    common  fspick                          sys_fspick
> > +434    common  openat2                         sys_openat2
> 
> 434 is already used in linux-next, I suggest you use 437 (Palmer
> just submitted fchmodat4, which could become 436).

437 sounds good to me.

> > +/**
> > + * Arguments for how openat2(2) should open the target path. If @extra is zero,
> > + * then openat2(2) is identical to openat(2).
> > + *
> > + * @flags: O_* flags (unknown flags ignored).
> > + * @mode: O_CREAT file mode (ignored otherwise).
> > + * @upgrade_mask: restrict how the O_PATH may be re-opened (ignored otherwise).
> > + * @resolve: RESOLVE_* flags (-EINVAL on unknown flags).
> > + * @reserved: reserved for future extensions, must be zeroed.
> > + */
> > +struct open_how {
> > +       __u32 flags;
> > +       union {
> > +               __u16 mode;
> > +               __u16 upgrade_mask;
> > +       };
> > +       __u16 resolve;
> > +       __u64 reserved[7]; /* must be zeroed */
> > +};
> 
> We can have system calls with up to six arguments on all architectures, so
> this could still be done more conventionally without the indirection: like
> 
> long openat2(int dfd, const char __user * filename, int flags, mode_t
> mode_mask, __u16 resolve);
> 
> In fact, that seems similar enough to the existing openat() that I think
> you could also just add the fifth argument to the existing call when
> a newly defined flag is set, similarly to how we only use the 'mode'
> argument when O_CREAT or O_TMPFILE are set.

I considered doing this (and even had a preliminary version of it), but
I discovered that I was not in favour of this idea -- once I started to
write tests using it -- for a few reasons:

  1. It doesn't really allow for clean extension for a future 6th
	 argument (because you are using up O_* flags to signify "use the
	 next argument", and O_* flags don't give -EINVAL if they're
	 unknown). Now, yes you can do the on-start runtime check that
	 everyone does -- but I've never really liked having to do it.

	 Having reserved padding for later extensions (that is actually
	 checked and gives -EINVAL) matches more modern syscall designs.

  2. I really was hoping that the variadic openat(2) could be done away
     using this union setup (Linus said he didn't like it, and suggested
	 using something like 'struct stat' as an argument for openat(2) --
	 though personally I am not sure I would personally like to use an
	 interface like that).

  3. In order to avoid wasting a syscall argument for mode/mask you need
	 to either have something like your suggested mode_mask (which makes
	 the syscall arguments less consistent) or have some sort of
	 mode-like argument that is treated specially (which is really awful
	 on multiple levels -- this one I also tried and even wrote my
	 original tests using). And in both cases, the shims for
	 open{,at}(2) are somewhat less clean.

All of that being said, I'd be happy to switch to whatever you think
makes the most sense. As long as it's possible to get an O_PATH with
RESOLVE_IN_ROOT set, I'm happy.

> > --- a/include/linux/syscalls.h
> > +++ b/include/linux/syscalls.h
> 
> This file seems to lack a declaration for the system call, which means it
> will cause a build failure on some architectures, e.g. arch/arc/kernel/sys.c:
> 
> #define __SYSCALL(nr, call) [nr] = (call),
> void *sys_call_table[NR_syscalls] = {
>         [0 ... NR_syscalls-1] = sys_ni_syscall,
> #include <asm/unistd.h>
> };

Thanks, I will fix this.

-- 
Aleksa Sarai
Senior Software Engineer (Containers)
SUSE Linux GmbH
<https://www.cyphar.com/>

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 228 bytes --]

^ permalink raw reply

* Re: [PATCH v9 08/10] open: openat2(2) syscall
From: Aleksa Sarai @ 2019-07-18 15:21 UTC (permalink / raw)
  To: Rasmus Villemoes
  Cc: Al Viro, Jeff Layton, J. Bruce Fields, Arnd Bergmann,
	David Howells, Shuah Khan, Shuah Khan, Christian Brauner,
	Eric Biederman, Andy Lutomirski, Andrew Morton,
	Alexei Starovoitov, Kees Cook, Jann Horn, Tycho Andersen,
	David Drysdale, Chanho Min, Oleg Nesterov, Aleksa Sarai,
	Linus Torvalds, containers
In-Reply-To: <845e4364-685f-343b-46fb-c418766dce3e@rasmusvillemoes.dk>

[-- Attachment #1: Type: text/plain, Size: 3189 bytes --]

On 2019-07-18, Rasmus Villemoes <linux@rasmusvillemoes.dk> wrote:
> On 06/07/2019 16.57, Aleksa Sarai wrote:
> > --- a/fs/open.c
> > +++ b/fs/open.c
> > @@ -928,24 +928,32 @@ struct file *open_with_fake_path(const struct path *path, int flags,
> >  }
> >  EXPORT_SYMBOL(open_with_fake_path);
> >  
> > -static inline int build_open_flags(int flags, umode_t mode, struct open_flags *op)
> > +static inline int build_open_flags(struct open_how how, struct open_flags *op)
> >  {
> 
> How does passing such a huge struct by value affect code generation?
> Does gcc actually inline the function (and does it even inline the old
> one given that it's already non-trivial and has more than one caller).

I'm not sure, but I'll just do what you suggested with passing a const
reference and just copying the few fields that actually are touched by
this function.

> >  
> > diff --git a/include/linux/fcntl.h b/include/linux/fcntl.h
> > index 2868ae6c8fc1..e59917292213 100644
> > --- a/include/linux/fcntl.h
> > +++ b/include/linux/fcntl.h
> > @@ -4,13 +4,26 @@
> >  
> >  #include <uapi/linux/fcntl.h>
> >  
> > -/* list of all valid flags for the open/openat flags argument: */
> > +/* Should open_how.mode be set for older syscalls wrappers? */
> > +#define OPENHOW_MODE(flags, mode) \
> > +	(((flags) | (O_CREAT | __O_TMPFILE)) ? (mode) : 0)
> > +
> 
> Typo: (((flags) & (O_CREAT | __O_TMPFILE)) ? (mode) : 0)

Yup, thanks. I'm not sure why my tests passed on v9 with this bug (they
didn't pass in my v10-draft until I fixed this bug earlier today).

> 
> > +/**
> > + * Arguments for how openat2(2) should open the target path. If @extra is zero,
> > + * then openat2(2) is identical to openat(2).
> > + *
> > + * @flags: O_* flags (unknown flags ignored).
> > + * @mode: O_CREAT file mode (ignored otherwise).
> 
> should probably say "O_CREAT/O_TMPFILE file mode".

:+1:

> > + * @upgrade_mask: restrict how the O_PATH may be re-opened (ignored otherwise).
> > + * @resolve: RESOLVE_* flags (-EINVAL on unknown flags).
> > + * @reserved: reserved for future extensions, must be zeroed.
> > + */
> > +struct open_how {
> > +	__u32 flags;
> > +	union {
> > +		__u16 mode;
> > +		__u16 upgrade_mask;
> > +	};
> > +	__u16 resolve;
> 
> So mode and upgrade_mask are naturally u16 aka mode_t. And yes, they
> probably never need to be used together, so the union works. That then
> makes the next member 2-byte aligned, so using a u16 for the resolve
> flags brings us to an 8-byte boundary, and 11 unused flag bits should be
> enough for a while. But it seems a bit artificial to cram all this
> together and then add 56 bytes of reserved space.

I will happily admit that padding to 64 bytes is probably _very_ extreme
(I picked it purely because it's the size of a cache-line so anything
bigger makes even less sense). I was hoping someone would suggest a
better size once I posted the patchset, since I couldn't think of a good
answer myself.

Do you have any suggestions for a better layout or padding size?

-- 
Aleksa Sarai
Senior Software Engineer (Containers)
SUSE Linux GmbH
<https://www.cyphar.com/>

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 228 bytes --]

^ permalink raw reply

* Re: [PATCH v9 08/10] open: openat2(2) syscall
From: Arnd Bergmann @ 2019-07-18 15:10 UTC (permalink / raw)
  To: Aleksa Sarai
  Cc: Al Viro, Jeff Layton, J. Bruce Fields, David Howells, Shuah Khan,
	Shuah Khan, Christian Brauner, Eric Biederman, Andy Lutomirski,
	Andrew Morton, Alexei Starovoitov, Kees Cook, Jann Horn,
	Tycho Andersen, David Drysdale, Chanho Min, Oleg Nesterov,
	Aleksa Sarai, Linus Torvalds, containers, alpha <linux-alp>
In-Reply-To: <20190706145737.5299-9-cyphar@cyphar.com>

On Sat, Jul 6, 2019 at 5:00 PM Aleksa Sarai <cyphar@cyphar.com> wrote:

> diff --git a/arch/alpha/kernel/syscalls/syscall.tbl b/arch/alpha/kernel/syscalls/syscall.tbl
> index 9e7704e44f6d..1703d048c141 100644
> --- a/arch/alpha/kernel/syscalls/syscall.tbl
> +++ b/arch/alpha/kernel/syscalls/syscall.tbl
> @@ -461,6 +461,7 @@
>  530    common  getegid                         sys_getegid
>  531    common  geteuid                         sys_geteuid
>  532    common  getppid                         sys_getppid
> +533    common  openat2                         sys_openat2
>  # all other architectures have common numbers for new syscall, alpha
>  # is the exception.
>  534    common  pidfd_send_signal               sys_pidfd_send_signal

My plan here was to add new syscalls in the same order as everwhere else,
just with the number 110 higher. In the long run, I hope we can automate
this.

> diff --git a/arch/arm/tools/syscall.tbl b/arch/arm/tools/syscall.tbl
> index aaf479a9e92d..4ad262698396 100644
> --- a/arch/arm/tools/syscall.tbl
> +++ b/arch/arm/tools/syscall.tbl
> @@ -447,3 +447,4 @@
>  431    common  fsconfig                        sys_fsconfig
>  432    common  fsmount                         sys_fsmount
>  433    common  fspick                          sys_fspick
> +434    common  openat2                         sys_openat2

434 is already used in linux-next, I suggest you use 437 (Palmer
just submitted fchmodat4, which could become 436).

> +/**
> + * Arguments for how openat2(2) should open the target path. If @extra is zero,
> + * then openat2(2) is identical to openat(2).
> + *
> + * @flags: O_* flags (unknown flags ignored).
> + * @mode: O_CREAT file mode (ignored otherwise).
> + * @upgrade_mask: restrict how the O_PATH may be re-opened (ignored otherwise).
> + * @resolve: RESOLVE_* flags (-EINVAL on unknown flags).
> + * @reserved: reserved for future extensions, must be zeroed.
> + */
> +struct open_how {
> +       __u32 flags;
> +       union {
> +               __u16 mode;
> +               __u16 upgrade_mask;
> +       };
> +       __u16 resolve;
> +       __u64 reserved[7]; /* must be zeroed */
> +};

We can have system calls with up to six arguments on all architectures, so
this could still be done more conventionally without the indirection: like

long openat2(int dfd, const char __user * filename, int flags, mode_t
mode_mask, __u16 resolve);

In fact, that seems similar enough to the existing openat() that I think
you could also just add the fifth argument to the existing call when
a newly defined flag is set, similarly to how we only use the 'mode'
argument when O_CREAT or O_TMPFILE are set.

> --- a/include/linux/syscalls.h
> +++ b/include/linux/syscalls.h

This file seems to lack a declaration for the system call, which means it
will cause a build failure on some architectures, e.g. arch/arc/kernel/sys.c:

#define __SYSCALL(nr, call) [nr] = (call),
void *sys_call_table[NR_syscalls] = {
        [0 ... NR_syscalls-1] = sys_ni_syscall,
#include <asm/unistd.h>
};

        Arnd

^ permalink raw reply

* Re: [PATCH v9 08/10] open: openat2(2) syscall
From: Rasmus Villemoes @ 2019-07-18 14:48 UTC (permalink / raw)
  To: Aleksa Sarai, Al Viro, Jeff Layton, J. Bruce Fields,
	Arnd Bergmann, David Howells, Shuah Khan, Shuah Khan
  Cc: Christian Brauner, Eric Biederman, Andy Lutomirski, Andrew Morton,
	Alexei Starovoitov, Kees Cook, Jann Horn, Tycho Andersen,
	David Drysdale, Chanho Min, Oleg Nesterov, Aleksa Sarai,
	Linus Torvalds, containers, linux-alpha, linux-api, linux-arch,
	linux-arm-kernel, linux-fsdevel, linux-ia64, linux-kernel,
	linux-kselftest, linux-m
In-Reply-To: <20190706145737.5299-9-cyphar@cyphar.com>

On 06/07/2019 16.57, Aleksa Sarai wrote:
> 
> --- a/fs/open.c
> +++ b/fs/open.c
> @@ -928,24 +928,32 @@ struct file *open_with_fake_path(const struct path *path, int flags,
>  }
>  EXPORT_SYMBOL(open_with_fake_path);
>  
> -static inline int build_open_flags(int flags, umode_t mode, struct open_flags *op)
> +static inline int build_open_flags(struct open_how how, struct open_flags *op)
>  {

How does passing such a huge struct by value affect code generation?
Does gcc actually inline the function (and does it even inline the old
one given that it's already non-trivial and has more than one caller).

>  	int lookup_flags = 0;
> -	int acc_mode = ACC_MODE(flags);
> +	int opath_mask = 0;
> +	int acc_mode = ACC_MODE(how.flags);
> +
> +	if (how.resolve & ~VALID_RESOLVE_FLAGS)
> +		return -EINVAL;
> +	if (!(how.flags & (O_PATH | O_CREAT | __O_TMPFILE)) && how.mode != 0)
> +		return -EINVAL;
> +	if (memchr_inv(how.reserved, 0, sizeof(how.reserved)))
> +		return -EINVAL;

How about passing how by const reference, and copy the few fields you
need to local variables. That would at least simplify this patch by
eliminating a lot of the

> -	flags &= VALID_OPEN_FLAGS;
> +	how.flags &= VALID_OPEN_FLAGS;
>  
> -	if (flags & (O_CREAT | __O_TMPFILE))
> -		op->mode = (mode & S_IALLUGO) | S_IFREG;
> +	if (how.flags & (O_CREAT | __O_TMPFILE))
> +		op->mode = (how.mode & S_IALLUGO) | S_IFREG;

churn.

>  
> diff --git a/include/linux/fcntl.h b/include/linux/fcntl.h
> index 2868ae6c8fc1..e59917292213 100644
> --- a/include/linux/fcntl.h
> +++ b/include/linux/fcntl.h
> @@ -4,13 +4,26 @@
>  
>  #include <uapi/linux/fcntl.h>
>  
> -/* list of all valid flags for the open/openat flags argument: */
> +/* Should open_how.mode be set for older syscalls wrappers? */
> +#define OPENHOW_MODE(flags, mode) \
> +	(((flags) | (O_CREAT | __O_TMPFILE)) ? (mode) : 0)
> +

Typo: (((flags) & (O_CREAT | __O_TMPFILE)) ? (mode) : 0)

> +/**
> + * Arguments for how openat2(2) should open the target path. If @extra is zero,
> + * then openat2(2) is identical to openat(2).
> + *
> + * @flags: O_* flags (unknown flags ignored).
> + * @mode: O_CREAT file mode (ignored otherwise).

should probably say "O_CREAT/O_TMPFILE file mode".

> + * @upgrade_mask: restrict how the O_PATH may be re-opened (ignored otherwise).
> + * @resolve: RESOLVE_* flags (-EINVAL on unknown flags).
> + * @reserved: reserved for future extensions, must be zeroed.
> + */
> +struct open_how {
> +	__u32 flags;
> +	union {
> +		__u16 mode;
> +		__u16 upgrade_mask;
> +	};
> +	__u16 resolve;

So mode and upgrade_mask are naturally u16 aka mode_t. And yes, they
probably never need to be used together, so the union works. That then
makes the next member 2-byte aligned, so using a u16 for the resolve
flags brings us to an 8-byte boundary, and 11 unused flag bits should be
enough for a while. But it seems a bit artificial to cram all this
together and then add 56 bytes of reserved space.

Rasmus

^ permalink raw reply

* Re: [PATCH v9 05/10] namei: O_BENEATH-style path resolution flags
From: Aleksa Sarai @ 2019-07-18  3:17 UTC (permalink / raw)
  To: Al Viro
  Cc: Jeff Layton, J. Bruce Fields, Arnd Bergmann, David Howells,
	Shuah Khan, Shuah Khan, Christian Brauner, David Drysdale,
	Andy Lutomirski, Linus Torvalds, Eric Biederman, Andrew Morton,
	Alexei Starovoitov, Kees Cook, Jann Horn, Tycho Andersen,
	Chanho Min, Oleg Nesterov, Aleksa Sarai, containers, linux-alpha
In-Reply-To: <20190714143623.GR17978@ZenIV.linux.org.uk>

[-- Attachment #1: Type: text/plain, Size: 2207 bytes --]

On 2019-07-14, Al Viro <viro@zeniv.linux.org.uk> wrote:
> On Sun, Jul 14, 2019 at 05:00:29PM +1000, Aleksa Sarai wrote:
> > The basic property being guaranteed by LOOKUP_IN_ROOT is that it will
> > not result in resolution of a path component which was not inside the
> > root of the dirfd tree at some point during resolution (and that all
> > absolute symlink and ".." resolution will be done relative to the
> > dirfd). This may smell slightly of chroot(2), because unfortunately it
> > is a similar concept -- the reason for this is to allow for a more
> > efficient way to safely resolve paths inside a rootfs than spawning a
> > separate process to then pass back the fd to the caller.
> 
> IDGI...  If attacker can modify your subtree, you have already lost -
> after all, they can make anything appear inside that tree just before
> your syscall is made and bring it back out immediately afterwards.
> And if they can't, what is the race you are trying to protect against?
> Confused...

I'll be honest, this code mostly exists because Jann Horn said that it
was necessary in order for this interface to be safe against those kinds
of attacks. Though, it's also entirely possible I just am
mis-remembering the attack scenario he described when I posted v1 of
this series last year.

The use-case I need this functionality for (as do other container
runtimes) is one where you are trying to safely interact with a
directory tree that is a (malicious) container's root filesystem -- so
the container won't be able to move the directory tree root, nor can
they move things outside the rootfs into it (or the reverse). Users
dealing with FTP, web, or file servers probably have similar
requirements.

There is an obvious race condition if you allow the attacker to move the
root (I give an example and test-case of it in the last patch in the
series), and given that it is fairly trivial to defend against I don't
see the downside in including it? But it's obviously your call -- and
maybe Jann Horn can explain the reasoning behind this much better than I
can.

-- 
Aleksa Sarai
Senior Software Engineer (Containers)
SUSE Linux GmbH
<https://www.cyphar.com/>

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 228 bytes --]

^ permalink raw reply

* Re: [PATCH ghak90 V6 02/10] audit: add container id
From: Richard Guy Briggs @ 2019-07-18  0:51 UTC (permalink / raw)
  To: Paul Moore
  Cc: Tycho Andersen, nhorman, linux-api, containers, LKML, dhowells,
	Linux-Audit Mailing List, netfilter-devel, ebiederm, simo, netdev,
	linux-fsdevel, Eric Paris, Serge E. Hallyn
In-Reply-To: <CAHC9VhScHizB2r5q3T5s0P3jkYdvzBPPudDksosYFp_TO7W9-Q@mail.gmail.com>

On 2019-07-16 19:30, Paul Moore wrote:
> On Tue, Jul 16, 2019 at 6:03 PM Richard Guy Briggs <rgb@redhat.com> wrote:
> > On 2019-07-15 17:04, Paul Moore wrote:
> > > On Mon, Jul 8, 2019 at 2:06 PM Richard Guy Briggs <rgb@redhat.com> wrote:
> 
> ...
> 
> > > > If we can't trust ns_capable() then why are we passing on
> > > > CAP_AUDIT_CONTROL?  It is being passed down and not stripped purposely
> > > > by the orchestrator/engine.  If ns_capable() isn't inherited how is it
> > > > gained otherwise?  Can it be inserted by cotainer image?  I think the
> > > > answer is "no".  Either we trust ns_capable() or we have audit
> > > > namespaces (recommend based on user namespace) (or both).
> > >
> > > My thinking is that since ns_capable() checks the credentials with
> > > respect to the current user namespace we can't rely on it to control
> > > access since it would be possible for a privileged process running
> > > inside an unprivileged container to manipulate the audit container ID
> > > (containerized process has CAP_AUDIT_CONTROL, e.g. running as root in
> > > the container, while the container itself does not).
> >
> > What makes an unprivileged container unprivileged?  "root", or "CAP_*"?
> 
> My understanding is that when most people refer to an unprivileged
> container they are referring to a container run without capabilities
> or a container run by a user other than root.  I'm sure there are
> better definitions out there, by folks much smarter than me on these
> things, but that's my working definition.

Close enough to my understanding...

> > If CAP_AUDIT_CONTROL is granted, does "root" matter?
> 
> Our discussions here have been about capabilities, not UIDs.  The only
> reason root might matter is that it generally has the full capability
> set.

Good, that's my understanding.

> > Does it matter what user namespace it is in?
> 
> What likely matters is what check is called: capable() or
> ns_capable().  Those can yield very different results.

Ok, I finally found what I was looking for to better understand the
challenge with trusting ns_capable().  Sorry for being so dense and slow
on this one.  I thought I had gone through the code carefully enough,
but this time I finally found it.  set_cred_user_ns() sets a full set of
capabilities rather than inheriting them from the parent user_ns, called
from userns_install() or create_userns().  Even if the container
orchestrator/engine restricts those capabilities on its own containers,
they could easily unshare a userns and get a full set unless it also
restricted CAP_SYS_ADMIN, which is used too many other places to be
practical to restrict.

> > I understand that root is *gained* in an
> > unprivileged user namespace, but capabilities are inherited or permitted
> > and that process either has it or it doesn't and an unprivileged user
> > namespace can't gain a capability that has been rescinded.  Different
> > subsystems use the userid or capabilities or both to determine
> > privileges.
> 
> Once again, I believe the important thing to focus on here is
> capable() vs ns_capable().  We can't safely rely on ns_capable() for
> the audit container ID policy since that is easily met inside the
> container regardless of the process' creds which started the
> container.

Agreed.

> > In this case, is the userid relevant?
> 
> We don't do UID checks, we do capability checks, so yes, the UID is irrelevant.

Agreed.

> > > > At this point I would say we are at an impasse unless we trust
> > > > ns_capable() or we implement audit namespaces.
> > >
> > > I'm not sure how we can trust ns_capable(), but if you can think of a
> > > way I would love to hear it.  I'm also not sure how namespacing audit
> > > is helpful (see my above comments), but if you think it is please
> > > explain.
> >
> > So if we are not namespacing, why do we not trust capabilities?
> 
> We can trust capable(CAP_AUDIT_CONTROL) for enforcing audit container
> ID policy, we can not trust ns_capable(CAP_AUDIT_CONTROL).

Ok.  So does a process in a non-init user namespace have two (or more)
sets of capabilities stored in creds, one in the init_user_ns, and one
in current_user_ns?  Or does it get stripped of all its capabilities in
init_user_ns once it has its own set in current_user_ns?  If the former,
then we can use capable().  If the latter, we need another mechanism, as
you have suggested might be needed.

If some random unprivileged user wants to fire up a container
orchestrator/engine in his own user namespace, then audit needs to be
namespaced.  Can we safely discard this scenario for now?  That user can
use a VM.

> paul moore

- RGB

--
Richard Guy Briggs <rgb@redhat.com>
Sr. S/W Engineer, Kernel Security, Base Operating Systems
Remote, Ottawa, Red Hat Canada
IRC: rgb, SunRaycer
Voice: +1.647.777.2635, Internal: (81) 32635

^ permalink raw reply

* Re: [PATCH v5 1/5] mm: introduce MADV_COLD
From: Suren Baghdasaryan @ 2019-07-17 22:14 UTC (permalink / raw)
  To: Minchan Kim
  Cc: Andrew Morton, linux-mm, LKML, linux-api, Michal Hocko,
	Johannes Weiner, Tim Murray, Joel Fernandes, Daniel Colascione,
	Shakeel Butt, Sonny Rao, oleksandr, hdanton, Benoit Lize,
	Dave Hansen, Kirill A . Shutemov
In-Reply-To: <20190714233401.36909-2-minchan@kernel.org>

Hi Minchan,
Couple comments inline.
Thanks!

On Sun, Jul 14, 2019 at 4:34 PM Minchan Kim <minchan@kernel.org> wrote:
>
> When a process expects no accesses to a certain memory range, it could
> give a hint to kernel that the pages can be reclaimed when memory pressure
> happens but data should be preserved for future use.  This could reduce
> workingset eviction so it ends up increasing performance.
>
> This patch introduces the new MADV_COLD hint to madvise(2) syscall.
> MADV_COLD can be used by a process to mark a memory range as not expected
> to be used in the near future. The hint can help kernel in deciding which
> pages to evict early during memory pressure.
>
> It works for every LRU pages like MADV_[DONTNEED|FREE]. IOW, It moves
>
>         active file page -> inactive file LRU
>         active anon page -> inacdtive anon LRU
>
> Unlike MADV_FREE, it doesn't move active anonymous pages to inactive
> file LRU's head because MADV_COLD is a little bit different symantic.
> MADV_FREE means it's okay to discard when the memory pressure because
> the content of the page is *garbage* so freeing such pages is almost zero
> overhead since we don't need to swap out and access afterward causes just
> minor fault. Thus, it would make sense to put those freeable pages in
> inactive file LRU to compete other used-once pages. It makes sense for
> implmentaion point of view, too because it's not swapbacked memory any
> longer until it would be re-dirtied. Even, it could give a bonus to make
> them be reclaimed on swapless system. However, MADV_COLD doesn't mean
> garbage so reclaiming them requires swap-out/in in the end so it's bigger
> cost. Since we have designed VM LRU aging based on cost-model, anonymous
> cold pages would be better to position inactive anon's LRU list, not file
> LRU. Furthermore, it would help to avoid unnecessary scanning if system
> doesn't have a swap device. Let's start simpler way without adding
> complexity at this moment. However, keep in mind, too that it's a caveat
> that workloads with a lot of pages cache are likely to ignore MADV_COLD
> on anonymous memory because we rarely age anonymous LRU lists.
>
> * man-page material
>
> MADV_COLD (since Linux x.x)
>
> Pages in the specified regions will be treated as less-recently-accessed
> compared to pages in the system with similar access frequencies.
> In contrast to MADV_FREE, the contents of the region are preserved
> regardless of subsequent writes to pages.
>
> MADV_COLD cannot be applied to locked pages, Huge TLB pages, or VM_PFNMAP
> pages.
>
> * v2
>  * add up the warn with lots of page cache workload - mhocko
>  * add man page stuff - dave
>
> * v1
>  * remove page_mapcount filter - hannes, mhocko
>  * remove idle page handling - joelaf
>
> * RFCv2
>  * add more description - mhocko
>
> * RFCv1
>  * renaming from MADV_COOL to MADV_COLD - hannes
>
> * internal review
>  * use clear_page_youn in deactivate_page - joelaf
>  * Revise the description - surenb
>  * Renaming from MADV_WARM to MADV_COOL - surenb
>
> Acked-by: Michal Hocko <mhocko@suse.com>
> Acked-by: Johannes Weiner <hannes@cmpxchg.org>
> Signed-off-by: Minchan Kim <minchan@kernel.org>
> ---
>  include/linux/swap.h                   |   1 +
>  include/uapi/asm-generic/mman-common.h |   1 +
>  mm/internal.h                          |   2 +-
>  mm/madvise.c                           | 180 ++++++++++++++++++++++++-
>  mm/oom_kill.c                          |   2 +-
>  mm/swap.c                              |  42 ++++++
>  6 files changed, 224 insertions(+), 4 deletions(-)
>
> diff --git a/include/linux/swap.h b/include/linux/swap.h
> index de2c67a33b7e..0ce997edb8bb 100644
> --- a/include/linux/swap.h
> +++ b/include/linux/swap.h
> @@ -340,6 +340,7 @@ extern void lru_add_drain_cpu(int cpu);
>  extern void lru_add_drain_all(void);
>  extern void rotate_reclaimable_page(struct page *page);
>  extern void deactivate_file_page(struct page *page);
> +extern void deactivate_page(struct page *page);
>  extern void mark_page_lazyfree(struct page *page);
>  extern void swap_setup(void);
>
> diff --git a/include/uapi/asm-generic/mman-common.h b/include/uapi/asm-generic/mman-common.h
> index 63b1f506ea67..ef8a56927b12 100644
> --- a/include/uapi/asm-generic/mman-common.h
> +++ b/include/uapi/asm-generic/mman-common.h
> @@ -45,6 +45,7 @@
>  #define MADV_SEQUENTIAL        2               /* expect sequential page references */
>  #define MADV_WILLNEED  3               /* will need these pages */
>  #define MADV_DONTNEED  4               /* don't need these pages */
> +#define MADV_COLD      5               /* deactivatie these pages */

s/deactivatie/deactivate

>
>  /* common parameters: try to keep these consistent across architectures */
>  #define MADV_FREE      8               /* free pages only if memory pressure */
> diff --git a/mm/internal.h b/mm/internal.h
> index f53a14d67538..c61b215ff265 100644
> --- a/mm/internal.h
> +++ b/mm/internal.h
> @@ -39,7 +39,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf);
>  void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma,
>                 unsigned long floor, unsigned long ceiling);
>
> -static inline bool can_madv_dontneed_vma(struct vm_area_struct *vma)
> +static inline bool can_madv_lru_vma(struct vm_area_struct *vma)
>  {
>         return !(vma->vm_flags & (VM_LOCKED|VM_HUGETLB|VM_PFNMAP));
>  }
> diff --git a/mm/madvise.c b/mm/madvise.c
> index 968df3aa069f..bae0055f9724 100644
> --- a/mm/madvise.c
> +++ b/mm/madvise.c
> @@ -40,6 +40,7 @@ static int madvise_need_mmap_write(int behavior)
>         case MADV_REMOVE:
>         case MADV_WILLNEED:
>         case MADV_DONTNEED:
> +       case MADV_COLD:
>         case MADV_FREE:
>                 return 0;
>         default:
> @@ -307,6 +308,178 @@ static long madvise_willneed(struct vm_area_struct *vma,
>         return 0;
>  }
>
> +static int madvise_cold_pte_range(pmd_t *pmd, unsigned long addr,
> +                               unsigned long end, struct mm_walk *walk)
> +{
> +       struct mmu_gather *tlb = walk->private;
> +       struct mm_struct *mm = tlb->mm;
> +       struct vm_area_struct *vma = walk->vma;
> +       pte_t *orig_pte, *pte, ptent;
> +       spinlock_t *ptl;
> +       struct page *page;
> +       unsigned long next;
> +
> +       next = pmd_addr_end(addr, end);
> +       if (pmd_trans_huge(*pmd)) {
> +               pmd_t orig_pmd;
> +
> +               tlb_change_page_size(tlb, HPAGE_PMD_SIZE);
> +               ptl = pmd_trans_huge_lock(pmd, vma);
> +               if (!ptl)
> +                       return 0;
> +
> +               orig_pmd = *pmd;
> +               if (is_huge_zero_pmd(orig_pmd))
> +                       goto huge_unlock;
> +
> +               if (unlikely(!pmd_present(orig_pmd))) {
> +                       VM_BUG_ON(thp_migration_supported() &&
> +                                       !is_pmd_migration_entry(orig_pmd));
> +                       goto huge_unlock;
> +               }
> +
> +               page = pmd_page(orig_pmd);
> +               if (next - addr != HPAGE_PMD_SIZE) {
> +                       int err;
> +
> +                       if (page_mapcount(page) != 1)
> +                               goto huge_unlock;
> +
> +                       get_page(page);
> +                       spin_unlock(ptl);
> +                       lock_page(page);
> +                       err = split_huge_page(page);
> +                       unlock_page(page);
> +                       put_page(page);
> +                       if (!err)
> +                               goto regular_page;
> +                       return 0;
> +               }
> +
> +               if (pmd_young(orig_pmd)) {
> +                       pmdp_invalidate(vma, addr, pmd);
> +                       orig_pmd = pmd_mkold(orig_pmd);
> +
> +                       set_pmd_at(mm, addr, pmd, orig_pmd);
> +                       tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
> +               }
> +
> +               test_and_clear_page_young(page);
> +               deactivate_page(page);
> +huge_unlock:
> +               spin_unlock(ptl);
> +               return 0;
> +       }
> +
> +       if (pmd_trans_unstable(pmd))
> +               return 0;
> +
> +regular_page:
> +       tlb_change_page_size(tlb, PAGE_SIZE);
> +       orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
> +       flush_tlb_batched_pending(mm);
> +       arch_enter_lazy_mmu_mode();
> +       for (; addr < end; pte++, addr += PAGE_SIZE) {
> +               ptent = *pte;
> +
> +               if (pte_none(ptent))
> +                       continue;
> +
> +               if (!pte_present(ptent))
> +                       continue;
> +
> +               page = vm_normal_page(vma, addr, ptent);
> +               if (!page)
> +                       continue;
> +
> +               /*
> +                * Creating a THP page is expensive so split it only if we
> +                * are sure it's worth. Split it if we are only owner.
> +                */
> +               if (PageTransCompound(page)) {
> +                       if (page_mapcount(page) != 1)
> +                               break;
> +                       get_page(page);
> +                       if (!trylock_page(page)) {
> +                               put_page(page);
> +                               break;
> +                       }
> +                       pte_unmap_unlock(orig_pte, ptl);
> +                       if (split_huge_page(page)) {
> +                               unlock_page(page);
> +                               put_page(page);
> +                               pte_offset_map_lock(mm, pmd, addr, &ptl);
> +                               break;
> +                       }
> +                       unlock_page(page);
> +                       put_page(page);
> +                       pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
> +                       pte--;
> +                       addr -= PAGE_SIZE;
> +                       continue;
> +               }
> +
> +               VM_BUG_ON_PAGE(PageTransCompound(page), page);
> +
> +               if (pte_young(ptent)) {
> +                       ptent = ptep_get_and_clear_full(mm, addr, pte,
> +                                                       tlb->fullmm);
> +                       ptent = pte_mkold(ptent);
> +                       set_pte_at(mm, addr, pte, ptent);
> +                       tlb_remove_tlb_entry(tlb, pte, addr);
> +               }
> +
> +               /*
> +                * We are deactivating a page for accelerating reclaiming.
> +                * VM couldn't reclaim the page unless we clear PG_young.
> +                * As a side effect, it makes confuse idle-page tracking
> +                * because they will miss recent referenced history.
> +                */
> +               test_and_clear_page_young(page);
> +               deactivate_page(page);
> +       }
> +
> +       arch_enter_lazy_mmu_mode();

Did you mean to say arch_leave_lazy_mmu_mode() ?

> +       pte_unmap_unlock(orig_pte, ptl);
> +       cond_resched();
> +
> +       return 0;
> +}
> +
> +static void madvise_cold_page_range(struct mmu_gather *tlb,
> +                            struct vm_area_struct *vma,
> +                            unsigned long addr, unsigned long end)
> +{
> +       struct mm_walk cold_walk = {
> +               .pmd_entry = madvise_cold_pte_range,
> +               .mm = vma->vm_mm,
> +               .private = tlb,
> +       };
> +
> +       tlb_start_vma(tlb, vma);
> +       walk_page_range(addr, end, &cold_walk);
> +       tlb_end_vma(tlb, vma);
> +}
> +
> +static long madvise_cold(struct vm_area_struct *vma,
> +                       struct vm_area_struct **prev,
> +                       unsigned long start_addr, unsigned long end_addr)
> +{
> +       struct mm_struct *mm = vma->vm_mm;
> +       struct mmu_gather tlb;
> +
> +       *prev = vma;
> +       if (!can_madv_lru_vma(vma))
> +               return -EINVAL;
> +
> +       lru_add_drain();
> +       tlb_gather_mmu(&tlb, mm, start_addr, end_addr);
> +       madvise_cold_page_range(&tlb, vma, start_addr, end_addr);
> +       tlb_finish_mmu(&tlb, start_addr, end_addr);
> +
> +       return 0;
> +}
> +
>  static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
>                                 unsigned long end, struct mm_walk *walk)
>
> @@ -519,7 +692,7 @@ static long madvise_dontneed_free(struct vm_area_struct *vma,
>                                   int behavior)
>  {
>         *prev = vma;
> -       if (!can_madv_dontneed_vma(vma))
> +       if (!can_madv_lru_vma(vma))
>                 return -EINVAL;
>
>         if (!userfaultfd_remove(vma, start, end)) {
> @@ -541,7 +714,7 @@ static long madvise_dontneed_free(struct vm_area_struct *vma,
>                          */
>                         return -ENOMEM;
>                 }
> -               if (!can_madv_dontneed_vma(vma))
> +               if (!can_madv_lru_vma(vma))
>                         return -EINVAL;
>                 if (end > vma->vm_end) {
>                         /*
> @@ -695,6 +868,8 @@ madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
>                 return madvise_remove(vma, prev, start, end);
>         case MADV_WILLNEED:
>                 return madvise_willneed(vma, prev, start, end);
> +       case MADV_COLD:
> +               return madvise_cold(vma, prev, start, end);
>         case MADV_FREE:
>         case MADV_DONTNEED:
>                 return madvise_dontneed_free(vma, prev, start, end, behavior);
> @@ -716,6 +891,7 @@ madvise_behavior_valid(int behavior)
>         case MADV_WILLNEED:
>         case MADV_DONTNEED:
>         case MADV_FREE:
> +       case MADV_COLD:
>  #ifdef CONFIG_KSM
>         case MADV_MERGEABLE:
>         case MADV_UNMERGEABLE:
> diff --git a/mm/oom_kill.c b/mm/oom_kill.c
> index 95872bdfec4e..c8f0ec6b0e80 100644
> --- a/mm/oom_kill.c
> +++ b/mm/oom_kill.c
> @@ -523,7 +523,7 @@ bool __oom_reap_task_mm(struct mm_struct *mm)
>         set_bit(MMF_UNSTABLE, &mm->flags);
>
>         for (vma = mm->mmap ; vma; vma = vma->vm_next) {
> -               if (!can_madv_dontneed_vma(vma))
> +               if (!can_madv_lru_vma(vma))
>                         continue;
>
>                 /*
> diff --git a/mm/swap.c b/mm/swap.c
> index 55899c1f54af..b501ad6eb091 100644
> --- a/mm/swap.c
> +++ b/mm/swap.c
> @@ -47,6 +47,7 @@ int page_cluster;
>  static DEFINE_PER_CPU(struct pagevec, lru_add_pvec);
>  static DEFINE_PER_CPU(struct pagevec, lru_rotate_pvecs);
>  static DEFINE_PER_CPU(struct pagevec, lru_deactivate_file_pvecs);
> +static DEFINE_PER_CPU(struct pagevec, lru_deactivate_pvecs);
>  static DEFINE_PER_CPU(struct pagevec, lru_lazyfree_pvecs);
>  #ifdef CONFIG_SMP
>  static DEFINE_PER_CPU(struct pagevec, activate_page_pvecs);
> @@ -538,6 +539,22 @@ static void lru_deactivate_file_fn(struct page *page, struct lruvec *lruvec,
>         update_page_reclaim_stat(lruvec, file, 0);
>  }
>
> +static void lru_deactivate_fn(struct page *page, struct lruvec *lruvec,
> +                           void *arg)
> +{
> +       if (PageLRU(page) && PageActive(page) && !PageUnevictable(page)) {
> +               int file = page_is_file_cache(page);
> +               int lru = page_lru_base_type(page);
> +
> +               del_page_from_lru_list(page, lruvec, lru + LRU_ACTIVE);
> +               ClearPageActive(page);
> +               ClearPageReferenced(page);
> +               add_page_to_lru_list(page, lruvec, lru);
> +
> +               __count_vm_events(PGDEACTIVATE, hpage_nr_pages(page));
> +               update_page_reclaim_stat(lruvec, file, 0);
> +       }
> +}
>
>  static void lru_lazyfree_fn(struct page *page, struct lruvec *lruvec,
>                             void *arg)
> @@ -590,6 +607,10 @@ void lru_add_drain_cpu(int cpu)
>         if (pagevec_count(pvec))
>                 pagevec_lru_move_fn(pvec, lru_deactivate_file_fn, NULL);
>
> +       pvec = &per_cpu(lru_deactivate_pvecs, cpu);
> +       if (pagevec_count(pvec))
> +               pagevec_lru_move_fn(pvec, lru_deactivate_fn, NULL);
> +
>         pvec = &per_cpu(lru_lazyfree_pvecs, cpu);
>         if (pagevec_count(pvec))
>                 pagevec_lru_move_fn(pvec, lru_lazyfree_fn, NULL);
> @@ -623,6 +644,26 @@ void deactivate_file_page(struct page *page)
>         }
>  }
>
> +/*
> + * deactivate_page - deactivate a page
> + * @page: page to deactivate
> + *
> + * deactivate_page() moves @page to the inactive list if @page was on the active
> + * list and was not an unevictable page.  This is done to accelerate the reclaim
> + * of @page.
> + */
> +void deactivate_page(struct page *page)
> +{
> +       if (PageLRU(page) && PageActive(page) && !PageUnevictable(page)) {
> +               struct pagevec *pvec = &get_cpu_var(lru_deactivate_pvecs);
> +
> +               get_page(page);
> +               if (!pagevec_add(pvec, page) || PageCompound(page))
> +                       pagevec_lru_move_fn(pvec, lru_deactivate_fn, NULL);
> +               put_cpu_var(lru_deactivate_pvecs);
> +       }
> +}
> +
>  /**
>   * mark_page_lazyfree - make an anon page lazyfree
>   * @page: page to deactivate
> @@ -687,6 +728,7 @@ void lru_add_drain_all(void)
>                 if (pagevec_count(&per_cpu(lru_add_pvec, cpu)) ||
>                     pagevec_count(&per_cpu(lru_rotate_pvecs, cpu)) ||
>                     pagevec_count(&per_cpu(lru_deactivate_file_pvecs, cpu)) ||
> +                   pagevec_count(&per_cpu(lru_deactivate_pvecs, cpu)) ||
>                     pagevec_count(&per_cpu(lru_lazyfree_pvecs, cpu)) ||
>                     need_activate_page_drain(cpu)) {
>                         INIT_WORK(work, lru_add_drain_per_cpu);
> --
> 2.22.0.510.g264f2c817a-goog
>

^ permalink raw reply

* Re: [v2 PATCH 2/2] mm: mempolicy: handle vma with unmovable pages mapped correctly in mbind
From: Yang Shi @ 2019-07-17 19:25 UTC (permalink / raw)
  To: Vlastimil Babka, mhocko, mgorman, akpm; +Cc: linux-mm, linux-kernel, Linux API
In-Reply-To: <7be3d36a-19fe-2e3b-8840-27fb5fd60f15@suse.cz>



On 7/17/19 11:50 AM, Vlastimil Babka wrote:
> On 7/17/19 8:23 PM, Yang Shi wrote:
>>
>> On 7/16/19 10:28 AM, Yang Shi wrote:
>>>
>>> On 7/16/19 5:07 AM, Vlastimil Babka wrote:
>>>> On 6/22/19 2:20 AM, Yang Shi wrote:
>>>>> @@ -969,10 +975,21 @@ static long do_get_mempolicy(int *policy,
>>>>> nodemask_t *nmask,
>>>>>    /*
>>>>>     * page migration, thp tail pages can be passed.
>>>>>     */
>>>>> -static void migrate_page_add(struct page *page, struct list_head
>>>>> *pagelist,
>>>>> +static int migrate_page_add(struct page *page, struct list_head
>>>>> *pagelist,
>>>>>                    unsigned long flags)
>>>>>    {
>>>>>        struct page *head = compound_head(page);
>>>>> +
>>>>> +    /*
>>>>> +     * Non-movable page may reach here.  And, there may be
>>>>> +     * temporaty off LRU pages or non-LRU movable pages.
>>>>> +     * Treat them as unmovable pages since they can't be
>>>>> +     * isolated, so they can't be moved at the moment.  It
>>>>> +     * should return -EIO for this case too.
>>>>> +     */
>>>>> +    if (!PageLRU(head) && (flags & MPOL_MF_STRICT))
>>>>> +        return -EIO;
>>>>> +
>>>> Hm but !PageLRU() is not the only way why queueing for migration can
>>>> fail, as can be seen from the rest of the function. Shouldn't all cases
>>>> be reported?
>>> Do you mean the shared pages and isolation failed pages? I'm not sure
>>> whether we should consider these cases break the semantics or not, so
>>> I leave them as they are. But, strictly speaking they should be
>>> reported too, at least for the isolation failed page.
> CC'd linux-api, should be done on v3 posting also.
>
>> By reading mbind man page, it says:
>>
>> If MPOL_MF_MOVE is specified in flags, then the kernel will attempt to
>> move all the existing pages in the memory range so that they follow the
>> policy.  Pages that are shared with other processes will not be moved.
>> If MPOL_MF_STRICT is also specified, then the call fails with the error
>> EIO if some pages could not be moved.
> I don't think this means that for shared pages, -EIO should not be
> reported. I can imagine both interpretations of the paragraph. I guess
> we can be conservative and keep not reporting them, if that was always
> the case - but then perhaps clarify the man page?

Yes, I agree the man page does looks ambiguous.  Anyway, I think we 
could add a patch later to kernel or manpage for either interpretations 
once it gets clarified.

>
>> It looks the code already handles shared page correctly, we just need
>> return -EIO for isolation failed page if MPOL_MF_STRICT is specified.
>>
>>> Thanks,
>>> Yang
>>>
>>>>>        /*
>>>>>         * Avoid migrating a page that is shared with others.
>>>>>         */
>>>>> @@ -984,6 +1001,8 @@ static void migrate_page_add(struct page *page,
>>>>> struct list_head *pagelist,
>>>>>                    hpage_nr_pages(head));
>>>>>            }
>>>>>        }
>>>>> +
>>>>> +    return 0;
>>>>>    }
>>>>>      /* page allocation callback for NUMA node migration */
>>>>> @@ -1186,9 +1205,10 @@ static struct page *new_page(struct page
>>>>> *page, unsigned long start)
>>>>>    }
>>>>>    #else
>>>>>    -static void migrate_page_add(struct page *page, struct list_head
>>>>> *pagelist,
>>>>> +static int migrate_page_add(struct page *page, struct list_head
>>>>> *pagelist,
>>>>>                    unsigned long flags)
>>>>>    {
>>>>> +    return -EIO;
>>>>>    }
>>>>>      int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
>>>>>

^ permalink raw reply

* Re: [v2 PATCH 2/2] mm: mempolicy: handle vma with unmovable pages mapped correctly in mbind
From: Vlastimil Babka @ 2019-07-17 18:50 UTC (permalink / raw)
  To: Yang Shi, mhocko, mgorman, akpm; +Cc: linux-mm, linux-kernel, Linux API
In-Reply-To: <3197a7df-c7bc-2bac-3d40-dbfc97d4a909@linux.alibaba.com>

On 7/17/19 8:23 PM, Yang Shi wrote:
> 
> 
> On 7/16/19 10:28 AM, Yang Shi wrote:
>>
>>
>> On 7/16/19 5:07 AM, Vlastimil Babka wrote:
>>> On 6/22/19 2:20 AM, Yang Shi wrote:
>>>> @@ -969,10 +975,21 @@ static long do_get_mempolicy(int *policy, 
>>>> nodemask_t *nmask,
>>>>   /*
>>>>    * page migration, thp tail pages can be passed.
>>>>    */
>>>> -static void migrate_page_add(struct page *page, struct list_head 
>>>> *pagelist,
>>>> +static int migrate_page_add(struct page *page, struct list_head 
>>>> *pagelist,
>>>>                   unsigned long flags)
>>>>   {
>>>>       struct page *head = compound_head(page);
>>>> +
>>>> +    /*
>>>> +     * Non-movable page may reach here.  And, there may be
>>>> +     * temporaty off LRU pages or non-LRU movable pages.
>>>> +     * Treat them as unmovable pages since they can't be
>>>> +     * isolated, so they can't be moved at the moment.  It
>>>> +     * should return -EIO for this case too.
>>>> +     */
>>>> +    if (!PageLRU(head) && (flags & MPOL_MF_STRICT))
>>>> +        return -EIO;
>>>> +
>>> Hm but !PageLRU() is not the only way why queueing for migration can
>>> fail, as can be seen from the rest of the function. Shouldn't all cases
>>> be reported?
>>
>> Do you mean the shared pages and isolation failed pages? I'm not sure 
>> whether we should consider these cases break the semantics or not, so 
>> I leave them as they are. But, strictly speaking they should be 
>> reported too, at least for the isolation failed page.

CC'd linux-api, should be done on v3 posting also.

> By reading mbind man page, it says:
> 
> If MPOL_MF_MOVE is specified in flags, then the kernel will attempt to 
> move all the existing pages in the memory range so that they follow the 
> policy.  Pages that are shared with other processes will not be moved.  
> If MPOL_MF_STRICT is also specified, then the call fails with the error 
> EIO if some pages could not be moved.

I don't think this means that for shared pages, -EIO should not be
reported. I can imagine both interpretations of the paragraph. I guess
we can be conservative and keep not reporting them, if that was always
the case - but then perhaps clarify the man page?

> It looks the code already handles shared page correctly, we just need 
> return -EIO for isolation failed page if MPOL_MF_STRICT is specified.
> 
>>
>> Thanks,
>> Yang
>>
>>>
>>>>       /*
>>>>        * Avoid migrating a page that is shared with others.
>>>>        */
>>>> @@ -984,6 +1001,8 @@ static void migrate_page_add(struct page *page, 
>>>> struct list_head *pagelist,
>>>>                   hpage_nr_pages(head));
>>>>           }
>>>>       }
>>>> +
>>>> +    return 0;
>>>>   }
>>>>     /* page allocation callback for NUMA node migration */
>>>> @@ -1186,9 +1205,10 @@ static struct page *new_page(struct page 
>>>> *page, unsigned long start)
>>>>   }
>>>>   #else
>>>>   -static void migrate_page_add(struct page *page, struct list_head 
>>>> *pagelist,
>>>> +static int migrate_page_add(struct page *page, struct list_head 
>>>> *pagelist,
>>>>                   unsigned long flags)
>>>>   {
>>>> +    return -EIO;
>>>>   }
>>>>     int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
>>>>
>>
> 

^ permalink raw reply

* Re: [PATCH v2 4/4] tools: Add fchmodat4
From: Arnaldo Carvalho de Melo @ 2019-07-17 12:39 UTC (permalink / raw)
  To: Palmer Dabbelt
  Cc: viro, linux-kernel, linux-fsdevel, linux-api, Arnd Bergmann, rth,
	ink, mattst88, linux, catalin.marinas, will, tony.luck,
	fenghua.yu, geert, monstr, ralf, paul.burton, jhogan,
	James.Bottomley, deller, benh, paulus, mpe, heiko.carstens, gor,
	borntraeger, ysato, dalias, davem, luto, tglx, mingo, bp, hpa,
	x86, peterz
In-Reply-To: <20190717012719.5524-5-palmer@sifive.com>

Em Tue, Jul 16, 2019 at 06:27:19PM -0700, Palmer Dabbelt escreveu:
> I'm not sure why it's necessary to add this explicitly to tools/ as well
> as arch/, but there were a few instances of fspick in here so I blindly
> added fchmodat4 in the same fashion.

The copies in tools/ for these specific files are used to generate a
syscall table used by 'perf trace', and we don't/can't access files
outside of tools/ to build tools/perf/, so we grab a copy and have
checks in place to warn perf developers when those copies get out of
sync.

Its not required that kernel developers update anything in tools, you're
welcomed to do so if you wish tho.

Thanks,

- Arnaldo
 
> Signed-off-by: Palmer Dabbelt <palmer@sifive.com>
> ---
>  tools/include/uapi/asm-generic/unistd.h           | 4 +++-
>  tools/perf/arch/x86/entry/syscalls/syscall_64.tbl | 1 +
>  2 files changed, 4 insertions(+), 1 deletion(-)
> 
> diff --git a/tools/include/uapi/asm-generic/unistd.h b/tools/include/uapi/asm-generic/unistd.h
> index a87904daf103..36232ea94956 100644
> --- a/tools/include/uapi/asm-generic/unistd.h
> +++ b/tools/include/uapi/asm-generic/unistd.h
> @@ -844,9 +844,11 @@ __SYSCALL(__NR_fsconfig, sys_fsconfig)
>  __SYSCALL(__NR_fsmount, sys_fsmount)
>  #define __NR_fspick 433
>  __SYSCALL(__NR_fspick, sys_fspick)
> +#define __NR_fchmodat4 434
> +__SYSCALL(__NR_fchmodat4, sys_fchmodat4)
>  
>  #undef __NR_syscalls
> -#define __NR_syscalls 434
> +#define __NR_syscalls 435
>  
>  /*
>   * 32 bit systems traditionally used different
> diff --git a/tools/perf/arch/x86/entry/syscalls/syscall_64.tbl b/tools/perf/arch/x86/entry/syscalls/syscall_64.tbl
> index b4e6f9e6204a..b92d5b195e66 100644
> --- a/tools/perf/arch/x86/entry/syscalls/syscall_64.tbl
> +++ b/tools/perf/arch/x86/entry/syscalls/syscall_64.tbl
> @@ -355,6 +355,7 @@
>  431	common	fsconfig		__x64_sys_fsconfig
>  432	common	fsmount			__x64_sys_fsmount
>  433	common	fspick			__x64_sys_fspick
> +434	common	fchmodat4		__x64_sys_fchmodat4
>  
>  #
>  # x32-specific system call numbers start at 512 to avoid cache impact
> -- 
> 2.21.0

-- 

- Arnaldo

^ permalink raw reply

* Re: [PATCH v2 3/4] arch: Register fchmodat4, usually as syscall 434
From: Arnd Bergmann @ 2019-07-17  7:54 UTC (permalink / raw)
  To: Palmer Dabbelt
  Cc: Rich Felker, Catalin Marinas, Linux-sh list, Peter Zijlstra,
	Heiko Carstens, Stefan Agner, linux-mips, James Bottomley,
	Namhyung Kim, kim.phillips, Paul Mackerras, Deepa Dinamani,
	H. Peter Anvin, sparclinux, linux-ia64, Will Deacon, linux-arch,
	linux-s390, Hannes Reinecke, Yoshinori Sato, Helge Deller,
	the arch/x86 maintainers, Russell King - ARM Linux, Christian
In-Reply-To: <20190717012719.5524-4-palmer@sifive.com>

On Wed, Jul 17, 2019 at 3:29 AM Palmer Dabbelt <palmer@sifive.com> wrote:
>
> This registers the new fchmodat4 syscall in most places as nuber 434,
> with alpha being the exception where it's 544.  I found all these sites
> by grepping for fspick, which I assume has found me everything.

434 is now pidfd_open, the next free one is 436.

>  arch/alpha/kernel/syscalls/syscall.tbl      | 1 +
>  arch/arm/tools/syscall.tbl                  | 1 +
>  arch/arm64/include/asm/unistd32.h           | 2 ++

You missed arch/arm64/include/asm/unistd.h, which
contains __NR_compat_syscalls

      Arnd

^ permalink raw reply

* Re: [PATCH v2 2/4] Add fchmodat4(), a new syscall
From: Al Viro @ 2019-07-17  3:02 UTC (permalink / raw)
  To: Rich Felker
  Cc: Palmer Dabbelt, linux-kernel, linux-fsdevel, linux-api,
	Arnd Bergmann, rth, ink, mattst88, linux, catalin.marinas, will,
	tony.luck, fenghua.yu, geert, monstr, ralf, paul.burton, jhogan,
	James.Bottomley, deller, benh, paulus, mpe, heiko.carstens, gor,
	borntraeger, ysato, davem, luto, tglx, mingo, bp, hpa, x86,
	peterz, acme@
In-Reply-To: <20190717024046.GI1506@brightrain.aerifal.cx>

On Tue, Jul 16, 2019 at 10:40:46PM -0400, Rich Felker wrote:
> On Tue, Jul 16, 2019 at 06:27:17PM -0700, Palmer Dabbelt wrote:
> > man 3p says that fchmodat() takes a flags argument, but the Linux
> > syscall does not.  There doesn't appear to be a good userspace
> > workaround for this issue but the implementation in the kernel is pretty
> > straight-forward.  The specific use case where the missing flags came up
> > was WRT a fuse filesystem implemenation, but the functionality is pretty
> > generic so I'm assuming there would be other use cases.
> 
> Note that we do have a workaround in musl libc with O_PATH and
> /proc/self/fd, but a syscall that allows a proper fix with the ugly
> workaround only in the fallback path for old kernels will be much
> appreciated!
> 
> What about also doing a new SYS_faccessat4 with working AT_EACCESS
> flag? The workaround we have to do for it is far worse.

Umm...  That's doable, but getting into the "don't switch creds unless
needed" territory.  I'll need to play with that a bit and see what
gives a tolerable variant...

What of this part wrt AT_EACCESS?
        if (!issecure(SECURE_NO_SETUID_FIXUP)) {
                /* Clear the capabilities if we switch to a non-root user */
                kuid_t root_uid = make_kuid(override_cred->user_ns, 0);
                if (!uid_eq(override_cred->uid, root_uid))
                        cap_clear(override_cred->cap_effective);
                else
                        override_cred->cap_effective =
                                override_cred->cap_permitted;
        }

^ permalink raw reply

* Re: [PATCH v2 2/4] Add fchmodat4(), a new syscall
From: Rich Felker @ 2019-07-17  2:40 UTC (permalink / raw)
  To: Palmer Dabbelt
  Cc: viro, linux-kernel, linux-fsdevel, linux-api, Arnd Bergmann, rth,
	ink, mattst88, linux, catalin.marinas, will, tony.luck,
	fenghua.yu, geert, monstr, ralf, paul.burton, jhogan,
	James.Bottomley, deller, benh, paulus, mpe, heiko.carstens, gor,
	borntraeger, ysato, davem, luto, tglx, mingo, bp, hpa, x86,
	peterz, acme
In-Reply-To: <20190717012719.5524-3-palmer@sifive.com>

On Tue, Jul 16, 2019 at 06:27:17PM -0700, Palmer Dabbelt wrote:
> man 3p says that fchmodat() takes a flags argument, but the Linux
> syscall does not.  There doesn't appear to be a good userspace
> workaround for this issue but the implementation in the kernel is pretty
> straight-forward.  The specific use case where the missing flags came up
> was WRT a fuse filesystem implemenation, but the functionality is pretty
> generic so I'm assuming there would be other use cases.

Note that we do have a workaround in musl libc with O_PATH and
/proc/self/fd, but a syscall that allows a proper fix with the ugly
workaround only in the fallback path for old kernels will be much
appreciated!

What about also doing a new SYS_faccessat4 with working AT_EACCESS
flag? The workaround we have to do for it is far worse.

Rich

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox