* [PATCH] kernel: add pid_max to pid_namespace
@ 2024-09-02 11:49 Yun Zhou
2024-10-04 19:05 ` Steven Rostedt
0 siblings, 1 reply; 6+ messages in thread
From: Yun Zhou @ 2024-09-02 11:49 UTC (permalink / raw)
To: mcgrof, keescook, yzaikin, rostedt, mhiramat, mathieu.desnoyers,
yun.zhou
Cc: linux-kernel, linux-fsdevel, linux-trace-kernel
Signed-off-by: Yun Zhou <yun.zhou@windriver.com>
---
include/linux/pid_namespace.h | 1 +
kernel/pid.c | 12 ++++++------
kernel/pid_namespace.c | 33 ++++++++++++++++++++++++++++-----
kernel/sysctl.c | 9 ---------
kernel/trace/pid_list.c | 2 +-
kernel/trace/trace.c | 2 +-
kernel/trace/trace.h | 2 --
7 files changed, 37 insertions(+), 24 deletions(-)
diff --git a/include/linux/pid_namespace.h b/include/linux/pid_namespace.h
index f9f9931e02d6..0e3c18f3cac5 100644
--- a/include/linux/pid_namespace.h
+++ b/include/linux/pid_namespace.h
@@ -27,6 +27,7 @@ struct pid_namespace {
struct idr idr;
struct rcu_head rcu;
unsigned int pid_allocated;
+ int pid_max;
struct task_struct *child_reaper;
struct kmem_cache *pid_cachep;
unsigned int level;
diff --git a/kernel/pid.c b/kernel/pid.c
index 6500ef956f2f..14da3f68ceed 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -59,8 +59,6 @@ struct pid init_struct_pid = {
}, }
};
-int pid_max = PID_MAX_DEFAULT;
-
#define RESERVED_PIDS 300
int pid_max_min = RESERVED_PIDS + 1;
@@ -74,6 +72,7 @@ int pid_max_max = PID_MAX_LIMIT;
*/
struct pid_namespace init_pid_ns = {
.ns.count = REFCOUNT_INIT(2),
+ .pid_max = PID_MAX_DEFAULT,
.idr = IDR_INIT(init_pid_ns.idr),
.pid_allocated = PIDNS_ADDING,
.level = 0,
@@ -194,7 +193,7 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid,
tid = set_tid[ns->level - i];
retval = -EINVAL;
- if (tid < 1 || tid >= pid_max)
+ if (tid < 1 || tid >= tmp->pid_max)
goto out_free;
/*
* Also fail if a PID != 1 is requested and
@@ -234,7 +233,7 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid,
* a partially initialized PID (see below).
*/
nr = idr_alloc_cyclic(&tmp->idr, NULL, pid_min,
- pid_max, GFP_ATOMIC);
+ tmp->pid_max, GFP_ATOMIC);
}
spin_unlock_irq(&pidmap_lock);
idr_preload_end();
@@ -651,11 +650,12 @@ void __init pid_idr_init(void)
BUILD_BUG_ON(PID_MAX_LIMIT >= PIDNS_ADDING);
/* bump default and minimum pid_max based on number of cpus */
- pid_max = min(pid_max_max, max_t(int, pid_max,
+ init_pid_ns.pid_max = min(pid_max_max, max_t(int, init_pid_ns.pid_max,
PIDS_PER_CPU_DEFAULT * num_possible_cpus()));
pid_max_min = max_t(int, pid_max_min,
PIDS_PER_CPU_MIN * num_possible_cpus());
- pr_info("pid_max: default: %u minimum: %u\n", pid_max, pid_max_min);
+ pr_info("pid_max: default: %u minimum: %u\n", init_pid_ns.pid_max,
+ pid_max_min);
idr_init(&init_pid_ns.idr);
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index 3028b2218aa4..d6b3f34ecb25 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -110,6 +110,7 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns
ns->user_ns = get_user_ns(user_ns);
ns->ucounts = ucounts;
ns->pid_allocated = PIDNS_ADDING;
+ ns->pid_max = parent_pid_ns->pid_max;
#if defined(CONFIG_SYSCTL) && defined(CONFIG_MEMFD_CREATE)
ns->memfd_noexec_scope = pidns_memfd_noexec_scope(parent_pid_ns);
#endif
@@ -295,20 +296,44 @@ static int pid_ns_ctl_handler(struct ctl_table *table, int write,
return ret;
}
+#endif /* CONFIG_CHECKPOINT_RESTORE */
+
+static int pid_max_ns_ctl_handler(struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ struct pid_namespace *pid_ns = task_active_pid_ns(current);
+
+ if (write && !checkpoint_restore_ns_capable(pid_ns->user_ns))
+ return -EPERM;
+
+ table->data = &pid_ns->pid_max;
+ if (pid_ns->parent)
+ table->extra2 = &pid_ns->parent->pid_max;
+
+ return proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+}
-extern int pid_max;
static struct ctl_table pid_ns_ctl_table[] = {
+#ifdef CONFIG_CHECKPOINT_RESTORE
{
.procname = "ns_last_pid",
.maxlen = sizeof(int),
.mode = 0666, /* permissions are checked in the handler */
.proc_handler = pid_ns_ctl_handler,
.extra1 = SYSCTL_ZERO,
- .extra2 = &pid_max,
+ .extra2 = &init_pid_ns.pid_max,
+ },
+#endif /* CONFIG_CHECKPOINT_RESTORE */
+ {
+ .procname = "pid_max",
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = pid_max_ns_ctl_handler,
+ .extra1 = &pid_max_min,
+ .extra2 = &pid_max_max,
},
{ }
};
-#endif /* CONFIG_CHECKPOINT_RESTORE */
int reboot_pid_ns(struct pid_namespace *pid_ns, int cmd)
{
@@ -465,9 +490,7 @@ static __init int pid_namespaces_init(void)
{
pid_ns_cachep = KMEM_CACHE(pid_namespace, SLAB_PANIC | SLAB_ACCOUNT);
-#ifdef CONFIG_CHECKPOINT_RESTORE
register_sysctl_init("kernel", pid_ns_ctl_table);
-#endif
register_pid_ns_sysctl_table_vm();
return 0;
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 157f7ce2942d..857bfdb39b15 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1809,15 +1809,6 @@ static struct ctl_table kern_table[] = {
.proc_handler = proc_dointvec,
},
#endif
- {
- .procname = "pid_max",
- .data = &pid_max,
- .maxlen = sizeof (int),
- .mode = 0644,
- .proc_handler = proc_dointvec_minmax,
- .extra1 = &pid_max_min,
- .extra2 = &pid_max_max,
- },
{
.procname = "panic_on_oops",
.data = &panic_on_oops,
diff --git a/kernel/trace/pid_list.c b/kernel/trace/pid_list.c
index 95106d02b32d..ef52820e6719 100644
--- a/kernel/trace/pid_list.c
+++ b/kernel/trace/pid_list.c
@@ -414,7 +414,7 @@ struct trace_pid_list *trace_pid_list_alloc(void)
int i;
/* According to linux/thread.h, pids can be no bigger that 30 bits */
- WARN_ON_ONCE(pid_max > (1 << 30));
+ WARN_ON_ONCE(init_pid_ns.pid_max > (1 << 30));
pid_list = kzalloc(sizeof(*pid_list), GFP_KERNEL);
if (!pid_list)
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index fbcd3bafb93e..6295679ce16c 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -5415,7 +5415,7 @@ int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled)
if (mask == TRACE_ITER_RECORD_TGID) {
if (!tgid_map) {
- tgid_map_max = pid_max;
+ tgid_map_max = init_pid_ns.pid_max;
map = kvcalloc(tgid_map_max + 1, sizeof(*tgid_map),
GFP_KERNEL);
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index b7f4ea25a194..df61b1db86a2 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -700,8 +700,6 @@ extern unsigned long tracing_thresh;
/* PID filtering */
-extern int pid_max;
-
bool trace_find_filtered_pid(struct trace_pid_list *filtered_pids,
pid_t search_pid);
bool trace_ignore_this_task(struct trace_pid_list *filtered_pids,
--
2.27.0
^ permalink raw reply related [flat|nested] 6+ messages in thread
* Re: [PATCH] kernel: add pid_max to pid_namespace
2024-09-02 11:49 Yun Zhou
@ 2024-10-04 19:05 ` Steven Rostedt
0 siblings, 0 replies; 6+ messages in thread
From: Steven Rostedt @ 2024-10-04 19:05 UTC (permalink / raw)
To: Yun Zhou
Cc: mcgrof, keescook, yzaikin, mhiramat, mathieu.desnoyers,
linux-kernel, linux-fsdevel, linux-trace-kernel
On Mon, 2 Sep 2024 19:49:20 +0800
Yun Zhou <yun.zhou@windriver.com> wrote:
-ENOCHANGELOG
What? Why? Why should I care about this?
A change log *must* have all the information to say why this change is
necessary. It's OK for the subject to state what it is doing, but there
most definitely needs to be a "why?" in the change log.
-- Steve
> Signed-off-by: Yun Zhou <yun.zhou@windriver.com>
> ---
> include/linux/pid_namespace.h | 1 +
> kernel/pid.c | 12 ++++++------
> kernel/pid_namespace.c | 33 ++++++++++++++++++++++++++++-----
> kernel/sysctl.c | 9 ---------
> kernel/trace/pid_list.c | 2 +-
> kernel/trace/trace.c | 2 +-
> kernel/trace/trace.h | 2 --
> 7 files changed, 37 insertions(+), 24 deletions(-)
>
> diff --git a/include/linux/pid_namespace.h b/include/linux/pid_namespace.h
> index f9f9931e02d6..0e3c18f3cac5 100644
> --- a/include/linux/pid_namespace.h
> +++ b/include/linux/pid_namespace.h
> @@ -27,6 +27,7 @@ struct pid_namespace {
> struct idr idr;
> struct rcu_head rcu;
> unsigned int pid_allocated;
> + int pid_max;
> struct task_struct *child_reaper;
> struct kmem_cache *pid_cachep;
> unsigned int level;
> diff --git a/kernel/pid.c b/kernel/pid.c
> index 6500ef956f2f..14da3f68ceed 100644
> --- a/kernel/pid.c
> +++ b/kernel/pid.c
> @@ -59,8 +59,6 @@ struct pid init_struct_pid = {
> }, }
> };
>
> -int pid_max = PID_MAX_DEFAULT;
> -
> #define RESERVED_PIDS 300
>
> int pid_max_min = RESERVED_PIDS + 1;
> @@ -74,6 +72,7 @@ int pid_max_max = PID_MAX_LIMIT;
> */
> struct pid_namespace init_pid_ns = {
> .ns.count = REFCOUNT_INIT(2),
> + .pid_max = PID_MAX_DEFAULT,
> .idr = IDR_INIT(init_pid_ns.idr),
> .pid_allocated = PIDNS_ADDING,
> .level = 0,
> @@ -194,7 +193,7 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid,
> tid = set_tid[ns->level - i];
>
> retval = -EINVAL;
> - if (tid < 1 || tid >= pid_max)
> + if (tid < 1 || tid >= tmp->pid_max)
> goto out_free;
> /*
> * Also fail if a PID != 1 is requested and
> @@ -234,7 +233,7 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid,
> * a partially initialized PID (see below).
> */
> nr = idr_alloc_cyclic(&tmp->idr, NULL, pid_min,
> - pid_max, GFP_ATOMIC);
> + tmp->pid_max, GFP_ATOMIC);
> }
> spin_unlock_irq(&pidmap_lock);
> idr_preload_end();
> @@ -651,11 +650,12 @@ void __init pid_idr_init(void)
> BUILD_BUG_ON(PID_MAX_LIMIT >= PIDNS_ADDING);
>
> /* bump default and minimum pid_max based on number of cpus */
> - pid_max = min(pid_max_max, max_t(int, pid_max,
> + init_pid_ns.pid_max = min(pid_max_max, max_t(int, init_pid_ns.pid_max,
> PIDS_PER_CPU_DEFAULT * num_possible_cpus()));
> pid_max_min = max_t(int, pid_max_min,
> PIDS_PER_CPU_MIN * num_possible_cpus());
> - pr_info("pid_max: default: %u minimum: %u\n", pid_max, pid_max_min);
> + pr_info("pid_max: default: %u minimum: %u\n", init_pid_ns.pid_max,
> + pid_max_min);
>
> idr_init(&init_pid_ns.idr);
>
> diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
> index 3028b2218aa4..d6b3f34ecb25 100644
> --- a/kernel/pid_namespace.c
> +++ b/kernel/pid_namespace.c
> @@ -110,6 +110,7 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns
> ns->user_ns = get_user_ns(user_ns);
> ns->ucounts = ucounts;
> ns->pid_allocated = PIDNS_ADDING;
> + ns->pid_max = parent_pid_ns->pid_max;
> #if defined(CONFIG_SYSCTL) && defined(CONFIG_MEMFD_CREATE)
> ns->memfd_noexec_scope = pidns_memfd_noexec_scope(parent_pid_ns);
> #endif
> @@ -295,20 +296,44 @@ static int pid_ns_ctl_handler(struct ctl_table *table, int write,
>
> return ret;
> }
> +#endif /* CONFIG_CHECKPOINT_RESTORE */
> +
> +static int pid_max_ns_ctl_handler(struct ctl_table *table, int write,
> + void *buffer, size_t *lenp, loff_t *ppos)
> +{
> + struct pid_namespace *pid_ns = task_active_pid_ns(current);
> +
> + if (write && !checkpoint_restore_ns_capable(pid_ns->user_ns))
> + return -EPERM;
> +
> + table->data = &pid_ns->pid_max;
> + if (pid_ns->parent)
> + table->extra2 = &pid_ns->parent->pid_max;
> +
> + return proc_dointvec_minmax(table, write, buffer, lenp, ppos);
> +}
>
> -extern int pid_max;
> static struct ctl_table pid_ns_ctl_table[] = {
> +#ifdef CONFIG_CHECKPOINT_RESTORE
> {
> .procname = "ns_last_pid",
> .maxlen = sizeof(int),
> .mode = 0666, /* permissions are checked in the handler */
> .proc_handler = pid_ns_ctl_handler,
> .extra1 = SYSCTL_ZERO,
> - .extra2 = &pid_max,
> + .extra2 = &init_pid_ns.pid_max,
> + },
> +#endif /* CONFIG_CHECKPOINT_RESTORE */
> + {
> + .procname = "pid_max",
> + .maxlen = sizeof(int),
> + .mode = 0644,
> + .proc_handler = pid_max_ns_ctl_handler,
> + .extra1 = &pid_max_min,
> + .extra2 = &pid_max_max,
> },
> { }
> };
> -#endif /* CONFIG_CHECKPOINT_RESTORE */
>
> int reboot_pid_ns(struct pid_namespace *pid_ns, int cmd)
> {
> @@ -465,9 +490,7 @@ static __init int pid_namespaces_init(void)
> {
> pid_ns_cachep = KMEM_CACHE(pid_namespace, SLAB_PANIC | SLAB_ACCOUNT);
>
> -#ifdef CONFIG_CHECKPOINT_RESTORE
> register_sysctl_init("kernel", pid_ns_ctl_table);
> -#endif
>
> register_pid_ns_sysctl_table_vm();
> return 0;
> diff --git a/kernel/sysctl.c b/kernel/sysctl.c
> index 157f7ce2942d..857bfdb39b15 100644
> --- a/kernel/sysctl.c
> +++ b/kernel/sysctl.c
> @@ -1809,15 +1809,6 @@ static struct ctl_table kern_table[] = {
> .proc_handler = proc_dointvec,
> },
> #endif
> - {
> - .procname = "pid_max",
> - .data = &pid_max,
> - .maxlen = sizeof (int),
> - .mode = 0644,
> - .proc_handler = proc_dointvec_minmax,
> - .extra1 = &pid_max_min,
> - .extra2 = &pid_max_max,
> - },
> {
> .procname = "panic_on_oops",
> .data = &panic_on_oops,
> diff --git a/kernel/trace/pid_list.c b/kernel/trace/pid_list.c
> index 95106d02b32d..ef52820e6719 100644
> --- a/kernel/trace/pid_list.c
> +++ b/kernel/trace/pid_list.c
> @@ -414,7 +414,7 @@ struct trace_pid_list *trace_pid_list_alloc(void)
> int i;
>
> /* According to linux/thread.h, pids can be no bigger that 30 bits */
> - WARN_ON_ONCE(pid_max > (1 << 30));
> + WARN_ON_ONCE(init_pid_ns.pid_max > (1 << 30));
>
> pid_list = kzalloc(sizeof(*pid_list), GFP_KERNEL);
> if (!pid_list)
> diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
> index fbcd3bafb93e..6295679ce16c 100644
> --- a/kernel/trace/trace.c
> +++ b/kernel/trace/trace.c
> @@ -5415,7 +5415,7 @@ int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled)
>
> if (mask == TRACE_ITER_RECORD_TGID) {
> if (!tgid_map) {
> - tgid_map_max = pid_max;
> + tgid_map_max = init_pid_ns.pid_max;
> map = kvcalloc(tgid_map_max + 1, sizeof(*tgid_map),
> GFP_KERNEL);
>
> diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
> index b7f4ea25a194..df61b1db86a2 100644
> --- a/kernel/trace/trace.h
> +++ b/kernel/trace/trace.h
> @@ -700,8 +700,6 @@ extern unsigned long tracing_thresh;
>
> /* PID filtering */
>
> -extern int pid_max;
> -
> bool trace_find_filtered_pid(struct trace_pid_list *filtered_pids,
> pid_t search_pid);
> bool trace_ignore_this_task(struct trace_pid_list *filtered_pids,
^ permalink raw reply [flat|nested] 6+ messages in thread
* [PATCH] kernel: add pid_max to pid_namespace
@ 2024-10-30 5:29 Yun Zhou
2024-11-01 11:51 ` Joel Granados
2024-11-01 13:23 ` Steven Rostedt
0 siblings, 2 replies; 6+ messages in thread
From: Yun Zhou @ 2024-10-30 5:29 UTC (permalink / raw)
To: mcgrof, kees, joel.granados, rostedt, mhiramat, mathieu.desnoyers,
yun.zhou
Cc: linux-kernel, linux-fsdevel, linux-trace-kernel
It is necessary to have a different pid_max in different containers.
For example, multiple containers are running on a host, one of which
is Android, and its 32 bit bionic libc only accepts pid <= 65535. So
it requires the global pid_max <= 65535. This will cause configuration
conflicts with other containers and also limit the maximum number of
tasks for the entire system.
Signed-off-by: Yun Zhou <yun.zhou@windriver.com>
---
include/linux/pid_namespace.h | 1 +
kernel/pid.c | 12 +++++------
kernel/pid_namespace.c | 35 ++++++++++++++++++++++++++-----
kernel/sysctl.c | 9 --------
kernel/trace/pid_list.c | 2 +-
kernel/trace/trace.h | 2 --
kernel/trace/trace_sched_switch.c | 2 +-
7 files changed, 39 insertions(+), 24 deletions(-)
diff --git a/include/linux/pid_namespace.h b/include/linux/pid_namespace.h
index f9f9931e02d6..064cfe2542fc 100644
--- a/include/linux/pid_namespace.h
+++ b/include/linux/pid_namespace.h
@@ -38,6 +38,7 @@ struct pid_namespace {
struct ucounts *ucounts;
int reboot; /* group exit code if this pidns was rebooted */
struct ns_common ns;
+ int pid_max;
#if defined(CONFIG_SYSCTL) && defined(CONFIG_MEMFD_CREATE)
int memfd_noexec_scope;
#endif
diff --git a/kernel/pid.c b/kernel/pid.c
index 2715afb77eab..f8026a61436b 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -60,8 +60,6 @@ struct pid init_struct_pid = {
}, }
};
-int pid_max = PID_MAX_DEFAULT;
-
int pid_max_min = RESERVED_PIDS + 1;
int pid_max_max = PID_MAX_LIMIT;
/*
@@ -78,6 +76,7 @@ static u64 pidfs_ino = RESERVED_PIDS;
*/
struct pid_namespace init_pid_ns = {
.ns.count = REFCOUNT_INIT(2),
+ .pid_max = PID_MAX_DEFAULT,
.idr = IDR_INIT(init_pid_ns.idr),
.pid_allocated = PIDNS_ADDING,
.level = 0,
@@ -198,7 +197,7 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid,
tid = set_tid[ns->level - i];
retval = -EINVAL;
- if (tid < 1 || tid >= pid_max)
+ if (tid < 1 || tid >= tmp->pid_max)
goto out_free;
/*
* Also fail if a PID != 1 is requested and
@@ -238,7 +237,7 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid,
* a partially initialized PID (see below).
*/
nr = idr_alloc_cyclic(&tmp->idr, NULL, pid_min,
- pid_max, GFP_ATOMIC);
+ tmp->pid_max, GFP_ATOMIC);
}
spin_unlock_irq(&pidmap_lock);
idr_preload_end();
@@ -653,11 +652,12 @@ void __init pid_idr_init(void)
BUILD_BUG_ON(PID_MAX_LIMIT >= PIDNS_ADDING);
/* bump default and minimum pid_max based on number of cpus */
- pid_max = min(pid_max_max, max_t(int, pid_max,
+ init_pid_ns.pid_max = min(pid_max_max, max_t(int, init_pid_ns.pid_max,
PIDS_PER_CPU_DEFAULT * num_possible_cpus()));
pid_max_min = max_t(int, pid_max_min,
PIDS_PER_CPU_MIN * num_possible_cpus());
- pr_info("pid_max: default: %u minimum: %u\n", pid_max, pid_max_min);
+ pr_info("pid_max: default: %u minimum: %u\n", init_pid_ns.pid_max,
+ pid_max_min);
idr_init(&init_pid_ns.idr);
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index d70ab49d5b4a..d8ddc0c56599 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -111,6 +111,7 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns
ns->user_ns = get_user_ns(user_ns);
ns->ucounts = ucounts;
ns->pid_allocated = PIDNS_ADDING;
+ ns->pid_max = parent_pid_ns->pid_max;
#if defined(CONFIG_SYSCTL) && defined(CONFIG_MEMFD_CREATE)
ns->memfd_noexec_scope = pidns_memfd_noexec_scope(parent_pid_ns);
#endif
@@ -280,19 +281,45 @@ static int pid_ns_ctl_handler(const struct ctl_table *table, int write,
return ret;
}
+#endif /* CONFIG_CHECKPOINT_RESTORE */
+
+static int pid_max_ns_ctl_handler(const struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ struct pid_namespace *pid_ns = task_active_pid_ns(current);
+ struct ctl_table tmp = *table;
+
+ if (write && !checkpoint_restore_ns_capable(pid_ns->user_ns))
+ return -EPERM;
+
+ tmp.data = &pid_ns->pid_max;
+ if (pid_ns->parent)
+ tmp.extra2 = &pid_ns->parent->pid_max;
+
+ return proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
+}
-extern int pid_max;
static struct ctl_table pid_ns_ctl_table[] = {
+#ifdef CONFIG_CHECKPOINT_RESTORE
{
.procname = "ns_last_pid",
.maxlen = sizeof(int),
.mode = 0666, /* permissions are checked in the handler */
.proc_handler = pid_ns_ctl_handler,
.extra1 = SYSCTL_ZERO,
- .extra2 = &pid_max,
+ .extra2 = &init_pid_ns.pid_max,
},
-};
#endif /* CONFIG_CHECKPOINT_RESTORE */
+ {
+ .procname = "pid_max",
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = pid_max_ns_ctl_handler,
+ .extra1 = &pid_max_min,
+ .extra2 = &pid_max_max,
+ },
+ { }
+};
int reboot_pid_ns(struct pid_namespace *pid_ns, int cmd)
{
@@ -449,9 +476,7 @@ static __init int pid_namespaces_init(void)
{
pid_ns_cachep = KMEM_CACHE(pid_namespace, SLAB_PANIC | SLAB_ACCOUNT);
-#ifdef CONFIG_CHECKPOINT_RESTORE
register_sysctl_init("kernel", pid_ns_ctl_table);
-#endif
register_pid_ns_sysctl_table_vm();
return 0;
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 79e6cb1d5c48..676a0d675e7f 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1804,15 +1804,6 @@ static struct ctl_table kern_table[] = {
.proc_handler = proc_dointvec,
},
#endif
- {
- .procname = "pid_max",
- .data = &pid_max,
- .maxlen = sizeof (int),
- .mode = 0644,
- .proc_handler = proc_dointvec_minmax,
- .extra1 = &pid_max_min,
- .extra2 = &pid_max_max,
- },
{
.procname = "panic_on_oops",
.data = &panic_on_oops,
diff --git a/kernel/trace/pid_list.c b/kernel/trace/pid_list.c
index 4966e6bbdf6f..c62b9b3cfb3d 100644
--- a/kernel/trace/pid_list.c
+++ b/kernel/trace/pid_list.c
@@ -414,7 +414,7 @@ struct trace_pid_list *trace_pid_list_alloc(void)
int i;
/* According to linux/thread.h, pids can be no bigger that 30 bits */
- WARN_ON_ONCE(pid_max > (1 << 30));
+ WARN_ON_ONCE(init_pid_ns.pid_max > (1 << 30));
pid_list = kzalloc(sizeof(*pid_list), GFP_KERNEL);
if (!pid_list)
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index c866991b9c78..e51851d64e4d 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -715,8 +715,6 @@ extern unsigned long tracing_thresh;
/* PID filtering */
-extern int pid_max;
-
bool trace_find_filtered_pid(struct trace_pid_list *filtered_pids,
pid_t search_pid);
bool trace_ignore_this_task(struct trace_pid_list *filtered_pids,
diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c
index 8a407adb0e1c..c20c80abe065 100644
--- a/kernel/trace/trace_sched_switch.c
+++ b/kernel/trace/trace_sched_switch.c
@@ -442,7 +442,7 @@ int trace_alloc_tgid_map(void)
if (tgid_map)
return 0;
- tgid_map_max = pid_max;
+ tgid_map_max = init_pid_ns.pid_max;
map = kvcalloc(tgid_map_max + 1, sizeof(*tgid_map),
GFP_KERNEL);
if (!map)
--
2.27.0
^ permalink raw reply related [flat|nested] 6+ messages in thread
* Re: [PATCH] kernel: add pid_max to pid_namespace
2024-10-30 5:29 Yun Zhou
@ 2024-11-01 11:51 ` Joel Granados
2024-11-01 13:23 ` Steven Rostedt
1 sibling, 0 replies; 6+ messages in thread
From: Joel Granados @ 2024-11-01 11:51 UTC (permalink / raw)
To: Yun Zhou
Cc: mcgrof, kees, rostedt, mhiramat, mathieu.desnoyers, linux-kernel,
linux-fsdevel, linux-trace-kernel
On Wed, Oct 30, 2024 at 01:29:33PM +0800, Yun Zhou wrote:
> It is necessary to have a different pid_max in different containers.
> For example, multiple containers are running on a host, one of which
> is Android, and its 32 bit bionic libc only accepts pid <= 65535. So
> it requires the global pid_max <= 65535. This will cause configuration
> conflicts with other containers and also limit the maximum number of
> tasks for the entire system.
>
> Signed-off-by: Yun Zhou <yun.zhou@windriver.com>
> ---
> include/linux/pid_namespace.h | 1 +
> kernel/pid.c | 12 +++++------
> kernel/pid_namespace.c | 35 ++++++++++++++++++++++++++-----
> kernel/sysctl.c | 9 --------
> kernel/trace/pid_list.c | 2 +-
> kernel/trace/trace.h | 2 --
> kernel/trace/trace_sched_switch.c | 2 +-
> 7 files changed, 39 insertions(+), 24 deletions(-)
>
> diff --git a/include/linux/pid_namespace.h b/include/linux/pid_namespace.h
> index f9f9931e02d6..064cfe2542fc 100644
> --- a/include/linux/pid_namespace.h
> +++ b/include/linux/pid_namespace.h
> @@ -38,6 +38,7 @@ struct pid_namespace {
> struct ucounts *ucounts;
> int reboot; /* group exit code if this pidns was rebooted */
> struct ns_common ns;
> + int pid_max;
> #if defined(CONFIG_SYSCTL) && defined(CONFIG_MEMFD_CREATE)
> int memfd_noexec_scope;
> #endif
> diff --git a/kernel/pid.c b/kernel/pid.c
> index 2715afb77eab..f8026a61436b 100644
> --- a/kernel/pid.c
> +++ b/kernel/pid.c
> @@ -60,8 +60,6 @@ struct pid init_struct_pid = {
> }, }
> };
>
> -int pid_max = PID_MAX_DEFAULT;
> -
> int pid_max_min = RESERVED_PIDS + 1;
> int pid_max_max = PID_MAX_LIMIT;
> /*
> @@ -78,6 +76,7 @@ static u64 pidfs_ino = RESERVED_PIDS;
> */
> struct pid_namespace init_pid_ns = {
> .ns.count = REFCOUNT_INIT(2),
> + .pid_max = PID_MAX_DEFAULT,
> .idr = IDR_INIT(init_pid_ns.idr),
> .pid_allocated = PIDNS_ADDING,
> .level = 0,
> @@ -198,7 +197,7 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid,
> tid = set_tid[ns->level - i];
>
> retval = -EINVAL;
> - if (tid < 1 || tid >= pid_max)
> + if (tid < 1 || tid >= tmp->pid_max)
> goto out_free;
> /*
> * Also fail if a PID != 1 is requested and
> @@ -238,7 +237,7 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid,
> * a partially initialized PID (see below).
> */
> nr = idr_alloc_cyclic(&tmp->idr, NULL, pid_min,
> - pid_max, GFP_ATOMIC);
> + tmp->pid_max, GFP_ATOMIC);
> }
> spin_unlock_irq(&pidmap_lock);
> idr_preload_end();
> @@ -653,11 +652,12 @@ void __init pid_idr_init(void)
> BUILD_BUG_ON(PID_MAX_LIMIT >= PIDNS_ADDING);
>
> /* bump default and minimum pid_max based on number of cpus */
> - pid_max = min(pid_max_max, max_t(int, pid_max,
> + init_pid_ns.pid_max = min(pid_max_max, max_t(int, init_pid_ns.pid_max,
> PIDS_PER_CPU_DEFAULT * num_possible_cpus()));
> pid_max_min = max_t(int, pid_max_min,
> PIDS_PER_CPU_MIN * num_possible_cpus());
> - pr_info("pid_max: default: %u minimum: %u\n", pid_max, pid_max_min);
> + pr_info("pid_max: default: %u minimum: %u\n", init_pid_ns.pid_max,
> + pid_max_min);
>
> idr_init(&init_pid_ns.idr);
>
> diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
> index d70ab49d5b4a..d8ddc0c56599 100644
> --- a/kernel/pid_namespace.c
> +++ b/kernel/pid_namespace.c
> @@ -111,6 +111,7 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns
> ns->user_ns = get_user_ns(user_ns);
> ns->ucounts = ucounts;
> ns->pid_allocated = PIDNS_ADDING;
> + ns->pid_max = parent_pid_ns->pid_max;
> #if defined(CONFIG_SYSCTL) && defined(CONFIG_MEMFD_CREATE)
> ns->memfd_noexec_scope = pidns_memfd_noexec_scope(parent_pid_ns);
> #endif
> @@ -280,19 +281,45 @@ static int pid_ns_ctl_handler(const struct ctl_table *table, int write,
>
> return ret;
> }
> +#endif /* CONFIG_CHECKPOINT_RESTORE */
> +
> +static int pid_max_ns_ctl_handler(const struct ctl_table *table, int write,
> + void *buffer, size_t *lenp, loff_t *ppos)
> +{
> + struct pid_namespace *pid_ns = task_active_pid_ns(current);
> + struct ctl_table tmp = *table;
> +
> + if (write && !checkpoint_restore_ns_capable(pid_ns->user_ns))
> + return -EPERM;
> +
> + tmp.data = &pid_ns->pid_max;
> + if (pid_ns->parent)
> + tmp.extra2 = &pid_ns->parent->pid_max;
> +
> + return proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
> +}
>
> -extern int pid_max;
> static struct ctl_table pid_ns_ctl_table[] = {
> +#ifdef CONFIG_CHECKPOINT_RESTORE
> {
> .procname = "ns_last_pid",
> .maxlen = sizeof(int),
> .mode = 0666, /* permissions are checked in the handler */
> .proc_handler = pid_ns_ctl_handler,
> .extra1 = SYSCTL_ZERO,
> - .extra2 = &pid_max,
> + .extra2 = &init_pid_ns.pid_max,
> },
> -};
> #endif /* CONFIG_CHECKPOINT_RESTORE */
> + {
> + .procname = "pid_max",
> + .maxlen = sizeof(int),
> + .mode = 0644,
> + .proc_handler = pid_max_ns_ctl_handler,
> + .extra1 = &pid_max_min,
> + .extra2 = &pid_max_max,
> + },
> + { }
There is no longer any need for sentinels in ctl_table arrays. Please
remove this one for your next version.
Best
--
Joel Granados
^ permalink raw reply [flat|nested] 6+ messages in thread
* Re: [PATCH] kernel: add pid_max to pid_namespace
2024-10-30 5:29 Yun Zhou
2024-11-01 11:51 ` Joel Granados
@ 2024-11-01 13:23 ` Steven Rostedt
1 sibling, 0 replies; 6+ messages in thread
From: Steven Rostedt @ 2024-11-01 13:23 UTC (permalink / raw)
To: Yun Zhou
Cc: mcgrof, kees, joel.granados, mhiramat, mathieu.desnoyers,
linux-kernel, linux-fsdevel, linux-trace-kernel
On Wed, 30 Oct 2024 13:29:33 +0800
Yun Zhou <yun.zhou@windriver.com> wrote:
> diff --git a/kernel/trace/pid_list.c b/kernel/trace/pid_list.c
> index 4966e6bbdf6f..c62b9b3cfb3d 100644
> --- a/kernel/trace/pid_list.c
> +++ b/kernel/trace/pid_list.c
> @@ -414,7 +414,7 @@ struct trace_pid_list *trace_pid_list_alloc(void)
> int i;
>
> /* According to linux/thread.h, pids can be no bigger that 30 bits */
> - WARN_ON_ONCE(pid_max > (1 << 30));
> + WARN_ON_ONCE(init_pid_ns.pid_max > (1 << 30));
>
> pid_list = kzalloc(sizeof(*pid_list), GFP_KERNEL);
> if (!pid_list)
> diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
> index c866991b9c78..e51851d64e4d 100644
> --- a/kernel/trace/trace.h
> +++ b/kernel/trace/trace.h
> @@ -715,8 +715,6 @@ extern unsigned long tracing_thresh;
>
> /* PID filtering */
>
> -extern int pid_max;
> -
> bool trace_find_filtered_pid(struct trace_pid_list *filtered_pids,
> pid_t search_pid);
> bool trace_ignore_this_task(struct trace_pid_list *filtered_pids,
> diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c
> index 8a407adb0e1c..c20c80abe065 100644
> --- a/kernel/trace/trace_sched_switch.c
> +++ b/kernel/trace/trace_sched_switch.c
> @@ -442,7 +442,7 @@ int trace_alloc_tgid_map(void)
> if (tgid_map)
> return 0;
>
> - tgid_map_max = pid_max;
> + tgid_map_max = init_pid_ns.pid_max;
> map = kvcalloc(tgid_map_max + 1, sizeof(*tgid_map),
> GFP_KERNEL);
> if (!map)
Acked-by: Steven Rostedt (Google) <rostedt@goodmis.org>
-- Steve
^ permalink raw reply [flat|nested] 6+ messages in thread
* [PATCH] kernel: add pid_max to pid_namespace
@ 2024-11-05 3:08 Yun Zhou
0 siblings, 0 replies; 6+ messages in thread
From: Yun Zhou @ 2024-11-05 3:08 UTC (permalink / raw)
To: mcgrof, kees, joel.granados, mhiramat, mathieu.desnoyers,
yun.zhou
Cc: linux-kernel, linux-fsdevel, linux-trace-kernel
It is necessary to have a different pid_max in different containers.
For example, multiple containers are running on a host, one of which
is Android, and its 32 bit bionic libc only accepts pid <= 65535. So
it requires the global pid_max <= 65535. This will cause configuration
conflicts with other containers and also limit the maximum number of
tasks for the entire system.
Signed-off-by: Yun Zhou <yun.zhou@windriver.com>
---
include/linux/pid_namespace.h | 1 +
kernel/pid.c | 12 +++++------
kernel/pid_namespace.c | 34 ++++++++++++++++++++++++++-----
kernel/sysctl.c | 9 --------
kernel/trace/pid_list.c | 2 +-
kernel/trace/trace.h | 2 --
kernel/trace/trace_sched_switch.c | 2 +-
7 files changed, 38 insertions(+), 24 deletions(-)
diff --git a/include/linux/pid_namespace.h b/include/linux/pid_namespace.h
index f9f9931e02d6..064cfe2542fc 100644
--- a/include/linux/pid_namespace.h
+++ b/include/linux/pid_namespace.h
@@ -38,6 +38,7 @@ struct pid_namespace {
struct ucounts *ucounts;
int reboot; /* group exit code if this pidns was rebooted */
struct ns_common ns;
+ int pid_max;
#if defined(CONFIG_SYSCTL) && defined(CONFIG_MEMFD_CREATE)
int memfd_noexec_scope;
#endif
diff --git a/kernel/pid.c b/kernel/pid.c
index 2715afb77eab..f8026a61436b 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -60,8 +60,6 @@ struct pid init_struct_pid = {
}, }
};
-int pid_max = PID_MAX_DEFAULT;
-
int pid_max_min = RESERVED_PIDS + 1;
int pid_max_max = PID_MAX_LIMIT;
/*
@@ -78,6 +76,7 @@ static u64 pidfs_ino = RESERVED_PIDS;
*/
struct pid_namespace init_pid_ns = {
.ns.count = REFCOUNT_INIT(2),
+ .pid_max = PID_MAX_DEFAULT,
.idr = IDR_INIT(init_pid_ns.idr),
.pid_allocated = PIDNS_ADDING,
.level = 0,
@@ -198,7 +197,7 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid,
tid = set_tid[ns->level - i];
retval = -EINVAL;
- if (tid < 1 || tid >= pid_max)
+ if (tid < 1 || tid >= tmp->pid_max)
goto out_free;
/*
* Also fail if a PID != 1 is requested and
@@ -238,7 +237,7 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid,
* a partially initialized PID (see below).
*/
nr = idr_alloc_cyclic(&tmp->idr, NULL, pid_min,
- pid_max, GFP_ATOMIC);
+ tmp->pid_max, GFP_ATOMIC);
}
spin_unlock_irq(&pidmap_lock);
idr_preload_end();
@@ -653,11 +652,12 @@ void __init pid_idr_init(void)
BUILD_BUG_ON(PID_MAX_LIMIT >= PIDNS_ADDING);
/* bump default and minimum pid_max based on number of cpus */
- pid_max = min(pid_max_max, max_t(int, pid_max,
+ init_pid_ns.pid_max = min(pid_max_max, max_t(int, init_pid_ns.pid_max,
PIDS_PER_CPU_DEFAULT * num_possible_cpus()));
pid_max_min = max_t(int, pid_max_min,
PIDS_PER_CPU_MIN * num_possible_cpus());
- pr_info("pid_max: default: %u minimum: %u\n", pid_max, pid_max_min);
+ pr_info("pid_max: default: %u minimum: %u\n", init_pid_ns.pid_max,
+ pid_max_min);
idr_init(&init_pid_ns.idr);
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index d70ab49d5b4a..a5a8254825d5 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -111,6 +111,7 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns
ns->user_ns = get_user_ns(user_ns);
ns->ucounts = ucounts;
ns->pid_allocated = PIDNS_ADDING;
+ ns->pid_max = parent_pid_ns->pid_max;
#if defined(CONFIG_SYSCTL) && defined(CONFIG_MEMFD_CREATE)
ns->memfd_noexec_scope = pidns_memfd_noexec_scope(parent_pid_ns);
#endif
@@ -280,19 +281,44 @@ static int pid_ns_ctl_handler(const struct ctl_table *table, int write,
return ret;
}
+#endif /* CONFIG_CHECKPOINT_RESTORE */
+
+static int pid_max_ns_ctl_handler(const struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ struct pid_namespace *pid_ns = task_active_pid_ns(current);
+ struct ctl_table tmp = *table;
+
+ if (write && !checkpoint_restore_ns_capable(pid_ns->user_ns))
+ return -EPERM;
+
+ tmp.data = &pid_ns->pid_max;
+ if (pid_ns->parent)
+ tmp.extra2 = &pid_ns->parent->pid_max;
+
+ return proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
+}
-extern int pid_max;
static struct ctl_table pid_ns_ctl_table[] = {
+#ifdef CONFIG_CHECKPOINT_RESTORE
{
.procname = "ns_last_pid",
.maxlen = sizeof(int),
.mode = 0666, /* permissions are checked in the handler */
.proc_handler = pid_ns_ctl_handler,
.extra1 = SYSCTL_ZERO,
- .extra2 = &pid_max,
+ .extra2 = &init_pid_ns.pid_max,
},
-};
#endif /* CONFIG_CHECKPOINT_RESTORE */
+ {
+ .procname = "pid_max",
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = pid_max_ns_ctl_handler,
+ .extra1 = &pid_max_min,
+ .extra2 = &pid_max_max,
+ },
+};
int reboot_pid_ns(struct pid_namespace *pid_ns, int cmd)
{
@@ -449,9 +475,7 @@ static __init int pid_namespaces_init(void)
{
pid_ns_cachep = KMEM_CACHE(pid_namespace, SLAB_PANIC | SLAB_ACCOUNT);
-#ifdef CONFIG_CHECKPOINT_RESTORE
register_sysctl_init("kernel", pid_ns_ctl_table);
-#endif
register_pid_ns_sysctl_table_vm();
return 0;
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 79e6cb1d5c48..676a0d675e7f 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1804,15 +1804,6 @@ static struct ctl_table kern_table[] = {
.proc_handler = proc_dointvec,
},
#endif
- {
- .procname = "pid_max",
- .data = &pid_max,
- .maxlen = sizeof (int),
- .mode = 0644,
- .proc_handler = proc_dointvec_minmax,
- .extra1 = &pid_max_min,
- .extra2 = &pid_max_max,
- },
{
.procname = "panic_on_oops",
.data = &panic_on_oops,
diff --git a/kernel/trace/pid_list.c b/kernel/trace/pid_list.c
index 4966e6bbdf6f..c62b9b3cfb3d 100644
--- a/kernel/trace/pid_list.c
+++ b/kernel/trace/pid_list.c
@@ -414,7 +414,7 @@ struct trace_pid_list *trace_pid_list_alloc(void)
int i;
/* According to linux/thread.h, pids can be no bigger that 30 bits */
- WARN_ON_ONCE(pid_max > (1 << 30));
+ WARN_ON_ONCE(init_pid_ns.pid_max > (1 << 30));
pid_list = kzalloc(sizeof(*pid_list), GFP_KERNEL);
if (!pid_list)
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index c866991b9c78..e51851d64e4d 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -715,8 +715,6 @@ extern unsigned long tracing_thresh;
/* PID filtering */
-extern int pid_max;
-
bool trace_find_filtered_pid(struct trace_pid_list *filtered_pids,
pid_t search_pid);
bool trace_ignore_this_task(struct trace_pid_list *filtered_pids,
diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c
index 8a407adb0e1c..c20c80abe065 100644
--- a/kernel/trace/trace_sched_switch.c
+++ b/kernel/trace/trace_sched_switch.c
@@ -442,7 +442,7 @@ int trace_alloc_tgid_map(void)
if (tgid_map)
return 0;
- tgid_map_max = pid_max;
+ tgid_map_max = init_pid_ns.pid_max;
map = kvcalloc(tgid_map_max + 1, sizeof(*tgid_map),
GFP_KERNEL);
if (!map)
--
2.27.0
^ permalink raw reply related [flat|nested] 6+ messages in thread
end of thread, other threads:[~2024-11-05 3:08 UTC | newest]
Thread overview: 6+ messages (download: mbox.gz, follow: Atom feed)
-- links below jump to the message on this page --
2024-11-05 3:08 [PATCH] kernel: add pid_max to pid_namespace Yun Zhou
-- strict thread matches above, loose matches on Subject: below --
2024-10-30 5:29 Yun Zhou
2024-11-01 11:51 ` Joel Granados
2024-11-01 13:23 ` Steven Rostedt
2024-09-02 11:49 Yun Zhou
2024-10-04 19:05 ` Steven Rostedt
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).