From: ebiederm@xmission.com (Eric W. Biederman)
To: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Cc: Dave Jones <davej@codemonkey.org.uk>,
Linux Kernel <linux-kernel@vger.kernel.org>,
Linus Torvalds <torvalds@linux-foundation.org>,
Al Viro <viro@zeniv.linux.org.uk>
Subject: Re: proc_flush_task oops
Date: Tue, 19 Dec 2017 12:25:16 -0600 [thread overview]
Message-ID: <87zi6e7eyb.fsf@xmission.com> (raw)
In-Reply-To: <54567407-4a09-5c79-71e2-d1ce5877bf65@I-love.SAKURA.ne.jp> (Tetsuo Handa's message of "Tue, 19 Dec 2017 19:49:17 +0900")
Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp> writes:
> On 2017/12/19 12:39, Dave Jones wrote:
>> On Mon, Dec 18, 2017 at 03:50:52PM -0800, Linus Torvalds wrote:
>>
>> > But I don't see what would have changed in this area recently.
>> >
>> > Do you end up saving the seeds that cause crashes? Is this
>> > reproducible? (Other than seeing it twoce, of course)
>>
>> Only clue so far, is every time I'm able to trigger it, the last thing
>> the child process that triggers it did, was an execveat.
>>
>> Telling it to just fuzz execveat doesn't instantly trigger it, so it
>> must be a combination of some other syscall. I'll leave a script running
>> overnight to see if I can binary search the other syscalls in
>> combination with it.
>>
>> One other thing: I said this was rc4, but it was actually rc4 + all the
>> x86 stuff from today. There's enough creepy stuff in that pile, that
>> I'll try with just plain rc4 tomorrow too.
>>
>> Dave
>>
>
> When hitting an oops at finalization function using fuzzing tool, checking for
> corresponding initialization function and previous error messages might help.
>
> It seems to me that there are error paths which allow nsproxy to be replaced
> without assigning ->pid_ns_for_children->proc_mnt.
>
> ----------
> static __latent_entropy struct task_struct *copy_process(
> unsigned long clone_flags,
> unsigned long stack_start,
> unsigned long stack_size,
> int __user *child_tidptr,
> struct pid *pid,
> int trace,
> unsigned long tls,
> int node)
> {
> (...snipped...)
> retval = copy_namespaces(clone_flags, p); // Allocates p->nsproxy->pid_ns_for_children
> if (retval)
> goto bad_fork_cleanup_mm;
> retval = copy_io(clone_flags, p);
> if (retval)
> goto bad_fork_cleanup_namespaces; // p->nsproxy->pid_ns_for_children->proc_mnt == NULL.
> retval = copy_thread_tls(clone_flags, stack_start, stack_size, p, tls);
> if (retval)
> goto bad_fork_cleanup_io; // p->nsproxy->pid_ns_for_children->proc_mnt == NULL.
>
> if (pid != &init_struct_pid) {
> pid = alloc_pid(p->nsproxy->pid_ns_for_children); // Initializes p->nsproxy->pid_ns_for_children->proc_mnt upon success.
> if (IS_ERR(pid)) {
> retval = PTR_ERR(pid);
> goto bad_fork_cleanup_thread; // p->nsproxy->pid_ns_for_children->proc_mnt == NULL.
> }
> }
> (...snipped...)
> }
> int copy_namespaces(unsigned long flags, struct task_struct *tsk)
> {
> struct nsproxy *old_ns = tsk->nsproxy;
> struct user_namespace *user_ns = task_cred_xxx(tsk, user_ns);
> struct nsproxy *new_ns;
>
> if (likely(!(flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC |
> CLONE_NEWPID | CLONE_NEWNET |
> CLONE_NEWCGROUP)))) {
> get_nsproxy(old_ns);
> return 0;
> }
>
> if (!ns_capable(user_ns, CAP_SYS_ADMIN))
> return -EPERM;
>
> /*
> * CLONE_NEWIPC must detach from the undolist: after switching
> * to a new ipc namespace, the semaphore arrays from the old
> * namespace are unreachable. In clone parlance, CLONE_SYSVSEM
> * means share undolist with parent, so we must forbid using
> * it along with CLONE_NEWIPC.
> */
> if ((flags & (CLONE_NEWIPC | CLONE_SYSVSEM)) ==
> (CLONE_NEWIPC | CLONE_SYSVSEM))
> return -EINVAL;
>
> new_ns = create_new_namespaces(flags, tsk, user_ns, tsk->fs); // Allocates new nsproxy.
> if (IS_ERR(new_ns))
> return PTR_ERR(new_ns);
>
> tsk->nsproxy = new_ns; // p->nsproxy is updated with ->pid_ns_for_children->proc_mnt == NULL.
> return 0;
> }
> static struct nsproxy *create_new_namespaces(unsigned long flags,
> struct task_struct *tsk, struct user_namespace *user_ns,
> struct fs_struct *new_fs)
> {
> struct nsproxy *new_nsp;
> int err;
>
> new_nsp = create_nsproxy();
> if (!new_nsp)
> return ERR_PTR(-ENOMEM);
>
> new_nsp->mnt_ns = copy_mnt_ns(flags, tsk->nsproxy->mnt_ns, user_ns, new_fs);
> if (IS_ERR(new_nsp->mnt_ns)) {
> err = PTR_ERR(new_nsp->mnt_ns);
> goto out_ns;
> }
>
> new_nsp->uts_ns = copy_utsname(flags, user_ns, tsk->nsproxy->uts_ns);
> if (IS_ERR(new_nsp->uts_ns)) {
> err = PTR_ERR(new_nsp->uts_ns);
> goto out_uts;
> }
>
> new_nsp->ipc_ns = copy_ipcs(flags, user_ns, tsk->nsproxy->ipc_ns);
> if (IS_ERR(new_nsp->ipc_ns)) {
> err = PTR_ERR(new_nsp->ipc_ns);
> goto out_ipc;
> }
>
> new_nsp->pid_ns_for_children =
> copy_pid_ns(flags, user_ns, tsk->nsproxy->pid_ns_for_children); // Returns with ->proc_mnt == NULL.
> if (IS_ERR(new_nsp->pid_ns_for_children)) {
> err = PTR_ERR(new_nsp->pid_ns_for_children);
> goto out_pid;
> }
>
> new_nsp->cgroup_ns = copy_cgroup_ns(flags, user_ns,
> tsk->nsproxy->cgroup_ns);
> if (IS_ERR(new_nsp->cgroup_ns)) {
> err = PTR_ERR(new_nsp->cgroup_ns);
> goto out_cgroup;
> }
>
> new_nsp->net_ns = copy_net_ns(flags, user_ns, tsk->nsproxy->net_ns);
> if (IS_ERR(new_nsp->net_ns)) {
> err = PTR_ERR(new_nsp->net_ns);
> goto out_net;
> }
>
> return new_nsp; // Returns with ->pid_ns_for_children->proc_mnt == NULL.
>
> out_net:
> put_cgroup_ns(new_nsp->cgroup_ns);
> out_cgroup:
> if (new_nsp->pid_ns_for_children)
> put_pid_ns(new_nsp->pid_ns_for_children);
> out_pid:
> if (new_nsp->ipc_ns)
> put_ipc_ns(new_nsp->ipc_ns);
> out_ipc:
> if (new_nsp->uts_ns)
> put_uts_ns(new_nsp->uts_ns);
> out_uts:
> if (new_nsp->mnt_ns)
> put_mnt_ns(new_nsp->mnt_ns);
> out_ns:
> kmem_cache_free(nsproxy_cachep, new_nsp);
> return ERR_PTR(err);
> }
>
> struct pid_namespace *copy_pid_ns(unsigned long flags,
> struct user_namespace *user_ns, struct pid_namespace *old_ns)
> {
> if (!(flags & CLONE_NEWPID))
> return get_pid_ns(old_ns);
> if (task_active_pid_ns(current) != old_ns)
> return ERR_PTR(-EINVAL);
> return create_pid_namespace(user_ns, old_ns); // Returns with ->proc_mnt == NULL.
> }
>
> static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns,
> struct pid_namespace *parent_pid_ns)
> {
> struct pid_namespace *ns;
> unsigned int level = parent_pid_ns->level + 1;
> struct ucounts *ucounts;
> int err;
>
> err = -EINVAL;
> if (!in_userns(parent_pid_ns->user_ns, user_ns))
> goto out;
>
> err = -ENOSPC;
> if (level > MAX_PID_NS_LEVEL)
> goto out;
> ucounts = inc_pid_namespaces(user_ns);
> if (!ucounts)
> goto out;
>
> err = -ENOMEM;
> ns = kmem_cache_zalloc(pid_ns_cachep, GFP_KERNEL); // Initializes ->proc_mnt with NULL.
> if (ns == NULL)
> goto out_dec;
>
> idr_init(&ns->idr);
>
> ns->pid_cachep = create_pid_cachep(level + 1);
> if (ns->pid_cachep == NULL)
> goto out_free_idr;
>
> err = ns_alloc_inum(&ns->ns);
> if (err)
> goto out_free_idr;
> ns->ns.ops = &pidns_operations;
>
> kref_init(&ns->kref);
> ns->level = level;
> ns->parent = get_pid_ns(parent_pid_ns);
> ns->user_ns = get_user_ns(user_ns);
> ns->ucounts = ucounts;
> ns->pid_allocated = PIDNS_ADDING;
> INIT_WORK(&ns->proc_work, proc_cleanup_work);
>
> return ns; // Returns with ->proc_mnt == NULL.
>
> out_free_idr:
> idr_destroy(&ns->idr);
> kmem_cache_free(pid_ns_cachep, ns);
> out_dec:
> dec_pid_namespaces(ucounts);
> out:
> return ERR_PTR(err);
> }
>
> struct pid *alloc_pid(struct pid_namespace *ns)
> {
> (...snipped...)
> if (unlikely(is_child_reaper(pid))) {
> if (pid_ns_prepare_proc(ns)) {
> disable_pid_allocation(ns);
> goto out_free;
> }
> }
> (...snipped...)
> }
>
> int pid_ns_prepare_proc(struct pid_namespace *ns)
> {
> struct vfsmount *mnt;
>
> mnt = kern_mount_data(&proc_fs_type, ns);
> if (IS_ERR(mnt))
> return PTR_ERR(mnt);
>
> ns->proc_mnt = mnt;
> return 0;
> }
> ----------
>
> If one of copy_io(), copy_thread_tls() or alloc_pid() returns an error, creation
> of a child fails with p->nsproxy->pid_ns_for_children->proc_mnt == NULL.
>
> Then, when the child exits, the parent waiting at wait() calls
> proc_flush_task() which assumes that everything was set up properly.
I don't think so.
proc_mnt is allocated when the first pid is allocated in a pid
namespace. And proc_mnt is freed after the last pid in a pid
namespace is freed. That is schedule_work(&ns->proc_work) in free_pid.
Yes we can occassionally have pid namespaces without pids and
thus without a proc_mnt.
But I don't see that allows us to create a task_struct that is
not flushable.
If fork fails there is hashed pid and no task_struct to call
proc_flush_task on.
If fork succeeds there the process contains a pid in a pid namespace.
So while pid_ns_for_children->proc_mnt might be NULL. It does not
follow that pid->ns->proc_mnt can be NULL.
Eric
> ----------
> void proc_flush_task(struct task_struct *task)
> {
> int i;
> struct pid *pid, *tgid;
> struct upid *upid;
>
> pid = task_pid(task);
> tgid = task_tgid(task);
>
> for (i = 0; i <= pid->level; i++) {
> upid = &pid->numbers[i];
> proc_flush_task_mnt(upid->ns->proc_mnt, upid->nr,
> tgid->numbers[i].nr);
> }
> }
> ----------
next prev parent reply other threads:[~2017-12-19 18:25 UTC|newest]
Thread overview: 29+ messages / expand[flat|nested] mbox.gz Atom feed top
2017-12-18 21:44 proc_flush_task oops Dave Jones
2017-12-18 22:15 ` Al Viro
2017-12-18 23:10 ` Dave Jones
2017-12-18 23:50 ` Linus Torvalds
2017-12-19 1:22 ` Dave Jones
2017-12-19 3:39 ` Dave Jones
2017-12-19 10:49 ` Tetsuo Handa
2017-12-19 18:25 ` Eric W. Biederman [this message]
2017-12-19 18:27 ` Eric W. Biederman
2017-12-19 19:30 ` Dave Jones
2017-12-19 21:44 ` Eric W. Biederman
2017-12-20 1:54 ` Eric W. Biederman
2017-12-20 5:28 ` Dave Jones
2017-12-20 18:25 ` Eric W. Biederman
2017-12-21 3:16 ` Dave Jones
2017-12-21 8:26 ` Eric W. Biederman
2017-12-21 10:38 ` Alexey Dobriyan
2017-12-21 14:25 ` Dave Jones
2017-12-21 16:41 ` Eric W. Biederman
2017-12-21 22:00 ` Dave Jones
2017-12-22 1:31 ` Eric W. Biederman
2017-12-22 3:35 ` Dave Jones
2017-12-22 7:58 ` Eric W. Biederman
2017-12-22 10:13 ` Alexey Dobriyan
2017-12-22 14:41 ` Eric W. Biederman
2017-12-22 16:11 ` [TEST PATCH] pid: fix allocating pid 2 for init (was Re: proc_flush_task oops) Alexey Dobriyan
2017-12-24 3:12 ` Eric W. Biederman
2017-12-24 3:16 ` [PATCH] pid: Handle failure to allocate the first pid in a pid namespace Eric W. Biederman
2017-12-20 8:00 ` proc_flush_task oops Dmitry Vyukov
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=87zi6e7eyb.fsf@xmission.com \
--to=ebiederm@xmission.com \
--cc=davej@codemonkey.org.uk \
--cc=linux-kernel@vger.kernel.org \
--cc=penguin-kernel@I-love.SAKURA.ne.jp \
--cc=torvalds@linux-foundation.org \
--cc=viro@zeniv.linux.org.uk \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.