From: kernel test robot <lkp@intel.com>
To: oe-kbuild@lists.linux.dev
Cc: lkp@intel.com
Subject: [jpoimboe:objtool-diff 2/2] kernel/fork.c: linux/livepatch.h is included more than once.
Date: Sat, 25 May 2024 14:40:00 +0800 [thread overview]
Message-ID: <202405251400.UdnwcgiL-lkp@intel.com> (raw)
::::::
:::::: Manual check reason: "low confidence bisect report"
::::::
BCC: lkp@intel.com
CC: oe-kbuild-all@lists.linux.dev
TO: Josh Poimboeuf <jpoimboe@kernel.org>
tree: https://git.kernel.org/pub/scm/linux/kernel/git/jpoimboe/linux.git objtool-diff
head: 745009dc796e56fc87e911138d801679ecd3576e
commit: 745009dc796e56fc87e911138d801679ecd3576e [2/2] test
:::::: branch date: 6 hours ago
:::::: commit date: 6 hours ago
compiler: clang version 18.1.5 (https://github.com/llvm/llvm-project 617a15a9eac96088ae5e9134248d8236e34b91b1)
If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add the following tags:
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/r/202405251400.UdnwcgiL-lkp@intel.com/
includecheck warnings: (new ones prefixed by >>)
>> kernel/fork.c: linux/livepatch.h is included more than once.
--
>> drivers/input/joydev.c: linux/module.h is included more than once.
--
>> drivers/input/misc/pcspkr.c: linux/module.h is included more than once.
vim +93 kernel/fork.c
> 93 #include <linux/livepatch.h>
94 #include <linux/thread_info.h>
95 #include <linux/stackleak.h>
96 #include <linux/kasan.h>
97 #include <linux/scs.h>
98 #include <linux/io_uring.h>
99 #include <linux/bpf.h>
100 #include <linux/stackprotector.h>
101 #include <linux/user_events.h>
102 #include <linux/iommu.h>
103 #include <linux/rseq.h>
104 #include <uapi/linux/pidfd.h>
105 #include <linux/pidfs.h>
106
107 #include <asm/pgalloc.h>
108 #include <linux/uaccess.h>
109 #include <asm/mmu_context.h>
110 #include <asm/cacheflush.h>
111 #include <asm/tlbflush.h>
112
113 #include <trace/events/sched.h>
114
115 #define CREATE_TRACE_POINTS
116 #include <trace/events/task.h>
117
118 /*
119 * Minimum number of threads to boot the kernel
120 */
121 #define MIN_THREADS 20
122
123 /*
124 * Maximum number of threads
125 */
126 #define MAX_THREADS FUTEX_TID_MASK
127
128 /*
129 * Counters protected by write_lock_irq(&tasklist_lock)
130 */
131 unsigned long total_forks; /* Handle normal Linux uptimes. */
132 int nr_threads; /* The idle threads do not count.. */
133
134 static int max_threads; /* tunable limit on nr_threads */
135
/*
 * Printable names for the mm RSS counters, indexed by counter id.
 * NAMED_ARRAY_INDEX(x) expands to a designated initializer that stores the
 * stringified identifier at index x. check_mm() prints these names and
 * build-asserts that the table has NR_MM_COUNTERS entries.
 */
136 #define NAMED_ARRAY_INDEX(x) [x] = __stringify(x)
137
138 static const char * const resident_page_types[] = {
139 NAMED_ARRAY_INDEX(MM_FILEPAGES),
140 NAMED_ARRAY_INDEX(MM_ANONPAGES),
141 NAMED_ARRAY_INDEX(MM_SWAPENTS),
142 NAMED_ARRAY_INDEX(MM_SHMEMPAGES),
143 };
144
145 DEFINE_PER_CPU(unsigned long, process_counts) = 0;
146
147 __cacheline_aligned DEFINE_RWLOCK(tasklist_lock); /* outer */
148
149 #ifdef CONFIG_PROVE_RCU
/*
 * lockdep_tasklist_lock_is_held - ask lockdep whether tasklist_lock is held.
 *
 * Thin wrapper returning lockdep_is_held(&tasklist_lock). It is only built
 * under CONFIG_PROVE_RCU and is exported to GPL modules.
 */
150 int lockdep_tasklist_lock_is_held(void)
151 {
152 return lockdep_is_held(&tasklist_lock);
153 }
154 EXPORT_SYMBOL_GPL(lockdep_tasklist_lock_is_held);
155 #endif /* #ifdef CONFIG_PROVE_RCU */
156
/*
 * nr_processes - add up the per-CPU process_counts values.
 *
 * Visits every CPU produced by for_each_possible_cpu() and returns the sum
 * as an int. No lock is taken here while the counters are read.
 */
157 int nr_processes(void)
158 {
159 int cpu;
160 int total = 0;
161
162 for_each_possible_cpu(cpu)
163 total += per_cpu(process_counts, cpu);
164
165 return total;
166 }
167
/*
 * Weak default of the arch hook that free_task() calls just before the
 * task_struct is freed. It does nothing; an architecture may override it.
 */
168 void __weak arch_release_task_struct(struct task_struct *tsk)
169 {
170 }
171
172 static struct kmem_cache *task_struct_cachep;
173
/*
 * Allocate one task_struct from task_struct_cachep on NUMA node @node using
 * GFP_KERNEL. The result of kmem_cache_alloc_node() is returned unchecked;
 * dup_task_struct() tests it for NULL.
 */
174 static inline struct task_struct *alloc_task_struct_node(int node)
175 {
176 return kmem_cache_alloc_node(task_struct_cachep, GFP_KERNEL, node);
177 }
178
/* Hand @tsk back to the task_struct_cachep slab cache. */
179 static inline void free_task_struct(struct task_struct *tsk)
180 {
181 kmem_cache_free(task_struct_cachep, tsk);
182 }
183
184 /*
185 * Allocate whole pages if THREAD_SIZE is >= PAGE_SIZE (or a vmalloc area when
186 * CONFIG_VMAP_STACK is set), otherwise use a kmemcache based allocator.
187 */
188 # if THREAD_SIZE >= PAGE_SIZE || defined(CONFIG_VMAP_STACK)
189
190 # ifdef CONFIG_VMAP_STACK
191 /*
192 * vmalloc() is a bit slow, and calling vfree() enough times will force a TLB
193 * flush. Try to minimize the number of calls by caching stacks.
194 */
195 #define NR_CACHED_STACKS 2
196 static DEFINE_PER_CPU(struct vm_struct *, cached_stacks[NR_CACHED_STACKS]);
197
/*
 * Bookkeeping overlaid on a vmapped stack that is being freed.
 * thread_stack_delayed_free() reinterprets tsk->stack as this struct, so the
 * rcu_head and the saved vm_struct pointer live in the stack memory itself.
 */
198 struct vm_stack {
199 struct rcu_head rcu;
200 struct vm_struct *stack_vm_area;
201 };
202
/*
 * Try to park @vm in one of the NR_CACHED_STACKS per-CPU cache slots.
 *
 * A slot is claimed with this_cpu_cmpxchg(slot, NULL, vm), so only an empty
 * slot is ever taken. Returns true if @vm was stored, false if every slot
 * was already occupied.
 */
203 static bool try_release_thread_stack_to_cache(struct vm_struct *vm)
204 {
205 unsigned int i;
206
207 for (i = 0; i < NR_CACHED_STACKS; i++) {
208 if (this_cpu_cmpxchg(cached_stacks[i], NULL, vm) != NULL)
209 continue;
210 return true;
211 }
212 return false;
213 }
214
/*
 * RCU callback queued by thread_stack_delayed_free() (CONFIG_VMAP_STACK).
 * Offers the stack to the per-CPU cache first and vfree()s it only when no
 * cache slot is available.
 */
215 static void thread_stack_free_rcu(struct rcu_head *rh)
216 {
217 struct vm_stack *vm_stack = container_of(rh, struct vm_stack, rcu);
218
219 if (try_release_thread_stack_to_cache(vm_stack->stack_vm_area))
220 return;
221
/* vm_stack is the address that used to be tsk->stack, i.e. the stack itself */
222 vfree(vm_stack);
223 }
224
/*
 * Defer freeing of @tsk's vmapped stack to an RCU callback.
 *
 * The stack memory is reused as a struct vm_stack. The vm_struct pointer is
 * copied into it because free_thread_stack() clears tsk->stack_vm_area as
 * soon as this function returns.
 */
225 static void thread_stack_delayed_free(struct task_struct *tsk)
226 {
227 struct vm_stack *vm_stack = tsk->stack;
228
229 vm_stack->stack_vm_area = tsk->stack_vm_area;
230 call_rcu(&vm_stack->rcu, thread_stack_free_rcu);
231 }
232
/*
 * Empty the cached-stack slots that belong to @cpu, vfree()ing every stack
 * found there. fork_init() passes this function as the last callback
 * argument of cpuhp_setup_state("fork:vm_stack_cache").
 *
 * Always returns 0.
 */
233 static int free_vm_stack_cache(unsigned int cpu)
234 {
235 struct vm_struct **cached_vm_stacks = per_cpu_ptr(cached_stacks, cpu);
236 int i;
237
238 for (i = 0; i < NR_CACHED_STACKS; i++) {
239 struct vm_struct *vm_stack = cached_vm_stacks[i];
240
241 if (!vm_stack)
242 continue;
243
244 vfree(vm_stack->addr);
245 cached_vm_stacks[i] = NULL;
246 }
247
248 return 0;
249 }
250
/*
 * Charge every page of the stack area @vm through memcg_kmem_charge_page().
 *
 * BUGs if @vm does not hold exactly THREAD_SIZE / PAGE_SIZE pages. When a
 * page fails to charge, the pages charged so far are uncharged again and the
 * error from memcg_kmem_charge_page() is returned. Returns 0 on success.
 */
251 static int memcg_charge_kernel_stack(struct vm_struct *vm)
252 {
253 int i;
254 int ret;
255 int nr_charged = 0;
256
257 BUG_ON(vm->nr_pages != THREAD_SIZE / PAGE_SIZE);
258
259 for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) {
260 ret = memcg_kmem_charge_page(vm->pages[i], GFP_KERNEL, 0);
261 if (ret)
262 goto err;
263 nr_charged++;
264 }
265 return 0;
266 err:
/* roll back only the pages that were successfully charged */
267 for (i = 0; i < nr_charged; i++)
268 memcg_kmem_uncharge_page(vm->pages[i], 0);
269 return ret;
270 }
271
/*
 * Give @tsk a THREAD_SIZE kernel stack (CONFIG_VMAP_STACK variant).
 *
 * A stack is first looked for in the per-CPU cache. A cached stack is
 * KASAN-unpoisoned, zeroed and memcg-charged before it is handed out. If no
 * cached stack is found, a new area is obtained from __vmalloc_node_range()
 * for @node and charged the same way.
 *
 * On success tsk->stack and tsk->stack_vm_area are set and 0 is returned.
 * On failure the stack involved is vfree()d and -ENOMEM is returned.
 */
272 static int alloc_thread_stack_node(struct task_struct *tsk, int node)
273 {
274 struct vm_struct *vm;
275 void *stack;
276 int i;
277
278 for (i = 0; i < NR_CACHED_STACKS; i++) {
279 struct vm_struct *s;
280
/* take ownership of the slot's stack, leaving the slot empty */
281 s = this_cpu_xchg(cached_stacks[i], NULL);
282
283 if (!s)
284 continue;
285
286 /* Reset stack metadata. */
287 kasan_unpoison_range(s->addr, THREAD_SIZE);
288
289 stack = kasan_reset_tag(s->addr);
290
291 /* Clear stale pointers from reused stack. */
292 memset(stack, 0, THREAD_SIZE);
293
/*
 * A charge failure ends the allocation attempt here: neither the
 * remaining slots nor a fresh vmalloc are tried.
 */
294 if (memcg_charge_kernel_stack(s)) {
295 vfree(s->addr);
296 return -ENOMEM;
297 }
298
299 tsk->stack_vm_area = s;
300 tsk->stack = stack;
301 return 0;
302 }
303
304 /*
305 * Allocated stacks are cached and later reused by new threads,
306 * so memcg accounting is performed manually on assigning/releasing
307 * stacks to tasks. Drop __GFP_ACCOUNT.
308 */
309 stack = __vmalloc_node_range(THREAD_SIZE, THREAD_ALIGN,
310 VMALLOC_START, VMALLOC_END,
311 THREADINFO_GFP & ~__GFP_ACCOUNT,
312 PAGE_KERNEL,
313 0, node, __builtin_return_address(0));
314 if (!stack)
315 return -ENOMEM;
316
317 vm = find_vm_area(stack);
318 if (memcg_charge_kernel_stack(vm)) {
319 vfree(stack);
320 return -ENOMEM;
321 }
322 /*
323 * We can't call find_vm_area() in interrupt context, and
324 * free_thread_stack() can be called in interrupt context,
325 * so cache the vm_struct.
326 */
327 tsk->stack_vm_area = vm;
328 stack = kasan_reset_tag(stack);
329 tsk->stack = stack;
330 return 0;
331 }
332
/*
 * Release @tsk's stack (CONFIG_VMAP_STACK variant). The stack is kept in the
 * per-CPU cache if a slot is free, otherwise its freeing is deferred through
 * call_rcu(). @tsk's stack pointers are cleared in both cases.
 */
333 static void free_thread_stack(struct task_struct *tsk)
334 {
335 if (!try_release_thread_stack_to_cache(tsk->stack_vm_area))
336 thread_stack_delayed_free(tsk);
337
338 tsk->stack = NULL;
339 tsk->stack_vm_area = NULL;
340 }
341
342 # else /* !CONFIG_VMAP_STACK */
343
/*
 * RCU callback for the page-allocator variant. @rh is the address that was
 * in tsk->stack, so mapping it back to its struct page identifies the
 * THREAD_SIZE_ORDER allocation to release.
 */
344 static void thread_stack_free_rcu(struct rcu_head *rh)
345 {
346 __free_pages(virt_to_page(rh), THREAD_SIZE_ORDER);
347 }
348
/*
 * Queue @tsk's stack for freeing through call_rcu(). The stack memory
 * itself is used as storage for the rcu_head.
 */
349 static void thread_stack_delayed_free(struct task_struct *tsk)
350 {
351 struct rcu_head *rh = tsk->stack;
352
353 call_rcu(rh, thread_stack_free_rcu);
354 }
355
/*
 * Page-allocator variant: get a THREAD_SIZE_ORDER page block on @node with
 * THREADINFO_GFP and store its (KASAN tag-reset) address in tsk->stack.
 *
 * Returns 0 on success, -ENOMEM if no pages could be allocated. tsk->stack
 * is left untouched on failure.
 */
356 static int alloc_thread_stack_node(struct task_struct *tsk, int node)
357 {
358 struct page *page = alloc_pages_node(node, THREADINFO_GFP,
359 THREAD_SIZE_ORDER);
360
361 if (likely(page)) {
362 tsk->stack = kasan_reset_tag(page_address(page));
363 return 0;
364 }
365 return -ENOMEM;
366 }
367
/*
 * Page-allocator variant: defer the actual free through call_rcu() and
 * forget the stack pointer right away.
 */
368 static void free_thread_stack(struct task_struct *tsk)
369 {
370 thread_stack_delayed_free(tsk);
371 tsk->stack = NULL;
372 }
373
374 # endif /* CONFIG_VMAP_STACK */
375 # else /* !(THREAD_SIZE >= PAGE_SIZE || defined(CONFIG_VMAP_STACK)) */
376
377 static struct kmem_cache *thread_stack_cache;
378
/*
 * RCU callback for the slab variant. @rh is the address that was in
 * tsk->stack, i.e. the thread_stack_cache object itself.
 */
379 static void thread_stack_free_rcu(struct rcu_head *rh)
380 {
381 kmem_cache_free(thread_stack_cache, rh);
382 }
383
/*
 * Queue @tsk's slab-allocated stack for freeing through call_rcu(). The
 * stack memory itself is used as storage for the rcu_head.
 */
384 static void thread_stack_delayed_free(struct task_struct *tsk)
385 {
386 struct rcu_head *rh = tsk->stack;
387
388 call_rcu(rh, thread_stack_free_rcu);
389 }
390
/*
 * Slab variant: take a stack object from thread_stack_cache on @node.
 *
 * tsk->stack is written unconditionally with the result of
 * kasan_reset_tag(). Returns 0 when that pointer is non-NULL and -ENOMEM
 * otherwise.
 */
391 static int alloc_thread_stack_node(struct task_struct *tsk, int node)
392 {
393 unsigned long *stack;
394 stack = kmem_cache_alloc_node(thread_stack_cache, THREADINFO_GFP, node);
395 stack = kasan_reset_tag(stack);
396 tsk->stack = stack;
397 return stack ? 0 : -ENOMEM;
398 }
399
/*
 * Slab variant: defer the actual free through call_rcu() and forget the
 * stack pointer right away.
 */
400 static void free_thread_stack(struct task_struct *tsk)
401 {
402 thread_stack_delayed_free(tsk);
403 tsk->stack = NULL;
404 }
405
/*
 * Create the "thread_stack" slab cache used when stacks are smaller than a
 * page and CONFIG_VMAP_STACK is off. Objects are THREAD_SIZE bytes, aligned
 * to THREAD_SIZE, with a usercopy window of offset 0 and size THREAD_SIZE.
 *
 * BUGs if the cache cannot be created.
 */
406 void thread_stack_cache_init(void)
407 {
408 thread_stack_cache = kmem_cache_create_usercopy("thread_stack",
409 THREAD_SIZE, THREAD_SIZE, 0, 0,
410 THREAD_SIZE, NULL);
411 BUG_ON(thread_stack_cache == NULL);
412 }
413
414 # endif /* THREAD_SIZE >= PAGE_SIZE || defined(CONFIG_VMAP_STACK) */
415
416 /* SLAB cache for signal_struct structures (tsk->signal) */
417 static struct kmem_cache *signal_cachep;
418
419 /* SLAB cache for sighand_struct structures (tsk->sighand) */
420 struct kmem_cache *sighand_cachep;
421
422 /* SLAB cache for files_struct structures (tsk->files) */
423 struct kmem_cache *files_cachep;
424
425 /* SLAB cache for fs_struct structures (tsk->fs) */
426 struct kmem_cache *fs_cachep;
427
428 /* SLAB cache for vm_area_struct structures */
429 static struct kmem_cache *vm_area_cachep;
430
431 /* SLAB cache for mm_struct structures (tsk->mm) */
432 static struct kmem_cache *mm_cachep;
433
434 #ifdef CONFIG_PER_VMA_LOCK
435
436 /* SLAB cache for vm_area_struct.lock */
437 static struct kmem_cache *vma_lock_cachep;
438
/*
 * Allocate and initialise the separately allocated lock object of @vma.
 *
 * On success the rwsem is initialised, vma->vm_lock_seq is set to -1 and
 * true is returned. Returns false if the slab allocation failed; in that
 * case vma->vm_lock is NULL.
 */
439 static bool vma_lock_alloc(struct vm_area_struct *vma)
440 {
441 vma->vm_lock = kmem_cache_alloc(vma_lock_cachep, GFP_KERNEL);
442 if (!vma->vm_lock)
443 return false;
444
445 init_rwsem(&vma->vm_lock->lock);
446 vma->vm_lock_seq = -1;
447
448 return true;
449 }
450
/* Return @vma's lock object to vma_lock_cachep. */
451 static inline void vma_lock_free(struct vm_area_struct *vma)
452 {
453 kmem_cache_free(vma_lock_cachep, vma->vm_lock);
454 }
455
456 #else /* CONFIG_PER_VMA_LOCK */
457
/*
 * Stubs for !CONFIG_PER_VMA_LOCK: there is no lock object to allocate, so
 * "allocation" always reports success and freeing is a no-op.
 */
458 static inline bool vma_lock_alloc(struct vm_area_struct *vma) { return true; }
459 static inline void vma_lock_free(struct vm_area_struct *vma) {}
460
461 #endif /* CONFIG_PER_VMA_LOCK */
462
/*
 * vm_area_alloc - allocate a vm_area_struct for @mm.
 *
 * The VMA is initialised with vma_init() and gets its lock via
 * vma_lock_alloc(). Returns the new VMA, or NULL if either allocation fails
 * (a partially built VMA is freed again).
 */
463 struct vm_area_struct *vm_area_alloc(struct mm_struct *mm)
464 {
465 struct vm_area_struct *vma;
466
467 vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
468 if (!vma)
469 return NULL;
470
471 vma_init(vma, mm);
472 if (!vma_lock_alloc(vma)) {
473 kmem_cache_free(vm_area_cachep, vma);
474 return NULL;
475 }
476
477 return vma;
478 }
479
/*
 * vm_area_dup - allocate a copy of @orig.
 *
 * The whole structure is memcpy()'d, then the fields that must not be shared
 * are redone: a fresh lock via vma_lock_alloc(), an empty anon_vma_chain
 * list, NUMA-balancing state, and the anon VMA name via dup_anon_vma_name().
 *
 * Returns the copy, or NULL if an allocation fails.
 */
480 struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig)
481 {
482 struct vm_area_struct *new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
483
484 if (!new)
485 return NULL;
486
487 ASSERT_EXCLUSIVE_WRITER(orig->vm_flags);
488 ASSERT_EXCLUSIVE_WRITER(orig->vm_file);
489 /*
490 * orig->shared.rb may be modified concurrently, but the clone
491 * will be reinitialized.
492 */
493 data_race(memcpy(new, orig, sizeof(*new)));
494 if (!vma_lock_alloc(new)) {
495 kmem_cache_free(vm_area_cachep, new);
496 return NULL;
497 }
498 INIT_LIST_HEAD(&new->anon_vma_chain);
499 vma_numab_state_init(new);
500 dup_anon_vma_name(orig, new);
501
502 return new;
503 }
504
/*
 * __vm_area_free - free @vma immediately.
 *
 * Releases the NUMA-balancing state, the anon VMA name and the lock object,
 * then returns the VMA to vm_area_cachep. vm_area_free() is the variant that
 * may defer this through RCU.
 */
505 void __vm_area_free(struct vm_area_struct *vma)
506 {
507 vma_numab_state_free(vma);
508 free_anon_vma_name(vma);
509 vma_lock_free(vma);
510 kmem_cache_free(vm_area_cachep, vma);
511 }
512
513 #ifdef CONFIG_PER_VMA_LOCK
/*
 * RCU callback queued by vm_area_free() under CONFIG_PER_VMA_LOCK. Recovers
 * the VMA from its embedded vm_rcu head and frees it.
 */
514 static void vm_area_free_rcu_cb(struct rcu_head *head)
515 {
516 struct vm_area_struct *vma = container_of(head, struct vm_area_struct,
517 vm_rcu);
518
519 /* The vma should not be locked while being destroyed. */
520 VM_BUG_ON_VMA(rwsem_is_locked(&vma->vm_lock->lock), vma);
521 __vm_area_free(vma);
522 }
523 #endif
524
/*
 * vm_area_free - free @vma.
 *
 * With CONFIG_PER_VMA_LOCK the free is deferred through call_rcu(); without
 * it the VMA is freed immediately.
 */
525 void vm_area_free(struct vm_area_struct *vma)
526 {
527 #ifdef CONFIG_PER_VMA_LOCK
528 call_rcu(&vma->vm_rcu, vm_area_free_rcu_cb);
529 #else
530 __vm_area_free(vma);
531 #endif
532 }
533
/*
 * Adjust the NR_KERNEL_STACK_KB statistic for @tsk's stack.
 *
 * @account is a multiplier applied to the stack size in KB. The callers in
 * this file pass 1 (dup_task_struct) and -1 (exit_task_stack_account). With
 * CONFIG_VMAP_STACK the update is done page by page, otherwise once for the
 * whole stack.
 */
534 static void account_kernel_stack(struct task_struct *tsk, int account)
535 {
536 if (IS_ENABLED(CONFIG_VMAP_STACK)) {
537 struct vm_struct *vm = task_stack_vm_area(tsk);
538 int i;
539
540 for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++)
541 mod_lruvec_page_state(vm->pages[i], NR_KERNEL_STACK_KB,
542 account * (PAGE_SIZE / 1024));
543 } else {
544 void *stack = task_stack_page(tsk);
545
546 /* All stack pages are in the same node. */
547 mod_lruvec_kmem_state(stack, NR_KERNEL_STACK_KB,
548 account * (THREAD_SIZE / 1024));
549 }
550 }
551
/*
 * exit_task_stack_account - undo the accounting done for @tsk's stack.
 *
 * Subtracts the stack from NR_KERNEL_STACK_KB. With CONFIG_VMAP_STACK it
 * also uncharges every stack page, reversing memcg_charge_kernel_stack().
 */
552 void exit_task_stack_account(struct task_struct *tsk)
553 {
554 account_kernel_stack(tsk, -1);
555
556 if (IS_ENABLED(CONFIG_VMAP_STACK)) {
557 struct vm_struct *vm;
558 int i;
559
560 vm = task_stack_vm_area(tsk);
561 for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++)
562 memcg_kmem_uncharge_page(vm->pages[i], 0);
563 }
564 }
565
/*
 * Free @tsk's stack, but only if the task is in TASK_DEAD state. Any other
 * state triggers a WARN and the stack is deliberately leaked.
 */
566 static void release_task_stack(struct task_struct *tsk)
567 {
568 if (WARN_ON(READ_ONCE(tsk->__state) != TASK_DEAD))
569 return; /* Better to leak the stack than to free prematurely */
570
571 free_thread_stack(tsk);
572 }
573
574 #ifdef CONFIG_THREAD_INFO_IN_TASK
/*
 * put_task_stack - drop one reference on @tsk's stack.
 *
 * The stack is released when tsk->stack_refcount reaches zero. Only built
 * with CONFIG_THREAD_INFO_IN_TASK.
 */
575 void put_task_stack(struct task_struct *tsk)
576 {
577 if (refcount_dec_and_test(&tsk->stack_refcount))
578 release_task_stack(tsk);
579 }
580 #endif
581
/*
 * free_task - release the resources of @tsk handled here and free the
 * task_struct itself.
 *
 * Without CONFIG_THREAD_INFO_IN_TASK the stack is released here as well.
 * With it, the stack reference count is expected to have reached zero
 * already and a WARN fires otherwise. A seccomp filter that is still
 * attached also triggers a WARN (CONFIG_SECCOMP).
 */
582 void free_task(struct task_struct *tsk)
583 {
584 #ifdef CONFIG_SECCOMP
585 WARN_ON_ONCE(tsk->seccomp.filter);
586 #endif
587 release_user_cpus_ptr(tsk);
588 scs_release(tsk);
589
590 #ifndef CONFIG_THREAD_INFO_IN_TASK
591 /*
592 * The task is finally done with both the stack and thread_info,
593 * so free both.
594 */
595 release_task_stack(tsk);
596 #else
597 /*
598 * If the task had a separate stack allocation, it should be gone
599 * by now.
600 */
601 WARN_ON_ONCE(refcount_read(&tsk->stack_refcount) != 0);
602 #endif
603 rt_mutex_debug_task_free(tsk);
604 ftrace_graph_exit_task(tsk);
605 arch_release_task_struct(tsk);
/* only kernel threads carry a kthread struct to free */
606 if (tsk->flags & PF_KTHREAD)
607 free_kthread_struct(tsk);
608 bpf_task_storage_free(tsk);
609 free_task_struct(tsk);
610 }
611 EXPORT_SYMBOL(free_task);
612
/*
 * Make @mm share @oldmm's exe_file: fetch it with get_mm_exe_file() and
 * publish it in mm->exe_file. If a file exists, write access is denied on
 * it again for the new mm; a failure there is only reported once via
 * pr_warn_once().
 */
613 static void dup_mm_exe_file(struct mm_struct *mm, struct mm_struct *oldmm)
614 {
615 struct file *exe_file;
616
617 exe_file = get_mm_exe_file(oldmm);
618 RCU_INIT_POINTER(mm->exe_file, exe_file);
619 /*
620 * We depend on the oldmm having properly denied write access to the
621 * exe_file already.
622 */
623 if (exe_file && deny_write_access(exe_file))
624 pr_warn_once("deny_write_access() failed in %s\n", __func__);
625 }
626
627 #ifdef CONFIG_MMU
/*
 * dup_mmap - duplicate @oldmm's VMAs into the new @mm (CONFIG_MMU).
 *
 * Takes oldmm's mmap lock for writing (killable) and then mm's (nested).
 * It copies the VM size counters and duplicates the maple tree wholesale
 * with __mt_dup(). It then walks the tree, replacing each entry with a
 * per-VMA copy, or clearing the range for VM_DONTCOPY VMAs.
 *
 * Returns 0 on success. Returns -EINTR if mmap_write_lock_killable() fails
 * or a fatal signal is pending. Returns -ENOMEM for every failure routed
 * through the fail_nomem* labels. Otherwise returns the error of the helper
 * that failed (ksm_fork, __mt_dup, vma_iter_clear_gfp, copy_page_range,
 * arch_dup_mmap).
 */
628 static __latent_entropy int dup_mmap(struct mm_struct *mm,
629 struct mm_struct *oldmm)
630 {
631 struct vm_area_struct *mpnt, *tmp;
632 int retval;
633 unsigned long charge = 0;
634 LIST_HEAD(uf);
635 VMA_ITERATOR(vmi, mm, 0);
636
637 uprobe_start_dup_mmap();
638 if (mmap_write_lock_killable(oldmm)) {
639 retval = -EINTR;
640 goto fail_uprobe_end;
641 }
642 flush_cache_dup_mm(oldmm);
643 uprobe_dup_mmap(oldmm, mm);
644 /*
645 * Not linked in yet - no deadlock potential:
646 */
647 mmap_write_lock_nested(mm, SINGLE_DEPTH_NESTING);
648
649 /* No ordering required: file already has been exposed. */
650 dup_mm_exe_file(mm, oldmm);
651
652 mm->total_vm = oldmm->total_vm;
653 mm->data_vm = oldmm->data_vm;
654 mm->exec_vm = oldmm->exec_vm;
655 mm->stack_vm = oldmm->stack_vm;
656
657 retval = ksm_fork(mm, oldmm);
658 if (retval)
659 goto out;
660 khugepaged_fork(mm, oldmm);
661
662 /* Use __mt_dup() to efficiently build an identical maple tree. */
663 retval = __mt_dup(&oldmm->mm_mt, &mm->mm_mt, GFP_KERNEL);
664 if (unlikely(retval))
665 goto out;
666
667 mt_clear_in_rcu(vmi.mas.tree);
668 for_each_vma(vmi, mpnt) {
669 struct file *file;
670
671 vma_start_write(mpnt);
672 if (mpnt->vm_flags & VM_DONTCOPY) {
/* drop the range from the child and take it out of the copied counters */
673 retval = vma_iter_clear_gfp(&vmi, mpnt->vm_start,
674 mpnt->vm_end, GFP_KERNEL);
675 if (retval)
676 goto loop_out;
677
678 vm_stat_account(mm, mpnt->vm_flags, -vma_pages(mpnt));
679 continue;
680 }
681 charge = 0;
682 /*
683 * Don't duplicate many vmas if we've been oom-killed (for
684 * example)
685 */
686 if (fatal_signal_pending(current)) {
687 retval = -EINTR;
688 goto loop_out;
689 }
690 if (mpnt->vm_flags & VM_ACCOUNT) {
691 unsigned long len = vma_pages(mpnt);
692
693 if (security_vm_enough_memory_mm(oldmm, len)) /* sic */
694 goto fail_nomem;
695 charge = len;
696 }
697 tmp = vm_area_dup(mpnt);
698 if (!tmp)
699 goto fail_nomem;
700 retval = vma_dup_policy(mpnt, tmp);
701 if (retval)
702 goto fail_nomem_policy;
703 tmp->vm_mm = mm;
704 retval = dup_userfaultfd(tmp, &uf);
705 if (retval)
706 goto fail_nomem_anon_vma_fork;
707 if (tmp->vm_flags & VM_WIPEONFORK) {
708 /*
709 * VM_WIPEONFORK gets a clean slate in the child.
710 * Don't prepare anon_vma until fault since we don't
711 * copy page for current vma.
712 */
713 tmp->anon_vma = NULL;
714 } else if (anon_vma_fork(tmp, mpnt))
715 goto fail_nomem_anon_vma_fork;
716 vm_flags_clear(tmp, VM_LOCKED_MASK);
717 /*
718 * Copy/update hugetlb private vma information.
719 */
720 if (is_vm_hugetlb_page(tmp))
721 hugetlb_dup_vma_private(tmp);
722
723 /*
724 * Link the vma into the MT. After using __mt_dup(), memory
725 * allocation is not necessary here, so it cannot fail.
726 */
727 vma_iter_bulk_store(&vmi, tmp);
728
729 mm->map_count++;
730
731 if (tmp->vm_ops && tmp->vm_ops->open)
732 tmp->vm_ops->open(tmp);
733
734 file = tmp->vm_file;
735 if (file) {
736 struct address_space *mapping = file->f_mapping;
737
738 get_file(file);
739 i_mmap_lock_write(mapping);
740 if (vma_is_shared_maywrite(tmp))
741 mapping_allow_writable(mapping);
742 flush_dcache_mmap_lock(mapping);
743 /* insert tmp into the share list, just after mpnt */
744 vma_interval_tree_insert_after(tmp, mpnt,
745 &mapping->i_mmap);
746 flush_dcache_mmap_unlock(mapping);
747 i_mmap_unlock_write(mapping);
748 }
749
750 if (!(tmp->vm_flags & VM_WIPEONFORK))
751 retval = copy_page_range(tmp, mpnt);
752
753 if (retval) {
/*
 * tmp is already stored in the tree, so step to the following
 * VMA before the failure point is marked at loop_out.
 */
754 mpnt = vma_next(&vmi);
755 goto loop_out;
756 }
757 }
758 /* a new mm has just been created */
759 retval = arch_dup_mmap(oldmm, mm);
760 loop_out:
761 vma_iter_free(&vmi);
762 if (!retval) {
763 mt_set_in_rcu(vmi.mas.tree);
764 } else if (mpnt) {
765 /*
766 * The entire maple tree has already been duplicated. If the
767 * mmap duplication fails, mark the failure point with
768 * XA_ZERO_ENTRY. In exit_mmap(), if this marker is encountered,
769 * stop releasing VMAs that have not been duplicated after this
770 * point.
771 */
772 mas_set_range(&vmi.mas, mpnt->vm_start, mpnt->vm_end - 1);
773 mas_store(&vmi.mas, XA_ZERO_ENTRY);
774 }
775 out:
776 mmap_write_unlock(mm);
777 flush_tlb_mm(oldmm);
778 mmap_write_unlock(oldmm);
779 dup_userfaultfd_complete(&uf);
780 fail_uprobe_end:
781 uprobe_end_dup_mmap();
782 return retval;
783
/*
 * Unwind ladder for a VMA copy that failed before it was linked into the
 * tree. Each label falls through to the next, and retval is forced to
 * -ENOMEM whatever the failing helper returned.
 */
784 fail_nomem_anon_vma_fork:
785 mpol_put(vma_policy(tmp));
786 fail_nomem_policy:
787 vm_area_free(tmp);
788 fail_nomem:
789 retval = -ENOMEM;
790 vm_unacct_memory(charge);
791 goto loop_out;
792 }
793
/*
 * Allocate the top-level page table for @mm with pgd_alloc() and store it in
 * mm->pgd. Returns 0 on success, -ENOMEM if pgd_alloc() returned NULL.
 */
794 static inline int mm_alloc_pgd(struct mm_struct *mm)
795 {
796 mm->pgd = pgd_alloc(mm);
797 if (unlikely(!mm->pgd))
798 return -ENOMEM;
799 return 0;
800 }
801
/* Release @mm's top-level page table through pgd_free(). */
802 static inline void mm_free_pgd(struct mm_struct *mm)
803 {
804 pgd_free(mm, mm->pgd);
805 }
806 #else
/*
 * !CONFIG_MMU variant of dup_mmap(): no VMAs are copied. Only the exe_file
 * reference is duplicated, under oldmm's mmap write lock. Always returns 0.
 */
807 static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
808 {
809 mmap_write_lock(oldmm);
810 dup_mm_exe_file(mm, oldmm);
811 mmap_write_unlock(oldmm);
812 return 0;
813 }
814 #define mm_alloc_pgd(mm) (0)
815 #define mm_free_pgd(mm)
816 #endif /* CONFIG_MMU */
817
/*
 * Sanity-check @mm just before it is freed (called from __mmdrop()).
 *
 * Emits a pr_alert() for every RSS counter whose sum is non-zero and for a
 * non-zero page-table byte count. It only reports; nothing is corrected.
 * The build fails if resident_page_types[] does not have NR_MM_COUNTERS
 * entries.
 */
818 static void check_mm(struct mm_struct *mm)
819 {
820 int i;
821
822 BUILD_BUG_ON_MSG(ARRAY_SIZE(resident_page_types) != NR_MM_COUNTERS,
823 "Please make sure 'struct resident_page_types[]' is updated as well");
824
825 for (i = 0; i < NR_MM_COUNTERS; i++) {
826 long x = percpu_counter_sum(&mm->rss_stat[i]);
827
828 if (unlikely(x))
829 pr_alert("BUG: Bad rss-counter state mm:%p type:%s val:%ld\n",
830 mm, resident_page_types[i], x);
831 }
832
833 if (mm_pgtables_bytes(mm))
834 pr_alert("BUG: non-zero pgtables_bytes on freeing mm: %ld\n",
835 mm_pgtables_bytes(mm));
836
837 #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS
838 VM_BUG_ON_MM(mm->pmd_huge_pte, mm);
839 #endif
840 }
841
842 #define allocate_mm() (kmem_cache_alloc(mm_cachep, GFP_KERNEL))
843 #define free_mm(mm) (kmem_cache_free(mm_cachep, (mm)))
844
/*
 * Cross-CPU callback used by cleanup_lazy_tlbs() as a debug check. It warns
 * once if the task running on this CPU still has @arg as its active_mm.
 */
845 static void do_check_lazy_tlb(void *arg)
846 {
847 struct mm_struct *mm = arg;
848
849 WARN_ON_ONCE(current->active_mm == mm);
850 }
851
/*
 * Cross-CPU callback used by cleanup_lazy_tlbs(). If the task running on
 * this CPU has @arg as its active_mm, it is switched over to init_mm. A task
 * that also has a non-NULL ->mm at that point triggers a WARN.
 */
852 static void do_shoot_lazy_tlb(void *arg)
853 {
854 struct mm_struct *mm = arg;
855
856 if (current->active_mm == mm) {
857 WARN_ON_ONCE(current->mm);
858 current->active_mm = &init_mm;
859 switch_mm(mm, &init_mm, current);
860 }
861 }
862
/*
 * Make sure no CPU keeps @mm as a lazy tlb mm before it is freed.
 *
 * Does nothing unless CONFIG_MMU_LAZY_TLB_SHOOTDOWN is enabled. Otherwise it
 * runs do_shoot_lazy_tlb() on every CPU in mm_cpumask(@mm) and waits for
 * completion. With CONFIG_DEBUG_VM_SHOOT_LAZIES it then runs
 * do_check_lazy_tlb() on all CPUs to verify the result.
 */
863 static void cleanup_lazy_tlbs(struct mm_struct *mm)
864 {
865 if (!IS_ENABLED(CONFIG_MMU_LAZY_TLB_SHOOTDOWN)) {
866 /*
867 * In this case, lazy tlb mms are refcounted and would not reach
868 * __mmdrop until all CPUs have switched away and mmdrop()ed.
869 */
870 return;
871 }
872
873 /*
874 * Lazy mm shootdown does not refcount "lazy tlb mm" usage, rather it
875 * requires lazy mm users to switch to another mm when the refcount
876 * drops to zero, before the mm is freed. This requires IPIs here to
877 * switch kernel threads to init_mm.
878 *
879 * archs that use IPIs to flush TLBs can piggy-back that lazy tlb mm
880 * switch with the final userspace teardown TLB flush which leaves the
881 * mm lazy on this CPU but no others, reducing the need for additional
882 * IPIs here. There are cases where a final IPI is still required here,
883 * such as the final mmdrop being performed on a different CPU than the
884 * one exiting, or kernel threads using the mm when userspace exits.
885 *
886 * IPI overheads have not been found to be expensive, but they could be
887 * reduced in a number of possible ways, for example (roughly
888 * increasing order of complexity):
889 * - The last lazy reference created by exit_mm() could instead switch
890 * to init_mm, however it's probable this will run on the same CPU
891 * immediately afterwards, so this may not reduce IPIs much.
892 * - A batch of mms requiring IPIs could be gathered and freed at once.
893 * - CPUs store active_mm where it can be remotely checked without a
894 * lock, to filter out false-positives in the cpumask.
895 * - After mm_users or mm_count reaches zero, switching away from the
896 * mm could clear mm_cpumask to reduce some IPIs, perhaps together
897 * with some batching or delaying of the final IPIs.
898 * - A delayed freeing and RCU-like quiescing sequence based on mm
899 * switching to avoid IPIs completely.
900 */
901 on_each_cpu_mask(mm_cpumask(mm), do_shoot_lazy_tlb, (void *)mm, 1);
902 if (IS_ENABLED(CONFIG_DEBUG_VM_SHOOT_LAZIES))
903 on_each_cpu(do_check_lazy_tlb, (void *)mm, 1);
904 }
905
906 /*
907 * Called when the last reference to the mm
908 * is dropped: either by a lazy thread or by
909 * mmput. Free the page directory and the mm.
 *
 * BUGs if @mm is init_mm and warns if @mm is still current->mm or (after the
 * lazy-tlb cleanup) current->active_mm. check_mm() runs after the page
 * directory, arch context and mmu-notifier state have been torn down, so it
 * inspects the counters in their final state.
910 */
911 void __mmdrop(struct mm_struct *mm)
912 {
913 BUG_ON(mm == &init_mm);
914 WARN_ON_ONCE(mm == current->mm);
915
916 /* Ensure no CPUs are using this as their lazy tlb mm */
917 cleanup_lazy_tlbs(mm);
918
919 WARN_ON_ONCE(mm == current->active_mm);
920 mm_free_pgd(mm);
921 destroy_context(mm);
922 mmu_notifier_subscriptions_destroy(mm);
923 check_mm(mm);
924 put_user_ns(mm->user_ns);
925 mm_pasid_drop(mm);
926 mm_destroy_cid(mm);
927 percpu_counter_destroy_many(mm->rss_stat, NR_MM_COUNTERS);
928
929 free_mm(mm);
930 }
931 EXPORT_SYMBOL_GPL(__mmdrop);
932
/*
 * Work handler scheduled by mmdrop_async(): recover the mm from its embedded
 * async_put_work item and run __mmdrop() on it.
 */
933 static void mmdrop_async_fn(struct work_struct *work)
934 {
935 struct mm_struct *mm;
936
937 mm = container_of(work, struct mm_struct, async_put_work);
938 __mmdrop(mm);
939 }
940
/*
 * Drop one mm_count reference on @mm. If it was the last one, __mmdrop() is
 * not called directly; it is handed to a work item via schedule_work().
 */
941 static void mmdrop_async(struct mm_struct *mm)
942 {
943 if (unlikely(atomic_dec_and_test(&mm->mm_count))) {
944 INIT_WORK(&mm->async_put_work, mmdrop_async_fn);
945 schedule_work(&mm->async_put_work);
946 }
947 }
948
/*
 * Free @sig: release its taskstats and autogroup state, drop the reference
 * on sig->oom_mm (if any) asynchronously, and return the structure to
 * signal_cachep.
 */
949 static inline void free_signal_struct(struct signal_struct *sig)
950 {
951 taskstats_tgid_free(sig);
952 sched_autogroup_exit(sig);
953 /*
954 * __mmdrop is not safe to call from softirq context on x86 due to
955 * pgd_dtor so postpone it to the async context
956 */
957 if (sig->oom_mm)
958 mmdrop_async(sig->oom_mm);
959 kmem_cache_free(signal_cachep, sig);
960 }
961
/* Drop one sigcnt reference on @sig and free it when the count hits zero. */
962 static inline void put_signal_struct(struct signal_struct *sig)
963 {
964 if (refcount_dec_and_test(&sig->sigcnt))
965 free_signal_struct(sig);
966 }
967
/*
 * __put_task_struct - final teardown of @tsk.
 *
 * Warns if the task has no exit_state, still has a non-zero usage count, or
 * is the current task. It then releases the per-subsystem state listed in
 * the body, drops the signal_struct reference, and frees the task via
 * free_task().
 */
968 void __put_task_struct(struct task_struct *tsk)
969 {
970 WARN_ON(!tsk->exit_state);
971 WARN_ON(refcount_read(&tsk->usage));
972 WARN_ON(tsk == current);
973
974 io_uring_free(tsk);
975 cgroup_free(tsk);
976 task_numa_free(tsk, true);
977 security_task_free(tsk);
978 exit_creds(tsk);
979 delayacct_tsk_free(tsk);
980 put_signal_struct(tsk->signal);
981 sched_core_free(tsk);
982 free_task(tsk);
983 }
984 EXPORT_SYMBOL_GPL(__put_task_struct);
985
/*
 * RCU-callback form of __put_task_struct(): recover the task from its
 * embedded rcu head and tear it down.
 */
986 void __put_task_struct_rcu_cb(struct rcu_head *rhp)
987 {
988 struct task_struct *task = container_of(rhp, struct task_struct, rcu);
989
990 __put_task_struct(task);
991 }
992 EXPORT_SYMBOL_GPL(__put_task_struct_rcu_cb);
993
994 void __init __weak arch_task_cache_init(void) { }
995
996 /*
997 * set_max_threads - compute the global max_threads limit.
 *
 * The RAM-derived value is capped at @max_threads_suggested and the result
 * is clamped to [MIN_THREADS, MAX_THREADS] before being stored in
 * max_threads.
998 */
999 static void set_max_threads(unsigned int max_threads_suggested)
1000 {
1001 u64 threads;
1002 unsigned long nr_pages = totalram_pages();
1003
1004 /*
1005 * The number of threads shall be limited such that the thread
1006 * structures may only consume a small part of the available memory.
1007 */
/* the bit-length test avoids overflowing nr_pages * PAGE_SIZE in 64 bits */
1008 if (fls64(nr_pages) + fls64(PAGE_SIZE) > 64)
1009 threads = MAX_THREADS;
1010 else
1011 threads = div64_u64((u64) nr_pages * (u64) PAGE_SIZE,
1012 (u64) THREAD_SIZE * 8UL);
1013
1014 if (threads > max_threads_suggested)
1015 threads = max_threads_suggested;
1016
1017 max_threads = clamp_t(u64, threads, MIN_THREADS, MAX_THREADS);
1018 }
1019
1020 #ifdef CONFIG_ARCH_WANTS_DYNAMIC_TASK_STRUCT
1021 /* Initialized by the architecture: */
1022 int arch_task_struct_size __read_mostly;
1023 #endif
1024
/*
 * Compute the usercopy window of task_struct for the slab cache.
 *
 * @offset and @size are outputs. The architecture reports a window relative
 * to thread_struct. A zero-sized window is returned with offset 0; otherwise
 * the offset is rebased to the start of task_struct.
 */
1025 static void task_struct_whitelist(unsigned long *offset, unsigned long *size)
1026 {
1027 /* Fetch thread_struct whitelist for the architecture. */
1028 arch_thread_struct_whitelist(offset, size);
1029
1030 /*
1031 * Handle zero-sized whitelist or empty thread_struct, otherwise
1032 * adjust offset to position of thread_struct in task_struct.
1033 */
1034 if (unlikely(*size == 0))
1035 *offset = 0;
1036 else
1037 *offset += offsetof(struct task_struct, thread);
1038 }
1039
/*
 * fork_init - boot-time setup for task creation.
 *
 * Creates the task_struct slab cache, runs the arch cache hook and computes
 * max_threads. It seeds init_task's RLIMIT_NPROC and RLIMIT_SIGPENDING and
 * init_user_ns's ucount limits with max_threads/2, and lifts four userns
 * rlimit maxima to RLIM_INFINITY. With CONFIG_VMAP_STACK it registers
 * free_vm_stack_cache() with the CPU hotplug core. Finally it initialises
 * scs, lockdep state for init_task, and uprobes.
 */
1040 void __init fork_init(void)
1041 {
1042 int i;
1043 #ifndef ARCH_MIN_TASKALIGN
1044 #define ARCH_MIN_TASKALIGN 0
1045 #endif
1046 int align = max_t(int, L1_CACHE_BYTES, ARCH_MIN_TASKALIGN);
1047 unsigned long useroffset, usersize;
1048
1049 /* create a slab on which task_structs can be allocated */
1050 task_struct_whitelist(&useroffset, &usersize);
1051 task_struct_cachep = kmem_cache_create_usercopy("task_struct",
1052 arch_task_struct_size, align,
1053 SLAB_PANIC|SLAB_ACCOUNT,
1054 useroffset, usersize, NULL);
1055
1056 /* do the arch specific task caches init */
1057 arch_task_cache_init();
1058
1059 set_max_threads(MAX_THREADS);
1060
1061 init_task.signal->rlim[RLIMIT_NPROC].rlim_cur = max_threads/2;
1062 init_task.signal->rlim[RLIMIT_NPROC].rlim_max = max_threads/2;
/* RLIMIT_SIGPENDING starts out identical to RLIMIT_NPROC */
1063 init_task.signal->rlim[RLIMIT_SIGPENDING] =
1064 init_task.signal->rlim[RLIMIT_NPROC];
1065
1066 for (i = 0; i < UCOUNT_COUNTS; i++)
1067 init_user_ns.ucount_max[i] = max_threads/2;
1068
1069 set_userns_rlimit_max(&init_user_ns, UCOUNT_RLIMIT_NPROC, RLIM_INFINITY);
1070 set_userns_rlimit_max(&init_user_ns, UCOUNT_RLIMIT_MSGQUEUE, RLIM_INFINITY);
1071 set_userns_rlimit_max(&init_user_ns, UCOUNT_RLIMIT_SIGPENDING, RLIM_INFINITY);
1072 set_userns_rlimit_max(&init_user_ns, UCOUNT_RLIMIT_MEMLOCK, RLIM_INFINITY);
1073
1074 #ifdef CONFIG_VMAP_STACK
1075 cpuhp_setup_state(CPUHP_BP_PREPARE_DYN, "fork:vm_stack_cache",
1076 NULL, free_vm_stack_cache);
1077 #endif
1078
1079 scs_init();
1080
1081 lockdep_init_task(&init_task);
1082 uprobes_init();
1083 }
1084
/*
 * Weak default for copying a task_struct: a plain structure assignment from
 * @src to @dst. Always returns 0; an architecture may override it.
 */
1085 int __weak arch_dup_task_struct(struct task_struct *dst,
1086 struct task_struct *src)
1087 {
1088 *dst = *src;
1089 return 0;
1090 }
1091
/*
 * set_task_stack_end_magic - write STACK_END_MAGIC into the word that
 * end_of_stack(@tsk) points at, so that a stack overflow can be detected.
 */
1092 void set_task_stack_end_magic(struct task_struct *tsk)
1093 {
1094 unsigned long *stackend;
1095
1096 stackend = end_of_stack(tsk);
1097 *stackend = STACK_END_MAGIC; /* for overflow detection */
1098 }
1099
/*
 * dup_task_struct - allocate a new task_struct as a copy of @orig.
 *
 * @node selects the NUMA node for the allocations. NUMA_NO_NODE means "use
 * tsk_fork_get_node(@orig)". The copy gets its own kernel stack (accounted
 * through account_kernel_stack()), scs_prepare() is run for it, and its
 * reference counts are set. A number of fields copied from @orig are reset
 * to a fresh state.
 *
 * Returns the new task, or NULL if any allocation or setup step failed. The
 * specific error code is not propagated.
 */
1100 static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
1101 {
1102 struct task_struct *tsk;
1103 int err;
1104
1105 if (node == NUMA_NO_NODE)
1106 node = tsk_fork_get_node(orig);
1107 tsk = alloc_task_struct_node(node);
1108 if (!tsk)
1109 return NULL;
1110
1111 err = arch_dup_task_struct(tsk, orig);
1112 if (err)
1113 goto free_tsk;
1114
1115 err = alloc_thread_stack_node(tsk, node);
1116 if (err)
1117 goto free_tsk;
1118
1119 #ifdef CONFIG_THREAD_INFO_IN_TASK
1120 refcount_set(&tsk->stack_refcount, 1);
1121 #endif
1122 account_kernel_stack(tsk, 1);
1123
1124 err = scs_prepare(tsk, node);
1125 if (err)
1126 goto free_stack;
1127
1128 #ifdef CONFIG_SECCOMP
1129 /*
1130 * We must handle setting up seccomp filters once we're under
1131 * the sighand lock in case orig has changed between now and
1132 * then. Until then, filter must be NULL to avoid messing up
1133 * the usage counts on the error path calling free_task.
1134 */
1135 tsk->seccomp.filter = NULL;
1136 #endif
1137
1138 setup_thread_stack(tsk, orig);
1139 clear_user_return_notifier(tsk);
1140 clear_tsk_need_resched(tsk);
1141 set_task_stack_end_magic(tsk);
1142 clear_syscall_work_syscall_user_dispatch(tsk);
1143
1144 #ifdef CONFIG_STACKPROTECTOR
1145 tsk->stack_canary = get_random_canary();
1146 #endif
/*
 * The structure copy left cpus_ptr aimed at orig's embedded mask;
 * re-point it at the copy's own mask in that case.
 */
1147 if (orig->cpus_ptr == &orig->cpus_mask)
1148 tsk->cpus_ptr = &tsk->cpus_mask;
1149 dup_user_cpus_ptr(tsk, orig, node);
1150
1151 /*
1152 * One for the user space visible state that goes away when reaped.
1153 * One for the scheduler.
1154 */
1155 refcount_set(&tsk->rcu_users, 2);
1156 /* One for the rcu users */
1157 refcount_set(&tsk->usage, 1);
1158 #ifdef CONFIG_BLK_DEV_IO_TRACE
1159 tsk->btrace_seq = 0;
1160 #endif
1161 tsk->splice_pipe = NULL;
1162 tsk->task_frag.page = NULL;
1163 tsk->wake_q.next = NULL;
1164 tsk->worker_private = NULL;
1165
1166 kcov_task_init(tsk);
1167 kmsan_task_create(tsk);
1168 kmap_local_fork(tsk);
1169
1170 #ifdef CONFIG_FAULT_INJECTION
1171 tsk->fail_nth = 0;
1172 #endif
1173
1174 #ifdef CONFIG_BLK_CGROUP
1175 tsk->throttle_disk = NULL;
1176 tsk->use_memdelay = 0;
1177 #endif
1178
1179 #ifdef CONFIG_ARCH_HAS_CPU_PASID
1180 tsk->pasid_activated = 0;
1181 #endif
1182
1183 #ifdef CONFIG_MEMCG
1184 tsk->active_memcg = NULL;
1185 #endif
1186
1187 #ifdef CONFIG_CPU_SUP_INTEL
1188 tsk->reported_split_lock = 0;
1189 #endif
1190
1191 #ifdef CONFIG_SCHED_MM_CID
1192 tsk->mm_cid = -1;
1193 tsk->last_mm_cid = -1;
1194 tsk->mm_cid_active = 0;
1195 tsk->migrate_from_cpu = -1;
1196 #endif
1197 return tsk;
1198
/* undo the stack accounting before the stack itself is released */
1199 free_stack:
1200 exit_task_stack_account(tsk);
1201 free_thread_stack(tsk);
1202 free_tsk:
1203 free_task_struct(tsk);
1204 return NULL;
1205 }
1206
1207 __cacheline_aligned_in_smp DEFINE_SPINLOCK(mmlist_lock);
1208
1209 static unsigned long default_dump_filter = MMF_DUMP_FILTER_DEFAULT;
1210
1211 static int __init coredump_filter_setup(char *s)
1212 {
1213 default_dump_filter =
1214 (simple_strtoul(s, NULL, 0) << MMF_DUMP_FILTER_SHIFT) &
1215 MMF_DUMP_FILTER_MASK;
1216 return 1;
1217 }
1218
1219 __setup("coredump_filter=", coredump_filter_setup);
1220
1221 #include <linux/init_task.h>
1222
1223 static void mm_init_aio(struct mm_struct *mm)
1224 {
1225 #ifdef CONFIG_AIO
1226 spin_lock_init(&mm->ioctx_lock);
1227 mm->ioctx_table = NULL;
1228 #endif
1229 }
1230
1231 static __always_inline void mm_clear_owner(struct mm_struct *mm,
1232 struct task_struct *p)
1233 {
1234 #ifdef CONFIG_MEMCG
1235 if (mm->owner == p)
1236 WRITE_ONCE(mm->owner, NULL);
1237 #endif
1238 }
1239
1240 static void mm_init_owner(struct mm_struct *mm, struct task_struct *p)
1241 {
1242 #ifdef CONFIG_MEMCG
1243 mm->owner = p;
1244 #endif
1245 }
1246
1247 static void mm_init_uprobes_state(struct mm_struct *mm)
1248 {
1249 #ifdef CONFIG_UPROBES
1250 mm->uprobes_state.xol_area = NULL;
1251 #endif
1252 }
1253
1254 static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
1255 struct user_namespace *user_ns)
1256 {
1257 mt_init_flags(&mm->mm_mt, MM_MT_FLAGS);
1258 mt_set_external_lock(&mm->mm_mt, &mm->mmap_lock);
1259 atomic_set(&mm->mm_users, 1);
1260 atomic_set(&mm->mm_count, 1);
1261 seqcount_init(&mm->write_protect_seq);
1262 mmap_init_lock(mm);
1263 INIT_LIST_HEAD(&mm->mmlist);
1264 #ifdef CONFIG_PER_VMA_LOCK
1265 mm->mm_lock_seq = 0;
1266 #endif
1267 mm_pgtables_bytes_init(mm);
1268 mm->map_count = 0;
1269 mm->locked_vm = 0;
1270 atomic64_set(&mm->pinned_vm, 0);
1271 memset(&mm->rss_stat, 0, sizeof(mm->rss_stat));
1272 spin_lock_init(&mm->page_table_lock);
1273 spin_lock_init(&mm->arg_lock);
1274 mm_init_cpumask(mm);
1275 mm_init_aio(mm);
1276 mm_init_owner(mm, p);
1277 mm_pasid_init(mm);
1278 RCU_INIT_POINTER(mm->exe_file, NULL);
1279 mmu_notifier_subscriptions_init(mm);
1280 init_tlb_flush_pending(mm);
1281 #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS
1282 mm->pmd_huge_pte = NULL;
1283 #endif
1284 mm_init_uprobes_state(mm);
1285 hugetlb_count_init(mm);
1286
1287 if (current->mm) {
1288 mm->flags = mmf_init_flags(current->mm->flags);
1289 mm->def_flags = current->mm->def_flags & VM_INIT_DEF_MASK;
1290 } else {
1291 mm->flags = default_dump_filter;
1292 mm->def_flags = 0;
1293 }
1294
1295 if (mm_alloc_pgd(mm))
1296 goto fail_nopgd;
1297
1298 if (init_new_context(p, mm))
1299 goto fail_nocontext;
1300
1301 if (mm_alloc_cid(mm))
1302 goto fail_cid;
1303
1304 if (percpu_counter_init_many(mm->rss_stat, 0, GFP_KERNEL_ACCOUNT,
1305 NR_MM_COUNTERS))
1306 goto fail_pcpu;
1307
1308 mm->user_ns = get_user_ns(user_ns);
1309 lru_gen_init_mm(mm);
1310 return mm;
1311
1312 fail_pcpu:
1313 mm_destroy_cid(mm);
1314 fail_cid:
1315 destroy_context(mm);
1316 fail_nocontext:
1317 mm_free_pgd(mm);
1318 fail_nopgd:
1319 free_mm(mm);
1320 return NULL;
1321 }
1322
1323 /*
1324 * Allocate and initialize an mm_struct.
1325 */
1326 struct mm_struct *mm_alloc(void)
1327 {
1328 struct mm_struct *mm;
1329
1330 mm = allocate_mm();
1331 if (!mm)
1332 return NULL;
1333
1334 memset(mm, 0, sizeof(*mm));
1335 return mm_init(mm, current, current_user_ns());
1336 }
1337
1338 static inline void __mmput(struct mm_struct *mm)
1339 {
1340 VM_BUG_ON(atomic_read(&mm->mm_users));
1341
1342 uprobe_clear_state(mm);
1343 exit_aio(mm);
1344 ksm_exit(mm);
1345 khugepaged_exit(mm); /* must run before exit_mmap */
1346 exit_mmap(mm);
1347 mm_put_huge_zero_page(mm);
1348 set_mm_exe_file(mm, NULL);
1349 if (!list_empty(&mm->mmlist)) {
1350 spin_lock(&mmlist_lock);
1351 list_del(&mm->mmlist);
1352 spin_unlock(&mmlist_lock);
1353 }
1354 if (mm->binfmt)
1355 module_put(mm->binfmt->module);
1356 lru_gen_del_mm(mm);
1357 mmdrop(mm);
1358 }
1359
1360 /*
1361 * Decrement the use count and release all resources for an mm.
1362 */
1363 void mmput(struct mm_struct *mm)
1364 {
1365 might_sleep();
1366
1367 if (atomic_dec_and_test(&mm->mm_users))
1368 __mmput(mm);
1369 }
1370 EXPORT_SYMBOL_GPL(mmput);
1371
1372 #ifdef CONFIG_MMU
1373 static void mmput_async_fn(struct work_struct *work)
1374 {
1375 struct mm_struct *mm = container_of(work, struct mm_struct,
1376 async_put_work);
1377
1378 __mmput(mm);
1379 }
1380
1381 void mmput_async(struct mm_struct *mm)
1382 {
1383 if (atomic_dec_and_test(&mm->mm_users)) {
1384 INIT_WORK(&mm->async_put_work, mmput_async_fn);
1385 schedule_work(&mm->async_put_work);
1386 }
1387 }
1388 EXPORT_SYMBOL_GPL(mmput_async);
1389 #endif
1390
1391 /**
1392 * set_mm_exe_file - change a reference to the mm's executable file
1393 * @mm: The mm to change.
1394 * @new_exe_file: The new file to use.
1395 *
1396 * This changes mm's executable file (shown as symlink /proc/[pid]/exe).
1397 *
1398 * Main users are mmput() and sys_execve(). Callers prevent concurrent
1399 * invocations: in mmput() nobody alive left, in execve it happens before
1400 * the new mm is made visible to anyone.
1401 *
1402 * Can only fail if new_exe_file != NULL.
1403 */
1404 int set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file)
1405 {
1406 struct file *old_exe_file;
1407
1408 /*
1409 * It is safe to dereference the exe_file without RCU as
1410 * this function is only called if nobody else can access
1411 * this mm -- see comment above for justification.
1412 */
1413 old_exe_file = rcu_dereference_raw(mm->exe_file);
1414
1415 if (new_exe_file) {
1416 /*
1417 * We expect the caller (i.e., sys_execve) to already denied
1418 * write access, so this is unlikely to fail.
1419 */
1420 if (unlikely(deny_write_access(new_exe_file)))
1421 return -EACCES;
1422 get_file(new_exe_file);
1423 }
1424 rcu_assign_pointer(mm->exe_file, new_exe_file);
1425 if (old_exe_file) {
1426 allow_write_access(old_exe_file);
1427 fput(old_exe_file);
1428 }
1429 return 0;
1430 }
1431
1432 /**
1433 * replace_mm_exe_file - replace a reference to the mm's executable file
1434 * @mm: The mm to change.
1435 * @new_exe_file: The new file to use.
1436 *
1437 * This changes mm's executable file (shown as symlink /proc/[pid]/exe).
1438 *
1439 * Main user is sys_prctl(PR_SET_MM_MAP/EXE_FILE).
1440 */
1441 int replace_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file)
1442 {
1443 struct vm_area_struct *vma;
1444 struct file *old_exe_file;
1445 int ret = 0;
1446
1447 /* Forbid mm->exe_file change if old file still mapped. */
1448 old_exe_file = get_mm_exe_file(mm);
1449 if (old_exe_file) {
1450 VMA_ITERATOR(vmi, mm, 0);
1451 mmap_read_lock(mm);
1452 for_each_vma(vmi, vma) {
1453 if (!vma->vm_file)
1454 continue;
1455 if (path_equal(&vma->vm_file->f_path,
1456 &old_exe_file->f_path)) {
1457 ret = -EBUSY;
1458 break;
1459 }
1460 }
1461 mmap_read_unlock(mm);
1462 fput(old_exe_file);
1463 if (ret)
1464 return ret;
1465 }
1466
1467 ret = deny_write_access(new_exe_file);
1468 if (ret)
1469 return -EACCES;
1470 get_file(new_exe_file);
1471
1472 /* set the new file */
1473 mmap_write_lock(mm);
1474 old_exe_file = rcu_dereference_raw(mm->exe_file);
1475 rcu_assign_pointer(mm->exe_file, new_exe_file);
1476 mmap_write_unlock(mm);
1477
1478 if (old_exe_file) {
1479 allow_write_access(old_exe_file);
1480 fput(old_exe_file);
1481 }
1482 return 0;
1483 }
1484
1485 /**
1486 * get_mm_exe_file - acquire a reference to the mm's executable file
1487 * @mm: The mm of interest.
1488 *
1489 * Returns %NULL if mm has no associated executable file.
1490 * User must release file via fput().
1491 */
1492 struct file *get_mm_exe_file(struct mm_struct *mm)
1493 {
1494 struct file *exe_file;
1495
1496 rcu_read_lock();
1497 exe_file = get_file_rcu(&mm->exe_file);
1498 rcu_read_unlock();
1499 return exe_file;
1500 }
1501
1502 /**
1503 * get_task_exe_file - acquire a reference to the task's executable file
1504 * @task: The task.
1505 *
1506 * Returns %NULL if task's mm (if any) has no associated executable file or
1507 * this is a kernel thread with borrowed mm (see the comment above get_task_mm).
1508 * User must release file via fput().
1509 */
1510 struct file *get_task_exe_file(struct task_struct *task)
1511 {
1512 struct file *exe_file = NULL;
1513 struct mm_struct *mm;
1514
1515 task_lock(task);
1516 mm = task->mm;
1517 if (mm) {
1518 if (!(task->flags & PF_KTHREAD))
1519 exe_file = get_mm_exe_file(mm);
1520 }
1521 task_unlock(task);
1522 return exe_file;
1523 }
1524
1525 /**
1526 * get_task_mm - acquire a reference to the task's mm
1527 * @task: The task.
1528 *
1529 * Returns %NULL if the task has no mm. Checks PF_KTHREAD (meaning
1530 * this kernel workthread has transiently adopted a user mm with use_mm,
1531 * to do its AIO) is not set and if so returns a reference to it, after
1532 * bumping up the use count. User must release the mm via mmput()
1533 * after use. Typically used by /proc and ptrace.
1534 */
1535 struct mm_struct *get_task_mm(struct task_struct *task)
1536 {
1537 struct mm_struct *mm;
1538
1539 task_lock(task);
1540 mm = task->mm;
1541 if (mm) {
1542 if (task->flags & PF_KTHREAD)
1543 mm = NULL;
1544 else
1545 mmget(mm);
1546 }
1547 task_unlock(task);
1548 return mm;
1549 }
1550 EXPORT_SYMBOL_GPL(get_task_mm);
1551
1552 struct mm_struct *mm_access(struct task_struct *task, unsigned int mode)
1553 {
1554 struct mm_struct *mm;
1555 int err;
1556
1557 err = down_read_killable(&task->signal->exec_update_lock);
1558 if (err)
1559 return ERR_PTR(err);
1560
1561 mm = get_task_mm(task);
1562 if (mm && mm != current->mm &&
1563 !ptrace_may_access(task, mode)) {
1564 mmput(mm);
1565 mm = ERR_PTR(-EACCES);
1566 }
1567 up_read(&task->signal->exec_update_lock);
1568
1569 return mm;
1570 }
1571
1572 static void complete_vfork_done(struct task_struct *tsk)
1573 {
1574 struct completion *vfork;
1575
1576 task_lock(tsk);
1577 vfork = tsk->vfork_done;
1578 if (likely(vfork)) {
1579 tsk->vfork_done = NULL;
1580 complete(vfork);
1581 }
1582 task_unlock(tsk);
1583 }
1584
1585 static int wait_for_vfork_done(struct task_struct *child,
1586 struct completion *vfork)
1587 {
1588 unsigned int state = TASK_KILLABLE|TASK_FREEZABLE;
1589 int killed;
1590
1591 cgroup_enter_frozen();
1592 killed = wait_for_completion_state(vfork, state);
1593 cgroup_leave_frozen(false);
1594
1595 if (killed) {
1596 task_lock(child);
1597 child->vfork_done = NULL;
1598 task_unlock(child);
1599 }
1600
1601 put_task_struct(child);
1602 return killed;
1603 }
1604
1605 /* Please note the differences between mmput and mm_release.
1606 * mmput is called whenever we stop holding onto a mm_struct,
1607 * error success whatever.
1608 *
1609 * mm_release is called after a mm_struct has been removed
1610 * from the current process.
1611 *
1612 * This difference is important for error handling, when we
1613 * only half set up a mm_struct for a new process and need to restore
1614 * the old one. Because we mmput the new mm_struct before
1615 * restoring the old one. . .
1616 * Eric Biederman 10 January 1998
1617 */
1618 static void mm_release(struct task_struct *tsk, struct mm_struct *mm)
1619 {
1620 uprobe_free_utask(tsk);
1621
1622 /* Get rid of any cached register state */
1623 deactivate_mm(tsk, mm);
1624
1625 /*
1626 * Signal userspace if we're not exiting with a core dump
1627 * because we want to leave the value intact for debugging
1628 * purposes.
1629 */
1630 if (tsk->clear_child_tid) {
1631 if (atomic_read(&mm->mm_users) > 1) {
1632 /*
1633 * We don't check the error code - if userspace has
1634 * not set up a proper pointer then tough luck.
1635 */
1636 put_user(0, tsk->clear_child_tid);
1637 do_futex(tsk->clear_child_tid, FUTEX_WAKE,
1638 1, NULL, NULL, 0, 0);
1639 }
1640 tsk->clear_child_tid = NULL;
1641 }
1642
1643 /*
1644 * All done, finally we can wake up parent and return this mm to him.
1645 * Also kthread_stop() uses this completion for synchronization.
1646 */
1647 if (tsk->vfork_done)
1648 complete_vfork_done(tsk);
1649 }
1650
1651 void exit_mm_release(struct task_struct *tsk, struct mm_struct *mm)
1652 {
1653 futex_exit_release(tsk);
1654 mm_release(tsk, mm);
1655 }
1656
1657 void exec_mm_release(struct task_struct *tsk, struct mm_struct *mm)
1658 {
1659 futex_exec_release(tsk);
1660 mm_release(tsk, mm);
1661 }
1662
1663 /**
1664 * dup_mm() - duplicates an existing mm structure
1665 * @tsk: the task_struct with which the new mm will be associated.
1666 * @oldmm: the mm to duplicate.
1667 *
1668 * Allocates a new mm structure and duplicates the provided @oldmm structure
1669 * content into it.
1670 *
1671 * Return: the duplicated mm or NULL on failure.
1672 */
1673 static struct mm_struct *dup_mm(struct task_struct *tsk,
1674 struct mm_struct *oldmm)
1675 {
1676 struct mm_struct *mm;
1677 int err;
1678
1679 mm = allocate_mm();
1680 if (!mm)
1681 goto fail_nomem;
1682
1683 memcpy(mm, oldmm, sizeof(*mm));
1684
1685 if (!mm_init(mm, tsk, mm->user_ns))
1686 goto fail_nomem;
1687
1688 err = dup_mmap(mm, oldmm);
1689 if (err)
1690 goto free_pt;
1691
1692 mm->hiwater_rss = get_mm_rss(mm);
1693 mm->hiwater_vm = mm->total_vm;
1694
1695 if (mm->binfmt && !try_module_get(mm->binfmt->module))
1696 goto free_pt;
1697
1698 return mm;
1699
1700 free_pt:
1701 /* don't put binfmt in mmput, we haven't got module yet */
1702 mm->binfmt = NULL;
1703 mm_init_owner(mm, NULL);
1704 mmput(mm);
1705
1706 fail_nomem:
1707 return NULL;
1708 }
1709
1710 static int copy_mm(unsigned long clone_flags, struct task_struct *tsk)
1711 {
1712 struct mm_struct *mm, *oldmm;
1713
1714 tsk->min_flt = tsk->maj_flt = 0;
1715 tsk->nvcsw = tsk->nivcsw = 0;
1716 #ifdef CONFIG_DETECT_HUNG_TASK
1717 tsk->last_switch_count = tsk->nvcsw + tsk->nivcsw;
1718 tsk->last_switch_time = 0;
1719 #endif
1720
1721 tsk->mm = NULL;
1722 tsk->active_mm = NULL;
1723
1724 /*
1725 * Are we cloning a kernel thread?
1726 *
1727 * We need to steal a active VM for that..
1728 */
1729 oldmm = current->mm;
1730 if (!oldmm)
1731 return 0;
1732
1733 if (clone_flags & CLONE_VM) {
1734 mmget(oldmm);
1735 mm = oldmm;
1736 } else {
1737 mm = dup_mm(tsk, current->mm);
1738 if (!mm)
1739 return -ENOMEM;
1740 }
1741
1742 tsk->mm = mm;
1743 tsk->active_mm = mm;
1744 sched_mm_cid_fork(tsk);
1745 return 0;
1746 }
1747
1748 static int copy_fs(unsigned long clone_flags, struct task_struct *tsk)
1749 {
1750 struct fs_struct *fs = current->fs;
1751 if (clone_flags & CLONE_FS) {
1752 /* tsk->fs is already what we want */
1753 spin_lock(&fs->lock);
1754 /* "users" and "in_exec" locked for check_unsafe_exec() */
1755 if (fs->in_exec) {
1756 spin_unlock(&fs->lock);
1757 return -EAGAIN;
1758 }
1759 fs->users++;
1760 spin_unlock(&fs->lock);
1761 return 0;
1762 }
1763 tsk->fs = copy_fs_struct(fs);
1764 if (!tsk->fs)
1765 return -ENOMEM;
1766 return 0;
1767 }
1768
1769 static int copy_files(unsigned long clone_flags, struct task_struct *tsk,
1770 int no_files)
1771 {
1772 struct files_struct *oldf, *newf;
1773 int error = 0;
1774
1775 /*
1776 * A background process may not have any files ...
1777 */
1778 oldf = current->files;
1779 if (!oldf)
1780 goto out;
1781
1782 if (no_files) {
1783 tsk->files = NULL;
1784 goto out;
1785 }
1786
1787 if (clone_flags & CLONE_FILES) {
1788 atomic_inc(&oldf->count);
1789 goto out;
1790 }
1791
1792 newf = dup_fd(oldf, NR_OPEN_MAX, &error);
1793 if (!newf)
1794 goto out;
1795
1796 tsk->files = newf;
1797 error = 0;
1798 out:
1799 return error;
1800 }
1801
1802 static int copy_sighand(unsigned long clone_flags, struct task_struct *tsk)
1803 {
1804 struct sighand_struct *sig;
1805
1806 if (clone_flags & CLONE_SIGHAND) {
1807 refcount_inc(&current->sighand->count);
1808 return 0;
1809 }
1810 sig = kmem_cache_alloc(sighand_cachep, GFP_KERNEL);
1811 RCU_INIT_POINTER(tsk->sighand, sig);
1812 if (!sig)
1813 return -ENOMEM;
1814
1815 refcount_set(&sig->count, 1);
1816 spin_lock_irq(&current->sighand->siglock);
1817 memcpy(sig->action, current->sighand->action, sizeof(sig->action));
1818 spin_unlock_irq(&current->sighand->siglock);
1819
1820 /* Reset all signal handler not set to SIG_IGN to SIG_DFL. */
1821 if (clone_flags & CLONE_CLEAR_SIGHAND)
1822 flush_signal_handlers(tsk, 0);
1823
1824 return 0;
1825 }
1826
1827 void __cleanup_sighand(struct sighand_struct *sighand)
1828 {
1829 if (refcount_dec_and_test(&sighand->count)) {
1830 signalfd_cleanup(sighand);
1831 /*
1832 * sighand_cachep is SLAB_TYPESAFE_BY_RCU so we can free it
1833 * without an RCU grace period, see __lock_task_sighand().
1834 */
1835 kmem_cache_free(sighand_cachep, sighand);
1836 }
1837 }
1838
1839 /*
1840 * Initialize POSIX timer handling for a thread group.
1841 */
1842 static void posix_cpu_timers_init_group(struct signal_struct *sig)
1843 {
1844 struct posix_cputimers *pct = &sig->posix_cputimers;
1845 unsigned long cpu_limit;
1846
1847 cpu_limit = READ_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur);
1848 posix_cputimers_group_init(pct, cpu_limit);
1849 }
1850
1851 void kpatch_foo(void);
1852 void kpatch_foo(void)
1853 {
1854 if (!jiffies)
1855 printk("kpatch copy signal\n");
1856 }
1857
1858 static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
1859 {
1860 struct signal_struct *sig;
1861
1862 kpatch_foo();
1863
1864 if (clone_flags & CLONE_THREAD)
1865 return 0;
1866
1867 sig = kmem_cache_zalloc(signal_cachep, GFP_KERNEL);
1868 tsk->signal = sig;
1869 if (!sig)
1870 return -ENOMEM;
1871
1872 sig->nr_threads = 1;
1873 sig->quick_threads = 1;
1874 atomic_set(&sig->live, 1);
1875 refcount_set(&sig->sigcnt, 1);
1876
1877 /* list_add(thread_node, thread_head) without INIT_LIST_HEAD() */
1878 sig->thread_head = (struct list_head)LIST_HEAD_INIT(tsk->thread_node);
1879 tsk->thread_node = (struct list_head)LIST_HEAD_INIT(sig->thread_head);
1880
1881 init_waitqueue_head(&sig->wait_chldexit);
1882 sig->curr_target = tsk;
1883 init_sigpending(&sig->shared_pending);
1884 INIT_HLIST_HEAD(&sig->multiprocess);
1885 seqlock_init(&sig->stats_lock);
1886 prev_cputime_init(&sig->prev_cputime);
1887
1888 #ifdef CONFIG_POSIX_TIMERS
1889 INIT_LIST_HEAD(&sig->posix_timers);
1890 hrtimer_init(&sig->real_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
1891 sig->real_timer.function = it_real_fn;
1892 #endif
1893
1894 task_lock(current->group_leader);
1895 memcpy(sig->rlim, current->signal->rlim, sizeof sig->rlim);
1896 task_unlock(current->group_leader);
1897
1898 posix_cpu_timers_init_group(sig);
1899
1900 tty_audit_fork(sig);
1901 sched_autogroup_fork(sig);
1902
1903 sig->oom_score_adj = current->signal->oom_score_adj;
1904 sig->oom_score_adj_min = current->signal->oom_score_adj_min;
1905
1906 mutex_init(&sig->cred_guard_mutex);
1907 init_rwsem(&sig->exec_update_lock);
1908
1909 return 0;
1910 }
1911
1912 static void copy_seccomp(struct task_struct *p)
1913 {
1914 #ifdef CONFIG_SECCOMP
1915 /*
1916 * Must be called with sighand->lock held, which is common to
1917 * all threads in the group. Holding cred_guard_mutex is not
1918 * needed because this new task is not yet running and cannot
1919 * be racing exec.
1920 */
1921 assert_spin_locked(&current->sighand->siglock);
1922
1923 /* Ref-count the new filter user, and assign it. */
1924 get_seccomp_filter(current);
1925 p->seccomp = current->seccomp;
1926
1927 /*
1928 * Explicitly enable no_new_privs here in case it got set
1929 * between the task_struct being duplicated and holding the
1930 * sighand lock. The seccomp state and nnp must be in sync.
1931 */
1932 if (task_no_new_privs(current))
1933 task_set_no_new_privs(p);
1934
1935 /*
1936 * If the parent gained a seccomp mode after copying thread
1937 * flags and between before we held the sighand lock, we have
1938 * to manually enable the seccomp thread flag here.
1939 */
1940 if (p->seccomp.mode != SECCOMP_MODE_DISABLED)
1941 set_task_syscall_work(p, SECCOMP);
1942 #endif
1943 }
1944
1945 SYSCALL_DEFINE1(set_tid_address, int __user *, tidptr)
1946 {
1947 current->clear_child_tid = tidptr;
1948
1949 return task_pid_vnr(current);
1950 }
1951
1952 static void rt_mutex_init_task(struct task_struct *p)
1953 {
1954 raw_spin_lock_init(&p->pi_lock);
1955 #ifdef CONFIG_RT_MUTEXES
1956 p->pi_waiters = RB_ROOT_CACHED;
1957 p->pi_top_task = NULL;
1958 p->pi_blocked_on = NULL;
1959 #endif
1960 }
1961
1962 static inline void init_task_pid_links(struct task_struct *task)
1963 {
1964 enum pid_type type;
1965
1966 for (type = PIDTYPE_PID; type < PIDTYPE_MAX; ++type)
1967 INIT_HLIST_NODE(&task->pid_links[type]);
1968 }
1969
1970 static inline void
1971 init_task_pid(struct task_struct *task, enum pid_type type, struct pid *pid)
1972 {
1973 if (type == PIDTYPE_PID)
1974 task->thread_pid = pid;
1975 else
1976 task->signal->pids[type] = pid;
1977 }
1978
1979 static inline void rcu_copy_process(struct task_struct *p)
1980 {
1981 #ifdef CONFIG_PREEMPT_RCU
1982 p->rcu_read_lock_nesting = 0;
1983 p->rcu_read_unlock_special.s = 0;
1984 p->rcu_blocked_node = NULL;
1985 INIT_LIST_HEAD(&p->rcu_node_entry);
1986 #endif /* #ifdef CONFIG_PREEMPT_RCU */
1987 #ifdef CONFIG_TASKS_RCU
1988 p->rcu_tasks_holdout = false;
1989 INIT_LIST_HEAD(&p->rcu_tasks_holdout_list);
1990 p->rcu_tasks_idle_cpu = -1;
1991 INIT_LIST_HEAD(&p->rcu_tasks_exit_list);
1992 #endif /* #ifdef CONFIG_TASKS_RCU */
1993 #ifdef CONFIG_TASKS_TRACE_RCU
1994 p->trc_reader_nesting = 0;
1995 p->trc_reader_special.s = 0;
1996 INIT_LIST_HEAD(&p->trc_holdout_list);
1997 INIT_LIST_HEAD(&p->trc_blkd_node);
1998 #endif /* #ifdef CONFIG_TASKS_TRACE_RCU */
1999 }
2000
2001 /**
2002 * __pidfd_prepare - allocate a new pidfd_file and reserve a pidfd
2003 * @pid: the struct pid for which to create a pidfd
2004 * @flags: flags of the new @pidfd
2005 * @ret: Where to return the file for the pidfd.
2006 *
2007 * Allocate a new file that stashes @pid and reserve a new pidfd number in the
2008 * caller's file descriptor table. The pidfd is reserved but not installed yet.
2009 *
2010 * The helper doesn't perform checks on @pid which makes it useful for pidfds
2011 * created via CLONE_PIDFD where @pid has no task attached when the pidfd and
2012 * pidfd file are prepared.
2013 *
2014 * If this function returns successfully the caller is responsible to either
2015 * call fd_install() passing the returned pidfd and pidfd file as arguments in
2016 * order to install the pidfd into its file descriptor table or they must use
2017 * put_unused_fd() and fput() on the returned pidfd and pidfd file
2018 * respectively.
2019 *
2020 * This function is useful when a pidfd must already be reserved but there
2021 * might still be points of failure afterwards and the caller wants to ensure
2022 * that no pidfd is leaked into its file descriptor table.
2023 *
2024 * Return: On success, a reserved pidfd is returned from the function and a new
2025 * pidfd file is returned in the last argument to the function. On
2026 * error, a negative error code is returned from the function and the
2027 * last argument remains unchanged.
2028 */
2029 static int __pidfd_prepare(struct pid *pid, unsigned int flags, struct file **ret)
2030 {
2031 int pidfd;
2032 struct file *pidfd_file;
2033
2034 pidfd = get_unused_fd_flags(O_CLOEXEC);
2035 if (pidfd < 0)
2036 return pidfd;
2037
2038 pidfd_file = pidfs_alloc_file(pid, flags | O_RDWR);
2039 if (IS_ERR(pidfd_file)) {
2040 put_unused_fd(pidfd);
2041 return PTR_ERR(pidfd_file);
2042 }
2043 /*
2044 * anon_inode_getfile() ignores everything outside of the
2045 * O_ACCMODE | O_NONBLOCK mask, set PIDFD_THREAD manually.
2046 */
2047 pidfd_file->f_flags |= (flags & PIDFD_THREAD);
2048 *ret = pidfd_file;
2049 return pidfd;
2050 }
2051
2052 /**
2053 * pidfd_prepare - allocate a new pidfd_file and reserve a pidfd
2054 * @pid: the struct pid for which to create a pidfd
2055 * @flags: flags of the new @pidfd
2056 * @ret: Where to return the pidfd.
2057 *
2058 * Allocate a new file that stashes @pid and reserve a new pidfd number in the
2059 * caller's file descriptor table. The pidfd is reserved but not installed yet.
2060 *
2061 * The helper verifies that @pid is still in use, without PIDFD_THREAD the
2062 * task identified by @pid must be a thread-group leader.
2063 *
2064 * If this function returns successfully the caller is responsible to either
2065 * call fd_install() passing the returned pidfd and pidfd file as arguments in
2066 * order to install the pidfd into its file descriptor table or they must use
2067 * put_unused_fd() and fput() on the returned pidfd and pidfd file
2068 * respectively.
2069 *
2070 * This function is useful when a pidfd must already be reserved but there
2071 * might still be points of failure afterwards and the caller wants to ensure
2072 * that no pidfd is leaked into its file descriptor table.
2073 *
2074 * Return: On success, a reserved pidfd is returned from the function and a new
2075 * pidfd file is returned in the last argument to the function. On
2076 * error, a negative error code is returned from the function and the
2077 * last argument remains unchanged.
2078 */
2079 int pidfd_prepare(struct pid *pid, unsigned int flags, struct file **ret)
2080 {
2081 bool thread = flags & PIDFD_THREAD;
2082
2083 if (!pid || !pid_has_task(pid, thread ? PIDTYPE_PID : PIDTYPE_TGID))
2084 return -EINVAL;
2085
2086 return __pidfd_prepare(pid, flags, ret);
2087 }
2088
2089 static void __delayed_free_task(struct rcu_head *rhp)
2090 {
2091 struct task_struct *tsk = container_of(rhp, struct task_struct, rcu);
2092
2093 free_task(tsk);
2094 }
2095
2096 static __always_inline void delayed_free_task(struct task_struct *tsk)
2097 {
2098 if (IS_ENABLED(CONFIG_MEMCG))
2099 call_rcu(&tsk->rcu, __delayed_free_task);
2100 else
2101 free_task(tsk);
2102 }
2103
2104 static void copy_oom_score_adj(u64 clone_flags, struct task_struct *tsk)
2105 {
2106 /* Skip if kernel thread */
2107 if (!tsk->mm)
2108 return;
2109
2110 /* Skip if spawning a thread or using vfork */
2111 if ((clone_flags & (CLONE_VM | CLONE_THREAD | CLONE_VFORK)) != CLONE_VM)
2112 return;
2113
2114 /* We need to synchronize with __set_oom_adj */
2115 mutex_lock(&oom_adj_mutex);
2116 set_bit(MMF_MULTIPROCESS, &tsk->mm->flags);
2117 /* Update the values in case they were changed after copy_signal */
2118 tsk->signal->oom_score_adj = current->signal->oom_score_adj;
2119 tsk->signal->oom_score_adj_min = current->signal->oom_score_adj_min;
2120 mutex_unlock(&oom_adj_mutex);
2121 }
2122
2123 #ifdef CONFIG_RV
2124 static void rv_task_fork(struct task_struct *p)
2125 {
2126 int i;
2127
2128 for (i = 0; i < RV_PER_TASK_MONITORS; i++)
2129 p->rv[i].da_mon.monitoring = false;
2130 }
2131 #else
2132 #define rv_task_fork(p) do {} while (0)
2133 #endif
2134
2135 /*
2136 * This creates a new process as a copy of the old one,
2137 * but does not actually start it yet.
2138 *
2139 * It copies the registers, and all the appropriate
2140 * parts of the process environment (as per the clone
2141 * flags). The actual kick-off is left to the caller.
2142 */
2143 __latent_entropy struct task_struct *copy_process(
2144 struct pid *pid,
2145 int trace,
2146 int node,
2147 struct kernel_clone_args *args)
2148 {
2149 int pidfd = -1, retval;
2150 struct task_struct *p;
2151 struct multiprocess_signals delayed;
2152 struct file *pidfile = NULL;
2153 const u64 clone_flags = args->flags;
2154 struct nsproxy *nsp = current->nsproxy;
2155
2156 /*
2157 * Don't allow sharing the root directory with processes in a different
2158 * namespace
2159 */
2160 if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS))
2161 return ERR_PTR(-EINVAL);
2162
2163 if ((clone_flags & (CLONE_NEWUSER|CLONE_FS)) == (CLONE_NEWUSER|CLONE_FS))
2164 return ERR_PTR(-EINVAL);
2165
2166 /*
2167 * Thread groups must share signals as well, and detached threads
2168 * can only be started up within the thread group.
2169 */
2170 if ((clone_flags & CLONE_THREAD) && !(clone_flags & CLONE_SIGHAND))
2171 return ERR_PTR(-EINVAL);
2172
2173 /*
2174 * Shared signal handlers imply shared VM. By way of the above,
2175 * thread groups also imply shared VM. Blocking this case allows
2176 * for various simplifications in other code.
2177 */
2178 if ((clone_flags & CLONE_SIGHAND) && !(clone_flags & CLONE_VM))
2179 return ERR_PTR(-EINVAL);
2180
2181 /*
2182 * Siblings of global init remain as zombies on exit since they are
2183 * not reaped by their parent (swapper). To solve this and to avoid
2184 * multi-rooted process trees, prevent global and container-inits
2185 * from creating siblings.
2186 */
2187 if ((clone_flags & CLONE_PARENT) &&
2188 current->signal->flags & SIGNAL_UNKILLABLE)
2189 return ERR_PTR(-EINVAL);
2190
2191 /*
2192 * If the new process will be in a different pid or user namespace
2193 * do not allow it to share a thread group with the forking task.
2194 */
2195 if (clone_flags & CLONE_THREAD) {
2196 if ((clone_flags & (CLONE_NEWUSER | CLONE_NEWPID)) ||
2197 (task_active_pid_ns(current) != nsp->pid_ns_for_children))
2198 return ERR_PTR(-EINVAL);
2199 }
2200
2201 if (clone_flags & CLONE_PIDFD) {
2202 /*
2203 * - CLONE_DETACHED is blocked so that we can potentially
2204 * reuse it later for CLONE_PIDFD.
2205 */
2206 if (clone_flags & CLONE_DETACHED)
2207 return ERR_PTR(-EINVAL);
2208 }
2209
2210 /*
2211 * Force any signals received before this point to be delivered
2212 * before the fork happens. Collect up signals sent to multiple
2213 * processes that happen during the fork and delay them so that
2214 * they appear to happen after the fork.
2215 */
2216 sigemptyset(&delayed.signal);
2217 INIT_HLIST_NODE(&delayed.node);
2218
2219 spin_lock_irq(&current->sighand->siglock);
2220 if (!(clone_flags & CLONE_THREAD))
2221 hlist_add_head(&delayed.node, &current->signal->multiprocess);
2222 recalc_sigpending();
2223 spin_unlock_irq(&current->sighand->siglock);
2224 retval = -ERESTARTNOINTR;
2225 if (task_sigpending(current))
2226 goto fork_out;
2227
2228 retval = -ENOMEM;
2229 p = dup_task_struct(current, node);
2230 if (!p)
2231 goto fork_out;
2232 p->flags &= ~PF_KTHREAD;
2233 if (args->kthread)
2234 p->flags |= PF_KTHREAD;
2235 if (args->user_worker) {
2236 /*
2237 * Mark us a user worker, and block any signal that isn't
2238 * fatal or STOP
2239 */
2240 p->flags |= PF_USER_WORKER;
2241 siginitsetinv(&p->blocked, sigmask(SIGKILL)|sigmask(SIGSTOP));
2242 }
2243 if (args->io_thread)
2244 p->flags |= PF_IO_WORKER;
2245
2246 if (args->name)
2247 strscpy_pad(p->comm, args->name, sizeof(p->comm));
2248
2249 p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? args->child_tid : NULL;
2250 /*
2251 * Clear TID on mm_release()?
2252 */
2253 p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? args->child_tid : NULL;
2254
2255 ftrace_graph_init_task(p);
2256
2257 rt_mutex_init_task(p);
2258
2259 lockdep_assert_irqs_enabled();
2260 #ifdef CONFIG_PROVE_LOCKING
2261 DEBUG_LOCKS_WARN_ON(!p->softirqs_enabled);
2262 #endif
2263 retval = copy_creds(p, clone_flags);
2264 if (retval < 0)
2265 goto bad_fork_free;
2266
2267 retval = -EAGAIN;
2268 if (is_rlimit_overlimit(task_ucounts(p), UCOUNT_RLIMIT_NPROC, rlimit(RLIMIT_NPROC))) {
2269 if (p->real_cred->user != INIT_USER &&
2270 !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN))
2271 goto bad_fork_cleanup_count;
2272 }
2273 current->flags &= ~PF_NPROC_EXCEEDED;
2274
2275 /*
2276 * If multiple threads are within copy_process(), then this check
2277 * triggers too late. This doesn't hurt, the check is only there
2278 * to stop root fork bombs.
2279 */
2280 retval = -EAGAIN;
2281 if (data_race(nr_threads >= max_threads))
2282 goto bad_fork_cleanup_count;
2283
2284 delayacct_tsk_init(p); /* Must remain after dup_task_struct() */
2285 p->flags &= ~(PF_SUPERPRIV | PF_WQ_WORKER | PF_IDLE | PF_NO_SETAFFINITY);
2286 p->flags |= PF_FORKNOEXEC;
2287 INIT_LIST_HEAD(&p->children);
2288 INIT_LIST_HEAD(&p->sibling);
2289 rcu_copy_process(p);
2290 p->vfork_done = NULL;
2291 spin_lock_init(&p->alloc_lock);
2292
2293 init_sigpending(&p->pending);
2294
2295 p->utime = p->stime = p->gtime = 0;
2296 #ifdef CONFIG_ARCH_HAS_SCALED_CPUTIME
2297 p->utimescaled = p->stimescaled = 0;
2298 #endif
2299 prev_cputime_init(&p->prev_cputime);
2300
2301 #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
2302 seqcount_init(&p->vtime.seqcount);
2303 p->vtime.starttime = 0;
2304 p->vtime.state = VTIME_INACTIVE;
2305 #endif
2306
2307 #ifdef CONFIG_IO_URING
2308 p->io_uring = NULL;
2309 #endif
2310
2311 p->default_timer_slack_ns = current->timer_slack_ns;
2312
2313 #ifdef CONFIG_PSI
2314 p->psi_flags = 0;
2315 #endif
2316
2317 task_io_accounting_init(&p->ioac);
2318 acct_clear_integrals(p);
2319
2320 posix_cputimers_init(&p->posix_cputimers);
2321
2322 p->io_context = NULL;
2323 audit_set_context(p, NULL);
2324 cgroup_fork(p);
2325 if (args->kthread) {
2326 if (!set_kthread_struct(p))
2327 goto bad_fork_cleanup_delayacct;
2328 }
2329 #ifdef CONFIG_NUMA
2330 p->mempolicy = mpol_dup(p->mempolicy);
2331 if (IS_ERR(p->mempolicy)) {
2332 retval = PTR_ERR(p->mempolicy);
2333 p->mempolicy = NULL;
2334 goto bad_fork_cleanup_delayacct;
2335 }
2336 #endif
2337 #ifdef CONFIG_CPUSETS
2338 p->cpuset_mem_spread_rotor = NUMA_NO_NODE;
2339 p->cpuset_slab_spread_rotor = NUMA_NO_NODE;
2340 seqcount_spinlock_init(&p->mems_allowed_seq, &p->alloc_lock);
2341 #endif
2342 #ifdef CONFIG_TRACE_IRQFLAGS
2343 memset(&p->irqtrace, 0, sizeof(p->irqtrace));
2344 p->irqtrace.hardirq_disable_ip = _THIS_IP_;
2345 p->irqtrace.softirq_enable_ip = _THIS_IP_;
2346 p->softirqs_enabled = 1;
2347 p->softirq_context = 0;
2348 #endif
2349
2350 p->pagefault_disabled = 0;
2351
2352 #ifdef CONFIG_LOCKDEP
2353 lockdep_init_task(p);
2354 #endif
2355
2356 #ifdef CONFIG_DEBUG_MUTEXES
2357 p->blocked_on = NULL; /* not blocked yet */
2358 #endif
2359 #ifdef CONFIG_BCACHE
2360 p->sequential_io = 0;
2361 p->sequential_io_avg = 0;
2362 #endif
2363 #ifdef CONFIG_BPF_SYSCALL
2364 RCU_INIT_POINTER(p->bpf_storage, NULL);
2365 p->bpf_ctx = NULL;
2366 #endif
2367
2368 /* Perform scheduler related setup. Assign this task to a CPU. */
2369 retval = sched_fork(clone_flags, p);
2370 if (retval)
2371 goto bad_fork_cleanup_policy;
2372
2373 retval = perf_event_init_task(p, clone_flags);
2374 if (retval)
2375 goto bad_fork_cleanup_policy;
2376 retval = audit_alloc(p);
2377 if (retval)
2378 goto bad_fork_cleanup_perf;
2379 /* copy all the process information */
2380 shm_init_task(p);
2381 retval = security_task_alloc(p, clone_flags);
2382 if (retval)
2383 goto bad_fork_cleanup_audit;
2384 retval = copy_semundo(clone_flags, p);
2385 if (retval)
2386 goto bad_fork_cleanup_security;
2387 retval = copy_files(clone_flags, p, args->no_files);
2388 if (retval)
2389 goto bad_fork_cleanup_semundo;
2390 retval = copy_fs(clone_flags, p);
2391 if (retval)
2392 goto bad_fork_cleanup_files;
2393 retval = copy_sighand(clone_flags, p);
2394 if (retval)
2395 goto bad_fork_cleanup_fs;
2396 retval = copy_signal(clone_flags, p);
2397 if (retval)
2398 goto bad_fork_cleanup_sighand;
2399 retval = copy_mm(clone_flags, p);
2400 if (retval)
2401 goto bad_fork_cleanup_signal;
2402 retval = copy_namespaces(clone_flags, p);
2403 if (retval)
2404 goto bad_fork_cleanup_mm;
2405 retval = copy_io(clone_flags, p);
2406 if (retval)
2407 goto bad_fork_cleanup_namespaces;
2408 retval = copy_thread(p, args);
2409 if (retval)
2410 goto bad_fork_cleanup_io;
2411
2412 stackleak_task_init(p);
2413
2414 if (pid != &init_struct_pid) {
2415 pid = alloc_pid(p->nsproxy->pid_ns_for_children, args->set_tid,
2416 args->set_tid_size);
2417 if (IS_ERR(pid)) {
2418 retval = PTR_ERR(pid);
2419 goto bad_fork_cleanup_thread;
2420 }
2421 }
2422
2423 /*
2424 * This has to happen after we've potentially unshared the file
2425 * descriptor table (so that the pidfd doesn't leak into the child
2426 * if the fd table isn't shared).
2427 */
2428 if (clone_flags & CLONE_PIDFD) {
2429 int flags = (clone_flags & CLONE_THREAD) ? PIDFD_THREAD : 0;
2430
2431 /* Note that no task has been attached to @pid yet. */
2432 retval = __pidfd_prepare(pid, flags, &pidfile);
2433 if (retval < 0)
2434 goto bad_fork_free_pid;
2435 pidfd = retval;
2436
2437 retval = put_user(pidfd, args->pidfd);
2438 if (retval)
2439 goto bad_fork_put_pidfd;
2440 }
2441
2442 #ifdef CONFIG_BLOCK
2443 p->plug = NULL;
2444 #endif
2445 futex_init_task(p);
2446
2447 /*
2448 * sigaltstack should be cleared when sharing the same VM
2449 */
2450 if ((clone_flags & (CLONE_VM|CLONE_VFORK)) == CLONE_VM)
2451 sas_ss_reset(p);
2452
2453 /*
2454 * Syscall tracing and stepping should be turned off in the
2455 * child regardless of CLONE_PTRACE.
2456 */
2457 user_disable_single_step(p);
2458 clear_task_syscall_work(p, SYSCALL_TRACE);
2459 #if defined(CONFIG_GENERIC_ENTRY) || defined(TIF_SYSCALL_EMU)
2460 clear_task_syscall_work(p, SYSCALL_EMU);
2461 #endif
2462 clear_tsk_latency_tracing(p);
2463
2464 /* ok, now we should be set up.. */
2465 p->pid = pid_nr(pid);
2466 if (clone_flags & CLONE_THREAD) {
2467 p->group_leader = current->group_leader;
2468 p->tgid = current->tgid;
2469 } else {
2470 p->group_leader = p;
2471 p->tgid = p->pid;
2472 }
2473
2474 p->nr_dirtied = 0;
2475 p->nr_dirtied_pause = 128 >> (PAGE_SHIFT - 10);
2476 p->dirty_paused_when = 0;
2477
2478 p->pdeath_signal = 0;
2479 p->task_works = NULL;
2480 clear_posix_cputimers_work(p);
2481
2482 #ifdef CONFIG_KRETPROBES
2483 p->kretprobe_instances.first = NULL;
2484 #endif
2485 #ifdef CONFIG_RETHOOK
2486 p->rethooks.first = NULL;
2487 #endif
2488
2489 /*
2490 * Ensure that the cgroup subsystem policies allow the new process to be
2491 * forked. It should be noted that the new process's css_set can be changed
2492 * between here and cgroup_post_fork() if an organisation operation is in
2493 * progress.
2494 */
2495 retval = cgroup_can_fork(p, args);
2496 if (retval)
2497 goto bad_fork_put_pidfd;
2498
2499 /*
2500 * Now that the cgroups are pinned, re-clone the parent cgroup and put
2501 * the new task on the correct runqueue. All this *before* the task
2502 * becomes visible.
2503 *
2504 * This isn't part of ->can_fork() because while the re-cloning is
2505 * cgroup specific, it unconditionally needs to place the task on a
2506 * runqueue.
2507 */
2508 sched_cgroup_fork(p, args);
2509
2510 /*
2511 * From this point on we must avoid any synchronous user-space
2512 * communication until we take the tasklist-lock. In particular, we do
2513 * not want user-space to be able to predict the process start-time by
2514 * stalling fork(2) after we recorded the start_time but before it is
2515 * visible to the system.
2516 */
2517
2518 p->start_time = ktime_get_ns();
2519 p->start_boottime = ktime_get_boottime_ns();
2520
2521 /*
2522 * Make it visible to the rest of the system, but dont wake it up yet.
2523 * Need tasklist lock for parent etc handling!
2524 */
2525 write_lock_irq(&tasklist_lock);
2526
2527 /* CLONE_PARENT re-uses the old parent */
2528 if (clone_flags & (CLONE_PARENT|CLONE_THREAD)) {
2529 p->real_parent = current->real_parent;
2530 p->parent_exec_id = current->parent_exec_id;
2531 if (clone_flags & CLONE_THREAD)
2532 p->exit_signal = -1;
2533 else
2534 p->exit_signal = current->group_leader->exit_signal;
2535 } else {
2536 p->real_parent = current;
2537 p->parent_exec_id = current->self_exec_id;
2538 p->exit_signal = args->exit_signal;
2539 }
2540
2541 klp_copy_process(p);
2542
2543 sched_core_fork(p);
2544
2545 spin_lock(&current->sighand->siglock);
2546
2547 rv_task_fork(p);
2548
2549 rseq_fork(p, clone_flags);
2550
2551 /* Don't start children in a dying pid namespace */
2552 if (unlikely(!(ns_of_pid(pid)->pid_allocated & PIDNS_ADDING))) {
2553 retval = -ENOMEM;
2554 goto bad_fork_cancel_cgroup;
2555 }
2556
2557 /* Let kill terminate clone/fork in the middle */
2558 if (fatal_signal_pending(current)) {
2559 retval = -EINTR;
2560 goto bad_fork_cancel_cgroup;
2561 }
2562
2563 /* No more failure paths after this point. */
2564
2565 /*
2566 * Copy seccomp details explicitly here, in case they were changed
2567 * before holding sighand lock.
2568 */
2569 copy_seccomp(p);
2570
2571 init_task_pid_links(p);
2572 if (likely(p->pid)) {
2573 ptrace_init_task(p, (clone_flags & CLONE_PTRACE) || trace);
2574
2575 init_task_pid(p, PIDTYPE_PID, pid);
2576 if (thread_group_leader(p)) {
2577 init_task_pid(p, PIDTYPE_TGID, pid);
2578 init_task_pid(p, PIDTYPE_PGID, task_pgrp(current));
2579 init_task_pid(p, PIDTYPE_SID, task_session(current));
2580
2581 if (is_child_reaper(pid)) {
2582 ns_of_pid(pid)->child_reaper = p;
2583 p->signal->flags |= SIGNAL_UNKILLABLE;
2584 }
2585 p->signal->shared_pending.signal = delayed.signal;
2586 p->signal->tty = tty_kref_get(current->signal->tty);
2587 /*
2588 * Inherit has_child_subreaper flag under the same
2589 * tasklist_lock with adding child to the process tree
2590 * for propagate_has_child_subreaper optimization.
2591 */
2592 p->signal->has_child_subreaper = p->real_parent->signal->has_child_subreaper ||
2593 p->real_parent->signal->is_child_subreaper;
2594 list_add_tail(&p->sibling, &p->real_parent->children);
2595 list_add_tail_rcu(&p->tasks, &init_task.tasks);
2596 attach_pid(p, PIDTYPE_TGID);
2597 attach_pid(p, PIDTYPE_PGID);
2598 attach_pid(p, PIDTYPE_SID);
2599 __this_cpu_inc(process_counts);
2600 } else {
2601 current->signal->nr_threads++;
2602 current->signal->quick_threads++;
2603 atomic_inc(&current->signal->live);
2604 refcount_inc(&current->signal->sigcnt);
2605 task_join_group_stop(p);
2606 list_add_tail_rcu(&p->thread_node,
2607 &p->signal->thread_head);
2608 }
2609 attach_pid(p, PIDTYPE_PID);
2610 nr_threads++;
2611 }
2612 total_forks++;
2613 hlist_del_init(&delayed.node);
2614 spin_unlock(&current->sighand->siglock);
2615 syscall_tracepoint_update(p);
2616 write_unlock_irq(&tasklist_lock);
2617
2618 if (pidfile)
2619 fd_install(pidfd, pidfile);
2620
2621 proc_fork_connector(p);
2622 sched_post_fork(p);
2623 cgroup_post_fork(p, args);
2624 perf_event_fork(p);
2625
2626 trace_task_newtask(p, clone_flags);
2627 uprobe_copy_process(p, clone_flags);
2628 user_events_fork(p, clone_flags);
2629
2630 copy_oom_score_adj(clone_flags, p);
2631
2632 return p;
2633
2634 bad_fork_cancel_cgroup:
2635 sched_core_free(p);
2636 spin_unlock(&current->sighand->siglock);
2637 write_unlock_irq(&tasklist_lock);
2638 cgroup_cancel_fork(p, args);
2639 bad_fork_put_pidfd:
2640 if (clone_flags & CLONE_PIDFD) {
2641 fput(pidfile);
2642 put_unused_fd(pidfd);
2643 }
2644 bad_fork_free_pid:
2645 if (pid != &init_struct_pid)
2646 free_pid(pid);
2647 bad_fork_cleanup_thread:
2648 exit_thread(p);
2649 bad_fork_cleanup_io:
2650 if (p->io_context)
2651 exit_io_context(p);
2652 bad_fork_cleanup_namespaces:
2653 exit_task_namespaces(p);
2654 bad_fork_cleanup_mm:
2655 if (p->mm) {
2656 mm_clear_owner(p->mm, p);
2657 mmput(p->mm);
2658 }
2659 bad_fork_cleanup_signal:
2660 if (!(clone_flags & CLONE_THREAD))
2661 free_signal_struct(p->signal);
2662 bad_fork_cleanup_sighand:
2663 __cleanup_sighand(p->sighand);
2664 bad_fork_cleanup_fs:
2665 exit_fs(p); /* blocking */
2666 bad_fork_cleanup_files:
2667 exit_files(p); /* blocking */
2668 bad_fork_cleanup_semundo:
2669 exit_sem(p);
2670 bad_fork_cleanup_security:
2671 security_task_free(p);
2672 bad_fork_cleanup_audit:
2673 audit_free(p);
2674 bad_fork_cleanup_perf:
2675 perf_event_free_task(p);
2676 bad_fork_cleanup_policy:
2677 lockdep_free_task(p);
2678 #ifdef CONFIG_NUMA
2679 mpol_put(p->mempolicy);
2680 #endif
2681 bad_fork_cleanup_delayacct:
2682 delayacct_tsk_free(p);
2683 bad_fork_cleanup_count:
2684 dec_rlimit_ucounts(task_ucounts(p), UCOUNT_RLIMIT_NPROC, 1);
2685 exit_creds(p);
2686 bad_fork_free:
2687 WRITE_ONCE(p->__state, TASK_DEAD);
2688 exit_task_stack_account(p);
2689 put_task_stack(p);
2690 delayed_free_task(p);
2691 fork_out:
2692 spin_lock_irq(&current->sighand->siglock);
2693 hlist_del_init(&delayed.node);
2694 spin_unlock_irq(&current->sighand->siglock);
2695 return ERR_PTR(retval);
2696 }
2697
/*
 * init_idle_pids - point every pid type of @idle at init_struct_pid.
 *
 * For each type from PIDTYPE_PID up to (but not including) PIDTYPE_MAX,
 * initialise the task's pid_links[] node for that type and pass
 * &init_struct_pid to init_task_pid().
 */
2698 static inline void init_idle_pids(struct task_struct *idle)
2699 {
2700 enum pid_type type;
2701
2702 for (type = PIDTYPE_PID; type < PIDTYPE_MAX; ++type) {
2703 INIT_HLIST_NODE(&idle->pid_links[type]); /* not really needed */
2704 init_task_pid(idle, type, &init_struct_pid);
2705 }
2706 }
2707
/*
 * idle_dummy - placeholder thread function.
 *
 * fork_idle() stores its address in the .fn member of the clone arguments.
 * @dummy is ignored and the function always returns 0.
 */
2708 static int idle_dummy(void *dummy)
2709 {
2710 /* This function is never called */
2711 return 0;
2712 }
2713
/*
 * fork_idle - create a task for @cpu with copy_process() and set it up as idle.
 *
 * @cpu: CPU number; selects the node via cpu_to_node() and is passed on to
 *       init_idle().
 *
 * Returns whatever copy_process() returned: the new task on success, or an
 * error pointer, which is handed back to the caller unchanged.
 */
2714 struct task_struct * __init fork_idle(int cpu)
2715 {
2716 struct task_struct *task;
2717 struct kernel_clone_args args = {
2718 .flags = CLONE_VM,
2719 .fn = &idle_dummy,
2720 .fn_arg = NULL,
2721 .kthread = 1,
2722 .idle = 1,
2723 };
2724
/* &init_struct_pid is supplied as the pid, so copy_process() skips alloc_pid(). */
2725 task = copy_process(&init_struct_pid, 0, cpu_to_node(cpu), &args);
/* Finish the idle-specific setup only when no error pointer came back. */
2726 if (!IS_ERR(task)) {
2727 init_idle_pids(task);
2728 init_idle(task, cpu);
2729 }
2730
2731 return task;
2732 }
2733
2734 /*
2735 * This is like kernel_clone(), but shaved down and tailored to just
2736 * creating io_uring workers. It returns a created task, or an error pointer.
2737 * The returned task is inactive, and the caller must fire it up through
2738 * wake_up_new_task(p). All signals are blocked in the created task.
2739 */
/*
 * @fn:   thread function, stored in args.fn.
 * @arg:  argument for @fn, stored in args.fn_arg.
 * @node: forwarded as the node argument of copy_process().
 *
 * Returns the result of copy_process() directly (a task or an error pointer).
 */
2740 struct task_struct *create_io_thread(int (*fn)(void *), void *arg, int node)
2741 {
2742 unsigned long flags = CLONE_FS|CLONE_FILES|CLONE_SIGHAND|CLONE_THREAD|
2743 CLONE_IO;
2744 struct kernel_clone_args args = {
/* CSIGNAL bits are masked out of .flags and carried in .exit_signal instead. */
2745 .flags = ((lower_32_bits(flags) | CLONE_VM |
2746 CLONE_UNTRACED) & ~CSIGNAL),
2747 .exit_signal = (lower_32_bits(flags) & CSIGNAL),
2748 .fn = fn,
2749 .fn_arg = arg,
/* copy_process() sets PF_IO_WORKER on the new task when io_thread is set. */
2750 .io_thread = 1,
2751 .user_worker = 1,
2752 };
2753
/* No pre-allocated pid (NULL) and no ptrace event (0) are passed. */
2754 return copy_process(NULL, 0, node, &args);
2755 }
2756
2757 /*
2758 * Ok, this is the main fork-routine.
2759 *
2760 * It copies the process, and if successful kick-starts
2761 * it and waits for it to finish using the VM if required.
2762 *
2763 * args->exit_signal is expected to be checked for sanity by the caller.
2764 */
/*
 * NOTE(review): this mid-file include is the line the includecheck report
 * flags; <linux/livepatch.h> is already included at line 93 of kernel/fork.c.
 * Presumably it was added for klp_shadow_get_or_alloc() below - confirm.
 */
> 2765 #include <linux/livepatch.h>
/*
 * kernel_clone - create a new task from @args, wake it, and return its pid.
 *
 * @args: clone arguments; flags, exit_signal, pidfd and parent_tid are read
 *        here and the whole structure is passed on to copy_process().
 *
 * Returns the new task's pid number as given by pid_vnr(), -EINVAL when
 * CLONE_PIDFD and CLONE_PARENT_SETTID are both set and point at the same
 * location, or the error carried by copy_process()'s error pointer.
 */
2766 pid_t kernel_clone(struct kernel_clone_args *args)
2767 {
2768 u64 clone_flags = args->flags;
2769 struct completion vfork;
2770 struct pid *pid;
2771 struct task_struct *p;
2772 int trace = 0;
2773 pid_t nr;
2774 int *newpid;
/*
 * NOTE(review): function-static counter shared by every caller; it is
 * incremented below with no locking visible in this function.
 */
2775 static int ctr = 0;
2776
2777 /*
2778 * For legacy clone() calls, CLONE_PIDFD uses the parent_tid argument
2779 * to return the pidfd. Hence, CLONE_PIDFD and CLONE_PARENT_SETTID are
2780 * mutually exclusive. With clone3() CLONE_PIDFD has grown a separate
2781 * field in struct clone_args and it still doesn't make sense to have
2782 * them both point at the same memory location. Performing this check
2783 * here has the advantage that we don't need to have a separate helper
2784 * to check for legacy clone().
2785 */
2786 if ((clone_flags & CLONE_PIDFD) &&
2787 (clone_flags & CLONE_PARENT_SETTID) &&
2788 (args->pidfd == args->parent_tid))
2789 return -EINVAL;
2790
2791 /*
2792 * Determine whether and which event to report to ptracer. When
2793 * called from kernel_thread or CLONE_UNTRACED is explicitly
2794 * requested, no event is reported; otherwise, report if the event
2795 * for the type of forking is enabled.
2796 */
2797 if (!(clone_flags & CLONE_UNTRACED)) {
2798 if (clone_flags & CLONE_VFORK)
2799 trace = PTRACE_EVENT_VFORK;
2800 else if (args->exit_signal != SIGCHLD)
2801 trace = PTRACE_EVENT_CLONE;
2802 else
2803 trace = PTRACE_EVENT_FORK;
2804
2805 if (likely(!ptrace_event_enabled(current, trace)))
2806 trace = 0;
2807 }
2808
2809 p = copy_process(NULL, trace, NUMA_NO_NODE, args);
2810 add_latent_entropy();
2811
2812 if (IS_ERR(p))
2813 return PTR_ERR(p);
2814
/*
 * Ask klp_shadow_get_or_alloc() for an int-sized object keyed on the new
 * task with id 0; when a pointer comes back, store the current value of
 * ctr in it and advance the counter. A NULL result is silently skipped.
 */
2815 newpid = klp_shadow_get_or_alloc(p, 0, sizeof(*newpid), GFP_KERNEL,
2816 NULL, NULL);
2817 if (newpid)
2818 *newpid = ctr++;
2819
2820 /*
2821 * Do this prior waking up the new thread - the thread pointer
2822 * might get invalid after that point, if the thread exits quickly.
2823 */
2824 trace_sched_process_fork(current, p);
2825
/* The pid obtained here is released with put_pid() just before returning. */
2826 pid = get_task_pid(p, PIDTYPE_PID);
2827 nr = pid_vnr(pid);
2828
/* The return value of put_user() is not checked here. */
2829 if (clone_flags & CLONE_PARENT_SETTID)
2830 put_user(nr, args->parent_tid);
2831
/*
 * vfork: hook the on-stack completion into the child and call
 * get_task_struct(p) before the child is woken; it is waited on below.
 */
2832 if (clone_flags & CLONE_VFORK) {
2833 p->vfork_done = &vfork;
2834 init_completion(&vfork);
2835 get_task_struct(p);
2836 }
2837
2838 if (IS_ENABLED(CONFIG_LRU_GEN_WALKS_MMU) && !(clone_flags & CLONE_VM)) {
2839 /* lock the task to synchronize with memcg migration */
2840 task_lock(p);
2841 lru_gen_add_mm(p->mm);
2842 task_unlock(p);
2843 }
2844
2845 wake_up_new_task(p);
2846
2847 /* forking complete and child started to run, tell ptracer */
2848 if (unlikely(trace))
2849 ptrace_event_pid(trace, pid);
2850
/* PTRACE_EVENT_VFORK_DONE is reported only when wait_for_vfork_done() returns 0. */
2851 if (clone_flags & CLONE_VFORK) {
2852 if (!wait_for_vfork_done(p, &vfork))
2853 ptrace_event_pid(PTRACE_EVENT_VFORK_DONE, pid);
2854 }
2855
2856 put_pid(pid);
2857 return nr;
2858 }
2859
--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki
next reply other threads:[~2024-05-25 6:40 UTC|newest]
Thread overview: 2+ messages / expand[flat|nested] mbox.gz Atom feed top
2024-05-25 6:40 kernel test robot [this message]
2024-05-27 1:47 ` [jpoimboe:objtool-diff 2/2] kernel/fork.c: linux/livepatch.h is included more than once Liu, Yujie
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=202405251400.UdnwcgiL-lkp@intel.com \
--to=lkp@intel.com \
--cc=oe-kbuild@lists.linux.dev \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.