From: Dan Carpenter <dan.carpenter@oracle.com>
To: Philip.Yang@amd.com
Cc: amd-gfx@lists.freedesktop.org
Subject: [bug report] drm/amdkfd: process exit and retry fault race
Date: Fri, 26 Nov 2021 11:11:51 +0300 [thread overview]
Message-ID: <20211126081151.GA14477@kili> (raw)
Hello Philip Yang,
The patch a0c55ecee100: "drm/amdkfd: process exit and retry fault
race" from Nov 16, 2021, leads to the following Smatch static checker
warning:
drivers/gpu/drm/amd/amdgpu/../amdkfd/kfd_svm.c:2615 svm_range_restore_pages()
warn: missing error code here? 'get_task_mm()' failed. 'r' = '0'
drivers/gpu/drm/amd/amdgpu/../amdkfd/kfd_svm.c
2570 int
2571 svm_range_restore_pages(struct amdgpu_device *adev, unsigned int pasid,
2572 uint64_t addr, bool write_fault)
2573 {
2574 struct mm_struct *mm = NULL;
2575 struct svm_range_list *svms;
2576 struct svm_range *prange;
2577 struct kfd_process *p;
2578 uint64_t timestamp;
2579 int32_t best_loc;
2580 int32_t gpuidx = MAX_GPU_INSTANCE;
2581 bool write_locked = false;
2582 struct vm_area_struct *vma;
2583 int r = 0;
2584
2585 if (!KFD_IS_SVM_API_SUPPORTED(adev->kfd.dev)) {
2586 pr_debug("device does not support SVM\n");
2587 return -EFAULT;
2588 }
2589
2590 p = kfd_lookup_process_by_pasid(pasid);
2591 if (!p) {
2592 pr_debug("kfd process not founded pasid 0x%x\n", pasid);
2593 return 0;
2594 }
2595 if (!p->xnack_enabled) {
2596 pr_debug("XNACK not enabled for pasid 0x%x\n", pasid);
2597 r = -EFAULT;
2598 goto out;
2599 }
2600 svms = &p->svms;
2601
2602 pr_debug("restoring svms 0x%p fault address 0x%llx\n", svms, addr);
2603
2604 if (atomic_read(&svms->drain_pagefaults)) {
2605 pr_debug("draining retry fault, drop fault 0x%llx\n", addr);
Presumably this is a success path.
2606 goto out;
2607 }
2608
2609 /* p->lead_thread is available as kfd_process_wq_release flush the work
2610 * before releasing task ref.
2611 */
2612 mm = get_task_mm(p->lead_thread);
2613 if (!mm) {
2614 pr_debug("svms 0x%p failed to get mm\n", svms);
--> 2615 goto out;
This used to be an error path, but the patch removes the error code and
makes it a success path. Unfortunately, it confuses static checkers
and also human readers. The way to silence the static checker warning
is to set the "r = 0;" explicitly within 4 lines of the goto.
r = 0;
pr_debug("svms 0x%p failed to get mm\n", svms);
goto out;
2616 }
2617
2618 mmap_read_lock(mm);
2619 retry_write_locked:
2620 mutex_lock(&svms->lock);
2621 prange = svm_range_from_addr(svms, addr, NULL);
2622 if (!prange) {
2623 pr_debug("failed to find prange svms 0x%p address [0x%llx]\n",
2624 svms, addr);
2625 if (!write_locked) {
2626 /* Need the write lock to create new range with MMU notifier.
2627 * Also flush pending deferred work to make sure the interval
2628 * tree is up to date before we add a new range
2629 */
2630 mutex_unlock(&svms->lock);
2631 mmap_read_unlock(mm);
2632 mmap_write_lock(mm);
2633 write_locked = true;
2634 goto retry_write_locked;
2635 }
2636 prange = svm_range_create_unregistered_range(adev, p, mm, addr);
2637 if (!prange) {
2638 pr_debug("failed to create unregistered range svms 0x%p address [0x%llx]\n",
2639 svms, addr);
2640 mmap_write_downgrade(mm);
2641 r = -EFAULT;
2642 goto out_unlock_svms;
2643 }
2644 }
2645 if (write_locked)
2646 mmap_write_downgrade(mm);
2647
2648 mutex_lock(&prange->migrate_mutex);
2649
2650 if (svm_range_skip_recover(prange)) {
2651 amdgpu_gmc_filter_faults_remove(adev, addr, pasid);
2652 goto out_unlock_range;
Success path.
2653 }
2654
2655 timestamp = ktime_to_us(ktime_get()) - prange->validate_timestamp;
2656 /* skip duplicate vm fault on different pages of same range */
2657 if (timestamp < AMDGPU_SVM_RANGE_RETRY_FAULT_PENDING) {
2658 pr_debug("svms 0x%p [0x%lx %lx] already restored\n",
2659 svms, prange->start, prange->last);
2660 goto out_unlock_range;
Same.
2661 }
2662
2663 /* __do_munmap removed VMA, return success as we are handling stale
2664 * retry fault.
2665 */
2666 vma = find_vma(mm, addr << PAGE_SHIFT);
2667 if (!vma || (addr << PAGE_SHIFT) < vma->vm_start) {
2668 pr_debug("address 0x%llx VMA is removed\n", addr);
2669 r = 0;
2670 goto out_unlock_range;
This success path is very clear. Good!
2671 }
2672
2673 if (!svm_fault_allowed(vma, write_fault)) {
2674 pr_debug("fault addr 0x%llx no %s permission\n", addr,
2675 write_fault ? "write" : "read");
2676 r = -EPERM;
2677 goto out_unlock_range;
2678 }
2679
2680 best_loc = svm_range_best_restore_location(prange, adev, &gpuidx);
2681 if (best_loc == -1) {
2682 pr_debug("svms %p failed get best restore loc [0x%lx 0x%lx]\n",
2683 svms, prange->start, prange->last);
2684 r = -EACCES;
2685 goto out_unlock_range;
2686 }
2687
2688 pr_debug("svms %p [0x%lx 0x%lx] best restore 0x%x, actual loc 0x%x\n",
2689 svms, prange->start, prange->last, best_loc,
2690 prange->actual_loc);
2691
2692 if (prange->actual_loc != best_loc) {
2693 if (best_loc) {
2694 r = svm_migrate_to_vram(prange, best_loc, mm);
2695 if (r) {
2696 pr_debug("svm_migrate_to_vram failed (%d) at %llx, falling back to system memory\n",
2697 r, addr);
2698 /* Fallback to system memory if migration to
2699 * VRAM failed
2700 */
2701 if (prange->actual_loc)
2702 r = svm_migrate_vram_to_ram(prange, mm);
2703 else
2704 r = 0;
2705 }
2706 } else {
2707 r = svm_migrate_vram_to_ram(prange, mm);
2708 }
2709 if (r) {
2710 pr_debug("failed %d to migrate svms %p [0x%lx 0x%lx]\n",
2711 r, svms, prange->start, prange->last);
2712 goto out_unlock_range;
2713 }
2714 }
2715
2716 r = svm_range_validate_and_map(mm, prange, gpuidx, false, false);
2717 if (r)
2718 pr_debug("failed %d to map svms 0x%p [0x%lx 0x%lx] to gpus\n",
2719 r, svms, prange->start, prange->last);
2720
2721 out_unlock_range:
2722 mutex_unlock(&prange->migrate_mutex);
2723 out_unlock_svms:
2724 mutex_unlock(&svms->lock);
2725 mmap_read_unlock(mm);
2726
2727 svm_range_count_fault(adev, p, gpuidx);
2728
2729 mmput(mm);
2730 out:
2731 kfd_unref_process(p);
2732
2733 if (r == -EAGAIN) {
2734 pr_debug("recover vm fault later\n");
2735 amdgpu_gmc_filter_faults_remove(adev, addr, pasid);
2736 r = 0;
2737 }
2738 return r;
2739 }
regards,
dan carpenter
reply other threads:[~2021-11-26 8:12 UTC|newest]
Thread overview: [no followups] expand[flat|nested] mbox.gz Atom feed
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20211126081151.GA14477@kili \
--to=dan.carpenter@oracle.com \
--cc=Philip.Yang@amd.com \
--cc=amd-gfx@lists.freedesktop.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.