All of lore.kernel.org
 help / color / mirror / Atom feed
* [bug report] drm/amdkfd: process exit and retry fault race
@ 2021-11-26  8:11 Dan Carpenter
  0 siblings, 0 replies; only message in thread
From: Dan Carpenter @ 2021-11-26  8:11 UTC (permalink / raw)
  To: Philip.Yang; +Cc: amd-gfx

Hello Philip Yang,

The patch a0c55ecee100: "drm/amdkfd: process exit and retry fault
race" from Nov 16, 2021, leads to the following Smatch static checker
warning:

	drivers/gpu/drm/amd/amdgpu/../amdkfd/kfd_svm.c:2615 svm_range_restore_pages()
	warn: missing error code here? 'get_task_mm()' failed. 'r' = '0'

drivers/gpu/drm/amd/amdgpu/../amdkfd/kfd_svm.c
    2570 int
    2571 svm_range_restore_pages(struct amdgpu_device *adev, unsigned int pasid,
    2572                         uint64_t addr, bool write_fault)
    2573 {
    2574         struct mm_struct *mm = NULL;
    2575         struct svm_range_list *svms;
    2576         struct svm_range *prange;
    2577         struct kfd_process *p;
    2578         uint64_t timestamp;
    2579         int32_t best_loc;
    2580         int32_t gpuidx = MAX_GPU_INSTANCE;
    2581         bool write_locked = false;
    2582         struct vm_area_struct *vma;
    2583         int r = 0;
    2584 
    2585         if (!KFD_IS_SVM_API_SUPPORTED(adev->kfd.dev)) {
    2586                 pr_debug("device does not support SVM\n");
    2587                 return -EFAULT;
    2588         }
    2589 
    2590         p = kfd_lookup_process_by_pasid(pasid);
    2591         if (!p) {
    2592                 pr_debug("kfd process not founded pasid 0x%x\n", pasid);
    2593                 return 0;
    2594         }
    2595         if (!p->xnack_enabled) {
    2596                 pr_debug("XNACK not enabled for pasid 0x%x\n", pasid);
    2597                 r = -EFAULT;
    2598                 goto out;
    2599         }
    2600         svms = &p->svms;
    2601 
    2602         pr_debug("restoring svms 0x%p fault address 0x%llx\n", svms, addr);
    2603 
    2604         if (atomic_read(&svms->drain_pagefaults)) {
    2605                 pr_debug("draining retry fault, drop fault 0x%llx\n", addr);

Presumably this is a success path.

    2606                 goto out;
    2607         }
    2608 
    2609         /* p->lead_thread is available as kfd_process_wq_release flush the work
    2610          * before releasing task ref.
    2611          */
    2612         mm = get_task_mm(p->lead_thread);
    2613         if (!mm) {
    2614                 pr_debug("svms 0x%p failed to get mm\n", svms);
--> 2615                 goto out;

This used to be an error path, but the patch removes the error code and
makes it a success path.  Unfortunately, it confuses static checkers
and also human readers.  The way to silence the static checker warning
is to set the "r = 0;" explicitly within 4 lines of the goto.

		r = 0;
		pr_debug("svms 0x%p failed to get mm\n", svms);
		goto out;


    2616         }
    2617 
    2618         mmap_read_lock(mm);
    2619 retry_write_locked:
    2620         mutex_lock(&svms->lock);
    2621         prange = svm_range_from_addr(svms, addr, NULL);
    2622         if (!prange) {
    2623                 pr_debug("failed to find prange svms 0x%p address [0x%llx]\n",
    2624                          svms, addr);
    2625                 if (!write_locked) {
    2626                         /* Need the write lock to create new range with MMU notifier.
    2627                          * Also flush pending deferred work to make sure the interval
    2628                          * tree is up to date before we add a new range
    2629                          */
    2630                         mutex_unlock(&svms->lock);
    2631                         mmap_read_unlock(mm);
    2632                         mmap_write_lock(mm);
    2633                         write_locked = true;
    2634                         goto retry_write_locked;
    2635                 }
    2636                 prange = svm_range_create_unregistered_range(adev, p, mm, addr);
    2637                 if (!prange) {
    2638                         pr_debug("failed to create unregistered range svms 0x%p address [0x%llx]\n",
    2639                                  svms, addr);
    2640                         mmap_write_downgrade(mm);
    2641                         r = -EFAULT;
    2642                         goto out_unlock_svms;
    2643                 }
    2644         }
    2645         if (write_locked)
    2646                 mmap_write_downgrade(mm);
    2647 
    2648         mutex_lock(&prange->migrate_mutex);
    2649 
    2650         if (svm_range_skip_recover(prange)) {
    2651                 amdgpu_gmc_filter_faults_remove(adev, addr, pasid);
    2652                 goto out_unlock_range;

Success path.

    2653         }
    2654 
    2655         timestamp = ktime_to_us(ktime_get()) - prange->validate_timestamp;
    2656         /* skip duplicate vm fault on different pages of same range */
    2657         if (timestamp < AMDGPU_SVM_RANGE_RETRY_FAULT_PENDING) {
    2658                 pr_debug("svms 0x%p [0x%lx %lx] already restored\n",
    2659                          svms, prange->start, prange->last);
    2660                 goto out_unlock_range;

Same.

    2661         }
    2662 
    2663         /* __do_munmap removed VMA, return success as we are handling stale
    2664          * retry fault.
    2665          */
    2666         vma = find_vma(mm, addr << PAGE_SHIFT);
    2667         if (!vma || (addr << PAGE_SHIFT) < vma->vm_start) {
    2668                 pr_debug("address 0x%llx VMA is removed\n", addr);
    2669                 r = 0;
    2670                 goto out_unlock_range;

This success path is very clear.  Good!

    2671         }
    2672 
    2673         if (!svm_fault_allowed(vma, write_fault)) {
    2674                 pr_debug("fault addr 0x%llx no %s permission\n", addr,
    2675                         write_fault ? "write" : "read");
    2676                 r = -EPERM;
    2677                 goto out_unlock_range;
    2678         }
    2679 
    2680         best_loc = svm_range_best_restore_location(prange, adev, &gpuidx);
    2681         if (best_loc == -1) {
    2682                 pr_debug("svms %p failed get best restore loc [0x%lx 0x%lx]\n",
    2683                          svms, prange->start, prange->last);
    2684                 r = -EACCES;
    2685                 goto out_unlock_range;
    2686         }
    2687 
    2688         pr_debug("svms %p [0x%lx 0x%lx] best restore 0x%x, actual loc 0x%x\n",
    2689                  svms, prange->start, prange->last, best_loc,
    2690                  prange->actual_loc);
    2691 
    2692         if (prange->actual_loc != best_loc) {
    2693                 if (best_loc) {
    2694                         r = svm_migrate_to_vram(prange, best_loc, mm);
    2695                         if (r) {
    2696                                 pr_debug("svm_migrate_to_vram failed (%d) at %llx, falling back to system memory\n",
    2697                                          r, addr);
    2698                                 /* Fallback to system memory if migration to
    2699                                  * VRAM failed
    2700                                  */
    2701                                 if (prange->actual_loc)
    2702                                         r = svm_migrate_vram_to_ram(prange, mm);
    2703                                 else
    2704                                         r = 0;
    2705                         }
    2706                 } else {
    2707                         r = svm_migrate_vram_to_ram(prange, mm);
    2708                 }
    2709                 if (r) {
    2710                         pr_debug("failed %d to migrate svms %p [0x%lx 0x%lx]\n",
    2711                                  r, svms, prange->start, prange->last);
    2712                         goto out_unlock_range;
    2713                 }
    2714         }
    2715 
    2716         r = svm_range_validate_and_map(mm, prange, gpuidx, false, false);
    2717         if (r)
    2718                 pr_debug("failed %d to map svms 0x%p [0x%lx 0x%lx] to gpus\n",
    2719                          r, svms, prange->start, prange->last);
    2720 
    2721 out_unlock_range:
    2722         mutex_unlock(&prange->migrate_mutex);
    2723 out_unlock_svms:
    2724         mutex_unlock(&svms->lock);
    2725         mmap_read_unlock(mm);
    2726 
    2727         svm_range_count_fault(adev, p, gpuidx);
    2728 
    2729         mmput(mm);
    2730 out:
    2731         kfd_unref_process(p);
    2732 
    2733         if (r == -EAGAIN) {
    2734                 pr_debug("recover vm fault later\n");
    2735                 amdgpu_gmc_filter_faults_remove(adev, addr, pasid);
    2736                 r = 0;
    2737         }
    2738         return r;
    2739 }

regards,
dan carpenter

^ permalink raw reply	[flat|nested] only message in thread

only message in thread, other threads:[~2021-11-26  8:12 UTC | newest]

Thread overview: (only message) (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2021-11-26  8:11 [bug report] drm/amdkfd: process exit and retry fault race Dan Carpenter

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.