From mboxrd@z Thu Jan 1 00:00:00 1970 From: =?ISO-8859-1?Q?Christian_K=F6nig?= Subject: Re: [PATCH 04/20] drm/radeon: convert fence to uint64_t v4 Date: Mon, 07 May 2012 17:04:37 +0200 Message-ID: <4FA7E485.4010200@vodafone.de> References: <1336390975-30945-1-git-send-email-deathsimple@vodafone.de> <1336390975-30945-5-git-send-email-deathsimple@vodafone.de> Mime-Version: 1.0 Content-Type: text/plain; charset="iso-8859-1"; Format="flowed" Content-Transfer-Encoding: quoted-printable Return-path: Received: from outgoing.email.vodafone.de (outgoing.email.vodafone.de [139.7.28.128]) by gabe.freedesktop.org (Postfix) with ESMTP id DD60F9EF5A for ; Mon, 7 May 2012 08:04:41 -0700 (PDT) In-Reply-To: List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , Sender: dri-devel-bounces+sf-dri-devel=m.gmane.org@lists.freedesktop.org Errors-To: dri-devel-bounces+sf-dri-devel=m.gmane.org@lists.freedesktop.org To: Jerome Glisse Cc: Jerome Glisse , dri-devel@lists.freedesktop.org List-Id: dri-devel@lists.freedesktop.org On 07.05.2012 16:39, Jerome Glisse wrote: > On Mon, May 7, 2012 at 7:42 AM, Christian K=F6nig wrote: >> From: Jerome Glisse >> >> This convert fence to use uint64_t sequence number intention is >> to use the fact that uin64_t is big enough that we don't need to >> care about wrap around. >> >> Tested with and without writeback using 0xFFFFF000 as initial >> fence sequence and thus allowing to test the wrap around from >> 32bits to 64bits. >> >> v2: Add comment about possible race btw CPU& GPU, add comment >> stressing that we need 2 dword aligned for R600_WB_EVENT_OFFSET >> Read fence sequenc in reverse order of GPU write them so we >> mitigate the race btw CPU and GPU. >> >> v3: Drop the need for ring to emit the 64bits fence, and just have >> each ring emit the lower 32bits of the fence sequence. We >> handle the wrap over 32bits in fence_process. >> >> v4: Just a small optimization: Don't reread the last_seq value >> if loop restarts, since we already know its value anyway. >> Also start at zero not one for seq value and use pre instead >> of post increment in emmit, otherwise wait_empty will deadlock. > Why changing that v3 was already good no deadlock. I started at 1 > especialy for that, a signaled fence is set to 0 so it always compare > as signaled. Just using preincrement is exactly like starting at one. > I don't see the need for this change but if it makes you happy. Not exactly, the last emitted sequence is also used in = radeon_fence_wait_empty. So when you use post increment = radeon_fence_wait_empty will actually not wait for the last emitted = fence to be signaled, but for last emitted + 1, so it practically waits = forever. Without this change suspend (for example) will just lockup. Cheers, Christian. > > Cheers, > Jerome >> Signed-off-by: Jerome Glisse >> Signed-off-by: Christian K=F6nig >> --- >> drivers/gpu/drm/radeon/radeon.h | 39 ++++++----- >> drivers/gpu/drm/radeon/radeon_fence.c | 116 +++++++++++++++++++++++--= -------- >> drivers/gpu/drm/radeon/radeon_ring.c | 9 ++- >> 3 files changed, 107 insertions(+), 57 deletions(-) >> >> diff --git a/drivers/gpu/drm/radeon/radeon.h b/drivers/gpu/drm/radeon/ra= deon.h >> index e99ea81..cdf46bc 100644 >> --- a/drivers/gpu/drm/radeon/radeon.h >> +++ b/drivers/gpu/drm/radeon/radeon.h >> @@ -100,28 +100,32 @@ extern int radeon_lockup_timeout; >> * Copy from radeon_drv.h so we don't have to include both and have con= flicting >> * symbol; >> */ >> -#define RADEON_MAX_USEC_TIMEOUT 100000 /* 100 ms */ >> -#define RADEON_FENCE_JIFFIES_TIMEOUT (HZ / 2) >> +#define RADEON_MAX_USEC_TIMEOUT 100000 /* 100 m= s */ >> +#define RADEON_FENCE_JIFFIES_TIMEOUT (HZ / 2) >> /* RADEON_IB_POOL_SIZE must be a power of 2 */ >> -#define RADEON_IB_POOL_SIZE 16 >> -#define RADEON_DEBUGFS_MAX_COMPONENTS 32 >> -#define RADEONFB_CONN_LIMIT 4 >> -#define RADEON_BIOS_NUM_SCRATCH 8 >> +#define RADEON_IB_POOL_SIZE 16 >> +#define RADEON_DEBUGFS_MAX_COMPONENTS 32 >> +#define RADEONFB_CONN_LIMIT 4 >> +#define RADEON_BIOS_NUM_SCRATCH 8 >> >> /* max number of rings */ >> -#define RADEON_NUM_RINGS 3 >> +#define RADEON_NUM_RINGS 3 >> + >> +/* fence seq are set to this number when signaled */ >> +#define RADEON_FENCE_SIGNALED_SEQ 0LL >> +#define RADEON_FENCE_NOTEMITED_SEQ (~0LL) >> >> /* internal ring indices */ >> /* r1xx+ has gfx CP ring */ >> -#define RADEON_RING_TYPE_GFX_INDEX 0 >> +#define RADEON_RING_TYPE_GFX_INDEX 0 >> >> /* cayman has 2 compute CP rings */ >> -#define CAYMAN_RING_TYPE_CP1_INDEX 1 >> -#define CAYMAN_RING_TYPE_CP2_INDEX 2 >> +#define CAYMAN_RING_TYPE_CP1_INDEX 1 >> +#define CAYMAN_RING_TYPE_CP2_INDEX 2 >> >> /* hardcode those limit for now */ >> -#define RADEON_VA_RESERVED_SIZE (8<< 20) >> -#define RADEON_IB_VM_MAX_SIZE (64<< 10) >> +#define RADEON_VA_RESERVED_SIZE (8<< 20) >> +#define RADEON_IB_VM_MAX_SIZE (64<< 10) >> >> /* >> * Errata workarounds. >> @@ -254,8 +258,9 @@ struct radeon_fence_driver { >> uint32_t scratch_reg; >> uint64_t gpu_addr; >> volatile uint32_t *cpu_addr; >> - atomic_t seq; >> - uint32_t last_seq; >> + /* seq is protected by ring emission lock */ >> + uint64_t seq; >> + atomic64_t last_seq; >> unsigned long last_activity; >> wait_queue_head_t queue; >> struct list_head emitted; >> @@ -268,11 +273,9 @@ struct radeon_fence { >> struct kref kref; >> struct list_head list; >> /* protected by radeon_fence.lock */ >> - uint32_t seq; >> - bool emitted; >> - bool signaled; >> + uint64_t seq; >> /* RB, DMA, etc. */ >> - int ring; >> + unsigned ring; >> struct radeon_semaphore *semaphore; >> }; >> >> diff --git a/drivers/gpu/drm/radeon/radeon_fence.c b/drivers/gpu/drm/rad= eon/radeon_fence.c >> index 5bb78bf..feb2bbc 100644 >> --- a/drivers/gpu/drm/radeon/radeon_fence.c >> +++ b/drivers/gpu/drm/radeon/radeon_fence.c >> @@ -66,14 +66,14 @@ int radeon_fence_emit(struct radeon_device *rdev, st= ruct radeon_fence *fence) >> unsigned long irq_flags; >> >> write_lock_irqsave(&rdev->fence_lock, irq_flags); >> - if (fence->emitted) { >> + if (fence->seq&& fence->seq< RADEON_FENCE_NOTEMITED_SEQ) { >> write_unlock_irqrestore(&rdev->fence_lock, irq_flags); >> return 0; >> } >> - fence->seq =3D atomic_add_return(1,&rdev->fence_drv[fence->ring]= .seq); >> + /* we are protected by the ring emission mutex */ >> + fence->seq =3D ++rdev->fence_drv[fence->ring].seq; >> radeon_fence_ring_emit(rdev, fence->ring, fence); >> trace_radeon_fence_emit(rdev->ddev, fence->seq); >> - fence->emitted =3D true; >> /* are we the first fence on a previusly idle ring? */ >> if (list_empty(&rdev->fence_drv[fence->ring].emitted)) { >> rdev->fence_drv[fence->ring].last_activity =3D jiffies; >> @@ -87,14 +87,60 @@ static bool radeon_fence_poll_locked(struct radeon_d= evice *rdev, int ring) >> { >> struct radeon_fence *fence; >> struct list_head *i, *n; >> - uint32_t seq; >> + uint64_t seq, last_seq; >> + unsigned count_loop =3D 0; >> bool wake =3D false; >> >> - seq =3D radeon_fence_read(rdev, ring); >> - if (seq =3D=3D rdev->fence_drv[ring].last_seq) >> - return false; >> + /* Note there is a scenario here for an infinite loop but it's >> + * very unlikely to happen. For it to happen, the current polling >> + * process need to be interrupted by another process and another >> + * process needs to update the last_seq btw the atomic read and >> + * xchg of the current process. >> + * >> + * More over for this to go in infinite loop there need to be >> + * continuously new fence signaled ie radeon_fence_read needs >> + * to return a different value each time for both the currently >> + * polling process and the other process that xchg the last_seq >> + * btw atomic read and xchg of the current process. And the >> + * value the other process set as last seq must be higher than >> + * the seq value we just read. Which means that current process >> + * need to be interrupted after radeon_fence_read and before >> + * atomic xchg. >> + * >> + * To be even more safe we count the number of time we loop and >> + * we bail after 10 loop just accepting the fact that we might >> + * have temporarly set the last_seq not to the true real last >> + * seq but to an older one. >> + */ >> + last_seq =3D atomic64_read(&rdev->fence_drv[ring].last_seq); >> + do { >> + seq =3D radeon_fence_read(rdev, ring); >> + seq |=3D last_seq& 0xffffffff00000000LL; >> + if (seq< last_seq) { >> + seq +=3D 0x100000000LL; >> + } >> >> - rdev->fence_drv[ring].last_seq =3D seq; >> + if (!wake&& seq =3D=3D last_seq) { >> + return false; >> + } >> + /* If we loop over we don't want to return without >> + * checking if a fence is signaled as it means that the >> + * seq we just read is different from the previous on. >> + */ >> + wake =3D true; >> + if ((count_loop++)> 10) { >> + /* We looped over too many time leave with the >> + * fact that we might have set an older fence >> + * seq then the current real last seq as signaled >> + * by the hw. >> + */ >> + break; >> + } >> + last_seq =3D seq; >> + } while (atomic64_xchg(&rdev->fence_drv[ring].last_seq, seq)> s= eq); >> + >> + /* reset wake to false */ >> + wake =3D false; >> rdev->fence_drv[ring].last_activity =3D jiffies; >> >> n =3D NULL; >> @@ -112,7 +158,7 @@ static bool radeon_fence_poll_locked(struct radeon_d= evice *rdev, int ring) >> n =3D i->prev; >> list_move_tail(i,&rdev->fence_drv[ring].signaled= ); >> fence =3D list_entry(i, struct radeon_fence, lis= t); >> - fence->signaled =3D true; >> + fence->seq =3D RADEON_FENCE_SIGNALED_SEQ; >> i =3D n; >> } while (i !=3D&rdev->fence_drv[ring].emitted); >> wake =3D true; >> @@ -128,7 +174,7 @@ static void radeon_fence_destroy(struct kref *kref) >> fence =3D container_of(kref, struct radeon_fence, kref); >> write_lock_irqsave(&fence->rdev->fence_lock, irq_flags); >> list_del(&fence->list); >> - fence->emitted =3D false; >> + fence->seq =3D RADEON_FENCE_NOTEMITED_SEQ; >> write_unlock_irqrestore(&fence->rdev->fence_lock, irq_flags); >> if (fence->semaphore) >> radeon_semaphore_free(fence->rdev, fence->semaphore); >> @@ -145,9 +191,7 @@ int radeon_fence_create(struct radeon_device *rdev, >> } >> kref_init(&((*fence)->kref)); >> (*fence)->rdev =3D rdev; >> - (*fence)->emitted =3D false; >> - (*fence)->signaled =3D false; >> - (*fence)->seq =3D 0; >> + (*fence)->seq =3D RADEON_FENCE_NOTEMITED_SEQ; >> (*fence)->ring =3D ring; >> (*fence)->semaphore =3D NULL; >> INIT_LIST_HEAD(&(*fence)->list); >> @@ -163,18 +207,18 @@ bool radeon_fence_signaled(struct radeon_fence *fe= nce) >> return true; >> >> write_lock_irqsave(&fence->rdev->fence_lock, irq_flags); >> - signaled =3D fence->signaled; >> + signaled =3D (fence->seq =3D=3D RADEON_FENCE_SIGNALED_SEQ); >> /* if we are shuting down report all fence as signaled */ >> if (fence->rdev->shutdown) { >> signaled =3D true; >> } >> - if (!fence->emitted) { >> + if (fence->seq =3D=3D RADEON_FENCE_NOTEMITED_SEQ) { >> WARN(1, "Querying an unemitted fence : %p !\n", fence); >> signaled =3D true; >> } >> if (!signaled) { >> radeon_fence_poll_locked(fence->rdev, fence->ring); >> - signaled =3D fence->signaled; >> + signaled =3D (fence->seq =3D=3D RADEON_FENCE_SIGNALED_SE= Q); >> } >> write_unlock_irqrestore(&fence->rdev->fence_lock, irq_flags); >> return signaled; >> @@ -183,8 +227,8 @@ bool radeon_fence_signaled(struct radeon_fence *fenc= e) >> int radeon_fence_wait(struct radeon_fence *fence, bool intr) >> { >> struct radeon_device *rdev; >> - unsigned long irq_flags, timeout; >> - u32 seq; >> + unsigned long irq_flags, timeout, last_activity; >> + uint64_t seq; >> int i, r; >> bool signaled; >> >> @@ -207,7 +251,9 @@ int radeon_fence_wait(struct radeon_fence *fence, bo= ol intr) >> timeout =3D 1; >> } >> /* save current sequence value used to check for GPU loc= kups */ >> - seq =3D rdev->fence_drv[fence->ring].last_seq; >> + seq =3D atomic64_read(&rdev->fence_drv[fence->ring].last= _seq); >> + /* Save current last activity valuee, used to check for = GPU lockups */ >> + last_activity =3D rdev->fence_drv[fence->ring].last_acti= vity; >> read_unlock_irqrestore(&rdev->fence_lock, irq_flags); >> >> trace_radeon_fence_wait_begin(rdev->ddev, seq); >> @@ -235,24 +281,23 @@ int radeon_fence_wait(struct radeon_fence *fence, = bool intr) >> } >> >> write_lock_irqsave(&rdev->fence_lock, irq_flags); >> - /* check if sequence value has changed since las= t_activity */ >> - if (seq !=3D rdev->fence_drv[fence->ring].last_s= eq) { >> + /* test if somebody else has already decided tha= t this is a lockup */ >> + if (last_activity !=3D rdev->fence_drv[fence->ri= ng].last_activity) { >> write_unlock_irqrestore(&rdev->fence_loc= k, irq_flags); >> continue; >> } >> >> - /* change sequence value on all rings, so nobody= else things there is a lockup */ >> - for (i =3D 0; i< RADEON_NUM_RINGS; ++i) >> - rdev->fence_drv[i].last_seq -=3D 0x10000; >> - >> - rdev->fence_drv[fence->ring].last_activity =3D j= iffies; >> write_unlock_irqrestore(&rdev->fence_lock, irq_f= lags); >> >> if (radeon_ring_is_lockup(rdev, fence->ring,&rde= v->ring[fence->ring])) { >> - >> /* good news we believe it's a lockup */ >> - printk(KERN_WARNING "GPU lockup (waiting= for 0x%08X last fence id 0x%08X)\n", >> - fence->seq, seq); >> + dev_warn(rdev->dev, "GPU lockup (waiting= for 0x%016llx last fence id 0x%016llx)\n", >> + fence->seq, seq); >> + >> + /* change last activity so nobody else t= hink there is a lockup */ >> + for (i =3D 0; i< RADEON_NUM_RINGS; ++i)= { >> + rdev->fence_drv[i].last_activity= =3D jiffies; >> + } >> >> /* mark the ring as not ready any more */ >> rdev->ring[fence->ring].ready =3D false; >> @@ -387,9 +432,9 @@ int radeon_fence_driver_start_ring(struct radeon_dev= ice *rdev, int ring) >> } >> rdev->fence_drv[ring].cpu_addr =3D&rdev->wb.wb[index/4]; >> rdev->fence_drv[ring].gpu_addr =3D rdev->wb.gpu_addr + index; >> - radeon_fence_write(rdev, atomic_read(&rdev->fence_drv[ring]..seq= ), ring); >> + radeon_fence_write(rdev, rdev->fence_drv[ring].seq, ring); >> rdev->fence_drv[ring].initialized =3D true; >> - DRM_INFO("fence driver on ring %d use gpu addr 0x%08Lx and cpu a= ddr 0x%p\n", >> + DRM_INFO("fence driver on ring %d use gpu addr 0x%016llx and cpu= addr 0x%p\n", >> ring, rdev->fence_drv[ring].gpu_addr, rdev->fence_drv[r= ing].cpu_addr); >> write_unlock_irqrestore(&rdev->fence_lock, irq_flags); >> return 0; >> @@ -400,7 +445,8 @@ static void radeon_fence_driver_init_ring(struct rad= eon_device *rdev, int ring) >> rdev->fence_drv[ring].scratch_reg =3D -1; >> rdev->fence_drv[ring].cpu_addr =3D NULL; >> rdev->fence_drv[ring].gpu_addr =3D 0; >> - atomic_set(&rdev->fence_drv[ring].seq, 0); >> + rdev->fence_drv[ring].seq =3D 0; >> + atomic64_set(&rdev->fence_drv[ring].last_seq, 0); >> INIT_LIST_HEAD(&rdev->fence_drv[ring].emitted); >> INIT_LIST_HEAD(&rdev->fence_drv[ring].signaled); >> init_waitqueue_head(&rdev->fence_drv[ring].queue); >> @@ -458,12 +504,12 @@ static int radeon_debugfs_fence_info(struct seq_fi= le *m, void *data) >> continue; >> >> seq_printf(m, "--- ring %d ---\n", i); >> - seq_printf(m, "Last signaled fence 0x%08X\n", >> - radeon_fence_read(rdev, i)); >> + seq_printf(m, "Last signaled fence 0x%016lx\n", >> + atomic64_read(&rdev->fence_drv[i].last_seq)); >> if (!list_empty(&rdev->fence_drv[i].emitted)) { >> fence =3D list_entry(rdev->fence_drv[i].emitted.= prev, >> struct radeon_fence, list); >> - seq_printf(m, "Last emitted fence %p with 0x%08X= \n", >> + seq_printf(m, "Last emitted fence %p with 0x%016= llx\n", >> fence, fence->seq); >> } >> } >> diff --git a/drivers/gpu/drm/radeon/radeon_ring.c b/drivers/gpu/drm/rade= on/radeon_ring.c >> index a4d60ae..4ae222b 100644 >> --- a/drivers/gpu/drm/radeon/radeon_ring.c >> +++ b/drivers/gpu/drm/radeon/radeon_ring.c >> @@ -82,7 +82,7 @@ bool radeon_ib_try_free(struct radeon_device *rdev, st= ruct radeon_ib *ib) >> bool done =3D false; >> >> /* only free ib which have been emited */ >> - if (ib->fence&& ib->fence->emitted) { >> + if (ib->fence&& ib->fence->seq< RADEON_FENCE_NOTEMITED_SEQ) { >> if (radeon_fence_signaled(ib->fence)) { >> radeon_fence_unref(&ib->fence); >> radeon_sa_bo_free(rdev,&ib->sa_bo); >> @@ -149,8 +149,9 @@ retry: >> /* this should be rare event, ie all ib scheduled none signaled = yet. >> */ >> for (i =3D 0; i< RADEON_IB_POOL_SIZE; i++) { >> - if (rdev->ib_pool.ibs[idx].fence&& rdev->ib_pool.ibs[id= x].fence->emitted) { >> - r =3D radeon_fence_wait(rdev->ib_pool.ibs[idx].f= ence, false); >> + struct radeon_fence *fence =3D rdev->ib_pool.ibs[idx].fe= nce; >> + if (fence&& fence->seq< RADEON_FENCE_NOTEMITED_SEQ) { >> + r =3D radeon_fence_wait(fence, false); >> if (!r) { >> goto retry; >> } >> @@ -173,7 +174,7 @@ void radeon_ib_free(struct radeon_device *rdev, stru= ct radeon_ib **ib) >> return; >> } >> radeon_mutex_lock(&rdev->ib_pool.mutex); >> - if (tmp->fence&& !tmp->fence->emitted) { >> + if (tmp->fence&& tmp->fence->seq =3D=3D RADEON_FENCE_NOTEMITED_= SEQ) { >> radeon_sa_bo_free(rdev,&tmp->sa_bo); >> radeon_fence_unref(&tmp->fence); >> } >> -- >> 1.7.5.4 >>