* [PATCH 1/2] drm/i915: Do not return stale status / remove impossible WARN
@ 2016-02-11 18:03 Tvrtko Ursulin
2016-02-11 18:03 ` [PATCH 2/2] drm/i915: Execlist irq handler micro optimisations Tvrtko Ursulin
` (2 more replies)
0 siblings, 3 replies; 14+ messages in thread
From: Tvrtko Ursulin @ 2016-02-11 18:03 UTC
To: Intel-gfx
From: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
The only caller of get_context_status ensures the read pointer stays
in range, so the WARN is impossible. Also, if the WARN were triggered
by a hypothetical new caller, stale status would be returned to them.

It is better to wrap the pointer in the function itself to avoid both
issues, and it even results in smaller code.
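To see the stale-status hazard in isolation, here is a minimal userspace
sketch (the names and values are illustrative, not the driver's): with
the old out-parameter API an early return leaves the caller's variable
holding whatever it had before.

#include <stdio.h>

#define CSB_ENTRIES 6   /* stands in for GEN8_CSB_ENTRIES */

/* old shape: warn and bail, the caller keeps a stale *status */
static void get_status_old(unsigned int rp, unsigned int *status)
{
        if (rp >= CSB_ENTRIES)
                return;                 /* *status left untouched */
        *status = rp * 100;             /* stand-in for the mmio read */
}

/* new shape: wrap the pointer and always return a fresh value */
static unsigned int get_status_new(unsigned int rp)
{
        rp %= CSB_ENTRIES;
        return rp * 100;
}

int main(void)
{
        unsigned int status = 0xdead;   /* left over from a previous loop */

        get_status_old(CSB_ENTRIES + 1, &status);
        printf("old API: 0x%x (stale)\n", status);
        printf("new API: %u (wrapped)\n", get_status_new(CSB_ENTRIES + 1));
        return 0;
}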
Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
---
drivers/gpu/drm/i915/intel_lrc.c | 15 ++++++---------
1 file changed, 6 insertions(+), 9 deletions(-)
diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index 89eb892df4ae..951f1e6af947 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -507,17 +507,16 @@ static bool execlists_check_remove_request(struct intel_engine_cs *ring,
return false;
}
-static void get_context_status(struct intel_engine_cs *ring,
- u8 read_pointer,
- u32 *status, u32 *context_id)
+static u32 get_context_status(struct intel_engine_cs *ring, u8 read_pointer,
+ u32 *context_id)
{
struct drm_i915_private *dev_priv = ring->dev->dev_private;
- if (WARN_ON(read_pointer >= GEN8_CSB_ENTRIES))
- return;
+ read_pointer %= GEN8_CSB_ENTRIES;
- *status = I915_READ(RING_CONTEXT_STATUS_BUF_LO(ring, read_pointer));
*context_id = I915_READ(RING_CONTEXT_STATUS_BUF_HI(ring, read_pointer));
+
+ return I915_READ(RING_CONTEXT_STATUS_BUF_LO(ring, read_pointer));
}
/**
@@ -547,9 +546,7 @@ void intel_lrc_irq_handler(struct intel_engine_cs *ring)
spin_lock(&ring->execlist_lock);
while (read_pointer < write_pointer) {
-
- get_context_status(ring, ++read_pointer % GEN8_CSB_ENTRIES,
- &status, &status_id);
+ status = get_context_status(ring, ++read_pointer, &status_id);
if (status & GEN8_CTX_STATUS_IDLE_ACTIVE)
continue;
--
1.9.1
* [PATCH 2/2] drm/i915: Execlist irq handler micro optimisations
2016-02-11 18:03 [PATCH 1/2] drm/i915: Do not return stale status / remove impossible WARN Tvrtko Ursulin
@ 2016-02-11 18:03 ` Tvrtko Ursulin
2016-02-11 20:59 ` Chris Wilson
2016-02-12 12:00 ` [PATCH v2] " Tvrtko Ursulin
2016-02-11 21:00 ` [PATCH 1/2] drm/i915: Do not return stale status / remove impossible WARN Chris Wilson
2016-02-16 8:10 ` ✗ Fi.CI.BAT: failure for series starting with [2/2] drm/i915: Execlist irq handler micro optimisations (rev3) Patchwork
2 siblings, 2 replies; 14+ messages in thread
From: Tvrtko Ursulin @ 2016-02-11 18:03 UTC
To: Intel-gfx
From: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Assorted changes most likely without any practical effect
apart from a tiny reduction in generated code for the interrupt
handler and request submission.
* Remove needless initialization.
* Improve cache locality by reorganizing code and/or using
branch hints to keep unexpected or error conditions out
of line.
* Favor busy submit path vs. empty queue.
* Less branching in hot-paths.
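For reference, the branch hints above are the kernel's likely()/unlikely()
macros, which expand to __builtin_expect(); a minimal sketch of the pattern
(the function and the userspace stand-in macros are illustrative only):

#include <stdio.h>

/* userspace stand-ins for the kernel's hints from <linux/compiler.h> */
#define likely(x)       __builtin_expect(!!(x), 1)
#define unlikely(x)     __builtin_expect(!!(x), 0)

static int submit(const int *batch)
{
        if (unlikely(!batch))           /* cold path, laid out of line */
                return -1;

        return *batch + 1;              /* hot path falls straight through */
}

int main(void)
{
        int batch = 41;

        printf("%d\n", submit(&batch)); /* 42 */
        printf("%d\n", submit(NULL));   /* -1 */
        return 0;
}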
Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
---
drivers/gpu/drm/i915/intel_lrc.c | 91 ++++++++++++++++-----------------
drivers/gpu/drm/i915/intel_ringbuffer.h | 1 +
2 files changed, 44 insertions(+), 48 deletions(-)
diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index 951f1e6af947..589d6404b5ca 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -269,6 +269,9 @@ logical_ring_init_platform_invariants(struct intel_engine_cs *ring)
{
struct drm_device *dev = ring->dev;
+ if (IS_GEN8(dev) || IS_GEN9(dev))
+ ring->idle_lite_restore_wa = ~0;
+
ring->disable_lite_restore_wa = (IS_SKL_REVID(dev, 0, SKL_REVID_B0) ||
IS_BXT_REVID(dev, 0, BXT_REVID_A1)) &&
(ring->id == VCS || ring->id == VCS2);
@@ -424,7 +427,7 @@ static void execlists_submit_requests(struct drm_i915_gem_request *rq0,
static void execlists_context_unqueue(struct intel_engine_cs *ring)
{
struct drm_i915_gem_request *req0 = NULL, *req1 = NULL;
- struct drm_i915_gem_request *cursor = NULL, *tmp = NULL;
+ struct drm_i915_gem_request *cursor, *tmp;
assert_spin_locked(&ring->execlist_lock);
@@ -434,9 +437,6 @@ static void execlists_context_unqueue(struct intel_engine_cs *ring)
*/
WARN_ON(!intel_irqs_enabled(ring->dev->dev_private));
- if (list_empty(&ring->execlist_queue))
- return;
-
/* Try to read in pairs */
list_for_each_entry_safe(cursor, tmp, &ring->execlist_queue,
execlist_link) {
@@ -451,37 +451,35 @@ static void execlists_context_unqueue(struct intel_engine_cs *ring)
req0 = cursor;
} else {
req1 = cursor;
+ WARN_ON(req1->elsp_submitted);
break;
}
}
- if (IS_GEN8(ring->dev) || IS_GEN9(ring->dev)) {
+ if (unlikely(!req0))
+ return;
+
+ if (req0->elsp_submitted & ring->idle_lite_restore_wa) {
/*
- * WaIdleLiteRestore: make sure we never cause a lite
- * restore with HEAD==TAIL
+ * WaIdleLiteRestore: make sure we never cause a lite restore
+ * with HEAD==TAIL.
+ *
+ * Apply the wa NOOPS to prevent ring:HEAD == req:TAIL as we
+ * resubmit the request. See gen8_emit_request() for where we
+ * prepare the padding after the end of the request.
*/
- if (req0->elsp_submitted) {
- /*
- * Apply the wa NOOPS to prevent ring:HEAD == req:TAIL
- * as we resubmit the request. See gen8_emit_request()
- * for where we prepare the padding after the end of the
- * request.
- */
- struct intel_ringbuffer *ringbuf;
+ struct intel_ringbuffer *ringbuf;
- ringbuf = req0->ctx->engine[ring->id].ringbuf;
- req0->tail += 8;
- req0->tail &= ringbuf->size - 1;
- }
+ ringbuf = req0->ctx->engine[ring->id].ringbuf;
+ req0->tail += 8;
+ req0->tail &= ringbuf->size - 1;
}
- WARN_ON(req1 && req1->elsp_submitted);
-
execlists_submit_requests(req0, req1);
}
-static bool execlists_check_remove_request(struct intel_engine_cs *ring,
- u32 request_id)
+static unsigned int
+execlists_check_remove_request(struct intel_engine_cs *ring, u32 request_id)
{
struct drm_i915_gem_request *head_req;
@@ -491,20 +489,21 @@ static bool execlists_check_remove_request(struct intel_engine_cs *ring,
struct drm_i915_gem_request,
execlist_link);
- if (head_req != NULL) {
- if (intel_execlists_ctx_id(head_req->ctx, ring) == request_id) {
- WARN(head_req->elsp_submitted == 0,
- "Never submitted head request\n");
+ if (!head_req)
+ return 0;
+
+ if (unlikely(intel_execlists_ctx_id(head_req->ctx, ring) != request_id))
+ return 0;
- if (--head_req->elsp_submitted <= 0) {
- list_move_tail(&head_req->execlist_link,
- &ring->execlist_retired_req_list);
- return true;
- }
- }
- }
+ WARN(head_req->elsp_submitted == 0, "Never submitted head request\n");
- return false;
+ if (--head_req->elsp_submitted > 0)
+ return 0;
+
+ list_move_tail(&head_req->execlist_link,
+ &ring->execlist_retired_req_list);
+
+ return 1;
}
static u32 get_context_status(struct intel_engine_cs *ring, u8 read_pointer,
@@ -551,7 +550,7 @@ void intel_lrc_irq_handler(struct intel_engine_cs *ring)
if (status & GEN8_CTX_STATUS_IDLE_ACTIVE)
continue;
- if (status & GEN8_CTX_STATUS_PREEMPTED) {
+ if (unlikely(status & GEN8_CTX_STATUS_PREEMPTED)) {
if (status & GEN8_CTX_STATUS_LITE_RESTORE) {
if (execlists_check_remove_request(ring, status_id))
WARN(1, "Lite Restored request removed from queue\n");
@@ -559,21 +558,16 @@ void intel_lrc_irq_handler(struct intel_engine_cs *ring)
WARN(1, "Preemption without Lite Restore\n");
}
- if ((status & GEN8_CTX_STATUS_ACTIVE_IDLE) ||
- (status & GEN8_CTX_STATUS_ELEMENT_SWITCH)) {
- if (execlists_check_remove_request(ring, status_id))
- submit_contexts++;
- }
+ if (status & (GEN8_CTX_STATUS_ACTIVE_IDLE |
+ GEN8_CTX_STATUS_ELEMENT_SWITCH))
+ submit_contexts +=
+ execlists_check_remove_request(ring, status_id);
}
- if (ring->disable_lite_restore_wa) {
- /* Prevent a ctx to preempt itself */
- if ((status & GEN8_CTX_STATUS_ACTIVE_IDLE) &&
- (submit_contexts != 0))
- execlists_context_unqueue(ring);
- } else if (submit_contexts != 0) {
+ if (submit_contexts && (!ring->disable_lite_restore_wa ||
+ (ring->disable_lite_restore_wa && (status &
+ GEN8_CTX_STATUS_ACTIVE_IDLE))))
execlists_context_unqueue(ring);
- }
spin_unlock(&ring->execlist_lock);
@@ -2028,6 +2022,7 @@ void intel_logical_ring_cleanup(struct intel_engine_cs *ring)
ring->status_page.obj = NULL;
}
+ ring->idle_lite_restore_wa = 0;
ring->disable_lite_restore_wa = false;
ring->ctx_desc_template = 0;
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
index 566b0ae10ce0..54054ba6e223 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.h
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
@@ -272,6 +272,7 @@ struct intel_engine_cs {
struct list_head execlist_queue;
struct list_head execlist_retired_req_list;
u8 next_context_status_buffer;
+ unsigned int idle_lite_restore_wa;
bool disable_lite_restore_wa;
u32 ctx_desc_template;
u32 irq_keep_mask; /* bitmask for interrupts that should not be masked */
--
1.9.1
* Re: [PATCH 2/2] drm/i915: Execlist irq handler micro optimisations
2016-02-11 18:03 ` [PATCH 2/2] drm/i915: Execlist irq handler micro optimisations Tvrtko Ursulin
@ 2016-02-11 20:59 ` Chris Wilson
2016-02-12 12:00 ` [PATCH v2] " Tvrtko Ursulin
1 sibling, 0 replies; 14+ messages in thread
From: Chris Wilson @ 2016-02-11 20:59 UTC
To: Tvrtko Ursulin; +Cc: Intel-gfx
On Thu, Feb 11, 2016 at 06:03:10PM +0000, Tvrtko Ursulin wrote:
> From: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
>
> Assorted changes most likely without any practical effect
> apart from a tiny reduction in generated code for the interrupt
> handler and request submission.
>
> * Remove needless initialization.
> * Improve cache locality by reorganizing code and/or using
> branch hints to keep unexpected or error conditions out
> of line.
> * Favor busy submit path vs. empty queue.
> * Less branching in hot-paths.
Sadly there are panics here to be taken care of before optimisations. :|
> Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> ---
> drivers/gpu/drm/i915/intel_lrc.c | 91 ++++++++++++++++-----------------
> drivers/gpu/drm/i915/intel_ringbuffer.h | 1 +
> 2 files changed, 44 insertions(+), 48 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
> index 951f1e6af947..589d6404b5ca 100644
> --- a/drivers/gpu/drm/i915/intel_lrc.c
> +++ b/drivers/gpu/drm/i915/intel_lrc.c
> @@ -269,6 +269,9 @@ logical_ring_init_platform_invariants(struct intel_engine_cs *ring)
> {
> struct drm_device *dev = ring->dev;
>
> + if (IS_GEN8(dev) || IS_GEN9(dev))
> + ring->idle_lite_restore_wa = ~0;
> +
> ring->disable_lite_restore_wa = (IS_SKL_REVID(dev, 0, SKL_REVID_B0) ||
> IS_BXT_REVID(dev, 0, BXT_REVID_A1)) &&
> (ring->id == VCS || ring->id == VCS2);
> @@ -424,7 +427,7 @@ static void execlists_submit_requests(struct drm_i915_gem_request *rq0,
> static void execlists_context_unqueue(struct intel_engine_cs *ring)
> {
> struct drm_i915_gem_request *req0 = NULL, *req1 = NULL;
> - struct drm_i915_gem_request *cursor = NULL, *tmp = NULL;
> + struct drm_i915_gem_request *cursor, *tmp;
>
> assert_spin_locked(&ring->execlist_lock);
>
> @@ -434,9 +437,6 @@ static void execlists_context_unqueue(struct intel_engine_cs *ring)
> */
> WARN_ON(!intel_irqs_enabled(ring->dev->dev_private));
>
> - if (list_empty(&ring->execlist_queue))
> - return;
> -
> /* Try to read in pairs */
> list_for_each_entry_safe(cursor, tmp, &ring->execlist_queue,
> execlist_link) {
> @@ -451,37 +451,35 @@ static void execlists_context_unqueue(struct intel_engine_cs *ring)
> req0 = cursor;
> } else {
> req1 = cursor;
> + WARN_ON(req1->elsp_submitted);
> break;
> }
> }
>
> - if (IS_GEN8(ring->dev) || IS_GEN9(ring->dev)) {
> + if (unlikely(!req0))
> + return;
> +
> + if (req0->elsp_submitted & ring->idle_lite_restore_wa) {
If you are going for the micro-optimisation, just do
if (req->elsp_submitted)
We unconditionally prepare the ringbuffer for the wa and currently apply
it everywhere (and future hw will not be taking this path, aiui at least).
-Chris
--
Chris Wilson, Intel Open Source Technology Centre
* Re: [PATCH 1/2] drm/i915: Do not return stale status / remove impossible WARN
2016-02-11 18:03 [PATCH 1/2] drm/i915: Do not return stale status / remove impossible WARN Tvrtko Ursulin
2016-02-11 18:03 ` [PATCH 2/2] drm/i915: Execlist irq handler micro optimisations Tvrtko Ursulin
@ 2016-02-11 21:00 ` Chris Wilson
2016-02-12 10:05 ` Tvrtko Ursulin
2016-02-16 8:10 ` ✗ Fi.CI.BAT: failure for series starting with [2/2] drm/i915: Execlist irq handler micro optimisations (rev3) Patchwork
2 siblings, 1 reply; 14+ messages in thread
From: Chris Wilson @ 2016-02-11 21:00 UTC
To: Tvrtko Ursulin; +Cc: Intel-gfx
On Thu, Feb 11, 2016 at 06:03:09PM +0000, Tvrtko Ursulin wrote:
> From: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
>
> The only caller of get_context_status ensures the read pointer stays
> in range, so the WARN is impossible. Also, if the WARN were triggered
> by a hypothetical new caller, stale status would be returned to them.
>
> It is better to wrap the pointer in the function itself to avoid both
> issues, and it even results in smaller code.
>
> Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> ---
> drivers/gpu/drm/i915/intel_lrc.c | 15 ++++++---------
> 1 file changed, 6 insertions(+), 9 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
> index 89eb892df4ae..951f1e6af947 100644
> --- a/drivers/gpu/drm/i915/intel_lrc.c
> +++ b/drivers/gpu/drm/i915/intel_lrc.c
> @@ -507,17 +507,16 @@ static bool execlists_check_remove_request(struct intel_engine_cs *ring,
> return false;
> }
>
> -static void get_context_status(struct intel_engine_cs *ring,
> - u8 read_pointer,
> - u32 *status, u32 *context_id)
> +static u32 get_context_status(struct intel_engine_cs *ring, u8 read_pointer,
> + u32 *context_id)
> {
> struct drm_i915_private *dev_priv = ring->dev->dev_private;
>
> - if (WARN_ON(read_pointer >= GEN8_CSB_ENTRIES))
> - return;
> + read_pointer %= GEN8_CSB_ENTRIES;
>
> - *status = I915_READ(RING_CONTEXT_STATUS_BUF_LO(ring, read_pointer));
> *context_id = I915_READ(RING_CONTEXT_STATUS_BUF_HI(ring, read_pointer));
Micro-optimising hat says not to even do the uncached, spinlocked mmio
read when not required.
-Chris
--
Chris Wilson, Intel Open Source Technology Centre
* Re: [PATCH 1/2] drm/i915: Do not return stale status / remove impossible WARN
2016-02-11 21:00 ` [PATCH 1/2] drm/i915: Do not return stale status / remove impossible WARN Chris Wilson
@ 2016-02-12 10:05 ` Tvrtko Ursulin
2016-02-12 10:21 ` Chris Wilson
0 siblings, 1 reply; 14+ messages in thread
From: Tvrtko Ursulin @ 2016-02-12 10:05 UTC
To: Chris Wilson, Intel-gfx
On 11/02/16 21:00, Chris Wilson wrote:
> On Thu, Feb 11, 2016 at 06:03:09PM +0000, Tvrtko Ursulin wrote:
>> From: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
>>
>> The only caller of get_context_status ensures the read pointer stays
>> in range, so the WARN is impossible. Also, if the WARN were triggered
>> by a hypothetical new caller, stale status would be returned to them.
>>
>> It is better to wrap the pointer in the function itself to avoid both
>> issues, and it even results in smaller code.
>>
>> Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
>> ---
>> drivers/gpu/drm/i915/intel_lrc.c | 15 ++++++---------
>> 1 file changed, 6 insertions(+), 9 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
>> index 89eb892df4ae..951f1e6af947 100644
>> --- a/drivers/gpu/drm/i915/intel_lrc.c
>> +++ b/drivers/gpu/drm/i915/intel_lrc.c
>> @@ -507,17 +507,16 @@ static bool execlists_check_remove_request(struct intel_engine_cs *ring,
>> return false;
>> }
>>
>> -static void get_context_status(struct intel_engine_cs *ring,
>> - u8 read_pointer,
>> - u32 *status, u32 *context_id)
>> +static u32 get_context_status(struct intel_engine_cs *ring, u8 read_pointer,
>> + u32 *context_id)
>> {
>> struct drm_i915_private *dev_priv = ring->dev->dev_private;
>>
>> - if (WARN_ON(read_pointer >= GEN8_CSB_ENTRIES))
>> - return;
>> + read_pointer %= GEN8_CSB_ENTRIES;
>>
>> - *status = I915_READ(RING_CONTEXT_STATUS_BUF_LO(ring, read_pointer));
>> *context_id = I915_READ(RING_CONTEXT_STATUS_BUF_HI(ring, read_pointer));
>
> Micro-optimising hat says not to even do the uncached, spinlocked mmio
> read when not required.
You mean move the forcewake grab out from elsp write to cover all mmio
reads? These two patches make the irq handler around 3.7% smaller and
moving the forcewake/uncore lock shrinks that by 1% more. Must be
faster as well, if someone could measure it. :)
diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index 1e7ccd0a6573..77a64008f53d 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -375,8 +375,6 @@ static void execlists_elsp_write(struct drm_i915_gem_request *rq0,
rq0->elsp_submitted++;
/* You must always write both descriptors in the order below. */
- spin_lock(&dev_priv->uncore.lock);
- intel_uncore_forcewake_get__locked(dev_priv, FORCEWAKE_ALL);
I915_WRITE_FW(RING_ELSP(ring), upper_32_bits(desc[1]));
I915_WRITE_FW(RING_ELSP(ring), lower_32_bits(desc[1]));
@@ -386,8 +384,6 @@ static void execlists_elsp_write(struct drm_i915_gem_request *rq0,
/* ELSP is a wo register, use another nearby reg for posting */
POSTING_READ_FW(RING_EXECLIST_STATUS_LO(ring));
- intel_uncore_forcewake_put__locked(dev_priv, FORCEWAKE_ALL);
- spin_unlock(&dev_priv->uncore.lock);
}
static int execlists_update_context(struct drm_i915_gem_request *rq)
@@ -513,9 +509,9 @@ static u32 get_context_status(struct intel_engine_cs *ring, u8 read_pointer,
read_pointer %= GEN8_CSB_ENTRIES;
- *context_id = I915_READ(RING_CONTEXT_STATUS_BUF_HI(ring, read_pointer));
+ *context_id = I915_READ_FW(RING_CONTEXT_STATUS_BUF_HI(ring, read_pointer));
- return I915_READ(RING_CONTEXT_STATUS_BUF_LO(ring, read_pointer));
+ return I915_READ_FW(RING_CONTEXT_STATUS_BUF_LO(ring, read_pointer));
}
/**
@@ -535,15 +531,18 @@ void intel_lrc_irq_handler(struct intel_engine_cs *ring)
u32 status_id;
u32 submit_contexts = 0;
- status_pointer = I915_READ(RING_CONTEXT_STATUS_PTR(ring));
+ spin_lock(&ring->execlist_lock);
+
+ spin_lock(&dev_priv->uncore.lock);
+ intel_uncore_forcewake_get__locked(dev_priv, FORCEWAKE_ALL);
+
+ status_pointer = I915_READ_FW(RING_CONTEXT_STATUS_PTR(ring));
read_pointer = ring->next_context_status_buffer;
write_pointer = GEN8_CSB_WRITE_PTR(status_pointer);
if (read_pointer > write_pointer)
write_pointer += GEN8_CSB_ENTRIES;
- spin_lock(&ring->execlist_lock);
-
while (read_pointer < write_pointer) {
status = get_context_status(ring, ++read_pointer, &status_id);
@@ -569,22 +568,26 @@ void intel_lrc_irq_handler(struct intel_engine_cs *ring)
GEN8_CTX_STATUS_ACTIVE_IDLE))))
execlists_context_unqueue(ring);
- spin_unlock(&ring->execlist_lock);
-
- if (unlikely(submit_contexts > 2))
- DRM_ERROR("More than two context complete events?\n");
-
ring->next_context_status_buffer = write_pointer % GEN8_CSB_ENTRIES;
/* Update the read pointer to the old write pointer. Manual ringbuffer
* management ftw </sarcasm> */
- I915_WRITE(RING_CONTEXT_STATUS_PTR(ring),
- _MASKED_FIELD(GEN8_CSB_READ_PTR_MASK,
- ring->next_context_status_buffer << 8));
+ I915_WRITE_FW(RING_CONTEXT_STATUS_PTR(ring),
+ _MASKED_FIELD(GEN8_CSB_READ_PTR_MASK,
+ ring->next_context_status_buffer << 8));
+
+ intel_uncore_forcewake_put__locked(dev_priv, FORCEWAKE_ALL);
+ spin_unlock(&dev_priv->uncore.lock);
+
+ spin_unlock(&ring->execlist_lock);
+
+ if (unlikely(submit_contexts > 2))
+ DRM_ERROR("More than two context complete events?\n");
}
static int execlists_context_queue(struct drm_i915_gem_request *request)
{
+ struct drm_i915_private *dev_priv = request->i915;
struct intel_engine_cs *ring = request->ring;
struct drm_i915_gem_request *cursor;
int num_elements = 0;
@@ -616,9 +619,16 @@ static int execlists_context_queue(struct drm_i915_gem_request *request)
}
list_add_tail(&request->execlist_link, &ring->execlist_queue);
- if (num_elements == 0)
+ if (num_elements == 0) {
+ spin_lock(&dev_priv->uncore.lock);
+ intel_uncore_forcewake_get__locked(dev_priv, FORCEWAKE_ALL);
+
execlists_context_unqueue(ring);
+ intel_uncore_forcewake_put__locked(dev_priv, FORCEWAKE_ALL);
+ spin_unlock(&dev_priv->uncore.lock);
+ }
+
spin_unlock_irq(&ring->execlist_lock);
return 0;
Regards,
Tvrtko
* Re: [PATCH 1/2] drm/i915: Do not return stale status / remove impossible WARN
2016-02-12 10:05 ` Tvrtko Ursulin
@ 2016-02-12 10:21 ` Chris Wilson
2016-02-12 11:45 ` Tvrtko Ursulin
0 siblings, 1 reply; 14+ messages in thread
From: Chris Wilson @ 2016-02-12 10:21 UTC
To: Tvrtko Ursulin; +Cc: Intel-gfx
On Fri, Feb 12, 2016 at 10:05:58AM +0000, Tvrtko Ursulin wrote:
>
> On 11/02/16 21:00, Chris Wilson wrote:
> > On Thu, Feb 11, 2016 at 06:03:09PM +0000, Tvrtko Ursulin wrote:
> >> From: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> >>
> >> The only caller of get_context_status ensures the read pointer stays
> >> in range, so the WARN is impossible. Also, if the WARN were triggered
> >> by a hypothetical new caller, stale status would be returned to them.
> >>
> >> It is better to wrap the pointer in the function itself to avoid both
> >> issues, and it even results in smaller code.
> >>
> >> Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> >> ---
> >> drivers/gpu/drm/i915/intel_lrc.c | 15 ++++++---------
> >> 1 file changed, 6 insertions(+), 9 deletions(-)
> >>
> >> diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
> >> index 89eb892df4ae..951f1e6af947 100644
> >> --- a/drivers/gpu/drm/i915/intel_lrc.c
> >> +++ b/drivers/gpu/drm/i915/intel_lrc.c
> >> @@ -507,17 +507,16 @@ static bool execlists_check_remove_request(struct intel_engine_cs *ring,
> >> return false;
> >> }
> >>
> >> -static void get_context_status(struct intel_engine_cs *ring,
> >> - u8 read_pointer,
> >> - u32 *status, u32 *context_id)
> >> +static u32 get_context_status(struct intel_engine_cs *ring, u8 read_pointer,
> >> + u32 *context_id)
> >> {
> >> struct drm_i915_private *dev_priv = ring->dev->dev_private;
> >>
> >> - if (WARN_ON(read_pointer >= GEN8_CSB_ENTRIES))
> >> - return;
> >> + read_pointer %= GEN8_CSB_ENTRIES;
> >>
> >> - *status = I915_READ(RING_CONTEXT_STATUS_BUF_LO(ring, read_pointer));
> >> *context_id = I915_READ(RING_CONTEXT_STATUS_BUF_HI(ring, read_pointer));
> >
> > Micro-optimising hat says not to even do the uncached, spinlocked mmio
> > read when not required.
>
> You mean move the forcewake grab out from elsp write to cover all mmio
> reads? These two patches make the irq handler around 3.7% smaller and
> moving the forcewake/uncore lock shrinks that by 1% more. Must be
> faster as well, if someone could measure it. :)
If only we already had benchmarks capable of measuring such
optimisations!
> diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
> index 1e7ccd0a6573..77a64008f53d 100644
> --- a/drivers/gpu/drm/i915/intel_lrc.c
> +++ b/drivers/gpu/drm/i915/intel_lrc.c
> @@ -375,8 +375,6 @@ static void execlists_elsp_write(struct drm_i915_gem_request *rq0,
> rq0->elsp_submitted++;
>
> /* You must always write both descriptors in the order below. */
> - spin_lock(&dev_priv->uncore.lock);
> - intel_uncore_forcewake_get__locked(dev_priv, FORCEWAKE_ALL);
> I915_WRITE_FW(RING_ELSP(ring), upper_32_bits(desc[1]));
> I915_WRITE_FW(RING_ELSP(ring), lower_32_bits(desc[1]));
>
> @@ -386,8 +384,6 @@ static void execlists_elsp_write(struct drm_i915_gem_request *rq0,
>
> /* ELSP is a wo register, use another nearby reg for posting */
> POSTING_READ_FW(RING_EXECLIST_STATUS_LO(ring));
> - intel_uncore_forcewake_put__locked(dev_priv, FORCEWAKE_ALL);
> - spin_unlock(&dev_priv->uncore.lock);
> }
>
> static int execlists_update_context(struct drm_i915_gem_request *rq)
> @@ -513,9 +509,9 @@ static u32 get_context_status(struct intel_engine_cs *ring, u8 read_pointer,
>
> read_pointer %= GEN8_CSB_ENTRIES;
>
> - *context_id = I915_READ(RING_CONTEXT_STATUS_BUF_HI(ring, read_pointer));
> + *context_id = I915_READ_FW(RING_CONTEXT_STATUS_BUF_HI(ring, read_pointer));
I was also suggesting that we don't need this read in almost 25% of
cases! :)
> @@ -616,9 +619,16 @@ static int execlists_context_queue(struct drm_i915_gem_request *request)
> }
>
> list_add_tail(&request->execlist_link, &ring->execlist_queue);
> - if (num_elements == 0)
> + if (num_elements == 0) {
> + spin_lock(&dev_priv->uncore.lock);
We need the irq variants here.
> + intel_uncore_forcewake_get__locked(dev_priv, FORCEWAKE_ALL);
-Chris
--
Chris Wilson, Intel Open Source Technology Centre
* Re: [PATCH 1/2] drm/i915: Do not return stale status / remove impossible WARN
2016-02-12 10:21 ` Chris Wilson
@ 2016-02-12 11:45 ` Tvrtko Ursulin
0 siblings, 0 replies; 14+ messages in thread
From: Tvrtko Ursulin @ 2016-02-12 11:45 UTC
To: Chris Wilson, Intel-gfx
On 12/02/16 10:21, Chris Wilson wrote:
> On Fri, Feb 12, 2016 at 10:05:58AM +0000, Tvrtko Ursulin wrote:
>>
>> On 11/02/16 21:00, Chris Wilson wrote:
>>> On Thu, Feb 11, 2016 at 06:03:09PM +0000, Tvrtko Ursulin wrote:
>>>> From: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
>>>>
>>>> The only caller of get_context_status ensures the read pointer stays
>>>> in range, so the WARN is impossible. Also, if the WARN were triggered
>>>> by a hypothetical new caller, stale status would be returned to them.
>>>>
>>>> It is better to wrap the pointer in the function itself to avoid both
>>>> issues, and it even results in smaller code.
>>>>
>>>> Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
>>>> ---
>>>> drivers/gpu/drm/i915/intel_lrc.c | 15 ++++++---------
>>>> 1 file changed, 6 insertions(+), 9 deletions(-)
>>>>
>>>> diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
>>>> index 89eb892df4ae..951f1e6af947 100644
>>>> --- a/drivers/gpu/drm/i915/intel_lrc.c
>>>> +++ b/drivers/gpu/drm/i915/intel_lrc.c
>>>> @@ -507,17 +507,16 @@ static bool execlists_check_remove_request(struct intel_engine_cs *ring,
>>>> return false;
>>>> }
>>>>
>>>> -static void get_context_status(struct intel_engine_cs *ring,
>>>> - u8 read_pointer,
>>>> - u32 *status, u32 *context_id)
>>>> +static u32 get_context_status(struct intel_engine_cs *ring, u8 read_pointer,
>>>> + u32 *context_id)
>>>> {
>>>> struct drm_i915_private *dev_priv = ring->dev->dev_private;
>>>>
>>>> - if (WARN_ON(read_pointer >= GEN8_CSB_ENTRIES))
>>>> - return;
>>>> + read_pointer %= GEN8_CSB_ENTRIES;
>>>>
>>>> - *status = I915_READ(RING_CONTEXT_STATUS_BUF_LO(ring, read_pointer));
>>>> *context_id = I915_READ(RING_CONTEXT_STATUS_BUF_HI(ring, read_pointer));
>>>
>>> Micro-optimising hat says not to even do the uncached, spinlocked mmio
>>> read when not required.
>>
>> You mean move the forcewake grab out from elsp write to cover all mmio
>> reads? These two patches make the irq handler around 3.7% smaller and
>> moving the forcewake/uncore lock shrinks that by 1% more. Must be
>> faster as well, if someone could measure it. :)
>
> If only we already had benchmarks capable of measuring such
> optimisations!
We do?
>> diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
>> index 1e7ccd0a6573..77a64008f53d 100644
>> --- a/drivers/gpu/drm/i915/intel_lrc.c
>> +++ b/drivers/gpu/drm/i915/intel_lrc.c
>> @@ -375,8 +375,6 @@ static void execlists_elsp_write(struct drm_i915_gem_request *rq0,
>> rq0->elsp_submitted++;
>>
>> /* You must always write both descriptors in the order below. */
>> - spin_lock(&dev_priv->uncore.lock);
>> - intel_uncore_forcewake_get__locked(dev_priv, FORCEWAKE_ALL);
>> I915_WRITE_FW(RING_ELSP(ring), upper_32_bits(desc[1]));
>> I915_WRITE_FW(RING_ELSP(ring), lower_32_bits(desc[1]));
>>
>> @@ -386,8 +384,6 @@ static void execlists_elsp_write(struct drm_i915_gem_request *rq0,
>>
>> /* ELSP is a wo register, use another nearby reg for posting */
>> POSTING_READ_FW(RING_EXECLIST_STATUS_LO(ring));
>> - intel_uncore_forcewake_put__locked(dev_priv, FORCEWAKE_ALL);
>> - spin_unlock(&dev_priv->uncore.lock);
>> }
>>
>> static int execlists_update_context(struct drm_i915_gem_request *rq)
>> @@ -513,9 +509,9 @@ static u32 get_context_status(struct intel_engine_cs *ring, u8 read_pointer,
>>
>> read_pointer %= GEN8_CSB_ENTRIES;
>>
>> - *context_id = I915_READ(RING_CONTEXT_STATUS_BUF_HI(ring, read_pointer));
>> + *context_id = I915_READ_FW(RING_CONTEXT_STATUS_BUF_HI(ring, read_pointer));
>
> I was also suggesting that we don't need this read in almost 25% of
> cases! :)
Oh right, yeah I can do that.
>> @@ -616,9 +619,16 @@ static int execlists_context_queue(struct drm_i915_gem_request *request)
>> }
>>
>> list_add_tail(&request->execlist_link, &ring->execlist_queue);
>> - if (num_elements == 0)
>> + if (num_elements == 0) {
>> + spin_lock(&dev_priv->uncore.lock);
>
> We need the irq variants here.
No, since the execlist lock further up already disables interrupts.
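The rule at work, reduced to a sketch (the lock names are stand-ins for
execlist_lock and uncore.lock; this is an illustration, not the driver
code): once an outer lock has disabled interrupts, nested locks taken
inside that critical section may use the plain variants, because an
interrupt can no longer arrive on this CPU and try to take them.

#include <linux/spinlock.h>

static DEFINE_SPINLOCK(outer_lock);     /* think: execlist_lock */
static DEFINE_SPINLOCK(inner_lock);     /* think: uncore.lock */

static void queue_and_submit(void)
{
        spin_lock_irq(&outer_lock);     /* interrupts off from here */

        spin_lock(&inner_lock);         /* plain variant suffices here */
        /* ... forcewake get, ELSP writes ... */
        spin_unlock(&inner_lock);

        spin_unlock_irq(&outer_lock);
}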
Regards,
Tvrtko
* [PATCH v2] drm/i915: Execlist irq handler micro optimisations
2016-02-11 18:03 ` [PATCH 2/2] drm/i915: Execlist irq handler micro optimisations Tvrtko Ursulin
2016-02-11 20:59 ` Chris Wilson
@ 2016-02-12 12:00 ` Tvrtko Ursulin
2016-02-12 13:46 ` Tvrtko Ursulin
2016-02-12 14:42 ` Chris Wilson
1 sibling, 2 replies; 14+ messages in thread
From: Tvrtko Ursulin @ 2016-02-12 12:00 UTC
To: Intel-gfx
From: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Assorted changes most likely without any practical effect
apart from a tiny reduction in generated code for the interrupt
handler and request submission.
* Remove needless initialization.
* Improve cache locality by reorganizing code and/or using
branch hints to keep unexpected or error conditions out
of line.
* Favor busy submit path vs. empty queue.
* Less branching in hot-paths.
v2:
* Avoid mmio reads when possible. (Chris Wilson)
* Use natural integer size for csb indices.
* Remove useless return value from execlists_update_context.
* Extract 32-bit ppgtt PDPs update so it is out of line and
shared with two callers.
* Grab forcewake across all mmio operations to ease the
load on the uncore lock and use cheaper mmio ops.
Version 2 now makes the irq handling code path ~20% smaller on
48-bit PPGTT hardware, and a little bit less elsewhere. Hot
paths are mostly in-line now and hammering on the uncore
spinlock is greatly reduced together with mmio traffic to an
extent.
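The forcewake change distils to the shape below (a sketch only; the
helper name is made up, but the accessors are the ones used in the
patch): one lock-plus-forcewake transaction brackets the whole section,
and every access inside uses the raw _FW accessors instead of paying the
lock and forcewake cost on each I915_READ.

/* hypothetical helper illustrating the pattern */
static u32 read_csb_ptr(struct intel_engine_cs *ring)
{
        struct drm_i915_private *dev_priv = ring->dev->dev_private;
        u32 status_pointer;

        spin_lock(&dev_priv->uncore.lock);
        intel_uncore_forcewake_get__locked(dev_priv, FORCEWAKE_ALL);

        status_pointer = I915_READ_FW(RING_CONTEXT_STATUS_PTR(ring));

        intel_uncore_forcewake_put__locked(dev_priv, FORCEWAKE_ALL);
        spin_unlock(&dev_priv->uncore.lock);

        return status_pointer;
}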
Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Cc: Chris Wilson <chris@chris-wilson.co.uk>
---
drivers/gpu/drm/i915/intel_lrc.c | 187 +++++++++++++++++---------------
drivers/gpu/drm/i915/intel_ringbuffer.h | 3 +-
2 files changed, 99 insertions(+), 91 deletions(-)
diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index a513a75d1fc9..a474e00be50e 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -269,6 +269,9 @@ logical_ring_init_platform_invariants(struct intel_engine_cs *ring)
{
struct drm_device *dev = ring->dev;
+ if (IS_GEN8(dev) || IS_GEN9(dev))
+ ring->idle_lite_restore_wa = ~0;
+
ring->disable_lite_restore_wa = (IS_SKL_REVID(dev, 0, SKL_REVID_B0) ||
IS_BXT_REVID(dev, 0, BXT_REVID_A1)) &&
(ring->id == VCS || ring->id == VCS2);
@@ -372,8 +375,6 @@ static void execlists_elsp_write(struct drm_i915_gem_request *rq0,
rq0->elsp_submitted++;
/* You must always write both descriptors in the order below. */
- spin_lock(&dev_priv->uncore.lock);
- intel_uncore_forcewake_get__locked(dev_priv, FORCEWAKE_ALL);
I915_WRITE_FW(RING_ELSP(ring), upper_32_bits(desc[1]));
I915_WRITE_FW(RING_ELSP(ring), lower_32_bits(desc[1]));
@@ -383,11 +384,18 @@ static void execlists_elsp_write(struct drm_i915_gem_request *rq0,
/* ELSP is a wo register, use another nearby reg for posting */
POSTING_READ_FW(RING_EXECLIST_STATUS_LO(ring));
- intel_uncore_forcewake_put__locked(dev_priv, FORCEWAKE_ALL);
- spin_unlock(&dev_priv->uncore.lock);
}
-static int execlists_update_context(struct drm_i915_gem_request *rq)
+static void
+execlists_update_context_pdps(struct i915_hw_ppgtt *ppgtt, u32 *reg_state)
+{
+ ASSIGN_CTX_PDP(ppgtt, reg_state, 3);
+ ASSIGN_CTX_PDP(ppgtt, reg_state, 2);
+ ASSIGN_CTX_PDP(ppgtt, reg_state, 1);
+ ASSIGN_CTX_PDP(ppgtt, reg_state, 0);
+}
+
+static void execlists_update_context(struct drm_i915_gem_request *rq)
{
struct intel_engine_cs *ring = rq->ring;
struct i915_hw_ppgtt *ppgtt = rq->ctx->ppgtt;
@@ -395,19 +403,13 @@ static int execlists_update_context(struct drm_i915_gem_request *rq)
reg_state[CTX_RING_TAIL+1] = rq->tail;
- if (ppgtt && !USES_FULL_48BIT_PPGTT(ppgtt->base.dev)) {
- /* True 32b PPGTT with dynamic page allocation: update PDP
- * registers and point the unallocated PDPs to scratch page.
- * PML4 is allocated during ppgtt init, so this is not needed
- * in 48-bit mode.
- */
- ASSIGN_CTX_PDP(ppgtt, reg_state, 3);
- ASSIGN_CTX_PDP(ppgtt, reg_state, 2);
- ASSIGN_CTX_PDP(ppgtt, reg_state, 1);
- ASSIGN_CTX_PDP(ppgtt, reg_state, 0);
- }
-
- return 0;
+ /* True 32b PPGTT with dynamic page allocation: update PDP
+ * registers and point the unallocated PDPs to scratch page.
+ * PML4 is allocated during ppgtt init, so this is not needed
+ * in 48-bit mode.
+ */
+ if (ppgtt && !USES_FULL_48BIT_PPGTT(ppgtt->base.dev))
+ execlists_update_context_pdps(ppgtt, reg_state);
}
static void execlists_submit_requests(struct drm_i915_gem_request *rq0,
@@ -424,7 +426,7 @@ static void execlists_submit_requests(struct drm_i915_gem_request *rq0,
static void execlists_context_unqueue(struct intel_engine_cs *ring)
{
struct drm_i915_gem_request *req0 = NULL, *req1 = NULL;
- struct drm_i915_gem_request *cursor = NULL, *tmp = NULL;
+ struct drm_i915_gem_request *cursor, *tmp;
assert_spin_locked(&ring->execlist_lock);
@@ -434,9 +436,6 @@ static void execlists_context_unqueue(struct intel_engine_cs *ring)
*/
WARN_ON(!intel_irqs_enabled(ring->dev->dev_private));
- if (list_empty(&ring->execlist_queue))
- return;
-
/* Try to read in pairs */
list_for_each_entry_safe(cursor, tmp, &ring->execlist_queue,
execlist_link) {
@@ -451,37 +450,35 @@ static void execlists_context_unqueue(struct intel_engine_cs *ring)
req0 = cursor;
} else {
req1 = cursor;
+ WARN_ON(req1->elsp_submitted);
break;
}
}
- if (IS_GEN8(ring->dev) || IS_GEN9(ring->dev)) {
+ if (unlikely(!req0))
+ return;
+
+ if (req0->elsp_submitted & ring->idle_lite_restore_wa) {
/*
- * WaIdleLiteRestore: make sure we never cause a lite
- * restore with HEAD==TAIL
+ * WaIdleLiteRestore: make sure we never cause a lite restore
+ * with HEAD==TAIL.
+ *
+ * Apply the wa NOOPS to prevent ring:HEAD == req:TAIL as we
+ * resubmit the request. See gen8_emit_request() for where we
+ * prepare the padding after the end of the request.
*/
- if (req0->elsp_submitted) {
- /*
- * Apply the wa NOOPS to prevent ring:HEAD == req:TAIL
- * as we resubmit the request. See gen8_emit_request()
- * for where we prepare the padding after the end of the
- * request.
- */
- struct intel_ringbuffer *ringbuf;
+ struct intel_ringbuffer *ringbuf;
- ringbuf = req0->ctx->engine[ring->id].ringbuf;
- req0->tail += 8;
- req0->tail &= ringbuf->size - 1;
- }
+ ringbuf = req0->ctx->engine[ring->id].ringbuf;
+ req0->tail += 8;
+ req0->tail &= ringbuf->size - 1;
}
- WARN_ON(req1 && req1->elsp_submitted);
-
execlists_submit_requests(req0, req1);
}
-static bool execlists_check_remove_request(struct intel_engine_cs *ring,
- u32 request_id)
+static unsigned int
+execlists_check_remove_request(struct intel_engine_cs *ring, u32 request_id)
{
struct drm_i915_gem_request *head_req;
@@ -491,32 +488,39 @@ static bool execlists_check_remove_request(struct intel_engine_cs *ring,
struct drm_i915_gem_request,
execlist_link);
- if (head_req != NULL) {
- if (intel_execlists_ctx_id(head_req->ctx, ring) == request_id) {
- WARN(head_req->elsp_submitted == 0,
- "Never submitted head request\n");
+ if (!head_req)
+ return 0;
- if (--head_req->elsp_submitted <= 0) {
- list_move_tail(&head_req->execlist_link,
- &ring->execlist_retired_req_list);
- return true;
- }
- }
- }
+ if (unlikely(intel_execlists_ctx_id(head_req->ctx, ring) != request_id))
+ return 0;
+
+ WARN(head_req->elsp_submitted == 0, "Never submitted head request\n");
+
+ if (--head_req->elsp_submitted > 0)
+ return 0;
+
+ list_move_tail(&head_req->execlist_link,
+ &ring->execlist_retired_req_list);
- return false;
+ return 1;
}
static u32 get_context_status(struct intel_engine_cs *ring, u8 read_pointer,
u32 *context_id)
{
struct drm_i915_private *dev_priv = ring->dev->dev_private;
+ u32 status;
read_pointer %= GEN8_CSB_ENTRIES;
- *context_id = I915_READ(RING_CONTEXT_STATUS_BUF_HI(ring, read_pointer));
+ status = I915_READ_FW(RING_CONTEXT_STATUS_BUF_LO(ring, read_pointer));
- return I915_READ(RING_CONTEXT_STATUS_BUF_LO(ring, read_pointer));
+ if (status & GEN8_CTX_STATUS_IDLE_ACTIVE)
+ return 0;
+
+ *context_id = I915_READ_FW(RING_CONTEXT_STATUS_BUF_HI(ring, read_pointer));
+
+ return status;
}
/**
@@ -530,28 +534,27 @@ void intel_lrc_irq_handler(struct intel_engine_cs *ring)
{
struct drm_i915_private *dev_priv = ring->dev->dev_private;
u32 status_pointer;
- u8 read_pointer;
- u8 write_pointer;
+ unsigned int read_pointer, write_pointer;
u32 status = 0;
u32 status_id;
- u32 submit_contexts = 0;
+ unsigned int submit_contexts = 0;
+
+ spin_lock(&ring->execlist_lock);
+
+ spin_lock(&dev_priv->uncore.lock);
+ intel_uncore_forcewake_get__locked(dev_priv, FORCEWAKE_ALL);
- status_pointer = I915_READ(RING_CONTEXT_STATUS_PTR(ring));
+ status_pointer = I915_READ_FW(RING_CONTEXT_STATUS_PTR(ring));
read_pointer = ring->next_context_status_buffer;
write_pointer = GEN8_CSB_WRITE_PTR(status_pointer);
if (read_pointer > write_pointer)
write_pointer += GEN8_CSB_ENTRIES;
- spin_lock(&ring->execlist_lock);
-
while (read_pointer < write_pointer) {
status = get_context_status(ring, ++read_pointer, &status_id);
- if (status & GEN8_CTX_STATUS_IDLE_ACTIVE)
- continue;
-
- if (status & GEN8_CTX_STATUS_PREEMPTED) {
+ if (unlikely(status & GEN8_CTX_STATUS_PREEMPTED)) {
if (status & GEN8_CTX_STATUS_LITE_RESTORE) {
if (execlists_check_remove_request(ring, status_id))
WARN(1, "Lite Restored request removed from queue\n");
@@ -559,38 +562,37 @@ void intel_lrc_irq_handler(struct intel_engine_cs *ring)
WARN(1, "Preemption without Lite Restore\n");
}
- if ((status & GEN8_CTX_STATUS_ACTIVE_IDLE) ||
- (status & GEN8_CTX_STATUS_ELEMENT_SWITCH)) {
- if (execlists_check_remove_request(ring, status_id))
- submit_contexts++;
- }
+ if (status & (GEN8_CTX_STATUS_ACTIVE_IDLE |
+ GEN8_CTX_STATUS_ELEMENT_SWITCH))
+ submit_contexts +=
+ execlists_check_remove_request(ring, status_id);
}
- if (ring->disable_lite_restore_wa) {
- /* Prevent a ctx to preempt itself */
- if ((status & GEN8_CTX_STATUS_ACTIVE_IDLE) &&
- (submit_contexts != 0))
- execlists_context_unqueue(ring);
- } else if (submit_contexts != 0) {
+ if (submit_contexts && (!ring->disable_lite_restore_wa ||
+ (ring->disable_lite_restore_wa && (status &
+ GEN8_CTX_STATUS_ACTIVE_IDLE))))
execlists_context_unqueue(ring);
- }
-
- spin_unlock(&ring->execlist_lock);
-
- if (unlikely(submit_contexts > 2))
- DRM_ERROR("More than two context complete events?\n");
ring->next_context_status_buffer = write_pointer % GEN8_CSB_ENTRIES;
/* Update the read pointer to the old write pointer. Manual ringbuffer
* management ftw </sarcasm> */
- I915_WRITE(RING_CONTEXT_STATUS_PTR(ring),
- _MASKED_FIELD(GEN8_CSB_READ_PTR_MASK,
- ring->next_context_status_buffer << 8));
+ I915_WRITE_FW(RING_CONTEXT_STATUS_PTR(ring),
+ _MASKED_FIELD(GEN8_CSB_READ_PTR_MASK,
+ ring->next_context_status_buffer << 8));
+
+ intel_uncore_forcewake_put__locked(dev_priv, FORCEWAKE_ALL);
+ spin_unlock(&dev_priv->uncore.lock);
+
+ spin_unlock(&ring->execlist_lock);
+
+ if (unlikely(submit_contexts > 2))
+ DRM_ERROR("More than two context complete events?\n");
}
static int execlists_context_queue(struct drm_i915_gem_request *request)
{
+ struct drm_i915_private *dev_priv = request->i915;
struct intel_engine_cs *ring = request->ring;
struct drm_i915_gem_request *cursor;
int num_elements = 0;
@@ -622,9 +624,16 @@ static int execlists_context_queue(struct drm_i915_gem_request *request)
}
list_add_tail(&request->execlist_link, &ring->execlist_queue);
- if (num_elements == 0)
+ if (num_elements == 0) {
+ spin_lock(&dev_priv->uncore.lock);
+ intel_uncore_forcewake_get__locked(dev_priv, FORCEWAKE_ALL);
+
execlists_context_unqueue(ring);
+ intel_uncore_forcewake_put__locked(dev_priv, FORCEWAKE_ALL);
+ spin_unlock(&dev_priv->uncore.lock);
+ }
+
spin_unlock_irq(&ring->execlist_lock);
return 0;
@@ -2013,6 +2022,7 @@ void intel_logical_ring_cleanup(struct intel_engine_cs *ring)
ring->status_page.obj = NULL;
}
+ ring->idle_lite_restore_wa = 0;
ring->disable_lite_restore_wa = false;
ring->ctx_desc_template = 0;
@@ -2417,10 +2427,7 @@ populate_lr_context(struct intel_context *ctx, struct drm_i915_gem_object *ctx_o
* With dynamic page allocation, PDPs may not be allocated at
* this point. Point the unallocated PDPs to the scratch page
*/
- ASSIGN_CTX_PDP(ppgtt, reg_state, 3);
- ASSIGN_CTX_PDP(ppgtt, reg_state, 2);
- ASSIGN_CTX_PDP(ppgtt, reg_state, 1);
- ASSIGN_CTX_PDP(ppgtt, reg_state, 0);
+ execlists_update_context_pdps(ppgtt, reg_state);
}
if (ring->id == RCS) {
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
index 566b0ae10ce0..dd910d30a380 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.h
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
@@ -271,7 +271,8 @@ struct intel_engine_cs {
spinlock_t execlist_lock;
struct list_head execlist_queue;
struct list_head execlist_retired_req_list;
- u8 next_context_status_buffer;
+ unsigned int next_context_status_buffer;
+ unsigned int idle_lite_restore_wa;
bool disable_lite_restore_wa;
u32 ctx_desc_template;
u32 irq_keep_mask; /* bitmask for interrupts that should not be masked */
--
1.9.1
* Re: [PATCH v2] drm/i915: Execlist irq handler micro optimisations
2016-02-12 12:00 ` [PATCH v2] " Tvrtko Ursulin
@ 2016-02-12 13:46 ` Tvrtko Ursulin
2016-02-12 14:30 ` Chris Wilson
2016-02-12 14:42 ` Chris Wilson
1 sibling, 1 reply; 14+ messages in thread
From: Tvrtko Ursulin @ 2016-02-12 13:46 UTC
To: Intel-gfx
On 12/02/16 12:00, Tvrtko Ursulin wrote:
> From: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
>
> Assorted changes most likely without any practical effect
> apart from a tiny reduction in generated code for the interrupt
> handler and request submission.
>
> * Remove needless initialization.
> * Improve cache locality by reorganizing code and/or using
> branch hints to keep unexpected or error conditions out
> of line.
> * Favor busy submit path vs. empty queue.
> * Less branching in hot-paths.
>
> v2:
>
> * Avoid mmio reads when possible. (Chris Wilson)
> * Use natural integer size for csb indices.
> * Remove useless return value from execlists_update_context.
> * Extract 32-bit ppgtt PDPs update so it is out of line and
> shared with two callers.
> * Grab forcewake across all mmio operations to ease the
> load on the uncore lock and use cheaper mmio ops.
>
> Version 2 now makes the irq handling code path ~20% smaller on
> 48-bit PPGTT hardware, and a little bit less elsewhere. Hot
> paths are mostly in-line now and hammering on the uncore
> spinlock is greatly reduced together with mmio traffic to an
> extent.
Is gem_latency an interesting benchmark for this?
Five runs on vanilla:
747693/1: 9.080us 2.000us 2.000us 121.840us
742108/1: 9.060us 2.520us 2.520us 122.645us
744097/1: 9.060us 2.000us 2.000us 122.372us
744056/1: 9.180us 1.980us 1.980us 122.394us
742610/1: 9.040us 2.560us 2.560us 122.525us
Five runs with this patch series:
786532/1: 10.760us 1.520us 1.520us 115.705us
780735/1: 10.740us 1.580us 1.580us 116.558us
783706/1: 10.800us 1.460us 1.460us 116.280us
784135/1: 10.800us 1.520us 1.520us 116.151us
784037/1: 10.740us 1.520us 1.520us 116.250us
So it looks all got better apart from dispatch latency.
5% more throughput, 30% better consumer and producer latencies, 5% less
CPU usage, but 18% worse dispatch latency.
Comments?
Regards,
Tvrtko
* Re: [PATCH v2] drm/i915: Execlist irq handler micro optimisations
2016-02-12 13:46 ` Tvrtko Ursulin
@ 2016-02-12 14:30 ` Chris Wilson
0 siblings, 0 replies; 14+ messages in thread
From: Chris Wilson @ 2016-02-12 14:30 UTC
To: Tvrtko Ursulin; +Cc: Intel-gfx
On Fri, Feb 12, 2016 at 01:46:35PM +0000, Tvrtko Ursulin wrote:
>
>
> On 12/02/16 12:00, Tvrtko Ursulin wrote:
> >From: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> >
> >Assorted changes most likely without any practical effect
> >apart from a tiny reduction in generated code for the interrupt
> >handler and request submission.
> >
> > * Remove needless initialization.
> > * Improve cache locality by reorganizing code and/or using
> > branch hints to keep unexpected or error conditions out
> > of line.
> > * Favor busy submit path vs. empty queue.
> > * Less branching in hot-paths.
> >
> >v2:
> >
> > * Avoid mmio reads when possible. (Chris Wilson)
> > * Use natural integer size for csb indices.
> > * Remove useless return value from execlists_update_context.
> > * Extract 32-bit ppgtt PDPs update so it is out of line and
> > shared with two callers.
> > * Grab forcewake across all mmio operations to ease the
> > load on the uncore lock and use cheaper mmio ops.
> >
> >Version 2 now makes the irq handling code path ~20% smaller on
> >48-bit PPGTT hardware, and a little bit less elsewhere. Hot
> >paths are mostly in-line now and hammering on the uncore
> >spinlock is greatly reduced together with mmio traffic to an
> >extent.
>
> Is gem_latency an interesting benchmark for this?
Yes, we should be able to detect changes on the order of 100ns and, if
the results are stable and above the noise, then definitely.
"./gem_latency" sends one batch and then waits upon it, so I only expect
the patch to directly affect the dispatch latency. I'd say the wake-up
latency improvement is solely due to reducing the spinlock contention.
"./gem_latency -n 100" would queue 100 no-op batches before the
measurement. That may help to look at the overhead of handling the
context-switch interrupt.
-Chris
--
Chris Wilson, Intel Open Source Technology Centre
* Re: [PATCH v2] drm/i915: Execlist irq handler micro optimisations
2016-02-12 12:00 ` [PATCH v2] " Tvrtko Ursulin
2016-02-12 13:46 ` Tvrtko Ursulin
@ 2016-02-12 14:42 ` Chris Wilson
2016-02-12 15:54 ` Tvrtko Ursulin
1 sibling, 1 reply; 14+ messages in thread
From: Chris Wilson @ 2016-02-12 14:42 UTC
To: Tvrtko Ursulin; +Cc: Intel-gfx
On Fri, Feb 12, 2016 at 12:00:40PM +0000, Tvrtko Ursulin wrote:
> From: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
>
> Assorted changes most likely without any practical effect
> apart from a tiny reduction in generated code for the interrupt
> handler and request submission.
>
> * Remove needless initialization.
> * Improve cache locality by reorganizing code and/or using
> branch hints to keep unexpected or error conditions out
> of line.
> * Favor busy submit path vs. empty queue.
> * Less branching in hot-paths.
>
> v2:
>
> * Avoid mmio reads when possible. (Chris Wilson)
> * Use natural integer size for csb indices.
> * Remove useless return value from execlists_update_context.
> * Extract 32-bit ppgtt PDPs update so it is out of line and
> shared with two callers.
> * Grab forcewake across all mmio operations to ease the
> load on the uncore lock and use cheaper mmio ops.
>
> Version 2 now makes the irq handling code path ~20% smaller on
> 48-bit PPGTT hardware, and a little bit less elsewhere. Hot
> paths are mostly in-line now and hammering on the uncore
> spinlock is greatly reduced together with mmio traffic to an
> extent.
Did you notice that ring->next_context_status_buffer is redundant as we
also have that information to hand in status_pointer?
What's your thinking for
if (req->elsp_submitted & ring->gen8_9)
vs a plain
if (req->elsp_submitted)
?
The tidies look good. It would be useful to double check whether
gem_latency is behaving as a canary; it's a bit of a puzzle why that
first dispatch latency would grow.
-Chris
--
Chris Wilson, Intel Open Source Technology Centre
* Re: [PATCH v2] drm/i915: Execlist irq handler micro optimisations
2016-02-12 14:42 ` Chris Wilson
@ 2016-02-12 15:54 ` Tvrtko Ursulin
2016-02-12 16:22 ` Chris Wilson
0 siblings, 1 reply; 14+ messages in thread
From: Tvrtko Ursulin @ 2016-02-12 15:54 UTC
To: Chris Wilson, Intel-gfx, Tvrtko Ursulin
On 12/02/16 14:42, Chris Wilson wrote:
> On Fri, Feb 12, 2016 at 12:00:40PM +0000, Tvrtko Ursulin wrote:
>> From: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
>>
>> Assorted changes most likely without any practical effect
>> apart from a tiny reduction in generated code for the interrupt
>> handler and request submission.
>>
>> * Remove needless initialization.
>> * Improve cache locality by reorganizing code and/or using
>> branch hints to keep unexpected or error conditions out
>> of line.
>> * Favor busy submit path vs. empty queue.
>> * Less branching in hot-paths.
>>
>> v2:
>>
>> * Avoid mmio reads when possible. (Chris Wilson)
>> * Use natural integer size for csb indices.
>> * Remove useless return value from execlists_update_context.
>> * Extract 32-bit ppgtt PDPs update so it is out of line and
>> shared with two callers.
>> * Grab forcewake across all mmio operations to ease the
>> load on the uncore lock and use cheaper mmio ops.
>>
>> Version 2 now makes the irq handling code path ~20% smaller on
>> 48-bit PPGTT hardware, and a little bit less elsewhere. Hot
>> paths are mostly in-line now and hammering on the uncore
>> spinlock is greatly reduced together with mmio traffic to an
>> extent.
>
> Did you notice that ring->next_context_status_buffer is redundant as we
> also have that information to hand in status_pointer?
I didn't and don't know that part that well. There might be some future
proofing issues around it as well.
> What's your thinking for
>
> if (req->elsp_submitted & ring->gen8_9)
>
> vs a plain
>
> if (req->elsp_submitted)
> ?
Another part I don't know that well. Is it not useful to avoid
submitting the two noops when they are not needed? Do they still end up
submitted to the GPU somehow?
> The tidies look good. It would be useful to double check whether
> gem_latency is behaving as a canary; it's a bit of a puzzle why that
> first dispatch latency would grow.
Yes a puzzle, no idea how and why. But "gem_latency -n 100" does not
show this regression. I've done a hundred runs and these are the results:
* Throughput up 4.04%
* Dispatch latency down 0.37%
* Consumer and producer latencies down 22.53%
* CPU time down 2.25%
So it all looks good.
Regards,
Tvrtko
* Re: [PATCH v2] drm/i915: Execlist irq handler micro optimisations
2016-02-12 15:54 ` Tvrtko Ursulin
@ 2016-02-12 16:22 ` Chris Wilson
0 siblings, 0 replies; 14+ messages in thread
From: Chris Wilson @ 2016-02-12 16:22 UTC (permalink / raw)
To: Tvrtko Ursulin; +Cc: Intel-gfx
On Fri, Feb 12, 2016 at 03:54:27PM +0000, Tvrtko Ursulin wrote:
>
> On 12/02/16 14:42, Chris Wilson wrote:
> >On Fri, Feb 12, 2016 at 12:00:40PM +0000, Tvrtko Ursulin wrote:
> >>From: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> >>
> >>Assorted changes most likely without any practical effect
> >>apart from a tiny reduction in generated code for the interrupt
> >>handler and request submission.
> >>
> >> * Remove needless initialization.
> >> * Improve cache locality by reorganizing code and/or using
> >> branch hints to keep unexpected or error conditions out
> >> of line.
> >> * Favor busy submit path vs. empty queue.
> >> * Less branching in hot-paths.
> >>
> >>v2:
> >>
> >> * Avoid mmio reads when possible. (Chris Wilson)
> >> * Use natural integer size for csb indices.
> >> * Remove useless return value from execlists_update_context.
> >> * Extract 32-bit ppgtt PDPs update so it is out of line and
> >> shared with two callers.
> >> * Grab forcewake across all mmio operations to ease the
> >> load on the uncore lock and use cheaper mmio ops.
> >>
> >>Version 2 now makes the irq handling code path ~20% smaller on
> >>48-bit PPGTT hardware, and a little bit less elsewhere. Hot
> >>paths are mostly in-line now and hammering on the uncore
> >>spinlock is greatly reduced together with mmio traffic to an
> >>extent.
> >
> >Did you notice that ring->next_context_status_buffer is redundant as we
> >also have that information to hand in status_pointer?
>
> I didn't and don't know that part that well. There might be some
> future proofing issues around it as well.
Unlikely :-p
> >What's your thinking for
> >
> > if (req->elsp_submitted & ring->gen8_9)
> >
> >vs a plain
> >
> > if (req->elsp_submitted)
> >?
>
> Another part I don't know that well. Is it not useful to avoid
> submitting the two noops when they are not needed? Do they still end
> up submitted to the GPU somehow?
The command streamer always has to execute them since they lie between
the last dispatch TAIL and the next TAIL (in the lrc). All we do here is
to tweak the request->tail value, which may or may not be used the next
time we write the ELSP (depending on whether we are submitting the same
live request again). (The next request's tail will include the noops
before its dispatch.)
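As an aside, the req->tail arithmetic relies on the ring size being a
power of two, so masking with (size - 1) is a branch-free modulo; a
small standalone illustration (the sizes here are made up):

#include <stdio.h>

int main(void)
{
        unsigned int size = 4096;       /* power-of-two ring size */
        unsigned int tail = 4092;       /* near the end of the ring */

        tail += 8;                      /* skip the two wa NOOP dwords */
        tail &= size - 1;               /* 4100 wraps to 4 */

        printf("tail = %u\n", tail);
        return 0;
}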
> >The tidies look good. It would be useful to double check whether
> >gem_latency is behaving as a canary; it's a bit of a puzzle why that
> >first dispatch latency would grow.
>
> Yes a puzzle, no idea how and why. But "gem_latency -n 100" does not
> show this regression. I've done a hundred runs and these are the
> results:
>
> * Throughput up 4.04%
> * Dispatch latency down 0.37%
> * Consumer and producer latencies down 22.53%
> * CPU time down 2.25%
>
> So it all looks good.
Yup.
-Chris
--
Chris Wilson, Intel Open Source Technology Centre
* ✗ Fi.CI.BAT: failure for series starting with [2/2] drm/i915: Execlist irq handler micro optimisations (rev3)
2016-02-11 18:03 [PATCH 1/2] drm/i915: Do not return stale status / remove impossible WARN Tvrtko Ursulin
2016-02-11 18:03 ` [PATCH 2/2] drm/i915: Execlist irq handler micro optimisations Tvrtko Ursulin
2016-02-11 21:00 ` [PATCH 1/2] drm/i915: Do not return stale status / remove impossible WARN Chris Wilson
@ 2016-02-16 8:10 ` Patchwork
2 siblings, 0 replies; 14+ messages in thread
From: Patchwork @ 2016-02-16 8:10 UTC
To: Tvrtko Ursulin; +Cc: intel-gfx
== Summary ==
Series 3304v3 Series without cover letter
2016-02-12T12:00:45.925088 http://patchwork.freedesktop.org/api/1.0/series/3304/revisions/3/mbox/
Applying: drm/i915: Execlist irq handler micro optimisations
Repository lacks necessary blobs to fall back on 3-way merge.
Cannot fall back to three-way merge.
Patch failed at 0001 drm/i915: Execlist irq handler micro optimisations