* [PATCH 1/2] drm/i915: Cache some registers in execlists hot paths
@ 2016-03-23 15:15 Tvrtko Ursulin
2016-03-23 15:15 ` [PATCH 2/2] drm/i915: Remove impossibe checks from i915_gem_request_add_to_client Tvrtko Ursulin
` (2 more replies)
0 siblings, 3 replies; 9+ messages in thread
From: Tvrtko Ursulin @ 2016-03-23 15:15 UTC (permalink / raw)
To: Intel-gfx
From: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
GCC cannot optimize well calculations hidden in macros and
assigned to temporary structures. We can cache the register in
ELSP write, and refactor reading of the CSB a bit to enable
it to do a better job.
Code is still equally readable but the generated body of the
CSB read loop is 30% smaller, and since that loop runs at
least once per interrupt, which in turn can fire in tens or
hundreds thousands times per second, must be of some value.
Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
---
drivers/gpu/drm/i915/intel_lrc.c | 26 +++++++++++++-------------
drivers/gpu/drm/i915/intel_lrc.h | 11 ++++++++---
2 files changed, 21 insertions(+), 16 deletions(-)
diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index 3a23b9549f7b..67592f8354d6 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -361,8 +361,8 @@ static void execlists_elsp_write(struct drm_i915_gem_request *rq0,
{
struct intel_engine_cs *engine = rq0->engine;
- struct drm_device *dev = engine->dev;
- struct drm_i915_private *dev_priv = dev->dev_private;
+ struct drm_i915_private *dev_priv = rq0->i915;
+ i915_reg_t elsp_reg = RING_ELSP(engine);
uint64_t desc[2];
if (rq1) {
@@ -376,12 +376,12 @@ static void execlists_elsp_write(struct drm_i915_gem_request *rq0,
rq0->elsp_submitted++;
/* You must always write both descriptors in the order below. */
- I915_WRITE_FW(RING_ELSP(engine), upper_32_bits(desc[1]));
- I915_WRITE_FW(RING_ELSP(engine), lower_32_bits(desc[1]));
+ I915_WRITE_FW(elsp_reg, upper_32_bits(desc[1]));
+ I915_WRITE_FW(elsp_reg, lower_32_bits(desc[1]));
- I915_WRITE_FW(RING_ELSP(engine), upper_32_bits(desc[0]));
+ I915_WRITE_FW(elsp_reg, upper_32_bits(desc[0]));
/* The context is automatically loaded after the following */
- I915_WRITE_FW(RING_ELSP(engine), lower_32_bits(desc[0]));
+ I915_WRITE_FW(elsp_reg, lower_32_bits(desc[0]));
/* ELSP is a wo register, use another nearby reg for posting */
POSTING_READ_FW(RING_EXECLIST_STATUS_LO(engine));
@@ -517,21 +517,19 @@ execlists_check_remove_request(struct intel_engine_cs *engine, u32 request_id)
}
static u32
-get_context_status(struct intel_engine_cs *engine, unsigned int read_pointer,
- u32 *context_id)
+get_context_status(struct drm_i915_private *dev_priv, u32 csb_base,
+ unsigned int read_pointer, u32 *context_id)
{
- struct drm_i915_private *dev_priv = engine->dev->dev_private;
u32 status;
read_pointer %= GEN8_CSB_ENTRIES;
- status = I915_READ_FW(RING_CONTEXT_STATUS_BUF_LO(engine, read_pointer));
+ status = I915_READ_FW(RING_CSB_LO(csb_base, read_pointer));
if (status & GEN8_CTX_STATUS_IDLE_ACTIVE)
return 0;
- *context_id = I915_READ_FW(RING_CONTEXT_STATUS_BUF_HI(engine,
- read_pointer));
+ *context_id = I915_READ_FW(RING_CSB_HI(csb_base, read_pointer));
return status;
}
@@ -548,6 +546,7 @@ void intel_lrc_irq_handler(struct intel_engine_cs *engine)
struct drm_i915_private *dev_priv = engine->dev->dev_private;
u32 status_pointer;
unsigned int read_pointer, write_pointer;
+ u32 csb_base = RING_CSB_BASE(engine);
u32 csb[GEN8_CSB_ENTRIES][2];
unsigned int csb_read = 0, i;
unsigned int submit_contexts = 0;
@@ -565,7 +564,8 @@ void intel_lrc_irq_handler(struct intel_engine_cs *engine)
while (read_pointer < write_pointer) {
if (WARN_ON_ONCE(csb_read == GEN8_CSB_ENTRIES))
break;
- csb[csb_read][0] = get_context_status(engine, ++read_pointer,
+ csb[csb_read][0] = get_context_status(dev_priv, csb_base,
+ ++read_pointer,
&csb[csb_read][1]);
csb_read++;
}
diff --git a/drivers/gpu/drm/i915/intel_lrc.h b/drivers/gpu/drm/i915/intel_lrc.h
index a17cb12221ba..6690d93d603f 100644
--- a/drivers/gpu/drm/i915/intel_lrc.h
+++ b/drivers/gpu/drm/i915/intel_lrc.h
@@ -34,9 +34,14 @@
#define CTX_CTRL_INHIBIT_SYN_CTX_SWITCH (1 << 3)
#define CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT (1 << 0)
#define CTX_CTRL_RS_CTX_ENABLE (1 << 1)
-#define RING_CONTEXT_STATUS_BUF_LO(ring, i) _MMIO((ring)->mmio_base + 0x370 + (i) * 8)
-#define RING_CONTEXT_STATUS_BUF_HI(ring, i) _MMIO((ring)->mmio_base + 0x370 + (i) * 8 + 4)
-#define RING_CONTEXT_STATUS_PTR(ring) _MMIO((ring)->mmio_base + 0x3a0)
+
+#define RING_CSB_BASE(ring) ((ring)->mmio_base + 0x370)
+#define RING_CSB_LO(csb_base, i) _MMIO((csb_base) + (i) * 8)
+#define RING_CSB_HI(csb_base, i) _MMIO((csb_base) + (i) * 8 + 4)
+
+#define RING_CONTEXT_STATUS_BUF_LO(ring, i) RING_CSB_LO(RING_CSB_BASE(ring), i)
+#define RING_CONTEXT_STATUS_BUF_HI(ring, i) RING_CSB_HI(RING_CSB_BASE(ring), i)
+#define RING_CONTEXT_STATUS_PTR(ring) _MMIO((ring)->mmio_base + 0x3a0)
/* The docs specify that the write pointer wraps around after 5h, "After status
* is written out to the last available status QW at offset 5h, this pointer
--
1.9.1
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx
^ permalink raw reply related [flat|nested] 9+ messages in thread
* [PATCH 2/2] drm/i915: Remove impossibe checks from i915_gem_request_add_to_client
2016-03-23 15:15 [PATCH 1/2] drm/i915: Cache some registers in execlists hot paths Tvrtko Ursulin
@ 2016-03-23 15:15 ` Tvrtko Ursulin
2016-03-23 15:31 ` Chris Wilson
2016-03-23 16:06 ` ✗ Fi.CI.BAT: failure for series starting with [1/2] drm/i915: Cache some registers in execlists hot paths Patchwork
2016-03-24 11:16 ` [PATCH 1/2] " Chris Wilson
2 siblings, 1 reply; 9+ messages in thread
From: Tvrtko Ursulin @ 2016-03-23 15:15 UTC (permalink / raw)
To: Intel-gfx
From: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
There is only one caller of this functions which calls it under
strictly defined conditions. Error checking and return value
can be removed.
Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
---
BTW it looks like the whole point of this request list is to implement
the throttle ioctl? Looks like a strange ioctl with a fixed 20ms criteria,
could it be re-implemented somehow without having to have this list?
Does it have to be per-client or could it wait for any requests on any
engine emitted 20ms ago?
drivers/gpu/drm/i915/i915_drv.h | 4 ++--
drivers/gpu/drm/i915/i915_gem.c | 14 ++------------
drivers/gpu/drm/i915/i915_gem_execbuffer.c | 4 +---
3 files changed, 5 insertions(+), 17 deletions(-)
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 8727746cecd2..eaf015918b81 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -2292,8 +2292,8 @@ i915_gem_request_alloc(struct intel_engine_cs *engine,
struct intel_context *ctx);
void i915_gem_request_cancel(struct drm_i915_gem_request *req);
void i915_gem_request_free(struct kref *req_ref);
-int i915_gem_request_add_to_client(struct drm_i915_gem_request *req,
- struct drm_file *file);
+void i915_gem_request_add_to_client(struct drm_i915_gem_request *req,
+ struct drm_file *file);
static inline uint32_t
i915_gem_request_get_seqno(struct drm_i915_gem_request *req)
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index 8588c83abb35..6b4f271602f3 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -1367,19 +1367,11 @@ out:
return ret;
}
-int i915_gem_request_add_to_client(struct drm_i915_gem_request *req,
- struct drm_file *file)
+void i915_gem_request_add_to_client(struct drm_i915_gem_request *req,
+ struct drm_file *file)
{
struct drm_i915_file_private *file_priv;
- WARN_ON(!req || !file || req->file_priv);
-
- if (!req || !file)
- return -EINVAL;
-
- if (req->file_priv)
- return -EINVAL;
-
file_priv = file->driver_priv;
spin_lock(&file_priv->mm.lock);
@@ -1388,8 +1380,6 @@ int i915_gem_request_add_to_client(struct drm_i915_gem_request *req,
spin_unlock(&file_priv->mm.lock);
req->pid = get_pid(task_pid(current));
-
- return 0;
}
static inline void
diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
index 374a0cb7a092..9295168cce10 100644
--- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
@@ -1620,9 +1620,7 @@ i915_gem_do_execbuffer(struct drm_device *dev, void *data,
goto err_batch_unpin;
}
- ret = i915_gem_request_add_to_client(req, file);
- if (ret)
- goto err_batch_unpin;
+ i915_gem_request_add_to_client(req, file);
/*
* Save assorted stuff away to pass through to *_submission().
--
1.9.1
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx
^ permalink raw reply related [flat|nested] 9+ messages in thread
* Re: [PATCH 2/2] drm/i915: Remove impossibe checks from i915_gem_request_add_to_client
2016-03-23 15:15 ` [PATCH 2/2] drm/i915: Remove impossibe checks from i915_gem_request_add_to_client Tvrtko Ursulin
@ 2016-03-23 15:31 ` Chris Wilson
2016-03-24 9:49 ` Tvrtko Ursulin
0 siblings, 1 reply; 9+ messages in thread
From: Chris Wilson @ 2016-03-23 15:31 UTC (permalink / raw)
To: Tvrtko Ursulin; +Cc: Intel-gfx
On Wed, Mar 23, 2016 at 03:15:03PM +0000, Tvrtko Ursulin wrote:
> From: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
>
> There is only one caller of this functions which calls it under
> strictly defined conditions. Error checking and return value
> can be removed.
>
> Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> ---
> BTW it looks like the whole point of this request list is to implement
> the throttle ioctl? Looks like a strange ioctl with a fixed 20ms criteria,
> could it be re-implemented somehow without having to have this list?
>
> Does it have to be per-client or could it wait for any requests on any
> engine emitted 20ms ago?
It is per-client, and I long wished the 20ms wasn't defined by the ABI.
What I proposed doing is this:
https://cgit.freedesktop.org/~ickle/linux-2.6/commit/?h=breadcrumbs&id=e33947505b54582964c8c202b22f88fc5705f484
Please note that this also fixes a race condition I've seen in the wild.
-Chris
--
Chris Wilson, Intel Open Source Technology Centre
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx
^ permalink raw reply [flat|nested] 9+ messages in thread
* ✗ Fi.CI.BAT: failure for series starting with [1/2] drm/i915: Cache some registers in execlists hot paths
2016-03-23 15:15 [PATCH 1/2] drm/i915: Cache some registers in execlists hot paths Tvrtko Ursulin
2016-03-23 15:15 ` [PATCH 2/2] drm/i915: Remove impossibe checks from i915_gem_request_add_to_client Tvrtko Ursulin
@ 2016-03-23 16:06 ` Patchwork
2016-03-24 11:16 ` [PATCH 1/2] " Chris Wilson
2 siblings, 0 replies; 9+ messages in thread
From: Patchwork @ 2016-03-23 16:06 UTC (permalink / raw)
To: Tvrtko Ursulin; +Cc: intel-gfx
== Series Details ==
Series: series starting with [1/2] drm/i915: Cache some registers in execlists hot paths
URL : https://patchwork.freedesktop.org/series/4808/
State : failure
== Summary ==
Series 4808v1 Series without cover letter
http://patchwork.freedesktop.org/api/1.0/series/4808/revisions/1/mbox/
Test drv_module_reload_basic:
pass -> INCOMPLETE (hsw-brixbox)
Test pm_rpm:
Subgroup basic-pci-d3-state:
pass -> DMESG-WARN (bsw-nuc-2)
dmesg-warn -> PASS (byt-nuc)
Subgroup basic-rte:
dmesg-warn -> PASS (bsw-nuc-2)
bdw-nuci7 total:192 pass:180 dwarn:0 dfail:0 fail:0 skip:12
bdw-ultra total:192 pass:171 dwarn:0 dfail:0 fail:0 skip:21
bsw-nuc-2 total:192 pass:154 dwarn:1 dfail:0 fail:0 skip:37
byt-nuc total:192 pass:157 dwarn:0 dfail:0 fail:0 skip:35
hsw-brixbox total:59 pass:51 dwarn:0 dfail:0 fail:0 skip:7
hsw-gt2 total:192 pass:175 dwarn:0 dfail:0 fail:0 skip:17
ilk-hp8440p total:192 pass:129 dwarn:0 dfail:0 fail:0 skip:63
ivb-t430s total:192 pass:167 dwarn:0 dfail:0 fail:0 skip:25
skl-i7k-2 total:192 pass:169 dwarn:0 dfail:0 fail:0 skip:23
skl-nuci5 total:192 pass:181 dwarn:0 dfail:0 fail:0 skip:11
Results at /archive/results/CI_IGT_test/Patchwork_1692/
6f788978796a35996bea8795db70f9c4194b70a2 drm-intel-nightly: 2016y-03m-23d-12h-24m-24s UTC integration manifest
54deecfbd6f9ea5b9cb62127914c8f9dd968e010 drm/i915: Remove impossibe checks from i915_gem_request_add_to_client
9522e99cd824dc1787860f056754eb164bc80ad5 drm/i915: Cache some registers in execlists hot paths
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [PATCH 2/2] drm/i915: Remove impossibe checks from i915_gem_request_add_to_client
2016-03-23 15:31 ` Chris Wilson
@ 2016-03-24 9:49 ` Tvrtko Ursulin
2016-03-24 10:11 ` Chris Wilson
0 siblings, 1 reply; 9+ messages in thread
From: Tvrtko Ursulin @ 2016-03-24 9:49 UTC (permalink / raw)
To: Chris Wilson, Intel-gfx
On 23/03/16 15:31, Chris Wilson wrote:
> On Wed, Mar 23, 2016 at 03:15:03PM +0000, Tvrtko Ursulin wrote:
>> From: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
>>
>> There is only one caller of this functions which calls it under
>> strictly defined conditions. Error checking and return value
>> can be removed.
>>
>> Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
>> ---
>> BTW it looks like the whole point of this request list is to implement
>> the throttle ioctl? Looks like a strange ioctl with a fixed 20ms criteria,
>> could it be re-implemented somehow without having to have this list?
>>
>> Does it have to be per-client or could it wait for any requests on any
>> engine emitted 20ms ago?
>
> It is per-client, and I long wished the 20ms wasn't defined by the ABI.
Is it useful and/or widely used though?
With multiple clients submitting work I don't see the semantics are
deterministic. I was thinking if it could be replaced with a simpler
implementation for the similar random effect, losing the list altogether?
Especially wonder how the scheduler will affect it.
> What I proposed doing is this:
>
> https://cgit.freedesktop.org/~ickle/linux-2.6/commit/?h=breadcrumbs&id=e33947505b54582964c8c202b22f88fc5705f484
How is it safe? list_add modifies various pointers in multiple steps so
I didn't know that is safe against concurrent iteration.
RCU flavoured list API can do such things but in this case switching to
that would only enable lockless throttle and not gain anything on the
add_to_client path.
> Please note that this also fixes a race condition I've seen in the wild.
What is the race?
Regards,
Tvrtko
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [PATCH 2/2] drm/i915: Remove impossibe checks from i915_gem_request_add_to_client
2016-03-24 9:49 ` Tvrtko Ursulin
@ 2016-03-24 10:11 ` Chris Wilson
0 siblings, 0 replies; 9+ messages in thread
From: Chris Wilson @ 2016-03-24 10:11 UTC (permalink / raw)
To: Tvrtko Ursulin; +Cc: Intel-gfx
On Thu, Mar 24, 2016 at 09:49:25AM +0000, Tvrtko Ursulin wrote:
>
> On 23/03/16 15:31, Chris Wilson wrote:
> >On Wed, Mar 23, 2016 at 03:15:03PM +0000, Tvrtko Ursulin wrote:
> >>From: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> >>
> >>There is only one caller of this functions which calls it under
> >>strictly defined conditions. Error checking and return value
> >>can be removed.
> >>
> >>Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> >>---
> >>BTW it looks like the whole point of this request list is to implement
> >>the throttle ioctl? Looks like a strange ioctl with a fixed 20ms criteria,
> >>could it be re-implemented somehow without having to have this list?
> >>
> >>Does it have to be per-client or could it wait for any requests on any
> >>engine emitted 20ms ago?
> >
> >It is per-client, and I long wished the 20ms wasn't defined by the ABI.
>
> Is it useful and/or widely used though?
X for throttling front buffer rendering.
> With multiple clients submitting work I don't see the semantics are
> deterministic. I was thinking if it could be replaced with a simpler
> implementation for the similar random effect, losing the list
> altogether?
No, it wants per-client semantics.
> Especially wonder how the scheduler will affect it.
It won't because that would be breaking ABI.
> >What I proposed doing is this:
> >
> >https://cgit.freedesktop.org/~ickle/linux-2.6/commit/?h=breadcrumbs&id=e33947505b54582964c8c202b22f88fc5705f484
>
> How is it safe? list_add modifies various pointers in multiple steps
> so I didn't know that is safe against concurrent iteration.
It is safe. list_add() is now properly ordered using the compiler
barrier (i.e. it gained the RCU semantics).
> RCU flavoured list API can do such things but in this case switching
> to that would only enable lockless throttle and not gain anything on
> the add_to_client path.
And add_to_client is the hotter of the pair.
> >Please note that this also fixes a race condition I've seen in the wild.
>
> What is the race?
file_priv not being checked after acquiring the spinlock under which the
list may be reset.
-Chris
--
Chris Wilson, Intel Open Source Technology Centre
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [PATCH 1/2] drm/i915: Cache some registers in execlists hot paths
2016-03-23 15:15 [PATCH 1/2] drm/i915: Cache some registers in execlists hot paths Tvrtko Ursulin
2016-03-23 15:15 ` [PATCH 2/2] drm/i915: Remove impossibe checks from i915_gem_request_add_to_client Tvrtko Ursulin
2016-03-23 16:06 ` ✗ Fi.CI.BAT: failure for series starting with [1/2] drm/i915: Cache some registers in execlists hot paths Patchwork
@ 2016-03-24 11:16 ` Chris Wilson
2016-03-24 11:31 ` Tvrtko Ursulin
2 siblings, 1 reply; 9+ messages in thread
From: Chris Wilson @ 2016-03-24 11:16 UTC (permalink / raw)
To: Tvrtko Ursulin; +Cc: Intel-gfx
On Wed, Mar 23, 2016 at 03:15:02PM +0000, Tvrtko Ursulin wrote:
> From: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
>
> GCC cannot optimize well calculations hidden in macros and
> assigned to temporary structures. We can cache the register in
> ELSP write, and refactor reading of the CSB a bit to enable
> it to do a better job.
>
> Code is still equally readable but the generated body of the
> CSB read loop is 30% smaller, and since that loop runs at
> least once per interrupt, which in turn can fire in tens or
> hundreds thousands times per second, must be of some value.
>
> Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> ---
> drivers/gpu/drm/i915/intel_lrc.c | 26 +++++++++++++-------------
> drivers/gpu/drm/i915/intel_lrc.h | 11 ++++++++---
> 2 files changed, 21 insertions(+), 16 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
> index 3a23b9549f7b..67592f8354d6 100644
> --- a/drivers/gpu/drm/i915/intel_lrc.c
> +++ b/drivers/gpu/drm/i915/intel_lrc.c
> @@ -361,8 +361,8 @@ static void execlists_elsp_write(struct drm_i915_gem_request *rq0,
> {
>
> struct intel_engine_cs *engine = rq0->engine;
> - struct drm_device *dev = engine->dev;
> - struct drm_i915_private *dev_priv = dev->dev_private;
> + struct drm_i915_private *dev_priv = rq0->i915;
> + i915_reg_t elsp_reg = RING_ELSP(engine);
If we are going to open-code it, how about going whole hog
and
/* We opencode I915_WRITE_FW(RING_ELSP(engine)) here to save gcc the
* trouble of doing so - gcc fails miserably!
*/
u32 *elsp = rq->i915->regs + i915_mmio_reg_offset(RING_ELSP(engine));
then use writel(upper_32_bits(desc[1]), elsp);
Keeping the comment around for grepping.
> uint64_t desc[2];
>
> if (rq1) {
> @@ -376,12 +376,12 @@ static void execlists_elsp_write(struct drm_i915_gem_request *rq0,
> rq0->elsp_submitted++;
>
> /* You must always write both descriptors in the order below. */
> - I915_WRITE_FW(RING_ELSP(engine), upper_32_bits(desc[1]));
> - I915_WRITE_FW(RING_ELSP(engine), lower_32_bits(desc[1]));
> + I915_WRITE_FW(elsp_reg, upper_32_bits(desc[1]));
> + I915_WRITE_FW(elsp_reg, lower_32_bits(desc[1]));
>
> - I915_WRITE_FW(RING_ELSP(engine), upper_32_bits(desc[0]));
> + I915_WRITE_FW(elsp_reg, upper_32_bits(desc[0]));
> /* The context is automatically loaded after the following */
> - I915_WRITE_FW(RING_ELSP(engine), lower_32_bits(desc[0]));
> + I915_WRITE_FW(elsp_reg, lower_32_bits(desc[0]));
>
> /* ELSP is a wo register, use another nearby reg for posting */
> POSTING_READ_FW(RING_EXECLIST_STATUS_LO(engine));
Observing the above, we can also kill the POSTING_READ_FW() which will
make a much bigger improvement than all of the above.
> @@ -517,21 +517,19 @@ execlists_check_remove_request(struct intel_engine_cs *engine, u32 request_id)
> }
>
> static u32
> -get_context_status(struct intel_engine_cs *engine, unsigned int read_pointer,
> - u32 *context_id)
> +get_context_status(struct drm_i915_private *dev_priv, u32 csb_base,
> + unsigned int read_pointer, u32 *context_id)
> {
> - struct drm_i915_private *dev_priv = engine->dev->dev_private;
> u32 status;
>
> read_pointer %= GEN8_CSB_ENTRIES;
>
> - status = I915_READ_FW(RING_CONTEXT_STATUS_BUF_LO(engine, read_pointer));
> + status = I915_READ_FW(RING_CSB_LO(csb_base, read_pointer));
If we forgo the "convenience" interface here, could we not also improve
the readability of the code by just having the csb ringbuffer and readl?
-Chris
--
Chris Wilson, Intel Open Source Technology Centre
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [PATCH 1/2] drm/i915: Cache some registers in execlists hot paths
2016-03-24 11:16 ` [PATCH 1/2] " Chris Wilson
@ 2016-03-24 11:31 ` Tvrtko Ursulin
2016-03-24 11:53 ` Chris Wilson
0 siblings, 1 reply; 9+ messages in thread
From: Tvrtko Ursulin @ 2016-03-24 11:31 UTC (permalink / raw)
To: Chris Wilson, Intel-gfx
On 24/03/16 11:16, Chris Wilson wrote:
> On Wed, Mar 23, 2016 at 03:15:02PM +0000, Tvrtko Ursulin wrote:
>> From: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
>>
>> GCC cannot optimize well calculations hidden in macros and
>> assigned to temporary structures. We can cache the register in
>> ELSP write, and refactor reading of the CSB a bit to enable
>> it to do a better job.
>>
>> Code is still equally readable but the generated body of the
>> CSB read loop is 30% smaller, and since that loop runs at
>> least once per interrupt, which in turn can fire in tens or
>> hundreds thousands times per second, must be of some value.
>>
>> Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
>> ---
>> drivers/gpu/drm/i915/intel_lrc.c | 26 +++++++++++++-------------
>> drivers/gpu/drm/i915/intel_lrc.h | 11 ++++++++---
>> 2 files changed, 21 insertions(+), 16 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
>> index 3a23b9549f7b..67592f8354d6 100644
>> --- a/drivers/gpu/drm/i915/intel_lrc.c
>> +++ b/drivers/gpu/drm/i915/intel_lrc.c
>> @@ -361,8 +361,8 @@ static void execlists_elsp_write(struct drm_i915_gem_request *rq0,
>> {
>>
>> struct intel_engine_cs *engine = rq0->engine;
>> - struct drm_device *dev = engine->dev;
>> - struct drm_i915_private *dev_priv = dev->dev_private;
>> + struct drm_i915_private *dev_priv = rq0->i915;
>> + i915_reg_t elsp_reg = RING_ELSP(engine);
>
> If we are going to open-code it, how about going whole hog
> and
>
> /* We opencode I915_WRITE_FW(RING_ELSP(engine)) here to save gcc the
> * trouble of doing so - gcc fails miserably!
> */
> u32 *elsp = rq->i915->regs + i915_mmio_reg_offset(RING_ELSP(engine));
> then use writel(upper_32_bits(desc[1]), elsp);
>
> Keeping the comment around for grepping.
Yeah I had that but thought it will be considered to tasteless for
posting. :)
>> uint64_t desc[2];
>>
>> if (rq1) {
>> @@ -376,12 +376,12 @@ static void execlists_elsp_write(struct drm_i915_gem_request *rq0,
>> rq0->elsp_submitted++;
>>
>> /* You must always write both descriptors in the order below. */
>> - I915_WRITE_FW(RING_ELSP(engine), upper_32_bits(desc[1]));
>> - I915_WRITE_FW(RING_ELSP(engine), lower_32_bits(desc[1]));
>> + I915_WRITE_FW(elsp_reg, upper_32_bits(desc[1]));
>> + I915_WRITE_FW(elsp_reg, lower_32_bits(desc[1]));
>>
>> - I915_WRITE_FW(RING_ELSP(engine), upper_32_bits(desc[0]));
>> + I915_WRITE_FW(elsp_reg, upper_32_bits(desc[0]));
>> /* The context is automatically loaded after the following */
>> - I915_WRITE_FW(RING_ELSP(engine), lower_32_bits(desc[0]));
>> + I915_WRITE_FW(elsp_reg, lower_32_bits(desc[0]));
>>
>> /* ELSP is a wo register, use another nearby reg for posting */
>> POSTING_READ_FW(RING_EXECLIST_STATUS_LO(engine));
>
> Observing the above, we can also kill the POSTING_READ_FW() which will
> make a much bigger improvement than all of the above.
It is a different register, why it can be removed?
>> @@ -517,21 +517,19 @@ execlists_check_remove_request(struct intel_engine_cs *engine, u32 request_id)
>> }
>>
>> static u32
>> -get_context_status(struct intel_engine_cs *engine, unsigned int read_pointer,
>> - u32 *context_id)
>> +get_context_status(struct drm_i915_private *dev_priv, u32 csb_base,
>> + unsigned int read_pointer, u32 *context_id)
>> {
>> - struct drm_i915_private *dev_priv = engine->dev->dev_private;
>> u32 status;
>>
>> read_pointer %= GEN8_CSB_ENTRIES;
>>
>> - status = I915_READ_FW(RING_CONTEXT_STATUS_BUF_LO(engine, read_pointer));
>> + status = I915_READ_FW(RING_CSB_LO(csb_base, read_pointer));
>
> If we forgo the "convenience" interface here, could we not also improve
> the readability of the code by just having the csb ringbuffer and readl?
The same whole-hog open coding you mean like the above? It is slightly
more efficient, not sure that is is more readable so maybe I am not
getting exactly what you mean.
Regards,
Tvrtko
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [PATCH 1/2] drm/i915: Cache some registers in execlists hot paths
2016-03-24 11:31 ` Tvrtko Ursulin
@ 2016-03-24 11:53 ` Chris Wilson
0 siblings, 0 replies; 9+ messages in thread
From: Chris Wilson @ 2016-03-24 11:53 UTC (permalink / raw)
To: Tvrtko Ursulin; +Cc: Intel-gfx
On Thu, Mar 24, 2016 at 11:31:34AM +0000, Tvrtko Ursulin wrote:
>
> On 24/03/16 11:16, Chris Wilson wrote:
> >On Wed, Mar 23, 2016 at 03:15:02PM +0000, Tvrtko Ursulin wrote:
> >>From: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> >>
> >>GCC cannot optimize well calculations hidden in macros and
> >>assigned to temporary structures. We can cache the register in
> >>ELSP write, and refactor reading of the CSB a bit to enable
> >>it to do a better job.
> >>
> >>Code is still equally readable but the generated body of the
> >>CSB read loop is 30% smaller, and since that loop runs at
> >>least once per interrupt, which in turn can fire in tens or
> >>hundreds thousands times per second, must be of some value.
> >>
> >>Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> >>---
> >> drivers/gpu/drm/i915/intel_lrc.c | 26 +++++++++++++-------------
> >> drivers/gpu/drm/i915/intel_lrc.h | 11 ++++++++---
> >> 2 files changed, 21 insertions(+), 16 deletions(-)
> >>
> >>diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
> >>index 3a23b9549f7b..67592f8354d6 100644
> >>--- a/drivers/gpu/drm/i915/intel_lrc.c
> >>+++ b/drivers/gpu/drm/i915/intel_lrc.c
> >>@@ -361,8 +361,8 @@ static void execlists_elsp_write(struct drm_i915_gem_request *rq0,
> >> {
> >>
> >> struct intel_engine_cs *engine = rq0->engine;
> >>- struct drm_device *dev = engine->dev;
> >>- struct drm_i915_private *dev_priv = dev->dev_private;
> >>+ struct drm_i915_private *dev_priv = rq0->i915;
> >>+ i915_reg_t elsp_reg = RING_ELSP(engine);
> >
> >If we are going to open-code it, how about going whole hog
> >and
> >
> >/* We opencode I915_WRITE_FW(RING_ELSP(engine)) here to save gcc the
> > * trouble of doing so - gcc fails miserably!
> > */
> >u32 *elsp = rq->i915->regs + i915_mmio_reg_offset(RING_ELSP(engine));
> >then use writel(upper_32_bits(desc[1]), elsp);
> >
> >Keeping the comment around for grepping.
>
> Yeah I had that but thought it will be considered to tasteless for
> posting. :)
>
> >> uint64_t desc[2];
> >>
> >> if (rq1) {
> >>@@ -376,12 +376,12 @@ static void execlists_elsp_write(struct drm_i915_gem_request *rq0,
> >> rq0->elsp_submitted++;
> >>
> >> /* You must always write both descriptors in the order below. */
> >>- I915_WRITE_FW(RING_ELSP(engine), upper_32_bits(desc[1]));
> >>- I915_WRITE_FW(RING_ELSP(engine), lower_32_bits(desc[1]));
> >>+ I915_WRITE_FW(elsp_reg, upper_32_bits(desc[1]));
> >>+ I915_WRITE_FW(elsp_reg, lower_32_bits(desc[1]));
> >>
> >>- I915_WRITE_FW(RING_ELSP(engine), upper_32_bits(desc[0]));
> >>+ I915_WRITE_FW(elsp_reg, upper_32_bits(desc[0]));
> >> /* The context is automatically loaded after the following */
> >>- I915_WRITE_FW(RING_ELSP(engine), lower_32_bits(desc[0]));
> >>+ I915_WRITE_FW(elsp_reg, lower_32_bits(desc[0]));
> >>
> >> /* ELSP is a wo register, use another nearby reg for posting */
> >> POSTING_READ_FW(RING_EXECLIST_STATUS_LO(engine));
> >
> >Observing the above, we can also kill the POSTING_READ_FW() which will
> >make a much bigger improvement than all of the above.
>
> It is a different register, why it can be removed?
We use uncached mmio writes. The POSTING_READs are primarily for
documentation about where we need the write barriers to flush the WC
buffer (which we don't use nor are we ever going to because HW forbids
us). However they are most painful at times.
> >>@@ -517,21 +517,19 @@ execlists_check_remove_request(struct intel_engine_cs *engine, u32 request_id)
> >> }
> >>
> >> static u32
> >>-get_context_status(struct intel_engine_cs *engine, unsigned int read_pointer,
> >>- u32 *context_id)
> >>+get_context_status(struct drm_i915_private *dev_priv, u32 csb_base,
> >>+ unsigned int read_pointer, u32 *context_id)
> >> {
> >>- struct drm_i915_private *dev_priv = engine->dev->dev_private;
> >> u32 status;
> >>
> >> read_pointer %= GEN8_CSB_ENTRIES;
> >>
> >>- status = I915_READ_FW(RING_CONTEXT_STATUS_BUF_LO(engine, read_pointer));
> >>+ status = I915_READ_FW(RING_CSB_LO(csb_base, read_pointer));
> >
> >If we forgo the "convenience" interface here, could we not also improve
> >the readability of the code by just having the csb ringbuffer and readl?
>
> The same whole-hog open coding you mean like the above? It is
> slightly more efficient, not sure that is is more readable so maybe
> I am not getting exactly what you mean.
I think it will be. Even more so we say "we're x86 only and use u32 *
__iomem csb" ;)
quick draft
t a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index 40ef4ea..b7d2ae9 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -516,26 +516,6 @@ execlists_check_remove_request(struct intel_engine_cs *engine, u32 request_id)
return 1;
}
-static u32
-get_context_status(struct intel_engine_cs *engine, unsigned int read_pointer,
- u32 *context_id)
-{
- struct drm_i915_private *dev_priv = engine->dev->dev_private;
- u32 status;
-
- read_pointer %= GEN8_CSB_ENTRIES;
-
- status = I915_READ_FW(RING_CONTEXT_STATUS_BUF_LO(engine, read_pointer));
-
- if (status & GEN8_CTX_STATUS_IDLE_ACTIVE)
- return 0;
-
- *context_id = I915_READ_FW(RING_CONTEXT_STATUS_BUF_HI(engine,
- read_pointer));
-
- return status;
-}
-
/**
* intel_lrc_irq_handler() - handle Context Switch interrupts
* @ring: Engine Command Streamer to handle.
@@ -548,6 +528,7 @@ void intel_lrc_irq_handler(struct intel_engine_cs *engine)
struct drm_i915_private *dev_priv = engine->dev->dev_private;
u32 status_pointer;
unsigned int read_pointer, write_pointer;
+ u32 *csb_mmio = dev_priv->regs + i915_mmio_reg_offset(RING_CONTEXT_STATUS_BUF_LO(engine, 0));
u32 csb[GEN8_CSB_ENTRIES][2];
unsigned int csb_read = 0, i;
unsigned int submit_contexts = 0;
@@ -563,10 +544,16 @@ void intel_lrc_irq_handler(struct intel_engine_cs *engine)
write_pointer += GEN8_CSB_ENTRIES;
while (read_pointer < write_pointer) {
+ unsigned idx;
+
if (WARN_ON_ONCE(csb_read == GEN8_CSB_ENTRIES))
break;
- csb[csb_read][0] = get_context_status(engine, ++read_pointer,
- &csb[csb_read][1]);
+
+ idx = read_pointer++ % GEN8_CSB_ENTRIES;
+ csb[csb_read][0] = readl(&csb_mmio[2*idx+0]);
+ if ((status & GEN8_CTX_STATUS_IDLE_ACTIVE) == 0)
+ csb[csb_read][1] = readl(&csb_mmio[2*idx+1]);
+
csb_read++;
}
-Chris
--
Chris Wilson, Intel Open Source Technology Centre
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx
^ permalink raw reply related [flat|nested] 9+ messages in thread
end of thread, other threads:[~2016-03-24 11:53 UTC | newest]
Thread overview: 9+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2016-03-23 15:15 [PATCH 1/2] drm/i915: Cache some registers in execlists hot paths Tvrtko Ursulin
2016-03-23 15:15 ` [PATCH 2/2] drm/i915: Remove impossibe checks from i915_gem_request_add_to_client Tvrtko Ursulin
2016-03-23 15:31 ` Chris Wilson
2016-03-24 9:49 ` Tvrtko Ursulin
2016-03-24 10:11 ` Chris Wilson
2016-03-23 16:06 ` ✗ Fi.CI.BAT: failure for series starting with [1/2] drm/i915: Cache some registers in execlists hot paths Patchwork
2016-03-24 11:16 ` [PATCH 1/2] " Chris Wilson
2016-03-24 11:31 ` Tvrtko Ursulin
2016-03-24 11:53 ` Chris Wilson
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).