* [PATCH v4 1/6] drm/i915/gen8: Add infrastructure to initialize WA batch buffers
2015-06-16 19:25 [PATCH v4 0/6] Add Per-context WA using WA batch buffers Arun Siluvery
@ 2015-06-16 19:25 ` Arun Siluvery
2015-06-16 20:25 ` Chris Wilson
2015-06-16 20:33 ` Chris Wilson
2015-06-16 19:25 ` [PATCH v4 2/6] drm/i915/gen8: Re-order init pipe_control in lrc mode Arun Siluvery
` (4 subsequent siblings)
5 siblings, 2 replies; 19+ messages in thread
From: Arun Siluvery @ 2015-06-16 19:25 UTC (permalink / raw)
To: intel-gfx
Some of the WA are to be applied during context save but before restore and
some at the end of context save/restore but before executing the instructions
in the ring, WA batch buffers are created for this purpose and these WA cannot
be applied using normal means. Each context has two registers to load the
offsets of these batch buffers. If they are non-zero, HW understands that it
need to execute these batches.
v1: In this version two separate ring_buffer objects were used to load WA
instructions for indirect and per context batch buffers and they were part
of every context.
v2: Chris suggested to include additional page in context and use it to load
these WA instead of creating separate objects. This will simplify lot of things
as we need not explicity pin/unpin them. Thomas Daniel further pointed that GuC
is planning to use a similar setup to share data between GuC and driver and
WA batch buffers can probably share that page. However after discussions with
Dave who is implementing GuC changes, he suggested to use an independent page
for the reasons - GuC area might grow and these WA are initialized only once and
are not changed afterwards so we can share them share across all contexts.
The page is updated with WA during render ring init. This has an advantage of
not adding more special cases to default_context.
The size of indirect and percontext batch buffers are not fixed during init,
the functions that load instructions are modified to report the no of commands
inserted and the size is now calculated after the batch is updated.
The functions also accept an offset field which is helpful if we would like to
have multiple batches at different offsets within the page and select them
based on some criteria. This is not a requirement at this point but could
help in future (Dave).
(Thanks to Chris, Dave and Thomas for their reviews and inputs)
Signed-off-by: Rafael Barbalho <rafael.barbalho@intel.com>
Signed-off-by: Arun Siluvery <arun.siluvery@linux.intel.com>
---
drivers/gpu/drm/i915/intel_lrc.c | 179 +++++++++++++++++++++++++++++++-
drivers/gpu/drm/i915/intel_ringbuffer.h | 18 ++++
2 files changed, 193 insertions(+), 4 deletions(-)
diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index 0413b8f..cad274a 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -211,6 +211,7 @@ enum {
FAULT_AND_CONTINUE /* Unsupported */
};
#define GEN8_CTX_ID_SHIFT 32
+#define CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT 0x17
static int intel_lr_context_pin(struct intel_engine_cs *ring,
struct intel_context *ctx);
@@ -1077,6 +1078,109 @@ static int intel_logical_ring_workarounds_emit(struct intel_engine_cs *ring,
return 0;
}
+static int gen8_init_indirectctx_bb(struct intel_engine_cs *ring,
+ uint32_t offset,
+ uint32_t *num_dwords)
+{
+ uint32_t index;
+ struct page *page;
+ uint32_t *cmd;
+
+ page = i915_gem_object_get_page(ring->wa_ctx.obj, 0);
+ cmd = kmap_atomic(page);
+
+ index = offset;
+
+ /* FIXME: fill one cacheline with NOOPs.
+ * Replace these instructions with WA
+ */
+ while (index < (offset + 16))
+ cmd[index++] = MI_NOOP;
+
+ /*
+ * MI_BATCH_BUFFER_END is not required in Indirect ctx BB because
+ * execution depends on the length specified in terms of cache lines
+ * in the register CTX_RCS_INDIRECT_CTX
+ */
+
+ kunmap_atomic(cmd);
+
+ if (index > (PAGE_SIZE / sizeof(uint32_t)))
+ return -EINVAL;
+
+ *num_dwords = index - offset;
+
+ return 0;
+}
+
+static int gen8_init_perctx_bb(struct intel_engine_cs *ring,
+ uint32_t offset,
+ uint32_t *num_dwords)
+{
+ uint32_t index;
+ struct page *page;
+ uint32_t *cmd;
+
+ page = i915_gem_object_get_page(ring->wa_ctx.obj, 0);
+ cmd = kmap_atomic(page);
+
+ index = offset;
+
+ /* FIXME: fill one cacheline with NOOPs.
+ * Replace these instructions with WA
+ */
+ while (index < (offset + 16))
+ cmd[index++] = MI_NOOP;
+
+ cmd[index - 1] = MI_BATCH_BUFFER_END;
+
+ kunmap_atomic(cmd);
+
+ if (index > (PAGE_SIZE / sizeof(uint32_t)))
+ return -EINVAL;
+
+ *num_dwords = index - offset;
+
+ return 0;
+}
+
+static int intel_init_workaround_bb(struct intel_engine_cs *ring)
+{
+ int ret;
+ uint32_t num_dwords;
+ struct i915_ctx_workarounds *wa_ctx = &ring->wa_ctx;
+
+ if (ring->scratch.obj == NULL) {
+ DRM_ERROR("scratch page not allocated for %s\n", ring->name);
+ return -EINVAL;
+ }
+
+ if (INTEL_INFO(ring->dev)->gen == 8) {
+ wa_ctx->indctx_batch_offset = 0;
+
+ ret = gen8_init_indirectctx_bb(ring,
+ wa_ctx->indctx_batch_offset,
+ &num_dwords);
+ if (ret)
+ return ret;
+
+ wa_ctx->indctx_batch_size = round_up(num_dwords, CACHELINE_DWORDS);
+ wa_ctx->perctx_batch_offset = wa_ctx->indctx_batch_size;
+
+ ret = gen8_init_perctx_bb(ring,
+ wa_ctx->perctx_batch_offset,
+ &num_dwords);
+ if (ret)
+ return ret;
+ } else {
+ WARN(INTEL_INFO(ring->dev)->gen >= 8,
+ "WA batch buffer is not initialized for Gen%d\n",
+ INTEL_INFO(ring->dev)->gen);
+ }
+
+ return 0;
+}
+
static int gen8_init_common_ring(struct intel_engine_cs *ring)
{
struct drm_device *dev = ring->dev;
@@ -1382,6 +1486,40 @@ static int gen8_init_rcs_context(struct intel_engine_cs *ring,
return intel_lr_context_render_state_init(ring, ctx);
}
+static int lrc_setup_wa_ctx_obj(struct intel_engine_cs *ring, u32 size)
+{
+ int ret;
+ struct drm_device *dev = ring->dev;
+
+ WARN_ON(ring->id != RCS);
+
+ size = roundup(size, PAGE_SIZE);
+ ring->wa_ctx.obj = i915_gem_alloc_object(dev, size);
+ if (!ring->wa_ctx.obj) {
+ DRM_DEBUG_DRIVER("alloc LRC WA ctx backing obj failed.\n");
+ return -ENOMEM;
+ }
+
+ ret = i915_gem_obj_ggtt_pin(ring->wa_ctx.obj, GEN8_LR_CONTEXT_ALIGN, 0);
+ if (ret) {
+ DRM_DEBUG_DRIVER("pin LRC WA ctx backing obj failed: %d\n",
+ ret);
+ drm_gem_object_unreference(&ring->wa_ctx.obj->base);
+ return ret;
+ }
+
+ return 0;
+}
+
+static void lrc_destroy_wa_ctx_obj(struct intel_engine_cs *ring)
+{
+ WARN_ON(ring->id != RCS);
+
+ i915_gem_object_ggtt_unpin(ring->wa_ctx.obj);
+ drm_gem_object_unreference(&ring->wa_ctx.obj->base);
+ ring->wa_ctx.obj = NULL;
+}
+
/**
* intel_logical_ring_cleanup() - deallocate the Engine Command Streamer
*
@@ -1474,7 +1612,29 @@ static int logical_render_ring_init(struct drm_device *dev)
if (ret)
return ret;
- return intel_init_pipe_control(ring);
+ if (INTEL_INFO(ring->dev)->gen >= 8) {
+ ret = lrc_setup_wa_ctx_obj(ring, PAGE_SIZE);
+ if (ret) {
+ DRM_DEBUG_DRIVER("Failed to setup context WA page: %d\n",
+ ret);
+ return ret;
+ }
+
+ ret = intel_init_workaround_bb(ring);
+ if (ret) {
+ lrc_destroy_wa_ctx_obj(ring);
+ DRM_ERROR("WA batch buffers are not initialized: %d\n",
+ ret);
+ }
+ }
+
+ ret = intel_init_pipe_control(ring);
+ if (ret) {
+ if (ring->wa_ctx.obj)
+ lrc_destroy_wa_ctx_obj(ring);
+ }
+
+ return ret;
}
static int logical_bsd_ring_init(struct drm_device *dev)
@@ -1754,15 +1914,26 @@ populate_lr_context(struct intel_context *ctx, struct drm_i915_gem_object *ctx_o
reg_state[CTX_SECOND_BB_STATE] = ring->mmio_base + 0x118;
reg_state[CTX_SECOND_BB_STATE+1] = 0;
if (ring->id == RCS) {
- /* TODO: according to BSpec, the register state context
- * for CHV does not have these. OTOH, these registers do
- * exist in CHV. I'm waiting for a clarification */
reg_state[CTX_BB_PER_CTX_PTR] = ring->mmio_base + 0x1c0;
reg_state[CTX_BB_PER_CTX_PTR+1] = 0;
reg_state[CTX_RCS_INDIRECT_CTX] = ring->mmio_base + 0x1c4;
reg_state[CTX_RCS_INDIRECT_CTX+1] = 0;
reg_state[CTX_RCS_INDIRECT_CTX_OFFSET] = ring->mmio_base + 0x1c8;
reg_state[CTX_RCS_INDIRECT_CTX_OFFSET+1] = 0;
+ if (ring->wa_ctx.obj) {
+ reg_state[CTX_RCS_INDIRECT_CTX+1] =
+ (i915_gem_obj_ggtt_offset(ring->wa_ctx.obj) +
+ ring->wa_ctx.indctx_batch_offset * sizeof(uint32_t)) |
+ (ring->wa_ctx.indctx_batch_size / CACHELINE_DWORDS);
+
+ reg_state[CTX_RCS_INDIRECT_CTX_OFFSET+1] =
+ CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT << 6;
+
+ reg_state[CTX_BB_PER_CTX_PTR+1] =
+ (i915_gem_obj_ggtt_offset(ring->wa_ctx.obj) +
+ ring->wa_ctx.perctx_batch_offset * sizeof(uint32_t)) |
+ 0x01;
+ }
}
reg_state[CTX_LRI_HEADER_1] = MI_LOAD_REGISTER_IMM(9);
reg_state[CTX_LRI_HEADER_1] |= MI_LRI_FORCE_POSTED;
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
index 39f6dfc..1f38af3 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.h
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
@@ -12,6 +12,7 @@
* workarounds!
*/
#define CACHELINE_BYTES 64
+#define CACHELINE_DWORDS (CACHELINE_BYTES / sizeof(uint32_t))
/*
* Gen2 BSpec "1. Programming Environment" / 1.4.4.6 "Ring Buffer Use"
@@ -119,6 +120,22 @@ struct intel_ringbuffer {
struct intel_context;
+/*
+ * we use a single page to load ctx workarounds so all of these
+ * values are referred in terms of dwords
+ *
+ * offset field - helpful in case if we want to have multiple batches
+ * at different offsets based on some conditions. It is not a requirement
+ * at the moment but provides an option for future use.
+ * indctx_batch_size - HW expects this value in terms of cachelines
+ */
+struct i915_ctx_workarounds {
+ u32 indctx_batch_offset;
+ u32 indctx_batch_size;
+ u32 perctx_batch_offset;
+ struct drm_i915_gem_object *obj;
+};
+
struct intel_engine_cs {
const char *name;
enum intel_ring_id {
@@ -142,6 +159,7 @@ struct intel_engine_cs {
struct i915_gem_batch_pool batch_pool;
struct intel_hw_status_page status_page;
+ struct i915_ctx_workarounds wa_ctx;
unsigned irq_refcount; /* protected by dev_priv->irq_lock */
u32 irq_enable_mask; /* bitmask to enable ring interrupt */
--
2.3.0
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/intel-gfx
^ permalink raw reply related [flat|nested] 19+ messages in thread* Re: [PATCH v4 1/6] drm/i915/gen8: Add infrastructure to initialize WA batch buffers
2015-06-16 19:25 ` [PATCH v4 1/6] drm/i915/gen8: Add infrastructure to initialize " Arun Siluvery
@ 2015-06-16 20:25 ` Chris Wilson
2015-06-17 18:48 ` Siluvery, Arun
2015-06-16 20:33 ` Chris Wilson
1 sibling, 1 reply; 19+ messages in thread
From: Chris Wilson @ 2015-06-16 20:25 UTC (permalink / raw)
To: Arun Siluvery; +Cc: intel-gfx
On Tue, Jun 16, 2015 at 08:25:20PM +0100, Arun Siluvery wrote:
> +static int gen8_init_indirectctx_bb(struct intel_engine_cs *ring,
> + uint32_t offset,
> + uint32_t *num_dwords)
> +{
> + uint32_t index;
> + struct page *page;
> + uint32_t *cmd;
> +
> + page = i915_gem_object_get_page(ring->wa_ctx.obj, 0);
> + cmd = kmap_atomic(page);
> +
> + index = offset;
> +
> + /* FIXME: fill one cacheline with NOOPs.
> + * Replace these instructions with WA
> + */
> + while (index < (offset + 16))
> + cmd[index++] = MI_NOOP;
> +
> + /*
> + * MI_BATCH_BUFFER_END is not required in Indirect ctx BB because
> + * execution depends on the length specified in terms of cache lines
> + * in the register CTX_RCS_INDIRECT_CTX
> + */
> +
> + kunmap_atomic(cmd);
> +
> + if (index > (PAGE_SIZE / sizeof(uint32_t)))
> + return -EINVAL;
Check before you GPF!
You just overran the buffer and corrupted memory, if you didn't succeed
in trapping a segfault.
To be generic, align to the cacheline then check you have enough room
for your own data.
-Chris
--
Chris Wilson, Intel Open Source Technology Centre
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/intel-gfx
^ permalink raw reply [flat|nested] 19+ messages in thread* Re: [PATCH v4 1/6] drm/i915/gen8: Add infrastructure to initialize WA batch buffers
2015-06-16 20:25 ` Chris Wilson
@ 2015-06-17 18:48 ` Siluvery, Arun
2015-06-17 19:48 ` Siluvery, Arun
2015-06-17 20:21 ` Chris Wilson
0 siblings, 2 replies; 19+ messages in thread
From: Siluvery, Arun @ 2015-06-17 18:48 UTC (permalink / raw)
To: Chris Wilson, intel-gfx
On 16/06/2015 21:25, Chris Wilson wrote:
> On Tue, Jun 16, 2015 at 08:25:20PM +0100, Arun Siluvery wrote:
>> +static int gen8_init_indirectctx_bb(struct intel_engine_cs *ring,
>> + uint32_t offset,
>> + uint32_t *num_dwords)
>> +{
>> + uint32_t index;
>> + struct page *page;
>> + uint32_t *cmd;
>> +
>> + page = i915_gem_object_get_page(ring->wa_ctx.obj, 0);
>> + cmd = kmap_atomic(page);
>> +
>> + index = offset;
>> +
>> + /* FIXME: fill one cacheline with NOOPs.
>> + * Replace these instructions with WA
>> + */
>> + while (index < (offset + 16))
>> + cmd[index++] = MI_NOOP;
>> +
>> + /*
>> + * MI_BATCH_BUFFER_END is not required in Indirect ctx BB because
>> + * execution depends on the length specified in terms of cache lines
>> + * in the register CTX_RCS_INDIRECT_CTX
>> + */
>> +
>> + kunmap_atomic(cmd);
>> +
>> + if (index > (PAGE_SIZE / sizeof(uint32_t)))
>> + return -EINVAL;
>
> Check before you GPF!
>
> You just overran the buffer and corrupted memory, if you didn't succeed
> in trapping a segfault.
>
> To be generic, align to the cacheline then check you have enough room
> for your own data.
> -Chris
>
Hi Chris,
The placement of condition is not correct. I don't completely follow
your suggestion, could you please elaborate; here we don't know upfront
how much more data to be written.
I have made below changes to check after writing every command and
return error as soon as we reach the end.
#define wa_ctx_emit(batch, cmd) { \
if (WARN_ON(index >= (PAGE_SIZE / sizeof(uint32_t)))) { \
kunmap_atomic(batch); \
return -ENOSPC; \
} \
batch[index++] = (cmd); \
}
is this acceptable?
I think this is the only one issue, all other comments are addressed.
regards
Arun
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/intel-gfx
^ permalink raw reply [flat|nested] 19+ messages in thread* Re: [PATCH v4 1/6] drm/i915/gen8: Add infrastructure to initialize WA batch buffers
2015-06-17 18:48 ` Siluvery, Arun
@ 2015-06-17 19:48 ` Siluvery, Arun
2015-06-17 20:21 ` Chris Wilson
1 sibling, 0 replies; 19+ messages in thread
From: Siluvery, Arun @ 2015-06-17 19:48 UTC (permalink / raw)
To: Chris Wilson, intel-gfx
On 17/06/2015 19:48, Siluvery, Arun wrote:
> On 16/06/2015 21:25, Chris Wilson wrote:
>> On Tue, Jun 16, 2015 at 08:25:20PM +0100, Arun Siluvery wrote:
>>> +static int gen8_init_indirectctx_bb(struct intel_engine_cs *ring,
>>> + uint32_t offset,
>>> + uint32_t *num_dwords)
>>> +{
>>> + uint32_t index;
>>> + struct page *page;
>>> + uint32_t *cmd;
>>> +
>>> + page = i915_gem_object_get_page(ring->wa_ctx.obj, 0);
>>> + cmd = kmap_atomic(page);
>>> +
>>> + index = offset;
>>> +
>>> + /* FIXME: fill one cacheline with NOOPs.
>>> + * Replace these instructions with WA
>>> + */
>>> + while (index < (offset + 16))
>>> + cmd[index++] = MI_NOOP;
>>> +
>>> + /*
>>> + * MI_BATCH_BUFFER_END is not required in Indirect ctx BB because
>>> + * execution depends on the length specified in terms of cache lines
>>> + * in the register CTX_RCS_INDIRECT_CTX
>>> + */
>>> +
>>> + kunmap_atomic(cmd);
>>> +
>>> + if (index > (PAGE_SIZE / sizeof(uint32_t)))
>>> + return -EINVAL;
>>
>> Check before you GPF!
>>
>> You just overran the buffer and corrupted memory, if you didn't succeed
>> in trapping a segfault.
>>
>> To be generic, align to the cacheline then check you have enough room
>> for your own data.
>> -Chris
>>
> Hi Chris,
>
> The placement of condition is not correct. I don't completely follow
> your suggestion, could you please elaborate; here we don't know upfront
> how much more data to be written.
> I have made below changes to check after writing every command and
> return error as soon as we reach the end.
>
> #define wa_ctx_emit(batch, cmd) { \
> if (WARN_ON(index >= (PAGE_SIZE / sizeof(uint32_t)))) { \
> kunmap_atomic(batch); \
> return -ENOSPC; \
> } \
> batch[index++] = (cmd); \
> }
> is this acceptable?
> I think this is the only one issue, all other comments are addressed.
>
one other improvement is possible - mapping/unmapping page can be kept
in common path, will update the patch accordingly.
regards
Arun
> regards
> Arun
>
> _______________________________________________
> Intel-gfx mailing list
> Intel-gfx@lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/intel-gfx
>
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/intel-gfx
^ permalink raw reply [flat|nested] 19+ messages in thread* Re: [PATCH v4 1/6] drm/i915/gen8: Add infrastructure to initialize WA batch buffers
2015-06-17 18:48 ` Siluvery, Arun
2015-06-17 19:48 ` Siluvery, Arun
@ 2015-06-17 20:21 ` Chris Wilson
2015-06-17 21:36 ` Siluvery, Arun
1 sibling, 1 reply; 19+ messages in thread
From: Chris Wilson @ 2015-06-17 20:21 UTC (permalink / raw)
To: Siluvery, Arun; +Cc: intel-gfx
On Wed, Jun 17, 2015 at 07:48:16PM +0100, Siluvery, Arun wrote:
> On 16/06/2015 21:25, Chris Wilson wrote:
> >On Tue, Jun 16, 2015 at 08:25:20PM +0100, Arun Siluvery wrote:
> >>+static int gen8_init_indirectctx_bb(struct intel_engine_cs *ring,
> >>+ uint32_t offset,
> >>+ uint32_t *num_dwords)
> >>+{
> >>+ uint32_t index;
> >>+ struct page *page;
> >>+ uint32_t *cmd;
> >>+
> >>+ page = i915_gem_object_get_page(ring->wa_ctx.obj, 0);
> >>+ cmd = kmap_atomic(page);
> >>+
> >>+ index = offset;
> >>+
> >>+ /* FIXME: fill one cacheline with NOOPs.
> >>+ * Replace these instructions with WA
> >>+ */
> >>+ while (index < (offset + 16))
> >>+ cmd[index++] = MI_NOOP;
> >>+
> >>+ /*
> >>+ * MI_BATCH_BUFFER_END is not required in Indirect ctx BB because
> >>+ * execution depends on the length specified in terms of cache lines
> >>+ * in the register CTX_RCS_INDIRECT_CTX
> >>+ */
> >>+
> >>+ kunmap_atomic(cmd);
> >>+
> >>+ if (index > (PAGE_SIZE / sizeof(uint32_t)))
> >>+ return -EINVAL;
> >
> >Check before you GPF!
> >
> >You just overran the buffer and corrupted memory, if you didn't succeed
> >in trapping a segfault.
> >
> >To be generic, align to the cacheline then check you have enough room
> >for your own data.
> >-Chris
> >
> Hi Chris,
>
> The placement of condition is not correct. I don't completely follow
> your suggestion, could you please elaborate; here we don't know
> upfront how much more data to be written.
Hmm, are we anticipating an unbounded number of workarounds? At some
point you have to have a rough upper bound in order to do the bo
allocation. If we are really unsure, then we do need to split this into
two passes, one to count the number of dwords and the second to allocate
and actually fill the cmd[].
> I have made below changes to check after writing every command and
> return error as soon as we reach the end.
>
> #define wa_ctx_emit(batch, cmd) { \
> if (WARN_ON(index >= (PAGE_SIZE / sizeof(uint32_t)))) { \
> kunmap_atomic(batch); \
> return -ENOSPC; \
> } \
> batch[index++] = (cmd); \
> }
> is this acceptable?
> I think this is the only one issue, all other comments are addressed.
It's the lesser of evils for sure. Still feel dubious that we don't know
upfront how much data we need to allocate.
-Chris
--
Chris Wilson, Intel Open Source Technology Centre
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/intel-gfx
^ permalink raw reply [flat|nested] 19+ messages in thread* Re: [PATCH v4 1/6] drm/i915/gen8: Add infrastructure to initialize WA batch buffers
2015-06-17 20:21 ` Chris Wilson
@ 2015-06-17 21:36 ` Siluvery, Arun
0 siblings, 0 replies; 19+ messages in thread
From: Siluvery, Arun @ 2015-06-17 21:36 UTC (permalink / raw)
To: Chris Wilson, intel-gfx
On 17/06/2015 21:21, Chris Wilson wrote:
> On Wed, Jun 17, 2015 at 07:48:16PM +0100, Siluvery, Arun wrote:
>> On 16/06/2015 21:25, Chris Wilson wrote:
>>> On Tue, Jun 16, 2015 at 08:25:20PM +0100, Arun Siluvery wrote:
>>>> +static int gen8_init_indirectctx_bb(struct intel_engine_cs *ring,
>>>> + uint32_t offset,
>>>> + uint32_t *num_dwords)
>>>> +{
>>>> + uint32_t index;
>>>> + struct page *page;
>>>> + uint32_t *cmd;
>>>> +
>>>> + page = i915_gem_object_get_page(ring->wa_ctx.obj, 0);
>>>> + cmd = kmap_atomic(page);
>>>> +
>>>> + index = offset;
>>>> +
>>>> + /* FIXME: fill one cacheline with NOOPs.
>>>> + * Replace these instructions with WA
>>>> + */
>>>> + while (index < (offset + 16))
>>>> + cmd[index++] = MI_NOOP;
>>>> +
>>>> + /*
>>>> + * MI_BATCH_BUFFER_END is not required in Indirect ctx BB because
>>>> + * execution depends on the length specified in terms of cache lines
>>>> + * in the register CTX_RCS_INDIRECT_CTX
>>>> + */
>>>> +
>>>> + kunmap_atomic(cmd);
>>>> +
>>>> + if (index > (PAGE_SIZE / sizeof(uint32_t)))
>>>> + return -EINVAL;
>>>
>>> Check before you GPF!
>>>
>>> You just overran the buffer and corrupted memory, if you didn't succeed
>>> in trapping a segfault.
>>>
>>> To be generic, align to the cacheline then check you have enough room
>>> for your own data.
>>> -Chris
>>>
>> Hi Chris,
>>
>> The placement of condition is not correct. I don't completely follow
>> your suggestion, could you please elaborate; here we don't know
>> upfront how much more data to be written.
>
> Hmm, are we anticipating an unbounded number of workarounds? At some
> point you have to have a rough upper bound in order to do the bo
> allocation. If we are really unsure, then we do need to split this into
> two passes, one to count the number of dwords and the second to allocate
> and actually fill the cmd[].
>
Since we have a full page dedicated for this, that should be sufficient
for good number of WA; if we need more than one page means we have major
issues.
The list for Gen8 is small, same for Gen9 also, maybe few more gets
added going forward but not close to filling entire page. Some of them
will even be restricted to specific steppings/revisions. For these
reasons I think a single page setup is sufficient.
Do you anticipate any other use cases that require allocating more than
one page?
Two pass approach can be implemented but it adds unnecessary complexity
which may not be required in this case. please let me know your thoughts.
>> I have made below changes to check after writing every command and
>> return error as soon as we reach the end.
>>
>> #define wa_ctx_emit(batch, cmd) { \
>> if (WARN_ON(index >= (PAGE_SIZE / sizeof(uint32_t)))) { \
>> kunmap_atomic(batch); \
>> return -ENOSPC; \
>> } \
>> batch[index++] = (cmd); \
>> }
>> is this acceptable?
>> I think this is the only one issue, all other comments are addressed.
>
> It's the lesser of evils for sure. Still feel dubious that we don't know
> upfront how much data we need to allocate.
yes, but with single pass approach do you see any way it can be improved?
regards
Arun
> -Chris
>
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/intel-gfx
^ permalink raw reply [flat|nested] 19+ messages in thread
* Re: [PATCH v4 1/6] drm/i915/gen8: Add infrastructure to initialize WA batch buffers
2015-06-16 19:25 ` [PATCH v4 1/6] drm/i915/gen8: Add infrastructure to initialize " Arun Siluvery
2015-06-16 20:25 ` Chris Wilson
@ 2015-06-16 20:33 ` Chris Wilson
2015-06-17 9:38 ` Siluvery, Arun
1 sibling, 1 reply; 19+ messages in thread
From: Chris Wilson @ 2015-06-16 20:33 UTC (permalink / raw)
To: Arun Siluvery; +Cc: intel-gfx
On Tue, Jun 16, 2015 at 08:25:20PM +0100, Arun Siluvery wrote:
> +static int lrc_setup_wa_ctx_obj(struct intel_engine_cs *ring, u32 size)
> +{
> + int ret;
> + struct drm_device *dev = ring->dev;
You only use it once, keeping it as a local seems counter-intuitive.
> + WARN_ON(ring->id != RCS);
> +
> + size = roundup(size, PAGE_SIZE);
Out of curiousity is gcc smart enough to turn this into an ALIGN()?
> + ring->wa_ctx.obj = i915_gem_alloc_object(dev, size);
> + if (!ring->wa_ctx.obj) {
> + DRM_DEBUG_DRIVER("alloc LRC WA ctx backing obj failed.\n");
> + return -ENOMEM;
> + }
> +
> + ret = i915_gem_obj_ggtt_pin(ring->wa_ctx.obj, GEN8_LR_CONTEXT_ALIGN, 0);
Strange choice of alignment since we pass around cacheline offsets.
> + if (ret) {
> + DRM_DEBUG_DRIVER("pin LRC WA ctx backing obj failed: %d\n",
> + ret);
> + drm_gem_object_unreference(&ring->wa_ctx.obj->base);
> + return ret;
> + }
> +
> + return 0;
> +}
> +
> +static void lrc_destroy_wa_ctx_obj(struct intel_engine_cs *ring)
> +{
> + WARN_ON(ring->id != RCS);
> +
> + i915_gem_object_ggtt_unpin(ring->wa_ctx.obj);
> + drm_gem_object_unreference(&ring->wa_ctx.obj->base);
> + ring->wa_ctx.obj = NULL;
> +}
> +
> /**
> * intel_logical_ring_cleanup() - deallocate the Engine Command Streamer
> *
> @@ -1474,7 +1612,29 @@ static int logical_render_ring_init(struct drm_device *dev)
> if (ret)
> return ret;
>
> - return intel_init_pipe_control(ring);
> + if (INTEL_INFO(ring->dev)->gen >= 8) {
> + ret = lrc_setup_wa_ctx_obj(ring, PAGE_SIZE);
> + if (ret) {
> + DRM_DEBUG_DRIVER("Failed to setup context WA page: %d\n",
> + ret);
> + return ret;
> + }
> +
> + ret = intel_init_workaround_bb(ring);
> + if (ret) {
> + lrc_destroy_wa_ctx_obj(ring);
> + DRM_ERROR("WA batch buffers are not initialized: %d\n",
> + ret);
> + }
> + }
> +
> + ret = intel_init_pipe_control(ring);
Did you consider stuffing it into the spare are of the pipe control
scatch bo? :)
-Chris
--
Chris Wilson, Intel Open Source Technology Centre
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/intel-gfx
^ permalink raw reply [flat|nested] 19+ messages in thread* Re: [PATCH v4 1/6] drm/i915/gen8: Add infrastructure to initialize WA batch buffers
2015-06-16 20:33 ` Chris Wilson
@ 2015-06-17 9:38 ` Siluvery, Arun
0 siblings, 0 replies; 19+ messages in thread
From: Siluvery, Arun @ 2015-06-17 9:38 UTC (permalink / raw)
To: Chris Wilson, intel-gfx
On 16/06/2015 21:33, Chris Wilson wrote:
> On Tue, Jun 16, 2015 at 08:25:20PM +0100, Arun Siluvery wrote:
>> +static int lrc_setup_wa_ctx_obj(struct intel_engine_cs *ring, u32 size)
>> +{
>> + int ret;
>> + struct drm_device *dev = ring->dev;
>
> You only use it once, keeping it as a local seems counter-intuitive.
>
>> + WARN_ON(ring->id != RCS);
>> +
>> + size = roundup(size, PAGE_SIZE);
>
> Out of curiousity is gcc smart enough to turn this into an ALIGN()?
replaced with PAGE_ALIGN(size)
>
>> + ring->wa_ctx.obj = i915_gem_alloc_object(dev, size);
>> + if (!ring->wa_ctx.obj) {
>> + DRM_DEBUG_DRIVER("alloc LRC WA ctx backing obj failed.\n");
>> + return -ENOMEM;
>> + }
>> +
>> + ret = i915_gem_obj_ggtt_pin(ring->wa_ctx.obj, GEN8_LR_CONTEXT_ALIGN, 0);
>
> Strange choice of alignment since we pass around cacheline offsets.
>
this is from the initial version where it was part of context, sorry
missed this, replaced with PAGE_SIZE.
>> + if (ret) {
>> + DRM_DEBUG_DRIVER("pin LRC WA ctx backing obj failed: %d\n",
>> + ret);
>> + drm_gem_object_unreference(&ring->wa_ctx.obj->base);
>> + return ret;
>> + }
>> +
>> + return 0;
>> +}
>> +
>> +static void lrc_destroy_wa_ctx_obj(struct intel_engine_cs *ring)
>> +{
>> + WARN_ON(ring->id != RCS);
>> +
>> + i915_gem_object_ggtt_unpin(ring->wa_ctx.obj);
>> + drm_gem_object_unreference(&ring->wa_ctx.obj->base);
>> + ring->wa_ctx.obj = NULL;
>> +}
>> +
>> /**
>> * intel_logical_ring_cleanup() - deallocate the Engine Command Streamer
>> *
>> @@ -1474,7 +1612,29 @@ static int logical_render_ring_init(struct drm_device *dev)
>> if (ret)
>> return ret;
>>
>> - return intel_init_pipe_control(ring);
>> + if (INTEL_INFO(ring->dev)->gen >= 8) {
>> + ret = lrc_setup_wa_ctx_obj(ring, PAGE_SIZE);
>> + if (ret) {
>> + DRM_DEBUG_DRIVER("Failed to setup context WA page: %d\n",
>> + ret);
>> + return ret;
>> + }
>> +
>> + ret = intel_init_workaround_bb(ring);
>> + if (ret) {
>> + lrc_destroy_wa_ctx_obj(ring);
>> + DRM_ERROR("WA batch buffers are not initialized: %d\n",
>> + ret);
>> + }
>> + }
>> +
>> + ret = intel_init_pipe_control(ring);
>
> Did you consider stuffing it into the spare are of the pipe control
> scatch bo? :)
Not exactly but I think it is better to keep them separate. It is not
that a single page is not sufficient even if we add more WA in future
but for logical reasons. In case if there is any error while
initializing these WA we are destroying the page and continuing further
which cannot be done with scratch page.
regards
Arun
> -Chris
>
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/intel-gfx
^ permalink raw reply [flat|nested] 19+ messages in thread
* [PATCH v4 2/6] drm/i915/gen8: Re-order init pipe_control in lrc mode
2015-06-16 19:25 [PATCH v4 0/6] Add Per-context WA using WA batch buffers Arun Siluvery
2015-06-16 19:25 ` [PATCH v4 1/6] drm/i915/gen8: Add infrastructure to initialize " Arun Siluvery
@ 2015-06-16 19:25 ` Arun Siluvery
2015-06-16 19:25 ` [PATCH v4 3/6] drm/i915/gen8: Add WaDisableCtxRestoreArbitration workaround Arun Siluvery
` (3 subsequent siblings)
5 siblings, 0 replies; 19+ messages in thread
From: Arun Siluvery @ 2015-06-16 19:25 UTC (permalink / raw)
To: intel-gfx
Some of the WA applied using WA batch buffers perform writes to scratch page.
In the current flow WA are initialized before scratch obj is allocated.
This patch reorders intel_init_pipe_control() to have a valid scratch obj
before we initialize WA.
Signed-off-by: Michel Thierry <michel.thierry@intel.com>
Signed-off-by: Arun Siluvery <arun.siluvery@linux.intel.com>
---
drivers/gpu/drm/i915/intel_lrc.c | 5 +++--
1 file changed, 3 insertions(+), 2 deletions(-)
diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index cad274a..dcc3f2e 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -1608,7 +1608,8 @@ static int logical_render_ring_init(struct drm_device *dev)
ring->emit_bb_start = gen8_emit_bb_start;
ring->dev = dev;
- ret = logical_ring_init(dev, ring);
+
+ ret = intel_init_pipe_control(ring);
if (ret)
return ret;
@@ -1628,7 +1629,7 @@ static int logical_render_ring_init(struct drm_device *dev)
}
}
- ret = intel_init_pipe_control(ring);
+ ret = logical_ring_init(dev, ring);
if (ret) {
if (ring->wa_ctx.obj)
lrc_destroy_wa_ctx_obj(ring);
--
2.3.0
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/intel-gfx
^ permalink raw reply related [flat|nested] 19+ messages in thread* [PATCH v4 3/6] drm/i915/gen8: Add WaDisableCtxRestoreArbitration workaround
2015-06-16 19:25 [PATCH v4 0/6] Add Per-context WA using WA batch buffers Arun Siluvery
2015-06-16 19:25 ` [PATCH v4 1/6] drm/i915/gen8: Add infrastructure to initialize " Arun Siluvery
2015-06-16 19:25 ` [PATCH v4 2/6] drm/i915/gen8: Re-order init pipe_control in lrc mode Arun Siluvery
@ 2015-06-16 19:25 ` Arun Siluvery
2015-06-16 19:25 ` [PATCH v4 4/6] drm/i915/gen8: Add WaFlushCoherentL3CacheLinesAtContextSwitch workaround Arun Siluvery
` (2 subsequent siblings)
5 siblings, 0 replies; 19+ messages in thread
From: Arun Siluvery @ 2015-06-16 19:25 UTC (permalink / raw)
To: intel-gfx
In Indirect and Per context w/a batch buffer,
+WaDisableCtxRestoreArbitration
Signed-off-by: Rafael Barbalho <rafael.barbalho@intel.com>
Signed-off-by: Arun Siluvery <arun.siluvery@linux.intel.com>
---
drivers/gpu/drm/i915/intel_lrc.c | 18 ++++++++----------
1 file changed, 8 insertions(+), 10 deletions(-)
diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index dcc3f2e..c9875f6 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -1091,10 +1091,11 @@ static int gen8_init_indirectctx_bb(struct intel_engine_cs *ring,
index = offset;
- /* FIXME: fill one cacheline with NOOPs.
- * Replace these instructions with WA
- */
- while (index < (offset + 16))
+ /* WaDisableCtxRestoreArbitration:bdw,chv */
+ cmd[index++] = MI_ARB_ON_OFF | MI_ARB_DISABLE;
+
+ /* padding */
+ while (((unsigned long) (cmd + index) % CACHELINE_BYTES) != 0)
cmd[index++] = MI_NOOP;
/*
@@ -1126,13 +1127,10 @@ static int gen8_init_perctx_bb(struct intel_engine_cs *ring,
index = offset;
- /* FIXME: fill one cacheline with NOOPs.
- * Replace these instructions with WA
- */
- while (index < (offset + 16))
- cmd[index++] = MI_NOOP;
+ /* WaDisableCtxRestoreArbitration:bdw,chv */
+ cmd[index++] = MI_ARB_ON_OFF | MI_ARB_ENABLE;
- cmd[index - 1] = MI_BATCH_BUFFER_END;
+ cmd[index++] = MI_BATCH_BUFFER_END;
kunmap_atomic(cmd);
--
2.3.0
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/intel-gfx
^ permalink raw reply related [flat|nested] 19+ messages in thread* [PATCH v4 4/6] drm/i915/gen8: Add WaFlushCoherentL3CacheLinesAtContextSwitch workaround
2015-06-16 19:25 [PATCH v4 0/6] Add Per-context WA using WA batch buffers Arun Siluvery
` (2 preceding siblings ...)
2015-06-16 19:25 ` [PATCH v4 3/6] drm/i915/gen8: Add WaDisableCtxRestoreArbitration workaround Arun Siluvery
@ 2015-06-16 19:25 ` Arun Siluvery
2015-06-16 20:38 ` Chris Wilson
2015-06-16 19:25 ` [PATCH v4 5/6] drm/i915/gen8: Add WaClearSlmSpaceAtContextSwitch workaround Arun Siluvery
2015-06-16 19:25 ` [PATCH v4 6/6] drm/i915/gen8: Add WaRsRestoreWithPerCtxtBb workaround Arun Siluvery
5 siblings, 1 reply; 19+ messages in thread
From: Arun Siluvery @ 2015-06-16 19:25 UTC (permalink / raw)
To: intel-gfx
In Indirect context w/a batch buffer,
+WaFlushCoherentL3CacheLinesAtContextSwitch:bdw
v2: Add LRI commands to set/reset bit that invalidates coherent lines,
update WA to include programming restrictions and exclude CHV as
it is not required (Ville)
Signed-off-by: Rafael Barbalho <rafael.barbalho@intel.com>
Signed-off-by: Arun Siluvery <arun.siluvery@linux.intel.com>
---
drivers/gpu/drm/i915/i915_reg.h | 2 ++
drivers/gpu/drm/i915/intel_lrc.c | 23 +++++++++++++++++++++++
2 files changed, 25 insertions(+)
diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h
index 84af255..d14ad20 100644
--- a/drivers/gpu/drm/i915/i915_reg.h
+++ b/drivers/gpu/drm/i915/i915_reg.h
@@ -426,6 +426,7 @@
#define PIPE_CONTROL_INDIRECT_STATE_DISABLE (1<<9)
#define PIPE_CONTROL_NOTIFY (1<<8)
#define PIPE_CONTROL_FLUSH_ENABLE (1<<7) /* gen7+ */
+#define PIPE_CONTROL_DC_FLUSH_ENABLE (1<<5)
#define PIPE_CONTROL_VF_CACHE_INVALIDATE (1<<4)
#define PIPE_CONTROL_CONST_CACHE_INVALIDATE (1<<3)
#define PIPE_CONTROL_STATE_CACHE_INVALIDATE (1<<2)
@@ -5788,6 +5789,7 @@ enum skl_disp_power_wells {
#define GEN8_L3SQCREG4 0xb118
#define GEN8_LQSC_RO_PERF_DIS (1<<27)
+#define GEN8_LQSC_FLUSH_COHERENT_LINES (1<<21)
/* GEN8 chicken */
#define HDC_CHICKEN0 0x7300
diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index c9875f6..92556b9 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -1094,6 +1094,29 @@ static int gen8_init_indirectctx_bb(struct intel_engine_cs *ring,
/* WaDisableCtxRestoreArbitration:bdw,chv */
cmd[index++] = MI_ARB_ON_OFF | MI_ARB_DISABLE;
+ /* WaFlushCoherentL3CacheLinesAtContextSwitch:bdw */
+ if (IS_BROADWELL(ring->dev)) {
+ struct drm_i915_private *dev_priv = ring->dev->dev_private;
+
+ cmd[index++] = MI_LOAD_REGISTER_IMM(1);
+ cmd[index++] = GEN8_L3SQCREG4;
+ cmd[index++] = I915_READ(GEN8_L3SQCREG4) |
+ GEN8_LQSC_FLUSH_COHERENT_LINES;
+
+ cmd[index++] = GFX_OP_PIPE_CONTROL(6);
+ cmd[index++] = PIPE_CONTROL_CS_STALL |
+ PIPE_CONTROL_DC_FLUSH_ENABLE;
+ cmd[index++] = 0;
+ cmd[index++] = 0;
+ cmd[index++] = 0;
+ cmd[index++] = 0;
+
+ cmd[index++] = MI_LOAD_REGISTER_IMM(1);
+ cmd[index++] = GEN8_L3SQCREG4;
+ cmd[index++] = I915_READ(GEN8_L3SQCREG4) &
+ ~GEN8_LQSC_FLUSH_COHERENT_LINES;
+ }
+
/* padding */
while (((unsigned long) (cmd + index) % CACHELINE_BYTES) != 0)
cmd[index++] = MI_NOOP;
--
2.3.0
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/intel-gfx
^ permalink raw reply related [flat|nested] 19+ messages in thread* Re: [PATCH v4 4/6] drm/i915/gen8: Add WaFlushCoherentL3CacheLinesAtContextSwitch workaround
2015-06-16 19:25 ` [PATCH v4 4/6] drm/i915/gen8: Add WaFlushCoherentL3CacheLinesAtContextSwitch workaround Arun Siluvery
@ 2015-06-16 20:38 ` Chris Wilson
2015-06-26 16:45 ` Dave Gordon
0 siblings, 1 reply; 19+ messages in thread
From: Chris Wilson @ 2015-06-16 20:38 UTC (permalink / raw)
To: Arun Siluvery; +Cc: intel-gfx
On Tue, Jun 16, 2015 at 08:25:23PM +0100, Arun Siluvery wrote:
> + /* WaFlushCoherentL3CacheLinesAtContextSwitch:bdw */
> + if (IS_BROADWELL(ring->dev)) {
> + struct drm_i915_private *dev_priv = ring->dev->dev_private;
dev_priv = to_i915(ring->dev);
> +
> + cmd[index++] = MI_LOAD_REGISTER_IMM(1);
> + cmd[index++] = GEN8_L3SQCREG4;
> + cmd[index++] = I915_READ(GEN8_L3SQCREG4) |
> + GEN8_LQSC_FLUSH_COHERENT_LINES;
Read the reg once, it is clearer that way.
> +
> + cmd[index++] = GFX_OP_PIPE_CONTROL(6);
> + cmd[index++] = PIPE_CONTROL_CS_STALL |
> + PIPE_CONTROL_DC_FLUSH_ENABLE;
> + cmd[index++] = 0;
> + cmd[index++] = 0;
> + cmd[index++] = 0;
> + cmd[index++] = 0;
> +
> + cmd[index++] = MI_LOAD_REGISTER_IMM(1);
> + cmd[index++] = GEN8_L3SQCREG4;
> + cmd[index++] = I915_READ(GEN8_L3SQCREG4) &
> + ~GEN8_LQSC_FLUSH_COHERENT_LINES;
-Chris
--
Chris Wilson, Intel Open Source Technology Centre
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/intel-gfx
^ permalink raw reply [flat|nested] 19+ messages in thread* Re: [PATCH v4 4/6] drm/i915/gen8: Add WaFlushCoherentL3CacheLinesAtContextSwitch workaround
2015-06-16 20:38 ` Chris Wilson
@ 2015-06-26 16:45 ` Dave Gordon
2015-06-26 16:56 ` Chris Wilson
0 siblings, 1 reply; 19+ messages in thread
From: Dave Gordon @ 2015-06-26 16:45 UTC (permalink / raw)
To: Chris Wilson, Arun Siluvery, intel-gfx
On 16/06/15 21:38, Chris Wilson wrote:
> On Tue, Jun 16, 2015 at 08:25:23PM +0100, Arun Siluvery wrote:
>> + /* WaFlushCoherentL3CacheLinesAtContextSwitch:bdw */
>> + if (IS_BROADWELL(ring->dev)) {
>> + struct drm_i915_private *dev_priv = ring->dev->dev_private;
>
> dev_priv = to_i915(ring->dev);
>
>> +
>> + cmd[index++] = MI_LOAD_REGISTER_IMM(1);
>> + cmd[index++] = GEN8_L3SQCREG4;
>> + cmd[index++] = I915_READ(GEN8_L3SQCREG4) |
>> + GEN8_LQSC_FLUSH_COHERENT_LINES;
>
> Read the reg once, it is clearer that way.
>
>> +
>> + cmd[index++] = GFX_OP_PIPE_CONTROL(6);
>> + cmd[index++] = PIPE_CONTROL_CS_STALL |
>> + PIPE_CONTROL_DC_FLUSH_ENABLE;
>> + cmd[index++] = 0;
>> + cmd[index++] = 0;
>> + cmd[index++] = 0;
>> + cmd[index++] = 0;
>> +
>> + cmd[index++] = MI_LOAD_REGISTER_IMM(1);
>> + cmd[index++] = GEN8_L3SQCREG4;
>> + cmd[index++] = I915_READ(GEN8_L3SQCREG4) &
>> + ~GEN8_LQSC_FLUSH_COHERENT_LINES;
> -Chris
What Chris said. But also, is it even meaningful to read a h/w register
now (when?) and use its value as the basis for future LRI instructions?
How (and when) does this register get its initial value, and does it get
changed at any other time? If the value we put in the register is a
run-time constant, there's really no need to read it back even once.
.Dave.
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/intel-gfx
^ permalink raw reply [flat|nested] 19+ messages in thread* Re: [PATCH v4 4/6] drm/i915/gen8: Add WaFlushCoherentL3CacheLinesAtContextSwitch workaround
2015-06-26 16:45 ` Dave Gordon
@ 2015-06-26 16:56 ` Chris Wilson
0 siblings, 0 replies; 19+ messages in thread
From: Chris Wilson @ 2015-06-26 16:56 UTC (permalink / raw)
To: Dave Gordon; +Cc: intel-gfx
On Fri, Jun 26, 2015 at 05:45:08PM +0100, Dave Gordon wrote:
> On 16/06/15 21:38, Chris Wilson wrote:
> > On Tue, Jun 16, 2015 at 08:25:23PM +0100, Arun Siluvery wrote:
> >> + /* WaFlushCoherentL3CacheLinesAtContextSwitch:bdw */
> >> + if (IS_BROADWELL(ring->dev)) {
> >> + struct drm_i915_private *dev_priv = ring->dev->dev_private;
> >
> > dev_priv = to_i915(ring->dev);
> >
> >> +
> >> + cmd[index++] = MI_LOAD_REGISTER_IMM(1);
> >> + cmd[index++] = GEN8_L3SQCREG4;
> >> + cmd[index++] = I915_READ(GEN8_L3SQCREG4) |
> >> + GEN8_LQSC_FLUSH_COHERENT_LINES;
> >
> > Read the reg once, it is clearer that way.
> >
> >> +
> >> + cmd[index++] = GFX_OP_PIPE_CONTROL(6);
> >> + cmd[index++] = PIPE_CONTROL_CS_STALL |
> >> + PIPE_CONTROL_DC_FLUSH_ENABLE;
> >> + cmd[index++] = 0;
> >> + cmd[index++] = 0;
> >> + cmd[index++] = 0;
> >> + cmd[index++] = 0;
> >> +
> >> + cmd[index++] = MI_LOAD_REGISTER_IMM(1);
> >> + cmd[index++] = GEN8_L3SQCREG4;
> >> + cmd[index++] = I915_READ(GEN8_L3SQCREG4) &
> >> + ~GEN8_LQSC_FLUSH_COHERENT_LINES;
> > -Chris
>
> What Chris said. But also, is it even meaningful to read a h/w register
> now (when?) and use its value as the basis for future LRI instructions?
> How (and when) does this register get its initial value, and does it get
> changed at any other time? If the value we put in the register is a
> run-time constant, there's really no need to read it back even once.
True. To be generic, we need to do STORE_REG_MEM, LRI constant value,
then LOAD_REG_MEM. Hopefully, no userspace ever actually need to twiddle
that register and we can just load a constant value.
-Chris
--
Chris Wilson, Intel Open Source Technology Centre
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/intel-gfx
^ permalink raw reply [flat|nested] 19+ messages in thread
* [PATCH v4 5/6] drm/i915/gen8: Add WaClearSlmSpaceAtContextSwitch workaround
2015-06-16 19:25 [PATCH v4 0/6] Add Per-context WA using WA batch buffers Arun Siluvery
` (3 preceding siblings ...)
2015-06-16 19:25 ` [PATCH v4 4/6] drm/i915/gen8: Add WaFlushCoherentL3CacheLinesAtContextSwitch workaround Arun Siluvery
@ 2015-06-16 19:25 ` Arun Siluvery
2015-06-16 20:35 ` Chris Wilson
2015-06-16 19:25 ` [PATCH v4 6/6] drm/i915/gen8: Add WaRsRestoreWithPerCtxtBb workaround Arun Siluvery
5 siblings, 1 reply; 19+ messages in thread
From: Arun Siluvery @ 2015-06-16 19:25 UTC (permalink / raw)
To: intel-gfx
In Indirect context w/a batch buffer,
WaClearSlmSpaceAtContextSwitch
v2: s/PIPE_CONTROL_FLUSH_RO_CACHES/PIPE_CONTROL_FLUSH_L3 (Ville)
Signed-off-by: Rafael Barbalho <rafael.barbalho@intel.com>
Signed-off-by: Arun Siluvery <arun.siluvery@linux.intel.com>
---
drivers/gpu/drm/i915/i915_reg.h | 1 +
drivers/gpu/drm/i915/intel_lrc.c | 19 +++++++++++++++++++
2 files changed, 20 insertions(+)
diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h
index d14ad20..7637e64 100644
--- a/drivers/gpu/drm/i915/i915_reg.h
+++ b/drivers/gpu/drm/i915/i915_reg.h
@@ -410,6 +410,7 @@
#define DISPLAY_PLANE_A (0<<20)
#define DISPLAY_PLANE_B (1<<20)
#define GFX_OP_PIPE_CONTROL(len) ((0x3<<29)|(0x3<<27)|(0x2<<24)|(len-2))
+#define PIPE_CONTROL_FLUSH_L3 (1<<27)
#define PIPE_CONTROL_GLOBAL_GTT_IVB (1<<24) /* gen7+ */
#define PIPE_CONTROL_MMIO_WRITE (1<<23)
#define PIPE_CONTROL_STORE_DATA_INDEX (1<<21)
diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index 92556b9..27e6692 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -1085,6 +1085,8 @@ static int gen8_init_indirectctx_bb(struct intel_engine_cs *ring,
uint32_t index;
struct page *page;
uint32_t *cmd;
+ u32 scratch_addr;
+ unsigned long flags = 0;
page = i915_gem_object_get_page(ring->wa_ctx.obj, 0);
cmd = kmap_atomic(page);
@@ -1117,6 +1119,23 @@ static int gen8_init_indirectctx_bb(struct intel_engine_cs *ring,
~GEN8_LQSC_FLUSH_COHERENT_LINES;
}
+ /* WaClearSlmSpaceAtContextSwitch:bdw,chv */
+ flags = PIPE_CONTROL_FLUSH_L3 |
+ PIPE_CONTROL_GLOBAL_GTT_IVB |
+ PIPE_CONTROL_CS_STALL |
+ PIPE_CONTROL_QW_WRITE;
+
+ /* Actual scratch location is at 128 bytes offset */
+ scratch_addr = ring->scratch.gtt_offset + 2*CACHELINE_BYTES;
+ scratch_addr |= PIPE_CONTROL_GLOBAL_GTT;
+
+ cmd[index++] = GFX_OP_PIPE_CONTROL(6);
+ cmd[index++] = flags;
+ cmd[index++] = scratch_addr;
+ cmd[index++] = 0;
+ cmd[index++] = 0;
+ cmd[index++] = 0;
+
/* padding */
while (((unsigned long) (cmd + index) % CACHELINE_BYTES) != 0)
cmd[index++] = MI_NOOP;
--
2.3.0
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/intel-gfx
^ permalink raw reply related [flat|nested] 19+ messages in thread* [PATCH v4 6/6] drm/i915/gen8: Add WaRsRestoreWithPerCtxtBb workaround
2015-06-16 19:25 [PATCH v4 0/6] Add Per-context WA using WA batch buffers Arun Siluvery
` (4 preceding siblings ...)
2015-06-16 19:25 ` [PATCH v4 5/6] drm/i915/gen8: Add WaClearSlmSpaceAtContextSwitch workaround Arun Siluvery
@ 2015-06-16 19:25 ` Arun Siluvery
2015-06-16 20:43 ` Chris Wilson
5 siblings, 1 reply; 19+ messages in thread
From: Arun Siluvery @ 2015-06-16 19:25 UTC (permalink / raw)
To: intel-gfx
In Per context w/a batch buffer,
WaRsRestoreWithPerCtxtBb
v2: This patches modifies definitions of MI_LOAD_REGISTER_MEM and
MI_LOAD_REGISTER_REG; Add GEN8 specific defines for these instructions
so as to not break any future users of existing definitions (Michel)
v3: Length defined in current definitions of LRM, LRR instructions is not correct.
Correct and use existing definitions and also move them out of command parser
placeholder to appropriate place.
remove unnecessary padding and follow the WA programming sequence exactly
as mentioned in spec (Dave).
Signed-off-by: Rafael Barbalho <rafael.barbalho@intel.com>
Signed-off-by: Arun Siluvery <arun.siluvery@linux.intel.com>
---
drivers/gpu/drm/i915/i915_reg.h | 29 +++++++++++++++++++--
drivers/gpu/drm/i915/intel_lrc.c | 56 ++++++++++++++++++++++++++++++++++++++++
2 files changed, 83 insertions(+), 2 deletions(-)
diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h
index 7637e64..208620d 100644
--- a/drivers/gpu/drm/i915/i915_reg.h
+++ b/drivers/gpu/drm/i915/i915_reg.h
@@ -347,6 +347,31 @@
#define MI_INVALIDATE_BSD (1<<7)
#define MI_FLUSH_DW_USE_GTT (1<<2)
#define MI_FLUSH_DW_USE_PPGTT (0<<2)
+#define MI_LOAD_REGISTER_MEM MI_INSTR(0x29, 1)
+#define MI_LOAD_REGISTER_MEM_GEN8 MI_INSTR(0x29, 2)
+#define MI_LRM_USE_GLOBAL_GTT (1<<22)
+#define MI_LRM_ASYNC_MODE_ENABLE (1<<21)
+#define MI_LOAD_REGISTER_REG MI_INSTR(0x2A, 1)
+#define MI_ATOMIC(len) MI_INSTR(0x2F, (len-2))
+#define MI_ATOMIC_MEMORY_TYPE_GGTT (1<<22)
+#define MI_ATOMIC_INLINE_DATA (1<<18)
+#define MI_ATOMIC_CS_STALL (1<<17)
+#define MI_ATOMIC_RETURN_DATA_CTL (1<<16)
+#define MI_ATOMIC_OP_MASK(op) ((op) << 8)
+#define MI_ATOMIC_AND MI_ATOMIC_OP_MASK(0x01)
+#define MI_ATOMIC_OR MI_ATOMIC_OP_MASK(0x02)
+#define MI_ATOMIC_XOR MI_ATOMIC_OP_MASK(0x03)
+#define MI_ATOMIC_MOVE MI_ATOMIC_OP_MASK(0x04)
+#define MI_ATOMIC_INC MI_ATOMIC_OP_MASK(0x05)
+#define MI_ATOMIC_DEC MI_ATOMIC_OP_MASK(0x06)
+#define MI_ATOMIC_ADD MI_ATOMIC_OP_MASK(0x07)
+#define MI_ATOMIC_SUB MI_ATOMIC_OP_MASK(0x08)
+#define MI_ATOMIC_RSUB MI_ATOMIC_OP_MASK(0x09)
+#define MI_ATOMIC_IMAX MI_ATOMIC_OP_MASK(0x0A)
+#define MI_ATOMIC_IMIN MI_ATOMIC_OP_MASK(0x0B)
+#define MI_ATOMIC_UMAX MI_ATOMIC_OP_MASK(0x0C)
+#define MI_ATOMIC_UMIN MI_ATOMIC_OP_MASK(0x0D)
+
#define MI_BATCH_BUFFER MI_INSTR(0x30, 1)
#define MI_BATCH_NON_SECURE (1)
/* for snb/ivb/vlv this also means "batch in ppgtt" when ppgtt is enabled. */
@@ -451,8 +476,6 @@
#define MI_CLFLUSH MI_INSTR(0x27, 0)
#define MI_REPORT_PERF_COUNT MI_INSTR(0x28, 0)
#define MI_REPORT_PERF_COUNT_GGTT (1<<0)
-#define MI_LOAD_REGISTER_MEM MI_INSTR(0x29, 0)
-#define MI_LOAD_REGISTER_REG MI_INSTR(0x2A, 0)
#define MI_RS_STORE_DATA_IMM MI_INSTR(0x2B, 0)
#define MI_LOAD_URB_MEM MI_INSTR(0x2C, 0)
#define MI_STORE_URB_MEM MI_INSTR(0x2D, 0)
@@ -1799,6 +1822,8 @@ enum skl_disp_power_wells {
#define GEN8_RC_SEMA_IDLE_MSG_DISABLE (1 << 12)
#define GEN8_FF_DOP_CLOCK_GATE_DISABLE (1<<10)
+#define GEN8_RS_PREEMPT_STATUS 0x215C
+
/* Fuse readout registers for GT */
#define CHV_FUSE_GT (VLV_DISPLAY_BASE + 0x2168)
#define CHV_FGT_DISABLE_SS0 (1 << 10)
diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index 27e6692..ebaba27 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -1163,15 +1163,71 @@ static int gen8_init_perctx_bb(struct intel_engine_cs *ring,
uint32_t index;
struct page *page;
uint32_t *cmd;
+ u32 scratch_addr;
+ unsigned long flags = 0;
page = i915_gem_object_get_page(ring->wa_ctx.obj, 0);
cmd = kmap_atomic(page);
index = offset;
+ /* Actual scratch location is at 128 bytes offset */
+ scratch_addr = ring->scratch.gtt_offset + 2*CACHELINE_BYTES;
+ scratch_addr |= PIPE_CONTROL_GLOBAL_GTT;
+
/* WaDisableCtxRestoreArbitration:bdw,chv */
cmd[index++] = MI_ARB_ON_OFF | MI_ARB_ENABLE;
+ /*
+ * As per Bspec, to workaround a known HW issue, SW must perform the
+ * below programming sequence prior to programming MI_BATCH_BUFFER_END.
+ *
+ * This is only applicable for Gen8.
+ */
+
+ /* WaRsRestoreWithPerCtxtBb:bdw,chv */
+ cmd[index++] = MI_LOAD_REGISTER_IMM(1);
+ cmd[index++] = INSTPM;
+ cmd[index++] = _MASKED_BIT_DISABLE(INSTPM_FORCE_ORDERING);
+
+ flags = MI_ATOMIC_MEMORY_TYPE_GGTT |
+ MI_ATOMIC_INLINE_DATA |
+ MI_ATOMIC_CS_STALL |
+ MI_ATOMIC_RETURN_DATA_CTL |
+ MI_ATOMIC_MOVE;
+
+ cmd[index++] = MI_ATOMIC(5) | flags;
+ cmd[index++] = scratch_addr;
+ cmd[index++] = 0;
+ cmd[index++] = _MASKED_BIT_ENABLE(INSTPM_FORCE_ORDERING);
+ cmd[index++] = _MASKED_BIT_ENABLE(INSTPM_FORCE_ORDERING);
+
+ /*
+ * BSpec says MI_LOAD_REGISTER_MEM, MI_LOAD_REGISTER_REG and
+ * MI_BATCH_BUFFER_END instructions in this sequence need to be
+ * in the same cacheline. To satisfy this case even if more WA are
+ * added in future, pad current cacheline and start remaining sequence
+ * in new cacheline.
+ */
+ while (((unsigned long) (cmd + index) % CACHELINE_BYTES) != 0)
+ cmd[index++] = MI_NOOP;
+
+ cmd[index++] = MI_LOAD_REGISTER_MEM_GEN8 |
+ MI_LRM_USE_GLOBAL_GTT |
+ MI_LRM_ASYNC_MODE_ENABLE;
+ cmd[index++] = INSTPM;
+ cmd[index++] = scratch_addr;
+ cmd[index++] = 0;
+
+ /*
+ * BSpec says there should not be any commands programmed
+ * between MI_LOAD_REGISTER_REG and MI_BATCH_BUFFER_END so
+ * do not add any new commands
+ */
+ cmd[index++] = MI_LOAD_REGISTER_REG;
+ cmd[index++] = GEN8_RS_PREEMPT_STATUS;
+ cmd[index++] = GEN8_RS_PREEMPT_STATUS;
+
cmd[index++] = MI_BATCH_BUFFER_END;
kunmap_atomic(cmd);
--
2.3.0
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/intel-gfx
^ permalink raw reply related [flat|nested] 19+ messages in thread* Re: [PATCH v4 6/6] drm/i915/gen8: Add WaRsRestoreWithPerCtxtBb workaround
2015-06-16 19:25 ` [PATCH v4 6/6] drm/i915/gen8: Add WaRsRestoreWithPerCtxtBb workaround Arun Siluvery
@ 2015-06-16 20:43 ` Chris Wilson
0 siblings, 0 replies; 19+ messages in thread
From: Chris Wilson @ 2015-06-16 20:43 UTC (permalink / raw)
To: Arun Siluvery; +Cc: intel-gfx
On Tue, Jun 16, 2015 at 08:25:25PM +0100, Arun Siluvery wrote:
> v3: Length defined in current definitions of LRM, LRR instructions is not correct.
> Correct and use existing definitions and also move them out of command parser
> placeholder to appropriate place.
Not that it wasn't correct. Common form has to be use 0 for instructions
whose length vary between platforms and then to fill in (X - 2) as
required. (At least that is what we do in userspace.)
> + /* WaRsRestoreWithPerCtxtBb:bdw,chv */
> + cmd[index++] = MI_LOAD_REGISTER_IMM(1);
> + cmd[index++] = INSTPM;
> + cmd[index++] = _MASKED_BIT_DISABLE(INSTPM_FORCE_ORDERING);
> +
> + flags = MI_ATOMIC_MEMORY_TYPE_GGTT |
> + MI_ATOMIC_INLINE_DATA |
> + MI_ATOMIC_CS_STALL |
> + MI_ATOMIC_RETURN_DATA_CTL |
> + MI_ATOMIC_MOVE;
> +
> + cmd[index++] = MI_ATOMIC(5) | flags;
This is very inconsistent style, and indeed flags doesn't make the code
any more readable than simply specifying each in the cmd.
> + cmd[index++] = scratch_addr;
> + cmd[index++] = 0;
> + cmd[index++] = _MASKED_BIT_ENABLE(INSTPM_FORCE_ORDERING);
> + cmd[index++] = _MASKED_BIT_ENABLE(INSTPM_FORCE_ORDERING);
> +
> + /*
> + * BSpec says MI_LOAD_REGISTER_MEM, MI_LOAD_REGISTER_REG and
> + * MI_BATCH_BUFFER_END instructions in this sequence need to be
> + * in the same cacheline. To satisfy this case even if more WA are
> + * added in future, pad current cacheline and start remaining sequence
> + * in new cacheline.
> + */
> + while (((unsigned long) (cmd + index) % CACHELINE_BYTES) != 0)
> + cmd[index++] = MI_NOOP;
> +
> + cmd[index++] = MI_LOAD_REGISTER_MEM_GEN8 |
> + MI_LRM_USE_GLOBAL_GTT |
> + MI_LRM_ASYNC_MODE_ENABLE;
Use brackets to force alignment when it's ugly like this.
-Chris
--
Chris Wilson, Intel Open Source Technology Centre
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/intel-gfx
^ permalink raw reply [flat|nested] 19+ messages in thread