* [PATCH 1/6] drm/i915: Eliminate vmap overhead for cmd parser
@ 2015-10-01 11:57 Chris Wilson
2015-10-01 11:57 ` [PATCH 2/6] drm/i915: Cache last cmd descriptor when parsing Chris Wilson
` (6 more replies)
0 siblings, 7 replies; 12+ messages in thread
From: Chris Wilson @ 2015-10-01 11:57 UTC (permalink / raw)
To: intel-gfx
With a little complexity to handle cmds straddling page boundaries, we
can completely avoid having to vmap the batch and the shadow batch
objects whilst running the command parser.
On ivb i7-3720MQ:
x11perf -dot before 54.3M, after 53.2M (max 203M)
glxgears before 7110 fps, after 7300 fps (max 7860 fps)
Before:
Time to blt 16384 bytes x 1: 12.400µs, 1.2GiB/s
Time to blt 16384 bytes x 4096: 3.055µs, 5.0GiB/s
After:
Time to blt 16384 bytes x 1: 8.600µs, 1.8GiB/s
Time to blt 16384 bytes x 4096: 2.456µs, 6.2GiB/s
Removing the vmap is mostly a win, except we lose in a few cases where
the batch size is greater than a page due to the extra complexity (loss
of a simple cache efficient large copy, and boundary handling).
v2: Reorder so that we do check oacontrol remaining set at end-of-batch
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
drivers/gpu/drm/i915/i915_cmd_parser.c | 298 ++++++++++++++++-----------------
1 file changed, 146 insertions(+), 152 deletions(-)
diff --git a/drivers/gpu/drm/i915/i915_cmd_parser.c b/drivers/gpu/drm/i915/i915_cmd_parser.c
index 237ff6884a22..91e4baa0f2b8 100644
--- a/drivers/gpu/drm/i915/i915_cmd_parser.c
+++ b/drivers/gpu/drm/i915/i915_cmd_parser.c
@@ -852,100 +852,6 @@ find_reg(const struct drm_i915_reg_descriptor *table,
return NULL;
}
-static u32 *vmap_batch(struct drm_i915_gem_object *obj,
- unsigned start, unsigned len)
-{
- int i;
- void *addr = NULL;
- struct sg_page_iter sg_iter;
- int first_page = start >> PAGE_SHIFT;
- int last_page = (len + start + 4095) >> PAGE_SHIFT;
- int npages = last_page - first_page;
- struct page **pages;
-
- pages = drm_malloc_ab(npages, sizeof(*pages));
- if (pages == NULL) {
- DRM_DEBUG_DRIVER("Failed to get space for pages\n");
- goto finish;
- }
-
- i = 0;
- for_each_sg_page(obj->pages->sgl, &sg_iter, obj->pages->nents, first_page) {
- pages[i++] = sg_page_iter_page(&sg_iter);
- if (i == npages)
- break;
- }
-
- addr = vmap(pages, i, 0, PAGE_KERNEL);
- if (addr == NULL) {
- DRM_DEBUG_DRIVER("Failed to vmap pages\n");
- goto finish;
- }
-
-finish:
- if (pages)
- drm_free_large(pages);
- return (u32*)addr;
-}
-
-/* Returns a vmap'd pointer to dest_obj, which the caller must unmap */
-static u32 *copy_batch(struct drm_i915_gem_object *dest_obj,
- struct drm_i915_gem_object *src_obj,
- u32 batch_start_offset,
- u32 batch_len)
-{
- int needs_clflush = 0;
- void *src_base, *src;
- void *dst = NULL;
- int ret;
-
- if (batch_len > dest_obj->base.size ||
- batch_len + batch_start_offset > src_obj->base.size)
- return ERR_PTR(-E2BIG);
-
- if (WARN_ON(dest_obj->pages_pin_count == 0))
- return ERR_PTR(-ENODEV);
-
- ret = i915_gem_obj_prepare_shmem_read(src_obj, &needs_clflush);
- if (ret) {
- DRM_DEBUG_DRIVER("CMD: failed to prepare shadow batch\n");
- return ERR_PTR(ret);
- }
-
- src_base = vmap_batch(src_obj, batch_start_offset, batch_len);
- if (!src_base) {
- DRM_DEBUG_DRIVER("CMD: Failed to vmap batch\n");
- ret = -ENOMEM;
- goto unpin_src;
- }
-
- ret = i915_gem_object_set_to_cpu_domain(dest_obj, true);
- if (ret) {
- DRM_DEBUG_DRIVER("CMD: Failed to set shadow batch to CPU\n");
- goto unmap_src;
- }
-
- dst = vmap_batch(dest_obj, 0, batch_len);
- if (!dst) {
- DRM_DEBUG_DRIVER("CMD: Failed to vmap shadow batch\n");
- ret = -ENOMEM;
- goto unmap_src;
- }
-
- src = src_base + offset_in_page(batch_start_offset);
- if (needs_clflush)
- drm_clflush_virt_range(src, batch_len);
-
- memcpy(dst, src, batch_len);
-
-unmap_src:
- vunmap(src_base);
-unpin_src:
- i915_gem_object_unpin_pages(src_obj);
-
- return ret ? ERR_PTR(ret) : dst;
-}
-
/**
* i915_needs_cmd_parser() - should a given ring use software command parsing?
* @ring: the ring in question
@@ -1112,16 +1018,35 @@ int i915_parse_cmds(struct intel_engine_cs *ring,
u32 batch_len,
bool is_master)
{
- u32 *cmd, *batch_base, *batch_end;
+ u32 tmp[128];
+ const struct drm_i915_cmd_descriptor *desc;
+ unsigned dst_iter, src_iter;
+ int needs_clflush = 0;
+ struct get_page rewind;
+ void *src, *dst;
+ unsigned in, out;
+ u32 *buf, partial = 0, length;
struct drm_i915_cmd_descriptor default_desc = { 0 };
bool oacontrol_set = false; /* OACONTROL tracking. See check_cmd() */
int ret = 0;
- batch_base = copy_batch(shadow_batch_obj, batch_obj,
- batch_start_offset, batch_len);
- if (IS_ERR(batch_base)) {
- DRM_DEBUG_DRIVER("CMD: Failed to copy batch\n");
- return PTR_ERR(batch_base);
+ if (batch_len > shadow_batch_obj->base.size ||
+ batch_len + batch_start_offset > batch_obj->base.size)
+ return -E2BIG;
+
+ if (WARN_ON(shadow_batch_obj->pages_pin_count == 0))
+ return -ENODEV;
+
+ ret = i915_gem_obj_prepare_shmem_read(batch_obj, &needs_clflush);
+ if (ret) {
+ DRM_DEBUG_DRIVER("CMD: failed to prepare shadow batch\n");
+ return ret;
+ }
+
+ ret = i915_gem_object_set_to_cpu_domain(shadow_batch_obj, true);
+ if (ret) {
+ DRM_DEBUG_DRIVER("CMD: Failed to set shadow batch to CPU\n");
+ goto unpin;
}
/*
@@ -1129,69 +1054,138 @@ int i915_parse_cmds(struct intel_engine_cs *ring,
* large or larger and copy_batch() will write MI_NOPs to the extra
* space. Parsing should be faster in some cases this way.
*/
- batch_end = batch_base + (batch_len / sizeof(*batch_end));
+ ret = -EINVAL;
+ rewind = batch_obj->get_page;
+
+ dst_iter = 0;
+ dst = kmap_atomic(i915_gem_object_get_page(shadow_batch_obj, dst_iter));
+
+ in = offset_in_page(batch_start_offset);
+ out = 0;
+ for (src_iter = batch_start_offset >> PAGE_SHIFT;
+ src_iter < batch_obj->base.size >> PAGE_SHIFT;
+ src_iter++) {
+ u32 this, i, j, k;
+ u32 *cmd, *page_end, *batch_end;
+
+ this = batch_len;
+ if (this > PAGE_SIZE - in)
+ this = PAGE_SIZE - in;
+
+ src = kmap_atomic(i915_gem_object_get_page(batch_obj, src_iter));
+ if (needs_clflush)
+ drm_clflush_virt_range(src + in, this);
+
+ i = this;
+ j = in;
+ do {
+ /* We keep dst around so that we do not blow
+ * the CPU caches immediately after the copy (due
+ * to the kunmap_atomic(dst) doing a TLB flush.
+ */
+ if (out == PAGE_SIZE) {
+ kunmap_atomic(dst);
+ dst = kmap_atomic(i915_gem_object_get_page(shadow_batch_obj, ++dst_iter));
+ out = 0;
+ }
- cmd = batch_base;
- while (cmd < batch_end) {
- const struct drm_i915_cmd_descriptor *desc;
- u32 length;
+ k = i;
+ if (k > PAGE_SIZE - out)
+ k = PAGE_SIZE - out;
+ if (k == PAGE_SIZE)
+ copy_page(dst, src);
+ else
+ memcpy(dst + out, src + j, k);
+
+ out += k;
+ j += k;
+ i -= k;
+ } while (i);
+
+ cmd = src + in;
+ page_end = (void *)cmd + this;
+ batch_end = (void *)cmd + batch_len;
+
+ if (partial) {
+ memcpy(tmp + partial, cmd, (length - partial)*4);
+ cmd -= partial;
+ partial = 0;
+ buf = tmp;
+ goto check;
+ }
- if (*cmd == MI_BATCH_BUFFER_END)
- break;
+ do {
+ if (*cmd == MI_BATCH_BUFFER_END) {
+ if (oacontrol_set) {
+ DRM_DEBUG_DRIVER("CMD: batch set OACONTROL but did not clear it\n");
+ ret = -EINVAL;
+ } else
+ ret = 0;
+ goto unmap;
+ }
- desc = find_cmd(ring, *cmd, &default_desc);
- if (!desc) {
- DRM_DEBUG_DRIVER("CMD: Unrecognized command: 0x%08X\n",
- *cmd);
- ret = -EINVAL;
- break;
- }
+ desc = find_cmd(ring, *cmd, &default_desc);
+ if (!desc) {
+ DRM_DEBUG_DRIVER("CMD: Unrecognized command: 0x%08X\n",
+ *cmd);
+ goto unmap;
+ }
- /*
- * If the batch buffer contains a chained batch, return an
- * error that tells the caller to abort and dispatch the
- * workload as a non-secure batch.
- */
- if (desc->cmd.value == MI_BATCH_BUFFER_START) {
- ret = -EACCES;
- break;
- }
+ /*
+ * If the batch buffer contains a chained batch, return an
+ * error that tells the caller to abort and dispatch the
+ * workload as a non-secure batch.
+ */
+ if (desc->cmd.value == MI_BATCH_BUFFER_START) {
+ ret = -EACCES;
+ goto unmap;
+ }
- if (desc->flags & CMD_DESC_FIXED)
- length = desc->length.fixed;
- else
- length = ((*cmd & desc->length.mask) + LENGTH_BIAS);
-
- if ((batch_end - cmd) < length) {
- DRM_DEBUG_DRIVER("CMD: Command length exceeds batch length: 0x%08X length=%u batchlen=%td\n",
- *cmd,
- length,
- batch_end - cmd);
- ret = -EINVAL;
- break;
- }
+ if (desc->flags & CMD_DESC_FIXED)
+ length = desc->length.fixed;
+ else
+ length = ((*cmd & desc->length.mask) + LENGTH_BIAS);
- if (!check_cmd(ring, desc, cmd, length, is_master,
- &oacontrol_set)) {
- ret = -EINVAL;
- break;
- }
+ if (cmd + length > page_end) {
+ if (length + cmd > batch_end) {
+ DRM_DEBUG_DRIVER("CMD: Command length exceeds batch length: 0x%08X length=%u batchlen=%td\n",
+ *cmd, length, batch_end - cmd);
+ goto unmap;
+ }
- cmd += length;
- }
+ if (WARN_ON(length > sizeof(tmp)/4)) {
+ ret = -ENODEV;
+ goto unmap;
+ }
- if (oacontrol_set) {
- DRM_DEBUG_DRIVER("CMD: batch set OACONTROL but did not clear it\n");
- ret = -EINVAL;
- }
+ partial = page_end - cmd;
+ memcpy(tmp, cmd, partial*4);
+ break;
+ }
- if (cmd >= batch_end) {
- DRM_DEBUG_DRIVER("CMD: Got to the end of the buffer w/o a BBE cmd!\n");
- ret = -EINVAL;
- }
+ buf = cmd;
+check:
+ if (!check_cmd(ring, desc, buf, length, is_master,
+ &oacontrol_set))
+ goto unmap;
- vunmap(batch_base);
+ cmd += length;
+ } while (cmd < page_end);
+
+ batch_len -= this;
+ if (batch_len == 0)
+ break;
+
+ kunmap_atomic(src);
+ in = 0;
+ }
+unmap:
+ kunmap_atomic(src);
+ kunmap_atomic(dst);
+ batch_obj->get_page = rewind;
+unpin:
+ i915_gem_object_unpin_pages(batch_obj);
return ret;
}
--
2.6.0
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/intel-gfx
^ permalink raw reply related [flat|nested] 12+ messages in thread
* [PATCH 2/6] drm/i915: Cache last cmd descriptor when parsing
2015-10-01 11:57 [PATCH 1/6] drm/i915: Eliminate vmap overhead for cmd parser Chris Wilson
@ 2015-10-01 11:57 ` Chris Wilson
2015-10-01 11:57 ` [PATCH 3/6] drm/i915: Use WC copies on !llc platforms for the command parser Chris Wilson
` (5 subsequent siblings)
6 siblings, 0 replies; 12+ messages in thread
From: Chris Wilson @ 2015-10-01 11:57 UTC (permalink / raw)
To: intel-gfx
The cmd parser has the biggest impact on the BLT ring, because it is
relatively verbose compared to the other engines as the vertex data is
inline. It also typically has runs of repeating commands (again since
the vertex data is inline, it typically has sequences of XY_SETUP_BLT,
XY_SCANLINE_BLT..) We can easily reduce the impact of cmdparsing on
benchmarks by then caching the last descriptor and comparing it against
the next command header. To get maximum benefit, we also want to take
advantage of skipping a few validations and length determinations if the
header is unchanged between commands.
ivb i7-3720QM:
x11perf -dot: before 52.3M, after 124M (max 203M)
glxgears: before 7310 fps, after 7550 fps (max 7860 fps)
v2: Fix initial cached cmd descriptor to match MI_NOOP.
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
drivers/gpu/drm/i915/i915_cmd_parser.c | 129 +++++++++++++++------------------
drivers/gpu/drm/i915/i915_drv.h | 10 +--
2 files changed, 62 insertions(+), 77 deletions(-)
diff --git a/drivers/gpu/drm/i915/i915_cmd_parser.c b/drivers/gpu/drm/i915/i915_cmd_parser.c
index 91e4baa0f2b8..0e826bec7942 100644
--- a/drivers/gpu/drm/i915/i915_cmd_parser.c
+++ b/drivers/gpu/drm/i915/i915_cmd_parser.c
@@ -786,6 +786,9 @@ void i915_cmd_parser_fini_ring(struct intel_engine_cs *ring)
fini_hash_table(ring);
}
+/*
+ * Returns a pointer to a descriptor for the command specified by cmd_header.
+ */
static const struct drm_i915_cmd_descriptor*
find_cmd_in_table(struct intel_engine_cs *ring,
u32 cmd_header)
@@ -805,37 +808,6 @@ find_cmd_in_table(struct intel_engine_cs *ring,
return NULL;
}
-/*
- * Returns a pointer to a descriptor for the command specified by cmd_header.
- *
- * The caller must supply space for a default descriptor via the default_desc
- * parameter. If no descriptor for the specified command exists in the ring's
- * command parser tables, this function fills in default_desc based on the
- * ring's default length encoding and returns default_desc.
- */
-static const struct drm_i915_cmd_descriptor*
-find_cmd(struct intel_engine_cs *ring,
- u32 cmd_header,
- struct drm_i915_cmd_descriptor *default_desc)
-{
- const struct drm_i915_cmd_descriptor *desc;
- u32 mask;
-
- desc = find_cmd_in_table(ring, cmd_header);
- if (desc)
- return desc;
-
- mask = ring->get_cmd_length_mask(cmd_header);
- if (!mask)
- return NULL;
-
- BUG_ON(!default_desc);
- default_desc->flags = CMD_DESC_SKIP;
- default_desc->length.mask = mask;
-
- return default_desc;
-}
-
static const struct drm_i915_reg_descriptor *
find_reg(const struct drm_i915_reg_descriptor *table,
int count, u32 addr)
@@ -878,17 +850,6 @@ static bool check_cmd(const struct intel_engine_cs *ring,
const bool is_master,
bool *oacontrol_set)
{
- if (desc->flags & CMD_DESC_REJECT) {
- DRM_DEBUG_DRIVER("CMD: Rejected command: 0x%08X\n", *cmd);
- return false;
- }
-
- if ((desc->flags & CMD_DESC_MASTER) && !is_master) {
- DRM_DEBUG_DRIVER("CMD: Rejected master-only command: 0x%08X\n",
- *cmd);
- return false;
- }
-
if (desc->flags & CMD_DESC_REGISTER) {
/*
* Get the distance between individual register offset
@@ -1019,14 +980,15 @@ int i915_parse_cmds(struct intel_engine_cs *ring,
bool is_master)
{
u32 tmp[128];
- const struct drm_i915_cmd_descriptor *desc;
+ struct drm_i915_cmd_descriptor default_desc = { CMD_DESC_SKIP };
+ const struct drm_i915_cmd_descriptor *desc = &default_desc;
+ u32 last_cmd_header = 0;
unsigned dst_iter, src_iter;
int needs_clflush = 0;
struct get_page rewind;
void *src, *dst;
unsigned in, out;
- u32 *buf, partial = 0, length;
- struct drm_i915_cmd_descriptor default_desc = { 0 };
+ u32 *buf, partial = 0, length = 1;
bool oacontrol_set = false; /* OACONTROL tracking. See check_cmd() */
int ret = 0;
@@ -1115,37 +1077,60 @@ int i915_parse_cmds(struct intel_engine_cs *ring,
}
do {
- if (*cmd == MI_BATCH_BUFFER_END) {
- if (oacontrol_set) {
- DRM_DEBUG_DRIVER("CMD: batch set OACONTROL but did not clear it\n");
- ret = -EINVAL;
- } else
- ret = 0;
- goto unmap;
- }
+ if (*cmd != last_cmd_header) {
+ if (*cmd == MI_BATCH_BUFFER_END) {
+ if (unlikely(oacontrol_set)) {
+ DRM_DEBUG_DRIVER("CMD: batch set OACONTROL but did not clear it\n");
+ ret = -EINVAL;
+ } else
+ ret = 0;
+ goto unmap;
+ }
- desc = find_cmd(ring, *cmd, &default_desc);
- if (!desc) {
- DRM_DEBUG_DRIVER("CMD: Unrecognized command: 0x%08X\n",
- *cmd);
- goto unmap;
- }
+ desc = find_cmd_in_table(ring, *cmd);
+ if (desc) {
+ if (unlikely(desc->flags & CMD_DESC_REJECT)) {
+ DRM_DEBUG_DRIVER("CMD: Rejected command: 0x%08X\n", *cmd);
+ goto unmap;
+ }
+
+ if (unlikely((desc->flags & CMD_DESC_MASTER) && !is_master)) {
+ DRM_DEBUG_DRIVER("CMD: Rejected master-only command: 0x%08X\n", *cmd);
+ goto unmap;
+ }
+
+ /*
+ * If the batch buffer contains a
+ * chained batch, return an error that
+ * tells the caller to abort and
+ * dispatch the workload as a
+ * non-secure batch.
+ */
+ if (unlikely(desc->cmd.value == MI_BATCH_BUFFER_START)) {
+ ret = -EACCES;
+ goto unmap;
+ }
+
+ if (desc->flags & CMD_DESC_FIXED)
+ length = desc->length.fixed;
+ else
+ length = (*cmd & desc->length.mask) + LENGTH_BIAS;
+ } else {
+ u32 mask = ring->get_cmd_length_mask(*cmd);
+ if (unlikely(!mask)) {
+ DRM_DEBUG_DRIVER("CMD: Unrecognized command: 0x%08X\n", *cmd);
+ goto unmap;
+ }
+
+ default_desc.length.mask = mask;
+ desc = &default_desc;
+
+ length = (*cmd & mask) + LENGTH_BIAS;
+ }
- /*
- * If the batch buffer contains a chained batch, return an
- * error that tells the caller to abort and dispatch the
- * workload as a non-secure batch.
- */
- if (desc->cmd.value == MI_BATCH_BUFFER_START) {
- ret = -EACCES;
- goto unmap;
+ last_cmd_header = *cmd;
}
- if (desc->flags & CMD_DESC_FIXED)
- length = desc->length.fixed;
- else
- length = ((*cmd & desc->length.mask) + LENGTH_BIAS);
-
if (cmd + length > page_end) {
if (length + cmd > batch_end) {
DRM_DEBUG_DRIVER("CMD: Command length exceeds batch length: 0x%08X length=%u batchlen=%td\n",
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 67132a6a2b1a..92d959f37cc7 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -2308,12 +2308,12 @@ struct drm_i915_cmd_descriptor {
* is the DRM master
*/
u32 flags;
+#define CMD_DESC_SKIP (0)
#define CMD_DESC_FIXED (1<<0)
-#define CMD_DESC_SKIP (1<<1)
-#define CMD_DESC_REJECT (1<<2)
-#define CMD_DESC_REGISTER (1<<3)
-#define CMD_DESC_BITMASK (1<<4)
-#define CMD_DESC_MASTER (1<<5)
+#define CMD_DESC_REJECT (1<<1)
+#define CMD_DESC_REGISTER (1<<2)
+#define CMD_DESC_BITMASK (1<<3)
+#define CMD_DESC_MASTER (1<<4)
/*
* The command's unique identification bits and the bitmask to get them.
--
2.6.0
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/intel-gfx
^ permalink raw reply related [flat|nested] 12+ messages in thread
* [PATCH 3/6] drm/i915: Use WC copies on !llc platforms for the command parser
2015-10-01 11:57 [PATCH 1/6] drm/i915: Eliminate vmap overhead for cmd parser Chris Wilson
2015-10-01 11:57 ` [PATCH 2/6] drm/i915: Cache last cmd descriptor when parsing Chris Wilson
@ 2015-10-01 11:57 ` Chris Wilson
2015-10-01 11:57 ` [PATCH 4/6] drm/i915: Reduce arithmetic operations during cmd parser lookup Chris Wilson
` (4 subsequent siblings)
6 siblings, 0 replies; 12+ messages in thread
From: Chris Wilson @ 2015-10-01 11:57 UTC (permalink / raw)
To: intel-gfx
Since we blow the TLB caches by using kmap/kunmap, we may as well go the
whole hog and see if declaring our destination page as WC is faster than
keeping it as WB and using clflush. It should be!
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
drivers/gpu/drm/i915/i915_cmd_parser.c | 26 ++++++++++++++++++++------
1 file changed, 20 insertions(+), 6 deletions(-)
diff --git a/drivers/gpu/drm/i915/i915_cmd_parser.c b/drivers/gpu/drm/i915/i915_cmd_parser.c
index 0e826bec7942..f4d4c3132932 100644
--- a/drivers/gpu/drm/i915/i915_cmd_parser.c
+++ b/drivers/gpu/drm/i915/i915_cmd_parser.c
@@ -984,9 +984,10 @@ int i915_parse_cmds(struct intel_engine_cs *ring,
const struct drm_i915_cmd_descriptor *desc = &default_desc;
u32 last_cmd_header = 0;
unsigned dst_iter, src_iter;
- int needs_clflush = 0;
struct get_page rewind;
void *src, *dst;
+ int src_needs_clflush = 0;
+ bool dst_needs_clflush;
unsigned in, out;
u32 *buf, partial = 0, length = 1;
bool oacontrol_set = false; /* OACONTROL tracking. See check_cmd() */
@@ -999,13 +1000,19 @@ int i915_parse_cmds(struct intel_engine_cs *ring,
if (WARN_ON(shadow_batch_obj->pages_pin_count == 0))
return -ENODEV;
- ret = i915_gem_obj_prepare_shmem_read(batch_obj, &needs_clflush);
+ ret = i915_gem_obj_prepare_shmem_read(batch_obj, &src_needs_clflush);
if (ret) {
DRM_DEBUG_DRIVER("CMD: failed to prepare shadow batch\n");
return ret;
}
- ret = i915_gem_object_set_to_cpu_domain(shadow_batch_obj, true);
+ dst_needs_clflush =
+ shadow_batch_obj->base.write_domain != I915_GEM_DOMAIN_CPU &&
+ !INTEL_INFO(shadow_batch_obj->base.dev)->has_llc;
+ if (dst_needs_clflush)
+ ret = i915_gem_object_set_to_gtt_domain(shadow_batch_obj, true);
+ else
+ ret = i915_gem_object_set_to_cpu_domain(shadow_batch_obj, true);
if (ret) {
DRM_DEBUG_DRIVER("CMD: Failed to set shadow batch to CPU\n");
goto unpin;
@@ -1035,7 +1042,7 @@ int i915_parse_cmds(struct intel_engine_cs *ring,
this = PAGE_SIZE - in;
src = kmap_atomic(i915_gem_object_get_page(batch_obj, src_iter));
- if (needs_clflush)
+ if (src_needs_clflush)
drm_clflush_virt_range(src + in, this);
i = this;
@@ -1054,10 +1061,17 @@ int i915_parse_cmds(struct intel_engine_cs *ring,
k = i;
if (k > PAGE_SIZE - out)
k = PAGE_SIZE - out;
- if (k == PAGE_SIZE)
+ if (k == PAGE_SIZE) {
copy_page(dst, src);
- else
+ } else {
+ /* Partial cache lines need clflushing */
+ if (dst_needs_clflush &&
+ (out | k) & (boot_cpu_data.x86_clflush_size - 1))
+ drm_clflush_virt_range(dst + out, k);
memcpy(dst + out, src + j, k);
+ }
+ if (dst_needs_clflush)
+ drm_clflush_virt_range(dst + out, k);
out += k;
j += k;
--
2.6.0
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/intel-gfx
^ permalink raw reply related [flat|nested] 12+ messages in thread
* [PATCH 4/6] drm/i915: Reduce arithmetic operations during cmd parser lookup
2015-10-01 11:57 [PATCH 1/6] drm/i915: Eliminate vmap overhead for cmd parser Chris Wilson
2015-10-01 11:57 ` [PATCH 2/6] drm/i915: Cache last cmd descriptor when parsing Chris Wilson
2015-10-01 11:57 ` [PATCH 3/6] drm/i915: Use WC copies on !llc platforms for the command parser Chris Wilson
@ 2015-10-01 11:57 ` Chris Wilson
2015-10-01 11:57 ` [PATCH 5/6] drm/i915: Reduce pointer indirection " Chris Wilson
` (3 subsequent siblings)
6 siblings, 0 replies; 12+ messages in thread
From: Chris Wilson @ 2015-10-01 11:57 UTC (permalink / raw)
To: intel-gfx
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
drivers/gpu/drm/i915/i915_cmd_parser.c | 5 +----
1 file changed, 1 insertion(+), 4 deletions(-)
diff --git a/drivers/gpu/drm/i915/i915_cmd_parser.c b/drivers/gpu/drm/i915/i915_cmd_parser.c
index f4d4c3132932..4fc995b2729d 100644
--- a/drivers/gpu/drm/i915/i915_cmd_parser.c
+++ b/drivers/gpu/drm/i915/i915_cmd_parser.c
@@ -798,10 +798,7 @@ find_cmd_in_table(struct intel_engine_cs *ring,
hash_for_each_possible(ring->cmd_hash, desc_node, node,
cmd_header & CMD_HASH_MASK) {
const struct drm_i915_cmd_descriptor *desc = desc_node->desc;
- u32 masked_cmd = desc->cmd.mask & cmd_header;
- u32 masked_value = desc->cmd.value & desc->cmd.mask;
-
- if (masked_cmd == masked_value)
+ if (((cmd_header ^ desc->cmd.value) & desc->cmd.mask) == 0)
return desc;
}
--
2.6.0
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/intel-gfx
^ permalink raw reply related [flat|nested] 12+ messages in thread
* [PATCH 5/6] drm/i915: Reduce pointer indirection during cmd parser lookup
2015-10-01 11:57 [PATCH 1/6] drm/i915: Eliminate vmap overhead for cmd parser Chris Wilson
` (2 preceding siblings ...)
2015-10-01 11:57 ` [PATCH 4/6] drm/i915: Reduce arithmetic operations during cmd parser lookup Chris Wilson
@ 2015-10-01 11:57 ` Chris Wilson
2015-10-01 11:57 ` [PATCH 6/6] drm/i915: Improve hash function for the command parser Chris Wilson
` (2 subsequent siblings)
6 siblings, 0 replies; 12+ messages in thread
From: Chris Wilson @ 2015-10-01 11:57 UTC (permalink / raw)
To: intel-gfx
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
drivers/gpu/drm/i915/i915_cmd_parser.c | 51 ++++++++--------------------------
drivers/gpu/drm/i915/i915_drv.h | 4 ++-
2 files changed, 14 insertions(+), 41 deletions(-)
diff --git a/drivers/gpu/drm/i915/i915_cmd_parser.c b/drivers/gpu/drm/i915/i915_cmd_parser.c
index 4fc995b2729d..ffba09a90e83 100644
--- a/drivers/gpu/drm/i915/i915_cmd_parser.c
+++ b/drivers/gpu/drm/i915/i915_cmd_parser.c
@@ -113,7 +113,7 @@
/* Command Mask Fixed Len Action
---------------------------------------------------------- */
-static const struct drm_i915_cmd_descriptor common_cmds[] = {
+static struct drm_i915_cmd_descriptor common_cmds[] = {
CMD( MI_NOOP, SMI, F, 1, S ),
CMD( MI_USER_INTERRUPT, SMI, F, 1, R ),
CMD( MI_WAIT_FOR_EVENT, SMI, F, 1, M ),
@@ -146,7 +146,7 @@ static const struct drm_i915_cmd_descriptor common_cmds[] = {
CMD( MI_BATCH_BUFFER_START, SMI, !F, 0xFF, S ),
};
-static const struct drm_i915_cmd_descriptor render_cmds[] = {
+static struct drm_i915_cmd_descriptor render_cmds[] = {
CMD( MI_FLUSH, SMI, F, 1, S ),
CMD( MI_ARB_ON_OFF, SMI, F, 1, R ),
CMD( MI_PREDICATE, SMI, F, 1, S ),
@@ -207,7 +207,7 @@ static const struct drm_i915_cmd_descriptor render_cmds[] = {
}}, ),
};
-static const struct drm_i915_cmd_descriptor hsw_render_cmds[] = {
+static struct drm_i915_cmd_descriptor hsw_render_cmds[] = {
CMD( MI_SET_PREDICATE, SMI, F, 1, S ),
CMD( MI_RS_CONTROL, SMI, F, 1, S ),
CMD( MI_URB_ATOMIC_ALLOC, SMI, F, 1, S ),
@@ -229,7 +229,7 @@ static const struct drm_i915_cmd_descriptor hsw_render_cmds[] = {
CMD( GFX_OP_3DSTATE_BINDING_TABLE_EDIT_PS, S3D, !F, 0x1FF, S ),
};
-static const struct drm_i915_cmd_descriptor video_cmds[] = {
+static struct drm_i915_cmd_descriptor video_cmds[] = {
CMD( MI_ARB_ON_OFF, SMI, F, 1, R ),
CMD( MI_SET_APPID, SMI, F, 1, S ),
CMD( MI_STORE_DWORD_IMM, SMI, !F, 0xFF, B,
@@ -273,7 +273,7 @@ static const struct drm_i915_cmd_descriptor video_cmds[] = {
CMD( MFX_WAIT, SMFX, F, 1, S ),
};
-static const struct drm_i915_cmd_descriptor vecs_cmds[] = {
+static struct drm_i915_cmd_descriptor vecs_cmds[] = {
CMD( MI_ARB_ON_OFF, SMI, F, 1, R ),
CMD( MI_SET_APPID, SMI, F, 1, S ),
CMD( MI_STORE_DWORD_IMM, SMI, !F, 0xFF, B,
@@ -311,7 +311,7 @@ static const struct drm_i915_cmd_descriptor vecs_cmds[] = {
}}, ),
};
-static const struct drm_i915_cmd_descriptor blt_cmds[] = {
+static struct drm_i915_cmd_descriptor blt_cmds[] = {
CMD( MI_DISPLAY_FLIP, SMI, !F, 0xFF, R ),
CMD( MI_STORE_DWORD_IMM, SMI, !F, 0x3FF, B,
.bits = {{
@@ -344,7 +344,7 @@ static const struct drm_i915_cmd_descriptor blt_cmds[] = {
CMD( SRC_COPY_BLT, S2D, !F, 0x3F, S ),
};
-static const struct drm_i915_cmd_descriptor hsw_blt_cmds[] = {
+static struct drm_i915_cmd_descriptor hsw_blt_cmds[] = {
CMD( MI_LOAD_SCAN_LINES_INCL, SMI, !F, 0x3F, M ),
CMD( MI_LOAD_SCAN_LINES_EXCL, SMI, !F, 0x3F, R ),
};
@@ -610,11 +610,6 @@ static bool validate_regs_sorted(struct intel_engine_cs *ring)
ring->master_reg_count);
}
-struct cmd_node {
- const struct drm_i915_cmd_descriptor *desc;
- struct hlist_node node;
-};
-
/*
* Different command ranges have different numbers of bits for the opcode. For
* example, MI commands use bits 31:23 while 3D commands use bits 31:16. The
@@ -643,16 +638,8 @@ static int init_hash_table(struct intel_engine_cs *ring,
const struct drm_i915_cmd_table *table = &cmd_tables[i];
for (j = 0; j < table->count; j++) {
- const struct drm_i915_cmd_descriptor *desc =
- &table->table[j];
- struct cmd_node *desc_node =
- kmalloc(sizeof(*desc_node), GFP_KERNEL);
-
- if (!desc_node)
- return -ENOMEM;
-
- desc_node->desc = desc;
- hash_add(ring->cmd_hash, &desc_node->node,
+ struct drm_i915_cmd_descriptor *desc = &table->table[j];
+ hash_add(ring->cmd_hash, &desc->node[ring->id],
desc->cmd.value & CMD_HASH_MASK);
}
}
@@ -660,18 +647,6 @@ static int init_hash_table(struct intel_engine_cs *ring,
return 0;
}
-static void fini_hash_table(struct intel_engine_cs *ring)
-{
- struct hlist_node *tmp;
- struct cmd_node *desc_node;
- int i;
-
- hash_for_each_safe(ring->cmd_hash, i, tmp, desc_node, node) {
- hash_del(&desc_node->node);
- kfree(desc_node);
- }
-}
-
/**
* i915_cmd_parser_init_ring() - set cmd parser related fields for a ringbuffer
* @ring: the ringbuffer to initialize
@@ -762,7 +737,6 @@ int i915_cmd_parser_init_ring(struct intel_engine_cs *ring)
ret = init_hash_table(ring, cmd_tables, cmd_table_count);
if (ret) {
DRM_ERROR("CMD: cmd_parser_init failed!\n");
- fini_hash_table(ring);
return ret;
}
@@ -782,8 +756,6 @@ void i915_cmd_parser_fini_ring(struct intel_engine_cs *ring)
{
if (!ring->needs_cmd_parser)
return;
-
- fini_hash_table(ring);
}
/*
@@ -793,11 +765,10 @@ static const struct drm_i915_cmd_descriptor*
find_cmd_in_table(struct intel_engine_cs *ring,
u32 cmd_header)
{
- struct cmd_node *desc_node;
+ const struct drm_i915_cmd_descriptor *desc;
- hash_for_each_possible(ring->cmd_hash, desc_node, node,
+ hash_for_each_possible(ring->cmd_hash, desc, node[ring->id],
cmd_header & CMD_HASH_MASK) {
- const struct drm_i915_cmd_descriptor *desc = desc_node->desc;
if (((cmd_header ^ desc->cmd.value) & desc->cmd.mask) == 0)
return desc;
}
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 92d959f37cc7..7e3a731fbaff 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -2371,6 +2371,8 @@ struct drm_i915_cmd_descriptor {
u32 condition_offset;
u32 condition_mask;
} bits[MAX_CMD_DESC_BITMASKS];
+
+ struct hlist_node node[I915_NUM_RINGS];
};
/*
@@ -2380,7 +2382,7 @@ struct drm_i915_cmd_descriptor {
* descriptors, which must be sorted with command opcodes in ascending order.
*/
struct drm_i915_cmd_table {
- const struct drm_i915_cmd_descriptor *table;
+ struct drm_i915_cmd_descriptor *table;
int count;
};
--
2.6.0
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/intel-gfx
^ permalink raw reply related [flat|nested] 12+ messages in thread
* [PATCH 6/6] drm/i915: Improve hash function for the command parser
2015-10-01 11:57 [PATCH 1/6] drm/i915: Eliminate vmap overhead for cmd parser Chris Wilson
` (3 preceding siblings ...)
2015-10-01 11:57 ` [PATCH 5/6] drm/i915: Reduce pointer indirection " Chris Wilson
@ 2015-10-01 11:57 ` Chris Wilson
2015-10-01 12:37 ` [PATCH 1/6] drm/i915: Eliminate vmap overhead for cmd parser Ville Syrjälä
2015-10-06 12:44 ` Daniel Vetter
6 siblings, 0 replies; 12+ messages in thread
From: Chris Wilson @ 2015-10-01 11:57 UTC (permalink / raw)
To: intel-gfx
The existing code's hashfunction is very suboptimal (most 3D commands
use the same bucket degrading the hash to a long list). The code even
acknowledges that the issue was known and the fix simple:
/*
* If we attempt to generate a perfect hash, we should be able to look at bits
* 31:29 of a command from a batch buffer and use the full mask for that
* client. The existing INSTR_CLIENT_MASK/SHIFT defines can be used for this.
*/
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
drivers/gpu/drm/i915/i915_cmd_parser.c | 45 +++++++++++++++++++++-------------
1 file changed, 28 insertions(+), 17 deletions(-)
diff --git a/drivers/gpu/drm/i915/i915_cmd_parser.c b/drivers/gpu/drm/i915/i915_cmd_parser.c
index ffba09a90e83..844b67c55161 100644
--- a/drivers/gpu/drm/i915/i915_cmd_parser.c
+++ b/drivers/gpu/drm/i915/i915_cmd_parser.c
@@ -86,24 +86,24 @@
* general bitmasking mechanism.
*/
-#define STD_MI_OPCODE_MASK 0xFF800000
-#define STD_3D_OPCODE_MASK 0xFFFF0000
-#define STD_2D_OPCODE_MASK 0xFFC00000
-#define STD_MFX_OPCODE_MASK 0xFFFF0000
+#define STD_MI_OPCODE_SHIFT (32 - 9)
+#define STD_3D_OPCODE_SHIFT (32 - 16)
+#define STD_2D_OPCODE_SHIFT (32 - 10)
+#define STD_MFX_OPCODE_SHIFT (32 - 16)
#define CMD(op, opm, f, lm, fl, ...) \
{ \
.flags = (fl) | ((f) ? CMD_DESC_FIXED : 0), \
- .cmd = { (op), (opm) }, \
+ .cmd = { (op), ~0 << (opm) }, \
.length = { (lm) }, \
__VA_ARGS__ \
}
/* Convenience macros to compress the tables */
-#define SMI STD_MI_OPCODE_MASK
-#define S3D STD_3D_OPCODE_MASK
-#define S2D STD_2D_OPCODE_MASK
-#define SMFX STD_MFX_OPCODE_MASK
+#define SMI STD_MI_OPCODE_SHIFT
+#define S3D STD_3D_OPCODE_SHIFT
+#define S2D STD_2D_OPCODE_SHIFT
+#define SMFX STD_MFX_OPCODE_SHIFT
#define F true
#define S CMD_DESC_SKIP
#define R CMD_DESC_REJECT
@@ -619,12 +619,24 @@ static bool validate_regs_sorted(struct intel_engine_cs *ring)
* non-opcode bits being set. But if we don't include those bits, some 3D
* commands may hash to the same bucket due to not including opcode bits that
* make the command unique. For now, we will risk hashing to the same bucket.
- *
- * If we attempt to generate a perfect hash, we should be able to look at bits
- * 31:29 of a command from a batch buffer and use the full mask for that
- * client. The existing INSTR_CLIENT_MASK/SHIFT defines can be used for this.
*/
-#define CMD_HASH_MASK STD_MI_OPCODE_MASK
+static inline u32 cmd_header_key(u32 x)
+{
+ u32 shift;
+ switch (x >> INSTR_CLIENT_SHIFT) {
+ default:
+ case INSTR_MI_CLIENT:
+ shift = STD_MI_OPCODE_SHIFT;
+ break;
+ case INSTR_RC_CLIENT:
+ shift = STD_3D_OPCODE_SHIFT;
+ break;
+ case INSTR_BC_CLIENT:
+ shift = STD_2D_OPCODE_SHIFT;
+ break;
+ }
+ return x >> shift;
+}
static int init_hash_table(struct intel_engine_cs *ring,
const struct drm_i915_cmd_table *cmd_tables,
@@ -640,7 +652,7 @@ static int init_hash_table(struct intel_engine_cs *ring,
for (j = 0; j < table->count; j++) {
struct drm_i915_cmd_descriptor *desc = &table->table[j];
hash_add(ring->cmd_hash, &desc->node[ring->id],
- desc->cmd.value & CMD_HASH_MASK);
+ cmd_header_key(desc->cmd.value));
}
}
@@ -768,10 +780,9 @@ find_cmd_in_table(struct intel_engine_cs *ring,
const struct drm_i915_cmd_descriptor *desc;
hash_for_each_possible(ring->cmd_hash, desc, node[ring->id],
- cmd_header & CMD_HASH_MASK) {
+ cmd_header_key(cmd_header))
if (((cmd_header ^ desc->cmd.value) & desc->cmd.mask) == 0)
return desc;
- }
return NULL;
}
--
2.6.0
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/intel-gfx
^ permalink raw reply related [flat|nested] 12+ messages in thread
* Re: [PATCH 1/6] drm/i915: Eliminate vmap overhead for cmd parser
2015-10-01 11:57 [PATCH 1/6] drm/i915: Eliminate vmap overhead for cmd parser Chris Wilson
` (4 preceding siblings ...)
2015-10-01 11:57 ` [PATCH 6/6] drm/i915: Improve hash function for the command parser Chris Wilson
@ 2015-10-01 12:37 ` Ville Syrjälä
2015-10-01 13:24 ` Chris Wilson
2015-10-06 12:44 ` Daniel Vetter
6 siblings, 1 reply; 12+ messages in thread
From: Ville Syrjälä @ 2015-10-01 12:37 UTC (permalink / raw)
To: Chris Wilson; +Cc: intel-gfx
On Thu, Oct 01, 2015 at 12:57:10PM +0100, Chris Wilson wrote:
> With a little complexity to handle cmds straddling page boundaries, we
> can completely avoiding having to vmap the batch and the shadow batch
> objects whilst running the command parser.
>
> On ivb i7-3720MQ:
>
> x11perf -dot before 54.3M, after 53.2M (max 203M)
> glxgears before 7110 fps, after 7300 fps (max 7860 fps)
>
> Before:
> Time to blt 16384 bytes x 1: 12.400µs, 1.2GiB/s
> Time to blt 16384 bytes x 4096: 3.055µs, 5.0GiB/s
>
> After:
> Time to blt 16384 bytes x 1: 8.600µs, 1.8GiB/s
> Time to blt 16384 bytes x 4096: 2.456µs, 6.2GiB/s
>
> Removing the vmap is mostly a win, except we lose in a few cases where
> the batch size is greater than a page due to the extra complexity (loss
> of a simple cache efficient large copy, and boundary handling).
>
> v2: Reorder so that we do check oacontrol remaining set at end-of-batch
>
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> ---
> drivers/gpu/drm/i915/i915_cmd_parser.c | 298 ++++++++++++++++-----------------
> 1 file changed, 146 insertions(+), 152 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_cmd_parser.c b/drivers/gpu/drm/i915/i915_cmd_parser.c
> index 237ff6884a22..91e4baa0f2b8 100644
> --- a/drivers/gpu/drm/i915/i915_cmd_parser.c
> +++ b/drivers/gpu/drm/i915/i915_cmd_parser.c
> @@ -852,100 +852,6 @@ find_reg(const struct drm_i915_reg_descriptor *table,
> return NULL;
> }
>
> -static u32 *vmap_batch(struct drm_i915_gem_object *obj,
> - unsigned start, unsigned len)
> -{
> - int i;
> - void *addr = NULL;
> - struct sg_page_iter sg_iter;
> - int first_page = start >> PAGE_SHIFT;
> - int last_page = (len + start + 4095) >> PAGE_SHIFT;
> - int npages = last_page - first_page;
> - struct page **pages;
> -
> - pages = drm_malloc_ab(npages, sizeof(*pages));
> - if (pages == NULL) {
> - DRM_DEBUG_DRIVER("Failed to get space for pages\n");
> - goto finish;
> - }
> -
> - i = 0;
> - for_each_sg_page(obj->pages->sgl, &sg_iter, obj->pages->nents, first_page) {
> - pages[i++] = sg_page_iter_page(&sg_iter);
> - if (i == npages)
> - break;
> - }
> -
> - addr = vmap(pages, i, 0, PAGE_KERNEL);
> - if (addr == NULL) {
> - DRM_DEBUG_DRIVER("Failed to vmap pages\n");
> - goto finish;
> - }
> -
> -finish:
> - if (pages)
> - drm_free_large(pages);
> - return (u32*)addr;
> -}
> -
> -/* Returns a vmap'd pointer to dest_obj, which the caller must unmap */
> -static u32 *copy_batch(struct drm_i915_gem_object *dest_obj,
> - struct drm_i915_gem_object *src_obj,
> - u32 batch_start_offset,
> - u32 batch_len)
> -{
> - int needs_clflush = 0;
> - void *src_base, *src;
> - void *dst = NULL;
> - int ret;
> -
> - if (batch_len > dest_obj->base.size ||
> - batch_len + batch_start_offset > src_obj->base.size)
> - return ERR_PTR(-E2BIG);
> -
> - if (WARN_ON(dest_obj->pages_pin_count == 0))
> - return ERR_PTR(-ENODEV);
> -
> - ret = i915_gem_obj_prepare_shmem_read(src_obj, &needs_clflush);
> - if (ret) {
> - DRM_DEBUG_DRIVER("CMD: failed to prepare shadow batch\n");
> - return ERR_PTR(ret);
> - }
> -
> - src_base = vmap_batch(src_obj, batch_start_offset, batch_len);
> - if (!src_base) {
> - DRM_DEBUG_DRIVER("CMD: Failed to vmap batch\n");
> - ret = -ENOMEM;
> - goto unpin_src;
> - }
> -
> - ret = i915_gem_object_set_to_cpu_domain(dest_obj, true);
> - if (ret) {
> - DRM_DEBUG_DRIVER("CMD: Failed to set shadow batch to CPU\n");
> - goto unmap_src;
> - }
> -
> - dst = vmap_batch(dest_obj, 0, batch_len);
> - if (!dst) {
> - DRM_DEBUG_DRIVER("CMD: Failed to vmap shadow batch\n");
> - ret = -ENOMEM;
> - goto unmap_src;
> - }
> -
> - src = src_base + offset_in_page(batch_start_offset);
> - if (needs_clflush)
> - drm_clflush_virt_range(src, batch_len);
> -
> - memcpy(dst, src, batch_len);
> -
> -unmap_src:
> - vunmap(src_base);
> -unpin_src:
> - i915_gem_object_unpin_pages(src_obj);
> -
> - return ret ? ERR_PTR(ret) : dst;
> -}
> -
> /**
> * i915_needs_cmd_parser() - should a given ring use software command parsing?
> * @ring: the ring in question
> @@ -1112,16 +1018,35 @@ int i915_parse_cmds(struct intel_engine_cs *ring,
> u32 batch_len,
> bool is_master)
> {
> - u32 *cmd, *batch_base, *batch_end;
> + u32 tmp[128];
> + const struct drm_i915_cmd_descriptor *desc;
> + unsigned dst_iter, src_iter;
> + int needs_clflush = 0;
> + struct get_page rewind;
> + void *src, *dst;
> + unsigned in, out;
> + u32 *buf, partial = 0, length;
> struct drm_i915_cmd_descriptor default_desc = { 0 };
> bool oacontrol_set = false; /* OACONTROL tracking. See check_cmd() */
> int ret = 0;
>
> - batch_base = copy_batch(shadow_batch_obj, batch_obj,
> - batch_start_offset, batch_len);
> - if (IS_ERR(batch_base)) {
> - DRM_DEBUG_DRIVER("CMD: Failed to copy batch\n");
> - return PTR_ERR(batch_base);
> + if (batch_len > shadow_batch_obj->base.size ||
> + batch_len + batch_start_offset > batch_obj->base.size)
> + return -E2BIG;
> +
> + if (WARN_ON(shadow_batch_obj->pages_pin_count == 0))
> + return -ENODEV;
> +
> + ret = i915_gem_obj_prepare_shmem_read(batch_obj, &needs_clflush);
> + if (ret) {
> + DRM_DEBUG_DRIVER("CMD: failed to prepare shadow batch\n");
> + return ret;
> + }
> +
> + ret = i915_gem_object_set_to_cpu_domain(shadow_batch_obj, true);
> + if (ret) {
> + DRM_DEBUG_DRIVER("CMD: Failed to set shadow batch to CPU\n");
> + goto unpin;
> }
>
> /*
> @@ -1129,69 +1054,138 @@ int i915_parse_cmds(struct intel_engine_cs *ring,
> * large or larger and copy_batch() will write MI_NOPs to the extra
> * space. Parsing should be faster in some cases this way.
> */
> - batch_end = batch_base + (batch_len / sizeof(*batch_end));
> + ret = -EINVAL;
> + rewind = batch_obj->get_page;
> +
> + dst_iter = 0;
> + dst = kmap_atomic(i915_gem_object_get_page(shadow_batch_obj, dst_iter));
> +
> + in = offset_in_page(batch_start_offset);
> + out = 0;
> + for (src_iter = batch_start_offset >> PAGE_SHIFT;
> + src_iter < batch_obj->base.size >> PAGE_SHIFT;
> + src_iter++) {
> + u32 this, i, j, k;
> + u32 *cmd, *page_end, *batch_end;
> +
> + this = batch_len;
> + if (this > PAGE_SIZE - in)
> + this = PAGE_SIZE - in;
> +
> + src = kmap_atomic(i915_gem_object_get_page(batch_obj, src_iter));
> + if (needs_clflush)
> + drm_clflush_virt_range(src + in, this);
> +
> + i = this;
> + j = in;
> + do {
> + /* We keep dst around so that we do not blow
> + * the CPU caches immediately after the copy (due
> + * to the kunmap_atomic(dst) doing a TLB flush.
> + */
> + if (out == PAGE_SIZE) {
> + kunmap_atomic(dst);
> + dst = kmap_atomic(i915_gem_object_get_page(shadow_batch_obj, ++dst_iter));
> + out = 0;
> + }
>
> - cmd = batch_base;
> - while (cmd < batch_end) {
> - const struct drm_i915_cmd_descriptor *desc;
> - u32 length;
> + k = i;
> + if (k > PAGE_SIZE - out)
> + k = PAGE_SIZE - out;
> + if (k == PAGE_SIZE)
> + copy_page(dst, src);
> + else
> + memcpy(dst + out, src + j, k);
> +
> + out += k;
> + j += k;
> + i -= k;
> + } while (i);
> +
> + cmd = src + in;
So you're now checking the src batch? What prevents userspace from
overwriting it with e.g. NOPs between the time you copied it and the
time you check it?
> + page_end = (void *)cmd + this;
> + batch_end = (void *)cmd + batch_len;
> +
> + if (partial) {
> + memcpy(tmp + partial, cmd, (length - partial)*4);
> + cmd -= partial;
> + partial = 0;
> + buf = tmp;
> + goto check;
> + }
>
> - if (*cmd == MI_BATCH_BUFFER_END)
> - break;
> + do {
> + if (*cmd == MI_BATCH_BUFFER_END) {
> + if (oacontrol_set) {
> + DRM_DEBUG_DRIVER("CMD: batch set OACONTROL but did not clear it\n");
> + ret = -EINVAL;
> + } else
> + ret = 0;
> + goto unmap;
> + }
>
> - desc = find_cmd(ring, *cmd, &default_desc);
> - if (!desc) {
> - DRM_DEBUG_DRIVER("CMD: Unrecognized command: 0x%08X\n",
> - *cmd);
> - ret = -EINVAL;
> - break;
> - }
> + desc = find_cmd(ring, *cmd, &default_desc);
> + if (!desc) {
> + DRM_DEBUG_DRIVER("CMD: Unrecognized command: 0x%08X\n",
> + *cmd);
> + goto unmap;
> + }
>
> - /*
> - * If the batch buffer contains a chained batch, return an
> - * error that tells the caller to abort and dispatch the
> - * workload as a non-secure batch.
> - */
> - if (desc->cmd.value == MI_BATCH_BUFFER_START) {
> - ret = -EACCES;
> - break;
> - }
> + /*
> + * If the batch buffer contains a chained batch, return an
> + * error that tells the caller to abort and dispatch the
> + * workload as a non-secure batch.
> + */
> + if (desc->cmd.value == MI_BATCH_BUFFER_START) {
> + ret = -EACCES;
> + goto unmap;
> + }
>
> - if (desc->flags & CMD_DESC_FIXED)
> - length = desc->length.fixed;
> - else
> - length = ((*cmd & desc->length.mask) + LENGTH_BIAS);
> -
> - if ((batch_end - cmd) < length) {
> - DRM_DEBUG_DRIVER("CMD: Command length exceeds batch length: 0x%08X length=%u batchlen=%td\n",
> - *cmd,
> - length,
> - batch_end - cmd);
> - ret = -EINVAL;
> - break;
> - }
> + if (desc->flags & CMD_DESC_FIXED)
> + length = desc->length.fixed;
> + else
> + length = ((*cmd & desc->length.mask) + LENGTH_BIAS);
>
> - if (!check_cmd(ring, desc, cmd, length, is_master,
> - &oacontrol_set)) {
> - ret = -EINVAL;
> - break;
> - }
> + if (cmd + length > page_end) {
> + if (length + cmd > batch_end) {
> + DRM_DEBUG_DRIVER("CMD: Command length exceeds batch length: 0x%08X length=%u batchlen=%td\n",
> + *cmd, length, batch_end - cmd);
> + goto unmap;
> + }
>
> - cmd += length;
> - }
> + if (WARN_ON(length > sizeof(tmp)/4)) {
> + ret = -ENODEV;
> + goto unmap;
> + }
>
> - if (oacontrol_set) {
> - DRM_DEBUG_DRIVER("CMD: batch set OACONTROL but did not clear it\n");
> - ret = -EINVAL;
> - }
> + partial = page_end - cmd;
> + memcpy(tmp, cmd, partial*4);
> + break;
> + }
>
> - if (cmd >= batch_end) {
> - DRM_DEBUG_DRIVER("CMD: Got to the end of the buffer w/o a BBE cmd!\n");
> - ret = -EINVAL;
> - }
> + buf = cmd;
> +check:
> + if (!check_cmd(ring, desc, buf, length, is_master,
> + &oacontrol_set))
> + goto unmap;
>
> - vunmap(batch_base);
> + cmd += length;
> + } while (cmd < page_end);
> +
> + batch_len -= this;
> + if (batch_len == 0)
> + break;
> +
> + kunmap_atomic(src);
> + in = 0;
> + }
>
> +unmap:
> + kunmap_atomic(src);
> + kunmap_atomic(dst);
> + batch_obj->get_page = rewind;
> +unpin:
> + i915_gem_object_unpin_pages(batch_obj);
> return ret;
> }
>
> --
> 2.6.0
>
> _______________________________________________
> Intel-gfx mailing list
> Intel-gfx@lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/intel-gfx
--
Ville Syrjälä
Intel OTC
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/intel-gfx
^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [PATCH 1/6] drm/i915: Eliminate vmap overhead for cmd parser
2015-10-01 12:37 ` [PATCH 1/6] drm/i915: Eliminate vmap overhead for cmd parser Ville Syrjälä
@ 2015-10-01 13:24 ` Chris Wilson
2015-10-01 13:29 ` Ville Syrjälä
0 siblings, 1 reply; 12+ messages in thread
From: Chris Wilson @ 2015-10-01 13:24 UTC (permalink / raw)
To: Ville Syrjälä; +Cc: intel-gfx
On Thu, Oct 01, 2015 at 03:37:21PM +0300, Ville Syrjälä wrote:
> On Thu, Oct 01, 2015 at 12:57:10PM +0100, Chris Wilson wrote:
> > - while (cmd < batch_end) {
> > - const struct drm_i915_cmd_descriptor *desc;
> > - u32 length;
> > + k = i;
> > + if (k > PAGE_SIZE - out)
> > + k = PAGE_SIZE - out;
> > + if (k == PAGE_SIZE)
> > + copy_page(dst, src);
> > + else
> > + memcpy(dst + out, src + j, k);
> > +
> > + out += k;
> > + j += k;
> > + i -= k;
> > + } while (i);
> > +
> > + cmd = src + in;
>
> So you're now checking the src batch? What prevents userspace from
> overwriting it with eg. NOPS between the time you copied it and the
> time you check it?
Zilch. I picked src as it was already in the CPU cache, whereas dst will
be WC later. To satisfy you and byt, I need to stage the copy into a
temporary page, scan it, then copy into dst.
The silver lining is that it does remove some lines of code. I'm pretty
confident that the double copy should not be noticed if I remember about
the cache-thrashing of kunmap and carefully manage that.
-Chris
--
Chris Wilson, Intel Open Source Technology Centre
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/intel-gfx
^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [PATCH 1/6] drm/i915: Eliminate vmap overhead for cmd parser
2015-10-01 13:24 ` Chris Wilson
@ 2015-10-01 13:29 ` Ville Syrjälä
2015-10-01 13:39 ` Chris Wilson
0 siblings, 1 reply; 12+ messages in thread
From: Ville Syrjälä @ 2015-10-01 13:29 UTC (permalink / raw)
To: Chris Wilson, intel-gfx
On Thu, Oct 01, 2015 at 02:24:53PM +0100, Chris Wilson wrote:
> On Thu, Oct 01, 2015 at 03:37:21PM +0300, Ville Syrjälä wrote:
> > On Thu, Oct 01, 2015 at 12:57:10PM +0100, Chris Wilson wrote:
> > > - while (cmd < batch_end) {
> > > - const struct drm_i915_cmd_descriptor *desc;
> > > - u32 length;
> > > + k = i;
> > > + if (k > PAGE_SIZE - out)
> > > + k = PAGE_SIZE - out;
> > > + if (k == PAGE_SIZE)
> > > + copy_page(dst, src);
> > > + else
> > > + memcpy(dst + out, src + j, k);
> > > +
> > > + out += k;
> > > + j += k;
> > > + i -= k;
> > > + } while (i);
> > > +
> > > + cmd = src + in;
> >
> > So you're now checking the src batch? What prevents userspace from
> > overwriting it with eg. NOPS between the time you copied it and the
> > time you check it?
>
> Zilch. I picked src as it was already in the CPU cache, whereas dst will
> be WC later. To satisfy you and byt, I need to stage the copy into a
> temporary page, scan it, then copy into dst.
>
> The silver lining is that it does remove some lines of code. I'm pretty
> confident that the double copy should not be noticed if I remember about
> the cache-trashing of kunmap and carefully manage that.
Yeah, I was thinking that optimally we'd do the copy+scan in cacheline
units, but maybe that's too small to make it actually efficient?
--
Ville Syrjälä
Intel OTC
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/intel-gfx
^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [PATCH 1/6] drm/i915: Eliminate vmap overhead for cmd parser
2015-10-01 13:29 ` Ville Syrjälä
@ 2015-10-01 13:39 ` Chris Wilson
0 siblings, 0 replies; 12+ messages in thread
From: Chris Wilson @ 2015-10-01 13:39 UTC (permalink / raw)
To: Ville Syrjälä; +Cc: intel-gfx
On Thu, Oct 01, 2015 at 04:29:48PM +0300, Ville Syrjälä wrote:
> On Thu, Oct 01, 2015 at 02:24:53PM +0100, Chris Wilson wrote:
> > On Thu, Oct 01, 2015 at 03:37:21PM +0300, Ville Syrjälä wrote:
> > > On Thu, Oct 01, 2015 at 12:57:10PM +0100, Chris Wilson wrote:
> > > > - while (cmd < batch_end) {
> > > > - const struct drm_i915_cmd_descriptor *desc;
> > > > - u32 length;
> > > > + k = i;
> > > > + if (k > PAGE_SIZE - out)
> > > > + k = PAGE_SIZE - out;
> > > > + if (k == PAGE_SIZE)
> > > > + copy_page(dst, src);
> > > > + else
> > > > + memcpy(dst + out, src + j, k);
> > > > +
> > > > + out += k;
> > > > + j += k;
> > > > + i -= k;
> > > > + } while (i);
> > > > +
> > > > + cmd = src + in;
> > >
> > > So you're now checking the src batch? What prevents userspace from
> > > overwriting it with eg. NOPS between the time you copied it and the
> > > time you check it?
> >
> > Zilch. I picked src as it was already in the CPU cache, whereas dst will
> > be WC later. To satisfy you and byt, I need to stage the copy into a
> > temporary page, scan it, then copy into dst.
> >
> > The silver lining is that it does remove some lines of code. I'm pretty
> > confident that the double copy should not be noticed if I remember about
> > the cache-trashing of kunmap and carefully manage that.
>
> Yeah, I was thinking that optimally we'd do the copy+scan in cacheline
> units, but maybe that's too small to make it actually efficient?
Yes, the issue there becomes that we need up to 256 bytes (though I
think the largest command is actually ~130 bytes) in order to pass a
complete copy of a command to the validator. If we make the temporary
too small we waste time building up partial commands, and the maximum
sane size is one page. __get_free_page() seems a reasonable first
choice, though a kmalloc() from a fresh slab tends to be faster to
allocate.
-Chris
--
Chris Wilson, Intel Open Source Technology Centre
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/intel-gfx
^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [PATCH 1/6] drm/i915: Eliminate vmap overhead for cmd parser
2015-10-01 11:57 [PATCH 1/6] drm/i915: Eliminate vmap overhead for cmd parser Chris Wilson
` (5 preceding siblings ...)
2015-10-01 12:37 ` [PATCH 1/6] drm/i915: Eliminate vmap overhead for cmd parser Ville Syrjälä
@ 2015-10-06 12:44 ` Daniel Vetter
2015-10-06 13:00 ` Chris Wilson
6 siblings, 1 reply; 12+ messages in thread
From: Daniel Vetter @ 2015-10-06 12:44 UTC (permalink / raw)
To: Chris Wilson; +Cc: intel-gfx
On Thu, Oct 01, 2015 at 12:57:10PM +0100, Chris Wilson wrote:
> With a little complexity to handle cmds straddling page boundaries, we
> can completely avoiding having to vmap the batch and the shadow batch
> objects whilst running the command parser.
>
> On ivb i7-3720MQ:
>
> x11perf -dot before 54.3M, after 53.2M (max 203M)
> glxgears before 7110 fps, after 7300 fps (max 7860 fps)
>
> Before:
> Time to blt 16384 bytes x 1: 12.400µs, 1.2GiB/s
> Time to blt 16384 bytes x 4096: 3.055µs, 5.0GiB/s
>
> After:
> Time to blt 16384 bytes x 1: 8.600µs, 1.8GiB/s
> Time to blt 16384 bytes x 4096: 2.456µs, 6.2GiB/s
Numbers for the overall series (or individual patches would be even
better) are needed. I thought you have this neat script now to do that for
an entire series?
-Daniel
>
> Removing the vmap is mostly a win, except we lose in a few cases where
> the batch size is greater than a page due to the extra complexity (loss
> of a simple cache efficient large copy, and boundary handling).
>
> v2: Reorder so that we do check oacontrol remaining set at end-of-batch
>
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> ---
> drivers/gpu/drm/i915/i915_cmd_parser.c | 298 ++++++++++++++++-----------------
> 1 file changed, 146 insertions(+), 152 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_cmd_parser.c b/drivers/gpu/drm/i915/i915_cmd_parser.c
> index 237ff6884a22..91e4baa0f2b8 100644
> --- a/drivers/gpu/drm/i915/i915_cmd_parser.c
> +++ b/drivers/gpu/drm/i915/i915_cmd_parser.c
> @@ -852,100 +852,6 @@ find_reg(const struct drm_i915_reg_descriptor *table,
> return NULL;
> }
>
> -static u32 *vmap_batch(struct drm_i915_gem_object *obj,
> - unsigned start, unsigned len)
> -{
> - int i;
> - void *addr = NULL;
> - struct sg_page_iter sg_iter;
> - int first_page = start >> PAGE_SHIFT;
> - int last_page = (len + start + 4095) >> PAGE_SHIFT;
> - int npages = last_page - first_page;
> - struct page **pages;
> -
> - pages = drm_malloc_ab(npages, sizeof(*pages));
> - if (pages == NULL) {
> - DRM_DEBUG_DRIVER("Failed to get space for pages\n");
> - goto finish;
> - }
> -
> - i = 0;
> - for_each_sg_page(obj->pages->sgl, &sg_iter, obj->pages->nents, first_page) {
> - pages[i++] = sg_page_iter_page(&sg_iter);
> - if (i == npages)
> - break;
> - }
> -
> - addr = vmap(pages, i, 0, PAGE_KERNEL);
> - if (addr == NULL) {
> - DRM_DEBUG_DRIVER("Failed to vmap pages\n");
> - goto finish;
> - }
> -
> -finish:
> - if (pages)
> - drm_free_large(pages);
> - return (u32*)addr;
> -}
> -
> -/* Returns a vmap'd pointer to dest_obj, which the caller must unmap */
> -static u32 *copy_batch(struct drm_i915_gem_object *dest_obj,
> - struct drm_i915_gem_object *src_obj,
> - u32 batch_start_offset,
> - u32 batch_len)
> -{
> - int needs_clflush = 0;
> - void *src_base, *src;
> - void *dst = NULL;
> - int ret;
> -
> - if (batch_len > dest_obj->base.size ||
> - batch_len + batch_start_offset > src_obj->base.size)
> - return ERR_PTR(-E2BIG);
> -
> - if (WARN_ON(dest_obj->pages_pin_count == 0))
> - return ERR_PTR(-ENODEV);
> -
> - ret = i915_gem_obj_prepare_shmem_read(src_obj, &needs_clflush);
> - if (ret) {
> - DRM_DEBUG_DRIVER("CMD: failed to prepare shadow batch\n");
> - return ERR_PTR(ret);
> - }
> -
> - src_base = vmap_batch(src_obj, batch_start_offset, batch_len);
> - if (!src_base) {
> - DRM_DEBUG_DRIVER("CMD: Failed to vmap batch\n");
> - ret = -ENOMEM;
> - goto unpin_src;
> - }
> -
> - ret = i915_gem_object_set_to_cpu_domain(dest_obj, true);
> - if (ret) {
> - DRM_DEBUG_DRIVER("CMD: Failed to set shadow batch to CPU\n");
> - goto unmap_src;
> - }
> -
> - dst = vmap_batch(dest_obj, 0, batch_len);
> - if (!dst) {
> - DRM_DEBUG_DRIVER("CMD: Failed to vmap shadow batch\n");
> - ret = -ENOMEM;
> - goto unmap_src;
> - }
> -
> - src = src_base + offset_in_page(batch_start_offset);
> - if (needs_clflush)
> - drm_clflush_virt_range(src, batch_len);
> -
> - memcpy(dst, src, batch_len);
> -
> -unmap_src:
> - vunmap(src_base);
> -unpin_src:
> - i915_gem_object_unpin_pages(src_obj);
> -
> - return ret ? ERR_PTR(ret) : dst;
> -}
> -
> /**
> * i915_needs_cmd_parser() - should a given ring use software command parsing?
> * @ring: the ring in question
> @@ -1112,16 +1018,35 @@ int i915_parse_cmds(struct intel_engine_cs *ring,
> u32 batch_len,
> bool is_master)
> {
> - u32 *cmd, *batch_base, *batch_end;
> + u32 tmp[128];
> + const struct drm_i915_cmd_descriptor *desc;
> + unsigned dst_iter, src_iter;
> + int needs_clflush = 0;
> + struct get_page rewind;
> + void *src, *dst;
> + unsigned in, out;
> + u32 *buf, partial = 0, length;
> struct drm_i915_cmd_descriptor default_desc = { 0 };
> bool oacontrol_set = false; /* OACONTROL tracking. See check_cmd() */
> int ret = 0;
>
> - batch_base = copy_batch(shadow_batch_obj, batch_obj,
> - batch_start_offset, batch_len);
> - if (IS_ERR(batch_base)) {
> - DRM_DEBUG_DRIVER("CMD: Failed to copy batch\n");
> - return PTR_ERR(batch_base);
> + if (batch_len > shadow_batch_obj->base.size ||
> + batch_len + batch_start_offset > batch_obj->base.size)
> + return -E2BIG;
> +
> + if (WARN_ON(shadow_batch_obj->pages_pin_count == 0))
> + return -ENODEV;
> +
> + ret = i915_gem_obj_prepare_shmem_read(batch_obj, &needs_clflush);
> + if (ret) {
> + DRM_DEBUG_DRIVER("CMD: failed to prepare shadow batch\n");
> + return ret;
> + }
> +
> + ret = i915_gem_object_set_to_cpu_domain(shadow_batch_obj, true);
> + if (ret) {
> + DRM_DEBUG_DRIVER("CMD: Failed to set shadow batch to CPU\n");
> + goto unpin;
> }
>
> /*
> @@ -1129,69 +1054,138 @@ int i915_parse_cmds(struct intel_engine_cs *ring,
> * large or larger and copy_batch() will write MI_NOPs to the extra
> * space. Parsing should be faster in some cases this way.
> */
> - batch_end = batch_base + (batch_len / sizeof(*batch_end));
> + ret = -EINVAL;
> + rewind = batch_obj->get_page;
> +
> + dst_iter = 0;
> + dst = kmap_atomic(i915_gem_object_get_page(shadow_batch_obj, dst_iter));
> +
> + in = offset_in_page(batch_start_offset);
> + out = 0;
> + for (src_iter = batch_start_offset >> PAGE_SHIFT;
> + src_iter < batch_obj->base.size >> PAGE_SHIFT;
> + src_iter++) {
> + u32 this, i, j, k;
> + u32 *cmd, *page_end, *batch_end;
> +
> + this = batch_len;
> + if (this > PAGE_SIZE - in)
> + this = PAGE_SIZE - in;
> +
> + src = kmap_atomic(i915_gem_object_get_page(batch_obj, src_iter));
> + if (needs_clflush)
> + drm_clflush_virt_range(src + in, this);
> +
> + i = this;
> + j = in;
> + do {
> + /* We keep dst around so that we do not blow
> + * the CPU caches immediately after the copy (due
> + * to the kunmap_atomic(dst) doing a TLB flush.
> + */
> + if (out == PAGE_SIZE) {
> + kunmap_atomic(dst);
> + dst = kmap_atomic(i915_gem_object_get_page(shadow_batch_obj, ++dst_iter));
> + out = 0;
> + }
>
> - cmd = batch_base;
> - while (cmd < batch_end) {
> - const struct drm_i915_cmd_descriptor *desc;
> - u32 length;
> + k = i;
> + if (k > PAGE_SIZE - out)
> + k = PAGE_SIZE - out;
> + if (k == PAGE_SIZE)
> + copy_page(dst, src);
> + else
> + memcpy(dst + out, src + j, k);
> +
> + out += k;
> + j += k;
> + i -= k;
> + } while (i);
> +
> + cmd = src + in;
> + page_end = (void *)cmd + this;
> + batch_end = (void *)cmd + batch_len;
> +
> + if (partial) {
> + memcpy(tmp + partial, cmd, (length - partial)*4);
> + cmd -= partial;
> + partial = 0;
> + buf = tmp;
> + goto check;
> + }
>
> - if (*cmd == MI_BATCH_BUFFER_END)
> - break;
> + do {
> + if (*cmd == MI_BATCH_BUFFER_END) {
> + if (oacontrol_set) {
> + DRM_DEBUG_DRIVER("CMD: batch set OACONTROL but did not clear it\n");
> + ret = -EINVAL;
> + } else
> + ret = 0;
> + goto unmap;
> + }
>
> - desc = find_cmd(ring, *cmd, &default_desc);
> - if (!desc) {
> - DRM_DEBUG_DRIVER("CMD: Unrecognized command: 0x%08X\n",
> - *cmd);
> - ret = -EINVAL;
> - break;
> - }
> + desc = find_cmd(ring, *cmd, &default_desc);
> + if (!desc) {
> + DRM_DEBUG_DRIVER("CMD: Unrecognized command: 0x%08X\n",
> + *cmd);
> + goto unmap;
> + }
>
> - /*
> - * If the batch buffer contains a chained batch, return an
> - * error that tells the caller to abort and dispatch the
> - * workload as a non-secure batch.
> - */
> - if (desc->cmd.value == MI_BATCH_BUFFER_START) {
> - ret = -EACCES;
> - break;
> - }
> + /*
> + * If the batch buffer contains a chained batch, return an
> + * error that tells the caller to abort and dispatch the
> + * workload as a non-secure batch.
> + */
> + if (desc->cmd.value == MI_BATCH_BUFFER_START) {
> + ret = -EACCES;
> + goto unmap;
> + }
>
> - if (desc->flags & CMD_DESC_FIXED)
> - length = desc->length.fixed;
> - else
> - length = ((*cmd & desc->length.mask) + LENGTH_BIAS);
> -
> - if ((batch_end - cmd) < length) {
> - DRM_DEBUG_DRIVER("CMD: Command length exceeds batch length: 0x%08X length=%u batchlen=%td\n",
> - *cmd,
> - length,
> - batch_end - cmd);
> - ret = -EINVAL;
> - break;
> - }
> + if (desc->flags & CMD_DESC_FIXED)
> + length = desc->length.fixed;
> + else
> + length = ((*cmd & desc->length.mask) + LENGTH_BIAS);
>
> - if (!check_cmd(ring, desc, cmd, length, is_master,
> - &oacontrol_set)) {
> - ret = -EINVAL;
> - break;
> - }
> + if (cmd + length > page_end) {
> + if (length + cmd > batch_end) {
> + DRM_DEBUG_DRIVER("CMD: Command length exceeds batch length: 0x%08X length=%u batchlen=%td\n",
> + *cmd, length, batch_end - cmd);
> + goto unmap;
> + }
>
> - cmd += length;
> - }
> + if (WARN_ON(length > sizeof(tmp)/4)) {
> + ret = -ENODEV;
> + goto unmap;
> + }
>
> - if (oacontrol_set) {
> - DRM_DEBUG_DRIVER("CMD: batch set OACONTROL but did not clear it\n");
> - ret = -EINVAL;
> - }
> + partial = page_end - cmd;
> + memcpy(tmp, cmd, partial*4);
> + break;
> + }
>
> - if (cmd >= batch_end) {
> - DRM_DEBUG_DRIVER("CMD: Got to the end of the buffer w/o a BBE cmd!\n");
> - ret = -EINVAL;
> - }
> + buf = cmd;
> +check:
> + if (!check_cmd(ring, desc, buf, length, is_master,
> + &oacontrol_set))
> + goto unmap;
>
> - vunmap(batch_base);
> + cmd += length;
> + } while (cmd < page_end);
> +
> + batch_len -= this;
> + if (batch_len == 0)
> + break;
> +
> + kunmap_atomic(src);
> + in = 0;
> + }
>
> +unmap:
> + kunmap_atomic(src);
> + kunmap_atomic(dst);
> + batch_obj->get_page = rewind;
> +unpin:
> + i915_gem_object_unpin_pages(batch_obj);
> return ret;
> }
>
> --
> 2.6.0
>
> _______________________________________________
> Intel-gfx mailing list
> Intel-gfx@lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/intel-gfx
--
Daniel Vetter
Software Engineer, Intel Corporation
http://blog.ffwll.ch
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/intel-gfx
^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [PATCH 1/6] drm/i915: Eliminate vmap overhead for cmd parser
2015-10-06 12:44 ` Daniel Vetter
@ 2015-10-06 13:00 ` Chris Wilson
0 siblings, 0 replies; 12+ messages in thread
From: Chris Wilson @ 2015-10-06 13:00 UTC (permalink / raw)
To: Daniel Vetter; +Cc: intel-gfx
On Tue, Oct 06, 2015 at 02:44:22PM +0200, Daniel Vetter wrote:
> On Thu, Oct 01, 2015 at 12:57:10PM +0100, Chris Wilson wrote:
> > With a little complexity to handle cmds straddling page boundaries, we
> > can completely avoid having to vmap the batch and the shadow batch
> > objects whilst running the command parser.
> >
> > On ivb i7-3720MQ:
> >
> > x11perf -dot before 54.3M, after 53.2M (max 203M)
> > glxgears before 7110 fps, after 7300 fps (max 7860 fps)
> >
> > Before:
> > Time to blt 16384 bytes x 1: 12.400µs, 1.2GiB/s
> > Time to blt 16384 bytes x 4096: 3.055µs, 5.0GiB/s
> >
> > After:
> > Time to blt 16384 bytes x 1: 8.600µs, 1.8GiB/s
> > Time to blt 16384 bytes x 4096: 2.456µs, 6.2GiB/s
>
> Numbers for the overall series (or individual patches would be even
> better) are needed. I thought you have this neat script now to do that for
> an entire series?
Note that numbers on a patch are for the patch unless otherwise stated.
Double so since these are numbers from when I first posted it and didn't
have anything else to boost cmdparser perf.
I do and I even added the benchmark to demonstrate one case. Then I
forgot to enable the cmdparser in mesa and so its numbers are bunk.
Fancy scripts still can't eliminate pebkac.
However, it did show that we still get the throughput improvement from
killing the vmap even with the temporary copy.
-Chris
--
Chris Wilson, Intel Open Source Technology Centre
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/intel-gfx
^ permalink raw reply [flat|nested] 12+ messages in thread
end of thread, other threads:[~2015-10-06 13:00 UTC | newest]
Thread overview: 12+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2015-10-01 11:57 [PATCH 1/6] drm/i915: Eliminate vmap overhead for cmd parser Chris Wilson
2015-10-01 11:57 ` [PATCH 2/6] drm/i915: Cache last cmd descriptor when parsing Chris Wilson
2015-10-01 11:57 ` [PATCH 3/6] drm/i915: Use WC copies on !llc platforms for the command parser Chris Wilson
2015-10-01 11:57 ` [PATCH 4/6] drm/i915: Reduce arithmetic operations during cmd parser lookup Chris Wilson
2015-10-01 11:57 ` [PATCH 5/6] drm/i915: Reduce pointer indirection " Chris Wilson
2015-10-01 11:57 ` [PATCH 6/6] drm/i915: Improve hash function for the command parser Chris Wilson
2015-10-01 12:37 ` [PATCH 1/6] drm/i915: Eliminate vmap overhead for cmd parser Ville Syrjälä
2015-10-01 13:24 ` Chris Wilson
2015-10-01 13:29 ` Ville Syrjälä
2015-10-01 13:39 ` Chris Wilson
2015-10-06 12:44 ` Daniel Vetter
2015-10-06 13:00 ` Chris Wilson
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox