* [PATCH igt] igt/gem_fence_thresh: Use streaming reads for verify
@ 2017-08-23 12:55 Chris Wilson
2017-08-23 13:14 ` ✗ Fi.CI.BAT: failure for " Patchwork
` (5 more replies)
0 siblings, 6 replies; 8+ messages in thread
From: Chris Wilson @ 2017-08-23 12:55 UTC (permalink / raw)
To: intel-gfx
At the moment, the verify tests use an extremely brutal write-read of
every dword, degrading performance to UC. If we break those up into
cachelines, we can do a wcb write/read at a time instead, roughly 8x
faster. We lose the accuracy of the forced wcb flushes around every dword,
but we are retaining the overall behaviour of checking reads following
writes instead. To compensate, we do check that a single dword write/read
works before using wcb aligned accesses.
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
tests/gem_fence_thrash.c | 116 +++++++++++++++++++++++++++++++++++++++++------
1 file changed, 101 insertions(+), 15 deletions(-)
diff --git a/tests/gem_fence_thrash.c b/tests/gem_fence_thrash.c
index 52095f26..3e1edb73 100644
--- a/tests/gem_fence_thrash.c
+++ b/tests/gem_fence_thrash.c
@@ -30,7 +30,6 @@
#include "config.h"
#endif
-#include "igt.h"
#include <unistd.h>
#include <stdlib.h>
#include <stdio.h>
@@ -43,6 +42,12 @@
#include <pthread.h>
#include "drm.h"
+#include "igt.h"
+#include "igt_x86.h"
+
+#define PAGE_SIZE 4096
+#define CACHELINE 64
+
#define OBJECT_SIZE (128*1024) /* restricted to 1MiB alignment on i915 fences */
/* Before introduction of the LRU list for fences, allocation of a fence for a page
@@ -104,15 +109,78 @@ bo_copy (void *_arg)
return NULL;
}
+#if defined(__x86_64__) && !defined(__clang__)
+#define MOVNT 512
+
+#pragma GCC push_options
+#pragma GCC target("sse4.1")
+
+#include <smmintrin.h>
+__attribute__((noinline))
+static void copy_wc_page(void *dst, void *src)
+{
+ if (igt_x86_features() & SSE4_1) {
+ __m128i *S = (__m128i *)src;
+ __m128i *D = (__m128i *)dst;
+
+ for (int i = 0; i < PAGE_SIZE/CACHELINE; i++) {
+ __m128i tmp[4];
+
+ tmp[0] = _mm_stream_load_si128(S++);
+ tmp[1] = _mm_stream_load_si128(S++);
+ tmp[2] = _mm_stream_load_si128(S++);
+ tmp[3] = _mm_stream_load_si128(S++);
+
+ _mm_store_si128(D++, tmp[0]);
+ _mm_store_si128(D++, tmp[1]);
+ _mm_store_si128(D++, tmp[2]);
+ _mm_store_si128(D++, tmp[3]);
+ }
+ } else
+ memcpy(dst, src, PAGE_SIZE);
+}
+static void copy_wc_cacheline(void *dst, void *src)
+{
+ if (igt_x86_features() & SSE4_1) {
+ __m128i *S = (__m128i *)src;
+ __m128i *D = (__m128i *)dst;
+ __m128i tmp[4];
+
+ tmp[0] = _mm_stream_load_si128(S++);
+ tmp[1] = _mm_stream_load_si128(S++);
+ tmp[2] = _mm_stream_load_si128(S++);
+ tmp[3] = _mm_stream_load_si128(S++);
+
+ _mm_store_si128(D++, tmp[0]);
+ _mm_store_si128(D++, tmp[1]);
+ _mm_store_si128(D++, tmp[2]);
+ _mm_store_si128(D++, tmp[3]);
+ } else
+ memcpy(dst, src, CACHELINE);
+}
+
+#pragma GCC pop_options
+
+#else
+static void copy_wc_page(void *dst, const void *src)
+{
+ memcpy(dst, src, PAGE_SIZE);
+}
+static void copy_wc_cacheline(void *dst, const void *src)
+{
+ memcpy(dst, src, CACHELINE);
+}
+#endif
+
static void
_bo_write_verify(struct test *t)
{
int fd = t->fd;
int i, k;
uint32_t **s;
- uint32_t v;
unsigned int dwords = OBJECT_SIZE >> 2;
const char *tile_str[] = { "none", "x", "y" };
+ uint32_t tmp[PAGE_SIZE/sizeof(uint32_t)];
igt_assert(t->tiling >= 0 && t->tiling <= I915_TILING_Y);
igt_assert_lt(0, t->num_surfaces);
@@ -124,21 +192,39 @@ _bo_write_verify(struct test *t)
s[k] = bo_create(fd, t->tiling);
for (k = 0; k < t->num_surfaces; k++) {
- volatile uint32_t *a = s[k];
-
- for (i = 0; i < dwords; i++) {
- a[i] = i;
- v = a[i];
- igt_assert_f(v == i,
- "tiling %s: write failed at %d (%x)\n",
- tile_str[t->tiling], i, v);
+ uint32_t *a = s[k];
+
+ a[0] = 0xdeadbeef;
+ igt_assert_f(a[0] == 0xdeadbeef,
+ "tiling %s: write failed at start (%x)\n",
+ tile_str[t->tiling], a[0]);
+
+ a[dwords - 1] = 0xc0ffee;
+ igt_assert_f(a[dwords - 1] == 0xc0ffee,
+ "tiling %s: write failed at end (%x)\n",
+ tile_str[t->tiling], a[dwords - 1]);
+
+ for (i = 0; i < dwords; i += CACHELINE/sizeof(uint32_t)) {
+ for (int j = 0; j < CACHELINE/sizeof(uint32_t); j++)
+ a[i + j] = ~(i + j);
+
+ copy_wc_cacheline(tmp, a + i);
+ for (int j = 0; j < CACHELINE/sizeof(uint32_t); j++)
+ igt_assert_f(tmp[j] == ~(i+ j),
+ "tiling %s: write failed at %d (%x)\n",
+ tile_str[t->tiling], i + j, tmp[j]);
+
+ for (int j = 0; j < CACHELINE/sizeof(uint32_t); j++)
+ a[i + j] = i + j;
}
- for (i = 0; i < dwords; i++) {
- v = a[i];
- igt_assert_f(v == i,
- "tiling %s: verify failed at %d (%x)\n",
- tile_str[t->tiling], i, v);
+ for (i = 0; i < dwords; i += PAGE_SIZE/sizeof(uint32_t)) {
+ copy_wc_page(tmp, a + i);
+ for (int j = 0; j < PAGE_SIZE/sizeof(uint32_t); j++) {
+ igt_assert_f(tmp[j] == i + j,
+ "tiling %s: verify failed at %d (%x)\n",
+ tile_str[t->tiling], i + j, tmp[j]);
+ }
}
}
--
2.14.1
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx
^ permalink raw reply related [flat|nested] 8+ messages in thread
* ✗ Fi.CI.BAT: failure for igt/gem_fence_thresh: Use streaming reads for verify
2017-08-23 12:55 [PATCH igt] igt/gem_fence_thresh: Use streaming reads for verify Chris Wilson
@ 2017-08-23 13:14 ` Patchwork
2017-08-25 11:14 ` ✓ Fi.CI.BAT: success " Patchwork
` (4 subsequent siblings)
5 siblings, 0 replies; 8+ messages in thread
From: Patchwork @ 2017-08-23 13:14 UTC (permalink / raw)
To: Chris Wilson; +Cc: intel-gfx
== Series Details ==
Series: igt/gem_fence_thresh: Use streaming reads for verify
URL : https://patchwork.freedesktop.org/series/29208/
State : failure
== Summary ==
IGT patchset tested on top of latest successful build
42b42c99cd9d1b890807ae97cbd1c593396ae051 tests/Makefile.am: Wrap audio test with dedicated conditional
with latest DRM-Tip kernel build CI_DRM_2994
ebd0ddf26a92 drm-tip: 2017y-08m-23d-09h-28m-47s UTC integration manifest
Test kms_cursor_legacy:
Subgroup basic-busy-flip-before-cursor-atomic:
fail -> PASS (fi-snb-2600) fdo#100215
Subgroup basic-flip-after-cursor-varying-size:
pass -> FAIL (fi-hsw-4770)
Subgroup basic-flip-before-cursor-varying-size:
pass -> FAIL (fi-hsw-4770)
Test kms_flip:
Subgroup basic-flip-vs-modeset:
skip -> PASS (fi-skl-x1585l) fdo#101781
Test kms_pipe_crc_basic:
Subgroup suspend-read-crc-pipe-b:
pass -> DMESG-WARN (fi-byt-n2820) fdo#101705
fdo#100215 https://bugs.freedesktop.org/show_bug.cgi?id=100215
fdo#101781 https://bugs.freedesktop.org/show_bug.cgi?id=101781
fdo#101705 https://bugs.freedesktop.org/show_bug.cgi?id=101705
fi-bdw-5557u total:279 pass:268 dwarn:0 dfail:0 fail:0 skip:11 time:451s
fi-bdw-gvtdvm total:279 pass:265 dwarn:0 dfail:0 fail:0 skip:14 time:440s
fi-blb-e6850 total:279 pass:224 dwarn:1 dfail:0 fail:0 skip:54 time:363s
fi-bsw-n3050 total:279 pass:243 dwarn:0 dfail:0 fail:0 skip:36 time:567s
fi-bwr-2160 total:279 pass:184 dwarn:0 dfail:0 fail:0 skip:95 time:253s
fi-bxt-j4205 total:279 pass:260 dwarn:0 dfail:0 fail:0 skip:19 time:525s
fi-byt-j1900 total:279 pass:254 dwarn:1 dfail:0 fail:0 skip:24 time:531s
fi-byt-n2820 total:279 pass:250 dwarn:1 dfail:0 fail:0 skip:28 time:520s
fi-elk-e7500 total:279 pass:230 dwarn:0 dfail:0 fail:0 skip:49 time:437s
fi-glk-2a total:279 pass:260 dwarn:0 dfail:0 fail:0 skip:19 time:617s
fi-hsw-4770 total:279 pass:261 dwarn:0 dfail:0 fail:2 skip:16 time:445s
fi-hsw-4770r total:279 pass:263 dwarn:0 dfail:0 fail:0 skip:16 time:423s
fi-ilk-650 total:279 pass:229 dwarn:0 dfail:0 fail:0 skip:50 time:421s
fi-ivb-3520m total:279 pass:261 dwarn:0 dfail:0 fail:0 skip:18 time:506s
fi-ivb-3770 total:279 pass:261 dwarn:0 dfail:0 fail:0 skip:18 time:478s
fi-kbl-7500u total:279 pass:261 dwarn:0 dfail:0 fail:0 skip:18 time:478s
fi-kbl-7560u total:279 pass:269 dwarn:0 dfail:0 fail:0 skip:10 time:597s
fi-kbl-r total:279 pass:261 dwarn:0 dfail:0 fail:0 skip:18 time:596s
fi-pnv-d510 total:279 pass:223 dwarn:1 dfail:0 fail:0 skip:55 time:528s
fi-skl-6260u total:279 pass:269 dwarn:0 dfail:0 fail:0 skip:10 time:471s
fi-skl-6700k total:279 pass:261 dwarn:0 dfail:0 fail:0 skip:18 time:483s
fi-skl-6770hq total:279 pass:269 dwarn:0 dfail:0 fail:0 skip:10 time:488s
fi-skl-gvtdvm total:279 pass:266 dwarn:0 dfail:0 fail:0 skip:13 time:443s
fi-skl-x1585l total:279 pass:269 dwarn:0 dfail:0 fail:0 skip:10 time:501s
fi-snb-2520m total:279 pass:251 dwarn:0 dfail:0 fail:0 skip:28 time:548s
fi-snb-2600 total:279 pass:250 dwarn:0 dfail:0 fail:0 skip:29 time:404s
== Logs ==
For more details see: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_85/
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx
^ permalink raw reply [flat|nested] 8+ messages in thread
* ✓ Fi.CI.BAT: success for igt/gem_fence_thresh: Use streaming reads for verify
2017-08-23 12:55 [PATCH igt] igt/gem_fence_thresh: Use streaming reads for verify Chris Wilson
2017-08-23 13:14 ` ✗ Fi.CI.BAT: failure for " Patchwork
@ 2017-08-25 11:14 ` Patchwork
2017-08-25 13:14 ` ✗ Fi.CI.IGT: warning " Patchwork
` (3 subsequent siblings)
5 siblings, 0 replies; 8+ messages in thread
From: Patchwork @ 2017-08-25 11:14 UTC (permalink / raw)
To: Chris Wilson; +Cc: intel-gfx
== Series Details ==
Series: igt/gem_fence_thresh: Use streaming reads for verify
URL : https://patchwork.freedesktop.org/series/29208/
State : success
== Summary ==
IGT patchset tested on top of latest successful build
29d488034a50cd6fbad792cae61321995f0ab51c aubdump: Log some information about the execbuf calls
with latest DRM-Tip kernel build CI_DRM_3001
068cd5b2db68 drm-tip: 2017y-08m-24d-22h-49m-38s UTC integration manifest
Test gem_ringfill:
Subgroup basic-default-hang:
dmesg-warn -> INCOMPLETE (fi-blb-e6850) fdo#101600 +1
Test prime_vgem:
Subgroup basic-fence-flip:
incomplete -> SKIP (fi-kbl-7560u)
fdo#101600 https://bugs.freedesktop.org/show_bug.cgi?id=101600
fi-bdw-5557u total:279 pass:268 dwarn:0 dfail:0 fail:0 skip:11 time:456s
fi-bdw-gvtdvm total:279 pass:265 dwarn:0 dfail:0 fail:0 skip:14 time:443s
fi-blb-e6850 total:147 pass:114 dwarn:0 dfail:0 fail:0 skip:32
fi-bsw-n3050 total:279 pass:243 dwarn:0 dfail:0 fail:0 skip:36 time:557s
fi-bwr-2160 total:279 pass:184 dwarn:0 dfail:0 fail:0 skip:95 time:251s
fi-bxt-j4205 total:279 pass:260 dwarn:0 dfail:0 fail:0 skip:19 time:522s
fi-byt-j1900 total:279 pass:254 dwarn:1 dfail:0 fail:0 skip:24 time:521s
fi-byt-n2820 total:279 pass:250 dwarn:1 dfail:0 fail:0 skip:28 time:516s
fi-elk-e7500 total:279 pass:230 dwarn:0 dfail:0 fail:0 skip:49 time:441s
fi-glk-2a total:279 pass:260 dwarn:0 dfail:0 fail:0 skip:19 time:609s
fi-hsw-4770 total:279 pass:263 dwarn:0 dfail:0 fail:0 skip:16 time:446s
fi-hsw-4770r total:279 pass:263 dwarn:0 dfail:0 fail:0 skip:16 time:425s
fi-ilk-650 total:279 pass:229 dwarn:0 dfail:0 fail:0 skip:50 time:424s
fi-ivb-3520m total:279 pass:261 dwarn:0 dfail:0 fail:0 skip:18 time:504s
fi-ivb-3770 total:279 pass:261 dwarn:0 dfail:0 fail:0 skip:18 time:476s
fi-kbl-7500u total:279 pass:261 dwarn:0 dfail:0 fail:0 skip:18 time:479s
fi-kbl-7560u total:279 pass:269 dwarn:0 dfail:0 fail:0 skip:10 time:601s
fi-kbl-r total:279 pass:261 dwarn:0 dfail:0 fail:0 skip:18 time:599s
fi-pnv-d510 total:279 pass:223 dwarn:1 dfail:0 fail:0 skip:55 time:524s
fi-skl-6260u total:279 pass:269 dwarn:0 dfail:0 fail:0 skip:10 time:470s
fi-skl-6700k total:279 pass:261 dwarn:0 dfail:0 fail:0 skip:18 time:491s
fi-skl-6770hq total:279 pass:269 dwarn:0 dfail:0 fail:0 skip:10 time:490s
fi-skl-gvtdvm total:279 pass:266 dwarn:0 dfail:0 fail:0 skip:13 time:443s
fi-skl-x1585l total:279 pass:268 dwarn:0 dfail:0 fail:0 skip:11 time:484s
fi-snb-2520m total:279 pass:251 dwarn:0 dfail:0 fail:0 skip:28 time:547s
fi-snb-2600 total:279 pass:248 dwarn:0 dfail:0 fail:2 skip:29 time:407s
== Logs ==
For more details see: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_97/
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx
^ permalink raw reply [flat|nested] 8+ messages in thread
* ✗ Fi.CI.IGT: warning for igt/gem_fence_thresh: Use streaming reads for verify
2017-08-23 12:55 [PATCH igt] igt/gem_fence_thresh: Use streaming reads for verify Chris Wilson
2017-08-23 13:14 ` ✗ Fi.CI.BAT: failure for " Patchwork
2017-08-25 11:14 ` ✓ Fi.CI.BAT: success " Patchwork
@ 2017-08-25 13:14 ` Patchwork
2017-09-07 18:14 ` [PATCH igt] " Chris Wilson
` (2 subsequent siblings)
5 siblings, 0 replies; 8+ messages in thread
From: Patchwork @ 2017-08-25 13:14 UTC (permalink / raw)
To: Chris Wilson; +Cc: intel-gfx
== Series Details ==
Series: igt/gem_fence_thresh: Use streaming reads for verify
URL : https://patchwork.freedesktop.org/series/29208/
State : warning
== Summary ==
Test kms_setmode:
Subgroup basic:
pass -> FAIL (shard-hsw) fdo#99912
Test perf:
Subgroup blocking:
fail -> PASS (shard-hsw) fdo#102252
Test kms_atomic_transition:
Subgroup plane-all-modeset-transition:
pass -> DMESG-WARN (shard-hsw)
fdo#99912 https://bugs.freedesktop.org/show_bug.cgi?id=99912
fdo#102252 https://bugs.freedesktop.org/show_bug.cgi?id=102252
shard-hsw total:2230 pass:1230 dwarn:1 dfail:0 fail:18 skip:981 time:9459s
== Logs ==
For more details see: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_97/shards.html
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx
^ permalink raw reply [flat|nested] 8+ messages in thread
* Re: [PATCH igt] igt/gem_fence_thresh: Use streaming reads for verify
2017-08-23 12:55 [PATCH igt] igt/gem_fence_thresh: Use streaming reads for verify Chris Wilson
` (2 preceding siblings ...)
2017-08-25 13:14 ` ✗ Fi.CI.IGT: warning " Patchwork
@ 2017-09-07 18:14 ` Chris Wilson
2017-10-06 20:53 ` Chris Wilson
2017-10-09 13:36 ` Joonas Lahtinen
5 siblings, 0 replies; 8+ messages in thread
From: Chris Wilson @ 2017-09-07 18:14 UTC (permalink / raw)
To: intel-gfx
Quoting Chris Wilson (2017-08-23 13:55:55)
> At the moment, the verify tests use an extremely brutal write-read of
> every dword, degrading performance to UC. If we break those up into
> cachelines, we can do a wcb write/read at a time instead, roughly 8x
> faster. We lose the accuracy of the forced wcb flushes around every dword,
> but we are retaining the overall behaviour of checking reads following
> writes instead. To compensate, we do check that a single dword write/read
> before using wcb aligned accesses.
>
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
-> Tumbleweed ->
> ---
> tests/gem_fence_thrash.c | 116 +++++++++++++++++++++++++++++++++++++++++------
> 1 file changed, 101 insertions(+), 15 deletions(-)
>
> diff --git a/tests/gem_fence_thrash.c b/tests/gem_fence_thrash.c
> index 52095f26..3e1edb73 100644
> --- a/tests/gem_fence_thrash.c
> +++ b/tests/gem_fence_thrash.c
> @@ -30,7 +30,6 @@
> #include "config.h"
> #endif
>
> -#include "igt.h"
> #include <unistd.h>
> #include <stdlib.h>
> #include <stdio.h>
> @@ -43,6 +42,12 @@
> #include <pthread.h>
> #include "drm.h"
>
> +#include "igt.h"
> +#include "igt_x86.h"
> +
> +#define PAGE_SIZE 4096
> +#define CACHELINE 64
> +
> #define OBJECT_SIZE (128*1024) /* restricted to 1MiB alignment on i915 fences */
>
> /* Before introduction of the LRU list for fences, allocation of a fence for a page
> @@ -104,15 +109,78 @@ bo_copy (void *_arg)
> return NULL;
> }
>
> +#if defined(__x86_64__) && !defined(__clang__)
> +#define MOVNT 512
> +
> +#pragma GCC push_options
> +#pragma GCC target("sse4.1")
> +
> +#include <smmintrin.h>
> +__attribute__((noinline))
> +static void copy_wc_page(void *dst, void *src)
> +{
> + if (igt_x86_features() & SSE4_1) {
> + __m128i *S = (__m128i *)src;
> + __m128i *D = (__m128i *)dst;
> +
> + for (int i = 0; i < PAGE_SIZE/CACHELINE; i++) {
> + __m128i tmp[4];
> +
> + tmp[0] = _mm_stream_load_si128(S++);
> + tmp[1] = _mm_stream_load_si128(S++);
> + tmp[2] = _mm_stream_load_si128(S++);
> + tmp[3] = _mm_stream_load_si128(S++);
> +
> + _mm_store_si128(D++, tmp[0]);
> + _mm_store_si128(D++, tmp[1]);
> + _mm_store_si128(D++, tmp[2]);
> + _mm_store_si128(D++, tmp[3]);
> + }
> + } else
> + memcpy(dst, src, PAGE_SIZE);
> +}
> +static void copy_wc_cacheline(void *dst, void *src)
> +{
> + if (igt_x86_features() & SSE4_1) {
> + __m128i *S = (__m128i *)src;
> + __m128i *D = (__m128i *)dst;
> + __m128i tmp[4];
> +
> + tmp[0] = _mm_stream_load_si128(S++);
> + tmp[1] = _mm_stream_load_si128(S++);
> + tmp[2] = _mm_stream_load_si128(S++);
> + tmp[3] = _mm_stream_load_si128(S++);
> +
> + _mm_store_si128(D++, tmp[0]);
> + _mm_store_si128(D++, tmp[1]);
> + _mm_store_si128(D++, tmp[2]);
> + _mm_store_si128(D++, tmp[3]);
> + } else
> + memcpy(dst, src, CACHELINE);
> +}
> +
> +#pragma GCC pop_options
> +
> +#else
> +static void copy_wc_page(void *dst, const void *src)
> +{
> + memcpy(dst, src, PAGE_SIZE);
> +}
> +static void copy_wc_cacheline(void *dst, const void *src)
> +{
> + memcpy(dst, src, CACHELINE);
> +}
> +#endif
> +
> static void
> _bo_write_verify(struct test *t)
> {
> int fd = t->fd;
> int i, k;
> uint32_t **s;
> - uint32_t v;
> unsigned int dwords = OBJECT_SIZE >> 2;
> const char *tile_str[] = { "none", "x", "y" };
> + uint32_t tmp[PAGE_SIZE/sizeof(uint32_t)];
>
> igt_assert(t->tiling >= 0 && t->tiling <= I915_TILING_Y);
> igt_assert_lt(0, t->num_surfaces);
> @@ -124,21 +192,39 @@ _bo_write_verify(struct test *t)
> s[k] = bo_create(fd, t->tiling);
>
> for (k = 0; k < t->num_surfaces; k++) {
> - volatile uint32_t *a = s[k];
> -
> - for (i = 0; i < dwords; i++) {
> - a[i] = i;
> - v = a[i];
> - igt_assert_f(v == i,
> - "tiling %s: write failed at %d (%x)\n",
> - tile_str[t->tiling], i, v);
> + uint32_t *a = s[k];
> +
> + a[0] = 0xdeadbeef;
> + igt_assert_f(a[0] == 0xdeadbeef,
> + "tiling %s: write failed at start (%x)\n",
> + tile_str[t->tiling], a[0]);
> +
> + a[dwords - 1] = 0xc0ffee;
> + igt_assert_f(a[dwords - 1] == 0xc0ffee,
> + "tiling %s: write failed at end (%x)\n",
> + tile_str[t->tiling], a[dwords - 1]);
> +
> + for (i = 0; i < dwords; i += CACHELINE/sizeof(uint32_t)) {
> + for (int j = 0; j < CACHELINE/sizeof(uint32_t); j++)
> + a[i + j] = ~(i + j);
> +
> + copy_wc_cacheline(tmp, a + i);
> + for (int j = 0; j < CACHELINE/sizeof(uint32_t); j++)
> + igt_assert_f(tmp[j] == ~(i+ j),
> + "tiling %s: write failed at %d (%x)\n",
> + tile_str[t->tiling], i + j, tmp[j]);
> +
> + for (int j = 0; j < CACHELINE/sizeof(uint32_t); j++)
> + a[i + j] = i + j;
> }
>
> - for (i = 0; i < dwords; i++) {
> - v = a[i];
> - igt_assert_f(v == i,
> - "tiling %s: verify failed at %d (%x)\n",
> - tile_str[t->tiling], i, v);
> + for (i = 0; i < dwords; i += PAGE_SIZE/sizeof(uint32_t)) {
> + copy_wc_page(tmp, a + i);
> + for (int j = 0; j < PAGE_SIZE/sizeof(uint32_t); j++) {
> + igt_assert_f(tmp[j] == i + j,
> + "tiling %s: verify failed at %d (%x)\n",
> + tile_str[t->tiling], i + j, tmp[j]);
> + }
> }
> }
>
> --
> 2.14.1
>
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx
^ permalink raw reply [flat|nested] 8+ messages in thread
* Re: [PATCH igt] igt/gem_fence_thresh: Use streaming reads for verify
2017-08-23 12:55 [PATCH igt] igt/gem_fence_thresh: Use streaming reads for verify Chris Wilson
` (3 preceding siblings ...)
2017-09-07 18:14 ` [PATCH igt] " Chris Wilson
@ 2017-10-06 20:53 ` Chris Wilson
2017-10-09 13:36 ` Joonas Lahtinen
5 siblings, 0 replies; 8+ messages in thread
From: Chris Wilson @ 2017-10-06 20:53 UTC (permalink / raw)
To: intel-gfx
Quoting Chris Wilson (2017-08-23 13:55:55)
> At the moment, the verify tests use an extremely brutal write-read of
> every dword, degrading performance to UC. If we break those up into
> cachelines, we can do a wcb write/read at a time instead, roughly 8x
> faster. We lose the accuracy of the forced wcb flushes around every dword,
> but we are retaining the overall behaviour of checking reads following
> writes instead. To compensate, we do check that a single dword write/read
> before using wcb aligned accesses.
This fixes one of the APL timeouts...
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> ---
> tests/gem_fence_thrash.c | 116 +++++++++++++++++++++++++++++++++++++++++------
> 1 file changed, 101 insertions(+), 15 deletions(-)
>
> diff --git a/tests/gem_fence_thrash.c b/tests/gem_fence_thrash.c
> index 52095f26..3e1edb73 100644
> --- a/tests/gem_fence_thrash.c
> +++ b/tests/gem_fence_thrash.c
> @@ -30,7 +30,6 @@
> #include "config.h"
> #endif
>
> -#include "igt.h"
> #include <unistd.h>
> #include <stdlib.h>
> #include <stdio.h>
> @@ -43,6 +42,12 @@
> #include <pthread.h>
> #include "drm.h"
>
> +#include "igt.h"
> +#include "igt_x86.h"
> +
> +#define PAGE_SIZE 4096
> +#define CACHELINE 64
> +
> #define OBJECT_SIZE (128*1024) /* restricted to 1MiB alignment on i915 fences */
>
> /* Before introduction of the LRU list for fences, allocation of a fence for a page
> @@ -104,15 +109,78 @@ bo_copy (void *_arg)
> return NULL;
> }
>
> +#if defined(__x86_64__) && !defined(__clang__)
> +#define MOVNT 512
> +
> +#pragma GCC push_options
> +#pragma GCC target("sse4.1")
> +
> +#include <smmintrin.h>
> +__attribute__((noinline))
> +static void copy_wc_page(void *dst, void *src)
> +{
> + if (igt_x86_features() & SSE4_1) {
> + __m128i *S = (__m128i *)src;
> + __m128i *D = (__m128i *)dst;
> +
> + for (int i = 0; i < PAGE_SIZE/CACHELINE; i++) {
> + __m128i tmp[4];
> +
> + tmp[0] = _mm_stream_load_si128(S++);
> + tmp[1] = _mm_stream_load_si128(S++);
> + tmp[2] = _mm_stream_load_si128(S++);
> + tmp[3] = _mm_stream_load_si128(S++);
> +
> + _mm_store_si128(D++, tmp[0]);
> + _mm_store_si128(D++, tmp[1]);
> + _mm_store_si128(D++, tmp[2]);
> + _mm_store_si128(D++, tmp[3]);
> + }
> + } else
> + memcpy(dst, src, PAGE_SIZE);
> +}
> +static void copy_wc_cacheline(void *dst, void *src)
> +{
> + if (igt_x86_features() & SSE4_1) {
> + __m128i *S = (__m128i *)src;
> + __m128i *D = (__m128i *)dst;
> + __m128i tmp[4];
> +
> + tmp[0] = _mm_stream_load_si128(S++);
> + tmp[1] = _mm_stream_load_si128(S++);
> + tmp[2] = _mm_stream_load_si128(S++);
> + tmp[3] = _mm_stream_load_si128(S++);
> +
> + _mm_store_si128(D++, tmp[0]);
> + _mm_store_si128(D++, tmp[1]);
> + _mm_store_si128(D++, tmp[2]);
> + _mm_store_si128(D++, tmp[3]);
> + } else
> + memcpy(dst, src, CACHELINE);
> +}
> +
> +#pragma GCC pop_options
> +
> +#else
> +static void copy_wc_page(void *dst, const void *src)
> +{
> + memcpy(dst, src, PAGE_SIZE);
> +}
> +static void copy_wc_cacheline(void *dst, const void *src)
> +{
> + memcpy(dst, src, CACHELINE);
> +}
> +#endif
> +
> static void
> _bo_write_verify(struct test *t)
> {
> int fd = t->fd;
> int i, k;
> uint32_t **s;
> - uint32_t v;
> unsigned int dwords = OBJECT_SIZE >> 2;
> const char *tile_str[] = { "none", "x", "y" };
> + uint32_t tmp[PAGE_SIZE/sizeof(uint32_t)];
>
> igt_assert(t->tiling >= 0 && t->tiling <= I915_TILING_Y);
> igt_assert_lt(0, t->num_surfaces);
> @@ -124,21 +192,39 @@ _bo_write_verify(struct test *t)
> s[k] = bo_create(fd, t->tiling);
>
> for (k = 0; k < t->num_surfaces; k++) {
> - volatile uint32_t *a = s[k];
> -
> - for (i = 0; i < dwords; i++) {
> - a[i] = i;
> - v = a[i];
> - igt_assert_f(v == i,
> - "tiling %s: write failed at %d (%x)\n",
> - tile_str[t->tiling], i, v);
> + uint32_t *a = s[k];
> +
> + a[0] = 0xdeadbeef;
> + igt_assert_f(a[0] == 0xdeadbeef,
> + "tiling %s: write failed at start (%x)\n",
> + tile_str[t->tiling], a[0]);
> +
> + a[dwords - 1] = 0xc0ffee;
> + igt_assert_f(a[dwords - 1] == 0xc0ffee,
> + "tiling %s: write failed at end (%x)\n",
> + tile_str[t->tiling], a[dwords - 1]);
> +
> + for (i = 0; i < dwords; i += CACHELINE/sizeof(uint32_t)) {
> + for (int j = 0; j < CACHELINE/sizeof(uint32_t); j++)
> + a[i + j] = ~(i + j);
> +
> + copy_wc_cacheline(tmp, a + i);
> + for (int j = 0; j < CACHELINE/sizeof(uint32_t); j++)
> + igt_assert_f(tmp[j] == ~(i+ j),
> + "tiling %s: write failed at %d (%x)\n",
> + tile_str[t->tiling], i + j, tmp[j]);
> +
> + for (int j = 0; j < CACHELINE/sizeof(uint32_t); j++)
> + a[i + j] = i + j;
> }
>
> - for (i = 0; i < dwords; i++) {
> - v = a[i];
> - igt_assert_f(v == i,
> - "tiling %s: verify failed at %d (%x)\n",
> - tile_str[t->tiling], i, v);
> + for (i = 0; i < dwords; i += PAGE_SIZE/sizeof(uint32_t)) {
> + copy_wc_page(tmp, a + i);
> + for (int j = 0; j < PAGE_SIZE/sizeof(uint32_t); j++) {
> + igt_assert_f(tmp[j] == i + j,
> + "tiling %s: verify failed at %d (%x)\n",
> + tile_str[t->tiling], i + j, tmp[j]);
> + }
> }
> }
>
> --
> 2.14.1
>
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx
^ permalink raw reply [flat|nested] 8+ messages in thread
* Re: [PATCH igt] igt/gem_fence_thresh: Use streaming reads for verify
2017-08-23 12:55 [PATCH igt] igt/gem_fence_thresh: Use streaming reads for verify Chris Wilson
` (4 preceding siblings ...)
2017-10-06 20:53 ` Chris Wilson
@ 2017-10-09 13:36 ` Joonas Lahtinen
2017-10-09 13:56 ` Chris Wilson
5 siblings, 1 reply; 8+ messages in thread
From: Joonas Lahtinen @ 2017-10-09 13:36 UTC (permalink / raw)
To: Chris Wilson, intel-gfx
Title: s/thresh/thrash/
On Wed, 2017-08-23 at 13:55 +0100, Chris Wilson wrote:
> At the moment, the verify tests use an extremely brutal write-read of
> every dword, degrading performance to UC. If we break those up into
> cachelines, we can do a wcb write/read at a time instead, roughly 8x
> faster. We lose the accuracy of the forced wcb flushes around every dword,
> but we are retaining the overall behaviour of checking reads following
> writes instead. To compensate, we do check that a single dword write/read
> before using wcb aligned accesses.
>
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
<SNIP>
> @@ -104,15 +109,78 @@ bo_copy (void *_arg)
> return NULL;
> }
>
> +#if defined(__x86_64__) && !defined(__clang__)
> +#define MOVNT 512
> +
> +#pragma GCC push_options
> +#pragma GCC target("sse4.1")
> +
> +#include <smmintrin.h>
> +__attribute__((noinline))
> +static void copy_wc_page(void *dst, void *src)
> +{
> + if (igt_x86_features() & SSE4_1) {
> + __m128i *S = (__m128i *)src;
> + __m128i *D = (__m128i *)dst;
> +
> + for (int i = 0; i < PAGE_SIZE/CACHELINE; i++) {
> + __m128i tmp[4];
> +
> + tmp[0] = _mm_stream_load_si128(S++);
> + tmp[1] = _mm_stream_load_si128(S++);
> + tmp[2] = _mm_stream_load_si128(S++);
> + tmp[3] = _mm_stream_load_si128(S++);
> +
> + _mm_store_si128(D++, tmp[0]);
> + _mm_store_si128(D++, tmp[1]);
> + _mm_store_si128(D++, tmp[2]);
> + _mm_store_si128(D++, tmp[3]);
> + }
> + } else
> + memcpy(dst, src, PAGE_SIZE);
> +}
Not lib/ material?
Add newline anyway.
Reviewed-by: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
Regards, Joonas
--
Joonas Lahtinen
Open Source Technology Center
Intel Corporation
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx
^ permalink raw reply [flat|nested] 8+ messages in thread
* Re: [PATCH igt] igt/gem_fence_thresh: Use streaming reads for verify
2017-10-09 13:36 ` Joonas Lahtinen
@ 2017-10-09 13:56 ` Chris Wilson
0 siblings, 0 replies; 8+ messages in thread
From: Chris Wilson @ 2017-10-09 13:56 UTC (permalink / raw)
To: Joonas Lahtinen, intel-gfx
Quoting Joonas Lahtinen (2017-10-09 14:36:27)
> Title: s/thresh/thrash/
>
> On Wed, 2017-08-23 at 13:55 +0100, Chris Wilson wrote:
> > At the moment, the verify tests use an extremely brutal write-read of
> > every dword, degrading performance to UC. If we break those up into
> > cachelines, we can do a wcb write/read at a time instead, roughly 8x
> > faster. We lose the accuracy of the forced wcb flushes around every dword,
> > but we are retaining the overall behaviour of checking reads following
> > writes instead. To compensate, we do check that a single dword write/read
> > before using wcb aligned accesses.
> >
> > Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
>
> <SNIP>
>
> > @@ -104,15 +109,78 @@ bo_copy (void *_arg)
> > return NULL;
> > }
> >
> > +#if defined(__x86_64__) && !defined(__clang__)
> > +#define MOVNT 512
> > +
> > +#pragma GCC push_options
> > +#pragma GCC target("sse4.1")
> > +
> > +#include <smmintrin.h>
> > +__attribute__((noinline))
> > +static void copy_wc_page(void *dst, void *src)
> > +{
> > + if (igt_x86_features() & SSE4_1) {
> > + __m128i *S = (__m128i *)src;
> > + __m128i *D = (__m128i *)dst;
> > +
> > + for (int i = 0; i < PAGE_SIZE/CACHELINE; i++) {
> > + __m128i tmp[4];
> > +
> > + tmp[0] = _mm_stream_load_si128(S++);
> > + tmp[1] = _mm_stream_load_si128(S++);
> > + tmp[2] = _mm_stream_load_si128(S++);
> > + tmp[3] = _mm_stream_load_si128(S++);
> > +
> > + _mm_store_si128(D++, tmp[0]);
> > + _mm_store_si128(D++, tmp[1]);
> > + _mm_store_si128(D++, tmp[2]);
> > + _mm_store_si128(D++, tmp[3]);
> > + }
> > + } else
> > + memcpy(dst, src, PAGE_SIZE);
> > +}
>
> Not lib/ material?
Yes. But you know it's easier to make it work for one case than all.
-Chris
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx
^ permalink raw reply [flat|nested] 8+ messages in thread
end of thread, other threads:[~2017-10-09 13:57 UTC | newest]
Thread overview: 8+ messages (download: mbox.gz, follow: Atom feed)
-- links below jump to the message on this page --
2017-08-23 12:55 [PATCH igt] igt/gem_fence_thresh: Use streaming reads for verify Chris Wilson
2017-08-23 13:14 ` ✗ Fi.CI.BAT: failure for " Patchwork
2017-08-25 11:14 ` ✓ Fi.CI.BAT: success " Patchwork
2017-08-25 13:14 ` ✗ Fi.CI.IGT: warning " Patchwork
2017-09-07 18:14 ` [PATCH igt] " Chris Wilson
2017-10-06 20:53 ` Chris Wilson
2017-10-09 13:36 ` Joonas Lahtinen
2017-10-09 13:56 ` Chris Wilson
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.