All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH igt] igt/gem_fence_thresh: Use streaming reads for verify
@ 2017-08-23 12:55 Chris Wilson
  2017-08-23 13:14 ` ✗ Fi.CI.BAT: failure for " Patchwork
                   ` (5 more replies)
  0 siblings, 6 replies; 8+ messages in thread
From: Chris Wilson @ 2017-08-23 12:55 UTC (permalink / raw)
  To: intel-gfx

At the moment, the verify tests use an extremely brutal write-read of
every dword, degrading performance to UC. If we break those up into
cachelines, we can do a wcb write/read at a time instead, roughly 8x
faster. We lose the accuracy of the forced wcb flushes around every dword,
but we retain the overall behaviour of checking reads following
writes. To compensate, we first check that a single dword write/read
works before switching to wcb-aligned accesses.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 tests/gem_fence_thrash.c | 116 +++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 101 insertions(+), 15 deletions(-)

diff --git a/tests/gem_fence_thrash.c b/tests/gem_fence_thrash.c
index 52095f26..3e1edb73 100644
--- a/tests/gem_fence_thrash.c
+++ b/tests/gem_fence_thrash.c
@@ -30,7 +30,6 @@
 #include "config.h"
 #endif
 
-#include "igt.h"
 #include <unistd.h>
 #include <stdlib.h>
 #include <stdio.h>
@@ -43,6 +42,12 @@
 #include <pthread.h>
 #include "drm.h"
 
+#include "igt.h"
+#include "igt_x86.h"
+
+#define PAGE_SIZE 4096
+#define CACHELINE 64
+
 #define OBJECT_SIZE (128*1024) /* restricted to 1MiB alignment on i915 fences */
 
 /* Before introduction of the LRU list for fences, allocation of a fence for a page
@@ -104,15 +109,78 @@ bo_copy (void *_arg)
 	return NULL;
 }
 
+#if defined(__x86_64__) && !defined(__clang__)
+#define MOVNT 512
+
+#pragma GCC push_options
+#pragma GCC target("sse4.1")
+
+#include <smmintrin.h>
+__attribute__((noinline))
+static void copy_wc_page(void *dst, void *src)
+{
+	if (igt_x86_features() & SSE4_1) {
+		__m128i *S = (__m128i *)src;
+		__m128i *D = (__m128i *)dst;
+
+		for (int i = 0; i < PAGE_SIZE/CACHELINE; i++) {
+			__m128i tmp[4];
+
+			tmp[0] = _mm_stream_load_si128(S++);
+			tmp[1] = _mm_stream_load_si128(S++);
+			tmp[2] = _mm_stream_load_si128(S++);
+			tmp[3] = _mm_stream_load_si128(S++);
+
+			_mm_store_si128(D++, tmp[0]);
+			_mm_store_si128(D++, tmp[1]);
+			_mm_store_si128(D++, tmp[2]);
+			_mm_store_si128(D++, tmp[3]);
+		}
+	} else
+		memcpy(dst, src, PAGE_SIZE);
+}
+static void copy_wc_cacheline(void *dst, void *src)
+{
+	if (igt_x86_features() & SSE4_1) {
+		__m128i *S = (__m128i *)src;
+		__m128i *D = (__m128i *)dst;
+		__m128i tmp[4];
+
+		tmp[0] = _mm_stream_load_si128(S++);
+		tmp[1] = _mm_stream_load_si128(S++);
+		tmp[2] = _mm_stream_load_si128(S++);
+		tmp[3] = _mm_stream_load_si128(S++);
+
+		_mm_store_si128(D++, tmp[0]);
+		_mm_store_si128(D++, tmp[1]);
+		_mm_store_si128(D++, tmp[2]);
+		_mm_store_si128(D++, tmp[3]);
+	} else
+		memcpy(dst, src, CACHELINE);
+}
+
+#pragma GCC pop_options
+
+#else
+static void copy_wc_page(void *dst, const void *src)
+{
+	memcpy(dst, src, PAGE_SIZE);
+}
+static void copy_wc_cacheline(void *dst, const void *src)
+{
+	memcpy(dst, src, CACHELINE);
+}
+#endif
+
 static void
 _bo_write_verify(struct test *t)
 {
 	int fd = t->fd;
 	int i, k;
 	uint32_t **s;
-	uint32_t v;
 	unsigned int dwords = OBJECT_SIZE >> 2;
 	const char *tile_str[] = { "none", "x", "y" };
+	uint32_t tmp[PAGE_SIZE/sizeof(uint32_t)];
 
 	igt_assert(t->tiling >= 0 && t->tiling <= I915_TILING_Y);
 	igt_assert_lt(0, t->num_surfaces);
@@ -124,21 +192,39 @@ _bo_write_verify(struct test *t)
 		s[k] = bo_create(fd, t->tiling);
 
 	for (k = 0; k < t->num_surfaces; k++) {
-		volatile uint32_t *a = s[k];
-
-		for (i = 0; i < dwords; i++) {
-			a[i] = i;
-			v = a[i];
-			igt_assert_f(v == i,
-				     "tiling %s: write failed at %d (%x)\n",
-				     tile_str[t->tiling], i, v);
+		uint32_t *a = s[k];
+
+		a[0] = 0xdeadbeef;
+		igt_assert_f(a[0] == 0xdeadbeef,
+			     "tiling %s: write failed at start (%x)\n",
+			     tile_str[t->tiling], a[0]);
+
+		a[dwords - 1] = 0xc0ffee;
+		igt_assert_f(a[dwords - 1] == 0xc0ffee,
+			     "tiling %s: write failed at end (%x)\n",
+			     tile_str[t->tiling], a[dwords - 1]);
+
+		for (i = 0; i < dwords; i += CACHELINE/sizeof(uint32_t)) {
+			for (int j = 0; j < CACHELINE/sizeof(uint32_t); j++)
+				a[i + j] = ~(i + j);
+
+			copy_wc_cacheline(tmp, a + i);
+			for (int j = 0; j < CACHELINE/sizeof(uint32_t); j++)
+				igt_assert_f(tmp[j] == ~(i+ j),
+					     "tiling %s: write failed at %d (%x)\n",
+					     tile_str[t->tiling], i + j, tmp[j]);
+
+			for (int j = 0; j < CACHELINE/sizeof(uint32_t); j++)
+				a[i + j] = i + j;
 		}
 
-		for (i = 0; i < dwords; i++) {
-			v = a[i];
-			igt_assert_f(v == i,
-				     "tiling %s: verify failed at %d (%x)\n",
-				     tile_str[t->tiling], i, v);
+		for (i = 0; i < dwords; i += PAGE_SIZE/sizeof(uint32_t)) {
+			copy_wc_page(tmp, a + i);
+			for (int j = 0; j < PAGE_SIZE/sizeof(uint32_t); j++) {
+				igt_assert_f(tmp[j] == i + j,
+					     "tiling %s: verify failed at %d (%x)\n",
+					     tile_str[t->tiling], i + j, tmp[j]);
+			}
 		}
 	}
 
-- 
2.14.1

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 8+ messages in thread

* ✗ Fi.CI.BAT: failure for igt/gem_fence_thresh: Use streaming reads for verify
  2017-08-23 12:55 [PATCH igt] igt/gem_fence_thresh: Use streaming reads for verify Chris Wilson
@ 2017-08-23 13:14 ` Patchwork
  2017-08-25 11:14 ` ✓ Fi.CI.BAT: success " Patchwork
                   ` (4 subsequent siblings)
  5 siblings, 0 replies; 8+ messages in thread
From: Patchwork @ 2017-08-23 13:14 UTC (permalink / raw)
  To: Chris Wilson; +Cc: intel-gfx

== Series Details ==

Series: igt/gem_fence_thresh: Use streaming reads for verify
URL   : https://patchwork.freedesktop.org/series/29208/
State : failure

== Summary ==

IGT patchset tested on top of latest successful build
42b42c99cd9d1b890807ae97cbd1c593396ae051 tests/Makefile.am: Wrap audio test with dedicated conditional

with latest DRM-Tip kernel build CI_DRM_2994
ebd0ddf26a92 drm-tip: 2017y-08m-23d-09h-28m-47s UTC integration manifest

Test kms_cursor_legacy:
        Subgroup basic-busy-flip-before-cursor-atomic:
                fail       -> PASS       (fi-snb-2600) fdo#100215
        Subgroup basic-flip-after-cursor-varying-size:
                pass       -> FAIL       (fi-hsw-4770)
        Subgroup basic-flip-before-cursor-varying-size:
                pass       -> FAIL       (fi-hsw-4770)
Test kms_flip:
        Subgroup basic-flip-vs-modeset:
                skip       -> PASS       (fi-skl-x1585l) fdo#101781
Test kms_pipe_crc_basic:
        Subgroup suspend-read-crc-pipe-b:
                pass       -> DMESG-WARN (fi-byt-n2820) fdo#101705

fdo#100215 https://bugs.freedesktop.org/show_bug.cgi?id=100215
fdo#101781 https://bugs.freedesktop.org/show_bug.cgi?id=101781
fdo#101705 https://bugs.freedesktop.org/show_bug.cgi?id=101705

fi-bdw-5557u     total:279  pass:268  dwarn:0   dfail:0   fail:0   skip:11  time:451s
fi-bdw-gvtdvm    total:279  pass:265  dwarn:0   dfail:0   fail:0   skip:14  time:440s
fi-blb-e6850     total:279  pass:224  dwarn:1   dfail:0   fail:0   skip:54  time:363s
fi-bsw-n3050     total:279  pass:243  dwarn:0   dfail:0   fail:0   skip:36  time:567s
fi-bwr-2160      total:279  pass:184  dwarn:0   dfail:0   fail:0   skip:95  time:253s
fi-bxt-j4205     total:279  pass:260  dwarn:0   dfail:0   fail:0   skip:19  time:525s
fi-byt-j1900     total:279  pass:254  dwarn:1   dfail:0   fail:0   skip:24  time:531s
fi-byt-n2820     total:279  pass:250  dwarn:1   dfail:0   fail:0   skip:28  time:520s
fi-elk-e7500     total:279  pass:230  dwarn:0   dfail:0   fail:0   skip:49  time:437s
fi-glk-2a        total:279  pass:260  dwarn:0   dfail:0   fail:0   skip:19  time:617s
fi-hsw-4770      total:279  pass:261  dwarn:0   dfail:0   fail:2   skip:16  time:445s
fi-hsw-4770r     total:279  pass:263  dwarn:0   dfail:0   fail:0   skip:16  time:423s
fi-ilk-650       total:279  pass:229  dwarn:0   dfail:0   fail:0   skip:50  time:421s
fi-ivb-3520m     total:279  pass:261  dwarn:0   dfail:0   fail:0   skip:18  time:506s
fi-ivb-3770      total:279  pass:261  dwarn:0   dfail:0   fail:0   skip:18  time:478s
fi-kbl-7500u     total:279  pass:261  dwarn:0   dfail:0   fail:0   skip:18  time:478s
fi-kbl-7560u     total:279  pass:269  dwarn:0   dfail:0   fail:0   skip:10  time:597s
fi-kbl-r         total:279  pass:261  dwarn:0   dfail:0   fail:0   skip:18  time:596s
fi-pnv-d510      total:279  pass:223  dwarn:1   dfail:0   fail:0   skip:55  time:528s
fi-skl-6260u     total:279  pass:269  dwarn:0   dfail:0   fail:0   skip:10  time:471s
fi-skl-6700k     total:279  pass:261  dwarn:0   dfail:0   fail:0   skip:18  time:483s
fi-skl-6770hq    total:279  pass:269  dwarn:0   dfail:0   fail:0   skip:10  time:488s
fi-skl-gvtdvm    total:279  pass:266  dwarn:0   dfail:0   fail:0   skip:13  time:443s
fi-skl-x1585l    total:279  pass:269  dwarn:0   dfail:0   fail:0   skip:10  time:501s
fi-snb-2520m     total:279  pass:251  dwarn:0   dfail:0   fail:0   skip:28  time:548s
fi-snb-2600      total:279  pass:250  dwarn:0   dfail:0   fail:0   skip:29  time:404s

== Logs ==

For more details see: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_85/
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 8+ messages in thread

* ✓ Fi.CI.BAT: success for igt/gem_fence_thresh: Use streaming reads for verify
  2017-08-23 12:55 [PATCH igt] igt/gem_fence_thresh: Use streaming reads for verify Chris Wilson
  2017-08-23 13:14 ` ✗ Fi.CI.BAT: failure for " Patchwork
@ 2017-08-25 11:14 ` Patchwork
  2017-08-25 13:14 ` ✗ Fi.CI.IGT: warning " Patchwork
                   ` (3 subsequent siblings)
  5 siblings, 0 replies; 8+ messages in thread
From: Patchwork @ 2017-08-25 11:14 UTC (permalink / raw)
  To: Chris Wilson; +Cc: intel-gfx

== Series Details ==

Series: igt/gem_fence_thresh: Use streaming reads for verify
URL   : https://patchwork.freedesktop.org/series/29208/
State : success

== Summary ==

IGT patchset tested on top of latest successful build
29d488034a50cd6fbad792cae61321995f0ab51c aubdump: Log some information about the execbuf calls

with latest DRM-Tip kernel build CI_DRM_3001
068cd5b2db68 drm-tip: 2017y-08m-24d-22h-49m-38s UTC integration manifest

Test gem_ringfill:
        Subgroup basic-default-hang:
                dmesg-warn -> INCOMPLETE (fi-blb-e6850) fdo#101600 +1
Test prime_vgem:
        Subgroup basic-fence-flip:
                incomplete -> SKIP       (fi-kbl-7560u)

fdo#101600 https://bugs.freedesktop.org/show_bug.cgi?id=101600

fi-bdw-5557u     total:279  pass:268  dwarn:0   dfail:0   fail:0   skip:11  time:456s
fi-bdw-gvtdvm    total:279  pass:265  dwarn:0   dfail:0   fail:0   skip:14  time:443s
fi-blb-e6850     total:147  pass:114  dwarn:0   dfail:0   fail:0   skip:32 
fi-bsw-n3050     total:279  pass:243  dwarn:0   dfail:0   fail:0   skip:36  time:557s
fi-bwr-2160      total:279  pass:184  dwarn:0   dfail:0   fail:0   skip:95  time:251s
fi-bxt-j4205     total:279  pass:260  dwarn:0   dfail:0   fail:0   skip:19  time:522s
fi-byt-j1900     total:279  pass:254  dwarn:1   dfail:0   fail:0   skip:24  time:521s
fi-byt-n2820     total:279  pass:250  dwarn:1   dfail:0   fail:0   skip:28  time:516s
fi-elk-e7500     total:279  pass:230  dwarn:0   dfail:0   fail:0   skip:49  time:441s
fi-glk-2a        total:279  pass:260  dwarn:0   dfail:0   fail:0   skip:19  time:609s
fi-hsw-4770      total:279  pass:263  dwarn:0   dfail:0   fail:0   skip:16  time:446s
fi-hsw-4770r     total:279  pass:263  dwarn:0   dfail:0   fail:0   skip:16  time:425s
fi-ilk-650       total:279  pass:229  dwarn:0   dfail:0   fail:0   skip:50  time:424s
fi-ivb-3520m     total:279  pass:261  dwarn:0   dfail:0   fail:0   skip:18  time:504s
fi-ivb-3770      total:279  pass:261  dwarn:0   dfail:0   fail:0   skip:18  time:476s
fi-kbl-7500u     total:279  pass:261  dwarn:0   dfail:0   fail:0   skip:18  time:479s
fi-kbl-7560u     total:279  pass:269  dwarn:0   dfail:0   fail:0   skip:10  time:601s
fi-kbl-r         total:279  pass:261  dwarn:0   dfail:0   fail:0   skip:18  time:599s
fi-pnv-d510      total:279  pass:223  dwarn:1   dfail:0   fail:0   skip:55  time:524s
fi-skl-6260u     total:279  pass:269  dwarn:0   dfail:0   fail:0   skip:10  time:470s
fi-skl-6700k     total:279  pass:261  dwarn:0   dfail:0   fail:0   skip:18  time:491s
fi-skl-6770hq    total:279  pass:269  dwarn:0   dfail:0   fail:0   skip:10  time:490s
fi-skl-gvtdvm    total:279  pass:266  dwarn:0   dfail:0   fail:0   skip:13  time:443s
fi-skl-x1585l    total:279  pass:268  dwarn:0   dfail:0   fail:0   skip:11  time:484s
fi-snb-2520m     total:279  pass:251  dwarn:0   dfail:0   fail:0   skip:28  time:547s
fi-snb-2600      total:279  pass:248  dwarn:0   dfail:0   fail:2   skip:29  time:407s

== Logs ==

For more details see: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_97/
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 8+ messages in thread

* ✗ Fi.CI.IGT: warning for igt/gem_fence_thresh: Use streaming reads for verify
  2017-08-23 12:55 [PATCH igt] igt/gem_fence_thresh: Use streaming reads for verify Chris Wilson
  2017-08-23 13:14 ` ✗ Fi.CI.BAT: failure for " Patchwork
  2017-08-25 11:14 ` ✓ Fi.CI.BAT: success " Patchwork
@ 2017-08-25 13:14 ` Patchwork
  2017-09-07 18:14 ` [PATCH igt] " Chris Wilson
                   ` (2 subsequent siblings)
  5 siblings, 0 replies; 8+ messages in thread
From: Patchwork @ 2017-08-25 13:14 UTC (permalink / raw)
  To: Chris Wilson; +Cc: intel-gfx

== Series Details ==

Series: igt/gem_fence_thresh: Use streaming reads for verify
URL   : https://patchwork.freedesktop.org/series/29208/
State : warning

== Summary ==

Test kms_setmode:
        Subgroup basic:
                pass       -> FAIL       (shard-hsw) fdo#99912
Test perf:
        Subgroup blocking:
                fail       -> PASS       (shard-hsw) fdo#102252
Test kms_atomic_transition:
        Subgroup plane-all-modeset-transition:
                pass       -> DMESG-WARN (shard-hsw)

fdo#99912 https://bugs.freedesktop.org/show_bug.cgi?id=99912
fdo#102252 https://bugs.freedesktop.org/show_bug.cgi?id=102252

shard-hsw        total:2230 pass:1230 dwarn:1   dfail:0   fail:18  skip:981 time:9459s

== Logs ==

For more details see: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_97/shards.html
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH igt] igt/gem_fence_thresh: Use streaming reads for verify
  2017-08-23 12:55 [PATCH igt] igt/gem_fence_thresh: Use streaming reads for verify Chris Wilson
                   ` (2 preceding siblings ...)
  2017-08-25 13:14 ` ✗ Fi.CI.IGT: warning " Patchwork
@ 2017-09-07 18:14 ` Chris Wilson
  2017-10-06 20:53 ` Chris Wilson
  2017-10-09 13:36 ` Joonas Lahtinen
  5 siblings, 0 replies; 8+ messages in thread
From: Chris Wilson @ 2017-09-07 18:14 UTC (permalink / raw)
  To: intel-gfx

Quoting Chris Wilson (2017-08-23 13:55:55)
> At the moment, the verify tests use an extremely brutal write-read of
> every dword, degrading performance to UC. If we break those up into
> cachelines, we can do a wcb write/read at a time instead, roughly 8x
> faster. We lose the accuracy of the forced wcb flushes around every dword,
> but we are retaining the overall behaviour of checking reads following
> writes instead. To compensate, we do check that a single dword write/read
> before using wcb aligned accesses.
> 
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>

-> Tumbleweed ->

> ---
>  tests/gem_fence_thrash.c | 116 +++++++++++++++++++++++++++++++++++++++++------
>  1 file changed, 101 insertions(+), 15 deletions(-)
> 
> diff --git a/tests/gem_fence_thrash.c b/tests/gem_fence_thrash.c
> index 52095f26..3e1edb73 100644
> --- a/tests/gem_fence_thrash.c
> +++ b/tests/gem_fence_thrash.c
> @@ -30,7 +30,6 @@
>  #include "config.h"
>  #endif
>  
> -#include "igt.h"
>  #include <unistd.h>
>  #include <stdlib.h>
>  #include <stdio.h>
> @@ -43,6 +42,12 @@
>  #include <pthread.h>
>  #include "drm.h"
>  
> +#include "igt.h"
> +#include "igt_x86.h"
> +
> +#define PAGE_SIZE 4096
> +#define CACHELINE 64
> +
>  #define OBJECT_SIZE (128*1024) /* restricted to 1MiB alignment on i915 fences */
>  
>  /* Before introduction of the LRU list for fences, allocation of a fence for a page
> @@ -104,15 +109,78 @@ bo_copy (void *_arg)
>         return NULL;
>  }
>  
> +#if defined(__x86_64__) && !defined(__clang__)
> +#define MOVNT 512
> +
> +#pragma GCC push_options
> +#pragma GCC target("sse4.1")
> +
> +#include <smmintrin.h>
> +__attribute__((noinline))
> +static void copy_wc_page(void *dst, void *src)
> +{
> +       if (igt_x86_features() & SSE4_1) {
> +               __m128i *S = (__m128i *)src;
> +               __m128i *D = (__m128i *)dst;
> +
> +               for (int i = 0; i < PAGE_SIZE/CACHELINE; i++) {
> +                       __m128i tmp[4];
> +
> +                       tmp[0] = _mm_stream_load_si128(S++);
> +                       tmp[1] = _mm_stream_load_si128(S++);
> +                       tmp[2] = _mm_stream_load_si128(S++);
> +                       tmp[3] = _mm_stream_load_si128(S++);
> +
> +                       _mm_store_si128(D++, tmp[0]);
> +                       _mm_store_si128(D++, tmp[1]);
> +                       _mm_store_si128(D++, tmp[2]);
> +                       _mm_store_si128(D++, tmp[3]);
> +               }
> +       } else
> +               memcpy(dst, src, PAGE_SIZE);
> +}
> +static void copy_wc_cacheline(void *dst, void *src)
> +{
> +       if (igt_x86_features() & SSE4_1) {
> +               __m128i *S = (__m128i *)src;
> +               __m128i *D = (__m128i *)dst;
> +               __m128i tmp[4];
> +
> +               tmp[0] = _mm_stream_load_si128(S++);
> +               tmp[1] = _mm_stream_load_si128(S++);
> +               tmp[2] = _mm_stream_load_si128(S++);
> +               tmp[3] = _mm_stream_load_si128(S++);
> +
> +               _mm_store_si128(D++, tmp[0]);
> +               _mm_store_si128(D++, tmp[1]);
> +               _mm_store_si128(D++, tmp[2]);
> +               _mm_store_si128(D++, tmp[3]);
> +       } else
> +               memcpy(dst, src, CACHELINE);
> +}
> +
> +#pragma GCC pop_options
> +
> +#else
> +static void copy_wc_page(void *dst, const void *src)
> +{
> +       memcpy(dst, src, PAGE_SIZE);
> +}
> +static void copy_wc_cacheline(void *dst, const void *src)
> +{
> +       memcpy(dst, src, CACHELINE);
> +}
> +#endif
> +
>  static void
>  _bo_write_verify(struct test *t)
>  {
>         int fd = t->fd;
>         int i, k;
>         uint32_t **s;
> -       uint32_t v;
>         unsigned int dwords = OBJECT_SIZE >> 2;
>         const char *tile_str[] = { "none", "x", "y" };
> +       uint32_t tmp[PAGE_SIZE/sizeof(uint32_t)];
>  
>         igt_assert(t->tiling >= 0 && t->tiling <= I915_TILING_Y);
>         igt_assert_lt(0, t->num_surfaces);
> @@ -124,21 +192,39 @@ _bo_write_verify(struct test *t)
>                 s[k] = bo_create(fd, t->tiling);
>  
>         for (k = 0; k < t->num_surfaces; k++) {
> -               volatile uint32_t *a = s[k];
> -
> -               for (i = 0; i < dwords; i++) {
> -                       a[i] = i;
> -                       v = a[i];
> -                       igt_assert_f(v == i,
> -                                    "tiling %s: write failed at %d (%x)\n",
> -                                    tile_str[t->tiling], i, v);
> +               uint32_t *a = s[k];
> +
> +               a[0] = 0xdeadbeef;
> +               igt_assert_f(a[0] == 0xdeadbeef,
> +                            "tiling %s: write failed at start (%x)\n",
> +                            tile_str[t->tiling], a[0]);
> +
> +               a[dwords - 1] = 0xc0ffee;
> +               igt_assert_f(a[dwords - 1] == 0xc0ffee,
> +                            "tiling %s: write failed at end (%x)\n",
> +                            tile_str[t->tiling], a[dwords - 1]);
> +
> +               for (i = 0; i < dwords; i += CACHELINE/sizeof(uint32_t)) {
> +                       for (int j = 0; j < CACHELINE/sizeof(uint32_t); j++)
> +                               a[i + j] = ~(i + j);
> +
> +                       copy_wc_cacheline(tmp, a + i);
> +                       for (int j = 0; j < CACHELINE/sizeof(uint32_t); j++)
> +                               igt_assert_f(tmp[j] == ~(i+ j),
> +                                            "tiling %s: write failed at %d (%x)\n",
> +                                            tile_str[t->tiling], i + j, tmp[j]);
> +
> +                       for (int j = 0; j < CACHELINE/sizeof(uint32_t); j++)
> +                               a[i + j] = i + j;
>                 }
>  
> -               for (i = 0; i < dwords; i++) {
> -                       v = a[i];
> -                       igt_assert_f(v == i,
> -                                    "tiling %s: verify failed at %d (%x)\n",
> -                                    tile_str[t->tiling], i, v);
> +               for (i = 0; i < dwords; i += PAGE_SIZE/sizeof(uint32_t)) {
> +                       copy_wc_page(tmp, a + i);
> +                       for (int j = 0; j < PAGE_SIZE/sizeof(uint32_t); j++) {
> +                               igt_assert_f(tmp[j] == i + j,
> +                                            "tiling %s: verify failed at %d (%x)\n",
> +                                            tile_str[t->tiling], i + j, tmp[j]);
> +                       }
>                 }
>         }
>  
> -- 
> 2.14.1
> 
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH igt] igt/gem_fence_thresh: Use streaming reads for verify
  2017-08-23 12:55 [PATCH igt] igt/gem_fence_thresh: Use streaming reads for verify Chris Wilson
                   ` (3 preceding siblings ...)
  2017-09-07 18:14 ` [PATCH igt] " Chris Wilson
@ 2017-10-06 20:53 ` Chris Wilson
  2017-10-09 13:36 ` Joonas Lahtinen
  5 siblings, 0 replies; 8+ messages in thread
From: Chris Wilson @ 2017-10-06 20:53 UTC (permalink / raw)
  To: intel-gfx

Quoting Chris Wilson (2017-08-23 13:55:55)
> At the moment, the verify tests use an extremely brutal write-read of
> every dword, degrading performance to UC. If we break those up into
> cachelines, we can do a wcb write/read at a time instead, roughly 8x
> faster. We lose the accuracy of the forced wcb flushes around every dword,
> but we are retaining the overall behaviour of checking reads following
> writes instead. To compensate, we do check that a single dword write/read
> before using wcb aligned accesses.

This fixes one of the APL timeouts...

> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> ---
>  tests/gem_fence_thrash.c | 116 +++++++++++++++++++++++++++++++++++++++++------
>  1 file changed, 101 insertions(+), 15 deletions(-)
> 
> diff --git a/tests/gem_fence_thrash.c b/tests/gem_fence_thrash.c
> index 52095f26..3e1edb73 100644
> --- a/tests/gem_fence_thrash.c
> +++ b/tests/gem_fence_thrash.c
> @@ -30,7 +30,6 @@
>  #include "config.h"
>  #endif
>  
> -#include "igt.h"
>  #include <unistd.h>
>  #include <stdlib.h>
>  #include <stdio.h>
> @@ -43,6 +42,12 @@
>  #include <pthread.h>
>  #include "drm.h"
>  
> +#include "igt.h"
> +#include "igt_x86.h"
> +
> +#define PAGE_SIZE 4096
> +#define CACHELINE 64
> +
>  #define OBJECT_SIZE (128*1024) /* restricted to 1MiB alignment on i915 fences */
>  
>  /* Before introduction of the LRU list for fences, allocation of a fence for a page
> @@ -104,15 +109,78 @@ bo_copy (void *_arg)
>         return NULL;
>  }
>  
> +#if defined(__x86_64__) && !defined(__clang__)
> +#define MOVNT 512
> +
> +#pragma GCC push_options
> +#pragma GCC target("sse4.1")
> +
> +#include <smmintrin.h>
> +__attribute__((noinline))
> +static void copy_wc_page(void *dst, void *src)
> +{
> +       if (igt_x86_features() & SSE4_1) {
> +               __m128i *S = (__m128i *)src;
> +               __m128i *D = (__m128i *)dst;
> +
> +               for (int i = 0; i < PAGE_SIZE/CACHELINE; i++) {
> +                       __m128i tmp[4];
> +
> +                       tmp[0] = _mm_stream_load_si128(S++);
> +                       tmp[1] = _mm_stream_load_si128(S++);
> +                       tmp[2] = _mm_stream_load_si128(S++);
> +                       tmp[3] = _mm_stream_load_si128(S++);
> +
> +                       _mm_store_si128(D++, tmp[0]);
> +                       _mm_store_si128(D++, tmp[1]);
> +                       _mm_store_si128(D++, tmp[2]);
> +                       _mm_store_si128(D++, tmp[3]);
> +               }
> +       } else
> +               memcpy(dst, src, PAGE_SIZE);
> +}
> +static void copy_wc_cacheline(void *dst, void *src)
> +{
> +       if (igt_x86_features() & SSE4_1) {
> +               __m128i *S = (__m128i *)src;
> +               __m128i *D = (__m128i *)dst;
> +               __m128i tmp[4];
> +
> +               tmp[0] = _mm_stream_load_si128(S++);
> +               tmp[1] = _mm_stream_load_si128(S++);
> +               tmp[2] = _mm_stream_load_si128(S++);
> +               tmp[3] = _mm_stream_load_si128(S++);
> +
> +               _mm_store_si128(D++, tmp[0]);
> +               _mm_store_si128(D++, tmp[1]);
> +               _mm_store_si128(D++, tmp[2]);
> +               _mm_store_si128(D++, tmp[3]);
> +       } else
> +               memcpy(dst, src, CACHELINE);
> +}
> +
> +#pragma GCC pop_options
> +
> +#else
> +static void copy_wc_page(void *dst, const void *src)
> +{
> +       memcpy(dst, src, PAGE_SIZE);
> +}
> +static void copy_wc_cacheline(void *dst, const void *src)
> +{
> +       memcpy(dst, src, CACHELINE);
> +}
> +#endif
> +
>  static void
>  _bo_write_verify(struct test *t)
>  {
>         int fd = t->fd;
>         int i, k;
>         uint32_t **s;
> -       uint32_t v;
>         unsigned int dwords = OBJECT_SIZE >> 2;
>         const char *tile_str[] = { "none", "x", "y" };
> +       uint32_t tmp[PAGE_SIZE/sizeof(uint32_t)];
>  
>         igt_assert(t->tiling >= 0 && t->tiling <= I915_TILING_Y);
>         igt_assert_lt(0, t->num_surfaces);
> @@ -124,21 +192,39 @@ _bo_write_verify(struct test *t)
>                 s[k] = bo_create(fd, t->tiling);
>  
>         for (k = 0; k < t->num_surfaces; k++) {
> -               volatile uint32_t *a = s[k];
> -
> -               for (i = 0; i < dwords; i++) {
> -                       a[i] = i;
> -                       v = a[i];
> -                       igt_assert_f(v == i,
> -                                    "tiling %s: write failed at %d (%x)\n",
> -                                    tile_str[t->tiling], i, v);
> +               uint32_t *a = s[k];
> +
> +               a[0] = 0xdeadbeef;
> +               igt_assert_f(a[0] == 0xdeadbeef,
> +                            "tiling %s: write failed at start (%x)\n",
> +                            tile_str[t->tiling], a[0]);
> +
> +               a[dwords - 1] = 0xc0ffee;
> +               igt_assert_f(a[dwords - 1] == 0xc0ffee,
> +                            "tiling %s: write failed at end (%x)\n",
> +                            tile_str[t->tiling], a[dwords - 1]);
> +
> +               for (i = 0; i < dwords; i += CACHELINE/sizeof(uint32_t)) {
> +                       for (int j = 0; j < CACHELINE/sizeof(uint32_t); j++)
> +                               a[i + j] = ~(i + j);
> +
> +                       copy_wc_cacheline(tmp, a + i);
> +                       for (int j = 0; j < CACHELINE/sizeof(uint32_t); j++)
> +                               igt_assert_f(tmp[j] == ~(i+ j),
> +                                            "tiling %s: write failed at %d (%x)\n",
> +                                            tile_str[t->tiling], i + j, tmp[j]);
> +
> +                       for (int j = 0; j < CACHELINE/sizeof(uint32_t); j++)
> +                               a[i + j] = i + j;
>                 }
>  
> -               for (i = 0; i < dwords; i++) {
> -                       v = a[i];
> -                       igt_assert_f(v == i,
> -                                    "tiling %s: verify failed at %d (%x)\n",
> -                                    tile_str[t->tiling], i, v);
> +               for (i = 0; i < dwords; i += PAGE_SIZE/sizeof(uint32_t)) {
> +                       copy_wc_page(tmp, a + i);
> +                       for (int j = 0; j < PAGE_SIZE/sizeof(uint32_t); j++) {
> +                               igt_assert_f(tmp[j] == i + j,
> +                                            "tiling %s: verify failed at %d (%x)\n",
> +                                            tile_str[t->tiling], i + j, tmp[j]);
> +                       }
>                 }
>         }
>  
> -- 
> 2.14.1
> 
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH igt] igt/gem_fence_thresh: Use streaming reads for verify
  2017-08-23 12:55 [PATCH igt] igt/gem_fence_thresh: Use streaming reads for verify Chris Wilson
                   ` (4 preceding siblings ...)
  2017-10-06 20:53 ` Chris Wilson
@ 2017-10-09 13:36 ` Joonas Lahtinen
  2017-10-09 13:56   ` Chris Wilson
  5 siblings, 1 reply; 8+ messages in thread
From: Joonas Lahtinen @ 2017-10-09 13:36 UTC (permalink / raw)
  To: Chris Wilson, intel-gfx

Title: s/thresh/thrash/

On Wed, 2017-08-23 at 13:55 +0100, Chris Wilson wrote:
> At the moment, the verify tests use an extremely brutal write-read of
> every dword, degrading performance to UC. If we break those up into
> cachelines, we can do a wcb write/read at a time instead, roughly 8x
> faster. We lose the accuracy of the forced wcb flushes around every dword,
> but we are retaining the overall behaviour of checking reads following
> writes instead. To compensate, we do check that a single dword write/read
> before using wcb aligned accesses.
> 
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>

<SNIP>

> @@ -104,15 +109,78 @@ bo_copy (void *_arg)
>  	return NULL;
>  }
>  
> +#if defined(__x86_64__) && !defined(__clang__)
> +#define MOVNT 512
> +
> +#pragma GCC push_options
> +#pragma GCC target("sse4.1")
> +
> +#include <smmintrin.h>
> +__attribute__((noinline))
> +static void copy_wc_page(void *dst, void *src)
> +{
> +	if (igt_x86_features() & SSE4_1) {
> +		__m128i *S = (__m128i *)src;
> +		__m128i *D = (__m128i *)dst;
> +
> +		for (int i = 0; i < PAGE_SIZE/CACHELINE; i++) {
> +			__m128i tmp[4];
> +
> +			tmp[0] = _mm_stream_load_si128(S++);
> +			tmp[1] = _mm_stream_load_si128(S++);
> +			tmp[2] = _mm_stream_load_si128(S++);
> +			tmp[3] = _mm_stream_load_si128(S++);
> +
> +			_mm_store_si128(D++, tmp[0]);
> +			_mm_store_si128(D++, tmp[1]);
> +			_mm_store_si128(D++, tmp[2]);
> +			_mm_store_si128(D++, tmp[3]);
> +		}
> +	} else
> +		memcpy(dst, src, PAGE_SIZE);
> +}

Not lib/ material?

Add newline anyway.

Reviewed-by: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>

Regards, Joonas
-- 
Joonas Lahtinen
Open Source Technology Center
Intel Corporation
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH igt] igt/gem_fence_thresh: Use streaming reads for verify
  2017-10-09 13:36 ` Joonas Lahtinen
@ 2017-10-09 13:56   ` Chris Wilson
  0 siblings, 0 replies; 8+ messages in thread
From: Chris Wilson @ 2017-10-09 13:56 UTC (permalink / raw)
  To: Joonas Lahtinen, intel-gfx

Quoting Joonas Lahtinen (2017-10-09 14:36:27)
> Title: s/thresh/thrash/
> 
> On Wed, 2017-08-23 at 13:55 +0100, Chris Wilson wrote:
> > At the moment, the verify tests use an extremely brutal write-read of
> > every dword, degrading performance to UC. If we break those up into
> > cachelines, we can do a wcb write/read at a time instead, roughly 8x
> > faster. We lose the accuracy of the forced wcb flushes around every dword,
> > but we are retaining the overall behaviour of checking reads following
> > writes instead. To compensate, we do check that a single dword write/read
> > before using wcb aligned accesses.
> > 
> > Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> 
> <SNIP>
> 
> > @@ -104,15 +109,78 @@ bo_copy (void *_arg)
> >       return NULL;
> >  }
> >  
> > +#if defined(__x86_64__) && !defined(__clang__)
> > +#define MOVNT 512
> > +
> > +#pragma GCC push_options
> > +#pragma GCC target("sse4.1")
> > +
> > +#include <smmintrin.h>
> > +__attribute__((noinline))
> > +static void copy_wc_page(void *dst, void *src)
> > +{
> > +     if (igt_x86_features() & SSE4_1) {
> > +             __m128i *S = (__m128i *)src;
> > +             __m128i *D = (__m128i *)dst;
> > +
> > +             for (int i = 0; i < PAGE_SIZE/CACHELINE; i++) {
> > +                     __m128i tmp[4];
> > +
> > +                     tmp[0] = _mm_stream_load_si128(S++);
> > +                     tmp[1] = _mm_stream_load_si128(S++);
> > +                     tmp[2] = _mm_stream_load_si128(S++);
> > +                     tmp[3] = _mm_stream_load_si128(S++);
> > +
> > +                     _mm_store_si128(D++, tmp[0]);
> > +                     _mm_store_si128(D++, tmp[1]);
> > +                     _mm_store_si128(D++, tmp[2]);
> > +                     _mm_store_si128(D++, tmp[3]);
> > +             }
> > +     } else
> > +             memcpy(dst, src, PAGE_SIZE);
> > +}
> 
> Not lib/ material?

Yes. But you know it's easier to make it work for one case than all.
-Chris
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 8+ messages in thread

end of thread, other threads:[~2017-10-09 13:57 UTC | newest]

Thread overview: 8+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2017-08-23 12:55 [PATCH igt] igt/gem_fence_thresh: Use streaming reads for verify Chris Wilson
2017-08-23 13:14 ` ✗ Fi.CI.BAT: failure for " Patchwork
2017-08-25 11:14 ` ✓ Fi.CI.BAT: success " Patchwork
2017-08-25 13:14 ` ✗ Fi.CI.IGT: warning " Patchwork
2017-09-07 18:14 ` [PATCH igt] " Chris Wilson
2017-10-06 20:53 ` Chris Wilson
2017-10-09 13:36 ` Joonas Lahtinen
2017-10-09 13:56   ` Chris Wilson

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.