From: "Ville Syrjälä" <ville.syrjala@linux.intel.com>
To: Chris Wilson <chris@chris-wilson.co.uk>
Cc: igt-dev@lists.freedesktop.org
Subject: Re: [igt-dev] [PATCH igt v6] lib: Provide an accelerated routine for readback from WC
Date: Wed, 28 Feb 2018 19:12:44 +0200 [thread overview]
Message-ID: <20180228171244.GO5453@intel.com> (raw)
In-Reply-To: <20180228090016.4589-1-chris@chris-wilson.co.uk>
On Wed, Feb 28, 2018 at 09:00:16AM +0000, Chris Wilson wrote:
> Reading from WC is awfully slow as each access is uncached and so
> performed synchronously, stalling for the memory load. x86 did introduce
> some new instructions in SSE 4.1 to provide a small internal buffer to
> accelerate reading back a cacheline at a time from uncached memory, for
> this purpose.
>
> v2: Don't be lazy and handle misalignment.
> v3: Switch out of sse41 before emitting the generic memcpy routine
> v4: Replace opencoded memcpy_from_wc
> v5: Always flush the internal buffer before use (Eric)
> v6: Assume bulk moves, so check for dst alignment.
>
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Eric Anholt <eric@anholt.net>
> ---
> lib/igt_fb.c | 3 +-
> lib/igt_x86.c | 124 +++++++++++++++++++++++++++++++++++++++++
> lib/igt_x86.h | 2 +
> tests/gem_fence_thrash.c | 63 +--------------------
> tests/gem_mmap_gtt.c | 37 +-----------
> tests/gem_tiled_pread_pwrite.c | 37 +-----------
> 6 files changed, 132 insertions(+), 134 deletions(-)
>
> diff --git a/lib/igt_fb.c b/lib/igt_fb.c
> index ecd73053..7404ba7c 100644
> --- a/lib/igt_fb.c
> +++ b/lib/igt_fb.c
> @@ -32,6 +32,7 @@
> #include "drmtest.h"
> #include "igt_fb.h"
> #include "igt_kms.h"
> +#include "igt_x86.h"
> #include "ioctl_wrappers.h"
> #include "intel_chipset.h"
>
> @@ -1340,7 +1341,7 @@ static void convert_nv12_to_rgb24(struct igt_fb *fb, struct fb_convert_blit_uplo
> * it's faster to copy the whole BO to a temporary buffer and convert
> * from there.
> */
> - memcpy(buf, blit->linear.map, blit->linear.size);
> + igt_memcpy_from_wc(buf, blit->linear.map, blit->linear.size);
> y = &buf[blit->linear.offsets[0]];
> uv = &buf[blit->linear.offsets[1]];
>
> diff --git a/lib/igt_x86.c b/lib/igt_x86.c
> index 0ed3c6f1..54539456 100644
> --- a/lib/igt_x86.c
> +++ b/lib/igt_x86.c
> @@ -36,7 +36,10 @@
> #endif
>
> #include "igt_x86.h"
> +
> +#include <stdint.h>
> #include <stdio.h>
> +#include <string.h>
>
> /**
> * SECTION:igt_x86
> @@ -174,3 +177,124 @@ char *igt_x86_features_to_string(unsigned features, char *line)
> return ret;
> }
> #endif
> +
> +#if defined(__x86_64__) && !defined(__clang__)
> +#define MOVNT 512
What's this MOVNT define?
> +
> +#pragma GCC push_options
> +#pragma GCC target("sse4.1")
> +#pragma GCC diagnostic ignored "-Wpointer-arith"
> +
> +#define min(x, y) ({ \
> + typeof(x) _min1 = (x); \
> + typeof(y) _min2 = (y); \
> + (void) (&_min1 == &_min2); \
> + _min1 < _min2 ? _min1 : _min2; \
> +})
igt_aux.h has this already I believe.
> +
> +#include <smmintrin.h>
> +static void memcpy_from_wc_sse41(void *dst, const void *src, unsigned long len)
> +{
> + char buf[16];
> +
> + /* Flush the internal buffer of potential stale gfx data */
> + __builtin_ia32_mfence();
Isn't there a _mm_mfence()?
Apart from those everything looks all right to me.
Reviewed-by: Ville Syrjälä <ville.syrjala@linux.intel.com>
> +
> + if ((uintptr_t)src & 15) {
> + __m128i *S = (__m128i *)((uintptr_t)src & ~15);
> + unsigned long misalign = (uintptr_t)src & 15;
> + unsigned long copy = min(len, 16 - misalign);
> +
> + _mm_storeu_si128((__m128i *)buf,
> + _mm_stream_load_si128(S));
> +
> + memcpy(dst, buf + misalign, copy);
> +
> + dst += copy;
> + src += copy;
> + len -= copy;
> + }
> +
> + /* We assume we are doing bulk transfers, so prefer aligned moves */
> + if (((uintptr_t)dst & 15) == 0) {
> + while (len >= 64) {
> + __m128i *S = (__m128i *)src;
> + __m128i *D = (__m128i *)dst;
> + __m128i tmp[4];
> +
> + tmp[0] = _mm_stream_load_si128(S + 0);
> + tmp[1] = _mm_stream_load_si128(S + 1);
> + tmp[2] = _mm_stream_load_si128(S + 2);
> + tmp[3] = _mm_stream_load_si128(S + 3);
> +
> + _mm_store_si128(D + 0, tmp[0]);
> + _mm_store_si128(D + 1, tmp[1]);
> + _mm_store_si128(D + 2, tmp[2]);
> + _mm_store_si128(D + 3, tmp[3]);
> +
> + src += 64;
> + dst += 64;
> + len -= 64;
> + }
> + } else {
> + while (len >= 64) {
> + __m128i *S = (__m128i *)src;
> + __m128i *D = (__m128i *)dst;
> + __m128i tmp[4];
> +
> + tmp[0] = _mm_stream_load_si128(S + 0);
> + tmp[1] = _mm_stream_load_si128(S + 1);
> + tmp[2] = _mm_stream_load_si128(S + 2);
> + tmp[3] = _mm_stream_load_si128(S + 3);
> +
> + _mm_storeu_si128(D + 0, tmp[0]);
> + _mm_storeu_si128(D + 1, tmp[1]);
> + _mm_storeu_si128(D + 2, tmp[2]);
> + _mm_storeu_si128(D + 3, tmp[3]);
> +
> + src += 64;
> + dst += 64;
> + len -= 64;
> + }
> + }
> +
> + while (len >= 16) {
> + _mm_storeu_si128((__m128i *)dst,
> + _mm_stream_load_si128((__m128i *)src));
> +
> + src += 16;
> + dst += 16;
> + len -= 16;
> + }
> +
> + if (len) {
> + _mm_storeu_si128((__m128i *)buf,
> + _mm_stream_load_si128((__m128i *)src));
> + memcpy(dst, buf, len);
> + }
> +}
> +
> +#pragma GCC pop_options
> +
> +static void memcpy_from_wc(void *dst, const void *src, unsigned long len)
> +{
> + memcpy(dst, src, len);
> +}
> +
> +static void (*resolve_memcpy_from_wc(void))(void *, const void *, unsigned long)
> +{
> + if (igt_x86_features() & SSE4_1)
> + return memcpy_from_wc_sse41;
> +
> + return memcpy_from_wc;
> +}
> +
> +void igt_memcpy_from_wc(void *dst, const void *src, unsigned long len)
> + __attribute__((ifunc("resolve_memcpy_from_wc")));
> +
> +#else
> +void igt_memcpy_from_wc(void *dst, const void *src, unsigned long len)
> +{
> + memcpy(dst, src, len);
> +}
> +#endif
> diff --git a/lib/igt_x86.h b/lib/igt_x86.h
> index 27b7f0fd..d4f8c343 100644
> --- a/lib/igt_x86.h
> +++ b/lib/igt_x86.h
> @@ -55,4 +55,6 @@ static inline char *igt_x86_features_to_string(unsigned features, char *line)
> }
> #endif
>
> +void igt_memcpy_from_wc(void *dst, const void *src, unsigned long len);
> +
> #endif /* IGT_X86_H */
> diff --git a/tests/gem_fence_thrash.c b/tests/gem_fence_thrash.c
> index c8ff961d..2d7fb2ff 100644
> --- a/tests/gem_fence_thrash.c
> +++ b/tests/gem_fence_thrash.c
> @@ -107,75 +107,16 @@ bo_copy (void *_arg)
> return NULL;
> }
>
> -#if defined(__x86_64__) && !defined(__clang__)
> -
> -#pragma GCC push_options
> -#pragma GCC target("sse4.1")
> -
> -#include <smmintrin.h>
> -
> -#define MOVNT 512
> -
> -__attribute__((noinline))
> -static void copy_wc_page(void *dst, void *src)
> -{
> - if (igt_x86_features() & SSE4_1) {
> - __m128i *S = (__m128i *)src;
> - __m128i *D = (__m128i *)dst;
> -
> - for (int i = 0; i < PAGE_SIZE/CACHELINE; i++) {
> - __m128i tmp[4];
> -
> - tmp[0] = _mm_stream_load_si128(S++);
> - tmp[1] = _mm_stream_load_si128(S++);
> - tmp[2] = _mm_stream_load_si128(S++);
> - tmp[3] = _mm_stream_load_si128(S++);
> -
> - _mm_store_si128(D++, tmp[0]);
> - _mm_store_si128(D++, tmp[1]);
> - _mm_store_si128(D++, tmp[2]);
> - _mm_store_si128(D++, tmp[3]);
> - }
> - } else
> - memcpy(dst, src, PAGE_SIZE);
> -}
> -
> -static void copy_wc_cacheline(void *dst, void *src)
> -{
> - if (igt_x86_features() & SSE4_1) {
> - __m128i *S = (__m128i *)src;
> - __m128i *D = (__m128i *)dst;
> - __m128i tmp[4];
> -
> - tmp[0] = _mm_stream_load_si128(S++);
> - tmp[1] = _mm_stream_load_si128(S++);
> - tmp[2] = _mm_stream_load_si128(S++);
> - tmp[3] = _mm_stream_load_si128(S++);
> -
> - _mm_store_si128(D++, tmp[0]);
> - _mm_store_si128(D++, tmp[1]);
> - _mm_store_si128(D++, tmp[2]);
> - _mm_store_si128(D++, tmp[3]);
> - } else
> - memcpy(dst, src, CACHELINE);
> -}
> -
> -#pragma GCC pop_options
> -
> -#else
> -
> static void copy_wc_page(void *dst, const void *src)
> {
> - memcpy(dst, src, PAGE_SIZE);
> + igt_memcpy_from_wc(dst, src, PAGE_SIZE);
> }
>
> static void copy_wc_cacheline(void *dst, const void *src)
> {
> - memcpy(dst, src, CACHELINE);
> + igt_memcpy_from_wc(dst, src, CACHELINE);
> }
>
> -#endif
> -
> static void
> _bo_write_verify(struct test *t)
> {
> diff --git a/tests/gem_mmap_gtt.c b/tests/gem_mmap_gtt.c
> index 0f598125..6a332b25 100644
> --- a/tests/gem_mmap_gtt.c
> +++ b/tests/gem_mmap_gtt.c
> @@ -529,45 +529,10 @@ test_huge_bo(int fd, int huge, int tiling)
> munmap(linear_pattern, PAGE_SIZE);
> }
>
> -#if defined(__x86_64__) && !defined(__clang__)
> -#define MOVNT 512
> -
> -#pragma GCC push_options
> -#pragma GCC target("sse4.1")
> -
> -#include <smmintrin.h>
> -__attribute__((noinline))
> -static void copy_wc_page(void *dst, void *src)
> -{
> - if (igt_x86_features() & SSE4_1) {
> - __m128i *S = (__m128i *)src;
> - __m128i *D = (__m128i *)dst;
> -
> - for (int i = 0; i < PAGE_SIZE/64; i++) {
> - __m128i tmp[4];
> -
> - tmp[0] = _mm_stream_load_si128(S++);
> - tmp[1] = _mm_stream_load_si128(S++);
> - tmp[2] = _mm_stream_load_si128(S++);
> - tmp[3] = _mm_stream_load_si128(S++);
> -
> - _mm_store_si128(D++, tmp[0]);
> - _mm_store_si128(D++, tmp[1]);
> - _mm_store_si128(D++, tmp[2]);
> - _mm_store_si128(D++, tmp[3]);
> - }
> - } else
> - memcpy(dst, src, PAGE_SIZE);
> -}
> -
> -#pragma GCC pop_options
> -
> -#else
> static void copy_wc_page(void *dst, const void *src)
> {
> - memcpy(dst, src, PAGE_SIZE);
> + igt_memcpy_from_wc(dst, src, PAGE_SIZE);
> }
> -#endif
>
> static unsigned int tile_row_size(int tiling, unsigned int stride)
> {
> diff --git a/tests/gem_tiled_pread_pwrite.c b/tests/gem_tiled_pread_pwrite.c
> index 7b5577fd..313daa38 100644
> --- a/tests/gem_tiled_pread_pwrite.c
> +++ b/tests/gem_tiled_pread_pwrite.c
> @@ -100,45 +100,10 @@ create_bo(int fd)
> return handle;
> }
>
> -#if defined(__x86_64__) && !defined(__clang__)
> -#define MOVNT 512
> -
> -#pragma GCC push_options
> -#pragma GCC target("sse4.1")
> -
> -#include <smmintrin.h>
> -__attribute__((noinline))
> -static void copy_wc_page(void *dst, void *src)
> -{
> - if (igt_x86_features() & SSE4_1) {
> - __m128i *S = (__m128i *)src;
> - __m128i *D = (__m128i *)dst;
> -
> - for (int i = 0; i < PAGE_SIZE/64; i++) {
> - __m128i tmp[4];
> -
> - tmp[0] = _mm_stream_load_si128(S++);
> - tmp[1] = _mm_stream_load_si128(S++);
> - tmp[2] = _mm_stream_load_si128(S++);
> - tmp[3] = _mm_stream_load_si128(S++);
> -
> - _mm_store_si128(D++, tmp[0]);
> - _mm_store_si128(D++, tmp[1]);
> - _mm_store_si128(D++, tmp[2]);
> - _mm_store_si128(D++, tmp[3]);
> - }
> - } else
> - memcpy(dst, src, PAGE_SIZE);
> -}
> -
> -#pragma GCC pop_options
> -
> -#else
> static void copy_wc_page(void *dst, const void *src)
> {
> - memcpy(dst, src, PAGE_SIZE);
> + igt_memcpy_from_wc(dst, src, PAGE_SIZE);
> }
> -#endif
>
> igt_simple_main
> {
> --
> 2.16.2
>
> _______________________________________________
> igt-dev mailing list
> igt-dev@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/igt-dev
--
Ville Syrjälä
Intel OTC
_______________________________________________
igt-dev mailing list
igt-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/igt-dev
next prev parent reply other threads:[~2018-02-28 17:12 UTC|newest]
Thread overview: 13+ messages / expand[flat|nested] mbox.gz Atom feed top
2018-02-27 21:50 [igt-dev] [PATCH igt] lib: Provide an accelerated routine for readback from WC Chris Wilson
2018-02-27 21:53 ` Chris Wilson
2018-02-27 22:17 ` [igt-dev] [PATCH igt v2] " Chris Wilson
2018-02-27 22:20 ` [igt-dev] [PATCH igt v3] " Chris Wilson
2018-02-27 22:42 ` [igt-dev] [PATCH igt v4] " Chris Wilson
2018-02-27 23:29 ` [igt-dev] [PATCH igt] " Eric Anholt
2018-02-27 23:44 ` [igt-dev] ✓ Fi.CI.BAT: success for lib: Provide an accelerated routine for readback from WC (rev4) Patchwork
2018-02-28 1:04 ` [igt-dev] ✗ Fi.CI.IGT: failure " Patchwork
2018-02-28 9:00 ` [igt-dev] [PATCH igt v6] lib: Provide an accelerated routine for readback from WC Chris Wilson
2018-02-28 17:12 ` Ville Syrjälä [this message]
2018-03-01 8:43 ` Chris Wilson
2018-02-28 9:31 ` [igt-dev] ✓ Fi.CI.BAT: success for lib: Provide an accelerated routine for readback from WC (rev5) Patchwork
2018-02-28 10:16 ` [igt-dev] ✓ Fi.CI.IGT: " Patchwork
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20180228171244.GO5453@intel.com \
--to=ville.syrjala@linux.intel.com \
--cc=chris@chris-wilson.co.uk \
--cc=igt-dev@lists.freedesktop.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox