* [PATCH] drm/i915: Evict CS TLBs between batches @ 2014-09-07 8:08 Chris Wilson 2014-09-07 12:26 ` Chris Wilson 2014-09-08 8:03 ` Daniel Vetter 0 siblings, 2 replies; 8+ messages in thread From: Chris Wilson @ 2014-09-07 8:08 UTC (permalink / raw) To: intel-gfx; +Cc: Chris Wilson, Daniel Vetter, stable Running igt, I was encountering the invalid TLB bug on my 845g, despite that it was using the CS workaround. Examining the w/a buffer in the error state, showed that the copy from the user batch into the workaround itself was suffering from the invalid TLB bug (the first cacheline was broken with the first two words reversed). Time to try a fresh approach. This extends the workaround to write into each page of our scratch buffer in order to overflow the TLB and evict the invalid entries. This could be refined to only do so after we update the GTT, but for simplicity, we do it before each batch. I suspect this supersedes our current workaround, but for safety keep doing both. Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk> Cc: Daniel Vetter <daniel.vetter@ffwll.ch> Cc: stable@vger.kernel.org --- drivers/gpu/drm/i915/i915_reg.h | 12 ++++--- drivers/gpu/drm/i915/intel_ringbuffer.c | 62 +++++++++++++++++++-------------- 2 files changed, 44 insertions(+), 30 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h index e4d7607da2c4..f29b44c86a2f 100644 --- a/drivers/gpu/drm/i915/i915_reg.h +++ b/drivers/gpu/drm/i915/i915_reg.h @@ -334,16 +334,20 @@ #define GFX_OP_DESTBUFFER_INFO ((0x3<<29)|(0x1d<<24)|(0x8e<<16)|1) #define GFX_OP_DRAWRECT_INFO ((0x3<<29)|(0x1d<<24)|(0x80<<16)|(0x3)) #define GFX_OP_DRAWRECT_INFO_I965 ((0x7900<<16)|0x2) -#define SRC_COPY_BLT_CMD ((2<<29)|(0x43<<22)|4) + +#define COLOR_BLT_CMD (2<<29 | 0x40<<22 | (5-2)) +#define SRC_COPY_BLT_CMD ((2<<29)|(0x43<<22)|4) #define XY_SRC_COPY_BLT_CMD ((2<<29)|(0x53<<22)|6) #define XY_MONO_SRC_COPY_IMM_BLT ((2<<29)|(0x71<<22)|5) -#define XY_SRC_COPY_BLT_WRITE_ALPHA (1<<21) -#define XY_SRC_COPY_BLT_WRITE_RGB (1<<20) +#define BLT_WRITE_A (2<<20) +#define BLT_WRITE_RGB (1<<20) +#define BLT_WRITE_RGBA (BLT_WRITE_RGB | BLT_WRITE_A) #define BLT_DEPTH_8 (0<<24) #define BLT_DEPTH_16_565 (1<<24) #define BLT_DEPTH_16_1555 (2<<24) #define BLT_DEPTH_32 (3<<24) -#define BLT_ROP_GXCOPY (0xcc<<16) +#define BLT_ROP_SRC_COPY (0xcc<<16) +#define BLT_ROP_COLOR_COPY (0xf0<<16) #define XY_SRC_COPY_BLT_SRC_TILED (1<<15) /* 965+ only */ #define XY_SRC_COPY_BLT_DST_TILED (1<<11) /* 965+ only */ #define CMD_OP_DISPLAYBUFFER_INFO ((0x0<<29)|(0x14<<23)|2) diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c index 16371a444426..acd933b1b027 100644 --- a/drivers/gpu/drm/i915/intel_ringbuffer.c +++ b/drivers/gpu/drm/i915/intel_ringbuffer.c @@ -1368,49 +1368,59 @@ i830_dispatch_execbuffer(struct intel_engine_cs *ring, u64 offset, u32 len, unsigned flags) { + u32 cs_offset = ring->scratch.gtt_offset; int ret; - if (flags & I915_DISPATCH_PINNED) { - ret = intel_ring_begin(ring, 4); - if (ret) - return ret; + ret = intel_ring_begin(ring, 6); + if (ret) + return ret; - intel_ring_emit(ring, MI_BATCH_BUFFER); - intel_ring_emit(ring, offset | (flags & I915_DISPATCH_SECURE ? 0 : MI_BATCH_NON_SECURE)); - intel_ring_emit(ring, offset + len - 8); - intel_ring_emit(ring, MI_NOOP); - intel_ring_advance(ring); - } else { - u32 cs_offset = ring->scratch.gtt_offset; + /* Evict the invalid PTE TLBs */ + intel_ring_emit(ring, COLOR_BLT_CMD | BLT_WRITE_RGBA); + intel_ring_emit(ring, BLT_DEPTH_32 | BLT_ROP_COLOR_COPY | 4096); + intel_ring_emit(ring, 64 << 16 | 4); /* load each page */ + intel_ring_emit(ring, cs_offset); + intel_ring_emit(ring, 0xdeadbeef); + intel_ring_emit(ring, MI_NOOP); + intel_ring_advance(ring); + if ((flags & I915_DISPATCH_PINNED) == 0) { if (len > I830_BATCH_LIMIT) return -ENOSPC; - ret = intel_ring_begin(ring, 9+3); + ret = intel_ring_begin(ring, 6 + 2); if (ret) return ret; - /* Blit the batch (which has now all relocs applied) to the stable batch - * scratch bo area (so that the CS never stumbles over its tlb - * invalidation bug) ... */ - intel_ring_emit(ring, XY_SRC_COPY_BLT_CMD | - XY_SRC_COPY_BLT_WRITE_ALPHA | - XY_SRC_COPY_BLT_WRITE_RGB); - intel_ring_emit(ring, BLT_DEPTH_32 | BLT_ROP_GXCOPY | 4096); - intel_ring_emit(ring, 0); - intel_ring_emit(ring, (DIV_ROUND_UP(len, 4096) << 16) | 1024); + + /* Blit the batch (which has now all relocs applied) to the + * stable batch scratch bo area (so that the CS never + * stumbles over its tlb invalidation bug) ... + */ + intel_ring_emit(ring, SRC_COPY_BLT_CMD | BLT_WRITE_RGBA); + intel_ring_emit(ring, BLT_DEPTH_32 | BLT_ROP_SRC_COPY | 4096); + intel_ring_emit(ring, DIV_ROUND_UP(len, 4096) << 16 | 1024); intel_ring_emit(ring, cs_offset); - intel_ring_emit(ring, 0); intel_ring_emit(ring, 4096); intel_ring_emit(ring, offset); + intel_ring_emit(ring, MI_FLUSH); + intel_ring_emit(ring, MI_NOOP); + intel_ring_advance(ring); /* ... and execute it. */ - intel_ring_emit(ring, MI_BATCH_BUFFER); - intel_ring_emit(ring, cs_offset | (flags & I915_DISPATCH_SECURE ? 0 : MI_BATCH_NON_SECURE)); - intel_ring_emit(ring, cs_offset + len - 8); - intel_ring_advance(ring); + offset = cs_offset; } + ret = intel_ring_begin(ring, 4); + if (ret) + return ret; + + intel_ring_emit(ring, MI_BATCH_BUFFER); + intel_ring_emit(ring, offset | (flags & I915_DISPATCH_SECURE ? 0 : MI_BATCH_NON_SECURE)); + intel_ring_emit(ring, offset + len - 8); + intel_ring_emit(ring, MI_NOOP); + intel_ring_advance(ring); + return 0; } -- 2.1.0 ^ permalink raw reply related [flat|nested] 8+ messages in thread
* Re: [PATCH] drm/i915: Evict CS TLBs between batches 2014-09-07 8:08 [PATCH] drm/i915: Evict CS TLBs between batches Chris Wilson @ 2014-09-07 12:26 ` Chris Wilson 2014-09-08 8:03 ` Daniel Vetter 1 sibling, 0 replies; 8+ messages in thread From: Chris Wilson @ 2014-09-07 12:26 UTC (permalink / raw) To: intel-gfx; +Cc: Daniel Vetter, stable On Sun, Sep 07, 2014 at 09:08:31AM +0100, Chris Wilson wrote: > Running igt, I was encountering the invalid TLB bug on my 845g, despite > that it was using the CS workaround. Examining the w/a buffer in the > error state, showed that the copy from the user batch into the > workaround itself was suffering from the invalid TLB bug (the first > cacheline was broken with the first two words reversed). Time to try a > fresh approach. This extends the workaround to write into each page of > our scratch buffer in order to overflow the TLB and evict the invalid > entries. This could be refined to only do so after we update the GTT, > but for simplicity, we do it before each batch. This seems to hold up under scruting, so far at least. So I am more convinced that this effectively does the right thing. -Chris -- Chris Wilson, Intel Open Source Technology Centre ^ permalink raw reply [flat|nested] 8+ messages in thread
* Re: [PATCH] drm/i915: Evict CS TLBs between batches 2014-09-07 8:08 [PATCH] drm/i915: Evict CS TLBs between batches Chris Wilson 2014-09-07 12:26 ` Chris Wilson @ 2014-09-08 8:03 ` Daniel Vetter 2014-09-08 8:15 ` Chris Wilson 2014-09-08 13:25 ` Chris Wilson 1 sibling, 2 replies; 8+ messages in thread From: Daniel Vetter @ 2014-09-08 8:03 UTC (permalink / raw) To: Chris Wilson; +Cc: Daniel Vetter, intel-gfx, stable On Sun, Sep 07, 2014 at 09:08:31AM +0100, Chris Wilson wrote: > Running igt, I was encountering the invalid TLB bug on my 845g, despite > that it was using the CS workaround. Examining the w/a buffer in the > error state, showed that the copy from the user batch into the > workaround itself was suffering from the invalid TLB bug (the first > cacheline was broken with the first two words reversed). Time to try a > fresh approach. This extends the workaround to write into each page of > our scratch buffer in order to overflow the TLB and evict the invalid > entries. This could be refined to only do so after we update the GTT, > but for simplicity, we do it before each batch. > > I suspect this supersedes our current workaround, but for safety keep > doing both. I suspect that we might end up with just an elaborate delay implementation, but if it works then it's good. One nitpick below, with that addressed this is Reviewed-by: Daniel Vetter <daniel.vetter@ffwll.ch> > > Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk> > Cc: Daniel Vetter <daniel.vetter@ffwll.ch> > Cc: stable@vger.kernel.org > --- > drivers/gpu/drm/i915/i915_reg.h | 12 ++++--- > drivers/gpu/drm/i915/intel_ringbuffer.c | 62 +++++++++++++++++++-------------- > 2 files changed, 44 insertions(+), 30 deletions(-) > > diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h > index e4d7607da2c4..f29b44c86a2f 100644 > --- a/drivers/gpu/drm/i915/i915_reg.h > +++ b/drivers/gpu/drm/i915/i915_reg.h > @@ -334,16 +334,20 @@ > #define GFX_OP_DESTBUFFER_INFO ((0x3<<29)|(0x1d<<24)|(0x8e<<16)|1) > #define GFX_OP_DRAWRECT_INFO ((0x3<<29)|(0x1d<<24)|(0x80<<16)|(0x3)) > #define GFX_OP_DRAWRECT_INFO_I965 ((0x7900<<16)|0x2) > -#define SRC_COPY_BLT_CMD ((2<<29)|(0x43<<22)|4) > + > +#define COLOR_BLT_CMD (2<<29 | 0x40<<22 | (5-2)) > +#define SRC_COPY_BLT_CMD ((2<<29)|(0x43<<22)|4) > #define XY_SRC_COPY_BLT_CMD ((2<<29)|(0x53<<22)|6) > #define XY_MONO_SRC_COPY_IMM_BLT ((2<<29)|(0x71<<22)|5) > -#define XY_SRC_COPY_BLT_WRITE_ALPHA (1<<21) > -#define XY_SRC_COPY_BLT_WRITE_RGB (1<<20) > +#define BLT_WRITE_A (2<<20) > +#define BLT_WRITE_RGB (1<<20) > +#define BLT_WRITE_RGBA (BLT_WRITE_RGB | BLT_WRITE_A) > #define BLT_DEPTH_8 (0<<24) > #define BLT_DEPTH_16_565 (1<<24) > #define BLT_DEPTH_16_1555 (2<<24) > #define BLT_DEPTH_32 (3<<24) > -#define BLT_ROP_GXCOPY (0xcc<<16) > +#define BLT_ROP_SRC_COPY (0xcc<<16) > +#define BLT_ROP_COLOR_COPY (0xf0<<16) > #define XY_SRC_COPY_BLT_SRC_TILED (1<<15) /* 965+ only */ > #define XY_SRC_COPY_BLT_DST_TILED (1<<11) /* 965+ only */ > #define CMD_OP_DISPLAYBUFFER_INFO ((0x0<<29)|(0x14<<23)|2) > diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c > index 16371a444426..acd933b1b027 100644 > --- a/drivers/gpu/drm/i915/intel_ringbuffer.c > +++ b/drivers/gpu/drm/i915/intel_ringbuffer.c > @@ -1368,49 +1368,59 @@ i830_dispatch_execbuffer(struct intel_engine_cs *ring, > u64 offset, u32 len, > unsigned flags) > { > + u32 cs_offset = ring->scratch.gtt_offset; > int ret; > > - if (flags & I915_DISPATCH_PINNED) { > - ret = intel_ring_begin(ring, 4); > - if (ret) > - return ret; > + ret = intel_ring_begin(ring, 6); > + if (ret) > + return ret; > > - intel_ring_emit(ring, MI_BATCH_BUFFER); > - intel_ring_emit(ring, offset | (flags & I915_DISPATCH_SECURE ? 0 : MI_BATCH_NON_SECURE)); > - intel_ring_emit(ring, offset + len - 8); > - intel_ring_emit(ring, MI_NOOP); > - intel_ring_advance(ring); > - } else { > - u32 cs_offset = ring->scratch.gtt_offset; > + /* Evict the invalid PTE TLBs */ > + intel_ring_emit(ring, COLOR_BLT_CMD | BLT_WRITE_RGBA); > + intel_ring_emit(ring, BLT_DEPTH_32 | BLT_ROP_COLOR_COPY | 4096); > + intel_ring_emit(ring, 64 << 16 | 4); /* load each page */ s/64/(I830M_BACHT_LIMIT / 4096)/ for clarity > + intel_ring_emit(ring, cs_offset); > + intel_ring_emit(ring, 0xdeadbeef); > + intel_ring_emit(ring, MI_NOOP); > + intel_ring_advance(ring); > > + if ((flags & I915_DISPATCH_PINNED) == 0) { > if (len > I830_BATCH_LIMIT) > return -ENOSPC; > > - ret = intel_ring_begin(ring, 9+3); > + ret = intel_ring_begin(ring, 6 + 2); > if (ret) > return ret; > - /* Blit the batch (which has now all relocs applied) to the stable batch > - * scratch bo area (so that the CS never stumbles over its tlb > - * invalidation bug) ... */ > - intel_ring_emit(ring, XY_SRC_COPY_BLT_CMD | > - XY_SRC_COPY_BLT_WRITE_ALPHA | > - XY_SRC_COPY_BLT_WRITE_RGB); > - intel_ring_emit(ring, BLT_DEPTH_32 | BLT_ROP_GXCOPY | 4096); > - intel_ring_emit(ring, 0); > - intel_ring_emit(ring, (DIV_ROUND_UP(len, 4096) << 16) | 1024); > + > + /* Blit the batch (which has now all relocs applied) to the > + * stable batch scratch bo area (so that the CS never > + * stumbles over its tlb invalidation bug) ... > + */ > + intel_ring_emit(ring, SRC_COPY_BLT_CMD | BLT_WRITE_RGBA); > + intel_ring_emit(ring, BLT_DEPTH_32 | BLT_ROP_SRC_COPY | 4096); > + intel_ring_emit(ring, DIV_ROUND_UP(len, 4096) << 16 | 1024); > intel_ring_emit(ring, cs_offset); > - intel_ring_emit(ring, 0); > intel_ring_emit(ring, 4096); > intel_ring_emit(ring, offset); > + > intel_ring_emit(ring, MI_FLUSH); > + intel_ring_emit(ring, MI_NOOP); > + intel_ring_advance(ring); > > /* ... and execute it. */ > - intel_ring_emit(ring, MI_BATCH_BUFFER); > - intel_ring_emit(ring, cs_offset | (flags & I915_DISPATCH_SECURE ? 0 : MI_BATCH_NON_SECURE)); > - intel_ring_emit(ring, cs_offset + len - 8); > - intel_ring_advance(ring); > + offset = cs_offset; > } > > + ret = intel_ring_begin(ring, 4); > + if (ret) > + return ret; > + > + intel_ring_emit(ring, MI_BATCH_BUFFER); > + intel_ring_emit(ring, offset | (flags & I915_DISPATCH_SECURE ? 0 : MI_BATCH_NON_SECURE)); > + intel_ring_emit(ring, offset + len - 8); > + intel_ring_emit(ring, MI_NOOP); > + intel_ring_advance(ring); > + > return 0; > } > > -- > 2.1.0 > -- Daniel Vetter Software Engineer, Intel Corporation +41 (0) 79 365 57 48 - http://blog.ffwll.ch ^ permalink raw reply [flat|nested] 8+ messages in thread
* Re: [PATCH] drm/i915: Evict CS TLBs between batches 2014-09-08 8:03 ` Daniel Vetter @ 2014-09-08 8:15 ` Chris Wilson 2014-09-08 8:25 ` Daniel Vetter 2014-09-08 10:36 ` [Intel-gfx] " Chris Wilson 2014-09-08 13:25 ` Chris Wilson 1 sibling, 2 replies; 8+ messages in thread From: Chris Wilson @ 2014-09-08 8:15 UTC (permalink / raw) To: Daniel Vetter; +Cc: Daniel Vetter, intel-gfx, stable On Mon, Sep 08, 2014 at 10:03:51AM +0200, Daniel Vetter wrote: > On Sun, Sep 07, 2014 at 09:08:31AM +0100, Chris Wilson wrote: > > Running igt, I was encountering the invalid TLB bug on my 845g, despite > > that it was using the CS workaround. Examining the w/a buffer in the > > error state, showed that the copy from the user batch into the > > workaround itself was suffering from the invalid TLB bug (the first > > cacheline was broken with the first two words reversed). Time to try a > > fresh approach. This extends the workaround to write into each page of > > our scratch buffer in order to overflow the TLB and evict the invalid > > entries. This could be refined to only do so after we update the GTT, > > but for simplicity, we do it before each batch. > > > > I suspect this supersedes our current workaround, but for safety keep > > doing both. > > I suspect that we might end up with just an elaborate delay > implementation, but if it works then it's good. One nitpick below, with > that addressed this is Reviewed-by: Daniel Vetter <daniel.vetter@ffwll.ch> One way to test that is simply comparing 64x4096 byte writes in the same page vs 64x4 byte writes in 64 different pages. That should be roughly the same latency (thought with TLB fetches you never be too sure) and demonstrate that it is either the TLB or the delay that's the factor. In my testing, I did multiple copies into the batch w/a so that I was reasonably sure that what the source was stable and the copy of the source didn't match N times. > > > > Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk> > > Cc: Daniel Vetter <daniel.vetter@ffwll.ch> > > Cc: stable@vger.kernel.org > > --- > > drivers/gpu/drm/i915/i915_reg.h | 12 ++++--- > > drivers/gpu/drm/i915/intel_ringbuffer.c | 62 +++++++++++++++++++-------------- > > 2 files changed, 44 insertions(+), 30 deletions(-) > > > > diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h > > index e4d7607da2c4..f29b44c86a2f 100644 > > --- a/drivers/gpu/drm/i915/i915_reg.h > > +++ b/drivers/gpu/drm/i915/i915_reg.h > > @@ -334,16 +334,20 @@ > > #define GFX_OP_DESTBUFFER_INFO ((0x3<<29)|(0x1d<<24)|(0x8e<<16)|1) > > #define GFX_OP_DRAWRECT_INFO ((0x3<<29)|(0x1d<<24)|(0x80<<16)|(0x3)) > > #define GFX_OP_DRAWRECT_INFO_I965 ((0x7900<<16)|0x2) > > -#define SRC_COPY_BLT_CMD ((2<<29)|(0x43<<22)|4) > > + > > +#define COLOR_BLT_CMD (2<<29 | 0x40<<22 | (5-2)) > > +#define SRC_COPY_BLT_CMD ((2<<29)|(0x43<<22)|4) > > #define XY_SRC_COPY_BLT_CMD ((2<<29)|(0x53<<22)|6) > > #define XY_MONO_SRC_COPY_IMM_BLT ((2<<29)|(0x71<<22)|5) > > -#define XY_SRC_COPY_BLT_WRITE_ALPHA (1<<21) > > -#define XY_SRC_COPY_BLT_WRITE_RGB (1<<20) > > +#define BLT_WRITE_A (2<<20) > > +#define BLT_WRITE_RGB (1<<20) > > +#define BLT_WRITE_RGBA (BLT_WRITE_RGB | BLT_WRITE_A) > > #define BLT_DEPTH_8 (0<<24) > > #define BLT_DEPTH_16_565 (1<<24) > > #define BLT_DEPTH_16_1555 (2<<24) > > #define BLT_DEPTH_32 (3<<24) > > -#define BLT_ROP_GXCOPY (0xcc<<16) > > +#define BLT_ROP_SRC_COPY (0xcc<<16) > > +#define BLT_ROP_COLOR_COPY (0xf0<<16) > > #define XY_SRC_COPY_BLT_SRC_TILED (1<<15) /* 965+ only */ > > #define XY_SRC_COPY_BLT_DST_TILED (1<<11) /* 965+ only */ > > #define CMD_OP_DISPLAYBUFFER_INFO ((0x0<<29)|(0x14<<23)|2) > > diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c > > index 16371a444426..acd933b1b027 100644 > > --- a/drivers/gpu/drm/i915/intel_ringbuffer.c > > +++ b/drivers/gpu/drm/i915/intel_ringbuffer.c > > @@ -1368,49 +1368,59 @@ i830_dispatch_execbuffer(struct intel_engine_cs *ring, > > u64 offset, u32 len, > > unsigned flags) > > { > > + u32 cs_offset = ring->scratch.gtt_offset; > > int ret; > > > > - if (flags & I915_DISPATCH_PINNED) { > > - ret = intel_ring_begin(ring, 4); > > - if (ret) > > - return ret; > > + ret = intel_ring_begin(ring, 6); > > + if (ret) > > + return ret; > > > > - intel_ring_emit(ring, MI_BATCH_BUFFER); > > - intel_ring_emit(ring, offset | (flags & I915_DISPATCH_SECURE ? 0 : MI_BATCH_NON_SECURE)); > > - intel_ring_emit(ring, offset + len - 8); > > - intel_ring_emit(ring, MI_NOOP); > > - intel_ring_advance(ring); > > - } else { > > - u32 cs_offset = ring->scratch.gtt_offset; > > + /* Evict the invalid PTE TLBs */ > > + intel_ring_emit(ring, COLOR_BLT_CMD | BLT_WRITE_RGBA); > > + intel_ring_emit(ring, BLT_DEPTH_32 | BLT_ROP_COLOR_COPY | 4096); > > + intel_ring_emit(ring, 64 << 16 | 4); /* load each page */ > > s/64/(I830M_BACHT_LIMIT / 4096)/ for clarity Hmm, I don't think so. My thinking here is that this is TLB entries, so perhaps #define I830M_TLB_ENTRIES 64 #define I830M_BATCH_LIMIT 256*1024 /* uAPI convention */ #define I830M_WA_BUFSIZE MAX(GTT_PAGE_SIZE * I830M_TLB_ENTRIES, I830M_BATCH_LIMIT) -Chris -- Chris Wilson, Intel Open Source Technology Centre ^ permalink raw reply [flat|nested] 8+ messages in thread
* Re: [PATCH] drm/i915: Evict CS TLBs between batches 2014-09-08 8:15 ` Chris Wilson @ 2014-09-08 8:25 ` Daniel Vetter 2014-09-08 10:36 ` [Intel-gfx] " Chris Wilson 1 sibling, 0 replies; 8+ messages in thread From: Daniel Vetter @ 2014-09-08 8:25 UTC (permalink / raw) To: Chris Wilson, Daniel Vetter, intel-gfx, Daniel Vetter, stable On Mon, Sep 08, 2014 at 09:15:50AM +0100, Chris Wilson wrote: > On Mon, Sep 08, 2014 at 10:03:51AM +0200, Daniel Vetter wrote: > > On Sun, Sep 07, 2014 at 09:08:31AM +0100, Chris Wilson wrote: > > > Running igt, I was encountering the invalid TLB bug on my 845g, despite > > > that it was using the CS workaround. Examining the w/a buffer in the > > > error state, showed that the copy from the user batch into the > > > workaround itself was suffering from the invalid TLB bug (the first > > > cacheline was broken with the first two words reversed). Time to try a > > > fresh approach. This extends the workaround to write into each page of > > > our scratch buffer in order to overflow the TLB and evict the invalid > > > entries. This could be refined to only do so after we update the GTT, > > > but for simplicity, we do it before each batch. > > > > > > I suspect this supersedes our current workaround, but for safety keep > > > doing both. > > > > I suspect that we might end up with just an elaborate delay > > implementation, but if it works then it's good. One nitpick below, with > > that addressed this is Reviewed-by: Daniel Vetter <daniel.vetter@ffwll.ch> > > One way to test that is simply comparing 64x4096 byte writes in the same > page vs 64x4 byte writes in 64 different pages. That should be roughly > the same latency (thought with TLB fetches you never be too sure) and > demonstrate that it is either the TLB or the delay that's the factor. > > In my testing, I did multiple copies into the batch w/a so that I was > reasonably sure that what the source was stable and the copy of the > source didn't match N times. > > > > > > > Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk> > > > Cc: Daniel Vetter <daniel.vetter@ffwll.ch> > > > Cc: stable@vger.kernel.org > > > --- > > > drivers/gpu/drm/i915/i915_reg.h | 12 ++++--- > > > drivers/gpu/drm/i915/intel_ringbuffer.c | 62 +++++++++++++++++++-------------- > > > 2 files changed, 44 insertions(+), 30 deletions(-) > > > > > > diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h > > > index e4d7607da2c4..f29b44c86a2f 100644 > > > --- a/drivers/gpu/drm/i915/i915_reg.h > > > +++ b/drivers/gpu/drm/i915/i915_reg.h > > > @@ -334,16 +334,20 @@ > > > #define GFX_OP_DESTBUFFER_INFO ((0x3<<29)|(0x1d<<24)|(0x8e<<16)|1) > > > #define GFX_OP_DRAWRECT_INFO ((0x3<<29)|(0x1d<<24)|(0x80<<16)|(0x3)) > > > #define GFX_OP_DRAWRECT_INFO_I965 ((0x7900<<16)|0x2) > > > -#define SRC_COPY_BLT_CMD ((2<<29)|(0x43<<22)|4) > > > + > > > +#define COLOR_BLT_CMD (2<<29 | 0x40<<22 | (5-2)) > > > +#define SRC_COPY_BLT_CMD ((2<<29)|(0x43<<22)|4) > > > #define XY_SRC_COPY_BLT_CMD ((2<<29)|(0x53<<22)|6) > > > #define XY_MONO_SRC_COPY_IMM_BLT ((2<<29)|(0x71<<22)|5) > > > -#define XY_SRC_COPY_BLT_WRITE_ALPHA (1<<21) > > > -#define XY_SRC_COPY_BLT_WRITE_RGB (1<<20) > > > +#define BLT_WRITE_A (2<<20) > > > +#define BLT_WRITE_RGB (1<<20) > > > +#define BLT_WRITE_RGBA (BLT_WRITE_RGB | BLT_WRITE_A) > > > #define BLT_DEPTH_8 (0<<24) > > > #define BLT_DEPTH_16_565 (1<<24) > > > #define BLT_DEPTH_16_1555 (2<<24) > > > #define BLT_DEPTH_32 (3<<24) > > > -#define BLT_ROP_GXCOPY (0xcc<<16) > > > +#define BLT_ROP_SRC_COPY (0xcc<<16) > > > +#define BLT_ROP_COLOR_COPY (0xf0<<16) > > > #define XY_SRC_COPY_BLT_SRC_TILED (1<<15) /* 965+ only */ > > > #define XY_SRC_COPY_BLT_DST_TILED (1<<11) /* 965+ only */ > > > #define CMD_OP_DISPLAYBUFFER_INFO ((0x0<<29)|(0x14<<23)|2) > > > diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c > > > index 16371a444426..acd933b1b027 100644 > > > --- a/drivers/gpu/drm/i915/intel_ringbuffer.c > > > +++ b/drivers/gpu/drm/i915/intel_ringbuffer.c > > > @@ -1368,49 +1368,59 @@ i830_dispatch_execbuffer(struct intel_engine_cs *ring, > > > u64 offset, u32 len, > > > unsigned flags) > > > { > > > + u32 cs_offset = ring->scratch.gtt_offset; > > > int ret; > > > > > > - if (flags & I915_DISPATCH_PINNED) { > > > - ret = intel_ring_begin(ring, 4); > > > - if (ret) > > > - return ret; > > > + ret = intel_ring_begin(ring, 6); > > > + if (ret) > > > + return ret; > > > > > > - intel_ring_emit(ring, MI_BATCH_BUFFER); > > > - intel_ring_emit(ring, offset | (flags & I915_DISPATCH_SECURE ? 0 : MI_BATCH_NON_SECURE)); > > > - intel_ring_emit(ring, offset + len - 8); > > > - intel_ring_emit(ring, MI_NOOP); > > > - intel_ring_advance(ring); > > > - } else { > > > - u32 cs_offset = ring->scratch.gtt_offset; > > > + /* Evict the invalid PTE TLBs */ > > > + intel_ring_emit(ring, COLOR_BLT_CMD | BLT_WRITE_RGBA); > > > + intel_ring_emit(ring, BLT_DEPTH_32 | BLT_ROP_COLOR_COPY | 4096); > > > + intel_ring_emit(ring, 64 << 16 | 4); /* load each page */ > > > > s/64/(I830M_BACHT_LIMIT / 4096)/ for clarity > > Hmm, I don't think so. My thinking here is that this is TLB entries, > so perhaps > > #define I830M_TLB_ENTRIES 64 > #define I830M_BATCH_LIMIT 256*1024 /* uAPI convention */ > > #define I830M_WA_BUFSIZE MAX(GTT_PAGE_SIZE * I830M_TLB_ENTRIES, I830M_BATCH_LIMIT) Yeah that works better. Or a comment explaining that we have 64 tlbs. Either is fine, as long as we have a symbolic link to the scratch wa buffer size to make sure we don't overwrite random memory by accident. -Daniel -- Daniel Vetter Software Engineer, Intel Corporation +41 (0) 79 365 57 48 - http://blog.ffwll.ch ^ permalink raw reply [flat|nested] 8+ messages in thread
* Re: [Intel-gfx] [PATCH] drm/i915: Evict CS TLBs between batches 2014-09-08 8:15 ` Chris Wilson 2014-09-08 8:25 ` Daniel Vetter @ 2014-09-08 10:36 ` Chris Wilson 1 sibling, 0 replies; 8+ messages in thread From: Chris Wilson @ 2014-09-08 10:36 UTC (permalink / raw) To: Daniel Vetter, intel-gfx, Daniel Vetter, stable On Mon, Sep 08, 2014 at 09:15:50AM +0100, Chris Wilson wrote: > On Mon, Sep 08, 2014 at 10:03:51AM +0200, Daniel Vetter wrote: > > On Sun, Sep 07, 2014 at 09:08:31AM +0100, Chris Wilson wrote: > > > Running igt, I was encountering the invalid TLB bug on my 845g, despite > > > that it was using the CS workaround. Examining the w/a buffer in the > > > error state, showed that the copy from the user batch into the > > > workaround itself was suffering from the invalid TLB bug (the first > > > cacheline was broken with the first two words reversed). Time to try a > > > fresh approach. This extends the workaround to write into each page of > > > our scratch buffer in order to overflow the TLB and evict the invalid > > > entries. This could be refined to only do so after we update the GTT, > > > but for simplicity, we do it before each batch. > > > > > > I suspect this supersedes our current workaround, but for safety keep > > > doing both. > > > > I suspect that we might end up with just an elaborate delay > > implementation, but if it works then it's good. One nitpick below, with > > that addressed this is Reviewed-by: Daniel Vetter <daniel.vetter@ffwll.ch> > > One way to test that is simply comparing 64x4096 byte writes in the same > page vs 64x4 byte writes in 64 different pages. That should be roughly > the same latency (thought with TLB fetches you never be too sure) and > demonstrate that it is either the TLB or the delay that's the factor. Quick update: Wrote 256k into one page (instead of 4 byte write into each of 64 pages), hopefully testing the delay theory, and found it did not prevent the corruption/hang. Now trying to refine the estimate on the number of TLBs. -Chris -- Chris Wilson, Intel Open Source Technology Centre ^ permalink raw reply [flat|nested] 8+ messages in thread
* [PATCH] drm/i915: Evict CS TLBs between batches 2014-09-08 8:03 ` Daniel Vetter 2014-09-08 8:15 ` Chris Wilson @ 2014-09-08 13:25 ` Chris Wilson 2014-09-08 13:53 ` Jani Nikula 1 sibling, 1 reply; 8+ messages in thread From: Chris Wilson @ 2014-09-08 13:25 UTC (permalink / raw) To: intel-gfx; +Cc: Chris Wilson, Daniel Vetter, Ville Syrjälä, stable Running igt, I was encountering the invalid TLB bug on my 845g, despite that it was using the CS workaround. Examining the w/a buffer in the error state, showed that the copy from the user batch into the workaround itself was suffering from the invalid TLB bug (the first cacheline was broken with the first two words reversed). Time to try a fresh approach. This extends the workaround to write into each page of our scratch buffer in order to overflow the TLB and evict the invalid entries. This could be refined to only do so after we update the GTT, but for simplicity, we do it before each batch. I suspect this supersedes our current workaround, but for safety keep doing both. v2: The magic number shall be 2. This doesn't conclusively prove that it is the mythical TLB bug we've been trying to workaround for so long, that it requires touching a number of pages to prevent the corruption indicates to me that it is TLB related, but the corruption (the reversed cacheline) is more subtle than a TLB bug, where we would expect it to read the wrong page entirely. Oh well, it prevents a reliable hang for me and so probably for others as well. Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk> Cc: Daniel Vetter <daniel.vetter@ffwll.ch> Cc: Ville Syrjälä <ville.syrjala@linux.intel.com> Cc: stable@vger.kernel.org Reviewed-by: Daniel Vetter <daniel.vetter@ffwll.ch> --- drivers/gpu/drm/i915/i915_reg.h | 12 ++++-- drivers/gpu/drm/i915/intel_ringbuffer.c | 66 +++++++++++++++++++-------------- 2 files changed, 47 insertions(+), 31 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h index e4d7607da2c4..f29b44c86a2f 100644 --- a/drivers/gpu/drm/i915/i915_reg.h +++ b/drivers/gpu/drm/i915/i915_reg.h @@ -334,16 +334,20 @@ #define GFX_OP_DESTBUFFER_INFO ((0x3<<29)|(0x1d<<24)|(0x8e<<16)|1) #define GFX_OP_DRAWRECT_INFO ((0x3<<29)|(0x1d<<24)|(0x80<<16)|(0x3)) #define GFX_OP_DRAWRECT_INFO_I965 ((0x7900<<16)|0x2) -#define SRC_COPY_BLT_CMD ((2<<29)|(0x43<<22)|4) + +#define COLOR_BLT_CMD (2<<29 | 0x40<<22 | (5-2)) +#define SRC_COPY_BLT_CMD ((2<<29)|(0x43<<22)|4) #define XY_SRC_COPY_BLT_CMD ((2<<29)|(0x53<<22)|6) #define XY_MONO_SRC_COPY_IMM_BLT ((2<<29)|(0x71<<22)|5) -#define XY_SRC_COPY_BLT_WRITE_ALPHA (1<<21) -#define XY_SRC_COPY_BLT_WRITE_RGB (1<<20) +#define BLT_WRITE_A (2<<20) +#define BLT_WRITE_RGB (1<<20) +#define BLT_WRITE_RGBA (BLT_WRITE_RGB | BLT_WRITE_A) #define BLT_DEPTH_8 (0<<24) #define BLT_DEPTH_16_565 (1<<24) #define BLT_DEPTH_16_1555 (2<<24) #define BLT_DEPTH_32 (3<<24) -#define BLT_ROP_GXCOPY (0xcc<<16) +#define BLT_ROP_SRC_COPY (0xcc<<16) +#define BLT_ROP_COLOR_COPY (0xf0<<16) #define XY_SRC_COPY_BLT_SRC_TILED (1<<15) /* 965+ only */ #define XY_SRC_COPY_BLT_DST_TILED (1<<11) /* 965+ only */ #define CMD_OP_DISPLAYBUFFER_INFO ((0x0<<29)|(0x14<<23)|2) diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c index 16371a444426..2d068edd1adc 100644 --- a/drivers/gpu/drm/i915/intel_ringbuffer.c +++ b/drivers/gpu/drm/i915/intel_ringbuffer.c @@ -1363,54 +1363,66 @@ i965_dispatch_execbuffer(struct intel_engine_cs *ring, /* Just userspace ABI convention to limit the wa batch bo to a resonable size */ #define I830_BATCH_LIMIT (256*1024) +#define I830_TLB_ENTRIES (2) +#define I830_WA_SIZE max(I830_TLB_ENTRIES*4096, I830_BATCH_LIMIT) static int i830_dispatch_execbuffer(struct intel_engine_cs *ring, u64 offset, u32 len, unsigned flags) { + u32 cs_offset = ring->scratch.gtt_offset; int ret; - if (flags & I915_DISPATCH_PINNED) { - ret = intel_ring_begin(ring, 4); - if (ret) - return ret; + ret = intel_ring_begin(ring, 6); + if (ret) + return ret; - intel_ring_emit(ring, MI_BATCH_BUFFER); - intel_ring_emit(ring, offset | (flags & I915_DISPATCH_SECURE ? 0 : MI_BATCH_NON_SECURE)); - intel_ring_emit(ring, offset + len - 8); - intel_ring_emit(ring, MI_NOOP); - intel_ring_advance(ring); - } else { - u32 cs_offset = ring->scratch.gtt_offset; + /* Evict the invalid PTE TLBs */ + intel_ring_emit(ring, COLOR_BLT_CMD | BLT_WRITE_RGBA); + intel_ring_emit(ring, BLT_DEPTH_32 | BLT_ROP_COLOR_COPY | 4096); + intel_ring_emit(ring, I830_TLB_ENTRIES << 16 | 4); /* load each page */ + intel_ring_emit(ring, cs_offset); + intel_ring_emit(ring, 0xdeadbeef); + intel_ring_emit(ring, MI_NOOP); + intel_ring_advance(ring); + if ((flags & I915_DISPATCH_PINNED) == 0) { if (len > I830_BATCH_LIMIT) return -ENOSPC; - ret = intel_ring_begin(ring, 9+3); + ret = intel_ring_begin(ring, 6 + 2); if (ret) return ret; - /* Blit the batch (which has now all relocs applied) to the stable batch - * scratch bo area (so that the CS never stumbles over its tlb - * invalidation bug) ... */ - intel_ring_emit(ring, XY_SRC_COPY_BLT_CMD | - XY_SRC_COPY_BLT_WRITE_ALPHA | - XY_SRC_COPY_BLT_WRITE_RGB); - intel_ring_emit(ring, BLT_DEPTH_32 | BLT_ROP_GXCOPY | 4096); - intel_ring_emit(ring, 0); - intel_ring_emit(ring, (DIV_ROUND_UP(len, 4096) << 16) | 1024); + + /* Blit the batch (which has now all relocs applied) to the + * stable batch scratch bo area (so that the CS never + * stumbles over its tlb invalidation bug) ... + */ + intel_ring_emit(ring, SRC_COPY_BLT_CMD | BLT_WRITE_RGBA); + intel_ring_emit(ring, BLT_DEPTH_32 | BLT_ROP_SRC_COPY | 4096); + intel_ring_emit(ring, DIV_ROUND_UP(len, 4096) << 16 | 1024); intel_ring_emit(ring, cs_offset); - intel_ring_emit(ring, 0); intel_ring_emit(ring, 4096); intel_ring_emit(ring, offset); + intel_ring_emit(ring, MI_FLUSH); + intel_ring_emit(ring, MI_NOOP); + intel_ring_advance(ring); /* ... and execute it. */ - intel_ring_emit(ring, MI_BATCH_BUFFER); - intel_ring_emit(ring, cs_offset | (flags & I915_DISPATCH_SECURE ? 0 : MI_BATCH_NON_SECURE)); - intel_ring_emit(ring, cs_offset + len - 8); - intel_ring_advance(ring); + offset = cs_offset; } + ret = intel_ring_begin(ring, 4); + if (ret) + return ret; + + intel_ring_emit(ring, MI_BATCH_BUFFER); + intel_ring_emit(ring, offset | (flags & I915_DISPATCH_SECURE ? 0 : MI_BATCH_NON_SECURE)); + intel_ring_emit(ring, offset + len - 8); + intel_ring_emit(ring, MI_NOOP); + intel_ring_advance(ring); + return 0; } @@ -2200,7 +2212,7 @@ int intel_init_render_ring_buffer(struct drm_device *dev) /* Workaround batchbuffer to combat CS tlb bug. */ if (HAS_BROKEN_CS_TLB(dev)) { - obj = i915_gem_alloc_object(dev, I830_BATCH_LIMIT); + obj = i915_gem_alloc_object(dev, I830_WA_SIZE); if (obj == NULL) { DRM_ERROR("Failed to allocate batch bo\n"); return -ENOMEM; -- 2.1.0 ^ permalink raw reply related [flat|nested] 8+ messages in thread
* Re: [PATCH] drm/i915: Evict CS TLBs between batches 2014-09-08 13:25 ` Chris Wilson @ 2014-09-08 13:53 ` Jani Nikula 0 siblings, 0 replies; 8+ messages in thread From: Jani Nikula @ 2014-09-08 13:53 UTC (permalink / raw) To: Chris Wilson, intel-gfx; +Cc: Daniel Vetter, stable On Mon, 08 Sep 2014, Chris Wilson <chris@chris-wilson.co.uk> wrote: > Running igt, I was encountering the invalid TLB bug on my 845g, despite > that it was using the CS workaround. Examining the w/a buffer in the > error state, showed that the copy from the user batch into the > workaround itself was suffering from the invalid TLB bug (the first > cacheline was broken with the first two words reversed). Time to try a > fresh approach. This extends the workaround to write into each page of > our scratch buffer in order to overflow the TLB and evict the invalid > entries. This could be refined to only do so after we update the GTT, > but for simplicity, we do it before each batch. > > I suspect this supersedes our current workaround, but for safety keep > doing both. > > v2: The magic number shall be 2. > > This doesn't conclusively prove that it is the mythical TLB bug we've > been trying to workaround for so long, that it requires touching a number > of pages to prevent the corruption indicates to me that it is TLB > related, but the corruption (the reversed cacheline) is more subtle than > a TLB bug, where we would expect it to read the wrong page entirely. > > Oh well, it prevents a reliable hang for me and so probably for others > as well. > > Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk> > Cc: Daniel Vetter <daniel.vetter@ffwll.ch> > Cc: Ville Syrjälä <ville.syrjala@linux.intel.com> > Cc: stable@vger.kernel.org > Reviewed-by: Daniel Vetter <daniel.vetter@ffwll.ch> Pushed to drm-intel-fixes, thanks for the patch and review. BR, Jani. > --- > drivers/gpu/drm/i915/i915_reg.h | 12 ++++-- > drivers/gpu/drm/i915/intel_ringbuffer.c | 66 +++++++++++++++++++-------------- > 2 files changed, 47 insertions(+), 31 deletions(-) > > diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h > index e4d7607da2c4..f29b44c86a2f 100644 > --- a/drivers/gpu/drm/i915/i915_reg.h > +++ b/drivers/gpu/drm/i915/i915_reg.h > @@ -334,16 +334,20 @@ > #define GFX_OP_DESTBUFFER_INFO ((0x3<<29)|(0x1d<<24)|(0x8e<<16)|1) > #define GFX_OP_DRAWRECT_INFO ((0x3<<29)|(0x1d<<24)|(0x80<<16)|(0x3)) > #define GFX_OP_DRAWRECT_INFO_I965 ((0x7900<<16)|0x2) > -#define SRC_COPY_BLT_CMD ((2<<29)|(0x43<<22)|4) > + > +#define COLOR_BLT_CMD (2<<29 | 0x40<<22 | (5-2)) > +#define SRC_COPY_BLT_CMD ((2<<29)|(0x43<<22)|4) > #define XY_SRC_COPY_BLT_CMD ((2<<29)|(0x53<<22)|6) > #define XY_MONO_SRC_COPY_IMM_BLT ((2<<29)|(0x71<<22)|5) > -#define XY_SRC_COPY_BLT_WRITE_ALPHA (1<<21) > -#define XY_SRC_COPY_BLT_WRITE_RGB (1<<20) > +#define BLT_WRITE_A (2<<20) > +#define BLT_WRITE_RGB (1<<20) > +#define BLT_WRITE_RGBA (BLT_WRITE_RGB | BLT_WRITE_A) > #define BLT_DEPTH_8 (0<<24) > #define BLT_DEPTH_16_565 (1<<24) > #define BLT_DEPTH_16_1555 (2<<24) > #define BLT_DEPTH_32 (3<<24) > -#define BLT_ROP_GXCOPY (0xcc<<16) > +#define BLT_ROP_SRC_COPY (0xcc<<16) > +#define BLT_ROP_COLOR_COPY (0xf0<<16) > #define XY_SRC_COPY_BLT_SRC_TILED (1<<15) /* 965+ only */ > #define XY_SRC_COPY_BLT_DST_TILED (1<<11) /* 965+ only */ > #define CMD_OP_DISPLAYBUFFER_INFO ((0x0<<29)|(0x14<<23)|2) > diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c > index 16371a444426..2d068edd1adc 100644 > --- a/drivers/gpu/drm/i915/intel_ringbuffer.c > +++ b/drivers/gpu/drm/i915/intel_ringbuffer.c > @@ -1363,54 +1363,66 @@ i965_dispatch_execbuffer(struct intel_engine_cs *ring, > > /* Just userspace ABI convention to limit the wa batch bo to a resonable size */ > #define I830_BATCH_LIMIT (256*1024) > +#define I830_TLB_ENTRIES (2) > +#define I830_WA_SIZE max(I830_TLB_ENTRIES*4096, I830_BATCH_LIMIT) > static int > i830_dispatch_execbuffer(struct intel_engine_cs *ring, > u64 offset, u32 len, > unsigned flags) > { > + u32 cs_offset = ring->scratch.gtt_offset; > int ret; > > - if (flags & I915_DISPATCH_PINNED) { > - ret = intel_ring_begin(ring, 4); > - if (ret) > - return ret; > + ret = intel_ring_begin(ring, 6); > + if (ret) > + return ret; > > - intel_ring_emit(ring, MI_BATCH_BUFFER); > - intel_ring_emit(ring, offset | (flags & I915_DISPATCH_SECURE ? 0 : MI_BATCH_NON_SECURE)); > - intel_ring_emit(ring, offset + len - 8); > - intel_ring_emit(ring, MI_NOOP); > - intel_ring_advance(ring); > - } else { > - u32 cs_offset = ring->scratch.gtt_offset; > + /* Evict the invalid PTE TLBs */ > + intel_ring_emit(ring, COLOR_BLT_CMD | BLT_WRITE_RGBA); > + intel_ring_emit(ring, BLT_DEPTH_32 | BLT_ROP_COLOR_COPY | 4096); > + intel_ring_emit(ring, I830_TLB_ENTRIES << 16 | 4); /* load each page */ > + intel_ring_emit(ring, cs_offset); > + intel_ring_emit(ring, 0xdeadbeef); > + intel_ring_emit(ring, MI_NOOP); > + intel_ring_advance(ring); > > + if ((flags & I915_DISPATCH_PINNED) == 0) { > if (len > I830_BATCH_LIMIT) > return -ENOSPC; > > - ret = intel_ring_begin(ring, 9+3); > + ret = intel_ring_begin(ring, 6 + 2); > if (ret) > return ret; > - /* Blit the batch (which has now all relocs applied) to the stable batch > - * scratch bo area (so that the CS never stumbles over its tlb > - * invalidation bug) ... */ > - intel_ring_emit(ring, XY_SRC_COPY_BLT_CMD | > - XY_SRC_COPY_BLT_WRITE_ALPHA | > - XY_SRC_COPY_BLT_WRITE_RGB); > - intel_ring_emit(ring, BLT_DEPTH_32 | BLT_ROP_GXCOPY | 4096); > - intel_ring_emit(ring, 0); > - intel_ring_emit(ring, (DIV_ROUND_UP(len, 4096) << 16) | 1024); > + > + /* Blit the batch (which has now all relocs applied) to the > + * stable batch scratch bo area (so that the CS never > + * stumbles over its tlb invalidation bug) ... > + */ > + intel_ring_emit(ring, SRC_COPY_BLT_CMD | BLT_WRITE_RGBA); > + intel_ring_emit(ring, BLT_DEPTH_32 | BLT_ROP_SRC_COPY | 4096); > + intel_ring_emit(ring, DIV_ROUND_UP(len, 4096) << 16 | 1024); > intel_ring_emit(ring, cs_offset); > - intel_ring_emit(ring, 0); > intel_ring_emit(ring, 4096); > intel_ring_emit(ring, offset); > + > intel_ring_emit(ring, MI_FLUSH); > + intel_ring_emit(ring, MI_NOOP); > + intel_ring_advance(ring); > > /* ... and execute it. */ > - intel_ring_emit(ring, MI_BATCH_BUFFER); > - intel_ring_emit(ring, cs_offset | (flags & I915_DISPATCH_SECURE ? 0 : MI_BATCH_NON_SECURE)); > - intel_ring_emit(ring, cs_offset + len - 8); > - intel_ring_advance(ring); > + offset = cs_offset; > } > > + ret = intel_ring_begin(ring, 4); > + if (ret) > + return ret; > + > + intel_ring_emit(ring, MI_BATCH_BUFFER); > + intel_ring_emit(ring, offset | (flags & I915_DISPATCH_SECURE ? 0 : MI_BATCH_NON_SECURE)); > + intel_ring_emit(ring, offset + len - 8); > + intel_ring_emit(ring, MI_NOOP); > + intel_ring_advance(ring); > + > return 0; > } > > @@ -2200,7 +2212,7 @@ int intel_init_render_ring_buffer(struct drm_device *dev) > > /* Workaround batchbuffer to combat CS tlb bug. */ > if (HAS_BROKEN_CS_TLB(dev)) { > - obj = i915_gem_alloc_object(dev, I830_BATCH_LIMIT); > + obj = i915_gem_alloc_object(dev, I830_WA_SIZE); > if (obj == NULL) { > DRM_ERROR("Failed to allocate batch bo\n"); > return -ENOMEM; > -- > 2.1.0 > > _______________________________________________ > Intel-gfx mailing list > Intel-gfx@lists.freedesktop.org > http://lists.freedesktop.org/mailman/listinfo/intel-gfx -- Jani Nikula, Intel Open Source Technology Center _______________________________________________ Intel-gfx mailing list Intel-gfx@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/intel-gfx ^ permalink raw reply [flat|nested] 8+ messages in thread
end of thread, other threads:[~2014-09-08 13:52 UTC | newest] Thread overview: 8+ messages (download: mbox.gz follow: Atom feed -- links below jump to the message on this page -- 2014-09-07 8:08 [PATCH] drm/i915: Evict CS TLBs between batches Chris Wilson 2014-09-07 12:26 ` Chris Wilson 2014-09-08 8:03 ` Daniel Vetter 2014-09-08 8:15 ` Chris Wilson 2014-09-08 8:25 ` Daniel Vetter 2014-09-08 10:36 ` [Intel-gfx] " Chris Wilson 2014-09-08 13:25 ` Chris Wilson 2014-09-08 13:53 ` Jani Nikula
This is a public inbox, see mirroring instructions for how to clone and mirror all data and code used for this inbox