* [PATCH 1/3] drm/i915/gvt: optimize for vGPU mmio switch
2017-12-07 4:34 [PATCH 0/3] mmio save restore refine in vgpu switch Weinan Li
@ 2017-12-07 4:34 ` Weinan Li
2017-12-07 4:34 ` [PATCH 2/3] drm/i915/gvt: refine mocs save restore policy Weinan Li
` (3 subsequent siblings)
4 siblings, 0 replies; 6+ messages in thread
From: Weinan Li @ 2017-12-07 4:34 UTC (permalink / raw)
To: intel-gfx
Currently, an mmio switch between two vGPUs has to switch to the host first
and then to the target vGPU, which wastes one round of mmio save/restore.
Reading and writing mmio is usually time-consuming, and there are many mocs
registers that need to be saved/restored during a vGPU switch. Combining
switch_mmio_to_host() and switch_mmio_to_vgpu() removes one round of mmio
save/restore, which reduces CPU utilization and improves performance when
multiple VMs are running heavy workloads.
Signed-off-by: Weinan Li <weinan.z.li@intel.com>
---
drivers/gpu/drm/i915/gvt/render.c | 212 ++++++++++++++++----------------------
drivers/gpu/drm/i915/gvt/trace.h | 15 +--
2 files changed, 95 insertions(+), 132 deletions(-)
diff --git a/drivers/gpu/drm/i915/gvt/render.c b/drivers/gpu/drm/i915/gvt/render.c
index dac12c2..ec1e60d 100644
--- a/drivers/gpu/drm/i915/gvt/render.c
+++ b/drivers/gpu/drm/i915/gvt/render.c
@@ -190,9 +190,10 @@ static void handle_tlb_pending_event(struct intel_vgpu *vgpu, int ring_id)
gvt_dbg_core("invalidate TLB for ring %d\n", ring_id);
}
-static void load_mocs(struct intel_vgpu *vgpu, int ring_id)
+static void switch_mocs(struct intel_vgpu *pre, struct intel_vgpu *next,
+ int ring_id)
{
- struct drm_i915_private *dev_priv = vgpu->gvt->dev_priv;
+ struct drm_i915_private *dev_priv;
i915_reg_t offset, l3_offset;
u32 regs[] = {
[RCS] = 0xc800,
@@ -203,54 +204,44 @@ static void load_mocs(struct intel_vgpu *vgpu, int ring_id)
};
int i;
+ dev_priv = pre ? pre->gvt->dev_priv : next->gvt->dev_priv;
if (WARN_ON(ring_id >= ARRAY_SIZE(regs)))
return;
offset.reg = regs[ring_id];
- for (i = 0; i < 64; i++) {
- gen9_render_mocs[ring_id][i] = I915_READ_FW(offset);
- I915_WRITE_FW(offset, vgpu_vreg(vgpu, offset));
- offset.reg += 4;
- }
-
- if (ring_id == RCS) {
- l3_offset.reg = 0xb020;
- for (i = 0; i < 32; i++) {
- gen9_render_mocs_L3[i] = I915_READ_FW(l3_offset);
- I915_WRITE_FW(l3_offset, vgpu_vreg(vgpu, l3_offset));
- l3_offset.reg += 4;
- }
- }
-}
-static void restore_mocs(struct intel_vgpu *vgpu, int ring_id)
-{
- struct drm_i915_private *dev_priv = vgpu->gvt->dev_priv;
- i915_reg_t offset, l3_offset;
- u32 regs[] = {
- [RCS] = 0xc800,
- [VCS] = 0xc900,
- [VCS2] = 0xca00,
- [BCS] = 0xcc00,
- [VECS] = 0xcb00,
- };
- int i;
+ for (i = 0; i < 64; i++) {
+ if (pre)
+ vgpu_vreg(pre, offset) =
+ I915_READ_FW(offset);
+ else
+ gen9_render_mocs[ring_id][i] =
+ I915_READ_FW(offset);
- if (WARN_ON(ring_id >= ARRAY_SIZE(regs)))
- return;
+ if (next)
+ I915_WRITE_FW(offset, vgpu_vreg(next, offset));
+ else
+ I915_WRITE_FW(offset, gen9_render_mocs[ring_id][i]);
- offset.reg = regs[ring_id];
- for (i = 0; i < 64; i++) {
- vgpu_vreg(vgpu, offset) = I915_READ_FW(offset);
- I915_WRITE_FW(offset, gen9_render_mocs[ring_id][i]);
offset.reg += 4;
}
if (ring_id == RCS) {
l3_offset.reg = 0xb020;
for (i = 0; i < 32; i++) {
- vgpu_vreg(vgpu, l3_offset) = I915_READ_FW(l3_offset);
- I915_WRITE_FW(l3_offset, gen9_render_mocs_L3[i]);
+ if (pre)
+ vgpu_vreg(pre, l3_offset) =
+ I915_READ_FW(l3_offset);
+ else
+ gen9_render_mocs_L3[i] =
+ I915_READ_FW(l3_offset);
+ if (next)
+ I915_WRITE_FW(l3_offset,
+ vgpu_vreg(next, l3_offset));
+ else
+ I915_WRITE_FW(l3_offset,
+ gen9_render_mocs_L3[i]);
+
l3_offset.reg += 4;
}
}
@@ -258,78 +249,25 @@ static void restore_mocs(struct intel_vgpu *vgpu, int ring_id)
#define CTX_CONTEXT_CONTROL_VAL 0x03
-/* Switch ring mmio values (context) from host to a vgpu. */
-static void switch_mmio_to_vgpu(struct intel_vgpu *vgpu, int ring_id)
-{
- struct drm_i915_private *dev_priv = vgpu->gvt->dev_priv;
- struct intel_vgpu_submission *s = &vgpu->submission;
- u32 *reg_state = s->shadow_ctx->engine[ring_id].lrc_reg_state;
- u32 ctx_ctrl = reg_state[CTX_CONTEXT_CONTROL_VAL];
- u32 inhibit_mask =
- _MASKED_BIT_ENABLE(CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT);
- i915_reg_t last_reg = _MMIO(0);
- struct render_mmio *mmio;
- u32 v;
- int i, array_size;
-
- if (IS_SKYLAKE(vgpu->gvt->dev_priv)
- || IS_KABYLAKE(vgpu->gvt->dev_priv)) {
- mmio = gen9_render_mmio_list;
- array_size = ARRAY_SIZE(gen9_render_mmio_list);
- load_mocs(vgpu, ring_id);
- } else {
- mmio = gen8_render_mmio_list;
- array_size = ARRAY_SIZE(gen8_render_mmio_list);
- }
-
- for (i = 0; i < array_size; i++, mmio++) {
- if (mmio->ring_id != ring_id)
- continue;
-
- mmio->value = I915_READ_FW(mmio->reg);
-
- /*
- * if it is an inhibit context, load in_context mmio
- * into HW by mmio write. If it is not, skip this mmio
- * write.
- */
- if (mmio->in_context &&
- (ctx_ctrl & inhibit_mask) != inhibit_mask)
- continue;
-
- if (mmio->mask)
- v = vgpu_vreg(vgpu, mmio->reg) | (mmio->mask << 16);
- else
- v = vgpu_vreg(vgpu, mmio->reg);
-
- I915_WRITE_FW(mmio->reg, v);
- last_reg = mmio->reg;
-
- trace_render_mmio(vgpu->id, "load",
- i915_mmio_reg_offset(mmio->reg),
- mmio->value, v);
- }
-
- /* Make sure the swiched MMIOs has taken effect. */
- if (likely(INTEL_GVT_MMIO_OFFSET(last_reg)))
- I915_READ_FW(last_reg);
-
- handle_tlb_pending_event(vgpu, ring_id);
-}
-
-/* Switch ring mmio values (context) from vgpu to host. */
-static void switch_mmio_to_host(struct intel_vgpu *vgpu, int ring_id)
+/* Switch ring mmio values (context). */
+static void switch_mmio(struct intel_vgpu *pre, struct intel_vgpu *next,
+ int ring_id)
{
- struct drm_i915_private *dev_priv = vgpu->gvt->dev_priv;
+ struct drm_i915_private *dev_priv;
struct render_mmio *mmio;
i915_reg_t last_reg = _MMIO(0);
- u32 v;
+ u32 old_v, new_v;
int i, array_size;
+ struct intel_vgpu_submission *s;
+ u32 *reg_state, ctx_ctrl;
+ u32 inhibit_mask =
+ _MASKED_BIT_ENABLE(CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT);
+ dev_priv = pre ? pre->gvt->dev_priv : next->gvt->dev_priv;
if (IS_SKYLAKE(dev_priv) || IS_KABYLAKE(dev_priv)) {
mmio = gen9_render_mmio_list;
array_size = ARRAY_SIZE(gen9_render_mmio_list);
- restore_mocs(vgpu, ring_id);
+ switch_mocs(pre, next, ring_id);
} else {
mmio = gen8_render_mmio_list;
array_size = ARRAY_SIZE(gen8_render_mmio_list);
@@ -338,29 +276,61 @@ static void switch_mmio_to_host(struct intel_vgpu *vgpu, int ring_id)
for (i = 0; i < array_size; i++, mmio++) {
if (mmio->ring_id != ring_id)
continue;
+ // save
+ if (pre) {
+ vgpu_vreg(pre, mmio->reg) = I915_READ_FW(mmio->reg);
+ if (mmio->mask)
+ vgpu_vreg(pre, mmio->reg) &=
+ ~(mmio->mask << 16);
+ old_v = vgpu_vreg(pre, mmio->reg);
+ } else {
+ mmio->value = I915_READ_FW(mmio->reg);
+ old_v = mmio->value;
+ }
- vgpu_vreg(vgpu, mmio->reg) = I915_READ_FW(mmio->reg);
-
- if (mmio->mask) {
- vgpu_vreg(vgpu, mmio->reg) &= ~(mmio->mask << 16);
- v = mmio->value | (mmio->mask << 16);
- } else
- v = mmio->value;
-
- if (mmio->in_context)
- continue;
+ // restore
+ if (next) {
+ s = &next->submission;
+ reg_state =
+ s->shadow_ctx->engine[ring_id].lrc_reg_state;
+ ctx_ctrl = reg_state[CTX_CONTEXT_CONTROL_VAL];
+ /*
+ * if it is an inhibit context, load in_context mmio
+ * into HW by mmio write. If it is not, skip this mmio
+ * write.
+ */
+ if (mmio->in_context &&
+ (ctx_ctrl & inhibit_mask) != inhibit_mask)
+ continue;
+
+ if (mmio->mask)
+ new_v = vgpu_vreg(next, mmio->reg) |
+ (mmio->mask << 16);
+ else
+ new_v = vgpu_vreg(next, mmio->reg);
+ } else {
+ if (mmio->in_context)
+ continue;
+ if (mmio->mask)
+ new_v = mmio->value | (mmio->mask << 16);
+ else
+ new_v = mmio->value;
+ }
- I915_WRITE_FW(mmio->reg, v);
+ I915_WRITE_FW(mmio->reg, new_v);
last_reg = mmio->reg;
-
- trace_render_mmio(vgpu->id, "restore",
+ trace_render_mmio(pre ? pre->id : 0,
+ next ? next->id : 0, "switch",
i915_mmio_reg_offset(mmio->reg),
- mmio->value, v);
+ old_v, new_v);
}
/* Make sure the swiched MMIOs has taken effect. */
if (likely(INTEL_GVT_MMIO_OFFSET(last_reg)))
I915_READ_FW(last_reg);
+
+ if (next)
+ handle_tlb_pending_event(next, ring_id);
}
/**
@@ -391,16 +361,6 @@ void intel_gvt_switch_mmio(struct intel_vgpu *pre,
* handle forcewake mannually.
*/
intel_uncore_forcewake_get(dev_priv, FORCEWAKE_ALL);
-
- /**
- * TODO: Optimize for vGPU to vGPU switch by merging
- * switch_mmio_to_host() and switch_mmio_to_vgpu().
- */
- if (pre)
- switch_mmio_to_host(pre, ring_id);
-
- if (next)
- switch_mmio_to_vgpu(next, ring_id);
-
+ switch_mmio(pre, next, ring_id);
intel_uncore_forcewake_put(dev_priv, FORCEWAKE_ALL);
}
diff --git a/drivers/gpu/drm/i915/gvt/trace.h b/drivers/gpu/drm/i915/gvt/trace.h
index 8c15038..7a25115 100644
--- a/drivers/gpu/drm/i915/gvt/trace.h
+++ b/drivers/gpu/drm/i915/gvt/trace.h
@@ -330,13 +330,14 @@
);
TRACE_EVENT(render_mmio,
- TP_PROTO(int id, char *action, unsigned int reg,
+ TP_PROTO(int old_id, int new_id, char *action, unsigned int reg,
unsigned int old_val, unsigned int new_val),
- TP_ARGS(id, action, reg, new_val, old_val),
+ TP_ARGS(old_id, new_id, action, reg, new_val, old_val),
TP_STRUCT__entry(
- __field(int, id)
+ __field(int, old_id)
+ __field(int, new_id)
__array(char, buf, GVT_TEMP_STR_LEN)
__field(unsigned int, reg)
__field(unsigned int, old_val)
@@ -344,15 +345,17 @@
),
TP_fast_assign(
- __entry->id = id;
+ __entry->old_id = old_id;
+ __entry->new_id = new_id;
snprintf(__entry->buf, GVT_TEMP_STR_LEN, "%s", action);
__entry->reg = reg;
__entry->old_val = old_val;
__entry->new_val = new_val;
),
- TP_printk("VM%u %s reg %x, old %08x new %08x\n",
- __entry->id, __entry->buf, __entry->reg,
+ TP_printk("VM%u -> VM%u %s reg %x, old %08x new %08x\n",
+ __entry->old_id, __entry->new_id,
+ __entry->buf, __entry->reg,
__entry->old_val, __entry->new_val)
);
--
1.9.1
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx
^ permalink raw reply related [flat|nested] 6+ messages in thread* [PATCH 2/3] drm/i915/gvt: refine mocs save restore policy
2017-12-07 4:34 [PATCH 0/3] mmio save restore refine in vgpu switch Weinan Li
2017-12-07 4:34 ` [PATCH 1/3] drm/i915/gvt: optimize for vGPU mmio switch Weinan Li
@ 2017-12-07 4:34 ` Weinan Li
2017-12-07 4:34 ` [PATCH 3/3] drm/i915/gvt: load host render mocs once in mocs switch Weinan Li
` (2 subsequent siblings)
4 siblings, 0 replies; 6+ messages in thread
From: Weinan Li @ 2017-12-07 4:34 UTC (permalink / raw)
To: intel-gfx
Saving and restoring the mocs registers of a VM in GVT-g burns too much CPU
time. Add an LRI command scan to monitor changes to the mocs registers,
save the state in vreg, and use a delta update policy to restore them.
This noticeably reduces the MMIO r/w count and improves the performance
of context switches.
Signed-off-by: Weinan Li <weinan.z.li@intel.com>
---
drivers/gpu/drm/i915/gvt/cmd_parser.c | 19 +++++++++++++++++++
drivers/gpu/drm/i915/gvt/render.c | 33 ++++++++++++++++++---------------
2 files changed, 37 insertions(+), 15 deletions(-)
diff --git a/drivers/gpu/drm/i915/gvt/cmd_parser.c b/drivers/gpu/drm/i915/gvt/cmd_parser.c
index 18c4573..be5c519b 100644
--- a/drivers/gpu/drm/i915/gvt/cmd_parser.c
+++ b/drivers/gpu/drm/i915/gvt/cmd_parser.c
@@ -825,6 +825,21 @@ static int force_nonpriv_reg_handler(struct parser_exec_state *s,
return 0;
}
+static inline bool is_mocs_mmio(unsigned int offset)
+{
+ return ((offset >= 0xc800) && (offset <= 0xcff8)) ||
+ ((offset >= 0xb020) && (offset <= 0xb0a0));
+}
+
+static int mocs_cmd_reg_handler(struct parser_exec_state *s,
+ unsigned int offset, unsigned int index)
+{
+ if (!is_mocs_mmio(offset))
+ return -EINVAL;
+ vgpu_vreg(s->vgpu, offset) = cmd_val(s, index + 1);
+ return 0;
+}
+
static int cmd_reg_handler(struct parser_exec_state *s,
unsigned int offset, unsigned int index, char *cmd)
{
@@ -848,6 +863,10 @@ static int cmd_reg_handler(struct parser_exec_state *s,
return 0;
}
+ if (is_mocs_mmio(offset) &&
+ mocs_cmd_reg_handler(s, offset, index))
+ return -EINVAL;
+
if (is_force_nonpriv_mmio(offset) &&
force_nonpriv_reg_handler(s, offset, index))
return -EPERM;
diff --git a/drivers/gpu/drm/i915/gvt/render.c b/drivers/gpu/drm/i915/gvt/render.c
index ec1e60d..724f10d 100644
--- a/drivers/gpu/drm/i915/gvt/render.c
+++ b/drivers/gpu/drm/i915/gvt/render.c
@@ -195,6 +195,8 @@ static void switch_mocs(struct intel_vgpu *pre, struct intel_vgpu *next,
{
struct drm_i915_private *dev_priv;
i915_reg_t offset, l3_offset;
+ u32 old_v, new_v;
+
u32 regs[] = {
[RCS] = 0xc800,
[VCS] = 0xc900,
@@ -212,16 +214,17 @@ static void switch_mocs(struct intel_vgpu *pre, struct intel_vgpu *next,
for (i = 0; i < 64; i++) {
if (pre)
- vgpu_vreg(pre, offset) =
- I915_READ_FW(offset);
+ old_v = vgpu_vreg(pre, offset);
else
- gen9_render_mocs[ring_id][i] =
- I915_READ_FW(offset);
-
+ old_v = gen9_render_mocs[ring_id][i]
+ = I915_READ_FW(offset);
if (next)
- I915_WRITE_FW(offset, vgpu_vreg(next, offset));
+ new_v = vgpu_vreg(next, offset);
else
- I915_WRITE_FW(offset, gen9_render_mocs[ring_id][i]);
+ new_v = gen9_render_mocs[ring_id][i];
+
+ if (old_v != new_v)
+ I915_WRITE_FW(offset, new_v);
offset.reg += 4;
}
@@ -230,17 +233,17 @@ static void switch_mocs(struct intel_vgpu *pre, struct intel_vgpu *next,
l3_offset.reg = 0xb020;
for (i = 0; i < 32; i++) {
if (pre)
- vgpu_vreg(pre, l3_offset) =
- I915_READ_FW(l3_offset);
+ old_v = vgpu_vreg(pre, l3_offset);
else
- gen9_render_mocs_L3[i] =
- I915_READ_FW(l3_offset);
+ old_v = gen9_render_mocs_L3[i]
+ = I915_READ_FW(offset);
if (next)
- I915_WRITE_FW(l3_offset,
- vgpu_vreg(next, l3_offset));
+ new_v = vgpu_vreg(next, l3_offset);
else
- I915_WRITE_FW(l3_offset,
- gen9_render_mocs_L3[i]);
+ new_v = gen9_render_mocs_L3[i];
+
+ if (old_v != new_v)
+ I915_WRITE_FW(l3_offset, new_v);
l3_offset.reg += 4;
}
--
1.9.1
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx
^ permalink raw reply related [flat|nested] 6+ messages in thread* [PATCH 3/3] drm/i915/gvt: load host render mocs once in mocs switch
2017-12-07 4:34 [PATCH 0/3] mmio save restore refine in vgpu switch Weinan Li
2017-12-07 4:34 ` [PATCH 1/3] drm/i915/gvt: optimize for vGPU mmio switch Weinan Li
2017-12-07 4:34 ` [PATCH 2/3] drm/i915/gvt: refine mocs save restore policy Weinan Li
@ 2017-12-07 4:34 ` Weinan Li
2017-12-07 4:49 ` ✗ Fi.CI.BAT: failure for mmio save restore refine in vgpu switch Patchwork
2017-12-11 10:56 ` [PATCH 0/3] " Joonas Lahtinen
4 siblings, 0 replies; 6+ messages in thread
From: Weinan Li @ 2017-12-07 4:34 UTC (permalink / raw)
To: intel-gfx
Load the host render mocs registers once for the delta update in the mocs
switch. This noticeably reduces the number of mmio reads and improves
performance when switching between multiple VMs.
Signed-off-by: Weinan Li <weinan.z.li@intel.com>
---
drivers/gpu/drm/i915/gvt/render.c | 51 ++++++++++++++++++++++++++++++++-------
1 file changed, 42 insertions(+), 9 deletions(-)
diff --git a/drivers/gpu/drm/i915/gvt/render.c b/drivers/gpu/drm/i915/gvt/render.c
index 724f10d..13c3f01 100644
--- a/drivers/gpu/drm/i915/gvt/render.c
+++ b/drivers/gpu/drm/i915/gvt/render.c
@@ -141,8 +141,41 @@ struct render_mmio {
{RCS, _MMIO(0x20e4), 0xffff, false},
};
-static u32 gen9_render_mocs[I915_NUM_ENGINES][64];
-static u32 gen9_render_mocs_L3[32];
+static struct {
+ bool initialized;
+ u32 control_table[I915_NUM_ENGINES][64];
+ u32 l3cc_table[32];
+} gen9_render_mocs;
+
+static int load_render_mocs(struct drm_i915_private *dev_priv)
+{
+ i915_reg_t offset;
+ u32 regs[] = {
+ [RCS] = 0xc800,
+ [VCS] = 0xc900,
+ [VCS2] = 0xca00,
+ [BCS] = 0xcc00,
+ [VECS] = 0xcb00,
+ };
+ int ring_id, i;
+
+ for (ring_id = 0; ring_id < I915_NUM_ENGINES; ring_id++) {
+ offset.reg = regs[ring_id];
+ for (i = 0; i < 64; i++) {
+ gen9_render_mocs.control_table[ring_id][i] =
+ I915_READ_FW(offset);
+ offset.reg += 4;
+ }
+ }
+
+ offset.reg = 0xb020;
+ for (i = 0; i < 32; i++) {
+ gen9_render_mocs.l3cc_table[i] =
+ I915_READ_FW(offset);
+ offset.reg += 4;
+ }
+ gen9_render_mocs.initialized = true;
+}
static void handle_tlb_pending_event(struct intel_vgpu *vgpu, int ring_id)
{
@@ -210,18 +243,19 @@ static void switch_mocs(struct intel_vgpu *pre, struct intel_vgpu *next,
if (WARN_ON(ring_id >= ARRAY_SIZE(regs)))
return;
- offset.reg = regs[ring_id];
+ if (!pre && !gen9_render_mocs.initialized)
+ load_render_mocs(dev_priv);
+ offset.reg = regs[ring_id];
for (i = 0; i < 64; i++) {
if (pre)
old_v = vgpu_vreg(pre, offset);
else
- old_v = gen9_render_mocs[ring_id][i]
- = I915_READ_FW(offset);
+ old_v = gen9_render_mocs.control_table[ring_id][i];
if (next)
new_v = vgpu_vreg(next, offset);
else
- new_v = gen9_render_mocs[ring_id][i];
+ new_v = gen9_render_mocs.control_table[ring_id][i];
if (old_v != new_v)
I915_WRITE_FW(offset, new_v);
@@ -235,12 +269,11 @@ static void switch_mocs(struct intel_vgpu *pre, struct intel_vgpu *next,
if (pre)
old_v = vgpu_vreg(pre, l3_offset);
else
- old_v = gen9_render_mocs_L3[i]
- = I915_READ_FW(offset);
+ old_v = gen9_render_mocs.l3cc_table[i];
if (next)
new_v = vgpu_vreg(next, l3_offset);
else
- new_v = gen9_render_mocs_L3[i];
+ new_v = gen9_render_mocs.l3cc_table[i];
if (old_v != new_v)
I915_WRITE_FW(l3_offset, new_v);
--
1.9.1
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx
^ permalink raw reply related [flat|nested] 6+ messages in thread