kvm.vger.kernel.org archive mirror
* Enhancement for PLE handler in KVM
       [not found] ` <530B9637.6030708@alcatel-lucent.com>
@ 2014-03-03 18:24   ` Li, Bin (Bin)
  2014-03-03 19:20     ` Paolo Bonzini
  0 siblings, 1 reply; 13+ messages in thread
From: Li, Bin (Bin) @ 2014-03-03 18:24 UTC (permalink / raw)
  To: kvm
  Cc: Neel Jatania, linux-kernel, Avi Kiviti, Srivatsa Vaddagiri,
	Peter Zijlstra, Mike Galbraith, Chris Wright, ttracy,
	Nakajima, Jun, riel

[-- Attachment #1: Type: text/plain, Size: 5711 bytes --]

Hello, all.

The PLE handler attempts to determine an alternate vCPU to schedule.  In 
some cases the wrong vCPU is scheduled and performance suffers.

This patch allows for the guest OS to signal, using a hypercall, that 
it's starting/ending a critical section.  Using this information in the 
PLE handler allows for a more intelligent VCPU scheduling determination 
to be made.  The patch only changes the PLE behaviour if this new 
hypercall mechanism is used; if it isn't used, then the existing PLE 
algorithm continues to be used to determine the next vCPU.
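
For illustration only (a hypothetical guest-side sketch, not part of the
patch below), marking the critical section from the guest would look
roughly like this.  kvm_hypercall0() stands in for the usual wrapper
(Linux has an equivalent in asm/kvm_para.h); "vmcall" is the Intel
encoding, AMD would need "vmmcall":

    /* hypothetical guest-side helpers, not part of this patch */
    static inline long kvm_hypercall0(unsigned int nr)
    {
            long ret;

            /* KVM hypercall ABI: number in RAX, return value in RAX */
            asm volatile("vmcall" : "=a"(ret) : "a"(nr) : "memory");
            return ret;
    }

    static void guest_kernel_enter(void)
    {
            /* call right after the guest acquires its big kernel lock */
            kvm_hypercall0(KVM_HC_LOCK_GET);
    }

    static void guest_kernel_exit(void)
    {
            /* call right before the guest releases its big kernel lock */
            kvm_hypercall0(KVM_HC_LOCK_RELEASE);
    }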

Benefits of the patch:
  - Guest OS real-time performance is significantly improved when the 
hypercall is used to mark entering and leaving the guest OS kernel state.
  - The guest OS system clock jitter measured on an Intel E5-2620 is 
reduced from 400ms down to 6ms.
  - The guest OS system clock is configured for a 2ms clock interrupt. 
The jitter is measured as the difference between the rdtsc() value read 
in the clock interrupt handler and the expected TSC value.
  - Details of the test report are attached for reference.

Patch details:

From 77edfa193a4e29ab357ec3b1e097f8469d418507 Mon Sep 17 00:00:00 2001
From: Bin BL LI <bin.bl.li@alcatel-lucent.com>
Date: Mon, 3 Mar 2014 11:23:35 -0500
Subject: [PATCH] Initial commit

---
 arch/x86/kvm/x86.c            |    7 +++++++
 include/linux/kvm_host.h      |   16 ++++++++++++++++
 include/uapi/linux/kvm_para.h |    2 ++
 virt/kvm/kvm_main.c           |   14 +++++++++++++-
 4 files changed, 38 insertions(+), 1 deletions(-)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 39c28f0..e735de3 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -5582,6 +5582,7 @@ void kvm_arch_exit(void)
 int kvm_emulate_halt(struct kvm_vcpu *vcpu)
 {
     ++vcpu->stat.halt_exits;
+    kvm_vcpu_set_holding_lock(vcpu,false);
     if (irqchip_in_kernel(vcpu->kvm)) {
         vcpu->arch.mp_state = KVM_MP_STATE_HALTED;
         return 1;
@@ -5708,6 +5709,12 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
         kvm_pv_kick_cpu_op(vcpu->kvm, a0, a1);
         ret = 0;
         break;
+    case KVM_HC_LOCK_GET:
+        kvm_vcpu_set_holding_lock(vcpu,true);
+        break;
+    case KVM_HC_LOCK_RELEASE:
+        kvm_vcpu_set_holding_lock(vcpu,false);
+        break;
     default:
         ret = -KVM_ENOSYS;
         break;
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index b8e9a43..f24892e 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -266,6 +266,7 @@ struct kvm_vcpu {
         bool in_spin_loop;
         bool dy_eligible;
     } spin_loop;
+    bool holding_lock;
 #endif
     bool preempted;
     struct kvm_vcpu_arch arch;
@@ -403,6 +404,10 @@ struct kvm {
 #endif
     long tlbs_dirty;
     struct list_head devices;
+
+#ifdef CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT
+    bool using_lock_flag;
+#endif
 };

 #define kvm_err(fmt, ...) \
@@ -1076,6 +1081,13 @@ static inline void kvm_vcpu_set_dy_eligible(struct kvm_vcpu *vcpu, bool val)
     vcpu->spin_loop.dy_eligible = val;
 }

+static inline void kvm_vcpu_set_holding_lock(struct kvm_vcpu *vcpu, bool val)
+{
+    if ( ! vcpu->kvm->using_lock_flag )
+        vcpu->kvm->using_lock_flag = true;
+    vcpu->holding_lock = val;
+}
+
 #else /* !CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT */

 static inline void kvm_vcpu_set_in_spin_loop(struct kvm_vcpu *vcpu, bool val)
@@ -1085,6 +1097,10 @@ static inline void kvm_vcpu_set_in_spin_loop(struct kvm_vcpu *vcpu, bool val)
 static inline void kvm_vcpu_set_dy_eligible(struct kvm_vcpu *vcpu, bool val)
 {
 }
+
+static inline void kvm_vcpu_set_holding_lock(struct kvm_vcpu *vcpu, bool val)
+{
+}
 #endif /* CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT */
 #endif

diff --git a/include/uapi/linux/kvm_para.h b/include/uapi/linux/kvm_para.h
index 2841f86..2c563a1 100644
--- a/include/uapi/linux/kvm_para.h
+++ b/include/uapi/linux/kvm_para.h
@@ -20,6 +20,8 @@
 #define KVM_HC_FEATURES            3
 #define KVM_HC_PPC_MAP_MAGIC_PAGE    4
 #define KVM_HC_KICK_CPU            5
+#define KVM_HC_LOCK_GET            6
+#define KVM_HC_LOCK_RELEASE        7

 /*
  * hypercalls use architecture specific
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 03a0381..c3a5046 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -232,6 +232,7 @@ int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)

     kvm_vcpu_set_in_spin_loop(vcpu, false);
     kvm_vcpu_set_dy_eligible(vcpu, false);
+    kvm_vcpu_set_holding_lock(vcpu, false);
     vcpu->preempted = false;

     r = kvm_arch_vcpu_init(vcpu);
@@ -502,6 +503,10 @@ static struct kvm *kvm_create_vm(unsigned long type)
     list_add(&kvm->vm_list, &vm_list);
     spin_unlock(&kvm_lock);

+#ifdef CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT
+    kvm->using_lock_flag = false;
+#endif
+
     return kvm;

 out_err:
@@ -1762,9 +1767,16 @@ static bool kvm_vcpu_eligible_for_directed_yield(struct kvm_vcpu *vcpu)
 #ifdef CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT
     bool eligible;

-    eligible = !vcpu->spin_loop.in_spin_loop ||
+    if ( ! vcpu->kvm->using_lock_flag )
+    {
+        eligible = !vcpu->spin_loop.in_spin_loop ||
             (vcpu->spin_loop.in_spin_loop &&
              vcpu->spin_loop.dy_eligible);
+    }
+    else
+    {
+        eligible = vcpu->holding_lock; /* if holding any lock, yield to it */
+    }

     if (vcpu->spin_loop.in_spin_loop)
         kvm_vcpu_set_dy_eligible(vcpu, !vcpu->spin_loop.dy_eligible);
-- 
1.7.1




Regards
Bin


[-- Attachment #2: tst.log --]
[-- Type: text/plain, Size: 12660 bytes --]


System clock jitter measurement result.
Using an 8-vCPU SMP guest OS with the hypercall marking entering/leaving
the guest OS kernel state, plus the KVM patch to boost the lock holder vCPU.
The max jitter at run time is 6ms.
The two ~200ms jitters below are from the guest OS system initialization
stage, not from run-time clock jitter.
======================================================================

11-> sysClkIntShow
OSclock:0x80f8760
-- intrlen:            6804074 [    1999  us]
-- tickLen:           34020374 [    9999  us]
-- tickWin:            5103056 [    1499  us]
-- syncOff:                  0 [       0  us]
-- timeerr:          -17387492 [  -5.111  ms] [0.0004 %] [sync:  -0.071 ms]
-- intrerr: 0xffffffff99f36b1a [  -0.503 sec] [0.0404 %]
+---------+--------------------+--------------------+-------------------------+
|   ts    |        init        |        last        |         time [ms]       |
+---------+--------------------+--------------------+-------------------------+
| cpu_tsc | 0x000000031e754598 | 0x000003de99c1c6f8 |             1246665.096 |
| vxticks | 0x00000000ffff1613 | 0x000000000000fd0d |             1246659.985 |
+---------+--------------------+--------------------+-------------------------+
+---------+--------------------+--------------------+-------------------------+
| counter |        count       |      time [ms]     |  delta  [ms]      [%]   |
+---------+--------------------+--------------------+---------------+---------+
| cpu_tsc |      4241201332576 |        1246665.096 |        +0.000 | +0.0000 |
| vxticks |             124666 |        1246659.985 |        -5.111 | -0.0004 |
| clk_obj |             124666 |        1246659.985 |        +5.040 | +0.0004 |
| clkintr |             623081 |        1246161.839 |      -503.257 | -0.0404 |
+---------+--------------------+--------------------+---------------+---------+

OSclock [0x80f8760] interrupt-source histogram
-- clk freq: 500 Hz
-- clk intr: 623082
+------+---------------------+----------------------------+
| pos# |    Interval [ms]    |     clk ticks          %   |
+------+---------------------+-------------------+--------+
|    0 |   0.000 ..    0.308 |                40 |   0.01 |
|    1 |   0.308 ..    0.616 |                16 |   0.00 |
|    2 |   0.616 ..    0.925 |                22 |   0.00 |
|    3 |   0.925 ..    1.233 |                14 |   0.00 |
|    4 |   1.233 ..    1.541 |                16 |   0.00 |
|    5 |   1.541 ..    1.849 |                51 |   0.01 |
|    6 |   1.849 ..    2.158 |            622769 |  99.95 |
|    7 |   2.158 ..    2.466 |                43 |   0.01 |
|    8 |   2.466 ..    2.774 |                13 |   0.00 |
|    9 |   2.774 ..    3.082 |                13 |   0.00 |
|   10 |   3.082 ..    3.390 |                17 |   0.00 |
|   11 |   3.390 ..    3.699 |                11 |   0.00 |
|   12 |   3.699 ..    4.007 |                 4 |   0.00 |
|   13 |   4.007 ..    4.315 |                 5 |   0.00 |
|   14 |   4.315 ..    4.623 |                 3 |   0.00 |
|   15 |   4.623 ..    4.932 |                 2 |   0.00 |
|   16 |   4.932 ..    5.240 |                 1 |   0.00 |
|   17 |   5.240 ..    5.548 |                 5 |   0.00 |
|   18 |   5.548 ..    5.856 |                26 |   0.00 |
|   19 |   5.856 ..    6.164 |                 8 |   0.00 |
|   20 |   6.164 ..    6.473 |                 2 |   0.00 |
|  663 | 204.350 ..  204.658 |                 1 |   0.00 |
|  668 | 205.891 ..  206.199 |                 1 |   0.00 |
+------+---------------------+-------------------+--------+
| ---- |                     |            623083 | 100.00 |
+------+---------------------+-------------------+--------+




System clock jitter measurement result.
Using an 8-vCPU SMP guest OS without the hypercall, with the vanilla KVM PLE handler.
The run-time system clock jitter in the guest OS could be bigger than 400ms.
======================================================================


11-> sysClkIntShow
OSclock:0x8055760
-- intrlen:            6804067 [    1999  us]
-- tickLen:           34020337 [   10000  us]
-- tickWin:            5103050 [    1499  us]
-- syncOff:                  0 [       0  us]
-- timeerr:          -22314665 [  -6.559  ms] [0.0009 %] [sync:  -0.032 ms]
-- intrerr: 0xfffffffaca02d1f3 [  -6.579 sec] [0.9233 %]
+---------+--------------------+--------------------+-------------------------+
|   ts    |        init        |        last        |         time [ms]       |
+---------+--------------------+--------------------+-------------------------+
| cpu_tsc | 0x00000003359de5e8 | 0x000002379389f1d2 |              712496.568 |
| vxticks | 0x00000000ffff160f | 0x0000000000002c60 |              712490.008 |
+---------+--------------------+--------------------+-------------------------+
+---------+--------------------+--------------------+-------------------------+
| counter |        count       |      time [ms]     |  delta  [ms]      [%]   |
+---------+--------------------+--------------------+---------------+---------+
| cpu_tsc |      2423937305578 |         712496.568 |        +0.000 | +0.0000 |
| vxticks |              71249 |         712490.008 |        -6.559 | -0.0009 |
| clk_obj |              71249 |         712490.008 |        +6.527 | +0.0009 |
| clkintr |             352959 |         705917.967 |     -6578.601 | -0.9233 |
+---------+--------------------+--------------------+---------------+---------+

OSclock [0x8055760] interrupt-source histogram
-- clk freq: 500 Hz
-- clk intr: 352959
+------+---------------------+----------------------------+
| pos# |    Interval [ms]    |     clk ticks          %   |
+------+---------------------+-------------------+--------+
|    0 |   0.000 ..    0.308 |               270 |   0.08 |
|    1 |   0.308 ..    0.616 |               160 |   0.05 |
|    2 |   0.616 ..    0.925 |               165 |   0.05 |
|    3 |   0.925 ..    1.233 |               200 |   0.06 |
|    4 |   1.233 ..    1.541 |               182 |   0.05 |
|    5 |   1.541 ..    1.849 |               591 |   0.17 |
|    6 |   1.849 ..    2.158 |            349872 |  99.13 |
|    7 |   2.158 ..    2.466 |               530 |   0.15 |
|    8 |   2.466 ..    2.774 |               151 |   0.04 |
|    9 |   2.774 ..    3.082 |               123 |   0.03 |
|   10 |   3.082 ..    3.390 |                87 |   0.02 |
|   11 |   3.390 ..    3.699 |                65 |   0.02 |
|   12 |   3.699 ..    4.007 |                53 |   0.02 |
|   13 |   4.007 ..    4.315 |                38 |   0.01 |
|   14 |   4.315 ..    4.623 |                27 |   0.01 |
|   15 |   4.623 ..    4.932 |                34 |   0.01 |
|   16 |   4.932 ..    5.240 |                44 |   0.01 |
|   17 |   5.240 ..    5.548 |                24 |   0.01 |
|   18 |   5.548 ..    5.856 |                37 |   0.01 |
|   19 |   5.856 ..    6.164 |                32 |   0.01 |
|   20 |   6.164 ..    6.473 |                22 |   0.01 |
|   21 |   6.473 ..    6.781 |                30 |   0.01 |
|   22 |   6.781 ..    7.089 |                21 |   0.01 |
|   23 |   7.089 ..    7.397 |                12 |   0.00 |
|   24 |   7.397 ..    7.706 |                17 |   0.00 |
|   25 |   7.706 ..    8.014 |                13 |   0.00 |
|   26 |   8.014 ..    8.322 |                 3 |   0.00 |
|   27 |   8.322 ..    8.630 |                 9 |   0.00 |
|   28 |   8.630 ..    8.938 |                 7 |   0.00 |
|   29 |   8.938 ..    9.247 |                 7 |   0.00 |
|   30 |   9.247 ..    9.555 |                 3 |   0.00 |
|   31 |   9.555 ..    9.863 |                 2 |   0.00 |
|   32 |   9.863 ..   10.171 |                 8 |   0.00 |
|   33 |  10.171 ..   10.479 |                 6 |   0.00 |
|   34 |  10.479 ..   10.788 |                 1 |   0.00 |
|   35 |  10.788 ..   11.096 |                 3 |   0.00 |
|   36 |  11.096 ..   11.404 |                 6 |   0.00 |
|   37 |  11.404 ..   11.712 |                 1 |   0.00 |
|   38 |  11.712 ..   12.021 |                 1 |   0.00 |
|   39 |  12.021 ..   12.329 |                 2 |   0.00 |
|   40 |  12.329 ..   12.637 |                 2 |   0.00 |
|   41 |  12.637 ..   12.945 |                 3 |   0.00 |
|   42 |  12.945 ..   13.253 |                 4 |   0.00 |
|   44 |  13.562 ..   13.870 |                 2 |   0.00 |
|   45 |  13.870 ..   14.178 |                 3 |   0.00 |
|   46 |  14.178 ..   14.486 |                 2 |   0.00 |
|   47 |  14.486 ..   14.795 |                 5 |   0.00 |
|   48 |  14.795 ..   15.103 |                 3 |   0.00 |
|   49 |  15.103 ..   15.411 |                 1 |   0.00 |
|   50 |  15.411 ..   15.719 |                 2 |   0.00 |
|   51 |  15.719 ..   16.027 |                 3 |   0.00 |
|   53 |  16.336 ..   16.644 |                 2 |   0.00 |
|   54 |  16.644 ..   16.952 |                 2 |   0.00 |
|   56 |  17.260 ..   17.569 |                 2 |   0.00 |
|   57 |  17.569 ..   17.877 |                 1 |   0.00 |
|   58 |  17.877 ..   18.185 |                 1 |   0.00 |
|   60 |  18.493 ..   18.801 |                 1 |   0.00 |
|   62 |  19.110 ..   19.418 |                 1 |   0.00 |
|   64 |  19.726 ..   20.034 |                 1 |   0.00 |
|   65 |  20.034 ..   20.343 |                 1 |   0.00 |
|   66 |  20.343 ..   20.651 |                 1 |   0.00 |
|   67 |  20.651 ..   20.959 |                 1 |   0.00 |
|   71 |  21.884 ..   22.192 |                 2 |   0.00 |
|   75 |  23.117 ..   23.425 |                 1 |   0.00 |
|   76 |  23.425 ..   23.733 |                 1 |   0.00 |
|   81 |  24.966 ..   25.274 |                 2 |   0.00 |
|   82 |  25.274 ..   25.582 |                 1 |   0.00 |
|   83 |  25.582 ..   25.891 |                 3 |   0.00 |
|   85 |  26.199 ..   26.507 |                 1 |   0.00 |
|   87 |  26.815 ..   27.123 |                 1 |   0.00 |
|   90 |  27.740 ..   28.048 |                 1 |   0.00 |
|   91 |  28.048 ..   28.356 |                 2 |   0.00 |
|   99 |  30.514 ..   30.822 |                 1 |   0.00 |
|  101 |  31.130 ..   31.438 |                 1 |   0.00 |
|  107 |  32.980 ..   33.288 |                 1 |   0.00 |
|  111 |  34.212 ..   34.521 |                 1 |   0.00 |
|  119 |  36.678 ..   36.986 |                 1 |   0.00 |
|  120 |  36.986 ..   37.295 |                 1 |   0.00 |
|  122 |  37.603 ..   37.911 |                 1 |   0.00 |
|  128 |  39.452 ..   39.760 |                 1 |   0.00 |
|  129 |  39.760 ..   40.069 |                 1 |   0.00 |
|  130 |  40.069 ..   40.377 |                 1 |   0.00 |
|  137 |  42.226 ..   42.534 |                 1 |   0.00 |
|  138 |  42.534 ..   42.843 |                 1 |   0.00 |
|  140 |  43.151 ..   43.459 |                 1 |   0.00 |
|  144 |  44.384 ..   44.692 |                 1 |   0.00 |
|  161 |  49.623 ..   49.932 |                 1 |   0.00 |
|  164 |  50.548 ..   50.856 |                 1 |   0.00 |
|  182 |  56.096 ..   56.404 |                 1 |   0.00 |
|  189 |  58.254 ..   58.562 |                 1 |   0.00 |
|  192 |  59.178 ..   59.487 |                 1 |   0.00 |
|  214 |  65.959 ..   66.267 |                 1 |   0.00 |
|  230 |  70.891 ..   71.199 |                 1 |   0.00 |
|  258 |  79.521 ..   79.829 |                 1 |   0.00 |
|  279 |  85.993 ..   86.302 |                 1 |   0.00 |
|  300 |  92.466 ..   92.774 |                 1 |   0.00 |
|  309 |  95.240 ..   95.548 |                 1 |   0.00 |
|  339 | 104.487 ..  104.795 |                 1 |   0.00 |
|  411 | 126.679 ..  126.987 |                 1 |   0.00 |
|  434 | 133.768 ..  134.076 |                 1 |   0.00 |
|  463 | 142.706 ..  143.014 |                 1 |   0.00 |
|  532 | 163.973 ..  164.281 |                 1 |   0.00 |
|  537 | 165.514 ..  165.823 |                 1 |   0.00 |
|  538 | 165.823 ..  166.131 |                 1 |   0.00 |
|  631 | 194.487 ..  194.795 |                 1 |   0.00 |
|  634 | 195.412 ..  195.720 |                 1 |   0.00 |
|  707 | 217.912 ..  218.220 |                 1 |   0.00 |
|  728 | 224.384 ..  224.693 |                 1 |   0.00 |
|  735 | 226.542 ..  226.850 |                 1 |   0.00 |
|  772 | 237.946 ..  238.254 |                 1 |   0.00 |
|  924 | 284.796 ..  285.104 |                 1 |   0.00 |
| 1346 | 414.865 ..  415.173 |                 1 |   0.00 |
| 1360 | 419.180 ..  419.488 |                 1 |   0.00 |
+------+---------------------+-------------------+--------+
| ---- |                     |            352959 | 100.00 |
+------+---------------------+-------------------+--------+

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: Enhancement for PLE handler in KVM
  2014-03-03 18:24   ` Enhancement for PLE handler in KVM Li, Bin (Bin)
@ 2014-03-03 19:20     ` Paolo Bonzini
  2014-03-05 14:17       ` Li, Bin (Bin)
  0 siblings, 1 reply; 13+ messages in thread
From: Paolo Bonzini @ 2014-03-03 19:20 UTC (permalink / raw)
  To: Li, Bin (Bin), kvm
  Cc: Neel Jatania, linux-kernel, Avi Kiviti, Srivatsa Vaddagiri,
	Peter Zijlstra, Mike Galbraith, Chris Wright, ttracy,
	Nakajima, Jun, riel

On 03/03/2014 19:24, Li, Bin (Bin) wrote:
> Hello, all.
>
> The PLE handler attempts to determine an alternate vCPU to schedule.  In
> some cases the wrong vCPU is scheduled and performance suffers.
>
> This patch allows for the guest OS to signal, using a hypercall, that
> it's starting/ending a critical section.  Using this information in the
> PLE handler allows for a more intelligent VCPU scheduling determination
> to be made.  The patch only changes the PLE behaviour if this new
> hypercall mechanism is used; if it isn't used, then the existing PLE
> algorithm continues to be used to determine the next vCPU.
>
> Benefit from the patch:
>  -  the guest OS real time performance being significantly improved when
> using hyper call marking entering and leaving guest OS kernel state.
>  - The guest OS system clock jitter measured on on Intel E5 2620 reduced
> from 400ms down to 6ms.
>  - The guest OS system lock is set to a 2ms clock interrupt. The jitter
> is measured by the difference between dtsc() value in clock interrupt
> handler and the expectation of tsc value.
>  - detail of test report is attached as reference.

This patch doesn't include the corresponding guest changes, so it's not 
clear how you would use it and what the overhead would be: a hypercall 
is ~30 times slower than an uncontended spin_lock or spin_unlock.

In fact, performance numbers for common workloads are useful too.

Have you looked at the recent "paravirtual ticketlock"?  It does roughly 
the opposite of this patch: the guest can signal when it's been spinning 
too much, and the host will schedule it out (which hopefully accelerates 
the end of the critical section).

Paolo



> Path details:
>
> From 77edfa193a4e29ab357ec3b1e097f8469d418507 Mon Sep 17 00:00:00 2001

^ permalink raw reply	[flat|nested] 13+ messages in thread

* RE: Enhancement for PLE handler in KVM
  2014-03-03 19:20     ` Paolo Bonzini
@ 2014-03-05 14:17       ` Li, Bin (Bin)
  2014-03-05 14:49         ` Paolo Bonzini
  0 siblings, 1 reply; 13+ messages in thread
From: Li, Bin (Bin) @ 2014-03-05 14:17 UTC (permalink / raw)
  To: Paolo Bonzini, kvm@vger.kernel.org
  Cc: Jatania, Neel (Neel), linux-kernel@vger.kernel.org, Avi Kiviti,
	Srivatsa Vaddagiri, Peter Zijlstra, Mike Galbraith, Chris Wright,
	ttracy@redhat.com, Nakajima, Jun, riel@redhat.com

Hello, Paolo, 

We are using a customized embedded SMP OS as the guest OS. It is not meaningful to post the guest OS code.
Also, there are no "performance numbers for common workloads", since there is no common workload to compare against.
In our OS, there is still a big kernel lock protecting the kernel.

What we have observed from the trace log (collected via trace-cmd):
  - When 2+ vCPUs from the same VM are stacked on one pCPU,
  - and one of those vCPUs happens to be the lock holder while the other vCPU is in a spin loop trying to get the kernel lock,
  - the spinning vCPU can still be boosted incorrectly by the vanilla PLE handler (the current PLE handler can yield on the current PLE VM exit, but the vCPU in the spin loop becomes eligible to be "yielded to" on the next PLE VM exit).
  - When the incorrect boosting happens, the spinning vCPU runs longer on the pCPU and leaves the lock holder vCPU less time to run, since they share the same pCPU.
  - When the lock holder vCPU gets too little time on the pCPU, we observe that the clock interrupt issued to the lock holder vCPU gets coalesced. This is the root cause of the system clock jitter in the guest OS.

When we applied the hypercall in the SMP guest OS and used KVM to boost the lock holder only, we observed the following in the trace-cmd log:
  - When 2+ vCPUs (n and m) from the same VM are stacked on pCPU a,
  - and one of those vCPUs happens to be the lock holder while the other vCPU is in a spin loop trying to get the kernel lock,
  - we observed two types of scheduling events.
  - Type A:
     vCPU n is the lock holder but is switched out. vCPU m is switched onto the same pCPU a and spins trying to get into kernel state.
     After about 1ms, the lock holder vCPU n is scheduled onto another pCPU b and starts to run. The LHP (lock holder preemption) is then resolved.

  - Type B:
     vCPU n holds the lock but is switched out. vCPU m is switched onto the same pCPU a and spins to get into kernel state.
     After about 0.4ms (the biggest number is 2ms in the log I captured while running the system test case), vCPU n is switched back onto pCPU a.
     vCPU n then finishes its kernel work and releases the kernel lock. The other vCPUs get the lock and the system is happy afterwards.

   Adding a hypercall on every kernel enter and kernel exit is expensive. 
   From the trace log collected on an i7 running @ 3.0GHz, the cost per hypercall is <1us. Since my measurement only gives microsecond-level results, I consider it to be 1us. Compared with the jitter caused by lock holder preemption, I think this cost is acceptable. 
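
   As a sanity check on that number (this is not how the trace-cmd figures
   above were obtained), one could simply read the TSC around a single
   hypercall. A rough sketch; rdtsc(), kvm_hypercall0() and tsc_khz are
   hypothetical guest-side helpers/variables, and no serialization is
   attempted, so the result is only approximate:

       static inline unsigned long long rdtsc(void)
       {
               unsigned int lo, hi;

               asm volatile("rdtsc" : "=a"(lo), "=d"(hi));
               return ((unsigned long long)hi << 32) | lo;
       }

       unsigned long long t0 = rdtsc();
       kvm_hypercall0(KVM_HC_LOCK_GET);     /* one hypercall round trip */
       unsigned long long t1 = rdtsc();
       /* (t1 - t0) / tsc_khz gives the cost in milliseconds */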
   
   Most importantly, the guest OS real-time performance becomes stable and predictable.

   In the end, we can give the guest OS an option if it really cares about real-time performance. It is up to the guest OS to decide whether to use it or not. And there is also a challenge in the guest OS to mark the lock correctly and accurately.

Regarding the "paravirtual ticketlock", we did try the same idea in our embedded guest OS.
We got the following results:

a) We implemented an approach similar to the Linux "paravirtual ticketlock". The system clock jitter does get reduced a lot, but big jitter still happens at a lower rate. In a few hours of system stress testing, we still saw the big jitter a few times.

b) When using the "paravirtual ticketlock", the threshold for deciding "are we spinning too much" becomes an important factor that needs to be tuned to the final system case by case. What we found from the test is that different applications running in our guest OS require different threshold settings.

c) Again, with the enhancement patch in KVM and the hypercall in the guest OS, the guest OS system clock jitter does not increase over time. It is not application-dependent either. And the max jitter is very close to the case of pinning vCPUs to pCPUs (no vCPU stacking from the same VM in the system - the best we can expect).

Regards 
Bin 


-----Original Message-----
From: Paolo Bonzini [mailto:paolo.bonzini@gmail.com] On Behalf Of Paolo Bonzini
Sent: Monday, March 03, 2014 2:21 PM
To: Li, Bin (Bin); kvm@vger.kernel.org
Cc: Jatania, Neel (Neel); linux-kernel@vger.kernel.org; Avi Kiviti; Srivatsa Vaddagiri; Peter Zijlstra; Mike Galbraith; Chris Wright; ttracy@redhat.com; Nakajima, Jun; riel@redhat.com
Subject: Re: Enhancement for PLE handler in KVM

On 03/03/2014 19:24, Li, Bin (Bin) wrote:
> Hello, all.
>
> The PLE handler attempts to determine an alternate vCPU to schedule.  
> In some cases the wrong vCPU is scheduled and performance suffers.
>
> This patch allows for the guest OS to signal, using a hypercall, that 
> it's starting/ending a critical section.  Using this information in 
> the PLE handler allows for a more intelligent VCPU scheduling 
> determination to be made.  The patch only changes the PLE behaviour if 
> this new hypercall mechanism is used; if it isn't used, then the 
> existing PLE algorithm continues to be used to determine the next vCPU.
>
> Benefit from the patch:
>  -  the guest OS real time performance being significantly improved 
> when using hyper call marking entering and leaving guest OS kernel state.
>  - The guest OS system clock jitter measured on on Intel E5 2620 
> reduced from 400ms down to 6ms.
>  - The guest OS system lock is set to a 2ms clock interrupt. The 
> jitter is measured by the difference between dtsc() value in clock 
> interrupt handler and the expectation of tsc value.
>  - detail of test report is attached as reference.

This patch doesn't include the corresponding guest changes, so it's not clear how you would use it and what the overhead would be: a hypercall is ~30 times slower than an uncontended spin_lock or spin_unlock.

In fact, performance numbers for common workloads are useful too.

Have you looked at the recent "paravirtual ticketlock"?  It does roughly the opposite as this patch: the guest can signal when it's been spinning too much, and the host will schedule it out (which hopefully accelerates the end of the critical section).

Paolo



> Path details:
>
> From 77edfa193a4e29ab357ec3b1e097f8469d418507 Mon Sep 17 00:00:00 2001


^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: Enhancement for PLE handler in KVM
  2014-03-05 14:17       ` Li, Bin (Bin)
@ 2014-03-05 14:49         ` Paolo Bonzini
  2014-03-05 21:16           ` Li, Bin (Bin)
  0 siblings, 1 reply; 13+ messages in thread
From: Paolo Bonzini @ 2014-03-05 14:49 UTC (permalink / raw)
  To: Li, Bin (Bin), kvm@vger.kernel.org
  Cc: Jatania, Neel (Neel), linux-kernel@vger.kernel.org,
	Srivatsa Vaddagiri, Peter Zijlstra, Mike Galbraith, Chris Wright,
	ttracy@redhat.com, Nakajima, Jun, riel@redhat.com

On 05/03/2014 15:17, Li, Bin (Bin) wrote:
> Hello, Paolo,
>
> We are using a customized embedded SMP OS as guest OS. It is not meaningful to post the guest OS code.
> Also, there is no "performance numbers for common workloads" since there is no common workloads to compare with.
> In our OS, there is still a big kernel lock to protect the kernel.

Does this mean that the average spinning time for the spinlock is 
relatively high compared to Linux or Windows?

> - when the in-correct boosting happens, the vCPU in spin lock will run
> longer time on the pCPU and causing the lock holder vCPU having less
> time to run on pCPU since they are sharing the on same pCPU.

Correct.  This is an unfortunate problem in the current implementation 
of PLE.

> Adding hyper call in every kernel enter and kernel exist is
> expensive. From the trace log collect from i7 running @ 3.0GHz , the
> cost per  hyper is <1us.

Right, it is around 1500 cycles and 0.4-0.5 us, i.e. approximately 1 us 
for enter and exit together.

This is not too bad for a kernel with a big lock, but not acceptable if 
you do not have it (as is the case for Linux and Windows).

> Regarding to the " paravirtual ticketlock ", we did try the same idea in our embedded guest OS.
> We got following results:
>
> a) We implemented similar approach like linux "paravirtual
> ticketlock". The system clock jitter does get reduced a lot. But, the
> system clock jitter is still happening at lower rate. In a few hours
> system stress test, we still see the big jitter few times.

Did you find out why?  It could happen if the virtual CPU is scheduled 
out for a relatively long time.  A small number of spinning iterations 
can then account for a relatively large time.

My impression is that you're implementing a paravirtual spinlock, except 
that you're relying on PLE to decide when to go to sleep.  PLE is 
implemented using the TSC.  Can you assume the host TSC is of good 
quality?  If so, perhaps you can try to modify the pv ticketlock 
algorithm, and use a threshold based on TSC instead of an iteration count?
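
Concretely, something along these lines (a hypothetical guest-side sketch;
try_acquire(), queue_wakeup_request(), halt_until_kicked(), this_vcpu_id(),
cpu_relax() and rdtsc() stand in for whatever primitives the guest OS
already has, and spin_timeout_cycles would be derived from the TSC
frequency):

    static void lock_slowpath(struct guest_lock *lk)
    {
            unsigned long long deadline = rdtsc() + spin_timeout_cycles;

            while (!try_acquire(lk)) {
                    cpu_relax();   /* PAUSE; this is what triggers PLE exits */

                    if (rdtsc() > deadline) {
                            /* spun too long in time, not iterations:
                               ask the lock holder for a wakeup, then halt */
                            queue_wakeup_request(lk, this_vcpu_id());
                            halt_until_kicked();   /* woken by an IPI */
                            deadline = rdtsc() + spin_timeout_cycles;
                    }
            }
    }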

> b) When using "paravirtual ticketlock", the threshold to decide "are
> we spinning too much" becomes an important factor need to be tuned to
> the final system case by case. What we found from the test is, different
> application running in our guest OS would require different threshold
> setting.

Did you also find out here why this is the case?

Paolo

^ permalink raw reply	[flat|nested] 13+ messages in thread

* RE: Enhancement for PLE handler in KVM
  2014-03-05 14:49         ` Paolo Bonzini
@ 2014-03-05 21:16           ` Li, Bin (Bin)
  2014-03-07  3:06             ` Marcelo Tosatti
  0 siblings, 1 reply; 13+ messages in thread
From: Li, Bin (Bin) @ 2014-03-05 21:16 UTC (permalink / raw)
  To: Paolo Bonzini, kvm@vger.kernel.org
  Cc: Jatania, Neel (Neel), linux-kernel@vger.kernel.org,
	Peter Zijlstra, Mike Galbraith, Chris Wright, ttracy@redhat.com,
	Nakajima, Jun, riel@redhat.com

Thanks for the quick response.

Comments are inline, and I have also added a typical trace log.

Regards
Bin
  

-----Original Message-----
From: Paolo Bonzini [mailto:pbonzini@redhat.com] 
Sent: Wednesday, March 05, 2014 9:49 AM
To: Li, Bin (Bin); kvm@vger.kernel.org
Cc: Jatania, Neel (Neel); linux-kernel@vger.kernel.org; Srivatsa Vaddagiri; Peter Zijlstra; Mike Galbraith; Chris Wright; ttracy@redhat.com; Nakajima, Jun; riel@redhat.com
Subject: Re: Enhancement for PLE handler in KVM

On 05/03/2014 15:17, Li, Bin (Bin) wrote:
> Hello, Paolo,
>
> We are using a customized embedded SMP OS as guest OS. It is not meaningful to post the guest OS code.
> Also, there is no "performance numbers for common workloads" since there is no common workloads to compare with.
> In our OS, there is still a big kernel lock to protect the kernel.

Does this means that average spinning time for the spinlock is relatively high compared to Linux or Windows?

Binl:

Yes. The default setting for ple_window is 4096, which is based on Linux and Windows. In our guest OS, ple_window needs to be at least 16384 in order to prevent large jitter from causing a system reset. With ple_window set to 16384, the biggest jitter in the guest OS is about 24-25ms.


> - when the in-correct boosting happens, the vCPU in spin lock will run 
> longer time on the pCPU and causing the lock holder vCPU having less 
> time to run on pCPU since they are sharing the on same pCPU.

Correct.  This is an unfortunate problem in the current implementation of PLE.

> Adding hyper call in every kernel enter and kernel exist is expensive. 
> From the trace log collect from i7 running @ 3.0GHz , the cost per  
> hyper is <1us.

Right, it is around 1500 cycles and 0.4-0.5 us, i.e. approximately 1 us for enter and exit together.

This is not too bad for a kernel with a big lock, but not acceptable if you do not have it (as is the case for Linux and Windows).

> Regarding to the " paravirtual ticketlock ", we did try the same idea in our embedded guest OS.
> We got following results:
>
> a) We implemented similar approach like linux "paravirtual 
> ticketlock". The system clock jitter does get reduced a lot. But, the 
> system clock jitter is still happening at lower rate. In a few hours 
> system stress test, we still see the big jitter few times.

Did you find out why?  It could happen if the virtual CPU is scheduled out for a relatively long time.  A small number of spinning iterations can then account for a relatively large time.

My impression is that you're implementing a paravirtual spinlock, except that you're relying on PLE to decide when to go to sleep.  PLE is implemented using the TSC.  Can you assume the host TSC is of good quality?  If so, perhaps you can try to modify the pv ticketlock algorithm, and use a threshold based on TSC instead of an iteration count?

BinL:
Our "paravirtual ticketlock" is done in the following way:
 - When a vCPU is in a spin loop trying to get into kernel state, it sets up a counter to track how many iterations it has spent in the tight loop.
If the counter reaches a certain threshold, the vCPU adds a request to the lock holder's work-item pipe (a multiple-producer, single-consumer pipe) and then executes the "hlt" instruction, waiting for the lock holder to wake it up.
 - The TSC is not used in this case.
 - When the lock holder sees the request, it sends an IPI to wake up the vCPU that is halted.
 - With this implementation, the system stress test (a run of several hours with traffic) completes successfully most of the time, but not all of the time. The big jitter still happens during the run, just less frequently. The root cause of the problem is still there.

 The lock holder vCPU is scheduled out for a relatively long time. We investigated further to find out why.
  a) From the trace-cmd log, we believe the root cause is that the PLE handler incorrectly boosts the spinning vCPU that is stacked with the lock holder vCPU on the same pCPU. 
     In the sample log we captured, the lock holder vCPU and the spinning vCPU both run on one pCPU. The spinning vCPU is incorrectly boosted by the PLE handler, and the Linux scheduler can toggle the two vCPUs on that pCPU at a very high rate. The worst case is when the lock happens to bounce between the two vCPUs on the same pCPU. Then we see the lock holder vCPU scheduled in to do a little bit of real work and the spinning vCPU scheduled in to spin, with context switches on the pCPU happening very often (see more details of a typical trace below).

     This causes the real CPU power used for real work to become much less than normal (a short period of true overload on this pCPU), and the guest OS stays in a very slow kernel-work state because the lock holder vCPU has much less time to do its real work. The system eventually gets out of this slow phase, but depending on how long it was stuck, the application may find that a big time gap has already passed by the time the system gets back to normal speed, and many timeouts follow...

  b) We also did another test to verify the theory. We built a KVM with a dummy PLE handler (no yield-to on the PLE VM exit, just return right away). With this dummy PLE handler, the jitter is no longer an issue: the biggest jitter in the system is just around 25-30ms during the hours-long traffic run.
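
     For reference, the "dummy" handler amounts to roughly the following
     change to handle_pause() in arch/x86/kvm/vmx.c (a sketch against the
     3.x code, where the stock handler calls kvm_vcpu_on_spin() after
     skipping the PAUSE instruction):

        static int handle_pause(struct kvm_vcpu *vcpu)
        {
                skip_emulated_instruction(vcpu);
                /* dummy handler for the experiment: no kvm_vcpu_on_spin(),
                   i.e. no directed yield, just resume the same vCPU */
                return 1;
        }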

  c) Here is a typical trace captured when running on CentOS 6.4 (an even older PLE handler, which incorrectly boosts the spinning vCPU more often).

      A typical trace looks like this:

        In the following trace, vCPU 0 is held off for 6ms (no chance to run).
        6ms here is not a big deal, but it shows that Linux can decide not to let a process run.

       brief event
...
        qemu-kvm-5069  [003]  8242.604669: kvm_apic_ipi:         dst 9 vec 253 (Fixed|physical|assert|edge|dst)
        qemu-kvm-5069  [003]  8242.604670: kvm_apic_accept_irq:  apicid 9 vec 253 (Fixed|edge)
        qemu-kvm-5069  [003]  8242.604673: wakeup:               5069:120:0  ==+ 5078:120:0 [003]
        qemu-kvm-5069  [003]  8242.604677: context_switch:       5069:120:0  ==> 5078:120:0 [003]   ... linux stacking vcpu 9 (pid 5078) on cpu 3 too
...
        qemu-kvm-5077  [001]  8242.610485: kvm_exit:             [FAILED TO PARSE] exit_reason=40 guest_rip=0x60edd06
        qemu-kvm-5073  [000]  8242.610485: kvm_entry:            vcpu 4
        qemu-kvm-5076  [015]  8242.610486: kvm_entry:            vcpu 7
        qemu-kvm-5078  [003]  8242.610486: kvm_entry:            vcpu 9
        qemu-kvm-5075  [014]  8242.610488: kvm_exit:             [FAILED TO PARSE] exit_reason=40 guest_rip=0x60edd06 ---> exist due to pause instruction.,
        qemu-kvm-5070  [016]  8242.610488: kvm_entry:            vcpu 1
        qemu-kvm-5073  [000]  8242.610488: kvm_exit:             [FAILED TO PARSE] exit_reason=40 guest_rip=0x60edd06
        qemu-kvm-5078  [003]  8242.610488: kvm_exit:             [FAILED TO PARSE] exit_reason=1 guest_rip=0x60edd08
...
        qemu-kvm-5078  [003]  8242.610491: context_switch:       5078:120:0  ==> 5069:120:0 [003]

        qemu-kvm-5069  [003]  8242.610494: kvm_apic_accept_irq:  apicid 0 vec 240 (Fixed|edge)
...
        qemu-kvm-5069  [003]  8242.610496: context_switch:       5069:120:0  ==> 5078:120:0 [003]

... ( all other CPUs are busy trying to entering kernel state )
       qemu-kvm-5078  [003]  8242.610502: context_switch:       5078:120:0  ==> 5069:120:0 [003]
... ( all other CPUs are busy trying to entering kernel state )

        qemu-kvm-5069  [003]  8242.610552: kvm_apic_accept_irq:  apicid 0 vec 240 (Fixed|edge) (coalesced)
        qemu-kvm-5069  [003]  8242.610554: context_switch:       5069:120:0  ==> 5078:120:0 [003]
        qemu-kvm-5069  [003]  8242.610559: kvm_apic_accept_irq:  apicid 0 vec 240 (Fixed|edge) (coalesced)
        qemu-kvm-5069  [003]  8242.610562: context_switch:       5069:120:0  ==> 5078:120:0 [003]
        qemu-kvm-5069  [003]  8242.610569: kvm_apic_accept_irq:  apicid 0 vec 240 (Fixed|edge) (coalesced)
        qemu-kvm-5069  [003]  8242.610571: context_switch:       5069:120:0  ==> 5078:120:0 [003]
        qemu-kvm-5069  [003]  8242.610577: kvm_entry:            vcpu 0
        qemu-kvm-5069  [003]  8242.610579: kvm_exit:             [FAILED TO PARSE] exit_reason=1 guest_rip=0x60ed11f
        qemu-kvm-5069  [003]  8242.610579: kvm_apic_accept_irq:  apicid 0 vec 240 (Fixed|edge) (coalesced)
        qemu-kvm-5069  [003]  8242.610581: context_switch:       5069:120:0  ==> 5078:120:0 [003]
        qemu-kvm-5069  [003]  8242.610588: kvm_apic_accept_irq:  apicid 0 vec 240 (Fixed|edge) (coalesced)
        qemu-kvm-5069  [003]  8242.610590: context_switch:       5069:120:0  ==> 5078:120:0 [003]
        qemu-kvm-5069  [003]  8242.610595: kvm_apic_accept_irq:  apicid 0 vec 240 (Fixed|edge) (coalesced)
        qemu-kvm-5069  [003]  8242.610597: context_switch:       5069:120:0  ==> 5078:120:0 [003]
        qemu-kvm-5069  [003]  8242.610605: kvm_apic_accept_irq:  apicid 0 vec 240 (Fixed|edge) (coalesced)
        qemu-kvm-5069  [003]  8242.610607: context_switch:       5069:120:0  ==> 5078:120:0 [003]
...
       qemu-kvm-5078  [003]  8242.610611: context_switch:       5078:120:0  ==> 5069:120:0 [003]
...
       qemu-kvm-5069  [003]  8242.610613: kvm_entry:            vcpu 0   ... start run vm work after not having chance to do real work for 6 ms.
...  

        Close to the end of this log file, we can see that only vCPU 0 and vCPU 9 are doing some real work, very slowly, and both on physical CPU 003, while the others keep doing kvm_exit (reason 40, due to the pause instruction) and kvm_entry. 

        qemu-kvm-5078  [003]  8242.617050: kvm_entry:            vcpu 9
        qemu-kvm-5071  [005]  8242.617051: kvm_exit:             [FAILED TO PARSE] exit_reason=40 guest_rip=0x6180a6c
        qemu-kvm-5070  [016]  8242.617051: kvm_exit:             [FAILED TO PARSE] exit_reason=40 guest_rip=0x6180a6c
        qemu-kvm-5078  [003]  8242.617052: kvm_exit:             [FAILED TO PARSE] exit_reason=0 guest_rip=0x6194c00
        qemu-kvm-5075  [014]  8242.617053: kvm_entry:            vcpu 6
        qemu-kvm-5074  [013]  8242.617054: kvm_entry:            vcpu 5
        qemu-kvm-5073  [000]  8242.617054: kvm_entry:            vcpu 4
        qemu-kvm-5072  [017]  8242.617054: kvm_entry:            vcpu 3
        qemu-kvm-5079  [004]  8242.617054: kvm_entry:            vcpu 10
        qemu-kvm-5076  [015]  8242.617055: kvm_entry:            vcpu 7
        qemu-kvm-5071  [005]  8242.617055: kvm_entry:            vcpu 2
        qemu-kvm-5070  [016]  8242.617055: kvm_entry:            vcpu 1
        qemu-kvm-5077  [001]  8242.617056: kvm_entry:            vcpu 8
        qemu-kvm-5075  [014]  8242.617056: kvm_exit:             [FAILED TO PARSE] exit_reason=40 guest_rip=0x6180a6c
        qemu-kvm-5078  [003]  8242.617056: context_switch:       5078:120:0  ==> 5069:120:0 [003]
        qemu-kvm-5073  [000]  8242.617057: kvm_exit:             [FAILED TO PARSE] exit_reason=40 guest_rip=0x618b94b
        qemu-kvm-5074  [013]  8242.617057: kvm_exit:             [FAILED TO PARSE] exit_reason=40 guest_rip=0x6180a6c
        qemu-kvm-5072  [017]  8242.617057: kvm_exit:             [FAILED TO PARSE] exit_reason=40 guest_rip=0x6180a6c
        qemu-kvm-5079  [004]  8242.617058: kvm_exit:             [FAILED TO PARSE] exit_reason=40 guest_rip=0x6180a6c
        qemu-kvm-5076  [015]  8242.617058: kvm_exit:             [FAILED TO PARSE] exit_reason=40 guest_rip=0x6180a6c
        qemu-kvm-5071  [005]  8242.617058: kvm_exit:             [FAILED TO PARSE] exit_reason=40 guest_rip=0x6180a6c
        qemu-kvm-5075  [014]  8242.617059: kvm_entry:            vcpu 6
        qemu-kvm-5070  [016]  8242.617059: kvm_exit:             [FAILED TO PARSE] exit_reason=40 guest_rip=0x6180a6c
        qemu-kvm-5077  [001]  8242.617059: kvm_exit:             [FAILED TO PARSE] exit_reason=40 guest_rip=0x6180a6c
        qemu-kvm-5069  [003]  8242.617060: context_switch:       5069:120:0  ==> 5078:120:0 [003]
        qemu-kvm-5075  [014]  8242.617062: kvm_exit:             [FAILED TO PARSE] exit_reason=40 guest_rip=0x6180a6c
        qemu-kvm-5076  [015]  8242.617062: kvm_entry:            vcpu 7
        qemu-kvm-5071  [005]  8242.617063: kvm_entry:            vcpu 2
        qemu-kvm-5070  [016]  8242.617063: kvm_entry:            vcpu 1
        qemu-kvm-5077  [001]  8242.617064: kvm_entry:            vcpu 8
        qemu-kvm-5072  [017]  8242.617064: kvm_entry:            vcpu 3
        qemu-kvm-5078  [003]  8242.617065: context_switch:       5078:120:0  ==> 5069:120:0 [003]
        qemu-kvm-5076  [015]  8242.617066: kvm_exit:             [FAILED TO PARSE] exit_reason=40 guest_rip=0x6180a6c
        qemu-kvm-5071  [005]  8242.617066: kvm_exit:             [FAILED TO PARSE] exit_reason=40 guest_rip=0x6180a6c
        qemu-kvm-5070  [016]  8242.617067: kvm_exit:             [FAILED TO PARSE] exit_reason=40 guest_rip=0x6180a6c
        qemu-kvm-5074  [013]  8242.617067: kvm_entry:            vcpu 5
        qemu-kvm-5077  [001]  8242.617068: kvm_exit:             [FAILED TO PARSE] exit_reason=40 guest_rip=0x6180a6c
        qemu-kvm-5072  [017]  8242.617068: kvm_exit:             [FAILED TO PARSE] exit_reason=40 guest_rip=0x6180a6c
        qemu-kvm-5073  [000]  8242.617068: kvm_entry:            vcpu 4
        qemu-kvm-5079  [004]  8242.617068: kvm_entry:            vcpu 10
        qemu-kvm-5069  [003]  8242.617069: context_switch:       5069:120:0  ==> 5078:120:0 [003]
        qemu-kvm-5074  [013]  8242.617071: kvm_exit:             [FAILED TO PARSE] exit_reason=40 guest_rip=0x6180a6c
        qemu-kvm-5073  [000]  8242.617071: kvm_exit:             [FAILED TO PARSE] exit_reason=40 guest_rip=0x618b94b
        qemu-kvm-5079  [004]  8242.617071: kvm_exit:             [FAILED TO PARSE] exit_reason=40 guest_rip=0x6180a6c
        qemu-kvm-5072  [017]  8242.617071: kvm_entry:            vcpu 3
        qemu-kvm-5075  [014]  8242.617072: kvm_entry:            vcpu 6
        qemu-kvm-5078  [003]  8242.617072: context_switch:       5078:120:0  ==> 5069:120:0 [003]
        qemu-kvm-5077  [001]  8242.617072: kvm_entry:            vcpu 8
        qemu-kvm-5075  [014]  8242.617075: kvm_exit:             [FAILED TO PARSE] exit_reason=40 guest_rip=0x6180a6c
        qemu-kvm-5072  [017]  8242.617075: kvm_exit:             [FAILED TO PARSE] exit_reason=40 guest_rip=0x6180a6c
        qemu-kvm-5076  [015]  8242.617076: kvm_entry:            vcpu 7
        qemu-kvm-5077  [001]  8242.617076: kvm_exit:             [FAILED TO PARSE] exit_reason=40 guest_rip=0x6180a6c
        qemu-kvm-5071  [005]  8242.617076: kvm_entry:            vcpu 2
        qemu-kvm-5073  [000]  8242.617076: kvm_entry:            vcpu 4
        qemu-kvm-5070  [016]  8242.617076: kvm_entry:            vcpu 1
        qemu-kvm-5074  [013]  8242.617077: kvm_entry:            vcpu 5
        qemu-kvm-5079  [004]  8242.617077: kvm_entry:            vcpu 10
        qemu-kvm-5069  [003]  8242.617077: context_switch:       5069:120:0  ==> 5078:120:0 [003]
        qemu-kvm-5071  [005]  8242.617079: kvm_exit:             [FAILED TO PARSE] exit_reason=40 guest_rip=0x6180a6c
        qemu-kvm-5076  [015]  8242.617079: kvm_exit:             [FAILED TO PARSE] exit_reason=40 guest_rip=0x6180a6c
        qemu-kvm-5073  [000]  8242.617079: kvm_exit:             [FAILED TO PARSE] exit_reason=40 guest_rip=0x618b94b
        qemu-kvm-5070  [016]  8242.617080: kvm_exit:             [FAILED TO PARSE] exit_reason=40 guest_rip=0x6180a6c
        qemu-kvm-5074  [013]  8242.617080: kvm_exit:             [FAILED TO PARSE] exit_reason=40 guest_rip=0x6180a6c
        qemu-kvm-5079  [004]  8242.617080: kvm_exit:             [FAILED TO PARSE] exit_reason=40 guest_rip=0x6180a6c
        qemu-kvm-5077  [001]  8242.617081: kvm_entry:            vcpu 8
        qemu-kvm-5075  [014]  8242.617081: kvm_entry:            vcpu 6
        qemu-kvm-5078  [003]  8242.617082: context_switch:       5078:120:0  ==> 5069:120:0 [003]
        qemu-kvm-5069  [003]  8242.617083: kvm_entry:            vcpu 0
        qemu-kvm-5072  [017]  8242.617084: kvm_entry:            vcpu 3
        qemu-kvm-5077  [001]  8242.617084: kvm_exit:             [FAILED TO PARSE] exit_reason=40 guest_rip=0x6180a6c
        qemu-kvm-5075  [014]  8242.617084: kvm_exit:             [FAILED TO PARSE] exit_reason=40 guest_rip=0x6180a6c
        qemu-kvm-5069  [003]  8242.617086: kvm_exit:             [FAILED TO PARSE] exit_reason=1 guest_rip=0x60edd08
        qemu-kvm-5072  [017]  8242.617087: kvm_exit:             [FAILED TO PARSE] exit_reason=40 guest_rip=0x6180a6c
        qemu-kvm-5074  [013]  8242.617087: kvm_entry:            vcpu 5
        qemu-kvm-5079  [004]  8242.617087: kvm_entry:            vcpu 10
        qemu-kvm-5071  [005]  8242.617088: kvm_entry:            vcpu 2
        qemu-kvm-5069  [003]  8242.617089: context_switch:       5069:120:0  ==> 5078:120:0 [003]
        qemu-kvm-5076  [015]  8242.617089: kvm_entry:            vcpu 7
        qemu-kvm-5074  [013]  8242.617090: kvm_exit:             [FAILED TO PARSE] exit_reason=40 guest_rip=0x6180a6c
        qemu-kvm-5070  [016]  8242.617090: kvm_entry:            vcpu 1
        qemu-kvm-5078  [003]  8242.617090: kvm_entry:            vcpu 9
        qemu-kvm-5079  [004]  8242.617090: kvm_exit:             [FAILED TO PARSE] exit_reason=40 guest_rip=0x6180a6c
        qemu-kvm-5073  [000]  8242.617091: kvm_entry:            vcpu 4
        qemu-kvm-5071  [005]  8242.617091: kvm_exit:             [FAILED TO PARSE] exit_reason=40 guest_rip=0x6180a6c
        qemu-kvm-5076  [015]  8242.617092: kvm_exit:             [FAILED TO PARSE] exit_reason=40 guest_rip=0x6180a6c
        qemu-kvm-5072  [017]  8242.617092: kvm_entry:            vcpu 3
        qemu-kvm-5078  [003]  8242.617092: kvm_exit:             [FAILED TO PARSE] exit_reason=1 guest_rip=0x6194c00
        qemu-kvm-5075  [014]  8242.617093: kvm_entry:            vcpu 6
        qemu-kvm-5079  [004]  8242.617093: kvm_entry:            vcpu 10
        qemu-kvm-5070  [016]  8242.617094: kvm_exit:             [FAILED TO PARSE] exit_reason=40 guest_rip=0x6180a6c
        qemu-kvm-5077  [001]  8242.617094: kvm_entry:            vcpu 8

> b) When using "paravirtual ticketlock", the threshold to decide "are 
> we spinning too much" becomes an important factor need to be tuned to 
> the final system case by case. What we found from the test is, 
> different application running in our guest OS would require different 
> threshold setting.

Did you also find out here why this is the case?

BinL:
Yes. The applications running in our customized embedded OS are real-time applications, which are timing sensitive. 
The level of timing sensitivity varies among the applications.
When I talk about different thresholds, the assumption is the default 4096 ple_window setting.
If I set ple_window to 16384 or higher, there is no problem for our application. But the final user could also run Linux or Windows VMs on the same hypervisor, which would prefer the default 4096 setting.
We are looking for a solution that is good for both Linux/Windows and real-time applications.
The enhancement patch we proposed satisfies both Linux/Windows applications and real-time embedded applications.

Paolo

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: Enhancement for PLE handler in KVM
  2014-03-05 21:16           ` Li, Bin (Bin)
@ 2014-03-07  3:06             ` Marcelo Tosatti
  2014-03-07 14:26               ` Li, Bin (Bin)
  0 siblings, 1 reply; 13+ messages in thread
From: Marcelo Tosatti @ 2014-03-07  3:06 UTC (permalink / raw)
  To: Li, Bin (Bin)
  Cc: Paolo Bonzini, kvm@vger.kernel.org, Jatania, Neel (Neel),
	linux-kernel@vger.kernel.org, Peter Zijlstra, Mike Galbraith,
	Chris Wright, ttracy@redhat.com, Nakajima, Jun, riel@redhat.com

On Wed, Mar 05, 2014 at 09:16:45PM +0000, Li, Bin (Bin) wrote:
> Did you also find out here why this is the case?
> 
> Binl:
> Yes. The application running in our customized embedded OS is also real time application which is timing sensitive. 
> The timing sensitive level varies among the applications.
> When I talking about different threshold, the assumption is using default 4096 ple_window setting.
> If I set up the ple_window to 16384 or higher, there will be no problem for our application. But the finial user could also run linux or window VM on same hypervisor which would prefer default 4096 setting for linux and windows.

Then have per-VM PLE values?

> We are looking for a solution to be good for both linux / window and real time application.
> The enhancement patch we proposed will satisfy both linux/window application and real time embedded applications.
> 
> Paolo

^ permalink raw reply	[flat|nested] 13+ messages in thread

* RE: Enhancement for PLE handler in KVM
  2014-03-07  3:06             ` Marcelo Tosatti
@ 2014-03-07 14:26               ` Li, Bin (Bin)
  2014-03-07 17:41                 ` Marcelo Tosatti
  0 siblings, 1 reply; 13+ messages in thread
From: Li, Bin (Bin) @ 2014-03-07 14:26 UTC (permalink / raw)
  To: Marcelo Tosatti
  Cc: Paolo Bonzini, kvm@vger.kernel.org, Jatania, Neel (Neel),
	linux-kernel@vger.kernel.org

Can we have "per-VM PLE values"?

My understanding is that the PLE values are kvm module settings which apply to all VMs in the system.
All VMs must be stopped first, then kvm-intel must be unloaded and reloaded with the new PLE setting:

/sbin/modprobe -r kvm-intel
/sbin/modprobe kvm-intel ple_window=16384

Regards
Bin

-----Original Message-----
From: Marcelo Tosatti [mailto:mtosatti@redhat.com] 
Sent: Thursday, March 06, 2014 10:07 PM
To: Li, Bin (Bin)
Cc: Paolo Bonzini; kvm@vger.kernel.org; Jatania, Neel (Neel); linux-kernel@vger.kernel.org; Peter Zijlstra; Mike Galbraith; Chris Wright; ttracy@redhat.com; Nakajima, Jun; riel@redhat.com
Subject: Re: Enhancement for PLE handler in KVM

On Wed, Mar 05, 2014 at 09:16:45PM +0000, Li, Bin (Bin) wrote:
> Did you also find out here why this is the case?
> 
> Binl:
> Yes. The application running in our customized embedded OS is also real time application which is timing sensitive. 
> The timing sensitive level varies among the applications.
> When I talking about different threshold, the assumption is using default 4096 ple_window setting.
> If I set up the ple_window to 16384 or higher, there will be no problem for our application. But the finial user could also run linux or window VM on same hypervisor which would prefer default 4096 setting for linux and windows.

Then have per-VM PLE values?

> We are looking for a solution to be good for both linux / window and real time application.
> The enhancement patch we proposed will satisfy both linux/window application and real time embedded applications.
> 
> Paolo

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: Enhancement for PLE handler in KVM
  2014-03-07 14:26               ` Li, Bin (Bin)
@ 2014-03-07 17:41                 ` Marcelo Tosatti
  2014-03-07 22:08                   ` Li, Bin (Bin)
  0 siblings, 1 reply; 13+ messages in thread
From: Marcelo Tosatti @ 2014-03-07 17:41 UTC (permalink / raw)
  To: Li, Bin (Bin)
  Cc: Paolo Bonzini, kvm@vger.kernel.org, Jatania, Neel (Neel),
	linux-kernel@vger.kernel.org

On Fri, Mar 07, 2014 at 02:26:19PM +0000, Li, Bin (Bin) wrote:
> Can we have "per-VM PLE values"?
> 
> My understanding is that the ple values are kvm module setting which applying to all VMs in the system.
> And all vms must be stopped first, then unload kvm-intel, reload kvm-intel with new ple setting.
> 
> /sbin/modprobe -r kvm-intel
> /sbin/modprobe kvm-intel ple_window=16384
> 
> Regards
> Bin

Yes, but it can be made per-VM (it's a VMCS field).
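
(For illustration: PLE_GAP and PLE_WINDOW are ordinary VMCS fields that
vmx.c currently programs from the global module parameters when each vCPU
is set up, roughly:

    if (ple_gap) {
            vmcs_write32(PLE_GAP, ple_gap);
            vmcs_write32(PLE_WINDOW, ple_window);
    }

so a per-VM value would mainly mean feeding vmcs_write32() a per-VM field
here, for example a hypothetical kvm->arch.ple_window, plus a userspace
interface to set it.)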

^ permalink raw reply	[flat|nested] 13+ messages in thread

* RE: Enhancement for PLE handler in KVM
  2014-03-07 17:41                 ` Marcelo Tosatti
@ 2014-03-07 22:08                   ` Li, Bin (Bin)
  2014-03-08  1:54                     ` Marcelo Tosatti
  0 siblings, 1 reply; 13+ messages in thread
From: Li, Bin (Bin) @ 2014-03-07 22:08 UTC (permalink / raw)
  To: Marcelo Tosatti
  Cc: Paolo Bonzini, kvm@vger.kernel.org, Jatania, Neel (Neel),
	linux-kernel@vger.kernel.org

Fully agree.
Making the PLE setting per-VM would be a very helpful feature. 
It would give the VM user more flexible control; all KVM users would love to have it.

The enhancement we proposed neither overlaps nor conflicts with that feature. The enhancement targets providing the best real-time performance to the guest OS.
There will be more and more embedded systems migrating to KVM, especially in the telecom industry, and many existing systems run on top of customized embedded OSes that are significantly different from generic OSes (either Linux or Windows).

Are there any concerns regarding the enhancement that we need to address, or is more work needed?

Regards
Bin

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: Enhancement for PLE handler in KVM
  2014-03-07 22:08                   ` Li, Bin (Bin)
@ 2014-03-08  1:54                     ` Marcelo Tosatti
       [not found]                       ` <531F19D7.6030909@alcatel-lucent.com>
  0 siblings, 1 reply; 13+ messages in thread
From: Marcelo Tosatti @ 2014-03-08  1:54 UTC (permalink / raw)
  To: Li, Bin (Bin)
  Cc: Paolo Bonzini, kvm@vger.kernel.org, Jatania, Neel (Neel),
	linux-kernel@vger.kernel.org

On Fri, Mar 07, 2014 at 10:08:52PM +0000, Li, Bin (Bin) wrote:
> Fully agree.
> It would be a very helpful feature to make the ple setting per-VM.
> This feature would give the VM user more flexible control. All KVM users would love to have it.
> 
> The enhancement we proposed neither overlaps nor conflicts with this feature. The enhancement targets the best real-time performance for the guest OS.
> More and more embedded systems will migrate to KVM, especially in the telecom industry, and many existing systems run on top of customized embedded OSes which are significantly different from generic OSes (either Linux or Windows).

The point Paolo raised is that the hypercall interface can increase CPU 
consumption significantly. It would be good to understand why the
interface is the only way to fix the problem.

> Is there any concern regarding to the enhancement we need to address? Or more work need to be done?

It was not clear from the information you provided that increasing the PLE
window alone is not sufficient to reduce clock jitter in the guest
to acceptable levels (from 400 ms to <10 ms).

BTW, can you explain the clock jitter measure?
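
(As described earlier in the thread, the measure compares the TSC read in the guest's 2 ms clock interrupt handler against the TSC value expected for that tick. A minimal guest-side sketch, with an assumed 2 GHz TSC rate and illustrative names:)

#include <stdint.h>

#define TSC_PER_TICK 4000000ULL		/* 2 ms at an assumed 2 GHz TSC rate */

static uint64_t expected_tsc;
static uint64_t max_jitter_tsc;

static inline uint64_t read_tsc(void)
{
	uint32_t lo, hi;
	asm volatile("rdtsc" : "=a"(lo), "=d"(hi));
	return ((uint64_t)hi << 32) | lo;
}

/* Called from the guest's 2 ms periodic clock interrupt handler. */
void clock_tick(void)
{
	uint64_t now = read_tsc();

	if (expected_tsc) {
		uint64_t jitter = now > expected_tsc ? now - expected_tsc
						     : expected_tsc - now;
		if (jitter > max_jitter_tsc)
			max_jitter_tsc = jitter;	/* worst-case deviation seen */
	}
	expected_tsc = now + TSC_PER_TICK;		/* expectation for the next tick */
}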

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: Enhancement for PLE handler in KVM
       [not found]                       ` <531F19D7.6030909@alcatel-lucent.com>
@ 2014-03-11 16:14                         ` Paolo Bonzini
  2014-03-12 13:05                           ` Li, Bin (Bin)
  0 siblings, 1 reply; 13+ messages in thread
From: Paolo Bonzini @ 2014-03-11 16:14 UTC (permalink / raw)
  To: Li, Bin (Bin), Marcelo Tosatti
  Cc: kvm@vger.kernel.org, Jatania, Neel (Neel),
	linux-kernel@vger.kernel.org

On 11/03/2014 15:12, Li, Bin (Bin) wrote:
>     - For a guest OS which doesn't use the hypercall interface, there
>     will be no impact. The proposed PLE handler enhancement is
>     structured to use the hint only if the guest OS uses the newly
>     proposed hypercall, and it is per-VM only.
>     A VM running a general guest OS (Linux / Windows) will still
>     use today's PLE handler to boost vCPUs, while for a VM
>     which uses the new hypercalls indicating lock get and release, the
>     PLE handler for that VM will boost the lock holder only.

No, if there is a jitter problem we want to fix it for all guest OSes, 
not just for those that use a big kernel lock.

>     - The main advantage of this proposal is that it reliably solves
>     the problem. Is there any other option which could prevent the
>     problem from happening entirely?

You haven't proved this yet.  My impression is that, on a 
non-overcommitted system, your proposal is exactly the same as a fair 
lock with paravirtualization (except more expensive for the lock taker, 
even when there is no contention).

I think I understand why, on an overcommitted system, you could still 
have jitter with pv ticketlocks and not with your infrastructure.  The 
reason is that pv ticketlocks do not attempt to donate the quantum to 
the lock holder.  Is there anything we can do to fix *this*?  I would 
accept a new hypercall KVM_HC_HALT_AND_YIELD_TO_CPU that takes an APIC 
id, donates the quantum to that CPU, and puts the originating CPU in 
halted state.
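
(A rough sketch of what the host side of such a hypercall could look like, as another case in kvm_emulate_hypercall(); the hypercall number, the kvm_get_vcpu_by_apicid() lookup helper and the exact semantics are assumptions, not an agreed design:)

	case KVM_HC_HALT_AND_YIELD_TO_CPU: {
		/* a0 = APIC id of the vCPU to donate the quantum to. */
		struct kvm_vcpu *target =
			kvm_get_vcpu_by_apicid(vcpu->kvm, a0);	/* assumed helper */

		if (target && target != vcpu)
			kvm_vcpu_yield_to(target);	/* donate the remaining quantum */

		kvm_emulate_halt(vcpu);			/* caller enters halted state */
		ret = 0;
		break;
	}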

If this is not enough, it's up to you to disprove this and explain why 
the two have different jitter characteristics.  To do this, you need to 
implement paravirtualized fair locks in your kernel (and possibly 
halt-and-yield), measure the difference in jitter, *trace what's 
happening on the host to characterize the benefits of your solution*, etc.

>     - Using a hypercall to mark lock status does increase CPU
>     consumption. But the impact on the system depends very much on the
>     lock usage characteristics of the guest OS.
>
>       For a guest OS which typically takes kernel locks less frequently,
>     but holds each kernel lock for a longer operation, the overall impact
>     from the hypercall would *not* be an issue.

Again, if there is a jitter problem we want to fix it for all locks, 
like we did for pv ticketlocks.

Paolo
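
(For reference on the per-acquisition cost discussed above: with the proposed interface the guest issues one hypercall on lock acquire and one on release, i.e. two VM exits per critical section even without contention. A guest-side sketch, shown with Linux-style names for concreteness; the wrappers are illustrative, kvm_hypercall0() is the usual guest helper, and KVM_HC_LOCK_GET/KVM_HC_LOCK_RELEASE are the numbers proposed in this thread.)

static inline void lock_with_hint(spinlock_t *lock)
{
	spin_lock(lock);
	kvm_hypercall0(KVM_HC_LOCK_GET);	/* hint: this vCPU now holds a lock */
}

static inline void unlock_with_hint(spinlock_t *lock)
{
	kvm_hypercall0(KVM_HC_LOCK_RELEASE);	/* hint: about to drop the lock */
	spin_unlock(lock);
}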

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: Enhancement for PLE handler in KVM
  2014-03-11 16:14                         ` Paolo Bonzini
@ 2014-03-12 13:05                           ` Li, Bin (Bin)
  0 siblings, 0 replies; 13+ messages in thread
From: Li, Bin (Bin) @ 2014-03-12 13:05 UTC (permalink / raw)
  To: Paolo Bonzini
  Cc: Marcelo Tosatti, kvm@vger.kernel.org, Jatania, Neel (Neel),
	linux-kernel@vger.kernel.org

Thanks, Paolo, for the comments.

I understand the requirement to "fix it for all guest OSes".

I will investigate the "new hypercall KVM_HC_HALT_AND_YIELD_TO_CPU that
takes an APIC id, donates the quantum to that CPU, and puts the
originating CPU in halted state".

Regards
Bin

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: Enhancement for PLE handler in KVM
@ 2014-03-24 14:12 Raghavendra KT
  0 siblings, 0 replies; 13+ messages in thread
From: Raghavendra KT @ 2014-03-24 14:12 UTC (permalink / raw)
  To: Li, Bin (Bin)
  Cc: KVM, Neel Jatania, Linux Kernel Mailing List, Avi Kiviti,
	Srivatsa Vaddagiri, Peter Zijlstra, Mike Galbraith, Chris Wright,
	ttracy, Nakajima, Jun, Rik van Riel

On Mon, Mar 3, 2014 at 11:54 PM, Li, Bin (Bin)
<bin.bl.li@alcatel-lucent.com> wrote:
> Hello, all.
>
> The PLE handler attempts to determine an alternate vCPU to schedule.  In
> some cases the wrong vCPU is scheduled and performance suffers.
>
> This patch allows for the guest OS to signal, using a hypercall, that it's
> starting/ending a critical section.  Using this information in the PLE
> handler allows for a more intelligent VCPU scheduling determination to be
> made.  The patch only changes the PLE behaviour if this new hypercall
> mechanism is used; if it isn't used, then the existing PLE algorithm
> continues to be used to determine the next vCPU.
>
> Benefit from the patch:
>  - The guest OS real-time performance is significantly improved when
> using the hypercall to mark entering and leaving guest OS kernel state.
>  - The guest OS system clock jitter measured on an Intel E5 2620 is reduced
> from 400 ms down to 6 ms.
>  - The guest OS system clock is set to a 2 ms clock interrupt. The jitter is
> measured as the difference between the dtsc() value in the clock interrupt
> handler and the expected TSC value.
>  - Details of the test report are attached as reference.
>
> Patch details:
>
> From 77edfa193a4e29ab357ec3b1e097f8469d418507 Mon Sep 17 00:00:00 2001
> From: Bin BL LI <bin.bl.li@alcatel-lucent.com>
> Date: Mon, 3 Mar 2014 11:23:35 -0500
> Subject: [PATCH] Initial commit
>
> ---
>  arch/x86/kvm/x86.c            |    7 +++++++
>  include/linux/kvm_host.h      |   16 ++++++++++++++++
>  include/uapi/linux/kvm_para.h |    2 ++
>  virt/kvm/kvm_main.c           |   14 +++++++++++++-
>  4 files changed, 38 insertions(+), 1 deletions(-)
>
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index 39c28f0..e735de3 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -5582,6 +5582,7 @@ void kvm_arch_exit(void)
>  int kvm_emulate_halt(struct kvm_vcpu *vcpu)
>  {
>      ++vcpu->stat.halt_exits;
> +    kvm_vcpu_set_holding_lock(vcpu,false);
>      if (irqchip_in_kernel(vcpu->kvm)) {
>          vcpu->arch.mp_state = KVM_MP_STATE_HALTED;
>          return 1;
>

Joining late to comment on this :(.

Seeing that you are trying to set 'holding_lock' in the halt handling
path, I am just curious whether you could try
https://lkml.org/lkml/2013/7/22/41 to see if you get any benefit.
[We could not get any convincing benefit during the pv patch posting
and dropped it.]

And regarding SPIN_THRESHOLD tuning, I did some experiments dynamically
tuning the loop count based on the head and tail values (e.g. if we are
nearer to the lock holder in the queue, loop longer; a rough sketch is
below), but that also did not yield much result.
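
(Roughly what that experiment would look like; the scaling policy and the names below are illustrative only, with SPIN_THRESHOLD being the existing constant in the x86 spinlock code:)

/* Spin longer when close to the lock holder in the ticket queue,
 * give up (halt in the pv slowpath) sooner when far away. */
static inline unsigned int dynamic_spin_threshold(unsigned int my_tail,
						  unsigned int head)
{
	unsigned int waiters_ahead = my_tail - head;

	return SPIN_THRESHOLD >> min(waiters_ahead, 4U);
}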

[...]

^ permalink raw reply	[flat|nested] 13+ messages in thread

end of thread, other threads:[~2014-03-24 14:12 UTC | newest]

Thread overview: 13+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
     [not found] <53061044.2000009@alcatel-lucent.com>
     [not found] ` <530B9637.6030708@alcatel-lucent.com>
2014-03-03 18:24   ` Enhancement for PLE handler in KVM Li, Bin (Bin)
2014-03-03 19:20     ` Paolo Bonzini
2014-03-05 14:17       ` Li, Bin (Bin)
2014-03-05 14:49         ` Paolo Bonzini
2014-03-05 21:16           ` Li, Bin (Bin)
2014-03-07  3:06             ` Marcelo Tosatti
2014-03-07 14:26               ` Li, Bin (Bin)
2014-03-07 17:41                 ` Marcelo Tosatti
2014-03-07 22:08                   ` Li, Bin (Bin)
2014-03-08  1:54                     ` Marcelo Tosatti
     [not found]                       ` <531F19D7.6030909@alcatel-lucent.com>
2014-03-11 16:14                         ` Paolo Bonzini
2014-03-12 13:05                           ` Li, Bin (Bin)
2014-03-24 14:12 Raghavendra KT

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).