From: Anthony Liguori
Subject: Re: [Qemu-devel] [PATCH] kvm: First step to push iothread lock out of inner run loop
Date: Thu, 28 Jun 2012 09:10:39 -0500
Message-ID: <4FEC65DF.20504@codemonkey.ws>
In-Reply-To: <20120626193420.GA19852@amt.cnet>
References: <4FE4F56D.1020201@web.de> <4FE4F7F5.7030400@web.de> <20120626193420.GA19852@amt.cnet>
To: Marcelo Tosatti
Cc: Jan Kiszka, Liu Ping Fan, kvm, qemu-devel, Alexander Graf, Avi Kivity

On 06/26/2012 02:34 PM, Marcelo Tosatti wrote:
> On Sat, Jun 23, 2012 at 12:55:49AM +0200, Jan Kiszka wrote:
>> Should have declared this [RFC] in the subject and CC'ed kvm...
>>
>> On 2012-06-23 00:45, Jan Kiszka wrote:
>>> This sketches a possible path to get rid of the iothread lock on vmexits
>>> in KVM mode. On x86, the in-kernel irqchip has to be used because we
>>> would otherwise need to synchronize APIC and other per-cpu state
>>> accesses that could be changed concurrently. Not yet fully analyzed is
>>> the NMI injection path in the absence of an APIC.
>>>
>>> s390x should be fine without specific locking as its pre/post-run
>>> callbacks are empty. Power requires locking for the pre-run callback.
>>>
>>> This patch is untested, but a similar version was successfully used in
>>> an x86 setup with a network I/O path that needed no central iothread
>>> locking anymore (it required special MMIO exit handling).
>>> ---
>>>  kvm-all.c         |   18 ++++++++++++++++--
>>>  target-i386/kvm.c |    7 +++++++
>>>  target-ppc/kvm.c  |    4 ++++
>>>  3 files changed, 27 insertions(+), 2 deletions(-)
>>>
>>> diff --git a/kvm-all.c b/kvm-all.c
>>> index f8e4328..9c3e26f 100644
>>> --- a/kvm-all.c
>>> +++ b/kvm-all.c
>>> @@ -1460,6 +1460,8 @@ int kvm_cpu_exec(CPUArchState *env)
>>>          return EXCP_HLT;
>>>      }
>>>
>>> +    qemu_mutex_unlock_iothread();
>>> +
>>>      do {
>>>          if (env->kvm_vcpu_dirty) {
>>>              kvm_arch_put_registers(env, KVM_PUT_RUNTIME_STATE);
>>> @@ -1476,14 +1478,16 @@
>>>               */
>>>              qemu_cpu_kick_self();
>>>          }
>>> -        qemu_mutex_unlock_iothread();
>>>
>>>          run_ret = kvm_vcpu_ioctl(env, KVM_RUN, 0);
>>>
>>> -        qemu_mutex_lock_iothread();
>>>          kvm_arch_post_run(env, run);
>>>
>>> +        /* TODO: push coalesced mmio flushing to the point where we access
>>> +         * devices that are using it (currently VGA and E1000). */
>>> +        qemu_mutex_lock_iothread();
>>>          kvm_flush_coalesced_mmio_buffer();
>>> +        qemu_mutex_unlock_iothread();
>>>
>>>          if (run_ret < 0) {
>>>              if (run_ret == -EINTR || run_ret == -EAGAIN) {
>>> @@ -1499,19 +1503,23 @@ int kvm_cpu_exec(CPUArchState *env)
>>>          switch (run->exit_reason) {
>>>          case KVM_EXIT_IO:
>>>              DPRINTF("handle_io\n");
>>> +            qemu_mutex_lock_iothread();
>>>              kvm_handle_io(run->io.port,
>>>                            (uint8_t *)run + run->io.data_offset,
>>>                            run->io.direction,
>>>                            run->io.size,
>>>                            run->io.count);
>>> +            qemu_mutex_unlock_iothread();
>>>              ret = 0;
>>>              break;
>>>          case KVM_EXIT_MMIO:
>>>              DPRINTF("handle_mmio\n");
>>> +            qemu_mutex_lock_iothread();
>>>              cpu_physical_memory_rw(run->mmio.phys_addr,
>>>                                     run->mmio.data,
>>>                                     run->mmio.len,
>>>                                     run->mmio.is_write);
>>> +            qemu_mutex_unlock_iothread();
>>>              ret = 0;
>>>              break;
>>>          case KVM_EXIT_IRQ_WINDOW_OPEN:
>>> @@ -1520,7 +1528,9 @@ int kvm_cpu_exec(CPUArchState *env)
>>>              break;
>>>          case KVM_EXIT_SHUTDOWN:
>>>              DPRINTF("shutdown\n");
>>> +            qemu_mutex_lock_iothread();
>>>              qemu_system_reset_request();
>>> +            qemu_mutex_unlock_iothread();
>>>              ret = EXCP_INTERRUPT;
>>>              break;
>>>          case KVM_EXIT_UNKNOWN:
>>> @@ -1533,11 +1543,15 @@ int kvm_cpu_exec(CPUArchState *env)
>>>              break;
>>>          default:
>>>              DPRINTF("kvm_arch_handle_exit\n");
>>> +            qemu_mutex_lock_iothread();
>>>              ret = kvm_arch_handle_exit(env, run);
>>> +            qemu_mutex_unlock_iothread();
>>>              break;
>>>          }
>>>      } while (ret == 0);
>>>
>>> +    qemu_mutex_lock_iothread();
>>> +
>>>      if (ret < 0) {
>>>          cpu_dump_state(env, stderr, fprintf, CPU_DUMP_CODE);
>>>          vm_stop(RUN_STATE_INTERNAL_ERROR);
>>> diff --git a/target-i386/kvm.c b/target-i386/kvm.c
>>> index 0d0d8f6..0ad64d1 100644
>>> --- a/target-i386/kvm.c
>>> +++ b/target-i386/kvm.c
>>> @@ -1631,7 +1631,10 @@ void kvm_arch_pre_run(CPUX86State *env, struct kvm_run *run)
>>>
>>>      /* Inject NMI */
>>>      if (env->interrupt_request & CPU_INTERRUPT_NMI) {
>>> +        qemu_mutex_lock_iothread();
>>>          env->interrupt_request &= ~CPU_INTERRUPT_NMI;
>>> +        qemu_mutex_unlock_iothread();
>>> +
>>>          DPRINTF("injected NMI\n");
>>>          ret = kvm_vcpu_ioctl(env, KVM_NMI);
>>>          if (ret < 0) {
>>> @@ -1641,6 +1644,8 @@ void kvm_arch_pre_run(CPUX86State *env, struct kvm_run *run)
>>>      }
>>>
>>>      if (!kvm_irqchip_in_kernel()) {
>>> +        qemu_mutex_lock_iothread();
>>> +
>>>          /* Force the VCPU out of its inner loop to process any INIT requests
>>>           * or pending TPR access reports. */
>>>          if (env->interrupt_request &
>>> @@ -1682,6 +1687,8 @@ void kvm_arch_pre_run(CPUX86State *env, struct kvm_run *run)
>>>
>>>          DPRINTF("setting tpr\n");
>>>          run->cr8 = cpu_get_apic_tpr(env->apic_state);
>>> +
>>> +        qemu_mutex_unlock_iothread();
>>>      }
>>>  }
>>>
>>> diff --git a/target-ppc/kvm.c b/target-ppc/kvm.c
>>> index c09cc39..60d91a5 100644
>>> --- a/target-ppc/kvm.c
>>> +++ b/target-ppc/kvm.c
>>> @@ -471,6 +471,8 @@ void kvm_arch_pre_run(CPUPPCState *env, struct kvm_run *run)
>>>      int r;
>>>      unsigned irq;
>>>
>>> +    qemu_mutex_lock_iothread();
>>> +
>>>      /* PowerPC QEMU tracks the various core input pins (interrupt, critical
>>>       * interrupt, reset, etc) in PPC-specific env->irq_input_state. */
>>>      if (!cap_interrupt_level &&
>>> @@ -497,6 +499,8 @@ void kvm_arch_pre_run(CPUPPCState *env, struct kvm_run *run)
>>>      /* We don't know if there are more interrupts pending after this. However,
>>>       * the guest will return to userspace in the course of handling this one
>>>       * anyways, so we will get a chance to deliver the rest. */
>>> +
>>> +    qemu_mutex_unlock_iothread();
>>>  }
>>>
>>>  void kvm_arch_post_run(CPUPPCState *env, struct kvm_run *run)
>>>
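To restate the locking structure the patch ends up with in one place, here
is a minimal sketch of the loop (not meant to compile against any
particular tree; dirty-state handling, the self-kick, and most exit
reasons are elided):

int kvm_cpu_exec(CPUArchState *env)   /* condensed sketch only */
{
    struct kvm_run *run = env->kvm_run;
    int ret, run_ret;

    qemu_mutex_unlock_iothread();     /* run the whole loop unlocked */

    do {
        kvm_arch_pre_run(env, run);   /* now takes the lock itself if needed */
        run_ret = kvm_vcpu_ioctl(env, KVM_RUN, 0);  /* lock-free vmexit */
        kvm_arch_post_run(env, run);

        if (run_ret < 0) {
            ret = -1;                 /* EINTR/EAGAIN handling elided */
            break;
        }

        switch (run->exit_reason) {
        case KVM_EXIT_MMIO:
            /* device access is the only part still under the global lock */
            qemu_mutex_lock_iothread();
            cpu_physical_memory_rw(run->mmio.phys_addr, run->mmio.data,
                                   run->mmio.len, run->mmio.is_write);
            qemu_mutex_unlock_iothread();
            ret = 0;
            break;
        default:                      /* KVM_EXIT_IO etc. follow the same
                                       * lock/dispatch/unlock pattern */
            qemu_mutex_lock_iothread();
            ret = kvm_arch_handle_exit(env, run);
            qemu_mutex_unlock_iothread();
            break;
        }
    } while (ret == 0);

    qemu_mutex_lock_iothread();       /* callers still expect it held */
    return ret;
}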
>
> The following plan would allow progressive conversion to parallel
> operation.
>
> Jan mentioned the MMIO handler -> MMIO handler deadlock in a private
> message.
>
> Jan: if there are recursive MMIO accesses, can you detect that and skip
> such MMIO handlers in dev_can_use_lock()? Or blacklist them.
>
> What I mentioned earlier about "unsolved issues" are the possibilities
> in the "iothread flow" (the belief is that this is better determined
> with execution experience / tuning).
>
> OK, so: each document below starts with its xyz.txt name. This is
> compatible with TCG; the critical part for decent performance is not
> dropping/acquiring locks (that is, only drop memmap_lock from TCG VCPU
> threads if the IOTHREAD requests it).
>
> This came out of discussions with Avi.
>
> immediate-plan.txt
> ------------------
>
> 1. read_lock(memmap_lock)
> 2. MemoryRegionSection mrs = lookup(addr)
> 3. qom_ref(mrs.mr->dev)
> 4. read_unlock(memmap_lock)
>
> 5. mutex_lock(dev->lock)
> 6. dispatch(&mrs, addr, data, size)
> 7. mutex_unlock(dev->lock)

Just a detail, I don't think we should acquire a device-specific lock in
global code. Rather, I think we should acquire the global lock before
dispatch unless a MemoryRegion is marked as being unlocked.

Regards,

Anthony Liguori

>
> 8. qom_unref(mrs.mr->object)
>
>
> A) Add Device object to MemoryRegions.
>
>    memory_region_add_subregion
>    memory_region_add_eventfd
>    memory_region_add_subregion_overlap
>
> B) qom_ref details
>
> Z) see device-hotplug.txt items
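In C, steps 1-8 above would look roughly like the following sketch. To be
clear about what is hypothetical here: memmap_lock, qom_ref/qom_unref, the
per-device lock, and the mr->dev back-pointer are all things the plan
proposes to add (item A), and memory_section_lookup is a made-up helper
name:

static void dispatch_mmio(target_phys_addr_t addr, uint8_t *data,
                          unsigned size)
{
    MemoryRegionSection mrs;
    DeviceState *dev;

    read_lock(&memmap_lock);                /* 1. */
    mrs = memory_section_lookup(addr);      /* 2. hypothetical lookup */
    dev = mrs.mr->dev;                      /* back-pointer from item A) */
    qom_ref(dev);                           /* 3. pin the device object */
    read_unlock(&memmap_lock);              /* 4. */

    mutex_lock(&dev->lock);                 /* 5. per-device lock */
    dispatch(&mrs, addr, data, size);       /* 6. run the device handler */
    mutex_unlock(&dev->lock);               /* 7. */

    qom_unref(dev);                         /* 8. last ref may finalize it */
}

With the amendment above, step 5 would instead take qemu_global_mutex
unless mrs.mr is explicitly marked as unlocked.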
>
>
> lock-model.txt
> --------------
>
> lock_device(dev) {
>     if (dev_can_use_lock(dev))
>         mutex_lock(dev->lock)
>     else
>         mutex_lock(qemu_global_mutex)
> }
>
> dev_can_use_lock(dev)
> {
>     if (all services used by dev's mmio handler either
>         - take qemu_global_mutex, or
>         - trylock(dev->lock) and tolerate failure)
>         then yes
>     else
>         then no
> }
>
> That is, from the moment the device uses the order
> (dev->lock -> qemu_global_mutex), the reverse order
> (qemu_global_mutex -> dev->lock) is forbidden for the IOTHREAD and
> anyone else.
>
>
> convert-to-unlocked.txt
> -----------------------
>
> 1. read_lock(memmap_lock)
> 2. MemoryRegionSection mrs = lookup(addr)
> 3. qom_ref(mrs.mr->dev)
> 4. read_unlock(memmap_lock)
>
> 5. mutex_lock(dev->lock)
> 6. dispatch(&mrs, addr, data, size)
> 7. mutex_unlock(dev->lock)
>
> 8. qom_unref(mrs.mr->object)
>
> THE ITEMS BELOW SHOULD BE DOCUMENTATION FOR CONVERTING A DEVICE
> TO UNLOCKED OPERATION.
>
> 1) qom_ref guarantees that the Device object does not disappear.
>    However, these operations are allowed in parallel:
>
>    a) After 4., device memory and ioports might change or be
>       unregistered.
>
>       Concurrent accesses to a device that is being hot-unplugged, or
>       whose mappings change, are the responsibility of the OS, so
>       incorrect emulation is not QEMU's problem. However, device
>       emulation is now subject to:
>       - cpu_physical_memory_map failing.
>       - stl_phys* failing.
>       - cpu_physical_memory_rw failing/reading from unassigned memory.
>
>    b) What should happen with a pending cpu_physical_memory_map pointer
>       when the corresponding memory region is deleted?
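For 1a), device models would need to grow failure paths along these lines.
cpu_physical_memory_map/unmap are the real APIs; MyDeviceState and
mydev_dma_read are illustrative names, and the exact failure semantics are
an assumption of the plan, not current behavior:

static void mydev_dma_read(MyDeviceState *s, target_phys_addr_t addr,
                           target_phys_addr_t len)
{
    target_phys_addr_t plen = len;
    void *buf = cpu_physical_memory_map(addr, &plen, 0 /* read */);

    if (buf == NULL || plen < len) {
        /* The region was unregistered or shrank between lookup and map
         * (concurrent hot-unplug/remap).  Per the text above this is the
         * guest OS's problem; drop the transaction instead of crashing. */
        if (buf) {
            cpu_physical_memory_unmap(buf, plen, 0, 0);
        }
        return;
    }
    /* ... consume buf ... */
    cpu_physical_memory_unmap(buf, plen, 0, plen);
}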
>
>
> net.txt
> -------
>
> iothread flow
> =============
>
> 1) Skip-work-if-device-locked
>
>    select(tap fd ready)
>    tap_send
>        if (trylock(TAPState->NetClientState->dev))
>            proceed;
>        else
>            continue; (data remains in queue)
>    tap_read_packet
>    qemu_send_packet_async
>
>    DRAWBACK: lock intensive.
>    DRAWBACK: VCPU threads can starve the IOTHREAD (can be fixed with a
>    scheme where a failed trylock() sets a flag saying "iothread wants
>    the lock").
>
>    Checking each subsystem for feasibility:
>
>    - tap_send: OK
>    - user,slirp: if not possible, use device->lock.
>    - qemu-char: OK
>    - interrupts: lock with qemu_global_mutex.
>    - qemu-timer:
>      - read time: qemu_get_clock_ns (see later).
>
> 2) Queue-work-to-vcpu-context
>
>    select(tap fd ready)
>    tap_send
>        if (trylock(TAPState->NetClientState->dev)) {
>            proceed;
>        } else {
>            AddToDeviceWorkQueue(work);
>        }
>    tap_read_packet
>    qemu_send_packet_async
>
>    DRAWBACK: lock intensive
>    DRAWBACK: cache effects
>
> 3) VCPU-thread-notify-device-unlocked
>
>    select(tap fd ready)
>    tap_send
>        if (trylock(TAPState->NetClientState->dev)) {
>            proceed;
>        } else {
>            signal VCPU thread to notify IOTHREAD when unlocked.
>        }
>    tap_read_packet
>
>
> GENERAL OBJECTIVE: avoid lock inversion. The IOTHREAD should follow the
> same lock order.
>
>
> PCI HOTPLUG EJECT FLOW
> ----------------------
>
> 1) Monitor initiated.
>
>    monitor -> qmp_device_del -> qdev_unplug -> pci_unplug_device ->
>    piix4_device_hotplug -> SCI interrupt -> OS removes application
>    users from device -> OS runs _EJ0 ->
>
>    For hot removal, the device must be immediately ejected when OSPM
>    calls the _EJ0 control method. The _EJ0 control method does not
>    return until ejection is complete. After calling _EJ0, OSPM verifies
>    the device no longer exists to determine whether the eject
>    succeeded. For _HID devices, OSPM evaluates the _STA method. For
>    _ADR devices, OSPM checks with the bus driver for that device.
>
>
> Avi: a PCI eject guarantees that all accesses started after the eject
> completes do not see the device.
>
> Can you do:
>
> - qobject_remove()
>
> and have the device dispatch still running safely? NO, therefore
> pci_unplug_device must hold dev->lock.
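To make that conclusion concrete, a sketch in the same spirit as the plan
above. Everything here is illustrative: dev->lock, qom_unref and
unregister_device_memory_regions are assumed names from the proposal, not
existing QEMU APIs:

static void pci_unplug_device(PCIDevice *dev)
{
    /* Taking dev->lock waits out any dispatch() currently running under
     * it; dropping the regions under memmap_lock then ensures new
     * lookups no longer find the device, so no new dispatch can start. */
    mutex_lock(&dev->qdev.lock);
    unregister_device_memory_regions(dev);  /* write-locks memmap_lock */
    mutex_unlock(&dev->qdev.lock);

    /* The object itself stays alive until the last in-flight qom_ref
     * taken in step 3 of the dispatch path is dropped. */
    qom_unref(&dev->qdev);
}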