From: Anthony Liguori <anthony@codemonkey.ws>
To: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Jan Kiszka <jan.kiszka@web.de>,
Liu Ping Fan <pingfank@linux.vnet.ibm.com>,
kvm <kvm@vger.kernel.org>, qemu-devel <qemu-devel@nongnu.org>,
Alexander Graf <agraf@suse.de>, Avi Kivity <avi@redhat.com>
Subject: Re: [Qemu-devel] [PATCH] kvm: First step to push iothread lock out of inner run loop
Date: Thu, 28 Jun 2012 09:10:39 -0500 [thread overview]
Message-ID: <4FEC65DF.20504@codemonkey.ws> (raw)
In-Reply-To: <20120626193420.GA19852@amt.cnet>
On 06/26/2012 02:34 PM, Marcelo Tosatti wrote:
> On Sat, Jun 23, 2012 at 12:55:49AM +0200, Jan Kiszka wrote:
>> Should have declared this [RFC] in the subject and CC'ed kvm...
>>
>> On 2012-06-23 00:45, Jan Kiszka wrote:
>>> This sketches a possible path to get rid of the iothread lock on vmexits
>>> in KVM mode. On x86, the the in-kernel irqchips has to be used because
>>> we otherwise need to synchronize APIC and other per-cpu state accesses
>>> that could be changed concurrently. Not yet fully analyzed is the NMI
>>> injection path in the absence of an APIC.
>>>
>>> s390x should be fine without specific locking as their pre/post-run
>>> callbacks are empty. Power requires locking for the pre-run callback.
>>>
>>> This patch is untested, but a similar version was successfully used in
>>> a x86 setup with a network I/O path that needed no central iothread
>>> locking anymore (required special MMIO exit handling).
>>> ---
>>> kvm-all.c | 18 ++++++++++++++++--
>>> target-i386/kvm.c | 7 +++++++
>>> target-ppc/kvm.c | 4 ++++
>>> 3 files changed, 27 insertions(+), 2 deletions(-)
>>>
>>> diff --git a/kvm-all.c b/kvm-all.c
>>> index f8e4328..9c3e26f 100644
>>> --- a/kvm-all.c
>>> +++ b/kvm-all.c
>>> @@ -1460,6 +1460,8 @@ int kvm_cpu_exec(CPUArchState *env)
>>> return EXCP_HLT;
>>> }
>>>
>>> + qemu_mutex_unlock_iothread();
>>> +
>>> do {
>>> if (env->kvm_vcpu_dirty) {
>>> kvm_arch_put_registers(env, KVM_PUT_RUNTIME_STATE);
>>> @@ -1476,14 +1478,16 @@ int kvm_cpu_exec(CPUArchState *env)
>>> */
>>> qemu_cpu_kick_self();
>>> }
>>> - qemu_mutex_unlock_iothread();
>>>
>>> run_ret = kvm_vcpu_ioctl(env, KVM_RUN, 0);
>>>
>>> - qemu_mutex_lock_iothread();
>>> kvm_arch_post_run(env, run);
>>>
>>> + /* TODO: push coalesced mmio flushing to the point where we access
>>> + * devices that are using it (currently VGA and E1000). */
>>> + qemu_mutex_lock_iothread();
>>> kvm_flush_coalesced_mmio_buffer();
>>> + qemu_mutex_unlock_iothread();
>>>
>>> if (run_ret< 0) {
>>> if (run_ret == -EINTR || run_ret == -EAGAIN) {
>>> @@ -1499,19 +1503,23 @@ int kvm_cpu_exec(CPUArchState *env)
>>> switch (run->exit_reason) {
>>> case KVM_EXIT_IO:
>>> DPRINTF("handle_io\n");
>>> + qemu_mutex_lock_iothread();
>>> kvm_handle_io(run->io.port,
>>> (uint8_t *)run + run->io.data_offset,
>>> run->io.direction,
>>> run->io.size,
>>> run->io.count);
>>> + qemu_mutex_unlock_iothread();
>>> ret = 0;
>>> break;
>>> case KVM_EXIT_MMIO:
>>> DPRINTF("handle_mmio\n");
>>> + qemu_mutex_lock_iothread();
>>> cpu_physical_memory_rw(run->mmio.phys_addr,
>>> run->mmio.data,
>>> run->mmio.len,
>>> run->mmio.is_write);
>>> + qemu_mutex_unlock_iothread();
>>> ret = 0;
>>> break;
>>> case KVM_EXIT_IRQ_WINDOW_OPEN:
>>> @@ -1520,7 +1528,9 @@ int kvm_cpu_exec(CPUArchState *env)
>>> break;
>>> case KVM_EXIT_SHUTDOWN:
>>> DPRINTF("shutdown\n");
>>> + qemu_mutex_lock_iothread();
>>> qemu_system_reset_request();
>>> + qemu_mutex_unlock_iothread();
>>> ret = EXCP_INTERRUPT;
>>> break;
>>> case KVM_EXIT_UNKNOWN:
>>> @@ -1533,11 +1543,15 @@ int kvm_cpu_exec(CPUArchState *env)
>>> break;
>>> default:
>>> DPRINTF("kvm_arch_handle_exit\n");
>>> + qemu_mutex_lock_iothread();
>>> ret = kvm_arch_handle_exit(env, run);
>>> + qemu_mutex_unlock_iothread();
>>> break;
>>> }
>>> } while (ret == 0);
>>>
>>> + qemu_mutex_lock_iothread();
>>> +
>>> if (ret< 0) {
>>> cpu_dump_state(env, stderr, fprintf, CPU_DUMP_CODE);
>>> vm_stop(RUN_STATE_INTERNAL_ERROR);
>>> diff --git a/target-i386/kvm.c b/target-i386/kvm.c
>>> index 0d0d8f6..0ad64d1 100644
>>> --- a/target-i386/kvm.c
>>> +++ b/target-i386/kvm.c
>>> @@ -1631,7 +1631,10 @@ void kvm_arch_pre_run(CPUX86State *env, struct kvm_run *run)
>>>
>>> /* Inject NMI */
>>> if (env->interrupt_request& CPU_INTERRUPT_NMI) {
>>> + qemu_mutex_lock_iothread();
>>> env->interrupt_request&= ~CPU_INTERRUPT_NMI;
>>> + qemu_mutex_unlock_iothread();
>>> +
>>> DPRINTF("injected NMI\n");
>>> ret = kvm_vcpu_ioctl(env, KVM_NMI);
>>> if (ret< 0) {
>>> @@ -1641,6 +1644,8 @@ void kvm_arch_pre_run(CPUX86State *env, struct kvm_run *run)
>>> }
>>>
>>> if (!kvm_irqchip_in_kernel()) {
>>> + qemu_mutex_lock_iothread();
>>> +
>>> /* Force the VCPU out of its inner loop to process any INIT requests
>>> * or pending TPR access reports. */
>>> if (env->interrupt_request&
>>> @@ -1682,6 +1687,8 @@ void kvm_arch_pre_run(CPUX86State *env, struct kvm_run *run)
>>>
>>> DPRINTF("setting tpr\n");
>>> run->cr8 = cpu_get_apic_tpr(env->apic_state);
>>> +
>>> + qemu_mutex_unlock_iothread();
>>> }
>>> }
>>>
>>> diff --git a/target-ppc/kvm.c b/target-ppc/kvm.c
>>> index c09cc39..60d91a5 100644
>>> --- a/target-ppc/kvm.c
>>> +++ b/target-ppc/kvm.c
>>> @@ -471,6 +471,8 @@ void kvm_arch_pre_run(CPUPPCState *env, struct kvm_run *run)
>>> int r;
>>> unsigned irq;
>>>
>>> + qemu_mutex_lock_iothread();
>>> +
>>> /* PowerPC QEMU tracks the various core input pins (interrupt, critical
>>> * interrupt, reset, etc) in PPC-specific env->irq_input_state. */
>>> if (!cap_interrupt_level&&
>>> @@ -497,6 +499,8 @@ void kvm_arch_pre_run(CPUPPCState *env, struct kvm_run *run)
>>> /* We don't know if there are more interrupts pending after this. However,
>>> * the guest will return to userspace in the course of handling this one
>>> * anyways, so we will get a chance to deliver the rest. */
>>> +
>>> + qemu_mutex_unlock_iothread();
>>> }
>>>
>>> void kvm_arch_post_run(CPUPPCState *env, struct kvm_run *run)
>>>
>
> The following plan would allow progressive convertion to parallel
> operation.
>
> Jan mentioned the MMIO handler->MMIO handler deadlock in a private message.
>
> Jan: if there is recursive MMIO accesses, you can detect that and skip
> such MMIO handlers in dev_can_use_lock() ? Or blacklist.
>
> What i mentioned earlier about "unsolved issues" are the possibilities
> in "iothread flow" (belief is that it would be better determined at
> with execution experience / tuning).
>
> OK, so, each document starts with xyz.txt. This is compatible with TCG,
> the critical part is not dropping/acquire locks for decent performance
> (that is, only drop memmap_lock from TCG VCPUS thread if IOTHREAD
> requests so).
>
> This came out from discussions with Avi.
>
> immediate-plan.txt
> ------------------
>
> 1. read_lock(memmap_lock)
> 2. MemoryRegionSection mrs = lookup(addr)
> 3. qom_ref(mrs.mr->dev)
> 4. read_unlock(memmap_lock)
>
> 5. mutex_lock(dev->lock)
> 6. dispatch(&mrs, addr, data, size)
> 7. mutex_unlock(dev->lock)
Just a detail, I don't think we should acquire a device specific lock in global
code. Rather, I think we should acquire the global lock before dispatch unless
a MemoryRegion is marked as being unlocked.
Regards,
Anthony Liguori
>
> 8. qom_unref(mrs.mr->object)
>
>
> A) Add Device object to MemoryRegions.
>
> memory_region_add_subregion
> memory_region_add_eventfd
> memory_region_add_subregion_overlap
>
>
> B) qom_ref details
>
> Z) see device-hotplug.txt items
>
>
>
> lock-model.txt
> --------------
>
>
> lock_device(dev) {
> if (dev_can_use_lock(dev))
> mutex_lock(dev->lock)
> else
> mutex_lock(qemu_global_mutex)
> }
>
> dev_can_use_lock(dev)
> {
> if (all services used by dev mmio handler have)
> - qemu_global_mutex
> - trylock(dev->lock) and fail
>
> then yes
> else
> then no
> }
>
>
> That is, from the moment in which the device uses the order
> (dev->lock -> qemu_global_mutex), the qemu_global_mutex -> dev->lock
> is forbidden by the IOTHREAD or anyone else.
>
>
> convert-to-unlocked.txt
> -----------------------
>
>
> 1. read_lock(memmap_lock)
> 2. MemoryRegionSection mrs = lookup(addr)
> 3. qom_ref(mrs.mr->dev)
> 4. read_unlock(memmap_lock)
>
> 5. mutex_lock(dev->lock)
> 6. dispatch(&mrs, addr, data, size)
> 7. mutex_unlock(dev->lock)
>
> 8. qom_unref(mrs.mr->object)
>
> THE ITEMS BELOW SHOULD BE DOCUMENTATION TO CONVERT DEVICE
> TO UNLOCKED OPERATION.
>
> 1) qom_ref guarantees that the Device object does not disappear. However,
> these operations are allowed in parallel:
>
> a) after 4. device memory and ioports might change or be unregistered.
>
> Concurrent accesses of a device that is being hotunplugged or
> whose mappings change are responsability of the OS, so
> incorrect emulation is not QEMU's problem. However, device
> emulation is now subject to:
> - cpu_physical_memory_map failing.
> - stl_phys* failing.
> - cpu_physical_memory_rw failing/reading from unassigned mem.
>
> b) what should happen with a pending cpu_physical_memory_map pointer,
> when deleting the corresponding memory region?
>
>
> net.txt
> --------
>
>
> iothread flow
> =============
>
> 1) Skip-work-if-device-locked
>
> select(tap fd ready)
> tap_send
> if (trylock(TAPState->NetClientState->dev))
> proceed;
> else
> continue; (data remains in queue)
> tap_read_packet
> qemu_send_packet_async
>
> DRAWBACK: lock intensive.
> DRAWBACK: VCPU threads can starve IOTHREAD (can be fixed with
> a scheme such as trylock() marks a flag saying "iothread wants lock".
>
> Checking each subsystem for possibility:
>
> - tap_send: OK
> - user,slirp: if not possible, use device->lock.
> - qemu-char: OK
> - interrupts: lock with qemu_global_mutex.
> - qemutimer:
> - read time: qemu_get_clock_ns (see later).
>
>
> 2) Queue-work-to-vcpu-context
>
> select(tap fd ready)
> tap_send
> if (trylock(TAPState->NetClientState->dev)) {
> proceed;
> } else {
> AddToDeviceWorkQueue(work);
> }
> tap_read_packet
> qemu_send_packet_async
>
> DRAWBACK: lock intensive
> DRAWBACK: cache effects
>
> 3) VCPU-thread-notify-device-unlocked
>
> select(tap fd ready)
> tap_send
> if (trylock(TAPState->NetClientState->dev)) {
> proceed;
> } else {
> signal VCPU thread to notify IOTHREAD when unlocked.
> }
> tap_read_packet
>
>
> GENERAL OBJECTIVE: to avoid lock inversion. IOTHREAD should follow
> same order.
>
>
> PCI HOTPLUG EJECT FLOW
> ----------------------
>
> 1) Monitor initiated.
>
> monitor -> qmp_device_del -> qdev_unplug -> pci_unplug_device ->
> piix4_device_hotplug -> SCI interrupt -> OS removes
> application users from device -> OS runs _EJ0 ->
>
> For hot removal, the device must be immediately ejected when OSPM calls
> the _EJ0 control method. The _EJ0 control method does not return until
> ejection is complete. After calling _EJ0, OSPM verifies the device no
> longer exists to determine if the eject succeeded. For _HID devices,
> OSPM evaluates the _STA method. For _ADR devices, OSPM checks with the
> bus driver for that device.
>
>
> Avi: A pci eject guarantees that all accesses
> started after the eject completes do not see the device.
>
> Can you do:
>
> - qobject_remove()
>
> and have the device dispatch running safely? NO, therefore pci_unplug_device
> must hold dev->lock.
>
>
>
>
next prev parent reply other threads:[~2012-06-28 14:10 UTC|newest]
Thread overview: 56+ messages / expand[flat|nested] mbox.gz Atom feed top
2012-06-22 22:45 [Qemu-devel] [PATCH] kvm: First step to push iothread lock out of inner run loop Jan Kiszka
2012-06-22 22:55 ` Jan Kiszka
2012-06-22 22:55 ` [Qemu-devel] " Jan Kiszka
2012-06-23 0:22 ` Marcelo Tosatti
2012-06-23 0:22 ` [Qemu-devel] " Marcelo Tosatti
2012-06-23 9:06 ` Marcelo Tosatti
2012-06-23 9:06 ` [Qemu-devel] " Marcelo Tosatti
2012-06-23 11:45 ` Jan Kiszka
2012-06-23 11:45 ` [Qemu-devel] " Jan Kiszka
2012-06-24 8:49 ` Avi Kivity
2012-06-24 8:49 ` [Qemu-devel] " Avi Kivity
2012-06-24 14:08 ` Jan Kiszka
2012-06-24 14:08 ` Jan Kiszka
2012-06-24 14:31 ` Avi Kivity
2012-06-24 14:31 ` Avi Kivity
2012-07-06 17:16 ` Jan Kiszka
2012-07-06 17:16 ` [Qemu-devel] " Jan Kiszka
2012-07-06 18:06 ` Jan Kiszka
2012-07-06 18:06 ` [Qemu-devel] " Jan Kiszka
2012-07-08 7:49 ` Avi Kivity
2012-07-08 7:49 ` [Qemu-devel] " Avi Kivity
2012-06-24 13:34 ` liu ping fan
2012-06-24 13:34 ` [Qemu-devel] " liu ping fan
2012-06-24 14:08 ` Jan Kiszka
2012-06-24 14:08 ` [Qemu-devel] " Jan Kiszka
2012-06-24 14:35 ` Avi Kivity
2012-06-24 14:35 ` [Qemu-devel] " Avi Kivity
2012-06-24 14:40 ` Jan Kiszka
2012-06-24 14:40 ` [Qemu-devel] " Jan Kiszka
2012-06-24 14:46 ` Avi Kivity
2012-06-24 14:46 ` [Qemu-devel] " Avi Kivity
2012-06-24 14:51 ` Jan Kiszka
2012-06-24 14:51 ` [Qemu-devel] " Jan Kiszka
2012-06-24 14:56 ` Avi Kivity
2012-06-24 14:56 ` [Qemu-devel] " Avi Kivity
2012-06-24 14:58 ` Jan Kiszka
2012-06-24 14:58 ` [Qemu-devel] " Jan Kiszka
2012-06-24 14:59 ` Avi Kivity
2012-06-24 14:59 ` [Qemu-devel] " Avi Kivity
2012-06-23 9:22 ` Jan Kiszka
2012-06-23 9:22 ` [Qemu-devel] " Jan Kiszka
2012-06-28 1:11 ` Marcelo Tosatti
2012-06-26 19:34 ` Marcelo Tosatti
2012-06-27 7:39 ` Stefan Hajnoczi
2012-06-27 7:41 ` [Qemu-devel] " Stefan Hajnoczi
2012-06-27 11:09 ` Marcelo Tosatti
2012-06-27 11:19 ` [Qemu-devel] " Marcelo Tosatti
2012-06-28 8:45 ` Stefan Hajnoczi
2012-06-27 7:54 ` Avi Kivity
2012-06-27 14:36 ` Jan Kiszka
2012-06-28 14:10 ` Anthony Liguori [this message]
2012-06-28 15:12 ` [Qemu-devel] " Avi Kivity
2012-06-29 1:29 ` Marcelo Tosatti
2012-06-29 1:45 ` [Qemu-devel] " Marcelo Tosatti
2012-06-22 22:59 ` Anthony Liguori
2012-06-23 9:11 ` Jan Kiszka
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=4FEC65DF.20504@codemonkey.ws \
--to=anthony@codemonkey.ws \
--cc=agraf@suse.de \
--cc=avi@redhat.com \
--cc=jan.kiszka@web.de \
--cc=kvm@vger.kernel.org \
--cc=mtosatti@redhat.com \
--cc=pingfank@linux.vnet.ibm.com \
--cc=qemu-devel@nongnu.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.