From: Anthony Liguori <anthony@codemonkey.ws>
To: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Jan Kiszka <jan.kiszka@web.de>,
Liu Ping Fan <pingfank@linux.vnet.ibm.com>,
kvm <kvm@vger.kernel.org>, qemu-devel <qemu-devel@nongnu.org>,
Alexander Graf <agraf@suse.de>, Avi Kivity <avi@redhat.com>
Subject: Re: [Qemu-devel] [PATCH] kvm: First step to push iothread lock out of inner run loop
Date: Thu, 28 Jun 2012 09:10:39 -0500 [thread overview]
Message-ID: <4FEC65DF.20504@codemonkey.ws> (raw)
In-Reply-To: <20120626193420.GA19852@amt.cnet>
On 06/26/2012 02:34 PM, Marcelo Tosatti wrote:
> On Sat, Jun 23, 2012 at 12:55:49AM +0200, Jan Kiszka wrote:
>> Should have declared this [RFC] in the subject and CC'ed kvm...
>>
>> On 2012-06-23 00:45, Jan Kiszka wrote:
>>> This sketches a possible path to get rid of the iothread lock on vmexits
>>> in KVM mode. On x86, the the in-kernel irqchips has to be used because
>>> we otherwise need to synchronize APIC and other per-cpu state accesses
>>> that could be changed concurrently. Not yet fully analyzed is the NMI
>>> injection path in the absence of an APIC.
>>>
>>> s390x should be fine without specific locking as their pre/post-run
>>> callbacks are empty. Power requires locking for the pre-run callback.
>>>
>>> This patch is untested, but a similar version was successfully used in
>>> a x86 setup with a network I/O path that needed no central iothread
>>> locking anymore (required special MMIO exit handling).
>>> ---
>>> kvm-all.c | 18 ++++++++++++++++--
>>> target-i386/kvm.c | 7 +++++++
>>> target-ppc/kvm.c | 4 ++++
>>> 3 files changed, 27 insertions(+), 2 deletions(-)
>>>
>>> diff --git a/kvm-all.c b/kvm-all.c
>>> index f8e4328..9c3e26f 100644
>>> --- a/kvm-all.c
>>> +++ b/kvm-all.c
>>> @@ -1460,6 +1460,8 @@ int kvm_cpu_exec(CPUArchState *env)
>>> return EXCP_HLT;
>>> }
>>>
>>> + qemu_mutex_unlock_iothread();
>>> +
>>> do {
>>> if (env->kvm_vcpu_dirty) {
>>> kvm_arch_put_registers(env, KVM_PUT_RUNTIME_STATE);
>>> @@ -1476,14 +1478,16 @@ int kvm_cpu_exec(CPUArchState *env)
>>> */
>>> qemu_cpu_kick_self();
>>> }
>>> - qemu_mutex_unlock_iothread();
>>>
>>> run_ret = kvm_vcpu_ioctl(env, KVM_RUN, 0);
>>>
>>> - qemu_mutex_lock_iothread();
>>> kvm_arch_post_run(env, run);
>>>
>>> + /* TODO: push coalesced mmio flushing to the point where we access
>>> + * devices that are using it (currently VGA and E1000). */
>>> + qemu_mutex_lock_iothread();
>>> kvm_flush_coalesced_mmio_buffer();
>>> + qemu_mutex_unlock_iothread();
>>>
>>> if (run_ret< 0) {
>>> if (run_ret == -EINTR || run_ret == -EAGAIN) {
>>> @@ -1499,19 +1503,23 @@ int kvm_cpu_exec(CPUArchState *env)
>>> switch (run->exit_reason) {
>>> case KVM_EXIT_IO:
>>> DPRINTF("handle_io\n");
>>> + qemu_mutex_lock_iothread();
>>> kvm_handle_io(run->io.port,
>>> (uint8_t *)run + run->io.data_offset,
>>> run->io.direction,
>>> run->io.size,
>>> run->io.count);
>>> + qemu_mutex_unlock_iothread();
>>> ret = 0;
>>> break;
>>> case KVM_EXIT_MMIO:
>>> DPRINTF("handle_mmio\n");
>>> + qemu_mutex_lock_iothread();
>>> cpu_physical_memory_rw(run->mmio.phys_addr,
>>> run->mmio.data,
>>> run->mmio.len,
>>> run->mmio.is_write);
>>> + qemu_mutex_unlock_iothread();
>>> ret = 0;
>>> break;
>>> case KVM_EXIT_IRQ_WINDOW_OPEN:
>>> @@ -1520,7 +1528,9 @@ int kvm_cpu_exec(CPUArchState *env)
>>> break;
>>> case KVM_EXIT_SHUTDOWN:
>>> DPRINTF("shutdown\n");
>>> + qemu_mutex_lock_iothread();
>>> qemu_system_reset_request();
>>> + qemu_mutex_unlock_iothread();
>>> ret = EXCP_INTERRUPT;
>>> break;
>>> case KVM_EXIT_UNKNOWN:
>>> @@ -1533,11 +1543,15 @@ int kvm_cpu_exec(CPUArchState *env)
>>> break;
>>> default:
>>> DPRINTF("kvm_arch_handle_exit\n");
>>> + qemu_mutex_lock_iothread();
>>> ret = kvm_arch_handle_exit(env, run);
>>> + qemu_mutex_unlock_iothread();
>>> break;
>>> }
>>> } while (ret == 0);
>>>
>>> + qemu_mutex_lock_iothread();
>>> +
>>> if (ret< 0) {
>>> cpu_dump_state(env, stderr, fprintf, CPU_DUMP_CODE);
>>> vm_stop(RUN_STATE_INTERNAL_ERROR);
>>> diff --git a/target-i386/kvm.c b/target-i386/kvm.c
>>> index 0d0d8f6..0ad64d1 100644
>>> --- a/target-i386/kvm.c
>>> +++ b/target-i386/kvm.c
>>> @@ -1631,7 +1631,10 @@ void kvm_arch_pre_run(CPUX86State *env, struct kvm_run *run)
>>>
>>> /* Inject NMI */
>>> if (env->interrupt_request& CPU_INTERRUPT_NMI) {
>>> + qemu_mutex_lock_iothread();
>>> env->interrupt_request&= ~CPU_INTERRUPT_NMI;
>>> + qemu_mutex_unlock_iothread();
>>> +
>>> DPRINTF("injected NMI\n");
>>> ret = kvm_vcpu_ioctl(env, KVM_NMI);
>>> if (ret< 0) {
>>> @@ -1641,6 +1644,8 @@ void kvm_arch_pre_run(CPUX86State *env, struct kvm_run *run)
>>> }
>>>
>>> if (!kvm_irqchip_in_kernel()) {
>>> + qemu_mutex_lock_iothread();
>>> +
>>> /* Force the VCPU out of its inner loop to process any INIT requests
>>> * or pending TPR access reports. */
>>> if (env->interrupt_request&
>>> @@ -1682,6 +1687,8 @@ void kvm_arch_pre_run(CPUX86State *env, struct kvm_run *run)
>>>
>>> DPRINTF("setting tpr\n");
>>> run->cr8 = cpu_get_apic_tpr(env->apic_state);
>>> +
>>> + qemu_mutex_unlock_iothread();
>>> }
>>> }
>>>
>>> diff --git a/target-ppc/kvm.c b/target-ppc/kvm.c
>>> index c09cc39..60d91a5 100644
>>> --- a/target-ppc/kvm.c
>>> +++ b/target-ppc/kvm.c
>>> @@ -471,6 +471,8 @@ void kvm_arch_pre_run(CPUPPCState *env, struct kvm_run *run)
>>> int r;
>>> unsigned irq;
>>>
>>> + qemu_mutex_lock_iothread();
>>> +
>>> /* PowerPC QEMU tracks the various core input pins (interrupt, critical
>>> * interrupt, reset, etc) in PPC-specific env->irq_input_state. */
>>> if (!cap_interrupt_level&&
>>> @@ -497,6 +499,8 @@ void kvm_arch_pre_run(CPUPPCState *env, struct kvm_run *run)
>>> /* We don't know if there are more interrupts pending after this. However,
>>> * the guest will return to userspace in the course of handling this one
>>> * anyways, so we will get a chance to deliver the rest. */
>>> +
>>> + qemu_mutex_unlock_iothread();
>>> }
>>>
>>> void kvm_arch_post_run(CPUPPCState *env, struct kvm_run *run)
>>>
>
> The following plan would allow progressive convertion to parallel
> operation.
>
> Jan mentioned the MMIO handler->MMIO handler deadlock in a private message.
>
> Jan: if there is recursive MMIO accesses, you can detect that and skip
> such MMIO handlers in dev_can_use_lock() ? Or blacklist.
>
> What i mentioned earlier about "unsolved issues" are the possibilities
> in "iothread flow" (belief is that it would be better determined at
> with execution experience / tuning).
>
> OK, so, each document starts with xyz.txt. This is compatible with TCG,
> the critical part is not dropping/acquire locks for decent performance
> (that is, only drop memmap_lock from TCG VCPUS thread if IOTHREAD
> requests so).
>
> This came out from discussions with Avi.
>
> immediate-plan.txt
> ------------------
>
> 1. read_lock(memmap_lock)
> 2. MemoryRegionSection mrs = lookup(addr)
> 3. qom_ref(mrs.mr->dev)
> 4. read_unlock(memmap_lock)
>
> 5. mutex_lock(dev->lock)
> 6. dispatch(&mrs, addr, data, size)
> 7. mutex_unlock(dev->lock)
Just a detail, I don't think we should acquire a device specific lock in global
code. Rather, I think we should acquire the global lock before dispatch unless
a MemoryRegion is marked as being unlocked.
Regards,
Anthony Liguori
>
> 8. qom_unref(mrs.mr->object)
>
>
> A) Add Device object to MemoryRegions.
>
> memory_region_add_subregion
> memory_region_add_eventfd
> memory_region_add_subregion_overlap
>
>
> B) qom_ref details
>
> Z) see device-hotplug.txt items
>
>
>
> lock-model.txt
> --------------
>
>
> lock_device(dev) {
> if (dev_can_use_lock(dev))
> mutex_lock(dev->lock)
> else
> mutex_lock(qemu_global_mutex)
> }
>
> dev_can_use_lock(dev)
> {
> if (all services used by dev mmio handler have)
> - qemu_global_mutex
> - trylock(dev->lock) and fail
>
> then yes
> else
> then no
> }
>
>
> That is, from the moment in which the device uses the order
> (dev->lock -> qemu_global_mutex), the qemu_global_mutex -> dev->lock
> is forbidden by the IOTHREAD or anyone else.
>
>
> convert-to-unlocked.txt
> -----------------------
>
>
> 1. read_lock(memmap_lock)
> 2. MemoryRegionSection mrs = lookup(addr)
> 3. qom_ref(mrs.mr->dev)
> 4. read_unlock(memmap_lock)
>
> 5. mutex_lock(dev->lock)
> 6. dispatch(&mrs, addr, data, size)
> 7. mutex_unlock(dev->lock)
>
> 8. qom_unref(mrs.mr->object)
>
> THE ITEMS BELOW SHOULD BE DOCUMENTATION TO CONVERT DEVICE
> TO UNLOCKED OPERATION.
>
> 1) qom_ref guarantees that the Device object does not disappear. However,
> these operations are allowed in parallel:
>
> a) after 4. device memory and ioports might change or be unregistered.
>
> Concurrent accesses of a device that is being hotunplugged or
> whose mappings change are responsability of the OS, so
> incorrect emulation is not QEMU's problem. However, device
> emulation is now subject to:
> - cpu_physical_memory_map failing.
> - stl_phys* failing.
> - cpu_physical_memory_rw failing/reading from unassigned mem.
>
> b) what should happen with a pending cpu_physical_memory_map pointer,
> when deleting the corresponding memory region?
>
>
> net.txt
> --------
>
>
> iothread flow
> =============
>
> 1) Skip-work-if-device-locked
>
> select(tap fd ready)
> tap_send
> if (trylock(TAPState->NetClientState->dev))
> proceed;
> else
> continue; (data remains in queue)
> tap_read_packet
> qemu_send_packet_async
>
> DRAWBACK: lock intensive.
> DRAWBACK: VCPU threads can starve IOTHREAD (can be fixed with
> a scheme such as trylock() marks a flag saying "iothread wants lock".
>
> Checking each subsystem for possibility:
>
> - tap_send: OK
> - user,slirp: if not possible, use device->lock.
> - qemu-char: OK
> - interrupts: lock with qemu_global_mutex.
> - qemutimer:
> - read time: qemu_get_clock_ns (see later).
>
>
> 2) Queue-work-to-vcpu-context
>
> select(tap fd ready)
> tap_send
> if (trylock(TAPState->NetClientState->dev)) {
> proceed;
> } else {
> AddToDeviceWorkQueue(work);
> }
> tap_read_packet
> qemu_send_packet_async
>
> DRAWBACK: lock intensive
> DRAWBACK: cache effects
>
> 3) VCPU-thread-notify-device-unlocked
>
> select(tap fd ready)
> tap_send
> if (trylock(TAPState->NetClientState->dev)) {
> proceed;
> } else {
> signal VCPU thread to notify IOTHREAD when unlocked.
> }
> tap_read_packet
>
>
> GENERAL OBJECTIVE: to avoid lock inversion. IOTHREAD should follow
> same order.
>
>
> PCI HOTPLUG EJECT FLOW
> ----------------------
>
> 1) Monitor initiated.
>
> monitor -> qmp_device_del -> qdev_unplug -> pci_unplug_device ->
> piix4_device_hotplug -> SCI interrupt -> OS removes
> application users from device -> OS runs _EJ0 ->
>
> For hot removal, the device must be immediately ejected when OSPM calls
> the _EJ0 control method. The _EJ0 control method does not return until
> ejection is complete. After calling _EJ0, OSPM verifies the device no
> longer exists to determine if the eject succeeded. For _HID devices,
> OSPM evaluates the _STA method. For _ADR devices, OSPM checks with the
> bus driver for that device.
>
>
> Avi: A pci eject guarantees that all accesses
> started after the eject completes do not see the device.
>
> Can you do:
>
> - qobject_remove()
>
> and have the device dispatch running safely? NO, therefore pci_unplug_device
> must hold dev->lock.
>
>
>
>
next prev parent reply other threads:[~2012-06-28 14:10 UTC|newest]
Thread overview: 33+ messages / expand[flat|nested] mbox.gz Atom feed top
[not found] <4FE4F56D.1020201@web.de>
2012-06-22 22:55 ` [PATCH] kvm: First step to push iothread lock out of inner run loop Jan Kiszka
2012-06-23 0:22 ` Marcelo Tosatti
2012-06-23 9:06 ` Marcelo Tosatti
2012-06-23 11:45 ` Jan Kiszka
2012-06-24 8:49 ` Avi Kivity
2012-06-24 14:08 ` [Qemu-devel] " Jan Kiszka
2012-06-24 14:31 ` Avi Kivity
2012-07-06 17:16 ` Jan Kiszka
2012-07-06 18:06 ` Jan Kiszka
2012-07-08 7:49 ` Avi Kivity
2012-06-24 13:34 ` liu ping fan
2012-06-24 14:08 ` Jan Kiszka
2012-06-24 14:35 ` Avi Kivity
2012-06-24 14:40 ` Jan Kiszka
2012-06-24 14:46 ` Avi Kivity
2012-06-24 14:51 ` Jan Kiszka
2012-06-24 14:56 ` Avi Kivity
2012-06-24 14:58 ` Jan Kiszka
2012-06-24 14:59 ` Avi Kivity
2012-06-23 9:22 ` Jan Kiszka
2012-06-28 1:11 ` Marcelo Tosatti
2012-06-26 19:34 ` Marcelo Tosatti
2012-06-27 7:39 ` Stefan Hajnoczi
2012-06-27 7:41 ` [Qemu-devel] " Stefan Hajnoczi
2012-06-27 11:09 ` Marcelo Tosatti
2012-06-27 11:19 ` [Qemu-devel] " Marcelo Tosatti
2012-06-28 8:45 ` Stefan Hajnoczi
2012-06-27 7:54 ` Avi Kivity
2012-06-27 14:36 ` Jan Kiszka
2012-06-28 14:10 ` Anthony Liguori [this message]
2012-06-28 15:12 ` [Qemu-devel] " Avi Kivity
2012-06-29 1:29 ` Marcelo Tosatti
2012-06-29 1:45 ` [Qemu-devel] " Marcelo Tosatti
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=4FEC65DF.20504@codemonkey.ws \
--to=anthony@codemonkey.ws \
--cc=agraf@suse.de \
--cc=avi@redhat.com \
--cc=jan.kiszka@web.de \
--cc=kvm@vger.kernel.org \
--cc=mtosatti@redhat.com \
--cc=pingfank@linux.vnet.ibm.com \
--cc=qemu-devel@nongnu.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).