* Re: [PATCH] KVM: Allow host IRQ sharing for assigned PCI 2.3 devices
2012-01-09 14:03 [PATCH] KVM: Allow host IRQ sharing for assigned PCI 2.3 devices Jan Kiszka
@ 2012-01-09 19:45 ` Alex Williamson
2012-01-09 21:25 ` Jan Kiszka
2012-01-10 16:17 ` Michael S. Tsirkin
2012-01-12 15:49 ` [PATCH v2] " Jan Kiszka
2 siblings, 1 reply; 20+ messages in thread
From: Alex Williamson @ 2012-01-09 19:45 UTC (permalink / raw)
To: Jan Kiszka
Cc: Avi Kivity, Marcelo Tosatti, kvm, Michael S. Tsirkin,
Jesse Barnes
On Mon, 2012-01-09 at 15:03 +0100, Jan Kiszka wrote:
> PCI 2.3 allows to generically disable IRQ sources at device level. This
> enables us to share legacy IRQs of such devices with other host devices
> when passing them to a guest.
>
> The new IRQ sharing feature introduced here is optional, user space has
> to request it explicitly. Moreover, user space can inform us about its
> view of PCI_COMMAND_INTX_DISABLE so that we can avoid unmasking the
> interrupt and signaling it if the guest masked it via the virtualized
> PCI config space.
>
> Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
> ---
>
> This applies to kvm/master after merging
>
> PCI: Rework config space blocking services
> PCI: Introduce INTx check & mask API
>
> from current linux-next/master. I suppose those two will make it into
> 3.3.
>
> To recall the history of it: I tried hard to implement an adaptive
> solution that automatically picks the fastest masking technique whenever
> possible. However, the changes required to the IRQ core subsystem and
> the logic of the device assignment code became so complex and partly
> ugly that I gave up on this. It's simply not worth the pain given that
> legacy PCI interrupts are rarely raised for performance critical device
> at such a high rate (KHz...) that you can measure the difference.
>
> Documentation/virtual/kvm/api.txt | 27 +++++
> arch/x86/kvm/x86.c | 1 +
> include/linux/kvm.h | 6 +
> include/linux/kvm_host.h | 2 +
> virt/kvm/assigned-dev.c | 208 +++++++++++++++++++++++++++++++-----
> 5 files changed, 215 insertions(+), 29 deletions(-)
>
> diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
> index e1d94bf..670015a 100644
> --- a/Documentation/virtual/kvm/api.txt
> +++ b/Documentation/virtual/kvm/api.txt
> @@ -1159,6 +1159,14 @@ following flags are specified:
>
> /* Depends on KVM_CAP_IOMMU */
> #define KVM_DEV_ASSIGN_ENABLE_IOMMU (1 << 0)
> +/* The following two depend on KVM_CAP_PCI_2_3 */
> +#define KVM_DEV_ASSIGN_PCI_2_3 (1 << 1)
> +#define KVM_DEV_ASSIGN_MASK_INTX (1 << 2)
> +
> +If KVM_DEV_ASSIGN_PCI_2_3 is set, the kernel will manage legacy INTx interrupts
> +via the PCI-2.3-compliant device-level mask, thus enable IRQ sharing with other
> +assigned devices or host devices. KVM_DEV_ASSIGN_MASK_INTX specifies the
> +guest's view on the INTx mask, see KVM_ASSIGN_SET_INTX_MASK for details.
>
> The KVM_DEV_ASSIGN_ENABLE_IOMMU flag is a mandatory option to ensure
> isolation of the device. Usages not specifying this flag are deprecated.
> @@ -1399,6 +1407,25 @@ The following flags are defined:
> If datamatch flag is set, the event will be signaled only if the written value
> to the registered address is equal to datamatch in struct kvm_ioeventfd.
>
> +4.59 KVM_ASSIGN_SET_INTX_MASK
> +
> +Capability: KVM_CAP_PCI_2_3
> +Architectures: x86
> +Type: vm ioctl
> +Parameters: struct kvm_assigned_pci_dev (in)
> +Returns: 0 on success, -1 on error
> +
> +Informs the kernel about the guest's view on the INTx mask. As long as the
> +guest masks the legacy INTx, the kernel will refrain from unmasking it at
> +hardware level and will not assert the guest's IRQ line. User space is still
> +responsible for applying this state to the assigned device's real config space.
> +To avoid that the kernel overwrites the state user space wants to set,
> +KVM_ASSIGN_SET_INTX_MASK has to be called prior to updating the config space.
> +
> +See KVM_ASSIGN_DEV_IRQ for the data structure. The target device is specified
> +by assigned_dev_id. In the flags field, only KVM_DEV_ASSIGN_MASK_INTX is
> +evaluated.
> +
> 4.62 KVM_CREATE_SPAPR_TCE
>
> Capability: KVM_CAP_SPAPR_TCE
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index 1171def..9381806 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -2057,6 +2057,7 @@ int kvm_dev_ioctl_check_extension(long ext)
> case KVM_CAP_XSAVE:
> case KVM_CAP_ASYNC_PF:
> case KVM_CAP_GET_TSC_KHZ:
> + case KVM_CAP_PCI_2_3:
> r = 1;
> break;
> case KVM_CAP_COALESCED_MMIO:
> diff --git a/include/linux/kvm.h b/include/linux/kvm.h
> index 68e67e5..da5f7b7 100644
> --- a/include/linux/kvm.h
> +++ b/include/linux/kvm.h
> @@ -556,6 +556,7 @@ struct kvm_ppc_pvinfo {
> #define KVM_CAP_PPC_RMA 65
> #define KVM_CAP_MAX_VCPUS 66 /* returns max vcpus per vm */
> #define KVM_CAP_PPC_PAPR 68
> +#define KVM_CAP_PCI_2_3 69
> #define KVM_CAP_S390_GMAP 71
> #define KVM_CAP_TSC_DEADLINE_TIMER 72
>
> @@ -697,6 +698,9 @@ struct kvm_clock_data {
> /* Available with KVM_CAP_TSC_CONTROL */
> #define KVM_SET_TSC_KHZ _IO(KVMIO, 0xa2)
> #define KVM_GET_TSC_KHZ _IO(KVMIO, 0xa3)
> +/* Available with KVM_CAP_PCI_2_3 */
> +#define KVM_ASSIGN_SET_INTX_MASK _IOW(KVMIO, 0xa4, \
> + struct kvm_assigned_pci_dev)
>
> /*
> * ioctls for vcpu fds
> @@ -765,6 +769,8 @@ struct kvm_clock_data {
> #define KVM_ALLOCATE_RMA _IOR(KVMIO, 0xa9, struct kvm_allocate_rma)
>
> #define KVM_DEV_ASSIGN_ENABLE_IOMMU (1 << 0)
> +#define KVM_DEV_ASSIGN_PCI_2_3 (1 << 1)
> +#define KVM_DEV_ASSIGN_MASK_INTX (1 << 2)
>
> struct kvm_assigned_pci_dev {
> __u32 assigned_dev_id;
> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> index 900c763..07461bd 100644
> --- a/include/linux/kvm_host.h
> +++ b/include/linux/kvm_host.h
> @@ -546,6 +546,7 @@ struct kvm_assigned_dev_kernel {
> unsigned int entries_nr;
> int host_irq;
> bool host_irq_disabled;
> + bool pci_2_3;
> struct msix_entry *host_msix_entries;
> int guest_irq;
> struct msix_entry *guest_msix_entries;
> @@ -555,6 +556,7 @@ struct kvm_assigned_dev_kernel {
> struct pci_dev *dev;
> struct kvm *kvm;
> spinlock_t intx_lock;
> + struct mutex intx_mask_lock;
> char irq_name[32];
> struct pci_saved_state *pci_saved_state;
> };
> diff --git a/virt/kvm/assigned-dev.c b/virt/kvm/assigned-dev.c
> index 758e3b3..b35aba9 100644
> --- a/virt/kvm/assigned-dev.c
> +++ b/virt/kvm/assigned-dev.c
> @@ -57,22 +57,66 @@ static int find_index_from_host_irq(struct kvm_assigned_dev_kernel
> return index;
> }
>
> -static irqreturn_t kvm_assigned_dev_thread(int irq, void *dev_id)
> +static irqreturn_t kvm_assigned_dev_intx(int irq, void *dev_id)
> {
> struct kvm_assigned_dev_kernel *assigned_dev = dev_id;
> + int ret;
>
> - if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_INTX) {
> - spin_lock(&assigned_dev->intx_lock);
> + spin_lock(&assigned_dev->intx_lock);
> + if (pci_check_and_mask_intx(assigned_dev->dev)) {
> + assigned_dev->host_irq_disabled = true;
> + ret = IRQ_WAKE_THREAD;
> + } else
> + ret = IRQ_NONE;
> + spin_unlock(&assigned_dev->intx_lock);
> +
> + return ret;
> +}
> +
> +static void
> +kvm_assigned_dev_raise_guest_irq(struct kvm_assigned_dev_kernel *assigned_dev,
> + int vector)
> +{
> + if (unlikely(assigned_dev->irq_requested_type &
> + KVM_DEV_IRQ_GUEST_INTX)) {
> + mutex_lock(&assigned_dev->intx_mask_lock);
> + if (!(assigned_dev->flags & KVM_DEV_ASSIGN_MASK_INTX))
> + kvm_set_irq(assigned_dev->kvm,
> + assigned_dev->irq_source_id, vector, 1);
> + mutex_unlock(&assigned_dev->intx_mask_lock);
> + } else
> + kvm_set_irq(assigned_dev->kvm, assigned_dev->irq_source_id,
> + vector, 1);
> +}
I question whether the above function is really worthwhile...
> +static irqreturn_t kvm_assigned_dev_thread_intx(int irq, void *dev_id)
> +{
> + struct kvm_assigned_dev_kernel *assigned_dev = dev_id;
> +
> + if (!(assigned_dev->flags & KVM_DEV_ASSIGN_PCI_2_3)) {
> + spin_lock_irq(&assigned_dev->intx_lock);
> disable_irq_nosync(irq);
> assigned_dev->host_irq_disabled = true;
> - spin_unlock(&assigned_dev->intx_lock);
> + spin_unlock_irq(&assigned_dev->intx_lock);
> }
>
> - kvm_set_irq(assigned_dev->kvm, assigned_dev->irq_source_id,
> - assigned_dev->guest_irq, 1);
> + kvm_assigned_dev_raise_guest_irq(assigned_dev,
> + assigned_dev->guest_irq);
This is the only user of the intx branch and we could avoid the
irq_requested_type check from this call path. The MSI paths can call
kvm_set_irq directly. A nice code consolidation, but probably trumped
by a slightly shorter code path.
BTW, I already updated vfio to the INTx check and mask API, it's a great
cleanup.
> +
> + return IRQ_HANDLED;
> +}
> +
> +#ifdef __KVM_HAVE_MSI
> +static irqreturn_t kvm_assigned_dev_thread_msi(int irq, void *dev_id)
> +{
> + struct kvm_assigned_dev_kernel *assigned_dev = dev_id;
> +
> + kvm_assigned_dev_raise_guest_irq(assigned_dev,
> + assigned_dev->guest_irq);
>
> return IRQ_HANDLED;
> }
> +#endif
>
> #ifdef __KVM_HAVE_MSIX
> static irqreturn_t kvm_assigned_dev_thread_msix(int irq, void *dev_id)
> @@ -83,8 +127,7 @@ static irqreturn_t kvm_assigned_dev_thread_msix(int irq, void *dev_id)
>
> if (index >= 0) {
> vector = assigned_dev->guest_msix_entries[index].vector;
> - kvm_set_irq(assigned_dev->kvm, assigned_dev->irq_source_id,
> - vector, 1);
> + kvm_assigned_dev_raise_guest_irq(assigned_dev, vector);
> }
>
> return IRQ_HANDLED;
> @@ -100,15 +143,31 @@ static void kvm_assigned_dev_ack_irq(struct kvm_irq_ack_notifier *kian)
>
> kvm_set_irq(dev->kvm, dev->irq_source_id, dev->guest_irq, 0);
>
> - /* The guest irq may be shared so this ack may be
> - * from another device.
> - */
> - spin_lock(&dev->intx_lock);
> - if (dev->host_irq_disabled) {
> - enable_irq(dev->host_irq);
> - dev->host_irq_disabled = false;
> + mutex_lock(&dev->intx_mask_lock);
> +
> + if (!(dev->flags & KVM_DEV_ASSIGN_MASK_INTX)) {
> + bool reassert = false;
> +
> + spin_lock_irq(&dev->intx_lock);
> + /*
> + * The guest IRQ may be shared so this ack can come from an
> + * IRQ for another guest device.
> + */
> + if (dev->host_irq_disabled) {
> + if (!(dev->flags & KVM_DEV_ASSIGN_PCI_2_3))
> + enable_irq(dev->host_irq);
> + else if (!pci_check_and_unmask_intx(dev->dev))
> + reassert = true;
> + dev->host_irq_disabled = reassert;
> + }
> + spin_unlock_irq(&dev->intx_lock);
> +
> + if (reassert)
> + kvm_set_irq(dev->kvm, dev->irq_source_id,
> + dev->guest_irq, 1);
> }
> - spin_unlock(&dev->intx_lock);
> +
> + mutex_unlock(&dev->intx_mask_lock);
> }
>
> static void deassign_guest_irq(struct kvm *kvm,
> @@ -156,7 +215,13 @@ static void deassign_host_irq(struct kvm *kvm,
> pci_disable_msix(assigned_dev->dev);
> } else {
> /* Deal with MSI and INTx */
> - disable_irq(assigned_dev->host_irq);
> + if (assigned_dev->flags & KVM_DEV_ASSIGN_PCI_2_3) {
> + spin_lock_irq(&assigned_dev->intx_lock);
> + pci_intx(assigned_dev->dev, false);
> + spin_unlock_irq(&assigned_dev->intx_lock);
> + synchronize_irq(assigned_dev->host_irq);
> + } else
> + disable_irq(assigned_dev->host_irq);
>
> free_irq(assigned_dev->host_irq, assigned_dev);
>
> @@ -237,15 +302,34 @@ void kvm_free_all_assigned_devices(struct kvm *kvm)
> static int assigned_device_enable_host_intx(struct kvm *kvm,
> struct kvm_assigned_dev_kernel *dev)
> {
> + irq_handler_t irq_handler;
> + unsigned long flags;
> +
> dev->host_irq = dev->dev->irq;
> - /* Even though this is PCI, we don't want to use shared
> - * interrupts. Sharing host devices with guest-assigned devices
> - * on the same interrupt line is not a happy situation: there
> - * are going to be long delays in accepting, acking, etc.
> +
> + /*
> + * We can only share the IRQ line with other host devices if we are
> + * able to disable the IRQ source at device-level - independently of
> + * the guest driver. Otherwise host devices may suffer from unbounded
> + * IRQ latencies when the guest keeps the line asserted.
> */
> - if (request_threaded_irq(dev->host_irq, NULL, kvm_assigned_dev_thread,
> - IRQF_ONESHOT, dev->irq_name, dev))
> + if (dev->flags & KVM_DEV_ASSIGN_PCI_2_3) {
> + irq_handler = kvm_assigned_dev_intx;
> + flags = IRQF_SHARED;
> + } else {
> + irq_handler = NULL;
> + flags = IRQF_ONESHOT;
> + }
> + if (request_threaded_irq(dev->host_irq, irq_handler,
> + kvm_assigned_dev_thread_intx, flags,
> + dev->irq_name, dev))
> return -EIO;
> +
> + if (dev->flags & KVM_DEV_ASSIGN_PCI_2_3) {
> + spin_lock_irq(&dev->intx_lock);
> + pci_intx(dev->dev, true);
> + spin_unlock_irq(&dev->intx_lock);
> + }
> return 0;
> }
>
> @@ -262,8 +346,9 @@ static int assigned_device_enable_host_msi(struct kvm *kvm,
> }
>
> dev->host_irq = dev->dev->irq;
> - if (request_threaded_irq(dev->host_irq, NULL, kvm_assigned_dev_thread,
> - 0, dev->irq_name, dev)) {
> + if (request_threaded_irq(dev->host_irq, NULL,
> + kvm_assigned_dev_thread_msi, 0,
> + dev->irq_name, dev)) {
> pci_disable_msi(dev->dev);
> return -EIO;
> }
> @@ -321,7 +406,6 @@ static int assigned_device_enable_guest_msi(struct kvm *kvm,
> {
> dev->guest_irq = irq->guest_irq;
> dev->ack_notifier.gsi = -1;
> - dev->host_irq_disabled = false;
> return 0;
> }
> #endif
> @@ -333,7 +417,6 @@ static int assigned_device_enable_guest_msix(struct kvm *kvm,
> {
> dev->guest_irq = irq->guest_irq;
> dev->ack_notifier.gsi = -1;
> - dev->host_irq_disabled = false;
> return 0;
> }
> #endif
> @@ -367,6 +450,7 @@ static int assign_host_irq(struct kvm *kvm,
> default:
> r = -EINVAL;
> }
> + dev->host_irq_disabled = false;
>
> if (!r)
> dev->irq_requested_type |= host_irq_type;
> @@ -468,6 +552,7 @@ static int kvm_vm_ioctl_deassign_dev_irq(struct kvm *kvm,
> {
> int r = -ENODEV;
> struct kvm_assigned_dev_kernel *match;
> + unsigned long irq_type;
>
> mutex_lock(&kvm->lock);
>
> @@ -476,7 +561,9 @@ static int kvm_vm_ioctl_deassign_dev_irq(struct kvm *kvm,
> if (!match)
> goto out;
>
> - r = kvm_deassign_irq(kvm, match, assigned_irq->flags);
> + irq_type = assigned_irq->flags & (KVM_DEV_IRQ_HOST_MASK |
> + KVM_DEV_IRQ_GUEST_MASK);
> + r = kvm_deassign_irq(kvm, match, irq_type);
> out:
> mutex_unlock(&kvm->lock);
> return r;
> @@ -609,6 +696,10 @@ static int kvm_vm_ioctl_assign_device(struct kvm *kvm,
> if (!match->pci_saved_state)
> printk(KERN_DEBUG "%s: Couldn't store %s saved state\n",
> __func__, dev_name(&dev->dev));
> +
> + if (!pci_intx_mask_supported(dev))
> + assigned_dev->flags &= ~KVM_DEV_ASSIGN_PCI_2_3;
> +
> match->assigned_dev_id = assigned_dev->assigned_dev_id;
> match->host_segnr = assigned_dev->segnr;
> match->host_busnr = assigned_dev->busnr;
> @@ -616,6 +707,7 @@ static int kvm_vm_ioctl_assign_device(struct kvm *kvm,
> match->flags = assigned_dev->flags;
> match->dev = dev;
> spin_lock_init(&match->intx_lock);
> + mutex_init(&match->intx_mask_lock);
> match->irq_source_id = -1;
> match->kvm = kvm;
> match->ack_notifier.irq_acked = kvm_assigned_dev_ack_irq;
> @@ -761,6 +853,56 @@ msix_entry_out:
> }
> #endif
>
> +static int kvm_vm_ioctl_set_pci_irq_mask(struct kvm *kvm,
> + struct kvm_assigned_pci_dev *assigned_dev)
> +{
> + int r = 0;
> + struct kvm_assigned_dev_kernel *match;
> +
> + mutex_lock(&kvm->lock);
> +
> + match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
> + assigned_dev->assigned_dev_id);
> + if (!match) {
> + r = -ENODEV;
> + goto out;
> + }
> +
> + mutex_lock(&match->intx_mask_lock);
> +
> + match->flags &= ~KVM_DEV_ASSIGN_MASK_INTX;
> + match->flags |= assigned_dev->flags & KVM_DEV_ASSIGN_MASK_INTX;
> +
> + if (match->irq_requested_type & KVM_DEV_IRQ_GUEST_INTX) {
> + if (assigned_dev->flags & KVM_DEV_ASSIGN_MASK_INTX) {
> + kvm_set_irq(match->kvm, match->irq_source_id,
> + match->guest_irq, 0);
> + /*
> + * Masking at hardware-level is performed on demand,
> + * i.e. when an IRQ actually arrives at the host.
> + */
Is there any harm in doing this synchronous to the ioctl? We're on a
slow path here anyway since the mask is likely drive by a config space
write. Thanks,
Alex
> + } else {
> + /*
> + * Unmask the IRQ line. It may have been masked
> + * meanwhile if we aren't using PCI 2.3 INTx masking
> + * on the host side.
> + */
> + spin_lock_irq(&match->intx_lock);
> + if (match->host_irq_disabled) {
> + enable_irq(match->host_irq);
> + match->host_irq_disabled = false;
> + }
> + spin_unlock_irq(&match->intx_lock);
> + }
> + }
> +
> + mutex_unlock(&match->intx_mask_lock);
> +
> +out:
> + mutex_unlock(&kvm->lock);
> + return r;
> +}
> +
> long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl,
> unsigned long arg)
> {
> @@ -868,6 +1010,15 @@ long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl,
> break;
> }
> #endif
> + case KVM_ASSIGN_SET_INTX_MASK: {
> + struct kvm_assigned_pci_dev assigned_dev;
> +
> + r = -EFAULT;
> + if (copy_from_user(&assigned_dev, argp, sizeof assigned_dev))
> + goto out;
> + r = kvm_vm_ioctl_set_pci_irq_mask(kvm, &assigned_dev);
> + break;
> + }
> default:
> r = -ENOTTY;
> break;
> @@ -875,4 +1026,3 @@ long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl,
> out:
> return r;
> }
> -
^ permalink raw reply [flat|nested] 20+ messages in thread* Re: [PATCH] KVM: Allow host IRQ sharing for assigned PCI 2.3 devices
2012-01-09 19:45 ` Alex Williamson
@ 2012-01-09 21:25 ` Jan Kiszka
2012-01-09 22:05 ` Alex Williamson
0 siblings, 1 reply; 20+ messages in thread
From: Jan Kiszka @ 2012-01-09 21:25 UTC (permalink / raw)
To: Alex Williamson
Cc: Avi Kivity, Marcelo Tosatti, kvm, Michael S. Tsirkin,
Jesse Barnes
[-- Attachment #1: Type: text/plain, Size: 17111 bytes --]
On 2012-01-09 20:45, Alex Williamson wrote:
> On Mon, 2012-01-09 at 15:03 +0100, Jan Kiszka wrote:
>> PCI 2.3 allows to generically disable IRQ sources at device level. This
>> enables us to share legacy IRQs of such devices with other host devices
>> when passing them to a guest.
>>
>> The new IRQ sharing feature introduced here is optional, user space has
>> to request it explicitly. Moreover, user space can inform us about its
>> view of PCI_COMMAND_INTX_DISABLE so that we can avoid unmasking the
>> interrupt and signaling it if the guest masked it via the virtualized
>> PCI config space.
>>
>> Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
>> ---
>>
>> This applies to kvm/master after merging
>>
>> PCI: Rework config space blocking services
>> PCI: Introduce INTx check & mask API
>>
>> from current linux-next/master. I suppose those two will make it into
>> 3.3.
>>
>> To recall the history of it: I tried hard to implement an adaptive
>> solution that automatically picks the fastest masking technique whenever
>> possible. However, the changes required to the IRQ core subsystem and
>> the logic of the device assignment code became so complex and partly
>> ugly that I gave up on this. It's simply not worth the pain given that
>> legacy PCI interrupts are rarely raised for performance critical device
>> at such a high rate (KHz...) that you can measure the difference.
>>
>> Documentation/virtual/kvm/api.txt | 27 +++++
>> arch/x86/kvm/x86.c | 1 +
>> include/linux/kvm.h | 6 +
>> include/linux/kvm_host.h | 2 +
>> virt/kvm/assigned-dev.c | 208 +++++++++++++++++++++++++++++++-----
>> 5 files changed, 215 insertions(+), 29 deletions(-)
>>
>> diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
>> index e1d94bf..670015a 100644
>> --- a/Documentation/virtual/kvm/api.txt
>> +++ b/Documentation/virtual/kvm/api.txt
>> @@ -1159,6 +1159,14 @@ following flags are specified:
>>
>> /* Depends on KVM_CAP_IOMMU */
>> #define KVM_DEV_ASSIGN_ENABLE_IOMMU (1 << 0)
>> +/* The following two depend on KVM_CAP_PCI_2_3 */
>> +#define KVM_DEV_ASSIGN_PCI_2_3 (1 << 1)
>> +#define KVM_DEV_ASSIGN_MASK_INTX (1 << 2)
>> +
>> +If KVM_DEV_ASSIGN_PCI_2_3 is set, the kernel will manage legacy INTx interrupts
>> +via the PCI-2.3-compliant device-level mask, thus enable IRQ sharing with other
>> +assigned devices or host devices. KVM_DEV_ASSIGN_MASK_INTX specifies the
>> +guest's view on the INTx mask, see KVM_ASSIGN_SET_INTX_MASK for details.
>>
>> The KVM_DEV_ASSIGN_ENABLE_IOMMU flag is a mandatory option to ensure
>> isolation of the device. Usages not specifying this flag are deprecated.
>> @@ -1399,6 +1407,25 @@ The following flags are defined:
>> If datamatch flag is set, the event will be signaled only if the written value
>> to the registered address is equal to datamatch in struct kvm_ioeventfd.
>>
>> +4.59 KVM_ASSIGN_SET_INTX_MASK
>> +
>> +Capability: KVM_CAP_PCI_2_3
>> +Architectures: x86
>> +Type: vm ioctl
>> +Parameters: struct kvm_assigned_pci_dev (in)
>> +Returns: 0 on success, -1 on error
>> +
>> +Informs the kernel about the guest's view on the INTx mask. As long as the
>> +guest masks the legacy INTx, the kernel will refrain from unmasking it at
>> +hardware level and will not assert the guest's IRQ line. User space is still
>> +responsible for applying this state to the assigned device's real config space.
>> +To avoid that the kernel overwrites the state user space wants to set,
>> +KVM_ASSIGN_SET_INTX_MASK has to be called prior to updating the config space.
>> +
>> +See KVM_ASSIGN_DEV_IRQ for the data structure. The target device is specified
>> +by assigned_dev_id. In the flags field, only KVM_DEV_ASSIGN_MASK_INTX is
>> +evaluated.
>> +
>> 4.62 KVM_CREATE_SPAPR_TCE
>>
>> Capability: KVM_CAP_SPAPR_TCE
>> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
>> index 1171def..9381806 100644
>> --- a/arch/x86/kvm/x86.c
>> +++ b/arch/x86/kvm/x86.c
>> @@ -2057,6 +2057,7 @@ int kvm_dev_ioctl_check_extension(long ext)
>> case KVM_CAP_XSAVE:
>> case KVM_CAP_ASYNC_PF:
>> case KVM_CAP_GET_TSC_KHZ:
>> + case KVM_CAP_PCI_2_3:
>> r = 1;
>> break;
>> case KVM_CAP_COALESCED_MMIO:
>> diff --git a/include/linux/kvm.h b/include/linux/kvm.h
>> index 68e67e5..da5f7b7 100644
>> --- a/include/linux/kvm.h
>> +++ b/include/linux/kvm.h
>> @@ -556,6 +556,7 @@ struct kvm_ppc_pvinfo {
>> #define KVM_CAP_PPC_RMA 65
>> #define KVM_CAP_MAX_VCPUS 66 /* returns max vcpus per vm */
>> #define KVM_CAP_PPC_PAPR 68
>> +#define KVM_CAP_PCI_2_3 69
>> #define KVM_CAP_S390_GMAP 71
>> #define KVM_CAP_TSC_DEADLINE_TIMER 72
>>
>> @@ -697,6 +698,9 @@ struct kvm_clock_data {
>> /* Available with KVM_CAP_TSC_CONTROL */
>> #define KVM_SET_TSC_KHZ _IO(KVMIO, 0xa2)
>> #define KVM_GET_TSC_KHZ _IO(KVMIO, 0xa3)
>> +/* Available with KVM_CAP_PCI_2_3 */
>> +#define KVM_ASSIGN_SET_INTX_MASK _IOW(KVMIO, 0xa4, \
>> + struct kvm_assigned_pci_dev)
>>
>> /*
>> * ioctls for vcpu fds
>> @@ -765,6 +769,8 @@ struct kvm_clock_data {
>> #define KVM_ALLOCATE_RMA _IOR(KVMIO, 0xa9, struct kvm_allocate_rma)
>>
>> #define KVM_DEV_ASSIGN_ENABLE_IOMMU (1 << 0)
>> +#define KVM_DEV_ASSIGN_PCI_2_3 (1 << 1)
>> +#define KVM_DEV_ASSIGN_MASK_INTX (1 << 2)
>>
>> struct kvm_assigned_pci_dev {
>> __u32 assigned_dev_id;
>> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
>> index 900c763..07461bd 100644
>> --- a/include/linux/kvm_host.h
>> +++ b/include/linux/kvm_host.h
>> @@ -546,6 +546,7 @@ struct kvm_assigned_dev_kernel {
>> unsigned int entries_nr;
>> int host_irq;
>> bool host_irq_disabled;
>> + bool pci_2_3;
>> struct msix_entry *host_msix_entries;
>> int guest_irq;
>> struct msix_entry *guest_msix_entries;
>> @@ -555,6 +556,7 @@ struct kvm_assigned_dev_kernel {
>> struct pci_dev *dev;
>> struct kvm *kvm;
>> spinlock_t intx_lock;
>> + struct mutex intx_mask_lock;
>> char irq_name[32];
>> struct pci_saved_state *pci_saved_state;
>> };
>> diff --git a/virt/kvm/assigned-dev.c b/virt/kvm/assigned-dev.c
>> index 758e3b3..b35aba9 100644
>> --- a/virt/kvm/assigned-dev.c
>> +++ b/virt/kvm/assigned-dev.c
>> @@ -57,22 +57,66 @@ static int find_index_from_host_irq(struct kvm_assigned_dev_kernel
>> return index;
>> }
>>
>> -static irqreturn_t kvm_assigned_dev_thread(int irq, void *dev_id)
>> +static irqreturn_t kvm_assigned_dev_intx(int irq, void *dev_id)
>> {
>> struct kvm_assigned_dev_kernel *assigned_dev = dev_id;
>> + int ret;
>>
>> - if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_INTX) {
>> - spin_lock(&assigned_dev->intx_lock);
>> + spin_lock(&assigned_dev->intx_lock);
>> + if (pci_check_and_mask_intx(assigned_dev->dev)) {
>> + assigned_dev->host_irq_disabled = true;
>> + ret = IRQ_WAKE_THREAD;
>> + } else
>> + ret = IRQ_NONE;
>> + spin_unlock(&assigned_dev->intx_lock);
>> +
>> + return ret;
>> +}
>> +
>> +static void
>> +kvm_assigned_dev_raise_guest_irq(struct kvm_assigned_dev_kernel *assigned_dev,
>> + int vector)
>> +{
>> + if (unlikely(assigned_dev->irq_requested_type &
>> + KVM_DEV_IRQ_GUEST_INTX)) {
>> + mutex_lock(&assigned_dev->intx_mask_lock);
>> + if (!(assigned_dev->flags & KVM_DEV_ASSIGN_MASK_INTX))
>> + kvm_set_irq(assigned_dev->kvm,
>> + assigned_dev->irq_source_id, vector, 1);
>> + mutex_unlock(&assigned_dev->intx_mask_lock);
>> + } else
>> + kvm_set_irq(assigned_dev->kvm, assigned_dev->irq_source_id,
>> + vector, 1);
>> +}
>
> I question whether the above function is really worthwhile...
>
>> +static irqreturn_t kvm_assigned_dev_thread_intx(int irq, void *dev_id)
>> +{
>> + struct kvm_assigned_dev_kernel *assigned_dev = dev_id;
>> +
>> + if (!(assigned_dev->flags & KVM_DEV_ASSIGN_PCI_2_3)) {
>> + spin_lock_irq(&assigned_dev->intx_lock);
>> disable_irq_nosync(irq);
>> assigned_dev->host_irq_disabled = true;
>> - spin_unlock(&assigned_dev->intx_lock);
>> + spin_unlock_irq(&assigned_dev->intx_lock);
>> }
>>
>> - kvm_set_irq(assigned_dev->kvm, assigned_dev->irq_source_id,
>> - assigned_dev->guest_irq, 1);
>> + kvm_assigned_dev_raise_guest_irq(assigned_dev,
>> + assigned_dev->guest_irq);
>
> This is the only user of the intx branch and we could avoid the
> irq_requested_type check from this call path. The MSI paths can call
> kvm_set_irq directly. A nice code consolidation, but probably trumped
> by a slightly shorter code path.
I see. Possibly, this code became that strange during the various
refactorings. Will have a look.
>
> BTW, I already updated vfio to the INTx check and mask API, it's a great
> cleanup.
Cool!
>
>> +
>> + return IRQ_HANDLED;
>> +}
>> +
>> +#ifdef __KVM_HAVE_MSI
>> +static irqreturn_t kvm_assigned_dev_thread_msi(int irq, void *dev_id)
>> +{
>> + struct kvm_assigned_dev_kernel *assigned_dev = dev_id;
>> +
>> + kvm_assigned_dev_raise_guest_irq(assigned_dev,
>> + assigned_dev->guest_irq);
>>
>> return IRQ_HANDLED;
>> }
>> +#endif
>>
>> #ifdef __KVM_HAVE_MSIX
>> static irqreturn_t kvm_assigned_dev_thread_msix(int irq, void *dev_id)
>> @@ -83,8 +127,7 @@ static irqreturn_t kvm_assigned_dev_thread_msix(int irq, void *dev_id)
>>
>> if (index >= 0) {
>> vector = assigned_dev->guest_msix_entries[index].vector;
>> - kvm_set_irq(assigned_dev->kvm, assigned_dev->irq_source_id,
>> - vector, 1);
>> + kvm_assigned_dev_raise_guest_irq(assigned_dev, vector);
>> }
>>
>> return IRQ_HANDLED;
>> @@ -100,15 +143,31 @@ static void kvm_assigned_dev_ack_irq(struct kvm_irq_ack_notifier *kian)
>>
>> kvm_set_irq(dev->kvm, dev->irq_source_id, dev->guest_irq, 0);
>>
>> - /* The guest irq may be shared so this ack may be
>> - * from another device.
>> - */
>> - spin_lock(&dev->intx_lock);
>> - if (dev->host_irq_disabled) {
>> - enable_irq(dev->host_irq);
>> - dev->host_irq_disabled = false;
>> + mutex_lock(&dev->intx_mask_lock);
>> +
>> + if (!(dev->flags & KVM_DEV_ASSIGN_MASK_INTX)) {
>> + bool reassert = false;
>> +
>> + spin_lock_irq(&dev->intx_lock);
>> + /*
>> + * The guest IRQ may be shared so this ack can come from an
>> + * IRQ for another guest device.
>> + */
>> + if (dev->host_irq_disabled) {
>> + if (!(dev->flags & KVM_DEV_ASSIGN_PCI_2_3))
>> + enable_irq(dev->host_irq);
>> + else if (!pci_check_and_unmask_intx(dev->dev))
>> + reassert = true;
>> + dev->host_irq_disabled = reassert;
>> + }
>> + spin_unlock_irq(&dev->intx_lock);
>> +
>> + if (reassert)
>> + kvm_set_irq(dev->kvm, dev->irq_source_id,
>> + dev->guest_irq, 1);
>> }
>> - spin_unlock(&dev->intx_lock);
>> +
>> + mutex_unlock(&dev->intx_mask_lock);
>> }
>>
>> static void deassign_guest_irq(struct kvm *kvm,
>> @@ -156,7 +215,13 @@ static void deassign_host_irq(struct kvm *kvm,
>> pci_disable_msix(assigned_dev->dev);
>> } else {
>> /* Deal with MSI and INTx */
>> - disable_irq(assigned_dev->host_irq);
>> + if (assigned_dev->flags & KVM_DEV_ASSIGN_PCI_2_3) {
>> + spin_lock_irq(&assigned_dev->intx_lock);
>> + pci_intx(assigned_dev->dev, false);
>> + spin_unlock_irq(&assigned_dev->intx_lock);
>> + synchronize_irq(assigned_dev->host_irq);
>> + } else
>> + disable_irq(assigned_dev->host_irq);
>>
>> free_irq(assigned_dev->host_irq, assigned_dev);
>>
>> @@ -237,15 +302,34 @@ void kvm_free_all_assigned_devices(struct kvm *kvm)
>> static int assigned_device_enable_host_intx(struct kvm *kvm,
>> struct kvm_assigned_dev_kernel *dev)
>> {
>> + irq_handler_t irq_handler;
>> + unsigned long flags;
>> +
>> dev->host_irq = dev->dev->irq;
>> - /* Even though this is PCI, we don't want to use shared
>> - * interrupts. Sharing host devices with guest-assigned devices
>> - * on the same interrupt line is not a happy situation: there
>> - * are going to be long delays in accepting, acking, etc.
>> +
>> + /*
>> + * We can only share the IRQ line with other host devices if we are
>> + * able to disable the IRQ source at device-level - independently of
>> + * the guest driver. Otherwise host devices may suffer from unbounded
>> + * IRQ latencies when the guest keeps the line asserted.
>> */
>> - if (request_threaded_irq(dev->host_irq, NULL, kvm_assigned_dev_thread,
>> - IRQF_ONESHOT, dev->irq_name, dev))
>> + if (dev->flags & KVM_DEV_ASSIGN_PCI_2_3) {
>> + irq_handler = kvm_assigned_dev_intx;
>> + flags = IRQF_SHARED;
>> + } else {
>> + irq_handler = NULL;
>> + flags = IRQF_ONESHOT;
>> + }
>> + if (request_threaded_irq(dev->host_irq, irq_handler,
>> + kvm_assigned_dev_thread_intx, flags,
>> + dev->irq_name, dev))
>> return -EIO;
>> +
>> + if (dev->flags & KVM_DEV_ASSIGN_PCI_2_3) {
>> + spin_lock_irq(&dev->intx_lock);
>> + pci_intx(dev->dev, true);
>> + spin_unlock_irq(&dev->intx_lock);
>> + }
>> return 0;
>> }
>>
>> @@ -262,8 +346,9 @@ static int assigned_device_enable_host_msi(struct kvm *kvm,
>> }
>>
>> dev->host_irq = dev->dev->irq;
>> - if (request_threaded_irq(dev->host_irq, NULL, kvm_assigned_dev_thread,
>> - 0, dev->irq_name, dev)) {
>> + if (request_threaded_irq(dev->host_irq, NULL,
>> + kvm_assigned_dev_thread_msi, 0,
>> + dev->irq_name, dev)) {
>> pci_disable_msi(dev->dev);
>> return -EIO;
>> }
>> @@ -321,7 +406,6 @@ static int assigned_device_enable_guest_msi(struct kvm *kvm,
>> {
>> dev->guest_irq = irq->guest_irq;
>> dev->ack_notifier.gsi = -1;
>> - dev->host_irq_disabled = false;
>> return 0;
>> }
>> #endif
>> @@ -333,7 +417,6 @@ static int assigned_device_enable_guest_msix(struct kvm *kvm,
>> {
>> dev->guest_irq = irq->guest_irq;
>> dev->ack_notifier.gsi = -1;
>> - dev->host_irq_disabled = false;
>> return 0;
>> }
>> #endif
>> @@ -367,6 +450,7 @@ static int assign_host_irq(struct kvm *kvm,
>> default:
>> r = -EINVAL;
>> }
>> + dev->host_irq_disabled = false;
>>
>> if (!r)
>> dev->irq_requested_type |= host_irq_type;
>> @@ -468,6 +552,7 @@ static int kvm_vm_ioctl_deassign_dev_irq(struct kvm *kvm,
>> {
>> int r = -ENODEV;
>> struct kvm_assigned_dev_kernel *match;
>> + unsigned long irq_type;
>>
>> mutex_lock(&kvm->lock);
>>
>> @@ -476,7 +561,9 @@ static int kvm_vm_ioctl_deassign_dev_irq(struct kvm *kvm,
>> if (!match)
>> goto out;
>>
>> - r = kvm_deassign_irq(kvm, match, assigned_irq->flags);
>> + irq_type = assigned_irq->flags & (KVM_DEV_IRQ_HOST_MASK |
>> + KVM_DEV_IRQ_GUEST_MASK);
>> + r = kvm_deassign_irq(kvm, match, irq_type);
>> out:
>> mutex_unlock(&kvm->lock);
>> return r;
>> @@ -609,6 +696,10 @@ static int kvm_vm_ioctl_assign_device(struct kvm *kvm,
>> if (!match->pci_saved_state)
>> printk(KERN_DEBUG "%s: Couldn't store %s saved state\n",
>> __func__, dev_name(&dev->dev));
>> +
>> + if (!pci_intx_mask_supported(dev))
>> + assigned_dev->flags &= ~KVM_DEV_ASSIGN_PCI_2_3;
>> +
>> match->assigned_dev_id = assigned_dev->assigned_dev_id;
>> match->host_segnr = assigned_dev->segnr;
>> match->host_busnr = assigned_dev->busnr;
>> @@ -616,6 +707,7 @@ static int kvm_vm_ioctl_assign_device(struct kvm *kvm,
>> match->flags = assigned_dev->flags;
>> match->dev = dev;
>> spin_lock_init(&match->intx_lock);
>> + mutex_init(&match->intx_mask_lock);
>> match->irq_source_id = -1;
>> match->kvm = kvm;
>> match->ack_notifier.irq_acked = kvm_assigned_dev_ack_irq;
>> @@ -761,6 +853,56 @@ msix_entry_out:
>> }
>> #endif
>>
>> +static int kvm_vm_ioctl_set_pci_irq_mask(struct kvm *kvm,
>> + struct kvm_assigned_pci_dev *assigned_dev)
>> +{
>> + int r = 0;
>> + struct kvm_assigned_dev_kernel *match;
>> +
>> + mutex_lock(&kvm->lock);
>> +
>> + match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
>> + assigned_dev->assigned_dev_id);
>> + if (!match) {
>> + r = -ENODEV;
>> + goto out;
>> + }
>> +
>> + mutex_lock(&match->intx_mask_lock);
>> +
>> + match->flags &= ~KVM_DEV_ASSIGN_MASK_INTX;
>> + match->flags |= assigned_dev->flags & KVM_DEV_ASSIGN_MASK_INTX;
>> +
>> + if (match->irq_requested_type & KVM_DEV_IRQ_GUEST_INTX) {
>> + if (assigned_dev->flags & KVM_DEV_ASSIGN_MASK_INTX) {
>> + kvm_set_irq(match->kvm, match->irq_source_id,
>> + match->guest_irq, 0);
>> + /*
>> + * Masking at hardware-level is performed on demand,
>> + * i.e. when an IRQ actually arrives at the host.
>> + */
>
> Is there any harm in doing this synchronous to the ioctl? We're on a
> slow path here anyway since the mask is likely drive by a config space
> write.
Not sure, maybe locking. What would be the advantage of doing it
synchronously?
Thanks for the review,
Jan
[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 262 bytes --]
^ permalink raw reply [flat|nested] 20+ messages in thread* Re: [PATCH] KVM: Allow host IRQ sharing for assigned PCI 2.3 devices
2012-01-09 21:25 ` Jan Kiszka
@ 2012-01-09 22:05 ` Alex Williamson
2012-01-09 22:26 ` Jan Kiszka
2012-01-10 13:47 ` Jan Kiszka
0 siblings, 2 replies; 20+ messages in thread
From: Alex Williamson @ 2012-01-09 22:05 UTC (permalink / raw)
To: Jan Kiszka
Cc: Avi Kivity, Marcelo Tosatti, kvm, Michael S. Tsirkin,
Jesse Barnes
On Mon, 2012-01-09 at 22:25 +0100, Jan Kiszka wrote:
> On 2012-01-09 20:45, Alex Williamson wrote:
> > On Mon, 2012-01-09 at 15:03 +0100, Jan Kiszka wrote:
> >> +static int kvm_vm_ioctl_set_pci_irq_mask(struct kvm *kvm,
> >> + struct kvm_assigned_pci_dev *assigned_dev)
> >> +{
> >> + int r = 0;
> >> + struct kvm_assigned_dev_kernel *match;
> >> +
> >> + mutex_lock(&kvm->lock);
> >> +
> >> + match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
> >> + assigned_dev->assigned_dev_id);
> >> + if (!match) {
> >> + r = -ENODEV;
> >> + goto out;
> >> + }
> >> +
> >> + mutex_lock(&match->intx_mask_lock);
> >> +
> >> + match->flags &= ~KVM_DEV_ASSIGN_MASK_INTX;
> >> + match->flags |= assigned_dev->flags & KVM_DEV_ASSIGN_MASK_INTX;
> >> +
> >> + if (match->irq_requested_type & KVM_DEV_IRQ_GUEST_INTX) {
> >> + if (assigned_dev->flags & KVM_DEV_ASSIGN_MASK_INTX) {
> >> + kvm_set_irq(match->kvm, match->irq_source_id,
> >> + match->guest_irq, 0);
> >> + /*
> >> + * Masking at hardware-level is performed on demand,
> >> + * i.e. when an IRQ actually arrives at the host.
> >> + */
> >
> > Is there any harm in doing this synchronous to the ioctl? We're on a
> > slow path here anyway since the mask is likely drive by a config space
> > write.
>
> Not sure, maybe locking. What would be the advantage of doing it
> synchronously?
It would just be a closer match to hardware. I'm wondering (FUD) if
there could be a case where a driver does some sensitive operations on
the device that could be interfered with if the device generates that
one last interrupt to actually disable interrupts instead of them being
disabled after setting config space. It's probably a long shot, but
doesn't seem too difficult to switch to synchronous disabling. Thanks,
Alex
^ permalink raw reply [flat|nested] 20+ messages in thread* Re: [PATCH] KVM: Allow host IRQ sharing for assigned PCI 2.3 devices
2012-01-09 22:05 ` Alex Williamson
@ 2012-01-09 22:26 ` Jan Kiszka
2012-01-10 13:47 ` Jan Kiszka
1 sibling, 0 replies; 20+ messages in thread
From: Jan Kiszka @ 2012-01-09 22:26 UTC (permalink / raw)
To: Alex Williamson
Cc: Avi Kivity, Marcelo Tosatti, kvm, Michael S. Tsirkin,
Jesse Barnes
[-- Attachment #1: Type: text/plain, Size: 2141 bytes --]
On 2012-01-09 23:05, Alex Williamson wrote:
> On Mon, 2012-01-09 at 22:25 +0100, Jan Kiszka wrote:
>> On 2012-01-09 20:45, Alex Williamson wrote:
>>> On Mon, 2012-01-09 at 15:03 +0100, Jan Kiszka wrote:
>>>> +static int kvm_vm_ioctl_set_pci_irq_mask(struct kvm *kvm,
>>>> + struct kvm_assigned_pci_dev *assigned_dev)
>>>> +{
>>>> + int r = 0;
>>>> + struct kvm_assigned_dev_kernel *match;
>>>> +
>>>> + mutex_lock(&kvm->lock);
>>>> +
>>>> + match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
>>>> + assigned_dev->assigned_dev_id);
>>>> + if (!match) {
>>>> + r = -ENODEV;
>>>> + goto out;
>>>> + }
>>>> +
>>>> + mutex_lock(&match->intx_mask_lock);
>>>> +
>>>> + match->flags &= ~KVM_DEV_ASSIGN_MASK_INTX;
>>>> + match->flags |= assigned_dev->flags & KVM_DEV_ASSIGN_MASK_INTX;
>>>> +
>>>> + if (match->irq_requested_type & KVM_DEV_IRQ_GUEST_INTX) {
>>>> + if (assigned_dev->flags & KVM_DEV_ASSIGN_MASK_INTX) {
>>>> + kvm_set_irq(match->kvm, match->irq_source_id,
>>>> + match->guest_irq, 0);
>>>> + /*
>>>> + * Masking at hardware-level is performed on demand,
>>>> + * i.e. when an IRQ actually arrives at the host.
>>>> + */
>>>
>>> Is there any harm in doing this synchronous to the ioctl? We're on a
>>> slow path here anyway since the mask is likely drive by a config space
>>> write.
>>
>> Not sure, maybe locking. What would be the advantage of doing it
>> synchronously?
>
> It would just be a closer match to hardware. I'm wondering (FUD) if
> there could be a case where a driver does some sensitive operations on
> the device that could be interfered with if the device generates that
> one last interrupt to actually disable interrupts instead of them being
> disabled after setting config space. It's probably a long shot, but
> doesn't seem too difficult to switch to synchronous disabling. Thanks,
Need to check again - but there could be a good reason here as well.
Like for kvm_assigned_dev_raise_guest_irq: it handles different guest
IRQ types while its callers have different host IRQ types. So it has its
purpose.
Jan
[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 262 bytes --]
^ permalink raw reply [flat|nested] 20+ messages in thread* Re: [PATCH] KVM: Allow host IRQ sharing for assigned PCI 2.3 devices
2012-01-09 22:05 ` Alex Williamson
2012-01-09 22:26 ` Jan Kiszka
@ 2012-01-10 13:47 ` Jan Kiszka
2012-01-10 23:41 ` Alex Williamson
1 sibling, 1 reply; 20+ messages in thread
From: Jan Kiszka @ 2012-01-10 13:47 UTC (permalink / raw)
To: Alex Williamson
Cc: Avi Kivity, Marcelo Tosatti, kvm, Michael S. Tsirkin,
Jesse Barnes
On 2012-01-09 23:05, Alex Williamson wrote:
> On Mon, 2012-01-09 at 22:25 +0100, Jan Kiszka wrote:
>> On 2012-01-09 20:45, Alex Williamson wrote:
>>> On Mon, 2012-01-09 at 15:03 +0100, Jan Kiszka wrote:
>>>> +static int kvm_vm_ioctl_set_pci_irq_mask(struct kvm *kvm,
>>>> + struct kvm_assigned_pci_dev *assigned_dev)
>>>> +{
>>>> + int r = 0;
>>>> + struct kvm_assigned_dev_kernel *match;
>>>> +
>>>> + mutex_lock(&kvm->lock);
>>>> +
>>>> + match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
>>>> + assigned_dev->assigned_dev_id);
>>>> + if (!match) {
>>>> + r = -ENODEV;
>>>> + goto out;
>>>> + }
>>>> +
>>>> + mutex_lock(&match->intx_mask_lock);
>>>> +
>>>> + match->flags &= ~KVM_DEV_ASSIGN_MASK_INTX;
>>>> + match->flags |= assigned_dev->flags & KVM_DEV_ASSIGN_MASK_INTX;
>>>> +
>>>> + if (match->irq_requested_type & KVM_DEV_IRQ_GUEST_INTX) {
>>>> + if (assigned_dev->flags & KVM_DEV_ASSIGN_MASK_INTX) {
>>>> + kvm_set_irq(match->kvm, match->irq_source_id,
>>>> + match->guest_irq, 0);
>>>> + /*
>>>> + * Masking at hardware-level is performed on demand,
>>>> + * i.e. when an IRQ actually arrives at the host.
>>>> + */
>>>
>>> Is there any harm in doing this synchronous to the ioctl? We're on a
>>> slow path here anyway since the mask is likely drive by a config space
>>> write.
>>
>> Not sure, maybe locking. What would be the advantage of doing it
>> synchronously?
>
> It would just be a closer match to hardware. I'm wondering (FUD) if
> there could be a case where a driver does some sensitive operations on
> the device that could be interfered with if the device generates that
> one last interrupt to actually disable interrupts instead of them being
> disabled after setting config space.
The guest driver will never see such an interrupt as we will notice on
its arrival that there is some mask pending.
> It's probably a long shot, but
> doesn't seem too difficult to switch to synchronous disabling.
It is a bit as we have no PCI API in place to implement this. We only
have check-and-mask which does not mask if there is no IRQ raised. How
do you handle this in VFIO so far?
Really, I do not see an urgent need for synchronous masking and would
rather refrain from it until we are aware of a real problem with
asynchronous one as implemented here.
Jan
--
Siemens AG, Corporate Technology, CT T DE IT 1
Corporate Competence Center Embedded Linux
^ permalink raw reply [flat|nested] 20+ messages in thread* Re: [PATCH] KVM: Allow host IRQ sharing for assigned PCI 2.3 devices
2012-01-10 13:47 ` Jan Kiszka
@ 2012-01-10 23:41 ` Alex Williamson
2012-01-11 9:47 ` Michael S. Tsirkin
0 siblings, 1 reply; 20+ messages in thread
From: Alex Williamson @ 2012-01-10 23:41 UTC (permalink / raw)
To: Jan Kiszka
Cc: Avi Kivity, Marcelo Tosatti, kvm, Michael S. Tsirkin,
Jesse Barnes
On Tue, 2012-01-10 at 14:47 +0100, Jan Kiszka wrote:
> On 2012-01-09 23:05, Alex Williamson wrote:
> > On Mon, 2012-01-09 at 22:25 +0100, Jan Kiszka wrote:
> >> On 2012-01-09 20:45, Alex Williamson wrote:
> >>> On Mon, 2012-01-09 at 15:03 +0100, Jan Kiszka wrote:
> >>>> +static int kvm_vm_ioctl_set_pci_irq_mask(struct kvm *kvm,
> >>>> + struct kvm_assigned_pci_dev *assigned_dev)
> >>>> +{
> >>>> + int r = 0;
> >>>> + struct kvm_assigned_dev_kernel *match;
> >>>> +
> >>>> + mutex_lock(&kvm->lock);
> >>>> +
> >>>> + match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
> >>>> + assigned_dev->assigned_dev_id);
> >>>> + if (!match) {
> >>>> + r = -ENODEV;
> >>>> + goto out;
> >>>> + }
> >>>> +
> >>>> + mutex_lock(&match->intx_mask_lock);
> >>>> +
> >>>> + match->flags &= ~KVM_DEV_ASSIGN_MASK_INTX;
> >>>> + match->flags |= assigned_dev->flags & KVM_DEV_ASSIGN_MASK_INTX;
> >>>> +
> >>>> + if (match->irq_requested_type & KVM_DEV_IRQ_GUEST_INTX) {
> >>>> + if (assigned_dev->flags & KVM_DEV_ASSIGN_MASK_INTX) {
> >>>> + kvm_set_irq(match->kvm, match->irq_source_id,
> >>>> + match->guest_irq, 0);
> >>>> + /*
> >>>> + * Masking at hardware-level is performed on demand,
> >>>> + * i.e. when an IRQ actually arrives at the host.
> >>>> + */
> >>>
> >>> Is there any harm in doing this synchronous to the ioctl? We're on a
> >>> slow path here anyway since the mask is likely drive by a config space
> >>> write.
> >>
> >> Not sure, maybe locking. What would be the advantage of doing it
> >> synchronously?
> >
> > It would just be a closer match to hardware. I'm wondering (FUD) if
> > there could be a case where a driver does some sensitive operations on
> > the device that could be interfered with if the device generates that
> > one last interrupt to actually disable interrupts instead of them being
> > disabled after setting config space.
>
> The guest driver will never see such an interrupt as we will notice on
> its arrival that there is some mask pending.
Right, I was thinking more about the affect at the hardware level.
> > It's probably a long shot, but
> > doesn't seem too difficult to switch to synchronous disabling.
>
> It is a bit as we have no PCI API in place to implement this. We only
> have check-and-mask which does not mask if there is no IRQ raised.
Can't we use either pci_intx(pdev, 0) for pci 2.3 devices or for older
devices disable the interrupt handler (if configured) with
disable_irq_nosync(pdev->irq).
> How
> do you handle this in VFIO so far?
As suggested above, but I just fixed it, will checkin soon.
> Really, I do not see an urgent need for synchronous masking and would
> rather refrain from it until we are aware of a real problem with
> asynchronous one as implemented here.
Fair enough. Thanks,
Alex
^ permalink raw reply [flat|nested] 20+ messages in thread* Re: [PATCH] KVM: Allow host IRQ sharing for assigned PCI 2.3 devices
2012-01-10 23:41 ` Alex Williamson
@ 2012-01-11 9:47 ` Michael S. Tsirkin
0 siblings, 0 replies; 20+ messages in thread
From: Michael S. Tsirkin @ 2012-01-11 9:47 UTC (permalink / raw)
To: Alex Williamson
Cc: Jan Kiszka, Avi Kivity, Marcelo Tosatti, kvm, Jesse Barnes
On Tue, Jan 10, 2012 at 04:41:50PM -0700, Alex Williamson wrote:
> > The guest driver will never see such an interrupt as we will notice on
> > its arrival that there is some mask pending.
>
> Right, I was thinking more about the affect at the hardware level.
In theory a broken device might assume that intx disable
bit is correlated with internal device registers somehow.
However, the current sharing approach won't work for such
a device anyway as host controls the status bit while
guest controls the rest of the device. So I think we don't care.
--
MST
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH] KVM: Allow host IRQ sharing for assigned PCI 2.3 devices
2012-01-09 14:03 [PATCH] KVM: Allow host IRQ sharing for assigned PCI 2.3 devices Jan Kiszka
2012-01-09 19:45 ` Alex Williamson
@ 2012-01-10 16:17 ` Michael S. Tsirkin
2012-01-10 17:29 ` Jan Kiszka
2012-01-12 15:49 ` [PATCH v2] " Jan Kiszka
2 siblings, 1 reply; 20+ messages in thread
From: Michael S. Tsirkin @ 2012-01-10 16:17 UTC (permalink / raw)
To: Jan Kiszka
Cc: Avi Kivity, Marcelo Tosatti, kvm, Alex Williamson, Jesse Barnes
On Mon, Jan 09, 2012 at 03:03:00PM +0100, Jan Kiszka wrote:
> PCI 2.3 allows to generically disable IRQ sources at device level. This
> enables us to share legacy IRQs of such devices with other host devices
> when passing them to a guest.
>
> The new IRQ sharing feature introduced here is optional, user space has
> to request it explicitly. Moreover, user space can inform us about its
> view of PCI_COMMAND_INTX_DISABLE so that we can avoid unmasking the
> interrupt and signaling it if the guest masked it via the virtualized
> PCI config space.
>
> Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
> ---
>
> This applies to kvm/master after merging
>
> PCI: Rework config space blocking services
> PCI: Introduce INTx check & mask API
>
> from current linux-next/master. I suppose those two will make it into
> 3.3.
>
> To recall the history of it: I tried hard to implement an adaptive
> solution that automatically picks the fastest masking technique whenever
> possible. However, the changes required to the IRQ core subsystem and
> the logic of the device assignment code became so complex and partly
> ugly that I gave up on this. It's simply not worth the pain given that
> legacy PCI interrupts are rarely raised for performance critical device
> at such a high rate (KHz...) that you can measure the difference.
>
> Documentation/virtual/kvm/api.txt | 27 +++++
> arch/x86/kvm/x86.c | 1 +
> include/linux/kvm.h | 6 +
> include/linux/kvm_host.h | 2 +
> virt/kvm/assigned-dev.c | 208 +++++++++++++++++++++++++++++++-----
> 5 files changed, 215 insertions(+), 29 deletions(-)
>
> diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
> index e1d94bf..670015a 100644
> --- a/Documentation/virtual/kvm/api.txt
> +++ b/Documentation/virtual/kvm/api.txt
> @@ -1159,6 +1159,14 @@ following flags are specified:
>
> /* Depends on KVM_CAP_IOMMU */
> #define KVM_DEV_ASSIGN_ENABLE_IOMMU (1 << 0)
> +/* The following two depend on KVM_CAP_PCI_2_3 */
> +#define KVM_DEV_ASSIGN_PCI_2_3 (1 << 1)
> +#define KVM_DEV_ASSIGN_MASK_INTX (1 << 2)
> +
> +If KVM_DEV_ASSIGN_PCI_2_3 is set, the kernel will manage legacy INTx interrupts
> +via the PCI-2.3-compliant device-level mask, thus enable IRQ sharing with other
> +assigned devices or host devices. KVM_DEV_ASSIGN_MASK_INTX specifies the
> +guest's view on the INTx mask, see KVM_ASSIGN_SET_INTX_MASK for details.
>
> The KVM_DEV_ASSIGN_ENABLE_IOMMU flag is a mandatory option to ensure
> isolation of the device. Usages not specifying this flag are deprecated.
> @@ -1399,6 +1407,25 @@ The following flags are defined:
> If datamatch flag is set, the event will be signaled only if the written value
> to the registered address is equal to datamatch in struct kvm_ioeventfd.
>
> +4.59 KVM_ASSIGN_SET_INTX_MASK
> +
> +Capability: KVM_CAP_PCI_2_3
> +Architectures: x86
> +Type: vm ioctl
> +Parameters: struct kvm_assigned_pci_dev (in)
> +Returns: 0 on success, -1 on error
> +
> +Informs the kernel about the guest's view on the INTx mask.
A wild idea: since this is guest view of its IRQ,
can this be specified per guest IRQ+id then?
That might be useful to support MSIX mask bit emulation.
> As long as the
> +guest masks the legacy INTx, the kernel will refrain from unmasking it at
> +hardware level and will not assert the guest's IRQ line. User space is still
> +responsible for applying this state to the assigned device's real config space.
Can this be made more explicit? You mean writing into 1st
byte of PCI control, right?
> +To avoid that the kernel overwrites the state user space wants to set,
> +KVM_ASSIGN_SET_INTX_MASK has to be called prior to updating the config space.
This looks like a strange requirement, could you explain how
this helps avoid races? This also raises questions about
what should be done to write a bit unrelated to masking.
> +
> +See KVM_ASSIGN_DEV_IRQ for the data structure. The target device is specified
> +by assigned_dev_id. In the flags field, only KVM_DEV_ASSIGN_MASK_INTX is
> +evaluated.
> +
> 4.62 KVM_CREATE_SPAPR_TCE
>
> Capability: KVM_CAP_SPAPR_TCE
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index 1171def..9381806 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -2057,6 +2057,7 @@ int kvm_dev_ioctl_check_extension(long ext)
> case KVM_CAP_XSAVE:
> case KVM_CAP_ASYNC_PF:
> case KVM_CAP_GET_TSC_KHZ:
> + case KVM_CAP_PCI_2_3:
> r = 1;
> break;
> case KVM_CAP_COALESCED_MMIO:
> diff --git a/include/linux/kvm.h b/include/linux/kvm.h
> index 68e67e5..da5f7b7 100644
> --- a/include/linux/kvm.h
> +++ b/include/linux/kvm.h
> @@ -556,6 +556,7 @@ struct kvm_ppc_pvinfo {
> #define KVM_CAP_PPC_RMA 65
> #define KVM_CAP_MAX_VCPUS 66 /* returns max vcpus per vm */
> #define KVM_CAP_PPC_PAPR 68
> +#define KVM_CAP_PCI_2_3 69
> #define KVM_CAP_S390_GMAP 71
> #define KVM_CAP_TSC_DEADLINE_TIMER 72
>
> @@ -697,6 +698,9 @@ struct kvm_clock_data {
> /* Available with KVM_CAP_TSC_CONTROL */
> #define KVM_SET_TSC_KHZ _IO(KVMIO, 0xa2)
> #define KVM_GET_TSC_KHZ _IO(KVMIO, 0xa3)
> +/* Available with KVM_CAP_PCI_2_3 */
> +#define KVM_ASSIGN_SET_INTX_MASK _IOW(KVMIO, 0xa4, \
> + struct kvm_assigned_pci_dev)
>
> /*
> * ioctls for vcpu fds
> @@ -765,6 +769,8 @@ struct kvm_clock_data {
> #define KVM_ALLOCATE_RMA _IOR(KVMIO, 0xa9, struct kvm_allocate_rma)
>
> #define KVM_DEV_ASSIGN_ENABLE_IOMMU (1 << 0)
> +#define KVM_DEV_ASSIGN_PCI_2_3 (1 << 1)
> +#define KVM_DEV_ASSIGN_MASK_INTX (1 << 2)
>
> struct kvm_assigned_pci_dev {
> __u32 assigned_dev_id;
> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> index 900c763..07461bd 100644
> --- a/include/linux/kvm_host.h
> +++ b/include/linux/kvm_host.h
> @@ -546,6 +546,7 @@ struct kvm_assigned_dev_kernel {
> unsigned int entries_nr;
> int host_irq;
> bool host_irq_disabled;
> + bool pci_2_3;
> struct msix_entry *host_msix_entries;
> int guest_irq;
> struct msix_entry *guest_msix_entries;
> @@ -555,6 +556,7 @@ struct kvm_assigned_dev_kernel {
> struct pci_dev *dev;
> struct kvm *kvm;
> spinlock_t intx_lock;
> + struct mutex intx_mask_lock;
What exactly does this lock protect?
I see it used sometimes with intx_lock and sometimes without.
> char irq_name[32];
> struct pci_saved_state *pci_saved_state;
> };
> diff --git a/virt/kvm/assigned-dev.c b/virt/kvm/assigned-dev.c
> index 758e3b3..b35aba9 100644
> --- a/virt/kvm/assigned-dev.c
> +++ b/virt/kvm/assigned-dev.c
> @@ -57,22 +57,66 @@ static int find_index_from_host_irq(struct kvm_assigned_dev_kernel
> return index;
> }
>
> -static irqreturn_t kvm_assigned_dev_thread(int irq, void *dev_id)
> +static irqreturn_t kvm_assigned_dev_intx(int irq, void *dev_id)
> {
> struct kvm_assigned_dev_kernel *assigned_dev = dev_id;
> + int ret;
>
> - if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_INTX) {
> - spin_lock(&assigned_dev->intx_lock);
> + spin_lock(&assigned_dev->intx_lock);
> + if (pci_check_and_mask_intx(assigned_dev->dev)) {
> + assigned_dev->host_irq_disabled = true;
> + ret = IRQ_WAKE_THREAD;
> + } else
> + ret = IRQ_NONE;
> + spin_unlock(&assigned_dev->intx_lock);
> +
> + return ret;
> +}
> +
> +static void
> +kvm_assigned_dev_raise_guest_irq(struct kvm_assigned_dev_kernel *assigned_dev,
> + int vector)
> +{
> + if (unlikely(assigned_dev->irq_requested_type &
> + KVM_DEV_IRQ_GUEST_INTX)) {
> + mutex_lock(&assigned_dev->intx_mask_lock);
> + if (!(assigned_dev->flags & KVM_DEV_ASSIGN_MASK_INTX))
> + kvm_set_irq(assigned_dev->kvm,
> + assigned_dev->irq_source_id, vector, 1);
> + mutex_unlock(&assigned_dev->intx_mask_lock);
> + } else
> + kvm_set_irq(assigned_dev->kvm, assigned_dev->irq_source_id,
> + vector, 1);
> +}
> +
> +static irqreturn_t kvm_assigned_dev_thread_intx(int irq, void *dev_id)
> +{
> + struct kvm_assigned_dev_kernel *assigned_dev = dev_id;
> +
> + if (!(assigned_dev->flags & KVM_DEV_ASSIGN_PCI_2_3)) {
> + spin_lock_irq(&assigned_dev->intx_lock);
> disable_irq_nosync(irq);
> assigned_dev->host_irq_disabled = true;
> - spin_unlock(&assigned_dev->intx_lock);
> + spin_unlock_irq(&assigned_dev->intx_lock);
> }
>
> - kvm_set_irq(assigned_dev->kvm, assigned_dev->irq_source_id,
> - assigned_dev->guest_irq, 1);
> + kvm_assigned_dev_raise_guest_irq(assigned_dev,
> + assigned_dev->guest_irq);
> +
> + return IRQ_HANDLED;
> +}
> +
> +#ifdef __KVM_HAVE_MSI
> +static irqreturn_t kvm_assigned_dev_thread_msi(int irq, void *dev_id)
> +{
> + struct kvm_assigned_dev_kernel *assigned_dev = dev_id;
> +
> + kvm_assigned_dev_raise_guest_irq(assigned_dev,
> + assigned_dev->guest_irq);
>
> return IRQ_HANDLED;
> }
> +#endif
>
> #ifdef __KVM_HAVE_MSIX
> static irqreturn_t kvm_assigned_dev_thread_msix(int irq, void *dev_id)
> @@ -83,8 +127,7 @@ static irqreturn_t kvm_assigned_dev_thread_msix(int irq, void *dev_id)
>
> if (index >= 0) {
> vector = assigned_dev->guest_msix_entries[index].vector;
> - kvm_set_irq(assigned_dev->kvm, assigned_dev->irq_source_id,
> - vector, 1);
> + kvm_assigned_dev_raise_guest_irq(assigned_dev, vector);
> }
>
> return IRQ_HANDLED;
> @@ -100,15 +143,31 @@ static void kvm_assigned_dev_ack_irq(struct kvm_irq_ack_notifier *kian)
>
> kvm_set_irq(dev->kvm, dev->irq_source_id, dev->guest_irq, 0);
>
> - /* The guest irq may be shared so this ack may be
> - * from another device.
> - */
> - spin_lock(&dev->intx_lock);
> - if (dev->host_irq_disabled) {
> - enable_irq(dev->host_irq);
> - dev->host_irq_disabled = false;
> + mutex_lock(&dev->intx_mask_lock);
> +
> + if (!(dev->flags & KVM_DEV_ASSIGN_MASK_INTX)) {
> + bool reassert = false;
> +
> + spin_lock_irq(&dev->intx_lock);
> + /*
> + * The guest IRQ may be shared so this ack can come from an
> + * IRQ for another guest device.
> + */
> + if (dev->host_irq_disabled) {
> + if (!(dev->flags & KVM_DEV_ASSIGN_PCI_2_3))
> + enable_irq(dev->host_irq);
> + else if (!pci_check_and_unmask_intx(dev->dev))
> + reassert = true;
> + dev->host_irq_disabled = reassert;
> + }
> + spin_unlock_irq(&dev->intx_lock);
> +
> + if (reassert)
> + kvm_set_irq(dev->kvm, dev->irq_source_id,
> + dev->guest_irq, 1);
> }
> - spin_unlock(&dev->intx_lock);
> +
> + mutex_unlock(&dev->intx_mask_lock);
> }
>
> static void deassign_guest_irq(struct kvm *kvm,
> @@ -156,7 +215,13 @@ static void deassign_host_irq(struct kvm *kvm,
> pci_disable_msix(assigned_dev->dev);
> } else {
> /* Deal with MSI and INTx */
> - disable_irq(assigned_dev->host_irq);
> + if (assigned_dev->flags & KVM_DEV_ASSIGN_PCI_2_3) {
> + spin_lock_irq(&assigned_dev->intx_lock);
Interesting that the new mutex is not used here.
> + pci_intx(assigned_dev->dev, false);
Interesting.
This will leave the device with interrupts masked.
Do we reset it later?
Maybe we want a comment explaining why it's done.
> + spin_unlock_irq(&assigned_dev->intx_lock);
> + synchronize_irq(assigned_dev->host_irq);
> + } else
> + disable_irq(assigned_dev->host_irq);
>
> free_irq(assigned_dev->host_irq, assigned_dev);
>
> @@ -237,15 +302,34 @@ void kvm_free_all_assigned_devices(struct kvm *kvm)
> static int assigned_device_enable_host_intx(struct kvm *kvm,
> struct kvm_assigned_dev_kernel *dev)
> {
> + irq_handler_t irq_handler;
> + unsigned long flags;
> +
> dev->host_irq = dev->dev->irq;
> - /* Even though this is PCI, we don't want to use shared
> - * interrupts. Sharing host devices with guest-assigned devices
> - * on the same interrupt line is not a happy situation: there
> - * are going to be long delays in accepting, acking, etc.
> +
> + /*
> + * We can only share the IRQ line with other host devices if we are
> + * able to disable the IRQ source at device-level - independently of
> + * the guest driver. Otherwise host devices may suffer from unbounded
> + * IRQ latencies when the guest keeps the line asserted.
> */
> - if (request_threaded_irq(dev->host_irq, NULL, kvm_assigned_dev_thread,
> - IRQF_ONESHOT, dev->irq_name, dev))
> + if (dev->flags & KVM_DEV_ASSIGN_PCI_2_3) {
> + irq_handler = kvm_assigned_dev_intx;
> + flags = IRQF_SHARED;
> + } else {
> + irq_handler = NULL;
> + flags = IRQF_ONESHOT;
> + }
> + if (request_threaded_irq(dev->host_irq, irq_handler,
> + kvm_assigned_dev_thread_intx, flags,
> + dev->irq_name, dev))
> return -EIO;
> +
> + if (dev->flags & KVM_DEV_ASSIGN_PCI_2_3) {
> + spin_lock_irq(&dev->intx_lock);
> + pci_intx(dev->dev, true);
> + spin_unlock_irq(&dev->intx_lock);
> + }
> return 0;
> }
>
> @@ -262,8 +346,9 @@ static int assigned_device_enable_host_msi(struct kvm *kvm,
> }
>
> dev->host_irq = dev->dev->irq;
> - if (request_threaded_irq(dev->host_irq, NULL, kvm_assigned_dev_thread,
> - 0, dev->irq_name, dev)) {
> + if (request_threaded_irq(dev->host_irq, NULL,
> + kvm_assigned_dev_thread_msi, 0,
> + dev->irq_name, dev)) {
> pci_disable_msi(dev->dev);
> return -EIO;
> }
> @@ -321,7 +406,6 @@ static int assigned_device_enable_guest_msi(struct kvm *kvm,
> {
> dev->guest_irq = irq->guest_irq;
> dev->ack_notifier.gsi = -1;
> - dev->host_irq_disabled = false;
> return 0;
> }
> #endif
> @@ -333,7 +417,6 @@ static int assigned_device_enable_guest_msix(struct kvm *kvm,
> {
> dev->guest_irq = irq->guest_irq;
> dev->ack_notifier.gsi = -1;
> - dev->host_irq_disabled = false;
> return 0;
> }
> #endif
> @@ -367,6 +450,7 @@ static int assign_host_irq(struct kvm *kvm,
> default:
> r = -EINVAL;
> }
> + dev->host_irq_disabled = false;
>
> if (!r)
> dev->irq_requested_type |= host_irq_type;
> @@ -468,6 +552,7 @@ static int kvm_vm_ioctl_deassign_dev_irq(struct kvm *kvm,
> {
> int r = -ENODEV;
> struct kvm_assigned_dev_kernel *match;
> + unsigned long irq_type;
>
> mutex_lock(&kvm->lock);
>
> @@ -476,7 +561,9 @@ static int kvm_vm_ioctl_deassign_dev_irq(struct kvm *kvm,
> if (!match)
> goto out;
>
> - r = kvm_deassign_irq(kvm, match, assigned_irq->flags);
> + irq_type = assigned_irq->flags & (KVM_DEV_IRQ_HOST_MASK |
> + KVM_DEV_IRQ_GUEST_MASK);
> + r = kvm_deassign_irq(kvm, match, irq_type);
> out:
> mutex_unlock(&kvm->lock);
> return r;
> @@ -609,6 +696,10 @@ static int kvm_vm_ioctl_assign_device(struct kvm *kvm,
> if (!match->pci_saved_state)
> printk(KERN_DEBUG "%s: Couldn't store %s saved state\n",
> __func__, dev_name(&dev->dev));
> +
> + if (!pci_intx_mask_supported(dev))
> + assigned_dev->flags &= ~KVM_DEV_ASSIGN_PCI_2_3;
> +
> match->assigned_dev_id = assigned_dev->assigned_dev_id;
> match->host_segnr = assigned_dev->segnr;
> match->host_busnr = assigned_dev->busnr;
> @@ -616,6 +707,7 @@ static int kvm_vm_ioctl_assign_device(struct kvm *kvm,
> match->flags = assigned_dev->flags;
> match->dev = dev;
> spin_lock_init(&match->intx_lock);
> + mutex_init(&match->intx_mask_lock);
> match->irq_source_id = -1;
> match->kvm = kvm;
> match->ack_notifier.irq_acked = kvm_assigned_dev_ack_irq;
> @@ -761,6 +853,56 @@ msix_entry_out:
> }
> #endif
>
> +static int kvm_vm_ioctl_set_pci_irq_mask(struct kvm *kvm,
> + struct kvm_assigned_pci_dev *assigned_dev)
> +{
> + int r = 0;
> + struct kvm_assigned_dev_kernel *match;
> +
> + mutex_lock(&kvm->lock);
> +
> + match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
> + assigned_dev->assigned_dev_id);
> + if (!match) {
> + r = -ENODEV;
> + goto out;
> + }
> +
> + mutex_lock(&match->intx_mask_lock);
> +
> + match->flags &= ~KVM_DEV_ASSIGN_MASK_INTX;
> + match->flags |= assigned_dev->flags & KVM_DEV_ASSIGN_MASK_INTX;
> +
> + if (match->irq_requested_type & KVM_DEV_IRQ_GUEST_INTX) {
> + if (assigned_dev->flags & KVM_DEV_ASSIGN_MASK_INTX) {
> + kvm_set_irq(match->kvm, match->irq_source_id,
> + match->guest_irq, 0);
> + /*
> + * Masking at hardware-level is performed on demand,
> + * i.e. when an IRQ actually arrives at the host.
> + */
> + } else {
> + /*
> + * Unmask the IRQ line. It may have been masked
> + * meanwhile if we aren't using PCI 2.3 INTx masking
> + * on the host side.
> + */
> + spin_lock_irq(&match->intx_lock);
> + if (match->host_irq_disabled) {
> + enable_irq(match->host_irq);
> + match->host_irq_disabled = false;
> + }
> + spin_unlock_irq(&match->intx_lock);
> + }
> + }
> +
> + mutex_unlock(&match->intx_mask_lock);
> +
> +out:
> + mutex_unlock(&kvm->lock);
> + return r;
> +}
> +
> long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl,
> unsigned long arg)
> {
> @@ -868,6 +1010,15 @@ long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl,
> break;
> }
> #endif
> + case KVM_ASSIGN_SET_INTX_MASK: {
> + struct kvm_assigned_pci_dev assigned_dev;
> +
> + r = -EFAULT;
> + if (copy_from_user(&assigned_dev, argp, sizeof assigned_dev))
> + goto out;
> + r = kvm_vm_ioctl_set_pci_irq_mask(kvm, &assigned_dev);
> + break;
> + }
> default:
> r = -ENOTTY;
> break;
> @@ -875,4 +1026,3 @@ long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl,
> out:
> return r;
> }
> -
> --
> 1.7.3.4
^ permalink raw reply [flat|nested] 20+ messages in thread* Re: [PATCH] KVM: Allow host IRQ sharing for assigned PCI 2.3 devices
2012-01-10 16:17 ` Michael S. Tsirkin
@ 2012-01-10 17:29 ` Jan Kiszka
2012-01-10 18:10 ` Michael S. Tsirkin
0 siblings, 1 reply; 20+ messages in thread
From: Jan Kiszka @ 2012-01-10 17:29 UTC (permalink / raw)
To: Michael S. Tsirkin
Cc: Avi Kivity, Marcelo Tosatti, kvm, Alex Williamson, Jesse Barnes
On 2012-01-10 17:17, Michael S. Tsirkin wrote:
> On Mon, Jan 09, 2012 at 03:03:00PM +0100, Jan Kiszka wrote:
>> PCI 2.3 allows to generically disable IRQ sources at device level. This
>> enables us to share legacy IRQs of such devices with other host devices
>> when passing them to a guest.
>>
>> The new IRQ sharing feature introduced here is optional, user space has
>> to request it explicitly. Moreover, user space can inform us about its
>> view of PCI_COMMAND_INTX_DISABLE so that we can avoid unmasking the
>> interrupt and signaling it if the guest masked it via the virtualized
>> PCI config space.
>>
>> Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
>> ---
>>
>> This applies to kvm/master after merging
>>
>> PCI: Rework config space blocking services
>> PCI: Introduce INTx check & mask API
>>
>> from current linux-next/master. I suppose those two will make it into
>> 3.3.
>>
>> To recall the history of it: I tried hard to implement an adaptive
>> solution that automatically picks the fastest masking technique whenever
>> possible. However, the changes required to the IRQ core subsystem and
>> the logic of the device assignment code became so complex and partly
>> ugly that I gave up on this. It's simply not worth the pain given that
>> legacy PCI interrupts are rarely raised for performance critical device
>> at such a high rate (KHz...) that you can measure the difference.
>>
>> Documentation/virtual/kvm/api.txt | 27 +++++
>> arch/x86/kvm/x86.c | 1 +
>> include/linux/kvm.h | 6 +
>> include/linux/kvm_host.h | 2 +
>> virt/kvm/assigned-dev.c | 208 +++++++++++++++++++++++++++++++-----
>> 5 files changed, 215 insertions(+), 29 deletions(-)
>>
>> diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
>> index e1d94bf..670015a 100644
>> --- a/Documentation/virtual/kvm/api.txt
>> +++ b/Documentation/virtual/kvm/api.txt
>> @@ -1159,6 +1159,14 @@ following flags are specified:
>>
>> /* Depends on KVM_CAP_IOMMU */
>> #define KVM_DEV_ASSIGN_ENABLE_IOMMU (1 << 0)
>> +/* The following two depend on KVM_CAP_PCI_2_3 */
>> +#define KVM_DEV_ASSIGN_PCI_2_3 (1 << 1)
>> +#define KVM_DEV_ASSIGN_MASK_INTX (1 << 2)
>> +
>> +If KVM_DEV_ASSIGN_PCI_2_3 is set, the kernel will manage legacy INTx interrupts
>> +via the PCI-2.3-compliant device-level mask, thus enable IRQ sharing with other
>> +assigned devices or host devices. KVM_DEV_ASSIGN_MASK_INTX specifies the
>> +guest's view on the INTx mask, see KVM_ASSIGN_SET_INTX_MASK for details.
>>
>> The KVM_DEV_ASSIGN_ENABLE_IOMMU flag is a mandatory option to ensure
>> isolation of the device. Usages not specifying this flag are deprecated.
>> @@ -1399,6 +1407,25 @@ The following flags are defined:
>> If datamatch flag is set, the event will be signaled only if the written value
>> to the registered address is equal to datamatch in struct kvm_ioeventfd.
>>
>> +4.59 KVM_ASSIGN_SET_INTX_MASK
>> +
>> +Capability: KVM_CAP_PCI_2_3
>> +Architectures: x86
>> +Type: vm ioctl
>> +Parameters: struct kvm_assigned_pci_dev (in)
>> +Returns: 0 on success, -1 on error
>> +
>> +Informs the kernel about the guest's view on the INTx mask.
>
> A wild idea: since this is guest view of its IRQ,
> can this be specified per guest IRQ+id then?
> That might be useful to support MSIX mask bit emulation.
I do not yet get the full idea: You want some generic
KVM_ASSIGN_SET_IRQ_MASK? What will be the use case in the MSI[X] area?
>
>> As long as the
>> +guest masks the legacy INTx, the kernel will refrain from unmasking it at
>> +hardware level and will not assert the guest's IRQ line. User space is still
>> +responsible for applying this state to the assigned device's real config space.
>
> Can this be made more explicit? You mean writing into 1st
> byte of PCI control, right?
For sure, I can state this.
>
>> +To avoid that the kernel overwrites the state user space wants to set,
>> +KVM_ASSIGN_SET_INTX_MASK has to be called prior to updating the config space.
>
> This looks like a strange requirement, could you explain how
> this helps avoid races?
By declaring the target state of the INTx bit first to the kernel,
concurrent changes of the kernel while user space performs a
read-modify-write will not lead to an old mask state being written.
> This also raises questions about
> what should be done to write a bit unrelated to masking.
Just write it, using the INTx state user space maintains. In the worst
case, some masking done by the kernel in the meantime will be
overwritten, leading to a single spurious but harmless IRQ. That event
won't be delivered to the guest unless it is ready to receive it - as we
updated the mask state prior to writing to the config space. The point
is that the kernel mechanism has to deal with crazy user space clearing
the mask for whatever reason again.
>
>> +
>> +See KVM_ASSIGN_DEV_IRQ for the data structure. The target device is specified
>> +by assigned_dev_id. In the flags field, only KVM_DEV_ASSIGN_MASK_INTX is
>> +evaluated.
>> +
>> 4.62 KVM_CREATE_SPAPR_TCE
>>
>> Capability: KVM_CAP_SPAPR_TCE
>> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
>> index 1171def..9381806 100644
>> --- a/arch/x86/kvm/x86.c
>> +++ b/arch/x86/kvm/x86.c
>> @@ -2057,6 +2057,7 @@ int kvm_dev_ioctl_check_extension(long ext)
>> case KVM_CAP_XSAVE:
>> case KVM_CAP_ASYNC_PF:
>> case KVM_CAP_GET_TSC_KHZ:
>> + case KVM_CAP_PCI_2_3:
>> r = 1;
>> break;
>> case KVM_CAP_COALESCED_MMIO:
>> diff --git a/include/linux/kvm.h b/include/linux/kvm.h
>> index 68e67e5..da5f7b7 100644
>> --- a/include/linux/kvm.h
>> +++ b/include/linux/kvm.h
>> @@ -556,6 +556,7 @@ struct kvm_ppc_pvinfo {
>> #define KVM_CAP_PPC_RMA 65
>> #define KVM_CAP_MAX_VCPUS 66 /* returns max vcpus per vm */
>> #define KVM_CAP_PPC_PAPR 68
>> +#define KVM_CAP_PCI_2_3 69
>> #define KVM_CAP_S390_GMAP 71
>> #define KVM_CAP_TSC_DEADLINE_TIMER 72
>>
>> @@ -697,6 +698,9 @@ struct kvm_clock_data {
>> /* Available with KVM_CAP_TSC_CONTROL */
>> #define KVM_SET_TSC_KHZ _IO(KVMIO, 0xa2)
>> #define KVM_GET_TSC_KHZ _IO(KVMIO, 0xa3)
>> +/* Available with KVM_CAP_PCI_2_3 */
>> +#define KVM_ASSIGN_SET_INTX_MASK _IOW(KVMIO, 0xa4, \
>> + struct kvm_assigned_pci_dev)
>>
>> /*
>> * ioctls for vcpu fds
>> @@ -765,6 +769,8 @@ struct kvm_clock_data {
>> #define KVM_ALLOCATE_RMA _IOR(KVMIO, 0xa9, struct kvm_allocate_rma)
>>
>> #define KVM_DEV_ASSIGN_ENABLE_IOMMU (1 << 0)
>> +#define KVM_DEV_ASSIGN_PCI_2_3 (1 << 1)
>> +#define KVM_DEV_ASSIGN_MASK_INTX (1 << 2)
>>
>> struct kvm_assigned_pci_dev {
>> __u32 assigned_dev_id;
>> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
>> index 900c763..07461bd 100644
>> --- a/include/linux/kvm_host.h
>> +++ b/include/linux/kvm_host.h
>> @@ -546,6 +546,7 @@ struct kvm_assigned_dev_kernel {
>> unsigned int entries_nr;
>> int host_irq;
>> bool host_irq_disabled;
>> + bool pci_2_3;
>> struct msix_entry *host_msix_entries;
>> int guest_irq;
>> struct msix_entry *guest_msix_entries;
>> @@ -555,6 +556,7 @@ struct kvm_assigned_dev_kernel {
>> struct pci_dev *dev;
>> struct kvm *kvm;
>> spinlock_t intx_lock;
>> + struct mutex intx_mask_lock;
>
> What exactly does this lock protect?
> I see it used sometimes with intx_lock and sometimes without.
The KVM_DEV_ASSIGN_MASK_INTX bit in flags.
>
>> char irq_name[32];
>> struct pci_saved_state *pci_saved_state;
>> };
>> diff --git a/virt/kvm/assigned-dev.c b/virt/kvm/assigned-dev.c
>> index 758e3b3..b35aba9 100644
>> --- a/virt/kvm/assigned-dev.c
>> +++ b/virt/kvm/assigned-dev.c
>> @@ -57,22 +57,66 @@ static int find_index_from_host_irq(struct kvm_assigned_dev_kernel
>> return index;
>> }
>>
>> -static irqreturn_t kvm_assigned_dev_thread(int irq, void *dev_id)
>> +static irqreturn_t kvm_assigned_dev_intx(int irq, void *dev_id)
>> {
>> struct kvm_assigned_dev_kernel *assigned_dev = dev_id;
>> + int ret;
>>
>> - if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_INTX) {
>> - spin_lock(&assigned_dev->intx_lock);
>> + spin_lock(&assigned_dev->intx_lock);
>> + if (pci_check_and_mask_intx(assigned_dev->dev)) {
>> + assigned_dev->host_irq_disabled = true;
>> + ret = IRQ_WAKE_THREAD;
>> + } else
>> + ret = IRQ_NONE;
>> + spin_unlock(&assigned_dev->intx_lock);
>> +
>> + return ret;
>> +}
>> +
>> +static void
>> +kvm_assigned_dev_raise_guest_irq(struct kvm_assigned_dev_kernel *assigned_dev,
>> + int vector)
>> +{
>> + if (unlikely(assigned_dev->irq_requested_type &
>> + KVM_DEV_IRQ_GUEST_INTX)) {
>> + mutex_lock(&assigned_dev->intx_mask_lock);
>> + if (!(assigned_dev->flags & KVM_DEV_ASSIGN_MASK_INTX))
>> + kvm_set_irq(assigned_dev->kvm,
>> + assigned_dev->irq_source_id, vector, 1);
>> + mutex_unlock(&assigned_dev->intx_mask_lock);
>> + } else
>> + kvm_set_irq(assigned_dev->kvm, assigned_dev->irq_source_id,
>> + vector, 1);
>> +}
>> +
>> +static irqreturn_t kvm_assigned_dev_thread_intx(int irq, void *dev_id)
>> +{
>> + struct kvm_assigned_dev_kernel *assigned_dev = dev_id;
>> +
>> + if (!(assigned_dev->flags & KVM_DEV_ASSIGN_PCI_2_3)) {
>> + spin_lock_irq(&assigned_dev->intx_lock);
>> disable_irq_nosync(irq);
>> assigned_dev->host_irq_disabled = true;
>> - spin_unlock(&assigned_dev->intx_lock);
>> + spin_unlock_irq(&assigned_dev->intx_lock);
>> }
>>
>> - kvm_set_irq(assigned_dev->kvm, assigned_dev->irq_source_id,
>> - assigned_dev->guest_irq, 1);
>> + kvm_assigned_dev_raise_guest_irq(assigned_dev,
>> + assigned_dev->guest_irq);
>> +
>> + return IRQ_HANDLED;
>> +}
>> +
>> +#ifdef __KVM_HAVE_MSI
>> +static irqreturn_t kvm_assigned_dev_thread_msi(int irq, void *dev_id)
>> +{
>> + struct kvm_assigned_dev_kernel *assigned_dev = dev_id;
>> +
>> + kvm_assigned_dev_raise_guest_irq(assigned_dev,
>> + assigned_dev->guest_irq);
>>
>> return IRQ_HANDLED;
>> }
>> +#endif
>>
>> #ifdef __KVM_HAVE_MSIX
>> static irqreturn_t kvm_assigned_dev_thread_msix(int irq, void *dev_id)
>> @@ -83,8 +127,7 @@ static irqreturn_t kvm_assigned_dev_thread_msix(int irq, void *dev_id)
>>
>> if (index >= 0) {
>> vector = assigned_dev->guest_msix_entries[index].vector;
>> - kvm_set_irq(assigned_dev->kvm, assigned_dev->irq_source_id,
>> - vector, 1);
>> + kvm_assigned_dev_raise_guest_irq(assigned_dev, vector);
>> }
>>
>> return IRQ_HANDLED;
>> @@ -100,15 +143,31 @@ static void kvm_assigned_dev_ack_irq(struct kvm_irq_ack_notifier *kian)
>>
>> kvm_set_irq(dev->kvm, dev->irq_source_id, dev->guest_irq, 0);
>>
>> - /* The guest irq may be shared so this ack may be
>> - * from another device.
>> - */
>> - spin_lock(&dev->intx_lock);
>> - if (dev->host_irq_disabled) {
>> - enable_irq(dev->host_irq);
>> - dev->host_irq_disabled = false;
>> + mutex_lock(&dev->intx_mask_lock);
>> +
>> + if (!(dev->flags & KVM_DEV_ASSIGN_MASK_INTX)) {
>> + bool reassert = false;
>> +
>> + spin_lock_irq(&dev->intx_lock);
>> + /*
>> + * The guest IRQ may be shared so this ack can come from an
>> + * IRQ for another guest device.
>> + */
>> + if (dev->host_irq_disabled) {
>> + if (!(dev->flags & KVM_DEV_ASSIGN_PCI_2_3))
>> + enable_irq(dev->host_irq);
>> + else if (!pci_check_and_unmask_intx(dev->dev))
>> + reassert = true;
>> + dev->host_irq_disabled = reassert;
>> + }
>> + spin_unlock_irq(&dev->intx_lock);
>> +
>> + if (reassert)
>> + kvm_set_irq(dev->kvm, dev->irq_source_id,
>> + dev->guest_irq, 1);
>> }
>> - spin_unlock(&dev->intx_lock);
>> +
>> + mutex_unlock(&dev->intx_mask_lock);
>> }
>>
>> static void deassign_guest_irq(struct kvm *kvm,
>> @@ -156,7 +215,13 @@ static void deassign_host_irq(struct kvm *kvm,
>> pci_disable_msix(assigned_dev->dev);
>> } else {
>> /* Deal with MSI and INTx */
>> - disable_irq(assigned_dev->host_irq);
>> + if (assigned_dev->flags & KVM_DEV_ASSIGN_PCI_2_3) {
>> + spin_lock_irq(&assigned_dev->intx_lock);
>
> Interesting that the new mutex is not used here.
KVM_DEV_ASSIGN_PCI_2_3 is immutable after setup, and we must disable the
line independent of what user space likes to see.
>
>> + pci_intx(assigned_dev->dev, false);
>
> Interesting.
> This will leave the device with interrupts masked.
> Do we reset it later?
> Maybe we want a comment explaining why it's done.
>
Before releasing the device, we first reset it (which is supposed to
deassert the line). And then we restore the config space to the state
the device had before we grabed it. I can add a comment, but the logic
is mandatory (we can't release the line without disabling the source).
Thanks,
Jan
--
Siemens AG, Corporate Technology, CT T DE IT 1
Corporate Competence Center Embedded Linux
^ permalink raw reply [flat|nested] 20+ messages in thread* Re: [PATCH] KVM: Allow host IRQ sharing for assigned PCI 2.3 devices
2012-01-10 17:29 ` Jan Kiszka
@ 2012-01-10 18:10 ` Michael S. Tsirkin
2012-01-10 18:21 ` Jan Kiszka
0 siblings, 1 reply; 20+ messages in thread
From: Michael S. Tsirkin @ 2012-01-10 18:10 UTC (permalink / raw)
To: Jan Kiszka
Cc: Avi Kivity, Marcelo Tosatti, kvm, Alex Williamson, Jesse Barnes
On Tue, Jan 10, 2012 at 06:29:51PM +0100, Jan Kiszka wrote:
> On 2012-01-10 17:17, Michael S. Tsirkin wrote:
> > On Mon, Jan 09, 2012 at 03:03:00PM +0100, Jan Kiszka wrote:
> >> PCI 2.3 allows to generically disable IRQ sources at device level. This
> >> enables us to share legacy IRQs of such devices with other host devices
> >> when passing them to a guest.
> >>
> >> The new IRQ sharing feature introduced here is optional, user space has
> >> to request it explicitly. Moreover, user space can inform us about its
> >> view of PCI_COMMAND_INTX_DISABLE so that we can avoid unmasking the
> >> interrupt and signaling it if the guest masked it via the virtualized
> >> PCI config space.
> >>
> >> Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
> >> ---
> >>
> >> This applies to kvm/master after merging
> >>
> >> PCI: Rework config space blocking services
> >> PCI: Introduce INTx check & mask API
> >>
> >> from current linux-next/master. I suppose those two will make it into
> >> 3.3.
> >>
> >> To recall the history of it: I tried hard to implement an adaptive
> >> solution that automatically picks the fastest masking technique whenever
> >> possible. However, the changes required to the IRQ core subsystem and
> >> the logic of the device assignment code became so complex and partly
> >> ugly that I gave up on this. It's simply not worth the pain given that
> >> legacy PCI interrupts are rarely raised for performance critical device
> >> at such a high rate (KHz...) that you can measure the difference.
> >>
> >> Documentation/virtual/kvm/api.txt | 27 +++++
> >> arch/x86/kvm/x86.c | 1 +
> >> include/linux/kvm.h | 6 +
> >> include/linux/kvm_host.h | 2 +
> >> virt/kvm/assigned-dev.c | 208 +++++++++++++++++++++++++++++++-----
> >> 5 files changed, 215 insertions(+), 29 deletions(-)
> >>
> >> diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
> >> index e1d94bf..670015a 100644
> >> --- a/Documentation/virtual/kvm/api.txt
> >> +++ b/Documentation/virtual/kvm/api.txt
> >> @@ -1159,6 +1159,14 @@ following flags are specified:
> >>
> >> /* Depends on KVM_CAP_IOMMU */
> >> #define KVM_DEV_ASSIGN_ENABLE_IOMMU (1 << 0)
> >> +/* The following two depend on KVM_CAP_PCI_2_3 */
> >> +#define KVM_DEV_ASSIGN_PCI_2_3 (1 << 1)
> >> +#define KVM_DEV_ASSIGN_MASK_INTX (1 << 2)
> >> +
> >> +If KVM_DEV_ASSIGN_PCI_2_3 is set, the kernel will manage legacy INTx interrupts
> >> +via the PCI-2.3-compliant device-level mask, thus enable IRQ sharing with other
> >> +assigned devices or host devices. KVM_DEV_ASSIGN_MASK_INTX specifies the
> >> +guest's view on the INTx mask, see KVM_ASSIGN_SET_INTX_MASK for details.
> >>
> >> The KVM_DEV_ASSIGN_ENABLE_IOMMU flag is a mandatory option to ensure
> >> isolation of the device. Usages not specifying this flag are deprecated.
> >> @@ -1399,6 +1407,25 @@ The following flags are defined:
> >> If datamatch flag is set, the event will be signaled only if the written value
> >> to the registered address is equal to datamatch in struct kvm_ioeventfd.
> >>
> >> +4.59 KVM_ASSIGN_SET_INTX_MASK
> >> +
> >> +Capability: KVM_CAP_PCI_2_3
> >> +Architectures: x86
> >> +Type: vm ioctl
> >> +Parameters: struct kvm_assigned_pci_dev (in)
> >> +Returns: 0 on success, -1 on error
> >> +
> >> +Informs the kernel about the guest's view on the INTx mask.
> >
> > A wild idea: since this is guest view of its IRQ,
> > can this be specified per guest IRQ+id then?
> > That might be useful to support MSIX mask bit emulation.
>
> I do not yet get the full idea: You want some generic
> KVM_ASSIGN_SET_IRQ_MASK? What will be the use case in the MSI[X] area?
ATM writes to msi/msix mask bit have no effect for assigned
devices. For virtio, they are implemented by deassigning irqfd
which is a very slow operation (rcu write side).
Instead, When guest writes to mask, qemu can set/clear by calling
this ioctl.
> >
> >> As long as the
> >> +guest masks the legacy INTx, the kernel will refrain from unmasking it at
> >> +hardware level and will not assert the guest's IRQ line. User space is still
> >> +responsible for applying this state to the assigned device's real config space.
> >
> > Can this be made more explicit? You mean writing into 1st
> > byte of PCI control, right?
>
> For sure, I can state this.
>
> >
> >> +To avoid that the kernel overwrites the state user space wants to set,
> >> +KVM_ASSIGN_SET_INTX_MASK has to be called prior to updating the config space.
> >
> > This looks like a strange requirement, could you explain how
> > this helps avoid races?
>
> By declaring the target state of the INTx bit first to the kernel,
> concurrent changes of the kernel while user space performs a
> read-modify-write will not lead to an old mask state being written.
I note you don't require KVM_ASSIGN_SET_INTX_MASK before read though.
Further, userspace might cache the control byte. If we require
it not to do it, we probably need to be explicit?
> > This also raises questions about
> > what should be done to write a bit unrelated to masking.
>
> Just write it, using the INTx state user space maintains. In the worst
> case, some masking done by the kernel in the meantime will be
> overwritten, leading to a single spurious but harmless IRQ. That event
> won't be delivered to the guest unless it is ready to receive it - as we
> updated the mask state prior to writing to the config space. The point
> is that the kernel mechanism has to deal with crazy user space clearing
> the mask for whatever reason again.
I guess the point is that we need to avoid is this:
kernel masks bit
read
kernel unmasks bit
write
I'm not sure I understand how the text above suggests
doing this in a race free manner.
A simple way would be to ask userspace to always clear
this bit on writes. What do you think?
--
MST
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH] KVM: Allow host IRQ sharing for assigned PCI 2.3 devices
2012-01-10 18:10 ` Michael S. Tsirkin
@ 2012-01-10 18:21 ` Jan Kiszka
2012-01-10 18:31 ` Michael S. Tsirkin
0 siblings, 1 reply; 20+ messages in thread
From: Jan Kiszka @ 2012-01-10 18:21 UTC (permalink / raw)
To: Michael S. Tsirkin
Cc: Avi Kivity, Marcelo Tosatti, kvm, Alex Williamson, Jesse Barnes
On 2012-01-10 19:10, Michael S. Tsirkin wrote:
> On Tue, Jan 10, 2012 at 06:29:51PM +0100, Jan Kiszka wrote:
>> On 2012-01-10 17:17, Michael S. Tsirkin wrote:
>>> On Mon, Jan 09, 2012 at 03:03:00PM +0100, Jan Kiszka wrote:
>>>> PCI 2.3 allows to generically disable IRQ sources at device level. This
>>>> enables us to share legacy IRQs of such devices with other host devices
>>>> when passing them to a guest.
>>>>
>>>> The new IRQ sharing feature introduced here is optional, user space has
>>>> to request it explicitly. Moreover, user space can inform us about its
>>>> view of PCI_COMMAND_INTX_DISABLE so that we can avoid unmasking the
>>>> interrupt and signaling it if the guest masked it via the virtualized
>>>> PCI config space.
>>>>
>>>> Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
>>>> ---
>>>>
>>>> This applies to kvm/master after merging
>>>>
>>>> PCI: Rework config space blocking services
>>>> PCI: Introduce INTx check & mask API
>>>>
>>>> from current linux-next/master. I suppose those two will make it into
>>>> 3.3.
>>>>
>>>> To recall the history of it: I tried hard to implement an adaptive
>>>> solution that automatically picks the fastest masking technique whenever
>>>> possible. However, the changes required to the IRQ core subsystem and
>>>> the logic of the device assignment code became so complex and partly
>>>> ugly that I gave up on this. It's simply not worth the pain given that
>>>> legacy PCI interrupts are rarely raised for performance critical device
>>>> at such a high rate (KHz...) that you can measure the difference.
>>>>
>>>> Documentation/virtual/kvm/api.txt | 27 +++++
>>>> arch/x86/kvm/x86.c | 1 +
>>>> include/linux/kvm.h | 6 +
>>>> include/linux/kvm_host.h | 2 +
>>>> virt/kvm/assigned-dev.c | 208 +++++++++++++++++++++++++++++++-----
>>>> 5 files changed, 215 insertions(+), 29 deletions(-)
>>>>
>>>> diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
>>>> index e1d94bf..670015a 100644
>>>> --- a/Documentation/virtual/kvm/api.txt
>>>> +++ b/Documentation/virtual/kvm/api.txt
>>>> @@ -1159,6 +1159,14 @@ following flags are specified:
>>>>
>>>> /* Depends on KVM_CAP_IOMMU */
>>>> #define KVM_DEV_ASSIGN_ENABLE_IOMMU (1 << 0)
>>>> +/* The following two depend on KVM_CAP_PCI_2_3 */
>>>> +#define KVM_DEV_ASSIGN_PCI_2_3 (1 << 1)
>>>> +#define KVM_DEV_ASSIGN_MASK_INTX (1 << 2)
>>>> +
>>>> +If KVM_DEV_ASSIGN_PCI_2_3 is set, the kernel will manage legacy INTx interrupts
>>>> +via the PCI-2.3-compliant device-level mask, thus enable IRQ sharing with other
>>>> +assigned devices or host devices. KVM_DEV_ASSIGN_MASK_INTX specifies the
>>>> +guest's view on the INTx mask, see KVM_ASSIGN_SET_INTX_MASK for details.
>>>>
>>>> The KVM_DEV_ASSIGN_ENABLE_IOMMU flag is a mandatory option to ensure
>>>> isolation of the device. Usages not specifying this flag are deprecated.
>>>> @@ -1399,6 +1407,25 @@ The following flags are defined:
>>>> If datamatch flag is set, the event will be signaled only if the written value
>>>> to the registered address is equal to datamatch in struct kvm_ioeventfd.
>>>>
>>>> +4.59 KVM_ASSIGN_SET_INTX_MASK
>>>> +
>>>> +Capability: KVM_CAP_PCI_2_3
>>>> +Architectures: x86
>>>> +Type: vm ioctl
>>>> +Parameters: struct kvm_assigned_pci_dev (in)
>>>> +Returns: 0 on success, -1 on error
>>>> +
>>>> +Informs the kernel about the guest's view on the INTx mask.
>>>
>>> A wild idea: since this is guest view of its IRQ,
>>> can this be specified per guest IRQ+id then?
>>> That might be useful to support MSIX mask bit emulation.
>>
>> I do not yet get the full idea: You want some generic
>> KVM_ASSIGN_SET_IRQ_MASK? What will be the use case in the MSI[X] area?
>
> ATM writes to msi/msix mask bit have no effect for assigned
> devices. For virtio, they are implemented by deassigning irqfd
> which is a very slow operation (rcu write side).
>
> Instead, When guest writes to mask, qemu can set/clear by calling
> this ioctl.
Isn't that effort better invested in proper in-kernel mask emulation for
MSI-X?
>
>>>
>>>> As long as the
>>>> +guest masks the legacy INTx, the kernel will refrain from unmasking it at
>>>> +hardware level and will not assert the guest's IRQ line. User space is still
>>>> +responsible for applying this state to the assigned device's real config space.
>>>
>>> Can this be made more explicit? You mean writing into 1st
>>> byte of PCI control, right?
>>
>> For sure, I can state this.
>>
>>>
>>>> +To avoid that the kernel overwrites the state user space wants to set,
>>>> +KVM_ASSIGN_SET_INTX_MASK has to be called prior to updating the config space.
>>>
>>> This looks like a strange requirement, could you explain how
>>> this helps avoid races?
>>
>> By declaring the target state of the INTx bit first to the kernel,
>> concurrent changes of the kernel while user space performs a
>> read-modify-write will not lead to an old mask state being written.
>
> I note you don't require KVM_ASSIGN_SET_INTX_MASK before read though.
> Further, userspace might cache the control byte. If we require
> it not to do it, we probably need to be explicit?
User space can do with the control byte what it wants - kernel can't
help this anyway. I should just tell the kernel ahead of time what the
next INTx mask state will be. That particularly avoids that the kernel
sets the mask when user space wants it cleared. The other way around is
actually unproblematic as we check KVM_ASSIGN_SET_INTX_MASK before
delivering the IRQ to the guest.
>
>>> This also raises questions about
>>> what should be done to write a bit unrelated to masking.
>>
>> Just write it, using the INTx state user space maintains. In the worst
>> case, some masking done by the kernel in the meantime will be
>> overwritten, leading to a single spurious but harmless IRQ. That event
>> won't be delivered to the guest unless it is ready to receive it - as we
>> updated the mask state prior to writing to the config space. The point
>> is that the kernel mechanism has to deal with crazy user space clearing
>> the mask for whatever reason again.
>
> I guess the point is that we need to avoid is this:
>
> kernel masks bit
> read
> kernel unmasks bit
> write
>
> I'm not sure I understand how the text above suggests
> doing this in a race free manner.
User space must not write INTx as read from the hardware but according
to its own view. Then the above is harmless.
>
>
> A simple way would be to ask userspace to always clear
> this bit on writes. What do you think?
That or - sounds more consistent - writing the state that user space
exposes to the guest anyway. That (in addition to the ordering
requirement) should be clearly stated in the doc, I agree.
Jan
--
Siemens AG, Corporate Technology, CT T DE IT 1
Corporate Competence Center Embedded Linux
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH] KVM: Allow host IRQ sharing for assigned PCI 2.3 devices
2012-01-10 18:21 ` Jan Kiszka
@ 2012-01-10 18:31 ` Michael S. Tsirkin
2012-01-10 18:43 ` Jan Kiszka
0 siblings, 1 reply; 20+ messages in thread
From: Michael S. Tsirkin @ 2012-01-10 18:31 UTC (permalink / raw)
To: Jan Kiszka
Cc: Avi Kivity, Marcelo Tosatti, kvm, Alex Williamson, Jesse Barnes
On Tue, Jan 10, 2012 at 07:21:01PM +0100, Jan Kiszka wrote:
> > ATM writes to msi/msix mask bit have no effect for assigned
> > devices. For virtio, they are implemented by deassigning irqfd
> > which is a very slow operation (rcu write side).
> >
> > Instead, When guest writes to mask, qemu can set/clear by calling
> > this ioctl.
>
> Isn't that effort better invested in proper in-kernel mask emulation for
> MSI-X?
This gives us a working implementation fo free. Whether MSIX mask
writes are worth accelerating in kernel I'm not 100% sure. But IMO this
shows it is a more generic interface.
> >
> >>>
> >>>> As long as the
> >>>> +guest masks the legacy INTx, the kernel will refrain from unmasking it at
> >>>> +hardware level and will not assert the guest's IRQ line. User space is still
> >>>> +responsible for applying this state to the assigned device's real config space.
> >>>
> >>> Can this be made more explicit? You mean writing into 1st
> >>> byte of PCI control, right?
> >>
> >> For sure, I can state this.
> >>
> >>>
> >>>> +To avoid that the kernel overwrites the state user space wants to set,
> >>>> +KVM_ASSIGN_SET_INTX_MASK has to be called prior to updating the config space.
> >>>
> >>> This looks like a strange requirement, could you explain how
> >>> this helps avoid races?
> >>
> >> By declaring the target state of the INTx bit first to the kernel,
> >> concurrent changes of the kernel while user space performs a
> >> read-modify-write will not lead to an old mask state being written.
> >
> > I note you don't require KVM_ASSIGN_SET_INTX_MASK before read though.
> > Further, userspace might cache the control byte. If we require
> > it not to do it, we probably need to be explicit?
>
> User space can do with the control byte what it wants - kernel can't
> help this anyway. I should just tell the kernel ahead of time what the
> next INTx mask state will be. That particularly avoids that the kernel
> sets the mask when user space wants it cleared. The other way around is
> actually unproblematic as we check KVM_ASSIGN_SET_INTX_MASK before
> delivering the IRQ to the guest.
>
> >
> >>> This also raises questions about
> >>> what should be done to write a bit unrelated to masking.
> >>
> >> Just write it, using the INTx state user space maintains. In the worst
> >> case, some masking done by the kernel in the meantime will be
> >> overwritten, leading to a single spurious but harmless IRQ. That event
> >> won't be delivered to the guest unless it is ready to receive it - as we
> >> updated the mask state prior to writing to the config space. The point
> >> is that the kernel mechanism has to deal with crazy user space clearing
> >> the mask for whatever reason again.
> >
> > I guess the point is that we need to avoid is this:
> >
> > kernel masks bit
> > read
> > kernel unmasks bit
> > write
> >
> > I'm not sure I understand how the text above suggests
> > doing this in a race free manner.
>
> User space must not write INTx as read from the hardware but according
> to its own view. Then the above is harmless.
>
> >
> >
> > A simple way would be to ask userspace to always clear
> > this bit on writes. What do you think?
>
> That or - sounds more consistent - writing the state that user space
> exposes to the guest anyway. That (in addition to the ordering
> requirement) should be clearly stated in the doc, I agree.
>
> Jan
Yes, I agree it all works, just needs clear documentation.
In summary, userspace must ignore the value of the bit
it reads from device.
> --
> Siemens AG, Corporate Technology, CT T DE IT 1
> Corporate Competence Center Embedded Linux
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH] KVM: Allow host IRQ sharing for assigned PCI 2.3 devices
2012-01-10 18:31 ` Michael S. Tsirkin
@ 2012-01-10 18:43 ` Jan Kiszka
2012-01-10 19:04 ` Michael S. Tsirkin
0 siblings, 1 reply; 20+ messages in thread
From: Jan Kiszka @ 2012-01-10 18:43 UTC (permalink / raw)
To: Michael S. Tsirkin
Cc: Avi Kivity, Marcelo Tosatti, kvm, Alex Williamson, Jesse Barnes
On 2012-01-10 19:31, Michael S. Tsirkin wrote:
> On Tue, Jan 10, 2012 at 07:21:01PM +0100, Jan Kiszka wrote:
>>> ATM writes to msi/msix mask bit have no effect for assigned
>>> devices. For virtio, they are implemented by deassigning irqfd
>>> which is a very slow operation (rcu write side).
>>>
>>> Instead, When guest writes to mask, qemu can set/clear by calling
>>> this ioctl.
>>
>> Isn't that effort better invested in proper in-kernel mask emulation for
>> MSI-X?
>
> This gives us a working implementation fo free. Whether MSIX mask
> writes are worth accelerating in kernel I'm not 100% sure.
If it's worth optimizing the irqfd on/off dance, then it's more than
likely that eliminating the heavy user space exits, additional syscalls
along that way, and locking contentions up there is worth it as well. We
even have those mask ops in a time-critical paths here, unfortunately.
> But IMO this
> shows it is a more generic interface.
I'm worried about adding something new that will soon become obsolete
again. That's wasted effort IMHO unless we say today that there will be
no in-kernel MSI-X support.
Jan
--
Siemens AG, Corporate Technology, CT T DE IT 1
Corporate Competence Center Embedded Linux
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH] KVM: Allow host IRQ sharing for assigned PCI 2.3 devices
2012-01-10 18:43 ` Jan Kiszka
@ 2012-01-10 19:04 ` Michael S. Tsirkin
2012-01-10 19:40 ` Jan Kiszka
0 siblings, 1 reply; 20+ messages in thread
From: Michael S. Tsirkin @ 2012-01-10 19:04 UTC (permalink / raw)
To: Jan Kiszka
Cc: Avi Kivity, Marcelo Tosatti, kvm, Alex Williamson, Jesse Barnes
On Tue, Jan 10, 2012 at 07:43:36PM +0100, Jan Kiszka wrote:
> On 2012-01-10 19:31, Michael S. Tsirkin wrote:
> > On Tue, Jan 10, 2012 at 07:21:01PM +0100, Jan Kiszka wrote:
> >>> ATM writes to msi/msix mask bit have no effect for assigned
> >>> devices. For virtio, they are implemented by deassigning irqfd
> >>> which is a very slow operation (rcu write side).
> >>>
> >>> Instead, When guest writes to mask, qemu can set/clear by calling
> >>> this ioctl.
> >>
> >> Isn't that effort better invested in proper in-kernel mask emulation for
> >> MSI-X?
> >
> > This gives us a working implementation fo free. Whether MSIX mask
> > writes are worth accelerating in kernel I'm not 100% sure.
>
> If it's worth optimizing the irqfd on/off dance,
Not sure about that either. At least for virtio in my tests
they almost never trigger. But it's needed for correctness
for assigned devices for msix.
> then it's more than
> likely that eliminating the heavy user space exits, additional syscalls
> along that way, and locking contentions up there is worth it as well. We
> even have those mask ops in a time-critical paths here, unfortunately.
>
> > But IMO this
> > shows it is a more generic interface.
>
> I'm worried about adding something new that will soon become obsolete
> again. That's wasted effort IMHO unless we say today that there will be
> no in-kernel MSI-X support.
>
> Jan
Yes. But as we are adding a new interface maybe it's better to add a
more generic one? I don't insist as I don't have a specific proposal,
just something to consider.
> --
> Siemens AG, Corporate Technology, CT T DE IT 1
> Corporate Competence Center Embedded Linux
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH] KVM: Allow host IRQ sharing for assigned PCI 2.3 devices
2012-01-10 19:04 ` Michael S. Tsirkin
@ 2012-01-10 19:40 ` Jan Kiszka
2012-01-10 20:44 ` Michael S. Tsirkin
0 siblings, 1 reply; 20+ messages in thread
From: Jan Kiszka @ 2012-01-10 19:40 UTC (permalink / raw)
To: Michael S. Tsirkin
Cc: Avi Kivity, Marcelo Tosatti, kvm, Alex Williamson, Jesse Barnes
[-- Attachment #1: Type: text/plain, Size: 644 bytes --]
On 2012-01-10 20:04, Michael S. Tsirkin wrote:
>>> But IMO this
>>> shows it is a more generic interface.
>>
>> I'm worried about adding something new that will soon become obsolete
>> again. That's wasted effort IMHO unless we say today that there will be
>> no in-kernel MSI-X support.
>>
>> Jan
>
> Yes. But as we are adding a new interface maybe it's better to add a
> more generic one? I don't insist as I don't have a specific proposal,
> just something to consider.
I could imagine defining an extensible IRQ masking interface, e.g. with
flags that select the type, but only implementing it for INTx for now.
Jan
[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 262 bytes --]
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH] KVM: Allow host IRQ sharing for assigned PCI 2.3 devices
2012-01-10 19:40 ` Jan Kiszka
@ 2012-01-10 20:44 ` Michael S. Tsirkin
2012-01-10 21:18 ` Jan Kiszka
0 siblings, 1 reply; 20+ messages in thread
From: Michael S. Tsirkin @ 2012-01-10 20:44 UTC (permalink / raw)
To: Jan Kiszka
Cc: Avi Kivity, Marcelo Tosatti, kvm, Alex Williamson, Jesse Barnes
On Tue, Jan 10, 2012 at 08:40:59PM +0100, Jan Kiszka wrote:
> On 2012-01-10 20:04, Michael S. Tsirkin wrote:
> >>> But IMO this
> >>> shows it is a more generic interface.
> >>
> >> I'm worried about adding something new that will soon become obsolete
> >> again. That's wasted effort IMHO unless we say today that there will be
> >> no in-kernel MSI-X support.
> >>
> >> Jan
> >
> > Yes. But as we are adding a new interface maybe it's better to add a
> > more generic one? I don't insist as I don't have a specific proposal,
> > just something to consider.
>
> I could imagine defining an extensible IRQ masking interface, e.g. with
> flags that select the type, but only implementing it for INTx for now.
>
> Jan
>
I guess if we pass in the IRQ# the type can be inferred and does not
need to be passed in.
--
MST
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH] KVM: Allow host IRQ sharing for assigned PCI 2.3 devices
2012-01-10 20:44 ` Michael S. Tsirkin
@ 2012-01-10 21:18 ` Jan Kiszka
2012-01-10 21:36 ` Michael S. Tsirkin
0 siblings, 1 reply; 20+ messages in thread
From: Jan Kiszka @ 2012-01-10 21:18 UTC (permalink / raw)
To: Michael S. Tsirkin
Cc: Avi Kivity, Marcelo Tosatti, kvm, Alex Williamson, Jesse Barnes
[-- Attachment #1: Type: text/plain, Size: 1584 bytes --]
On 2012-01-10 21:44, Michael S. Tsirkin wrote:
> On Tue, Jan 10, 2012 at 08:40:59PM +0100, Jan Kiszka wrote:
>> On 2012-01-10 20:04, Michael S. Tsirkin wrote:
>>>>> But IMO this
>>>>> shows it is a more generic interface.
>>>>
>>>> I'm worried about adding something new that will soon become obsolete
>>>> again. That's wasted effort IMHO unless we say today that there will be
>>>> no in-kernel MSI-X support.
>>>>
>>>> Jan
>>>
>>> Yes. But as we are adding a new interface maybe it's better to add a
>>> more generic one? I don't insist as I don't have a specific proposal,
>>> just something to consider.
>>
>> I could imagine defining an extensible IRQ masking interface, e.g. with
>> flags that select the type, but only implementing it for INTx for now.
>>
>> Jan
>>
>
> I guess if we pass in the IRQ# the type can be inferred and does not
> need to be passed in.
What kind of number, a GSI? We do not yet track what is behind a GSI, do we?
Hmm, I think this requires more careful thoughts. What should be the
semantic of "mask" for the addressed device behind the IRQ? For assigned
legacy IRQ it's clear: mask at config space level. For assigned MSI-X it
should be masking at vector level. What about assigned MSI? What about
irqfds? How to deal with future IRQ sources?
No, I think it is better to directly associate the masking feature
directly with the source instead of doing this via some handle,
potentially addressing the whole world. If there is a need for
KVM_IRQFD_MASK, then let's introduce it. As a separate API.
Jan
[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 262 bytes --]
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH] KVM: Allow host IRQ sharing for assigned PCI 2.3 devices
2012-01-10 21:18 ` Jan Kiszka
@ 2012-01-10 21:36 ` Michael S. Tsirkin
0 siblings, 0 replies; 20+ messages in thread
From: Michael S. Tsirkin @ 2012-01-10 21:36 UTC (permalink / raw)
To: Jan Kiszka
Cc: Avi Kivity, Marcelo Tosatti, kvm, Alex Williamson, Jesse Barnes
On Tue, Jan 10, 2012 at 10:18:12PM +0100, Jan Kiszka wrote:
> On 2012-01-10 21:44, Michael S. Tsirkin wrote:
> > On Tue, Jan 10, 2012 at 08:40:59PM +0100, Jan Kiszka wrote:
> >> On 2012-01-10 20:04, Michael S. Tsirkin wrote:
> >>>>> But IMO this
> >>>>> shows it is a more generic interface.
> >>>>
> >>>> I'm worried about adding something new that will soon become obsolete
> >>>> again. That's wasted effort IMHO unless we say today that there will be
> >>>> no in-kernel MSI-X support.
> >>>>
> >>>> Jan
> >>>
> >>> Yes. But as we are adding a new interface maybe it's better to add a
> >>> more generic one? I don't insist as I don't have a specific proposal,
> >>> just something to consider.
> >>
> >> I could imagine defining an extensible IRQ masking interface, e.g. with
> >> flags that select the type, but only implementing it for INTx for now.
> >>
> >> Jan
> >>
> >
> > I guess if we pass in the IRQ# the type can be inferred and does not
> > need to be passed in.
>
> What kind of number, a GSI? We do not yet track what is behind a GSI, do we?
>
> Hmm, I think this requires more careful thoughts. What should be the
> semantic of "mask" for the addressed device behind the IRQ? For assigned
> legacy IRQ it's clear: mask at config space level. For assigned MSI-X it
> should be masking at vector level. What about assigned MSI?
> What about
> irqfds? How to deal with future IRQ sources?
For correctness, it is enough to mask in host kernel.
Masking at device level is an optimization:
e.g. you don't mask immediately either.
> No, I think it is better to directly associate the masking feature
> directly with the source instead of doing this via some handle,
> potentially addressing the whole world. If there is a need for
> KVM_IRQFD_MASK, then let's introduce it. As a separate API.
>
> Jan
>
That's an option too.
--
MST
^ permalink raw reply [flat|nested] 20+ messages in thread
* [PATCH v2] KVM: Allow host IRQ sharing for assigned PCI 2.3 devices
2012-01-09 14:03 [PATCH] KVM: Allow host IRQ sharing for assigned PCI 2.3 devices Jan Kiszka
2012-01-09 19:45 ` Alex Williamson
2012-01-10 16:17 ` Michael S. Tsirkin
@ 2012-01-12 15:49 ` Jan Kiszka
2 siblings, 0 replies; 20+ messages in thread
From: Jan Kiszka @ 2012-01-12 15:49 UTC (permalink / raw)
To: Avi Kivity, Marcelo Tosatti
Cc: kvm, Alex Williamson, Michael S. Tsirkin, Jesse Barnes
PCI 2.3 allows to generically disable IRQ sources at device level. This
enables us to share legacy IRQs of such devices with other host devices
when passing them to a guest.
The new IRQ sharing feature introduced here is optional, user space has
to request it explicitly. Moreover, user space can inform us about its
view of PCI_COMMAND_INTX_DISABLE so that we can avoid unmasking the
interrupt and signaling it if the guest masked it via the virtualized
PCI config space.
Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
---
Changes in v2:
- clarified API doc according to review comments
- moved KVM_CAP_PCI_2_3 out of the range that PPC silently occupied
This still depends on
PCI: Rework config space blocking services
PCI: Introduce INTx check & mask API
from current linux-next/master.
Documentation/virtual/kvm/api.txt | 31 ++++++
arch/x86/kvm/x86.c | 1 +
include/linux/kvm.h | 6 +
include/linux/kvm_host.h | 2 +
virt/kvm/assigned-dev.c | 208 +++++++++++++++++++++++++++++++-----
5 files changed, 219 insertions(+), 29 deletions(-)
diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
index 7ca6962..d606655 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -1169,6 +1169,14 @@ following flags are specified:
/* Depends on KVM_CAP_IOMMU */
#define KVM_DEV_ASSIGN_ENABLE_IOMMU (1 << 0)
+/* The following two depend on KVM_CAP_PCI_2_3 */
+#define KVM_DEV_ASSIGN_PCI_2_3 (1 << 1)
+#define KVM_DEV_ASSIGN_MASK_INTX (1 << 2)
+
+If KVM_DEV_ASSIGN_PCI_2_3 is set, the kernel will manage legacy INTx interrupts
+via the PCI-2.3-compliant device-level mask, thus enable IRQ sharing with other
+assigned devices or host devices. KVM_DEV_ASSIGN_MASK_INTX specifies the
+guest's view on the INTx mask, see KVM_ASSIGN_SET_INTX_MASK for details.
The KVM_DEV_ASSIGN_ENABLE_IOMMU flag is a mandatory option to ensure
isolation of the device. Usages not specifying this flag are deprecated.
@@ -1409,6 +1417,29 @@ The following flags are defined:
If datamatch flag is set, the event will be signaled only if the written value
to the registered address is equal to datamatch in struct kvm_ioeventfd.
+4.59 KVM_ASSIGN_SET_INTX_MASK
+
+Capability: KVM_CAP_PCI_2_3
+Architectures: x86
+Type: vm ioctl
+Parameters: struct kvm_assigned_pci_dev (in)
+Returns: 0 on success, -1 on error
+
+Informs the kernel about the guest's view on the INTx mask. As long as the
+guest masks the legacy INTx, the kernel will refrain from unmasking it at
+hardware level and will not assert the guest's IRQ line. User space is still
+responsible for applying this state to the assigned device's real config space
+by setting or clearing the Interrupt Disable bit 10 in the Command register.
+
+To avoid that the kernel overwrites the state user space wants to set,
+KVM_ASSIGN_SET_INTX_MASK has to be called prior to updating the config space.
+Moreover, user space has to write back its own view on the Interrupt Disable
+bit whenever modifying the Command word.
+
+See KVM_ASSIGN_DEV_IRQ for the data structure. The target device is specified
+by assigned_dev_id. In the flags field, only KVM_DEV_ASSIGN_MASK_INTX is
+evaluated.
+
4.62 KVM_CREATE_SPAPR_TCE
Capability: KVM_CAP_SPAPR_TCE
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index f0fa3fb..3c6cff4 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -2077,6 +2077,7 @@ int kvm_dev_ioctl_check_extension(long ext)
case KVM_CAP_XSAVE:
case KVM_CAP_ASYNC_PF:
case KVM_CAP_GET_TSC_KHZ:
+ case KVM_CAP_PCI_2_3:
r = 1;
break;
case KVM_CAP_COALESCED_MMIO:
diff --git a/include/linux/kvm.h b/include/linux/kvm.h
index 245bcb3..a075198f7 100644
--- a/include/linux/kvm.h
+++ b/include/linux/kvm.h
@@ -585,6 +585,7 @@ struct kvm_ppc_pvinfo {
#define KVM_CAP_TSC_DEADLINE_TIMER 72
#define KVM_CAP_S390_UCONTROL 73
#define KVM_CAP_SYNC_REGS 74
+#define KVM_CAP_PCI_2_3 75
#ifdef KVM_CAP_IRQ_ROUTING
@@ -735,6 +736,9 @@ struct kvm_s390_ucas_mapping {
/* Available with KVM_CAP_TSC_CONTROL */
#define KVM_SET_TSC_KHZ _IO(KVMIO, 0xa2)
#define KVM_GET_TSC_KHZ _IO(KVMIO, 0xa3)
+/* Available with KVM_CAP_PCI_2_3 */
+#define KVM_ASSIGN_SET_INTX_MASK _IOW(KVMIO, 0xa4, \
+ struct kvm_assigned_pci_dev)
/*
* ioctls for vcpu fds
@@ -803,6 +807,8 @@ struct kvm_s390_ucas_mapping {
#define KVM_ALLOCATE_RMA _IOR(KVMIO, 0xa9, struct kvm_allocate_rma)
#define KVM_DEV_ASSIGN_ENABLE_IOMMU (1 << 0)
+#define KVM_DEV_ASSIGN_PCI_2_3 (1 << 1)
+#define KVM_DEV_ASSIGN_MASK_INTX (1 << 2)
struct kvm_assigned_pci_dev {
__u32 assigned_dev_id;
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index d4d4d70..17c48ac 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -547,6 +547,7 @@ struct kvm_assigned_dev_kernel {
unsigned int entries_nr;
int host_irq;
bool host_irq_disabled;
+ bool pci_2_3;
struct msix_entry *host_msix_entries;
int guest_irq;
struct msix_entry *guest_msix_entries;
@@ -556,6 +557,7 @@ struct kvm_assigned_dev_kernel {
struct pci_dev *dev;
struct kvm *kvm;
spinlock_t intx_lock;
+ struct mutex intx_mask_lock;
char irq_name[32];
struct pci_saved_state *pci_saved_state;
};
diff --git a/virt/kvm/assigned-dev.c b/virt/kvm/assigned-dev.c
index 758e3b3..b35aba9 100644
--- a/virt/kvm/assigned-dev.c
+++ b/virt/kvm/assigned-dev.c
@@ -57,22 +57,66 @@ static int find_index_from_host_irq(struct kvm_assigned_dev_kernel
return index;
}
-static irqreturn_t kvm_assigned_dev_thread(int irq, void *dev_id)
+static irqreturn_t kvm_assigned_dev_intx(int irq, void *dev_id)
{
struct kvm_assigned_dev_kernel *assigned_dev = dev_id;
+ int ret;
- if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_INTX) {
- spin_lock(&assigned_dev->intx_lock);
+ spin_lock(&assigned_dev->intx_lock);
+ if (pci_check_and_mask_intx(assigned_dev->dev)) {
+ assigned_dev->host_irq_disabled = true;
+ ret = IRQ_WAKE_THREAD;
+ } else
+ ret = IRQ_NONE;
+ spin_unlock(&assigned_dev->intx_lock);
+
+ return ret;
+}
+
+static void
+kvm_assigned_dev_raise_guest_irq(struct kvm_assigned_dev_kernel *assigned_dev,
+ int vector)
+{
+ if (unlikely(assigned_dev->irq_requested_type &
+ KVM_DEV_IRQ_GUEST_INTX)) {
+ mutex_lock(&assigned_dev->intx_mask_lock);
+ if (!(assigned_dev->flags & KVM_DEV_ASSIGN_MASK_INTX))
+ kvm_set_irq(assigned_dev->kvm,
+ assigned_dev->irq_source_id, vector, 1);
+ mutex_unlock(&assigned_dev->intx_mask_lock);
+ } else
+ kvm_set_irq(assigned_dev->kvm, assigned_dev->irq_source_id,
+ vector, 1);
+}
+
+static irqreturn_t kvm_assigned_dev_thread_intx(int irq, void *dev_id)
+{
+ struct kvm_assigned_dev_kernel *assigned_dev = dev_id;
+
+ if (!(assigned_dev->flags & KVM_DEV_ASSIGN_PCI_2_3)) {
+ spin_lock_irq(&assigned_dev->intx_lock);
disable_irq_nosync(irq);
assigned_dev->host_irq_disabled = true;
- spin_unlock(&assigned_dev->intx_lock);
+ spin_unlock_irq(&assigned_dev->intx_lock);
}
- kvm_set_irq(assigned_dev->kvm, assigned_dev->irq_source_id,
- assigned_dev->guest_irq, 1);
+ kvm_assigned_dev_raise_guest_irq(assigned_dev,
+ assigned_dev->guest_irq);
+
+ return IRQ_HANDLED;
+}
+
+#ifdef __KVM_HAVE_MSI
+static irqreturn_t kvm_assigned_dev_thread_msi(int irq, void *dev_id)
+{
+ struct kvm_assigned_dev_kernel *assigned_dev = dev_id;
+
+ kvm_assigned_dev_raise_guest_irq(assigned_dev,
+ assigned_dev->guest_irq);
return IRQ_HANDLED;
}
+#endif
#ifdef __KVM_HAVE_MSIX
static irqreturn_t kvm_assigned_dev_thread_msix(int irq, void *dev_id)
@@ -83,8 +127,7 @@ static irqreturn_t kvm_assigned_dev_thread_msix(int irq, void *dev_id)
if (index >= 0) {
vector = assigned_dev->guest_msix_entries[index].vector;
- kvm_set_irq(assigned_dev->kvm, assigned_dev->irq_source_id,
- vector, 1);
+ kvm_assigned_dev_raise_guest_irq(assigned_dev, vector);
}
return IRQ_HANDLED;
@@ -100,15 +143,31 @@ static void kvm_assigned_dev_ack_irq(struct kvm_irq_ack_notifier *kian)
kvm_set_irq(dev->kvm, dev->irq_source_id, dev->guest_irq, 0);
- /* The guest irq may be shared so this ack may be
- * from another device.
- */
- spin_lock(&dev->intx_lock);
- if (dev->host_irq_disabled) {
- enable_irq(dev->host_irq);
- dev->host_irq_disabled = false;
+ mutex_lock(&dev->intx_mask_lock);
+
+ if (!(dev->flags & KVM_DEV_ASSIGN_MASK_INTX)) {
+ bool reassert = false;
+
+ spin_lock_irq(&dev->intx_lock);
+ /*
+ * The guest IRQ may be shared so this ack can come from an
+ * IRQ for another guest device.
+ */
+ if (dev->host_irq_disabled) {
+ if (!(dev->flags & KVM_DEV_ASSIGN_PCI_2_3))
+ enable_irq(dev->host_irq);
+ else if (!pci_check_and_unmask_intx(dev->dev))
+ reassert = true;
+ dev->host_irq_disabled = reassert;
+ }
+ spin_unlock_irq(&dev->intx_lock);
+
+ if (reassert)
+ kvm_set_irq(dev->kvm, dev->irq_source_id,
+ dev->guest_irq, 1);
}
- spin_unlock(&dev->intx_lock);
+
+ mutex_unlock(&dev->intx_mask_lock);
}
static void deassign_guest_irq(struct kvm *kvm,
@@ -156,7 +215,13 @@ static void deassign_host_irq(struct kvm *kvm,
pci_disable_msix(assigned_dev->dev);
} else {
/* Deal with MSI and INTx */
- disable_irq(assigned_dev->host_irq);
+ if (assigned_dev->flags & KVM_DEV_ASSIGN_PCI_2_3) {
+ spin_lock_irq(&assigned_dev->intx_lock);
+ pci_intx(assigned_dev->dev, false);
+ spin_unlock_irq(&assigned_dev->intx_lock);
+ synchronize_irq(assigned_dev->host_irq);
+ } else
+ disable_irq(assigned_dev->host_irq);
free_irq(assigned_dev->host_irq, assigned_dev);
@@ -237,15 +302,34 @@ void kvm_free_all_assigned_devices(struct kvm *kvm)
static int assigned_device_enable_host_intx(struct kvm *kvm,
struct kvm_assigned_dev_kernel *dev)
{
+ irq_handler_t irq_handler;
+ unsigned long flags;
+
dev->host_irq = dev->dev->irq;
- /* Even though this is PCI, we don't want to use shared
- * interrupts. Sharing host devices with guest-assigned devices
- * on the same interrupt line is not a happy situation: there
- * are going to be long delays in accepting, acking, etc.
+
+ /*
+ * We can only share the IRQ line with other host devices if we are
+ * able to disable the IRQ source at device-level - independently of
+ * the guest driver. Otherwise host devices may suffer from unbounded
+ * IRQ latencies when the guest keeps the line asserted.
*/
- if (request_threaded_irq(dev->host_irq, NULL, kvm_assigned_dev_thread,
- IRQF_ONESHOT, dev->irq_name, dev))
+ if (dev->flags & KVM_DEV_ASSIGN_PCI_2_3) {
+ irq_handler = kvm_assigned_dev_intx;
+ flags = IRQF_SHARED;
+ } else {
+ irq_handler = NULL;
+ flags = IRQF_ONESHOT;
+ }
+ if (request_threaded_irq(dev->host_irq, irq_handler,
+ kvm_assigned_dev_thread_intx, flags,
+ dev->irq_name, dev))
return -EIO;
+
+ if (dev->flags & KVM_DEV_ASSIGN_PCI_2_3) {
+ spin_lock_irq(&dev->intx_lock);
+ pci_intx(dev->dev, true);
+ spin_unlock_irq(&dev->intx_lock);
+ }
return 0;
}
@@ -262,8 +346,9 @@ static int assigned_device_enable_host_msi(struct kvm *kvm,
}
dev->host_irq = dev->dev->irq;
- if (request_threaded_irq(dev->host_irq, NULL, kvm_assigned_dev_thread,
- 0, dev->irq_name, dev)) {
+ if (request_threaded_irq(dev->host_irq, NULL,
+ kvm_assigned_dev_thread_msi, 0,
+ dev->irq_name, dev)) {
pci_disable_msi(dev->dev);
return -EIO;
}
@@ -321,7 +406,6 @@ static int assigned_device_enable_guest_msi(struct kvm *kvm,
{
dev->guest_irq = irq->guest_irq;
dev->ack_notifier.gsi = -1;
- dev->host_irq_disabled = false;
return 0;
}
#endif
@@ -333,7 +417,6 @@ static int assigned_device_enable_guest_msix(struct kvm *kvm,
{
dev->guest_irq = irq->guest_irq;
dev->ack_notifier.gsi = -1;
- dev->host_irq_disabled = false;
return 0;
}
#endif
@@ -367,6 +450,7 @@ static int assign_host_irq(struct kvm *kvm,
default:
r = -EINVAL;
}
+ dev->host_irq_disabled = false;
if (!r)
dev->irq_requested_type |= host_irq_type;
@@ -468,6 +552,7 @@ static int kvm_vm_ioctl_deassign_dev_irq(struct kvm *kvm,
{
int r = -ENODEV;
struct kvm_assigned_dev_kernel *match;
+ unsigned long irq_type;
mutex_lock(&kvm->lock);
@@ -476,7 +561,9 @@ static int kvm_vm_ioctl_deassign_dev_irq(struct kvm *kvm,
if (!match)
goto out;
- r = kvm_deassign_irq(kvm, match, assigned_irq->flags);
+ irq_type = assigned_irq->flags & (KVM_DEV_IRQ_HOST_MASK |
+ KVM_DEV_IRQ_GUEST_MASK);
+ r = kvm_deassign_irq(kvm, match, irq_type);
out:
mutex_unlock(&kvm->lock);
return r;
@@ -609,6 +696,10 @@ static int kvm_vm_ioctl_assign_device(struct kvm *kvm,
if (!match->pci_saved_state)
printk(KERN_DEBUG "%s: Couldn't store %s saved state\n",
__func__, dev_name(&dev->dev));
+
+ if (!pci_intx_mask_supported(dev))
+ assigned_dev->flags &= ~KVM_DEV_ASSIGN_PCI_2_3;
+
match->assigned_dev_id = assigned_dev->assigned_dev_id;
match->host_segnr = assigned_dev->segnr;
match->host_busnr = assigned_dev->busnr;
@@ -616,6 +707,7 @@ static int kvm_vm_ioctl_assign_device(struct kvm *kvm,
match->flags = assigned_dev->flags;
match->dev = dev;
spin_lock_init(&match->intx_lock);
+ mutex_init(&match->intx_mask_lock);
match->irq_source_id = -1;
match->kvm = kvm;
match->ack_notifier.irq_acked = kvm_assigned_dev_ack_irq;
@@ -761,6 +853,56 @@ msix_entry_out:
}
#endif
+static int kvm_vm_ioctl_set_pci_irq_mask(struct kvm *kvm,
+ struct kvm_assigned_pci_dev *assigned_dev)
+{
+ int r = 0;
+ struct kvm_assigned_dev_kernel *match;
+
+ mutex_lock(&kvm->lock);
+
+ match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
+ assigned_dev->assigned_dev_id);
+ if (!match) {
+ r = -ENODEV;
+ goto out;
+ }
+
+ mutex_lock(&match->intx_mask_lock);
+
+ match->flags &= ~KVM_DEV_ASSIGN_MASK_INTX;
+ match->flags |= assigned_dev->flags & KVM_DEV_ASSIGN_MASK_INTX;
+
+ if (match->irq_requested_type & KVM_DEV_IRQ_GUEST_INTX) {
+ if (assigned_dev->flags & KVM_DEV_ASSIGN_MASK_INTX) {
+ kvm_set_irq(match->kvm, match->irq_source_id,
+ match->guest_irq, 0);
+ /*
+ * Masking at hardware-level is performed on demand,
+ * i.e. when an IRQ actually arrives at the host.
+ */
+ } else {
+ /*
+ * Unmask the IRQ line. It may have been masked
+ * meanwhile if we aren't using PCI 2.3 INTx masking
+ * on the host side.
+ */
+ spin_lock_irq(&match->intx_lock);
+ if (match->host_irq_disabled) {
+ enable_irq(match->host_irq);
+ match->host_irq_disabled = false;
+ }
+ spin_unlock_irq(&match->intx_lock);
+ }
+ }
+
+ mutex_unlock(&match->intx_mask_lock);
+
+out:
+ mutex_unlock(&kvm->lock);
+ return r;
+}
+
long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl,
unsigned long arg)
{
@@ -868,6 +1010,15 @@ long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl,
break;
}
#endif
+ case KVM_ASSIGN_SET_INTX_MASK: {
+ struct kvm_assigned_pci_dev assigned_dev;
+
+ r = -EFAULT;
+ if (copy_from_user(&assigned_dev, argp, sizeof assigned_dev))
+ goto out;
+ r = kvm_vm_ioctl_set_pci_irq_mask(kvm, &assigned_dev);
+ break;
+ }
default:
r = -ENOTTY;
break;
@@ -875,4 +1026,3 @@ long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl,
out:
return r;
}
-
--
1.7.3.4
^ permalink raw reply related [flat|nested] 20+ messages in thread