[PATCH v2 4/4] KVM: Allow host IRQ sharing for passed-through PCI 2.3 devices

public inbox for kvm@vger.kernel.org
 help / color / mirror / Atom feed

* [PATCH v2 4/4] KVM: Allow host IRQ sharing for passed-through PCI 2.3 devices
  2010-11-02 15:49 [PATCH v2 0/4] KVM: Improve IRQ assignment for device passthrough Jan Kiszka
@ 2010-11-02 15:49 ` Jan Kiszka
  2010-11-02 17:41   ` Michael S. Tsirkin
  0 siblings, 1 reply; 30+ messages in thread
From: Jan Kiszka @ 2010-11-02 15:49 UTC (permalink / raw)
  To: Avi Kivity, Marcelo Tosatti; +Cc: kvm, Alex Williamson, Michael S. Tsirkin

PCI 2.3 allows to generically disable IRQ sources at device level. This
enables us to share IRQs of such devices between on the host side when
passing them to a guest.

Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
---
 include/linux/kvm_host.h |    1 +
 virt/kvm/assigned-dev.c  |  194 ++++++++++++++++++++++++++++++++++++++++++----
 2 files changed, 180 insertions(+), 15 deletions(-)

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 46120da..fdc2bd9 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -466,6 +466,7 @@ struct kvm_assigned_dev_kernel {
 	unsigned int entries_nr;
 	int host_irq;
 	bool host_irq_disabled;
+	bool pci_2_3;
 	struct msix_entry *host_msix_entries;
 	int guest_irq;
 	struct msix_entry *guest_msix_entries;
diff --git a/virt/kvm/assigned-dev.c b/virt/kvm/assigned-dev.c
index ca402ed..91fe9c8 100644
--- a/virt/kvm/assigned-dev.c
+++ b/virt/kvm/assigned-dev.c
@@ -55,17 +55,145 @@ static int find_index_from_host_irq(struct kvm_assigned_dev_kernel
 	return index;
 }
 
+/*
+ * Verify that the device supports Interrupt Disable bit in command register,
+ * per PCI 2.3, by flipping this bit and reading it back: this bit was readonly
+ * in PCI 2.2.
+ */
+static bool pci_2_3_supported(struct pci_dev *pdev)
+{
+	bool supported = false;
+	u16 orig, new;
+
+	pci_block_user_cfg_access(pdev);
+	pci_read_config_word(pdev, PCI_COMMAND, &orig);
+	pci_write_config_word(pdev, PCI_COMMAND,
+			      orig ^ PCI_COMMAND_INTX_DISABLE);
+	pci_read_config_word(pdev, PCI_COMMAND, &new);
+
+	/*
+	 * There's no way to protect against
+	 * hardware bugs or detect them reliably, but as long as we know
+	 * what the value should be, let's go ahead and check it.
+	 */
+	if ((new ^ orig) & ~PCI_COMMAND_INTX_DISABLE) {
+		dev_err(&pdev->dev, "Command changed from 0x%x to 0x%x: "
+			"driver or HW bug?\n", orig, new);
+		goto out;
+	}
+	if (!((new ^ orig) & PCI_COMMAND_INTX_DISABLE)) {
+		dev_warn(&pdev->dev, "Device does not support "
+			 "disabling interrupts: unable to bind.\n");
+		goto out;
+	}
+	supported = true;
+
+	/* Now restore the original value. */
+	pci_write_config_word(pdev, PCI_COMMAND, orig);
+
+out:
+	pci_unblock_user_cfg_access(pdev);
+	return supported;
+}
+
+static unsigned int
+pci_2_3_set_irq_mask(struct pci_dev *dev, bool mask, bool check_status)
+{
+	u32 cmd_status_dword;
+	u16 origcmd, newcmd;
+	unsigned int status;
+
+	/*
+	 * We do a single dword read to retrieve both command and status.
+	 * Document assumptions that make this possible.
+	 */
+	BUILD_BUG_ON(PCI_COMMAND % 4);
+	BUILD_BUG_ON(PCI_COMMAND + 2 != PCI_STATUS);
+
+	pci_block_user_cfg_access(dev);
+
+	/*
+	 * Read both command and status registers in a single 32-bit operation.
+	 * Note: we could cache the value for command and move the status read
+	 * out of the lock if there was a way to get notified of user changes
+	 * to command register through sysfs. Should be good for shared irqs.
+	 */
+	pci_read_config_dword(dev, PCI_COMMAND, &cmd_status_dword);
+	origcmd = cmd_status_dword;
+	status = cmd_status_dword >> 16;
+
+	if (check_status) {
+		bool irq_pending = status & PCI_STATUS_INTERRUPT;
+
+		/*
+		 * Check interrupt status register to see whether our device
+		 * triggered the interrupt (when masking) or the next IRQ is
+		 * already pending (when unmasking).
+		 */
+		if (!(mask == irq_pending))
+			goto done;
+	}
+
+	newcmd = origcmd & ~PCI_COMMAND_INTX_DISABLE;
+	if (mask)
+		newcmd |= PCI_COMMAND_INTX_DISABLE;
+	if (newcmd != origcmd)
+		pci_write_config_word(dev, PCI_COMMAND, newcmd);
+
+done:
+	pci_unblock_user_cfg_access(dev);
+	return status;
+}
+
+static void pci_2_3_irq_mask(struct pci_dev *dev)
+{
+	pci_2_3_set_irq_mask(dev, true, false);
+}
+
+static unsigned int pci_2_3_irq_check_and_mask(struct pci_dev *dev)
+{
+	return pci_2_3_set_irq_mask(dev, true, true);
+}
+
+static void pci_2_3_irq_unmask(struct pci_dev *dev)
+{
+	pci_2_3_set_irq_mask(dev, false, false);
+}
+
+static unsigned int pci_2_3_irq_check_and_unmask(struct pci_dev *dev)
+{
+	return pci_2_3_set_irq_mask(dev, false, true);
+}
+
+static irqreturn_t kvm_assigned_dev_intr(int irq, void *dev_id)
+{
+	struct kvm_assigned_dev_kernel *assigned_dev = dev_id;
+	int ret = IRQ_WAKE_THREAD;
+
+	spin_lock(&assigned_dev->intx_lock);
+	if (assigned_dev->host_irq_disabled ||
+	    !(pci_2_3_irq_check_and_mask(assigned_dev->dev) &
+			PCI_STATUS_INTERRUPT))
+		ret = IRQ_NONE;
+	else
+		assigned_dev->host_irq_disabled = true;
+	spin_unlock(&assigned_dev->intx_lock);
+
+	return ret;
+}
+
 static irqreturn_t kvm_assigned_dev_thread(int irq, void *dev_id)
 {
 	struct kvm_assigned_dev_kernel *assigned_dev = dev_id;
 	u32 vector;
 	int index;
 
-	if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_INTX) {
-		spin_lock(&assigned_dev->intx_lock);
+	if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_INTX &&
+	    !assigned_dev->pci_2_3) {
+		spin_lock_irq(&assigned_dev->intx_lock);
 		disable_irq_nosync(irq);
 		assigned_dev->host_irq_disabled = true;
-		spin_unlock(&assigned_dev->intx_lock);
+		spin_unlock_irq(&assigned_dev->intx_lock);
 	}
 
 	if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSIX) {
@@ -87,6 +215,7 @@ static irqreturn_t kvm_assigned_dev_thread(int irq, void *dev_id)
 static void kvm_assigned_dev_ack_irq(struct kvm_irq_ack_notifier *kian)
 {
 	struct kvm_assigned_dev_kernel *dev;
+	bool reassert = false;
 
 	if (kian->gsi == -1)
 		return;
@@ -99,12 +228,23 @@ static void kvm_assigned_dev_ack_irq(struct kvm_irq_ack_notifier *kian)
 	/* The guest irq may be shared so this ack may be
 	 * from another device.
 	 */
-	spin_lock(&dev->intx_lock);
+	spin_lock_irq(&dev->intx_lock);
 	if (dev->host_irq_disabled) {
-		enable_irq(dev->host_irq);
+		if (dev->pci_2_3) {
+			if (pci_2_3_irq_check_and_unmask(dev->dev) &
+			    PCI_STATUS_INTERRUPT) {
+				reassert = true;
+				goto out;
+			}
+		} else
+			enable_irq(dev->host_irq);
 		dev->host_irq_disabled = false;
 	}
-	spin_unlock(&dev->intx_lock);
+out:
+	spin_unlock_irq(&dev->intx_lock);
+
+	if (reassert)
+		kvm_set_irq(dev->kvm, dev->irq_source_id, dev->guest_irq, 1);
 }
 
 static void deassign_guest_irq(struct kvm *kvm,
@@ -151,7 +291,11 @@ static void deassign_host_irq(struct kvm *kvm,
 		pci_disable_msix(assigned_dev->dev);
 	} else {
 		/* Deal with MSI and INTx */
-		disable_irq(assigned_dev->host_irq);
+		if (assigned_dev->pci_2_3) {
+			pci_2_3_irq_mask(assigned_dev->dev);
+			synchronize_irq(assigned_dev->host_irq);
+		} else
+			disable_irq(assigned_dev->host_irq);
 
 		free_irq(assigned_dev->host_irq, (void *)assigned_dev);
 
@@ -199,6 +343,13 @@ static void kvm_free_assigned_device(struct kvm *kvm,
 
 	pci_reset_function(assigned_dev->dev);
 
+	/*
+	 * Unmask the IRQ at PCI level once the reset is done - the next user
+	 * may not expect the IRQ being masked.
+	 */
+	if (assigned_dev->pci_2_3)
+		pci_2_3_irq_unmask(assigned_dev->dev);
+
 	pci_release_regions(assigned_dev->dev);
 	pci_disable_device(assigned_dev->dev);
 	pci_dev_put(assigned_dev->dev);
@@ -224,15 +375,29 @@ void kvm_free_all_assigned_devices(struct kvm *kvm)
 static int assigned_device_enable_host_intx(struct kvm *kvm,
 					    struct kvm_assigned_dev_kernel *dev)
 {
+	irq_handler_t irq_handler = NULL;
+	unsigned long flags = 0;
+
 	dev->host_irq = dev->dev->irq;
-	/* Even though this is PCI, we don't want to use shared
-	 * interrupts. Sharing host devices with guest-assigned devices
-	 * on the same interrupt line is not a happy situation: there
-	 * are going to be long delays in accepting, acking, etc.
+
+	/*
+	 * We can only share the IRQ line with other host devices if we are
+	 * able to disable the IRQ source at device-level - independently of
+	 * the guest driver. Otherwise host devices may suffer from unbounded
+	 * IRQ latencies when the guest keeps the line asserted.
 	 */
-	if (request_threaded_irq(dev->host_irq, NULL, kvm_assigned_dev_thread,
-				 0, dev->irq_name, (void *)dev))
+	dev->pci_2_3 = pci_2_3_supported(dev->dev);
+	if (dev->pci_2_3) {
+		irq_handler = kvm_assigned_dev_intr;
+		flags = IRQF_SHARED;
+	}
+	if (request_threaded_irq(dev->host_irq, irq_handler,
+				 kvm_assigned_dev_thread, flags,
+				 dev->irq_name, (void *)dev))
 		return -EIO;
+
+	if (dev->pci_2_3)
+		pci_2_3_irq_unmask(dev->dev);
 	return 0;
 }
 
@@ -308,7 +473,6 @@ static int assigned_device_enable_guest_msi(struct kvm *kvm,
 {
 	dev->guest_irq = irq->guest_irq;
 	dev->ack_notifier.gsi = -1;
-	dev->host_irq_disabled = false;
 	return 0;
 }
 #endif
@@ -320,7 +484,6 @@ static int assigned_device_enable_guest_msix(struct kvm *kvm,
 {
 	dev->guest_irq = irq->guest_irq;
 	dev->ack_notifier.gsi = -1;
-	dev->host_irq_disabled = false;
 	return 0;
 }
 #endif
@@ -354,6 +517,7 @@ static int assign_host_irq(struct kvm *kvm,
 	default:
 		r = -EINVAL;
 	}
+	dev->host_irq_disabled = false;
 
 	if (!r)
 		dev->irq_requested_type |= host_irq_type;
-- 
1.7.1


^ permalink raw reply related	[flat|nested] 30+ messages in thread

* Re: [PATCH v2 4/4] KVM: Allow host IRQ sharing for passed-through PCI 2.3 devices
  2010-11-02 15:49 ` [PATCH v2 4/4] KVM: Allow host IRQ sharing for passed-through PCI 2.3 devices Jan Kiszka
@ 2010-11-02 17:41   ` Michael S. Tsirkin
  2010-11-02 17:56     ` Jan Kiszka
  0 siblings, 1 reply; 30+ messages in thread
From: Michael S. Tsirkin @ 2010-11-02 17:41 UTC (permalink / raw)
  To: Jan Kiszka; +Cc: Avi Kivity, Marcelo Tosatti, kvm, Alex Williamson

On Tue, Nov 02, 2010 at 04:49:20PM +0100, Jan Kiszka wrote:
> PCI 2.3 allows to generically disable IRQ sources at device level. This
> enables us to share IRQs of such devices between on the host side when
> passing them to a guest.
> 
> Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
> ---
>  include/linux/kvm_host.h |    1 +
>  virt/kvm/assigned-dev.c  |  194 ++++++++++++++++++++++++++++++++++++++++++----
>  2 files changed, 180 insertions(+), 15 deletions(-)
> 
> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> index 46120da..fdc2bd9 100644
> --- a/include/linux/kvm_host.h
> +++ b/include/linux/kvm_host.h
> @@ -466,6 +466,7 @@ struct kvm_assigned_dev_kernel {
>  	unsigned int entries_nr;
>  	int host_irq;
>  	bool host_irq_disabled;
> +	bool pci_2_3;
>  	struct msix_entry *host_msix_entries;
>  	int guest_irq;
>  	struct msix_entry *guest_msix_entries;
> diff --git a/virt/kvm/assigned-dev.c b/virt/kvm/assigned-dev.c
> index ca402ed..91fe9c8 100644
> --- a/virt/kvm/assigned-dev.c
> +++ b/virt/kvm/assigned-dev.c
> @@ -55,17 +55,145 @@ static int find_index_from_host_irq(struct kvm_assigned_dev_kernel
>  	return index;
>  }
>  
> +/*
> + * Verify that the device supports Interrupt Disable bit in command register,
> + * per PCI 2.3, by flipping this bit and reading it back: this bit was readonly
> + * in PCI 2.2.
> + */
> +static bool pci_2_3_supported(struct pci_dev *pdev)
> +{
> +	bool supported = false;
> +	u16 orig, new;
> +
> +	pci_block_user_cfg_access(pdev);
> +	pci_read_config_word(pdev, PCI_COMMAND, &orig);
> +	pci_write_config_word(pdev, PCI_COMMAND,
> +			      orig ^ PCI_COMMAND_INTX_DISABLE);
> +	pci_read_config_word(pdev, PCI_COMMAND, &new);
> +
> +	/*
> +	 * There's no way to protect against
> +	 * hardware bugs or detect them reliably, but as long as we know
> +	 * what the value should be, let's go ahead and check it.
> +	 */
> +	if ((new ^ orig) & ~PCI_COMMAND_INTX_DISABLE) {
> +		dev_err(&pdev->dev, "Command changed from 0x%x to 0x%x: "
> +			"driver or HW bug?\n", orig, new);
> +		goto out;
> +	}
> +	if (!((new ^ orig) & PCI_COMMAND_INTX_DISABLE)) {
> +		dev_warn(&pdev->dev, "Device does not support "
> +			 "disabling interrupts: unable to bind.\n");
> +		goto out;
> +	}
> +	supported = true;
> +
> +	/* Now restore the original value. */
> +	pci_write_config_word(pdev, PCI_COMMAND, orig);
> +
> +out:
> +	pci_unblock_user_cfg_access(pdev);
> +	return supported;
> +}
> +
> +static unsigned int
> +pci_2_3_set_irq_mask(struct pci_dev *dev, bool mask, bool check_status)
> +{
> +	u32 cmd_status_dword;
> +	u16 origcmd, newcmd;
> +	unsigned int status;
> +
> +	/*
> +	 * We do a single dword read to retrieve both command and status.
> +	 * Document assumptions that make this possible.
> +	 */
> +	BUILD_BUG_ON(PCI_COMMAND % 4);
> +	BUILD_BUG_ON(PCI_COMMAND + 2 != PCI_STATUS);
> +
> +	pci_block_user_cfg_access(dev);
> +
> +	/*
> +	 * Read both command and status registers in a single 32-bit operation.
> +	 * Note: we could cache the value for command and move the status read
> +	 * out of the lock if there was a way to get notified of user changes
> +	 * to command register through sysfs. Should be good for shared irqs.
> +	 */
> +	pci_read_config_dword(dev, PCI_COMMAND, &cmd_status_dword);
> +	origcmd = cmd_status_dword;
> +	status = cmd_status_dword >> 16;
> +
> +	if (check_status) {
> +		bool irq_pending = status & PCI_STATUS_INTERRUPT;
> +
> +		/*
> +		 * Check interrupt status register to see whether our device
> +		 * triggered the interrupt (when masking) or the next IRQ is
> +		 * already pending (when unmasking).
> +		 */
> +		if (!(mask == irq_pending))

Same as mask != irq_pending?

> +			goto done;
> +	}
> +
> +	newcmd = origcmd & ~PCI_COMMAND_INTX_DISABLE;
> +	if (mask)
> +		newcmd |= PCI_COMMAND_INTX_DISABLE;
> +	if (newcmd != origcmd)
> +		pci_write_config_word(dev, PCI_COMMAND, newcmd);
> +
> +done:
> +	pci_unblock_user_cfg_access(dev);
> +	return status;
> +}
> +
> +static void pci_2_3_irq_mask(struct pci_dev *dev)
> +{
> +	pci_2_3_set_irq_mask(dev, true, false);
> +}
> +
> +static unsigned int pci_2_3_irq_check_and_mask(struct pci_dev *dev)
> +{
> +	return pci_2_3_set_irq_mask(dev, true, true);
> +}
> +
> +static void pci_2_3_irq_unmask(struct pci_dev *dev)
> +{
> +	pci_2_3_set_irq_mask(dev, false, false);
> +}
> +
> +static unsigned int pci_2_3_irq_check_and_unmask(struct pci_dev *dev)
> +{
> +	return pci_2_3_set_irq_mask(dev, false, true);
> +}
> +

IMO this is not a terribly good interface: all users check the pending bit
(PCI_STATUS_INTERRUPT) which is what the function pci_2_3_set_irq_mask
did anyway.  I'd suggest returning irqreturn_t or bool and not unsigned
int.


> +static irqreturn_t kvm_assigned_dev_intr(int irq, void *dev_id)
> +{
> +	struct kvm_assigned_dev_kernel *assigned_dev = dev_id;
> +	int ret = IRQ_WAKE_THREAD;
> +
> +	spin_lock(&assigned_dev->intx_lock);
> +	if (assigned_dev->host_irq_disabled ||
> +	    !(pci_2_3_irq_check_and_mask(assigned_dev->dev) &
> +			PCI_STATUS_INTERRUPT))
> +		ret = IRQ_NONE;
> +	else
> +		assigned_dev->host_irq_disabled = true;

This is a bug I think.  For pci 2.3 we should never track interrupt
state in kvm IMO.  For example, if userspace unmasks an interrupt
through a config write, we will get an interrupt while host_irq_disabled
is set. If we then fail to mask it, kaboom.

> +	spin_unlock(&assigned_dev->intx_lock);
> +
> +	return ret;
> +}
> +
>  static irqreturn_t kvm_assigned_dev_thread(int irq, void *dev_id)
>  {
>  	struct kvm_assigned_dev_kernel *assigned_dev = dev_id;
>  	u32 vector;
>  	int index;
>  
> -	if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_INTX) {
> -		spin_lock(&assigned_dev->intx_lock);
> +	if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_INTX &&
> +	    !assigned_dev->pci_2_3) {
> +		spin_lock_irq(&assigned_dev->intx_lock);
>  		disable_irq_nosync(irq);
>  		assigned_dev->host_irq_disabled = true;
> -		spin_unlock(&assigned_dev->intx_lock);
> +		spin_unlock_irq(&assigned_dev->intx_lock);
>  	}
>  
>  	if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSIX) {
> @@ -87,6 +215,7 @@ static irqreturn_t kvm_assigned_dev_thread(int irq, void *dev_id)
>  static void kvm_assigned_dev_ack_irq(struct kvm_irq_ack_notifier *kian)
>  {
>  	struct kvm_assigned_dev_kernel *dev;
> +	bool reassert = false;
>  
>  	if (kian->gsi == -1)
>  		return;
> @@ -99,12 +228,23 @@ static void kvm_assigned_dev_ack_irq(struct kvm_irq_ack_notifier *kian)
>  	/* The guest irq may be shared so this ack may be
>  	 * from another device.
>  	 */
> -	spin_lock(&dev->intx_lock);
> +	spin_lock_irq(&dev->intx_lock);
>  	if (dev->host_irq_disabled) {
> -		enable_irq(dev->host_irq);
> +		if (dev->pci_2_3) {
> +			if (pci_2_3_irq_check_and_unmask(dev->dev) &
> +			    PCI_STATUS_INTERRUPT) {
> +				reassert = true;
> +				goto out;
> +			}
> +		} else
> +			enable_irq(dev->host_irq);

Or

		if (!dev->pci_2_3)
			enable_irq(dev->host_irq);
		else if (pci_2_3_irq_check_and_unmask(dev->dev) & PCI_STATUS_INTERRUPT) {
			reassert = true;
			goto out;
		}

to reduce nesting.

>  		dev->host_irq_disabled = false;
>  	}
> -	spin_unlock(&dev->intx_lock);
> +out:
> +	spin_unlock_irq(&dev->intx_lock);
> +
> +	if (reassert)
> +		kvm_set_irq(dev->kvm, dev->irq_source_id, dev->guest_irq, 1);

Hmm, I think this still has more overhead than it needs to have.
Instead of setting level to 0 and then back to 1, can't we just
avoid set to 1 in the first place? This would need a different
interface than pci_2_3_irq_check_and_unmask to avoid a race
where interrupt is received while we are acking another one:

	block userspace access
	check pending bit
	if (!pending)
		set irq (0)
	clear pending
	block userspace access

Would be worth it for high volume devices.

>  }
>  
>  static void deassign_guest_irq(struct kvm *kvm,
> @@ -151,7 +291,11 @@ static void deassign_host_irq(struct kvm *kvm,
>  		pci_disable_msix(assigned_dev->dev);
>  	} else {
>  		/* Deal with MSI and INTx */
> -		disable_irq(assigned_dev->host_irq);
> +		if (assigned_dev->pci_2_3) {
> +			pci_2_3_irq_mask(assigned_dev->dev);
> +			synchronize_irq(assigned_dev->host_irq);
> +		} else
> +			disable_irq(assigned_dev->host_irq);
>  
>  		free_irq(assigned_dev->host_irq, (void *)assigned_dev);
>  
> @@ -199,6 +343,13 @@ static void kvm_free_assigned_device(struct kvm *kvm,
>  
>  	pci_reset_function(assigned_dev->dev);
>  
> +	/*
> +	 * Unmask the IRQ at PCI level once the reset is done - the next user
> +	 * may not expect the IRQ being masked.
> +	 */
> +	if (assigned_dev->pci_2_3)
> +		pci_2_3_irq_unmask(assigned_dev->dev);
> +

Doesn't pci_reset_function clear mask bit? It seems to ...

>  	pci_release_regions(assigned_dev->dev);
>  	pci_disable_device(assigned_dev->dev);
>  	pci_dev_put(assigned_dev->dev);
> @@ -224,15 +375,29 @@ void kvm_free_all_assigned_devices(struct kvm *kvm)
>  static int assigned_device_enable_host_intx(struct kvm *kvm,
>  					    struct kvm_assigned_dev_kernel *dev)
>  {
> +	irq_handler_t irq_handler = NULL;
> +	unsigned long flags = 0;
> +
>  	dev->host_irq = dev->dev->irq;
> -	/* Even though this is PCI, we don't want to use shared
> -	 * interrupts. Sharing host devices with guest-assigned devices
> -	 * on the same interrupt line is not a happy situation: there
> -	 * are going to be long delays in accepting, acking, etc.
> +
> +	/*
> +	 * We can only share the IRQ line with other host devices if we are
> +	 * able to disable the IRQ source at device-level - independently of
> +	 * the guest driver. Otherwise host devices may suffer from unbounded
> +	 * IRQ latencies when the guest keeps the line asserted.
>  	 */
> -	if (request_threaded_irq(dev->host_irq, NULL, kvm_assigned_dev_thread,
> -				 0, dev->irq_name, (void *)dev))
> +	dev->pci_2_3 = pci_2_3_supported(dev->dev);
> +	if (dev->pci_2_3) {
> +		irq_handler = kvm_assigned_dev_intr;
> +		flags = IRQF_SHARED;
> +	}

I would prefer and else clause here instead of initializing
variables at the top and overwriting here. Makes it easier
to see which value is valid when.

> +	if (request_threaded_irq(dev->host_irq, irq_handler,
> +				 kvm_assigned_dev_thread, flags,
> +				 dev->irq_name, (void *)dev))
>  		return -EIO;
> +
> +	if (dev->pci_2_3)
> +		pci_2_3_irq_unmask(dev->dev);
>  	return 0;
>  }
>  
> @@ -308,7 +473,6 @@ static int assigned_device_enable_guest_msi(struct kvm *kvm,
>  {
>  	dev->guest_irq = irq->guest_irq;
>  	dev->ack_notifier.gsi = -1;
> -	dev->host_irq_disabled = false;
>  	return 0;
>  }
>  #endif
> @@ -320,7 +484,6 @@ static int assigned_device_enable_guest_msix(struct kvm *kvm,
>  {
>  	dev->guest_irq = irq->guest_irq;
>  	dev->ack_notifier.gsi = -1;
> -	dev->host_irq_disabled = false;
>  	return 0;
>  }
>  #endif
> @@ -354,6 +517,7 @@ static int assign_host_irq(struct kvm *kvm,
>  	default:
>  		r = -EINVAL;
>  	}
> +	dev->host_irq_disabled = false;
>  
>  	if (!r)
>  		dev->irq_requested_type |= host_irq_type;
> -- 
> 1.7.1

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH v2 4/4] KVM: Allow host IRQ sharing for passed-through PCI 2.3 devices
  2010-11-02 17:41   ` Michael S. Tsirkin
@ 2010-11-02 17:56     ` Jan Kiszka
  2010-11-02 18:24       ` Michael S. Tsirkin
  0 siblings, 1 reply; 30+ messages in thread
From: Jan Kiszka @ 2010-11-02 17:56 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: Avi Kivity, Marcelo Tosatti, kvm, Alex Williamson

Am 02.11.2010 18:41, Michael S. Tsirkin wrote:
> On Tue, Nov 02, 2010 at 04:49:20PM +0100, Jan Kiszka wrote:
>> PCI 2.3 allows to generically disable IRQ sources at device level. This
>> enables us to share IRQs of such devices between on the host side when
>> passing them to a guest.
>>
>> Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
>> ---
>>  include/linux/kvm_host.h |    1 +
>>  virt/kvm/assigned-dev.c  |  194 ++++++++++++++++++++++++++++++++++++++++++----
>>  2 files changed, 180 insertions(+), 15 deletions(-)
>>
>> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
>> index 46120da..fdc2bd9 100644
>> --- a/include/linux/kvm_host.h
>> +++ b/include/linux/kvm_host.h
>> @@ -466,6 +466,7 @@ struct kvm_assigned_dev_kernel {
>>  	unsigned int entries_nr;
>>  	int host_irq;
>>  	bool host_irq_disabled;
>> +	bool pci_2_3;
>>  	struct msix_entry *host_msix_entries;
>>  	int guest_irq;
>>  	struct msix_entry *guest_msix_entries;
>> diff --git a/virt/kvm/assigned-dev.c b/virt/kvm/assigned-dev.c
>> index ca402ed..91fe9c8 100644
>> --- a/virt/kvm/assigned-dev.c
>> +++ b/virt/kvm/assigned-dev.c
>> @@ -55,17 +55,145 @@ static int find_index_from_host_irq(struct kvm_assigned_dev_kernel
>>  	return index;
>>  }
>>  
>> +/*
>> + * Verify that the device supports Interrupt Disable bit in command register,
>> + * per PCI 2.3, by flipping this bit and reading it back: this bit was readonly
>> + * in PCI 2.2.
>> + */
>> +static bool pci_2_3_supported(struct pci_dev *pdev)
>> +{
>> +	bool supported = false;
>> +	u16 orig, new;
>> +
>> +	pci_block_user_cfg_access(pdev);
>> +	pci_read_config_word(pdev, PCI_COMMAND, &orig);
>> +	pci_write_config_word(pdev, PCI_COMMAND,
>> +			      orig ^ PCI_COMMAND_INTX_DISABLE);
>> +	pci_read_config_word(pdev, PCI_COMMAND, &new);
>> +
>> +	/*
>> +	 * There's no way to protect against
>> +	 * hardware bugs or detect them reliably, but as long as we know
>> +	 * what the value should be, let's go ahead and check it.
>> +	 */
>> +	if ((new ^ orig) & ~PCI_COMMAND_INTX_DISABLE) {
>> +		dev_err(&pdev->dev, "Command changed from 0x%x to 0x%x: "
>> +			"driver or HW bug?\n", orig, new);
>> +		goto out;
>> +	}
>> +	if (!((new ^ orig) & PCI_COMMAND_INTX_DISABLE)) {
>> +		dev_warn(&pdev->dev, "Device does not support "
>> +			 "disabling interrupts: unable to bind.\n");
>> +		goto out;
>> +	}
>> +	supported = true;
>> +
>> +	/* Now restore the original value. */
>> +	pci_write_config_word(pdev, PCI_COMMAND, orig);
>> +
>> +out:
>> +	pci_unblock_user_cfg_access(pdev);
>> +	return supported;
>> +}
>> +
>> +static unsigned int
>> +pci_2_3_set_irq_mask(struct pci_dev *dev, bool mask, bool check_status)
>> +{
>> +	u32 cmd_status_dword;
>> +	u16 origcmd, newcmd;
>> +	unsigned int status;
>> +
>> +	/*
>> +	 * We do a single dword read to retrieve both command and status.
>> +	 * Document assumptions that make this possible.
>> +	 */
>> +	BUILD_BUG_ON(PCI_COMMAND % 4);
>> +	BUILD_BUG_ON(PCI_COMMAND + 2 != PCI_STATUS);
>> +
>> +	pci_block_user_cfg_access(dev);
>> +
>> +	/*
>> +	 * Read both command and status registers in a single 32-bit operation.
>> +	 * Note: we could cache the value for command and move the status read
>> +	 * out of the lock if there was a way to get notified of user changes
>> +	 * to command register through sysfs. Should be good for shared irqs.
>> +	 */
>> +	pci_read_config_dword(dev, PCI_COMMAND, &cmd_status_dword);
>> +	origcmd = cmd_status_dword;
>> +	status = cmd_status_dword >> 16;
>> +
>> +	if (check_status) {
>> +		bool irq_pending = status & PCI_STATUS_INTERRUPT;
>> +
>> +		/*
>> +		 * Check interrupt status register to see whether our device
>> +		 * triggered the interrupt (when masking) or the next IRQ is
>> +		 * already pending (when unmasking).
>> +		 */
>> +		if (!(mask == irq_pending))
> 
> Same as mask != irq_pending?

Yes. Relict of various refactorings.

> 
>> +			goto done;
>> +	}
>> +
>> +	newcmd = origcmd & ~PCI_COMMAND_INTX_DISABLE;
>> +	if (mask)
>> +		newcmd |= PCI_COMMAND_INTX_DISABLE;
>> +	if (newcmd != origcmd)
>> +		pci_write_config_word(dev, PCI_COMMAND, newcmd);
>> +
>> +done:
>> +	pci_unblock_user_cfg_access(dev);
>> +	return status;
>> +}
>> +
>> +static void pci_2_3_irq_mask(struct pci_dev *dev)
>> +{
>> +	pci_2_3_set_irq_mask(dev, true, false);
>> +}
>> +
>> +static unsigned int pci_2_3_irq_check_and_mask(struct pci_dev *dev)
>> +{
>> +	return pci_2_3_set_irq_mask(dev, true, true);
>> +}
>> +
>> +static void pci_2_3_irq_unmask(struct pci_dev *dev)
>> +{
>> +	pci_2_3_set_irq_mask(dev, false, false);
>> +}
>> +
>> +static unsigned int pci_2_3_irq_check_and_unmask(struct pci_dev *dev)
>> +{
>> +	return pci_2_3_set_irq_mask(dev, false, true);
>> +}
>> +
> 
> IMO this is not a terribly good interface: all users check the pending bit
> (PCI_STATUS_INTERRUPT) which is what the function pci_2_3_set_irq_mask
> did anyway.  I'd suggest returning irqreturn_t or bool and not unsigned
> int.

Agreed. Originally, I thought there are more bits in the status word the
caller may make use of. But there are in fact none.

> 
> 
>> +static irqreturn_t kvm_assigned_dev_intr(int irq, void *dev_id)
>> +{
>> +	struct kvm_assigned_dev_kernel *assigned_dev = dev_id;
>> +	int ret = IRQ_WAKE_THREAD;
>> +
>> +	spin_lock(&assigned_dev->intx_lock);
>> +	if (assigned_dev->host_irq_disabled ||
>> +	    !(pci_2_3_irq_check_and_mask(assigned_dev->dev) &
>> +			PCI_STATUS_INTERRUPT))
>> +		ret = IRQ_NONE;
>> +	else
>> +		assigned_dev->host_irq_disabled = true;
> 
> This is a bug I think.  For pci 2.3 we should never track interrupt
> state in kvm IMO.  For example, if userspace unmasks an interrupt
> through a config write, we will get an interrupt while host_irq_disabled
> is set. If we then fail to mask it, kaboom.

Good point. There is no way around evaluating the status word as long as
user space can fiddle with INTX_DISABLE.

> 
>> +	spin_unlock(&assigned_dev->intx_lock);
>> +
>> +	return ret;
>> +}
>> +
>>  static irqreturn_t kvm_assigned_dev_thread(int irq, void *dev_id)
>>  {
>>  	struct kvm_assigned_dev_kernel *assigned_dev = dev_id;
>>  	u32 vector;
>>  	int index;
>>  
>> -	if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_INTX) {
>> -		spin_lock(&assigned_dev->intx_lock);
>> +	if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_INTX &&
>> +	    !assigned_dev->pci_2_3) {
>> +		spin_lock_irq(&assigned_dev->intx_lock);
>>  		disable_irq_nosync(irq);
>>  		assigned_dev->host_irq_disabled = true;
>> -		spin_unlock(&assigned_dev->intx_lock);
>> +		spin_unlock_irq(&assigned_dev->intx_lock);
>>  	}
>>  
>>  	if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSIX) {
>> @@ -87,6 +215,7 @@ static irqreturn_t kvm_assigned_dev_thread(int irq, void *dev_id)
>>  static void kvm_assigned_dev_ack_irq(struct kvm_irq_ack_notifier *kian)
>>  {
>>  	struct kvm_assigned_dev_kernel *dev;
>> +	bool reassert = false;
>>  
>>  	if (kian->gsi == -1)
>>  		return;
>> @@ -99,12 +228,23 @@ static void kvm_assigned_dev_ack_irq(struct kvm_irq_ack_notifier *kian)
>>  	/* The guest irq may be shared so this ack may be
>>  	 * from another device.
>>  	 */
>> -	spin_lock(&dev->intx_lock);
>> +	spin_lock_irq(&dev->intx_lock);
>>  	if (dev->host_irq_disabled) {
>> -		enable_irq(dev->host_irq);
>> +		if (dev->pci_2_3) {
>> +			if (pci_2_3_irq_check_and_unmask(dev->dev) &
>> +			    PCI_STATUS_INTERRUPT) {
>> +				reassert = true;
>> +				goto out;
>> +			}
>> +		} else
>> +			enable_irq(dev->host_irq);
> 
> Or
> 
> 		if (!dev->pci_2_3)
> 			enable_irq(dev->host_irq);
> 		else if (pci_2_3_irq_check_and_unmask(dev->dev) & PCI_STATUS_INTERRUPT) {
> 			reassert = true;
> 			goto out;
> 		}
> 
> to reduce nesting.

Yeah.

> 
>>  		dev->host_irq_disabled = false;
>>  	}
>> -	spin_unlock(&dev->intx_lock);
>> +out:
>> +	spin_unlock_irq(&dev->intx_lock);
>> +
>> +	if (reassert)
>> +		kvm_set_irq(dev->kvm, dev->irq_source_id, dev->guest_irq, 1);
> 
> Hmm, I think this still has more overhead than it needs to have.
> Instead of setting level to 0 and then back to 1, can't we just
> avoid set to 1 in the first place? This would need a different
> interface than pci_2_3_irq_check_and_unmask to avoid a race
> where interrupt is received while we are acking another one:
> 
> 	block userspace access
> 	check pending bit
> 	if (!pending)
> 		set irq (0)
> 	clear pending
> 	block userspace access
> 
> Would be worth it for high volume devices.

The problem is that we can't reorder guest IRQ line clearing and host
IRQ line enabling without taking a lock across host IRQ disable + guest
IRQ raise - and that is now distributed across hard and threaded IRQ
handlers and we don't want to hold and IRQ-safe lock during kvm_set_irq.

> 
>>  }
>>  
>>  static void deassign_guest_irq(struct kvm *kvm,
>> @@ -151,7 +291,11 @@ static void deassign_host_irq(struct kvm *kvm,
>>  		pci_disable_msix(assigned_dev->dev);
>>  	} else {
>>  		/* Deal with MSI and INTx */
>> -		disable_irq(assigned_dev->host_irq);
>> +		if (assigned_dev->pci_2_3) {
>> +			pci_2_3_irq_mask(assigned_dev->dev);
>> +			synchronize_irq(assigned_dev->host_irq);
>> +		} else
>> +			disable_irq(assigned_dev->host_irq);
>>  
>>  		free_irq(assigned_dev->host_irq, (void *)assigned_dev);
>>  
>> @@ -199,6 +343,13 @@ static void kvm_free_assigned_device(struct kvm *kvm,
>>  
>>  	pci_reset_function(assigned_dev->dev);
>>  
>> +	/*
>> +	 * Unmask the IRQ at PCI level once the reset is done - the next user
>> +	 * may not expect the IRQ being masked.
>> +	 */
>> +	if (assigned_dev->pci_2_3)
>> +		pci_2_3_irq_unmask(assigned_dev->dev);
>> +
> 
> Doesn't pci_reset_function clear mask bit? It seems to ...

I was left with non-functional devices for the host here if I was not
doing this. Need to recheck, but I think it was required.

> 
>>  	pci_release_regions(assigned_dev->dev);
>>  	pci_disable_device(assigned_dev->dev);
>>  	pci_dev_put(assigned_dev->dev);
>> @@ -224,15 +375,29 @@ void kvm_free_all_assigned_devices(struct kvm *kvm)
>>  static int assigned_device_enable_host_intx(struct kvm *kvm,
>>  					    struct kvm_assigned_dev_kernel *dev)
>>  {
>> +	irq_handler_t irq_handler = NULL;
>> +	unsigned long flags = 0;
>> +
>>  	dev->host_irq = dev->dev->irq;
>> -	/* Even though this is PCI, we don't want to use shared
>> -	 * interrupts. Sharing host devices with guest-assigned devices
>> -	 * on the same interrupt line is not a happy situation: there
>> -	 * are going to be long delays in accepting, acking, etc.
>> +
>> +	/*
>> +	 * We can only share the IRQ line with other host devices if we are
>> +	 * able to disable the IRQ source at device-level - independently of
>> +	 * the guest driver. Otherwise host devices may suffer from unbounded
>> +	 * IRQ latencies when the guest keeps the line asserted.
>>  	 */
>> -	if (request_threaded_irq(dev->host_irq, NULL, kvm_assigned_dev_thread,
>> -				 0, dev->irq_name, (void *)dev))
>> +	dev->pci_2_3 = pci_2_3_supported(dev->dev);
>> +	if (dev->pci_2_3) {
>> +		irq_handler = kvm_assigned_dev_intr;
>> +		flags = IRQF_SHARED;
>> +	}
> 
> I would prefer and else clause here instead of initializing
> variables at the top and overwriting here. Makes it easier
> to see which value is valid when.

OK.

Thanks,
Jan

-- 
Siemens AG, Corporate Technology, CT T DE IT 1
Corporate Competence Center Embedded Linux

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH v2 4/4] KVM: Allow host IRQ sharing for passed-through PCI 2.3 devices
  2010-11-02 17:56     ` Jan Kiszka
@ 2010-11-02 18:24       ` Michael S. Tsirkin
  2010-11-02 18:40         ` Jan Kiszka
  0 siblings, 1 reply; 30+ messages in thread
From: Michael S. Tsirkin @ 2010-11-02 18:24 UTC (permalink / raw)
  To: Jan Kiszka; +Cc: Avi Kivity, Marcelo Tosatti, kvm, Alex Williamson

On Tue, Nov 02, 2010 at 06:56:14PM +0100, Jan Kiszka wrote:
> >> @@ -99,12 +228,23 @@ static void kvm_assigned_dev_ack_irq(struct kvm_irq_ack_notifier *kian)
> >>  	/* The guest irq may be shared so this ack may be
> >>  	 * from another device.
> >>  	 */
> >> -	spin_lock(&dev->intx_lock);
> >> +	spin_lock_irq(&dev->intx_lock);
> >>  	if (dev->host_irq_disabled) {
> >> -		enable_irq(dev->host_irq);
> >> +		if (dev->pci_2_3) {
> >> +			if (pci_2_3_irq_check_and_unmask(dev->dev) &
> >> +			    PCI_STATUS_INTERRUPT) {
> >> +				reassert = true;
> >> +				goto out;
> >> +			}
> >> +		} else
> >> +			enable_irq(dev->host_irq);
> > 
> > Or
> > 
> > 		if (!dev->pci_2_3)
> > 			enable_irq(dev->host_irq);
> > 		else if (pci_2_3_irq_check_and_unmask(dev->dev) & PCI_STATUS_INTERRUPT) {
> > 			reassert = true;
> > 			goto out;
> > 		}
> > 
> > to reduce nesting.
> 
> Yeah.
> 
> > 
> >>  		dev->host_irq_disabled = false;
> >>  	}
> >> -	spin_unlock(&dev->intx_lock);
> >> +out:
> >> +	spin_unlock_irq(&dev->intx_lock);
> >> +
> >> +	if (reassert)
> >> +		kvm_set_irq(dev->kvm, dev->irq_source_id, dev->guest_irq, 1);
> > 
> > Hmm, I think this still has more overhead than it needs to have.
> > Instead of setting level to 0 and then back to 1, can't we just
> > avoid set to 1 in the first place? This would need a different
> > interface than pci_2_3_irq_check_and_unmask to avoid a race
> > where interrupt is received while we are acking another one:
> > 
> > 	block userspace access
> > 	check pending bit
> > 	if (!pending)
> > 		set irq (0)
> > 	clear pending
> > 	block userspace access
> > 
> > Would be worth it for high volume devices.
> 
> The problem is that we can't reorder guest IRQ line clearing and host
> IRQ line enabling without taking a lock across host IRQ disable + guest
> IRQ raise - and that is now distributed across hard and threaded IRQ
> handlers and we don't want to hold and IRQ-safe lock during kvm_set_irq.

Oh I think I confused you.
What I mean is:

 	block userspace access
 	check interrupt status bit
 	if (!interrupt status bit set)
 		set irq (0)
 	clear interrupt disable bit
 	block userspace access

This way we enable interrupt after set irq so not need for
extra locks I think.

Hmm one thing I noticed is that pci_block_user_cfg_access
will BUG_ON if it was already blocked. So I think we have
a bug here when interrupt handler kicks in right after
we unmask interrupts.

Probably need some kind of lock to protect against this.


> > 
> >>  }
> >>  
> >>  static void deassign_guest_irq(struct kvm *kvm,
> >> @@ -151,7 +291,11 @@ static void deassign_host_irq(struct kvm *kvm,
> >>  		pci_disable_msix(assigned_dev->dev);
> >>  	} else {
> >>  		/* Deal with MSI and INTx */
> >> -		disable_irq(assigned_dev->host_irq);
> >> +		if (assigned_dev->pci_2_3) {
> >> +			pci_2_3_irq_mask(assigned_dev->dev);
> >> +			synchronize_irq(assigned_dev->host_irq);
> >> +		} else
> >> +			disable_irq(assigned_dev->host_irq);
> >>  
> >>  		free_irq(assigned_dev->host_irq, (void *)assigned_dev);
> >>  
> >> @@ -199,6 +343,13 @@ static void kvm_free_assigned_device(struct kvm *kvm,
> >>  
> >>  	pci_reset_function(assigned_dev->dev);
> >>  
> >> +	/*
> >> +	 * Unmask the IRQ at PCI level once the reset is done - the next user
> >> +	 * may not expect the IRQ being masked.
> >> +	 */
> >> +	if (assigned_dev->pci_2_3)
> >> +		pci_2_3_irq_unmask(assigned_dev->dev);
> >> +
> > 
> > Doesn't pci_reset_function clear mask bit? It seems to ...
> 
> I was left with non-functional devices for the host here if I was not
> doing this. Need to recheck, but I think it was required.

Interesting. Could you check why please?


> > 
> >>  	pci_release_regions(assigned_dev->dev);
> >>  	pci_disable_device(assigned_dev->dev);
> >>  	pci_dev_put(assigned_dev->dev);
> >> @@ -224,15 +375,29 @@ void kvm_free_all_assigned_devices(struct kvm *kvm)
> >>  static int assigned_device_enable_host_intx(struct kvm *kvm,
> >>  					    struct kvm_assigned_dev_kernel *dev)
> >>  {
> >> +	irq_handler_t irq_handler = NULL;
> >> +	unsigned long flags = 0;
> >> +
> >>  	dev->host_irq = dev->dev->irq;
> >> -	/* Even though this is PCI, we don't want to use shared
> >> -	 * interrupts. Sharing host devices with guest-assigned devices
> >> -	 * on the same interrupt line is not a happy situation: there
> >> -	 * are going to be long delays in accepting, acking, etc.
> >> +
> >> +	/*
> >> +	 * We can only share the IRQ line with other host devices if we are
> >> +	 * able to disable the IRQ source at device-level - independently of
> >> +	 * the guest driver. Otherwise host devices may suffer from unbounded
> >> +	 * IRQ latencies when the guest keeps the line asserted.
> >>  	 */
> >> -	if (request_threaded_irq(dev->host_irq, NULL, kvm_assigned_dev_thread,
> >> -				 0, dev->irq_name, (void *)dev))
> >> +	dev->pci_2_3 = pci_2_3_supported(dev->dev);
> >> +	if (dev->pci_2_3) {
> >> +		irq_handler = kvm_assigned_dev_intr;
> >> +		flags = IRQF_SHARED;
> >> +	}
> > 
> > I would prefer and else clause here instead of initializing
> > variables at the top and overwriting here. Makes it easier
> > to see which value is valid when.
> 
> OK.
> 
> Thanks,
> Jan
> 
> -- 
> Siemens AG, Corporate Technology, CT T DE IT 1
> Corporate Competence Center Embedded Linux

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH v2 4/4] KVM: Allow host IRQ sharing for passed-through PCI 2.3 devices
  2010-11-02 18:24       ` Michael S. Tsirkin
@ 2010-11-02 18:40         ` Jan Kiszka
  2010-11-02 18:48           ` Jan Kiszka
                             ` (2 more replies)
  0 siblings, 3 replies; 30+ messages in thread
From: Jan Kiszka @ 2010-11-02 18:40 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: Avi Kivity, Marcelo Tosatti, kvm, Alex Williamson

Am 02.11.2010 19:24, Michael S. Tsirkin wrote:
> On Tue, Nov 02, 2010 at 06:56:14PM +0100, Jan Kiszka wrote:
>>>>  		dev->host_irq_disabled = false;
>>>>  	}
>>>> -	spin_unlock(&dev->intx_lock);
>>>> +out:
>>>> +	spin_unlock_irq(&dev->intx_lock);
>>>> +
>>>> +	if (reassert)
>>>> +		kvm_set_irq(dev->kvm, dev->irq_source_id, dev->guest_irq, 1);
>>>
>>> Hmm, I think this still has more overhead than it needs to have.
>>> Instead of setting level to 0 and then back to 1, can't we just
>>> avoid set to 1 in the first place? This would need a different
>>> interface than pci_2_3_irq_check_and_unmask to avoid a race
>>> where interrupt is received while we are acking another one:
>>>
>>> 	block userspace access
>>> 	check pending bit
>>> 	if (!pending)
>>> 		set irq (0)
>>> 	clear pending
>>> 	block userspace access
>>>
>>> Would be worth it for high volume devices.
>>
>> The problem is that we can't reorder guest IRQ line clearing and host
>> IRQ line enabling without taking a lock across host IRQ disable + guest
>> IRQ raise - and that is now distributed across hard and threaded IRQ
>> handlers and we don't want to hold and IRQ-safe lock during kvm_set_irq.
> 
> Oh I think I confused you.
> What I mean is:
> 
>  	block userspace access
>  	check interrupt status bit
>  	if (!interrupt status bit set)
>  		set irq (0)
>  	clear interrupt disable bit
>  	block userspace access
> 
> This way we enable interrupt after set irq so not need for
> extra locks I think.

OK. Would require some serious refactoring again.

But what about edge IRQs? Don't we need to toggle the bit for them? And
as we do not differentiate between level and edge, we currently have to
do this unconditionally.

> 
> Hmm one thing I noticed is that pci_block_user_cfg_access
> will BUG_ON if it was already blocked. So I think we have
> a bug here when interrupt handler kicks in right after
> we unmask interrupts.
> 
> Probably need some kind of lock to protect against this.
> 

Or an atomic counter. Will have a look.

Alex, does VFIO take care of this already?

> 
>>>
>>>>  }
>>>>  
>>>>  static void deassign_guest_irq(struct kvm *kvm,
>>>> @@ -151,7 +291,11 @@ static void deassign_host_irq(struct kvm *kvm,
>>>>  		pci_disable_msix(assigned_dev->dev);
>>>>  	} else {
>>>>  		/* Deal with MSI and INTx */
>>>> -		disable_irq(assigned_dev->host_irq);
>>>> +		if (assigned_dev->pci_2_3) {
>>>> +			pci_2_3_irq_mask(assigned_dev->dev);
>>>> +			synchronize_irq(assigned_dev->host_irq);
>>>> +		} else
>>>> +			disable_irq(assigned_dev->host_irq);
>>>>  
>>>>  		free_irq(assigned_dev->host_irq, (void *)assigned_dev);
>>>>  
>>>> @@ -199,6 +343,13 @@ static void kvm_free_assigned_device(struct kvm *kvm,
>>>>  
>>>>  	pci_reset_function(assigned_dev->dev);
>>>>  
>>>> +	/*
>>>> +	 * Unmask the IRQ at PCI level once the reset is done - the next user
>>>> +	 * may not expect the IRQ being masked.
>>>> +	 */
>>>> +	if (assigned_dev->pci_2_3)
>>>> +		pci_2_3_irq_unmask(assigned_dev->dev);
>>>> +
>>>
>>> Doesn't pci_reset_function clear mask bit? It seems to ...
>>
>> I was left with non-functional devices for the host here if I was not
>> doing this. Need to recheck, but I think it was required.
> 
> Interesting. Could you check why please?
> 

Can't reproduce anymore. This was early code, maybe affected by some
bits or buts that no longer exist.

Spec says it's cleared on reset, so I removed those lines now.

Jan

-- 
Siemens AG, Corporate Technology, CT T DE IT 1
Corporate Competence Center Embedded Linux

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH v2 4/4] KVM: Allow host IRQ sharing for passed-through PCI 2.3 devices
  2010-11-02 18:40         ` Jan Kiszka
@ 2010-11-02 18:48           ` Jan Kiszka
  2010-11-02 18:51             ` Jan Kiszka
  2010-11-02 18:52           ` Michael S. Tsirkin
  2010-11-02 19:41           ` Alex Williamson
  2 siblings, 1 reply; 30+ messages in thread
From: Jan Kiszka @ 2010-11-02 18:48 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: Avi Kivity, Marcelo Tosatti, kvm, Alex Williamson

Am 02.11.2010 19:40, Jan Kiszka wrote:
>>>>> @@ -199,6 +343,13 @@ static void kvm_free_assigned_device(struct kvm *kvm,
>>>>>  
>>>>>  	pci_reset_function(assigned_dev->dev);
>>>>>  
>>>>> +	/*
>>>>> +	 * Unmask the IRQ at PCI level once the reset is done - the next user
>>>>> +	 * may not expect the IRQ being masked.
>>>>> +	 */
>>>>> +	if (assigned_dev->pci_2_3)
>>>>> +		pci_2_3_irq_unmask(assigned_dev->dev);
>>>>> +
>>>>
>>>> Doesn't pci_reset_function clear mask bit? It seems to ...
>>>
>>> I was left with non-functional devices for the host here if I was not
>>> doing this. Need to recheck, but I think it was required.
>>
>> Interesting. Could you check why please?
>>
> 
> Can't reproduce anymore. This was early code, maybe affected by some
> bits or buts that no longer exist.
> 
> Spec says it's cleared on reset, so I removed those lines now.
> 

Hmpf, it just happened again: Guest was using my ath9k, I killed the
guest, lspci says DisINTx+. Strange. Is anyone (qemu) restoring the
config space after the reset, bringing the disable bit back?

Jan

-- 
Siemens AG, Corporate Technology, CT T DE IT 1
Corporate Competence Center Embedded Linux

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH v2 4/4] KVM: Allow host IRQ sharing for passed-through PCI 2.3 devices
  2010-11-02 18:48           ` Jan Kiszka
@ 2010-11-02 18:51             ` Jan Kiszka
  2010-11-02 18:54               ` Michael S. Tsirkin
  0 siblings, 1 reply; 30+ messages in thread
From: Jan Kiszka @ 2010-11-02 18:51 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: Avi Kivity, Marcelo Tosatti, kvm, Alex Williamson

Am 02.11.2010 19:48, Jan Kiszka wrote:
> Am 02.11.2010 19:40, Jan Kiszka wrote:
>>>>>> @@ -199,6 +343,13 @@ static void kvm_free_assigned_device(struct kvm *kvm,
>>>>>>  
>>>>>>  	pci_reset_function(assigned_dev->dev);
>>>>>>  
>>>>>> +	/*
>>>>>> +	 * Unmask the IRQ at PCI level once the reset is done - the next user
>>>>>> +	 * may not expect the IRQ being masked.
>>>>>> +	 */
>>>>>> +	if (assigned_dev->pci_2_3)
>>>>>> +		pci_2_3_irq_unmask(assigned_dev->dev);
>>>>>> +
>>>>>
>>>>> Doesn't pci_reset_function clear mask bit? It seems to ...
>>>>
>>>> I was left with non-functional devices for the host here if I was not
>>>> doing this. Need to recheck, but I think it was required.
>>>
>>> Interesting. Could you check why please?
>>>
>>
>> Can't reproduce anymore. This was early code, maybe affected by some
>> bits or buts that no longer exist.
>>
>> Spec says it's cleared on reset, so I removed those lines now.
>>
> 
> Hmpf, it just happened again: Guest was using my ath9k, I killed the
> guest, lspci says DisINTx+. Strange. Is anyone (qemu) restoring the
> config space after the reset, bringing the disable bit back?

Or does the kernel cache the control word?

Jan

-- 
Siemens AG, Corporate Technology, CT T DE IT 1
Corporate Competence Center Embedded Linux

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH v2 4/4] KVM: Allow host IRQ sharing for passed-through PCI 2.3 devices
  2010-11-02 18:40         ` Jan Kiszka
  2010-11-02 18:48           ` Jan Kiszka
@ 2010-11-02 18:52           ` Michael S. Tsirkin
  2010-11-02 19:11             ` Jan Kiszka
  2010-11-02 19:41           ` Alex Williamson
  2 siblings, 1 reply; 30+ messages in thread
From: Michael S. Tsirkin @ 2010-11-02 18:52 UTC (permalink / raw)
  To: Jan Kiszka; +Cc: Avi Kivity, Marcelo Tosatti, kvm, Alex Williamson

On Tue, Nov 02, 2010 at 07:40:46PM +0100, Jan Kiszka wrote:
> Am 02.11.2010 19:24, Michael S. Tsirkin wrote:
> > On Tue, Nov 02, 2010 at 06:56:14PM +0100, Jan Kiszka wrote:
> >>>>  		dev->host_irq_disabled = false;
> >>>>  	}
> >>>> -	spin_unlock(&dev->intx_lock);
> >>>> +out:
> >>>> +	spin_unlock_irq(&dev->intx_lock);
> >>>> +
> >>>> +	if (reassert)
> >>>> +		kvm_set_irq(dev->kvm, dev->irq_source_id, dev->guest_irq, 1);
> >>>
> >>> Hmm, I think this still has more overhead than it needs to have.
> >>> Instead of setting level to 0 and then back to 1, can't we just
> >>> avoid set to 1 in the first place? This would need a different
> >>> interface than pci_2_3_irq_check_and_unmask to avoid a race
> >>> where interrupt is received while we are acking another one:
> >>>
> >>> 	block userspace access
> >>> 	check pending bit
> >>> 	if (!pending)
> >>> 		set irq (0)
> >>> 	clear pending
> >>> 	block userspace access
> >>>
> >>> Would be worth it for high volume devices.
> >>
> >> The problem is that we can't reorder guest IRQ line clearing and host
> >> IRQ line enabling without taking a lock across host IRQ disable + guest
> >> IRQ raise - and that is now distributed across hard and threaded IRQ
> >> handlers and we don't want to hold and IRQ-safe lock during kvm_set_irq.
> > 
> > Oh I think I confused you.
> > What I mean is:
> > 
> >  	block userspace access
> >  	check interrupt status bit
> >  	if (!interrupt status bit set)
> >  		set irq (0)
> >  	clear interrupt disable bit
> >  	block userspace access
> > 
> > This way we enable interrupt after set irq so not need for
> > extra locks I think.
> 
> OK. Would require some serious refactoring again.

That would also mean we can't just solve the nested block/unblock
problem with a simple lock.  Not sure this is worth the effort.

> But what about edge IRQs? Don't we need to toggle the bit for them? And
> as we do not differentiate between level and edge, we currently have to
> do this unconditionally.

AFAIK PCI IRQs are level, so I don't think we need to bother.

> > 
> > Hmm one thing I noticed is that pci_block_user_cfg_access
> > will BUG_ON if it was already blocked. So I think we have
> > a bug here when interrupt handler kicks in right after
> > we unmask interrupts.
> > 
> > Probably need some kind of lock to protect against this.
> > 
> 
> Or an atomic counter.

BTW block userspace access uses a global spinlock which will likely hurt
us on multi-CPU. Switching that to something more SMP friendly, e.g. a
per-device spinlock, might be a good idea: I don't see why that lock and
queue are global.

> Will have a look.

Need to also consider an interrupt running in parallel
with unmasking in thread.

> Alex, does VFIO take care of this already?

VFIO does this all under spin_lock_irq.

> > 
> >>>
> >>>>  }
> >>>>  
> >>>>  static void deassign_guest_irq(struct kvm *kvm,
> >>>> @@ -151,7 +291,11 @@ static void deassign_host_irq(struct kvm *kvm,
> >>>>  		pci_disable_msix(assigned_dev->dev);
> >>>>  	} else {
> >>>>  		/* Deal with MSI and INTx */
> >>>> -		disable_irq(assigned_dev->host_irq);
> >>>> +		if (assigned_dev->pci_2_3) {
> >>>> +			pci_2_3_irq_mask(assigned_dev->dev);
> >>>> +			synchronize_irq(assigned_dev->host_irq);
> >>>> +		} else
> >>>> +			disable_irq(assigned_dev->host_irq);
> >>>>  
> >>>>  		free_irq(assigned_dev->host_irq, (void *)assigned_dev);
> >>>>  
> >>>> @@ -199,6 +343,13 @@ static void kvm_free_assigned_device(struct kvm *kvm,
> >>>>  
> >>>>  	pci_reset_function(assigned_dev->dev);
> >>>>  
> >>>> +	/*
> >>>> +	 * Unmask the IRQ at PCI level once the reset is done - the next user
> >>>> +	 * may not expect the IRQ being masked.
> >>>> +	 */
> >>>> +	if (assigned_dev->pci_2_3)
> >>>> +		pci_2_3_irq_unmask(assigned_dev->dev);
> >>>> +
> >>>
> >>> Doesn't pci_reset_function clear mask bit? It seems to ...
> >>
> >> I was left with non-functional devices for the host here if I was not
> >> doing this. Need to recheck, but I think it was required.
> > 
> > Interesting. Could you check why please?
> > 
> 
> Can't reproduce anymore. This was early code, maybe affected by some
> bits or buts that no longer exist.
> 
> Spec says it's cleared on reset, so I removed those lines now.
> 
> Jan
> 
> -- 
> Siemens AG, Corporate Technology, CT T DE IT 1
> Corporate Competence Center Embedded Linux

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH v2 4/4] KVM: Allow host IRQ sharing for passed-through PCI 2.3 devices
  2010-11-02 18:51             ` Jan Kiszka
@ 2010-11-02 18:54               ` Michael S. Tsirkin
  2010-11-02 19:30                 ` Jan Kiszka
  0 siblings, 1 reply; 30+ messages in thread
From: Michael S. Tsirkin @ 2010-11-02 18:54 UTC (permalink / raw)
  To: Jan Kiszka; +Cc: Avi Kivity, Marcelo Tosatti, kvm, Alex Williamson

On Tue, Nov 02, 2010 at 07:51:39PM +0100, Jan Kiszka wrote:
> Am 02.11.2010 19:48, Jan Kiszka wrote:
> > Am 02.11.2010 19:40, Jan Kiszka wrote:
> >>>>>> @@ -199,6 +343,13 @@ static void kvm_free_assigned_device(struct kvm *kvm,
> >>>>>>  
> >>>>>>  	pci_reset_function(assigned_dev->dev);
> >>>>>>  
> >>>>>> +	/*
> >>>>>> +	 * Unmask the IRQ at PCI level once the reset is done - the next user
> >>>>>> +	 * may not expect the IRQ being masked.
> >>>>>> +	 */
> >>>>>> +	if (assigned_dev->pci_2_3)
> >>>>>> +		pci_2_3_irq_unmask(assigned_dev->dev);
> >>>>>> +
> >>>>>
> >>>>> Doesn't pci_reset_function clear mask bit? It seems to ...
> >>>>
> >>>> I was left with non-functional devices for the host here if I was not
> >>>> doing this. Need to recheck, but I think it was required.
> >>>
> >>> Interesting. Could you check why please?
> >>>
> >>
> >> Can't reproduce anymore. This was early code, maybe affected by some
> >> bits or buts that no longer exist.
> >>
> >> Spec says it's cleared on reset, so I removed those lines now.
> >>
> > 
> > Hmpf, it just happened again: Guest was using my ath9k, I killed the
> > guest, lspci says DisINTx+. Strange. Is anyone (qemu) restoring the
> > config space after the reset, bringing the disable bit back?
> 
> Or does the kernel cache the control word?
> 
> Jan

Maybe it does, need to dig in drivers/pci. If it does this
might have other implications.

> -- 
> Siemens AG, Corporate Technology, CT T DE IT 1
> Corporate Competence Center Embedded Linux

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH v2 4/4] KVM: Allow host IRQ sharing for passed-through PCI 2.3 devices
  2010-11-02 18:52           ` Michael S. Tsirkin
@ 2010-11-02 19:11             ` Jan Kiszka
  2010-11-02 19:14               ` Michael S. Tsirkin
  0 siblings, 1 reply; 30+ messages in thread
From: Jan Kiszka @ 2010-11-02 19:11 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: Avi Kivity, Marcelo Tosatti, kvm, Alex Williamson

Am 02.11.2010 19:52, Michael S. Tsirkin wrote:
> On Tue, Nov 02, 2010 at 07:40:46PM +0100, Jan Kiszka wrote:
>> Am 02.11.2010 19:24, Michael S. Tsirkin wrote:
>>> On Tue, Nov 02, 2010 at 06:56:14PM +0100, Jan Kiszka wrote:
>>>>>>  		dev->host_irq_disabled = false;
>>>>>>  	}
>>>>>> -	spin_unlock(&dev->intx_lock);
>>>>>> +out:
>>>>>> +	spin_unlock_irq(&dev->intx_lock);
>>>>>> +
>>>>>> +	if (reassert)
>>>>>> +		kvm_set_irq(dev->kvm, dev->irq_source_id, dev->guest_irq, 1);
>>>>>
>>>>> Hmm, I think this still has more overhead than it needs to have.
>>>>> Instead of setting level to 0 and then back to 1, can't we just
>>>>> avoid set to 1 in the first place? This would need a different
>>>>> interface than pci_2_3_irq_check_and_unmask to avoid a race
>>>>> where interrupt is received while we are acking another one:
>>>>>
>>>>> 	block userspace access
>>>>> 	check pending bit
>>>>> 	if (!pending)
>>>>> 		set irq (0)
>>>>> 	clear pending
>>>>> 	block userspace access
>>>>>
>>>>> Would be worth it for high volume devices.
>>>>
>>>> The problem is that we can't reorder guest IRQ line clearing and host
>>>> IRQ line enabling without taking a lock across host IRQ disable + guest
>>>> IRQ raise - and that is now distributed across hard and threaded IRQ
>>>> handlers and we don't want to hold and IRQ-safe lock during kvm_set_irq.
>>>
>>> Oh I think I confused you.
>>> What I mean is:
>>>
>>>  	block userspace access
>>>  	check interrupt status bit
>>>  	if (!interrupt status bit set)
>>>  		set irq (0)
>>>  	clear interrupt disable bit
>>>  	block userspace access
>>>
>>> This way we enable interrupt after set irq so not need for
>>> extra locks I think.
>>
>> OK. Would require some serious refactoring again.
> 
> That would also mean we can't just solve the nested block/unblock
> problem with a simple lock.  Not sure this is worth the effort.

Can't follow: what can be nested and how?

> 
>> But what about edge IRQs? Don't we need to toggle the bit for them? And
>> as we do not differentiate between level and edge, we currently have to
>> do this unconditionally.
> 
> AFAIK PCI IRQs are level, so I don't think we need to bother.

Ah, indeed.

> 
>>>
>>> Hmm one thing I noticed is that pci_block_user_cfg_access
>>> will BUG_ON if it was already blocked. So I think we have
>>> a bug here when interrupt handler kicks in right after
>>> we unmask interrupts.
>>>
>>> Probably need some kind of lock to protect against this.
>>>
>>
>> Or an atomic counter.
> 
> BTW block userspace access uses a global spinlock which will likely hurt
> us on multi-CPU. Switching that to something more SMP friendly, e.g. a
> per-device spinlock, might be a good idea: I don't see why that lock and
> queue are global.

Been through that code recently, hairy stuff. pci_lock also protects the
bus operation which can be overloaded (e.g. for error injection - if
that is not the only use case). So we need a per-bus lock, but that can
still cause contentions if devices on the same bus are handled on
different cpus.

I think the whole PCI config interface is simply not designed for
performance. It's considered a slow path, which it normally is.

> 
>> Will have a look.
> 
> Need to also consider an interrupt running in parallel
> with unmasking in thread.
> 
>> Alex, does VFIO take care of this already?
> 
> VFIO does this all under spin_lock_irq.

Everything has its pros and cons...

Jan

-- 
Siemens AG, Corporate Technology, CT T DE IT 1
Corporate Competence Center Embedded Linux

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH v2 4/4] KVM: Allow host IRQ sharing for passed-through PCI 2.3 devices
  2010-11-02 19:11             ` Jan Kiszka
@ 2010-11-02 19:14               ` Michael S. Tsirkin
  2010-11-02 19:56                 ` Jan Kiszka
  0 siblings, 1 reply; 30+ messages in thread
From: Michael S. Tsirkin @ 2010-11-02 19:14 UTC (permalink / raw)
  To: Jan Kiszka; +Cc: Avi Kivity, Marcelo Tosatti, kvm, Alex Williamson

On Tue, Nov 02, 2010 at 08:11:31PM +0100, Jan Kiszka wrote:
> Am 02.11.2010 19:52, Michael S. Tsirkin wrote:
> > On Tue, Nov 02, 2010 at 07:40:46PM +0100, Jan Kiszka wrote:
> >> Am 02.11.2010 19:24, Michael S. Tsirkin wrote:
> >>> On Tue, Nov 02, 2010 at 06:56:14PM +0100, Jan Kiszka wrote:
> >>>>>>  		dev->host_irq_disabled = false;
> >>>>>>  	}
> >>>>>> -	spin_unlock(&dev->intx_lock);
> >>>>>> +out:
> >>>>>> +	spin_unlock_irq(&dev->intx_lock);
> >>>>>> +
> >>>>>> +	if (reassert)
> >>>>>> +		kvm_set_irq(dev->kvm, dev->irq_source_id, dev->guest_irq, 1);
> >>>>>
> >>>>> Hmm, I think this still has more overhead than it needs to have.
> >>>>> Instead of setting level to 0 and then back to 1, can't we just
> >>>>> avoid set to 1 in the first place? This would need a different
> >>>>> interface than pci_2_3_irq_check_and_unmask to avoid a race
> >>>>> where interrupt is received while we are acking another one:
> >>>>>
> >>>>> 	block userspace access
> >>>>> 	check pending bit
> >>>>> 	if (!pending)
> >>>>> 		set irq (0)
> >>>>> 	clear pending
> >>>>> 	block userspace access
> >>>>>
> >>>>> Would be worth it for high volume devices.
> >>>>
> >>>> The problem is that we can't reorder guest IRQ line clearing and host
> >>>> IRQ line enabling without taking a lock across host IRQ disable + guest
> >>>> IRQ raise - and that is now distributed across hard and threaded IRQ
> >>>> handlers and we don't want to hold and IRQ-safe lock during kvm_set_irq.
> >>>
> >>> Oh I think I confused you.
> >>> What I mean is:
> >>>
> >>>  	block userspace access
> >>>  	check interrupt status bit
> >>>  	if (!interrupt status bit set)
> >>>  		set irq (0)
> >>>  	clear interrupt disable bit
> >>>  	block userspace access
> >>>
> >>> This way we enable interrupt after set irq so not need for
> >>> extra locks I think.
> >>
> >> OK. Would require some serious refactoring again.
> > 
> > That would also mean we can't just solve the nested block/unblock
> > problem with a simple lock.  Not sure this is worth the effort.
> 
> Can't follow: what can be nested and how?

I just mean interrupt trying to block userspace when thread
did that already.

> > 
> >> But what about edge IRQs? Don't we need to toggle the bit for them? And
> >> as we do not differentiate between level and edge, we currently have to
> >> do this unconditionally.
> > 
> > AFAIK PCI IRQs are level, so I don't think we need to bother.
> 
> Ah, indeed.
> 
> > 
> >>>
> >>> Hmm one thing I noticed is that pci_block_user_cfg_access
> >>> will BUG_ON if it was already blocked. So I think we have
> >>> a bug here when interrupt handler kicks in right after
> >>> we unmask interrupts.
> >>>
> >>> Probably need some kind of lock to protect against this.
> >>>
> >>
> >> Or an atomic counter.
> > 
> > BTW block userspace access uses a global spinlock which will likely hurt
> > us on multi-CPU. Switching that to something more SMP friendly, e.g. a
> > per-device spinlock, might be a good idea: I don't see why that lock and
> > queue are global.
> 
> Been through that code recently, hairy stuff. pci_lock also protects the
> bus operation which can be overloaded (e.g. for error injection - if
> that is not the only use case). So we need a per-bus lock, but that can
> still cause contentions if devices on the same bus are handled on
> different cpus.

Right. So this lock got reused for blocking userspace, I do not suggest
we rip it all out, just make userspace blocking use
a finer-grained lock.

> I think the whole PCI config interface is simply not designed for
> performance. It's considered a slow path, which it normally is.

So I guess we'll need to fix that now, at least if we
want to make the 2.3 way the default.

> > 
> >> Will have a look.
> > 
> > Need to also consider an interrupt running in parallel
> > with unmasking in thread.
> > 
> >> Alex, does VFIO take care of this already?
> > 
> > VFIO does this all under spin_lock_irq.
> 
> Everything has its pros and cons...
> 
> Jan
> 
> -- 
> Siemens AG, Corporate Technology, CT T DE IT 1
> Corporate Competence Center Embedded Linux

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH v2 4/4] KVM: Allow host IRQ sharing for passed-through PCI 2.3 devices
  2010-11-02 18:54               ` Michael S. Tsirkin
@ 2010-11-02 19:30                 ` Jan Kiszka
  2010-11-02 19:53                   ` Michael S. Tsirkin
  0 siblings, 1 reply; 30+ messages in thread
From: Jan Kiszka @ 2010-11-02 19:30 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: Avi Kivity, Marcelo Tosatti, kvm, Alex Williamson

Am 02.11.2010 19:54, Michael S. Tsirkin wrote:
> On Tue, Nov 02, 2010 at 07:51:39PM +0100, Jan Kiszka wrote:
>> Am 02.11.2010 19:48, Jan Kiszka wrote:
>>> Am 02.11.2010 19:40, Jan Kiszka wrote:
>>>>>>>> @@ -199,6 +343,13 @@ static void kvm_free_assigned_device(struct kvm *kvm,
>>>>>>>>  
>>>>>>>>  	pci_reset_function(assigned_dev->dev);
>>>>>>>>  
>>>>>>>> +	/*
>>>>>>>> +	 * Unmask the IRQ at PCI level once the reset is done - the next user
>>>>>>>> +	 * may not expect the IRQ being masked.
>>>>>>>> +	 */
>>>>>>>> +	if (assigned_dev->pci_2_3)
>>>>>>>> +		pci_2_3_irq_unmask(assigned_dev->dev);
>>>>>>>> +
>>>>>>>
>>>>>>> Doesn't pci_reset_function clear mask bit? It seems to ...
>>>>>>
>>>>>> I was left with non-functional devices for the host here if I was not
>>>>>> doing this. Need to recheck, but I think it was required.
>>>>>
>>>>> Interesting. Could you check why please?
>>>>>
>>>>
>>>> Can't reproduce anymore. This was early code, maybe affected by some
>>>> bits or buts that no longer exist.
>>>>
>>>> Spec says it's cleared on reset, so I removed those lines now.
>>>>
>>>
>>> Hmpf, it just happened again: Guest was using my ath9k, I killed the
>>> guest, lspci says DisINTx+. Strange. Is anyone (qemu) restoring the
>>> config space after the reset, bringing the disable bit back?
>>
>> Or does the kernel cache the control word?
>>
>> Jan
> 
> Maybe it does, need to dig in drivers/pci. If it does this
> might have other implications.

OK, that mystery is resolved now: pci_reset_function saves & restores
the device state.

Jan

-- 
Siemens AG, Corporate Technology, CT T DE IT 1
Corporate Competence Center Embedded Linux

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH v2 4/4] KVM: Allow host IRQ sharing for passed-through PCI 2.3 devices
  2010-11-02 18:40         ` Jan Kiszka
  2010-11-02 18:48           ` Jan Kiszka
  2010-11-02 18:52           ` Michael S. Tsirkin
@ 2010-11-02 19:41           ` Alex Williamson
  2 siblings, 0 replies; 30+ messages in thread
From: Alex Williamson @ 2010-11-02 19:41 UTC (permalink / raw)
  To: Jan Kiszka; +Cc: Michael S. Tsirkin, Avi Kivity, Marcelo Tosatti, kvm

On Tue, 2010-11-02 at 19:40 +0100, Jan Kiszka wrote:
> Am 02.11.2010 19:24, Michael S. Tsirkin wrote:
> > On Tue, Nov 02, 2010 at 06:56:14PM +0100, Jan Kiszka wrote:
> >>>>  		dev->host_irq_disabled = false;
> >>>>  	}
> >>>> -	spin_unlock(&dev->intx_lock);
> >>>> +out:
> >>>> +	spin_unlock_irq(&dev->intx_lock);
> >>>> +
> >>>> +	if (reassert)
> >>>> +		kvm_set_irq(dev->kvm, dev->irq_source_id, dev->guest_irq, 1);
> >>>
> >>> Hmm, I think this still has more overhead than it needs to have.
> >>> Instead of setting level to 0 and then back to 1, can't we just
> >>> avoid set to 1 in the first place? This would need a different
> >>> interface than pci_2_3_irq_check_and_unmask to avoid a race
> >>> where interrupt is received while we are acking another one:
> >>>
> >>> 	block userspace access
> >>> 	check pending bit
> >>> 	if (!pending)
> >>> 		set irq (0)
> >>> 	clear pending
> >>> 	block userspace access
> >>>
> >>> Would be worth it for high volume devices.
> >>
> >> The problem is that we can't reorder guest IRQ line clearing and host
> >> IRQ line enabling without taking a lock across host IRQ disable + guest
> >> IRQ raise - and that is now distributed across hard and threaded IRQ
> >> handlers and we don't want to hold and IRQ-safe lock during kvm_set_irq.
> > 
> > Oh I think I confused you.
> > What I mean is:
> > 
> >  	block userspace access
> >  	check interrupt status bit
> >  	if (!interrupt status bit set)
> >  		set irq (0)
> >  	clear interrupt disable bit
> >  	block userspace access
> > 
> > This way we enable interrupt after set irq so not need for
> > extra locks I think.
> 
> OK. Would require some serious refactoring again.
> 
> But what about edge IRQs? Don't we need to toggle the bit for them? And
> as we do not differentiate between level and edge, we currently have to
> do this unconditionally.
> 
> > 
> > Hmm one thing I noticed is that pci_block_user_cfg_access
> > will BUG_ON if it was already blocked. So I think we have
> > a bug here when interrupt handler kicks in right after
> > we unmask interrupts.
> > 
> > Probably need some kind of lock to protect against this.
> > 
> 
> Or an atomic counter. Will have a look.
> 
> Alex, does VFIO take care of this already?

Yes, VFIO has a lock used by the interrupt handler and the EOI handler
that prevents them from both blocking user cfg access at the same time.

Alex


^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH v2 4/4] KVM: Allow host IRQ sharing for passed-through PCI 2.3 devices
  2010-11-02 19:30                 ` Jan Kiszka
@ 2010-11-02 19:53                   ` Michael S. Tsirkin
  2010-11-02 19:58                     ` Jan Kiszka
  0 siblings, 1 reply; 30+ messages in thread
From: Michael S. Tsirkin @ 2010-11-02 19:53 UTC (permalink / raw)
  To: Jan Kiszka; +Cc: Avi Kivity, Marcelo Tosatti, kvm, Alex Williamson

On Tue, Nov 02, 2010 at 08:30:25PM +0100, Jan Kiszka wrote:
> Am 02.11.2010 19:54, Michael S. Tsirkin wrote:
> > On Tue, Nov 02, 2010 at 07:51:39PM +0100, Jan Kiszka wrote:
> >> Am 02.11.2010 19:48, Jan Kiszka wrote:
> >>> Am 02.11.2010 19:40, Jan Kiszka wrote:
> >>>>>>>> @@ -199,6 +343,13 @@ static void kvm_free_assigned_device(struct kvm *kvm,
> >>>>>>>>  
> >>>>>>>>  	pci_reset_function(assigned_dev->dev);
> >>>>>>>>  
> >>>>>>>> +	/*
> >>>>>>>> +	 * Unmask the IRQ at PCI level once the reset is done - the next user
> >>>>>>>> +	 * may not expect the IRQ being masked.
> >>>>>>>> +	 */
> >>>>>>>> +	if (assigned_dev->pci_2_3)
> >>>>>>>> +		pci_2_3_irq_unmask(assigned_dev->dev);
> >>>>>>>> +
> >>>>>>>
> >>>>>>> Doesn't pci_reset_function clear mask bit? It seems to ...
> >>>>>>
> >>>>>> I was left with non-functional devices for the host here if I was not
> >>>>>> doing this. Need to recheck, but I think it was required.
> >>>>>
> >>>>> Interesting. Could you check why please?
> >>>>>
> >>>>
> >>>> Can't reproduce anymore. This was early code, maybe affected by some
> >>>> bits or buts that no longer exist.
> >>>>
> >>>> Spec says it's cleared on reset, so I removed those lines now.
> >>>>
> >>>
> >>> Hmpf, it just happened again: Guest was using my ath9k, I killed the
> >>> guest, lspci says DisINTx+. Strange. Is anyone (qemu) restoring the
> >>> config space after the reset, bringing the disable bit back?
> >>
> >> Or does the kernel cache the control word?
> >>
> >> Jan
> > 
> > Maybe it does, need to dig in drivers/pci. If it does this
> > might have other implications.
> 
> OK, that mystery is resolved now: pci_reset_function saves & restores
> the device state.
> 
> Jan

Aha. I wonder what other state we need to clear.

> -- 
> Siemens AG, Corporate Technology, CT T DE IT 1
> Corporate Competence Center Embedded Linux

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH v2 4/4] KVM: Allow host IRQ sharing for passed-through PCI 2.3 devices
  2010-11-02 19:14               ` Michael S. Tsirkin
@ 2010-11-02 19:56                 ` Jan Kiszka
  0 siblings, 0 replies; 30+ messages in thread
From: Jan Kiszka @ 2010-11-02 19:56 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: Avi Kivity, Marcelo Tosatti, kvm, Alex Williamson

[-- Attachment #1: Type: text/plain, Size: 1512 bytes --]

Am 02.11.2010 20:14, Michael S. Tsirkin wrote:
>>> BTW block userspace access uses a global spinlock which will likely hurt
>>> us on multi-CPU. Switching that to something more SMP friendly, e.g. a
>>> per-device spinlock, might be a good idea: I don't see why that lock and
>>> queue are global.
>>
>> Been through that code recently, hairy stuff. pci_lock also protects the
>> bus operation which can be overloaded (e.g. for error injection - if
>> that is not the only use case). So we need a per-bus lock, but that can
>> still cause contentions if devices on the same bus are handled on
>> different cpus.
> 
> Right. So this lock got reused for blocking userspace, I do not suggest
> we rip it all out, just make userspace blocking use
> a finer-grained lock.
> 
>> I think the whole PCI config interface is simply not designed for
>> performance. It's considered a slow path, which it normally is.
> 
> So I guess we'll need to fix that now, at least if we
> want to make the 2.3 way the default.
> 

On many systems (those with "direct" PCI config access), there is
another lock down the road: pci_config_lock. That can't be broken up as
the protected resource is unique. So we do not gain much improving the
higher-level lock.

BTW, accessing the interrupt controller for IRQ line fiddling is not a
per-device thing either. So as long as the latency of the code under the
locks is not significantly worse with PCI-level masking, we don't lose
scalability here.

Jan


[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 259 bytes --]

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH v2 4/4] KVM: Allow host IRQ sharing for passed-through PCI 2.3 devices
  2010-11-02 19:53                   ` Michael S. Tsirkin
@ 2010-11-02 19:58                     ` Jan Kiszka
  2010-11-02 20:05                       ` Michael S. Tsirkin
  0 siblings, 1 reply; 30+ messages in thread
From: Jan Kiszka @ 2010-11-02 19:58 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: Avi Kivity, Marcelo Tosatti, kvm, Alex Williamson

[-- Attachment #1: Type: text/plain, Size: 1935 bytes --]

Am 02.11.2010 20:53, Michael S. Tsirkin wrote:
> On Tue, Nov 02, 2010 at 08:30:25PM +0100, Jan Kiszka wrote:
>> Am 02.11.2010 19:54, Michael S. Tsirkin wrote:
>>> On Tue, Nov 02, 2010 at 07:51:39PM +0100, Jan Kiszka wrote:
>>>> Am 02.11.2010 19:48, Jan Kiszka wrote:
>>>>> Am 02.11.2010 19:40, Jan Kiszka wrote:
>>>>>>>>>> @@ -199,6 +343,13 @@ static void kvm_free_assigned_device(struct kvm *kvm,
>>>>>>>>>>  
>>>>>>>>>>  	pci_reset_function(assigned_dev->dev);
>>>>>>>>>>  
>>>>>>>>>> +	/*
>>>>>>>>>> +	 * Unmask the IRQ at PCI level once the reset is done - the next user
>>>>>>>>>> +	 * may not expect the IRQ being masked.
>>>>>>>>>> +	 */
>>>>>>>>>> +	if (assigned_dev->pci_2_3)
>>>>>>>>>> +		pci_2_3_irq_unmask(assigned_dev->dev);
>>>>>>>>>> +
>>>>>>>>>
>>>>>>>>> Doesn't pci_reset_function clear mask bit? It seems to ...
>>>>>>>>
>>>>>>>> I was left with non-functional devices for the host here if I was not
>>>>>>>> doing this. Need to recheck, but I think it was required.
>>>>>>>
>>>>>>> Interesting. Could you check why please?
>>>>>>>
>>>>>>
>>>>>> Can't reproduce anymore. This was early code, maybe affected by some
>>>>>> bits or buts that no longer exist.
>>>>>>
>>>>>> Spec says it's cleared on reset, so I removed those lines now.
>>>>>>
>>>>>
>>>>> Hmpf, it just happened again: Guest was using my ath9k, I killed the
>>>>> guest, lspci says DisINTx+. Strange. Is anyone (qemu) restoring the
>>>>> config space after the reset, bringing the disable bit back?
>>>>
>>>> Or does the kernel cache the control word?
>>>>
>>>> Jan
>>>
>>> Maybe it does, need to dig in drivers/pci. If it does this
>>> might have other implications.
>>
>> OK, that mystery is resolved now: pci_reset_function saves & restores
>> the device state.
>>
>> Jan
> 
> Aha. I wonder what other state we need to clear.
> 

Maybe just save/restore before/after assigning the device?

Jan


[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 259 bytes --]

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH v2 4/4] KVM: Allow host IRQ sharing for passed-through PCI 2.3 devices
  2010-11-02 19:58                     ` Jan Kiszka
@ 2010-11-02 20:05                       ` Michael S. Tsirkin
  0 siblings, 0 replies; 30+ messages in thread
From: Michael S. Tsirkin @ 2010-11-02 20:05 UTC (permalink / raw)
  To: Jan Kiszka; +Cc: Avi Kivity, Marcelo Tosatti, kvm, Alex Williamson

On Tue, Nov 02, 2010 at 08:58:36PM +0100, Jan Kiszka wrote:
> Am 02.11.2010 20:53, Michael S. Tsirkin wrote:
> > On Tue, Nov 02, 2010 at 08:30:25PM +0100, Jan Kiszka wrote:
> >> Am 02.11.2010 19:54, Michael S. Tsirkin wrote:
> >>> On Tue, Nov 02, 2010 at 07:51:39PM +0100, Jan Kiszka wrote:
> >>>> Am 02.11.2010 19:48, Jan Kiszka wrote:
> >>>>> Am 02.11.2010 19:40, Jan Kiszka wrote:
> >>>>>>>>>> @@ -199,6 +343,13 @@ static void kvm_free_assigned_device(struct kvm *kvm,
> >>>>>>>>>>  
> >>>>>>>>>>  	pci_reset_function(assigned_dev->dev);
> >>>>>>>>>>  
> >>>>>>>>>> +	/*
> >>>>>>>>>> +	 * Unmask the IRQ at PCI level once the reset is done - the next user
> >>>>>>>>>> +	 * may not expect the IRQ being masked.
> >>>>>>>>>> +	 */
> >>>>>>>>>> +	if (assigned_dev->pci_2_3)
> >>>>>>>>>> +		pci_2_3_irq_unmask(assigned_dev->dev);
> >>>>>>>>>> +
> >>>>>>>>>
> >>>>>>>>> Doesn't pci_reset_function clear mask bit? It seems to ...
> >>>>>>>>
> >>>>>>>> I was left with non-functional devices for the host here if I was not
> >>>>>>>> doing this. Need to recheck, but I think it was required.
> >>>>>>>
> >>>>>>> Interesting. Could you check why please?
> >>>>>>>
> >>>>>>
> >>>>>> Can't reproduce anymore. This was early code, maybe affected by some
> >>>>>> bits or buts that no longer exist.
> >>>>>>
> >>>>>> Spec says it's cleared on reset, so I removed those lines now.
> >>>>>>
> >>>>>
> >>>>> Hmpf, it just happened again: Guest was using my ath9k, I killed the
> >>>>> guest, lspci says DisINTx+. Strange. Is anyone (qemu) restoring the
> >>>>> config space after the reset, bringing the disable bit back?
> >>>>
> >>>> Or does the kernel cache the control word?
> >>>>
> >>>> Jan
> >>>
> >>> Maybe it does, need to dig in drivers/pci. If it does this
> >>> might have other implications.
> >>
> >> OK, that mystery is resolved now: pci_reset_function saves & restores
> >> the device state.
> >>
> >> Jan
> > 
> > Aha. I wonder what other state we need to clear.
> > 
> 
> Maybe just save/restore before/after assigning the device?
> 
> Jan
> 

Yea. Sounds good.

-- 
MST

^ permalink raw reply	[flat|nested] 30+ messages in thread

* [PATCH v2 0/4] KVM & genirq: Enable adaptive IRQ sharing for passed-through devices
@ 2010-12-12 11:22 Jan Kiszka
  2010-12-12 11:22 ` [PATCH v2 1/4] genirq: Introduce driver-readable IRQ status word Jan Kiszka
                   ` (4 more replies)
  0 siblings, 5 replies; 30+ messages in thread
From: Jan Kiszka @ 2010-12-12 11:22 UTC (permalink / raw)
  To: Thomas Gleixner, Avi Kivity, Marcelo Tosatti
  Cc: linux-kernel, kvm, Tom Lyon, Alex Williamson, Michael S. Tsirkin

Second try to allow adaptive interrupt handlers according to the line's
current sharing situation. This one converts the explicit notifier
interface into several extension of existing genirq APIs:

 - introduce a interrupt status word that can be read by interrupt
   handlers to obtain the current line sharing state
 - introduce IRQF_ADAPTIVE: handlers registered with this flag will be
   informed about an upcoming line sharing by calling the interrupt
   handler itself (instead of a notifier callback)
 - introduce IRQF_COND_ONESHOT: IRQF_ONESHOT semantics if the line is
   not shared, standard semantics otherwise (removes the need to
   re-register handlers to switch between IRQF_SHARED and IRQF_ONESHOT)

The result may look simpler on first glance than v1, but it comes with
more subtle race scenarios IMO. I thought them through, hopefully
catching all, but I would appreciate any skeptical review.

I also tried to replace line-level disabling from the interrupt handler
with signaling genirq to keep the line masked on handler return.
However, I finally dropped this idea as it turns out to be very hard (if
not impossible) to properly synchronize the generic tail (in genirq)
with the driver that may want to re-enable the line at the same time
(depending on the driver's internal state). But if anyone sees a
magically simple way to achieve this, I'm still all ears.

Finally, the last patch in this series makes use of the new interface
for KVM's PCI pass-through subsystem. KVM has to keep the interrupt
source disabled while calling into the guest to handle the event. This
can be done at device or line level. The former is required to share the
interrupt line, the latter is an order of magnitude faster.

Beside pass-through support of KVM, further users of this new interface
could become VFIO (not yet mainline) and uio_pci_generic which have to
resolve the same conflict.

Note: The KVM patch depends on
http://thread.gmane.org/gmane.comp.emulators.kvm.devel/64515

Jan Kiszka (4):
  genirq: Introduce driver-readable IRQ status word
  genirq: Inform handler about line sharing state
  genirq: Add support for IRQF_COND_ONESHOT
  KVM: Allow host IRQ sharing for passed-through PCI 2.3 devices

 Documentation/kvm/api.txt |   27 ++++
 arch/x86/kvm/x86.c        |    1 +
 include/linux/interrupt.h |   15 ++
 include/linux/irq.h       |    2 +
 include/linux/irqdesc.h   |    2 +
 include/linux/kvm.h       |    6 +
 include/linux/kvm_host.h  |   10 ++-
 kernel/irq/irqdesc.c      |    2 +
 kernel/irq/manage.c       |   74 ++++++++++-
 virt/kvm/assigned-dev.c   |  336 ++++++++++++++++++++++++++++++++++++++++-----
 10 files changed, 437 insertions(+), 38 deletions(-)

^ permalink raw reply	[flat|nested] 30+ messages in thread

* [PATCH v2 1/4] genirq: Introduce driver-readable IRQ status word
  2010-12-12 11:22 [PATCH v2 0/4] KVM & genirq: Enable adaptive IRQ sharing for passed-through devices Jan Kiszka
@ 2010-12-12 11:22 ` Jan Kiszka
  2010-12-12 17:29   ` Thomas Gleixner
  2010-12-12 11:22 ` [PATCH v2 2/4] genirq: Inform handler about line sharing state Jan Kiszka
                   ` (3 subsequent siblings)
  4 siblings, 1 reply; 30+ messages in thread
From: Jan Kiszka @ 2010-12-12 11:22 UTC (permalink / raw)
  To: Thomas Gleixner, Avi Kivity, Marcelo Tosatti
  Cc: linux-kernel, kvm, Tom Lyon, Alex Williamson, Michael S. Tsirkin,
	Jan Kiszka

From: Jan Kiszka <jan.kiszka@siemens.com>

This associates a status word with every IRQ descriptor. Drivers can obtain
its content via get_irq_status(irq). First use case will be propagating the
interrupt sharing state.

Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
---
 include/linux/interrupt.h |    2 ++
 include/linux/irq.h       |    2 ++
 kernel/irq/manage.c       |   15 +++++++++++++++
 3 files changed, 19 insertions(+), 0 deletions(-)

diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index 79d0c4f..16cdbbf 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -126,6 +126,8 @@ struct irqaction {
 
 extern irqreturn_t no_action(int cpl, void *dev_id);
 
+extern unsigned long get_irq_status(unsigned long irq);
+
 #ifdef CONFIG_GENERIC_HARDIRQS
 extern int __must_check
 request_threaded_irq(unsigned int irq, irq_handler_t handler,
diff --git a/include/linux/irq.h b/include/linux/irq.h
index abde252..5554203 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -96,6 +96,7 @@ struct msi_desc;
  *			methods, to allow shared chip implementations
  * @msi_desc:		MSI descriptor
  * @affinity:		IRQ affinity on SMP
+ * @status:		driver-readable status flags (IRQS_*)
  *
  * The fields here need to overlay the ones in irq_desc until we
  * cleaned up the direct references and switched everything over to
@@ -108,6 +109,7 @@ struct irq_data {
 	void			*handler_data;
 	void			*chip_data;
 	struct msi_desc		*msi_desc;
+	unsigned long		status;
 #ifdef CONFIG_SMP
 	cpumask_var_t		affinity;
 #endif
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 5f92acc..df51284 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -1157,3 +1157,18 @@ int request_any_context_irq(unsigned int irq, irq_handler_t handler,
 	return !ret ? IRQC_IS_HARDIRQ : ret;
 }
 EXPORT_SYMBOL_GPL(request_any_context_irq);
+
+/**
+ *	get_irq_status - read interrupt line status word
+ *	@irq: Interrupt line of the status word
+ *
+ *	This returns the current content of the status word associated with
+ *	the given interrupt line. See IRQS_* flags for details.
+ */
+unsigned long get_irq_status(unsigned long irq)
+{
+	struct irq_desc *desc = irq_to_desc(irq);
+
+	return desc ? desc->irq_data.status : 0;
+}
+EXPORT_SYMBOL_GPL(get_irq_status);
-- 
1.7.1


^ permalink raw reply related	[flat|nested] 30+ messages in thread

* [PATCH v2 2/4] genirq: Inform handler about line sharing state
  2010-12-12 11:22 [PATCH v2 0/4] KVM & genirq: Enable adaptive IRQ sharing for passed-through devices Jan Kiszka
  2010-12-12 11:22 ` [PATCH v2 1/4] genirq: Introduce driver-readable IRQ status word Jan Kiszka
@ 2010-12-12 11:22 ` Jan Kiszka
  2010-12-12 16:53   ` Thomas Gleixner
  2010-12-12 11:22 ` [PATCH v2 3/4] genirq: Add support for IRQF_COND_ONESHOT Jan Kiszka
                   ` (2 subsequent siblings)
  4 siblings, 1 reply; 30+ messages in thread
From: Jan Kiszka @ 2010-12-12 11:22 UTC (permalink / raw)
  To: Thomas Gleixner, Avi Kivity, Marcelo Tosatti
  Cc: linux-kernel, kvm, Tom Lyon, Alex Williamson, Michael S. Tsirkin,
	Jan Kiszka

From: Jan Kiszka <jan.kiszka@siemens.com>

This enabled interrupt handlers to retrieve the current line sharing state via
the new interrupt status word so that they can adapt to it.

The switch from shared to exclusive is generally uncritical and can thus be
performed on demand. However, preparing a line for shared mode may require
preparational steps of the currently registered handler. It can therefore
request an ahead-of-time notification via IRQF_ADAPTIVE. The notification
consists of an exceptional handler invocation with IRQS_MAKE_SHAREABLE set in
the status word.

Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
---
 include/linux/interrupt.h |   10 ++++++++++
 include/linux/irqdesc.h   |    2 ++
 kernel/irq/irqdesc.c      |    2 ++
 kernel/irq/manage.c       |   44 +++++++++++++++++++++++++++++++++++++++++---
 4 files changed, 55 insertions(+), 3 deletions(-)

diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index 16cdbbf..c6323a2 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -55,6 +55,7 @@
  *                Used by threaded interrupts which need to keep the
  *                irq line disabled until the threaded handler has been run.
  * IRQF_NO_SUSPEND - Do not disable this IRQ during suspend
+ * IRQF_ADAPTIVE - Request notification about upcoming interrupt line sharing
  *
  */
 #define IRQF_DISABLED		0x00000020
@@ -67,6 +68,7 @@
 #define IRQF_IRQPOLL		0x00001000
 #define IRQF_ONESHOT		0x00002000
 #define IRQF_NO_SUSPEND		0x00004000
+#define IRQF_ADAPTIVE		0x00008000
 
 #define IRQF_TIMER		(__IRQF_TIMER | IRQF_NO_SUSPEND)
 
@@ -126,6 +128,14 @@ struct irqaction {
 
 extern irqreturn_t no_action(int cpl, void *dev_id);
 
+/*
+ * Driver-readable IRQ line status flags:
+ * IRQS_SHARED - line is shared between multiple handlers
+ * IRQS_MAKE_SHAREABLE - in the process of making an exclusive line shareable
+ */
+#define IRQS_SHARED		0x00000001
+#define IRQS_MAKE_SHAREABLE	0x00000002
+
 extern unsigned long get_irq_status(unsigned long irq);
 
 #ifdef CONFIG_GENERIC_HARDIRQS
diff --git a/include/linux/irqdesc.h b/include/linux/irqdesc.h
index 979c68c..c490e83 100644
--- a/include/linux/irqdesc.h
+++ b/include/linux/irqdesc.h
@@ -29,6 +29,7 @@ struct timer_rand_state;
  * @wait_for_threads:	wait queue for sync_irq to wait for threaded handlers
  * @dir:		/proc/irq/ procfs entry
  * @name:		flow handler name for /proc/interrupts output
+ * @register_lock:	protects registration & release, for unshared->shared
  */
 struct irq_desc {
 
@@ -80,6 +81,7 @@ struct irq_desc {
 	struct proc_dir_entry	*dir;
 #endif
 	const char		*name;
+	struct mutex		register_lock;
 } ____cacheline_internodealigned_in_smp;
 
 #ifndef CONFIG_SPARSE_IRQ
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index 9988d03..de69845 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -143,6 +143,7 @@ static struct irq_desc *alloc_desc(int irq, int node)
 
 	raw_spin_lock_init(&desc->lock);
 	lockdep_set_class(&desc->lock, &irq_desc_lock_class);
+	mutex_init(&desc->register_lock);
 
 	desc_set_defaults(irq, desc, node);
 
@@ -254,6 +255,7 @@ int __init early_irq_init(void)
 		alloc_masks(desc + i, GFP_KERNEL, node);
 		desc_smp_init(desc + i, node);
 		lockdep_set_class(&desc[i].lock, &irq_desc_lock_class);
+		mutex_init(&desc[i].register_lock);
 	}
 	return arch_early_irq_init();
 }
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index df51284..a4557eb 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -754,6 +754,8 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
 			old = *old_ptr;
 		} while (old);
 		shared = 1;
+
+		desc->irq_data.status |= IRQS_SHARED;
 	}
 
 	if (!shared) {
@@ -883,6 +885,7 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id)
 {
 	struct irq_desc *desc = irq_to_desc(irq);
 	struct irqaction *action, **action_ptr;
+	bool single_handler = false;
 	unsigned long flags;
 
 	WARN(in_interrupt(), "Trying to free IRQ %d from IRQ context!\n", irq);
@@ -928,7 +931,8 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id)
 			desc->irq_data.chip->irq_shutdown(&desc->irq_data);
 		else
 			desc->irq_data.chip->irq_disable(&desc->irq_data);
-	}
+	} else if (!desc->action->next)
+		single_handler = true;
 
 #ifdef CONFIG_SMP
 	/* make sure affinity_hint is cleaned up */
@@ -943,6 +947,9 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id)
 	/* Make sure it's not being used on another CPU: */
 	synchronize_irq(irq);
 
+	if (single_handler)
+		desc->irq_data.status &= ~IRQS_SHARED;
+
 #ifdef CONFIG_DEBUG_SHIRQ
 	/*
 	 * It's a shared IRQ -- the driver ought to be prepared for an IRQ
@@ -1002,9 +1009,13 @@ void free_irq(unsigned int irq, void *dev_id)
 	if (!desc)
 		return;
 
+	mutex_lock(&desc->register_lock);
+
 	chip_bus_lock(desc);
 	kfree(__free_irq(irq, dev_id));
 	chip_bus_sync_unlock(desc);
+
+	mutex_unlock(&desc->register_lock);
 }
 EXPORT_SYMBOL(free_irq);
 
@@ -1055,7 +1066,7 @@ int request_threaded_irq(unsigned int irq, irq_handler_t handler,
 			 irq_handler_t thread_fn, unsigned long irqflags,
 			 const char *devname, void *dev_id)
 {
-	struct irqaction *action;
+	struct irqaction *action, *old_action;
 	struct irq_desc *desc;
 	int retval;
 
@@ -1091,12 +1102,39 @@ int request_threaded_irq(unsigned int irq, irq_handler_t handler,
 	action->name = devname;
 	action->dev_id = dev_id;
 
+	mutex_lock(&desc->register_lock);
+
+	old_action = desc->action;
+	if (old_action && (old_action->flags & IRQF_ADAPTIVE) &&
+	    !(desc->irq_data.status & IRQS_SHARED)) {
+		/*
+		 * Signal the old handler that is has to switch to shareable
+		 * handling mode. Disable the line to avoid any conflict with
+		 * a real IRQ.
+		 */
+		disable_irq(irq);
+		local_irq_disable();
+
+		desc->irq_data.status |= IRQS_SHARED | IRQS_MAKE_SHAREABLE;
+		old_action->handler(irq, old_action->dev_id);
+		desc->irq_data.status &= ~IRQS_MAKE_SHAREABLE;
+
+		local_irq_enable();
+		enable_irq(irq);
+		
+	}
+
 	chip_bus_lock(desc);
 	retval = __setup_irq(irq, desc, action);
 	chip_bus_sync_unlock(desc);
 
-	if (retval)
+	if (retval) {
+		if (desc->action && !desc->action->next)
+			desc->irq_data.status &= ~IRQS_SHARED;
 		kfree(action);
+	}
+
+	mutex_unlock(&desc->register_lock);
 
 #ifdef CONFIG_DEBUG_SHIRQ
 	if (!retval && (irqflags & IRQF_SHARED)) {
-- 
1.7.1


^ permalink raw reply related	[flat|nested] 30+ messages in thread

* [PATCH v2 3/4] genirq: Add support for IRQF_COND_ONESHOT
  2010-12-12 11:22 [PATCH v2 0/4] KVM & genirq: Enable adaptive IRQ sharing for passed-through devices Jan Kiszka
  2010-12-12 11:22 ` [PATCH v2 1/4] genirq: Introduce driver-readable IRQ status word Jan Kiszka
  2010-12-12 11:22 ` [PATCH v2 2/4] genirq: Inform handler about line sharing state Jan Kiszka
@ 2010-12-12 11:22 ` Jan Kiszka
  2010-12-12 11:22 ` [PATCH v2 4/4] KVM: Allow host IRQ sharing for passed-through PCI 2.3 devices Jan Kiszka
  2010-12-13 10:10 ` [PATCH v2 0/4] KVM & genirq: Enable adaptive IRQ sharing for passed-through devices Michael S. Tsirkin
  4 siblings, 0 replies; 30+ messages in thread
From: Jan Kiszka @ 2010-12-12 11:22 UTC (permalink / raw)
  To: Thomas Gleixner, Avi Kivity, Marcelo Tosatti
  Cc: linux-kernel, kvm, Tom Lyon, Alex Williamson, Michael S. Tsirkin,
	Jan Kiszka

From: Jan Kiszka <jan.kiszka@siemens.com>

Provide an adaptive version of IRQF_ONESHOT: If the line is exclusively used,
IRQF_COND_ONESHOT provides the same semantics as IRQF_ONESHOT. If it is
shared, the line will be unmasked directly after the hardirq handler, just as
if IRQF_COND_ONESHOT was not provided.

Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
---
 include/linux/interrupt.h |    3 +++
 kernel/irq/manage.c       |   19 ++++++++++++++++---
 2 files changed, 19 insertions(+), 3 deletions(-)

diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index c6323a2..f462807 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -56,6 +56,8 @@
  *                irq line disabled until the threaded handler has been run.
  * IRQF_NO_SUSPEND - Do not disable this IRQ during suspend
  * IRQF_ADAPTIVE - Request notification about upcoming interrupt line sharing
+ * IRQF_COND_ONESHOT - If line is not shared, keep interrupt disabled after
+ *                     hardirq handler finshed.
  *
  */
 #define IRQF_DISABLED		0x00000020
@@ -69,6 +71,7 @@
 #define IRQF_ONESHOT		0x00002000
 #define IRQF_NO_SUSPEND		0x00004000
 #define IRQF_ADAPTIVE		0x00008000
+#define IRQF_COND_ONESHOT	0x00010000
 
 #define IRQF_TIMER		(__IRQF_TIMER | IRQF_NO_SUSPEND)
 
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index a4557eb..948b7c9 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -580,7 +580,7 @@ static int irq_thread(void *data)
 	struct sched_param param = { .sched_priority = MAX_USER_RT_PRIO/2, };
 	struct irqaction *action = data;
 	struct irq_desc *desc = irq_to_desc(action->irq);
-	int wake, oneshot = desc->status & IRQ_ONESHOT;
+	int wake, oneshot;
 
 	sched_setscheduler(current, SCHED_FIFO, &param);
 	current->irqaction = action;
@@ -603,6 +603,7 @@ static int irq_thread(void *data)
 			desc->status |= IRQ_PENDING;
 			raw_spin_unlock_irq(&desc->lock);
 		} else {
+			oneshot = desc->status & IRQ_ONESHOT;
 			raw_spin_unlock_irq(&desc->lock);
 
 			action->thread_fn(action->irq, action->dev_id);
@@ -756,6 +757,15 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
 		shared = 1;
 
 		desc->irq_data.status |= IRQS_SHARED;
+		desc->status &= ~IRQ_ONESHOT;
+
+		/* Unmask if the interrupt was masked due to oneshot mode. */
+		if ((desc->status &
+		     (IRQ_INPROGRESS | IRQ_DISABLED | IRQ_MASKED)) ==
+		    IRQ_MASKED) {
+			desc->irq_data.chip->irq_unmask(&desc->irq_data);
+			desc->status &= ~IRQ_MASKED;
+		}
 	}
 
 	if (!shared) {
@@ -780,7 +790,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
 		desc->status &= ~(IRQ_AUTODETECT | IRQ_WAITING | IRQ_ONESHOT |
 				  IRQ_INPROGRESS | IRQ_SPURIOUS_DISABLED);
 
-		if (new->flags & IRQF_ONESHOT)
+		if (new->flags & (IRQF_ONESHOT | IRQF_COND_ONESHOT))
 			desc->status |= IRQ_ONESHOT;
 
 		if (!(desc->status & IRQ_NOAUTOEN)) {
@@ -931,8 +941,11 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id)
 			desc->irq_data.chip->irq_shutdown(&desc->irq_data);
 		else
 			desc->irq_data.chip->irq_disable(&desc->irq_data);
-	} else if (!desc->action->next)
+	} else if (!desc->action->next) {
 		single_handler = true;
+		if (desc->action->flags & IRQF_COND_ONESHOT)
+			desc->status |= IRQ_ONESHOT;
+	}
 
 #ifdef CONFIG_SMP
 	/* make sure affinity_hint is cleaned up */
-- 
1.7.1


^ permalink raw reply related	[flat|nested] 30+ messages in thread

* [PATCH v2 4/4] KVM: Allow host IRQ sharing for passed-through PCI 2.3 devices
  2010-12-12 11:22 [PATCH v2 0/4] KVM & genirq: Enable adaptive IRQ sharing for passed-through devices Jan Kiszka
                   ` (2 preceding siblings ...)
  2010-12-12 11:22 ` [PATCH v2 3/4] genirq: Add support for IRQF_COND_ONESHOT Jan Kiszka
@ 2010-12-12 11:22 ` Jan Kiszka
  2010-12-13 10:19   ` Avi Kivity
  2010-12-13 10:10 ` [PATCH v2 0/4] KVM & genirq: Enable adaptive IRQ sharing for passed-through devices Michael S. Tsirkin
  4 siblings, 1 reply; 30+ messages in thread
From: Jan Kiszka @ 2010-12-12 11:22 UTC (permalink / raw)
  To: Thomas Gleixner, Avi Kivity, Marcelo Tosatti
  Cc: linux-kernel, kvm, Tom Lyon, Alex Williamson, Michael S. Tsirkin,
	Jan Kiszka

From: Jan Kiszka <jan.kiszka@siemens.com>

PCI 2.3 allows to generically disable IRQ sources at device level. This
enables us to share IRQs of such devices on the host side when passing
them to a guest.

However, IRQ disabling via the PCI config space is more costly than
masking the line via disable_irq. Therefore we register the IRQ in adaptive
mode and switch between line and device level disabling on demand.

This feature is optional, user space has to request it explicitly as it
also has to inform us about its view of PCI_COMMAND_INTX_DISABLE. That
way, we can avoid unmasking the interrupt and signaling it if the guest
masked it via the PCI config space.

Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
---
 Documentation/kvm/api.txt |   27 ++++
 arch/x86/kvm/x86.c        |    1 +
 include/linux/kvm.h       |    6 +
 include/linux/kvm_host.h  |   10 ++-
 virt/kvm/assigned-dev.c   |  336 ++++++++++++++++++++++++++++++++++++++++-----
 5 files changed, 346 insertions(+), 34 deletions(-)

diff --git a/Documentation/kvm/api.txt b/Documentation/kvm/api.txt
index e1a9297..1c34e25 100644
--- a/Documentation/kvm/api.txt
+++ b/Documentation/kvm/api.txt
@@ -1112,6 +1112,14 @@ following flags are specified:
 
 /* Depends on KVM_CAP_IOMMU */
 #define KVM_DEV_ASSIGN_ENABLE_IOMMU	(1 << 0)
+/* The following two depend on KVM_CAP_PCI_2_3 */
+#define KVM_DEV_ASSIGN_PCI_2_3		(1 << 1)
+#define KVM_DEV_ASSIGN_MASK_INTX	(1 << 2)
+
+If KVM_DEV_ASSIGN_PCI_2_3 is set, the kernel will manage legacy INTx interrupts
+via the PCI-2.3-compliant device-level mask, but only if IRQ sharing with other
+assigned or host devices requires it. KVM_DEV_ASSIGN_MASK_INTX specifies the
+guest's view on the INTx mask, see KVM_ASSIGN_SET_INTX_MASK for details.
 
 4.48 KVM_DEASSIGN_PCI_DEVICE
 
@@ -1263,6 +1271,25 @@ struct kvm_assigned_msix_entry {
 	__u16 padding[3];
 };
 
+4.54 KVM_ASSIGN_SET_INTX_MASK
+
+Capability: KVM_CAP_PCI_2_3
+Architectures: x86
+Type: vm ioctl
+Parameters: struct kvm_assigned_pci_dev (in)
+Returns: 0 on success, -1 on error
+
+Informs the kernel about the guest's view on the INTx mask. As long as the
+guest masks the legacy INTx, the kernel will refrain from unmasking it at
+hardware level and will not assert the guest's IRQ line. User space is still
+responsible for applying this state to the assigned device's real config space.
+To avoid that the kernel overwrites the state user space wants to set,
+KVM_ASSIGN_SET_INTX_MASK has to be called prior to updating the config space.
+
+See KVM_ASSIGN_DEV_IRQ for the data structure. The target device is specified
+by assigned_dev_id. In the flags field, only KVM_DEV_ASSIGN_MASK_INTX is
+evaluated.
+
 5. The kvm_run structure
 
 Application code obtains a pointer to the kvm_run structure by
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index ed373ba..8775a54 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1965,6 +1965,7 @@ int kvm_dev_ioctl_check_extension(long ext)
 	case KVM_CAP_X86_ROBUST_SINGLESTEP:
 	case KVM_CAP_XSAVE:
 	case KVM_CAP_ASYNC_PF:
+	case KVM_CAP_PCI_2_3:
 		r = 1;
 		break;
 	case KVM_CAP_COALESCED_MMIO:
diff --git a/include/linux/kvm.h b/include/linux/kvm.h
index ea2dc1a..3cadb42 100644
--- a/include/linux/kvm.h
+++ b/include/linux/kvm.h
@@ -541,6 +541,7 @@ struct kvm_ppc_pvinfo {
 #define KVM_CAP_PPC_GET_PVINFO 57
 #define KVM_CAP_PPC_IRQ_LEVEL 58
 #define KVM_CAP_ASYNC_PF 59
+#define KVM_CAP_PCI_2_3 60
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
@@ -677,6 +678,9 @@ struct kvm_clock_data {
 #define KVM_SET_PIT2              _IOW(KVMIO,  0xa0, struct kvm_pit_state2)
 /* Available with KVM_CAP_PPC_GET_PVINFO */
 #define KVM_PPC_GET_PVINFO	  _IOW(KVMIO,  0xa1, struct kvm_ppc_pvinfo)
+/* Available with KVM_CAP_PCI_2_3 */
+#define KVM_ASSIGN_SET_INTX_MASK  _IOW(KVMIO,  0xa2, \
+				       struct kvm_assigned_pci_dev)
 
 /*
  * ioctls for vcpu fds
@@ -742,6 +746,8 @@ struct kvm_clock_data {
 #define KVM_SET_XCRS		  _IOW(KVMIO,  0xa7, struct kvm_xcrs)
 
 #define KVM_DEV_ASSIGN_ENABLE_IOMMU	(1 << 0)
+#define KVM_DEV_ASSIGN_PCI_2_3		(1 << 1)
+#define KVM_DEV_ASSIGN_MASK_INTX	(1 << 2)
 
 struct kvm_assigned_pci_dev {
 	__u32 assigned_dev_id;
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index ac4e83a..4f95070 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -477,6 +477,12 @@ struct kvm_irq_ack_notifier {
 	void (*irq_acked)(struct kvm_irq_ack_notifier *kian);
 };
 
+enum kvm_intx_state {
+	KVM_INTX_ENABLED,
+	KVM_INTX_LINE_DISABLED,
+	KVM_INTX_DEVICE_DISABLED,
+};
+
 struct kvm_assigned_dev_kernel {
 	struct kvm_irq_ack_notifier ack_notifier;
 	struct list_head list;
@@ -486,7 +492,7 @@ struct kvm_assigned_dev_kernel {
 	int host_devfn;
 	unsigned int entries_nr;
 	int host_irq;
-	bool host_irq_disabled;
+	unsigned long last_irq_status;
 	struct msix_entry *host_msix_entries;
 	int guest_irq;
 	struct msix_entry *guest_msix_entries;
@@ -496,6 +502,8 @@ struct kvm_assigned_dev_kernel {
 	struct pci_dev *dev;
 	struct kvm *kvm;
 	spinlock_t intx_lock;
+	spinlock_t intx_mask_lock;
+	enum kvm_intx_state intx_state;
 	char irq_name[32];
 };
 
diff --git a/virt/kvm/assigned-dev.c b/virt/kvm/assigned-dev.c
index c6114d3..b64799a 100644
--- a/virt/kvm/assigned-dev.c
+++ b/virt/kvm/assigned-dev.c
@@ -55,22 +55,141 @@ static int find_index_from_host_irq(struct kvm_assigned_dev_kernel
 	return index;
 }
 
-static irqreturn_t kvm_assigned_dev_thread(int irq, void *dev_id)
+static bool
+pci_2_3_set_irq_mask(struct pci_dev *dev, bool mask, bool check_status)
+{
+	u32 cmd_status_dword;
+	u16 origcmd, newcmd;
+	bool mask_updated = true;
+
+	/*
+	 * We do a single dword read to retrieve both command and status.
+	 * Document assumptions that make this possible.
+	 */
+	BUILD_BUG_ON(PCI_COMMAND % 4);
+	BUILD_BUG_ON(PCI_COMMAND + 2 != PCI_STATUS);
+
+	pci_block_user_cfg_access(dev);
+
+	/*
+	 * Read both command and status registers in a single 32-bit operation.
+	 * Note: we could cache the value for command and move the status read
+	 * out of the lock if there was a way to get notified of user changes
+	 * to command register through sysfs. Should be good for shared irqs.
+	 */
+	pci_read_config_dword(dev, PCI_COMMAND, &cmd_status_dword);
+
+	if (check_status) {
+		bool irq_pending =
+			(cmd_status_dword >> 16) & PCI_STATUS_INTERRUPT;
+
+		/*
+		 * Check interrupt status register to see whether our device
+		 * triggered the interrupt (when masking) or the next IRQ is
+		 * already pending (when unmasking).
+		 */
+		if (mask != irq_pending) {
+			mask_updated = false;
+			goto done;
+		}
+	}
+
+	origcmd = cmd_status_dword;
+	newcmd = origcmd & ~PCI_COMMAND_INTX_DISABLE;
+	if (mask)
+		newcmd |= PCI_COMMAND_INTX_DISABLE;
+	if (newcmd != origcmd)
+		pci_write_config_word(dev, PCI_COMMAND, newcmd);
+
+done:
+	pci_unblock_user_cfg_access(dev);
+	return mask_updated;
+}
+
+static void pci_2_3_irq_mask(struct pci_dev *dev)
+{
+	pci_2_3_set_irq_mask(dev, true, false);
+}
+
+static bool pci_2_3_irq_check_and_mask(struct pci_dev *dev)
+{
+	return pci_2_3_set_irq_mask(dev, true, true);
+}
+
+static void pci_2_3_irq_unmask(struct pci_dev *dev)
+{
+	pci_2_3_set_irq_mask(dev, false, false);
+}
+
+static bool pci_2_3_irq_check_and_unmask(struct pci_dev *dev)
+{
+	return pci_2_3_set_irq_mask(dev, false, true);
+}
+
+static irqreturn_t kvm_assigned_dev_intr_intx(int irq, void *dev_id)
+{
+	struct kvm_assigned_dev_kernel *assigned_dev = dev_id;
+	unsigned long irq_status = get_irq_status(irq);
+	int ret;
+
+	assigned_dev->last_irq_status = irq_status;
+
+	if (!(irq_status & (IRQS_SHARED | IRQS_MAKE_SHAREABLE)))
+		return IRQ_WAKE_THREAD;
+
+	spin_lock(&assigned_dev->intx_lock);
+
+	if (irq_status & IRQS_MAKE_SHAREABLE) {
+		if (assigned_dev->intx_state == KVM_INTX_LINE_DISABLED) {
+			pci_2_3_irq_mask(assigned_dev->dev);
+			enable_irq(irq);
+			assigned_dev->intx_state = KVM_INTX_DEVICE_DISABLED;
+		}
+		ret = IRQ_HANDLED;
+	} else if (pci_2_3_irq_check_and_mask(assigned_dev->dev)) {
+		assigned_dev->intx_state = KVM_INTX_DEVICE_DISABLED;
+		ret = IRQ_WAKE_THREAD;
+	} else
+		ret =IRQ_NONE;
+
+	spin_unlock(&assigned_dev->intx_lock);
+
+	return ret;
+}
+
+static irqreturn_t kvm_assigned_dev_thread_intx(int irq, void *dev_id)
 {
 	struct kvm_assigned_dev_kernel *assigned_dev = dev_id;
 
-	if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_INTX) {
-		spin_lock(&assigned_dev->intx_lock);
-		disable_irq_nosync(irq);
-		assigned_dev->host_irq_disabled = true;
-		spin_unlock(&assigned_dev->intx_lock);
+	if (!(assigned_dev->last_irq_status & IRQS_SHARED)) {
+		spin_lock_irq(&assigned_dev->intx_lock);
+		if (assigned_dev->intx_state == KVM_INTX_ENABLED) {
+			disable_irq_nosync(irq);
+			assigned_dev->intx_state = KVM_INTX_LINE_DISABLED;
+		}
+		spin_unlock_irq(&assigned_dev->intx_lock);
 	}
 
+	spin_lock(&assigned_dev->intx_mask_lock);
+	if (!(assigned_dev->flags & KVM_DEV_ASSIGN_MASK_INTX))
+		kvm_set_irq(assigned_dev->kvm, assigned_dev->irq_source_id,
+			    assigned_dev->guest_irq, 1);
+	spin_unlock(&assigned_dev->intx_mask_lock);
+
+	return IRQ_HANDLED;
+}
+
+#ifdef __KVM_HAVE_MSI
+static irqreturn_t kvm_assigned_dev_thread_msi(int irq, void *dev_id)
+{
+	struct kvm_assigned_dev_kernel *assigned_dev = dev_id;
+
 	kvm_set_irq(assigned_dev->kvm, assigned_dev->irq_source_id,
 		    assigned_dev->guest_irq, 1);
 
 	return IRQ_HANDLED;
 }
+#endif
 
 #ifdef __KVM_HAVE_MSIX
 static irqreturn_t kvm_assigned_dev_thread_msix(int irq, void *dev_id)
@@ -102,15 +221,36 @@ static void kvm_assigned_dev_ack_irq(struct kvm_irq_ack_notifier *kian)
 
 	kvm_set_irq(dev->kvm, dev->irq_source_id, dev->guest_irq, 0);
 
-	/* The guest irq may be shared so this ack may be
-	 * from another device.
-	 */
-	spin_lock(&dev->intx_lock);
-	if (dev->host_irq_disabled) {
-		enable_irq(dev->host_irq);
-		dev->host_irq_disabled = false;
+	if (likely(!(dev->irq_requested_type & KVM_DEV_IRQ_HOST_INTX)))
+		return;
+
+	spin_lock(&dev->intx_mask_lock);
+
+	if (!(dev->flags & KVM_DEV_ASSIGN_MASK_INTX)) {
+		bool reassert = false;
+
+		spin_lock_irq(&dev->intx_lock);
+		/*
+		 * The guest IRQ may be shared so this ack can come from an
+		 * IRQ for another guest device.
+		 */
+		if (dev->intx_state == KVM_INTX_LINE_DISABLED) {
+			enable_irq(dev->host_irq);
+			dev->intx_state = KVM_INTX_ENABLED;
+		} else if (dev->intx_state == KVM_INTX_DEVICE_DISABLED) {
+			if (pci_2_3_irq_check_and_unmask(dev->dev))
+				dev->intx_state = KVM_INTX_ENABLED;
+			else
+				reassert = true;
+		}
+		spin_unlock_irq(&dev->intx_lock);
+
+		if (reassert)
+			kvm_set_irq(dev->kvm, dev->irq_source_id,
+				    dev->guest_irq, 1);
 	}
-	spin_unlock(&dev->intx_lock);
+
+	spin_unlock(&dev->intx_mask_lock);
 }
 
 static void deassign_guest_irq(struct kvm *kvm,
@@ -155,14 +295,21 @@ static void deassign_host_irq(struct kvm *kvm,
 		kfree(assigned_dev->host_msix_entries);
 		kfree(assigned_dev->guest_msix_entries);
 		pci_disable_msix(assigned_dev->dev);
+	} else if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSI) {
+		free_irq(assigned_dev->host_irq, assigned_dev);
+		pci_disable_msi(assigned_dev->dev);
 	} else {
-		/* Deal with MSI and INTx */
-		disable_irq(assigned_dev->host_irq);
+		if (assigned_dev->flags & KVM_DEV_ASSIGN_PCI_2_3) {
+			spin_lock_irq(&assigned_dev->intx_lock);
+			pci_2_3_irq_mask(assigned_dev->dev);
+			/* prevent re-enabling by kvm_assigned_dev_ack_irq */
+			assigned_dev->intx_state = KVM_INTX_ENABLED;
+			spin_unlock_irq(&assigned_dev->intx_lock);
+			synchronize_irq(assigned_dev->host_irq);
+		} else
+			disable_irq(assigned_dev->host_irq);
 
 		free_irq(assigned_dev->host_irq, assigned_dev);
-
-		if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSI)
-			pci_disable_msi(assigned_dev->dev);
 	}
 
 	assigned_dev->irq_requested_type &= ~(KVM_DEV_IRQ_HOST_MASK);
@@ -231,16 +378,41 @@ void kvm_free_all_assigned_devices(struct kvm *kvm)
 static int assigned_device_enable_host_intx(struct kvm *kvm,
 					    struct kvm_assigned_dev_kernel *dev)
 {
+	irq_handler_t handler;
+	unsigned long flags;
+	int err;
+
 	dev->host_irq = dev->dev->irq;
-	/* Even though this is PCI, we don't want to use shared
-	 * interrupts. Sharing host devices with guest-assigned devices
-	 * on the same interrupt line is not a happy situation: there
-	 * are going to be long delays in accepting, acking, etc.
+	dev->intx_state = KVM_INTX_ENABLED;
+	dev->last_irq_status = 0;
+
+	/*
+	 * We can only share the IRQ line with other host devices if we are
+	 * able to disable the IRQ source at device-level - independently of
+	 * the guest driver. Otherwise host devices may suffer from unbounded
+	 * IRQ latencies when the guest keeps the line asserted.
+	 * If PCI 2.3 support is available, we can instal a sharing notifier
+	 * and apply the required disabling pattern on demand.
 	 */
-	if (request_threaded_irq(dev->host_irq, NULL, kvm_assigned_dev_thread,
-				 IRQF_ONESHOT, dev->irq_name, dev))
-		return -EIO;
-	return 0;
+	if (dev->flags & KVM_DEV_ASSIGN_PCI_2_3) {
+		handler = kvm_assigned_dev_intr_intx;
+		flags = IRQF_SHARED | IRQF_ADAPTIVE | IRQF_COND_ONESHOT;
+	} else {
+		handler = NULL;
+		flags = IRQF_ONESHOT;
+	}
+
+	err = request_threaded_irq(dev->host_irq, handler,
+				   kvm_assigned_dev_thread_intx, flags,
+				   dev->irq_name, dev);
+
+	if (!err && dev->flags & KVM_DEV_ASSIGN_PCI_2_3) {
+		spin_lock_irq(&dev->intx_lock);
+		pci_2_3_irq_unmask(dev->dev);
+		spin_unlock_irq(&dev->intx_lock);
+	}
+
+	return err;
 }
 
 #ifdef __KVM_HAVE_MSI
@@ -256,8 +428,9 @@ static int assigned_device_enable_host_msi(struct kvm *kvm,
 	}
 
 	dev->host_irq = dev->dev->irq;
-	if (request_threaded_irq(dev->host_irq, NULL, kvm_assigned_dev_thread,
-				 0, dev->irq_name, dev)) {
+	if (request_threaded_irq(dev->host_irq, NULL,
+				 kvm_assigned_dev_thread_msi, 0,
+				 dev->irq_name, dev)) {
 		pci_disable_msi(dev->dev);
 		return -EIO;
 	}
@@ -296,7 +469,6 @@ err:
 	pci_disable_msix(dev->dev);
 	return r;
 }
-
 #endif
 
 static int assigned_device_enable_guest_intx(struct kvm *kvm,
@@ -315,7 +487,6 @@ static int assigned_device_enable_guest_msi(struct kvm *kvm,
 {
 	dev->guest_irq = irq->guest_irq;
 	dev->ack_notifier.gsi = -1;
-	dev->host_irq_disabled = false;
 	return 0;
 }
 #endif
@@ -327,7 +498,6 @@ static int assigned_device_enable_guest_msix(struct kvm *kvm,
 {
 	dev->guest_irq = irq->guest_irq;
 	dev->ack_notifier.gsi = -1;
-	dev->host_irq_disabled = false;
 	return 0;
 }
 #endif
@@ -461,6 +631,7 @@ static int kvm_vm_ioctl_deassign_dev_irq(struct kvm *kvm,
 {
 	int r = -ENODEV;
 	struct kvm_assigned_dev_kernel *match;
+	unsigned long irq_type;
 
 	mutex_lock(&kvm->lock);
 
@@ -469,12 +640,51 @@ static int kvm_vm_ioctl_deassign_dev_irq(struct kvm *kvm,
 	if (!match)
 		goto out;
 
-	r = kvm_deassign_irq(kvm, match, assigned_irq->flags);
+	irq_type = assigned_irq->flags & (KVM_DEV_IRQ_HOST_MASK |
+					  KVM_DEV_IRQ_GUEST_MASK);
+	r = kvm_deassign_irq(kvm, match, irq_type);
 out:
 	mutex_unlock(&kvm->lock);
 	return r;
 }
 
+/*
+ * Verify that the device supports Interrupt Disable bit in command register,
+ * per PCI 2.3, by flipping this bit and reading it back: this bit was readonly
+ * in PCI 2.2.
+ */
+static bool pci_2_3_supported(struct pci_dev *pdev)
+{
+	u16 orig, new;
+
+	pci_block_user_cfg_access(pdev);
+
+	pci_read_config_word(pdev, PCI_COMMAND, &orig);
+	pci_write_config_word(pdev, PCI_COMMAND,
+			      orig ^ PCI_COMMAND_INTX_DISABLE);
+	pci_read_config_word(pdev, PCI_COMMAND, &new);
+	pci_write_config_word(pdev, PCI_COMMAND, orig);
+
+	pci_unblock_user_cfg_access(pdev);
+
+	/*
+	 * There's no way to protect against
+	 * hardware bugs or detect them reliably, but as long as we know
+	 * what the value should be, let's go ahead and check it.
+	 */
+	if ((new ^ orig) & ~PCI_COMMAND_INTX_DISABLE) {
+		dev_err(&pdev->dev, "Command changed from 0x%x to 0x%x: "
+			"driver or HW bug?\n", orig, new);
+		return false;
+	}
+	if (!((new ^ orig) & PCI_COMMAND_INTX_DISABLE)) {
+		dev_warn(&pdev->dev, "Device does not support disabling "
+			 "interrupts, IRQ sharing impossible.\n");
+		return false;
+	}
+	return true;
+}
+
 static int kvm_vm_ioctl_assign_device(struct kvm *kvm,
 				      struct kvm_assigned_pci_dev *assigned_dev)
 {
@@ -523,6 +733,9 @@ static int kvm_vm_ioctl_assign_device(struct kvm *kvm,
 	pci_reset_function(dev);
 	pci_save_state(dev);
 
+	if (!pci_2_3_supported(dev))
+		assigned_dev->flags &= ~KVM_DEV_ASSIGN_PCI_2_3;
+
 	match->assigned_dev_id = assigned_dev->assigned_dev_id;
 	match->host_segnr = assigned_dev->segnr;
 	match->host_busnr = assigned_dev->busnr;
@@ -530,6 +743,7 @@ static int kvm_vm_ioctl_assign_device(struct kvm *kvm,
 	match->flags = assigned_dev->flags;
 	match->dev = dev;
 	spin_lock_init(&match->intx_lock);
+	spin_lock_init(&match->intx_mask_lock);
 	match->irq_source_id = -1;
 	match->kvm = kvm;
 	match->ack_notifier.irq_acked = kvm_assigned_dev_ack_irq;
@@ -676,6 +890,53 @@ msix_entry_out:
 }
 #endif
 
+static int kvm_vm_ioctl_set_pci_irq_mask(struct kvm *kvm,
+		struct kvm_assigned_pci_dev *assigned_dev)
+{
+	int r = 0;
+	struct kvm_assigned_dev_kernel *match;
+
+	mutex_lock(&kvm->lock);
+
+	match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
+				      assigned_dev->assigned_dev_id);
+	if (!match) {
+		r = -ENODEV;
+		goto out;
+	}
+
+	spin_lock(&match->intx_mask_lock);
+
+	match->flags &= ~KVM_DEV_ASSIGN_MASK_INTX;
+	match->flags |= assigned_dev->flags & KVM_DEV_ASSIGN_MASK_INTX;
+
+	if (assigned_dev->flags & KVM_DEV_ASSIGN_MASK_INTX) {
+		kvm_set_irq(match->kvm, match->irq_source_id,
+			    match->guest_irq, 0);
+		/*
+		 * Masking at hardware-level is performed on demand, i.e. when
+		 * an IRQ actually arrives at the host.
+		 */
+	} else {
+		/*
+		 * Unmask the IRQ line. It may have been masked meanwhile if
+		 * we aren't using PCI 2.3 INTx masking on the host side.
+		 */
+		spin_lock_irq(&match->intx_lock);
+		if (match->intx_state == KVM_INTX_LINE_DISABLED) {
+			enable_irq(match->host_irq);
+			match->intx_state = KVM_INTX_ENABLED;
+		}
+		spin_unlock_irq(&match->intx_lock);
+	}
+
+	spin_unlock(&match->intx_mask_lock);
+
+out:
+	mutex_unlock(&kvm->lock);
+	return r;
+}
+
 long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl,
 				  unsigned long arg)
 {
@@ -783,6 +1044,15 @@ long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl,
 		break;
 	}
 #endif
+	case KVM_ASSIGN_SET_INTX_MASK: {
+		struct kvm_assigned_pci_dev assigned_dev;
+
+		r = -EFAULT;
+		if (copy_from_user(&assigned_dev, argp, sizeof assigned_dev))
+			goto out;
+		r = kvm_vm_ioctl_set_pci_irq_mask(kvm, &assigned_dev);
+		break;
+	}
 	default:
 		r = -ENOTTY;
 		break;
-- 
1.7.1


^ permalink raw reply related	[flat|nested] 30+ messages in thread

* Re: [PATCH v2 2/4] genirq: Inform handler about line sharing state
  2010-12-12 11:22 ` [PATCH v2 2/4] genirq: Inform handler about line sharing state Jan Kiszka
@ 2010-12-12 16:53   ` Thomas Gleixner
  2010-12-12 21:49     ` Jan Kiszka
  0 siblings, 1 reply; 30+ messages in thread
From: Thomas Gleixner @ 2010-12-12 16:53 UTC (permalink / raw)
  To: Jan Kiszka
  Cc: Avi Kivity, Marcelo Tosatti, linux-kernel, kvm, Tom Lyon,
	Alex Williamson, Michael S. Tsirkin, Jan Kiszka

On Sun, 12 Dec 2010, Jan Kiszka wrote:

> From: Jan Kiszka <jan.kiszka@siemens.com>
> 
> This enabled interrupt handlers to retrieve the current line sharing state via
> the new interrupt status word so that they can adapt to it.
> 
> The switch from shared to exclusive is generally uncritical and can thus be
> performed on demand. However, preparing a line for shared mode may require
> preparational steps of the currently registered handler. It can therefore
> request an ahead-of-time notification via IRQF_ADAPTIVE. The notification
> consists of an exceptional handler invocation with IRQS_MAKE_SHAREABLE set in
> the status word.
> 
> Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
> ---
>  include/linux/interrupt.h |   10 ++++++++++
>  include/linux/irqdesc.h   |    2 ++
>  kernel/irq/irqdesc.c      |    2 ++
>  kernel/irq/manage.c       |   44 +++++++++++++++++++++++++++++++++++++++++---
>  4 files changed, 55 insertions(+), 3 deletions(-)
> 
> diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
> index 16cdbbf..c6323a2 100644
> --- a/include/linux/interrupt.h
> +++ b/include/linux/interrupt.h
> @@ -55,6 +55,7 @@
>   *                Used by threaded interrupts which need to keep the
>   *                irq line disabled until the threaded handler has been run.
>   * IRQF_NO_SUSPEND - Do not disable this IRQ during suspend
> + * IRQF_ADAPTIVE - Request notification about upcoming interrupt line sharing
>   *
>   */
>  #define IRQF_DISABLED		0x00000020
> @@ -67,6 +68,7 @@
>  #define IRQF_IRQPOLL		0x00001000
>  #define IRQF_ONESHOT		0x00002000
>  #define IRQF_NO_SUSPEND		0x00004000
> +#define IRQF_ADAPTIVE		0x00008000
>  
>  #define IRQF_TIMER		(__IRQF_TIMER | IRQF_NO_SUSPEND)
>  
> @@ -126,6 +128,14 @@ struct irqaction {
>  
>  extern irqreturn_t no_action(int cpl, void *dev_id);
>  
> +/*
> + * Driver-readable IRQ line status flags:
> + * IRQS_SHARED - line is shared between multiple handlers
> + * IRQS_MAKE_SHAREABLE - in the process of making an exclusive line shareable
> + */
> +#define IRQS_SHARED		0x00000001
> +#define IRQS_MAKE_SHAREABLE	0x00000002
> +
>  extern unsigned long get_irq_status(unsigned long irq);
>  
>  #ifdef CONFIG_GENERIC_HARDIRQS
> diff --git a/include/linux/irqdesc.h b/include/linux/irqdesc.h
> index 979c68c..c490e83 100644
> --- a/include/linux/irqdesc.h
> +++ b/include/linux/irqdesc.h
> @@ -29,6 +29,7 @@ struct timer_rand_state;
>   * @wait_for_threads:	wait queue for sync_irq to wait for threaded handlers
>   * @dir:		/proc/irq/ procfs entry
>   * @name:		flow handler name for /proc/interrupts output
> + * @register_lock:	protects registration & release, for unshared->shared

I think we can make that a global mutex. request/free_irq are not
hotpath operations which require a mutex per irq descriptor.

Thanks,

	tglx

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH v2 1/4] genirq: Introduce driver-readable IRQ status word
  2010-12-12 11:22 ` [PATCH v2 1/4] genirq: Introduce driver-readable IRQ status word Jan Kiszka
@ 2010-12-12 17:29   ` Thomas Gleixner
  2010-12-12 21:51     ` Jan Kiszka
  0 siblings, 1 reply; 30+ messages in thread
From: Thomas Gleixner @ 2010-12-12 17:29 UTC (permalink / raw)
  To: Jan Kiszka
  Cc: Avi Kivity, Marcelo Tosatti, linux-kernel, kvm, Tom Lyon,
	Alex Williamson, Michael S. Tsirkin, Jan Kiszka

On Sun, 12 Dec 2010, Jan Kiszka wrote:

> From: Jan Kiszka <jan.kiszka@siemens.com>
> 
> This associates a status word with every IRQ descriptor. Drivers can obtain
> its content via get_irq_status(irq). First use case will be propagating the
> interrupt sharing state.
> 
> Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
> ---
>  include/linux/interrupt.h |    2 ++
>  include/linux/irq.h       |    2 ++
>  kernel/irq/manage.c       |   15 +++++++++++++++
>  3 files changed, 19 insertions(+), 0 deletions(-)
> 
> diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
> index 79d0c4f..16cdbbf 100644
> --- a/include/linux/interrupt.h
> +++ b/include/linux/interrupt.h
> @@ -126,6 +126,8 @@ struct irqaction {
>  
>  extern irqreturn_t no_action(int cpl, void *dev_id);
>  
> +extern unsigned long get_irq_status(unsigned long irq);
> +
>  #ifdef CONFIG_GENERIC_HARDIRQS
>  extern int __must_check
>  request_threaded_irq(unsigned int irq, irq_handler_t handler,
> diff --git a/include/linux/irq.h b/include/linux/irq.h
> index abde252..5554203 100644
> --- a/include/linux/irq.h
> +++ b/include/linux/irq.h
> @@ -96,6 +96,7 @@ struct msi_desc;
>   *			methods, to allow shared chip implementations
>   * @msi_desc:		MSI descriptor
>   * @affinity:		IRQ affinity on SMP
> + * @status:		driver-readable status flags (IRQS_*)
>   *
>   * The fields here need to overlay the ones in irq_desc until we
>   * cleaned up the direct references and switched everything over to
> @@ -108,6 +109,7 @@ struct irq_data {
>  	void			*handler_data;
>  	void			*chip_data;
>  	struct msi_desc		*msi_desc;
> +	unsigned long		status;

That breaks the current code which has irq_data embedded and shadowed
in irq_desc for migratory reasons until all users are fixed up and the
GENERIC_HARDIRQS_NO_DEPRECATED sections are cleaned up. I know it's
ugly, but that was the only way not to rewrite the world in one go. :)
Just move it below affinity.

Also we should name it different than status, drv_status perhaps, to
avoid confusion with the irq_desc status.

>  #ifdef CONFIG_SMP
>  	cpumask_var_t		affinity;
>  #endif

Thanks,

	tglx

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH v2 2/4] genirq: Inform handler about line sharing state
  2010-12-12 16:53   ` Thomas Gleixner
@ 2010-12-12 21:49     ` Jan Kiszka
  0 siblings, 0 replies; 30+ messages in thread
From: Jan Kiszka @ 2010-12-12 21:49 UTC (permalink / raw)
  To: Thomas Gleixner
  Cc: Avi Kivity, Marcelo Tosatti, linux-kernel, kvm, Tom Lyon,
	Alex Williamson, Michael S. Tsirkin, Jan Kiszka

[-- Attachment #1: Type: text/plain, Size: 3026 bytes --]

Am 12.12.2010 17:53, Thomas Gleixner wrote:
> On Sun, 12 Dec 2010, Jan Kiszka wrote:
> 
>> From: Jan Kiszka <jan.kiszka@siemens.com>
>>
>> This enabled interrupt handlers to retrieve the current line sharing state via
>> the new interrupt status word so that they can adapt to it.
>>
>> The switch from shared to exclusive is generally uncritical and can thus be
>> performed on demand. However, preparing a line for shared mode may require
>> preparational steps of the currently registered handler. It can therefore
>> request an ahead-of-time notification via IRQF_ADAPTIVE. The notification
>> consists of an exceptional handler invocation with IRQS_MAKE_SHAREABLE set in
>> the status word.
>>
>> Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
>> ---
>>  include/linux/interrupt.h |   10 ++++++++++
>>  include/linux/irqdesc.h   |    2 ++
>>  kernel/irq/irqdesc.c      |    2 ++
>>  kernel/irq/manage.c       |   44 +++++++++++++++++++++++++++++++++++++++++---
>>  4 files changed, 55 insertions(+), 3 deletions(-)
>>
>> diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
>> index 16cdbbf..c6323a2 100644
>> --- a/include/linux/interrupt.h
>> +++ b/include/linux/interrupt.h
>> @@ -55,6 +55,7 @@
>>   *                Used by threaded interrupts which need to keep the
>>   *                irq line disabled until the threaded handler has been run.
>>   * IRQF_NO_SUSPEND - Do not disable this IRQ during suspend
>> + * IRQF_ADAPTIVE - Request notification about upcoming interrupt line sharing
>>   *
>>   */
>>  #define IRQF_DISABLED		0x00000020
>> @@ -67,6 +68,7 @@
>>  #define IRQF_IRQPOLL		0x00001000
>>  #define IRQF_ONESHOT		0x00002000
>>  #define IRQF_NO_SUSPEND		0x00004000
>> +#define IRQF_ADAPTIVE		0x00008000
>>  
>>  #define IRQF_TIMER		(__IRQF_TIMER | IRQF_NO_SUSPEND)
>>  
>> @@ -126,6 +128,14 @@ struct irqaction {
>>  
>>  extern irqreturn_t no_action(int cpl, void *dev_id);
>>  
>> +/*
>> + * Driver-readable IRQ line status flags:
>> + * IRQS_SHARED - line is shared between multiple handlers
>> + * IRQS_MAKE_SHAREABLE - in the process of making an exclusive line shareable
>> + */
>> +#define IRQS_SHARED		0x00000001
>> +#define IRQS_MAKE_SHAREABLE	0x00000002
>> +
>>  extern unsigned long get_irq_status(unsigned long irq);
>>  
>>  #ifdef CONFIG_GENERIC_HARDIRQS
>> diff --git a/include/linux/irqdesc.h b/include/linux/irqdesc.h
>> index 979c68c..c490e83 100644
>> --- a/include/linux/irqdesc.h
>> +++ b/include/linux/irqdesc.h
>> @@ -29,6 +29,7 @@ struct timer_rand_state;
>>   * @wait_for_threads:	wait queue for sync_irq to wait for threaded handlers
>>   * @dir:		/proc/irq/ procfs entry
>>   * @name:		flow handler name for /proc/interrupts output
>> + * @register_lock:	protects registration & release, for unshared->shared
> 
> I think we can make that a global mutex. request/free_irq are not
> hotpath operations which require a mutex per irq descriptor.
> 

Agreed, will change this.

Jan


[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 259 bytes --]

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH v2 1/4] genirq: Introduce driver-readable IRQ status word
  2010-12-12 17:29   ` Thomas Gleixner
@ 2010-12-12 21:51     ` Jan Kiszka
  2010-12-13  8:17       ` Thomas Gleixner
  0 siblings, 1 reply; 30+ messages in thread
From: Jan Kiszka @ 2010-12-12 21:51 UTC (permalink / raw)
  To: Thomas Gleixner
  Cc: Avi Kivity, Marcelo Tosatti, linux-kernel, kvm, Tom Lyon,
	Alex Williamson, Michael S. Tsirkin, Jan Kiszka

[-- Attachment #1: Type: text/plain, Size: 2331 bytes --]

Am 12.12.2010 18:29, Thomas Gleixner wrote:
> On Sun, 12 Dec 2010, Jan Kiszka wrote:
> 
>> From: Jan Kiszka <jan.kiszka@siemens.com>
>>
>> This associates a status word with every IRQ descriptor. Drivers can obtain
>> its content via get_irq_status(irq). First use case will be propagating the
>> interrupt sharing state.
>>
>> Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
>> ---
>>  include/linux/interrupt.h |    2 ++
>>  include/linux/irq.h       |    2 ++
>>  kernel/irq/manage.c       |   15 +++++++++++++++
>>  3 files changed, 19 insertions(+), 0 deletions(-)
>>
>> diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
>> index 79d0c4f..16cdbbf 100644
>> --- a/include/linux/interrupt.h
>> +++ b/include/linux/interrupt.h
>> @@ -126,6 +126,8 @@ struct irqaction {
>>  
>>  extern irqreturn_t no_action(int cpl, void *dev_id);
>>  
>> +extern unsigned long get_irq_status(unsigned long irq);
>> +
>>  #ifdef CONFIG_GENERIC_HARDIRQS
>>  extern int __must_check
>>  request_threaded_irq(unsigned int irq, irq_handler_t handler,
>> diff --git a/include/linux/irq.h b/include/linux/irq.h
>> index abde252..5554203 100644
>> --- a/include/linux/irq.h
>> +++ b/include/linux/irq.h
>> @@ -96,6 +96,7 @@ struct msi_desc;
>>   *			methods, to allow shared chip implementations
>>   * @msi_desc:		MSI descriptor
>>   * @affinity:		IRQ affinity on SMP
>> + * @status:		driver-readable status flags (IRQS_*)
>>   *
>>   * The fields here need to overlay the ones in irq_desc until we
>>   * cleaned up the direct references and switched everything over to
>> @@ -108,6 +109,7 @@ struct irq_data {
>>  	void			*handler_data;
>>  	void			*chip_data;
>>  	struct msi_desc		*msi_desc;
>> +	unsigned long		status;
> 
> That breaks the current code which has irq_data embedded and shadowed
> in irq_desc for migratory reasons until all users are fixed up and the
> GENERIC_HARDIRQS_NO_DEPRECATED sections are cleaned up. I know it's
> ugly, but that was the only way not to rewrite the world in one go. :)
> Just move it below affinity.
> 
> Also we should name it different than status, drv_status perhaps, to
> avoid confusion with the irq_desc status.

OK, will address both in a succeeding round (just waiting for potential
further comments).

Thanks,
Jan


[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 259 bytes --]

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH v2 1/4] genirq: Introduce driver-readable IRQ status word
  2010-12-12 21:51     ` Jan Kiszka
@ 2010-12-13  8:17       ` Thomas Gleixner
  0 siblings, 0 replies; 30+ messages in thread
From: Thomas Gleixner @ 2010-12-13  8:17 UTC (permalink / raw)
  To: Jan Kiszka
  Cc: Avi Kivity, Marcelo Tosatti, linux-kernel, kvm, Tom Lyon,
	Alex Williamson, Michael S. Tsirkin, Jan Kiszka

On Sun, 12 Dec 2010, Jan Kiszka wrote:
> Am 12.12.2010 18:29, Thomas Gleixner wrote:
> > Also we should name it different than status, drv_status perhaps, to
> > avoid confusion with the irq_desc status.
> 
> OK, will address both in a succeeding round (just waiting for potential
> further comments).

No further comments from my side ATM.

Thanks,

	tglx

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH v2 0/4] KVM & genirq: Enable adaptive IRQ sharing for passed-through devices
  2010-12-12 11:22 [PATCH v2 0/4] KVM & genirq: Enable adaptive IRQ sharing for passed-through devices Jan Kiszka
                   ` (3 preceding siblings ...)
  2010-12-12 11:22 ` [PATCH v2 4/4] KVM: Allow host IRQ sharing for passed-through PCI 2.3 devices Jan Kiszka
@ 2010-12-13 10:10 ` Michael S. Tsirkin
  2010-12-13 22:59   ` Jan Kiszka
  4 siblings, 1 reply; 30+ messages in thread
From: Michael S. Tsirkin @ 2010-12-13 10:10 UTC (permalink / raw)
  To: Jan Kiszka
  Cc: Thomas Gleixner, Avi Kivity, Marcelo Tosatti, linux-kernel, kvm,
	Tom Lyon, Alex Williamson

On Sun, Dec 12, 2010 at 12:22:40PM +0100, Jan Kiszka wrote:
> The result may look simpler on first glance than v1, but it comes with
> more subtle race scenarios IMO. I thought them through, hopefully
> catching all, but I would appreciate any skeptical review.

Thought about the races till my head hurt, and yes, they
all seem to be handled correctly. FWIW

Reviewed-by: Michael S. Tsirkin <mst@redhat.com>

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH v2 4/4] KVM: Allow host IRQ sharing for passed-through PCI 2.3 devices
  2010-12-12 11:22 ` [PATCH v2 4/4] KVM: Allow host IRQ sharing for passed-through PCI 2.3 devices Jan Kiszka
@ 2010-12-13 10:19   ` Avi Kivity
  0 siblings, 0 replies; 30+ messages in thread
From: Avi Kivity @ 2010-12-13 10:19 UTC (permalink / raw)
  To: Jan Kiszka
  Cc: Thomas Gleixner, Marcelo Tosatti, linux-kernel, kvm, Tom Lyon,
	Alex Williamson, Michael S. Tsirkin, Jan Kiszka

On 12/12/2010 01:22 PM, Jan Kiszka wrote:
> From: Jan Kiszka<jan.kiszka@siemens.com>
>
> PCI 2.3 allows to generically disable IRQ sources at device level. This
> enables us to share IRQs of such devices on the host side when passing
> them to a guest.
>
> However, IRQ disabling via the PCI config space is more costly than
> masking the line via disable_irq. Therefore we register the IRQ in adaptive
> mode and switch between line and device level disabling on demand.
>
> This feature is optional, user space has to request it explicitly as it
> also has to inform us about its view of PCI_COMMAND_INTX_DISABLE. That
> way, we can avoid unmasking the interrupt and signaling it if the guest
> masked it via the PCI config space.
>

Looks fine.

> +		ret =IRQ_NONE;
> +

Danger, whitespace error detected.  Initiating self-destruct sequence.

-- 
error compiling committee.c: too many arguments to function

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH v2 0/4] KVM & genirq: Enable adaptive IRQ sharing for passed-through devices
  2010-12-13 10:10 ` [PATCH v2 0/4] KVM & genirq: Enable adaptive IRQ sharing for passed-through devices Michael S. Tsirkin
@ 2010-12-13 22:59   ` Jan Kiszka
  0 siblings, 0 replies; 30+ messages in thread
From: Jan Kiszka @ 2010-12-13 22:59 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: Thomas Gleixner, Avi Kivity, Marcelo Tosatti, linux-kernel, kvm,
	Tom Lyon, Alex Williamson

[-- Attachment #1: Type: text/plain, Size: 559 bytes --]

Am 13.12.2010 11:10, Michael S. Tsirkin wrote:
> On Sun, Dec 12, 2010 at 12:22:40PM +0100, Jan Kiszka wrote:
>> The result may look simpler on first glance than v1, but it comes with
>> more subtle race scenarios IMO. I thought them through, hopefully
>> catching all, but I would appreciate any skeptical review.
> 
> Thought about the races till my head hurt, and yes, they
> all seem to be handled correctly. FWIW

Ouch, I'm endlessly sorry for causing this pain.

> 
> Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
> 

Thanks!

Jan


[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 259 bytes --]

^ permalink raw reply	[flat|nested] 30+ messages in thread

end of thread, other threads:[~2010-12-13 22:59 UTC | newest]

Thread overview: 30+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2010-12-12 11:22 [PATCH v2 0/4] KVM & genirq: Enable adaptive IRQ sharing for passed-through devices Jan Kiszka
2010-12-12 11:22 ` [PATCH v2 1/4] genirq: Introduce driver-readable IRQ status word Jan Kiszka
2010-12-12 17:29   ` Thomas Gleixner
2010-12-12 21:51     ` Jan Kiszka
2010-12-13  8:17       ` Thomas Gleixner
2010-12-12 11:22 ` [PATCH v2 2/4] genirq: Inform handler about line sharing state Jan Kiszka
2010-12-12 16:53   ` Thomas Gleixner
2010-12-12 21:49     ` Jan Kiszka
2010-12-12 11:22 ` [PATCH v2 3/4] genirq: Add support for IRQF_COND_ONESHOT Jan Kiszka
2010-12-12 11:22 ` [PATCH v2 4/4] KVM: Allow host IRQ sharing for passed-through PCI 2.3 devices Jan Kiszka
2010-12-13 10:19   ` Avi Kivity
2010-12-13 10:10 ` [PATCH v2 0/4] KVM & genirq: Enable adaptive IRQ sharing for passed-through devices Michael S. Tsirkin
2010-12-13 22:59   ` Jan Kiszka
  -- strict thread matches above, loose matches on Subject: below --
2010-11-02 15:49 [PATCH v2 0/4] KVM: Improve IRQ assignment for device passthrough Jan Kiszka
2010-11-02 15:49 ` [PATCH v2 4/4] KVM: Allow host IRQ sharing for passed-through PCI 2.3 devices Jan Kiszka
2010-11-02 17:41   ` Michael S. Tsirkin
2010-11-02 17:56     ` Jan Kiszka
2010-11-02 18:24       ` Michael S. Tsirkin
2010-11-02 18:40         ` Jan Kiszka
2010-11-02 18:48           ` Jan Kiszka
2010-11-02 18:51             ` Jan Kiszka
2010-11-02 18:54               ` Michael S. Tsirkin
2010-11-02 19:30                 ` Jan Kiszka
2010-11-02 19:53                   ` Michael S. Tsirkin
2010-11-02 19:58                     ` Jan Kiszka
2010-11-02 20:05                       ` Michael S. Tsirkin
2010-11-02 18:52           ` Michael S. Tsirkin
2010-11-02 19:11             ` Jan Kiszka
2010-11-02 19:14               ` Michael S. Tsirkin
2010-11-02 19:56                 ` Jan Kiszka
2010-11-02 19:41           ` Alex Williamson

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox