[PATCH 1/2] Remove kvm_commit_irq_routes from error messages

public inbox for kvm@vger.kernel.org
 help / color / mirror / Atom feed

* [PATCH 1/2] Remove kvm_commit_irq_routes from error messages
@ 2012-05-24 17:02 Richard Weinberger
  2012-05-24 17:02 ` [PATCH 2/2] Device assignment: Fix MSI IRQ affinity setting Richard Weinberger
  0 siblings, 1 reply; 16+ messages in thread
From: Richard Weinberger @ 2012-05-24 17:02 UTC (permalink / raw)
  To: kvm; +Cc: avi, mtosatti, tglx, Richard Weinberger

Make my life a bit easier and report the correct function names.
s/kvm_commit_irq_routes/kvm_irqchip_commit_routes

Signed-off-by: Richard Weinberger <richard@nod.at>
---
 hw/device-assignment.c |    4 ++--
 hw/msi.c               |    2 +-
 hw/msix.c              |    4 ++--
 3 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/hw/device-assignment.c b/hw/device-assignment.c
index 1daadb9..09726f9 100644
--- a/hw/device-assignment.c
+++ b/hw/device-assignment.c
@@ -958,7 +958,7 @@ static void assigned_dev_update_msi(PCIDevice *pci_dev)
 
         kvm_add_routing_entry(kvm_state, assigned_dev->entry);
         if (kvm_irqchip_commit_routes(kvm_state) < 0) {
-            perror("assigned_dev_update_msi: kvm_commit_irq_routes");
+            perror("assigned_dev_update_msi: kvm_irqchip_commit_routes");
             assigned_dev->cap.state &= ~ASSIGNED_DEVICE_MSI_ENABLED;
             return;
         }
@@ -1053,7 +1053,7 @@ static int assigned_dev_update_msix_mmio(PCIDevice *pci_dev)
     }
 
     if (r == 0 && kvm_irqchip_commit_routes(kvm_state) < 0) {
-	    perror("assigned_dev_update_msix_mmio: kvm_commit_irq_routes");
+	    perror("assigned_dev_update_msix_mmio: kvm_irqchip_commit_routes");
 	    return -EINVAL;
     }
 
diff --git a/hw/msi.c b/hw/msi.c
index 4fcf769..1a20e83 100644
--- a/hw/msi.c
+++ b/hw/msi.c
@@ -180,7 +180,7 @@ static void kvm_msi_update(PCIDevice *dev)
     if (changed) {
         r = kvm_irqchip_commit_routes(kvm_state);
         if (r) {
-            fprintf(stderr, "%s: kvm_commit_irq_routes failed: %s\n", __func__,
+            fprintf(stderr, "%s: kvm_irqchip_commit_routes failed: %s\n", __func__,
                     strerror(-r));
             exit(1);
         }
diff --git a/hw/msix.c b/hw/msix.c
index 5515a32..34b7455 100644
--- a/hw/msix.c
+++ b/hw/msix.c
@@ -91,7 +91,7 @@ static void kvm_msix_update(PCIDevice *dev, int vector,
         *entry = new_entry;
         r = kvm_irqchip_commit_routes(kvm_state);
         if (r) {
-            fprintf(stderr, "%s: kvm_commit_irq_routes failed: %s\n", __func__,
+            fprintf(stderr, "%s: kvm_irqchip_commit_routes failed: %s\n", __func__,
 		    strerror(-r));
             exit(1);
         }
@@ -112,7 +112,7 @@ static int kvm_msix_vector_add(PCIDevice *dev, unsigned vector)
 
     r = kvm_irqchip_commit_routes(kvm_state);
     if (r < 0) {
-        fprintf(stderr, "%s: kvm_commit_irq_routes failed: %s\n", __func__, strerror(-r));
+        fprintf(stderr, "%s: kvm_irqchip_commit_routes failed: %s\n", __func__, strerror(-r));
         return r;
     }
     return 0;
-- 
1.7.3.1


^ permalink raw reply related	[flat|nested] 16+ messages in thread

* [PATCH 2/2] Device assignment: Fix MSI IRQ affinity setting
  2012-05-24 17:02 [PATCH 1/2] Remove kvm_commit_irq_routes from error messages Richard Weinberger
@ 2012-05-24 17:02 ` Richard Weinberger
  2012-05-24 18:20   ` Alex Williamson
  2012-05-24 20:47   ` Jan Kiszka
  0 siblings, 2 replies; 16+ messages in thread
From: Richard Weinberger @ 2012-05-24 17:02 UTC (permalink / raw)
  To: kvm; +Cc: avi, mtosatti, tglx, Richard Weinberger

MSI interrupt affinity setting on the guest ended always up on vcpu0,
no matter what.
IOW writes to /proc/irq/<IRQ>/smp_affinity are irgnored.
This patch fixes the MSI IRQ routing and avoids the utter madness of
tearing down and setting up the interrupt completely when this changes.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Richard Weinberger <richard@nod.at>
---
 hw/device-assignment.c |   73 ++++++++++++++++++++++++++++++++++++++++++++++--
 1 files changed, 70 insertions(+), 3 deletions(-)

diff --git a/hw/device-assignment.c b/hw/device-assignment.c
index 09726f9..78d57c8 100644
--- a/hw/device-assignment.c
+++ b/hw/device-assignment.c
@@ -913,6 +913,50 @@ void assigned_dev_update_irqs(void)
     }
 }
 
+static void assigned_dev_update_msi_route(PCIDevice *pci_dev)
+{
+    AssignedDevice *adev = DO_UPCAST(AssignedDevice, dev, pci_dev);
+    uint8_t ctrl_byte = pci_get_byte(pci_dev->config + pci_dev->msi_cap +
+				     PCI_MSI_FLAGS);
+    struct kvm_irq_routing_entry *old, new;
+    KVMMsiMessage msg;
+    int r;
+
+    if (!(ctrl_byte & PCI_MSI_FLAGS_ENABLE))
+	    return;
+
+    msg.addr_lo =  pci_get_long(pci_dev->config + pci_dev->msi_cap +
+				PCI_MSI_ADDRESS_LO);
+    msg.addr_hi =  pci_get_long(pci_dev->config + pci_dev->msi_cap +
+				PCI_MSI_ADDRESS_HI);
+    msg.data =  pci_get_long(pci_dev->config + pci_dev->msi_cap +
+			     PCI_MSI_DATA_32);
+
+    old = adev->entry;
+    new = *old;
+    new.u.msi.address_lo = msg.addr_lo;
+    new.u.msi.address_hi = msg.addr_hi;
+    new.u.msi.data = msg.data;
+
+    if (memcmp(old, &new, sizeof(new)) == 0)
+        return;
+
+    r = kvm_update_routing_entry(old, &new);
+    if (r < 0) {
+        fprintf(stderr, "%s: kvm_update_msi failed: %s\n", __func__,
+                strerror(-r));
+        exit(1);
+    }
+
+    *old = new;
+     r = kvm_irqchip_commit_routes(kvm_state);
+     if (r) {
+        fprintf(stderr, "%s: kvm_irqchip_commit_routes failed: %s\n", __func__,
+                strerror(-r));
+        exit(1);
+     }
+}
+
 static void assigned_dev_update_msi(PCIDevice *pci_dev)
 {
     struct kvm_assigned_irq assigned_irq_data;
@@ -1116,6 +1160,14 @@ static uint32_t assigned_dev_pci_read_config(PCIDevice *pci_dev,
     uint32_t virt_val = pci_default_read_config(pci_dev, address, len);
     uint32_t real_val, emulate_mask, full_emulation_mask;
 
+    if (assigned_dev->cap.available & ASSIGNED_DEVICE_CAP_MSI) {
+        uint32_t msi_start = pci_dev->msi_cap;
+        uint32_t msi_end = msi_start + PCI_MSI_DATA_64 + 3;
+
+	if (address >= msi_start && (address + len) < msi_end)
+            return virt_val;
+    }
+
     emulate_mask = 0;
     memcpy(&emulate_mask, assigned_dev->emulate_config_read + address, len);
     emulate_mask = le32_to_cpu(emulate_mask);
@@ -1130,6 +1182,17 @@ static uint32_t assigned_dev_pci_read_config(PCIDevice *pci_dev,
     }
 }
 
+static void handle_cfg_write_msi(PCIDevice *pci_dev, AssignedDevice *adev)
+{
+    if (!kvm_enabled() || !kvm_irqchip_in_kernel())
+	return;
+
+    if (adev->entry && (adev->irq_requested_type & KVM_DEV_IRQ_GUEST_MSI))
+        assigned_dev_update_msi_route(pci_dev);
+    else
+        assigned_dev_update_msi(pci_dev);
+}
+
 static void assigned_dev_pci_write_config(PCIDevice *pci_dev, uint32_t address,
                                           uint32_t val, int len)
 {
@@ -1155,9 +1218,13 @@ static void assigned_dev_pci_write_config(PCIDevice *pci_dev, uint32_t address,
         }
     }
     if (assigned_dev->cap.available & ASSIGNED_DEVICE_CAP_MSI) {
-        if (range_covers_byte(address, len,
-                              pci_dev->msi_cap + PCI_MSI_FLAGS)) {
-            assigned_dev_update_msi(pci_dev);
+        uint32_t msi_start = pci_dev->msi_cap;
+        uint32_t msi_end = msi_start + PCI_MSI_DATA_64 + 3;
+
+        if (address >= msi_start && (address + len) < msi_end) {
+            if (address == msi_start + PCI_MSI_DATA_32)
+                handle_cfg_write_msi(pci_dev, assigned_dev);
+            return;
         }
     }
     if (assigned_dev->cap.available & ASSIGNED_DEVICE_CAP_MSIX) {
-- 
1.7.3.1


^ permalink raw reply related	[flat|nested] 16+ messages in thread

* Re: [PATCH 2/2] Device assignment: Fix MSI IRQ affinity setting
  2012-05-24 17:02 ` [PATCH 2/2] Device assignment: Fix MSI IRQ affinity setting Richard Weinberger
@ 2012-05-24 18:20   ` Alex Williamson
       [not found]     ` <CAEMbtc+ycsC6u=CZ_Yg6C=WV=VqjA2uEDM5KWPM_7n3sZh_9Pw@mail.gmail.com>
  2012-05-24 21:39     ` Thomas Gleixner
  2012-05-24 20:47   ` Jan Kiszka
  1 sibling, 2 replies; 16+ messages in thread
From: Alex Williamson @ 2012-05-24 18:20 UTC (permalink / raw)
  To: Richard Weinberger; +Cc: kvm, avi, mtosatti, tglx

On Thu, 2012-05-24 at 18:02 +0100, Richard Weinberger wrote:
> MSI interrupt affinity setting on the guest ended always up on vcpu0,
> no matter what.
> IOW writes to /proc/irq/<IRQ>/smp_affinity are irgnored.
> This patch fixes the MSI IRQ routing and avoids the utter madness of
> tearing down and setting up the interrupt completely when this changes.
> 
> Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
> Signed-off-by: Richard Weinberger <richard@nod.at>
> ---
>  hw/device-assignment.c |   73 ++++++++++++++++++++++++++++++++++++++++++++++--
>  1 files changed, 70 insertions(+), 3 deletions(-)
> 
> diff --git a/hw/device-assignment.c b/hw/device-assignment.c
> index 09726f9..78d57c8 100644
> --- a/hw/device-assignment.c
> +++ b/hw/device-assignment.c
> @@ -913,6 +913,50 @@ void assigned_dev_update_irqs(void)
>      }
>  }
>  
> +static void assigned_dev_update_msi_route(PCIDevice *pci_dev)
> +{
> +    AssignedDevice *adev = DO_UPCAST(AssignedDevice, dev, pci_dev);
> +    uint8_t ctrl_byte = pci_get_byte(pci_dev->config + pci_dev->msi_cap +
> +				     PCI_MSI_FLAGS);
> +    struct kvm_irq_routing_entry *old, new;
> +    KVMMsiMessage msg;
> +    int r;

Please follow qemu coding style for braces throughout.

> +
> +    if (!(ctrl_byte & PCI_MSI_FLAGS_ENABLE))
> +	    return;
> +
> +    msg.addr_lo =  pci_get_long(pci_dev->config + pci_dev->msi_cap +
> +				PCI_MSI_ADDRESS_LO);
> +    msg.addr_hi =  pci_get_long(pci_dev->config + pci_dev->msi_cap +
> +				PCI_MSI_ADDRESS_HI);

Odd, since we only expose a 32bit MSI capability to the guest...

> +    msg.data =  pci_get_long(pci_dev->config + pci_dev->msi_cap +
> +			     PCI_MSI_DATA_32);

Should be pci_get_word()

> +
> +    old = adev->entry;
> +    new = *old;
> +    new.u.msi.address_lo = msg.addr_lo;
> +    new.u.msi.address_hi = msg.addr_hi;
> +    new.u.msi.data = msg.data;
> +
> +    if (memcmp(old, &new, sizeof(new)) == 0)
> +        return;
> +
> +    r = kvm_update_routing_entry(old, &new);

How does this work?  old is now new, so kvm_update_routing_entry() is
never going to match to the existing entry if address_lo or data
actually change.

> +    if (r < 0) {
> +        fprintf(stderr, "%s: kvm_update_msi failed: %s\n", __func__,
> +                strerror(-r));
> +        exit(1);
> +    }
> +
> +    *old = new;

huh?

> +     r = kvm_irqchip_commit_routes(kvm_state);
> +     if (r) {
> +        fprintf(stderr, "%s: kvm_irqchip_commit_routes failed: %s\n", __func__,
> +                strerror(-r));
> +        exit(1);
> +     }
> +}
> +
>  static void assigned_dev_update_msi(PCIDevice *pci_dev)
>  {
>      struct kvm_assigned_irq assigned_irq_data;
> @@ -1116,6 +1160,14 @@ static uint32_t assigned_dev_pci_read_config(PCIDevice *pci_dev,
>      uint32_t virt_val = pci_default_read_config(pci_dev, address, len);
>      uint32_t real_val, emulate_mask, full_emulation_mask;
>  
> +    if (assigned_dev->cap.available & ASSIGNED_DEVICE_CAP_MSI) {
> +        uint32_t msi_start = pci_dev->msi_cap;
> +        uint32_t msi_end = msi_start + PCI_MSI_DATA_64 + 3;
> +
> +	if (address >= msi_start && (address + len) < msi_end)

ranges_overlap() is meant for this.  We only expose a 32bit MSI cap, so
msi_end is wrong.

> +            return virt_val;
> +    }
> +
>      emulate_mask = 0;
>      memcpy(&emulate_mask, assigned_dev->emulate_config_read + address, len);
>      emulate_mask = le32_to_cpu(emulate_mask);
> @@ -1130,6 +1182,17 @@ static uint32_t assigned_dev_pci_read_config(PCIDevice *pci_dev,
>      }
>  }
>  
> +static void handle_cfg_write_msi(PCIDevice *pci_dev, AssignedDevice *adev)
> +{
> +    if (!kvm_enabled() || !kvm_irqchip_in_kernel())
> +	return;

Unnecessary, device assignment doesn't work otherwise.

> +
> +    if (adev->entry && (adev->irq_requested_type & KVM_DEV_IRQ_GUEST_MSI))

Should just be able to test irq_requested_type.

> +        assigned_dev_update_msi_route(pci_dev);
> +    else
> +        assigned_dev_update_msi(pci_dev);
> +}
> +
>  static void assigned_dev_pci_write_config(PCIDevice *pci_dev, uint32_t address,
>                                            uint32_t val, int len)
>  {
> @@ -1155,9 +1218,13 @@ static void assigned_dev_pci_write_config(PCIDevice *pci_dev, uint32_t address,
>          }
>      }
>      if (assigned_dev->cap.available & ASSIGNED_DEVICE_CAP_MSI) {
> -        if (range_covers_byte(address, len,
> -                              pci_dev->msi_cap + PCI_MSI_FLAGS)) {
> -            assigned_dev_update_msi(pci_dev);
> +        uint32_t msi_start = pci_dev->msi_cap;
> +        uint32_t msi_end = msi_start + PCI_MSI_DATA_64 + 3;
> +
> +        if (address >= msi_start && (address + len) < msi_end) {

Use ranges_overlap() please, msi_end is wrong.

> +            if (address == msi_start + PCI_MSI_DATA_32)
> +                handle_cfg_write_msi(pci_dev, assigned_dev);

Why didn't we just use range_covers_byte(address, len, pci_dev->msi_cap
+ PCI_MSI_DATA_32) to start with?  But how does this handle the enable
bit?

> +            return;
>          }
>      }
>      if (assigned_dev->cap.available & ASSIGNED_DEVICE_CAP_MSIX) {

Thanks,

Alex


^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 2/2] Device assignment: Fix MSI IRQ affinity setting
       [not found]     ` <CAEMbtc+ycsC6u=CZ_Yg6C=WV=VqjA2uEDM5KWPM_7n3sZh_9Pw@mail.gmail.com>
@ 2012-05-24 19:27       ` Richard Weinberger
  0 siblings, 0 replies; 16+ messages in thread
From: Richard Weinberger @ 2012-05-24 19:27 UTC (permalink / raw)
  To: Alex Williamson; +Cc: avi, tglx, mtosatti, kvm

On Thu, 24 May 2012 13:00:51 -0600, Alex Williamson
<alex.williamson@redhat.com> wrote:
>> How does this work?  old is now new, so kvm_update_routing_entry() is
>> never going to match to the existing entry if address_lo or data
>> actually change.
> 
> Apologies, I read memcpy above

No problem. :)
I'll address your comments and send v2 tomorrow.

Thanks,
//richard


^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 2/2] Device assignment: Fix MSI IRQ affinity setting
  2012-05-24 17:02 ` [PATCH 2/2] Device assignment: Fix MSI IRQ affinity setting Richard Weinberger
  2012-05-24 18:20   ` Alex Williamson
@ 2012-05-24 20:47   ` Jan Kiszka
  1 sibling, 0 replies; 16+ messages in thread
From: Jan Kiszka @ 2012-05-24 20:47 UTC (permalink / raw)
  To: Richard Weinberger; +Cc: kvm, avi, mtosatti, tglx

On 2012-05-24 14:02, Richard Weinberger wrote:
> MSI interrupt affinity setting on the guest ended always up on vcpu0,
> no matter what.
> IOW writes to /proc/irq/<IRQ>/smp_affinity are irgnored.
> This patch fixes the MSI IRQ routing and avoids the utter madness of
> tearing down and setting up the interrupt completely when this changes.

The device assignment code will soon be significantly refactored in this
regard (MSI/MSI-X handling will use generic QEMU services instead of
open-coding their own bugs). Also for this reason, it would be very good
to explain in the commit log what was broken or missing so that
affinities were not respected.

Thanks,
Jan

-- 
Siemens AG, Corporate Technology, CT T DE IT 1
Corporate Competence Center Embedded Linux

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 2/2] Device assignment: Fix MSI IRQ affinity setting
  2012-05-24 18:20   ` Alex Williamson
       [not found]     ` <CAEMbtc+ycsC6u=CZ_Yg6C=WV=VqjA2uEDM5KWPM_7n3sZh_9Pw@mail.gmail.com>
@ 2012-05-24 21:39     ` Thomas Gleixner
  2012-05-24 21:53       ` Jan Kiszka
  2012-05-24 22:05       ` Alex Williamson
  1 sibling, 2 replies; 16+ messages in thread
From: Thomas Gleixner @ 2012-05-24 21:39 UTC (permalink / raw)
  To: Alex Williamson
  Cc: Richard Weinberger, kvm, avi, Marcelo Tosatti, Bjorn Helgaas

On Thu, 24 May 2012, Alex Williamson wrote:
> On Thu, 2012-05-24 at 18:02 +0100, Richard Weinberger wrote:
> > +            if (address == msi_start + PCI_MSI_DATA_32)
> > +                handle_cfg_write_msi(pci_dev, assigned_dev);
> 
> Why didn't we just use range_covers_byte(address, len, pci_dev->msi_cap
> + PCI_MSI_DATA_32) to start with?  But how does this handle the enable
> bit?

The problem with the current implementation is that it only changes
the routing if the msi entry goes from masked to unmasked state.

Linux does not mask the entries on affinity changes and never did,
neither for MSI nor for MSI-X.

I know it's probably not according to the spec, but we can't fix that
retroactively.

Thanks,

	tglx

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 2/2] Device assignment: Fix MSI IRQ affinity setting
  2012-05-24 21:39     ` Thomas Gleixner
@ 2012-05-24 21:53       ` Jan Kiszka
  2012-05-24 22:11         ` Alex Williamson
  2012-05-24 22:17         ` Michael S. Tsirkin
  2012-05-24 22:05       ` Alex Williamson
  1 sibling, 2 replies; 16+ messages in thread
From: Jan Kiszka @ 2012-05-24 21:53 UTC (permalink / raw)
  To: Thomas Gleixner
  Cc: Alex Williamson, Richard Weinberger, kvm, avi, Marcelo Tosatti,
	Bjorn Helgaas, Michael S. Tsirkin

On 2012-05-24 18:39, Thomas Gleixner wrote:
> On Thu, 24 May 2012, Alex Williamson wrote:
>> On Thu, 2012-05-24 at 18:02 +0100, Richard Weinberger wrote:
>>> +            if (address == msi_start + PCI_MSI_DATA_32)
>>> +                handle_cfg_write_msi(pci_dev, assigned_dev);
>>
>> Why didn't we just use range_covers_byte(address, len, pci_dev->msi_cap
>> + PCI_MSI_DATA_32) to start with?  But how does this handle the enable
>> bit?
> 
> The problem with the current implementation is that it only changes
> the routing if the msi entry goes from masked to unmasked state.
> 
> Linux does not mask the entries on affinity changes and never did,
> neither for MSI nor for MSI-X.
> 
> I know it's probably not according to the spec, but we can't fix that
> retroactively.

For MSI, this is allowed. For MSI-X, this would clearly be a Linux bug,
waiting for hardware to dislike this spec violation.

However, if this is the current behavior of such a prominent guest, I
guess we have to stop optimizing the QEMU MSI-X code that it only
updates routings on mask changes. Possibly other OSes get this wrong too...

Thanks, for the clarification. Should go into the changelog.

Jan

-- 
Siemens AG, Corporate Technology, CT T DE IT 1
Corporate Competence Center Embedded Linux

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 2/2] Device assignment: Fix MSI IRQ affinity setting
  2012-05-24 21:39     ` Thomas Gleixner
  2012-05-24 21:53       ` Jan Kiszka
@ 2012-05-24 22:05       ` Alex Williamson
  1 sibling, 0 replies; 16+ messages in thread
From: Alex Williamson @ 2012-05-24 22:05 UTC (permalink / raw)
  To: Thomas Gleixner
  Cc: Richard Weinberger, kvm, avi, Marcelo Tosatti, Bjorn Helgaas

On Thu, 2012-05-24 at 23:39 +0200, Thomas Gleixner wrote:
> On Thu, 24 May 2012, Alex Williamson wrote:
> > On Thu, 2012-05-24 at 18:02 +0100, Richard Weinberger wrote:
> > > +            if (address == msi_start + PCI_MSI_DATA_32)
> > > +                handle_cfg_write_msi(pci_dev, assigned_dev);
> > 
> > Why didn't we just use range_covers_byte(address, len, pci_dev->msi_cap
> > + PCI_MSI_DATA_32) to start with?  But how does this handle the enable
> > bit?
> 
> The problem with the current implementation is that it only changes
> the routing if the msi entry goes from masked to unmasked state.

We don't expose a maskable MSI capability to the guest, so I think you
mean enable/disable.

> Linux does not mask the entries on affinity changes and never did,
> neither for MSI nor for MSI-X.
> 
> I know it's probably not according to the spec, but we can't fix that
> retroactively.

We need to do both then, enable MSI based on the enable bit and update
routing based on address updates.  It seems like this code is counting
on data being written after the enable bit is set, which is not
guaranteed to happen.  Thanks,

Alex


^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 2/2] Device assignment: Fix MSI IRQ affinity setting
  2012-05-24 21:53       ` Jan Kiszka
@ 2012-05-24 22:11         ` Alex Williamson
  2012-05-24 23:01           ` Thomas Gleixner
  2012-05-24 22:17         ` Michael S. Tsirkin
  1 sibling, 1 reply; 16+ messages in thread
From: Alex Williamson @ 2012-05-24 22:11 UTC (permalink / raw)
  To: Jan Kiszka
  Cc: Thomas Gleixner, Richard Weinberger, kvm, avi, Marcelo Tosatti,
	Bjorn Helgaas, Michael S. Tsirkin

On Thu, 2012-05-24 at 18:53 -0300, Jan Kiszka wrote:
> On 2012-05-24 18:39, Thomas Gleixner wrote:
> > On Thu, 24 May 2012, Alex Williamson wrote:
> >> On Thu, 2012-05-24 at 18:02 +0100, Richard Weinberger wrote:
> >>> +            if (address == msi_start + PCI_MSI_DATA_32)
> >>> +                handle_cfg_write_msi(pci_dev, assigned_dev);
> >>
> >> Why didn't we just use range_covers_byte(address, len, pci_dev->msi_cap
> >> + PCI_MSI_DATA_32) to start with?  But how does this handle the enable
> >> bit?
> > 
> > The problem with the current implementation is that it only changes
> > the routing if the msi entry goes from masked to unmasked state.
> > 
> > Linux does not mask the entries on affinity changes and never did,
> > neither for MSI nor for MSI-X.
> > 
> > I know it's probably not according to the spec, but we can't fix that
> > retroactively.
> 
> For MSI, this is allowed. For MSI-X, this would clearly be a Linux bug,
> waiting for hardware to dislike this spec violation.
> 
> However, if this is the current behavior of such a prominent guest, I
> guess we have to stop optimizing the QEMU MSI-X code that it only
> updates routings on mask changes. Possibly other OSes get this wrong too...
> 
> Thanks, for the clarification. Should go into the changelog.

Hmm, if Linux didn't mask MSIX before updating vectors it'd not only be
a spec violation, but my testing of the recent changes to fix MSIX
vector updates for exactly this would have failed...

        } else if (msix_masked(&orig) && !msix_masked(entry)) {
            ... update vector...

So I'm not entirely sure I believe that.  Thanks,

Alex



^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 2/2] Device assignment: Fix MSI IRQ affinity setting
  2012-05-24 21:53       ` Jan Kiszka
  2012-05-24 22:11         ` Alex Williamson
@ 2012-05-24 22:17         ` Michael S. Tsirkin
  2012-05-24 23:06           ` Thomas Gleixner
  1 sibling, 1 reply; 16+ messages in thread
From: Michael S. Tsirkin @ 2012-05-24 22:17 UTC (permalink / raw)
  To: Jan Kiszka
  Cc: Thomas Gleixner, Alex Williamson, Richard Weinberger, kvm, avi,
	Marcelo Tosatti, Bjorn Helgaas

On Thu, May 24, 2012 at 06:53:15PM -0300, Jan Kiszka wrote:
> On 2012-05-24 18:39, Thomas Gleixner wrote:
> > On Thu, 24 May 2012, Alex Williamson wrote:
> >> On Thu, 2012-05-24 at 18:02 +0100, Richard Weinberger wrote:
> >>> +            if (address == msi_start + PCI_MSI_DATA_32)
> >>> +                handle_cfg_write_msi(pci_dev, assigned_dev);
> >>
> >> Why didn't we just use range_covers_byte(address, len, pci_dev->msi_cap
> >> + PCI_MSI_DATA_32) to start with?  But how does this handle the enable
> >> bit?
> > 
> > The problem with the current implementation is that it only changes
> > the routing if the msi entry goes from masked to unmasked state.
> > 
> > Linux does not mask the entries on affinity changes and never did,
> > neither for MSI nor for MSI-X.
> > 
> > I know it's probably not according to the spec, but we can't fix that
> > retroactively.
> 
> For MSI, this is allowed. For MSI-X, this would clearly be a Linux bug,
> waiting for hardware to dislike this spec violation.
> 
> However, if this is the current behavior of such a prominent guest, I
> guess we have to stop optimizing the QEMU MSI-X code that it only
> updates routings on mask changes. Possibly other OSes get this wrong too...

Very strange, a clear spec violation. I'll have to dig in the source to
verify this.


> Thanks, for the clarification. Should go into the changelog.
> 
> Jan
> 
> -- 
> Siemens AG, Corporate Technology, CT T DE IT 1
> Corporate Competence Center Embedded Linux

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 2/2] Device assignment: Fix MSI IRQ affinity setting
  2012-05-24 22:11         ` Alex Williamson
@ 2012-05-24 23:01           ` Thomas Gleixner
  2012-05-24 23:23             ` Alex Williamson
  0 siblings, 1 reply; 16+ messages in thread
From: Thomas Gleixner @ 2012-05-24 23:01 UTC (permalink / raw)
  To: Alex Williamson
  Cc: Jan Kiszka, Richard Weinberger, kvm, avi, Marcelo Tosatti,
	Bjorn Helgaas, Michael S. Tsirkin

On Thu, 24 May 2012, Alex Williamson wrote:
> On Thu, 2012-05-24 at 18:53 -0300, Jan Kiszka wrote:
> > On 2012-05-24 18:39, Thomas Gleixner wrote:
> > > On Thu, 24 May 2012, Alex Williamson wrote:
> > >> On Thu, 2012-05-24 at 18:02 +0100, Richard Weinberger wrote:
> > >>> +            if (address == msi_start + PCI_MSI_DATA_32)
> > >>> +                handle_cfg_write_msi(pci_dev, assigned_dev);
> > >>
> > >> Why didn't we just use range_covers_byte(address, len, pci_dev->msi_cap
> > >> + PCI_MSI_DATA_32) to start with?  But how does this handle the enable
> > >> bit?
> > > 
> > > The problem with the current implementation is that it only changes
> > > the routing if the msi entry goes from masked to unmasked state.
> > > 
> > > Linux does not mask the entries on affinity changes and never did,
> > > neither for MSI nor for MSI-X.
> > > 
> > > I know it's probably not according to the spec, but we can't fix that
> > > retroactively.
> > 
> > For MSI, this is allowed. For MSI-X, this would clearly be a Linux bug,
> > waiting for hardware to dislike this spec violation.
> > 
> > However, if this is the current behavior of such a prominent guest, I
> > guess we have to stop optimizing the QEMU MSI-X code that it only
> > updates routings on mask changes. Possibly other OSes get this wrong too...
> > 
> > Thanks, for the clarification. Should go into the changelog.
> 
> Hmm, if Linux didn't mask MSIX before updating vectors it'd not only be
> a spec violation, but my testing of the recent changes to fix MSIX
> vector updates for exactly this would have failed...
> 
>         } else if (msix_masked(&orig) && !msix_masked(entry)) {
>             ... update vector...
> 
> So I'm not entirely sure I believe that.  Thanks,

What happens is:

A write to /proc/irq/$N/smp_affinity calls into irq_set_affinity()
which does:

	if (irq_can_move_pcntxt(data)) {
		ret = chip->irq_set_affinity(data, mask, false);
	} else {
		irqd_set_move_pending(data);
		irq_copy_pending(desc, mask);
	}

MSI and MSI-X fall into the !irq_can_move_pcntxt() code path unless
the irq is remapped, which is not the case in a guest. That means that
we merily copy the new mask and set the move pending bit. 

MSI/MSI-X use the edge handler so on the next incoming interrupt, we
do

  irq_desc->chip->irq_ack()

which ends up in ack_apic_edge() which does:

static void ack_apic_edge(struct irq_data *data)
{
	irq_complete_move(data->chip_data);
	irq_move_irq(data);
	ack_APIC_irq();
}

irq_move_irq() is the interesting function. And that does

  irq_desc->chip->irq_mask()

before calling the irq_set_affinity() function which actually changes
the masks.
	      
->irq_mask() ends up in mask_msi_irq(). 

Now that calls msi_set_mask_bit() and for MSI-X that actually masks
the irq. So ignore my MSI-X comment.

But for MSI this ends up in msi_mask_irq() which does:

        if (!desc->msi_attrib.maskbit)
	   return 0;

So in case desc->msi_attrib.maskbit is 0 we do not write anything out
and then the masked/unmasked logic in qemu fails.

Sorry, that I did not decode that down to this level before, but I was
in a hurry and assumed correctly that qemu is doing something
wrong. Not being familiar with that code did not help either :)

So the proper fix is that qemu tells the guest that mask bit is
supported and catches the mask bit toggling before writing it out to
the hardware for those devices which do not support it.

We'll have another look.

Thanks,

	tglx

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 2/2] Device assignment: Fix MSI IRQ affinity setting
  2012-05-24 22:17         ` Michael S. Tsirkin
@ 2012-05-24 23:06           ` Thomas Gleixner
  2012-05-24 23:19             ` Thomas Gleixner
  0 siblings, 1 reply; 16+ messages in thread
From: Thomas Gleixner @ 2012-05-24 23:06 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: Jan Kiszka, Alex Williamson, Richard Weinberger, kvm, avi,
	Marcelo Tosatti, Bjorn Helgaas

On Fri, 25 May 2012, Michael S. Tsirkin wrote:
> On Thu, May 24, 2012 at 06:53:15PM -0300, Jan Kiszka wrote:
> > On 2012-05-24 18:39, Thomas Gleixner wrote:
> > > On Thu, 24 May 2012, Alex Williamson wrote:
> > >> On Thu, 2012-05-24 at 18:02 +0100, Richard Weinberger wrote:
> > >>> +            if (address == msi_start + PCI_MSI_DATA_32)
> > >>> +                handle_cfg_write_msi(pci_dev, assigned_dev);
> > >>
> > >> Why didn't we just use range_covers_byte(address, len, pci_dev->msi_cap
> > >> + PCI_MSI_DATA_32) to start with?  But how does this handle the enable
> > >> bit?
> > > 
> > > The problem with the current implementation is that it only changes
> > > the routing if the msi entry goes from masked to unmasked state.
> > > 
> > > Linux does not mask the entries on affinity changes and never did,
> > > neither for MSI nor for MSI-X.
> > > 
> > > I know it's probably not according to the spec, but we can't fix that
> > > retroactively.
> > 
> > For MSI, this is allowed. For MSI-X, this would clearly be a Linux bug,
> > waiting for hardware to dislike this spec violation.
> > 
> > However, if this is the current behavior of such a prominent guest, I
> > guess we have to stop optimizing the QEMU MSI-X code that it only
> > updates routings on mask changes. Possibly other OSes get this wrong too...
> 
> Very strange, a clear spec violation. I'll have to dig in the source to
> verify this.

Stop digging. MSI-X is correct.

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 2/2] Device assignment: Fix MSI IRQ affinity setting
  2012-05-24 23:06           ` Thomas Gleixner
@ 2012-05-24 23:19             ` Thomas Gleixner
  0 siblings, 0 replies; 16+ messages in thread
From: Thomas Gleixner @ 2012-05-24 23:19 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: Jan Kiszka, Alex Williamson, Richard Weinberger, kvm, avi,
	Marcelo Tosatti, Bjorn Helgaas

On Fri, 25 May 2012, Thomas Gleixner wrote:

> On Fri, 25 May 2012, Michael S. Tsirkin wrote:
> > On Thu, May 24, 2012 at 06:53:15PM -0300, Jan Kiszka wrote:
> > > On 2012-05-24 18:39, Thomas Gleixner wrote:
> > > > On Thu, 24 May 2012, Alex Williamson wrote:
> > > >> On Thu, 2012-05-24 at 18:02 +0100, Richard Weinberger wrote:
> > > >>> +            if (address == msi_start + PCI_MSI_DATA_32)
> > > >>> +                handle_cfg_write_msi(pci_dev, assigned_dev);
> > > >>
> > > >> Why didn't we just use range_covers_byte(address, len, pci_dev->msi_cap
> > > >> + PCI_MSI_DATA_32) to start with?  But how does this handle the enable
> > > >> bit?
> > > > 
> > > > The problem with the current implementation is that it only changes
> > > > the routing if the msi entry goes from masked to unmasked state.
> > > > 
> > > > Linux does not mask the entries on affinity changes and never did,
> > > > neither for MSI nor for MSI-X.
> > > > 
> > > > I know it's probably not according to the spec, but we can't fix that
> > > > retroactively.
> > > 
> > > For MSI, this is allowed. For MSI-X, this would clearly be a Linux bug,
> > > waiting for hardware to dislike this spec violation.
> > > 
> > > However, if this is the current behavior of such a prominent guest, I
> > > guess we have to stop optimizing the QEMU MSI-X code that it only
> > > updates routings on mask changes. Possibly other OSes get this wrong too...
> > 
> > Very strange, a clear spec violation. I'll have to dig in the source to
> > verify this.
> 
> Stop digging. MSI-X is correct.

This was based off an older version of qemu-kvm, where the routing for
MSI-X was broken for other reasons. But that seems to be fixed now.

I use the age excuse :)

Thanks,

	tglx


^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 2/2] Device assignment: Fix MSI IRQ affinity setting
  2012-05-24 23:01           ` Thomas Gleixner
@ 2012-05-24 23:23             ` Alex Williamson
  2012-05-24 23:56               ` Thomas Gleixner
  0 siblings, 1 reply; 16+ messages in thread
From: Alex Williamson @ 2012-05-24 23:23 UTC (permalink / raw)
  To: Thomas Gleixner
  Cc: Jan Kiszka, Richard Weinberger, kvm, avi, Marcelo Tosatti,
	Bjorn Helgaas, Michael S. Tsirkin

On Fri, 2012-05-25 at 01:01 +0200, Thomas Gleixner wrote:
> So the proper fix is that qemu tells the guest that mask bit is
> supported and catches the mask bit toggling before writing it out to
> the hardware for those devices which do not support it.

We can't necessarily do that, we have to work with the config space
we're give.  Using the smallest possible MSI capability always works.
Adding mask bits may not fit in with the existing capabilities of the
physical device.  Thanks,

Alex


^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 2/2] Device assignment: Fix MSI IRQ affinity setting
  2012-05-24 23:23             ` Alex Williamson
@ 2012-05-24 23:56               ` Thomas Gleixner
  2012-05-25  2:37                 ` Jan Kiszka
  0 siblings, 1 reply; 16+ messages in thread
From: Thomas Gleixner @ 2012-05-24 23:56 UTC (permalink / raw)
  To: Alex Williamson
  Cc: Jan Kiszka, Richard Weinberger, kvm, avi, Marcelo Tosatti,
	Bjorn Helgaas, Michael S. Tsirkin

On Thu, 24 May 2012, Alex Williamson wrote:

> On Fri, 2012-05-25 at 01:01 +0200, Thomas Gleixner wrote:
> > So the proper fix is that qemu tells the guest that mask bit is
> > supported and catches the mask bit toggling before writing it out to
> > the hardware for those devices which do not support it.
> 
> We can't necessarily do that, we have to work with the config space
> we're give.  Using the smallest possible MSI capability always works.
> Adding mask bits may not fit in with the existing capabilities of the
> physical device.  Thanks,

I see what you mean. A random device driver of a random guest OS might
rely on that information. Unlikely, but ....

So we need some logic to circumvent the masked/unmasked logic in case
that property is not set, right ?

Thanks,

	tglx

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 2/2] Device assignment: Fix MSI IRQ affinity setting
  2012-05-24 23:56               ` Thomas Gleixner
@ 2012-05-25  2:37                 ` Jan Kiszka
  0 siblings, 0 replies; 16+ messages in thread
From: Jan Kiszka @ 2012-05-25  2:37 UTC (permalink / raw)
  To: Thomas Gleixner
  Cc: Alex Williamson, Richard Weinberger, kvm, avi, Marcelo Tosatti,
	Bjorn Helgaas, Michael S. Tsirkin

[-- Attachment #1: Type: text/plain, Size: 1475 bytes --]

On 2012-05-24 20:56, Thomas Gleixner wrote:
> On Thu, 24 May 2012, Alex Williamson wrote:
> 
>> On Fri, 2012-05-25 at 01:01 +0200, Thomas Gleixner wrote:
>>> So the proper fix is that qemu tells the guest that mask bit is
>>> supported and catches the mask bit toggling before writing it out to
>>> the hardware for those devices which do not support it.
>>
>> We can't necessarily do that, we have to work with the config space
>> we're give.  Using the smallest possible MSI capability always works.
>> Adding mask bits may not fit in with the existing capabilities of the
>> physical device.  Thanks,
> 
> I see what you mean. A random device driver of a random guest OS might
> rely on that information. Unlikely, but ....
> 
> So we need some logic to circumvent the masked/unmasked logic in case
> that property is not set, right ?

For MSI emulation in QEMU (including device assignment) it is quite
simple: don't assume that the guest will always mask or even disable
before fiddling with some MSI vector configuration. That is not required
by the spec, so we can't rely on it. The patches I have in a
semi-finished state will do precisely this. But there is still some use
for a dev-assign fix based on the current code for qemu-kvm-1.1.

BTW, along with the switch of device assignment to generic MSI support,
we should also gain support for MSI vector masking - provided the
underlying device comes with that feature as well.

Jan


[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 262 bytes --]

^ permalink raw reply	[flat|nested] 16+ messages in thread

end of thread, other threads:[~2012-05-25  2:37 UTC | newest]

Thread overview: 16+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2012-05-24 17:02 [PATCH 1/2] Remove kvm_commit_irq_routes from error messages Richard Weinberger
2012-05-24 17:02 ` [PATCH 2/2] Device assignment: Fix MSI IRQ affinity setting Richard Weinberger
2012-05-24 18:20   ` Alex Williamson
     [not found]     ` <CAEMbtc+ycsC6u=CZ_Yg6C=WV=VqjA2uEDM5KWPM_7n3sZh_9Pw@mail.gmail.com>
2012-05-24 19:27       ` Richard Weinberger
2012-05-24 21:39     ` Thomas Gleixner
2012-05-24 21:53       ` Jan Kiszka
2012-05-24 22:11         ` Alex Williamson
2012-05-24 23:01           ` Thomas Gleixner
2012-05-24 23:23             ` Alex Williamson
2012-05-24 23:56               ` Thomas Gleixner
2012-05-25  2:37                 ` Jan Kiszka
2012-05-24 22:17         ` Michael S. Tsirkin
2012-05-24 23:06           ` Thomas Gleixner
2012-05-24 23:19             ` Thomas Gleixner
2012-05-24 22:05       ` Alex Williamson
2012-05-24 20:47   ` Jan Kiszka

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox