public inbox for kvm@vger.kernel.org
 help / color / mirror / Atom feed
From: Thomas Gleixner <tglx@linutronix.de>
To: "H. Peter Anvin" <hpa@zytor.com>, Xin Li <xin3.li@intel.com>,
	linux-kernel@vger.kernel.org, x86@kernel.org,
	kvm@vger.kernel.org
Cc: mingo@redhat.com, bp@alien8.de, dave.hansen@linux.intel.com,
	peterz@infradead.org, andrew.cooper3@citrix.com,
	seanjc@google.com, pbonzini@redhat.com, ravi.v.shankar@intel.com,
	jiangshanlai@gmail.com, shan.kang@intel.com
Subject: Re: [PATCH v8 01/33] x86/traps: let common_interrupt() handle IRQ_MOVE_CLEANUP_VECTOR
Date: Tue, 06 Jun 2023 22:09:03 +0200	[thread overview]
Message-ID: <877csgl5eo.ffs@tglx> (raw)
In-Reply-To: <70ef07f1-e3b7-7c4e-01ac-11f159a87a6b@zytor.com>

On Mon, Jun 05 2023 at 10:09, H. Peter Anvin wrote:
> On 6/5/23 10:07, Thomas Gleixner wrote:
>> There is zero reason for this to be an IPI. This can be delegated to a
>> worker or whatever delayed mechanism.
>> 
> As we discussed offline, I agree that this is a better solution (and 
> should be a separate changeset before the FRED one.)

The untested below should do the trick. Wants to be split in several
patches, but you get the idea.

Thanks,

        tglx
---
Subject: x86/vector: Get rid of IRQ_MOVE_CLEANUP_VECTOR
From: Thomas Gleixner <tglx@linutronix.de>

No point to waste a vector for cleaning up the leftovers of a moved
interrupt. Aside of that this must be the lowest priority of all vectors
which makes FRED systems utilizing vectors 0x10-0x1f more complicated
than necessary.

Schedule a timer instead.

Not-Yet-Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/include/asm/hw_irq.h       |    4 -
 arch/x86/include/asm/idtentry.h     |    1 
 arch/x86/include/asm/irq_vectors.h  |    7 ---
 arch/x86/kernel/apic/vector.c       |   83 ++++++++++++++++++++++++++----------
 arch/x86/kernel/idt.c               |    1 
 arch/x86/platform/uv/uv_irq.c       |    2 
 drivers/iommu/amd/iommu.c           |    2 
 drivers/iommu/hyperv-iommu.c        |    4 -
 drivers/iommu/intel/irq_remapping.c |    2 
 9 files changed, 68 insertions(+), 38 deletions(-)

--- a/arch/x86/include/asm/hw_irq.h
+++ b/arch/x86/include/asm/hw_irq.h
@@ -97,10 +97,10 @@ extern struct irq_cfg *irqd_cfg(struct i
 extern void lock_vector_lock(void);
 extern void unlock_vector_lock(void);
 #ifdef CONFIG_SMP
-extern void send_cleanup_vector(struct irq_cfg *);
+extern void vector_schedule_cleanup(struct irq_cfg *);
 extern void irq_complete_move(struct irq_cfg *cfg);
 #else
-static inline void send_cleanup_vector(struct irq_cfg *c) { }
+static inline void vector_schedule_cleanup(struct irq_cfg *c) { }
 static inline void irq_complete_move(struct irq_cfg *c) { }
 #endif
 
--- a/arch/x86/include/asm/idtentry.h
+++ b/arch/x86/include/asm/idtentry.h
@@ -648,7 +648,6 @@ DECLARE_IDTENTRY_SYSVEC(X86_PLATFORM_IPI
 
 #ifdef CONFIG_SMP
 DECLARE_IDTENTRY(RESCHEDULE_VECTOR,			sysvec_reschedule_ipi);
-DECLARE_IDTENTRY_SYSVEC(IRQ_MOVE_CLEANUP_VECTOR,	sysvec_irq_move_cleanup);
 DECLARE_IDTENTRY_SYSVEC(REBOOT_VECTOR,			sysvec_reboot);
 DECLARE_IDTENTRY_SYSVEC(CALL_FUNCTION_SINGLE_VECTOR,	sysvec_call_function_single);
 DECLARE_IDTENTRY_SYSVEC(CALL_FUNCTION_VECTOR,		sysvec_call_function);
--- a/arch/x86/include/asm/irq_vectors.h
+++ b/arch/x86/include/asm/irq_vectors.h
@@ -35,13 +35,6 @@
  */
 #define FIRST_EXTERNAL_VECTOR		0x20
 
-/*
- * Reserve the lowest usable vector (and hence lowest priority)  0x20 for
- * triggering cleanup after irq migration. 0x21-0x2f will still be used
- * for device interrupts.
- */
-#define IRQ_MOVE_CLEANUP_VECTOR		FIRST_EXTERNAL_VECTOR
-
 #define IA32_SYSCALL_VECTOR		0x80
 
 /*
--- a/arch/x86/kernel/apic/vector.c
+++ b/arch/x86/kernel/apic/vector.c
@@ -44,7 +44,18 @@ static cpumask_var_t vector_searchmask;
 static struct irq_chip lapic_controller;
 static struct irq_matrix *vector_matrix;
 #ifdef CONFIG_SMP
-static DEFINE_PER_CPU(struct hlist_head, cleanup_list);
+
+static void vector_cleanup_callback(struct timer_list *tmr);
+
+struct vector_cleanup {
+	struct hlist_head	head;
+	struct timer_list	timer;
+};
+
+static DEFINE_PER_CPU(struct vector_cleanup, vector_cleanup) = {
+	.head	= HLIST_HEAD_INIT,
+	.timer	= __TIMER_INITIALIZER(vector_cleanup_callback, TIMER_PINNED),
+};
 #endif
 
 void lock_vector_lock(void)
@@ -843,8 +854,12 @@ void lapic_online(void)
 
 void lapic_offline(void)
 {
+	struct vector_cleanup *cl = this_cpu_ptr(&vector_cleanup);
+
 	lock_vector_lock();
 	irq_matrix_offline(vector_matrix);
+	WARN_ON_ONCE(try_to_del_timer_sync(&cl->timer) < 0);
+	WARN_ON_ONCE(!hlist_empty(&cl->head));
 	unlock_vector_lock();
 }
 
@@ -934,62 +949,86 @@ static void free_moved_vector(struct api
 	apicd->move_in_progress = 0;
 }
 
-DEFINE_IDTENTRY_SYSVEC(sysvec_irq_move_cleanup)
+static void vector_cleanup_callback(struct timer_list *tmr)
 {
-	struct hlist_head *clhead = this_cpu_ptr(&cleanup_list);
+	struct vector_cleanup *cl = container_of(tmr, typeof(*cl), timer);
 	struct apic_chip_data *apicd;
 	struct hlist_node *tmp;
+	bool rearm = false;
 
-	ack_APIC_irq();
 	/* Prevent vectors vanishing under us */
-	raw_spin_lock(&vector_lock);
+	raw_spin_lock_irq(&vector_lock);
 
-	hlist_for_each_entry_safe(apicd, tmp, clhead, clist) {
+	hlist_for_each_entry_safe(apicd, tmp, &cl->head, clist) {
 		unsigned int irr, vector = apicd->prev_vector;
 
 		/*
 		 * Paranoia: Check if the vector that needs to be cleaned
-		 * up is registered at the APICs IRR. If so, then this is
-		 * not the best time to clean it up. Clean it up in the
-		 * next attempt by sending another IRQ_MOVE_CLEANUP_VECTOR
-		 * to this CPU. IRQ_MOVE_CLEANUP_VECTOR is the lowest
-		 * priority external vector, so on return from this
-		 * interrupt the device interrupt will happen first.
+		 * up is registered at the APICs IRR. That's clearly a
+		 * hardware issue if the vector arrived on the old target
+		 * _after_ interrupts were disabled above. Keep @apicd
+		 * on the list and schedule the timer again to give the CPU
+		 * a chance to handle the pending interrupt.
 		 */
 		irr = apic_read(APIC_IRR + (vector / 32 * 0x10));
 		if (irr & (1U << (vector % 32))) {
-			apic->send_IPI_self(IRQ_MOVE_CLEANUP_VECTOR);
+			pr_warn_once("Moved interrupt pending in old target APIC %u\n", apicd->irq);
+			rearm = true;
 			continue;
 		}
 		free_moved_vector(apicd);
 	}
 
-	raw_spin_unlock(&vector_lock);
+	/*
+	 * Must happen under vector_lock to make the timer_pending() check
+	 * in __vector_schedule_cleanup() race free against the rearm here.
+	 */
+	if (rearm)
+		mod_timer(tmr, jiffies + 1);
+
+	raw_spin_unlock_irq(&vector_lock);
 }
 
-static void __send_cleanup_vector(struct apic_chip_data *apicd)
+static void __vector_schedule_cleanup(struct apic_chip_data *apicd)
 {
-	unsigned int cpu;
+	unsigned int cpu = apicd->prev_cpu;
 
 	raw_spin_lock(&vector_lock);
 	apicd->move_in_progress = 0;
-	cpu = apicd->prev_cpu;
 	if (cpu_online(cpu)) {
-		hlist_add_head(&apicd->clist, per_cpu_ptr(&cleanup_list, cpu));
-		apic->send_IPI(cpu, IRQ_MOVE_CLEANUP_VECTOR);
+		struct vector_cleanup *cl = per_cpu_ptr(&vector_cleanup, cpu);
+
+		/*
+		 * The lockless timer_pending() check is safe here. If it
+		 * returns true, then the callback will observe this new
+		 * apic data in the hlist as everything is serialized by
+		 * vector lock.
+		 *
+		 * If it returns false then the timer is either not armed
+		 * or the other CPU executes the callback, which again
+		 * would be blocked on vector lock. Rearming it in the
+		 * latter case makes it fire for nothing.
+		 *
+		 * This is also safe against the callback rearming the timer
+		 * because that's serialized via vector lock too.
+		 */
+		if (!timer_pending(&cl->timer)) {
+			cl->timer.expires = jiffies + 1;
+			add_timer_on(&cl->timer, cpu);
+		}
 	} else {
 		apicd->prev_vector = 0;
 	}
 	raw_spin_unlock(&vector_lock);
 }
 
-void send_cleanup_vector(struct irq_cfg *cfg)
+void vector_schedule_cleanup(struct irq_cfg *cfg)
 {
 	struct apic_chip_data *apicd;
 
 	apicd = container_of(cfg, struct apic_chip_data, hw_irq_cfg);
 	if (apicd->move_in_progress)
-		__send_cleanup_vector(apicd);
+		__vector_schedule_cleanup(apicd);
 }
 
 void irq_complete_move(struct irq_cfg *cfg)
@@ -1007,7 +1046,7 @@ void irq_complete_move(struct irq_cfg *c
 	 * on the same CPU.
 	 */
 	if (apicd->cpu == smp_processor_id())
-		__send_cleanup_vector(apicd);
+		__vector_schedule_cleanup(apicd);
 }
 
 /*
--- a/arch/x86/kernel/idt.c
+++ b/arch/x86/kernel/idt.c
@@ -131,7 +131,6 @@ static const __initconst struct idt_data
 	INTG(RESCHEDULE_VECTOR,			asm_sysvec_reschedule_ipi),
 	INTG(CALL_FUNCTION_VECTOR,		asm_sysvec_call_function),
 	INTG(CALL_FUNCTION_SINGLE_VECTOR,	asm_sysvec_call_function_single),
-	INTG(IRQ_MOVE_CLEANUP_VECTOR,		asm_sysvec_irq_move_cleanup),
 	INTG(REBOOT_VECTOR,			asm_sysvec_reboot),
 #endif
 
--- a/arch/x86/platform/uv/uv_irq.c
+++ b/arch/x86/platform/uv/uv_irq.c
@@ -58,7 +58,7 @@ uv_set_irq_affinity(struct irq_data *dat
 	ret = parent->chip->irq_set_affinity(parent, mask, force);
 	if (ret >= 0) {
 		uv_program_mmr(cfg, data->chip_data);
-		send_cleanup_vector(cfg);
+		vector_schedule_cleanup(cfg);
 	}
 
 	return ret;
--- a/drivers/iommu/amd/iommu.c
+++ b/drivers/iommu/amd/iommu.c
@@ -3639,7 +3639,7 @@ static int amd_ir_set_affinity(struct ir
 	 * at the new destination. So, time to cleanup the previous
 	 * vector allocation.
 	 */
-	send_cleanup_vector(cfg);
+	vector_schedule_cleanup(cfg);
 
 	return IRQ_SET_MASK_OK_DONE;
 }
--- a/drivers/iommu/hyperv-iommu.c
+++ b/drivers/iommu/hyperv-iommu.c
@@ -51,7 +51,7 @@ static int hyperv_ir_set_affinity(struct
 	if (ret < 0 || ret == IRQ_SET_MASK_OK_DONE)
 		return ret;
 
-	send_cleanup_vector(cfg);
+	vector_schedule_cleanup(cfg);
 
 	return 0;
 }
@@ -257,7 +257,7 @@ static int hyperv_root_ir_set_affinity(s
 	if (ret < 0 || ret == IRQ_SET_MASK_OK_DONE)
 		return ret;
 
-	send_cleanup_vector(cfg);
+	vector_schedule_cleanup(cfg);
 
 	return 0;
 }
--- a/drivers/iommu/intel/irq_remapping.c
+++ b/drivers/iommu/intel/irq_remapping.c
@@ -1180,7 +1180,7 @@ intel_ir_set_affinity(struct irq_data *d
 	 * at the new destination. So, time to cleanup the previous
 	 * vector allocation.
 	 */
-	send_cleanup_vector(cfg);
+	vector_schedule_cleanup(cfg);
 
 	return IRQ_SET_MASK_OK_DONE;
 }

  reply	other threads:[~2023-06-06 20:09 UTC|newest]

Thread overview: 96+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2023-04-10  8:14 [PATCH v8 00/33] x86: enable FRED for x86-64 Xin Li
2023-04-10  8:14 ` [PATCH v8 01/33] x86/traps: let common_interrupt() handle IRQ_MOVE_CLEANUP_VECTOR Xin Li
2023-05-07 11:59   ` Borislav Petkov
2023-06-03 19:19     ` Li, Xin3
2023-06-03 20:51   ` Thomas Gleixner
2023-06-05 17:07     ` Thomas Gleixner
2023-06-05 17:09       ` H. Peter Anvin
2023-06-06 20:09         ` Thomas Gleixner [this message]
2023-06-06 23:16           ` Li, Xin3
2023-06-19  8:00           ` Li, Xin3
2023-06-19 14:22             ` Thomas Gleixner
2023-06-19 18:47               ` Li, Xin3
2023-06-19 19:16                 ` H. Peter Anvin
2023-06-20  0:04                   ` Li, Xin3
2023-04-10  8:14 ` [PATCH v8 02/33] x86/fred: make unions for the cs and ss fields in struct pt_regs Xin Li
2023-06-03  9:48   ` Borislav Petkov
2023-06-05 12:07   ` Thomas Gleixner
2023-06-05 17:12     ` H. Peter Anvin
2023-06-05 17:29       ` Thomas Gleixner
2023-04-10  8:14 ` [PATCH v8 03/33] x86/traps: add a system interrupt table for system interrupt dispatch Xin Li
2023-06-05  8:34   ` Thomas Gleixner
2023-06-06  8:05     ` Li, Xin3
2023-06-05  8:38   ` Thomas Gleixner
2023-06-05  8:39   ` Thomas Gleixner
2023-04-10  8:14 ` [PATCH v8 04/33] x86/traps: add install_system_interrupt_handler() Xin Li
2023-06-05  8:57   ` Thomas Gleixner
2023-06-06  5:46     ` Li, Xin3
2023-04-10  8:14 ` [PATCH v8 05/33] x86/traps: add external_interrupt() to dispatch external interrupts Xin Li
2023-06-05 11:56   ` Thomas Gleixner
2023-06-05 17:52     ` Thomas Gleixner
2023-06-19 19:16     ` Li, Xin3
2023-06-19 21:13       ` Thomas Gleixner
2023-06-20  0:16         ` Li, Xin3
2023-04-10  8:14 ` [PATCH v8 06/33] x86/cpufeature: add the cpu feature bit for FRED Xin Li
2023-04-10  8:14 ` [PATCH v8 07/33] x86/opcode: add ERETU, ERETS instructions to x86-opcode-map Xin Li
2023-04-10  8:14 ` [PATCH v8 08/33] x86/objtool: teach objtool about ERETU and ERETS Xin Li
2023-04-10  8:14 ` [PATCH v8 09/33] x86/cpu: add X86_CR4_FRED macro Xin Li
2023-06-05 12:01   ` Thomas Gleixner
2023-06-05 17:06     ` H. Peter Anvin
2023-06-05 17:19     ` H. Peter Anvin
2023-04-10  8:14 ` [PATCH v8 10/33] x86/fred: add Kconfig option for FRED (CONFIG_X86_FRED) Xin Li
2023-04-10  8:14 ` [PATCH v8 11/33] x86/fred: if CONFIG_X86_FRED is disabled, disable FRED support Xin Li
2023-04-10  8:14 ` [PATCH v8 12/33] x86/cpu: add MSR numbers for FRED configuration Xin Li
2023-04-10  8:14 ` [PATCH v8 13/33] x86/fred: header file for event types Xin Li
2023-04-10  8:14 ` [PATCH v8 14/33] x86/fred: header file with FRED definitions Xin Li
2023-04-10  8:14 ` [PATCH v8 15/33] x86/fred: reserve space for the FRED stack frame Xin Li
2023-04-10  8:14 ` [PATCH v8 16/33] x86/fred: add a page fault entry stub for FRED Xin Li
2023-04-10  8:14 ` [PATCH v8 17/33] x86/fred: add a debug " Xin Li
2023-04-10  8:14 ` [PATCH v8 18/33] x86/fred: add a NMI " Xin Li
2023-04-10  8:14 ` [PATCH v8 19/33] x86/fred: add a machine check " Xin Li
2023-04-10  8:14 ` [PATCH v8 20/33] x86/fred: FRED entry/exit and dispatch code Xin Li
2023-06-05 13:21   ` Thomas Gleixner
2023-04-10  8:14 ` [PATCH v8 21/33] x86/fred: FRED initialization code Xin Li
2023-06-05 12:15   ` Thomas Gleixner
2023-06-05 13:41   ` Thomas Gleixner
2023-04-10  8:14 ` [PATCH v8 22/33] x86/fred: update MSR_IA32_FRED_RSP0 during task switch Xin Li
2023-04-10  8:14 ` [PATCH v8 23/33] x86/fred: let ret_from_fork() jmp to fred_exit_user when FRED is enabled Xin Li
2023-04-10  8:14 ` [PATCH v8 24/33] x86/fred: disallow the swapgs instruction " Xin Li
2023-06-05 13:47   ` Thomas Gleixner
2023-04-10  8:14 ` [PATCH v8 25/33] x86/fred: no ESPFIX needed " Xin Li
2023-04-10  8:14 ` [PATCH v8 26/33] x86/fred: allow single-step trap and NMI when starting a new thread Xin Li
2023-06-05 13:50   ` Thomas Gleixner
2023-06-05 13:52     ` Thomas Gleixner
2023-04-10  8:14 ` [PATCH v8 27/33] x86/fred: fixup fault on ERETU by jumping to fred_entrypoint_user Xin Li
2023-04-10  8:14 ` [PATCH v8 28/33] x86/ia32: do not modify the DPL bits for a null selector Xin Li
2023-04-10  8:14 ` [PATCH v8 29/33] x86/fred: allow FRED systems to use interrupt vectors 0x10-0x1f Xin Li
2023-06-05 14:06   ` Thomas Gleixner
2023-06-05 16:55     ` H. Peter Anvin
2023-04-10  8:14 ` [PATCH v8 30/33] x86/fred: allow dynamic stack frame size Xin Li
2023-06-05 14:11   ` Thomas Gleixner
2023-06-06  6:18     ` Li, Xin3
2023-06-06 13:27       ` Thomas Gleixner
2023-06-06 23:08         ` H. Peter Anvin
2023-04-10  8:14 ` [PATCH v8 31/33] x86/fred: BUG() when ERETU with %rsp not equal to that when the ring 3 event was just delivered Xin Li
2023-06-05 14:15   ` Thomas Gleixner
2023-06-05 16:42     ` H. Peter Anvin
2023-06-05 17:16       ` Thomas Gleixner
2023-04-10  8:14 ` [PATCH v8 32/33] x86/fred: disable FRED by default in its early stage Xin Li
2023-04-10  8:14 ` [PATCH v8 33/33] KVM: x86/vmx: refactor VMX_DO_EVENT_IRQOFF to generate FRED stack frames Xin Li
2023-04-10 21:50   ` Sean Christopherson
2023-04-11  5:06     ` Li, Xin3
2023-04-11 18:34       ` Sean Christopherson
2023-04-11 22:50         ` Li, Xin3
2023-04-12 18:26     ` Li, Xin3
2023-04-12 19:37       ` Sean Christopherson
2023-04-10 18:37 ` [PATCH v8 00/33] x86: enable FRED for x86-64 Dave Hansen
2023-04-10 19:14   ` Li, Xin3
2023-04-10 19:32     ` Borislav Petkov
2023-04-10 19:38       ` Dave Hansen
2023-04-10 20:52         ` Li, Xin3
2023-04-11  4:14       ` Li, Xin3
2023-04-10 18:49 ` Dave Hansen
2023-04-10 19:16   ` Li, Xin3
2023-06-05 17:11     ` Thomas Gleixner
2023-06-05 17:22 ` H. Peter Anvin
2023-06-05 17:32   ` Thomas Gleixner

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=877csgl5eo.ffs@tglx \
    --to=tglx@linutronix.de \
    --cc=andrew.cooper3@citrix.com \
    --cc=bp@alien8.de \
    --cc=dave.hansen@linux.intel.com \
    --cc=hpa@zytor.com \
    --cc=jiangshanlai@gmail.com \
    --cc=kvm@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=mingo@redhat.com \
    --cc=pbonzini@redhat.com \
    --cc=peterz@infradead.org \
    --cc=ravi.v.shankar@intel.com \
    --cc=seanjc@google.com \
    --cc=shan.kang@intel.com \
    --cc=x86@kernel.org \
    --cc=xin3.li@intel.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox