Linux virtualization list
 help / color / mirror / Atom feed
* [PATCH v7 03/11] kernel/locking: Drop the overload of {mutex, rwsem}_spin_on_owner
From: Pan Xinhui @ 2016-11-02  9:08 UTC (permalink / raw)
  To: linux-kernel, linuxppc-dev, virtualization, linux-s390,
	xen-devel-request, kvm, xen-devel, x86
  Cc: kernellwp, jgross, dave, David.Laight, rkrcmar, peterz, benh,
	konrad.wilk, will.deacon, Pan Xinhui, mingo, paulus, mpe,
	pbonzini, paulmck, boqun.feng
In-Reply-To: <1478077718-37424-1-git-send-email-xinhui.pan@linux.vnet.ibm.com>

An over-committed guest with more vCPUs than pCPUs has a heavy overload
in the two spin_on_owner. This blames on the lock holder preemption
issue.

Kernel has an interface bool vcpu_is_preempted(int cpu) to see if a vCPU
is currently running or not. So break the spin loops on true condition.

test-case:
perf record -a perf bench sched messaging -g 400 -p && perf report

before patch:
20.68%  sched-messaging  [kernel.vmlinux]  [k] mutex_spin_on_owner
 8.45%  sched-messaging  [kernel.vmlinux]  [k] mutex_unlock
 4.12%  sched-messaging  [kernel.vmlinux]  [k] system_call
 3.01%  sched-messaging  [kernel.vmlinux]  [k] system_call_common
 2.83%  sched-messaging  [kernel.vmlinux]  [k] copypage_power7
 2.64%  sched-messaging  [kernel.vmlinux]  [k] rwsem_spin_on_owner
 2.00%  sched-messaging  [kernel.vmlinux]  [k] osq_lock

after patch:
 9.99%  sched-messaging  [kernel.vmlinux]  [k] mutex_unlock
 5.28%  sched-messaging  [unknown]         [H] 0xc0000000000768e0
 4.27%  sched-messaging  [kernel.vmlinux]  [k] __copy_tofrom_user_power7
 3.77%  sched-messaging  [kernel.vmlinux]  [k] copypage_power7
 3.24%  sched-messaging  [kernel.vmlinux]  [k] _raw_write_lock_irq
 3.02%  sched-messaging  [kernel.vmlinux]  [k] system_call
 2.69%  sched-messaging  [kernel.vmlinux]  [k] wait_consider_task

Signed-off-by: Pan Xinhui <xinhui.pan@linux.vnet.ibm.com>
Acked-by: Christian Borntraeger <borntraeger@de.ibm.com>
Acked-by: Paolo Bonzini <pbonzini@redhat.com>
Tested-by: Juergen Gross <jgross@suse.com>
---
 kernel/locking/mutex.c      | 13 +++++++++++--
 kernel/locking/rwsem-xadd.c | 14 +++++++++++---
 2 files changed, 22 insertions(+), 5 deletions(-)

diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c
index a70b90d..24face6 100644
--- a/kernel/locking/mutex.c
+++ b/kernel/locking/mutex.c
@@ -236,7 +236,11 @@ bool mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner)
 		 */
 		barrier();
 
-		if (!owner->on_cpu || need_resched()) {
+		/*
+		 * Use vcpu_is_preempted to detect lock holder preemption issue.
+		 */
+		if (!owner->on_cpu || need_resched() ||
+				vcpu_is_preempted(task_cpu(owner))) {
 			ret = false;
 			break;
 		}
@@ -261,8 +265,13 @@ static inline int mutex_can_spin_on_owner(struct mutex *lock)
 
 	rcu_read_lock();
 	owner = READ_ONCE(lock->owner);
+
+	/*
+	 * As lock holder preemption issue, we both skip spinning if task is not
+	 * on cpu or its cpu is preempted
+	 */
 	if (owner)
-		retval = owner->on_cpu;
+		retval = owner->on_cpu && !vcpu_is_preempted(task_cpu(owner));
 	rcu_read_unlock();
 	/*
 	 * if lock->owner is not set, the mutex owner may have just acquired
diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c
index 2337b4b..b664ce1 100644
--- a/kernel/locking/rwsem-xadd.c
+++ b/kernel/locking/rwsem-xadd.c
@@ -336,7 +336,11 @@ static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem)
 		goto done;
 	}
 
-	ret = owner->on_cpu;
+	/*
+	 * As lock holder preemption issue, we both skip spinning if task is not
+	 * on cpu or its cpu is preempted
+	 */
+	ret = owner->on_cpu && !vcpu_is_preempted(task_cpu(owner));
 done:
 	rcu_read_unlock();
 	return ret;
@@ -362,8 +366,12 @@ static noinline bool rwsem_spin_on_owner(struct rw_semaphore *sem)
 		 */
 		barrier();
 
-		/* abort spinning when need_resched or owner is not running */
-		if (!owner->on_cpu || need_resched()) {
+		/*
+		 * abort spinning when need_resched or owner is not running or
+		 * owner's cpu is preempted.
+		 */
+		if (!owner->on_cpu || need_resched() ||
+				vcpu_is_preempted(task_cpu(owner))) {
 			rcu_read_unlock();
 			return false;
 		}
-- 
2.4.11

^ permalink raw reply related

* [PATCH v7 04/11] powerpc/spinlock: support vcpu preempted check
From: Pan Xinhui @ 2016-11-02  9:08 UTC (permalink / raw)
  To: linux-kernel, linuxppc-dev, virtualization, linux-s390,
	xen-devel-request, kvm, xen-devel, x86
  Cc: kernellwp, jgross, dave, David.Laight, rkrcmar, peterz, benh,
	konrad.wilk, will.deacon, Pan Xinhui, mingo, paulus, mpe,
	pbonzini, paulmck, boqun.feng
In-Reply-To: <1478077718-37424-1-git-send-email-xinhui.pan@linux.vnet.ibm.com>

This is to fix some lock holder preemption issues. Some other locks
implementation do a spin loop before acquiring the lock itself.
Currently kernel has an interface of bool vcpu_is_preempted(int cpu). It
takes the cpu as parameter and return true if the cpu is preempted. Then
kernel can break the spin loops upon the retval of vcpu_is_preempted.

As kernel has used this interface, So lets support it.

Only pSeries need support it. And the fact is PowerNV is built into same
kernel image with pSeries. So we need return false if we are runnig as
PowerNV. The another fact is that lppaca->yiled_count keeps zero on
PowerNV. So we can just skip the machine type check.

Suggested-by: Boqun Feng <boqun.feng@gmail.com>
Suggested-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Pan Xinhui <xinhui.pan@linux.vnet.ibm.com>
---
 arch/powerpc/include/asm/spinlock.h | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/arch/powerpc/include/asm/spinlock.h b/arch/powerpc/include/asm/spinlock.h
index fa37fe9..8c1b913 100644
--- a/arch/powerpc/include/asm/spinlock.h
+++ b/arch/powerpc/include/asm/spinlock.h
@@ -52,6 +52,14 @@
 #define SYNC_IO
 #endif
 
+#ifdef CONFIG_PPC_PSERIES
+#define vcpu_is_preempted vcpu_is_preempted
+static inline bool vcpu_is_preempted(int cpu)
+{
+	return !!(be32_to_cpu(lppaca_of(cpu).yield_count) & 1);
+}
+#endif
+
 static __always_inline int arch_spin_value_unlocked(arch_spinlock_t lock)
 {
 	return lock.slock == 0;
-- 
2.4.11

^ permalink raw reply related

* [PATCH v7 05/11] s390/spinlock: Provide vcpu_is_preempted
From: Pan Xinhui @ 2016-11-02  9:08 UTC (permalink / raw)
  To: linux-kernel, linuxppc-dev, virtualization, linux-s390,
	xen-devel-request, kvm, xen-devel, x86
  Cc: kernellwp, jgross, dave, David.Laight, rkrcmar, peterz, benh,
	konrad.wilk, will.deacon, mingo, paulus, mpe, pbonzini, paulmck,
	boqun.feng
In-Reply-To: <1478077718-37424-1-git-send-email-xinhui.pan@linux.vnet.ibm.com>

From: Christian Borntraeger <borntraeger@de.ibm.com>

this implements the s390 backend for commit
"kernel/sched: introduce vcpu preempted check interface"
by reworking the existing smp_vcpu_scheduled into
arch_vcpu_is_preempted. We can then also get rid of the
local cpu_is_preempted function by moving the
CIF_ENABLED_WAIT test into arch_vcpu_is_preempted.

Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
Acked-by: Heiko Carstens <heiko.carstens@de.ibm.com>
---
 arch/s390/include/asm/spinlock.h |  8 ++++++++
 arch/s390/kernel/smp.c           |  9 +++++++--
 arch/s390/lib/spinlock.c         | 25 ++++++++-----------------
 3 files changed, 23 insertions(+), 19 deletions(-)

diff --git a/arch/s390/include/asm/spinlock.h b/arch/s390/include/asm/spinlock.h
index 7e9e09f..7ecd890 100644
--- a/arch/s390/include/asm/spinlock.h
+++ b/arch/s390/include/asm/spinlock.h
@@ -23,6 +23,14 @@ _raw_compare_and_swap(unsigned int *lock, unsigned int old, unsigned int new)
 	return __sync_bool_compare_and_swap(lock, old, new);
 }
 
+#ifndef CONFIG_SMP
+static inline bool arch_vcpu_is_preempted(int cpu) { return false; }
+#else
+bool arch_vcpu_is_preempted(int cpu);
+#endif
+
+#define vcpu_is_preempted arch_vcpu_is_preempted
+
 /*
  * Simple spin lock operations.  There are two variants, one clears IRQ's
  * on the local processor, one does not.
diff --git a/arch/s390/kernel/smp.c b/arch/s390/kernel/smp.c
index 35531fe..b988ed1 100644
--- a/arch/s390/kernel/smp.c
+++ b/arch/s390/kernel/smp.c
@@ -368,10 +368,15 @@ int smp_find_processor_id(u16 address)
 	return -1;
 }
 
-int smp_vcpu_scheduled(int cpu)
+bool arch_vcpu_is_preempted(int cpu)
 {
-	return pcpu_running(pcpu_devices + cpu);
+	if (test_cpu_flag_of(CIF_ENABLED_WAIT, cpu))
+		return false;
+	if (pcpu_running(pcpu_devices + cpu))
+		return false;
+	return true;
 }
+EXPORT_SYMBOL(arch_vcpu_is_preempted);
 
 void smp_yield_cpu(int cpu)
 {
diff --git a/arch/s390/lib/spinlock.c b/arch/s390/lib/spinlock.c
index e5f50a7..e48a48e 100644
--- a/arch/s390/lib/spinlock.c
+++ b/arch/s390/lib/spinlock.c
@@ -37,15 +37,6 @@ static inline void _raw_compare_and_delay(unsigned int *lock, unsigned int old)
 	asm(".insn rsy,0xeb0000000022,%0,0,%1" : : "d" (old), "Q" (*lock));
 }
 
-static inline int cpu_is_preempted(int cpu)
-{
-	if (test_cpu_flag_of(CIF_ENABLED_WAIT, cpu))
-		return 0;
-	if (smp_vcpu_scheduled(cpu))
-		return 0;
-	return 1;
-}
-
 void arch_spin_lock_wait(arch_spinlock_t *lp)
 {
 	unsigned int cpu = SPINLOCK_LOCKVAL;
@@ -62,7 +53,7 @@ void arch_spin_lock_wait(arch_spinlock_t *lp)
 			continue;
 		}
 		/* First iteration: check if the lock owner is running. */
-		if (first_diag && cpu_is_preempted(~owner)) {
+		if (first_diag && arch_vcpu_is_preempted(~owner)) {
 			smp_yield_cpu(~owner);
 			first_diag = 0;
 			continue;
@@ -81,7 +72,7 @@ void arch_spin_lock_wait(arch_spinlock_t *lp)
 		 * yield the CPU unconditionally. For LPAR rely on the
 		 * sense running status.
 		 */
-		if (!MACHINE_IS_LPAR || cpu_is_preempted(~owner)) {
+		if (!MACHINE_IS_LPAR || arch_vcpu_is_preempted(~owner)) {
 			smp_yield_cpu(~owner);
 			first_diag = 0;
 		}
@@ -108,7 +99,7 @@ void arch_spin_lock_wait_flags(arch_spinlock_t *lp, unsigned long flags)
 			continue;
 		}
 		/* Check if the lock owner is running. */
-		if (first_diag && cpu_is_preempted(~owner)) {
+		if (first_diag && arch_vcpu_is_preempted(~owner)) {
 			smp_yield_cpu(~owner);
 			first_diag = 0;
 			continue;
@@ -127,7 +118,7 @@ void arch_spin_lock_wait_flags(arch_spinlock_t *lp, unsigned long flags)
 		 * yield the CPU unconditionally. For LPAR rely on the
 		 * sense running status.
 		 */
-		if (!MACHINE_IS_LPAR || cpu_is_preempted(~owner)) {
+		if (!MACHINE_IS_LPAR || arch_vcpu_is_preempted(~owner)) {
 			smp_yield_cpu(~owner);
 			first_diag = 0;
 		}
@@ -165,7 +156,7 @@ void _raw_read_lock_wait(arch_rwlock_t *rw)
 	owner = 0;
 	while (1) {
 		if (count-- <= 0) {
-			if (owner && cpu_is_preempted(~owner))
+			if (owner && arch_vcpu_is_preempted(~owner))
 				smp_yield_cpu(~owner);
 			count = spin_retry;
 		}
@@ -211,7 +202,7 @@ void _raw_write_lock_wait(arch_rwlock_t *rw, unsigned int prev)
 	owner = 0;
 	while (1) {
 		if (count-- <= 0) {
-			if (owner && cpu_is_preempted(~owner))
+			if (owner && arch_vcpu_is_preempted(~owner))
 				smp_yield_cpu(~owner);
 			count = spin_retry;
 		}
@@ -241,7 +232,7 @@ void _raw_write_lock_wait(arch_rwlock_t *rw)
 	owner = 0;
 	while (1) {
 		if (count-- <= 0) {
-			if (owner && cpu_is_preempted(~owner))
+			if (owner && arch_vcpu_is_preempted(~owner))
 				smp_yield_cpu(~owner);
 			count = spin_retry;
 		}
@@ -285,7 +276,7 @@ void arch_lock_relax(unsigned int cpu)
 {
 	if (!cpu)
 		return;
-	if (MACHINE_IS_LPAR && !cpu_is_preempted(~cpu))
+	if (MACHINE_IS_LPAR && !arch_vcpu_is_preempted(~cpu))
 		return;
 	smp_yield_cpu(~cpu);
 }
-- 
2.4.11

^ permalink raw reply related

* [PATCH v7 06/11] x86, paravirt: Add interface to support kvm/xen vcpu preempted check
From: Pan Xinhui @ 2016-11-02  9:08 UTC (permalink / raw)
  To: linux-kernel, linuxppc-dev, virtualization, linux-s390,
	xen-devel-request, kvm, xen-devel, x86
  Cc: kernellwp, jgross, dave, David.Laight, rkrcmar, peterz, benh,
	konrad.wilk, will.deacon, Pan Xinhui, mingo, paulus, mpe,
	pbonzini, paulmck, boqun.feng
In-Reply-To: <1478077718-37424-1-git-send-email-xinhui.pan@linux.vnet.ibm.com>

This is to fix some lock holder preemption issues. Some other locks
implementation do a spin loop before acquiring the lock itself.
Currently kernel has an interface of bool vcpu_is_preempted(int cpu). It
takes the cpu as parameter and return true if the cpu is preempted.
Then kernel can break the spin loops upon the retval of
vcpu_is_preempted.

As kernel has used this interface, So lets support it.

To deal with kernel and kvm/xen, add vcpu_is_preempted into struct
pv_lock_ops.

Then kvm or xen could provide their own implementation to support
vcpu_is_preempted.

Signed-off-by: Pan Xinhui <xinhui.pan@linux.vnet.ibm.com>
Acked-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/include/asm/paravirt_types.h | 2 ++
 arch/x86/include/asm/spinlock.h       | 8 ++++++++
 arch/x86/kernel/paravirt-spinlocks.c  | 6 ++++++
 3 files changed, 16 insertions(+)

diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h
index 0f400c0..38c3bb7 100644
--- a/arch/x86/include/asm/paravirt_types.h
+++ b/arch/x86/include/asm/paravirt_types.h
@@ -310,6 +310,8 @@ struct pv_lock_ops {
 
 	void (*wait)(u8 *ptr, u8 val);
 	void (*kick)(int cpu);
+
+	bool (*vcpu_is_preempted)(int cpu);
 };
 
 /* This contains all the paravirt structures: we get a convenient
diff --git a/arch/x86/include/asm/spinlock.h b/arch/x86/include/asm/spinlock.h
index 921bea7..0526f59 100644
--- a/arch/x86/include/asm/spinlock.h
+++ b/arch/x86/include/asm/spinlock.h
@@ -26,6 +26,14 @@
 extern struct static_key paravirt_ticketlocks_enabled;
 static __always_inline bool static_key_false(struct static_key *key);
 
+#ifdef CONFIG_PARAVIRT_SPINLOCKS
+#define vcpu_is_preempted vcpu_is_preempted
+static inline bool vcpu_is_preempted(int cpu)
+{
+	return pv_lock_ops.vcpu_is_preempted(cpu);
+}
+#endif
+
 #include <asm/qspinlock.h>
 
 /*
diff --git a/arch/x86/kernel/paravirt-spinlocks.c b/arch/x86/kernel/paravirt-spinlocks.c
index 2c55a00..2f204dd 100644
--- a/arch/x86/kernel/paravirt-spinlocks.c
+++ b/arch/x86/kernel/paravirt-spinlocks.c
@@ -21,12 +21,18 @@ bool pv_is_native_spin_unlock(void)
 		__raw_callee_save___native_queued_spin_unlock;
 }
 
+static bool native_vcpu_is_preempted(int cpu)
+{
+	return 0;
+}
+
 struct pv_lock_ops pv_lock_ops = {
 #ifdef CONFIG_SMP
 	.queued_spin_lock_slowpath = native_queued_spin_lock_slowpath,
 	.queued_spin_unlock = PV_CALLEE_SAVE(__native_queued_spin_unlock),
 	.wait = paravirt_nop,
 	.kick = paravirt_nop,
+	.vcpu_is_preempted = native_vcpu_is_preempted,
 #endif /* SMP */
 };
 EXPORT_SYMBOL(pv_lock_ops);
-- 
2.4.11

^ permalink raw reply related

* [PATCH v7 07/11] KVM: Introduce kvm_write_guest_offset_cached
From: Pan Xinhui @ 2016-11-02  9:08 UTC (permalink / raw)
  To: linux-kernel, linuxppc-dev, virtualization, linux-s390,
	xen-devel-request, kvm, xen-devel, x86
  Cc: kernellwp, jgross, dave, David.Laight, rkrcmar, peterz, benh,
	konrad.wilk, will.deacon, Pan Xinhui, mingo, paulus, mpe,
	pbonzini, paulmck, boqun.feng
In-Reply-To: <1478077718-37424-1-git-send-email-xinhui.pan@linux.vnet.ibm.com>

It allows us to update some status or field of one struct partially.

We can also save one kvm_read_guest_cached if we just update one filed
of the struct regardless of its current value.

Signed-off-by: Pan Xinhui <xinhui.pan@linux.vnet.ibm.com>
Acked-by: Paolo Bonzini <pbonzini@redhat.com>
---
 include/linux/kvm_host.h |  2 ++
 virt/kvm/kvm_main.c      | 20 ++++++++++++++------
 2 files changed, 16 insertions(+), 6 deletions(-)

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 01c0b9c..6f00237 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -645,6 +645,8 @@ int kvm_write_guest(struct kvm *kvm, gpa_t gpa, const void *data,
 		    unsigned long len);
 int kvm_write_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
 			   void *data, unsigned long len);
+int kvm_write_guest_offset_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
+			   void *data, int offset, unsigned long len);
 int kvm_gfn_to_hva_cache_init(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
 			      gpa_t gpa, unsigned long len);
 int kvm_clear_guest_page(struct kvm *kvm, gfn_t gfn, int offset, int len);
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 2907b7b..95308ee 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -1972,30 +1972,38 @@ int kvm_gfn_to_hva_cache_init(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
 }
 EXPORT_SYMBOL_GPL(kvm_gfn_to_hva_cache_init);
 
-int kvm_write_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
-			   void *data, unsigned long len)
+int kvm_write_guest_offset_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
+			   void *data, int offset, unsigned long len)
 {
 	struct kvm_memslots *slots = kvm_memslots(kvm);
 	int r;
+	gpa_t gpa = ghc->gpa + offset;
 
-	BUG_ON(len > ghc->len);
+	BUG_ON(len + offset > ghc->len);
 
 	if (slots->generation != ghc->generation)
 		kvm_gfn_to_hva_cache_init(kvm, ghc, ghc->gpa, ghc->len);
 
 	if (unlikely(!ghc->memslot))
-		return kvm_write_guest(kvm, ghc->gpa, data, len);
+		return kvm_write_guest(kvm, gpa, data, len);
 
 	if (kvm_is_error_hva(ghc->hva))
 		return -EFAULT;
 
-	r = __copy_to_user((void __user *)ghc->hva, data, len);
+	r = __copy_to_user((void __user *)ghc->hva + offset, data, len);
 	if (r)
 		return -EFAULT;
-	mark_page_dirty_in_slot(ghc->memslot, ghc->gpa >> PAGE_SHIFT);
+	mark_page_dirty_in_slot(ghc->memslot, gpa >> PAGE_SHIFT);
 
 	return 0;
 }
+EXPORT_SYMBOL_GPL(kvm_write_guest_offset_cached);
+
+int kvm_write_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
+			   void *data, unsigned long len)
+{
+	return kvm_write_guest_offset_cached(kvm, ghc, data, 0, len);
+}
 EXPORT_SYMBOL_GPL(kvm_write_guest_cached);
 
 int kvm_read_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
-- 
2.4.11

^ permalink raw reply related

* [PATCH v7 08/11] x86, kvm/x86.c: support vcpu preempted check
From: Pan Xinhui @ 2016-11-02  9:08 UTC (permalink / raw)
  To: linux-kernel, linuxppc-dev, virtualization, linux-s390,
	xen-devel-request, kvm, xen-devel, x86
  Cc: kernellwp, jgross, dave, David.Laight, rkrcmar, peterz, benh,
	konrad.wilk, will.deacon, Pan Xinhui, mingo, paulus, mpe,
	pbonzini, paulmck, boqun.feng
In-Reply-To: <1478077718-37424-1-git-send-email-xinhui.pan@linux.vnet.ibm.com>

Support the vcpu_is_preempted() functionality under KVM. This will
enhance lock performance on overcommitted hosts (more runnable vcpus
than physical cpus in the system) as doing busy waits for preempted
vcpus will hurt system performance far worse than early yielding.

Use one field of struct kvm_steal_time ::preempted to indicate that if
one vcpu is running or not.

Signed-off-by: Pan Xinhui <xinhui.pan@linux.vnet.ibm.com>
Acked-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/include/uapi/asm/kvm_para.h |  4 +++-
 arch/x86/kvm/x86.c                   | 16 ++++++++++++++++
 2 files changed, 19 insertions(+), 1 deletion(-)

diff --git a/arch/x86/include/uapi/asm/kvm_para.h b/arch/x86/include/uapi/asm/kvm_para.h
index 94dc8ca..1421a65 100644
--- a/arch/x86/include/uapi/asm/kvm_para.h
+++ b/arch/x86/include/uapi/asm/kvm_para.h
@@ -45,7 +45,9 @@ struct kvm_steal_time {
 	__u64 steal;
 	__u32 version;
 	__u32 flags;
-	__u32 pad[12];
+	__u8  preempted;
+	__u8  u8_pad[3];
+	__u32 pad[11];
 };
 
 #define KVM_STEAL_ALIGNMENT_BITS 5
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index e375235..f06e115 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -2057,6 +2057,8 @@ static void record_steal_time(struct kvm_vcpu *vcpu)
 		&vcpu->arch.st.steal, sizeof(struct kvm_steal_time))))
 		return;
 
+	vcpu->arch.st.steal.preempted = 0;
+
 	if (vcpu->arch.st.steal.version & 1)
 		vcpu->arch.st.steal.version += 1;  /* first time write, random junk */
 
@@ -2810,8 +2812,22 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 	kvm_make_request(KVM_REQ_STEAL_UPDATE, vcpu);
 }
 
+static void kvm_steal_time_set_preempted(struct kvm_vcpu *vcpu)
+{
+	if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED))
+		return;
+
+	vcpu->arch.st.steal.preempted = 1;
+
+	kvm_write_guest_offset_cached(vcpu->kvm, &vcpu->arch.st.stime,
+			&vcpu->arch.st.steal.preempted,
+			offsetof(struct kvm_steal_time, preempted),
+			sizeof(vcpu->arch.st.steal.preempted));
+}
+
 void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
 {
+	kvm_steal_time_set_preempted(vcpu);
 	kvm_x86_ops->vcpu_put(vcpu);
 	kvm_put_guest_fpu(vcpu);
 	vcpu->arch.last_host_tsc = rdtsc();
-- 
2.4.11

^ permalink raw reply related

* [PATCH v7 09/11] x86, kernel/kvm.c: support vcpu preempted check
From: Pan Xinhui @ 2016-11-02  9:08 UTC (permalink / raw)
  To: linux-kernel, linuxppc-dev, virtualization, linux-s390,
	xen-devel-request, kvm, xen-devel, x86
  Cc: kernellwp, jgross, dave, David.Laight, rkrcmar, peterz, benh,
	konrad.wilk, will.deacon, Pan Xinhui, mingo, paulus, mpe,
	pbonzini, paulmck, boqun.feng
In-Reply-To: <1478077718-37424-1-git-send-email-xinhui.pan@linux.vnet.ibm.com>

Support the vcpu_is_preempted() functionality under KVM. This will
enhance lock performance on overcommitted hosts (more runnable vcpus
than physical cpus in the system) as doing busy waits for preempted
vcpus will hurt system performance far worse than early yielding.

struct kvm_steal_time::preempted indicate that if one vcpu is running or
not after commit("x86, kvm/x86.c: support vcpu preempted check").

unix benchmark result:
host:  kernel 4.8.1, i5-4570, 4 cpus
guest: kernel 4.8.1, 8 vcpus

        test-case                       after-patch       before-patch
Execl Throughput                       |    18307.9 lps  |    11701.6 lps
File Copy 1024 bufsize 2000 maxblocks  |  1352407.3 KBps |   790418.9 KBps
File Copy 256 bufsize 500 maxblocks    |   367555.6 KBps |   222867.7 KBps
File Copy 4096 bufsize 8000 maxblocks  |  3675649.7 KBps |  1780614.4 KBps
Pipe Throughput                        | 11872208.7 lps  | 11855628.9 lps
Pipe-based Context Switching           |  1495126.5 lps  |  1490533.9 lps
Process Creation                       |    29881.2 lps  |    28572.8 lps
Shell Scripts (1 concurrent)           |    23224.3 lpm  |    22607.4 lpm
Shell Scripts (8 concurrent)           |     3531.4 lpm  |     3211.9 lpm
System Call Overhead                   | 10385653.0 lps  | 10419979.0 lps

Signed-off-by: Pan Xinhui <xinhui.pan@linux.vnet.ibm.com>
Acked-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kernel/kvm.c | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index edbbfc8..0b48dd2 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -415,6 +415,15 @@ void kvm_disable_steal_time(void)
 	wrmsr(MSR_KVM_STEAL_TIME, 0, 0);
 }
 
+static bool kvm_vcpu_is_preempted(int cpu)
+{
+	struct kvm_steal_time *src;
+
+	src = &per_cpu(steal_time, cpu);
+
+	return !!src->preempted;
+}
+
 #ifdef CONFIG_SMP
 static void __init kvm_smp_prepare_boot_cpu(void)
 {
@@ -471,6 +480,9 @@ void __init kvm_guest_init(void)
 	if (kvm_para_has_feature(KVM_FEATURE_STEAL_TIME)) {
 		has_steal_clock = 1;
 		pv_time_ops.steal_clock = kvm_steal_clock;
+#ifdef CONFIG_PARAVIRT_SPINLOCKS
+		pv_lock_ops.vcpu_is_preempted = kvm_vcpu_is_preempted;
+#endif
 	}
 
 	if (kvm_para_has_feature(KVM_FEATURE_PV_EOI))
-- 
2.4.11

^ permalink raw reply related

* [PATCH v7 10/11] x86, xen: support vcpu preempted check
From: Pan Xinhui @ 2016-11-02  9:08 UTC (permalink / raw)
  To: linux-kernel, linuxppc-dev, virtualization, linux-s390,
	xen-devel-request, kvm, xen-devel, x86
  Cc: kernellwp, jgross, dave, David.Laight, rkrcmar, peterz, benh,
	konrad.wilk, will.deacon, Pan Xinhui, mingo, paulus, mpe,
	pbonzini, paulmck, boqun.feng
In-Reply-To: <1478077718-37424-1-git-send-email-xinhui.pan@linux.vnet.ibm.com>

From: Juergen Gross <jgross@suse.com>

Support the vcpu_is_preempted() functionality under Xen. This will
enhance lock performance on overcommitted hosts (more runnable vcpus
than physical cpus in the system) as doing busy waits for preempted
vcpus will hurt system performance far worse than early yielding.

A quick test (4 vcpus on 1 physical cpu doing a parallel build job
with "make -j 8") reduced system time by about 5% with this patch.

Signed-off-by: Juergen Gross <jgross@suse.com>
Signed-off-by: Pan Xinhui <xinhui.pan@linux.vnet.ibm.com>
---
 arch/x86/xen/spinlock.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/arch/x86/xen/spinlock.c b/arch/x86/xen/spinlock.c
index 3d6e006..74756bb 100644
--- a/arch/x86/xen/spinlock.c
+++ b/arch/x86/xen/spinlock.c
@@ -114,7 +114,6 @@ void xen_uninit_lock_cpu(int cpu)
 	per_cpu(irq_name, cpu) = NULL;
 }
 
-
 /*
  * Our init of PV spinlocks is split in two init functions due to us
  * using paravirt patching and jump labels patching and having to do
@@ -137,6 +136,8 @@ void __init xen_init_spinlocks(void)
 	pv_lock_ops.queued_spin_unlock = PV_CALLEE_SAVE(__pv_queued_spin_unlock);
 	pv_lock_ops.wait = xen_qlock_wait;
 	pv_lock_ops.kick = xen_qlock_kick;
+
+	pv_lock_ops.vcpu_is_preempted = xen_vcpu_stolen;
 }
 
 /*
-- 
2.4.11

^ permalink raw reply related

* [PATCH v7 11/11] Documentation: virtual: kvm: Support vcpu preempted check
From: Pan Xinhui @ 2016-11-02  9:08 UTC (permalink / raw)
  To: linux-kernel, linuxppc-dev, virtualization, linux-s390,
	xen-devel-request, kvm, xen-devel, x86
  Cc: kernellwp, jgross, dave, David.Laight, rkrcmar, peterz, benh,
	konrad.wilk, will.deacon, Pan Xinhui, mingo, paulus, mpe,
	pbonzini, paulmck, boqun.feng
In-Reply-To: <1478077718-37424-1-git-send-email-xinhui.pan@linux.vnet.ibm.com>

Commit ("x86, kvm: support vcpu preempted check") add one field "__u8
preempted" into struct kvm_steal_time. This field tells if one vcpu is
running or not.

It is zero if 1) some old KVM deos not support this filed. 2) the vcpu
is not preempted. Other values means the vcpu has been preempted.

Signed-off-by: Pan Xinhui <xinhui.pan@linux.vnet.ibm.com>
Acked-by: Radim Krčmář <rkrcmar@redhat.com>
Acked-by: Paolo Bonzini <pbonzini@redhat.com>
---
 Documentation/virtual/kvm/msr.txt | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/Documentation/virtual/kvm/msr.txt b/Documentation/virtual/kvm/msr.txt
index 2a71c8f..ab2ab76 100644
--- a/Documentation/virtual/kvm/msr.txt
+++ b/Documentation/virtual/kvm/msr.txt
@@ -208,7 +208,9 @@ MSR_KVM_STEAL_TIME: 0x4b564d03
 		__u64 steal;
 		__u32 version;
 		__u32 flags;
-		__u32 pad[12];
+		__u8  preempted;
+		__u8  u8_pad[3];
+		__u32 pad[11];
 	}
 
 	whose data will be filled in by the hypervisor periodically. Only one
@@ -232,6 +234,11 @@ MSR_KVM_STEAL_TIME: 0x4b564d03
 		nanoseconds. Time during which the vcpu is idle, will not be
 		reported as steal time.
 
+		preempted: indicate the VCPU who owns this struct is running or
+		not. Non-zero values mean the VCPU has been preempted. Zero
+		means the VCPU is not preempted. NOTE, it is always zero if the
+		the hypervisor doesn't support this field.
+
 MSR_KVM_EOI_EN: 0x4b564d04
 	data: Bit 0 is 1 when PV end of interrupt is enabled on the vcpu; 0
 	when disabled.  Bit 1 is reserved and must be zero.  When PV end of
-- 
2.4.11

_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization

^ permalink raw reply related

* Re: [PATCH v2 34/37] docs: fix locations of several documents that got moved
From: Pavel Machek @ 2016-11-02  9:31 UTC (permalink / raw)
  To: Mauro Carvalho Chehab
  Cc: Nicolas Pitre, Linus Walleij, Jaroslav Kysela, Chris Metcalf,
	Peter Loeffler, Finn Thain, Yaowei Bai, Jakub Wilk, linux-acpi,
	Geert Uytterhoeven, Guenter Roeck, Petr Mladek, Jean Delvare,
	linux-pm, Alexander Viro, Andy Lutomirski, Thomas Gleixner,
	Karsten Keil, Jiri Kosina, seokhoon.yoon, Li Zefan, Ryan Swan,
	Joe Perches, Andrew Morton, Mark
In-Reply-To: <20161019115918.1162d16e@vento.lan>


[-- Attachment #1.1: Type: text/plain, Size: 1794 bytes --]

Hi!

> > Dunno, but kernel-parameters.txt was already quite long... for a file
> > that is referenced quite often. Adding admin-guide/ into the path does
> > not really help.
> 
> The big string name starts with Documentation/ :) There are some discussions 
> about changing it to doc/ (or docs/). Also, as you said, kernel-parameters
> is already a big name. Perhaps we could use, instead,
> > "kernel-parms".

cmdline?

> If we rename kernel-parameters.rst to kernel-parms.rst, plus the doc/ rename,
> then the string size will actually reduce:
> 
> -		(see Documentation/kernel-parameters.txt), the minimum possible
> +		(see doc/admin-guide/kernel-parms.rst), the minimum possible
> 
> > Maybe admin-guide should go directly into Documentation/ , as that's
> > what our users are interested in?
> 
> There are several problems if we keep them at Documentation/ dir:
> 
> - We'll end by mixing documents already converted to ReST with documents
>   not converted yet;
> 
> - A rename is needed anyway, as Sphinx only accepts ReST files that end
>   with the extension(s) defined at Documentation/conf.py (currently,
>   .rst);
> 
> - A partial documentation build is made by sub-directory. If we put
>   files under /Documentation, there's no way to build just one
>   book.

Well, documentation is primarily for users. I'm sure tools can adapt...

> > Plus, I'm not sure how many developers will figure out that process/
> > is what describes kernel patch submission process. We have processes
> > in the kernel, after all...
> 
> Do you have a better idea?

development/ ?

Best regards,
									Pavel
-- 
(english) http://www.livejournal.com/~pavelmachek
(cesky, pictures) http://atrey.karlin.mff.cuni.cz/~pavel/picture/horses/blog.html

[-- Attachment #1.2: Digital signature --]
[-- Type: application/pgp-signature, Size: 181 bytes --]

[-- Attachment #2: Type: text/plain, Size: 183 bytes --]

_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization

^ permalink raw reply

* Re: [PATCH v2 34/37] docs: fix locations of several documents that got moved
From: Prarit Bhargava @ 2016-11-02 10:08 UTC (permalink / raw)
  To: Pavel Machek, Mauro Carvalho Chehab
  Cc: Nicolas Pitre, Linus Walleij, Jaroslav Kysela, Chris Metcalf,
	Peter Loeffler, Finn Thain, Yaowei Bai, Jakub Wilk, linux-acpi,
	Geert Uytterhoeven, Guenter Roeck, Petr Mladek, Jean Delvare,
	linux-pm, Alexander Viro, Andy Lutomirski, Thomas Gleixner,
	Karsten Keil, Jiri Kosina, seokhoon.yoon, Li Zefan, Ryan Swan,
	Joe Perches, Andrew Morton, Mark
In-Reply-To: <20161102093154.GC23350@amd>



On 11/02/2016 05:31 AM, Pavel Machek wrote:
> Hi!
> 
>>> Dunno, but kernel-parameters.txt was already quite long... for a file
>>> that is referenced quite often. Adding admin-guide/ into the path does
>>> not really help.
>>
>> The big string name starts with Documentation/ :) There are some discussions 
>> about changing it to doc/ (or docs/). Also, as you said, kernel-parameters
>> is already a big name. Perhaps we could use, instead,
>>> "kernel-parms".
> 
> cmdline?

You would not believe the number of users I've dealt with that have confused
"cmdline/command line" (aka prompt) with "kernel parameter".  I go back and edit
customer debug requests to make sure there's no use of cmdline in place of
kernel parameter.

For the love of every customer facing engineer out there, please stop using
cmdline ;).

P.

> 
>> If we rename kernel-parameters.rst to kernel-parms.rst, plus the doc/ rename,
>> then the string size will actually reduce:
>>
>> -		(see Documentation/kernel-parameters.txt), the minimum possible
>> +		(see doc/admin-guide/kernel-parms.rst), the minimum possible
>>
>>> Maybe admin-guide should go directly into Documentation/ , as that's
>>> what our users are interested in?
>>
>> There are several problems if we keep them at Documentation/ dir:
>>
>> - We'll end by mixing documents already converted to ReST with documents
>>   not converted yet;
>>
>> - A rename is needed anyway, as Sphinx only accepts ReST files that end
>>   with the extension(s) defined at Documentation/conf.py (currently,
>>   .rst);
>>
>> - A partial documentation build is made by sub-directory. If we put
>>   files under /Documentation, there's no way to build just one
>>   book.
> 
> Well, documentation is primarily for users. I'm sure tools can adapt...
> 
>>> Plus, I'm not sure how many developers will figure out that process/
>>> is what describes kernel patch submission process. We have processes
>>> in the kernel, after all...
>>
>> Do you have a better idea?
> 
> development/ ?
> 
> Best regards,
> 									Pavel
> 

^ permalink raw reply

* [PATCH] virtio-net: drop legacy features in virtio 1 mode
From: Michael S. Tsirkin @ 2016-11-04 10:55 UTC (permalink / raw)
  To: linux-kernel; +Cc: netdev, stable, virtualization

Virtio 1.0 spec says VIRTIO_F_ANY_LAYOUT and VIRTIO_NET_F_GSO are
legacy-only feature bits. Do not negotiate them in virtio 1 mode.  Note
this is a spec violation so we need to backport it to stable/downstream
kernels.

Cc: stable@vger.kernel.org
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 drivers/net/virtio_net.c | 30 ++++++++++++++++++++----------
 1 file changed, 20 insertions(+), 10 deletions(-)

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 7a00365..b19fb4d 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -2089,23 +2089,33 @@ static struct virtio_device_id id_table[] = {
 	{ 0 },
 };
 
+#define VIRTNET_FEATURES \
+	VIRTIO_NET_F_CSUM, VIRTIO_NET_F_GUEST_CSUM, \
+	VIRTIO_NET_F_MAC, \
+	VIRTIO_NET_F_HOST_TSO4, VIRTIO_NET_F_HOST_UFO, VIRTIO_NET_F_HOST_TSO6, \
+	VIRTIO_NET_F_HOST_ECN, VIRTIO_NET_F_GUEST_TSO4, VIRTIO_NET_F_GUEST_TSO6, \
+	VIRTIO_NET_F_GUEST_ECN, VIRTIO_NET_F_GUEST_UFO, \
+	VIRTIO_NET_F_MRG_RXBUF, VIRTIO_NET_F_STATUS, VIRTIO_NET_F_CTRL_VQ, \
+	VIRTIO_NET_F_CTRL_RX, VIRTIO_NET_F_CTRL_VLAN, \
+	VIRTIO_NET_F_GUEST_ANNOUNCE, VIRTIO_NET_F_MQ, \
+	VIRTIO_NET_F_CTRL_MAC_ADDR, \
+	VIRTIO_NET_F_MTU
+
 static unsigned int features[] = {
-	VIRTIO_NET_F_CSUM, VIRTIO_NET_F_GUEST_CSUM,
-	VIRTIO_NET_F_GSO, VIRTIO_NET_F_MAC,
-	VIRTIO_NET_F_HOST_TSO4, VIRTIO_NET_F_HOST_UFO, VIRTIO_NET_F_HOST_TSO6,
-	VIRTIO_NET_F_HOST_ECN, VIRTIO_NET_F_GUEST_TSO4, VIRTIO_NET_F_GUEST_TSO6,
-	VIRTIO_NET_F_GUEST_ECN, VIRTIO_NET_F_GUEST_UFO,
-	VIRTIO_NET_F_MRG_RXBUF, VIRTIO_NET_F_STATUS, VIRTIO_NET_F_CTRL_VQ,
-	VIRTIO_NET_F_CTRL_RX, VIRTIO_NET_F_CTRL_VLAN,
-	VIRTIO_NET_F_GUEST_ANNOUNCE, VIRTIO_NET_F_MQ,
-	VIRTIO_NET_F_CTRL_MAC_ADDR,
+	VIRTNET_FEATURES,
+};
+
+static unsigned int features_legacy[] = {
+	VIRTNET_FEATURES,
+	VIRTIO_NET_F_GSO,
 	VIRTIO_F_ANY_LAYOUT,
-	VIRTIO_NET_F_MTU,
 };
 
 static struct virtio_driver virtio_net_driver = {
 	.feature_table = features,
 	.feature_table_size = ARRAY_SIZE(features),
+	.feature_table_legacy = features_legacy,
+	.feature_table_size_legacy = ARRAY_SIZE(features_legacy),
 	.driver.name =	KBUILD_MODNAME,
 	.driver.owner =	THIS_MODULE,
 	.id_table =	id_table,
-- 
MST

^ permalink raw reply related

* Re: [PATCH] virtio-net: drop legacy features in virtio 1 mode
From: Cornelia Huck @ 2016-11-04 11:15 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: netdev, linux-kernel, stable, virtualization
In-Reply-To: <1478256865-29003-1-git-send-email-mst@redhat.com>

On Fri, 4 Nov 2016 12:55:36 +0200
"Michael S. Tsirkin" <mst@redhat.com> wrote:

> Virtio 1.0 spec says VIRTIO_F_ANY_LAYOUT and VIRTIO_NET_F_GSO are
> legacy-only feature bits. Do not negotiate them in virtio 1 mode.  Note
> this is a spec violation so we need to backport it to stable/downstream
> kernels.
> 
> Cc: stable@vger.kernel.org
> Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
> ---
>  drivers/net/virtio_net.c | 30 ++++++++++++++++++++----------
>  1 file changed, 20 insertions(+), 10 deletions(-)

Reviewed-by: Cornelia Huck <cornelia.huck@de.ibm.com>

^ permalink raw reply

* Re: [PATCH kernel v4 7/7] virtio-balloon: tell host vm's unused page info
From: Dave Hansen @ 2016-11-04 18:10 UTC (permalink / raw)
  To: Liang Li, mst
  Cc: virtio-dev, kvm, linux-kernel, qemu-devel, dgilbert, linux-mm,
	amit.shah, pbonzini, virtualization, mgorman
In-Reply-To: <1478067447-24654-8-git-send-email-liang.z.li@intel.com>

Please squish this and patch 5 together.  It makes no sense to separate
them.

> +static void send_unused_pages_info(struct virtio_balloon *vb,
> +				unsigned long req_id)
> +{
> +	struct scatterlist sg_in;
> +	unsigned long pfn = 0, bmap_len, pfn_limit, last_pfn, nr_pfn;
> +	struct virtqueue *vq = vb->req_vq;
> +	struct virtio_balloon_resp_hdr *hdr = vb->resp_hdr;
> +	int ret = 1, used_nr_bmap = 0, i;
> +
> +	if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_PAGE_BITMAP) &&
> +		vb->nr_page_bmap == 1)
> +		extend_page_bitmap(vb);
> +
> +	pfn_limit = PFNS_PER_BMAP * vb->nr_page_bmap;
> +	mutex_lock(&vb->balloon_lock);
> +	last_pfn = get_max_pfn();
> +
> +	while (ret) {
> +		clear_page_bitmap(vb);
> +		ret = get_unused_pages(pfn, pfn + pfn_limit, vb->page_bitmap,
> +			 PFNS_PER_BMAP, vb->nr_page_bmap);

This changed the underlying data structure without changing the way that
the structure is populated.

This algorithm picks a "PFNS_PER_BMAP * vb->nr_page_bmap"-sized set of
pfns, allocates a bitmap for them, the loops through all zones looking
for pages in any free list that are in that range.

Unpacking all the indirection, it looks like this:

for (pfn = 0; pfn < get_max_pfn(); pfn += BITMAP_SIZE_IN_PFNS)
	for_each_populated_zone(zone)
		for_each_migratetype_order(order, t)
			list_for_each(..., &zone->free_area[order])...

Let's say we do a 32k bitmap that can hold ~1M pages.  That's 4GB of
RAM.  On a 1TB system, that's 256 passes through the top-level loop.
The bottom-level lists have tens of thousands of pages in them, even on
my laptop.  Only 1/256 of these pages will get consumed in a given pass.

That's an awfully inefficient way of doing it.  This patch essentially
changed the data structure without changing the algorithm to populate it.

Please change the *algorithm* to use the new data structure efficiently.
 Such a change would only do a single pass through each freelist, and
would choose whether to use the extent-based (pfn -> range) or
bitmap-based approach based on the contents of the free lists.

You should not be using get_max_pfn().  Any patch set that continues to
use it is not likely to be using a proper algorithm.

^ permalink raw reply

* virtio_pci irq handling cleanups
From: Christoph Hellwig @ 2016-11-06 23:09 UTC (permalink / raw)
  To: mst; +Cc: linux-kernel, virtualization

Hi Michael,

this series contains a couple cleanups for the virtio_pci interrupt
handling code, including a switch to the new pci_irq_alloc_vectors
helper.  All these are in preparation of taking advantage of the new
PCI layer / core IRQ interrupt affinity handling, for which I will
send out a series once this and some core interrupt handling changes
are in.

^ permalink raw reply

* [PATCH 1/7] virtio_pci: use pci_alloc_irq_vectors
From: Christoph Hellwig @ 2016-11-06 23:09 UTC (permalink / raw)
  To: mst; +Cc: linux-kernel, virtualization
In-Reply-To: <1478473783-12589-1-git-send-email-hch@lst.de>

This avoids the separate allocation for the msix_entries structures, and
instead allows us to use pci_irq_vector to find a given IRQ vector.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/virtio/virtio_pci_common.c | 42 +++++++++++++++-----------------------
 drivers/virtio/virtio_pci_common.h |  1 -
 2 files changed, 17 insertions(+), 26 deletions(-)

diff --git a/drivers/virtio/virtio_pci_common.c b/drivers/virtio/virtio_pci_common.c
index d9a9058..f5e2751 100644
--- a/drivers/virtio/virtio_pci_common.c
+++ b/drivers/virtio/virtio_pci_common.c
@@ -37,7 +37,7 @@ void vp_synchronize_vectors(struct virtio_device *vdev)
 		synchronize_irq(vp_dev->pci_dev->irq);
 
 	for (i = 0; i < vp_dev->msix_vectors; ++i)
-		synchronize_irq(vp_dev->msix_entries[i].vector);
+		synchronize_irq(pci_irq_vector(vp_dev->pci_dev, i));
 }
 
 /* the notify function used when creating a virt queue */
@@ -113,7 +113,7 @@ static void vp_free_vectors(struct virtio_device *vdev)
 	}
 
 	for (i = 0; i < vp_dev->msix_used_vectors; ++i)
-		free_irq(vp_dev->msix_entries[i].vector, vp_dev);
+		free_irq(pci_irq_vector(vp_dev->pci_dev, i), vp_dev);
 
 	for (i = 0; i < vp_dev->msix_vectors; i++)
 		if (vp_dev->msix_affinity_masks[i])
@@ -123,7 +123,7 @@ static void vp_free_vectors(struct virtio_device *vdev)
 		/* Disable the vector used for configuration */
 		vp_dev->config_vector(vp_dev, VIRTIO_MSI_NO_VECTOR);
 
-		pci_disable_msix(vp_dev->pci_dev);
+		pci_free_irq_vectors(vp_dev->pci_dev);
 		vp_dev->msix_enabled = 0;
 	}
 
@@ -131,8 +131,6 @@ static void vp_free_vectors(struct virtio_device *vdev)
 	vp_dev->msix_used_vectors = 0;
 	kfree(vp_dev->msix_names);
 	vp_dev->msix_names = NULL;
-	kfree(vp_dev->msix_entries);
-	vp_dev->msix_entries = NULL;
 	kfree(vp_dev->msix_affinity_masks);
 	vp_dev->msix_affinity_masks = NULL;
 }
@@ -147,10 +145,6 @@ static int vp_request_msix_vectors(struct virtio_device *vdev, int nvectors,
 
 	vp_dev->msix_vectors = nvectors;
 
-	vp_dev->msix_entries = kmalloc(nvectors * sizeof *vp_dev->msix_entries,
-				       GFP_KERNEL);
-	if (!vp_dev->msix_entries)
-		goto error;
 	vp_dev->msix_names = kmalloc(nvectors * sizeof *vp_dev->msix_names,
 				     GFP_KERNEL);
 	if (!vp_dev->msix_names)
@@ -165,12 +159,9 @@ static int vp_request_msix_vectors(struct virtio_device *vdev, int nvectors,
 					GFP_KERNEL))
 			goto error;
 
-	for (i = 0; i < nvectors; ++i)
-		vp_dev->msix_entries[i].entry = i;
-
-	err = pci_enable_msix_exact(vp_dev->pci_dev,
-				    vp_dev->msix_entries, nvectors);
-	if (err)
+	err = pci_alloc_irq_vectors(vp_dev->pci_dev, nvectors, nvectors,
+			PCI_IRQ_MSIX);
+	if (err < 0)
 		goto error;
 	vp_dev->msix_enabled = 1;
 
@@ -178,7 +169,7 @@ static int vp_request_msix_vectors(struct virtio_device *vdev, int nvectors,
 	v = vp_dev->msix_used_vectors;
 	snprintf(vp_dev->msix_names[v], sizeof *vp_dev->msix_names,
 		 "%s-config", name);
-	err = request_irq(vp_dev->msix_entries[v].vector,
+	err = request_irq(pci_irq_vector(vp_dev->pci_dev, v),
 			  vp_config_changed, 0, vp_dev->msix_names[v],
 			  vp_dev);
 	if (err)
@@ -197,7 +188,7 @@ static int vp_request_msix_vectors(struct virtio_device *vdev, int nvectors,
 		v = vp_dev->msix_used_vectors;
 		snprintf(vp_dev->msix_names[v], sizeof *vp_dev->msix_names,
 			 "%s-virtqueues", name);
-		err = request_irq(vp_dev->msix_entries[v].vector,
+		err = request_irq(pci_irq_vector(vp_dev->pci_dev, v),
 				  vp_vring_interrupt, 0, vp_dev->msix_names[v],
 				  vp_dev);
 		if (err)
@@ -276,14 +267,15 @@ void vp_del_vqs(struct virtio_device *vdev)
 {
 	struct virtio_pci_device *vp_dev = to_vp_device(vdev);
 	struct virtqueue *vq, *n;
-	struct virtio_pci_vq_info *info;
 
 	list_for_each_entry_safe(vq, n, &vdev->vqs, list) {
-		info = vp_dev->vqs[vq->index];
-		if (vp_dev->per_vq_vectors &&
-			info->msix_vector != VIRTIO_MSI_NO_VECTOR)
-			free_irq(vp_dev->msix_entries[info->msix_vector].vector,
-				 vq);
+		if (vp_dev->per_vq_vectors) {
+			int v = vp_dev->vqs[vq->index]->msix_vector;
+
+			if (v != VIRTIO_MSI_NO_VECTOR)
+				free_irq(pci_irq_vector(vp_dev->pci_dev, v),
+					vq);
+		}
 		vp_del_vq(vq);
 	}
 	vp_dev->per_vq_vectors = false;
@@ -356,7 +348,7 @@ static int vp_try_to_find_vqs(struct virtio_device *vdev, unsigned nvqs,
 			 sizeof *vp_dev->msix_names,
 			 "%s-%s",
 			 dev_name(&vp_dev->vdev.dev), names[i]);
-		err = request_irq(vp_dev->msix_entries[msix_vec].vector,
+		err = request_irq(pci_irq_vector(vp_dev->pci_dev, msix_vec),
 				  vring_interrupt, 0,
 				  vp_dev->msix_names[msix_vec],
 				  vqs[i]);
@@ -419,7 +411,7 @@ int vp_set_vq_affinity(struct virtqueue *vq, int cpu)
 
 	if (vp_dev->msix_enabled) {
 		mask = vp_dev->msix_affinity_masks[info->msix_vector];
-		irq = vp_dev->msix_entries[info->msix_vector].vector;
+		irq = pci_irq_vector(vp_dev->pci_dev, info->msix_vector);
 		if (cpu == -1)
 			irq_set_affinity_hint(irq, NULL);
 		else {
diff --git a/drivers/virtio/virtio_pci_common.h b/drivers/virtio/virtio_pci_common.h
index 2826320..b2f6662 100644
--- a/drivers/virtio/virtio_pci_common.h
+++ b/drivers/virtio/virtio_pci_common.h
@@ -85,7 +85,6 @@ struct virtio_pci_device {
 	/* MSI-X support */
 	int msix_enabled;
 	int intx_enabled;
-	struct msix_entry *msix_entries;
 	cpumask_var_t *msix_affinity_masks;
 	/* Name strings for interrupts. This size should be enough,
 	 * and I'm too lazy to allocate each name separately. */
-- 
2.1.4

^ permalink raw reply related

* [PATCH 2/7] virtio_pci: remove the call to vp_free_vectors in vp_request_msix_vectors
From: Christoph Hellwig @ 2016-11-06 23:09 UTC (permalink / raw)
  To: mst; +Cc: linux-kernel, virtualization
In-Reply-To: <1478473783-12589-1-git-send-email-hch@lst.de>

vp_request_msix_vectors is only called by vp_try_to_find_vqs, which already
calls vp_free_vectors through vp_del_vqs in the failure case.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/virtio/virtio_pci_common.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/drivers/virtio/virtio_pci_common.c b/drivers/virtio/virtio_pci_common.c
index f5e2751..93700c5 100644
--- a/drivers/virtio/virtio_pci_common.c
+++ b/drivers/virtio/virtio_pci_common.c
@@ -197,7 +197,6 @@ static int vp_request_msix_vectors(struct virtio_device *vdev, int nvectors,
 	}
 	return 0;
 error:
-	vp_free_vectors(vdev);
 	return err;
 }
 
-- 
2.1.4

^ permalink raw reply related

* [PATCH 3/7] virtio_pci: merge vp_free_vectors into vp_del_vqs
From: Christoph Hellwig @ 2016-11-06 23:09 UTC (permalink / raw)
  To: mst; +Cc: linux-kernel, virtualization
In-Reply-To: <1478473783-12589-1-git-send-email-hch@lst.de>

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/virtio/virtio_pci_common.c | 61 +++++++++++++++++---------------------
 1 file changed, 27 insertions(+), 34 deletions(-)

diff --git a/drivers/virtio/virtio_pci_common.c b/drivers/virtio/virtio_pci_common.c
index 93700c5..f6c5499 100644
--- a/drivers/virtio/virtio_pci_common.c
+++ b/drivers/virtio/virtio_pci_common.c
@@ -102,39 +102,6 @@ static irqreturn_t vp_interrupt(int irq, void *opaque)
 	return vp_vring_interrupt(irq, opaque);
 }
 
-static void vp_free_vectors(struct virtio_device *vdev)
-{
-	struct virtio_pci_device *vp_dev = to_vp_device(vdev);
-	int i;
-
-	if (vp_dev->intx_enabled) {
-		free_irq(vp_dev->pci_dev->irq, vp_dev);
-		vp_dev->intx_enabled = 0;
-	}
-
-	for (i = 0; i < vp_dev->msix_used_vectors; ++i)
-		free_irq(pci_irq_vector(vp_dev->pci_dev, i), vp_dev);
-
-	for (i = 0; i < vp_dev->msix_vectors; i++)
-		if (vp_dev->msix_affinity_masks[i])
-			free_cpumask_var(vp_dev->msix_affinity_masks[i]);
-
-	if (vp_dev->msix_enabled) {
-		/* Disable the vector used for configuration */
-		vp_dev->config_vector(vp_dev, VIRTIO_MSI_NO_VECTOR);
-
-		pci_free_irq_vectors(vp_dev->pci_dev);
-		vp_dev->msix_enabled = 0;
-	}
-
-	vp_dev->msix_vectors = 0;
-	vp_dev->msix_used_vectors = 0;
-	kfree(vp_dev->msix_names);
-	vp_dev->msix_names = NULL;
-	kfree(vp_dev->msix_affinity_masks);
-	vp_dev->msix_affinity_masks = NULL;
-}
-
 static int vp_request_msix_vectors(struct virtio_device *vdev, int nvectors,
 				   bool per_vq_vectors)
 {
@@ -266,6 +233,7 @@ void vp_del_vqs(struct virtio_device *vdev)
 {
 	struct virtio_pci_device *vp_dev = to_vp_device(vdev);
 	struct virtqueue *vq, *n;
+	int i;
 
 	list_for_each_entry_safe(vq, n, &vdev->vqs, list) {
 		if (vp_dev->per_vq_vectors) {
@@ -279,7 +247,32 @@ void vp_del_vqs(struct virtio_device *vdev)
 	}
 	vp_dev->per_vq_vectors = false;
 
-	vp_free_vectors(vdev);
+	if (vp_dev->intx_enabled) {
+		free_irq(vp_dev->pci_dev->irq, vp_dev);
+		vp_dev->intx_enabled = 0;
+	}
+
+	for (i = 0; i < vp_dev->msix_used_vectors; ++i)
+		free_irq(pci_irq_vector(vp_dev->pci_dev, i), vp_dev);
+
+	for (i = 0; i < vp_dev->msix_vectors; i++)
+		if (vp_dev->msix_affinity_masks[i])
+			free_cpumask_var(vp_dev->msix_affinity_masks[i]);
+
+	if (vp_dev->msix_enabled) {
+		/* Disable the vector used for configuration */
+		vp_dev->config_vector(vp_dev, VIRTIO_MSI_NO_VECTOR);
+
+		pci_free_irq_vectors(vp_dev->pci_dev);
+		vp_dev->msix_enabled = 0;
+	}
+
+	vp_dev->msix_vectors = 0;
+	vp_dev->msix_used_vectors = 0;
+	kfree(vp_dev->msix_names);
+	vp_dev->msix_names = NULL;
+	kfree(vp_dev->msix_affinity_masks);
+	vp_dev->msix_affinity_masks = NULL;
 	kfree(vp_dev->vqs);
 	vp_dev->vqs = NULL;
 }
-- 
2.1.4

^ permalink raw reply related

* [PATCH 4/7] virtio_pci: split the INTx case out of vp_try_to_find_vqs
From: Christoph Hellwig @ 2016-11-06 23:09 UTC (permalink / raw)
  To: mst; +Cc: linux-kernel, virtualization
In-Reply-To: <1478473783-12589-1-git-send-email-hch@lst.de>

There is basically no shared logic with the two MSI-X variants, so move
it into a separate function.  Also remove the fairly pointless
vp_request_intx wrapper while we're at it.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/virtio/virtio_pci_common.c | 89 ++++++++++++++++++++++----------------
 1 file changed, 52 insertions(+), 37 deletions(-)

diff --git a/drivers/virtio/virtio_pci_common.c b/drivers/virtio/virtio_pci_common.c
index f6c5499..644bd80 100644
--- a/drivers/virtio/virtio_pci_common.c
+++ b/drivers/virtio/virtio_pci_common.c
@@ -167,18 +167,6 @@ static int vp_request_msix_vectors(struct virtio_device *vdev, int nvectors,
 	return err;
 }
 
-static int vp_request_intx(struct virtio_device *vdev)
-{
-	int err;
-	struct virtio_pci_device *vp_dev = to_vp_device(vdev);
-
-	err = request_irq(vp_dev->pci_dev->irq, vp_interrupt,
-			  IRQF_SHARED, dev_name(&vdev->dev), vp_dev);
-	if (!err)
-		vp_dev->intx_enabled = 1;
-	return err;
-}
-
 static struct virtqueue *vp_setup_vq(struct virtio_device *vdev, unsigned index,
 				     void (*callback)(struct virtqueue *vq),
 				     const char *name,
@@ -281,7 +269,6 @@ static int vp_try_to_find_vqs(struct virtio_device *vdev, unsigned nvqs,
 			      struct virtqueue *vqs[],
 			      vq_callback_t *callbacks[],
 			      const char * const names[],
-			      bool use_msix,
 			      bool per_vq_vectors)
 {
 	struct virtio_pci_device *vp_dev = to_vp_device(vdev);
@@ -292,28 +279,21 @@ static int vp_try_to_find_vqs(struct virtio_device *vdev, unsigned nvqs,
 	if (!vp_dev->vqs)
 		return -ENOMEM;
 
-	if (!use_msix) {
-		/* Old style: one normal interrupt for change and all vqs. */
-		err = vp_request_intx(vdev);
-		if (err)
-			goto error_find;
+	if (per_vq_vectors) {
+		/* Best option: one for change interrupt, one per vq. */
+		nvectors = 1;
+		for (i = 0; i < nvqs; ++i)
+			if (callbacks[i])
+				++nvectors;
 	} else {
-		if (per_vq_vectors) {
-			/* Best option: one for change interrupt, one per vq. */
-			nvectors = 1;
-			for (i = 0; i < nvqs; ++i)
-				if (callbacks[i])
-					++nvectors;
-		} else {
-			/* Second best: one for change, shared for all vqs. */
-			nvectors = 2;
-		}
-
-		err = vp_request_msix_vectors(vdev, nvectors, per_vq_vectors);
-		if (err)
-			goto error_find;
+		/* Second best: one for change, shared for all vqs. */
+		nvectors = 2;
 	}
 
+	err = vp_request_msix_vectors(vdev, nvectors, per_vq_vectors);
+	if (err)
+		goto error_find;
+
 	vp_dev->per_vq_vectors = per_vq_vectors;
 	allocated_vectors = vp_dev->msix_used_vectors;
 	for (i = 0; i < nvqs; ++i) {
@@ -356,6 +336,43 @@ static int vp_try_to_find_vqs(struct virtio_device *vdev, unsigned nvqs,
 	return err;
 }
 
+static int vp_find_vqs_intx(struct virtio_device *vdev, unsigned nvqs,
+		struct virtqueue *vqs[], vq_callback_t *callbacks[],
+		const char * const names[])
+{
+	struct virtio_pci_device *vp_dev = to_vp_device(vdev);
+	int i, err;
+
+	vp_dev->vqs = kcalloc(nvqs, sizeof(*vp_dev->vqs), GFP_KERNEL);
+	if (!vp_dev->vqs)
+		return -ENOMEM;
+
+	err = request_irq(vp_dev->pci_dev->irq, vp_interrupt, IRQF_SHARED,
+			dev_name(&vdev->dev), vp_dev);
+	if (err)
+		goto out_del_vqs;
+
+	vp_dev->intx_enabled = 1;
+	vp_dev->per_vq_vectors = false;
+	for (i = 0; i < nvqs; ++i) {
+		if (!names[i]) {
+			vqs[i] = NULL;
+			continue;
+		}
+		vqs[i] = vp_setup_vq(vdev, i, callbacks[i], names[i],
+				VIRTIO_MSI_NO_VECTOR);
+		if (IS_ERR(vqs[i])) {
+			err = PTR_ERR(vqs[i]);
+			goto out_del_vqs;
+		}
+	}
+
+	return 0;
+out_del_vqs:
+	vp_del_vqs(vdev);
+	return err;
+}
+
 /* the config->find_vqs() implementation */
 int vp_find_vqs(struct virtio_device *vdev, unsigned nvqs,
 		struct virtqueue *vqs[],
@@ -365,17 +382,15 @@ int vp_find_vqs(struct virtio_device *vdev, unsigned nvqs,
 	int err;
 
 	/* Try MSI-X with one vector per queue. */
-	err = vp_try_to_find_vqs(vdev, nvqs, vqs, callbacks, names, true, true);
+	err = vp_try_to_find_vqs(vdev, nvqs, vqs, callbacks, names, true);
 	if (!err)
 		return 0;
 	/* Fallback: MSI-X with one vector for config, one shared for queues. */
-	err = vp_try_to_find_vqs(vdev, nvqs, vqs, callbacks, names,
-				 true, false);
+	err = vp_try_to_find_vqs(vdev, nvqs, vqs, callbacks, names, false);
 	if (!err)
 		return 0;
 	/* Finally fall back to regular interrupts. */
-	return vp_try_to_find_vqs(vdev, nvqs, vqs, callbacks, names,
-				  false, false);
+	return vp_find_vqs_intx(vdev, nvqs, vqs, callbacks, names);
 }
 
 const char *vp_bus_name(struct virtio_device *vdev)
-- 
2.1.4

^ permalink raw reply related

* [PATCH 5/7] virtio_pci: add a helper to request MSI-X interrupts
From: Christoph Hellwig @ 2016-11-06 23:09 UTC (permalink / raw)
  To: mst; +Cc: linux-kernel, virtualization
In-Reply-To: <1478473783-12589-1-git-send-email-hch@lst.de>

Factor out some duplicated code to format that MSI-X vector name and
request the interrupt for it.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/virtio/virtio_pci_common.c | 38 +++++++++++++++-----------------------
 1 file changed, 15 insertions(+), 23 deletions(-)

diff --git a/drivers/virtio/virtio_pci_common.c b/drivers/virtio/virtio_pci_common.c
index 644bd80..7c44891 100644
--- a/drivers/virtio/virtio_pci_common.c
+++ b/drivers/virtio/virtio_pci_common.c
@@ -102,11 +102,19 @@ static irqreturn_t vp_interrupt(int irq, void *opaque)
 	return vp_vring_interrupt(irq, opaque);
 }
 
+static int vp_request_msix_vector(struct virtio_pci_device *vp_dev, int vec,
+		irq_handler_t handler, const char *name, void *dev_id)
+{
+	snprintf(vp_dev->msix_names[vec], sizeof(vp_dev->msix_names[vec]),
+			"%s-%s", dev_name(&vp_dev->vdev.dev), name);
+	return request_irq(pci_irq_vector(vp_dev->pci_dev, vec), handler, 0,
+			vp_dev->msix_names[vec], dev_id);
+}
+
 static int vp_request_msix_vectors(struct virtio_device *vdev, int nvectors,
 				   bool per_vq_vectors)
 {
 	struct virtio_pci_device *vp_dev = to_vp_device(vdev);
-	const char *name = dev_name(&vp_dev->vdev.dev);
 	unsigned i, v;
 	int err = -ENOMEM;
 
@@ -133,15 +141,10 @@ static int vp_request_msix_vectors(struct virtio_device *vdev, int nvectors,
 	vp_dev->msix_enabled = 1;
 
 	/* Set the vector used for configuration */
-	v = vp_dev->msix_used_vectors;
-	snprintf(vp_dev->msix_names[v], sizeof *vp_dev->msix_names,
-		 "%s-config", name);
-	err = request_irq(pci_irq_vector(vp_dev->pci_dev, v),
-			  vp_config_changed, 0, vp_dev->msix_names[v],
-			  vp_dev);
+	err = vp_request_msix_vector(vp_dev, vp_dev->msix_used_vectors++,
+			vp_config_changed, "config", vp_dev);
 	if (err)
 		goto error;
-	++vp_dev->msix_used_vectors;
 
 	v = vp_dev->config_vector(vp_dev, v);
 	/* Verify we had enough resources to assign the vector */
@@ -152,15 +155,10 @@ static int vp_request_msix_vectors(struct virtio_device *vdev, int nvectors,
 
 	if (!per_vq_vectors) {
 		/* Shared vector for all VQs */
-		v = vp_dev->msix_used_vectors;
-		snprintf(vp_dev->msix_names[v], sizeof *vp_dev->msix_names,
-			 "%s-virtqueues", name);
-		err = request_irq(pci_irq_vector(vp_dev->pci_dev, v),
-				  vp_vring_interrupt, 0, vp_dev->msix_names[v],
-				  vp_dev);
+		err = vp_request_msix_vector(vp_dev, vp_dev->msix_used_vectors++,
+				vp_vring_interrupt, "virtqueues", vp_dev);
 		if (err)
 			goto error;
-		++vp_dev->msix_used_vectors;
 	}
 	return 0;
 error:
@@ -316,14 +314,8 @@ static int vp_try_to_find_vqs(struct virtio_device *vdev, unsigned nvqs,
 			continue;
 
 		/* allocate per-vq irq if available and necessary */
-		snprintf(vp_dev->msix_names[msix_vec],
-			 sizeof *vp_dev->msix_names,
-			 "%s-%s",
-			 dev_name(&vp_dev->vdev.dev), names[i]);
-		err = request_irq(pci_irq_vector(vp_dev->pci_dev, msix_vec),
-				  vring_interrupt, 0,
-				  vp_dev->msix_names[msix_vec],
-				  vqs[i]);
+		err = vp_request_msix_vector(vp_dev, msix_vec, vring_interrupt,
+				names[i], vqs[i]);
 		if (err) {
 			vp_del_vq(vqs[i]);
 			goto error_find;
-- 
2.1.4

^ permalink raw reply related

* [PATCH 6/7] virtio_pci: split the shared and per-VQ MSI-X setup routines
From: Christoph Hellwig @ 2016-11-06 23:09 UTC (permalink / raw)
  To: mst; +Cc: linux-kernel, virtualization
In-Reply-To: <1478473783-12589-1-git-send-email-hch@lst.de>

While this is a tiny bit more code in terms of LOC it clearly separates
the purposes of both routines and makes them a lot easier to read, and
more importantly to modify for PCI-level affinity assignment which will
only go into the per-VQ version.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/virtio/virtio_pci_common.c | 103 +++++++++++++++++++++----------------
 1 file changed, 59 insertions(+), 44 deletions(-)

diff --git a/drivers/virtio/virtio_pci_common.c b/drivers/virtio/virtio_pci_common.c
index 7c44891..3f166bc 100644
--- a/drivers/virtio/virtio_pci_common.c
+++ b/drivers/virtio/virtio_pci_common.c
@@ -111,8 +111,7 @@ static int vp_request_msix_vector(struct virtio_pci_device *vp_dev, int vec,
 			vp_dev->msix_names[vec], dev_id);
 }
 
-static int vp_request_msix_vectors(struct virtio_device *vdev, int nvectors,
-				   bool per_vq_vectors)
+static int vp_setup_msix_vectors(struct virtio_device *vdev, int nvectors)
 {
 	struct virtio_pci_device *vp_dev = to_vp_device(vdev);
 	unsigned i, v;
@@ -153,13 +152,6 @@ static int vp_request_msix_vectors(struct virtio_device *vdev, int nvectors,
 		goto error;
 	}
 
-	if (!per_vq_vectors) {
-		/* Shared vector for all VQs */
-		err = vp_request_msix_vector(vp_dev, vp_dev->msix_used_vectors++,
-				vp_vring_interrupt, "virtqueues", vp_dev);
-		if (err)
-			goto error;
-	}
 	return 0;
 error:
 	return err;
@@ -263,67 +255,90 @@ void vp_del_vqs(struct virtio_device *vdev)
 	vp_dev->vqs = NULL;
 }
 
-static int vp_try_to_find_vqs(struct virtio_device *vdev, unsigned nvqs,
-			      struct virtqueue *vqs[],
-			      vq_callback_t *callbacks[],
-			      const char * const names[],
-			      bool per_vq_vectors)
+static int vp_find_vqs_msix(struct virtio_device *vdev, unsigned nvqs,
+		struct virtqueue *vqs[], vq_callback_t *callbacks[],
+		const char * const names[])
 {
 	struct virtio_pci_device *vp_dev = to_vp_device(vdev);
+	int i, err, nvectors = 1, allocated_vectors;
 	u16 msix_vec;
-	int i, err, nvectors, allocated_vectors;
 
-	vp_dev->vqs = kmalloc(nvqs * sizeof *vp_dev->vqs, GFP_KERNEL);
+	vp_dev->vqs = kcalloc(nvqs, sizeof(*vp_dev->vqs), GFP_KERNEL);
 	if (!vp_dev->vqs)
 		return -ENOMEM;
 
-	if (per_vq_vectors) {
-		/* Best option: one for change interrupt, one per vq. */
-		nvectors = 1;
-		for (i = 0; i < nvqs; ++i)
-			if (callbacks[i])
-				++nvectors;
-	} else {
-		/* Second best: one for change, shared for all vqs. */
-		nvectors = 2;
+	for (i = 0; i < nvqs; ++i) {
+		if (callbacks[i])
+			nvectors++;
 	}
-
-	err = vp_request_msix_vectors(vdev, nvectors, per_vq_vectors);
+	err = vp_setup_msix_vectors(vdev, nvectors);
 	if (err)
-		goto error_find;
+		goto out_del_vqs;
 
-	vp_dev->per_vq_vectors = per_vq_vectors;
+	vp_dev->per_vq_vectors = true;
 	allocated_vectors = vp_dev->msix_used_vectors;
 	for (i = 0; i < nvqs; ++i) {
-		if (!names[i]) {
-			vqs[i] = NULL;
+		if (!names[i])
 			continue;
-		} else if (!callbacks[i] || !vp_dev->msix_enabled)
+		if (!callbacks[i])
 			msix_vec = VIRTIO_MSI_NO_VECTOR;
-		else if (vp_dev->per_vq_vectors)
-			msix_vec = allocated_vectors++;
 		else
-			msix_vec = VP_MSIX_VQ_VECTOR;
+			msix_vec = allocated_vectors++;
+
 		vqs[i] = vp_setup_vq(vdev, i, callbacks[i], names[i], msix_vec);
 		if (IS_ERR(vqs[i])) {
 			err = PTR_ERR(vqs[i]);
-			goto error_find;
+			goto out_del_vqs;
 		}
-
-		if (!vp_dev->per_vq_vectors || msix_vec == VIRTIO_MSI_NO_VECTOR)
+		if (msix_vec == VIRTIO_MSI_NO_VECTOR)
 			continue;
-
-		/* allocate per-vq irq if available and necessary */
 		err = vp_request_msix_vector(vp_dev, msix_vec, vring_interrupt,
 				names[i], vqs[i]);
 		if (err) {
 			vp_del_vq(vqs[i]);
-			goto error_find;
+			goto out_del_vqs;
 		}
 	}
+
 	return 0;
+out_del_vqs:
+	vp_del_vqs(vdev);
+	return err;
+}
+
+static int vp_find_vqs_msix_shared(struct virtio_device *vdev, unsigned nvqs,
+		struct virtqueue *vqs[], vq_callback_t *callbacks[],
+		const char * const names[])
+{
+	struct virtio_pci_device *vp_dev = to_vp_device(vdev);
+	int i, err;
+
+	vp_dev->vqs = kcalloc(nvqs, sizeof(*vp_dev->vqs), GFP_KERNEL);
+	if (!vp_dev->vqs)
+		return -ENOMEM;
+	err = vp_setup_msix_vectors(vdev, 2);
+	if (err)
+		goto out_del_vqs;
+	err = vp_request_msix_vector(vp_dev, vp_dev->msix_used_vectors++,
+			vp_vring_interrupt, "virtqueues", vp_dev);
+	if (err)
+		goto out_del_vqs;
 
-error_find:
+	vp_dev->per_vq_vectors = false;
+	for (i = 0; i < nvqs; ++i) {
+		if (!names[i])
+			continue;
+		vqs[i] = vp_setup_vq(vdev, i, callbacks[i], names[i],
+				callbacks[i] ? VP_MSIX_VQ_VECTOR :
+					VIRTIO_MSI_NO_VECTOR);
+		if (IS_ERR(vqs[i])) {
+			err = PTR_ERR(vqs[i]);
+			goto out_del_vqs;
+		}
+	}
+
+	return 0;
+out_del_vqs:
 	vp_del_vqs(vdev);
 	return err;
 }
@@ -374,11 +389,11 @@ int vp_find_vqs(struct virtio_device *vdev, unsigned nvqs,
 	int err;
 
 	/* Try MSI-X with one vector per queue. */
-	err = vp_try_to_find_vqs(vdev, nvqs, vqs, callbacks, names, true);
+	err = vp_find_vqs_msix(vdev, nvqs, vqs, callbacks, names);
 	if (!err)
 		return 0;
 	/* Fallback: MSI-X with one vector for config, one shared for queues. */
-	err = vp_try_to_find_vqs(vdev, nvqs, vqs, callbacks, names, false);
+	err = vp_find_vqs_msix_shared(vdev, nvqs, vqs, callbacks, names);
 	if (!err)
 		return 0;
 	/* Finally fall back to regular interrupts. */
-- 
2.1.4

^ permalink raw reply related

* [PATCH 7/7] virtio_pci: clean up interrupt state in struct virtio_pci_device
From: Christoph Hellwig @ 2016-11-06 23:09 UTC (permalink / raw)
  To: mst; +Cc: linux-kernel, virtualization
In-Reply-To: <1478473783-12589-1-git-send-email-hch@lst.de>

Currently there is a lot of interrupt-related state in struct
virtio_pci_device, must of which is not needed:

 - msix_enabled is stored in struct pci_device and can be used there
 - msix_enabled isn't needed as we can call pci_free_irq_vectors even
   for the INTx case and it will do the right thing, as will a call
   to pci_irq_vector for vector 0
 - used_msix_vectors is not needed because we can just subtract the number
   of per-VQ vectors from the total number and get it
 - per_vq_vectors is not needed as we can just check the msix_vec field
   in the VQ for a valid vector

This just laves us with the old msix_vectors field that has been renamed
to irq_vectors and will be maintained for the INTx case as well to make
our life easier.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/virtio/virtio_pci_common.c | 64 ++++++++++++++------------------------
 drivers/virtio/virtio_pci_common.h | 12 ++-----
 drivers/virtio/virtio_pci_legacy.c |  2 +-
 drivers/virtio/virtio_pci_modern.c |  2 +-
 include/uapi/linux/virtio_pci.h    |  2 +-
 5 files changed, 29 insertions(+), 53 deletions(-)

diff --git a/drivers/virtio/virtio_pci_common.c b/drivers/virtio/virtio_pci_common.c
index 3f166bc..4b2af83 100644
--- a/drivers/virtio/virtio_pci_common.c
+++ b/drivers/virtio/virtio_pci_common.c
@@ -33,10 +33,7 @@ void vp_synchronize_vectors(struct virtio_device *vdev)
 	struct virtio_pci_device *vp_dev = to_vp_device(vdev);
 	int i;
 
-	if (vp_dev->intx_enabled)
-		synchronize_irq(vp_dev->pci_dev->irq);
-
-	for (i = 0; i < vp_dev->msix_vectors; ++i)
+	for (i = 0; i < vp_dev->irq_vectors; ++i)
 		synchronize_irq(pci_irq_vector(vp_dev->pci_dev, i));
 }
 
@@ -117,8 +114,6 @@ static int vp_setup_msix_vectors(struct virtio_device *vdev, int nvectors)
 	unsigned i, v;
 	int err = -ENOMEM;
 
-	vp_dev->msix_vectors = nvectors;
-
 	vp_dev->msix_names = kmalloc(nvectors * sizeof *vp_dev->msix_names,
 				     GFP_KERNEL);
 	if (!vp_dev->msix_names)
@@ -137,10 +132,9 @@ static int vp_setup_msix_vectors(struct virtio_device *vdev, int nvectors)
 			PCI_IRQ_MSIX);
 	if (err < 0)
 		goto error;
-	vp_dev->msix_enabled = 1;
 
 	/* Set the vector used for configuration */
-	err = vp_request_msix_vector(vp_dev, vp_dev->msix_used_vectors++,
+	err = vp_request_msix_vector(vp_dev, vp_dev->irq_vectors++,
 			vp_config_changed, "config", vp_dev);
 	if (err)
 		goto error;
@@ -211,46 +205,40 @@ void vp_del_vqs(struct virtio_device *vdev)
 {
 	struct virtio_pci_device *vp_dev = to_vp_device(vdev);
 	struct virtqueue *vq, *n;
+	int irq_vectors = vp_dev->irq_vectors;
 	int i;
 
 	list_for_each_entry_safe(vq, n, &vdev->vqs, list) {
-		if (vp_dev->per_vq_vectors) {
-			int v = vp_dev->vqs[vq->index]->msix_vector;
+		int vec = vp_dev->vqs[vq->index]->msix_vector;
 
-			if (v != VIRTIO_MSI_NO_VECTOR)
-				free_irq(pci_irq_vector(vp_dev->pci_dev, v),
-					vq);
+		if (vec != VIRTIO_MSI_NO_VECTOR && vec != VP_MSIX_VQ_VECTOR) {
+			free_irq(pci_irq_vector(vp_dev->pci_dev, vec), vq);
+			irq_vectors--;
 		}
-		vp_del_vq(vq);
-	}
-	vp_dev->per_vq_vectors = false;
 
-	if (vp_dev->intx_enabled) {
-		free_irq(vp_dev->pci_dev->irq, vp_dev);
-		vp_dev->intx_enabled = 0;
+		vp_del_vq(vq);
 	}
 
-	for (i = 0; i < vp_dev->msix_used_vectors; ++i)
+	for (i = irq_vectors - 1; i >= 0; i--)
 		free_irq(pci_irq_vector(vp_dev->pci_dev, i), vp_dev);
 
-	for (i = 0; i < vp_dev->msix_vectors; i++)
-		if (vp_dev->msix_affinity_masks[i])
+	if (vp_dev->msix_affinity_masks) {
+		for (i = 0; i < vp_dev->irq_vectors; i++)
 			free_cpumask_var(vp_dev->msix_affinity_masks[i]);
+		kfree(vp_dev->msix_affinity_masks);
+		vp_dev->msix_affinity_masks = NULL;
+	}
 
-	if (vp_dev->msix_enabled) {
-		/* Disable the vector used for configuration */
+	/* Disable the vector used for configuration */
+	if (vp_dev->pci_dev->msix_enabled)
 		vp_dev->config_vector(vp_dev, VIRTIO_MSI_NO_VECTOR);
 
-		pci_free_irq_vectors(vp_dev->pci_dev);
-		vp_dev->msix_enabled = 0;
-	}
+	pci_free_irq_vectors(vp_dev->pci_dev);
+	vp_dev->irq_vectors = 0;
 
-	vp_dev->msix_vectors = 0;
-	vp_dev->msix_used_vectors = 0;
 	kfree(vp_dev->msix_names);
 	vp_dev->msix_names = NULL;
-	kfree(vp_dev->msix_affinity_masks);
-	vp_dev->msix_affinity_masks = NULL;
+
 	kfree(vp_dev->vqs);
 	vp_dev->vqs = NULL;
 }
@@ -260,7 +248,7 @@ static int vp_find_vqs_msix(struct virtio_device *vdev, unsigned nvqs,
 		const char * const names[])
 {
 	struct virtio_pci_device *vp_dev = to_vp_device(vdev);
-	int i, err, nvectors = 1, allocated_vectors;
+	int i, err, nvectors = 1;
 	u16 msix_vec;
 
 	vp_dev->vqs = kcalloc(nvqs, sizeof(*vp_dev->vqs), GFP_KERNEL);
@@ -275,15 +263,13 @@ static int vp_find_vqs_msix(struct virtio_device *vdev, unsigned nvqs,
 	if (err)
 		goto out_del_vqs;
 
-	vp_dev->per_vq_vectors = true;
-	allocated_vectors = vp_dev->msix_used_vectors;
 	for (i = 0; i < nvqs; ++i) {
 		if (!names[i])
 			continue;
 		if (!callbacks[i])
 			msix_vec = VIRTIO_MSI_NO_VECTOR;
 		else
-			msix_vec = allocated_vectors++;
+			msix_vec = vp_dev->irq_vectors++;
 
 		vqs[i] = vp_setup_vq(vdev, i, callbacks[i], names[i], msix_vec);
 		if (IS_ERR(vqs[i])) {
@@ -319,12 +305,11 @@ static int vp_find_vqs_msix_shared(struct virtio_device *vdev, unsigned nvqs,
 	err = vp_setup_msix_vectors(vdev, 2);
 	if (err)
 		goto out_del_vqs;
-	err = vp_request_msix_vector(vp_dev, vp_dev->msix_used_vectors++,
+	err = vp_request_msix_vector(vp_dev, vp_dev->irq_vectors++,
 			vp_vring_interrupt, "virtqueues", vp_dev);
 	if (err)
 		goto out_del_vqs;
 
-	vp_dev->per_vq_vectors = false;
 	for (i = 0; i < nvqs; ++i) {
 		if (!names[i])
 			continue;
@@ -358,9 +343,8 @@ static int vp_find_vqs_intx(struct virtio_device *vdev, unsigned nvqs,
 			dev_name(&vdev->dev), vp_dev);
 	if (err)
 		goto out_del_vqs;
+	vp_dev->irq_vectors = 1;
 
-	vp_dev->intx_enabled = 1;
-	vp_dev->per_vq_vectors = false;
 	for (i = 0; i < nvqs; ++i) {
 		if (!names[i]) {
 			vqs[i] = NULL;
@@ -423,7 +407,7 @@ int vp_set_vq_affinity(struct virtqueue *vq, int cpu)
 	if (!vq->callback)
 		return -EINVAL;
 
-	if (vp_dev->msix_enabled) {
+	if (vp_dev->pci_dev->msix_enabled) {
 		mask = vp_dev->msix_affinity_masks[info->msix_vector];
 		irq = pci_irq_vector(vp_dev->pci_dev, info->msix_vector);
 		if (cpu == -1)
diff --git a/drivers/virtio/virtio_pci_common.h b/drivers/virtio/virtio_pci_common.h
index b2f6662..d3e3a1b 100644
--- a/drivers/virtio/virtio_pci_common.h
+++ b/drivers/virtio/virtio_pci_common.h
@@ -82,20 +82,12 @@ struct virtio_pci_device {
 	/* array of all queues for house-keeping */
 	struct virtio_pci_vq_info **vqs;
 
-	/* MSI-X support */
-	int msix_enabled;
-	int intx_enabled;
 	cpumask_var_t *msix_affinity_masks;
 	/* Name strings for interrupts. This size should be enough,
 	 * and I'm too lazy to allocate each name separately. */
 	char (*msix_names)[256];
-	/* Number of available vectors */
-	unsigned msix_vectors;
-	/* Vectors allocated, excluding per-vq vectors if any */
-	unsigned msix_used_vectors;
-
-	/* Whether we have vector per vq */
-	bool per_vq_vectors;
+	/* Total number of interrupt vectors (INTx or MSI-X) */
+	unsigned irq_vectors;
 
 	struct virtqueue *(*setup_vq)(struct virtio_pci_device *vp_dev,
 				      struct virtio_pci_vq_info *info,
diff --git a/drivers/virtio/virtio_pci_legacy.c b/drivers/virtio/virtio_pci_legacy.c
index 8c4e617..25f90f0 100644
--- a/drivers/virtio/virtio_pci_legacy.c
+++ b/drivers/virtio/virtio_pci_legacy.c
@@ -169,7 +169,7 @@ static void del_vq(struct virtio_pci_vq_info *info)
 
 	iowrite16(vq->index, vp_dev->ioaddr + VIRTIO_PCI_QUEUE_SEL);
 
-	if (vp_dev->msix_enabled) {
+	if (vp_dev->pci_dev->msix_enabled) {
 		iowrite16(VIRTIO_MSI_NO_VECTOR,
 			  vp_dev->ioaddr + VIRTIO_MSI_QUEUE_VECTOR);
 		/* Flush the write out to device */
diff --git a/drivers/virtio/virtio_pci_modern.c b/drivers/virtio/virtio_pci_modern.c
index e76bd91..e18d0b0 100644
--- a/drivers/virtio/virtio_pci_modern.c
+++ b/drivers/virtio/virtio_pci_modern.c
@@ -416,7 +416,7 @@ static void del_vq(struct virtio_pci_vq_info *info)
 
 	vp_iowrite16(vq->index, &vp_dev->common->queue_select);
 
-	if (vp_dev->msix_enabled) {
+	if (vp_dev->pci_dev->msix_enabled) {
 		vp_iowrite16(VIRTIO_MSI_NO_VECTOR,
 			     &vp_dev->common->queue_msix_vector);
 		/* Flush the write out to device */
diff --git a/include/uapi/linux/virtio_pci.h b/include/uapi/linux/virtio_pci.h
index 90007a1..15b4385 100644
--- a/include/uapi/linux/virtio_pci.h
+++ b/include/uapi/linux/virtio_pci.h
@@ -79,7 +79,7 @@
  * configuration space */
 #define VIRTIO_PCI_CONFIG_OFF(msix_enabled)	((msix_enabled) ? 24 : 20)
 /* Deprecated: please use VIRTIO_PCI_CONFIG_OFF instead */
-#define VIRTIO_PCI_CONFIG(dev)	VIRTIO_PCI_CONFIG_OFF((dev)->msix_enabled)
+#define VIRTIO_PCI_CONFIG(dev)	VIRTIO_PCI_CONFIG_OFF((dev)->pci_dev->msix_enabled)
 
 /* Virtio ABI version, this must match exactly */
 #define VIRTIO_PCI_ABI_VERSION		0
-- 
2.1.4

^ permalink raw reply related

* RE: [PATCH kernel v4 7/7] virtio-balloon: tell host vm's unused page info
From: Li, Liang Z @ 2016-11-07  3:37 UTC (permalink / raw)
  To: Hansen, Dave, mst@redhat.com
  Cc: virtio-dev@lists.oasis-open.org, kvm@vger.kernel.org,
	linux-kernel@vger.kernel.org, qemu-devel@nongnu.org,
	dgilbert@redhat.com, linux-mm@kvack.org, amit.shah@redhat.com,
	pbonzini@redhat.com, virtualization@lists.linux-foundation.org,
	mgorman@techsingularity.net
In-Reply-To: <b25eac6e-3744-3874-93a8-02f814549adf@intel.com>

> Please squish this and patch 5 together.  It makes no sense to separate them.
> 

OK.

> > +static void send_unused_pages_info(struct virtio_balloon *vb,
> > +				unsigned long req_id)
> > +{
> > +	struct scatterlist sg_in;
> > +	unsigned long pfn = 0, bmap_len, pfn_limit, last_pfn, nr_pfn;
> > +	struct virtqueue *vq = vb->req_vq;
> > +	struct virtio_balloon_resp_hdr *hdr = vb->resp_hdr;
> > +	int ret = 1, used_nr_bmap = 0, i;
> > +
> > +	if (virtio_has_feature(vb->vdev,
> VIRTIO_BALLOON_F_PAGE_BITMAP) &&
> > +		vb->nr_page_bmap == 1)
> > +		extend_page_bitmap(vb);
> > +
> > +	pfn_limit = PFNS_PER_BMAP * vb->nr_page_bmap;
> > +	mutex_lock(&vb->balloon_lock);
> > +	last_pfn = get_max_pfn();
> > +
> > +	while (ret) {
> > +		clear_page_bitmap(vb);
> > +		ret = get_unused_pages(pfn, pfn + pfn_limit, vb-
> >page_bitmap,
> > +			 PFNS_PER_BMAP, vb->nr_page_bmap);
> 
> This changed the underlying data structure without changing the way that
> the structure is populated.
> 
> This algorithm picks a "PFNS_PER_BMAP * vb->nr_page_bmap"-sized set of
> pfns, allocates a bitmap for them, the loops through all zones looking for
> pages in any free list that are in that range.
> 
> Unpacking all the indirection, it looks like this:
> 
> for (pfn = 0; pfn < get_max_pfn(); pfn += BITMAP_SIZE_IN_PFNS)
> 	for_each_populated_zone(zone)
> 		for_each_migratetype_order(order, t)
> 			list_for_each(..., &zone->free_area[order])...
> 
> Let's say we do a 32k bitmap that can hold ~1M pages.  That's 4GB of RAM.
> On a 1TB system, that's 256 passes through the top-level loop.
> The bottom-level lists have tens of thousands of pages in them, even on my
> laptop.  Only 1/256 of these pages will get consumed in a given pass.
> 
Your description is not exactly.
A 32k bitmap is used only when there is few free memory left in the system and when 
the extend_page_bitmap() failed to allocate more memory for the bitmap. Or dozens of 
32k split bitmap will be used, this version limit the bitmap count to 32, it means we can use
at most 32*32 kB for the bitmap, which can cover 128GB for RAM. We can increase the bitmap
count limit to a larger value if 32 is not big enough.

> That's an awfully inefficient way of doing it.  This patch essentially changed
> the data structure without changing the algorithm to populate it.
> 
> Please change the *algorithm* to use the new data structure efficiently.
>  Such a change would only do a single pass through each freelist, and would
> choose whether to use the extent-based (pfn -> range) or bitmap-based
> approach based on the contents of the free lists.

Save the free page info to a raw bitmap first and then process the raw bitmap to
get the proper ' extent-based ' and  'bitmap-based' is the most efficient way I can 
come up with to save the virtio data transmission.  Do you have some better idea?


In the QEMU, no matter how we encode the bitmap, the raw format bitmap will be
used in the end.  But what I did in this version is:
   kernel: get the raw bitmap  --> encode the bitmap
   QEMU: decode the bitmap --> get the raw bitmap

Is it worth to do this kind of job here? we can save the virtio data transmission, but at the
same time, we did extra work.

It seems the benefit we get for this feature is not as big as that in fast balloon inflating/deflating.
> 
> You should not be using get_max_pfn().  Any patch set that continues to use
> it is not likely to be using a proper algorithm.

Do you have any suggestion about how to avoid it?

Thanks!
Liang

^ permalink raw reply

* BUG: 'list_empty(&vgdev->free_vbufs)' is true!
From: Jiri Slaby @ 2016-11-07  8:43 UTC (permalink / raw)
  To: Michael S. Tsirkin, virtualization, Linux kernel mailing list

Hi,

I can relatively easily reproduce this bug:
BUG: 'list_empty(&vgdev->free_vbufs)' is true!
------------[ cut here ]------------
kernel BUG at /home/latest/linux/drivers/gpu/drm/virtio/virtgpu_vq.c:130!
invalid opcode: 0000 [#1] PREEMPT SMP KASAN
Modules linked in:
CPU: 1 PID: 355 Comm: kworker/1:2 Not tainted 4.9.0-rc2-next-20161028+ #32
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS
rel-1.9.3-0-ge2fc41e-prebuilt.qemu-project.org 04/01/2014
Workqueue: events drm_fb_helper_dirty_work
task: ffff88007b124980 task.stack: ffff88007b8a0000
RIP: 0010:virtio_gpu_get_vbuf+0x32e/0x630
RSP: 0018:ffff88007b8a78c0 EFLAGS: 00010286
RAX: 000000000000002e RBX: 1ffff1000f714f1d RCX: 0000000000000000
RDX: 000000000000002e RSI: 0000000000000001 RDI: ffffed000f714f0e
RBP: ffff88007b8a7970 R08: 0000000000000001 R09: 0000000000000000
R10: 0000000000000002 R11: 0000000000000001 R12: 0000000000000030
R13: ffff88007caeaba8 R14: 0000000000000018 R15: ffff88007cae0000
FS:  0000000000000000(0000) GS:ffff88007dc80000(0000) knlGS:0000000000000000
CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 0000000000601028 CR3: 000000007740d000 CR4: 00000000000006e0
Call Trace:
Code: df 48 89 fa 48 c1 ea 03 80 3c 02 00 0f 85 bb 01 00 00 4c 89 69 e8
eb 9e 48 c7 c6 e0 d2 d1 83 48 c7 c7 20 d3 d1 83 e8 6c fb 04 ff <0f> 0b
48 c7 c7 a0 fb b0 85 e8 09 95 86 ff 48 c7 c6 c0 d3 d1 83
RIP: virtio_gpu_get_vbuf+0x32e/0x630 RSP: ffff88007b8a78c0


There is no stacktrace, as the kernel starts panicing all over the place
during its generation. Any ideas?

thanks,
-- 
js
suse labs

^ permalink raw reply

* Re: [PATCH] virtio-net: drop legacy features in virtio 1 mode
From: Jason Wang @ 2016-11-07  9:17 UTC (permalink / raw)
  To: Michael S. Tsirkin, linux-kernel; +Cc: netdev, stable, virtualization
In-Reply-To: <1478256865-29003-1-git-send-email-mst@redhat.com>



On 2016年11月04日 18:55, Michael S. Tsirkin wrote:
> Virtio 1.0 spec says VIRTIO_F_ANY_LAYOUT and VIRTIO_NET_F_GSO are
> legacy-only feature bits. Do not negotiate them in virtio 1 mode.  Note
> this is a spec violation so we need to backport it to stable/downstream
> kernels.
>
> Cc: stable@vger.kernel.org
> Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
> ---
>   drivers/net/virtio_net.c | 30 ++++++++++++++++++++----------
>   1 file changed, 20 insertions(+), 10 deletions(-)
>
> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> index 7a00365..b19fb4d 100644
> --- a/drivers/net/virtio_net.c
> +++ b/drivers/net/virtio_net.c
> @@ -2089,23 +2089,33 @@ static struct virtio_device_id id_table[] = {
>   	{ 0 },
>   };
>   
> +#define VIRTNET_FEATURES \
> +	VIRTIO_NET_F_CSUM, VIRTIO_NET_F_GUEST_CSUM, \
> +	VIRTIO_NET_F_MAC, \
> +	VIRTIO_NET_F_HOST_TSO4, VIRTIO_NET_F_HOST_UFO, VIRTIO_NET_F_HOST_TSO6, \
> +	VIRTIO_NET_F_HOST_ECN, VIRTIO_NET_F_GUEST_TSO4, VIRTIO_NET_F_GUEST_TSO6, \
> +	VIRTIO_NET_F_GUEST_ECN, VIRTIO_NET_F_GUEST_UFO, \
> +	VIRTIO_NET_F_MRG_RXBUF, VIRTIO_NET_F_STATUS, VIRTIO_NET_F_CTRL_VQ, \
> +	VIRTIO_NET_F_CTRL_RX, VIRTIO_NET_F_CTRL_VLAN, \
> +	VIRTIO_NET_F_GUEST_ANNOUNCE, VIRTIO_NET_F_MQ, \
> +	VIRTIO_NET_F_CTRL_MAC_ADDR, \
> +	VIRTIO_NET_F_MTU
> +
>   static unsigned int features[] = {
> -	VIRTIO_NET_F_CSUM, VIRTIO_NET_F_GUEST_CSUM,
> -	VIRTIO_NET_F_GSO, VIRTIO_NET_F_MAC,
> -	VIRTIO_NET_F_HOST_TSO4, VIRTIO_NET_F_HOST_UFO, VIRTIO_NET_F_HOST_TSO6,
> -	VIRTIO_NET_F_HOST_ECN, VIRTIO_NET_F_GUEST_TSO4, VIRTIO_NET_F_GUEST_TSO6,
> -	VIRTIO_NET_F_GUEST_ECN, VIRTIO_NET_F_GUEST_UFO,
> -	VIRTIO_NET_F_MRG_RXBUF, VIRTIO_NET_F_STATUS, VIRTIO_NET_F_CTRL_VQ,
> -	VIRTIO_NET_F_CTRL_RX, VIRTIO_NET_F_CTRL_VLAN,
> -	VIRTIO_NET_F_GUEST_ANNOUNCE, VIRTIO_NET_F_MQ,
> -	VIRTIO_NET_F_CTRL_MAC_ADDR,
> +	VIRTNET_FEATURES,
> +};
> +
> +static unsigned int features_legacy[] = {
> +	VIRTNET_FEATURES,
> +	VIRTIO_NET_F_GSO,
>   	VIRTIO_F_ANY_LAYOUT,
> -	VIRTIO_NET_F_MTU,
>   };
>   
>   static struct virtio_driver virtio_net_driver = {
>   	.feature_table = features,
>   	.feature_table_size = ARRAY_SIZE(features),
> +	.feature_table_legacy = features_legacy,
> +	.feature_table_size_legacy = ARRAY_SIZE(features_legacy),
>   	.driver.name =	KBUILD_MODNAME,
>   	.driver.owner =	THIS_MODULE,
>   	.id_table =	id_table,

Acked-by: Jason Wang <jasowang@redhat.com>
_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox