Linux virtualization list
 help / color / mirror / Atom feed
* [PATCH v5 4/6] pv-qspinlock: powerpc support pv-qspinlock
From: Pan Xinhui @ 2016-06-02  9:26 UTC (permalink / raw)
  To: linux-kernel, linuxppc-dev, virtualization
  Cc: Pan Xinhui, peterz, benh, waiman.long, mingo, paulus, mpe,
	paulmck
In-Reply-To: <1464859590-5292-1-git-send-email-xinhui.pan@linux.vnet.ibm.com>

As we need let pv-qspinlock-kernel run on all environment which might
have no powervm, we should runtime choose which qspinlock version to
use.

The default pv-qspinlock use native version. pv_lock initialization
should be done in bootstage with irq disabled. And if there is PHYP,
restore pv_lock_ops callbacks to pv version.

There is also a hash table, we store cpu number into it and the key is
lock. So everytime pv_wait can know who is the lock holder by searching
the lock. Also store the lock in a per_cpu struct, and remove it when we
own the lock. pv_wait need know which lock we are spinning on.

Signed-off-by: Pan Xinhui <xinhui.pan@linux.vnet.ibm.com>
---
 arch/powerpc/include/asm/qspinlock.h               |  15 +++
 arch/powerpc/include/asm/qspinlock_paravirt.h      |  38 +++++++
 .../powerpc/include/asm/qspinlock_paravirt_types.h |  13 +++
 arch/powerpc/kernel/paravirt.c                     | 121 +++++++++++++++++++++
 arch/powerpc/platforms/pseries/setup.c             |   5 +
 5 files changed, 192 insertions(+)
 create mode 100644 arch/powerpc/include/asm/qspinlock_paravirt.h
 create mode 100644 arch/powerpc/include/asm/qspinlock_paravirt_types.h
 create mode 100644 arch/powerpc/kernel/paravirt.c

diff --git a/arch/powerpc/include/asm/qspinlock.h b/arch/powerpc/include/asm/qspinlock.h
index fc83cd2..61a0d00 100644
--- a/arch/powerpc/include/asm/qspinlock.h
+++ b/arch/powerpc/include/asm/qspinlock.h
@@ -16,10 +16,25 @@ static inline void native_queued_spin_unlock(struct qspinlock *lock)
 	smp_store_release(locked, 0);
 }
 
+#ifdef CONFIG_PARAVIRT_SPINLOCKS
+
+#include <asm/qspinlock_paravirt.h>
+
+static inline void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val)
+{
+	pv_queued_spin_lock(lock, val);
+}
+
+static inline void queued_spin_unlock(struct qspinlock *lock)
+{
+	pv_queued_spin_unlock(lock);
+}
+#else
 static inline void queued_spin_unlock(struct qspinlock *lock)
 {
 	native_queued_spin_unlock(lock);
 }
+#endif
 
 #include <asm-generic/qspinlock.h>
 
diff --git a/arch/powerpc/include/asm/qspinlock_paravirt.h b/arch/powerpc/include/asm/qspinlock_paravirt.h
new file mode 100644
index 0000000..cd17a79
--- /dev/null
+++ b/arch/powerpc/include/asm/qspinlock_paravirt.h
@@ -0,0 +1,38 @@
+#ifndef CONFIG_PARAVIRT_SPINLOCKS
+#error "do not include this file"
+#endif
+
+#ifndef _ASM_QSPINLOCK_PARAVIRT_H
+#define _ASM_QSPINLOCK_PARAVIRT_H
+
+#include  <asm/qspinlock_paravirt_types.h>
+
+extern void pv_lock_init(void);
+extern void native_queued_spin_lock_slowpath(struct qspinlock *lock, u32 val);
+extern void __pv_init_lock_hash(void);
+extern void __pv_queued_spin_lock_slowpath(struct qspinlock *lock, u32 val);
+extern void __pv_queued_spin_unlock(struct qspinlock *lock);
+
+static inline void pv_queued_spin_lock(struct qspinlock *lock, u32 val)
+{
+	CLEAR_IO_SYNC;
+	pv_lock_op.lock(lock, val);
+}
+
+static inline void pv_queued_spin_unlock(struct qspinlock *lock)
+{
+	SYNC_IO;
+	pv_lock_op.unlock(lock);
+}
+
+static inline void pv_wait(u8 *ptr, u8 val, int lockcpu)
+{
+	pv_lock_op.wait(ptr, val, lockcpu);
+}
+
+static inline void pv_kick(int cpu)
+{
+	pv_lock_op.kick(cpu);
+}
+
+#endif
diff --git a/arch/powerpc/include/asm/qspinlock_paravirt_types.h b/arch/powerpc/include/asm/qspinlock_paravirt_types.h
new file mode 100644
index 0000000..e1fdeb0
--- /dev/null
+++ b/arch/powerpc/include/asm/qspinlock_paravirt_types.h
@@ -0,0 +1,13 @@
+#ifndef _ASM_QSPINLOCK_PARAVIRT_TYPES_H
+#define _ASM_QSPINLOCK_PARAVIRT_TYPES_H
+
+struct pv_lock_ops {
+	void (*lock)(struct qspinlock *lock, u32 val);
+	void (*unlock)(struct qspinlock *lock);
+	void (*wait)(u8 *ptr, u8 val, int cpu);
+	void (*kick)(int cpu);
+};
+
+extern struct pv_lock_ops pv_lock_op;
+
+#endif
diff --git a/arch/powerpc/kernel/paravirt.c b/arch/powerpc/kernel/paravirt.c
new file mode 100644
index 0000000..2e87fa6
--- /dev/null
+++ b/arch/powerpc/kernel/paravirt.c
@@ -0,0 +1,121 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <linux/spinlock.h>
+#include <linux/smp.h>
+#include <linux/hash.h>
+#include <linux/bootmem.h>
+
+#define NUM_LOCK_CPU_ENTRY_SHIFT 16
+#define NUM_LOCK_CPU_ENTRY (1 << NUM_LOCK_CPU_ENTRY_SHIFT)
+#define NUM_LOCKS_PER_CPU 4
+
+static u16 *hash_lock_cpu_ptr;
+
+struct locks_on_cpu {
+	void *l[NUM_LOCKS_PER_CPU];
+	int count;
+};
+
+static DEFINE_PER_CPU(struct locks_on_cpu, node);
+
+static u16 *hash(void *l)
+{
+	int val = hash_ptr(l, NUM_LOCK_CPU_ENTRY_SHIFT);
+
+	return &hash_lock_cpu_ptr[val];
+}
+
+static void __init init_hash(void)
+{
+	int size = NUM_LOCK_CPU_ENTRY * sizeof(*hash_lock_cpu_ptr);
+
+	hash_lock_cpu_ptr = memblock_virt_alloc(size, 0);
+	memset(hash_lock_cpu_ptr, 0, size);
+}
+
+#define lock_get_holder(l)	\
+		((int)*hash(l) - 1)
+
+#define lock_set_holder(l)	\
+		(*hash(l) = raw_smp_processor_id() + 1)
+
+static void *this_cpu_lock(void)
+{
+	struct locks_on_cpu *this_node = this_cpu_ptr(&node);
+	int i = this_node->count - 1;
+
+	return this_node->l[i];
+}
+
+static void cpu_save_lock(void *l)
+{
+	struct locks_on_cpu *this_node = this_cpu_ptr(&node);
+	int i = this_node->count++;
+
+	this_node->l[i] = l;
+}
+
+static void cpu_remove_lock(void *l)
+{
+	this_cpu_dec(node.count);
+}
+
+static void __native_queued_spin_unlock(struct qspinlock *lock)
+{
+	native_queued_spin_unlock(lock);
+}
+
+static void __pv_lock(struct qspinlock *lock, u32 val)
+{
+	cpu_save_lock(lock);
+	__pv_queued_spin_lock_slowpath(lock, val);
+	cpu_remove_lock(lock);
+	lock_set_holder(lock);
+}
+
+static void __pv_unlock(struct qspinlock *lock)
+{
+	__pv_queued_spin_unlock(lock);
+}
+
+static void __pv_wait(u8 *ptr, u8 val, int cpu)
+{
+	void *l = this_cpu_lock();
+
+	HMT_low();
+	while (READ_ONCE(*ptr) == val) {
+		cpu = lock_get_holder(l);
+		__spin_yield_cpu(cpu);
+	}
+	HMT_medium();
+}
+
+static void __pv_kick(int cpu)
+{
+	__spin_wake_cpu(cpu);
+}
+
+struct pv_lock_ops pv_lock_op = {
+	.lock = native_queued_spin_lock_slowpath,
+	.unlock = __native_queued_spin_unlock,
+	.wait = NULL,
+	.kick = NULL,
+};
+EXPORT_SYMBOL(pv_lock_op);
+
+void __init pv_lock_init(void)
+{
+	if (SHARED_PROCESSOR) {
+		init_hash();
+		__pv_init_lock_hash();
+		pv_lock_op.lock = __pv_lock;
+		pv_lock_op.unlock = __pv_unlock;
+		pv_lock_op.wait = __pv_wait;
+		pv_lock_op.kick = __pv_kick;
+	}
+}
diff --git a/arch/powerpc/platforms/pseries/setup.c b/arch/powerpc/platforms/pseries/setup.c
index 9883bc7..928b338 100644
--- a/arch/powerpc/platforms/pseries/setup.c
+++ b/arch/powerpc/platforms/pseries/setup.c
@@ -549,6 +549,11 @@ static void __init pSeries_setup_arch(void)
 				"%ld\n", rc);
 		}
 	}
+
+#ifdef CONFIG_PARAVIRT_SPINLOCKS
+	pv_lock_init();
+#endif
+
 }
 
 static int __init pSeries_init_panel(void)
-- 
2.4.11

^ permalink raw reply related

* [PATCH v5 3/6] powerpc: lib/locks.c: Add cpu yield/wake helper function
From: Pan Xinhui @ 2016-06-02  9:26 UTC (permalink / raw)
  To: linux-kernel, linuxppc-dev, virtualization
  Cc: Pan Xinhui, peterz, benh, waiman.long, mingo, paulus, mpe,
	paulmck
In-Reply-To: <1464859590-5292-1-git-send-email-xinhui.pan@linux.vnet.ibm.com>

pv-qspinlock core has pv_wait/pv_kick which will give a better
performace by yielding and kicking cpu at some cases.
lets support them by adding two corresponding helper functions.

Signed-off-by: Pan Xinhui <xinhui.pan@linux.vnet.ibm.com>
---
 arch/powerpc/include/asm/spinlock.h |  4 ++++
 arch/powerpc/lib/locks.c            | 33 +++++++++++++++++++++++++++++++++
 2 files changed, 37 insertions(+)

diff --git a/arch/powerpc/include/asm/spinlock.h b/arch/powerpc/include/asm/spinlock.h
index 4359ee6..3b65372 100644
--- a/arch/powerpc/include/asm/spinlock.h
+++ b/arch/powerpc/include/asm/spinlock.h
@@ -56,9 +56,13 @@
 /* We only yield to the hypervisor if we are in shared processor mode */
 #define SHARED_PROCESSOR (lppaca_shared_proc(local_paca->lppaca_ptr))
 extern void __spin_yield(arch_spinlock_t *lock);
+extern void __spin_yield_cpu(int cpu);
+extern void __spin_wake_cpu(int cpu);
 extern void __rw_yield(arch_rwlock_t *lock);
 #else /* SPLPAR */
 #define __spin_yield(x)	barrier()
+#define __spin_yield_cpu(x) barrier()
+#define __spin_wake_cpu(x) barrier()
 #define __rw_yield(x)	barrier()
 #define SHARED_PROCESSOR	0
 #endif
diff --git a/arch/powerpc/lib/locks.c b/arch/powerpc/lib/locks.c
index a9ebd71..3a58834 100644
--- a/arch/powerpc/lib/locks.c
+++ b/arch/powerpc/lib/locks.c
@@ -23,6 +23,39 @@
 #include <asm/hvcall.h>
 #include <asm/smp.h>
 
+void __spin_yield_cpu(int cpu)
+{
+	unsigned int holder_cpu = cpu, yield_count;
+
+	if (cpu == -1) {
+		plpar_hcall_norets(H_CONFER, -1, 0);
+		return;
+	}
+	BUG_ON(holder_cpu >= nr_cpu_ids);
+	yield_count = be32_to_cpu(lppaca_of(holder_cpu).yield_count);
+	if ((yield_count & 1) == 0) {
+		/* if target cpu is running, confer slices to lpar*/
+		plpar_hcall_norets(H_CONFER, -1, 0);
+		return;
+	}
+	plpar_hcall_norets(H_CONFER,
+		get_hard_smp_processor_id(holder_cpu), yield_count);
+}
+EXPORT_SYMBOL_GPL(__spin_yield_cpu);
+
+void __spin_wake_cpu(int cpu)
+{
+	unsigned int holder_cpu = cpu, yield_count;
+
+	BUG_ON(holder_cpu >= nr_cpu_ids);
+	yield_count = be32_to_cpu(lppaca_of(holder_cpu).yield_count);
+	if ((yield_count & 1) == 0)
+		return;		/* virtual cpu is currently running */
+	plpar_hcall_norets(H_PROD,
+		get_hard_smp_processor_id(holder_cpu));
+}
+EXPORT_SYMBOL_GPL(__spin_wake_cpu);
+
 #ifndef CONFIG_QUEUED_SPINLOCKS
 void __spin_yield(arch_spinlock_t *lock)
 {
-- 
2.4.11

^ permalink raw reply related

* [PATCH v5 2/6] powerpc: pseries/Kconfig: Add qspinlock build config
From: Pan Xinhui @ 2016-06-02  9:26 UTC (permalink / raw)
  To: linux-kernel, linuxppc-dev, virtualization
  Cc: Pan Xinhui, peterz, benh, waiman.long, mingo, paulus, mpe,
	paulmck
In-Reply-To: <1464859590-5292-1-git-send-email-xinhui.pan@linux.vnet.ibm.com>

pseries will use qspinlock by default.

Signed-off-by: Pan Xinhui <xinhui.pan@linux.vnet.ibm.com>
---
 arch/powerpc/platforms/pseries/Kconfig | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/powerpc/platforms/pseries/Kconfig b/arch/powerpc/platforms/pseries/Kconfig
index bec90fb..f669323 100644
--- a/arch/powerpc/platforms/pseries/Kconfig
+++ b/arch/powerpc/platforms/pseries/Kconfig
@@ -21,6 +21,7 @@ config PPC_PSERIES
 	select HOTPLUG_CPU if SMP
 	select ARCH_RANDOM
 	select PPC_DOORBELL
+	select ARCH_USE_QUEUED_SPINLOCKS
 	default y
 
 config PPC_SPLPAR
-- 
2.4.11

^ permalink raw reply related

* [PATCH v5 1/6] qspinlock: powerpc support qspinlock
From: Pan Xinhui @ 2016-06-02  9:26 UTC (permalink / raw)
  To: linux-kernel, linuxppc-dev, virtualization
  Cc: Pan Xinhui, peterz, benh, waiman.long, mingo, paulus, mpe,
	paulmck
In-Reply-To: <1464859590-5292-1-git-send-email-xinhui.pan@linux.vnet.ibm.com>

Base code to enable qspinlock on powerpc. this patch add some #ifdef
here and there. Although there is no paravirt related code, we can
successfully build a qspinlock kernel after apply this patch.

Signed-off-by: Pan Xinhui <xinhui.pan@linux.vnet.ibm.com>
---
 arch/powerpc/include/asm/qspinlock.h      | 26 ++++++++++++++++++++++++++
 arch/powerpc/include/asm/spinlock.h       | 27 +++++++++++++++------------
 arch/powerpc/include/asm/spinlock_types.h |  4 ++++
 arch/powerpc/lib/locks.c                  |  4 ++++
 4 files changed, 49 insertions(+), 12 deletions(-)
 create mode 100644 arch/powerpc/include/asm/qspinlock.h

diff --git a/arch/powerpc/include/asm/qspinlock.h b/arch/powerpc/include/asm/qspinlock.h
new file mode 100644
index 0000000..fc83cd2
--- /dev/null
+++ b/arch/powerpc/include/asm/qspinlock.h
@@ -0,0 +1,26 @@
+#ifndef _ASM_POWERPC_QSPINLOCK_H
+#define _ASM_POWERPC_QSPINLOCK_H
+
+#include <asm-generic/qspinlock_types.h>
+
+#define SPIN_THRESHOLD (1 << 15)
+#define queued_spin_unlock queued_spin_unlock
+
+static inline void native_queued_spin_unlock(struct qspinlock *lock)
+{
+	u8 *locked = (u8 *)lock;
+#ifdef __BIG_ENDIAN
+	locked += 3;
+#endif
+	/* no load/store can be across the unlock()*/
+	smp_store_release(locked, 0);
+}
+
+static inline void queued_spin_unlock(struct qspinlock *lock)
+{
+	native_queued_spin_unlock(lock);
+}
+
+#include <asm-generic/qspinlock.h>
+
+#endif /* _ASM_POWERPC_QSPINLOCK_H */
diff --git a/arch/powerpc/include/asm/spinlock.h b/arch/powerpc/include/asm/spinlock.h
index 523673d..4359ee6 100644
--- a/arch/powerpc/include/asm/spinlock.h
+++ b/arch/powerpc/include/asm/spinlock.h
@@ -52,6 +52,20 @@
 #define SYNC_IO
 #endif
 
+#if defined(CONFIG_PPC_SPLPAR)
+/* We only yield to the hypervisor if we are in shared processor mode */
+#define SHARED_PROCESSOR (lppaca_shared_proc(local_paca->lppaca_ptr))
+extern void __spin_yield(arch_spinlock_t *lock);
+extern void __rw_yield(arch_rwlock_t *lock);
+#else /* SPLPAR */
+#define __spin_yield(x)	barrier()
+#define __rw_yield(x)	barrier()
+#define SHARED_PROCESSOR	0
+#endif
+
+#ifdef CONFIG_QUEUED_SPINLOCKS
+#include <asm/qspinlock.h>
+#else
 static __always_inline int arch_spin_value_unlocked(arch_spinlock_t lock)
 {
 	return lock.slock == 0;
@@ -106,18 +120,6 @@ static inline int arch_spin_trylock(arch_spinlock_t *lock)
  * held.  Conveniently, we have a word in the paca that holds this
  * value.
  */
-
-#if defined(CONFIG_PPC_SPLPAR)
-/* We only yield to the hypervisor if we are in shared processor mode */
-#define SHARED_PROCESSOR (lppaca_shared_proc(local_paca->lppaca_ptr))
-extern void __spin_yield(arch_spinlock_t *lock);
-extern void __rw_yield(arch_rwlock_t *lock);
-#else /* SPLPAR */
-#define __spin_yield(x)	barrier()
-#define __rw_yield(x)	barrier()
-#define SHARED_PROCESSOR	0
-#endif
-
 static inline void arch_spin_lock(arch_spinlock_t *lock)
 {
 	CLEAR_IO_SYNC;
@@ -169,6 +171,7 @@ extern void arch_spin_unlock_wait(arch_spinlock_t *lock);
 	do { while (arch_spin_is_locked(lock)) cpu_relax(); } while (0)
 #endif
 
+#endif /* !CONFIG_QUEUED_SPINLOCKS */
 /*
  * Read-write spinlocks, allowing multiple readers
  * but only one writer.
diff --git a/arch/powerpc/include/asm/spinlock_types.h b/arch/powerpc/include/asm/spinlock_types.h
index 2351adc..bd7144e 100644
--- a/arch/powerpc/include/asm/spinlock_types.h
+++ b/arch/powerpc/include/asm/spinlock_types.h
@@ -5,11 +5,15 @@
 # error "please don't include this file directly"
 #endif
 
+#ifdef CONFIG_QUEUED_SPINLOCKS
+#include <asm-generic/qspinlock_types.h>
+#else
 typedef struct {
 	volatile unsigned int slock;
 } arch_spinlock_t;
 
 #define __ARCH_SPIN_LOCK_UNLOCKED	{ 0 }
+#endif
 
 typedef struct {
 	volatile signed int lock;
diff --git a/arch/powerpc/lib/locks.c b/arch/powerpc/lib/locks.c
index f7deebd..a9ebd71 100644
--- a/arch/powerpc/lib/locks.c
+++ b/arch/powerpc/lib/locks.c
@@ -23,6 +23,7 @@
 #include <asm/hvcall.h>
 #include <asm/smp.h>
 
+#ifndef CONFIG_QUEUED_SPINLOCKS
 void __spin_yield(arch_spinlock_t *lock)
 {
 	unsigned int lock_value, holder_cpu, yield_count;
@@ -42,6 +43,7 @@ void __spin_yield(arch_spinlock_t *lock)
 		get_hard_smp_processor_id(holder_cpu), yield_count);
 }
 EXPORT_SYMBOL_GPL(__spin_yield);
+#endif
 
 /*
  * Waiting for a read lock or a write lock on a rwlock...
@@ -69,6 +71,7 @@ void __rw_yield(arch_rwlock_t *rw)
 }
 #endif
 
+#ifndef CONFIG_QUEUED_SPINLOCKS
 void arch_spin_unlock_wait(arch_spinlock_t *lock)
 {
 	smp_mb();
@@ -84,3 +87,4 @@ void arch_spin_unlock_wait(arch_spinlock_t *lock)
 }
 
 EXPORT_SYMBOL(arch_spin_unlock_wait);
+#endif
-- 
2.4.11

^ permalink raw reply related

* [PATCH v5 0/6] powerPC/pSeries use pv-qpsinlock as the default spinlock implemention
From: Pan Xinhui @ 2016-06-02  9:26 UTC (permalink / raw)
  To: linux-kernel, linuxppc-dev, virtualization
  Cc: Pan Xinhui, peterz, benh, waiman.long, mingo, paulus, mpe,
	paulmck
In-Reply-To: <1464859590-5292-1-git-send-email-xinhui.pan@linux.vnet.ibm.com>

change from v4:
	BUG FIX. thanks boqun reporting this issue.
	struct  __qspinlock has different layout in bigendian mahcine.
	native_queued_spin_unlock() may write value to a wrong address. now fix it.
	sorry for not even doing a test on bigendian machine before!!!

change from v3:
	a big change in [PATCH v4 4/6] pv-qspinlock: powerpc support pv-qspinlock
	no other patch changed.
	and the patch cover letter tilte has changed as only pseries may need use pv-qspinlock, not all powerpc.

	1) __pv_wait will not return until *ptr != val as Waiman gives me a tip.
	2) support lock holder serching by storing cpu number into a hash table(implemented as an array)
	This is because lock_stealing hit too much, up to 10%~20% of all the successful lock(), and avoid
	vcpu slices bounce.
	
change from v2:
	__spin_yeild_cpu() will yield slices to lpar if target cpu is running.
	remove unnecessary rmb() in __spin_yield/wake_cpu.
	__pv_wait() will check the *ptr == val.
	some commit message change

change fome v1:
	separate into 6 pathes from one patch
	some minor code changes.

I do several tests on pseries IBM,8408-E8E with 32cpus, 64GB memory, kernel 4.6
benchmark test results are below.

2 perf tests:
perf bench futex hash
perf bench futex lock-pi

_____test________________spinlcok______________pv-qspinlcok_____
|futex hash	|	528572 ops	|	573238 ops	|
|futex lock-pi	|	354 ops		|	352 ops		|

scheduler test:
Test how many loops of schedule() can finish within 10 seconds on all cpus.

_____test________________spinlcok______________pv-qspinlcok_____
|schedule() loops|	340890082 	|	331730973	|

kernel compiling test:
build a default linux kernel image to see how long it took

_____test________________spinlcok______________pv-qspinlcok_____
| compiling takes|	22m 		|	22m		|

some notes:
the performace is as good as current spinlock's. in some case better while some cases worse.
But in some other tests(not listed here), we verify the two spinlock's workloads by perf record&report.
pv-qspinlock is light-weight than current spinlock.
This patch series depends on 2 patches:
[patch]powerpc: Implement {cmp}xchg for u8 and u16
[patch]locking/pvqspinlock: Add lock holder CPU argument to pv_wait() from Waiman

Some other patches in Waiman's "locking/pvqspinlock: Fix missed PV wakeup & support PPC" are not applied for now.

Pan Xinhui (6):
  qspinlock: powerpc support qspinlock
  powerpc: pseries/Kconfig: Add qspinlock build config
  powerpc: lib/locks.c: Add cpu yield/wake helper function
  pv-qspinlock: powerpc support pv-qspinlock
  pv-qspinlock: use cmpxchg_release in __pv_queued_spin_unlock
  powerpc: pseries: Add pv-qspinlock build config/make

 arch/powerpc/include/asm/qspinlock.h               |  41 +++++++
 arch/powerpc/include/asm/qspinlock_paravirt.h      |  38 +++++++
 .../powerpc/include/asm/qspinlock_paravirt_types.h |  13 +++
 arch/powerpc/include/asm/spinlock.h                |  31 ++++--
 arch/powerpc/include/asm/spinlock_types.h          |   4 +
 arch/powerpc/kernel/Makefile                       |   1 +
 arch/powerpc/kernel/paravirt.c                     | 121 +++++++++++++++++++++
 arch/powerpc/lib/locks.c                           |  37 +++++++
 arch/powerpc/platforms/pseries/Kconfig             |   9 ++
 arch/powerpc/platforms/pseries/setup.c             |   5 +
 kernel/locking/qspinlock_paravirt.h                |   2 +-
 11 files changed, 289 insertions(+), 13 deletions(-)
 create mode 100644 arch/powerpc/include/asm/qspinlock.h
 create mode 100644 arch/powerpc/include/asm/qspinlock_paravirt.h
 create mode 100644 arch/powerpc/include/asm/qspinlock_paravirt_types.h
 create mode 100644 arch/powerpc/kernel/paravirt.c

-- 
2.4.11

^ permalink raw reply

* [PATCH v5 0/6] powerPC/pSeries use pv-qpsinlock as the default spinlock implemention
From: Pan Xinhui @ 2016-06-02  9:26 UTC (permalink / raw)
  To: linux-kernel, linuxppc-dev, virtualization
  Cc: Pan Xinhui, peterz, benh, waiman.long, mingo, paulus, mpe,
	paulmck

change from v4:
	BUG FIX. thanks boqun reporting this issue.
	struct  __qspinlock has different layout in bigendian mahcine.
	native_queued_spin_unlock() may write value to a wrong address. now fix it.
	sorry for not even doing a test on bigendian machine before!!!

change from v3:
	a big change in [PATCH v4 4/6] pv-qspinlock: powerpc support pv-qspinlock
	no other patch changed.
	and the patch cover letter tilte has changed as only pseries may need use pv-qspinlock, not all powerpc.

	1) __pv_wait will not return until *ptr != val as Waiman gives me a tip.
	2) support lock holder serching by storing cpu number into a hash table(implemented as an array)
	This is because lock_stealing hit too much, up to 10%~20% of all the successful lock(), and avoid
	vcpu slices bounce.
	
change from v2:
	__spin_yeild_cpu() will yield slices to lpar if target cpu is running.
	remove unnecessary rmb() in __spin_yield/wake_cpu.
	__pv_wait() will check the *ptr == val.
	some commit message change

change fome v1:
	separate into 6 pathes from one patch
	some minor code changes.

I do several tests on pseries IBM,8408-E8E with 32cpus, 64GB memory, kernel 4.6
benchmark test results are below.

2 perf tests:
perf bench futex hash
perf bench futex lock-pi

_____test________________spinlcok______________pv-qspinlcok_____
|futex hash	|	528572 ops	|	573238 ops	|
|futex lock-pi	|	354 ops		|	352 ops		|

scheduler test:
Test how many loops of schedule() can finish within 10 seconds on all cpus.

_____test________________spinlcok______________pv-qspinlcok_____
|schedule() loops|	340890082 	|	331730973	|

kernel compiling test:
build a default linux kernel image to see how long it took

_____test________________spinlcok______________pv-qspinlcok_____
| compiling takes|	22m 		|	22m		|

some notes:
the performace is as good as current spinlock's. in some case better while some cases worse.
But in some other tests(not listed here), we verify the two spinlock's workloads by perf record&report.
pv-qspinlock is light-weight than current spinlock.
This patch series depends on 2 patches:
[patch]powerpc: Implement {cmp}xchg for u8 and u16
[patch]locking/pvqspinlock: Add lock holder CPU argument to pv_wait() from Waiman

Some other patches in Waiman's "locking/pvqspinlock: Fix missed PV wakeup & support PPC" are not applied for now.

Pan Xinhui (6):
  qspinlock: powerpc support qspinlock
  powerpc: pseries/Kconfig: Add qspinlock build config
  powerpc: lib/locks.c: Add cpu yield/wake helper function
  pv-qspinlock: powerpc support pv-qspinlock
  pv-qspinlock: use cmpxchg_release in __pv_queued_spin_unlock
  powerpc: pseries: Add pv-qspinlock build config/make

 arch/powerpc/include/asm/qspinlock.h               |  41 +++++++
 arch/powerpc/include/asm/qspinlock_paravirt.h      |  38 +++++++
 .../powerpc/include/asm/qspinlock_paravirt_types.h |  13 +++
 arch/powerpc/include/asm/spinlock.h                |  31 ++++--
 arch/powerpc/include/asm/spinlock_types.h          |   4 +
 arch/powerpc/kernel/Makefile                       |   1 +
 arch/powerpc/kernel/paravirt.c                     | 121 +++++++++++++++++++++
 arch/powerpc/lib/locks.c                           |  37 +++++++
 arch/powerpc/platforms/pseries/Kconfig             |   9 ++
 arch/powerpc/platforms/pseries/setup.c             |   5 +
 kernel/locking/qspinlock_paravirt.h                |   2 +-
 11 files changed, 289 insertions(+), 13 deletions(-)
 create mode 100644 arch/powerpc/include/asm/qspinlock.h
 create mode 100644 arch/powerpc/include/asm/qspinlock_paravirt.h
 create mode 100644 arch/powerpc/include/asm/qspinlock_paravirt_types.h
 create mode 100644 arch/powerpc/kernel/paravirt.c

-- 
2.4.11

^ permalink raw reply

* [PATCH v5 6/6] powerpc: pseries: Add pv-qspinlock build config/make
From: Pan Xinhui @ 2016-06-02  9:22 UTC (permalink / raw)
  To: linux-kernel, linuxppc-dev, virtualization
  Cc: Pan Xinhui, peterz, benh, waiman.long, mingo, paulus, mpe,
	paulmck
In-Reply-To: <1464859370-5162-1-git-send-email-xinhui.pan@linux.vnet.ibm.com>

pseries has PowerVM support, the default option is Y.

Signed-off-by: Pan Xinhui <xinhui.pan@linux.vnet.ibm.com>
---
 arch/powerpc/kernel/Makefile           | 1 +
 arch/powerpc/platforms/pseries/Kconfig | 8 ++++++++
 2 files changed, 9 insertions(+)

diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile
index 2da380f..ae7c2f1 100644
--- a/arch/powerpc/kernel/Makefile
+++ b/arch/powerpc/kernel/Makefile
@@ -50,6 +50,7 @@ obj-$(CONFIG_PPC_970_NAP)	+= idle_power4.o
 obj-$(CONFIG_PPC_P7_NAP)	+= idle_power7.o
 procfs-y			:= proc_powerpc.o
 obj-$(CONFIG_PROC_FS)		+= $(procfs-y)
+obj-$(CONFIG_PARAVIRT_SPINLOCKS)	+= paravirt.o
 rtaspci-$(CONFIG_PPC64)-$(CONFIG_PCI)	:= rtas_pci.o
 obj-$(CONFIG_PPC_RTAS)		+= rtas.o rtas-rtc.o $(rtaspci-y-y)
 obj-$(CONFIG_PPC_RTAS_DAEMON)	+= rtasd.o
diff --git a/arch/powerpc/platforms/pseries/Kconfig b/arch/powerpc/platforms/pseries/Kconfig
index f669323..46632e4 100644
--- a/arch/powerpc/platforms/pseries/Kconfig
+++ b/arch/powerpc/platforms/pseries/Kconfig
@@ -128,3 +128,11 @@ config HV_PERF_CTRS
 	  systems. 24x7 is available on Power 8 systems.
 
           If unsure, select Y.
+
+config PARAVIRT_SPINLOCKS
+	bool "Paravirtialization support for qspinlock"
+	depends on PPC_SPLPAR && QUEUED_SPINLOCKS
+	default y
+	help
+	  If platform supports virtualization, for example PowerVM, this option
+	  can let guest have a better performace.
-- 
2.4.11

^ permalink raw reply related

* [PATCH v5 5/6] pv-qspinlock: use cmpxchg_release in __pv_queued_spin_unlock
From: Pan Xinhui @ 2016-06-02  9:22 UTC (permalink / raw)
  To: linux-kernel, linuxppc-dev, virtualization
  Cc: Pan Xinhui, peterz, benh, waiman.long, mingo, paulus, mpe,
	paulmck
In-Reply-To: <1464859370-5162-1-git-send-email-xinhui.pan@linux.vnet.ibm.com>

cmpxchg_release is light-wight than cmpxchg, On some arch like ppc,
barrier impact the performace too much.

Suggested-by:  Boqun Feng <boqun.feng@gmail.com>
Signed-off-by: Pan Xinhui <xinhui.pan@linux.vnet.ibm.com>
---
 kernel/locking/qspinlock_paravirt.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h
index a5b1248..2bbffe4 100644
--- a/kernel/locking/qspinlock_paravirt.h
+++ b/kernel/locking/qspinlock_paravirt.h
@@ -614,7 +614,7 @@ __visible void __pv_queued_spin_unlock(struct qspinlock *lock)
 	 * unhash. Otherwise it would be possible to have multiple @lock
 	 * entries, which would be BAD.
 	 */
-	locked = cmpxchg(&l->locked, _Q_LOCKED_VAL, 0);
+	locked = cmpxchg_release(&l->locked, _Q_LOCKED_VAL, 0);
 	if (likely(locked == _Q_LOCKED_VAL))
 		return;
 
-- 
2.4.11

^ permalink raw reply related

* [PATCH v5 4/6] pv-qspinlock: powerpc support pv-qspinlock
From: Pan Xinhui @ 2016-06-02  9:22 UTC (permalink / raw)
  To: linux-kernel, linuxppc-dev, virtualization
  Cc: Pan Xinhui, peterz, benh, waiman.long, mingo, paulus, mpe,
	paulmck
In-Reply-To: <1464859370-5162-1-git-send-email-xinhui.pan@linux.vnet.ibm.com>

As we need let pv-qspinlock-kernel run on all environment which might
have no powervm, we should runtime choose which qspinlock version to
use.

The default pv-qspinlock use native version. pv_lock initialization
should be done in bootstage with irq disabled. And if there is PHYP,
restore pv_lock_ops callbacks to pv version.

There is also a hash table, we store cpu number into it and the key is
lock. So everytime pv_wait can know who is the lock holder by searching
the lock. Also store the lock in a per_cpu struct, and remove it when we
own the lock. pv_wait need know which lock we are spinning on.

Signed-off-by: Pan Xinhui <xinhui.pan@linux.vnet.ibm.com>
---
 arch/powerpc/include/asm/qspinlock.h               |  15 +++
 arch/powerpc/include/asm/qspinlock_paravirt.h      |  38 +++++++
 .../powerpc/include/asm/qspinlock_paravirt_types.h |  13 +++
 arch/powerpc/kernel/paravirt.c                     | 121 +++++++++++++++++++++
 arch/powerpc/platforms/pseries/setup.c             |   5 +
 5 files changed, 192 insertions(+)
 create mode 100644 arch/powerpc/include/asm/qspinlock_paravirt.h
 create mode 100644 arch/powerpc/include/asm/qspinlock_paravirt_types.h
 create mode 100644 arch/powerpc/kernel/paravirt.c

diff --git a/arch/powerpc/include/asm/qspinlock.h b/arch/powerpc/include/asm/qspinlock.h
index fc83cd2..61a0d00 100644
--- a/arch/powerpc/include/asm/qspinlock.h
+++ b/arch/powerpc/include/asm/qspinlock.h
@@ -16,10 +16,25 @@ static inline void native_queued_spin_unlock(struct qspinlock *lock)
 	smp_store_release(locked, 0);
 }
 
+#ifdef CONFIG_PARAVIRT_SPINLOCKS
+
+#include <asm/qspinlock_paravirt.h>
+
+static inline void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val)
+{
+	pv_queued_spin_lock(lock, val);
+}
+
+static inline void queued_spin_unlock(struct qspinlock *lock)
+{
+	pv_queued_spin_unlock(lock);
+}
+#else
 static inline void queued_spin_unlock(struct qspinlock *lock)
 {
 	native_queued_spin_unlock(lock);
 }
+#endif
 
 #include <asm-generic/qspinlock.h>
 
diff --git a/arch/powerpc/include/asm/qspinlock_paravirt.h b/arch/powerpc/include/asm/qspinlock_paravirt.h
new file mode 100644
index 0000000..cd17a79
--- /dev/null
+++ b/arch/powerpc/include/asm/qspinlock_paravirt.h
@@ -0,0 +1,38 @@
+#ifndef CONFIG_PARAVIRT_SPINLOCKS
+#error "do not include this file"
+#endif
+
+#ifndef _ASM_QSPINLOCK_PARAVIRT_H
+#define _ASM_QSPINLOCK_PARAVIRT_H
+
+#include  <asm/qspinlock_paravirt_types.h>
+
+extern void pv_lock_init(void);
+extern void native_queued_spin_lock_slowpath(struct qspinlock *lock, u32 val);
+extern void __pv_init_lock_hash(void);
+extern void __pv_queued_spin_lock_slowpath(struct qspinlock *lock, u32 val);
+extern void __pv_queued_spin_unlock(struct qspinlock *lock);
+
+static inline void pv_queued_spin_lock(struct qspinlock *lock, u32 val)
+{
+	CLEAR_IO_SYNC;
+	pv_lock_op.lock(lock, val);
+}
+
+static inline void pv_queued_spin_unlock(struct qspinlock *lock)
+{
+	SYNC_IO;
+	pv_lock_op.unlock(lock);
+}
+
+static inline void pv_wait(u8 *ptr, u8 val, int lockcpu)
+{
+	pv_lock_op.wait(ptr, val, lockcpu);
+}
+
+static inline void pv_kick(int cpu)
+{
+	pv_lock_op.kick(cpu);
+}
+
+#endif
diff --git a/arch/powerpc/include/asm/qspinlock_paravirt_types.h b/arch/powerpc/include/asm/qspinlock_paravirt_types.h
new file mode 100644
index 0000000..e1fdeb0
--- /dev/null
+++ b/arch/powerpc/include/asm/qspinlock_paravirt_types.h
@@ -0,0 +1,13 @@
+#ifndef _ASM_QSPINLOCK_PARAVIRT_TYPES_H
+#define _ASM_QSPINLOCK_PARAVIRT_TYPES_H
+
+struct pv_lock_ops {
+	void (*lock)(struct qspinlock *lock, u32 val);
+	void (*unlock)(struct qspinlock *lock);
+	void (*wait)(u8 *ptr, u8 val, int cpu);
+	void (*kick)(int cpu);
+};
+
+extern struct pv_lock_ops pv_lock_op;
+
+#endif
diff --git a/arch/powerpc/kernel/paravirt.c b/arch/powerpc/kernel/paravirt.c
new file mode 100644
index 0000000..2e87fa6
--- /dev/null
+++ b/arch/powerpc/kernel/paravirt.c
@@ -0,0 +1,121 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <linux/spinlock.h>
+#include <linux/smp.h>
+#include <linux/hash.h>
+#include <linux/bootmem.h>
+
+#define NUM_LOCK_CPU_ENTRY_SHIFT 16
+#define NUM_LOCK_CPU_ENTRY (1 << NUM_LOCK_CPU_ENTRY_SHIFT)
+#define NUM_LOCKS_PER_CPU 4
+
+static u16 *hash_lock_cpu_ptr;
+
+struct locks_on_cpu {
+	void *l[NUM_LOCKS_PER_CPU];
+	int count;
+};
+
+static DEFINE_PER_CPU(struct locks_on_cpu, node);
+
+static u16 *hash(void *l)
+{
+	int val = hash_ptr(l, NUM_LOCK_CPU_ENTRY_SHIFT);
+
+	return &hash_lock_cpu_ptr[val];
+}
+
+static void __init init_hash(void)
+{
+	int size = NUM_LOCK_CPU_ENTRY * sizeof(*hash_lock_cpu_ptr);
+
+	hash_lock_cpu_ptr = memblock_virt_alloc(size, 0);
+	memset(hash_lock_cpu_ptr, 0, size);
+}
+
+#define lock_get_holder(l)	\
+		((int)*hash(l) - 1)
+
+#define lock_set_holder(l)	\
+		(*hash(l) = raw_smp_processor_id() + 1)
+
+static void *this_cpu_lock(void)
+{
+	struct locks_on_cpu *this_node = this_cpu_ptr(&node);
+	int i = this_node->count - 1;
+
+	return this_node->l[i];
+}
+
+static void cpu_save_lock(void *l)
+{
+	struct locks_on_cpu *this_node = this_cpu_ptr(&node);
+	int i = this_node->count++;
+
+	this_node->l[i] = l;
+}
+
+static void cpu_remove_lock(void *l)
+{
+	this_cpu_dec(node.count);
+}
+
+static void __native_queued_spin_unlock(struct qspinlock *lock)
+{
+	native_queued_spin_unlock(lock);
+}
+
+static void __pv_lock(struct qspinlock *lock, u32 val)
+{
+	cpu_save_lock(lock);
+	__pv_queued_spin_lock_slowpath(lock, val);
+	cpu_remove_lock(lock);
+	lock_set_holder(lock);
+}
+
+static void __pv_unlock(struct qspinlock *lock)
+{
+	__pv_queued_spin_unlock(lock);
+}
+
+static void __pv_wait(u8 *ptr, u8 val, int cpu)
+{
+	void *l = this_cpu_lock();
+
+	HMT_low();
+	while (READ_ONCE(*ptr) == val) {
+		cpu = lock_get_holder(l);
+		__spin_yield_cpu(cpu);
+	}
+	HMT_medium();
+}
+
+static void __pv_kick(int cpu)
+{
+	__spin_wake_cpu(cpu);
+}
+
+struct pv_lock_ops pv_lock_op = {
+	.lock = native_queued_spin_lock_slowpath,
+	.unlock = __native_queued_spin_unlock,
+	.wait = NULL,
+	.kick = NULL,
+};
+EXPORT_SYMBOL(pv_lock_op);
+
+void __init pv_lock_init(void)
+{
+	if (SHARED_PROCESSOR) {
+		init_hash();
+		__pv_init_lock_hash();
+		pv_lock_op.lock = __pv_lock;
+		pv_lock_op.unlock = __pv_unlock;
+		pv_lock_op.wait = __pv_wait;
+		pv_lock_op.kick = __pv_kick;
+	}
+}
diff --git a/arch/powerpc/platforms/pseries/setup.c b/arch/powerpc/platforms/pseries/setup.c
index 9883bc7..928b338 100644
--- a/arch/powerpc/platforms/pseries/setup.c
+++ b/arch/powerpc/platforms/pseries/setup.c
@@ -549,6 +549,11 @@ static void __init pSeries_setup_arch(void)
 				"%ld\n", rc);
 		}
 	}
+
+#ifdef CONFIG_PARAVIRT_SPINLOCKS
+	pv_lock_init();
+#endif
+
 }
 
 static int __init pSeries_init_panel(void)
-- 
2.4.11

^ permalink raw reply related

* [PATCH v5 3/6] powerpc: lib/locks.c: Add cpu yield/wake helper function
From: Pan Xinhui @ 2016-06-02  9:22 UTC (permalink / raw)
  To: linux-kernel, linuxppc-dev, virtualization
  Cc: Pan Xinhui, peterz, benh, waiman.long, mingo, paulus, mpe,
	paulmck
In-Reply-To: <1464859370-5162-1-git-send-email-xinhui.pan@linux.vnet.ibm.com>

pv-qspinlock core has pv_wait/pv_kick which will give a better
performace by yielding and kicking cpu at some cases.
lets support them by adding two corresponding helper functions.

Signed-off-by: Pan Xinhui <xinhui.pan@linux.vnet.ibm.com>
---
 arch/powerpc/include/asm/spinlock.h |  4 ++++
 arch/powerpc/lib/locks.c            | 33 +++++++++++++++++++++++++++++++++
 2 files changed, 37 insertions(+)

diff --git a/arch/powerpc/include/asm/spinlock.h b/arch/powerpc/include/asm/spinlock.h
index 4359ee6..3b65372 100644
--- a/arch/powerpc/include/asm/spinlock.h
+++ b/arch/powerpc/include/asm/spinlock.h
@@ -56,9 +56,13 @@
 /* We only yield to the hypervisor if we are in shared processor mode */
 #define SHARED_PROCESSOR (lppaca_shared_proc(local_paca->lppaca_ptr))
 extern void __spin_yield(arch_spinlock_t *lock);
+extern void __spin_yield_cpu(int cpu);
+extern void __spin_wake_cpu(int cpu);
 extern void __rw_yield(arch_rwlock_t *lock);
 #else /* SPLPAR */
 #define __spin_yield(x)	barrier()
+#define __spin_yield_cpu(x) barrier()
+#define __spin_wake_cpu(x) barrier()
 #define __rw_yield(x)	barrier()
 #define SHARED_PROCESSOR	0
 #endif
diff --git a/arch/powerpc/lib/locks.c b/arch/powerpc/lib/locks.c
index a9ebd71..3a58834 100644
--- a/arch/powerpc/lib/locks.c
+++ b/arch/powerpc/lib/locks.c
@@ -23,6 +23,39 @@
 #include <asm/hvcall.h>
 #include <asm/smp.h>
 
+void __spin_yield_cpu(int cpu)
+{
+	unsigned int holder_cpu = cpu, yield_count;
+
+	if (cpu == -1) {
+		plpar_hcall_norets(H_CONFER, -1, 0);
+		return;
+	}
+	BUG_ON(holder_cpu >= nr_cpu_ids);
+	yield_count = be32_to_cpu(lppaca_of(holder_cpu).yield_count);
+	if ((yield_count & 1) == 0) {
+		/* if target cpu is running, confer slices to lpar*/
+		plpar_hcall_norets(H_CONFER, -1, 0);
+		return;
+	}
+	plpar_hcall_norets(H_CONFER,
+		get_hard_smp_processor_id(holder_cpu), yield_count);
+}
+EXPORT_SYMBOL_GPL(__spin_yield_cpu);
+
+void __spin_wake_cpu(int cpu)
+{
+	unsigned int holder_cpu = cpu, yield_count;
+
+	BUG_ON(holder_cpu >= nr_cpu_ids);
+	yield_count = be32_to_cpu(lppaca_of(holder_cpu).yield_count);
+	if ((yield_count & 1) == 0)
+		return;		/* virtual cpu is currently running */
+	plpar_hcall_norets(H_PROD,
+		get_hard_smp_processor_id(holder_cpu));
+}
+EXPORT_SYMBOL_GPL(__spin_wake_cpu);
+
 #ifndef CONFIG_QUEUED_SPINLOCKS
 void __spin_yield(arch_spinlock_t *lock)
 {
-- 
2.4.11

^ permalink raw reply related

* [PATCH v5 2/6] powerpc: pseries/Kconfig: Add qspinlock build config
From: Pan Xinhui @ 2016-06-02  9:22 UTC (permalink / raw)
  To: linux-kernel, linuxppc-dev, virtualization
  Cc: Pan Xinhui, peterz, benh, waiman.long, mingo, paulus, mpe,
	paulmck
In-Reply-To: <1464859370-5162-1-git-send-email-xinhui.pan@linux.vnet.ibm.com>

pseries will use qspinlock by default.

Signed-off-by: Pan Xinhui <xinhui.pan@linux.vnet.ibm.com>
---
 arch/powerpc/platforms/pseries/Kconfig | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/powerpc/platforms/pseries/Kconfig b/arch/powerpc/platforms/pseries/Kconfig
index bec90fb..f669323 100644
--- a/arch/powerpc/platforms/pseries/Kconfig
+++ b/arch/powerpc/platforms/pseries/Kconfig
@@ -21,6 +21,7 @@ config PPC_PSERIES
 	select HOTPLUG_CPU if SMP
 	select ARCH_RANDOM
 	select PPC_DOORBELL
+	select ARCH_USE_QUEUED_SPINLOCKS
 	default y
 
 config PPC_SPLPAR
-- 
2.4.11

^ permalink raw reply related

* [PATCH v5 1/6] qspinlock: powerpc support qspinlock
From: Pan Xinhui @ 2016-06-02  9:22 UTC (permalink / raw)
  To: linux-kernel, linuxppc-dev, virtualization
  Cc: Pan Xinhui, peterz, benh, waiman.long, mingo, paulus, mpe,
	paulmck
In-Reply-To: <1464859370-5162-1-git-send-email-xinhui.pan@linux.vnet.ibm.com>

Base code to enable qspinlock on powerpc. this patch add some #ifdef
here and there. Although there is no paravirt related code, we can
successfully build a qspinlock kernel after apply this patch.

Signed-off-by: Pan Xinhui <xinhui.pan@linux.vnet.ibm.com>
---
 arch/powerpc/include/asm/qspinlock.h      | 26 ++++++++++++++++++++++++++
 arch/powerpc/include/asm/spinlock.h       | 27 +++++++++++++++------------
 arch/powerpc/include/asm/spinlock_types.h |  4 ++++
 arch/powerpc/lib/locks.c                  |  4 ++++
 4 files changed, 49 insertions(+), 12 deletions(-)
 create mode 100644 arch/powerpc/include/asm/qspinlock.h

diff --git a/arch/powerpc/include/asm/qspinlock.h b/arch/powerpc/include/asm/qspinlock.h
new file mode 100644
index 0000000..fc83cd2
--- /dev/null
+++ b/arch/powerpc/include/asm/qspinlock.h
@@ -0,0 +1,26 @@
+#ifndef _ASM_POWERPC_QSPINLOCK_H
+#define _ASM_POWERPC_QSPINLOCK_H
+
+#include <asm-generic/qspinlock_types.h>
+
+#define SPIN_THRESHOLD (1 << 15)
+#define queued_spin_unlock queued_spin_unlock
+
+static inline void native_queued_spin_unlock(struct qspinlock *lock)
+{
+	u8 *locked = (u8 *)lock;
+#ifdef __BIG_ENDIAN
+	locked += 3;
+#endif
+	/* no load/store can be across the unlock()*/
+	smp_store_release(locked, 0);
+}
+
+static inline void queued_spin_unlock(struct qspinlock *lock)
+{
+	native_queued_spin_unlock(lock);
+}
+
+#include <asm-generic/qspinlock.h>
+
+#endif /* _ASM_POWERPC_QSPINLOCK_H */
diff --git a/arch/powerpc/include/asm/spinlock.h b/arch/powerpc/include/asm/spinlock.h
index 523673d..4359ee6 100644
--- a/arch/powerpc/include/asm/spinlock.h
+++ b/arch/powerpc/include/asm/spinlock.h
@@ -52,6 +52,20 @@
 #define SYNC_IO
 #endif
 
+#if defined(CONFIG_PPC_SPLPAR)
+/* We only yield to the hypervisor if we are in shared processor mode */
+#define SHARED_PROCESSOR (lppaca_shared_proc(local_paca->lppaca_ptr))
+extern void __spin_yield(arch_spinlock_t *lock);
+extern void __rw_yield(arch_rwlock_t *lock);
+#else /* SPLPAR */
+#define __spin_yield(x)	barrier()
+#define __rw_yield(x)	barrier()
+#define SHARED_PROCESSOR	0
+#endif
+
+#ifdef CONFIG_QUEUED_SPINLOCKS
+#include <asm/qspinlock.h>
+#else
 static __always_inline int arch_spin_value_unlocked(arch_spinlock_t lock)
 {
 	return lock.slock == 0;
@@ -106,18 +120,6 @@ static inline int arch_spin_trylock(arch_spinlock_t *lock)
  * held.  Conveniently, we have a word in the paca that holds this
  * value.
  */
-
-#if defined(CONFIG_PPC_SPLPAR)
-/* We only yield to the hypervisor if we are in shared processor mode */
-#define SHARED_PROCESSOR (lppaca_shared_proc(local_paca->lppaca_ptr))
-extern void __spin_yield(arch_spinlock_t *lock);
-extern void __rw_yield(arch_rwlock_t *lock);
-#else /* SPLPAR */
-#define __spin_yield(x)	barrier()
-#define __rw_yield(x)	barrier()
-#define SHARED_PROCESSOR	0
-#endif
-
 static inline void arch_spin_lock(arch_spinlock_t *lock)
 {
 	CLEAR_IO_SYNC;
@@ -169,6 +171,7 @@ extern void arch_spin_unlock_wait(arch_spinlock_t *lock);
 	do { while (arch_spin_is_locked(lock)) cpu_relax(); } while (0)
 #endif
 
+#endif /* !CONFIG_QUEUED_SPINLOCKS */
 /*
  * Read-write spinlocks, allowing multiple readers
  * but only one writer.
diff --git a/arch/powerpc/include/asm/spinlock_types.h b/arch/powerpc/include/asm/spinlock_types.h
index 2351adc..bd7144e 100644
--- a/arch/powerpc/include/asm/spinlock_types.h
+++ b/arch/powerpc/include/asm/spinlock_types.h
@@ -5,11 +5,15 @@
 # error "please don't include this file directly"
 #endif
 
+#ifdef CONFIG_QUEUED_SPINLOCKS
+#include <asm-generic/qspinlock_types.h>
+#else
 typedef struct {
 	volatile unsigned int slock;
 } arch_spinlock_t;
 
 #define __ARCH_SPIN_LOCK_UNLOCKED	{ 0 }
+#endif
 
 typedef struct {
 	volatile signed int lock;
diff --git a/arch/powerpc/lib/locks.c b/arch/powerpc/lib/locks.c
index f7deebd..a9ebd71 100644
--- a/arch/powerpc/lib/locks.c
+++ b/arch/powerpc/lib/locks.c
@@ -23,6 +23,7 @@
 #include <asm/hvcall.h>
 #include <asm/smp.h>
 
+#ifndef CONFIG_QUEUED_SPINLOCKS
 void __spin_yield(arch_spinlock_t *lock)
 {
 	unsigned int lock_value, holder_cpu, yield_count;
@@ -42,6 +43,7 @@ void __spin_yield(arch_spinlock_t *lock)
 		get_hard_smp_processor_id(holder_cpu), yield_count);
 }
 EXPORT_SYMBOL_GPL(__spin_yield);
+#endif
 
 /*
  * Waiting for a read lock or a write lock on a rwlock...
@@ -69,6 +71,7 @@ void __rw_yield(arch_rwlock_t *rw)
 }
 #endif
 
+#ifndef CONFIG_QUEUED_SPINLOCKS
 void arch_spin_unlock_wait(arch_spinlock_t *lock)
 {
 	smp_mb();
@@ -84,3 +87,4 @@ void arch_spin_unlock_wait(arch_spinlock_t *lock)
 }
 
 EXPORT_SYMBOL(arch_spin_unlock_wait);
+#endif
-- 
2.4.11

^ permalink raw reply related

* [PATCH v5 0/6] powerPC/pSeries use pv-qpsinlock as the default spinlock implemention
From: Pan Xinhui @ 2016-06-02  9:22 UTC (permalink / raw)
  To: linux-kernel, linuxppc-dev, virtualization
  Cc: peterz, benh, waiman.long, root, mingo, paulus, mpe, paulmck
In-Reply-To: <1464859370-5162-1-git-send-email-xinhui.pan@linux.vnet.ibm.com>

From: root <root@ltcalpine2-lp13.aus.stglabs.ibm.com>

change from v4:
	BUG FIX. thanks boqun reporting this issue.
	struct  __qspinlock has different layout in bigendian mahcine.
	native_queued_spin_unlock() may write value to a wrong address. now fix it.
	
change from v3:
	a big change in [PATCH v4 4/6] pv-qspinlock: powerpc support pv-qspinlock
	no other patch changed.
	and the patch cover letter tilte has changed as only pseries may need use pv-qspinlock, not all powerpc.

	1) __pv_wait will not return until *ptr != val as Waiman gives me a tip.
	2) support lock holder serching by storing cpu number into a hash table(implemented as an array)
	This is because lock_stealing hit too much, up to 10%~20% of all the successful lock(), and avoid
	vcpu slices bounce.
	
change from v2:
	__spin_yeild_cpu() will yield slices to lpar if target cpu is running.
	remove unnecessary rmb() in __spin_yield/wake_cpu.
	__pv_wait() will check the *ptr == val.
	some commit message change

change fome v1:
	separate into 6 pathes from one patch
	some minor code changes.

I do several tests on pseries IBM,8408-E8E with 32cpus, 64GB memory, kernel 4.6
benchmark test results are below.

2 perf tests:
perf bench futex hash
perf bench futex lock-pi

_____test________________spinlcok______________pv-qspinlcok_____
|futex hash	|	528572 ops	|	573238 ops	|
|futex lock-pi	|	354 ops		|	352 ops		|

scheduler test:
Test how many loops of schedule() can finish within 10 seconds on all cpus.

_____test________________spinlcok______________pv-qspinlcok_____
|schedule() loops|	340890082 	|	331730973	|

kernel compiling test:
build a default linux kernel image to see how long it took

_____test________________spinlcok______________pv-qspinlcok_____
| compiling takes|	22m 		|	22m		|

some notes:
the performace is as good as current spinlock's. in some case better while some cases worse.
But in some other tests(not listed here), we verify the two spinlock's workloads by perf record&report.
pv-qspinlock is light-weight than current spinlock.
This patch series depends on 2 patches:
[patch]powerpc: Implement {cmp}xchg for u8 and u16
[patch]locking/pvqspinlock: Add lock holder CPU argument to pv_wait() from Waiman

Some other patches in Waiman's "locking/pvqspinlock: Fix missed PV wakeup & support PPC" are not applied for now.


Pan Xinhui (6):
  qspinlock: powerpc support qspinlock
  powerpc: pseries/Kconfig: Add qspinlock build config
  powerpc: lib/locks.c: Add cpu yield/wake helper function
  pv-qspinlock: powerpc support pv-qspinlock
  pv-qspinlock: use cmpxchg_release in __pv_queued_spin_unlock
  powerpc: pseries: Add pv-qspinlock build config/make

 arch/powerpc/include/asm/qspinlock.h               |  41 +++++++
 arch/powerpc/include/asm/qspinlock_paravirt.h      |  38 +++++++
 .../powerpc/include/asm/qspinlock_paravirt_types.h |  13 +++
 arch/powerpc/include/asm/spinlock.h                |  31 ++++--
 arch/powerpc/include/asm/spinlock_types.h          |   4 +
 arch/powerpc/kernel/Makefile                       |   1 +
 arch/powerpc/kernel/paravirt.c                     | 121 +++++++++++++++++++++
 arch/powerpc/lib/locks.c                           |  37 +++++++
 arch/powerpc/platforms/pseries/Kconfig             |   9 ++
 arch/powerpc/platforms/pseries/setup.c             |   5 +
 kernel/locking/qspinlock_paravirt.h                |   2 +-
 11 files changed, 289 insertions(+), 13 deletions(-)
 create mode 100644 arch/powerpc/include/asm/qspinlock.h
 create mode 100644 arch/powerpc/include/asm/qspinlock_paravirt.h
 create mode 100644 arch/powerpc/include/asm/qspinlock_paravirt_types.h
 create mode 100644 arch/powerpc/kernel/paravirt.c

-- 
2.4.11

^ permalink raw reply

* [PATCH v5 0/6] powerPC/pSeries use pv-qpsinlock as the default spinlock implemention
From: Pan Xinhui @ 2016-06-02  9:22 UTC (permalink / raw)
  To: linux-kernel, linuxppc-dev, virtualization
  Cc: peterz, benh, waiman.long, root, mingo, paulus, mpe, paulmck

From: root <root@ltcalpine2-lp13.aus.stglabs.ibm.com>

change from v4:
	BUG FIX. thanks boqun reporting this issue.
	struct  __qspinlock has different layout in bigendian mahcine.
	native_queued_spin_unlock() may write value to a wrong address. now fix it.
	
change from v3:
	a big change in [PATCH v4 4/6] pv-qspinlock: powerpc support pv-qspinlock
	no other patch changed.
	and the patch cover letter tilte has changed as only pseries may need use pv-qspinlock, not all powerpc.

	1) __pv_wait will not return until *ptr != val as Waiman gives me a tip.
	2) support lock holder serching by storing cpu number into a hash table(implemented as an array)
	This is because lock_stealing hit too much, up to 10%~20% of all the successful lock(), and avoid
	vcpu slices bounce.
	
change from v2:
	__spin_yeild_cpu() will yield slices to lpar if target cpu is running.
	remove unnecessary rmb() in __spin_yield/wake_cpu.
	__pv_wait() will check the *ptr == val.
	some commit message change

change fome v1:
	separate into 6 pathes from one patch
	some minor code changes.

I do several tests on pseries IBM,8408-E8E with 32cpus, 64GB memory, kernel 4.6
benchmark test results are below.

2 perf tests:
perf bench futex hash
perf bench futex lock-pi

_____test________________spinlcok______________pv-qspinlcok_____
|futex hash	|	528572 ops	|	573238 ops	|
|futex lock-pi	|	354 ops		|	352 ops		|

scheduler test:
Test how many loops of schedule() can finish within 10 seconds on all cpus.

_____test________________spinlcok______________pv-qspinlcok_____
|schedule() loops|	340890082 	|	331730973	|

kernel compiling test:
build a default linux kernel image to see how long it took

_____test________________spinlcok______________pv-qspinlcok_____
| compiling takes|	22m 		|	22m		|

some notes:
the performace is as good as current spinlock's. in some case better while some cases worse.
But in some other tests(not listed here), we verify the two spinlock's workloads by perf record&report.
pv-qspinlock is light-weight than current spinlock.
This patch series depends on 2 patches:
[patch]powerpc: Implement {cmp}xchg for u8 and u16
[patch]locking/pvqspinlock: Add lock holder CPU argument to pv_wait() from Waiman

Some other patches in Waiman's "locking/pvqspinlock: Fix missed PV wakeup & support PPC" are not applied for now.


Pan Xinhui (6):
  qspinlock: powerpc support qspinlock
  powerpc: pseries/Kconfig: Add qspinlock build config
  powerpc: lib/locks.c: Add cpu yield/wake helper function
  pv-qspinlock: powerpc support pv-qspinlock
  pv-qspinlock: use cmpxchg_release in __pv_queued_spin_unlock
  powerpc: pseries: Add pv-qspinlock build config/make

 arch/powerpc/include/asm/qspinlock.h               |  41 +++++++
 arch/powerpc/include/asm/qspinlock_paravirt.h      |  38 +++++++
 .../powerpc/include/asm/qspinlock_paravirt_types.h |  13 +++
 arch/powerpc/include/asm/spinlock.h                |  31 ++++--
 arch/powerpc/include/asm/spinlock_types.h          |   4 +
 arch/powerpc/kernel/Makefile                       |   1 +
 arch/powerpc/kernel/paravirt.c                     | 121 +++++++++++++++++++++
 arch/powerpc/lib/locks.c                           |  37 +++++++
 arch/powerpc/platforms/pseries/Kconfig             |   9 ++
 arch/powerpc/platforms/pseries/setup.c             |   5 +
 kernel/locking/qspinlock_paravirt.h                |   2 +-
 11 files changed, 289 insertions(+), 13 deletions(-)
 create mode 100644 arch/powerpc/include/asm/qspinlock.h
 create mode 100644 arch/powerpc/include/asm/qspinlock_paravirt.h
 create mode 100644 arch/powerpc/include/asm/qspinlock_paravirt_types.h
 create mode 100644 arch/powerpc/kernel/paravirt.c

-- 
2.4.11

^ permalink raw reply

* Re: [PATCH v7 00/12] Support non-lru page migration
From: Minchan Kim @ 2016-06-02  0:36 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Rik van Riel, Sergey Senozhatsky, Rafael Aquini, Jonathan Corbet,
	Chan Gyun Jeong, Hugh Dickins, linux-kernel, dri-devel,
	virtualization, John Einar Reitan, linux-mm, Chulmin Kim,
	Gioh Kim, Konstantin Khlebnikov, Sangseok Lee, Joonsoo Kim,
	Kyeongdon Kim, Naoya Horiguchi, Vlastimil Babka, Mel Gorman
In-Reply-To: <20160601144151.c9e5c560be29cae9a3ff1f1e@linux-foundation.org>

On Wed, Jun 01, 2016 at 02:41:51PM -0700, Andrew Morton wrote:
> On Wed,  1 Jun 2016 08:21:09 +0900 Minchan Kim <minchan@kernel.org> wrote:
> 
> > Recently, I got many reports about perfermance degradation in embedded
> > system(Android mobile phone, webOS TV and so on) and easy fork fail.
> > 
> > The problem was fragmentation caused by zram and GPU driver mainly.
> > With memory pressure, their pages were spread out all of pageblock and
> > it cannot be migrated with current compaction algorithm which supports
> > only LRU pages. In the end, compaction cannot work well so reclaimer
> > shrinks all of working set pages. It made system very slow and even to
> > fail to fork easily which requires order-[2 or 3] allocations.
> > 
> > Other pain point is that they cannot use CMA memory space so when OOM
> > kill happens, I can see many free pages in CMA area, which is not
> > memory efficient. In our product which has big CMA memory, it reclaims
> > zones too exccessively to allocate GPU and zram page although there are
> > lots of free space in CMA so system becomes very slow easily.
> 
> But this isn't presently implemented for GPU drivers or for CMA, yes?

For GPU driver, Gioh implemented but it was proprietary so couldn't
contribute.

For CMA, [zram: use __GFP_MOVABLE for memory allocation] added __GFP_MOVABLE
for zsmalloc page allocation so it can use CMA area automatically now.

> 
> What's the story there?

^ permalink raw reply

* Re: [PATCH v7 00/12] Support non-lru page migration
From: Daniel Vetter @ 2016-06-01 22:40 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Rik van Riel, Sergey Senozhatsky, Rafael Aquini, linux-mm,
	Chan Gyun Jeong, Jonathan Corbet, Hugh Dickins, linux-kernel,
	dri-devel, virtualization, John Einar Reitan, Minchan Kim,
	Chulmin Kim, Gioh Kim, Konstantin Khlebnikov, Sangseok Lee,
	Joonsoo Kim, Kyeongdon Kim, Naoya Horiguchi, Vlastimil Babka,
	Mel Gorman
In-Reply-To: <20160601144151.c9e5c560be29cae9a3ff1f1e@linux-foundation.org>

On Wed, Jun 01, 2016 at 02:41:51PM -0700, Andrew Morton wrote:
> On Wed,  1 Jun 2016 08:21:09 +0900 Minchan Kim <minchan@kernel.org> wrote:
> 
> > Recently, I got many reports about perfermance degradation in embedded
> > system(Android mobile phone, webOS TV and so on) and easy fork fail.
> > 
> > The problem was fragmentation caused by zram and GPU driver mainly.
> > With memory pressure, their pages were spread out all of pageblock and
> > it cannot be migrated with current compaction algorithm which supports
> > only LRU pages. In the end, compaction cannot work well so reclaimer
> > shrinks all of working set pages. It made system very slow and even to
> > fail to fork easily which requires order-[2 or 3] allocations.
> > 
> > Other pain point is that they cannot use CMA memory space so when OOM
> > kill happens, I can see many free pages in CMA area, which is not
> > memory efficient. In our product which has big CMA memory, it reclaims
> > zones too exccessively to allocate GPU and zram page although there are
> > lots of free space in CMA so system becomes very slow easily.
> 
> But this isn't presently implemented for GPU drivers or for CMA, yes?
> 
> What's the story there?

Broken (out-of-tree) drivers that don't allocate their gpu stuff
correctly.  There's piles of drivers that get_user_page all over the place
but then fail to timely get off these pages again. The fix is to get off
those pages again (either by unpinning timely, or registering an
mmu_notifier if the driver wants to keep the pages pinned indefinitely, as
a caching optimization).

At least that's my guess, and iirc it was confirmed first time around this
series showed up.
-Daniel
-- 
Daniel Vetter
Software Engineer, Intel Corporation
http://blog.ffwll.ch

^ permalink raw reply

* Re: [PATCH v7 00/12] Support non-lru page migration
From: Andrew Morton @ 2016-06-01 21:41 UTC (permalink / raw)
  To: Minchan Kim
  Cc: Rik van Riel, Sergey Senozhatsky, Rafael Aquini, Jonathan Corbet,
	Chan Gyun Jeong, Hugh Dickins, linux-kernel, dri-devel,
	virtualization, John Einar Reitan, linux-mm, Chulmin Kim,
	Gioh Kim, Konstantin Khlebnikov, Sangseok Lee, Joonsoo Kim,
	Kyeongdon Kim, Naoya Horiguchi, Vlastimil Babka, Mel Gorman
In-Reply-To: <1464736881-24886-1-git-send-email-minchan@kernel.org>

On Wed,  1 Jun 2016 08:21:09 +0900 Minchan Kim <minchan@kernel.org> wrote:

> Recently, I got many reports about perfermance degradation in embedded
> system(Android mobile phone, webOS TV and so on) and easy fork fail.
> 
> The problem was fragmentation caused by zram and GPU driver mainly.
> With memory pressure, their pages were spread out all of pageblock and
> it cannot be migrated with current compaction algorithm which supports
> only LRU pages. In the end, compaction cannot work well so reclaimer
> shrinks all of working set pages. It made system very slow and even to
> fail to fork easily which requires order-[2 or 3] allocations.
> 
> Other pain point is that they cannot use CMA memory space so when OOM
> kill happens, I can see many free pages in CMA area, which is not
> memory efficient. In our product which has big CMA memory, it reclaims
> zones too exccessively to allocate GPU and zram page although there are
> lots of free space in CMA so system becomes very slow easily.

But this isn't presently implemented for GPU drivers or for CMA, yes?

What's the story there?

^ permalink raw reply

* Re: [edk2] KVM Forum 2016: Call For Participation
From: Paolo Bonzini @ 2016-06-01 14:50 UTC (permalink / raw)
  To: qemu-devel, KVM list, libvir-list, edk2-devel@ml01.01.org,
	seabios@seabios.org, Linux Virtualization
In-Reply-To: <56E1B83D.7070601@redhat.com>



On 10/03/2016 19:09, Paolo Bonzini wrote:
> ===============
> IMPORTANT DATES
> ===============
> Notification: May 27, 2015

On behalf of the program committee, I apologize for the delay in sending
out the notifications.  If you need to know in advance whether your talk
has been accepted, please contact me offlist.

Thanks,

Paolo

> Schedule announced: June 3, 2015
> Event dates: August 24-26, 2016
> 
> Thank you for your interest in KVM. We're looking forward to your
> submissions and seeing you at the KVM Forum 2016 in August!
> 
> -your KVM Forum 2016 Program Committee
> 
> Please contact us with any questions or comments at
> kvm-forum-2016-pc@redhat.com
> 

^ permalink raw reply

* [PATCH V3 2/2] vhost_net: conditionally enable tx polling
From: Jason Wang @ 2016-06-01  5:56 UTC (permalink / raw)
  To: mst, kvm, virtualization, netdev, linux-kernel
In-Reply-To: <1464760594-30326-1-git-send-email-jasowang@redhat.com>

We always poll tx for socket, this is sub optimal since:

- it will be only used when we exceed the sndbuf of the socket.
- since we use two independent polls for tx and vq, this will slightly
  increase the waitqueue traversing time and more important, vhost
  could not benefit from commit
  9e641bdcfa4ef4d6e2fbaa59c1be0ad5d1551fd5 ("net-tun: restructure
  tun_do_read for better sleep/wakeup efficiency") even if we've
  stopped rx polling during handle_rx since tx poll were still left in
  the waitqueue.

Fix this by conditionally enable tx polling only when -EAGAIN were
met.

Test shows about 8% improvement on guest rx pps.

Before: ~1350000
After:  ~1460000

Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 drivers/vhost/net.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 1d3e45f..e75ffcc 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -378,6 +378,7 @@ static void handle_tx(struct vhost_net *net)
 		goto out;
 
 	vhost_disable_notify(&net->dev, vq);
+	vhost_net_disable_vq(net, vq);
 
 	hdr_size = nvq->vhost_hlen;
 	zcopy = nvq->ubufs;
@@ -459,6 +460,8 @@ static void handle_tx(struct vhost_net *net)
 					% UIO_MAXIOV;
 			}
 			vhost_discard_vq_desc(vq, 1);
+			if (err == -EAGAIN)
+				vhost_net_enable_vq(net, vq);
 			break;
 		}
 		if (err != len)
-- 
1.8.3.1

^ permalink raw reply related

* [PATCH V3 1/2] vhost_net: stop polling socket during rx processing
From: Jason Wang @ 2016-06-01  5:56 UTC (permalink / raw)
  To: mst, kvm, virtualization, netdev, linux-kernel
In-Reply-To: <1464760594-30326-1-git-send-email-jasowang@redhat.com>

We don't stop rx polling socket during rx processing, this will lead
unnecessary wakeups from under layer net devices (E.g
sock_def_readable() form tun). Rx will be slowed down in this
way. This patch avoids this by stop polling socket during rx
processing. A small drawback is that this introduces some overheads in
light load case because of the extra start/stop polling, but single
netperf TCP_RR does not notice any change. In a super heavy load case,
e.g using pktgen to inject packet to guest, we get about ~8.8%
improvement on pps:

before: ~1240000 pkt/s
after:  ~1350000 pkt/s

Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 drivers/vhost/net.c | 64 +++++++++++++++++++++++++++--------------------------
 1 file changed, 33 insertions(+), 31 deletions(-)

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index f744eeb..1d3e45f 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -301,6 +301,32 @@ static bool vhost_can_busy_poll(struct vhost_dev *dev,
 	       !vhost_has_work(dev);
 }
 
+static void vhost_net_disable_vq(struct vhost_net *n,
+				 struct vhost_virtqueue *vq)
+{
+	struct vhost_net_virtqueue *nvq =
+		container_of(vq, struct vhost_net_virtqueue, vq);
+	struct vhost_poll *poll = n->poll + (nvq - n->vqs);
+	if (!vq->private_data)
+		return;
+	vhost_poll_stop(poll);
+}
+
+static int vhost_net_enable_vq(struct vhost_net *n,
+				struct vhost_virtqueue *vq)
+{
+	struct vhost_net_virtqueue *nvq =
+		container_of(vq, struct vhost_net_virtqueue, vq);
+	struct vhost_poll *poll = n->poll + (nvq - n->vqs);
+	struct socket *sock;
+
+	sock = vq->private_data;
+	if (!sock)
+		return 0;
+
+	return vhost_poll_start(poll, sock->file);
+}
+
 static int vhost_net_tx_get_vq_desc(struct vhost_net *net,
 				    struct vhost_virtqueue *vq,
 				    struct iovec iov[], unsigned int iov_size,
@@ -613,6 +639,7 @@ static void handle_rx(struct vhost_net *net)
 	if (!sock)
 		goto out;
 	vhost_disable_notify(&net->dev, vq);
+	vhost_net_disable_vq(net, vq);
 
 	vhost_hlen = nvq->vhost_hlen;
 	sock_hlen = nvq->sock_hlen;
@@ -629,7 +656,7 @@ static void handle_rx(struct vhost_net *net)
 					likely(mergeable) ? UIO_MAXIOV : 1);
 		/* On error, stop handling until the next kick. */
 		if (unlikely(headcount < 0))
-			break;
+			goto out;
 		/* On overrun, truncate and discard */
 		if (unlikely(headcount > UIO_MAXIOV)) {
 			iov_iter_init(&msg.msg_iter, READ, vq->iov, 1, 1);
@@ -648,7 +675,7 @@ static void handle_rx(struct vhost_net *net)
 			}
 			/* Nothing new?  Wait for eventfd to tell us
 			 * they refilled. */
-			break;
+			goto out;
 		}
 		/* We don't need to be notified again. */
 		iov_iter_init(&msg.msg_iter, READ, vq->iov, in, vhost_len);
@@ -676,7 +703,7 @@ static void handle_rx(struct vhost_net *net)
 					 &fixup) != sizeof(hdr)) {
 				vq_err(vq, "Unable to write vnet_hdr "
 				       "at addr %p\n", vq->iov->iov_base);
-				break;
+				goto out;
 			}
 		} else {
 			/* Header came from socket; we'll need to patch
@@ -692,7 +719,7 @@ static void handle_rx(struct vhost_net *net)
 				 &fixup) != sizeof num_buffers) {
 			vq_err(vq, "Failed num_buffers write");
 			vhost_discard_vq_desc(vq, headcount);
-			break;
+			goto out;
 		}
 		vhost_add_used_and_signal_n(&net->dev, vq, vq->heads,
 					    headcount);
@@ -701,9 +728,10 @@ static void handle_rx(struct vhost_net *net)
 		total_len += vhost_len;
 		if (unlikely(total_len >= VHOST_NET_WEIGHT)) {
 			vhost_poll_queue(&vq->poll);
-			break;
+			goto out;
 		}
 	}
+	vhost_net_enable_vq(net, vq);
 out:
 	mutex_unlock(&vq->mutex);
 }
@@ -782,32 +810,6 @@ static int vhost_net_open(struct inode *inode, struct file *f)
 	return 0;
 }
 
-static void vhost_net_disable_vq(struct vhost_net *n,
-				 struct vhost_virtqueue *vq)
-{
-	struct vhost_net_virtqueue *nvq =
-		container_of(vq, struct vhost_net_virtqueue, vq);
-	struct vhost_poll *poll = n->poll + (nvq - n->vqs);
-	if (!vq->private_data)
-		return;
-	vhost_poll_stop(poll);
-}
-
-static int vhost_net_enable_vq(struct vhost_net *n,
-				struct vhost_virtqueue *vq)
-{
-	struct vhost_net_virtqueue *nvq =
-		container_of(vq, struct vhost_net_virtqueue, vq);
-	struct vhost_poll *poll = n->poll + (nvq - n->vqs);
-	struct socket *sock;
-
-	sock = vq->private_data;
-	if (!sock)
-		return 0;
-
-	return vhost_poll_start(poll, sock->file);
-}
-
 static struct socket *vhost_net_stop_vq(struct vhost_net *n,
 					struct vhost_virtqueue *vq)
 {
-- 
1.8.3.1

^ permalink raw reply related

* [PATCH V3 0/2] vhost_net polling optimization
From: Jason Wang @ 2016-06-01  5:56 UTC (permalink / raw)
  To: mst, kvm, virtualization, netdev, linux-kernel

Hi:

This series tries to optimize vhost_net polling at two points:

- Stop rx polling for reduicng the unnecessary wakeups during
  handle_rx().
- Conditonally enable tx polling for reducing the unnecessary
  traversing and spinlock touching.

Test shows about 17% improvement on rx pps.

Please review

Changes from V2:
- Don't enable rx vq if we meet an err or rx vq is empty
Changes from V1:
- use vhost_net_disable_vq()/vhost_net_enable_vq() instead of open
  coding.
- Add a new patch for conditionally enable tx polling.

Jason Wang (2):
  vhost_net: stop polling socket during rx processing
  vhost_net: conditionally enable tx polling

 drivers/vhost/net.c | 67 ++++++++++++++++++++++++++++-------------------------
 1 file changed, 36 insertions(+), 31 deletions(-)

-- 
1.8.3.1

^ permalink raw reply

* [PATCH v7 03/12] mm: balloon: use general non-lru movable page feature
From: Minchan Kim @ 2016-05-31 23:21 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Rafael Aquini, Minchan Kim, Vlastimil Babka, linux-kernel,
	virtualization, linux-mm, Gioh Kim, Konstantin Khlebnikov
In-Reply-To: <1464736881-24886-1-git-send-email-minchan@kernel.org>

Now, VM has a feature to migrate non-lru movable pages so
balloon doesn't need custom migration hooks in migrate.c
and compaction.c. Instead, this patch implements
page->mapping->a_ops->{isolate|migrate|putback} functions.

With that, we could remove hooks for ballooning in general
migration functions and make balloon compaction simple.

Cc: virtualization@lists.linux-foundation.org
Cc: Rafael Aquini <aquini@redhat.com>
Cc: Konstantin Khlebnikov <koct9i@gmail.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Gioh Kim <gi-oh.kim@profitbricks.com>
Signed-off-by: Minchan Kim <minchan@kernel.org>
---
 drivers/virtio/virtio_balloon.c    | 54 +++++++++++++++++++---
 include/linux/balloon_compaction.h | 53 +++++++--------------
 include/uapi/linux/magic.h         |  1 +
 mm/balloon_compaction.c            | 94 +++++++-------------------------------
 mm/compaction.c                    |  7 ---
 mm/migrate.c                       | 19 +-------
 mm/vmscan.c                        |  2 +-
 7 files changed, 85 insertions(+), 145 deletions(-)

diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c
index 476c0e3a7150..88d5609375de 100644
--- a/drivers/virtio/virtio_balloon.c
+++ b/drivers/virtio/virtio_balloon.c
@@ -30,6 +30,7 @@
 #include <linux/oom.h>
 #include <linux/wait.h>
 #include <linux/mm.h>
+#include <linux/mount.h>
 
 /*
  * Balloon device works in 4K page units.  So each page is pointed to by
@@ -45,6 +46,10 @@ static int oom_pages = OOM_VBALLOON_DEFAULT_PAGES;
 module_param(oom_pages, int, S_IRUSR | S_IWUSR);
 MODULE_PARM_DESC(oom_pages, "pages to free on OOM");
 
+#ifdef CONFIG_BALLOON_COMPACTION
+static struct vfsmount *balloon_mnt;
+#endif
+
 struct virtio_balloon {
 	struct virtio_device *vdev;
 	struct virtqueue *inflate_vq, *deflate_vq, *stats_vq;
@@ -488,8 +493,26 @@ static int virtballoon_migratepage(struct balloon_dev_info *vb_dev_info,
 
 	put_page(page); /* balloon reference */
 
-	return MIGRATEPAGE_SUCCESS;
+	return 0;
 }
+
+static struct dentry *balloon_mount(struct file_system_type *fs_type,
+		int flags, const char *dev_name, void *data)
+{
+	static const struct dentry_operations ops = {
+		.d_dname = simple_dname,
+	};
+
+	return mount_pseudo(fs_type, "balloon-kvm:", NULL, &ops,
+				BALLOON_KVM_MAGIC);
+}
+
+static struct file_system_type balloon_fs = {
+	.name           = "balloon-kvm",
+	.mount          = balloon_mount,
+	.kill_sb        = kill_anon_super,
+};
+
 #endif /* CONFIG_BALLOON_COMPACTION */
 
 static int virtballoon_probe(struct virtio_device *vdev)
@@ -519,9 +542,6 @@ static int virtballoon_probe(struct virtio_device *vdev)
 	vb->vdev = vdev;
 
 	balloon_devinfo_init(&vb->vb_dev_info);
-#ifdef CONFIG_BALLOON_COMPACTION
-	vb->vb_dev_info.migratepage = virtballoon_migratepage;
-#endif
 
 	err = init_vqs(vb);
 	if (err)
@@ -531,13 +551,33 @@ static int virtballoon_probe(struct virtio_device *vdev)
 	vb->nb.priority = VIRTBALLOON_OOM_NOTIFY_PRIORITY;
 	err = register_oom_notifier(&vb->nb);
 	if (err < 0)
-		goto out_oom_notify;
+		goto out_del_vqs;
+
+#ifdef CONFIG_BALLOON_COMPACTION
+	balloon_mnt = kern_mount(&balloon_fs);
+	if (IS_ERR(balloon_mnt)) {
+		err = PTR_ERR(balloon_mnt);
+		unregister_oom_notifier(&vb->nb);
+		goto out_del_vqs;
+	}
+
+	vb->vb_dev_info.migratepage = virtballoon_migratepage;
+	vb->vb_dev_info.inode = alloc_anon_inode(balloon_mnt->mnt_sb);
+	if (IS_ERR(vb->vb_dev_info.inode)) {
+		err = PTR_ERR(vb->vb_dev_info.inode);
+		kern_unmount(balloon_mnt);
+		unregister_oom_notifier(&vb->nb);
+		vb->vb_dev_info.inode = NULL;
+		goto out_del_vqs;
+	}
+	vb->vb_dev_info.inode->i_mapping->a_ops = &balloon_aops;
+#endif
 
 	virtio_device_ready(vdev);
 
 	return 0;
 
-out_oom_notify:
+out_del_vqs:
 	vdev->config->del_vqs(vdev);
 out_free_vb:
 	kfree(vb);
@@ -571,6 +611,8 @@ static void virtballoon_remove(struct virtio_device *vdev)
 	cancel_work_sync(&vb->update_balloon_stats_work);
 
 	remove_common(vb);
+	if (vb->vb_dev_info.inode)
+		iput(vb->vb_dev_info.inode);
 	kfree(vb);
 }
 
diff --git a/include/linux/balloon_compaction.h b/include/linux/balloon_compaction.h
index 9b0a15d06a4f..c0c430d06a9b 100644
--- a/include/linux/balloon_compaction.h
+++ b/include/linux/balloon_compaction.h
@@ -45,9 +45,10 @@
 #define _LINUX_BALLOON_COMPACTION_H
 #include <linux/pagemap.h>
 #include <linux/page-flags.h>
-#include <linux/migrate.h>
+#include <linux/compaction.h>
 #include <linux/gfp.h>
 #include <linux/err.h>
+#include <linux/fs.h>
 
 /*
  * Balloon device information descriptor.
@@ -62,6 +63,7 @@ struct balloon_dev_info {
 	struct list_head pages;		/* Pages enqueued & handled to Host */
 	int (*migratepage)(struct balloon_dev_info *, struct page *newpage,
 			struct page *page, enum migrate_mode mode);
+	struct inode *inode;
 };
 
 extern struct page *balloon_page_enqueue(struct balloon_dev_info *b_dev_info);
@@ -73,45 +75,19 @@ static inline void balloon_devinfo_init(struct balloon_dev_info *balloon)
 	spin_lock_init(&balloon->pages_lock);
 	INIT_LIST_HEAD(&balloon->pages);
 	balloon->migratepage = NULL;
+	balloon->inode = NULL;
 }
 
 #ifdef CONFIG_BALLOON_COMPACTION
-extern bool balloon_page_isolate(struct page *page);
+extern const struct address_space_operations balloon_aops;
+extern bool balloon_page_isolate(struct page *page,
+				isolate_mode_t mode);
 extern void balloon_page_putback(struct page *page);
-extern int balloon_page_migrate(struct page *newpage,
+extern int balloon_page_migrate(struct address_space *mapping,
+				struct page *newpage,
 				struct page *page, enum migrate_mode mode);
 
 /*
- * __is_movable_balloon_page - helper to perform @page PageBalloon tests
- */
-static inline bool __is_movable_balloon_page(struct page *page)
-{
-	return PageBalloon(page);
-}
-
-/*
- * balloon_page_movable - test PageBalloon to identify balloon pages
- *			  and PagePrivate to check that the page is not
- *			  isolated and can be moved by compaction/migration.
- *
- * As we might return false positives in the case of a balloon page being just
- * released under us, this need to be re-tested later, under the page lock.
- */
-static inline bool balloon_page_movable(struct page *page)
-{
-	return PageBalloon(page) && PagePrivate(page);
-}
-
-/*
- * isolated_balloon_page - identify an isolated balloon page on private
- *			   compaction/migration page lists.
- */
-static inline bool isolated_balloon_page(struct page *page)
-{
-	return PageBalloon(page);
-}
-
-/*
  * balloon_page_insert - insert a page into the balloon's page list and make
  *			 the page->private assignment accordingly.
  * @balloon : pointer to balloon device
@@ -124,7 +100,7 @@ static inline void balloon_page_insert(struct balloon_dev_info *balloon,
 				       struct page *page)
 {
 	__SetPageBalloon(page);
-	SetPagePrivate(page);
+	__SetPageMovable(page, balloon->inode->i_mapping);
 	set_page_private(page, (unsigned long)balloon);
 	list_add(&page->lru, &balloon->pages);
 }
@@ -140,11 +116,14 @@ static inline void balloon_page_insert(struct balloon_dev_info *balloon,
 static inline void balloon_page_delete(struct page *page)
 {
 	__ClearPageBalloon(page);
+	__ClearPageMovable(page);
 	set_page_private(page, 0);
-	if (PagePrivate(page)) {
-		ClearPagePrivate(page);
+	/*
+	 * No touch page.lru field once @page has been isolated
+	 * because VM is using the field.
+	 */
+	if (!PageIsolated(page))
 		list_del(&page->lru);
-	}
 }
 
 /*
diff --git a/include/uapi/linux/magic.h b/include/uapi/linux/magic.h
index 546b38886e11..d829ce63529d 100644
--- a/include/uapi/linux/magic.h
+++ b/include/uapi/linux/magic.h
@@ -80,5 +80,6 @@
 #define BPF_FS_MAGIC		0xcafe4a11
 /* Since UDF 2.01 is ISO 13346 based... */
 #define UDF_SUPER_MAGIC		0x15013346
+#define BALLOON_KVM_MAGIC	0x13661366
 
 #endif /* __LINUX_MAGIC_H__ */
diff --git a/mm/balloon_compaction.c b/mm/balloon_compaction.c
index 57b3e9bd6bc5..da91df50ba31 100644
--- a/mm/balloon_compaction.c
+++ b/mm/balloon_compaction.c
@@ -70,7 +70,7 @@ struct page *balloon_page_dequeue(struct balloon_dev_info *b_dev_info)
 		 */
 		if (trylock_page(page)) {
 #ifdef CONFIG_BALLOON_COMPACTION
-			if (!PagePrivate(page)) {
+			if (PageIsolated(page)) {
 				/* raced with isolation */
 				unlock_page(page);
 				continue;
@@ -106,110 +106,50 @@ EXPORT_SYMBOL_GPL(balloon_page_dequeue);
 
 #ifdef CONFIG_BALLOON_COMPACTION
 
-static inline void __isolate_balloon_page(struct page *page)
+bool balloon_page_isolate(struct page *page, isolate_mode_t mode)
+
 {
 	struct balloon_dev_info *b_dev_info = balloon_page_device(page);
 	unsigned long flags;
 
 	spin_lock_irqsave(&b_dev_info->pages_lock, flags);
-	ClearPagePrivate(page);
 	list_del(&page->lru);
 	b_dev_info->isolated_pages++;
 	spin_unlock_irqrestore(&b_dev_info->pages_lock, flags);
+
+	return true;
 }
 
-static inline void __putback_balloon_page(struct page *page)
+void balloon_page_putback(struct page *page)
 {
 	struct balloon_dev_info *b_dev_info = balloon_page_device(page);
 	unsigned long flags;
 
 	spin_lock_irqsave(&b_dev_info->pages_lock, flags);
-	SetPagePrivate(page);
 	list_add(&page->lru, &b_dev_info->pages);
 	b_dev_info->isolated_pages--;
 	spin_unlock_irqrestore(&b_dev_info->pages_lock, flags);
 }
 
-/* __isolate_lru_page() counterpart for a ballooned page */
-bool balloon_page_isolate(struct page *page)
-{
-	/*
-	 * Avoid burning cycles with pages that are yet under __free_pages(),
-	 * or just got freed under us.
-	 *
-	 * In case we 'win' a race for a balloon page being freed under us and
-	 * raise its refcount preventing __free_pages() from doing its job
-	 * the put_page() at the end of this block will take care of
-	 * release this page, thus avoiding a nasty leakage.
-	 */
-	if (likely(get_page_unless_zero(page))) {
-		/*
-		 * As balloon pages are not isolated from LRU lists, concurrent
-		 * compaction threads can race against page migration functions
-		 * as well as race against the balloon driver releasing a page.
-		 *
-		 * In order to avoid having an already isolated balloon page
-		 * being (wrongly) re-isolated while it is under migration,
-		 * or to avoid attempting to isolate pages being released by
-		 * the balloon driver, lets be sure we have the page lock
-		 * before proceeding with the balloon page isolation steps.
-		 */
-		if (likely(trylock_page(page))) {
-			/*
-			 * A ballooned page, by default, has PagePrivate set.
-			 * Prevent concurrent compaction threads from isolating
-			 * an already isolated balloon page by clearing it.
-			 */
-			if (balloon_page_movable(page)) {
-				__isolate_balloon_page(page);
-				unlock_page(page);
-				return true;
-			}
-			unlock_page(page);
-		}
-		put_page(page);
-	}
-	return false;
-}
-
-/* putback_lru_page() counterpart for a ballooned page */
-void balloon_page_putback(struct page *page)
-{
-	/*
-	 * 'lock_page()' stabilizes the page and prevents races against
-	 * concurrent isolation threads attempting to re-isolate it.
-	 */
-	lock_page(page);
-
-	if (__is_movable_balloon_page(page)) {
-		__putback_balloon_page(page);
-		/* drop the extra ref count taken for page isolation */
-		put_page(page);
-	} else {
-		WARN_ON(1);
-		dump_page(page, "not movable balloon page");
-	}
-	unlock_page(page);
-}
 
 /* move_to_new_page() counterpart for a ballooned page */
-int balloon_page_migrate(struct page *newpage,
-			 struct page *page, enum migrate_mode mode)
+int balloon_page_migrate(struct address_space *mapping,
+		struct page *newpage, struct page *page,
+		enum migrate_mode mode)
 {
 	struct balloon_dev_info *balloon = balloon_page_device(page);
-	int rc = -EAGAIN;
 
 	VM_BUG_ON_PAGE(!PageLocked(page), page);
 	VM_BUG_ON_PAGE(!PageLocked(newpage), newpage);
 
-	if (WARN_ON(!__is_movable_balloon_page(page))) {
-		dump_page(page, "not movable balloon page");
-		return rc;
-	}
+	return balloon->migratepage(balloon, newpage, page, mode);
+}
 
-	if (balloon && balloon->migratepage)
-		rc = balloon->migratepage(balloon, newpage, page, mode);
+const struct address_space_operations balloon_aops = {
+	.migratepage = balloon_page_migrate,
+	.isolate_page = balloon_page_isolate,
+	.putback_page = balloon_page_putback,
+};
+EXPORT_SYMBOL_GPL(balloon_aops);
 
-	return rc;
-}
 #endif /* CONFIG_BALLOON_COMPACTION */
diff --git a/mm/compaction.c b/mm/compaction.c
index a680b52e190b..b7bfdf94b545 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -795,13 +795,6 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
 		 */
 		is_lru = PageLRU(page);
 		if (!is_lru) {
-			if (unlikely(balloon_page_movable(page))) {
-				if (balloon_page_isolate(page)) {
-					/* Successfully isolated */
-					goto isolate_success;
-				}
-			}
-
 			/*
 			 * __PageMovable can return false positive so we need
 			 * to verify it under page_lock.
diff --git a/mm/migrate.c b/mm/migrate.c
index 60abcf379b51..e6daf49e224f 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -170,14 +170,12 @@ void putback_movable_pages(struct list_head *l)
 		list_del(&page->lru);
 		dec_zone_page_state(page, NR_ISOLATED_ANON +
 				page_is_file_cache(page));
-		if (unlikely(isolated_balloon_page(page))) {
-			balloon_page_putback(page);
 		/*
 		 * We isolated non-lru movable page so here we can use
 		 * __PageMovable because LRU page's mapping cannot have
 		 * PAGE_MAPPING_MOVABLE.
 		 */
-		} else if (unlikely(__PageMovable(page))) {
+		if (unlikely(__PageMovable(page))) {
 			VM_BUG_ON_PAGE(!PageIsolated(page), page);
 			lock_page(page);
 			if (PageMovable(page))
@@ -990,18 +988,6 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
 	if (unlikely(!trylock_page(newpage)))
 		goto out_unlock;
 
-	if (unlikely(isolated_balloon_page(page))) {
-		/*
-		 * A ballooned page does not need any special attention from
-		 * physical to virtual reverse mapping procedures.
-		 * Skip any attempt to unmap PTEs or to remap swap cache,
-		 * in order to avoid burning cycles at rmap level, and perform
-		 * the page migration right away (proteced by page lock).
-		 */
-		rc = balloon_page_migrate(newpage, page, mode);
-		goto out_unlock_both;
-	}
-
 	if (unlikely(!is_lru)) {
 		rc = move_to_new_page(newpage, page, mode);
 		goto out_unlock_both;
@@ -1056,8 +1042,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
 	 * list in here.
 	 */
 	if (rc == MIGRATEPAGE_SUCCESS) {
-		if (unlikely(__is_movable_balloon_page(newpage) ||
-				__PageMovable(newpage)))
+		if (unlikely(__PageMovable(newpage)))
 			put_page(newpage);
 		else
 			putback_lru_page(newpage);
diff --git a/mm/vmscan.c b/mm/vmscan.c
index c4a2f4512fca..93ba33789ac6 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1254,7 +1254,7 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone,
 
 	list_for_each_entry_safe(page, next, page_list, lru) {
 		if (page_is_file_cache(page) && !PageDirty(page) &&
-		    !isolated_balloon_page(page)) {
+		    !__PageMovable(page)) {
 			ClearPageActive(page);
 			list_move(&page->lru, &clean_pages);
 		}
-- 
1.9.1

^ permalink raw reply related

* [PATCH v7 02/12] mm: migrate: support non-lru movable page migration
From: Minchan Kim @ 2016-05-31 23:21 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Rik van Riel, Sergey Senozhatsky, Rafael Aquini, Minchan Kim,
	Vlastimil Babka, Jonathan Corbet, Hugh Dickins, linux-kernel,
	dri-devel, virtualization, John Einar Reitan, linux-mm,
	Mel Gorman, Joonsoo Kim, Gioh Kim
In-Reply-To: <1464736881-24886-1-git-send-email-minchan@kernel.org>

We have allowed migration for only LRU pages until now and it was
enough to make high-order pages. But recently, embedded system(e.g.,
webOS, android) uses lots of non-movable pages(e.g., zram, GPU memory)
so we have seen several reports about troubles of small high-order
allocation. For fixing the problem, there were several efforts
(e,g,. enhance compaction algorithm, SLUB fallback to 0-order page,
reserved memory, vmalloc and so on) but if there are lots of
non-movable pages in system, their solutions are void in the long run.

So, this patch is to support facility to change non-movable pages
with movable. For the feature, this patch introduces functions related
to migration to address_space_operations as well as some page flags.

If a driver want to make own pages movable, it should define three functions
which are function pointers of struct address_space_operations.

1. bool (*isolate_page) (struct page *page, isolate_mode_t mode);

What VM expects on isolate_page function of driver is to return *true*
if driver isolates page successfully. On returing true, VM marks the page
as PG_isolated so concurrent isolation in several CPUs skip the page
for isolation. If a driver cannot isolate the page, it should return *false*.

Once page is successfully isolated, VM uses page.lru fields so driver
shouldn't expect to preserve values in that fields.

2. int (*migratepage) (struct address_space *mapping,
		struct page *newpage, struct page *oldpage, enum migrate_mode);

After isolation, VM calls migratepage of driver with isolated page.
The function of migratepage is to move content of the old page to new page
and set up fields of struct page newpage. Keep in mind that you should
indicate to the VM the oldpage is no longer movable via __ClearPageMovable()
under page_lock if you migrated the oldpage successfully and returns 0.
If driver cannot migrate the page at the moment, driver can return -EAGAIN.
On -EAGAIN, VM will retry page migration in a short time because VM interprets
-EAGAIN as "temporal migration failure". On returning any error except -EAGAIN,
VM will give up the page migration without retrying in this time.

Driver shouldn't touch page.lru field VM using in the functions.

3. void (*putback_page)(struct page *);

If migration fails on isolated page, VM should return the isolated page
to the driver so VM calls driver's putback_page with migration failed page.
In this function, driver should put the isolated page back to the own data
structure.

4. non-lru movable page flags

There are two page flags for supporting non-lru movable page.

* PG_movable

Driver should use the below function to make page movable under page_lock.

	void __SetPageMovable(struct page *page, struct address_space *mapping)

It needs argument of address_space for registering migration family functions
which will be called by VM. Exactly speaking, PG_movable is not a real flag of
struct page. Rather than, VM reuses page->mapping's lower bits to represent it.

	#define PAGE_MAPPING_MOVABLE 0x2
	page->mapping = page->mapping | PAGE_MAPPING_MOVABLE;

so driver shouldn't access page->mapping directly. Instead, driver should
use page_mapping which mask off the low two bits of page->mapping so it can get
right struct address_space.

For testing of non-lru movable page, VM supports __PageMovable function.
However, it doesn't guarantee to identify non-lru movable page because
page->mapping field is unified with other variables in struct page.
As well, if driver releases the page after isolation by VM, page->mapping
doesn't have stable value although it has PAGE_MAPPING_MOVABLE
(Look at __ClearPageMovable). But __PageMovable is cheap to catch whether
page is LRU or non-lru movable once the page has been isolated. Because
LRU pages never can have PAGE_MAPPING_MOVABLE in page->mapping. It is also
good for just peeking to test non-lru movable pages before more expensive
checking with lock_page in pfn scanning to select victim.

For guaranteeing non-lru movable page, VM provides PageMovable function.
Unlike __PageMovable, PageMovable functions validates page->mapping and
mapping->a_ops->isolate_page under lock_page. The lock_page prevents sudden
destroying of page->mapping.

Driver using __SetPageMovable should clear the flag via __ClearMovablePage
under page_lock before the releasing the page.

* PG_isolated

To prevent concurrent isolation among several CPUs, VM marks isolated page
as PG_isolated under lock_page. So if a CPU encounters PG_isolated non-lru
movable page, it can skip it. Driver doesn't need to manipulate the flag
because VM will set/clear it automatically. Keep in mind that if driver
sees PG_isolated page, it means the page have been isolated by VM so it
shouldn't touch page.lru field.
PG_isolated is alias with PG_reclaim flag so driver shouldn't use the flag
for own purpose.

Cc: Rik van Riel <riel@redhat.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Hugh Dickins <hughd@google.com>
Cc: Rafael Aquini <aquini@redhat.com>
Cc: virtualization@lists.linux-foundation.org
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: John Einar Reitan <john.reitan@foss.arm.com>
Cc: dri-devel@lists.freedesktop.org
Cc: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Gioh Kim <gi-oh.kim@profitbricks.com>
Signed-off-by: Minchan Kim <minchan@kernel.org>
---
 Documentation/filesystems/Locking |   4 +
 Documentation/filesystems/vfs.txt |  11 +++
 Documentation/vm/page_migration   | 107 ++++++++++++++++++++-
 include/linux/compaction.h        |  17 ++++
 include/linux/fs.h                |   2 +
 include/linux/ksm.h               |   3 +-
 include/linux/migrate.h           |   2 +
 include/linux/mm.h                |   1 +
 include/linux/page-flags.h        |  33 +++++--
 mm/compaction.c                   |  85 +++++++++++++----
 mm/ksm.c                          |   4 +-
 mm/migrate.c                      | 192 ++++++++++++++++++++++++++++++++++----
 mm/page_alloc.c                   |   2 +-
 mm/util.c                         |   6 +-
 14 files changed, 417 insertions(+), 52 deletions(-)

diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking
index af7c030a0368..3991a976cf43 100644
--- a/Documentation/filesystems/Locking
+++ b/Documentation/filesystems/Locking
@@ -195,7 +195,9 @@ unlocks and drops the reference.
 	int (*releasepage) (struct page *, int);
 	void (*freepage)(struct page *);
 	int (*direct_IO)(struct kiocb *, struct iov_iter *iter);
+	bool (*isolate_page) (struct page *, isolate_mode_t);
 	int (*migratepage)(struct address_space *, struct page *, struct page *);
+	void (*putback_page) (struct page *);
 	int (*launder_page)(struct page *);
 	int (*is_partially_uptodate)(struct page *, unsigned long, unsigned long);
 	int (*error_remove_page)(struct address_space *, struct page *);
@@ -219,7 +221,9 @@ invalidatepage:		yes
 releasepage:		yes
 freepage:		yes
 direct_IO:
+isolate_page:		yes
 migratepage:		yes (both)
+putback_page:		yes
 launder_page:		yes
 is_partially_uptodate:	yes
 error_remove_page:	yes
diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt
index 19366fef2652..9d4ae317fdcb 100644
--- a/Documentation/filesystems/vfs.txt
+++ b/Documentation/filesystems/vfs.txt
@@ -591,9 +591,14 @@ struct address_space_operations {
 	int (*releasepage) (struct page *, int);
 	void (*freepage)(struct page *);
 	ssize_t (*direct_IO)(struct kiocb *, struct iov_iter *iter);
+	/* isolate a page for migration */
+	bool (*isolate_page) (struct page *, isolate_mode_t);
 	/* migrate the contents of a page to the specified target */
 	int (*migratepage) (struct page *, struct page *);
+	/* put migration-failed page back to right list */
+	void (*putback_page) (struct page *);
 	int (*launder_page) (struct page *);
+
 	int (*is_partially_uptodate) (struct page *, unsigned long,
 					unsigned long);
 	void (*is_dirty_writeback) (struct page *, bool *, bool *);
@@ -739,6 +744,10 @@ struct address_space_operations {
         and transfer data directly between the storage and the
         application's address space.
 
+  isolate_page: Called by the VM when isolating a movable non-lru page.
+	If page is successfully isolated, VM marks the page as PG_isolated
+	via __SetPageIsolated.
+
   migrate_page:  This is used to compact the physical memory usage.
         If the VM wants to relocate a page (maybe off a memory card
         that is signalling imminent failure) it will pass a new page
@@ -746,6 +755,8 @@ struct address_space_operations {
 	transfer any private data across and update any references
         that it has to the page.
 
+  putback_page: Called by the VM when isolated page's migration fails.
+
   launder_page: Called before freeing a page - it writes back the dirty page. To
   	prevent redirtying the page, it is kept locked during the whole
 	operation.
diff --git a/Documentation/vm/page_migration b/Documentation/vm/page_migration
index fea5c0864170..18d37c7ac50b 100644
--- a/Documentation/vm/page_migration
+++ b/Documentation/vm/page_migration
@@ -142,5 +142,110 @@ is increased so that the page cannot be freed while page migration occurs.
 20. The new page is moved to the LRU and can be scanned by the swapper
     etc again.
 
-Christoph Lameter, May 8, 2006.
+C. Non-LRU page migration
+-------------------------
+
+Although original migration aimed for reducing the latency of memory access
+for NUMA, compaction who want to create high-order page is also main customer.
+
+Current problem of the implementation is that it is designed to migrate only
+*LRU* pages. However, there are potential non-lru pages which can be migrated
+in drivers, for example, zsmalloc, virtio-balloon pages.
+
+For virtio-balloon pages, some parts of migration code path have been hooked
+up and added virtio-balloon specific functions to intercept migration logics.
+It's too specific to a driver so other drivers who want to make their pages
+movable would have to add own specific hooks in migration path.
+
+To overclome the problem, VM supports non-LRU page migration which provides
+generic functions for non-LRU movable pages without driver specific hooks
+migration path.
+
+If a driver want to make own pages movable, it should define three functions
+which are function pointers of struct address_space_operations.
+
+1. bool (*isolate_page) (struct page *page, isolate_mode_t mode);
+
+What VM expects on isolate_page function of driver is to return *true*
+if driver isolates page successfully. On returing true, VM marks the page
+as PG_isolated so concurrent isolation in several CPUs skip the page
+for isolation. If a driver cannot isolate the page, it should return *false*.
+
+Once page is successfully isolated, VM uses page.lru fields so driver
+shouldn't expect to preserve values in that fields.
+
+2. int (*migratepage) (struct address_space *mapping,
+		struct page *newpage, struct page *oldpage, enum migrate_mode);
+
+After isolation, VM calls migratepage of driver with isolated page.
+The function of migratepage is to move content of the old page to new page
+and set up fields of struct page newpage. Keep in mind that you should
+indicate to the VM the oldpage is no longer movable via __ClearPageMovable()
+under page_lock if you migrated the oldpage successfully and returns 0.
+If driver cannot migrate the page at the moment, driver can return -EAGAIN.
+On -EAGAIN, VM will retry page migration in a short time because VM interprets
+-EAGAIN as "temporal migration failure". On returning any error except -EAGAIN,
+VM will give up the page migration without retrying in this time.
+
+Driver shouldn't touch page.lru field VM using in the functions.
+
+3. void (*putback_page)(struct page *);
+
+If migration fails on isolated page, VM should return the isolated page
+to the driver so VM calls driver's putback_page with migration failed page.
+In this function, driver should put the isolated page back to the own data
+structure.
 
+4. non-lru movable page flags
+
+There are two page flags for supporting non-lru movable page.
+
+* PG_movable
+
+Driver should use the below function to make page movable under page_lock.
+
+	void __SetPageMovable(struct page *page, struct address_space *mapping)
+
+It needs argument of address_space for registering migration family functions
+which will be called by VM. Exactly speaking, PG_movable is not a real flag of
+struct page. Rather than, VM reuses page->mapping's lower bits to represent it.
+
+	#define PAGE_MAPPING_MOVABLE 0x2
+	page->mapping = page->mapping | PAGE_MAPPING_MOVABLE;
+
+so driver shouldn't access page->mapping directly. Instead, driver should
+use page_mapping which mask off the low two bits of page->mapping under
+page lock so it can get right struct address_space.
+
+For testing of non-lru movable page, VM supports __PageMovable function.
+However, it doesn't guarantee to identify non-lru movable page because
+page->mapping field is unified with other variables in struct page.
+As well, if driver releases the page after isolation by VM, page->mapping
+doesn't have stable value although it has PAGE_MAPPING_MOVABLE
+(Look at __ClearPageMovable). But __PageMovable is cheap to catch whether
+page is LRU or non-lru movable once the page has been isolated. Because
+LRU pages never can have PAGE_MAPPING_MOVABLE in page->mapping. It is also
+good for just peeking to test non-lru movable pages before more expensive
+checking with lock_page in pfn scanning to select victim.
+
+For guaranteeing non-lru movable page, VM provides PageMovable function.
+Unlike __PageMovable, PageMovable functions validates page->mapping and
+mapping->a_ops->isolate_page under lock_page. The lock_page prevents sudden
+destroying of page->mapping.
+
+Driver using __SetPageMovable should clear the flag via __ClearMovablePage
+under page_lock before the releasing the page.
+
+* PG_isolated
+
+To prevent concurrent isolation among several CPUs, VM marks isolated page
+as PG_isolated under lock_page. So if a CPU encounters PG_isolated non-lru
+movable page, it can skip it. Driver doesn't need to manipulate the flag
+because VM will set/clear it automatically. Keep in mind that if driver
+sees PG_isolated page, it means the page have been isolated by VM so it
+shouldn't touch page.lru field.
+PG_isolated is alias with PG_reclaim flag so driver shouldn't use the flag
+for own purpose.
+
+Christoph Lameter, May 8, 2006.
+Minchan Kim, Mar 28, 2016.
diff --git a/include/linux/compaction.h b/include/linux/compaction.h
index a58c852a268f..c6b47c861cea 100644
--- a/include/linux/compaction.h
+++ b/include/linux/compaction.h
@@ -54,6 +54,9 @@ enum compact_result {
 struct alloc_context; /* in mm/internal.h */
 
 #ifdef CONFIG_COMPACTION
+extern int PageMovable(struct page *page);
+extern void __SetPageMovable(struct page *page, struct address_space *mapping);
+extern void __ClearPageMovable(struct page *page);
 extern int sysctl_compact_memory;
 extern int sysctl_compaction_handler(struct ctl_table *table, int write,
 			void __user *buffer, size_t *length, loff_t *ppos);
@@ -151,6 +154,19 @@ extern void kcompactd_stop(int nid);
 extern void wakeup_kcompactd(pg_data_t *pgdat, int order, int classzone_idx);
 
 #else
+static inline int PageMovable(struct page *page)
+{
+	return 0;
+}
+static inline void __SetPageMovable(struct page *page,
+			struct address_space *mapping)
+{
+}
+
+static inline void __ClearPageMovable(struct page *page)
+{
+}
+
 static inline enum compact_result try_to_compact_pages(gfp_t gfp_mask,
 			unsigned int order, int alloc_flags,
 			const struct alloc_context *ac,
@@ -212,6 +228,7 @@ static inline void wakeup_kcompactd(pg_data_t *pgdat, int order, int classzone_i
 #endif /* CONFIG_COMPACTION */
 
 #if defined(CONFIG_COMPACTION) && defined(CONFIG_SYSFS) && defined(CONFIG_NUMA)
+struct node;
 extern int compaction_register_node(struct node *node);
 extern void compaction_unregister_node(struct node *node);
 
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 0cfdf2aec8f7..39ef97414033 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -402,6 +402,8 @@ struct address_space_operations {
 	 */
 	int (*migratepage) (struct address_space *,
 			struct page *, struct page *, enum migrate_mode);
+	bool (*isolate_page)(struct page *, isolate_mode_t);
+	void (*putback_page)(struct page *);
 	int (*launder_page) (struct page *);
 	int (*is_partially_uptodate) (struct page *, unsigned long,
 					unsigned long);
diff --git a/include/linux/ksm.h b/include/linux/ksm.h
index 7ae216a39c9e..481c8c4627ca 100644
--- a/include/linux/ksm.h
+++ b/include/linux/ksm.h
@@ -43,8 +43,7 @@ static inline struct stable_node *page_stable_node(struct page *page)
 static inline void set_page_stable_node(struct page *page,
 					struct stable_node *stable_node)
 {
-	page->mapping = (void *)stable_node +
-				(PAGE_MAPPING_ANON | PAGE_MAPPING_KSM);
+	page->mapping = (void *)((unsigned long)stable_node | PAGE_MAPPING_KSM);
 }
 
 /*
diff --git a/include/linux/migrate.h b/include/linux/migrate.h
index 9b50325e4ddf..404fbfefeb33 100644
--- a/include/linux/migrate.h
+++ b/include/linux/migrate.h
@@ -37,6 +37,8 @@ extern int migrate_page(struct address_space *,
 			struct page *, struct page *, enum migrate_mode);
 extern int migrate_pages(struct list_head *l, new_page_t new, free_page_t free,
 		unsigned long private, enum migrate_mode mode, int reason);
+extern bool isolate_movable_page(struct page *page, isolate_mode_t mode);
+extern void putback_movable_page(struct page *page);
 
 extern int migrate_prep(void);
 extern int migrate_prep_local(void);
diff --git a/include/linux/mm.h b/include/linux/mm.h
index a00ec816233a..33eaec57e997 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1035,6 +1035,7 @@ static inline pgoff_t page_file_index(struct page *page)
 }
 
 bool page_mapped(struct page *page);
+struct address_space *page_mapping(struct page *page);
 
 /*
  * Return true only if the page has been allocated with
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index e5a32445f930..f36dbb3a3060 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -129,6 +129,9 @@ enum pageflags {
 
 	/* Compound pages. Stored in first tail page's flags */
 	PG_double_map = PG_private_2,
+
+	/* non-lru isolated movable page */
+	PG_isolated = PG_reclaim,
 };
 
 #ifndef __GENERATING_BOUNDS_H
@@ -357,29 +360,37 @@ PAGEFLAG(Idle, idle, PF_ANY)
  * with the PAGE_MAPPING_ANON bit set to distinguish it.  See rmap.h.
  *
  * On an anonymous page in a VM_MERGEABLE area, if CONFIG_KSM is enabled,
- * the PAGE_MAPPING_KSM bit may be set along with the PAGE_MAPPING_ANON bit;
- * and then page->mapping points, not to an anon_vma, but to a private
+ * the PAGE_MAPPING_MOVABLE bit may be set along with the PAGE_MAPPING_ANON
+ * bit; and then page->mapping points, not to an anon_vma, but to a private
  * structure which KSM associates with that merged page.  See ksm.h.
  *
- * PAGE_MAPPING_KSM without PAGE_MAPPING_ANON is currently never used.
+ * PAGE_MAPPING_KSM without PAGE_MAPPING_ANON is used for non-lru movable
+ * page and then page->mapping points a struct address_space.
  *
  * Please note that, confusingly, "page_mapping" refers to the inode
  * address_space which maps the page from disk; whereas "page_mapped"
  * refers to user virtual address space into which the page is mapped.
  */
-#define PAGE_MAPPING_ANON	1
-#define PAGE_MAPPING_KSM	2
-#define PAGE_MAPPING_FLAGS	(PAGE_MAPPING_ANON | PAGE_MAPPING_KSM)
+#define PAGE_MAPPING_ANON	0x1
+#define PAGE_MAPPING_MOVABLE	0x2
+#define PAGE_MAPPING_KSM	(PAGE_MAPPING_ANON | PAGE_MAPPING_MOVABLE)
+#define PAGE_MAPPING_FLAGS	(PAGE_MAPPING_ANON | PAGE_MAPPING_MOVABLE)
 
-static __always_inline int PageAnonHead(struct page *page)
+static __always_inline int PageMappingFlags(struct page *page)
 {
-	return ((unsigned long)page->mapping & PAGE_MAPPING_ANON) != 0;
+	return ((unsigned long)page->mapping & PAGE_MAPPING_FLAGS) != 0;
 }
 
 static __always_inline int PageAnon(struct page *page)
 {
 	page = compound_head(page);
-	return PageAnonHead(page);
+	return ((unsigned long)page->mapping & PAGE_MAPPING_ANON) != 0;
+}
+
+static __always_inline int __PageMovable(struct page *page)
+{
+	return ((unsigned long)page->mapping & PAGE_MAPPING_FLAGS) ==
+				PAGE_MAPPING_MOVABLE;
 }
 
 #ifdef CONFIG_KSM
@@ -393,7 +404,7 @@ static __always_inline int PageKsm(struct page *page)
 {
 	page = compound_head(page);
 	return ((unsigned long)page->mapping & PAGE_MAPPING_FLAGS) ==
-				(PAGE_MAPPING_ANON | PAGE_MAPPING_KSM);
+				PAGE_MAPPING_KSM;
 }
 #else
 TESTPAGEFLAG_FALSE(Ksm)
@@ -641,6 +652,8 @@ static inline void __ClearPageBalloon(struct page *page)
 	atomic_set(&page->_mapcount, -1);
 }
 
+__PAGEFLAG(Isolated, isolated, PF_ANY);
+
 /*
  * If network-based swap is enabled, sl*b must keep track of whether pages
  * were allocated from pfmemalloc reserves.
diff --git a/mm/compaction.c b/mm/compaction.c
index 1427366ad673..a680b52e190b 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -81,6 +81,44 @@ static inline bool migrate_async_suitable(int migratetype)
 
 #ifdef CONFIG_COMPACTION
 
+int PageMovable(struct page *page)
+{
+	struct address_space *mapping;
+
+	VM_BUG_ON_PAGE(!PageLocked(page), page);
+	if (!__PageMovable(page))
+		return 0;
+
+	mapping = page_mapping(page);
+	if (mapping && mapping->a_ops && mapping->a_ops->isolate_page)
+		return 1;
+
+	return 0;
+}
+EXPORT_SYMBOL(PageMovable);
+
+void __SetPageMovable(struct page *page, struct address_space *mapping)
+{
+	VM_BUG_ON_PAGE(!PageLocked(page), page);
+	VM_BUG_ON_PAGE((unsigned long)mapping & PAGE_MAPPING_MOVABLE, page);
+	page->mapping = (void *)((unsigned long)mapping | PAGE_MAPPING_MOVABLE);
+}
+EXPORT_SYMBOL(__SetPageMovable);
+
+void __ClearPageMovable(struct page *page)
+{
+	VM_BUG_ON_PAGE(!PageLocked(page), page);
+	VM_BUG_ON_PAGE(!PageMovable(page), page);
+	/*
+	 * Clear registered address_space val with keeping PAGE_MAPPING_MOVABLE
+	 * flag so that VM can catch up released page by driver after isolation.
+	 * With it, VM migration doesn't try to put it back.
+	 */
+	page->mapping = (void *)((unsigned long)page->mapping &
+				PAGE_MAPPING_MOVABLE);
+}
+EXPORT_SYMBOL(__ClearPageMovable);
+
 /* Do not skip compaction more than 64 times */
 #define COMPACT_MAX_DEFER_SHIFT 6
 
@@ -735,21 +773,6 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
 		}
 
 		/*
-		 * Check may be lockless but that's ok as we recheck later.
-		 * It's possible to migrate LRU pages and balloon pages
-		 * Skip any other type of page
-		 */
-		is_lru = PageLRU(page);
-		if (!is_lru) {
-			if (unlikely(balloon_page_movable(page))) {
-				if (balloon_page_isolate(page)) {
-					/* Successfully isolated */
-					goto isolate_success;
-				}
-			}
-		}
-
-		/*
 		 * Regardless of being on LRU, compound pages such as THP and
 		 * hugetlbfs are not to be compacted. We can potentially save
 		 * a lot of iterations if we skip them at once. The check is
@@ -765,8 +788,38 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
 			goto isolate_fail;
 		}
 
-		if (!is_lru)
+		/*
+		 * Check may be lockless but that's ok as we recheck later.
+		 * It's possible to migrate LRU and non-lru movable pages.
+		 * Skip any other type of page
+		 */
+		is_lru = PageLRU(page);
+		if (!is_lru) {
+			if (unlikely(balloon_page_movable(page))) {
+				if (balloon_page_isolate(page)) {
+					/* Successfully isolated */
+					goto isolate_success;
+				}
+			}
+
+			/*
+			 * __PageMovable can return false positive so we need
+			 * to verify it under page_lock.
+			 */
+			if (unlikely(__PageMovable(page)) &&
+					!PageIsolated(page)) {
+				if (locked) {
+					spin_unlock_irqrestore(&zone->lru_lock,
+									flags);
+					locked = false;
+				}
+
+				if (isolate_movable_page(page, isolate_mode))
+					goto isolate_success;
+			}
+
 			goto isolate_fail;
+		}
 
 		/*
 		 * Migration will fail if an anonymous page is pinned in memory,
diff --git a/mm/ksm.c b/mm/ksm.c
index 4786b4150f62..35b8aef867a9 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -532,8 +532,8 @@ static struct page *get_ksm_page(struct stable_node *stable_node, bool lock_it)
 	void *expected_mapping;
 	unsigned long kpfn;
 
-	expected_mapping = (void *)stable_node +
-				(PAGE_MAPPING_ANON | PAGE_MAPPING_KSM);
+	expected_mapping = (void *)((unsigned long)stable_node |
+					PAGE_MAPPING_KSM);
 again:
 	kpfn = READ_ONCE(stable_node->kpfn);
 	page = pfn_to_page(kpfn);
diff --git a/mm/migrate.c b/mm/migrate.c
index 2666f28b5236..60abcf379b51 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -31,6 +31,7 @@
 #include <linux/vmalloc.h>
 #include <linux/security.h>
 #include <linux/backing-dev.h>
+#include <linux/compaction.h>
 #include <linux/syscalls.h>
 #include <linux/hugetlb.h>
 #include <linux/hugetlb_cgroup.h>
@@ -73,6 +74,81 @@ int migrate_prep_local(void)
 	return 0;
 }
 
+bool isolate_movable_page(struct page *page, isolate_mode_t mode)
+{
+	struct address_space *mapping;
+
+	/*
+	 * Avoid burning cycles with pages that are yet under __free_pages(),
+	 * or just got freed under us.
+	 *
+	 * In case we 'win' a race for a movable page being freed under us and
+	 * raise its refcount preventing __free_pages() from doing its job
+	 * the put_page() at the end of this block will take care of
+	 * release this page, thus avoiding a nasty leakage.
+	 */
+	if (unlikely(!get_page_unless_zero(page)))
+		goto out;
+
+	/*
+	 * Check PageMovable before holding a PG_lock because page's owner
+	 * assumes anybody doesn't touch PG_lock of newly allocated page
+	 * so unconditionally grapping the lock ruins page's owner side.
+	 */
+	if (unlikely(!__PageMovable(page)))
+		goto out_putpage;
+	/*
+	 * As movable pages are not isolated from LRU lists, concurrent
+	 * compaction threads can race against page migration functions
+	 * as well as race against the releasing a page.
+	 *
+	 * In order to avoid having an already isolated movable page
+	 * being (wrongly) re-isolated while it is under migration,
+	 * or to avoid attempting to isolate pages being released,
+	 * lets be sure we have the page lock
+	 * before proceeding with the movable page isolation steps.
+	 */
+	if (unlikely(!trylock_page(page)))
+		goto out_putpage;
+
+	if (!PageMovable(page) || PageIsolated(page))
+		goto out_no_isolated;
+
+	mapping = page_mapping(page);
+	VM_BUG_ON_PAGE(!mapping, page);
+
+	if (!mapping->a_ops->isolate_page(page, mode))
+		goto out_no_isolated;
+
+	/* Driver shouldn't use PG_isolated bit of page->flags */
+	WARN_ON_ONCE(PageIsolated(page));
+	__SetPageIsolated(page);
+	unlock_page(page);
+
+	return true;
+
+out_no_isolated:
+	unlock_page(page);
+out_putpage:
+	put_page(page);
+out:
+	return false;
+}
+
+/* It should be called on page which is PG_movable */
+void putback_movable_page(struct page *page)
+{
+	struct address_space *mapping;
+
+	VM_BUG_ON_PAGE(!PageLocked(page), page);
+	VM_BUG_ON_PAGE(!PageMovable(page), page);
+	VM_BUG_ON_PAGE(!PageIsolated(page), page);
+
+	mapping = page_mapping(page);
+	mapping->a_ops->putback_page(page);
+	__ClearPageIsolated(page);
+}
+
 /*
  * Put previously isolated pages back onto the appropriate lists
  * from where they were once taken off for compaction/migration.
@@ -94,10 +170,25 @@ void putback_movable_pages(struct list_head *l)
 		list_del(&page->lru);
 		dec_zone_page_state(page, NR_ISOLATED_ANON +
 				page_is_file_cache(page));
-		if (unlikely(isolated_balloon_page(page)))
+		if (unlikely(isolated_balloon_page(page))) {
 			balloon_page_putback(page);
-		else
+		/*
+		 * We isolated non-lru movable page so here we can use
+		 * __PageMovable because LRU page's mapping cannot have
+		 * PAGE_MAPPING_MOVABLE.
+		 */
+		} else if (unlikely(__PageMovable(page))) {
+			VM_BUG_ON_PAGE(!PageIsolated(page), page);
+			lock_page(page);
+			if (PageMovable(page))
+				putback_movable_page(page);
+			else
+				__ClearPageIsolated(page);
+			unlock_page(page);
+			put_page(page);
+		} else {
 			putback_lru_page(page);
+		}
 	}
 }
 
@@ -592,7 +683,7 @@ void migrate_page_copy(struct page *newpage, struct page *page)
  ***********************************************************/
 
 /*
- * Common logic to directly migrate a single page suitable for
+ * Common logic to directly migrate a single LRU page suitable for
  * pages that do not use PagePrivate/PagePrivate2.
  *
  * Pages are locked upon entry and exit.
@@ -755,33 +846,72 @@ static int move_to_new_page(struct page *newpage, struct page *page,
 				enum migrate_mode mode)
 {
 	struct address_space *mapping;
-	int rc;
+	int rc = -EAGAIN;
+	bool is_lru = !__PageMovable(page);
 
 	VM_BUG_ON_PAGE(!PageLocked(page), page);
 	VM_BUG_ON_PAGE(!PageLocked(newpage), newpage);
 
 	mapping = page_mapping(page);
-	if (!mapping)
-		rc = migrate_page(mapping, newpage, page, mode);
-	else if (mapping->a_ops->migratepage)
+
+	if (likely(is_lru)) {
+		if (!mapping)
+			rc = migrate_page(mapping, newpage, page, mode);
+		else if (mapping->a_ops->migratepage)
+			/*
+			 * Most pages have a mapping and most filesystems
+			 * provide a migratepage callback. Anonymous pages
+			 * are part of swap space which also has its own
+			 * migratepage callback. This is the most common path
+			 * for page migration.
+			 */
+			rc = mapping->a_ops->migratepage(mapping, newpage,
+							page, mode);
+		else
+			rc = fallback_migrate_page(mapping, newpage,
+							page, mode);
+	} else {
 		/*
-		 * Most pages have a mapping and most filesystems provide a
-		 * migratepage callback. Anonymous pages are part of swap
-		 * space which also has its own migratepage callback. This
-		 * is the most common path for page migration.
+		 * In case of non-lru page, it could be released after
+		 * isolation step. In that case, we shouldn't try migration.
 		 */
-		rc = mapping->a_ops->migratepage(mapping, newpage, page, mode);
-	else
-		rc = fallback_migrate_page(mapping, newpage, page, mode);
+		VM_BUG_ON_PAGE(!PageIsolated(page), page);
+		if (!PageMovable(page)) {
+			rc = MIGRATEPAGE_SUCCESS;
+			__ClearPageIsolated(page);
+			goto out;
+		}
+
+		rc = mapping->a_ops->migratepage(mapping, newpage,
+						page, mode);
+		WARN_ON_ONCE(rc == MIGRATEPAGE_SUCCESS &&
+			!PageIsolated(page));
+	}
 
 	/*
 	 * When successful, old pagecache page->mapping must be cleared before
 	 * page is freed; but stats require that PageAnon be left as PageAnon.
 	 */
 	if (rc == MIGRATEPAGE_SUCCESS) {
-		if (!PageAnon(page))
+		if (__PageMovable(page)) {
+			VM_BUG_ON_PAGE(!PageIsolated(page), page);
+
+			/*
+			 * We clear PG_movable under page_lock so any compactor
+			 * cannot try to migrate this page.
+			 */
+			__ClearPageIsolated(page);
+		}
+
+		/*
+		 * Anonymous and movable page->mapping will be cleard by
+		 * free_pages_prepare so don't reset it here for keeping
+		 * the type to work PageAnon, for example.
+		 */
+		if (!PageMappingFlags(page))
 			page->mapping = NULL;
 	}
+out:
 	return rc;
 }
 
@@ -791,6 +921,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
 	int rc = -EAGAIN;
 	int page_was_mapped = 0;
 	struct anon_vma *anon_vma = NULL;
+	bool is_lru = !__PageMovable(page);
 
 	if (!trylock_page(page)) {
 		if (!force || mode == MIGRATE_ASYNC)
@@ -871,6 +1002,11 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
 		goto out_unlock_both;
 	}
 
+	if (unlikely(!is_lru)) {
+		rc = move_to_new_page(newpage, page, mode);
+		goto out_unlock_both;
+	}
+
 	/*
 	 * Corner case handling:
 	 * 1. When a new swap-cache page is read into, it is added to the LRU
@@ -920,7 +1056,8 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
 	 * list in here.
 	 */
 	if (rc == MIGRATEPAGE_SUCCESS) {
-		if (unlikely(__is_movable_balloon_page(newpage)))
+		if (unlikely(__is_movable_balloon_page(newpage) ||
+				__PageMovable(newpage)))
 			put_page(newpage);
 		else
 			putback_lru_page(newpage);
@@ -961,6 +1098,12 @@ static ICE_noinline int unmap_and_move(new_page_t get_new_page,
 		/* page was freed from under us. So we are done. */
 		ClearPageActive(page);
 		ClearPageUnevictable(page);
+		if (unlikely(__PageMovable(page))) {
+			lock_page(page);
+			if (!PageMovable(page))
+				__ClearPageIsolated(page);
+			unlock_page(page);
+		}
 		if (put_new_page)
 			put_new_page(newpage, private);
 		else
@@ -1010,8 +1153,21 @@ static ICE_noinline int unmap_and_move(new_page_t get_new_page,
 				num_poisoned_pages_inc();
 		}
 	} else {
-		if (rc != -EAGAIN)
-			putback_lru_page(page);
+		if (rc != -EAGAIN) {
+			if (likely(!__PageMovable(page))) {
+				putback_lru_page(page);
+				goto put_new;
+			}
+
+			lock_page(page);
+			if (PageMovable(page))
+				putback_movable_page(page);
+			else
+				__ClearPageIsolated(page);
+			unlock_page(page);
+			put_page(page);
+		}
+put_new:
 		if (put_new_page)
 			put_new_page(newpage, private);
 		else
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 7da8310b86e9..4b3a07ce824d 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1014,7 +1014,7 @@ static __always_inline bool free_pages_prepare(struct page *page,
 			(page + i)->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
 		}
 	}
-	if (PageAnonHead(page))
+	if (PageMappingFlags(page))
 		page->mapping = NULL;
 	if (check_free)
 		bad += free_pages_check(page);
diff --git a/mm/util.c b/mm/util.c
index 917e0e3d0f8e..b756ee36f7f0 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -399,10 +399,12 @@ struct address_space *page_mapping(struct page *page)
 	}
 
 	mapping = page->mapping;
-	if ((unsigned long)mapping & PAGE_MAPPING_FLAGS)
+	if ((unsigned long)mapping & PAGE_MAPPING_ANON)
 		return NULL;
-	return mapping;
+
+	return (void *)((unsigned long)mapping & ~PAGE_MAPPING_FLAGS);
 }
+EXPORT_SYMBOL(page_mapping);
 
 /* Slow path of page_mapcount() for compound pages */
 int __page_mapcount(struct page *page)
-- 
1.9.1

^ permalink raw reply related

* [PATCH v7 00/12] Support non-lru page migration
From: Minchan Kim @ 2016-05-31 23:21 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Rik van Riel, Sergey Senozhatsky, Rafael Aquini, Minchan Kim,
	Chan Gyun Jeong, Jonathan Corbet, Hugh Dickins, linux-kernel,
	dri-devel, virtualization, John Einar Reitan, linux-mm,
	Chulmin Kim, Gioh Kim, Konstantin Khlebnikov, Sangseok Lee,
	Joonsoo Kim, Kyeongdon Kim, Naoya Horiguchi, Vlastimil Babka,
	Mel Gorman

Recently, I got many reports about perfermance degradation in embedded
system(Android mobile phone, webOS TV and so on) and easy fork fail.

The problem was fragmentation caused by zram and GPU driver mainly.
With memory pressure, their pages were spread out all of pageblock and
it cannot be migrated with current compaction algorithm which supports
only LRU pages. In the end, compaction cannot work well so reclaimer
shrinks all of working set pages. It made system very slow and even to
fail to fork easily which requires order-[2 or 3] allocations.

Other pain point is that they cannot use CMA memory space so when OOM
kill happens, I can see many free pages in CMA area, which is not
memory efficient. In our product which has big CMA memory, it reclaims
zones too exccessively to allocate GPU and zram page although there are
lots of free space in CMA so system becomes very slow easily.

To solve these problem, this patch tries to add facility to migrate
non-lru pages via introducing new functions and page flags to help
migration.

struct address_space_operations {
	..
	..
	bool (*isolate_page)(struct page *, isolate_mode_t);
	void (*putback_page)(struct page *);
	..
}

new page flags

	PG_movable
	PG_isolated

For details, please read description in "mm: migrate: support non-lru
movable page migration".

Originally, Gioh Kim had tried to support this feature but he moved so
I took over the work. I took many code from his work and changed a little
bit and Konstantin Khlebnikov helped Gioh a lot so he should deserve to have
many credit, too.

And I should mention Chulmin who have tested this patchset heavily
so I can find many bugs from him. :)

Thanks, Gioh, Konstantin and Chulmin!

This patchset consists of five parts.

1. clean up migration
  mm: use put_page to free page instead of putback_lru_page

2. add non-lru page migration feature
  mm: migrate: support non-lru movable page migration

3. rework KVM memory-ballooning
  mm: balloon: use general non-lru movable page feature

4. zsmalloc refactoring for preparing page migration
  zsmalloc: keep max_object in size_class
  zsmalloc: use bit_spin_lock
  zsmalloc: use accessor
  zsmalloc: factor page chain functionality out
  zsmalloc: introduce zspage structure
  zsmalloc: separate free_zspage from putback_zspage
  zsmalloc: use freeobj for index

5. zsmalloc page migration
  zsmalloc: page migration support
  zram: use __GFP_MOVABLE for memory allocation

* From v6
  * rebase on mmotm-2016-05-27-15-19
  * clean up zsmalloc - Sergey
  * clean up non-lru page migration - Vlastimil

* From v5
  * rebase on next-20160520
  * move utility functions to compaction.c and export - Sergey
  * zsmalloc dobule free fix - Sergey
  * add additional Reviewed-by for zsmalloc - Sergey

* From v4
  * rebase on mmotm-2016-05-05-17-19
  * fix huge object migration - Chulmin
  * !CONFIG_COMPACTION support for zsmalloc

* From v3
  * rebase on mmotm-2016-04-06-20-40
  * fix swap_info deadlock - Chulmin
  * race without page_lock - Vlastimil
  * no use page._mapcount for potential user-mapped page driver - Vlastimil
  * fix and enhance doc/description - Vlastimil
  * use page->mapping lower bits to represent PG_movable
  * make driver side's rule simple.

* From v2
  * rebase on mmotm-2016-03-29-15-54-16
  * check PageMovable before lock_page - Joonsoo
  * check PageMovable before PageIsolated checking - Joonsoo
  * add more description about rule

* From v1
  * rebase on v4.5-mmotm-2016-03-17-15-04
  * reordering patches to merge clean-up patches first
  * add Acked-by/Reviewed-by from Vlastimil and Sergey
  * use each own mount model instead of reusing anon_inode_fs - Al Viro
  * small changes - YiPing, Gioh

Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: dri-devel@lists.freedesktop.org
Cc: Hugh Dickins <hughd@google.com>
Cc: John Einar Reitan <john.reitan@foss.arm.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Konstantin Khlebnikov <koct9i@gmail.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Cc: Rafael Aquini <aquini@redhat.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
Cc: virtualization@lists.linux-foundation.org
Cc: Gioh Kim <gi-oh.kim@profitbricks.com>
Cc: Chan Gyun Jeong <chan.jeong@lge.com>
Cc: Sangseok Lee <sangseok.lee@lge.com>
Cc: Kyeongdon Kim <kyeongdon.kim@lge.com>
Cc: Chulmin Kim <cmlaika.kim@samsung.com>

Minchan Kim (12):
  mm: use put_page to free page instead of putback_lru_page
  mm: migrate: support non-lru movable page migration
  mm: balloon: use general non-lru movable page feature
  zsmalloc: keep max_object in size_class
  zsmalloc: use bit_spin_lock
  zsmalloc: use accessor
  zsmalloc: factor page chain functionality out
  zsmalloc: introduce zspage structure
  zsmalloc: separate free_zspage from putback_zspage
  zsmalloc: use freeobj for index
  zsmalloc: page migration support
  zram: use __GFP_MOVABLE for memory allocation

 Documentation/filesystems/Locking  |    4 +
 Documentation/filesystems/vfs.txt  |   11 +
 Documentation/vm/page_migration    |  107 ++-
 drivers/block/zram/zram_drv.c      |    6 +-
 drivers/virtio/virtio_balloon.c    |   54 +-
 include/linux/balloon_compaction.h |   53 +-
 include/linux/compaction.h         |   17 +
 include/linux/fs.h                 |    2 +
 include/linux/ksm.h                |    3 +-
 include/linux/migrate.h            |    2 +
 include/linux/mm.h                 |    1 +
 include/linux/page-flags.h         |   33 +-
 include/uapi/linux/magic.h         |    2 +
 mm/balloon_compaction.c            |   94 +--
 mm/compaction.c                    |   79 ++-
 mm/ksm.c                           |    4 +-
 mm/migrate.c                       |  257 +++++--
 mm/page_alloc.c                    |    2 +-
 mm/util.c                          |    6 +-
 mm/vmscan.c                        |    2 +-
 mm/zsmalloc.c                      | 1349 +++++++++++++++++++++++++-----------
 21 files changed, 1479 insertions(+), 609 deletions(-)

-- 
1.9.1

^ permalink raw reply

* Re: [PATCH v6v3 02/12] mm: migrate: support non-lru movable page migration
From: Minchan Kim @ 2016-05-31 23:05 UTC (permalink / raw)
  To: Vlastimil Babka
  Cc: Rik van Riel, Sergey Senozhatsky, Rafael Aquini, Jonathan Corbet,
	Hugh Dickins, linux-kernel, dri-devel, virtualization,
	John Einar Reitan, linux-mm, Mel Gorman, Andrew Morton,
	Joonsoo Kim, Gioh Kim
In-Reply-To: <29ae65ef-a4c9-1d45-ce56-70b29d3eacfd@suse.cz>

On Tue, May 31, 2016 at 09:52:48AM +0200, Vlastimil Babka wrote:
> On 05/31/2016 02:01 AM, Minchan Kim wrote:
> >Per Vlastimi's review comment.
> >
> >Thanks for the detail review, Vlastimi!
> >If you have another concern, feel free to say.
> 
> I don't for now :)
> 
> [...]
> 
> >Cc: Rik van Riel <riel@redhat.com>
> >Cc: Vlastimil Babka <vbabka@suse.cz>
> >Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
> >Cc: Mel Gorman <mgorman@suse.de>
> >Cc: Hugh Dickins <hughd@google.com>
> >Cc: Rafael Aquini <aquini@redhat.com>
> >Cc: virtualization@lists.linux-foundation.org
> >Cc: Jonathan Corbet <corbet@lwn.net>
> >Cc: John Einar Reitan <john.reitan@foss.arm.com>
> >Cc: dri-devel@lists.freedesktop.org
> >Cc: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
> >Signed-off-by: Gioh Kim <gi-oh.kim@profitbricks.com>
> >Signed-off-by: Minchan Kim <minchan@kernel.org>
> 
> Acked-by: Vlastimil Babka <vbabka@suse.cz>

Thanks for the review, Vlastimil!

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox