* Re: Improve spinlock performance by moving work to one core
2015-11-06 17:38 ` Waiman Long
@ 2015-11-23 9:41 ` Ling Ma
2015-11-25 1:56 ` Ling Ma
2015-11-25 19:05 ` Waiman Long
0 siblings, 2 replies; 11+ messages in thread
From: Ling Ma @ 2015-11-23 9:41 UTC (permalink / raw)
To: Waiman Long; +Cc: Peter Zijlstra, mingo, linux-kernel, Ling
[-- Attachment #1: Type: text/plain, Size: 1206 bytes --]
Hi Longman,
The attachments include the user-space test application thread.c and the
kernel patch spinlock-test.patch, based on kernel 4.3.0-rc4.

We ran thread.c on the patched kernel and tested the original and the new
spinlock respectively. perf top -G shows that thread.c makes the
cache_alloc_refill and cache_flusharray functions spend ~25% of their time
on the original spinlock; after switching those two functions to the new
spinlock, the cost drops to ~22%. The printed data also show that the new
spinlock improves performance by about 15% (93841765576 / 81036259588) on
an E5-2699 v3.
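To make the calling convention concrete, here is a minimal sketch of the
pattern the patch applies in mm/slab.c. Only struct nspinlock and
new_spin_lock() come from the patch; counter_lock, struct bump_para and
bump_fn() are invented for this illustration. The critical section is
packaged as a callback plus a parameter block, so the current lock holder
can execute queued waiters' callbacks back to back on one core:

/*
 * Illustration only: counter_lock, struct bump_para and bump_fn() are
 * hypothetical; struct nspinlock and new_spin_lock() are from the patch.
 */
struct bump_para {
	unsigned long *counter;
	unsigned long delta;
};

static struct nspinlock counter_lock;	/* zero-initialized, i.e. unlocked */

/* The critical section, expressed as a callback. */
static void bump_fn(void *p)
{
	struct bump_para *pa = p;

	*pa->counter += pa->delta;	/* may run on the lock holder's CPU */
}

static void bump(unsigned long *counter, unsigned long delta)
{
	struct bump_para pa = { .counter = counter, .delta = delta };

	/*
	 * Instead of wrapping the update in spin_lock()/spin_unlock(),
	 * hand the work to the lock; queued requests are executed
	 * consecutively on one core, so the protected data stays hot
	 * in that core's cache.
	 */
	new_spin_lock(&counter_lock, bump_fn, &pa);
}

In mm/slab.c the patch applies this pattern to cache_alloc_refill() and
cache_flusharray() by moving their locked regions into refill_fn() and
flusharray_fn().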
We would appreciate your comments.
Thanks
Ling
2015-11-07 1:38 GMT+08:00 Waiman Long <waiman.long@hpe.com>:
>
> On 11/05/2015 11:28 PM, Ling Ma wrote:
>>
>> Longman
>>
>> Thanks for your suggestion.
>> We will look for a real scenario to test. Could you please suggest
>> some spinlock benchmarks?
>>
>> Regards
>> Ling
>>
>>
>
> The kernel has been well optimized for most common workloads, so spinlock contention is usually not a performance bottleneck. There are still corner cases where there is heavy spinlock contention.
>
> I used a spinlock loop microbenchmark similar to what you are doing, as well as AIM7 for application-level testing.
>
> Cheers,
> Longman
>
>
[-- Attachment #2: spinlock-test.patch --]
[-- Type: application/octet-stream, Size: 24652 bytes --]
From 884e2b022a43e31e38f3779a9ee317c30e2a5e17 Mon Sep 17 00:00:00 2001
From: root <root@localhost.localdomain>
Date: Mon, 23 Nov 2015 16:57:54 +0800
Subject: [PATCH] spinlock-test
---
include/asm-generic/qspinlock.h | 5 +-
include/linux/slab_lock.h | 609 +++++++++++++++++++++++++++++++++++++++
mm/slab.c | 214 +++++++++-----
3 files changed, 748 insertions(+), 80 deletions(-)
create mode 100644 include/linux/slab_lock.h
diff --git a/include/asm-generic/qspinlock.h b/include/asm-generic/qspinlock.h
index e2aadbc..5d65b0f 100644
--- a/include/asm-generic/qspinlock.h
+++ b/include/asm-generic/qspinlock.h
@@ -76,11 +76,12 @@ extern void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val);
static __always_inline void queued_spin_lock(struct qspinlock *lock)
{
u32 val;
-
+repeat:
val = atomic_cmpxchg(&lock->val, 0, _Q_LOCKED_VAL);
if (likely(val == 0))
return;
- queued_spin_lock_slowpath(lock, val);
+ goto repeat;
+ //queued_spin_lock_slowpath(lock, val);
}
#ifndef queued_spin_unlock
diff --git a/include/linux/slab_lock.h b/include/linux/slab_lock.h
new file mode 100644
index 0000000..1b0c6da
--- /dev/null
+++ b/include/linux/slab_lock.h
@@ -0,0 +1,609 @@
+#ifndef _LINUX_SLAB_LOCK_H
+#define _LINUX_SLAB_LOCK_H
+# define HP_TIMING_NOW(Var) \
+ ({ unsigned long long _hi, _lo; \
+ asm volatile ("rdtsc" : "=a" (_lo), "=d" (_hi)); \
+ (Var) = _hi << 32 | _lo; })
+
+static unsigned long _total_ = 0;
+static unsigned long _number_ = 0;
+static void atomic_addq(unsigned long *v, unsigned long i)
+{
+ asm volatile( " lock addq %1,%0"
+ : "+m" (*v)
+ : "ir" (i));
+}
+
+#define NEW_MAX_NODES 4
+struct new_spinlock {
+ struct new_spinlock *next;
+ void *para;
+ void (*fn)(void *);
+ struct new_spinlock *locked;
+ int count; /* nesting count, see qspinlock.c */
+ int tail;
+};
+
+static DEFINE_PER_CPU_ALIGNED(struct new_spinlock, new_nodes[NEW_MAX_NODES]);
+#define _Q_SET_MASK(type) (((1U << _Q_ ## type ## _BITS) - 1)\
+ << _Q_ ## type ## _OFFSET)
+#define _Q_LOCKED_OFFSET 0
+#define _Q_LOCKED_BITS 8
+#define _Q_LOCKED_MASK _Q_SET_MASK(LOCKED)
+
+#define _Q_PENDING_OFFSET (_Q_LOCKED_OFFSET + _Q_LOCKED_BITS)
+#if CONFIG_NR_CPUS < (1U << 14)
+#define _Q_PENDING_BITS 8
+#else
+#define _Q_PENDING_BITS 1
+#endif
+#define _Q_PENDING_MASK _Q_SET_MASK(PENDING)
+
+#define _Q_TAIL_IDX_OFFSET (_Q_PENDING_OFFSET + _Q_PENDING_BITS)
+#define _Q_TAIL_IDX_BITS 2
+#define _Q_TAIL_IDX_MASK _Q_SET_MASK(TAIL_IDX)
+
+#define _Q_TAIL_CPU_OFFSET (_Q_TAIL_IDX_OFFSET + _Q_TAIL_IDX_BITS)
+#define _Q_TAIL_CPU_BITS (32 - _Q_TAIL_CPU_OFFSET)
+#define _Q_TAIL_CPU_MASK _Q_SET_MASK(TAIL_CPU)
+
+#define _Q_TAIL_OFFSET _Q_TAIL_IDX_OFFSET
+#define _Q_TAIL_MASK (_Q_TAIL_IDX_MASK | _Q_TAIL_CPU_MASK)
+
+#define _Q_LOCKED_VAL (1U << _Q_LOCKED_OFFSET)
+#define _Q_PENDING_VAL (1U << _Q_PENDING_OFFSET)
+
+
+#define _Q_LOCKED_PENDING_MASK (_Q_LOCKED_MASK | _Q_PENDING_MASK)
+
+struct __qspinlock {
+ union {
+ atomic_t val;
+#ifdef __LITTLE_ENDIAN
+ struct {
+ u8 locked;
+ u8 pending;
+ };
+ struct {
+ u16 locked_pending;
+ u16 tail;
+ };
+#else
+ struct {
+ u16 tail;
+ u16 locked_pending;
+ };
+ struct {
+ u8 reserved[2];
+ u8 pending;
+ u8 locked;
+ };
+#endif
+ };
+};
+
+typedef struct nspinlock {
+ atomic_t val;
+} adl_spinlock_t;
+
+/*
+ * new_xchg_tail - Put in the new queue tail code word & retrieve previous one
+ * @lock : Pointer to queued spinlock structure
+ * @tail : The new queue tail code word
+ * Return: The previous queue tail code word
+ *
+ * xchg(lock, tail)
+ *
+ * p,*,* -> n,*,* ; prev = xchg(lock, node)
+ */
+static __always_inline u32 new_xchg_tail(struct nspinlock *lock, u32 tail)
+{
+ struct __qspinlock *l = (void *)lock;
+
+ return (u32)xchg(&l->tail, tail >> _Q_TAIL_OFFSET) << _Q_TAIL_OFFSET;
+}
+static inline u32 new_encode_tail(int cpu, int idx)
+{
+ u32 tail;
+
+ tail = (cpu + 1) << _Q_TAIL_CPU_OFFSET;
+ tail |= idx << _Q_TAIL_IDX_OFFSET; /* assume < 4 */
+
+ return tail;
+}
+
+static inline struct new_spinlock *new_decode_tail(u32 tail)
+{
+ int cpu = (tail >> _Q_TAIL_CPU_OFFSET) - 1;
+ int idx = (tail & _Q_TAIL_IDX_MASK) >> _Q_TAIL_IDX_OFFSET;
+
+ return per_cpu_ptr(&new_nodes[idx], cpu);
+}
+
+static __always_inline void new_set_locked(struct nspinlock *lock)
+{
+ struct __qspinlock *l = (void *)lock;
+
+ WRITE_ONCE(l->locked, _Q_LOCKED_VAL);
+}
+
+#ifdef CONFIG_PARAVIRT_SPINLOCKS
+#define _a_queued_spin_lock_slowpath native_queued_spin_lock_slowpath
+#endif
+#ifndef arch_mcs_spin_lock_contended
+/*
+ * Using smp_load_acquire() provides a memory barrier that ensures
+ * subsequent operations happen after the lock is acquired.
+ */
+#define arch_new_spin_lock_contended(l) \
+do { \
+ while (!(smp_load_acquire(l))) \
+ cpu_relax_lowlatency(); \
+} while (0)
+#endif
+
+#ifndef arch_new_spin_unlock_contended
+/*
+ * smp_store_release() provides a memory barrier to ensure all
+ * operations in the critical section have been completed before
+ * unlocking.
+ */
+#define arch_new_spin_unlock_contended(l) \
+ smp_store_release((l), 1)
+#endif
+static __always_inline int new_queued_spin_trylock(struct nspinlock *lock)
+{
+ if (!atomic_read(&lock->val) &&
+ (atomic_cmpxchg(&lock->val, 0, _Q_LOCKED_VAL) == 0))
+ return 1;
+ return 0;
+}
+
+void new_queued_spin_lock_slowpath(struct nspinlock *lock, u32 val, void (*fn)(void *), void *para)
+{
+ struct new_spinlock *prev, *next, *node;
+ u32 old, tail;
+ int idx;
+
+ node = this_cpu_ptr(&new_nodes[0]);
+ idx = node->count++;
+ tail = new_encode_tail(smp_processor_id(), idx);
+
+ node += idx;
+ node->locked = node;
+ node->next = NULL;
+ node->fn = fn;
+ node->para = para;
+ node->tail = tail;
+
+ if (new_queued_spin_trylock(lock)) {
+ this_cpu_dec(new_nodes[0].count);
+ fn(para);
+ atomic_sub(_Q_LOCKED_VAL, &lock->val);
+ return;
+ }
+
+ old = new_xchg_tail(lock, tail);
+
+ if (old & _Q_TAIL_MASK) {
+ struct new_spinlock *self;
+ prev = new_decode_tail(old);
+ WRITE_ONCE(prev->next, node);
+ retry:
+ self = node->locked;
+ if(self) {
+ node = self;
+ goto retry;
+ }
+ this_cpu_dec(new_nodes[0].count);
+ return;
+ }
+
+
+ while ((val = smp_load_acquire(&lock->val.counter)) & _Q_LOCKED_PENDING_MASK)
+ cpu_relax();
+
+ new_set_locked(lock);
+
+ old = new_xchg_tail(lock, 0);
+ repeat:
+ tail = node->tail;
+ if(old == tail) {
+ goto end;
+ }
+
+ while (!(next = READ_ONCE(node->next)))
+ cpu_relax();
+
+ node->fn(node->para);
+ node->locked = NULL;
+ tail = next->tail;
+ if(old != tail) {
+ while (!(node = READ_ONCE(next->next)))
+ cpu_relax();
+
+ next->fn(next->para);
+ next->locked = NULL;
+ goto repeat;
+
+ } else
+ node = next;
+
+end:
+ node->fn(node->para);
+ node->locked = NULL;
+
+
+ this_cpu_dec(new_nodes[0].count);
+ atomic_sub(_Q_LOCKED_VAL, &lock->val);
+
+ return;
+}
+
+static void __always_inline new_spin_lock(struct nspinlock *lock, void (*fn)(void *), void *para)
+{
+ u32 val;
+ if(lock->val.counter)
+ goto end;
+ val = atomic_cmpxchg(&lock->val, 0, _Q_LOCKED_VAL);
+ if (likely(val == 0)) {
+ fn(para);
+ atomic_sub(_Q_LOCKED_VAL, &lock->val);
+ return;
+ }
+
+end:
+ new_queued_spin_lock_slowpath(lock, val, fn, para);
+}
+/**********************************************************
+mcs lock
+**********************************************************/
+
+#define MAX_NODES 4
+
+struct mcs_spinlock {
+ struct mcs_spinlock *next;
+ int locked; /* 1 if lock acquired */
+ int count; /* nesting count, see qspinlock.c */
+};
+
+static DEFINE_PER_CPU_ALIGNED(struct mcs_spinlock, mcs_nodes[MAX_NODES]);
+#if _Q_PENDING_BITS == 8
+/**
+ * clear_pending_set_locked - take ownership and clear the pending bit.
+ * @lock: Pointer to queued spinlock structure
+ *
+ * *,1,0 -> *,0,1
+ *
+ * Lock stealing is not allowed if this function is used.
+ */
+static __always_inline void clear_pending_set_locked(struct qspinlock *lock)
+{
+ struct __qspinlock *l = (void *)lock;
+
+ WRITE_ONCE(l->locked_pending, _Q_LOCKED_VAL);
+}
+
+/*
+ * xchg_tail - Put in the new queue tail code word & retrieve previous one
+ * @lock : Pointer to queued spinlock structure
+ * @tail : The new queue tail code word
+ * Return: The previous queue tail code word
+ *
+ * xchg(lock, tail)
+ *
+ * p,*,* -> n,*,* ; prev = xchg(lock, node)
+ */
+static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail)
+{
+ struct __qspinlock *l = (void *)lock;
+
+ return (u32)xchg(&l->tail, tail >> _Q_TAIL_OFFSET) << _Q_TAIL_OFFSET;
+}
+
+#else /* _Q_PENDING_BITS == 8 */
+
+/**
+ * clear_pending_set_locked - take ownership and clear the pending bit.
+ * @lock: Pointer to queued spinlock structure
+ *
+ * *,1,0 -> *,0,1
+ */
+static __always_inline void clear_pending_set_locked(struct qspinlock *lock)
+{
+ atomic_add(-_Q_PENDING_VAL + _Q_LOCKED_VAL, &lock->val);
+}
+
+/**
+ * xchg_tail - Put in the new queue tail code word & retrieve previous one
+ * @lock : Pointer to queued spinlock structure
+ * @tail : The new queue tail code word
+ * Return: The previous queue tail code word
+ *
+ * xchg(lock, tail)
+ *
+ * p,*,* -> n,*,* ; prev = xchg(lock, node)
+ */
+static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail)
+{
+ u32 old, new, val = atomic_read(&lock->val);
+
+ for (;;) {
+ new = (val & _Q_LOCKED_PENDING_MASK) | tail;
+ old = atomic_cmpxchg(&lock->val, val, new);
+ if (old == val)
+ break;
+
+ val = old;
+ }
+ return old;
+}
+#endif /* _Q_PENDING_BITS == 8 */
+
+
+static inline u32 encode_tail(int cpu, int idx)
+{
+ u32 tail;
+
+ tail = (cpu + 1) << _Q_TAIL_CPU_OFFSET;
+ tail |= idx << _Q_TAIL_IDX_OFFSET; /* assume < 4 */
+
+ return tail;
+}
+
+static inline struct mcs_spinlock *decode_tail(u32 tail)
+{
+ int cpu = (tail >> _Q_TAIL_CPU_OFFSET) - 1;
+ int idx = (tail & _Q_TAIL_IDX_MASK) >> _Q_TAIL_IDX_OFFSET;
+
+ return per_cpu_ptr(&mcs_nodes[idx], cpu);
+}
+
+static __always_inline void set_locked(struct qspinlock *lock)
+{
+ struct __qspinlock *l = (void *)lock;
+
+ WRITE_ONCE(l->locked, _Q_LOCKED_VAL);
+}
+static __always_inline void __pv_init_node(struct mcs_spinlock *node) { }
+static __always_inline void __pv_wait_node(struct mcs_spinlock *node) { }
+static __always_inline void __pv_kick_node(struct qspinlock *lock,
+ struct mcs_spinlock *node) { }
+static __always_inline void __pv_wait_head(struct qspinlock *lock,
+ struct mcs_spinlock *node) { }
+
+
+#define pv_enabled() false
+
+#define pv_init_node __pv_init_node
+#define pv_wait_node __pv_wait_node
+#define pv_kick_node __pv_kick_node
+#define pv_wait_head __pv_wait_head
+
+#ifdef CONFIG_PARAVIRT_SPINLOCKS
+#define org_queued_spin_lock_slowpath native_queued_spin_lock_slowpath
+#endif
+#ifndef arch_mcs_spin_lock_contended
+/*
+ * Using smp_load_acquire() provides a memory barrier that ensures
+ * subsequent operations happen after the lock is acquired.
+ */
+#define arch_mcs_spin_lock_contended(l) \
+do { \
+ while (!(smp_load_acquire(l))) \
+ cpu_relax_lowlatency(); \
+} while (0)
+#endif
+
+#ifndef arch_mcs_spin_unlock_contended
+/*
+ * smp_store_release() provides a memory barrier to ensure all
+ * operations in the critical section have been completed before
+ * unlocking.
+ */
+#define arch_mcs_spin_unlock_contended(l) \
+ smp_store_release((l), 1)
+#endif
+
+ //__raw_cmpxchg((ptr), (old), (new), (size), LOCK_PREFIX)
+static __always_inline void _queued_pending_lock(struct qspinlock *lock)
+{
+ struct __qspinlock *l = (void *)lock;
+ while(cmpxchg(&l->locked_pending, _Q_PENDING_VAL, _Q_LOCKED_VAL) != _Q_LOCKED_VAL)
+ cpu_relax();
+}
+
+void org_queued_spin_lock_slowpath(struct qspinlock *lock, u32 val)
+{
+ struct mcs_spinlock *prev, *next, *node;
+ u32 new, old, tail;
+ int idx;
+
+ BUILD_BUG_ON(CONFIG_NR_CPUS >= (1U << _Q_TAIL_CPU_BITS));
+
+ if (pv_enabled())
+ goto queue;
+
+ if (virt_spin_lock(lock))
+ return;
+
+ /*
+ * wait for in-progress pending->locked hand-overs
+ *
+ * 0,1,0 -> 0,0,1
+ */
+
+ if (val == _Q_PENDING_VAL) {
+ while ((val = atomic_read(&lock->val)) == _Q_PENDING_VAL)
+ cpu_relax();
+ }
+
+ /*
+ * trylock || pending
+ *
+ * 0,0,0 -> 0,0,1 ; trylock
+ * 0,0,1 -> 0,1,1 ; pending
+ */
+ for (;;) {
+ /*
+ * If we observe any contention; queue.
+ */
+ if (val & ~_Q_LOCKED_MASK)
+ goto queue;
+
+ new = _Q_LOCKED_VAL;
+ if (val == new)
+ new |= _Q_PENDING_VAL;
+
+ old = atomic_cmpxchg(&lock->val, val, new);
+ if (old == val)
+ break;
+
+ val = old;
+ }
+
+ /*
+ * we won the trylock
+ */
+ if (new == _Q_LOCKED_VAL)
+ return;
+
+ /*
+ * we're pending, wait for the owner to go away.
+ *
+ * *,1,1 -> *,1,0
+ *
+ * this wait loop must be a load-acquire such that we match the
+ * store-release that clears the locked bit and create lock
+ * sequentiality; this is because not all clear_pending_set_locked()
+ * implementations imply full barriers.
+ */
+
+
+ while ((val = smp_load_acquire(&lock->val.counter)) & _Q_LOCKED_MASK) {
+ cpu_relax();
+ }
+ /*
+ * take ownership and clear the pending bit.
+ *
+ * *,1,0 -> *,0,1
+ */
+ clear_pending_set_locked(lock);
+ return;
+
+ /*
+ * End of pending bit optimistic spinning and beginning of MCS
+ * queuing.
+ */
+queue:
+ node = this_cpu_ptr(&mcs_nodes[0]);
+ idx = node->count++;
+ tail = encode_tail(smp_processor_id(), idx);
+
+ node += idx;
+ node->locked = 0;
+ node->next = NULL;
+ pv_init_node(node);
+
+ /*
+ * We touched a (possibly) cold cacheline in the per-cpu queue node;
+ * attempt the trylock once more in the hope someone let go while we
+ * weren't watching.
+ */
+ if (queued_spin_trylock(lock))
+ goto release;
+
+ /*
+ * We have already touched the queueing cacheline; don't bother with
+ * pending stuff.
+ *
+ * p,*,* -> n,*,*
+ */
+ old = xchg_tail(lock, tail);
+
+ /*
+ * if there was a previous node; link it and wait until reaching the
+ * head of the waitqueue.
+ */
+ if (old & _Q_TAIL_MASK) {
+ prev = decode_tail(old);
+ WRITE_ONCE(prev->next, node);
+
+ pv_wait_node(node);
+ arch_mcs_spin_lock_contended(&node->locked);
+ }
+
+ /*
+ * we're at the head of the waitqueue, wait for the owner & pending to
+ * go away.
+ *
+ * *,x,y -> *,0,0
+ *
+ * this wait loop must use a load-acquire such that we match the
+ * store-release that clears the locked bit and create lock
+ * sequentiality; this is because the set_locked() function below
+ * does not imply a full barrier.
+ *
+ */
+ pv_wait_head(lock, node);
+ while ((val = smp_load_acquire(&lock->val.counter)) & _Q_LOCKED_PENDING_MASK) {
+ cpu_relax();
+ }
+
+ /*
+ * claim the lock:
+ *
+ * n,0,0 -> 0,0,1 : lock, uncontended
+ * *,0,0 -> *,0,1 : lock, contended
+ *
+ * If the queue head is the only one in the queue (lock value == tail),
+ * clear the tail code and grab the lock. Otherwise, we only need
+ * to grab the lock.
+ */
+ for (;;) {
+ if (val != tail) {
+ set_locked(lock);
+ break;
+ }
+ old = atomic_cmpxchg(&lock->val, val, _Q_LOCKED_VAL);
+ if (old == val)
+ goto release; /* No contention */
+
+ val = old;
+ }
+
+ /*
+ * contended path; wait for next, release.
+ */
+ while (!(next = READ_ONCE(node->next)))
+ cpu_relax();
+
+ arch_mcs_spin_unlock_contended(&next->locked);
+ pv_kick_node(lock, next);
+
+release:
+ /*
+ * release the node
+ */
+ this_cpu_dec(mcs_nodes[0].count);
+}
+
+static __always_inline void org_queued_spin_lock(struct qspinlock *lock)
+{
+ u32 val;
+
+ val = atomic_cmpxchg(&lock->val, 0, _Q_LOCKED_VAL);
+ if (likely(val == 0))
+ return;
+ org_queued_spin_lock_slowpath(lock, val);
+}
+
+static __always_inline void org_queued_spin_unlock(struct qspinlock *lock)
+{
+ /*
+ * smp_mb__before_atomic() in order to guarantee release semantics
+ */
+ //smp_mb__before_atomic_dec();
+ atomic_sub(_Q_LOCKED_VAL, &lock->val);
+}
+#endif /* _LINUX_SLAB_LOCK_H */
diff --git a/mm/slab.c b/mm/slab.c
index 4fcc5dd..c1c4821 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -87,6 +87,7 @@
*/
#include <linux/slab.h>
+#include <linux/slab_lock.h>
#include <linux/mm.h>
#include <linux/poison.h>
#include <linux/swap.h>
@@ -128,7 +129,6 @@
#include "internal.h"
#include "slab.h"
-
/*
* DEBUG - 1 for kmem_cache_create() to honour; SLAB_RED_ZONE & SLAB_POISON.
* 0 for faster, smaller code (especially in the critical paths).
@@ -2765,104 +2765,135 @@ static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp,
#define kfree_debugcheck(x) do { } while(0)
#define cache_free_debugcheck(x,objp,z) (objp)
#endif
-
-static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags,
- bool force_refill)
-{
+struct refill_para {
+ struct kmem_cache *cachep;
int batchcount;
struct kmem_cache_node *n;
struct array_cache *ac;
int node;
+};
- check_irq_off();
- node = numa_mem_id();
- if (unlikely(force_refill))
- goto force_grow;
-retry:
- ac = cpu_cache_get(cachep);
- batchcount = ac->batchcount;
- if (!ac->touched && batchcount > BATCHREFILL_LIMIT) {
- /*
- * If there was little recent activity on this cache, then
- * perform only a partial refill. Otherwise we could generate
- * refill bouncing.
- */
- batchcount = BATCHREFILL_LIMIT;
- }
- n = get_node(cachep, node);
+static void refill_fn(void *p)
+{
- BUG_ON(ac->avail > 0 || !n);
- spin_lock(&n->list_lock);
+
+ struct refill_para *pa = p;
/* See if we can refill from the shared array */
- if (n->shared && transfer_objects(ac, n->shared, batchcount)) {
- n->shared->touched = 1;
+ if (pa->n->shared && transfer_objects(pa->ac, pa->n->shared, pa->batchcount)) {
+ pa->n->shared->touched = 1;
goto alloc_done;
}
- while (batchcount > 0) {
+ while (pa->batchcount > 0) {
struct list_head *entry;
struct page *page;
/* Get slab alloc is to come from. */
- entry = n->slabs_partial.next;
- if (entry == &n->slabs_partial) {
- n->free_touched = 1;
- entry = n->slabs_free.next;
- if (entry == &n->slabs_free)
+ entry = pa->n->slabs_partial.next;
+ if (entry == &pa->n->slabs_partial) {
+ pa->n->free_touched = 1;
+ entry = pa->n->slabs_free.next;
+ if (entry == &pa->n->slabs_free)
goto must_grow;
}
page = list_entry(entry, struct page, lru);
- check_spinlock_acquired(cachep);
+ check_spinlock_acquired(pa->cachep);
/*
* The slab was either on partial or free list so
* there must be at least one object available for
* allocation.
*/
- BUG_ON(page->active >= cachep->num);
+ BUG_ON(page->active >= pa->cachep->num);
- while (page->active < cachep->num && batchcount--) {
- STATS_INC_ALLOCED(cachep);
- STATS_INC_ACTIVE(cachep);
- STATS_SET_HIGH(cachep);
+ while (page->active < pa->cachep->num && pa->batchcount--) {
+ STATS_INC_ALLOCED(pa->cachep);
+ STATS_INC_ACTIVE(pa->cachep);
+ STATS_SET_HIGH(pa->cachep);
- ac_put_obj(cachep, ac, slab_get_obj(cachep, page,
- node));
+ ac_put_obj(pa->cachep, pa->ac, slab_get_obj(pa->cachep, page,
+ pa->node));
}
/* move slabp to correct slabp list: */
list_del(&page->lru);
- if (page->active == cachep->num)
- list_add(&page->lru, &n->slabs_full);
+ if (page->active == pa->cachep->num)
+ list_add(&page->lru, &pa->n->slabs_full);
else
- list_add(&page->lru, &n->slabs_partial);
+ list_add(&page->lru, &pa->n->slabs_partial);
}
must_grow:
- n->free_objects -= ac->avail;
+ pa->n->free_objects -= pa->ac->avail;
alloc_done:
- spin_unlock(&n->list_lock);
+ return;
+
+}
+
+#define ORG_QUEUED_SPINLOCK (1)
+static void *cache_alloc_refill(struct kmem_cache *pcachep, gfp_t flags,
+ bool force_refill)
+{
+ struct refill_para pa;
+ unsigned long start, end;
+ pa.cachep = pcachep;
+
+ check_irq_off();
+ pa.node = numa_mem_id();
+ if (unlikely(force_refill))
+ goto force_grow;
+retry:
+ pa.ac = cpu_cache_get(pa.cachep);
+ pa.batchcount = pa.ac->batchcount;
+ if (!pa.ac->touched && pa.batchcount > BATCHREFILL_LIMIT) {
+ /*
+ * If there was little recent activity on this cache, then
+ * perform only a partial refill. Otherwise we could generate
+ * refill bouncing.
+ */
+ pa.batchcount = BATCHREFILL_LIMIT;
+ }
+ pa.n = get_node(pa.cachep, pa.node);
+
+ BUG_ON(pa.ac->avail > 0 || !pa.n);
+ HP_TIMING_NOW(start);
+#if ORG_QUEUED_SPINLOCK
+ org_queued_spin_lock((struct qspinlock *)&pa.n->list_lock);
+ refill_fn(&pa);
+ org_queued_spin_unlock((struct qspinlock *)&pa.n->list_lock);
+#else
+ new_spin_lock((struct nspinlock *)&pa.n->list_lock, refill_fn, &pa);
+#endif
+ HP_TIMING_NOW(end);
+
+ atomic_addq(&_total_, end - start);
+ atomic_addq(&_number_, 1);
+
+ if(cmpxchg(&_number_,1000000, 1) == 1000000) {
+ printk("\n cost time is %lu\n", _total_);
+ _total_ = 0;
+ }
- if (unlikely(!ac->avail)) {
+ if (unlikely(!pa.ac->avail)) {
int x;
force_grow:
- x = cache_grow(cachep, gfp_exact_node(flags), node, NULL);
+ x = cache_grow(pa.cachep, gfp_exact_node(flags), pa.node, NULL);
/* cache_grow can reenable interrupts, then ac could change. */
- ac = cpu_cache_get(cachep);
- node = numa_mem_id();
+ pa.ac = cpu_cache_get(pa.cachep);
+ pa.node = numa_mem_id();
/* no objects in sight? abort */
- if (!x && (ac->avail == 0 || force_refill))
+ if (!x && (pa.ac->avail == 0 || force_refill))
return NULL;
- if (!ac->avail) /* objects refilled by interrupt? */
+ if (!pa.ac->avail) /* objects refilled by interrupt? */
goto retry;
}
- ac->touched = 1;
+ pa.ac->touched = 1;
- return ac_get_obj(cachep, ac, flags, force_refill);
+ return ac_get_obj(pa.cachep, pa.ac, flags, force_refill);
}
static inline void cache_alloc_debugcheck_before(struct kmem_cache *cachep,
@@ -3316,42 +3347,42 @@ static void free_block(struct kmem_cache *cachep, void **objpp,
}
}
-static void cache_flusharray(struct kmem_cache *cachep, struct array_cache *ac)
-{
- int batchcount;
+struct flusharray_para {
+
struct kmem_cache_node *n;
- int node = numa_mem_id();
- LIST_HEAD(list);
+ int batchcount;
+ int node;
+ struct list_head list;
+ struct kmem_cache *cachep;
+ struct array_cache *ac;
+};
- batchcount = ac->batchcount;
-#if DEBUG
- BUG_ON(!batchcount || batchcount > ac->avail);
-#endif
- check_irq_off();
- n = get_node(cachep, node);
- spin_lock(&n->list_lock);
- if (n->shared) {
- struct array_cache *shared_array = n->shared;
+static void flusharray_fn(void *p)
+{
+
+ struct flusharray_para *pa = p;
+ if (pa->n->shared) {
+ struct array_cache *shared_array = pa->n->shared;
int max = shared_array->limit - shared_array->avail;
if (max) {
- if (batchcount > max)
- batchcount = max;
+ if (pa->batchcount > max)
+ pa->batchcount = max;
memcpy(&(shared_array->entry[shared_array->avail]),
- ac->entry, sizeof(void *) * batchcount);
- shared_array->avail += batchcount;
+ pa->ac->entry, sizeof(void *) * pa->batchcount);
+ shared_array->avail += pa->batchcount;
goto free_done;
}
}
- free_block(cachep, ac->entry, batchcount, node, &list);
+ free_block(pa->cachep, pa->ac->entry, pa->batchcount, pa->node, &pa->list);
free_done:
#if STATS
{
int i = 0;
struct list_head *p;
- p = n->slabs_free.next;
- while (p != &(n->slabs_free)) {
+ p = pa->n->slabs_free.next;
+ while (p != &(pa->n->slabs_free)) {
struct page *page;
page = list_entry(p, struct page, lru);
@@ -3360,13 +3391,40 @@ free_done:
i++;
p = p->next;
}
- STATS_SET_FREEABLE(cachep, i);
+ STATS_SET_FREEABLE(pa->cachep, i);
}
#endif
- spin_unlock(&n->list_lock);
- slabs_destroy(cachep, &list);
- ac->avail -= batchcount;
- memmove(ac->entry, &(ac->entry[batchcount]), sizeof(void *)*ac->avail);
+
+ return;
+}
+static void cache_flusharray(struct kmem_cache *pcachep, struct array_cache *pac)
+{
+ struct flusharray_para pa;
+
+ unsigned long start, end;
+ INIT_LIST_HEAD(&pa.list);
+ pa.node = numa_mem_id();
+ pa.cachep = pcachep;
+ pa.ac = pac;
+ pa.batchcount = pa.ac->batchcount;
+#if DEBUG
+ BUG_ON(!pa.batchcount || pa.batchcount > pa.ac->avail);
+#endif
+ check_irq_off();
+ pa.n = get_node(pa.cachep, pa.node);
+ HP_TIMING_NOW(start);
+#if ORG_QUEUED_SPINLOCK
+ org_queued_spin_lock((struct qspinlock *)&pa.n->list_lock);
+ flusharray_fn(&pa);
+ org_queued_spin_unlock((struct qspinlock *)&pa.n->list_lock);
+#else
+ new_spin_lock((struct nspinlock *)&pa.n->list_lock, flusharray_fn, &pa);
+#endif
+ HP_TIMING_NOW(end);
+ atomic_addq(&_total_, end - start);
+ slabs_destroy(pa.cachep, &pa.list);
+ pa.ac->avail -= pa.batchcount;
+ memmove(pa.ac->entry, &(pa.ac->entry[pa.batchcount]), sizeof(void *)*pa.ac->avail);
}
/*
--
1.7.1
[-- Attachment #3: thread.c --]
[-- Type: text/x-csrc, Size: 2150 bytes --]
/**
Test Case:
OpenDir, Get status and close it.
*/
#include <unistd.h>
#include <dirent.h>
#include <sys/stat.h>
#include <sys/fcntl.h>
#include <stdio.h>
#include <string.h>
#include <errno.h>
#include <pthread.h>
#include <stdlib.h>	/* system(), exit() */
#define TEST_DIR "/tmp/thread"
#define MAX_TEST_THREAD (80)
#define MAX_TEST_FILE 5000
static unsigned long *result[MAX_TEST_THREAD];
static int stop = 0;
static void* case_function(void *para)
{
int id = (int)(long)para;
DIR *pDir;
struct stat f_stat;
struct dirent *entry=NULL;
char path[256];
char cmd[512];
int filecnt = 0;
int dircnt = 0;
int filetotalsize = 0;
unsigned long myresult = 0;
int f = 0;
result[id] = &myresult;
/* Goto my path and construct empty file */
sprintf(path, "%s/%d", TEST_DIR, id);
printf("Creating temp file at %s....\n", path);
sprintf(cmd, "mkdir %s", path);
system(cmd);
chdir(path);
for (f = 0; f < MAX_TEST_FILE; f++)
{
char name[256];
sprintf(name, "%s/%d", path, f);
int t = open(name, O_RDWR | O_CREAT | O_TRUNC, S_IRWXU);
if (t != -1)
close(t);
else
{
printf("Errno = %d.\n", errno);
exit(errno);
}
}
again:
if ((pDir = opendir(path)) == NULL)
{
printf("打开 %s 错误:没有那个文件或目录\n", TEST_DIR);
goto err;
}
while ((entry = readdir(pDir)) != NULL)
{
struct stat buf;
if (entry->d_name[0] == '.')
continue;
//f = open(entry->d_name, 0);
f = stat(entry->d_name, &buf); /* returns 0 or -1, not an fd, so there is nothing to close */
myresult++;
//printf("Filename %s, size %10d",entry->d_name, f_stat.st_size);
}
closedir(pDir);
/* Need to stop */
if (!stop)
goto again;
return 0;
err:
return NULL;
}
int main(void)
{
int i;
pthread_t thread;
system("mkdir "TEST_DIR);
for (i = 0; i < MAX_TEST_THREAD; i++)
{
pthread_create(&thread, NULL, case_function, (void*)(long)i);
}
while (1)
{
sleep(1);
printf("Statistics:\n");
for (i = 0; i < MAX_TEST_THREAD; i++)
{
printf("%d\t", *result[i]);
}
printf("\n");
for (i = 0; i < MAX_TEST_THREAD; i++)
*result[i] = 0;
}
}
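For reference: the original and the new lock are selected at build time via
the ORG_QUEUED_SPINLOCK macro in mm/slab.c. thread.c is built as an ordinary
pthread program (something like gcc -O2 -pthread thread.c -o thread; the
exact flags should not matter) and run on the patched kernel, which prints
the accumulated lock cost via printk roughly every million refill
operations; those printed totals are presumably where the
93841765576 / 81036259588 figures quoted above come from.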
^ permalink raw reply related [flat|nested] 11+ messages in thread