xen-devel.lists.xenproject.org archive mirror
* [PATCH 1/2] rwlock: add per-cpu reader-writer locks
@ 2015-11-03 17:58 Malcolm Crossley
  2015-11-03 17:58 ` [PATCH 2/2] grant_table: convert grant table rwlock to percpu rwlock Malcolm Crossley
                   ` (2 more replies)
  0 siblings, 3 replies; 31+ messages in thread
From: Malcolm Crossley @ 2015-11-03 17:58 UTC (permalink / raw)
  To: xen-devel
  Cc: keir, ian.campbell, andrew.cooper3, stefano.stabellini, jbeulich,
	Malcolm Crossley

Per-cpu read-write locks allow the read fast path to have low overhead:
taking or releasing the read lock only sets or clears a per-cpu variable.
The read fast path also avoids locked compare-and-swap operations, which can
be particularly slow on coherent multi-socket systems when there is heavy
usage of the read lock itself.

The per-cpu reader-writer lock uses a global variable to control the read lock
fast path. This allows a writer to disable the fast path and ensure the readers
use the underlying read-write lock implementation.

Once the writer has taken the write lock and disabled the fast path, it must
poll the per-cpu variable for all CPUs which have entered the critical section
for the specific read-write lock the writer is attempting to take. This design
allows a single per-cpu variable to be used for read/write locks belonging
to separate data structures, as long as multiple per-cpu read locks are not
held simultaneously by one particular CPU. This also means per-cpu
reader-writer locks are not recursion safe.
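
For illustration, a minimal sketch of the nesting hazard (my_rwlock,
my_rwlock_barrier, a and b are illustrative names, not part of this patch):

    /* Each CPU has a single per-cpu slot per lock class, so taking a
     * second per-cpu read lock silently overwrites tracking of the first. */
    percpu_read_lock(&get_per_cpu_var(my_rwlock), &my_rwlock_barrier,
                     &a->lock);   /* this CPU's slot == &a->lock */
    percpu_read_lock(&get_per_cpu_var(my_rwlock), &my_rwlock_barrier,
                     &b->lock);   /* slot overwritten with &b->lock */
    /* A writer on a->lock polling the per-cpu slots no longer sees this
     * CPU as a reader of a->lock, and may enter its critical section. */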

Slow path readers which are unblocked set the per-cpu variable and drop the
read lock. This simplifies the implementation and allows the fairness of the
underlying read-write lock to be taken advantage of.

There may be slightly more overhead on the per-cpu write lock path due to
checking each CPU's fast path read variable, but this overhead is likely to
be hidden by the required delay of waiting for readers to exit the critical
section. The loop is optimised to iterate only over the per-cpu data of
active readers of the rwlock.
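
As a rough usage sketch (my_rwlock, my_rwlock_barrier and data->lock are
illustrative names; the pattern mirrors the grant table conversion in
patch 2 of this series):

    /* One per-cpu slot and one fast-path control flag per class of lock. */
    bool_t my_rwlock_barrier;
    DEFINE_PER_CPU(rwlock_t *, my_rwlock);

    /* Reader: the fast path only sets/clears this CPU's slot. */
    percpu_read_lock(&get_per_cpu_var(my_rwlock), &my_rwlock_barrier,
                     &data->lock);
    /* ... read-side critical section ... */
    percpu_read_unlock(&get_per_cpu_var(my_rwlock));

    /* Writer: takes the real write lock, then waits for fast-path readers. */
    percpu_write_lock(&get_per_cpu_var(my_rwlock), &my_rwlock_barrier,
                      &data->lock);
    /* ... write-side critical section ... */
    percpu_write_unlock(&my_rwlock_barrier, &data->lock);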

Signed-off-by: Malcolm Crossley <malcolm.crossley@citrix.com>
---
 xen/common/spinlock.c        | 32 ++++++++++++++++++++++++++++++++
 xen/include/asm-arm/percpu.h |  5 +++++
 xen/include/asm-x86/percpu.h |  6 ++++++
 xen/include/xen/percpu.h     |  4 ++++
 xen/include/xen/spinlock.h   | 37 +++++++++++++++++++++++++++++++++++++
 5 files changed, 84 insertions(+)

diff --git a/xen/common/spinlock.c b/xen/common/spinlock.c
index 7f89694..a526216 100644
--- a/xen/common/spinlock.c
+++ b/xen/common/spinlock.c
@@ -492,6 +492,38 @@ int _rw_is_write_locked(rwlock_t *lock)
     return (lock->lock == RW_WRITE_FLAG); /* writer in critical section? */
 }
 
+void percpu_write_lock(rwlock_t **per_cpudata, bool_t *writer_activating,
+		rwlock_t *rwlock)
+{
+    int cpu;
+    cpumask_t active_readers;
+
+    cpumask_copy(&active_readers, &cpu_online_map);
+    /* First take the write lock to protect against other writers */
+    write_lock(rwlock);
+
+    /* Now set the global variable so that readers start using read_lock */
+    *writer_activating = true;
+    smp_mb();
+
+    /* Check if there are any percpu readers in progress on our rwlock */
+    do
+    {
+        for_each_cpu(cpu, &active_readers)
+        {
+            /* Remove any percpu readers not contending
+             * from our check mask. */
+            if ( per_cpu_ptr(per_cpudata, cpu) != rwlock )
+                cpumask_clear_cpu(cpu, &active_readers);
+        }
+        /* Check if we've cleared all percpu readers */
+        if ( cpumask_empty(&active_readers) )
+            break;
+        /* Give the coherency fabric a break */
+        cpu_relax();
+    } while ( 1 );
+}
+
 #ifdef LOCK_PROFILE
 
 struct lock_profile_anc {
diff --git a/xen/include/asm-arm/percpu.h b/xen/include/asm-arm/percpu.h
index 71e7649..c308a56 100644
--- a/xen/include/asm-arm/percpu.h
+++ b/xen/include/asm-arm/percpu.h
@@ -27,6 +27,11 @@ void percpu_init_areas(void);
 #define __get_cpu_var(var) \
     (*RELOC_HIDE(&per_cpu__##var, READ_SYSREG(TPIDR_EL2)))
 
+#define per_cpu_ptr(var, cpu)  \
+    (*RELOC_HIDE(&var, __per_cpu_offset[cpu]))
+#define __get_cpu_ptr(var) \
+    (*RELOC_HIDE(&var, READ_SYSREG(TPIDR_EL2)))
+
 #define DECLARE_PER_CPU(type, name) extern __typeof__(type) per_cpu__##name
 
 DECLARE_PER_CPU(unsigned int, cpu_id);
diff --git a/xen/include/asm-x86/percpu.h b/xen/include/asm-x86/percpu.h
index 604ff0d..51562b9 100644
--- a/xen/include/asm-x86/percpu.h
+++ b/xen/include/asm-x86/percpu.h
@@ -20,4 +20,10 @@ void percpu_init_areas(void);
 
 #define DECLARE_PER_CPU(type, name) extern __typeof__(type) per_cpu__##name
 
+#define __get_cpu_ptr(var) \
+    (*RELOC_HIDE(var, get_cpu_info()->per_cpu_offset))
+
+#define per_cpu_ptr(var, cpu)  \
+    (*RELOC_HIDE(var, __per_cpu_offset[cpu]))
+
 #endif /* __X86_PERCPU_H__ */
diff --git a/xen/include/xen/percpu.h b/xen/include/xen/percpu.h
index abe0b11..c896863 100644
--- a/xen/include/xen/percpu.h
+++ b/xen/include/xen/percpu.h
@@ -16,6 +16,10 @@
 /* Preferred on Xen. Also see arch-defined per_cpu(). */
 #define this_cpu(var)    __get_cpu_var(var)
 
+#define this_cpu_ptr(ptr)    __get_cpu_ptr(ptr)
+
+#define get_per_cpu_var(var)  (per_cpu__##var)
+
 /* Linux compatibility. */
 #define get_cpu_var(var) this_cpu(var)
 #define put_cpu_var(var)
diff --git a/xen/include/xen/spinlock.h b/xen/include/xen/spinlock.h
index fb0438e..f929f1b 100644
--- a/xen/include/xen/spinlock.h
+++ b/xen/include/xen/spinlock.h
@@ -3,6 +3,7 @@
 
 #include <asm/system.h>
 #include <asm/spinlock.h>
+#include <xen/stdbool.h>
 
 #ifndef NDEBUG
 struct lock_debug {
@@ -261,4 +262,40 @@ int _rw_is_write_locked(rwlock_t *lock);
 #define rw_is_locked(l)               _rw_is_locked(l)
 #define rw_is_write_locked(l)         _rw_is_write_locked(l)
 
+static inline void percpu_read_lock(rwlock_t **per_cpudata, bool_t *writer_activating,
+		rwlock_t *rwlock)
+{
+    /* Indicate this cpu is reading */
+    this_cpu_ptr(per_cpudata) = rwlock;
+    smp_mb();
+    /* Check if a writer is waiting */
+    if ( unlikely(*writer_activating) )
+    {
+        /* Let the waiting writer know we aren't holding the lock */
+        this_cpu_ptr(per_cpudata) = NULL;
+        /* Wait using the read lock to keep the lock fair */
+        read_lock(rwlock);
+        /* Set the per CPU data again and continue */
+        this_cpu_ptr(per_cpudata) = rwlock;
+        /* Drop the read lock because we don't need it anymore */
+        read_unlock(rwlock);
+    }
+}
+
+static inline void percpu_read_unlock(rwlock_t **per_cpudata)
+{
+    this_cpu_ptr(per_cpudata) = NULL;
+    smp_wmb();
+}
+
+/* Don't inline percpu write lock as it's a complex function */
+void percpu_write_lock(rwlock_t **per_cpudata, bool_t *writer_activating,
+		rwlock_t *rwlock);
+
+static inline void percpu_write_unlock(bool_t *writer_activating, rwlock_t *rwlock)
+{
+    *writer_activating = false;
+    write_unlock(rwlock);
+}
+
 #endif /* __SPINLOCK_H__ */
-- 
1.7.12.4


* [PATCH 2/2] grant_table: convert grant table rwlock to percpu rwlock
  2015-11-03 17:58 [PATCH 1/2] rwlock: add per-cpu reader-writer locks Malcolm Crossley
@ 2015-11-03 17:58 ` Malcolm Crossley
  2015-11-17 17:04   ` Jan Beulich
  2015-11-05 13:48 ` [PATCH 1/2] rwlock: add per-cpu reader-writer locks Marcos E. Matsunaga
  2015-11-17 17:00 ` Jan Beulich
  2 siblings, 1 reply; 31+ messages in thread
From: Malcolm Crossley @ 2015-11-03 17:58 UTC (permalink / raw)
  To: xen-devel
  Cc: keir, ian.campbell, andrew.cooper3, stefano.stabellini, jbeulich,
	Malcolm Crossley

The per-domain grant table read lock suffers from significant contention when
performing multi-queue block or network IO, due to the parallel grant
map/unmap/copy operations occurring on the DomU's grant table.

On multi-socket systems, the contention results in the locked compare-and-swap
operation failing frequently, which leads to a tight loop retrying the
compare-and-swap operation. Because the coherency fabric can only sustain a
limited rate of compare-and-swap operations on a particular data location,
taking the read lock itself becomes a bottleneck for grant operations.
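
As a hedged illustration of the retry behaviour (generic code, not the
actual Xen rwlock implementation, and ignoring writer-held states for
simplicity):

    /* A conventional read lock must win a compare-and-swap on the shared
     * lock word.  Under heavy reader traffic the word keeps changing
     * underneath each CPU, so the CAS fails and retries, and every retry
     * bounces the lock's cache line across the coherency fabric. */
    static void read_lock_sketch(volatile unsigned int *lock_word)
    {
        unsigned int old;

        do {
            old = *lock_word;              /* snapshot the reader count */
        } while ( __sync_val_compare_and_swap(lock_word, old, old + 1)
                  != old );                /* retry if another CPU raced us */
    }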

Standard rwlock performance of a single-VIF VM-to-VM transfer with 16 queues
configured was limited to approximately 10 gbit/s on a 2-socket Haswell-EP
host.

Percpu rwlock performance with the same configuration is approximately
50 gbit/s.

Oprofile was used to determine the initial overhead of the read-write locks
and to confirm the overhead was dramatically reduced by the percpu rwlocks.

Signed-off-by: Malcolm Crossley <malcolm.crossley@citrix.com>
---
 xen/arch/arm/mm.c             |   5 ++-
 xen/arch/x86/mm.c             |   5 ++-
 xen/common/grant_table.c      | 100 ++++++++++++++++++++++--------------------
 xen/include/xen/grant_table.h |   4 ++
 4 files changed, 62 insertions(+), 52 deletions(-)

diff --git a/xen/arch/arm/mm.c b/xen/arch/arm/mm.c
index 8b6d915..596cabf 100644
--- a/xen/arch/arm/mm.c
+++ b/xen/arch/arm/mm.c
@@ -1055,7 +1055,8 @@ int xenmem_add_to_physmap_one(
     switch ( space )
     {
     case XENMAPSPACE_grant_table:
-        write_lock(&d->grant_table->lock);
+        percpu_write_lock(&get_per_cpu_var(grant_rwlock), &grant_rwlock_barrier,
+                               &d->grant_table->lock);
 
         if ( d->grant_table->gt_version == 0 )
             d->grant_table->gt_version = 1;
@@ -1085,7 +1086,7 @@ int xenmem_add_to_physmap_one(
 
         t = p2m_ram_rw;
 
-        write_unlock(&d->grant_table->lock);
+        percpu_write_unlock(&grant_rwlock_barrier, &d->grant_table->lock);
         break;
     case XENMAPSPACE_shared_info:
         if ( idx != 0 )
diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c
index 92df36f..2a6fe61 100644
--- a/xen/arch/x86/mm.c
+++ b/xen/arch/x86/mm.c
@@ -4654,7 +4654,8 @@ int xenmem_add_to_physmap_one(
                 mfn = virt_to_mfn(d->shared_info);
             break;
         case XENMAPSPACE_grant_table:
-            write_lock(&d->grant_table->lock);
+            percpu_write_lock(&get_per_cpu_var(grant_rwlock), &grant_rwlock_barrier,
+			        &d->grant_table->lock);
 
             if ( d->grant_table->gt_version == 0 )
                 d->grant_table->gt_version = 1;
@@ -4676,7 +4677,7 @@ int xenmem_add_to_physmap_one(
                     mfn = virt_to_mfn(d->grant_table->shared_raw[idx]);
             }
 
-            write_unlock(&d->grant_table->lock);
+            percpu_write_unlock(&grant_rwlock_barrier, &d->grant_table->lock);
             break;
         case XENMAPSPACE_gmfn_range:
         case XENMAPSPACE_gmfn:
diff --git a/xen/common/grant_table.c b/xen/common/grant_table.c
index 5d52d1e..8d19d2a 100644
--- a/xen/common/grant_table.c
+++ b/xen/common/grant_table.c
@@ -178,6 +178,10 @@ struct active_grant_entry {
 #define _active_entry(t, e) \
     ((t)->active[(e)/ACGNT_PER_PAGE][(e)%ACGNT_PER_PAGE])
 
+bool_t grant_rwlock_barrier;
+
+DEFINE_PER_CPU(rwlock_t *, grant_rwlock);
+
 static inline void gnttab_flush_tlb(const struct domain *d)
 {
     if ( !paging_mode_external(d) )
@@ -270,23 +274,23 @@ double_gt_lock(struct grant_table *lgt, struct grant_table *rgt)
      */
     if ( lgt < rgt )
     {
-        write_lock(&lgt->lock);
-        write_lock(&rgt->lock);
+        percpu_write_lock(&get_per_cpu_var(grant_rwlock), &grant_rwlock_barrier, &lgt->lock);
+        percpu_write_lock(&get_per_cpu_var(grant_rwlock), &grant_rwlock_barrier, &rgt->lock);
     }
     else
     {
         if ( lgt != rgt )
-            write_lock(&rgt->lock);
-        write_lock(&lgt->lock);
+            percpu_write_lock(&get_per_cpu_var(grant_rwlock), &grant_rwlock_barrier, &rgt->lock);
+        percpu_write_lock(&get_per_cpu_var(grant_rwlock), &grant_rwlock_barrier, &lgt->lock);
     }
 }
 
 static inline void
 double_gt_unlock(struct grant_table *lgt, struct grant_table *rgt)
 {
-    write_unlock(&lgt->lock);
+    percpu_write_unlock(&grant_rwlock_barrier, &lgt->lock);
     if ( lgt != rgt )
-        write_unlock(&rgt->lock);
+        percpu_write_unlock(&grant_rwlock_barrier, &rgt->lock);
 }
 
 static inline int
@@ -796,7 +800,7 @@ __gnttab_map_grant_ref(
     }
 
     rgt = rd->grant_table;
-    read_lock(&rgt->lock);
+    percpu_read_lock(&get_per_cpu_var(grant_rwlock), &grant_rwlock_barrier, &rgt->lock);
 
     /* Bounds check on the grant ref */
     if ( unlikely(op->ref >= nr_grant_entries(rgt)))
@@ -859,7 +863,7 @@ __gnttab_map_grant_ref(
     cache_flags = (shah->flags & (GTF_PAT | GTF_PWT | GTF_PCD) );
 
     active_entry_release(act);
-    read_unlock(&rgt->lock);
+    percpu_read_unlock(&get_per_cpu_var(grant_rwlock));
 
     /* pg may be set, with a refcount included, from __get_paged_frame */
     if ( !pg )
@@ -1006,7 +1010,7 @@ __gnttab_map_grant_ref(
         put_page(pg);
     }
 
-    read_lock(&rgt->lock);
+    percpu_read_lock(&get_per_cpu_var(grant_rwlock), &grant_rwlock_barrier, &rgt->lock);
 
     act = active_entry_acquire(rgt, op->ref);
 
@@ -1029,7 +1033,7 @@ __gnttab_map_grant_ref(
     active_entry_release(act);
 
  unlock_out:
-    read_unlock(&rgt->lock);
+    percpu_read_unlock(&get_per_cpu_var(grant_rwlock));
     op->status = rc;
     put_maptrack_handle(lgt, handle);
     rcu_unlock_domain(rd);
@@ -1080,18 +1084,18 @@ __gnttab_unmap_common(
 
     op->map = &maptrack_entry(lgt, op->handle);
 
-    read_lock(&lgt->lock);
+    percpu_read_lock(&get_per_cpu_var(grant_rwlock), &grant_rwlock_barrier, &lgt->lock);
 
     if ( unlikely(!read_atomic(&op->map->flags)) )
     {
-        read_unlock(&lgt->lock);
+        percpu_read_unlock(&get_per_cpu_var(grant_rwlock));
         gdprintk(XENLOG_INFO, "Zero flags for handle (%d).\n", op->handle);
         op->status = GNTST_bad_handle;
         return;
     }
 
     dom = op->map->domid;
-    read_unlock(&lgt->lock);
+    percpu_read_unlock(&get_per_cpu_var(grant_rwlock));
 
     if ( unlikely((rd = rcu_lock_domain_by_id(dom)) == NULL) )
     {
@@ -1113,7 +1117,7 @@ __gnttab_unmap_common(
 
     rgt = rd->grant_table;
 
-    read_lock(&rgt->lock);
+    percpu_read_lock(&get_per_cpu_var(grant_rwlock), &grant_rwlock_barrier, &rgt->lock);
 
     op->flags = read_atomic(&op->map->flags);
     if ( unlikely(!op->flags) || unlikely(op->map->domid != dom) )
@@ -1165,7 +1169,7 @@ __gnttab_unmap_common(
  act_release_out:
     active_entry_release(act);
  unmap_out:
-    read_unlock(&rgt->lock);
+    percpu_read_unlock(&get_per_cpu_var(grant_rwlock));
 
     if ( rc == GNTST_okay && gnttab_need_iommu_mapping(ld) )
     {
@@ -1220,7 +1224,7 @@ __gnttab_unmap_common_complete(struct gnttab_unmap_common *op)
     rcu_lock_domain(rd);
     rgt = rd->grant_table;
 
-    read_lock(&rgt->lock);
+    percpu_read_lock(&get_per_cpu_var(grant_rwlock), &grant_rwlock_barrier, &rgt->lock);
     if ( rgt->gt_version == 0 )
         goto unlock_out;
 
@@ -1286,7 +1290,7 @@ __gnttab_unmap_common_complete(struct gnttab_unmap_common *op)
  act_release_out:
     active_entry_release(act);
  unlock_out:
-    read_unlock(&rgt->lock);
+    percpu_read_unlock(&get_per_cpu_var(grant_rwlock));
 
     if ( put_handle )
     {
@@ -1585,7 +1589,7 @@ gnttab_setup_table(
     }
 
     gt = d->grant_table;
-    write_lock(&gt->lock);
+    percpu_write_lock(&get_per_cpu_var(grant_rwlock), &grant_rwlock_barrier, &gt->lock);
 
     if ( gt->gt_version == 0 )
         gt->gt_version = 1;
@@ -1613,7 +1617,7 @@ gnttab_setup_table(
     }
 
  out3:
-    write_unlock(&gt->lock);
+    percpu_write_unlock(&grant_rwlock_barrier, &gt->lock);
  out2:
     rcu_unlock_domain(d);
  out1:
@@ -1655,13 +1659,13 @@ gnttab_query_size(
         goto query_out_unlock;
     }
 
-    read_lock(&d->grant_table->lock);
+    percpu_read_lock(&get_per_cpu_var(grant_rwlock), &grant_rwlock_barrier, &d->grant_table->lock);
 
     op.nr_frames     = nr_grant_frames(d->grant_table);
     op.max_nr_frames = max_grant_frames;
     op.status        = GNTST_okay;
 
-    read_unlock(&d->grant_table->lock);
+    percpu_read_unlock(&get_per_cpu_var(grant_rwlock));
 
  
  query_out_unlock:
@@ -1687,7 +1691,7 @@ gnttab_prepare_for_transfer(
     union grant_combo   scombo, prev_scombo, new_scombo;
     int                 retries = 0;
 
-    read_lock(&rgt->lock);
+    percpu_read_lock(&get_per_cpu_var(grant_rwlock), &grant_rwlock_barrier, &rgt->lock);
 
     if ( unlikely(ref >= nr_grant_entries(rgt)) )
     {
@@ -1730,11 +1734,11 @@ gnttab_prepare_for_transfer(
         scombo = prev_scombo;
     }
 
-    read_unlock(&rgt->lock);
+    percpu_read_unlock(&get_per_cpu_var(grant_rwlock));
     return 1;
 
  fail:
-    read_unlock(&rgt->lock);
+    percpu_read_unlock(&get_per_cpu_var(grant_rwlock));
     return 0;
 }
 
@@ -1925,7 +1929,7 @@ gnttab_transfer(
         TRACE_1D(TRC_MEM_PAGE_GRANT_TRANSFER, e->domain_id);
 
         /* Tell the guest about its new page frame. */
-        read_lock(&e->grant_table->lock);
+        percpu_read_lock(&get_per_cpu_var(grant_rwlock), &grant_rwlock_barrier, &e->grant_table->lock);
         act = active_entry_acquire(e->grant_table, gop.ref);
 
         if ( e->grant_table->gt_version == 1 )
@@ -1949,7 +1953,7 @@ gnttab_transfer(
             GTF_transfer_completed;
 
         active_entry_release(act);
-        read_unlock(&e->grant_table->lock);
+        percpu_read_unlock(&get_per_cpu_var(grant_rwlock));
 
         rcu_unlock_domain(e);
 
@@ -1987,7 +1991,7 @@ __release_grant_for_copy(
     released_read = 0;
     released_write = 0;
 
-    read_lock(&rgt->lock);
+    percpu_read_lock(&get_per_cpu_var(grant_rwlock), &grant_rwlock_barrier, &rgt->lock);
 
     act = active_entry_acquire(rgt, gref);
     sha = shared_entry_header(rgt, gref);
@@ -2029,7 +2033,7 @@ __release_grant_for_copy(
     }
 
     active_entry_release(act);
-    read_unlock(&rgt->lock);
+    percpu_read_unlock(&get_per_cpu_var(grant_rwlock));
 
     if ( td != rd )
     {
@@ -2086,7 +2090,7 @@ __acquire_grant_for_copy(
 
     *page = NULL;
 
-    read_lock(&rgt->lock);
+    percpu_read_lock(&get_per_cpu_var(grant_rwlock), &grant_rwlock_barrier, &rgt->lock);
 
     if ( unlikely(gref >= nr_grant_entries(rgt)) )
         PIN_FAIL(gt_unlock_out, GNTST_bad_gntref,
@@ -2168,20 +2172,20 @@ __acquire_grant_for_copy(
              * here and reacquire
              */
             active_entry_release(act);
-            read_unlock(&rgt->lock);
+            percpu_read_unlock(&get_per_cpu_var(grant_rwlock));
 
             rc = __acquire_grant_for_copy(td, trans_gref, rd->domain_id,
                                           readonly, &grant_frame, page,
                                           &trans_page_off, &trans_length, 0);
 
-            read_lock(&rgt->lock);
+            percpu_read_lock(&get_per_cpu_var(grant_rwlock), &grant_rwlock_barrier, &rgt->lock);
             act = active_entry_acquire(rgt, gref);
 
             if ( rc != GNTST_okay ) {
                 __fixup_status_for_copy_pin(act, status);
                 rcu_unlock_domain(td);
                 active_entry_release(act);
-                read_unlock(&rgt->lock);
+                percpu_read_unlock(&get_per_cpu_var(grant_rwlock));
                 return rc;
             }
 
@@ -2194,7 +2198,7 @@ __acquire_grant_for_copy(
                 __fixup_status_for_copy_pin(act, status);
                 rcu_unlock_domain(td);
                 active_entry_release(act);
-                read_unlock(&rgt->lock);
+                percpu_read_unlock(&get_per_cpu_var(grant_rwlock));
                 put_page(*page);
                 return __acquire_grant_for_copy(rd, gref, ldom, readonly,
                                                 frame, page, page_off, length,
@@ -2258,7 +2262,7 @@ __acquire_grant_for_copy(
     *frame = act->frame;
 
     active_entry_release(act);
-    read_unlock(&rgt->lock);
+    percpu_read_unlock(&get_per_cpu_var(grant_rwlock));
     return rc;
  
  unlock_out_clear:
@@ -2273,7 +2277,7 @@ __acquire_grant_for_copy(
     active_entry_release(act);
 
  gt_unlock_out:
-    read_unlock(&rgt->lock);
+    percpu_read_unlock(&get_per_cpu_var(grant_rwlock));
 
     return rc;
 }
@@ -2589,7 +2593,7 @@ gnttab_set_version(XEN_GUEST_HANDLE_PARAM(gnttab_set_version_t) uop)
     if ( gt->gt_version == op.version )
         goto out;
 
-    write_lock(&gt->lock);
+    percpu_write_lock(&get_per_cpu_var(grant_rwlock), &grant_rwlock_barrier, &gt->lock);
     /*
      * Make sure that the grant table isn't currently in use when we
      * change the version number, except for the first 8 entries which
@@ -2702,7 +2706,7 @@ gnttab_set_version(XEN_GUEST_HANDLE_PARAM(gnttab_set_version_t) uop)
     gt->gt_version = op.version;
 
  out_unlock:
-    write_unlock(&gt->lock);
+    percpu_write_unlock(&grant_rwlock_barrier, &gt->lock);
 
  out:
     op.version = gt->gt_version;
@@ -2758,7 +2762,7 @@ gnttab_get_status_frames(XEN_GUEST_HANDLE_PARAM(gnttab_get_status_frames_t) uop,
 
     op.status = GNTST_okay;
 
-    read_lock(&gt->lock);
+    percpu_read_lock(&get_per_cpu_var(grant_rwlock), &grant_rwlock_barrier, &gt->lock);
 
     for ( i = 0; i < op.nr_frames; i++ )
     {
@@ -2767,7 +2771,7 @@ gnttab_get_status_frames(XEN_GUEST_HANDLE_PARAM(gnttab_get_status_frames_t) uop,
             op.status = GNTST_bad_virt_addr;
     }
 
-    read_unlock(&gt->lock);
+    percpu_read_unlock(&get_per_cpu_var(grant_rwlock));
 out2:
     rcu_unlock_domain(d);
 out1:
@@ -2817,7 +2821,7 @@ __gnttab_swap_grant_ref(grant_ref_t ref_a, grant_ref_t ref_b)
     struct active_grant_entry *act_b = NULL;
     s16 rc = GNTST_okay;
 
-    write_lock(&gt->lock);
+    percpu_write_lock(&get_per_cpu_var(grant_rwlock), &grant_rwlock_barrier, &gt->lock);
 
     /* Bounds check on the grant refs */
     if ( unlikely(ref_a >= nr_grant_entries(d->grant_table)))
@@ -2865,7 +2869,7 @@ out:
         active_entry_release(act_b);
     if ( act_a != NULL )
         active_entry_release(act_a);
-    write_unlock(&gt->lock);
+    percpu_write_unlock(&grant_rwlock_barrier, &gt->lock);
 
     rcu_unlock_domain(d);
 
@@ -2936,12 +2940,12 @@ static int __gnttab_cache_flush(gnttab_cache_flush_t *cflush,
 
     if ( d != owner )
     {
-        read_lock(&owner->grant_table->lock);
+        percpu_read_lock(&get_per_cpu_var(grant_rwlock), &grant_rwlock_barrier, &owner->grant_table->lock);
 
         ret = grant_map_exists(d, owner->grant_table, mfn, ref_count);
         if ( ret != 0 )
         {
-            read_unlock(&owner->grant_table->lock);
+            percpu_read_unlock(&get_per_cpu_var(grant_rwlock));
             rcu_unlock_domain(d);
             put_page(page);
             return ret;
@@ -2961,7 +2965,7 @@ static int __gnttab_cache_flush(gnttab_cache_flush_t *cflush,
         ret = 0;
 
     if ( d != owner )
-        read_unlock(&owner->grant_table->lock);
+        percpu_read_unlock(&get_per_cpu_var(grant_rwlock));
     unmap_domain_page(v);
     put_page(page);
 
@@ -3282,7 +3286,7 @@ gnttab_release_mappings(
         }
 
         rgt = rd->grant_table;
-        read_lock(&rgt->lock);
+        percpu_read_lock(&get_per_cpu_var(grant_rwlock), &grant_rwlock_barrier, &rgt->lock);
 
         act = active_entry_acquire(rgt, ref);
         sha = shared_entry_header(rgt, ref);
@@ -3343,7 +3347,7 @@ gnttab_release_mappings(
             gnttab_clear_flag(_GTF_reading, status);
 
         active_entry_release(act);
-        read_unlock(&rgt->lock);
+        percpu_read_unlock(&get_per_cpu_var(grant_rwlock));
 
         rcu_unlock_domain(rd);
 
@@ -3432,7 +3436,7 @@ static void gnttab_usage_print(struct domain *rd)
     printk("      -------- active --------       -------- shared --------\n");
     printk("[ref] localdom mfn      pin          localdom gmfn     flags\n");
 
-    read_lock(&gt->lock);
+    percpu_read_lock(&get_per_cpu_var(grant_rwlock), &grant_rwlock_barrier, &gt->lock);
 
     for ( ref = 0; ref != nr_grant_entries(gt); ref++ )
     {
@@ -3475,7 +3479,7 @@ static void gnttab_usage_print(struct domain *rd)
         active_entry_release(act);
     }
 
-    read_unlock(&gt->lock);
+    percpu_read_unlock(&get_per_cpu_var(grant_rwlock));
 
     if ( first )
         printk("grant-table for remote domain:%5d ... "
diff --git a/xen/include/xen/grant_table.h b/xen/include/xen/grant_table.h
index 1c29cee..6056d3e 100644
--- a/xen/include/xen/grant_table.h
+++ b/xen/include/xen/grant_table.h
@@ -51,6 +51,10 @@
 /* The maximum size of a grant table. */
 extern unsigned int max_grant_frames;
 
+extern bool_t grant_rwlock_barrier;
+
+DECLARE_PER_CPU(rwlock_t *, grant_rwlock);
+
 /* Per-domain grant information. */
 struct grant_table {
     /*
-- 
1.7.12.4


* Re: [PATCH 1/2] rwlock: add per-cpu reader-writer locks
  2015-11-03 17:58 [PATCH 1/2] rwlock: add per-cpu reader-writer locks Malcolm Crossley
  2015-11-03 17:58 ` [PATCH 2/2] grant_table: convert grant table rwlock to percpu rwlock Malcolm Crossley
@ 2015-11-05 13:48 ` Marcos E. Matsunaga
  2015-11-05 15:20   ` Malcolm Crossley
  2015-11-17 17:00 ` Jan Beulich
  2 siblings, 1 reply; 31+ messages in thread
From: Marcos E. Matsunaga @ 2015-11-05 13:48 UTC (permalink / raw)
  To: xen-devel, malcolm.crossley

Hi Malcolm,

I tried your patches against staging yesterday and as soon as I started 
a guest, it panicked. I have lock_profile enabled and applied your patches
against:

6f04de658574833688c3f9eab310e7834d56a9c0 x86: cleanup of early cpuid 
handling



(XEN) HVM1 save: CPU
(XEN) HVM1 save: PIC
(XEN) HVM1 save: IOAPIC
(XEN) HVM1 save: LAPIC
(XEN) HVM1 save: LAPIC_REGS
(XEN) HVM1 save: PCI_IRQ
(XEN) HVM1 save: ISA_IRQ
(XEN) HVM1 save: PCI_LINK
(XEN) HVM1 save: PIT
(XEN) HVM1 save: RTC
(XEN) HVM1 save: HPET
(XEN) HVM1 save: PMTIMER
(XEN) HVM1 save: MTRR
(XEN) HVM1 save: VIRIDIAN_DOMAIN
(XEN) HVM1 save: CPU_XSAVE
(XEN) HVM1 save: VIRIDIAN_VCPU
(XEN) HVM1 save: VMCE_VCPU
(XEN) HVM1 save: TSC_ADJUST
(XEN) HVM1 restore: CPU 0
[  394.163143] loop: module loaded
(XEN) Assertion 'rw_is_locked(&t->lock)' failed at grant_table.c:215
(XEN) ----[ Xen-4.7-unstable  x86_64  debug=y  Tainted:    C ]----
(XEN) CPU:    0
(XEN) RIP:    e008:[<ffff82d08011019e>] do_grant_table_op+0x63f/0x2e04
(XEN) RFLAGS: 0000000000010246   CONTEXT: hypervisor (d0v0)
(XEN) rax: 0000000000000000   rbx: ffff83400f9dc9e0   rcx: 0000000000000000
(XEN) rdx: 0000000000000001   rsi: ffff82d080342b10   rdi: ffff83400819b784
(XEN) rbp: ffff8300774ffef8   rsp: ffff8300774ffdf8   r8: 0000000000000002
(XEN) r9:  0000000000000002   r10: 0000000000000002   r11: 00000000ffffffff
(XEN) r12: 0000000000000000   r13: 0000000000000000   r14: ffff83400819b780
(XEN) r15: ffff83400f9d0000   cr0: 0000000080050033   cr4: 00000000001526e0
(XEN) cr3: 000001007f613000   cr2: ffff8800746182b8
(XEN) ds: 0000   es: 0000   fs: 0000   gs: 0000   ss: e010   cs: e008
(XEN) Xen stack trace from rsp=ffff8300774ffdf8:
(XEN)    ffff8300774ffe08 ffff82d000000000 ffff8300774ffef8 ffff82d08017fc9b
(XEN)    ffff82d080342b28 ffff83400f9d8600 ffff82d080342b10 0000000000000000
(XEN)    ffff83400f9dca20 ffff832100000000 ffff834008188000 0000000000000001
(XEN)    00000001772ee000 ffff8801e98d03e0 ffff8300774ffe88 0000000000000000
(XEN)    0000000000000000 ffff8300774fff18 00000021d0269c10 000000000001001a
(XEN)    ffffffff00000001 0000000000000000 0000000000000246 00007ff7de45a407
(XEN)    0000000000000100 00007ff7de45a407 0000000000000033 ffff8300772ee000
(XEN)    ffff8801eb0e3c00 ffff880004bf57e8 ffff8801e98d03e0 ffff8801eb0a5938
(XEN)    00007cff88b000c7 ffff82d08023d952 ffffffff8100128a 0000000000000014
(XEN)    0000000000000000 0000000000000001 ffff8801f6e18388 ffffffff81d3d740
(XEN)    ffff8801efb7bd40 ffff88000542e780 0000000000000282 0000000000000000
(XEN)    ffff8801e98d03a0 ffff8801efe07000 0000000000000014 ffffffff8100128a
(XEN)    0000000000000001 ffff8801e98d03e0 0000000000000000 0001010000000000
(XEN)    ffffffff8100128a 000000000000e033 0000000000000282 ffff8801efb7bce0
(XEN)    000000000000e02b 0000000000000000 0000000000000000 0000000000000000
(XEN)    0000000000000000 0000000000000000 ffff8300772ee000 0000000000000000
(XEN)    0000000000000000
(XEN) Xen call trace:
(XEN)    [<ffff82d08011019e>] do_grant_table_op+0x63f/0x2e04
(XEN)    [<ffff82d08023d952>] lstar_enter+0xe2/0x13c
(XEN)
(XEN)
(XEN) ****************************************
(XEN) Panic on CPU 0:
(XEN) Assertion 'rw_is_locked(&t->lock)' failed at grant_table.c:215
(XEN) ****************************************
(XEN)
(XEN) Manual reset required ('noreboot' specified)


Thanks for your help.

On 11/03/2015 12:58 PM, Malcolm Crossley wrote:
> [...]

-- 

Regards,

Marcos Eduardo Matsunaga

Oracle USA
Linux Engineering

“The statements and opinions expressed here are my own and do not
necessarily represent those of Oracle Corporation.”


* Re: [PATCH 1/2] rwlock: add per-cpu reader-writer locks
  2015-11-05 13:48 ` [PATCH 1/2] rwlock: add per-cpu reader-writer locks Marcos E. Matsunaga
@ 2015-11-05 15:20   ` Malcolm Crossley
  2015-11-05 15:46     ` Marcos E. Matsunaga
  0 siblings, 1 reply; 31+ messages in thread
From: Malcolm Crossley @ 2015-11-05 15:20 UTC (permalink / raw)
  To: Marcos E. Matsunaga, xen-devel

On 05/11/15 13:48, Marcos E. Matsunaga wrote:
> Hi Malcolm,
> 
> I tried your patches against staging yesterday and as soon as I started a guest, it panicked. I have
> lock_profile enabled and applied your patches against:

I tested with a non-debug version of Xen (because I was analysing the performance of Xen), and thus
those ASSERTs were never run.

The ASSERTs can be safely removed. The rwlock behaviour is slightly different in that it is possible
for a writer to hold the write lock whilst a reader is progressing through the read critical
section; this is safe because the writer waits for the per-cpu variables to clear before
actually progressing through its own critical section.
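
To sketch one way such an assertion can now fire (t here stands for the
grant table at the assertion site in grant_table.c:215):

    /* A fast-path reader never touches the underlying rwlock word, so
     * rw_is_locked() on it can report "unlocked" even though this CPU is
     * legitimately inside the read critical section. */
    percpu_read_lock(&get_per_cpu_var(grant_rwlock), &grant_rwlock_barrier,
                     &t->lock);         /* fast path: only per-cpu slot set */
    ASSERT(rw_is_locked(&t->lock));     /* may fail: t->lock itself is idle */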

I have an updated version of the patch series which fixes this. Do you want me to post it, or are
you happy to remove the ASSERTs yourself (or switch to a non-debug build of Xen)?

Sorry for not catching this before it hit the list.

Malcolm

> [...]
> 


* Re: [PATCH 1/2] rwlock: add per-cpu reader-writer locks
  2015-11-05 15:20   ` Malcolm Crossley
@ 2015-11-05 15:46     ` Marcos E. Matsunaga
  0 siblings, 0 replies; 31+ messages in thread
From: Marcos E. Matsunaga @ 2015-11-05 15:46 UTC (permalink / raw)
  To: Malcolm Crossley, xen-devel

Hi Malcolm,

If you can post the updated patches, that would be great. I think it 
would be better for me to test with your update.

Thanks again.

On 11/05/2015 10:20 AM, Malcolm Crossley wrote:
> On 05/11/15 13:48, Marcos E. Matsunaga wrote:
>> Hi Malcolm,
>>
>> I tried your patches against staging yesterday and as soon as I started a guest, it panicked. I have
>> lock_profile enabled and applied your patches against:
> I tested with a non-debug version of Xen (because I was analysing the performance of Xen), and thus
> those ASSERTs were never run.
>
> The ASSERTs can be safely removed. The rwlock behaviour is slightly different in that it is possible
> for a writer to hold the write lock whilst a reader is progressing through the read critical
> section; this is safe because the writer waits for the per-cpu variables to clear before
> actually progressing through its own critical section.
>
> I have an updated version of the patch series which fixes this. Do you want me to post it, or are
> you happy to remove the ASSERTs yourself (or switch to a non-debug build of Xen)?
>
> Sorry for not catching this before it hit the list.
>
> Malcolm
>
>> 6f04de658574833688c3f9eab310e7834d56a9c0 x86: cleanup of early cpuid handling
>>
>>
>>
>> (XEN) HVM1 save: CPU
>> (XEN) HVM1 save: PIC
>> (XEN) HVM1 save: IOAPIC
>> (XEN) HVM1 save: LAPIC
>> (XEN) HVM1 save: LAPIC_REGS
>> (XEN) HVM1 save: PCI_IRQ
>> (XEN) HVM1 save: ISA_IRQ
>> (XEN) HVM1 save: PCI_LINK
>> (XEN) HVM1 save: PIT
>> (XEN) HVM1 save: RTC
>> (XEN) HVM1 save: HPET
>> (XEN) HVM1 save: PMTIMER
>> (XEN) HVM1 save: MTRR
>> (XEN) HVM1 save: VIRIDIAN_DOMAIN
>> (XEN) HVM1 save: CPU_XSAVE
>> (XEN) HVM1 save: VIRIDIAN_VCPU
>> (XEN) HVM1 save: VMCE_VCPU
>> (XEN) HVM1 save: TSC_ADJUST
>> (XEN) HVM1 restore: CPU 0
>> [  394.163143] loop: module loaded
>> (XEN) Assertion 'rw_is_locked(&t->lock)' failed at grant_table.c:215
>> (XEN) ----[ Xen-4.7-unstable  x86_64  debug=y  Tainted:    C ]----
>> (XEN) CPU:    0
>> (XEN) RIP:    e008:[<ffff82d08011019e>] do_grant_table_op+0x63f/0x2e04
>> (XEN) RFLAGS: 0000000000010246   CONTEXT: hypervisor (d0v0)
>> (XEN) rax: 0000000000000000   rbx: ffff83400f9dc9e0   rcx: 0000000000000000
>> (XEN) rdx: 0000000000000001   rsi: ffff82d080342b10   rdi: ffff83400819b784
>> (XEN) rbp: ffff8300774ffef8   rsp: ffff8300774ffdf8   r8: 0000000000000002
>> (XEN) r9:  0000000000000002   r10: 0000000000000002   r11: 00000000ffffffff
>> (XEN) r12: 0000000000000000   r13: 0000000000000000   r14: ffff83400819b780
>> (XEN) r15: ffff83400f9d0000   cr0: 0000000080050033   cr4: 00000000001526e0
>> (XEN) cr3: 000001007f613000   cr2: ffff8800746182b8
>> (XEN) ds: 0000   es: 0000   fs: 0000   gs: 0000   ss: e010   cs: e008
>> (XEN) Xen stack trace from rsp=ffff8300774ffdf8:
>> (XEN)    ffff8300774ffe08 ffff82d000000000 ffff8300774ffef8 ffff82d08017fc9b
>> (XEN)    ffff82d080342b28 ffff83400f9d8600 ffff82d080342b10 0000000000000000
>> (XEN)    ffff83400f9dca20 ffff832100000000 ffff834008188000 0000000000000001
>> (XEN)    00000001772ee000 ffff8801e98d03e0 ffff8300774ffe88 0000000000000000
>> (XEN)    0000000000000000 ffff8300774fff18 00000021d0269c10 000000000001001a
>> (XEN)    ffffffff00000001 0000000000000000 0000000000000246 00007ff7de45a407
>> (XEN)    0000000000000100 00007ff7de45a407 0000000000000033 ffff8300772ee000
>> (XEN)    ffff8801eb0e3c00 ffff880004bf57e8 ffff8801e98d03e0 ffff8801eb0a5938
>> (XEN)    00007cff88b000c7 ffff82d08023d952 ffffffff8100128a 0000000000000014
>> (XEN)    0000000000000000 0000000000000001 ffff8801f6e18388 ffffffff81d3d740
>> (XEN)    ffff8801efb7bd40 ffff88000542e780 0000000000000282 0000000000000000
>> (XEN)    ffff8801e98d03a0 ffff8801efe07000 0000000000000014 ffffffff8100128a
>> (XEN)    0000000000000001 ffff8801e98d03e0 0000000000000000 0001010000000000
>> (XEN)    ffffffff8100128a 000000000000e033 0000000000000282 ffff8801efb7bce0
>> (XEN)    000000000000e02b 0000000000000000 0000000000000000 0000000000000000
>> (XEN)    0000000000000000 0000000000000000 ffff8300772ee000 0000000000000000
>> (XEN)    0000000000000000
>> (XEN) Xen call trace:
>> (XEN)    [<ffff82d08011019e>] do_grant_table_op+0x63f/0x2e04
>> (XEN)    [<ffff82d08023d952>] lstar_enter+0xe2/0x13c
>> (XEN)
>> (XEN)
>> (XEN) ****************************************
>> (XEN) Panic on CPU 0:
>> (XEN) Assertion 'rw_is_locked(&t->lock)' failed at grant_table.c:215
>> (XEN) ****************************************
>> (XEN)
>> (XEN) Manual reset required ('noreboot' specified)
>>
>>
>> Thanks for your help.
>>
>> On 11/03/2015 12:58 PM, Malcolm Crossley wrote:
>>> Per-cpu read-write locks allow for the fast path read case to have low overhead
>>> by only setting/clearing a per-cpu variable for using the read lock.
>>> The per-cpu read fast path also avoids locked compare swap operations which can
>>> be particularly slow on coherent multi-socket systems, particularly if there is
>>> heavy usage of the read lock itself.
>>>
>>> The per-cpu reader-writer lock uses a global variable to control the read lock
>>> fast path. This allows a writer to disable the fast path and ensure the readers
>>> use the underlying read-write lock implementation.
>>>
>>> Once the writer has taken the write lock and disabled the fast path, it must
>>> poll the per-cpu variable for all CPU's which have entered the critical section
>>> for the specific read-write lock the writer is attempting to take. This design
>>> allows for a single per-cpu variable to be used for read/write locks belonging
>>> to seperate data structures as long as multiple per-cpu read locks are not
>>> simultaneously held by one particular cpu. This also means per-cpu
>>> reader-writer locks are not recursion safe.
>>>
>>> Slow path readers which are unblocked set the per-cpu variable and drop the
>>> read lock. This simplifies the implementation and allows for fairness in the
>>> underlying read-write lock to be taken advantage of.
>>>
>>> There may be slightly more overhead on the per-cpu write lock path due to
>>> checking each CPUs fast path read variable but this overhead is likely be hidden
>>> by the required delay of waiting for readers to exit the critical section.
>>> The loop is optimised to only iterate over the per-cpu data of active readers
>>> of the rwlock.
>>>
>>> Signed-off-by: Malcolm Crossley <malcolm.crossley@citrix.com>
>>> ---
>>>    xen/common/spinlock.c        | 32 ++++++++++++++++++++++++++++++++
>>>    xen/include/asm-arm/percpu.h |  5 +++++
>>>    xen/include/asm-x86/percpu.h |  6 ++++++
>>>    xen/include/xen/percpu.h     |  4 ++++
>>>    xen/include/xen/spinlock.h   | 37 +++++++++++++++++++++++++++++++++++++
>>>    5 files changed, 84 insertions(+)
>>>
>>> diff --git a/xen/common/spinlock.c b/xen/common/spinlock.c
>>> index 7f89694..a526216 100644
>>> --- a/xen/common/spinlock.c
>>> +++ b/xen/common/spinlock.c
>>> @@ -492,6 +492,38 @@ int _rw_is_write_locked(rwlock_t *lock)
>>>        return (lock->lock == RW_WRITE_FLAG); /* writer in critical section? */
>>>    }
>>>    +void percpu_write_lock(rwlock_t **per_cpudata, bool_t *writer_activating,
>>> +        rwlock_t *rwlock)
>>> +{
>>> +    int cpu;
>>> +    cpumask_t active_readers;
>>> +
>>> +    cpumask_copy(&active_readers, &cpu_online_map);
>>> +    /* First take the write lock to protect against other writers */
>>> +    write_lock(rwlock);
>>> +
>>> +    /* Now set the global variable so that readers start using read_lock */
>>> +    *writer_activating = true;
>>> +    smp_mb();
>>> +
>>> +    /* Check if there are any percpu readers in progress on our rwlock*/
>>> +    do
>>> +    {
>>> +        for_each_cpu(cpu, &active_readers)
>>> +        {
>>> +            /* Remove any percpu readers not contending
>>> +             * from our check mask */
>>> +            if ( per_cpu_ptr(per_cpudata, cpu) != rwlock )
>>> +                cpumask_clear_cpu(cpu, &active_readers);
>>> +        }
>>> +        /* Check if we've cleared all percpu readers */
>>> +        if ( cpumask_empty(&active_readers) )
>>> +            break;
>>> +        /* Give the coherency fabric a break */
>>> +        cpu_relax();
>>> +    } while ( 1 );
>>> +}
>>> +
>>>    #ifdef LOCK_PROFILE
>>>      struct lock_profile_anc {
>>> diff --git a/xen/include/asm-arm/percpu.h b/xen/include/asm-arm/percpu.h
>>> index 71e7649..c308a56 100644
>>> --- a/xen/include/asm-arm/percpu.h
>>> +++ b/xen/include/asm-arm/percpu.h
>>> @@ -27,6 +27,11 @@ void percpu_init_areas(void);
>>>    #define __get_cpu_var(var) \
>>>        (*RELOC_HIDE(&per_cpu__##var, READ_SYSREG(TPIDR_EL2)))
>>>    +#define per_cpu_ptr(var, cpu)  \
>>> +    (*RELOC_HIDE(&var, __per_cpu_offset[cpu]))
>>> +#define __get_cpu_ptr(var) \
>>> +    (*RELOC_HIDE(&var, READ_SYSREG(TPIDR_EL2)))
>>> +
>>>    #define DECLARE_PER_CPU(type, name) extern __typeof__(type) per_cpu__##name
>>>      DECLARE_PER_CPU(unsigned int, cpu_id);
>>> diff --git a/xen/include/asm-x86/percpu.h b/xen/include/asm-x86/percpu.h
>>> index 604ff0d..51562b9 100644
>>> --- a/xen/include/asm-x86/percpu.h
>>> +++ b/xen/include/asm-x86/percpu.h
>>> @@ -20,4 +20,10 @@ void percpu_init_areas(void);
>>>      #define DECLARE_PER_CPU(type, name) extern __typeof__(type) per_cpu__##name
>>>    +#define __get_cpu_ptr(var) \
>>> +    (*RELOC_HIDE(var, get_cpu_info()->per_cpu_offset))
>>> +
>>> +#define per_cpu_ptr(var, cpu)  \
>>> +    (*RELOC_HIDE(var, __per_cpu_offset[cpu]))
>>> +
>>>    #endif /* __X86_PERCPU_H__ */
>>> diff --git a/xen/include/xen/percpu.h b/xen/include/xen/percpu.h
>>> index abe0b11..c896863 100644
>>> --- a/xen/include/xen/percpu.h
>>> +++ b/xen/include/xen/percpu.h
>>> @@ -16,6 +16,10 @@
>>>    /* Preferred on Xen. Also see arch-defined per_cpu(). */
>>>    #define this_cpu(var)    __get_cpu_var(var)
>>>    +#define this_cpu_ptr(ptr)    __get_cpu_ptr(ptr)
>>> +
>>> +#define get_per_cpu_var(var)  (per_cpu__##var)
>>> +
>>>    /* Linux compatibility. */
>>>    #define get_cpu_var(var) this_cpu(var)
>>>    #define put_cpu_var(var)
>>> diff --git a/xen/include/xen/spinlock.h b/xen/include/xen/spinlock.h
>>> index fb0438e..f929f1b 100644
>>> --- a/xen/include/xen/spinlock.h
>>> +++ b/xen/include/xen/spinlock.h
>>> @@ -3,6 +3,7 @@
>>>      #include <asm/system.h>
>>>    #include <asm/spinlock.h>
>>> +#include <xen/stdbool.h>
>>>      #ifndef NDEBUG
>>>    struct lock_debug {
>>> @@ -261,4 +262,40 @@ int _rw_is_write_locked(rwlock_t *lock);
>>>    #define rw_is_locked(l)               _rw_is_locked(l)
>>>    #define rw_is_write_locked(l)         _rw_is_write_locked(l)
>>>    +static inline void percpu_read_lock(rwlock_t **per_cpudata, bool_t *writer_activating,
>>> +        rwlock_t *rwlock)
>>> +{
>>> +    /* Indicate this cpu is reading */
>>> +    this_cpu_ptr(per_cpudata) = rwlock;
>>> +    smp_mb();
>>> +    /* Check if a writer is waiting */
>>> +    if ( unlikely(*writer_activating) )
>>> +    {
>>> +        /* Let the waiting writer know we aren't holding the lock */
>>> +        this_cpu_ptr(per_cpudata) = NULL;
>>> +        /* Wait using the read lock to keep the lock fair */
>>> +        read_lock(rwlock);
>>> +        /* Set the per CPU data again and continue*/
>>> +        this_cpu_ptr(per_cpudata) = rwlock;
>>> +        /* Drop the read lock because we don't need it anymore */
>>> +        read_unlock(rwlock);
>>> +    }
>>> +}
>>> +
>>> +static inline void percpu_read_unlock(rwlock_t **per_cpudata)
>>> +{
>>> +    this_cpu_ptr(per_cpudata) = NULL;
>>> +    smp_wmb();
>>> +}
>>> +
>>> +/* Don't inline percpu write lock as it's a complex function */
>>> +void percpu_write_lock(rwlock_t **per_cpudata, bool_t *writer_activating,
>>> +        rwlock_t *rwlock);
>>> +
>>> +static inline void percpu_write_unlock(bool_t *writer_activating, rwlock_t *rwlock)
>>> +{
>>> +    *writer_activating = false;
>>> +    write_unlock(rwlock);
>>> +}
>>> +
>>>    #endif /* __SPINLOCK_H__ */

-- 

Regards,

Marcos Eduardo Matsunaga

Oracle USA
Linux Engineering

“The statements and opinions expressed here are my own and do not
necessarily represent those of Oracle Corporation.”

* Re: [PATCH 1/2] rwlock: add per-cpu reader-writer locks
  2015-11-03 17:58 [PATCH 1/2] rwlock: add per-cpu reader-writer locks Malcolm Crossley
  2015-11-03 17:58 ` [PATCH 2/2] grant_table: convert grant table rwlock to percpu rwlock Malcolm Crossley
  2015-11-05 13:48 ` [PATCH 1/2] rwlock: add per-cpu reader-writer locks Marcos E. Matsunaga
@ 2015-11-17 17:00 ` Jan Beulich
  2015-11-18 13:49   ` Malcolm Crossley
  2 siblings, 1 reply; 31+ messages in thread
From: Jan Beulich @ 2015-11-17 17:00 UTC (permalink / raw)
  To: Malcolm Crossley
  Cc: andrew.cooper3, keir, stefano.stabellini, ian.campbell, xen-devel

>>> On 03.11.15 at 18:58, <malcolm.crossley@citrix.com> wrote:
> Per-cpu read-write locks allow for the fast path read case to have low overhead
> by only setting/clearing a per-cpu variable for using the read lock.
> The per-cpu read fast path also avoids locked compare swap operations which can
> be particularly slow on coherent multi-socket systems, particularly if there is
> heavy usage of the read lock itself.
> 
> The per-cpu reader-writer lock uses a global variable to control the read lock
> fast path. This allows a writer to disable the fast path and ensure the readers
> use the underlying read-write lock implementation.
> 
> Once the writer has taken the write lock and disabled the fast path, it must
> poll the per-cpu variable for all CPUs which have entered the critical section
> for the specific read-write lock the writer is attempting to take. This design
> allows for a single per-cpu variable to be used for read/write locks belonging
> to separate data structures as long as multiple per-cpu read locks are not
> simultaneously held by one particular cpu. This also means per-cpu
> reader-writer locks are not recursion safe.

This sounds like pretty severe a restriction, and something to easily
get wrong once more than a handful of users got introduced.

> Slow path readers which are unblocked set the per-cpu variable and drop the
> read lock. This simplifies the implementation and allows for fairness in the
> underlying read-write lock to be taken advantage of.
> 
> There may be slightly more overhead on the per-cpu write lock path due to
> checking each CPU's fast path read variable but this overhead is likely to be hidden
> by the required delay of waiting for readers to exit the critical section.

Hmm, the "slightly" here seems to depend on the number of CPUs.

> --- a/xen/common/spinlock.c
> +++ b/xen/common/spinlock.c
> @@ -492,6 +492,38 @@ int _rw_is_write_locked(rwlock_t *lock)
>      return (lock->lock == RW_WRITE_FLAG); /* writer in critical section? */
>  }
>  
> +void percpu_write_lock(rwlock_t **per_cpudata, bool_t *writer_activating,
> +		rwlock_t *rwlock)
> +{
> +    int cpu;

unsigned int

> +    cpumask_t active_readers;

Did you consider ways to avoid putting such a large variable on
the stack, and did none of them turn out suitable?
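
One way out might be a static per-CPU scratch mask (a sketch only; the
mask is live just for the duration of the polling loop, so this assumes
the lock is never taken from interrupt context):

    /* Hypothetical scratch space, avoiding a cpumask_t on the stack.
     * Usable because a CPU can't be inside percpu_write_lock() twice. */
    static DEFINE_PER_CPU(cpumask_t, percpu_rwlock_readers);

    cpumask_t *active_readers = &this_cpu(percpu_rwlock_readers);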

> +    cpumask_copy(&active_readers, &cpu_online_map);

While we have other cases not fully hotplug safe, I don't think we
can allow this in basic locking infrastructure code. You need to
protect against cpu_online_map changing.

> +    /* First take the write lock to protect against other writers */

Most if not all of your comments are missing a full stop.

> +    write_lock(rwlock);
> +
> +    /* Now set the global variable so that readers start using read_lock */
> +    *writer_activating = true;
> +    smp_mb();
> +
> +    /* Check if there are any percpu readers in progress on our rwlock*/
> +    do
> +    {
> +        for_each_cpu(cpu, &active_readers)
> +        {
> +            /* Remove any percpu readers not contending 
> +             * from our check mask */

Comment style.

> +            if ( per_cpu_ptr(per_cpudata, cpu) != rwlock )
> +                cpumask_clear_cpu(cpu, &active_readers);

This is a LOCKed op, and with the subject of the patch being to lower
the load on the fabric, I wonder whether the time isn't ripe for introducing
a non-LOCKed variant of set, clear, and their test_and_ variants.
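
Along the lines of (purely illustrative, name and placement not thought
through; this assumes a non-atomic __clear_bit like the x86 one):

    /* Non-LOCKed variant: no lock prefix needed here, as active_readers
     * lives on this CPU's stack and is never accessed remotely. */
    #define __cpumask_clear_cpu(cpu, dstp) \
        __clear_bit(cpu, (dstp)->bits)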

> +        }
> +        /* Check if we've cleared all percpu readers */
> +        if ( cpumask_empty(&active_readers) )
> +            break;
> +        /* Give the coherency fabric a break */
> +        cpu_relax();
> +    } while ( 1 );

While certainly a matter of taste, could I talk you into using
for ( ; ; ) instead?

> --- a/xen/include/xen/spinlock.h
> +++ b/xen/include/xen/spinlock.h
> @@ -3,6 +3,7 @@
>  
>  #include <asm/system.h>
>  #include <asm/spinlock.h>
> +#include <xen/stdbool.h>

No inclusion of this header in code other than such shared with the
tools please.

> @@ -261,4 +262,40 @@ int _rw_is_write_locked(rwlock_t *lock);
>  #define rw_is_locked(l)               _rw_is_locked(l)
>  #define rw_is_write_locked(l)         _rw_is_write_locked(l)
>  
> +static inline void percpu_read_lock(rwlock_t **per_cpudata, bool_t *writer_activating,
> +		rwlock_t *rwlock)
> +{
> +    /* Indicate this cpu is reading */
> +    this_cpu_ptr(per_cpudata) = rwlock;
> +    smp_mb();
> +    /* Check if a writer is waiting */
> +    if ( unlikely(*writer_activating) )
> +    {
> +        /* Let the waiting writer know we aren't holding the lock */
> +        this_cpu_ptr(per_cpudata) = NULL;
> +        /* Wait using the read lock to keep the lock fair */
> +        read_lock(rwlock);
> +        /* Set the per CPU data again and continue*/
> +        this_cpu_ptr(per_cpudata) = rwlock;
> +        /* Drop the read lock because we don't need it anymore */
> +        read_unlock(rwlock);
> +    }
> +}

So am I getting it right that the whole point of the introduction of
this lock flavor is to replace the cmpxchg() in read_lock() with just
the write+read above? If so I'm sure you considered alternatives,
namely of modifying the plain rw locks - would you mind sharing
the outcome thereof? I admit I can't see alternatives, but the price
- as said above - seems quite high. Or maybe I'm misunderstanding
the "as long as multiple per-cpu read locks are not simultaneously
held by one particular cpu", as having made it here I can't right
away see where that restriction comes from?

> +static inline void percpu_read_unlock(rwlock_t **per_cpudata)
> +{
> +    this_cpu_ptr(per_cpudata) = NULL;
> +    smp_wmb();
> +}
> +
> +/* Don't inline percpu write lock as it's a complex function */
> +void percpu_write_lock(rwlock_t **per_cpudata, bool_t *writer_activating,
> +		rwlock_t *rwlock);
> +
> +static inline void percpu_write_unlock(bool_t *writer_activating, rwlock_t *rwlock)
> +{
> +    *writer_activating = false;
> +    write_unlock(rwlock);
> +}

Considering that the arguments one needs to pass to any of the
functions here taking multiple arguments need to be in close sync,
I'm missing abstracting macros here that need to be passed just
a single argument (plus a declaration and a definition one).
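
E.g. something along these lines (purely illustrative, naming certainly
subject to discussion):

    #define DEFINE_PERCPU_RWLOCK_DATA(name)   \
        bool_t name##_barrier;                \
        DEFINE_PER_CPU(rwlock_t *, name##_owner)

    #define DECLARE_PERCPU_RWLOCK_DATA(name)  \
        extern bool_t name##_barrier;         \
        DECLARE_PER_CPU(rwlock_t *, name##_owner)

    #define percpu_read_lock_of(name, l)  \
        percpu_read_lock(&get_per_cpu_var(name##_owner), &name##_barrier, l)
    #define percpu_write_lock_of(name, l) \
        percpu_write_lock(&get_per_cpu_var(name##_owner), &name##_barrier, l)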

Jan

* Re: [PATCH 2/2] grant_table: convert grant table rwlock to percpu rwlock
  2015-11-03 17:58 ` [PATCH 2/2] grant_table: convert grant table rwlock to percpu rwlock Malcolm Crossley
@ 2015-11-17 17:04   ` Jan Beulich
  2015-11-17 17:30     ` Andrew Cooper
  0 siblings, 1 reply; 31+ messages in thread
From: Jan Beulich @ 2015-11-17 17:04 UTC (permalink / raw)
  To: Malcolm Crossley
  Cc: andrew.cooper3, keir, stefano.stabellini, ian.campbell, xen-devel

>>> On 03.11.15 at 18:58, <malcolm.crossley@citrix.com> wrote:
> --- a/xen/common/grant_table.c
> +++ b/xen/common/grant_table.c
> @@ -178,6 +178,10 @@ struct active_grant_entry {
>  #define _active_entry(t, e) \
>      ((t)->active[(e)/ACGNT_PER_PAGE][(e)%ACGNT_PER_PAGE])
>  
> +bool_t grant_rwlock_barrier;
> +
> +DEFINE_PER_CPU(rwlock_t *, grant_rwlock);

Shouldn't these be per grant table? And wouldn't doing so eliminate
the main limitation of the per-CPU rwlocks?

Jan

* Re: [PATCH 2/2] grant_table: convert grant table rwlock to percpu rwlock
  2015-11-17 17:04   ` Jan Beulich
@ 2015-11-17 17:30     ` Andrew Cooper
  2015-11-17 17:39       ` Jan Beulich
  2015-11-18 20:02       ` Konrad Rzeszutek Wilk
  0 siblings, 2 replies; 31+ messages in thread
From: Andrew Cooper @ 2015-11-17 17:30 UTC (permalink / raw)
  To: Jan Beulich, Malcolm Crossley
  Cc: xen-devel, keir, stefano.stabellini, ian.campbell

On 17/11/15 17:04, Jan Beulich wrote:
>>>> On 03.11.15 at 18:58, <malcolm.crossley@citrix.com> wrote:
>> --- a/xen/common/grant_table.c
>> +++ b/xen/common/grant_table.c
>> @@ -178,6 +178,10 @@ struct active_grant_entry {
>>  #define _active_entry(t, e) \
>>      ((t)->active[(e)/ACGNT_PER_PAGE][(e)%ACGNT_PER_PAGE])
>>  
>> +bool_t grant_rwlock_barrier;
>> +
>> +DEFINE_PER_CPU(rwlock_t *, grant_rwlock);
> Shouldn't these be per grant table? And wouldn't doing so eliminate
> the main limitation of the per-CPU rwlocks?

The grant rwlock is per grant table.

The entire point of this series is to reduce the cmpxchg storm which
happens when many pcpus attempt to grab the same domain's grant read lock.

As identified in the commit message, reducing the cmpxchg pressure on
the cache coherency fabric increases intra-vm network throughput from
10Gbps to 50Gbps when running iperf between two 16-vcpu guests.

Or in other words, 80% of cpu time is wasted waiting on an atomic
read/modify/write operation against a remote hot cache line.

~Andrew

* Re: [PATCH 2/2] grant_table: convert grant table rwlock to percpu rwlock
  2015-11-17 17:30     ` Andrew Cooper
@ 2015-11-17 17:39       ` Jan Beulich
  2015-11-17 17:53         ` Andrew Cooper
  2015-11-18 20:02       ` Konrad Rzeszutek Wilk
  1 sibling, 1 reply; 31+ messages in thread
From: Jan Beulich @ 2015-11-17 17:39 UTC (permalink / raw)
  To: Andrew Cooper
  Cc: xen-devel, Malcolm Crossley, keir, stefano.stabellini,
	ian.campbell

>>> On 17.11.15 at 18:30, <andrew.cooper3@citrix.com> wrote:
> On 17/11/15 17:04, Jan Beulich wrote:
>>>>> On 03.11.15 at 18:58, <malcolm.crossley@citrix.com> wrote:
>>> --- a/xen/common/grant_table.c
>>> +++ b/xen/common/grant_table.c
>>> @@ -178,6 +178,10 @@ struct active_grant_entry {
>>>  #define _active_entry(t, e) \
>>>      ((t)->active[(e)/ACGNT_PER_PAGE][(e)%ACGNT_PER_PAGE])
>>>  
>>> +bool_t grant_rwlock_barrier;
>>> +
>>> +DEFINE_PER_CPU(rwlock_t *, grant_rwlock);
>> Shouldn't these be per grant table? And wouldn't doing so eliminate
>> the main limitation of the per-CPU rwlocks?
> 
> The grant rwlock is per grant table.

That's understood, but I don't see why the above items aren't, too.

> The entire point of this series is to reduce the cmpxchg storm which
> happens when many pcpus attempt to grab the same domain's grant read lock.
> 
> As identified in the commit message, reducing the cmpxchg pressure on
> the cache coherency fabric increases intra-vm network throughput from
> 10Gbps to 50Gbps when running iperf between two 16-vcpu guests.
> 
> Or in other words, 80% of cpu time is wasted waiting on an atomic
> read/modify/write operation against a remote hot cache line.

All of this is pretty nice, but again unrelated to the question I
raised.

The whole interface would likely become quite a bit easier to use
if there was a percpu_rwlock_t comprising all three elements (the
per-CPU item obviously would need to become a per-CPU pointer,
with allocation of per-CPU data needing introduction).

Jan

* Re: [PATCH 2/2] grant_table: convert grant table rwlock to percpu rwlock
  2015-11-17 17:39       ` Jan Beulich
@ 2015-11-17 17:53         ` Andrew Cooper
  2015-11-18  7:45           ` Jan Beulich
  2015-11-18 10:36           ` Ian Campbell
  0 siblings, 2 replies; 31+ messages in thread
From: Andrew Cooper @ 2015-11-17 17:53 UTC (permalink / raw)
  To: Jan Beulich
  Cc: xen-devel, Malcolm Crossley, keir, stefano.stabellini,
	ian.campbell

On 17/11/15 17:39, Jan Beulich wrote:
>>>> On 17.11.15 at 18:30, <andrew.cooper3@citrix.com> wrote:
>> On 17/11/15 17:04, Jan Beulich wrote:
>>>>>> On 03.11.15 at 18:58, <malcolm.crossley@citrix.com> wrote:
>>>> --- a/xen/common/grant_table.c
>>>> +++ b/xen/common/grant_table.c
>>>> @@ -178,6 +178,10 @@ struct active_grant_entry {
>>>>  #define _active_entry(t, e) \
>>>>      ((t)->active[(e)/ACGNT_PER_PAGE][(e)%ACGNT_PER_PAGE])
>>>>  
>>>> +bool_t grant_rwlock_barrier;
>>>> +
>>>> +DEFINE_PER_CPU(rwlock_t *, grant_rwlock);
>>> Shouldn't these be per grant table? And wouldn't doing so eliminate
>>> the main limitation of the per-CPU rwlocks?
>> The grant rwlock is per grant table.
> That's understood, but I don't see why the above items aren't, too.

Ah - because there is never any circumstance where two grant tables are
locked on the same pcpu.

Nor is there any such need.

>
>> The entire point of this series is to reduce the cmpxchg storm which
>> happens when many pcpus attempt to grab the same domain's grant read lock.
>>
>> As identified in the commit message, reducing the cmpxchg pressure on
>> the cache coherency fabric increases intra-vm network throughput from
>> 10Gbps to 50Gbps when running iperf between two 16-vcpu guests.
>>
>> Or in other words, 80% of cpu time is wasted waiting on an atomic
>> read/modify/write operation against a remote hot cache line.
> All of this is pretty nice, but again unrelated to the question I
> raised.
>
> The whole interface would likely become quite a bit easier to use
> if there was a percpu_rwlock_t comprising all three elements (the
> per-CPU item obviously would need to become a per-CPU pointer,
> with allocation of per-CPU data needing introduction).

Runtime per-CPU data allocation is incompatible with our current scheme
(which relies on the linker to do some of the heavy lifting).

~Andrew

* Re: [PATCH 2/2] grant_table: convert grant table rwlock to percpu rwlock
  2015-11-17 17:53         ` Andrew Cooper
@ 2015-11-18  7:45           ` Jan Beulich
  2015-11-18 10:06             ` Andrew Cooper
  2015-11-18 10:36           ` Ian Campbell
  1 sibling, 1 reply; 31+ messages in thread
From: Jan Beulich @ 2015-11-18  7:45 UTC (permalink / raw)
  To: Andrew Cooper
  Cc: xen-devel, Malcolm Crossley, keir, stefano.stabellini,
	ian.campbell

>>> On 17.11.15 at 18:53, <andrew.cooper3@citrix.com> wrote:
> On 17/11/15 17:39, Jan Beulich wrote:
>>>>> On 17.11.15 at 18:30, <andrew.cooper3@citrix.com> wrote:
>>> On 17/11/15 17:04, Jan Beulich wrote:
>>>>>>> On 03.11.15 at 18:58, <malcolm.crossley@citrix.com> wrote:
>>>>> --- a/xen/common/grant_table.c
>>>>> +++ b/xen/common/grant_table.c
>>>>> @@ -178,6 +178,10 @@ struct active_grant_entry {
>>>>>  #define _active_entry(t, e) \
>>>>>      ((t)->active[(e)/ACGNT_PER_PAGE][(e)%ACGNT_PER_PAGE])
>>>>>  
>>>>> +bool_t grant_rwlock_barrier;
>>>>> +
>>>>> +DEFINE_PER_CPU(rwlock_t *, grant_rwlock);
>>>> Shouldn't these be per grant table? And wouldn't doing so eliminate
>>>> the main limitation of the per-CPU rwlocks?
>>> The grant rwlock is per grant table.
>> That's understood, but I don't see why the above items aren't, too.
> 
> Ah - because there is never any circumstance where two grant tables are
> locked on the same pcpu.

double_gt_lock()? Those are write locks, and iiuc the limitation is just
on read locks, but anyway.

> Nor is there any such need.

I'm sorry, but no. If the lock flavor is to be tailored to gnttab use,
then it shouldn't be added outside of grant_table.c.

>>> The entire point of this series is to reduce the cmpxchg storm which
>>> happens when many pcpus attempt to grab the same domain's grant read lock.
>>>
>>> As identified in the commit message, reducing the cmpxchg pressure on
>>> the cache coherency fabric increases intra-vm network throughput from
>>> 10Gbps to 50Gbps when running iperf between two 16-vcpu guests.
>>>
>>> Or in other words, 80% of cpu time is wasted waiting on an atomic
>>> read/modify/write operation against a remote hot cache line.
>> All of this is pretty nice, but again unrelated to the question I
>> raised.
>>
>> The whole interface would likely become quite a bit easier to use
>> if there was a percpu_rwlock_t comprising all three elements (the
>> per-CPU item obviously would need to become a per-CPU pointer,
>> with allocation of per-CPU data needing introduction).
> 
> Runtime per-CPU data allocation is incompatible with our current scheme
> (which relies on the linker to do some of the heavy lifting).

Well, there are ways to deal with that. One would be for components
to declare how much they might need to use in the worst case (along
the lines of x86 Linux'es .brk mechanism). Another (a variant thereof,
much easier to implement right away) would be for grant_table.c to
simply create a per-CPU array with DOMID_FIRST_RESERVED entries,
using the one corresponding to the domain in question. And of course
a scheme not as advanced as current Linux'es might do too - after all
it did for Linux for many years (before the current one got invented).
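
E.g. (again only a sketch):

    /* In grant_table.c: one reader slot per possible domid, statically
     * sized, so no runtime per-CPU allocation is needed. */
    DEFINE_PER_CPU(rwlock_t *, grant_rwlock[DOMID_FIRST_RESERVED]);

    /* Domain d's readers would then use
     * &get_per_cpu_var(grant_rwlock)[d->domain_id] as their slot. */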

Jan

* Re: [PATCH 2/2] grant_table: convert grant table rwlock to percpu rwlock
  2015-11-18  7:45           ` Jan Beulich
@ 2015-11-18 10:06             ` Andrew Cooper
  2015-11-18 10:48               ` Jan Beulich
  0 siblings, 1 reply; 31+ messages in thread
From: Andrew Cooper @ 2015-11-18 10:06 UTC (permalink / raw)
  To: Jan Beulich
  Cc: xen-devel, Malcolm Crossley, keir, stefano.stabellini,
	ian.campbell

On 18/11/15 07:45, Jan Beulich wrote:
>>>> On 17.11.15 at 18:53, <andrew.cooper3@citrix.com> wrote:
>> On 17/11/15 17:39, Jan Beulich wrote:
>>>>>> On 17.11.15 at 18:30, <andrew.cooper3@citrix.com> wrote:
>>>> On 17/11/15 17:04, Jan Beulich wrote:
>>>>>>>> On 03.11.15 at 18:58, <malcolm.crossley@citrix.com> wrote:
>>>>>> --- a/xen/common/grant_table.c
>>>>>> +++ b/xen/common/grant_table.c
>>>>>> @@ -178,6 +178,10 @@ struct active_grant_entry {
>>>>>>  #define _active_entry(t, e) \
>>>>>>      ((t)->active[(e)/ACGNT_PER_PAGE][(e)%ACGNT_PER_PAGE])
>>>>>>  
>>>>>> +bool_t grant_rwlock_barrier;
>>>>>> +
>>>>>> +DEFINE_PER_CPU(rwlock_t *, grant_rwlock);
>>>>> Shouldn't these be per grant table? And wouldn't doing so eliminate
>>>>> the main limitation of the per-CPU rwlocks?
>>>> The grant rwlock is per grant table.
>>> That's understood, but I don't see why the above items aren't, too.
>> Ah - because there is never any circumstance where two grant tables are
>> locked on the same pcpu.
> double_gt_lock()? Those are write locks, and iiuc the limitation is just
> on read locks, but anway.

Those are grant entry locks, not the grant table lock.  The grant table
write lock is only taken to switch grant table version.

>
>> Nor is there any such need.
> I'm sorry, but no. If the lock flavor is to be tailored to gnttab use,
> then it shouldn't be added outside of grant_table.c.

This mechanism is not specific to the grant table, nor is the grant
table lock the only one intended to be converted in this way.  Doing
this to the p2m lock also gives decent improvements, although it is
rather more fiddly to get right.

>
>>>> The entire point of this series is to reduce the cmpxchg storm which
>>>> happens when many pcpus attempt to grab the same domain's grant read lock.
>>>>
>>>> As identified in the commit message, reducing the cmpxchg pressure on
>>>> the cache coherency fabric increases intra-vm network throughput from
>>>> 10Gbps to 50Gbps when running iperf between two 16-vcpu guests.
>>>>
>>>> Or in other words, 80% of cpu time is wasted waiting on an atomic
>>>> read/modify/write operation against a remote hot cache line.
>>> All of this is pretty nice, but again unrelated to the question I
>>> raised.
>>>
>>> The whole interface would likely become quite a bit easier to use
>>> if there was a percpu_rwlock_t comprising all three elements (the
>>> per-CPU item obviously would need to become a per-CPU pointer,
>>> with allocation of per-CPU data needing introduction).
>> Runtime per-CPU data allocation is incompatible with our current scheme
>> (which relies on the linker to do some of the heavy lifting).
> Well, there are ways to deal with that. One would be for components
> to declare how much they might need to use in the worst case (along
> the lines of x86 Linux'es .brk mechanism). Another (a variant thereof,
> much easier to implement right away) would be for grant_table.c to
> simply create a per-CPU array with DOMID_FIRST_RESERVED entries,
> using the one corresponding to the domain in question. And of course
> a scheme not as advanced as current Linux'es might do too - after all
> it did for Linux for many years (before the current one got invented).

Or better yet, not use dynamic allocation.

Coming up with a new per-cpu scheme is all fine and well if someone
wants to do so, but it is a substantial task and not necessary here.

~Andrew

* Re: [PATCH 2/2] grant_table: convert grant table rwlock to percpu rwlock
  2015-11-17 17:53         ` Andrew Cooper
  2015-11-18  7:45           ` Jan Beulich
@ 2015-11-18 10:36           ` Ian Campbell
  2015-11-18 10:54             ` Jan Beulich
  1 sibling, 1 reply; 31+ messages in thread
From: Ian Campbell @ 2015-11-18 10:36 UTC (permalink / raw)
  To: Andrew Cooper, Jan Beulich
  Cc: xen-devel, Malcolm Crossley, keir, stefano.stabellini

On Tue, 2015-11-17 at 17:53 +0000, Andrew Cooper wrote:
> On 17/11/15 17:39, Jan Beulich wrote:
> > > > > On 17.11.15 at 18:30, <andrew.cooper3@citrix.com> wrote:
> > > On 17/11/15 17:04, Jan Beulich wrote:
> > > > > > > On 03.11.15 at 18:58, <malcolm.crossley@citrix.com> wrote:
> > > > > --- a/xen/common/grant_table.c
> > > > > +++ b/xen/common/grant_table.c
> > > > > @@ -178,6 +178,10 @@ struct active_grant_entry {
> > > > >  #define _active_entry(t, e) \
> > > > >      ((t)->active[(e)/ACGNT_PER_PAGE][(e)%ACGNT_PER_PAGE])
> > > > >  
> > > > > +bool_t grant_rwlock_barrier;
> > > > > +
> > > > > +DEFINE_PER_CPU(rwlock_t *, grant_rwlock);
> > > > Shouldn't these be per grant table? And wouldn't doing so eliminate
> > > > the main limitation of the per-CPU rwlocks?
> > > The grant rwlock is per grant table.
> > That's understood, but I don't see why the above items aren't, too.
> 
> Ah - because there is never any circumstance where two grant tables are
> locked on the same pcpu.

So per-cpu rwlocks are really a per-pcpu read lock with a fallthrough to a
per-$resource (here == granttable) rwlock when any writers are present for
any instance $resource, not just the one where the write lock is desired,
for the duration of any write lock?

It took me a little while to realize this[0] and I think it certainly bears
some discussion in code comments around the restrictions and requirements
of using such a lock for a resource. e.g. the very tempting looking per-
$resource rwlock now cannot be used directly.

With that in mind it would be good if as much of this could be encapsulated
as possible.

If this can't easily be done as part of the percpu_rwlock infra[1] then can
we at least have per-$resource helper functions which eliminate the need
to open code "(&get_per_cpu_var(grant_rwlock), &grant_rwlock_barrier,
&lgt->lock);" everywhere which wants to take a lock.

Or at the very least can we rename d->grant_table->lock to something
uninviting ("rw_inner_lock", "fallback_lock") and, more importantly, stick a
whacking great "do not use under any circumstances, use $x instead" next to
it in the struct declaration.

Does this locking scheme have real-world prior-art in other projects? Looks
like Linux at one point might have been going to gain something called per-
cpu rwlocks but a) I'm not sure they were the same and b) they didn't seem
to go anywhere for some reason.

Ian.

[0] FWIW my initial expectation based on the naming was either a per-cpu-
per-resource lock or a global independent rwlocks for each cpu.

[1] e.g. a data structure encapsulating the per-cpu variable, the barrier and
the offset of the underlying lock within a given struct, initialised with:

    bool_t grant_rwlock_barrier;
    DEFINE_PER_CPU(rwlock_t *, grant_percpu_rwlock);
    DEFINE_PER_CPU_RWLOCK(grant_rwlock, grant_percpu_rwlock, grant_rwlock_barrier, offsetof(struct ..., rwlock);

Then lockers just use per_cpu_read_lock(&grant_rwlock) etc.

In fact the first two lines could be part of this macro if people were
willing to tolerate a bit of cpp macro pasting.

Main downside is lack of type safety due to the offsetof. Might be
checkable in the macro though.

Ian.

* Re: [PATCH 2/2] grant_table: convert grant table rwlock to percpu rwlock
  2015-11-18 10:06             ` Andrew Cooper
@ 2015-11-18 10:48               ` Jan Beulich
  0 siblings, 0 replies; 31+ messages in thread
From: Jan Beulich @ 2015-11-18 10:48 UTC (permalink / raw)
  To: Andrew Cooper
  Cc: xen-devel, Malcolm Crossley, keir, stefano.stabellini,
	ian.campbell

>>> On 18.11.15 at 11:06, <andrew.cooper3@citrix.com> wrote:
> On 18/11/15 07:45, Jan Beulich wrote:
>>>>> On 17.11.15 at 18:53, <andrew.cooper3@citrix.com> wrote:
>>> On 17/11/15 17:39, Jan Beulich wrote:
>>>>>>> On 17.11.15 at 18:30, <andrew.cooper3@citrix.com> wrote:
>>>>> On 17/11/15 17:04, Jan Beulich wrote:
>>>>>>>>> On 03.11.15 at 18:58, <malcolm.crossley@citrix.com> wrote:
>>>>>>> --- a/xen/common/grant_table.c
>>>>>>> +++ b/xen/common/grant_table.c
>>>>>>> @@ -178,6 +178,10 @@ struct active_grant_entry {
>>>>>>>  #define _active_entry(t, e) \
>>>>>>>      ((t)->active[(e)/ACGNT_PER_PAGE][(e)%ACGNT_PER_PAGE])
>>>>>>>  
>>>>>>> +bool_t grant_rwlock_barrier;
>>>>>>> +
>>>>>>> +DEFINE_PER_CPU(rwlock_t *, grant_rwlock);
>>>>>> Shouldn't these be per grant table? And wouldn't doing so eliminate
>>>>>> the main limitation of the per-CPU rwlocks?
>>>>> The grant rwlock is per grant table.
>>>> That's understood, but I don't see why the above items aren't, too.
>>> Ah - because there is never any circumstance where two grant tables are
>>> locked on the same pcpu.
>> double_gt_lock()? Those are write locks, and iiuc the limitation is just
>> on read locks, but anway.
> 
> Those are grant entry locks, not the grant table lock.  The grant table
> write lock is only taken to switch grant table version.

static inline void
double_gt_lock(struct grant_table *lgt, struct grant_table *rgt)
{
    /*
     * See mapkind() for why the write lock is also required for the
     * remote domain.
     */
    if ( lgt < rgt )
    {
        write_lock(&lgt->lock);
        write_lock(&rgt->lock);
    }
    else
    {
        if ( lgt != rgt )
            write_lock(&rgt->lock);
        write_lock(&lgt->lock);
    }
}

looks very much like taking two grant table locks to me.

>>> Nor is there any such need.
>> I'm sorry, but no. If the lock flavor is to be tailored to gnttab use,
>> then it shouldn't be added outside of grant_table.c.
> 
> This mechanism is not specific to the grant table, nor are the grant
> tables the only lock intended to be converted in this way.  Doing this
> to the p2m lock also gives decent improvements as well, although is
> rather more fiddly to get right.

And there are no nested uses of p2m locks? I recall a PVH issue
needing special tweaking because nested locking could occur for
both a PVH Dom0's lock and a DomU's controlled by it.

As long as there's not going to be percpu_rwlock_t, with all pieces
grouped together, this construct seems too special purpose to me.

>>>>> The entire point of this series is to reduce the cmpxchg storm which
>>>>> happens when many pcpus attempt to grab the same domain's grant read lock.
>>>>>
>>>>> As identified in the commit message, reducing the cmpxchg pressure on
>>>>> the cache coherency fabric increases intra-vm network throughput from
>>>>> 10Gbps to 50Gbps when running iperf between two 16-vcpu guests.
>>>>>
>>>>> Or in other words, 80% of cpu time is wasted waiting on an atomic
>>>>> read/modify/write operation against a remote hot cache line.
>>>> All of this is pretty nice, but again unrelated to the question I
>>>> raised.
>>>>
>>>> The whole interface would likely become quite a bit easier to use
>>>> if there was a percpu_rwlock_t comprising all three elements (the
>>>> per-CPU item obviously would need to become a per-CPU pointer,
>>>> with allocation of per-CPU data needing introduction).
>>> Runtime per-CPU data allocation is incompatible with our current scheme
>>> (which relies on the linker to do some of the heavy lifting).
>> Well, there are ways to deal with that. One would be for components
>> to declare how much they might need to use in the worst case (along
>> the lines of x86 Linux'es .brk mechanism). Another (a variant thereof,
>> much easier to implement right away) would be for grant_table.c to
>> simply create a per-CPU array with DOMID_FIRST_RESERVED entries,
>> using the one corresponding to the domain in question. And of course
>> a scheme not as advanced as current Linux'es might do too - after all
>> it did for Linux for many years (before the current one got invented).
> 
> Or better yet, not use dynamic allocation.

Which is what the middle of the three suggestions represents...

> Coming up with a new per-cpu scheme is all fine and well if someone
> wants to do so, but it is a substantial task and not necessary here.

... to keep things simple for now.

Jan

* Re: [PATCH 2/2] grant_table: convert grant table rwlock to percpu rwlock
  2015-11-18 10:36           ` Ian Campbell
@ 2015-11-18 10:54             ` Jan Beulich
  2015-11-18 11:23               ` Malcolm Crossley
  0 siblings, 1 reply; 31+ messages in thread
From: Jan Beulich @ 2015-11-18 10:54 UTC (permalink / raw)
  To: Ian Campbell
  Cc: Andrew Cooper, Malcolm Crossley, keir, stefano.stabellini,
	xen-devel

>>> On 18.11.15 at 11:36, <ian.campbell@citrix.com> wrote:
> On Tue, 2015-11-17 at 17:53 +0000, Andrew Cooper wrote:
>> On 17/11/15 17:39, Jan Beulich wrote:
>> > > > > On 17.11.15 at 18:30, <andrew.cooper3@citrix.com> wrote:
>> > > On 17/11/15 17:04, Jan Beulich wrote:
>> > > > > > > On 03.11.15 at 18:58, <malcolm.crossley@citrix.com> wrote:
>> > > > > --- a/xen/common/grant_table.c
>> > > > > +++ b/xen/common/grant_table.c
>> > > > > @@ -178,6 +178,10 @@ struct active_grant_entry {
>> > > > >  #define _active_entry(t, e) \
>> > > > >      ((t)->active[(e)/ACGNT_PER_PAGE][(e)%ACGNT_PER_PAGE])
>> > > > >  
>> > > > > +bool_t grant_rwlock_barrier;
>> > > > > +
>> > > > > +DEFINE_PER_CPU(rwlock_t *, grant_rwlock);
>> > > > Shouldn't these be per grant table? And wouldn't doing so eliminate
>> > > > the main limitation of the per-CPU rwlocks?
>> > > The grant rwlock is per grant table.
>> > That's understood, but I don't see why the above items aren't, too.
>> 
>> Ah - because there is never any circumstance where two grant tables are
>> locked on the same pcpu.
> 
> So per-cpu rwlocks are really a per-pcpu read lock with a fallthrough to a
> per-$resource (here == granttable) rwlock when any writers are present for
> any instance $resource, not just the one where the write lock is desired,
> for the duration of any write lock?

That's not how I understood it, the rwlock isn't per-pCPU (at least not
in what this patch does - it remains a per-domain one). The per-pCPU
object is a pointer to an rwlock, which gets made point to whatever
domain's rwlock the pCPU wants to own.

Jan

* Re: [PATCH 2/2] grant_table: convert grant table rwlock to percpu rwlock
  2015-11-18 10:54             ` Jan Beulich
@ 2015-11-18 11:23               ` Malcolm Crossley
  2015-11-18 11:41                 ` Jan Beulich
  2015-11-18 11:50                 ` Ian Campbell
  0 siblings, 2 replies; 31+ messages in thread
From: Malcolm Crossley @ 2015-11-18 11:23 UTC (permalink / raw)
  To: Jan Beulich, Ian Campbell
  Cc: Andrew Cooper, keir, stefano.stabellini, xen-devel

On 18/11/15 10:54, Jan Beulich wrote:
>>>> On 18.11.15 at 11:36, <ian.campbell@citrix.com> wrote:
>> On Tue, 2015-11-17 at 17:53 +0000, Andrew Cooper wrote:
>>> On 17/11/15 17:39, Jan Beulich wrote:
>>>>>>> On 17.11.15 at 18:30, <andrew.cooper3@citrix.com> wrote:
>>>>> On 17/11/15 17:04, Jan Beulich wrote:
>>>>>>>>> On 03.11.15 at 18:58, <malcolm.crossley@citrix.com> wrote:
>>>>>>> --- a/xen/common/grant_table.c
>>>>>>> +++ b/xen/common/grant_table.c
>>>>>>> @@ -178,6 +178,10 @@ struct active_grant_entry {
>>>>>>>  #define _active_entry(t, e) \
>>>>>>>      ((t)->active[(e)/ACGNT_PER_PAGE][(e)%ACGNT_PER_PAGE])
>>>>>>>  
>>>>>>> +bool_t grant_rwlock_barrier;
>>>>>>> +
>>>>>>> +DEFINE_PER_CPU(rwlock_t *, grant_rwlock);
>>>>>> Shouldn't these be per grant table? And wouldn't doing so eliminate
>>>>>> the main limitation of the per-CPU rwlocks?
>>>>> The grant rwlock is per grant table.
>>>> That's understood, but I don't see why the above items aren't, too.
>>>
>>> Ah - because there is never any circumstance where two grant tables are
>>> locked on the same pcpu.
>>
>> So per-cpu rwlocks are really a per-pcpu read lock with a fallthrough to a
>> per-$resource (here == granttable) rwlock when any writers are present for
>> any instance $resource, not just the one where the write lock is desired,
>> for the duration of any write lock?
> 

The above description is a very good one for how the per-cpu rwlocks behave.
The code stores a pointer to the per-$resource in the percpu area when a user is
reading the per-$resource; this is why the lock is not safe if you take the lock
for two different per-$resources simultaneously. The grant table code only takes
one grant table lock at any one time so it is a safe user.
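
To make the restriction concrete, consider a cpu taking the read lock of
two hypothetical resources a and b which share one percpu area (all names
made up, sketch only):

    percpu_read_lock(&get_per_cpu_var(owner), &a_barrier, &a->lock);
    /* this cpu's owner pointer == &a->lock */
    percpu_read_lock(&get_per_cpu_var(owner), &b_barrier, &b->lock);
    /* owner pointer now == &b->lock: a writer polling for readers of
     * a->lock no longer sees this cpu, and can enter its critical
     * section while we are still reading a. */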

I would posit that most code behaves in this manner in an attempt to avoid
deadlocks.

It may also be clearer to change the grant_table rwlock_t to a spinlock which
the writers use.

The interesting question is: how generic a pattern is the grant table usage of
only a single per-$resource at a time?

The p2m code has its own recursion detection code and so is safe from that
issue but does it take a read lock for two per-$resources simultaneously?


> That's not how I understood it, the rwlock isn't per-pCPU (at least not
> in what this patch does - it remains a per-domain one). The per-pCPU
> object is a pointer to an rwlock, which gets made point to whatever
> domain's rwlock the pCPU wants to own.
> 

This description is correct but it's important to note that the rwlock
is only used by the writers and could be effectively replaced with a spinlock.

Malcolm

> Jan
> 

* Re: [PATCH 2/2] grant_table: convert grant table rwlock to percpu rwlock
  2015-11-18 11:23               ` Malcolm Crossley
@ 2015-11-18 11:41                 ` Jan Beulich
  2015-11-18 11:50                   ` Malcolm Crossley
  2015-11-18 11:50                 ` Ian Campbell
  1 sibling, 1 reply; 31+ messages in thread
From: Jan Beulich @ 2015-11-18 11:41 UTC (permalink / raw)
  To: Malcolm Crossley
  Cc: Andrew Cooper, keir, stefano.stabellini, Ian Campbell, xen-devel

>>> On 18.11.15 at 12:23, <malcolm.crossley@citrix.com> wrote:
> On 18/11/15 10:54, Jan Beulich wrote:
>> That's not how I understood it, the rwlock isn't per-pCPU (at least not
>> in what this patch does - it remains a per-domain one). The per-pCPU
>> object is a pointer to an rwlock, which gets made point to whatever
>> domain's rwlock the pCPU wants to own.
> 
> This description is correct but it's important to note that the rwlock
> is only used by the writers and could be effectively replaced with a 
> spinlock.

While such replacement may indeed be possible (whether desirable
is another question), I'm pretty sure I saw readers taking the read
lock on the slow path.

Jan

* Re: [PATCH 2/2] grant_table: convert grant table rwlock to percpu rwlock
  2015-11-18 11:23               ` Malcolm Crossley
  2015-11-18 11:41                 ` Jan Beulich
@ 2015-11-18 11:50                 ` Ian Campbell
  2015-11-18 11:56                   ` Malcolm Crossley
  1 sibling, 1 reply; 31+ messages in thread
From: Ian Campbell @ 2015-11-18 11:50 UTC (permalink / raw)
  To: Malcolm Crossley, Jan Beulich
  Cc: Andrew Cooper, keir, stefano.stabellini, xen-devel

On Wed, 2015-11-18 at 11:23 +0000, Malcolm Crossley wrote:
> On 18/11/15 10:54, Jan Beulich wrote:
> > > > > On 18.11.15 at 11:36, <ian.campbell@citrix.com> wrote:
> > > On Tue, 2015-11-17 at 17:53 +0000, Andrew Cooper wrote:
> > > > On 17/11/15 17:39, Jan Beulich wrote:
> > > > > > > > On 17.11.15 at 18:30, <andrew.cooper3@citrix.com> wrote:
> > > > > > On 17/11/15 17:04, Jan Beulich wrote:
> > > > > > > > > > On 03.11.15 at 18:58, <malcolm.crossley@citrix.com>
> > > > > > > > > > wrote:
> > > > > > > > --- a/xen/common/grant_table.c
> > > > > > > > +++ b/xen/common/grant_table.c
> > > > > > > > @@ -178,6 +178,10 @@ struct active_grant_entry {
> > > > > > > >  #define _active_entry(t, e) \
> > > > > > > >      ((t)->active[(e)/ACGNT_PER_PAGE][(e)%ACGNT_PER_PAGE])
> > > > > > > >  
> > > > > > > > +bool_t grant_rwlock_barrier;
> > > > > > > > +
> > > > > > > > +DEFINE_PER_CPU(rwlock_t *, grant_rwlock);
> > > > > > > Shouldn't these be per grant table? And wouldn't doing so
> > > > > > > eliminate
> > > > > > > the main limitation of the per-CPU rwlocks?
> > > > > > The grant rwlock is per grant table.
> > > > > That's understood, but I don't see why the above items aren't,
> > > > > too.
> > > > 
> > > > Ah - because there is never any circumstance where two grant tables
> > > > are
> > > > locked on the same pcpu.
> > > 
> > > So per-cpu rwlocks are really a per-pcpu read lock with a fallthrough
> > > to a
> > > per-$resource (here == granttable) rwlock when any writers are
> > > present for
> > > any instance $resource, not just the one where the write lock is
> > > desired,
> > > for the duration of any write lock?
> > 
> 
>> The above description is a very good one for how the per-cpu rwlocks behave.
>> The code stores a pointer to the per-$resource in the percpu area when a user is
>> reading the per-$resource; this is why the lock is not safe if you take the lock
>> for two different per-$resources simultaneously. The grant table code only takes
>> one grant table lock at any one time so it is a safe user.

So essentially the "per-pcpu read lock" as I called it is really in essence
a sort of "byte lock" via the NULL vs non-NULL state of the per-cpu pointer
to the underlying rwlock.

> > That's not how I understood it, the rwlock isn't per-pCPU (at least not
> > in what this patch does - it remains a per-domain one). The per-pCPU
> > object is a pointer to an rwlock, which gets made point to whatever
> > domain's rwlock the pCPU wants to own.
> > 
> 
> This description is correct but it's important to note that the rwlock
> is only used by the writers and could be effectively replaced with a
> spinlock.

The rwlock is taken (briefly) by readers if *writer_activating is set,
isn't it?

Ian.

* Re: [PATCH 2/2] grant_table: convert grant table rwlock to percpu rwlock
  2015-11-18 11:41                 ` Jan Beulich
@ 2015-11-18 11:50                   ` Malcolm Crossley
  0 siblings, 0 replies; 31+ messages in thread
From: Malcolm Crossley @ 2015-11-18 11:50 UTC (permalink / raw)
  To: Jan Beulich
  Cc: Andrew Cooper, keir, stefano.stabellini, Ian Campbell, xen-devel

On 18/11/15 11:41, Jan Beulich wrote:
>>>> On 18.11.15 at 12:23, <malcolm.crossley@citrix.com> wrote:
>> On 18/11/15 10:54, Jan Beulich wrote:
>>> That's not how I understood it, the rwlock isn't per-pCPU (at least not
>>> in what this patch does - it remains a per-domain one). The per-pCPU
>>> object is a pointer to an rwlock, which gets made point to whatever
>>> domain's rwlock the pCPU wants to own.
>>
>> This description is correct but it's important to note that the rwlock
>> is only used by the writers and could be effectively replaced with a 
>> spinlock.
> 
> While such replacement may indeed be possible (whether desirable
> is another question), I'm pretty sure I saw readers taking the read
> lock on the slow path.
> 

You are right, the core code currently relies upon rw locks. Conceptually it
may not need to but that's a different question as you state. Sorry for that
mistake.

Maybe it would be clearer if we created a percpu_rwlock_t which is really a
rwlock_t. This would prevent accidental usage of the underlying rwlock_t by
new grant table code.
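
Something like (sketch only):

    /* Make the underlying rwlock_t inaccessible to plain read_lock()/
     * write_lock() callers; only the percpu_* helpers would reach
     * inside the wrapper. */
    typedef struct percpu_rwlock {
        rwlock_t rwlock;
    } percpu_rwlock_t;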

Malcolm

> Jan
> 

* Re: [PATCH 2/2] grant_table: convert grant table rwlock to percpu rwlock
  2015-11-18 11:50                 ` Ian Campbell
@ 2015-11-18 11:56                   ` Malcolm Crossley
  2015-11-18 12:07                     ` Ian Campbell
  0 siblings, 1 reply; 31+ messages in thread
From: Malcolm Crossley @ 2015-11-18 11:56 UTC (permalink / raw)
  To: Ian Campbell, Jan Beulich
  Cc: Andrew Cooper, keir, stefano.stabellini, xen-devel

On 18/11/15 11:50, Ian Campbell wrote:
> On Wed, 2015-11-18 at 11:23 +0000, Malcolm Crossley wrote:
>> On 18/11/15 10:54, Jan Beulich wrote:
>>>>>> On 18.11.15 at 11:36, <ian.campbell@citrix.com> wrote:
>>>> On Tue, 2015-11-17 at 17:53 +0000, Andrew Cooper wrote:
>>>>> On 17/11/15 17:39, Jan Beulich wrote:
>>>>>>>>> On 17.11.15 at 18:30, <andrew.cooper3@citrix.com> wrote:
>>>>>>> On 17/11/15 17:04, Jan Beulich wrote:
>>>>>>>>>>> On 03.11.15 at 18:58, <malcolm.crossley@citrix.com>
>>>>>>>>>>> wrote:
>>>>>>>>> --- a/xen/common/grant_table.c
>>>>>>>>> +++ b/xen/common/grant_table.c
>>>>>>>>> @@ -178,6 +178,10 @@ struct active_grant_entry {
>>>>>>>>>  #define _active_entry(t, e) \
>>>>>>>>>      ((t)->active[(e)/ACGNT_PER_PAGE][(e)%ACGNT_PER_PAGE])
>>>>>>>>>  
>>>>>>>>> +bool_t grant_rwlock_barrier;
>>>>>>>>> +
>>>>>>>>> +DEFINE_PER_CPU(rwlock_t *, grant_rwlock);
>>>>>>>> Shouldn't these be per grant table? And wouldn't doing so
>>>>>>>> eliminate
>>>>>>>> the main limitation of the per-CPU rwlocks?
>>>>>>> The grant rwlock is per grant table.
>>>>>> That's understood, but I don't see why the above items aren't,
>>>>>> too.
>>>>>
>>>>> Ah - because there is never any circumstance where two grant tables
>>>>> are
>>>>> locked on the same pcpu.
>>>>
>>>> So per-cpu rwlocks are really a per-pcpu read lock with a fallthrough
>>>> to a
>>>> per-$resource (here == granttable) rwlock when any writers are
>>>> present for
>>>> any instance $resource, not just the one where the write lock is
>>>> desired,
>>>> for the duration of any write lock?
>>>
>>
>> The above description is a very good one for how the per-cpu rwlocks behave.
>> The code stores a pointer to the per-$resource in the percpu area when a user is
>> reading the per-$resource; this is why the lock is not safe if you take the lock
>> for two different per-$resources simultaneously. The grant table code only takes
>> one grant table lock at any one time so it is a safe user.
> 
> So essentially the "per-pcpu read lock" as I called it is really in essence
> a sort of "byte lock" via the NULL vs non-NULL state of the per-cpu pointer
> to the underlying rwlock.

It's not quite a byte lock because it stores a full pointer to the per-$resource
that it's using. It could be changed to be a byte lock but then you will need a
percpu area per-$resource.

> 
>>> That's not how I understood it, the rwlock isn't per-pCPU (at least not
>>> in what this patch does - it remains a per-domain one). The per-pCPU
>>> object is a pointer to an rwlock, which gets made point to whatever
>>> domain's rwlock the pCPU wants to own.
>>>
>>
>> This description is correct but it's important to note that the rwlock
>> is only used by the writers and could be effectively replaced with a
>> spinlock.
> 
> The rwlock is taken (briefly) by readers if *writer_activating is set,
> isn't it?

Yes I got this wrong. Sorry about causing confusion.

Malcolm

> 
> Ian.
> 

* Re: [PATCH 2/2] grant_table: convert grant table rwlock to percpu rwlock
  2015-11-18 11:56                   ` Malcolm Crossley
@ 2015-11-18 12:07                     ` Ian Campbell
  2015-11-18 13:08                       ` Malcolm Crossley
  0 siblings, 1 reply; 31+ messages in thread
From: Ian Campbell @ 2015-11-18 12:07 UTC (permalink / raw)
  To: Malcolm Crossley, Jan Beulich
  Cc: Andrew Cooper, keir, stefano.stabellini, xen-devel

On Wed, 2015-11-18 at 11:56 +0000, Malcolm Crossley wrote:
> On 18/11/15 11:50, Ian Campbell wrote:
> > On Wed, 2015-11-18 at 11:23 +0000, Malcolm Crossley wrote:
> > > On 18/11/15 10:54, Jan Beulich wrote:
> > > > > > > On 18.11.15 at 11:36, <ian.campbell@citrix.com> wrote:
> > > > > On Tue, 2015-11-17 at 17:53 +0000, Andrew Cooper wrote:
> > > > > > On 17/11/15 17:39, Jan Beulich wrote:
> > > > > > > > > > On 17.11.15 at 18:30, <andrew.cooper3@citrix.com>
> > > > > > > > > > wrote:
> > > > > > > > On 17/11/15 17:04, Jan Beulich wrote:
> > > > > > > > > > > > On 03.11.15 at 18:58, <malcolm.crossley@citrix.com>
> > > > > > > > > > > > wrote:
> > > > > > > > > > --- a/xen/common/grant_table.c
> > > > > > > > > > +++ b/xen/common/grant_table.c
> > > > > > > > > > @@ -178,6 +178,10 @@ struct active_grant_entry {
> > > > > > > > > >  #define _active_entry(t, e) \
> > > > > > > > > >      ((t)-
> > > > > > > > > > >active[(e)/ACGNT_PER_PAGE][(e)%ACGNT_PER_PAGE])
> > > > > > > > > >  
> > > > > > > > > > +bool_t grant_rwlock_barrier;
> > > > > > > > > > +
> > > > > > > > > > +DEFINE_PER_CPU(rwlock_t *, grant_rwlock);
> > > > > > > > > Shouldn't these be per grant table? And wouldn't doing so
> > > > > > > > > eliminate
> > > > > > > > > the main limitation of the per-CPU rwlocks?
> > > > > > > > The grant rwlock is per grant table.
> > > > > > > That's understood, but I don't see why the above items
> > > > > > > aren't,
> > > > > > > too.
> > > > > > 
> > > > > > Ah - because there is never any circumstance where two grant
> > > > > > tables
> > > > > > are
> > > > > > locked on the same pcpu.
> > > > > 
> > > > > So per-cpu rwlocks are really a per-pcpu read lock with a
> > > > > fallthrough
> > > > > to a
> > > > > per-$resource (here == granttable) rwlock when any writers are
> > > > > present for
> > > > > any instance $resource, not just the one where the write lock is
> > > > > desired,
> > > > > for the duration of any write lock?
> > > > 
> > > 
>>>> The above description is a very good one for how the per-cpu
>>>> rwlocks behave.
>>>> The code stores a pointer to the per-$resource in the percpu area
>>>> when a user is
>>>> reading the per-$resource; this is why the lock is not safe if you
>>>> take the lock
>>>> for two different per-$resources simultaneously. The grant table code
>>>> only takes
>>>> one grant table lock at any one time so it is a safe user.
> > 
> > So essentially the "per-pcpu read lock" as I called it is really in
> > essence
> > a sort of "byte lock" via the NULL vs non-NULL state of the per-cpu
> > pointer
> > to the underlying rwlock.
> 
> It's not quite a byte lock because it stores a full pointer to the per-$resource
> that it's using. It could be changed to be a byte lock but then you will need a
> percpu area per-$resource.

Right, I said "in essence sort of" and put scare quotes around the "byte
lock" since I realise it's not literally a byte lock.

But really all I was getting at was that it has locked and unlocked states in
some form or other.

(Maybe I should have said "like a bit lock with 32 or 64 bits, setting any
of which corresponds to acquiring the lock" ;-))

Ian.

* Re: [PATCH 2/2] grant_table: convert grant table rwlock to percpu rwlock
  2015-11-18 12:07                     ` Ian Campbell
@ 2015-11-18 13:08                       ` Malcolm Crossley
  2015-11-18 13:47                         ` Jan Beulich
  2015-11-18 14:22                         ` Ian Campbell
  0 siblings, 2 replies; 31+ messages in thread
From: Malcolm Crossley @ 2015-11-18 13:08 UTC (permalink / raw)
  To: Ian Campbell, Jan Beulich
  Cc: Andrew Cooper, keir, stefano.stabellini, xen-devel

On 18/11/15 12:07, Ian Campbell wrote:
> On Wed, 2015-11-18 at 11:56 +0000, Malcolm Crossley wrote:
>> On 18/11/15 11:50, Ian Campbell wrote:
>>> On Wed, 2015-11-18 at 11:23 +0000, Malcolm Crossley wrote:
>>>> On 18/11/15 10:54, Jan Beulich wrote:
>>>>>>>> On 18.11.15 at 11:36, <ian.campbell@citrix.com> wrote:
>>>>>> On Tue, 2015-11-17 at 17:53 +0000, Andrew Cooper wrote:
>>>>>>> On 17/11/15 17:39, Jan Beulich wrote:
>>>>>>>>>>> On 17.11.15 at 18:30, <andrew.cooper3@citrix.com>
>>>>>>>>>>> wrote:
>>>>>>>>> On 17/11/15 17:04, Jan Beulich wrote:
>>>>>>>>>>>>> On 03.11.15 at 18:58, <malcolm.crossley@citrix.com>
>>>>>>>>>>>>> wrote:
>>>>>>>>>>> --- a/xen/common/grant_table.c
>>>>>>>>>>> +++ b/xen/common/grant_table.c
>>>>>>>>>>> @@ -178,6 +178,10 @@ struct active_grant_entry {
>>>>>>>>>>>  #define _active_entry(t, e) \
>>>>>>>>>>>      ((t)-
>>>>>>>>>>>> active[(e)/ACGNT_PER_PAGE][(e)%ACGNT_PER_PAGE])
>>>>>>>>>>>  
>>>>>>>>>>> +bool_t grant_rwlock_barrier;
>>>>>>>>>>> +
>>>>>>>>>>> +DEFINE_PER_CPU(rwlock_t *, grant_rwlock);
>>>>>>>>>> Shouldn't these be per grant table? And wouldn't doing so
>>>>>>>>>> eliminate
>>>>>>>>>> the main limitation of the per-CPU rwlocks?
>>>>>>>>> The grant rwlock is per grant table.
>>>>>>>> That's understood, but I don't see why the above items
>>>>>>>> aren't,
>>>>>>>> too.
>>>>>>>
>>>>>>> Ah - because there is never any circumstance where two grant
>>>>>>> tables
>>>>>>> are
>>>>>>> locked on the same pcpu.
>>>>>>
>>>>>> So per-cpu rwlocks are really a per-pcpu read lock with a
>>>>>> fallthrough
>>>>>> to a
>>>>>> per-$resource (here == granttable) rwlock when any writers are
>>>>>> present for
>>>>>> any instance $resource, not just the one where the write lock is
>>>>>> desired,
>>>>>> for the duration of any write lock?
>>>>>
>>>>
>>>> The above description is a very good one for how the per-cpu
>>>> rwlocks behave.
>>>> The code stores a pointer to the per-$resource in the percpu area
>>>> when a user is
>>>> reading the per-$resource; this is why the lock is not safe if you
>>>> take the lock
>>>> for two different per-$resources simultaneously. The grant table code
>>>> only takes
>>>> one grant table lock at any one time so it is a safe user.
>>>
>>> So essentially the "per-pcpu read lock" as I called it is really in
>>> essence
>>> a sort of "byte lock" via the NULL vs non-NULL state of the per-cpu
>>> pointer
>>> to the underlying rwlock.
>>
>> It's not quite a byte lock because it stores a full pointer to the per-$resource
>> that it's using. It could be changed to be a byte lock but then you will need a
>> percpu area per-$resource.
> 
> Right, I said "in essence sort of" and put scare quotes around the "byte
> lock" since I realise it's not literally a byte lock.
> 
> But really all I was getting at was that it has locked and unlocked states in
> some form or other.

I was just concerned that people may not pick up on the subtle difference that the
percpu read areas are used for multiple resources (of which none are locked simultaneously
by the same CPU) whereas byte locks are typically used to lock a particular
resource and so you can safely lock multiple resources simultaneously on the same CPU.

> 
> (Maybe I should have said "like a bit lock with 32 or 64 bits, setting any
> of which corresponds to acquiring the lock" ;-))
> 
Not quite, setting the per cpu read area "takes" the read lock for the particular
resource you passed into the percpu rwlock implementation. Writers of another resource
($resource1) will safely ignore readers of ($resource0).

The global barrier will however make _all_ readers take the per-$resource read lock.
An optimisation could be to have a barrier variable per-$resource (stored in the
struct grant_table in this case).
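
i.e. roughly (sketch only, field name made up):

    struct grant_table {
        rwlock_t lock;
        bool_t rwlock_barrier;  /* would replace the global grant_rwlock_barrier */
        /* remaining members unchanged */
    };

    /* A writer would then only push readers of this particular table
     * onto the slow path: */
    percpu_write_lock(&get_per_cpu_var(grant_rwlock), &gt->rwlock_barrier,
                      &gt->lock);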

Malcolm


> iAN.
> 

* Re: [PATCH 2/2] grant_table: convert grant table rwlock to percpu rwlock
  2015-11-18 13:08                       ` Malcolm Crossley
@ 2015-11-18 13:47                         ` Jan Beulich
  2015-11-18 14:22                         ` Ian Campbell
  1 sibling, 0 replies; 31+ messages in thread
From: Jan Beulich @ 2015-11-18 13:47 UTC (permalink / raw)
  To: Malcolm Crossley
  Cc: Andrew Cooper, keir, stefano.stabellini, Ian Campbell, xen-devel

>>> On 18.11.15 at 14:08, <malcolm.crossley@citrix.com> wrote:
> On 18/11/15 12:07, Ian Campbell wrote:
>> (Maybe I should have said "like a bit lock with 32 or 64 bits, setting any
>> of which corresponds to acquiring the lock" ;-))
>> 
> Not quite, setting the per cpu read area "takes" the read lock for the particular
> resource you passed into the percpu rwlock implementation. Writers of another resource
> ($resource1) will safely ignore readers of ($resource0).
> 
> The global barrier will however make _all_ readers take the per-$resource read lock.
> An optimisation could be to have a barrier variable per-$resource (stored in the
> struct grant_table in this case).

As said before, I don't view this as an optimization, but as a requirement
(to eliminate the nesting restriction).

Jan


* Re: [PATCH 1/2] rwlock: add per-cpu reader-writer locks
  2015-11-17 17:00 ` Jan Beulich
@ 2015-11-18 13:49   ` Malcolm Crossley
  2015-11-18 14:15     ` Jan Beulich
  0 siblings, 1 reply; 31+ messages in thread
From: Malcolm Crossley @ 2015-11-18 13:49 UTC (permalink / raw)
  To: Jan Beulich
  Cc: andrew.cooper3, keir, stefano.stabellini, ian.campbell, xen-devel

On 17/11/15 17:00, Jan Beulich wrote:
>>>> On 03.11.15 at 18:58, <malcolm.crossley@citrix.com> wrote:
>> Per-cpu read-write locks allow for the fast path read case to have low overhead
>> by only setting/clearing a per-cpu variable for using the read lock.
>> The per-cpu read fast path also avoids locked compare swap operations which can
>> be particularly slow on coherent multi-socket systems, particularly if there is
>> heavy usage of the read lock itself.
>>
>> The per-cpu reader-writer lock uses a global variable to control the read lock
>> fast path. This allows a writer to disable the fast path and ensure the readers
>> use the underlying read-write lock implementation.
>>
>> Once the writer has taken the write lock and disabled the fast path, it must
>> poll the per-cpu variable for all CPUs which have entered the critical section
>> for the specific read-write lock the writer is attempting to take. This design
>> allows for a single per-cpu variable to be used for read/write locks belonging
>> to separate data structures as long as multiple per-cpu read locks are not
>> simultaneously held by one particular cpu. This also means per-cpu
>> reader-writer locks are not recursion safe.
> 
> This sounds like pretty severe a restriction, and something to easily
> get wrong once more than a handful of users got introduced.

I can add an ASSERT to detect the recursion for the debug hypervisor, but you are right
that it is a restriction. I believe that ticket locks are not recursion safe
either and rely on additional infrastructure to prevent recursion occurring on
the lock itself.

> 
>> Slow path readers which are unblocked set the per-cpu variable and drop the
>> read lock. This simplifies the implementation and allows for fairness in the
>> underlying read-write lock to be taken advantage of.
>>
>> There may be slightly more overhead on the per-cpu write lock path due to
>> checking each CPU's fast path read variable, but this overhead is likely to be hidden
>> by the required delay of waiting for readers to exit the critical section.
> 
> Hmm, the "slightly" here seems to depend on the number of CPUs.

It also depends on the lock contention between the readers and the writers.

The worst case is the uncontended case where only writers are using the lock.
Each time the writer takes the lock it will also read all the percpu areas.

The best case is the read lock, in both the contended and uncontended cases,
which simply checks a global variable and then updates a per-cpu area.

The reader/writer contended case can have variable overhead, which depends on
the time it takes the readers to exit the critical section. If the last readers
are just about to exit the critical section when the writer attempts to take
the lock, then the overhead will be similar to the worst case. However, if the
time it takes the readers to exit the critical section equals the time the
writer takes to do a first pass read of all the percpu areas, then the writer
overhead is much lower. Potentially, if there is a single "lagging" reader
exiting the critical section, the writer will spin reading only that CPU's
percpu area, and the writer lock overhead may be lower than the cmpxchg
operation a standard rwlock would have.


> 
>> --- a/xen/common/spinlock.c
>> +++ b/xen/common/spinlock.c
>> @@ -492,6 +492,38 @@ int _rw_is_write_locked(rwlock_t *lock)
>>      return (lock->lock == RW_WRITE_FLAG); /* writer in critical section? */
>>  }
>>  
>> +void percpu_write_lock(rwlock_t **per_cpudata, bool_t *writer_activating,
>> +		rwlock_t *rwlock)
>> +{
>> +    int cpu;
> 
> unsigned int

Will fix.
> 
>> +    cpumask_t active_readers;
> 
> Did you consider ways to avoid such a large variable to be put on
> the stack, and none turned out suitable?

I wasn't aware the cpumask_t variable at 32 bytes was considered problematic
for the stack. I can't think of any way around the issue without causing
further restrictions (a percpu area will prevent taking two different percpu
rwlocks at the same time).

> 
>> +    cpumask_copy(&active_readers, &cpu_online_map);
> 
> While we have other cases not fully hotplug safe, I don't think we
> can allow this in basic locking infrastructure code. You need to
> protect against cpu_online_map changing.

I could move this copy to after the point where the global barrier has been raised.
This should make it safe for cpu online/offline because newly onlined CPUs will
have to take the read lock, and CPUs which go offline should have exited the
read critical section, so neither should cause any problems.
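
Something like this (trimmed sketch of the function as posted, untested):

void percpu_write_lock(rwlock_t **per_cpudata, bool_t *writer_activating,
                       rwlock_t *rwlock)
{
    cpumask_t active_readers;

    write_lock(rwlock);
    *writer_activating = true;
    smp_mb();
    /* Snapshot taken only after the barrier is raised: a CPU coming
     * online from here on must observe writer_activating and take the
     * slow path read_lock, so missing it in the snapshot is harmless. */
    cpumask_copy(&active_readers, &cpu_online_map);
    /* ... poll active_readers as in the original patch ... */
}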


> 
>> +    /* First take the write lock to protect against other writers */
> 
> Most if not all of your comments are missing a full stop.

OK, I'll fix this up.
> 
>> +    write_lock(rwlock);
>> +
>> +    /* Now set the global variable so that readers start using read_lock */
>> +    *writer_activating = true;
>> +    smp_mb();
>> +
>> +    /* Check if there are any percpu readers in progress on our rwlock*/
>> +    do
>> +    {
>> +        for_each_cpu(cpu, &active_readers)
>> +        {
>> +            /* Remove any percpu readers not contending 
>> +             * from our check mask */
> 
> Comment style.

OK.
> 
>> +            if ( per_cpu_ptr(per_cpudata, cpu) != rwlock )
>> +                cpumask_clear_cpu(cpu, &active_readers);
> 
> This is a LOCKed op, and with the subject of the patch being to lower
> the load on the fabric, I wonder whether time isn't ripe for introducing
> a non-LOCKed variant of set, clear, and their test_and_ variants.

As long as the stack is allocated per-cpu, I would hope there is no fabric
cost for operations on stack variables, but non-locked variants would still
be more efficient.

> 
>> +        }
>> +        /* Check if we've cleared all percpu readers */
>> +        if ( cpumask_empty(&active_readers) )
>> +            break;
>> +        /* Give the coherency fabric a break */
>> +        cpu_relax();
>> +    } while ( 1 );
> 
> While certainly a matter of taste, could I talk you into using
> for ( ; ; ) instead?

That's fine by me.

> 
>> --- a/xen/include/xen/spinlock.h
>> +++ b/xen/include/xen/spinlock.h
>> @@ -3,6 +3,7 @@
>>  
>>  #include <asm/system.h>
>>  #include <asm/spinlock.h>
>> +#include <xen/stdbool.h>
> 
> No inclusion of this header in code other than such shared with the
> tools please.

Will fix this up to use <asm/types.h>.

> 
>> @@ -261,4 +262,40 @@ int _rw_is_write_locked(rwlock_t *lock);
>>  #define rw_is_locked(l)               _rw_is_locked(l)
>>  #define rw_is_write_locked(l)         _rw_is_write_locked(l)
>>  
>> +static inline void percpu_read_lock(rwlock_t **per_cpudata, bool_t *writer_activating,
>> +		rwlock_t *rwlock)
>> +{
>> +    /* Indicate this cpu is reading */
>> +    this_cpu_ptr(per_cpudata) = rwlock;
>> +    smp_mb();
>> +    /* Check if a writer is waiting */
>> +    if ( unlikely(*writer_activating) )
>> +    {
>> +        /* Let the waiting writer know we aren't holding the lock */
>> +        this_cpu_ptr(per_cpudata) = NULL;
>> +        /* Wait using the read lock to keep the lock fair */
>> +        read_lock(rwlock);
>> +        /* Set the per CPU data again and continue*/
>> +        this_cpu_ptr(per_cpudata) = rwlock;
>> +        /* Drop the read lock because we don't need it anymore */
>> +        read_unlock(rwlock);
>> +    }
>> +}
> 
> So am I getting it right that the whole point of the introduction of
> this lock flavor is to replace the cmpxchg() in read_lock() with just
> the write+read above? If so I'm sure you considered alternatives,
> namely of modifying the plain rw locks - would you mind sharing
> the outcome thereof? 

I attempted to use Linux-style queued rwlocks but they have the same
limitation of maintaining the read lock state in a shared data variable.
That shared variable will always suffer from cache line bouncing, so a
data-parallel technique was the only way to avoid it.

I also tried using an atomic counter and a global barrier in the
grant table specific use case, so that the grant table version could be
changed safely, but the shared state with multiple writers still
suffered from the cache line bouncing problem.
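
Schematically, the contrast for the read fast path is (illustrative only,
not actual code from either implementation):

/* Shared-state rwlock: every reader does an atomic RMW on one cache line. */
do {
    cnts = atomic_read(&lock->cnts);
} while ( atomic_cmpxchg(&lock->cnts, cnts, cnts + 1) != cnts );

/* Data-parallel percpu rwlock: each reader writes only its own line. */
this_cpu_ptr(per_cpudata) = rwlock;
smp_mb();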

> I admit I can't see alternatives, but the price
> - as said above - seems quite high. Or maybe I'm misunderstanding
> the "as long as multiple per-cpu read locks are not simultaneously
> held by one particular cpu", as having made it here I can't right
> away see where that restriction comes from?

I've tried to detail the overhead earlier in my reply. The price is
mainly paid when writers are using an uncontended lock. The grant
table write lock is almost never taken in the typical use case and
so for particular algorithms/code flows the price may be worth paying.

> 
>> +static inline void percpu_read_unlock(rwlock_t **per_cpudata)
>> +{
>> +    this_cpu_ptr(per_cpudata) = NULL;
>> +    smp_wmb();
>> +}
>> +
>> +/* Don't inline percpu write lock as it's a complex function */
>> +void percpu_write_lock(rwlock_t **per_cpudata, bool_t *writer_activating,
>> +		rwlock_t *rwlock);
>> +
>> +static inline void percpu_write_unlock(bool_t *writer_activating, rwlock_t *rwlock)
>> +{
>> +    *writer_activating = false;
>> +    write_unlock(rwlock);
>> +}
> 
> Considering that the arguments one needs to pass to any of the
> functions here taking multiple arguments need to be in close sync,
> I'm missing abstracting macros here that need to be passed just
> a single argument (plus a declaration and a definition one).

I agree the per_cpudata and writer_activating arguments are both linked
"global" state and could be abstracted with macros. The rwlock_t is
per-resource state and would need to be passed as a separate argument.
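
Something along these lines could work (macro names hypothetical, with the
existing functions renamed to take a leading underscore):

#define DEFINE_PERCPU_RWLOCK_GLOBAL(name) \
    bool_t name##_barrier;                \
    DEFINE_PER_CPU(rwlock_t *, name)

#define percpu_read_lock(name, lock) \
    _percpu_read_lock(&get_per_cpu_var(name), &name##_barrier, lock)

#define percpu_write_lock(name, lock) \
    _percpu_write_lock(&get_per_cpu_var(name), &name##_barrier, lock)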


So I have some high-level questions:

With the current restrictions on the percpu rwlocks (no recursion, and no
accessing two percpu rwlock resources simultaneously on the same PCPU), do
you think it is worth creating a generic implementation?

Or should I just add a grant table specific implementation because the grant
table code conforms to the above restrictions?

What requirements would there be for a generic percpu rwlock implementation
to be accepted?

Is infrastructure allowing percpu areas to be used in data structures required?

Does the "no recursion" restriction need to be removed?

Does the "no accessing two percpu rwlock resources simultaneously on the same PCPU"
restriction need to be removed?

FYI, I will add assertions to my next patch set so that violations of the above
restrictions are caught at run time.
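
E.g. at the top of percpu_read_lock() (placement illustrative):

    /* Neither recursion nor a second percpu rwlock resource may be
     * in flight on this CPU. */
    ASSERT(this_cpu_ptr(per_cpudata) == NULL);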


Thanks for the review feedback; I appreciate the time you spent reviewing this code.

Malcolm

> 
> Jan
> 


* Re: [PATCH 1/2] rwlock: add per-cpu reader-writer locks
  2015-11-18 13:49   ` Malcolm Crossley
@ 2015-11-18 14:15     ` Jan Beulich
  2015-11-18 16:21       ` Malcolm Crossley
  0 siblings, 1 reply; 31+ messages in thread
From: Jan Beulich @ 2015-11-18 14:15 UTC (permalink / raw)
  To: Malcolm Crossley
  Cc: andrew.cooper3, keir, stefano.stabellini, ian.campbell, xen-devel

>>> On 18.11.15 at 14:49, <malcolm.crossley@citrix.com> wrote:
> On 17/11/15 17:00, Jan Beulich wrote:
>>>>> On 03.11.15 at 18:58, <malcolm.crossley@citrix.com> wrote:
>>> Per-cpu read-write locks allow for the fast path read case to have low overhead
>>> by only setting/clearing a per-cpu variable for using the read lock.
>>> The per-cpu read fast path also avoids locked compare swap operations which can
>>> be particularly slow on coherent multi-socket systems, particularly if there is
>>> heavy usage of the read lock itself.
>>>
>>> The per-cpu reader-writer lock uses a global variable to control the read lock
>>> fast path. This allows a writer to disable the fast path and ensure the readers
>>> use the underlying read-write lock implementation.
>>>
>>> Once the writer has taken the write lock and disabled the fast path, it must
>>> poll the per-cpu variable for all CPUs which have entered the critical section
>>> for the specific read-write lock the writer is attempting to take. This design
>>> allows for a single per-cpu variable to be used for read/write locks belonging
>>> to separate data structures as long as multiple per-cpu read locks are not
>>> simultaneously held by one particular cpu. This also means per-cpu
>>> reader-writer locks are not recursion safe.
>> 
>> This sounds like pretty severe a restriction, and something to easily
>> get wrong once more than a handful of users got introduced.
> 
> I can add an ASSERT to detect the recursion for the debug hypervisor but you are right
> that it is a restriction. I believe that the tickets locks are not recursion safe
> either and rely on additional infrastructure to prevent the recursion occurring on
> the lock itself.

No ordinary spin lock can be acquired recursively. The restriction here
is more severe: not even two different locks protecting the same kind of
resource can be used recursively.
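
I.e. (hypothetical grant tables gt_a and gt_b; exact invocation syntax
aside):

percpu_read_lock(&get_per_cpu_var(grant_rwlock), &grant_rwlock_barrier,
                 &gt_a->lock);
/* This overwrites the CPU's only marker with &gt_b->lock; a writer
 * polling gt_a now wrongly concludes this CPU has left its critical
 * section. */
percpu_read_lock(&get_per_cpu_var(grant_rwlock), &grant_rwlock_barrier,
                 &gt_b->lock);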

>>> +    cpumask_t active_readers;
>> 
>> Did you consider ways to avoid such a large variable to be put on
>> the stack, and none turned out suitable?
> 
> I wasn't aware the cpumask_t variable at 32 bytes was considered problematic
> for the stack. I can't think of any way around the issue without causing
> further restrictions (a percpu area will prevent taking two different percpu
> rwlocks at the same time).

32 bytes is only for 256 CPU builds. What about a 4095 CPU config?

Wouldn't it be possible (either excluding use of such locks from
interrupt context, or by disabling interrupts for the duration of
the mask's use) to have a per-CPU mask?

>> I admit I can't see alternatives, but the price
>> - as said above - seems quite high. Or maybe I'm misunderstanding
>> the "as long as multiple per-cpu read locks are not simultaneously
>> held by one particular cpu", as having made it here I can't right
>> away see where that restriction comes from?
> 
> I've tried to detail the overhead earlier in my reply. The price is
> mainly paid when writers are using an uncontended lock. The grant
> table write lock is almost never taken in the typical use case and
> so for particular algorithms/code flows the price may be worth paying.

With "price" I was really referring to the usage restriction, not any
performance effect.

>>> +static inline void percpu_read_unlock(rwlock_t **per_cpudata)
>>> +{
>>> +    this_cpu_ptr(per_cpudata) = NULL;
>>> +    smp_wmb();
>>> +}
>>> +
>>> +/* Don't inline percpu write lock as it's a complex function */
>>> +void percpu_write_lock(rwlock_t **per_cpudata, bool_t *writer_activating,
>>> +		rwlock_t *rwlock);
>>> +
>>> +static inline void percpu_write_unlock(bool_t *writer_activating, rwlock_t *rwlock)
>>> +{
>>> +    *writer_activating = false;
>>> +    write_unlock(rwlock);
>>> +}
>> 
>> Considering that the arguments one needs to pass to any of the
>> functions here taking multiple arguments need to be in close sync,
>> I'm missing abstracting macros here that need to be passed just
>> a single argument (plus a declaration and a definition one).
> 
> I agree the per_cpudata and writer_activating arguments are both linked
> "global" state and could be abstracted with macro's. The rwlock_t is a
> per resource state and would need to be passed as a seperate argument.
> 
> 
> So I have some high level questions:
> 
> With the current restrictions on the percpu rwlocks (no recursion, and no
> accessing two percpu rwlock resources simultaneously on the same PCPU), do
> you think it is worth creating a generic implementation?

Yes, namely with Andrew already hinting at the p2m lock also wanting
to be converted to this model.

> Or should I just add a grant table specific implementation because the grant
> table code conforms to the above restrictions?
> 
> What requirements would there be for a generic percpu rwlock implementation
> to be accepted?

At least the multiple resource lock issue should be avoided, and I think
I've pointed out before how I think this can be achieved without any
fundamental changes.

> Is infrastructure allowing percpu areas to be used in data structures 
> required?

At some point it seems this would be desirable, but I think I've
explained a way to avoid this getting overly complex for now.

> Does the "no recursion" restriction need to be removed?

Preferably yes, but I think this one can be more easily lived with.

> Does the "no accessing two percpu rwlock resources simultaneously on the 
> same PCPU"
> restriction need to be removed?

See above - yes.

Jan


* Re: [PATCH 2/2] grant_table: convert grant table rwlock to percpu rwlock
  2015-11-18 13:08                       ` Malcolm Crossley
  2015-11-18 13:47                         ` Jan Beulich
@ 2015-11-18 14:22                         ` Ian Campbell
  1 sibling, 0 replies; 31+ messages in thread
From: Ian Campbell @ 2015-11-18 14:22 UTC (permalink / raw)
  To: Malcolm Crossley, Jan Beulich
  Cc: Andrew Cooper, keir, stefano.stabellini, xen-devel

On Wed, 2015-11-18 at 13:08 +0000, Malcolm Crossley wrote:
> On 18/11/15 12:07, Ian Campbell wrote:
> > On Wed, 2015-11-18 at 11:56 +0000, Malcolm Crossley wrote:
> > > On 18/11/15 11:50, Ian Campbell wrote:
> > > > On Wed, 2015-11-18 at 11:23 +0000, Malcolm Crossley wrote:
> > > > > On 18/11/15 10:54, Jan Beulich wrote:
> > > > > > > > > On 18.11.15 at 11:36, <ian.campbell@citrix.com> wrote:
> > > > > > > On Tue, 2015-11-17 at 17:53 +0000, Andrew Cooper wrote:
> > > > > > > > On 17/11/15 17:39, Jan Beulich wrote:
> > > > > > > > > > > > On 17.11.15 at 18:30, <andrew.cooper3@citrix.com>
> > > > > > > > > > > > wrote:
> > > > > > > > > > On 17/11/15 17:04, Jan Beulich wrote:
> > > > > > > > > > > > > > On 03.11.15 at 18:58, <malcolm.crossley@citrix.com> wrote:
> > > > > > > > > > > > --- a/xen/common/grant_table.c
> > > > > > > > > > > > +++ b/xen/common/grant_table.c
> > > > > > > > > > > > @@ -178,6 +178,10 @@ struct active_grant_entry {
> > > > > > > > > > > >  #define _active_entry(t, e) \
> > > > > > > > > > > >      ((t)->active[(e)/ACGNT_PER_PAGE][(e)%ACGNT_PER_PAGE])
> > > > > > > > > > > >  
> > > > > > > > > > > > +bool_t grant_rwlock_barrier;
> > > > > > > > > > > > +
> > > > > > > > > > > > +DEFINE_PER_CPU(rwlock_t *, grant_rwlock);
> > > > > > > > > > > Shouldn't these be per grant table? And wouldn't
> > > > > > > > > > > doing so
> > > > > > > > > > > eliminate
> > > > > > > > > > > the main limitation of the per-CPU rwlocks?
> > > > > > > > > > The grant rwlock is per grant table.
> > > > > > > > > That's understood, but I don't see why the above items
> > > > > > > > > aren't,
> > > > > > > > > too.
> > > > > > > > 
> > > > > > > > Ah - because there is never any circumstance where two
> > > > > > > > grant
> > > > > > > > tables
> > > > > > > > are
> > > > > > > > locked on the same pcpu.
> > > > > > > 
> > > > > > > So per-cpu rwlocks are really a per-pcpu read lock with a
> > > > > > > fallthrough
> > > > > > > to a
> > > > > > > per-$resource (here == granttable) rwlock when any writers
> > > > > > > are
> > > > > > > present for
> > > > > > > any instance $resource, not just the one where the write lock
> > > > > > > is
> > > > > > > desired,
> > > > > > > for the duration of any write lock?
> > > > > > 
> > > > > 
> > > > > The above description is a very good summary of how the per-cpu
> > > > > rwlocks behave.
> > > > > The code stores a pointer to the per-$resource in the percpu area
> > > > > when a user is
> > > > > reading the per-$resource, this is why the lock is not safe if
> > > > > you
> > > > > take the lock
> > > > > for two different per-$resource simultaneously. The grant table
> > > > > code
> > > > > only takes
> > > > > one grant table lock at any one time so it is a safe user.
> > > > 
> > > > So essentially the "per-pcpu read lock" as I called it is really in
> > > > essence
> > > > a sort of "byte lock" via the NULL vs non-NULL state of the per-cpu
> > > > pointer
> > > > to the underlying rwlock.
> > > 
> > > It's not quite a byte lock because it stores a full pointer to the
> > > per-$resource
> > > that it's using. It could be changed to be a byte lock but then you
> > > will need a
> > > percpu area per-$resource.
> > 
> > Right, I said "in essence sort of" and put scare quotes around the
> > "byte
> > lock" since I realise it's not literally a byte lock.
> > 
> > But really all I was getting at was that it has locked and unlocked states
> > in
> > some form or other.
> 
> I was just concerned that people may not pick up on the subtle difference
> that the percpu read areas are used for multiple resources (of which none
> are locked simultaneously by the same CPU), whereas byte locks are typically
> used to lock one particular resource, so you can safely lock multiple
> resources simultaneously on the same CPU.
> 
> > 
> > (Maybe I should have said "like a bit lock with 32 or 64 bits, setting
> > any
> > of which corresponds to acquiring the lock" ;-))
> > 
> Not quite: setting the per-cpu read area "takes" the read lock for the particular
> resource you passed into the percpu rwlock implementation. Writers of one resource
> ($resource1) will safely ignore readers of another ($resource0).

Ah yes, subtle.

> The global barrier will however make _all_ readers take the per-$resource read lock.
> An optimisation could be to have a barrier variable per-$resource (stored in the
> struct grant_table in this case).

Yes, this global barrier was part of what took me down the wrong path
above.

Ian.



* Re: [PATCH 1/2] rwlock: add per-cpu reader-writer locks
  2015-11-18 14:15     ` Jan Beulich
@ 2015-11-18 16:21       ` Malcolm Crossley
  2015-11-18 17:04         ` Jan Beulich
  0 siblings, 1 reply; 31+ messages in thread
From: Malcolm Crossley @ 2015-11-18 16:21 UTC (permalink / raw)
  To: Jan Beulich
  Cc: andrew.cooper3, keir, stefano.stabellini, ian.campbell, xen-devel

On 18/11/15 14:15, Jan Beulich wrote:
>>>> On 18.11.15 at 14:49, <malcolm.crossley@citrix.com> wrote:
>> On 17/11/15 17:00, Jan Beulich wrote:
>>>>>> On 03.11.15 at 18:58, <malcolm.crossley@citrix.com> wrote:
>>>> Per-cpu read-write locks allow for the fast path read case to have low overhead
>>>> by only setting/clearing a per-cpu variable for using the read lock.
>>>> The per-cpu read fast path also avoids locked compare swap operations which can
>>>> be particularly slow on coherent multi-socket systems, particularly if there is
>>>> heavy usage of the read lock itself.
>>>>
>>>> The per-cpu reader-writer lock uses a global variable to control the read lock
>>>> fast path. This allows a writer to disable the fast path and ensure the readers
>>>> use the underlying read-write lock implementation.
>>>>
>>>> Once the writer has taken the write lock and disabled the fast path, it must
>>>> poll the per-cpu variable for all CPUs which have entered the critical section
>>>> for the specific read-write lock the writer is attempting to take. This design
>>>> allows for a single per-cpu variable to be used for read/write locks belonging
>>>> to separate data structures as long as multiple per-cpu read locks are not
>>>> simultaneously held by one particular cpu. This also means per-cpu
>>>> reader-writer locks are not recursion safe.
>>>
>>> This sounds like pretty severe a restriction, and something to easily
>>> get wrong once more than a handful of users got introduced.
>>
>> I can add an ASSERT to detect the recursion for the debug hypervisor but you are right
>> that it is a restriction. I believe that the tickets locks are not recursion safe
>> either and rely on additional infrastructure to prevent the recursion occurring on
>> the lock itself.
> 
> No ordinary spin lock can be acquired recursively. The restriction here
> is more severe: not even two different locks protecting the same kind of
> resource can be used recursively.

I have a suggestion to remove the "no accessing two percpu rwlock resources
simultaneously on the same PCPU" restriction:

On accessing the second resource, we can detect that the percpu read area is already
in use; if this is the case then we can simply take a standard read_lock on the second
resource instead of using the percpu read area. This makes the percpu rwlock safe
to use on multiple resources simultaneously, but it falls back to standard rwlock
performance when taking two or more percpu rwlocks simultaneously.
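
A rough sketch of the lock side (untested; the unlock side then needs the
rwlock argument too, so it can tell which path was taken, and recursion on
the *same* lock would remain unsafe):

static inline void percpu_read_lock(rwlock_t **per_cpudata,
                                    bool_t *writer_activating,
                                    rwlock_t *rwlock)
{
    if ( unlikely(this_cpu_ptr(per_cpudata) != NULL) )
    {
        /* Percpu slot already in use for another resource: fall back
         * to the plain rwlock for this nested acquisition. */
        read_lock(rwlock);
        return;
    }
    this_cpu_ptr(per_cpudata) = rwlock;
    smp_mb();
    if ( unlikely(*writer_activating) )
    {
        /* Slow path, as in the current patch. */
        this_cpu_ptr(per_cpudata) = NULL;
        read_lock(rwlock);
        this_cpu_ptr(per_cpudata) = rwlock;
        read_unlock(rwlock);
    }
}

static inline void percpu_read_unlock(rwlock_t **per_cpudata,
                                      rwlock_t *rwlock)
{
    if ( unlikely(this_cpu_ptr(per_cpudata) != rwlock) )
    {
        /* We took the fallback read_lock above. */
        read_unlock(rwlock);
        return;
    }
    this_cpu_ptr(per_cpudata) = NULL;
    smp_wmb();
}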

I'm suggesting the above option instead of per-resource (NR_DOM_IDS) percpu
data areas because there may be cases where the number of resources is very
large and so difficult/expensive to preallocate as percpu data areas.

> 
>>>> +    cpumask_t active_readers;
>>>
>>> Did you consider ways to avoid such a large variable to be put on
>>> the stack, and none turned out suitable?
>>
>> I wasn't aware the cpumask_t variable at 32 bytes was considered problematic
>> for the stack. I can't think of any way around the issue without causing
>> further restrictions (a percpu area will prevent taking two different percpu
>> rwlocks at the same time).
> 
> 32 bytes is only for 256 CPU builds. What about a 4095 CPU config?
> 
> Wouldn't it be possible (either excluding use of such locks from
> interrupt context, or by disabling interrupts for the duration of
> the mask's use) to have a per-CPU mask?

That sounds possible. I'd prefer preventing the use of the lock in
irq context so that the common case does not have the penalty of disabling IRQs.

It may also take some time for the readers to exit their critical section,
so disabling IRQs may cause significant interrupt latency.

The trade-off here is between IRQ latency and stack usage.
Worst case stack usage would occur with many nested interrupts.
The only way I can think of to preserve IRQ latency and bound stack usage
would be to have a preallocated percpu cpumask for each level of interrupt
nesting.

I believe for x86 the maximum level of interrupt nesting is 16 (one per vector
priority nibble).

With 4095 CPUs a cpumask_t would be 512 bytes, so 16 nested interrupts would use
all 8K of stack, whereas with 256 CPUs it is 32 bytes, for a maximum of 512 bytes.

With there being so many users of cpumask_t, maybe we should scale the stack size
with NR_CPUS instead?

Or I could preallocate a percpu array of MAX_NR_NESTING_LEVELS cpumask_t entries?

Or just disable interrupts and take the hit on IRQ latency?
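
If the no-IRQ-context route is taken, the per-CPU mask variant could be as
simple as (hypothetical name, untested):

static DEFINE_PER_CPU(cpumask_t, percpu_rwlock_readers);

void percpu_write_lock(rwlock_t **per_cpudata, bool_t *writer_activating,
                       rwlock_t *rwlock)
{
    cpumask_t *active_readers = &this_cpu(percpu_rwlock_readers);

    /* Not usable from interrupt context, so a single mask per CPU
     * suffices and no interrupt disabling is needed. */
    ASSERT(!in_irq());
    /* ... rest as before, operating on *active_readers ... */
}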

> 
>>> I admit I can't see alternatives, but the price
>>> - as said above - seems quite high. Or maybe I'm misunderstanding
>>> the "as long as multiple per-cpu read locks are not simultaneously
>>> held by one particular cpu", as having made it here I can't right
>>> away see where that restriction comes from?
>>
>> I've tried to detail the overhead earlier in my reply. The price is
>> mainly paid when writers are using an uncontended lock. The grant
>> table write lock is almost never taken in the typical use case and
>> so for particular algorithms/code flows the price may be worth paying.
> 
> With "price" I was really referring to the usage restriction, not any
> performance effect.

I thought you meant overhead when you mentioned "price" :) I should, however,
have picked up on the context you provided after the "price" statement.

> 
>>>> +static inline void percpu_read_unlock(rwlock_t **per_cpudata)
>>>> +{
>>>> +    this_cpu_ptr(per_cpudata) = NULL;
>>>> +    smp_wmb();
>>>> +}
>>>> +
>>>> +/* Don't inline percpu write lock as it's a complex function */
>>>> +void percpu_write_lock(rwlock_t **per_cpudata, bool_t *writer_activating,
>>>> +		rwlock_t *rwlock);
>>>> +
>>>> +static inline void percpu_write_unlock(bool_t *writer_activating, rwlock_t *rwlock)
>>>> +{
>>>> +    *writer_activating = false;
>>>> +    write_unlock(rwlock);
>>>> +}
>>>
>>> Considering that the arguments one needs to pass to any of the
>>> functions here taking multiple arguments need to be in close sync,
>>> I'm missing abstracting macros here that need to be passed just
>>> a single argument (plus a declaration and a definition one).
>>
>> I agree the per_cpudata and writer_activating arguments are both linked
>> "global" state and could be abstracted with macro's. The rwlock_t is a
>> per resource state and would need to be passed as a seperate argument.
>>
>>
>> So I have some high level questions:
>>
>> With the current restrictions on the percpu rwlocks (no recursion, and no
>> accessing two percpu rwlock resources simultaneously on the same PCPU), do
>> you think it is worth creating a generic implementation?
> 
> Yes, namely with Andrew already hinting at the p2m lock also wanting
> to be converted to this model.

The p2m lock is definitely on the list to be converted, due to its
use in grant operations as well.

> 
>> Or should I just add a grant table specific implementation because the grant
>> table code conforms to the above restrictions?
>>
>> What requirements would there be for a generic percpu rwlock implementation
>> to be accepted?
> 
> At least the multiple resource lock issue should be avoided, and I think
> I've pointed out before how I think this can be achieved without any
> fundamental changes.

Does the suggestion above satisfy this requirement?

> 
>> Is infrastructure allowing percpu areas to be used in data structures 
>> required?
> 
> At some point it seems this would be desirable, but I think I've
> explained a way to avoid this getting overly complex for now.
> 
>> Does the "no recursion" restriction need to be removed?
> 
> Preferably yes, but I think this one can be more easily lived with.
> 
>> Does the "no accessing two percpu rwlock resources simultaneously on the 
>> same PCPU"
>> restriction need to be removed?
> 
> See above - yes.

Thanks again for the feedback

Malcolm

> 
> Jan
> 


* Re: [PATCH 1/2] rwlock: add per-cpu reader-writer locks
  2015-11-18 16:21       ` Malcolm Crossley
@ 2015-11-18 17:04         ` Jan Beulich
  0 siblings, 0 replies; 31+ messages in thread
From: Jan Beulich @ 2015-11-18 17:04 UTC (permalink / raw)
  To: Malcolm Crossley
  Cc: andrew.cooper3, keir, stefano.stabellini, ian.campbell, xen-devel

>>> On 18.11.15 at 17:21, <malcolm.crossley@citrix.com> wrote:
> I have a suggestion to remove the "no accessing two percpu rwlock resources
> simultaneously on the same PCPU" restriction:
> 
> On accessing the second resource, we can detect that the percpu read area is
> already in use; if this is the case then we can simply take a standard
> read_lock on the second resource instead of using the percpu read area. This
> makes the percpu rwlock safe to use on multiple resources simultaneously, but
> it falls back to standard rwlock performance when taking two or more percpu
> rwlocks simultaneously.

Yes, that's an option.

> I'm suggesting the above option instead of per-resource (NR_DOM_IDS) percpu
> data areas because there may be cases where the number of resources is very
> large and so difficult/expensive to preallocate as percpu data areas.

Well, typically (grant lock, p2m lock) these would be per domain, and
with the 32k limit on domains this wouldn't be too bad (but also not
too nice, I agree).

>>>>> +    cpumask_t active_readers;
>>>>
>>>> Did you consider ways to avoid such a large variable to be put on
>>>> the stack, and none turned out suitable?
>>>
>>> I wasn't aware the cpumask_t variable at 32 bytes was considered problematic
>>> for the stack. I can't think of any way around the issue without causing
>>> further restrictions (a percpu area will prevent taking two different percpu
>>> rwlocks at the same time).
>> 
>> 32 bytes is only for 256 CPU builds. What about a 4095 CPU config?
>> 
>> Wouldn't it be possible (either excluding use of such locks from
>> interrupt context, or by disabling interrupts for the duration of
>> the mask's use) to have a per-CPU mask?
> 
> That sounds possible. I'd prefer preventing the use of the lock in
> irq context so that the common case does not have the penalty of disabling IRQs.

Which seems reasonable, and documentable by not having _irq
and _irqsave/_irqrestore flavors.

I have to admit that I didn't really follow the rest of your
argument here, since the moment we don't allow these locks
in interrupt context, all problems should be gone.

>>> What requirements would there be for a generic percpu rwlock implementation
>>> to be accepted?
>> 
>> At least the multiple resource lock issue should be avoided, and I think
>> I've pointed out before how I think this can be achieved without any
>> fundamental changes.
> 
> Does the suggestion above satisfy this requirement?

Yes, even if I like the other alternative better.

Jan


* Re: [PATCH 2/2] grant_table: convert grant table rwlock to percpu rwlock
  2015-11-17 17:30     ` Andrew Cooper
  2015-11-17 17:39       ` Jan Beulich
@ 2015-11-18 20:02       ` Konrad Rzeszutek Wilk
  2015-11-19  9:03         ` Malcolm Crossley
  2015-11-19 10:09         ` Andrew Cooper
  1 sibling, 2 replies; 31+ messages in thread
From: Konrad Rzeszutek Wilk @ 2015-11-18 20:02 UTC (permalink / raw)
  To: Andrew Cooper
  Cc: keir, ian.campbell, stefano.stabellini, Jan Beulich, xen-devel,
	Malcolm Crossley

On Tue, Nov 17, 2015 at 05:30:59PM +0000, Andrew Cooper wrote:
> On 17/11/15 17:04, Jan Beulich wrote:
> >>>> On 03.11.15 at 18:58, <malcolm.crossley@citrix.com> wrote:
> >> --- a/xen/common/grant_table.c
> >> +++ b/xen/common/grant_table.c
> >> @@ -178,6 +178,10 @@ struct active_grant_entry {
> >>  #define _active_entry(t, e) \
> >>      ((t)->active[(e)/ACGNT_PER_PAGE][(e)%ACGNT_PER_PAGE])
> >>  
> >> +bool_t grant_rwlock_barrier;
> >> +
> >> +DEFINE_PER_CPU(rwlock_t *, grant_rwlock);
> > Shouldn't these be per grant table? And wouldn't doing so eliminate
> > the main limitation of the per-CPU rwlocks?
> 
> The grant rwlock is per grant table.
> 
> The entire point of this series is to reduce the cmpxchg storm which
> happens when many pcpus attempt to grab the same domain's grant read lock.
> 
> As identified in the commit message, reducing the cmpxchg pressure on
> the cache coherency fabric increases intra-vm network throughput from
> 10Gbps to 50Gbps when running iperf between two 16-vcpu guests.
> 
> Or in other words, 80% of cpu time is wasted with waiting on an atomic
> read/modify/write operation against a remote hot cache line.
> 

Why not use MCE locks then (in Linux the implementation is known
as qspinlock)? Plus they have added extra code to protect against
recursion (via four levels). See Linux commit
a33fda35e3a7655fb7df756ed67822afb5ed5e8d
("locking/qspinlock: Introduce a simple generic 4-byte queued spinlock").

> ~Andrew
> 


* Re: [PATCH 2/2] grant_table: convert grant table rwlock to percpu rwlock
  2015-11-18 20:02       ` Konrad Rzeszutek Wilk
@ 2015-11-19  9:03         ` Malcolm Crossley
  2015-11-19 10:09         ` Andrew Cooper
  1 sibling, 0 replies; 31+ messages in thread
From: Malcolm Crossley @ 2015-11-19  9:03 UTC (permalink / raw)
  To: Konrad Rzeszutek Wilk, Andrew Cooper
  Cc: xen-devel, stefano.stabellini, keir, ian.campbell, Jan Beulich

On 18/11/15 20:02, Konrad Rzeszutek Wilk wrote:
> On Tue, Nov 17, 2015 at 05:30:59PM +0000, Andrew Cooper wrote:
>> On 17/11/15 17:04, Jan Beulich wrote:
>>>>>> On 03.11.15 at 18:58, <malcolm.crossley@citrix.com> wrote:
>>>> --- a/xen/common/grant_table.c
>>>> +++ b/xen/common/grant_table.c
>>>> @@ -178,6 +178,10 @@ struct active_grant_entry {
>>>>  #define _active_entry(t, e) \
>>>>      ((t)->active[(e)/ACGNT_PER_PAGE][(e)%ACGNT_PER_PAGE])
>>>>  
>>>> +bool_t grant_rwlock_barrier;
>>>> +
>>>> +DEFINE_PER_CPU(rwlock_t *, grant_rwlock);
>>> Shouldn't these be per grant table? And wouldn't doing so eliminate
>>> the main limitation of the per-CPU rwlocks?
>>
>> The grant rwlock is per grant table.
>>
>> The entire point of this series is to reduce the cmpxchg storm which
>> happens when many pcpus attempt to grab the same domain's grant read lock.
>>
>> As identified in the commit message, reducing the cmpxchg pressure on
>> the cache coherency fabric increases intra-vm network throughput from
>> 10Gbps to 50Gbps when running iperf between two 16-vcpu guests.
>>
>> Or in other words, 80% of cpu time is wasted with waiting on an atomic
>> read/modify/write operation against a remote hot cache line.
>>
> 
> Why not use MCE locks then (in Linux the implementation is known
> as qspinlock)? Plus they have added extra code to protect against
> recursion (via four levels). See Linux commit
> a33fda35e3a7655fb7df756ed67822afb5ed5e8d
> ("locking/qspinlock: Introduce a simple generic 4-byte queued spinlock").
> 

The Linux qspinlock is MCS based, but MCS only helps under lock contention.
It still uses a single data location for the lock and so suffers from
cache line bouncing, plus the cmpxchg overhead for taking an uncontended lock.

You can see the qspinlock using the cmpxchg mechanism here:
http://lxr.free-electrons.com/source/include/asm-generic/qspinlock.h#L62

I've copy pasted the qspinlock lock implementation inline for convenience:

static __always_inline void queued_spin_lock(struct qspinlock *lock)
{
	u32 val;

	val = atomic_cmpxchg(&lock->val, 0, _Q_LOCKED_VAL);
	if (likely(val == 0))
		return;
	queued_spin_lock_slowpath(lock, val);
}

Malcolm


>> ~Andrew
>>


* Re: [PATCH 2/2] grant_table: convert grant table rwlock to percpu rwlock
  2015-11-18 20:02       ` Konrad Rzeszutek Wilk
  2015-11-19  9:03         ` Malcolm Crossley
@ 2015-11-19 10:09         ` Andrew Cooper
  1 sibling, 0 replies; 31+ messages in thread
From: Andrew Cooper @ 2015-11-19 10:09 UTC (permalink / raw)
  To: Konrad Rzeszutek Wilk
  Cc: keir, ian.campbell, stefano.stabellini, Jan Beulich, xen-devel,
	Malcolm Crossley

On 18/11/15 20:02, Konrad Rzeszutek Wilk wrote:
> On Tue, Nov 17, 2015 at 05:30:59PM +0000, Andrew Cooper wrote:
>> On 17/11/15 17:04, Jan Beulich wrote:
>>>>>> On 03.11.15 at 18:58, <malcolm.crossley@citrix.com> wrote:
>>>> --- a/xen/common/grant_table.c
>>>> +++ b/xen/common/grant_table.c
>>>> @@ -178,6 +178,10 @@ struct active_grant_entry {
>>>>  #define _active_entry(t, e) \
>>>>      ((t)->active[(e)/ACGNT_PER_PAGE][(e)%ACGNT_PER_PAGE])
>>>>  
>>>> +bool_t grant_rwlock_barrier;
>>>> +
>>>> +DEFINE_PER_CPU(rwlock_t *, grant_rwlock);
>>> Shouldn't these be per grant table? And wouldn't doing so eliminate
>>> the main limitation of the per-CPU rwlocks?
>> The grant rwlock is per grant table.
>>
>> The entire point of this series is to reduce the cmpxchg storm which
>> happens when many pcpus attempt to grab the same domain's grant read lock.
>>
>> As identified in the commit message, reducing the cmpxchg pressure on
>> the cache coherency fabric increases intra-vm network throughput from
>> 10Gbps to 50Gbps when running iperf between two 16-vcpu guests.
>>
>> Or in other words, 80% of cpu time is wasted with waiting on an atomic
>> read/modify/write operation against a remote hot cache line.
>>
> Why not use MCE locks then (in Linux the implementation is known
> as qspinlock)? Plus they have added extra code to protect against
> recursion (via four levels). See Linux commit
> a33fda35e3a7655fb7df756ed67822afb5ed5e8d
> ("locking/qspinlock: Introduce a simple generic 4-byte queued spinlock").

The bottleneck here is the act of taking the read lock, even when it
will be successfully acquired.

There are several good reasons to move to MCS locks (not MCE locks,
which sound somewhat disastrous), but they will not help in the
slightest with this issue.

The optimisation here is to avoid taking the read lock altogether in
the (overwhelmingly) common case.

~Andrew


end of thread

Thread overview: 31+ messages:
2015-11-03 17:58 [PATCH 1/2] rwlock: add per-cpu reader-writer locks Malcolm Crossley
2015-11-03 17:58 ` [PATCH 2/2] grant_table: convert grant table rwlock to percpu rwlock Malcolm Crossley
2015-11-17 17:04   ` Jan Beulich
2015-11-17 17:30     ` Andrew Cooper
2015-11-17 17:39       ` Jan Beulich
2015-11-17 17:53         ` Andrew Cooper
2015-11-18  7:45           ` Jan Beulich
2015-11-18 10:06             ` Andrew Cooper
2015-11-18 10:48               ` Jan Beulich
2015-11-18 10:36           ` Ian Campbell
2015-11-18 10:54             ` Jan Beulich
2015-11-18 11:23               ` Malcolm Crossley
2015-11-18 11:41                 ` Jan Beulich
2015-11-18 11:50                   ` Malcolm Crossley
2015-11-18 11:50                 ` Ian Campbell
2015-11-18 11:56                   ` Malcolm Crossley
2015-11-18 12:07                     ` Ian Campbell
2015-11-18 13:08                       ` Malcolm Crossley
2015-11-18 13:47                         ` Jan Beulich
2015-11-18 14:22                         ` Ian Campbell
2015-11-18 20:02       ` Konrad Rzeszutek Wilk
2015-11-19  9:03         ` Malcolm Crossley
2015-11-19 10:09         ` Andrew Cooper
2015-11-05 13:48 ` [PATCH 1/2] rwlock: add per-cpu reader-writer locks Marcos E. Matsunaga
2015-11-05 15:20   ` Malcolm Crossley
2015-11-05 15:46     ` Marcos E. Matsunaga
2015-11-17 17:00 ` Jan Beulich
2015-11-18 13:49   ` Malcolm Crossley
2015-11-18 14:15     ` Jan Beulich
2015-11-18 16:21       ` Malcolm Crossley
2015-11-18 17:04         ` Jan Beulich
