From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S268197AbUHTUVC (ORCPT ); Fri, 20 Aug 2004 16:21:02 -0400 Received: (majordomo@vger.kernel.org) by vger.kernel.org id S266334AbUHTUVB (ORCPT ); Fri, 20 Aug 2004 16:21:01 -0400 Received: from dbl.q-ag.de ([213.172.117.3]:16313 "EHLO dbl.q-ag.de") by vger.kernel.org with ESMTP id S268723AbUHTUQT (ORCPT ); Fri, 20 Aug 2004 16:16:19 -0400 Message-ID: <41265CCE.3070808@colorfullife.com> Date: Fri, 20 Aug 2004 22:19:26 +0200 From: Manfred Spraul User-Agent: Mozilla/5.0 (X11; U; Linux i686; fr-FR; rv:1.7.2) Gecko/20040803 X-Accept-Language: en-us, en MIME-Version: 1.0 To: Jesse Barnes CC: paulmck@us.ibm.com, "Martin J. Bligh" , hawkes@sgi.com, linux-kernel@vger.kernel.org, wli@holomorphy.com Subject: Re: kernbench on 512p References: <200408191216.33667.jbarnes@engr.sgi.com> <200408192016.10064.jbarnes@engr.sgi.com> <20040820155717.GF1243@us.ibm.com> <200408201324.32464.jbarnes@engr.sgi.com> In-Reply-To: <200408201324.32464.jbarnes@engr.sgi.com> Content-Type: multipart/mixed; boundary="------------020804040008090304010607" Sender: linux-kernel-owner@vger.kernel.org X-Mailing-List: linux-kernel@vger.kernel.org This is a multi-part message in MIME format. --------------020804040008090304010607 Content-Type: text/plain; charset=ISO-8859-1; format=flowed Content-Transfer-Encoding: 7bit Jesse Barnes wrote: >Looks like a bit more context has changed. Manfred, care to respin against >-mm3 so I can test? > > > The patches are attached. Just boot-tested on a single-cpu system. Three changes: - I've placed the per-group structure into rcu_state. That's simpler but wrong: the state should be allocated from node-local memory, not a big global array. - I found a bug/race in the cpu_offline path: When the last cpu of a group goes offline then the group must be forced into quiescent state. The "&& (!forced)" was missing. - I've removed the spin_unlock_wait(). It was intended to synchronize cpu_online_mask changes with the calculation of ->outstanding. Paul convinced me that this is not necessary. -- Manfred --------------020804040008090304010607 Content-Type: text/plain; name="patch-rcu268-01-locking" Content-Transfer-Encoding: 7bit Content-Disposition: inline; filename="patch-rcu268-01-locking" --- 2.6/kernel/rcupdate.c 2004-08-20 19:59:22.000000000 +0200 +++ build-2.6/kernel/rcupdate.c 2004-08-20 20:46:35.952639280 +0200 @@ -237,14 +237,29 @@ * Clear it from the cpu mask and complete the grace period if it was the last * cpu. Start another grace period if someone has further entries pending */ -static void cpu_quiet(int cpu, struct rcu_ctrlblk *rcp, struct rcu_state *rsp) +static void cpu_quiet(int cpu, struct rcu_ctrlblk *rcp, struct rcu_state *rsp, + struct rcu_data *rdp, int force) { + spin_lock(&rsp->lock); + + if (unlikely(rcp->completed == rcp->cur)) + goto out_unlock; + /* + * RCU_quiescbatch/batch.cur and the cpu bitmap can come out of sync + * during cpu startup. Ignore the quiescent state if that happened. + */ + if (unlikely(rdp->quiescbatch != rcp->cur) && likely(!force)) + goto out_unlock; + cpu_clear(cpu, rsp->cpumask); if (cpus_empty(rsp->cpumask)) { /* batch completed ! */ rcp->completed = rcp->cur; rcu_start_batch(rcp, rsp, 0); } +out_unlock: + spin_unlock(&rsp->lock); + } /* @@ -279,15 +294,7 @@ return; rdp->qs_pending = 0; - spin_lock(&rsp->lock); - /* - * rdp->quiescbatch/rcp->cur and the cpu bitmap can come out of sync - * during cpu startup. Ignore the quiescent state. - */ - if (likely(rdp->quiescbatch == rcp->cur)) - cpu_quiet(rdp->cpu, rcp, rsp); - - spin_unlock(&rsp->lock); + cpu_quiet(rdp->cpu, rcp, rsp, rdp, 0); } @@ -314,10 +321,10 @@ * we can block indefinitely waiting for it, so flush * it here */ - spin_lock_bh(&rsp->lock); - if (rcp->cur != rcp->completed) - cpu_quiet(rdp->cpu, rcp, rsp); - spin_unlock_bh(&rsp->lock); + local_bh_disable(); + cpu_quiet(rdp->cpu, rcp, rsp, rdp, 1); + local_bh_enable(); + rcu_move_batch(this_rdp, rdp->curlist, rdp->curtail); rcu_move_batch(this_rdp, rdp->nxtlist, rdp->nxttail); --------------020804040008090304010607 Content-Type: text/plain; name="patch-rcu268-02-groups" Content-Transfer-Encoding: 7bit Content-Disposition: inline; filename="patch-rcu268-02-groups" --- 2.6/kernel/rcupdate.c 2004-08-20 21:52:45.272210984 +0200 +++ build-2.6/kernel/rcupdate.c 2004-08-20 21:52:24.664343856 +0200 @@ -53,17 +53,59 @@ struct rcu_ctrlblk rcu_bh_ctrlblk = { .cur = -300, .completed = -300 , .lock = SEQCNT_ZERO }; +/* XXX Dummy - should belong into arch XXX */ +#define RCU_HUGE +#define RCU_GROUP_SIZE 2 +/* XXX End of dummy XXX */ + +#ifdef RCU_HUGE + +#define RCU_GROUPCOUNT ((NR_CPUS+RCU_GROUP_SIZE-1)/RCU_GROUP_SIZE) +#define RCU_GROUP_CPUMASKLEN ((RCU_GROUP_SIZE+BITS_PER_LONG-1)/BITS_PER_LONG) +#define RCU_GROUPMASKLEN ((NR_CPUS+RCU_GROUP_SIZE*BITS_PER_LONG-1)/(RCU_GROUP_SIZE*BITS_PER_LONG)) + +struct rcu_group_state { + spinlock_t lock; /* Guard this struct */ + long batchnum; /* batchnum this group is working on. Mitmatch with + * ctrlblk->cur means reinitialize outstanding to + * all active cpus in this group. + */ + unsigned long outstanding[RCU_GROUP_CPUMASKLEN]; +} ____cacheline_maxaligned_in_smp; + +#endif + /* Bookkeeping of the progress of the grace period */ struct rcu_state { spinlock_t lock; /* Guard this struct and writes to rcu_ctrlblk */ +#ifdef RCU_HUGE + long batchnum; /* batchnum the system is working on. Mismatch + * with rcu_ctrlblk.cur means reinitialize + * outstanding to all groups with active cpus + */ + unsigned long outstanding[RCU_GROUPMASKLEN]; + struct rcu_group_state groups[RCU_GROUPCOUNT]; +#else cpumask_t cpumask; /* CPUs that need to switch in order */ /* for current batch to proceed. */ +#endif }; -struct rcu_state rcu_state ____cacheline_maxaligned_in_smp = - {.lock = SPIN_LOCK_UNLOCKED, .cpumask = CPU_MASK_NONE }; -struct rcu_state rcu_bh_state ____cacheline_maxaligned_in_smp = - {.lock = SPIN_LOCK_UNLOCKED, .cpumask = CPU_MASK_NONE }; +#ifdef RCU_HUGE +#define RCU_STATE_INITIALIZER \ + { \ + .lock = SPIN_LOCK_UNLOCKED, \ + .batchnum = -400, \ + .groups = { [0 ... RCU_GROUPCOUNT-1] = \ + { .lock = SPIN_LOCK_UNLOCKED, .batchnum = -400 } } \ + } + +#else +#define RCU_STATE_INITIALIZER {.lock = SPIN_LOCK_UNLOCKED, .cpumask = CPU_MASK_NONE } +#endif + +struct rcu_state rcu_state ____cacheline_maxaligned_in_smp = RCU_STATE_INITIALIZER; +struct rcu_state rcu_bh_state ____cacheline_maxaligned_in_smp = RCU_STATE_INITIALIZER; DEFINE_PER_CPU(struct rcu_data, rcu_data) = { 0L }; DEFINE_PER_CPU(struct rcu_data, rcu_bh_data) = { 0L }; @@ -223,8 +265,15 @@ if (rcp->next_pending && rcp->completed == rcp->cur) { +#ifdef RCU_HUGE + /* Nothing to do: RCU_HUGE uses lazy initialization of the + * outstanding bitmap + */ +#else + /* FIXME: what does this comment mean? */ /* Can't change, since spin lock held. */ cpus_andnot(rsp->cpumask, cpu_online_map, nohz_cpu_mask); +#endif write_seqcount_begin(&rcp->lock); rcp->next_pending = 0; rcp->cur++; @@ -237,6 +286,76 @@ * Clear it from the cpu mask and complete the grace period if it was the last * cpu. Start another grace period if someone has further entries pending */ +#ifdef RCU_HUGE +static void cpu_quiet(int cpu, struct rcu_ctrlblk *rcp, struct rcu_state *rsp, + struct rcu_data *rdp, int force) +{ + struct rcu_group_state *rgs; + long batch; + int i; + + batch = rcp->cur; + + rgs = &rsp->groups[cpu/RCU_GROUP_SIZE]; + + spin_lock(&rgs->lock); + if (rgs->batchnum != batch) { + int offset; + /* first call for this batch - initialize outstanding */ + rgs->batchnum = batch; + memset(rgs->outstanding, 0, sizeof(rgs->outstanding)); + offset = (cpu/RCU_GROUP_SIZE)*RCU_GROUP_SIZE; + for (i=0;ioutstanding); + } + } + if (unlikely(rdp->quiescbatch != rgs->batchnum) && likely(!force)) + goto out_unlock_group; + + __clear_bit(cpu%RCU_GROUP_SIZE, rgs->outstanding); + for (i=0;ioutstanding[i]) + break; + } + if (i==RCU_GROUP_CPUMASKLEN) { + /* group completed, escalate to global level */ + spin_lock(&rsp->lock); + + if (rsp->batchnum != rcp->cur) { + /* first call for this batch - initialize outstanding */ + rsp->batchnum = rcp->cur; + memset(rsp->outstanding, 0, sizeof(rsp->outstanding)); + + for (i=0;ioutstanding); + } + } + if (unlikely(rgs->batchnum != rsp->batchnum) && likely(!force)) + goto out_unlock_all; + __clear_bit(cpu/RCU_GROUP_SIZE, rsp->outstanding); + for (i=0;ioutstanding[i]) + break; + } + if (i==RCU_GROUPMASKLEN) { + /* all groups completed, batch completed */ + rcp->completed = rcp->cur; + rcu_start_batch(rcp, rsp, 0); + } +out_unlock_all: + spin_unlock(&rcu_state.lock); + } +out_unlock_group: + spin_unlock(&rgs->lock); +} +#else static void cpu_quiet(int cpu, struct rcu_ctrlblk *rcp, struct rcu_state *rsp, struct rcu_data *rdp, int force) { @@ -261,6 +380,7 @@ spin_unlock(&rsp->lock); } +#endif /* * Check if the cpu has gone through a quiescent state (say context @@ -418,8 +538,25 @@ tasklet_schedule(&per_cpu(rcu_tasklet, cpu)); } +#ifdef RCU_HUGE +static void rcu_update_group(int cpu, struct rcu_ctrlblk *rcp, + struct rcu_state *rsp) +{ + int i, offset; + offset = (cpu/RCU_GROUP_SIZE)*RCU_GROUP_SIZE; + for (i=0;igroups[cpu/RCU_GROUP_SIZE].batchnum = rcp->completed; + } +} +#endif + static void rcu_init_percpu_data(int cpu, struct rcu_ctrlblk *rcp, - struct rcu_data *rdp) + struct rcu_state *rsp, struct rcu_data *rdp) { memset(rdp, 0, sizeof(*rdp)); rdp->curtail = &rdp->curlist; @@ -428,6 +565,9 @@ rdp->quiescbatch = rcp->completed; rdp->qs_pending = 0; rdp->cpu = cpu; +#ifdef RCU_HUGE + rcu_update_group(cpu, rcp, rsp); +#endif } static void __devinit rcu_online_cpu(int cpu) @@ -435,8 +575,8 @@ struct rcu_data *rdp = &per_cpu(rcu_data, cpu); struct rcu_data *bh_rdp = &per_cpu(rcu_bh_data, cpu); - rcu_init_percpu_data(cpu, &rcu_ctrlblk, rdp); - rcu_init_percpu_data(cpu, &rcu_bh_ctrlblk, bh_rdp); + rcu_init_percpu_data(cpu, &rcu_ctrlblk, &rcu_state, rdp); + rcu_init_percpu_data(cpu, &rcu_bh_ctrlblk, &rcu_bh_state, bh_rdp); tasklet_init(&per_cpu(rcu_tasklet, cpu), rcu_process_callbacks, 0UL); } --------------020804040008090304010607--