From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
To: Herbert Xu <herbert@gondor.apana.org.au>
Cc: David Miller <davem@davemloft.net>,
timo.teras@iki.fi, netdev@vger.kernel.org
Subject: Re: xfrm_state locking regression...
Date: Thu, 11 Sep 2008 14:24:59 -0700 [thread overview]
Message-ID: <20080911212459.GL6693@linux.vnet.ibm.com> (raw)
In-Reply-To: <20080909143312.GA29952@gondor.apana.org.au>
On Wed, Sep 10, 2008 at 12:33:12AM +1000, Herbert Xu wrote:
> On Mon, Sep 08, 2008 at 05:25:13PM -0700, David Miller wrote:
> >
> > The only comment I would make is that maybe it's a bit excessive
> > to trigger the GC worker every time we walk the states.
>
> Good point!
>
> I've avoided the memory barrier by simply extending the mutexed
> section in the GC to cover the list splicing. Here's the updated
> patch:
>
> ipsec: Use RCU-like construct for saved state within a walk
>
> Now that we save states within a walk we need synchronisation
> so that the list the saved state is on doesn't disappear from
> under us.
>
> As it stands this is done by keeping the state on the list which
> is bad because it gets in the way of the management of the state
> life-cycle.
>
> An alternative is to make our own pseudo-RCU system where we use
> counters to indicate which state can't be freed immediately as
> it may be referenced by an ongoing walk when that resumes.
There is only one reader at a time, right? Otherwise, I don't see how
the increments and reads of xfrm_state_walk_completed line up.
Thanx, Paul
> Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
>
> diff --git a/include/net/xfrm.h b/include/net/xfrm.h
> index 2933d74..4bb9499 100644
> --- a/include/net/xfrm.h
> +++ b/include/net/xfrm.h
> @@ -120,9 +120,11 @@ extern struct mutex xfrm_cfg_mutex;
> /* Full description of state of transformer. */
> struct xfrm_state
> {
> - /* Note: bydst is re-used during gc */
> struct list_head all;
> - struct hlist_node bydst;
> + union {
> + struct list_head gclist;
> + struct hlist_node bydst;
> + };
> struct hlist_node bysrc;
> struct hlist_node byspi;
>
> @@ -1286,16 +1288,9 @@ static inline void xfrm_state_walk_init(struct xfrm_state_walk *walk, u8 proto)
> walk->count = 0;
> }
>
> -static inline void xfrm_state_walk_done(struct xfrm_state_walk *walk)
> -{
> - if (walk->state != NULL) {
> - xfrm_state_put(walk->state);
> - walk->state = NULL;
> - }
> -}
> -
> extern int xfrm_state_walk(struct xfrm_state_walk *walk,
> int (*func)(struct xfrm_state *, int, void*), void *);
> +extern void xfrm_state_walk_done(struct xfrm_state_walk *walk);
> extern struct xfrm_state *xfrm_state_alloc(void);
> extern struct xfrm_state *xfrm_state_find(xfrm_address_t *daddr, xfrm_address_t *saddr,
> struct flowi *fl, struct xfrm_tmpl *tmpl,
> diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c
> index 7bd62f6..d90f936 100644
> --- a/net/xfrm/xfrm_state.c
> +++ b/net/xfrm/xfrm_state.c
> @@ -59,6 +59,11 @@ static unsigned int xfrm_state_hashmax __read_mostly = 1 * 1024 * 1024;
> static unsigned int xfrm_state_num;
> static unsigned int xfrm_state_genid;
>
> +/* Counter indicating ongoing walk, protected by xfrm_state_lock. */
> +static unsigned long xfrm_state_walk_ongoing;
> +/* Counter indicating walk completion, protected by xfrm_cfg_mutex. */
> +static unsigned long xfrm_state_walk_completed;
> +
> static struct xfrm_state_afinfo *xfrm_state_get_afinfo(unsigned int family);
> static void xfrm_state_put_afinfo(struct xfrm_state_afinfo *afinfo);
>
> @@ -191,7 +196,8 @@ static DEFINE_RWLOCK(xfrm_state_afinfo_lock);
> static struct xfrm_state_afinfo *xfrm_state_afinfo[NPROTO];
>
> static struct work_struct xfrm_state_gc_work;
> -static HLIST_HEAD(xfrm_state_gc_list);
> +static LIST_HEAD(xfrm_state_gc_leftovers);
> +static LIST_HEAD(xfrm_state_gc_list);
> static DEFINE_SPINLOCK(xfrm_state_gc_lock);
>
> int __xfrm_state_delete(struct xfrm_state *x);
> @@ -403,17 +409,22 @@ static void xfrm_state_gc_destroy(struct xfrm_state *x)
>
> static void xfrm_state_gc_task(struct work_struct *data)
> {
> - struct xfrm_state *x;
> - struct hlist_node *entry, *tmp;
> - struct hlist_head gc_list;
> + struct xfrm_state *x, *tmp;
> + unsigned long completed;
>
> + mutex_lock(&xfrm_cfg_mutex);
> spin_lock_bh(&xfrm_state_gc_lock);
> - gc_list.first = xfrm_state_gc_list.first;
> - INIT_HLIST_HEAD(&xfrm_state_gc_list);
> + list_splice_tail_init(&xfrm_state_gc_list, &xfrm_state_gc_leftovers);
> spin_unlock_bh(&xfrm_state_gc_lock);
>
> - hlist_for_each_entry_safe(x, entry, tmp, &gc_list, bydst)
> + completed = xfrm_state_walk_completed;
> + mutex_unlock(&xfrm_cfg_mutex);
> +
> + list_for_each_entry_safe(x, tmp, &xfrm_state_gc_leftovers, gclist) {
> + if ((long)(x->lastused - completed) > 0)
> + break;
> xfrm_state_gc_destroy(x);
> + }
>
> wake_up(&km_waitq);
> }
> @@ -540,12 +551,8 @@ void __xfrm_state_destroy(struct xfrm_state *x)
> {
> WARN_ON(x->km.state != XFRM_STATE_DEAD);
>
> - spin_lock_bh(&xfrm_state_lock);
> - list_del(&x->all);
> - spin_unlock_bh(&xfrm_state_lock);
> -
> spin_lock_bh(&xfrm_state_gc_lock);
> - hlist_add_head(&x->bydst, &xfrm_state_gc_list);
> + list_add_tail(&x->gclist, &xfrm_state_gc_list);
> spin_unlock_bh(&xfrm_state_gc_lock);
> schedule_work(&xfrm_state_gc_work);
> }
> @@ -558,6 +565,8 @@ int __xfrm_state_delete(struct xfrm_state *x)
> if (x->km.state != XFRM_STATE_DEAD) {
> x->km.state = XFRM_STATE_DEAD;
> spin_lock(&xfrm_state_lock);
> + x->lastused = xfrm_state_walk_ongoing;
> + list_del_rcu(&x->all);
> hlist_del(&x->bydst);
> hlist_del(&x->bysrc);
> if (x->id.spi)
> @@ -1572,6 +1581,7 @@ int xfrm_state_walk(struct xfrm_state_walk *walk,
> if (err) {
> xfrm_state_hold(last);
> walk->state = last;
> + xfrm_state_walk_ongoing++;
> goto out;
> }
> }
> @@ -1586,12 +1596,28 @@ int xfrm_state_walk(struct xfrm_state_walk *walk,
> err = func(last, 0, data);
> out:
> spin_unlock_bh(&xfrm_state_lock);
> - if (old != NULL)
> + if (old != NULL) {
> xfrm_state_put(old);
> + xfrm_state_walk_completed++;
> + if (!list_empty(&xfrm_state_gc_leftovers))
> + schedule_work(&xfrm_state_gc_work);
> + }
> return err;
> }
> EXPORT_SYMBOL(xfrm_state_walk);
>
> +void xfrm_state_walk_done(struct xfrm_state_walk *walk)
> +{
> + if (walk->state != NULL) {
> + xfrm_state_put(walk->state);
> + walk->state = NULL;
> + xfrm_state_walk_completed++;
> + if (!list_empty(&xfrm_state_gc_leftovers))
> + schedule_work(&xfrm_state_gc_work);
> + }
> +}
> +EXPORT_SYMBOL(xfrm_state_walk_done);
> +
>
> void xfrm_replay_notify(struct xfrm_state *x, int event)
> {
>
> Thanks,
> --
> Visit Openswan at http://www.openswan.org/
> Email: Herbert Xu 许志壬 <herbert@gondor.apana.org.au>
> Home Page: http://gondor.apana.org.au/~herbert/
> PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
> --
> To unsubscribe from this list: send the line "unsubscribe netdev" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at http://vger.kernel.org/majordomo-info.html
next prev parent reply other threads:[~2008-09-11 21:25 UTC|newest]
Thread overview: 95+ messages / expand[flat|nested] mbox.gz Atom feed top
2008-09-03 2:51 xfrm_state locking regression David Miller
2008-09-03 3:00 ` David Miller
2008-09-03 5:01 ` Herbert Xu
2008-09-03 5:07 ` Timo Teräs
2008-09-03 5:23 ` Herbert Xu
2008-09-03 5:39 ` Timo Teräs
2008-09-03 5:40 ` Herbert Xu
2008-09-09 12:25 ` David Miller
2008-09-03 5:39 ` Herbert Xu
2008-09-03 5:45 ` Timo Teräs
2008-09-03 5:50 ` Herbert Xu
2008-09-03 6:14 ` David Miller
2008-09-03 6:27 ` Timo Teräs
2008-09-03 6:35 ` David Miller
2008-09-03 6:45 ` Timo Teräs
2008-09-03 6:47 ` David Miller
2008-09-03 7:14 ` Timo Teräs
2008-09-05 11:55 ` Herbert Xu
2008-09-09 0:09 ` David Miller
2008-09-09 0:18 ` Herbert Xu
2008-09-09 0:20 ` David Miller
2008-09-09 0:25 ` David Miller
2008-09-09 14:33 ` Herbert Xu
2008-09-09 20:20 ` David Miller
2008-09-10 3:01 ` David Miller
2008-09-11 21:24 ` Paul E. McKenney [this message]
2008-09-11 22:00 ` David Miller
2008-09-11 23:22 ` Paul E. McKenney
2008-09-12 16:08 ` Herbert Xu
2008-09-12 17:37 ` Paul E. McKenney
2008-09-21 12:29 ` Timo Teräs
2008-09-21 15:21 ` Timo Teräs
2008-09-22 11:42 ` Herbert Xu
2008-09-22 13:01 ` Timo Teräs
2008-09-22 23:50 ` Herbert Xu
2008-09-23 4:53 ` Timo Teräs
2008-09-23 4:59 ` Herbert Xu
2008-09-23 5:17 ` Timo Teräs
2008-09-23 5:22 ` Herbert Xu
2008-09-23 6:25 ` Timo Teräs
2008-09-23 6:47 ` Herbert Xu
2008-09-23 6:56 ` Timo Teräs
2008-09-23 9:39 ` Timo Teräs
2008-09-23 11:24 ` Herbert Xu
2008-09-23 12:08 ` Timo Teräs
2008-09-23 12:14 ` Herbert Xu
2008-09-23 12:25 ` Timo Teräs
2008-09-23 12:56 ` Herbert Xu
2008-09-23 13:01 ` Timo Teräs
2008-09-23 13:07 ` Herbert Xu
2008-09-23 13:30 ` Timo Teräs
2008-09-23 13:32 ` Herbert Xu
2008-09-23 13:46 ` Timo Teräs
2008-09-24 4:23 ` Herbert Xu
2008-09-24 5:14 ` Timo Teräs
2008-09-24 5:15 ` Herbert Xu
2008-09-24 5:46 ` Timo Teräs
2008-09-24 5:55 ` Herbert Xu
2008-09-24 6:04 ` Timo Teräs
2008-09-24 6:13 ` Herbert Xu
2008-09-24 6:20 ` Timo Teräs
2008-09-24 6:21 ` Herbert Xu
2008-09-24 7:29 ` Timo Teräs
2008-09-24 7:54 ` Herbert Xu
2008-09-24 13:18 ` Timo Teräs
2008-09-24 14:08 ` Herbert Xu
2008-09-25 6:03 ` Timo Teräs
2008-09-25 7:57 ` Herbert Xu
2008-09-25 8:42 ` Timo Teräs
2008-09-25 8:56 ` Herbert Xu
2008-09-25 9:01 ` Timo Teräs
2008-09-25 9:49 ` Herbert Xu
2008-09-25 12:12 ` Timo Teräs
2008-09-25 12:36 ` Timo Teräs
2008-09-26 2:08 ` Herbert Xu
2008-10-01 10:07 ` David Miller
2008-10-01 14:05 ` Herbert Xu
2008-09-23 2:48 ` David Miller
2008-09-10 3:04 ` David Miller
2008-09-10 3:15 ` Herbert Xu
2008-09-10 3:22 ` David Miller
2008-09-10 3:23 ` Herbert Xu
2008-09-10 3:38 ` David Miller
2008-09-10 4:01 ` Herbert Xu
2008-09-10 4:06 ` David Miller
2008-09-10 4:22 ` Herbert Xu
2008-09-10 4:24 ` David Miller
2008-09-10 4:48 ` David Miller
2008-09-10 4:52 ` David Miller
2008-09-10 4:53 ` Herbert Xu
2008-09-10 5:21 ` David Miller
2008-09-10 5:16 ` Timo Teräs
2008-09-10 5:23 ` David Miller
2008-09-10 5:46 ` Herbert Xu
2008-09-03 6:10 ` David Miller
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20080911212459.GL6693@linux.vnet.ibm.com \
--to=paulmck@linux.vnet.ibm.com \
--cc=davem@davemloft.net \
--cc=herbert@gondor.apana.org.au \
--cc=netdev@vger.kernel.org \
--cc=timo.teras@iki.fi \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.