* [IPV4 1/5] fib_trie: more whitespace cleanup
[not found] <20080123224844.610730277@linux-foundation.org>
@ 2008-01-23 22:48 ` Stephen Hemminger
2008-01-24 4:37 ` David Miller
2008-01-23 22:48 ` [IPV4 2/5] fib_trie: remove unneeded NULL check Stephen Hemminger
` (3 subsequent siblings)
4 siblings, 1 reply; 20+ messages in thread
From: Stephen Hemminger @ 2008-01-23 22:48 UTC (permalink / raw)
To: David Miller; +Cc: netdev
[-- Attachment #1: trie-whitespace.patch --]
[-- Type: text/plain, Size: 1454 bytes --]
Remove extra blank lines.
Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>
--- a/net/ipv4/fib_trie.c 2008-01-22 17:46:28.000000000 -0800
+++ b/net/ipv4/fib_trie.c 2008-01-22 17:50:44.000000000 -0800
@@ -447,7 +447,6 @@ static void tnode_put_child_reorg(struct
BUG_ON(i >= 1<<tn->bits);
-
/* update emptyChildren */
if (n == NULL && chi != NULL)
tn->empty_children++;
@@ -1303,7 +1302,6 @@ err:
return err;
}
-
/* should be called with rcu_read_lock */
static int check_leaf(struct trie *t, struct leaf *l,
t_key key, const struct flowi *flp,
@@ -1718,7 +1716,6 @@ static struct leaf *leaf_walk_rcu(struct
return NULL; /* Root of trie */
}
-
static struct leaf *trie_firstleaf(struct trie *t)
{
struct tnode *n = (struct tnode *) rcu_dereference(t->trie);
@@ -1846,7 +1843,6 @@ static int fn_trie_dump_fa(t_key key, in
{
int i, s_i;
struct fib_alias *fa;
-
__be32 xkey = htonl(key);
s_i = cb->args[4];
@@ -1879,7 +1875,6 @@ static int fn_trie_dump_fa(t_key key, in
return skb->len;
}
-
static int fn_trie_dump_leaf(struct leaf *l, struct fib_table *tb,
struct sk_buff *skb, struct netlink_callback *cb)
{
@@ -2385,7 +2380,6 @@ static int fib_trie_seq_show(struct seq_
struct leaf *l = (struct leaf *) n;
struct leaf_info *li;
struct hlist_node *node;
-
__be32 val = htonl(l->key);
seq_indent(seq, iter->depth);
--
Stephen Hemminger <stephen.hemminger@vyatta.com>
^ permalink raw reply [flat|nested] 20+ messages in thread
* [IPV4 2/5] fib_trie: remove unneeded NULL check
[not found] <20080123224844.610730277@linux-foundation.org>
2008-01-23 22:48 ` [IPV4 1/5] fib_trie: more whitespace cleanup Stephen Hemminger
@ 2008-01-23 22:48 ` Stephen Hemminger
2008-01-24 4:38 ` David Miller
2008-01-23 22:48 ` [IPV4 3/5] fib_trie: dump doesnt use RCU Stephen Hemminger
` (2 subsequent siblings)
4 siblings, 1 reply; 20+ messages in thread
From: Stephen Hemminger @ 2008-01-23 22:48 UTC (permalink / raw)
To: David Miller; +Cc: netdev
[-- Attachment #1: trie-already-hlist.patch --]
[-- Type: text/plain, Size: 535 bytes --]
Since fib_route_seq_show now uses hlist_for_each_entry(), the
leaf info can not be NULL.
Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>
--- a/net/ipv4/fib_trie.c 2008-01-22 17:50:44.000000000 -0800
+++ b/net/ipv4/fib_trie.c 2008-01-22 17:50:58.000000000 -0800
@@ -2474,9 +2474,6 @@ static int fib_route_seq_show(struct seq
struct fib_alias *fa;
__be32 mask, prefix;
- if (!li)
- continue;
-
mask = inet_make_mask(li->plen);
prefix = htonl(l->key);
--
Stephen Hemminger <stephen.hemminger@vyatta.com>
^ permalink raw reply [flat|nested] 20+ messages in thread
* [IPV4 3/5] fib_trie: dump doesnt use RCU
[not found] <20080123224844.610730277@linux-foundation.org>
2008-01-23 22:48 ` [IPV4 1/5] fib_trie: more whitespace cleanup Stephen Hemminger
2008-01-23 22:48 ` [IPV4 2/5] fib_trie: remove unneeded NULL check Stephen Hemminger
@ 2008-01-23 22:48 ` Stephen Hemminger
2008-01-24 4:50 ` David Miller
2008-01-23 22:48 ` [IPV4 4/5] fib_trie: version 0.410 Stephen Hemminger
2008-01-23 22:48 ` [IPV4 5/5] fib_semantics: sparse warnings Stephen Hemminger
4 siblings, 1 reply; 20+ messages in thread
From: Stephen Hemminger @ 2008-01-23 22:48 UTC (permalink / raw)
To: David Miller; +Cc: netdev
[-- Attachment #1: fib-dump-rtnl.patch --]
[-- Type: text/plain, Size: 3261 bytes --]
Since fib dump (via netlink) holds the RTNL mutex, it is unnecessary
to use RCU, and it is impossible to get truncated (-EBUSY) result.
Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>
--- a/net/ipv4/fib_trie.c 2008-01-23 13:55:12.000000000 -0800
+++ b/net/ipv4/fib_trie.c 2008-01-23 14:00:35.000000000 -0800
@@ -1684,7 +1684,7 @@ static int trie_flush_leaf(struct trie *
* Scan for the next right leaf starting at node p->child[idx]
* Since we have back pointer, no recursion necessary.
*/
-static struct leaf *leaf_walk_rcu(struct tnode *p, struct node *c)
+static struct leaf *leaf_walk(struct tnode *p, struct node *c)
{
do {
t_key idx;
@@ -1695,7 +1695,7 @@ static struct leaf *leaf_walk_rcu(struct
idx = 0;
while (idx < 1u << p->bits) {
- c = tnode_get_child_rcu(p, idx++);
+ c = tnode_get_child(p, idx++);
if (!c)
continue;
@@ -1711,14 +1711,14 @@ static struct leaf *leaf_walk_rcu(struct
/* Node empty, walk back up to parent */
c = (struct node *) p;
- } while ( (p = node_parent_rcu(c)) != NULL);
+ } while ( (p = node_parent(c)) != NULL);
return NULL; /* Root of trie */
}
static struct leaf *trie_firstleaf(struct trie *t)
{
- struct tnode *n = (struct tnode *) rcu_dereference(t->trie);
+ struct tnode *n = (struct tnode *) t->trie;
if (!n)
return NULL;
@@ -1726,7 +1726,7 @@ static struct leaf *trie_firstleaf(struc
if (IS_LEAF(n)) /* trie is just a leaf */
return (struct leaf *) n;
- return leaf_walk_rcu(n, NULL);
+ return leaf_walk(n, NULL);
}
static struct leaf *trie_nextleaf(struct leaf *l)
@@ -1737,7 +1737,7 @@ static struct leaf *trie_nextleaf(struct
if (!p)
return NULL; /* trie with just one leaf */
- return leaf_walk_rcu(p, c);
+ return leaf_walk(p, c);
}
/*
@@ -1848,9 +1848,7 @@ static int fn_trie_dump_fa(t_key key, in
s_i = cb->args[4];
i = 0;
- /* rcu_read_lock is hold by caller */
-
- list_for_each_entry_rcu(fa, fah, fa_list) {
+ list_for_each_entry(fa, fah, fa_list) {
if (i < s_i) {
i++;
continue;
@@ -1885,8 +1883,7 @@ static int fn_trie_dump_leaf(struct leaf
s_i = cb->args[3];
i = 0;
- /* rcu_read_lock is hold by caller */
- hlist_for_each_entry_rcu(li, node, &l->list, hlist) {
+ hlist_for_each_entry(li, node, &l->list, hlist) {
if (i < s_i) {
i++;
continue;
@@ -1916,35 +1913,25 @@ static int fn_trie_dump(struct fib_table
struct trie *t = (struct trie *) tb->tb_data;
t_key key = cb->args[2];
- rcu_read_lock();
+ ASSERT_RTNL();
+
/* Dump starting at last key.
* Note: 0.0.0.0/0 (ie default) is first key.
*/
if (!key)
l = trie_firstleaf(t);
- else {
+ else
l = fib_find_node(t, key);
- if (!l) {
- /* The table changed during the dump, rather than
- * giving partial data, just make application retry.
- */
- rcu_read_unlock();
- return -EBUSY;
- }
- }
while (l) {
cb->args[2] = l->key;
- if (fn_trie_dump_leaf(l, tb, skb, cb) < 0) {
- rcu_read_unlock();
+ if (fn_trie_dump_leaf(l, tb, skb, cb) < 0)
return -1;
- }
l = trie_nextleaf(l);
memset(&cb->args[3], 0,
sizeof(cb->args) - 3*sizeof(cb->args[0]));
}
- rcu_read_unlock();
return skb->len;
}
--
Stephen Hemminger <stephen.hemminger@vyatta.com>
^ permalink raw reply [flat|nested] 20+ messages in thread
* [IPV4 4/5] fib_trie: version 0.410
[not found] <20080123224844.610730277@linux-foundation.org>
` (2 preceding siblings ...)
2008-01-23 22:48 ` [IPV4 3/5] fib_trie: dump doesnt use RCU Stephen Hemminger
@ 2008-01-23 22:48 ` Stephen Hemminger
2008-01-24 4:50 ` David Miller
2008-01-23 22:48 ` [IPV4 5/5] fib_semantics: sparse warnings Stephen Hemminger
4 siblings, 1 reply; 20+ messages in thread
From: Stephen Hemminger @ 2008-01-23 22:48 UTC (permalink / raw)
To: David Miller; +Cc: netdev
[-- Attachment #1: fib-trie-minor.patch --]
[-- Type: text/plain, Size: 450 bytes --]
Increase version to reflect recent changes.
Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>
--- a/net/ipv4/fib_trie.c 2008-01-22 17:50:58.000000000 -0800
+++ b/net/ipv4/fib_trie.c 2008-01-22 17:51:02.000000000 -0800
@@ -50,7 +50,7 @@
* Patrick McHardy <kaber@trash.net>
*/
-#define VERSION "0.408"
+#define VERSION "0.410"
#include <asm/uaccess.h>
#include <asm/system.h>
--
Stephen Hemminger <stephen.hemminger@vyatta.com>
^ permalink raw reply [flat|nested] 20+ messages in thread
* [IPV4 5/5] fib_semantics: sparse warnings
[not found] <20080123224844.610730277@linux-foundation.org>
` (3 preceding siblings ...)
2008-01-23 22:48 ` [IPV4 4/5] fib_trie: version 0.410 Stephen Hemminger
@ 2008-01-23 22:48 ` Stephen Hemminger
4 siblings, 0 replies; 20+ messages in thread
From: Stephen Hemminger @ 2008-01-23 22:48 UTC (permalink / raw)
To: David Miller; +Cc: netdev
[-- Attachment #1: fib-semantic-sparse.patch --]
[-- Type: text/plain, Size: 1824 bytes --]
The magic macro change_nexthops introduces a variable nh which overlaps
previous declaration of nh.
Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>
--- a/net/ipv4/fib_semantics.c 2008-01-23 11:03:55.000000000 -0800
+++ b/net/ipv4/fib_semantics.c 2008-01-23 11:05:12.000000000 -0800
@@ -1059,14 +1059,14 @@ int fib_sync_down(__be32 local, struct n
unsigned int hash = fib_devindex_hashfn(dev->ifindex);
struct hlist_head *head = &fib_info_devhash[hash];
struct hlist_node *node;
- struct fib_nh *nh;
+ struct fib_nh *nh1;
- hlist_for_each_entry(nh, node, head, nh_hash) {
- struct fib_info *fi = nh->nh_parent;
+ hlist_for_each_entry(nh1, node, head, nh_hash) {
+ struct fib_info *fi = nh1->nh_parent;
int dead;
BUG_ON(!fi->fib_nhs);
- if (nh->nh_dev != dev || fi == prev_fi)
+ if (nh1->nh_dev != dev || fi == prev_fi)
continue;
prev_fi = fi;
dead = 0;
@@ -1091,6 +1091,7 @@ int fib_sync_down(__be32 local, struct n
}
#endif
} endfor_nexthops(fi)
+
if (dead == fi->fib_nhs) {
fi->fib_flags |= RTNH_F_DEAD;
ret++;
@@ -1114,7 +1115,7 @@ int fib_sync_up(struct net_device *dev)
unsigned int hash;
struct hlist_head *head;
struct hlist_node *node;
- struct fib_nh *nh;
+ struct fib_nh *nh1;
int ret;
if (!(dev->flags&IFF_UP))
@@ -1125,12 +1126,12 @@ int fib_sync_up(struct net_device *dev)
head = &fib_info_devhash[hash];
ret = 0;
- hlist_for_each_entry(nh, node, head, nh_hash) {
- struct fib_info *fi = nh->nh_parent;
+ hlist_for_each_entry(nh1, node, head, nh_hash) {
+ struct fib_info *fi = nh1->nh_parent;
int alive;
BUG_ON(!fi->fib_nhs);
- if (nh->nh_dev != dev || fi == prev_fi)
+ if (nh1->nh_dev != dev || fi == prev_fi)
continue;
prev_fi = fi;
--
Stephen Hemminger <stephen.hemminger@vyatta.com>
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [IPV4 1/5] fib_trie: more whitespace cleanup
2008-01-23 22:48 ` [IPV4 1/5] fib_trie: more whitespace cleanup Stephen Hemminger
@ 2008-01-24 4:37 ` David Miller
0 siblings, 0 replies; 20+ messages in thread
From: David Miller @ 2008-01-24 4:37 UTC (permalink / raw)
To: shemminger; +Cc: netdev
From: Stephen Hemminger <shemminger@linux-foundation.org>
Date: Wed, 23 Jan 2008 14:48:45 -0800
> Remove extra blank lines.
>
> Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>
Applied.
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [IPV4 2/5] fib_trie: remove unneeded NULL check
2008-01-23 22:48 ` [IPV4 2/5] fib_trie: remove unneeded NULL check Stephen Hemminger
@ 2008-01-24 4:38 ` David Miller
0 siblings, 0 replies; 20+ messages in thread
From: David Miller @ 2008-01-24 4:38 UTC (permalink / raw)
To: shemminger; +Cc: netdev
From: Stephen Hemminger <shemminger@linux-foundation.org>
Date: Wed, 23 Jan 2008 14:48:46 -0800
> Since fib_route_seq_show now uses hlist_for_each_entry(), the
> leaf info can not be NULL.
>
> Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>
Applied.
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [IPV4 3/5] fib_trie: dump doesnt use RCU
2008-01-23 22:48 ` [IPV4 3/5] fib_trie: dump doesnt use RCU Stephen Hemminger
@ 2008-01-24 4:50 ` David Miller
2008-01-24 6:41 ` Patrick McHardy
0 siblings, 1 reply; 20+ messages in thread
From: David Miller @ 2008-01-24 4:50 UTC (permalink / raw)
To: shemminger; +Cc: netdev
From: Stephen Hemminger <shemminger@linux-foundation.org>
Date: Wed, 23 Jan 2008 14:48:47 -0800
> Since fib dump (via netlink) holds the RTNL mutex, it is unnecessary
> to use RCU, and it is impossible to get truncated (-EBUSY) result.
>
> Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>
You tested this patch, right? :-/
The whole reason we need the nlk->cb[] state is to hold things across
multiple recvmsg() calls that might be necessary to obtain the full
dump.
rtnetlink goes:
rtnl_lock();
netlink_rcv_skb(skb, &rtnetlink_rcv_msg);
...
static int rtnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
{
...
if (kind == 2 && nlh->nlmsg_flags&NLM_F_DUMP) {
struct sock *rtnl;
rtnl_dumpit_func dumpit;
dumpit = rtnl_get_dumpit(family, type);
if (dumpit == NULL)
return -EOPNOTSUPP;
__rtnl_unlock();
rtnl = net->rtnl;
err = netlink_dump_start(rtnl, skb, nlh, dumpit, NULL);
rtnl_lock();
return err;
(NOTE: Drops RTNL semaphore for netlink_dump_start() call)
...
int netlink_dump_start(struct sock *ssk, struct sk_buff *skb,
struct nlmsghdr *nlh,
int (*dump)(struct sk_buff *skb,
struct netlink_callback *),
int (*done)(struct netlink_callback *))
{
...
cb->dump = dump;
cb->done = done;
cb->nlh = nlh;
atomic_inc(&skb->users);
cb->skb = skb;
...
mutex_lock(nlk->cb_mutex);
...
nlk->cb = cb;
mutex_unlock(nlk->cb_mutex);
netlink_dump(sk);
...
static int netlink_dump(struct sock *sk)
{
...
mutex_lock(nlk->cb_mutex);
...
len = cb->dump(skb, cb);
if (len > 0) {
mutex_unlock(nlk->cb_mutex);
skb_queue_tail(&sk->sk_receive_queue, skb);
sk->sk_data_ready(sk, len);
return 0;
}
(NOTE: Therefore cb->dump() runs without RTNL semaphore held)
...
static int netlink_recvmsg(struct kiocb *kiocb, struct socket *sock,
struct msghdr *msg, size_t len,
int flags)
{
...
if (nlk->cb && atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf / 2)
netlink_dump(sk);
...
Therefore, that RTNL assertion you added should have triggered on any
dump you may have tried since ->dump() is always invoked without the
RTNL semaphore since rtnetlink drops it around the ->dump() call and
the call chain for this fib_trie cause would be:
inet_dump_fib()
fn_trie_dump()
and nothing in that code path retakes the RTNL semaphore.
What test did you run to validate this patch for correctness?
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [IPV4 4/5] fib_trie: version 0.410
2008-01-23 22:48 ` [IPV4 4/5] fib_trie: version 0.410 Stephen Hemminger
@ 2008-01-24 4:50 ` David Miller
0 siblings, 0 replies; 20+ messages in thread
From: David Miller @ 2008-01-24 4:50 UTC (permalink / raw)
To: shemminger; +Cc: netdev
From: Stephen Hemminger <shemminger@linux-foundation.org>
Date: Wed, 23 Jan 2008 14:48:48 -0800
> Increase version to reflect recent changes.
>
> Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>
I'm dropping this and patch 5 for now.
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [IPV4 3/5] fib_trie: dump doesnt use RCU
2008-01-24 4:50 ` David Miller
@ 2008-01-24 6:41 ` Patrick McHardy
2008-01-24 6:43 ` David Miller
2008-01-24 6:45 ` Stephen Hemminger
0 siblings, 2 replies; 20+ messages in thread
From: Patrick McHardy @ 2008-01-24 6:41 UTC (permalink / raw)
To: David Miller; +Cc: shemminger, netdev
David Miller wrote:
> From: Stephen Hemminger <shemminger@linux-foundation.org>
> Date: Wed, 23 Jan 2008 14:48:47 -0800
>
>
>> Since fib dump (via netlink) holds the RTNL mutex, it is unnecessary
>> to use RCU, and it is impossible to get truncated (-EBUSY) result.
>>
>> Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>
>>
>
> You tested this patch, right? :-/
>
> The whole reason we need the nlk->cb[] state is to hold things across
> multiple recvmsg() calls that might be necessary to obtain the full
> dump.
>
> rtnetlink goes:
>
> rtnl_lock();
> netlink_rcv_skb(skb, &rtnetlink_rcv_msg);
> ...
> static int rtnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
> {
> ...
> if (kind == 2 && nlh->nlmsg_flags&NLM_F_DUMP) {
> struct sock *rtnl;
> rtnl_dumpit_func dumpit;
>
> dumpit = rtnl_get_dumpit(family, type);
> if (dumpit == NULL)
> return -EOPNOTSUPP;
>
> __rtnl_unlock();
> rtnl = net->rtnl;
> err = netlink_dump_start(rtnl, skb, nlh, dumpit, NULL);
> rtnl_lock();
> return err;
>
> (NOTE: Drops RTNL semaphore for netlink_dump_start() call)
>
> ...
> int netlink_dump_start(struct sock *ssk, struct sk_buff *skb,
> struct nlmsghdr *nlh,
> int (*dump)(struct sk_buff *skb,
> struct netlink_callback *),
> int (*done)(struct netlink_callback *))
> {
> ...
> cb->dump = dump;
> cb->done = done;
> cb->nlh = nlh;
> atomic_inc(&skb->users);
> cb->skb = skb;
> ...
> mutex_lock(nlk->cb_mutex);
> ...
> nlk->cb = cb;
> mutex_unlock(nlk->cb_mutex);
>
> netlink_dump(sk);
> ...
>
> static int netlink_dump(struct sock *sk)
> {
> ...
> mutex_lock(nlk->cb_mutex);
> ...
> len = cb->dump(skb, cb);
>
> if (len > 0) {
> mutex_unlock(nlk->cb_mutex);
> skb_queue_tail(&sk->sk_receive_queue, skb);
> sk->sk_data_ready(sk, len);
> return 0;
> }
>
> (NOTE: Therefore cb->dump() runs without RTNL semaphore held)
>
> ...
> static int netlink_recvmsg(struct kiocb *kiocb, struct socket *sock,
> struct msghdr *msg, size_t len,
> int flags)
> {
> ...
> if (nlk->cb && atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf / 2)
> netlink_dump(sk);
> ...
>
> Therefore, that RTNL assertion you added should have triggered on any
> dump you may have tried since ->dump() is always invoked without the
> RTNL semaphore since rtnetlink drops it around the ->dump() call and
> the call chain for this fib_trie cause would be:
>
> inet_dump_fib()
> fn_trie_dump()
>
> and nothing in that code path retakes the RTNL semaphore.
Actually we're always holding the rtnl during dumps, nlk->cb_mutex points
to rtnl_mutex in case of rtnetlink. It used to be held only during the first
->dump invocation and not on continuations, but I changed this a few
versions
ago.
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [IPV4 3/5] fib_trie: dump doesnt use RCU
2008-01-24 6:41 ` Patrick McHardy
@ 2008-01-24 6:43 ` David Miller
2008-01-24 6:47 ` Patrick McHardy
2008-01-24 6:45 ` Stephen Hemminger
1 sibling, 1 reply; 20+ messages in thread
From: David Miller @ 2008-01-24 6:43 UTC (permalink / raw)
To: kaber; +Cc: shemminger, netdev
From: Patrick McHardy <kaber@trash.net>
Date: Thu, 24 Jan 2008 07:41:08 +0100
> David Miller wrote:
> > and nothing in that code path retakes the RTNL semaphore.
>
> Actually we're always holding the rtnl during dumps, nlk->cb_mutex points
> to rtnl_mutex in case of rtnetlink. It used to be held only during the first
> ->dump invocation and not on continuations, but I changed this a few
> versions ago.
My bad. Thanks for the correction Patrick.
But continuations can occur on subsequent recvmsg() calls,
does it return to userspace with the mutex held? If so
I'm pretty sure that's not allowed.
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [IPV4 3/5] fib_trie: dump doesnt use RCU
2008-01-24 6:41 ` Patrick McHardy
2008-01-24 6:43 ` David Miller
@ 2008-01-24 6:45 ` Stephen Hemminger
2008-01-24 7:27 ` David Miller
1 sibling, 1 reply; 20+ messages in thread
From: Stephen Hemminger @ 2008-01-24 6:45 UTC (permalink / raw)
To: Patrick McHardy; +Cc: David Miller, netdev
Patrick McHardy wrote:
> David Miller wrote:
>> From: Stephen Hemminger <shemminger@linux-foundation.org>
>> Date: Wed, 23 Jan 2008 14:48:47 -0800
>>
>>
>>> Since fib dump (via netlink) holds the RTNL mutex, it is unnecessary
>>> to use RCU, and it is impossible to get truncated (-EBUSY) result.
>>>
>>> Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>
>>>
>>
>> You tested this patch, right? :-/
>>
>> The whole reason we need the nlk->cb[] state is to hold things across
>> multiple recvmsg() calls that might be necessary to obtain the full
>> dump.
>>
>> rtnetlink goes:
>>
>> rtnl_lock();
>> netlink_rcv_skb(skb, &rtnetlink_rcv_msg);
>> ...
>> static int rtnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
>> {
>> ...
>> if (kind == 2 && nlh->nlmsg_flags&NLM_F_DUMP) {
>> struct sock *rtnl;
>> rtnl_dumpit_func dumpit;
>>
>> dumpit = rtnl_get_dumpit(family, type);
>> if (dumpit == NULL)
>> return -EOPNOTSUPP;
>>
>> __rtnl_unlock();
>> rtnl = net->rtnl;
>> err = netlink_dump_start(rtnl, skb, nlh, dumpit, NULL);
>> rtnl_lock();
>> return err;
>>
>> (NOTE: Drops RTNL semaphore for netlink_dump_start() call)
>>
>> ...
>> int netlink_dump_start(struct sock *ssk, struct sk_buff *skb,
>> struct nlmsghdr *nlh,
>> int (*dump)(struct sk_buff *skb,
>> struct netlink_callback *),
>> int (*done)(struct netlink_callback *))
>> {
>> ...
>> cb->dump = dump;
>> cb->done = done;
>> cb->nlh = nlh;
>> atomic_inc(&skb->users);
>> cb->skb = skb;
>> ...
>> mutex_lock(nlk->cb_mutex);
>> ...
>> nlk->cb = cb;
>> mutex_unlock(nlk->cb_mutex);
>>
>> netlink_dump(sk);
>> ...
>>
>> static int netlink_dump(struct sock *sk)
>> {
>> ...
>> mutex_lock(nlk->cb_mutex);
>> ...
>> len = cb->dump(skb, cb);
>>
>> if (len > 0) {
>> mutex_unlock(nlk->cb_mutex);
>> skb_queue_tail(&sk->sk_receive_queue, skb);
>> sk->sk_data_ready(sk, len);
>> return 0;
>> }
>>
>> (NOTE: Therefore cb->dump() runs without RTNL semaphore held)
>>
>> ...
>> static int netlink_recvmsg(struct kiocb *kiocb, struct socket *sock,
>> struct msghdr *msg, size_t len,
>> int flags)
>> {
>> ...
>> if (nlk->cb && atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf / 2)
>> netlink_dump(sk);
>> ...
>>
>> Therefore, that RTNL assertion you added should have triggered on any
>> dump you may have tried since ->dump() is always invoked without the
>> RTNL semaphore since rtnetlink drops it around the ->dump() call and
>> the call chain for this fib_trie cause would be:
>>
>> inet_dump_fib()
>> fn_trie_dump()
>>
>> and nothing in that code path retakes the RTNL semaphore.
>
> Actually we're always holding the rtnl during dumps, nlk->cb_mutex points
> to rtnl_mutex in case of rtnetlink. It used to be held only during the
> first
> ->dump invocation and not on continuations, but I changed this a few
> versions
> ago.
>
>
Yes, I tested, no the assert didn't hit... Actually, the reason I went
down this path was
because I couldn't trigger -EBUSY with concurrent updates.
P.s: I checked and Quagga handles -EAGAIN, so if you need an error code that
would be the one to use.
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [IPV4 3/5] fib_trie: dump doesnt use RCU
2008-01-24 6:43 ` David Miller
@ 2008-01-24 6:47 ` Patrick McHardy
2008-01-24 7:26 ` David Miller
0 siblings, 1 reply; 20+ messages in thread
From: Patrick McHardy @ 2008-01-24 6:47 UTC (permalink / raw)
To: David Miller; +Cc: shemminger, netdev
David Miller wrote:
> From: Patrick McHardy <kaber@trash.net>
> Date: Thu, 24 Jan 2008 07:41:08 +0100
>
>
>> David Miller wrote:
>>
>>> and nothing in that code path retakes the RTNL semaphore.
>>>
>> Actually we're always holding the rtnl during dumps, nlk->cb_mutex points
>> to rtnl_mutex in case of rtnetlink. It used to be held only during the first
>> ->dump invocation and not on continuations, but I changed this a few
>> versions ago.
>>
>
> My bad. Thanks for the correction Patrick.
>
> But continuations can occur on subsequent recvmsg() calls,
> does it return to userspace with the mutex held? If so
> I'm pretty sure that's not allowed.
>
No, the mutex is dropped between different ->dump invocations.
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [IPV4 3/5] fib_trie: dump doesnt use RCU
2008-01-24 6:47 ` Patrick McHardy
@ 2008-01-24 7:26 ` David Miller
0 siblings, 0 replies; 20+ messages in thread
From: David Miller @ 2008-01-24 7:26 UTC (permalink / raw)
To: kaber; +Cc: shemminger, netdev
From: Patrick McHardy <kaber@trash.net>
Date: Thu, 24 Jan 2008 07:47:39 +0100
> David Miller wrote:
> > From: Patrick McHardy <kaber@trash.net>
> > Date: Thu, 24 Jan 2008 07:41:08 +0100
> >
> >> David Miller wrote:
> >>
> >>> and nothing in that code path retakes the RTNL semaphore.
> >>>
> >> Actually we're always holding the rtnl during dumps, nlk->cb_mutex points
> >> to rtnl_mutex in case of rtnetlink. It used to be held only during the first
> >> ->dump invocation and not on continuations, but I changed this a few
> >> versions ago.
> >>
> >
> > My bad. Thanks for the correction Patrick.
> >
> > But continuations can occur on subsequent recvmsg() calls,
> > does it return to userspace with the mutex held? If so
> > I'm pretty sure that's not allowed.
>
> No, the mutex is dropped between different ->dump invocations.
Ok, great.
This does mean, however, that the RTNL semaphore is dropped between
->dump() invocations on the same nlk->cb[] instance.
And that has implications for the shortcut Stephen is taking.
Stephen's patch assumes that during a (top-level) dump the table
cannot change. And by the above, we can only conclude that it can in
fact change between ->dump() calls for the same top-level dump run.
So the removal of the -EBUSY code block in his patch isn't valid.
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [IPV4 3/5] fib_trie: dump doesnt use RCU
2008-01-24 6:45 ` Stephen Hemminger
@ 2008-01-24 7:27 ` David Miller
2008-01-24 21:51 ` [PATCH] fib_trie: rescan if key is lost during dump Stephen Hemminger
0 siblings, 1 reply; 20+ messages in thread
From: David Miller @ 2008-01-24 7:27 UTC (permalink / raw)
To: shemminger; +Cc: kaber, netdev
From: Stephen Hemminger <shemminger@linux-foundation.org>
Date: Wed, 23 Jan 2008 22:45:02 -0800
> Yes, I tested, no the assert didn't hit... Actually, the reason I went
> down this path was
> because I couldn't trigger -EBUSY with concurrent updates.
>
> P.s: I checked and Quagga handles -EAGAIN, so if you need an error code that
> would be the one to use.
Ok, but see my other email, the -EBUSY block can't be removed.
Even though for any given ->dump() call RTNL is held, it
is dropped before the next ->dump() call on the same
piece of nlk->cb[] state.
So the node for the nlk->cb->args[2] key can go away due
to an intervening table change.
^ permalink raw reply [flat|nested] 20+ messages in thread
* [PATCH] fib_trie: rescan if key is lost during dump
2008-01-24 7:27 ` David Miller
@ 2008-01-24 21:51 ` Stephen Hemminger
2008-01-25 8:23 ` Jarek Poplawski
2008-02-01 0:45 ` David Miller
0 siblings, 2 replies; 20+ messages in thread
From: Stephen Hemminger @ 2008-01-24 21:51 UTC (permalink / raw)
To: David Miller; +Cc: kaber, netdev
Normally during a dump the key of the last dumped entry is used for
continuation, but since lock is dropped it might be lost. In that case
fallback to the old counter based N^2 behaviour. This means the dump will end up
skipping some routes which matches what FIB_HASH does.
Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>
--- a/include/linux/netlink.h 2008-01-24 13:39:40.000000000 -0800
+++ b/include/linux/netlink.h 2008-01-24 13:42:05.000000000 -0800
@@ -219,7 +219,7 @@ struct netlink_callback
int (*dump)(struct sk_buff * skb, struct netlink_callback *cb);
int (*done)(struct netlink_callback *cb);
int family;
- long args[5];
+ long args[6];
};
struct netlink_notify
--- a/net/ipv4/fib_trie.c 2008-01-24 13:39:40.000000000 -0800
+++ b/net/ipv4/fib_trie.c 2008-01-24 13:44:26.000000000 -0800
@@ -1743,6 +1743,19 @@ static struct leaf *trie_nextleaf(struct
return leaf_walk_rcu(p, c);
}
+static struct leaf *trie_leafindex(struct trie *t, int index)
+{
+ struct leaf *l = trie_firstleaf(t);
+
+ while (index-- > 0) {
+ l = trie_nextleaf(l);
+ if (!l)
+ break;
+ }
+ return l;
+}
+
+
/*
* Caller must hold RTNL.
*/
@@ -1848,7 +1861,7 @@ static int fn_trie_dump_fa(t_key key, in
struct fib_alias *fa;
__be32 xkey = htonl(key);
- s_i = cb->args[4];
+ s_i = cb->args[5];
i = 0;
/* rcu_read_lock is hold by caller */
@@ -1869,12 +1882,12 @@ static int fn_trie_dump_fa(t_key key, in
plen,
fa->fa_tos,
fa->fa_info, NLM_F_MULTI) < 0) {
- cb->args[4] = i;
+ cb->args[5] = i;
return -1;
}
i++;
}
- cb->args[4] = i;
+ cb->args[5] = i;
return skb->len;
}
@@ -1885,7 +1898,7 @@ static int fn_trie_dump_leaf(struct leaf
struct hlist_node *node;
int i, s_i;
- s_i = cb->args[3];
+ s_i = cb->args[4];
i = 0;
/* rcu_read_lock is hold by caller */
@@ -1896,19 +1909,19 @@ static int fn_trie_dump_leaf(struct leaf
}
if (i > s_i)
- cb->args[4] = 0;
+ cb->args[5] = 0;
if (list_empty(&li->falh))
continue;
if (fn_trie_dump_fa(l->key, li->plen, &li->falh, tb, skb, cb) < 0) {
- cb->args[3] = i;
+ cb->args[4] = i;
return -1;
}
i++;
}
- cb->args[3] = i;
+ cb->args[4] = i;
return skb->len;
}
@@ -1918,35 +1931,37 @@ static int fn_trie_dump(struct fib_table
struct leaf *l;
struct trie *t = (struct trie *) tb->tb_data;
t_key key = cb->args[2];
+ int count = cb->args[3];
rcu_read_lock();
/* Dump starting at last key.
* Note: 0.0.0.0/0 (ie default) is first key.
*/
- if (!key)
+ if (count == 0)
l = trie_firstleaf(t);
else {
+ /* Normally, continue from last key, but if that is missing
+ * fallback to using slow rescan
+ */
l = fib_find_node(t, key);
- if (!l) {
- /* The table changed during the dump, rather than
- * giving partial data, just make application retry.
- */
- rcu_read_unlock();
- return -EBUSY;
- }
+ if (!l)
+ l = trie_leafindex(t, count);
}
while (l) {
cb->args[2] = l->key;
if (fn_trie_dump_leaf(l, tb, skb, cb) < 0) {
+ cb->args[3] = count;
rcu_read_unlock();
return -1;
}
+ ++count;
l = trie_nextleaf(l);
- memset(&cb->args[3], 0,
- sizeof(cb->args) - 3*sizeof(cb->args[0]));
+ memset(&cb->args[4], 0,
+ sizeof(cb->args) - 4*sizeof(cb->args[0]));
}
+ cb->args[3] = count;
rcu_read_unlock();
return skb->len;
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH] fib_trie: rescan if key is lost during dump
2008-01-24 21:51 ` [PATCH] fib_trie: rescan if key is lost during dump Stephen Hemminger
@ 2008-01-25 8:23 ` Jarek Poplawski
2008-01-25 16:13 ` Stephen Hemminger
2008-02-01 0:45 ` David Miller
1 sibling, 1 reply; 20+ messages in thread
From: Jarek Poplawski @ 2008-01-25 8:23 UTC (permalink / raw)
To: Stephen Hemminger; +Cc: David Miller, kaber, netdev
On 24-01-2008 22:51, Stephen Hemminger wrote:
> Normally during a dump the key of the last dumped entry is used for
> continuation, but since lock is dropped it might be lost. In that case
> fallback to the old counter based N^2 behaviour. This means the dump will end up
> skipping some routes which matches what FIB_HASH does.
>
> Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>
...
> @@ -1918,35 +1931,37 @@ static int fn_trie_dump(struct fib_table
> struct leaf *l;
> struct trie *t = (struct trie *) tb->tb_data;
> t_key key = cb->args[2];
> + int count = cb->args[3];
>
> rcu_read_lock();
Sorry, but I lost the point: is rtnl held or not held here at the moment?
If held, how this rcu_read_lock can help? Maybe some additional comment
in the code?
Thanks,
Jarek P.
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH] fib_trie: rescan if key is lost during dump
2008-01-25 8:23 ` Jarek Poplawski
@ 2008-01-25 16:13 ` Stephen Hemminger
2008-01-25 19:01 ` Jarek Poplawski
0 siblings, 1 reply; 20+ messages in thread
From: Stephen Hemminger @ 2008-01-25 16:13 UTC (permalink / raw)
To: Jarek Poplawski; +Cc: David Miller, kaber, netdev
On Fri, 25 Jan 2008 09:23:00 +0100
Jarek Poplawski <jarkao2@gmail.com> wrote:
> On 24-01-2008 22:51, Stephen Hemminger wrote:
> > Normally during a dump the key of the last dumped entry is used for
> > continuation, but since lock is dropped it might be lost. In that case
> > fallback to the old counter based N^2 behaviour. This means the dump will end up
> > skipping some routes which matches what FIB_HASH does.
> >
> > Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>
> ...
> > @@ -1918,35 +1931,37 @@ static int fn_trie_dump(struct fib_table
> > struct leaf *l;
> > struct trie *t = (struct trie *) tb->tb_data;
> > t_key key = cb->args[2];
> > + int count = cb->args[3];
> >
> > rcu_read_lock();
>
> Sorry, but I lost the point: is rtnl held or not held here at the moment?
> If held, how this rcu_read_lock can help? Maybe some additional comment
> in the code?
>
> Thanks,
> Jarek P.
There are two different issues, (therefore two different patches).
1. How to handle multipart resume when the key was deleted during the
period the lock was dropped.
That is what this patch addresses.
2. RCU is unnecessary here because of use of RTNL. I am going to defer
on this till after #1. That patch is much less important.
--
Stephen Hemminger <stephen.hemminger@vyatta.com>
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH] fib_trie: rescan if key is lost during dump
2008-01-25 16:13 ` Stephen Hemminger
@ 2008-01-25 19:01 ` Jarek Poplawski
0 siblings, 0 replies; 20+ messages in thread
From: Jarek Poplawski @ 2008-01-25 19:01 UTC (permalink / raw)
To: Stephen Hemminger; +Cc: David Miller, kaber, netdev
On Fri, Jan 25, 2008 at 08:13:15AM -0800, Stephen Hemminger wrote:
...
> 2. RCU is unnecessary here because of use of RTNL. I am going to defer
> on this till after #1. That patch is much less important.
Thanks! (I've started to suspect another advanced RCU trick already...)
Jarek P.
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH] fib_trie: rescan if key is lost during dump
2008-01-24 21:51 ` [PATCH] fib_trie: rescan if key is lost during dump Stephen Hemminger
2008-01-25 8:23 ` Jarek Poplawski
@ 2008-02-01 0:45 ` David Miller
1 sibling, 0 replies; 20+ messages in thread
From: David Miller @ 2008-02-01 0:45 UTC (permalink / raw)
To: shemminger; +Cc: kaber, netdev
From: Stephen Hemminger <shemminger@linux-foundation.org>
Date: Thu, 24 Jan 2008 13:51:12 -0800
> Normally during a dump the key of the last dumped entry is used for
> continuation, but since lock is dropped it might be lost. In that case
> fallback to the old counter based N^2 behaviour. This means the dump will end up
> skipping some routes which matches what FIB_HASH does.
>
> Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>
Applied, thanks.
^ permalink raw reply [flat|nested] 20+ messages in thread
end of thread, other threads:[~2008-02-01 0:45 UTC | newest]
Thread overview: 20+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
[not found] <20080123224844.610730277@linux-foundation.org>
2008-01-23 22:48 ` [IPV4 1/5] fib_trie: more whitespace cleanup Stephen Hemminger
2008-01-24 4:37 ` David Miller
2008-01-23 22:48 ` [IPV4 2/5] fib_trie: remove unneeded NULL check Stephen Hemminger
2008-01-24 4:38 ` David Miller
2008-01-23 22:48 ` [IPV4 3/5] fib_trie: dump doesnt use RCU Stephen Hemminger
2008-01-24 4:50 ` David Miller
2008-01-24 6:41 ` Patrick McHardy
2008-01-24 6:43 ` David Miller
2008-01-24 6:47 ` Patrick McHardy
2008-01-24 7:26 ` David Miller
2008-01-24 6:45 ` Stephen Hemminger
2008-01-24 7:27 ` David Miller
2008-01-24 21:51 ` [PATCH] fib_trie: rescan if key is lost during dump Stephen Hemminger
2008-01-25 8:23 ` Jarek Poplawski
2008-01-25 16:13 ` Stephen Hemminger
2008-01-25 19:01 ` Jarek Poplawski
2008-02-01 0:45 ` David Miller
2008-01-23 22:48 ` [IPV4 4/5] fib_trie: version 0.410 Stephen Hemminger
2008-01-24 4:50 ` David Miller
2008-01-23 22:48 ` [IPV4 5/5] fib_semantics: sparse warnings Stephen Hemminger
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).