* [PATCH v6 BONUS 4/3] ipv4: Store rtable entries directly in FIB
@ 2011-05-05 23:36 David Miller
2011-05-06 9:12 ` Julian Anastasov
0 siblings, 1 reply; 3+ messages in thread
From: David Miller @ 2011-05-05 23:36 UTC (permalink / raw)
To: netdev; +Cc: tgraf, jpirko, herbert, eric.dumazet
Ok, here is the fun patch showing the scheme I'm working on. Two
things going on here.
First, we store pre-constructed rtable entries, on demand, inside of
the routing table objects themselves.
Second, we get rid of RT_TABLE_LOCAL and load all routes equally
into RT_TABLE_MAIN.
Signed-off-by: David S. Miller <davem@davemloft.net>
diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h
index 10422ef..f3c9598 100644
--- a/include/net/ip_fib.h
+++ b/include/net/ip_fib.h
@@ -44,6 +44,7 @@ struct fib_config {
};
struct fib_info;
+struct rtable;
struct fib_nh {
struct net_device *nh_dev;
@@ -62,6 +63,7 @@ struct fib_nh {
__be32 nh_gw;
__be32 nh_saddr;
int nh_saddr_genid;
+ struct rtable *nh_rtable;
};
/*
@@ -200,10 +202,6 @@ static inline int fib_lookup(struct net *net, const struct flowi4 *flp,
{
struct fib_table *table;
- table = fib_get_table(net, RT_TABLE_LOCAL);
- if (!fib_table_lookup(table, flp, res, FIB_LOOKUP_NOREF))
- return 0;
-
table = fib_get_table(net, RT_TABLE_MAIN);
if (!fib_table_lookup(table, flp, res, FIB_LOOKUP_NOREF))
return 0;
diff --git a/include/net/route.h b/include/net/route.h
index 70155fb..04e7197 100644
--- a/include/net/route.h
+++ b/include/net/route.h
@@ -109,6 +109,7 @@ extern int ip_rt_init(void);
extern void ip_rt_redirect(__be32 old_gw, __be32 dst, __be32 new_gw,
__be32 src, struct net_device *dev);
extern void rt_cache_flush(struct net *net, int how);
+extern struct rtable *ip_route_output_new(struct net *, struct flowi4 *flp);
extern struct rtable *__ip_route_output_key(struct net *, struct flowi4 *flp);
extern struct rtable *ip_route_output_flow(struct net *, struct flowi4 *flp,
struct sock *sk);
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 33bbbda..24e67d8 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -155,7 +155,7 @@ static inline unsigned __inet_dev_addr_type(struct net *net,
res.r = NULL;
#endif
- local_table = fib_get_table(net, RT_TABLE_LOCAL);
+ local_table = fib_get_table(net, RT_TABLE_MAIN);
if (local_table) {
ret = RTN_UNICAST;
rcu_read_lock();
@@ -662,11 +662,7 @@ static void fib_magic(int cmd, int type, __be32 dst, int dst_len, struct in_ifad
},
};
- if (type == RTN_UNICAST)
- tb = fib_new_table(net, RT_TABLE_MAIN);
- else
- tb = fib_new_table(net, RT_TABLE_LOCAL);
-
+ tb = fib_new_table(net, RT_TABLE_MAIN);
if (tb == NULL)
return;
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index 641a5a2..c37ebd3 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -148,6 +148,10 @@ static void free_fib_info_rcu(struct rcu_head *head)
{
struct fib_info *fi = container_of(head, struct fib_info, rcu);
+ change_nexthops(fi) {
+ ip_rt_put(nexthop_nh->nh_rtable);
+ nexthop_nh->nh_rtable = NULL;
+ } endfor_nexthops(fi);
if (fi->fib_metrics != (u32 *) dst_default_metrics)
kfree(fi->fib_metrics);
kfree(fi);
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 1e67624..2f77d28 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1861,6 +1861,68 @@ out:
}
EXPORT_SYMBOL_GPL(__ip_route_output_key);
+struct rtable *ip_route_output_new(struct net *net, struct flowi4 *fl4)
+{
+ struct net_device *dev_out = NULL;
+ u32 tos = RT_FL_TOS(fl4);
+ unsigned int flags = 0;
+ struct fib_result res;
+ struct rtable *rth;
+ int orig_oif;
+
+ res.fi = NULL;
+#ifdef CONFIG_IP_MULTIPLE_TABLES
+ res.r = NULL;
+#endif
+
+ orig_oif = fl4->flowi4_oif;
+
+ fl4->flowi4_iif = net->loopback_dev->ifindex;
+ fl4->flowi4_tos = tos & IPTOS_RT_MASK;
+ fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
+ RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
+
+ rcu_read_lock();
+ if (fib_lookup(net, fl4, &res)) {
+ rth = ERR_PTR(-ENETUNREACH);
+ goto out;
+ }
+
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+ if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0)
+ fib_select_multipath(&res);
+ else
+#endif
+ if (!res.prefixlen && res.table->tb_num_default > 1 &&
+ res.type == RTN_UNICAST && !fl4->flowi4_oif)
+ fib_select_default(&res);
+
+ if (!fl4->saddr)
+ fl4->saddr = FIB_RES_PREFSRC(net, res);
+
+ dev_out = FIB_RES_DEV(res);
+ fl4->flowi4_oif = dev_out->ifindex;
+
+ rth = FIB_RES_NH(res).nh_rtable;
+ if (!rth) {
+ if (res.type == RTN_LOCAL)
+ flags |= RTCF_LOCAL;
+ rth = __mkroute_output(&res, fl4, orig_oif, dev_out, flags);
+ if (!IS_ERR(rth))
+ rth = rt_finalize(rth, NULL);
+ if (!IS_ERR(rth))
+ FIB_RES_NH(res).nh_rtable = rth;
+ }
+
+ if (!IS_ERR(rth))
+ atomic_inc(&rth->dst.__refcnt);
+
+out:
+ rcu_read_unlock();
+ return rth;
+}
+EXPORT_SYMBOL_GPL(ip_route_output_new);
+
static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
{
return NULL;
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 544f435..9bb827e 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -929,7 +929,7 @@ int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
faddr, saddr, dport, inet->inet_sport);
security_sk_classify_flow(sk, flowi4_to_flowi(&fl4));
- rt = ip_route_output_flow(net, &fl4, sk);
+ rt = ip_route_output_new(net, &fl4);
if (IS_ERR(rt)) {
err = PTR_ERR(rt);
rt = NULL;
^ permalink raw reply related [flat|nested] 3+ messages in thread
* Re: [PATCH v6 BONUS 4/3] ipv4: Store rtable entries directly in FIB
2011-05-05 23:36 [PATCH v6 BONUS 4/3] ipv4: Store rtable entries directly in FIB David Miller
@ 2011-05-06 9:12 ` Julian Anastasov
2011-05-06 17:57 ` David Miller
0 siblings, 1 reply; 3+ messages in thread
From: Julian Anastasov @ 2011-05-06 9:12 UTC (permalink / raw)
To: David Miller; +Cc: netdev, tgraf, jpirko, herbert, eric.dumazet
Hello,
On Thu, 5 May 2011, David Miller wrote:
> Ok, here is the fun patch showing the scheme I'm working on. Two
> things going on here.
>
> First, we store pre-constructed rtable entries, on demand, inside of
> the routing table objects themselves.
>
> Second, we get rid of RT_TABLE_LOCAL and load all routes equally
> into RT_TABLE_MAIN.
>
> Signed-off-by: David S. Miller <davem@davemloft.net>
>
> diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h
> index 10422ef..f3c9598 100644
> --- a/include/net/ip_fib.h
> +++ b/include/net/ip_fib.h
> @@ -44,6 +44,7 @@ struct fib_config {
> };
>
> struct fib_info;
> +struct rtable;
>
> struct fib_nh {
> struct net_device *nh_dev;
> @@ -62,6 +63,7 @@ struct fib_nh {
> __be32 nh_gw;
> __be32 nh_saddr;
> int nh_saddr_genid;
> + struct rtable *nh_rtable;
Caching results of __mkroute_output in NH does
not work well for RTN_MULTICAST because ip_check_mc_rcu
wants to further restrict local delivery depending on
the source address and protocol. Even the routing cache
does not cache the protocol as key. Maybe a received IGMP report
can create an input cache entry with the RTCF_LOCAL flag and later the
UDP stack can see unwanted incoming traffic that should be
dropped by MCAST_INCLUDE/MCAST_EXCLUDE settings for the same
group. I.e. the routing code calls ip_check_mc_rcu for IGMP
but the cache prevents the next calls for UDP to drop these
sources.
Until now ip_rt_multicast_event was used to
notify about changes in subscriptions for groups so that
the routing cache could update its information (RTCF_LOCAL)
per indev+saddr+daddr (but no protocol). Without the routing cache
we cannot solve the ip_check_mc_rcu problem with
nh_mc_genid fields because NH can be used for many
different saddr addresses.
Same problem is in ip_route_input_common,
we have to call ip_check_mc_rcu for every packet and
this can be a problem with long lists. But I'm not
sure if the stack can see many filters. If yes, maybe
only using a hash table for in_dev->mc_list and its
"sources" can help here because we have to call
ip_check_mc_rcu for every input and output packet
if dev+saddr+daddr+proto results are not cached for mcast.
Regards
--
Julian Anastasov <ja@ssi.bg>
^ permalink raw reply [flat|nested] 3+ messages in thread
* Re: [PATCH v6 BONUS 4/3] ipv4: Store rtable entries directly in FIB
2011-05-06 9:12 ` Julian Anastasov
@ 2011-05-06 17:57 ` David Miller
0 siblings, 0 replies; 3+ messages in thread
From: David Miller @ 2011-05-06 17:57 UTC (permalink / raw)
To: ja; +Cc: netdev, tgraf, jpirko, herbert, eric.dumazet
From: Julian Anastasov <ja@ssi.bg>
Date: Fri, 6 May 2011 12:12:26 +0300 (EEST)
> Caching results of __mkroute_output in NH does
> not work well for RTN_MULTICAST because ip_check_mc_rcu
> wants to further restrict local delivery depending on
> the source address and protocol.
I understand that multicast needs special handling.
I'm concentrating on unicast/broadcast at the moment because
there is a predominantly clear path for making that work.
^ permalink raw reply [flat|nested] 3+ messages in thread
end of thread, other threads:[~2011-05-06 17:57 UTC | newest]
Thread overview: 3+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2011-05-05 23:36 [PATCH v6 BONUS 4/3] ipv4: Store rtable entries directly in FIB David Miller
2011-05-06 9:12 ` Julian Anastasov
2011-05-06 17:57 ` David Miller
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).