netdev.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
From: Einar Lueck <elueck@de.ibm.com>
To: "David S. Miller" <davem@davemloft.net>
Cc: hadi@cyberus.ca, linux-kernel@vger.kernel.org, netdev@oss.sgi.com
Subject: Re: [RFC][PATCH 2/2] ip multipath, bk head (EXPERIMENTAL)
Date: Tue, 14 Sep 2004 14:38:53 +0200	[thread overview]
Message-ID: <4146E65D.6070309@de.ibm.com> (raw)
In-Reply-To: <20040913224232.4b979c7d.davem@davemloft.net>

[-- Attachment #1: Type: text/plain, Size: 79 bytes --]

I attached the patch the way you requested in the other thread.

Regards
Einar

[-- Attachment #2: multipath_cached.diff --]
[-- Type: text/x-patch, Size: 21144 bytes --]

diff -ruN linux-2.6.8.1.splitold/include/net/dst.h linux-2.6.8.1.multipath_cached/include/net/dst.h
--- linux-2.6.8.1.splitold/include/net/dst.h	2004-09-10 10:24:40.000000000 +0200
+++ linux-2.6.8.1.multipath_cached/include/net/dst.h	2004-09-10 11:16:35.000000000 +0200
@@ -48,6 +48,7 @@
 #define DST_NOXFRM		2
 #define DST_NOPOLICY		4
 #define DST_NOHASH		8
+#define DST_BALANCED            0x10
 	unsigned long		lastuse;
 	unsigned long		expires;
 
diff -ruN linux-2.6.8.1.splitold/include/net/flow.h linux-2.6.8.1.multipath_cached/include/net/flow.h
--- linux-2.6.8.1.splitold/include/net/flow.h	2004-09-10 10:24:40.000000000 +0200
+++ linux-2.6.8.1.multipath_cached/include/net/flow.h	2004-09-10 11:16:35.000000000 +0200
@@ -51,6 +51,9 @@
 
 	__u8	proto;
 	__u8	flags;
+#if defined(CONFIG_IP_ROUTE_MULTIPATH_CACHED)
+#define FLOWI_FLAG_MULTIPATHOLDROUTE 0x01
+#endif
 	union {
 		struct {
 			__u16	sport;
diff -ruN linux-2.6.8.1.splitold/include/net/route.h linux-2.6.8.1.multipath_cached/include/net/route.h
--- linux-2.6.8.1.splitold/include/net/route.h	2004-09-10 10:24:40.000000000 +0200
+++ linux-2.6.8.1.multipath_cached/include/net/route.h	2004-09-10 11:16:35.000000000 +0200
@@ -179,6 +179,9 @@
 		memcpy(&fl, &(*rp)->fl, sizeof(fl));
 		fl.fl_ip_sport = sport;
 		fl.fl_ip_dport = dport;
+#if defined(CONFIG_IP_ROUTE_MULTIPATH_CACHED)
+		fl.flags |= FLOWI_FLAG_MULTIPATHOLDROUTE;
+#endif
 		ip_rt_put(*rp);
 		*rp = NULL;
 		return ip_route_output_flow(rp, &fl, sk, 0);
@@ -197,4 +200,41 @@
 	return rt->peer;
 }
 
+
+#ifdef CONFIG_IP_ROUTE_MULTIPATH_RR
+extern void __multipath_remove(struct rtable *rt);
+static inline void multipath_remove(struct rtable *rt)
+{
+	if (rt->u.dst.flags & DST_BALANCED)	/* skip non-multipath entries */
+		__multipath_remove(rt);
+}
+#else /* !CONFIG_IP_ROUTE_MULTIPATH_RR: the stub does nothing */
+static inline void multipath_remove(struct rtable *rt) {}
+#endif /* CONFIG_IP_ROUTE_MULTIPATH_RR */
+
+
+#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
+extern void __multipath_selectroute(const struct flowi *flp,
+				    struct rtable *rth,
+				    struct rtable **rp);
+/* Returns 1 after the policy code stored its pick in *rp, else 0. */
+static inline int multipath_selectroute(const struct flowi *flp,
+					struct rtable *rth,
+					struct rtable **rp)
+{
+	if (!(rth->u.dst.flags & DST_BALANCED))
+		return 0;
+	__multipath_selectroute(flp, rth, rp);
+	return 1;
+}
+#else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
+static inline int multipath_selectroute(const struct flowi *flp,
+					struct rtable *rth,
+					struct rtable **rp)
+{
+	return 0;
+}
+#endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
+
+
 #endif	/* _ROUTE_H */
diff -ruN linux-2.6.8.1.splitold/net/ipv4/Kconfig linux-2.6.8.1.multipath_cached/net/ipv4/Kconfig
--- linux-2.6.8.1.splitold/net/ipv4/Kconfig	2004-09-10 10:25:08.000000000 +0200
+++ linux-2.6.8.1.multipath_cached/net/ipv4/Kconfig	2004-09-10 11:16:35.000000000 +0200
@@ -94,6 +94,41 @@
 	  equal "cost" and chooses one of them in a non-deterministic fashion
 	  if a matching packet arrives.
 
+config IP_ROUTE_MULTIPATH_CACHED
+	bool "IP: equal cost multipath with caching support (EXPERIMENTAL)"
+	depends on IP_ROUTE_MULTIPATH
+	help
+	  Normally, equal cost multipath routing is not supported by the
+	  routing cache. If you say Y here, alternative routes are cached
+	  in the routing cache and on cache lookup a route is chosen
+	  according to the multipath policy selected below.
+
+	  If unsure, say N.
+
+#
+# multipath policy configuration
+# 
+choice
+	prompt "Multipath policy"
+	depends on IP_ROUTE_MULTIPATH_CACHED
+	default IP_ROUTE_MULTIPATH_RR
+
+config IP_ROUTE_MULTIPATH_RR
+	bool "round robin (EXPERIMENTAL)"
+	help
+	  Multipath routes are chosen according to Round Robin.
+
+config IP_ROUTE_MULTIPATH_RANDOM
+	bool "random multipath (EXPERIMENTAL)"
+	help
+	  Multipath routes are chosen in a random fashion (naive 
+	  implementation)
+
+endchoice
+#
+# END OF multipath policy configuration
+#
+
 config IP_ROUTE_TOS
 	bool "IP: use TOS value as routing key"
 	depends on IP_ADVANCED_ROUTER
diff -ruN linux-2.6.8.1.splitold/net/ipv4/Makefile linux-2.6.8.1.multipath_cached/net/ipv4/Makefile
--- linux-2.6.8.1.splitold/net/ipv4/Makefile	2004-09-10 10:25:08.000000000 +0200
+++ linux-2.6.8.1.multipath_cached/net/ipv4/Makefile	2004-09-10 11:16:35.000000000 +0200
@@ -21,6 +21,8 @@
 obj-$(CONFIG_INET_IPCOMP) += ipcomp.o
 obj-$(CONFIG_INET_TUNNEL) += xfrm4_tunnel.o 
 obj-$(CONFIG_IP_PNP) += ipconfig.o
+obj-$(CONFIG_IP_ROUTE_MULTIPATH_RR) += multipath_rr.o
+obj-$(CONFIG_IP_ROUTE_MULTIPATH_RANDOM) += multipath_random.o
 obj-$(CONFIG_NETFILTER)	+= netfilter/
 obj-$(CONFIG_IP_VS) += ipvs/
 
diff -ruN linux-2.6.8.1.splitold/net/ipv4/multipath_random.c linux-2.6.8.1.multipath_cached/net/ipv4/multipath_random.c
--- linux-2.6.8.1.splitold/net/ipv4/multipath_random.c	1970-01-01 01:00:00.000000000 +0100
+++ linux-2.6.8.1.multipath_cached/net/ipv4/multipath_random.c	2004-09-13 16:45:48.000000000 +0200
@@ -0,0 +1,107 @@
+/*
+ *              Random policy for multipath.
+ *
+ *
+ * Version:	$Id: multipath.c,v 1.1.2.1 2004/09/02 20:01:27 elueck Exp $
+ *
+ * Authors:	Einar Lueck <elueck@de.ibm.com><lkml@einar-lueck.de>
+ *
+ *		This program is free software; you can redistribute it and/or
+ *		modify it under the terms of the GNU General Public License
+ *		as published by the Free Software Foundation; either version
+ *		2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/config.h>
+#include <asm/system.h>
+#include <asm/uaccess.h>
+#include <linux/types.h>
+#include <linux/sched.h>
+#include <linux/errno.h>
+#include <linux/timer.h>
+#include <linux/mm.h>
+#include <linux/kernel.h>
+#include <linux/fcntl.h>
+#include <linux/stat.h>
+#include <linux/socket.h>
+#include <linux/in.h>
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+#include <linux/inetdevice.h>
+#include <linux/igmp.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/mroute.h>
+#include <linux/init.h>
+#include <net/ip.h>
+#include <net/protocol.h>
+#include <linux/skbuff.h>
+#include <net/sock.h>
+#include <net/icmp.h>
+#include <net/udp.h>
+#include <net/raw.h>
+#include <linux/notifier.h>
+#include <linux/if_arp.h>
+#include <linux/netfilter_ipv4.h>
+#include <net/ipip.h>
+#include <net/checksum.h>
+
+#define RTprint(a...)	// printk(KERN_DEBUG a)
+
+#define MULTIPATH_MAX_CANDIDATES 40
+
+
+static int __inline__ multipath_comparekeys(const struct flowi *flp1,
+					    const struct flowi *flp2)
+{
+	return flp1->oif == flp2->oif &&	/* 1 iff all key fields agree */
+	       flp1->fl4_src == flp2->fl4_src &&
+	       flp1->fl4_dst == flp2->fl4_dst &&
+#ifdef CONFIG_IP_ROUTE_FWMARK
+	       flp1->fl4_fwmark == flp2->fl4_fwmark &&
+#endif
+	       !((flp1->fl4_tos ^ flp2->fl4_tos) & (IPTOS_RT_MASK | RTO_ONLINK));
+}
+
+/*
+ * Random policy: gather the balanced cache entries on the chain starting
+ * at @first whose keys match @flp and return a randomly chosen one in *rp.
+ * At most MULTIPATH_MAX_CANDIDATES alternatives are considered; no
+ * reference is taken here, the callers dst_hold() the result themselves.
+ */
+void __multipath_selectroute(const struct flowi *flp,
+			     struct rtable *first,
+			     struct rtable **rp)
+{
+	struct rtable *rt;
+	struct rtable *candidate[MULTIPATH_MAX_CANDIDATES];
+	/* fall back to the entry the caller matched if nothing is collected */
+	struct rtable *decision = first;
+	unsigned char candidate_count = 0;
+	unsigned char i;
+
+	RTprint( KERN_DEBUG"%s called\n", __FUNCTION__ );
+
+	/* collect all candidates, bounded by the size of candidate[] */
+	for (rt = rcu_dereference(first);
+	     rt && candidate_count < MULTIPATH_MAX_CANDIDATES;
+	     rt = rcu_dereference(rt->u.rt_next)) {
+		if ((rt->u.dst.flags & DST_BALANCED) &&
+		    multipath_comparekeys(&rt->fl, flp))
+			candidate[candidate_count++] = rt;
+	}
+
+	/* candidate[] is uninitialized stack: never read it when empty */
+	if (candidate_count > 0)
+		decision = candidate[net_random() % candidate_count];
+
+	/* refresh every alternative, also a lone one, so all stay cached */
+	for (i = 0; i < candidate_count; ++i)
+		candidate[i]->u.dst.lastuse = jiffies;
+
+	decision->u.dst.__use++;
+	*rp = decision;
+}
+
+
+
diff -ruN linux-2.6.8.1.splitold/net/ipv4/multipath_rr.c linux-2.6.8.1.multipath_cached/net/ipv4/multipath_rr.c
--- linux-2.6.8.1.splitold/net/ipv4/multipath_rr.c	1970-01-01 01:00:00.000000000 +0100
+++ linux-2.6.8.1.multipath_cached/net/ipv4/multipath_rr.c	2004-09-10 11:16:35.000000000 +0200
@@ -0,0 +1,202 @@
+/*
+ *              Round robin policy for multipath.
+ *
+ *
+ * Version:	$Id: multipath.c,v 1.1.2.1 2004/09/02 20:01:27 elueck Exp $
+ *
+ * Authors:	Einar Lueck <elueck@de.ibm.com><lkml@einar-lueck.de>
+ *
+ *		This program is free software; you can redistribute it and/or
+ *		modify it under the terms of the GNU General Public License
+ *		as published by the Free Software Foundation; either version
+ *		2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/config.h>
+#include <asm/system.h>
+#include <asm/uaccess.h>
+#include <linux/types.h>
+#include <linux/sched.h>
+#include <linux/errno.h>
+#include <linux/timer.h>
+#include <linux/mm.h>
+#include <linux/kernel.h>
+#include <linux/fcntl.h>
+#include <linux/stat.h>
+#include <linux/socket.h>
+#include <linux/in.h>
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+#include <linux/inetdevice.h>
+#include <linux/igmp.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/mroute.h>
+#include <linux/init.h>
+#include <net/ip.h>
+#include <net/protocol.h>
+#include <linux/skbuff.h>
+#include <net/sock.h>
+#include <net/icmp.h>
+#include <net/udp.h>
+#include <net/raw.h>
+#include <linux/notifier.h>
+#include <linux/if_arp.h>
+#include <linux/netfilter_ipv4.h>
+#include <net/ipip.h>
+#include <net/checksum.h>
+
+struct rt_cache_candidate	/* one node of the multipath_state list */
+{
+	struct rtable             *candidate;	/* route handed out last for this flow key */
+	struct rt_cache_candidate *next;	/* next node; NULL ends the list */
+};
+
+#define RTprint(a...)	// printk(KERN_DEBUG a)
+
+#define MULTIPATH_MAX_CANDIDATES 40
+
+static spinlock_t multipath_state_lock = SPIN_LOCK_UNLOCKED;
+static struct rt_cache_candidate *multipath_state = NULL;
+static int multipath_state_entrycount = 0;/* FIXME remove: is just for debug purposes */					   
+
+static int __inline__ multipath_comparekeys(const struct flowi *flp1,
+					    const struct flowi *flp2)
+{
+	return flp1->oif == flp2->oif &&	/* 1 iff all key fields agree */
+	       flp1->fl4_src == flp2->fl4_src &&
+	       flp1->fl4_dst == flp2->fl4_dst &&
+#ifdef CONFIG_IP_ROUTE_FWMARK
+	       flp1->fl4_fwmark == flp2->fl4_fwmark &&
+#endif
+	       !((flp1->fl4_tos ^ flp2->fl4_tos) & (IPTOS_RT_MASK | RTO_ONLINK));
+}
+
+void __multipath_remove(struct rtable* rt) {
+	struct rt_cache_candidate *candidate;
+	struct rt_cache_candidate *previous = NULL;
+	/* Drop the round-robin state node whose flow key matches rt's key. */
+	/* Entries without DST_BALANCED are not handled here: bail out. */
+	if ( !( rt->u.dst.flags & DST_BALANCED ) ) {
+		/* FIXME: remove debug code */
+		RTprint( KERN_DEBUG"%s: unexpected argument\n",
+			 __FUNCTION__ );
+		return;
+	}
+	/* the state list is walked and modified only under this lock */
+	spin_lock_bh(&multipath_state_lock);
+
+	for ( candidate = multipath_state; candidate != NULL; 
+	      candidate = candidate->next ) {
+		if ( multipath_comparekeys(&candidate->candidate->fl,
+					   &rt->fl) ) {
+			if ( candidate == multipath_state ) {
+				multipath_state = multipath_state->next;
+			}
+			else {
+				previous->next = candidate->next;
+			}
+			--multipath_state_entrycount;
+			kfree( candidate );	/* frees the list node only, not an rtable */
+			RTprint( KERN_DEBUG"%s: removed entry. Entry " \
+				 "count: %d\n", __FUNCTION__,
+				 multipath_state_entrycount );
+			break;	/* at most one node is removed per call */
+		}
+		
+		previous = candidate;
+	}
+
+	spin_unlock_bh(&multipath_state_lock);
+}
+
+/*
+ * Round robin policy: choose the balanced cache entry following the one
+ * handed out last time for the flow key of @flp and return it in *rp.
+ * @first is the chain entry the caller matched; the chain is walked from
+ * there. With FLOWI_FLAG_MULTIPATHOLDROUTE set in @flp the previously
+ * chosen alternative is returned again instead of advancing.
+ *
+ * The per-key state is kept under multipath_state_lock (a BH spinlock),
+ * so the state allocation must not sleep and therefore uses GFP_ATOMIC.
+ */
+void __multipath_selectroute(const struct flowi *flp,
+			     struct rtable *first, struct rtable **rp)
+{
+	struct rt_cache_candidate *cand;
+	struct rtable *nh, *result;
+	int found_old = 0;
+
+	spin_lock_bh(&multipath_state_lock);
+
+	/* determine entry with candidate returned last time */
+	for (cand = multipath_state; cand != NULL; cand = cand->next) {
+		if (multipath_comparekeys(&cand->candidate->fl, flp)) {
+			RTprint( KERN_CRIT"%s: determined candidate " \
+				 "returned last time\n",
+				 __FUNCTION__ );
+			break;
+		}
+	}
+
+	/* 1. make sure all alt. nexthops have the same GC related data */
+	/* 2. determine the new candidate to be returned */
+	result = NULL;
+	for (nh = rcu_dereference(first); nh;
+	     nh = rcu_dereference(nh->u.rt_next)) {
+		if (!(nh->u.dst.flags & DST_BALANCED) ||
+		    !multipath_comparekeys(&nh->fl, flp))
+			continue;
+
+		nh->u.dst.lastuse = jiffies;
+		nh->u.dst.__use++;
+		RTprint( KERN_CRIT"%s: found balanced entry\n", __FUNCTION__ );
+
+		/* the successor of the old candidate becomes the new one */
+		if (flp->flags & FLOWI_FLAG_MULTIPATHOLDROUTE)
+			continue;
+		if (found_old && !result)
+			result = nh;
+		else if (cand != NULL && nh == cand->candidate)
+			found_old = 1;
+	}
+
+	/* no successor found: start (again) with the entry the caller matched */
+	if (result == NULL) {
+		RTprint( KERN_CRIT"%s: reach end of"\
+			 " chain. Start again.\n",
+			 __FUNCTION__ );
+		result = first;
+	} else {
+		RTprint( KERN_CRIT"%s: found new " \
+			 "candidate\n",
+			 __FUNCTION__ );
+	}
+
+	/* if necessary and possible utilize the old alternative */
+	if ((flp->flags & FLOWI_FLAG_MULTIPATHOLDROUTE) && cand != NULL) {
+		RTprint( KERN_CRIT"%s: holding route \n", __FUNCTION__ );
+		result = cand->candidate;
+	}
+
+	/* store candidate to return in state */
+	if (cand == NULL) {
+		/* create new state entry; a BH spinlock is held: no sleeping */
+		cand = kmalloc(sizeof(*cand), GFP_ATOMIC);
+		if (cand != NULL) {
+			cand->next = multipath_state;
+			multipath_state = cand;
+			++multipath_state_entrycount;
+			RTprint( KERN_CRIT"%s: entrycount: %d\n",
+				 __FUNCTION__, multipath_state_entrycount );
+		}
+	}
+	/* on allocation failure the route is still returned, just untracked */
+	if (cand != NULL)
+		cand->candidate = result;
+
+	spin_unlock_bh(&multipath_state_lock);
+	*rp = result;
+}
+
+
diff -ruN linux-2.6.8.1.splitold/net/ipv4/route.c linux-2.6.8.1.multipath_cached/net/ipv4/route.c
--- linux-2.6.8.1.splitold/net/ipv4/route.c	2004-09-14 09:38:49.000000000 +0200
+++ linux-2.6.8.1.multipath_cached/net/ipv4/route.c	2004-09-14 09:36:52.000000000 +0200
@@ -442,11 +442,13 @@
   
 static __inline__ void rt_free(struct rtable *rt)
 {
+	multipath_remove(rt);
 	call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
 }
 
 static __inline__ void rt_drop(struct rtable *rt)
 {
+	multipath_remove(rt);
 	ip_rt_put(rt);
 	call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
 }
@@ -539,8 +541,28 @@
 			}
 
 			/* Cleanup aged off entries. */
+#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
+			/* drop a balanced entry together with the directly
+			   following alternatives that share its key */
+			if (rth->u.dst.flags & DST_BALANCED) {
+				struct rtable *first = rth;
+				*rthp = rth->u.rt_next;
+				while (*rthp != NULL &&	/* NULL: end of chain */
+				       ((*rthp)->u.dst.flags & DST_BALANCED) &&
+				       compare_keys(&(*rthp)->fl, &first->fl)) {
+					rth = *rthp;
+					*rthp = rth->u.rt_next;
+					rt_free(rth);
+				}
+				rt_free(first);
+			} else {
+				*rthp = rth->u.rt_next;
+				rt_free(rth);
+			}
+#else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
 			*rthp = rth->u.rt_next;
 			rt_free(rth);
+#endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
 		}
 		spin_unlock(&rt_hash_table[i].lock);
 
@@ -706,8 +728,28 @@
 					rthp = &rth->u.rt_next;
 					continue;
 				}
+#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
+			/* drop a balanced entry together with the directly
+			   following alternatives that share its key */
+			if (rth->u.dst.flags & DST_BALANCED) {
+				struct rtable *first = rth;
+				*rthp = rth->u.rt_next;
+				while (*rthp != NULL &&	/* NULL: end of chain */
+				       ((*rthp)->u.dst.flags & DST_BALANCED) &&
+				       compare_keys(&(*rthp)->fl, &first->fl)) {
+					rth = *rthp;
+					*rthp = rth->u.rt_next;
+					rt_free(rth);
+				}
+				rt_free(first);
+			} else {
+				*rthp = rth->u.rt_next;
+				rt_free(rth);
+			}
+#else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
 				*rthp = rth->u.rt_next;
 				rt_free(rth);
+#endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
 				goal--;
 			}
 			spin_unlock_bh(&rt_hash_table[k].lock);
@@ -789,7 +831,12 @@
 
 	spin_lock_bh(&rt_hash_table[hash].lock);
 	while ((rth = *rthp) != NULL) {
+#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
+		if (!(rth->u.dst.flags & DST_BALANCED) &&
+		    compare_keys(&rth->fl, &rt->fl)) {
+#else
 		if (compare_keys(&rth->fl, &rt->fl)) {
+#endif
 			/* Put it first */
 			*rthp = rth->u.rt_next;
 			/*
@@ -1622,7 +1669,18 @@
 		goto cleanup;
 	}
 
+#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
+	if ( res->fi->fib_nhs > 1 )
+		RTprint( KERN_DEBUG"%s: balanced entry created: %d\n",
+			 __FUNCTION__,
+			 rth ); 
+#endif
+
 	rth->u.dst.flags= DST_HOST;
+#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
+	if ( res->fi->fib_nhs > 1 )
+		rth->u.dst.flags |= DST_BALANCED;
+#endif
 	if (in_dev->cnf.no_policy)
 		rth->u.dst.flags |= DST_NOPOLICY;
 	if (in_dev->cnf.no_xfrm)
@@ -1691,7 +1749,54 @@
 				   struct in_device *in_dev,
 				   u32 daddr, u32 saddr, u32 tos)
 {
+#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED 
+	struct rtable* rth;
+	unsigned char hop, hopcount, lasthop;
+	int err = -EINVAL;
+	unsigned hash;
+	hopcount = res->fi->fib_nhs;
+	lasthop = hopcount - 1;
+
+	/* distinguish between multipath and singlepath */
+	if ( hopcount < 2 ) 
+		return ip_mkroute_input_def(skb, res, fl, in_dev, daddr, 
+					    saddr, tos);
+	
+	RTprint( KERN_DEBUG"%s: entered (hopcount: %d)\n", __FUNCTION__,
+		 hopcount);
+
+	/* add all alternatives to the routing cache */
+	for ( hop = 0; hop < hopcount; ++hop ) {
+		res->nh_sel = hop;
+
+		RTprint( KERN_DEBUG"%s: entered (hopcount: %d)\n", 
+			 __FUNCTION__, hopcount);
+
+		/* create a routing cache entry */
+		err = __mkroute_input( skb, res, in_dev, daddr, saddr, tos, 
+					 &rth );
+		if ( err ) 
+			return err;
+	
+
+		/* put it into the cache */
+		hash = rt_hash_code(daddr, saddr ^ (fl->iif << 5), tos);
+		err = rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
+		if ( err ) 
+			return err;
+
+
+		/* only for the last hop the reference count is handled 
+		   outside */
+		RTprint( KERN_DEBUG"%s: balanced entry created: %d\n",
+			 __FUNCTION__, rth );
+		if ( hop == lasthop ) 
+			atomic_set(&(skb->dst->__refcnt), 1);
+	}
+	return err;
+#else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED  */ 
 	return ip_mkroute_input_def(skb, res, fl, in_dev, daddr, saddr, tos);
+#endif
 }
 
 
@@ -1882,6 +1987,7 @@
 	goto done;
 
 martian_source:
+
 	ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
 	goto e_inval;
 }
@@ -1907,6 +2013,17 @@
 		    rth->fl.fl4_fwmark == skb->nfmark &&
 #endif
 		    rth->fl.fl4_tos == tos) {
+			/* check if the route is a multipath route and if so
+			   select one of the alternatives */
+			if ( multipath_selectroute( 
+				     &rth->fl, rth, 
+				     (struct rtable**)&skb->dst) ) {
+				dst_hold(skb->dst);
+				rcu_read_unlock();
+
+				return 0;
+			}
+
 			rth->u.dst.lastuse = jiffies;
 			dst_hold(&rth->u.dst);
 			rth->u.dst.__use++;
@@ -2012,6 +2129,10 @@
 	}		
 
 	rth->u.dst.flags= DST_HOST;
+#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
+	if (res->fi->fib_nhs > 1)
+		rth->u.dst.flags |= DST_BALANCED;
+#endif
 	if (in_dev->cnf.no_xfrm)
 		rth->u.dst.flags |= DST_NOXFRM;
 	if (in_dev->cnf.no_policy)
@@ -2103,7 +2224,71 @@
 				    struct net_device *dev_out,
 				    unsigned flags)
 {
+#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
+	u32 tos = RT_FL_TOS(oldflp);
+	unsigned char hop;
+	unsigned hash;
+	int err = -EINVAL;
+	unsigned char hopcount = res->fi->fib_nhs;
+	struct rtable* rth;
+
+	RTprint( KERN_DEBUG"%s: entered (hopcount: %d, fl->oif: %d)\n", 
+		 __FUNCTION__, hopcount, fl->oif);
+
+	if (res->fi->fib_nhs > 1) {		
+		for ( hop = 0; hop < hopcount; ++hop ) {
+			struct net_device *dev2nexthop;
+			RTprint( KERN_DEBUG"%s: hop %d of %d\n", __FUNCTION__,
+				 hop, hopcount );
+
+			res->nh_sel = hop;
+
+			/* hold a work reference to the output device */
+			dev2nexthop = FIB_RES_DEV(*res);
+			dev_hold(dev2nexthop);
+
+			/** FIXME remove debug code */
+			RTprint( KERN_DEBUG"%s: balanced entry created: %d " \
+				 " (GW: %u)\n",
+				 __FUNCTION__,
+				 rth,
+				 FIB_RES_GW(*res) );
+
+			err = __mkroute_output(&rth, res, fl, oldflp, 
+						 dev2nexthop, flags);
+			if ( err != 0 ) {
+				goto cleanup;
+			}
+
+			RTprint( KERN_DEBUG"%s: created successfully %d\n", 
+				 __FUNCTION__, hop );
+			
+			hash = rt_hash_code(oldflp->fl4_dst, 
+					    oldflp->fl4_src ^ 
+					    (oldflp->oif << 5), tos);
+			err = rt_intern_hash(hash, rth, rp);
+			RTprint( KERN_DEBUG"%s: hashed  %d\n", 
+				 __FUNCTION__, hop );
+
+		cleanup:
+			/* release work reference to output device */
+			dev_put(dev2nexthop);
+			
+			if ( err != 0 ) {
+				return err;
+			}
+		}
+		RTprint( KERN_DEBUG"%s: exited loop\n", __FUNCTION__ );
+		atomic_set(&(*rp)->u.dst.__refcnt, 1);
+		return err;
+	}
+	else {
+		return ip_mkroute_output_def(rp, res, fl, oldflp, dev_out, 
+					     flags);
+	}
+#else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
 	return ip_mkroute_output_def(rp, res, fl, oldflp, dev_out, flags);
+#endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
 }
 
 /*
@@ -2316,6 +2501,16 @@
 #endif
 		    !((rth->fl.fl4_tos ^ flp->fl4_tos) &
 			    (IPTOS_RT_MASK | RTO_ONLINK))) {
+
+			/* check for multipath routes and choose one if
+			   necessary */
+			if (multipath_selectroute(flp, rth, rp)) {
+				dst_hold(&(*rp)->u.dst);
+				RT_CACHE_STAT_INC(out_hit);
+				rcu_read_unlock_bh();
+				return 0;
+			}
+
 			rth->u.dst.lastuse = jiffies;
 			dst_hold(&rth->u.dst);
 			rth->u.dst.__use++;

  parent reply	other threads:[~2004-09-14 12:38 UTC|newest]

Thread overview: 6+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2004-09-13 10:36 [RFC][PATCH 2/2] ip multipath, bk head (EXPERIMENTAL) Einar Lueck
2004-09-14  2:40 ` jamal
2004-09-14  5:42   ` David S. Miller
2004-09-14 12:14     ` jamal
2004-09-14 12:38     ` Einar Lueck [this message]
2004-09-14 15:08       ` Phil Oester

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=4146E65D.6070309@de.ibm.com \
    --to=elueck@de.ibm.com \
    --cc=davem@davemloft.net \
    --cc=hadi@cyberus.ca \
    --cc=linux-kernel@vger.kernel.org \
    --cc=lkml@einar-lueck.de \
    --cc=netdev@oss.sgi.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).