All of lore.kernel.org
 help / color / mirror / Atom feed
From: Einar Lueck <elueck@de.ibm.com>
To: "David S. Miller" <davem@davemloft.net>
Cc: hadi@cyberus.ca, linux-kernel@vger.kernel.org, netdev@oss.sgi.com
Subject: Re: [RFC][PATCH 2/2] ip multipath, bk head (EXPERIMENTAL)
Date: Tue, 14 Sep 2004 14:38:53 +0200	[thread overview]
Message-ID: <4146E65D.6070309@de.ibm.com> (raw)
In-Reply-To: <20040913224232.4b979c7d.davem@davemloft.net>

[-- Attachment #1: Type: text/plain, Size: 79 bytes --]

I attached the patch the way you requested in the other thread.

Regards
Einar

[-- Attachment #2: multipath_cached.diff --]
[-- Type: text/x-patch, Size: 21144 bytes --]

diff -ruN linux-2.6.8.1.splitold/include/net/dst.h linux-2.6.8.1.multipath_cached/include/net/dst.h
--- linux-2.6.8.1.splitold/include/net/dst.h	2004-09-10 10:24:40.000000000 +0200
+++ linux-2.6.8.1.multipath_cached/include/net/dst.h	2004-09-10 11:16:35.000000000 +0200
@@ -48,6 +48,7 @@
 #define DST_NOXFRM		2
 #define DST_NOPOLICY		4
 #define DST_NOHASH		8
+#define DST_BALANCED            0x10
 	unsigned long		lastuse;
 	unsigned long		expires;
 
diff -ruN linux-2.6.8.1.splitold/include/net/flow.h linux-2.6.8.1.multipath_cached/include/net/flow.h
--- linux-2.6.8.1.splitold/include/net/flow.h	2004-09-10 10:24:40.000000000 +0200
+++ linux-2.6.8.1.multipath_cached/include/net/flow.h	2004-09-10 11:16:35.000000000 +0200
@@ -51,6 +51,9 @@
 
 	__u8	proto;
 	__u8	flags;
+#if defined(CONFIG_IP_ROUTE_MULTIPATH_CACHED)
+#define FLOWI_FLAG_MULTIPATHOLDROUTE 0x01
+#endif
 	union {
 		struct {
 			__u16	sport;
diff -ruN linux-2.6.8.1.splitold/include/net/route.h linux-2.6.8.1.multipath_cached/include/net/route.h
--- linux-2.6.8.1.splitold/include/net/route.h	2004-09-10 10:24:40.000000000 +0200
+++ linux-2.6.8.1.multipath_cached/include/net/route.h	2004-09-10 11:16:35.000000000 +0200
@@ -179,6 +179,9 @@
 		memcpy(&fl, &(*rp)->fl, sizeof(fl));
 		fl.fl_ip_sport = sport;
 		fl.fl_ip_dport = dport;
+#if defined(CONFIG_IP_ROUTE_MULTIPATH_CACHED)
+		fl.flags |= FLOWI_FLAG_MULTIPATHOLDROUTE;
+#endif
 		ip_rt_put(*rp);
 		*rp = NULL;
 		return ip_route_output_flow(rp, &fl, sk, 0);
@@ -197,4 +200,41 @@
 	return rt->peer;
 }
 
+
+#ifdef CONFIG_IP_ROUTE_MULTIPATH_RR
+extern void __multipath_remove(struct rtable *rt);
+/* Hand DST_BALANCED cache entries to the round robin remove hook. */
+static inline void multipath_remove(struct rtable *rt) {
+	if (rt->u.dst.flags & DST_BALANCED)
+		__multipath_remove(rt);
+}
+#else /* CONFIG_IP_ROUTE_MULTIPATH_RR */
+static inline void multipath_remove(struct rtable *rt) {}
+#endif /* CONFIG_IP_ROUTE_MULTIPATH_RR */
+
+
+#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
+extern void __multipath_selectroute(const struct flowi *flp,
+				    struct rtable *rth,
+				    struct rtable **rp);
+/*
+ * Returns 1 and lets the policy store its choice in *rp when rth is
+ * flagged DST_BALANCED; returns 0 and leaves *rp alone otherwise.
+ */
+static inline int multipath_selectroute(const struct flowi *flp,
+					struct rtable *rth,
+					struct rtable **rp)
+{
+	if (!(rth->u.dst.flags & DST_BALANCED))
+		return 0;
+	__multipath_selectroute(flp, rth, rp);
+	return 1;
+}
+#else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
+static inline int multipath_selectroute(const struct flowi *flp,
+					struct rtable *rth,
+					struct rtable **rp) { return 0; }
+#endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
+
+
 #endif	/* _ROUTE_H */
diff -ruN linux-2.6.8.1.splitold/net/ipv4/Kconfig linux-2.6.8.1.multipath_cached/net/ipv4/Kconfig
--- linux-2.6.8.1.splitold/net/ipv4/Kconfig	2004-09-10 10:25:08.000000000 +0200
+++ linux-2.6.8.1.multipath_cached/net/ipv4/Kconfig	2004-09-10 11:16:35.000000000 +0200
@@ -94,6 +94,41 @@
 	  equal "cost" and chooses one of them in a non-deterministic fashion
 	  if a matching packet arrives.
 
+config IP_ROUTE_MULTIPATH_CACHED
+	bool "IP: equal cost multipath with caching support (EXPERIMENTAL)"
+	depends on IP_ROUTE_MULTIPATH
+	help
+	  Normally, equal cost multipath routing is not supported by the
+	  routing cache. If you say Y here, alternative routes are cached
+	  in the routing cache and on cache lookup a route is chosen
+	  according to the multipath policy selected below.
+
+	  If unsure, say N.
+
+#
+# multipath policy configuration
+# 
+choice
+	prompt "Multipath policy"
+	depends on IP_ROUTE_MULTIPATH_CACHED
+	default IP_ROUTE_MULTIPATH_RR
+
+config IP_ROUTE_MULTIPATH_RR
+	bool "round robin (EXPERIMENTAL)"
+	help
+	  Multipath routes are chosen according to Round Robin.
+
+config IP_ROUTE_MULTIPATH_RANDOM
+	bool "random multipath (EXPERIMENTAL)"
+	help
+	  Multipath routes are chosen in a random fashion (naive 
+	  implementation)
+
+endchoice
+#
+# END OF multipath policy configuration
+#
+
 config IP_ROUTE_TOS
 	bool "IP: use TOS value as routing key"
 	depends on IP_ADVANCED_ROUTER
diff -ruN linux-2.6.8.1.splitold/net/ipv4/Makefile linux-2.6.8.1.multipath_cached/net/ipv4/Makefile
--- linux-2.6.8.1.splitold/net/ipv4/Makefile	2004-09-10 10:25:08.000000000 +0200
+++ linux-2.6.8.1.multipath_cached/net/ipv4/Makefile	2004-09-10 11:16:35.000000000 +0200
@@ -21,6 +21,8 @@
 obj-$(CONFIG_INET_IPCOMP) += ipcomp.o
 obj-$(CONFIG_INET_TUNNEL) += xfrm4_tunnel.o 
 obj-$(CONFIG_IP_PNP) += ipconfig.o
+obj-$(CONFIG_IP_ROUTE_MULTIPATH_RR) += multipath_rr.o
+obj-$(CONFIG_IP_ROUTE_MULTIPATH_RANDOM) += multipath_random.o
 obj-$(CONFIG_NETFILTER)	+= netfilter/
 obj-$(CONFIG_IP_VS) += ipvs/
 
diff -ruN linux-2.6.8.1.splitold/net/ipv4/multipath_random.c linux-2.6.8.1.multipath_cached/net/ipv4/multipath_random.c
--- linux-2.6.8.1.splitold/net/ipv4/multipath_random.c	1970-01-01 01:00:00.000000000 +0100
+++ linux-2.6.8.1.multipath_cached/net/ipv4/multipath_random.c	2004-09-13 16:45:48.000000000 +0200
@@ -0,0 +1,107 @@
+/*
+ *              Random policy for multipath.
+ *
+ *
+ * Version:	$Id: multipath.c,v 1.1.2.1 2004/09/02 20:01:27 elueck Exp $
+ *
+ * Authors:	Einar Lueck <elueck@de.ibm.com><lkml@einar-lueck.de>
+ *
+ *		This program is free software; you can redistribute it and/or
+ *		modify it under the terms of the GNU General Public License
+ *		as published by the Free Software Foundation; either version
+ *		2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/config.h>
+#include <asm/system.h>
+#include <asm/uaccess.h>
+#include <linux/types.h>
+#include <linux/sched.h>
+#include <linux/errno.h>
+#include <linux/timer.h>
+#include <linux/mm.h>
+#include <linux/kernel.h>
+#include <linux/fcntl.h>
+#include <linux/stat.h>
+#include <linux/socket.h>
+#include <linux/in.h>
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+#include <linux/inetdevice.h>
+#include <linux/igmp.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/mroute.h>
+#include <linux/init.h>
+#include <net/ip.h>
+#include <net/protocol.h>
+#include <linux/skbuff.h>
+#include <net/sock.h>
+#include <net/icmp.h>
+#include <net/udp.h>
+#include <net/raw.h>
+#include <linux/notifier.h>
+#include <linux/if_arp.h>
+#include <linux/netfilter_ipv4.h>
+#include <net/ipip.h>
+#include <net/checksum.h>
+
+#define RTprint(a...)	// printk(KERN_DEBUG a)
+
+#define MULTIPATH_MAX_CANDIDATES 40
+
+
+/* Returns 1 iff dst, src, oif, (fwmark) and the routing TOS bits match. */
+static int __inline__ multipath_comparekeys(const struct flowi *flp1,
+					    const struct flowi *flp2) {
+	return !((flp1->fl4_dst ^ flp2->fl4_dst) |
+		 (flp1->fl4_src ^ flp2->fl4_src) | (flp1->oif ^ flp2->oif) |
+#ifdef CONFIG_IP_ROUTE_FWMARK
+		 (flp1->fl4_fwmark ^ flp2->fl4_fwmark) |
+#endif
+		 ((flp1->fl4_tos ^ flp2->fl4_tos) &
+		  (IPTOS_RT_MASK | RTO_ONLINK)));
+}
+
+/*
+ * Random policy: pick one cached alternative for flp.
+ *
+ * Walks the chain linked through u.rt_next starting at first, gathers up
+ * to MULTIPATH_MAX_CANDIDATES entries that are DST_BALANCED and match
+ * flp's keys, and stores a randomly chosen one in *rp.  If nothing
+ * matches, first itself is used instead of reading an unset array slot.
+ */
+void __multipath_selectroute(const struct flowi *flp,
+			     struct rtable *first,
+			     struct rtable **rp) {
+	struct rtable *rt;
+	struct rtable *candidate[MULTIPATH_MAX_CANDIDATES];
+	struct rtable *decision = first;
+	unsigned char candidate_count = 0;
+	unsigned char i;
+
+	/* collect all candidates (bounded by the array size) */
+	for (rt = rcu_dereference(first);
+	     rt && candidate_count < MULTIPATH_MAX_CANDIDATES;
+	     rt = rcu_dereference(rt->u.rt_next)) {
+		if ((rt->u.dst.flags & DST_BALANCED) &&
+		    multipath_comparekeys(&rt->fl, flp))
+			candidate[candidate_count++] = rt;
+	}
+
+	if (candidate_count == 1) {
+		decision = candidate[0];
+	} else if (candidate_count > 1) {
+		/* choose a random candidate */
+		decision = candidate[net_random() % candidate_count];
+		/* make sure all candidates stay in cache */
+		for (i = 0; i < candidate_count; ++i)
+			candidate[i]->u.dst.lastuse = jiffies;
+	}
+
+	decision->u.dst.__use++;
+	*rp = decision;
+}
+
+
+
diff -ruN linux-2.6.8.1.splitold/net/ipv4/multipath_rr.c linux-2.6.8.1.multipath_cached/net/ipv4/multipath_rr.c
--- linux-2.6.8.1.splitold/net/ipv4/multipath_rr.c	1970-01-01 01:00:00.000000000 +0100
+++ linux-2.6.8.1.multipath_cached/net/ipv4/multipath_rr.c	2004-09-10 11:16:35.000000000 +0200
@@ -0,0 +1,202 @@
+/*
+ *              Round robin policy for multipath.
+ *
+ *
+ * Version:	$Id: multipath.c,v 1.1.2.1 2004/09/02 20:01:27 elueck Exp $
+ *
+ * Authors:	Einar Lueck <elueck@de.ibm.com><lkml@einar-lueck.de>
+ *
+ *		This program is free software; you can redistribute it and/or
+ *		modify it under the terms of the GNU General Public License
+ *		as published by the Free Software Foundation; either version
+ *		2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/config.h>
+#include <asm/system.h>
+#include <asm/uaccess.h>
+#include <linux/types.h>
+#include <linux/sched.h>
+#include <linux/errno.h>
+#include <linux/timer.h>
+#include <linux/mm.h>
+#include <linux/kernel.h>
+#include <linux/fcntl.h>
+#include <linux/stat.h>
+#include <linux/socket.h>
+#include <linux/in.h>
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+#include <linux/inetdevice.h>
+#include <linux/igmp.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/mroute.h>
+#include <linux/init.h>
+#include <net/ip.h>
+#include <net/protocol.h>
+#include <linux/skbuff.h>
+#include <net/sock.h>
+#include <net/icmp.h>
+#include <net/udp.h>
+#include <net/raw.h>
+#include <linux/notifier.h>
+#include <linux/if_arp.h>
+#include <linux/netfilter_ipv4.h>
+#include <net/ipip.h>
+#include <net/checksum.h>
+
+struct rt_cache_candidate	/* one remembered round robin choice */
+{
+	struct rtable             *candidate;	/* route returned last time */
+	struct rt_cache_candidate *next;	/* next entry in the state list */
+};
+
+#define RTprint(a...)	// printk(KERN_DEBUG a)
+
+#define MULTIPATH_MAX_CANDIDATES 40
+
+static spinlock_t multipath_state_lock = SPIN_LOCK_UNLOCKED; /* taken around multipath_state accesses */
+static struct rt_cache_candidate *multipath_state = NULL; /* head of the state list */
+static int multipath_state_entrycount = 0;/* FIXME remove: is just for debug purposes */					   
+
+/* Returns 1 iff dst, src, oif, (fwmark) and the routing TOS bits match. */
+static int __inline__ multipath_comparekeys(const struct flowi *flp1,
+					    const struct flowi *flp2) {
+	return !((flp1->fl4_dst ^ flp2->fl4_dst) |
+		 (flp1->fl4_src ^ flp2->fl4_src) | (flp1->oif ^ flp2->oif) |
+#ifdef CONFIG_IP_ROUTE_FWMARK
+		 (flp1->fl4_fwmark ^ flp2->fl4_fwmark) |
+#endif
+		 ((flp1->fl4_tos ^ flp2->fl4_tos) &
+		  (IPTOS_RT_MASK | RTO_ONLINK)));
+}
+
+/*
+ * Drop the round robin state entry for rt's flow, if there is one.
+ *
+ * Reached through multipath_remove() for DST_BALANCED routes.  The first
+ * entry whose remembered route has the same keys as rt is unlinked and freed.
+ */
+void __multipath_remove(struct rtable* rt) {
+	struct rt_cache_candidate **link;
+	struct rt_cache_candidate *entry;
+
+	/* sanity check: only balanced routes are expected here */
+	if (!(rt->u.dst.flags & DST_BALANCED)) {
+		/* FIXME: remove debug code */
+		RTprint( KERN_DEBUG"%s: unexpected argument\n",
+			 __FUNCTION__ );
+		return;
+	}
+
+	spin_lock_bh(&multipath_state_lock);
+
+	/* link always addresses the pointer that leads to entry */
+	for (link = &multipath_state; (entry = *link) != NULL;
+	     link = &entry->next) {
+		if (!multipath_comparekeys(&entry->candidate->fl, &rt->fl))
+			continue;
+
+		*link = entry->next;
+		--multipath_state_entrycount;
+		kfree(entry);
+		RTprint( KERN_DEBUG"%s: removed entry. Entry " \
+			 "count: %d\n", __FUNCTION__,
+			 multipath_state_entrycount );
+		break;
+	}
+
+	spin_unlock_bh(&multipath_state_lock);
+}
+
+/*
+ * Round robin policy: pick the cached alternative that follows the one
+ * handed out last time for flp's keys and store it in *rp.
+ *
+ * Every DST_BALANCED entry in the chain starting at first whose keys
+ * match flp gets its lastuse/__use refreshed.  When flp carries
+ * FLOWI_FLAG_MULTIPATHOLDROUTE and a previous choice is remembered, that
+ * previous choice is returned again.  Without a usable successor, first
+ * is returned.
+ *
+ * The state entry is allocated with GFP_ATOMIC because the state lock is
+ * held with bottom halves disabled; if the allocation fails the choice is
+ * simply not remembered.
+ */
+void __multipath_selectroute(const struct flowi *flp,
+			     struct rtable *first, struct rtable **rp)
+{
+	struct rt_cache_candidate *cand;
+	struct rtable *nh, *result = NULL;
+	int keep_old = (flp->flags & FLOWI_FLAG_MULTIPATHOLDROUTE) != 0;
+	int found_old = 0;
+
+	spin_lock_bh(&multipath_state_lock);
+
+	/* determine entry with candidate returned last time */
+	for (cand = multipath_state; cand != NULL; cand = cand->next) {
+		if (multipath_comparekeys(&cand->candidate->fl, flp)) {
+			RTprint( KERN_CRIT"%s: determined candidate " \
+				 "returned last time\n",
+				 __FUNCTION__ );
+			break;
+		}
+	}
+
+	/* 1. make sure all alt. nexthops have the same GC related data */
+	/* 2. determine the new candidate to be returned */
+	for (nh = rcu_dereference(first); nh;
+	     nh = rcu_dereference(nh->u.rt_next)) {
+		if (!(nh->u.dst.flags & DST_BALANCED) ||
+		    !multipath_comparekeys(&nh->fl, flp))
+			continue;
+
+		nh->u.dst.lastuse = jiffies;
+		nh->u.dst.__use++;
+		RTprint( KERN_CRIT"%s: found balanced entry\n", __FUNCTION__ );
+
+		/* no rotation when the caller wants the old route kept */
+		if (keep_old)
+			continue;
+
+		/* the match right after the old candidate is the new one */
+		if (found_old && !result)
+			result = nh;
+		else if (cand != NULL && nh == cand->candidate)
+			found_old = 1;
+	}
+
+	if (keep_old && cand != NULL) {
+		/* caller asked to stick with the previous alternative */
+		RTprint( KERN_CRIT"%s: holding route \n", __FUNCTION__ );
+		result = cand->candidate;
+	} else if (result == NULL) {
+		/* no successor found: start again with first */
+		RTprint( KERN_CRIT"%s: reach end of chain. Start again.\n",
+			 __FUNCTION__ );
+		result = first;
+	}
+
+	/* create a new state entry if necessary */
+	if (cand == NULL) {
+		cand = kmalloc(sizeof(*cand), GFP_ATOMIC);
+		if (cand != NULL) {
+			cand->next = multipath_state;
+			multipath_state = cand;
+			++multipath_state_entrycount;
+			RTprint( KERN_CRIT"%s: entrycount: %d\n",
+				 __FUNCTION__, multipath_state_entrycount );
+		}
+	}
+
+	/* remember the choice for the next lookup */
+	if (cand != NULL)
+		cand->candidate = result;
+
+	spin_unlock_bh(&multipath_state_lock);
+
+	*rp = result;
+}
+
+
diff -ruN linux-2.6.8.1.splitold/net/ipv4/route.c linux-2.6.8.1.multipath_cached/net/ipv4/route.c
--- linux-2.6.8.1.splitold/net/ipv4/route.c	2004-09-14 09:38:49.000000000 +0200
+++ linux-2.6.8.1.multipath_cached/net/ipv4/route.c	2004-09-14 09:36:52.000000000 +0200
@@ -442,11 +442,13 @@
   
 static __inline__ void rt_free(struct rtable *rt)
 {
+	multipath_remove(rt);
 	call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
 }
 
 static __inline__ void rt_drop(struct rtable *rt)
 {
+	multipath_remove(rt);
 	ip_rt_put(rt);
 	call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
 }
@@ -539,8 +541,28 @@
 			}
 
 			/* Cleanup aged off entries. */
+#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
+			/* remove all related balanced entries if necessary */
+			if ( rth->u.dst.flags & DST_BALANCED ) {
+				struct rtable* first = rth;
+				*rthp = rth->u.rt_next;
+				while ( (*rthp)->u.dst.flags & DST_BALANCED &&
+					compare_keys(&(*rthp)->fl, 
+						     &first->fl)) {
+					rth = (*rthp);
+					*rthp = rth->u.rt_next;
+					rt_free( rth );
+				}
+				rt_free(first);
+			}
+			else {
+				*rthp = rth->u.rt_next;
+				rt_free(rth);
+			}
+#else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
 			*rthp = rth->u.rt_next;
 			rt_free(rth);
+#endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
 		}
 		spin_unlock(&rt_hash_table[i].lock);
 
@@ -706,8 +728,28 @@
 					rthp = &rth->u.rt_next;
 					continue;
 				}
+#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
+			/* remove all related balanced entries if necessary */
+			if ( rth->u.dst.flags & DST_BALANCED ) {
+				struct rtable* first = rth;
+				*rthp = rth->u.rt_next;
+				while ( (*rthp)->u.dst.flags & DST_BALANCED &&
+					compare_keys(&(*rthp)->fl, 
+						     &first->fl)) {
+					rth = (*rthp);
+					*rthp = rth->u.rt_next;
+					rt_free( rth );
+				}
+				rt_free(first);
+			}
+			else {
+				*rthp = rth->u.rt_next;
+				rt_free(rth);
+			}
+#else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
 				*rthp = rth->u.rt_next;
 				rt_free(rth);
+#endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
 				goal--;
 			}
 			spin_unlock_bh(&rt_hash_table[k].lock);
@@ -789,7 +831,12 @@
 
 	spin_lock_bh(&rt_hash_table[hash].lock);
 	while ((rth = *rthp) != NULL) {
+#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
+		if (!(rth->u.dst.flags & DST_BALANCED) &&
+		    compare_keys(&rth->fl, &rt->fl)) {
+#else
 		if (compare_keys(&rth->fl, &rt->fl)) {
+#endif
 			/* Put it first */
 			*rthp = rth->u.rt_next;
 			/*
@@ -1622,7 +1669,18 @@
 		goto cleanup;
 	}
 
+#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
+	if ( res->fi->fib_nhs > 1 )
+		RTprint( KERN_DEBUG"%s: balanced entry created: %d\n",
+			 __FUNCTION__,
+			 rth ); 
+#endif
+
 	rth->u.dst.flags= DST_HOST;
+#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
+	if ( res->fi->fib_nhs > 1 )
+		rth->u.dst.flags |= DST_BALANCED;
+#endif
 	if (in_dev->cnf.no_policy)
 		rth->u.dst.flags |= DST_NOPOLICY;
 	if (in_dev->cnf.no_xfrm)
@@ -1691,7 +1749,54 @@
 				   struct in_device *in_dev,
 				   u32 daddr, u32 saddr, u32 tos)
 {
+#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED 
+	struct rtable* rth;
+	unsigned char hop, hopcount, lasthop;
+	int err = -EINVAL;
+	unsigned hash;
+	hopcount = res->fi->fib_nhs;
+	lasthop = hopcount - 1;
+
+	/* distinguish between multipath and singlepath */
+	if ( hopcount < 2 ) 
+		return ip_mkroute_input_def(skb, res, fl, in_dev, daddr, 
+					    saddr, tos);
+	
+	RTprint( KERN_DEBUG"%s: entered (hopcount: %d)\n", __FUNCTION__,
+		 hopcount);
+
+	/* add all alternatives to the routing cache */
+	for ( hop = 0; hop < hopcount; ++hop ) {
+		res->nh_sel = hop;
+
+		RTprint( KERN_DEBUG"%s: entered (hopcount: %d)\n", 
+			 __FUNCTION__, hopcount);
+
+		/* create a routing cache entry */
+		err = __mkroute_input( skb, res, in_dev, daddr, saddr, tos, 
+					 &rth );
+		if ( err ) 
+			return err;
+	
+
+		/* put it into the cache */
+		hash = rt_hash_code(daddr, saddr ^ (fl->iif << 5), tos);
+		err = rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
+		if ( err ) 
+			return err;
+
+
+		/* only for the last hop the reference count is handled 
+		   outside */
+		RTprint( KERN_DEBUG"%s: balanced entry created: %d\n",
+			 __FUNCTION__, rth );
+		if ( hop == lasthop ) 
+			atomic_set(&(skb->dst->__refcnt), 1);
+	}
+	return err;
+#else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED  */ 
 	return ip_mkroute_input_def(skb, res, fl, in_dev, daddr, saddr, tos);
+#endif
 }
 
 
@@ -1882,6 +1987,7 @@
 	goto done;
 
 martian_source:
+
 	ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
 	goto e_inval;
 }
@@ -1907,6 +2013,17 @@
 		    rth->fl.fl4_fwmark == skb->nfmark &&
 #endif
 		    rth->fl.fl4_tos == tos) {
+			/* check if the route is a multipath route and if so
+			   select one of the alternatives */
+			if ( multipath_selectroute( 
+				     &rth->fl, rth, 
+				     (struct rtable**)&skb->dst) ) {
+				dst_hold(skb->dst);
+				rcu_read_unlock();
+
+				return 0;
+			}
+
 			rth->u.dst.lastuse = jiffies;
 			dst_hold(&rth->u.dst);
 			rth->u.dst.__use++;
@@ -2012,6 +2129,10 @@
 	}		
 
 	rth->u.dst.flags= DST_HOST;
+#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
+	if (res->fi->fib_nhs > 1)
+		rth->u.dst.flags |= DST_BALANCED;
+#endif
 	if (in_dev->cnf.no_xfrm)
 		rth->u.dst.flags |= DST_NOXFRM;
 	if (in_dev->cnf.no_policy)
@@ -2103,7 +2224,71 @@
 				    struct net_device *dev_out,
 				    unsigned flags)
 {
+#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
+	u32 tos = RT_FL_TOS(oldflp);
+	unsigned char hop;
+	unsigned hash;
+	int err = -EINVAL;
+	unsigned char hopcount = res->fi->fib_nhs;
+	struct rtable* rth;
+
+	RTprint( KERN_DEBUG"%s: entered (hopcount: %d, fl->oif: %d)\n", 
+		 __FUNCTION__, hopcount, fl->oif);
+
+	if (res->fi->fib_nhs > 1) {		
+		for ( hop = 0; hop < hopcount; ++hop ) {
+			struct net_device *dev2nexthop;
+			RTprint( KERN_DEBUG"%s: hop %d of %d\n", __FUNCTION__,
+				 hop, hopcount );
+
+			res->nh_sel = hop;
+
+			/* hold a work reference to the output device */
+			dev2nexthop = FIB_RES_DEV(*res);
+			dev_hold(dev2nexthop);
+
+			/** FIXME remove debug code */
+			RTprint( KERN_DEBUG"%s: balanced entry created: %d " \
+				 " (GW: %u)\n",
+				 __FUNCTION__,
+				 rth,
+				 FIB_RES_GW(*res) );
+
+			err = __mkroute_output(&rth, res, fl, oldflp, 
+						 dev2nexthop, flags);
+			if ( err != 0 ) {
+				goto cleanup;
+			}
+
+			RTprint( KERN_DEBUG"%s: created successfully %d\n", 
+				 __FUNCTION__, hop );
+			
+			hash = rt_hash_code(oldflp->fl4_dst, 
+					    oldflp->fl4_src ^ 
+					    (oldflp->oif << 5), tos);
+			err = rt_intern_hash(hash, rth, rp);
+			RTprint( KERN_DEBUG"%s: hashed  %d\n", 
+				 __FUNCTION__, hop );
+
+		cleanup:
+			/* release work reference to output device */
+			dev_put(dev2nexthop);
+			
+			if ( err != 0 ) {
+				return err;
+			}
+		}
+		RTprint( KERN_DEBUG"%s: exited loop\n", __FUNCTION__ );
+		atomic_set(&(*rp)->u.dst.__refcnt, 1);
+		return err;
+	}
+	else {
+		return ip_mkroute_output_def(rp, res, fl, oldflp, dev_out, 
+					     flags);
+	}
+#else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
 	return ip_mkroute_output_def(rp, res, fl, oldflp, dev_out, flags);
+#endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
 }
 
 /*
@@ -2316,6 +2501,16 @@
 #endif
 		    !((rth->fl.fl4_tos ^ flp->fl4_tos) &
 			    (IPTOS_RT_MASK | RTO_ONLINK))) {
+
+			/* check for multipath routes and choose one if
+			   necessary */
+			if (multipath_selectroute(flp, rth, rp)) {
+				dst_hold(&(*rp)->u.dst);
+				RT_CACHE_STAT_INC(out_hit);
+				rcu_read_unlock_bh();
+				return 0;
+			}
+
 			rth->u.dst.lastuse = jiffies;
 			dst_hold(&rth->u.dst);
 			rth->u.dst.__use++;

  parent reply	other threads:[~2004-09-14 12:51 UTC|newest]

Thread overview: 6+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2004-09-13 10:36 [RFC][PATCH 2/2] ip multipath, bk head (EXPERIMENTAL) Einar Lueck
2004-09-14  2:40 ` jamal
2004-09-14  5:42   ` David S. Miller
2004-09-14 12:14     ` jamal
2004-09-14 12:38     ` Einar Lueck [this message]
2004-09-14 15:08       ` Phil Oester

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=4146E65D.6070309@de.ibm.com \
    --to=elueck@de.ibm.com \
    --cc=davem@davemloft.net \
    --cc=hadi@cyberus.ca \
    --cc=linux-kernel@vger.kernel.org \
    --cc=lkml@einar-lueck.de \
    --cc=netdev@oss.sgi.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.