[SET 2][PATCH 2/8][bonding] Propagating master's settings to slaves

netdev.vger.kernel.org archive mirror
 help / color / mirror / Atom feed

* [SET 2][PATCH 2/8][bonding] Propagating master's settings to slaves
@ 2003-08-08 14:44 Shmulik Hen
  2003-08-08 22:01 ` jamal
  0 siblings, 1 reply; 27+ messages in thread
From: Shmulik Hen @ 2003-08-08 14:44 UTC (permalink / raw)
  To: bonding-devel, netdev

2 - Change the monitoring functions to use the new functionality.

diff -Nuarp linux-2.4.22-rc1/drivers/net/bonding/bond_main.c linux-2.4.22-rc1-devel/drivers/net/bonding/bond_main.c
--- linux-2.4.22-rc1/drivers/net/bonding/bond_main.c	Fri Aug  8 14:03:16 2003
+++ linux-2.4.22-rc1-devel/drivers/net/bonding/bond_main.c	Fri Aug  8 14:03:17 2003
@@ -2207,8 +2207,9 @@ out:
 static void bond_mii_monitor(struct net_device *master)
 {
 	bonding_t *bond = (struct bonding *) master->priv;
-	slave_t *slave, *bestslave, *oldcurrent;
+	slave_t *slave, *oldcurrent;
 	int slave_died = 0;
+	int do_failover = 0;
 
 	read_lock(&bond->lock);
 
@@ -2218,7 +2219,6 @@ static void bond_mii_monitor(struct net_
 	 * program could monitor the link itself if needed.
 	 */
 
-	bestslave = NULL;
 	slave = (slave_t *)bond;
 
 	read_lock(&bond->ptrlock);
@@ -2226,8 +2226,6 @@ static void bond_mii_monitor(struct net_
 	read_unlock(&bond->ptrlock);
 
 	while ((slave = slave->prev) != (slave_t *)bond) {
-		/* use updelay+1 to match an UP slave even when updelay is 0 */
-		int mindelay = updelay + 1;
 		struct net_device *dev = slave->dev;
 		int link_state;
 		u16 old_speed = slave->speed;
@@ -2238,14 +2236,7 @@ static void bond_mii_monitor(struct net_
 		switch (slave->link) {
 		case BOND_LINK_UP:	/* the link was up */
 			if (link_state == BMSR_LSTATUS) {
-				/* link stays up, tell that this one
-				   is immediately available */
-				if (IS_UP(dev) && (mindelay > -2)) {
-					/* -2 is the best case :
-					   this slave was already up */
-					mindelay = -2;
-					bestslave = slave;
-				}
+				/* link stays up, nothing more to do */
 				break;
 			}
 			else { /* link going down */
@@ -2285,6 +2276,7 @@ static void bond_mii_monitor(struct net_
 					    (bond_mode == BOND_MODE_8023AD)) {
 						bond_set_slave_inactive_flags(slave);
 					}
+
 					printk(KERN_INFO
 						"%s: link status definitely down "
 						"for interface %s, disabling it",
@@ -2301,12 +2293,10 @@ static void bond_mii_monitor(struct net_
 						bond_alb_handle_link_change(bond, slave, BOND_LINK_DOWN);
 					}
 
-					write_lock(&bond->ptrlock);
-					if (slave == bond->current_slave) {
-						/* find a new interface and be verbose */
-						reselect_active_interface(bond);
+					if (slave == oldcurrent) {
+						do_failover = 1;
 					}
-					write_unlock(&bond->ptrlock);
+
 					slave_died = 1;
 				} else {
 					slave->delay--;
@@ -2321,13 +2311,6 @@ static void bond_mii_monitor(struct net_
 					master->name,
 					(downdelay - slave->delay) * miimon,
 					dev->name);
-
-				if (IS_UP(dev) && (mindelay > -1)) {
-					/* -1 is a good case : this slave went
-					   down only for a short time */
-					mindelay = -1;
-					bestslave = slave;
-				}
 			}
 			break;
 		case BOND_LINK_DOWN:	/* the link was down */
@@ -2397,26 +2380,12 @@ static void bond_mii_monitor(struct net_
 						bond_alb_handle_link_change(bond, slave, BOND_LINK_UP);
 					}
 
-					write_lock(&bond->ptrlock);
-					if ( (bond->primary_slave != NULL)
-					  && (slave == bond->primary_slave) )
-						reselect_active_interface(bond); 
-					write_unlock(&bond->ptrlock);
-				}
-				else
+					if ((oldcurrent == NULL) ||
+					    (slave == bond->primary_slave)) {
+						do_failover = 1;
+					}
+				} else {
 					slave->delay--;
-
-				/* we'll also look for the mostly eligible slave */
-				if (bond->primary_slave == NULL)  {
-				    if (IS_UP(dev) && (slave->delay < mindelay)) {
-					mindelay = slave->delay;
-					bestslave = slave;
-				    } 
-				} else if ( (IS_UP(bond->primary_slave->dev))  || 
-				          ( (!IS_UP(bond->primary_slave->dev))  && 
-				          (IS_UP(dev) && (slave->delay < mindelay)) ) ) {
-					mindelay = slave->delay;
-					bestslave = slave;
 				}
 			}
 			break;
@@ -2435,26 +2404,17 @@ static void bond_mii_monitor(struct net_
 
 	} /* end of while */
 
-	/* 
-	 * if there's no active interface and we discovered that one
-	 * of the slaves could be activated earlier, so we do it.
-	 */
-	read_lock(&bond->ptrlock);
-	oldcurrent = bond->current_slave;
-	read_unlock(&bond->ptrlock);
+	if (do_failover) {
+		write_lock(&bond->ptrlock);
 
-	/* no active interface at the moment or need to bring up the primary */
-	if (oldcurrent == NULL)  { /* no active interface at the moment */
-		if (bestslave != NULL) { /* last chance to find one ? */
-			write_lock(&bond->ptrlock);
-			change_active_interface(bond, bestslave);
-			write_unlock(&bond->ptrlock);
-		} else if (slave_died) {
-			/* print this message only once a slave has just died */
+		reselect_active_interface(bond);
+		if (oldcurrent && !bond->current_slave) {
 			printk(KERN_INFO
 				"%s: now running without any active interface !\n",
 				master->name);
 		}
+
+		write_unlock(&bond->ptrlock);
 	}
 
 	read_unlock(&bond->lock);
@@ -2472,9 +2432,10 @@ static void bond_mii_monitor(struct net_
 static void loadbalance_arp_monitor(struct net_device *master)
 {
 	bonding_t *bond;
-	slave_t *slave;
+	slave_t *slave, *oldcurrent;
 	int the_delta_in_ticks =  arp_interval * HZ / 1000;
 	int next_timer = jiffies + (arp_interval * HZ / 1000);
+	int do_failover = 0;
 
 	bond = (struct bonding *) master->priv; 
 	if (master->priv == NULL) {
@@ -2498,6 +2459,10 @@ static void loadbalance_arp_monitor(stru
 
 	read_lock(&bond->lock);
 
+	read_lock(&bond->ptrlock);
+	oldcurrent = bond->current_slave;
+	read_unlock(&bond->ptrlock);
+
 	/* see if any of the previous devices are up now (i.e. they have
 	 * xmt and rcv traffic). the current_slave does not come into
 	 * the picture unless it is null. also, slave->jiffies is not needed
@@ -2524,21 +2489,19 @@ static void loadbalance_arp_monitor(stru
 				 * current_slave being null after enslaving
 				 * is closed.
 				 */
-				write_lock(&bond->ptrlock);
-				if (bond->current_slave == NULL) {
+				if (oldcurrent == NULL) {
 					printk(KERN_INFO
 						"%s: link status definitely up "
 						"for interface %s, ",
 						master->name,
 						slave->dev->name);
-					reselect_active_interface(bond); 
+					do_failover = 1;
 				} else {
 					printk(KERN_INFO
 						"%s: interface %s is now up\n",
 						master->name,
 						slave->dev->name);
 				}
-				write_unlock(&bond->ptrlock);
 			} 
 		} else {
 			/* slave->link == BOND_LINK_UP */
@@ -2561,11 +2524,9 @@ static void loadbalance_arp_monitor(stru
 				       master->name,
 				       slave->dev->name);
 
-				write_lock(&bond->ptrlock);
-				if (slave == bond->current_slave) {
-					reselect_active_interface(bond);
+				if (slave == oldcurrent) {
+					do_failover = 1;
 				}
-				write_unlock(&bond->ptrlock);
 			}
 		} 
 
@@ -2579,6 +2540,19 @@ static void loadbalance_arp_monitor(stru
 		if (IS_UP(slave->dev)) {
 			arp_send_all(slave);
 		}
+	}
+
+	if (do_failover) {
+		write_lock(&bond->ptrlock);
+
+		reselect_active_interface(bond);
+		if (oldcurrent && !bond->current_slave) {
+			printk(KERN_INFO
+				"%s: now running without any active interface !\n",
+				master->name);
+		}
+
+		write_unlock(&bond->ptrlock);
 	}
 
 	read_unlock(&bond->lock);

-- 
| Shmulik Hen   Advanced Network Services  |
| Israel Design Center, Jerusalem          |
| LAN Access Division, Platform Networking |
| Intel Communications Group, Intel corp.  |

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [SET 2][PATCH 2/8][bonding] Propagating master's settings to slaves
  2003-08-08 14:44 [SET 2][PATCH 2/8][bonding] Propagating master's settings to slaves Shmulik Hen
@ 2003-08-08 22:01 ` jamal
  0 siblings, 0 replies; 27+ messages in thread
From: jamal @ 2003-08-08 22:01 UTC (permalink / raw)
  To: shmulik.hen; +Cc: bonding-devel, netdev

Shmulik,

Some of this bonding stuff is pretty scary. Lotsa policies in the 
kernel and communication seems to be centred around /proc.
Shouldnt policies on failover be really driven from user space?
Also shouldnt communication be using something like netlink?

cheers,
jamal

On Fri, 2003-08-08 at 10:44, Shmulik Hen wrote:
> 2 - Change monitoring function use the new functionality.
> 

^ permalink raw reply	[flat|nested] 27+ messages in thread

* RE: [SET 2][PATCH 2/8][bonding] Propagating master's settings to slaves
@ 2003-08-09 10:29 Hen, Shmulik
  2003-08-11  2:51 ` jamal
  0 siblings, 1 reply; 27+ messages in thread
From: Hen, Shmulik @ 2003-08-09 10:29 UTC (permalink / raw)
  To: hadi; +Cc: bonding-devel, netdev

> -----Original Message-----
> From: jamal [mailto:hadi@cyberus.ca]
> Sent: Saturday, August 09, 2003 1:01 AM
> To: Hen, Shmulik
> Cc: bonding-devel@lists.sourceforge.net; netdev@oss.sgi.com
> Subject: Re: [SET 2][PATCH 2/8][bonding] Propagating master's settings
> to slaves
> 
> Shmulik,
> 
> Some of this bonding stuff is pretty scary. Lotsa policies in the 
> kernel and communication seems to be centred around /proc.
> Shouldnt policies on failover be really driven from user space?
> Also shouldnt communication be using something like netlink?
> 
> cheers,
> jamal
> 
> On Fri, 2003-08-08 at 10:44, Shmulik Hen wrote:
> > 2 - Change monitoring function use the new functionality.
> > 
> 

Not sure I fully understood the concerns above, but I'll try
to explain what the change was all about.

By monitoring, I meant the 3 timer functions running in bonding
to monitor link changes and act once a link fail/recovery is
detected. The old code used to do all the activity related to
changing the current active slave separately in each timer
function and it seemed redundant since it was basically the
same thing repeated 3 times. Instead, we thought it would be
best if we put that into 3 new functions - reselect_active,
find_best_slave and change_active that does all the actual stuff
of swapping an old current with the new one.

The change we did in /proc was to reduce the amount of data
extracted each time the proc entry is polled. Instead of dumping
all the data of all the bond devices that exist, each bond returns
just data that is relevant to itself.

In the long term, the drive is to move any *smart* code done in
the config application into the driver itself and be left with
the smallest, most compact application as possible. This is the
trend we've seen in the VLAN config app, and the bridge module.
All the "brain" is in the kernel module and very little should be
done in the application.

	Shmulik.

^ permalink raw reply	[flat|nested] 27+ messages in thread

* RE: [SET 2][PATCH 2/8][bonding] Propagating master's settings to slaves
  2003-08-09 10:29 [SET 2][PATCH 2/8][bonding] Propagating master's settings to slaves Hen, Shmulik
@ 2003-08-11  2:51 ` jamal
  2003-08-11 10:08   ` Shmulik Hen
  0 siblings, 1 reply; 27+ messages in thread
From: jamal @ 2003-08-11  2:51 UTC (permalink / raw)
  To: Hen, Shmulik; +Cc: bonding-devel, netdev

On Sat, 2003-08-09 at 06:29, Hen, Shmulik wrote:

> > 
> 
> Not sure I fully understood the concerns above, but I'll try
> to explain what the change was all about.
> 

I think it wasnt the one specific change rather a few posted that i
spent a minute or two staring at. And you confirm my suspicion below.

[..]

> 
> In the lonf term, the drive is to move any *smart* code done in
> the config application into the driver itself and be left with
> the smallest, most compact application as possible. This is the
> trend we've seen in the VLAN config app, and the bridge module.
> All the "brain" is in the kernel module and very little should be
> done in the application.

I am not very familiar with the bonding code although i think you guys
have been doing very good work since you got involved.
In any case the approach you state above is wrong. Actually Stephen
Hemminger and I discussed this for bridging. Post 2.6 he is going to
remove a lot of the bridge policy (or "brain" as you call it) out of the
kernel. Netlink for kernel<->userspace not /proc. I think we should head
towards that direction so we can have more sophisticated management.

Thoughts?

cheers,
jamal

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [SET 2][PATCH 2/8][bonding] Propagating master's settings to slaves
  2003-08-11  2:51 ` jamal
@ 2003-08-11 10:08   ` Shmulik Hen
  2003-08-11 13:47     ` jamal
  0 siblings, 1 reply; 27+ messages in thread
From: Shmulik Hen @ 2003-08-11 10:08 UTC (permalink / raw)
  To: hadi; +Cc: bonding-devel, netdev

On Monday 11 August 2003 05:51 am, you wrote:
> On Sat, 2003-08-09 at 06:29, Hen, Shmulik wrote:
> > Not sure I fully understood the concerns above, but I'll try
> > to explain what the change was all about.
>
> I think it wasnt the one specific change rather a few posted that i
> spent a minute or two staring at. And you confirm my suspicion
> below.

I probably didn't make myself clear - by "understood" I wanted to say 
I probably didn't get the *meaning* of the whole sentence, and not 
"I don't understand why you are concerned".
(English is not my native tongue :) ).

> I am not very familiar with the bonding code although i think you
> guys have been doing very good work since you got involved.
> In any case the approach you state above is wrong. Actually Stephen
> Hemminger and I discussed this for bridging. Post 2.6 he is going
> to remove a lot of the bridge policy (or "brain" as you call it)
> out of the kernel. Netlink for kernel<->userspace not /proc. I
> think we should head towards that direction so we can have more
> sophisticated management.

I, on the other hand, am not familiar with the bridging code and I 
don't know what it actually does internally, I just noticed that 
regarding config operations, most of the code is done at the kernel 
level as response to ioctl commands.

I'll try to clarify how that relates to bonding. The ifenslave utility 
has very little "brain" as it is, and all it knows how to do 
currently is enslave/release slave devices and change the current 
active slave. It also has some ability to extract status info from 
the bond and present it nicely for a user.

The "brain" I was referring to in the bonding module itself has to do 
with timer functions monitoring link status or Tx/Rx activity of the 
slaves, and once a faulty slave is detected, switch to use another 
one instead according to the teaming mode. There are no large scale 
decision making nor major CPU consuming computations that are part of 
the continuous operation of the module that is basically handle Rx/Tx 
on slaves.

The bonding module doesn't need to access any special info that is 
normally available to user space apps. What it does need is very 
short response time and accessibility to kernel internal resources 
like net devices info to make it a high availability intermediate 
driver.

Trying to move that from the kernel module into the config application 
seems to be a very hard task to implement since we'll have to find a 
way to make the application constantly aware to the specifics like 
current topology, slave-to-bond affiliation, updated status of each 
slave, etc., etc. It would also mean that the driver will have to 
wait for the application to tell it what to do each time it needs a 
decision, and by that we'll surely suffer some performance hit and 
probably get low availability or temporary loss of communications.

Going back to the first problem, discussions on the bonding 
development list pointed that it might be better if we moved the 
configuration-time decisions making to the driver, so the application 
wouldn't have to deal with situations like:
1) get the master's MTU settings, master's teaming mode, communication 
   version, backwards compatibility issues, etc.
2) figure if need to set MTU to slave according to all that,
3) try to set that on the new slave being added,
4) if not successful, decide if may enslave anyway or,
5) maybe undo all previous settings already done to the slave 
   (needs a way to retrieve old values)
6) decide if should go on or fail any further operations
7) repeat the above for all other settings

On the other hand, what we want to get to is something more like:
1) tell bonding to add slave X to bond Y,
2) watch for error returns,
3) print a nice message according to the type of the error.

While the driver, already aware of all possible relevant data, makes 
all decisions, performs settings, handles compatibility issues, 
checks for failures at each stage, handles any undo steps, and return 
success/error values accordingly.

>
> Thoughts?

Mostly explanations :)

Is there anywhere I can see what you referred to as discussions with 
Stephen Hemminger ? I would really like to know how and what could 
also be applied to bonding.

	Regards,
	Shmulik.

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [SET 2][PATCH 2/8][bonding] Propagating master's settings to slaves
  2003-08-11 10:08   ` Shmulik Hen
@ 2003-08-11 13:47     ` jamal
  2003-08-11 14:07       ` [Bonding-devel] Re: [SET 2][PATCH 2/8][bonding] Propagating master's settings toslaves Laurent DENIEL
  0 siblings, 1 reply; 27+ messages in thread
From: jamal @ 2003-08-11 13:47 UTC (permalink / raw)
  To: shmulik.hen; +Cc: bonding-devel, netdev

On Mon, 2003-08-11 at 06:08, Shmulik Hen wrote:
> On Monday 11 August 2003 05:51 am, you wrote:
> > On Sat, 2003-08-09 at 06:29, Hen, Shmulik wrote:
> > > Not sure I fully understood the concerns above, but I'll try
> > > to explain what the change was all about.
> >
> > I think it wasnt the one specific change rather a few posted that i
> > spent a minute or two staring at. And you confirm my suspicion
> > below.
> 
> I probably didn't make myself clear - by "understood" I wanted to say 
> I probably didn't get the *meaning* of the whole sentence , and not 
> "I don't under stand why you are concerned".
> (English is not my native tongue :) ).
> 
> > I am not very familiar with the bonding code although i think you
> > guys have been doing very good work since you got involved.
> > In any case the approach you state above is wrong. Actually Stephen
> > Hemminger and I discussed this for bridging. Post 2.6 he is going
> > to remove a lot of the bridge policy (or "brain" as you call it)
> > out of the kernel. Netlink for kernel<->userspace not /proc. I
> > think we should head towards that direction so we can have more
> > sophisticated management.
> 
> I, on the other hand, am not familiar with the bridging code and I 
> don't know what it actually does internally, I just noticed that 
> regarding config operations, most of the code is done at the kernel 
> level as response to ioctl commands.
> 

Theres two main components to it: a control protocol and a forwarding
path. The control protocol known as STP tells the forwarding path how to
behave. Essentially, STP carries the policy implemented by the
forwarding path. This is the same breakdown to say routing protocols
like OSPF and regular forwarding path. At the moment STP sits in the
kernel. STP is really the "brains". 

> I'll try to clarify how that relates to bonding. The ifenslave utility 
> has very little "brain" as it is, and all it knows how to do 
> currently is enslave/release slave devices and change the current 
> active slave. It also has some ability to extract status info from 
> the bond and present it nicely for a user.
> 
> The "brain" I was referring to in the bonding module itself has to do 
> with timer functions monitoring link status or Tx/Rx activity of the 
> slaves, and once a faulty slave is detected, switch to use another 
> one instead according to the teaming mode. 
> There are no large scale 
> decision making nor major CPU consuming computations that are part of 
> the continuous operation of the module that is basically handle Rx/Tx 
> on slaves.
> 
> The bonding module doesn't need to access any special info that is 
> normally available to user space apps. What it does need is very 
> short response time and accessibility to kernel internal resources 
> like net devices info to make it a high availability intermediate 
> driver.
> 
> Trying to move that from the kernel module into the config application 
> seems to be a very hard task to implement since we'll have to find a 
> way to make the application constantly aware to the specifics like 
> current topology, slave-to-bond affiliation, updated status of each 
> slave, etc., etc. It would also mean that the driver will have to 
> wait for the application to tell it what to do each time it needs a 
> decision, and by that we'll surely suffer some performance hit and 
> probably get low availability or temporary loss of communications.
> 

Not at all. If you let some app control this i am sure whoever writes
the app has vested interest in getting fast failovers etc.

> Going back to the first problem, discussions on the bonding 
> development list pointed that it might be better if we moved the 
> configuration-time decisions making to the driver, so the application 
> wouldn't have to deal with situations like:
> 1) get the master's MTU settings, master's teaming mode, communication 
>    version, backwards compatibility issues, etc.
> 2) figure if need to set MTU to slave according to all that,
> 3) try to set that on the new slave being added,
> 4) if not successfull, decide if may enslave anyway or,
> 5) maybe undo all previous settings already done to the slave 
>    (needs a way to retrieve old values)
> 6) decide if should go on or fail any further operations
> 7) repeat the above for all other settings
> 
> On the other hand, what we want to get to is something more like:
> 1) tell bonding to add slave X to bond Y,
> 2) watch for error returns,
> 3) print a nice message according to the type of the error.
> 

Dont you think that anything thats "rich" like you list above should
stay out of the kernel? In any case, if you have a controlling app, you
could do more interesting things; example add or delete routes, firewall
rules, qos policies etc which all have very strong correlation with
availability - these are examples btw, not an exhaustive list. If all
you are satisfied with is link management alone, then by all means 
hardcoding behavior into the kernel is fine. I dont think it is
sufficient.

> While the driver, already aware of all possible relevant data, makes 
> all decisions, performs settings, handles compatibility issues, 
> checks for failures at each stage, handles any undo steps, and return 
> success/error values accordingly.
> 

Driver - actually bonding - should have minimal failover policy built in
for the lazy; example what i used to know about bonding - failover to
the next link, maybe send a grat arp etc. If I want more than basic,
then send me netlink events to user space and let me control how it
goes. Maybe i dont want to go to the second link but rather the 4th
link.

> >
> > Thoughts?
> 
> Mostly explanations :)
> 
> Is there anywhere I can see what you refereed to as discussions with 
> Stephen Hemminger ? I would really like to know how and what could 
> also be applied to bonding.
> 

Basically what i described at the top. Move any "richness" to user
space.

cheers,
jamal   

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [Bonding-devel] Re: [SET 2][PATCH 2/8][bonding] Propagating master's settings toslaves
  2003-08-11 13:47     ` jamal
@ 2003-08-11 14:07       ` Laurent DENIEL
  2003-08-11 14:20         ` Shmulik Hen
  0 siblings, 1 reply; 27+ messages in thread
From: Laurent DENIEL @ 2003-08-11 14:07 UTC (permalink / raw)
  To: hadi; +Cc: shmulik.hen, bonding-devel, netdev

jamal a écrit :
> 
> > Trying to move that from the kernel module into the config application
> > seems to be a very hard task to implement since we'll have to find a
> > way to make the application constantly aware to the specifics like
> > current topology, slave-to-bond affiliation, updated status of each
> > slave, etc., etc. It would also mean that the driver will have to
> > wait for the application to tell it what to do each time it needs a
> > decision, and by that we'll surely suffer some performance hit and
> > probably get low availability or temporary loss of communications.
> >
> 
> Not at all. If you let some app control this i am sure whoever writes
> the app has vested interest in getting fast failovers etc.
> 

> 
> Basically what i described at the top. Move any "richness" to user
> space.

HP/Compaq/Digital used to have the same approach with their Netrain
implementation, and from one release of Tru64 UNIX to another, they
could no longer support resolution ala milli-seconds but only seconds
due to the move of such "richness" to user space (among other things). 
I am not saying that doing so on Linux will result to the same, but 
a minimal failover policy shall remain in the kernel for performance 
reason ... (or a user space facility could exist to *configure* such
policy but without direct interaction with user space when the kernel
has to decide).

Laurent

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [Bonding-devel] Re: [SET 2][PATCH 2/8][bonding] Propagating master's  settings toslaves
  2003-08-11 14:07       ` [Bonding-devel] Re: [SET 2][PATCH 2/8][bonding] Propagating master's settings toslaves Laurent DENIEL
@ 2003-08-11 14:20         ` Shmulik Hen
  2003-08-11 14:34           ` jamal
  0 siblings, 1 reply; 27+ messages in thread
From: Shmulik Hen @ 2003-08-11 14:20 UTC (permalink / raw)
  To: Laurent DENIEL, hadi; +Cc: bonding-devel, netdev

On Monday 11 August 2003 05:07 pm, Laurent DENIEL wrote:
> HP/Compaq/Digital used to have the same approach with their Netrain
> implementation, and from one release of Tru64 UNIX to another, they
> could no longer support resolution ala milli-seconds but only
> seconds due to the move of such "richness" to user space (among
> other things). I am not saying that doing so on Linux will result
> to the same, but a minimal failover policy shall remain in the
> kernel for performance reason ... (or a user space facility could
> exist to *configure* such policy but without direct interaction
> with user space when the kernel has to decide).
>
> Laurent

That was my point. Thank you for putting it into better words.
If high availability and fast failovers are what's needed, why move it 
out of kernel space and put it in an application ? How fast could it 
work compared to a kernel module ? Why need an extra piece of code 
running in user space (daemon?) to monitor a module when the module 
can do that itself ?

If smarter behavior is needed (e.g. falling to eth4 instead of eth1 
when eth0 fails), we can add some priority mechanism to the driver to 
do that when it decides to swap. Otherwise, we'll be developing 
applications from now on, not the Linux kernel :)

	Shmulik.

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [Bonding-devel] Re: [SET 2][PATCH 2/8][bonding] Propagating master's  settings toslaves
  2003-08-11 14:20         ` Shmulik Hen
@ 2003-08-11 14:34           ` jamal
  2003-08-11 16:25             ` Shmulik Hen
  0 siblings, 1 reply; 27+ messages in thread
From: jamal @ 2003-08-11 14:34 UTC (permalink / raw)
  To: shmulik.hen; +Cc: Laurent DENIEL, bonding-devel, netdev

On Mon, 2003-08-11 at 10:20, Shmulik Hen wrote:
> On Monday 11 August 2003 05:07 pm, Laurent DENIEL wrote:
> > HP/Compaq/Digital used to have the same approach with their Netrain
> > implementation, and from one release of Tru64 UNIX to another, they
> > could no longer support resolution ala milli-seconds but only
> > seconds due to the move of such "richness" to user space (among
> > other things). I am not saying that doing so on Linux will result
> > to the same, but a minimal failover policy shall remain in the
> > kernel for performance reason ... (or a user space facility could
> > exist to *configure* such policy but without direct interaction
> > with user space when the kernel has to decide).
> >
> > Laurent
> 
> That was my point. Thank you for putting it into better words.
> If high availbilty and fast failovers are what's needed, why move it 
> out of kernel space and put it in an application ? How fast could it 
> work compared to a kernel module ? Why need an extra piece of code 
> running in user space (daemon?) to monitor a module when the module 
> can do that itself ?
>
> If smarter behavior is needed (e.g. falling to eth4 instead of eth1 
> when eth0 fails), we can add some priority mechanism to the driver to 
> do that when it decides to swap. Otherwise, we'll be devleoping 
> applications from now on, not the Linux kernel :)
> 

So how many smart things are you going to add to the driver? ;->
Do you wanna add the qos policy changeover as well? What about route
changes, firewalling etc. What about slicing bread and adding butter?
Where do you draw the line?
BTW, I dont understand why it would slow down failover; sure it will a
tiny bit because you have to cross user space to lookup the policy.
Maybe this is the part that i havent made clear, heres an example:
- User space gets notified link eth0 went down
- User space looks up a policy config on what to do when eth0 goes down
- user space executes commands which may include telling kernel to 
move activity to eth1.
 
Note: I agree on a minimal failover policy staying in the kernel; very
basic stuff like what bonding used to do (may still do, dont know).

cheers,
jamal

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [Bonding-devel] Re: [SET 2][PATCH 2/8][bonding] Propagating master's  settings toslaves
  2003-08-11 14:34           ` jamal
@ 2003-08-11 16:25             ` Shmulik Hen
  2003-08-11 16:43               ` Jeff Garzik
  2003-08-12  2:31               ` jamal
  0 siblings, 2 replies; 27+ messages in thread
From: Shmulik Hen @ 2003-08-11 16:25 UTC (permalink / raw)
  To: hadi; +Cc: Laurent DENIEL, bonding-devel, netdev

On Monday 11 August 2003 05:34 pm, jamal wrote:
> So how many smart things are you going to add to the driver? ;->
> Do you wanna add the qos policy changeover as well? What about
> route changes, firewalling etc. What about sliceing bread and
> adding butter? Where do you draw the line?
> BTW, I dont understand why it would slow down failover; sure it
> will a tiny bit because you have to cross user space to lookup the
> policy. Maybe this is the part that i havent made clear, heres an
> example: - User space gets notified link eth0 went down
> - User space looks up a policy config on what to do when eth0 goes
> down - user space executes commands which may include telling
> kernel to move activity to eth1.
>
> Note: I agree on a minimal failover policy staying in the kernel;
> very basic stuff like what bonding used to do (may still do, dont
> know).
>
> cheers,
> jamal

Why have any kernel code other than device drivers in the first place 
?
Why not move all the TCP/IP stack out of kernel space and put it in an 
application ? Let's have the entire ARP mechanism in an application 
and let it handle everything from routing tables management to arp 
negotiation while the kernel will only know how to create arp packets 
that it gets from that app and send them away ? It doesn't need to 
have the know how.
Say we do things your way and use the notification mechanism, how 
long do you think it's going to take for the whole operation to 
finish taking into consideration how the kernel runs user space 
applications in comparison with kernel code? what happens when the 
system is heavily loaded ? What happens if the application dies for 
some reason ? 
Why should the bonding driver even care about routes or firewalling ?
It's only meant to group several physical ethernet devices and group 
them under one logical device to handle teaming solutions.

-- 
| Shmulik Hen   Advanced Network Services  |
| Israel Design Center, Jerusalem          |
| LAN Access Division, Platform Networking |
| Intel Communications Group, Intel corp.  |

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [Bonding-devel] Re: [SET 2][PATCH 2/8][bonding] Propagating master's settings toslaves
  2003-08-11 16:25             ` Shmulik Hen
@ 2003-08-11 16:43               ` Jeff Garzik
  2003-08-11 17:31                 ` [Bonding-devel] Re: [SET 2][PATCH 2/8][bonding] Propagating master'ssettings toslaves Laurent DENIEL
                                   ` (2 more replies)
  2003-08-12  2:31               ` jamal
  1 sibling, 3 replies; 27+ messages in thread
From: Jeff Garzik @ 2003-08-11 16:43 UTC (permalink / raw)
  To: shmulik.hen, hadi; +Cc: Laurent DENIEL, bonding-devel, netdev

The answer is, like life, it's a balance.

As a general rule, we do prefer to move all code possible out of the 
Linux kernel.  We have even created "initramfs", which for 2.7, will be 
used as a vehicle to move code from the kernel to userspace, that 
previously had to be in the kernel only because it was a task that "had 
to be performed at boot time".

However, one must consider
(1) does moving code to userspace create any security holes?
(2) does moving code to userspace dramatically increase the number of 
context switches?
(3) does moving code to userspace violate some atomicity that being 
inside the kernel guarantees?

In practice, #3 is the showstopper that occurs most often.

This is why I push for a "bonding-utils" package from Jay.... because of 
the general rule above:  put it into userspace, where possible.

	Jeff

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [Bonding-devel] Re: [SET 2][PATCH 2/8][bonding] Propagating master'ssettings toslaves
  2003-08-11 16:43               ` Jeff Garzik
@ 2003-08-11 17:31                 ` Laurent DENIEL
  2003-08-11 17:43                   ` Jeff Garzik
  2003-08-12  2:32                   ` jamal
  2003-08-11 21:27                 ` [Bonding-devel] Re: [SET 2][PATCH 2/8][bonding] Propagating master's settings toslaves Mark Huth
  2003-08-11 21:41                 ` Jay Vosburgh
  2 siblings, 2 replies; 27+ messages in thread
From: Laurent DENIEL @ 2003-08-11 17:31 UTC (permalink / raw)
  To: Jeff Garzik; +Cc: shmulik.hen, hadi, bonding-devel, netdev

Jeff Garzik a écrit :
> 
> The answer is, like life, it's a balance.
> 
> As a general rule, we do prefer to move all code possible out of the
> Linux kernel.  We have even created "initramfs", which for 2.7, will be
> used as a vehicle to move code from the kernel to userspace, that
> previously had to be in the kernel only because it was a task that "had
> to be performed at boot time".
> 
> However, one must consider
> (1) does moving code to userspace create any security holes?
> (2) does moving code to userspace dramatically increase the number of
> context switches?
> (3) does moving code to userspace violate some atomicity that being
> inside the kernel guarantees?

You forgot one important aspect : 

  (4) does moving code to userspace break compatibility (or behavior) 
      with user land applications (or systems)

What can one do if say, kernel 2.[4|5] switches the NIC in 10 mseconds 
while kernel 2.7 with user land daemon switches in a few seconds ? 
nothing but stay with the previous version or fork the driver development ;-(

But I agree that it is interesting to do some stuff at user land, and if 
the bonding had an option to disable the automatic failover policy, 
this could be implemented with trigger towards user land application that 
could use an ioctl call to switch to the appropriate NIC according to 
the user land configuration ...

But the fast and simple failover policy shall remain in kernel code.

Laurent

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [Bonding-devel] Re: [SET 2][PATCH 2/8][bonding] Propagating master'ssettings toslaves
  2003-08-11 17:31                 ` [Bonding-devel] Re: [SET 2][PATCH 2/8][bonding] Propagating master'ssettings toslaves Laurent DENIEL
@ 2003-08-11 17:43                   ` Jeff Garzik
  2003-08-12  6:31                     ` Laurent DENIEL
  2003-08-12  2:32                   ` jamal
  1 sibling, 1 reply; 27+ messages in thread
From: Jeff Garzik @ 2003-08-11 17:43 UTC (permalink / raw)
  To: Laurent DENIEL; +Cc: shmulik.hen, hadi, bonding-devel, netdev

Laurent DENIEL wrote:
> Jeff Garzik a écrit :
> 
>>The answer is, like life, it's a balance.
>>
>>As a general rule, we do prefer to move all code possible out of the
>>Linux kernel.  We have even created "initramfs", which for 2.7, will be
>>used as a vehicle to move code from the kernel to userspace, that
>>previously had to be in the kernel only because it was a task that "had
>>to be performed at boot time".
>>
>>However, one must consider
>>(1) does moving code to userspace create any security holes?
>>(2) does moving code to userspace dramatically increase the number of
>>context switches?
>>(3) does moving code to userspace violate some atomicity that being
>>inside the kernel guarantees?
> 
> 
> You forgot one important aspect : 
> 
>   (4) does moving code to userspace break compatibility (or behavior) 
>       with user land applications (or systems)

I agree... assuming these userland interfaces are fairly standard and 
widely deployed.


> What can one do if say, kernel 2.[4|5] switches the NIC in 10 mseconds 
> while kernel 2.7 with user land daemon switches in a few seconds ? 
> nothing but stay with the previous version or fork the driver development ;-(

This is a silly example.  If that happens in practice, then that is a 
bug in the configuration of the userland daemon, or a bug in the 
kernel<->userland ABI.


> But I agree that it is interesting to do some stuff at user land, and if 
> the bonding had an option to disable the automatic failover policy, 
> this could be implemented with trigger towards user land application that 
> could use an ioctl call to switch to the appropriate NIC according to 
> the user lan configuration ...

Remember, ioctls are bad.  :)  Unix design mistake.


> But the fast and simple failover policy shall remain in kernel code.

I would not make such absolute predictions, especially about policy :)

	Jeff

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [Bonding-devel] Re: [SET 2][PATCH 2/8][bonding] Propagating master's settings toslaves
  2003-08-11 16:43               ` Jeff Garzik
  2003-08-11 17:31                 ` [Bonding-devel] Re: [SET 2][PATCH 2/8][bonding] Propagating master'ssettings toslaves Laurent DENIEL
@ 2003-08-11 21:27                 ` Mark Huth
  2003-08-11 21:41                 ` Jay Vosburgh
  2 siblings, 0 replies; 27+ messages in thread
From: Mark Huth @ 2003-08-11 21:27 UTC (permalink / raw)
  To: Jeff Garzik; +Cc: shmulik.hen, hadi, Laurent DENIEL, bonding-devel, netdev

Jeff Garzik wrote:

> The answer is, like life, it's a balance.
>
> As a general rule, we do prefer to move all code possible out of the 
> Linux kernel.  We have even created "initramfs", which for 2.7, will 
> be used as a vehicle to move code from the kernel to userspace, that 
> previously had to be in the kernel only because it was a task that 
> "had to be performed at boot time".
>
> However, one must consider
> (1) does moving code to userspace create any security holes?
> (2) does moving code to userspace dramatically increase the number of 
> context switches?
> (3) does moving code to userspace violate some atomicity that being 
> inside the kernel guarantees?
>
> In practice, #3 is the showstopper that occurs most often.
>
> This is why I push for a "bonding-utils" package from Jay.... because 
> of the general rule above:  put it into userspace, where possible.
>
>     Jeff

Yes, the answer is balance - the complicated, but non-time critical 
things should go into applications.  However, we need to retain a basic 
ability to perform the failover according to pre-configured rules within 
the kernel.  Many of our customers use bonding to provide a redundant 
network path through the wires and switches for what turn out to be 
heavily network dependant applications.  In many cases, the systems do 
not have a local disk, and everything is obtained via say an NFS mount. 
 When the MAC breaks, you may not be able to run userland!

In HA systems at this level, guarding against the failure of a redundant 
hardware component, we find that it is very helpful for the kernel to be 
able to perform a variety of simple, pre-programmed operations without 
resort to userland - this keeps the interacting fault domains smaller. 
 Sure, the decisions about how to configure the behaviors - that is the 
policies - belong in applications.  But the response to an event which 
triggers the actions may well _need_ to be in the kernel.

While the issue may not be so much one of speed - the applications may 
well respond in an adequate manner, depending on design and load - the 
issue of the amount of the system that must work for recovery is quite 
important when trying to push system availabilities into the mythical 5 
9's plus region.  For an application to run, the system has to be able 
to fork and exec, access the file system, allocate memory, etc.  Sure, 
through careful configuration it is possible to reduce the transient 
resources required (run a pre-loaded/locked daemon, make sure the files 
are locally cached, etc) then the configuration and testing are 
complicated.  It worked fine in the lab, because a resource we didn't 
realize was critical, never got pushed out of the dcache, for example.

Mark Huth

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [Bonding-devel] Re: [SET 2][PATCH 2/8][bonding] Propagating master's settings toslaves
  2003-08-11 16:43               ` Jeff Garzik
  2003-08-11 17:31                 ` [Bonding-devel] Re: [SET 2][PATCH 2/8][bonding] Propagating master'ssettings toslaves Laurent DENIEL
  2003-08-11 21:27                 ` [Bonding-devel] Re: [SET 2][PATCH 2/8][bonding] Propagating master's settings toslaves Mark Huth
@ 2003-08-11 21:41                 ` Jay Vosburgh
  2003-08-11 23:15                   ` [SET 2][PATCH 2/8][bonding] Propagating master's settings to slaves Shmulik Hen
  2003-08-12  2:33                   ` [Bonding-devel] Re: [SET 2][PATCH 2/8][bonding] Propagating master's settings toslaves jamal
  2 siblings, 2 replies; 27+ messages in thread
From: Jay Vosburgh @ 2003-08-11 21:41 UTC (permalink / raw)
  To: Jeff Garzik; +Cc: shmulik.hen, hadi, Laurent DENIEL, bonding-devel, netdev

>The answer is, like life, it's a balance.
[...]
>This is why I push for a "bonding-utils" package from Jay.... because of 
>the general rule above:  put it into userspace, where possible.

	Hmm.  My impression from our prior discussions was that your
interest in moving ifenslave out of the kernel source and into its own
package was more of a source code management concern rather than
moving functionality from the kernel into user space (because
ifenslave is in user space to begin with).

	Anyway, for most of the core bonding failover logic, I don't
see how a user space daemon implementation can perform equivalently to
a kernel-only implementation.  I could be wrong (I haven't done any
testing) but for the core "eth0 is dead, enable eth1" type stuff, it
seems to me that in-kernel beats "user space yakking with kernel" for
reliability and speed, particularly on heavily loaded systems.

	Now, that said, I can see a use for a user space monitoring /
control program, for the "strategic" problems (as opposed to the
"tactical" problems, like the previous paragraph).  If we want to,
e.g., monitor bandwidth usage and add or remove links from the
aggregation, that is (a) not as time critical, and (b) somewhat
fuzzier in definition.  Such a user space program could also interface
with various system management or HA thingies and report status for
its activities as well as the activities that bonding performs
independent of it.

	One thought I've had (which dovetails somewhat with an earlier
comment from Laurent) is a tcpdump/bpf-style "policy engine" blob in
the kernel, which is programmed from user space with enough brains to
handle the "tactical" level problems (the "strategic" problems might
be more than such a blob could handle, and if its easy enough to yak
with user space for those problems, it may not be necessary).  I
haven't done much more than think about this, though; it may very well
be overkill for the basic stuff.

	-J

---
	-Jay Vosburgh, IBM Linux Technology Center, fubar@us.ibm.com

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [SET 2][PATCH 2/8][bonding] Propagating master's settings to slaves
  2003-08-11 21:41                 ` Jay Vosburgh
@ 2003-08-11 23:15                   ` Shmulik Hen
  2003-08-11 23:28                     ` [Bonding-devel] " Jay Vosburgh
  2003-08-12  2:36                     ` jamal
  2003-08-12  2:33                   ` [Bonding-devel] Re: [SET 2][PATCH 2/8][bonding] Propagating master's settings toslaves jamal
  1 sibling, 2 replies; 27+ messages in thread
From: Shmulik Hen @ 2003-08-11 23:15 UTC (permalink / raw)
  To: Jay Vosburgh, Jeff Garzik; +Cc: hadi, Laurent DENIEL, bonding-devel, netdev

May I remind you all that the original discussion was only about  
stuff that has to do with configuration time. There was no mention of 
any run time code. ifenslave only does three simple things - add a 
slave, remove a slave and set the current active slave, that's all. 

The drive was to try and make ifenslave slimmer regarding those three 
operations only in the way that any setting of the slave will be done 
by the kernel module instead of the configuration application. There 
is no real "brain" there anyway.

We had some experience with creating a configuration application that 
was incredibly smart and was always aware of what was going on in the 
driver and could make all possible decisions before even attempting 
to access the driver so it could fail the operation without  
"bothering" the driver. It's gigantic. It's extremely hard to install 
and configure. It's even harder to maintain. And all it was meant to 
do is configuration. Imagine what would happen if it was also 
supposed to handle run time issues.

I am not aware of anything like moving kernel code into applications. 
Was that something that was discussed in OLS ? Where can I find some 
more info about this trend ?

	Shmulik.

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [Bonding-devel] Re: [SET 2][PATCH 2/8][bonding] Propagating master's settings to slaves
  2003-08-11 23:15                   ` [SET 2][PATCH 2/8][bonding] Propagating master's settings to slaves Shmulik Hen
@ 2003-08-11 23:28                     ` Jay Vosburgh
  2003-08-12  2:36                     ` jamal
  1 sibling, 0 replies; 27+ messages in thread
From: Jay Vosburgh @ 2003-08-11 23:28 UTC (permalink / raw)
  To: shmulik.hen; +Cc: Jeff Garzik, hadi, Laurent DENIEL, bonding-devel, netdev


>The drive was to try and make ifenslave slimmer regarding those three 
>operations only in the way that any setting of the slave will be done 
>by the kernel module instead of the configuration application. There 
>is no real "brain" there anyway.

	Agreed.  One reason for adding all of that propagation of
settings from master to slave is so that ifenslave doesn't have to do
it.  The less mystic stuff that needs to be synchronized between the
two, the better.

	-J

---
	-Jay Vosburgh, IBM Linux Technology Center, fubar@us.ibm.com

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [Bonding-devel] Re: [SET 2][PATCH 2/8][bonding] Propagating master's  settings toslaves
  2003-08-11 16:25             ` Shmulik Hen
  2003-08-11 16:43               ` Jeff Garzik
@ 2003-08-12  2:31               ` jamal
  1 sibling, 0 replies; 27+ messages in thread
From: jamal @ 2003-08-12  2:31 UTC (permalink / raw)
  To: shmulik.hen; +Cc: Laurent DENIEL, bonding-devel, netdev

On Mon, 2003-08-11 at 12:25, Shmulik Hen wrote:

> Why have any kernel code other than device drivers in the first place 
> ?
> Why not move all the TCP/IP stack out of kernel space and put it in an 
> application ? Lets have the entire ARP mechanism in an appliaction 

Of course ARP should be moved out. Infact you bring out a very good
point;-> If ARP was out of the kernel a lot of the recent discussions on
ARP in relation to HA would have been nonexistent. The only reason they
are even being discussed its because so much of the policy exists in the
kernel. 
[BTW, there are several ARP implementations on Linux which try to move
it out of the kernel and leave some minimal functionality in the
kernel].

> and let it handle everything from routing tables management to arp 
> negotiation while the kernel will only know how to create arp packets 
> that it gets from that app and send them away ? It doesn't need to 
> have the know how.

Actually if you do this, then ARP packets need be generated in user
space.

> Say we do thing s your way and use the notification mechanism, how 
> long do you think it's going to take for the whole operation to 
> finish taking into consideration how the kernel runs user space 
> applications in comparison with kernel code? 

I dont have numbers. I would estimate less than a ms to go to user 
space, execute some simple policy config and come back.

> what happens when the 
> system is heavily loaded ? 

What happens now ? 

> What happens if the application dies for 
> some reason ? 

What happens when the kernel oopses? ;->

> Why should the bonding driver even care about routes or firewalling ?

I gave those as examples of policies that could be executed. There are
many reasons why you would wanna do a route redirection should a
specific link fail, most important being for path availability which is
a layer above link availability. Again, this is just an example, i may
wanna slice bread everytime a link goes down and add butter every time
one comes up. Ridiculous as that may sound, point is you cant predict
what i wanna do with such events. 

> It's only meant to group several physical ethernet devices and group 
> them under one logical device to handle teaming solutions.

yes, those are basic services. You need to allow for more interesting
things.

cheers,
jamal

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [Bonding-devel] Re: [SET 2][PATCH 2/8][bonding] Propagating master'ssettings toslaves
  2003-08-11 17:31                 ` [Bonding-devel] Re: [SET 2][PATCH 2/8][bonding] Propagating master'ssettings toslaves Laurent DENIEL
  2003-08-11 17:43                   ` Jeff Garzik
@ 2003-08-12  2:32                   ` jamal
  1 sibling, 0 replies; 27+ messages in thread
From: jamal @ 2003-08-12  2:32 UTC (permalink / raw)
  To: Laurent DENIEL; +Cc: Jeff Garzik, shmulik.hen, bonding-devel, netdev

On Mon, 2003-08-11 at 13:31, Laurent DENIEL wrote:

> But I agree that it is interesting to do some stuff at user land, and if 
> the bonding had an option to disable the automatic failover policy, 
> this could be implemented with trigger towards user land application that 
> could use an ioctl call to switch to the appropriate NIC according to

You spoilt otherwise sane text by mentioning ioctl;-> 

> But the fast and simple failover policy shall remain in kernel code.

nod from here. Simple failover policies should stay in the kernel.

cheers,
jamal

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [Bonding-devel] Re: [SET 2][PATCH 2/8][bonding] Propagating master's settings toslaves
  2003-08-11 21:41                 ` Jay Vosburgh
  2003-08-11 23:15                   ` [SET 2][PATCH 2/8][bonding] Propagating master's settings to slaves Shmulik Hen
@ 2003-08-12  2:33                   ` jamal
  1 sibling, 0 replies; 27+ messages in thread
From: jamal @ 2003-08-12  2:33 UTC (permalink / raw)
  To: Jay Vosburgh
  Cc: Jeff Garzik, shmulik.hen, Laurent DENIEL, bonding-devel, netdev

On Mon, 2003-08-11 at 17:41, Jay Vosburgh wrote: 
> 	Anyway, for most of the core bonding failover logic, I don't
> see how a user space daemon implementation can perform equivalently to
> a kernel-only implementation.  I could be wrong (I haven't done any
> testing) but for the core "eth0 is dead, enable eth1" type stuff, it
> seems to me that in-kernel beats "user space yakking with kernel" for
> reliability and speed, particularly on heavily loaded systems.

for "Eth0 dead migrate activity to eth1" thing - i claim thats basic.
Leave it in the kernel.

> 	Now, that said, I can see a use for a user space monitoring /
> control program, for the "strategic" problems (as opposed to the
> "tactical" problems, like the previous paragraph).  If we want to,
> e.g., monitor bandwidth usage and add or remove links from the
> aggregation, that is (a) not as time critical, and (b) somewhat
> fuzzier in definition.  Such a user space program could also interface
> with various system management or HA thingies and report status for
> its activities as well as the activities that bonding performs
> independent of it.

Now thats an interesting app. Bandwidth on demand. Probably also bring
down the number of links when they are not being used.
Imagine if you had to push this to the kernel.

> 	One thought I've had (which dovetails somewhat with an earlier
> comment from Laurent) is a tcpdump/bpf-style "policy engine" blob in
> the kernel, which is programmed from user space with enough brains to
> handle the "tactical" level problems (the "strategic" problems might
> be more than such a blob could handle, and if its easy enough to yak
> with user space for those problems, it may not be necessary).  I
> haven't done much more than think about this, though; it may very well
> be overkill for the basic stuff.

It exists. It's called netlink.

cheers,
jamal

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [SET 2][PATCH 2/8][bonding] Propagating master's settings to slaves
  2003-08-11 23:15                   ` [SET 2][PATCH 2/8][bonding] Propagating master's settings to slaves Shmulik Hen
  2003-08-11 23:28                     ` [Bonding-devel] " Jay Vosburgh
@ 2003-08-12  2:36                     ` jamal
  1 sibling, 0 replies; 27+ messages in thread
From: jamal @ 2003-08-12  2:36 UTC (permalink / raw)
  To: shmulik.hen
  Cc: Jay Vosburgh, Jeff Garzik, Laurent DENIEL, bonding-devel, netdev


Shmulik, the only discussion as far as i know is the one that happened
in this thread. I have not seen any discussion before. 

Folks, I really didnt mean to start such a long thread ;->

cheers,
jamal

On Mon, 2003-08-11 at 19:15, Shmulik Hen wrote:
> May I remind you all that the original discussion was only about  
> stuff that has to do with configuration time. There was no mention of 
> any run time code. ifenslave only does three simple things - add a 
> slave, remove a slave and set the current active slave, that's all. 
> 
> The drive was to try and make ifenslave slimmer regarding those three 
> operations only in the way that any setting of the slave will be done 
> by the kernel module instead of the configuration application. There 
> is no real "brain" there anyway.
> 
> We had some experience with creating an configuration application that 
> was incredibly smart and was always aware of what was going on in the 
> driver and could make all possible decisions before even attempting 
> to access the driver so it could fail the operation without  
> "bothering" the driver. It's gigantic. It's extremely hard to install 
> and configure. It's even harder to maintain. And all it was meant to 
> do is configuration. Imagine what would happen if it was also 
> supposed to handle run time issues.
> 
> I am not aware of anything like moving kernel code into applications. 
> Was that something that was discussed in OLS ? Where can I find some 
> more info about this trend ?
> 
> 
> 	Shmulik.
> 
> 

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [Bonding-devel] Re: [SET 2][PATCH 2/8][bonding] Propagating master'ssettings toslaves
  2003-08-11 17:43                   ` Jeff Garzik
@ 2003-08-12  6:31                     ` Laurent DENIEL
  2003-08-12 12:59                       ` jamal
  0 siblings, 1 reply; 27+ messages in thread
From: Laurent DENIEL @ 2003-08-12  6:31 UTC (permalink / raw)
  To: Jeff Garzik; +Cc: shmulik.hen, hadi, bonding-devel, netdev

Jeff Garzik a écrit :
>
> > You forgot one important aspect :
> >
> >   (4) does moving code to userspace break compatibility (or behavior)
> >       with user land applications (or systems)
> 
> I agree... assuming these userland interfaces are fairly standard and
> widely deployed.
> 
> > What can one do if say, kernel 2.[4|5] switches the NIC in 10 mseconds
> > while kernel 2.7 with user land daemon switches in a few seconds ?
> > nothing but stay with the previous version or fork the driver development ;-(
> 
> This is a silly example.  If that happens in practice, then that is a
> bug in the configuration of the userland daemon, or a bug in the
> kernel<->userland ABI.

Not a silly example but a real case that happened to me with another
operating system and I'd hate if it happens also with Linux ...
 
> > But I agree that it is interesting to do some stuff at user land, and if
> > the bonding had an option to disable the automatic failover policy,
> > this could be implemented with trigger towards user land application that
> > could use an ioctl call to switch to the appropriate NIC according to
> > the user lan configuration ...
> 
> Remember, ioctls are bad.  :)  Unix design mistake.

ioctl (which already exist) or something else, this is not the point here.

> > what happens when the 
> > system is heavily loaded ? 
> 
> What happens now ? 
> 
> > What happens if the application dies for 
> > some reason ? 
> 
> What happens when the kernel oopses? ;->

Such silly responses make me think that it is no longer worth to argue ...

Laurent

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [Bonding-devel] Re: [SET 2][PATCH 2/8][bonding] Propagating master'ssettings toslaves
  2003-08-12  6:31                     ` Laurent DENIEL
@ 2003-08-12 12:59                       ` jamal
  2003-08-12 13:08                         ` David S. Miller
  0 siblings, 1 reply; 27+ messages in thread
From: jamal @ 2003-08-12 12:59 UTC (permalink / raw)
  To: Laurent DENIEL; +Cc: Jeff Garzik, shmulik.hen, bonding-devel, netdev

On Tue, 2003-08-12 at 02:31, Laurent DENIEL wrote:

 
> > > What happens if the application dies for 
> > > some reason ? 
> > 
> > What happens when the kernel oopses? ;->
> 
> Such silly responses make me think that it is no longer worth to argue ...
> 

You dont think asking "what if the application dies" is in the same
calibre as "what happens when the kernel oopses"?

cheers,
jamal

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [Bonding-devel] Re: [SET 2][PATCH 2/8][bonding] Propagating master'ssettings toslaves
  2003-08-12 12:59                       ` jamal
@ 2003-08-12 13:08                         ` David S. Miller
  2003-08-12 14:10                           ` Laurent DENIEL
  0 siblings, 1 reply; 27+ messages in thread
From: David S. Miller @ 2003-08-12 13:08 UTC (permalink / raw)
  To: hadi; +Cc: laurent.deniel, jgarzik, shmulik.hen, bonding-devel, netdev

On 12 Aug 2003 08:59:17 -0400
jamal <hadi@cyberus.ca> wrote:

> You dont think asking "what if the application dies" is in the same
> calibre as "what happens when the kernel oopses"?

Don't sweat it Jamal, some people just don't get it :-)

Look, people, when userlevel routing daemon dies your system
effectively stops to route.

There is zero difference between that example and the ones
we are discussing here.

Policy belongs strictly at user space.

One of the great things about what Jamal spends his time working
on is finally a strict separation of the control layer from everything
else.  And part of this is moving all of the control logic into userspace.
Once that is accomplished, I can have my toilet flush every time a TCP
packet is routed through my system and this won't crap up the kernel.

If you don't see the value in that, perhaps you shouldn't be partaking
in this discussion :-)

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [Bonding-devel] Re: [SET 2][PATCH 2/8][bonding] Propagating master'ssettings toslaves
  2003-08-12 13:08                         ` David S. Miller
@ 2003-08-12 14:10                           ` Laurent DENIEL
       [not found]                             ` <1060698412.1063.7.camel@jzny.localdomain>
  0 siblings, 1 reply; 27+ messages in thread
From: Laurent DENIEL @ 2003-08-12 14:10 UTC (permalink / raw)
  To: David S. Miller; +Cc: hadi, jgarzik, shmulik.hen, bonding-devel, netdev

"David S. Miller" a écrit :
> 
> On 12 Aug 2003 08:59:17 -0400
> jamal <hadi@cyberus.ca> wrote:
> 
> > You dont think asking "what if the application dies" is in the same
> > calibre as "what happens when the kernel oopses"?
> 
> Don't sweat it Jamal, some people just don't get it :-)
> 
> Look, people, when userlevel routing daemon dies your system
> effectively stops to route.

That's why in really *safe* systems, we do not use routing daemon
but only static routes ;-)

And there is a BIG difference : 

When user level daemon dies, you have to be sure that some stuff
exists to monitor and recover from that situation (either by 
restarting the faulty daemon (if it could recover in time which
I doubt with the bonding case), or by switching to a new machine
in a fault tolerant configuration). With kernel ooops, there is
NOTHING to do in such a fault tolerant system, since the
machine is unusable (this is the same as a hardware failure).

But people do not understand the constraints of really safe
systems.

> Policy belongs strictly at user space.
> 
> One of the great things about what Jamal spends his time working
> on is finally a strict seperation of the control layer from everything
> else.  And part of this is moving all of the control logic into userspace.
> Once that is accomplished, I can have my toilet flush every time a TCP
> packet is routed through my system and this won't crap up the kernel.
> 
> If you don't see the value in that, perhaps you shouldn't be partaking
> in this discussion :-)

This is OK as long as your kernel and user space stuff remains suitable
for highly fault tolerant systems and which does not require big mountains
of user stuff to do the same as a few lines of code in the kernel. Remember
the aim of bonding : NIC fault tolerance or load balancing. I am not against
a user space configurable policy for more complex job but the initial aim of 
the bonding shall remain coded in the kernel (and is only usable in the above 
mentioned systems in that way).

Laurent

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [Bonding-devel] Re: [SET 2][PATCH 2/8][bonding] Propagating master'ssettings toslaves
       [not found]                             ` <1060698412.1063.7.camel@jzny.localdomain>
@ 2003-08-12 14:36                               ` Laurent DENIEL
  2003-08-12 15:05                                 ` jamal
  0 siblings, 1 reply; 27+ messages in thread
From: Laurent DENIEL @ 2003-08-12 14:36 UTC (permalink / raw)
  To: hadi; +Cc: David S. Miller, jgarzik, shmulik.hen, bonding-devel, netdev

jamal a écrit :
> 
> On Tue, 2003-08-12 at 10:10, Laurent DENIEL wrote:
> > "David S. Miller" a écrit :
> 
> > That's why in really *safe* systems, we do not use routing daemon
> > but only static routes ;-)
> >
> > And there is a BIG difference :
> >
> > When a user level daemon dies, you have to be sure that some stuff
> > exists to monitor and recover from that situation (either by
> > restarting the faulty daemon (if it could recover in time, which
> > I doubt with the bonding case), or by switching to a new machine
> > in a fault tolerant configuration). With a kernel oops, there is
> > NOTHING to do in such a fault tolerant system, since the
> > machine is unusable (this is the same as a hardware failure).
> >
> > But people do not understand the constraints of really safe
> > systems.
> >
> 
> We have hardware watchdog timers to put the kernel into a known state by
> rebooting. If you were not aware of all these RAS efforts on Linux
> (projects like kexec for example) I suggest you start looking at them.

I am aware of this great stuff but see below.

> The kernel will oops and the app will die because of one thing: _A
> software bug_. It doesn't matter what causes the death of the kernel or
> app (a misconfig, for example, causing a broadcast loop making the app
> die is a bug).
> If you want a safe system then you do not trust software, neither do you
> trust hardware - You must have workarounds in case they go berserk. Heck,
> the only entity you should trust is God, and that's assuming you believe
> in God.

Hardware / software watchdogs are great but do not necessarily
solve all problems, especially where timing constraints are important.
I prefer to rely on the timing of the bonding kernel code to switch
NICs in milliseconds than to wait seconds or minutes for a user space
daemon to get a chance to handle the problem (and yes, I am aware of
real time class scheduling and so on, but you say don't trust the
software, and I agree, so I prefer a direct kernel hang to nothing
or something too late (a software watchdog will not help in that case)).

Laurent

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [Bonding-devel] Re: [SET 2][PATCH 2/8][bonding] Propagating master'ssettings toslaves
  2003-08-12 14:36                               ` Laurent DENIEL
@ 2003-08-12 15:05                                 ` jamal
  0 siblings, 0 replies; 27+ messages in thread
From: jamal @ 2003-08-12 15:05 UTC (permalink / raw)
  To: Laurent DENIEL
  Cc: David S. Miller, jgarzik, shmulik.hen, bonding-devel, netdev

On Tue, 2003-08-12 at 10:36, Laurent DENIEL wrote:

> Hardware / software watchdogs are great but do not necessarily 
> solve all problems especially where timing constraints are important.

I think we are going off on a tangent; I could ask you next why you think it
is less likely to have bugs in the kernel than in user space. Please
don't respond because we'll get into long circular debates.

> I prefer to rely on the timing of the bonding kernel code to switch
> NICs in milliseconds than to wait seconds or minutes for a user space
> daemon to get a chance to handle the problem (and yes, I am aware of
> real time class scheduling and so on, but you say don't trust the
> software, and I agree, so I prefer a direct kernel hang to nothing
> or something too late (a software watchdog will not help in that case)).

I don't think we have any disagreement that minimalistic kernel policy
should stay. I am not suggesting moving what the _original_ bonding
driver did out of the kernel - so I don't think there are issues with
"waiting for seconds". Are you basing this on experience?

The key is this: We should start looking at bonding as an enabler for
availability, not as _the solution_. Bonding provides link availability
for single hops.

cheers,
jamal

^ permalink raw reply	[flat|nested] 27+ messages in thread

end of thread, other threads:[~2003-08-12 15:05 UTC | newest]

Thread overview: 27+ messages (download: mbox.gz follow: Atom feed)
-- links below jump to the message on this page --
2003-08-09 10:29 [SET 2][PATCH 2/8][bonding] Propagating master's settings to slaves Hen, Shmulik
2003-08-11  2:51 ` jamal
2003-08-11 10:08   ` Shmulik Hen
2003-08-11 13:47     ` jamal
2003-08-11 14:07       ` [Bonding-devel] Re: [SET 2][PATCH 2/8][bonding] Propagating master's settings toslaves Laurent DENIEL
2003-08-11 14:20         ` Shmulik Hen
2003-08-11 14:34           ` jamal
2003-08-11 16:25             ` Shmulik Hen
2003-08-11 16:43               ` Jeff Garzik
2003-08-11 17:31                 ` [Bonding-devel] Re: [SET 2][PATCH 2/8][bonding] Propagating master'ssettings toslaves Laurent DENIEL
2003-08-11 17:43                   ` Jeff Garzik
2003-08-12  6:31                     ` Laurent DENIEL
2003-08-12 12:59                       ` jamal
2003-08-12 13:08                         ` David S. Miller
2003-08-12 14:10                           ` Laurent DENIEL
     [not found]                             ` <1060698412.1063.7.camel@jzny.localdomain>
2003-08-12 14:36                               ` Laurent DENIEL
2003-08-12 15:05                                 ` jamal
2003-08-12  2:32                   ` jamal
2003-08-11 21:27                 ` [Bonding-devel] Re: [SET 2][PATCH 2/8][bonding] Propagating master's settings toslaves Mark Huth
2003-08-11 21:41                 ` Jay Vosburgh
2003-08-11 23:15                   ` [SET 2][PATCH 2/8][bonding] Propagating master's settings to slaves Shmulik Hen
2003-08-11 23:28                     ` [Bonding-devel] " Jay Vosburgh
2003-08-12  2:36                     ` jamal
2003-08-12  2:33                   ` [Bonding-devel] Re: [SET 2][PATCH 2/8][bonding] Propagating master's settings toslaves jamal
2003-08-12  2:31               ` jamal
  -- strict thread matches above, loose matches on Subject: below --
2003-08-08 14:44 [SET 2][PATCH 2/8][bonding] Propagating master's settings to slaves Shmulik Hen
2003-08-08 22:01 ` jamal

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).