[PATCH 1/3] [NET]: Use interface index to keep input device information

netdev.vger.kernel.org archive mirror
 help / color / mirror / Atom feed

* [PATCH 1/3] [NET]: Use interface index to keep input device information
  2006-06-26 14:54 [PATCHSET] Towards accurate incoming interface information Thomas Graf
@ 2006-06-25 22:00 ` Thomas Graf
  2006-06-25 22:00 ` [PATCH 2/3] [VLAN]: Update iif when receiving via VLAN device Thomas Graf
                   ` (2 subsequent siblings)
  3 siblings, 0 replies; 91+ messages in thread
From: Thomas Graf @ 2006-06-25 22:00 UTC (permalink / raw)
  To: davem; +Cc: netdev

[-- Attachment #1: input_dev --]
[-- Type: text/plain, Size: 4396 bytes --]

Using the interface index instead of a direct reference
allows a safe usage beyond the scope where an interface
could disappear.

The old input_dev field was incorrectly made dependant
on CONFIG_NET_CLS_ACT in skb_copy().

Signed-off-by: Thomas Graf <tgraf@suug.ch>

Index: net-2.6.git/include/linux/skbuff.h
===================================================================
--- net-2.6.git.orig/include/linux/skbuff.h
+++ net-2.6.git/include/linux/skbuff.h
@@ -181,7 +181,6 @@ enum {
  *	@sk: Socket we are owned by
  *	@tstamp: Time we arrived
  *	@dev: Device we arrived on/are leaving by
- *	@input_dev: Device we arrived on
  *	@h: Transport layer header
  *	@nh: Network layer header
  *	@mac: Link layer header
@@ -192,6 +191,7 @@ enum {
  *	@data_len: Data length
  *	@mac_len: Length of link layer header
  *	@csum: Checksum
+ *	@iif: Device we arrived on
  *	@local_df: allow local fragmentation
  *	@cloned: Head may be cloned (check refcnt to be sure)
  *	@nohdr: Payload reference only, must not modify header
@@ -228,7 +228,6 @@ struct sk_buff {
 	struct sock		*sk;
 	struct skb_timeval	tstamp;
 	struct net_device	*dev;
-	struct net_device	*input_dev;
 
 	union {
 		struct tcphdr	*th;
@@ -266,6 +265,7 @@ struct sk_buff {
 				data_len,
 				mac_len,
 				csum;
+	int			iif;
 	__u32			priority;
 	__u8			local_df:1,
 				cloned:1,
Index: net-2.6.git/include/net/pkt_cls.h
===================================================================
--- net-2.6.git.orig/include/net/pkt_cls.h
+++ net-2.6.git/include/net/pkt_cls.h
@@ -352,14 +352,19 @@ tcf_change_indev(struct tcf_proto *tp, c
 static inline int
 tcf_match_indev(struct sk_buff *skb, char *indev)
 {
+	int ret = 1;
+
 	if (indev[0]) {
-		if  (!skb->input_dev)
-			return 0;
-		if (strcmp(indev, skb->input_dev->name))
+		struct net_device *dev;
+
+		dev = dev_get_by_index(skb->iif);
+		if  (!dev)
 			return 0;
+		ret = !strcmp(indev, dev->name);
+		dev_put(dev);
 	}
 
-	return 1;
+	return ret;
 }
 #endif /* CONFIG_NET_CLS_IND */
 
Index: net-2.6.git/net/core/dev.c
===================================================================
--- net-2.6.git.orig/net/core/dev.c
+++ net-2.6.git/net/core/dev.c
@@ -1715,8 +1715,8 @@ static int ing_filter(struct sk_buff *sk
 	if (dev->qdisc_ingress) {
 		__u32 ttl = (__u32) G_TC_RTTL(skb->tc_verd);
 		if (MAX_RED_LOOP < ttl++) {
-			printk("Redir loop detected Dropping packet (%s->%s)\n",
-				skb->input_dev->name, skb->dev->name);
+			printk("Redir loop detected Dropping packet (%d->%s)\n",
+				skb->iif, skb->dev->name);
 			return TC_ACT_SHOT;
 		}
 
@@ -1749,8 +1749,8 @@ int netif_receive_skb(struct sk_buff *sk
 	if (!skb->tstamp.off_sec)
 		net_timestamp(skb);
 
-	if (!skb->input_dev)
-		skb->input_dev = skb->dev;
+	if (!skb->iif)
+		skb->iif = skb->dev->ifindex;
 
 	orig_dev = skb_bond(skb);
 
Index: net-2.6.git/net/core/skbuff.c
===================================================================
--- net-2.6.git.orig/net/core/skbuff.c
+++ net-2.6.git/net/core/skbuff.c
@@ -463,10 +463,10 @@ struct sk_buff *skb_clone(struct sk_buff
 	n->tc_verd = SET_TC_VERD(skb->tc_verd,0);
 	n->tc_verd = CLR_TC_OK2MUNGE(n->tc_verd);
 	n->tc_verd = CLR_TC_MUNGED(n->tc_verd);
-	C(input_dev);
 #endif
 	skb_copy_secmark(n, skb);
 #endif
+	C(iif);
 	C(truesize);
 	atomic_set(&n->users, 1);
 	C(head);
Index: net-2.6.git/net/sched/act_api.c
===================================================================
--- net-2.6.git.orig/net/sched/act_api.c
+++ net-2.6.git/net/sched/act_api.c
@@ -156,9 +156,8 @@ int tcf_action_exec(struct sk_buff *skb,
 
 	if (skb->tc_verd & TC_NCLS) {
 		skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
-		D2PRINTK("(%p)tcf_action_exec: cleared TC_NCLS in %s out %s\n",
-		         skb, skb->input_dev ? skb->input_dev->name : "xxx",
-		         skb->dev->name);
+		D2PRINTK("(%p)tcf_action_exec: cleared TC_NCLS in %d out %s\n",
+		         skb, skb->iif, skb->dev->name);
 		ret = TC_ACT_OK;
 		goto exec_done;
 	}
Index: net-2.6.git/net/sched/act_mirred.c
===================================================================
--- net-2.6.git.orig/net/sched/act_mirred.c
+++ net-2.6.git/net/sched/act_mirred.c
@@ -207,7 +207,7 @@ bad_mirred:
 		skb2->tc_verd = SET_TC_FROM(skb2->tc_verd, at);
 
 	skb2->dev = dev;
-	skb2->input_dev = skb->dev;
+	skb2->iif = skb->dev->ifindex;
 	dev_queue_xmit(skb2);
 	spin_unlock(&p->lock);
 	return p->action;

--


^ permalink raw reply	[flat|nested] 91+ messages in thread

* [PATCH 2/3] [VLAN]: Update iif when receiving via VLAN device
  2006-06-26 14:54 [PATCHSET] Towards accurate incoming interface information Thomas Graf
  2006-06-25 22:00 ` [PATCH 1/3] [NET]: Use interface index to keep input device information Thomas Graf
@ 2006-06-25 22:00 ` Thomas Graf
  2006-06-26 17:04   ` Patrick McHardy
  2006-06-25 22:00 ` [PATCH 3/3] [PKT_SCHED]: Add iif meta match Thomas Graf
  2006-06-27 15:07 ` [PATCHSET] Towards accurate incoming interface information Thomas Graf
  3 siblings, 1 reply; 91+ messages in thread
From: Thomas Graf @ 2006-06-25 22:00 UTC (permalink / raw)
  To: davem; +Cc: netdev

[-- Attachment #1: vlan_input_dev --]
[-- Type: text/plain, Size: 651 bytes --]

Updating iif to the VLAN device helps keeping routing
namespaces defined in case packets from multiple VLANs
collapse on a single device again.

Signed-off-by: Thomas Graf <tgraf@suug.ch>

Index: net-2.6.git/net/8021q/vlan_dev.c
===================================================================
--- net-2.6.git.orig/net/8021q/vlan_dev.c
+++ net-2.6.git/net/8021q/vlan_dev.c
@@ -156,6 +156,8 @@ int vlan_skb_recv(struct sk_buff *skb, s
 		return -1;
 	}
 
+	/* Make packets look like they have been received on the VLAN device */
+	skb->iif = skb->dev->ifindex;
 	skb->dev->last_rx = jiffies;
 
 	/* Bump the rx counters for the VLAN device. */

--


^ permalink raw reply	[flat|nested] 91+ messages in thread

* [PATCH 3/3] [PKT_SCHED]: Add iif meta match
  2006-06-26 14:54 [PATCHSET] Towards accurate incoming interface information Thomas Graf
  2006-06-25 22:00 ` [PATCH 1/3] [NET]: Use interface index to keep input device information Thomas Graf
  2006-06-25 22:00 ` [PATCH 2/3] [VLAN]: Update iif when receiving via VLAN device Thomas Graf
@ 2006-06-25 22:00 ` Thomas Graf
  2006-06-27 15:07 ` [PATCHSET] Towards accurate incoming interface information Thomas Graf
  3 siblings, 0 replies; 91+ messages in thread
From: Thomas Graf @ 2006-06-25 22:00 UTC (permalink / raw)
  To: davem; +Cc: netdev

[-- Attachment #1: iif_meta --]
[-- Type: text/plain, Size: 1285 bytes --]

Signed-off-by: Thomas Graf <tgraf@suug.ch>

Index: net-2.6.git/include/linux/tc_ematch/tc_em_meta.h
===================================================================
--- net-2.6.git.orig/include/linux/tc_ematch/tc_em_meta.h
+++ net-2.6.git/include/linux/tc_ematch/tc_em_meta.h
@@ -81,6 +81,7 @@ enum
  	TCF_META_ID_SK_SNDTIMEO,
  	TCF_META_ID_SK_SENDMSG_OFF,
  	TCF_META_ID_SK_WRITE_PENDING,
+	TCF_META_ID_IIF,
 	__TCF_META_ID_MAX
 };
 #define TCF_META_ID_MAX (__TCF_META_ID_MAX - 1)
Index: net-2.6.git/net/sched/em_meta.c
===================================================================
--- net-2.6.git.orig/net/sched/em_meta.c
+++ net-2.6.git/net/sched/em_meta.c
@@ -170,6 +170,11 @@ META_COLLECTOR(var_dev)
 	*err = var_dev(skb->dev, dst);
 }
 
+META_COLLECTOR(int_iif)
+{
+	dst->value = skb->iif;
+}
+
 /**************************************************************************
  * skb attributes
  **************************************************************************/
@@ -525,6 +530,7 @@ static struct meta_ops __meta_ops[TCF_ME
 		[META_ID(SK_SNDTIMEO)]		= META_FUNC(int_sk_sndtimeo),
 		[META_ID(SK_SENDMSG_OFF)]	= META_FUNC(int_sk_sendmsg_off),
 		[META_ID(SK_WRITE_PENDING)]	= META_FUNC(int_sk_write_pend),
+		[META_ID(IIF)]			= META_FUNC(int_iif),
 	}
 };
 

--


^ permalink raw reply	[flat|nested] 91+ messages in thread

* [PATCHSET] Towards accurate incoming interface information
@ 2006-06-26 14:54 Thomas Graf
  2006-06-25 22:00 ` [PATCH 1/3] [NET]: Use interface index to keep input device information Thomas Graf
                   ` (3 more replies)
  0 siblings, 4 replies; 91+ messages in thread
From: Thomas Graf @ 2006-06-26 14:54 UTC (permalink / raw)
  To: davem; +Cc: netdev

This patchset transforms skb->input_dev based on a device
reference to skb->iif based on an interface index moving
towards accurate iif information for routing and classification
through the following changesets:
                                 
        [NET]: Use interface index to keep input device information
        [VLAN]: Update iif when receiving via VLAN devic
        [PKT_SCHED]: Add iif meta match

 include/linux/skbuff.h               |    4 ++--
 include/linux/tc_ematch/tc_em_meta.h |    1 +
 include/net/pkt_cls.h                |   13 +++++++++----
 net/8021q/vlan_dev.c                 |    2 ++
 net/core/dev.c                       |    8 ++++----
 net/core/skbuff.c                    |    2 +-
 net/sched/act_api.c                  |    5 ++---
 net/sched/act_mirred.c               |    2 +-
 net/sched/em_meta.c                  |    6 ++++++
 9 files changed, 28 insertions(+), 15 deletions(-)


^ permalink raw reply	[flat|nested] 91+ messages in thread

* Re: [PATCH 2/3] [VLAN]: Update iif when receiving via VLAN device
  2006-06-25 22:00 ` [PATCH 2/3] [VLAN]: Update iif when receiving via VLAN device Thomas Graf
@ 2006-06-26 17:04   ` Patrick McHardy
  2006-06-26 17:46     ` David Miller
  0 siblings, 1 reply; 91+ messages in thread
From: Patrick McHardy @ 2006-06-26 17:04 UTC (permalink / raw)
  To: Thomas Graf; +Cc: davem, netdev

Thomas Graf wrote:
> Updating iif to the VLAN device helps keeping routing
> namespaces defined in case packets from multiple VLANs
> collapse on a single device again.
> 
> Signed-off-by: Thomas Graf <tgraf@suug.ch>
> 
> Index: net-2.6.git/net/8021q/vlan_dev.c
> ===================================================================
> --- net-2.6.git.orig/net/8021q/vlan_dev.c
> +++ net-2.6.git/net/8021q/vlan_dev.c
> @@ -156,6 +156,8 @@ int vlan_skb_recv(struct sk_buff *skb, s
>  		return -1;
>  	}
>  
> +	/* Make packets look like they have been received on the VLAN device */
> +	skb->iif = skb->dev->ifindex;
>  	skb->dev->last_rx = jiffies;

I know this was discussed before, but I can't remember the
exact outcome. Why don't we just unconditionally update iif
in netif_receive_skb()?


^ permalink raw reply	[flat|nested] 91+ messages in thread

* Re: [PATCH 2/3] [VLAN]: Update iif when receiving via VLAN device
  2006-06-26 17:04   ` Patrick McHardy
@ 2006-06-26 17:46     ` David Miller
  2006-06-26 18:24       ` Patrick McHardy
  2006-06-26 18:44       ` Thomas Graf
  0 siblings, 2 replies; 91+ messages in thread
From: David Miller @ 2006-06-26 17:46 UTC (permalink / raw)
  To: kaber; +Cc: tgraf, netdev

From: Patrick McHardy <kaber@trash.net>
Date: Mon, 26 Jun 2006 19:04:15 +0200

> I know this was discussed before, but I can't remember the
> exact outcome. Why don't we just unconditionally update iif
> in netif_receive_skb()?

Software devices might have interesting semantics that would
make not setting iif desirable.

Once you set iif, you can't just undo it because the information
is lost.

I also would really prefer to set it unconditionally in
netif_receive_skb(), but Jamal's concerns in this area are real.
We really need to evaluate this on a case-by-case basis.

^ permalink raw reply	[flat|nested] 91+ messages in thread

* Re: [PATCH 2/3] [VLAN]: Update iif when receiving via VLAN device
  2006-06-26 17:46     ` David Miller
@ 2006-06-26 18:24       ` Patrick McHardy
  2006-06-26 18:44       ` Thomas Graf
  1 sibling, 0 replies; 91+ messages in thread
From: Patrick McHardy @ 2006-06-26 18:24 UTC (permalink / raw)
  To: David Miller; +Cc: tgraf, netdev

David Miller wrote:
> From: Patrick McHardy <kaber@trash.net>
> Date: Mon, 26 Jun 2006 19:04:15 +0200
> 
> 
>>I know this was discussed before, but I can't remember the
>>exact outcome. Why don't we just unconditionally update iif
>>in netif_receive_skb()?
> 
> 
> Software devices might have interesting semantics that would
> make not setting iif desirable.

Right, I remember now.

> Once you set iif, you can't just undo it because the information
> is lost.
> 
> I also would really prefer to set it unconditionally in
> netif_receive_skb(), but Jamal's concerns in this area are real.
> We really need to evaluate this on a case-by-case basis.

I still have a hard time imagining a case where this wouldn't fit,
netfilter is perfectly happy with just using skb->dev as input
device on the input path so far. I also think it will be confusing
to the user to have different notions of what constitutes the input
device. If anyone can think of an example where the user would
really expect the input device to be something different than
skb->dev I'd be really interested to hear it.

BTW, this is not meant to be an objection to Thomas's patch, just
me still wondering ..

^ permalink raw reply	[flat|nested] 91+ messages in thread

* Re: [PATCH 2/3] [VLAN]: Update iif when receiving via VLAN device
  2006-06-26 17:46     ` David Miller
  2006-06-26 18:24       ` Patrick McHardy
@ 2006-06-26 18:44       ` Thomas Graf
  2006-06-26 22:29         ` Patrick McHardy
  1 sibling, 1 reply; 91+ messages in thread
From: Thomas Graf @ 2006-06-26 18:44 UTC (permalink / raw)
  To: David Miller; +Cc: kaber, netdev

* David Miller <davem@davemloft.net> 2006-06-26 10:46
> From: Patrick McHardy <kaber@trash.net>
> Date: Mon, 26 Jun 2006 19:04:15 +0200
> 
> > I know this was discussed before, but I can't remember the
> > exact outcome. Why don't we just unconditionally update iif
> > in netif_receive_skb()?
> 
> Software devices might have interesting semantics that would
> make not setting iif desirable.
> 
> Once you set iif, you can't just undo it because the information
> is lost.
> 
> I also would really prefer to set it unconditionally in
> netif_receive_skb(), but Jamal's concerns in this area are real.
> We really need to evaluate this on a case-by-case basis.

I'm playing with a FIFO device based on Jamal's previous
work to replace the broken IMQ device and provide real
queueing at ingress. A set of VLAN devices could be redirected
to such a FIFO device and let applications bind to it in order
to implement trivial bind(INADDR_ANY) namespaces based on anything
that can be selected by classifiers. This example would benefit
from a conditional iif update (given that the mirred action is
extended to take a flag to steer this) so applications like a
dhcp daemon could use a raw socket on the FIFO device and thus
benefit from the bind namespace but still access the original
interface the packet was received from using pktinfo cmsg.

Maybe silly but the best I could come up with :-)

^ permalink raw reply	[flat|nested] 91+ messages in thread

* Re: [PATCH 2/3] [VLAN]: Update iif when receiving via VLAN device
  2006-06-26 18:44       ` Thomas Graf
@ 2006-06-26 22:29         ` Patrick McHardy
  2006-06-26 22:49           ` Patrick McHardy
  2006-06-27 10:03           ` Thomas Graf
  0 siblings, 2 replies; 91+ messages in thread
From: Patrick McHardy @ 2006-06-26 22:29 UTC (permalink / raw)
  To: Thomas Graf; +Cc: David Miller, netdev

Thomas Graf wrote:
> * David Miller <davem@davemloft.net> 2006-06-26 10:46
> 
>>From: Patrick McHardy <kaber@trash.net>
>>Date: Mon, 26 Jun 2006 19:04:15 +0200
>>
>>
>>>I know this was discussed before, but I can't remember the
>>>exact outcome. Why don't we just unconditionally update iif
>>>in netif_receive_skb()?
>>
>>Software devices might have interesting semantics that would
>>make not setting iif desirable.
>>
>>Once you set iif, you can't just undo it because the information
>>is lost.
>>
>>I also would really prefer to set it unconditionally in
>>netif_receive_skb(), but Jamal's concerns in this area are real.
>>We really need to evaluate this on a case-by-case basis.
> 
> 
> I'm playing with a FIFO device based on Jamal's previous
> work to replace the broken IMQ device and provide real
> queueing at ingress.

All my testing (quite a lot) in this area so far suggested that queueing
at ingress doesn't work and is actually counter-productive. It gets
worse with increasing signal propagation delays (signal in this case
means congestion indications). I actually wish I never introduced
the stupid IMQ device, it raises false hope and a lot of people seem
to fall for it.

> A set of VLAN devices could be redirected
> to such a FIFO device and let applications bind to it in order
> to implement trivial bind(INADDR_ANY) namespaces based on anything
> that can be selected by classifiers. This example would benefit
> from a conditional iif update (given that the mirred action is
> extended to take a flag to steer this) so applications like a
> dhcp daemon could use a raw socket on the FIFO device and thus
> benefit from the bind namespace but still access the original
> interface the packet was received from using pktinfo cmsg.
> 
> Maybe silly but the best I could come up with :-)

Doesn't sound too silly (actually quite nifty in my opinion),
but I'm not sure I understand correctly, binding to ifb doesn't
work since it changes both skb->dev and skb->input_dev.


^ permalink raw reply	[flat|nested] 91+ messages in thread

* Re: [PATCH 2/3] [VLAN]: Update iif when receiving via VLAN device
  2006-06-26 22:29         ` Patrick McHardy
@ 2006-06-26 22:49           ` Patrick McHardy
  2006-06-27 10:03           ` Thomas Graf
  1 sibling, 0 replies; 91+ messages in thread
From: Patrick McHardy @ 2006-06-26 22:49 UTC (permalink / raw)
  To: Thomas Graf; +Cc: David Miller, netdev

Patrick McHardy wrote:
> All my testing (quite a lot) in this area so far suggested that queueing
> at ingress doesn't work and is actually counter-productive. It gets
> worse with increasing signal propagation delays (signal in this case
> means congestion indications). I actually wish I never introduced
> the stupid IMQ device, it raises false hope and a lot of people seem
> to fall for it.

Small clarification: with queueing at ingress I mean queueing after
the bottleneck. It might work if you introduce a virtual bottleneck,
but in that case in my experience the bandwidth limit you have to use
is dependant on the number of TCP flows, so usually you can't specify
a limit that will always work, and its wasteful if there is a smaller
number of TCP flows. The reason why you would do it, limiting or
prioritizing incoming traffic on the _real_ bottleneck, is not
achievable of course. Also noteworthy is that policers don't behave
any better, and any other schemes I've tried (I also experienced a
lot of with TCP rate control) have similar or related problems.

^ permalink raw reply	[flat|nested] 91+ messages in thread

* Re: [PATCH 2/3] [VLAN]: Update iif when receiving via VLAN device
  2006-06-26 22:29         ` Patrick McHardy
  2006-06-26 22:49           ` Patrick McHardy
@ 2006-06-27 10:03           ` Thomas Graf
  2006-06-27 13:07             ` jamal
  1 sibling, 1 reply; 91+ messages in thread
From: Thomas Graf @ 2006-06-27 10:03 UTC (permalink / raw)
  To: Patrick McHardy; +Cc: David Miller, netdev

* Patrick McHardy <kaber@trash.net> 2006-06-27 00:29
> Doesn't sound too silly (actually quite nifty in my opinion),
> but I'm not sure I understand correctly, binding to ifb doesn't
> work since it changes both skb->dev and skb->input_dev.

I don't understand this concern. So far the mirred action
updates iif but that can be made configurable.

* Patrick McHardy <kaber@trash.net> 2006-06-27 00:49
> Small clarification: with queueing at ingress I mean queueing after
> the bottleneck. It might work if you introduce a virtual bottleneck,
> but in that case in my experience the bandwidth limit you have to use
> is dependant on the number of TCP flows, so usually you can't specify
> a limit that will always work, and its wasteful if there is a smaller
> number of TCP flows. The reason why you would do it, limiting or
> prioritizing incoming traffic on the _real_ bottleneck, is not
> achievable of course. Also noteworthy is that policers don't behave
> any better, and any other schemes I've tried (I also experienced a
> lot of with TCP rate control) have similar or related problems.

My interest in ingress queueing is mainly on a local delivery
case where skb->tstamp can be modified to control tcp metrics.
I agree that a lot of wrong expectations have developed with
IMQ and similiar devices.


^ permalink raw reply	[flat|nested] 91+ messages in thread

* Re: [PATCH 2/3] [VLAN]: Update iif when receiving via VLAN device
  2006-06-27 10:03           ` Thomas Graf
@ 2006-06-27 13:07             ` jamal
  2006-06-28 10:18               ` Thomas Graf
  0 siblings, 1 reply; 91+ messages in thread
From: jamal @ 2006-06-27 13:07 UTC (permalink / raw)
  To: Thomas Graf; +Cc: Patrick McHardy, David Miller, netdev

On Tue, 2006-27-06 at 12:03 +0200, Thomas Graf wrote:
> * Patrick McHardy <kaber@trash.net> 2006-06-27 00:29
> > Doesn't sound too silly (actually quite nifty in my opinion),
> > but I'm not sure I understand correctly, binding to ifb doesn't
> > work since it changes both skb->dev and skb->input_dev.
> 
> I don't understand this concern. So far the mirred action
> updates iif but that can be made configurable.
> 

I am reading the thread backwards, so i may miss some of the obvious.
I also dont remember the exact discussion we had - but the consensus was
to leave the field setting as is.
Note the meta-setter (been sitting on it for too long) also set the
input_dev.

BTW, in regards to the VLANs, what is wrong with using the
netdev->iflink to figure the real device? 

cheers,
jamal


^ permalink raw reply	[flat|nested] 91+ messages in thread

* Re: [PATCHSET] Towards accurate incoming interface information
  2006-06-26 14:54 [PATCHSET] Towards accurate incoming interface information Thomas Graf
                   ` (2 preceding siblings ...)
  2006-06-25 22:00 ` [PATCH 3/3] [PKT_SCHED]: Add iif meta match Thomas Graf
@ 2006-06-27 15:07 ` Thomas Graf
  2006-06-27 20:24   ` David Miller
  3 siblings, 1 reply; 91+ messages in thread
From: Thomas Graf @ 2006-06-27 15:07 UTC (permalink / raw)
  To: davem; +Cc: netdev

* Thomas Graf <tgraf@suug.ch> 2006-06-26 16:54
> This patchset transforms skb->input_dev based on a device
> reference to skb->iif based on an interface index moving
> towards accurate iif information for routing and classification
> through the following changesets:

Hold on with this, I haven't noticed this ifb device
go in and thus missed to update it. I'll post an
updated patch shortly

^ permalink raw reply	[flat|nested] 91+ messages in thread

* Re: [PATCHSET] Towards accurate incoming interface information
  2006-06-27 15:07 ` [PATCHSET] Towards accurate incoming interface information Thomas Graf
@ 2006-06-27 20:24   ` David Miller
  0 siblings, 0 replies; 91+ messages in thread
From: David Miller @ 2006-06-27 20:24 UTC (permalink / raw)
  To: tgraf; +Cc: netdev

From: Thomas Graf <tgraf@suug.ch>
Date: Tue, 27 Jun 2006 17:07:27 +0200

> * Thomas Graf <tgraf@suug.ch> 2006-06-26 16:54
> > This patchset transforms skb->input_dev based on a device
> > reference to skb->iif based on an interface index moving
> > towards accurate iif information for routing and classification
> > through the following changesets:
> 
> Hold on with this, I haven't noticed this ifb device
> go in and thus missed to update it. I'll post an
> updated patch shortly

Ok.

^ permalink raw reply	[flat|nested] 91+ messages in thread

* Re: [PATCH 2/3] [VLAN]: Update iif when receiving via VLAN device
  2006-06-27 13:07             ` jamal
@ 2006-06-28 10:18               ` Thomas Graf
  2006-06-28 12:22                 ` jamal
  0 siblings, 1 reply; 91+ messages in thread
From: Thomas Graf @ 2006-06-28 10:18 UTC (permalink / raw)
  To: jamal; +Cc: Patrick McHardy, David Miller, netdev

* jamal <hadi@cyberus.ca> 2006-06-27 09:07
> I am reading the thread backwards, so i may miss some of the obvious.
> I also dont remember the exact discussion we had - but the consensus was
> to leave the field setting as is.
> Note the meta-setter (been sitting on it for too long) also set the
> input_dev.

Could you share that piece of code so I don't have to duplicate
that effort?

> BTW, in regards to the VLANs, what is wrong with using the
> netdev->iflink to figure the real device? 

vlan1
          ifb1
vlan2
vlan3
          ifb2
vlan4

dhcp daemon would bind to ifb1 and use pktinfo cmsg to figure
out whether the packet was received on vlan1 or vlan2, the
actual physical device is of no interest anymore.

Looking at your ifb code I see that you set skb->dev based on
iif and update iif unconditionally. I see a lot of problems
with this, firstly iif is already updated in the mirred action,
secondly iif doesn't necessarily point to the device the packet
was last seen on. Could you maybe explain the policy of iif
you have in mind, it doesn't seem very consistent.

BTW, why does ifb have a hard header length? The resulting
push and pull only slows things down.

^ permalink raw reply	[flat|nested] 91+ messages in thread

* Re: [PATCH 2/3] [VLAN]: Update iif when receiving via VLAN device
  2006-06-28 10:18               ` Thomas Graf
@ 2006-06-28 12:22                 ` jamal
  2006-06-28 13:01                   ` Thomas Graf
  0 siblings, 1 reply; 91+ messages in thread
From: jamal @ 2006-06-28 12:22 UTC (permalink / raw)
  To: Thomas Graf; +Cc: netdev, David Miller, Patrick McHardy

On Wed, 2006-28-06 at 12:18 +0200, Thomas Graf wrote:
> * jamal <hadi@cyberus.ca> 2006-06-27 09:07
[..]
> > Note the meta-setter (been sitting on it for too long) also set the
> > input_dev.
> 
> Could you share that piece of code so I don't have to duplicate
> that effort?
> 

I will look it up and send it your way - it is not complete (I have
changed machines twice since i first wrote it). Or maybe the intent of
your question was to ask how was the setting of input_dev was done?

> > BTW, in regards to the VLANs, what is wrong with using the
> > netdev->iflink to figure the real device? 
> 
> 
> vlan1
>           ifb1
> vlan2
> vlan3
>           ifb2
> vlan4
> 
> dhcp daemon would bind to ifb1 and use pktinfo cmsg to figure
> out whether the packet was received on vlan1 or vlan2, the
> actual physical device is of no interest anymore.
> 

Let me see if i understood correctly:
you have a device from both vlan1 and vlan2 both being redirected to
ifb1? and vlan 3, 4=> ifb2?
And i take it to make it interesting vlan1,2 on eth0 and vlan3,4 ontop
of eth1? note the iflink would/should reflect eth1/2. If you had bonding
or bridging in between it gets more interesting. But you are saying
you are no longer interested in the info that it came via eth1/2 at some
point, correct? 
Note, in such a case: iflink rewriting will do it just fine
but then you loose the original info (I think it would be good to not
loose such info to know the origin). I dont know if cmsg uses iflink at
all - they probably look at the ifindex.

Ok, now question: why are you binding on top of ifb? we could have
redirected this to ipip, or tun0 or loopback or eql etc where it may
equally not have made sense to bind to. I can see something like this
making sense for bonding or bridging but not as a generic answer for all
netdevices.

> Looking at your ifb code I see that you set skb->dev based on
> iif and update iif unconditionally. I see a lot of problems
> with this, firstly iif is already updated in the mirred action,
> secondly iif doesn't necessarily point to the device the packet
> was last seen on. Could you maybe explain the policy of iif
> you have in mind, it doesn't seem very consistent.
> 

The concept is to use ifb as a transitionary device. Packets come into
it from either the ingress side of the stack or egress and get returned
on the same side/spot they were found - with ifb represented as the
input device. What ifb does is independent of what mirred does (we could
redirect to other devices other than ifb).

Now that i pay attention to your patch i think i see the complications
it is introducing (i am not done yet)- and i did warn about this in the
discussion _and_ didnt we agree to leave this stuff alone? Can we drop
this change please?

> BTW, why does ifb have a hard header length? The resulting
> push and pull only slows things down.

Good question. I know there were a lot of oddities i had to deal with
when different link layers cross over (some have smaller headers than
other and yet others had none ;->)
I will have to go and look at my notes test cases and get back to you; 

It does work as it is right now (I only notice a pull - there is no
push).

cheers,
jamal 

^ permalink raw reply	[flat|nested] 91+ messages in thread

* Re: [PATCH 2/3] [VLAN]: Update iif when receiving via VLAN device
  2006-06-28 12:22                 ` jamal
@ 2006-06-28 13:01                   ` Thomas Graf
  2006-06-28 13:46                     ` jamal
  0 siblings, 1 reply; 91+ messages in thread
From: Thomas Graf @ 2006-06-28 13:01 UTC (permalink / raw)
  To: jamal; +Cc: netdev, David Miller, Patrick McHardy

* jamal <hadi@cyberus.ca> 2006-06-28 08:22
> Let me see if i understood correctly:
> you have a device from both vlan1 and vlan2 both being redirected to
> ifb1? and vlan 3, 4=> ifb2?
> And i take it to make it interesting vlan1,2 on eth0 and vlan3,4 ontop
> of eth1? note the iflink would/should reflect eth1/2. If you had bonding
> or bridging in between it gets more interesting. But you are saying
> you are no longer interested in the info that it came via eth1/2 at some
> point, correct? 
> Note, in such a case: iflink rewriting will do it just fine
> but then you loose the original info (I think it would be good to not
> loose such info to know the origin). I dont know if cmsg uses iflink at
> all - they probably look at the ifindex.

It's using rt_iif which is currently derived from skb->dev even
though it would make sense to use skb->iif once that is well
defined.

> Ok, now question: why are you binding on top of ifb? we could have
> redirected this to ipip, or tun0 or loopback or eql etc where it may
> equally not have made sense to bind to. I can see something like this
> making sense for bonding or bridging but not as a generic answer for all
> netdevices.

What's special about that? It seems a perfectly fine and simple way
to create namespaces for sockets without enforcing applications to
bind to the correct vlan devices directly.

> The concept is to use ifb as a transitionary device. Packets come into
> it from either the ingress side of the stack or egress and get returned
> on the same side/spot they were found - with ifb represented as the
> input device. What ifb does is independent of what mirred does (we could
> redirect to other devices other than ifb).

This is clear, my question was for you to explain when you intend
to update input_dev etc. By overwriting it in ifb you enforce that
valueable information is lost.

There is nothing wrong with updating iif to point to ifb under
certain circumstances but it's not required to strictly enforce
this. Having mirred do so conditionally is a lot more flexible
without losing any value.

> Now that i pay attention to your patch i think i see the complications
> it is introducing (i am not done yet)- and i did warn about this in the
> discussion _and_ didnt we agree to leave this stuff alone? Can we drop
> this change please?

This statement doesn't make sense, it merely transforms a device
reference to a reference by interface index because the current
method of handling it doesn't make sense at all since the region
where input_dev stays valid without the risk of the device
disappearing is very limited.

> It does work as it is right now (I only notice a pull - there is no
> push).

In order to pull in ifb you need to push in mirred, quite
obvious, no?  Why are you enforcing users of ifb to fake an
ethernet header that is of no use at all?

^ permalink raw reply	[flat|nested] 91+ messages in thread

* Re: [PATCH 2/3] [VLAN]: Update iif when receiving via VLAN device
  2006-06-28 13:01                   ` Thomas Graf
@ 2006-06-28 13:46                     ` jamal
  2006-06-29  8:51                       ` Thomas Graf
  0 siblings, 1 reply; 91+ messages in thread
From: jamal @ 2006-06-28 13:46 UTC (permalink / raw)
  To: Thomas Graf; +Cc: Patrick McHardy, David Miller, netdev

On Wed, 2006-28-06 at 15:01 +0200, Thomas Graf wrote:
> * jamal <hadi@cyberus.ca> 2006-06-28 08:22
[..]
> > Note, in such a case: iflink rewriting will do it just fine
> > but then you loose the original info (I think it would be good to not
> > loose such info to know the origin). I dont know if cmsg uses iflink at
> > all - they probably look at the ifindex.
> 
> It's using rt_iif which is currently derived from skb->dev even
> though it would make sense to use skb->iif once that is well
> defined.
> 

Why not use iflink?
It exists, by definition, specifically as "the ifindex of the down muxed
netdevice" (should probably add that definition in the .h).

This is semantically different from a message to the stack which says
"this came to you from input device X" (represented by input_dev).

> > Ok, now question: why are you binding on top of ifb? we could have
> > redirected this to ipip, or tun0 or loopback or eql etc where it may
> > equally not have made sense to bind to. I can see something like this
> > making sense for bonding or bridging but not as a generic answer for all
> > netdevices.
> 
> What's special about that? It seems a perfectly fine and simple way
> to create namespaces for sockets without enforcing applications to
> bind to the correct vlan devices directly.
> 

Can you achieve the same by binding to any arbitrary netdevice? If you
can then i would agree.

> This is clear, my question was for you to explain when you intend
> to update input_dev etc. By overwriting it in ifb you enforce that
> valueable information is lost.

It is design intent for this field to carry information on where the
packet came from. We may loop many times between devices (between
ingress->egress or ingress->ingress, within the ttl limit) and each time
the only meaningful thing is which was the last netdevice that was
sending it.

> There is nothing wrong with updating iif to point to ifb under
> certain circumstances but it's not required to strictly enforce
> this. Having mirred do so conditionally is a lot more flexible
> without losing any value.
> 

Refer to what i said above.

> > Now that i pay attention to your patch i think i see the complications
> > it is introducing (i am not done yet)- and i did warn about this in the
> > discussion _and_ didnt we agree to leave this stuff alone? Can we drop
> > this change please?
> 
> This statement doesn't make sense, it merely transforms a device
> reference to a reference by interface index because the current
> method of handling it doesn't make sense at all since the region
> where input_dev stays valid without the risk of the device
> disappearing is very limited.
> 

Please refer to what i said above in that you could end up redirecting
many times. I have not added yet the code which allows mirred to also do
ingress redirection which would make it even more interesting.
The challenge is this, and if you can solve it we would be fine:
- I need to access both the input_dev and dev and their metadata.
I could clearly find them by their ifindices and then reference them
after that. For performance reasons that is not optimal in the fast
path.  
[Note, we have had this discussion before and even then we came to some
consensus that it would be hard to achieve that hence my view, again, to
drop this].

> > It does work as it is right now (I only notice a pull - there is no
> > push).
> 
> In order to pull in ifb you need to push in mirred, quite
> obvious, no?  

I think that my thinking at the time was that ifb can only receive
packets via mirred and therefore the pull on ingress at ifb may have
been to counter the push that occurs at mirred. Let me look at my notes
and testcases when i get back home; it may make sense not to have it at
ifb - the fact that it works as is tells me there is more to it than the
obvious.

> Why are you enforcing users of ifb to fake an
> ethernet header that is of no use at all?

This may just be an artifact of inheriting things from dummy device.
The answer will be clear when i look at my notes. 

cheers,
jamal


^ permalink raw reply	[flat|nested] 91+ messages in thread

* Re: [PATCH 2/3] [VLAN]: Update iif when receiving via VLAN device
  2006-06-28 13:46                     ` jamal
@ 2006-06-29  8:51                       ` Thomas Graf
  2006-06-29 20:55                         ` Thomas Graf
  2006-06-29 23:23                         ` jamal
  0 siblings, 2 replies; 91+ messages in thread
From: Thomas Graf @ 2006-06-29  8:51 UTC (permalink / raw)
  To: jamal; +Cc: Patrick McHardy, David Miller, netdev

* jamal <hadi@cyberus.ca> 2006-06-28 09:46
> Why not use iflink?
> It exists, by definition, specifically as "the ifindex of the down muxed
> netdevice" (should probably add that definition in the .h).
> 
> This is semantically different from a message to the stack which says
> "this came to you from input device X" (represented by input_dev).

I disagree, iflink information is bogus once we start redirecting.

> > What's special about that? It seems a perfectly fine and simple way
> > to create namespaces for sockets without enforcing applications to
> > bind to the correct vlan devices directly.
> > 
> 
> Can you achieve the same by binding to any arbitrary netdevice? If you
> can then i would agree.

It was your request to not update input_dev in netif_receive_skb()
but rather have each netdevice handle it manually. What's currently
done in ifb:

	skb->dev = skb->input_dev;
	skb->input_dev = dev;

Confusing, isn't it? I really don't get input_dev/iif is supposed to
be updated in ifb. If a packet is to be redirected, the mirred action
shall take care of providing this information to the next netdevice.
My additional request is to provide a flag to disable this for special
purposes not hurting anyone.

> Please refer to what i said above in that you could end up redirecting
> many times. I have not added yet the code which allows mirred to also do
> ingress redirection which would make it even more interesting.
> The challenge is this, and if you can solve it we would be fine:
> - I need to access both the input_dev and dev and their metadata.
> I could clearly find them by their ifindices and then reference them
> after that. For performance reasons that is not optimal in the fast
> path.  

Nice, you forget that while redirecting the device pointed to by
input_dev can disappear leaving behind an illegal reference. The
purpose of this patch is to fix this.

^ permalink raw reply	[flat|nested] 91+ messages in thread

* Re: [PATCH 2/3] [VLAN]: Update iif when receiving via VLAN device
  2006-06-29  8:51                       ` Thomas Graf
@ 2006-06-29 20:55                         ` Thomas Graf
  2006-06-29 23:23                         ` jamal
  1 sibling, 0 replies; 91+ messages in thread
From: Thomas Graf @ 2006-06-29 20:55 UTC (permalink / raw)
  To: jamal; +Cc: Patrick McHardy, David Miller, netdev

* Thomas Graf <tgraf@suug.ch> 2006-06-29 10:51
> If a packet is to be redirected, the mirred action
> shall take care of providing this information to the next netdevice.
> My additional request is to provide a flag to disable this for special
> purposes not hurting anyone.

[NET]: Use interface index to keep input device information

Using the interface index instead of a direct reference
allows a safe usage beyond the scope where an interface
could disappear.

The old input_dev field was incorrectly made dependant
on CONFIG_NET_CLS_ACT in skb_copy().

The ifb device is changed to not update iif since the only
feeder is mirred which already takes care of this.

Signed-off-by: Thomas Graf <tgraf@suug.ch>

Index: net-2.6.git/include/linux/skbuff.h
===================================================================
--- net-2.6.git.orig/include/linux/skbuff.h
+++ net-2.6.git/include/linux/skbuff.h
@@ -181,7 +181,6 @@ enum {
  *	@sk: Socket we are owned by
  *	@tstamp: Time we arrived
  *	@dev: Device we arrived on/are leaving by
- *	@input_dev: Device we arrived on
  *	@h: Transport layer header
  *	@nh: Network layer header
  *	@mac: Link layer header
@@ -192,6 +191,7 @@ enum {
  *	@data_len: Data length
  *	@mac_len: Length of link layer header
  *	@csum: Checksum
+ *	@iif: Device we arrived on
  *	@local_df: allow local fragmentation
  *	@cloned: Head may be cloned (check refcnt to be sure)
  *	@nohdr: Payload reference only, must not modify header
@@ -228,7 +228,6 @@ struct sk_buff {
 	struct sock		*sk;
 	struct skb_timeval	tstamp;
 	struct net_device	*dev;
-	struct net_device	*input_dev;
 
 	union {
 		struct tcphdr	*th;
@@ -266,6 +265,7 @@ struct sk_buff {
 				data_len,
 				mac_len,
 				csum;
+	int			iif;
 	__u32			priority;
 	__u8			local_df:1,
 				cloned:1,
Index: net-2.6.git/include/net/pkt_cls.h
===================================================================
--- net-2.6.git.orig/include/net/pkt_cls.h
+++ net-2.6.git/include/net/pkt_cls.h
@@ -352,14 +352,19 @@ tcf_change_indev(struct tcf_proto *tp, c
 static inline int
 tcf_match_indev(struct sk_buff *skb, char *indev)
 {
+	int ret = 1;
+
 	if (indev[0]) {
-		if  (!skb->input_dev)
-			return 0;
-		if (strcmp(indev, skb->input_dev->name))
+		struct net_device *dev;
+
+		dev = dev_get_by_index(skb->iif);
+		if  (!dev)
 			return 0;
+		ret = !strcmp(indev, dev->name);
+		dev_put(dev);
 	}
 
-	return 1;
+	return ret;
 }
 #endif /* CONFIG_NET_CLS_IND */
 
Index: net-2.6.git/net/core/dev.c
===================================================================
--- net-2.6.git.orig/net/core/dev.c
+++ net-2.6.git/net/core/dev.c
@@ -1715,8 +1715,8 @@ static int ing_filter(struct sk_buff *sk
 	if (dev->qdisc_ingress) {
 		__u32 ttl = (__u32) G_TC_RTTL(skb->tc_verd);
 		if (MAX_RED_LOOP < ttl++) {
-			printk("Redir loop detected Dropping packet (%s->%s)\n",
-				skb->input_dev->name, skb->dev->name);
+			printk("Redir loop detected Dropping packet (%d->%s)\n",
+				skb->iif, skb->dev->name);
 			return TC_ACT_SHOT;
 		}
 
@@ -1749,8 +1749,8 @@ int netif_receive_skb(struct sk_buff *sk
 	if (!skb->tstamp.off_sec)
 		net_timestamp(skb);
 
-	if (!skb->input_dev)
-		skb->input_dev = skb->dev;
+	if (!skb->iif)
+		skb->iif = skb->dev->ifindex;
 
 	orig_dev = skb_bond(skb);
 
Index: net-2.6.git/net/core/skbuff.c
===================================================================
--- net-2.6.git.orig/net/core/skbuff.c
+++ net-2.6.git/net/core/skbuff.c
@@ -463,10 +463,10 @@ struct sk_buff *skb_clone(struct sk_buff
 	n->tc_verd = SET_TC_VERD(skb->tc_verd,0);
 	n->tc_verd = CLR_TC_OK2MUNGE(n->tc_verd);
 	n->tc_verd = CLR_TC_MUNGED(n->tc_verd);
-	C(input_dev);
 #endif
 	skb_copy_secmark(n, skb);
 #endif
+	C(iif);
 	C(truesize);
 	atomic_set(&n->users, 1);
 	C(head);
Index: net-2.6.git/net/sched/act_api.c
===================================================================
--- net-2.6.git.orig/net/sched/act_api.c
+++ net-2.6.git/net/sched/act_api.c
@@ -156,9 +156,8 @@ int tcf_action_exec(struct sk_buff *skb,
 
 	if (skb->tc_verd & TC_NCLS) {
 		skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
-		D2PRINTK("(%p)tcf_action_exec: cleared TC_NCLS in %s out %s\n",
-		         skb, skb->input_dev ? skb->input_dev->name : "xxx",
-		         skb->dev->name);
+		D2PRINTK("(%p)tcf_action_exec: cleared TC_NCLS in %d out %s\n",
+		         skb, skb->iif, skb->dev->name);
 		ret = TC_ACT_OK;
 		goto exec_done;
 	}
Index: net-2.6.git/net/sched/act_mirred.c
===================================================================
--- net-2.6.git.orig/net/sched/act_mirred.c
+++ net-2.6.git/net/sched/act_mirred.c
@@ -207,7 +207,7 @@ bad_mirred:
 		skb2->tc_verd = SET_TC_FROM(skb2->tc_verd, at);
 
 	skb2->dev = dev;
-	skb2->input_dev = skb->dev;
+	skb2->iif = skb->dev->ifindex;
 	dev_queue_xmit(skb2);
 	spin_unlock(&p->lock);
 	return p->action;
Index: net-2.6.git/drivers/net/ifb.c
===================================================================
--- net-2.6.git.orig/drivers/net/ifb.c
+++ net-2.6.git/drivers/net/ifb.c
@@ -158,7 +158,7 @@ static int ifb_xmit(struct sk_buff *skb,
 	stats->tx_packets++;
 	stats->tx_bytes+=skb->len;
 
-	if (!from || !skb->input_dev) {
+	if (!from || !skb->iif) {
 dropped:
 		dev_kfree_skb(skb);
 		stats->rx_dropped++;
@@ -169,8 +169,7 @@ dropped:
 		 * ingress -> egress or
 		 * egress -> ingress
 		*/
-		skb->dev = skb->input_dev;
-		skb->input_dev = dev;
+		skb->dev = dev;
 		if (from & AT_INGRESS) {
 			skb_pull(skb, skb->dev->hard_header_len);
 		} else {

^ permalink raw reply	[flat|nested] 91+ messages in thread

* Re: [PATCH 2/3] [VLAN]: Update iif when receiving via VLAN device
  2006-06-29  8:51                       ` Thomas Graf
  2006-06-29 20:55                         ` Thomas Graf
@ 2006-06-29 23:23                         ` jamal
  2006-06-29 23:39                           ` Thomas Graf
  1 sibling, 1 reply; 91+ messages in thread
From: jamal @ 2006-06-29 23:23 UTC (permalink / raw)
  To: Thomas Graf; +Cc: Patrick McHardy, David Miller, netdev

I noticed the other email carrying a patch. Let me respond to this email
and hopefully that will address the patch.

On Thu, 2006-29-06 at 10:51 +0200, Thomas Graf wrote: 
> * jamal <hadi@cyberus.ca> 2006-06-28 09:46
[..] 
> I disagree, iflink information is bogus once we start redirecting.

After redirecting, iflink is "available" i.e it doesnt make any
more sense and therefore it should be (ab)usable to carry other info.
But the more i think about it, using skb->dev is just fine for 
finding the device you are looking for even when it gets redirected.

I am assuming you still want to find out - in the case of a topology
which constitutes more than one netdevice - the second last netdevice,
correct?

> > 
> > Can you achieve the same by binding to any arbitrary netdevice? If you
> > can then i would agree.
> 
> It was your request to not update input_dev in netif_receive_skb()
> but rather have each netdevice handle it manually. 

For good reasons of course.

note: you didnt answer the question i asked, but probably the answer
doesnt matter.

> What's currently
> done in ifb:
> 
> 	skb->dev = skb->input_dev;
> 	skb->input_dev = dev;
> 
> Confusing, isn't it? 

not at all. Let me explain the design intent further below.

> I really don't get input_dev/iif is supposed to
> be updated in ifb. If a packet is to be redirected, the mirred action
> shall take care of providing this information to the next netdevice.

ifb is a special purpose device.
Think of it as a redirector, or injector if you want a better noun. If
you are going to inject packets back to the stack at arbitrary points,
then you need to reflect that in the packet metadata implying where it
came from. Likewise if you are going to redirect packets into arbitrary
points in the stack (as mirred does), you need to do the same. 
I dont know if that makes sense. 

> My additional request is to provide a flag to disable this for special
> purposes not hurting anyone.

I am failing to see the purpose. 
With all due respect, you are changing the design intent, Thomas.

I know your intent is noble in trying to save the 32 bits on 64 bit
machines (at least thats where your patch seems to have started) but the
cost:benefit ratio as i have pointed out is unreasonable. 
If you can meet the requirements i have specified (I am leaving them
below) i will be a happy camper.

> > The challenge is this, and if you can solve it we would be fine:
> > - I need to access both the input_dev and dev and their metadata.
> > I could clearly find them by their ifindices and then reference them
> > after that. For performance reasons that is not optimal in the fast
> > path.  
> 

So just we are clear: I have strong desire to save compute rather than
than saving 32 bits per skb on 64 bit machine. 

> Nice, you forget that while redirecting the device pointed to by
> input_dev can disappear leaving behind an illegal reference. 

We have had this discussion before - that is resolvable via
dev_put/hold.

> The purpose of this patch is to fix this.

Appreciated - but can you do it without replacing the input_dev?

cheers,
jamal

^ permalink raw reply	[flat|nested] 91+ messages in thread

* Re: [PATCH 2/3] [VLAN]: Update iif when receiving via VLAN device
  2006-06-29 23:23                         ` jamal
@ 2006-06-29 23:39                           ` Thomas Graf
  2006-06-29 23:47                             ` David Miller
  2006-06-30  0:03                             ` jamal
  0 siblings, 2 replies; 91+ messages in thread
From: Thomas Graf @ 2006-06-29 23:39 UTC (permalink / raw)
  To: jamal; +Cc: Patrick McHardy, David Miller, netdev

* jamal <hadi@cyberus.ca> 2006-06-29 19:23
> > What's currently
> > done in ifb:
> > 
> > 	skb->dev = skb->input_dev;
> > 	skb->input_dev = dev;
> > 
> > Confusing, isn't it? 
> 
> not at all. Let me explain the design intent further below.

Then let me show you what your code does:

 [mirred attached to filter on eth0 redirecting to ifb0]

tcf_mirred():    skb2->input_dev = skb->dev; (skb2->input_dev=eth0)
ifb_xmit():      skb->dev = skb->input_dev; (skb->dev=eth0)
                 skb->input_dev = dev; (skb->input_dev=ifb0)

So when reentering the stack the skb looks like it would be
on eth0 coming from ifb0. Is that what you wanted? Shouldn't
it be skb->dev=ifb0 skb->input_dev=eth0? That's what my patch
would change it to.

> I know your intent is noble in trying to save the 32 bits on 64 bit
> machines (at least thats where your patch seems to have started) but the
> cost:benefit ratio as i have pointed out is unreasonable. 

Did I ever claim this? You made this up right now. As of now
it doesn't save a single bit.


^ permalink raw reply	[flat|nested] 91+ messages in thread

* Re: [PATCH 2/3] [VLAN]: Update iif when receiving via VLAN device
  2006-06-29 23:39                           ` Thomas Graf
@ 2006-06-29 23:47                             ` David Miller
  2006-06-30  0:08                               ` jamal
  2006-06-30  0:03                             ` jamal
  1 sibling, 1 reply; 91+ messages in thread
From: David Miller @ 2006-06-29 23:47 UTC (permalink / raw)
  To: tgraf; +Cc: hadi, kaber, netdev

From: Thomas Graf <tgraf@suug.ch>
Date: Fri, 30 Jun 2006 01:39:33 +0200

> * jamal <hadi@cyberus.ca> 2006-06-29 19:23
> > I know your intent is noble in trying to save the 32 bits on 64 bit
> > machines (at least thats where your patch seems to have started) but the
> > cost:benefit ratio as i have pointed out is unreasonable. 
> 
> Did I ever claim this? You made this up right now. As of now
> it doesn't save a single bit.

Right.

>From what I can see Thomas's work allows to do correct
reference counting on the input_dev, something which is
not possible right now and is a bug.

^ permalink raw reply	[flat|nested] 91+ messages in thread

* Re: [PATCH 2/3] [VLAN]: Update iif when receiving via VLAN device
  2006-06-29 23:39                           ` Thomas Graf
  2006-06-29 23:47                             ` David Miller
@ 2006-06-30  0:03                             ` jamal
  2006-06-30  0:46                               ` Thomas Graf
  1 sibling, 1 reply; 91+ messages in thread
From: jamal @ 2006-06-30  0:03 UTC (permalink / raw)
  To: Thomas Graf; +Cc: netdev, David Miller, Patrick McHardy

On Fri, 2006-30-06 at 01:39 +0200, Thomas Graf wrote:
> * jamal <hadi@cyberus.ca> 2006-06-29 19:23

> > not at all. Let me explain the design intent further below.
> 
> Then let me show you what your code does:
> 
>  [mirred attached to filter on eth0 redirecting to ifb0]
> 
> tcf_mirred():    skb2->input_dev = skb->dev; (skb2->input_dev=eth0)
> ifb_xmit():      skb->dev = skb->input_dev; (skb->dev=eth0)
>                  skb->input_dev = dev; (skb->input_dev=ifb0)
> 
> So when reentering the stack the skb looks like it would be
> on eth0 coming from ifb0. Is that what you wanted? 

ok, that looks like egress side of the stack, correct?
 
indeed thats what i meant earlier i.e above looks right. Another way to
look at it, is that on the stack, the "dev" part acts as a "to" label of
the direction and the "input_dev" maps to the "from". 


step 0: Packet arriving at egress of eth0 
It will have skb->dev pointing to "eth0" and skb->input_dev will depend
on whether it is coming from the local path (in which it will be NULL)
or it was being routed (in which case it will reflect the netdevice it
last passed on input.

Step 1: when it leaves redirect
As you note it will have indev("from") = eth0 and dev("to") still "ifb0"

Step 2: when it leaves ifb going back to eth0
it will have input_dev("from") = ifb0 and dev("to")  "eth0"

Of course this gets more interesting if you had say redirected to
another device like lo which redirected to ifb1. Or when you try to
create loops etc.

And if you look at ingress, it will be slightly different; however,
in both cases (ingress/egress) things are consistent in the labeling of
from/to to be input_dev/dev

> Shouldn't
> it be skb->dev=ifb0 skb->input_dev=eth0? That's what my patch
> would change it to.
> 

yes, that would change the semantics

> > I know your intent is noble in trying to save the 32 bits on 64 bit
> > machines (at least thats where your patch seems to have started) but the
> > cost:benefit ratio as i have pointed out is unreasonable. 
> 
> Did I ever claim this? You made this up right now. As of now
> it doesn't save a single bit.

Ok, I apologize - i assumed it has something to do with skb diet. 

cheers,
jamal


^ permalink raw reply	[flat|nested] 91+ messages in thread

* Re: [PATCH 2/3] [VLAN]: Update iif when receiving via VLAN device
  2006-06-29 23:47                             ` David Miller
@ 2006-06-30  0:08                               ` jamal
  2006-06-30  0:12                                 ` David Miller
  0 siblings, 1 reply; 91+ messages in thread
From: jamal @ 2006-06-30  0:08 UTC (permalink / raw)
  To: David Miller; +Cc: netdev, kaber, tgraf

On Thu, 2006-29-06 at 16:47 -0700, David Miller wrote:
> From: Thomas Graf <tgraf@suug.ch>
> Date: Fri, 30 Jun 2006 01:39:33 +0200
> 
> > * jamal <hadi@cyberus.ca> 2006-06-29 19:23
> > > I know your intent is noble in trying to save the 32 bits on 64 bit
> > > machines (at least thats where your patch seems to have started) but the
> > > cost:benefit ratio as i have pointed out is unreasonable. 
> > 
> > Did I ever claim this? You made this up right now. As of now
> > it doesn't save a single bit.
> 
> Right.
> 

What am i missing?
on 64bit machine, does it not save 32 bits to use an ifindex as opposed
to the pointer?

> From what I can see Thomas's work allows to do correct
> reference counting on the input_dev, something which is
> not possible right now and is a bug.

Yes, it is a bug, but:
dev_hold/put dont work anymore? why do you need an ifindex instead?

cheers,
jamal


^ permalink raw reply	[flat|nested] 91+ messages in thread

* Re: [PATCH 2/3] [VLAN]: Update iif when receiving via VLAN device
  2006-06-30  0:08                               ` jamal
@ 2006-06-30  0:12                                 ` David Miller
  2006-06-30  0:26                                   ` jamal
  0 siblings, 1 reply; 91+ messages in thread
From: David Miller @ 2006-06-30  0:12 UTC (permalink / raw)
  To: hadi; +Cc: netdev, kaber, tgraf

From: jamal <hadi@cyberus.ca>
Date: Thu, 29 Jun 2006 20:08:19 -0400

> What am i missing?
> on 64bit machine, does it not save 32 bits to use an ifindex as opposed
> to the pointer?

The objects around it are pointers, which are 64-bit, and thus
the 32-bit object gets padded out to 64-bits in the layout of
the struct so that the next pointer member can be properly
aligned.

It does not change the size of sk_buff at all.

> Yes, it is a bug, but:
> dev_hold/put dont work anymore? why do you need an ifindex instead?

You sure you want to do that atomic operation on every single
input packet, regardless of whether egresss operations are
using it or not?

We should put the cost of features at the actual users, and not
impose it upon everyone.

^ permalink raw reply	[flat|nested] 91+ messages in thread

* Re: [PATCH 2/3] [VLAN]: Update iif when receiving via VLAN device
  2006-06-30  0:12                                 ` David Miller
@ 2006-06-30  0:26                                   ` jamal
  2006-06-30  0:29                                     ` David Miller
  0 siblings, 1 reply; 91+ messages in thread
From: jamal @ 2006-06-30  0:26 UTC (permalink / raw)
  To: David Miller; +Cc: tgraf, kaber, netdev

On Thu, 2006-29-06 at 17:12 -0700, David Miller wrote:
> From: jamal <hadi@cyberus.ca>
> Date: Thu, 29 Jun 2006 20:08:19 -0400
> 
> > What am i missing?
> > on 64bit machine, does it not save 32 bits to use an ifindex as opposed
> > to the pointer?
> 
> The objects around it are pointers, which are 64-bit, and thus
> the 32-bit object gets padded out to 64-bits in the layout of
> the struct so that the next pointer member can be properly
> aligned.
> 
> It does not change the size of sk_buff at all.
> 

I see; i take it if things were moved around that may change?

> > Yes, it is a bug, but:
> > dev_hold/put dont work anymore? why do you need an ifindex instead?
> 
> You sure you want to do that atomic operation on every single
> input packet, regardless of whether egress operations are
> using it or not?
> 

Can you avoid doing the refcount?
Note Thomas is doing dev_get_by_index (which will do the atomic ref
count).

For me the choice is between having the iif and:
- __get device from ifindex
- reference dev->something

vs getting the input_dev and
- reference dev->something

> We should put the cost of features at the actual users, and not
> impose it upon everyone.

I didnt quiet follow, the ref count seems only needed in the
redirection, no?

cheers,
jamal


^ permalink raw reply	[flat|nested] 91+ messages in thread

* Re: [PATCH 2/3] [VLAN]: Update iif when receiving via VLAN device
  2006-06-30  0:26                                   ` jamal
@ 2006-06-30  0:29                                     ` David Miller
  2006-06-30  0:44                                       ` Ben Greear
  2006-06-30  0:48                                       ` jamal
  0 siblings, 2 replies; 91+ messages in thread
From: David Miller @ 2006-06-30  0:29 UTC (permalink / raw)
  To: hadi; +Cc: tgraf, kaber, netdev

From: jamal <hadi@cyberus.ca>
Date: Thu, 29 Jun 2006 20:26:20 -0400

> On Thu, 2006-29-06 at 17:12 -0700, David Miller wrote:
> > The objects around it are pointers, which are 64-bit, and thus
> > the 32-bit object gets padded out to 64-bits in the layout of
> > the struct so that the next pointer member can be properly
> > aligned.
> > 
> > It does not change the size of sk_buff at all.
> > 
> 
> I see; i take it if things were moved around that may change?

Yes.

> > > Yes, it is a bug, but:
> > > dev_hold/put dont work anymore? why do you need an ifindex instead?
> > 
> > You sure you want to do that atomic operation on every single
> > input packet, regardless of whether egress operations are
> > using it or not?
> > 
> 
> Can you avoid doing the refcount?
> Note Thomas is doing dev_get_by_index (which will do the atomic ref
> count).

He is doing that where skb->input_device is needed, which is
what we want.

> I didnt quiet follow, the ref count seems only needed in the
> redirection, no?

I'm saying that, we don't need the refcount, just setting
the skb->input_index thing, unless someone actually cares
about the input device.

As long as the packet hits not paths that care about the
SKB input device, no atomic refcounts are taken.  It's
just an integer sitting there in the SKB.

^ permalink raw reply	[flat|nested] 91+ messages in thread

* Re: [PATCH 2/3] [VLAN]: Update iif when receiving via VLAN device
  2006-06-30  0:29                                     ` David Miller
@ 2006-06-30  0:44                                       ` Ben Greear
  2006-06-30  0:48                                       ` jamal
  1 sibling, 0 replies; 91+ messages in thread
From: Ben Greear @ 2006-06-30  0:44 UTC (permalink / raw)
  To: David Miller; +Cc: hadi, tgraf, kaber, netdev

David Miller wrote:

> I'm saying that, we don't need the refcount, just setting
> the skb->input_index thing, unless someone actually cares
> about the input device.
> 
> As long as the packet hits not paths that care about the
> SKB input device, no atomic refcounts are taken.  It's
> just an integer sitting there in the SKB.

Also, if this lookup is to be done multiple times per skb, we could
do the lookup once and grab the ref at that time and store the
pointer in the skb.  If you wanted to be truly horible about it..could
use a 64-bit input_index and copy the pointer there (and set a flag somewhere
noting it is no longer an index but a pointer) so save the extra
64-bits.

Ben


> -
> To unsubscribe from this list: send the line "unsubscribe netdev" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> 


-- 
Ben Greear <greearb@candelatech.com>
Candela Technologies Inc  http://www.candelatech.com


^ permalink raw reply	[flat|nested] 91+ messages in thread

* Re: [PATCH 2/3] [VLAN]: Update iif when receiving via VLAN device
  2006-06-30  0:03                             ` jamal
@ 2006-06-30  0:46                               ` Thomas Graf
  2006-06-30  1:11                                 ` jamal
  0 siblings, 1 reply; 91+ messages in thread
From: Thomas Graf @ 2006-06-30  0:46 UTC (permalink / raw)
  To: jamal; +Cc: netdev, David Miller, Patrick McHardy

* jamal <hadi@cyberus.ca> 2006-06-29 20:03
> On Fri, 2006-30-06 at 01:39 +0200, Thomas Graf wrote:
> > * jamal <hadi@cyberus.ca> 2006-06-29 19:23
> 
> > > not at all. Let me explain the design intent further below.
> > 
> > Then let me show you what your code does:
> > 
> >  [mirred attached to filter on eth0 redirecting to ifb0]
> > 
> > tcf_mirred():    skb2->input_dev = skb->dev; (skb2->input_dev=eth0)
> > ifb_xmit():      skb->dev = skb->input_dev; (skb->dev=eth0)
> >                  skb->input_dev = dev; (skb->input_dev=ifb0)
> > 
> > So when reentering the stack the skb looks like it would be
> > on eth0 coming from ifb0. Is that what you wanted? 
> 
> ok, that looks like egress side of the stack, correct?

No, that's the ingress side leaving ifb again via netif_rx()
skb->dev should represent the from/at= and not to=. 

For egress your code is correct although impossible to guess
right due to total lack of a comment where it would make sense
and naming that doesn't give any implications.

When leaving ifb0 you want for...
... egress:
   skb->dev=to (eth0) skb->iif=from (ifb0)
... ingress:
   skb->dev=at (ifb0) skb->iif=from (eth0)

So we move the update to the tasklet and set skb->dev to
skb->iif before dev_queue_xmit() and skb->dev = dev (ifb)
before calling netif_rx()

Does that fullfil your requirements?

^ permalink raw reply	[flat|nested] 91+ messages in thread

* Re: [PATCH 2/3] [VLAN]: Update iif when receiving via VLAN device
  2006-06-30  0:29                                     ` David Miller
  2006-06-30  0:44                                       ` Ben Greear
@ 2006-06-30  0:48                                       ` jamal
  2006-06-30  0:55                                         ` Thomas Graf
  1 sibling, 1 reply; 91+ messages in thread
From: jamal @ 2006-06-30  0:48 UTC (permalink / raw)
  To: David Miller; +Cc: netdev, kaber, tgraf

On Thu, 2006-29-06 at 17:29 -0700, David Miller wrote:
> From: jamal <hadi@cyberus.ca>
> Date: Thu, 29 Jun 2006 20:26:20 -0400
[..]
> > I see; i take it if things were moved around that may change?
> 
> Yes.
> 

Ok, relief - so i was not totally unreasonable then ;->

> > Can you avoid doing the refcount?
> > Note Thomas is doing dev_get_by_index (which will do the atomic ref
> > count).
> 
> He is doing that where skb->input_device is needed, which is
> what we want.
> 

I am saying the same thing as well - i think. mirred touches
the input_dev and therefore setting the refcount in mirred is valid -
but iam unsure where to unset it.

> > I didnt quiet follow, the ref count seems only needed in the
> > redirection, no?
> 
> I'm saying that, we don't need the refcount, just setting
> the skb->input_index thing, unless someone actually cares
> about the input device.
> 

the ifb references it; only mirred redirects to the ifb at the moment.
You would need to increment in mirred, no?
Why do i feel i am missing something? ;->

> As long as the packet hits not paths that care about the
> SKB input device, no atomic refcounts are taken.  It's
> just an integer sitting there in the SKB.

indeed. 
I think whether it becomes ifindex or pointer you need to increment the
refcounter. and decrement somewhere.
The challenge for me is a choice to use more cycles if you use ifindex
vs less cycles with a pointer. The advantage for going with ifindex
would be to save those bits(if you rearrange). The question is which is
reasonable?;->

cheers,
jamal 

^ permalink raw reply	[flat|nested] 91+ messages in thread

* Re: [PATCH 2/3] [VLAN]: Update iif when receiving via VLAN device
  2006-06-30  0:48                                       ` jamal
@ 2006-06-30  0:55                                         ` Thomas Graf
  2006-06-30  1:18                                           ` jamal
  0 siblings, 1 reply; 91+ messages in thread
From: Thomas Graf @ 2006-06-30  0:55 UTC (permalink / raw)
  To: jamal; +Cc: David Miller, netdev, kaber

* jamal <hadi@cyberus.ca> 2006-06-29 20:48
> the ifb references it; only mirred redirects to the ifb at the moment.
> You would need to increment in mirred, no?
> Why do i feel i am missing something? ;->

The point is to avoid having an atomic operation for every packet
when setting iif in netif_receive_skb(). If it was only for
mirred nobody would complain I guess.

> I think whether it becomes ifindex or pointer you need to increment the
> refcounter. and decrement somewhere.
> The challenge for me is a choice to use more cycles if you use ifindex
> vs less cycles with a pointer. The advantage for going with ifindex
> would be to save those bits(if you rearrange). The question is which is
> reasonable?;->

The third choice is to just don't care if the interface goes away
but have a chance to figure it out and just assume as if it would
have never been set. The number of devices that can disappear w/o
user control is very very limited and not worth an atomic operation
for every single packet.

^ permalink raw reply	[flat|nested] 91+ messages in thread

* Re: [PATCH 2/3] [VLAN]: Update iif when receiving via VLAN device
  2006-06-30  0:46                               ` Thomas Graf
@ 2006-06-30  1:11                                 ` jamal
  2006-06-30 13:08                                   ` Thomas Graf
  0 siblings, 1 reply; 91+ messages in thread
From: jamal @ 2006-06-30  1:11 UTC (permalink / raw)
  To: Thomas Graf; +Cc: Patrick McHardy, David Miller, netdev

On Fri, 2006-30-06 at 02:46 +0200, Thomas Graf wrote:
> * jamal <hadi@cyberus.ca> 2006-06-29 20:03
> > On Fri, 2006-30-06 at 01:39 +0200, Thomas Graf wrote:
> > > * jamal <hadi@cyberus.ca> 2006-06-29 19:23
> > 
> > > > not at all. Let me explain the design intent further below.
> > > 
> > > Then let me show you what your code does:
> > > 
> > >  [mirred attached to filter on eth0 redirecting to ifb0]
> > > 
> > > tcf_mirred():    skb2->input_dev = skb->dev; (skb2->input_dev=eth0)
> > > ifb_xmit():      skb->dev = skb->input_dev; (skb->dev=eth0)
> > >                  skb->input_dev = dev; (skb->input_dev=ifb0)
> > > 
> > > So when reentering the stack the skb looks like it would be
> > > on eth0 coming from ifb0. Is that what you wanted? 
> > 
> > ok, that looks like egress side of the stack, correct?
> 
> No, that's the ingress side leaving ifb again via netif_rx()
>
> skb->dev should represent the from/at= and not to=. 
> 

Heres what it would look at ingress:

step 0: coming from wire via eth0,
dev=eth0, input_dev=eth0

step 1: redirect to ifb0, leaving redirect
dev=ifb0, input_dev=eth0

step 2: leaving ifb0, coming back to ingress side of stack

dev= eth0, input_dev=ifb0

Again, it gets interesting when you have redirection multiple times
and create loops. It will get more interesting when i submit the code
for redirecting to ingress.
The good news is there is very strong consistency.
I dont know if "at" is the correct description. I visualize it as
hitting the stack just "at" the point.

when the packet first comes in both from/to point to eth0.
So the semantics are the same as in egress.

> For egress your code is correct although impossible to guess
> right due to total lack of a comment where it would make sense
> and naming that doesn't give any implications.
> 

There are some simple comments and  I have been trying very hard to
explain this for a long time.
Patrick was the last person who tortured me ;-> Perhaps i need to
write a document. 

> When leaving ifb0 you want for...
> ... egress:
>    skb->dev=to (eth0) skb->iif=from (ifb0)
> ... ingress:
>    skb->dev=at (ifb0) skb->iif=from (eth0)
> 

Yes, this is correct. I described the flow of the first one in the
earlier email and the ingress side.
Although lets not talk about iif yet. I posed a question earlier on
which scheme would be better.

> So we move the update to the tasklet and set skb->dev to
> skb->iif before dev_queue_xmit() and skb->dev = dev (ifb)
> before calling netif_rx()
> 
> Does that fullfil your requirements?

You lost me on what you are trying to achieve. Just restore the line you
removed in ifb and things would be fine. Lets then settle the issue of
iif vs input_dev.

cheers,
jamal

^ permalink raw reply	[flat|nested] 91+ messages in thread

* Re: [PATCH 2/3] [VLAN]: Update iif when receiving via VLAN device
  2006-06-30  0:55                                         ` Thomas Graf
@ 2006-06-30  1:18                                           ` jamal
  0 siblings, 0 replies; 91+ messages in thread
From: jamal @ 2006-06-30  1:18 UTC (permalink / raw)
  To: Thomas Graf; +Cc: kaber, netdev, David Miller

On Fri, 2006-30-06 at 02:55 +0200, Thomas Graf wrote:
> * jamal <hadi@cyberus.ca> 2006-06-29 20:48

> The point is to avoid having an atomic operation for every packet
> when setting iif in netif_receive_skb(). If it was only for
> mirred nobody would complain I guess.
> 

I never intended to punish all users. I think i was misunderstood.
The only problem is where do you decrement the refcount when you
increment at mirred?
In your case, I assume it is at some sort of rule destruction?

> > I think whether it becomes ifindex or pointer you need to increment the
> > refcounter. and decrement somewhere.
> > The challenge for me is a choice to use more cycles if you use ifindex
> > vs less cycles with a pointer. The advantage for going with ifindex
> > would be to save those bits(if you rearrange). The question is which is
> > reasonable?;->
> 
> The third choice is to just don't care if the interface goes away
> but have a chance to figure it out and just assume as if it would
> have never been set. The number of devices that can disappear w/o
> user control is very very limited and not worth an atomic operation
> for every single packet.
> 

Now that you mention this - I think that option 3 is what we said last
time we had this discussion ;->

cheers,
jamal


^ permalink raw reply	[flat|nested] 91+ messages in thread

* Re: [PATCH 2/3] [VLAN]: Update iif when receiving via VLAN device
  2006-06-30  1:11                                 ` jamal
@ 2006-06-30 13:08                                   ` Thomas Graf
  2006-06-30 13:20                                     ` Nicolas Dichtel
                                                       ` (2 more replies)
  0 siblings, 3 replies; 91+ messages in thread
From: Thomas Graf @ 2006-06-30 13:08 UTC (permalink / raw)
  To: jamal; +Cc: Patrick McHardy, David Miller, netdev

* jamal <hadi@cyberus.ca> 2006-06-29 21:11
> Heres what it would look at ingress:
> 
> step 0: coming from wire via eth0,
> dev=eth0, input_dev=eth0
> 
> step 1: redirect to ifb0, leaving redirect
> dev=ifb0, input_dev=eth0
> 
> step 2: leaving ifb0, coming back to ingress side of stack
> 
> dev= eth0, input_dev=ifb0

That creates a nice loop on ingress. Upon reentering the
stack with skb->dev set to eth0 again we'll go through the
same ingress filters as the first time and we'll hit ifb0
again over and over. Are you suggesting everyone has to
insert a pass action matching input_dev in order to escape
the loop when using ifb?

> > When leaving ifb0 you want for...
> > ... egress:
> >    skb->dev=to (eth0) skb->iif=from (ifb0)
> > ... ingress:
> >    skb->dev=at (ifb0) skb->iif=from (eth0)
> > 
> 
> Yes, this is correct. I described the flow of the first one in the
> earlier email and the ingress side.

How can it be correct if it differs from your description
above? What I described is what the patch changes it to.
Looking closer at ifb it contains a race when updating
skb->dev. Preempt has to be disabled when updating skb->dev
before calling netif_rx() otherwise the device might disappear.

[NET]: Use interface index to keep input device information

Using the interface index instead of a direct reference
allows a safe usage beyond the scope where an interface
could disappear.

The old input_dev field was incorrectly made dependant
on CONFIG_NET_CLS_ACT in skb_copy().

The ifb device is fixed to set skb->dev in a manner that
the device can't disappear before calling netif_rx() and
the semantics are fixed so a packet reentering the stack
looks like it would have been received on the ifb device.

Signed-off-by: Thomas Graf <tgraf@suug.ch>

Index: net-2.6.git/include/linux/skbuff.h
===================================================================
--- net-2.6.git.orig/include/linux/skbuff.h
+++ net-2.6.git/include/linux/skbuff.h
@@ -181,7 +181,6 @@ enum {
  *	@sk: Socket we are owned by
  *	@tstamp: Time we arrived
  *	@dev: Device we arrived on/are leaving by
- *	@input_dev: Device we arrived on
  *	@h: Transport layer header
  *	@nh: Network layer header
  *	@mac: Link layer header
@@ -192,6 +191,7 @@ enum {
  *	@data_len: Data length
  *	@mac_len: Length of link layer header
  *	@csum: Checksum
+ *	@iif: Device we arrived on
  *	@local_df: allow local fragmentation
  *	@cloned: Head may be cloned (check refcnt to be sure)
  *	@nohdr: Payload reference only, must not modify header
@@ -228,7 +228,6 @@ struct sk_buff {
 	struct sock		*sk;
 	struct skb_timeval	tstamp;
 	struct net_device	*dev;
-	struct net_device	*input_dev;
 
 	union {
 		struct tcphdr	*th;
@@ -266,6 +265,7 @@ struct sk_buff {
 				data_len,
 				mac_len,
 				csum;
+	int			iif;
 	__u32			priority;
 	__u8			local_df:1,
 				cloned:1,
Index: net-2.6.git/include/net/pkt_cls.h
===================================================================
--- net-2.6.git.orig/include/net/pkt_cls.h
+++ net-2.6.git/include/net/pkt_cls.h
@@ -352,14 +352,19 @@ tcf_change_indev(struct tcf_proto *tp, c
 static inline int
 tcf_match_indev(struct sk_buff *skb, char *indev)
 {
+	int ret = 1;
+
 	if (indev[0]) {
-		if  (!skb->input_dev)
-			return 0;
-		if (strcmp(indev, skb->input_dev->name))
+		struct net_device *dev;
+
+		dev = dev_get_by_index(skb->iif);
+		if  (!dev)
 			return 0;
+		ret = !strcmp(indev, dev->name);
+		dev_put(dev);
 	}
 
-	return 1;
+	return ret;
 }
 #endif /* CONFIG_NET_CLS_IND */
 
Index: net-2.6.git/net/core/dev.c
===================================================================
--- net-2.6.git.orig/net/core/dev.c
+++ net-2.6.git/net/core/dev.c
@@ -1715,8 +1715,8 @@ static int ing_filter(struct sk_buff *sk
 	if (dev->qdisc_ingress) {
 		__u32 ttl = (__u32) G_TC_RTTL(skb->tc_verd);
 		if (MAX_RED_LOOP < ttl++) {
-			printk("Redir loop detected Dropping packet (%s->%s)\n",
-				skb->input_dev->name, skb->dev->name);
+			printk("Redir loop detected Dropping packet (%d->%s)\n",
+				skb->iif, skb->dev->name);
 			return TC_ACT_SHOT;
 		}
 
@@ -1749,8 +1749,8 @@ int netif_receive_skb(struct sk_buff *sk
 	if (!skb->tstamp.off_sec)
 		net_timestamp(skb);
 
-	if (!skb->input_dev)
-		skb->input_dev = skb->dev;
+	if (!skb->iif)
+		skb->iif = skb->dev->ifindex;
 
 	orig_dev = skb_bond(skb);
 
Index: net-2.6.git/net/core/skbuff.c
===================================================================
--- net-2.6.git.orig/net/core/skbuff.c
+++ net-2.6.git/net/core/skbuff.c
@@ -463,10 +463,10 @@ struct sk_buff *skb_clone(struct sk_buff
 	n->tc_verd = SET_TC_VERD(skb->tc_verd,0);
 	n->tc_verd = CLR_TC_OK2MUNGE(n->tc_verd);
 	n->tc_verd = CLR_TC_MUNGED(n->tc_verd);
-	C(input_dev);
 #endif
 	skb_copy_secmark(n, skb);
 #endif
+	C(iif);
 	C(truesize);
 	atomic_set(&n->users, 1);
 	C(head);
Index: net-2.6.git/net/sched/act_api.c
===================================================================
--- net-2.6.git.orig/net/sched/act_api.c
+++ net-2.6.git/net/sched/act_api.c
@@ -156,9 +156,8 @@ int tcf_action_exec(struct sk_buff *skb,
 
 	if (skb->tc_verd & TC_NCLS) {
 		skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
-		D2PRINTK("(%p)tcf_action_exec: cleared TC_NCLS in %s out %s\n",
-		         skb, skb->input_dev ? skb->input_dev->name : "xxx",
-		         skb->dev->name);
+		D2PRINTK("(%p)tcf_action_exec: cleared TC_NCLS in %d out %s\n",
+		         skb, skb->iif, skb->dev->name);
 		ret = TC_ACT_OK;
 		goto exec_done;
 	}
Index: net-2.6.git/net/sched/act_mirred.c
===================================================================
--- net-2.6.git.orig/net/sched/act_mirred.c
+++ net-2.6.git/net/sched/act_mirred.c
@@ -207,7 +207,7 @@ bad_mirred:
 		skb2->tc_verd = SET_TC_FROM(skb2->tc_verd, at);
 
 	skb2->dev = dev;
-	skb2->input_dev = skb->dev;
+	skb2->iif = skb->dev->ifindex;
 	dev_queue_xmit(skb2);
 	spin_unlock(&p->lock);
 	return p->action;
Index: net-2.6.git/drivers/net/ifb.c
===================================================================
--- net-2.6.git.orig/drivers/net/ifb.c
+++ net-2.6.git/drivers/net/ifb.c
@@ -98,13 +98,41 @@ static void ri_tasklet(unsigned long dev
 		stats->tx_packets++;
 		stats->tx_bytes +=skb->len;
 		if (from & AT_EGRESS) {
+			/*
+			 * Skb was given to us at egress, direct it back
+			 * to where it came from [iif] but update iif to
+			 * signal its new origin.
+			 */
+			struct net_device *iif;
+
+			iif = __dev_get_by_index(skb->iif);
+			if (!iif)
+				goto drop;
+
 			dp->st_rx_frm_egr++;
+
+			/* Already holding a reference on iif netdevice. */
+			skb->dev = iif;
+			skb->iif = _dev->ifindex;
 			dev_queue_xmit(skb);
 		} else if (from & AT_INGRESS) {
-
+			/*
+			 * Skb was given to us at ingress, reinject into
+			 * stack as if it would have been received on
+			 * this device.
+			 */
 			dp->st_rx_frm_ing++;
+
+			/*
+			 * Disable preempt until holding new reference on
+			 * skb->dev in netif_rx().
+			 */
+			preempt_disable();
+			skb->dev = _dev;
 			netif_rx(skb);
+			preempt_enable();
 		} else {
+drop:
 			dev_kfree_skb(skb);
 			stats->tx_dropped++;
 		}
@@ -158,7 +186,7 @@ static int ifb_xmit(struct sk_buff *skb,
 	stats->tx_packets++;
 	stats->tx_bytes+=skb->len;
 
-	if (!from || !skb->input_dev) {
+	if (!from || !skb->iif) {
 dropped:
 		dev_kfree_skb(skb);
 		stats->rx_dropped++;
@@ -169,8 +197,6 @@ dropped:
 		 * ingress -> egress or
 		 * egress -> ingress
 		*/
-		skb->dev = skb->input_dev;
-		skb->input_dev = dev;
 		if (from & AT_INGRESS) {
 			skb_pull(skb, skb->dev->hard_header_len);
 		} else {

^ permalink raw reply	[flat|nested] 91+ messages in thread

* Re: [PATCH 2/3] [VLAN]: Update iif when receiving via VLAN device
  2006-06-30 13:08                                   ` Thomas Graf
@ 2006-06-30 13:20                                     ` Nicolas Dichtel
  2006-06-30 13:57                                     ` jamal
       [not found]                                     ` <44A52435.20909@6wind.com>
  2 siblings, 0 replies; 91+ messages in thread
From: Nicolas Dichtel @ 2006-06-30 13:20 UTC (permalink / raw)
  To: netdev

Thomas Graf a écrit :
> * jamal <hadi@cyberus.ca> 2006-06-29 21:11
>> Heres what it would look at ingress:
>>
>> step 0: coming from wire via eth0,
>> dev=eth0, input_dev=eth0
>>
>> step 1: redirect to ifb0, leaving redirect
>> dev=ifb0, input_dev=eth0
>>
>> step 2: leaving ifb0, coming back to ingress side of stack
>>
>> dev= eth0, input_dev=ifb0
>
> That creates a nice loop on ingress. Upon reentering the
> stack with skb->dev set to eth0 again we'll go through the
> same ingress filters as the first time and we'll hit ifb0
> again over and over. Are you suggesting everyone has to
> insert a pass action matching input_dev in order to escape
> the loop when using ifb?
Bit 8 of skb->tc_verd is set by IFB, so packet isn't reclassify.
This bit avoid the loop.

Regards,
Nicolas

^ permalink raw reply	[flat|nested] 91+ messages in thread

* Re: [PATCH 2/3] [VLAN]: Update iif when receiving via VLAN device
       [not found]                                     ` <44A52435.20909@6wind.com>
@ 2006-06-30 13:36                                       ` Thomas Graf
  2006-06-30 13:43                                         ` Nicolas Dichtel
  2006-06-30 17:01                                       ` Patrick McHardy
  1 sibling, 1 reply; 91+ messages in thread
From: Thomas Graf @ 2006-06-30 13:36 UTC (permalink / raw)
  To: Nicolas Dichtel; +Cc: jamal, Patrick McHardy, David Miller, netdev

* Nicolas Dichtel <nicolas.dichtel@6wind.com> 2006-06-30 15:16
> >That creates a nice loop on ingress. Upon reentering the
> >stack with skb->dev set to eth0 again we'll go through the
> >same ingress filters as the first time and we'll hit ifb0
> >again over and over. Are you suggesting everyone has to
> >insert a pass action matching input_dev in order to escape
> >the loop when using ifb?
> Bit 8 of skb->tc_verd is set by IFB, so packet isn't reclassify.
> This bit avoid the loop.

Right, my mistake. Just making classification impossible after
going through an ifb device certainly seems like a perfect
idea, nice!

^ permalink raw reply	[flat|nested] 91+ messages in thread

* Re: [PATCH 2/3] [VLAN]: Update iif when receiving via VLAN device
  2006-06-30 13:36                                       ` Thomas Graf
@ 2006-06-30 13:43                                         ` Nicolas Dichtel
  0 siblings, 0 replies; 91+ messages in thread
From: Nicolas Dichtel @ 2006-06-30 13:43 UTC (permalink / raw)
  To: Thomas Graf; +Cc: jamal, Patrick McHardy, David Miller, netdev

Thomas Graf a écrit :
> * Nicolas Dichtel <nicolas.dichtel@6wind.com> 2006-06-30 15:16
>>> That creates a nice loop on ingress. Upon reentering the
>>> stack with skb->dev set to eth0 again we'll go through the
>>> same ingress filters as the first time and we'll hit ifb0
>>> again over and over. Are you suggesting everyone has to
>>> insert a pass action matching input_dev in order to escape
>>> the loop when using ifb?
>> Bit 8 of skb->tc_verd is set by IFB, so packet isn't reclassify.
>> This bit avoid the loop.
>
> Right, my mistake. Just making classification impossible after
> going through an ifb device certainly seems like a perfect
> idea, nice!
Classification has already be done on the first pass. Mirror
action must be the last classifier.

Nicolas

^ permalink raw reply	[flat|nested] 91+ messages in thread

* Re: [PATCH 2/3] [VLAN]: Update iif when receiving via VLAN device
  2006-06-30 13:08                                   ` Thomas Graf
  2006-06-30 13:20                                     ` Nicolas Dichtel
@ 2006-06-30 13:57                                     ` jamal
  2006-06-30 14:15                                       ` Thomas Graf
       [not found]                                     ` <44A52435.20909@6wind.com>
  2 siblings, 1 reply; 91+ messages in thread
From: jamal @ 2006-06-30 13:57 UTC (permalink / raw)
  To: Thomas Graf; +Cc: netdev, David Miller, Patrick McHardy

On Fri, 2006-30-06 at 15:08 +0200, Thomas Graf wrote:
> * jamal <hadi@cyberus.ca> 2006-06-29 21:11
> > Heres what it would look at ingress:
> > 
> > step 0: coming from wire via eth0,
> > dev=eth0, input_dev=eth0
> > 
> > step 1: redirect to ifb0, leaving redirect
> > dev=ifb0, input_dev=eth0
> > 
> > step 2: leaving ifb0, coming back to ingress side of stack
> > 
> > dev= eth0, input_dev=ifb0
> 
> That creates a nice loop on ingress. Upon reentering the
> stack with skb->dev set to eth0 again we'll go through the
> same ingress filters as the first time and we'll hit ifb0
> again over and over. 

loops are taken care of by other metadata. 

> Are you suggesting everyone has to
> insert a pass action matching input_dev in order to escape
> the loop when using ifb?
> 

This works, there are no loops. If you use a meta setter and changed
input_dev to something that creates a loop it will still be caught when
ttls expire.

> > > When leaving ifb0 you want for...
> > > ... egress:
> > >    skb->dev=to (eth0) skb->iif=from (ifb0)
> > > ... ingress:
> > >    skb->dev=at (ifb0) skb->iif=from (eth0)
> > > 
> > 
> > Yes, this is correct. I described the flow of the first one in the
> > earlier email and the ingress side.
> 
> How can it be correct if it differs from your description
> above? What I described is what the patch changes it to.

Double check again: it works as described above; your change messes it.

> Looking closer at ifb it contains a race when updating
> skb->dev. Preempt has to be disabled when updating skb->dev
> before calling netif_rx() otherwise the device might disappear.
> 

I am going to ignore the patch until we resolve the issue of iif vs
input_dev. Why dont we discuss that?

cheers,
jamal


^ permalink raw reply	[flat|nested] 91+ messages in thread

* Re: [PATCH 2/3] [VLAN]: Update iif when receiving via VLAN device
  2006-06-30 13:57                                     ` jamal
@ 2006-06-30 14:15                                       ` Thomas Graf
  2006-06-30 14:35                                         ` jamal
  0 siblings, 1 reply; 91+ messages in thread
From: Thomas Graf @ 2006-06-30 14:15 UTC (permalink / raw)
  To: jamal; +Cc: netdev, David Miller, Patrick McHardy

* jamal <hadi@cyberus.ca> 2006-06-30 09:57
> On Fri, 2006-30-06 at 15:08 +0200, Thomas Graf wrote:
> > That creates a nice loop on ingress. Upon reentering the
> > stack with skb->dev set to eth0 again we'll go through the
> > same ingress filters as the first time and we'll hit ifb0
> > again over and over. 
> 
> loops are taken care of by other metadata. 

Not really, assuming a simple setup such as:

	mirred attached to eth0 redirecting to ifb0
	mirred attached to ifb0 redirecting to ifb1

will result in:

eth0::tcf_mirred()     skb->dev = ifb0, input_dev = eth0
ifb0::tcf_mirred()     skb->dev = ifb1, input_dev = ifb0
ifb1::ifb_xmit()       skb->dev = ifb0, input_dev = ifb1, set ncls bit
ifb0::ifb_xmit()       skb->dev = ifb1, input_dev = ifb0
ifb1::ifb_xmit()       skb->dev = ifb0, input_dev = ifb1
...

Oh dear... and we don't even have a ttl to catch this.

> I am going to ignore the patch until we resolve the issue of iif vs
> input_dev. Why dont we discuss that?

It's starting to get useless to discuss with you. You agreed in
your last posting that the 3rd option, being that not caring about
whether a device might disappear but having a way to check for it,
is what we agreed on and what makes most sense, yet you fail to see
that using ifindex is exactly what reflects this descision.

^ permalink raw reply	[flat|nested] 91+ messages in thread

* Re: [PATCH 2/3] [VLAN]: Update iif when receiving via VLAN device
  2006-06-30 14:15                                       ` Thomas Graf
@ 2006-06-30 14:35                                         ` jamal
  2006-06-30 15:40                                           ` Ben Greear
  2006-06-30 16:32                                           ` Thomas Graf
  0 siblings, 2 replies; 91+ messages in thread
From: jamal @ 2006-06-30 14:35 UTC (permalink / raw)
  To: Thomas Graf; +Cc: Patrick McHardy, David Miller, netdev

On Fri, 2006-30-06 at 16:15 +0200, Thomas Graf wrote:
> * jamal <hadi@cyberus.ca> 2006-06-30 09:57
> > On Fri, 2006-30-06 at 15:08 +0200, Thomas Graf wrote:
> > > That creates a nice loop on ingress. Upon reentering the
> > > stack with skb->dev set to eth0 again we'll go through the
> > > same ingress filters as the first time and we'll hit ifb0
> > > again over and over. 
> > 
> > loops are taken care of by other metadata. 
> 
> Not really, assuming a simple setup such as:
> 
> 	mirred attached to eth0 redirecting to ifb0
> 	mirred attached to ifb0 redirecting to ifb1
> 
> will result in:
> 
> eth0::tcf_mirred()     skb->dev = ifb0, input_dev = eth0
> ifb0::tcf_mirred()     skb->dev = ifb1, input_dev = ifb0
> ifb1::ifb_xmit()       skb->dev = ifb0, input_dev = ifb1, set ncls bit
> ifb0::ifb_xmit()       skb->dev = ifb1, input_dev = ifb0
> ifb1::ifb_xmit()       skb->dev = ifb0, input_dev = ifb1
> ...
> 
> Oh dear... and we don't even have a ttl to catch this.
> 

Did you actually try to run this before you reached this conclusion?

> > I am going to ignore the patch until we resolve the issue of iif vs
> > input_dev. Why dont we discuss that?
> 
> It's starting to get useless to discuss with you. 

sigh.
ok, why dont you take a deep breath or a break because i think we are
not going to make any progress this way. I know you may be feeling
frustrated but i am equally if not more frustrated as well.
Our conflict comes in the fact that you are trying to change the
architecture in the name of fixing bugs. 
With all due respect, the architecture works. I have invested many many
hours testing and verifying. There may be coding bugs - and those need
fixing. Kill the bugs.

> You agreed in
> your last posting that the 3rd option, being that not caring about
> whether a device might disappear but having a way to check for it,
> is what we agreed on and what makes most sense, 

yes, i recalled that as the last discussion we had.

> yet you fail to see
> that using ifindex is exactly what reflects this descision.
> 

The choice is between using an ifindex and a pointer to the device. 
Why is it solvable via an ifindex but not a device pointer?
I may have misunderstood you, so look at this as a fresh opportunity to
enlighten me. Forget about all the discussion weve had thus far.

cheers,
jamal


^ permalink raw reply	[flat|nested] 91+ messages in thread

* Re: [PATCH 2/3] [VLAN]: Update iif when receiving via VLAN device
  2006-06-30 14:35                                         ` jamal
@ 2006-06-30 15:40                                           ` Ben Greear
  2006-06-30 16:32                                           ` Thomas Graf
  1 sibling, 0 replies; 91+ messages in thread
From: Ben Greear @ 2006-06-30 15:40 UTC (permalink / raw)
  To: hadi; +Cc: Thomas Graf, Patrick McHardy, David Miller, netdev

jamal wrote:
> On Fri, 2006-30-06 at 16:15 +0200, Thomas Graf wrote:

>>You agreed in
>>your last posting that the 3rd option, being that not caring about
>>whether a device might disappear but having a way to check for it,
>>is what we agreed on and what makes most sense, 
> 
> 
> yes, i recalled that as the last discussion we had.
> 
> 
>>yet you fail to see
>>that using ifindex is exactly what reflects this descision.
>>
> 
> 
> The choice is between using an ifindex and a pointer to the device. 
> Why is it solvable via an ifindex but not a device pointer?
> I may have misunderstood you, so look at this as a fresh opportunity to
> enlighten me. Forget about all the discussion weve had thus far.

If you use a pointer, without taking it's reference, then that device
can go away, and leave you with a stale memory access, ie kernel crash
or worse.

If you take a reference for every packet (which means an atomic op), then
you slow down every packet, even those that will never use the reference.

If you instead use the if-index, then you do not need to take a reference
for every packet, but only when you actually want to look at that device.
If the device has since disappeared, then you get a null when trying to find
by reference, and the code can take appropriate action.  No chance for
access of stale memory.

Ben

-- 
Ben Greear <greearb@candelatech.com>
Candela Technologies Inc  http://www.candelatech.com

^ permalink raw reply	[flat|nested] 91+ messages in thread

* Re: [PATCH 2/3] [VLAN]: Update iif when receiving via VLAN device
  2006-06-30 14:35                                         ` jamal
  2006-06-30 15:40                                           ` Ben Greear
@ 2006-06-30 16:32                                           ` Thomas Graf
  2006-06-30 17:13                                             ` Thomas Graf
  2006-06-30 17:19                                             ` jamal
  1 sibling, 2 replies; 91+ messages in thread
From: Thomas Graf @ 2006-06-30 16:32 UTC (permalink / raw)
  To: jamal; +Cc: Patrick McHardy, David Miller, netdev

* jamal <hadi@cyberus.ca> 2006-06-30 10:35
> On Fri, 2006-30-06 at 16:15 +0200, Thomas Graf wrote:
> > eth0::tcf_mirred()     skb->dev = ifb0, input_dev = eth0
> > ifb0::tcf_mirred()     skb->dev = ifb1, input_dev = ifb0
> > ifb1::ifb_xmit()       skb->dev = ifb0, input_dev = ifb1, set ncls bit
> > ifb0::ifb_xmit()       skb->dev = ifb1, input_dev = ifb0
> > ifb1::ifb_xmit()       skb->dev = ifb0, input_dev = ifb1
> > ...
> > 
> > Oh dear... and we don't even have a ttl to catch this.
> > 
> 
> Did you actually try to run this before you reached this conclusion?

I did, fortunately some other bug prevents this from happening,
packets are simply dropped somewhere.

> With all due respect, the architecture works. I have invested many many
> hours testing and verifying. There may be coding bugs - and those need
> fixing. Kill the bugs.

Right, just run this

tc filter add dev eth0 parent 1: protocol ip prio 10
              u32 match ip tos 0 0 flowid 1:1
	      action mirred egress redirect dev ifb1
tc filter add dev ifb1 parent 1: protocol ip prio 10
              u32 match ip tos 0 0 flowid 1:1
	      action mirred egress redirect dev ifb0


Anyways, I give up. Last time I've been running after you trying
to fix the many bugs you leave behind. Ever noticed that whenever
you add some new code it's someone else following up with tons of
small bugfix patches having a hard time trying to figure out the
actual intent. I'll just duplicate the code for my purpose, so
much easier.

^ permalink raw reply	[flat|nested] 91+ messages in thread

* Re: [PATCH 2/3] [VLAN]: Update iif when receiving via VLAN device
       [not found]                                     ` <44A52435.20909@6wind.com>
  2006-06-30 13:36                                       ` Thomas Graf
@ 2006-06-30 17:01                                       ` Patrick McHardy
  2006-06-30 17:02                                         ` Patrick McHardy
  1 sibling, 1 reply; 91+ messages in thread
From: Patrick McHardy @ 2006-06-30 17:01 UTC (permalink / raw)
  To: nicolas.dichtel; +Cc: Thomas Graf, jamal, David Miller, netdev

Nicolas Dichtel wrote:
> Bit 8 of skb->tc_verd is set by IFB, so packet isn't reclassify.
> This bit avoid the loop.

It would, if something would actually set it.

~/src/kernel/linux-2.6$ grep NCLS -r net/
net/core/dev.c: if (skb->tc_verd & TC_NCLS) {
net/core/dev.c:         skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
net/sched/act_api.c:    if (skb->tc_verd & TC_NCLS) {
net/sched/act_api.c:            skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
net/sched/act_api.c:            D2PRINTK("(%p)tcf_action_exec: cleared
TC_NCLS in %s out %s\n",
~/src/kernel/linux-2.6$

Am I missing something? Jamal, where is this bit supposed to be set?


^ permalink raw reply	[flat|nested] 91+ messages in thread

* Re: [PATCH 2/3] [VLAN]: Update iif when receiving via VLAN device
  2006-06-30 17:01                                       ` Patrick McHardy
@ 2006-06-30 17:02                                         ` Patrick McHardy
  0 siblings, 0 replies; 91+ messages in thread
From: Patrick McHardy @ 2006-06-30 17:02 UTC (permalink / raw)
  To: nicolas.dichtel; +Cc: Thomas Graf, jamal, David Miller, netdev

Patrick McHardy wrote:
> Nicolas Dichtel wrote:
> 
>>Bit 8 of skb->tc_verd is set by IFB, so packet isn't reclassify.
>>This bit avoid the loop.
> 
> 
> It would, if something would actually set it.
> 
> ~/src/kernel/linux-2.6$ grep NCLS -r net/
> net/core/dev.c: if (skb->tc_verd & TC_NCLS) {
> net/core/dev.c:         skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
> net/sched/act_api.c:    if (skb->tc_verd & TC_NCLS) {
> net/sched/act_api.c:            skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
> net/sched/act_api.c:            D2PRINTK("(%p)tcf_action_exec: cleared
> TC_NCLS in %s out %s\n",
> ~/src/kernel/linux-2.6$
> 
> Am I missing something? Jamal, where is this bit supposed to be set?

OK, I'm apparently missing the ability to read :) Please disregard ..

^ permalink raw reply	[flat|nested] 91+ messages in thread

* Re: [PATCH 2/3] [VLAN]: Update iif when receiving via VLAN device
  2006-06-30 16:32                                           ` Thomas Graf
@ 2006-06-30 17:13                                             ` Thomas Graf
  2006-06-30 17:18                                               ` Patrick McHardy
                                                                 ` (2 more replies)
  2006-06-30 17:19                                             ` jamal
  1 sibling, 3 replies; 91+ messages in thread
From: Thomas Graf @ 2006-06-30 17:13 UTC (permalink / raw)
  To: jamal; +Cc: Patrick McHardy, David Miller, netdev

* Thomas Graf <tgraf@suug.ch> 2006-06-30 18:32
> Anyways, I give up. Last time I've been running after you trying
> to fix the many bugs you leave behind. Ever noticed that whenever
> you add some new code it's someone else following up with tons of
> small bugfix patches having a hard time trying to figure out the
> actual intent. I'll just duplicate the code for my purpose, so
> much easier.

There you go, leaves ifb broken as-is, at least prevents it
from crashing randomly when the input_dev disappears.

[NET]: Use interface index to keep input device information

Using the interface index instead of a direct reference
allows a safe usage beyond the scope where an interface
could disappear.

The old input_dev field was incorrectly made dependant
on CONFIG_NET_CLS_ACT in skb_copy().

Signed-off-by: Thomas Graf <tgraf@suug.ch>

Index: net-2.6.git/include/linux/skbuff.h
===================================================================
--- net-2.6.git.orig/include/linux/skbuff.h
+++ net-2.6.git/include/linux/skbuff.h
@@ -181,7 +181,6 @@ enum {
  *	@sk: Socket we are owned by
  *	@tstamp: Time we arrived
  *	@dev: Device we arrived on/are leaving by
- *	@input_dev: Device we arrived on
  *	@h: Transport layer header
  *	@nh: Network layer header
  *	@mac: Link layer header
@@ -192,6 +191,7 @@ enum {
  *	@data_len: Data length
  *	@mac_len: Length of link layer header
  *	@csum: Checksum
+ *	@iif: Device we arrived on
  *	@local_df: allow local fragmentation
  *	@cloned: Head may be cloned (check refcnt to be sure)
  *	@nohdr: Payload reference only, must not modify header
@@ -228,7 +228,6 @@ struct sk_buff {
 	struct sock		*sk;
 	struct skb_timeval	tstamp;
 	struct net_device	*dev;
-	struct net_device	*input_dev;
 
 	union {
 		struct tcphdr	*th;
@@ -266,6 +265,7 @@ struct sk_buff {
 				data_len,
 				mac_len,
 				csum;
+	int			iif;
 	__u32			priority;
 	__u8			local_df:1,
 				cloned:1,
Index: net-2.6.git/include/net/pkt_cls.h
===================================================================
--- net-2.6.git.orig/include/net/pkt_cls.h
+++ net-2.6.git/include/net/pkt_cls.h
@@ -352,14 +352,19 @@ tcf_change_indev(struct tcf_proto *tp, c
 static inline int
 tcf_match_indev(struct sk_buff *skb, char *indev)
 {
+	int ret = 1;
+
 	if (indev[0]) {
-		if  (!skb->input_dev)
-			return 0;
-		if (strcmp(indev, skb->input_dev->name))
+		struct net_device *dev;
+
+		dev = dev_get_by_index(skb->iif);
+		if  (!dev)
 			return 0;
+		ret = !strcmp(indev, dev->name);
+		dev_put(dev);
 	}
 
-	return 1;
+	return ret;
 }
 #endif /* CONFIG_NET_CLS_IND */
 
Index: net-2.6.git/net/core/dev.c
===================================================================
--- net-2.6.git.orig/net/core/dev.c
+++ net-2.6.git/net/core/dev.c
@@ -1715,8 +1715,8 @@ static int ing_filter(struct sk_buff *sk
 	if (dev->qdisc_ingress) {
 		__u32 ttl = (__u32) G_TC_RTTL(skb->tc_verd);
 		if (MAX_RED_LOOP < ttl++) {
-			printk("Redir loop detected Dropping packet (%s->%s)\n",
-				skb->input_dev->name, skb->dev->name);
+			printk("Redir loop detected Dropping packet (%d->%s)\n",
+				skb->iif, skb->dev->name);
 			return TC_ACT_SHOT;
 		}
 
@@ -1749,8 +1749,8 @@ int netif_receive_skb(struct sk_buff *sk
 	if (!skb->tstamp.off_sec)
 		net_timestamp(skb);
 
-	if (!skb->input_dev)
-		skb->input_dev = skb->dev;
+	if (!skb->iif)
+		skb->iif = skb->dev->ifindex;
 
 	orig_dev = skb_bond(skb);
 
Index: net-2.6.git/net/core/skbuff.c
===================================================================
--- net-2.6.git.orig/net/core/skbuff.c
+++ net-2.6.git/net/core/skbuff.c
@@ -463,10 +463,10 @@ struct sk_buff *skb_clone(struct sk_buff
 	n->tc_verd = SET_TC_VERD(skb->tc_verd,0);
 	n->tc_verd = CLR_TC_OK2MUNGE(n->tc_verd);
 	n->tc_verd = CLR_TC_MUNGED(n->tc_verd);
-	C(input_dev);
 #endif
 	skb_copy_secmark(n, skb);
 #endif
+	C(iif);
 	C(truesize);
 	atomic_set(&n->users, 1);
 	C(head);
Index: net-2.6.git/net/sched/act_api.c
===================================================================
--- net-2.6.git.orig/net/sched/act_api.c
+++ net-2.6.git/net/sched/act_api.c
@@ -156,9 +156,8 @@ int tcf_action_exec(struct sk_buff *skb,
 
 	if (skb->tc_verd & TC_NCLS) {
 		skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
-		D2PRINTK("(%p)tcf_action_exec: cleared TC_NCLS in %s out %s\n",
-		         skb, skb->input_dev ? skb->input_dev->name : "xxx",
-		         skb->dev->name);
+		D2PRINTK("(%p)tcf_action_exec: cleared TC_NCLS in %d out %s\n",
+		         skb, skb->iif, skb->dev->name);
 		ret = TC_ACT_OK;
 		goto exec_done;
 	}
Index: net-2.6.git/net/sched/act_mirred.c
===================================================================
--- net-2.6.git.orig/net/sched/act_mirred.c
+++ net-2.6.git/net/sched/act_mirred.c
@@ -207,7 +207,7 @@ bad_mirred:
 		skb2->tc_verd = SET_TC_FROM(skb2->tc_verd, at);
 
 	skb2->dev = dev;
-	skb2->input_dev = skb->dev;
+	skb2->iif = skb->dev->ifindex;
 	dev_queue_xmit(skb2);
 	spin_unlock(&p->lock);
 	return p->action;
Index: net-2.6.git/drivers/net/ifb.c
===================================================================
--- net-2.6.git.orig/drivers/net/ifb.c
+++ net-2.6.git/drivers/net/ifb.c
@@ -158,19 +158,23 @@ static int ifb_xmit(struct sk_buff *skb,
 	stats->tx_packets++;
 	stats->tx_bytes+=skb->len;
 
-	if (!from || !skb->input_dev) {
+	if (!from || !skb->iif) {
 dropped:
 		dev_kfree_skb(skb);
 		stats->rx_dropped++;
 		return ret;
 	} else {
+		struct net_device *iif;
 		/* 
 		 * note we could be going
 		 * ingress -> egress or
 		 * egress -> ingress
 		*/
-		skb->dev = skb->input_dev;
-		skb->input_dev = dev;
+		iif = __dev_get_by_index(skb->iif);
+		if (!iif)
+			goto dropped;
+		skb->dev = iif;
+		skb->iif = dev->ifindex;
 		if (from & AT_INGRESS) {
 			skb_pull(skb, skb->dev->hard_header_len);
 		} else {

^ permalink raw reply	[flat|nested] 91+ messages in thread

* Re: [PATCH 2/3] [VLAN]: Update iif when receiving via VLAN device
  2006-06-30 17:13                                             ` Thomas Graf
@ 2006-06-30 17:18                                               ` Patrick McHardy
  2006-06-30 17:32                                                 ` jamal
  2006-06-30 17:27                                               ` Ben Greear
  2006-06-30 21:09                                               ` jamal
  2 siblings, 1 reply; 91+ messages in thread
From: Patrick McHardy @ 2006-06-30 17:18 UTC (permalink / raw)
  To: Thomas Graf; +Cc: jamal, David Miller, netdev

Thomas Graf wrote:
> * Thomas Graf <tgraf@suug.ch> 2006-06-30 18:32
> 
>>Anyways, I give up. Last time I've been running after you trying
>>to fix the many bugs you leave behind. Ever noticed that whenever
>>you add some new code it's someone else following up with tons of
>>small bugfix patches having a hard time trying to figure out the
>>actual intent. I'll just duplicate the code for my purpose, so
>>much easier.
> 
> 
> There you go, leaves ifb broken as-is, at least prevents it
> from crashing randomly when the input_dev disappears.

That would be a pity. After this patch has been ACKed, could we start
over with the other bugs one at a time? I wasn't really able to gather
the problems from this thread, but some of the behaviour you mentioned
does seem questionable.


^ permalink raw reply	[flat|nested] 91+ messages in thread

* Re: [PATCH 2/3] [VLAN]: Update iif when receiving via VLAN device
  2006-06-30 16:32                                           ` Thomas Graf
  2006-06-30 17:13                                             ` Thomas Graf
@ 2006-06-30 17:19                                             ` jamal
  2006-06-30 17:33                                               ` Patrick McHardy
  2006-06-30 17:34                                               ` Thomas Graf
  1 sibling, 2 replies; 91+ messages in thread
From: jamal @ 2006-06-30 17:19 UTC (permalink / raw)
  To: Thomas Graf; +Cc: netdev, David Miller, Patrick McHardy

On Fri, 2006-30-06 at 18:32 +0200, Thomas Graf wrote:
> * jamal <hadi@cyberus.ca> 2006-06-30 10:35

> > Did you actually try to run this before you reached this conclusion?
> 
> I did, fortunately some other bug prevents this from happening,
> packets are simply dropped somewhere.
> 

It is not a bug, Thomas! I am getting a little frustrated now.
The packets will be dropped because we set the at field to zero which is
invalid. That is done on purpose. It is only meaningful for ifb. The
challenge is much bigger than it appears. You could end up deadlocking
on the tx lock. So this was the choice i had to make.

> > With all due respect, the architecture works. I have invested many many
> > hours testing and verifying. There may be coding bugs - and those need
> > fixing. Kill the bugs.
> 
> Right, just run this
> 
> tc filter add dev eth0 parent 1: protocol ip prio 10
>               u32 match ip tos 0 0 flowid 1:1
> 	      action mirred egress redirect dev ifb1
> tc filter add dev ifb1 parent 1: protocol ip prio 10
>               u32 match ip tos 0 0 flowid 1:1
> 	      action mirred egress redirect dev ifb0
> 
> 
> Anyways, I give up. 

I understood you when you first posted. This is part of my testing. 
Try also to redirect from the same device eth0 to eth0 etc and see some
more interesting things.

> Last time I've been running after you trying
> to fix the many bugs you leave behind. Ever noticed that whenever
> you add some new code it's someone else following up with tons of
> small bugfix patches having a hard time trying to figure out the
> actual intent. 

You could always ask instead of making assumptions. And dont get me wrong, I 
honestly do appreciate you fixing bugs. You have been great and i have
always thanked you - except in situations like this where you just
stress the hell out of me. Bugs will always be there. 
Even God has bugs - but you are not fixing bugs in this case. 

You are attempting to change architecture (which works just fine) in the way you 
think it should work - and then point to something as a bug because it 
doesnt work the way you think it should work. 
This is a problem not just with you BTW, but with Patrick as well (although he has 
gotten better lately). There is a huge difference for example when dealing with 
Herbert. My approach in situations like this, which you dont have to follow, is 
to ask first what the intent was then if i dont like the intent try to convince the
owner that there maybe better ways. Or why they are wrong. 

To make my case, look at what you just did above in just the last 2 emails: 
You made a claim there is a bug. 
I asked you if you had really tested what you are pointing to (because i
know i test for that). You come back and make claims the bug is
elsewhere. This could go on forever typically - and infact it is throughout this
thread. 
Didnt ask if this is perhaps the way it is supposed to work or what the
intent was. It has to be a bug and by golly you are fixing it. Put yourself in 
my shoes.

> I'll just duplicate the code for my purpose, so
> much easier.
> 

Well, I am sorry you feel that way. I dont even remember where it is that 
we started.

cheers,
jamal

^ permalink raw reply	[flat|nested] 91+ messages in thread

* Re: [PATCH 2/3] [VLAN]: Update iif when receiving via VLAN device
  2006-06-30 17:13                                             ` Thomas Graf
  2006-06-30 17:18                                               ` Patrick McHardy
@ 2006-06-30 17:27                                               ` Ben Greear
  2006-06-30 21:09                                               ` jamal
  2 siblings, 0 replies; 91+ messages in thread
From: Ben Greear @ 2006-06-30 17:27 UTC (permalink / raw)
  To: Thomas Graf; +Cc: jamal, Patrick McHardy, David Miller, netdev

Thomas Graf wrote:
> * Thomas Graf <tgraf@suug.ch> 2006-06-30 18:32
> 
>>Anyways, I give up. Last time I've been running after you trying
>>to fix the many bugs you leave behind. Ever noticed that whenever
>>you add some new code it's someone else following up with tons of
>>small bugfix patches having a hard time trying to figure out the
>>actual intent. I'll just duplicate the code for my purpose, so
>>much easier.
> 
> 
> There you go, leaves ifb broken as-is, at least prevents it
> from crashing randomly when the input_dev disappears.
> 
> [NET]: Use interface index to keep input device information
> 
> Using the interface index instead of a direct reference
> allows a safe usage beyond the scope where an interface
> could disappear.
> 
> The old input_dev field was incorrectly made dependant
> on CONFIG_NET_CLS_ACT in skb_copy().
> 
> Signed-off-by: Thomas Graf <tgraf@suug.ch>

> ===================================================================
> --- net-2.6.git.orig/drivers/net/ifb.c
> +++ net-2.6.git/drivers/net/ifb.c
> @@ -158,19 +158,23 @@ static int ifb_xmit(struct sk_buff *skb,
>  	stats->tx_packets++;
>  	stats->tx_bytes+=skb->len;
>  
> -	if (!from || !skb->input_dev) {
> +	if (!from || !skb->iif) {

Since iif of 0 is valid (afaik), this check is now bogus, eh?

Ben

-- 
Ben Greear <greearb@candelatech.com>
Candela Technologies Inc  http://www.candelatech.com


^ permalink raw reply	[flat|nested] 91+ messages in thread

* Re: [PATCH 2/3] [VLAN]: Update iif when receiving via VLAN device
  2006-06-30 17:18                                               ` Patrick McHardy
@ 2006-06-30 17:32                                                 ` jamal
  2006-06-30 17:42                                                   ` Patrick McHardy
  2006-06-30 17:42                                                   ` Thomas Graf
  0 siblings, 2 replies; 91+ messages in thread
From: jamal @ 2006-06-30 17:32 UTC (permalink / raw)
  To: Patrick McHardy; +Cc: netdev, David Miller, Thomas Graf

On Fri, 2006-30-06 at 19:18 +0200, Patrick McHardy wrote:
> Thomas Graf wrote:
> > * Thomas Graf <tgraf@suug.ch> 2006-06-30 18:32
> > 
> >>Anyways, I give up. Last time I've been running after you trying
> >>to fix the many bugs you leave behind. Ever noticed that whenever
> >>you add some new code it's someone else following up with tons of
> >>small bugfix patches having a hard time trying to figure out the
> >>actual intent. I'll just duplicate the code for my purpose, so
> >>much easier.
> > 
> > 
> > There you go, leaves ifb broken as-is, at least prevents it
> > from crashing randomly when the input_dev disappears.
> 
> That would be a pity. After this patch has been ACKed, could we start
> over with the other bugs one at a time? I wasn't really able to gather
> the problems from this thread, but some of the behaviour you mentioned
> does seem questionable.
> 

I will summarize what the outstanding issues are, the rest of the "bugs"
just ignore otherwise the discussion is a waste of time and may
get out of control.

1) ifb references skb->input_dev
2) mirred sets the skb->input_dev which is used in #1

It is possible that when #1 happens infact input_dev is gone because no
ref count is incremented. Ok, Thomas is that sufficient to discuss the
crux of the matter?
At one point many months ago, the logic was for now the likelihood this
will happen is low but we need to cover for by at least figuring the
existence of input_dev when referencing it.

Thomas makes the claim, this can be achieved only by using an ifindex.
And i havent been able to see how. I have a small performance problem if
i just use ifindex. Using ifindex will eventually save 32 bits on the
64 bit machines. I posed the question as to which was more beneficial
as a solution that hasnt been addressed.

cheers,
jamal

^ permalink raw reply	[flat|nested] 91+ messages in thread

* Re: [PATCH 2/3] [VLAN]: Update iif when receiving via VLAN device
  2006-06-30 17:19                                             ` jamal
@ 2006-06-30 17:33                                               ` Patrick McHardy
  2006-06-30 19:59                                                 ` jamal
  2006-06-30 17:34                                               ` Thomas Graf
  1 sibling, 1 reply; 91+ messages in thread
From: Patrick McHardy @ 2006-06-30 17:33 UTC (permalink / raw)
  To: hadi; +Cc: Thomas Graf, netdev, David Miller

jamal wrote:
> You are attempting to change architecture (which works just fine) in the way you 
> think it should work - and then point to something as a bug because it 
> doesnt work the way you think it should work. 
> This is a problem not just with you BTW, but with Patrick as well (although he has 
> gotten better lately). There is a huge difference for example when dealing with 
> Herbert. My approach in situations like this, which you dont have to follow, is 
> to ask first what the intent was then if i dont like the intent try to convince the
> owner that there maybe better ways. Or why they are wrong. 

Well, I thought I stay out of this, but since you mention me ..

I also had the feeling it has gotten easier working with you lately,
but I can understand Thomas's pain, I had the same thoughts more than
once. Your code often does have an enormous amount of bugs and
whitespace and other stylistic problems and working with you can be
challenging for multiple reasons, for example having to go to endless,
often entirely pointless discussions for obvious fixes, especially if
you're already pissed by noticing 50 bugs at once. The reason why you
might be able to work better with Herbert is IMO that he usually
doesn't touch what you seem to feel is "your area". Besides that, I
never changed anything in your architecture, only fixed massive
amounts of bugs.

^ permalink raw reply	[flat|nested] 91+ messages in thread

* Re: [PATCH 2/3] [VLAN]: Update iif when receiving via VLAN device
  2006-06-30 17:19                                             ` jamal
  2006-06-30 17:33                                               ` Patrick McHardy
@ 2006-06-30 17:34                                               ` Thomas Graf
  1 sibling, 0 replies; 91+ messages in thread
From: Thomas Graf @ 2006-06-30 17:34 UTC (permalink / raw)
  To: jamal; +Cc: netdev, David Miller, Patrick McHardy

* jamal <hadi@cyberus.ca> 2006-06-30 13:19
> On Fri, 2006-30-06 at 18:32 +0200, Thomas Graf wrote:
> > * jamal <hadi@cyberus.ca> 2006-06-30 10:35
>  
> > > Did you actually try to run this before you reached this conclusion?
> > 
> > I did, fortunately some other bug prevents this from happening,
> > packets are simply dropped somewhere.
> > 
> 
> It is not a bug, Thomas! I am getting a little frustrated now.
> The packets will be dropped because we set the at field to zero which is
> invalid. That is done on purpose. It is only meaningful for ifb. The
> challenge is much bigger than it appears. You could end up deadlocking
> on the tx lock. So this was the choice i had to make.

Please explain, tc_verd is reset in the tasklet after dequeueing and set
again in dev_queue_xmit(). ifb_xmit will always see a valid tc_verd
at egress.

^ permalink raw reply	[flat|nested] 91+ messages in thread

* Re: [PATCH 2/3] [VLAN]: Update iif when receiving via VLAN device
  2006-06-30 17:32                                                 ` jamal
@ 2006-06-30 17:42                                                   ` Patrick McHardy
  2006-06-30 17:44                                                     ` Thomas Graf
  2006-06-30 19:40                                                     ` jamal
  2006-06-30 17:42                                                   ` Thomas Graf
  1 sibling, 2 replies; 91+ messages in thread
From: Patrick McHardy @ 2006-06-30 17:42 UTC (permalink / raw)
  To: hadi; +Cc: netdev, David Miller, Thomas Graf

jamal wrote:
> On Fri, 2006-30-06 at 19:18 +0200, Patrick McHardy wrote:
> 
>>Thomas Graf wrote:
>>
>>>* Thomas Graf <tgraf@suug.ch> 2006-06-30 18:32
>>>
>>>
>>>>Anyways, I give up. Last time I've been running after you trying
>>>>to fix the many bugs you leave behind. Ever noticed that whenever
>>>>you add some new code it's someone else following up with tons of
>>>>small bugfix patches having a hard time trying to figure out the
>>>>actual intent. I'll just duplicate the code for my purpose, so
>>>>much easier.
>>>
>>>
>>>There you go, leaves ifb broken as-is, at least prevents it
>>>from crashing randomly when the input_dev disappears.
>>
>>That would be a pity. After this patch has been ACKed, could we start
>>over with the other bugs one at a time? I wasn't really able to gather
>>the problems from this thread, but some of the behaviour you mentioned
>>does seem questionable.
>>
> 
> 
> I will summarize what the outstanding issues are, the rest of the "bugs"
> just ignore otherwise the discussion is a waste of time and may
> get out of control.
> 
> 1) ifb references skb->input_dev

That should be fixed by this patch.

> 2) mirred sets the skb->input_dev which is used in #1

I think Thomas was more complaining about the values it is set to
and the fact that only a single redirection is possible for each
packet, no?

> It is possible that when #1 happens infact input_dev is gone because no
> ref count is incremented. Ok, Thomas is that sufficient to discuss the
> crux of the matter?
> At one point many months ago, the logic was for now the likelihood this
> will happen is low but we need to cover for by at least figuring the
> existence of input_dev when referencing it.
> 
> Thomas makes the claim, this can be achieved only by using an ifindex.
> And i havent been able to see how. I have a small performance problem if
> i just use ifindex. Using ifindex will eventually save 32 bits on the
> 64 bit machines. I posed the question as to which was more beneficial
> as a solution that hasnt been addressed.

Are we still talking about this? Its easy: a pointer without taking
a reference can become stale, with ifindex you do the lookup right
when using it, so you either get an result or you don't. It also
saves atomic operations for anyone _not_ using it, which is that
vast majority of users. So the patch clearly makes sense to me.

^ permalink raw reply	[flat|nested] 91+ messages in thread

* Re: [PATCH 2/3] [VLAN]: Update iif when receiving via VLAN device
  2006-06-30 17:32                                                 ` jamal
  2006-06-30 17:42                                                   ` Patrick McHardy
@ 2006-06-30 17:42                                                   ` Thomas Graf
  2006-06-30 19:34                                                     ` jamal
  1 sibling, 1 reply; 91+ messages in thread
From: Thomas Graf @ 2006-06-30 17:42 UTC (permalink / raw)
  To: jamal; +Cc: Patrick McHardy, netdev, David Miller

* jamal <hadi@cyberus.ca> 2006-06-30 13:32
> I will summarize what the outstanding issues are, the rest of the "bugs"
> just ignore otherwise the discussion is a waste of time and may
> get out of control.
> 
> 1) ifb references skb->input_dev
> 2) mirred sets the skb->input_dev which is used in #1
> 
> It is possible that when #1 happens infact input_dev is gone because no
> ref count is incremented. Ok, Thomas is that sufficient to discuss the
> crux of the matter?

Sure, that's the most obvious bug.

> At one point many months ago, the logic was for now the likelihood this
> will happen is low but we need to cover for by at least figuring the
> existence of input_dev when referencing it.
>
> Thomas makes the claim, this can be achieved only by using an ifindex.
> And i havent been able to see how. I have a small performance problem if
> i just use ifindex. Using ifindex will eventually save 32 bits on the
> 64 bit machines. I posed the question as to which was more beneficial
> as a solution that hasnt been addressed.

I'd appreciate if you'd stop sperading lies. I claimed it to be the
best solution, not the only one. Everyone agrees that it's possible
to use a pointer and take a reference in netif_receive_skb() while
it also seems obvious to most that it's not a good idea to take two
additional atomic operations in the fast path for this purpose.

^ permalink raw reply	[flat|nested] 91+ messages in thread

* Re: [PATCH 2/3] [VLAN]: Update iif when receiving via VLAN device
  2006-06-30 17:42                                                   ` Patrick McHardy
@ 2006-06-30 17:44                                                     ` Thomas Graf
  2006-06-30 19:40                                                     ` jamal
  1 sibling, 0 replies; 91+ messages in thread
From: Thomas Graf @ 2006-06-30 17:44 UTC (permalink / raw)
  To: Patrick McHardy; +Cc: hadi, netdev, David Miller

* Patrick McHardy <kaber@trash.net> 2006-06-30 19:42
> I think Thomas was more complaining about the values it is set to
> and the fact that only a single redirection is possible for each
> packet, no?

I have no interest in resolving this anymore, it seems to be
a "feature" to avoid tx lock deadlocks.

^ permalink raw reply	[flat|nested] 91+ messages in thread

* Re: [PATCH 2/3] [VLAN]: Update iif when receiving via VLAN device
  2006-06-30 17:42                                                   ` Thomas Graf
@ 2006-06-30 19:34                                                     ` jamal
  2006-06-30 20:08                                                       ` Thomas Graf
  0 siblings, 1 reply; 91+ messages in thread
From: jamal @ 2006-06-30 19:34 UTC (permalink / raw)
  To: Thomas Graf; +Cc: David Miller, netdev, Patrick McHardy

On Fri, 2006-30-06 at 19:42 +0200, Thomas Graf wrote:
> * jamal <hadi@cyberus.ca> 2006-06-30 13:32

> > Thomas makes the claim, this can be achieved only by using an ifindex.
> > And i havent been able to see how. I have a small performance problem if
> > i just use ifindex. Using ifindex will eventually save 32 bits on the
> > 64 bit machines. I posed the question as to which was more beneficial
> > as a solution that hasnt been addressed.
> 
> I'd appreciate if you'd stop sperading lies. 
>
> I claimed it to be the
> best solution, not the only one. Everyone agrees that it's possible
> to use a pointer and take a reference in netif_receive_skb() while
> it also seems obvious to most that it's not a good idea to take two
> additional atomic operations in the fast path for this purpose.

I thought we went past that point already - and i made it clear that
the reference is _not_ taken in netif_receive_skb().

So assuming it is taken in mirred (i havent thought of where it is
decremented), why would using the ifindex be better?

cheers,
jamal


^ permalink raw reply	[flat|nested] 91+ messages in thread

* Re: [PATCH 2/3] [VLAN]: Update iif when receiving via VLAN device
  2006-06-30 17:42                                                   ` Patrick McHardy
  2006-06-30 17:44                                                     ` Thomas Graf
@ 2006-06-30 19:40                                                     ` jamal
  2006-06-30 20:17                                                       ` David Miller
  1 sibling, 1 reply; 91+ messages in thread
From: jamal @ 2006-06-30 19:40 UTC (permalink / raw)
  To: Patrick McHardy; +Cc: Thomas Graf, David Miller, netdev

On Fri, 2006-30-06 at 19:42 +0200, Patrick McHardy wrote:
> jamal wrote:

> > Thomas makes the claim, this can be achieved only by using an ifindex.
> > And i havent been able to see how. I have a small performance problem if
> > i just use ifindex. Using ifindex will eventually save 32 bits on the
> > 64 bit machines. I posed the question as to which was more beneficial
> > as a solution that hasnt been addressed.
> 
> Are we still talking about this? Its easy: a pointer without taking
> a reference can become stale, 

I should have been clear: reference gets taken in mirred.

> with ifindex you do the lookup right
> when using it, so you either get an result or you don't. 

My arguement was:
dev get by index per device will involve a a) lookup + b) incrementing
the refcount. 
if i use the dev pointer in that path then it becomes only #b

in both cases, you need to increment the counter and then somewhere
decrement it.

cheers,
jamal

> It also
> saves atomic operations for anyone _not_ using it, which is that
> vast majority of users. So the patch clearly makes sense to me.




^ permalink raw reply	[flat|nested] 91+ messages in thread

* Re: [PATCH 2/3] [VLAN]: Update iif when receiving via VLAN device
  2006-06-30 17:33                                               ` Patrick McHardy
@ 2006-06-30 19:59                                                 ` jamal
  2006-06-30 20:30                                                   ` Thomas Graf
  2006-07-01  2:47                                                   ` Patrick McHardy
  0 siblings, 2 replies; 91+ messages in thread
From: jamal @ 2006-06-30 19:59 UTC (permalink / raw)
  To: Patrick McHardy; +Cc: David Miller, netdev, Thomas Graf

On Fri, 2006-30-06 at 19:33 +0200, Patrick McHardy wrote:
> jamal wrote:
> > You are attempting to change architecture (which works just fine) in the way you 
> > think it should work - and then point to something as a bug because it 
> > doesnt work the way you think it should work. 
> > This is a problem not just with you BTW, but with Patrick as well (although he has 
> > gotten better lately). There is a huge difference for example when dealing with 
> > Herbert. My approach in situations like this, which you dont have to follow, is 
> > to ask first what the intent was then if i dont like the intent try to convince the
> > owner that there maybe better ways. Or why they are wrong. 
> 
> Well, I thought I stay out of this, but since you mention me ..
> 

I think you will add value to the discussion ;->
Regardless, we need to settle these kind of issues so we can work better
together. A while back i said was going to bring some of these issues at
the next netconf - but since the discussion is happening already, lets
do it here. Maybe we should take this discussion privately?

> I also had the feeling it has gotten easier working with you lately,

The feeling is mutual. 

> but I can understand Thomas's pain, I had the same thoughts more than
> once. Your code often does have an enormous amount of bugs and
> whitespace and other stylistic problems and working with you can be
> challenging for multiple reasons, for example having to go to endless,

One thing is clear in my mind at least (and i have said it several
times): I am not as good at the semantics as either yourself or Thomas
or Dave or Acme etc but i have tons of other things that compensate for.
Probably "not as good" is not the best  description  - rather my brain
cells put less respect on the stylistic commas than they do on the
message. Perhaps it's dylexsia or me trying to subconsciouly maximize my
time. If you push me hard, however, i will get close to do just as a
good job as you;->  Still, cant live without you guys! 

I would describe the majority of the fixes you have sent against me to
fall into the stylistic category and into fixing TheLinuxWay (i.e same
bug in multiple places and files). This is not to say i am not at fault,
or there havent been issues which made me wonder, but we may have
different metrics of what bugs i would call my mom and say "Mom, I just
fixed enormous amount of bugs". 
[As an example if you went hunting around the kernel, you will find a
lot of things that dont totally conform to lindent rules. You could
literally send 100s of patches].
Perhaps it is the language usage that puts us at odds at times.

> often entirely pointless discussions for obvious fixes, 

You may see it that way - I dont and so when that happens it is not by
any means to engage in a meaningless debate. 
The discussions are typically of the same nature as i just had with
Thomas (just a simple one line change in ifb which looked very
"obvious").

The way it goes is that there is a certain assumption made, and a
solution is suggested in the form of a patch. Accepting such a patch
(which i once in a while dont even get to see until it is in) means a
lot of the other assumptions get invalidated and is followed by tons of
"bug fixes". This has happened a few times to me. And once or twice i
have bitched because i ended having to support some user for days with
the wrong view.

> especially if you're already pissed by noticing 50 bugs at once. 

I dont get agitated by 100 patches which fix stylistic issues or real
bugs - although i would have preferred to see the non-obvious first
before they go in just in case or at least CCed on submission.
Where i get irritated, depending on how my day is going: when i see even
a single patch laded with implicit insults. It comes out to me as
unnecessary arrogance and immaturity. It may not be intentional use of
language, but thats how it comes out. 

> The reason why you
> might be able to work better with Herbert is IMO that he usually
> doesn't touch what you seem to feel is "your area". 

I think it's the maturity approach perhaps. With Herbert, it ends up
being about solving the problem; there are always moments of tension and
what may appear as endless discussion (look at the thread on
qdisc_is_running), but in the end it doesnt turn out into a high school
debate to prove one is better than the other. So maybe it is the mutual
respect and maturity in the discussion. 

In regards to "areas", you may be right; i cross more into Herberts path
than he does mine.
My reaction in this may be related to the way i treat other people and
my expectations:
When i submit patches to i would make sure i cc people who i think have
stakes or better insight in that area even if they seem irrelevant.
In-fact i may have even private conversations with them first if they
are large patches.
As an example i would make sure i cc you if i had some netfilter issues
or Herbert when i submit ipsec patches and we would have a gazillion
discussions which would sometimes result in the patches being totally
re-written. It doesnt make me happy all the time (especially when i have
to drop patches), but in the end it made me work better with that person
and removes doubt in my mind that i have missed something. Unfortunately
this approach (even in non-linux areas) is a lot of times
taken for a weakness.

cheers,
jamal

> Besides that, I
> never changed anything in your architecture, only fixed massive
> amounts of bugs.
> 

^ permalink raw reply	[flat|nested] 91+ messages in thread

* Re: [PATCH 2/3] [VLAN]: Update iif when receiving via VLAN device
  2006-06-30 19:34                                                     ` jamal
@ 2006-06-30 20:08                                                       ` Thomas Graf
  2006-06-30 20:42                                                         ` jamal
  0 siblings, 1 reply; 91+ messages in thread
From: Thomas Graf @ 2006-06-30 20:08 UTC (permalink / raw)
  To: jamal; +Cc: David Miller, netdev, Patrick McHardy

* jamal <hadi@cyberus.ca> 2006-06-30 15:34
> I thought we went past that point already - and i made it clear that
> the reference is _not_ taken in netif_receive_skb().
> 
> So assuming it is taken in mirred (i havent thought of where it is
> decremented), why would using the ifindex be better?

The issue exists regardless of mirred/ifb. As soon as the packet is
queued for the first time we leave netif_receive_skb() and the dev
reference is dropped. Therefore in order to allow functionality
like tcf_match_indev() at egress we have to either take a reference
or ensure that we can catch the unlikely case of the device having
disappeared. I think everyone would agree to use device pointers
if only mirred would acquire it.

^ permalink raw reply	[flat|nested] 91+ messages in thread

* Re: [PATCH 2/3] [VLAN]: Update iif when receiving via VLAN device
  2006-06-30 19:40                                                     ` jamal
@ 2006-06-30 20:17                                                       ` David Miller
  0 siblings, 0 replies; 91+ messages in thread
From: David Miller @ 2006-06-30 20:17 UTC (permalink / raw)
  To: hadi; +Cc: kaber, tgraf, netdev

From: jamal <hadi@cyberus.ca>
Date: Fri, 30 Jun 2006 15:40:26 -0400

> My arguement was:
> dev get by index per device will involve a a) lookup + b) incrementing
> the refcount. 
> if i use the dev pointer in that path then it becomes only #b
> 
> in both cases, you need to increment the counter and then somewhere
> decrement it.

The position is that ->input_dev usage is not fast path.

It may be a fast path in your mirred and ifb devices, but for the rest
of the networking and %99 of users it is totally unused.  We have a
lot of bits of state that sit in the sk_buff but which are used by
a tiny minority, yet the space consumption is eaten by everyone.

Maybe when the IFB and mirred are in more widespread use we can
investigate a different approach.

But at this time any change that makes fringe sk_buff state objects
shrink or disappear will be seriously considered.

Another part of this issue is that if you use a pointer there is no
clean place to clean up the input_dev reference within the scope it is
actually used.  The only reliable place is kfree_skb() which is far
beyond the scope that this ->input_dev thing is used.  We have a lot
of problems in some parts of the networking because object references
are held for too long.  One example is all the awful side effects of
the way the bridge netfilter code holds onto object references for
long periods of time in the netfilter call chains.

And we know the refcounting is broken.  And one way we know to fix
the refcounting without incurring the cost upon everyone is to
use an interface index and only make the input_dev users eat the
atomic operation incurred by grabbing the reference.

And even for the input_dev users, it isn't such a big deal, there
are other costs already associated with hitting these actions and
devices that use input_dev, the atomic reference grab there should
be lost in the noise I think.

^ permalink raw reply	[flat|nested] 91+ messages in thread

* Re: [PATCH 2/3] [VLAN]: Update iif when receiving via VLAN device
  2006-06-30 19:59                                                 ` jamal
@ 2006-06-30 20:30                                                   ` Thomas Graf
  2006-06-30 20:33                                                     ` David Miller
  2006-07-01  2:47                                                   ` Patrick McHardy
  1 sibling, 1 reply; 91+ messages in thread
From: Thomas Graf @ 2006-06-30 20:30 UTC (permalink / raw)
  To: jamal; +Cc: Patrick McHardy, David Miller, netdev

* jamal <hadi@cyberus.ca> 2006-06-30 15:59
> One thing is clear in my mind at least (and i have said it several
> times): I am not as good at the semantics as either yourself or Thomas
> or Dave or Acme etc but i have tons of other things that compensate for.
> Probably "not as good" is not the best  description  - rather my brain
> cells put less respect on the stylistic commas than they do on the
> message. Perhaps it's dylexsia or me trying to subconsciouly maximize my
> time. If you push me hard, however, i will get close to do just as a
> good job as you;->  Still, cant live without you guys! 

Lack of stylistic commas is less of a concern, rather the lack of
comments and notes where absolutely essential. You explained to me
that stacking ifb devices is not supported in order to avoid tx lock
deadlocks. There is not a single note or hint anywhere that would
remotely imply this. When questioned about things like that you often
refer to notes you have somewhere on paper, promising to get back once
looked up, eventually that happens, in this thread it didn't.

^ permalink raw reply	[flat|nested] 91+ messages in thread

* Re: [PATCH 2/3] [VLAN]: Update iif when receiving via VLAN device
  2006-06-30 20:30                                                   ` Thomas Graf
@ 2006-06-30 20:33                                                     ` David Miller
  2006-06-30 20:54                                                       ` jamal
  0 siblings, 1 reply; 91+ messages in thread
From: David Miller @ 2006-06-30 20:33 UTC (permalink / raw)
  To: tgraf; +Cc: hadi, kaber, netdev

From: Thomas Graf <tgraf@suug.ch>
Date: Fri, 30 Jun 2006 22:30:34 +0200

> Lack of stylistic commas is less of a concern, rather the lack of
> comments and notes where absolutely essential.

Yes, intent is very important to document when it is not
obvious.

Everyone is guilty of not doing this from time to time. :)

^ permalink raw reply	[flat|nested] 91+ messages in thread

* Re: [PATCH 2/3] [VLAN]: Update iif when receiving via VLAN device
  2006-06-30 20:08                                                       ` Thomas Graf
@ 2006-06-30 20:42                                                         ` jamal
  0 siblings, 0 replies; 91+ messages in thread
From: jamal @ 2006-06-30 20:42 UTC (permalink / raw)
  To: Thomas Graf; +Cc: Patrick McHardy, netdev, David Miller

On Fri, 2006-30-06 at 22:08 +0200, Thomas Graf wrote:
> * jamal <hadi@cyberus.ca> 2006-06-30 15:34

> > So assuming it is taken in mirred (i havent thought of where it is
> > decremented), why would using the ifindex be better?
> 
> The issue exists regardless of mirred/ifb. As soon as the packet is
> queued for the first time we leave netif_receive_skb() and the dev
> reference is dropped. Therefore in order to allow functionality
> like tcf_match_indev() at egress we have to either take a reference
> or ensure that we can catch the unlikely case of the device having
> disappeared. I think everyone would agree to use device pointers
> if only mirred would acquire it.

Thomas, this is certainly more reasonable explanation. 

On Fri, 2006-30-06 at 13:17 -0700, David Miller wrote:

> We have a
> lot of bits of state that sit in the sk_buff but which are used by
> a tiny minority, yet the space consumption is eaten by everyone.
> 
>
> Maybe when the IFB and mirred are in more widespread use we can
> investigate a different approach.
> 

Not unreasonable.
So saving the bits for the majority trumps the need for slightly more
cycles in mirred/ifb until they get more widespread.


> Another part of this issue is that if you use a pointer there is no
> clean place to clean up the input_dev reference within the scope it is
> actually used. 
> 
>
>  The only reliable place is kfree_skb() which is far
> beyond the scope that this ->input_dev thing is used. 
> 

And i cant think of a good place to do it. 

>  We have a lot
> of problems in some parts of the networking because object references
> are held for too long.  One example is all the awful side effects of
> the way the bridge netfilter code holds onto object references for
> long periods of time in the netfilter call chains.
> 
> And we know the refcounting is broken.  And one way we know to fix
> the refcounting without incurring the cost upon everyone is to
> use an interface index and only make the input_dev users eat the
> atomic operation incurred by grabbing the reference.
> 
> And even for the input_dev users, it isn't such a big deal, there
> are other costs already associated with hitting these actions and
> devices that use input_dev, the atomic reference grab there should
> be lost in the noise I think.

Alright guys - lets get the change in. I wish we had this specific
discussion sooner. 

Thomas, can you post your latest incarnation so i can mow comment more
peacefully? ;->

cheers,
jamal





^ permalink raw reply	[flat|nested] 91+ messages in thread

* Re: [PATCH 2/3] [VLAN]: Update iif when receiving via VLAN device
  2006-06-30 20:33                                                     ` David Miller
@ 2006-06-30 20:54                                                       ` jamal
  2006-06-30 21:10                                                         ` Thomas Graf
  0 siblings, 1 reply; 91+ messages in thread
From: jamal @ 2006-06-30 20:54 UTC (permalink / raw)
  To: David Miller; +Cc: netdev, kaber, tgraf

On Fri, 2006-30-06 at 13:33 -0700, David Miller wrote:
> From: Thomas Graf <tgraf@suug.ch>
> Date: Fri, 30 Jun 2006 22:30:34 +0200
> 
> > Lack of stylistic commas is less of a concern, rather the lack of
> > comments and notes where absolutely essential.
> 
> Yes, intent is very important to document when it is not
> obvious.
> 

I agree - and i try hard to document but at times there's too much
and a line needs to be drawn.
As an example:
the eth->ifb->ifb case though is a very corner case. All the IMQ types
need to only redirect to one ifb; while i test it ive always seen the
test as more of a boundary check than a useful setup.

And Thomas: I do keep notebooks ;-> I doodle about everything, they dont
all fit in my knapsack anymore so sometimes when i say i will go and
refer to my notes, it is real.

cheers,
jamal

^ permalink raw reply	[flat|nested] 91+ messages in thread

* Re: [PATCH 2/3] [VLAN]: Update iif when receiving via VLAN device
  2006-06-30 17:13                                             ` Thomas Graf
  2006-06-30 17:18                                               ` Patrick McHardy
  2006-06-30 17:27                                               ` Ben Greear
@ 2006-06-30 21:09                                               ` jamal
  2006-06-30 21:20                                                 ` Thomas Graf
  2 siblings, 1 reply; 91+ messages in thread
From: jamal @ 2006-06-30 21:09 UTC (permalink / raw)
  To: Thomas Graf; +Cc: netdev, David Miller, Patrick McHardy

Ok, I think found the last patch you posted, comments below (I have to
run off soon):

On Fri, 2006-30-06 at 19:13 +0200, Thomas Graf wrote:

> There you go, leaves ifb broken as-is, at least prevents it
> from crashing randomly when the input_dev disappears.
> 

I hope the above comment does show up in the logs ;->

> [NET]: Use interface index to keep input device information
> 

> ===================================================================
> --- net-2.6.git.orig/include/net/pkt_cls.h
> +++ net-2.6.git/include/net/pkt_cls.h
> @@ -352,14 +352,19 @@ tcf_change_indev(struct tcf_proto *tp, c
>  static inline int
>  tcf_match_indev(struct sk_buff *skb, char *indev)
>  {
> +	int ret = 1;
> +
>  	if (indev[0]) {
> -		if  (!skb->input_dev)
> -			return 0;
> -		if (strcmp(indev, skb->input_dev->name))
> +		struct net_device *dev;
> +
> +		dev = dev_get_by_index(skb->iif);
> +		if  (!dev)
>  			return 0;
> +		ret = !strcmp(indev, dev->name);
> +		dev_put(dev);
>  	}
>  

[..]

> Index: net-2.6.git/drivers/net/ifb.c
> ===================================================================
> --- net-2.6.git.orig/drivers/net/ifb.c
> +++ net-2.6.git/drivers/net/ifb.c
> @@ -158,19 +158,23 @@ static int ifb_xmit(struct sk_buff *skb,
>  	stats->tx_packets++;
>  	stats->tx_bytes+=skb->len;
>  
> -	if (!from || !skb->input_dev) {
> +	if (!from || !skb->iif) {

Can you have something similar to what you did in
tcf_match_indev above?

i.e grab dev by skb->iif so as to increment refcount - if it doesnt
exist it becomes equivalent to !skb->input_dev and if it exists you drop
the ref inside the else below after changing input_dev 

>  		dev_kfree_skb(skb);
>  		stats->rx_dropped++;
>  		return ret;
>  	} else {
> +		struct net_device *iif;
>  		/* 
>  		 * note we could be going
>  		 * ingress -> egress or
>  		 * egress -> ingress
>  		*/
> -		skb->dev = skb->input_dev;
> -		skb->input_dev = dev;
> +		iif = __dev_get_by_index(skb->iif);
> +		if (!iif)
> +			goto dropped;
> +		skb->dev = iif;
> +		skb->iif = dev->ifindex;
>  		if (from & AT_INGRESS) {
>  			skb_pull(skb, skb->dev->hard_header_len);
>  		} else {
> -

cheers,
jamal


^ permalink raw reply	[flat|nested] 91+ messages in thread

* Re: [PATCH 2/3] [VLAN]: Update iif when receiving via VLAN device
  2006-06-30 20:54                                                       ` jamal
@ 2006-06-30 21:10                                                         ` Thomas Graf
  2006-06-30 21:31                                                           ` jamal
  0 siblings, 1 reply; 91+ messages in thread
From: Thomas Graf @ 2006-06-30 21:10 UTC (permalink / raw)
  To: jamal; +Cc: David Miller, netdev, kaber

* jamal <hadi@cyberus.ca> 2006-06-30 16:54
> I agree - and i try hard to document but at times there's too much
> and a line needs to be drawn.
> As an example:
> the eth->ifb->ifb case though is a very corner case. All the IMQ types
> need to only redirect to one ifb; while i test it ive always seen the
> test as more of a boundary check than a useful setup.

Maybe you should just start by explaining why it doesn't work,
your original explanation doesn't make much sense.

> And Thomas: I do keep notebooks ;-> I doodle about everything, they dont
> all fit in my knapsack anymore so sometimes when i say i will go and
> refer to my notes, it is real.

It's plain arrogant to rely code understanding on information that
is not available to others. It makes everyone being dependant on
you, very nice.

To put it in another way, if you would do your job right when
writing the code, contacting or CCing you upon changes would
be only of a formal matter since the code is understandable and
the intent is clear or documented.

^ permalink raw reply	[flat|nested] 91+ messages in thread

* Re: [PATCH 2/3] [VLAN]: Update iif when receiving via VLAN device
  2006-06-30 21:09                                               ` jamal
@ 2006-06-30 21:20                                                 ` Thomas Graf
  2006-06-30 21:35                                                   ` jamal
  0 siblings, 1 reply; 91+ messages in thread
From: Thomas Graf @ 2006-06-30 21:20 UTC (permalink / raw)
  To: jamal; +Cc: netdev, David Miller, Patrick McHardy

* jamal <hadi@cyberus.ca> 2006-06-30 17:09
> On Fri, 2006-30-06 at 19:13 +0200, Thomas Graf wrote:
> > Index: net-2.6.git/drivers/net/ifb.c
> > ===================================================================
> > --- net-2.6.git.orig/drivers/net/ifb.c
> > +++ net-2.6.git/drivers/net/ifb.c
> > @@ -158,19 +158,23 @@ static int ifb_xmit(struct sk_buff *skb,
> >  	stats->tx_packets++;
> >  	stats->tx_bytes+=skb->len;
> >  
> > -	if (!from || !skb->input_dev) {
> > +	if (!from || !skb->iif) {
> 
> Can you have something similar to what you did in
> tcf_match_indev above?
> 
> i.e grab dev by skb->iif so as to increment refcount - if it doesnt
> exist it becomes equivalent to !skb->input_dev and if it exists you drop
> the ref inside the else below after changing input_dev 

A non-existant iif is already equivalent to !iif since it jumps
into the same branch. Grabing a reference is completely pointless,
the netdevice represented by skb->iif is at this point until the
packet gets queued covered by a reference taken in netif_rx().

^ permalink raw reply	[flat|nested] 91+ messages in thread

* Re: [PATCH 2/3] [VLAN]: Update iif when receiving via VLAN device
  2006-06-30 21:10                                                         ` Thomas Graf
@ 2006-06-30 21:31                                                           ` jamal
  2006-06-30 23:45                                                             ` Thomas Graf
  0 siblings, 1 reply; 91+ messages in thread
From: jamal @ 2006-06-30 21:31 UTC (permalink / raw)
  To: Thomas Graf; +Cc: kaber, netdev, David Miller

On Fri, 2006-30-06 at 23:10 +0200, Thomas Graf wrote:
> * jamal <hadi@cyberus.ca> 2006-06-30 16:54
> > I agree - and i try hard to document but at times there's too much
> > and a line needs to be drawn.
> > As an example:
> > the eth->ifb->ifb case though is a very corner case. All the IMQ types
> > need to only redirect to one ifb; while i test it ive always seen the
> > test as more of a boundary check than a useful setup.
> 
> Maybe you should just start by explaining why it doesn't work,
> your original explanation doesn't make much sense.
> 

Better to explain the reason for ifb first:
ifb exists initially as a replacement for IMQ. 
1) qdiscs/policies that are per device as opposed to system wide.
This now allows for sharing.

2) Allows for queueing incoming traffic for shaping instead of
dropping.

In other wise, the main use is for multiple devices to redirect to it.
Main desire is not for it to redirect to any other ifb device or eth
devices. I actually tried to get it to do that, but run into issues
of complexity and and came up with decision to drop instead of killing
the machine.
Other than that, it can redirect to any other devices - but may still
not be meaningful.

I have been thinking of Herberts change of qdisc_is_running and this may
help actually.

> > And Thomas: I do keep notebooks ;-> I doodle about everything, they dont
> > all fit in my knapsack anymore so sometimes when i say i will go and
> > refer to my notes, it is real.
> 
> It's plain arrogant to rely code understanding on information that
> is not available to others. It makes everyone being dependant on
> you, very nice.
> 

I try hard to document more than many many people. I give tutorials, I
write papers when time allows, I spend time responding to emails. And i
do it all for the love of the game. 
But lets look at the other side of the coin:
Dont you think it is polite to ask when something is not clear instead
of making assumptions? 

> To put it in another way, if you would do your job right when
> writing the code, contacting or CCing you upon changes would
> be only of a formal matter since the code is understandable and
> the intent is clear or documented.

I wish I could achieve that. I dont think theres any piece of code that
reaches those standards in the kernel actually. And if theres a piece of
code that achieves that it should be emulated. Most of the times, things
that are hard are not about the code but about issues like algorithmics,
embedded policies etc (I think you are saying the same).

cheers,
jamal

^ permalink raw reply	[flat|nested] 91+ messages in thread

* Re: [PATCH 2/3] [VLAN]: Update iif when receiving via VLAN device
  2006-06-30 21:20                                                 ` Thomas Graf
@ 2006-06-30 21:35                                                   ` jamal
  2006-06-30 23:22                                                     ` Thomas Graf
  0 siblings, 1 reply; 91+ messages in thread
From: jamal @ 2006-06-30 21:35 UTC (permalink / raw)
  To: Thomas Graf; +Cc: Patrick McHardy, David Miller, netdev

On Fri, 2006-30-06 at 23:20 +0200, Thomas Graf wrote:
> * jamal <hadi@cyberus.ca> 2006-06-30 17:09
> > the ref inside the else below after changing input_dev 
> 
> A non-existant iif is already equivalent to !iif since it jumps
> into the same branch. 

I thought 0 was a valid iif as Ben G was pointing - if it is not, you
are right. 

> Grabing a reference is completely pointless,
> the netdevice represented by skb->iif is at this point until the
> packet gets queued covered by a reference taken in netif_rx().

You have to consider that this could happen at both ingress and egress.

cheers,
jamal


^ permalink raw reply	[flat|nested] 91+ messages in thread

* Re: [PATCH 2/3] [VLAN]: Update iif when receiving via VLAN device
  2006-06-30 21:35                                                   ` jamal
@ 2006-06-30 23:22                                                     ` Thomas Graf
  2006-07-01  2:23                                                       ` jamal
  0 siblings, 1 reply; 91+ messages in thread
From: Thomas Graf @ 2006-06-30 23:22 UTC (permalink / raw)
  To: jamal; +Cc: Patrick McHardy, David Miller, netdev

This is boring, I reversed everything to not change any semantics and
you still complain.

* jamal <hadi@cyberus.ca> 2006-06-30 17:35
> On Fri, 2006-30-06 at 23:20 +0200, Thomas Graf wrote:
> > * jamal <hadi@cyberus.ca> 2006-06-30 17:09
> > > the ref inside the else below after changing input_dev 
> > 
> > A non-existant iif is already equivalent to !iif since it jumps
> > into the same branch. 
> 
> I thought 0 was a valid iif as Ben G was pointing - if it is not, you
> are right. 

No, 1 ist the first valid ifindex, see dev_new_index().

> > Grabing a reference is completely pointless,
> > the netdevice represented by skb->iif is at this point until the
> > packet gets queued covered by a reference taken in netif_rx().
> 
> You have to consider that this could happen at both ingress and egress.

Just think for a second, do you expect the device the packet will
be leaving at to disappear?

^ permalink raw reply	[flat|nested] 91+ messages in thread

* Re: [PATCH 2/3] [VLAN]: Update iif when receiving via VLAN device
  2006-06-30 21:31                                                           ` jamal
@ 2006-06-30 23:45                                                             ` Thomas Graf
  2006-07-01  2:59                                                               ` jamal
  0 siblings, 1 reply; 91+ messages in thread
From: Thomas Graf @ 2006-06-30 23:45 UTC (permalink / raw)
  To: jamal; +Cc: kaber, netdev, David Miller

* jamal <hadi@cyberus.ca> 2006-06-30 17:31
> Better to explain the reason for ifb first:
> ifb exists initially as a replacement for IMQ. 
> 1) qdiscs/policies that are per device as opposed to system wide.
> This now allows for sharing.
> 
> 2) Allows for queueing incoming traffic for shaping instead of
> dropping.
> 
> In other wise, the main use is for multiple devices to redirect to it.
> Main desire is not for it to redirect to any other ifb device or eth
> devices. I actually tried to get it to do that, but run into issues
> of complexity and and came up with decision to drop instead of killing
> the machine.
> Other than that, it can redirect to any other devices - but may still
> not be meaningful.

Last time I'm asking. Why are packets dropped? You mentioned tc_verd
is set to 0 leading to a invalid from verdict. Fact is that the
from verdict is set to a meaningful value again at dev_queue_xmit()
or ing_filter() so ifb_xmit() only sees valid values. tx locks of
individual ifb devices are independant, why would it deadlock? Where
is the packet exactly dropped?

> I have been thinking of Herberts change of qdisc_is_running and this may
> help actually.

Help on what?


^ permalink raw reply	[flat|nested] 91+ messages in thread

* Re: [PATCH 2/3] [VLAN]: Update iif when receiving via VLAN device
  2006-06-30 23:22                                                     ` Thomas Graf
@ 2006-07-01  2:23                                                       ` jamal
  2006-07-01 11:51                                                         ` Thomas Graf
  0 siblings, 1 reply; 91+ messages in thread
From: jamal @ 2006-07-01  2:23 UTC (permalink / raw)
  To: Thomas Graf; +Cc: netdev, David Miller, Patrick McHardy

On Sat, 2006-01-07 at 01:22 +0200, Thomas Graf wrote:
> This is boring, I reversed everything to not change any semantics and
> you still complain.
> 

You reversed a bug that you had introduced. Do you want me to review
this patch or not now? Be a little reasonable please.

> * jamal <hadi@cyberus.ca> 2006-06-30 17:35

> > > Grabing a reference is completely pointless,
> > > the netdevice represented by skb->iif is at this point until the
> > > packet gets queued covered by a reference taken in netif_rx().
> > 
> > You have to consider that this could happen at both ingress and egress.
> 
> Just think for a second, do you expect the device the packet will
> be leaving at to disappear?

I am  not certain i understood then: Are we in the mode where the
refcount is not needed because chances are small that a device will
disappear? It seems to me after all this trouble that it may not be so
bad to refcount (I guess i meant refcount the device on input to the ifb
and decrement on the output).

As a note - and i am sure you know this: 
The packet comes to the ifb and gets queued. It gets dequeued at a later
time and is sent off either to the ingress or egress. At any point
during the enq/deq the other device being referenced could 
disappear. This is a lesser sin on ingress than it is on egress.

cheers,
jamal

^ permalink raw reply	[flat|nested] 91+ messages in thread

* Re: [PATCH 2/3] [VLAN]: Update iif when receiving via VLAN device
  2006-06-30 19:59                                                 ` jamal
  2006-06-30 20:30                                                   ` Thomas Graf
@ 2006-07-01  2:47                                                   ` Patrick McHardy
  1 sibling, 0 replies; 91+ messages in thread
From: Patrick McHardy @ 2006-07-01  2:47 UTC (permalink / raw)
  To: hadi; +Cc: David Miller, netdev, Thomas Graf

jamal wrote:
> On Fri, 2006-30-06 at 19:33 +0200, Patrick McHardy wrote:
> 
>>Well, I thought I stay out of this, but since you mention me ..
>>
> I think you will add value to the discussion ;->
> Regardless, we need to settle these kind of issues so we can work better
> together. A while back i said was going to bring some of these issues at
> the next netconf - but since the discussion is happening already, lets
> do it here. Maybe we should take this discussion privately?

Whatever helps resolving this, feel free to take this private if you
feel like it. I think I already managed to get along with our
differences quite well, but it was a hard path. Having friends that
use the same discussion tactics may have helped :) Just to clarify what
I'm saying below, I like you a lot personally, this is purely
"a work thing".

>>I also had the feeling it has gotten easier working with you lately,
> 
> 
> The feeling is mutual. 
> 
> 
>>but I can understand Thomas's pain, I had the same thoughts more than
>>once. Your code often does have an enormous amount of bugs and
>>whitespace and other stylistic problems and working with you can be
>>challenging for multiple reasons, for example having to go to endless,
> 
> 
> One thing is clear in my mind at least (and i have said it several
> times): I am not as good at the semantics as either yourself or Thomas
> or Dave or Acme etc but i have tons of other things that compensate for.
> Probably "not as good" is not the best  description  - rather my brain
> cells put less respect on the stylistic commas than they do on the
> message. Perhaps it's dylexsia or me trying to subconsciouly maximize my
> time. If you push me hard, however, i will get close to do just as a
> good job as you;->  Still, cant live without you guys! 

I think the things you did and which I struggeled with were right
to do, but what I don't understand is why you can't do it right
yourself. Software development has a lot of unpleasant parts,
but ignoring them can't be the right answer. In a shared project,
you should always remember that if you're not doing the work, some
else will have to.

> I would describe the majority of the fixes you have sent against me to
> fall into the stylistic category and into fixing TheLinuxWay (i.e same
> bug in multiple places and files). This is not to say i am not at fault,
> or there havent been issues which made me wonder, but we may have
> different metrics of what bugs i would call my mom and say "Mom, I just
> fixed enormous amount of bugs". 
> [As an example if you went hunting around the kernel, you will find a
> lot of things that dont totally conform to lindent rules. You could
> literally send 100s of patches].
> Perhaps it is the language usage that puts us at odds at times.

Sorry, absolutely not. The bugs I fixed were not just stylistic
things, although I still tried to take care of them where possible.
It was a huge number of bad bugs, even affecting kernels that didn't
even used the feature which introduced the bugs. As for "TheLinuxWay"
(I think I made my opinion on this clear already), cut-and-paste
of crappy code is not it. I'm always amazed how people can copy
crappy code without even thinking about it, when I copy code I usually
fix bugs in the code I copy _before_ doing so.

Language might of course still contribute to our misunderstandings.

>>often entirely pointless discussions for obvious fixes, 
> 
> 
> You may see it that way - I dont and so when that happens it is not by
> any means to engage in a meaningless debate. 
> The discussions are typically of the same nature as i just had with
> Thomas (just a simple one line change in ifb which looked very
> "obvious").

Sometimes you may be right, but despite all the lengthy discussions
we went through, I can't recall a single patch of mine that didn't
went in eventually. But OK, people are complaining about missing
clarification of intent, we're all sometimes guilty of that.
Still, starting a huge discussion about patches that clearly
move in the right direction like Thomas' one which started this
thread is frustrating to the submitter. Discussions about technical
matters should use technical, comprehensible arguments, and nothing
else. This basically means on of three things:

- Bad because it does ..
- Bad because this patch is better
- Applied

Vague expressions about "not good", "might be able to do better" or
"might create problems" without specification are not helpful as long
as they are vague. The point is: _show_ that the submitters argument
is wrong, and if it really is, that shouldn't be hard. This is
especially true if its not some random guy but someone familiar with
what he is changing. This thread is one of many examples.

> The way it goes is that there is a certain assumption made, and a
> solution is suggested in the form of a patch. Accepting such a patch
> (which i once in a while dont even get to see until it is in) means a
> lot of the other assumptions get invalidated and is followed by tons of
> "bug fixes". This has happened a few times to me. And once or twice i
> have bitched because i ended having to support some user for days with
> the wrong view.

Then the patch wasn't reviewed carefully enough. If assumptions get
invalidated, they need to be taken care of at the same time. It can
happen, and sometimes it even is intended (in case of not so well
understood code). But usually it is not and it is simply a bug,
and the submitter is guilty of not beeing careful enough.

>>especially if you're already pissed by noticing 50 bugs at once. 
> 
> 
> I dont get agitated by 100 patches which fix stylistic issues or real
> bugs - although i would have preferred to see the non-obvious first
> before they go in just in case or at least CCed on submission.
> Where i get irritated, depending on how my day is going: when i see even
> a single patch laded with implicit insults. It comes out to me as
> unnecessary arrogance and immaturity. It may not be intentional use of
> language, but thats how it comes out. 

Well, I admit that the insults (in the form of a description of all
the possible bugs, which is not really an insult) are sometimes
intentional, but usually only after I'm already really pissed by
having to take care of an enormous amount of somebody elses crap.
I think that is understandable human behaviour.

>>The reason why you
>>might be able to work better with Herbert is IMO that he usually
>>doesn't touch what you seem to feel is "your area". 
> 
> 
> I think it's the maturity approach perhaps. With Herbert, it ends up
> being about solving the problem; there are always moments of tension and
> what may appear as endless discussion (look at the thread on
> qdisc_is_running), but in the end it doesnt turn out into a high school
> debate to prove one is better than the other. So maybe it is the mutual
> respect and maturity in the discussion. 

Actually that thread was cleared up by Herbert nice and fast :) I too
enjoy working with him, but I think Herbert is a special case as his
arguments are in my opinion always perfectly clear, usually correct,
understandable and nicely formulated. He doesn't leave you much
choice :)

Probably also a bit of a language issue, I'm not able to express myself
as well as he is (especially not as friendly).

> In regards to "areas", you may be right; i cross more into Herberts path
> than he does mine.
> My reaction in this may be related to the way i treat other people and
> my expectations:
> When i submit patches to i would make sure i cc people who i think have
> stakes or better insight in that area even if they seem irrelevant.
> In-fact i may have even private conversations with them first if they
> are large patches.
> As an example i would make sure i cc you if i had some netfilter issues
> or Herbert when i submit ipsec patches and we would have a gazillion
> discussions which would sometimes result in the patches being totally
> re-written. It doesnt make me happy all the time (especially when i have
> to drop patches), but in the end it made me work better with that person
> and removes doubt in my mind that i have missed something. Unfortunately
> this approach (even in non-linux areas) is a lot of times
> taken for a weakness.

I always had the feeling you're beeing "protective", in the form of
pulling people in discussions not really related to the subject.

That was about it with my opinion on the matter, I think we will be
able to resolve this in a better way personally.

^ permalink raw reply	[flat|nested] 91+ messages in thread

* Re: [PATCH 2/3] [VLAN]: Update iif when receiving via VLAN device
  2006-06-30 23:45                                                             ` Thomas Graf
@ 2006-07-01  2:59                                                               ` jamal
  2006-07-01 11:28                                                                 ` Thomas Graf
  0 siblings, 1 reply; 91+ messages in thread
From: jamal @ 2006-07-01  2:59 UTC (permalink / raw)
  To: Thomas Graf; +Cc: David Miller, netdev, kaber

On Sat, 2006-01-07 at 01:45 +0200, Thomas Graf wrote:
> * jamal <hadi@cyberus.ca> 2006-06-30 17:31
> > Better to explain the reason for ifb first:
> > ifb exists initially as a replacement for IMQ. 
> > 1) qdiscs/policies that are per device as opposed to system wide.
> > This now allows for sharing.
> > 
> > 2) Allows for queueing incoming traffic for shaping instead of
> > dropping.
> > 
> > In other wise, the main use is for multiple devices to redirect to it.
> > Main desire is not for it to redirect to any other ifb device or eth
> > devices. I actually tried to get it to do that, but run into issues
> > of complexity and and came up with decision to drop instead of killing
> > the machine.
> > Other than that, it can redirect to any other devices - but may still
> > not be meaningful.
> 
> Last time I'm asking.

Then please listen carefully - because if you read my message with a
reasonable openness the answer is there. 

>  Why are packets dropped? 

When looping happens the only sane thing to do is to drop packets.

I mentioned this was a very tricky thing to achieve in all cases since
there are many behaviors and netdevices that have to be considered. As
an example i ended not submitting the code for looping from egress to
ingress because i felt it needed more testing. And i am certain i have
missed some other device combinations.
Loops could happen within ingress or egress. They could mostly happen
because of mirred or ifb. And some i have discovered via testing.

> You mentioned tc_verd is set to 0 leading to a invalid from verdict. 

yes, for ifb.

> Fact is that the from verdict is set to a meaningful value again at 
> dev_queue_xmit() or ing_filter() so ifb_xmit() only sees valid values. 

Ok, Thomas this is one of those places where we end up colliding for no
good reason - your statement above is accusatory. And sometimes i say 3
things and you pick one and use that as context. The ifb does clear the
field. So if you get redirected again, it will be 0.

> tx locks of individual ifb devices are independant, why would it deadlock? 

different instances of the same device do not deadlock. If however, you
have a series of devices in which A->B->C->D->A then you will have a
possible deadlock. In the case of the ifb i dissallowed it because it
was obvious from the testing. 

> Where is the packet exactly dropped?
> 

For the above in the ifb when they come in with from=0.

> > I have been thinking of Herberts change of qdisc_is_running and this may
> > help actually.
> 
> Help on what?

In the case of deadlocks. Now that it is impossible to contend for the
txlock twice (with that patch), a second redirect will end up just
enqueueing.

cheers,
jamal

^ permalink raw reply	[flat|nested] 91+ messages in thread

* Re: [PATCH 2/3] [VLAN]: Update iif when receiving via VLAN device
  2006-07-01  2:59                                                               ` jamal
@ 2006-07-01 11:28                                                                 ` Thomas Graf
  2006-07-01 13:35                                                                   ` jamal
  0 siblings, 1 reply; 91+ messages in thread
From: Thomas Graf @ 2006-07-01 11:28 UTC (permalink / raw)
  To: jamal; +Cc: David Miller, netdev, kaber

* jamal <hadi@cyberus.ca> 2006-06-30 22:59
> On Sat, 2006-01-07 at 01:45 +0200, Thomas Graf wrote:
> > Fact is that the from verdict is set to a meaningful value again at 
> > dev_queue_xmit() or ing_filter() so ifb_xmit() only sees valid values. 
> 
> Ok, Thomas this is one of those places where we end up colliding for no
> good reason - your statement above is accusatory. And sometimes i say 3
> things and you pick one and use that as context. The ifb does clear the
> field. So if you get redirected again, it will be 0.

Please enlighten me, I may be making a wrong assumption again.

1) tc_verd is reset to 0 after dq in ri_tasklet()
2) TC_AT is set to AT_EGRESS in dev_queue_xmit()
3) TC_FROM being derived from TC_AT in tcf_mirred() when redirecting
4) TC_FROM has the value AT_EGRESS when entering ifb_xmit()

> different instances of the same device do not deadlock. If however, you
> have a series of devices in which A->B->C->D->A then you will have a
> possible deadlock. In the case of the ifb i dissallowed it because it
> was obvious from the testing. 

Correct me if I'm wrong. The skb is queued between each device. When
moving skbs from rq to tq the tx lock is acquired, if not available
the packet is requeued for later. Due to the queueing the tx lock
of the previous device is released. Where exactly is the deadlock?

^ permalink raw reply	[flat|nested] 91+ messages in thread

* Re: [PATCH 2/3] [VLAN]: Update iif when receiving via VLAN device
  2006-07-01  2:23                                                       ` jamal
@ 2006-07-01 11:51                                                         ` Thomas Graf
  2006-07-01 13:47                                                           ` jamal
  0 siblings, 1 reply; 91+ messages in thread
From: Thomas Graf @ 2006-07-01 11:51 UTC (permalink / raw)
  To: jamal; +Cc: netdev, David Miller, Patrick McHardy

* jamal <hadi@cyberus.ca> 2006-06-30 22:23
> I am  not certain i understood then: Are we in the mode where the
> refcount is not needed because chances are small that a device will
> disappear? It seems to me after all this trouble that it may not be so
> bad to refcount (I guess i meant refcount the device on input to the ifb
> and decrement on the output).

Based on two principles:

 a) All netdevices a packet is processed through is properly
    refcounted until the packet is queued for the first time.
    Given that iif is strictly set to the previous device, a
    refcnt is certainly taken up to the point where the skb is
    put into rq. There is one exception to this: The fact that
    you don't set skb->dev = ifb when reinjecting at ingress
    will not take a refcnt on the ifb. This sounds like a broken
    architecture to me but it doesn't matter as you want to
    prohibit ifb -> ifb anyways.
 
 b) The netdevice used for xmit via dev_queue_xmit() is already
    protected by the initial xmit attempt.

I can't see reason why to hurt performance by introducing more
atomic operations in your fast path.

If you have more concerns regarding these patches, feel free to
fix your own architecture or propose to drop the patches, I think
I've spend enough time on this.

^ permalink raw reply	[flat|nested] 91+ messages in thread

* Re: [PATCH 2/3] [VLAN]: Update iif when receiving via VLAN device
  2006-07-01 11:28                                                                 ` Thomas Graf
@ 2006-07-01 13:35                                                                   ` jamal
  2006-07-08 10:54                                                                     ` Thomas Graf
  0 siblings, 1 reply; 91+ messages in thread
From: jamal @ 2006-07-01 13:35 UTC (permalink / raw)
  To: Thomas Graf; +Cc: kaber, netdev, David Miller

On Sat, 2006-01-07 at 13:28 +0200, Thomas Graf wrote:
> * jamal <hadi@cyberus.ca> 2006-06-30 22:59

> Please enlighten me, I may be making a wrong assumption again.
> 
> 1) tc_verd is reset to 0 after dq in ri_tasklet()
> 2) TC_AT is set to AT_EGRESS in dev_queue_xmit()
> 3) TC_FROM being derived from TC_AT in tcf_mirred() when redirecting
> 4) TC_FROM has the value AT_EGRESS when entering ifb_xmit()
> 

Let me answer the next bit and it may clear this.

> > different instances of the same device do not deadlock. If however, you
> > have a series of devices in which A->B->C->D->A then you will have a
> > possible deadlock. In the case of the ifb i dissallowed it because it
> > was obvious from the testing. 
> 
> Correct me if I'm wrong. The skb is queued between each device. When
> moving skbs from rq to tq the tx lock is acquired, if not available
> the packet is requeued for later. Due to the queueing the tx lock
> of the previous device is released. Where exactly is the deadlock?

using what i said as looping in the case of A->B->C->D->A for the case
of egress since that is what you are alluding to;
note that ifb0 will be entered twice: First for example the tasklet in
ifb0 will emit the packet, and then it will go all the way back to xmit
on ifb0. This is where the issue is. Does that clear it?

cheers,
jamal



^ permalink raw reply	[flat|nested] 91+ messages in thread

* Re: [PATCH 2/3] [VLAN]: Update iif when receiving via VLAN device
  2006-07-01 11:51                                                         ` Thomas Graf
@ 2006-07-01 13:47                                                           ` jamal
  0 siblings, 0 replies; 91+ messages in thread
From: jamal @ 2006-07-01 13:47 UTC (permalink / raw)
  To: Thomas Graf; +Cc: Patrick McHardy, David Miller, netdev

On Sat, 2006-01-07 at 13:51 +0200, Thomas Graf wrote:
> * jamal <hadi@cyberus.ca> 2006-06-30 22:23
> > I am  not certain i understood then: Are we in the mode where the
> > refcount is not needed because chances are small that a device will
> > disappear? It seems to me after all this trouble that it may not be so
> > bad to refcount (I guess i meant refcount the device on input to the ifb
> > and decrement on the output).
> 
> Based on two principles:
> 
>  a) All netdevices a packet is processed through is properly
>     refcounted until the packet is queued for the first time.
>     Given that iif is strictly set to the previous device, a
>     refcnt is certainly taken up to the point where the skb is
>     put into rq. 

yes, but that refcount may be lost by the time it is dequeued and
retransmitted. Am i wrong thinking so?

> There is one exception to this: The fact that
>     you don't set skb->dev = ifb when reinjecting at ingress
>     will not take a refcnt on the ifb. This sounds like a broken
>     architecture to me but it doesn't matter as you want to
>     prohibit ifb -> ifb anyways.

Ok, Thomas: Please stop critiqueing the architecture non-stop. You will
not convince me of anything this way. Lets just focus on this issue and
then you can go back and critique all you want.

>  
>  b) The netdevice used for xmit via dev_queue_xmit() is already
>     protected by the initial xmit attempt.
> 

Is it guaranteed? if yes, the patch is perfect.

> I can't see reason why to hurt performance by introducing more
> atomic operations in your fast path.
> 

well, i would be fine with this - with a caveat that nothing really has
changed in ifb then, no? i.e the value of the patch in that case would
be to convert input_dev to iif, correct? If yes, this is fine by me
since as we have discussed the likelihood of this was small.

> If you have more concerns regarding these patches, feel free to
> fix your own architecture or propose to drop the patches, I think
> I've spend enough time on this.

Again, you are not being helpful by throwing in side remarks like these.
I am being very fair to you when you ask questions on how X works after
all assumptions you make - yet i ask questions which i see as valid and
you start throwing your hands off in the air. And then Patrick says i am
creating endless debates; this is why we go into loops.
Just answer the questions: I am trying to understand or you can choose
to ignore the emails all together.

cheers,
jamal

^ permalink raw reply	[flat|nested] 91+ messages in thread

* Re: [PATCH 2/3] [VLAN]: Update iif when receiving via VLAN device
  2006-07-01 13:35                                                                   ` jamal
@ 2006-07-08 10:54                                                                     ` Thomas Graf
  2006-07-08 14:14                                                                       ` Jamal Hadi Salim
  0 siblings, 1 reply; 91+ messages in thread
From: Thomas Graf @ 2006-07-08 10:54 UTC (permalink / raw)
  To: jamal; +Cc: kaber, netdev, David Miller

* jamal <hadi@cyberus.ca> 2006-07-01 09:35
> On Sat, 2006-01-07 at 13:28 +0200, Thomas Graf wrote:
> > Please enlighten me, I may be making a wrong assumption again.
> > 
> > 1) tc_verd is reset to 0 after dq in ri_tasklet()
> > 2) TC_AT is set to AT_EGRESS in dev_queue_xmit()
> > 3) TC_FROM being derived from TC_AT in tcf_mirred() when redirecting
> > 4) TC_FROM has the value AT_EGRESS when entering ifb_xmit()
> > 
> 
> Let me answer the next bit and it may clear this.

It's pretty clear actually, given eth0->ifb0->ifb1 it would look
like:

dev_queue_xmit(eth0)
  tcf_mirred -> ifb0
dev_queue_xmit(ifb0)
  tcf_mirred -> ifb1
dev_queue_xmit(ifb1)
  ifb_xmit(ifb1)
  <QUEUE> /* Here we queue the packet for the first time and
             release the stack of tx locks acquired on the
	     way. TC_FROM was never reset up here so this can't
	     possibly prevent any tx deadlocks. However, the
	     !from check is effective later on ... */
  ri_tasklet(ifb1)
dev_queue_xmit(ifb0)
  ifb_xmit(ifb0) /* classification disabled */
    /* Drop due to !from with input_dev = ifb1 which is
       good as it prevents to loop the packet back to
       ifb1 again which I refered to earlier */

Is this how it was intented?

> using what i said as looping in the case of A->B->C->D->A for the case
> of egress since that is what you are alluding to;
> note that ifb0 will be entered twice: First for example the tasklet in
> ifb0 will emit the packet, and then it will go all the way back to xmit
> on ifb0. This is where the issue is. Does that clear it?

I tried to stay out of A->B->A for now since that's currently
broken due to mirred unless the deadlock is intentional, f.e.
when setting up eth0->ifb0->eth0 like this:

ip link set ifb0 up
tc qdisc add dev ifb0 parent root handle 1: prio
tc qdisc add dev eth0 parent root handle 1: prio
tc filter add dev eth0 parent 1: protocol ip prio 10 u32
          match ip protocol 1 0xff flowid 1:1
	  action mirred egress redirect dev ifb0
tc filter add dev ifb0 parent 1: protocol ip prio 10 u32
          match ip protocol 1 0xff flowid 1:1
	  action mirred egress redirect dev eth0

The following deadlock will ocur:

dev_queue_xmit(eth0)
  tcf_mirred -> ifb0  /* acquire tcf_mirred->lock on eth0 */
dev_queue_xmit(ifb0)
  tcf_mirred -> eth0  /* acquire tcf_mirred->lock on ifb0
dev_queue_xmit(eth0)
  tcf_mirred -> ifb0  /* deadlock */

This is assuming that no tx deadlock happens of course. I
did try this out just to make sure, the machine just hung.
I can't see any code trying to prevent this but this is
another discussion as ifb is not involved in this, I think
it's purely a problem of mirred.

^ permalink raw reply	[flat|nested] 91+ messages in thread

* Re: [PATCH 2/3] [VLAN]: Update iif when receiving via VLAN device
  2006-07-08 10:54                                                                     ` Thomas Graf
@ 2006-07-08 14:14                                                                       ` Jamal Hadi Salim
  2006-07-08 14:29                                                                         ` Jamal Hadi Salim
  2006-07-08 23:46                                                                         ` Thomas Graf
  0 siblings, 2 replies; 91+ messages in thread
From: Jamal Hadi Salim @ 2006-07-08 14:14 UTC (permalink / raw)
  To: Thomas Graf; +Cc: kaber, netdev, David Miller

On Sat, 2006-08-07 at 12:54 +0200, Thomas Graf wrote:
> * jamal <hadi@cyberus.ca> 2006-07-01 09:35


> It's pretty clear actually, given eth0->ifb0->ifb1 it would look
> like:
> 
> dev_queue_xmit(eth0)
>   tcf_mirred -> ifb0
> dev_queue_xmit(ifb0)
>   tcf_mirred -> ifb1
> dev_queue_xmit(ifb1)
>   ifb_xmit(ifb1)
>   <QUEUE> /* Here we queue the packet for the first time and
>              release the stack of tx locks acquired on the
> 	     way. TC_FROM was never reset up here so this can't
> 	     possibly prevent any tx deadlocks. However, the
> 	     !from check is effective later on ... */
>   ri_tasklet(ifb1)
> dev_queue_xmit(ifb0)
>   ifb_xmit(ifb0) /* classification disabled */
>     /* Drop due to !from with input_dev = ifb1 which is
>        good as it prevents to loop the packet back to
>        ifb1 again which I refered to earlier */
> 
> Is this how it was intented?
> 

yes, indeed. This is one class pathalogical case that was/will be caught
by testing; hence the speacial casing. IOW, this will have a "break"
because of the queue. 
There are other such as loop to self (nasty for say egress eth0->eth0),
which i proposed that Herberts qdisc_is_running patch may resolve (I
need to test).

> I tried to stay out of A->B->A for now since that's currently
> broken due to mirred unless the deadlock is intentional, f.e.
> when setting up eth0->ifb0->eth0 like this:
>
> ip link set ifb0 up
> tc qdisc add dev ifb0 parent root handle 1: prio
> tc qdisc add dev eth0 parent root handle 1: prio
> tc filter add dev eth0 parent 1: protocol ip prio 10 u32
>           match ip protocol 1 0xff flowid 1:1
> 	  action mirred egress redirect dev ifb0
> tc filter add dev ifb0 parent 1: protocol ip prio 10 u32
>           match ip protocol 1 0xff flowid 1:1
> 	  action mirred egress redirect dev eth0
> 

I dont know if i tested for the above specific setup. I have to look at
my testcases when i get the opportunity.
The problem is:
There is no easy way to detect such a deadlock. I think it reduces to
the same case as eth0->eth0, i.e:

tc qdisc add dev eth0 root handle 1: prio
tc filter add dev eth0 parent 1: protocol ip prio 10 u32 \
match u32 0 0 flowid 1:1 \
action mirred egress redirect dev eth0


I have a dated  patch to mirred (may not apply cleanly) that i believe
will fix this specific one. Try to see if it also fixes this case you
have.
I do not want to submit this patch because i have a feeling there is a
lot more sophisticated way to do this. I would like to test Herberts
changes first. If you have time maybe you can try Daves 2.6.18 tree.


> This is assuming that no tx deadlock happens of course. I
> did try this out just to make sure, the machine just hung.
> I can't see any code trying to prevent this but this is
> another discussion as ifb is not involved in this, I think
> it's purely a problem of mirred.

Its the second pathological case tx deadlock essentially and mirred
could help - it is not the mirred lock really if the attached patch
fixes it, but i could be wrong.

cheers,
jamal

PS:- My responses will have some latency due to accessibility


^ permalink raw reply	[flat|nested] 91+ messages in thread

* Re: [PATCH 2/3] [VLAN]: Update iif when receiving via VLAN device
  2006-07-08 14:14                                                                       ` Jamal Hadi Salim
@ 2006-07-08 14:29                                                                         ` Jamal Hadi Salim
  2006-07-08 23:46                                                                         ` Thomas Graf
  1 sibling, 0 replies; 91+ messages in thread
From: Jamal Hadi Salim @ 2006-07-08 14:29 UTC (permalink / raw)
  To: Thomas Graf; +Cc: kaber, netdev, David Miller

[-- Attachment #1: Type: text/plain, Size: 544 bytes --]

On Sat, 2006-08-07 at 10:14 -0400, Jamal Hadi Salim wrote:

> I have a dated  patch to mirred (may not apply cleanly) 

Sorry forgot to attach the patch. Attached for real this time;->

> that i believe
> will fix this specific one. Try to see if it also fixes this case you
> have.

I meant i know this works for eth0->eth0 i am not sure if it will fix
your specific case. I need my laptop at the moment ;->
If it does it will be an ok solution but not the best (because it
introduces an unnecessary check for the common case).

cheers,
jamal

[-- Attachment #2: mirred-lpath1 --]
[-- Type: text/x-patch, Size: 452 bytes --]

diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c
index 4fcccbd..d8946b3 100644
--- a/net/sched/act_mirred.c
+++ b/net/sched/act_mirred.c
@@ -208,6 +208,12 @@ bad_mirred:
 
 	skb2->dev = dev;
 	skb2->input_dev = skb->dev;
+	if (skb2->input_dev == skb2->dev) {
+		if (net_ratelimit())
+			printk(" Mirred: Loop detected to %s\n",skb2->dev->name);
+		goto bad_mirred;
+	}
+
 	dev_queue_xmit(skb2);
 	spin_unlock(&p->lock);
 	return p->action;

^ permalink raw reply related	[flat|nested] 91+ messages in thread

* Re: [PATCH 2/3] [VLAN]: Update iif when receiving via VLAN device
  2006-07-08 14:14                                                                       ` Jamal Hadi Salim
  2006-07-08 14:29                                                                         ` Jamal Hadi Salim
@ 2006-07-08 23:46                                                                         ` Thomas Graf
  2006-07-08 23:48                                                                           ` Thomas Graf
  2006-07-09 12:52                                                                           ` Jamal Hadi Salim
  1 sibling, 2 replies; 91+ messages in thread
From: Thomas Graf @ 2006-07-08 23:46 UTC (permalink / raw)
  To: Jamal Hadi Salim; +Cc: kaber, netdev, David Miller

* Jamal Hadi Salim <hadi@cyberus.ca> 2006-07-08 10:14
> I dont know if i tested for the above specific setup. I have to look at
> my testcases when i get the opportunity.
> The problem is:
> There is no easy way to detect such a deadlock. I think it reduces to
> the same case as eth0->eth0, i.e:

It's very simple. Just use the same method and add a flag if a mirred
is currently in use as Herbert did to qdisc_run().

> I have a dated  patch to mirred (may not apply cleanly) that i believe
> will fix this specific one. Try to see if it also fixes this case you
> have.

This is complete nonsense, fixing a case out of a generic
problem is useless.

> Its the second pathological case tx deadlock essentially and mirred
> could help - it is not the mirred lock really if the attached patch
> fixes it, but i could be wrong.

__LINK_STATE_QDISC_RUNNING will prevent an eventual tx deadlock,
it will not prevent the mirred deadlock.

I think everything has been said about this, since I'm still not
getting the intend of many parts I'm not able to fix these, sorry.

^ permalink raw reply	[flat|nested] 91+ messages in thread

* Re: [PATCH 2/3] [VLAN]: Update iif when receiving via VLAN device
  2006-07-08 23:46                                                                         ` Thomas Graf
@ 2006-07-08 23:48                                                                           ` Thomas Graf
  2006-07-09 12:52                                                                           ` Jamal Hadi Salim
  1 sibling, 0 replies; 91+ messages in thread
From: Thomas Graf @ 2006-07-08 23:48 UTC (permalink / raw)
  To: Jamal Hadi Salim; +Cc: kaber, netdev, David Miller

* Thomas Graf <tgraf@suug.ch> 2006-07-09 01:46
> __LINK_STATE_QDISC_RUNNING will prevent an eventual tx deadlock,
> it will not prevent the mirred deadlock.

BTW, it will also not protect you from deadlocking on
dev->queue_lock while enqueueing.

^ permalink raw reply	[flat|nested] 91+ messages in thread

* Re: [PATCH 2/3] [VLAN]: Update iif when receiving via VLAN device
  2006-07-08 23:46                                                                         ` Thomas Graf
  2006-07-08 23:48                                                                           ` Thomas Graf
@ 2006-07-09 12:52                                                                           ` Jamal Hadi Salim
  2006-07-09 13:17                                                                             ` Jamal Hadi Salim
  2006-07-09 13:33                                                                             ` Thomas Graf
  1 sibling, 2 replies; 91+ messages in thread
From: Jamal Hadi Salim @ 2006-07-09 12:52 UTC (permalink / raw)
  To: Thomas Graf; +Cc: kaber, netdev, David Miller

On Sun, 2006-09-07 at 01:46 +0200, Thomas Graf wrote:
> * Jamal Hadi Salim <hadi@cyberus.ca> 2006-07-08 10:14
[..]
> > There is no easy way to detect such a deadlock. I think it reduces to
> > the same case as eth0->eth0, i.e:
> 
> It's very simple. Just use the same method and add a flag if a mirred
> is currently in use as Herbert did to qdisc_run().
> 

But that would make it more complex in the sense it would require a lot
more code.

Another simpler approach is to check for recursion right in 
dev_queue_xmit() - but i did not dare to do that because i could not
justify it for any other code other than loops created by the action
code. i.e something along the lines of:

if (q->enqueue) {
		if (!spin_trylock(&dev->queue_lock)) {
			printk("yikes recursed device %s\n",dev->name);
                        goto tx_recursion_detected;
                }
.
.
.
recursion_detected:
        rc = -ENETDOWN;
        local_bh_enable();

out_kfree_skb:
        kfree_skb(skb);
        return rc;
out:
        local_bh_enable();
        return rc;
}

[BTW, notice how i used code to describe my view above ;->]
Again, I was hoping that Herbert's stuff may change this view (and
therefore give it more legitimacy) - but if the above can be accepted
then we can forget touching mirred.
The above change if acceptable will catch all sorts of scenarios:
A->A, A->B->A, A->B->C->B etc

> > I have a dated  patch to mirred (may not apply cleanly) that i believe
> > will fix this specific one. Try to see if it also fixes this case you
> > have.
> 
> This is complete nonsense, fixing a case out of a generic
> problem is useless.
> 

No Thomas - this is a valid check. As valid as mirred checking if device
is up first. I dont like it, like i said, because it adds one more check
in the first path. I was contemplating at some point even not doing the 

> > Its the second pathological case tx deadlock essentially and mirred
> > could help - it is not the mirred lock really if the attached patch
> > fixes it, but i could be wrong.
> 
> __LINK_STATE_QDISC_RUNNING will prevent an eventual tx deadlock,
> it will not prevent the mirred deadlock.
> 

By "mirred deadlock" i think you mean the deadlock that happens in
dev_queue_xmit()? 

> I think everything has been said about this, since I'm still not
> getting the intend of many parts I'm not able to fix these, sorry.
> 

What is still not clear above?

cheers,
jamal

^ permalink raw reply	[flat|nested] 91+ messages in thread

* Re: [PATCH 2/3] [VLAN]: Update iif when receiving via VLAN device
  2006-07-09 12:52                                                                           ` Jamal Hadi Salim
@ 2006-07-09 13:17                                                                             ` Jamal Hadi Salim
  2006-07-09 13:33                                                                             ` Thomas Graf
  1 sibling, 0 replies; 91+ messages in thread
From: Jamal Hadi Salim @ 2006-07-09 13:17 UTC (permalink / raw)
  To: Thomas Graf; +Cc: kaber, netdev, David Miller

On Sun, 2006-09-07 at 08:52 -0400, Jamal Hadi Salim wrote:

Ok, didnt complete my thought there ..

> I dont like it, like i said, because it adds one more check
> in the first path. I was contemplating at some point even not doing the 


".. not doing the " check for device up and just shove the packet at it.

cheers,
jamal

PS:-  warning on response still applies.


^ permalink raw reply	[flat|nested] 91+ messages in thread

* Re: [PATCH 2/3] [VLAN]: Update iif when receiving via VLAN device
  2006-07-09 12:52                                                                           ` Jamal Hadi Salim
  2006-07-09 13:17                                                                             ` Jamal Hadi Salim
@ 2006-07-09 13:33                                                                             ` Thomas Graf
  2006-07-09 14:03                                                                               ` Jamal Hadi Salim
  1 sibling, 1 reply; 91+ messages in thread
From: Thomas Graf @ 2006-07-09 13:33 UTC (permalink / raw)
  To: Jamal Hadi Salim; +Cc: kaber, netdev, David Miller

* Jamal Hadi Salim <hadi@cyberus.ca> 2006-07-09 08:52
> On Sun, 2006-09-07 at 01:46 +0200, Thomas Graf wrote:
> > * Jamal Hadi Salim <hadi@cyberus.ca> 2006-07-08 10:14
> [..]
> > > There is no easy way to detect such a deadlock. I think it reduces to
> > > the same case as eth0->eth0, i.e:
> > 
> > It's very simple. Just use the same method and add a flag if a mirred
> > is currently in use as Herbert did to qdisc_run().
> > 
> 
> But that would make it more complex in the sense it would require a lot
> more code.
> 
> Another simpler approach is to check for recursion right in 

It's very interesting that you change from "there is no easy way"
to "another simpler approach..." in just one posting :-)

> dev_queue_xmit() - but i did not dare to do that because i could not
> justify it for any other code other than loops created by the action
> code. i.e something along the lines of:
> 
> if (q->enqueue) {
> 		if (!spin_trylock(&dev->queue_lock)) {
> 			printk("yikes recursed device %s\n",dev->name);
>                         goto tx_recursion_detected;
>                 }

That's not gonna work, dev->queue_lock may be held legimitately
by someone else than an underlying dev_queue_xmit() call.

> [BTW, notice how i used code to describe my view above ;->]

I'm very proud of you.

> Again, I was hoping that Herbert's stuff may change this view (and
> therefore give it more legitimacy) - but if the above can be accepted
> then we can forget touching mirred.

You need this:

if (test_and_set_bit(__LINK_STATE_ENQUEUEING, &dev->state))
	goto tx_recursion_detected;

spin_lock(queue_lock);
enqueue(...)
qdisc_run()
spin_unlock(queue_lock);

clear_bit(__LINK_TATE_ENQUEUEING, &dev->state);

Unfortunately we'd still need __LINK_STATE_QDISC_RUNNING due
to net_tx_action().

> No Thomas - this is a valid check. As valid as mirred checking if device
> is up first. I dont like it, like i said, because it adds one more check
> in the first path. I was contemplating at some point even not doing the 

I'm not going to argue about this.

> By "mirred deadlock" i think you mean the deadlock that happens in
> dev_queue_xmit()? 

I meant the deadlock within mirred but the deadlock on queue_lock
is happening first anyway.

> What is still not clear above?

Frankly, I have no idea which deadlocks are intentional, what ideas
you have about ingress->egress, etc because it is not documented.
The list is very long. I'm not going to waste time trying to work
out a patch that will then not suit the ideas in your mind and on
your notes. You're maintaing this code, no?

^ permalink raw reply	[flat|nested] 91+ messages in thread

* Re: [PATCH 2/3] [VLAN]: Update iif when receiving via VLAN device
  2006-07-09 13:33                                                                             ` Thomas Graf
@ 2006-07-09 14:03                                                                               ` Jamal Hadi Salim
  2006-07-09 14:19                                                                                 ` Thomas Graf
  0 siblings, 1 reply; 91+ messages in thread
From: Jamal Hadi Salim @ 2006-07-09 14:03 UTC (permalink / raw)
  To: Thomas Graf; +Cc: kaber, netdev, David Miller

On Sun, 2006-09-07 at 15:33 +0200, Thomas Graf wrote:
> * Jamal Hadi Salim <hadi@cyberus.ca> 2006-07-09 08:52
[..]
> > Another simpler approach is to check for recursion right in 
> 
> It's very interesting that you change from "there is no easy way"
> to "another simpler approach..." in just one posting :-)
> 

I said there was no easy way of detecting - which is different from
preventing. Both approaches i showed were things i used (as far back as
last year) but did not find palatable to submit.
Hopefully we dont go into a tangent on this.

> > if (q->enqueue) {
> > 		if (!spin_trylock(&dev->queue_lock)) {
> > 			printk("yikes recursed device %s\n",dev->name);
> >                         goto tx_recursion_detected;
> >                 }
> 
> That's not gonna work, dev->queue_lock may be held legimitately
> by someone else than an underlying dev_queue_xmit() call.
> 

If there is a legitimate reason then it wont work. I cant think of one
though.

> > [BTW, notice how i used code to describe my view above ;->]
> 
> I'm very proud of you.
> 

Thank you thank you

> > Again, I was hoping that Herbert's stuff may change this view (and
> > therefore give it more legitimacy) - but if the above can be accepted
> > then we can forget touching mirred.
> 
> You need this:
> 
> if (test_and_set_bit(__LINK_STATE_ENQUEUEING, &dev->state))
> 	goto tx_recursion_detected;
> 
> spin_lock(queue_lock);
> enqueue(...)
> qdisc_run()
> spin_unlock(queue_lock);
> 
> clear_bit(__LINK_TATE_ENQUEUEING, &dev->state);
> 
> Unfortunately we'd still need __LINK_STATE_QDISC_RUNNING due
> to net_tx_action().
> 

This is also another approach that would work. If you think its simpler
go ahead and shoot a patch.

> > By "mirred deadlock" i think you mean the deadlock that happens in
> > dev_queue_xmit()? 
> 
> I meant the deadlock within mirred but the deadlock on queue_lock
> is happening first anyway.
> 

if you can stop things at the queue you wont have to worry about mirred.

> > What is still not clear above?
> 
> Frankly, I have no idea which deadlocks are intentional, what ideas
> you have about ingress->egress, etc because it is not documented.
> The list is very long. 

Ask one question at a time and i will answer. Some of the things we
discussed have no place to be commented-on in the code. But i think what
is obvious is:

A->*->A is a no-no.
And in some cases it is fine to let the user just fsck themselves
because then they will understand it is a bad idea [1] when shit
happens. OTOH, if there was a KISS way of doing it (as in the ifb case,
why not).

> I'm not going to waste time trying to work
> out a patch that will then not suit the ideas in your mind and on
> your notes. You're maintaing this code, no?

Yes, of course otherwise i wouldnt bother to comment on any patches.

cheers,
jamal

[1] there was a long discussion a few years back when i was still in
lkml on someone trying to redirect (i think) /dev/null -> /dev/mem
(dont hold me accountable if those are not the right devices, just
absorb the moral of this instead). The consensus was it would be very
difficult to do without making a lot of changes and if someone wishes to
do that they deserved what they are getting.

^ permalink raw reply	[flat|nested] 91+ messages in thread

* Re: [PATCH 2/3] [VLAN]: Update iif when receiving via VLAN device
  2006-07-09 14:03                                                                               ` Jamal Hadi Salim
@ 2006-07-09 14:19                                                                                 ` Thomas Graf
  2006-07-09 15:00                                                                                   ` Jamal Hadi Salim
  0 siblings, 1 reply; 91+ messages in thread
From: Thomas Graf @ 2006-07-09 14:19 UTC (permalink / raw)
  To: Jamal Hadi Salim; +Cc: kaber, netdev, David Miller

* Jamal Hadi Salim <hadi@cyberus.ca> 2006-07-09 10:03
> On Sun, 2006-09-07 at 15:33 +0200, Thomas Graf wrote:
> > That's not gonna work, dev->queue_lock may be held legimitately
> > by someone else than an underlying dev_queue_xmit() call.
> > 
> 
> If there is a legitimate reason then it wont work. I cant think of one
> though.

See sch_generic.c, it's documented. A simple grep on queue_lock
would have told you the same.

> This is also another approach that would work. If you think its simpler
> go ahead and shoot a patch.

It's not simpler, it's correct, while your patch is wrong.

> A->*->A is a no-no.
> And in some cases it is fine to let the user just fsck themselves
> because then they will understand it is a bad idea [1] when shit
> happens. OTOH, if there was a KISS way of doing it (as in the ifb case,
> why not).

I remind you that you started mentioning this A->*->A case while
talking about tx deadlocks that were supposed to be prevented with
the !from check or something along that lines. I can't really tell
because you explain it differently in every posting.

> Yes, of course otherwise i wouldnt bother to comment on any patches.

So maintain the code and fix your bugs.

^ permalink raw reply	[flat|nested] 91+ messages in thread

* Re: [PATCH 2/3] [VLAN]: Update iif when receiving via VLAN device
  2006-07-09 14:19                                                                                 ` Thomas Graf
@ 2006-07-09 15:00                                                                                   ` Jamal Hadi Salim
  2006-07-09 15:54                                                                                     ` Thomas Graf
  0 siblings, 1 reply; 91+ messages in thread
From: Jamal Hadi Salim @ 2006-07-09 15:00 UTC (permalink / raw)
  To: Thomas Graf; +Cc: kaber, netdev, David Miller

On Sun, 2006-09-07 at 16:19 +0200, Thomas Graf wrote:
> * Jamal Hadi Salim <hadi@cyberus.ca> 2006-07-09 10:03
> > On Sun, 2006-09-07 at 15:33 +0200, Thomas Graf wrote:
> > > That's not gonna work, dev->queue_lock may be held legimitately
> > > by someone else than an underlying dev_queue_xmit() call.
> > > 
> > 
> > If there is a legitimate reason then it wont work. I cant think of one
> > though.
> 
> See sch_generic.c, it's documented. A simple grep on queue_lock
> would have told you the same.
> 

If you mean that the device will also try to grab the qlock there, then
that is fine still for the serialization. It all starts at
dev_queue_xmit.

> > This is also another approach that would work. If you think its simpler
> > go ahead and shoot a patch.
> 
> It's not simpler, it's correct, while your patch is wrong.
> 

Ok, calm down, will you? Man, you make it very hard to follow Daves
Tibetan approach. Let me tell you exactly how i feel about your
approach: It is unnecessarily complex. The approach i posted is not only
fine, it works with only 4-5 lines of code; i have numerous tests
against it over a long period of time. I was trying to be polite and
follow Daves advice not to insist it has to be done my way - it almost
seems impossible with you trying to nitpick on little unimportant
details.

> > A->*->A is a no-no.
> > And in some cases it is fine to let the user just fsck themselves
> > because then they will understand it is a bad idea [1] when shit
> > happens. OTOH, if there was a KISS way of doing it (as in the ifb case,
> > why not).
> 
> I remind you that you started mentioning this A->*->A case while
> talking about tx deadlocks that were supposed to be prevented with
> the !from check or something along that lines. I can't really tell
> because you explain it differently in every posting.
> 

Because you are looking for preciseness at the micro/code level and i am
trying to explain at the conceptual/macro level (I have to adjust to
your mode but it is hard for me because i dont think at our level that
matters). When i sense you didnt understand me the last time, I try to
explain it better. The deadlock happens on transmit. If you want to be
precise, it happens when dev_queue_xmit is invoked. If you want more
preciseness, when the queue lock is contended. If you want more
preciseness, the code sample that i showed you should help illustrate
it.

> > Yes, of course otherwise i wouldnt bother to comment on any patches.
> 
> So maintain the code and fix your bugs.

Ok, I dont think this is gonna work - I may have to give up on getting
any resolution. 

Heres the suggestion again and you can still shoot a patch:
- If we can avoid doing anything at mirred that is the most preferable
approach. i.e get the change for free. In other words if it complicates
things it is not worth it. Someone redirecting eth0 to eth0 on egress
deserves whats happening to them.

- Try it against Herberts patch. It may resolve something or may require
an adjustment to Herberts patch so we can kill two birds with one stone
i.e get it for free. I dont know. I guess i shouldnt say "i dont know"
it is not precise, my point is you may find things when you attempt the
change. The patch is in Daves 2.6.18 tree.

- Iam fine with your suggestion to use a flag. The problem is not which
scheme you use - it is whether the change at such a crucial path in the
stack would be acceptable to other people.

Anyways, I am off.

cheers,
jamal

^ permalink raw reply	[flat|nested] 91+ messages in thread

* Re: [PATCH 2/3] [VLAN]: Update iif when receiving via VLAN device
  2006-07-09 15:00                                                                                   ` Jamal Hadi Salim
@ 2006-07-09 15:54                                                                                     ` Thomas Graf
  2006-07-09 23:06                                                                                       ` Jamal Hadi Salim
  0 siblings, 1 reply; 91+ messages in thread
From: Thomas Graf @ 2006-07-09 15:54 UTC (permalink / raw)
  To: Jamal Hadi Salim; +Cc: kaber, netdev, David Miller

* Jamal Hadi Salim <hadi@cyberus.ca> 2006-07-09 11:00
> If you mean that the device will also try to grab the qlock there, then
> that is fine still for the serialization. It all starts at
> dev_queue_xmit.

Look at where dev->queue_lock is taken, whenever a qdisc or
filter is added, modified or deleted the lock is taken. Using
your approach packets get dropped while such an operation is
taking place. Your approach is wrong.

^ permalink raw reply	[flat|nested] 91+ messages in thread

* Re: [PATCH 2/3] [VLAN]: Update iif when receiving via VLAN device
  2006-07-09 15:54                                                                                     ` Thomas Graf
@ 2006-07-09 23:06                                                                                       ` Jamal Hadi Salim
  0 siblings, 0 replies; 91+ messages in thread
From: Jamal Hadi Salim @ 2006-07-09 23:06 UTC (permalink / raw)
  To: Thomas Graf; +Cc: kaber, netdev, David Miller

On Sun, 2006-09-07 at 17:54 +0200, Thomas Graf wrote:

> Look at where dev->queue_lock is taken, whenever a qdisc or
> filter is added, modified or deleted the lock is taken. Using
> your approach packets get dropped while such an operation is
> taking place. 

True, this will be bad for devices that dont care about any of this
(looping etc).
Perhaps it may even have been a reason that held me back - i dont
remember and it doesnt matter.
So go ahead and submit a patch while keeping in mind the same reasoning
as above.

> Your approach is wrong.

And this is where we are going to go into a loop. Do you want to go that
path? I suggest we dont.

cheers,
jamal


^ permalink raw reply	[flat|nested] 91+ messages in thread

end of thread, other threads:[~2006-07-09 23:07 UTC | newest]

Thread overview: 91+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2006-06-26 14:54 [PATCHSET] Towards accurate incoming interface information Thomas Graf
2006-06-25 22:00 ` [PATCH 1/3] [NET]: Use interface index to keep input device information Thomas Graf
2006-06-25 22:00 ` [PATCH 2/3] [VLAN]: Update iif when receiving via VLAN device Thomas Graf
2006-06-26 17:04   ` Patrick McHardy
2006-06-26 17:46     ` David Miller
2006-06-26 18:24       ` Patrick McHardy
2006-06-26 18:44       ` Thomas Graf
2006-06-26 22:29         ` Patrick McHardy
2006-06-26 22:49           ` Patrick McHardy
2006-06-27 10:03           ` Thomas Graf
2006-06-27 13:07             ` jamal
2006-06-28 10:18               ` Thomas Graf
2006-06-28 12:22                 ` jamal
2006-06-28 13:01                   ` Thomas Graf
2006-06-28 13:46                     ` jamal
2006-06-29  8:51                       ` Thomas Graf
2006-06-29 20:55                         ` Thomas Graf
2006-06-29 23:23                         ` jamal
2006-06-29 23:39                           ` Thomas Graf
2006-06-29 23:47                             ` David Miller
2006-06-30  0:08                               ` jamal
2006-06-30  0:12                                 ` David Miller
2006-06-30  0:26                                   ` jamal
2006-06-30  0:29                                     ` David Miller
2006-06-30  0:44                                       ` Ben Greear
2006-06-30  0:48                                       ` jamal
2006-06-30  0:55                                         ` Thomas Graf
2006-06-30  1:18                                           ` jamal
2006-06-30  0:03                             ` jamal
2006-06-30  0:46                               ` Thomas Graf
2006-06-30  1:11                                 ` jamal
2006-06-30 13:08                                   ` Thomas Graf
2006-06-30 13:20                                     ` Nicolas Dichtel
2006-06-30 13:57                                     ` jamal
2006-06-30 14:15                                       ` Thomas Graf
2006-06-30 14:35                                         ` jamal
2006-06-30 15:40                                           ` Ben Greear
2006-06-30 16:32                                           ` Thomas Graf
2006-06-30 17:13                                             ` Thomas Graf
2006-06-30 17:18                                               ` Patrick McHardy
2006-06-30 17:32                                                 ` jamal
2006-06-30 17:42                                                   ` Patrick McHardy
2006-06-30 17:44                                                     ` Thomas Graf
2006-06-30 19:40                                                     ` jamal
2006-06-30 20:17                                                       ` David Miller
2006-06-30 17:42                                                   ` Thomas Graf
2006-06-30 19:34                                                     ` jamal
2006-06-30 20:08                                                       ` Thomas Graf
2006-06-30 20:42                                                         ` jamal
2006-06-30 17:27                                               ` Ben Greear
2006-06-30 21:09                                               ` jamal
2006-06-30 21:20                                                 ` Thomas Graf
2006-06-30 21:35                                                   ` jamal
2006-06-30 23:22                                                     ` Thomas Graf
2006-07-01  2:23                                                       ` jamal
2006-07-01 11:51                                                         ` Thomas Graf
2006-07-01 13:47                                                           ` jamal
2006-06-30 17:19                                             ` jamal
2006-06-30 17:33                                               ` Patrick McHardy
2006-06-30 19:59                                                 ` jamal
2006-06-30 20:30                                                   ` Thomas Graf
2006-06-30 20:33                                                     ` David Miller
2006-06-30 20:54                                                       ` jamal
2006-06-30 21:10                                                         ` Thomas Graf
2006-06-30 21:31                                                           ` jamal
2006-06-30 23:45                                                             ` Thomas Graf
2006-07-01  2:59                                                               ` jamal
2006-07-01 11:28                                                                 ` Thomas Graf
2006-07-01 13:35                                                                   ` jamal
2006-07-08 10:54                                                                     ` Thomas Graf
2006-07-08 14:14                                                                       ` Jamal Hadi Salim
2006-07-08 14:29                                                                         ` Jamal Hadi Salim
2006-07-08 23:46                                                                         ` Thomas Graf
2006-07-08 23:48                                                                           ` Thomas Graf
2006-07-09 12:52                                                                           ` Jamal Hadi Salim
2006-07-09 13:17                                                                             ` Jamal Hadi Salim
2006-07-09 13:33                                                                             ` Thomas Graf
2006-07-09 14:03                                                                               ` Jamal Hadi Salim
2006-07-09 14:19                                                                                 ` Thomas Graf
2006-07-09 15:00                                                                                   ` Jamal Hadi Salim
2006-07-09 15:54                                                                                     ` Thomas Graf
2006-07-09 23:06                                                                                       ` Jamal Hadi Salim
2006-07-01  2:47                                                   ` Patrick McHardy
2006-06-30 17:34                                               ` Thomas Graf
     [not found]                                     ` <44A52435.20909@6wind.com>
2006-06-30 13:36                                       ` Thomas Graf
2006-06-30 13:43                                         ` Nicolas Dichtel
2006-06-30 17:01                                       ` Patrick McHardy
2006-06-30 17:02                                         ` Patrick McHardy
2006-06-25 22:00 ` [PATCH 3/3] [PKT_SCHED]: Add iif meta match Thomas Graf
2006-06-27 15:07 ` [PATCHSET] Towards accurate incoming interface information Thomas Graf
2006-06-27 20:24   ` David Miller

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).