* [PATCH 1/3] LRO: fix return code propagation
From: Jeff Kirsher @ 2008-08-22 0:51 UTC (permalink / raw)
To: davem; +Cc: jeff, netdev, Jesse Brandeburg, Jeff Kirsher
From: Jesse Brandeburg <jesse.brandeburg@intel.com>
The LRO code needs to propagate the return values from the netif_rx and
netif_receive_skb calls so that drivers which need that information can
react correctly. Some places in the code couldn't return the result of
the upcall to the stack, so I added the dropped counter, which just
completes the cache line of the stats struct (on 64-bit) and so
shouldn't be a big deal.
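For illustration, a driver could consume the propagated value roughly like
this (a minimal sketch; the example_adapter structure and its fields are
hypothetical and not part of this patch):

	#include <linux/skbuff.h>
	#include <linux/inet_lro.h>

	struct example_adapter {		/* hypothetical driver state */
		struct net_lro_mgr lro_mgr;
		unsigned long rx_dropped_backlog;
	};

	static void example_clean_rx(struct example_adapter *adapter,
				     struct sk_buff *skb)
	{
		/* lro_receive_skb() now returns NET_RX_SUCCESS or the value
		 * propagated from netif_rx()/netif_receive_skb() */
		if (lro_receive_skb(&adapter->lro_mgr, skb, NULL) == NET_RX_DROP)
			adapter->rx_dropped_backlog++;	/* stack is congested */
	}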
Signed-off-by: Jesse Brandeburg <jesse.brandeburg@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
include/linux/inet_lro.h | 9 +++---
net/ipv4/inet_lro.c | 68 +++++++++++++++++++++++++++-------------------
2 files changed, 45 insertions(+), 32 deletions(-)
diff --git a/include/linux/inet_lro.h b/include/linux/inet_lro.h
index c4335fa..06380ec 100644
--- a/include/linux/inet_lro.h
+++ b/include/linux/inet_lro.h
@@ -39,6 +39,7 @@ struct net_lro_stats {
unsigned long aggregated;
unsigned long flushed;
unsigned long no_desc;
+ unsigned long dropped;
};
/*
@@ -132,7 +133,7 @@ struct net_lro_mgr {
* (for example get_tcp_ip_hdr)
*/
-void lro_receive_skb(struct net_lro_mgr *lro_mgr,
+int lro_receive_skb(struct net_lro_mgr *lro_mgr,
struct sk_buff *skb,
void *priv);
@@ -140,7 +141,7 @@ void lro_receive_skb(struct net_lro_mgr *lro_mgr,
* Processes a SKB with VLAN HW acceleration support
*/
-void lro_vlan_hwaccel_receive_skb(struct net_lro_mgr *lro_mgr,
+int lro_vlan_hwaccel_receive_skb(struct net_lro_mgr *lro_mgr,
struct sk_buff *skb,
struct vlan_group *vgrp,
u16 vlan_tag,
@@ -161,11 +162,11 @@ void lro_vlan_hwaccel_receive_skb(struct net_lro_mgr *lro_mgr,
* (for example get_tcp_ip_hdr)
*/
-void lro_receive_frags(struct net_lro_mgr *lro_mgr,
+int lro_receive_frags(struct net_lro_mgr *lro_mgr,
struct skb_frag_struct *frags,
int len, int true_size, void *priv, __wsum sum);
-void lro_vlan_hwaccel_receive_frags(struct net_lro_mgr *lro_mgr,
+int lro_vlan_hwaccel_receive_frags(struct net_lro_mgr *lro_mgr,
struct skb_frag_struct *frags,
int len, int true_size,
struct vlan_group *vgrp,
diff --git a/net/ipv4/inet_lro.c b/net/ipv4/inet_lro.c
index cfd034a..918a3a1 100644
--- a/net/ipv4/inet_lro.c
+++ b/net/ipv4/inet_lro.c
@@ -301,9 +301,11 @@ out:
return lro_desc;
}
-static void lro_flush(struct net_lro_mgr *lro_mgr,
+static int lro_flush(struct net_lro_mgr *lro_mgr,
struct net_lro_desc *lro_desc)
{
+ int ret = NET_RX_SUCCESS;
+
if (lro_desc->pkt_aggr_cnt > 1)
lro_update_tcp_ip_header(lro_desc);
@@ -311,23 +313,25 @@ static void lro_flush(struct net_lro_mgr *lro_mgr,
if (lro_desc->vgrp) {
if (lro_mgr->features & LRO_F_NAPI)
- vlan_hwaccel_receive_skb(lro_desc->parent,
- lro_desc->vgrp,
- lro_desc->vlan_tag);
+ ret = vlan_hwaccel_receive_skb(lro_desc->parent,
+ lro_desc->vgrp,
+ lro_desc->vlan_tag);
else
- vlan_hwaccel_rx(lro_desc->parent,
- lro_desc->vgrp,
- lro_desc->vlan_tag);
+ ret = vlan_hwaccel_rx(lro_desc->parent,
+ lro_desc->vgrp,
+ lro_desc->vlan_tag);
} else {
if (lro_mgr->features & LRO_F_NAPI)
- netif_receive_skb(lro_desc->parent);
+ ret = netif_receive_skb(lro_desc->parent);
else
- netif_rx(lro_desc->parent);
+ ret = netif_rx(lro_desc->parent);
}
LRO_INC_STATS(lro_mgr, flushed);
lro_clear_desc(lro_desc);
+
+ return ret;
}
static int __lro_proc_skb(struct net_lro_mgr *lro_mgr, struct sk_buff *skb,
@@ -376,12 +380,14 @@ static int __lro_proc_skb(struct net_lro_mgr *lro_mgr, struct sk_buff *skb,
if ((lro_desc->pkt_aggr_cnt >= lro_mgr->max_aggr) ||
lro_desc->parent->len > (0xFFFF - lro_mgr->dev->mtu))
- lro_flush(lro_mgr, lro_desc);
+ if (lro_flush(lro_mgr, lro_desc) == NET_RX_DROP)
+ LRO_INC_STATS(lro_mgr, dropped);
return 0;
out2: /* send aggregated SKBs to stack */
- lro_flush(lro_mgr, lro_desc);
+ if (lro_flush(lro_mgr, lro_desc) == NET_RX_DROP)
+ LRO_INC_STATS(lro_mgr, dropped);
out:
return 1;
@@ -401,8 +407,10 @@ static struct sk_buff *lro_gen_skb(struct net_lro_mgr *lro_mgr,
int hdr_len = min(len, hlen);
skb = netdev_alloc_skb(lro_mgr->dev, hlen + lro_mgr->frag_align_pad);
- if (!skb)
+ if (!skb) {
+ LRO_INC_STATS(lro_mgr, dropped);
return NULL;
+ }
skb_reserve(skb, lro_mgr->frag_align_pad);
skb->len = len;
@@ -496,12 +504,14 @@ static struct sk_buff *__lro_proc_segment(struct net_lro_mgr *lro_mgr,
if ((skb_shinfo(lro_desc->parent)->nr_frags >= lro_mgr->max_aggr) ||
lro_desc->parent->len > (0xFFFF - lro_mgr->dev->mtu))
- lro_flush(lro_mgr, lro_desc);
+ if (lro_flush(lro_mgr, lro_desc) == NET_RX_DROP)
+ LRO_INC_STATS(lro_mgr, dropped);
return NULL;
out2: /* send aggregated packets to the stack */
- lro_flush(lro_mgr, lro_desc);
+ if (lro_flush(lro_mgr, lro_desc) == NET_RX_DROP)
+ LRO_INC_STATS(lro_mgr, dropped);
out1: /* Original packet has to be posted to the stack */
skb = lro_gen_skb(lro_mgr, frags, len, true_size, mac_hdr,
@@ -510,20 +520,21 @@ out:
return skb;
}
-void lro_receive_skb(struct net_lro_mgr *lro_mgr,
+int lro_receive_skb(struct net_lro_mgr *lro_mgr,
struct sk_buff *skb,
void *priv)
{
if (__lro_proc_skb(lro_mgr, skb, NULL, 0, priv)) {
if (lro_mgr->features & LRO_F_NAPI)
- netif_receive_skb(skb);
+ return netif_receive_skb(skb);
else
- netif_rx(skb);
+ return netif_rx(skb);
}
+ return NET_RX_SUCCESS;
}
EXPORT_SYMBOL(lro_receive_skb);
-void lro_vlan_hwaccel_receive_skb(struct net_lro_mgr *lro_mgr,
+int lro_vlan_hwaccel_receive_skb(struct net_lro_mgr *lro_mgr,
struct sk_buff *skb,
struct vlan_group *vgrp,
u16 vlan_tag,
@@ -531,14 +542,15 @@ void lro_vlan_hwaccel_receive_skb(struct net_lro_mgr *lro_mgr,
{
if (__lro_proc_skb(lro_mgr, skb, vgrp, vlan_tag, priv)) {
if (lro_mgr->features & LRO_F_NAPI)
- vlan_hwaccel_receive_skb(skb, vgrp, vlan_tag);
+ return vlan_hwaccel_receive_skb(skb, vgrp, vlan_tag);
else
- vlan_hwaccel_rx(skb, vgrp, vlan_tag);
+ return vlan_hwaccel_rx(skb, vgrp, vlan_tag);
}
+ return NET_RX_SUCCESS;
}
EXPORT_SYMBOL(lro_vlan_hwaccel_receive_skb);
-void lro_receive_frags(struct net_lro_mgr *lro_mgr,
+int lro_receive_frags(struct net_lro_mgr *lro_mgr,
struct skb_frag_struct *frags,
int len, int true_size, void *priv, __wsum sum)
{
@@ -547,16 +559,16 @@ void lro_receive_frags(struct net_lro_mgr *lro_mgr,
skb = __lro_proc_segment(lro_mgr, frags, len, true_size, NULL, 0,
priv, sum);
if (!skb)
- return;
+ return NET_RX_SUCCESS;
if (lro_mgr->features & LRO_F_NAPI)
- netif_receive_skb(skb);
+ return netif_receive_skb(skb);
else
- netif_rx(skb);
+ return netif_rx(skb);
}
EXPORT_SYMBOL(lro_receive_frags);
-void lro_vlan_hwaccel_receive_frags(struct net_lro_mgr *lro_mgr,
+int lro_vlan_hwaccel_receive_frags(struct net_lro_mgr *lro_mgr,
struct skb_frag_struct *frags,
int len, int true_size,
struct vlan_group *vgrp,
@@ -567,12 +579,12 @@ void lro_vlan_hwaccel_receive_frags(struct net_lro_mgr *lro_mgr,
skb = __lro_proc_segment(lro_mgr, frags, len, true_size, vgrp,
vlan_tag, priv, sum);
if (!skb)
- return;
+ return NET_RX_SUCCESS;
if (lro_mgr->features & LRO_F_NAPI)
- vlan_hwaccel_receive_skb(skb, vgrp, vlan_tag);
+ return vlan_hwaccel_receive_skb(skb, vgrp, vlan_tag);
else
- vlan_hwaccel_rx(skb, vgrp, vlan_tag);
+ return vlan_hwaccel_rx(skb, vgrp, vlan_tag);
}
EXPORT_SYMBOL(lro_vlan_hwaccel_receive_frags);
* [PATCH 2/3] netlink: nla_parse_nested_compat was not parsing nested attributes
From: Jeff Kirsher @ 2008-08-22 0:51 UTC (permalink / raw)
To: davem; +Cc: jeff, netdev, Alexander Duyck, Jeff Kirsher
From: Alexander Duyck <alexander.h.duyck@intel.com>
This patch reverts the previous commit b9a2f2e450b0f770bb4347ae8d48eb2dea701e24
("netlink: Fix nla_parse_nested_compat() to call nla_parse() directly").
The purpose of nla_parse_nested_compat() is to parse attributes which contain a
struct followed by a container attribute holding a stream of nested attributes.
The reverted patch assumed that there was no container attribute, due to a
malformed string of attributes being generated by netem_parse_opts.
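For reference, the send side in this series builds such an attribute with the
nla_nest_compat_start()/nla_nest_compat_end() helpers (see the sch_prio dump
path in patch 3/3); a minimal sketch of that build side, reusing tc_prio_qopt
and TCA_PRIO_MQ from patch 3/3 purely as example payloads:

	/* struct tc_prio_qopt first, then the container with the nested attrs */
	struct tc_prio_qopt opt = { .bands = 3 };
	struct nlattr *nest;

	nest = nla_nest_compat_start(skb, TCA_OPTIONS, sizeof(opt), &opt);
	if (nest == NULL)
		goto nla_put_failure;
	if (nla_put_flag(skb, TCA_PRIO_MQ) < 0)	/* nested attribute(s) */
		goto nla_put_failure;
	nla_nest_compat_end(skb, nest);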
Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
include/net/netlink.h | 11 +++++------
1 files changed, 5 insertions(+), 6 deletions(-)
diff --git a/include/net/netlink.h b/include/net/netlink.h
index 18024b8..b295cd1 100644
--- a/include/net/netlink.h
+++ b/include/net/netlink.h
@@ -770,13 +770,12 @@ static inline int __nla_parse_nested_compat(struct nlattr *tb[], int maxtype,
const struct nla_policy *policy,
int len)
{
- int nested_len = nla_len(nla) - NLA_ALIGN(len);
-
- if (nested_len < 0)
+ if (nla_len(nla) < len)
return -EINVAL;
- if (nested_len >= nla_attr_size(0))
- return nla_parse(tb, maxtype, nla_data(nla) + NLA_ALIGN(len),
- nested_len, policy);
+ if (nla_len(nla) >= NLA_ALIGN(len) + sizeof(struct nlattr))
+ return nla_parse_nested(tb, maxtype,
+ nla_data(nla) + NLA_ALIGN(len),
+ policy);
memset(tb, 0, sizeof(struct nlattr *) * (maxtype + 1));
return 0;
}
* Re: [PATCH 2/3] netlink: nla_parse_nested_compat was not parsing nested attributes
From: David Miller @ 2008-08-22 10:18 UTC (permalink / raw)
To: jeffrey.t.kirsher; +Cc: jeff, netdev, alexander.h.duyck, tgraf
From: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
Date: Thu, 21 Aug 2008 17:51:26 -0700
Please at least CC: the author of the patch you are reverting so
that they can provide feedback and have a look at this.
Thomas Graf CC:'d.
> From: Alexander Duyck <alexander.h.duyck@intel.com>
>
> This patch reverts previous commit: b9a2f2e450b0f770bb4347ae8d48eb2dea701e24
> netlink: Fix nla_parse_nested_compat() to call nla_parse() directly
>
> The purpose of nla_parse_nested_compat is to parse attributes which contain a
> struct followed by a container attribute with a stream of nested attributes.
> This patch reverts the previous patch which assumed that there was no container
> attribute due to a malformed string of attributes being generated by
> netem_parse_opts
>
> Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
> Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
> ---
>
> include/net/netlink.h | 11 +++++------
> 1 files changed, 5 insertions(+), 6 deletions(-)
>
> diff --git a/include/net/netlink.h b/include/net/netlink.h
> index 18024b8..b295cd1 100644
> --- a/include/net/netlink.h
> +++ b/include/net/netlink.h
> @@ -770,13 +770,12 @@ static inline int __nla_parse_nested_compat(struct nlattr *tb[], int maxtype,
> const struct nla_policy *policy,
> int len)
> {
> - int nested_len = nla_len(nla) - NLA_ALIGN(len);
> -
> - if (nested_len < 0)
> + if (nla_len(nla) < len)
> return -EINVAL;
> - if (nested_len >= nla_attr_size(0))
> - return nla_parse(tb, maxtype, nla_data(nla) + NLA_ALIGN(len),
> - nested_len, policy);
> + if (nla_len(nla) >= NLA_ALIGN(len) + sizeof(struct nlattr))
> + return nla_parse_nested(tb, maxtype,
> + nla_data(nla) + NLA_ALIGN(len),
> + policy);
> memset(tb, 0, sizeof(struct nlattr *) * (maxtype + 1));
> return 0;
> }
>
* RE: [PATCH 2/3] netlink: nla_parse_nested_compat was not parsing nested attributes
From: Duyck, Alexander H @ 2008-08-22 17:40 UTC (permalink / raw)
To: David Miller, Kirsher, Jeffrey T
Cc: jeff@garzik.org, netdev@vger.kernel.org, tgraf@suug.ch,
shemminger@osdl.org, kaber@trash.net
I figured I would CC Stephen Hemminger and Patrick McHardy since they also will likely have an interest in this.
Just to put together a quick history on this issue I will list off how things got to where they are:
1. "[NET_SCHED]: sch_netem: use nla_parse_nested_compat" (http://marc.info/?l=linux-netdev&m=120110639320780&w=2) from Patrick McHardy. After this patch netem started reporting "bytes leftover after parsing attributes" errors as it wasn't actually receiving a nested compat attribute. This patch was added around 2.6.25-rc1.
2. "[NETLINK]: Fix nla_parse_nested_compat() to call nla_parse() directly" (http://marc.info/?l=linux-netdev&m=121145625328370&w=2) from Thomas Graf. This resolved the errors but in turn any netlink messages that generated correct nested netlink attributes would have those attributes ignored as they would be automatically parsed out as one attribute with a length of all the nested attributes. This patch was added around 2.6.26-rc4.
Patches 3 & 4 haven't been applied yet and are under review:
3. "[PATCH 2/3] netlink: nla_parse_nested_compat was not parsing nested attributes" (http://marc.info/?l=linux-netdev&m=121936623112195&w=2) from me. This reverts patch 2.
4. "[PATCH] IPROUTE: correct nla nested message generated by netem_parse_opt" (http://marc.info/?l=linux-netdev&m=121936623112195&w=2) from me. This patch changes iproute2 netem_parse_opt to generate a correctly formatted set of nested compat attributes that can be parsed after the introduction of patch 1.
The way I see it we have 3 possible solutions to all of this. First there is my proposed solution in patches 3 & 4. The second option would be to revert patch 1 and apply patch 3 which would maintain the kernel ABI and not require any patching to iproute2. The third option I see would be to go back and patch include/net/netlink.h in the kernel and lib/libnetlink.c in iproute2 so that they all support the new format which is parsed by nla_parse_nested_compat.
Thanks,
Alex
-----Original Message-----
From: David Miller [mailto:davem@davemloft.net]
Sent: Friday, August 22, 2008 3:18 AM
To: Kirsher, Jeffrey T
Cc: jeff@garzik.org; netdev@vger.kernel.org; Duyck, Alexander H; tgraf@suug.ch
Subject: Re: [PATCH 2/3] netlink: nal_parse_nested_compat was not parsing nested attributes
From: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
Date: Thu, 21 Aug 2008 17:51:26 -0700
Please at least CC: the author of the patch you are reverting so
that they can provide feedback and have a look at this.
Thomas Graf CC:'d.
> From: Alexander Duyck <alexander.h.duyck@intel.com>
>
> This patch reverts previous commit: b9a2f2e450b0f770bb4347ae8d48eb2dea701e24
> netlink: Fix nla_parse_nested_compat() to call nla_parse() directly
>
> The purpose of nla_parse_nested_compat is to parse attributes which contain a
> struct followed by a container attribute with a stream of nested attributes.
> This patch reverts the previous patch which assumed that there was no container
> attribute due to a malformed string of attributes being generated by
> netem_parse_opts
>
> Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
> Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
> ---
>
> include/net/netlink.h | 11 +++++------
> 1 files changed, 5 insertions(+), 6 deletions(-)
>
> diff --git a/include/net/netlink.h b/include/net/netlink.h
> index 18024b8..b295cd1 100644
> --- a/include/net/netlink.h
> +++ b/include/net/netlink.h
> @@ -770,13 +770,12 @@ static inline int __nla_parse_nested_compat(struct nlattr *tb[], int maxtype,
> const struct nla_policy *policy,
> int len)
> {
> - int nested_len = nla_len(nla) - NLA_ALIGN(len);
> -
> - if (nested_len < 0)
> + if (nla_len(nla) < len)
> return -EINVAL;
> - if (nested_len >= nla_attr_size(0))
> - return nla_parse(tb, maxtype, nla_data(nla) + NLA_ALIGN(len),
> - nested_len, policy);
> + if (nla_len(nla) >= NLA_ALIGN(len) + sizeof(struct nlattr))
> + return nla_parse_nested(tb, maxtype,
> + nla_data(nla) + NLA_ALIGN(len),
> + policy);
> memset(tb, 0, sizeof(struct nlattr *) * (maxtype + 1));
> return 0;
> }
>
* Re: [PATCH 2/3] netlink: nla_parse_nested_compat was not parsing nested attributes
From: Thomas Graf @ 2008-08-27 14:52 UTC (permalink / raw)
To: Duyck, Alexander H
Cc: David Miller, Kirsher, Jeffrey T, jeff@garzik.org,
netdev@vger.kernel.org, shemminger@osdl.org, kaber@trash.net
* Duyck, Alexander H <alexander.h.duyck@intel.com> 2008-08-22 10:40
> I figured I would CC Stephen Hemminger and Patrick McHardy since they also will likely have an interest in this.
>
> Just to put together a quick history on this issue I will list off how things got to where they are:
>
> 1. "[NET_SCHED]: sch_netem: use nla_parse_nested_compat" (http://marc.info/?l=linux-netdev&m=120110639320780&w=2) from Patrick McHardy. After this patch netem started reporting "bytes leftover after parsing attributes" errors as it wasn't actually receiving a nested compat attribute. This patch was added around 2.6.25-rc1.
>
> 2. "[NETLINK]: Fix nla_parse_nested_compat() to call nla_parse() directly" (http://marc.info/?l=linux-netdev&m=121145625328370&w=2) from Thomas Graf. This resolved the errors but in turn any netlink messages that generated correct nested netlink attributes would have those attributes ignored as they would be automatically parsed out as one attribute with a length of all the nested attributes. This patch was added around 2.6.26-rc4.
This patch was necessary as patch 1 broke the kernel ABI. It not only
resulted in leftover warnings, it resulted in the interpretation of
data with a 4-byte offset, which caused a random configuration mess.
"Correct" nested netlink attributes are not supposed to be parsed
using nla_parse_nested_compat(); nla_parse_nested_compat() is
exclusively for attributes which contain a list of nested attributes
without a container attribute.
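To make the disagreement concrete, the two layouts under discussion look
roughly like this (an informal sketch in comment form, not taken from either
patch):

	/*
	 * Layout nla_parse_nested_compat() expects after the disputed fix
	 * (Thomas): the nested attributes follow the struct directly, with
	 * no container attribute:
	 *
	 *   [ TCA_OPTIONS | struct xxx_qopt | nested attr 1 | nested attr 2 | ... ]
	 *
	 * Layout assumed by the revert (and produced by nla_nest_compat_start()):
	 * the nested attributes sit inside an explicit container attribute
	 * that follows the struct:
	 *
	 *   [ TCA_OPTIONS | struct xxx_qopt | container [ attr 1 | attr 2 | ... ] ]
	 */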
> Patches 3 & 4 haven't been applied yet and are under review:
>
> 3. "[PATCH 2/3] netlink: nla_parse_nested_compat was not parsing nested attributes" (http://marc.info/?l=linux-netdev&m=121936623112195&w=2) from me. This reverts patch 2.
>
> 4. "[PATCH] IPROUTE: correct nla nested message generated by netem_parse_opt" (http://marc.info/?l=linux-netdev&m=121936623112195&w=2) from me. This patch changes iproute2 netem_parse_opt to generate a correctly formatted set of nested compat attributes that can be parsed after the introduction of patch 1.
You can't change the ABI! What your patches do is generate new-style nested
attributes and change nla_parse_nested_compat() to follow the same
semantics as nla_parse_nested(). The _compat() is there for a reason.
* RE: [PATCH 2/3] netlink: nla_parse_nested_compat was not parsing nested attributes
From: Duyck, Alexander H @ 2008-08-27 18:09 UTC (permalink / raw)
To: Thomas Graf
Cc: David Miller, Kirsher, Jeffrey T, jeff@garzik.org,
netdev@vger.kernel.org, shemminger@osdl.org, kaber@trash.net
>This patch was necessary as patch 1 broke the kernel ABI. It not only
>resulted in leftover warnings, it resulted in the interpreation of
>data with a 4 bytes offset which caused random configuration mess.
>
>"Correct" nested netlink attributes are not supposed to be parsed
>using nla_parse_nested_compat(), nla_parse_nested_compat() is
>exclusively for attributes which contain a list of nested attributes
>without a container attribute.
>
First of all, the first patch didn't break the kernel ABI; it broke netem. The wrong function call was used and, as a result, you were seeing parsing errors. Instead of either updating tc to generate the correct format to match the call or reverting the previous patch so that it was parsed correctly, the decision was made to fix the call. We had been using the nested compat attributes in the prio qdisc for some time and that patch had no effect on us. It wasn't until you "corrected" the kernel ABI in your patch that things were broken.
If you insist that you are parsing the correct message format then I have another pair of patches I would like to recommend you provide us, since your original patch is incomplete. One to fix nla_nest_compat_start and nla_nest_compat_end so that they can correctly generate messages that can be parsed by the "Correct" nla_parse_nested_compat. The second to fix iproute so that libnetlink.c can generate/read "Correct" compat attributes as well. The only downside will be that iproute will not be compatible with any of the older kernels that are using nested compat attributes, since we will have changed the kernel ABI to match your "Correct" format.
>> Patches 3 & 4 haven't been applied yet and are under review:
>>
>> 3. "[PATCH 2/3] netlink: nla_parse_nested_compat was not parsing nested attributes" (http://marc.info/?l=linux-netdev&m=121936623112195&w=2) from me. This reverts patch 2.
>>
>> 4. "[PATCH] IPROUTE: correct nla nested message generated by netem_parse_opt" (http://marc.info/?l=linux-netdev&m=121936623112195&w=2) from me. This patch changes iproute2 netem_parse_opt to generate a correctly formatted set of nested compat attributes that can be parsed after the introduction of patch 1.
>
> You can't change the ABI! What your patches do is generate new-style nested
> attributes and change nla_parse_nested_compat() to follow the same
> semantics as nla_parse_nested(). The _compat() is there for a reason.
So let me see if I understand this correctly. Reverting your patch which changed the ABI in the first place is me changing it? So what was it you did in the first place to support the custom format generated by netem then?
We should either revert 1 & 2, or add 3 & 4 so that the kernel ABI can read the same messages it generates. If you want to provide your own patches that fix the kernel/iproute as I described above I would be perfectly happy with that too. I honestly don't care how it gets solved as long as this gets fixed.
Alex
* [PATCH 3/3] pkt_sched: restore multiqueue prio scheduler
From: Jeff Kirsher @ 2008-08-22 0:51 UTC (permalink / raw)
To: davem; +Cc: jeff, netdev, Alexander Duyck, Jeff Kirsher
From: Alexander Duyck <alexander.h.duyck@intel.com>
This patch restores the multiqueue prio scheduler which was removed along with
the RR scheduler during the early changes for multiple tx queue support. This
patch fixes the regression which occurred as a result of disabling the
multiqueue qdisc functionality.
Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
include/linux/pkt_sched.h | 9 +++++++
net/sched/sch_prio.c | 57 ++++++++++++++++++++++++++++++++++++---------
2 files changed, 55 insertions(+), 11 deletions(-)
diff --git a/include/linux/pkt_sched.h b/include/linux/pkt_sched.h
index e5de421..6ceef2e 100644
--- a/include/linux/pkt_sched.h
+++ b/include/linux/pkt_sched.h
@@ -123,6 +123,15 @@ struct tc_prio_qopt
__u8 priomap[TC_PRIO_MAX+1]; /* Map: logical priority -> PRIO band */
};
+enum
+{
+ TCA_PRIO_UNSPEC,
+ TCA_PRIO_MQ,
+ __TCA_PRIO_MAX
+};
+
+#define TCA_PRIO_MAX (__TCA_PRIO_MAX - 1)
+
/* TBF section */
struct tc_tbf_qopt
diff --git a/net/sched/sch_prio.c b/net/sched/sch_prio.c
index a6697c6..ef3e978 100644
--- a/net/sched/sch_prio.c
+++ b/net/sched/sch_prio.c
@@ -27,6 +27,7 @@ struct prio_sched_data
struct tcf_proto *filter_list;
u8 prio2band[TC_PRIO_MAX+1];
struct Qdisc *queues[TCQ_PRIO_BANDS];
+ int mq;
};
@@ -53,14 +54,17 @@ prio_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr)
if (!q->filter_list || err < 0) {
if (TC_H_MAJ(band))
band = 0;
- return q->queues[q->prio2band[band&TC_PRIO_MAX]];
+ band = q->prio2band[band&TC_PRIO_MAX];
+ goto out;
}
band = res.classid;
}
band = TC_H_MIN(band) - 1;
if (band >= q->bands)
- return q->queues[q->prio2band[0]];
-
+ band = q->prio2band[0];
+out:
+ if (q->mq)
+ skb_set_queue_mapping(skb, band);
return q->queues[band];
}
@@ -127,11 +131,18 @@ static struct sk_buff *prio_dequeue(struct Qdisc* sch)
int prio;
for (prio = 0; prio < q->bands; prio++) {
- struct Qdisc *qdisc = q->queues[prio];
- struct sk_buff *skb = qdisc->dequeue(qdisc);
- if (skb) {
- sch->q.qlen--;
- return skb;
+ /* Check if target subqueue is available before
+ * pulling an skb. This way we avoid excessive requeues
+ * for slower queues.
+ */
+ if (!q->mq ||
+ !__netif_subqueue_stopped(qdisc_dev(sch), prio)) {
+ struct Qdisc *qdisc = q->queues[prio];
+ struct sk_buff *skb = qdisc->dequeue(qdisc);
+ if (skb) {
+ sch->q.qlen--;
+ return skb;
+ }
}
}
return NULL;
@@ -182,11 +193,30 @@ static int prio_tune(struct Qdisc *sch, struct nlattr *opt)
{
struct prio_sched_data *q = qdisc_priv(sch);
struct tc_prio_qopt *qopt;
+ struct nlattr *tb[TCA_PRIO_MAX + 1];
+ int err;
+ int mq;
int i;
- if (nla_len(opt) < sizeof(*qopt))
- return -EINVAL;
- qopt = nla_data(opt);
+ err = nla_parse_nested_compat(tb, TCA_PRIO_MAX, opt, NULL, qopt,
+ sizeof(*qopt));
+ if (err < 0)
+ return err;
+ /* If we're multiqueue, make sure the number of bands equals the
+ * number of transmit queues for the device. If bands requested is 0 then
+ * set the bands to match dev->real_num_tx_queues. This qdisc can
+ * only be added as a root qdisc since it must interact with the
+ * underlying device.
+ */
+ mq = nla_get_flag(tb[TCA_PRIO_MQ]);
+ if (mq) {
+ if (sch->parent != TC_H_ROOT)
+ return -EINVAL;
+ if (qopt->bands == 0)
+ qopt->bands = qdisc_dev(sch)->real_num_tx_queues;
+ else if (qopt->bands != qdisc_dev(sch)->real_num_tx_queues)
+ return -EINVAL;
+ }
if (qopt->bands > TCQ_PRIO_BANDS || qopt->bands < 2)
return -EINVAL;
@@ -197,6 +227,7 @@ static int prio_tune(struct Qdisc *sch, struct nlattr *opt)
}
sch_tree_lock(sch);
+ q->mq = mq;
q->bands = qopt->bands;
memcpy(q->prio2band, qopt->priomap, TC_PRIO_MAX+1);
@@ -263,6 +294,10 @@ static int prio_dump(struct Qdisc *sch, struct sk_buff *skb)
nest = nla_nest_compat_start(skb, TCA_OPTIONS, sizeof(opt), &opt);
if (nest == NULL)
goto nla_put_failure;
+ if (q->mq) {
+ if (nla_put_flag(skb, TCA_PRIO_MQ) < 0)
+ goto nla_put_failure;
+ }
nla_nest_compat_end(skb, nest);
return skb->len;
* Re: [PATCH 3/3] pkt_sched: restore multiqueue prio scheduler
From: David Miller @ 2008-08-22 10:16 UTC (permalink / raw)
To: jeffrey.t.kirsher; +Cc: jeff, netdev, alexander.h.duyck
From: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
Date: Thu, 21 Aug 2008 17:51:29 -0700
> From: Alexander Duyck <alexander.h.duyck@intel.com>
>
> This patch restores the multiqueue prio scheduler which was removed along with
> the RR scheduler during the early changes for multiple tx queue support. This
> patch fixes the regression which occured as a result disabling the multiqueue
> qdisc functionality.
>
> Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
> Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
So how in the world would you use this?
If it's just to tag traffic into different TX queues by priority,
that's not very wise nor desirable. What's the point?
The TX queues are useful for multiplexing traffic and separating
the locking and cpu overhead across execution entities in the
system. They can also be useful for virtualization, but that's
not relevant in this discussion.
The TX queues, on the other hand, are not useful for exposing the
round-robin or whatever algorithm some cards happen to use to
enforce fairness amongst the TX queues. That's an implementation
detail.
The truth is, the only reason the RR prio scheduler got added was
because Jamal and myself didn't understand very well how to use these
multiqueue cards, or at least I didn't understand it.
And therefore we, at the time, recommended to implement the RR prio
thing even though now I realize it's totally the wrong thing to do
for TX multiqueue stuff.
I removed this code very much intentionally, and it's only going to go
back if I hear a very good argument for doing so. And "taking it away
is a regression" is not what I'm looking for, give me something better
than that :)
* Re: [PATCH 3/3] pkt_sched: restore multiqueue prio scheduler
From: jamal @ 2008-08-22 14:30 UTC (permalink / raw)
To: David Miller; +Cc: jeffrey.t.kirsher, jeff, netdev, alexander.h.duyck
On Fri, 2008-22-08 at 03:16 -0700, David Miller wrote:
> If it's just to tag traffic into different TX queues by priority,
> that's not very wise nor desirable. What's the point?
>
> The TX queues are useful for multiplexing traffic and seperating
> the locking and cpu overhead across execution entities in the
> system. They can also be useful for virtualization, but that's
> not relevant in this discussion.
>
> The TX queues, on the other hand, are not useful for exposing the
> round-robin or whatever algorithm that some cards just so happen to
> enforce fairness amongst the TX queues. That's an implementation
> detail.
>
> The truth is, the only reason the RR prio scheduler got added was
> because Jamal and myself didn't understand very well how to use these
> multiqueue cards, or at least I didn't understand it.
>
For the record, I was against the approach taken, not the end goal. IIRC,
I was slapped around with a big fish at the time and so I got out of the
way. I still don't like it;->
There are two issues at stake:
1) egress multiq support and the desire to have concurrency based on
however many cpus and hardware queues exist on the system.
2) scheduling of such hardware queues being executed by the hardware
(and not by software).
Dave's goal: #1; run faster than Usain Bolt.
What we were solving at the time: #2. My view was to solve it with
minimal changes.
#1 and #2 are orthogonal. Yes, there is religion: Dave, yours is #1.
Intel's is #2; and there are a lot of people in Intel's camp because
they bill their customers based on QoS of resources. The wire being one
such resource.
Example: if you were to use this stuff for virtualization and gave one
customer a cpu and a hardware queue, scheduling is still important. Some
customers pay less (not everyone is Steve Wozniak with his little posse
and can jump queues).
Therefore your statement that these schemes exist to "enforce fairness
amongst the TX queues" needs to be qualified mon ami;-> The end parts of
Animal Farm come to mind: Some animals have more rights than others;->
[Let's say we forget for a minute about multi-egress-queue nics; we still have
other hardware devices (like hardware L2/L3 switch chips) that do both
multiq and funky prioritization that need to work in the same scheme]
Back to the subject:
I think if one was to use a "qdisc-pass-through" with what you
have implemented, there's opportunity to let hardware do its scheduling
and meet the goals of the Intel folks. The filters above just select the
qdisc which is set in hardware.
cheers,
jamal
* Re: [PATCH 3/3] pkt_sched: restore multiqueue prio scheduler
From: Jarek Poplawski @ 2008-08-22 22:19 UTC (permalink / raw)
To: hadi; +Cc: David Miller, jeffrey.t.kirsher, jeff, netdev, alexander.h.duyck
jamal wrote, On 08/22/2008 04:30 PM:
...
> There are two issues at stake:
> 1) egress Multiq support and the desire to have concurency based on
> however many cpus and hardware queues exist on the system.
> 2) scheduling of the such hardware queues being executed by the hardware
> (and not by software).
>
> Daves goal: #1; run faster than Usain Bolt.
Looks fine.
> What we were solving at the time: #2. My view was to solve it with
> minimal changes.
>
> #1 and #2 are orthogonal. Yes, there is religion: Dave yours is #1.
> Intels is #2; And there are a lot of people in intels camp because
> they bill their customers based on qos of resources. The wire being one
> such resource.
If we can guarantee that the current, automatic steering always gives the
best performance, then David seems to be right. But I doubt it, and
that's why I think such a simple, manual control could be useful.
Especially if it doesn't add much overhead.
> Therefore your statement that these schemes exist to "enforce fairness
> amongst the TX queues" needs to be qualified mon ami;-> The end parts of
> Animal Farm come to mind: Some animals have more rights than others;->
Sure, but shouldn't this other kind of fairness be applied on lower
levels?
Cheers,
Jarek P.
* Re: [PATCH 3/3] pkt_sched: restore multiqueue prio scheduler
From: Alexander Duyck @ 2008-08-23 0:01 UTC (permalink / raw)
To: Jarek Poplawski
Cc: hadi, David Miller, jeffrey.t.kirsher, jeff, netdev,
alexander.h.duyck
On Fri, Aug 22, 2008 at 3:19 PM, Jarek Poplawski <jarkao2@gmail.com> wrote:
> jamal wrote, On 08/22/2008 04:30 PM:
> ...
>> There are two issues at stake:
>> 1) egress Multiq support and the desire to have concurency based on
>> however many cpus and hardware queues exist on the system.
>> 2) scheduling of the such hardware queues being executed by the hardware
>> (and not by software).
>>
>> Daves goal: #1; run faster than Usain Bolt.
>
> Looks fine.
>
In the case of this scheduler our main focus is QoS and then
performance. Basically I was trying to achieve the goal of setting up
the queues as needed without forcing a serious change in the current
transmit path or seriously impacting performance. I figure that since
all I am doing is restoring part of what was there for 2.6.26, the
likelihood of this causing a performance regression is small.
>> What we were solving at the time: #2. My view was to solve it with
>> minimal changes.
>>
>> #1 and #2 are orthogonal. Yes, there is religion: Dave yours is #1.
>> Intels is #2; And there are a lot of people in intels camp because
>> they bill their customers based on qos of resources. The wire being one
>> such resource.
>
> If we can guarentee that current, automatic steering gives always the
> best performance than David seems to be right. But I doubt it, and
> that's why I think such a simple, manual control could be useful.
> Especially if it doesn't add much overhead.
I am almost certain that David's approach using the hash will show
better performance than the multiqueue prio qdisc would. The
multiqueue prio qdisc is meant to allow for classification of traffic
into separate traffic classes to support stuff like Enhanced Ethernet
for Data Center (EEDC) / Data Center Bridging (DCB). Basically we
can't use the hash to place things into the correct queue because each
hardware queue has a specific traffic class assigned to it, and the
queue can be stopped via priority flow control packets while the
other queues keep going. The multiqueue prio qdisc will allow all of
the other traffic classes on the other queues to continue flowing
without causing any head-of-line issues simply because one queue is
stopped.
>
>> Therefore your statement that these schemes exist to "enforce fairness
>> amongst the TX queues" needs to be qualified mon ami;-> The end parts of
>> Animal Farm come to mind: Some animals have more rights than others;->
>
> Sure, but shouldn't this other kind of fairness be applied on lower
> levels?
>
> Cheers,
> Jarek P.
This qdisc isn't so much about fairness as just making sure that the
right traffic gets to the right tx queue, and that there are no
head-of-line issues that occur between traffic classes in the process.
One of the things I looked at was slipping an extra pass-thru qdisc
into the path prior to, or as part of, dev->select_queue. It would
have required some serious changes to the transmit path in addition to
application changes to iproute2 in order to support adding rules on
yet another qdisc layer in the transmit path. With all the changes
that would have been required the risk was high, so I defaulted to what
was already there prior to 2.6.27. The multiqueue prio qdisc had been
in the kernel since 2.6.23 and already has an existing interface for
enabling it in iproute2, so the transition back to it should be fairly
seamless and the risk should be low.
Thanks,
Alex
* Re: [PATCH 3/3] pkt_sched: restore multiqueue prio scheduler
From: David Miller @ 2008-08-23 0:40 UTC (permalink / raw)
To: alexander.duyck
Cc: jarkao2, hadi, jeffrey.t.kirsher, jeff, netdev, alexander.h.duyck
From: "Alexander Duyck" <alexander.duyck@gmail.com>
Date: Fri, 22 Aug 2008 17:01:50 -0700
> I am almost certain that David's approach using the hash will show
> better performance than the multiqueue prio qdisc would. The
> multiqueue prio qdisc is meant to allow for classification of traffic
> into separate traffic classes to support stuff like Enhanced Ethernet
> for Data Center (EEDC) / Data Center Bridging (DCB).
The only sensible way to implement this is to use the existing
classifier technology in the packet scheduler to choose traffic,
and then write a TC action or ematch module that sets the
TX queue of the SKB based upon the classification result.
The thing that was there before was very narrow in scope and
we'll just have to keep adding more special purpose modules
as more such uses come up.
With the classifiers, it's generic and any scheme can be implemented
by simply issuing different 'tc' commands.
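A rough sketch of the kind of action being suggested here: a hypothetical
"setqueue" act() hook that rewrites skb->queue_mapping from a configured
value. The names and the private structure are invented for illustration,
and the usual tc_action registration, init and dump boilerplate is omitted.

	#include <linux/skbuff.h>
	#include <linux/spinlock.h>
	#include <net/act_api.h>
	#include <net/pkt_cls.h>

	struct setqueue_priv {			/* invented private state */
		spinlock_t lock;
		u16 queue;			/* TX queue to steer matches to */
		int action;			/* e.g. TC_ACT_PIPE */
	};

	static int setqueue_act(struct sk_buff *skb, struct tc_action *a,
				struct tcf_result *res)
	{
		struct setqueue_priv *p = a->priv;

		spin_lock(&p->lock);
		/* override whatever TX queue was chosen by default */
		skb_set_queue_mapping(skb, p->queue);
		spin_unlock(&p->lock);

		return p->action;
	}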
* Re: [PATCH 3/3] pkt_sched: restore multiqueue prio scheduler
From: Alexander Duyck @ 2008-08-23 1:37 UTC (permalink / raw)
To: David Miller
Cc: jarkao2, hadi, jeffrey.t.kirsher, jeff, netdev, alexander.h.duyck
On Fri, Aug 22, 2008 at 5:40 PM, David Miller <davem@davemloft.net> wrote:
> From: "Alexander Duyck" <alexander.duyck@gmail.com>
> Date: Fri, 22 Aug 2008 17:01:50 -0700
>
>> I am almost certain that David's approach using the hash will show
>> better performance than the multiqueue prio qdisc would. The
>> multiqueue prio qdisc is meant to allow for classification of traffic
>> into separate traffic classes to support stuff like Enhanced Ethernet
>> for Data Center (EEDC) / Data Center Bridging (DCB).
>
> The only sensible way to implement this is to use the existing
> classifier technology in the packet scheduler to choose traffic,
> and then writing a TC actions or ematch module that sets the
> TX queue of the SKB based upon the classification result.
>
> The thing that was there before was very narrow in scope and
> we'll just have to keep adding more special purpose modules
> as more such uses come up.
>
> With the classifiers, it's generic and any scheme can be implemented
> by simply issuing different 'tc' commands.
>
I thought about doing just that but then I realized that there would
be a number of issues.
First, if I just set skb->queue_mapping for the packet without
moving it to a qdisc dedicated to that tx queue, I run into
head-of-line issues since multiple qdiscs will stop if holding packets
for a single tx queue that is full. That is overcome in this patch
by the fact that each qdisc has a specific fifo per tx queue.
That issue led me to the thought of creating a redirect action that
would take the packet from one qdisc to the correct qdisc for the
transmit queue. That setup has two issues. First, all traffic would
need to go to one queue by default to avoid a possible deadlock
condition in the event that two queues try to enqueue packets on one
another at the same time. That, combined with the fact that one packet
would then have to grab two qdisc locks to be enqueued, seems to be
rather expensive performance-wise. Second, the action of
redirecting the packet once already in a qdisc requires cloning the
skb, which would be a serious performance drop over the previous prio
qdisc implementation. I don't incur these performance penalties with
the mq prio qdisc, and the locking is clean since the transition occurs
after dequeue but before grabbing the transmit queue lock.
There were only two workable ways I could see of doing this that
didn't make a total mess of things. The first is what I implemented
in this patch. The second would have been to add a pass-thru qdisc
prior to or as part of select_queue that had the tc action that set
the skb->queue_mapping. The only reason I didn't really feel
comfortable implementing the extra qdisc is that I felt it would
have added extra unnecessary overhead and required more changes to the
tx path.
Thanks,
Alex
* Re: [PATCH 3/3] pkt_sched: restore multiqueue prio scheduler
From: Herbert Xu @ 2008-08-23 5:12 UTC (permalink / raw)
To: Alexander Duyck
Cc: davem, jarkao2, hadi, jeffrey.t.kirsher, jeff, netdev,
alexander.h.duyck
Alexander Duyck <alexander.duyck@gmail.com> wrote:
>
> That issue led me to the thought of creating a redirect action that
> would take the packet from one qdisc to the correct qdisc for the
> transmit queue. That setup has two issues. First, all traffic would
> need to go to one queue by default to avoid a possible deadlock
> condition in the event that two queues try to enqueue packets on one
> another at the same time. That combined with the fact that one packet
That looks like a problem but it isn't. When you're overriding
the default queue selection you're breaking CPU affinity. As such
cache-line bouncing is clearly not a concern or you wouldn't be
doing this (that is, you're doing this for QOS rather than CPU
scalability). So having everything go through a single qdisc
shouldn't be a problem.
Cheers,
--
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
* Re: [PATCH 3/3] pkt_sched: restore multiqueue prio scheduler
From: Alexander Duyck @ 2008-08-23 6:35 UTC (permalink / raw)
To: Herbert Xu
Cc: davem, jarkao2, hadi, jeffrey.t.kirsher, jeff, netdev,
alexander.h.duyck
On Fri, Aug 22, 2008 at 10:12 PM, Herbert Xu
<herbert@gondor.apana.org.au> wrote:
> Alexander Duyck <alexander.duyck@gmail.com> wrote:
>>
>> That issue led me to the thought of creating a redirect action that
>> would take the packet from one qdisc to the correct qdisc for the
>> transmit queue. That setup has two issues. First, all traffic would
>> need to go to one queue by default to avoid a possible deadlock
>> condition in the event that two queues try to enqueue packets on one
>> another at the same time. That combined with the fact that one packet
>
> That looks like a problem but it isn't. When you're overriding
> the default queue selection you're breaking CPU affinity. As such
> cache-line bouncing is clearly not a concern or you wouldn't be
> doing this (that is, you're doing this for QOS rather than CPU
> scalability). So having everything go through a single qdisc
> shouldn't be a problem.
>
> Cheers,
It isn't the performance aspect of running everything through one
queue that I am concerned about, since that is how it was working
before. My concern is that the action could cause a deadlock if
simple_tx_hash places traffic on 2 queues and then the tc rule tries
to swap the traffic between those queues while they are each holding
their own queue locks. The tc rule would have to receive all traffic
onto a single qdisc to prevent this, which would require setting
select_queue for the netdev to return a fixed queue. The multiqueue
prio qdisc is flexible enough to allow all traffic to be directed to
one queue or be received on multiple queues without causing the system
to lock up, which means that implementing select_queue on the device
wouldn't be mandatory.
Thanks,
Alex
* Re: [PATCH 3/3] pkt_sched: restore multiqueue prio scheduler
From: Herbert Xu @ 2008-08-23 7:07 UTC (permalink / raw)
To: Alexander Duyck
Cc: herbert, davem, jarkao2, hadi, jeffrey.t.kirsher, jeff, netdev,
alexander.h.duyck
Alexander Duyck <alexander.duyck@gmail.com> wrote:
>
> It isn't the performance aspect of running everything through one
> queue that I am concerned about since that is how it was working
> before. My concern is that the action could cause a dead lock if
> simple_tx_hash places traffic on 2 queues and then the tc rule tried
> to swap the traffic between those queues while they are each holding
> their own queue locks. The tc rule would have to receive all traffic
> onto a single qdisc to prevent this, which would require setting
> select_queue for the netdev to return a fixed queue. The multiqueue
> prio qdisc is flexible enough to allow all traffic to be directed to
> one queue or be received on multiple queues without causing the system
> to lock up which means that implementing select_queue on the device
> wouldn't be mandatory.
As it is if you create a custom qdisc then there will only be
a single qdisc no matter how many hardware queues you have. So
you can safely dequeue from the qdisc into any hardware queue you
want.
Cheers,
--
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
* Re: [PATCH 3/3] pkt_sched: restore multiqueue prio scheduler
From: David Miller @ 2008-08-23 8:23 UTC (permalink / raw)
To: alexander.duyck
Cc: herbert, jarkao2, hadi, jeffrey.t.kirsher, jeff, netdev,
alexander.h.duyck
From: "Alexander Duyck" <alexander.duyck@gmail.com>
Date: Fri, 22 Aug 2008 23:35:24 -0700
> It isn't the performance aspect of running everything through one
> queue that I am concerned about since that is how it was working
> before. My concern is that the action could cause a dead lock if
> simple_tx_hash places traffic on 2 queues and then the tc rule tried
> to swap the traffic between those queues while they are each holding
> their own queue locks.
You really should look at how the current code works before
coming to conclusions like this ;-)
When you configure any qdisc onto a device using tc, you get a single
qdisc which is shared by all of the TXQ objects of the device.
The per-TXQ qdisc thing only happens for the default qdisc.
* Re: [PATCH 3/3] pkt_sched: restore multiqueue prio scheduler
From: David Miller @ 2008-08-23 8:15 UTC (permalink / raw)
To: alexander.duyck
Cc: jarkao2, hadi, jeffrey.t.kirsher, jeff, netdev, alexander.h.duyck
From: "Alexander Duyck" <alexander.duyck@gmail.com>
Date: Fri, 22 Aug 2008 18:37:51 -0700
> I thought about doing just that but then I realized that there would
> be a number of issues.
>
> First if I just set the skb->queue_mapping for the packet without
> moving it to a qdisc dedicated to that tx queue I run into
> head-of-line issues since multiple qdiscs will stop if holding packets
> for a single tx queue that is full. That is over come in this patch
> by the fact that each qdisc has a specific fifo per tx queue.
Such fifos could be configured using 'tc' commands just
as easily.
Look, I know what you're trying to do is possible with at
best minor tweaks. So don't press your luck listing how
impossible it is :-)
* Re: [PATCH 3/3] pkt_sched: restore multiqueue prio scheduler
From: David Miller @ 2008-08-23 0:33 UTC (permalink / raw)
To: jarkao2; +Cc: hadi, jeffrey.t.kirsher, jeff, netdev, alexander.h.duyck
From: Jarek Poplawski <jarkao2@gmail.com>
Date: Sat, 23 Aug 2008 00:19:13 +0200
> jamal wrote, On 08/22/2008 04:30 PM:
> ...
> > What we were solving at the time: #2. My view was to solve it with
> > minimal changes.
> >
> > #1 and #2 are orthogonal. Yes, there is religion: Dave yours is #1.
> > Intels is #2; And there are a lot of people in intels camp because
> > they bill their customers based on qos of resources. The wire being one
> > such resource.
>
> If we can guarentee that current, automatic steering gives always the
> best performance than David seems to be right. But I doubt it, and
> that's why I think such a simple, manual control could be useful.
> Especially if it doesn't add much overhead.
If we want queue selection in the packet scheduler, let's implement
it, but let's do so properly using classifiers and TC actions or
ematch modules that can select the queue.
Then people can implement whatever policy they want, in completely
generic ways, and the kernel simply doesn't care.
The way this code worked was completely special purpose and ignored
the host of facilities and infrastructure we have for doing things
like this.
* Re: [PATCH 3/3] pkt_sched: restore multiqueue prio scheduler
From: Jarek Poplawski @ 2008-08-23 8:47 UTC (permalink / raw)
To: David Miller; +Cc: hadi, jeffrey.t.kirsher, jeff, netdev, alexander.h.duyck
On Fri, Aug 22, 2008 at 05:33:48PM -0700, David Miller wrote:
...
> If we want queue selection in the packet scheduler, let's implement
> it, but let's do so properly using classifiers and TC actions or a
> ematch modules that can select the queue.
>
> Then people can implement whatever policy they want, in completely
> generic ways, and the kernel simply doesn't care.
>
> The way this code worked was completely special purpose and ignored
> the host of facilities and infrastructure we have for doing things
> like this.
Alas, I can't get your point here. prio is sched + classifier 2 in 1,
and if a small change in this is enough for somebody who really uses
this, and there seems to be no one interested in doing this better or
harmed by this, why bother with actions or other classifiers? Maybe
this prio method is less generic, but it's quite simple and there is
some infrastructure around this.
Jarek P.
* Re: [PATCH 3/3] pkt_sched: restore multiqueue prio scheduler
From: Alexander Duyck @ 2008-08-23 16:31 UTC (permalink / raw)
To: Jarek Poplawski
Cc: David Miller, hadi, jeffrey.t.kirsher, jeff, netdev,
alexander.h.duyck
On Sat, Aug 23, 2008 at 1:47 AM, Jarek Poplawski <jarkao2@gmail.com> wrote:
> On Fri, Aug 22, 2008 at 05:33:48PM -0700, David Miller wrote:
> ...
>> If we want queue selection in the packet scheduler, let's implement
>> it, but let's do so properly using classifiers and TC actions or a
>> ematch modules that can select the queue.
>>
>> Then people can implement whatever policy they want, in completely
>> generic ways, and the kernel simply doesn't care.
>>
>> The way this code worked was completely special purpose and ignored
>> the host of facilities and infrastructure we have for doing things
>> like this.
>
> Alas I can't get your point here. prio is sched + classifier 2 in 1,
> and if a small change in this is enough for somebody who really uses
> this, and there seem to be noone interested in doing this better or
> harmed with this, why bother with actions or other classifiers? Maybe
> this prio method is less generic, but it's quite simple and there is
> some infrastructure around this.
>
> Jarek P.
Actually, in this new multiple-tx-queue kernel this qdisc could serve a
very specific but needed purpose which just became apparent to me.
This qdisc resolves one very specific issue: head-of-line blocking if
one of the hardware queues is full. What if I reversed things a bit
in prio_classify so that skb->queue_mapping determined the band
instead of the other way around when the multiqueue option is
enabled? I would think that in this configuration the qdisc
would prove to be pretty useful for purposes other than QoS since it
would allow a classful qdisc per transmit queue instead of per device.
I could then create a select_queue for the device that returns 0 when
in DCB/EEDC mode for our drivers, implement the tc action to set the
queue mapping, and have essentially the same result as I had before.
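Roughly, the multiqueue branch of the classify path could then look something
like this (just a sketch of the idea, with an invented helper name; it is not
code from the posted patch):

	/* Sketch: with the mq flag set, let the skb's existing queue_mapping
	 * pick the band rather than deriving the mapping from the band. */
	static struct Qdisc *prio_mq_classify(struct sk_buff *skb,
					      struct Qdisc *sch)
	{
		struct prio_sched_data *q = qdisc_priv(sch);
		u16 band = skb_get_queue_mapping(skb);

		if (band >= q->bands)
			band = q->prio2band[0];	/* fall back to default band */

		return q->queues[band];
	}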
Thanks,
Alex
* Re: [PATCH 3/3] pkt_sched: restore multiqueue prio scheduler
From: jamal @ 2008-08-23 16:49 UTC (permalink / raw)
To: Alexander Duyck
Cc: Jarek Poplawski, David Miller, jeffrey.t.kirsher, jeff, netdev,
alexander.h.duyck
On Sat, 2008-23-08 at 09:31 -0700, Alexander Duyck wrote:
> Actually in this new multiple tx queue kernel this qdisc could serve a
> very specific but needed purpose which just became apparent to me.
> This qdisc resolves one very specific issue, head-of-line blocking if
> one of the hardware queues is full. What if I reversed things a bit
> in prio_classify so that skb->queue_mapping determined the band
> instead of the other way around in the case of the multiqueue option
> being enabled? I would think that in this configuration the qdisc
> would prove to be pretty useful for purposes other than QOS since it
> would allow a classful qdisc per transmit queue instead of per device.
>
I think that's what Dave is saying, not in so many words.
prio_classify calls classifiers.
Don't modify the qdisc; rather, do a classification which, when matched,
sets skb->queue_mapping to whatever value your policy likes it to be.
You don't need a new classifier; u32 would do, for example, or ematch if
you want to do some funky things. You will need a new action which sets
the skb->queue_mapping.
> I could then create a select_queue for the device that returns 0 when
> in DCB/EEDC mode for our drivers, implement the tc action to set the
> queue mapping, and have essentially the same result as I had before.
>
Put overflow queues in your driver. With the new multiq approach the
controls are per-queue, not per-driver.
cheers,
jamal
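
As a rough illustration of the kind of action described above, here is a
minimal sketch of an ->act() hook that rewrites skb->queue_mapping on a
classifier match. The struct and function names (tcf_setqueue and friends)
are hypothetical, not anything from this thread, and statistics and locking
are omitted; the callback signature follows the tc_action API of this era.

#include <linux/skbuff.h>
#include <linux/netdevice.h>
#include <linux/pkt_cls.h>
#include <net/act_api.h>

/* Hypothetical private data for a "set queue mapping" action. */
struct tcf_setqueue {
	struct tcf_common common;
	u16 queue_mapping;
};

/* When the classifier (u32, ematch, ...) matches, rewrite the queue
 * mapping to the configured value and let the packet keep going. */
static int tcf_setqueue_act(struct sk_buff *skb, struct tc_action *a,
			    struct tcf_result *res)
{
	struct tcf_setqueue *p = (struct tcf_setqueue *)a->priv;

	if (skb->dev &&
	    p->queue_mapping < skb->dev->real_num_tx_queues)
		skb_set_queue_mapping(skb, p->queue_mapping);

	return TC_ACT_PIPE;	/* continue with further actions/filters */
}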
* Re: [PATCH 3/3] pkt_sched: restore multiqueue prio scheduler
2008-08-23 16:49 ` jamal
@ 2008-08-23 19:09 ` Alexander Duyck
2008-08-24 7:53 ` Jarek Poplawski
0 siblings, 1 reply; 41+ messages in thread
From: Alexander Duyck @ 2008-08-23 19:09 UTC (permalink / raw)
To: hadi
Cc: Jarek Poplawski, David Miller, jeffrey.t.kirsher, jeff, netdev,
alexander.h.duyck
On Sat, Aug 23, 2008 at 9:49 AM, jamal <hadi@cyberus.ca> wrote:
> On Sat, 2008-23-08 at 09:31 -0700, Alexander Duyck wrote:
>
>> Actually in this new multiple tx queue kernel this qdisc could serve a
>> very specific but needed purpose which just became apparent to me.
>> This qdisc resolves one very specific issue, head-of-line blocking if
>> one of the hardware queues is full. What if I reversed things a bit
>> in prio_classify so that skb->queue_mapping determined the band
>> instead of the other way around in the case of the multiqueue option
>> being enabled? I would think that in this configuration the qdisc
>> would prove to be pretty useful for purposes other than QOS since it
>> would allow a classful qdisc per transmit queue instead of per device.
>>
>
> i think thats whats Dave is saying not in so many words.
>
> prio_classify calls classifiers.
> dont modify the qdisc; rather do a classification which when matched
> sets skb->queue_mapping to a value your policy likes it to be.
> you dont need a new classifier, u32 would do for example. or ematch if
> you want to do some funky things. you will need a new action which sets
> the skb->queue_mapping.
We have to modify the existing prio qdisc or create a new qdisc in
order to resolve the head-of-line blocking once any of the queues has
stopped. The new one wouldn't make any changes to the priority of the
packet but instead would only switch it to a separate qdisc based on
the queue_mapping of the skb.
I understand that all we would need is a new action to handle the
queue mapping, but in order for that to be an acceptable solution I
still need the qdisc changes.
>> I could then create a select_queue for the device that returns 0 when
>> in DCB/EEDC mode for our drivers, implement the tc action to set the
>> queue mapping, and have essentially the same result as I had before.
>>
> put overflow queues in your driver. With the new multiq approach the
> controls are per-queue not per driver.
>
I think you missed the email from Dave earlier; either that, or I am
not understanding something, and if so feel free to explain. My
understanding is that when we assign a new qdisc via tc, the
qdisc/transmit queues end up with a configuration kind of like what
Dave has on slide 7 of
http://vger.kernel.org/~davem/davem_seattle08.pdf. Adding more
queues doesn't resolve anything, since I am still going through the
same qdisc regardless of how many queues I have. The changes to the
dequeue portion of this qdisc are the most important part of this
change, since I need them to keep the qdisc from jamming up in a
dequeue/requeue loop if the packet at the head of the qdisc needs to
go to a stopped hardware transmit queue.
> cheers,
> jamal
>
Thanks,
Alex
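
To make the dequeue/requeue loop described above concrete, here is a
deliberately simplified sketch of the pattern. This is not the actual
net/sched/sch_generic.c code; locking, GSO handling and return-code details
are omitted, and the function name is hypothetical.

#include <linux/skbuff.h>
#include <linux/netdevice.h>
#include <net/sch_generic.h>

/* The qdisc hands back whatever packet is at its head; if that packet's
 * hardware TX queue is stopped the packet is simply requeued, so the same
 * skb is retried over and over while other, ready TX queues sit idle. */
static int qdisc_restart_sketch(struct Qdisc *q, struct net_device *dev)
{
	struct sk_buff *skb = q->dequeue(q);
	struct netdev_queue *txq;

	if (!skb)
		return 0;

	txq = netdev_get_tx_queue(dev, skb_get_queue_mapping(skb));
	if (netif_tx_queue_stopped(txq))
		/* head-of-line blocking: put it back and try again later */
		return q->ops->requeue(skb, q);

	return dev->hard_start_xmit(skb, dev);
}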
* Re: [PATCH 3/3] pkt_sched: restore multiqueue prio scheduler
2008-08-23 19:09 ` Alexander Duyck
@ 2008-08-24 7:53 ` Jarek Poplawski
2008-08-24 13:39 ` jamal
0 siblings, 1 reply; 41+ messages in thread
From: Jarek Poplawski @ 2008-08-24 7:53 UTC (permalink / raw)
To: Alexander Duyck
Cc: hadi, David Miller, jeffrey.t.kirsher, jeff, netdev,
alexander.h.duyck
On Sat, Aug 23, 2008 at 12:09:42PM -0700, Alexander Duyck wrote:
> On Sat, Aug 23, 2008 at 9:49 AM, jamal <hadi@cyberus.ca> wrote:
> > On Sat, 2008-23-08 at 09:31 -0700, Alexander Duyck wrote:
> >
> >> Actually in this new multiple tx queue kernel this qdisc could serve a
> >> very specific but needed purpose which just became apparent to me.
> >> This qdisc resolves one very specific issue, head-of-line blocking if
> >> one of the hardware queues is full. What if I reversed things a bit
> >> in prio_classify so that skb->queue_mapping determined the band
> >> instead of the other way around in the case of the multiqueue option
> >> being enabled? I would think that in this configuration the qdisc
> >> would prove to be pretty useful for purposes other than QOS since it
> >> would allow a classful qdisc per transmit queue instead of per device.
I'm not sure whether I'm missing your point, but if you've decided you
can do this with a new action, there is probably no reason to change or
revert anything in prio_classify - you just pass the band/class number
as the result.
...
> >> I could then create a select_queue for the device that returns 0 when
> >> in DCB/EEDC mode for our drivers, implement the tc action to set the
> >> queue mapping, and have essentially the same result as I had before.
> >>
> > put overflow queues in your driver. With the new multiq approach the
> > controls are per-queue not per driver.
> >
>
> I think you missed the email from Dave earlier, either that or I am
> not understanding something and if I am feel free to explain. My
> understanding is that when we assign a new qdisc via tc the
> qdisc/transmit queues end up with a configuration kind of like what
> Dave has on slide 7 at
> (http://vger.kernel.org/~davem/davem_seattle08.pdf). Adding more
> queues doesn't resolve anything since I am still going through the
> same qdisc regardless of how many queues I have. The changes to the
> dequeue portion of this qdisc are the most important part for this
> change since I need them to keep the qdisc from jamming up in a
> dequeue/requeue loop if the packet at the head of the qdisc needs to
> go to a stopped hardware transmit queue.
I guess Jamal meant additional, backup queues, while you're thinking
about remapping to the basic, working queues. The question is whether
it wouldn't be enough to check and do this during classification; then
changes to prio's ->dequeue() would also be unnecessary. On the other
hand, if you proved in some tests that it's worth doing in dequeue, I
think David is a reasonable guy and should allow such improvements. BTW,
if this jamming with requeuing is really a problem, it should probably
also be considered in the generic code, e.g. in simple_tx_hash(), by
using some mask.
Thanks,
Jarek P.
* Re: [PATCH 3/3] pkt_sched: restore multiqueue prio scheduler
2008-08-24 7:53 ` Jarek Poplawski
@ 2008-08-24 13:39 ` jamal
2008-08-24 19:19 ` Jarek Poplawski
0 siblings, 1 reply; 41+ messages in thread
From: jamal @ 2008-08-24 13:39 UTC (permalink / raw)
To: Jarek Poplawski
Cc: Alexander Duyck, David Miller, jeffrey.t.kirsher, jeff, netdev,
alexander.h.duyck
On Sun, 2008-24-08 at 09:53 +0200, Jarek Poplawski wrote:
> On Sat, Aug 23, 2008 at 12:09:42PM -0700, Alexander Duyck wrote:
> > I think you missed the email from Dave earlier, either that or I am
> > not understanding something and if I am feel free to explain. My
> > understanding is that when we assign a new qdisc via tc the
> > qdisc/transmit queues end up with a configuration kind of like what
> > Dave has on slide 7 at
> > (http://vger.kernel.org/~davem/davem_seattle08.pdf). Adding more
> > queues doesn't resolve anything since I am still going through the
> > same qdisc regardless of how many queues I have. The changes to the
> > dequeue portion of this qdisc are the most important part for this
> > change since I need them to keep the qdisc from jamming up in a
> > dequeue/requeue loop if the packet at the head of the qdisc needs to
> > go to a stopped hardware transmit queue.
>
> I guess, Jamal meant additional, backup queues,
Right; in the driver - but even that may be unnecessary.
With the current controls being per qdisc instead of per netdevice,
the head-of-line fear is unfounded.
You send, and when the hardware can't keep up, you block just the one
hwqueue. While the hwqueue is blocked, you can accumulate packets in
the prio qdisc (hence my statement that it may not be necessary to
accumulate packets in the driver).
On TX completion, when the hwqueue has more space, you check your backup
queues and send those first before opening up the path from above.
And to your other statement: if you use a basic skb->priority to hwqueue
mapping, then you wouldn't need an extra action.
cheers,
jamal
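
A hedged sketch of what the driver-side backup-queue idea above could look
like at TX completion time. The adapter/ring structure, the backup skb queue,
the descriptor-space check and my_xmit_frame_ring() are all hypothetical names
introduced only for illustration.

#include <linux/skbuff.h>
#include <linux/netdevice.h>

/* Hypothetical per-TX-ring state: a software backup queue that absorbs
 * packets while the hardware queue is full. */
struct my_tx_ring {
	struct sk_buff_head backup_queue;
	unsigned int free_desc;
	u16 queue_index;
};

static void my_xmit_frame_ring(struct sk_buff *skb, struct my_tx_ring *ring);

/* On TX completion, drain the backup queue first; only wake the stack's
 * subqueue once the backlog is gone and there is descriptor space left. */
static void my_tx_completion_drain(struct net_device *netdev,
				   struct my_tx_ring *ring)
{
	struct sk_buff *skb;

	while (ring->free_desc > MAX_SKB_FRAGS &&
	       (skb = __skb_dequeue(&ring->backup_queue)) != NULL)
		my_xmit_frame_ring(skb, ring);	/* hypothetical ring xmit */

	if (skb_queue_empty(&ring->backup_queue) &&
	    ring->free_desc > MAX_SKB_FRAGS &&
	    __netif_subqueue_stopped(netdev, ring->queue_index))
		netif_wake_subqueue(netdev, ring->queue_index);
}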
* Re: [PATCH 3/3] pkt_sched: restore multiqueue prio scheduler
2008-08-24 13:39 ` jamal
@ 2008-08-24 19:19 ` Jarek Poplawski
2008-08-24 19:27 ` Jarek Poplawski
2008-08-25 0:49 ` David Miller
0 siblings, 2 replies; 41+ messages in thread
From: Jarek Poplawski @ 2008-08-24 19:19 UTC (permalink / raw)
To: jamal
Cc: Alexander Duyck, David Miller, jeffrey.t.kirsher, jeff, netdev,
alexander.h.duyck
On Sun, Aug 24, 2008 at 09:39:23AM -0400, jamal wrote:
...
> With current controls being per qdisc instead of per netdevice,
> the hol fear is unfounded.
> You send and when hw cant keep up, you block just the one hwqueue.
> While hwqueue is blocked, you can accumulate packets in the prio qdisc
> (hence my statement it may not be necessary to accumulate packets in
> driver).
Jamal, maybe I'm missing something, but it could work like this only
with the default pfifo_fast qdiscs, which really are per device hwqueue.
Other qdiscs, including prio, are per device, so with prio, if the band
with the highest priority is blocked it would be requeued, blocking the
other bands (hwqueues in Alexander's case).
Cheers,
Jarek P.
* Re: [PATCH 3/3] pkt_sched: restore multiqueue prio scheduler
2008-08-24 19:19 ` Jarek Poplawski
@ 2008-08-24 19:27 ` Jarek Poplawski
2008-08-24 19:59 ` Jarek Poplawski
2008-08-25 0:50 ` David Miller
2008-08-25 0:49 ` David Miller
1 sibling, 2 replies; 41+ messages in thread
From: Jarek Poplawski @ 2008-08-24 19:27 UTC (permalink / raw)
To: jamal
Cc: Alexander Duyck, David Miller, jeffrey.t.kirsher, jeff, netdev,
alexander.h.duyck
On Sun, Aug 24, 2008 at 09:19:05PM +0200, Jarek Poplawski wrote:
...
> Jamal, maybe I miss something, but this could be like this only with
> default pfifo_fast qdiscs, which really are per dev hwqueue. Other
> qdiscs, including prio, are per device, so with prio, if a band with
> the highest priority is blocked it would be requeued blocking other
> bands (hwqueues in Alexander's case).
And, actually, because of this, Alexander is right that prio_dequeue()
can't work like this without any changes.
Jarek P.
* Re: [PATCH 3/3] pkt_sched: restore multiqueue prio scheduler
2008-08-24 19:27 ` Jarek Poplawski
@ 2008-08-24 19:59 ` Jarek Poplawski
2008-08-24 20:18 ` Jarek Poplawski
2008-08-25 0:50 ` David Miller
1 sibling, 1 reply; 41+ messages in thread
From: Jarek Poplawski @ 2008-08-24 19:59 UTC (permalink / raw)
To: jamal
Cc: Alexander Duyck, David Miller, jeffrey.t.kirsher, jeff, netdev,
alexander.h.duyck
On Sun, Aug 24, 2008 at 09:27:34PM +0200, Jarek Poplawski wrote:
> On Sun, Aug 24, 2008 at 09:19:05PM +0200, Jarek Poplawski wrote:
> ...
> > Jamal, maybe I miss something, but this could be like this only with
> > default pfifo_fast qdiscs, which really are per dev hwqueue. Other
> > qdiscs, including prio, are per device, so with prio, if a band with
> > the highest priority is blocked it would be requeued blocking other
> > bands (hwqueues in Alexander's case).
>
> And, actually, because of this, Alexander is right the prio_dequeue()
> can't work like this without any changes.
...And it looks like it's not only about prio with queue remapping.
Similar fixes are probably needed by default for any other qdisc
implementing priorities, like htb, etc.!
Jarek P.
* Re: [PATCH 3/3] pkt_sched: restore multiqueue prio scheduler
2008-08-24 19:59 ` Jarek Poplawski
@ 2008-08-24 20:18 ` Jarek Poplawski
0 siblings, 0 replies; 41+ messages in thread
From: Jarek Poplawski @ 2008-08-24 20:18 UTC (permalink / raw)
To: jamal
Cc: Alexander Duyck, David Miller, jeffrey.t.kirsher, jeff, netdev,
alexander.h.duyck
On Sun, Aug 24, 2008 at 09:59:14PM +0200, Jarek Poplawski wrote:
> On Sun, Aug 24, 2008 at 09:27:34PM +0200, Jarek Poplawski wrote:
> > On Sun, Aug 24, 2008 at 09:19:05PM +0200, Jarek Poplawski wrote:
> > ...
> > > Jamal, maybe I miss something, but this could be like this only with
> > > default pfifo_fast qdiscs, which really are per dev hwqueue. Other
> > > qdiscs, including prio, are per device, so with prio, if a band with
> > > the highest priority is blocked it would be requeued blocking other
> > > bands (hwqueues in Alexander's case).
> >
> > And, actually, because of this, Alexander is right the prio_dequeue()
> > can't work like this without any changes.
>
> ...And it looks like it's not only about prio with queue remapping.
> Similar fixes are probably needed by default to any other qdiscs
> implementing priorities, like htb etc!
...Or even more: simple fifos need this, as well!!
Jarek P.
* Re: [PATCH 3/3] pkt_sched: restore multiqueue prio scheduler
2008-08-24 19:27 ` Jarek Poplawski
2008-08-24 19:59 ` Jarek Poplawski
@ 2008-08-25 0:50 ` David Miller
2008-08-25 3:03 ` Alexander Duyck
1 sibling, 1 reply; 41+ messages in thread
From: David Miller @ 2008-08-25 0:50 UTC (permalink / raw)
To: jarkao2
Cc: hadi, alexander.duyck, jeffrey.t.kirsher, jeff, netdev,
alexander.h.duyck
From: Jarek Poplawski <jarkao2@gmail.com>
Date: Sun, 24 Aug 2008 21:27:34 +0200
> On Sun, Aug 24, 2008 at 09:19:05PM +0200, Jarek Poplawski wrote:
> ...
> > Jamal, maybe I miss something, but this could be like this only with
> > default pfifo_fast qdiscs, which really are per dev hwqueue. Other
> > qdiscs, including prio, are per device, so with prio, if a band with
> > the highest priority is blocked it would be requeued blocking other
> > bands (hwqueues in Alexander's case).
>
> And, actually, because of this, Alexander is right the prio_dequeue()
> can't work like this without any changes.
Yes it can; see my other response. Only the higher priority
queues filling up can block lower priority traffic, and that
is fine.
* Re: [PATCH 3/3] pkt_sched: restore multiqueue prio scheduler
2008-08-25 0:50 ` David Miller
@ 2008-08-25 3:03 ` Alexander Duyck
2008-08-25 6:16 ` Jarek Poplawski
0 siblings, 1 reply; 41+ messages in thread
From: Alexander Duyck @ 2008-08-25 3:03 UTC (permalink / raw)
To: David Miller
Cc: jarkao2, hadi, jeffrey.t.kirsher, jeff, netdev, alexander.h.duyck
On Sun, Aug 24, 2008 at 5:50 PM, David Miller <davem@davemloft.net> wrote:
> From: Jarek Poplawski <jarkao2@gmail.com>
> Date: Sun, 24 Aug 2008 21:27:34 +0200
>
>> On Sun, Aug 24, 2008 at 09:19:05PM +0200, Jarek Poplawski wrote:
>> ...
>> > Jamal, maybe I miss something, but this could be like this only with
>> > default pfifo_fast qdiscs, which really are per dev hwqueue. Other
>> > qdiscs, including prio, are per device, so with prio, if a band with
>> > the highest priority is blocked it would be requeued blocking other
>> > bands (hwqueues in Alexander's case).
>>
>> And, actually, because of this, Alexander is right the prio_dequeue()
>> can't work like this without any changes.
>
> Yes it can, see my other response. Only the higher priority
> queues filling up can block lower priority traffic, and that
> is fine.
>
Let's get away from the priority discussion, because it doesn't have
anything to do with what I am talking about. One of the main reasons
I think it might be wise to move this out of the prio qdisc
and spin it off as a qdisc of its own is that what I am trying to
accomplish doesn't care about the priority of the packet. All I am
concerned with is that this qdisc doesn't block the transmit of any
other hardware queue if one of the hardware queues is blocked, and
that all queues have an equal chance to dequeue packets.
What I am planning to implement at this point is a simple multi-band
qdisc that assigns the band number based on queue mapping. This,
combined with something like the rr_dequeue from the original qdisc,
will resolve the head-of-line issues. Essentially it will try to
mimic the way the pfifo_fast qdiscs work with hardware multiqueue, but
it will have the ability to classify traffic and be stuck with one
spin lock for all queues.
I will try to get patches submitted for this new approach to the qdisc
and the new action in the next day or two. In the meantime we can
probably drop this for now, since I agree that this isn't the best
approach to resolving the issue.
Thanks,
Alex
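
A minimal sketch of the kind of round-robin dequeue described in words above:
walk the bands and skip any band whose hardware TX queue is currently stopped,
so a single full queue cannot head-of-line block the rest. The private struct
and all names are hypothetical; the enqueue side, requeue handling, statistics
and error paths are omitted.

#include <linux/skbuff.h>
#include <linux/netdevice.h>
#include <net/sch_generic.h>

struct mq_band_sched_data {		/* hypothetical private data */
	u16 bands;			/* one band per hardware TX queue */
	u16 curband;			/* round-robin cursor */
	struct Qdisc **queues;		/* child qdisc per band */
};

static struct sk_buff *mq_band_dequeue(struct Qdisc *sch)
{
	struct mq_band_sched_data *q = qdisc_priv(sch);
	struct sk_buff *skb;
	u16 i;

	for (i = 0; i < q->bands; i++) {
		q->curband = (q->curband + 1) % q->bands;

		/* band N feeds hardware TX queue N; skip it if stopped */
		if (__netif_subqueue_stopped(qdisc_dev(sch), q->curband))
			continue;

		skb = q->queues[q->curband]->dequeue(q->queues[q->curband]);
		if (skb) {
			sch->q.qlen--;
			return skb;
		}
	}
	return NULL;
}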
* Re: [PATCH 3/3] pkt_sched: restore multiqueue prio scheduler
2008-08-25 3:03 ` Alexander Duyck
@ 2008-08-25 6:16 ` Jarek Poplawski
2008-08-25 9:36 ` Jarek Poplawski
0 siblings, 1 reply; 41+ messages in thread
From: Jarek Poplawski @ 2008-08-25 6:16 UTC (permalink / raw)
To: Alexander Duyck
Cc: David Miller, hadi, jeffrey.t.kirsher, jeff, netdev,
alexander.h.duyck
On Sun, Aug 24, 2008 at 08:03:11PM -0700, Alexander Duyck wrote:
> On Sun, Aug 24, 2008 at 5:50 PM, David Miller <davem@davemloft.net> wrote:
> > From: Jarek Poplawski <jarkao2@gmail.com>
> > Date: Sun, 24 Aug 2008 21:27:34 +0200
> >
> >> On Sun, Aug 24, 2008 at 09:19:05PM +0200, Jarek Poplawski wrote:
> >> ...
> >> > Jamal, maybe I miss something, but this could be like this only with
> >> > default pfifo_fast qdiscs, which really are per dev hwqueue. Other
> >> > qdiscs, including prio, are per device, so with prio, if a band with
> >> > the highest priority is blocked it would be requeued blocking other
> >> > bands (hwqueues in Alexander's case).
> >>
> >> And, actually, because of this, Alexander is right the prio_dequeue()
> >> can't work like this without any changes.
> >
> > Yes it can, see my other response. Only the higher priority
> > queues filling up can block lower priority traffic, and that
> > is fine.
> >
>
> Let's get away from the priority discussion because it doesn't have
> anything to do with what I am talking about. One of the main reasons
> why I am thinking it might be wise to move this out of the prio qdisc
> and spin it off as a qdisc of its own is because what I am trying to
> accomplish doesn't care about the priority of the packet. All I am
> concerned with is that this qdisc doesn't block the transmit of any
> other hardware queues if one of the hardware queues are blocked and
> that all queues have an equal chance to dequeue packets.
>
> What I am planning to implement at this point is a simple multi-band
> qdisc that assigns the band number based on queue mapping. This
> combined with something like the rr_dequeue from the original qdisc
> will resolve the head-of-line issues. Essentially it will try to
> mimic the way the pfifo_fast qdiscs work with hardware multiqueue, but
> it will have the ability to classify traffic and be stuck with one
> spin lock for all queues.
>
> I will try to get patches submitted for this new approach to the qdisc
> and the new action in the next day or two. In the meantime we can
> probably drop this issue for now since I am in agreement that this
> isn't the best approach to resolving this issue.
I agree with you: a separate qdisc, or actually some aliases like
the older sch_rr, and maybe also an independent sch_prio_mq with
.enqueue and .dequeue overridden, look most sensible to me.
Thanks,
Jarek P.
* Re: [PATCH 3/3] pkt_sched: restore multiqueue prio scheduler
2008-08-25 6:16 ` Jarek Poplawski
@ 2008-08-25 9:36 ` Jarek Poplawski
0 siblings, 0 replies; 41+ messages in thread
From: Jarek Poplawski @ 2008-08-25 9:36 UTC (permalink / raw)
To: Alexander Duyck
Cc: David Miller, hadi, jeffrey.t.kirsher, jeff, netdev,
alexander.h.duyck
On Mon, Aug 25, 2008 at 06:16:28AM +0000, Jarek Poplawski wrote:
> On Sun, Aug 24, 2008 at 08:03:11PM -0700, Alexander Duyck wrote:
...
> > What I am planning to implement at this point is a simple multi-band
> > qdisc that assigns the band number based on queue mapping. This
> > combined with something like the rr_dequeue from the original qdisc
> > will resolve the head-of-line issues. Essentially it will try to
> > mimic the way the pfifo_fast qdiscs work with hardware multiqueue, but
> > it will have the ability to classify traffic and be stuck with one
> > spin lock for all queues.
...
> I agree with you that a separate qdisc, or actually some aliases, like
> this older sch_rr, and maybe also independent sch_prio_mq with .enqueue
> and .dequeue overridden, look most sensible to me.
On the other hand, if these new qdiscs don't even need prio_classify(),
making them standalone looks better.
Jarek P.
* Re: [PATCH 3/3] pkt_sched: restore multiqueue prio scheduler
2008-08-24 19:19 ` Jarek Poplawski
2008-08-24 19:27 ` Jarek Poplawski
@ 2008-08-25 0:49 ` David Miller
2008-08-25 6:06 ` Jarek Poplawski
1 sibling, 1 reply; 41+ messages in thread
From: David Miller @ 2008-08-25 0:49 UTC (permalink / raw)
To: jarkao2
Cc: hadi, alexander.duyck, jeffrey.t.kirsher, jeff, netdev,
alexander.h.duyck
From: Jarek Poplawski <jarkao2@gmail.com>
Date: Sun, 24 Aug 2008 21:19:05 +0200
> On Sun, Aug 24, 2008 at 09:39:23AM -0400, jamal wrote:
> ...
> > With current controls being per qdisc instead of per netdevice,
> > the hol fear is unfounded.
> > You send and when hw cant keep up, you block just the one hwqueue.
> > While hwqueue is blocked, you can accumulate packets in the prio qdisc
> > (hence my statement it may not be necessary to accumulate packets in
> > driver).
>
> Jamal, maybe I miss something, but this could be like this only with
> default pfifo_fast qdiscs, which really are per dev hwqueue. Other
> qdiscs, including prio, are per device, so with prio, if a band with
> the highest priority is blocked it would be requeued blocking other
> bands (hwqueues in Alexander's case).
It only blocks if the highest priority band's HW queue is blocked, and
that's what you want to happen.
Think about it: if the highest priority HW queue is full, queueing
packets to the lower priority queues won't make anything happen.
As the highest priority queue opens up and begins to have space,
we'll feed it high priority packets from the prio qdisc, and so
on and so forth.
This isn't a real issue, and I remember Patrick and Jamal going
back and forth about this some time ago. :-)
* Re: [PATCH 3/3] pkt_sched: restore multiqueue prio scheduler
2008-08-25 0:49 ` David Miller
@ 2008-08-25 6:06 ` Jarek Poplawski
2008-08-25 7:48 ` David Miller
0 siblings, 1 reply; 41+ messages in thread
From: Jarek Poplawski @ 2008-08-25 6:06 UTC (permalink / raw)
To: David Miller
Cc: hadi, alexander.duyck, jeffrey.t.kirsher, jeff, netdev,
alexander.h.duyck
On Sun, Aug 24, 2008 at 05:49:49PM -0700, David Miller wrote:
> From: Jarek Poplawski <jarkao2@gmail.com>
> Date: Sun, 24 Aug 2008 21:19:05 +0200
>
> > On Sun, Aug 24, 2008 at 09:39:23AM -0400, jamal wrote:
> > ...
> > > With current controls being per qdisc instead of per netdevice,
> > > the hol fear is unfounded.
> > > You send and when hw cant keep up, you block just the one hwqueue.
> > > While hwqueue is blocked, you can accumulate packets in the prio qdisc
> > > (hence my statement it may not be necessary to accumulate packets in
> > > driver).
> >
> > Jamal, maybe I miss something, but this could be like this only with
> > default pfifo_fast qdiscs, which really are per dev hwqueue. Other
> > qdiscs, including prio, are per device, so with prio, if a band with
> > the highest priority is blocked it would be requeued blocking other
> > bands (hwqueues in Alexander's case).
>
> It only blocks if the highest priority band's HW queue is blocked, and
> that's what you want to happen.
>
> Think about it, if the highest priority HW queue is full, queueing
> packets to the lower priority queues won't make anything happen.
>
> As the highest priority queue opens up and begins to have space,
> we'll feed it high priority packets from the prio qdisc, and so
> on and so forth.
It seems the priority can really be misleading here. Do you mean these
hwqueues are internally prioritized too? This would be strange to me,
because why would we need this independent locking per hwqueue if
everything has to wait for the most prioritized hwqueue anyway? And,
if so, the current dev_pick_tx() with simple_tx_hash() would always harm
some flows by directing them to lower priority hwqueues?!
But, even if that's true, let's take a look at fifo: a packet at the
head of the qdisc's queue could be hashed to the last hwqueue. If
it's stopped for some reason, this packet would be constantly
requeued, blocking all other packets, while their hwqueues are ready
and empty!
Jarek P.
* Re: [PATCH 3/3] pkt_sched: restore multiqueue prio scheduler
2008-08-25 6:06 ` Jarek Poplawski
@ 2008-08-25 7:48 ` David Miller
2008-08-25 7:57 ` Jarek Poplawski
0 siblings, 1 reply; 41+ messages in thread
From: David Miller @ 2008-08-25 7:48 UTC (permalink / raw)
To: jarkao2
Cc: hadi, alexander.duyck, jeffrey.t.kirsher, jeff, netdev,
alexander.h.duyck
From: Jarek Poplawski <jarkao2@gmail.com>
Date: Mon, 25 Aug 2008 06:06:40 +0000
> It seems the priority can really be misleading here. Do you mean these
> hwqueues are internally prioritized too? This would be strange to me,
> because why would we need this independent locking per hwqueue if
> everything has to wait for the most prioritized hwqueue anyway? And,
> if so, current dev_pick_tx() with simple_tx_hash() would always harm
> some flows directing them to lower priority hwqueues?!
Yes, some can do internal prioritization in hardware.
But even if not, that is, even if the card only does flow-based
multiqueue, this is still the right thing to do.
Think about what actually happens on the wire as a result of
our actions, rather than relying on intuition :-)
> But, even if it's true, let's take a look at fifo: a packet at the
> head of the qdisc's queue could be hashed to the last hwqueue. If
> it's stopped for some reason, this packed would be constantly
> requeued blocking all other packets, while their hwqueues are ready
> and empty!
If we feed packets after the first one to the card, we would not
be implementing a FIFO.
* Re: [PATCH 3/3] pkt_sched: restore multiqueue prio scheduler
2008-08-25 7:48 ` David Miller
@ 2008-08-25 7:57 ` Jarek Poplawski
2008-08-25 8:02 ` David Miller
0 siblings, 1 reply; 41+ messages in thread
From: Jarek Poplawski @ 2008-08-25 7:57 UTC (permalink / raw)
To: David Miller
Cc: hadi, alexander.duyck, jeffrey.t.kirsher, jeff, netdev,
alexander.h.duyck
On Mon, Aug 25, 2008 at 12:48:25AM -0700, David Miller wrote:
> From: Jarek Poplawski <jarkao2@gmail.com>
> Date: Mon, 25 Aug 2008 06:06:40 +0000
>
> > It seems the priority can really be misleading here. Do you mean these
> > hwqueues are internally prioritized too? This would be strange to me,
> > because why would we need this independent locking per hwqueue if
> > everything has to wait for the most prioritized hwqueue anyway? And,
> > if so, current dev_pick_tx() with simple_tx_hash() would always harm
> > some flows directing them to lower priority hwqueues?!
>
> Yes some can do internal prioritization in hardware.
>
> But even if not, this means even if the card does flow based
> multiqueue, this is still the right thing to do.
>
> Think about what actually happens on the wire as a result of
> our actions, rather than intuition :-)
>
> > But, even if it's true, let's take a look at fifo: a packet at the
> > head of the qdisc's queue could be hashed to the last hwqueue. If
> > it's stopped for some reason, this packed would be constantly
> > requeued blocking all other packets, while their hwqueues are ready
> > and empty!
>
> If we feed packets after the first one to the card, we would not
> be implementing a FIFO.
Not necessarily so: if separate flows are hashed to "their" hwqueues,
a FIFO per flow would still be obeyed.
Jarek P.
* Re: [PATCH 3/3] pkt_sched: restore multiqueue prio scheduler
2008-08-25 7:57 ` Jarek Poplawski
@ 2008-08-25 8:02 ` David Miller
2008-08-25 8:25 ` Jarek Poplawski
0 siblings, 1 reply; 41+ messages in thread
From: David Miller @ 2008-08-25 8:02 UTC (permalink / raw)
To: jarkao2
Cc: hadi, alexander.duyck, jeffrey.t.kirsher, jeff, netdev,
alexander.h.duyck
From: Jarek Poplawski <jarkao2@gmail.com>
Date: Mon, 25 Aug 2008 07:57:44 +0000
> On Mon, Aug 25, 2008 at 12:48:25AM -0700, David Miller wrote:
> > From: Jarek Poplawski <jarkao2@gmail.com>
> > Date: Mon, 25 Aug 2008 06:06:40 +0000
> >
> > If we feed packets after the first one to the card, we would not
> > be implementing a FIFO.
>
> Not necessarilly so: if separate flows are hashed to "their" hwqueues,
> a FIFO per flow would be still obeyed.
What appears on the wire is still going to be similar.
You have to subsequently ask whether it's worth the complexity to do
what you seem to be proposing.
When a single hardware queue fills up, it's the SAME, semantically,
as when the unary TX queue of a traditional device fills up.
There is NO on-the-wire difference. There will be NO performance
difference, because the device will have work to do: by definition,
one TX queue being full means there are packets queued up to
the device.
* Re: [PATCH 3/3] pkt_sched: restore multiqueue prio scheduler
2008-08-25 8:02 ` David Miller
@ 2008-08-25 8:25 ` Jarek Poplawski
2008-08-25 8:35 ` Jarek Poplawski
0 siblings, 1 reply; 41+ messages in thread
From: Jarek Poplawski @ 2008-08-25 8:25 UTC (permalink / raw)
To: David Miller
Cc: hadi, alexander.duyck, jeffrey.t.kirsher, jeff, netdev,
alexander.h.duyck
On Mon, Aug 25, 2008 at 01:02:06AM -0700, David Miller wrote:
> From: Jarek Poplawski <jarkao2@gmail.com>
> Date: Mon, 25 Aug 2008 07:57:44 +0000
>
> > On Mon, Aug 25, 2008 at 12:48:25AM -0700, David Miller wrote:
> > > From: Jarek Poplawski <jarkao2@gmail.com>
> > > Date: Mon, 25 Aug 2008 06:06:40 +0000
> > >
> > > If we feed packets after the first one to the card, we would not
> > > be implementing a FIFO.
> >
> > Not necessarilly so: if separate flows are hashed to "their" hwqueues,
> > a FIFO per flow would be still obeyed.
>
> What appears on the wire is still going to be similar.
>
> You have to subsequently ask if it's worth the complexity to do
> what you seem to be proposing.
>
> When a single hardware queue fills up, it's the SAME, semantically,
> as when a unary TX queue of a traditional device fills up.
>
> There is NO on the wire difference. There will be NO performance
> difference, because the device will have work to do as by definition
> of one TX queue being full there are some packets queued up to
> the device.
If with a unary TX queue we had to fill one bigger queue (or all TX
queues) before the device stopped the qdisc, while with multiqueue TX
it's enough to have one TX queue filled to effectively stop the qdisc's
transmit, then IMHO there should be a performance difference.
Jarek P.
* Re: [PATCH 3/3] pkt_sched: restore multiqueue prio scheduler
2008-08-25 8:25 ` Jarek Poplawski
@ 2008-08-25 8:35 ` Jarek Poplawski
0 siblings, 0 replies; 41+ messages in thread
From: Jarek Poplawski @ 2008-08-25 8:35 UTC (permalink / raw)
To: David Miller
Cc: hadi, alexander.duyck, jeffrey.t.kirsher, jeff, netdev,
alexander.h.duyck
On Mon, Aug 25, 2008 at 08:25:01AM +0000, Jarek Poplawski wrote:
...
> If with unary TX queue we had to fill one bigger queue (or all TX
> queues) before device stopped a qdisc, and with mq TX it's enough to
> have one TX filled to effectively stop a qdisc transmit, IMHO there
> should be a performance difference.
And we should probably consider that a TX queue could be stopped for
other reasons, blocking all transmit with multiqueue TX but not with a
unary TX queue.
Jarek P.
* Re: [PATCH 1/3] LRO: fix return code propogation
2008-08-22 0:51 [PATCH 1/3] LRO: fix return code propogation Jeff Kirsher
2008-08-22 0:51 ` [PATCH 2/3] netlink: nal_parse_nested_compat was not parsing nested attributes Jeff Kirsher
2008-08-22 0:51 ` [PATCH 3/3] pkt_sched: restore multiqueue prio scheduler Jeff Kirsher
@ 2008-08-22 10:20 ` David Miller
2 siblings, 0 replies; 41+ messages in thread
From: David Miller @ 2008-08-22 10:20 UTC (permalink / raw)
To: jeffrey.t.kirsher; +Cc: jeff, netdev, jesse.brandeburg
From: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
Date: Thu, 21 Aug 2008 17:51:24 -0700
> From: Jesse Brandeburg <jesse.brandeburg@intel.com>
>
> LRO code needs to propogate the return values from the netif_rx and
> netif_receive_skb calls specifically to allow drivers that need that
> information to react correctly. Some places in the code couldn't
> return the result of the upcall to the stack, so I added the dropped
> counter, which just completes the cache line of the stats struct (on
> 64-bit), so shouldn't be a big deal.
>
> Signed-off-by: Jesse Brandeburg <jesse.brandeburg@intel.com>
> Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
Too vague a commit description.
This patch makes a bunch of routines return status codes, but no
example of the precise intended usage is given. That part might
be incorrect (the return value isn't actually needed, or the code
in question is doing something wrong), which is why providing such
an example is important.
So I'm dropping this. Please provide such info in any resubmission, or
I'll just ignore it, thanks.
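
For reference, the sort of concrete usage example being asked for might look
like the hypothetical driver snippet below. It assumes the int-returning
lro_receive_skb() prototype from the patch under discussion; the adapter
structure and the counter name are made up for illustration only.

#include <linux/netdevice.h>
#include <linux/inet_lro.h>

/* Hypothetical adapter state; only the fields used here are shown. */
struct my_adapter {
	struct net_lro_mgr lro_mgr;
	unsigned long rx_stack_dropped;
};

/* Hypothetical RX path: react to the stack's verdict on the frame,
 * which is what the changed return type is meant to enable. */
static void my_driver_receive(struct my_adapter *adapter, struct sk_buff *skb)
{
	int ret = lro_receive_skb(&adapter->lro_mgr, skb, NULL);

	if (ret == NET_RX_DROP)
		adapter->rx_stack_dropped++;	/* e.g. throttle the RX path */
}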
Thread overview: 41+ messages
2008-08-22 0:51 [PATCH 1/3] LRO: fix return code propogation Jeff Kirsher
2008-08-22 0:51 ` [PATCH 2/3] netlink: nal_parse_nested_compat was not parsing nested attributes Jeff Kirsher
2008-08-22 10:18 ` David Miller
2008-08-22 17:40 ` [PATCH 2/3] netlink: nla_parse_nested_compat " Duyck, Alexander H
2008-08-27 14:52 ` Thomas Graf
2008-08-27 18:09 ` Duyck, Alexander H
2008-08-22 0:51 ` [PATCH 3/3] pkt_sched: restore multiqueue prio scheduler Jeff Kirsher
2008-08-22 10:16 ` David Miller
2008-08-22 14:30 ` jamal
2008-08-22 22:19 ` Jarek Poplawski
2008-08-23 0:01 ` Alexander Duyck
2008-08-23 0:40 ` David Miller
2008-08-23 1:37 ` Alexander Duyck
2008-08-23 5:12 ` Herbert Xu
2008-08-23 6:35 ` Alexander Duyck
2008-08-23 7:07 ` Herbert Xu
2008-08-23 8:23 ` David Miller
2008-08-23 8:15 ` David Miller
2008-08-23 0:33 ` David Miller
2008-08-23 8:47 ` Jarek Poplawski
2008-08-23 16:31 ` Alexander Duyck
2008-08-23 16:49 ` jamal
2008-08-23 19:09 ` Alexander Duyck
2008-08-24 7:53 ` Jarek Poplawski
2008-08-24 13:39 ` jamal
2008-08-24 19:19 ` Jarek Poplawski
2008-08-24 19:27 ` Jarek Poplawski
2008-08-24 19:59 ` Jarek Poplawski
2008-08-24 20:18 ` Jarek Poplawski
2008-08-25 0:50 ` David Miller
2008-08-25 3:03 ` Alexander Duyck
2008-08-25 6:16 ` Jarek Poplawski
2008-08-25 9:36 ` Jarek Poplawski
2008-08-25 0:49 ` David Miller
2008-08-25 6:06 ` Jarek Poplawski
2008-08-25 7:48 ` David Miller
2008-08-25 7:57 ` Jarek Poplawski
2008-08-25 8:02 ` David Miller
2008-08-25 8:25 ` Jarek Poplawski
2008-08-25 8:35 ` Jarek Poplawski
2008-08-22 10:20 ` [PATCH 1/3] LRO: fix return code propogation David Miller