[PATCH net-next2.6] net: rename netdev rx_queue to ingress

netdev.vger.kernel.org archive mirror
 help / color / mirror / Atom feed

* [PATCH net-next2.6] net: rename netdev rx_queue to ingress_queue
@ 2010-09-28 15:58 Eric Dumazet
  2010-09-28 18:04 ` Jarek Poplawski
  2010-09-29 20:27 ` [PATCH net-next2.6] net: rename netdev rx_queue to ingress_queue David Miller
  0 siblings, 2 replies; 23+ messages in thread
From: Eric Dumazet @ 2010-09-28 15:58 UTC (permalink / raw)
  To: David Miller; +Cc: netdev, Jarek Poplawski

There is some confusion with rx_queue name after RPS, and net drivers
private rx_queue fields.

I suggest to rename "struct net_device"->rx_queue to ingress_queue.

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
---

My plan is to make this structure dynamically allocated, in the rare
case ingress is setup, to shrink net_device.

 include/linux/netdevice.h |    2 +-
 net/core/dev.c            |    8 ++++----
 net/sched/sch_api.c       |   14 +++++++-------
 net/sched/sch_generic.c   |    8 ++++----
 4 files changed, 16 insertions(+), 16 deletions(-)

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 83de0eb..1ae0d65 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -983,7 +983,7 @@ struct net_device {
 	rx_handler_func_t	*rx_handler;
 	void			*rx_handler_data;
 
-	struct netdev_queue	rx_queue; /* use two cache lines */
+	struct netdev_queue	ingress_queue; /* use two cache lines */
 
 /*
  * Cache lines mostly used on transmit path
diff --git a/net/core/dev.c b/net/core/dev.c
index 42b200f..0ed0669 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2674,7 +2674,7 @@ static int ing_filter(struct sk_buff *skb)
 	skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl);
 	skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
 
-	rxq = &dev->rx_queue;
+	rxq = &dev->ingress_queue;
 
 	q = rxq->qdisc;
 	if (q != &noop_qdisc) {
@@ -2691,7 +2691,7 @@ static inline struct sk_buff *handle_ing(struct sk_buff *skb,
 					 struct packet_type **pt_prev,
 					 int *ret, struct net_device *orig_dev)
 {
-	if (skb->dev->rx_queue.qdisc == &noop_qdisc)
+	if (skb->dev->ingress_queue.qdisc == &noop_qdisc)
 		goto out;
 
 	if (*pt_prev) {
@@ -4894,7 +4894,7 @@ static void __netdev_init_queue_locks_one(struct net_device *dev,
 static void netdev_init_queue_locks(struct net_device *dev)
 {
 	netdev_for_each_tx_queue(dev, __netdev_init_queue_locks_one, NULL);
-	__netdev_init_queue_locks_one(dev, &dev->rx_queue, NULL);
+	__netdev_init_queue_locks_one(dev, &dev->ingress_queue, NULL);
 }
 
 unsigned long netdev_fix_features(unsigned long features, const char *name)
@@ -5406,7 +5406,7 @@ static void netdev_init_one_queue(struct net_device *dev,
 
 static void netdev_init_queues(struct net_device *dev)
 {
-	netdev_init_one_queue(dev, &dev->rx_queue, NULL);
+	netdev_init_one_queue(dev, &dev->ingress_queue, NULL);
 	netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
 	spin_lock_init(&dev->tx_global_lock);
 }
diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
index 6fb3d41..b802078 100644
--- a/net/sched/sch_api.c
+++ b/net/sched/sch_api.c
@@ -240,7 +240,7 @@ struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle)
 	if (q)
 		goto out;
 
-	q = qdisc_match_from_root(dev->rx_queue.qdisc_sleeping, handle);
+	q = qdisc_match_from_root(dev->ingress_queue.qdisc_sleeping, handle);
 out:
 	return q;
 }
@@ -701,7 +701,7 @@ static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
 		}
 
 		for (i = 0; i < num_q; i++) {
-			struct netdev_queue *dev_queue = &dev->rx_queue;
+			struct netdev_queue *dev_queue = &dev->ingress_queue;
 
 			if (!ingress)
 				dev_queue = netdev_get_tx_queue(dev, i);
@@ -979,7 +979,7 @@ static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
 					return -ENOENT;
 				q = qdisc_leaf(p, clid);
 			} else { /* ingress */
-				q = dev->rx_queue.qdisc_sleeping;
+				q = dev->ingress_queue.qdisc_sleeping;
 			}
 		} else {
 			q = dev->qdisc;
@@ -1044,7 +1044,7 @@ replay:
 					return -ENOENT;
 				q = qdisc_leaf(p, clid);
 			} else { /*ingress */
-				q = dev->rx_queue.qdisc_sleeping;
+				q = dev->ingress_queue.qdisc_sleeping;
 			}
 		} else {
 			q = dev->qdisc;
@@ -1124,7 +1124,7 @@ create_n_graft:
 	if (!(n->nlmsg_flags&NLM_F_CREATE))
 		return -ENOENT;
 	if (clid == TC_H_INGRESS)
-		q = qdisc_create(dev, &dev->rx_queue, p,
+		q = qdisc_create(dev, &dev->ingress_queue, p,
 				 tcm->tcm_parent, tcm->tcm_parent,
 				 tca, &err);
 	else {
@@ -1304,7 +1304,7 @@ static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb)
 		if (tc_dump_qdisc_root(dev->qdisc, skb, cb, &q_idx, s_q_idx) < 0)
 			goto done;
 
-		dev_queue = &dev->rx_queue;
+		dev_queue = &dev->ingress_queue;
 		if (tc_dump_qdisc_root(dev_queue->qdisc_sleeping, skb, cb, &q_idx, s_q_idx) < 0)
 			goto done;
 
@@ -1595,7 +1595,7 @@ static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb)
 	if (tc_dump_tclass_root(dev->qdisc, skb, tcm, cb, &t, s_t) < 0)
 		goto done;
 
-	dev_queue = &dev->rx_queue;
+	dev_queue = &dev->ingress_queue;
 	if (tc_dump_tclass_root(dev_queue->qdisc_sleeping, skb, tcm, cb, &t, s_t) < 0)
 		goto done;
 
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index 2aeb3a4..545278a 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -753,7 +753,7 @@ void dev_activate(struct net_device *dev)
 
 	need_watchdog = 0;
 	netdev_for_each_tx_queue(dev, transition_one_qdisc, &need_watchdog);
-	transition_one_qdisc(dev, &dev->rx_queue, NULL);
+	transition_one_qdisc(dev, &dev->ingress_queue, NULL);
 
 	if (need_watchdog) {
 		dev->trans_start = jiffies;
@@ -812,7 +812,7 @@ static bool some_qdisc_is_busy(struct net_device *dev)
 void dev_deactivate(struct net_device *dev)
 {
 	netdev_for_each_tx_queue(dev, dev_deactivate_queue, &noop_qdisc);
-	dev_deactivate_queue(dev, &dev->rx_queue, &noop_qdisc);
+	dev_deactivate_queue(dev, &dev->ingress_queue, &noop_qdisc);
 
 	dev_watchdog_down(dev);
 
@@ -838,7 +838,7 @@ void dev_init_scheduler(struct net_device *dev)
 {
 	dev->qdisc = &noop_qdisc;
 	netdev_for_each_tx_queue(dev, dev_init_scheduler_queue, &noop_qdisc);
-	dev_init_scheduler_queue(dev, &dev->rx_queue, &noop_qdisc);
+	dev_init_scheduler_queue(dev, &dev->ingress_queue, &noop_qdisc);
 
 	setup_timer(&dev->watchdog_timer, dev_watchdog, (unsigned long)dev);
 }
@@ -861,7 +861,7 @@ static void shutdown_scheduler_queue(struct net_device *dev,
 void dev_shutdown(struct net_device *dev)
 {
 	netdev_for_each_tx_queue(dev, shutdown_scheduler_queue, &noop_qdisc);
-	shutdown_scheduler_queue(dev, &dev->rx_queue, &noop_qdisc);
+	shutdown_scheduler_queue(dev, &dev->ingress_queue, &noop_qdisc);
 	qdisc_destroy(dev->qdisc);
 	dev->qdisc = &noop_qdisc;
 



^ permalink raw reply related	[flat|nested] 23+ messages in thread

* Re: [PATCH net-next2.6] net: rename netdev rx_queue to ingress_queue
  2010-09-28 15:58 [PATCH net-next2.6] net: rename netdev rx_queue to ingress_queue Eric Dumazet
@ 2010-09-28 18:04 ` Jarek Poplawski
  2010-09-29 10:56   ` jamal
  2010-09-29 20:27 ` [PATCH net-next2.6] net: rename netdev rx_queue to ingress_queue David Miller
  1 sibling, 1 reply; 23+ messages in thread
From: Jarek Poplawski @ 2010-09-28 18:04 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: jamal, David Miller, netdev

On Tue, Sep 28, 2010 at 05:58:37PM +0200, Eric Dumazet wrote:
> There is some confusion with rx_queue name after RPS, and net drivers
> private rx_queue fields.
> 
> I suggest to rename "struct net_device"->rx_queue to ingress_queue.
> 
> Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
> ---
> 
> My plan is to make this structure dynamically allocated, in the rare
> case ingress is setup, to shrink net_device.

I hope Master of Ingress will forgive you this CC some day... ;-)

Jarek P.

> 
>  include/linux/netdevice.h |    2 +-
>  net/core/dev.c            |    8 ++++----
>  net/sched/sch_api.c       |   14 +++++++-------
>  net/sched/sch_generic.c   |    8 ++++----
>  4 files changed, 16 insertions(+), 16 deletions(-)
> 
> diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
> index 83de0eb..1ae0d65 100644
> --- a/include/linux/netdevice.h
> +++ b/include/linux/netdevice.h
> @@ -983,7 +983,7 @@ struct net_device {
>  	rx_handler_func_t	*rx_handler;
>  	void			*rx_handler_data;
>  
> -	struct netdev_queue	rx_queue; /* use two cache lines */
> +	struct netdev_queue	ingress_queue; /* use two cache lines */
>  
>  /*
>   * Cache lines mostly used on transmit path
> diff --git a/net/core/dev.c b/net/core/dev.c
> index 42b200f..0ed0669 100644
> --- a/net/core/dev.c
> +++ b/net/core/dev.c
> @@ -2674,7 +2674,7 @@ static int ing_filter(struct sk_buff *skb)
>  	skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl);
>  	skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
>  
> -	rxq = &dev->rx_queue;
> +	rxq = &dev->ingress_queue;
>  
>  	q = rxq->qdisc;
>  	if (q != &noop_qdisc) {
> @@ -2691,7 +2691,7 @@ static inline struct sk_buff *handle_ing(struct sk_buff *skb,
>  					 struct packet_type **pt_prev,
>  					 int *ret, struct net_device *orig_dev)
>  {
> -	if (skb->dev->rx_queue.qdisc == &noop_qdisc)
> +	if (skb->dev->ingress_queue.qdisc == &noop_qdisc)
>  		goto out;
>  
>  	if (*pt_prev) {
> @@ -4894,7 +4894,7 @@ static void __netdev_init_queue_locks_one(struct net_device *dev,
>  static void netdev_init_queue_locks(struct net_device *dev)
>  {
>  	netdev_for_each_tx_queue(dev, __netdev_init_queue_locks_one, NULL);
> -	__netdev_init_queue_locks_one(dev, &dev->rx_queue, NULL);
> +	__netdev_init_queue_locks_one(dev, &dev->ingress_queue, NULL);
>  }
>  
>  unsigned long netdev_fix_features(unsigned long features, const char *name)
> @@ -5406,7 +5406,7 @@ static void netdev_init_one_queue(struct net_device *dev,
>  
>  static void netdev_init_queues(struct net_device *dev)
>  {
> -	netdev_init_one_queue(dev, &dev->rx_queue, NULL);
> +	netdev_init_one_queue(dev, &dev->ingress_queue, NULL);
>  	netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
>  	spin_lock_init(&dev->tx_global_lock);
>  }
> diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
> index 6fb3d41..b802078 100644
> --- a/net/sched/sch_api.c
> +++ b/net/sched/sch_api.c
> @@ -240,7 +240,7 @@ struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle)
>  	if (q)
>  		goto out;
>  
> -	q = qdisc_match_from_root(dev->rx_queue.qdisc_sleeping, handle);
> +	q = qdisc_match_from_root(dev->ingress_queue.qdisc_sleeping, handle);
>  out:
>  	return q;
>  }
> @@ -701,7 +701,7 @@ static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
>  		}
>  
>  		for (i = 0; i < num_q; i++) {
> -			struct netdev_queue *dev_queue = &dev->rx_queue;
> +			struct netdev_queue *dev_queue = &dev->ingress_queue;
>  
>  			if (!ingress)
>  				dev_queue = netdev_get_tx_queue(dev, i);
> @@ -979,7 +979,7 @@ static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
>  					return -ENOENT;
>  				q = qdisc_leaf(p, clid);
>  			} else { /* ingress */
> -				q = dev->rx_queue.qdisc_sleeping;
> +				q = dev->ingress_queue.qdisc_sleeping;
>  			}
>  		} else {
>  			q = dev->qdisc;
> @@ -1044,7 +1044,7 @@ replay:
>  					return -ENOENT;
>  				q = qdisc_leaf(p, clid);
>  			} else { /*ingress */
> -				q = dev->rx_queue.qdisc_sleeping;
> +				q = dev->ingress_queue.qdisc_sleeping;
>  			}
>  		} else {
>  			q = dev->qdisc;
> @@ -1124,7 +1124,7 @@ create_n_graft:
>  	if (!(n->nlmsg_flags&NLM_F_CREATE))
>  		return -ENOENT;
>  	if (clid == TC_H_INGRESS)
> -		q = qdisc_create(dev, &dev->rx_queue, p,
> +		q = qdisc_create(dev, &dev->ingress_queue, p,
>  				 tcm->tcm_parent, tcm->tcm_parent,
>  				 tca, &err);
>  	else {
> @@ -1304,7 +1304,7 @@ static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb)
>  		if (tc_dump_qdisc_root(dev->qdisc, skb, cb, &q_idx, s_q_idx) < 0)
>  			goto done;
>  
> -		dev_queue = &dev->rx_queue;
> +		dev_queue = &dev->ingress_queue;
>  		if (tc_dump_qdisc_root(dev_queue->qdisc_sleeping, skb, cb, &q_idx, s_q_idx) < 0)
>  			goto done;
>  
> @@ -1595,7 +1595,7 @@ static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb)
>  	if (tc_dump_tclass_root(dev->qdisc, skb, tcm, cb, &t, s_t) < 0)
>  		goto done;
>  
> -	dev_queue = &dev->rx_queue;
> +	dev_queue = &dev->ingress_queue;
>  	if (tc_dump_tclass_root(dev_queue->qdisc_sleeping, skb, tcm, cb, &t, s_t) < 0)
>  		goto done;
>  
> diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
> index 2aeb3a4..545278a 100644
> --- a/net/sched/sch_generic.c
> +++ b/net/sched/sch_generic.c
> @@ -753,7 +753,7 @@ void dev_activate(struct net_device *dev)
>  
>  	need_watchdog = 0;
>  	netdev_for_each_tx_queue(dev, transition_one_qdisc, &need_watchdog);
> -	transition_one_qdisc(dev, &dev->rx_queue, NULL);
> +	transition_one_qdisc(dev, &dev->ingress_queue, NULL);
>  
>  	if (need_watchdog) {
>  		dev->trans_start = jiffies;
> @@ -812,7 +812,7 @@ static bool some_qdisc_is_busy(struct net_device *dev)
>  void dev_deactivate(struct net_device *dev)
>  {
>  	netdev_for_each_tx_queue(dev, dev_deactivate_queue, &noop_qdisc);
> -	dev_deactivate_queue(dev, &dev->rx_queue, &noop_qdisc);
> +	dev_deactivate_queue(dev, &dev->ingress_queue, &noop_qdisc);
>  
>  	dev_watchdog_down(dev);
>  
> @@ -838,7 +838,7 @@ void dev_init_scheduler(struct net_device *dev)
>  {
>  	dev->qdisc = &noop_qdisc;
>  	netdev_for_each_tx_queue(dev, dev_init_scheduler_queue, &noop_qdisc);
> -	dev_init_scheduler_queue(dev, &dev->rx_queue, &noop_qdisc);
> +	dev_init_scheduler_queue(dev, &dev->ingress_queue, &noop_qdisc);
>  
>  	setup_timer(&dev->watchdog_timer, dev_watchdog, (unsigned long)dev);
>  }
> @@ -861,7 +861,7 @@ static void shutdown_scheduler_queue(struct net_device *dev,
>  void dev_shutdown(struct net_device *dev)
>  {
>  	netdev_for_each_tx_queue(dev, shutdown_scheduler_queue, &noop_qdisc);
> -	shutdown_scheduler_queue(dev, &dev->rx_queue, &noop_qdisc);
> +	shutdown_scheduler_queue(dev, &dev->ingress_queue, &noop_qdisc);
>  	qdisc_destroy(dev->qdisc);
>  	dev->qdisc = &noop_qdisc;
>  
> 
> 

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH net-next2.6] net: rename netdev rx_queue to ingress_queue
  2010-09-28 18:04 ` Jarek Poplawski
@ 2010-09-29 10:56   ` jamal
  2010-09-29 13:18     ` Jarek Poplawski
  2010-09-30 22:58     ` [PATCH net-next2.6] net: dynamic ingress_queue allocation Eric Dumazet
  0 siblings, 2 replies; 23+ messages in thread
From: jamal @ 2010-09-29 10:56 UTC (permalink / raw)
  To: Jarek Poplawski; +Cc: Eric Dumazet, David Miller, netdev

On Tue, 2010-09-28 at 20:04 +0200, Jarek Poplawski wrote:

> I hope Master of Ingress will forgive you this CC some day... ;-)

Forgiven-by: moi 

Which may or may not stand for "me" if you pardon my French

cheers,
jamal


^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH net-next2.6] net: rename netdev rx_queue to ingress_queue
  2010-09-29 10:56   ` jamal
@ 2010-09-29 13:18     ` Jarek Poplawski
  2010-09-29 16:54       ` Jarek Poplawski
  2010-09-30 22:58     ` [PATCH net-next2.6] net: dynamic ingress_queue allocation Eric Dumazet
  1 sibling, 1 reply; 23+ messages in thread
From: Jarek Poplawski @ 2010-09-29 13:18 UTC (permalink / raw)
  To: jamal; +Cc: Eric Dumazet, David Miller, netdev

On Wed, Sep 29, 2010 at 06:56:57AM -0400, jamal wrote:
> On Tue, 2010-09-28 at 20:04 +0200, Jarek Poplawski wrote:
> 
> > I hope Master of Ingress will forgive you this CC some day... ;-)
> 
> Forgiven-by: moi 
> 
> Which may or may not stand for "me" if you pardon my French

Which, may or may not stand for "my" if you pardon my Polish ;-)

Btw, now, of course, this patch looks OK to me.

Cheers,
Jarek P.

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH net-next2.6] net: rename netdev rx_queue to ingress_queue
  2010-09-29 13:18     ` Jarek Poplawski
@ 2010-09-29 16:54       ` Jarek Poplawski
  0 siblings, 0 replies; 23+ messages in thread
From: Jarek Poplawski @ 2010-09-29 16:54 UTC (permalink / raw)
  To: jamal; +Cc: Eric Dumazet, David Miller, netdev

On Wed, Sep 29, 2010 at 03:18:52PM +0200, Jarek Poplawski wrote:
> On Wed, Sep 29, 2010 at 06:56:57AM -0400, jamal wrote:
> > On Tue, 2010-09-28 at 20:04 +0200, Jarek Poplawski wrote:
> > 
> > > I hope Master of Ingress will forgive you this CC some day... ;-)
> > 
> > Forgiven-by: moi 
> > 
> > Which may or may not stand for "me" if you pardon my French
> 
> Which, may or may not stand for "my" if you pardon my Polish ;-)

Hmm...  I'm really slow!  FORGIVE "ME", "MOI" ;-)

Jarek P.

^ permalink raw reply	[flat|nested] 23+ messages in thread

* [PATCH net-next2.6] net: dynamic ingress_queue allocation
  2010-09-29 10:56   ` jamal
  2010-09-29 13:18     ` Jarek Poplawski
@ 2010-09-30 22:58     ` Eric Dumazet
  2010-10-01 11:45       ` jamal
  1 sibling, 1 reply; 23+ messages in thread
From: Eric Dumazet @ 2010-09-30 22:58 UTC (permalink / raw)
  To: hadi; +Cc: Jarek Poplawski, David Miller, netdev

Hi Jamal

Here is the dynamic allocation I promised. I lightly tested it, could
you review it please ?

Thanks !

[PATCH net-next2.6] net: dynamic ingress_queue allocation

ingress being not used very much, and net_device->ingress_queue being
quite a big object (128 or 256 bytes), use a dynamic allocation if
needed (tc qdisc add dev eth0 ingress ...)

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
---
 include/linux/netdevice.h |   17 +++++++++++-
 net/core/dev.c            |   48 +++++++++++++++++++++++++++---------
 net/sched/sch_api.c       |   40 ++++++++++++++++++++----------
 net/sched/sch_generic.c   |   36 +++++++++++++++------------
 4 files changed, 98 insertions(+), 43 deletions(-)

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index ceed347..7a21a91 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -986,8 +986,9 @@ struct net_device {
 	rx_handler_func_t	*rx_handler;
 	void			*rx_handler_data;
 
-	struct netdev_queue	ingress_queue; /* use two cache lines */
-
+#ifdef CONFIG_NET_CLS_ACT
+	struct netdev_queue	*ingress_queue;
+#endif
 /*
  * Cache lines mostly used on transmit path
  */
@@ -1115,6 +1116,18 @@ static inline void netdev_for_each_tx_queue(struct net_device *dev,
 		f(dev, &dev->_tx[i], arg);
 }
 
+
+static inline struct netdev_queue *dev_ingress_queue(struct net_device *dev)
+{
+#ifdef CONFIG_NET_CLS_ACT
+	return dev->ingress_queue;
+#else
+	return NULL;
+#endif
+}
+
+extern struct netdev_queue *dev_ingress_queue_create(struct net_device *dev);
+
 /*
  * Net namespace inlines
  */
diff --git a/net/core/dev.c b/net/core/dev.c
index a313bab..1e68259 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2702,11 +2702,10 @@ EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
  * the ingress scheduler, you just cant add policies on ingress.
  *
  */
-static int ing_filter(struct sk_buff *skb)
+static int ing_filter(struct sk_buff *skb, struct netdev_queue *rxq)
 {
 	struct net_device *dev = skb->dev;
 	u32 ttl = G_TC_RTTL(skb->tc_verd);
-	struct netdev_queue *rxq;
 	int result = TC_ACT_OK;
 	struct Qdisc *q;
 
@@ -2720,8 +2719,6 @@ static int ing_filter(struct sk_buff *skb)
 	skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl);
 	skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
 
-	rxq = &dev->ingress_queue;
-
 	q = rxq->qdisc;
 	if (q != &noop_qdisc) {
 		spin_lock(qdisc_lock(q));
@@ -2737,7 +2734,9 @@ static inline struct sk_buff *handle_ing(struct sk_buff *skb,
 					 struct packet_type **pt_prev,
 					 int *ret, struct net_device *orig_dev)
 {
-	if (skb->dev->ingress_queue.qdisc == &noop_qdisc)
+	struct netdev_queue *rxq = dev_ingress_queue(skb->dev);
+
+	if (!rxq || rxq->qdisc == &noop_qdisc)
 		goto out;
 
 	if (*pt_prev) {
@@ -2745,7 +2744,7 @@ static inline struct sk_buff *handle_ing(struct sk_buff *skb,
 		*pt_prev = NULL;
 	}
 
-	switch (ing_filter(skb)) {
+	switch (ing_filter(skb, rxq)) {
 	case TC_ACT_SHOT:
 	case TC_ACT_STOLEN:
 		kfree_skb(skb);
@@ -4932,15 +4931,17 @@ static void __netdev_init_queue_locks_one(struct net_device *dev,
 					  struct netdev_queue *dev_queue,
 					  void *_unused)
 {
-	spin_lock_init(&dev_queue->_xmit_lock);
-	netdev_set_xmit_lockdep_class(&dev_queue->_xmit_lock, dev->type);
-	dev_queue->xmit_lock_owner = -1;
+	if (dev_queue) {
+		spin_lock_init(&dev_queue->_xmit_lock);
+		netdev_set_xmit_lockdep_class(&dev_queue->_xmit_lock, dev->type);
+		dev_queue->xmit_lock_owner = -1;
+	}
 }
 
 static void netdev_init_queue_locks(struct net_device *dev)
 {
 	netdev_for_each_tx_queue(dev, __netdev_init_queue_locks_one, NULL);
-	__netdev_init_queue_locks_one(dev, &dev->ingress_queue, NULL);
+	__netdev_init_queue_locks_one(dev, dev_ingress_queue(dev), NULL);
 }
 
 unsigned long netdev_fix_features(unsigned long features, const char *name)
@@ -5447,16 +5448,37 @@ static void netdev_init_one_queue(struct net_device *dev,
 				  struct netdev_queue *queue,
 				  void *_unused)
 {
-	queue->dev = dev;
+	if (queue)
+		queue->dev = dev;
 }
 
 static void netdev_init_queues(struct net_device *dev)
 {
-	netdev_init_one_queue(dev, &dev->ingress_queue, NULL);
+	netdev_init_one_queue(dev, dev_ingress_queue(dev), NULL);
 	netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
 	spin_lock_init(&dev->tx_global_lock);
 }
 
+struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
+{
+	struct netdev_queue *queue = NULL;
+
+#ifdef CONFIG_NET_CLS_ACT
+	if (dev->ingress_queue)
+		return dev->ingress_queue;
+	queue = kzalloc(sizeof(*queue), GFP_KERNEL);
+	if (!queue)
+		return NULL;
+	netdev_init_one_queue(dev, queue, NULL);
+	__netdev_init_queue_locks_one(dev, queue, NULL);
+	queue->qdisc = &noop_qdisc;
+	queue->qdisc_sleeping = &noop_qdisc;
+	smp_wmb();
+	dev->ingress_queue = queue;
+#endif
+	return queue;
+}
+
 /**
  *	alloc_netdev_mq - allocate network device
  *	@sizeof_priv:	size of private data to allocate space for
@@ -5559,6 +5581,8 @@ void free_netdev(struct net_device *dev)
 
 	kfree(dev->_tx);
 
+	kfree(dev_ingress_queue(dev));
+
 	/* Flush device addresses */
 	dev_addr_flush(dev);
 
diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
index b802078..8635110 100644
--- a/net/sched/sch_api.c
+++ b/net/sched/sch_api.c
@@ -240,7 +240,10 @@ struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle)
 	if (q)
 		goto out;
 
-	q = qdisc_match_from_root(dev->ingress_queue.qdisc_sleeping, handle);
+	if (!dev_ingress_queue(dev))
+		goto out;
+	q = qdisc_match_from_root(dev_ingress_queue(dev)->qdisc_sleeping,
+				  handle);
 out:
 	return q;
 }
@@ -690,6 +693,8 @@ static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
 		    (new && new->flags & TCQ_F_INGRESS)) {
 			num_q = 1;
 			ingress = 1;
+			if (!dev_ingress_queue(dev))
+				return -ENOENT;
 		}
 
 		if (dev->flags & IFF_UP)
@@ -701,7 +706,7 @@ static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
 		}
 
 		for (i = 0; i < num_q; i++) {
-			struct netdev_queue *dev_queue = &dev->ingress_queue;
+			struct netdev_queue *dev_queue = dev_ingress_queue(dev);
 
 			if (!ingress)
 				dev_queue = netdev_get_tx_queue(dev, i);
@@ -979,7 +984,8 @@ static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
 					return -ENOENT;
 				q = qdisc_leaf(p, clid);
 			} else { /* ingress */
-				q = dev->ingress_queue.qdisc_sleeping;
+				if (dev_ingress_queue(dev))
+					q = dev_ingress_queue(dev)->qdisc_sleeping;
 			}
 		} else {
 			q = dev->qdisc;
@@ -1044,7 +1050,8 @@ replay:
 					return -ENOENT;
 				q = qdisc_leaf(p, clid);
 			} else { /*ingress */
-				q = dev->ingress_queue.qdisc_sleeping;
+				if (dev_ingress_queue_create(dev))
+					q = dev_ingress_queue(dev)->qdisc_sleeping;
 			}
 		} else {
 			q = dev->qdisc;
@@ -1123,11 +1130,14 @@ replay:
 create_n_graft:
 	if (!(n->nlmsg_flags&NLM_F_CREATE))
 		return -ENOENT;
-	if (clid == TC_H_INGRESS)
-		q = qdisc_create(dev, &dev->ingress_queue, p,
-				 tcm->tcm_parent, tcm->tcm_parent,
-				 tca, &err);
-	else {
+	if (clid == TC_H_INGRESS) {
+		if (dev_ingress_queue(dev))
+			q = qdisc_create(dev, dev_ingress_queue(dev), p,
+					 tcm->tcm_parent, tcm->tcm_parent,
+					 tca, &err);
+		else
+			err = -ENOENT;
+	} else {
 		struct netdev_queue *dev_queue;
 
 		if (p && p->ops->cl_ops && p->ops->cl_ops->select_queue)
@@ -1304,8 +1314,10 @@ static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb)
 		if (tc_dump_qdisc_root(dev->qdisc, skb, cb, &q_idx, s_q_idx) < 0)
 			goto done;
 
-		dev_queue = &dev->ingress_queue;
-		if (tc_dump_qdisc_root(dev_queue->qdisc_sleeping, skb, cb, &q_idx, s_q_idx) < 0)
+		dev_queue = dev_ingress_queue(dev);
+		if (dev_queue &&
+		    tc_dump_qdisc_root(dev_queue->qdisc_sleeping, skb, cb,
+				       &q_idx, s_q_idx) < 0)
 			goto done;
 
 cont:
@@ -1595,8 +1607,10 @@ static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb)
 	if (tc_dump_tclass_root(dev->qdisc, skb, tcm, cb, &t, s_t) < 0)
 		goto done;
 
-	dev_queue = &dev->ingress_queue;
-	if (tc_dump_tclass_root(dev_queue->qdisc_sleeping, skb, tcm, cb, &t, s_t) < 0)
+	dev_queue = dev_ingress_queue(dev);
+	if (dev_queue &&
+	    tc_dump_tclass_root(dev_queue->qdisc_sleeping, skb, tcm, cb,
+				&t, s_t) < 0)
 		goto done;
 
 done:
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index 545278a..c42dec5 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -721,16 +721,18 @@ static void transition_one_qdisc(struct net_device *dev,
 				 struct netdev_queue *dev_queue,
 				 void *_need_watchdog)
 {
-	struct Qdisc *new_qdisc = dev_queue->qdisc_sleeping;
-	int *need_watchdog_p = _need_watchdog;
+	if (dev_queue) {
+		struct Qdisc *new_qdisc = dev_queue->qdisc_sleeping;
+		int *need_watchdog_p = _need_watchdog;
 
-	if (!(new_qdisc->flags & TCQ_F_BUILTIN))
-		clear_bit(__QDISC_STATE_DEACTIVATED, &new_qdisc->state);
+		if (!(new_qdisc->flags & TCQ_F_BUILTIN))
+			clear_bit(__QDISC_STATE_DEACTIVATED, &new_qdisc->state);
 
-	rcu_assign_pointer(dev_queue->qdisc, new_qdisc);
-	if (need_watchdog_p && new_qdisc != &noqueue_qdisc) {
-		dev_queue->trans_start = 0;
-		*need_watchdog_p = 1;
+		rcu_assign_pointer(dev_queue->qdisc, new_qdisc);
+		if (need_watchdog_p && new_qdisc != &noqueue_qdisc) {
+			dev_queue->trans_start = 0;
+			*need_watchdog_p = 1;
+		}
 	}
 }
 
@@ -753,7 +755,7 @@ void dev_activate(struct net_device *dev)
 
 	need_watchdog = 0;
 	netdev_for_each_tx_queue(dev, transition_one_qdisc, &need_watchdog);
-	transition_one_qdisc(dev, &dev->ingress_queue, NULL);
+	transition_one_qdisc(dev, dev_ingress_queue(dev), NULL);
 
 	if (need_watchdog) {
 		dev->trans_start = jiffies;
@@ -768,7 +770,7 @@ static void dev_deactivate_queue(struct net_device *dev,
 	struct Qdisc *qdisc_default = _qdisc_default;
 	struct Qdisc *qdisc;
 
-	qdisc = dev_queue->qdisc;
+	qdisc = dev_queue ? dev_queue->qdisc : NULL;
 	if (qdisc) {
 		spin_lock_bh(qdisc_lock(qdisc));
 
@@ -812,7 +814,7 @@ static bool some_qdisc_is_busy(struct net_device *dev)
 void dev_deactivate(struct net_device *dev)
 {
 	netdev_for_each_tx_queue(dev, dev_deactivate_queue, &noop_qdisc);
-	dev_deactivate_queue(dev, &dev->ingress_queue, &noop_qdisc);
+	dev_deactivate_queue(dev, dev_ingress_queue(dev), &noop_qdisc);
 
 	dev_watchdog_down(dev);
 
@@ -830,15 +832,17 @@ static void dev_init_scheduler_queue(struct net_device *dev,
 {
 	struct Qdisc *qdisc = _qdisc;
 
-	dev_queue->qdisc = qdisc;
-	dev_queue->qdisc_sleeping = qdisc;
+	if (dev_queue) {
+		dev_queue->qdisc = qdisc;
+		dev_queue->qdisc_sleeping = qdisc;
+	}
 }
 
 void dev_init_scheduler(struct net_device *dev)
 {
 	dev->qdisc = &noop_qdisc;
 	netdev_for_each_tx_queue(dev, dev_init_scheduler_queue, &noop_qdisc);
-	dev_init_scheduler_queue(dev, &dev->ingress_queue, &noop_qdisc);
+	dev_init_scheduler_queue(dev, dev_ingress_queue(dev), &noop_qdisc);
 
 	setup_timer(&dev->watchdog_timer, dev_watchdog, (unsigned long)dev);
 }
@@ -847,7 +851,7 @@ static void shutdown_scheduler_queue(struct net_device *dev,
 				     struct netdev_queue *dev_queue,
 				     void *_qdisc_default)
 {
-	struct Qdisc *qdisc = dev_queue->qdisc_sleeping;
+	struct Qdisc *qdisc = dev_queue ? dev_queue->qdisc_sleeping : NULL;
 	struct Qdisc *qdisc_default = _qdisc_default;
 
 	if (qdisc) {
@@ -861,7 +865,7 @@ static void shutdown_scheduler_queue(struct net_device *dev,
 void dev_shutdown(struct net_device *dev)
 {
 	netdev_for_each_tx_queue(dev, shutdown_scheduler_queue, &noop_qdisc);
-	shutdown_scheduler_queue(dev, &dev->ingress_queue, &noop_qdisc);
+	shutdown_scheduler_queue(dev, dev_ingress_queue(dev), &noop_qdisc);
 	qdisc_destroy(dev->qdisc);
 	dev->qdisc = &noop_qdisc;
 



^ permalink raw reply related	[flat|nested] 23+ messages in thread

* Re: [PATCH net-next2.6] net: dynamic ingress_queue allocation
  2010-09-30 22:58     ` [PATCH net-next2.6] net: dynamic ingress_queue allocation Eric Dumazet
@ 2010-10-01 11:45       ` jamal
  2010-10-01 13:56         ` [PATCH net-next V2] " Eric Dumazet
  0 siblings, 1 reply; 23+ messages in thread
From: jamal @ 2010-10-01 11:45 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: Jarek Poplawski, David Miller, netdev

On Fri, 2010-10-01 at 00:58 +0200, Eric Dumazet wrote:
> Hi Jamal
> 
> Here is the dynamic allocation I promised. I lightly tested it, could
> you review it please ?
> Thanks !
> 
> [PATCH net-next2.6] net: dynamic ingress_queue allocation
> 
> ingress being not used very much, and net_device->ingress_queue being
> quite a big object (128 or 256 bytes), use a dynamic allocation if
> needed (tc qdisc add dev eth0 ingress ...)

I agree with the principle that it is valuable in making it dynamic for
people who dont want it; but, but (like my kid would say, sniff, sniff)
you are making me sad saying it is not used very much ;-> It is used
very much in my world. My friend Jarek uses it;->

> +#ifdef CONFIG_NET_CLS_ACT

I think appropriately this should be NET_SCH_INGRESS (everywhere else as
well).

> +static inline struct netdev_queue *dev_ingress_queue(struct net_device *dev)
> +{
> +#ifdef CONFIG_NET_CLS_ACT
> +	return dev->ingress_queue;
> +#else
> +	return NULL;
> +#endif

Above, if you just returned dev->ingress_queue wouldnt it always be 
NULL if it was not allocated?

> @@ -2737,7 +2734,9 @@ static inline struct sk_buff *handle_ing(struct sk_buff *skb,
>  					 struct packet_type **pt_prev,
>  					 int *ret, struct net_device *orig_dev)
>  {
> -	if (skb->dev->ingress_queue.qdisc == &noop_qdisc)
> +	struct netdev_queue *rxq = dev_ingress_queue(skb->dev);
> +
> +	if (!rxq || rxq->qdisc == &noop_qdisc)
>  		goto out;

I stared at above a little longer since this is the only fast path
affected; is it a few more cycles now for people who love ingress?

> @@ -690,6 +693,8 @@ static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
>  		    (new && new->flags & TCQ_F_INGRESS)) {
>  			num_q = 1;
>  			ingress = 1;
> +			if (!dev_ingress_queue(dev))
> +				return -ENOENT;
>  		}
>  

The above looks clever but worries me because it changes the old flow.
If you have time,  the following tests will alleviate my fears

1) compile support for ingress and add/delete ingress qdisc
2) Dont compile support and add/delete ingress qdisc
3) Compile ingress as a module and add/delete ingress qdisc

Other than that excellent work Eric. And you can add my
Acked/reviewed-by etc.

BTW, did i say i like your per-cpu stats stuff? It applies nicely to
qdiscs, actions etc ;->

cheers,
jamal

^ permalink raw reply	[flat|nested] 23+ messages in thread

* [PATCH net-next V2] net: dynamic ingress_queue allocation
  2010-10-01 11:45       ` jamal
@ 2010-10-01 13:56         ` Eric Dumazet
  2010-10-02  9:32           ` Jarek Poplawski
  2010-10-02 12:10           ` [PATCH net-next V2] " jamal
  0 siblings, 2 replies; 23+ messages in thread
From: Eric Dumazet @ 2010-10-01 13:56 UTC (permalink / raw)
  To: hadi; +Cc: Jarek Poplawski, David Miller, netdev

Le vendredi 01 octobre 2010 à 07:45 -0400, jamal a écrit :
> On Fri, 2010-10-01 at 00:58 +0200, Eric Dumazet wrote:
> > Hi Jamal
> > 
> > Here is the dynamic allocation I promised. I lightly tested it, could
> > you review it please ?
> > Thanks !
> > 
> > [PATCH net-next2.6] net: dynamic ingress_queue allocation
> > 
> > ingress being not used very much, and net_device->ingress_queue being
> > quite a big object (128 or 256 bytes), use a dynamic allocation if
> > needed (tc qdisc add dev eth0 ingress ...)
> 
> I agree with the principle that it is valuable in making it dynamic for
> people who dont want it; but, but (like my kid would say, sniff, sniff)
> you are making me sad saying it is not used very much ;-> It is used
> very much in my world. My friend Jarek uses it;->
> 

;)

> 
> > +#ifdef CONFIG_NET_CLS_ACT
> 
> I think appropriately this should be NET_SCH_INGRESS (everywhere else as
> well).
> 

I first thought of this, and found it would add a new dependence on
vmlinux :

If someone wants to add NET_SCH_INGRESS module, he would need to
recompile whole kernel and reboot.

This is probably why ing_filter() and handle_ing() are enclosed with
CONFIG_NET_CLS_ACT, not CONFIG_NET_SCH_INGRESS.

Since struct net_dev only holds one pointer (after this patch), I
believe its better to use same dependence.

> 
> > +static inline struct netdev_queue *dev_ingress_queue(struct net_device *dev)
> > +{
> > +#ifdef CONFIG_NET_CLS_ACT
> > +	return dev->ingress_queue;
> > +#else
> > +	return NULL;
> > +#endif
> 
> Above, if you just returned dev->ingress_queue wouldnt it always be 
> NULL if it was not allocated?
> 

ingress_queue is not defined in "struct net_device *dev" if 
!CONFIG_NET_CLS_ACT

Returning NULL here permits dead code elimination by compiler.

Then, probably nobody unset CONFIG_NET_CLS_ACT, so we can probably avoid
this preprocessor stuff.

> 
> > @@ -2737,7 +2734,9 @@ static inline struct sk_buff *handle_ing(struct sk_buff *skb,
> >  					 struct packet_type **pt_prev,
> >  					 int *ret, struct net_device *orig_dev)
> >  {
> > -	if (skb->dev->ingress_queue.qdisc == &noop_qdisc)
> > +	struct netdev_queue *rxq = dev_ingress_queue(skb->dev);
> > +
> > +	if (!rxq || rxq->qdisc == &noop_qdisc)
> >  		goto out;
> 
> I stared at above a little longer since this is the only fast path
> affected; is it a few more cycles now for people who love ingress?

I see, this adds an indirection and a conditional branch, but this
should be in cpu cache and well predicted.

I thought adding a fake "struct netdev_queue" object, with a qdisc
pointing to noop_qdisc. But this would slow down a bit non ingress
users ;)

For people not using ingress, my solution removes an access to an extra
cache line. So network latency is improved a bit when cpu caches are
full of user land data.

> 
> > @@ -690,6 +693,8 @@ static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
> >  		    (new && new->flags & TCQ_F_INGRESS)) {
> >  			num_q = 1;
> >  			ingress = 1;
> > +			if (!dev_ingress_queue(dev))
> > +				return -ENOENT;
> >  		}
> >  
> 
> The above looks clever but worries me because it changes the old flow.
> If you have time,  the following tests will alleviate my fears
> 
> 1) compile support for ingress and add/delete ingress qdisc

This worked for me, but I dont know complex setups.

> 2) Dont compile support and add/delete ingress qdisc

tc gives an error (a bit like !CONFIG_NET_SCH_INGRESS)

# tc qdisc add dev eth0 ingress
RTNETLINK answers: No such file or directory
# tc -s -d qdisc show dev eth0
qdisc mq 0: root 
 Sent 636 bytes 10 pkt (dropped 0, overlimits 0 requeues 0) 
 backlog 0b 0p requeues 0 


> 3) Compile ingress as a module and add/delete ingress qdisc
> 
> 

Seems to work like 1)

> Other than that excellent work Eric. And you can add my
> Acked/reviewed-by etc.
> 
> BTW, did i say i like your per-cpu stats stuff? It applies nicely to
> qdiscs, actions etc ;->

I took a look at ifb as suggested by Stephen but could not see trivial
changes (LLTX or per-cpu stats), since central lock is needed I am
afraid. And qdisc are the same, stats updates are mostly free as we
dirtied cache line for the lock.

Thanks Jamal !

Here is the V2, with two #ifdef removed.


[PATCH net-next V2] net: dynamic ingress_queue allocation

ingress being not used very much, and net_device->ingress_queue being
quite a big object (128 or 256 bytes), use a dynamic allocation if
needed (tc qdisc add dev eth0 ingress ...)

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
---
 include/linux/netdevice.h |   11 ++++++--
 net/core/dev.c            |   48 +++++++++++++++++++++++++++---------
 net/sched/sch_api.c       |   40 ++++++++++++++++++++----------
 net/sched/sch_generic.c   |   36 +++++++++++++++------------
 4 files changed, 92 insertions(+), 43 deletions(-)

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index ceed347..4f86009 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -986,8 +986,7 @@ struct net_device {
 	rx_handler_func_t	*rx_handler;
 	void			*rx_handler_data;
 
-	struct netdev_queue	ingress_queue; /* use two cache lines */
-
+	struct netdev_queue	*ingress_queue;
 /*
  * Cache lines mostly used on transmit path
  */
@@ -1115,6 +1114,14 @@ static inline void netdev_for_each_tx_queue(struct net_device *dev,
 		f(dev, &dev->_tx[i], arg);
 }
 
+
+static inline struct netdev_queue *dev_ingress_queue(struct net_device *dev)
+{
+	return dev->ingress_queue;
+}
+
+extern struct netdev_queue *dev_ingress_queue_create(struct net_device *dev);
+
 /*
  * Net namespace inlines
  */
diff --git a/net/core/dev.c b/net/core/dev.c
index a313bab..e3bb8c9 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2702,11 +2702,10 @@ EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
  * the ingress scheduler, you just cant add policies on ingress.
  *
  */
-static int ing_filter(struct sk_buff *skb)
+static int ing_filter(struct sk_buff *skb, struct netdev_queue *rxq)
 {
 	struct net_device *dev = skb->dev;
 	u32 ttl = G_TC_RTTL(skb->tc_verd);
-	struct netdev_queue *rxq;
 	int result = TC_ACT_OK;
 	struct Qdisc *q;
 
@@ -2720,8 +2719,6 @@ static int ing_filter(struct sk_buff *skb)
 	skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl);
 	skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
 
-	rxq = &dev->ingress_queue;
-
 	q = rxq->qdisc;
 	if (q != &noop_qdisc) {
 		spin_lock(qdisc_lock(q));
@@ -2737,7 +2734,9 @@ static inline struct sk_buff *handle_ing(struct sk_buff *skb,
 					 struct packet_type **pt_prev,
 					 int *ret, struct net_device *orig_dev)
 {
-	if (skb->dev->ingress_queue.qdisc == &noop_qdisc)
+	struct netdev_queue *rxq = dev_ingress_queue(skb->dev);
+
+	if (!rxq || rxq->qdisc == &noop_qdisc)
 		goto out;
 
 	if (*pt_prev) {
@@ -2745,7 +2744,7 @@ static inline struct sk_buff *handle_ing(struct sk_buff *skb,
 		*pt_prev = NULL;
 	}
 
-	switch (ing_filter(skb)) {
+	switch (ing_filter(skb, rxq)) {
 	case TC_ACT_SHOT:
 	case TC_ACT_STOLEN:
 		kfree_skb(skb);
@@ -4932,15 +4931,17 @@ static void __netdev_init_queue_locks_one(struct net_device *dev,
 					  struct netdev_queue *dev_queue,
 					  void *_unused)
 {
-	spin_lock_init(&dev_queue->_xmit_lock);
-	netdev_set_xmit_lockdep_class(&dev_queue->_xmit_lock, dev->type);
-	dev_queue->xmit_lock_owner = -1;
+	if (dev_queue) {
+		spin_lock_init(&dev_queue->_xmit_lock);
+		netdev_set_xmit_lockdep_class(&dev_queue->_xmit_lock, dev->type);
+		dev_queue->xmit_lock_owner = -1;
+	}
 }
 
 static void netdev_init_queue_locks(struct net_device *dev)
 {
 	netdev_for_each_tx_queue(dev, __netdev_init_queue_locks_one, NULL);
-	__netdev_init_queue_locks_one(dev, &dev->ingress_queue, NULL);
+	__netdev_init_queue_locks_one(dev, dev_ingress_queue(dev), NULL);
 }
 
 unsigned long netdev_fix_features(unsigned long features, const char *name)
@@ -5447,16 +5448,37 @@ static void netdev_init_one_queue(struct net_device *dev,
 				  struct netdev_queue *queue,
 				  void *_unused)
 {
-	queue->dev = dev;
+	if (queue)
+		queue->dev = dev;
 }
 
 static void netdev_init_queues(struct net_device *dev)
 {
-	netdev_init_one_queue(dev, &dev->ingress_queue, NULL);
+	netdev_init_one_queue(dev, dev_ingress_queue(dev), NULL);
 	netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
 	spin_lock_init(&dev->tx_global_lock);
 }
 
+struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
+{
+	struct netdev_queue *queue = dev_ingress_queue(dev);
+
+#ifdef CONFIG_NET_CLS_ACT
+	if (queue)
+		return queue;
+	queue = kzalloc(sizeof(*queue), GFP_KERNEL);
+	if (!queue)
+		return NULL;
+	netdev_init_one_queue(dev, queue, NULL);
+	__netdev_init_queue_locks_one(dev, queue, NULL);
+	queue->qdisc = &noop_qdisc;
+	queue->qdisc_sleeping = &noop_qdisc;
+	smp_wmb();
+	dev->ingress_queue = queue;
+#endif
+	return queue;
+}
+
 /**
  *	alloc_netdev_mq - allocate network device
  *	@sizeof_priv:	size of private data to allocate space for
@@ -5559,6 +5581,8 @@ void free_netdev(struct net_device *dev)
 
 	kfree(dev->_tx);
 
+	kfree(dev_ingress_queue(dev));
+
 	/* Flush device addresses */
 	dev_addr_flush(dev);
 
diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
index b802078..8635110 100644
--- a/net/sched/sch_api.c
+++ b/net/sched/sch_api.c
@@ -240,7 +240,10 @@ struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle)
 	if (q)
 		goto out;
 
-	q = qdisc_match_from_root(dev->ingress_queue.qdisc_sleeping, handle);
+	if (!dev_ingress_queue(dev))
+		goto out;
+	q = qdisc_match_from_root(dev_ingress_queue(dev)->qdisc_sleeping,
+				  handle);
 out:
 	return q;
 }
@@ -690,6 +693,8 @@ static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
 		    (new && new->flags & TCQ_F_INGRESS)) {
 			num_q = 1;
 			ingress = 1;
+			if (!dev_ingress_queue(dev))
+				return -ENOENT;
 		}
 
 		if (dev->flags & IFF_UP)
@@ -701,7 +706,7 @@ static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
 		}
 
 		for (i = 0; i < num_q; i++) {
-			struct netdev_queue *dev_queue = &dev->ingress_queue;
+			struct netdev_queue *dev_queue = dev_ingress_queue(dev);
 
 			if (!ingress)
 				dev_queue = netdev_get_tx_queue(dev, i);
@@ -979,7 +984,8 @@ static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
 					return -ENOENT;
 				q = qdisc_leaf(p, clid);
 			} else { /* ingress */
-				q = dev->ingress_queue.qdisc_sleeping;
+				if (dev_ingress_queue(dev))
+					q = dev_ingress_queue(dev)->qdisc_sleeping;
 			}
 		} else {
 			q = dev->qdisc;
@@ -1044,7 +1050,8 @@ replay:
 					return -ENOENT;
 				q = qdisc_leaf(p, clid);
 			} else { /*ingress */
-				q = dev->ingress_queue.qdisc_sleeping;
+				if (dev_ingress_queue_create(dev))
+					q = dev_ingress_queue(dev)->qdisc_sleeping;
 			}
 		} else {
 			q = dev->qdisc;
@@ -1123,11 +1130,14 @@ replay:
 create_n_graft:
 	if (!(n->nlmsg_flags&NLM_F_CREATE))
 		return -ENOENT;
-	if (clid == TC_H_INGRESS)
-		q = qdisc_create(dev, &dev->ingress_queue, p,
-				 tcm->tcm_parent, tcm->tcm_parent,
-				 tca, &err);
-	else {
+	if (clid == TC_H_INGRESS) {
+		if (dev_ingress_queue(dev))
+			q = qdisc_create(dev, dev_ingress_queue(dev), p,
+					 tcm->tcm_parent, tcm->tcm_parent,
+					 tca, &err);
+		else
+			err = -ENOENT;
+	} else {
 		struct netdev_queue *dev_queue;
 
 		if (p && p->ops->cl_ops && p->ops->cl_ops->select_queue)
@@ -1304,8 +1314,10 @@ static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb)
 		if (tc_dump_qdisc_root(dev->qdisc, skb, cb, &q_idx, s_q_idx) < 0)
 			goto done;
 
-		dev_queue = &dev->ingress_queue;
-		if (tc_dump_qdisc_root(dev_queue->qdisc_sleeping, skb, cb, &q_idx, s_q_idx) < 0)
+		dev_queue = dev_ingress_queue(dev);
+		if (dev_queue &&
+		    tc_dump_qdisc_root(dev_queue->qdisc_sleeping, skb, cb,
+				       &q_idx, s_q_idx) < 0)
 			goto done;
 
 cont:
@@ -1595,8 +1607,10 @@ static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb)
 	if (tc_dump_tclass_root(dev->qdisc, skb, tcm, cb, &t, s_t) < 0)
 		goto done;
 
-	dev_queue = &dev->ingress_queue;
-	if (tc_dump_tclass_root(dev_queue->qdisc_sleeping, skb, tcm, cb, &t, s_t) < 0)
+	dev_queue = dev_ingress_queue(dev);
+	if (dev_queue &&
+	    tc_dump_tclass_root(dev_queue->qdisc_sleeping, skb, tcm, cb,
+				&t, s_t) < 0)
 		goto done;
 
 done:
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index 545278a..c42dec5 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -721,16 +721,18 @@ static void transition_one_qdisc(struct net_device *dev,
 				 struct netdev_queue *dev_queue,
 				 void *_need_watchdog)
 {
-	struct Qdisc *new_qdisc = dev_queue->qdisc_sleeping;
-	int *need_watchdog_p = _need_watchdog;
+	if (dev_queue) {
+		struct Qdisc *new_qdisc = dev_queue->qdisc_sleeping;
+		int *need_watchdog_p = _need_watchdog;
 
-	if (!(new_qdisc->flags & TCQ_F_BUILTIN))
-		clear_bit(__QDISC_STATE_DEACTIVATED, &new_qdisc->state);
+		if (!(new_qdisc->flags & TCQ_F_BUILTIN))
+			clear_bit(__QDISC_STATE_DEACTIVATED, &new_qdisc->state);
 
-	rcu_assign_pointer(dev_queue->qdisc, new_qdisc);
-	if (need_watchdog_p && new_qdisc != &noqueue_qdisc) {
-		dev_queue->trans_start = 0;
-		*need_watchdog_p = 1;
+		rcu_assign_pointer(dev_queue->qdisc, new_qdisc);
+		if (need_watchdog_p && new_qdisc != &noqueue_qdisc) {
+			dev_queue->trans_start = 0;
+			*need_watchdog_p = 1;
+		}
 	}
 }
 
@@ -753,7 +755,7 @@ void dev_activate(struct net_device *dev)
 
 	need_watchdog = 0;
 	netdev_for_each_tx_queue(dev, transition_one_qdisc, &need_watchdog);
-	transition_one_qdisc(dev, &dev->ingress_queue, NULL);
+	transition_one_qdisc(dev, dev_ingress_queue(dev), NULL);
 
 	if (need_watchdog) {
 		dev->trans_start = jiffies;
@@ -768,7 +770,7 @@ static void dev_deactivate_queue(struct net_device *dev,
 	struct Qdisc *qdisc_default = _qdisc_default;
 	struct Qdisc *qdisc;
 
-	qdisc = dev_queue->qdisc;
+	qdisc = dev_queue ? dev_queue->qdisc : NULL;
 	if (qdisc) {
 		spin_lock_bh(qdisc_lock(qdisc));
 
@@ -812,7 +814,7 @@ static bool some_qdisc_is_busy(struct net_device *dev)
 void dev_deactivate(struct net_device *dev)
 {
 	netdev_for_each_tx_queue(dev, dev_deactivate_queue, &noop_qdisc);
-	dev_deactivate_queue(dev, &dev->ingress_queue, &noop_qdisc);
+	dev_deactivate_queue(dev, dev_ingress_queue(dev), &noop_qdisc);
 
 	dev_watchdog_down(dev);
 
@@ -830,15 +832,17 @@ static void dev_init_scheduler_queue(struct net_device *dev,
 {
 	struct Qdisc *qdisc = _qdisc;
 
-	dev_queue->qdisc = qdisc;
-	dev_queue->qdisc_sleeping = qdisc;
+	if (dev_queue) {
+		dev_queue->qdisc = qdisc;
+		dev_queue->qdisc_sleeping = qdisc;
+	}
 }
 
 void dev_init_scheduler(struct net_device *dev)
 {
 	dev->qdisc = &noop_qdisc;
 	netdev_for_each_tx_queue(dev, dev_init_scheduler_queue, &noop_qdisc);
-	dev_init_scheduler_queue(dev, &dev->ingress_queue, &noop_qdisc);
+	dev_init_scheduler_queue(dev, dev_ingress_queue(dev), &noop_qdisc);
 
 	setup_timer(&dev->watchdog_timer, dev_watchdog, (unsigned long)dev);
 }
@@ -847,7 +851,7 @@ static void shutdown_scheduler_queue(struct net_device *dev,
 				     struct netdev_queue *dev_queue,
 				     void *_qdisc_default)
 {
-	struct Qdisc *qdisc = dev_queue->qdisc_sleeping;
+	struct Qdisc *qdisc = dev_queue ? dev_queue->qdisc_sleeping : NULL;
 	struct Qdisc *qdisc_default = _qdisc_default;
 
 	if (qdisc) {
@@ -861,7 +865,7 @@ static void shutdown_scheduler_queue(struct net_device *dev,
 void dev_shutdown(struct net_device *dev)
 {
 	netdev_for_each_tx_queue(dev, shutdown_scheduler_queue, &noop_qdisc);
-	shutdown_scheduler_queue(dev, &dev->ingress_queue, &noop_qdisc);
+	shutdown_scheduler_queue(dev, dev_ingress_queue(dev), &noop_qdisc);
 	qdisc_destroy(dev->qdisc);
 	dev->qdisc = &noop_qdisc;
 



^ permalink raw reply related	[flat|nested] 23+ messages in thread

* Re: [PATCH net-next V2] net: dynamic ingress_queue allocation
  2010-10-01 13:56         ` [PATCH net-next V2] " Eric Dumazet
@ 2010-10-02  9:32           ` Jarek Poplawski
  2010-10-02 16:11             ` [PATCH net-next V3] " Eric Dumazet
  2010-10-02 12:10           ` [PATCH net-next V2] " jamal
  1 sibling, 1 reply; 23+ messages in thread
From: Jarek Poplawski @ 2010-10-02  9:32 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: hadi, David Miller, netdev

On Fri, Oct 01, 2010 at 03:56:28PM +0200, Eric Dumazet wrote:
> Le vendredi 01 octobre 2010 ?? 07:45 -0400, jamal a écrit :
> > On Fri, 2010-10-01 at 00:58 +0200, Eric Dumazet wrote:
> > > Hi Jamal
> > > 
> > > Here is the dynamic allocation I promised. I lightly tested it, could
> > > you review it please ?
> > > Thanks !
> > > 
> > > [PATCH net-next2.6] net: dynamic ingress_queue allocation
> > > 
> > > ingress being not used very much, and net_device->ingress_queue being
> > > quite a big object (128 or 256 bytes), use a dynamic allocation if
> > > needed (tc qdisc add dev eth0 ingress ...)
> > 
> > I agree with the principle that it is valuable in making it dynamic for
> > people who dont want it; but, but (like my kid would say, sniff, sniff)
> > you are making me sad saying it is not used very much ;-> It is used
> > very much in my world. My friend Jarek uses it;->

Thanks Jamal, my friend, I'm really glad! (And sad ;-)

> 
> ;)
> 
> > 
> > > +#ifdef CONFIG_NET_CLS_ACT
> > 
> > I think appropriately this should be NET_SCH_INGRESS (everywhere else as
> > well).
> > 
> 
> I first thought of this, and found it would add a new dependence on
> vmlinux :
> 
> If someone wants to add NET_SCH_INGRESS module, he would need to
> recompile whole kernel and reboot.
> 
> This is probably why ing_filter() and handle_ing() are enclosed with
> CONFIG_NET_CLS_ACT, not CONFIG_NET_SCH_INGRESS.
> 
> Since struct net_dev only holds one pointer (after this patch), I
> believe its better to use same dependence.
> 
> > 
> > > +static inline struct netdev_queue *dev_ingress_queue(struct net_device *dev)
> > > +{
> > > +#ifdef CONFIG_NET_CLS_ACT
> > > +	return dev->ingress_queue;
> > > +#else
> > > +	return NULL;
> > > +#endif
> > 
> > Above, if you just returned dev->ingress_queue wouldnt it always be 
> > NULL if it was not allocated?
> > 
> 
> ingress_queue is not defined in "struct net_device *dev" if 
> !CONFIG_NET_CLS_ACT
> 
> Returning NULL here permits dead code elimination by compiler.
> 
> Then, probably nobody unset CONFIG_NET_CLS_ACT, so we can probably avoid
> this preprocessor stuff.
> 
> > 
> > > @@ -2737,7 +2734,9 @@ static inline struct sk_buff *handle_ing(struct sk_buff *skb,
> > >  					 struct packet_type **pt_prev,
> > >  					 int *ret, struct net_device *orig_dev)
> > >  {
> > > -	if (skb->dev->ingress_queue.qdisc == &noop_qdisc)
> > > +	struct netdev_queue *rxq = dev_ingress_queue(skb->dev);
> > > +
> > > +	if (!rxq || rxq->qdisc == &noop_qdisc)
> > >  		goto out;
> > 
> > I stared at above a little longer since this is the only fast path
> > affected; is it a few more cycles now for people who love ingress?
> 
> I see, this adds an indirection and a conditional branch, but this
> should be in cpu cache and well predicted.
> 
> I thought adding a fake "struct netdev_queue" object, with a qdisc
> pointing to noop_qdisc. But this would slow down a bit non ingress
> users ;)
> 
> For people not using ingress, my solution removes an access to an extra
> cache line. So network latency is improved a bit when cpu caches are
> full of user land data.
> 
> > 
> > > @@ -690,6 +693,8 @@ static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
> > >  		    (new && new->flags & TCQ_F_INGRESS)) {
> > >  			num_q = 1;
> > >  			ingress = 1;
> > > +			if (!dev_ingress_queue(dev))
> > > +				return -ENOENT;
> > >  		}
> > >  
> > 
> > The above looks clever but worries me because it changes the old flow.
> > If you have time,  the following tests will alleviate my fears
> > 
> > 1) compile support for ingress and add/delete ingress qdisc
> 
> This worked for me, but I dont know complex setups.
> 
> > 2) Dont compile support and add/delete ingress qdisc
> 
> tc gives an error (a bit like !CONFIG_NET_SCH_INGRESS)
> 
> # tc qdisc add dev eth0 ingress
> RTNETLINK answers: No such file or directory
> # tc -s -d qdisc show dev eth0
> qdisc mq 0: root 
>  Sent 636 bytes 10 pkt (dropped 0, overlimits 0 requeues 0) 
>  backlog 0b 0p requeues 0 
> 
> 
> > 3) Compile ingress as a module and add/delete ingress qdisc
> > 
> > 
> 
> Seems to work like 1)
> 
> > Other than that excellent work Eric. And you can add my
> > Acked/reviewed-by etc.
> > 
> > BTW, did i say i like your per-cpu stats stuff? It applies nicely to
> > qdiscs, actions etc ;->
> 
> I took a look at ifb as suggested by Stephen but could not see trivial
> changes (LLTX or per-cpu stats), since central lock is needed I am
> afraid. And qdisc are the same, stats updates are mostly free as we
> dirtied cache line for the lock.
> 
> Thanks Jamal !
> 
> Here is the V2, with two #ifdef removed.
> 
> 
> [PATCH net-next V2] net: dynamic ingress_queue allocation
> 
> ingress being not used very much, and net_device->ingress_queue being
> quite a big object (128 or 256 bytes), use a dynamic allocation if
> needed (tc qdisc add dev eth0 ingress ...)
> 
> Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
> ---
>  include/linux/netdevice.h |   11 ++++++--
>  net/core/dev.c            |   48 +++++++++++++++++++++++++++---------
>  net/sched/sch_api.c       |   40 ++++++++++++++++++++----------
>  net/sched/sch_generic.c   |   36 +++++++++++++++------------
>  4 files changed, 92 insertions(+), 43 deletions(-)
> 
> diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
> index ceed347..4f86009 100644
> --- a/include/linux/netdevice.h
> +++ b/include/linux/netdevice.h
> @@ -986,8 +986,7 @@ struct net_device {
>  	rx_handler_func_t	*rx_handler;
>  	void			*rx_handler_data;
>  
> -	struct netdev_queue	ingress_queue; /* use two cache lines */
> -
> +	struct netdev_queue	*ingress_queue;
>  /*
>   * Cache lines mostly used on transmit path
>   */
> @@ -1115,6 +1114,14 @@ static inline void netdev_for_each_tx_queue(struct net_device *dev,
>  		f(dev, &dev->_tx[i], arg);
>  }
>  
> +
> +static inline struct netdev_queue *dev_ingress_queue(struct net_device *dev)
> +{
> +	return dev->ingress_queue;
> +}
> +
> +extern struct netdev_queue *dev_ingress_queue_create(struct net_device *dev);
> +
>  /*
>   * Net namespace inlines
>   */
> diff --git a/net/core/dev.c b/net/core/dev.c
> index a313bab..e3bb8c9 100644
> --- a/net/core/dev.c
> +++ b/net/core/dev.c
> @@ -2702,11 +2702,10 @@ EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
>   * the ingress scheduler, you just cant add policies on ingress.
>   *
>   */
> -static int ing_filter(struct sk_buff *skb)
> +static int ing_filter(struct sk_buff *skb, struct netdev_queue *rxq)
>  {
>  	struct net_device *dev = skb->dev;
>  	u32 ttl = G_TC_RTTL(skb->tc_verd);
> -	struct netdev_queue *rxq;
>  	int result = TC_ACT_OK;
>  	struct Qdisc *q;
>  
> @@ -2720,8 +2719,6 @@ static int ing_filter(struct sk_buff *skb)
>  	skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl);
>  	skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
>  
> -	rxq = &dev->ingress_queue;
> -
>  	q = rxq->qdisc;
>  	if (q != &noop_qdisc) {
>  		spin_lock(qdisc_lock(q));
> @@ -2737,7 +2734,9 @@ static inline struct sk_buff *handle_ing(struct sk_buff *skb,
>  					 struct packet_type **pt_prev,
>  					 int *ret, struct net_device *orig_dev)
>  {
> -	if (skb->dev->ingress_queue.qdisc == &noop_qdisc)
> +	struct netdev_queue *rxq = dev_ingress_queue(skb->dev);
> +
> +	if (!rxq || rxq->qdisc == &noop_qdisc)
>  		goto out;
>  
>  	if (*pt_prev) {
> @@ -2745,7 +2744,7 @@ static inline struct sk_buff *handle_ing(struct sk_buff *skb,
>  		*pt_prev = NULL;
>  	}
>  
> -	switch (ing_filter(skb)) {
> +	switch (ing_filter(skb, rxq)) {
>  	case TC_ACT_SHOT:
>  	case TC_ACT_STOLEN:
>  		kfree_skb(skb);
> @@ -4932,15 +4931,17 @@ static void __netdev_init_queue_locks_one(struct net_device *dev,
>  					  struct netdev_queue *dev_queue,
>  					  void *_unused)
>  {
> -	spin_lock_init(&dev_queue->_xmit_lock);
> -	netdev_set_xmit_lockdep_class(&dev_queue->_xmit_lock, dev->type);
> -	dev_queue->xmit_lock_owner = -1;
> +	if (dev_queue) {
> +		spin_lock_init(&dev_queue->_xmit_lock);
> +		netdev_set_xmit_lockdep_class(&dev_queue->_xmit_lock, dev->type);
> +		dev_queue->xmit_lock_owner = -1;
> +	}
>  }
>  
>  static void netdev_init_queue_locks(struct net_device *dev)
>  {
>  	netdev_for_each_tx_queue(dev, __netdev_init_queue_locks_one, NULL);
> -	__netdev_init_queue_locks_one(dev, &dev->ingress_queue, NULL);
> +	__netdev_init_queue_locks_one(dev, dev_ingress_queue(dev), NULL);

Is dev_ingress_queue(dev) not NULL anytime here?

>  }
>  
>  unsigned long netdev_fix_features(unsigned long features, const char *name)
> @@ -5447,16 +5448,37 @@ static void netdev_init_one_queue(struct net_device *dev,
>  				  struct netdev_queue *queue,
>  				  void *_unused)
>  {
> -	queue->dev = dev;
> +	if (queue)
> +		queue->dev = dev;
>  }
>  
>  static void netdev_init_queues(struct net_device *dev)
>  {
> -	netdev_init_one_queue(dev, &dev->ingress_queue, NULL);
> +	netdev_init_one_queue(dev, dev_ingress_queue(dev), NULL);

Is dev_ingress_queue(dev) not NULL anytime here?

>  	netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
>  	spin_lock_init(&dev->tx_global_lock);
>  }
>  
> +struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
> +{
> +	struct netdev_queue *queue = dev_ingress_queue(dev);
> +
> +#ifdef CONFIG_NET_CLS_ACT
> +	if (queue)
> +		return queue;
> +	queue = kzalloc(sizeof(*queue), GFP_KERNEL);
> +	if (!queue)
> +		return NULL;
> +	netdev_init_one_queue(dev, queue, NULL);
> +	__netdev_init_queue_locks_one(dev, queue, NULL);
> +	queue->qdisc = &noop_qdisc;
> +	queue->qdisc_sleeping = &noop_qdisc;
> +	smp_wmb();

Why don't we need smp_rmb() in handle_ing()?

> +	dev->ingress_queue = queue;
> +#endif
> +	return queue;
> +}
> +
>  /**
>   *	alloc_netdev_mq - allocate network device
>   *	@sizeof_priv:	size of private data to allocate space for
> @@ -5559,6 +5581,8 @@ void free_netdev(struct net_device *dev)
>  
>  	kfree(dev->_tx);
>  
> +	kfree(dev_ingress_queue(dev));
> +
>  	/* Flush device addresses */
>  	dev_addr_flush(dev);
>  
> diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
> index b802078..8635110 100644
> --- a/net/sched/sch_api.c
> +++ b/net/sched/sch_api.c
> @@ -240,7 +240,10 @@ struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle)
>  	if (q)
>  		goto out;
>  
> -	q = qdisc_match_from_root(dev->ingress_queue.qdisc_sleeping, handle);
> +	if (!dev_ingress_queue(dev))
> +		goto out;
> +	q = qdisc_match_from_root(dev_ingress_queue(dev)->qdisc_sleeping,
> +				  handle);

I'd prefer:
 +	if (dev_ingress_queue(dev))
 +		q = qdisc_match_from_root(dev_ingress_queue(dev)->qdisc_sleeping,

>  out:
>  	return q;
>  }
> @@ -690,6 +693,8 @@ static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
>  		    (new && new->flags & TCQ_F_INGRESS)) {
>  			num_q = 1;
>  			ingress = 1;
> +			if (!dev_ingress_queue(dev))
> +				return -ENOENT;

Is this test really needed here?

>  		}
>  
>  		if (dev->flags & IFF_UP)
> @@ -701,7 +706,7 @@ static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
>  		}
>  
>  		for (i = 0; i < num_q; i++) {
> -			struct netdev_queue *dev_queue = &dev->ingress_queue;
> +			struct netdev_queue *dev_queue = dev_ingress_queue(dev);
>  
>  			if (!ingress)
>  				dev_queue = netdev_get_tx_queue(dev, i);
> @@ -979,7 +984,8 @@ static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
>  					return -ENOENT;
>  				q = qdisc_leaf(p, clid);
>  			} else { /* ingress */
> -				q = dev->ingress_queue.qdisc_sleeping;
> +				if (dev_ingress_queue(dev))
> +					q = dev_ingress_queue(dev)->qdisc_sleeping;
>  			}
>  		} else {
>  			q = dev->qdisc;
> @@ -1044,7 +1050,8 @@ replay:
>  					return -ENOENT;
>  				q = qdisc_leaf(p, clid);
>  			} else { /*ingress */
> -				q = dev->ingress_queue.qdisc_sleeping;
> +				if (dev_ingress_queue_create(dev))
> +					q = dev_ingress_queue(dev)->qdisc_sleeping;

I wonder if doing dev_ingress_queue_create() just before qdisc_create()
(and the test here) isn't more readable.

>  			}
>  		} else {
>  			q = dev->qdisc;
> @@ -1123,11 +1130,14 @@ replay:
>  create_n_graft:
>  	if (!(n->nlmsg_flags&NLM_F_CREATE))
>  		return -ENOENT;
> -	if (clid == TC_H_INGRESS)
> -		q = qdisc_create(dev, &dev->ingress_queue, p,
> -				 tcm->tcm_parent, tcm->tcm_parent,
> -				 tca, &err);
> -	else {
> +	if (clid == TC_H_INGRESS) {
> +		if (dev_ingress_queue(dev))
> +			q = qdisc_create(dev, dev_ingress_queue(dev), p,
> +					 tcm->tcm_parent, tcm->tcm_parent,
> +					 tca, &err);
> +		else
> +			err = -ENOENT;
> +	} else {
>  		struct netdev_queue *dev_queue;
>  
>  		if (p && p->ops->cl_ops && p->ops->cl_ops->select_queue)
> @@ -1304,8 +1314,10 @@ static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb)
>  		if (tc_dump_qdisc_root(dev->qdisc, skb, cb, &q_idx, s_q_idx) < 0)
>  			goto done;
>  
> -		dev_queue = &dev->ingress_queue;
> -		if (tc_dump_qdisc_root(dev_queue->qdisc_sleeping, skb, cb, &q_idx, s_q_idx) < 0)
> +		dev_queue = dev_ingress_queue(dev);
> +		if (dev_queue &&
> +		    tc_dump_qdisc_root(dev_queue->qdisc_sleeping, skb, cb,
> +				       &q_idx, s_q_idx) < 0)
>  			goto done;
>  
>  cont:
> @@ -1595,8 +1607,10 @@ static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb)
>  	if (tc_dump_tclass_root(dev->qdisc, skb, tcm, cb, &t, s_t) < 0)
>  		goto done;
>  
> -	dev_queue = &dev->ingress_queue;
> -	if (tc_dump_tclass_root(dev_queue->qdisc_sleeping, skb, tcm, cb, &t, s_t) < 0)
> +	dev_queue = dev_ingress_queue(dev);
> +	if (dev_queue &&
> +	    tc_dump_tclass_root(dev_queue->qdisc_sleeping, skb, tcm, cb,
> +				&t, s_t) < 0)
>  		goto done;
>  
>  done:
> diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
> index 545278a..c42dec5 100644
> --- a/net/sched/sch_generic.c
> +++ b/net/sched/sch_generic.c
> @@ -721,16 +721,18 @@ static void transition_one_qdisc(struct net_device *dev,
>  				 struct netdev_queue *dev_queue,
>  				 void *_need_watchdog)
>  {
> -	struct Qdisc *new_qdisc = dev_queue->qdisc_sleeping;
> -	int *need_watchdog_p = _need_watchdog;
> +	if (dev_queue) {
> +		struct Qdisc *new_qdisc = dev_queue->qdisc_sleeping;
> +		int *need_watchdog_p = _need_watchdog;
>  
> -	if (!(new_qdisc->flags & TCQ_F_BUILTIN))
> -		clear_bit(__QDISC_STATE_DEACTIVATED, &new_qdisc->state);
> +		if (!(new_qdisc->flags & TCQ_F_BUILTIN))
> +			clear_bit(__QDISC_STATE_DEACTIVATED, &new_qdisc->state);
>  
> -	rcu_assign_pointer(dev_queue->qdisc, new_qdisc);
> -	if (need_watchdog_p && new_qdisc != &noqueue_qdisc) {
> -		dev_queue->trans_start = 0;
> -		*need_watchdog_p = 1;
> +		rcu_assign_pointer(dev_queue->qdisc, new_qdisc);
> +		if (need_watchdog_p && new_qdisc != &noqueue_qdisc) {
> +			dev_queue->trans_start = 0;
> +			*need_watchdog_p = 1;
> +		}
>  	}
>  }
>  
> @@ -753,7 +755,7 @@ void dev_activate(struct net_device *dev)
>  
>  	need_watchdog = 0;
>  	netdev_for_each_tx_queue(dev, transition_one_qdisc, &need_watchdog);
> -	transition_one_qdisc(dev, &dev->ingress_queue, NULL);
> +	transition_one_qdisc(dev, dev_ingress_queue(dev), NULL);

I'd prefer here and similarly later:

 +	if (dev_ingress_queue(dev))
 +		transition_one_qdisc(dev, dev_ingress_queue(dev), NULL);

to show NULL dev_queue is only legal in this one case.

Cheers,
Jarek P.

>  
>  	if (need_watchdog) {
>  		dev->trans_start = jiffies;
> @@ -768,7 +770,7 @@ static void dev_deactivate_queue(struct net_device *dev,
>  	struct Qdisc *qdisc_default = _qdisc_default;
>  	struct Qdisc *qdisc;
>  
> -	qdisc = dev_queue->qdisc;
> +	qdisc = dev_queue ? dev_queue->qdisc : NULL;
>  	if (qdisc) {
>  		spin_lock_bh(qdisc_lock(qdisc));
>  
> @@ -812,7 +814,7 @@ static bool some_qdisc_is_busy(struct net_device *dev)
>  void dev_deactivate(struct net_device *dev)
>  {
>  	netdev_for_each_tx_queue(dev, dev_deactivate_queue, &noop_qdisc);
> -	dev_deactivate_queue(dev, &dev->ingress_queue, &noop_qdisc);
> +	dev_deactivate_queue(dev, dev_ingress_queue(dev), &noop_qdisc);
>  
>  	dev_watchdog_down(dev);
>  
> @@ -830,15 +832,17 @@ static void dev_init_scheduler_queue(struct net_device *dev,
>  {
>  	struct Qdisc *qdisc = _qdisc;
>  
> -	dev_queue->qdisc = qdisc;
> -	dev_queue->qdisc_sleeping = qdisc;
> +	if (dev_queue) {
> +		dev_queue->qdisc = qdisc;
> +		dev_queue->qdisc_sleeping = qdisc;
> +	}
>  }
>  
>  void dev_init_scheduler(struct net_device *dev)
>  {
>  	dev->qdisc = &noop_qdisc;
>  	netdev_for_each_tx_queue(dev, dev_init_scheduler_queue, &noop_qdisc);
> -	dev_init_scheduler_queue(dev, &dev->ingress_queue, &noop_qdisc);
> +	dev_init_scheduler_queue(dev, dev_ingress_queue(dev), &noop_qdisc);
>  
>  	setup_timer(&dev->watchdog_timer, dev_watchdog, (unsigned long)dev);
>  }
> @@ -847,7 +851,7 @@ static void shutdown_scheduler_queue(struct net_device *dev,
>  				     struct netdev_queue *dev_queue,
>  				     void *_qdisc_default)
>  {
> -	struct Qdisc *qdisc = dev_queue->qdisc_sleeping;
> +	struct Qdisc *qdisc = dev_queue ? dev_queue->qdisc_sleeping : NULL;
>  	struct Qdisc *qdisc_default = _qdisc_default;
>  
>  	if (qdisc) {
> @@ -861,7 +865,7 @@ static void shutdown_scheduler_queue(struct net_device *dev,
>  void dev_shutdown(struct net_device *dev)
>  {
>  	netdev_for_each_tx_queue(dev, shutdown_scheduler_queue, &noop_qdisc);
> -	shutdown_scheduler_queue(dev, &dev->ingress_queue, &noop_qdisc);
> +	shutdown_scheduler_queue(dev, dev_ingress_queue(dev), &noop_qdisc);
>  	qdisc_destroy(dev->qdisc);
>  	dev->qdisc = &noop_qdisc;
>  
> 
> 

^ permalink raw reply	[flat|nested] 23+ messages in thread

* [PATCH net-next V3] net: dynamic ingress_queue allocation
  2010-10-02  9:32           ` Jarek Poplawski
@ 2010-10-02 16:11             ` Eric Dumazet
  2010-10-03  9:42               ` Jarek Poplawski
  2010-10-05  7:24               ` David Miller
  0 siblings, 2 replies; 23+ messages in thread
From: Eric Dumazet @ 2010-10-02 16:11 UTC (permalink / raw)
  To: Jarek Poplawski; +Cc: hadi, David Miller, netdev

Le samedi 02 octobre 2010 à 11:32 +0200, Jarek Poplawski a écrit :
> On Fri, Oct 01, 2010 at 03:56:28PM +0200, Eric Dumazet wrote:
>  
> >  static void netdev_init_queue_locks(struct net_device *dev)
> >  {
> >  	netdev_for_each_tx_queue(dev, __netdev_init_queue_locks_one, NULL);
> > -	__netdev_init_queue_locks_one(dev, &dev->ingress_queue, NULL);
> > +	__netdev_init_queue_locks_one(dev, dev_ingress_queue(dev), NULL);
> 
> Is dev_ingress_queue(dev) not NULL anytime here?

Yes, but I felt this could be removed later. If you feel its OK right
now, I am OK too ;)

> 
> >  }
> >  
> >  unsigned long netdev_fix_features(unsigned long features, const char *name)
> > @@ -5447,16 +5448,37 @@ static void netdev_init_one_queue(struct net_device *dev,
> >  				  struct netdev_queue *queue,
> >  				  void *_unused)
> >  {
> > -	queue->dev = dev;
> > +	if (queue)
> > +		queue->dev = dev;
> >  }
> >  
> >  static void netdev_init_queues(struct net_device *dev)
> >  {
> > -	netdev_init_one_queue(dev, &dev->ingress_queue, NULL);
> > +	netdev_init_one_queue(dev, dev_ingress_queue(dev), NULL);
> 
> Is dev_ingress_queue(dev) not NULL anytime here?
> 

Yes

> >  	netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
> >  	spin_lock_init(&dev->tx_global_lock);
> >  }
> >  
> > +struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
> > +{
> > +	struct netdev_queue *queue = dev_ingress_queue(dev);
> > +
> > +#ifdef CONFIG_NET_CLS_ACT
> > +	if (queue)
> > +		return queue;
> > +	queue = kzalloc(sizeof(*queue), GFP_KERNEL);
> > +	if (!queue)
> > +		return NULL;
> > +	netdev_init_one_queue(dev, queue, NULL);
> > +	__netdev_init_queue_locks_one(dev, queue, NULL);
> > +	queue->qdisc = &noop_qdisc;
> > +	queue->qdisc_sleeping = &noop_qdisc;
> > +	smp_wmb();
> 
> Why don't we need smp_rmb() in handle_ing()?
> 

I only wanted to see if Al Viro was using ingress on its Alpha
machine ;)

I am going to use regular rcu api to ease code understanding :)


> > +	dev->ingress_queue = queue;
> > +#endif
> > +	return queue;
> > +}
> > +
> >  /**
> >   *	alloc_netdev_mq - allocate network device
> >   *	@sizeof_priv:	size of private data to allocate space for
> > @@ -5559,6 +5581,8 @@ void free_netdev(struct net_device *dev)
> >  
> >  	kfree(dev->_tx);
> >  
> > +	kfree(dev_ingress_queue(dev));
> > +
> >  	/* Flush device addresses */
> >  	dev_addr_flush(dev);
> >  
> > diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
> > index b802078..8635110 100644
> > --- a/net/sched/sch_api.c
> > +++ b/net/sched/sch_api.c
> > @@ -240,7 +240,10 @@ struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle)
> >  	if (q)
> >  		goto out;
> >  
> > -	q = qdisc_match_from_root(dev->ingress_queue.qdisc_sleeping, handle);
> > +	if (!dev_ingress_queue(dev))
> > +		goto out;
> > +	q = qdisc_match_from_root(dev_ingress_queue(dev)->qdisc_sleeping,
> > +				  handle);
> 


> I'd prefer:
>  +	if (dev_ingress_queue(dev))
>  +		q = qdisc_match_from_root(dev_ingress_queue(dev)->qdisc_sleeping,
> 

Yes

> >  out:
> >  	return q;
> >  }
> > @@ -690,6 +693,8 @@ static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
> >  		    (new && new->flags & TCQ_F_INGRESS)) {
> >  			num_q = 1;
> >  			ingress = 1;
> > +			if (!dev_ingress_queue(dev))
> > +				return -ENOENT;
> 
> Is this test really needed here?

To avoid a NULL dereference some lines later.
Do I have a guarantee its not NULL here ?

> 
> >  		}
> >  
> >  		if (dev->flags & IFF_UP)
> > @@ -701,7 +706,7 @@ static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
> >  		}
> >  
> >  		for (i = 0; i < num_q; i++) {
> > -			struct netdev_queue *dev_queue = &dev->ingress_queue;
> > +			struct netdev_queue *dev_queue = dev_ingress_queue(dev);
> >  
> >  			if (!ingress)
> >  				dev_queue = netdev_get_tx_queue(dev, i);
> > @@ -979,7 +984,8 @@ static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
> >  					return -ENOENT;
> >  				q = qdisc_leaf(p, clid);
> >  			} else { /* ingress */
> > -				q = dev->ingress_queue.qdisc_sleeping;
> > +				if (dev_ingress_queue(dev))
> > +					q = dev_ingress_queue(dev)->qdisc_sleeping;
> >  			}
> >  		} else {
> >  			q = dev->qdisc;
> > @@ -1044,7 +1050,8 @@ replay:
> >  					return -ENOENT;
> >  				q = qdisc_leaf(p, clid);
> >  			} else { /*ingress */
> > -				q = dev->ingress_queue.qdisc_sleeping;
> > +				if (dev_ingress_queue_create(dev))
> > +					q = dev_ingress_queue(dev)->qdisc_sleeping;
> 
> I wonder if doing dev_ingress_queue_create() just before qdisc_create()
> (and the test here) isn't more readable.

Sorry, I dont understand. I want to create ingress_queue only if user
wants it. If we setup (egress) trafic shaping, no need to setup
ingress_queue.

> >  
> > @@ -753,7 +755,7 @@ void dev_activate(struct net_device *dev)
> >  
> >  	need_watchdog = 0;
> >  	netdev_for_each_tx_queue(dev, transition_one_qdisc, &need_watchdog);
> > -	transition_one_qdisc(dev, &dev->ingress_queue, NULL);
> > +	transition_one_qdisc(dev, dev_ingress_queue(dev), NULL);
> 
> I'd prefer here and similarly later:
> 
>  +	if (dev_ingress_queue(dev))
>  +		transition_one_qdisc(dev, dev_ingress_queue(dev), NULL);
> 
> to show NULL dev_queue is only legal in this one case.

OK, thanks a lot for the extended review Jarek (and Jamal of course)

Here is the V3 then.

[PATCH net-next V3] net: dynamic ingress_queue allocation

ingress being not used very much, and net_device->ingress_queue being
quite a big object (128 or 256 bytes), use a dynamic allocation if
needed (tc qdisc add dev eth0 ingress ...)

dev_ingress_queue(dev) helper should be used only with RTNL taken.

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
---
V3: add rcu notations & address Jarek comments
 include/linux/netdevice.h |    2 -
 include/linux/rtnetlink.h |    8 ++++++
 net/core/dev.c            |   34 ++++++++++++++++++++++-------
 net/sched/sch_api.c       |   42 ++++++++++++++++++++++++------------
 net/sched/sch_generic.c   |   12 ++++++----
 5 files changed, 71 insertions(+), 27 deletions(-)

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index ceed347..92d81ed 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -986,7 +986,7 @@ struct net_device {
 	rx_handler_func_t	*rx_handler;
 	void			*rx_handler_data;
 
-	struct netdev_queue	ingress_queue; /* use two cache lines */
+	struct netdev_queue __rcu *ingress_queue;
 
 /*
  * Cache lines mostly used on transmit path
diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h
index 68c436b..0bb7b48 100644
--- a/include/linux/rtnetlink.h
+++ b/include/linux/rtnetlink.h
@@ -6,6 +6,7 @@
 #include <linux/if_link.h>
 #include <linux/if_addr.h>
 #include <linux/neighbour.h>
+#include <linux/netdevice.h>
 
 /* rtnetlink families. Values up to 127 are reserved for real address
  * families, values above 128 may be used arbitrarily.
@@ -769,6 +770,13 @@ extern int lockdep_rtnl_is_held(void);
 #define rtnl_dereference(p)					\
 	rcu_dereference_check(p, lockdep_rtnl_is_held())
 
+static inline struct netdev_queue *dev_ingress_queue(struct net_device *dev)
+{
+	return rtnl_dereference(dev->ingress_queue);
+}
+
+extern struct netdev_queue *dev_ingress_queue_create(struct net_device *dev);
+
 extern void rtnetlink_init(void);
 extern void __rtnl_unlock(void);
 
diff --git a/net/core/dev.c b/net/core/dev.c
index a313bab..b078ec8 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2702,11 +2702,10 @@ EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
  * the ingress scheduler, you just cant add policies on ingress.
  *
  */
-static int ing_filter(struct sk_buff *skb)
+static int ing_filter(struct sk_buff *skb, struct netdev_queue *rxq)
 {
 	struct net_device *dev = skb->dev;
 	u32 ttl = G_TC_RTTL(skb->tc_verd);
-	struct netdev_queue *rxq;
 	int result = TC_ACT_OK;
 	struct Qdisc *q;
 
@@ -2720,8 +2719,6 @@ static int ing_filter(struct sk_buff *skb)
 	skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl);
 	skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
 
-	rxq = &dev->ingress_queue;
-
 	q = rxq->qdisc;
 	if (q != &noop_qdisc) {
 		spin_lock(qdisc_lock(q));
@@ -2737,7 +2734,9 @@ static inline struct sk_buff *handle_ing(struct sk_buff *skb,
 					 struct packet_type **pt_prev,
 					 int *ret, struct net_device *orig_dev)
 {
-	if (skb->dev->ingress_queue.qdisc == &noop_qdisc)
+	struct netdev_queue *rxq = rcu_dereference(skb->dev->ingress_queue);
+
+	if (!rxq || rxq->qdisc == &noop_qdisc)
 		goto out;
 
 	if (*pt_prev) {
@@ -2745,7 +2744,7 @@ static inline struct sk_buff *handle_ing(struct sk_buff *skb,
 		*pt_prev = NULL;
 	}
 
-	switch (ing_filter(skb)) {
+	switch (ing_filter(skb, rxq)) {
 	case TC_ACT_SHOT:
 	case TC_ACT_STOLEN:
 		kfree_skb(skb);
@@ -4940,7 +4939,6 @@ static void __netdev_init_queue_locks_one(struct net_device *dev,
 static void netdev_init_queue_locks(struct net_device *dev)
 {
 	netdev_for_each_tx_queue(dev, __netdev_init_queue_locks_one, NULL);
-	__netdev_init_queue_locks_one(dev, &dev->ingress_queue, NULL);
 }
 
 unsigned long netdev_fix_features(unsigned long features, const char *name)
@@ -5452,11 +5450,29 @@ static void netdev_init_one_queue(struct net_device *dev,
 
 static void netdev_init_queues(struct net_device *dev)
 {
-	netdev_init_one_queue(dev, &dev->ingress_queue, NULL);
 	netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
 	spin_lock_init(&dev->tx_global_lock);
 }
 
+struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
+{
+	struct netdev_queue *queue = dev_ingress_queue(dev);
+
+#ifdef CONFIG_NET_CLS_ACT
+	if (queue)
+		return queue;
+	queue = kzalloc(sizeof(*queue), GFP_KERNEL);
+	if (!queue)
+		return NULL;
+	netdev_init_one_queue(dev, queue, NULL);
+	__netdev_init_queue_locks_one(dev, queue, NULL);
+	queue->qdisc = &noop_qdisc;
+	queue->qdisc_sleeping = &noop_qdisc;
+	rcu_assign_pointer(dev->ingress_queue, queue);
+#endif
+	return queue;
+}
+
 /**
  *	alloc_netdev_mq - allocate network device
  *	@sizeof_priv:	size of private data to allocate space for
@@ -5559,6 +5575,8 @@ void free_netdev(struct net_device *dev)
 
 	kfree(dev->_tx);
 
+	kfree(rcu_dereference_raw(dev->ingress_queue));
+
 	/* Flush device addresses */
 	dev_addr_flush(dev);
 
diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
index b802078..b22ca2d 100644
--- a/net/sched/sch_api.c
+++ b/net/sched/sch_api.c
@@ -240,7 +240,10 @@ struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle)
 	if (q)
 		goto out;
 
-	q = qdisc_match_from_root(dev->ingress_queue.qdisc_sleeping, handle);
+	if (dev_ingress_queue(dev))
+		q = qdisc_match_from_root(
+			dev_ingress_queue(dev)->qdisc_sleeping,
+			handle);
 out:
 	return q;
 }
@@ -690,6 +693,8 @@ static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
 		    (new && new->flags & TCQ_F_INGRESS)) {
 			num_q = 1;
 			ingress = 1;
+			if (!dev_ingress_queue(dev))
+				return -ENOENT;
 		}
 
 		if (dev->flags & IFF_UP)
@@ -701,7 +706,7 @@ static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
 		}
 
 		for (i = 0; i < num_q; i++) {
-			struct netdev_queue *dev_queue = &dev->ingress_queue;
+			struct netdev_queue *dev_queue = dev_ingress_queue(dev);
 
 			if (!ingress)
 				dev_queue = netdev_get_tx_queue(dev, i);
@@ -979,7 +984,8 @@ static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
 					return -ENOENT;
 				q = qdisc_leaf(p, clid);
 			} else { /* ingress */
-				q = dev->ingress_queue.qdisc_sleeping;
+				if (dev_ingress_queue(dev))
+					q = dev_ingress_queue(dev)->qdisc_sleeping;
 			}
 		} else {
 			q = dev->qdisc;
@@ -1043,8 +1049,9 @@ replay:
 				if ((p = qdisc_lookup(dev, TC_H_MAJ(clid))) == NULL)
 					return -ENOENT;
 				q = qdisc_leaf(p, clid);
-			} else { /*ingress */
-				q = dev->ingress_queue.qdisc_sleeping;
+			} else { /* ingress */
+				if (dev_ingress_queue_create(dev))
+					q = dev_ingress_queue(dev)->qdisc_sleeping;
 			}
 		} else {
 			q = dev->qdisc;
@@ -1123,11 +1130,14 @@ replay:
 create_n_graft:
 	if (!(n->nlmsg_flags&NLM_F_CREATE))
 		return -ENOENT;
-	if (clid == TC_H_INGRESS)
-		q = qdisc_create(dev, &dev->ingress_queue, p,
-				 tcm->tcm_parent, tcm->tcm_parent,
-				 tca, &err);
-	else {
+	if (clid == TC_H_INGRESS) {
+		if (dev_ingress_queue(dev))
+			q = qdisc_create(dev, dev_ingress_queue(dev), p,
+					 tcm->tcm_parent, tcm->tcm_parent,
+					 tca, &err);
+		else
+			err = -ENOENT;
+	} else {
 		struct netdev_queue *dev_queue;
 
 		if (p && p->ops->cl_ops && p->ops->cl_ops->select_queue)
@@ -1304,8 +1314,10 @@ static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb)
 		if (tc_dump_qdisc_root(dev->qdisc, skb, cb, &q_idx, s_q_idx) < 0)
 			goto done;
 
-		dev_queue = &dev->ingress_queue;
-		if (tc_dump_qdisc_root(dev_queue->qdisc_sleeping, skb, cb, &q_idx, s_q_idx) < 0)
+		dev_queue = dev_ingress_queue(dev);
+		if (dev_queue &&
+		    tc_dump_qdisc_root(dev_queue->qdisc_sleeping, skb, cb,
+				       &q_idx, s_q_idx) < 0)
 			goto done;
 
 cont:
@@ -1595,8 +1607,10 @@ static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb)
 	if (tc_dump_tclass_root(dev->qdisc, skb, tcm, cb, &t, s_t) < 0)
 		goto done;
 
-	dev_queue = &dev->ingress_queue;
-	if (tc_dump_tclass_root(dev_queue->qdisc_sleeping, skb, tcm, cb, &t, s_t) < 0)
+	dev_queue = dev_ingress_queue(dev);
+	if (dev_queue &&
+	    tc_dump_tclass_root(dev_queue->qdisc_sleeping, skb, tcm, cb,
+				&t, s_t) < 0)
 		goto done;
 
 done:
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index 545278a..3d57681 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -753,7 +753,8 @@ void dev_activate(struct net_device *dev)
 
 	need_watchdog = 0;
 	netdev_for_each_tx_queue(dev, transition_one_qdisc, &need_watchdog);
-	transition_one_qdisc(dev, &dev->ingress_queue, NULL);
+	if (dev_ingress_queue(dev))
+		transition_one_qdisc(dev, dev_ingress_queue(dev), NULL);
 
 	if (need_watchdog) {
 		dev->trans_start = jiffies;
@@ -812,7 +813,8 @@ static bool some_qdisc_is_busy(struct net_device *dev)
 void dev_deactivate(struct net_device *dev)
 {
 	netdev_for_each_tx_queue(dev, dev_deactivate_queue, &noop_qdisc);
-	dev_deactivate_queue(dev, &dev->ingress_queue, &noop_qdisc);
+	if (dev_ingress_queue(dev))
+		dev_deactivate_queue(dev, dev_ingress_queue(dev), &noop_qdisc);
 
 	dev_watchdog_down(dev);
 
@@ -838,7 +840,8 @@ void dev_init_scheduler(struct net_device *dev)
 {
 	dev->qdisc = &noop_qdisc;
 	netdev_for_each_tx_queue(dev, dev_init_scheduler_queue, &noop_qdisc);
-	dev_init_scheduler_queue(dev, &dev->ingress_queue, &noop_qdisc);
+	if (dev_ingress_queue(dev))
+		dev_init_scheduler_queue(dev, dev_ingress_queue(dev), &noop_qdisc);
 
 	setup_timer(&dev->watchdog_timer, dev_watchdog, (unsigned long)dev);
 }
@@ -861,7 +864,8 @@ static void shutdown_scheduler_queue(struct net_device *dev,
 void dev_shutdown(struct net_device *dev)
 {
 	netdev_for_each_tx_queue(dev, shutdown_scheduler_queue, &noop_qdisc);
-	shutdown_scheduler_queue(dev, &dev->ingress_queue, &noop_qdisc);
+	if (dev_ingress_queue(dev))
+		shutdown_scheduler_queue(dev, dev_ingress_queue(dev), &noop_qdisc);
 	qdisc_destroy(dev->qdisc);
 	dev->qdisc = &noop_qdisc;
 



^ permalink raw reply related	[flat|nested] 23+ messages in thread

* Re: [PATCH net-next V3] net: dynamic ingress_queue allocation
  2010-10-02 16:11             ` [PATCH net-next V3] " Eric Dumazet
@ 2010-10-03  9:42               ` Jarek Poplawski
  2010-10-03 13:10                 ` jamal
  2010-10-04  8:42                 ` Eric Dumazet
  2010-10-05  7:24               ` David Miller
  1 sibling, 2 replies; 23+ messages in thread
From: Jarek Poplawski @ 2010-10-03  9:42 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: hadi, David Miller, netdev

On Sat, Oct 02, 2010 at 06:11:55PM +0200, Eric Dumazet wrote:
> Le samedi 02 octobre 2010 ?? 11:32 +0200, Jarek Poplawski a écrit :
> > On Fri, Oct 01, 2010 at 03:56:28PM +0200, Eric Dumazet wrote:
...
> > > diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
> > > index b802078..8635110 100644
> > > --- a/net/sched/sch_api.c
> > > +++ b/net/sched/sch_api.c
...
> > > @@ -690,6 +693,8 @@ static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
> > >  		    (new && new->flags & TCQ_F_INGRESS)) {
> > >  			num_q = 1;
> > >  			ingress = 1;
> > > +			if (!dev_ingress_queue(dev))
> > > +				return -ENOENT;
> > 
> > Is this test really needed here?
> 
> To avoid a NULL dereference some lines later.
> Do I have a guarantee its not NULL here ?

Do you have any scenario for NULL here? ;-)

Of course, it's your patch and responsibility, and I'll not guarantee,
but you could at least add a TODO comment, to check it later.

> > > @@ -1044,7 +1050,8 @@ replay:
> > >  					return -ENOENT;
> > >  				q = qdisc_leaf(p, clid);
> > >  			} else { /*ingress */
> > > -				q = dev->ingress_queue.qdisc_sleeping;
> > > +				if (dev_ingress_queue_create(dev))
> > > +					q = dev_ingress_queue(dev)->qdisc_sleeping;
> > 
> > I wonder if doing dev_ingress_queue_create() just before qdisc_create()
> > (and the test here) isn't more readable.
> 
> Sorry, I dont understand. I want to create ingress_queue only if user
> wants it. If we setup (egress) trafic shaping, no need to setup
> ingress_queue.

I mean doing both creates in one place:

> @@ -1123,11 +1130,14 @@ replay:
>  create_n_graft:
...
> +	if (clid == TC_H_INGRESS) {
 +		if (dev_ingress_queue_create(dev))
> +			q = qdisc_create(dev, dev_ingress_queue(dev), p,
> +					 tcm->tcm_parent, tcm->tcm_parent,
> +					 tca, &err);
> +		else
> +			err = -ENOENT;
> +	} else {
>  		struct netdev_queue *dev_queue;
...

> Here is the V3 then.
> 
> [PATCH net-next V3] net: dynamic ingress_queue allocation
> 
> ingress being not used very much, and net_device->ingress_queue being
> quite a big object (128 or 256 bytes), use a dynamic allocation if
> needed (tc qdisc add dev eth0 ingress ...)
> 
> dev_ingress_queue(dev) helper should be used only with RTNL taken.
> 
> Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
> ---
> V3: add rcu notations & address Jarek comments
>  include/linux/netdevice.h |    2 -
>  include/linux/rtnetlink.h |    8 ++++++
>  net/core/dev.c            |   34 ++++++++++++++++++++++-------
>  net/sched/sch_api.c       |   42 ++++++++++++++++++++++++------------
>  net/sched/sch_generic.c   |   12 ++++++----
>  5 files changed, 71 insertions(+), 27 deletions(-)
> 
> diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
> index ceed347..92d81ed 100644
> --- a/include/linux/netdevice.h
> +++ b/include/linux/netdevice.h
> @@ -986,7 +986,7 @@ struct net_device {
>  	rx_handler_func_t	*rx_handler;
>  	void			*rx_handler_data;
>  
> -	struct netdev_queue	ingress_queue; /* use two cache lines */
> +	struct netdev_queue __rcu *ingress_queue;
>  
>  /*
>   * Cache lines mostly used on transmit path
> diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h
> index 68c436b..0bb7b48 100644
> --- a/include/linux/rtnetlink.h
> +++ b/include/linux/rtnetlink.h
> @@ -6,6 +6,7 @@
>  #include <linux/if_link.h>
>  #include <linux/if_addr.h>
>  #include <linux/neighbour.h>
> +#include <linux/netdevice.h>
>  
>  /* rtnetlink families. Values up to 127 are reserved for real address
>   * families, values above 128 may be used arbitrarily.
> @@ -769,6 +770,13 @@ extern int lockdep_rtnl_is_held(void);
>  #define rtnl_dereference(p)					\
>  	rcu_dereference_check(p, lockdep_rtnl_is_held())
>  
> +static inline struct netdev_queue *dev_ingress_queue(struct net_device *dev)
> +{
> +	return rtnl_dereference(dev->ingress_queue);

I'd consider rcu_dereference_rtnl(). Btw, technically qdisc_lookup()
doesn't require rtnl, and there was time it was used without it
(on xmit path).

I think you should also add a comment here why this rcu is used, and
that it changes only once in dev's liftime.

Jarek P.

PS: checkpatched or not checkpatched, that is the question... ;-)

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH net-next V3] net: dynamic ingress_queue allocation
  2010-10-03  9:42               ` Jarek Poplawski
@ 2010-10-03 13:10                 ` jamal
  2010-10-04  8:42                 ` Eric Dumazet
  1 sibling, 0 replies; 23+ messages in thread
From: jamal @ 2010-10-03 13:10 UTC (permalink / raw)
  To: Jarek Poplawski; +Cc: Eric Dumazet, David Miller, netdev

On Sun, 2010-10-03 at 11:42 +0200, Jarek Poplawski wrote:

> > 
> > To avoid a NULL dereference some lines later.
> > Do I have a guarantee its not NULL here ?
> 
> Do you have any scenario for NULL here? ;-)

This is why i called this part clever earlier ;-> It is 
clever. There are several scenarios (i attempted to represent them
in the tests that Eric run):

1) ingress qdisc has been compiled in
flags & TCQ_F_INGRESS is true
a) user trying to add ingress qdisc first time
then q is null, new is not null and this would work
b) user trying to delete already added qdisc
then q is not null, new is null 

2) ingress qdisc not compiled in
Repeat #1a above, and Eric's check will bail out ..

The one thing that may have been useful is to also try
a "replace" after #1a and maybe after #2

cheers,
jamal


^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH net-next V3] net: dynamic ingress_queue allocation
  2010-10-03  9:42               ` Jarek Poplawski
  2010-10-03 13:10                 ` jamal
@ 2010-10-04  8:42                 ` Eric Dumazet
  2010-10-04  9:00                   ` [PATCH net-next] net: relax rtnl_dereference() Eric Dumazet
  2010-10-04 12:06                   ` [PATCH net-next V3] net: dynamic ingress_queue allocation Jarek Poplawski
  1 sibling, 2 replies; 23+ messages in thread
From: Eric Dumazet @ 2010-10-04  8:42 UTC (permalink / raw)
  To: Jarek Poplawski; +Cc: hadi, David Miller, netdev

Le dimanche 03 octobre 2010 à 11:42 +0200, Jarek Poplawski a écrit :

> I'd consider rcu_dereference_rtnl(). Btw, technically qdisc_lookup()
> doesn't require rtnl, and there was time it was used without it
> (on xmit path).

Hmm, for me, rcu_dereference_rtnl() is a bit lazy.

Either we are a reader and should use rcu_dereference(), or a writer and
RTNL should be held.

Mixing two conditions in a "super macro" is a workaround that we used to
promptly shutup some lockdep splats. Real fix would be to use strict
lockdep conditions, because this better documents the code and the
locking invariants.

BTW, rtnl_dereference() should be changed to use
rcu_dereference_protected() instead of rcu_dereference_check() :
If RTBL is held, there is no need to force a barrier.

> I think you should also add a comment here why this rcu is used, and
> that it changes only once in dev's liftime.
> 

This comment was needed in the previous version of the patch, with the
smb_wmb() barrier. Now I switched to regular rcu use, nothing prevents
us to change dev->ingress_queue in flight. Of course there is no current
interest doing so.

^ permalink raw reply	[flat|nested] 23+ messages in thread

* [PATCH net-next] net: relax rtnl_dereference()
  2010-10-04  8:42                 ` Eric Dumazet
@ 2010-10-04  9:00                   ` Eric Dumazet
  2010-10-05  7:29                     ` David Miller
  2010-10-04 12:06                   ` [PATCH net-next V3] net: dynamic ingress_queue allocation Jarek Poplawski
  1 sibling, 1 reply; 23+ messages in thread
From: Eric Dumazet @ 2010-10-04  9:00 UTC (permalink / raw)
  To: David Miller; +Cc: hadi, netdev, Jarek Poplawski

Le lundi 04 octobre 2010 à 10:42 +0200, Eric Dumazet a écrit :

> BTW, rtnl_dereference() should be changed to use
> rcu_dereference_protected() instead of rcu_dereference_check() :
> If RTBL is held, there is no need to force a barrier.
> 

[PATCH net-next] net: relax rtnl_dereference()

rtnl_dereference() is used in contexts where RTNL is held, to fetch an
RCU protected pointer.
 
Updates to this pointer are prevented by RTNL, so we dont need
smp_read_barrier_depends() and the ACCESS_ONCE() provided in
rcu_dereference_check().

rtnl_dereference() is mainly a macro to document the locking invariant.

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
---
 include/linux/rtnetlink.h |   10 ++++++----
 1 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h
index 68c436b..d3c4efa 100644
--- a/include/linux/rtnetlink.h
+++ b/include/linux/rtnetlink.h
@@ -754,20 +754,22 @@ extern int lockdep_rtnl_is_held(void);
  * @p: The pointer to read, prior to dereferencing
  *
  * Do an rcu_dereference(p), but check caller either holds rcu_read_lock()
- * or RTNL
+ * or RTNL. Note : Please prefer rtnl_dereference() or rcu_dereference()
  */
 #define rcu_dereference_rtnl(p)					\
 	rcu_dereference_check(p, rcu_read_lock_held() ||	\
 				 lockdep_rtnl_is_held())
 
 /**
- * rtnl_dereference - rcu_dereference with debug checking
+ * rtnl_dereference - fetch RCU pointer when updates are prevented by RTNL
  * @p: The pointer to read, prior to dereferencing
  *
- * Do an rcu_dereference(p), but check caller holds RTNL
+ * Return the value of the specified RCU-protected pointer, but omit
+ * both the smp_read_barrier_depends() and the ACCESS_ONCE(), because
+ * caller holds RTNL.
  */
 #define rtnl_dereference(p)					\
-	rcu_dereference_check(p, lockdep_rtnl_is_held())
+	rcu_dereference_protected(p, lockdep_rtnl_is_held())
 
 extern void rtnetlink_init(void);
 extern void __rtnl_unlock(void);



^ permalink raw reply related	[flat|nested] 23+ messages in thread

* Re: [PATCH net-next] net: relax rtnl_dereference()
  2010-10-04  9:00                   ` [PATCH net-next] net: relax rtnl_dereference() Eric Dumazet
@ 2010-10-05  7:29                     ` David Miller
  0 siblings, 0 replies; 23+ messages in thread
From: David Miller @ 2010-10-05  7:29 UTC (permalink / raw)
  To: eric.dumazet; +Cc: hadi, netdev, jarkao2

From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Mon, 04 Oct 2010 11:00:12 +0200

> Le lundi 04 octobre 2010 à 10:42 +0200, Eric Dumazet a écrit :
> 
>> BTW, rtnl_dereference() should be changed to use
>> rcu_dereference_protected() instead of rcu_dereference_check() :
>> If RTBL is held, there is no need to force a barrier.
>> 
> 
> [PATCH net-next] net: relax rtnl_dereference()
> 
> rtnl_dereference() is used in contexts where RTNL is held, to fetch an
> RCU protected pointer.
>  
> Updates to this pointer are prevented by RTNL, so we dont need
> smp_read_barrier_depends() and the ACCESS_ONCE() provided in
> rcu_dereference_check().
> 
> rtnl_dereference() is mainly a macro to document the locking invariant.
> 
> Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>

Applied, thanks.

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH net-next V3] net: dynamic ingress_queue allocation
  2010-10-04  8:42                 ` Eric Dumazet
  2010-10-04  9:00                   ` [PATCH net-next] net: relax rtnl_dereference() Eric Dumazet
@ 2010-10-04 12:06                   ` Jarek Poplawski
  2010-10-04 12:52                     ` Eric Dumazet
  1 sibling, 1 reply; 23+ messages in thread
From: Jarek Poplawski @ 2010-10-04 12:06 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: hadi, David Miller, netdev

On Mon, Oct 04, 2010 at 10:42:09AM +0200, Eric Dumazet wrote:
> Le dimanche 03 octobre 2010 ?? 11:42 +0200, Jarek Poplawski a écrit :
> 
> > I'd consider rcu_dereference_rtnl(). Btw, technically qdisc_lookup()
> > doesn't require rtnl, and there was time it was used without it
> > (on xmit path).
> 
> 
> Hmm, for me, rcu_dereference_rtnl() is a bit lazy.
> 
> Either we are a reader and should use rcu_dereference(), or a writer and
> RTNL should be held.
> 
> Mixing two conditions in a "super macro" is a workaround that we used to
> promptly shutup some lockdep splats. Real fix would be to use strict
> lockdep conditions, because this better documents the code and the
> locking invariants.
> 
> BTW, rtnl_dereference() should be changed to use
> rcu_dereference_protected() instead of rcu_dereference_check() :
> If RTBL is held, there is no need to force a barrier.

Actually, I'm one of those (convinced by Patrick btw), who consider
rcu_dereference() on the clear update side as misleading, so I'm not
rtnl_dereference() fan with or without changes.

> 
> 
> > I think you should also add a comment here why this rcu is used, and
> > that it changes only once in dev's liftime.
> > 
> 
> This comment was needed in the previous version of the patch, with the
> smb_wmb() barrier. Now I switched to regular rcu use, nothing prevents
> us to change dev->ingress_queue in flight. Of course there is no current
> interest doing so.

Right, but then at least in qdisc_lookup():

if (dev_ingress_queue(dev))
	q = qdisc_match_from_root(dev_ingress_queue(dev), handle);

you should use a variable instead of the second dereference (rtnl isn't
mandatory here).

Jarek P.

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH net-next V3] net: dynamic ingress_queue allocation
  2010-10-04 12:06                   ` [PATCH net-next V3] net: dynamic ingress_queue allocation Jarek Poplawski
@ 2010-10-04 12:52                     ` Eric Dumazet
  2010-10-04 14:24                       ` Jarek Poplawski
  0 siblings, 1 reply; 23+ messages in thread
From: Eric Dumazet @ 2010-10-04 12:52 UTC (permalink / raw)
  To: Jarek Poplawski; +Cc: hadi, David Miller, netdev

Le lundi 04 octobre 2010 à 14:06 +0200, Jarek Poplawski a écrit :

> Right, but then at least in qdisc_lookup():
> 
> if (dev_ingress_queue(dev))
> 	q = qdisc_match_from_root(dev_ingress_queue(dev), handle);
> 
> you should use a variable instead of the second dereference (rtnl isn't
> mandatory here).

I am lost.

If rntl is not mandatory, what is the lock that protects us ?

qdisc_lookup() _is_ called under the protection of a lock.
Or as soon as we return from it, result could change under us.

Please name it, and I'll use it :

rcu_dereference_protected(dev->ingress_queue, lockdep_is_held(THISLOCK))




^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH net-next V3] net: dynamic ingress_queue allocation
  2010-10-04 12:52                     ` Eric Dumazet
@ 2010-10-04 14:24                       ` Jarek Poplawski
  2010-10-04 15:21                         ` Eric Dumazet
  0 siblings, 1 reply; 23+ messages in thread
From: Jarek Poplawski @ 2010-10-04 14:24 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: hadi, David Miller, netdev

On Mon, Oct 04, 2010 at 02:52:00PM +0200, Eric Dumazet wrote:
> Le lundi 04 octobre 2010 ?? 14:06 +0200, Jarek Poplawski a écrit :
> 
> > Right, but then at least in qdisc_lookup():
> > 
> > if (dev_ingress_queue(dev))
> > 	q = qdisc_match_from_root(dev_ingress_queue(dev), handle);
> > 
> > you should use a variable instead of the second dereference (rtnl isn't
> > mandatory here).
> 
> I am lost.
> 
> If rntl is not mandatory, what is the lock that protects us ?
> 
> qdisc_lookup() _is_ called under the protection of a lock.
> Or as soon as we return from it, result could change under us.
> 
> Please name it, and I'll use it :
> 
> rcu_dereference_protected(dev->ingress_queue, lockdep_is_held(THISLOCK))
> 

You are right! There is no other lock. (I forgot I removed
qdisc_list_lock already ;-)

Sorry,
Jarek P.

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH net-next V3] net: dynamic ingress_queue allocation
  2010-10-04 14:24                       ` Jarek Poplawski
@ 2010-10-04 15:21                         ` Eric Dumazet
  0 siblings, 0 replies; 23+ messages in thread
From: Eric Dumazet @ 2010-10-04 15:21 UTC (permalink / raw)
  To: Jarek Poplawski; +Cc: hadi, David Miller, netdev

Le lundi 04 octobre 2010 à 16:24 +0200, Jarek Poplawski a écrit :

> You are right! There is no other lock. (I forgot I removed
> qdisc_list_lock already ;-)
> 
> Sorry,

No problem, better to ask before inclusion anyway ;)

About checkpatch, patch is fine (checkpatch.pl shouts because of yet
unknown __rcu marker, I believe someone submitted a patch for this [and
__percpu] weeks ago, but lkml people seem very slow these days)

ERROR: need consistent spacing around '*' (ctx:WxV)
#10: FILE: include/linux/netdevice.h:989:
+	struct netdev_queue __rcu *ingress_queue;
 	                          ^
diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl
index 2039acd..25d3842 100755
--- a/scripts/checkpatch.pl
+++ b/scripts/checkpatch.pl
@@ -143,7 +143,9 @@ our $Sparse	= qr{
 			__must_check|
 			__init_refok|
 			__kprobes|
-			__ref
+			__ref|
+			__percpu|
+			__rcu
 		}x;
 
 # Notes to $Attribute:



^ permalink raw reply related	[flat|nested] 23+ messages in thread

* Re: [PATCH net-next V3] net: dynamic ingress_queue allocation
  2010-10-02 16:11             ` [PATCH net-next V3] " Eric Dumazet
  2010-10-03  9:42               ` Jarek Poplawski
@ 2010-10-05  7:24               ` David Miller
  2010-10-05  7:31                 ` Eric Dumazet
  1 sibling, 1 reply; 23+ messages in thread
From: David Miller @ 2010-10-05  7:24 UTC (permalink / raw)
  To: eric.dumazet; +Cc: jarkao2, hadi, netdev

From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Sat, 02 Oct 2010 18:11:55 +0200

> [PATCH net-next V3] net: dynamic ingress_queue allocation
> 
> ingress being not used very much, and net_device->ingress_queue being
> quite a big object (128 or 256 bytes), use a dynamic allocation if
> needed (tc qdisc add dev eth0 ingress ...)
> 
> dev_ingress_queue(dev) helper should be used only with RTNL taken.
> 
> Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>

This looks good to me, applied, thanks Eric.

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH net-next V3] net: dynamic ingress_queue allocation
  2010-10-05  7:24               ` David Miller
@ 2010-10-05  7:31                 ` Eric Dumazet
  0 siblings, 0 replies; 23+ messages in thread
From: Eric Dumazet @ 2010-10-05  7:31 UTC (permalink / raw)
  To: David Miller; +Cc: jarkao2, hadi, netdev

Le mardi 05 octobre 2010 à 00:24 -0700, David Miller a écrit :
> From: Eric Dumazet <eric.dumazet@gmail.com>
> Date: Sat, 02 Oct 2010 18:11:55 +0200
> 
> > [PATCH net-next V3] net: dynamic ingress_queue allocation
> > 
> > ingress being not used very much, and net_device->ingress_queue being
> > quite a big object (128 or 256 bytes), use a dynamic allocation if
> > needed (tc qdisc add dev eth0 ingress ...)
> > 
> > dev_ingress_queue(dev) helper should be used only with RTNL taken.
> > 
> > Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
> 
> This looks good to me, applied, thanks Eric.

Thanks to Jarek, Jamal, and you :)



^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH net-next V2] net: dynamic ingress_queue allocation
  2010-10-01 13:56         ` [PATCH net-next V2] " Eric Dumazet
  2010-10-02  9:32           ` Jarek Poplawski
@ 2010-10-02 12:10           ` jamal
  1 sibling, 0 replies; 23+ messages in thread
From: jamal @ 2010-10-02 12:10 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: Jarek Poplawski, David Miller, netdev

On Fri, 2010-10-01 at 15:56 +0200, Eric Dumazet wrote: 
> Le vendredi 01 octobre 2010 à 07:45 -0400, jamal a écrit :

> 
> I first thought of this, and found it would add a new dependence on
> vmlinux :
> 
> If someone wants to add NET_SCH_INGRESS module, he would need to
> recompile whole kernel and reboot.
> 
> This is probably why ing_filter() and handle_ing() are enclosed with
> CONFIG_NET_CLS_ACT, not CONFIG_NET_SCH_INGRESS.
> 
> Since struct net_dev only holds one pointer (after this patch), I
> believe its better to use same dependence.
> 

Ok - I just noticed that CONFIG_NET_SCH_INGRESS depends on 
CONFIG_NET_CLS_ACT - probably my fault too. I have a use case
for having just CONFIG_NET_SCH_INGRESS without need for
CONFIG_NET_CLS_ACT; but you are right to keep in the current
spirit it is fair to keep it as is; i wonder if it needs to be fixed.

> > 
> > > +static inline struct netdev_queue *dev_ingress_queue(struct net_device *dev)
> > > +{
> > > +#ifdef CONFIG_NET_CLS_ACT
> > > +	return dev->ingress_queue;
> > > +#else
> > > +	return NULL;
> > > +#endif
> > 
> > Above, if you just returned dev->ingress_queue wouldnt it always be 
> > NULL if it was not allocated?
> > 
> 
> ingress_queue is not defined in "struct net_device *dev" if 
> !CONFIG_NET_CLS_ACT
> 
> Returning NULL here permits dead code elimination by compiler.
> 
> Then, probably nobody unset CONFIG_NET_CLS_ACT, so we can probably avoid
> this preprocessor stuff.
> 

Ok


> I see, this adds an indirection and a conditional branch, but this
> should be in cpu cache and well predicted.

Ok - good enough for me.

> I thought adding a fake "struct netdev_queue" object, with a qdisc
> pointing to noop_qdisc. But this would slow down a bit non ingress
> users ;)

nod.

> For people not using ingress, my solution removes an access to an extra
> cache line. So network latency is improved a bit when cpu caches are
> full of user land data.

Who are these lunatics running without ingress anyways?

> > 1) compile support for ingress and add/delete ingress qdisc
> 
> This worked for me, but I dont know complex setups.
> 

I think if you covered the basic case, should be equivalent
to any other case. Maybe a replace after you do an add etc.

> > 2) Dont compile support and add/delete ingress qdisc
> 
> tc gives an error (a bit like !CONFIG_NET_SCH_INGRESS)
> 
> # tc qdisc add dev eth0 ingress
> RTNETLINK answers: No such file or directory
> # tc -s -d qdisc show dev eth0
> qdisc mq 0: root 
>  Sent 636 bytes 10 pkt (dropped 0, overlimits 0 requeues 0) 
>  backlog 0b 0p requeues 0 
> 

nice

> 
> > 3) Compile ingress as a module and add/delete ingress qdisc
> > 
> Seems to work like 1)
> 

ok then, nothing else to bitch about.

Acked-by: Jamal Hadi Salim <hadi@cyberus.ca>

> I took a look at ifb as suggested by Stephen but could not see trivial
> changes (LLTX or per-cpu stats), since central lock is needed I am
> afraid. 

I wasnt thinking ifb - but ifb (and probably other netdevs) could
benefit a tiny bit by rearranging net_stats struct rx and tx parts into
different cache lines since these are typically updated in different
paths. In kernel, rx path for ifb is guaranteed to run in one cpu at a
time but tx could be many cpus (so at least for rx path theres benefit)

> And qdisc are the same, stats updates are mostly free as we
> dirtied cache line for the lock.

ok - qdiscs are easier because a qdisc instance can be accessed by one
cpu at a time. actions could be configured to be shared by many filters;
in such a mode they could be accessed across many cpus.

cheers,
jamal


^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH net-next2.6] net: rename netdev rx_queue to ingress_queue
  2010-09-28 15:58 [PATCH net-next2.6] net: rename netdev rx_queue to ingress_queue Eric Dumazet
  2010-09-28 18:04 ` Jarek Poplawski
@ 2010-09-29 20:27 ` David Miller
  1 sibling, 0 replies; 23+ messages in thread
From: David Miller @ 2010-09-29 20:27 UTC (permalink / raw)
  To: eric.dumazet; +Cc: netdev, jarkao2

From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Tue, 28 Sep 2010 17:58:37 +0200

> There is some confusion with rx_queue name after RPS, and net drivers
> private rx_queue fields.
> 
> I suggest to rename "struct net_device"->rx_queue to ingress_queue.
> 
> Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>

Applied.

^ permalink raw reply	[flat|nested] 23+ messages in thread

end of thread, other threads:[~2010-10-05  7:32 UTC | newest]

Thread overview: 23+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2010-09-28 15:58 [PATCH net-next2.6] net: rename netdev rx_queue to ingress_queue Eric Dumazet
2010-09-28 18:04 ` Jarek Poplawski
2010-09-29 10:56   ` jamal
2010-09-29 13:18     ` Jarek Poplawski
2010-09-29 16:54       ` Jarek Poplawski
2010-09-30 22:58     ` [PATCH net-next2.6] net: dynamic ingress_queue allocation Eric Dumazet
2010-10-01 11:45       ` jamal
2010-10-01 13:56         ` [PATCH net-next V2] " Eric Dumazet
2010-10-02  9:32           ` Jarek Poplawski
2010-10-02 16:11             ` [PATCH net-next V3] " Eric Dumazet
2010-10-03  9:42               ` Jarek Poplawski
2010-10-03 13:10                 ` jamal
2010-10-04  8:42                 ` Eric Dumazet
2010-10-04  9:00                   ` [PATCH net-next] net: relax rtnl_dereference() Eric Dumazet
2010-10-05  7:29                     ` David Miller
2010-10-04 12:06                   ` [PATCH net-next V3] net: dynamic ingress_queue allocation Jarek Poplawski
2010-10-04 12:52                     ` Eric Dumazet
2010-10-04 14:24                       ` Jarek Poplawski
2010-10-04 15:21                         ` Eric Dumazet
2010-10-05  7:24               ` David Miller
2010-10-05  7:31                 ` Eric Dumazet
2010-10-02 12:10           ` [PATCH net-next V2] " jamal
2010-09-29 20:27 ` [PATCH net-next2.6] net: rename netdev rx_queue to ingress_queue David Miller

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).