* [RFC PATCH] net_sched: bulk free tcf_block
@ 2017-11-29 14:25 Paolo Abeni
2017-11-29 16:14 ` Alexander Duyck
2017-12-01 7:14 ` Cong Wang
0 siblings, 2 replies; 7+ messages in thread
From: Paolo Abeni @ 2017-11-29 14:25 UTC (permalink / raw)
To: netdev; +Cc: Jamal Hadi Salim, Cong Wang, Jiri Pirko, David S. Miller
Currently deleting qdisc with a large number of children and filters
can take a lot of time:
tc qdisc add dev lo root htb
for I in `seq 1 1000`; do
        tc class add dev lo parent 1: classid 1:$I htb rate 100kbit
        tc qdisc add dev lo parent 1:$I handle $((I + 1)): htb
        for J in `seq 1 10`; do
                tc filter add dev lo parent $((I + 1)): u32 match ip src 1.1.1.$J
        done
done
time tc qdisc del dev lo root
real 0m54.764s
user 0m0.023s
sys 0m0.000s
This is due to the multiple rcu_barrier() calls, one for each tcf_block
freed, invoked with the rtnl lock held. Most other network related
tasks will block in this timespan.
This change implements bulk free of tcf_block() at destroy() time:
when deleting a qdisc hierarchy, the to-be-deleted blocks are queued
in a rtnl_lock protected list, and a single rcu_barrier is invoked
for each burst.
The burst is flushed after the deletion of the topmost qdisc of the
destroyed hierarchy and all the queued blocks are deleted with a
single delayed work call.
After this change, the previous example gives:
real 0m0.193s
user 0m0.000s
sys 0m0.016s
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
This patch adds 2 new list_head fields to tcf_block; they could be
replaced with a single pointer, open-coding singly linked list
manipulation in cls_api.c, if a lower memory impact is required.
---
include/net/pkt_cls.h | 1 +
include/net/sch_generic.h | 5 +++
net/sched/cls_api.c | 78 +++++++++++++++++++++++++++++++++++------------
net/sched/sch_api.c | 1 +
net/sched/sch_generic.c | 17 +++++++++++
net/sched/sch_ingress.c | 1 +
6 files changed, 83 insertions(+), 20 deletions(-)
diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h
index 0105445cab83..12777cfae77c 100644
--- a/include/net/pkt_cls.h
+++ b/include/net/pkt_cls.h
@@ -45,6 +45,7 @@ int tcf_block_get_ext(struct tcf_block **p_block, struct Qdisc *q,
void tcf_block_put(struct tcf_block *block);
void tcf_block_put_ext(struct tcf_block *block, struct Qdisc *q,
struct tcf_block_ext_info *ei);
+void tcf_flush_blocks(void);
static inline struct Qdisc *tcf_block_q(struct tcf_block *block)
{
diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index 65d0d25f2648..99846ee644a8 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -71,6 +71,7 @@ struct Qdisc {
* qdisc_tree_decrease_qlen() should stop.
*/
#define TCQ_F_INVISIBLE 0x80 /* invisible by default in dump */
+#define TCQ_F_DELETING 0x100
u32 limit;
const struct Qdisc_ops *ops;
struct qdisc_size_table __rcu *stab;
@@ -279,6 +280,10 @@ struct tcf_block {
struct Qdisc *q;
struct list_head cb_list;
struct work_struct work;
+
+ /* TODO: use a single list, to avoid wasting too much memory */
+ struct list_head del_list;
+ struct list_head del_head;
};
static inline void qdisc_cb_private_validate(const struct sk_buff *skb, int sz)
diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index 7d97f612c9b9..446b16c1f532 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -37,6 +37,61 @@ static LIST_HEAD(tcf_proto_base);
/* Protects list of registered TC modules. It is pure SMP lock. */
static DEFINE_RWLOCK(cls_mod_lock);
+/* List of tcf_blocks queued for deletion. By bulk freeing them we avoid the
+ * rcu_barrier() storm at root_qdisc->destroy() time
+ */
+static LIST_HEAD(queued_blocks);
+
+static void queue_for_deletion(struct tcf_block *block)
+{
+ if (WARN_ON(!list_empty(&block->del_list)))
+ return;
+
+ ASSERT_RTNL();
+ list_add(&block->del_list, &queued_blocks);
+}
+
+static void flush_blocks(struct work_struct *work)
+{
+ struct tcf_block *block, *tmp_block;
+ struct tcf_chain *chain, *tmp;
+ struct list_head *head;
+
+ head = &container_of(work, struct tcf_block, work)->del_head;
+ rtnl_lock();
+ list_for_each_entry(block, head, del_list)
+ /* Only chain 0 should be still here. */
+ list_for_each_entry_safe(chain, tmp, &block->chain_list, list)
+ tcf_chain_put(chain);
+ rtnl_unlock();
+
+ list_for_each_entry_safe(block, tmp_block, head, del_list)
+ kfree(block);
+}
+
+void tcf_flush_blocks(void)
+{
+ struct tcf_block *head;
+ LIST_HEAD(flush_burst);
+
+ ASSERT_RTNL();
+ if (list_empty(&queued_blocks))
+ return;
+
+ head = list_first_entry(&queued_blocks, struct tcf_block, del_list);
+ INIT_LIST_HEAD(&head->del_head);
+ list_splice_init(&queued_blocks, &head->del_head);
+ INIT_WORK(&head->work, flush_blocks);
+
+ /* Wait for existing RCU callbacks to cool down, make sure their works
+ * have been queued before this. We can not flush pending works here
+ * because we are holding the RTNL lock.
+ */
+ rcu_barrier();
+ schedule_work(&head->work);
+}
+EXPORT_SYMBOL_GPL(tcf_flush_blocks);
+
/* Find classifier type by string name */
static const struct tcf_proto_ops *tcf_proto_lookup_ops(const char *kind)
@@ -288,6 +343,7 @@ int tcf_block_get_ext(struct tcf_block **p_block, struct Qdisc *q,
return -ENOMEM;
INIT_LIST_HEAD(&block->chain_list);
INIT_LIST_HEAD(&block->cb_list);
+ INIT_LIST_HEAD(&block->del_list);
/* Create chain 0 by default, it has to be always present. */
chain = tcf_chain_create(block, 0);
@@ -330,19 +386,6 @@ int tcf_block_get(struct tcf_block **p_block,
}
EXPORT_SYMBOL(tcf_block_get);
-static void tcf_block_put_final(struct work_struct *work)
-{
- struct tcf_block *block = container_of(work, struct tcf_block, work);
- struct tcf_chain *chain, *tmp;
-
- rtnl_lock();
- /* Only chain 0 should be still here. */
- list_for_each_entry_safe(chain, tmp, &block->chain_list, list)
- tcf_chain_put(chain);
- rtnl_unlock();
- kfree(block);
-}
-
/* XXX: Standalone actions are not allowed to jump to any chain, and bound
* actions should be all removed after flushing. However, filters are now
* destroyed in tc filter workqueue with RTNL lock, they can not race here.
@@ -357,13 +400,7 @@ void tcf_block_put_ext(struct tcf_block *block, struct Qdisc *q,
tcf_block_offload_unbind(block, q, ei);
- INIT_WORK(&block->work, tcf_block_put_final);
- /* Wait for existing RCU callbacks to cool down, make sure their works
- * have been queued before this. We can not flush pending works here
- * because we are holding the RTNL lock.
- */
- rcu_barrier();
- tcf_queue_work(&block->work);
+ queue_for_deletion(block);
}
EXPORT_SYMBOL(tcf_block_put_ext);
@@ -920,6 +957,7 @@ static int tc_ctl_tfilter(struct sk_buff *skb, struct nlmsghdr *n,
if (tp_created)
tcf_proto_destroy(tp);
}
+ tcf_flush_blocks();
errout:
if (chain)
diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
index b6c4f536876b..5fe2dcb73bfd 100644
--- a/net/sched/sch_api.c
+++ b/net/sched/sch_api.c
@@ -1645,6 +1645,7 @@ static int tclass_del_notify(struct net *net,
kfree_skb(skb);
return err;
}
+ tcf_flush_blocks();
return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
n->nlmsg_flags & NLM_F_ECHO);
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index 3839cbbdc32b..299c5d33916a 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -28,6 +28,7 @@
#include <linux/if_vlan.h>
#include <net/sch_generic.h>
#include <net/pkt_sched.h>
+#include <net/pkt_cls.h>
#include <net/dst.h>
#include <trace/events/qdisc.h>
@@ -708,11 +709,25 @@ static void qdisc_free(struct Qdisc *qdisc)
void qdisc_destroy(struct Qdisc *qdisc)
{
const struct Qdisc_ops *ops = qdisc->ops;
+ struct Qdisc *p;
+ bool flush;
if (qdisc->flags & TCQ_F_BUILTIN ||
!refcount_dec_and_test(&qdisc->refcnt))
return;
+ /* we can avoid flushing the pending blocks if this qdisc is a child
+ * deleted by a recursive destroy() call and the parent qdisc is already
+ * removed.
+ */
+ qdisc->flags |= TCQ_F_DELETING;
+ if (qdisc->parent != TC_H_ROOT) {
+ p = qdisc_lookup(qdisc_dev(qdisc), TC_H_MAJ(qdisc->parent));
+ flush = p && !(p->flags & TCQ_F_DELETING);
+ } else {
+ flush = true;
+ }
+
#ifdef CONFIG_NET_SCHED
qdisc_hash_del(qdisc);
@@ -723,6 +738,8 @@ void qdisc_destroy(struct Qdisc *qdisc)
ops->reset(qdisc);
if (ops->destroy)
ops->destroy(qdisc);
+ if (flush)
+ tcf_flush_blocks();
module_put(ops->owner);
dev_put(qdisc_dev(qdisc));
diff --git a/net/sched/sch_ingress.c b/net/sched/sch_ingress.c
index 5ecc38f35d47..7329b8f55160 100644
--- a/net/sched/sch_ingress.c
+++ b/net/sched/sch_ingress.c
@@ -201,6 +201,7 @@ static int clsact_init(struct Qdisc *sch, struct nlattr *opt)
err_egress_block_get:
tcf_block_put_ext(q->ingress_block, sch, &q->ingress_block_info);
+ tcf_flush_blocks();
return err;
}
--
2.13.6
* Re: [RFC PATCH] net_sched: bulk free tcf_block
2017-11-29 14:25 [RFC PATCH] net_sched: bulk free tcf_block Paolo Abeni
@ 2017-11-29 16:14 ` Alexander Duyck
2017-11-29 18:47 ` Paolo Abeni
2017-12-01 7:14 ` Cong Wang
1 sibling, 1 reply; 7+ messages in thread
From: Alexander Duyck @ 2017-11-29 16:14 UTC (permalink / raw)
To: Paolo Abeni
Cc: Netdev, Jamal Hadi Salim, Cong Wang, Jiri Pirko, David S. Miller
On Wed, Nov 29, 2017 at 6:25 AM, Paolo Abeni <pabeni@redhat.com> wrote:
> Currently deleting qdisc with a large number of children and filters
> can take a lot of time:
>
> tc qdisc add dev lo root htb
> for I in `seq 1 1000`; do
> tc class add dev lo parent 1: classid 1:$I htb rate 100kbit
> tc qdisc add dev lo parent 1:$I handle $((I + 1)): htb
> for J in `seq 1 10`; do
> tc filter add dev lo parent $((I + 1)): u32 match ip src 1.1.1.$J
> done
> done
> time tc qdisc del dev lo root
>
> real 0m54.764s
> user 0m0.023s
> sys 0m0.000s
>
> This is due to the multiple rcu_barrier() calls, one for each tcf_block
> freed, invoked with the rtnl lock held. Most other network related
> tasks will block in this timespan.
>
> This change implements bulk free of tcf_block() at destroy() time:
> when deleting a qdisc hierarchy, the to-be-deleted blocks are queued
> in a rtnl_lock protected list, and a single rcu_barrier is invoked
> for each burst.
>
> The burst is flushed after the deletion of the topmost qdisc of the
> destroyed hierarchy and all the queued blocks are deleted with a
> single delayed work call.
>
> After this change, the previous example gives:
>
> real 0m0.193s
> user 0m0.000s
> sys 0m0.016s
>
> Signed-off-by: Paolo Abeni <pabeni@redhat.com>
While I agree there is an issue I don't think this is being approached
quite the right way. The question I have is why aren't we using the
standard RCU approach for this and simply using either call_rcu or
kfree_rcu to free the object?
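Just to illustrate what I have in mind, a rough, untested sketch (this
assumes we add a struct rcu_head to tcf_block; the chain 0 puts would
still need to happen somewhere that holds RTNL):

    /* hypothetical field: struct tcf_block { ...; struct rcu_head rcu; }; */
    static void tcf_block_free_rcu(struct rcu_head *head)
    {
            struct tcf_block *block = container_of(head, struct tcf_block, rcu);

            /* no locks needed here, the block is already unreachable */
            kfree(block);
    }

    /* in tcf_block_put_ext(), instead of rcu_barrier() + tcf_queue_work(): */
    call_rcu(&block->rcu, tcf_block_free_rcu);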
> ---
> This patch adds 2 new list_head fields to tcf_block, that could be
> replaced with a single pointer, open coding single linked list
> manipulation in cls_api.c, if a lower memory impact is required.
> ---
> include/net/pkt_cls.h | 1 +
> include/net/sch_generic.h | 5 +++
> net/sched/cls_api.c | 78 +++++++++++++++++++++++++++++++++++------------
> net/sched/sch_api.c | 1 +
> net/sched/sch_generic.c | 17 +++++++++++
> net/sched/sch_ingress.c | 1 +
> 6 files changed, 83 insertions(+), 20 deletions(-)
>
> diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h
> index 0105445cab83..12777cfae77c 100644
> --- a/include/net/pkt_cls.h
> +++ b/include/net/pkt_cls.h
> @@ -45,6 +45,7 @@ int tcf_block_get_ext(struct tcf_block **p_block, struct Qdisc *q,
> void tcf_block_put(struct tcf_block *block);
> void tcf_block_put_ext(struct tcf_block *block, struct Qdisc *q,
> struct tcf_block_ext_info *ei);
> +void tcf_flush_blocks(void);
>
> static inline struct Qdisc *tcf_block_q(struct tcf_block *block)
> {
> diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
> index 65d0d25f2648..99846ee644a8 100644
> --- a/include/net/sch_generic.h
> +++ b/include/net/sch_generic.h
> @@ -71,6 +71,7 @@ struct Qdisc {
> * qdisc_tree_decrease_qlen() should stop.
> */
> #define TCQ_F_INVISIBLE 0x80 /* invisible by default in dump */
> +#define TCQ_F_DELETING 0x100
> u32 limit;
> const struct Qdisc_ops *ops;
> struct qdisc_size_table __rcu *stab;
> @@ -279,6 +280,10 @@ struct tcf_block {
> struct Qdisc *q;
> struct list_head cb_list;
> struct work_struct work;
> +
> + /* TODO: use a single list, to avoid wasting too much memory */
> + struct list_head del_list;
> + struct list_head del_head;
> };
>
This is just adding yet another layer of deferred freeing. We already
have RCU, so why don't we just use that?
> static inline void qdisc_cb_private_validate(const struct sk_buff *skb, int sz)
> diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
> index 7d97f612c9b9..446b16c1f532 100644
> --- a/net/sched/cls_api.c
> +++ b/net/sched/cls_api.c
> @@ -37,6 +37,61 @@ static LIST_HEAD(tcf_proto_base);
> /* Protects list of registered TC modules. It is pure SMP lock. */
> static DEFINE_RWLOCK(cls_mod_lock);
>
> +/* List of tcf_blocks queued for deletion. By bulk freeing them we avoid the
> + * rcu_barrier() storm at root_qdisc->destroy() time
> + */
> +static LIST_HEAD(queued_blocks);
> +
> +static void queue_for_deletion(struct tcf_block *block)
> +{
> + if (WARN_ON(!list_empty(&block->del_list)))
> + return;
> +
> + ASSERT_RTNL();
> + list_add(&block->del_list, &queued_blocks);
> +}
> +
> +static void flush_blocks(struct work_struct *work)
> +{
> + struct tcf_block *block, *tmp_block;
> + struct tcf_chain *chain, *tmp;
> + struct list_head *head;
> +
> + head = &container_of(work, struct tcf_block, work)->del_head;
> + rtnl_lock();
> + list_for_each_entry(block, head, del_list)
> + /* Only chain 0 should be still here. */
> + list_for_each_entry_safe(chain, tmp, &block->chain_list, list)
> + tcf_chain_put(chain);
> + rtnl_unlock();
> +
> + list_for_each_entry_safe(block, tmp_block, head, del_list)
> + kfree(block);
> +}
> +
> +void tcf_flush_blocks(void)
> +{
> + struct tcf_block *head;
> + LIST_HEAD(flush_burst);
> +
> + ASSERT_RTNL();
> + if (list_empty(&queued_blocks))
> + return;
> +
> + head = list_first_entry(&queued_blocks, struct tcf_block, del_list);
> + INIT_LIST_HEAD(&head->del_head);
> + list_splice_init(&queued_blocks, &head->del_head);
> + INIT_WORK(&head->work, flush_blocks);
> +
> + /* Wait for existing RCU callbacks to cool down, make sure their works
> + * have been queued before this. We can not flush pending works here
> + * because we are holding the RTNL lock.
> + */
> + rcu_barrier();
> + schedule_work(&head->work);
> +}
> +EXPORT_SYMBOL_GPL(tcf_flush_blocks);
> +
> /* Find classifier type by string name */
>
> static const struct tcf_proto_ops *tcf_proto_lookup_ops(const char *kind)
> @@ -288,6 +343,7 @@ int tcf_block_get_ext(struct tcf_block **p_block, struct Qdisc *q,
> return -ENOMEM;
> INIT_LIST_HEAD(&block->chain_list);
> INIT_LIST_HEAD(&block->cb_list);
> + INIT_LIST_HEAD(&block->del_list);
>
> /* Create chain 0 by default, it has to be always present. */
> chain = tcf_chain_create(block, 0);
> @@ -330,19 +386,6 @@ int tcf_block_get(struct tcf_block **p_block,
> }
> EXPORT_SYMBOL(tcf_block_get);
>
> -static void tcf_block_put_final(struct work_struct *work)
> -{
> - struct tcf_block *block = container_of(work, struct tcf_block, work);
> - struct tcf_chain *chain, *tmp;
> -
> - rtnl_lock();
> - /* Only chain 0 should be still here. */
> - list_for_each_entry_safe(chain, tmp, &block->chain_list, list)
> - tcf_chain_put(chain);
So it seems like the heart of the problem is right here. The
tcf_chain_put call is updating the reference count and I would assume
that is the only bit that really needs you to still be holding onto
the rtnl_lock.
The question I would have is if there is anything accessing the
reference count or manipulating the list itself without holding the
rtnl lock? If not you could look at converting this whole thing from
using a list to an rculist and it seems like it would save you a lot
of trouble. As far as I can tell the only thing you would then really
have to worry about would be the freeing of the chain itself which
would happen in an rcu callback instead of with the rtnl lock held.
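Roughly something like this (untested sketch, assuming readers of
block->chain_list can run under rcu_read_lock() and that a struct
rcu_head is added to tcf_chain):

    /* hypothetical field: struct tcf_chain { ...; struct rcu_head rcu; }; */
    static void tcf_chain_free_rcu(struct rcu_head *head)
    {
            kfree(container_of(head, struct tcf_chain, rcu));
    }

    /* in tcf_chain_destroy(), still under RTNL: */
    list_del_rcu(&chain->list);
    call_rcu(&chain->rcu, tcf_chain_free_rcu);

    /* lockless readers would then walk the chain list with
     * list_for_each_entry_rcu() inside rcu_read_lock()/rcu_read_unlock().
     */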
> - rtnl_unlock();
> - kfree(block);
> -}
> -
> /* XXX: Standalone actions are not allowed to jump to any chain, and bound
> * actions should be all removed after flushing. However, filters are now
> * destroyed in tc filter workqueue with RTNL lock, they can not race here.
> @@ -357,13 +400,7 @@ void tcf_block_put_ext(struct tcf_block *block, struct Qdisc *q,
>
> tcf_block_offload_unbind(block, q, ei);
>
> - INIT_WORK(&block->work, tcf_block_put_final);
> - /* Wait for existing RCU callbacks to cool down, make sure their works
> - * have been queued before this. We can not flush pending works here
> - * because we are holding the RTNL lock.
> - */
> - rcu_barrier();
> - tcf_queue_work(&block->work);
> + queue_for_deletion(block);
> }
> EXPORT_SYMBOL(tcf_block_put_ext);
>
> @@ -920,6 +957,7 @@ static int tc_ctl_tfilter(struct sk_buff *skb, struct nlmsghdr *n,
> if (tp_created)
> tcf_proto_destroy(tp);
> }
> + tcf_flush_blocks();
>
> errout:
> if (chain)
> diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
> index b6c4f536876b..5fe2dcb73bfd 100644
> --- a/net/sched/sch_api.c
> +++ b/net/sched/sch_api.c
> @@ -1645,6 +1645,7 @@ static int tclass_del_notify(struct net *net,
> kfree_skb(skb);
> return err;
> }
> + tcf_flush_blocks();
>
> return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
> n->nlmsg_flags & NLM_F_ECHO);
> diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
> index 3839cbbdc32b..299c5d33916a 100644
> --- a/net/sched/sch_generic.c
> +++ b/net/sched/sch_generic.c
> @@ -28,6 +28,7 @@
> #include <linux/if_vlan.h>
> #include <net/sch_generic.h>
> #include <net/pkt_sched.h>
> +#include <net/pkt_cls.h>
> #include <net/dst.h>
> #include <trace/events/qdisc.h>
>
> @@ -708,11 +709,25 @@ static void qdisc_free(struct Qdisc *qdisc)
> void qdisc_destroy(struct Qdisc *qdisc)
> {
> const struct Qdisc_ops *ops = qdisc->ops;
> + struct Qdisc *p;
> + bool flush;
>
> if (qdisc->flags & TCQ_F_BUILTIN ||
> !refcount_dec_and_test(&qdisc->refcnt))
> return;
>
> + /* we can avoid flushing the pending blocks if this qdisc is a child
> + * deleted by a recursive destroy() call and the parent qdisc is already
> + * removed.
> + */
> + qdisc->flags |= TCQ_F_DELETING;
> + if (qdisc->parent != TC_H_ROOT) {
> + p = qdisc_lookup(qdisc_dev(qdisc), TC_H_MAJ(qdisc->parent));
> + flush = p && !(p->flags & TCQ_F_DELETING);
> + } else {
> + flush = true;
> + }
> +
> #ifdef CONFIG_NET_SCHED
> qdisc_hash_del(qdisc);
>
> @@ -723,6 +738,8 @@ void qdisc_destroy(struct Qdisc *qdisc)
> ops->reset(qdisc);
> if (ops->destroy)
> ops->destroy(qdisc);
> + if (flush)
> + tcf_flush_blocks();
>
> module_put(ops->owner);
> dev_put(qdisc_dev(qdisc));
> diff --git a/net/sched/sch_ingress.c b/net/sched/sch_ingress.c
> index 5ecc38f35d47..7329b8f55160 100644
> --- a/net/sched/sch_ingress.c
> +++ b/net/sched/sch_ingress.c
> @@ -201,6 +201,7 @@ static int clsact_init(struct Qdisc *sch, struct nlattr *opt)
>
> err_egress_block_get:
> tcf_block_put_ext(q->ingress_block, sch, &q->ingress_block_info);
> + tcf_flush_blocks();
> return err;
> }
>
> --
> 2.13.6
>
* Re: [RFC PATCH] net_sched: bulk free tcf_block
2017-11-29 16:14 ` Alexander Duyck
@ 2017-11-29 18:47 ` Paolo Abeni
0 siblings, 0 replies; 7+ messages in thread
From: Paolo Abeni @ 2017-11-29 18:47 UTC (permalink / raw)
To: Alexander Duyck
Cc: Netdev, Jamal Hadi Salim, Cong Wang, Jiri Pirko, David S. Miller
Hi,
On Wed, 2017-11-29 at 08:14 -0800, Alexander Duyck wrote:
> On Wed, Nov 29, 2017 at 6:25 AM, Paolo Abeni <pabeni@redhat.com> wrote:
> > Currently deleting qdisc with a large number of children and filters
> > can take a lot of time:
> >
> > tc qdisc add dev lo root htb
> > for I in `seq 1 1000`; do
> > tc class add dev lo parent 1: classid 1:$I htb rate 100kbit
> > tc qdisc add dev lo parent 1:$I handle $((I + 1)): htb
> > for J in `seq 1 10`; do
> > tc filter add dev lo parent $((I + 1)): u32 match ip src 1.1.1.$J
> > done
> > done
> > time tc qdisc del dev lo root
> >
> > real 0m54.764s
> > user 0m0.023s
> > sys 0m0.000s
> >
> > This is due to the multiple rcu_barrier() calls, one for each tcf_block
> > freed, invoked with the rtnl lock held. Most other network related
> > tasks will block in this timespan.
> >
> > This change implements bulk free of tcf_block() at destroy() time:
> > when deleting a qdisc hierarchy, the to-be-deleted blocks are queued
> > in a rtnl_lock protected list, and a single rcu_barrier is invoked
> > for each burst.
> >
> > The burst is flushed after the deletion of the topmost qdisc of the
> > destroyed hierarchy and all the queued blocks are deleted with a
> > single delayed work call.
> >
> > After this change, the previous example gives:
> >
> > real 0m0.193s
> > user 0m0.000s
> > sys 0m0.016s
> >
> > Signed-off-by: Paolo Abeni <pabeni@redhat.com>
>
> While I agree there is an issue I don't think this is being approached
> quite the right way. The question I have is why aren't we using the
> standard RCU approach for this and simply using either call_rcu or
> kfree_rcu to free the object?
>
> > ---
> > This patch adds 2 new list_head fields to tcf_block, that could be
> > replaced with a single pointer, open coding single linked list
> > manipulation in cls_api.c, if a lower memory impact is required.
> > ---
> > include/net/pkt_cls.h | 1 +
> > include/net/sch_generic.h | 5 +++
> > net/sched/cls_api.c | 78 +++++++++++++++++++++++++++++++++++------------
> > net/sched/sch_api.c | 1 +
> > net/sched/sch_generic.c | 17 +++++++++++
> > net/sched/sch_ingress.c | 1 +
> > 6 files changed, 83 insertions(+), 20 deletions(-)
> >
> > diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h
> > index 0105445cab83..12777cfae77c 100644
> > --- a/include/net/pkt_cls.h
> > +++ b/include/net/pkt_cls.h
> > @@ -45,6 +45,7 @@ int tcf_block_get_ext(struct tcf_block **p_block, struct Qdisc *q,
> > void tcf_block_put(struct tcf_block *block);
> > void tcf_block_put_ext(struct tcf_block *block, struct Qdisc *q,
> > struct tcf_block_ext_info *ei);
> > +void tcf_flush_blocks(void);
> >
> > static inline struct Qdisc *tcf_block_q(struct tcf_block *block)
> > {
> > diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
> > index 65d0d25f2648..99846ee644a8 100644
> > --- a/include/net/sch_generic.h
> > +++ b/include/net/sch_generic.h
> > @@ -71,6 +71,7 @@ struct Qdisc {
> > * qdisc_tree_decrease_qlen() should stop.
> > */
> > #define TCQ_F_INVISIBLE 0x80 /* invisible by default in dump */
> > +#define TCQ_F_DELETING 0x100
> > u32 limit;
> > const struct Qdisc_ops *ops;
> > struct qdisc_size_table __rcu *stab;
> > @@ -279,6 +280,10 @@ struct tcf_block {
> > struct Qdisc *q;
> > struct list_head cb_list;
> > struct work_struct work;
> > +
> > + /* TODO: use a single list, to avoid wasting too much memory */
> > + struct list_head del_list;
> > + struct list_head del_head;
> > };
> >
>
> This is just adding yet another layer of deferred freeing. We already
> have RCU why don't we just use that?
>
> > static inline void qdisc_cb_private_validate(const struct sk_buff *skb, int sz)
> > diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
> > index 7d97f612c9b9..446b16c1f532 100644
> > --- a/net/sched/cls_api.c
> > +++ b/net/sched/cls_api.c
> > @@ -37,6 +37,61 @@ static LIST_HEAD(tcf_proto_base);
> > /* Protects list of registered TC modules. It is pure SMP lock. */
> > static DEFINE_RWLOCK(cls_mod_lock);
> >
> > +/* List of tcf_blocks queued for deletion. By bulk freeing them we avoid the
> > + * rcu_barrier() storm at root_qdisc->destroy() time
> > + */
> > +static LIST_HEAD(queued_blocks);
> > +
> > +static void queue_for_deletion(struct tcf_block *block)
> > +{
> > + if (WARN_ON(!list_empty(&block->del_list)))
> > + return;
> > +
> > + ASSERT_RTNL();
> > + list_add(&block->del_list, &queued_blocks);
> > +}
> > +
> > +static void flush_blocks(struct work_struct *work)
> > +{
> > + struct tcf_block *block, *tmp_block;
> > + struct tcf_chain *chain, *tmp;
> > + struct list_head *head;
> > +
> > + head = &container_of(work, struct tcf_block, work)->del_head;
> > + rtnl_lock();
> > + list_for_each_entry(block, head, del_list)
> > + /* Only chain 0 should be still here. */
> > + list_for_each_entry_safe(chain, tmp, &block->chain_list, list)
> > + tcf_chain_put(chain);
> > + rtnl_unlock();
> > +
> > + list_for_each_entry_safe(block, tmp_block, head, del_list)
> > + kfree(block);
> > +}
> > +
> > +void tcf_flush_blocks(void)
> > +{
> > + struct tcf_block *head;
> > + LIST_HEAD(flush_burst);
> > +
> > + ASSERT_RTNL();
> > + if (list_empty(&queued_blocks))
> > + return;
> > +
> > + head = list_first_entry(&queued_blocks, struct tcf_block, del_list);
> > + INIT_LIST_HEAD(&head->del_head);
> > + list_splice_init(&queued_blocks, &head->del_head);
> > + INIT_WORK(&head->work, flush_blocks);
> > +
> > + /* Wait for existing RCU callbacks to cool down, make sure their works
> > + * have been queued before this. We can not flush pending works here
> > + * because we are holding the RTNL lock.
> > + */
> > + rcu_barrier();
> > + schedule_work(&head->work);
> > +}
> > +EXPORT_SYMBOL_GPL(tcf_flush_blocks);
> > +
> > /* Find classifier type by string name */
> >
> > static const struct tcf_proto_ops *tcf_proto_lookup_ops(const char *kind)
> > @@ -288,6 +343,7 @@ int tcf_block_get_ext(struct tcf_block **p_block, struct Qdisc *q,
> > return -ENOMEM;
> > INIT_LIST_HEAD(&block->chain_list);
> > INIT_LIST_HEAD(&block->cb_list);
> > + INIT_LIST_HEAD(&block->del_list);
> >
> > /* Create chain 0 by default, it has to be always present. */
> > chain = tcf_chain_create(block, 0);
> > @@ -330,19 +386,6 @@ int tcf_block_get(struct tcf_block **p_block,
> > }
> > EXPORT_SYMBOL(tcf_block_get);
> >
> > -static void tcf_block_put_final(struct work_struct *work)
> > -{
> > - struct tcf_block *block = container_of(work, struct tcf_block, work);
> > - struct tcf_chain *chain, *tmp;
> > -
> > - rtnl_lock();
> > - /* Only chain 0 should be still here. */
> > - list_for_each_entry_safe(chain, tmp, &block->chain_list, list)
> > - tcf_chain_put(chain);
>
> So it seems like the heart of the problem is right here. The
> tcf_chain_put call is updating the reference count and I would assume
> that is the only bit that really needs you to still be holding onto
> the rtnl_lock.
>
> The question I would have is if there is anything accessing the
> reference count or manipulating the list itself without holding the
> rtnl lock? If not you could look at converting this whole thing from
> using a list to an rculist and it seems like it would save you a lot
> of trouble. As far as I can tell the only thing you would then really
> have to worry about would be the freeing of the chain itself which
> would happen in an rcu callback instead of with the rtnl lock held.
Thank you for the feedback!
Big fat disclaimer: I'm not 100% sure I fully understood the locking
scheme currently in use (hence the RFC tag), so please Jamal/Cong/Jiri
correct me if/where needed.
AFAIU, there is some additional context here: tcf_block_put_final()
and the like must be invoked after the related filters are freed.
Filters themselves must be destroyed synchronously at namespace deletion
time, but must respect an RCU grace period elsewhere - see the comments
in tcf_exts_get_net().
The rcu_barrier() enforces the proper ordering in the latter case, and
the rtnl_lock protects against concurrency in the first case.
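To spell out the ordering I have in mind (roughly, as I understand it):

    filter destroy -> call_rcu()    -> the RCU callback queues the work
                                       that actually frees the filter
    block destroy  -> rcu_barrier() -> all the filter works above are
                                       guaranteed to be queued by now
                   -> queue the block free work

so the block (and chain 0) free can not overtake the deferred filter
frees.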
Cheers,
Paolo
* Re: [RFC PATCH] net_sched: bulk free tcf_block
2017-11-29 14:25 [RFC PATCH] net_sched: bulk free tcf_block Paolo Abeni
2017-11-29 16:14 ` Alexander Duyck
@ 2017-12-01 7:14 ` Cong Wang
2017-12-01 11:05 ` Paolo Abeni
1 sibling, 1 reply; 7+ messages in thread
From: Cong Wang @ 2017-12-01 7:14 UTC (permalink / raw)
To: Paolo Abeni
Cc: Linux Kernel Network Developers, Jamal Hadi Salim, Jiri Pirko,
David S. Miller
[-- Attachment #1: Type: text/plain, Size: 1137 bytes --]
On Wed, Nov 29, 2017 at 6:25 AM, Paolo Abeni <pabeni@redhat.com> wrote:
> Currently deleting qdisc with a large number of children and filters
> can take a lot of time:
>
> tc qdisc add dev lo root htb
> for I in `seq 1 1000`; do
> tc class add dev lo parent 1: classid 1:$I htb rate 100kbit
> tc qdisc add dev lo parent 1:$I handle $((I + 1)): htb
> for J in `seq 1 10`; do
> tc filter add dev lo parent $((I + 1)): u32 match ip src 1.1.1.$J
> done
> done
> time tc qdisc del dev lo root
>
> real 0m54.764s
> user 0m0.023s
> sys 0m0.000s
>
> This is due to the multiple rcu_barrier() calls, one for each tcf_block
> freed, invoked with the rtnl lock held. Most other network related
> tasks will block in this timespan.
Yeah, Eric pointed this out too and I already had an idea to cure
this.
As I already mentioned before, my idea is to refcount the tcf_block
so that we don't need to worry about which put is the last one. Something
like the attached patch below; note it is a PoC _only_, not even
compiled yet. And I am not 100% sure it works either, I will look
deeper tomorrow.
Thanks.
[-- Attachment #2: tcf-block-refcnt.diff --]
[-- Type: text/plain, Size: 1874 bytes --]
diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index 65d0d25f2648..b051c519fd48 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -279,6 +279,7 @@ struct tcf_block {
struct Qdisc *q;
struct list_head cb_list;
struct work_struct work;
+ unsigned int nr_chains;
};
static inline void qdisc_cb_private_validate(const struct sk_buff *skb, int sz)
diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index ddcf04b4ab43..da74b311f09e 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -190,6 +190,7 @@ static struct tcf_chain *tcf_chain_create(struct tcf_block *block,
return NULL;
list_add_tail(&chain->list, &block->chain_list);
chain->block = block;
+ block->nr_chains++;
chain->index = chain_index;
chain->refcnt = 1;
return chain;
@@ -218,8 +219,12 @@ static void tcf_chain_flush(struct tcf_chain *chain)
static void tcf_chain_destroy(struct tcf_chain *chain)
{
+ struct tcf_block *block = chain->block;
+
list_del(&chain->list);
kfree(chain);
+ if (!--block->nr_chains)
+ kfree(block);
}
static void tcf_chain_hold(struct tcf_chain *chain)
@@ -341,7 +346,6 @@ static void tcf_block_put_final(struct work_struct *work)
list_for_each_entry_safe(chain, tmp, &block->chain_list, list)
tcf_chain_put(chain);
rtnl_unlock();
- kfree(block);
}
/* XXX: Standalone actions are not allowed to jump to any chain, and bound
@@ -365,11 +369,6 @@ void tcf_block_put_ext(struct tcf_block *block, struct Qdisc *q,
tcf_block_offload_unbind(block, q, ei);
INIT_WORK(&block->work, tcf_block_put_final);
- /* Wait for existing RCU callbacks to cool down, make sure their works
- * have been queued before this. We can not flush pending works here
- * because we are holding the RTNL lock.
- */
- rcu_barrier();
tcf_queue_work(&block->work);
}
EXPORT_SYMBOL(tcf_block_put_ext);
* Re: [RFC PATCH] net_sched: bulk free tcf_block
2017-12-01 7:14 ` Cong Wang
@ 2017-12-01 11:05 ` Paolo Abeni
2017-12-01 22:07 ` Cong Wang
0 siblings, 1 reply; 7+ messages in thread
From: Paolo Abeni @ 2017-12-01 11:05 UTC (permalink / raw)
To: Cong Wang
Cc: Linux Kernel Network Developers, Jamal Hadi Salim, Jiri Pirko,
David S. Miller
On Thu, 2017-11-30 at 23:14 -0800, Cong Wang wrote:
> On Wed, Nov 29, 2017 at 6:25 AM, Paolo Abeni <pabeni@redhat.com> wrote:
> > Currently deleting qdisc with a large number of children and filters
> > can take a lot of time:
> >
> > tc qdisc add dev lo root htb
> > for I in `seq 1 1000`; do
> > tc class add dev lo parent 1: classid 1:$I htb rate 100kbit
> > tc qdisc add dev lo parent 1:$I handle $((I + 1)): htb
> > for J in `seq 1 10`; do
> > tc filter add dev lo parent $((I + 1)): u32 match ip src 1.1.1.$J
> > done
> > done
> > time tc qdisc del dev lo root
> >
> > real 0m54.764s
> > user 0m0.023s
> > sys 0m0.000s
> >
> > This is due to the multiple rcu_barrier() calls, one for each tcf_block
> > freed, invoked with the rtnl lock held. Most other network related
> > tasks will block in this timespan.
>
> Yeah, Eric pointed out this too and I already had an idea to cure
> this.
>
> As I already mentioned before, my idea is to refcnt the tcf block
> so that we don't need to worry about which is the last. Something
> like the attached patch below, note it is PoC _only_, not even
> compiled yet. And I am not 100% sure it works either, I will look
> deeper tomorrow.
Thank you for the feedback.
I tested your patch and in the above scenario I measure:
real 0m0.017s
user 0m0.000s
sys 0m0.017s
so it apparently works well for this case.
We could still have a storm of rtnl lock/unlock operations while
deleting a large tc tree with a lot of filters, and I think we can reduce
them with bulk free, eventually applying it to filters, too.
That will also reduce the pressure on the rtnl lock when e.g. OVS H/W
offload pushes a lot of rules/sec.
WDYT?
Cheers,
Paolo
* Re: [RFC PATCH] net_sched: bulk free tcf_block
2017-12-01 11:05 ` Paolo Abeni
@ 2017-12-01 22:07 ` Cong Wang
2017-12-04 10:12 ` Paolo Abeni
0 siblings, 1 reply; 7+ messages in thread
From: Cong Wang @ 2017-12-01 22:07 UTC (permalink / raw)
To: Paolo Abeni
Cc: Linux Kernel Network Developers, Jamal Hadi Salim, Jiri Pirko,
David S. Miller
On Fri, Dec 1, 2017 at 3:05 AM, Paolo Abeni <pabeni@redhat.com> wrote:
>
> Thank you for the feedback.
>
> I tested your patch and in the above scenario I measure:
>
> real 0m0.017s
> user 0m0.000s
> sys 0m0.017s
>
> so it apparently works well for this case.
Thanks a lot for testing it! I will test it further. If it goes well, I will
send a formal patch with your Tested-by, unless you object to it.
>
> We could still have a storm of rtnl lock/unlock operations while
> deleting a large tc tree with lot of filters, and I think we can reduce
> them with bulk free, evenutally applying it to filters, too.
>
> That will also reduce the pressure on the rtnl lock when e.g. OVS H/W
> offload pushes a lot of rules/sec.
>
> WDYT?
>
Why is this specific to tc filters? From what you are saying, we need to
batch all TC operations (qdisc, filter and action) rather than just filters?
In the short term, I think batching rtnl lock/unlock is a good optimization,
so I have no objection. In the long term, I think we need to revise the RTNL
lock and probably move it down to each layer, but clearly that requires
much more work.
Thanks.
* Re: [RFC PATCH] net_sched: bulk free tcf_block
2017-12-01 22:07 ` Cong Wang
@ 2017-12-04 10:12 ` Paolo Abeni
0 siblings, 0 replies; 7+ messages in thread
From: Paolo Abeni @ 2017-12-04 10:12 UTC (permalink / raw)
To: Cong Wang
Cc: Linux Kernel Network Developers, Jamal Hadi Salim, Jiri Pirko,
David S. Miller
Hi,
On Fri, 2017-12-01 at 14:07 -0800, Cong Wang wrote:
> On Fri, Dec 1, 2017 at 3:05 AM, Paolo Abeni <pabeni@redhat.com> wrote:
> >
> > Thank you for the feedback.
> >
> > I tested your patch and in the above scenario I measure:
> >
> > real 0m0.017s
> > user 0m0.000s
> > sys 0m0.017s
> >
> > so it apparently works well for this case.
>
> Thanks a lot for testing it! I will test it further. If it goes well, I will
> send a formal patch with your Tested-by, unless you object to it.
I'm a bit late, but I was fine with the above ;)
> > We could still have a storm of rtnl lock/unlock operations while
> > deleting a large tc tree with a lot of filters, and I think we can reduce
> > them with bulk free, eventually applying it to filters, too.
> >
> > That will also reduce the pressure on the rtnl lock when e.g. OVS H/W
> > offload pushes a lot of rules/sec.
> >
> > WDYT?
> >
>
> Why this is specific to tc filter? From what you are saying, we need to
> batch all TC operations (qdisc, filter and action) rather than just filter?
Exactly, the idea would be to batch all the delayed works. I started
with blocks, to somewhat tackle the issue seen on qdisc removal.
> In short term, I think batching rtnl lock/unlock is a good optimization,
> so I have no objection. For long term, I think we need to revise RTNL
> lock and probably move it down to each layer, but clearly it requires
> much more work.
Agreed!
Paolo