netdev.vger.kernel.org archive mirror
* [UPDATED] [NET-NEXT PATCH 1/2] pkt_sched: Add multiqueue scheduler support
@ 2008-08-30  7:21 Jeff Kirsher
  2008-09-12  3:11 ` David Miller
  0 siblings, 1 reply; 12+ messages in thread
From: Jeff Kirsher @ 2008-08-30  7:21 UTC (permalink / raw)
  To: jeff; +Cc: netdev, davem, Alexander Duyck, Jeff Kirsher

From: Alexander Duyck <alexander.h.duyck@intel.com>

This patch is intended to add a qdisc to support the new tx multiqueue
architecture by providing a band for each hardware queue.  By doing
this it is possible to support a different qdisc per physical hardware
queue.

This qdisc uses skb->queue_mapping to select which band to place the
traffic on.  It then uses round robin, with a check to see whether the
subqueue is stopped, to determine which band to dequeue the packet from.

Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---

 include/linux/pkt_sched.h |    6 +
 net/sched/Kconfig         |    9 +
 net/sched/Makefile        |    1 
 net/sched/sch_multiq.c    |  470 +++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 486 insertions(+), 0 deletions(-)
 create mode 100644 net/sched/sch_multiq.c

diff --git a/include/linux/pkt_sched.h b/include/linux/pkt_sched.h
index e5de421..7fbc952 100644
--- a/include/linux/pkt_sched.h
+++ b/include/linux/pkt_sched.h
@@ -123,6 +123,12 @@ struct tc_prio_qopt
 	__u8	priomap[TC_PRIO_MAX+1];	/* Map: logical priority -> PRIO band */
 };
 
+/* MULTIQ section */
+
+struct tc_multiq_qopt {
+	int	bands;			/* Number of bands */
+};
+
 /* TBF section */
 
 struct tc_tbf_qopt
diff --git a/net/sched/Kconfig b/net/sched/Kconfig
index 9437b27..efaa7a7 100644
--- a/net/sched/Kconfig
+++ b/net/sched/Kconfig
@@ -106,6 +106,15 @@ config NET_SCH_PRIO
 	  To compile this code as a module, choose M here: the
 	  module will be called sch_prio.
 
+config NET_SCH_MULTIQ
+	tristate "Hardware Multiqueue-aware Multi Band Queuing (MULTIQ)"
+	---help---
+	  Say Y here if you want to use an n-band queue packet scheduler
+	  to support devices that have multiple hardware transmit queues.
+
+	  To compile this code as a module, choose M here: the
+	  module will be called sch_multiq.
+
 config NET_SCH_RED
 	tristate "Random Early Detection (RED)"
 	---help---
diff --git a/net/sched/Makefile b/net/sched/Makefile
index 1d2b0f7..3d9b953 100644
--- a/net/sched/Makefile
+++ b/net/sched/Makefile
@@ -26,6 +26,7 @@ obj-$(CONFIG_NET_SCH_SFQ)	+= sch_sfq.o
 obj-$(CONFIG_NET_SCH_TBF)	+= sch_tbf.o
 obj-$(CONFIG_NET_SCH_TEQL)	+= sch_teql.o
 obj-$(CONFIG_NET_SCH_PRIO)	+= sch_prio.o
+obj-$(CONFIG_NET_SCH_MULTIQ)	+= sch_multiq.o
 obj-$(CONFIG_NET_SCH_ATM)	+= sch_atm.o
 obj-$(CONFIG_NET_SCH_NETEM)	+= sch_netem.o
 obj-$(CONFIG_NET_CLS_U32)	+= cls_u32.o
diff --git a/net/sched/sch_multiq.c b/net/sched/sch_multiq.c
new file mode 100644
index 0000000..708dd5c
--- /dev/null
+++ b/net/sched/sch_multiq.c
@@ -0,0 +1,470 @@
+/*
+ * net/sched/sch_multiq.c
+ * 		This qdisc is based off of the rr qdisc and is meant to
+ * 		prevent head-of-line blocking on devices that have multiple
+ * 		hardware queues.
+ *
+ *		This program is free software; you can redistribute it and/or
+ *		modify it under the terms of the GNU General Public License
+ *		as published by the Free Software Foundation; either version
+ *		2 of the License, or (at your option) any later version.
+ *
+ * Authors:	Alexander Duyck <alexander.h.duyck@intel.com>
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/skbuff.h>
+#include <net/netlink.h>
+#include <net/pkt_sched.h>
+
+
+struct multiq_sched_data {
+	int bands;
+	int curband;
+	struct tcf_proto *filter_list;
+	struct Qdisc **queues;
+
+};
+
+
+static struct Qdisc *
+multiq_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr)
+{
+	struct multiq_sched_data *q = qdisc_priv(sch);
+	u32 band;
+	struct tcf_result res;
+	int err;
+
+	*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
+	err = tc_classify(skb, q->filter_list, &res);
+#ifdef CONFIG_NET_CLS_ACT
+	switch (err) {
+	case TC_ACT_STOLEN:
+	case TC_ACT_QUEUED:
+		*qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
+	case TC_ACT_SHOT:
+		return NULL;
+	}
+#endif
+	band = skb_get_queue_mapping(skb);
+
+	if (band >= q->bands)
+		return q->queues[0];
+
+	return q->queues[band];
+}
+
+static int
+multiq_enqueue(struct sk_buff *skb, struct Qdisc *sch)
+{
+	struct Qdisc *qdisc;
+	int ret;
+
+	qdisc = multiq_classify(skb, sch, &ret);
+#ifdef CONFIG_NET_CLS_ACT
+	if (qdisc == NULL) {
+
+		if (ret & __NET_XMIT_BYPASS)
+			sch->qstats.drops++;
+		kfree_skb(skb);
+		return ret;
+	}
+#endif
+
+	ret = qdisc_enqueue(skb, qdisc);
+	if (ret == NET_XMIT_SUCCESS) {
+		sch->bstats.bytes += qdisc_pkt_len(skb);
+		sch->bstats.packets++;
+		sch->q.qlen++;
+		return NET_XMIT_SUCCESS;
+	}
+	if (net_xmit_drop_count(ret))
+		sch->qstats.drops++;
+	return ret;
+}
+
+
+static int
+multiq_requeue(struct sk_buff *skb, struct Qdisc *sch)
+{
+	struct Qdisc *qdisc;
+	int ret;
+
+	qdisc = multiq_classify(skb, sch, &ret);
+#ifdef CONFIG_NET_CLS_ACT
+	if (qdisc == NULL) {
+		if (ret & __NET_XMIT_BYPASS)
+			sch->qstats.drops++;
+		kfree_skb(skb);
+		return ret;
+	}
+#endif
+
+	ret = qdisc->ops->requeue(skb, qdisc);
+	if (ret == NET_XMIT_SUCCESS) {
+		sch->q.qlen++;
+		sch->qstats.requeues++;
+		return NET_XMIT_SUCCESS;
+	}
+	if (net_xmit_drop_count(ret))
+		sch->qstats.drops++;
+	return ret;
+}
+
+
+static struct sk_buff *multiq_dequeue(struct Qdisc *sch)
+{
+	struct multiq_sched_data *q = qdisc_priv(sch);
+	struct Qdisc *qdisc;
+	struct sk_buff *skb;
+	int band;
+
+	for (band = 0; band < q->bands; band++) {
+		/* cycle through bands to ensure fairness */
+		q->curband++;
+		if (q->curband >= q->bands)
+			q->curband = 0;
+
+		/* Check that target subqueue is available before
+		 * pulling an skb to avoid excessive requeues
+		 */
+		if (!__netif_subqueue_stopped(qdisc_dev(sch), q->curband)) {
+			qdisc = q->queues[q->curband];
+			skb = qdisc->dequeue(qdisc);
+			if (skb) {
+				sch->q.qlen--;
+				return skb;
+			}
+		}
+	}
+	return NULL;
+
+}
+
+static unsigned int multiq_drop(struct Qdisc *sch)
+{
+	struct multiq_sched_data *q = qdisc_priv(sch);
+	int band;
+	unsigned int len;
+	struct Qdisc *qdisc;
+
+	for (band = q->bands-1; band >= 0; band--) {
+		qdisc = q->queues[band];
+		if (qdisc->ops->drop) {
+			len = qdisc->ops->drop(qdisc);
+			if (len != 0) {
+				sch->q.qlen--;
+				return len;
+			}
+		}
+	}
+	return 0;
+}
+
+
+static void
+multiq_reset(struct Qdisc *sch)
+{
+	int band;
+	struct multiq_sched_data *q = qdisc_priv(sch);
+
+	for (band = 0; band < q->bands; band++)
+		qdisc_reset(q->queues[band]);
+	sch->q.qlen = 0;
+}
+
+static void
+multiq_destroy(struct Qdisc *sch)
+{
+	int band;
+	struct multiq_sched_data *q = qdisc_priv(sch);
+
+	tcf_destroy_chain(&q->filter_list);
+	for (band = 0; band < q->bands; band++)
+		qdisc_destroy(q->queues[band]);
+
+	kfree(q->queues);
+}
+
+static int multiq_tune(struct Qdisc *sch, struct nlattr *opt)
+{
+	struct multiq_sched_data *q = qdisc_priv(sch);
+	struct tc_multiq_qopt *qopt;
+	struct Qdisc **queues;
+	int i;
+
+	if (sch->parent != TC_H_ROOT)
+		return -EINVAL;
+	if (!netif_is_multiqueue(qdisc_dev(sch)))
+		return -EINVAL;
+	if (nla_len(opt) < sizeof(*qopt))
+		return -EINVAL;
+
+	qopt = nla_data(opt);
+
+	qopt->bands = qdisc_dev(sch)->real_num_tx_queues;
+
+	queues = kzalloc(sizeof(struct Qdisc *)*qopt->bands, GFP_KERNEL);
+	if (!queues)
+		return -ENOBUFS;
+
+	for (i = 0; i < qopt->bands; i++)
+		queues[i] = &noop_qdisc;
+
+	sch_tree_lock(sch);
+	q->queues = xchg(&queues, q->queues);
+	if (queues != NULL) {
+		for (i = 0; i < q->bands; i++) {
+			if (queues[i] != &noop_qdisc) {
+				qdisc_tree_decrease_qlen(queues[i],
+							 queues[i]->q.qlen);
+				qdisc_destroy(queues[i]);
+			}
+		}
+		kfree(queues);
+
+	}
+	q->bands = qopt->bands;
+
+	sch_tree_unlock(sch);
+
+	for (i = 0; i < q->bands; i++) {
+		if (q->queues[i] == &noop_qdisc) {
+			struct Qdisc *child;
+			child = qdisc_create_dflt(qdisc_dev(sch),
+						  sch->dev_queue,
+						  &pfifo_qdisc_ops,
+						  TC_H_MAKE(sch->handle,
+							    i + 1));
+			if (child) {
+				sch_tree_lock(sch);
+				child = xchg(&q->queues[i], child);
+
+				if (child != &noop_qdisc) {
+					qdisc_tree_decrease_qlen(child,
+								 child->q.qlen);
+					qdisc_destroy(child);
+				}
+				sch_tree_unlock(sch);
+			}
+		}
+	}
+	return 0;
+}
+
+static int multiq_init(struct Qdisc *sch, struct nlattr *opt)
+{
+	struct multiq_sched_data *q = qdisc_priv(sch);
+
+	q->queues = NULL;
+
+	if (opt == NULL)
+		return -EINVAL;
+
+	return multiq_tune(sch, opt);
+}
+
+static int multiq_dump(struct Qdisc *sch, struct sk_buff *skb)
+{
+	struct multiq_sched_data *q = qdisc_priv(sch);
+	unsigned char *b = skb_tail_pointer(skb);
+	struct nlattr *nest;
+	struct tc_multiq_qopt opt;
+
+	opt.bands = q->bands;
+
+	nest = nla_nest_compat_start(skb, TCA_OPTIONS, sizeof(opt), &opt);
+	if (nest == NULL)
+		goto nla_put_failure;
+	nla_nest_compat_end(skb, nest);
+
+	return skb->len;
+
+nla_put_failure:
+	nlmsg_trim(skb, b);
+	return -1;
+}
+
+static int multiq_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
+		      struct Qdisc **old)
+{
+	struct multiq_sched_data *q = qdisc_priv(sch);
+	unsigned long band = arg - 1;
+
+	if (band >= q->bands)
+		return -EINVAL;
+
+	if (new == NULL)
+		new = &noop_qdisc;
+
+	sch_tree_lock(sch);
+	*old = q->queues[band];
+	q->queues[band] = new;
+	qdisc_tree_decrease_qlen(*old, (*old)->q.qlen);
+	qdisc_reset(*old);
+	sch_tree_unlock(sch);
+
+	return 0;
+}
+
+static struct Qdisc *
+multiq_leaf(struct Qdisc *sch, unsigned long arg)
+{
+	struct multiq_sched_data *q = qdisc_priv(sch);
+	unsigned long band = arg - 1;
+
+	if (band >= q->bands)
+		return NULL;
+
+	return q->queues[band];
+}
+
+static unsigned long multiq_get(struct Qdisc *sch, u32 classid)
+{
+	struct multiq_sched_data *q = qdisc_priv(sch);
+	unsigned long band = TC_H_MIN(classid);
+
+	if (band - 1 >= q->bands)
+		return 0;
+	return band;
+}
+
+static unsigned long multiq_bind(struct Qdisc *sch, unsigned long parent,
+				 u32 classid)
+{
+	return multiq_get(sch, classid);
+}
+
+
+static void multiq_put(struct Qdisc *q, unsigned long cl)
+{
+	return;
+}
+
+static int multiq_change(struct Qdisc *sch, u32 handle, u32 parent,
+			 struct nlattr **tca, unsigned long *arg)
+{
+	unsigned long cl = *arg;
+	struct multiq_sched_data *q = qdisc_priv(sch);
+
+	if (cl - 1 > q->bands)
+		return -ENOENT;
+	return 0;
+}
+
+static int multiq_delete(struct Qdisc *sch, unsigned long cl)
+{
+	struct multiq_sched_data *q = qdisc_priv(sch);
+	if (cl - 1 > q->bands)
+		return -ENOENT;
+	return 0;
+}
+
+
+static int multiq_dump_class(struct Qdisc *sch, unsigned long cl,
+			     struct sk_buff *skb, struct tcmsg *tcm)
+{
+	struct multiq_sched_data *q = qdisc_priv(sch);
+
+	if (cl - 1 > q->bands)
+		return -ENOENT;
+	tcm->tcm_handle |= TC_H_MIN(cl);
+	if (q->queues[cl-1])
+		tcm->tcm_info = q->queues[cl-1]->handle;
+	return 0;
+}
+
+static int multiq_dump_class_stats(struct Qdisc *sch, unsigned long cl,
+				 struct gnet_dump *d)
+{
+	struct multiq_sched_data *q = qdisc_priv(sch);
+	struct Qdisc *cl_q;
+
+	cl_q = q->queues[cl - 1];
+	if (gnet_stats_copy_basic(d, &cl_q->bstats) < 0 ||
+	    gnet_stats_copy_queue(d, &cl_q->qstats) < 0)
+		return -1;
+
+	return 0;
+}
+
+static void multiq_walk(struct Qdisc *sch, struct qdisc_walker *arg)
+{
+	struct multiq_sched_data *q = qdisc_priv(sch);
+	int band;
+
+	if (arg->stop)
+		return;
+
+	for (band = 0; band < q->bands; band++) {
+		if (arg->count < arg->skip) {
+			arg->count++;
+			continue;
+		}
+		if (arg->fn(sch, band+1, arg) < 0) {
+			arg->stop = 1;
+			break;
+		}
+		arg->count++;
+	}
+}
+
+static struct tcf_proto **multiq_find_tcf(struct Qdisc *sch, unsigned long cl)
+{
+	struct multiq_sched_data *q = qdisc_priv(sch);
+
+	if (cl)
+		return NULL;
+	return &q->filter_list;
+}
+
+static const struct Qdisc_class_ops multiq_class_ops = {
+	.graft		=	multiq_graft,
+	.leaf		=	multiq_leaf,
+	.get		=	multiq_get,
+	.put		=	multiq_put,
+	.change		=	multiq_change,
+	.delete		=	multiq_delete,
+	.walk		=	multiq_walk,
+	.tcf_chain	=	multiq_find_tcf,
+	.bind_tcf	=	multiq_bind,
+	.unbind_tcf	=	multiq_put,
+	.dump		=	multiq_dump_class,
+	.dump_stats	=	multiq_dump_class_stats,
+};
+
+static struct Qdisc_ops multiq_qdisc_ops __read_mostly = {
+	.next		=	NULL,
+	.cl_ops		=	&multiq_class_ops,
+	.id		=	"multiq",
+	.priv_size	=	sizeof(struct multiq_sched_data),
+	.enqueue	=	multiq_enqueue,
+	.dequeue	=	multiq_dequeue,
+	.requeue	=	multiq_requeue,
+	.drop		=	multiq_drop,
+	.init		=	multiq_init,
+	.reset		=	multiq_reset,
+	.destroy	=	multiq_destroy,
+	.change		=	multiq_tune,
+	.dump		=	multiq_dump,
+	.owner		=	THIS_MODULE,
+};
+
+static int __init multiq_module_init(void)
+{
+	return register_qdisc(&multiq_qdisc_ops);
+}
+
+static void __exit multiq_module_exit(void)
+{
+	unregister_qdisc(&multiq_qdisc_ops);
+}
+
+module_init(multiq_module_init)
+module_exit(multiq_module_exit)
+
+MODULE_LICENSE("GPL");



* [UPDATED] [NET-NEXT PATCH 1/2] pkt_sched: Add multiqueue scheduler support
@ 2008-08-30  7:23 Jeff Kirsher
  2008-08-30  7:24 ` [UPDATED] [NET-NEXT PATCH 2/2] pkt_action: add new action skbedit Jeff Kirsher
  2008-09-01 21:05 ` [UPDATED] [NET-NEXT PATCH 1/2] pkt_sched: Add multiqueue scheduler support Jarek Poplawski
  0 siblings, 2 replies; 12+ messages in thread
From: Jeff Kirsher @ 2008-08-30  7:23 UTC (permalink / raw)
  To: jeff; +Cc: netdev, davem, Alexander Duyck, Jeff Kirsher

From: Alexander Duyck <alexander.h.duyck@intel.com>

This patch is intended to add a qdisc to support the new tx multiqueue
architecture by providing a band for each hardware queue.  By doing
this it is possible to support a different qdisc per physical hardware
queue.

This qdisc uses skb->queue_mapping to select which band to place the
traffic on.  It then uses round robin, with a check to see whether the
subqueue is stopped, to determine which band to dequeue the packet from.

Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---

 include/linux/pkt_sched.h |    6 +
 net/sched/Kconfig         |    9 +
 net/sched/Makefile        |    1 
 net/sched/sch_multiq.c    |  470 +++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 486 insertions(+), 0 deletions(-)
 create mode 100644 net/sched/sch_multiq.c

diff --git a/include/linux/pkt_sched.h b/include/linux/pkt_sched.h
index e5de421..7fbc952 100644
--- a/include/linux/pkt_sched.h
+++ b/include/linux/pkt_sched.h
@@ -123,6 +123,12 @@ struct tc_prio_qopt
 	__u8	priomap[TC_PRIO_MAX+1];	/* Map: logical priority -> PRIO band */
 };
 
+/* MULTIQ section */
+
+struct tc_multiq_qopt {
+	int	bands;			/* Number of bands */
+};
+
 /* TBF section */
 
 struct tc_tbf_qopt
diff --git a/net/sched/Kconfig b/net/sched/Kconfig
index 9437b27..efaa7a7 100644
--- a/net/sched/Kconfig
+++ b/net/sched/Kconfig
@@ -106,6 +106,15 @@ config NET_SCH_PRIO
 	  To compile this code as a module, choose M here: the
 	  module will be called sch_prio.
 
+config NET_SCH_MULTIQ
+	tristate "Hardware Multiqueue-aware Multi Band Queuing (MULTIQ)"
+	---help---
+	  Say Y here if you want to use an n-band queue packet scheduler
+	  to support devices that have multiple hardware transmit queues.
+
+	  To compile this code as a module, choose M here: the
+	  module will be called sch_multiq.
+
 config NET_SCH_RED
 	tristate "Random Early Detection (RED)"
 	---help---
diff --git a/net/sched/Makefile b/net/sched/Makefile
index 1d2b0f7..3d9b953 100644
--- a/net/sched/Makefile
+++ b/net/sched/Makefile
@@ -26,6 +26,7 @@ obj-$(CONFIG_NET_SCH_SFQ)	+= sch_sfq.o
 obj-$(CONFIG_NET_SCH_TBF)	+= sch_tbf.o
 obj-$(CONFIG_NET_SCH_TEQL)	+= sch_teql.o
 obj-$(CONFIG_NET_SCH_PRIO)	+= sch_prio.o
+obj-$(CONFIG_NET_SCH_MULTIQ)	+= sch_multiq.o
 obj-$(CONFIG_NET_SCH_ATM)	+= sch_atm.o
 obj-$(CONFIG_NET_SCH_NETEM)	+= sch_netem.o
 obj-$(CONFIG_NET_CLS_U32)	+= cls_u32.o
diff --git a/net/sched/sch_multiq.c b/net/sched/sch_multiq.c
new file mode 100644
index 0000000..708dd5c
--- /dev/null
+++ b/net/sched/sch_multiq.c
@@ -0,0 +1,470 @@
+/*
+ * net/sched/sch_multiq.c
+ * 		This qdisc is based off of the rr qdisc and is meant to
+ * 		prevent head-of-line blocking on devices that have multiple
+ * 		hardware queues.
+ *
+ *		This program is free software; you can redistribute it and/or
+ *		modify it under the terms of the GNU General Public License
+ *		as published by the Free Software Foundation; either version
+ *		2 of the License, or (at your option) any later version.
+ *
+ * Authors:	Alexander Duyck <alexander.h.duyck@intel.com>
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/skbuff.h>
+#include <net/netlink.h>
+#include <net/pkt_sched.h>
+
+
+struct multiq_sched_data {
+	int bands;
+	int curband;
+	struct tcf_proto *filter_list;
+	struct Qdisc **queues;
+
+};
+
+
+static struct Qdisc *
+multiq_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr)
+{
+	struct multiq_sched_data *q = qdisc_priv(sch);
+	u32 band;
+	struct tcf_result res;
+	int err;
+
+	*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
+	err = tc_classify(skb, q->filter_list, &res);
+#ifdef CONFIG_NET_CLS_ACT
+	switch (err) {
+	case TC_ACT_STOLEN:
+	case TC_ACT_QUEUED:
+		*qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
+	case TC_ACT_SHOT:
+		return NULL;
+	}
+#endif
+	band = skb_get_queue_mapping(skb);
+
+	if (band >= q->bands)
+		return q->queues[0];
+
+	return q->queues[band];
+}
+
+static int
+multiq_enqueue(struct sk_buff *skb, struct Qdisc *sch)
+{
+	struct Qdisc *qdisc;
+	int ret;
+
+	qdisc = multiq_classify(skb, sch, &ret);
+#ifdef CONFIG_NET_CLS_ACT
+	if (qdisc == NULL) {
+
+		if (ret & __NET_XMIT_BYPASS)
+			sch->qstats.drops++;
+		kfree_skb(skb);
+		return ret;
+	}
+#endif
+
+	ret = qdisc_enqueue(skb, qdisc);
+	if (ret == NET_XMIT_SUCCESS) {
+		sch->bstats.bytes += qdisc_pkt_len(skb);
+		sch->bstats.packets++;
+		sch->q.qlen++;
+		return NET_XMIT_SUCCESS;
+	}
+	if (net_xmit_drop_count(ret))
+		sch->qstats.drops++;
+	return ret;
+}
+
+
+static int
+multiq_requeue(struct sk_buff *skb, struct Qdisc *sch)
+{
+	struct Qdisc *qdisc;
+	int ret;
+
+	qdisc = multiq_classify(skb, sch, &ret);
+#ifdef CONFIG_NET_CLS_ACT
+	if (qdisc == NULL) {
+		if (ret & __NET_XMIT_BYPASS)
+			sch->qstats.drops++;
+		kfree_skb(skb);
+		return ret;
+	}
+#endif
+
+	ret = qdisc->ops->requeue(skb, qdisc);
+	if (ret == NET_XMIT_SUCCESS) {
+		sch->q.qlen++;
+		sch->qstats.requeues++;
+		return NET_XMIT_SUCCESS;
+	}
+	if (net_xmit_drop_count(ret))
+		sch->qstats.drops++;
+	return ret;
+}
+
+
+static struct sk_buff *multiq_dequeue(struct Qdisc *sch)
+{
+	struct multiq_sched_data *q = qdisc_priv(sch);
+	struct Qdisc *qdisc;
+	struct sk_buff *skb;
+	int band;
+
+	for (band = 0; band < q->bands; band++) {
+		/* cycle through bands to ensure fairness */
+		q->curband++;
+		if (q->curband >= q->bands)
+			q->curband = 0;
+
+		/* Check that target subqueue is available before
+		 * pulling an skb to avoid excessive requeues
+		 */
+		if (!__netif_subqueue_stopped(qdisc_dev(sch), q->curband)) {
+			qdisc = q->queues[q->curband];
+			skb = qdisc->dequeue(qdisc);
+			if (skb) {
+				sch->q.qlen--;
+				return skb;
+			}
+		}
+	}
+	return NULL;
+
+}
+
+static unsigned int multiq_drop(struct Qdisc *sch)
+{
+	struct multiq_sched_data *q = qdisc_priv(sch);
+	int band;
+	unsigned int len;
+	struct Qdisc *qdisc;
+
+	for (band = q->bands-1; band >= 0; band--) {
+		qdisc = q->queues[band];
+		if (qdisc->ops->drop) {
+			len = qdisc->ops->drop(qdisc);
+			if (len != 0) {
+				sch->q.qlen--;
+				return len;
+			}
+		}
+	}
+	return 0;
+}
+
+
+static void
+multiq_reset(struct Qdisc *sch)
+{
+	int band;
+	struct multiq_sched_data *q = qdisc_priv(sch);
+
+	for (band = 0; band < q->bands; band++)
+		qdisc_reset(q->queues[band]);
+	sch->q.qlen = 0;
+}
+
+static void
+multiq_destroy(struct Qdisc *sch)
+{
+	int band;
+	struct multiq_sched_data *q = qdisc_priv(sch);
+
+	tcf_destroy_chain(&q->filter_list);
+	for (band = 0; band < q->bands; band++)
+		qdisc_destroy(q->queues[band]);
+
+	kfree(q->queues);
+}
+
+static int multiq_tune(struct Qdisc *sch, struct nlattr *opt)
+{
+	struct multiq_sched_data *q = qdisc_priv(sch);
+	struct tc_multiq_qopt *qopt;
+	struct Qdisc **queues;
+	int i;
+
+	if (sch->parent != TC_H_ROOT)
+		return -EINVAL;
+	if (!netif_is_multiqueue(qdisc_dev(sch)))
+		return -EINVAL;
+	if (nla_len(opt) < sizeof(*qopt))
+		return -EINVAL;
+
+	qopt = nla_data(opt);
+
+	qopt->bands = qdisc_dev(sch)->real_num_tx_queues;
+
+	queues = kzalloc(sizeof(struct Qdisc *)*qopt->bands, GFP_KERNEL);
+	if (!queues)
+		return -ENOBUFS;
+
+	for (i = 0; i < qopt->bands; i++)
+		queues[i] = &noop_qdisc;
+
+	sch_tree_lock(sch);
+	q->queues = xchg(&queues, q->queues);
+	if (queues != NULL) {
+		for (i = 0; i < q->bands; i++) {
+			if (queues[i] != &noop_qdisc) {
+				qdisc_tree_decrease_qlen(queues[i],
+							 queues[i]->q.qlen);
+				qdisc_destroy(queues[i]);
+			}
+		}
+		kfree(queues);
+
+	}
+	q->bands = qopt->bands;
+
+	sch_tree_unlock(sch);
+
+	for (i = 0; i < q->bands; i++) {
+		if (q->queues[i] == &noop_qdisc) {
+			struct Qdisc *child;
+			child = qdisc_create_dflt(qdisc_dev(sch),
+						  sch->dev_queue,
+						  &pfifo_qdisc_ops,
+						  TC_H_MAKE(sch->handle,
+							    i + 1));
+			if (child) {
+				sch_tree_lock(sch);
+				child = xchg(&q->queues[i], child);
+
+				if (child != &noop_qdisc) {
+					qdisc_tree_decrease_qlen(child,
+								 child->q.qlen);
+					qdisc_destroy(child);
+				}
+				sch_tree_unlock(sch);
+			}
+		}
+	}
+	return 0;
+}
+
+static int multiq_init(struct Qdisc *sch, struct nlattr *opt)
+{
+	struct multiq_sched_data *q = qdisc_priv(sch);
+
+	q->queues = NULL;
+
+	if (opt == NULL)
+		return -EINVAL;
+
+	return multiq_tune(sch, opt);
+}
+
+static int multiq_dump(struct Qdisc *sch, struct sk_buff *skb)
+{
+	struct multiq_sched_data *q = qdisc_priv(sch);
+	unsigned char *b = skb_tail_pointer(skb);
+	struct nlattr *nest;
+	struct tc_multiq_qopt opt;
+
+	opt.bands = q->bands;
+
+	nest = nla_nest_compat_start(skb, TCA_OPTIONS, sizeof(opt), &opt);
+	if (nest == NULL)
+		goto nla_put_failure;
+	nla_nest_compat_end(skb, nest);
+
+	return skb->len;
+
+nla_put_failure:
+	nlmsg_trim(skb, b);
+	return -1;
+}
+
+static int multiq_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
+		      struct Qdisc **old)
+{
+	struct multiq_sched_data *q = qdisc_priv(sch);
+	unsigned long band = arg - 1;
+
+	if (band >= q->bands)
+		return -EINVAL;
+
+	if (new == NULL)
+		new = &noop_qdisc;
+
+	sch_tree_lock(sch);
+	*old = q->queues[band];
+	q->queues[band] = new;
+	qdisc_tree_decrease_qlen(*old, (*old)->q.qlen);
+	qdisc_reset(*old);
+	sch_tree_unlock(sch);
+
+	return 0;
+}
+
+static struct Qdisc *
+multiq_leaf(struct Qdisc *sch, unsigned long arg)
+{
+	struct multiq_sched_data *q = qdisc_priv(sch);
+	unsigned long band = arg - 1;
+
+	if (band >= q->bands)
+		return NULL;
+
+	return q->queues[band];
+}
+
+static unsigned long multiq_get(struct Qdisc *sch, u32 classid)
+{
+	struct multiq_sched_data *q = qdisc_priv(sch);
+	unsigned long band = TC_H_MIN(classid);
+
+	if (band - 1 >= q->bands)
+		return 0;
+	return band;
+}
+
+static unsigned long multiq_bind(struct Qdisc *sch, unsigned long parent,
+				 u32 classid)
+{
+	return multiq_get(sch, classid);
+}
+
+
+static void multiq_put(struct Qdisc *q, unsigned long cl)
+{
+	return;
+}
+
+static int multiq_change(struct Qdisc *sch, u32 handle, u32 parent,
+			 struct nlattr **tca, unsigned long *arg)
+{
+	unsigned long cl = *arg;
+	struct multiq_sched_data *q = qdisc_priv(sch);
+
+	if (cl - 1 > q->bands)
+		return -ENOENT;
+	return 0;
+}
+
+static int multiq_delete(struct Qdisc *sch, unsigned long cl)
+{
+	struct multiq_sched_data *q = qdisc_priv(sch);
+	if (cl - 1 > q->bands)
+		return -ENOENT;
+	return 0;
+}
+
+
+static int multiq_dump_class(struct Qdisc *sch, unsigned long cl,
+			     struct sk_buff *skb, struct tcmsg *tcm)
+{
+	struct multiq_sched_data *q = qdisc_priv(sch);
+
+	if (cl - 1 > q->bands)
+		return -ENOENT;
+	tcm->tcm_handle |= TC_H_MIN(cl);
+	if (q->queues[cl-1])
+		tcm->tcm_info = q->queues[cl-1]->handle;
+	return 0;
+}
+
+static int multiq_dump_class_stats(struct Qdisc *sch, unsigned long cl,
+				 struct gnet_dump *d)
+{
+	struct multiq_sched_data *q = qdisc_priv(sch);
+	struct Qdisc *cl_q;
+
+	cl_q = q->queues[cl - 1];
+	if (gnet_stats_copy_basic(d, &cl_q->bstats) < 0 ||
+	    gnet_stats_copy_queue(d, &cl_q->qstats) < 0)
+		return -1;
+
+	return 0;
+}
+
+static void multiq_walk(struct Qdisc *sch, struct qdisc_walker *arg)
+{
+	struct multiq_sched_data *q = qdisc_priv(sch);
+	int band;
+
+	if (arg->stop)
+		return;
+
+	for (band = 0; band < q->bands; band++) {
+		if (arg->count < arg->skip) {
+			arg->count++;
+			continue;
+		}
+		if (arg->fn(sch, band+1, arg) < 0) {
+			arg->stop = 1;
+			break;
+		}
+		arg->count++;
+	}
+}
+
+static struct tcf_proto **multiq_find_tcf(struct Qdisc *sch, unsigned long cl)
+{
+	struct multiq_sched_data *q = qdisc_priv(sch);
+
+	if (cl)
+		return NULL;
+	return &q->filter_list;
+}
+
+static const struct Qdisc_class_ops multiq_class_ops = {
+	.graft		=	multiq_graft,
+	.leaf		=	multiq_leaf,
+	.get		=	multiq_get,
+	.put		=	multiq_put,
+	.change		=	multiq_change,
+	.delete		=	multiq_delete,
+	.walk		=	multiq_walk,
+	.tcf_chain	=	multiq_find_tcf,
+	.bind_tcf	=	multiq_bind,
+	.unbind_tcf	=	multiq_put,
+	.dump		=	multiq_dump_class,
+	.dump_stats	=	multiq_dump_class_stats,
+};
+
+static struct Qdisc_ops multiq_qdisc_ops __read_mostly = {
+	.next		=	NULL,
+	.cl_ops		=	&multiq_class_ops,
+	.id		=	"multiq",
+	.priv_size	=	sizeof(struct multiq_sched_data),
+	.enqueue	=	multiq_enqueue,
+	.dequeue	=	multiq_dequeue,
+	.requeue	=	multiq_requeue,
+	.drop		=	multiq_drop,
+	.init		=	multiq_init,
+	.reset		=	multiq_reset,
+	.destroy	=	multiq_destroy,
+	.change		=	multiq_tune,
+	.dump		=	multiq_dump,
+	.owner		=	THIS_MODULE,
+};
+
+static int __init multiq_module_init(void)
+{
+	return register_qdisc(&multiq_qdisc_ops);
+}
+
+static void __exit multiq_module_exit(void)
+{
+	unregister_qdisc(&multiq_qdisc_ops);
+}
+
+module_init(multiq_module_init)
+module_exit(multiq_module_exit)
+
+MODULE_LICENSE("GPL");



* [UPDATED] [NET-NEXT PATCH 2/2] pkt_action: add new action skbedit
  2008-08-30  7:23 [UPDATED] [NET-NEXT PATCH 1/2] pkt_sched: Add multiqueue scheduler support Jeff Kirsher
@ 2008-08-30  7:24 ` Jeff Kirsher
  2008-09-02 12:27   ` Jarek Poplawski
  2008-09-01 21:05 ` [UPDATED] [NET-NEXT PATCH 1/2] pkt_sched: Add multiqueue scheduler support Jarek Poplawski
  1 sibling, 1 reply; 12+ messages in thread
From: Jeff Kirsher @ 2008-08-30  7:24 UTC (permalink / raw)
  To: jeff; +Cc: netdev, davem, Alexander Duyck, Jeff Kirsher

From: Alexander Duyck <alexander.h.duyck@intel.com>

This new action will have the ability to change the priority and/or
queue_mapping fields on an sk_buff.

Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---

 include/linux/tc_act/tc_skbedit.h |   23 ++++
 include/net/tc_act/tc_skbedit.h   |   15 +++
 net/sched/Kconfig                 |   11 ++
 net/sched/Makefile                |    1 
 net/sched/act_skbedit.c           |  200 +++++++++++++++++++++++++++++++++++++
 5 files changed, 250 insertions(+), 0 deletions(-)
 create mode 100644 include/linux/tc_act/tc_skbedit.h
 create mode 100644 include/net/tc_act/tc_skbedit.h
 create mode 100644 net/sched/act_skbedit.c

diff --git a/include/linux/tc_act/tc_skbedit.h b/include/linux/tc_act/tc_skbedit.h
new file mode 100644
index 0000000..3965636
--- /dev/null
+++ b/include/linux/tc_act/tc_skbedit.h
@@ -0,0 +1,23 @@
+#ifndef __LINUX_TC_SKBEDIT_H
+#define __LINUX_TC_SKBEDIT_H
+
+#include <linux/pkt_cls.h>
+
+#define SKBEDIT_F_PRIORITY		0x1
+#define SKBEDIT_F_QUEUE_MAPPING		0x2
+
+struct tc_skbedit {
+	tc_gen;
+};
+
+enum {
+	TCA_SKBEDIT_UNSPEC,
+	TCA_SKBEDIT_TM,
+	TCA_SKBEDIT_PARMS,
+	TCA_SKBEDIT_PRIORITY,
+	TCA_SKBEDIT_QUEUE_MAPPING,
+	__TCA_SKBEDIT_MAX
+};
+#define TCA_SKBEDIT_MAX (__TCA_SKBEDIT_MAX - 1)
+
+#endif
diff --git a/include/net/tc_act/tc_skbedit.h b/include/net/tc_act/tc_skbedit.h
new file mode 100644
index 0000000..cf4fd9f
--- /dev/null
+++ b/include/net/tc_act/tc_skbedit.h
@@ -0,0 +1,15 @@
+#ifndef __NET_TC_SKBEDIT_H
+#define __NET_TC_SKBEDIT_H
+
+#include <net/act_api.h>
+
+struct tcf_skbedit {
+	struct tcf_common	common;
+	u32			flags;
+	u32     		priority;
+	u16			queue_mapping;
+};
+#define to_skbedit(pc) \
+	container_of(pc, struct tcf_skbedit, common)
+
+#endif /* __NET_TC_SKBEDIT_H */
diff --git a/net/sched/Kconfig b/net/sched/Kconfig
index efaa7a7..6767e54 100644
--- a/net/sched/Kconfig
+++ b/net/sched/Kconfig
@@ -485,6 +485,17 @@ config NET_ACT_SIMP
 	  To compile this code as a module, choose M here: the
 	  module will be called simple.
 
+config NET_ACT_SKBEDIT
+        tristate "SKB Editing"
+        depends on NET_CLS_ACT
+        ---help---
+	  Say Y here to change skb priority or queue_mapping settings.
+
+	  If unsure, say N.
+
+	  To compile this code as a module, choose M here: the
+	  module will be called skbedit.
+
 config NET_CLS_IND
 	bool "Incoming device classification"
 	depends on NET_CLS_U32 || NET_CLS_FW
diff --git a/net/sched/Makefile b/net/sched/Makefile
index 3d9b953..e60c992 100644
--- a/net/sched/Makefile
+++ b/net/sched/Makefile
@@ -14,6 +14,7 @@ obj-$(CONFIG_NET_ACT_IPT)	+= act_ipt.o
 obj-$(CONFIG_NET_ACT_NAT)	+= act_nat.o
 obj-$(CONFIG_NET_ACT_PEDIT)	+= act_pedit.o
 obj-$(CONFIG_NET_ACT_SIMP)	+= act_simple.o
+obj-$(CONFIG_NET_ACT_SKBEDIT)	+= act_skbedit.o
 obj-$(CONFIG_NET_SCH_FIFO)	+= sch_fifo.o
 obj-$(CONFIG_NET_SCH_CBQ)	+= sch_cbq.o
 obj-$(CONFIG_NET_SCH_HTB)	+= sch_htb.o
diff --git a/net/sched/act_skbedit.c b/net/sched/act_skbedit.c
new file mode 100644
index 0000000..0bc4616
--- /dev/null
+++ b/net/sched/act_skbedit.c
@@ -0,0 +1,200 @@
+/*
+ * net/sched/act_skbedit.c	SKB Editing
+ *
+ *		This program is free software; you can redistribute it and/or
+ *		modify it under the terms of the GNU General Public License
+ *		as published by the Free Software Foundation; either version
+ *		2 of the License, or (at your option) any later version.
+ *
+ * Authors: Alexander Duyck <alexander.h.duyck@intel.com>
+ *
+ * Original Authors:	Jamal Hadi Salim (2005-8) (act_simple)
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/skbuff.h>
+#include <linux/rtnetlink.h>
+#include <net/netlink.h>
+#include <net/pkt_sched.h>
+
+#define TCA_ACT_SKBEDIT 11
+
+#include <linux/tc_act/tc_skbedit.h>
+#include <net/tc_act/tc_skbedit.h>
+
+#define SKBEDIT_TAB_MASK     15
+static struct tcf_common *tcf_skbedit_ht[SKBEDIT_TAB_MASK + 1];
+static u32 skbedit_idx_gen;
+static DEFINE_RWLOCK(skbedit_lock);
+
+static struct tcf_hashinfo skbedit_hash_info = {
+	.htab	=	tcf_skbedit_ht,
+	.hmask	=	SKBEDIT_TAB_MASK,
+	.lock	=	&skbedit_lock,
+};
+
+static int tcf_skbedit(struct sk_buff *skb, struct tc_action *a,
+		       struct tcf_result *res)
+{
+	struct tcf_skbedit *d = a->priv;
+
+	spin_lock(&d->tcf_lock);
+	d->tcf_tm.lastuse = jiffies;
+	d->tcf_bstats.bytes += qdisc_pkt_len(skb);
+	d->tcf_bstats.packets++;
+
+	if (d->flags & SKBEDIT_F_PRIORITY)
+		skb->priority = d->priority;
+	if (d->flags & SKBEDIT_F_QUEUE_MAPPING &&
+	    skb->dev->real_num_tx_queues > d->queue_mapping)
+		skb_set_queue_mapping(skb, d->queue_mapping);
+
+	spin_unlock(&d->tcf_lock);
+	return d->tcf_action;
+}
+
+static const struct nla_policy skbedit_policy[TCA_SKBEDIT_MAX + 1] = {
+	[TCA_SKBEDIT_PARMS]		= { .len = sizeof(struct tc_skbedit) },
+	[TCA_SKBEDIT_PRIORITY]		= { .len = sizeof(u32) },
+	[TCA_SKBEDIT_QUEUE_MAPPING]	= { .len = sizeof(u16) },
+};
+
+static int tcf_skbedit_init(struct nlattr *nla, struct nlattr *est,
+			 struct tc_action *a, int ovr, int bind)
+{
+	struct nlattr *tb[TCA_SKBEDIT_MAX + 1];
+	struct tc_skbedit *parm;
+	struct tcf_skbedit *d;
+	struct tcf_common *pc;
+	u32 flags = 0, *priority = NULL;
+	u16 *queue_mapping = NULL;
+	int ret = 0, err;
+
+	if (nla == NULL)
+		return -EINVAL;
+
+	err = nla_parse_nested(tb, TCA_SKBEDIT_MAX, nla, skbedit_policy);
+	if (err < 0)
+		return err;
+
+	if (tb[TCA_SKBEDIT_PARMS] == NULL)
+		return -EINVAL;
+
+	if (tb[TCA_SKBEDIT_PRIORITY] != NULL) {
+		flags |= SKBEDIT_F_PRIORITY;
+		priority = nla_data(tb[TCA_SKBEDIT_PRIORITY]);
+	}
+
+	if (tb[TCA_SKBEDIT_QUEUE_MAPPING] != NULL) {
+		flags |= SKBEDIT_F_QUEUE_MAPPING;
+		queue_mapping = nla_data(tb[TCA_SKBEDIT_QUEUE_MAPPING]);
+	}
+	if (!flags)
+		return -EINVAL;
+
+	parm = nla_data(tb[TCA_SKBEDIT_PARMS]);
+
+	pc = tcf_hash_check(parm->index, a, bind, &skbedit_hash_info);
+	if (!pc) {
+		pc = tcf_hash_create(parm->index, est, a, sizeof(*d), bind,
+				     &skbedit_idx_gen, &skbedit_hash_info);
+		if (unlikely(!pc))
+			return -ENOMEM;
+
+		d = to_skbedit(pc);
+		ret = ACT_P_CREATED;
+	} else {
+		d = to_skbedit(pc);
+		if (!ovr) {
+			tcf_hash_release(pc, bind, &skbedit_hash_info);
+			return -EEXIST;
+		}
+	}
+
+	spin_lock_bh(&d->tcf_lock);
+
+	d->flags = flags;
+	if (flags & SKBEDIT_F_PRIORITY)
+		d->priority = *priority;
+	if (flags & SKBEDIT_F_QUEUE_MAPPING)
+		d->queue_mapping = *queue_mapping;
+	d->tcf_action = parm->action;
+
+	spin_unlock_bh(&d->tcf_lock);
+
+	if (ret == ACT_P_CREATED)
+		tcf_hash_insert(pc, &skbedit_hash_info);
+	return ret;
+}
+
+static inline int tcf_skbedit_cleanup(struct tc_action *a, int bind)
+{
+	struct tcf_skbedit *d = a->priv;
+
+	if (d)
+		return tcf_hash_release(&d->common, bind, &skbedit_hash_info);
+	return 0;
+}
+
+static inline int tcf_skbedit_dump(struct sk_buff *skb, struct tc_action *a,
+				int bind, int ref)
+{
+	unsigned char *b = skb_tail_pointer(skb);
+	struct tcf_skbedit *d = a->priv;
+	struct tc_skbedit opt;
+	struct tcf_t t;
+
+	opt.index = d->tcf_index;
+	opt.refcnt = d->tcf_refcnt - ref;
+	opt.bindcnt = d->tcf_bindcnt - bind;
+	opt.action = d->tcf_action;
+	NLA_PUT(skb, TCA_SKBEDIT_PARMS, sizeof(opt), &opt);
+	if (d->flags & SKBEDIT_F_PRIORITY)
+		NLA_PUT(skb, TCA_SKBEDIT_PRIORITY, sizeof(d->priority),
+			&d->priority);
+	if (d->flags & SKBEDIT_F_QUEUE_MAPPING)
+		NLA_PUT(skb, TCA_SKBEDIT_QUEUE_MAPPING,
+			sizeof(d->queue_mapping), &d->queue_mapping);
+	t.install = jiffies_to_clock_t(jiffies - d->tcf_tm.install);
+	t.lastuse = jiffies_to_clock_t(jiffies - d->tcf_tm.lastuse);
+	t.expires = jiffies_to_clock_t(d->tcf_tm.expires);
+	NLA_PUT(skb, TCA_SKBEDIT_TM, sizeof(t), &t);
+	return skb->len;
+
+nla_put_failure:
+	nlmsg_trim(skb, b);
+	return -1;
+}
+
+static struct tc_action_ops act_skbedit_ops = {
+	.kind		=	"skbedit",
+	.hinfo		=	&skbedit_hash_info,
+	.type		=	TCA_ACT_SKBEDIT,
+	.capab		=	TCA_CAP_NONE,
+	.owner		=	THIS_MODULE,
+	.act		=	tcf_skbedit,
+	.dump		=	tcf_skbedit_dump,
+	.cleanup	=	tcf_skbedit_cleanup,
+	.init		=	tcf_skbedit_init,
+	.walk		=	tcf_generic_walker,
+};
+
+MODULE_AUTHOR("Alexander Duyck(2008)");
+MODULE_DESCRIPTION("SKB Editing");
+MODULE_LICENSE("GPL");
+
+static int __init skbedit_init_module(void)
+{
+	return tcf_register_action(&act_skbedit_ops);
+}
+
+static void __exit skbedit_cleanup_module(void)
+{
+	tcf_unregister_action(&act_skbedit_ops);
+}
+
+module_init(skbedit_init_module);
+module_exit(skbedit_cleanup_module);



* Re: [UPDATED] [NET-NEXT PATCH 1/2] pkt_sched: Add multiqueue scheduler support
  2008-08-30  7:23 [UPDATED] [NET-NEXT PATCH 1/2] pkt_sched: Add multiqueue scheduler support Jeff Kirsher
  2008-08-30  7:24 ` [UPDATED] [NET-NEXT PATCH 2/2] pkt_action: add new action skbedit Jeff Kirsher
@ 2008-09-01 21:05 ` Jarek Poplawski
  2008-09-01 22:49   ` Alexander Duyck
  1 sibling, 1 reply; 12+ messages in thread
From: Jarek Poplawski @ 2008-09-01 21:05 UTC (permalink / raw)
  To: Jeff Kirsher; +Cc: jeff, netdev, davem, Alexander Duyck

Jeff Kirsher wrote, On 08/30/2008 09:23 AM:

> From: Alexander Duyck <alexander.h.duyck@intel.com>
> 
> This patch is intended to add a qdisc to support the new tx multiqueue
> architecture by providing a band for each hardware queue.  By doing
> this it is possible to support a different qdisc per physical hardware
> queue.
> 
> This qdisc uses skb->queue_mapping to select which band to place the
> traffic on.  It then uses round robin, with a check to see whether the
> subqueue is stopped, to determine which band to dequeue the packet from.

Mostly looks OK to me, but a few (late) doubts below:

> 
> Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
> Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
> ---
> 
>  include/linux/pkt_sched.h |    6 +
>  net/sched/Kconfig         |    9 +
>  net/sched/Makefile        |    1 
>  net/sched/sch_multiq.c    |  470 +++++++++++++++++++++++++++++++++++++++++++++
>  4 files changed, 486 insertions(+), 0 deletions(-)
>  create mode 100644 net/sched/sch_multiq.c
> 
> diff --git a/include/linux/pkt_sched.h b/include/linux/pkt_sched.h
> index e5de421..7fbc952 100644
> --- a/include/linux/pkt_sched.h
> +++ b/include/linux/pkt_sched.h
> @@ -123,6 +123,12 @@ struct tc_prio_qopt
>  	__u8	priomap[TC_PRIO_MAX+1];	/* Map: logical priority -> PRIO band */
>  };
>  
> +/* MULTIQ section */
> +
> +struct tc_multiq_qopt {
> +	int	bands;			/* Number of bands */

Probably __u16 or __u32 would look better here.
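
For example, a minimal sketch of the kind of change I mean (untested):

	struct tc_multiq_qopt {
		__u16	bands;		/* Number of bands */
	};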

> +};
> +
>  /* TBF section */
>  
>  struct tc_tbf_qopt
> diff --git a/net/sched/Kconfig b/net/sched/Kconfig
> index 9437b27..efaa7a7 100644
> --- a/net/sched/Kconfig
> +++ b/net/sched/Kconfig
> @@ -106,6 +106,15 @@ config NET_SCH_PRIO
>  	  To compile this code as a module, choose M here: the
>  	  module will be called sch_prio.
>  
> +config NET_SCH_MULTIQ
> +	tristate "Hardware Multiqueue-aware Multi Band Queuing (MULTIQ)"
> +	---help---
> +	  Say Y here if you want to use an n-band queue packet scheduler
> +	  to support devices that have multiple hardware transmit queues.
> +
> +	  To compile this code as a module, choose M here: the
> +	  module will be called sch_multiq.
> +

It would be nice to bring back a few lines about MULTIQ(RR) to
Documentation/networking/multiqueue.txt and mention this here.

>  config NET_SCH_RED
>  	tristate "Random Early Detection (RED)"
>  	---help---
> diff --git a/net/sched/Makefile b/net/sched/Makefile
> index 1d2b0f7..3d9b953 100644
> --- a/net/sched/Makefile
> +++ b/net/sched/Makefile
> @@ -26,6 +26,7 @@ obj-$(CONFIG_NET_SCH_SFQ)	+= sch_sfq.o
>  obj-$(CONFIG_NET_SCH_TBF)	+= sch_tbf.o
>  obj-$(CONFIG_NET_SCH_TEQL)	+= sch_teql.o
>  obj-$(CONFIG_NET_SCH_PRIO)	+= sch_prio.o
> +obj-$(CONFIG_NET_SCH_MULTIQ)	+= sch_multiq.o
>  obj-$(CONFIG_NET_SCH_ATM)	+= sch_atm.o
>  obj-$(CONFIG_NET_SCH_NETEM)	+= sch_netem.o
>  obj-$(CONFIG_NET_CLS_U32)	+= cls_u32.o
> diff --git a/net/sched/sch_multiq.c b/net/sched/sch_multiq.c
> new file mode 100644
> index 0000000..708dd5c
> --- /dev/null
> +++ b/net/sched/sch_multiq.c
> @@ -0,0 +1,470 @@
> +/*
> + * net/sched/sch_multiq.c
> + * 		This qdisc is based off of the rr qdisc and is meant to
> + * 		prevent head-of-line blocking on devices that have multiple
> + * 		hardware queues.
> + *
> + *		This program is free software; you can redistribute it and/or
> + *		modify it under the terms of the GNU General Public License
> + *		as published by the Free Software Foundation; either version
> + *		2 of the License, or (at your option) any later version.
> + *
> + * Authors:	Alexander Duyck <alexander.h.duyck@intel.com>
> + */
> +
> +#include <linux/module.h>
> +#include <linux/types.h>
> +#include <linux/kernel.h>
> +#include <linux/string.h>
> +#include <linux/errno.h>
> +#include <linux/skbuff.h>
> +#include <net/netlink.h>
> +#include <net/pkt_sched.h>
> +
> +
> +struct multiq_sched_data {
> +	int bands;
> +	int curband;

unsigned etc?

> +	struct tcf_proto *filter_list;
> +	struct Qdisc **queues;
> +

A spurious line.

> +};

...
> +static void
> +multiq_reset(struct Qdisc *sch)
> +{
> +	int band;
> +	struct multiq_sched_data *q = qdisc_priv(sch);
> +
> +	for (band = 0; band < q->bands; band++)
> +		qdisc_reset(q->queues[band]);
> +	sch->q.qlen = 0;

  +	q->curband = 0; ?

...
> +static int multiq_tune(struct Qdisc *sch, struct nlattr *opt)
> +{
> +	struct multiq_sched_data *q = qdisc_priv(sch);
> +	struct tc_multiq_qopt *qopt;
> +	struct Qdisc **queues;
> +	int i;
> +
> +	if (sch->parent != TC_H_ROOT)
> +		return -EINVAL;

Is it necessary?

> +	if (!netif_is_multiqueue(qdisc_dev(sch)))
> +		return -EINVAL;
> +	if (nla_len(opt) < sizeof(*qopt))
> +		return -EINVAL;
> +
> +	qopt = nla_data(opt);
> +
> +	qopt->bands = qdisc_dev(sch)->real_num_tx_queues;
> +
> +	queues = kzalloc(sizeof(struct Qdisc *)*qopt->bands, GFP_KERNEL);

kcalloc()?
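
I.e., something like this (the same allocation, with the multiply
overflow-checked for free):

	queues = kcalloc(qopt->bands, sizeof(struct Qdisc *), GFP_KERNEL);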

...
> +static int multiq_dump(struct Qdisc *sch, struct sk_buff *skb)
> +{
> +	struct multiq_sched_data *q = qdisc_priv(sch);
> +	unsigned char *b = skb_tail_pointer(skb);
> +	struct nlattr *nest;
> +	struct tc_multiq_qopt opt;
> +
> +	opt.bands = q->bands;
> +
> +	nest = nla_nest_compat_start(skb, TCA_OPTIONS, sizeof(opt), &opt);

http://marc.info/?l=linux-netdev&m=121993231608269&w=2

> +	if (nest == NULL)
> +		goto nla_put_failure;
> +	nla_nest_compat_end(skb, nest);
...

Jarek P.


* Re: [UPDATED] [NET-NEXT PATCH 1/2] pkt_sched: Add multiqueue scheduler support
  2008-09-01 21:05 ` [UPDATED] [NET-NEXT PATCH 1/2] pkt_sched: Add multiqueue scheduler support Jarek Poplawski
@ 2008-09-01 22:49   ` Alexander Duyck
  2008-09-02  5:54     ` Jarek Poplawski
  0 siblings, 1 reply; 12+ messages in thread
From: Alexander Duyck @ 2008-09-01 22:49 UTC (permalink / raw)
  To: Jarek Poplawski; +Cc: Jeff Kirsher, jeff, netdev, davem, Alexander Duyck

On Mon, 2008-09-01 at 23:05 +0200, Jarek Poplawski wrote:

> Mostly looks OK to me, but a few (late) doubts below:

I agree with most of your suggestions, with the following exceptions.
> ...
> > +static int multiq_tune(struct Qdisc *sch, struct nlattr *opt)
> > +{
> > +	struct multiq_sched_data *q = qdisc_priv(sch);
> > +	struct tc_multiq_qopt *qopt;
> > +	struct Qdisc **queues;
> > +	int i;
> > +
> > +	if (sch->parent != TC_H_ROOT)
> > +		return -EINVAL;
> 
> Is it necessary?
> 
I think so.  Basically I want to have this qdisc as the root for all
other qdiscs because the hardware queue decision needs to be made as
soon as possible in order to avoid any head of line blocking issues.
This way you don't end up with multiple qdiscs fighting over hardware
queues.

> > +static int multiq_dump(struct Qdisc *sch, struct sk_buff *skb)
> > +{
> > +	struct multiq_sched_data *q = qdisc_priv(sch);
> > +	unsigned char *b = skb_tail_pointer(skb);
> > +	struct nlattr *nest;
> > +	struct tc_multiq_qopt opt;
> > +
> > +	opt.bands = q->bands;
> > +
> > +	nest = nla_nest_compat_start(skb, TCA_OPTIONS, sizeof(opt), &opt);
> 
> http://marc.info/?l=linux-netdev&m=121993231608269&w=2

I can dump the whole nested_compat setup and replace it all with an
nla_put, because it is inefficient to waste the space on an empty nested
attribute that isn't needed.  I think this was just a holdover from
prio/rr anyway.
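
Something along these lines, untested, just to sketch what I mean:

static int multiq_dump(struct Qdisc *sch, struct sk_buff *skb)
{
	struct multiq_sched_data *q = qdisc_priv(sch);
	unsigned char *b = skb_tail_pointer(skb);
	struct tc_multiq_qopt opt;

	opt.bands = q->bands;

	/* plain TCA_OPTIONS payload, no nested attribute */
	NLA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt);

	return skb->len;

nla_put_failure:
	nlmsg_trim(skb, b);
	return -1;
}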

> 
> > +	if (nest == NULL)
> > +		goto nla_put_failure;
> > +	nla_nest_compat_end(skb, nest);
> ...
> 
> Jarek P.

Thanks,

Alex




* Re: [UPDATED] [NET-NEXT PATCH 1/2] pkt_sched: Add multiqueue scheduler support
  2008-09-01 22:49   ` Alexander Duyck
@ 2008-09-02  5:54     ` Jarek Poplawski
  2008-09-02  7:52       ` Jarek Poplawski
  0 siblings, 1 reply; 12+ messages in thread
From: Jarek Poplawski @ 2008-09-02  5:54 UTC (permalink / raw)
  To: Alexander Duyck; +Cc: Jeff Kirsher, jeff, netdev, davem, Alexander Duyck

On Mon, Sep 01, 2008 at 03:49:14PM -0700, Alexander Duyck wrote:
> On Mon, 2008-09-01 at 23:05 +0200, Jarek Poplawski wrote:
> 
> > Mostly looks OK to me, but a few (late) doubts below:
> 
> I agree with most of your suggestions, with the following exceptions.
> > ...
> > > +static int multiq_tune(struct Qdisc *sch, struct nlattr *opt)
> > > +{
> > > +	struct multiq_sched_data *q = qdisc_priv(sch);
> > > +	struct tc_multiq_qopt *qopt;
> > > +	struct Qdisc **queues;
> > > +	int i;
> > > +
> > > +	if (sch->parent != TC_H_ROOT)
> > > +		return -EINVAL;
> > 
> > Is it necessary?
> > 
> I think so.  Basically I want to have this qdisc as the root for all
> other qdiscs because the hardware queue decision needs to be made as
> soon as possible in order to avoid any head of line blocking issues.
> This way you don't end up with multiple qdiscs fighting over hardware
> queues.

OK, but I wonder if it's not enough to treat this as a recommendation?
Actually, since dequeuing is under the common lock here, the main
difference seems to be this checking for subqueue_stopped could happen
a bit earlier, but this should be safe (a subqueue can't get more
packets in the meantime). So maybe I'm missing something, but this looks
blocking-safe even when used as prio's leaf.

Thanks,
Jarek P.


* Re: [UPDATED] [NET-NEXT PATCH 1/2] pkt_sched: Add multiqueue scheduler support
  2008-09-02  5:54     ` Jarek Poplawski
@ 2008-09-02  7:52       ` Jarek Poplawski
  2008-09-02 17:18         ` Duyck, Alexander H
  0 siblings, 1 reply; 12+ messages in thread
From: Jarek Poplawski @ 2008-09-02  7:52 UTC (permalink / raw)
  To: Alexander Duyck; +Cc: Jeff Kirsher, jeff, netdev, davem, Alexander Duyck

On Tue, Sep 02, 2008 at 05:54:11AM +0000, Jarek Poplawski wrote:
...
> OK, but I wonder if it's not enough to treat this as a recommendation?
> Actually, since dequeuing is under the common lock here, the main
> difference seems to be this checking for subqueue_stopped could happen
> a bit earlier,

Hmm.., actually a bit later... Then this should be a bit more exact?!
Anyway, still looks safe to me.

Jarek P.


* Re: [UPDATED] [NET-NEXT PATCH 2/2] pkt_action: add new action skbedit
  2008-08-30  7:24 ` [UPDATED] [NET-NEXT PATCH 2/2] pkt_action: add new action skbedit Jeff Kirsher
@ 2008-09-02 12:27   ` Jarek Poplawski
  0 siblings, 0 replies; 12+ messages in thread
From: Jarek Poplawski @ 2008-09-02 12:27 UTC (permalink / raw)
  To: Jeff Kirsher; +Cc: jeff, netdev, davem, Alexander Duyck

On 30-08-2008 09:24, Jeff Kirsher wrote:
> From: Alexander Duyck <alexander.h.duyck@intel.com>
> 
> This new action will have the ability to change the priority and/or
> queue_mapping fields on an sk_buff.
> 
> Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
> Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>

Two cosmetic suggestions:

Probably include/linux/tc_act/Kbuild should be updated too.

>  include/linux/tc_act/tc_skbedit.h |   23 ++++
>  include/net/tc_act/tc_skbedit.h   |   15 +++
>  net/sched/Kconfig                 |   11 ++
>  net/sched/Makefile                |    1 
>  net/sched/act_skbedit.c           |  200 +++++++++++++++++++++++++++++++++++++
...
> diff --git a/net/sched/act_skbedit.c b/net/sched/act_skbedit.c
> new file mode 100644
> index 0000000..0bc4616
> --- /dev/null
> +++ b/net/sched/act_skbedit.c
...
> +#include <net/pkt_sched.h>
> +
> +#define TCA_ACT_SKBEDIT 11

Doesn't matter, but most of the others (except act_simple) keep this TCA_
in include/linux/tc_act/tc_xyz.h.
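
I.e., the define could move next to the rest of the skbedit bits,
something like:

	/* in include/linux/tc_act/tc_skbedit.h */
	#define TCA_ACT_SKBEDIT	11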

Otherwise it looks OK to me.

Jarek P.


* RE: [UPDATED] [NET-NEXT PATCH 1/2] pkt_sched: Add multiqueue scheduler support
  2008-09-02  7:52       ` Jarek Poplawski
@ 2008-09-02 17:18         ` Duyck, Alexander H
  2008-09-02 20:09           ` Jarek Poplawski
  0 siblings, 1 reply; 12+ messages in thread
From: Duyck, Alexander H @ 2008-09-02 17:18 UTC (permalink / raw)
  To: Jarek Poplawski, Alexander Duyck
  Cc: Kirsher, Jeffrey T, jeff@garzik.org, netdev@vger.kernel.org,
	davem@davemloft.net

Jarek Poplawski wrote:
> On Tue, Sep 02, 2008 at 05:54:11AM +0000, Jarek Poplawski wrote:
> ...
>> OK, but I wonder if it's not enough to treat this as a
>> recommendation? Actually, since dequeuing is under the common lock
>> here, the main difference seems to be this checking for
>> subqueue_stopped could happen a bit earlier,
>
> Hmm.., actually a bit later... Then this should be a bit more exact?!
> Anyway, still looks safe to me.
>
> Jarek P.

Let me give an example of how this can go wrong.  Let's say we use
multiq as a leaf for each band in prio, and two different bands have a
packet for hw queue 1.  If prio band 0 tries to pull and the leaf finds
that queue 1 is stopped, then dequeue returns NULL.  A tx interrupt
fires and the driver then wakes queue 1, since space is available.  Prio
then pulls from band 1 and enqueues that packet on hw queue 1.  The end
result is the lower priority packet slipping in ahead of the higher
priority packet in the hardware queue.
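
If it helps, here is a toy userspace sketch of exactly that sequence
(hypothetical code, not from the patch; it only illustrates the
ordering):

#include <stdio.h>
#include <stdbool.h>

static bool queue1_stopped = true;	/* stopped while prio tries band 0 */

/* multiq leaf: hand out the band's packet only if hw queue 1 is awake */
static int leaf_dequeue(int prio_band)
{
	if (queue1_stopped)
		return -1;		/* dequeue returns NULL */
	return prio_band;		/* packet tagged with its prio band */
}

int main(void)
{
	int order[2], n = 0, pkt;

	pkt = leaf_dequeue(0);		/* band 0 first: queue 1 stopped */
	if (pkt < 0)
		printf("band 0: NULL (hw queue 1 stopped)\n");

	queue1_stopped = false;		/* tx interrupt wakes queue 1 */

	pkt = leaf_dequeue(1);		/* prio falls through to band 1 */
	if (pkt >= 0)
		order[n++] = pkt;

	pkt = leaf_dequeue(0);		/* band 0 only wins on the next pass */
	if (pkt >= 0)
		order[n++] = pkt;

	/* prints "hw queue 1 got: band 1 then band 0" -- the inversion */
	printf("hw queue 1 got: band %d then band %d\n", order[0], order[1]);
	return 0;
}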

The advantage of making this qdisc the root is that you then have
exactly one qdisc band per hardware queue.  You can then place whatever
qdiscs you want on each of the bands, and the behavior will be
consistent per queue.

Thanks,

Alex


* Re: [UPDATED] [NET-NEXT PATCH 1/2] pkt_sched: Add multiqueue scheduler support
  2008-09-02 17:18         ` Duyck, Alexander H
@ 2008-09-02 20:09           ` Jarek Poplawski
  2008-09-02 20:53             ` Alexander Duyck
  0 siblings, 1 reply; 12+ messages in thread
From: Jarek Poplawski @ 2008-09-02 20:09 UTC (permalink / raw)
  To: Duyck, Alexander H
  Cc: Alexander Duyck, Kirsher, Jeffrey T, jeff@garzik.org,
	netdev@vger.kernel.org, davem@davemloft.net

On Tue, Sep 02, 2008 at 10:18:36AM -0700, Duyck, Alexander H wrote:
> Jarek Poplawski wrote:
> > On Tue, Sep 02, 2008 at 05:54:11AM +0000, Jarek Poplawski wrote:
> > ...
> >> OK, but I wonder if it's not enough to treat this as a
> >> recommendation? Actually, since dequeuing is under the common lock
> >> here, the main difference seems to be this checking for
> >> subqueue_stopped could happen a bit earlier,
> >
> > Hmm.., actually a bit later... Then this should be a bit more exact?!
> > Anyway, still looks safe to me.
> >
> > Jarek P.
> 
> Let me give an example of how this can go wrong.  Let's say we use
> multiq as a leaf for each band in prio, and two different bands have a
> packet for hw queue 1.  If prio band 0 tries to pull and the leaf finds
> that queue 1 is stopped, then dequeue returns NULL.  A tx interrupt
> fires and the driver then wakes queue 1, since space is available.  Prio
> then pulls from band 1 and enqueues that packet on hw queue 1.  The end
> result is the lower priority packet slipping in ahead of the higher
> priority packet in the hardware queue.
> 
> The advantage of making this qdisc the root is that you then have
> exactly one qdisc band per hardware queue.  You can then place whatever
> qdiscs you want on each of the bands, and the behavior will be
> consistent per queue.
> 

Right. But since this doesn't cause any additional blocking, I'm not
sure there is a reason to forbid it. IMHO documenting this could be
enough, and allowing it could be useful even for testing. But, of
course, you are the author, so I won't insist on this.

Thanks,
Jarek P.


* Re: [UPDATED] [NET-NEXT PATCH 1/2] pkt_sched: Add multiqueue scheduler support
  2008-09-02 20:09           ` Jarek Poplawski
@ 2008-09-02 20:53             ` Alexander Duyck
  0 siblings, 0 replies; 12+ messages in thread
From: Alexander Duyck @ 2008-09-02 20:53 UTC (permalink / raw)
  To: Jarek Poplawski
  Cc: Alexander Duyck, Kirsher, Jeffrey T, jeff@garzik.org,
	netdev@vger.kernel.org, davem@davemloft.net

Jarek Poplawski wrote:
> Right. But since this doesn't cause any additional blocking, I'm not
> sure there is a reason to forbid it. IMHO documenting this could be
> enough, and allowing it could be useful even for testing. But, of
> course, you are the author, so I won't insist on this.
> 
> Thanks,
> Jarek P.

You are correct in that it doesn't cause any additional blocking, but it 
does break the way that the prio qdisc is supposed to work.  I figure it 
is best to keep all the qdiscs behaving the way they are supposed to, 
without my qdisc changing their behavior by acting as a leaf.

Thanks for all the useful input.

Alex





* Re: [UPDATED] [NET-NEXT PATCH 1/2] pkt_sched: Add multiqueue scheduler support
  2008-08-30  7:21 Jeff Kirsher
@ 2008-09-12  3:11 ` David Miller
  0 siblings, 0 replies; 12+ messages in thread
From: David Miller @ 2008-09-12  3:11 UTC (permalink / raw)
  To: jeffrey.t.kirsher; +Cc: jeff, netdev, alexander.h.duyck

From: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
Date: Sat, 30 Aug 2008 00:21:37 -0700

> From: Alexander Duyck <alexander.h.duyck@intel.com>
> 
> This patch is intended to add a qdisc to support the new tx multiqueue
> architecture by providing a band for each hardware queue.  By doing
> this it is possible to support a different qdisc per physical hardware
> queue.
> 
> This qdisc uses skb->queue_mapping to select which band to place the
> traffic on.  It then uses round robin, with a check to see whether the
> subqueue is stopped, to determine which band to dequeue the packet from.
> 
> Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
> Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>

Jarek provided a bunch of feedback on these changes, some of which
Alexander stated he would integrate, so I'm waiting for the next
submission of these two changes.

Just FYI.


