netdev.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCHv2 1/3] NET_SCHED: PSPacer qdisc module
@ 2007-11-26 15:50 Ryousei Takano
  2007-11-26 18:15 ` Patrick McHardy
  0 siblings, 1 reply; 7+ messages in thread
From: Ryousei Takano @ 2007-11-26 15:50 UTC (permalink / raw)
  To: netdev; +Cc: shemminger, kaber, dada1, t.kudoh

This patch includes the PSPacer (Precise Software Pacer) qdisc
module, which achieves precise transmission bandwidth control.
You can find more information at the project web page
(http://www.gridmpi.org/gridtcp.jsp).

Signed-off-by: Ryousei Takano <takano-ryousei@aist.go.jp>
---
 include/linux/pkt_sched.h |   37 ++
 net/sched/Kconfig         |    9 +
 net/sched/Makefile        |    1 +
 net/sched/sch_psp.c       |  958 +++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 1005 insertions(+), 0 deletions(-)
 create mode 100644 net/sched/sch_psp.c

diff --git a/include/linux/pkt_sched.h b/include/linux/pkt_sched.h
index 919af93..fda41cd 100644
--- a/include/linux/pkt_sched.h
+++ b/include/linux/pkt_sched.h
@@ -430,6 +430,43 @@ enum {
 
 #define TCA_ATM_MAX	(__TCA_ATM_MAX - 1)
 
+/* Precise Software Pacer section */
+
+#define TC_PSP_MAXDEPTH (8)	/* maximum depth of the class hierarchy */
+
+/* Byte-clock type used by the qdisc to schedule transmissions.
+ * NOTE(review): a signed typedef in a user-visible header is unusual for
+ * tc netlink APIs; a fixed-size type (__u64) in sch_psp.c may be better. */
+typedef long long gapclock_t;
+
+/* Class pacing modes. */
+enum {
+	MODE_NORMAL = 0,	/* no pacing: send packets as they arrive */
+	MODE_STATIC = 1,	/* pace to a statically configured rate */
+};
+
+/* Per-class options (TCA_PSP_COPT). */
+struct tc_psp_copt
+{
+	__u32	level;	/* class level in the hierarchy (0 == leaf) */
+	__u32	mode;	/* MODE_NORMAL or MODE_STATIC */
+	__u32	rate;	/* target rate; bytes/sec per sch_psp.c usage */
+};
+
+/* Qdisc-level options (TCA_PSP_QOPT). */
+struct tc_psp_qopt
+{
+	__u32	defcls;	/* default minor classid for unclassified packets */
+	__u32	rate;	/* physical rate in bytes/sec (0 = autodetect) */
+};
+
+/* Qdisc-specific statistics (gap packets injected). */
+struct tc_psp_xstats
+{
+	__u32	bytes;		/* gap packet statistics */
+	__u32	packets;
+};
+
+/* Netlink attribute types nested under TCA_OPTIONS. */
+enum
+{
+	TCA_PSP_UNSPEC,
+	TCA_PSP_COPT,
+	TCA_PSP_QOPT,
+};
+
 /* Network emulator */
 
 enum
diff --git a/net/sched/Kconfig b/net/sched/Kconfig
index 9c15c48..ec40e43 100644
--- a/net/sched/Kconfig
+++ b/net/sched/Kconfig
@@ -184,6 +184,15 @@ config NET_SCH_DSMARK
 	  To compile this code as a module, choose M here: the
 	  module will be called sch_dsmark.
 
+config NET_SCH_PSP
+	tristate "Precise Software Pacer (PSP)"
+	---help---
+	  Say Y here if you want to include the PSPacer module, which
+	  provides precise transmission bandwidth control (pacing).
+
+	  To compile this driver as a module, choose M here: the
+	  module will be called sch_psp.
+
 config NET_SCH_NETEM
 	tristate "Network emulator (NETEM)"
 	---help---
diff --git a/net/sched/Makefile b/net/sched/Makefile
index 81ecbe8..85425c2 100644
--- a/net/sched/Makefile
+++ b/net/sched/Makefile
@@ -27,6 +27,7 @@ obj-$(CONFIG_NET_SCH_TBF)	+= sch_tbf.o
 obj-$(CONFIG_NET_SCH_TEQL)	+= sch_teql.o
 obj-$(CONFIG_NET_SCH_PRIO)	+= sch_prio.o
 obj-$(CONFIG_NET_SCH_ATM)	+= sch_atm.o
+obj-$(CONFIG_NET_SCH_PSP)	+= sch_psp.o
 obj-$(CONFIG_NET_SCH_NETEM)	+= sch_netem.o
 obj-$(CONFIG_NET_CLS_U32)	+= cls_u32.o
 obj-$(CONFIG_NET_CLS_ROUTE4)	+= cls_route.o
diff --git a/net/sched/sch_psp.c b/net/sched/sch_psp.c
new file mode 100644
index 0000000..f475b50
--- /dev/null
+++ b/net/sched/sch_psp.c
@@ -0,0 +1,958 @@
+/*
+ * net/sched/sch_psp.c	PSPacer: Precise Software Pacer
+ *
+ *		Copyright (C) 2004-2007 National Institute of Advanced
+ *		Industrial Science and Technology (AIST), Japan.
+ *
+ *		This program is free software; you can redistribute it and/or
+ *		modify it under the terms of the GNU General Public License
+ *		as published by the Free Software Foundation; either version
+ *		2 of the License, or (at your option) any later version.
+ *
+ * Authors:	Ryousei Takano, <takano-ryousei@aist.go.jp>
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/netdevice.h>
+#include <linux/skbuff.h>
+#include <linux/rtnetlink.h>
+#include <linux/ethtool.h>
+#include <linux/if_arp.h>
+#include <linux/in.h>
+#include <linux/ip.h>
+#include <net/pkt_sched.h>
+
+/* PSPacer achieves precise rate regulation results, and no microscopic
+ * burst transmission which exceeds the limit is generated.
+ *
+ * The basic idea is that transmission timing can be precisely controlled,
+ * if packets are sent back-to-back at the wire rate.  PSPacer controls
+ * the packet transmission intervals by inserting additional packets,
+ * called gap packets, between adjacent packets.  The transmission interval
+ * can be controlled accurately by adjusting the number and size of the gap
+ * packets. PSPacer uses the 802.3x PAUSE frame as the gap packet.
+ *
+ * For the purpose of adjusting the gap size, this Qdisc maintains a byte
+ * clock which records the total number of bytes transmitted on the link.
+ * Each sub-class has a class-local clock which is used to decide whether
+ * to send a packet or not.  If there are no packets to send, gap packets
+ * are inserted instead.
+ *
+ * References:
+ * [1] R.Takano, T.Kudoh, Y.Kodama, M.Matsuda, H.Tezuka, and Y.Ishikawa,
+ *     "Design and Evaluation of Precise Software Pacing Mechanisms for
+ *     Fast Long-Distance Networks", PFLDnet2005.
+ * [2] http://www.gridmpi.org/gridtcp.jsp
+ */
+
+#define HW_GAP (16)	/* Preamble(8) + Inter Frame Gap(8) */
+#define FCS    (4)	/* Frame Check Sequence(4) */
+#define MIN_GAP (64)	/* Minimum size of gap packet */
+#define MIN_TARGET_RATE (1000) /* 1 KB/s (= 8 Kbps) */
+
+#define PSP_HSIZE (16)	/* class hash size; must stay a power of two */
+
+#define BIT2BYTE(n) ((n) >> 3)	/* convert a bit count to bytes */
+
+/* Per-class state.  A class lives in the qdisc's class hash and, when it
+ * is a leaf, on either the pacing list (MODE_STATIC) or the normal list
+ * (MODE_NORMAL); active leaves are also chained on the drop list. */
+struct psp_class
+{
+	u32 classid;			/* class id */
+	int refcnt;			/* reference count */
+
+	struct gnet_stats_basic bstats;	/* basic stats */
+	struct gnet_stats_queue qstats;	/* queue stats */
+
+	int level;			/* class level in hierarchy (0 = leaf) */
+	struct psp_class *parent;	/* parent class */
+	struct list_head sibling;	/* sibling classes */
+	struct list_head children;	/* child classes */
+
+	struct Qdisc *qdisc;		/* leaf qdisc */
+
+	struct tcf_proto *filter_list;	/* filter list */
+	int filter_cnt;			/* filter count */
+
+	struct list_head hlist;		/* hash list */
+	struct list_head dlist;		/* drop list */
+	struct list_head plist;		/* normal/pacing class qdisc list */
+
+	int activity;			/* activity flag */
+#define FLAG_ACTIVE (0x00000001)	/*  this class has packets or not */
+#define FLAG_DMARK  (0x00000002)	/*  reset mark */
+	int mode;			/* normal/pacing */
+	u32 rate;			/* current target rate */
+	u32 max_rate;			/* maximum target rate */
+	u32 allocated_rate;		/* allocated rate to children */
+	int gapsize;			/* current gapsize */
+	gapclock_t clock;		/* class local clock */
+	long qtime;			/* sender-side queuing time (us) */
+};
+
+/* Qdisc-private state, obtained via qdisc_priv(sch). */
+struct psp_sched_data
+{
+	int defcls;				/* default class id */
+	struct list_head root;			/* root class list */
+	struct list_head hash[PSP_HSIZE];	/* class hash */
+	struct list_head drop_list;		/* active leaf class list (for
+						   dropping) */
+	struct list_head pacing_list;		/* gap leaf class list (in
+						   order of the gap size) */
+	struct list_head normal_list;		/* no gap leaf class list */
+
+	struct sk_buff_head requeue;		/* requeued packet */
+
+	struct tcf_proto *filter_list;		/* filter list */
+	int filter_cnt;				/* filter count */
+
+	u32 max_rate;				/* physical rate */
+	u32 allocated_rate;			/* sum of allocated rate */
+	int mtu;				/* interface MTU size
+						   (including ethernet header) */
+	gapclock_t clock;			/* wall clock */
+
+	struct sk_buff *gap;			/* template of gap packets */
+	struct tc_psp_xstats xstats;		/* psp specific stats */
+};
+
+/* A gap packet header (struct ethhdr + h_opcode). */
+struct gaphdr {
+	unsigned char h_dest[ETH_ALEN];		/* destination eth addr */
+	unsigned char h_source[ETH_ALEN];	/* source eth addr */
+	__be16 h_proto;				/* MAC control */
+	__be16 h_opcode;			/* MAC control opcode */
+} __attribute__((packed));
+
+/* 802.3x MAC-control PAUSE frames are addressed to this reserved
+ * multicast address, so switches consume them at the first hop. */
+static const unsigned char gap_dest[ETH_ALEN] = {0x01, 0x80, 0xc2, 0x00,
+						 0x00, 0x01};
+
+
+/* Allocate the template gap packet: an 802.3x PAUSE frame with a zero
+ * pause time.  psp_dequeue() clones and trims it to the required size.
+ * Returns NULL on allocation failure.
+ */
+static struct sk_buff *alloc_gap_packet(struct Qdisc *sch, int size)
+{
+	struct sk_buff *skb;
+	struct net_device *dev = sch->dev;
+	struct gaphdr *gap;
+	int pause_time = 0;
+
+	skb = alloc_skb(size, GFP_ATOMIC);
+	if (!skb)
+		return NULL;
+
+	/* reserve the data area first, then fill it */
+	memset(skb_put(skb, size), 0xff, size);
+
+	gap = (struct gaphdr *)skb->data;
+	memcpy(gap->h_dest, gap_dest, ETH_ALEN);
+	memcpy(gap->h_source, dev->dev_addr, ETH_ALEN);
+	gap->h_proto = htons(ETH_P_PAUSE);
+	gap->h_opcode = htons(pause_time);
+
+	skb->dev = sch->dev;
+	/* skb->protocol is __be16: the original assigned the host-order
+	 * constant; convert it like eth_type_trans() does. */
+	skb->protocol = htons(ETH_P_802_3);
+	skb_reset_network_header(skb); /* For dev_queue_xmit_nit(). */
+
+	return skb;
+}
+
+/* Fold a 32-bit class handle down to a PSP_HSIZE-entry hash index. */
+static inline unsigned int psp_hash(u32 h)
+{
+	unsigned int v = h ^ (h >> 8);
+
+	v ^= v >> 4;
+	return v & (PSP_HSIZE - 1);
+}
+
+/* Look a class up by its exact handle; NULL when not present. */
+static inline struct psp_class *psp_find(u32 handle, struct Qdisc *sch)
+{
+	struct psp_sched_data *q = qdisc_priv(sch);
+	struct list_head *bucket = &q->hash[psp_hash(handle)];
+	struct psp_class *it;
+
+	list_for_each_entry(it, bucket, hlist)
+		if (it->classid == handle)
+			return it;
+	return NULL;
+}
+
+/* Map an skb to a leaf class.
+ * Order: 1) skb->priority interpreted as a classid, 2) the qdisc's
+ * filter chain (recursing into inner-class chains), 3) the default
+ * class.  Returns NULL when nothing matches; *qerr tells the caller
+ * whether that was a classification miss (NET_XMIT_BYPASS) or a
+ * filter action verdict (NET_XMIT_SUCCESS for queued/stolen).
+ */
+static struct psp_class *psp_classify(struct sk_buff *skb, struct Qdisc *sch,
+				      int *qerr)
+{
+	struct psp_sched_data *q = qdisc_priv(sch);
+	struct psp_class *cl;
+	struct tcf_result res;
+	struct tcf_proto *tcf;
+	int result;
+
+	/* fast path: skb->priority may already hold a leaf classid */
+	cl = psp_find(skb->priority, sch);
+	if (cl != NULL && cl->level == 0)
+		return cl;
+
+	*qerr = NET_XMIT_BYPASS;
+	tcf = q->filter_list;
+	while (tcf && (result = tc_classify(skb, tcf, &res)) >= 0) {
+#ifdef CONFIG_NET_CLS_ACT
+		switch (result) {
+		case TC_ACT_QUEUED:
+		case TC_ACT_STOLEN:
+			*qerr = NET_XMIT_SUCCESS;
+			/* fall through */
+		case TC_ACT_SHOT:
+			return NULL;
+		}
+#endif
+		cl = (struct psp_class *)res.class;
+		if (cl == NULL) {
+			cl = psp_find(res.classid, sch);
+			if (cl == NULL)
+				break; /* filter selected invalid classid */
+		}
+
+		if (cl->level == 0)
+			return cl; /* hit leaf class */
+
+		/* apply inner filter chain */
+		tcf = cl->filter_list;
+	}
+
+	/* classification failed, try default class */
+	cl = psp_find(TC_H_MAKE(TC_H_MAJ(sch->handle), q->defcls), sch);
+	if (cl == NULL || cl->level > 0)
+		return NULL;
+
+	return cl;
+}
+
+/* Mark a leaf class as holding packets and chain it on the drop list
+ * so psp_drop() can find it. */
+static inline void psp_activate(struct psp_sched_data *q, struct psp_class *cl)
+{
+	cl->activity |= FLAG_ACTIVE;
+	list_add_tail(&cl->dlist, &q->drop_list);
+}
+
+/* Clear the active flag and unlink from the drop list.
+ * list_del_init() keeps dlist safe to unlink again later. */
+static inline void psp_deactivate(struct psp_sched_data *q,
+				  struct psp_class *cl)
+{
+	cl->activity &= ~FLAG_ACTIVE;
+	list_del_init(&cl->dlist);
+}
+
+/* Insert a leaf class into the proper scheduling list.  MODE_NORMAL
+ * leaves go onto the normal list; MODE_STATIC leaves get a gap size
+ * computed from the target rate and are kept on the pacing list in
+ * ascending gap-size order. */
+static void add_leaf_class(struct psp_sched_data *q, struct psp_class *cl)
+{
+	struct psp_class *p;
+	int mtu = q->mtu + FCS;
+
+	/* chain normal/pacing class list */
+	switch (cl->mode) {
+	case MODE_NORMAL:
+		list_add_tail(&cl->plist, &q->normal_list);
+		break;
+	case MODE_STATIC:
+		/* gap = mtu * (max_rate/rate - 1), scaled by 1000 to
+		 * avoid 32-bit overflow.  cl->rate >= MIN_TARGET_RATE
+		 * (enforced in psp_change_class), so the divisor is
+		 * nonzero. */
+		cl->gapsize = (((q->max_rate / 1000) * mtu)
+			       / (cl->rate / 1000)) - mtu;
+		/* subtract the wire overhead of the gap packets */
+		cl->gapsize -= (HW_GAP + FCS) * DIV_ROUND_UP(q->max_rate,
+							     cl->rate);
+		cl->gapsize = max_t(int, cl->gapsize, MIN_GAP);
+		cl->activity |= FLAG_DMARK;
+		/* keep the pacing list sorted; if no larger entry is
+		 * found, &p->plist aliases the list head here (standard
+		 * list_for_each_entry exit idiom) */
+		list_for_each_entry(p, &q->pacing_list, plist) {
+			if (cl->gapsize < p->gapsize)
+				break;
+		}
+		list_add_tail(&cl->plist, &p->plist);
+		break;
+	}
+}
+
+/* Recompute the gap size to charge for one packet; short packets are
+ * charged a proportionally smaller gap.
+ * Fix: psp_classify() can return NULL (e.g. the filter setup changed
+ * between enqueue and dequeue); the original dereferenced the result
+ * unconditionally.
+ */
+static int recalc_gapsize(struct sk_buff *skb, struct Qdisc *sch)
+{
+	struct psp_sched_data *q = qdisc_priv(sch);
+	struct psp_class *cl;
+	unsigned int len = skb->len;
+	int gapsize = 0;
+	int err;
+
+	cl = psp_classify(skb, sch, &err);
+	if (cl != NULL && cl->mode == MODE_STATIC) {
+		gapsize = cl->gapsize;
+		if (len < q->mtu) /* workaround for overflow */
+			gapsize = (int)((long long)gapsize * len / q->mtu);
+	}
+
+	return max_t(int, gapsize, MIN_GAP);
+}
+
+/* Update byte clocks
+ * When a packet is sent out:
+ *     Qdisc's clock += packet length
+ *     if the class is the pacing class:
+ *         class's clock += packet length + gap length
+ */
+static void update_clocks(struct sk_buff *skb, struct Qdisc *sch,
+			  struct psp_class *cl)
+{
+	struct psp_sched_data *q = qdisc_priv(sch);
+	unsigned int len = skb->len;
+	int gaplen;
+
+	q->clock += len;
+	if (cl == NULL || cl->mode == MODE_NORMAL)
+		return;
+
+	/* pacing class */
+	gaplen = recalc_gapsize(skb, sch);
+	if (!(cl->activity & FLAG_DMARK)) {
+		cl->clock += len + gaplen;
+	} else { /* reset class clock */
+		/* FLAG_DMARK was set while the class was idle or newly
+		 * configured: restart the class clock from the wall clock */
+		cl->activity &= ~FLAG_DMARK;
+		cl->clock = q->clock + gaplen;
+	}
+}
+
+/* Lookup next target class
+ * Firstly, search the pacing class list:
+ *     If the Qdisc's clock < the class's clock then the class is selected.
+ * Secondly, search the normal class list.
+ *
+ * Finally, a gap packet is inserted, because there is not any packets
+ * to send out.  And it returns the size of the gap packet.
+ */
+static struct psp_class *lookup_next_class(struct Qdisc *sch, int *gapsize)
+{
+	struct psp_sched_data *q = qdisc_priv(sch);
+	struct psp_class *cl, *next = NULL;
+	gapclock_t diff;
+	int shortest;
+
+	/* pacing class */
+	shortest = q->mtu;
+	list_for_each_entry(cl, &q->pacing_list, plist) {
+		diff = cl->clock - q->clock;
+		if (diff > 0) {
+			/* not yet eligible; remember the smallest lead so
+			 * a gap packet is made just long enough */
+			if ((gapclock_t)shortest > diff)
+				shortest = (int)diff;
+			continue;
+		}
+		if (!(cl->activity & FLAG_ACTIVE)) {
+			/* eligible but empty: resynchronize its clock on
+			 * the next transmission (see update_clocks()) */
+			cl->activity |= FLAG_DMARK;
+			continue;
+		}
+
+		/* first eligible active class wins */
+		if (next == NULL)
+			next = cl;
+	}
+	if (next)
+		return next;
+
+	/* normal class: round-robin by rotating the served class to
+	 * the list tail */
+	list_for_each_entry(cl, &q->normal_list, plist) {
+		if (!(cl->activity & FLAG_ACTIVE))
+			continue;
+
+		list_move_tail(&cl->plist, &q->normal_list);
+		return cl;
+	}
+
+	/* gap packet */
+	*gapsize = max_t(int, shortest, sizeof(struct gaphdr));
+	return NULL;
+}
+
+/* Classify the packet and enqueue it on its leaf class's qdisc,
+ * updating class and qdisc counters on success. */
+static int psp_enqueue(struct sk_buff *skb, struct Qdisc *sch)
+{
+	struct psp_sched_data *q = qdisc_priv(sch);
+	struct psp_class *cl;
+	int err;
+
+	cl = psp_classify(skb, sch, &err);
+	if (cl == NULL) {
+		/* no class matched: drop (filter verdicts set err to
+		 * NET_XMIT_SUCCESS and are not counted as drops) */
+		if (err == NET_XMIT_BYPASS)
+			sch->qstats.drops++;
+		kfree_skb(skb);
+		return err;
+	}
+
+	err = cl->qdisc->ops->enqueue(skb, cl->qdisc);
+	if (unlikely(err != NET_XMIT_SUCCESS)) {
+		sch->qstats.drops++;
+		cl->qstats.drops++;
+		return err;
+	}
+
+	cl->bstats.packets++;
+	cl->bstats.bytes += skb->len;
+	if (!(cl->activity & FLAG_ACTIVE))
+		psp_activate(q, cl);
+
+	sch->q.qlen++;
+	sch->bstats.packets++;
+	sch->bstats.bytes += skb->len;
+	return NET_XMIT_SUCCESS;
+}
+
+/* Put a packet the driver rejected back at the head of the qdisc;
+ * psp_dequeue() hands such packets out before anything else. */
+static int psp_requeue(struct sk_buff *skb, struct Qdisc *sch)
+{
+	struct psp_sched_data *priv = qdisc_priv(sch);
+
+	sch->qstats.requeues++;
+	sch->q.qlen++;
+	__skb_queue_head(&priv->requeue, skb);
+
+	return NET_XMIT_SUCCESS;
+}
+
+/* Dequeue the next packet to transmit.  Priority order: requeued
+ * packets, then the class chosen by lookup_next_class(); when no class
+ * is eligible, a cloned-and-trimmed gap (PAUSE) packet is emitted to
+ * keep the wire busy until the next class becomes eligible. */
+static struct sk_buff *psp_dequeue(struct Qdisc *sch)
+{
+	struct sk_buff *skb = NULL;
+	struct psp_sched_data *q = qdisc_priv(sch);
+	struct psp_class *cl;
+	int gapsize;
+
+	if (sch->q.qlen == 0)
+		return NULL;
+
+	/* requeue */
+	skb = __skb_dequeue(&q->requeue);
+	if (skb != NULL) {
+		sch->q.qlen--;
+		return skb;
+	}
+
+	/* normal/pacing class */
+	cl = lookup_next_class(sch, &gapsize);
+	if (cl != NULL) {
+		skb = cl->qdisc->ops->dequeue(cl->qdisc);
+		/* NOTE(review): if the leaf unexpectedly returns NULL the
+		 * class stays FLAG_ACTIVE with an empty queue — confirm
+		 * this cannot wedge the scheduler. */
+		if (skb == NULL)
+			return NULL;
+
+		sch->q.qlen--;
+
+		goto update_clocks;
+	}
+
+	/* gap packets */
+	skb = skb_clone(q->gap, GFP_ATOMIC);
+	if (unlikely(!skb)) {
+		printk(KERN_ERR "psp: cannot clone a gap packet.\n");
+		return NULL;
+	}
+	skb_trim(skb, gapsize);
+	q->xstats.bytes += gapsize;
+	q->xstats.packets++;
+
+ update_clocks:
+	update_clocks(skb, sch, cl);
+	if (cl && cl->qdisc->q.qlen == 0)
+		psp_deactivate(q, cl);
+	return skb;
+}
+
+/* Drop one packet from the first active class whose leaf qdisc can
+ * drop; returns the dropped length, or 0 if nothing was dropped. */
+static unsigned int psp_drop(struct Qdisc *sch)
+{
+	struct psp_sched_data *q = qdisc_priv(sch);
+	struct psp_class *cl;
+	unsigned int len;
+
+	list_for_each_entry(cl, &q->drop_list, dlist) {
+		if (cl->qdisc->ops->drop != NULL
+		    && (len = cl->qdisc->ops->drop(cl->qdisc)) > 0) {
+			if (cl->qdisc->q.qlen == 0) {
+				psp_deactivate(q, cl);
+			} else {
+				/* rotate so drops are spread over classes */
+				list_move_tail(&cl->dlist, &q->drop_list);
+			}
+			cl->qstats.drops++;
+			sch->qstats.drops++;
+			sch->q.qlen--;
+			return len;
+		}
+	}
+	return 0;
+}
+
+/* Reset the qdisc: reset every leaf qdisc, purge requeued packets and
+ * zero the queue length.
+ * Fix: the requeue list was not purged, leaking its skbs while
+ * sch->q.qlen (which counts them) was reset to zero.
+ */
+static void psp_reset(struct Qdisc *sch)
+{
+	struct psp_sched_data *q = qdisc_priv(sch);
+	struct psp_class *cl;
+	int i;
+
+	for (i = 0; i < PSP_HSIZE; i++) {
+		list_for_each_entry(cl, &q->hash[i], hlist) {
+			if (cl->level == 0)
+				qdisc_reset(cl->qdisc);
+		}
+	}
+
+	__skb_queue_purge(&q->requeue);
+	sch->q.qlen = 0;
+}
+
+/* Destroy a class: return its rate allocation, destroy its leaf qdisc,
+ * free its filters, recursively destroy its children, then unlink and
+ * free it.
+ * Fix: the original destroyed the QDISC's filter chain
+ * (q->filter_list) here for every class, leaking each class's own
+ * filters and double-freeing the qdisc chain when psp_destroy() runs;
+ * destroy cl->filter_list instead.
+ */
+static void psp_destroy_class(struct Qdisc *sch, struct psp_class *cl)
+{
+	struct psp_sched_data *q = qdisc_priv(sch);
+
+	if (cl->mode == MODE_STATIC) {
+		/* give the allocated bandwidth back to whoever it was
+		 * charged to */
+		if (cl->parent)
+			cl->parent->allocated_rate -= cl->rate;
+		else
+			q->allocated_rate -= cl->rate;
+	}
+	if (cl->level == 0) {
+		sch->q.qlen -= cl->qdisc->q.qlen;
+		qdisc_destroy(cl->qdisc);
+	}
+
+	tcf_destroy_chain(cl->filter_list);
+
+	while (!list_empty(&cl->children))
+		psp_destroy_class(sch, list_entry(cl->children.next,
+						  struct psp_class, sibling));
+
+	list_del(&cl->hlist);
+	list_del(&cl->sibling);
+	psp_deactivate(q, cl);
+	if (cl->level == 0)
+		list_del(&cl->plist);
+	kfree(cl);
+}
+
+/* Tear down the whole qdisc: qdisc-level filters, the class tree,
+ * requeued packets and the gap packet template.
+ * NOTE(review): psp_destroy_class() must free per-class filter lists
+ * (cl->filter_list), not q->filter_list, or this double-frees — verify. */
+static void psp_destroy(struct Qdisc *sch)
+{
+	struct psp_sched_data *q = qdisc_priv(sch);
+
+	tcf_destroy_chain(q->filter_list);
+
+	while (!list_empty(&q->root))
+		psp_destroy_class(sch, list_entry(q->root.next,
+						  struct psp_class, sibling));
+	__skb_queue_purge(&q->requeue);
+
+	/* free gap packet */
+	kfree_skb(q->gap);
+}
+
+/* Change qdisc-level parameters (default class, max rate) from a
+ * nested TCA_PSP_QOPT attribute.  A zero rate leaves max_rate as-is. */
+static int psp_change_qdisc(struct Qdisc *sch, struct rtattr *opt)
+{
+	struct psp_sched_data *q = qdisc_priv(sch);
+	struct rtattr *tb[TCA_PSP_QOPT];
+	struct tc_psp_qopt *qopt;
+
+	if (!opt || rtattr_parse_nested(tb, TCA_PSP_QOPT, opt) ||
+	    tb[TCA_PSP_QOPT-1] == NULL ||
+	    RTA_PAYLOAD(tb[TCA_PSP_QOPT-1]) < sizeof(*qopt))
+		return -EINVAL;
+
+	qopt = RTA_DATA(tb[TCA_PSP_QOPT-1]);
+
+	sch_tree_lock(sch);
+	q->defcls = qopt->defcls;
+	if (qopt->rate)
+		q->max_rate = qopt->rate;
+	sch_tree_unlock(sch);
+	return 0;
+}
+
+/* Initialize the qdisc: validate the device (Ethernet only, TSO must
+ * be off since TSO bursts defeat byte-granular pacing), parse the
+ * options, allocate the gap packet template, determine the physical
+ * rate and set up all internal lists. */
+static int psp_init(struct Qdisc *sch, struct rtattr *opt)
+{
+	struct psp_sched_data *q = qdisc_priv(sch);
+	struct rtattr *tb[TCA_PSP_QOPT];
+	struct net_device *dev = sch->dev;
+	struct tc_psp_qopt *qopt;
+	struct ethtool_cmd cmd = { ETHTOOL_GSET };
+	int i;
+
+	if (dev->type != ARPHRD_ETHER) {
+		printk(KERN_ERR "psp: PSPacer only supports Ethernet " \
+		       "devices.\n");
+		return -EINVAL;
+	}
+
+#ifdef NETIF_F_TSO
+	/* check TSO support */
+	if (ethtool_op_get_tso(dev)) {
+		printk(KERN_ERR "psp: PSPacer does not support TSO.  " \
+		       "You must disable it: \"ethtool -K %s tso off\"\n",
+		       dev->name);
+		return -EINVAL;
+	}
+#endif
+
+	if (!opt || rtattr_parse_nested(tb, TCA_PSP_QOPT, opt) ||
+	    tb[TCA_PSP_QOPT-1] == NULL ||
+	    RTA_PAYLOAD(tb[TCA_PSP_QOPT-1]) < sizeof(*qopt))
+		return -EINVAL;
+
+	qopt = RTA_DATA(tb[TCA_PSP_QOPT-1]);
+
+	memset(q, 0, sizeof(*q));
+	q->defcls = qopt->defcls;
+	/* wire-level MTU: payload MTU plus the link-layer header */
+	q->mtu = dev->mtu + dev->hard_header_len;
+	q->gap = alloc_gap_packet(sch, q->mtu);
+	if (q->gap == NULL)
+		return -ENOBUFS;
+	if (qopt->rate == 0) {
+		/* set qdisc max rate.  If the kernel supports ethtool ioctl,
+		 * it sets to that value, otherwise it statically sets to
+		 * the GbE transmission rate (i.e. 125MB/s). */
+		/* NOTE: Since ethtool's {cmd.speed} specifies Mbps,
+		 * the value is converted in units of byte/sec */
+		q->max_rate = 125000000;
+
+		if (dev->ethtool_ops && dev->ethtool_ops->get_settings) {
+			if (dev->ethtool_ops->get_settings(dev, &cmd) == 0)
+				q->max_rate = BIT2BYTE(cmd.speed * 1000000);
+		}
+	} else {
+		q->max_rate = qopt->rate;
+	}
+
+	INIT_LIST_HEAD(&q->root);
+	for (i = 0; i < PSP_HSIZE; i++)
+		INIT_LIST_HEAD(q->hash + i);
+	INIT_LIST_HEAD(&q->drop_list);
+	INIT_LIST_HEAD(&q->pacing_list);
+	INIT_LIST_HEAD(&q->normal_list);
+	skb_queue_head_init(&q->requeue);
+
+	return 0;
+}
+
+/* Dump qdisc-level options (default class and rate) to netlink.
+ * Fix: on RTA_PUT failure the original trimmed the skb to its own
+ * current length (a no-op that leaves a half-written attribute);
+ * trim back to the saved tail 'b', as psp_dump_class() does.
+ */
+static int psp_dump_qdisc(struct Qdisc *sch, struct sk_buff *skb)
+{
+	struct psp_sched_data *q = qdisc_priv(sch);
+	unsigned char *b = skb_tail_pointer(skb);
+	struct rtattr *rta;
+	struct tc_psp_qopt qopt;
+
+	memset(&qopt, 0, sizeof(qopt));	/* don't leak stack to userspace */
+	qopt.defcls = q->defcls;
+	qopt.rate = q->max_rate;
+	rta = (struct rtattr *)b;
+	RTA_PUT(skb, TCA_OPTIONS, 0, NULL);
+	RTA_PUT(skb, TCA_PSP_QOPT, sizeof(qopt), &qopt);
+	rta->rta_len = skb_tail_pointer(skb) - b;
+
+	return skb->len;
+
+rtattr_failure:
+	skb_trim(skb, b - skb->data);
+	return -1;
+}
+
+/* Append the gap-packet statistics as the qdisc's app-specific stats. */
+static int psp_dump_qdisc_stats(struct Qdisc *sch, struct gnet_dump *d)
+{
+	struct psp_sched_data *priv = qdisc_priv(sch);
+
+	return gnet_stats_copy_app(d, &priv->xstats, sizeof(priv->xstats));
+}
+
+/* Dump one class: parent/handle in the tcmsg, plus TCA_PSP_COPT
+ * (level, mode, rate) and an empty TCA_PSP_QOPT nested in TCA_OPTIONS. */
+static int psp_dump_class(struct Qdisc *sch, unsigned long arg,
+			  struct sk_buff *skb, struct tcmsg *tcm)
+{
+	struct psp_class *cl = (struct psp_class *)arg;
+	unsigned char *b = skb_tail_pointer(skb);
+	struct rtattr *rta;
+	struct tc_psp_copt copt;
+
+	tcm->tcm_parent = cl->parent ? cl->parent->classid : TC_H_ROOT;
+	tcm->tcm_handle = cl->classid;
+	if (cl->level == 0) {
+		/* only leaves carry a qdisc and a queue length */
+		tcm->tcm_info = cl->qdisc->handle;
+		cl->qstats.qlen = cl->qdisc->q.qlen;
+	}
+
+	rta = (struct rtattr *)b;
+	RTA_PUT(skb, TCA_OPTIONS, 0, NULL);
+	memset(&copt, 0, sizeof(copt));
+	copt.level = cl->level;
+	copt.mode = cl->mode;
+	copt.rate = cl->max_rate;
+	RTA_PUT(skb, TCA_PSP_COPT, sizeof(copt), &copt);
+	RTA_PUT(skb, TCA_PSP_QOPT, 0, NULL);
+	rta->rta_len = skb_tail_pointer(skb) - b;
+
+	return skb->len;
+ rtattr_failure:
+	skb_trim(skb, b - skb->data);
+	return -1;
+}
+
+/* Copy the class's basic and queue statistics into the dump. */
+static int psp_dump_class_stats(struct Qdisc *sch, unsigned long arg,
+				struct gnet_dump *d)
+{
+	struct psp_class *cl = (struct psp_class *)arg;
+
+	if (gnet_stats_copy_basic(d, &cl->bstats) < 0)
+		return -1;
+	if (gnet_stats_copy_queue(d, &cl->qstats) < 0)
+		return -1;
+	return 0;
+}
+
+/* Replace a leaf class's qdisc with 'new' (default: a fresh pfifo),
+ * handing the old one back in *old.  Only leaves carry a qdisc. */
+static int psp_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
+		     struct Qdisc **old)
+{
+	struct psp_class *cl = (struct psp_class *)arg;
+
+	if (cl == NULL)
+		return -ENOENT;
+	if (cl->level != 0)
+		return -EINVAL;
+	if (new == NULL) {
+		new = qdisc_create_dflt(sch->dev, &pfifo_qdisc_ops,
+					cl->classid);
+		if (new == NULL)
+			new = &noop_qdisc;
+	}
+
+	sch_tree_lock(sch);
+	*old = xchg(&cl->qdisc, new);
+	qdisc_tree_decrease_qlen(*old, (*old)->q.qlen);
+	qdisc_reset(*old);
+	sch_tree_unlock(sch);
+	return 0;
+}
+
+/* Return the qdisc attached to a leaf class; NULL for inner classes. */
+static struct Qdisc *psp_leaf(struct Qdisc *sch, unsigned long arg)
+{
+	struct psp_class *cl = (struct psp_class *)arg;
+
+	if (cl == NULL || cl->level != 0)
+		return NULL;
+	return cl->qdisc;
+}
+
+/* Look a class up by classid, taking a reference on it. */
+static unsigned long psp_get(struct Qdisc *sch, u32 classid)
+{
+	struct psp_class *found = psp_find(classid, sch);
+
+	if (found != NULL)
+		found->refcnt++;
+	return (unsigned long)found;
+}
+
+/* Drop a reference taken by psp_get(); the class is destroyed when
+ * the last reference goes away. */
+static void psp_put(struct Qdisc *sch, unsigned long arg)
+{
+	struct psp_class *cl = (struct psp_class *)arg;
+
+	if (--cl->refcnt == 0)
+		psp_destroy_class(sch, cl);
+}
+
+/* Create a new class or change the parameters of an existing one.
+ *
+ * Fixes over the original:
+ *  - validate that the TCA_PSP_COPT attribute is present and large
+ *    enough before dereferencing it (userspace-triggerable NULL
+ *    dereference otherwise);
+ *  - the leaf-to-inner parent conversion tested the level the wrong
+ *    way round and used "while" with a loop-invariant condition
+ *    (infinite loop and repeated qdisc_destroy() for inner parents;
+ *    no-op for leaf parents), and deactivated the new class instead
+ *    of the parent — use "if (parent->level == 0)" and deactivate
+ *    the parent;
+ *  - when changing an existing child class, return its old rate to
+ *    the parent it was charged to, mirroring psp_destroy_class();
+ *  - kzalloc() instead of kmalloc()+memset(); newline on the printk.
+ */
+static int psp_change_class(struct Qdisc *sch, u32 classid, u32 parentid,
+			    struct rtattr **tca, unsigned long *arg)
+{
+	struct psp_sched_data *q = qdisc_priv(sch);
+	struct psp_class *cl = (struct psp_class *)*arg, *parent;
+	struct rtattr *opt = tca[TCA_OPTIONS-1];
+	struct rtattr *tb[TCA_PSP_QOPT];
+	struct tc_psp_copt *copt;
+	unsigned int limit;
+
+	if (opt == NULL
+	    || rtattr_parse(tb, TCA_PSP_QOPT, RTA_DATA(opt), RTA_PAYLOAD(opt))
+	    || tb[TCA_PSP_COPT - 1] == NULL
+	    || RTA_PAYLOAD(tb[TCA_PSP_COPT - 1]) < sizeof(*copt))
+		return -EINVAL;
+
+	copt = RTA_DATA(tb[TCA_PSP_COPT - 1]);
+
+	parent = (parentid == TC_H_ROOT ? NULL : psp_find(parentid, sch));
+
+	if (cl == NULL) { /* create new class */
+		struct Qdisc *new_q;
+
+		cl = kzalloc(sizeof(struct psp_class), GFP_KERNEL);
+		if (cl == NULL)
+			return -ENOBUFS;
+		cl->refcnt = 1;
+		INIT_LIST_HEAD(&cl->sibling);
+		INIT_LIST_HEAD(&cl->children);
+		INIT_LIST_HEAD(&cl->hlist);
+		INIT_LIST_HEAD(&cl->dlist);
+		INIT_LIST_HEAD(&cl->plist);
+
+		new_q = qdisc_create_dflt(sch->dev, &pfifo_qdisc_ops, classid);
+
+		sch_tree_lock(sch);
+		if (parent && parent->level == 0) {
+			/* turn the leaf parent into an inner node */
+			sch->q.qlen -= parent->qdisc->q.qlen;
+			qdisc_destroy(parent->qdisc);
+			psp_deactivate(q, parent);
+			list_del(&parent->plist);
+
+			parent->level = (parent->parent
+					 ? parent->parent->level
+					 : TC_PSP_MAXDEPTH) - 1;
+		}
+		cl->qdisc = new_q ? new_q : &noop_qdisc;
+		cl->classid = classid;
+		cl->parent = parent;
+
+		list_add_tail(&cl->hlist, q->hash + psp_hash(classid));
+		list_add_tail(&cl->sibling, (parent
+					     ? &parent->children : &q->root));
+	} else {
+		/* changing an existing class: return its old allocation
+		 * to whoever it was charged to */
+		if (cl->mode == MODE_STATIC) {
+			if (cl->parent)
+				cl->parent->allocated_rate -= cl->rate;
+			else
+				q->allocated_rate -= cl->rate;
+		}
+
+		sch_tree_lock(sch);
+	}
+
+	/* setup mode and target rate */
+	cl->mode = copt->mode;
+	if (copt->rate < MIN_TARGET_RATE)
+		copt->rate = MIN_TARGET_RATE;
+	cl->max_rate = copt->rate;
+	switch (cl->mode) {
+	case MODE_STATIC:
+		limit = (parent ? parent->allocated_rate : q->allocated_rate)
+			+ cl->max_rate;
+		if (limit > q->max_rate) {
+			printk(KERN_ERR
+			       "psp: target rate is oversubscribed!\n");
+			list_del_init(&cl->hlist);
+			psp_deactivate(q, cl);
+			if (--cl->refcnt == 0)
+				psp_destroy_class(sch, cl);
+			sch_tree_unlock(sch);
+			return -EINVAL;
+		}
+		cl->rate = cl->max_rate;
+		if (parent)
+			parent->allocated_rate += cl->rate;
+		else
+			q->allocated_rate += cl->rate;
+		break;
+	}
+
+	if (cl->level == 0) {
+		/* (re)insert the leaf into the proper scheduling list */
+		if (!list_empty(&cl->plist))
+			list_del(&cl->plist);
+		add_leaf_class(q, cl);
+	}
+	sch_tree_unlock(sch);
+	*arg = (unsigned long)cl;
+	return 0;
+}
+
+/* Return the filter chain anchor: the class's when arg names one,
+ * otherwise the qdisc-level chain. */
+static struct tcf_proto **psp_find_tcf(struct Qdisc *sch, unsigned long arg)
+{
+	struct psp_class *cl = (struct psp_class *)arg;
+	struct psp_sched_data *q = qdisc_priv(sch);
+
+	if (cl)
+		return &cl->filter_list;
+	return &q->filter_list;
+}
+
+/* A filter is being attached: bump the bind count of the target class
+ * (or of the qdisc when the classid matches no class). */
+static unsigned long psp_bind_filter(struct Qdisc *sch, unsigned long parent,
+				     u32 classid)
+{
+	struct psp_class *target = psp_find(classid, sch);
+
+	if (target == NULL) {
+		struct psp_sched_data *q = qdisc_priv(sch);
+
+		q->filter_cnt++;
+		return 0;
+	}
+
+	target->filter_cnt++;
+	return (unsigned long)target;
+}
+
+/* A filter is being detached: undo the corresponding bind count. */
+static void psp_unbind_filter(struct Qdisc *sch, unsigned long arg)
+{
+	struct psp_class *cl = (struct psp_class *)arg;
+
+	if (cl != NULL) {
+		cl->filter_cnt--;
+		return;
+	}
+
+	((struct psp_sched_data *)qdisc_priv(sch))->filter_cnt--;
+}
+
+/* Delete a class; refused while it still has children or bound
+ * filters.  The final free happens when the refcount drops to zero. */
+static int psp_delete(struct Qdisc *sch, unsigned long arg)
+{
+	struct psp_sched_data *q = qdisc_priv(sch);
+	struct psp_class *cl = (struct psp_class *)arg;
+
+	if (!list_empty(&cl->children) || cl->filter_cnt)
+		return -EBUSY;
+
+	sch_tree_lock(sch);
+
+	/* unlink from hash, drop list and scheduling list; *_init keeps
+	 * the heads safe for the later list_del in psp_destroy_class() */
+	list_del_init(&cl->hlist);
+	psp_deactivate(q, cl);
+	list_del_init(&cl->plist);
+	if (--cl->refcnt == 0)
+		psp_destroy_class(sch, cl);
+
+	sch_tree_unlock(sch);
+	return 0;
+}
+
+/* Iterate over every class in the hash, honoring the walker's
+ * skip/count/stop protocol. */
+static void psp_walk(struct Qdisc *sch, struct qdisc_walker *arg)
+{
+	struct psp_sched_data *q = qdisc_priv(sch);
+	struct psp_class *cl;
+	int i;
+
+	if (arg->stop)
+		return;
+
+	for (i = 0; i < PSP_HSIZE; i++) {
+		list_for_each_entry(cl, &q->hash[i], hlist) {
+			if (arg->count >= arg->skip
+			    && arg->fn(sch, (unsigned long)cl, arg) < 0) {
+				arg->stop = 1;
+				return;
+			}
+			arg->count++;
+		}
+	}
+}
+
+/* Class-level operations exported to the tc framework. */
+static const struct Qdisc_class_ops psp_class_ops = {
+	.graft		=	psp_graft,
+	.leaf		=	psp_leaf,
+	.get		=	psp_get,
+	.put		=	psp_put,
+	.change		=	psp_change_class,
+	.delete		=	psp_delete,
+	.walk		=	psp_walk,
+	.tcf_chain	=	psp_find_tcf,
+	.bind_tcf	=	psp_bind_filter,
+	.unbind_tcf	=	psp_unbind_filter,
+	.dump		=	psp_dump_class,
+	.dump_stats	=	psp_dump_class_stats,
+};
+
+/* Qdisc operations registered with the packet scheduler core. */
+static struct Qdisc_ops psp_qdisc_ops __read_mostly = {
+	.next		=	NULL,
+	.cl_ops		=	&psp_class_ops,
+	.id		=	"psp",
+	.priv_size	=	sizeof(struct psp_sched_data),
+	.enqueue	=	psp_enqueue,
+	.dequeue	=	psp_dequeue,
+	.requeue	=	psp_requeue,
+	.drop		=	psp_drop,
+	.init		=	psp_init,
+	.reset		=	psp_reset,
+	.destroy	=	psp_destroy,
+	.change		=	psp_change_qdisc,
+	.dump		=	psp_dump_qdisc,
+	.dump_stats	=	psp_dump_qdisc_stats,
+	.owner		=	THIS_MODULE,
+};
+
+/* Register the "psp" qdisc with the scheduler core on module load. */
+static int __init psp_module_init(void)
+{
+	return register_qdisc(&psp_qdisc_ops);
+}
+
+/* Unregister the qdisc on module unload. */
+static void __exit psp_module_exit(void)
+{
+	unregister_qdisc(&psp_qdisc_ops);
+}
+
+module_init(psp_module_init)
+module_exit(psp_module_exit)
+MODULE_LICENSE("GPL");
-- 
1.5.3.4


^ permalink raw reply related	[flat|nested] 7+ messages in thread

* Re: [PATCHv2 1/3] NET_SCHED: PSPacer qdisc module
  2007-11-26 15:50 [PATCHv2 1/3] NET_SCHED: PSPacer qdisc module Ryousei Takano
@ 2007-11-26 18:15 ` Patrick McHardy
  2007-11-27 11:08   ` TAKANO Ryousei
  0 siblings, 1 reply; 7+ messages in thread
From: Patrick McHardy @ 2007-11-26 18:15 UTC (permalink / raw)
  To: Ryousei Takano; +Cc: netdev, shemminger, dada1, t.kudoh

Ryousei Takano wrote:
> This patch includes the PSPacer (Precise Software Pacer) qdisc
> module, which achieves precise transmission bandwidth control.
> You can find more information at the project web page
> (http://www.gridmpi.org/gridtcp.jsp).


Thanks for the update, but you didn't answer any of my questions.
Another round of comments below.

> Signed-off-by: Ryousei Takano <takano-ryousei@aist.go.jp>
> ---
>  include/linux/pkt_sched.h |   37 ++
>  net/sched/Kconfig         |    9 +
>  net/sched/Makefile        |    1 +
>  net/sched/sch_psp.c       |  958 +++++++++++++++++++++++++++++++++++++++++++++
>  4 files changed, 1005 insertions(+), 0 deletions(-)
>  create mode 100644 net/sched/sch_psp.c
> 
> diff --git a/include/linux/pkt_sched.h b/include/linux/pkt_sched.h
> index 919af93..fda41cd 100644
> --- a/include/linux/pkt_sched.h
> +++ b/include/linux/pkt_sched.h
> @@ -430,6 +430,43 @@ enum {
>  
>  #define TCA_ATM_MAX	(__TCA_ATM_MAX - 1)
>  
> +/* Precise Software Pacer section */
> +
> +#define TC_PSP_MAXDEPTH (8)
> +
> +typedef long long gapclock_t;

It seems you only add to this, does it need to be signed?
How about using a fixed size type (u64) and getting rid
of the typedef?

> +
> +enum {
> +	MODE_NORMAL = 0,
> +	MODE_STATIC = 1,
> +};
> +
> +struct tc_psp_copt
> +{
> +	__u32	level;
> +	__u32	mode;
> +	__u32	rate;
> +};
> +
> +struct tc_psp_qopt
> +{
> +	__u32	defcls;
> +	__u32	rate;
> +};


What unit is rate measured in?

> +
> +struct tc_psp_xstats
> +{
> +	__u32	bytes;		/* gap packet statistics */
> +	__u32	packets;
> +};

How about using gnet_stats_basic for this?

> +
> +enum
> +{
> +	TCA_PSP_UNSPEC,
> +	TCA_PSP_COPT,
> +	TCA_PSP_QOPT,
> +};
> +
>  /* Network emulator */
>  

> +++ b/net/sched/sch_psp.c
> @@ -0,0 +1,958 @@
> +/*
> + * net/sched/sch_psp.c	PSPacer: Precise Software Pacer
> + *
> + *		Copyright (C) 2004-2007 National Institute of Advanced
> + *		Industrial Science and Technology (AIST), Japan.
> + *
> + *		This program is free software; you can redistribute it and/or
> + *		modify it under the terms of the GNU General Public License
> + *		as published by the Free Software Foundation; either version
> + *		2 of the License, or (at your option) any later version.
> + *
> + * Authors:	Ryousei Takano, <takano-ryousei@aist.go.jp>
> + */
> +
> +#include <linux/module.h>
> +#include <linux/types.h>
> +#include <linux/kernel.h>
> +#include <linux/netdevice.h>
> +#include <linux/skbuff.h>
> +#include <linux/rtnetlink.h>
> +#include <linux/ethtool.h>
> +#include <linux/if_arp.h>
> +#include <linux/in.h>
> +#include <linux/ip.h>
> +#include <net/pkt_sched.h>
> +
> +/* PSPacer achieves precise rate regulation results, and no microscopic
> + * burst transmission which exceeds the limit is generated.
> + *
> + * The basic idea is that transmission timing can be precisely controlled,
> + * if packets are sent back-to-back at the wire rate.  PSPacer controls
> + * the packet transmision intervals by inserting additional packets,
> + * called gap packets, between adjacent packets.  The transmission interval
> + * can be controlled accurately by adjusting the number and size of the gap
> + * packets. PSPacer uses the 802.3x PAUSE frame as the gap packet.
> + *
> + * For the purpose of adjusting the gap size, this Qdisc maintains a byte
> + * clock which is recorded by a total transmitted byte per connection.
> + * Each sub-class has a class local clock which is used to make decision
> + * whether to send a packet or not.  If there is not any packets to send,
> + * gap packets are inserted.
> + *
> + * References:
> + * [1] R.Takano, T.Kudoh, Y.Kodama, M.Matsuda, H.Tezuka, and Y.Ishikawa,
> + *     "Design and Evaluation of Precise Software Pacing Mechanisms for
> + *     Fast Long-Distance Networks", PFLDnet2005.
> + * [2] http://www.gridmpi.org/gridtcp.jsp
> + */
> +
> +#define HW_GAP (16)	/* Preamble(8) + Inter Frame Gap(8) */
> +#define FCS    (4)	/* Frame Check Sequence(4) */
> +#define MIN_GAP (64)	/* Minimum size of gap packet */
> +#define MIN_TARGET_RATE (1000) /* 1 KB/s (= 8 Kbps) */

What is the reason for this minimum?

> +
> +#define PSP_HSIZE (16)
> +
> +#define BIT2BYTE(n) ((n) >> 3)

Please remove this and simply open code "/ BITS_PER_BYTE" in
the only spot using it.

> +
> +struct psp_class
> +{
> +	u32 classid;			/* class id */
> +	int refcnt;			/* reference count */
> +
> +	struct gnet_stats_basic bstats;	/* basic stats */
> +	struct gnet_stats_queue qstats;	/* queue stats */
> +
> +	int level;			/* class level in hierarchy */
> +	struct psp_class *parent;	/* parent class */
> +	struct list_head sibling;	/* sibling classes */
> +	struct list_head children;	/* child classes */
> +
> +	struct Qdisc *qdisc;		/* leaf qdisc */
> +
> +	struct tcf_proto *filter_list;	/* filter list */
> +	int filter_cnt;			/* filter count */
> +
> +	struct list_head hlist;		/* hash list */
> +	struct list_head dlist;		/* drop list */
> +	struct list_head plist;		/* normal/pacing class qdisc list */
> +
> +	int activity;			/* activity flag */
> +#define FLAG_ACTIVE (0x00000001)	/*  this class has packets or not */
> +#define FLAG_DMARK  (0x00000002)	/*  reset mark */
> +	int mode;			/* normal/pacing */
> +	u32 rate;			/* current target rate */
> +	u32 max_rate;			/* maximum target rate */
> +	u32 allocated_rate;		/* allocated rate to children */
> +	int gapsize;			/* current gapsize */

Please use unsigned int for everything related to packet sizes
(including mtu).

> +	gapclock_t clock;		/* class local clock */
> +	long qtime;			/* sender-side queuing time (us) */

Unused

> +};
> +
> +struct psp_sched_data
> +{
> +	int defcls;				/* default class id */
> +	struct list_head root;			/* root class list */
> +	struct list_head hash[PSP_HSIZE];	/* class hash */
> +	struct list_head drop_list;		/* active leaf class list (for
> +						   dropping) */
> +	struct list_head pacing_list;		/* gap leaf class list (in
> +						   order of the gap size) */
> +	struct list_head normal_list;		/* no gap leaf class list */
> +
> +	struct sk_buff_head requeue;		/* requeued packet */
> +
> +	struct tcf_proto *filter_list;		/* filter list */
> +	int filter_cnt;				/* filter count */
> +
> +	u32 max_rate;				/* physical rate */
> +	u32 allocated_rate;			/* sum of allocated rate */
> +	int mtu;				/* interface MTU size
> +						   (included ethernet heaer) */
> +	gapclock_t clock;			/* wall clock */
> +
> +	struct sk_buff *gap;			/* template of gap packets */
> +	struct tc_psp_xstats xstats;		/* psp specific stats */
> +};
> +
> +/* A gap packet header (struct ethhdr + h_opcode). */
> +struct gaphdr {
> +	unsigned char h_dest[ETH_ALEN];		/* destination eth addr */
> +	unsigned char h_source[ETH_ALEN];	/* source eth addr */
> +	__be16 h_proto;				/* MAC control */
> +	__be16 h_opcode;			/* MAC control opcode */
> +} __attribute__((packed));
> +
> +static const unsigned char gap_dest[ETH_ALEN] = {0x01, 0x80, 0xc2, 0x00,
> +						 0x00, 0x01};
> +
> +
> +static struct sk_buff *alloc_gap_packet(struct Qdisc *sch, int size)
> +{
> +	struct sk_buff *skb;
> +	struct net_device *dev = sch->dev;
> +	struct gaphdr *gap;
> +	int pause_time = 0;
> +
> +	skb = alloc_skb(size, GFP_ATOMIC);

Only called from ->init, so GFP_KERNEL is fine.

> +	if (!skb)
> +		return NULL;
> +
> +	memset(skb->data, 0xff, size);

memsetting the header portion seems unnecessary since you overwrite
it again directly below.

> +
> +	gap = (struct gaphdr *)skb->data;
> +	memcpy(gap->h_dest, gap_dest, ETH_ALEN);
> +	memcpy(gap->h_source, dev->dev_addr, ETH_ALEN);
> +	gap->h_proto = htons(ETH_P_PAUSE);
> +	gap->h_opcode = htons(pause_time);
> +
> +	skb_put(skb, size);

This is usually done before putting data in the packet.

> +
> +	skb->dev = sch->dev;
> +	skb->protocol = ETH_P_802_3;

htons()?

> +	skb_reset_network_header(skb); /* For dev_queue_xmit_nit(). */
> +
> +	return skb;
> +}
> +
> +static inline unsigned int psp_hash(u32 h)
> +{
> +	h ^= h >> 8;
> +	h ^= h >> 4;
> +	return h & (PSP_HSIZE - 1);
> +}
> +
> +static inline struct psp_class *psp_find(u32 handle, struct Qdisc *sch)
> +{
> +	struct psp_sched_data *q = qdisc_priv(sch);
> +	struct psp_class *cl;
> +
> +	list_for_each_entry(cl, &q->hash[psp_hash(handle)], hlist) {
> +		if (cl->classid == handle)
> +			return cl;
> +	}
> +	return NULL;
> +}
> +
> +static struct psp_class *psp_classify(struct sk_buff *skb, struct Qdisc *sch,
> +				      int *qerr)
> +{
> +	struct psp_sched_data *q = qdisc_priv(sch);
> +	struct psp_class *cl;
> +	struct tcf_result res;
> +	struct tcf_proto *tcf;
> +	int result;
> +
> +	cl = psp_find(skb->priority, sch);

You could skip the search if the majors don't match:

         if (TC_H_MAJ(skb->priority ^ sch->handle) == 0 &&
             (cl = psp_find(skb->priority, sch)) != NULL)

> +	if (cl != NULL && cl->level == 0)
> +		return cl;
> +
> +	*qerr = NET_XMIT_BYPASS;
> +	tcf = q->filter_list;
> +	while (tcf && (result = tc_classify(skb, tcf, &res)) >= 0) {
> +#ifdef CONFIG_NET_CLS_ACT
> +		switch (result) {
> +		case TC_ACT_QUEUED:
> +		case TC_ACT_STOLEN:
> +			*qerr = NET_XMIT_SUCCESS;
> +		case TC_ACT_SHOT:
> +			return NULL;
> +		}
> +#endif
> +		cl = (struct psp_class *)res.class;
> +		if (cl == NULL) {
> +			cl = psp_find(res.classid, sch);
> +			if (cl == NULL)
> +				break; /* filter selected invalid classid */
> +		}
> +
> +		if (cl->level == 0)
> +			return cl; /* hit leaf class */
> +
> +		/* apply inner filter chain */
> +		tcf = cl->filter_list;
> +	}
> +
> +	/* classification failed, try default class */
> +	cl = psp_find(TC_H_MAKE(TC_H_MAJ(sch->handle), q->defcls), sch);
> +	if (cl == NULL || cl->level > 0)
> +		return NULL;
> +
> +	return cl;
> +}
> +
> +static inline void psp_activate(struct psp_sched_data *q, struct psp_class *cl)
> +{
> +	cl->activity |= FLAG_ACTIVE;
> +	list_add_tail(&cl->dlist, &q->drop_list);
> +}
> +
> +static inline void psp_deactivate(struct psp_sched_data *q,
> +				  struct psp_class *cl)
> +{
> +	cl->activity &= ~FLAG_ACTIVE;
> +	list_del_init(&cl->dlist);
> +}
> +
> +static void add_leaf_class(struct psp_sched_data *q, struct psp_class *cl)
> +{
> +	struct psp_class *p;
> +	int mtu = q->mtu + FCS;
> +
> +	/* chain normal/pacing class list */
> +	switch (cl->mode) {
> +	case MODE_NORMAL:
> +		list_add_tail(&cl->plist, &q->normal_list);
> +		break;
> +	case MODE_STATIC:
> +		cl->gapsize = (((q->max_rate / 1000) * mtu)
> +			       / (cl->rate / 1000)) - mtu;
> +		cl->gapsize -= (HW_GAP + FCS) * DIV_ROUND_UP(q->max_rate,
> +							     cl->rate);
> +		cl->gapsize = max_t(int, cl->gapsize, MIN_GAP);
> +		cl->activity |= FLAG_DMARK;
> +		list_for_each_entry(p, &q->pacing_list, plist) {
> +			if (cl->gapsize < p->gapsize)
> +				break;
> +		}
> +		list_add_tail(&cl->plist, &p->plist);
> +		break;
> +	}
> +}
> +
> +static int recalc_gapsize(struct sk_buff *skb, struct Qdisc *sch)
> +{
> +	struct psp_sched_data *q = qdisc_priv(sch);
> +	struct psp_class *cl;
> +	unsigned int len = skb->len;
> +	int gapsize = 0;
> +	int err;
> +
> +	cl = psp_classify(skb, sch, &err);
> +	switch (cl->mode) {
> +	case MODE_STATIC:
> +		gapsize = cl->gapsize;
> +		if (len < q->mtu) /* workaround for overflow */
> +			gapsize = (int)((long long)gapsize * len / q->mtu);

No need to cast to the LHS type. Shouldn't this also be rounded up
like the other gapsize calculation?

> +		break;
> +	}
> +
> +	return max_t(int, gapsize, MIN_GAP);
> +}
> +
> +/* Update byte clocks
> + * When a packet is sent out:
> + *     Qdisc's clock += packet length
> + *     if the class is the pacing class:
> + *         class's clock += packet length + gap length
> + */
> +static void update_clocks(struct sk_buff *skb, struct Qdisc *sch,
> +			  struct psp_class *cl)
> +{
> +	struct psp_sched_data *q = qdisc_priv(sch);
> +	unsigned int len = skb->len;
> +	int gaplen;
> +
> +	q->clock += len;
> +	if (cl == NULL || cl->mode == MODE_NORMAL)
> +		return;
> +
> +	/* pacing class */
> +	gaplen = recalc_gapsize(skb, sch);
> +	if (!(cl->activity & FLAG_DMARK)) {
> +		cl->clock += len + gaplen;
> +	} else { /* reset class clock */
> +		cl->activity &= ~FLAG_DMARK;
> +		cl->clock = q->clock + gaplen;
> +	}
> +}
> +
> +/* Lookup next target class
> + * Firstly, search the pacing class list:
> + *     If the Qdisc's clock < the class's clock then the class is selected.
> + * Secondly, search the normal class list.
> + *
> + * Finally, a gap packet is inserted, because there is not any packets
> + * to send out.  And it returns the size of the gap packet.
> + */
> +static struct psp_class *lookup_next_class(struct Qdisc *sch, int *gapsize)
> +{
> +	struct psp_sched_data *q = qdisc_priv(sch);
> +	struct psp_class *cl, *next = NULL;
> +	gapclock_t diff;
> +	int shortest;
> +
> +	/* pacing class */
> +	shortest = q->mtu;
> +	list_for_each_entry(cl, &q->pacing_list, plist) {
> +		diff = cl->clock - q->clock;
> +		if (diff > 0) {
> +			if ((gapclock_t)shortest > diff)
> +				shortest = (int)diff;

Is diff guaranteed not to exceed an int?

> +			continue;
> +		}
> +		if (!(cl->activity & FLAG_ACTIVE)) {
> +			cl->activity |= FLAG_DMARK;
> +			continue;
> +		}
> +
> +		if (next == NULL)
> +			next = cl;
> +	}
> +	if (next)
> +		return next;
> +
> +	/* normal class */
> +	list_for_each_entry(cl, &q->normal_list, plist) {
> +		if (!(cl->activity & FLAG_ACTIVE))
> +			continue;
> +
> +		list_move_tail(&cl->plist, &q->normal_list);
> +		return cl;
> +	}
> +
> +	/* gap packet */
> +	*gapsize = max_t(int, shortest, sizeof(struct gaphdr));
> +	return NULL;
> +}
> +
> +static int psp_enqueue(struct sk_buff *skb, struct Qdisc *sch)
> +{
> +	struct psp_sched_data *q = qdisc_priv(sch);
> +	struct psp_class *cl;
> +	int err;
> +
> +	cl = psp_classify(skb, sch, &err);
> +	if (cl == NULL) {
> +		if (err == NET_XMIT_BYPASS)
> +			sch->qstats.drops++;
> +		kfree_skb(skb);
> +		return err;
> +	}
> +
> +	err = cl->qdisc->ops->enqueue(skb, cl->qdisc);
> +	if (unlikely(err != NET_XMIT_SUCCESS)) {
> +		sch->qstats.drops++;
> +		cl->qstats.drops++;
> +		return err;
> +	}
> +
> +	cl->bstats.packets++;
> +	cl->bstats.bytes += skb->len;
> +	if (!(cl->activity & FLAG_ACTIVE))
> +		psp_activate(q, cl);
> +
> +	sch->q.qlen++;
> +	sch->bstats.packets++;
> +	sch->bstats.bytes += skb->len;
> +	return NET_XMIT_SUCCESS;
> +}
> +
> +static int psp_requeue(struct sk_buff *skb, struct Qdisc *sch)
> +{
> +	struct psp_sched_data *q = qdisc_priv(sch);
> +
> +	__skb_queue_head(&q->requeue, skb);
> +	sch->q.qlen++;
> +	sch->qstats.requeues++;
> +
> +	return NET_XMIT_SUCCESS;
> +}
> +
> +static struct sk_buff *psp_dequeue(struct Qdisc *sch)
> +{
> +	struct sk_buff *skb = NULL;
> +	struct psp_sched_data *q = qdisc_priv(sch);
> +	struct psp_class *cl;
> +	int gapsize;
> +
> +	if (sch->q.qlen == 0)
> +		return NULL;
> +
> +	/* requeue */
> +	skb = __skb_dequeue(&q->requeue);
> +	if (skb != NULL) {
> +		sch->q.qlen--;
> +		return skb;
> +	}
> +
> +	/* normal/pacing class */
> +	cl = lookup_next_class(sch, &gapsize);
> +	if (cl != NULL) {
> +		skb = cl->qdisc->ops->dequeue(cl->qdisc);
> +		if (skb == NULL)
> +			return NULL;
> +
> +		sch->q.qlen--;
> +
> +		goto update_clocks;
> +	}
> +
> +	/* gap packets */
> +	skb = skb_clone(q->gap, GFP_ATOMIC);
> +	if (unlikely(!skb)) {
> +		printk(KERN_ERR "psp: cannot clone a gap packet.\n");
> +		return NULL;
> +	}
> +	skb_trim(skb, gapsize);
> +	q->xstats.bytes += gapsize;
> +	q->xstats.packets++;
> +
> + update_clocks:
> +	update_clocks(skb, sch, cl);
> +	if (cl && cl->qdisc->q.qlen == 0)
> +		psp_deactivate(q, cl);
> +	return skb;
> +}
> +
> +static unsigned int psp_drop(struct Qdisc *sch)
> +{
> +	struct psp_sched_data *q = qdisc_priv(sch);
> +	struct psp_class *cl;
> +	unsigned int len;
> +
> +	list_for_each_entry(cl, &q->drop_list, dlist) {
> +		if (cl->qdisc->ops->drop != NULL
> +		    && (len = cl->qdisc->ops->drop(cl->qdisc)) > 0) {

Please put the && on the first line (everywhere else where you
have this style too please).

> +			if (cl->qdisc->q.qlen == 0) {
> +				psp_deactivate(q, cl);
> +			} else {
> +				list_move_tail(&cl->dlist, &q->drop_list);
> +			}

CodingStyle:

Do not unnecessarily use braces where a single statement will do.

if (condition)
         action();

> +			cl->qstats.drops++;
> +			sch->qstats.drops++;
> +			sch->q.qlen--;
> +			return len;
> +		}
> +	}
> +	return 0;
> +}
> +
> +static void psp_reset(struct Qdisc *sch)
> +{
> +	struct psp_sched_data *q = qdisc_priv(sch);
> +	struct psp_class *cl;
> +	int i;
> +
> +	for (i = 0; i < PSP_HSIZE; i++) {
> +		list_for_each_entry(cl, &q->hash[i], hlist) {
> +			if (cl->level == 0)
> +				qdisc_reset(cl->qdisc);
> +		}
> +	}
> +
> +	sch->q.qlen = 0;
> +}
> +
> +static void psp_destroy_class(struct Qdisc *sch, struct psp_class *cl)
> +{
> +	struct psp_sched_data *q = qdisc_priv(sch);
> +
> +	if (cl->mode == MODE_STATIC) {
> +		if (cl->parent) {
> +			cl->parent->allocated_rate -= cl->rate;
> +		} else {
> +			q->allocated_rate -= cl->rate;
> +		}
> +	}
> +	if (cl->level == 0) {
> +		sch->q.qlen -= cl->qdisc->q.qlen;
> +		qdisc_destroy(cl->qdisc);

qdisc_tree_decrease_qlen. But actually that should be done in
->delete_class since the lock is dropped in between and this
creates an externally visible error in the qlen counter.

> +	}
> +
> +	tcf_destroy_chain(q->filter_list);
> +
> +	while (!list_empty(&cl->children))
> +		psp_destroy_class(sch, list_entry(cl->children.next,
> +						  struct psp_class, sibling));
> +
> +	list_del(&cl->hlist);
> +	list_del(&cl->sibling);
> +	psp_deactivate(q, cl);
> +	if (cl->level == 0)
> +		list_del(&cl->plist);
> +	kfree(cl);
> +}
> +
> +static void psp_destroy(struct Qdisc *sch)
> +{
> +	struct psp_sched_data *q = qdisc_priv(sch);
> +
> +	tcf_destroy_chain(q->filter_list);
> +
> +	while (!list_empty(&q->root))
> +		psp_destroy_class(sch, list_entry(q->root.next,
> +						  struct psp_class, sibling));

list_for_each_entry_safe.

> +	__skb_queue_purge(&q->requeue);
> +
> +	/* free gap packet */
> +	kfree_skb(q->gap);
> +}
> +
> +static int psp_change_qdisc(struct Qdisc *sch, struct rtattr *opt)
> +{
> +	struct psp_sched_data *q = qdisc_priv(sch);
> +	struct rtattr *tb[TCA_PSP_QOPT];
> +	struct tc_psp_qopt *qopt;
> +
> +	if (!opt || rtattr_parse_nested(tb, TCA_PSP_QOPT, opt) ||
> +	    tb[TCA_PSP_QOPT-1] == NULL ||
> +	    RTA_PAYLOAD(tb[TCA_PSP_QOPT-1]) < sizeof(*qopt))
> +		return -EINVAL;
> +
> +	qopt = RTA_DATA(tb[TCA_PSP_QOPT-1]);
> +
> +	sch_tree_lock(sch);
> +	q->defcls = qopt->defcls;
> +	if (qopt->rate)
> +		q->max_rate = qopt->rate;
> +	sch_tree_unlock(sch);
> +	return 0;
> +}
> +
> +static int psp_init(struct Qdisc *sch, struct rtattr *opt)
> +{
> +	struct psp_sched_data *q = qdisc_priv(sch);
> +	struct rtattr *tb[TCA_PSP_QOPT];
> +	struct net_device *dev = sch->dev;
> +	struct tc_psp_qopt *qopt;
> +	struct ethtool_cmd cmd = { ETHTOOL_GSET };
> +	int i;
> +
> +	if (dev->type != ARPHRD_ETHER) {
> +		printk(KERN_ERR "psp: PSPacer only supports Ethernet " \
> +		       "devices.\n");
> +		return -EINVAL;
> +	}
> +
> +#ifdef NETIF_F_TSO

Unnecessary #ifdef.

> +	/* check TSO support */
> +	if (ethtool_op_get_tso(dev)) {

dev->features & NETIF_F_TSO. What happens if TSO is enabled later on?

> +		printk(KERN_ERR "psp: PSPacer does not support TSO.  " \
> +		       "You must disable it: \"ethtool -K %s tso off\"\n",
> +		       dev->name);
> +		return -EINVAL;
> +	}
> +#endif
> +
> +	if (!opt || rtattr_parse_nested(tb, TCA_PSP_QOPT, opt) ||
> +	    tb[TCA_PSP_QOPT-1] == NULL ||
> +	    RTA_PAYLOAD(tb[TCA_PSP_QOPT-1]) < sizeof(*qopt))
> +		return -EINVAL;
> +
> +	qopt = RTA_DATA(tb[TCA_PSP_QOPT-1]);
> +
> +	memset(q, 0, sizeof(*q));
> +	q->defcls = qopt->defcls;
> +	q->mtu = dev->mtu + dev->hard_header_len;
> +	q->gap = alloc_gap_packet(sch, q->mtu);
> +	if (q->gap == NULL)
> +		return -ENOBUFS;
> +	if (qopt->rate == 0) {
> +		/* set qdisc max rate.  If the kernel supports ethtool ioctl,
> +		 * it sets to that value, otherwise it statically sets to
> +		 * the GbE transmission rate (i.e. 125MB/s). */
> +		/* NOTE: Since ethtool's {cmd.speed} specifies Mbps,
> +		 * the value is converted in units of byte/sec */
> +		q->max_rate = 125000000;
> +
> +		if (dev->ethtool_ops && dev->ethtool_ops->get_settings) {
> +			if (dev->ethtool_ops->get_settings(dev, &cmd) == 0)
> +				q->max_rate = BIT2BYTE(cmd.speed * 1000000);
> +		}
> +	} else {
> +		q->max_rate = qopt->rate;
> +	}
> +
> +	INIT_LIST_HEAD(&q->root);
> +	for (i = 0; i < PSP_HSIZE; i++)
> +		INIT_LIST_HEAD(q->hash + i);
> +	INIT_LIST_HEAD(&q->drop_list);
> +	INIT_LIST_HEAD(&q->pacing_list);
> +	INIT_LIST_HEAD(&q->normal_list);
> +	skb_queue_head_init(&q->requeue);
> +
> +	return 0;
> +}
> +
> +static int psp_dump_qdisc(struct Qdisc *sch, struct sk_buff *skb)
> +{
> +	struct psp_sched_data *q = qdisc_priv(sch);
> +	unsigned char *b = skb_tail_pointer(skb);
> +	struct rtattr *rta;
> +	struct tc_psp_qopt qopt;
> +
> +	qopt.defcls = q->defcls;
> +	qopt.rate = q->max_rate;
> +	rta = (struct rtattr *)b;
> +	RTA_PUT(skb, TCA_OPTIONS, 0, NULL);
> +	RTA_PUT(skb, TCA_PSP_QOPT, sizeof(qopt), &qopt);
> +	rta->rta_len = skb_tail_pointer(skb) - b;
> +
> +	return skb->len;
> +
> +rtattr_failure:
> +	skb_trim(skb, skb_tail_pointer(skb) - skb->data);
> +	return -1;
> +}
> +
> +static int psp_dump_qdisc_stats(struct Qdisc *sch, struct gnet_dump *d)
> +{
> +	struct psp_sched_data *q = qdisc_priv(sch);
> +
> +	return gnet_stats_copy_app(d, &q->xstats, sizeof(q->xstats));
> +}
> +
> +static int psp_dump_class(struct Qdisc *sch, unsigned long arg,
> +			  struct sk_buff *skb, struct tcmsg *tcm)
> +{
> +	struct psp_class *cl = (struct psp_class *)arg;
> +	unsigned char *b = skb_tail_pointer(skb);
> +	struct rtattr *rta;
> +	struct tc_psp_copt copt;
> +
> +	tcm->tcm_parent = cl->parent ? cl->parent->classid : TC_H_ROOT;
> +	tcm->tcm_handle = cl->classid;
> +	if (cl->level == 0) {
> +		tcm->tcm_info = cl->qdisc->handle;
> +		cl->qstats.qlen = cl->qdisc->q.qlen;
> +	}
> +
> +	rta = (struct rtattr *)b;
> +	RTA_PUT(skb, TCA_OPTIONS, 0, NULL);
> +	memset(&copt, 0, sizeof(copt));
> +	copt.level = cl->level;
> +	copt.mode = cl->mode;
> +	copt.rate = cl->max_rate;
> +	RTA_PUT(skb, TCA_PSP_COPT, sizeof(copt), &copt);
> +	RTA_PUT(skb, TCA_PSP_QOPT, 0, NULL);
> +	rta->rta_len = skb_tail_pointer(skb) - b;
> +
> +	return skb->len;
> + rtattr_failure:
> +	skb_trim(skb, b - skb->data);
> +	return -1;
> +}
> +
> +static int psp_dump_class_stats(struct Qdisc *sch, unsigned long arg,
> +				struct gnet_dump *d)
> +{
> +	struct psp_class *cl = (struct psp_class *)arg;
> +
> +	if (gnet_stats_copy_basic(d, &cl->bstats) < 0 ||
> +	    gnet_stats_copy_queue(d, &cl->qstats) < 0)
> +		return -1;
> +
> +	return 0;
> +}
> +
> +static int psp_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
> +		     struct Qdisc **old)
> +{
> +	struct psp_class *cl = (struct psp_class *)arg;
> +
> +	if (cl == NULL)
> +		return -ENOENT;
> +	if (cl->level != 0)
> +		return -EINVAL;
> +	if (new == NULL) {
> +		new = qdisc_create_dflt(sch->dev, &pfifo_qdisc_ops,
> +					cl->classid);
> +		if (new == NULL)
> +			new = &noop_qdisc;
> +	}
> +
> +	sch_tree_lock(sch);
> +	*old = xchg(&cl->qdisc, new);
> +	qdisc_tree_decrease_qlen(*old, (*old)->q.qlen);
> +	qdisc_reset(*old);
> +	sch_tree_unlock(sch);
> +	return 0;
> +}
> +
> +static struct Qdisc *psp_leaf(struct Qdisc *sch, unsigned long arg)
> +{
> +	struct psp_class *cl = (struct psp_class *)arg;
> +
> +	return (cl != NULL && cl->level == 0) ? cl->qdisc : NULL;
> +}
> +
> +static unsigned long psp_get(struct Qdisc *sch, u32 classid)
> +{
> +	struct psp_class *cl = psp_find(classid, sch);
> +
> +	if (cl)
> +		cl->refcnt++;
> +	return (unsigned long)cl;
> +}
> +
> +static void psp_put(struct Qdisc *sch, unsigned long arg)
> +{
> +	struct psp_class *cl = (struct psp_class *)arg;
> +
> +	if (--cl->refcnt == 0)
> +		psp_destroy_class(sch, cl);
> +}
> +
> +static int psp_change_class(struct Qdisc *sch, u32 classid, u32 parentid,
> +			    struct rtattr **tca, unsigned long *arg)
> +{
> +	struct psp_sched_data *q = qdisc_priv(sch);
> +	struct psp_class *cl = (struct psp_class *)*arg, *parent;
> +	struct rtattr *opt = tca[TCA_OPTIONS-1];
> +	struct rtattr *tb[TCA_PSP_QOPT];
> +	struct tc_psp_copt *copt;
> +	unsigned int limit;
> +
> +	if (opt == NULL
> +	    || rtattr_parse(tb, TCA_PSP_QOPT, RTA_DATA(opt), RTA_PAYLOAD(opt)))
> +		return -EINVAL;
> +
> +	copt = RTA_DATA(tb[TCA_PSP_COPT - 1]);
> +
> +	parent = (parentid == TC_H_ROOT ? NULL : psp_find(parentid, sch));
> +
> +	if (cl == NULL) { /* create new class */
> +		struct Qdisc *new_q;
> +
> +		cl = kmalloc(sizeof(struct psp_class), GFP_KERNEL);
> +		if (cl == NULL)
> +			return -ENOBUFS;
> +		memset(cl, 0, sizeof(struct psp_class));

kzalloc

> +		cl->refcnt = 1;
> +		INIT_LIST_HEAD(&cl->sibling);
> +		INIT_LIST_HEAD(&cl->children);
> +		INIT_LIST_HEAD(&cl->hlist);
> +		INIT_LIST_HEAD(&cl->dlist);
> +		INIT_LIST_HEAD(&cl->plist);
> +
> +		new_q = qdisc_create_dflt(sch->dev, &pfifo_qdisc_ops, classid);
> +
> +		sch_tree_lock(sch);
> +		while (parent && parent->level != 0) {
> +			/* turn parent into inner node */
> +			sch->q.qlen -= parent->qdisc->q.qlen;
> +			qdisc_destroy(parent->qdisc);

Also needs qdisc_tree_decrease_qlen.

> +			psp_deactivate(q, cl);
> +			list_del(&parent->plist);
> +
> +			parent->level = (parent->parent
> +					 ? parent->parent->level
> +					 : TC_PSP_MAXDEPTH) - 1;
> +		}
> +		cl->qdisc = new_q ? new_q : &noop_qdisc;
> +		cl->classid = classid;
> +		cl->parent = parent;
> +
> +		list_add_tail(&cl->hlist, q->hash + psp_hash(classid));
> +		list_add_tail(&cl->sibling, (parent
> +					     ? &parent->children : &q->root));
> +	} else {
> +		if (cl->mode == MODE_STATIC)
> +			q->allocated_rate -= cl->rate;
> +
> +		sch_tree_lock(sch);
> +	}
> +
> +	/* setup mode and target rate */
> +	cl->mode = copt->mode;
> +	if (copt->rate < MIN_TARGET_RATE)
> +		copt->rate = MIN_TARGET_RATE;
> +	cl->max_rate = copt->rate;
> +	switch (cl->mode) {
> +	case MODE_STATIC:
> +		limit = (parent ? parent->allocated_rate : q->allocated_rate)
> +			+ cl->max_rate;
> +		if (limit > q->max_rate) {
> +			printk(KERN_ERR "psp: target rate is oversubscribed!");
> +			list_del_init(&cl->hlist);
> +			psp_deactivate(q, cl);
> +			if (--cl->refcnt == 0)
> +				psp_destroy_class(sch, cl);
> +			sch_tree_unlock(sch);
> +			return -EINVAL;
> +		}
> +		cl->rate = cl->max_rate;
> +		if (parent) {
> +			parent->allocated_rate += cl->rate;
> +		} else {
> +			q->allocated_rate += cl->rate;
> +		}
> +		break;
> +	}
> +
> +	if (cl->level == 0) {
> +		if (!list_empty(&cl->plist))
> +			list_del(&cl->plist);
> +		add_leaf_class(q, cl);
> +	}
> +	sch_tree_unlock(sch);
> +	*arg = (unsigned long)cl;
> +	return 0;
> +}
> +
> +static struct tcf_proto **psp_find_tcf(struct Qdisc *sch, unsigned long arg)
> +{
> +	struct psp_sched_data *q = qdisc_priv(sch);
> +	struct psp_class *cl = (struct psp_class *)arg;
> +	struct tcf_proto **fl = cl ? &cl->filter_list : &q->filter_list;
> +
> +	return fl;
> +}
> +
> +static unsigned long psp_bind_filter(struct Qdisc *sch, unsigned long parent,
> +				     u32 classid)
> +{
> +	struct psp_sched_data *q = qdisc_priv(sch);
> +	struct psp_class *cl = psp_find(classid, sch);
> +
> +	if (cl)
> +		cl->filter_cnt++;
> +	else
> +		q->filter_cnt++;
> +	return (unsigned long)cl;
> +}
> +
> +static void psp_unbind_filter(struct Qdisc *sch, unsigned long arg)
> +{
> +	struct psp_sched_data *q = qdisc_priv(sch);
> +	struct psp_class *cl = (struct psp_class *)arg;
> +
> +	if (cl)
> +		cl->filter_cnt--;
> +	else
> +		q->filter_cnt--;
> +}
> +
> +static int psp_delete(struct Qdisc *sch, unsigned long arg)
> +{
> +	struct psp_sched_data *q = qdisc_priv(sch);
> +	struct psp_class *cl = (struct psp_class *)arg;
> +
> +	if (!list_empty(&cl->children) || cl->filter_cnt)
> +		return -EBUSY;
> +
> +	sch_tree_lock(sch);
> +
> +	list_del_init(&cl->hlist);
> +	psp_deactivate(q, cl);
> +	list_del_init(&cl->plist);

It seems you need to adjust the class level here.

> +	if (--cl->refcnt == 0)
> +		psp_destroy_class(sch, cl);
> +
> +	sch_tree_unlock(sch);
> +	return 0;
> +}

> +
> +static const struct Qdisc_class_ops psp_class_ops = {
> +	.graft		=	psp_graft,
> +	.leaf		=	psp_leaf,
> +	.get		=	psp_get,
> +	.put		=	psp_put,
> +	.change		=	psp_change_class,
> +	.delete		=	psp_delete,
> +	.walk		=	psp_walk,
> +	.tcf_chain	=	psp_find_tcf,
> +	.bind_tcf	=	psp_bind_filter,
> +	.unbind_tcf	=	psp_unbind_filter,
> +	.dump		=	psp_dump_class,
> +	.dump_stats	=	psp_dump_class_stats,
> +};
> +
> +static struct Qdisc_ops psp_qdisc_ops __read_mostly = {
> +	.next		=	NULL,

Again, no need to initialize this to NULL.

> +	.cl_ops		=	&psp_class_ops,
> +	.id		=	"psp",
> +	.priv_size	=	sizeof(struct psp_sched_data),
> +	.enqueue	=	psp_enqueue,
> +	.dequeue	=	psp_dequeue,
> +	.requeue	=	psp_requeue,
> +	.drop		=	psp_drop,
> +	.init		=	psp_init,
> +	.reset		=	psp_reset,
> +	.destroy	=	psp_destroy,
> +	.change		=	psp_change_qdisc,
> +	.dump		=	psp_dump_qdisc,
> +	.dump_stats	=	psp_dump_qdisc_stats,
> +	.owner		=	THIS_MODULE,
> +};

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCHv2 1/3] NET_SCHED: PSPacer qdisc module
  2007-11-26 18:15 ` Patrick McHardy
@ 2007-11-27 11:08   ` TAKANO Ryousei
  2007-11-27 12:16     ` Patrick McHardy
  0 siblings, 1 reply; 7+ messages in thread
From: TAKANO Ryousei @ 2007-11-27 11:08 UTC (permalink / raw)
  To: kaber; +Cc: takano-ryousei, netdev, shemminger, dada1, t.kudoh

Hi Patrick,

I appreciate your comments.
I will update and resend patches.

> > +typedef long long gapclock_t;
> 
> It seems you only add to this, does it need to be signed?
> How about using a fixed size type (u64) and getting rid
> of the typedef?
> 
OK. I will use u64 instead of gapclock_t, and remove the typedef.

> > +struct tc_psp_qopt
> > +{
> > +	__u32	defcls;
> > +	__u32	rate;
> > +};
> 
> 
> What unit is rate measured in?
> 
'rate' is the transmission rate in bytes per sec.

> > +struct tc_psp_xstats
> > +{
> > +	__u32	bytes;		/* gap packet statistics */
> > +	__u32	packets;
> > +};
> 
> How about using gnet_stats_basic for this?
> 
OK. I will use gnet_stats_basic.

> > +	if (!skb)
> > +		return NULL;
> > +
> > +	memset(skb->data, 0xff, size);
> 
> memsetting the header portion seems unnecessary since you overwrite
> it again directly below.
> 
The size of a gap packet is variable, so the whole packet is filled
with 0xff; here 'size' is the interface MTU size.

> > +	skb_put(skb, size);
> 
> This is usually done before putting data in the packet.
> 
That is why skb_put() is called after the data has been filled in.

> > +static int recalc_gapsize(struct sk_buff *skb, struct Qdisc *sch)
> > +{
> > +	struct psp_sched_data *q = qdisc_priv(sch);
> > +	struct psp_class *cl;
> > +	unsigned int len = skb->len;
> > +	int gapsize = 0;
> > +	int err;
> > +
> > +	cl = psp_classify(skb, sch, &err);
> > +	switch (cl->mode) {
> > +	case MODE_STATIC:
> > +		gapsize = cl->gapsize;
> > +		if (len < q->mtu) /* workaround for overflow */
> > +			gapsize = (int)((long long)gapsize * len / q->mtu);
> 
> No need to cast to the LHS type. Shouldn't this also be rounded up
> like the other gapsize calculation?
> 
How about this?

	u64 gapsize = 0;
	:
	case MODE_STATIC:
		gapsize = (u64)cl->gapsize * len / q->mtu;
		gapsize = min_t(u64, gapsize, UINT_MAX);
		break;
	}

> > +static struct psp_class *lookup_next_class(struct Qdisc *sch, int *gapsize)
> > +{
> > +	struct psp_sched_data *q = qdisc_priv(sch);
> > +	struct psp_class *cl, *next = NULL;
> > +	gapclock_t diff;
> > +	int shortest;
> > +
> > +	/* pacing class */
> > +	shortest = q->mtu;
> > +	list_for_each_entry(cl, &q->pacing_list, plist) {
> > +		diff = cl->clock - q->clock;
> > +		if (diff > 0) {
> > +			if ((gapclock_t)shortest > diff)
> > +				shortest = (int)diff;
> 
> Is diff guaranteed not to exceed an int?
> 
No. But actually, 'diff' is much smaller than INT_MAX.
How about this?

	u64 diff, shortest;
	:
		if (q->clock < cl->clock) {
			diff = cl->clock - q->clock;
			if (shortest > diff)
				shortest = diff;
			continue;
		}


> > +static void psp_destroy(struct Qdisc *sch)
> > +{
> > +	struct psp_sched_data *q = qdisc_priv(sch);
> > +
> > +	tcf_destroy_chain(q->filter_list);
> > +
> > +	while (!list_empty(&q->root))
> > +		psp_destroy_class(sch, list_entry(q->root.next,
> > +						  struct psp_class, sibling));
> 
> list_for_each_entry_safe.
> 
I think it works well as is. Do I need to use list_for_each_entry_safe?

Best regards,
Ryousei Takano

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCHv2 1/3] NET_SCHED: PSPacer qdisc module
  2007-11-27 11:08   ` TAKANO Ryousei
@ 2007-11-27 12:16     ` Patrick McHardy
  2007-11-27 16:57       ` Ryousei Takano
  2007-11-28  7:19       ` Ryousei Takano
  0 siblings, 2 replies; 7+ messages in thread
From: Patrick McHardy @ 2007-11-27 12:16 UTC (permalink / raw)
  To: TAKANO Ryousei; +Cc: takano-ryousei, netdev, shemminger, dada1, t.kudoh

TAKANO Ryousei wrote:
> Hi Patrick,
> 
>>> +struct tc_psp_qopt
>>> +{
>>> +	__u32	defcls;
>>> +	__u32	rate;
>>> +};
>>
>> What unit is rate measured in?
>>
> 'rate' is the transmission rate in bytes per sec.


So wouldn't it make sense to use u64 then for 10GBit networks?

>>> +	skb_put(skb, size);
>> This is usually done before putting data in the packet.
>>
> Therefore, skb_put() is needed.


I meant this is usually done before writing to the packet data,
so you should move it up a few lines.

>>> +static int recalc_gapsize(struct sk_buff *skb, struct Qdisc *sch)
>>> +{
>>> +	struct psp_sched_data *q = qdisc_priv(sch);
>>> +	struct psp_class *cl;
>>> +	unsigned int len = skb->len;
>>> +	int gapsize = 0;
>>> +	int err;
>>> +
>>> +	cl = psp_classify(skb, sch, &err);
>>> +	switch (cl->mode) {
>>> +	case MODE_STATIC:
>>> +		gapsize = cl->gapsize;
>>> +		if (len < q->mtu) /* workaround for overflow */
>>> +			gapsize = (int)((long long)gapsize * len / q->mtu);
>> No need to cast to the LHS type. Shouldn't this also be rounded up
>> like the other gapsize calculation?
>>
> How about this?
> 
> 	u64 gapsize = 0;
> 	:
> 	case MODE_STATIC:
> 		gapsize = (u64)cl->gapsize * len / q->mtu;
> 		gapsize = min_t(u64, gapsize, UINT_MAX);


Really a minimum of UINT_MAX? This will probably also break on 32 bit
unless you use do_div().

> 		break;
> 	}
> 
>>> +static struct psp_class *lookup_next_class(struct Qdisc *sch, int *gapsize)
>>> +{
>>> +	struct psp_sched_data *q = qdisc_priv(sch);
>>> +	struct psp_class *cl, *next = NULL;
>>> +	gapclock_t diff;
>>> +	int shortest;
>>> +
>>> +	/* pacing class */
>>> +	shortest = q->mtu;
>>> +	list_for_each_entry(cl, &q->pacing_list, plist) {
>>> +		diff = cl->clock - q->clock;
>>> +		if (diff > 0) {
>>> +			if ((gapclock_t)shortest > diff)
>>> +				shortest = (int)diff;
>> Is diff guaranteed not to exceed an int?
>>
> No. But actually, 'diff' is much smaller than INT_MAX.
> How about this?
> 
> 	u64 diff, shortest;
> 	:
> 		if (q->clock < cl->clock) {
> 			diff = cl->clock - q->clock;
> 			if (shortest > diff)
> 				shortest = diff;
> 			continue;
> 		}


I don't know, its your qdisc :) I was merely wondering whether
you need larger types since there seems to be some inconsistency.
Your old code had a check for diff > 0, so it appears that
cl->clock could be smaller than q->clock.

>>> +	while (!list_empty(&q->root))
>>> +		psp_destroy_class(sch, list_entry(q->root.next,
>>> +						  struct psp_class, sibling));
>> list_for_each_entry_safe.
>>
> I think it works well. Do I need to use list_for_each_entry_safe?


I don't doubt that it works, but list_for_each_entry_safe is
the proper interface for this.

One more thing: your qdisc can only be used as root qdisc since it
produces packets itself and thereby violates the rule that a qdisc
can only hand out packets that were previously enqueued to it.
Using it as a leaf qdisc can make the upper qdiscs qlen counter
go negative, causing infinite dequeue-loops, so you should make
sure that it's only possible to use it as a root qdisc by checking the
parent. It would also be better to do something like netem
(enqueue produced packets at the root) to make sure the qlen
counter is always accurate.


^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCHv2 1/3] NET_SCHED: PSPacer qdisc module
  2007-11-27 12:16     ` Patrick McHardy
@ 2007-11-27 16:57       ` Ryousei Takano
  2007-11-28  7:19       ` Ryousei Takano
  1 sibling, 0 replies; 7+ messages in thread
From: Ryousei Takano @ 2007-11-27 16:57 UTC (permalink / raw)
  To: kaber; +Cc: netdev, shemminger, dada1, t.kudoh

> One more thing: your qdisc can only be used as root qdisc since it
> produces packets itself and thereby violates the rule that a qdisc
> can only hand out packets that were previously enqueued to it.
> Using it as a leaf qdisc can make the upper qdiscs qlen counter
> go negative, causing infinite dequeue-loops, so you should make
> sure that it's only possible to use it as a root qdisc by checking the
> parent. It would also be better to do something like netem
> (enqueue produced packets at the root) to make sure the qlen
> counter is always accurate.
> 
I agree with you.
PSPacer should not be used with other rate-regulation qdiscs. But
I think that a combination of netem and PSPacer is useful for
emulating networks. The following paper describes experimental
results using PSPacer with netem:

       "Large Scale Gigabit Emulated Testbed for Grid Transport
       Evaluation", PFLDnet 2006.
       http://www.hpcc.jp/pfldnet2006/paper/s1_02.pdf

Best regards,
Ryousei Takano

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCHv2 1/3] NET_SCHED: PSPacer qdisc module
  2007-11-27 12:16     ` Patrick McHardy
  2007-11-27 16:57       ` Ryousei Takano
@ 2007-11-28  7:19       ` Ryousei Takano
  2007-11-28  9:17         ` Patrick McHardy
  1 sibling, 1 reply; 7+ messages in thread
From: Ryousei Takano @ 2007-11-28  7:19 UTC (permalink / raw)
  To: kaber; +Cc: netdev, shemminger, dada1, t.kudoh

Hi Patrick,

> >>> +struct tc_psp_qopt
> >>> +{
> >>> +	__u32	defcls;
> >>> +	__u32	rate;
> >>> +};
> >>
> >> What unit is rate measured in?
> >>
> > 'rate' is the transmission rate in bytes per sec.
> 
> 
> So wouldn't it make sense to use u64 then for 10GBit networks?
> 
I decided to use u32 because tc_ratespec.rate is u32.
u32 is large enough for 10Gbit networks, but too small for 40Gbit.
I will use u64, because the code becomes simpler and more consistent.


> >>> +	skb_put(skb, size);
> >> This is usually done before putting data in the packet.
> >>
> > Therefore, skb_put() is needed.
> 
> 
> I meant this is usually done before writing to the packet data,
> so you should move it up a few lines.
> 
I am silly:-) I understood. Thanks.


> >>> +	while (!list_empty(&q->root))
> >>> +		psp_destroy_class(sch, list_entry(q->root.next,
> >>> +						  struct psp_class, sibling));
> >> list_for_each_entry_safe.
> >>
> > I think it works well. Should I need to use list_for_each_entry_safe?
> 
> 
> I don't doubt that it works, but list_for_each_entry_safe is
> the proper interface for this.
> 
I will fix it. Thanks.

Best regards,
Ryousei Takano

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCHv2 1/3] NET_SCHED: PSPacer qdisc module
  2007-11-28  7:19       ` Ryousei Takano
@ 2007-11-28  9:17         ` Patrick McHardy
  0 siblings, 0 replies; 7+ messages in thread
From: Patrick McHardy @ 2007-11-28  9:17 UTC (permalink / raw)
  To: Ryousei Takano; +Cc: netdev, shemminger, dada1, t.kudoh

Ryousei Takano wrote:
> Hi Patrick,
> 
>>>>> +struct tc_psp_qopt
>>>>> +{
>>>>> +	__u32	defcls;
>>>>> +	__u32	rate;
>>>>> +};
>>>> What unit is rate measured in?
>>>>
>>> 'rate' is the transmission rate in bytes per sec.
>>
>> So wouldn't it make sense to use u64 then for 10GBit networks?
>>
> I decided to use u32 because tc_ratespec.rate is u32.
> u32 is large enough for 10Gbit networks, but too small for 40Gbit.
> I will use u64, because the code becomes simpler and more consistent.


Right, I was thinking about bits per second :)




^ permalink raw reply	[flat|nested] 7+ messages in thread

end of thread, other threads:[~2007-11-28  9:17 UTC | newest]

Thread overview: 7+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2007-11-26 15:50 [PATCHv2 1/3] NET_SCHED: PSPacer qdisc module Ryousei Takano
2007-11-26 18:15 ` Patrick McHardy
2007-11-27 11:08   ` TAKANO Ryousei
2007-11-27 12:16     ` Patrick McHardy
2007-11-27 16:57       ` Ryousei Takano
2007-11-28  7:19       ` Ryousei Takano
2007-11-28  9:17         ` Patrick McHardy

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).