[LARTC] new perflow rate control queue

From: Wang Jian <lark@linux.net.cn>
To: lartc@vger.kernel.org
Subject: [LARTC] new perflow rate control queue
Date: Mon, 04 Apr 2005 07:21:17 +0000	[thread overview]
Message-ID: <20050404152117.6d90635d.lark@linux.net.cn> (raw)

[-- Attachment #1: Type: text/plain, Size: 1355 bytes --]

Hi,

One of my customer needs per flow rate control, so I write one.

The code I post here is not finished, but it seems to work as expected.

The kernel patch is agains kernel 2.6.11, the iproute2 patch is against
iproute2-2.6.11-050314. 

I write the code in a hurry to meet deadline. There are many other things
to do ahead for me. The code is written in 2 days (including read other
queue's code) and tested for a while to find obvious mistake. Don't be
suprised when you find many many bugs.

The test scenario is like this

      www server <- [ eth0   eth1 ] -> www clients

The attached t.sh is used to generate test rules. Clients download a
big ISO file from www server, so flows' rate can be estimated by view
progress. However I use wget to test the speed, so the speed is
accumulated, not current.

The problems I know:

1. The rtnetlink related code is quick hack. I am not familiar with
rtnetlink, so I look at other queue's code and use the simplest one.

2. perflow queue has no stats code. It will be added later.

3. I don't know what is the dump() method 's purpose, so I didn't write
dump() method. I will add it later when I know what it is for and how to
write rtnetlink code.

Any feedback is welcome. And test it if you can :)

PS: the code is licensed under GPL. If it is acceptable by upstream, it
will be submitted.

--
  lark

[-- Attachment #2: iproute2-2.6.11-050314-perflow.diff --]
[-- Type: application/octet-stream, Size: 4842 bytes --]

Index: iproute2-2.6.11-w/include/linux/pkt_sched.h
===================================================================

--- iproute2-2.6.11-w/include/linux/pkt_sched.h	(revision 1)
+++ iproute2-2.6.11-w/include/linux/pkt_sched.h	(working copy)
@@ -272,6 +272,28 @@
 	__u32 ctokens;
 };
 
+
+/* PERFLOW section */
+
+#define TC_PERFLOW_DEFAULTLIMIT	1024
+
+struct tc_perflow_qopt
+{
+	struct tc_ratespec 	rate;
+	struct tc_ratespec 	ceil;
+	__u32  limit;
+	__u32  qlen;
+};
+enum
+{
+	TCA_PERFLOW_UNSPEC,
+	TCA_PERFLOW_PARMS,
+	__TCA_PERFLOW_MAX,
+};
+
+#define TCA_PERFLOW_MAX	(__TCA_PERFLOW_MAX - 1)
+
+
 /* HFSC section */
 
 struct tc_hfsc_qopt
Index: iproute2-2.6.11-w/tc/q_perflow.c
===================================================================
--- iproute2-2.6.11-w/tc/q_perflow.c	(revision 0)
+++ iproute2-2.6.11-w/tc/q_perflow.c	(revision 0)
@@ -0,0 +1,153 @@
+/*
+ * q_perflow.c		PERFLOW.
+ *
+ *		This program is free software; you can redistribute it and/or
+ *		modify it under the terms of the GNU General Public License
+ *		as published by the Free Software Foundation; either version
+ *		2 of the License, or (at your option) any later version.
+ *
+ * Authors:	Wang Jian <lark@linux.net.cn>, 2005
+ *
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <syslog.h>
+#include <fcntl.h>
+#include <sys/socket.h>
+#include <netinet/in.h>
+#include <arpa/inet.h>
+#include <string.h>
+
+#include "utils.h"
+#include "tc_util.h"
+
+static void explain(void)
+{
+	fprintf(stderr,
+"Usage: ... perflow rate RATE [ ceil CEIL ] [ limit LIMIT ] [ qlen QLEN ]\n"
+"\n"
+"rate    rate allocated to each flow (flow can still borrow)\n"
+"ceil    upper limit for each flow (flow can not borrow)\n"
+"limit   maximum concurrent flows\n"
+"qlen    maximum queue length\n"
+	);
+}
+
+#define usage() return(-1)
+
+static int perflow_parse_opt(struct qdisc_util *qu, int argc, char **argv, struct nlmsghdr *n)
+{
+	struct tc_perflow_qopt opt;
+	struct rtattr *tail;
+	int ok = 0;
+
+	memset(&opt, 0, sizeof(opt));
+
+	while (argc > 0) {
+		if (strcmp(*argv, "rate") == 0) {
+			NEXT_ARG();
+			if (opt.rate.rate) {
+				fprintf(stderr, "Double \"ceil\" spec\n");
+				return -1;
+			}
+			if (get_rate(&opt.rate.rate, *argv)) {
+				fprintf(stderr, "Illegal \"rate\"\n");
+				return -1;
+			}
+			ok++;
+		} else if (strcmp(*argv, "ceil") == 0) {
+			NEXT_ARG();
+			if (opt.ceil.rate) {
+				fprintf(stderr, "Double \"ceil\" spec\n");
+				return -1;
+			}
+			if (get_rate(&opt.ceil.rate, *argv)) {
+				fprintf(stderr, "Illegal \"ceil\"\n");
+				return -1;
+			}
+			ok++;
+		} else if (strcmp(*argv, "limit") == 0) {
+			NEXT_ARG();
+			if (get_size(&opt.limit, *argv)) {
+				fprintf(stderr, "Illegal \"limit\"\n");
+				return -1;
+			}
+			ok++;
+		} else if (strcmp(*argv, "qlen") == 0) {
+			NEXT_ARG();
+			if (get_size(&opt.qlen, *argv)) {
+				fprintf(stderr, "Illegal \"limit\"\n");
+				return -1;
+			}
+			ok++;
+		} else if (strcmp(*argv, "help") == 0) {
+			explain();
+			return -1;
+		} else {
+			fprintf(stderr, "What is \"%s\"?\n", *argv);
+			explain();
+			return -1;
+		}
+		argc--; argv++;
+	}
+
+	if (!ok)
+		return 0;
+
+	if (opt.rate.rate == 0) {
+		fprintf(stderr, "\"rate\" is required.\n");
+		return -1;
+	}
+
+	if (opt.ceil.rate == 0)
+		opt.ceil = opt.rate;
+
+	if (opt.ceil.rate < opt.rate.rate) {
+		fprintf(stderr, "\"ceil\" must be >= \"rate\".\n");
+		return -1;
+	}
+
+	if (opt.limit == 0)
+		opt.limit = TC_PERFLOW_DEFAULTLIMIT;
+
+	if (opt.qlen > 0 && opt.qlen < 5) {
+		fprintf(stderr, "\"qlen\" must be >= 5.\n");
+		return -1;
+	}
+
+	tail = NLMSG_TAIL(n);
+	//addattr_l(n, 1024, TCA_OPTIONS, NULL, 0);
+	//addattr_l(n, 1024, TCA_PERFLOW_PARMS, &opt, sizeof(opt));
+	addattr_l(n, 1024, TCA_OPTIONS, &opt, sizeof(opt));
+	tail->rta_len = (void *) NLMSG_TAIL(n) - (void *) tail;
+
+	return 0;
+}
+
+static int perflow_print_opt(struct qdisc_util *qu, FILE *f, struct rtattr *opt)
+{
+	struct tc_perflow_qopt *qopt;
+
+	if (opt == NULL)
+		return 0;
+
+	if (RTA_PAYLOAD(opt) < sizeof(*qopt))
+		return -1;
+
+	qopt = RTA_DATA(opt);
+
+	SPRINT_BUF(b1);
+	fprintf(f, "rate %s ", sprint_rate(qopt->rate.rate, b1));
+	fprintf(f, "ceil %s ", sprint_rate(qopt->ceil.rate, b1));
+	fprintf(f, "limit %s ", sprint_size(qopt->rate.rate, b1));
+
+	return 0;
+}
+
+struct qdisc_util perflow_qdisc_util = {
+	.id		= "perflow",
+	.parse_qopt	= perflow_parse_opt,
+	.print_qopt	= perflow_print_opt,
+};
Index: iproute2-2.6.11-w/tc/Makefile
===================================================================
--- iproute2-2.6.11-w/tc/Makefile	(revision 1)
+++ iproute2-2.6.11-w/tc/Makefile	(working copy)
@@ -7,6 +7,7 @@
 TCMODULES += q_fifo.o
 TCMODULES += q_sfq.o
 TCMODULES += q_red.o
+TCMODULES += q_perflow.o
 TCMODULES += q_prio.o
 TCMODULES += q_tbf.o
 TCMODULES += q_cbq.o

[-- Attachment #3: linux-2.6.11-perflow.diff --]
[-- Type: application/octet-stream, Size: 15639 bytes --]

Index: linux-2.6.11-w/include/linux/pkt_sched.h
===================================================================
--- linux-2.6.11-w/include/linux/pkt_sched.h	(revision 1)
+++ linux-2.6.11-w/include/linux/pkt_sched.h	(working copy)
@@ -272,6 +272,28 @@
 	__u32 ctokens;
 };
 
+
+/* PERFLOW section */
+
+#define TC_PERFLOW_DEFAULTLIMIT	1024
+
+struct tc_perflow_qopt
+{
+	struct tc_ratespec 	rate;
+	struct tc_ratespec 	ceil;
+	__u32  limit;
+	__u32  qlen;
+};
+enum
+{
+	TCA_PERFLOW_UNSPEC,
+	TCA_PERFLOW_PARMS,
+	__TCA_PERFLOW_MAX,
+};
+
+#define TCA_PERFLOW_MAX	(__TCA_PERFLOW_MAX - 1)
+
+
 /* HFSC section */
 
 struct tc_hfsc_qopt
Index: linux-2.6.11-w/net/sched/Kconfig
===================================================================
--- linux-2.6.11-w/net/sched/Kconfig	(revision 1)
+++ linux-2.6.11-w/net/sched/Kconfig	(working copy)
@@ -192,6 +192,15 @@
 	  To compile this code as a module, choose M here: the
 	  module will be called sch_gred.
 
+config NET_SCH_PERFLOW
+	tristate "PERFLOW queue"
+	depends on NET_SCHED
+	---help---
+	  Say Y here if you want to use per flow rate control.
+
+	  To compile this code as a module, choose M here: the
+	  module will be called sch_perflow.
+
 config NET_SCH_DSMARK
 	tristate "Diffserv field marker"
 	depends on NET_SCHED
Index: linux-2.6.11-w/net/sched/Makefile
===================================================================
--- linux-2.6.11-w/net/sched/Makefile	(revision 1)
+++ linux-2.6.11-w/net/sched/Makefile	(working copy)
@@ -19,6 +19,7 @@
 obj-$(CONFIG_NET_SCH_HFSC)	+= sch_hfsc.o
 obj-$(CONFIG_NET_SCH_RED)	+= sch_red.o
 obj-$(CONFIG_NET_SCH_GRED)	+= sch_gred.o
+obj-$(CONFIG_NET_SCH_PERFLOW)	+= sch_perflow.o
 obj-$(CONFIG_NET_SCH_INGRESS)	+= sch_ingress.o 
 obj-$(CONFIG_NET_SCH_DSMARK)	+= sch_dsmark.o
 obj-$(CONFIG_NET_SCH_SFQ)	+= sch_sfq.o
Index: linux-2.6.11-w/net/sched/sch_perflow.c
===================================================================
--- linux-2.6.11-w/net/sched/sch_perflow.c	(revision 0)
+++ linux-2.6.11-w/net/sched/sch_perflow.c	(revision 0)
@@ -0,0 +1,551 @@
+/*
+ * net/sched/sch_perflow.c	Per flow rate control queue.
+ *
+ *	This program is free software; you can redistribute it and/or
+ *	modify it under the terms of the GNU General Public License
+ *	as published by the Free Software Foundation; either version
+ *	2 of the License, or (at your option) any later version.
+ *
+ * Authors:	Wang Jian <lark@linux.net.cn>, 2005
+ *
+ */
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <asm/uaccess.h>
+#include <linux/bitops.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/string.h>
+#include <linux/mm.h>
+#include <linux/socket.h>
+#include <linux/sockios.h>
+#include <linux/in.h>
+#include <linux/errno.h>
+#include <linux/interrupt.h>
+#include <linux/if_ether.h>
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+#include <linux/notifier.h>
+#include <linux/jhash.h>
+#include <linux/timer.h>
+#include <net/ip.h>
+#include <net/route.h>
+#include <linux/skbuff.h>
+#include <net/sock.h>
+#include <net/pkt_sched.h>
+#include <net/inet_ecn.h>
+#include <net/dsfield.h>
+
+/* Per flow rate control algorithm
+
+Description
+-----------
+
+	When a new packet arrives, we lookup in the table to see if it
+	belongs to a flow being traced. If not, we create new entry for it.
+
+	For a packet belongs to a flow being traced, we see if it has
+	available token to send it out.
+
+	The 'rate' and 'ceil' have same meaning of HTB qdisc. The 'limit'
+	parameter defined how many flows we will trace, which defaults to
+	1024.
+
+	Any flow entry without traffic in LIFETIME seconds will be wiped
+	to free slot.
+
+
+Algorithm
+---------
+
+	The algorithm is simple and naive. We have a token bucket, which
+	generate grate/10 (= rate * limit / 10) token in 1/10 seconds.
+
+	A flow can always send when it is under rate. When a flow is over
+	rate but under ceil, every time it borrows, it is put into borrow
+	list, so it may receive penalty.
+
+	When a under-rate flow sends but token is short, penalty is added
+	to one of flows in borrow list, and this punished flow is
+	removed from borrow list.
+
+	When a flow carrying penalty has packet, the penalty is
+	added to used token. This means, this flow can borrow less than
+	last time (if in a new time slot), or can borrow less 
+	(ceil - rate - penalty).
+
+	There are other rules that handle fairness and low rate situation.
+	See code for details.
+
+	NOTE: the implementation of algorithm is not timer driven, but
+	packet driven.
+
+	The ideas behind this algorithm are
+
+	1. We assume that perflow qdisc has rate * limit guaranteed.
+	2. We can't affect past. We can only affect future.
+	3. If a flow's borrowing leads to overlimit, we let this flow
+	   borrow less in future.
+	4. With a round robin punishment style, a flow borrows more times,
+	   it stays in borrow list more times, and so receives more penalty.
+	   (But we should consider more about this, I think the fairness
+	   should be improved by sort borrow list.)
+
+
+Security Consideration
+----------------------
+
+	It's dangerous to create new entry for any new packets not belongs
+	to a flow being traced. For example, syn flood attack on a single
+	port may cause thousands of entries created. The 'limit' parameter
+	is used to set limit on maximum entries we will create.
+
+	But even with 'limit', port scan can fill the slots and make valid
+	new flow has no slot available and not be traced.
+
+	One of solution is to use netfilter MARK for classification 
+	(a.k.a. cls_fw.c). Only established session is given a fw mark, and
+	then, if port scan use a spoofing source address, no fw mark gotten.
+
+
+Application
+-----------
+
+	You should use HTB or other classful qdisc to enclose this qdisc.
+
+ */
+
+
+#define PERFLOW_HSIZE		251
+#define PERFLOW_LIFETIME	(10*HZ)
+
+struct perflow_entry
+{
+	u32	src;
+	u32	dst;
+	u16	sport;
+	u16	dport;
+
+	struct	list_head hlist;
+	struct	list_head borrow; 
+	struct	timer_list timer;
+	struct	perflow_sched_data *q;
+	u32	jiffies;
+	u32	used;
+	u32	penalty;
+
+	u8	protocol;
+};
+
+struct perflow_sched_data
+{
+	/* parameters */
+	u32	rate;			/* guaranted rate */
+	u32	ceil;			/* upper bound */
+	u32	limit;			/* maximum flows we trace */
+	u32	qlen;			/* maximum queue length */
+
+	/* variables */
+	u32	grate;			/* aggregative rate */
+
+	u32	jiffies;
+	u32	token;
+	u32	flow_count;		/* how many flows we are tracing */
+	struct	list_head ht[PERFLOW_HSIZE];	/* hash table */
+	struct	list_head borrow;	/* lists of borrowing flow */
+};
+
+struct commonhdr
+{
+	u16	source;
+	u16	dest;
+};
+
+static __inline__ u32 perflow_hash(struct perflow_entry *tuple)
+{
+	return (jhash_3words(tuple->src ^ tuple->protocol,
+				tuple->dst,
+				(tuple->sport << 16 | tuple->dport),
+				0x5e83ad03) % PERFLOW_HSIZE);
+}
+
+static __inline__ int perflow_is_valid(struct sk_buff *skb)
+{
+	struct iphdr *iph = skb->nh.iph;
+
+	if (skb->protocol != __constant_htons(ETH_P_IP))
+		return 0;
+
+	if (iph->protocol != IPPROTO_TCP && iph->protocol != IPPROTO_TCP
+			&& iph->protocol != IPPROTO_SCTP)
+		return 0;
+
+	return 1;
+}
+
+static __inline__ void perflow_flow_timer(unsigned long arg)
+{
+	struct perflow_entry *e = (struct perflow_entry *) arg;
+
+	e->q->flow_count--;
+	list_del_init(&e->hlist);
+	if (!list_empty(&e->borrow))
+		list_del_init(&e->borrow);
+	printk(KERN_WARNING "delete flow\n");
+	kfree(e);
+}
+
+static __inline__ struct perflow_entry *
+perflow_new_flow(u32 hash, struct perflow_entry *tuple, struct Qdisc *sch)
+{
+	struct perflow_sched_data *q = qdisc_priv(sch);
+	struct perflow_entry *e;
+
+	if (q->flow_count >= q->limit)
+		return NULL;
+
+	e = kmalloc(sizeof(*e), GFP_KERNEL);
+	if (!e)
+		return NULL;
+
+	q->flow_count++;
+
+	memset(e, 0, sizeof(*e));
+
+	e->src = tuple->src;
+	e->dst = tuple->dst;
+	e->sport = tuple->sport;
+	e->dport = tuple->dport;
+	e->protocol = tuple->protocol;
+
+	INIT_LIST_HEAD(&e->hlist);
+	INIT_LIST_HEAD(&e->borrow);
+	init_timer(&e->timer);
+	e->timer.function = perflow_flow_timer;
+	e->timer.data = (unsigned long) e;
+	e->jiffies = jiffies;
+	e->q = q;
+
+	list_add(&e->hlist, q->ht + hash);
+
+	return e;
+}
+
+static __inline__ void
+perflow_fill_tuple(struct perflow_entry *tuple, struct sk_buff *skb)
+{
+	struct iphdr *iph = skb->nh.iph;
+	struct commonhdr *ch;
+
+	memset(tuple, 0, sizeof(struct perflow_entry));
+#if 1
+	/* not a traceable flow, do nothing */
+	if (!perflow_is_valid(skb))
+		return;
+#endif
+
+	ch = (struct commonhdr *)((void *)iph + (iph->ihl << 2));
+	tuple->src = iph->saddr;
+	tuple->dst = iph->daddr;
+	tuple->sport = ch->source;
+	tuple->dport = ch->dest;
+	tuple->protocol = iph->protocol;
+}
+
+static struct perflow_entry *
+perflow_find_flow(u32 *hash, struct perflow_entry *flow, struct Qdisc *sch)
+{
+	struct perflow_sched_data *q = qdisc_priv(sch);
+	struct list_head *h, *head;
+	struct perflow_entry *e;
+       
+	*hash = perflow_hash(flow);
+	head = q->ht + (*hash);
+
+	list_for_each(h, head) {
+		e = list_entry(h, struct perflow_entry, hlist);
+		if (e->src == flow->src
+				&& e->dst == flow->dst
+				&& e->sport == flow->sport
+				&& e->dport == flow->dport
+				&& e->protocol == flow->protocol)
+			del_timer(&e->timer);
+			return e;
+	}
+	return NULL;
+}
+
+static void perflow_punish(int penalty, struct list_head *borrow)
+{
+	struct perflow_entry *e;
+	struct list_head *h;
+
+	if (!list_empty(borrow)) {
+		h = borrow->next;
+		list_del_init(h);
+		e = list_entry(h, struct perflow_entry, hlist);
+		e->penalty += penalty;
+	}
+}
+
+static __inline__ void
+perflow_adjust_used(struct perflow_entry *flow, u32 rate, u32 ceil)
+{
+	int cycles;
+
+	/* still in this time slot */
+	if ((jiffies - flow->jiffies) <= HZ) {
+		if ((ceil - flow->used) > flow->penalty) {
+			flow->used += flow->penalty;
+			flow->penalty = 0;
+		} else {
+			flow->used = ceil;
+			flow->penalty -= ceil - flow->used;
+		}
+		return;
+	}
+
+	/* the packet arrives after cycles slots */
+	cycles = (jiffies - flow->jiffies) / HZ;
+	flow->jiffies += cycles * HZ;
+
+	if ((cycles * ceil - flow->used) > flow->penalty) {
+		flow->used = 0;
+		flow->penalty = 0;
+	} else {
+		flow->used = flow->penalty + flow->used - cycles * ceil;
+		if (flow->used > ceil) {
+			flow->used = ceil;
+			flow->penalty -= ceil;
+		}
+	}
+}
+
+static int perflow_enqueue(struct sk_buff *skb, struct Qdisc *sch)
+{
+	struct perflow_sched_data *q = qdisc_priv(sch);
+	struct perflow_entry tuple, *flow;
+	u32 hash;
+
+	/* not a traceable flow */
+	if (!perflow_is_valid(skb))
+		goto drop;
+
+	/* if max qlen is set and current queue is full, drop */
+	if (q->qlen && sch->q.qlen >= q->qlen)
+		goto drop;
+
+	perflow_fill_tuple(&tuple, skb);
+	flow = perflow_find_flow(&hash, &tuple, sch);
+
+	if (flow == NULL) {
+		flow = perflow_new_flow(hash, &tuple, sch);
+		if (flow == NULL)
+			goto drop;
+	}
+
+	/* renew timer for flow */
+	flow->timer.expires = jiffies + PERFLOW_LIFETIME;
+	add_timer(&flow->timer);
+
+	/* renew used for this flow */
+	perflow_adjust_used(flow, q->rate, q->ceil);
+
+	if ((jiffies - q->jiffies) > HZ/10) {
+		q->token = q->grate / 10;
+		q->jiffies = jiffies;
+	}
+
+	/* we always satisfy flow under rate */
+	if (flow->used < q->rate) {
+		flow->used += skb->len;
+		if (flow->used > q->rate) {
+			/* if we borrow, we may receive penalty */
+			if (list_empty(&flow->borrow))
+				list_add_tail(&flow->borrow, &q->borrow);
+		}
+		if (flow->used > q->ceil)
+			flow->penalty += flow->used - q->ceil;
+		if (q->token < skb->len) {
+			q->token = 0;
+			perflow_punish(skb->len - q->token, &q->borrow);
+		} else {
+			q->token -= skb->len;
+		}
+	} else {
+		if (q->token < skb->len || flow->used >= q->ceil)
+			goto drop;
+
+		flow->used += skb->len;
+		q->token -= skb->len;
+
+		if (list_empty(&flow->borrow))
+			list_add_tail(&flow->borrow, &q->borrow);
+		if (flow->used > q->ceil)
+			flow->penalty += flow->used - q->ceil;
+	}
+
+enqueue:
+	sch->qstats.backlog += skb->len;
+	sch->bstats.bytes += skb->len;
+	sch->bstats.packets++;
+	__skb_queue_tail(&sch->q, skb);
+	return NET_XMIT_SUCCESS;
+
+drop:
+	printk(KERN_WARNING "perflow drop\n");
+	sch->qstats.overlimits++;
+	kfree_skb(skb);
+	sch->qstats.drops++;
+	return NET_XMIT_CN;
+}
+
+static int perflow_requeue(struct sk_buff *skb, struct Qdisc *sch)
+{
+	__skb_queue_head(&sch->q, skb);
+	sch->qstats.backlog += skb->len;
+	sch->qstats.requeues++;
+	return 0;
+}
+
+static struct sk_buff *perflow_dequeue(struct Qdisc *sch)
+{
+	struct sk_buff *skb;
+
+	skb = __skb_dequeue(&sch->q);
+	if (skb) {
+		sch->qstats.backlog -= skb->len;
+	}
+	return skb;
+}
+
+static unsigned int perflow_drop(struct Qdisc *sch)
+{
+	struct sk_buff *skb;
+
+	skb = __skb_dequeue_tail(&sch->q);
+	if (skb) {
+		unsigned int len = skb->len;
+		sch->qstats.backlog -= len;
+		sch->qstats.drops++;
+		kfree_skb(skb);
+		return len;
+	}
+	return 0;
+}
+
+static void perflow_reset(struct Qdisc *sch)
+{
+	skb_queue_purge(&sch->q);
+	sch->qstats.backlog = 0;
+}
+
+static int perflow_change(struct Qdisc *sch, struct rtattr *opt)
+{
+	struct perflow_sched_data *q = qdisc_priv(sch);
+	struct tc_perflow_qopt *ctl;
+
+	/* if limit and qlen are lowered, and we are in a over limit
+	 * state, we still don't drop flow or drop packet.
+	 * sch->q will decrease to allowed length
+	 * q->flow_count will decrease because no new flow is traced
+	 */
+	if (opt == NULL) {
+		return -EINVAL;
+	} else {
+		ctl = RTA_DATA(opt);
+		if (opt->rta_len < RTA_LENGTH(sizeof(*ctl)))
+			return -EINVAL;
+
+		q->rate = ctl->rate.rate;
+		q->ceil = ctl->ceil.rate;
+		q->limit = ctl->limit;
+		q->qlen = ctl->qlen;
+
+		if (q->limit == 0)
+			q->limit = 1024;
+		printk(KERN_WARNING "(rate,ceil,limit,qlen)=(%u,%u,%u,%u)\n",
+				q->rate, q->ceil, q->limit, q->qlen);
+		/* aggregative rate is (rate * 1.05 * limit) */
+		q->grate = (q->rate + q->rate/20) * q->limit;
+	}
+
+	return 0;
+}
+
+static int perflow_init(struct Qdisc *sch, struct rtattr *opt)
+{
+	struct perflow_sched_data *q = qdisc_priv(sch);
+	int i, ret;
+
+	ret = perflow_change(sch, opt);
+	if (ret)
+		return ret;
+
+	INIT_LIST_HEAD(&q->borrow);
+
+	for (i = 0; i < PERFLOW_HSIZE; i++)
+		INIT_LIST_HEAD(q->ht + i);
+
+	return 0;
+}
+
+static void perflow_destroy(struct Qdisc *sch)
+{
+	struct perflow_sched_data *q = qdisc_priv(sch);
+	struct list_head *h, *n;
+	struct perflow_entry *e;
+	int i;
+
+	for (i = 0; i < PERFLOW_HSIZE; i++) {
+		list_for_each_safe (h, n, q->ht + i) {
+			e = list_entry(h, struct perflow_entry, hlist);
+			del_timer(&e->timer);
+			if (!list_empty(&e->borrow))
+				list_del_init(&e->borrow);
+			list_del(h);
+			kfree(e);
+		}
+	}
+}
+
+static int perflow_dump(struct Qdisc *sch, struct sk_buff *skb)
+{
+	return -1;
+}
+
+static struct Qdisc_ops perflow_qdisc_ops = {
+	.next		= NULL,
+	.cl_ops		= NULL,
+	.id		= "perflow",
+	.priv_size	= sizeof(struct perflow_sched_data),
+	.enqueue	= perflow_enqueue,
+	.dequeue	= perflow_dequeue,
+	.requeue	= perflow_requeue,
+	.drop		= perflow_drop,
+	.init		= perflow_init,
+	.reset		= perflow_reset,
+	.destroy	= perflow_destroy,
+	.change		= perflow_change,
+	.dump		= perflow_dump,
+	.owner		= THIS_MODULE,
+};
+
+static int __init perflow_module_init(void)
+{
+	return register_qdisc(&perflow_qdisc_ops);
+}
+
+static void __exit perflow_module_exit(void)
+{
+	unregister_qdisc(&perflow_qdisc_ops);
+}
+
+module_init(perflow_module_init);
+module_exit(perflow_module_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Wang Jian <lark@linux.net.cn>");

[-- Attachment #4: t.sh --]
[-- Type: application/x-sh, Size: 489 bytes --]

[-- Attachment #5: Type: text/plain, Size: 143 bytes --]

_______________________________________________
LARTC mailing list
LARTC@mailman.ds9a.nl
http://mailman.ds9a.nl/cgi-bin/mailman/listinfo/lartc