Re: [LARTC] new perflow rate control queue

From: Wang Jian <lark@linux.net.cn>
To: lartc@vger.kernel.org
Subject: Re: [LARTC] new perflow rate control queue
Date: Mon, 04 Apr 2005 15:57:16 +0000	[thread overview]
Message-ID: <20050404234744.023B.LARK@linux.net.cn> (raw)
In-Reply-To: <20050404152117.6d90635d.lark@linux.net.cn>

[-- Attachment #1: Type: text/plain, Size: 898 bytes --]

Hi Andy Furniss,


On Mon, 04 Apr 2005 16:23:30 +0100, Andy Furniss <andy.furniss@dsl.pipex.com> wrote:

> 
> > 
> > Because this per-flow queue is new, you can add things useful to it.
> 
> It does look good :-) I'll test when I get time.
> 

The attached is the latest. The last one doesn't sync time: queue has a
variable time slot length; every flow has it own ticks.

This new patch against 2.6.11 sync queue and flows' time. Every new flow
has it jiffies set to q->jiffies and use that as start. As q->jiffies
and flow->jiffies increament in HZ step, time is synced. This will
improved accuracy.

But HZ is too long for token calculation. Sometimes, one of flow borrows
too much and get no enough penalty, so another flow hurts. But anyway,
per flow queue provides better fairness in my test, either in
short time period or long time period.

Looking forward to your feedback :)


-- 
  lark

[-- Attachment #2: linux-2.6.11-perflow-r3.diff --]
[-- Type: application/octet-stream, Size: 15640 bytes --]

Index: linux-2.6.11-w/include/linux/pkt_sched.h
===================================================================

--- linux-2.6.11-w/include/linux/pkt_sched.h	(revision 2)
+++ linux-2.6.11-w/include/linux/pkt_sched.h	(revision 3)
@@ -272,6 +272,28 @@
 	__u32 ctokens;
 };
 
+
+/* PERFLOW section */
+
+#define TC_PERFLOW_DEFAULTLIMIT	1024
+
+struct tc_perflow_qopt
+{
+	struct tc_ratespec 	rate;
+	struct tc_ratespec 	ceil;
+	__u32  limit;
+	__u32  qlen;
+};
+enum
+{
+	TCA_PERFLOW_UNSPEC,
+	TCA_PERFLOW_PARMS,
+	__TCA_PERFLOW_MAX,
+};
+
+#define TCA_PERFLOW_MAX	(__TCA_PERFLOW_MAX - 1)
+
+
 /* HFSC section */
 
 struct tc_hfsc_qopt
Index: linux-2.6.11-w/net/sched/Kconfig
===================================================================
--- linux-2.6.11-w/net/sched/Kconfig	(revision 2)
+++ linux-2.6.11-w/net/sched/Kconfig	(revision 3)
@@ -192,6 +192,15 @@
 	  To compile this code as a module, choose M here: the
 	  module will be called sch_gred.
 
+config NET_SCH_PERFLOW
+	tristate "PERFLOW queue"
+	depends on NET_SCHED
+	---help---
+	  Say Y here if you want to use per flow rate control.
+
+	  To compile this code as a module, choose M here: the
+	  module will be called sch_perflow.
+
 config NET_SCH_DSMARK
 	tristate "Diffserv field marker"
 	depends on NET_SCHED
Index: linux-2.6.11-w/net/sched/Makefile
===================================================================
--- linux-2.6.11-w/net/sched/Makefile	(revision 2)
+++ linux-2.6.11-w/net/sched/Makefile	(revision 3)
@@ -19,6 +19,7 @@
 obj-$(CONFIG_NET_SCH_HFSC)	+= sch_hfsc.o
 obj-$(CONFIG_NET_SCH_RED)	+= sch_red.o
 obj-$(CONFIG_NET_SCH_GRED)	+= sch_gred.o
+obj-$(CONFIG_NET_SCH_PERFLOW)	+= sch_perflow.o
 obj-$(CONFIG_NET_SCH_INGRESS)	+= sch_ingress.o 
 obj-$(CONFIG_NET_SCH_DSMARK)	+= sch_dsmark.o
 obj-$(CONFIG_NET_SCH_SFQ)	+= sch_sfq.o
Index: linux-2.6.11-w/net/sched/sch_perflow.c
===================================================================
--- linux-2.6.11-w/net/sched/sch_perflow.c	(revision 0)
+++ linux-2.6.11-w/net/sched/sch_perflow.c	(revision 3)
@@ -0,0 +1,555 @@
+/*
+ * net/sched/sch_perflow.c	Per flow rate control queue.
+ *
+ *	This program is free software; you can redistribute it and/or
+ *	modify it under the terms of the GNU General Public License
+ *	as published by the Free Software Foundation; either version
+ *	2 of the License, or (at your option) any later version.
+ *
+ * Authors:	Wang Jian <lark@linux.net.cn>, 2005
+ *
+ */
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <asm/uaccess.h>
+#include <linux/bitops.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/string.h>
+#include <linux/mm.h>
+#include <linux/socket.h>
+#include <linux/sockios.h>
+#include <linux/in.h>
+#include <linux/errno.h>
+#include <linux/interrupt.h>
+#include <linux/if_ether.h>
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+#include <linux/notifier.h>
+#include <linux/jhash.h>
+#include <linux/timer.h>
+#include <net/ip.h>
+#include <net/route.h>
+#include <linux/skbuff.h>
+#include <net/sock.h>
+#include <net/pkt_sched.h>
+#include <net/inet_ecn.h>
+#include <net/dsfield.h>
+
+/* Per flow rate control algorithm
+
+Description
+-----------
+
+	When a new packet arrives, we lookup in the table to see if it
+	belongs to a flow being traced. If not, we create new entry for it.
+
+	For a packet belongs to a flow being traced, we see if it has
+	available token to send it out.
+
+	The 'rate' and 'ceil' have same meaning of HTB qdisc. The 'limit'
+	parameter defined how many flows we will trace, which defaults to
+	1024.
+
+	Any flow entry without traffic in LIFETIME seconds will be wiped
+	to free slot.
+
+
+Algorithm
+---------
+
+	The algorithm is simple and naive. We have a token bucket, which
+	generate grate/10 (= rate * limit / 10) token in 1/10 seconds.
+
+	A flow can always send when it is under rate. When a flow is over
+	rate but under ceil, every time it borrows, it is put into borrow
+	list, so it may receive penalty.
+
+	When a under-rate flow sends but token is short, penalty is added
+	to one of flows in borrow list, and this punished flow is
+	removed from borrow list.
+
+	When a flow carrying penalty has packet, the penalty is
+	added to used token. This means, this flow can borrow less than
+	last time (if in a new time slot), or can borrow less 
+	(ceil - rate - penalty).
+
+	There are other rules that handle fairness and low rate situation.
+	See code for details.
+
+	NOTE: the implementation of algorithm is not timer driven, but
+	packet driven.
+
+	The ideas behind this algorithm are
+
+	1. We assume that perflow qdisc has rate * limit guaranteed.
+	2. We can't affect past. We can only affect future.
+	3. If a flow's borrowing leads to overlimit, we let this flow
+	   borrow less in future.
+	4. With a round robin punishment style, a flow borrows more times,
+	   it stays in borrow list more times, and so receives more penalty.
+	   (But we should consider more about this, I think the fairness
+	   should be improved by sort borrow list.)
+
+
+Security Consideration
+----------------------
+
+	It's dangerous to create new entry for any new packets not belongs
+	to a flow being traced. For example, syn flood attack on a single
+	port may cause thousands of entries created. The 'limit' parameter
+	is used to set limit on maximum entries we will create.
+
+	But even with 'limit', port scan can fill the slots and make valid
+	new flow has no slot available and not be traced.
+
+	One of solution is to use netfilter MARK for classification 
+	(a.k.a. cls_fw.c). Only established session is given a fw mark, and
+	then, if port scan use a spoofing source address, no fw mark gotten.
+
+
+Application
+-----------
+
+	You should use HTB or other classful qdisc to enclose this qdisc.
+
+ */
+
+
+#define PERFLOW_HSIZE		251
+#define PERFLOW_LIFETIME	(10*HZ)
+
+struct perflow_entry
+{
+	u32	src;
+	u32	dst;
+	u16	sport;
+	u16	dport;
+
+	struct	list_head hlist;
+	struct	list_head borrow; 
+	struct	timer_list timer;
+	struct	perflow_sched_data *q;
+	u32	jiffies;
+	u32	used;
+	u32	penalty;
+
+	u8	protocol;
+};
+
+struct perflow_sched_data
+{
+	/* parameters */
+	u32	rate;			/* guaranted rate */
+	u32	ceil;			/* upper bound */
+	u32	limit;			/* maximum flows we trace */
+	u32	qlen;			/* maximum queue length */
+
+	/* variables */
+	u32	grate;			/* aggregative rate */
+
+	u32	jiffies;
+	u32	token;
+	u32	flow_count;		/* how many flows we are tracing */
+	struct	list_head ht[PERFLOW_HSIZE];	/* hash table */
+	struct	list_head borrow;	/* lists of borrowing flow */
+};
+
+struct commonhdr
+{
+	u16	source;
+	u16	dest;
+};
+
+static __inline__ u32 perflow_hash(struct perflow_entry *tuple)
+{
+	return (jhash_3words(tuple->src ^ tuple->protocol,
+				tuple->dst,
+				(tuple->sport << 16 | tuple->dport),
+				0x5E83AD03) % PERFLOW_HSIZE);
+}
+
+static __inline__ int perflow_is_valid(struct sk_buff *skb)
+{
+	struct iphdr *iph = skb->nh.iph;
+
+	if (skb->protocol != __constant_htons(ETH_P_IP))
+		return 0;
+
+	if (iph->protocol != IPPROTO_TCP && iph->protocol != IPPROTO_TCP
+			&& iph->protocol != IPPROTO_SCTP)
+		return 0;
+
+	return 1;
+}
+
+static __inline__ void perflow_flow_timer(unsigned long arg)
+{
+	struct perflow_entry *e = (struct perflow_entry *) arg;
+
+	e->q->flow_count--;
+	list_del_init(&e->hlist);
+	if (!list_empty(&e->borrow))
+		list_del_init(&e->borrow);
+	kfree(e);
+}
+
+static __inline__ struct perflow_entry *
+perflow_new_flow(u32 hash, struct perflow_entry *tuple, struct Qdisc *sch)
+{
+	struct perflow_sched_data *q = qdisc_priv(sch);
+	struct perflow_entry *e;
+
+	if (q->flow_count >= q->limit)
+		return NULL;
+
+	e = kmalloc(sizeof(*e), GFP_KERNEL);
+	if (!e)
+		return NULL;
+
+	q->flow_count++;
+
+	memset(e, 0, sizeof(*e));
+
+	e->src = tuple->src;
+	e->dst = tuple->dst;
+	e->sport = tuple->sport;
+	e->dport = tuple->dport;
+	e->protocol = tuple->protocol;
+
+	INIT_LIST_HEAD(&e->hlist);
+	INIT_LIST_HEAD(&e->borrow);
+	init_timer(&e->timer);
+	e->timer.function = perflow_flow_timer;
+	e->timer.data = (unsigned long) e;
+	/* sync flow's tick to queue's tick */
+	e->jiffies = q->jiffies;
+	e->q = q;
+
+	list_add(&e->hlist, q->ht + hash);
+
+	return e;
+}
+
+static __inline__ void
+perflow_fill_tuple(struct perflow_entry *tuple, struct sk_buff *skb)
+{
+	struct iphdr *iph = skb->nh.iph;
+	struct commonhdr *ch;
+
+	memset(tuple, 0, sizeof(struct perflow_entry));
+#if 1
+	/* not a traceable flow, do nothing */
+	if (!perflow_is_valid(skb))
+		return;
+#endif
+
+	ch = (struct commonhdr *)((void *)iph + (iph->ihl << 2));
+	tuple->src = iph->saddr;
+	tuple->dst = iph->daddr;
+	tuple->sport = ch->source;
+	tuple->dport = ch->dest;
+	tuple->protocol = iph->protocol;
+}
+
+static struct perflow_entry *
+perflow_find_flow(u32 *hash, struct perflow_entry *flow, struct Qdisc *sch)
+{
+	struct perflow_sched_data *q = qdisc_priv(sch);
+	struct list_head *h, *head;
+	struct perflow_entry *e;
+       
+	*hash = perflow_hash(flow);
+	head = q->ht + (*hash);
+
+	list_for_each(h, head) {
+		e = list_entry(h, struct perflow_entry, hlist);
+		if (e->src == flow->src
+				&& e->dst == flow->dst
+				&& e->sport == flow->sport
+				&& e->dport == flow->dport
+				&& e->protocol == flow->protocol)
+			del_timer(&e->timer);
+			return e;
+	}
+	return NULL;
+}
+
+static void perflow_punish(int penalty, struct list_head *borrow)
+{
+	struct perflow_entry *e;
+	struct list_head *h;
+
+	if (!list_empty(borrow)) {
+		h = borrow->next;
+		list_del_init(h);
+		e = list_entry(h, struct perflow_entry, hlist);
+		e->penalty += penalty;
+	}
+}
+
+static __inline__ void
+perflow_adjust_used(struct perflow_entry *flow, u32 rate, u32 ceil)
+{
+	int cycles;
+
+	/* still in this time slot */
+	if ((jiffies - flow->jiffies) <= HZ) {
+		if ((ceil - flow->used) > flow->penalty) {
+			flow->used += flow->penalty;
+			flow->penalty = 0;
+		} else {
+			flow->used = ceil;
+			flow->penalty -= ceil - flow->used;
+		}
+		return;
+	}
+
+	/* the packet arrives after cycles slots */
+	cycles = (jiffies - flow->jiffies) / HZ;
+	flow->jiffies += cycles * HZ;
+
+	if ((cycles * ceil - flow->used) > flow->penalty) {
+		flow->used = 0;
+		flow->penalty = 0;
+	} else {
+		flow->used = flow->penalty + flow->used - cycles * ceil;
+		if (flow->used > ceil) {
+			flow->used = ceil;
+			flow->penalty -= ceil;
+		}
+	}
+}
+
+static int perflow_enqueue(struct sk_buff *skb, struct Qdisc *sch)
+{
+	struct perflow_sched_data *q = qdisc_priv(sch);
+	struct perflow_entry tuple, *flow;
+	u32 hash;
+
+	/* not a traceable flow */
+	if (!perflow_is_valid(skb))
+		goto drop;
+
+	/* if max qlen is set and current queue is full, drop */
+	if (q->qlen && sch->q.qlen >= q->qlen) {
+		printk(KERN_WARNING "qlen overlimit drop\n");
+		goto drop;
+	}
+
+	perflow_fill_tuple(&tuple, skb);
+	flow = perflow_find_flow(&hash, &tuple, sch);
+
+	if (flow == NULL) {
+		flow = perflow_new_flow(hash, &tuple, sch);
+		if (flow == NULL)
+			goto drop;
+	}
+
+	/* renew timer for flow */
+	flow->timer.expires = jiffies + PERFLOW_LIFETIME;
+	add_timer(&flow->timer);
+
+	if ((jiffies - q->jiffies) > HZ) {
+		q->token = q->grate;
+		q->jiffies += (jiffies - q->jiffies) / HZ * HZ;
+	}
+
+	/* renew used for this flow */
+	perflow_adjust_used(flow, q->rate, q->ceil);
+
+	/* we always satisfy flow under rate */
+	if (flow->used < q->rate) {
+		flow->used += skb->len;
+		if (flow->used > q->rate) {
+			/* if we borrow, we may receive penalty */
+			if (list_empty(&flow->borrow))
+				list_add_tail(&flow->borrow, &q->borrow);
+		}
+		if (flow->used > q->ceil)
+			flow->penalty += flow->used - q->ceil;
+		if (q->token < skb->len) {
+			q->token = 0;
+			perflow_punish(skb->len - q->token, &q->borrow);
+		} else {
+			q->token -= skb->len;
+		}
+	} else {
+		if (q->token < skb->len || flow->used >= q->ceil)
+			goto drop;
+
+		flow->used += skb->len;
+		q->token -= skb->len;
+
+		if (list_empty(&flow->borrow))
+			list_add_tail(&flow->borrow, &q->borrow);
+		else
+			list_move(&flow->borrow, &q->borrow);
+
+		if (flow->used > q->ceil)
+			flow->penalty += flow->used - q->ceil;
+	}
+
+enqueue:
+	sch->qstats.backlog += skb->len;
+	sch->bstats.bytes += skb->len;
+	sch->bstats.packets++;
+	__skb_queue_tail(&sch->q, skb);
+	return NET_XMIT_SUCCESS;
+
+drop:
+	sch->qstats.overlimits++;
+	kfree_skb(skb);
+	sch->qstats.drops++;
+	return NET_XMIT_CN;
+}
+
+static int perflow_requeue(struct sk_buff *skb, struct Qdisc *sch)
+{
+	__skb_queue_head(&sch->q, skb);
+	sch->qstats.backlog += skb->len;
+	sch->qstats.requeues++;
+	return 0;
+}
+
+static struct sk_buff *perflow_dequeue(struct Qdisc *sch)
+{
+	struct sk_buff *skb;
+
+	skb = __skb_dequeue(&sch->q);
+	if (skb) {
+		sch->qstats.backlog -= skb->len;
+	}
+	return skb;
+}
+
+static unsigned int perflow_drop(struct Qdisc *sch)
+{
+	struct sk_buff *skb;
+
+	skb = __skb_dequeue_tail(&sch->q);
+	if (skb) {
+		unsigned int len = skb->len;
+		sch->qstats.backlog -= len;
+		sch->qstats.drops++;
+		kfree_skb(skb);
+		return len;
+	}
+	return 0;
+}
+
+static void perflow_reset(struct Qdisc *sch)
+{
+	skb_queue_purge(&sch->q);
+	sch->qstats.backlog = 0;
+}
+
+static int perflow_change(struct Qdisc *sch, struct rtattr *opt)
+{
+	struct perflow_sched_data *q = qdisc_priv(sch);
+	struct tc_perflow_qopt *ctl;
+
+	/* if limit and qlen are lowered, and we are in a over limit
+	 * state, we still don't drop flow or drop packet.
+	 * sch->q will decrease to allowed length
+	 * q->flow_count will decrease because no new flow is traced
+	 */
+	if (opt == NULL) {
+		return -EINVAL;
+	} else {
+		ctl = RTA_DATA(opt);
+		if (opt->rta_len < RTA_LENGTH(sizeof(*ctl)))
+			return -EINVAL;
+
+		q->rate = ctl->rate.rate;
+		q->ceil = ctl->ceil.rate;
+		q->limit = ctl->limit;
+		q->qlen = ctl->qlen;
+
+		if (q->limit == 0)
+			q->limit = 1024;
+		/* aggregative rate is (rate * 1.05 * limit) */
+		q->grate = (q->rate + q->rate/20) * q->limit;
+
+		q->jiffies = jiffies;
+	}
+
+	return 0;
+}
+
+static int perflow_init(struct Qdisc *sch, struct rtattr *opt)
+{
+	struct perflow_sched_data *q = qdisc_priv(sch);
+	int i, ret;
+
+	ret = perflow_change(sch, opt);
+	if (ret)
+		return ret;
+
+	INIT_LIST_HEAD(&q->borrow);
+
+	for (i = 0; i < PERFLOW_HSIZE; i++)
+		INIT_LIST_HEAD(q->ht + i);
+
+	return 0;
+}
+
+static void perflow_destroy(struct Qdisc *sch)
+{
+	struct perflow_sched_data *q = qdisc_priv(sch);
+	struct list_head *h, *n;
+	struct perflow_entry *e;
+	int i;
+
+	for (i = 0; i < PERFLOW_HSIZE; i++) {
+		list_for_each_safe (h, n, q->ht + i) {
+			e = list_entry(h, struct perflow_entry, hlist);
+			del_timer(&e->timer);
+			if (!list_empty(&e->borrow))
+				list_del_init(&e->borrow);
+			list_del(h);
+			kfree(e);
+		}
+	}
+}
+
+static int perflow_dump(struct Qdisc *sch, struct sk_buff *skb)
+{
+	return -1;
+}
+
+static struct Qdisc_ops perflow_qdisc_ops = {
+	.next		= NULL,
+	.cl_ops		= NULL,
+	.id		= "perflow",
+	.priv_size	= sizeof(struct perflow_sched_data),
+	.enqueue	= perflow_enqueue,
+	.dequeue	= perflow_dequeue,
+	.requeue	= perflow_requeue,
+	.drop		= perflow_drop,
+	.init		= perflow_init,
+	.reset		= perflow_reset,
+	.destroy	= perflow_destroy,
+	.change		= perflow_change,
+	.dump		= perflow_dump,
+	.owner		= THIS_MODULE,
+};
+
+static int __init perflow_module_init(void)
+{
+	return register_qdisc(&perflow_qdisc_ops);
+}
+
+static void __exit perflow_module_exit(void)
+{
+	unregister_qdisc(&perflow_qdisc_ops);
+}
+
+module_init(perflow_module_init);
+module_exit(perflow_module_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Wang Jian <lark@linux.net.cn>");

[-- Attachment #3: Type: text/plain, Size: 143 bytes --]

_______________________________________________
LARTC mailing list
LARTC@mailman.ds9a.nl
http://mailman.ds9a.nl/cgi-bin/mailman/listinfo/lartc