[RFC] QoS: new per flow queue

netdev.vger.kernel.org archive mirror
 help / color / mirror / Atom feed

* [RFC] QoS: new per flow queue
@ 2005-04-05 15:25 Wang Jian
  2005-04-05 17:57 ` jamal
  0 siblings, 1 reply; 17+ messages in thread
From: Wang Jian @ 2005-04-05 15:25 UTC (permalink / raw)
  To: netdev

[-- Attachment #1: Type: text/plain, Size: 1428 bytes --]

Hi,

I write a per flow rate control qdisc. I posted it to LARTC list. Some
discussion about it is here

    http://mailman.ds9a.nl/pipermail/lartc/2005q2/015381.html

I think I need more feedback and suggestion on it so I repost the patch
here. Please read the thread and get a picture about why and how.

The kernel patch is agains kernel 2.6.11, the iproute2 patch is against
iproute2-2.6.11-050314. 

The test scenario is like this

      www server <- [ eth0   eth1 ] -> www clients

The attached t.sh is used to generate test rules. Clients download a
big ISO file from www server, so flows' rate can be estimated by view
progress.

I have some test on it and it works well. It provides good fairness.
When all slot being used, in most time, the real rate can keep at
specified guaranteed rate. But I know it should receive more test.

I have some consideration though

1. In the test sometimes there a pair of unbalanced stream and don't get
balanced quickly. One stream get 8.4kbps and another get 11.5kbps. How
to find the flow with highest traffic and punish it most?

2. The default ceil equals to rate. Should I calculate it as
   ceil = rate * 1.05 * limit, or
   ceil = rate * 1.05?

3. when flow slots are full, optionally reclassify untraceable traffic
   into another specified class, instead of dropping it?

TODO:

1. rtnetlink related code should be improved;
2. dump() and dump_stat();


Regards
-- 
  lark

[-- Attachment #2: t.sh --]
[-- Type: application/octet-stream, Size: 489 bytes --]

./tc qdisc del dev eth1 root
./tc qdisc add dev eth1 root handle 1: htb default 10
./tc class add dev eth1 parent 1: classid 1:1 htb rate 100Mbit ceil 100Mbit

./tc class add dev eth1 parent 1:1 classid 1:10 htb rate 80kbps ceil 80kbps

./tc class add dev eth1 parent 1:1 classid 1:12 htb rate 80kbps ceil 80kbps
./tc qdisc add dev eth1 parent 1:12 handle 20: perflow rate 10kbps ceil 15kbps limit 7
./tc filter add dev eth1 parent 1:0 protocol ip u32 match ip sport 80 0xffff flowid 1:12

[-- Attachment #3: iproute2-2.6.11-050314-perflow.diff --]
[-- Type: application/octet-stream, Size: 4842 bytes --]

Index: iproute2-2.6.11-w/include/linux/pkt_sched.h
===================================================================
--- iproute2-2.6.11-w/include/linux/pkt_sched.h	(revision 1)
+++ iproute2-2.6.11-w/include/linux/pkt_sched.h	(working copy)
@@ -272,6 +272,28 @@
 	__u32 ctokens;
 };
 
+
+/* PERFLOW section */
+
+#define TC_PERFLOW_DEFAULTLIMIT	1024
+
+struct tc_perflow_qopt
+{
+	struct tc_ratespec 	rate;
+	struct tc_ratespec 	ceil;
+	__u32  limit;
+	__u32  qlen;
+};
+enum
+{
+	TCA_PERFLOW_UNSPEC,
+	TCA_PERFLOW_PARMS,
+	__TCA_PERFLOW_MAX,
+};
+
+#define TCA_PERFLOW_MAX	(__TCA_PERFLOW_MAX - 1)
+
+
 /* HFSC section */
 
 struct tc_hfsc_qopt
Index: iproute2-2.6.11-w/tc/q_perflow.c
===================================================================
--- iproute2-2.6.11-w/tc/q_perflow.c	(revision 0)
+++ iproute2-2.6.11-w/tc/q_perflow.c	(revision 0)
@@ -0,0 +1,153 @@
+/*
+ * q_perflow.c		PERFLOW.
+ *
+ *		This program is free software; you can redistribute it and/or
+ *		modify it under the terms of the GNU General Public License
+ *		as published by the Free Software Foundation; either version
+ *		2 of the License, or (at your option) any later version.
+ *
+ * Authors:	Wang Jian <lark@linux.net.cn>, 2005
+ *
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <syslog.h>
+#include <fcntl.h>
+#include <sys/socket.h>
+#include <netinet/in.h>
+#include <arpa/inet.h>
+#include <string.h>
+
+#include "utils.h"
+#include "tc_util.h"
+
+static void explain(void)
+{
+	fprintf(stderr,
+"Usage: ... perflow rate RATE [ ceil CEIL ] [ limit LIMIT ] [ qlen QLEN ]\n"
+"\n"
+"rate    rate allocated to each flow (flow can still borrow)\n"
+"ceil    upper limit for each flow (flow can not borrow)\n"
+"limit   maximum concurrent flows\n"
+"qlen    maximum queue length\n"
+	);
+}
+
+#define usage() return(-1)
+
+static int perflow_parse_opt(struct qdisc_util *qu, int argc, char **argv, struct nlmsghdr *n)
+{
+	struct tc_perflow_qopt opt;
+	struct rtattr *tail;
+	int ok = 0;
+
+	memset(&opt, 0, sizeof(opt));
+
+	while (argc > 0) {
+		if (strcmp(*argv, "rate") == 0) {
+			NEXT_ARG();
+			if (opt.rate.rate) {
+				fprintf(stderr, "Double \"ceil\" spec\n");
+				return -1;
+			}
+			if (get_rate(&opt.rate.rate, *argv)) {
+				fprintf(stderr, "Illegal \"rate\"\n");
+				return -1;
+			}
+			ok++;
+		} else if (strcmp(*argv, "ceil") == 0) {
+			NEXT_ARG();
+			if (opt.ceil.rate) {
+				fprintf(stderr, "Double \"ceil\" spec\n");
+				return -1;
+			}
+			if (get_rate(&opt.ceil.rate, *argv)) {
+				fprintf(stderr, "Illegal \"ceil\"\n");
+				return -1;
+			}
+			ok++;
+		} else if (strcmp(*argv, "limit") == 0) {
+			NEXT_ARG();
+			if (get_size(&opt.limit, *argv)) {
+				fprintf(stderr, "Illegal \"limit\"\n");
+				return -1;
+			}
+			ok++;
+		} else if (strcmp(*argv, "qlen") == 0) {
+			NEXT_ARG();
+			if (get_size(&opt.qlen, *argv)) {
+				fprintf(stderr, "Illegal \"limit\"\n");
+				return -1;
+			}
+			ok++;
+		} else if (strcmp(*argv, "help") == 0) {
+			explain();
+			return -1;
+		} else {
+			fprintf(stderr, "What is \"%s\"?\n", *argv);
+			explain();
+			return -1;
+		}
+		argc--; argv++;
+	}
+
+	if (!ok)
+		return 0;
+
+	if (opt.rate.rate == 0) {
+		fprintf(stderr, "\"rate\" is required.\n");
+		return -1;
+	}
+
+	if (opt.ceil.rate == 0)
+		opt.ceil = opt.rate;
+
+	if (opt.ceil.rate < opt.rate.rate) {
+		fprintf(stderr, "\"ceil\" must be >= \"rate\".\n");
+		return -1;
+	}
+
+	if (opt.limit == 0)
+		opt.limit = TC_PERFLOW_DEFAULTLIMIT;
+
+	if (opt.qlen > 0 && opt.qlen < 5) {
+		fprintf(stderr, "\"qlen\" must be >= 5.\n");
+		return -1;
+	}
+
+	tail = NLMSG_TAIL(n);
+	//addattr_l(n, 1024, TCA_OPTIONS, NULL, 0);
+	//addattr_l(n, 1024, TCA_PERFLOW_PARMS, &opt, sizeof(opt));
+	addattr_l(n, 1024, TCA_OPTIONS, &opt, sizeof(opt));
+	tail->rta_len = (void *) NLMSG_TAIL(n) - (void *) tail;
+
+	return 0;
+}
+
+static int perflow_print_opt(struct qdisc_util *qu, FILE *f, struct rtattr *opt)
+{
+	struct tc_perflow_qopt *qopt;
+
+	if (opt == NULL)
+		return 0;
+
+	if (RTA_PAYLOAD(opt) < sizeof(*qopt))
+		return -1;
+
+	qopt = RTA_DATA(opt);
+
+	SPRINT_BUF(b1);
+	fprintf(f, "rate %s ", sprint_rate(qopt->rate.rate, b1));
+	fprintf(f, "ceil %s ", sprint_rate(qopt->ceil.rate, b1));
+	fprintf(f, "limit %s ", sprint_size(qopt->rate.rate, b1));
+
+	return 0;
+}
+
+struct qdisc_util perflow_qdisc_util = {
+	.id		= "perflow",
+	.parse_qopt	= perflow_parse_opt,
+	.print_qopt	= perflow_print_opt,
+};
Index: iproute2-2.6.11-w/tc/Makefile
===================================================================
--- iproute2-2.6.11-w/tc/Makefile	(revision 1)
+++ iproute2-2.6.11-w/tc/Makefile	(working copy)
@@ -7,6 +7,7 @@
 TCMODULES += q_fifo.o
 TCMODULES += q_sfq.o
 TCMODULES += q_red.o
+TCMODULES += q_perflow.o
 TCMODULES += q_prio.o
 TCMODULES += q_tbf.o
 TCMODULES += q_cbq.o

[-- Attachment #4: linux-2.6.11-perflow-r3.diff --]
[-- Type: application/octet-stream, Size: 15640 bytes --]

Index: linux-2.6.11-w/include/linux/pkt_sched.h
===================================================================
--- linux-2.6.11-w/include/linux/pkt_sched.h	(revision 2)
+++ linux-2.6.11-w/include/linux/pkt_sched.h	(revision 3)
@@ -272,6 +272,28 @@
 	__u32 ctokens;
 };
 
+
+/* PERFLOW section */
+
+#define TC_PERFLOW_DEFAULTLIMIT	1024
+
+struct tc_perflow_qopt
+{
+	struct tc_ratespec 	rate;
+	struct tc_ratespec 	ceil;
+	__u32  limit;
+	__u32  qlen;
+};
+enum
+{
+	TCA_PERFLOW_UNSPEC,
+	TCA_PERFLOW_PARMS,
+	__TCA_PERFLOW_MAX,
+};
+
+#define TCA_PERFLOW_MAX	(__TCA_PERFLOW_MAX - 1)
+
+
 /* HFSC section */
 
 struct tc_hfsc_qopt
Index: linux-2.6.11-w/net/sched/Kconfig
===================================================================
--- linux-2.6.11-w/net/sched/Kconfig	(revision 2)
+++ linux-2.6.11-w/net/sched/Kconfig	(revision 3)
@@ -192,6 +192,15 @@
 	  To compile this code as a module, choose M here: the
 	  module will be called sch_gred.
 
+config NET_SCH_PERFLOW
+	tristate "PERFLOW queue"
+	depends on NET_SCHED
+	---help---
+	  Say Y here if you want to use per flow rate control.
+
+	  To compile this code as a module, choose M here: the
+	  module will be called sch_perflow.
+
 config NET_SCH_DSMARK
 	tristate "Diffserv field marker"
 	depends on NET_SCHED
Index: linux-2.6.11-w/net/sched/Makefile
===================================================================
--- linux-2.6.11-w/net/sched/Makefile	(revision 2)
+++ linux-2.6.11-w/net/sched/Makefile	(revision 3)
@@ -19,6 +19,7 @@
 obj-$(CONFIG_NET_SCH_HFSC)	+= sch_hfsc.o
 obj-$(CONFIG_NET_SCH_RED)	+= sch_red.o
 obj-$(CONFIG_NET_SCH_GRED)	+= sch_gred.o
+obj-$(CONFIG_NET_SCH_PERFLOW)	+= sch_perflow.o
 obj-$(CONFIG_NET_SCH_INGRESS)	+= sch_ingress.o 
 obj-$(CONFIG_NET_SCH_DSMARK)	+= sch_dsmark.o
 obj-$(CONFIG_NET_SCH_SFQ)	+= sch_sfq.o
Index: linux-2.6.11-w/net/sched/sch_perflow.c
===================================================================
--- linux-2.6.11-w/net/sched/sch_perflow.c	(revision 0)
+++ linux-2.6.11-w/net/sched/sch_perflow.c	(revision 3)
@@ -0,0 +1,555 @@
+/*
+ * net/sched/sch_perflow.c	Per flow rate control queue.
+ *
+ *	This program is free software; you can redistribute it and/or
+ *	modify it under the terms of the GNU General Public License
+ *	as published by the Free Software Foundation; either version
+ *	2 of the License, or (at your option) any later version.
+ *
+ * Authors:	Wang Jian <lark@linux.net.cn>, 2005
+ *
+ */
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <asm/uaccess.h>
+#include <linux/bitops.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/string.h>
+#include <linux/mm.h>
+#include <linux/socket.h>
+#include <linux/sockios.h>
+#include <linux/in.h>
+#include <linux/errno.h>
+#include <linux/interrupt.h>
+#include <linux/if_ether.h>
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+#include <linux/notifier.h>
+#include <linux/jhash.h>
+#include <linux/timer.h>
+#include <net/ip.h>
+#include <net/route.h>
+#include <linux/skbuff.h>
+#include <net/sock.h>
+#include <net/pkt_sched.h>
+#include <net/inet_ecn.h>
+#include <net/dsfield.h>
+
+/* Per flow rate control algorithm
+
+Description
+-----------
+
+	When a new packet arrives, we lookup in the table to see if it
+	belongs to a flow being traced. If not, we create new entry for it.
+
+	For a packet belongs to a flow being traced, we see if it has
+	available token to send it out.
+
+	The 'rate' and 'ceil' have same meaning of HTB qdisc. The 'limit'
+	parameter defined how many flows we will trace, which defaults to
+	1024.
+
+	Any flow entry without traffic in LIFETIME seconds will be wiped
+	to free slot.
+
+
+Algorithm
+---------
+
+	The algorithm is simple and naive. We have a token bucket, which
+	generate grate/10 (= rate * limit / 10) token in 1/10 seconds.
+
+	A flow can always send when it is under rate. When a flow is over
+	rate but under ceil, every time it borrows, it is put into borrow
+	list, so it may receive penalty.
+
+	When a under-rate flow sends but token is short, penalty is added
+	to one of flows in borrow list, and this punished flow is
+	removed from borrow list.
+
+	When a flow carrying penalty has packet, the penalty is
+	added to used token. This means, this flow can borrow less than
+	last time (if in a new time slot), or can borrow less 
+	(ceil - rate - penalty).
+
+	There are other rules that handle fairness and low rate situation.
+	See code for details.
+
+	NOTE: the implementation of algorithm is not timer driven, but
+	packet driven.
+
+	The ideas behind this algorithm are
+
+	1. We assume that perflow qdisc has rate * limit guaranteed.
+	2. We can't affect past. We can only affect future.
+	3. If a flow's borrowing leads to overlimit, we let this flow
+	   borrow less in future.
+	4. With a round robin punishment style, a flow borrows more times,
+	   it stays in borrow list more times, and so receives more penalty.
+	   (But we should consider more about this, I think the fairness
+	   should be improved by sort borrow list.)
+
+
+Security Consideration
+----------------------
+
+	It's dangerous to create new entry for any new packets not belongs
+	to a flow being traced. For example, syn flood attack on a single
+	port may cause thousands of entries created. The 'limit' parameter
+	is used to set limit on maximum entries we will create.
+
+	But even with 'limit', port scan can fill the slots and make valid
+	new flow has no slot available and not be traced.
+
+	One of solution is to use netfilter MARK for classification 
+	(a.k.a. cls_fw.c). Only established session is given a fw mark, and
+	then, if port scan use a spoofing source address, no fw mark gotten.
+
+
+Application
+-----------
+
+	You should use HTB or other classful qdisc to enclose this qdisc.
+
+ */
+
+
+#define PERFLOW_HSIZE		251
+#define PERFLOW_LIFETIME	(10*HZ)
+
+struct perflow_entry
+{
+	u32	src;
+	u32	dst;
+	u16	sport;
+	u16	dport;
+
+	struct	list_head hlist;
+	struct	list_head borrow; 
+	struct	timer_list timer;
+	struct	perflow_sched_data *q;
+	u32	jiffies;
+	u32	used;
+	u32	penalty;
+
+	u8	protocol;
+};
+
+struct perflow_sched_data
+{
+	/* parameters */
+	u32	rate;			/* guaranted rate */
+	u32	ceil;			/* upper bound */
+	u32	limit;			/* maximum flows we trace */
+	u32	qlen;			/* maximum queue length */
+
+	/* variables */
+	u32	grate;			/* aggregative rate */
+
+	u32	jiffies;
+	u32	token;
+	u32	flow_count;		/* how many flows we are tracing */
+	struct	list_head ht[PERFLOW_HSIZE];	/* hash table */
+	struct	list_head borrow;	/* lists of borrowing flow */
+};
+
+struct commonhdr
+{
+	u16	source;
+	u16	dest;
+};
+
+static __inline__ u32 perflow_hash(struct perflow_entry *tuple)
+{
+	return (jhash_3words(tuple->src ^ tuple->protocol,
+				tuple->dst,
+				(tuple->sport << 16 | tuple->dport),
+				0x5E83AD03) % PERFLOW_HSIZE);
+}
+
+static __inline__ int perflow_is_valid(struct sk_buff *skb)
+{
+	struct iphdr *iph = skb->nh.iph;
+
+	if (skb->protocol != __constant_htons(ETH_P_IP))
+		return 0;
+
+	if (iph->protocol != IPPROTO_TCP && iph->protocol != IPPROTO_TCP
+			&& iph->protocol != IPPROTO_SCTP)
+		return 0;
+
+	return 1;
+}
+
+static __inline__ void perflow_flow_timer(unsigned long arg)
+{
+	struct perflow_entry *e = (struct perflow_entry *) arg;
+
+	e->q->flow_count--;
+	list_del_init(&e->hlist);
+	if (!list_empty(&e->borrow))
+		list_del_init(&e->borrow);
+	kfree(e);
+}
+
+static __inline__ struct perflow_entry *
+perflow_new_flow(u32 hash, struct perflow_entry *tuple, struct Qdisc *sch)
+{
+	struct perflow_sched_data *q = qdisc_priv(sch);
+	struct perflow_entry *e;
+
+	if (q->flow_count >= q->limit)
+		return NULL;
+
+	e = kmalloc(sizeof(*e), GFP_KERNEL);
+	if (!e)
+		return NULL;
+
+	q->flow_count++;
+
+	memset(e, 0, sizeof(*e));
+
+	e->src = tuple->src;
+	e->dst = tuple->dst;
+	e->sport = tuple->sport;
+	e->dport = tuple->dport;
+	e->protocol = tuple->protocol;
+
+	INIT_LIST_HEAD(&e->hlist);
+	INIT_LIST_HEAD(&e->borrow);
+	init_timer(&e->timer);
+	e->timer.function = perflow_flow_timer;
+	e->timer.data = (unsigned long) e;
+	/* sync flow's tick to queue's tick */
+	e->jiffies = q->jiffies;
+	e->q = q;
+
+	list_add(&e->hlist, q->ht + hash);
+
+	return e;
+}
+
+static __inline__ void
+perflow_fill_tuple(struct perflow_entry *tuple, struct sk_buff *skb)
+{
+	struct iphdr *iph = skb->nh.iph;
+	struct commonhdr *ch;
+
+	memset(tuple, 0, sizeof(struct perflow_entry));
+#if 1
+	/* not a traceable flow, do nothing */
+	if (!perflow_is_valid(skb))
+		return;
+#endif
+
+	ch = (struct commonhdr *)((void *)iph + (iph->ihl << 2));
+	tuple->src = iph->saddr;
+	tuple->dst = iph->daddr;
+	tuple->sport = ch->source;
+	tuple->dport = ch->dest;
+	tuple->protocol = iph->protocol;
+}
+
+static struct perflow_entry *
+perflow_find_flow(u32 *hash, struct perflow_entry *flow, struct Qdisc *sch)
+{
+	struct perflow_sched_data *q = qdisc_priv(sch);
+	struct list_head *h, *head;
+	struct perflow_entry *e;
+       
+	*hash = perflow_hash(flow);
+	head = q->ht + (*hash);
+
+	list_for_each(h, head) {
+		e = list_entry(h, struct perflow_entry, hlist);
+		if (e->src == flow->src
+				&& e->dst == flow->dst
+				&& e->sport == flow->sport
+				&& e->dport == flow->dport
+				&& e->protocol == flow->protocol)
+			del_timer(&e->timer);
+			return e;
+	}
+	return NULL;
+}
+
+static void perflow_punish(int penalty, struct list_head *borrow)
+{
+	struct perflow_entry *e;
+	struct list_head *h;
+
+	if (!list_empty(borrow)) {
+		h = borrow->next;
+		list_del_init(h);
+		e = list_entry(h, struct perflow_entry, hlist);
+		e->penalty += penalty;
+	}
+}
+
+static __inline__ void
+perflow_adjust_used(struct perflow_entry *flow, u32 rate, u32 ceil)
+{
+	int cycles;
+
+	/* still in this time slot */
+	if ((jiffies - flow->jiffies) <= HZ) {
+		if ((ceil - flow->used) > flow->penalty) {
+			flow->used += flow->penalty;
+			flow->penalty = 0;
+		} else {
+			flow->used = ceil;
+			flow->penalty -= ceil - flow->used;
+		}
+		return;
+	}
+
+	/* the packet arrives after cycles slots */
+	cycles = (jiffies - flow->jiffies) / HZ;
+	flow->jiffies += cycles * HZ;
+
+	if ((cycles * ceil - flow->used) > flow->penalty) {
+		flow->used = 0;
+		flow->penalty = 0;
+	} else {
+		flow->used = flow->penalty + flow->used - cycles * ceil;
+		if (flow->used > ceil) {
+			flow->used = ceil;
+			flow->penalty -= ceil;
+		}
+	}
+}
+
+static int perflow_enqueue(struct sk_buff *skb, struct Qdisc *sch)
+{
+	struct perflow_sched_data *q = qdisc_priv(sch);
+	struct perflow_entry tuple, *flow;
+	u32 hash;
+
+	/* not a traceable flow */
+	if (!perflow_is_valid(skb))
+		goto drop;
+
+	/* if max qlen is set and current queue is full, drop */
+	if (q->qlen && sch->q.qlen >= q->qlen) {
+		printk(KERN_WARNING "qlen overlimit drop\n");
+		goto drop;
+	}
+
+	perflow_fill_tuple(&tuple, skb);
+	flow = perflow_find_flow(&hash, &tuple, sch);
+
+	if (flow == NULL) {
+		flow = perflow_new_flow(hash, &tuple, sch);
+		if (flow == NULL)
+			goto drop;
+	}
+
+	/* renew timer for flow */
+	flow->timer.expires = jiffies + PERFLOW_LIFETIME;
+	add_timer(&flow->timer);
+
+	if ((jiffies - q->jiffies) > HZ) {
+		q->token = q->grate;
+		q->jiffies += (jiffies - q->jiffies) / HZ * HZ;
+	}
+
+	/* renew used for this flow */
+	perflow_adjust_used(flow, q->rate, q->ceil);
+
+	/* we always satisfy flow under rate */
+	if (flow->used < q->rate) {
+		flow->used += skb->len;
+		if (flow->used > q->rate) {
+			/* if we borrow, we may receive penalty */
+			if (list_empty(&flow->borrow))
+				list_add_tail(&flow->borrow, &q->borrow);
+		}
+		if (flow->used > q->ceil)
+			flow->penalty += flow->used - q->ceil;
+		if (q->token < skb->len) {
+			q->token = 0;
+			perflow_punish(skb->len - q->token, &q->borrow);
+		} else {
+			q->token -= skb->len;
+		}
+	} else {
+		if (q->token < skb->len || flow->used >= q->ceil)
+			goto drop;
+
+		flow->used += skb->len;
+		q->token -= skb->len;
+
+		if (list_empty(&flow->borrow))
+			list_add_tail(&flow->borrow, &q->borrow);
+		else
+			list_move(&flow->borrow, &q->borrow);
+
+		if (flow->used > q->ceil)
+			flow->penalty += flow->used - q->ceil;
+	}
+
+enqueue:
+	sch->qstats.backlog += skb->len;
+	sch->bstats.bytes += skb->len;
+	sch->bstats.packets++;
+	__skb_queue_tail(&sch->q, skb);
+	return NET_XMIT_SUCCESS;
+
+drop:
+	sch->qstats.overlimits++;
+	kfree_skb(skb);
+	sch->qstats.drops++;
+	return NET_XMIT_CN;
+}
+
+static int perflow_requeue(struct sk_buff *skb, struct Qdisc *sch)
+{
+	__skb_queue_head(&sch->q, skb);
+	sch->qstats.backlog += skb->len;
+	sch->qstats.requeues++;
+	return 0;
+}
+
+static struct sk_buff *perflow_dequeue(struct Qdisc *sch)
+{
+	struct sk_buff *skb;
+
+	skb = __skb_dequeue(&sch->q);
+	if (skb) {
+		sch->qstats.backlog -= skb->len;
+	}
+	return skb;
+}
+
+static unsigned int perflow_drop(struct Qdisc *sch)
+{
+	struct sk_buff *skb;
+
+	skb = __skb_dequeue_tail(&sch->q);
+	if (skb) {
+		unsigned int len = skb->len;
+		sch->qstats.backlog -= len;
+		sch->qstats.drops++;
+		kfree_skb(skb);
+		return len;
+	}
+	return 0;
+}
+
+static void perflow_reset(struct Qdisc *sch)
+{
+	skb_queue_purge(&sch->q);
+	sch->qstats.backlog = 0;
+}
+
+static int perflow_change(struct Qdisc *sch, struct rtattr *opt)
+{
+	struct perflow_sched_data *q = qdisc_priv(sch);
+	struct tc_perflow_qopt *ctl;
+
+	/* if limit and qlen are lowered, and we are in a over limit
+	 * state, we still don't drop flow or drop packet.
+	 * sch->q will decrease to allowed length
+	 * q->flow_count will decrease because no new flow is traced
+	 */
+	if (opt == NULL) {
+		return -EINVAL;
+	} else {
+		ctl = RTA_DATA(opt);
+		if (opt->rta_len < RTA_LENGTH(sizeof(*ctl)))
+			return -EINVAL;
+
+		q->rate = ctl->rate.rate;
+		q->ceil = ctl->ceil.rate;
+		q->limit = ctl->limit;
+		q->qlen = ctl->qlen;
+
+		if (q->limit == 0)
+			q->limit = 1024;
+		/* aggregative rate is (rate * 1.05 * limit) */
+		q->grate = (q->rate + q->rate/20) * q->limit;
+
+		q->jiffies = jiffies;
+	}
+
+	return 0;
+}
+
+static int perflow_init(struct Qdisc *sch, struct rtattr *opt)
+{
+	struct perflow_sched_data *q = qdisc_priv(sch);
+	int i, ret;
+
+	ret = perflow_change(sch, opt);
+	if (ret)
+		return ret;
+
+	INIT_LIST_HEAD(&q->borrow);
+
+	for (i = 0; i < PERFLOW_HSIZE; i++)
+		INIT_LIST_HEAD(q->ht + i);
+
+	return 0;
+}
+
+static void perflow_destroy(struct Qdisc *sch)
+{
+	struct perflow_sched_data *q = qdisc_priv(sch);
+	struct list_head *h, *n;
+	struct perflow_entry *e;
+	int i;
+
+	for (i = 0; i < PERFLOW_HSIZE; i++) {
+		list_for_each_safe (h, n, q->ht + i) {
+			e = list_entry(h, struct perflow_entry, hlist);
+			del_timer(&e->timer);
+			if (!list_empty(&e->borrow))
+				list_del_init(&e->borrow);
+			list_del(h);
+			kfree(e);
+		}
+	}
+}
+
+static int perflow_dump(struct Qdisc *sch, struct sk_buff *skb)
+{
+	return -1;
+}
+
+static struct Qdisc_ops perflow_qdisc_ops = {
+	.next		= NULL,
+	.cl_ops		= NULL,
+	.id		= "perflow",
+	.priv_size	= sizeof(struct perflow_sched_data),
+	.enqueue	= perflow_enqueue,
+	.dequeue	= perflow_dequeue,
+	.requeue	= perflow_requeue,
+	.drop		= perflow_drop,
+	.init		= perflow_init,
+	.reset		= perflow_reset,
+	.destroy	= perflow_destroy,
+	.change		= perflow_change,
+	.dump		= perflow_dump,
+	.owner		= THIS_MODULE,
+};
+
+static int __init perflow_module_init(void)
+{
+	return register_qdisc(&perflow_qdisc_ops);
+}
+
+static void __exit perflow_module_exit(void)
+{
+	unregister_qdisc(&perflow_qdisc_ops);
+}
+
+module_init(perflow_module_init);
+module_exit(perflow_module_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Wang Jian <lark@linux.net.cn>");

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [RFC] QoS: new per flow queue
  2005-04-05 15:25 [RFC] QoS: new per flow queue Wang Jian
@ 2005-04-05 17:57 ` jamal
  2005-04-06  5:12   ` Wang Jian
  0 siblings, 1 reply; 17+ messages in thread
From: jamal @ 2005-04-05 17:57 UTC (permalink / raw)
  To: Wang Jian; +Cc: netdev


I quickly scanned the kernel portion. I dont think this is the best way
to achieve this - your qdisc is both a classifier and scheduler. I think
this is the major drwback.
And if you take out the classifier - whats left in your qdisc cant beat
htb or hfsc or cbq in terms of being proven to be accurate.
 
If you could write a meta action instead which is a simple dynamic
setter of something like fwmark that would suffice i.e something along
the lines of:

example:
----
tc filter ip u32 match ip sport 80 0xffff flowid 1:12 \
    action dynfwmark continue
tc filter fwmark 0x1 .. classid aaaa
tc filter fwmark 0x2 .. classid bbbb
..
..

tc qdisc htb/hfsc/cbq .... your rate parameters here.
---

dynfwmark will maintain your state table which gets deleted when timers
expire and will hash based on the current jenkins hash
Do you have to queue the packets? if not you could instead have the
police action (attached to fwmark) drop the packet once it exceeds
certain rate and then use any enqueueing scheme you want.
The drawback with above scheme is you will have as many entries for
fwmark as you want to have queues - each selecting its own queue.

cheers,
jamal

On Tue, 2005-04-05 at 11:25, Wang Jian wrote:
> Hi,
> 
> I write a per flow rate control qdisc. I posted it to LARTC list. Some
> discussion about it is here
> 
>     http://mailman.ds9a.nl/pipermail/lartc/2005q2/015381.html
> 
> I think I need more feedback and suggestion on it so I repost the patch
> here. Please read the thread and get a picture about why and how.
> 
> The kernel patch is agains kernel 2.6.11, the iproute2 patch is against
> iproute2-2.6.11-050314. 
> 
> The test scenario is like this
> 
>       www server <- [ eth0   eth1 ] -> www clients
> 
> The attached t.sh is used to generate test rules. Clients download a
> big ISO file from www server, so flows' rate can be estimated by view
> progress.
> 
> I have some test on it and it works well. It provides good fairness.
> When all slot being used, in most time, the real rate can keep at
> specified guaranteed rate. But I know it should receive more test.
> 
> I have some consideration though
> 
> 1. In the test sometimes there a pair of unbalanced stream and don't get
> balanced quickly. One stream get 8.4kbps and another get 11.5kbps. How
> to find the flow with highest traffic and punish it most?
> 
> 2. The default ceil equals to rate. Should I calculate it as
>    ceil = rate * 1.05 * limit, or
>    ceil = rate * 1.05?
> 
> 3. when flow slots are full, optionally reclassify untraceable traffic
>    into another specified class, instead of dropping it?
> 
> TODO:
> 
> 1. rtnetlink related code should be improved;
> 2. dump() and dump_stat();
> 
> 
> Regards

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [RFC] QoS: new per flow queue
  2005-04-05 17:57 ` jamal
@ 2005-04-06  5:12   ` Wang Jian
  2005-04-06 12:12     ` jamal
  0 siblings, 1 reply; 17+ messages in thread
From: Wang Jian @ 2005-04-06  5:12 UTC (permalink / raw)
  To: hadi; +Cc: netdev

Hi jamal,

I know your concern. Actually, I first try to implement it like you
point out, but then I dismiss that scheme, because it is hard to
maintain for the userspace.

But if the dynfwmark can dynamically alloc htb class in the given class
id range. For example

tc filter ip u32 tc filter ip u32 match ip sport 80 0xffff flowid 1:12 \
     action dynfwmark major 1 range 1:1000 <flow parameter> continue

When it detects a new flow, it creates the necessary htb class? So the
userspace work is simple. But when we have segmented class id space, we
may not get such an enough empty range.

On 05 Apr 2005 13:57:38 -0400, jamal <hadi@cyberus.ca> wrote:

> 
> I quickly scanned the kernel portion. I dont think this is the best way
> to achieve this - your qdisc is both a classifier and scheduler. I think
> this is the major drwback.
> And if you take out the classifier - whats left in your qdisc cant beat
> htb or hfsc or cbq in terms of being proven to be accurate.

I think per flow control in nature means that classifier must be
intimately coupled with scheduler. There is no way around it. If you
seperate them, you must provide a way to link them together again. The
dynfwmark way you suggested actually does so, but not clean (because you
choose to use existing nfmark infrastructure). If it carries an unique
id or something like in its own namespace, then it can be clean and
friendly for userspace, but I bet it is unnecessarily bloated.

 In my test, HTB performs very well. I intensionally requires a HTB
class to enclose a perflow queue to provide guaranteed sum bandwidth. The
queue is proven to be fair enough and can guarantee rate internally for
its flows (if the per flow rate is at or above 10kbps).

I haven't tested rate lower than 10kbps, because my test client is not
that accurate to show the number. It's simply a "lftpget <url>".

There are short threads before in which someone asked for a per flow
control solution, and was suggested to use HTB + SFQ. My test reveals
that SFQ is far away from fairness and can't meet the requirement of
bandwidth assurance.

For HFSC, I haven't any experience with it because the documentation is
lacked.

>  
> If you could write a meta action instead which is a simple dynamic
> setter of something like fwmark that would suffice i.e something along
> the lines of:
> 
> example:
> ----
> tc filter ip u32 match ip sport 80 0xffff flowid 1:12 \
>     action dynfwmark continue
> tc filter fwmark 0x1 .. classid aaaa
> tc filter fwmark 0x2 .. classid bbbb
> ..
> ..
> 
> tc qdisc htb/hfsc/cbq .... your rate parameters here.
> ---
> 
> dynfwmark will maintain your state table which gets deleted when timers
> expire and will hash based on the current jenkins hash
> Do you have to queue the packets? if not you could instead have the
> police action (attached to fwmark) drop the packet once it exceeds
> certain rate and then use any enqueueing scheme you want.
> The drawback with above scheme is you will have as many entries for
> fwmark as you want to have queues - each selecting its own queue.
> 
> cheers,
> jamal
> 
> On Tue, 2005-04-05 at 11:25, Wang Jian wrote:
> > Hi,
> > 
> > I write a per flow rate control qdisc. I posted it to LARTC list. Some
> > discussion about it is here
> > 
> >     http://mailman.ds9a.nl/pipermail/lartc/2005q2/015381.html
> > 
> > I think I need more feedback and suggestion on it so I repost the patch
> > here. Please read the thread and get a picture about why and how.
> > 
> > The kernel patch is agains kernel 2.6.11, the iproute2 patch is against
> > iproute2-2.6.11-050314. 
> > 
> > The test scenario is like this
> > 
> >       www server <- [ eth0   eth1 ] -> www clients
> > 
> > The attached t.sh is used to generate test rules. Clients download a
> > big ISO file from www server, so flows' rate can be estimated by view
> > progress.
> > 
> > I have some test on it and it works well. It provides good fairness.
> > When all slot being used, in most time, the real rate can keep at
> > specified guaranteed rate. But I know it should receive more test.
> > 
> > I have some consideration though
> > 
> > 1. In the test sometimes there a pair of unbalanced stream and don't get
> > balanced quickly. One stream get 8.4kbps and another get 11.5kbps. How
> > to find the flow with highest traffic and punish it most?
> > 
> > 2. The default ceil equals to rate. Should I calculate it as
> >    ceil = rate * 1.05 * limit, or
> >    ceil = rate * 1.05?
> > 
> > 3. when flow slots are full, optionally reclassify untraceable traffic
> >    into another specified class, instead of dropping it?
> > 
> > TODO:
> > 
> > 1. rtnetlink related code should be improved;
> > 2. dump() and dump_stat();
> > 
> > 
> > Regards

-- 
  lark

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [RFC] QoS: new per flow queue
  2005-04-06  5:12   ` Wang Jian
@ 2005-04-06 12:12     ` jamal
  2005-04-06 13:45       ` Wang Jian
  0 siblings, 1 reply; 17+ messages in thread
From: jamal @ 2005-04-06 12:12 UTC (permalink / raw)
  To: Wang Jian; +Cc: netdev

Wang,

On Wed, 2005-04-06 at 01:12, Wang Jian wrote:
> Hi jamal,
> 
> I know your concern. Actually, I first try to implement it like you
> point out, but then I dismiss that scheme, because it is hard to
> maintain for the userspace.
> 
> But if the dynfwmark can dynamically alloc htb class in the given class
> id range. For example
> 
> tc filter ip u32 tc filter ip u32 match ip sport 80 0xffff flowid 1:12 \
>      action dynfwmark major 1 range 1:1000 <flow parameter> continue
> 

Yes, this looks better except for the <flow parameter> part - why do you
need that? The syntax may have to explicitly say min 1 max 1000 for
usability, but thats not a big deal.
Essentially rewrite the classid/flowid. In the action just set
skb->tc_classid and u32 will make sure the packet goes to the correct
major:minor queue. This already works with kernels >= 2.6.10.

> When it detects a new flow, it creates the necessary htb class? So the
> userspace work is simple. But when we have segmented class id space, we
> may not get such an enough empty range.
> 

Yes, I thought about this too when i was responding to you earlier. 
Theres two ways to do it in my opinion:
1) create apriori at setup time 1024 or whatever number of HTB classes.
So when u32 selects the major:minor its already there.
OR
2) do what you are suggesting to dynamically create classes; i.e
"classifier returned a non-existing class, create default class
using that major/minor number". 
Default class could be defined from user space to be one that is
initialized to 10 Kbps rate burst 1kbps etc. You may have to teach the
qdisc the maximum number of classes it should create.

#1 above will work immediately and you dont have to hack any of the
qdiscs. The disadvantage is your user space app will have to
create every class individualy - so if you have 1024 queues then 1024
queues are needed. 
#2 is nice because it avoids the above problem - disadvantage is you
need to manage creation and deletion of these queues and write code.

I do think the idea of dynamically creating the flow queues is
interesting and useful as well. It would be nice if it can be done for
all classful qdiscs.

One thing i noticed is that you dont have multiple queues actually in
your code - did i misread that? i.e multiple classes go to the same
queue.

> I think per flow control in nature means that classifier must be
> intimately coupled with scheduler. There is no way around it. If you
> seperate them, you must provide a way to link them together again. The
> dynfwmark way you suggested actually does so, but not clean (because you
> choose to use existing nfmark infrastructure). If it carries an unique
> id or something like in its own namespace, then it can be clean and
> friendly for userspace, but I bet it is unnecessarily bloated.
> 

The only unfriendliness to user space is in #1 where you end up having a
script creating as many classes as you need. It is _not_ bloat because
you dont add any code at all. It is anti-bloat actually ;->

Look at above - and based on your suggestion; lets reset the
flowid/classid.

>  In my test, HTB performs very well. I intensionally requires a HTB
> class to enclose a perflow queue to provide guaranteed sum bandwidth. The
> queue is proven to be fair enough and can guarantee rate internally for
> its flows (if the per flow rate is at or above 10kbps).
> 

Well, HTB is just a replica of the token bucket scheduler with
additional knobs - so i suspect the numbers will look the same with TB
as well. Someone needs to be test all of them and see how accurate they
are. The clock sources at compile time and your hardware will also
affect you.

> I haven't tested rate lower than 10kbps, because my test client is not
> that accurate to show the number. It's simply a "lftpget <url>".
> 
> There are short threads before in which someone asked for a per flow
> control solution, and was suggested to use HTB + SFQ. My test reveals
> that SFQ is far away from fairness and can't meet the requirement of
> bandwidth assurance.
> 

I dont think SFQ will give you per flow; actually I should say - depends
on your definition of flow - seems yours is the 5 tuple { src/dst IP,
src/dst port, proto=UDP/TCP/SCTP}. SFQ will work for a subset of these
tuples and is therefore not fair at the granularity that you want.

> For HFSC, I haven't any experience with it because the documentation is
> lacked.
> 

I am suprised no one has compared all the rate control schemes.

btw, would policing also suffice for you? The only difference is it will
drop packets if you exceed your rate. You can also do hierachical
sharing.

cheers,
jamal

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [RFC] QoS: new per flow queue
  2005-04-06 12:12     ` jamal
@ 2005-04-06 13:45       ` Wang Jian
  2005-04-07 11:06         ` jamal
  0 siblings, 1 reply; 17+ messages in thread
From: Wang Jian @ 2005-04-06 13:45 UTC (permalink / raw)
  To: hadi; +Cc: netdev

Hi jamal,


On 06 Apr 2005 08:12:50 -0400, jamal <hadi@cyberus.ca> wrote:

> Wang,
> 
> On Wed, 2005-04-06 at 01:12, Wang Jian wrote:
> > Hi jamal,
> > 
> > I know your concern. Actually, I first try to implement it like you
> > point out, but then I dismiss that scheme, because it is hard to
> > maintain for the userspace.
> > 
> > But if the dynfwmark can dynamically alloc htb class in the given class
> > id range. For example
> > 
> > tc filter ip u32 tc filter ip u32 match ip sport 80 0xffff flowid 1:12 \
> >      action dynfwmark major 1 range 1:1000 <flow parameter> continue
> > 
> 
> Yes, this looks better except for the <flow parameter> part - why do you
> need that? The syntax may have to explicitly say min 1 max 1000 for
> usability, but thats not a big deal.

The <flow parameter> is used to dynamically create htb class, see below.


> Essentially rewrite the classid/flowid. In the action just set
> skb->tc_classid and u32 will make sure the packet goes to the correct
> major:minor queue. This already works with kernels >= 2.6.10.
> 

One question:

Can I set skb->tc_classid in a qdisc and pass this skb to the specified
class to handle?

> > When it detects a new flow, it creates the necessary htb class? So the
> > userspace work is simple. But when we have segmented class id space, we
> > may not get such an enough empty range.
> > 
> 
> Yes, I thought about this too when i was responding to you earlier. 
> Theres two ways to do it in my opinion:
> 1) create apriori at setup time 1024 or whatever number of HTB classes.
> So when u32 selects the major:minor its already there.
> OR
> 2) do what you are suggesting to dynamically create classes; i.e
> "classifier returned a non-existing class, create default class
> using that major/minor number". 
> Default class could be defined from user space to be one that is
> initialized to 10 Kbps rate burst 1kbps etc. You may have to teach the
> qdisc the maximum number of classes it should create.
> 
> #1 above will work immediately and you dont have to hack any of the
> qdiscs. The disadvantage is your user space app will have to
> create every class individualy - so if you have 1024 queues then 1024
> queues are needed. 
> #2 is nice because it avoids the above problem - disadvantage is you
> need to manage creation and deletion of these queues and write code.
> 

My idea is that the action itself dynamically creates classes. So you
don't need any other rules. It is looks like #2 but the work is done in
dynfwmark action. The workflow

0. setup, and create flow entry hash;
1. when a packet arrives, check if it is a flow or should be a new flow;
2. alloc a class id for this flow;
3. dynamically create a htb class using the <flow parameter>
4. skb->tc_classid = allocated_ht_class_id

1.5 if can't create flow, skb->tc_classid = default_class_id


> I do think the idea of dynamically creating the flow queues is
> interesting and useful as well. It would be nice if it can be done for
> all classful qdiscs.
> 
> One thing i noticed is that you dont have multiple queues actually in
> your code - did i misread that? i.e multiple classes go to the same
> queue.
> 

Didn't you notice that it is a classless qdisc? The queue is per qdisc,
not per class :) It is the parent class's duty to define what kind of
flow this qdisc handle. It is very generic, you can even mix UDP/TCP
flows together but I don't think it is good idea.

Think the scenario

1. You have VoIP flows (UDP)
2. You have pptp vpn flows (eh, per flow can't handle it at this time, I
think)

You create HTBs for them, and use filter to classify them. And then, use
per flow qdisc as the only child of these HTBs. per flow qdisc will
guarantee the per flow QoS.

> 
> > I think per flow control in nature means that classifier must be
> > intimately coupled with scheduler. There is no way around it. If you
> > seperate them, you must provide a way to link them together again. The
> > dynfwmark way you suggested actually does so, but not clean (because you
> > choose to use existing nfmark infrastructure). If it carries an unique
> > id or something like in its own namespace, then it can be clean and
> > friendly for userspace, but I bet it is unnecessarily bloated.
> > 
> 
> The only unfriendliness to user space is in #1 where you end up having a
> script creating as many classes as you need. It is _not_ bloat because
> you dont add any code at all. It is anti-bloat actually ;->
> 

In this way, it is very hard to write good user interface in userspace.
My current implementation takes good user-friendly (or user space
scripter/programmer friendly) into serious consideration.

The 'bloated' comment is on the 

"If it carries an unique id or something like in its own namespace, then
it can be clean and friendly for userspace"

I think too much and my thought is jumpy ;) You even didn't notice that
I gave another suggestion on implementation in this sentence.


> Look at above - and based on your suggestion; lets reset the
> flowid/classid.
> 
> >  In my test, HTB performs very well. I intensionally requires a HTB
> > class to enclose a perflow queue to provide guaranteed sum bandwidth. The
> > queue is proven to be fair enough and can guarantee rate internally for
> > its flows (if the per flow rate is at or above 10kbps).
> > 
> 
> Well, HTB is just a replica of the token bucket scheduler with
> additional knobs - so i suspect the numbers will look the same with TB
> as well. Someone needs to be test all of them and see how accurate they
> are. The clock sources at compile time and your hardware will also
> affect you.
> 
> > I haven't tested rate lower than 10kbps, because my test client is not
> > that accurate to show the number. It's simply a "lftpget <url>".
> > 
> > There are short threads before in which someone asked for a per flow
> > control solution, and was suggested to use HTB + SFQ. My test reveals
> > that SFQ is far away from fairness and can't meet the requirement of
> > bandwidth assurance.
> > 
> 
> I dont think SFQ will give you per flow; actually I should say - depends
> on your definition of flow - seems yours is the 5 tuple { src/dst IP,
> src/dst port, proto=UDP/TCP/SCTP}. SFQ will work for a subset of these
> tuples and is therefore not fair at the granularity that you want.
> 
> > For HFSC, I haven't any experience with it because the documentation is
> > lacked.
> > 
> 
> I am suprised no one has compared all the rate control schemes.
> 
> btw, would policing also suffice for you? The only difference is it will
> drop packets if you exceed your rate. You can also do hierachical
> sharing.

policy suffices, but doesn't serve the purpose of per flow queue.


And thanks for your discussion with me. It seems that few people is
interested in it.


-- 
  lark

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [RFC] QoS: new per flow queue
  2005-04-06 13:45       ` Wang Jian
@ 2005-04-07 11:06         ` jamal
  2005-04-07 13:14           ` Wang Jian
  0 siblings, 1 reply; 17+ messages in thread
From: jamal @ 2005-04-07 11:06 UTC (permalink / raw)
  To: Wang Jian; +Cc: netdev

Wang,

On Wed, 2005-04-06 at 09:45, Wang Jian wrote:
> Hi jamal,
> 
> 
> On 06 Apr 2005 08:12:50 -0400, jamal <hadi@cyberus.ca> wrote:
> 

> > Essentially rewrite the classid/flowid. In the action just set
> > skb->tc_classid and u32 will make sure the packet goes to the correct
> > major:minor queue. This already works with kernels >= 2.6.10.
> > 
> 
> One question:
> 
> Can I set skb->tc_classid in a qdisc and pass this skb to the specified
> class to handle?
> 

Well, remember the qdisc asks the filter what class a packet belongs to
every time. The best place to change things is at that point. Why not
tell the qdisc whatever class you want it to know instead of passing
metadata like skb->tc_classid for it to further intepret?

> 
> My idea is that the action itself dynamically creates classes. So you
> don't need any other rules. It is looks like #2 but the work is done in
> dynfwmark action. The workflow
> 
> 0. setup, and create flow entry hash;
> 1. when a packet arrives, check if it is a flow or should be a new flow;
> 2. alloc a class id for this flow;

so far so good. But you probably wanna drop the packet after a certain
upper bound for number of flows is reached. 

> 3. dynamically create a htb class using the <flow parameter>
> 4. skb->tc_classid = allocated_ht_class_id
> 
> 1.5 if can't create flow, skb->tc_classid = default_class_id
> 

This may be a little tricky with locks etc. Maybe you could send a
netlink message to the qdisc. Perhaps a workthread which creates these
classes etc. My suggestion was: 

a) qdisc asks filter "what class is this packet?"
b) filter responds "class 1:234"
c) qdisc says "hrm, dont have that class. call create_default_class"
d) qdisc::create_default_class() 
      create class and set default parameters. Default parameters are
      passed from user space (eg 10kbps rate etc). 
e) enqueue packet on new queue.

Also now you have to create the dynamic class rewriter action and change
a qdisc ;-> What i was saying earlier is that the create_default_class 
maybe a good addition to qdiscs. 
The question is do you keep those queues forver or kill them after a
period of no activity?
The idea of creating them permanently upto a certain max numbers is
probably ok. It will be no different than creating them via a script.

> > One thing i noticed is that you dont have multiple queues actually in
> > your code - did i misread that? i.e multiple classes go to the same
> > queue.
> > 
> 
> Didn't you notice that it is a classless qdisc? The queue is per qdisc,
> not per class :) 

Sorry missed that ;->

> It is the parent class's duty to define what kind of
> flow this qdisc handle. It is very generic, you can even mix UDP/TCP
> flows together but I don't think it is good idea.
> 

Doesnt that defeat the purpose of it being "per flow queue" ;->

> Think the scenario
> 
> 1. You have VoIP flows (UDP)
> 2. You have pptp vpn flows (eh, per flow can't handle it at this time, I
> think)
> 
> You create HTBs for them, and use filter to classify them. And then, use
> per flow qdisc as the only child of these HTBs. per flow qdisc will
> guarantee the per flow QoS.
> 

I got this. Of course you understand creating all these filters and
queues is taking system resources. It may be sufficient to create
heuristics like a simple priority qdisc where you put voip as first
class citizen, vpn as second and rest as best effort.

> > The only unfriendliness to user space is in #1 where you end up having a
> > script creating as many classes as you need. It is _not_ bloat because
> > you dont add any code at all. It is anti-bloat actually ;->
> > 
> 
> In this way, it is very hard to write good user interface in userspace.
> My current implementation takes good user-friendly (or user space
> scripter/programmer friendly) into serious consideration.
> 

It is not hard - its annoying in user space ;->
You could write a for loop to produce them. such as:

for i stating from 1 to 1000
 tc class add .... rate 10Kbps classid 1:$i
endfor

Trying to list those classes will list at least a 1000 lines of course.
But this listing issue will happen even when you dynamically create
these classes.
So annoyance is more descriptive.

> The 'bloated' comment is on the 
> 
> "If it carries an unique id or something like in its own namespace, then
> it can be clean and friendly for userspace"
> 

I am not disagreeing with you. If you want to go that path instead what
i am saying is its more coding.

> I think too much and my thought is jumpy ;) You even didn't notice that
> I gave another suggestion on implementation in this sentence.
> 

What? You did? 
just kidding;-> 

> > I am suprised no one has compared all the rate control schemes.
> > 
> > btw, would policing also suffice for you? The only difference is it will
> > drop packets if you exceed your rate. You can also do hierachical
> > sharing.
> 
> policy suffices, but doesn't serve the purpose of per flow queue.
> 

Policing will achieve the goal of rate control without worrying about
any queueing. I like the idea of someone trying to create dynamic queues
though ;-> 

cheers,
jamal

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [RFC] QoS: new per flow queue
  2005-04-07 11:06         ` jamal
@ 2005-04-07 13:14           ` Wang Jian
  2005-04-08 12:43             ` jamal
  0 siblings, 1 reply; 17+ messages in thread
From: Wang Jian @ 2005-04-07 13:14 UTC (permalink / raw)
  To: hadi; +Cc: netdev

Hi jamal,

On 07 Apr 2005 07:06:01 -0400, jamal <hadi@cyberus.ca> wrote:

> > 
> > One question:
> > 
> > Can I set skb->tc_classid in a qdisc and pass this skb to the specified
> > class to handle?
> > 
> 
> Well, remember the qdisc asks the filter what class a packet belongs to
> every time. The best place to change things is at that point. Why not
> tell the qdisc whatever class you want it to know instead of passing
> metadata like skb->tc_classid for it to further intepret?
> 

1. a flow (in my perflow queue implementation) is a tuple of five
elements, but, for some reason, if the users misused this queue and send
non-flow packet, e.g. ICMP packet here;

2. a queue is configured to handle 100 flows, but the 101st flow comes;

For this two situations, currently, the implementation just drops
packets. However, a clean way is to reclassify the packet into another
class (default) and provides no per flow guarantee.

> > 
> > Didn't you notice that it is a classless qdisc? The queue is per qdisc,
> > not per class :) 
> 
> Sorry missed that ;->
> 
> > It is the parent class's duty to define what kind of
> > flow this qdisc handle. It is very generic, you can even mix UDP/TCP
> > flows together but I don't think it is good idea.
> > 
> 
> Doesnt that defeat the purpose of it being "per flow queue" ;->

Per flow queue, is not one queue per flow ;) It is queue which can
provide bandwidth assurance and constraint per flow.

The implementation can be either one queue per flow, or all flows fit
into one queue.

> 
> > Think the scenario
> > 
> > 1. You have VoIP flows (UDP)
> > 2. You have pptp vpn flows (eh, per flow can't handle it at this time, I
> > think)
> > 
> > You create HTBs for them, and use filter to classify them. And then, use
> > per flow qdisc as the only child of these HTBs. per flow qdisc will
> > guarantee the per flow QoS.
> > 
> 
> I got this. Of course you understand creating all these filters and
> queues is taking system resources. It may be sufficient to create
> heuristics like a simple priority qdisc where you put voip as first
> class citizen, vpn as second and rest as best effort.

As I already said, this approach has drawbacks

1. when flow count overlimit, no guarantee
2. when flow count is underlimit, the guaranteed sum bandwidth can be
exploited to waste bandwidth.

So, thinking of per flow queue, it is "queue which can provide bandwidth
assurance and constraint per flow", and with only one queue!

You only need to create one HTB, one filter and one per flow queue for
VoIP; and one HTB, one filter and one per flow queue for VPN.

I think the "per flow" name does confuse you ;) My fault.

> 
> > > The only unfriendliness to user space is in #1 where you end up having a
> > > script creating as many classes as you need. It is _not_ bloat because
> > > you dont add any code at all. It is anti-bloat actually ;->
> > > 
> > 
> > In this way, it is very hard to write good user interface in userspace.
> > My current implementation takes good user-friendly (or user space
> > scripter/programmer friendly) into serious consideration.
> > 
> 
> It is not hard - its annoying in user space ;->
> You could write a for loop to produce them. such as:
> 
> for i stating from 1 to 1000
>  tc class add .... rate 10Kbps classid 1:$i
> endfor
> 
> Trying to list those classes will list at least a 1000 lines of course.
> But this listing issue will happen even when you dynamically create
> these classes.
> So annoyance is more descriptive.
>

The problem occurs when you delete and add, and so on. It not good idea
to reuse a used classid when there is stream classified as old.

For example, you give class 1:1000 htb rate 200kbps ceil 200kbps for
http, and you delete the class 1:1000 and redefine 1:1000 htb rate
30kbps ceil 30kbps for ftp.

At this time, the remained http streams carries a CONNMARK and restore
to MARK and then classified as 1:0000. Then 1:1000 is not what you want.

>  
> > > I am suprised no one has compared all the rate control schemes.
> > > 
> > > btw, would policing also suffice for you? The only difference is it will
> > > drop packets if you exceed your rate. You can also do hierachical
> > > sharing.
> > 
> > policy suffices, but doesn't serve the purpose of per flow queue.
> > 
> 
> Policing will achieve the goal of rate control without worrying about
> any queueing. I like the idea of someone trying to create dynamic queues
> though ;-> 
>

You need per flow queue to control something, like VoIP streams, or VPN
streams. If you just use policy, mixed traffic is send to per flow queue.
That is definitely not the purpose of per flow queue.

The dynamic queue creating is another way to implement the per flow
control (yes, one class and queue per flow). I think it is more complex
and wastes resources.

-- 
  lark

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [RFC] QoS: new per flow queue
  2005-04-07 13:14           ` Wang Jian
@ 2005-04-08 12:43             ` jamal
  2005-04-13  5:45               ` [RFC] QoS: frg queue (was [RFC] QoS: new per flow queue) Wang Jian
  0 siblings, 1 reply; 17+ messages in thread
From: jamal @ 2005-04-08 12:43 UTC (permalink / raw)
  To: Wang Jian; +Cc: netdev

Wang,

sorry for not getting back sooner - I actually wanted to read what you
are saying before responding ;->

On Thu, 2005-04-07 at 09:14, Wang Jian wrote:

> 1. a flow (in my perflow queue implementation) is a tuple of five
> elements, but, for some reason, if the users misused this queue and send
> non-flow packet, e.g. ICMP packet here;
> 
> 2. a queue is configured to handle 100 flows, but the 101st flow comes;
> 
> For this two situations, currently, the implementation just drops
> packets. However, a clean way is to reclassify the packet into another
> class (default) and provides no per flow guarantee.
> 

The reclassification or #1 will best be left to the user. This is not
hard to do.

> > Doesnt that defeat the purpose of it being "per flow queue" ;->
> 
> Per flow queue, is not one queue per flow ;) It is queue which can
> provide bandwidth assurance and constraint per flow.
> 
> The implementation can be either one queue per flow, or all flows fit
> into one queue.
> 

Ok, stop calling it per-flow-queue then ;-> You should call it
per-flow-rate-guarantee.

> As I already said, this approach has drawbacks
> 
> 1. when flow count overlimit, no guarantee
> 2. when flow count is underlimit, the guaranteed sum bandwidth can be
> exploited to waste bandwidth.
> 
> So, thinking of per flow queue, it is "queue which can provide bandwidth
> assurance and constraint per flow", and with only one queue!
> 

Sharing is not a big challenge -  and should be policy driven.
HTB and CBQ both support it. I am not sure about HFSC.

> You only need to create one HTB, one filter and one per flow queue for
> VoIP; and one HTB, one filter and one per flow queue for VPN.
> 
> I think the "per flow" name does confuse you ;) My fault.

The "queue" part is confusing - the "perflow" is fine. Lets stick with
per-flow-rate-guarantee as the description.

So it seems you want by all means to avoid entering something that
will take forever to list. Did i understand this correctly?

We can probably avoid insisting on dynamically creating classes maybe.
You can rate control before you enqueue and we can use fwmark perhaps.
Earlier i also asked whether policing will suffice. Heres an example
(maybe dated syntax, but concept still valid) that shows sharing using
policers:
http://www.cyberus.ca/~hadi/patches/action/README
look at the example where it says "--cut here --"

The only difference in this case is instead of creating 1000 classes
you create 1000 policers as a result of the hash. 
Something along:

    u32 classify for port 80 prio high \
       action dymfwmark create range min 1 max 1000 \
       action police to some rate if result is drop we stop here \
       else continue \
    fwmark classify prio low\
       select one of two queues (high prio or low prio Q)

Very small script but still doesnt avoid the "seeing 1000 things". In
this case if you list actions you see 1000.
The lockings in this case are more bearable than having the dynamic
marker creating queues.
Typically the actions in a topology graph are stiched together at policy
init time for efficiency reasons - so we dont have to do lookups at
runtime. In this case it will need to have static lookup instead because
depending on what the hash selects you want to select a different
policer instance. I think i know an easy way of doing this (example
storing per hash policer pointer in the dynmark table and doing the
invoke from within dynmark).

> The problem occurs when you delete and add, and so on. It not good idea
> to reuse a used classid when there is stream classified as old.
> 
> For example, you give class 1:1000 htb rate 200kbps ceil 200kbps for
> http, and you delete the class 1:1000 and redefine 1:1000 htb rate
> 30kbps ceil 30kbps for ftp.
> 
> At this time, the remained http streams carries a CONNMARK and restore
> to MARK and then classified as 1:0000. Then 1:1000 is not what you want.
> 

I would think the number 1000 should be related to hash of flow header,
no? In which case there should be no collision unless the hash of ftp
and http are 1000.

> 
> >  
> > > > I am suprised no one has compared all the rate control schemes.
> > > > 
> > > > btw, would policing also suffice for you? The only difference is it will
> > > > drop packets if you exceed your rate. You can also do hierachical
> > > > sharing.
> > > 
> > > policy suffices, but doesn't serve the purpose of per flow queue.
> > > 
> > 
> > Policing will achieve the goal of rate control without worrying about
> > any queueing. I like the idea of someone trying to create dynamic queues
> > though ;-> 
> >
> 
> You need per flow queue to control something, like VoIP streams, or VPN
> streams. If you just use policy, mixed traffic is send to per flow queue.
> That is definitely not the purpose of per flow queue.
> 
> The dynamic queue creating is another way to implement the per flow
> control (yes, one class and queue per flow). I think it is more complex
> and wastes resources.
> 

Look at the above suggestion - what you will waste in that case is
polices. You should actually not use HTB but rather strict prio qdisc
with policers.

cheers,
jamal

^ permalink raw reply	[flat|nested] 17+ messages in thread

* [RFC] QoS: frg queue (was [RFC] QoS: new per flow queue)
  2005-04-08 12:43             ` jamal
@ 2005-04-13  5:45               ` Wang Jian
  2005-04-18 13:14                 ` jamal
  0 siblings, 1 reply; 17+ messages in thread
From: Wang Jian @ 2005-04-13  5:45 UTC (permalink / raw)
  To: hadi; +Cc: netdev

Hi jamal,

Sorry for the late reply. I am occupied by other things and just have
time back to this topic.

On 08 Apr 2005 08:43:28 -0400, jamal <hadi@cyberus.ca> wrote:


> On Thu, 2005-04-07 at 09:14, Wang Jian wrote:
> 
> > 1. a flow (in my perflow queue implementation) is a tuple of five
> > elements, but, for some reason, if the users misused this queue and send
> > non-flow packet, e.g. ICMP packet here;
> > 
> > 2. a queue is configured to handle 100 flows, but the 101st flow comes;
> > 
> > For this two situations, currently, the implementation just drops
> > packets. However, a clean way is to reclassify the packet into another
> > class (default) and provides no per flow guarantee.
> > 
> 
> The reclassification or #1 will best be left to the user. This is not
> hard to do.

I scan through other code and find no easy way to redirect non-handled
traffic to another class. Can you give me some hint on that?


> 
> Ok, stop calling it per-flow-queue then ;-> You should call it
> per-flow-rate-guarantee.

I have renamed it to frg (flow rate guarantee) per your suggestion.
After the above reclassification is done, I will post new patch here.

I will extend the concept of flow to include GRE, so pptp VPN can be
supported. There are other 'flows' to consider.


> > As I already said, this approach has drawbacks
> > 
> > 1. when flow count overlimit, no guarantee
> > 2. when flow count is underlimit, the guaranteed sum bandwidth can be
> > exploited to waste bandwidth.
> > 
> > So, thinking of per flow queue, it is "queue which can provide bandwidth
> > assurance and constraint per flow", and with only one queue!
> > 
> 
> Sharing is not a big challenge -  and should be policy driven.
> HTB and CBQ both support it. I am not sure about HFSC.
> 

Still, I am not clear if you understand me. How it works for this
purpose:

guarantee and only guarantee rate * n when there are n flows?

When there only 1 flow, guarantee only rate * 1
When there only 2 flows, guarantee only rate * 2
...
and so on

If always guarantee rate * limit, then the excessive guaranteed rate can
be abused.

But if always guarantee only rate * 1, then it is not enough.

>  
> > You only need to create one HTB, one filter and one per flow queue for
> > VoIP; and one HTB, one filter and one per flow queue for VPN.
> > 
> > I think the "per flow" name does confuse you ;) My fault.
> 
> The "queue" part is confusing - the "perflow" is fine. Lets stick with
> per-flow-rate-guarantee as the description.
> 
> So it seems you want by all means to avoid entering something that
> will take forever to list. Did i understand this correctly?

Yes. It is special purpose but general enough. I think it's worthy of
adding a new qdisc for it to avoid the dirty long listing part.

> 
> We can probably avoid insisting on dynamically creating classes maybe.
> You can rate control before you enqueue and we can use fwmark perhaps.
> Earlier i also asked whether policing will suffice. Heres an example
> (maybe dated syntax, but concept still valid) that shows sharing using
> policers:
> http://www.cyberus.ca/~hadi/patches/action/README

I will look at it later.

> look at the example where it says "--cut here --"
> 
> The only difference in this case is instead of creating 1000 classes
> you create 1000 policers as a result of the hash. 
> Something along:
> 
>     u32 classify for port 80 prio high \
>        action dymfwmark create range min 1 max 1000 \
>        action police to some rate if result is drop we stop here \
>        else continue \
>     fwmark classify prio low\
>        select one of two queues (high prio or low prio Q)
>  
> Very small script but still doesnt avoid the "seeing 1000 things". In
> this case if you list actions you see 1000.
> The lockings in this case are more bearable than having the dynamic
> marker creating queues.
> Typically the actions in a topology graph are stiched together at policy
> init time for efficiency reasons - so we dont have to do lookups at
> runtime. In this case it will need to have static lookup instead because
> depending on what the hash selects you want to select a different
> policer instance. I think i know an easy way of doing this (example
> storing per hash policer pointer in the dynmark table and doing the
> invoke from within dynmark).
> 

If we can do it with one thing, we should avoid creating 1000 things.
The policy way works but is dirty.

> > The problem occurs when you delete and add, and so on. It not good idea
> > to reuse a used classid when there is stream classified as old.
> > 
> > For example, you give class 1:1000 htb rate 200kbps ceil 200kbps for
> > http, and you delete the class 1:1000 and redefine 1:1000 htb rate
> > 30kbps ceil 30kbps for ftp.
> > 
> > At this time, the remained http streams carries a CONNMARK and restore
> > to MARK and then classified as 1:0000. Then 1:1000 is not what you want.
> > 
> 
> I would think the number 1000 should be related to hash of flow header,
> no? In which case there should be no collision unless the hash of ftp
> and http are 1000.
> 

To save netfilter rule matching work, if the CONNMARK is set, then it
will be used to set nfmark.

If a CONNMARK is already set on this http stream, it will be kept. Then
if you redefine 1:1000 as another meaning, then this http carrying
1:1000 will be mis-classified.


> > 
> > >  
> > > > > I am suprised no one has compared all the rate control schemes.
> > > > > 
> > > > > btw, would policing also suffice for you? The only difference is it will
> > > > > drop packets if you exceed your rate. You can also do hierachical
> > > > > sharing.
> > > > 
> > > > policy suffices, but doesn't serve the purpose of per flow queue.
> > > > 
> > > 
> > > Policing will achieve the goal of rate control without worrying about
> > > any queueing. I like the idea of someone trying to create dynamic queues
> > > though ;-> 
> > >
> > 
> > You need per flow queue to control something, like VoIP streams, or VPN
> > streams. If you just use policy, mixed traffic is send to per flow queue.
> > That is definitely not the purpose of per flow queue.
> > 
> > The dynamic queue creating is another way to implement the per flow
> > control (yes, one class and queue per flow). I think it is more complex
> > and wastes resources.
> > 
> 
> Look at the above suggestion - what you will waste in that case is
> polices. You should actually not use HTB but rather strict prio qdisc
> with policers.

As I said above, it works, but is dirty anyway ;)

> cheers,
> jamal



-- 
  lark

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [RFC] QoS: frg queue (was [RFC] QoS: new per flow queue)
  2005-04-13  5:45               ` [RFC] QoS: frg queue (was [RFC] QoS: new per flow queue) Wang Jian
@ 2005-04-18 13:14                 ` jamal
  2005-04-18 14:50                   ` Thomas Graf
  2005-04-18 16:01                   ` Wang Jian
  0 siblings, 2 replies; 17+ messages in thread
From: jamal @ 2005-04-18 13:14 UTC (permalink / raw)
  To: Wang Jian; +Cc: netdev

Wang Jian,

On Wed, 2005-13-04 at 13:45 +0800, Wang Jian wrote: 
> Hi jamal,
> 
> Sorry for the late reply. I am occupied by other things and just have
> time back to this topic.
> 

Well, apologies from here as well - i missed responding on time.

> On 08 Apr 2005 08:43:28 -0400, jamal <hadi@cyberus.ca> wrote:
> 
> 

> > The reclassification or #1 will best be left to the user. This is not
> > hard to do.
> 
> I scan through other code and find no easy way to redirect non-handled
> traffic to another class. Can you give me some hint on that?
> 

There are two constructs at the classifier level: the first one is
"reclassify" which basically asks the classification activity to
restart. Another one is the " continue"  construct which basically asks
for the classification to continue where the last one ended.
Does this explanation help?

> > 
> > Ok, stop calling it per-flow-queue then ;-> You should call it
> > per-flow-rate-guarantee.
> 
> I have renamed it to frg (flow rate guarantee) per your suggestion.
> After the above reclassification is done, I will post new patch here.
> 
> I will extend the concept of flow to include GRE, so pptp VPN can be
> supported. There are other 'flows' to consider.
> 

I think you should start by decoupling the classification away from your
qdisc.

> > Sharing is not a big challenge -  and should be policy driven.
> > HTB and CBQ both support it. I am not sure about HFSC.
> > 
> 
> Still, I am not clear if you understand me. How it works for this
> purpose:
> 
> guarantee and only guarantee rate * n when there are n flows?
> 
> When there only 1 flow, guarantee only rate * 1
> When there only 2 flows, guarantee only rate * 2
> ...
> and so on
> 

sure and at some point you exceed available bandwidth. You can
over-provision of course.

> If always guarantee rate * limit, then the excessive guaranteed rate can
> be abused.
> 

I didnt follow this.

> But if always guarantee only rate * 1, then it is not enough.
> 

I also didnt follow this. 

Lets say you have 8 flows; each to be guaranteed 100Kbps. Lets say your
wire rate is only 1Mbps. 

Then you create policies so that each gets 100Kbps. If they all use
their quota you still have 200Kbps to play with. You could then say that
out of that 200Kps, 100kbps is to be shared amongst the 8 flows if they
exceed their allocated 100Kbps(sharing) and the other 100kbps is for
best effort traffic. 
In this case, each flow is _guaranteed_ 100Kbps and upto 100Kbps from
shared quota if no-one else is using that shared quota. 
If multi flows are using the shared 100Kbps then, its given out on first
come basis. "Guaranteed" in this case means it is _reserved_ i.e if flow
#3 is not using its allocation, flow #2 cant use it.

Ok, so now tell me where you and i are differing on semantics?
Is the above what you are also saying? 

> > So it seems you want by all means to avoid entering something that
> > will take forever to list. Did i understand this correctly?
> 
> Yes. It is special purpose but general enough. I think it's worthy of
> adding a new qdisc for it to avoid the dirty long listing part.
> 

I am not sure about the "general enough" part. You need to know what is
happening at any instance in time if this is to be useful. So that
information for listing should be available - you may wish not to
display it unless someone asks for verbose output.

> > 
> > We can probably avoid insisting on dynamically creating classes maybe.
> > You can rate control before you enqueue and we can use fwmark perhaps.
> > Earlier i also asked whether policing will suffice. Heres an example
> > (maybe dated syntax, but concept still valid) that shows sharing using
> > policers:
> > http://www.cyberus.ca/~hadi/patches/action/README
> 
> I will look at it later.
> 

Please do so we can have a synchronized discussion.

> If we can do it with one thing, we should avoid creating 1000 things.
> The policy way works but is dirty.

yes, but that hiding can be hidden at user space for example. 
There are several levels of verbosity if you insist:
- See nothing
- just see info which says "at the moment we have 234 classes
dynamically created" 
- get a full listing for each of 234 classes and their attributes.
- get a full listing for each 234 class and their attributes as well as
stats. 

> > I would think the number 1000 should be related to hash of flow header,
> > no? In which case there should be no collision unless the hash of ftp
> > and http are 1000.
> > 
> 
> To save netfilter rule matching work, if the CONNMARK is set, then it
> will be used to set nfmark.
> 
> If a CONNMARK is already set on this http stream, it will be kept. Then
> if you redefine 1:1000 as another meaning, then this http carrying
> 1:1000 will be mis-classified.
> 

your policy management/scripts are then responsible to make sure
everything is synchronized between iptables and tc. Once we have the
tracker action, this would only need to be doen via tc.

cheers,
jamal

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [RFC] QoS: frg queue (was [RFC] QoS: new per flow queue)
  2005-04-18 13:14                 ` jamal
@ 2005-04-18 14:50                   ` Thomas Graf
  2005-04-18 18:01                     ` Wang Jian
  2005-04-18 16:01                   ` Wang Jian
  1 sibling, 1 reply; 17+ messages in thread
From: Thomas Graf @ 2005-04-18 14:50 UTC (permalink / raw)
  To: jamal; +Cc: Wang Jian, netdev

Sorry for entering this discussion so late.

* jamal <1113830063.26757.15.camel@localhost.localdomain> 2005-04-18 09:14
> I think you should start by decoupling the classification away from your
> qdisc.

Here are my thoughts on per flow queueing, actually the name
"classification controlled queues" is be more accurate.

First of all the whole problem must be divided into two parts:

1) queueing with per flow specific queueing parameters, i.e. the flow
   serves a certain purpose and is known by static parameters.
2) queueing with generic parameters, i.e. the purpose of a flow
   is solely to be fair, there is no difference between each flow.

Both these cases can be handled with the current set of qdiscs
and classificiation tools but often a combination of both is needed
and that's where my thought begins:

We use tc_classid to describe a flow but also to describe its
assignment to the corresponding flow group (n flows are grouped
together into a group to define a namespace, generally at the
level of a qdisc).

The tc_classid can be set via actions, either by using a generic
action that creates a flowid out of the common tuple or by
providing an own simple-action for customization, e.g. one could
construct tc_classid ::= nfmark << 16 + hash(flowid)

tc_classid is interpreted by either one of the existing qdiscs
for static assignment or a new qdisc named "alloctor"

The purpose of the allocator is to create qdiscs on demand,
destroy them after they expired and to act as a muxer to enqueue
into the dynamic leaf qdiscs. The alloactor can be configured to
interpet the MSB 16bits of tc_classid as a flow group id and
enqueue the skb to the corresponding clasfull qdisc with matches
TC_H_MAJ_MASK bits.

The following is in attempt to convert my scribbling on paper
into ASCII:

Use Case 1: Per flow queues using TBF

                   2. +-----------+  +------------+
                   +->| cls_basic |->| act_flowid |
                   |  +-----------+  +------------+
      1.    +---------------+              |
  --------->| sch_allocator |<-------------+
            +---------------+        3. tc_classid=<flowid>
                   |4.
       +-----------+---------+- - - - - -
       |           |         |           |
    +-----+     +-----+   +-----+     + - - +
    | TBF |     | TBF |   | TBF |     | TBF |
    +-----+     +-----+   +-----+     + - - +

sch_alloctor configuration:
  - no flow group map
  - default policy: allocate TBF for every tc_classid && enqueue to new qdisc
  - default template: tbf add rtnetlink message
  - default deletion template: tbf del rtnetlink message

cls_basic configuration:
  - always true

act_flowid configuration:
  - default: generate flowid hash and store in tc_classid

Use Case 2: Flow groups

                     3. +---------+
                     +->| em_meta |
                     |  +---------+
                     +----+  |
                          |  v           4.
                   2. +-----------+  +-----------------+
                   +->| cls_basic |->| act_cust_flowid |
                   |  +-----------+  +-----------------+
      1.    +-----------------+               |
  --------->| sch_allocator_1 |<--------------+
            +-----------------+         5. tc_classid=(nfmark<<16)+(flowid&0xFF)
                   |6.
       +-----------+---+----------------+
       | 11:/12:       | 13:            | *:
    +-----+     +-----------------+ +---------+
    | HTB |     | sch_allocator_2 | | default |
    +-----+     +-----------------+ +---------+
       |               |
       |       +-------+- - - -     
       |       |       |       |
       |    +-----+ +-----+ + - - +
       |    | TBF | | TBF | | TBF |
       |    +-----+ +-----+ + - - +
       |
       |
       +------------------+
                          |
               +----------+--------------+
               |                         |
         +------------+            +------------+
         | Class 20:1 |            | Class 20:2 |
         +------------+            +------------+
               |                         |
     +---------+- - - - -              .....
     |         |         |
 +-------+ +-------+ +- - - -+
 | Class | | Class | | Class |
 +-------+ +-------+ +- - - -+

sch_alloctor_1 configuration:
  - flow group map:
     [11:] create class HTB parent 20:1 && enqueue to 20:
     [12:] create class HTB parent 20:2 && enqueue to 20:
     [13:] enqueue to sch_allocator_2
  - default policy: enqueue to default qdisc

sch_allocator_2 configuration:
  - no flow group map
  - default policy: allocate TBF for every tc_classid && enqueue to new qdisc
  - default template: tbf add rtnetlink message
  - default deletion template: tbf del rtnetlink message

cls_basic configuration:
  - always true

em_meta configuration:
  - filter out unknown nfmarks

act_cust_flowid configuration:
  - (nfmark<<16)+(flowid&0xff)

So basically what sch_allocator does is look at tc_classid, lookup
the corresponding flow in the flow map or use the default action,
execute the action, i.e. process the netlink message via a worker
thread, rewrite tc_classid if needed, manage the created qdiscs/classes,
account their last activity and destroy them eventually after no
activity for a certain configurable time by executing the corresponding
deletion netlink message.

Thoughts?

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [RFC] QoS: frg queue (was [RFC] QoS: new per flow queue)
  2005-04-18 13:14                 ` jamal
  2005-04-18 14:50                   ` Thomas Graf
@ 2005-04-18 16:01                   ` Wang Jian
  1 sibling, 0 replies; 17+ messages in thread
From: Wang Jian @ 2005-04-18 16:01 UTC (permalink / raw)
  To: hadi; +Cc: netdev

Hi jamal,

> > On 08 Apr 2005 08:43:28 -0400, jamal <hadi@cyberus.ca> wrote:
> > 
> > 
> 
> > > The reclassification or #1 will best be left to the user. This is not
> > > hard to do.
> > 
> > I scan through other code and find no easy way to redirect non-handled
> > traffic to another class. Can you give me some hint on that?
> > 
> 
> There are two constructs at the classifier level: the first one is
> "reclassify" which basically asks the classification activity to
> restart. Another one is the " continue"  construct which basically asks
> for the classification to continue where the last one ended.
> Does this explanation help?
> 

I will look into this afterwards. But I don't think this helps because
FRG is not at classifier level.

> > I will extend the concept of flow to include GRE, so pptp VPN can be
> > supported. There are other 'flows' to consider.
> > 
> 
> I think you should start by decoupling the classification away from your
> qdisc.

Actually, I am not very sure if FRG does do classification at the moment.
Because FRG is a quick hack from looking at SFQ and FIFO, I am still not
sure whether classification is what I think it is. There is so little
documentation on how to write tc code :(

FRG does things the same way as SFQ,  the only difference is the flow
tracking algorithm.  So IMHO, my current code doesn't do classification
at all. Am I wrong?

> 
> Lets say you have 8 flows; each to be guaranteed 100Kbps. Lets say your
> wire rate is only 1Mbps. 
> 
> Then you create policies so that each gets 100Kbps. If they all use
> their quota you still have 200Kbps to play with. You could then say that
> out of that 200Kps, 100kbps is to be shared amongst the 8 flows if they
> exceed their allocated 100Kbps(sharing) and the other 100kbps is for
> best effort traffic. 
> In this case, each flow is _guaranteed_ 100Kbps and upto 100Kbps from
> shared quota if no-one else is using that shared quota. 
> If multi flows are using the shared 100Kbps then, its given out on first
> come basis. "Guaranteed" in this case means it is _reserved_ i.e if flow
> #3 is not using its allocation, flow #2 cant use it.
> 
> Ok, so now tell me where you and i are differing on semantics?
> Is the above what you are also saying? 

In the above,  you still didn't mention "fewer flows than expected" case.
This is what FRG wants to address.  Also, "guaranteed" has difference
semantics.

For FRG, there are two rates: guaranteed and ceiled.

1. guaranteed rate is in the sense of "if you need so much, I will
provide so much; but if you need not so much, others can use the spared"

2. ceiled rate is in the sense of "you can't exceed this much whenever
and whatever"

The guaranteed and ceiled rate take effects on a per flow basis, not on a
group basis.

So, if currently there is only one flow, FRG will only have 1*100kbps
guaranteed, and have a ceil of 1*ceil kbps. Other traffic not managed by
FRG can takes from 1Mbps - 1*ceil to 1Mbps - 1*100kbps bandwidth. This
makes bandwidth to be used in more efficient way than _reserved_. This
is "fewer flows than expected" case.

But for the grouped quota, 1 flow can take upto 8*100kbps bandwidth,
and more from the left 200kbps on the first comes first served basis.
HTB/CBQ provides grouped quota.

Why the difference? Because in the sense of per flow guarantee, it is
supposed that if guaranteed rate is met, the flow will work well in most
cases, and the available extra bandwidth ( ceil - guarantted_rate)
merely is protection against possible jitter. A flow is hard limited below
ceil, which prevents a flow hog the bandwidth. 

Of course, one htb class with rate + ceil per flow can do the same thing.
And I think one policy perl flow can do the same thing too. The dynamic
class or policy we discussed before can also work it out. But see below.

> 
> > > So it seems you want by all means to avoid entering something that
> > > will take forever to list. Did i understand this correctly?
> > 
> > Yes. It is special purpose but general enough. I think it's worthy of
> > adding a new qdisc for it to avoid the dirty long listing part.
> > 
> 
> I am not sure about the "general enough" part. You need to know what is
> happening at any instance in time if this is to be useful. So that
> information for listing should be available - you may wish not to
> display it unless someone asks for verbose output.
> 
> > > 
> > > We can probably avoid insisting on dynamically creating classes maybe.
> > > You can rate control before you enqueue and we can use fwmark perhaps.
> > > Earlier i also asked whether policing will suffice. Heres an example
> > > (maybe dated syntax, but concept still valid) that shows sharing using
> > > policers:
> > > http://www.cyberus.ca/~hadi/patches/action/README
> > 
> > I will look at it later.
> > 
> 
> Please do so we can have a synchronized discussion.
> 

I have read it now.  Considering the purpose why FRG is created, I
didn't see it is useful, because it is still on  "grouped"  basis but not
"flow"  basis.

> 
> > If we can do it with one thing, we should avoid creating 1000 things.
> > The policy way works but is dirty.
> 
> yes, but that hiding can be hidden at user space for example. 
> There are several levels of verbosity if you insist:
> - See nothing
> - just see info which says "at the moment we have 234 classes
> dynamically created" 
> - get a full listing for each of 234 classes and their attributes.
> - get a full listing for each 234 class and their attributes as well as
> stats. 

Basically, the divergency between our opinions is not whether and how we
can do it in that way, but why we should do it in that way.

Current FRG implementation doesn't need its own filter. A FRG queue
processes packets from its parent (leaf) class, which defines flow
criteria. Internally, a FRG queue distinguishes and tracks flows, and do
rate guarantee and ceil by manipulating packets over a single queue.

The implementation is simple and clean, and does work as expected. It
also simplify user space management.

So the question is: why we should use a complex implementation which
is also complicated in userspace (considering dynamic classid allocation),
but not use a simple implementation which is also simple for userspace?

> 
> your policy management/scripts are then responsible to make sure
> everything is synchronized between iptables and tc. 

This is necessary, but this can range from simple to complex.
Unfortunately, dynamic class/policy goes to the complex end.

> Once we have the  tracker action, this would only need to be doen via tc.
> 

Is there any document about the tracker action?

-- 
  lark

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [RFC] QoS: frg queue (was [RFC] QoS: new per flow queue)
  2005-04-18 14:50                   ` Thomas Graf
@ 2005-04-18 18:01                     ` Wang Jian
  2005-04-18 18:40                       ` Thomas Graf
  0 siblings, 1 reply; 17+ messages in thread
From: Wang Jian @ 2005-04-18 18:01 UTC (permalink / raw)
  To: Thomas Graf; +Cc: jamal, netdev

Hi Thomas Graf,

In your big piture,

1. the dynamic allocated classids prepresent streams (I avoid using flow
here ntentionally)
2. the dynamic created TBFs are mapped 1:1 to classids, to provide rate
control

Let's simplify it to

1. namespace represent streams
2. rate controls for every name in the namespace (1:1 map)

The consideration:

1. there is no necessity that namespace must be classid space.
2. considering the resemblance of streams (they usually are the same
application), the rate control can be simplified. TBF or HTB is overkill.
3. grouped rate control method is not suitable for single stream.
4. fairness on streams, and total guarantee ( rate * n) can guarantee n
streams very well
5. per stream rate limit and total limit ( rate * n * 1.xx ) make sense
for bandwidth efficiency.
6. precisely counting of streams is necessary (when streams is more than
expected, existing streams are guaranteed, new streams are not
guaranteed, so at least some will work, not all don't work)

What FRG does:

1. the namespace is conntracks of streams;
2. rate control algorithm is very simple, based on the token bucket;
3. grouped rate control (HTB) is only used to do total guarantee
4. provides fairness on streams, and there is m * rate guarantee for
dynamic m streams;
5. per stream rate limit, total limit ( rate * max_streams * 1.05)
6. precisely counting of streams, and guarantees existing max_streams
stream.

FRG is created for VoIP like applications. It can be used to meet per
stream guarantee and prevent over provision.


On Mon, 18 Apr 2005 16:50:24 +0200, Thomas Graf <tgraf@suug.ch> wrote:

> Sorry for entering this discussion so late.
> 
> * jamal <1113830063.26757.15.camel@localhost.localdomain> 2005-04-18 09:14
> > I think you should start by decoupling the classification away from your
> > qdisc.
> 
> Here are my thoughts on per flow queueing, actually the name
> "classification controlled queues" is be more accurate.
> 
> First of all the whole problem must be divided into two parts:
> 
> 1) queueing with per flow specific queueing parameters, i.e. the flow
>    serves a certain purpose and is known by static parameters.
> 2) queueing with generic parameters, i.e. the purpose of a flow
>    is solely to be fair, there is no difference between each flow.
> 
> Both these cases can be handled with the current set of qdiscs
> and classificiation tools but often a combination of both is needed
> and that's where my thought begins:
> 
> We use tc_classid to describe a flow but also to describe its
> assignment to the corresponding flow group (n flows are grouped
> together into a group to define a namespace, generally at the
> level of a qdisc).
> 
> The tc_classid can be set via actions, either by using a generic
> action that creates a flowid out of the common tuple or by
> providing an own simple-action for customization, e.g. one could
> construct tc_classid ::= nfmark << 16 + hash(flowid)
> 
> tc_classid is interpreted by either one of the existing qdiscs
> for static assignment or a new qdisc named "alloctor"
> 
> The purpose of the allocator is to create qdiscs on demand,
> destroy them after they expired and to act as a muxer to enqueue
> into the dynamic leaf qdiscs. The alloactor can be configured to
> interpet the MSB 16bits of tc_classid as a flow group id and
> enqueue the skb to the corresponding clasfull qdisc with matches
> TC_H_MAJ_MASK bits.
> 
> The following is in attempt to convert my scribbling on paper
> into ASCII:
> 
> 
> Use Case 1: Per flow queues using TBF
> 
>                    2. +-----------+  +------------+
>                    +->| cls_basic |->| act_flowid |
>                    |  +-----------+  +------------+
>       1.    +---------------+              |
>   --------->| sch_allocator |<-------------+
>             +---------------+        3. tc_classid=<flowid>
>                    |4.
>        +-----------+---------+- - - - - -
>        |           |         |           |
>     +-----+     +-----+   +-----+     + - - +
>     | TBF |     | TBF |   | TBF |     | TBF |
>     +-----+     +-----+   +-----+     + - - +
> 
> 
> sch_alloctor configuration:
>   - no flow group map
>   - default policy: allocate TBF for every tc_classid && enqueue to new qdisc
>   - default template: tbf add rtnetlink message
>   - default deletion template: tbf del rtnetlink message
> 
> cls_basic configuration:
>   - always true
>  
> act_flowid configuration:
>   - default: generate flowid hash and store in tc_classid
> 
> 
> Use Case 2: Flow groups
> 
>                      3. +---------+
>                      +->| em_meta |
>                      |  +---------+
>                      +----+  |
>                           |  v           4.
>                    2. +-----------+  +-----------------+
>                    +->| cls_basic |->| act_cust_flowid |
>                    |  +-----------+  +-----------------+
>       1.    +-----------------+               |
>   --------->| sch_allocator_1 |<--------------+
>             +-----------------+         5. tc_classid=(nfmark<<16)+(flowid&0xFF)
>                    |6.
>        +-----------+---+----------------+
>        | 11:/12:       | 13:            | *:
>     +-----+     +-----------------+ +---------+
>     | HTB |     | sch_allocator_2 | | default |
>     +-----+     +-----------------+ +---------+
>        |               |
>        |       +-------+- - - -     
>        |       |       |       |
>        |    +-----+ +-----+ + - - +
>        |    | TBF | | TBF | | TBF |
>        |    +-----+ +-----+ + - - +
>        |
>        |
>        +------------------+
>                           |
>                +----------+--------------+
>                |                         |
>          +------------+            +------------+
>          | Class 20:1 |            | Class 20:2 |
>          +------------+            +------------+
>                |                         |
>      +---------+- - - - -              .....
>      |         |         |
>  +-------+ +-------+ +- - - -+
>  | Class | | Class | | Class |
>  +-------+ +-------+ +- - - -+
> 
> 
> sch_alloctor_1 configuration:
>   - flow group map:
>      [11:] create class HTB parent 20:1 && enqueue to 20:
>      [12:] create class HTB parent 20:2 && enqueue to 20:
>      [13:] enqueue to sch_allocator_2
>   - default policy: enqueue to default qdisc
> 
> sch_allocator_2 configuration:
>   - no flow group map
>   - default policy: allocate TBF for every tc_classid && enqueue to new qdisc
>   - default template: tbf add rtnetlink message
>   - default deletion template: tbf del rtnetlink message
>   
> cls_basic configuration:
>   - always true
> 
> em_meta configuration:
>   - filter out unknown nfmarks
> 
> act_cust_flowid configuration:
>   - (nfmark<<16)+(flowid&0xff)
> 
> 
> So basically what sch_allocator does is look at tc_classid, lookup
> the corresponding flow in the flow map or use the default action,
> execute the action, i.e. process the netlink message via a worker
> thread, rewrite tc_classid if needed, manage the created qdiscs/classes,
> account their last activity and destroy them eventually after no
> activity for a certain configurable time by executing the corresponding
> deletion netlink message.
> 
> Thoughts?



-- 
  lark

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [RFC] QoS: frg queue (was [RFC] QoS: new per flow queue)
  2005-04-18 18:01                     ` Wang Jian
@ 2005-04-18 18:40                       ` Thomas Graf
  2005-04-22  4:11                         ` Wang Jian
  0 siblings, 1 reply; 17+ messages in thread
From: Thomas Graf @ 2005-04-18 18:40 UTC (permalink / raw)
  To: Wang Jian; +Cc: jamal, netdev

* Wang Jian <20050419012147.038F.LARK@linux.net.cn> 2005-04-19 02:01
> In your big piture,
> 
> 1. the dynamic allocated classids prepresent streams (I avoid using flow
> here ntentionally)
> 2. the dynamic created TBFs are mapped 1:1 to classids, to provide rate
> control
> 
> Let's simplify it to
> 
> 1. namespace represent streams
> 2. rate controls for every name in the namespace (1:1 map)

This is only true for use 1 where the allocator creates indepdendent
qdiscs. Look at use case 2 where a major classid of 11: and 12: create
htb class siblings, this even allows to divided one big flow
namespaces into various group but still respecting global policies.

> 1. there is no necessity that namespace must be classid space.

Agreed.

> 2. considering the resemblance of streams (they usually are the same
> application), the rate control can be simplified. TBF or HTB is overkill.

Argueable.

> 3. grouped rate control method is not suitable for single stream.

I'm not sure what you mean with this.

> 4. fairness on streams, and total guarantee ( rate * n) can guarantee n
> streams very well

Agreed, we can put this into the allocator by letting the user specify
limits and run a rate estimator.

> 5. per stream rate limit and total limit ( rate * n * 1.xx ) make sense
> for bandwidth efficiency.

Agreed, make the allocator create HTB classes and you can have it.

> 6. precisely counting of streams is necessary (when streams is more than
> expected, existing streams are guaranteed, new streams are not
> guaranteed, so at least some will work, not all don't work)

Also agreed, I did not lose too many thoughts on this yet but it's
not hard to implement.

> What FRG does:
> 
> 1. the namespace is conntracks of streams;
> 2. rate control algorithm is very simple, based on the token bucket;
> 3. grouped rate control (HTB) is only used to do total guarantee
> 4. provides fairness on streams, and there is m * rate guarantee for
> dynamic m streams;
> 5. per stream rate limit, total limit ( rate * max_streams * 1.05)
> 6. precisely counting of streams, and guarantees existing max_streams
> stream.

I understand all of your arguments but I would like to, if possible,
avoid to add yet another quite specific qdisc which could have been
implemented in a generic way for everyone to use. Your FRG basically
does what the alloctor + classifier + action + qdiscs can do but it
is orientied at one specific use case.

Let's analyze your enqueue()

1) perflow_is_valid() // BTW, I think you have a typo in there, 2 times TCP ;->
   Should be done in the classifier with ematches:
   ... ematch meta(PROTOCOL eq IP) AND
              (cmp(ip_proto eq TCP) OR cmp(ip_proto eq UDP) ..)

2) perflow_(fill|find|new)_tuple()
   Should be done in the classifier as an action

3) qsch->q.qlen >= q->qlen
   Must be done in the qdisc after classification so this would
   go into the allocator.

4) flow->timer
   Must be handled by the alloctor

5) rate limiting
   IMHO: Should be done in separate qdiscs

Advantages?
 - What happens if you want to allow yet another protocol
   in your flow? You have to change sources etc.
 - Protocol specific flow hashing? No problem, replace the action.
 - ...

The only disadvantage I can see is a possible performance bottleneck
but this must be proved first by numbers.

So basically the direction we want to go is to strict separate the
classification from the queueing to allow the user to customize
everything by replacing small components. It might be worth to read
up on the discussion on "ematch" and "action" over the last 3 months.

Cheers, Thomas

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [RFC] QoS: frg queue (was [RFC] QoS: new per flow queue)
  2005-04-18 18:40                       ` Thomas Graf
@ 2005-04-22  4:11                         ` Wang Jian
  2005-04-22 11:11                           ` Thomas Graf
  0 siblings, 1 reply; 17+ messages in thread
From: Wang Jian @ 2005-04-22  4:11 UTC (permalink / raw)
  To: Thomas Graf; +Cc: jamal, netdev

Hi Thomas Graf,

Due to some reason, I was removed from this list and I wait for sometime
to make sure this.

On Mon, 18 Apr 2005 20:40:29 +0200, Thomas Graf <tgraf@suug.ch> wrote:

> * Wang Jian <20050419012147.038F.LARK@linux.net.cn> 2005-04-19 02:01
> > In your big piture,
> > 
> > 1. the dynamic allocated classids prepresent streams (I avoid using flow
> > here ntentionally)
> > 2. the dynamic created TBFs are mapped 1:1 to classids, to provide rate
> > control
> > 
> > Let's simplify it to
> > 
> > 1. namespace represent streams
> > 2. rate controls for every name in the namespace (1:1 map)
> 
> This is only true for use 1 where the allocator creates indepdendent
> qdiscs. Look at use case 2 where a major classid of 11: and 12: create
> htb class siblings, this even allows to divided one big flow
> namespaces into various group but still respecting global policies.
> 

Yes, the use case 2 you refer to is not in my original design.

> 
> I understand all of your arguments but I would like to, if possible,
> avoid to add yet another quite specific qdisc which could have been
> implemented in a generic way for everyone to use. Your FRG basically
> does what the alloctor + classifier + action + qdiscs can do but it
> is orientied at one specific use case.

Agree. I will maintain my code out of kernel tree, in case someone feels
my frg is usefull for his/her special case.

> 
> Let's analyze your enqueue()
> 
> 1) perflow_is_valid() // BTW, I think you have a typo in there, 2 times TCP ;->
>    Should be done in the classifier with ematches:
>    ... ematch meta(PROTOCOL eq IP) AND
>               (cmp(ip_proto eq TCP) OR cmp(ip_proto eq UDP) ..)
> 
> 2) perflow_(fill|find|new)_tuple()
>    Should be done in the classifier as an action
> 
> 3) qsch->q.qlen >= q->qlen
>    Must be done in the qdisc after classification so this would
>    go into the allocator.
> 
> 4) flow->timer
>    Must be handled by the alloctor
> 
> 5) rate limiting
>    IMHO: Should be done in separate qdiscs
> 
> Advantages?
>  - What happens if you want to allow yet another protocol
>    in your flow? You have to change sources etc.
>  - Protocol specific flow hashing? No problem, replace the action.
>  - ...
> 
> The only disadvantage I can see is a possible performance bottleneck
> but this must be proved first by numbers.

The other disadvantage is that the dynamic classid used is not predicted
by user space, it is very tricky for user space to handle the classid
allocation then (I mean for the management system like web management).
That is what I try to avoid so far.

> 
> So basically the direction we want to go is to strict separate the
> classification from the queueing to allow the user to customize
> everything by replacing small components. It might be worth to read
> up on the discussion on "ematch" and "action" over the last 3 months.
> 

I agree to this. But I must iterate one thing: the design should be
friendly to user space management for a heavy usage.



-- 
  lark

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [RFC] QoS: frg queue (was [RFC] QoS: new per flow queue)
  2005-04-22  4:11                         ` Wang Jian
@ 2005-04-22 11:11                           ` Thomas Graf
  2005-04-22 12:04                             ` Wang Jian
  0 siblings, 1 reply; 17+ messages in thread
From: Thomas Graf @ 2005-04-22 11:11 UTC (permalink / raw)
  To: Wang Jian; +Cc: jamal, netdev

> > I understand all of your arguments but I would like to, if possible,
> > avoid to add yet another quite specific qdisc which could have been
> > implemented in a generic way for everyone to use. Your FRG basically
> > does what the alloctor + classifier + action + qdiscs can do but it
> > is orientied at one specific use case.
> 
> Agree. I will maintain my code out of kernel tree, in case someone feels
> my frg is usefull for his/her special case.

Definitely, I think your work is very valuable and if the generic
approach doesn't work out I'll be all for including your work. Well,
maybe move the flow id generation into an action but leave the class
allocation in frg. 

> The other disadvantage is that the dynamic classid used is not predicted
> by user space, it is very tricky for user space to handle the classid
> allocation then (I mean for the management system like web management).
> That is what I try to avoid so far.

Good point, we can make the handle of the allocated classes/qdiscs
be derived from the flow id we get and additionaly broadcast any
change in the qdisc/class management to userspace for it to keep
track. Let me develop some more thoughts on this and get back to you.

> I agree to this. But I must iterate one thing: the design should be
> friendly to user space management for a heavy usage.

Agreed.

I had another look at your code and noticed the GFP_KERNEL masked
kmalloc call in perflow_new_flow() called from perflow_enqueue(),
you must change that to GFP_ATOMIC due to softirq context.

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [RFC] QoS: frg queue (was [RFC] QoS: new per flow queue)
  2005-04-22 11:11                           ` Thomas Graf
@ 2005-04-22 12:04                             ` Wang Jian
  0 siblings, 0 replies; 17+ messages in thread
From: Wang Jian @ 2005-04-22 12:04 UTC (permalink / raw)
  To: Thomas Graf; +Cc: jamal, netdev

Hi Thomas Graf,


On Fri, 22 Apr 2005 13:11:00 +0200, Thomas Graf <tgraf@suug.ch> wrote:

> 
> I had another look at your code and noticed the GFP_KERNEL masked
> kmalloc call in perflow_new_flow() called from perflow_enqueue(),
> you must change that to GFP_ATOMIC due to softirq context.

Acked. I will prepare another patch and put it online later.


-- 
  lark

^ permalink raw reply	[flat|nested] 17+ messages in thread

end of thread, other threads:[~2005-04-22 12:04 UTC | newest]

Thread overview: 17+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2005-04-05 15:25 [RFC] QoS: new per flow queue Wang Jian
2005-04-05 17:57 ` jamal
2005-04-06  5:12   ` Wang Jian
2005-04-06 12:12     ` jamal
2005-04-06 13:45       ` Wang Jian
2005-04-07 11:06         ` jamal
2005-04-07 13:14           ` Wang Jian
2005-04-08 12:43             ` jamal
2005-04-13  5:45               ` [RFC] QoS: frg queue (was [RFC] QoS: new per flow queue) Wang Jian
2005-04-18 13:14                 ` jamal
2005-04-18 14:50                   ` Thomas Graf
2005-04-18 18:01                     ` Wang Jian
2005-04-18 18:40                       ` Thomas Graf
2005-04-22  4:11                         ` Wang Jian
2005-04-22 11:11                           ` Thomas Graf
2005-04-22 12:04                             ` Wang Jian
2005-04-18 16:01                   ` Wang Jian

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).