All of lore.kernel.org
 help / color / mirror / Atom feed
From: Patrick McHardy <kaber@trash.net>
To: lartc@vger.kernel.org
Subject: Re: [LARTC] Re: Blue and SFB
Date: Sat, 06 Mar 2004 15:59:36 +0000	[thread overview]
Message-ID: <4049F568.90404@trash.net> (raw)
In-Reply-To: <87ishkzpf5.fsf@iki.fi>

[-- Attachment #1: Type: text/plain, Size: 1245 bytes --]

Nuutti Kotivuori wrote:
> Patrick McHardy wrote:
> 
>>I've completed the port and tested it yesterday, unfortunately it's
>>not useable in the real world as is. There is a strong bias against
>>non-ECN flows because their packets are simply dropped instead of
>>marked. At high load (50 ECN vs. 1 non-ECN flow) and a marking
>>probability of about 10% the non-ECN flow simply stalls.
>>I can send you the patch if you're interested ..
> 
> 
> Thank you, I am really interested.

Patch is attached. Use the original tc patch.

> 
> I will try how it behaves for me in various circumstances.
> 
> What you say, though, is probably true - and the situation is even
> more accentuated when considering that different TCP stacks react to
> ECN and packet drops differently - a single drop percent will not be
> enough.

OTOH, with only ECN flows it works great, not a single packet
drop after a couple of minutes, without BLUE there were multiple
thousands.

> Which ofcourse brings us to SFB - with Stochastic Fair Blue, the drop
> percentage for the non-ECN flow should be significantly lower and the
> connections should transfer more or less fairly.

I'm going to check this out, don't know anything about it.

Regards
Patrick

> 
> -- Naked
> 


[-- Attachment #2: blue.diff --]
[-- Type: text/x-patch, Size: 15059 bytes --]

# This is a BitKeeper generated diff -Nru style patch.
#
# ChangeSet
#   2004/03/06 16:57:42+01:00 kaber@trash.net 
#   Add BLUE scheduler
# 
# net/sched/sch_blue.c
#   2004/03/06 16:57:35+01:00 kaber@trash.net +453 -0
# 
# net/sched/sch_blue.c
#   2004/03/06 16:57:35+01:00 kaber@trash.net +0 -0
#   BitKeeper file /home/kaber/src/blue/linux-2.6/net/sched/sch_blue.c
# 
# net/sched/Makefile
#   2004/03/06 16:57:35+01:00 kaber@trash.net +1 -0
#   Add BLUE scheduler
# 
# net/sched/Kconfig
#   2004/03/06 16:57:35+01:00 kaber@trash.net +19 -0
#   Add BLUE scheduler
# 
# include/linux/pkt_sched.h
#   2004/03/06 16:57:35+01:00 kaber@trash.net +31 -0
#   Add BLUE scheduler
# 
diff -Nru a/include/linux/pkt_sched.h b/include/linux/pkt_sched.h
--- a/include/linux/pkt_sched.h	Sat Mar  6 16:58:06 2004
+++ b/include/linux/pkt_sched.h	Sat Mar  6 16:58:06 2004
@@ -321,6 +321,37 @@
 	TCA_HFSC_MAX = TCA_HFSC_USC
 };
 
+/* BLUE section */
+
+enum
+{
+	TCA_BLUE_UNSPEC,
+	TCA_BLUE_PARMS,
+};
+
+struct tc_blue_qopt
+{
+	__u32	limit;		/* Limit on queue length, bytes */
+	__s32	freeze_time;	/* Minimum time between Pmark updates, usec! */
+	__s32	pmark_init;	/* Initial Pmark */
+	__s32	pmark_inc;	/* Pmark increment step */
+	__s32	pmark_dec;	/* Pmark decrement step */
+	__s32	pmark_max;	/* Pmark never exceeds this maximum */
+	__u32	flags;
+};
+
+/* Flags: */
+#define TC_BLUE_ECN	1	/* Do ECN marking, if ECT is set in the packet */
+
+struct tc_blue_xstats
+{
+	__s32	pmark;		/* Current Pmark */
+	__u32	marked;		/* Marked packets */
+	__u32	early_drops;	/* Early drops */
+	__u32	limit_drops;	/* Drops due to queue limits */
+	__u32	other_drops;	/* Drops due to drop() calls */
+};
+
 /* CBQ section */
 
 #define TC_CBQ_MAXPRIO		8
diff -Nru a/net/sched/Kconfig b/net/sched/Kconfig
--- a/net/sched/Kconfig	Sat Mar  6 16:58:06 2004
+++ b/net/sched/Kconfig	Sat Mar  6 16:58:06 2004
@@ -101,6 +101,25 @@
 	  To compile this code as a module, choose M here: the
 	  module will be called sch_red.
 
+config NET_SCH_BLUE
+	tristate "BLUE queue"
+	depends on NET_SCHED
+	help
+	  Say Y here if you want to use the BLUE queue management algorithm
+	  for some of your network devices. BLUE is similar to RED as it
+	  randomly ECN-marks or drops packets, but uses a different approach
+	  to select the packets to be marked/dropped, which is intended to
+	  perform better under high load.
+	  
+	  For details and references about BLUE see:
+
+	  - http://thefengs.com/wuchang/blue/
+	  - http://www.sch.bme.hu/~bartoki/projects/thesis/
+	  - The top of net/sched/sch_blue.c
+
+	  To compile this code as a module, choose M here: the
+	  module will be called sch_blue.
+
 config NET_SCH_SFQ
 	tristate "SFQ queue"
 	depends on NET_SCHED
diff -Nru a/net/sched/Makefile b/net/sched/Makefile
--- a/net/sched/Makefile	Sat Mar  6 16:58:06 2004
+++ b/net/sched/Makefile	Sat Mar  6 16:58:06 2004
@@ -15,6 +15,7 @@
 obj-$(CONFIG_NET_SCH_HFSC)	+= sch_hfsc.o
 obj-$(CONFIG_NET_SCH_RED)	+= sch_red.o
 obj-$(CONFIG_NET_SCH_GRED)	+= sch_gred.o
+obj-$(CONFIG_NET_SCH_BLUE)	+= sch_blue.o
 obj-$(CONFIG_NET_SCH_INGRESS)	+= sch_ingress.o 
 obj-$(CONFIG_NET_SCH_DSMARK)	+= sch_dsmark.o
 obj-$(CONFIG_NET_SCH_SFQ)	+= sch_sfq.o
diff -Nru a/net/sched/sch_blue.c b/net/sched/sch_blue.c
--- /dev/null	Wed Dec 31 16:00:00 1969
+++ b/net/sched/sch_blue.c	Sat Mar  6 16:58:06 2004
@@ -0,0 +1,453 @@
+/*
+ * net/sched/sch_blue.c	BLUE queue.
+ *
+ *	This program is free software; you can redistribute it and/or
+ *	modify it under the terms of the GNU General Public License
+ *	as published by the Free Software Foundation; either version
+ *	2 of the License, or (at your option) any later version.
+ *
+ * Author:	Bartok Istvan, <bartoki@sch.bme.hu> <stefan.a.bartok@telia.se>
+ * Credits:	Based on sch_red.c written by Alexey Kuznetsov and others
+ *
+ * Changelog:
+ *
+ * Bartoki, 2001-Apr-10 - Initial version.
+ * Bartoki, 2001-Apr-11	- Added displaying Pmark in the statistics.
+ * 			- Added some moduletesting code (not to be included
+ * 			  in the final distribution).
+ * Bartoki, 2001-Apr-24 - Added compile-time setting of algorithm variations.
+ * Bartoki, 2001-May-24 - Added initial Pmark to the parameters
+ * kaber,   2004-Mar-05 - Ported to 2.6, IPv6 ECN marking bug fixed,
+ * 			  style changes
+ */
+
+#include <linux/kernel.h>
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/errno.h>
+#include <linux/init.h>
+#include <linux/netdevice.h>
+#include <linux/skbuff.h>
+#include <linux/pkt_sched.h>
+#include <net/pkt_sched.h>
+#include <net/inet_ecn.h>
+#include <net/ip.h>
+
+/*	The BLUE algorithm:
+	===================
+
+	Wu-chang Feng, Dilip D. Kandlur, Debanjan Saha, Kang G. Shin
+	"Blue: A New Class of Active Queue Management Algorithms"
+	U. Michigan CSE-TR-387-99, April 1999.
+
+	http://thefengs.com/wuchang/work/CSE-TR-387-99.pdf
+	http://thefengs.com/wuchang/blue/
+
+	This file codes the basic version of the algorithm as written down
+	on Page.5, Fig.3 of the paper. (Without Qave calculation)
+
+	From the paper:
+
+	"BLUE maintains a single probability Pm, which it uses to mark
+	(or drop) packets when they are enqueued. If the queue is continually
+	dropping packets due to queue overflow, BLUE increments Pm, thus
+	increasing the rate at which it sends back congestion notification.
+	Conversely, if the queue becomes empty or if the link is idle, BLUE
+	decreases its marking probability. This effectively allows BLUE to
+	'learn' the correct rate it needs to send back congestion
+	notification."
+
+	Upon packet loss:	if ( (now-last_update) > freeze_time ) {
+					pm += increment;
+					last_update = now;
+				}
+
+	Upon link idle event:	if ( (now-last_update) > freeze_time ) {
+					pm -= decrement;
+					last_update = now;
+				}
+
+
+	Parameters, settable by user:
+	-----------------------------
+
+	limit		- BYTES - Limit on queue length: BLUE will not enqueue
+			new packets while its backlog is greater than limit.
+			Note that backlog can exceed limit when a packet
+			gets enqueued for a  backlog > (limit-MTU)  queue,
+			or when one gets requeued.
+
+	pmark_init	- see include/linux/pkt_sched.h
+	freeze_time
+	pmark_inc
+	pmark_dec
+	pmark_max
+	flags		- For now, only one flag:
+
+			'ecn' - if set, tries to ECN-mark packets instead of
+			early dropping (but drops if the packet is not
+			ECN-capable). As sch_red does, this module can ECN-mark
+			even if the kernel was compiled without ECN-support.
+
+
+	Implementation:
+	---------------
+
+	Fractional fixed point format is used to represent Pmark:
+
+	                   sign 1/2 1/8
+	                      | |   |
+	Float 0.625 will be:  0.1 0 1 0 0 0 0...[all 0s] = 0x50000000
+	                       |  |
+	            binary point  1/4
+
+	As only the [0.0, 1.0] range is used (1.0 is represented as
+	0x7FFFFFFF = ~0.9999999995 for i386) this is an easy way do detect
+	over/underflows.
+
+	The queue is declared empty/idle when the dequeue function is called
+	with an empty queue. Note that the network device is not neccessarily
+	idle at that moment, but we could not get any much further with
+	estimating when is the end of the transmission as the hardware or
+	the device driver can have an internal buffer of few packets (to enable
+	end-to-end transmits) anyway.
+
+	For further details on design and performance analysis see:
+	http://www.sch.bme.hu/~bartoki/projects/thesis/
+
+*/
+
+
+/* 1 - Single last_update
+ * 0 - Separated timestamps for increase and decrease
+ */
+#define	BLUE_SINGLE_UPDATE_TIME	0
+
+/* 1 - Try increase Pmark not just on a tail-drop event, but also when
+ *     an enqueue occurs to a more then half-filled queue
+ * 0 - Try increase Pmark only on tail-drop
+ */
+#define BLUE_INCREASE_ON_HALFQ	1
+
+
+struct blue_sched_data
+{
+	struct tc_blue_qopt	qopt;	/* Parameters */
+	struct tc_blue_xstats	xstats;	/* BLUE-specific statistics */
+
+	int		pmark;		/* Packet marking probability */
+
+#if BLUE_SINGLE_UPDATE_TIME
+	psched_time_t	last_update;	/* Timestamp of the last pmark update */
+#else
+	psched_time_t	last_inc;	/* Timestamp of the last pmark increase */
+	psched_time_t	last_dec;	/* Timestamp of the last pmark decrease */
+#endif
+
+};
+
+
+/*
+ * Increments pmark
+ * Assumes 0 <= pmark and 0 <= pmark_inc
+ */
+static inline void blue_inc(struct blue_sched_data *q)
+{
+	q->pmark += q->qopt.pmark_inc;
+
+	/* On overflow or exceeding max use pmark_max: */
+	if (q->pmark < 0 || q->pmark > q->qopt.pmark_max)
+		q->pmark = q->qopt.pmark_max;
+}
+
+/*
+ * Increments pmark, but only if ((now - last_update) >= freeze_time)
+ */
+static inline void blue_try_inc(struct blue_sched_data *q)
+{
+	psched_time_t now;
+	psched_tdiff_t diff;
+
+	PSCHED_GET_TIME(now);
+
+#if BLUE_SINGLE_UPDATE_TIME
+	diff = PSCHED_TDIFF_SAFE(now, q->last_update, q->qopt.freeze_time, 0);
+#else
+	diff = PSCHED_TDIFF_SAFE(now, q->last_inc, q->qopt.freeze_time, 0);
+#endif
+
+	/*
+	 * The time difference is plain long, it will wrap often on 32-bit
+	 * architectures, so even if (last_update < now) is true, the diff
+	 * can be negative.
+	 *
+	 * NOTE: this code still won't increase when:
+	 *	now = last_update + k*MAX_LONG + d
+	 *	where 0 <= d < freeze_time
+	 */
+	if (diff >= q->qopt.freeze_time || diff < 0) {
+		blue_inc(q);
+#if BLUE_SINGLE_UPDATE_TIME
+		q->last_update = now;
+#else
+		q->last_inc = now;
+#endif
+	}
+}
+
+/*
+ * Decrements pmark
+ * Assumes 0 <= pmark and 0 <= pmark_dec
+ */
+static inline void blue_dec(struct blue_sched_data *q)
+{
+	q->pmark -= q->qopt.pmark_dec;
+
+	/* On underflow use 0 */
+	if (q->pmark < 0)
+		q->pmark = 0;
+}
+
+/*
+ * Decrements pmark, but only if ((now - last_update) >= freeze_time)
+ */
+static inline void blue_try_dec(struct blue_sched_data *q)
+{
+	psched_time_t now;
+	psched_tdiff_t diff;
+
+	PSCHED_GET_TIME(now);
+
+#if BLUE_SINGLE_UPDATE_TIME
+	diff = PSCHED_TDIFF_SAFE(now, q->last_update, q->qopt.freeze_time, 0);
+#else
+	diff = PSCHED_TDIFF_SAFE(now, q->last_dec, q->qopt.freeze_time, 0);
+#endif
+
+	/*
+	 * For diff < 0 see the comment at the same place in blue_try_inc()
+	 */
+	if (diff >= q->qopt.freeze_time || diff < 0) {
+		blue_dec(q);
+#if BLUE_SINGLE_UPDATE_TIME
+		q->last_update = now;
+#else
+		q->last_dec = now;
+#endif
+	}
+}
+
+/*
+ * Tries to ECN-mark the packet.
+ * Returns:	1 - success: marked now, or was marked already
+ *		0 - not marked: ECT bit is bot set in the packet,
+ *		    or not an IP packet
+ */
+static int blue_ecn_mark(struct sk_buff *skb)
+{
+	if (skb->nh.raw + 20 > skb->tail)
+		return 0;
+
+	switch (skb->protocol) {
+	case __constant_htons(ETH_P_IP):
+		if (!INET_ECN_is_capable(skb->nh.iph->tos))
+			return 0;
+		if (INET_ECN_is_not_ce(skb->nh.iph->tos))
+			IP_ECN_set_ce(skb->nh.iph);
+		return 1;
+	case __constant_htons(ETH_P_IPV6):
+		if (!INET_ECN_is_capable(ip6_get_dsfield(skb->nh.ipv6h)))
+			return 0;
+		IP6_ECN_set_ce(skb->nh.ipv6h);
+		return 1;
+	default:
+		return 0;
+	}
+}
+
+static int
+blue_enqueue(struct sk_buff *skb, struct Qdisc *sch)
+{
+	struct blue_sched_data *q = (struct blue_sched_data *)sch->data;
+	
+	/* Try to mark or early drop with probability pmark: */
+	/* Operator < in the comparision allows true 0.0 and 1.0 probability */
+
+	if (((u32)net_random() >> 1) < q->pmark) {
+		goto mark;
+	} else {
+		goto enqueue;
+	}
+
+enqueue:
+	if (sch->stats.backlog <= q->qopt.limit) {
+
+#if BLUE_INCREASE_ON_HALFQ
+		if (sch->stats.backlog >= q->qopt.limit/2)
+			blue_try_inc(q);
+#endif
+
+		__skb_queue_tail(&sch->q, skb);
+		sch->stats.backlog += skb->len;
+		sch->stats.bytes += skb->len;
+		sch->stats.packets++;
+
+		return NET_XMIT_SUCCESS;
+	} else {
+		/* No space left, try to increment pmark and drop the packet */
+		blue_try_inc(q);
+		q->xstats.limit_drops++;
+		goto drop;
+	}
+
+mark:
+	sch->stats.overlimits++;
+	if ((q->qopt.flags & TC_BLUE_ECN) && blue_ecn_mark(skb)) {
+		q->xstats.marked++;
+		goto enqueue;
+	} else {
+		q->xstats.early_drops++;
+		goto drop;
+	}
+
+drop:
+	kfree_skb(skb);
+	sch->stats.drops++;
+	return NET_XMIT_CN;
+}
+
+static int
+blue_requeue(struct sk_buff *skb, struct Qdisc *sch)
+{
+	__skb_queue_head(&sch->q, skb);
+	sch->stats.backlog += skb->len;
+	return NET_XMIT_SUCCESS;
+}
+
+static struct sk_buff *
+blue_dequeue(struct Qdisc *sch)
+{
+	struct blue_sched_data *q = (struct blue_sched_data *)sch->data;
+	struct sk_buff *skb;
+
+	skb = __skb_dequeue(&sch->q);
+	if (skb) {
+		sch->stats.backlog -= skb->len;
+		return skb;
+	} else {
+		/* Queue empty, try to decrement pmark */
+		blue_try_dec(q);
+		return NULL;
+	}
+}
+
+static unsigned int
+blue_drop(struct Qdisc *sch)
+{
+	struct blue_sched_data *q = (struct blue_sched_data *)sch->data;
+	struct sk_buff *skb;
+	unsigned int len;
+
+	skb = __skb_dequeue_tail(&sch->q);
+	if (skb) {
+		sch->stats.backlog -= skb->len;
+		sch->stats.drops++;
+		q->xstats.other_drops++;
+		len = skb->len;
+		kfree_skb(skb);
+		return len;
+	} else {
+		return 0;
+	}
+}
+
+static void
+blue_reset(struct Qdisc *sch)
+{
+	struct blue_sched_data *q = (struct blue_sched_data *)sch->data;
+
+	__skb_queue_purge(&sch->q);
+	sch->stats.backlog = 0;
+	q->pmark = q->qopt.pmark_init;
+}
+
+static int
+blue_init(struct Qdisc *sch, struct rtattr *opt)
+{
+	struct blue_sched_data *q = (struct blue_sched_data *)sch->data;
+	struct rtattr *tb[TCA_BLUE_PARMS];
+	struct tc_blue_qopt *ctl;
+
+	if (opt == NULL
+	    || rtattr_parse(tb, TCA_BLUE_PARMS, RTA_DATA(opt), RTA_PAYLOAD(opt))
+	    || tb[TCA_BLUE_PARMS-1] == 0
+	    || RTA_PAYLOAD(tb[TCA_BLUE_PARMS-1]) < sizeof(*ctl))
+		return -EINVAL;
+	ctl = RTA_DATA(tb[TCA_BLUE_PARMS-1]);
+
+	sch_tree_lock(sch);
+	q->qopt = *ctl;
+	q->pmark = q->qopt.pmark_init;
+	sch_tree_unlock(sch);
+	return 0;
+}
+
+static int blue_dump_xstats(struct blue_sched_data *q, struct sk_buff *skb)
+{
+	q->xstats.pmark = q->pmark;
+        RTA_PUT(skb, TCA_XSTATS, sizeof(q->xstats), &q->xstats);
+        return 0;
+
+rtattr_failure:
+        return 1;
+}
+
+static int
+blue_dump(struct Qdisc *sch, struct sk_buff *skb)
+{
+	struct blue_sched_data *q = (struct blue_sched_data *)sch->data;
+	unsigned char *b = skb->tail;
+	struct rtattr *rta = (struct rtattr *)b;
+
+	RTA_PUT(skb, TCA_OPTIONS, 0, NULL);
+	RTA_PUT(skb, TCA_BLUE_PARMS, sizeof(q->qopt), &q->qopt);
+	rta->rta_len = skb->tail - b;
+
+	if (blue_dump_xstats(q, skb))
+		goto rtattr_failure;
+	return skb->len;
+
+rtattr_failure:
+	skb_trim(skb, b - skb->data);
+	return -1;
+}
+
+struct Qdisc_ops blue_qdisc_ops =
+{
+	.id		= "blue",
+	.init		= blue_init,
+	.change		= blue_init,
+	.reset		= blue_reset,
+	.dump		= blue_dump,
+	.enqueue	= blue_enqueue,
+	.dequeue	= blue_dequeue,
+	.requeue	= blue_requeue,
+	.drop		= blue_drop,
+	.priv_size	= sizeof(struct blue_sched_data),
+	.owner		= THIS_MODULE,
+};
+
+static int __init init_blue(void)
+{
+	return register_qdisc(&blue_qdisc_ops);
+}
+
+static void __exit exit_blue(void)
+{
+	unregister_qdisc(&blue_qdisc_ops);
+}
+
+MODULE_LICENSE("GPL");
+module_init(init_blue);
+module_exit(exit_blue);

      parent reply	other threads:[~2004-03-06 15:59 UTC|newest]

Thread overview: 5+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2004-03-04 13:56 [LARTC] Re: Blue and SFB Nuutti Kotivuori
2004-03-04 14:05 ` Patrick McHardy
2004-03-06 15:09 ` Patrick McHardy
2004-03-06 15:33 ` Nuutti Kotivuori
2004-03-06 15:59 ` Patrick McHardy [this message]

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=4049F568.90404@trash.net \
    --to=kaber@trash.net \
    --cc=lartc@vger.kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.