* [RFC/PATCH] IMQ port to 2.6
@ 2004-01-25 15:24 Marcel Sebek
  2004-01-25 16:44 ` Tomas Szepe
  2004-01-25 19:25 ` David S. Miller
  0 siblings, 2 replies; 29+ messages in thread
From: Marcel Sebek @ 2004-01-25 15:24 UTC
  To: linux-kernel; +Cc: netdev

I have ported the IMQ driver from 2.4 to 2.6.2-rc1.

The original version is from http://trash.net/~kaber/imq/.
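For reference, a rough usage sketch (untested; device numbers and rates
are made up, and the matching userspace iptables IMQ target is assumed
to be installed) looks like this:

# send forwarded traffic through imq0 and shape it there
iptables -t mangle -A FORWARD -i eth0 -j IMQ --todev 0
ip link set imq0 up
tc qdisc add dev imq0 root handle 10: htb default 1
tc class add dev imq0 parent 10: classid 10:1 htb rate 1Mbit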


diff -urN linux-2.6.orig/drivers/net/Kconfig linux-2.6.new/drivers/net/Kconfig
--- linux-2.6.orig/drivers/net/Kconfig	2004-01-21 19:33:36.000000000 +0100
+++ linux-2.6.new/drivers/net/Kconfig	2004-01-25 15:08:20.000000000 +0100
@@ -85,6 +85,20 @@
 	  To compile this driver as a module, choose M here: the module
 	  will be called eql.  If unsure, say N.
 
+config IMQ
+	tristate "IMQ (intermediate queueing device) support"
+	depends on NETDEVICES && NETFILTER
+	---help---
+	  The imq device(s) is used as a placeholder for QoS queueing disciplines.
+	  Every packet entering/leaving the IP stack can be directed through
+	  the imq device, where it is enqueued/dequeued to the attached qdisc.
+	  This allows you to treat network devices as classes and distribute
+	  bandwidth among them. Iptables is used to specify through which imq
+	  device, if any, packets travel.
+
+	  To compile this driver as a module, choose M here: the module
+	  will be called imq.  If unsure, say N.
+
 config TUN
 	tristate "Universal TUN/TAP device driver support"
 	depends on NETDEVICES
diff -urN linux-2.6.orig/drivers/net/Makefile linux-2.6.new/drivers/net/Makefile
--- linux-2.6.orig/drivers/net/Makefile	2004-01-21 19:33:36.000000000 +0100
+++ linux-2.6.new/drivers/net/Makefile	2004-01-25 15:08:20.000000000 +0100
@@ -110,6 +110,7 @@
 endif
 
 obj-$(CONFIG_DUMMY) += dummy.o
+obj-$(CONFIG_IMQ) += imq.o
 obj-$(CONFIG_DE600) += de600.o
 obj-$(CONFIG_DE620) += de620.o
 obj-$(CONFIG_AT1500) += lance.o
diff -urN linux-2.6.orig/drivers/net/imq.c linux-2.6.new/drivers/net/imq.c
--- linux-2.6.orig/drivers/net/imq.c	1970-01-01 01:00:00.000000000 +0100
+++ linux-2.6.new/drivers/net/imq.c	2004-01-25 15:08:51.000000000 +0100
@@ -0,0 +1,329 @@
+/*
+ *             Pseudo-driver for the intermediate queue device.
+ *
+ *             This program is free software; you can redistribute it and/or
+ *             modify it under the terms of the GNU General Public License
+ *             as published by the Free Software Foundation; either version
+ *             2 of the License, or (at your option) any later version.
+ *
+ * Authors:    Patrick McHardy, <kaber@trash.net>
+ *
+ * 	       The first version was written by Martin Devera, <devik@cdi.cz>
+ *
+ * Credits:    Jan Rafaj <imq2t@cedric.vabo.cz>
+ *              - Update patch to 2.4.21
+ *             Sebastian Strollo <sstrollo@nortelnetworks.com>
+ *              - Fix "Dead-loop on netdevice imq"-issue
+ *             Marcel Sebek <sebek64@post.cz>
+ *              - Update to 2.6.2-rc1
+ */
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/moduleparam.h>
+#include <linux/skbuff.h>
+#include <linux/netdevice.h>
+#include <linux/rtnetlink.h>
+#include <linux/if_arp.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv4.h>
+#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
+#include <linux/netfilter_ipv6.h>
+#endif
+#include <linux/imq.h>
+#include <net/pkt_sched.h>
+
+static nf_hookfn imq_nf_hook;
+
+static struct nf_hook_ops imq_ingress_ipv4 = {
+	.hook		= imq_nf_hook,
+	.owner		= THIS_MODULE,
+	.pf		= PF_INET,
+	.hooknum	= NF_IP_PRE_ROUTING,
+	.priority	= NF_IP_PRI_MANGLE + 1
+};
+
+static struct nf_hook_ops imq_egress_ipv4 = {
+	.hook		= imq_nf_hook,
+	.owner		= THIS_MODULE,
+	.pf		= PF_INET,
+	.hooknum	= NF_IP_POST_ROUTING,
+	.priority	= NF_IP_PRI_LAST
+};
+
+#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
+static struct nf_hook_ops imq_ingress_ipv6 = {
+	.hook		= imq_nf_hook,
+	.owner		= THIS_MODULE,
+	.pf		= PF_INET6,
+	.hooknum	= NF_IP6_PRE_ROUTING,
+	.priority	= NF_IP6_PRI_MANGLE + 1
+};
+
+static struct nf_hook_ops imq_egress_ipv6 = {
+	.hook		= imq_nf_hook,
+	.owner		= THIS_MODULE,
+	.pf		= PF_INET6,
+	.hooknum	= NF_IP6_POST_ROUTING,
+	.priority	= NF_IP6_PRI_LAST
+};
+#endif
+
+static unsigned int numdevs = 2;
+
+module_param(numdevs, uint, 0);
+
+static struct net_device *imq_devs;
+
+
+static struct net_device_stats *imq_get_stats(struct net_device *dev)
+{
+	return (struct net_device_stats *)dev->priv;
+}
+
+/* called for packets kfree'd in qdiscs at places other than enqueue */
+static void imq_skb_destructor(struct sk_buff *skb)
+{
+	struct nf_info *info = skb->nf_info;
+
+	if (info) {
+		if (info->indev)
+			dev_put(info->indev);
+		if (info->outdev)
+			dev_put(info->outdev);
+		kfree(info);
+	}
+}
+
+static int imq_dev_xmit(struct sk_buff *skb, struct net_device *dev)
+{
+	struct net_device_stats *stats = (struct net_device_stats*) dev->priv;
+
+	stats->tx_bytes += skb->len;
+	stats->tx_packets++;
+
+	skb->imq_flags = 0;
+	skb->destructor = NULL;
+
+	dev->trans_start = jiffies;
+	nf_reinject(skb, skb->nf_info, NF_ACCEPT);
+	return 0;
+}
+
+static int imq_nf_queue(struct sk_buff *skb, struct nf_info *info,
+			void *data)
+{
+	struct net_device *dev;
+	struct net_device_stats *stats;
+	struct sk_buff *skb2 = NULL;
+	struct Qdisc *q;
+	unsigned int index = skb->imq_flags&IMQ_F_IFMASK;
+	int ret = -1;
+
+	if (index > numdevs) 
+		return -1;
+
+	dev = imq_devs + index;
+	if (!(dev->flags & IFF_UP)) {
+		skb->imq_flags = 0;
+		nf_reinject(skb, info, NF_ACCEPT);
+		return 0;
+	}
+	dev->last_rx = jiffies;
+
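+	/* an skb that already carries a destructor (e.g. socket memory
+	 * accounting) is cloned instead, since we may need to install our
+	 * own destructor on it below; the original is freed afterwards. */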
+	if (skb->destructor) {
+		skb2 = skb;
+		skb = skb_clone(skb, GFP_ATOMIC);
+		if (!skb)
+			return -1;
+	}
+	skb->nf_info = info;
+
+	stats = (struct net_device_stats *)dev->priv;
+	stats->rx_bytes+= skb->len;
+	stats->rx_packets++;
+
+	spin_lock_bh(&dev->queue_lock);
+	q = dev->qdisc;
+	if (q->enqueue) {
+		q->enqueue(skb_get(skb), q);
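+		/* enqueue with an extra reference (skb_get); if the qdisc
+		 * kept the skb it is now shared: set the destructor so
+		 * nf_info gets cleaned up, and drop our reference. */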
+		if (skb_shared(skb)) {
+			skb->destructor = imq_skb_destructor;
+			kfree_skb(skb);
+			ret = 0;
+		}
+	}
+	if (spin_is_locked(&dev->xmit_lock))
+		netif_schedule(dev);
+	else
+		qdisc_run(dev);
+	spin_unlock_bh(&dev->queue_lock);
+
+	if (skb2)
+		kfree_skb(ret ? skb : skb2);
+
+	return ret;
+}
+
+static unsigned int imq_nf_hook(unsigned int hook, struct sk_buff **pskb,
+		   const struct net_device *indev,
+		   const struct net_device *outdev,
+		   int (*okfn)(struct sk_buff *))
+{
+	if ((*pskb)->imq_flags & IMQ_F_ENQUEUE)
+		return NF_QUEUE;
+
+	return NF_ACCEPT;
+}
+
+
+static int __init imq_init_hooks(void)
+{
+	int err;
+
+	if ((err = nf_register_queue_handler(PF_INET, imq_nf_queue, NULL)))
+		goto err1;
+	if ((err = nf_register_hook(&imq_ingress_ipv4)))
+		goto err2;
+	if ((err = nf_register_hook(&imq_egress_ipv4)))
+		goto err3;
+#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
+	if ((err = nf_register_queue_handler(PF_INET6, imq_nf_queue, NULL)))
+		goto err4;
+	if ((err = nf_register_hook(&imq_ingress_ipv6)))
+		goto err5;
+	if ((err = nf_register_hook(&imq_egress_ipv6)))
+		goto err6;
+#endif
+
+	return 0;
+
+#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
+err6:
+	nf_unregister_hook(&imq_ingress_ipv6);
+err5:
+	nf_unregister_queue_handler(PF_INET6);
+err4:
+	nf_unregister_hook(&imq_egress_ipv4);
+#endif
+err3:
+	nf_unregister_hook(&imq_ingress_ipv4);
+err2:
+	nf_unregister_queue_handler(PF_INET);
+err1:
+	return err;
+}
+
+static void __exit imq_unhook(void)
+{
+	nf_unregister_hook(&imq_ingress_ipv4);
+	nf_unregister_hook(&imq_egress_ipv4);
+	nf_unregister_queue_handler(PF_INET);
+#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
+	nf_unregister_hook(&imq_ingress_ipv6);
+	nf_unregister_hook(&imq_egress_ipv6);
+	nf_unregister_queue_handler(PF_INET6);
+#endif
+}
+
+static int __init imq_dev_init(struct net_device *dev)
+{
+	dev->hard_start_xmit	= imq_dev_xmit;
+	dev->type		= ARPHRD_VOID;
+	dev->mtu		= 1500;
+	dev->tx_queue_len	= 30;
+	dev->flags		= IFF_NOARP;
+	dev->priv = kmalloc(sizeof(struct net_device_stats), GFP_KERNEL);
+	if (dev->priv == NULL)
+		return -ENOMEM;
+	memset(dev->priv, 0, sizeof(struct net_device_stats));
+	dev->get_stats		= imq_get_stats;
+
+	return 0;
+}
+
+static void imq_dev_uninit(struct net_device *dev)
+{
+	kfree(dev->priv);
+}
+
+static int __init imq_init_devs(void)
+{
+	struct net_device *dev;
+	int i;
+
+	if (!numdevs || numdevs > IMQ_MAX_DEVS) {
+		printk(KERN_ERR "numdevs has to be between 1 and %u\n",
+		       IMQ_MAX_DEVS);
+		return -EINVAL;
+	}
+
+	imq_devs = kmalloc(sizeof(struct net_device) * numdevs, GFP_KERNEL);
+	if (!imq_devs)
+		return -ENOMEM;
+	memset(imq_devs, 0, sizeof(struct net_device) * numdevs);
+
+	/* we start counting at zero */
+	numdevs--;
+
+	for (i = 0, dev = imq_devs; i <= numdevs; i++, dev++) {
+		SET_MODULE_OWNER(dev);
+		strcpy(dev->name, "imq%d");
+		dev->init   = imq_dev_init;
+		dev->uninit = imq_dev_uninit;
+
+		if (register_netdev(dev) < 0)
+			goto err_register;
+	}
+	return 0;
+
+err_register:
+	for (; i; i--)
+		unregister_netdev(--dev);
+	kfree(imq_devs);
+	return -EIO;
+}
+
+static void imq_cleanup_devs(void)
+{
+	int i;
+	struct net_device *dev = imq_devs;
+
+	for (i = 0; i <= numdevs; i++)
+		unregister_netdev(dev++);
+
+	kfree(imq_devs);
+}
+
+static int __init imq_init_module(void)
+{
+	int err;
+
+	if ((err = imq_init_devs()))
+		return err;
+	if ((err = imq_init_hooks())) {
+		imq_cleanup_devs();
+		return err;
+	}
+
+	printk(KERN_INFO "imq driver loaded.\n");
+
+	return 0;
+}
+
+static void __exit imq_cleanup_module(void)
+{
+	imq_unhook();
+	imq_cleanup_devs();
+}
+
+module_init(imq_init_module);
+module_exit(imq_cleanup_module);
+MODULE_LICENSE("GPL");
diff -urN linux-2.6.orig/include/linux/imq.h linux-2.6.new/include/linux/imq.h
--- linux-2.6.orig/include/linux/imq.h	1970-01-01 01:00:00.000000000 +0100
+++ linux-2.6.new/include/linux/imq.h	2004-01-25 15:08:20.000000000 +0100
@@ -0,0 +1,9 @@
+#ifndef _IMQ_H
+#define _IMQ_H
+
+#define IMQ_MAX_DEVS   16
+
+#define IMQ_F_IFMASK   0x7f
+#define IMQ_F_ENQUEUE  0x80
+
+#endif /* _IMQ_H */
diff -urN linux-2.6.orig/include/linux/netfilter_ipv4/ipt_IMQ.h linux-2.6.new/include/linux/netfilter_ipv4/ipt_IMQ.h
--- linux-2.6.orig/include/linux/netfilter_ipv4/ipt_IMQ.h	1970-01-01 01:00:00.000000000 +0100
+++ linux-2.6.new/include/linux/netfilter_ipv4/ipt_IMQ.h	2004-01-25 15:08:20.000000000 +0100
@@ -0,0 +1,8 @@
+#ifndef _IPT_IMQ_H
+#define _IPT_IMQ_H
+
+struct ipt_imq_info {
+	unsigned int todev;	/* target imq device */
+};
+
+#endif /* _IPT_IMQ_H */
diff -urN linux-2.6.orig/include/linux/netfilter_ipv6/ip6t_IMQ.h linux-2.6.new/include/linux/netfilter_ipv6/ip6t_IMQ.h
--- linux-2.6.orig/include/linux/netfilter_ipv6/ip6t_IMQ.h	1970-01-01 01:00:00.000000000 +0100
+++ linux-2.6.new/include/linux/netfilter_ipv6/ip6t_IMQ.h	2004-01-25 15:08:20.000000000 +0100
@@ -0,0 +1,8 @@
+#ifndef _IP6T_IMQ_H
+#define _IP6T_IMQ_H
+
+struct ip6t_imq_info {
+	unsigned int todev;	/* target imq device */
+};
+
+#endif /* _IP6T_IMQ_H */
diff -urN linux-2.6.orig/include/linux/skbuff.h linux-2.6.new/include/linux/skbuff.h
--- linux-2.6.orig/include/linux/skbuff.h	2004-01-10 14:02:40.000000000 +0100
+++ linux-2.6.new/include/linux/skbuff.h	2004-01-25 15:08:20.000000000 +0100
@@ -98,6 +98,10 @@
 	struct nf_conntrack *master;
 };
 
+#if defined(CONFIG_IMQ) || defined(CONFIG_IMQ_MODULE)
+struct nf_info;
+#endif
+
 #ifdef CONFIG_BRIDGE_NETFILTER
 struct nf_bridge_info {
 	atomic_t use;
@@ -246,6 +250,10 @@
         unsigned long		nfmark;
 	__u32			nfcache;
 	struct nf_ct_info	*nfct;
+#if defined(CONFIG_IMQ) || defined(CONFIG_IMQ_MODULE)
+	unsigned char		imq_flags;
+	struct nf_info		*nf_info;
+#endif
 #ifdef CONFIG_NETFILTER_DEBUG
         unsigned int		nf_debug;
 #endif
diff -urN linux-2.6.orig/net/core/skbuff.c linux-2.6.new/net/core/skbuff.c
--- linux-2.6.orig/net/core/skbuff.c	2003-11-25 16:58:45.000000000 +0100
+++ linux-2.6.new/net/core/skbuff.c	2004-01-25 15:08:20.000000000 +0100
@@ -313,6 +313,10 @@
 #ifdef CONFIG_NET_SCHED
 	C(tc_index);
 #endif
+#if defined(CONFIG_IMQ) || defined(CONFIG_IMQ_MODULE)
+	C(imq_flags);
+	C(nf_info);
+#endif
 	C(truesize);
 	atomic_set(&n->users, 1);
 	C(head);
@@ -357,6 +361,10 @@
 	new->nfcache	= old->nfcache;
 	new->nfct	= old->nfct;
 	nf_conntrack_get(old->nfct);
+#if defined(CONFIG_IMQ) || defined(CONFIG_IMQ_MODULE)
+	new->imq_flags	= old->imq_flags;
+	new->nf_info	= old->nf_info;
+#endif
 #ifdef CONFIG_NETFILTER_DEBUG
 	new->nf_debug	= old->nf_debug;
 #endif
diff -urN linux-2.6.orig/net/ipv4/netfilter/Kconfig linux-2.6.new/net/ipv4/netfilter/Kconfig
--- linux-2.6.orig/net/ipv4/netfilter/Kconfig	2004-01-21 19:34:33.000000000 +0100
+++ linux-2.6.new/net/ipv4/netfilter/Kconfig	2004-01-25 15:08:20.000000000 +0100
@@ -478,6 +478,15 @@
 
 	  To compile it as a module, choose M here.  If unsure, say N.
 
+config IP_NF_TARGET_IMQ
+	tristate "IMQ target support"
+	depends on IP_NF_MANGLE
+	help
+	  This option adds an `IMQ' target which is used to specify if and
+	  to which imq device packets should get enqueued/dequeued.
+
+	  To compile it as a module, choose M here.  If unsure, say N.
+
 config IP_NF_TARGET_LOG
 	tristate "LOG target support"
 	depends on IP_NF_IPTABLES
diff -urN linux-2.6.orig/net/ipv4/netfilter/Makefile linux-2.6.new/net/ipv4/netfilter/Makefile
--- linux-2.6.orig/net/ipv4/netfilter/Makefile	2003-09-10 16:09:48.000000000 +0200
+++ linux-2.6.new/net/ipv4/netfilter/Makefile	2004-01-25 15:08:21.000000000 +0100
@@ -72,6 +72,7 @@
 obj-$(CONFIG_IP_NF_TARGET_ECN) += ipt_ECN.o
 obj-$(CONFIG_IP_NF_TARGET_DSCP) += ipt_DSCP.o
 obj-$(CONFIG_IP_NF_TARGET_MARK) += ipt_MARK.o
+obj-$(CONFIG_IP_NF_TARGET_IMQ) += ipt_IMQ.o
 obj-$(CONFIG_IP_NF_TARGET_MASQUERADE) += ipt_MASQUERADE.o
 obj-$(CONFIG_IP_NF_TARGET_REDIRECT) += ipt_REDIRECT.o
 obj-$(CONFIG_IP_NF_TARGET_NETMAP) += ipt_NETMAP.o
diff -urN linux-2.6.orig/net/ipv4/netfilter/ipt_IMQ.c linux-2.6.new/net/ipv4/netfilter/ipt_IMQ.c
--- linux-2.6.orig/net/ipv4/netfilter/ipt_IMQ.c	1970-01-01 01:00:00.000000000 +0100
+++ linux-2.6.new/net/ipv4/netfilter/ipt_IMQ.c	2004-01-25 15:08:21.000000000 +0100
@@ -0,0 +1,78 @@
+/*
+ * This target marks packets to be enqueued to an imq device
+ */
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/netfilter_ipv4/ip_tables.h>
+#include <linux/netfilter_ipv4/ipt_IMQ.h>
+#include <linux/imq.h>
+
+static unsigned int imq_target(struct sk_buff **pskb,
+			       const struct net_device *in,
+			       const struct net_device *out,
+			       unsigned int hooknum,
+			       const void *targinfo,
+			       void *userdata)
+{
+	struct ipt_imq_info *mr = (struct ipt_imq_info*)targinfo;
+
+	(*pskb)->imq_flags = mr->todev | IMQ_F_ENQUEUE;
+	(*pskb)->nfcache |= NFC_ALTERED;
+
+	return IPT_CONTINUE;
+}
+
+static int imq_checkentry(const char *tablename,
+			  const struct ipt_entry *e,
+			  void *targinfo,
+			  unsigned int targinfosize,
+			  unsigned int hook_mask)
+{
+	struct ipt_imq_info *mr;
+
+	if (targinfosize != IPT_ALIGN(sizeof(struct ipt_imq_info))) {
+		printk(KERN_WARNING "IMQ: invalid targinfosize\n");
+		return 0;
+	}
+	mr = (struct ipt_imq_info*)targinfo;
+
+	if (strcmp(tablename, "mangle") != 0) {
+		printk(KERN_WARNING
+		       "IMQ: IMQ can only be called from \"mangle\" table, not \"%s\"\n",
+		       tablename);
+		return 0;
+	}
+
+	if (mr->todev >= IMQ_MAX_DEVS) {
+		printk(KERN_WARNING
+		       "IMQ: invalid device specified, highest is %u\n",
+		       IMQ_MAX_DEVS - 1);
+		return 0;
+	}
+
+	return 1;
+}
+
+static struct ipt_target ipt_imq_reg = {
+	.name		= "IMQ",
+	.target		= imq_target,
+	.checkentry	= imq_checkentry,
+	.me		= THIS_MODULE
+};
+
+static int __init init(void)
+{
+	if (ipt_register_target(&ipt_imq_reg))
+		return -EINVAL;
+
+	return 0;
+}
+
+static void __exit fini(void)
+{
+	ipt_unregister_target(&ipt_imq_reg);
+}
+
+module_init(init);
+module_exit(fini);
+MODULE_LICENSE("GPL");
diff -urN linux-2.6.orig/net/ipv6/netfilter/Kconfig linux-2.6.new/net/ipv6/netfilter/Kconfig
--- linux-2.6.orig/net/ipv6/netfilter/Kconfig	2003-09-28 10:43:59.000000000 +0200
+++ linux-2.6.new/net/ipv6/netfilter/Kconfig	2004-01-25 15:08:21.000000000 +0100
@@ -217,6 +217,15 @@
 
 	  To compile it as a module, choose M here.  If unsure, say N.
 
+config IP6_NF_TARGET_IMQ
+	tristate "IMQ target support"
+	depends on IP6_NF_MANGLE
+	help
+	  This option adds an `IMQ' target which is used to specify if and
+	  to which imq device packets should get enqueued/dequeued.
+
+	  To compile it as a module, choose M here.  If unsure, say N.
+
 #dep_tristate '  LOG target support' CONFIG_IP6_NF_TARGET_LOG $CONFIG_IP6_NF_IPTABLES
 endmenu
 
diff -urN linux-2.6.orig/net/ipv6/netfilter/Makefile linux-2.6.new/net/ipv6/netfilter/Makefile
--- linux-2.6.orig/net/ipv6/netfilter/Makefile	2003-05-05 01:53:32.000000000 +0200
+++ linux-2.6.new/net/ipv6/netfilter/Makefile	2004-01-25 15:08:21.000000000 +0100
@@ -19,6 +19,7 @@
 obj-$(CONFIG_IP6_NF_FILTER) += ip6table_filter.o
 obj-$(CONFIG_IP6_NF_MANGLE) += ip6table_mangle.o
 obj-$(CONFIG_IP6_NF_TARGET_MARK) += ip6t_MARK.o
+obj-$(CONFIG_IP6_NF_TARGET_IMQ) += ip6t_IMQ.o
 obj-$(CONFIG_IP6_NF_QUEUE) += ip6_queue.o
 obj-$(CONFIG_IP6_NF_TARGET_LOG) += ip6t_LOG.o
 obj-$(CONFIG_IP6_NF_MATCH_HL) += ip6t_hl.o
diff -urN linux-2.6.orig/net/ipv6/netfilter/ip6t_IMQ.c linux-2.6.new/net/ipv6/netfilter/ip6t_IMQ.c
--- linux-2.6.orig/net/ipv6/netfilter/ip6t_IMQ.c	1970-01-01 01:00:00.000000000 +0100
+++ linux-2.6.new/net/ipv6/netfilter/ip6t_IMQ.c	2004-01-25 15:08:21.000000000 +0100
@@ -0,0 +1,78 @@
+/*
+ * This target marks packets to be enqueued to an imq device
+ */
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/netfilter_ipv6/ip6_tables.h>
+#include <linux/netfilter_ipv6/ip6t_IMQ.h>
+#include <linux/imq.h>
+
+static unsigned int imq_target(struct sk_buff **pskb,
+			       unsigned int hooknum,
+			       const struct net_device *in,
+			       const struct net_device *out,
+			       const void *targinfo,
+			       void *userdata)
+{
+	struct ip6t_imq_info *mr = (struct ip6t_imq_info*)targinfo;
+
+	(*pskb)->imq_flags = mr->todev | IMQ_F_ENQUEUE;
+	(*pskb)->nfcache |= NFC_ALTERED;
+
+	return IP6T_CONTINUE;
+}
+
+static int imq_checkentry(const char *tablename,
+			  const struct ip6t_entry *e,
+			  void *targinfo,
+			  unsigned int targinfosize,
+			  unsigned int hook_mask)
+{
+	struct ip6t_imq_info *mr;
+
+	if (targinfosize != IP6T_ALIGN(sizeof(struct ip6t_imq_info))) {
+		printk(KERN_WARNING "IMQ: invalid targinfosize\n");
+		return 0;
+	}
+	mr = (struct ip6t_imq_info*)targinfo;
+
+	if (strcmp(tablename, "mangle") != 0) {
+		printk(KERN_WARNING
+		       "IMQ: IMQ can only be called from \"mangle\" table, not \"%s\"\n",
+		       tablename);
+		return 0;
+	}
+
+	if (mr->todev >= IMQ_MAX_DEVS) {
+		printk(KERN_WARNING
+		       "IMQ: invalid device specified, highest is %u\n",
+		       IMQ_MAX_DEVS - 1);
+		return 0;
+	}
+
+	return 1;
+}
+
+static struct ip6t_target ip6t_imq_reg = {
+	.name		= "IMQ",
+	.target		= imq_target,
+	.checkentry	= imq_checkentry,
+	.me		= THIS_MODULE
+};
+
+static int __init init(void)
+{
+	if (ip6t_register_target(&ip6t_imq_reg))
+		return -EINVAL;
+
+	return 0;
+}
+
+static void __exit fini(void)
+{
+	ip6t_unregister_target(&ip6t_imq_reg);
+}
+
+module_init(init);
+module_exit(fini);
+MODULE_LICENSE("GPL");
diff -urN linux-2.6.orig/net/sched/sch_generic.c linux-2.6.new/net/sched/sch_generic.c
--- linux-2.6.orig/net/sched/sch_generic.c	2003-11-25 16:58:47.000000000 +0100
+++ linux-2.6.new/net/sched/sch_generic.c	2004-01-25 15:08:21.000000000 +0100
@@ -30,6 +30,9 @@
 #include <linux/skbuff.h>
 #include <linux/rtnetlink.h>
 #include <linux/init.h>
+#if defined(CONFIG_IMQ) || defined(CONFIG_IMQ_MODULE)
+#include <linux/imq.h>
+#endif
 #include <net/sock.h>
 #include <net/pkt_sched.h>
 
@@ -90,7 +93,11 @@
 			spin_unlock(&dev->queue_lock);
 
 			if (!netif_queue_stopped(dev)) {
-				if (netdev_nit)
+				if (netdev_nit
+#if defined(CONFIG_IMQ) || defined(CONFIG_IMQ_MODULE)
+				   && !(skb->imq_flags & IMQ_F_ENQUEUE)
+#endif
+				   )
 					dev_queue_xmit_nit(skb, dev);
 
 				if (dev->hard_start_xmit(skb, dev) == 0) {

-- 
Marcel Sebek
jabber: sebek@jabber.cz                     ICQ: 279852819
linux user number: 307850                 GPG ID: 5F88735E
GPG FP: 0F01 BAB8 3148 94DB B95D  1FCA 8B63 CA06 5F88 735E


* Re: [RFC/PATCH] IMQ port to 2.6
  2004-01-25 15:24 [RFC/PATCH] IMQ port to 2.6 Marcel Sebek
@ 2004-01-25 16:44 ` Tomas Szepe
  2004-01-25 19:22   ` jamal
  2004-01-25 19:25 ` David S. Miller
  1 sibling, 1 reply; 29+ messages in thread
From: Tomas Szepe @ 2004-01-25 16:44 UTC
  To: linux-kernel, netdev

On Jan-25 2004, Sun, 16:24 +0100
Marcel Sebek <sebek64@post.cz> wrote:

> I have ported IMQ driver from 2.4 to 2.6.2-rc1.
> Original version was from http://trash.net/~kaber/imq/.
> ...

It would definitely be nice to see IMQ merged at last.

-- 
Tomas Szepe <szepe@pinerecords.com>


* Re: [RFC/PATCH] IMQ port to 2.6
  2004-01-25 16:44 ` Tomas Szepe
@ 2004-01-25 19:22   ` jamal
  2004-01-25 20:21     ` Vladimir B. Savkin
  0 siblings, 1 reply; 29+ messages in thread
From: jamal @ 2004-01-25 19:22 UTC
  To: Tomas Szepe; +Cc: linux-kernel, netdev


There has been no really good reason as to why IMQ is needed to begin
with. It may be easy to use and has been highly publicized (which is
always a dangerous thing in Linux).

Maybe let's take a step back and see how people use it. How and why do
you use IMQ? Is it because you couldn't use the ingress qdisc?
Note, the abstraction is in the wrong place to begin with - it sure is
an easy and nice-looking hack. So is the current ingress qdisc, but we
are laying that to rest with TC extensions.

cheers,
jamal

On Sun, 2004-01-25 at 11:44, Tomas Szepe wrote:
> On Jan-25 2004, Sun, 16:24 +0100
> Marcel Sebek <sebek64@post.cz> wrote:
> 
> > I have ported IMQ driver from 2.4 to 2.6.2-rc1.
> > Original version was from http://trash.net/~kaber/imq/.
> > ...
> 
> It would definitely be nice to see IMQ merged at last.


* Re: [RFC/PATCH] IMQ port to 2.6
  2004-01-25 15:24 [RFC/PATCH] IMQ port to 2.6 Marcel Sebek
  2004-01-25 16:44 ` Tomas Szepe
@ 2004-01-25 19:25 ` David S. Miller
  2004-01-25 20:23   ` Patrick McHardy
  1 sibling, 1 reply; 29+ messages in thread
From: David S. Miller @ 2004-01-25 19:25 UTC
  To: sebek64; +Cc: linux-kernel, netdev, kaber

   From: sebek64@post.cz (Marcel Sebek)
   Date: Sun, 25 Jan 2004 16:24:19 +0100

   I have ported IMQ driver from 2.4 to 2.6.2-rc1.
   
   Original version was from http://trash.net/~kaber/imq/.
   
Patrick, do you mind if I merge this 2.6.x port into my tree?


* Re: [RFC/PATCH] IMQ port to 2.6
  2004-01-25 19:22   ` jamal
@ 2004-01-25 20:21     ` Vladimir B. Savkin
  2004-01-25 23:45       ` jamal
  0 siblings, 1 reply; 29+ messages in thread
From: Vladimir B. Savkin @ 2004-01-25 20:21 UTC
  To: jamal; +Cc: linux-kernel, netdev

On Sun, Jan 25, 2004 at 02:22:19PM -0500, jamal wrote:
> 
> There has been no really good reason as to why IMQ is needed to begin
> with. It may be easy to use and has been highly publicized (which is
> always a dangerous thing in Linux).
> 
> Maybe let's take a step back and see how people use it. How and why do
> you use IMQ? Is it because you couldn't use the ingress qdisc?

Think multiple clients connected via PPP. I want to shape traffic,
so ingress is out of the question. I want different clients in the same
htb class, so using a qdisc on each ppp interface is out of the
question. It seems to me that IMQ is the only way to achieve my goals.

> Note, the abstraction to begin with is in the wrong place - it sure is
> an easy and nice looking hack. So is the current ingress qdisc, but we
> are laying that to rest with TC extensions.
> 
> 
~
:wq
                                        With best regards, 
                                           Vladimir Savkin. 


* Re: [RFC/PATCH] IMQ port to 2.6
  2004-01-25 19:25 ` David S. Miller
@ 2004-01-25 20:23   ` Patrick McHardy
  2004-01-25 21:55     ` David S. Miller
  0 siblings, 1 reply; 29+ messages in thread
From: Patrick McHardy @ 2004-01-25 20:23 UTC
  To: David S. Miller; +Cc: sebek64, linux-kernel, netdev

David S. Miller wrote:
>    From: sebek64@post.cz (Marcel Sebek)
>    Date: Sun, 25 Jan 2004 16:24:19 +0100
> 
>    I have ported IMQ driver from 2.4 to 2.6.2-rc1.
>    
>    Original version was from http://trash.net/~kaber/imq/.
>    
> Patrick, do you mind if I merge this 2.6.x port into my tree?
> 

Please don't. The imq device is buggy; it crashes when used
for ingress and egress at the same time, and additionally it has
been unmaintained for one or two years. The lartc list is full
of bug reports. Some users who depend on the functionality
are working on a better implementation, so I'd suggest waiting
until then.

Best regards,
Patrick


* Re: [RFC/PATCH] IMQ port to 2.6
  2004-01-25 20:23   ` Patrick McHardy
@ 2004-01-25 21:55     ` David S. Miller
  0 siblings, 0 replies; 29+ messages in thread
From: David S. Miller @ 2004-01-25 21:55 UTC
  To: kaber; +Cc: sebek64, linux-kernel, netdev

   From: Patrick McHardy <kaber@trash.net>
   Date: Sun, 25 Jan 2004 21:23:18 +0100

   David S. Miller wrote:
   > Patrick, do you mind if I merge this 2.6.x port into my tree?
   
   Please don't. The imq device is buggy, 
 ...
   Some users who depend on the functionality
   are working on a better implementation, so I'd suggest waiting
   until then.

Ok.


* Re: [RFC/PATCH] IMQ port to 2.6
  2004-01-25 20:21     ` Vladimir B. Savkin
@ 2004-01-25 23:45       ` jamal
  2004-01-26  0:11         ` Vladimir B. Savkin
  0 siblings, 1 reply; 29+ messages in thread
From: jamal @ 2004-01-25 23:45 UTC
  To: Vladimir B. Savkin; +Cc: linux-kernel, netdev

On Sun, 2004-01-25 at 15:21, Vladimir B. Savkin wrote:
> On Sun, Jan 25, 2004 at 02:22:19PM -0500, jamal wrote:

> Think multiple clients connected via PPP. I want to shape traffic,
> so ingress is out of the question. I want different clients in the same

Ok, 
a) why do you want to shape on ingress instead of policing?
OR
b) Why can't you achieve the same results by marking on ingress and
shaping on egress? 

> htb class, so using a qdisc on each ppp interface is out of the
> question. It seems to me that IMQ is the only way to achieve my goals.

By multiple clients i believe you mean you want to say "-i ppp+"?
We had a long discussion on this a while back (search netdev) 
and i think it is a valid point for dynamic devices like ppp. 
We need to rethink how we do things. There's a lot of value in having
per-device tables (scalability being one).
IMO, this alone does not justify the existence of IMQ. 
We should do this (and other things) right, maybe a sync with the
netfilter folks will be the right thing to do. 

cheers,
jamal


* Re: [RFC/PATCH] IMQ port to 2.6
  2004-01-25 23:45       ` jamal
@ 2004-01-26  0:11         ` Vladimir B. Savkin
  2004-01-26  3:09           ` jamal
  0 siblings, 1 reply; 29+ messages in thread
From: Vladimir B. Savkin @ 2004-01-26  0:11 UTC
  To: jamal; +Cc: linux-kernel, netdev

On Sun, Jan 25, 2004 at 06:45:16PM -0500, jamal wrote:
> On Sun, 2004-01-25 at 15:21, Vladimir B. Savkin wrote:
> > On Sun, Jan 25, 2004 at 02:22:19PM -0500, jamal wrote:
> 
> > Think multiple clients connected via PPP. I want to shape traffic,
> > so ingress is out of the question. I want different clients in the same
> 
> Ok, 
> a) why do you want to shape on ingress instead of policing?

With typical internet traffic patterns, policing will drop many packets,
and shaping will not.

> OR
> b) Why can't you achieve the same results by marking on ingress and
> shaping on egress? 

Well, as I understand it, there's no "real" ingress and "real" egress.
Look at this:
Any forwarded packet
  1) comes from one interface
  2) receives some treatment (filtering, routing decision, maybe
    delaying if we shape, mangling etc.)
  and
  3) goes away via some other interface

step (1) is "ingress"
step (3) is "egress"
qdiscs work at step (2), so all of them are intermediate in this sense

Well, ok, if a qdisc receives feedback from the egress interface
on when to dequeue a packet (when the interface is ready to send),
we can say that it is an egress qdisc.

But in my case, PPP connections are really PPTP or PPPoE.
Internal network bandwidth is not a premium, so all internal
interfaces are always ready to send.

So, I don't shape at ingress or at egress, I shape passing-through
traffic.

> > htb class, so using a qdisc on each ppp interface is out of the
> > question. It seems to me that IMQ is the only way to achieve my goals.
> 
> By multiple clients i believe you mean you want to say "-i ppp+"?
> We had a long discussion on this a while back (search netdev) 
> and i think it is a valid point for dynamic devices like ppp. 

Well, I don't really care whether those interfaces are dynamic or
static. They could be multiple vlans, and nothing would
change in marking or shaping. I use clients' IPs for marking,
and the routing table takes care of the interfaces.

> We need to rethink how we do things. There's a lot of value in having
> per-device tables (scalability being one).
> IMO, this alone does not justify the existence of IMQ. 

I just can't think of a better abstraction that would handle my case.

> We should do this (and other things) right, maybe a sync with the
> netfilter folks will be the right thing to do. 
> 

~
:wq
                                        With best regards, 
                                           Vladimir Savkin. 


* Re: [RFC/PATCH] IMQ port to 2.6
  2004-01-26  0:11         ` Vladimir B. Savkin
@ 2004-01-26  3:09           ` jamal
  2004-01-26  9:32             ` Vladimir B. Savkin
  0 siblings, 1 reply; 29+ messages in thread
From: jamal @ 2004-01-26  3:09 UTC
  To: Vladimir B. Savkin; +Cc: linux-kernel, netdev

On Sun, 2004-01-25 at 19:11, Vladimir B. Savkin wrote:
> On Sun, Jan 25, 2004 at 06:45:16PM -0500, jamal wrote:
[..]
> 
> With typical internet traffic patterns, policing will drop many packets,
> and shaping will not.

What is typical internet traffic? I guess you mean TCP (that's what 90%
of the traffic is).
In that case, the effect of dropping or delaying on throughput is
similar. Studies i have seen indicate that throughput is inversely
proportional to the square root of the drop probability
(drop is what you get when you police).
It is also influenced by the delay (which is what you introduce when you
shape). I have not seen anything in favor of shaping; i could be wrong
(so if you know of something or have experimented, pass the data).
For detailed analysis, at least for Reno, this would be a good reference:
http://citeseer.nj.nec.com/padhye98modeling.html
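(The rough steady-state form of that result - the Mathis et al.
approximation, which the paper above refines to cover timeouts - is

	rate ~ (MSS / RTT) * 1/sqrt(p)

with p the drop probability: throughput falls off as 1/sqrt(p), and
also inversely with the RTT that shaping inflates.)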

> 
> > OR
> > b) Why can't you achieve the same results by marking on ingress and
> > shaping on egress? 
> 
> Well, as I understand it, there's no "real" ingress and "real" egress.

There is essentially only egress.

> Look at this:
> Any forwarded packet
>   1) comes from one interface
>   2) receives some treatment (filtering, routing decision, maybe
>     delaying if we shape, mangling etc.)
>   and
>   3) goes away via some other interface
>
> step (1) is "ingress"

There is no ingress per se. The separation of ingress and egress is
typically a switch fabric or even a bus. So in this case, since you have
already crossed the bus, you are in ingress territory.
There is an ingress qdisc, but it is fake. The major value it adds
is to drop early when there is a need to (no point in making a forwarding
decision when you know you will drop the packet, i.e. no point in wasting
those processor cycles) - and therefore the ingress qdisc acts as a
holder of filters.

> step (3) is "egress"
> qdiscs work at step (2), so all of them are intermediate in this sense
> 
>
>
> Well, ok, if a qdisc receives feedback from the egress interface
> on when to dequeue a packet (when the interface is ready to send),
> we can say that it is an egress qdisc.
> 

Look at my explanation above. 

> But in my case, PPP connections are really PPTP or PPPoE.
> Internal network bandwidth is not a premium, so all internal
> interfaces are always ready to send.
> 
> So, I don't shape at ingress or at egress, I shape passing-through
> traffic.
> 

The noun is not important. You crossed the bus already, you are in
processor land. 
The value is being able to drop as early as possible when you need to.
If you are not dropping and desire only to delay the packets, then do it
at the proper egress device.

> > > htb class, so using a qdisc on each ppp interface is out of the
> > > question. It seems to me that IMQ is the only way to achieve my goals.
> > 
> > By multiple clients i believe you mean you want to say "-i ppp+"?
> > We had a long discussion on this a while back (search netdev) 
> > and i think it is a valid point for dynamic devices like ppp. 
> 
> Well, I don't really care whether those interfaces are dynamic or
> static. They could be multiple vlans, and nothing would
> change in marking or shaping. I use clients' IPs for marking,
> and the routing table takes care of the interfaces.
> 

Maybe i am misunderstanding what you are after.
Couldn't you use -i ppp+ -j MARK --set-mark x in the ingress/prerouting
hook and use the fwmark to shape on the egress?
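Something along these lines (untested sketch; devices, marks and rates
made up):

iptables -t mangle -A PREROUTING -i ppp+ -j MARK --set-mark 1
tc qdisc add dev eth0 root handle 1: htb default 1
tc class add dev eth0 parent 1: classid 1:1 htb rate 180kbps
tc filter add dev eth0 parent 1: protocol ip prio 1 handle 1 fw flowid 1:1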
Post your script examples. 

> > We need to rethink how we do things. There's a lot of value in having
> > per-device tables (scalability being one).
> > IMO, this alone does not justify the existence of IMQ. 
> 
> I just can't think of a better abstraction that would handle my case.

I think it is time we came up with a single solution for how packets are
managed. Your needs should be met; the problem is we may have too
many cooks creating the same meal.

cheers,
jamal


* Re: [RFC/PATCH] IMQ port to 2.6
  2004-01-26  3:09           ` jamal
@ 2004-01-26  9:32             ` Vladimir B. Savkin
  2004-01-26 13:38               ` jamal
  0 siblings, 1 reply; 29+ messages in thread
From: Vladimir B. Savkin @ 2004-01-26  9:32 UTC
  To: jamal; +Cc: linux-kernel, netdev

On Sun, Jan 25, 2004 at 10:09:48PM -0500, jamal wrote:
> On Sun, 2004-01-25 at 19:11, Vladimir B. Savkin wrote:
> > On Sun, Jan 25, 2004 at 06:45:16PM -0500, jamal wrote:
> [..]
> > 
> > With typical internet traffic patterns, policing will drop many packets,
> > and shaping will not.
> 
> What is typical internet traffic? I guess you mean TCP (that's what 90%
> of the traffic is).
> In that case, the effect of dropping or delaying on throughput is
> similar. Studies i have seen indicate that throughput is inversely
> proportional to the square root of the drop probability
> (drop is what you get when you police).
> It is also influenced by the delay (which is what you introduce when you
> shape). I have not seen anything in favor of shaping; i could be wrong
> (so if you know of something or have experimented, pass the data).

Yes, I have experimented. Shaping works much better:
far fewer packets dropped, much better download rates for clients.

> For detailed analysis, at least for Reno, this would be a good reference:
> http://citeseer.nj.nec.com/padhye98modeling.html
> 
[snip]
> Maybe i am misunderstanding what you are after.
> Couldn't you use -i ppp+ -j MARK --set-mark x in the ingress/prerouting
> hook and use the fwmark to shape on the egress?
> Post your script examples. 
> 

I want to shape traffic that comes from upstream to clients connected
via PPTP.

Here is a part of my scripts:

DEVICE=imq0
/sbin/tc qdisc add dev $DEVICE root handle 10: htb r2q 1 default 100
/sbin/tc class add dev $DEVICE parent 10:0 classid 10:1 est 1sec 8sec htb \
        rate 10Mbit burst 400k
/sbin/tc class add dev $DEVICE parent 10:1 classid 10:2 est 1sec 8sec htb \
        rate 180kbps ceil 180kbps burst 3000
# default class for users
/sbin/tc class add dev $DEVICE parent 10:2 classid 10:101 est 1sec 8sec htb \
        rate 20kbps burst 1k ceil 50kbps cburst 1k
/sbin/tc qdisc add dev $DEVICE parent 10:101 wrr \
        dest ip 128 1 wmode1=1 wmode2=1
/sbin/tc filter add dev $DEVICE protocol ip parent 10:0 \
        prio 100 handle 1 fw flowid 10:101
# more classes to follow ...


The limit of 50kbps is artificial, so there's no bottleneck in the
connection from upstream to this router. I cannot allocate all
the channel bandwidth to clients for some political reasons.
Then, I mark packets I want to go to this default user class with mark "1",
like this:

iptables -t mangle -A FORWARD -i $UPLINK_DEV -d $CLIENTS_NET \
	-j IMQ --todev 0 # traffic from internet to clients
iptables -t mangle -A FORWARD -i $UPLINK_DEV -d $CLIENTS_NET \
	-j MARK --set-mark 1 # default class
# here I can change fwmark for packets that deserve 
# some special treatment

So, I shape traffic destined to clients, and I use "wrr" to
divide bandwidth fairly. I cannot attach a qdisc to an egress device
because there's no single one; each client has its own ppp interface.

Well, I could move this shaping upstream, but what if the upstream
router were some dumb Cisco with no "wrr" qdisc?


~
:wq
                                        With best regards, 
                                           Vladimir Savkin. 


* Re: [RFC/PATCH] IMQ port to 2.6
  2004-01-26  9:32             ` Vladimir B. Savkin
@ 2004-01-26 13:38               ` jamal
  2004-01-26 13:55                 ` Vladimir B. Savkin
  0 siblings, 1 reply; 29+ messages in thread
From: jamal @ 2004-01-26 13:38 UTC
  To: Vladimir B. Savkin; +Cc: linux-kernel, netdev

On Mon, 2004-01-26 at 04:32, Vladimir B. Savkin wrote:

> On Sun, Jan 25, 2004 at 10:09:48PM -0500, jamal wrote:
[..]
> > shape). I have not seen anything in favor of shaping; i could be wrong
> > > (so if you know of something or have experimented, pass the data).
> 
> Yes, I have experimented. Shaping works much better:
> far fewer packets dropped, much better download rates for clients.
> 

I can't say i doubt you, but your word alone is insufficient data ;->

The important point is the eventual effective throughput and fairness
amongst the flows. Whether it is induced by an increased RTT from
shaping or a single packet retransmit on some misbehaving flows because
of policing is less important, i.e. it is not evil for packets to
be dropped.
When you analyse something like this you should look at the aggregate
throughput instead of a single client with better downloads (probably at
the expense of another poor client's download).

> I want to shape traffic that comes from upstream to clients connected
> via PPTP.

So if i understand correctly and were to draw this:
you have clients on the left side coming in through ethx that need
to be tunneled over some pppoe/pptp before going out ethy on the
right-hand side. The right-hand side represents "upstream" in your
terminology. Is this correct? I hate it when people ask me for a
diagram for something that looks obvious ;-> but bear with me and
supply me with a diagram if i didn't understand you.

> 
> Here is a part of my scripts:
> 
> DEVICE=imq0
> /sbin/tc qdisc add dev $DEVICE root handle 10: htb r2q 1 default 100
> /sbin/tc class add dev $DEVICE parent 10:0 classid 10:1 est 1sec 8sec htb \
>         rate 10Mbit burst 400k
> /sbin/tc class add dev $DEVICE parent 10:1 classid 10:2 est 1sec 8sec htb \
>         rate 180kbps ceil 180kbps burst 3000
> # default class for users
> /sbin/tc class add dev $DEVICE parent 10:2 classid 10:101 est 1sec 8sec htb \
>         rate 20kbps burst 1k ceil 50kbps cburst 1k
> /sbin/tc qdisc add dev $DEVICE parent 10:101 wrr \
>         dest ip 128 1 wmode1=1 wmode2=1
> /sbin/tc filter add dev $DEVICE protocol ip parent 10:0 \
>         prio 100 handle 1 fw flowid 10:101
> # more classes to follow ...
> 

So why not have the above attached to ethy? Why does it have to be done
at some other device?

> 
> The limit of 50kbps is artificial, so there's no bottleneck in the
> connection from upstream to this router. I cannot allocate all
> the channel bandwidth to clients for some political reasons.
> Then, I mark packets I want to go to this default user class with mark "1",
> like this:
> 
> iptables -t mangle -A FORWARD -i $UPLINK_DEV -d $CLIENTS_NET \
> 	-j IMQ --todev 0 # traffic from internet to clients
> iptables -t mangle -A FORWARD -i $UPLINK_DEV -d $CLIENTS_NET \
> 	-j MARK --set-mark 1 # default class


Why do you need the redirect to IMQ?
If you can selectively mark packets here (or at any other netfilter
hook) you could use the fwmark classifier to attach to different
10:x classes on the ethy interface. I feel i am missing something.

> So, I shape traffic destined to clients, and I use "wrr" to
> divide bandwidth fairly. I cannot attach a qdisc to an egress device
> because there's no single one; each client has its own ppp interface.
> 

I mean the ethy interface, not the ppp* interfaces. Mark the packets;
use the fwmark classifier.

> Well, I could move this shaping upstream, but what if the upstream
> router were some dumb Cisco with no "wrr" qdisc?

You don't have to.
Give me the diagram.

cheers,
jamal


* Re: [RFC/PATCH] IMQ port to 2.6
  2004-01-26 13:38               ` jamal
@ 2004-01-26 13:55                 ` Vladimir B. Savkin
  2004-01-26 14:29                   ` jamal
  2004-01-26 15:24                   ` Tomas Szepe
  0 siblings, 2 replies; 29+ messages in thread
From: Vladimir B. Savkin @ 2004-01-26 13:55 UTC
  To: jamal; +Cc: linux-kernel, netdev

On Mon, Jan 26, 2004 at 08:38:33AM -0500, jamal wrote:
> On Mon, 2004-01-26 at 04:32, Vladimir B. Savkin wrote:
> 
> > On Sun, Jan 25, 2004 at 10:09:48PM -0500, jamal wrote:
> [..]
> > > shape). I have not seen anything in favor of shaping; i could be wrong
> > > (so if you know of something or have experimented, pass the data).
> > 
> > Yes, I have experimented. Shaping works much better:
> > far fewer packets dropped, much better download rates for clients.
> > 
> 
> I can't say i doubt you, but your word alone is insufficient data ;->

You can see for yourself. Police users' traffic to half of the normal
rate and hear them scream :) Then change policing to shaping using wrr
(or an htb class for each user), with sfq on the leaves, and users are happy.

> The important point is the eventual effective throughput and fairness
> amongst the flows. Whether it is induced by an increased RTT from
> shaping or a single packet retransmit on some misbehaving flows because
> of policing is less important, i.e. it is not evil for packets to
> be dropped.
> When you analyse something like this you should look at the aggregate
> throughput instead of a single client with better downloads (probably at
> the expense of another poor client's download).

Well, I use wrr + sfq exactly for fairness. No such thing can be
achieved with policing only.

> 
> > I want to shape traffic that comes from upstream to clients connected
> > via PPTP.
> 
> So if i understand correctly and were to draw this:
> you have clients on the left side coming in through ethx that need
> to be tunneled over some pppoe/pptp before going out ethy on the
> right-hand side. The right-hand side represents "upstream" in your
> terminology. Is this correct? I hate it when people ask me for a
> diagram for something that looks obvious ;-> but bear with me and
> supply me with a diagram if i didn't understand you.

Here it is:

                    +---------+       +-ppp0- ... - client0
                    |         +-eth1-<+-ppp1- ... - client1
Internet ----- eth0-+ router  |     . . . . . . . .
                    |         +-eth2-<  . . . . . .
                    +---------+       +-pppN- ... - clientN
		    

Traffic flows from internet to clients. 
The ethX names are for example only; my setup is actually more complex,
but that complexity is not related to IMQ or traffic shaping.
Clients use PPTP or PPPoE to connect to the router.
See, there's no single interface I can attach a qdisc to if I want
to put all clients into the same qdisc.

> 
> > 
> > Here is a part of my scripts:
> > 
> > DEVICE=imq0
> > /sbin/tc qdisc add dev $DEVICE root handle 10: htb r2q 1 default 100
> > /sbin/tc class add dev $DEVICE parent 10:0 classid 10:1 est 1sec 8sec htb \
> >         rate 10Mbit burst 400k
> > /sbin/tc class add dev $DEVICE parent 10:1 classid 10:2 est 1sec 8sec htb \
> >         rate 180kbps ceil 180kbps burst 3000
> > # default class for users
> > /sbin/tc class add dev $DEVICE parent 10:2 classid 10:101 est 1sec 8sec htb \
> >         rate 20kbps burst 1k ceil 50kbps cburst 1k
> > /sbin/tc qdisc add dev $DEVICE parent 10:101 wrr \
> >         dest ip 128 1 wmode1=1 wmode2=1
> > /sbin/tc filter add dev $DEVICE protocol ip parent 10:0 \
> >         prio 100 handle 1 fw flowid 10:101
> > # more classes to follow ...
> > 
> 
> So why not have the above attached to ethy? Why does it have to be done
> at some other device?
> 
> > 
> > The limit of 50kbps is artificial, so there's no bottleneck in the
> > connection from upstream to this router. I cannot allocate all
> > the channel bandwidth to clients for some political reasons.
> > Then, I mark packets I want to go to this default user class with mark "1",
> > like this:
> > 
> > iptables -t mangle -A FORWARD -i $UPLINK_DEV -d $CLIENTS_NET \
> > 	-j IMQ --todev 0 # traffic from internet to clients
> > iptables -t mangle -A FORWARD -i $UPLINK_DEV -d $CLIENTS_NET \
> > 	-j MARK --set-mark 1 # default class
> 
> 
> Why do you need the redirect to IMQ?
> If you can selectively mark packets here (or at any other netfilter
> hook) you could use the fwmark classifier to attach to different
> 10:x classes on the ethy interface. I feel i am missing something.
> 
> > So, I shape traffic destined to clients, and I use "wrr" to
> > divide bandwidth fairly. I cannot attach a qdisc to an egress device
> > because there's no single one; each client has its own ppp interface.
> > 
> 
> I mean the ethy interface, not the ppp* interfaces. Mark the packets;
> use the fwmark classifier.
> 
> > Well, I could move this shaping upstream, but what if the upstream
> > router were some dumb Cisco with no "wrr" qdisc?
> 
> You don't have to.
> Give me the diagram.
> 
> cheers,
> jamal
> 
~
:wq
                                        With best regards, 
                                           Vladimir Savkin. 


* Re: [RFC/PATCH] IMQ port to 2.6
  2004-01-26 13:55                 ` Vladimir B. Savkin
@ 2004-01-26 14:29                   ` jamal
  2004-01-26 17:41                     ` Vladimir B. Savkin
  2004-01-31 18:52                     ` Vladimir B. Savkin
  2004-01-26 15:24                   ` Tomas Szepe
  1 sibling, 2 replies; 29+ messages in thread
From: jamal @ 2004-01-26 14:29 UTC
  To: Vladimir B. Savkin; +Cc: linux-kernel, netdev

On Mon, 2004-01-26 at 08:55, Vladimir B. Savkin wrote:
> On Mon, Jan 26, 2004 at 08:38:33AM -0500, jamal wrote:

> > I can't say i doubt you, but your word alone is insufficient data ;->
> 
> You can see for yourself. Police users' traffic to half of the normal
> rate and hear them scream :) Then change policing to shaping using wrr
> (or an htb class for each user), with sfq on the leaves, and users are happy.
> 

;-> Sorry, I don't have time. But this could be a nice paper, since
i haven't seen this topic covered. If you want to write one, i could
help provide you with an outline.


> Well, I use wrr + sfq exactly for fairness. No such thing can be
> achieved with policing only.
> 

That's what i was assuming. Shaping alone is insufficient as well.

> Here it is:
> 
>                     +---------+       +-ppp0- ... - client0
>                     |         +-eth1-<+-ppp1- ... - client1
> Internet ----- eth0-+ router  |     . . . . . . . .
>                     |         +-eth2-<  . . . . . .
>                     +---------+       +-pppN- ... - clientN
> 		    
> 
> Traffic flows from internet to clients. 
> The ethX names are for example only, my setup is more complex actually,
> but that complexity is not related to IMQ or traffic shaping.
> Clients use PPTP or PPPoE to connect to router.
> See, there's no single interface I can attach qdisc to, if I want
> to put all clients into the same qdisc. 
> 

So why can't you attach an ingress qdisc on eth1-2 and use policing to
mark excess traffic (not drop it)? On eth0 all you do is, based on the
mark, stash them in a different class, i.e. move the stuff you have on
imq0 to eth0.

Example on ingress:

meter1=" police index 1 rate $CIR1"
meter1a=" police index 2 rate $PIR1"

index 2 is shared by all flows for the default.
index 1 (and others) guarantees a rate (20Kbps) for each of the flows,
etc.
Look for example at examples/Edge32-ca-u32

The most important thing to know is that policers can be shared across
devices, flows, etc. using the "index" operator.
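For example (untested sketch, addresses and rates made up), the very
same policer can be referenced from two different devices:

tc qdisc add dev eth1 ingress
tc qdisc add dev eth2 ingress
tc filter add dev eth1 parent ffff: protocol ip prio 1 u32 \
match ip src 10.0.0.21/32 \
police index 1 rate 20kbit burst 10k drop flowid :1
tc filter add dev eth2 parent ffff: protocol ip prio 1 u32 \
match ip src 10.0.0.21/32 \
police index 1 rate 20kbit burst 10k drop flowid :1

Both filters point at policer index 1, so the flow is held to a single
shared 20kbit budget no matter which device the packets arrive on.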

I just noticed you are copying linux-kernel. Please take it off the list
in your response; this is a netdev issue. This should warn anyone
interested in the thread to join netdev.

cheers,
jamal


* Re: [RFC/PATCH] IMQ port to 2.6
  2004-01-26 13:55                 ` Vladimir B. Savkin
  2004-01-26 14:29                   ` jamal
@ 2004-01-26 15:24                   ` Tomas Szepe
  2004-01-27  3:14                     ` jamal
  1 sibling, 1 reply; 29+ messages in thread
From: Tomas Szepe @ 2004-01-26 15:24 UTC
  To: Vladimir B. Savkin; +Cc: jamal, linux-kernel, netdev

On Jan-26 2004, Mon, 16:55 +0300
Vladimir B. Savkin <master@sectorb.msk.ru> wrote:

>                     +---------+       +-ppp0- ... - client0
>                     |         +-eth1-<+-ppp1- ... - client1
> Internet ----- eth0-+ router  |     . . . . . . . .
>                     |         +-eth2-<  . . . . . .
>                     +---------+       +-pppN- ... - clientN

Actually, this is very much like what we're using IMQ for:

                  +-----------+ eth1 --- \
                  | shaper    + eth2 ---
Internet --- eth0 + in bridge + .    ---    ... WAN (10 C's of customer IPs)
                  | setup     + .    ---
                  +-----------+ ethN --- /

We're shaping single IPs and groups of IPs, applying tariff rates
on the sum of inbound and outbound flow (this last point, I'm told,
is the primary reason for our use of IMQ).  The machine also does
IP accounting (through custom userland software based on libpcap)
and has to be an ethernet bridge so that it can be replaced by
a piece of wire should it fail with no backup hardware left.

At this moment we're on sfq/u32/htb/IMQ/mangle.  We've figured out
that unless we mess with iptable_nat, IMQ-enabled kernels will work
perfectly reliably (SNAT in particular seems deadly).  We don't
insist on IMQ.  In fact, we would be very grateful if somebody
could point us to an alternative mechanism to IMQ that would allow
us to effectively shape by the sum of both traffic directions of
a given IP, as we'd like to deploy "shaping firewalls" that would
also do SNAT.

-- 
Tomas Szepe <szepe@pinerecords.com>


* Re: [RFC/PATCH] IMQ port to 2.6
  2004-01-26 14:29                   ` jamal
@ 2004-01-26 17:41                     ` Vladimir B. Savkin
  2004-01-27  3:25                       ` jamal
  2004-01-31 18:52                     ` Vladimir B. Savkin
  1 sibling, 1 reply; 29+ messages in thread
From: Vladimir B. Savkin @ 2004-01-26 17:41 UTC
  To: jamal; +Cc: netdev

On Mon, Jan 26, 2004 at 09:29:56AM -0500, jamal wrote:
> > You can see for yourself. Police users' traffic to half of the normal
> > rate and hear them scream :) Then change policing to shaping using wrr
> > (or an htb class for each user), with sfq on the leaves, and users are happy.
> > 
> 
> ;-> Sorry, I don't have time. But this could be a nice paper, since
> i haven't seen this topic covered. If you want to write one, i could
> help provide you with an outline.

Over here every good networking engineer I have talked to knows this :)


> > Well, I use wrr + sfq exactly for fairness. No such thing can be
> > achieved with policing only.
> > 
> 
> That's what i was assuming. Shaping alone is insufficient as well.

I don't quite understand what you mean here.
Ultimately, any packet will land in some leaf qdisc,
where there is a queue of some maximum size.
If a sender does not reduce its rate, the queue overflows, and we drop.
But in my experience this rarely happens with TCP. I think the sender
just sees the measured RTT increase and reduces its rate or shrinks
its window. I don't know modern TCP implementations in detail,
but I can see that it works.
Is this what you call "shaping alone"? If yes, then I don't agree with
you here.

> 
> > Here it is:
> > 
> >                     +---------+       +-ppp0- ... - client0
> >                     |         +-eth1-<+-ppp1- ... - client1
> > Internet ----- eth0-+ router  |     . . . . . . . .
> >                     |         +-eth2-<  . . . . . .
> >                     +---------+       +-pppN- ... - clientN
> > 		    
> > 
> > Traffic flows from internet to clients. 
> > The ethX names are for example only, my setup is more complex actually,
> > but that complexity is not related to IMQ or traffic shaping.
> > Clients use PPTP or PPPoE to connect to router.
> > See, there's no single interface I can attach qdisc to, if I want
> > to put all clients into the same qdisc. 
> > 
> 
> So why can't you attach an ingress qdisc on eth1-2 and use policing to
> mark excess traffic (not drop it)? On eth0 all you do is, based on the

And where to drop then?

> mark, stash them in a different class, i.e. move the stuff you have on
> imq0 to eth0.
> 
> Example on ingress:
> 
> meter1=" police index 1 rate $CIR1"
> meter1a=" police index 2 rate $PIR1"
> 
> index 2 is shared by all flows for default.
> index 1 (and others) is guaranteeing rate (20Kbps) for each of the flows
> etc.
> Look for example at examples/Edge32-ca-u32
> 
> The most important thing to know is that policers can be shared across 
> devices, flows etc using the "index" operator.

So, it's just like IMQ, but without that Q bit, only marking?

But how would I calculate a guaranteed rate for a client?
Suppose I have 100 clients connected; then I can only
guarantee 1/100th of the pipe to each. But if only 5 of them
are active, then each can get 1/5th of the pipe.
A round-robin mechanism such as wrr effectively adjusts rates dynamically.
I actually use a two-layer hierarchy, applying sfq to every wrr class,
so a user can download a file and play Quake at the same time,
with acceptable delays and no packet loss. At the same time, a
user that opens 1000 connections with some evil multithreaded downloader
thing has the same aggregate rate, but can't play Quake because
of the high latency.  It works wonderfully.

I suppose we could have a flavor of wrr that does not queue packets,
only finds over-active flows and marks or drops over-profile packets,
but 1) no such thing exists AFAIK, and 2) it would not have a separate
queue for each user/flow, thus all flows would have the same latency;
only the drop probabilities would differ.

So, it seems to me that IMQ fits nicely when there are artificial
bandwidth limits (as opposed to the bandwidth of a physical interface)
and no single egress interface for all flows to be shaped.
> 
> I just noticed you are copying linux-kernel. Please take it off the list
> in your response; this is a netdev issue. This should warn anyone
> interested in the thread to join netdev.
> 

Done.

~
:wq
                                        With best regards, 
                                           Vladimir Savkin. 


* Re: [RFC/PATCH] IMQ port to 2.6
  2004-01-26 15:24                   ` Tomas Szepe
@ 2004-01-27  3:14                     ` jamal
  2004-01-27 11:59                       ` Tomas Szepe
  0 siblings, 1 reply; 29+ messages in thread
From: jamal @ 2004-01-27  3:14 UTC
  To: Tomas Szepe; +Cc: Vladimir B. Savkin, netdev

On Mon, 2004-01-26 at 10:24, Tomas Szepe wrote:
[..]
> Actually, this is very much like what we're using IMQ for:
> 
>                   +-----------+ eth1 --- \
>                   | shaper    + eth2 ---
> Internet --- eth0 + in bridge + .    ---    ... WAN (10 C's of customer IPs)
>                   | setup     + .    ---
>                   +-----------+ ethN --- /
> 
> > We're shaping single IPs and groups of IPs, applying tariff rates
> > on the sum of inbound and outbound flow (this last point, I'm told,
> > is the primary reason for our use of IMQ).

This does not need IMQ. I am going to type an example at the end of the
email.

> The machine also does
> IP accounting (through custom userland software based on libpcap)
> and has to be an ethernet bridge so that it can be replaced by
> a piece of wire should it fail and there was no backup hardware left.
> 

Ok, now you are throwing in an extra wrench ;->
As i mentioned earlier, the current dependency of ingress on netfilter
is the wrong abstraction (this also applies to IMQ), and for
this reason it must go. If you are running 2.4.x i can give you a patch
that fixes this and will get things working for you even when you use
bridging. In fact i will give an example based on this patch.
BTW, how are you going to do SNAT with bridging?

The example below tries to show many things: sharing of
policers across many flows within a device, and across devices.
It also shows how to do it so that inbound and outbound are summed up.

I spent about 30 minutes coming up with this; i hope it illustrates
the potential.

cheers,
jamal

---- start untested script here -------------------
#
#
# lets take as an example flow1: 10.0.0.21, which sits behind eth1
#
# the idea is to have 10.0.0.21/32 first try to use the bandwidth
# guaranteed to it (index 1); if it exceeds that, it gets demoted to
# mark 2 and tries to use the bandwidth that is shared by all flows
# behind eth1 (index 200);
# if that fails, it gets demoted even more to mark 3 and tries to draw
# from a pool of bandwidth available to every flow on every device
# (index 300); if that fails too, the packet is dropped
#
# on egress, use the marks to select different priority queues.
# Give better treatment to mark 1 than to 2, and to 2 than to 3 ..
#
# On the return path from the internet to eth1, packets from
# the internet to 10.0.0.21 are forced to use policer index 1,
# thereby ensuring that the bandwidth allocated is the sum of
# inbound and outbound for that flow ..
#
#
#add ingress qdisc
tc qdisc add dev eth1 ingress
#
# first give the flow a mark of 1, then meter it against policer
# index 1 (the per-client guarantee); if that rate is exceeded, demote
# to mark 2 and meter against the shared policer index 200; if that is
# exceeded too, demote to mark 3 and meter against index 300, dropping
# on failure
tc filter add dev eth1 parent ffff: protocol ip prio 1 \
u32 match ip src 10.0.0.21/32 flowid 1:15 \
action ipt -j MARK --set-mark 1 index 2 \
action police index 1 rate 1kbit burst 9k pipe \
action ipt -j MARK --set-mark 2 \
action police index 200 mtu 5000 rate 1kbit burst 10k pipe \
action ipt -j MARK --set-mark 3 \
action police index 300 mtu 5000 rate 1kbit burst 90k drop
#
#
# do something on eth0 with these firewall marks;
# for example, use them to send packets to different classes/queues,
# giving priority to mark 1, then 2, then 3
#
.
.
.
# now the return path to 10.0.0.21 ...
tc qdisc add dev eth1 handle 1:0 root prio 
#
# note how exactly the same policer is used ("index 1")
tc filter add dev eth1 parent 1:0 protocol ip prio 1 \
u32 match ip dst 10.0.0.21/32 flowid 1:25 \
action police index 1 rate 1kbit burst 9k pipe 
.
.
.

look at the stats with "tc -s filter show parent ffff: dev eth1"
.
A sample would look like:

------------
jroot# tc -s filter show parent ffff: dev eth0
filter protocol ip pref 1 u32 
filter protocol ip pref 1 u32 fh 800: ht divisor 1 
filter protocol ip pref 1 u32 fh 800::800 order 2048 key ht 800 bkt 0 flowid 1:15
match 0a000015/ffffffff at 12
.
.
   action order 2: police 1 action pipe rate 1Kbit burst 9Kb mtu 2Kb 
         Sent 188832 bytes 2248 pkts (dropped 0, overlimits 2122) 
.
.

-------------


 


* Re: [RFC/PATCH] IMQ port to 2.6
  2004-01-26 17:41                     ` Vladimir B. Savkin
@ 2004-01-27  3:25                       ` jamal
  0 siblings, 0 replies; 29+ messages in thread
From: jamal @ 2004-01-27  3:25 UTC (permalink / raw)
  To: Vladimir B. Savkin; +Cc: netdev

On Mon, 2004-01-26 at 12:41, Vladimir B. Savkin wrote:
> On Mon, Jan 26, 2004 at 09:29:56AM -0500, jamal wrote:
[..]
> 
> Over here every good networking engineer I have talked to knows this :)

This may be true but it's like sticking a finger in the air
and saying "the wind blows south" ;-> Data, my friend.

> > Thats what i was assuming. Shaping alone is insufficient as well.
> 
> I don't quite understand what you mean here.
> Ultimately, any packet will land in some leaf qdisc,
> where there is a queue of some maximum size.
> If a sender does not reduce its rate, queue overflows, and we drop.
> But in my experience this rarely happens with TCP. I think that sender
> just sees the measured RTT increase and reduces its rate or shrinks
> its window. I don't know modern TCP implementations in detail, 
> but I can see that it works.

We are saying the same thing. And we are also digressing from the main
point. So lets drop this part if you dont mind.

> > So why cant you attach an ingress qdisc on eth1-2 and use policing to
> > mark excess traffic (not drop)? On eth0 all you do is based on the mark
> 
> And where to drop then?
> 

Look at the example i just typed.
In your case you dont need the patch i described; use the standard
ingress qdisc and mark with iptables.
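
A minimal untested sketch of that combination (mark value, address and
rate are made up):

# mark the client's traffic as it enters the box
iptables -t mangle -A PREROUTING -i eth1 -s 10.0.0.21 -j MARK --set-mark 1
# police everything carrying that mark on the ingress qdisc
tc qdisc add dev eth1 handle ffff: ingress
tc filter add dev eth1 parent ffff: protocol ip prio 50 handle 1 fw \
police rate 256kbit burst 10k drop flowid :1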

> So, it's just like IMQ, but without that Q bit, only marking?
> 

Exactly.

> But how would I calculate guaranteed rate for a client?

Note how i used index 1 for the meter in the example i posted.
index 1 is only for one client.

> Suppose I have 100 clients connected, then I can only
> guarantee a 1/100th of the pipe to each. But if only 5 of them
> are active, then each can get 1/5th of the pipe.

Look at the way i had index 200 and 300: one for sharing within a
device and another for the whole system. 
You should also just be able to use marks and shape on egress.
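
An untested sketch (addresses, rates and the index value are made up):
two filters on two different devices that meter against the very same
policer instance:

tc filter add dev eth1 parent ffff: protocol ip prio 1 \
u32 match ip src 10.0.0.21/32 flowid 1:15 \
action police index 21 rate 20kbit burst 9k pipe
tc filter add dev eth2 parent ffff: protocol ip prio 1 \
u32 match ip src 10.0.0.21/32 flowid 1:15 \
action police index 21 rate 20kbit burst 9k pipe
# both filters drain the same token bucket, so what gets metered is the
# client's aggregate across eth1+eth2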

> A round-robin mechanism such as wrr effectively adjusts rates dynamically.
> Actually, I use a two-layer hierarchy, applying sfq to every wrr class,
> so a user can download a file and play Quake at the same time,
> with acceptable delays and no packet loss. At the same time, a
> user that opens 1000 connections with some evil multithreaded downloader
> thing has the same aggregate rate, but can't play Quake because
> of the high latency.  It works wonderfully.
> 
> I suppose we could have a flavor of wrr that will not queue packets,
> only find over-active flows and mark or drop over-profile packets,
> but 1) no such thing exists AFAIK and 2) it would not have a separate
> queue for each user/flow, thus all flows would have the same latency;
> only drop probabilities would differ.
> 
> So, it seems to me that IMQ fits nicely when there are artificial
> bandwidth limits (as opposed to the bandwidth of some physical interface)
> and no single egress interface for all flows to be shaped.

Look at that sample and then lets discuss further. I spent a long time
typing it (and wanna catch up with other email). I think we may be
getting close.

cheers,
jamal


* Re: [RFC/PATCH] IMQ port to 2.6
  2004-01-27  3:14                     ` jamal
@ 2004-01-27 11:59                       ` Tomas Szepe
  2004-01-31 17:02                         ` jamal
  0 siblings, 1 reply; 29+ messages in thread
From: Tomas Szepe @ 2004-01-27 11:59 UTC (permalink / raw)
  To: jamal; +Cc: Vladimir B. Savkin, netdev, volf

On Jan-26 2004, Mon, 22:14 -0500
jamal <hadi@cyberus.ca> wrote:

> On Mon, 2004-01-26 at 10:24, Tomas Szepe wrote:
> [..]
> > Actually, this is very much like what we're using IMQ for:
> > 
> >                   +-----------+ eth1 --- \
> >                   | shaper    + eth2 ---
> > Internet --- eth0 + in bridge + .    ---    ... WAN (10 C's of customer IPs)
> >                   | setup     + .    ---
> >                   +-----------+ ethN --- /
> > 
> > We're shaping single IPs and groups of IPs, applying tariff rates
> > on the sum of inbound and outbound flow (this last point, I'm told,
> > is the primary reason for our use of IMQ).
> 
> This does not need IMQ. I am going to type an example at the end of the
> email.

Thanks for your reply, Jamal.  Unfortunately, we don't really understand
your example.  Please see below.

[snip]
> BTW, how are you going to do SNAT with bridging?

We aren't.  :) We won't need bridging on those firewalls, it's only
necessary for the main shaper box.  I apologize for not making that
clear in my previous post.

> The example below tries to show many things. Example sharing of
> policers across many flows within a device, and across devices.
> Also shows how to do it so that inbound and outbound are summed up.
> [snip]

What's the mechanism for matching the IPs?  We need to insert
thousands of these rules and shape a constant 20+ Mbit flow of
traffic.  If it doesn't use a hash or similar, we're back to
where we started.

> # On the return path from the internet to eth1, packets from
> # the internet to 10.0.0.21 are forced to use policer index 1,
> # thereby ensuring that the bandwidth allocated is the sum of
> # inbound and outbound for that flow ..
> #
> #
> #add ingress qdisc
> tc qdisc add dev eth1 ingress
> #
> # first give the flow a mark of 1, then meter it against policer
> # index 1; if that rate is exceeded, demote to mark 2 and meter
> # against the shared policer index 200; if that is exceeded too,
> # demote to mark 3 and meter against index 300, dropping on failure
> tc filter add dev eth1 parent ffff: protocol ip prio 1 \
> u32 match ip src 10.0.0.21/32 flowid 1:15 \
> action ipt -j MARK --set-mark 1 index 2 \
> action police index 1 rate 1kbit burst 9k pipe \
> action ipt -j MARK --set-mark 2 \
> action police index 200 mtu 5000 rate 1kbit burst 10k pipe \
> action ipt -j MARK --set-mark 3 \
> action police index 300 mtu 5000 rate 1kbit burst 90k drop
> #
> #
> # do something on eth0 with these firewall marks;
> # for example, use them to send packets to different classes/queues,
> # giving priority to mark 1, then 2, then 3
> #
> .
> .
> .
> # now the return path to 10.0.0.21 ...
> tc qdisc add dev eth1 handle 1:0 root prio 
> #
> # note how exactly the same policer is used ("index 1")
> tc filter add dev eth1 parent 1:0 protocol ip prio 1 \
> u32 match ip dst 10.0.0.21/32 flowid 1:25 \
> action police index 1 rate 1kbit burst 9k pipe 

Would you know of any real documentation on tc/ingress that
we could use to deconstruct this example and understand it?

At this moment we can only guess at what's happening. :(

-- 
Tomas Szepe <szepe@pinerecords.com>


* Re: [RFC/PATCH] IMQ port to 2.6
  2004-01-27 11:59                       ` Tomas Szepe
@ 2004-01-31 17:02                         ` jamal
  0 siblings, 0 replies; 29+ messages in thread
From: jamal @ 2004-01-31 17:02 UTC (permalink / raw)
  To: Tomas Szepe; +Cc: Vladimir B. Savkin, netdev, volf

Hi Tomas,

Sorry - didnt mean to keep you hanging this long. Work comes first
then fun stuff like this.

On Tue, 2004-01-27 at 06:59, Tomas Szepe wrote:
> On Jan-26 2004, Mon, 22:14 -0500
> jamal <hadi@cyberus.ca> wrote:
[..]
> Thanks for your reply, Jamal.  Unfortunately, we don't really understand
> your example.  Please see below.
> 

How about we take this offline? I help you, and you document it so other
people understand it.

> What's the mechanism for matching the IPs?  We need to insert
> thousands of these rules and shape constant 20+ Mbit flow of
> traffic.  If it doesn't use a hash or similar, we're back to
> where we started.

It's the u32 classifier, which is more sophisticated than the standard
iptables one. It does go into a tree of hashes. If you are unhappy with
it, it is trivial to add your own classifier as well.
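
For instance, an untested sketch of a u32 hash keyed on the last octet
of the source address (table id, offset and addresses are made up):

# a 256-bucket hash table under the ingress qdisc
tc filter add dev eth1 parent ffff: prio 1 handle 2: protocol ip u32 divisor 256
# hash on the last octet of the source address (offset 12 in the IP header)
tc filter add dev eth1 parent ffff: protocol ip prio 1 u32 ht 800:: \
match ip src 10.0.0.0/24 hashkey mask 0x000000ff at 12 link 2:
# the per-client rule then lands in bucket 0x15 (10.0.0.21)
tc filter add dev eth1 parent ffff: protocol ip prio 1 u32 ht 2:15: \
match ip src 10.0.0.21/32 flowid 1:15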

[..]

> 
> Would you know of any real documentation on tc/ingress that
> we could use to deconstruct this example and understand it?
> 

Lets take it offline - there are no docs really. In particular this 
feature i am talking about is not documented anywhere. Unfortunately
what that means is that people reinvent it every summer, IMQ being one
of those inventions.
 
I would like to also satisfy Vladimir's requirement of delaying packets
instead of policing them.

cheers,
jamal


* Re: [RFC/PATCH] IMQ port to 2.6
  2004-01-26 14:29                   ` jamal
  2004-01-26 17:41                     ` Vladimir B. Savkin
@ 2004-01-31 18:52                     ` Vladimir B. Savkin
  2004-01-31 20:26                       ` jamal
  1 sibling, 1 reply; 29+ messages in thread
From: Vladimir B. Savkin @ 2004-01-31 18:52 UTC (permalink / raw)
  To: jamal; +Cc: netdev

Jamal, I think you did not understand the role of IMQ in my setup.

Here is my diagram:


> > 
> >                     +---------+       +-ppp0- ... - client0
> >                     |         +-eth1-<+-ppp1- ... - client1
> > Internet ----- eth0-+ router  |     . . . . . . . .
> >                     |         +-eth2-<  . . . . . .
> >                     +---------+       +-pppN- ... - clientN
> > 		    
> > 

Please notice this:

> > Traffic flows from internet to clients. 
This means from _left_ to _right_ :)


And here is what you suggest:


> So why cant you attach an ingress qdisc on eth1-2 and use policing to
> mark excess traffic (not drop)? On eth0 all you do is based on the mark
> you stash them on a different class i.e move the stuff you have on
> IMQ0 to eth0.
> 

But in my case, eth0 is an ingress device, and eth1 and eth2 are 
(physical) egress devices.
For traffic going the other direction (from right to left) I could do
without IMQ, as you suggest. 
But on the right side of the diagram we can see no single device
(physical or virtual) we can attach a qdisc to, hence the need for IMQ.
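
For reference, the IMQ hookup here amounts to one mangle rule plus a
qdisc hierarchy on the imq device (an untested sketch; the device
number is made up, and tbf merely stands in for my real wrr+sfq tree):

ifconfig imq0 up
# divert everything arriving from the internet through imq0
iptables -t mangle -A PREROUTING -i eth0 -j IMQ --todev 0
# stand-in for the real shaping hierarchy
tc qdisc add dev imq0 root tbf rate 4mbit burst 20k latency 50ms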


> Example on ingress:
> 
> meter1=" police index 1 rate $CIR1"
> meter1a=" police index 2 rate $PIR1"
> 
> index 2 is shared by all flows by default.
> index 1 (and others) guarantees a rate (20Kbps) for each of the flows
> etc.
> Look for example at examples/Edge32-ca-u32
> 
> The most important thing to know is that policers can be shared across 
> devices, flows etc using the "index" operator.
> 

Ok, this looks like a typical diffserv setup, as described in the RFCs.
It doesn't assure fair bandwidth sharing between active clients. 
We just can't decide what traffic is excess using some predetermined
rate; we must look at the current rates of other clients and penalize those
who use unfair shares. Such meters and policers could exist, but I don't
know of any. wrr and htb can do it, but they use queuing and round-robin
to achieve fairness, not meters and policers.

~
:wq
                                        With best regards, 
                                           Vladimir Savkin. 


* Re: [RFC/PATCH] IMQ port to 2.6
  2004-01-31 18:52                     ` Vladimir B. Savkin
@ 2004-01-31 20:26                       ` jamal
  2004-01-31 20:53                         ` Vladimir B. Savkin
  0 siblings, 1 reply; 29+ messages in thread
From: jamal @ 2004-01-31 20:26 UTC (permalink / raw)
  To: Vladimir B. Savkin; +Cc: netdev

Hello Vladimir,

On Sat, 2004-01-31 at 13:52, Vladimir B. Savkin wrote:
> Jamal, I think you did not understand the role of IMQ in my setup.
> 
[..]
> > > 
> > >                     +---------+       +-ppp0- ... - client0
> > >                     |         +-eth1-<+-ppp1- ... - client1
> > > Internet ----- eth0-+ router  |     . . . . . . . .
> > >                     |         +-eth2-<  . . . . . .
> > >                     +---------+       +-pppN- ... - clientN

> And here is what you suggest:

> > So why cant you attach an ingress qdisc on eth1-2 and use policing to
> > mark excess traffic (not drop)? On eth0 all you do is based on the mark
> > you stash them on a different class i.e move the stuff you have on
> > IMQ0 to eth0. 
> 
> But in my case, eth0 is an ingress device, and eth1 and eth2 are 
> (physical) egress devices.

You are correct to say i misunderstood, because i covered only one
direction. Lets cover both cases now. I am going to try to be verbose.

Lets take something like an ftp/http download as an example, and hopefully
this time i will grasp your requirements.

Case1: bulk transfer going right->left  

In this case you want to restrict how much client0..N can send to the
internet. There are two ways to do it, the first one as you say below:

> For traffic going other direction (from right to left) I could do
> without IMQ, as you suggest. 

i.e based on client0..N you attach some rules on eth0.

The second scheme is what i was saying in my email to Tomas - it is
achievable via the ingress qdisc on both eth1 and eth2 and egress root
prio qdisc on eth0. A diagram would help to show how the policing is
done:

  
  ^
  |
  |
  +--------- MAX available internet bandwidth for all clients
  |
  ^
  |   Area B
  | 
  +--------- MAX available internet b/width for each of eth1/2
  |
  ^
  |  Area A
  |  
  +----------MAX guaranteed bandwidth available to client for internet
  | 
  +---------- 0 b/width


Not sure if this diagram is clear or not - theres a gauge of bandwidth
going up vertically. The maximum is whatever bandwidth you have on the
internet side.
Each client is given a guaranteed (long-term) bandwidth. This is
labelled "MAX guaranteed bandwidth available to client for internet".
What you do is fwmark the packet if it doesnt exceed its fair bandwidth
share - lets say mark 1. 
If this is exceeded then the client is in "Area A" of the gauge above.
Everything in Area A is what a combination of all clients on each device
(eth1 for example) can use. If a client reaches this area you mark them
with 2.
If this is exceeded then the client is in area B. In this case, you mark
the client with a 3.
If they exceed "MAX available internet bandwidth for all clients"
then there is no point in sending them on their way to eth0; just drop them.

On the eth0 side, all you need to do really is put the different marks
(using the fwmark classifier) on different priority queues (use a simple
prio qdisc, nothing more).
Priority-wise: mark 1 > mark 2 > mark 3,
i.e. as long as you have mark 1 packets send only those to the internet.
If there are no more on the mark 1 queue send mark 2. If no more of those,
send mark 3.
Eventually some of these queues will be overflowing for clients which
are greedy.
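
An untested sketch of that eth0 side (handles are made up):

# three-band prio qdisc: band 1:1 is always served before 1:2, and 1:2
# before 1:3
tc qdisc add dev eth0 root handle 1: prio bands 3
# the fwmark classifier maps mark N to band N
tc filter add dev eth0 parent 1: protocol ip prio 1 handle 1 fw classid 1:1
tc filter add dev eth0 parent 1: protocol ip prio 2 handle 2 fw classid 1:2
tc filter add dev eth0 parent 1: protocol ip prio 3 handle 3 fw classid 1:3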
 
Note that Areas A and B are shared between many clients and are here to
serve as an example, just to show how you can do the following:
a) a client gets a fair share, i.e. a guaranteed rate over a long period
of time.
b) many clients coming from one device like eth1 share some excess
bandwidth allocated for eth1 if it is available.
c) many clients share bandwidth allocated for the system (i.e.
free-for-all for eth1 and eth2).

What i dont show is case d), which Tomas asked for:
allocated bandwidth shared by a client between outgoing
(to internet) and incoming (from internet) traffic, i.e. if a client's
incoming+outgoing bandwidth exceeds its allocated rate, then it enters
Area A etc.
I pointed out (in my email to Tomas) that because policers can be given
explicit IDs (operator "index") this is doable.

> But on the right side of the diagram we can see no single device
> (physical or virtual) we can attach a qdisc to, hence the need for IMQ.
> 

Ok, so lets take a look at case 2, which is right <- left,
i.e. clients downloading from the internet.
Repeat what i described as method 2 above with the ingress qdisc at eth0
and egress on eth1 and eth2.

The nice thing about the policer is the ability to share bandwidth
between devices and flows regardless of direction.
 

> > The most important thing to know is that policers can be shared across 
> > devices, flows etc using the "index" operator.
> > 
> 
> Ok, this looks like a typical diffserv setup, as described in the RFCs.
> It doesn't assure fair bandwidth sharing between active clients. 
>
> We just can't decide what traffic is excess using some predetermined
> rate; we must look at the current rates of other clients and penalize those
> who use unfair shares. Such meters and policers could exist, but I don't
> know of any. 
> wrr and htb can do it, but they use queuing and round-robin
> to achieve fairness, not meters and policers.
> 

Look at what i said above. A simple priority scheduler is sufficient; no
need for WRR or HTB. 

cheers,
jamal


* Re: [RFC/PATCH] IMQ port to 2.6
  2004-01-31 20:26                       ` jamal
@ 2004-01-31 20:53                         ` Vladimir B. Savkin
  2004-01-31 21:25                           ` jamal
  0 siblings, 1 reply; 29+ messages in thread
From: Vladimir B. Savkin @ 2004-01-31 20:53 UTC (permalink / raw)
  To: jamal; +Cc: netdev

On Sat, Jan 31, 2004 at 03:26:53PM -0500, jamal wrote:
> Hello Vladimir,
> 
> On Sat, 2004-01-31 at 13:52, Vladimir B. Savkin wrote:
> > Jamal, I think you did not understand the role of IMQ in my setup.
> > 
> [..]
> > > > 
> > > >                     +---------+       +-ppp0- ... - client0
> > > >                     |         +-eth1-<+-ppp1- ... - client1
> > > > Internet ----- eth0-+ router  |     . . . . . . . .
> > > >                     |         +-eth2-<  . . . . . .
> > > >                     +---------+       +-pppN- ... - clientN
> 
> > And here is what you suggest:
> 
> > > So why cant you attach an ingress qdisc on eth1-2 and use policing to
> > > mark excess traffic (not drop)? On eth0 all you do is based on the mark
> > > you stash them on a different class i.e move the stuff you have on
> > > IMQ0 to eth0. 
> > 
> > But in my case, eth0 is an ingress device, and eth1 and eth2 are 
> > (physical) egress devices.
> 
> You are correct to say i misunderstood, because i covered only one
> direction. Lets cover both cases now. I am going to try to be verbose.
> 
> Lets take something like an ftp/http download as an example, and hopefully
> this time i will grasp your requirements.
> 
> Case1: bulk transfer going right->left  
> 
> In this case you want to restrict how much client0..N can send to the
> internet. There are two ways to do it, the first one as you say below:
> 
> > For traffic going other direction (from right to left) I could do
> > without IMQ, as you suggest. 
> 
> i.e based on client0..N you attach some rules on eth0.
> 
> The second scheme is what i was saying in my email to Tomas - it is
> achievable via the ingress qdisc on both eth1 and eth2 and egress root
> prio qdisc on eth0. A diagram would help to show how the policing is
> done:
> 
>   
>   ^
>   |
>   |
>   +--------- MAX available internet bandwidth for all clients
>   |
>   ^
>   |   Area B
>   | 
>   +--------- MAX available internet b/width for each of eth1/2
>   |
>   ^
>   |  Area A
>   |  
>   +----------MAX guaranteed bandwidth available to client for internet
>   | 
>   +---------- 0 b/width
> 
> 
> Not sure if this diagram is clear or not - theres a gauge of bandwidth
> going up vertically. The maximum is whatever bandwidth you have on the
> internet side.
> Each client is given a guaranteed (long-term) bandwidth. This is
> labelled "MAX guaranteed bandwidth available to client for internet".
> What you do is fwmark the packet if it doesnt exceed its fair bandwidth
> share - lets say mark 1. 
> If this is exceeded then the client is in "Area A" of the gauge above.
> Everything in Area A is what a combination of all clients on each device
> (eth1 for example) can use. If a client reaches this area you mark them
> with 2.
> If this is exceeded then the client is in area B. In this case, you mark
> the client with a 3.
> If they exceed "MAX available internet bandwidth for all clients"
> then there is no point in sending them on their way to eth0; just drop them.
> 
> On the eth0 side, all you need to do really is put the different marks
> (using the fwmark classifier) on different priority queues (use a simple
> prio qdisc, nothing more).
> Priority-wise: mark 1 > mark 2 > mark 3,
> i.e. as long as you have mark 1 packets send only those to the internet.
> If there are no more on the mark 1 queue send mark 2. If no more of those,
> send mark 3.
> Eventually some of these queues will be overflowing for clients which
> are greedy.
>  
> Note that Areas A and B are shared between many clients and are here to
> serve as an example, just to show how you can do the following:
> a) a client gets a fair share, i.e. a guaranteed rate over a long period
> of time.

No, that's not what I mean by fairness!
It's no problem to give everyone their guaranteed rate.

> b) many clients coming from one device like eth1 share some excess
> bandwidth allocated for eth1 if it is available.
> c) many clients share bandwidth allocated for the system (i.e.
> free-for-all for eth1 and eth2).

Yes, they will share it. But in what proportion?
Your proposal does not guarantee anything about this,
and I want clients to share excess bandwidth fairly.
That's what round-robin schemes can give.

With your solution, if every client opens some number of TCP connections
to download files, bandwidth will be divided between clients in
proportion to the number of connections, since every connection will be in
equal conditions. That's exactly what I aim to prevent.

> Ok, so lets take a look at case 2, which is right <- left,
> i.e. clients downloading from the internet.
> Repeat what i described as method 2 above with the ingress qdisc at eth0
> and egress on eth1 and eth2.

There is no way for internet traffic to saturate the fast ethernet link,
since the uplink is only a few megabits/sec.
So, the egress queue will always be empty, priorities will have no effect
whatsoever, and packets will be neither dropped nor delayed.

See, my bandwidth limit is artificial and defined for political reasons.
And that's the only restriction that is defined, and the goal
is maximal fairness. A minimal guaranteed rate for each client is
not enough.
With your proposal, there's just no place to put this aggregate
restriction, except a policer, which doesn't give fairness.

> 
> The nice thing about the policer is the ability to share bandwidth
> between devices and flows regardless of direction.
>  
> 
> > > The most important thing to know is that policers can be shared across 
> > > devices, flows etc using the "index" operator.
> > > 
> > 
> > Ok, this looks like a typical diffserv setup, as described in the RFCs.
> > It doesn't assure fair bandwidth sharing between active clients. 
> >
> > We just can't decide what traffic is excess using some predetermined
> > rate; we must look at the current rates of other clients and penalize those
> > who use unfair shares. Such meters and policers could exist, but I don't
> > know of any. 
> > wrr and htb can do it, but they use queuing and round-robin
> > to achieve fairness, not meters and policers.
> > 
> 
> Look at what i said above. A simple priority scheduler is sufficient; no
> need for WRR or HTB. 

No, it's not, as described above.

~
:wq
                                        With best regards, 
                                           Vladimir Savkin. 


* Re: [RFC/PATCH] IMQ port to 2.6
  2004-01-31 20:53                         ` Vladimir B. Savkin
@ 2004-01-31 21:25                           ` jamal
  2004-01-31 21:32                             ` Vladimir B. Savkin
  0 siblings, 1 reply; 29+ messages in thread
From: jamal @ 2004-01-31 21:25 UTC (permalink / raw)
  To: Vladimir B. Savkin; +Cc: netdev

On Sat, 2004-01-31 at 15:53, Vladimir B. Savkin wrote:
> On Sat, Jan 31, 2004 at 03:26:53PM -0500, jamal wrote:
[..]
> > Note that Areas A and B are shared between many clients and are here to
> > serve as an example, just to show how you can do the following:
> > a) a client gets a fair share, i.e. a guaranteed rate over a long period
> > of time.
> 
> No, that's not what I mean by fairness!
> It's no problem to give everyone their guaranteed rate.
> 

> > b) many clients coming from one device like eth1 share some excess
> > bandwidth allocated for eth1 if it is available.
> > c) many clients share bandwidth allocated for the system (i.e.
> > free-for-all for eth1 and eth2).
> 
> Yes, they will share it. But in what proportion?

Excess b/width is shared in FIFO mode in what i described: whoever comes
first grabs what's excess.
Thanks for clarifying this point. 

[..]

> With your solution, if every client opens some number of TCP connections

[..]


> See, my bandwidth limit is artificial and defined for political reasons.
> And that's the only restriction that is defined, and the goal
> is maximal fairness. A minimal guaranteed rate for each client is
> not enough.
> With your proposal, there's just no place to put this aggregate
> restriction, except a policer, which doesn't give fairness.
> 

Ok, i think i have understood you finally;->
The challenge is in this one direction whose characteristics can be
described as follows:
a) Incoming pipe (from internet) is smaller than outgoing pipe (to
clients).
b) Desire is to have excess bandwidth with max fairness to all flows
instead of a free-for-all scheme.
[This can only be achieved by a non-work-conserving scheduler].

Is the above correct?

cheers,
jamal


* Re: [RFC/PATCH] IMQ port to 2.6
  2004-01-31 21:25                           ` jamal
@ 2004-01-31 21:32                             ` Vladimir B. Savkin
  2004-01-31 21:49                               ` jamal
  0 siblings, 1 reply; 29+ messages in thread
From: Vladimir B. Savkin @ 2004-01-31 21:32 UTC (permalink / raw)
  To: jamal; +Cc: netdev

> Ok, i think i have understood you finally;->
> The challenge is in this one direction whose characteristics can be
> described as follows:

In the other direction, the goal is the same, but IMQ is not needed,
since there is only one Internet link.

> a) Incoming pipe (from internet) is smaller than outgoing pipe (to
> clients).

Yes, and the artificial limit is even smaller.

> b) Desire is to have excess bandwidth with max fairness to all flows
> instead of a free-for-all scheme.

Yes, if you define "flow" as all traffic to one client.
Actually, I use a two-level hierarchy: within every flow in the above
sense, each micro-flow receives a fair amount of bandwidth
(approximately, using sfq).

> [This can only be achieved by a non-work-conserving scheduler].

Yes.

> 
> Is the above correct?
> 

It seems so :)

~
:wq
                                        With best regards, 
                                           Vladimir Savkin. 


* Re: [RFC/PATCH] IMQ port to 2.6
  2004-01-31 21:32                             ` Vladimir B. Savkin
@ 2004-01-31 21:49                               ` jamal
  2004-01-31 21:58                                 ` Vladimir B. Savkin
  0 siblings, 1 reply; 29+ messages in thread
From: jamal @ 2004-01-31 21:49 UTC (permalink / raw)
  To: Vladimir B. Savkin; +Cc: netdev

On Sat, 2004-01-31 at 16:32, Vladimir B. Savkin wrote:


> Yes, if you define "flow" as all traffic to one client.
>
> Actually, I use a two-level hierarchy: within every flow in the above
> sense, each micro-flow receives a fair amount of bandwidth
> (approximately, using sfq).

Ok. 

> > [This can only be achieved by a non-work-conserving scheduler].
> 
> Yes.

Still a few rough edges, so bear with me:
Would you not be able to achieve the same if you used the marking scheme
i described earlier on eth0 and used HTB or HFSC or CBQ (as non-work
conserving) on eth1/2? I was suggesting prio before, and you pointed out
that the queues would never be full enough for that to have any value. 

cheers,
jamal


* Re: [RFC/PATCH] IMQ port to 2.6
  2004-01-31 21:49                               ` jamal
@ 2004-01-31 21:58                                 ` Vladimir B. Savkin
  2004-01-31 22:26                                   ` jamal
  0 siblings, 1 reply; 29+ messages in thread
From: Vladimir B. Savkin @ 2004-01-31 21:58 UTC (permalink / raw)
  To: jamal; +Cc: netdev

On Sat, Jan 31, 2004 at 04:49:24PM -0500, jamal wrote:
> Still a few rough edges, so bear with me:
> Would you not be able to achieve the same if you used the marking scheme
> i described earlier on eth0 and used HTB or HFSC or CBQ (as non-work
> conserving) on eth1/2? I was suggesting prio before, and you pointed out
> that the queues would never be full enough for that to have any value. 
> 

Well, no, the primary reason being that there would be no single class
with the appropriate bandwidth limit (ceil). There would be multiple classes,
one for each egress interface, and the actual upper limit would be the sum
of the bandwidths of all the classes. I would have to limit every class to
some part of the aggregate limit, and that limit would be enforced even if
the other classes were not using their shares. So, no fairness.
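
To illustrate with made-up numbers: with a 4mbit artificial aggregate
and two egress devices, the ceilings can only be split statically:

tc qdisc add dev eth1 root handle 1: htb default 10
tc class add dev eth1 parent 1: classid 1:10 htb rate 2mbit ceil 2mbit
tc qdisc add dev eth2 root handle 1: htb default 10
tc class add dev eth2 parent 1: classid 1:10 htb rate 2mbit ceil 2mbit
# if all the active clients happen to be behind eth1, they are still
# capped at 2mbit even though the full 4mbit aggregate is idle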

~
:wq
                                        With best regards, 
                                           Vladimir Savkin. 


* Re: [RFC/PATCH] IMQ port to 2.6
  2004-01-31 21:58                                 ` Vladimir B. Savkin
@ 2004-01-31 22:26                                   ` jamal
  2004-04-11 19:32                                     ` (Long) ANNOUNCE: IMQ replacement WAS(Re: " jamal
  0 siblings, 1 reply; 29+ messages in thread
From: jamal @ 2004-01-31 22:26 UTC (permalink / raw)
  To: Vladimir B. Savkin; +Cc: netdev

On Sat, 2004-01-31 at 16:58, Vladimir B. Savkin wrote:

> Well, no, the primary reason being that there would be no single class
> with the appropriate bandwidth limit (ceil). There would be multiple classes,

Ok - i think you made your point. 
So i should add that a third condition is that there are multiple devices
towards the clients.
You have convinced me there is value in such a scheme as IMQ provides
for such conditions. As it is right now though, IMQ needs to have the
right abstraction (and not be dependent on netfilter). And maybe we
could abuse it to do other things. 
Let me hear from Tomas and then we should take it from there.

cheers,
jamal


* (Long) ANNOUNCE: IMQ replacement WAS(Re: [RFC/PATCH] IMQ port to 2.6
  2004-01-31 22:26                                   ` jamal
@ 2004-04-11 19:32                                     ` jamal
  0 siblings, 0 replies; 29+ messages in thread
From: jamal @ 2004-04-11 19:32 UTC (permalink / raw)
  To: Vladimir B. Savkin; +Cc: netdev

Hello,

Following up on a 3-month-old email ;->

I finally hacked the dummy device into a good replacement (IMO) for IMQ.
I am only subscribed to netdev, so if there are other lists which
are interested in this subject please forward it on, but make sure
responses make it to netdev.

Well, why dummy you ask? Because it is such a dumb device ;->
Ok, that may not be funny enough; how about:
because nobody has touched the dummy device in 10 years - that cant be
right in Linux. On a serious note though, because i didnt think it was
worth writing another device for this. Dummy continues to work the same
way when not used with tc extensions. Like i said in my email at the
bottom, IMQ was just at the wrong abstraction layer. The dummy
extension can now pick up ANY packets (not just IP, and without needing
to attach to a few hooks to get IPv6, arp etc).
Of course all this needs the tc extensions (which have a lot of other
features that i wont discuss here).

Why dont i show an example:

----
export TC="/sbin/tc"
#
#attach prio qdisc to the dummy0 device
#
$TC qdisc add dev dummy0 root handle 1: prio
$TC qdisc add dev dummy0 parent 1:1 handle 10: sfq
$TC qdisc add dev dummy0 parent 1:2 handle 20: tbf rate 20kbit buffer 1600 limit 3000
$TC qdisc add dev dummy0 parent 1:3 handle 30: sfq
# redirect packets coming in with fwmark 1 to class 1:1 (sfq)
$TC filter add dev dummy0 protocol ip pref 1 parent 1: handle 1 fw classid 1:1
#redirect packets tagged with fwmark 2 to 1:2 (tbf)
$TC filter add dev dummy0 protocol ip pref 2 parent 1: handle 2 fw classid 1:2

#bring up dummy0
ifconfig dummy0 up

#watch the ingress of eth0;
$TC qdisc add dev eth0 ingress

# redirect all IP packets arriving in eth0 to dummy0
# use mark 1 --> puts them onto class 1:1
$TC filter add dev eth0 parent ffff: protocol ip prio 10 u32 \
match u32 0 0 flowid 1:1 \
action ipt -j MARK --set-mark 1 \
action mirred egress redirect dev dummy0

# note, the above just shows eth0 and only at ingress;
# you could repeat this on egress/ingress of any device
# and redirect to dummy0 if you wanted; 
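
# an untested sketch of the same trick hung off the egress side of
# eth1 (handle and mark values here are made up):
$TC qdisc add dev eth1 root handle 2: prio
$TC filter add dev eth1 parent 2: protocol ip prio 10 u32 \
match u32 0 0 flowid 2:1 \
action ipt -j MARK --set-mark 2 \
action mirred egress redirect dev dummy0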

A Little test:

from another machine ping so that you have packets going into the box:
-----
[root@jzny action-tests]# ping 10.22
PING 10.22 (10.0.0.22): 56 data bytes
64 bytes from 10.0.0.22: icmp_seq=0 ttl=64 time=2.8 ms
64 bytes from 10.0.0.22: icmp_seq=1 ttl=64 time=0.6 ms
64 bytes from 10.0.0.22: icmp_seq=2 ttl=64 time=0.6 ms

--- 10.22 ping statistics ---
3 packets transmitted, 3 packets received, 0% packet loss
round-trip min/avg/max = 0.6/1.3/2.8 ms
[root@jzny action-tests]#

Now look at some stats:
-----
[root@jmandrake]:~# tc -s filter show parent ffff: dev eth0
filter protocol ip pref 10 u32
filter protocol ip pref 10 u32 fh 800: ht divisor 1
filter protocol ip pref 10 u32 fh 800::800 order 2048 key ht 800 bkt 0
flowid 1:1 
  match 00000000/00000000 at 0
        action order 1: tablename: mangle  hook: NF_IP_PRE_ROUTING
        target MARK set 0x1
        index 1 ref 1 bind 1 installed 4195sec  used 27sec
         Sent 252 bytes 3 pkts (dropped 0, overlimits 0)

        action order 2: mirred (Egress Redirect to device dummy0) stolen
        index 1 ref 1 bind 1 installed 165 sec used 27 sec
         Sent 252 bytes 3 pkts (dropped 0, overlimits 0)

[root@jmandrake]:~# ifconfig dummy0
dummy0    Link encap:Ethernet  HWaddr 00:00:00:00:00:00
          inet6 addr: fe80::200:ff:fe00:0/64 Scope:Link
          UP BROADCAST RUNNING NOARP  MTU:1500  Metric:1
          RX packets:6 errors:0 dropped:3 overruns:0 frame:0
          TX packets:3 errors:0 dropped:0 overruns:0 carrier:0
          collisions:0 txqueuelen:32
          RX bytes:504 (504.0 b)  TX bytes:252 (252.0 b)
-----

Note that the three extra received packets on dummy0 were ndisc packets
sent by the stack when it booted up (which would normally be dropped -
they were). Also, the mirred action can do a _lot_ more, but thats not
the point of this email. Send me private email if you want to know more.
Additionally note: the ipt report of NF_IP_PRE_ROUTING is a lie, since
this happens waaay before IP.
This has been tested on both uniprocessor and SMP machines. Unfortunately,
the code is only available for 2.4.x (2.4.25 patches available - more
vigorous testing happened on 2.4.21 - my two machines above).

What am i looking for? 
1) users and authors of IMQ to tell me if this achieves what IMQ started
out to do. I have to say I DONT like the level of obtrusiveness of IMQ as
it is today. The code added by this is small (100 or fewer lines on top of
dummy) and doesnt touch any of the main core bits.
2) testing of the above by people who use IMQ
3) If someone has better ideas - i am not religious about keeping this;
but it certainly cant be the blasphemy that IMQ introduces.

I have also introduced hooks to easily add a -i <input dev> to tc
classifiers - still on the TODO list. So on egress you could then
classify based on which incoming device the packet arrived on.

cheers,
jamal

On Sat, 2004-01-31 at 17:26, jamal wrote:
> On Sat, 2004-01-31 at 16:58, Vladimir B. Savkin wrote:
> 
> > Well, no, the primary reason being that there would be no single class
> > with the appropriate bandwidth limit (ceil). There would be multiple classes,
> 
> Ok - i think you made your point. 
> So i should add that a third condition is that there are multiple devices
> towards the clients.
> You have convinced me there is value in such a scheme as IMQ provides
> for such conditions. As it is right now though, IMQ needs to have the
> right abstraction (and not be dependent on netfilter). And maybe we
> could abuse it to do other things. 
> Let me hear from Tomas and then we should take it from there.
> 
> cheers,
> jamal
> 
> 



Thread overview: 29+ messages
2004-01-25 15:24 [RFC/PATCH] IMQ port to 2.6 Marcel Sebek
2004-01-25 16:44 ` Tomas Szepe
2004-01-25 19:22   ` jamal
2004-01-25 20:21     ` Vladimir B. Savkin
2004-01-25 23:45       ` jamal
2004-01-26  0:11         ` Vladimir B. Savkin
2004-01-26  3:09           ` jamal
2004-01-26  9:32             ` Vladimir B. Savkin
2004-01-26 13:38               ` jamal
2004-01-26 13:55                 ` Vladimir B. Savkin
2004-01-26 14:29                   ` jamal
2004-01-26 17:41                     ` Vladimir B. Savkin
2004-01-27  3:25                       ` jamal
2004-01-31 18:52                     ` Vladimir B. Savkin
2004-01-31 20:26                       ` jamal
2004-01-31 20:53                         ` Vladimir B. Savkin
2004-01-31 21:25                           ` jamal
2004-01-31 21:32                             ` Vladimir B. Savkin
2004-01-31 21:49                               ` jamal
2004-01-31 21:58                                 ` Vladimir B. Savkin
2004-01-31 22:26                                   ` jamal
2004-04-11 19:32                                     ` (Long) ANNOUNCE: IMQ replacement WAS(Re: " jamal
2004-01-26 15:24                   ` Tomas Szepe
2004-01-27  3:14                     ` jamal
2004-01-27 11:59                       ` Tomas Szepe
2004-01-31 17:02                         ` jamal
2004-01-25 19:25 ` David S. Miller
2004-01-25 20:23   ` Patrick McHardy
2004-01-25 21:55     ` David S. Miller
