* [RFC] NET: Multiple queue hardware support
@ 2007-06-04 21:40 PJ Waskiewicz
2007-06-04 21:40 ` [PATCH] NET: Multiqueue network device support PJ Waskiewicz
0 siblings, 1 reply; 153+ messages in thread
From: PJ Waskiewicz @ 2007-06-04 21:40 UTC (permalink / raw)
To: davem; +Cc: netdev, jeff, auke-jan.h.kok
This patchset is an updated version of previous multiqueue network device
support patches. The general approach of introducing a new API for multiqueue
network devices to register with the stack has remained the same. The changes include
adding a round-robin qdisc, heavily based on sch_prio, which will allow
queueing to hardware with no OS-enforced queuing policy. sch_prio still has
the multiqueue code in it, but has a Kconfig option to compile it out of the
qdisc. This allows people with hardware containing scheduling policies to
use sch_rr (round-robin), and others without scheduling policies in hardware
to continue using sch_prio if they wish to have some notion of scheduling
priority.
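As a rough illustration of the driver side of the API (the mydrv_* names
below are hypothetical and not part of these patches), a driver whose
hardware has, say, four TX rings would allocate its netdev and advertise
the feature roughly like this:

    #include <linux/etherdevice.h>
    #include <linux/netdevice.h>

    #define MYDRV_TX_QUEUES 4           /* assumption: 4 hardware TX rings */

    struct mydrv_adapter {              /* hypothetical driver-private data */
            struct net_device *netdev;
    };

    static int mydrv_probe(void)        /* heavily simplified probe */
    {
            struct net_device *netdev;

            /* one subqueue state per hardware TX ring */
            netdev = alloc_etherdev_mq(sizeof(struct mydrv_adapter),
                                       MYDRV_TX_QUEUES);
            if (!netdev)
                    return -ENOMEM;

            /* advertise multiqueue so netif_is_multiqueue() returns true */
            netdev->features |= NETIF_F_MULTI_QUEUE;

            return register_netdev(netdev);
    }

Drivers that keep calling plain alloc_etherdev() get a single subqueue and
see no change in behavior.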
The patches to iproute2 for tc will be sent separately, to support sch_rr.
I'm soliciting feedback for a 2.6.23 submission. Thanks.
--
PJ Waskiewicz <peter.p.waskiewicz.jr@intel.com>
* [PATCH] NET: Multiqueue network device support.
2007-06-04 21:40 [RFC] NET: Multiple queue hardware support PJ Waskiewicz
@ 2007-06-04 21:40 ` PJ Waskiewicz
2007-06-05 11:50 ` jamal
` (2 more replies)
0 siblings, 3 replies; 153+ messages in thread
From: PJ Waskiewicz @ 2007-06-04 21:40 UTC (permalink / raw)
To: davem; +Cc: netdev, jeff, auke-jan.h.kok
API added to support multiple hardware queues on an ethernet device.
Round-robin scheduler added (sch_rr) to provide a no-scheduling policy
qdisc for hardware with multiple queues.
Signed-off-by: Peter P Waskiewicz Jr <peter.p.waskiewicz.jr@intel.com>
---
include/linux/etherdevice.h | 3
include/linux/netdevice.h | 62 +++++
include/linux/pkt_sched.h | 11 +
include/linux/skbuff.h | 2
net/core/dev.c | 27 ++
net/core/skbuff.c | 3
net/ethernet/eth.c | 9 -
net/sched/Kconfig | 22 ++
net/sched/Makefile | 1
net/sched/sch_generic.c | 4
net/sched/sch_prio.c | 66 +++++-
net/sched/sch_rr.c | 516 +++++++++++++++++++++++++++++++++++++++++++
12 files changed, 706 insertions(+), 20 deletions(-)
diff --git a/include/linux/etherdevice.h b/include/linux/etherdevice.h
index 071c67a..283e687 100644
--- a/include/linux/etherdevice.h
+++ b/include/linux/etherdevice.h
@@ -39,7 +39,8 @@ extern void eth_header_cache_update(struct hh_cache *hh, struct net_device *dev
extern int eth_header_cache(struct neighbour *neigh,
struct hh_cache *hh);
-extern struct net_device *alloc_etherdev(int sizeof_priv);
+extern struct net_device *alloc_etherdev_mq(int sizeof_priv, int queue_count);
+#define alloc_etherdev(sizeof_priv) alloc_etherdev_mq(sizeof_priv, 1)
static inline void eth_copy_and_sum (struct sk_buff *dest,
const unsigned char *src,
int len, int base)
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index f671cd2..376a0d2 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -108,6 +108,14 @@ struct wireless_dev;
#define MAX_HEADER (LL_MAX_HEADER + 48)
#endif
+struct net_device_subqueue
+{
+ /* Give a control state for each queue. This struct may contain
+ * per-queue locks in the future.
+ */
+ unsigned long state;
+};
+
/*
* Network device statistics. Akin to the 2.0 ether stats but
* with byte counters.
@@ -325,6 +333,7 @@ struct net_device
#define NETIF_F_VLAN_CHALLENGED 1024 /* Device cannot handle VLAN packets */
#define NETIF_F_GSO 2048 /* Enable software GSO. */
#define NETIF_F_LLTX 4096 /* LockLess TX */
+#define NETIF_F_MULTI_QUEUE 16384 /* Has multiple TX/RX queues */
/* Segmentation offload features */
#define NETIF_F_GSO_SHIFT 16
@@ -540,6 +549,10 @@ struct net_device
struct device dev;
/* space for optional statistics and wireless sysfs groups */
struct attribute_group *sysfs_groups[3];
+
+ /* The TX queue control structures */
+ struct net_device_subqueue *egress_subqueue;
+ int egress_subqueue_count;
};
#define to_net_dev(d) container_of(d, struct net_device, dev)
@@ -702,6 +715,48 @@ static inline int netif_running(const struct net_device *dev)
return test_bit(__LINK_STATE_START, &dev->state);
}
+/*
+ * Routines to manage the subqueues on a device. We only need start
+ * stop, and a check if it's stopped. All other device management is
+ * done at the overall netdevice level.
+ * Also test the device if we're multiqueue.
+ */
+static inline void netif_start_subqueue(struct net_device *dev, u16 queue_index)
+{
+ clear_bit(__LINK_STATE_XOFF, &dev->egress_subqueue[queue_index].state);
+}
+
+static inline void netif_stop_subqueue(struct net_device *dev, u16 queue_index)
+{
+#ifdef CONFIG_NETPOLL_TRAP
+ if (netpoll_trap())
+ return;
+#endif
+ set_bit(__LINK_STATE_XOFF, &dev->egress_subqueue[queue_index].state);
+}
+
+static inline int netif_subqueue_stopped(const struct net_device *dev,
+ u16 queue_index)
+{
+ return test_bit(__LINK_STATE_XOFF,
+ &dev->egress_subqueue[queue_index].state);
+}
+
+static inline void netif_wake_subqueue(struct net_device *dev, u16 queue_index)
+{
+#ifdef CONFIG_NETPOLL_TRAP
+ if (netpoll_trap())
+ return;
+#endif
+ if (test_and_clear_bit(__LINK_STATE_XOFF,
+ &dev->egress_subqueue[queue_index].state))
+ __netif_schedule(dev);
+}
+
+static inline int netif_is_multiqueue(const struct net_device *dev)
+{
+ return (!!(NETIF_F_MULTI_QUEUE & dev->features));
+}
/* Use this variant when it is known for sure that it
* is executing from interrupt context.
@@ -995,8 +1050,11 @@ static inline void netif_tx_disable(struct net_device *dev)
extern void ether_setup(struct net_device *dev);
/* Support for loadable net-drivers */
-extern struct net_device *alloc_netdev(int sizeof_priv, const char *name,
- void (*setup)(struct net_device *));
+extern struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
+ void (*setup)(struct net_device *),
+ int queue_count);
+#define alloc_netdev(sizeof_priv, name, setup) \
+ alloc_netdev_mq(sizeof_priv, name, setup, 1)
extern int register_netdev(struct net_device *dev);
extern void unregister_netdev(struct net_device *dev);
/* Functions used for multicast support */
diff --git a/include/linux/pkt_sched.h b/include/linux/pkt_sched.h
index d10f353..0d1adaf 100644
--- a/include/linux/pkt_sched.h
+++ b/include/linux/pkt_sched.h
@@ -22,6 +22,7 @@
#define TC_PRIO_CONTROL 7
#define TC_PRIO_MAX 15
+#define TC_RR_MAX 15
/* Generic queue statistics, available for all the elements.
Particular schedulers may have also their private records.
@@ -90,6 +91,16 @@ struct tc_fifo_qopt
__u32 limit; /* Queue length: bytes for bfifo, packets for pfifo */
};
+/* RR section */
+#define TCQ_RR_BANDS 16
+#define TCQ_MIN_RR_BANDS 2
+
+struct tc_rr_qopt
+{
+ int bands; /* Number of bands */
+ __u8 priomap[TC_RR_MAX+1]; /* Map: Linux priority -> RR band */
+};
+
/* PRIO section */
#define TCQ_PRIO_BANDS 16
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index e7367c7..8bcd870 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -215,6 +215,7 @@ typedef unsigned char *sk_buff_data_t;
* @pkt_type: Packet class
* @fclone: skbuff clone status
* @ip_summed: Driver fed us an IP checksum
+ * @queue_mapping: Queue mapping for multiqueue devices
* @priority: Packet queueing priority
* @users: User count - see {datagram,tcp}.c
* @protocol: Packet protocol from driver
@@ -269,6 +270,7 @@ struct sk_buff {
__u16 csum_offset;
};
};
+ __u16 queue_mapping;
__u32 priority;
__u8 local_df:1,
cloned:1,
diff --git a/net/core/dev.c b/net/core/dev.c
index 4317c1b..27c90e1 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -1477,6 +1477,8 @@ gso:
spin_lock(&dev->queue_lock);
q = dev->qdisc;
if (q->enqueue) {
+ /* reset queue_mapping to zero */
+ skb->queue_mapping = 0;
rc = q->enqueue(skb, q);
qdisc_run(dev);
spin_unlock(&dev->queue_lock);
@@ -3273,16 +3275,18 @@ static struct net_device_stats *internal_stats(struct net_device *dev)
}
/**
- * alloc_netdev - allocate network device
+ * alloc_netdev_mq - allocate network device
* @sizeof_priv: size of private data to allocate space for
* @name: device name format string
* @setup: callback to initialize device
+ * @queue_count: the number of subqueues to allocate
*
* Allocates a struct net_device with private data area for driver use
- * and performs basic initialization.
+ * and performs basic initialization. Also allocates subqueue structs
+ * for each queue on the device.
*/
-struct net_device *alloc_netdev(int sizeof_priv, const char *name,
- void (*setup)(struct net_device *))
+struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
+ void (*setup)(struct net_device *), int queue_count)
{
void *p;
struct net_device *dev;
@@ -3307,12 +3311,23 @@ struct net_device *alloc_netdev(int sizeof_priv, const char *name,
if (sizeof_priv)
dev->priv = netdev_priv(dev);
+ alloc_size = (sizeof(struct net_device_subqueue) * queue_count);
+
+ p = kzalloc(alloc_size, GFP_KERNEL);
+ if (!p) {
+ printk(KERN_ERR "alloc_netdev: Unable to allocate queues.\n");
+ return NULL;
+ }
+
+ dev->egress_subqueue = p;
+ dev->egress_subqueue_count = queue_count;
+
dev->get_stats = internal_stats;
setup(dev);
strcpy(dev->name, name);
return dev;
}
-EXPORT_SYMBOL(alloc_netdev);
+EXPORT_SYMBOL(alloc_netdev_mq);
/**
* free_netdev - free network device
@@ -3326,6 +3341,7 @@ void free_netdev(struct net_device *dev)
{
#ifdef CONFIG_SYSFS
/* Compatibility with error handling in drivers */
+ kfree((char *)dev->egress_subqueue);
if (dev->reg_state == NETREG_UNINITIALIZED) {
kfree((char *)dev - dev->padded);
return;
@@ -3337,6 +3353,7 @@ void free_netdev(struct net_device *dev)
/* will free via device release */
put_device(&dev->dev);
#else
+ kfree((char *)dev->egress_subqueue);
kfree((char *)dev - dev->padded);
#endif
}
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 1422573..0528cf3 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -418,6 +418,7 @@ struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t gfp_mask)
n->nohdr = 0;
C(pkt_type);
C(ip_summed);
+ C(queue_mapping);
C(priority);
#if defined(CONFIG_IP_VS) || defined(CONFIG_IP_VS_MODULE)
C(ipvs_property);
@@ -459,6 +460,7 @@ static void copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
#endif
new->sk = NULL;
new->dev = old->dev;
+ new->queue_mapping = old->queue_mapping;
new->priority = old->priority;
new->protocol = old->protocol;
new->dst = dst_clone(old->dst);
@@ -1926,6 +1928,7 @@ struct sk_buff *skb_segment(struct sk_buff *skb, int features)
tail = nskb;
nskb->dev = skb->dev;
+ nskb->queue_mapping = skb->queue_mapping;
nskb->priority = skb->priority;
nskb->protocol = skb->protocol;
nskb->dst = dst_clone(skb->dst);
diff --git a/net/ethernet/eth.c b/net/ethernet/eth.c
index 0ac2524..87a509c 100644
--- a/net/ethernet/eth.c
+++ b/net/ethernet/eth.c
@@ -316,9 +316,10 @@ void ether_setup(struct net_device *dev)
EXPORT_SYMBOL(ether_setup);
/**
- * alloc_etherdev - Allocates and sets up an Ethernet device
+ * alloc_etherdev_mq - Allocates and sets up an Ethernet device
* @sizeof_priv: Size of additional driver-private structure to be allocated
* for this Ethernet device
+ * @queue_count: The number of queues this device has.
*
* Fill in the fields of the device structure with Ethernet-generic
* values. Basically does everything except registering the device.
@@ -328,8 +329,8 @@ EXPORT_SYMBOL(ether_setup);
* this private data area.
*/
-struct net_device *alloc_etherdev(int sizeof_priv)
+struct net_device *alloc_etherdev_mq(int sizeof_priv, int queue_count)
{
- return alloc_netdev(sizeof_priv, "eth%d", ether_setup);
+ return alloc_netdev_mq(sizeof_priv, "eth%d", ether_setup, queue_count);
}
-EXPORT_SYMBOL(alloc_etherdev);
+EXPORT_SYMBOL(alloc_etherdev_mq);
diff --git a/net/sched/Kconfig b/net/sched/Kconfig
index 475df84..a532554 100644
--- a/net/sched/Kconfig
+++ b/net/sched/Kconfig
@@ -111,6 +111,28 @@ config NET_SCH_PRIO
To compile this code as a module, choose M here: the
module will be called sch_prio.
+config NET_SCH_PRIO_MQ
+ bool "Multiple hardware queue support for PRIO"
+ depends on NET_SCH_PRIO
+ ---help---
+ Say Y here if you want to allow the PRIO qdisc to assign
+ flows to multiple hardware queues on an ethernet device. This
+ will still work on devices with 1 queue.
+
+ Consider this scheduler for devices that do not use
+ hardware-based scheduling policies. Otherwise, use NET_SCH_RR.
+
+ Most people will say N here.
+
+config NET_SCH_RR
+ tristate "Multi Band Round Robin Queuing (RR)"
+ ---help---
+ Say Y here if you want to use an n-band round robin packet
+ scheduler.
+
+ To compile this code as a module, choose M here: the
+ module will be called sch_rr.
+
config NET_SCH_RED
tristate "Random Early Detection (RED)"
---help---
diff --git a/net/sched/Makefile b/net/sched/Makefile
index 020767a..d3ed44e 100644
--- a/net/sched/Makefile
+++ b/net/sched/Makefile
@@ -26,6 +26,7 @@ obj-$(CONFIG_NET_SCH_SFQ) += sch_sfq.o
obj-$(CONFIG_NET_SCH_TBF) += sch_tbf.o
obj-$(CONFIG_NET_SCH_TEQL) += sch_teql.o
obj-$(CONFIG_NET_SCH_PRIO) += sch_prio.o
+obj-$(CONFIG_NET_SCH_RR) += sch_rr.o
obj-$(CONFIG_NET_SCH_ATM) += sch_atm.o
obj-$(CONFIG_NET_SCH_NETEM) += sch_netem.o
obj-$(CONFIG_NET_CLS_U32) += cls_u32.o
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index f28bb2d..b9dc2a6 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -123,7 +123,8 @@ static inline int qdisc_restart(struct net_device *dev)
/* And release queue */
spin_unlock(&dev->queue_lock);
- if (!netif_queue_stopped(dev)) {
+ if (!netif_queue_stopped(dev) &&
+ !netif_subqueue_stopped(dev, skb->queue_mapping)) {
int ret;
ret = dev_hard_start_xmit(skb, dev);
@@ -141,7 +142,6 @@ static inline int qdisc_restart(struct net_device *dev)
goto collision;
}
}
-
/* NETDEV_TX_BUSY - we need to requeue */
/* Release the driver */
if (!nolock) {
diff --git a/net/sched/sch_prio.c b/net/sched/sch_prio.c
index 269a6e1..c78dba4 100644
--- a/net/sched/sch_prio.c
+++ b/net/sched/sch_prio.c
@@ -43,6 +43,7 @@ struct prio_sched_data
struct tcf_proto *filter_list;
u8 prio2band[TC_PRIO_MAX+1];
struct Qdisc *queues[TCQ_PRIO_BANDS];
+ u16 band2queue[TC_PRIO_MAX + 1];
};
@@ -70,13 +71,26 @@ prio_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr)
#endif
if (TC_H_MAJ(band))
band = 0;
+#ifdef CONFIG_NET_SCH_PRIO_MQ
+ skb->queue_mapping =
+ q->band2queue[q->prio2band[band&TC_PRIO_MAX]];
+#endif
+
return q->queues[q->prio2band[band&TC_PRIO_MAX]];
}
band = res.classid;
}
band = TC_H_MIN(band) - 1;
- if (band > q->bands)
+ if (band > q->bands) {
+#ifdef CONFIG_NET_SCH_PRIO_MQ
+ skb->queue_mapping = q->band2queue[q->prio2band[0]];
+#endif
return q->queues[q->prio2band[0]];
+ }
+
+#ifdef CONFIG_NET_SCH_PRIO_MQ
+ skb->queue_mapping = q->band2queue[band];
+#endif
return q->queues[band];
}
@@ -144,12 +158,22 @@ prio_dequeue(struct Qdisc* sch)
struct Qdisc *qdisc;
for (prio = 0; prio < q->bands; prio++) {
- qdisc = q->queues[prio];
- skb = qdisc->dequeue(qdisc);
- if (skb) {
- sch->q.qlen--;
- return skb;
+#ifdef CONFIG_NET_SCH_PRIO_MQ
+ /* Check if the target subqueue is available before
+ * pulling an skb. This way we avoid excessive requeues
+ * for slower queues.
+ */
+ if (!netif_subqueue_stopped(sch->dev, q->band2queue[prio])) {
+#endif
+ qdisc = q->queues[prio];
+ skb = qdisc->dequeue(qdisc);
+ if (skb) {
+ sch->q.qlen--;
+ return skb;
+ }
+#ifdef CONFIG_NET_SCH_PRIO_MQ
}
+#endif
}
return NULL;
@@ -200,6 +224,10 @@ static int prio_tune(struct Qdisc *sch, struct rtattr *opt)
struct prio_sched_data *q = qdisc_priv(sch);
struct tc_prio_qopt *qopt = RTA_DATA(opt);
int i;
+ int queue;
+ int qmapoffset;
+ int offset;
+ int mod;
if (opt->rta_len < RTA_LENGTH(sizeof(*qopt)))
return -EINVAL;
@@ -242,6 +270,32 @@ static int prio_tune(struct Qdisc *sch, struct rtattr *opt)
}
}
}
+#ifdef CONFIG_NET_SCH_PRIO_MQ
+ /* setup queue to band mapping */
+ if (q->bands < sch->dev->egress_subqueue_count) {
+ qmapoffset = 1;
+ mod = sch->dev->egress_subqueue_count;
+ } else {
+ mod = q->bands % sch->dev->egress_subqueue_count;
+ qmapoffset = q->bands / sch->dev->egress_subqueue_count
+ + ((mod) ? 1 : 0);
+ }
+
+ queue = 0;
+ offset = 0;
+ for (i = 0; i < q->bands; i++) {
+ q->band2queue[i] = queue;
+ if ( ((i + 1) - offset) == qmapoffset) {
+ queue++;
+ offset += qmapoffset;
+ if (mod)
+ mod--;
+ qmapoffset = q->bands /
+ sch->dev->egress_subqueue_count +
+ ((mod) ? 1 : 0);
+ }
+ }
+#endif
return 0;
}
diff --git a/net/sched/sch_rr.c b/net/sched/sch_rr.c
new file mode 100644
index 0000000..ce9f237
--- /dev/null
+++ b/net/sched/sch_rr.c
@@ -0,0 +1,516 @@
+/*
+ * net/sched/sch_rr.c Simple n-band round-robin scheduler.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * The core part of this qdisc is based on sch_prio. ->dequeue() is where
+ * this scheduler functionally differs.
+ *
+ * Author: PJ Waskiewicz, <peter.p.waskiewicz.jr@intel.com>
+ *
+ * Original Authors (from PRIO): Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+ * Fixes: 19990609: J Hadi Salim <hadi@nortelnetworks.com>:
+ * Init -- EINVAL when opt undefined
+ */
+
+#include <linux/module.h>
+#include <asm/uaccess.h>
+#include <asm/system.h>
+#include <linux/bitops.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/mm.h>
+#include <linux/socket.h>
+#include <linux/sockios.h>
+#include <linux/in.h>
+#include <linux/errno.h>
+#include <linux/interrupt.h>
+#include <linux/if_ether.h>
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+#include <linux/notifier.h>
+#include <net/ip.h>
+#include <net/route.h>
+#include <linux/skbuff.h>
+#include <net/netlink.h>
+#include <net/sock.h>
+#include <net/pkt_sched.h>
+
+
+struct rr_sched_data
+{
+ int bands;
+ int curband;
+ struct tcf_proto *filter_list;
+ u8 prio2band[TC_RR_MAX + 1];
+ struct Qdisc *queues[TCQ_RR_BANDS];
+ u16 band2queue[TC_RR_MAX + 1];
+};
+
+
+static struct Qdisc *rr_classify(struct sk_buff *skb, struct Qdisc *sch,
+ int *qerr)
+{
+ struct rr_sched_data *q = qdisc_priv(sch);
+ u32 band = skb->priority;
+ struct tcf_result res;
+
+ *qerr = NET_XMIT_BYPASS;
+ if (TC_H_MAJ(skb->priority) != sch->handle) {
+#ifdef CONFIG_NET_CLS_ACT
+ switch (tc_classify(skb, q->filter_list, &res)) {
+ case TC_ACT_STOLEN:
+ case TC_ACT_QUEUED:
+ *qerr = NET_XMIT_SUCCESS;
+ case TC_ACT_SHOT:
+ return NULL;
+ }
+
+ if (!q->filter_list ) {
+#else
+ if (!q->filter_list || tc_classify(skb, q->filter_list, &res)) {
+#endif
+ if (TC_H_MAJ(band))
+ band = 0;
+ skb->queue_mapping =
+ q->band2queue[q->prio2band[band&TC_RR_MAX]];
+
+ return q->queues[q->prio2band[band&TC_RR_MAX]];
+ }
+ band = res.classid;
+ }
+ band = TC_H_MIN(band) - 1;
+ if (band > q->bands) {
+ skb->queue_mapping = q->band2queue[q->prio2band[0]];
+ return q->queues[q->prio2band[0]];
+ }
+
+ skb->queue_mapping = q->band2queue[band];
+
+ return q->queues[band];
+}
+
+static int rr_enqueue(struct sk_buff *skb, struct Qdisc *sch)
+{
+ struct Qdisc *qdisc;
+ int ret;
+
+ qdisc = rr_classify(skb, sch, &ret);
+#ifdef CONFIG_NET_CLS_ACT
+ if (qdisc == NULL) {
+
+ if (ret == NET_XMIT_BYPASS)
+ sch->qstats.drops++;
+ kfree_skb(skb);
+ return ret;
+ }
+#endif
+
+ if ((ret = qdisc->enqueue(skb, qdisc)) == NET_XMIT_SUCCESS) {
+ sch->bstats.bytes += skb->len;
+ sch->bstats.packets++;
+ sch->q.qlen++;
+ return NET_XMIT_SUCCESS;
+ }
+ sch->qstats.drops++;
+ return ret;
+}
+
+
+static int rr_requeue(struct sk_buff *skb, struct Qdisc* sch)
+{
+ struct Qdisc *qdisc;
+ int ret;
+
+ qdisc = rr_classify(skb, sch, &ret);
+#ifdef CONFIG_NET_CLS_ACT
+ if (qdisc == NULL) {
+ if (ret == NET_XMIT_BYPASS)
+ sch->qstats.drops++;
+ kfree_skb(skb);
+ return ret;
+ }
+#endif
+
+ if ((ret = qdisc->ops->requeue(skb, qdisc)) == NET_XMIT_SUCCESS) {
+ sch->q.qlen++;
+ sch->qstats.requeues++;
+ return 0;
+ }
+ sch->qstats.drops++;
+ return NET_XMIT_DROP;
+}
+
+
+static struct sk_buff *rr_dequeue(struct Qdisc* sch)
+{
+ struct sk_buff *skb;
+ struct rr_sched_data *q = qdisc_priv(sch);
+ struct Qdisc *qdisc;
+ int bandcount;
+
+ /* Only take one pass through the queues. If nothing is available,
+ * return nothing.
+ */
+ for (bandcount = 0; bandcount < q->bands; bandcount++) {
+ /* Check if the target subqueue is available before
+ * pulling an skb. This way we avoid excessive requeues
+ * for slower queues. If the queue is stopped, try the
+ * next queue.
+ */
+ if (!netif_subqueue_stopped(sch->dev, q->band2queue[q->curband])) {
+ qdisc = q->queues[q->curband];
+ skb = qdisc->dequeue(qdisc);
+ if (skb) {
+ sch->q.qlen--;
+ q->curband++;
+ if (q->curband >= q->bands)
+ q->curband = 0;
+ return skb;
+ }
+ }
+ q->curband++;
+ if (q->curband >= q->bands)
+ q->curband = 0;
+ }
+ return NULL;
+}
+
+static unsigned int rr_drop(struct Qdisc* sch)
+{
+ struct rr_sched_data *q = qdisc_priv(sch);
+ int band;
+ unsigned int len;
+ struct Qdisc *qdisc;
+
+ for (band = q->bands - 1; band >= 0; band--) {
+ qdisc = q->queues[band];
+ if (qdisc->ops->drop && (len = qdisc->ops->drop(qdisc)) != 0) {
+ sch->q.qlen--;
+ return len;
+ }
+ }
+ return 0;
+}
+
+
+static void rr_reset(struct Qdisc* sch)
+{
+ int band;
+ struct rr_sched_data *q = qdisc_priv(sch);
+
+ for (band = 0; band < q->bands; band++)
+ qdisc_reset(q->queues[band]);
+ sch->q.qlen = 0;
+}
+
+static void rr_destroy(struct Qdisc* sch)
+{
+ int band;
+ struct rr_sched_data *q = qdisc_priv(sch);
+
+ tcf_destroy_chain(q->filter_list);
+ for (band = 0; band < q->bands; band++)
+ qdisc_destroy(q->queues[band]);
+}
+
+static int rr_tune(struct Qdisc *sch, struct rtattr *opt)
+{
+ struct rr_sched_data *q = qdisc_priv(sch);
+ struct tc_rr_qopt *qopt = RTA_DATA(opt);
+ int i;
+ int queue;
+ int qmapoffset;
+ int offset;
+ int mod;
+
+ if (opt->rta_len < RTA_LENGTH(sizeof(*qopt)))
+ return -EINVAL;
+ if (qopt->bands > TCQ_RR_BANDS || qopt->bands < 2)
+ return -EINVAL;
+
+ for (i = 0; i <= TC_RR_MAX; i++) {
+ if (qopt->priomap[i] >= qopt->bands)
+ return -EINVAL;
+ }
+
+ sch_tree_lock(sch);
+ q->bands = qopt->bands;
+ memcpy(q->prio2band, qopt->priomap, TC_PRIO_MAX+1);
+ q->curband = 0;
+
+ for (i = q->bands; i < TCQ_RR_BANDS; i++) {
+ struct Qdisc *child = xchg(&q->queues[i], &noop_qdisc);
+ if (child != &noop_qdisc) {
+ qdisc_tree_decrease_qlen(child, child->q.qlen);
+ qdisc_destroy(child);
+ }
+ }
+ sch_tree_unlock(sch);
+
+ for (i = 0; i < q->bands; i++) {
+ if (q->queues[i] == &noop_qdisc) {
+ struct Qdisc *child;
+ child = qdisc_create_dflt(sch->dev, &pfifo_qdisc_ops,
+ TC_H_MAKE(sch->handle, i + 1));
+ if (child) {
+ sch_tree_lock(sch);
+ child = xchg(&q->queues[i], child);
+
+ if (child != &noop_qdisc) {
+ qdisc_tree_decrease_qlen(child,
+ child->q.qlen);
+ qdisc_destroy(child);
+ }
+ sch_tree_unlock(sch);
+ }
+ }
+ }
+ /* setup queue to band mapping - best effort to map into available
+ * hardware queues
+ */
+ if (q->bands < sch->dev->egress_subqueue_count) {
+ qmapoffset = 1;
+ mod = sch->dev->egress_subqueue_count;
+ } else {
+ mod = q->bands % sch->dev->egress_subqueue_count;
+ qmapoffset = q->bands / sch->dev->egress_subqueue_count
+ + ((mod) ? 1 : 0);
+ }
+
+ queue = 0;
+ offset = 0;
+ for (i = 0; i < q->bands; i++) {
+ q->band2queue[i] = queue;
+ if ( ((i + 1) - offset) == qmapoffset) {
+ queue++;
+ offset += qmapoffset;
+ if (mod)
+ mod--;
+ qmapoffset = q->bands /
+ sch->dev->egress_subqueue_count +
+ ((mod) ? 1 : 0);
+ }
+ }
+
+ return 0;
+}
+
+static int rr_init(struct Qdisc *sch, struct rtattr *opt)
+{
+ struct rr_sched_data *q = qdisc_priv(sch);
+ int i;
+
+ for (i = 0; i < TCQ_RR_BANDS; i++)
+ q->queues[i] = &noop_qdisc;
+
+ if (opt == NULL) {
+ return -EINVAL;
+ } else {
+ int err;
+
+ if ((err = rr_tune(sch, opt)) != 0)
+ return err;
+ }
+ return 0;
+}
+
+static int rr_dump(struct Qdisc *sch, struct sk_buff *skb)
+{
+ struct rr_sched_data *q = qdisc_priv(sch);
+ unsigned char *b = skb_tail_pointer(skb);
+ struct tc_rr_qopt opt;
+
+ opt.bands = q->bands;
+ memcpy(&opt.priomap, q->prio2band, TC_RR_MAX + 1);
+ RTA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt);
+ return skb->len;
+
+rtattr_failure:
+ nlmsg_trim(skb, b);
+ return -1;
+}
+
+static int rr_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
+ struct Qdisc **old)
+{
+ struct rr_sched_data *q = qdisc_priv(sch);
+ unsigned long band = arg - 1;
+
+ if (band >= q->bands)
+ return -EINVAL;
+
+ if (new == NULL)
+ new = &noop_qdisc;
+
+ sch_tree_lock(sch);
+ *old = q->queues[band];
+ q->queues[band] = new;
+ qdisc_tree_decrease_qlen(*old, (*old)->q.qlen);
+ qdisc_reset(*old);
+ sch_tree_unlock(sch);
+
+ return 0;
+}
+
+static struct Qdisc *rr_leaf(struct Qdisc *sch, unsigned long arg)
+{
+ struct rr_sched_data *q = qdisc_priv(sch);
+ unsigned long band = arg - 1;
+
+ if (band >= q->bands)
+ return NULL;
+
+ return q->queues[band];
+}
+
+static unsigned long rr_get(struct Qdisc *sch, u32 classid)
+{
+ struct rr_sched_data *q = qdisc_priv(sch);
+ unsigned long band = TC_H_MIN(classid);
+
+ if (band - 1 >= q->bands)
+ return 0;
+ return band;
+}
+
+static unsigned long rr_bind(struct Qdisc *sch, unsigned long parent,
+ u32 classid)
+{
+ return rr_get(sch, classid);
+}
+
+
+static void rr_put(struct Qdisc *q, unsigned long cl)
+{
+ return;
+}
+
+static int rr_change(struct Qdisc *sch, u32 handle, u32 parent,
+ struct rtattr **tca, unsigned long *arg)
+{
+ unsigned long cl = *arg;
+ struct rr_sched_data *q = qdisc_priv(sch);
+
+ if (cl - 1 > q->bands)
+ return -ENOENT;
+ return 0;
+}
+
+static int rr_delete(struct Qdisc *sch, unsigned long cl)
+{
+ struct rr_sched_data *q = qdisc_priv(sch);
+ if (cl - 1 > q->bands)
+ return -ENOENT;
+ return 0;
+}
+
+
+static int rr_dump_class(struct Qdisc *sch, unsigned long cl,
+ struct sk_buff *skb, struct tcmsg *tcm)
+{
+ struct rr_sched_data *q = qdisc_priv(sch);
+
+ if (cl - 1 > q->bands)
+ return -ENOENT;
+ tcm->tcm_handle |= TC_H_MIN(cl);
+ if (q->queues[cl - 1])
+ tcm->tcm_info = q->queues[cl - 1]->handle;
+ return 0;
+}
+
+static int rr_dump_class_stats(struct Qdisc *sch, unsigned long cl,
+ struct gnet_dump *d)
+{
+ struct rr_sched_data *q = qdisc_priv(sch);
+ struct Qdisc *cl_q;
+
+ cl_q = q->queues[cl - 1];
+ if (gnet_stats_copy_basic(d, &cl_q->bstats) < 0 ||
+ gnet_stats_copy_queue(d, &cl_q->qstats) < 0)
+ return -1;
+
+ return 0;
+}
+
+static void rr_walk(struct Qdisc *sch, struct qdisc_walker *arg)
+{
+ struct rr_sched_data *q = qdisc_priv(sch);
+ int band;
+
+ if (arg->stop)
+ return;
+
+ for (band = 0; band < q->bands; band++) {
+ if (arg->count < arg->skip) {
+ arg->count++;
+ continue;
+ }
+ if (arg->fn(sch, band + 1, arg) < 0) {
+ arg->stop = 1;
+ break;
+ }
+ arg->count++;
+ }
+}
+
+static struct tcf_proto **rr_find_tcf(struct Qdisc *sch, unsigned long cl)
+{
+ struct rr_sched_data *q = qdisc_priv(sch);
+
+ if (cl)
+ return NULL;
+ return &q->filter_list;
+}
+
+static struct Qdisc_class_ops rr_class_ops = {
+ .graft = rr_graft,
+ .leaf = rr_leaf,
+ .get = rr_get,
+ .put = rr_put,
+ .change = rr_change,
+ .delete = rr_delete,
+ .walk = rr_walk,
+ .tcf_chain = rr_find_tcf,
+ .bind_tcf = rr_bind,
+ .unbind_tcf = rr_put,
+ .dump = rr_dump_class,
+ .dump_stats = rr_dump_class_stats,
+};
+
+static struct Qdisc_ops rr_qdisc_ops = {
+ .next = NULL,
+ .cl_ops = &rr_class_ops,
+ .id = "rr",
+ .priv_size = sizeof(struct rr_sched_data),
+ .enqueue = rr_enqueue,
+ .dequeue = rr_dequeue,
+ .requeue = rr_requeue,
+ .drop = rr_drop,
+ .init = rr_init,
+ .reset = rr_reset,
+ .destroy = rr_destroy,
+ .change = rr_tune,
+ .dump = rr_dump,
+ .owner = THIS_MODULE,
+};
+
+static int __init rr_module_init(void)
+{
+ return register_qdisc(&rr_qdisc_ops);
+}
+
+static void __exit rr_module_exit(void)
+{
+ unregister_qdisc(&rr_qdisc_ops);
+}
+
+module_init(rr_module_init)
+module_exit(rr_module_exit)
+
+MODULE_LICENSE("GPL");
* Re: [PATCH] NET: Multiqueue network device support.
2007-06-04 21:40 ` [PATCH] NET: Multiqueue network device support PJ Waskiewicz
@ 2007-06-05 11:50 ` jamal
2007-06-05 15:51 ` Waskiewicz Jr, Peter P
2007-06-11 17:36 ` Patrick McHardy
2007-06-11 17:52 ` Patrick McHardy
2 siblings, 1 reply; 153+ messages in thread
From: jamal @ 2007-06-05 11:50 UTC (permalink / raw)
To: PJ Waskiewicz; +Cc: davem, netdev, jeff, auke-jan.h.kok
On Mon, 2007-04-06 at 14:40 -0700, PJ Waskiewicz wrote:
> API added to support multiple hardware queues on an ethernet device.
> Round-robin scheduler added (sch_rr) to provide a no-scheduling policy
> qdisc for hardware with multiple queues.
>
From a high level i see a good start that you at least have a separate
qdisc. I dont see the need for making any subqueue semantics in the
qdisc. We already have them.
I also still dont see the need for the patching of the prio qdisc or the
subqueue control.
I am now uncertain, after all those discussions (and a lot of other
private ones), whether you understood me. We are still not meeting in the
middle.
Sorry, Peter i dont mean to rain on your parade but i cant let this just
slide by[1]. So please give me some time and this week i will send
patches to demonstrate my view. I didnt mean to do that, but as i see it
i have no other choice.
BTW, wheres the e1000 change?
cheers,
jamal
[1] If for example you wrote a classifier or a qdisc (as in a recent
discussion I had with Patrick) i would say it is your code and your
effort and i have the choice not to use it (by virtue of there being
other alternatives). I have no such luxury but to use the changes you
make to that code path whenever i use multi tx rings.
PS:- It is polite to CC someone who has engaged you in a conversation;
in particular as i have told you before at times i dont read netdev for
days but read emails addressed to me even when busyed out or travelling.
* RE: [PATCH] NET: Multiqueue network device support.
2007-06-05 11:50 ` jamal
@ 2007-06-05 15:51 ` Waskiewicz Jr, Peter P
2007-06-05 22:28 ` jamal
0 siblings, 1 reply; 153+ messages in thread
From: Waskiewicz Jr, Peter P @ 2007-06-05 15:51 UTC (permalink / raw)
To: hadi; +Cc: davem, netdev, jeff, Kok, Auke-jan H
> From a high level i see a good start that you at least have
> a separate
> qdisc. I dont see the need for making any subqueue semantics
> in the qdisc. We already have them.
No, we don't have subqueue semantics going directly to the hardware
subqueues. Only in software.
> I also still dont see the need for the patching of the prio
> qdisc or the subqueue control.
sch_rr gives a qdisc without a strict scheduling policy for people who
either want direct control of where flows go, or have hardware with its
own scheduler. sch_prio can give people the flexibility to have a
scheduling policy for hardware that has none (e1000 for example).
Patrick had suggested/requested a qdisc like sch_rr before, so here it
is. I kept sch_prio for flexibility and choice for users.
> I am now uncertain that after all those discussions (and a
> lot other private ones) whether you understood me. We are
> still not meeting in the middle.
I certainly understood what you were saying; however, what I'm trying to
solve with my patches is not what you were suggesting. A WRR scheduler
with no exposure of hardware queues in the network stack is not what I'm
trying to solve.
> Sorry, Peter i dont mean to rain on your parade but i cant
> let this just slide by[1]. So please give me sometime and
> this week i will send patches to demonstrate my view. I didnt
> mean to do that, but as i see it i have no other choice.
I don't want to seem ungrateful, but this is what I've been asking of
you since you had objections to my patches. Patrick, Thomas, and Yi Zhu
all gave technical feedback on the patches, I defended and/or updated
the patches, and they seemed fine with them then. However, you want
something different from what I'm doing, not a different approach for
what it is I'm proposing. I'd love to see the patches you're thinking
of, and see if they really do solve what I'm trying to solve.
> BTW, wheres the e1000 change?
The previously posted e1000 patch for this multiqueue patchset is
identical. I can repost it if you want, but this is just an RFC
patchset for the new qdisc, and I didn't want to cloud the point of the
RFC.
> [1] If for example you wrote a classifier or a qdisc (as in a
> recent discussion I had with Patrick) i would say it is your
> code and your effort and i have the choice not to use it (by
> virtue of there being other alternatives). I have no such
> luxury but to use the changes you make to that code path
> whenever i use multi tx rings.
I disagree completely. Have you seriously looked at the patches?? The
driver is in control of whether or not it presents multiple tx rings to
the stack. The driver has to call alloc_etherdev_mq()
explicitly to allow the stack to see the queues; otherwise, there's only
one queue presented. I don't understand what you have a problem with
here; the API allows complete control from the driver's perspective to
use the new multiqueue codepath or not use it. You have all the control
in the world to decide whether to use or not use the multiqueue
codepath. Can you please explain what your real issue is here?
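To make the control point concrete, the transmit routine of a driver that
opts in would look roughly like this (the mydrv_* names and helpers are
hypothetical; only skb->queue_mapping and the netif_*_subqueue calls come
from the patches):

    static int mydrv_hard_start_xmit(struct sk_buff *skb,
                                     struct net_device *dev)
    {
            struct mydrv_adapter *adapter = netdev_priv(dev);
            u16 ring = skb->queue_mapping;  /* set by sch_rr/sch_prio */
            struct mydrv_tx_ring *tx = &adapter->tx_ring[ring];

            if (mydrv_ring_full(tx)) {      /* hypothetical ring-state check */
                    /* flow-control only this ring; the others keep going */
                    netif_stop_subqueue(dev, ring);
                    return NETDEV_TX_BUSY;
            }

            mydrv_post_skb(tx, skb);        /* hypothetical descriptor post */
            return NETDEV_TX_OK;
    }

The matching netif_wake_subqueue(dev, ring) goes in the driver's TX clean-up
path. A driver that never calls alloc_etherdev_mq() never sees any of this.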
Cheers,
-PJ Waskiewicz
* RE: [PATCH] NET: Multiqueue network device support.
2007-06-05 15:51 ` Waskiewicz Jr, Peter P
@ 2007-06-05 22:28 ` jamal
2007-06-06 15:11 ` Patrick McHardy
0 siblings, 1 reply; 153+ messages in thread
From: jamal @ 2007-06-05 22:28 UTC (permalink / raw)
To: Waskiewicz Jr, Peter P; +Cc: davem, netdev, jeff, Kok, Auke-jan H
On Tue, 2007-05-06 at 08:51 -0700, Waskiewicz Jr, Peter P wrote:
> No, we don't have subqueue semantics going directly to the hardware
> subqueues. Only in software.
Yes, that is one thing i was speaking against.
> sch_rr gives a qdisc without a strict scheduling policy for people that
> either want direct control of where flows go, or hardware with a
> scheduler. sch_prio can give people the flexibility to have a
> scheduling policy for hardware that has none (e1000 for example).
> Patrick had suggested/requested a qdisc like sch_rr before, so here it
> is.
I did too - right here:
http://marc.info/?l=linux-netdev&m=117810985623646&w=2
[..]
> > BTW, wheres the e1000 change?
>
> The previously posted e1000 patch for this multiqueue patchset is
> identical. I can repost it if you want, but this is just an RFC
> patchset for the new qdisc, and I didn't want to cloud the point of the
> RFC.
>
Please send it to me privately.
> I disagree completely. Have you seriously looked at the patches??
yes, I have looked at the patches. And i gave you the nod that you have
improved over the previous patches. you actually semi-listened - but the
core conflicting views we have still remain.
> The
> driver is in control whether or not it wants to present multiple tx
> rings to the stack. The driver has to call alloc_etherdev_mq()
> explicitly to allow the stack to see the queues; otherwise, there's only
> one queue presented. I don't understand what you have a problem with
> here; the API allows complete control from the driver's perspective to
> use the new multiqueue codepath or not use it. You have all the control
> in the world to decide whether to use or not use the multiqueue
> codepath. Can you please explain what your real issue is here?
There will be no issue if a) multiple APIs would be allowed for driver
multi-rings[1] and b) you didnt touch the qdiscs.
Given that #a is not a sensible thing to do since there can only be one
API and for #b you are not compromising, what do you want me to do?
cheers,
jamal
[1] Our main difference for the API remains that according to me, the
core needs to know nothing about the multi rings and according to you,
the driver exposes such info to the core. They are two conflicting
approaches.
* Re: [PATCH] NET: Multiqueue network device support.
2007-06-05 22:28 ` jamal
@ 2007-06-06 15:11 ` Patrick McHardy
2007-06-06 22:13 ` jamal
0 siblings, 1 reply; 153+ messages in thread
From: Patrick McHardy @ 2007-06-06 15:11 UTC (permalink / raw)
To: hadi; +Cc: Waskiewicz Jr, Peter P, davem, netdev, jeff, Kok, Auke-jan H
jamal wrote:
> There will be no issue if a) multiple APIs would be allowed for driver
> multi-rings[1] and b) you didnt touch the qdiscs.
>
> Given that #a is not a sensible thing to do since there can only be one
> API and for #b you are not compromising, what do you want me to do?
I haven't followed the entire discussion, but I still don't see an
alternative to touching the qdisc layer - multiple hardware queues
need multiple queue states if you want to avoid a busy hardware
queue stopping the qdisc entirely and thereby preventing the qdisc
from continuing to feed packets to other active HW queues. And to make
use of the multiple queue states you need multiple queues.
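Condensed from the rr_dequeue() in Peter's patch, the point in code is
roughly this - with a queue state per hardware ring the qdisc can skip a
stopped ring instead of stalling the whole device:

    static struct sk_buff *rr_dequeue_sketch(struct Qdisc *sch)
    {
            struct rr_sched_data *q = qdisc_priv(sch);
            struct sk_buff *skb;
            int i, band;

            for (i = 0; i < q->bands; i++) {
                    band = (q->curband + i) % q->bands;

                    /* busy hardware ring: try the next band */
                    if (netif_subqueue_stopped(sch->dev,
                                               q->band2queue[band]))
                            continue;

                    skb = q->queues[band]->dequeue(q->queues[band]);
                    if (skb) {
                            sch->q.qlen--;
                            q->curband = (band + 1) % q->bands;
                            return skb;
                    }
            }
            return NULL;
    }

With a single device-wide queue state there is no way to express "ring 2 is
full but ring 0 still has room".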
I would love to see your alternative patches.
* Re: [PATCH] NET: Multiqueue network device support.
2007-06-06 15:11 ` Patrick McHardy
@ 2007-06-06 22:13 ` jamal
2007-06-06 22:30 ` Waskiewicz Jr, Peter P
` (2 more replies)
0 siblings, 3 replies; 153+ messages in thread
From: jamal @ 2007-06-06 22:13 UTC (permalink / raw)
To: Patrick McHardy
Cc: Waskiewicz Jr, Peter P, davem, netdev, jeff, Kok, Auke-jan H
On Wed, 2007-06-06 at 17:11 +0200, Patrick McHardy wrote:
> I haven't followed the entire discussion, but I still don't see an
> alternative to touching the qdisc layer - multiple hardware queues
> need multiple queue states if you want to avoid a busy hardware
> queue stopping the qdisc entirely
If you start with the above premise then ...
> and thereby preventing the qdisc
> from continuing to feed packets to other active HW queues. And to make
> use of the multiple queue states you need multiple queues.
.... you will logically lead to the above conclusion.
[Which of course leads to the complexity (and not optimizing for the
common - which is single ring NICs)].
The problem is the premise is _inaccurate_.
Since you havent followed the discussion, i will try to be brief (which
is hard).
If you want verbosity it is in my previous emails:
Consider a simple example of a strict prio qdisc which mirrors the
configuration of a specific piece of hardware.
Then for sake of discussion, assume two prio queues in the qdisc - PSL
and PSH and two hardware queues/rings in a NIC which does strict prio
with queues PHL and PHH.
The mapping is as follows:
PSL --- maps to --- PHL
PSH --- maps to --- PHH
Assume the PxH has a higher prio than PxL.
Strict prio will always favor H over L.
Two scenarios:
a) a lot of packets for PSL arriving on the stack.
They only get sent from PSL -> PHL if and only if there are no
packets from PSH->PHH.
b)a lot of packets for PSH arriving from the stack.
They will always be favored over PSL in sending to the hardware.
From the above:
The only way PHL will ever shutdown the path to the hardware is when
there are sufficient PHL packets.
Corrollary,
The only way PSL will ever shutdown the path to the hardware is when
there are _NO_ PSH packets.
So there is no need to do per queue control because the scheduler will
ensure things work out fine as long as you have the correct qdisc;
and it is a qdisc that will work just fine with a single ring with zero
mods.
What you need is a driver API to ask it to select the ring given an
index. This is similar to the qdisc filter used to select a queue.
You can extend the use case i described above to N queues. You can
extend it to other schedulers (WRR or any non-work conserving queues)
etc. It is consistent. Of course if you configure CBQ for a hardware
that does strict prio - that is a misconfig etc.
Infact for the wired case i see little value (there is some) in using
multiple rings. In the case of wireless (which is strict prio based) it
provides more value.
> I would love to see your alternative patches.
From the above you can see they are simple. I am working on a couple of
things (batching and recovering pktgen ipsec patches)- I will work on
those patches soon after.
I am actually not against the subqueue control - i know Peter needs it
for certain hardware; i am just against the mucking around of the common
case (single ring NIC) just to get that working.
cheers,
jamal
* RE: [PATCH] NET: Multiqueue network device support.
2007-06-06 22:13 ` jamal
@ 2007-06-06 22:30 ` Waskiewicz Jr, Peter P
2007-06-06 22:40 ` David Miller
2007-06-09 14:58 ` Leonid Grossman
2007-06-06 22:35 ` David Miller
2007-06-11 11:58 ` Patrick McHardy
2 siblings, 2 replies; 153+ messages in thread
From: Waskiewicz Jr, Peter P @ 2007-06-06 22:30 UTC (permalink / raw)
To: hadi, Patrick McHardy; +Cc: davem, netdev, jeff, Kok, Auke-jan H
> [Which of course leads to the complexity (and not optimizing
> for the common - which is single ring NICs)].
The common case for 100 Mbit and older 1Gbit is single-ring NICs. Newer
PCI-X and PCIe NICs from 1Gbit to 10Gbit support multiple rings in the
hardware, and it's all headed in that direction, so it's becoming the
common case.
> Infact for the wired case i see little value (there is some)
> in using multiple rings. In the case of wireless (which is
> strict prio based) it provides more value.
There is value, hence why NIC manufacturers are building wired parts
that support multiple rings today. And wireless may not want strict
prio in software, and may just want round-robin from the stack. Either
way, Yi Zhu has represented the wireless side in this discussion
agreeing with these per-queue control patches. Is wireless not a common
case to be considered?
> > I would love to see your alternative patches.
>
> From the above you can see they are simple.
The description above won't provide what I'm trying to solve and what
wireless has stated they want.
> I am actually not against the subqueue control - i know Peter
> needs it for certain hardware; i am just against the mucking
> around of the common case (single ring NIC) just to get that working.
Single-ring NICs see no difference here. Please explain why using my
patches with pfifo_fast, sch_prio, or any other existing qdisc will
change the behavior for single-ring NICs? If the driver doesn't call
alloc_etherdev_mq() explicitly, or use the new sch_rr qdisc, then the Tx
path is identical to the kernel today. What am I mucking around with?
And these patches are not for specific hardware; rather they're for all
the NICs today that have multiple rings, and want to control them in the
OS instead of the driver, which is most of wireless and a handful of
NICs from Intel and Neterion afaik.
-PJ Waskiewicz
* Re: [PATCH] NET: Multiqueue network device support.
2007-06-06 22:13 ` jamal
2007-06-06 22:30 ` Waskiewicz Jr, Peter P
@ 2007-06-06 22:35 ` David Miller
2007-06-06 22:57 ` Waskiewicz Jr, Peter P
2007-06-06 23:32 ` jamal
2007-06-11 11:58 ` Patrick McHardy
2 siblings, 2 replies; 153+ messages in thread
From: David Miller @ 2007-06-06 22:35 UTC (permalink / raw)
To: hadi; +Cc: kaber, peter.p.waskiewicz.jr, netdev, jeff, auke-jan.h.kok
From: jamal <hadi@cyberus.ca>
Date: Wed, 06 Jun 2007 18:13:40 -0400
> From the above you can see they are simple. I am working on a couple of
> things (batching and recovering pktgen ipsec patches)- I will work on
> those patches soon after.
>
> I am actually not against the subqueue control - i know Peter needs it
> for certain hardware; i am just against the mucking around of the common
> case (single ring NIC) just to get that working.
There are other reasons to do interesting things in this area,
purely for parallelization reasons.
For example, consider a chip that has N totally independent TX packet
queues going out to the same ethernet port. You can lock and transmit
on them independently, and the chip internally arbitrates using DRR or
whatever to blast the queues out to the physical port in some fair'ish
manner.
In that case you'd want to be able to do something like:
struct mydev_tx_queue *q = &mydev->tx_q[smp_processor_id() % N];
or similar in the ->hard_start_xmit() driver. But something generic
to support this kind of parallelization would be great (and necessary)
because the TX lock is unary per netdev and destroys all of the
parallelization possible with something like the above.
With the above for transmit, and having N "struct napi_struct"
instances for MSI-X directed RX queues, we'll have no problem keeping
a 10gbit (or even faster) port completely full with lots of cpu to
spare on multi-core boxes.
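Fleshed out a little, that ->hard_start_xmit() would look something like the
sketch below (the mydev_* types and helpers are made up for illustration),
with each queue guarded by its own lock instead of the single
netdev->tx_lock:

    #define MYDEV_NTXQ 4        /* assumption: 4 independent HW TX queues */

    struct mydev_tx_queue {
            spinlock_t lock;    /* guards this ring only */
            /* descriptor ring, head/tail indices, etc. */
    };

    struct mydev_priv {
            struct mydev_tx_queue tx_q[MYDEV_NTXQ];
    };

    static int mydev_hard_start_xmit(struct sk_buff *skb,
                                     struct net_device *dev)
    {
            struct mydev_priv *mp = netdev_priv(dev);
            struct mydev_tx_queue *q;

            q = &mp->tx_q[smp_processor_id() % MYDEV_NTXQ];

            spin_lock(&q->lock);            /* per-queue, not dev->tx_lock */
            mydev_post_descriptor(q, skb);  /* hypothetical helper */
            spin_unlock(&q->lock);

            return NETDEV_TX_OK;
    }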
However, I have to disagree with your analysis of the multi-qdisc
situation, and I tend to agree with Patrick.
If you only have one qdisc to indicate status on, when is the queue
full? That is the core issue. Indicating full status when any of
the hardware queues are full is broken, because we should never
block out queuing of higher priority packets just because the
low priority queue can't take any more frames, _and_ vice versa.
I really want to believe your proofs but they are something out of
a fairy tale :-)
> The only way PHL will ever shutdown the path to the hardware is when
> there are sufficient PHL packets.
> Corrollary,
> The only way PSL will ever shutdown the path to the hardware is when
> there are _NO_ PSH packets.
The problem with this line of thinking is that it ignores the fact
that it is bad to not queue to the device when there is space
available, _even_ for lower priority packets.
The more you keep all available TX queues full, the less likely
delays in CPU processing will lead to a device with nothing to
do.
* Re: [PATCH] NET: Multiqueue network device support.
2007-06-06 22:30 ` Waskiewicz Jr, Peter P
@ 2007-06-06 22:40 ` David Miller
2007-06-06 23:35 ` jamal
2007-06-09 14:58 ` Leonid Grossman
1 sibling, 1 reply; 153+ messages in thread
From: David Miller @ 2007-06-06 22:40 UTC (permalink / raw)
To: peter.p.waskiewicz.jr; +Cc: hadi, kaber, netdev, jeff, auke-jan.h.kok
From: "Waskiewicz Jr, Peter P" <peter.p.waskiewicz.jr@intel.com>
Date: Wed, 6 Jun 2007 15:30:39 -0700
> > [Which of course leads to the complexity (and not optimizing
> > for the common - which is single ring NICs)].
>
> > The common case for 100 Mbit and older 1Gbit is single-ring NICs. Newer
> PCI-X and PCIe NICs from 1Gbit to 10Gbit support multiple rings in the
> hardware, and it's all headed in that direction, so it's becoming the
> common case.
I totally agree. No modern commodity 1gb and faster card is going
to be without many queues on both TX and RX.
> > Iam actually not against the subqueue control - i know Peter
> > needs it for certain hardware; i am just against the mucking
> > around of the common case (single ring NIC) just to get that working.
>
> Single-ring NICs see no difference here. Please explain why using my
> patches with pfifo_fast, sch_prio, or any other existing qdisc will
> change the behavior for single-ring NICs?
I agree with the implication here, there is no penalty for existing
devices.
There are two core issues in my mind:
1) multi-queue on both RX and TX is going to be very pervasive very
soon, everyone is putting this into silicon.
The parallelization gain potential is enormous, and we have to
design for this.
2) Queues are meant to be filled as much as possible, you can't do
that by having only one qdisc attached to the device indicating
unary full status, you simply can't.
* RE: [PATCH] NET: Multiqueue network device support.
2007-06-06 22:35 ` David Miller
@ 2007-06-06 22:57 ` Waskiewicz Jr, Peter P
2007-06-06 23:00 ` David Miller
2007-06-06 23:32 ` jamal
1 sibling, 1 reply; 153+ messages in thread
From: Waskiewicz Jr, Peter P @ 2007-06-06 22:57 UTC (permalink / raw)
To: David Miller; +Cc: kaber, netdev, jeff, Kok, Auke-jan H, hadi
> However, I have to disagree with your analysis of the
> multi-qdisc situation, and I tend to agree with Patrick.
>
> If you only have one qdisc to indicate status on, when is the
> queue full? That is the core issue. Indicating full status
> when any of the hardware queues are full is broken, because
> we should never block out queuing of higher priority packets
> just because the low priority queue can't take any more
> frames, _and_ vice versa.
>
> I really want to believe your proofs but they are something
> out of a fairy tale :-)
Dave,
Can we move forward on this please? If you are comfortable that
my patches give the kernel the ability to manage hardware queues
sufficiently, I'd like to request that 2.6.23 be opened (wink wink) so I
can submit the patches for inclusion to that kernel.
Thanks,
-PJ Waskiewicz
peter.p.waskiewicz.jr@intel.com
* Re: [PATCH] NET: Multiqueue network device support.
2007-06-06 22:57 ` Waskiewicz Jr, Peter P
@ 2007-06-06 23:00 ` David Miller
2007-06-06 23:14 ` Waskiewicz Jr, Peter P
0 siblings, 1 reply; 153+ messages in thread
From: David Miller @ 2007-06-06 23:00 UTC (permalink / raw)
To: peter.p.waskiewicz.jr; +Cc: kaber, netdev, jeff, auke-jan.h.kok, hadi
From: "Waskiewicz Jr, Peter P" <peter.p.waskiewicz.jr@intel.com>
Date: Wed, 6 Jun 2007 15:57:35 -0700
> Can we move forward on this please? If you are comfortable
> that my patches give the kernel the ability to manage hardware
> queues sufficiently, I'd like to request that 2.6.23 be opened (wink
> wink) so I can submit the patches for inclusion to that kernel.
While I am growing in support of your changes, there are
two things:
1) I want to study them more and hear more about what Patrick has to
say about them when he returns from his trip on Sunday
2) I don't want to open up a net-2.6.23 tree yet so that people
concentrate on bug fixes and regressions, pick an open bug or
regression report and help out if you want net-2.6.23 cut faster
:-)
* RE: [PATCH] NET: Multiqueue network device support.
2007-06-06 23:00 ` David Miller
@ 2007-06-06 23:14 ` Waskiewicz Jr, Peter P
2007-06-06 23:36 ` Jeff Garzik
0 siblings, 1 reply; 153+ messages in thread
From: Waskiewicz Jr, Peter P @ 2007-06-06 23:14 UTC (permalink / raw)
To: David Miller; +Cc: kaber, netdev, jeff, Kok, Auke-jan H, hadi
> While I am growing in support of your changes, there are two things:
>
> 1) I want to study them more and hear more about what Patrick has to
> say about them when he returns from his trip on Sunday
>
> 2) I don't want to open up a net-2.6.23 tree yet so that people
> concentrate on bug fixes and regressions, pick an open bug or
> regression report and help out if you want net-2.6.23 cut faster
> :-)
Where can I find such reports?
* Re: [PATCH] NET: Multiqueue network device support.
2007-06-06 22:35 ` David Miller
2007-06-06 22:57 ` Waskiewicz Jr, Peter P
@ 2007-06-06 23:32 ` jamal
2007-06-06 23:48 ` Rick Jones
` (3 more replies)
1 sibling, 4 replies; 153+ messages in thread
From: jamal @ 2007-06-06 23:32 UTC (permalink / raw)
To: David Miller; +Cc: kaber, peter.p.waskiewicz.jr, netdev, jeff, auke-jan.h.kok
On Wed, 2007-06-06 at 15:35 -0700, David Miller wrote:
> From: jamal <hadi@cyberus.ca>
> Date: Wed, 06 Jun 2007 18:13:40 -0400
> There are other reasons to do interesting things in this area,
> purely for parallelization reasons.
>
> For example, consider a chip that has N totally independent TX packet
> queues going out to the same ethernet port. You can lock and transmit
> on them independently, and the chip internally arbitrates using DRR or
> whatever to blast the queues out to the physical port in some fair'ish
> manner.
>
> In that case you'd want to be able to do something like:
>
> struct mydev_tx_queue *q = &mydev->tx_q[smp_processor_id() % N];
>
> or similar in the ->hard_start_xmit() driver. But something generic
> to support this kind of parallelization would be great (and necessary)
> because the TX lock is unary per netdev and destroys all of the
> parallelization possible with something like the above.
>
I cant think of any egress scheduler that will benefit from that
approach. The scheduler is the decider of which packet goes out next
on the wire.
> With the above for transmit, and having N "struct napi_struct"
> instances for MSI-X directed RX queues, we'll have no problem keeping
> a 10gbit (or even faster) port completely full with lots of cpu to
> spare on multi-core boxes.
>
RX queues - yes, I can see; TX queues, it doesnt make sense to put
different rings on different CPUs.
> However, I have to disagree with your analysis of the multi-qdisc
> situation, and I tend to agree with Patrick.
> If you only have one qdisc to indicate status on, when is the queue
> full? That is the core issue.
I just described why it is not an issue. If you make the assumption it
is an issue, then it becomes one.
> Indicating full status when any of
> the hardware queues are full is broken, because we should never
> block out queuing of higher priority packets just because the
> low priority queue can't take any more frames, _and_ vice versa.
Dave, you didnt read anything i said ;-> The situation you describe is
impossible. low prio will never block high prio.
> I really want to believe your proofs but they are something out of
> a fairy tale :-)
They are a lot more real than they seem. Please read again what i typed in ;->
And i will produce patches since this seems to be complex to explain.
> > The only way PHL will ever shutdown the path to the hardware is when
> > there are sufficient PHL packets.
> > Corrollary,
> > The only way PSL will ever shutdown the path to the hardware is when
> > there are _NO_ PSH packets.
>
> The problem with this line of thinking is that it ignores the fact
> that it is bad to not queue to the device when there is space
> available, _even_ for lower priority packets.
So use a different scheduler. Dont use strict prio. Strict prio will
guarantee starvation of low prio packets as long as there are high prio
packets. Thats the intent.
> The more you keep all available TX queues full, the less likely
> delays in CPU processing will lead to a device with nothing to
> do.
It is design intent - thats how the specific scheduler works.
cheers,
jamal
* Re: [PATCH] NET: Multiqueue network device support.
2007-06-06 22:40 ` David Miller
@ 2007-06-06 23:35 ` jamal
2007-06-06 23:56 ` David Miller
0 siblings, 1 reply; 153+ messages in thread
From: jamal @ 2007-06-06 23:35 UTC (permalink / raw)
To: David Miller; +Cc: peter.p.waskiewicz.jr, kaber, netdev, jeff, auke-jan.h.kok
On Wed, 2007-06-06 at 15:40 -0700, David Miller wrote:
> There are two core issues in my mind:
>
> 1) multi-queue on both RX and TX is going to be very pervasive very
> soon, everyone is putting this into silicon.
>
> The parallelization gain potential is enormous, and we have to
> design for this.
>
There is no potential for parallelizing on transmit that i can think of.
Dave, please explain it slowly so i can understand it.
There is huge potential for parallelizing on receive. But i am certainly
missing the value in the transmit.
cheers,
jamal
* Re: [PATCH] NET: Multiqueue network device support.
2007-06-06 23:14 ` Waskiewicz Jr, Peter P
@ 2007-06-06 23:36 ` Jeff Garzik
0 siblings, 0 replies; 153+ messages in thread
From: Jeff Garzik @ 2007-06-06 23:36 UTC (permalink / raw)
To: Waskiewicz Jr, Peter P; +Cc: David Miller, kaber, netdev, Kok, Auke-jan H, hadi
On Wed, Jun 06, 2007 at 04:14:12PM -0700, Waskiewicz Jr, Peter P wrote:
> > While I am growing in support of your changes, there are two things:
> >
> > 1) I want to study them more and hear more about what Patrick has to
> > say about them when he returns from his trip on Sunday
> >
> > 2) I don't want to open up a net-2.6.23 tree yet so that people
> > concentrate on bug fixes and regressions, pick an open bug or
> > regression report and help out if you want net-2.6.23 cut faster
> > :-)
>
> Where can I find such reports?
Michal Piotrowski's list is a good place to start; while not
network-specific, it is turning into _the_ place to list regressions.
Jeff
* Re: [PATCH] NET: Multiqueue network device support.
2007-06-06 23:32 ` jamal
@ 2007-06-06 23:48 ` Rick Jones
2007-06-06 23:54 ` jamal
2007-06-06 23:58 ` David Miller
2007-06-06 23:52 ` David Miller
` (2 subsequent siblings)
3 siblings, 2 replies; 153+ messages in thread
From: Rick Jones @ 2007-06-06 23:48 UTC (permalink / raw)
To: hadi
Cc: David Miller, kaber, peter.p.waskiewicz.jr, netdev, jeff,
auke-jan.h.kok
> RX queues - yes, I can see; TX queues, it doesnt make sense to put
> different rings on different CPUs.
To what extent might that preclude some cachelines bouncing hither and
yon between the CPUs?
rick jones
^ permalink raw reply [flat|nested] 153+ messages in thread
* Re: [PATCH] NET: Multiqueue network device support.
2007-06-06 23:32 ` jamal
2007-06-06 23:48 ` Rick Jones
@ 2007-06-06 23:52 ` David Miller
2007-06-07 0:47 ` Jeff Garzik
2007-06-06 23:53 ` David Miller
2007-06-11 12:01 ` Patrick McHardy
3 siblings, 1 reply; 153+ messages in thread
From: David Miller @ 2007-06-06 23:52 UTC (permalink / raw)
To: hadi; +Cc: kaber, peter.p.waskiewicz.jr, netdev, jeff, auke-jan.h.kok
From: jamal <hadi@cyberus.ca>
Date: Wed, 06 Jun 2007 19:32:46 -0400
> On Wed, 2007-06-06 at 15:35 -0700, David Miller wrote:
> > With the above for transmit, and having N "struct napi_struct"
> > instances for MSI-X directed RX queues, we'll have no problem keeping
> > a 10gbit (or even faster) port completely full with lots of cpu to
> > spare on multi-core boxes.
> >
>
> RX queues - yes, I can see; TX queues, it doesnt make sense to put
> different rings on different CPUs.
For the locking it makes a ton of sense.
If you have sendmsg() calls going on N cpus, would you rather
they:
1) All queue up to the single netdev->tx_lock
or
2) All take local per-hw-queue locks
to transmit the data they are sending?
I thought this was obvious... guess not :-)
^ permalink raw reply [flat|nested] 153+ messages in thread
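As a rough userspace sketch of the contrast being drawn here (pthread spinlocks standing in for the driver locks; all struct and function names are invented for illustration and nothing below is from the patch; locks are assumed to have been set up with pthread_spin_init()):

#include <pthread.h>

#define NUM_TX_QUEUES 4

struct toy_tx_queue {
        pthread_spinlock_t lock;        /* per-queue lock: only senders mapped
                                         * to this queue ever contend on it */
        unsigned int tail;              /* next free descriptor slot */
};

struct toy_netdev {
        pthread_spinlock_t tx_lock;     /* the single device-wide lock */
        struct toy_tx_queue txq[NUM_TX_QUEUES];
};

/* Option 1: every sending CPU funnels through one lock. */
static void xmit_single_lock(struct toy_netdev *dev)
{
        pthread_spin_lock(&dev->tx_lock);
        dev->txq[0].tail++;                     /* post one descriptor */
        pthread_spin_unlock(&dev->tx_lock);
}

/* Option 2: each sender locks only the queue it was mapped to, so
 * senders on different queues never serialize against each other. */
static void xmit_per_queue_lock(struct toy_netdev *dev, unsigned int queue)
{
        pthread_spin_lock(&dev->txq[queue].lock);
        dev->txq[queue].tail++;                 /* post to this ring only */
        pthread_spin_unlock(&dev->txq[queue].lock);
}

With N senders spread over N queues, option 2 turns one hot lock into N mostly uncontended ones, which is the gain being argued for.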
* Re: [PATCH] NET: Multiqueue network device support.
2007-06-06 23:32 ` jamal
2007-06-06 23:48 ` Rick Jones
2007-06-06 23:52 ` David Miller
@ 2007-06-06 23:53 ` David Miller
2007-06-07 1:08 ` jamal
2007-06-11 12:01 ` Patrick McHardy
3 siblings, 1 reply; 153+ messages in thread
From: David Miller @ 2007-06-06 23:53 UTC (permalink / raw)
To: hadi; +Cc: kaber, peter.p.waskiewicz.jr, netdev, jeff, auke-jan.h.kok
From: jamal <hadi@cyberus.ca>
Date: Wed, 06 Jun 2007 19:32:46 -0400
> So use a different scheduler. Dont use strict prio. Strict prio will
> guarantee starvation of low prio packets as long as there are high prio
> packets. Thats the intent.
Ok, point taken.
There are of course other uses for multiple TX queues, and in
particular my finer-grained locking example.
I'm still amazed the TX locking issue wasn't clear to you,
too nervous about tonight's game? :)
^ permalink raw reply [flat|nested] 153+ messages in thread
* Re: [PATCH] NET: Multiqueue network device support.
2007-06-06 23:48 ` Rick Jones
@ 2007-06-06 23:54 ` jamal
2007-06-07 0:01 ` David Miller
2007-06-06 23:58 ` David Miller
1 sibling, 1 reply; 153+ messages in thread
From: jamal @ 2007-06-06 23:54 UTC (permalink / raw)
To: Rick Jones
Cc: David Miller, kaber, peter.p.waskiewicz.jr, netdev, jeff,
auke-jan.h.kok
On Wed, 2007-06-06 at 16:48 -0700, Rick Jones wrote:
> > RX queues - yes, I can see; TX queues, it doesnt make sense to put
> > different rings on different CPUs.
>
> To what extent might that preclude some cachelines bouncing hither and
> yon between the CPUs?
I think the bouncing will exist a lot more with multiple CPUs. But one
would assume that if you go that path, you would also parallelize the stack
on egress to reduce such an effect. I guess the point i am not seeing is
the value. The tx, once it hits the NIC, is an IO issue, not a CPU issue.
OTOH, the receive path, once a packet is received, is a CPU problem
(and therefore multiple CPUs help).
To be fair to Peter, that is not what his patches are trying to address
(and in fact, they cant solve that problem).
off for the night.
cheers,
jamal
^ permalink raw reply [flat|nested] 153+ messages in thread
* Re: [PATCH] NET: Multiqueue network device support.
2007-06-06 23:35 ` jamal
@ 2007-06-06 23:56 ` David Miller
2007-06-07 16:08 ` Stephen Hemminger
0 siblings, 1 reply; 153+ messages in thread
From: David Miller @ 2007-06-06 23:56 UTC (permalink / raw)
To: hadi; +Cc: peter.p.waskiewicz.jr, kaber, netdev, jeff, auke-jan.h.kok
From: jamal <hadi@cyberus.ca>
Date: Wed, 06 Jun 2007 19:35:46 -0400
> There is no potential for parallelizing on transmit that i can think of.
> Dave, please explain it slowly so i can understand it.
>
> There is huge potential for parallelizing on receive. But i am certainly
> missing the value in the transmit.
I gave an example in another response, you have N processes
queueing up data for TCP or UDP or whatever in parallel on
different cpus, all going out the same 10gbit device.
All of them enter into ->hard_start_xmit(), and thus all of them try
to take the same netdev->tx_lock
If they have multiple TX queues, independently programmable, that
single lock is stupid.
We could use per-queue TX locks for such hardware, but we can't
support that currently.
^ permalink raw reply [flat|nested] 153+ messages in thread
* Re: [PATCH] NET: Multiqueue network device support.
2007-06-06 23:48 ` Rick Jones
2007-06-06 23:54 ` jamal
@ 2007-06-06 23:58 ` David Miller
1 sibling, 0 replies; 153+ messages in thread
From: David Miller @ 2007-06-06 23:58 UTC (permalink / raw)
To: rick.jones2
Cc: hadi, kaber, peter.p.waskiewicz.jr, netdev, jeff, auke-jan.h.kok
From: Rick Jones <rick.jones2@hp.com>
Date: Wed, 06 Jun 2007 16:48:59 -0700
> > RX queues - yes, I can see; TX queues, it doesnt make sense to put
> > different rings on different CPUs.
>
> To what extent might that preclude some cachelines bouncing hither and
> yon between the CPUs?
I think per-TX-queue locking has locality as another advantage.
You only touch the TX descriptors for queue N, rather than a single
globally shared one.
Same goes for RX.
^ permalink raw reply [flat|nested] 153+ messages in thread
* Re: [PATCH] NET: Multiqueue network device support.
2007-06-06 23:54 ` jamal
@ 2007-06-07 0:01 ` David Miller
0 siblings, 0 replies; 153+ messages in thread
From: David Miller @ 2007-06-07 0:01 UTC (permalink / raw)
To: hadi
Cc: rick.jones2, kaber, peter.p.waskiewicz.jr, netdev, jeff,
auke-jan.h.kok
From: jamal <hadi@cyberus.ca>
Date: Wed, 06 Jun 2007 19:54:47 -0400
> On Wed, 2007-06-06 at 16:48 -0700, Rick Jones wrote:
> > > RX queues - yes, I can see; TX queues, it doesnt make sense to put
> > > different rings on different CPUs.
> >
> > To what extent might that preclude some cachelines bouncing hither and
> > yon between the CPUs?
>
> I think the bouncing will exist a lot more with the multi CPUs. But one
> would assume if you go that path, you would also parallelize the stack
> on egress to reduce such an effect. I guess the point i am not seeing is
> the value. The tx, once hitting the NIC is an IO issue not a CPU issue.
Disagreed, that single TX lock kills cpu cycles.
If all of the TX queues are programmable independently of one
another, the single TX lock kills performance.
> off for the night.
Enjoy the game.
^ permalink raw reply [flat|nested] 153+ messages in thread
* Re: [PATCH] NET: Multiqueue network device support.
2007-06-06 23:52 ` David Miller
@ 2007-06-07 0:47 ` Jeff Garzik
2007-06-07 12:29 ` jamal
0 siblings, 1 reply; 153+ messages in thread
From: Jeff Garzik @ 2007-06-07 0:47 UTC (permalink / raw)
To: David Miller; +Cc: hadi, kaber, peter.p.waskiewicz.jr, netdev, auke-jan.h.kok
On Wed, Jun 06, 2007 at 04:52:15PM -0700, David Miller wrote:
> For the locking is makes a ton of sense.
>
> If you have sendmsg() calls going on N cpus, would you rather
> they:
>
> 1) All queue up to the single netdev->tx_lock
>
> or
>
> 2) All take local per-hw-queue locks
>
> to transmit the data they are sending?
>
> I thought this was obvious... guess not :-)
Agreed ++
For my part, I definitely want to see parallel Tx as well as parallel Rx.
It's the only thing that makes sense for modern multi-core CPUs.
Two warnings flags are raised in my brain though:
1) you need (a) well-designed hardware _and_ (b) a smart driver writer
to avoid bottlenecking on internal driver locks. As you can see we have
both (a) and (b) for tg3 ;-) But it's up in the air whether a
multi-TX-queue scheme can be sanely locked internally on other hardware.
At the moment we have to hope Intel gets it right in their driver...
2) I fear that the getting-it-into-the-Tx-queue part will take some
thought in order to make this happen, too. Just like you have the
SMP/SMT/Multi-core scheduler scheduling various resources, surely we
will want some smarts so that sockets are not bouncing wildly across
CPUs, absent other factors outside our control.
Otherwise you will negate a lot of the value of the nifty multi-TX-lock
driver API, by bouncing data across CPUs on each transmit anyway.
IOW, you will have to sanely fill each of the TX queues.
Jeff
^ permalink raw reply [flat|nested] 153+ messages in thread
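One minimal way to read "sanely fill each of the TX queues" is to key the queue choice off the submitting CPU, so a socket that stays on one CPU keeps hitting the same ring. A toy sketch of that idea only (sched_getcpu() is a glibc call; the function name and policy are invented, not anything proposed in the patch):

#define _GNU_SOURCE
#include <sched.h>

/* Pick a TX queue based on where the sender is running, so data from a
 * CPU-sticky socket does not bounce between rings on every transmit. */
static unsigned int pick_tx_queue(unsigned int num_tx_queues)
{
        int cpu = sched_getcpu();

        if (cpu < 0)
                cpu = 0;                /* fall back if the call fails */
        return (unsigned int)cpu % num_tx_queues;
}

The flip side Jeff is pointing at: if the scheduler moves the task, the queue choice moves with it, so the smarts ultimately have to involve the process scheduler, not only the driver.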
* Re: [PATCH] NET: Multiqueue network device support.
2007-06-06 23:53 ` David Miller
@ 2007-06-07 1:08 ` jamal
2007-06-07 12:22 ` jamal
0 siblings, 1 reply; 153+ messages in thread
From: jamal @ 2007-06-07 1:08 UTC (permalink / raw)
To: David Miller; +Cc: kaber, peter.p.waskiewicz.jr, netdev, jeff, auke-jan.h.kok
On Wed, 2007-06-06 at 16:53 -0700, David Miller wrote:
> There are of course other uses for multiple TX queues, and in
> particular my finer-grained locking example.
>
> I'm still amazed the TX locking issue wasn't clear to you,
> too nervous about tonight's game? :)
It's too depressing - so i came back here for a break ;->
I cant even stand Don Cherry today.
As a side note: You will have to do a lot of surgery to the current code
to make tx run on multi CPUs. It needs some experimenting to get right.
And i am beginning to like Herbert's changes ;->
I am not against multi-rings; i am just suggesting an alternative
approach which is less disruptive.
In regards to the tx lock - my thinking is resolving that via tx
batching. You amortize the lock over multiple packets. There may be
value in fine grained locking - i need to think about it. A small
extension to the batching patches will provide the change i am
proposing.
cheers,
jamal
^ permalink raw reply [flat|nested] 153+ messages in thread
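A minimal sketch of the amortization jamal is describing (userspace analogy with invented names; the real batching patches deal with skb lists and the qdisc, not a toy counter, and the spinlock is assumed to be initialized elsewhere):

#include <pthread.h>
#include <stddef.h>

struct toy_pkt {
        struct toy_pkt *next;
};

struct toy_dev {
        pthread_spinlock_t tx_lock;
        unsigned long posted;
};

/* Per-packet locking: one lock/unlock round trip for every packet. */
static void send_one(struct toy_dev *dev)
{
        pthread_spin_lock(&dev->tx_lock);
        dev->posted++;                  /* post one descriptor */
        pthread_spin_unlock(&dev->tx_lock);
}

/* Batched: the lock is taken once and its cost is spread over the list. */
static void send_batch(struct toy_dev *dev, struct toy_pkt *list)
{
        struct toy_pkt *p;

        pthread_spin_lock(&dev->tx_lock);
        for (p = list; p != NULL; p = p->next)
                dev->posted++;          /* post each descriptor under one hold */
        pthread_spin_unlock(&dev->tx_lock);
}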
* Re: [PATCH] NET: Multiqueue network device support.
2007-06-07 1:08 ` jamal
@ 2007-06-07 12:22 ` jamal
0 siblings, 0 replies; 153+ messages in thread
From: jamal @ 2007-06-07 12:22 UTC (permalink / raw)
To: David Miller; +Cc: kaber, peter.p.waskiewicz.jr, netdev, jeff, auke-jan.h.kok
On Wed, 2007-06-06 at 21:08 -0400, jamal wrote:
> It's too depressing - so i came back here for a break ;->
I am sure you would agree it was too depressing ;->
> As a side note: You will have to do a lot of surgery to the current code
> to make tx run on multi CPUs. It needs some experimenting to get right.
> And i am begining to like Herberts changes ;->
On clear morning thinking:
There is a scenario where multi CPUs will benefit and that is if a
single CPU cant pump fast enough to fill the wire. I think this may have
been your initial comment but we digressed into the locks. I still think
the bottleneck on the tx side that needs to improve is IO (DMA, PCI/X/E etc).
As you know, though, that requires some major surgery (given the
multiple producer, single consumer approach we have today), and certainly
Peter's patches dont add value in that direction.
cheers,
jamal
^ permalink raw reply [flat|nested] 153+ messages in thread
* Re: [PATCH] NET: Multiqueue network device support.
2007-06-07 0:47 ` Jeff Garzik
@ 2007-06-07 12:29 ` jamal
2007-06-07 15:03 ` Kok, Auke
0 siblings, 1 reply; 153+ messages in thread
From: jamal @ 2007-06-07 12:29 UTC (permalink / raw)
To: Jeff Garzik
Cc: David Miller, kaber, peter.p.waskiewicz.jr, netdev,
auke-jan.h.kok
[-- Attachment #1: Type: text/plain, Size: 493 bytes --]
On Wed, 2007-06-06 at 20:47 -0400, Jeff Garzik wrote:
> 1) you need (a) well-designed hardware _and_ (b) a smart driver writer
> to avoid bottlenecking on internal driver locks. As you can see we have
> both (a) and (b) for tg3 ;-)
How about the following patch, which fixes (b) for e1000 ;->
I think the e1000's challenges are related to the gazillion variations of
boards it supports and a little challenge of too many intel cooks.
Auke, why do you need the tx ring lock?
cheers,
jamal
[-- Attachment #2: e1000-ntxl --]
[-- Type: text/x-patch, Size: 2127 bytes --]
diff --git a/drivers/net/e1000/e1000.h b/drivers/net/e1000/e1000.h
index 16a6edf..4483d0f 100644
--- a/drivers/net/e1000/e1000.h
+++ b/drivers/net/e1000/e1000.h
@@ -185,7 +185,6 @@ struct e1000_tx_ring {
/* array of buffer information structs */
struct e1000_buffer *buffer_info;
- spinlock_t tx_lock;
uint16_t tdh;
uint16_t tdt;
boolean_t last_tx_tso;
diff --git a/drivers/net/e1000/e1000_main.c b/drivers/net/e1000/e1000_main.c
index cf8af92..2dd6bc0 100644
--- a/drivers/net/e1000/e1000_main.c
+++ b/drivers/net/e1000/e1000_main.c
@@ -1597,7 +1597,6 @@ setup_tx_desc_die:
txdr->next_to_use = 0;
txdr->next_to_clean = 0;
- spin_lock_init(&txdr->tx_lock);
return 0;
}
@@ -3368,14 +3367,9 @@ e1000_xmit_frame(struct sk_buff *skb, struct net_device *netdev)
(adapter->hw.mac_type == e1000_82573))
e1000_transfer_dhcp_info(adapter, skb);
- if (!spin_trylock_irqsave(&tx_ring->tx_lock, flags))
- /* Collision - tell upper layer to requeue */
- return NETDEV_TX_LOCKED;
-
/* need: count + 2 desc gap to keep tail from touching
* head, otherwise try next time */
if (unlikely(e1000_maybe_stop_tx(netdev, tx_ring, count + 2))) {
- spin_unlock_irqrestore(&tx_ring->tx_lock, flags);
return NETDEV_TX_BUSY;
}
@@ -3383,7 +3377,6 @@ e1000_xmit_frame(struct sk_buff *skb, struct net_device *netdev)
if (unlikely(e1000_82547_fifo_workaround(adapter, skb))) {
netif_stop_queue(netdev);
mod_timer(&adapter->tx_fifo_stall_timer, jiffies + 1);
- spin_unlock_irqrestore(&tx_ring->tx_lock, flags);
return NETDEV_TX_BUSY;
}
}
@@ -3398,7 +3391,6 @@ e1000_xmit_frame(struct sk_buff *skb, struct net_device *netdev)
tso = e1000_tso(adapter, tx_ring, skb);
if (tso < 0) {
dev_kfree_skb_any(skb);
- spin_unlock_irqrestore(&tx_ring->tx_lock, flags);
return NETDEV_TX_OK;
}
@@ -3423,7 +3415,6 @@ e1000_xmit_frame(struct sk_buff *skb, struct net_device *netdev)
/* Make sure there is space in the ring for the next send. */
e1000_maybe_stop_tx(netdev, tx_ring, MAX_SKB_FRAGS + 2);
- spin_unlock_irqrestore(&tx_ring->tx_lock, flags);
return NETDEV_TX_OK;
}
^ permalink raw reply related [flat|nested] 153+ messages in thread
* Re: [PATCH] NET: Multiqueue network device support.
2007-06-07 12:29 ` jamal
@ 2007-06-07 15:03 ` Kok, Auke
2007-06-07 21:57 ` jamal
0 siblings, 1 reply; 153+ messages in thread
From: Kok, Auke @ 2007-06-07 15:03 UTC (permalink / raw)
To: hadi
Cc: Jeff Garzik, David Miller, kaber, peter.p.waskiewicz.jr, netdev,
Jesse Brandeburg
jamal wrote:
> On Wed, 2007-06-06 at 20:47 -0400, Jeff Garzik wrote:
>
>> 1) you need (a) well-designed hardware _and_ (b) a smart driver writer
>> to avoid bottlenecking on internal driver locks. As you can see we have
>> both (a) and (b) for tg3 ;-)
>
> How about the following patch which fixes #b for e1000 ;->
> I think the e1000s challenges are related to the gazillion variations of
> boards they support and a little challenge of too many intel cooks.
>
> Auke, why do you need the tx ring lock?
To protect against multiple entries bumping head & tail at the same time as well
as overwriting the same entries in the tx ring (contention for
next_to_watch/next_to_clean)? It may be unlikely but ripping out the tx ring
lock might not be a good idea, perhaps after we get rid of LLTX in e1000?
to be honest: I'm open for ideas and I'll give it a try, but stuff like this
needs to go through some nasty stress testing (multiple clients, long time)
before I will consider it seriously, but fortunately that's something I can do.
Auke
^ permalink raw reply [flat|nested] 153+ messages in thread
* Re: [PATCH] NET: Multiqueue network device support.
2007-06-06 23:56 ` David Miller
@ 2007-06-07 16:08 ` Stephen Hemminger
2007-06-07 16:59 ` Waskiewicz Jr, Peter P
2007-06-07 22:04 ` jamal
0 siblings, 2 replies; 153+ messages in thread
From: Stephen Hemminger @ 2007-06-07 16:08 UTC (permalink / raw)
To: David Miller
Cc: hadi, peter.p.waskiewicz.jr, kaber, netdev, jeff, auke-jan.h.kok
On Wed, 06 Jun 2007 16:56:02 -0700 (PDT)
David Miller <davem@davemloft.net> wrote:
> From: jamal <hadi@cyberus.ca>
> Date: Wed, 06 Jun 2007 19:35:46 -0400
>
> > There is no potential for parallelizing on transmit that i can think of.
> > Dave, please explain it slowly so i can understand it.
> >
> > There is huge potential for parallelizing on receive. But i am certainly
> > missing the value in the transmit.
>
> I gave an example in another response, you have N processes
> queueing up data for TCP or UDP or whatever in parallel on
> different cpus, all going out the same 10gbit device.
>
> All of them enter into ->hard_start_xmit(), and thus all of them try
> to take the same netdev->tx_lock
>
> If they have multiple TX queues, independently programmable, that
> single lock is stupid.
>
> We could use per-queue TX locks for such hardware, but we can't
> support that currently.
There could be bad packet reordering with this (like some SMP routers used to do).
--
Stephen Hemminger <shemminger@linux-foundation.org>
^ permalink raw reply [flat|nested] 153+ messages in thread
* RE: [PATCH] NET: Multiqueue network device support.
2007-06-07 16:08 ` Stephen Hemminger
@ 2007-06-07 16:59 ` Waskiewicz Jr, Peter P
2007-06-11 12:08 ` Patrick McHardy
2007-06-07 22:04 ` jamal
1 sibling, 1 reply; 153+ messages in thread
From: Waskiewicz Jr, Peter P @ 2007-06-07 16:59 UTC (permalink / raw)
To: Stephen Hemminger, David Miller
Cc: hadi, kaber, netdev, jeff, Kok, Auke-jan H
> > If they have multiple TX queues, independently programmable, that
> > single lock is stupid.
> >
> > We could use per-queue TX locks for such hardware, but we can't
> > support that currently.
>
> There could be bad packet reordering with this (like some SMP
> routers used to do).
My original multiqueue patches I submitted actually had a per-queue Tx
lock, but it was removed since the asymmetry in the stack for locking
was something people didn't like. Locking a queue for ->enqueue(),
unlocking, then locking for ->dequeue(), unlocking, was something people
didn't like very much. Also knowing what queue to lock on ->enqueue()
was where the original ->map_queue() idea came from, since we wanted to
lock before calling ->enqueue().
-PJ
^ permalink raw reply [flat|nested] 153+ messages in thread
* Re: [PATCH] NET: Multiqueue network device support.
2007-06-07 15:03 ` Kok, Auke
@ 2007-06-07 21:57 ` jamal
2007-06-07 22:06 ` Kok, Auke
2007-06-07 22:44 ` David Miller
0 siblings, 2 replies; 153+ messages in thread
From: jamal @ 2007-06-07 21:57 UTC (permalink / raw)
To: Kok, Auke
Cc: Jeff Garzik, David Miller, kaber, peter.p.waskiewicz.jr, netdev,
Jesse Brandeburg
On Thu, 2007-07-06 at 08:03 -0700, Kok, Auke wrote:
> To prevent against multiple entries bumping head & tail at the same time as well
> as overwriting the same entries in the tx ring (contention for
> next_to_watch/next_to_clean)?
In current code that lock certainly doesnt protect those specifics.
I thought at some point thats what it did; somehow that seems to have
changed - the rx path/tx pruning is protected by tx_queue_lock.
I have tested the patch on smp and it works.
> It may be unlikely but ripping out the tx ring
> lock might not be a good idea, perhaps after we get rid of LLTX in e1000?
I dont think it matters either way. At the moment, you are _guaranteed_
only one cpu can enter the tx path. There may be another CPU, but as long
as you dont have any contention between tx and rx (as in current code), it
seems to be a non-issue.
> to be honest: I'm open for ideas and I'll give it a try, but stuff like this
> needs to go through some nasty stress testing (multiple clients, long time)
> before I will consider it seriously, but fortunately that's something I can do.
I empathize but take a closer look; seems mostly useless.
And like i said I have done a quick test with an SMP machine and it
seems to work fine; but your tests will probably be more thorough.
cheers,
jamal
^ permalink raw reply [flat|nested] 153+ messages in thread
* Re: [PATCH] NET: Multiqueue network device support.
2007-06-07 16:08 ` Stephen Hemminger
2007-06-07 16:59 ` Waskiewicz Jr, Peter P
@ 2007-06-07 22:04 ` jamal
1 sibling, 0 replies; 153+ messages in thread
From: jamal @ 2007-06-07 22:04 UTC (permalink / raw)
To: Stephen Hemminger
Cc: David Miller, peter.p.waskiewicz.jr, kaber, netdev, jeff,
auke-jan.h.kok
On Thu, 2007-07-06 at 09:08 -0700, Stephen Hemminger wrote:
>
> There could be bad packet reordering with this (like some SMP routers used to do).
You can avoid re-ordering if you guarantee that "related" flows always
end up on the same CPU via, say, tc filters - i.e. i dont think just a
5-tuple classification would be sufficient.
cheers,
jamal
^ permalink raw reply [flat|nested] 153+ messages in thread
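A minimal sketch of that deterministic steering idea (invented helper, not from the patch): as long as the grouping key is stable, everything in a group lands on one queue and cannot be reordered; jamal's point is that the key may need to group more than a plain 5-tuple.

#include <stdint.h>

/* Map a flow group to a queue. The key here is just src^dst for
 * illustration; a real policy might group whole subnets or customers. */
static unsigned int steer_to_queue(uint32_t saddr, uint32_t daddr,
                                   unsigned int num_queues)
{
        uint32_t key = saddr ^ daddr;

        key *= 0x9e3779b1u;             /* simple multiplicative hash */
        return (key >> 16) % num_queues;
}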
* Re: [PATCH] NET: Multiqueue network device support.
2007-06-07 21:57 ` jamal
@ 2007-06-07 22:06 ` Kok, Auke
2007-06-07 22:26 ` jamal
2007-06-07 22:44 ` David Miller
1 sibling, 1 reply; 153+ messages in thread
From: Kok, Auke @ 2007-06-07 22:06 UTC (permalink / raw)
To: hadi
Cc: Jeff Garzik, David Miller, kaber, peter.p.waskiewicz.jr, netdev,
Jesse Brandeburg
jamal wrote:
> On Thu, 2007-07-06 at 08:03 -0700, Kok, Auke wrote:
>> To prevent against multiple entries bumping head & tail at the same time as well
>> as overwriting the same entries in the tx ring (contention for
>> next_to_watch/next_to_clean)?
>
> In current code that lock certainly doesnt protect those specifics.
> I thought at some point thats what it did; somehow that seems to have
> changed - the rx path/tx prunning is protected by tx_queue_lock
> I have tested it the patch on smp and it works.
>
>> It may be unlikely but ripping out the tx ring
>> lock might not be a good idea, perhaps after we get rid of LLTX in e1000?
>
> I dont think it matters either way. At the moment, you are _guaranteed_
> only one cpu can enter tx path. There may be another CPU, but as long
> (as in current code) you dont have any contention between tx and rx, it
> seems to be a non-issue.
>
>> to be honest: I'm open for ideas and I'll give it a try, but stuff like this
>> needs to go through some nasty stress testing (multiple clients, long time)
>> before I will consider it seriously, but fortunately that's something I can do.
>
> I empathize but take a closer look; seems mostly useless.
> And like i said I have done a quick test with an SMP machine and it
> seems to work fine; but your tests will probably be more thorough.
the contention isn't between multiple tx attempts, but between e1000_clean and
tx. You'll need bidirectional traffic with multiple clients probably to hit it...
Auke
^ permalink raw reply [flat|nested] 153+ messages in thread
* Re: [PATCH] NET: Multiqueue network device support.
2007-06-07 22:06 ` Kok, Auke
@ 2007-06-07 22:26 ` jamal
2007-06-07 22:30 ` Kok, Auke
0 siblings, 1 reply; 153+ messages in thread
From: jamal @ 2007-06-07 22:26 UTC (permalink / raw)
To: Kok, Auke
Cc: Jeff Garzik, David Miller, kaber, peter.p.waskiewicz.jr, netdev,
Jesse Brandeburg
On Thu, 2007-07-06 at 15:06 -0700, Kok, Auke wrote:
> the contention isn't between multiple tx attempts, but between e1000_clean and
> tx.
I got you the first time but i think i am missing something: given that
the lock is used only on tx - how is that protecting the contention
between tx and rx?
> You'll need bidirectional traffic with multiple clients probably to hit it...
I did - but it was asymmetric, i.e. very heavy on tx.
cheers,
jamal
^ permalink raw reply [flat|nested] 153+ messages in thread
* Re: [PATCH] NET: Multiqueue network device support.
2007-06-07 22:26 ` jamal
@ 2007-06-07 22:30 ` Kok, Auke
2007-06-07 22:57 ` jamal
0 siblings, 1 reply; 153+ messages in thread
From: Kok, Auke @ 2007-06-07 22:30 UTC (permalink / raw)
To: hadi
Cc: Jeff Garzik, David Miller, kaber, peter.p.waskiewicz.jr, netdev,
Jesse Brandeburg
jamal wrote:
> On Thu, 2007-07-06 at 15:06 -0700, Kok, Auke wrote:
>
>> the contention isn't between multiple tx attempts, but between e1000_clean and
>> tx.
>
> I got you the first time but i think i am missing something: given that
> the lock is used only on tx - how is that protecting the contention
> between tx and rx?
our rx interrupt/clean can trigger tx cleans, reaching the same code...
Auke
^ permalink raw reply [flat|nested] 153+ messages in thread
* Re: [PATCH] NET: Multiqueue network device support.
2007-06-07 21:57 ` jamal
2007-06-07 22:06 ` Kok, Auke
@ 2007-06-07 22:44 ` David Miller
2007-06-07 22:54 ` jamal
2007-06-07 22:55 ` Waskiewicz Jr, Peter P
1 sibling, 2 replies; 153+ messages in thread
From: David Miller @ 2007-06-07 22:44 UTC (permalink / raw)
To: hadi
Cc: auke-jan.h.kok, jeff, kaber, peter.p.waskiewicz.jr, netdev,
jesse.brandeburg
From: jamal <hadi@cyberus.ca>
Date: Thu, 07 Jun 2007 17:57:25 -0400
> I empathize but take a closer look; seems mostly useless.
I thought E1000 still uses LLTX, and if so then multiple cpus can most
definitely get into the ->hard_start_xmit() in parallel.
^ permalink raw reply [flat|nested] 153+ messages in thread
* Re: [PATCH] NET: Multiqueue network device support.
2007-06-07 22:44 ` David Miller
@ 2007-06-07 22:54 ` jamal
2007-06-07 23:00 ` David Miller
2007-06-07 22:55 ` Waskiewicz Jr, Peter P
1 sibling, 1 reply; 153+ messages in thread
From: jamal @ 2007-06-07 22:54 UTC (permalink / raw)
To: David Miller
Cc: auke-jan.h.kok, jeff, kaber, peter.p.waskiewicz.jr, netdev,
jesse.brandeburg
On Thu, 2007-07-06 at 15:44 -0700, David Miller wrote:
> From: jamal <hadi@cyberus.ca>
> Date: Thu, 07 Jun 2007 17:57:25 -0400
>
> > I empathize but take a closer look; seems mostly useless.
>
> I thought E1000 still uses LLTX, and if so then multiple cpus can most
> definitely get into the ->hard_start_xmit() in parallel.
AFAICS, only one CPU can dequeue from the qdisc i.e s/he who holds
__LINK_STATE_QDISC_RUNNING is the only one who can call qdisc_restart
and only s/he can enter ->hard_start_xmit().
Am i missing something?
cheers,
jamal
^ permalink raw reply [flat|nested] 153+ messages in thread
* RE: [PATCH] NET: Multiqueue network device support.
2007-06-07 22:44 ` David Miller
2007-06-07 22:54 ` jamal
@ 2007-06-07 22:55 ` Waskiewicz Jr, Peter P
2007-06-09 1:05 ` Ramkrishna Vepa
1 sibling, 1 reply; 153+ messages in thread
From: Waskiewicz Jr, Peter P @ 2007-06-07 22:55 UTC (permalink / raw)
To: David Miller, hadi
Cc: Kok, Auke-jan H, jeff, kaber, netdev, Brandeburg, Jesse
> > I empathize but take a closer look; seems mostly useless.
>
> I thought E1000 still uses LLTX, and if so then multiple cpus
> can most definitely get into the ->hard_start_xmit() in parallel.
Not with how the qdisc status protects it today:
include/net/pkt_sched.h:
static inline void qdisc_run(struct net_device *dev)
{
if (!netif_queue_stopped(dev) &&
!test_and_set_bit(__LINK_STATE_QDISC_RUNNING, &dev->state))
__qdisc_run(dev);
}
^ permalink raw reply [flat|nested] 153+ messages in thread
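The snippet above is the whole trick: test_and_set_bit() admits exactly one CPU into the dequeue loop. A self-contained toy version of the same serialization, with C11 atomics standing in for the kernel bit op and all names invented (the toy ignores concurrent enqueue; it only shows the single-dequeuer property):

#include <stdatomic.h>
#include <stdbool.h>

struct toy_qdisc_dev {
        atomic_flag running;    /* plays the role of __LINK_STATE_QDISC_RUNNING */
        int backlog;            /* packets sitting in the qdisc */
        int xmitted;            /* only ever touched by the flag owner */
};

static bool toy_hard_start_xmit(struct toy_qdisc_dev *dev)
{
        dev->xmitted++;
        return true;            /* pretend the driver always accepts the packet */
}

/* Rough equivalent of qdisc_run(): whoever wins the flag drains the
 * queue; everyone else returns immediately, so only one CPU at a time
 * is inside toy_hard_start_xmit() for this device. */
static void toy_qdisc_run(struct toy_qdisc_dev *dev)
{
        if (atomic_flag_test_and_set(&dev->running))
                return;

        while (dev->backlog > 0 && toy_hard_start_xmit(dev))
                dev->backlog--;

        atomic_flag_clear(&dev->running);
}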
* Re: [PATCH] NET: Multiqueue network device support.
2007-06-07 22:30 ` Kok, Auke
@ 2007-06-07 22:57 ` jamal
0 siblings, 0 replies; 153+ messages in thread
From: jamal @ 2007-06-07 22:57 UTC (permalink / raw)
To: Kok, Auke
Cc: Jeff Garzik, David Miller, kaber, peter.p.waskiewicz.jr, netdev,
Jesse Brandeburg
On Thu, 2007-07-06 at 15:30 -0700, Kok, Auke wrote:
> our rx interrupt/clean can trigger tx cleans, reaching the same code...
I see that - what i am saying is that tx_lock never protects that.
Am i mistaken? i.e.
CPU0 entering tx and CPU1 entering the rx interrupt/clean cannot be
blocked from each other simply by tp->tx_lock, because tp->tx_lock is only
taken on CPU0.
Is it a bug then?
cheers,
jamal
^ permalink raw reply [flat|nested] 153+ messages in thread
* Re: [PATCH] NET: Multiqueue network device support.
2007-06-07 22:54 ` jamal
@ 2007-06-07 23:00 ` David Miller
2007-06-07 23:03 ` jamal
2007-06-08 0:31 ` Sridhar Samudrala
0 siblings, 2 replies; 153+ messages in thread
From: David Miller @ 2007-06-07 23:00 UTC (permalink / raw)
To: hadi
Cc: auke-jan.h.kok, jeff, kaber, peter.p.waskiewicz.jr, netdev,
jesse.brandeburg
From: jamal <hadi@cyberus.ca>
Date: Thu, 07 Jun 2007 18:54:08 -0400
> On Thu, 2007-07-06 at 15:44 -0700, David Miller wrote:
> > From: jamal <hadi@cyberus.ca>
> > Date: Thu, 07 Jun 2007 17:57:25 -0400
> >
> > > I empathize but take a closer look; seems mostly useless.
> >
> > I thought E1000 still uses LLTX, and if so then multiple cpus can most
> > definitely get into the ->hard_start_xmit() in parallel.
>
> AFAICS, only one CPU can dequeue from the qdisc i.e s/he who holds
> __LINK_STATE_QDISC_RUNNING is the only one who can call qdisc_restart
> and only s/he can enter ->hard_start_xmit().
>
> Am i missing something?
That's right we fixed that the other week.
^ permalink raw reply [flat|nested] 153+ messages in thread
* Re: [PATCH] NET: Multiqueue network device support.
2007-06-07 23:00 ` David Miller
@ 2007-06-07 23:03 ` jamal
2007-06-08 0:31 ` Sridhar Samudrala
1 sibling, 0 replies; 153+ messages in thread
From: jamal @ 2007-06-07 23:03 UTC (permalink / raw)
To: David Miller
Cc: auke-jan.h.kok, jeff, kaber, peter.p.waskiewicz.jr, netdev,
jesse.brandeburg
On Thu, 2007-07-06 at 16:00 -0700, David Miller wrote:
> That's right we fixed that the other week.
Circa 2.6.18 to be exact - Hence "In Pursuit of Herbert Xu" ;->
cheers,
jamal
^ permalink raw reply [flat|nested] 153+ messages in thread
* Re: [PATCH] NET: Multiqueue network device support.
2007-06-07 23:00 ` David Miller
2007-06-07 23:03 ` jamal
@ 2007-06-08 0:31 ` Sridhar Samudrala
2007-06-08 1:35 ` jamal
2007-06-08 5:32 ` Krishna Kumar2
1 sibling, 2 replies; 153+ messages in thread
From: Sridhar Samudrala @ 2007-06-08 0:31 UTC (permalink / raw)
To: David Miller
Cc: hadi, auke-jan.h.kok, jeff, kaber, peter.p.waskiewicz.jr, netdev,
jesse.brandeburg
On Thu, 2007-06-07 at 16:00 -0700, David Miller wrote:
> From: jamal <hadi@cyberus.ca>
> Date: Thu, 07 Jun 2007 18:54:08 -0400
>
> > On Thu, 2007-07-06 at 15:44 -0700, David Miller wrote:
> > > From: jamal <hadi@cyberus.ca>
> > > Date: Thu, 07 Jun 2007 17:57:25 -0400
> > >
> > > > I empathize but take a closer look; seems mostly useless.
> > >
> > > I thought E1000 still uses LLTX, and if so then multiple cpus can most
> > > definitely get into the ->hard_start_xmit() in parallel.
> >
> > AFAICS, only one CPU can dequeue from the qdisc i.e s/he who holds
> > __LINK_STATE_QDISC_RUNNING is the only one who can call qdisc_restart
> > and only s/he can enter ->hard_start_xmit().
> >
> > Am i missing something?
>
> That's right we fixed that the other week.
If the QDISC_RUNNING flag guarantees that only one CPU can call
dev->hard_start_xmit(), then why do we need to hold netif_tx_lock
for non-LLTX drivers?
Thanks
Sridhar
^ permalink raw reply [flat|nested] 153+ messages in thread
* Re: [PATCH] NET: Multiqueue network device support.
2007-06-08 0:31 ` Sridhar Samudrala
@ 2007-06-08 1:35 ` jamal
2007-06-08 10:39 ` Herbert Xu
2007-06-08 5:32 ` Krishna Kumar2
1 sibling, 1 reply; 153+ messages in thread
From: jamal @ 2007-06-08 1:35 UTC (permalink / raw)
To: Sridhar Samudrala
Cc: Herbert Xu, David Miller, auke-jan.h.kok, jeff, kaber,
peter.p.waskiewicz.jr, netdev, jesse.brandeburg
On Thu, 2007-07-06 at 17:31 -0700, Sridhar Samudrala wrote:
> If the QDISC_RUNNING flag guarantees that only one CPU can call
> dev->hard_start_xmit(), then why do we need to hold netif_tx_lock
> for non-LLTX drivers?
I havent stared at other drivers, but for e1000 seems to me
even if you got rid of LLTX that netif_tx_lock is unnecessary.
Herbert?
cheers,
jamal
^ permalink raw reply [flat|nested] 153+ messages in thread
* Re: [PATCH] NET: Multiqueue network device support.
2007-06-08 0:31 ` Sridhar Samudrala
2007-06-08 1:35 ` jamal
@ 2007-06-08 5:32 ` Krishna Kumar2
2007-06-08 19:55 ` Waskiewicz Jr, Peter P
1 sibling, 1 reply; 153+ messages in thread
From: Krishna Kumar2 @ 2007-06-08 5:32 UTC (permalink / raw)
To: Sridhar Samudrala
Cc: auke-jan.h.kok, David Miller, hadi, jeff, jesse.brandeburg, kaber,
netdev, peter.p.waskiewicz.jr
> If the QDISC_RUNNING flag guarantees that only one CPU can call
> dev->hard_start_xmit(), then why do we need to hold netif_tx_lock
> for non-LLTX drivers?
I thought the correct use is to get this lock on clean_tx side which
can get called on a different cpu on rx (which also cleans up slots
for skbs that have finished xmit). Both TX and clean_tx use the
same tx_ring's head/tail ptrs and should be exclusive. But I don't
find clean tx using this lock in the code, so I am confused :-)
- KK
^ permalink raw reply [flat|nested] 153+ messages in thread
* Re: [PATCH] NET: Multiqueue network device support.
2007-06-08 1:35 ` jamal
@ 2007-06-08 10:39 ` Herbert Xu
2007-06-08 11:34 ` jamal
0 siblings, 1 reply; 153+ messages in thread
From: Herbert Xu @ 2007-06-08 10:39 UTC (permalink / raw)
To: jamal
Cc: Sridhar Samudrala, David Miller, auke-jan.h.kok, jeff, kaber,
peter.p.waskiewicz.jr, netdev, jesse.brandeburg
On Thu, Jun 07, 2007 at 09:35:36PM -0400, jamal wrote:
> On Thu, 2007-07-06 at 17:31 -0700, Sridhar Samudrala wrote:
>
> > If the QDISC_RUNNING flag guarantees that only one CPU can call
> > dev->hard_start_xmit(), then why do we need to hold netif_tx_lock
> > for non-LLTX drivers?
>
> I havent stared at other drivers, but for e1000 seems to me
> even if you got rid of LLTX that netif_tx_lock is unnecessary.
> Herbert?
It would guard against the poll routine which would acquire this lock
when cleaning the TX ring.
Cheers,
--
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
^ permalink raw reply [flat|nested] 153+ messages in thread
* Re: [PATCH] NET: Multiqueue network device support.
2007-06-08 10:39 ` Herbert Xu
@ 2007-06-08 11:34 ` jamal
2007-06-08 12:37 ` Herbert Xu
0 siblings, 1 reply; 153+ messages in thread
From: jamal @ 2007-06-08 11:34 UTC (permalink / raw)
To: Herbert Xu
Cc: Sridhar Samudrala, David Miller, auke-jan.h.kok, jeff, kaber,
peter.p.waskiewicz.jr, netdev, jesse.brandeburg
On Fri, 2007-08-06 at 20:39 +1000, Herbert Xu wrote:
> It would guard against the poll routine which would acquire this lock
> when cleaning the TX ring.
Ok, then i suppose we can conclude it is a bug on e1000 (holds tx_lock
on tx side and adapter queue lock on rx). Adding that lock will
certainly bring down the performance numbers on a send/recv profile.
The bizarre thing is that things run just fine even under the heavy tx/rx
traffic i was testing under. I guess i didnt hit hard enough.
cheers,
jamal
^ permalink raw reply [flat|nested] 153+ messages in thread
* Re: [PATCH] NET: Multiqueue network device support.
2007-06-08 11:34 ` jamal
@ 2007-06-08 12:37 ` Herbert Xu
2007-06-08 13:12 ` jamal
0 siblings, 1 reply; 153+ messages in thread
From: Herbert Xu @ 2007-06-08 12:37 UTC (permalink / raw)
To: jamal
Cc: Sridhar Samudrala, David Miller, auke-jan.h.kok, jeff, kaber,
peter.p.waskiewicz.jr, netdev, jesse.brandeburg
On Fri, Jun 08, 2007 at 07:34:57AM -0400, jamal wrote:
> On Fri, 2007-08-06 at 20:39 +1000, Herbert Xu wrote:
>
> > It would guard against the poll routine which would acquire this lock
> > when cleaning the TX ring.
>
> Ok, then i suppose we can conclude it is a bug on e1000 (holds tx_lock
> on tx side and adapter queue lock on rx). Adding that lock will
> certainly bring down the performance numbers on a send/recv profile.
> The bizarre thing is that things run just fine even under the heavy tx/rx
> traffic i was testing under. I guess i didnt hit hard enough.
Hmm I wasn't describing how it works now. I'm talking about how it
would work if we removed LLTX and replaced the private tx_lock with
netif_tx_lock.
Cheers,
--
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
^ permalink raw reply [flat|nested] 153+ messages in thread
* Re: [PATCH] NET: Multiqueue network device support.
2007-06-08 12:37 ` Herbert Xu
@ 2007-06-08 13:12 ` jamal
2007-06-09 11:08 ` Herbert Xu
0 siblings, 1 reply; 153+ messages in thread
From: jamal @ 2007-06-08 13:12 UTC (permalink / raw)
To: Herbert Xu
Cc: Sridhar Samudrala, David Miller, auke-jan.h.kok, jeff, kaber,
peter.p.waskiewicz.jr, netdev, jesse.brandeburg
On Fri, 2007-08-06 at 22:37 +1000, Herbert Xu wrote:
> Hmm I wasn't describing how it works now. I'm talking about how it
> would work if we removed LLTX and replaced the private tx_lock with
> netif_tx_lock.
I got that - it is what tg3 does for example.
To mimic that behavior in LLTX, a driver needs to use the same lock on
both tx and receive. e1000 holds a different lock on the tx path from the rx
path. Maybe theres something clever i am missing; but it seems to be a
bug on e1000.
The point i was making is that it was strange i never had problems
despite taking away the lock on the tx side and using the rx side
concurently.
cheers,
jamal
^ permalink raw reply [flat|nested] 153+ messages in thread
* RE: [PATCH] NET: Multiqueue network device support.
2007-06-08 5:32 ` Krishna Kumar2
@ 2007-06-08 19:55 ` Waskiewicz Jr, Peter P
2007-06-09 0:24 ` jamal
0 siblings, 1 reply; 153+ messages in thread
From: Waskiewicz Jr, Peter P @ 2007-06-08 19:55 UTC (permalink / raw)
To: Krishna Kumar2, Sridhar Samudrala
Cc: Kok, Auke-jan H, David Miller, hadi, jeff, Brandeburg, Jesse,
kaber, netdev
> I thought the correct use is to get this lock on clean_tx
> side which can get called on a different cpu on rx (which
> also cleans up slots for skbs that have finished xmit). Both
> TX and clean_tx uses the same tx_ring's head/tail ptrs and
> should be exclusive. But I don't find clean tx using this
> lock in the code, so I am confused :-)
>From e1000_main.c, e1000_clean():
/* e1000_clean is called per-cpu. This lock protects
* tx_ring[0] from being cleaned by multiple cpus
* simultaneously. A failure obtaining the lock means
* tx_ring[0] is currently being cleaned anyway. */
if (spin_trylock(&adapter->tx_queue_lock)) {
tx_cleaned = e1000_clean_tx_irq(adapter,
&adapter->tx_ring[0]);
spin_unlock(&adapter->tx_queue_lock);
}
In a multi-ring implementation of the driver, this is wrapped with for
(i = 0; i < adapter->num_tx_queues; i++) and &adapter->tx_ring[i]. This
lock also prevents the clean routine from stomping on xmit_frame() when
transmitting. Also in the multi-ring implementation, the tx_lock is
pushed down into the individual tx_ring struct, not at the adapter
level.
Cheers,
-PJ Waskiewicz
peter.p.waskiewicz.jr@intel.com
^ permalink raw reply [flat|nested] 153+ messages in thread
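For concreteness, roughly what the multi-ring variant of that loop would look like, per the description above (a sketch with invented mini types, not the shipping e1000 code): each ring carries its own trylock, so one CPU cleaning ring 0 does not block another CPU cleaning ring 1.

#include <pthread.h>
#include <stdbool.h>

#define TOY_NUM_TX_QUEUES 4

struct toy_tx_ring {
        pthread_spinlock_t lock;        /* per-ring lock, pushed down from the adapter */
        unsigned int next_to_clean;
};

struct toy_adapter {
        unsigned int num_tx_queues;
        struct toy_tx_ring tx_ring[TOY_NUM_TX_QUEUES];
};

static bool toy_clean_tx_irq(struct toy_tx_ring *ring)
{
        ring->next_to_clean++;          /* reclaim finished descriptors */
        return true;
}

static bool toy_clean(struct toy_adapter *adapter)
{
        bool tx_cleaned = false;
        unsigned int i;

        for (i = 0; i < adapter->num_tx_queues; i++) {
                /* a failed trylock just means this ring is already being cleaned */
                if (pthread_spin_trylock(&adapter->tx_ring[i].lock) == 0) {
                        tx_cleaned |= toy_clean_tx_irq(&adapter->tx_ring[i]);
                        pthread_spin_unlock(&adapter->tx_ring[i].lock);
                }
        }
        return tx_cleaned;
}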
* RE: [PATCH] NET: Multiqueue network device support.
2007-06-08 19:55 ` Waskiewicz Jr, Peter P
@ 2007-06-09 0:24 ` jamal
0 siblings, 0 replies; 153+ messages in thread
From: jamal @ 2007-06-09 0:24 UTC (permalink / raw)
To: Waskiewicz Jr, Peter P
Cc: Krishna Kumar2, Sridhar Samudrala, Kok, Auke-jan H, David Miller,
jeff, Brandeburg, Jesse, kaber, netdev
On Fri, 2007-08-06 at 12:55 -0700, Waskiewicz Jr, Peter P wrote:
> > I thought the correct use is to get this lock on clean_tx
> > side which can get called on a different cpu on rx (which
> > also cleans up slots for skbs that have finished xmit). Both
> > TX and clean_tx uses the same tx_ring's head/tail ptrs and
> > should be exclusive. But I don't find clean tx using this
> > lock in the code, so I am confused :-)
>
> >From e1000_main.c, e1000_clean():
>
> /* e1000_clean is called per-cpu. This lock protects
> * tx_ring[0] from being cleaned by multiple cpus
> * simultaneously. A failure obtaining the lock means
> * tx_ring[0] is currently being cleaned anyway. */
> if (spin_trylock(&adapter->tx_queue_lock)) {
> tx_cleaned = e1000_clean_tx_irq(adapter,
> &adapter->tx_ring[0]);
> spin_unlock(&adapter->tx_queue_lock);
> }
Are you saying theres no problem because the adapter->tx_queue_lock is
being held?
> In a multi-ring implementation of the driver, this is wrapped with for
> (i = 0; i < adapter->num_tx_queues; i++) and &adapter->tx_ring[i]. This
> lock also prevents the clean routine from stomping on xmit_frame() when
> transmitting. Also in the multi-ring implementation, the tx_lock is
> pushed down into the individual tx_ring struct, not at the adapter
> level.
That sounds right - but the adapter lock is not related to tx_lock in
current e1000, correct?
cheers,
jamal
^ permalink raw reply [flat|nested] 153+ messages in thread
* RE: [PATCH] NET: Multiqueue network device support.
2007-06-07 22:55 ` Waskiewicz Jr, Peter P
@ 2007-06-09 1:05 ` Ramkrishna Vepa
0 siblings, 0 replies; 153+ messages in thread
From: Ramkrishna Vepa @ 2007-06-09 1:05 UTC (permalink / raw)
To: Waskiewicz Jr, Peter P, David Miller, hadi
Cc: Kok, Auke-jan H, jeff, kaber, netdev, Brandeburg, Jesse
Peter,
Where is your git tree located?
Ram
> -----Original Message-----
> From: netdev-owner@vger.kernel.org
[mailto:netdev-owner@vger.kernel.org]
> On Behalf Of Waskiewicz Jr, Peter P
> Sent: Thursday, June 07, 2007 3:56 PM
> To: David Miller; hadi@cyberus.ca
> Cc: Kok, Auke-jan H; jeff@garzik.org; kaber@trash.net;
> netdev@vger.kernel.org; Brandeburg, Jesse
> Subject: RE: [PATCH] NET: Multiqueue network device support.
>
> > > I empathize but take a closer look; seems mostly useless.
> >
> > I thought E1000 still uses LLTX, and if so then multiple cpus
> > can most definitely get into the ->hard_start_xmit() in parallel.
>
> Not with how the qdisc status protects it today:
>
> include/net/pkt_sched.h:
>
> static inline void qdisc_run(struct net_device *dev)
> {
> if (!netif_queue_stopped(dev) &&
> !test_and_set_bit(__LINK_STATE_QDISC_RUNNING,
&dev->state))
> __qdisc_run(dev);
> }
^ permalink raw reply [flat|nested] 153+ messages in thread
* Re: [PATCH] NET: Multiqueue network device support.
2007-06-08 13:12 ` jamal
@ 2007-06-09 11:08 ` Herbert Xu
2007-06-09 14:36 ` jamal
0 siblings, 1 reply; 153+ messages in thread
From: Herbert Xu @ 2007-06-09 11:08 UTC (permalink / raw)
To: jamal
Cc: Sridhar Samudrala, David Miller, auke-jan.h.kok, jeff, kaber,
peter.p.waskiewicz.jr, netdev, jesse.brandeburg
On Fri, Jun 08, 2007 at 09:12:52AM -0400, jamal wrote:
>
> To mimic that behavior in LLTX, a driver needs to use the same lock on
> both tx and receive. e1000 holds a different lock on tx path from rx
> path. Maybe theres something clever i am missing; but it seems to be a
> bug on e1000.
It's both actually :)
It takes the tx_lock in the xmit routine as well as in the clean-up
routine. However, the lock is only taken when it updates the queue
status.
Thanks to the ring buffer structure the rest of the clean-up/xmit code
will run concurrently just fine.
Cheers,
--
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
^ permalink raw reply [flat|nested] 153+ messages in thread
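A sketch of why the ring layout allows that (illustrative only, invented names; real driver code would also need the appropriate memory barriers): the xmit path only advances the producer index, the clean path only advances the consumer index, and the shared lock is needed only around the stop/wake decision.

#include <pthread.h>

#define TOY_RING_SIZE 256

struct toy_ring {
        unsigned int next_to_use;       /* written only by the xmit path */
        unsigned int next_to_clean;     /* written only by the clean path */
        pthread_mutex_t status_lock;    /* guards only the stop/wake decision */
        int queue_stopped;
};

static unsigned int toy_ring_space(const struct toy_ring *r)
{
        return TOY_RING_SIZE - (r->next_to_use - r->next_to_clean);
}

static void toy_ring_xmit(struct toy_ring *r)
{
        r->next_to_use++;                       /* producer side, no lock */

        pthread_mutex_lock(&r->status_lock);    /* lock only for queue status */
        if (toy_ring_space(r) < 2)
                r->queue_stopped = 1;
        pthread_mutex_unlock(&r->status_lock);
}

static void toy_ring_clean(struct toy_ring *r)
{
        r->next_to_clean++;                     /* consumer side, no lock */

        pthread_mutex_lock(&r->status_lock);
        if (r->queue_stopped && toy_ring_space(r) > 4)
                r->queue_stopped = 0;           /* wake the queue again */
        pthread_mutex_unlock(&r->status_lock);
}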
* Re: [PATCH] NET: Multiqueue network device support.
2007-06-09 11:08 ` Herbert Xu
@ 2007-06-09 14:36 ` jamal
0 siblings, 0 replies; 153+ messages in thread
From: jamal @ 2007-06-09 14:36 UTC (permalink / raw)
To: Herbert Xu
Cc: Sridhar Samudrala, David Miller, auke-jan.h.kok, jeff, kaber,
peter.p.waskiewicz.jr, netdev, jesse.brandeburg
On Sat, 2007-09-06 at 21:08 +1000, Herbert Xu wrote:
> It takes the tx_lock in the xmit routine as well as in the clean-up
> routine. However, the lock is only taken when it updates the queue
> status.
>
> Thanks to the ring buffer structure the rest of the clean-up/xmit code
> will run concurrently just fine.
I know you are a patient man Herbert - so please explain slowly (if that
doesnt make sense on email, then bear with me as usual) ;->
- it seems the cleverness is that some parts of the ring description are
written to on tx but not rx (and vice-versa), correct? For example the
next_to_watch/use bits. If thats a yes - there should at least have been
a big fat comment in the code so nobody changes it;
- and even if thats the case,
a) then the tx_lock sounds unneeded, correct? (given the RUNNING
atomicity).
b) do you even need the adapter lock? ;-> given the nature of the NAPI
poll only one CPU can prune the descriptors.
I have tested with just getting rid of tx_lock and it worked fine. I
havent tried removing the adapter lock.
cheers,
jamal
^ permalink raw reply [flat|nested] 153+ messages in thread
* RE: [PATCH] NET: Multiqueue network device support.
2007-06-06 22:30 ` Waskiewicz Jr, Peter P
2007-06-06 22:40 ` David Miller
@ 2007-06-09 14:58 ` Leonid Grossman
2007-06-09 19:23 ` jamal
1 sibling, 1 reply; 153+ messages in thread
From: Leonid Grossman @ 2007-06-09 14:58 UTC (permalink / raw)
To: Waskiewicz Jr, Peter P, hadi, Patrick McHardy
Cc: davem, netdev, jeff, Kok, Auke-jan H, Ramkrishna Vepa,
Alex Aizman
> -----Original Message-----
> From: netdev-owner@vger.kernel.org [mailto:netdev-
> owner@vger.kernel.org] On Behalf Of Waskiewicz Jr, Peter P
> Sent: Wednesday, June 06, 2007 3:31 PM
> To: hadi@cyberus.ca; Patrick McHardy
> Cc: davem@davemloft.net; netdev@vger.kernel.org; jeff@garzik.org; Kok,
> Auke-jan H
> Subject: RE: [PATCH] NET: Multiqueue network device support.
>
> > [Which of course leads to the complexity (and not optimizing
> > for the common - which is single ring NICs)].
>
> The common for 100 Mbit and older 1Gbit is single ring NICs. Newer
> PCI-X and PCIe NICs from 1Gbit to 10Gbit support multiple rings in the
> hardware, and it's all headed in that direction, so it's becoming the
> common case.
IMHO, in addition to current Intel and Neterion NICs, some/most upcoming
NICs are likely to be multiqueue, since virtualization is emerging as a
major driver for hw designs (there are other things of course that drive
hw, but these are complementary to multiqueue).
PCI-SIG IOV extensions for pci spec are almost done, and a typical NIC
(at least, typical 10GbE NIC that supports some subset of IOV) in the
near future is likely to have at least 8 independent channels with its
own tx/rx queue, MAC address, msi-x vector(s), reset that doesn't affect
other channels, etc.
Basically, each channel could be used as an independent NIC that just
happens to share pci bus and 10GbE PHY with other channels (but has
per-channel QoS and throughput guarantees).
In a non-virtualized system, such NICs could be used in a mode where each
channel runs on one core; this may eliminate some locking... This mode
will, btw, require deterministic session steering; the current hashing
approach in the patch is not sufficient. This is something we can
contribute once Peter's code is in.
In general, a consensus on kernel support for multiqueue NICs will be
beneficial, since multiqueue HW is here and other stacks are already taking
advantage of it.
^ permalink raw reply [flat|nested] 153+ messages in thread
* RE: [PATCH] NET: Multiqueue network device support.
2007-06-09 14:58 ` Leonid Grossman
@ 2007-06-09 19:23 ` jamal
2007-06-09 21:23 ` Leonid Grossman
0 siblings, 1 reply; 153+ messages in thread
From: jamal @ 2007-06-09 19:23 UTC (permalink / raw)
To: Leonid Grossman
Cc: Waskiewicz Jr, Peter P, Patrick McHardy, davem, netdev, jeff,
Kok, Auke-jan H, Ramkrishna Vepa, Alex Aizman
On Sat, 2007-09-06 at 10:58 -0400, Leonid Grossman wrote:
> IMHO, in addition to current Intel and Neterion NICs, some/most upcoming
> NICs are likely to be multiqueue, since virtualization emerges as a
> major driver for hw designs (there are other things of course that drive
> hw, but these are complementary to multiqueue).
>
> PCI-SIG IOV extensions for pci spec are almost done, and a typical NIC
> (at least, typical 10GbE NIC that supports some subset of IOV) in the
> near future is likely to have at least 8 independent channels with its
> own tx/rx queue, MAC address, msi-x vector(s), reset that doesn't affect
> other channels, etc.
Leonid - any relation between that and data center ethernet? i.e
http://www.ieee802.org/3/ar/public/0503/wadekar_1_0503.pdf
It seems to desire to do virtualization as well.
Is there any open spec for PCI-SIG IOV?
> Basically, each channel could be used as an independent NIC that just
> happens to share pci bus and 10GbE PHY with other channels (but has
> per-channel QoS and throughput guarantees).
Sounds very similar to data centre ethernet - except data centre
ethernet seems to map "channels" to rings; whereas the scheme you
describe maps a channel essentially to a virtual nic which seems to read
in the common case as a single tx, single rx ring. Is that right? If
yes, we should be able to do the virtual nics today without any changes
really since each one appears as a separate NIC. It will be a matter of
probably boot time partitioning and parametrization to create virtual
nics (ex of priorities of each virtual NIC etc).
> In a non-virtualized system, such NICs could be used in a mode when each
> channel runs on one core; this may eliminate some locking... This mode
> will require btw deterministic session steering, current hashing
> approach in the patch is not sufficient; this is something we can
> contribute once Peter's code is in.
I can actually see how the PCI-SIG virtual NIC approach
could run on multiple CPUs (since each is no different from a NIC that
we have today). And our current Linux steering would also work just
fine.
In the case of non-virtual NICs, i am afraid i dont think it is as easy
as simple session steering - if you want to be generic that is; you may
wanna consider a more complex connection tracking i.e a grouping of
sessions as the basis for steering to a tx ring (and therefore tying to
a specific CPU).
If you are an ISP or a data center with customers partitioned based on
simple subnets, then i can see a simple classification based on subnets
being tied to a hw ring/CPU. And in such cases simple flow control on a
per ring basis makes sense.
Have you guys experimented on the non-virtual case? And are you
doing the virtual case as a pair of tx/rx being a single virtual nic?
> In general, a consensus on kernel support for multiqueue NICs will be
> beneficial since multiqueue HW is here and other stacks already taking
> advantage of it.
My main contention with Peter's approach has been to do with the
propagating of flow control back to the qdisc queues. However, if this
PCI SIG standard also desires such an approach, then that will shed a
different light.
cheers,
jamal
^ permalink raw reply [flat|nested] 153+ messages in thread
* RE: [PATCH] NET: Multiqueue network device support.
2007-06-09 19:23 ` jamal
@ 2007-06-09 21:23 ` Leonid Grossman
2007-06-09 22:14 ` Jeff Garzik
2007-06-10 3:02 ` jamal
0 siblings, 2 replies; 153+ messages in thread
From: Leonid Grossman @ 2007-06-09 21:23 UTC (permalink / raw)
To: hadi
Cc: Waskiewicz Jr, Peter P, Patrick McHardy, davem, netdev, jeff,
Kok, Auke-jan H, Ramkrishna Vepa, Alex Aizman
> -----Original Message-----
> From: J Hadi Salim [mailto:j.hadi123@gmail.com] On Behalf Of jamal
> Sent: Saturday, June 09, 2007 12:23 PM
> To: Leonid Grossman
> Cc: Waskiewicz Jr, Peter P; Patrick McHardy; davem@davemloft.net;
> netdev@vger.kernel.org; jeff@garzik.org; Kok, Auke-jan H; Ramkrishna
> Vepa; Alex Aizman
> Subject: RE: [PATCH] NET: Multiqueue network device support.
>
> On Sat, 2007-09-06 at 10:58 -0400, Leonid Grossman wrote:
>
> > IMHO, in addition to current Intel and Neterion NICs, some/most
> upcoming
> > NICs are likely to be multiqueue, since virtualization emerges as a
> > major driver for hw designs (there are other things of course that
> drive
> > hw, but these are complementary to multiqueue).
> >
> > PCI-SIG IOV extensions for pci spec are almost done, and a typical
> NIC
> > (at least, typical 10GbE NIC that supports some subset of IOV) in
the
> > near future is likely to have at least 8 independent channels with
> its
> > own tx/rx queue, MAC address, msi-x vector(s), reset that doesn't
> affect
> > other channels, etc.
>
> Leonid - any relation between that and data center ethernet? i.e
> http://www.ieee802.org/3/ar/public/0503/wadekar_1_0503.pdf
> It seems to desire to do virtualization as well.
Not really. This is a very old presentation; you probably saw some newer
PR on Convergence Enhanced Ethernet, Congestion Free Ethernet etc.
These efforts are in very early stages and arguably orthogonal to
virtualization, but in general having per channel QoS (flow control is
just a part of it) is a good thing.
> Is there any open spec for PCI-SIG IOV?
I don't think so; the actual specs and event presentations at
www.pcisig.org are members-only, although there are many PRs about early
IOV support that may shed some light on the features.
But my point was that while the virtualization capabilities of upcoming NICs
may not even be relevant to Linux, the multi-channel hw designs (a side
effect of the virtualization push, if you will) will be there, and a
non-virtualized stack can take advantage of them.
Actually, our current 10GbE NICs already ship with most of such a multichannel
framework (in pre-IOV fashion), so the programming
manual on the website can probably give you a pretty good idea of what
multi-channel 10GbE NICs may look like.
>
> > Basically, each channel could be used as an independent NIC that
just
> > happens to share pci bus and 10GbE PHY with other channels (but has
> > per-channel QoS and throughput guarantees).
>
> Sounds very similar to data centre ethernet - except data centre
> ethernet seems to map "channels" to rings; whereas the scheme you
> describe maps a channel essentially to a virtual nic which seems to
> read
> in the common case as a single tx, single rx ring. Is that right? If
> yes, we should be able to do the virtual nics today without any
changes
> really since each one appears as a separate NIC. It will be a matter
of
> probably boot time partitioning and parametrization to create virtual
> nics (ex of priorities of each virtual NIC etc).
Right, this is one deployment scenario for a multi-channel NIC, and it
will require very few changes in the stack (a couple of extra IOCTLs would be
nice).
There are two reasons why you still may want to have generic
multi-channel support/awareness in the stack:
1. Some users may want to have a single ip interface with multiple
channels.
2. While multi-channel NICs are likely to be many, only "best-in-class"
ones will make the hw "channels" completely independent and able to operate
as a separate nic. Other implementations may have some limitations, and
will work as multi-channel API compliant devices but not necessarily as
independent mac devices.
I agree though that supporting multi-channel APIs is a bigger effort.
>
> > In a non-virtualized system, such NICs could be used in a mode when
> each
> > channel runs on one core; this may eliminate some locking... This
> mode
> > will require btw deterministic session steering, current hashing
> > approach in the patch is not sufficient; this is something we can
> > contribute once Peter's code is in.
>
> I can actually see how the PCI-SIG approach using virtual NIC approach
> could run on multiple CPUs (since each is no different from a NIC that
> we have today). And our current Linux steering would also work just
> fine.
>
> In the case of non-virtual NICs, i am afraid i dont think it is as
easy
> as simple session steering - if you want to be generic that is; you
may
> wanna consider a more complex connection tracking i.e a grouping of
> sessions as the basis for steering to a tx ring (and therefore tying
to
> a specific CPU).
> If you are an ISP or a data center with customers partitioned based on
> simple subnets, then i can see a simple classification based on
subnets
> being tied to a hw ring/CPU. And in such cases simple flow control on
a
> per ring basis makes sense.
> Have you guys experimented on the non-virtual case? And are you
> doing the virtual case as a pair of tx/rx being a single virtual nic?
To a degree. We have quite a bit of testing done in a non-virtualized OS (not
in Linux though), using channels with tx/rx rings, msi-x etc. as
independent NICs. Flow control was not a focus since the fabric
typically was not congested in these tests, but in theory per-channel
flow control should work reasonably well. Of course, flow control is
only part of the resource sharing problem.
>
> > In general, a consensus on kernel support for multiqueue NICs will be
> > beneficial since multiqueue HW is here and other stacks are already
> > taking advantage of it.
>
> My main contention with Peter's approach has been to do with the
> propagation of flow control back to the qdisc queues. However, if this
> PCI SIG standard also desires such an approach then it will shed a
> different light.
This is not what I'm saying :-). The IEEE link you sent shows that
per-link flow control is a separate effort, and it will likely take
time to become a standard.
Also, (besides the shared link) the channels will share pci bus.
One solution could be to provide a generic API for assigning a QoS level
to a channel (and also to a generic NIC!).
Internally, the device driver can translate QoS requirements into flow
control, pci bus bandwidth, and whatever else is shared on the physical
NIC between the channels.
As always, as some of that code becomes common between the drivers it
can migrate up.
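Purely as an illustration of what such an API could look like - nothing
like this exists in the kernel today, and every name below is invented:

#include <linux/netdevice.h>
#include <linux/types.h>

struct channel_qos {
	u32	min_rate_kbps;	/* guaranteed share of the link */
	u32	max_rate_kbps;	/* cap; 0 = unlimited */
	u8	priority;	/* relative weight among channels */
	u8	pause_en;	/* per-channel link flow control on/off */
};

/* hypothetical driver hook, reached e.g. via an ethtool/ioctl extension;
 * the driver maps these knobs onto pause frames, pci bandwidth
 * arbitration or whatever else the channels share on the physical NIC */
int (*set_channel_qos)(struct net_device *dev, int channel,
		       const struct channel_qos *qos);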
Best, Leonid
>
> cheers,
> jamal
^ permalink raw reply [flat|nested] 153+ messages in thread
* Re: [PATCH] NET: Multiqueue network device support.
2007-06-09 21:23 ` Leonid Grossman
@ 2007-06-09 22:14 ` Jeff Garzik
2007-06-10 3:02 ` jamal
1 sibling, 0 replies; 153+ messages in thread
From: Jeff Garzik @ 2007-06-09 22:14 UTC (permalink / raw)
To: Leonid Grossman
Cc: hadi, Waskiewicz Jr, Peter P, Patrick McHardy, davem, netdev,
Kok, Auke-jan H, Ramkrishna Vepa, Alex Aizman
Leonid Grossman wrote:
> But my point was that while virtualization capabilities of upcoming NICs
> may not even be relevant to Linux, the multi-channel hw designs (a side
> effect of virtualization push, if you will) will be there and a
> non-virtualized stack can take advantage of them.
I'm looking at the current hardware virtualization efforts, and often
grimacing. A lot of these efforts assume that "virtual PCI devices"
will be wonderful virtualization solutions, without stopping to think
about global events that affect all such devices, such as silicon resets
or errata workarounds. In the real world, you wind up having to
un-virtualize to deal with certain exceptional events.
But as you point out, these hardware virt efforts can bestow benefits on
non-virtualized stacks.
Jeff
^ permalink raw reply [flat|nested] 153+ messages in thread
* RE: [PATCH] NET: Multiqueue network device support.
2007-06-09 21:23 ` Leonid Grossman
2007-06-09 22:14 ` Jeff Garzik
@ 2007-06-10 3:02 ` jamal
2007-06-10 15:27 ` Leonid Grossman
1 sibling, 1 reply; 153+ messages in thread
From: jamal @ 2007-06-10 3:02 UTC (permalink / raw)
To: Leonid Grossman
Cc: Waskiewicz Jr, Peter P, Patrick McHardy, davem, netdev, jeff,
Kok, Auke-jan H, Ramkrishna Vepa, Alex Aizman
On Sat, 2007-09-06 at 17:23 -0400, Leonid Grossman wrote:
> Not really. This is a very old presentation; you probably saw some newer
> PR on Convergence Enhanced Ethernet, Congestion Free Ethernet etc.
Not been keeping up to date in that area.
> These efforts are in very early stages and arguably orthogonal to
> virtualization, but in general having per channel QoS (flow control is
> just a part of it) is a good thing.
our definition of "channel" on linux so far is a netdev
(not a DMA ring). A netdev is the entity that can be bound to a CPU.
Link layer flow control terminates (and emanates) from the netdev.
> But my point was that while virtualization capabilities of upcoming NICs
> may not even be relevant to Linux, the multi-channel hw designs (a side
> effect of virtualization push, if you will) will be there and a
> non-virtualized stack can take advantage of them.
Makes sense...
> Actually, our current 10GbE NICs have most of such multichannel
> framework already shipping (in pre-IOV fashion), so the programming
> manual on the website can probably give you a pretty good idea of what
> multi-channel 10GbE NICs may look like.
Ok, thanks.
> Right, this is one deployment scenario for a multi-channel NIC, and it
> will require very few changes in the stack (a couple of extra IOCTLs
> would be nice).
Essentially a provisioning interface.
> There are two reasons why you still may want to have a generic
> multi-channel support/awareness in the stack:
> 1. Some users may want to have single ip interface with multiple
> channels.
> 2. While there will likely be many multi-channel NICs, only
> "best-in-class" implementations will make the hw "channels" completely
> independent and able to operate as separate nics. Other implementations
> may have some limitations, and will work as multi-channel API compliant
> devices but not necessarily as independent mac devices.
> I agree though that supporting multi-channel APIs is a bigger effort.
IMO, the challenges you describe above are solvable via a parent
netdevice (similar to bonding) with children being the virtual NICs. The
IP address is attached to the parent. Of course the other model is not
to show the parent device at all.
> To a degree. We have quite a bit of testing done in non-virtual OS (not
> in Linux though), using channels with tx/rx rings, msi-x etc as
> independent NICs. Flow control was not a focus since the fabric
> typically was not congested in these tests, but in theory per-channel
> flow control should work reasonably well. Of course, flow control is
> only part of resource sharing problem.
In the current model, flow control up to the s/ware queueing level (qdisc)
is implicit: the hardware receives pause frames and stops sending; the
ring then fills up, the netdev tx path gets shut, and things open up again
only when the hardware resumes sending and frees up ring entries.
> This is not what I'm saying :-). The IEEE link you sent shows that
> per-link flow control is a separate effort, and it will likely take
> time to become a standard.
Ok, my impression was it was happening already or it will happen
tomorrow morning ;->
> Also, (besides the shared link) the channels will share pci bus.
>
> One solution could be to provide a generic API for QoS level to a
> channel
> (and also to a generic NIC!).
> Internally, device driver can translate QoS requirements into flow
> control, pci bus bandwidth, and whatever else is shared on the physical
> NIC between the channels.
> As always, as some of that code becomes common between the drivers it
> can migrate up.
indeed.
cheers,
jamal
^ permalink raw reply [flat|nested] 153+ messages in thread
* RE: [PATCH] NET: Multiqueue network device support.
2007-06-10 3:02 ` jamal
@ 2007-06-10 15:27 ` Leonid Grossman
0 siblings, 0 replies; 153+ messages in thread
From: Leonid Grossman @ 2007-06-10 15:27 UTC (permalink / raw)
To: hadi
Cc: Waskiewicz Jr, Peter P, Patrick McHardy, davem, netdev, jeff,
Kok, Auke-jan H, Ramkrishna Vepa, Alex Aizman
> -----Original Message-----
> From: J Hadi Salim [mailto:j.hadi123@gmail.com] On Behalf Of jamal
> Sent: Saturday, June 09, 2007 8:03 PM
> To: Leonid Grossman
> Cc: Waskiewicz Jr, Peter P; Patrick McHardy; davem@davemloft.net;
> netdev@vger.kernel.org; jeff@garzik.org; Kok, Auke-jan H; Ramkrishna
> Vepa; Alex Aizman
> Subject: RE: [PATCH] NET: Multiqueue network device support.
>
> our definition of "channel" on linux so far is a netdev
> (not a DMA ring). A netdev is the entity that can be bound to a CPU.
> Link layer flow control terminates (and emanates) from the netdev.
I think we are saying the same thing. Link layer flow control frames are
generated (and terminated) by the hardware; the hardware gets configured
by netdev.
And if a hw channel has enough resources, it could be configured as a
separate netdev and handle its flow control the same way single-channel
NICs do now.
I'm not advocating flow control on a per-DMA-ring basis.
> > This is not what I'm saying :-). The IEEE link you sent shows that
> > per-link flow control is a separate effort, and it will likely take
> > time to become a standard.
>
> Ok, my impression was it was happening already or it will happen
> tomorrow morning ;->
the proposal you mentioned is dated 2005, but something like that will
probably happen sooner or later in IEEE. Some non-standard options,
including ours, are already here - but as we just discussed, in any case
flow control is arguably a netdev property, not a queue property.
The multi-queue patch itself though (and possibly some additional
per-queue properties) is a good thing :-)
^ permalink raw reply [flat|nested] 153+ messages in thread
* Re: [PATCH] NET: Multiqueue network device support.
2007-06-06 22:13 ` jamal
2007-06-06 22:30 ` Waskiewicz Jr, Peter P
2007-06-06 22:35 ` David Miller
@ 2007-06-11 11:58 ` Patrick McHardy
2007-06-11 12:23 ` jamal
2 siblings, 1 reply; 153+ messages in thread
From: Patrick McHardy @ 2007-06-11 11:58 UTC (permalink / raw)
To: hadi; +Cc: Waskiewicz Jr, Peter P, davem, netdev, jeff, Kok, Auke-jan H
jamal wrote:
> On Wed, 2007-06-06 at 17:11 +0200, Patrick McHardy wrote:
>
>
>>[...]
> The problem is the premise is _inaccurate_.
> Since you havent followed the discussion, i will try to be brief (which
> is hard).
> If you want verbosity it is in my previous emails:
>
> Consider a simple example of strict prio qdisc which is mirror
> configuration of a specific hardware.
> Then for sake of discussion, assume two prio queues in the qdisc - PSL
> and PSH and two hardware queues/rings in a NIC which does strict prio
> with queues PHL and PHH.
> The mapping is as follows:
> PSL --- maps to --- PHL
> PSH --- maps to --- PHH
>
> Assume the PxH has a higher prio than PxL.
> Strict prio will always favor H over L.
>
> Two scenarios:
> a) a lot of packets for PSL arriving on the stack.
> They only get sent from PSL -> PHL if and only if there are no
> packets from PSH->PHH.
> b) a lot of packets for PSH arriving from the stack.
> They will always be favored over PSL in sending to the hardware.
>
> From the above:
> The only way PHL will ever shut down the path to the hardware is when
> there are sufficient PHL packets.
> Corollary:
> The only way PSL will ever shut down the path to the hardware is when
> there are _NO_ PSH packets.
That's not true. Assume PSL has lots of packets, PSH is empty. We
fill the PHL queue until there is no room left, so the driver
has to stop the queue. Now some PSH packets arrive, but the queue
is stopped, no packets will be sent. Now, you can argue that as
soon as the first PHL packet is sent there is room for more and
the queue will be activated again and we'll take PSH packets,
so it doesn't matter because we can't send two packets at once
anyway. Fine. Take three HW queues, prio 0-2. The prio 2 queue
is entirely full, prio 1 has some packets queued and prio 0 is
empty. Now, because prio 2 is completely full, the driver has to
stop the queue. Before it can start it again it has to send all
prio 1 packets and then at least one packet of prio 2. Until
this happens, no packets can be queued to prio 0.
^ permalink raw reply [flat|nested] 153+ messages in thread
* Re: [PATCH] NET: Multiqueue network device support.
2007-06-06 23:32 ` jamal
` (2 preceding siblings ...)
2007-06-06 23:53 ` David Miller
@ 2007-06-11 12:01 ` Patrick McHardy
3 siblings, 0 replies; 153+ messages in thread
From: Patrick McHardy @ 2007-06-11 12:01 UTC (permalink / raw)
To: hadi; +Cc: David Miller, peter.p.waskiewicz.jr, netdev, jeff, auke-jan.h.kok
jamal wrote:
> On Wed, 2007-06-06 at 15:35 -0700, David Miller wrote:
>
>>The problem with this line of thinking is that it ignores the fact
>>that it is bad to not queue to the device when there is space
>>available, _even_ for lower priority packets.
>
>
> So use a different scheduler. Dont use strict prio. Strict prio will
> guarantee starvation of low prio packets as long as there are high prio
> packets. Thats the intent.
With a single queue state _any_ full HW queue will starve all other
queues, independent of the software queueing discipline.
^ permalink raw reply [flat|nested] 153+ messages in thread
* Re: [PATCH] NET: Multiqueue network device support.
2007-06-07 16:59 ` Waskiewicz Jr, Peter P
@ 2007-06-11 12:08 ` Patrick McHardy
0 siblings, 0 replies; 153+ messages in thread
From: Patrick McHardy @ 2007-06-11 12:08 UTC (permalink / raw)
To: Waskiewicz Jr, Peter P
Cc: Stephen Hemminger, David Miller, hadi, netdev, jeff,
Kok, Auke-jan H
Waskiewicz Jr, Peter P wrote:
>>>If they have multiple TX queues, independently programmable, that
>>>single lock is stupid.
>>>
>>>We could use per-queue TX locks for such hardware, but we can't
>>>support that currently.
>>
>>There could be bad packet reordering with this (like some SMP
>>routers used to do).
>
>
> My original multiqueue patches I submitted actually had a per-queue Tx
> lock, but it was removed since the asymmetry in the stack for locking
> was something people didn't like. Locking a queue for ->enqueue(),
> unlocking, then locking for ->dequeue(), unlocking, was something people
> didn't like very much. Also knowing what queue to lock on ->enqueue()
> was where the original ->map_queue() idea came from, since we wanted to
> lock before calling ->enqueue().
I guess there were a few more reasons why people (at least me) didn't
like it. IIRC it didn't include any sch_api locking changes, so it
was completely broken wrt. concurrent configuration changes (easy
fixable though). Additionally it assumed that classification was
deterministic and two classify calls would return the same result,
which is not necessarily true and might have resulted in locking
the wrong queue, and it didn't deal with TC actions doing stuff
to a packet during the first classification.
^ permalink raw reply [flat|nested] 153+ messages in thread
* Re: [PATCH] NET: Multiqueue network device support.
2007-06-11 11:58 ` Patrick McHardy
@ 2007-06-11 12:23 ` jamal
2007-06-11 12:39 ` Patrick McHardy
2007-06-12 9:19 ` Johannes Berg
0 siblings, 2 replies; 153+ messages in thread
From: jamal @ 2007-06-11 12:23 UTC (permalink / raw)
To: Patrick McHardy
Cc: Waskiewicz Jr, Peter P, davem, netdev, jeff, Kok, Auke-jan H
On Mon, 2007-11-06 at 13:58 +0200, Patrick McHardy wrote:
> Thats not true. Assume PSL has lots of packets, PSH is empty. We
> fill the PHL queue until there is no room left, so the driver
> has to stop the queue.
Sure. Packets stashed on any DMA ring are considered "gone to the
wire". That is a very valid assumption to make.
> Now some PSH packets arrive, but the queue
> is stopped, no packets will be sent.
> Now, you can argue that as
> soon as the first PHL packet is sent there is room for more and
> the queue will be activated again and we'll take PSH packets,
_exactly_ ;->
> so it doesn't matter because we can't send two packets at once
> anyway. Fine.
i can see your thought process building -
You are actually following what i am saying;->
> Take three HW queues, prio 0-2. The prio 2 queue
> is entirely full, prio 1 has some packets queued and prio 0 is
> empty. Now, because prio 2 is completely full, the driver has to
> stop the queue. Before it can start it again it has to send all
> prio 1 packets and then at least one packet of prio 2. Until
> this happens, no packets can be queued to prio 0.
The assumption is packets gone to the DMA are gone to the wire, thats
it.
If you have a strict prio scheduler, contention from the stack is only
valid if they both arrive at the same time.
If that happens, then (assuming 0 is more important than 1, which is more
important than 2) 0 always wins over 1, which wins over 2.
Same thing if you queue into hardware and the prioritization is the same.
cheers,
jamal
^ permalink raw reply [flat|nested] 153+ messages in thread
* Re: [PATCH] NET: Multiqueue network device support.
2007-06-11 12:23 ` jamal
@ 2007-06-11 12:39 ` Patrick McHardy
2007-06-11 12:52 ` jamal
2007-06-12 9:19 ` Johannes Berg
1 sibling, 1 reply; 153+ messages in thread
From: Patrick McHardy @ 2007-06-11 12:39 UTC (permalink / raw)
To: hadi; +Cc: Waskiewicz Jr, Peter P, davem, netdev, jeff, Kok, Auke-jan H
jamal wrote:
> On Mon, 2007-11-06 at 13:58 +0200, Patrick McHardy wrote:
>
>
>>Thats not true. Assume PSL has lots of packets, PSH is empty. We
>>fill the PHL queue until there is no room left, so the driver
>>has to stop the queue.
>
>
> > Sure. Packets stashed on any DMA ring are considered "gone to the
> wire". That is a very valid assumption to make.
I disagree, it's obviously not true and leads to the behaviour I
described. If it were true there would be no reason to use multiple
HW TX queues to begin with.
>>[...]
>
> i can see your thought process building -
> You are actually following what i am saying;->
I am :)
^ permalink raw reply [flat|nested] 153+ messages in thread
* Re: [PATCH] NET: Multiqueue network device support.
2007-06-11 12:39 ` Patrick McHardy
@ 2007-06-11 12:52 ` jamal
2007-06-11 13:03 ` Patrick McHardy
0 siblings, 1 reply; 153+ messages in thread
From: jamal @ 2007-06-11 12:52 UTC (permalink / raw)
To: Patrick McHardy
Cc: Waskiewicz Jr, Peter P, davem, netdev, jeff, Kok, Auke-jan H
On Mon, 2007-11-06 at 14:39 +0200, Patrick McHardy wrote:
> jamal wrote:
> > On Mon, 2007-11-06 at 13:58 +0200, Patrick McHardy wrote:
> >
> > Sure. Packets stashed on any DMA ring are considered "gone to the
> > wire". That is a very valid assumption to make.
>
>
> I disagree, its obviously not true
Patrick, you are making too strong a statement. Take a step back:
When you put a packet on the DMA ring, are you ever going to take it
away at some point before it goes to the wire?
> and leads to the behaviour I
> described. If it were true there would be no reason to use multiple
> HW TX queues to begin with.
In the general case, they are totally useless.
They are useful when there's contention/congestion. Even in a shared
media like wireless.
And if there is contention, the qdisc scheduler will do the right thing.
cheers,
jamal
^ permalink raw reply [flat|nested] 153+ messages in thread
* Re: [PATCH] NET: Multiqueue network device support.
2007-06-11 12:52 ` jamal
@ 2007-06-11 13:03 ` Patrick McHardy
2007-06-11 13:29 ` jamal
0 siblings, 1 reply; 153+ messages in thread
From: Patrick McHardy @ 2007-06-11 13:03 UTC (permalink / raw)
To: hadi; +Cc: Waskiewicz Jr, Peter P, davem, netdev, jeff, Kok, Auke-jan H
jamal wrote:
> On Mon, 2007-11-06 at 14:39 +0200, Patrick McHardy wrote:
>
>>>Sure. Packets stashed on any DMA ring are considered "gone to the
>>>wire". That is a very valid assumption to make.
>>
>>
>>I disagree, its obviously not true
>
>
> Patrick, you are making too strong a statement.
Well, it's not.
> Take a step back:
> When you put a packet on the DMA ring, are you ever going to take it
> away at some point before it goes to the wire?
No, but it's nevertheless not on the wire yet and the HW scheduler
controls when it will get there. It might in theory even never get
there if higher priority queues are continuously active.
>>and leads to the behaviour I
>>described. If it were true there would be no reason to use multiple
>>HW TX queues to begin with.
>
>
> In the general case, they are totally useless.
> They are useful when there's contention/congestion. Even in a shared
> media like wireless.
The same is true for any work-conserving queue, software or hardware.
> And if there is contention, the qdisc scheduler will do the right thing.
That ignores a few points that were raised in this thread:
- you can treat each HW queue as an individual network device
- you can avoid synchronizing on a single queue lock for
multiple TX queues
- it is desirable to keep all queues full
^ permalink raw reply [flat|nested] 153+ messages in thread
* Re: [PATCH] NET: Multiqueue network device support.
2007-06-11 13:03 ` Patrick McHardy
@ 2007-06-11 13:29 ` jamal
2007-06-11 14:03 ` Patrick McHardy
0 siblings, 1 reply; 153+ messages in thread
From: jamal @ 2007-06-11 13:29 UTC (permalink / raw)
To: Patrick McHardy
Cc: Waskiewicz Jr, Peter P, davem, netdev, jeff, Kok, Auke-jan H
On Mon, 2007-11-06 at 15:03 +0200, Patrick McHardy wrote:
> jamal wrote:
> Well, its not.
I dont wanna go into those old style debates again; so lets drop this
point.
> > Take a step back:
> > When you put a packet on the DMA ring, are you ever going to take it
> > away at some point before it goes to the wire?
>
>
> No,
> but its nevertheless not on the wire yet and the HW scheduler
> controls when it will get there.
>
> It might in theory even never get
> there if higher priority queues are continuously active.
Sure - but what is wrong with that?
What would be wrong is in the case of contention for a resource like a
wire between a less important packet and a more important packet, the
more important packet gets favored.
Nothing like that ever happens in what i described.
Remember there is no issue if there is no congestion or contention for
local resources.
> > And if there is contention, the qdisc scheduler will do the right thing.
>
>
> That ignores a few points that were raised in this thread,
>
> - you can treat each HW queue as an individual network device
You can treat a pair of tx/rx as a netdev. In which case none of this is
important. You instantiate a different netdev and it only holds the
appropriate locks.
> - you can avoid synchronizing on a single queue lock for
> multiple TX queues
Unneeded if you do what i described. Zero changes to the qdisc code.
> - it is desirable to keep all queues full
It is desirable to keep resources fully utilized. Sometimes that is
achieved by keeping _all_ queues full. If i fill up a single queue
and transmit at wire rate, there is no issue.
cheers,
jamal
^ permalink raw reply [flat|nested] 153+ messages in thread
* Re: [PATCH] NET: Multiqueue network device support.
2007-06-11 13:29 ` jamal
@ 2007-06-11 14:03 ` Patrick McHardy
2007-06-11 14:30 ` Cohen, Guy
2007-06-11 14:40 ` jamal
0 siblings, 2 replies; 153+ messages in thread
From: Patrick McHardy @ 2007-06-11 14:03 UTC (permalink / raw)
To: hadi; +Cc: Waskiewicz Jr, Peter P, davem, netdev, jeff, Kok, Auke-jan H
jamal wrote:
> On Mon, 2007-11-06 at 15:03 +0200, Patrick McHardy wrote:
>
>>>Take a step back:
>>>When you put a packet on the DMA ring, are you ever going to take it
>>>away at some point before it goes to the wire?
>>
>>
>>No, but its nevertheless not on the wire yet and the HW scheduler
>>controls when it will get there.
>>
>>It might in theory even never get
>>there if higher priority queues are continuously active.
>
>
> Sure - but what is wrong with that?
Nothing, this was just to illustrate why I disagree with the assumption
that the packet has hit the wire. On second thought I do agree with your
assumption for the single HW queue case, at the point we hand the packet
to the HW the packet order is determined and is unchangeable. But this
is not the case if the hardware includes its own scheduler. The qdisc
is simply not fully in charge anymore.
> What would be wrong is in the case of contention for a resource like a
> wire between a less important packet and a more important packet, the
> more important packet gets favored.
Read again what I wrote about the n > 2 case. Low priority queues might
starve high priority queues when using a single queue state for a
maximum of the time it takes to service n - 2 queues with max_qlen - 1
packets queued plus the time for a single packet. Thats assuming the
worst case of n - 2 queues with max_qlen - 1 packets and the lowest
priority queue full, so the queue is stopped until we can send at
least one lowest priority packet, which requires fully servicing
all higher priority queues first.
> Nothing like that ever happens in what i described.
> Remember there is no issue if there is no congestion or contention for
> local resources.
Your basic assumption seems to be that the qdisc is still in charge
of when packets get sent. This isn't the case if there is another
scheduler after the qdisc and there is contention in the second
queue.
^ permalink raw reply [flat|nested] 153+ messages in thread
* RE: [PATCH] NET: Multiqueue network device support.
2007-06-11 14:03 ` Patrick McHardy
@ 2007-06-11 14:30 ` Cohen, Guy
2007-06-11 14:38 ` Patrick McHardy
2007-06-11 14:48 ` jamal
2007-06-11 14:40 ` jamal
1 sibling, 2 replies; 153+ messages in thread
From: Cohen, Guy @ 2007-06-11 14:30 UTC (permalink / raw)
To: Patrick McHardy, hadi
Cc: Waskiewicz Jr, Peter P, davem, netdev, jeff, Kok, Auke-jan H
Patrick McHardy wrote:
> jamal wrote:
> > Sure - but what is wrong with that?
>
>
> Nothing, this was just to illustrate why I disagree with the assumption
> that the packet has hit the wire. On second thought I do agree with your
> assumption for the single HW queue case, at the point we hand the packet
> to the HW the packet order is determined and is unchangeable. But this
> is not the case if the hardware includes its own scheduler. The qdisc
> is simply not fully in charge anymore.
For WiFi devices the HW often implements the scheduling, especially when
QoS (WMM/11e/11n) is implemented. There are few traffic queues defined
by the specs and the selection of the next queue to transmit a packet
from, is determined in real time, just when there is a tx opportunity.
This cannot be predicted in advance since it depends on the medium usage
of other stations.
Hence, to make it possible for wireless devices to use the qdisc
mechanism properly, the HW queues should _ALL_ be non-empty at all
times, whenever data is available in the upper layers. Or in other
words, the upper layers should not block a specific queue because of the
usage of any other queue.
>
> Your basic assumption seems to be that the qdisc is still in charge
> of when packets get sent. This isn't the case if there is another
> scheduler after the qdisc and there is contention in the second
> queue.
Which is often the case in wireless devices - transmission scheduling is
done in HW.
^ permalink raw reply [flat|nested] 153+ messages in thread
* Re: [PATCH] NET: Multiqueue network device support.
2007-06-11 14:30 ` Cohen, Guy
@ 2007-06-11 14:38 ` Patrick McHardy
2007-06-11 14:48 ` jamal
1 sibling, 0 replies; 153+ messages in thread
From: Patrick McHardy @ 2007-06-11 14:38 UTC (permalink / raw)
To: Cohen, Guy
Cc: hadi, Waskiewicz Jr, Peter P, davem, netdev, jeff,
Kok, Auke-jan H
Cohen, Guy wrote:
> Patrick McHardy wrote:
>
>>jamal wrote:
>>
>>>Sure - but what is wrong with that?
>>
>>
>>Nothing, this was just to illustrate why I disagree with the assumption
>>that the packet has hit the wire. On second thought I do agree with your
>>assumption for the single HW queue case, at the point we hand the packet
>>to the HW the packet order is determined and is unchangeable. But this
>>is not the case if the hardware includes its own scheduler. The qdisc
>>is simply not fully in charge anymore.
>
>
> For WiFi devices the HW often implements the scheduling, especially when
> QoS (WMM/11e/11n) is implemented. There are few traffic queues defined
> by the specs and the selection of the next queue to transmit a packet
> from, is determined in real time, just when there is a tx opportunity.
> This cannot be predicted in advance since it depends on the medium usage
> of other stations.
>
> Hence, to make it possible for wireless devices to use the qdisc
> mechanism properly, the HW queues should _ALL_ be non-empty at all
> times, whenever data is available in the upper layers. Or in other
> words, the upper layers should not block a specific queue because of the
> usage of any other queue.
That's exactly what I'm saying. And it's not possible with a single
queue state, as I tried to explain in my last mail.
^ permalink raw reply [flat|nested] 153+ messages in thread
* Re: [PATCH] NET: Multiqueue network device support.
2007-06-11 14:03 ` Patrick McHardy
2007-06-11 14:30 ` Cohen, Guy
@ 2007-06-11 14:40 ` jamal
2007-06-11 14:49 ` Patrick McHardy
1 sibling, 1 reply; 153+ messages in thread
From: jamal @ 2007-06-11 14:40 UTC (permalink / raw)
To: Patrick McHardy
Cc: Waskiewicz Jr, Peter P, davem, netdev, jeff, Kok, Auke-jan H
On Mon, 2007-11-06 at 16:03 +0200, Patrick McHardy wrote:
> jamal wrote:
> > Sure - but what is wrong with that?
>
> Nothing, this was just to illustrate why I disagree with the assumption
> that the packet has hit the wire.
fair enough.
> On second thought I do agree with your
> assumption for the single HW queue case, at the point we hand the packet
> to the HW the packet order is determined and is unchangeable. But this
> is not the case if the hardware includes its own scheduler. The qdisc
> is simply not fully in charge anymore.
i am making the case that it does not affect the overall results
as long as you use the same parameterization on qdisc and hardware.
If in fact the qdisc high prio packets made it to the driver before
they make it out onto the wire, it is probably a good thing
that the hardware scheduler starves the low prio packets.
> Read again what I wrote about the n > 2 case. Low priority queues might
> starve high priority queues when using a single queue state for a
> maximum of the time it takes to service n - 2 queues with max_qlen - 1
> packets queued plus the time for a single packet. Thats assuming the
> worst case of n - 2 queues with max_qlen - 1 packets and the lowest
> priority queue full, so the queue is stopped until we can send at
> least one lowest priority packet, which requires to fully service
> all higher priority queues previously.
I didnt quite follow the above - I will try re-reading your
other email to see if i can make sense of it.
> Your basic assumption seems to be that the qdisc is still in charge
> of when packets get sent. This isn't the case if there is another
> scheduler after the qdisc and there is contention in the second
> queue.
My basic assumption is if you use the same scheduler in both the
hardware and qdisc, configured the same number of queues and
mapped the same priorities then you dont need to make any changes
to the qdisc code. If i have a series of routers through which a packet
traverses to its destination with the same qos parameters i also achieve
the same results.
cheers,
jamal
^ permalink raw reply [flat|nested] 153+ messages in thread
* RE: [PATCH] NET: Multiqueue network device support.
2007-06-11 14:30 ` Cohen, Guy
2007-06-11 14:38 ` Patrick McHardy
@ 2007-06-11 14:48 ` jamal
2007-06-11 15:00 ` Tomas Winkler
2007-06-11 15:34 ` Cohen, Guy
1 sibling, 2 replies; 153+ messages in thread
From: jamal @ 2007-06-11 14:48 UTC (permalink / raw)
To: Cohen, Guy
Cc: Patrick McHardy, Waskiewicz Jr, Peter P, davem, netdev, jeff,
Kok, Auke-jan H
On Mon, 2007-11-06 at 17:30 +0300, Cohen, Guy wrote:
>
> For WiFi devices the HW often implements the scheduling, especially when
> QoS (WMM/11e/11n) is implemented. There are few traffic queues defined
> by the specs and the selection of the next queue to transmit a packet
> from, is determined in real time, just when there is a tx opportunity.
> This cannot be predicted in advance since it depends on the medium usage
> of other stations.
WMM is a strict prio mechanism.
The parametrization very much favors the high prio packets when the
tx opportunity to send shows up.
> Hence, to make it possible for wireless devices to use the qdisc
> mechanism properly, the HW queues should _ALL_ be non-empty at all
> times, whenever data is available in the upper layers.
agreed.
> Or in other
> words, the upper layers should not block a specific queue because of the
> usage of any other queue.
This is where we are going to disagree.
There is no way the stack will send the driver packets which are low
prio if there are some which are high prio. There is therefore, on
contention between low and high prio, no way for low prio packets to
obstruct the high prio packets; however, it is feasible that high prio
packets will obstruct low prio packets (which is fine).
cheers,
jamal
^ permalink raw reply [flat|nested] 153+ messages in thread
* Re: [PATCH] NET: Multiqueue network device support.
2007-06-11 14:40 ` jamal
@ 2007-06-11 14:49 ` Patrick McHardy
2007-06-11 15:05 ` jamal
0 siblings, 1 reply; 153+ messages in thread
From: Patrick McHardy @ 2007-06-11 14:49 UTC (permalink / raw)
To: hadi; +Cc: Waskiewicz Jr, Peter P, davem, netdev, jeff, Kok, Auke-jan H
jamal wrote:
> On Mon, 2007-11-06 at 16:03 +0200, Patrick McHardy wrote:
>
>>Read again what I wrote about the n > 2 case. Low priority queues might
>>starve high priority queues when using a single queue state for a
>>maximum of the time it takes to service n - 2 queues with max_qlen - 1
>>packets queued plus the time for a single packet. Thats assuming the
>>worst case of n - 2 queues with max_qlen - 1 packets and the lowest
>>priority queue full, so the queue is stopped until we can send at
>>least one lowest priority packet, which requires to fully service
>>all higher priority queues previously.
>
>
> I didnt quiet follow the above - I will try retrieving reading your
> other email to see if i can make sense of it.
Let me explain with some ASCII art :)
We have n empty HW queues with a maximum length of m packets per queue:
[0] empty
[1] empty
[2] empty
..
[n-1] empty
Now we receive m - 1 packets for each priority >= 1 and < n - 1,
so we have:
[0] empty
[1] m - 1 packets
[2] m - 1 packets
..
[n-2] m - 1 packets
[n] empty
Since no queue is completely full, the queue is still active.
Now we receive m packets of priority n:
[0] empty
[1] m - 1 packets
[2] m - 1 packets
..
[n-2] m - 1 packets
[n-1] m packets
At this point the queue needs to be stopped since the highest
priority queue is entirely full. To start it again at least
one packet of queue n - 1 needs to be sent, which (assuming
strict priority) requires that queues 1 to n - 2 are serviced
first. So any prio 0 packets arriving during this period will
sit in the qdisc and will not reach the device for a possibly
quite long time. With multiple queue states we'd know that
queue 0 can still take packets.
>>Your basic assumption seems to be that the qdisc is still in charge
>>of when packets get sent. This isn't the case if there is another
>>scheduler after the qdisc and there is contention in the second
>>queue.
>
>
> My basic assumption is if you use the same scheduler in both the
> hardware and qdisc, configured the same number of queues and
> mapped the same priorities then you dont need to make any changes
> to the qdisc code. If i have a series of routers through which a packet
> traverses to its destination with the same qos parameters i also achieve
> the same results.
Did my example above convince you that this is not the case?
^ permalink raw reply [flat|nested] 153+ messages in thread
* Re: [PATCH] NET: Multiqueue network device support.
2007-06-11 14:48 ` jamal
@ 2007-06-11 15:00 ` Tomas Winkler
2007-06-11 15:14 ` jamal
2007-06-11 15:34 ` Cohen, Guy
1 sibling, 1 reply; 153+ messages in thread
From: Tomas Winkler @ 2007-06-11 15:00 UTC (permalink / raw)
To: hadi
Cc: Cohen, Guy, Patrick McHardy, Waskiewicz Jr, Peter P, davem,
netdev, jeff, Kok, Auke-jan H
On 6/11/07, jamal <hadi@cyberus.ca> wrote:
> On Mon, 2007-11-06 at 17:30 +0300, Cohen, Guy wrote:
>
> >
> > For WiFi devices the HW often implements the scheduling, especially when
> > QoS (WMM/11e/11n) is implemented. There are few traffic queues defined
> > by the specs and the selection of the next queue to transmit a packet
> > from, is determined in real time, just when there is a tx opportunity.
> > This cannot be predicted in advance since it depends on the medium usage
> > of other stations.
>
> WMM is a strict prio mechanism.
> The parametrization very much favors the high prio packets when the
> tx opportunity to send shows up.
>
This is not true, there is no simple priority order from 1 to 4,
rather a set of parameters that determines access to the medium. You have
to emulate medium behavior to schedule packets in the correct order. That's
why this is pushed to HW, otherwise nobody would invest money in this
part of silicon :)
> > Hence, to make it possible for wireless devices to use the qdisc
> > mechanism properly, the HW queues should _ALL_ be non-empty at all
> > times, whenever data is available in the upper layers.
>
> agreed.
>
> > Or in other
> > words, the upper layers should not block a specific queue because of the
> > usage of any other queue.
>
> This is where we are going to disagree.
> There is no way the stack will send the driver packets which are low
> prio if there are some which are high prio. There is therefore, on
> contention between low and high prio, no way for low prio packets to
> obstruct the high prio packets; however, it is feasible that high prio
> packets will obstruct low prio packets (which is fine).
>
> cheers,
> jamal
>
^ permalink raw reply [flat|nested] 153+ messages in thread
* Re: [PATCH] NET: Multiqueue network device support.
2007-06-11 14:49 ` Patrick McHardy
@ 2007-06-11 15:05 ` jamal
2007-06-11 15:12 ` Patrick McHardy
0 siblings, 1 reply; 153+ messages in thread
From: jamal @ 2007-06-11 15:05 UTC (permalink / raw)
To: Patrick McHardy
Cc: Waskiewicz Jr, Peter P, davem, netdev, jeff, Kok, Auke-jan H
On Mon, 2007-11-06 at 16:49 +0200, Patrick McHardy wrote:
> Let me explain with some ASCII art :)
Ok ;->
> We have n empty HW queues with a maximum length of m packets per queue:
>
> [0] empty
> [1] empty
> [2] empty
> ..
> [n-1] empty
>
Assuming 0 i take it is higher prio than n-1.
>>Now we receive m - 1 packets for each priority >= 1 and < n - 1,
> so we have:
>
> [0] empty
> [1] m - 1 packets
> [2] m - 1 packets
> ..
> [n-2] m - 1 packets
> [n] empty
>
> Since no queue is completely full, the queue is still active.
and packets are being fired on the wire by the driver etc ...
> Now we receive m packets of priority n:
n-1 (i think?)
> [0] empty
> [1] m - 1 packets
> [2] m - 1 packets
> ..
> [n-2] m - 1 packets
> [n-1] m packets
>
> At this point the queue needs to be stopped since the highest
> priority queue is entirely full.
ok, so 0 is lower prio than n-1
> To start it again at least
> one packet of queue n - 1 needs to be sent,
following so far ...
> which (assuming
> strict priority) requires that queues 1 to n - 2 are serviced
> first.
Ok, so let me revert that; 0 is higher prio than n-1.
> So any prio 0 packets arriving during this period will
> sit in the qdisc and will not reach the device for a possibly
> quite long time.
"possibly long time" is where we diverge ;->
If you throw the burden to the driver (as i am recommending in all my
arguments so far), it should open up sooner based on priorities.
I didnt wanna bring this earlier because it may take the discussion in
the wrong direction.
So in your example if n-1 shuts down the driver, then it is up to the
driver to open it up if any higher prio packet makes it out.
> With multiple queue states we'd know that
> queue 0 can still take packets.
And with what i described you dont make any such changes to the core;
the burden is on the driver.
cheers,
jamal
^ permalink raw reply [flat|nested] 153+ messages in thread
* Re: [PATCH] NET: Multiqueue network device support.
2007-06-11 15:05 ` jamal
@ 2007-06-11 15:12 ` Patrick McHardy
2007-06-11 15:25 ` jamal
0 siblings, 1 reply; 153+ messages in thread
From: Patrick McHardy @ 2007-06-11 15:12 UTC (permalink / raw)
To: hadi; +Cc: Waskiewicz Jr, Peter P, davem, netdev, jeff, Kok, Auke-jan H
jamal wrote:
> On Mon, 2007-11-06 at 16:49 +0200, Patrick McHardy wrote:
>
>>We have n empty HW queues with a maximum length of m packets per queue:
>>
>>[0] empty
>>[1] empty
>>[2] empty
>>..
>>[n-1] empty
>
>
> Asumming 0 i take it is higher prio than n-1.
Yes.
>>Now we receive m - 1 packets for each priority >= 1 and < n - 1,
>>so we have:
>>
>>[0] empty
>>[1] m - 1 packets
>>[2] m - 1 packets
>>..
>>[n-2] m - 1 packets
>>[n] empty
>>
>>Since no queue is completely full, the queue is still active.
>>Now we receive m packets of priority n:
>
>
> n-1 (i think?)
Right.
>>[0] empty
>>[1] m - 1 packets
>>[2] m - 1 packets
>>..
>>[n-2] m - 1 packets
>>[n-1] m packets
>>
>>At this point the queue needs to be stopped since the highest
>>priority queue is entirely full.
>
>
> ok, so 0 is lower prio than n-1
Higher priority. But we don't know what the priority of the
next packet is going to be, so we have to stop the entire
qdisc anyway.
>>To start it again at least one packet of queue n - 1 needs to be sent,
>
>
> following so far ...
>
>
>>which (assuming
>>strict priority) requires that queues 1 to n - 2 are serviced
>>first.
>
>
> Ok, so let me revert that; 0 is higher prio than n-1.
Yes.
>>So any prio 0 packets arriving during this period will
>>sit in the qdisc and will not reach the device for a possibly
>>quite long time.
>
>
> "possibly long time" is where we diverge ;->
Worst case is (n - 2) * (m - 1) + 1 full-sized packet transmission
times.
You can do the math yourself, but we're talking about potentially
a lot of packets.
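To put illustrative numbers on it (values picked purely as an example):
with n = 4 hardware queues and rings of m = 256 descriptors, that is
(4 - 2) * (256 - 1) + 1 = 511 full-sized packets; at 1500 bytes each on
a 1 Gbit/s link that is roughly 6 ms during which a newly arriving
prio 0 packet cannot reach the hardware.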
> If you throw the burden to the driver (as i am recommending in all my
> arguments so far), it should open up sooner based on priorities.
> I didnt wanna bring this earlier because it may take the discussion in
> the wrong direction.
> So in your example if n-1 shuts down the driver, then it is up to the
> driver to open it up if any higher prio packet makes it out.
How could it do that? n-1 is still completely full and you don't
know what the next packet is going to be. Are you proposing to
simply throw the packet away in the driver even though it's within
the configured limits of the qdisc?
^ permalink raw reply [flat|nested] 153+ messages in thread
* Re: [PATCH] NET: Multiqueue network device support.
2007-06-11 15:00 ` Tomas Winkler
@ 2007-06-11 15:14 ` jamal
0 siblings, 0 replies; 153+ messages in thread
From: jamal @ 2007-06-11 15:14 UTC (permalink / raw)
To: Tomas Winkler
Cc: Cohen, Guy, Patrick McHardy, Waskiewicz Jr, Peter P, davem,
netdev, jeff, Kok, Auke-jan H
On Mon, 2007-11-06 at 18:00 +0300, Tomas Winkler wrote:
> On 6/11/07, jamal <hadi@cyberus.ca> wrote:
> > On Mon, 2007-11-06 at 17:30 +0300, Cohen, Guy wrote:
> >
> > >
> > > For WiFi devices the HW often implements the scheduling, especially when
> > > QoS (WMM/11e/11n) is implemented. There are few traffic queues defined
> > > by the specs and the selection of the next queue to transmit a packet
> > > from, is determined in real time, just when there is a tx opportunity.
> > > This cannot be predicted in advance since it depends on the medium usage
> > > of other stations.
> >
> > WMM is a strict prio mechanism.
> > The parametrization very much favors the high prio packets when the
> > tx opportunity to send shows up.
> >
>
> This is not true, there is no simple priority order from 1 to 4 ,
> rather set of parameters that dermises access to medium. You have to
> emulate medium behavior to schedule packets in correct order. That's
> why this pushed to HW, otherwise nobody would invest money in this
> part of silicon :)
>
I dont have the specs, nor am i arguing the value of having the
scheduler in hardware. (I think the contention over the radio clearly
needs the scheduler in hardware).
But i have read a couple of papers on people simulating this in s/ware.
And have seen people describe the default parametrization;
for example Slide 43 of:
http://madwifi.org/attachment/wiki/ChipsetFeatures/WMM/qos11e.pdf?format=raw
seems to indicate the default parameters for the different timers
are clearly strictly in favor of you if you have higher prio.
If the info quoted is correct, it doesnt change anything i have said so
far.
i.e it is strict prio scheduling with some statistical chance a low prio
packet will make it.
cheers,
jamal
^ permalink raw reply [flat|nested] 153+ messages in thread
* Re: [PATCH] NET: Multiqueue network device support.
2007-06-11 15:12 ` Patrick McHardy
@ 2007-06-11 15:25 ` jamal
2007-06-11 15:44 ` Patrick McHardy
0 siblings, 1 reply; 153+ messages in thread
From: jamal @ 2007-06-11 15:25 UTC (permalink / raw)
To: Patrick McHardy
Cc: Waskiewicz Jr, Peter P, davem, netdev, jeff, Kok, Auke-jan H
On Mon, 2007-11-06 at 17:12 +0200, Patrick McHardy wrote:
> > Ok, so let me revert that; 0 is higher prio than n-1.
>
>
> Yes.
>
Ok, gotcha.
> > "possibly long time" is where we diverge ;->
>
> Worst cast is (n - 2) * (m - 1) + 1 full sized packet transmission
> times.
>
> You can do the math yourself, but we're talking about potentially
> a lot of packets.
I agree if you use the strategy of "a ring shut down" implies
"dont wake up until the ring that caused the shutdown opens up".
What i am saying below is to make a change to that strategy.
> > If you throw the burden to the driver (as i am recommending in all my
> > arguements so far), it should open up sooner based on priorities.
> > I didnt wanna bring this earlier because it may take the discussion in
> > the wrong direction.
> > So in your example if n-1 shuts down the driver, then it is upto to the
> > driver to open it up if any higher prio packet makes it out.
>
>
> How could it do that? n-1 is still completely full and you don't
> know what the next packet is going to be. Are you proposing to
> simply throw the packet away in the driver even though its within
> the configured limits of the qdisc?
No no Patrick - i am just saying the following:
- let the driver shut down whenever a ring is full. Remember which ring X
shut it down.
- when you get a tx interrupt or prune tx descriptors, if a ring <= X has
transmitted a packet (or a threshold of packets), then wake up the driver
(i.e. open up).
In the meantime packets from the stack are sitting on the qdisc and will
be sent when the driver opens up.
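Roughly, as an untested sketch of that strategy (struct nic_priv and
clean_tx_ring() are made up for illustration; only netif_stop_queue(),
netif_wake_queue() and netif_queue_stopped() are the real
single-queue-state API):

#include <linux/netdevice.h>

struct nic_priv {
	struct net_device *dev;
	int num_rings;
	int stopped_by;			/* ring that caused the last stop */
	/* ... per-ring descriptor state ... */
};

/* xmit path: called when ring r has no room left for another packet */
static void nic_ring_stop(struct nic_priv *priv, int r)
{
	priv->stopped_by = r;		/* remember which ring shut us down */
	netif_stop_queue(priv->dev);
}

/* tx-clean path (tx interrupt / descriptor pruning) */
static void nic_tx_clean(struct nic_priv *priv)
{
	int r;

	for (r = 0; r < priv->num_rings; r++) {
		int cleaned = clean_tx_ring(priv, r);	/* made-up helper */

		/* a ring of equal or higher priority than the one that
		 * stopped us made progress: open the device back up and
		 * let the qdisc decide what to hand down next */
		if (cleaned && netif_queue_stopped(priv->dev) &&
		    r <= priv->stopped_by)
			netif_wake_queue(priv->dev);
	}
}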
Anyways, I have to run to work; thanks for keeping the discussion at the
level you did.
cheers,
jamal
^ permalink raw reply [flat|nested] 153+ messages in thread
* RE: [PATCH] NET: Multiqueue network device support.
2007-06-11 14:48 ` jamal
2007-06-11 15:00 ` Tomas Winkler
@ 2007-06-11 15:34 ` Cohen, Guy
2007-06-11 22:22 ` jamal
1 sibling, 1 reply; 153+ messages in thread
From: Cohen, Guy @ 2007-06-11 15:34 UTC (permalink / raw)
To: hadi
Cc: Patrick McHardy, Waskiewicz Jr, Peter P, davem, netdev, jeff,
Kok, Auke-jan H
Some more details inside regarding wireless QoS.
jamal wrote:
> On Mon, 2007-11-06 at 17:30 +0300, Cohen, Guy wrote:
>
> >
> > For WiFi devices the HW often implements the scheduling, especially when
> > QoS (WMM/11e/11n) is implemented. There are few traffic queues defined
> > by the specs and the selection of the next queue to transmit a packet
> > from, is determined in real time, just when there is a tx opportunity.
> > This cannot be predicted in advance since it depends on the medium usage
> > of other stations.
>
> WMM is a strict prio mechanism.
> The parametrization very much favors the high prio packets when the
> tx opportunity to send shows up.
Sorry, but this is not as simple as you describe it. WMM is much more
complicated. WMM defines the HW queues as virtually multiple clients
that compete for medium access individually. Each implements a
contention-based medium access. The Access Point publishes to the
clients the medium access parameters (e.g. back off parameters) that are
different for each access category (virtual client). There is _not_ a
strict priority assigned to each access category. The behavior of each
access category totally depends on the medium usage of other clients and
is totally different for each access category. This cannot be predicted
at the host SW.
> > Hence, to make it possible for wireless devices to use the qdisc
> > mechanism properly, the HW queues should _ALL_ be non-empty at all
> > times, whenever data is available in the upper layers.
>
> agreed.
>
> > Or in other
> > words, the upper layers should not block a specific queue because of the
> > usage of any other queue.
>
> This is where we are going to disagree.
> There is no way the stack will send the driver packets which are low
> prio if there are some which are high prio. There is therefore, on
> contention between low and high prio, no way for low prio packets to
> obstruct the high prio packets;
And this is not the right behavior for a WLAN stack. QoS in WLAN doesn't
favor strictly one access category over another, but defines some softer
and smarter prioritization. This is implemented in the HW/Firmware. I
just think that providing per-queue controls (start/stop) will allow
WLAN drivers/Firmware/HW to do that while still using qdisc (and it will
work properly even when one queue is full and others are empty).
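With the per-queue interface from the patch under discussion, that would
look roughly like this in a wireless driver (ac_ring_full() and
ac_ring_has_room() are made-up placeholders; netif_wake_subqueue() and
netif_subqueue_stopped() appear in the patch, and a matching
netif_stop_subqueue() is assumed):

/* xmit path: only the access category whose ring filled up backs off */
if (ac_ring_full(priv, ac))
	netif_stop_subqueue(dev, ac);

/* tx-clean path for that access category */
if (netif_subqueue_stopped(dev, ac) && ac_ring_has_room(priv, ac))
	netif_wake_subqueue(dev, ac);

The other access categories keep receiving packets from the qdisc, so
the HW scheduler always has something to contend with on the air.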
> however, it is feasible that high prio
> packets will obstruct low prio packets (which is fine).
No, this is _not_ fine. Just to emphasize again, WMM doesn't define
priority in the way it is implemented in airplane boarding (Pilots
first, Business passengers next, coach passengers at the end), but more
like _distributed_ weights prioritization (between all the multiple
queues of all the clients on the channel).
As a side note, in one of the WFA WMM certification tests, the AP
changes the medium access parameters of the access categories in a way
that favors a lower access category. This is something very soft that
cannot be reflected in any intuitive way in the host SW.
> cheers,
> jamal
^ permalink raw reply [flat|nested] 153+ messages in thread
* Re: [PATCH] NET: Multiqueue network device support.
2007-06-11 15:25 ` jamal
@ 2007-06-11 15:44 ` Patrick McHardy
2007-06-11 21:35 ` jamal
0 siblings, 1 reply; 153+ messages in thread
From: Patrick McHardy @ 2007-06-11 15:44 UTC (permalink / raw)
To: hadi; +Cc: Waskiewicz Jr, Peter P, davem, netdev, jeff, Kok, Auke-jan H
jamal wrote:
> On Mon, 2007-11-06 at 17:12 +0200, Patrick McHardy wrote:
>
>>Worst case is (n - 2) * (m - 1) + 1 full-sized packet transmission
>>times.
>>
>>You can do the math yourself, but we're talking about potentially
>>a lot of packets.
>
>
> I agree if you use the strategy of "a ring shutdown down" implies
> "dont wake up until the ring that caused the shutdown opens up"
> What i am saying below is to make a change to that strategy.
Glad we agree on something. Now all I have to do is convince you that
a change to this strategy is not a good idea :)
>>>If you throw the burden to the driver (as i am recommending in all my
>>>arguments so far), it should open up sooner based on priorities.
>>>I didnt wanna bring this earlier because it may take the discussion in
>>>the wrong direction.
>>>So in your example if n-1 shuts down the driver, then it is up to the
>>>driver to open it up if any higher prio packet makes it out.
>>
>>
>>How could it do that? n-1 is still completely full and you don't
>>know what the next packet is going to be. Are you proposing to
>>simply throw the packet away in the driver even though its within
>>the configured limits of the qdisc?
>
>
> No no Patrick - i am just saying the following:
> - let the driver shut down whenever a ring is full. Remember which ring X
> shut it down.
> - when you get a tx interrupt or prune tx descriptors, if a ring <= X has
> transmitted a packet (or threshold of packets), then wake up the driver
> (i.e open up).
At this point the qdisc might send new packets. What do you do when a
packet for a full ring arrives?
I see three choices:
- drop it, even though it's still within the qdisc's configured limits
- requeue it, which does not work because the qdisc is still active
and might just hand you the same packet over and over again in a
busy loop, until the ring has more room (which has the same worst
case, just that we're sitting in a busy loop now).
- requeue and stop the queue: we're back to where we started since
now higher priority packets will not get passed to the driver.
^ permalink raw reply [flat|nested] 153+ messages in thread
* Re: [PATCH] NET: Multiqueue network device support.
2007-06-04 21:40 ` [PATCH] NET: Multiqueue network device support PJ Waskiewicz
2007-06-05 11:50 ` jamal
@ 2007-06-11 17:36 ` Patrick McHardy
2007-06-11 18:05 ` Waskiewicz Jr, Peter P
2007-06-13 18:34 ` Waskiewicz Jr, Peter P
2007-06-11 17:52 ` Patrick McHardy
2 siblings, 2 replies; 153+ messages in thread
From: Patrick McHardy @ 2007-06-11 17:36 UTC (permalink / raw)
To: PJ Waskiewicz; +Cc: davem, netdev, jeff, auke-jan.h.kok
PJ Waskiewicz wrote:
> diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
> index f28bb2d..b9dc2a6 100644
> --- a/net/sched/sch_generic.c
> +++ b/net/sched/sch_generic.c
> @@ -123,7 +123,8 @@ static inline int qdisc_restart(struct net_device *dev)
> /* And release queue */
> spin_unlock(&dev->queue_lock);
>
> - if (!netif_queue_stopped(dev)) {
> + if (!netif_queue_stopped(dev) &&
> + !netif_subqueue_stopped(dev, skb->queue_mapping)) {
> int ret;
>
> ret = dev_hard_start_xmit(skb, dev);
Your patch doesn't update any other users of netif_queue_stopped().
The assumption that they can pass packets to the driver when the
queue is running is no longer valid since they don't know whether
the subqueue the packet will end up in is active (it might be
different from queue 0 if packets were redirected from a multiqueue
aware qdisc through TC actions). So they need to be changed to
check the subqueue state as well.
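Something like the following helper (name invented, purely to
illustrate; the check mirrors the one your patch adds in
qdisc_restart()) would let those callers test both states in one place:

static inline int netif_tx_queue_ok(struct net_device *dev,
				    struct sk_buff *skb)
{
	return !netif_queue_stopped(dev) &&
	       !netif_subqueue_stopped(dev, skb->queue_mapping);
}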
BTW, I couldn't find anything but a single netif_wake_subqueue
in your (old) e1000 patch. Why doesn't it stop subqueues?
^ permalink raw reply [flat|nested] 153+ messages in thread
* Re: [PATCH] NET: Multiqueue network device support.
2007-06-04 21:40 ` [PATCH] NET: Multiqueue network device support PJ Waskiewicz
2007-06-05 11:50 ` jamal
2007-06-11 17:36 ` Patrick McHardy
@ 2007-06-11 17:52 ` Patrick McHardy
2007-06-11 17:57 ` Waskiewicz Jr, Peter P
2 siblings, 1 reply; 153+ messages in thread
From: Patrick McHardy @ 2007-06-11 17:52 UTC (permalink / raw)
To: PJ Waskiewicz; +Cc: davem, netdev, jeff, auke-jan.h.kok
PJ Waskiewicz wrote:
> diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
> index e7367c7..8bcd870 100644
> --- a/include/linux/skbuff.h
> +++ b/include/linux/skbuff.h
> @@ -215,6 +215,7 @@ typedef unsigned char *sk_buff_data_t;
> * @pkt_type: Packet class
> * @fclone: skbuff clone status
> * @ip_summed: Driver fed us an IP checksum
> + * @queue_mapping: Queue mapping for multiqueue devices
> * @priority: Packet queueing priority
> * @users: User count - see {datagram,tcp}.c
> * @protocol: Packet protocol from driver
> @@ -269,6 +270,7 @@ struct sk_buff {
> __u16 csum_offset;
> };
> };
> + __u16 queue_mapping;
> __u32 priority;
> __u8 local_df:1,
> cloned:1,
I think we can reuse skb->priority. Assuming only real hardware
devices use multiqueue support, there should be no user of
skb->priority after egress qdisc classification. The only reason
to preserve it in the qdisc layer is for software devices.
Grepping through drivers/net shows a few users, but most seem
to be using it on the RX path and some use it to store internal
data.
^ permalink raw reply [flat|nested] 153+ messages in thread
* RE: [PATCH] NET: Multiqueue network device support.
2007-06-11 17:52 ` Patrick McHardy
@ 2007-06-11 17:57 ` Waskiewicz Jr, Peter P
2007-06-11 18:05 ` Patrick McHardy
0 siblings, 1 reply; 153+ messages in thread
From: Waskiewicz Jr, Peter P @ 2007-06-11 17:57 UTC (permalink / raw)
To: Patrick McHardy; +Cc: davem, netdev, jeff, Kok, Auke-jan H
> PJ Waskiewicz wrote:
> > diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index
> > e7367c7..8bcd870 100644
> > --- a/include/linux/skbuff.h
> > +++ b/include/linux/skbuff.h
> > @@ -215,6 +215,7 @@ typedef unsigned char *sk_buff_data_t;
> > * @pkt_type: Packet class
> > * @fclone: skbuff clone status
> > * @ip_summed: Driver fed us an IP checksum
> > + * @queue_mapping: Queue mapping for multiqueue devices
> > * @priority: Packet queueing priority
> > * @users: User count - see {datagram,tcp}.c
> > * @protocol: Packet protocol from driver
> > @@ -269,6 +270,7 @@ struct sk_buff {
> > __u16 csum_offset;
> > };
> > };
> > + __u16 queue_mapping;
> > __u32 priority;
> > __u8 local_df:1,
> > cloned:1,
>
>
> I think we can reuse skb->priority. Assuming only real
> hardware devices use multiqueue support, there should be no user of
> skb->priority after egress qdisc classification. The only reason
> to preserve it in the qdisc layer is for software devices.
That would be outstanding.
> Grepping through drivers/net shows a few users, but most seem
> to be using it on the RX path and some use it to store internal data.
Thank you for hunting this down. I will test on my little environment
here to see if I run into any issues.
-PJ
^ permalink raw reply [flat|nested] 153+ messages in thread
* RE: [PATCH] NET: Multiqueue network device support.
2007-06-11 17:36 ` Patrick McHardy
@ 2007-06-11 18:05 ` Waskiewicz Jr, Peter P
2007-06-11 18:07 ` Patrick McHardy
2007-06-13 18:34 ` Waskiewicz Jr, Peter P
1 sibling, 1 reply; 153+ messages in thread
From: Waskiewicz Jr, Peter P @ 2007-06-11 18:05 UTC (permalink / raw)
To: Patrick McHardy; +Cc: davem, netdev, jeff, Kok, Auke-jan H
> PJ Waskiewicz wrote:
> > diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
> > index f28bb2d..b9dc2a6 100644
> > --- a/net/sched/sch_generic.c
> > +++ b/net/sched/sch_generic.c
> > @@ -123,7 +123,8 @@ static inline int qdisc_restart(struct net_device *dev)
> > /* And release queue */
> > spin_unlock(&dev->queue_lock);
> >
> > - if (!netif_queue_stopped(dev)) {
> > + if (!netif_queue_stopped(dev) &&
> > +     !netif_subqueue_stopped(dev, skb->queue_mapping)) {
> > int ret;
> >
> > ret = dev_hard_start_xmit(skb, dev);
> > int ret;
> >
> > ret = dev_hard_start_xmit(skb, dev);
>
>
> Your patch doesn't update any other users of netif_queue_stopped().
> The assumption that they can pass packets to the driver when
> the queue is running is no longer valid since they don't know
> whether the subqueue the packet will end up in is active (it
> might be different from queue 0 if packets were redirected
> from a multiqueue aware qdisc through TC actions). So they
> need to be changed to check the subqueue state as well.
I will look at all these cases and change them accordingly. Thanks for
catching that.
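(For reference, the pattern each of those callers needs is roughly the
sketch below; my_dev_can_xmit() is just an illustrative helper name and
netif_subqueue_stopped() is used the same way as in the qdisc_restart()
hunk quoted above.)

#include <linux/netdevice.h>
#include <linux/skbuff.h>

static inline int my_dev_can_xmit(struct net_device *dev,
				  struct sk_buff *skb)
{
	/* Check the global queue state and the state of the subqueue
	 * this skb is mapped to before handing it to the driver.
	 */
	return !netif_queue_stopped(dev) &&
	       !netif_subqueue_stopped(dev, skb->queue_mapping);
}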
> BTW, I couldn't find anything but a single
> netif_wake_subqueue in your (old) e1000 patch. Why doesn't it
> stop subqueues?
A previous e1000 patch stopped subqueues. The last e1000 patch I sent
to the list doesn't stop them, and that's a problem with that patch; it
was sent purely to show how the alloc_etherdev_mq() stuff worked, but I
missed the subqueue control. I can fix that and send an updated patch
if you'd like. The reason I missed it is we maintain an out-of-tree
driver and an in-tree driver, and mixing/matching code between the two
becomes a bit of a juggling act sometimes when doing little engineering
snippets.
Thanks for reviewing these. I'll repost something with updates from
your feedback.
Cheers,
-PJ Waskiewicz
^ permalink raw reply [flat|nested] 153+ messages in thread
* Re: [PATCH] NET: Multiqueue network device support.
2007-06-11 17:57 ` Waskiewicz Jr, Peter P
@ 2007-06-11 18:05 ` Patrick McHardy
2007-06-11 18:15 ` Waskiewicz Jr, Peter P
0 siblings, 1 reply; 153+ messages in thread
From: Patrick McHardy @ 2007-06-11 18:05 UTC (permalink / raw)
To: Waskiewicz Jr, Peter P; +Cc: davem, netdev, jeff, Kok, Auke-jan H
Waskiewicz Jr, Peter P wrote:
>>I think we can reuse skb->priority. Assuming only real
>>hardware devices use multiqueue support, there should be no user of
>>skb->priority after egress qdisc classification. The only reason
>>to preserve it in the qdisc layer is for software devices.
>
>
> That would be oustanding.
>
>
>>Grepping through drivers/net shows a few users, but most seem
>>to be using it on the RX path and some use it to store internal data.
>
>
> Thank you for hunting this down. I will test on my little environment
> here to see if I run into any issues.
I think grepping will help more than testing :)
The only issue I can see is that packets going to a multiqueue device
that doesn't have a multiqueue aware qdisc attached will get a random
value. So you would have to conditionally reset it before ->enqueue.
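I.e. something like the following sketch right before ->enqueue (the
"is the attached qdisc multiqueue aware" test is only hinted at here;
the parameter name is made up):

#include <linux/skbuff.h>

static inline void my_reset_queue_mapping(struct sk_buff *skb,
					  int qdisc_is_mq_aware)
{
	/* A qdisc that doesn't know about multiqueue never sets a
	 * meaningful queue_mapping, so force the packet to queue 0.
	 */
	if (!qdisc_is_mq_aware)
		skb->queue_mapping = 0;
}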
Another question is what to do about other hard_start_xmit callers.
Independent of which field is used, should the classification that
may have happened on a different device be retained (TC actions again)?
^ permalink raw reply [flat|nested] 153+ messages in thread
* Re: [PATCH] NET: Multiqueue network device support.
2007-06-11 18:05 ` Waskiewicz Jr, Peter P
@ 2007-06-11 18:07 ` Patrick McHardy
0 siblings, 0 replies; 153+ messages in thread
From: Patrick McHardy @ 2007-06-11 18:07 UTC (permalink / raw)
To: Waskiewicz Jr, Peter P; +Cc: davem, netdev, jeff, Kok, Auke-jan H
Waskiewicz Jr, Peter P wrote:
>>BTW, I couldn't find anything but a single
>>netif_wake_subqueue in your (old) e1000 patch. Why doesn't it
>>stop subqueues?
>
>
> A previous e1000 patch stopped subqueues. The last e1000 patch I sent
> to the list doesn't stop them, and that's a problem with that patch; it
> was sent purely to show how the alloc_etherdev_mq() stuff worked, but I
> missed the subqueue control. I can fix that and send an updated patch
> if you'd like. The reason I missed it is we maintain an out-of-tree
> driver and an in-tree driver, and mixing/matching code between the two
> becomes a bit of a juggling act sometimes when doing little engineering
> snippets.
>
> Thanks for reviewing these. I'll repost something with updates from
> your feedback.
Thanks. I do have some more comments, but a repost with the patches
split up into infrastructure changes, qdisc changes (one patch per qdisc)
and the e1000 patch would make that easier.
^ permalink raw reply [flat|nested] 153+ messages in thread
* RE: [PATCH] NET: Multiqueue network device support.
2007-06-11 18:05 ` Patrick McHardy
@ 2007-06-11 18:15 ` Waskiewicz Jr, Peter P
2007-06-11 18:24 ` Patrick McHardy
0 siblings, 1 reply; 153+ messages in thread
From: Waskiewicz Jr, Peter P @ 2007-06-11 18:15 UTC (permalink / raw)
To: Patrick McHardy; +Cc: davem, netdev, jeff, Kok, Auke-jan H
> I think grepping will help more than testing :)
>
> The only issue I can see is that packets going to a
> multiqueue device that doesn't have a multiqueue aware qdisc
> attached will get a random value. So you would have to
> conditionally reset it before ->enqueue.
I currently clear queue_mapping before ->enqueue(). Perhaps keeping
queue_mapping in there might solve needing a conditional in the hotpath.
Let me think about this one.
> Another question is what to do about other hard_start_xmit callers.
> Independent of which field is used, should the classification
> that may have happened on a different device be retained (TC
> actions again)?
The two cases I can think of here are ip forwarding and bonding. In the
case of bonding, things should be fine since the bonded device will only
have one "ring." Therefore if the underlying slave devices are
heterogeneous, there shouldn't be a problem retaining the previous TC
classification; if the device has its own qdisc and classifiers, it can
override it.
For ip forwarding, I believe it will also be ok since ultimately the
device doing the last transmit will have its classifiers applied and
remap skb's if necessary. Either way, before it gets enqueued through
dev_queue_xmit(), the value will get cleared, so having an artifact
lying around won't be possible.
If that's not what you're referring to, please let me know.
Thanks,
-PJ Waskiewicz
^ permalink raw reply [flat|nested] 153+ messages in thread
* Re: [PATCH] NET: Multiqueue network device support.
2007-06-11 18:15 ` Waskiewicz Jr, Peter P
@ 2007-06-11 18:24 ` Patrick McHardy
0 siblings, 0 replies; 153+ messages in thread
From: Patrick McHardy @ 2007-06-11 18:24 UTC (permalink / raw)
To: Waskiewicz Jr, Peter P; +Cc: davem, netdev, jeff, Kok, Auke-jan H
Waskiewicz Jr, Peter P wrote:
>>Another question is what to do about other hard_start_xmit callers.
>>Independent of which field is used, should the classification
>>that may have happened on a different device be retained (TC
>>actions again)?
>
>
> [...] Either way, before it gets enqueued through
> dev_queue_xmit(), the value will get cleared, so having an artifact
> laying around won't be possible.
You're right, I was thinking of a case where a packet would
be redirected from a multiqueue device to another one and
then not go through dev_queue_xmit but some other path to
hard_start_xmit that doesn't update the classification.
But there is no case like this.
^ permalink raw reply [flat|nested] 153+ messages in thread
* Re: [PATCH] NET: Multiqueue network device support.
2007-06-11 15:44 ` Patrick McHardy
@ 2007-06-11 21:35 ` jamal
2007-06-11 23:01 ` Patrick McHardy
2007-06-12 0:58 ` Patrick McHardy
0 siblings, 2 replies; 153+ messages in thread
From: jamal @ 2007-06-11 21:35 UTC (permalink / raw)
To: Patrick McHardy
Cc: Waskiewicz Jr, Peter P, davem, netdev, jeff, Kok, Auke-jan H
On Mon, 2007-11-06 at 17:44 +0200, Patrick McHardy wrote:
> jamal wrote:
[..]
> > - let the driver shutdown whenever a ring is full. Remember which ring X
> > shut it down.
> > - when you get a tx interrupt or prune tx descriptors, if a ring <= X has
> > transmitted a packet (or threshold of packets), then wake up the driver
> > (i.e open up).
>
>
> At this point the qdisc might send new packets. What do you do when a
> packet for a full ring arrives?
>
Hrm... ok, is this a trick question or i am missing the obvious?;->
What is wrong with what any driver would do today - which is:
netif_stop and return BUSY; core requeues the packet?
> I see three choices:
>
> - drop it, even though its still within the qdiscs configured limits
> - requeue it, which does not work because the qdisc is still active
> and might just hand you the same packet over and over again in a
> busy loop, until the ring has more room (which has the same worst
> case, just that we're sitting in a busy loop now).
> - requeue and stop the queue: we're back to where we started since
> now higher priority packets will not get passed to the driver.
Refer to choice #4 above.
The patches are trivial - really; as soon as Peter posts the e1000
change for his version i should be able to cutnpaste and produce one
that will work with what i am saying.
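To make it concrete, the driver side i am describing is roughly the
sketch below (made-up names, not an actual e1000 patch):

#include <linux/netdevice.h>

struct my_nic {
	struct net_device *netdev;
	int stopped_ring;		/* -1 while the queue is running */
	unsigned int wake_threshold;	/* packets to clean before waking */
};

/* xmit path: ring "ring" just ran out of descriptors */
static void my_ring_full(struct my_nic *nic, int ring)
{
	nic->stopped_ring = ring;
	netif_stop_queue(nic->netdev);
}

/* tx-clean path: "cleaned" descriptors were reclaimed from ring "ring" */
static void my_tx_clean_done(struct my_nic *nic, int ring,
			     unsigned int cleaned)
{
	/* open up only when a ring of equal or higher priority (lower
	 * index) than the one that shut us down has made progress
	 */
	if (nic->stopped_ring >= 0 && ring <= nic->stopped_ring &&
	    cleaned >= nic->wake_threshold) {
		nic->stopped_ring = -1;
		netif_wake_queue(nic->netdev);
	}
}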
I am going to try my best to do that this week - i am going to be a
little busy and have a few outstanding items (like the pktgen thing)
that i want to get out of the way...
cheers,
jamal
^ permalink raw reply [flat|nested] 153+ messages in thread
* RE: [PATCH] NET: Multiqueue network device support.
2007-06-11 15:34 ` Cohen, Guy
@ 2007-06-11 22:22 ` jamal
2007-06-12 14:04 ` Cohen, Guy
0 siblings, 1 reply; 153+ messages in thread
From: jamal @ 2007-06-11 22:22 UTC (permalink / raw)
To: Cohen, Guy
Cc: Patrick McHardy, Waskiewicz Jr, Peter P, davem, netdev, jeff,
Kok, Auke-jan H
On Mon, 2007-11-06 at 18:34 +0300, Cohen, Guy wrote:
> jamal wrote:
[..]
> > WMM is a strict prio mechanism.
> > The parametrization very much favors the high prio packets when the
> > tx opportunity to send shows up.
>
> Sorry, but this is not as simple as you describe it. WMM is much more
> complicated. WMM defines the HW queues as virtually multiple clients
> that compete on the medium access individually. Each implements a
> contention-based medium access. The Access Point publishes to the
> clients the medium access parameters (e.g. back off parameters) that are
> different for each access category (virtual client). There is _not_ a
> strict priority assigned to each access category.
You sound like you know this stuff well so please bear with me. I am
actually hoping i will learn from you.
I dont have access to the IEEE docs but i have been reasonably following
up on the qos aspect and i have a good feel for how the parameters work.
I posted a url to a pdf earlier which describes the WMM default
parameterization for each AC you refer to above - do you wanna comment
on the accuracy of that?
> The behavior of each
> access category totally depends on the medium usage of other clients and
> is totally different for each access category. This cannot be predicted
> by the host SW.
It could be estimated well by the host sw; but lets defer that to later
in case i am clueless on something or you misunderstood something i
said.
> QoS in WLAN doesn't
> favor strictly one access category over another, but defines some softer
> and smarter prioritization. This is implemented in the HW/Firmware.
I understand. Please correct me if am wrong:
The only reason an AC_BK packet will go out instead of AC_VO when
contending in hardware is because of a statistical opportunity, not the
firmware intentionally trying to allow AC_BK out
i.e it is influenced by the three variables:
1) The contention window 2) the backoff timer and 3)the tx opportunity
And if you look at the default IEEE parameters as in that url slide 43,
the only time AC_BK will win is luck.
> I
> just think that providing per-queue controls (start/stop) will allow
> WLAN drivers/Firmware/HW to do that while still using qdisc (and it will
> work properly even when one queue is full and others are empty).
I dont see it the same way. But i am willing to see wireless in a
different light than wired, more below.
> > however, it is feasible that high prio
> > packets will obstruct low prio packets (which is fine).
>
> No this is _not_ fine. Just to emphasize again, WMM doesn't define
> priority in the way it is implemented in airplane boarding (Pilots
> first, Business passengers next, coach passengers at the end), but more
> like _distributed_ weights prioritization (between all the multiple
> queues of all the clients on the channel).
I am not trying to be obtuse in any way - but let me ask this for
wireless contention resolution:
When a business passenger is trying to get onto the plane at the same time
as a coach passenger and the attendant notices, i.e. has to resolve the
contention, who gets preferential treatment? There is the case of the
attendant statistically not noticing (but that accounts for luck)...
Heres a really dated paper before the standard was ratified:
http://www.mwnl.snu.ac.kr/~schoi/publication/Conferences/02-EW.pdf
a) looking at table 1 at the AIFS, CWmin/max and PF used in the
experiment I dont see how a low prio or mid prio ac will
ever beat something in the high prio just by virtue that they have
longer AIFS + CW values. Maybe you can explain (trust me i am trying to
resolve this in my mind and not trying to be difficult in any way; i am
a geek and these sorts of things intrigue me; i may curse but thats ok)
The only way it would happen is if there is no collision, i.e. statistical
"luck".
b) The paragraph between fig 4 and fig 5 talks about "virtual collision"
between two TCs within a station as _always_ favoring the higher prio.
Slide 43 on:
http://madwifi.org/attachment/wiki/ChipsetFeatures/WMM/qos11e.pdf?format=raw
also seems to indicate the default parameters for the different timers
is clearly strictly in favor of you if you have higher prio.
Do those numbers cross-reference with the IEEE doc you may have?
> As a side note, in one of the WFA WMM certification tests, the AP
> changes the medium access parameters of the access categories in a way
> that favors a lower access category. This is something very soft that
> cannot be reflected in any intuitive way in the host SW.
So essentially the test you mention changes priorities in real time.
What is the purpose of this test? Is WMM expected to change its
priorities in real time?
cheers,
jamal
^ permalink raw reply [flat|nested] 153+ messages in thread
* Re: [PATCH] NET: Multiqueue network device support.
2007-06-11 21:35 ` jamal
@ 2007-06-11 23:01 ` Patrick McHardy
2007-06-12 0:58 ` Patrick McHardy
1 sibling, 0 replies; 153+ messages in thread
From: Patrick McHardy @ 2007-06-11 23:01 UTC (permalink / raw)
To: hadi; +Cc: Waskiewicz Jr, Peter P, davem, netdev, jeff, Kok, Auke-jan H
jamal wrote:
> On Mon, 2007-11-06 at 17:44 +0200, Patrick McHardy wrote:
>
>>jamal wrote:
>
> [..]
>
>>>- let the driver shutdown whenever a ring is full. Remember which ring X
>>>shut it down.
>>>- when you get a tx interrupt or prune tx descriptors, if a ring <= X has
>>>transmitted a packet (or threshold of packets), then wake up the driver
>>>(i.e open up).
>>
>>
>>At this point the qdisc might send new packets. What do you do when a
>>packet for a full ring arrives?
>>
>
>
> Hrm... ok, is this a trick question or i am missing the obvious?;->
> What is wrong with what any driver would do today - which is:
> netif_stop and return BUSY; core requeues the packet?
That doesn't fix the problem; high priority queues may be starved
by low priority queues if you do that.
BTW, I missed something you said before:
--quote--
i am making the case that it does not affect the overall results
as long as you use the same parameterization on qdisc and hardware.
--end quote--
I agree that multiple queue states wouldn't be necessary if they
would be parameterized the same, in that case we wouldn't even
need the qdisc at all (as you're saying). But one of the
parameters is the maximum queue length, and we want to be able
to parameterize the qdisc differently than the hardware here.
Which is the only reason for the possible starvation.
^ permalink raw reply [flat|nested] 153+ messages in thread
* Re: [PATCH] NET: Multiqueue network device support.
2007-06-11 21:35 ` jamal
2007-06-11 23:01 ` Patrick McHardy
@ 2007-06-12 0:58 ` Patrick McHardy
2007-06-12 2:29 ` jamal
1 sibling, 1 reply; 153+ messages in thread
From: Patrick McHardy @ 2007-06-12 0:58 UTC (permalink / raw)
To: hadi; +Cc: Waskiewicz Jr, Peter P, davem, netdev, jeff, Kok, Auke-jan H
jamal wrote:
> On Mon, 2007-11-06 at 17:44 +0200, Patrick McHardy wrote:
>
>>At this point the qdisc might send new packets. What do you do when a
>>packet for a full ring arrives?
>>
>
>
> Hrm... ok, is this a trick question or i am missing the obvious?;->
> What is wrong with what any driver would do today - which is:
> netif_stop and return BUSY; core requeues the packet?
>
>
>>I see three choices:
>>
>>- drop it, even though its still within the qdiscs configured limits
>>- requeue it, which does not work because the qdisc is still active
>> and might just hand you the same packet over and over again in a
>> busy loop, until the ring has more room (which has the same worst
>> case, just that we're sitting in a busy loop now).
>>- requeue and stop the queue: we're back to where we started since
>> now higher priority packets will not get passed to the driver.
>
>
> Refer to choice #4 above.
Replying again so we can hopefully move forward soon. Your choice #4
is exactly what I proposed as choice number 3.
Let me repeat my example why it doesn't work (well) without multiple
queue states (with typos etc fixed) and describe the possibilities.
If you still disagree I hope you can just change my example to show
how it gets fixed. As a thank you I will actually understand that
your solution works as well :)
We have n empty HW queues served in ascending priority order
with a maximum length of m packets per queue:
[0] empty
[1] empty
[2] empty
..
[n-1] empty
Now we receive m - 1 packets for all priorities >= 1 and < n - 1,
so we have:
[0] empty
[1] m - 1 packets
[2] m - 1 packets
..
[n-2] m - 1 packets
[n-1] empty
Since no HW queue is completely full, the queue is still active.
Now we receive m packets of priority n - 1:
[0] empty
[1] m - 1 packets
[2] m - 1 packets
..
[n-2] m - 1 packets
[n-1] m packets
At this point the queue needs to be stopped since the highest
priority queue is entirely full. To start it again at least
one packet of queue n - 1 needs to be sent, which requires
that queues 1 to n - 2 are serviced first. So any prio 0 packet
arriving during this period will sit in the qdisc and will not
reach the device for up to the time for (n - 2) * (m - 1) + 1
full sized packet transmissions. With multiple queue states
we'd know that queue 0 can still take packets.
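(To put rough, purely illustrative numbers on it: with n = 4 rings of
m = 256 descriptors that is (4 - 2) * (256 - 1) + 1 = 511 full sized
packet transmissions, i.e. about 511 * 12.3us ~= 6.3ms at GigE with
1538 bytes per frame on the wire, during which a prio 0 packet sits in
the qdisc.)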
You agreed that this is a problem and instead of keeping the
queue stopped until all rings can take at least one packet
again you proposed:
> - let the driver shutdown whenever a ring is full. Remember which
> ring X shut it down.
> - when you get a tx interrupt or prune tx descriptors, if a
> ring <= X has transmitted a packet (or threshold of packets),
> then wake up the driver (i.e open up).
At this point the queue is active, but at least one ring is already
full and the qdisc can still pass packets for it to the driver.
When this happens we can:
- drop it. This makes qdisc configured limit meaningless since
the qdisc can't anticipate when the packet will make it through
or get dropped.
- requeue it: this might result in a busy loop if the qdisc
decides to hand out the packet again. The loop will be
terminated once the ring has more room available and can
eat the packet, which has the same worst case behaviour
I described above.
- requeue (return BUSY) and stop the queue: thats what you
proposed as option #4. The question is when to wake the
queue again. Your suggestion was to wake it when some
other queue with equal or higher priority got dequeued.
Correcting my previous statement, you are correct that
this will fix the starvation of higher priority queues
because the qdisc has a chance to hand out either a packet
of the same priority or higher priority, but at the cost of
at worst (n - 1) * m unnecessary dequeues+requeues in case
there is only a packet of lowest priority and we need to
fully serve all higher priority HW queues before it can
actually be dequeued. The other possibility would be to
activate the queue again once all rings can take packets
again, but that wouldn't fix the problem, which you can
easily see if you go back to my example and assume we still
have a low priority packet within the qdisc when the lowest
priority ring fills up (and the queue is stopped), and after
we tried to wake it and stopped it again the higher priority
packet arrives.
Considering your proposal in combination with RR, you can see
the same problem of unnecessary dequeues+requeues. Since there
is no priority for waking the queue when an equal or higher
priority ring got dequeued as in the prio case, I presume you
would wake the queue whenever a packet was sent. For the RR
qdisc dequeue after requeue should hand out the same packet,
independently of newly enqueued packets (which doesn't happen
and is a bug in Peter's RR version), so in the worst case the
HW has to make the entire round before a packet can get
dequeued in case the corresponding HW queue is full. This is
a bit better than prio, but still up to n - 1 unnecessary
requeues+dequeues. I think it can happen more often than
for prio though.
> The patches are trivial - really; as soon as Peter posts the e1000
> change for his version i should be able to cutnpaste and produce one
> that will work with what i am saying.
Forgetting about things like multiple qdisc locks and just
looking at queueing behaviour, the question seems to come
down to whether the unnecessary dequeues/requeues are acceptable
(which I don't think since they are easily avoidable). OTOH
you could turn it around and argue that the patches won't do
much harm since ripping them out again (modulo queue mapping)
should result in the same behaviour with just more overhead.
^ permalink raw reply [flat|nested] 153+ messages in thread
* Re: [PATCH] NET: Multiqueue network device support.
2007-06-12 0:58 ` Patrick McHardy
@ 2007-06-12 2:29 ` jamal
2007-06-12 13:21 ` Patrick McHardy
0 siblings, 1 reply; 153+ messages in thread
From: jamal @ 2007-06-12 2:29 UTC (permalink / raw)
To: Patrick McHardy
Cc: Waskiewicz Jr, Peter P, davem, netdev, jeff, Kok, Auke-jan H
Sorry - i was distracted elsewhere and didnt respond to your
earlier email; this one seems a superset.
On Tue, 2007-12-06 at 02:58 +0200, Patrick McHardy wrote:
> jamal wrote:
> > On Mon, 2007-11-06 at 17:44 +0200, Patrick McHardy wrote:
[use case abbreviated..]
the use case is sensible.
> the qdisc has a chance to hand out either a packet
> of the same priority or higher priority, but at the cost of
> at worst (n - 1) * m unnecessary dequeues+requeues in case
> there is only a packet of lowest priority and we need to
> fully serve all higher priority HW queues before it can
> actually be dequeued.
yes, i see that.
[It actually is related to the wake threshold you use in the
driver. tg3 and e1000 for example will do it after 30 or so packets.
But i get your point - what you are trying to describe is a worst case
scenario].
> The other possibility would be to
> activate the queue again once all rings can take packets
> again, but that wouldn't fix the problem, which you can
> easily see if you go back to my example and assume we still
> have a low priority packet within the qdisc when the lowest
> priority ring fills up (and the queue is stopped), and after
> we tried to wake it and stopped it again the higher priority
> packet arrives.
In your use case, only low prio packets are available on the stack.
Above you mention arrival of high prio - assuming thats intentional and
not it being late over there ;->
If higher prio packets are arriving on the qdisc when you open up, then
given strict prio those packets get to go to the driver first until
there are no more left; followed of course by low prio which then
shuts down the path again...
> Considering your proposal in combination with RR, you can see
> the same problem of unnecessary dequeues+requeues.
Well, we havent really extended the use case from prio to RR.
But this is as good a start as any since all sorts of work conserving
schedulers will behave in a similar fashion ..
> Since there
> is no priority for waking the queue when a equal or higher
> priority ring got dequeued as in the prio case, I presume you
> would wake the queue whenever a packet was sent.
I suppose that is a viable approach if the hardware is RR based.
Actually in the case of e1000 it is WRR not plain RR, but that is a
moot point which doesnt affect the discussion.
> For the RR
> qdisc dequeue after requeue should hand out the same packet,
> independantly of newly enqueued packets (which doesn't happen
> and is a bug in Peter's RR version), so in the worst case the
> HW has to make the entire round before a packet can get
> dequeued in case the corresponding HW queue is full. This is
> a bit better than prio, but still up to n - 1 unnecessary
> requeues+dequeues. I think it can happen more often than
> for prio though.
I think what would be better to use is DRR. I pointed Peter to the code
i did a long time ago.
With DRR, a deficit can be carried forward.
> Forgetting about things like multiple qdisc locks and just
> looking at queueing behaviour, the question seems to come
> down to whether the unnecessary dequeues/requeues are acceptable
> (which I don't think since they are easily avoidable).
As i see it, the worst case scenario would have a finite time.
A 100Mbps NIC should be able to dish out, depending on packet size,
148Kpps to 8.6Kpps; a GigE 10x that.
so i think the phase in general wont last that long given the assumption
is packets are coming in from the stack to the driver with about the
packet rate equivalent to wire rate (for the case of all work conserving
schedulers).
In the general case there should be no contention at all.
> OTOH
> you could turn it around and argue that the patches won't do
> much harm since ripping them out again (modulo queue mapping)
> should result in the same behaviour with just more overhead.
I am not sure i understood - but note that i have asked for a middle
ground from the beginning.
Thanks again for the patience and taking the time to go over this.
cheers,
jamal
^ permalink raw reply [flat|nested] 153+ messages in thread
* Re: [PATCH] NET: Multiqueue network device support.
2007-06-11 12:23 ` jamal
2007-06-11 12:39 ` Patrick McHardy
@ 2007-06-12 9:19 ` Johannes Berg
2007-06-12 12:17 ` jamal
1 sibling, 1 reply; 153+ messages in thread
From: Johannes Berg @ 2007-06-12 9:19 UTC (permalink / raw)
To: hadi
Cc: Patrick McHardy, Waskiewicz Jr, Peter P, davem, netdev, jeff,
Kok, Auke-jan H
On Mon, 2007-06-11 at 08:23 -0400, jamal wrote:
> On Mon, 2007-11-06 at 13:58 +0200, Patrick McHardy wrote:
>
> > Thats not true. Assume PSL has lots of packets, PSH is empty. We
> > fill the PHL queue until there is no room left, so the driver
> > has to stop the queue.
>
> Sure. Packets stashed on any DMA ring are considered "gone to the
> wire". That is a very valid assumption to make.
Not at all! Packets could be on the DMA queue forever if you're feeding
out more packets. Heck, on most wireless hardware packets can even be
*expired* from the DMA queue and you get an indication that it was
impossible to send them.
johannes
^ permalink raw reply [flat|nested] 153+ messages in thread
* Re: [PATCH] NET: Multiqueue network device support.
2007-06-12 9:19 ` Johannes Berg
@ 2007-06-12 12:17 ` jamal
0 siblings, 0 replies; 153+ messages in thread
From: jamal @ 2007-06-12 12:17 UTC (permalink / raw)
To: Johannes Berg
Cc: Patrick McHardy, Waskiewicz Jr, Peter P, davem, netdev, jeff,
Kok, Auke-jan H
On Tue, 2007-12-06 at 11:19 +0200, Johannes Berg wrote:
> On Mon, 2007-06-11 at 08:23 -0400, jamal wrote:
> > Sure. Packets stashed on any DMA ring are considered "gone to the
> > wire". That is a very valid assumption to make.
>
> Not at all! Packets could be on the DMA queue forever if you're feeding
> out more packets. Heck, on most wireless hardware packets can even be
> *expired* from the DMA queue and you get an indication that it was
> impossible to send them.
The spirit of the discussion you are quoting was much higher level than
that. Yes what you describe can happen on any DMA (to hard-disk etc)
A simpler example, if you tcpdump on an outgoing packet you see it on
its way to the driver - it is accounted for as "gone"[1].
In any case, read the rest of the thread.
cheers,
jamal
[1] Current Linux tcpdumping is not that accurate, but i dont wanna go
into that discussion
^ permalink raw reply [flat|nested] 153+ messages in thread
* Re: [PATCH] NET: Multiqueue network device support.
2007-06-12 2:29 ` jamal
@ 2007-06-12 13:21 ` Patrick McHardy
2007-06-12 15:12 ` jamal
2007-06-12 21:02 ` David Miller
0 siblings, 2 replies; 153+ messages in thread
From: Patrick McHardy @ 2007-06-12 13:21 UTC (permalink / raw)
To: hadi; +Cc: Waskiewicz Jr, Peter P, davem, netdev, jeff, Kok, Auke-jan H
jamal wrote:
>>the qdisc has a chance to hand out either a packet
>> of the same priority or higher priority, but at the cost of
>> at worst (n - 1) * m unnecessary dequeues+requeues in case
>> there is only a packet of lowest priority and we need to
>> fully serve all higher priority HW queues before it can
>> actually be dequeued.
>
>
> yes, i see that.
> [It actually is related to the wake threshold you use in the
> driver. tg3 and e1000 for example will do it after 30 or so packets.
> But i get your point - what you are trying to describe is a worst case
> scenario].
Yes. Using a higher threshold reduces the overhead, but leads to
lower priority packets getting out even if higher priority packets
are present in the qdisc. Note that if we use the threshold with
multiple queue states (threshold per ring) this doesn't happen.
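With per-ring states the tx-clean path reduces to something like this
sketch (netif_subqueue_stopped()/netif_wake_subqueue() as used elsewhere
in the patchset, exact signatures assumed; the rest is made up):

#include <linux/netdevice.h>

static void my_tx_clean_mq(struct net_device *dev, int ring,
			   unsigned int free_descriptors,
			   unsigned int wake_threshold)
{
	/* Only the ring that was actually cleaned is woken; a full low
	 * priority ring never blocks transmission on the other rings.
	 */
	if (netif_subqueue_stopped(dev, ring) &&
	    free_descriptors >= wake_threshold)
		netif_wake_subqueue(dev, ring);
}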
>> The other possibility would be to
>> activate the queue again once all rings can take packets
>> again, but that wouldn't fix the problem, which you can
>> easily see if you go back to my example and assume we still
>> have a low priority packet within the qdisc when the lowest
>> priority ring fills up (and the queue is stopped), and after
>> we tried to wake it and stopped it again the higher priority
>> packet arrives.
>
>
> In your use case, only low prio packets are available on the stack.
> Above you mention arrival of high prio - assuming thats intentional and
> not it being late over there ;->
> If higher prio packets are arriving on the qdisc when you open up, then
> given strict prio those packets get to go to the driver first until
> there are no more left; followed of course by low prio which then
> shutdown the path again...
Whats happening is: Lowest priority ring fills up, queue is stopped.
We have more packets for it in the qdisc. A higher priority packet
is transmitted, the queue is woken up again, the lowest priority packet
goes to the driver and hits the full ring, packet is requeued and
queue shut down until ring frees up again. Now a high priority packet
arrives. It won't get to the driver anymore. But its not very important
since having two different wakeup-strategies would be a bit strange
anyway, so lets just rule out this possibility.
>>Considering your proposal in combination with RR, you can see
>>the same problem of unnecessary dequeues+requeues.
>
>
> Well, we havent really extended the use case from prio to RR.
> But this is a good start as any since all sorts of work conserving
> schedulers will behave in a similar fashion ..
>
>
>>Since there
>>is no priority for waking the queue when an equal or higher
>>priority ring got dequeued as in the prio case, I presume you
>>would wake the queue whenever a packet was sent.
>
>
> I suppose that is a viable approach if the hardware is RR based.
> Actually in the case of e1000 it is WRR not plain RR, but that is a
> moot point which doesnt affect the discussion.
>
>
>>For the RR
>>qdisc dequeue after requeue should hand out the same packet,
>>independently of newly enqueued packets (which doesn't happen
>>and is a bug in Peter's RR version), so in the worst case the
>>HW has to make the entire round before a packet can get
>>dequeued in case the corresponding HW queue is full. This is
>>a bit better than prio, but still up to n - 1 unnecessary
>>requeues+dequeues. I think it can happen more often than
>>for prio though.
>
>
> I think what would better to be use is DRR. I pointed the code i did
> a long time ago to Peter.
> With DRR, a deficit is viable to be carried forward.
If both driver and HW do it, its probably OK for short term, but it
shouldn't grow too large since short-term fairness is also important.
But the unnecessary dequeues+requeues can still happen.
>>Forgetting about things like multiple qdisc locks and just
>>looking at queueing behaviour, the question seems to come
>>down to whether the unnecessary dequeues/requeues are acceptable
>>(which I don't think since they are easily avoidable).
>
>
> As i see it, the worst case scenario would have a finite time.
> A 100Mbps NIC should be able to dish out, depending on packet size,
> 148Kpps to 8.6Kpps; a GigE 10x that.
> so i think the phase in general wont last that long given the assumption
> is packets are coming in from the stack to the driver with about the
> packet rate equivalent to wire rate (for the case of all work conserving
> schedulers).
> In the general case there should be no contention at all.
It does have finite time, but its still undesirable. The average case
would probably have been more interesting, but its also harder :)
I also expect to see lots of requeues under "normal" load that doesn't
resemble the worst-case, but only tests can confirm that.
>> OTOH
>>you could turn it around and argue that the patches won't do
>>much harm since ripping them out again (modulo queue mapping)
>>should result in the same behaviour with just more overhead.
>
>
> I am not sure i understood - but note that i have asked for a middle
> ground from the begining.
I just mean that we could rip the patches out at any point again
without user visible impact aside from more overhead. So even
if they turn out to be a mistake its easily correctable.
I've also looked into moving all multiqueue specific handling to
the top-level qdisc out of sch_generic, unfortunately that leads
to races unless all subqueue state operations takes dev->qdisc_lock.
Besides the overhead I think it would lead to ABBA deadlocks.
So how do we move forward?
^ permalink raw reply [flat|nested] 153+ messages in thread
* RE: [PATCH] NET: Multiqueue network device support.
2007-06-11 22:22 ` jamal
@ 2007-06-12 14:04 ` Cohen, Guy
2007-06-12 15:23 ` jamal
2007-06-12 23:38 ` jamal
0 siblings, 2 replies; 153+ messages in thread
From: Cohen, Guy @ 2007-06-12 14:04 UTC (permalink / raw)
To: hadi
Cc: Patrick McHardy, Waskiewicz Jr, Peter P, davem, netdev, jeff,
Kok, Auke-jan H
Hi Jamal,
Here is a simple scenario (nothing here is a rare or extreme case):
- Busy wireless environment
- FTP TX on BE queue (low priority)
- Skype TX on VO queue (high priority)
The channel is busy with high priority packets, hence the BE packets are
transmitted to the air rarely, so the DMA/HW queue of the BE access
category gets full and the qdisc is stopped.
Now periodic VO-tagged Skype packets arrive. I would expect that they
get the priority (and pass) in all stages of the stack and reach the HW
ASAP and compete there on the medium with the other access categories
and the other clients on the channel.
Now this packet will be stuck in the qdisc and wait there until a BE
packet is transmitted, which can take a long time. This is a real
problem.
There is also a problem with the queues that will be dedicated to TX
aggregation in 11n (currently implemented) - the packets will be
classified to queues by the destination MAC address and not only by the
priority class, but I don't want to get into that now. I think that
there are enough arguments now why the patch that started this thread is
needed...
Please see below some replies to your questions.
Regards,
Guy.
jamal wrote:
> It could be estimated well by the host sw; but lets defer that to
later
> in case i am clueless on something or you misunderstood something i
> said.
It cannot be estimated well by the host SW. This is one of the main
issues - we can't put it aside...
> I understand. Please correct me if am wrong:
> The only reason AC_BK packet will go out instead of AC_VO when
> contending in hardware is because of a statistical opportunity not the
> firmware intentionaly trying to allow AC_BK out
> i.e it is influenced by the three variables:
> 1) The contention window 2) the backoff timer and 3)the tx opportunity
> And if you look at the default IEEE parameters as in that url slide
43,
> the only time AC_BK will win is luck.
In most scenarios BK packets will be transmitted and will win the medium
against VO packets (though in some non-favored ratio).
> Heres a really dated paper before the standard was ratified:
> http://www.mwnl.snu.ac.kr/~schoi/publication/Conferences/02-EW.pdf
Sorry, I'm really overloaded - I won't be able to review the docs you
sent (really apologize for that).
> So essentially the test you mention changes priorities in real time.
> What is the purpose of this test? Is WMM expected to change its
> priorities in real time?
The WMM parameters of the AC are set and controlled by the network/BSS
(access point) administrator and can be used in anyway. There are the
default parameters but they can be changed.
Regards,
Guy.
^ permalink raw reply [flat|nested] 153+ messages in thread
* Re: [PATCH] NET: Multiqueue network device support.
2007-06-12 13:21 ` Patrick McHardy
@ 2007-06-12 15:12 ` jamal
2007-06-12 21:02 ` David Miller
1 sibling, 0 replies; 153+ messages in thread
From: jamal @ 2007-06-12 15:12 UTC (permalink / raw)
To: Patrick McHardy
Cc: Waskiewicz Jr, Peter P, davem, netdev, jeff, Kok, Auke-jan H
On Tue, 2007-12-06 at 15:21 +0200, Patrick McHardy wrote:
> jamal wrote:
>
>
> Yes. Using a higher threshold reduces the overhead, but leads to
> lower priority packets getting out even if higher priority packets
> are present in the qdisc.
As per earlier discussion, the packets already given to hardware should
be fine to go out first. If they get overridden by the chance arrival of
higher prio packets from the stack, then that is fine.
> Note that if we use the threshold with
> multiple queue states (threshold per ring) this doesn't happen.
I think if you do the math, youll find that (n - 1) * m is actually
not that unreasonable given parameters typically used on the drivers;
Lets for example take the parameters from e1000; the tx ring is around
256, the wake threshold is 32 packets (although i have found a better
number is 1/2 the tx size and have that changed in my batching patches).
Assume such a driver with the above parameters doing GigE exists and it
implements 4 queues (n = 4); in such a case, (n-1)*m/32 is
3*256/32 = 3*8 = 24 times.
You have to admit your use case is a real corner case, but lets be
conservative since we are doing a worst case scenario. From that
perspective, consider that GigE can be achieved at pkt levels of 86Kpps
to 1.48Mpps, and if you are non-work conserving you will be running at
that rate; lets pick the low end of 86Kpps. What that means is there
is a blip (remember again, this is a corner case) for a few microsecs
once in a while, with some probability of what you described actually
occurring...
Ok, so then update the threshold to 1/2 the tx ring etc and it is even
less. You get the message.
> If both driver and HW do it, its probably OK for short term, but it
> shouldn't grow too large since short-term fairness is also important.
> But the unnecessary dequeues+requeues can still happen.
In a corner case, yes there is a probability that will happen.
I think its extremely low.
>
> It does have finite time, but its still undesirable. The average case
> would probably have been more interesting, but its also harder :)
> I also expect to see lots of requeues under "normal" load that doesn't
> ressemble the worst-case, but only tests can confirm that.
>
And that is what i was asking of Peter. Some testing. Clearly the
subqueueing is more complex; what i am asking for is for the driver
to bear the brunt and not for it to be an impacting architectural
change.
> > I am not sure i understood - but note that i have asked for a middle
> > ground from the begining.
>
>
> I just mean that we could rip the patches out at any point again
> without user visible impact aside from more overhead. So even
> if they turn out to be a mistake its easily correctable.
That is a good compromise i think. The reason i am spending my time
discussing this is i believe this to be a very important subsystem.
You know i have been vociferous for years on this topic.
What i was worried about is that these patches make it in and become
engrained with hot lava on stone.
> I've also looked into moving all multiqueue specific handling to
> the top-level qdisc out of sch_generic, unfortunately that leads
> to races unless all subqueue state operations takes dev->qdisc_lock.
> Besides the overhead I think it would lead to ABBA deadlocks.
I am confident you can handle that.
> So how do we move forward?
What you described above is a good compromise IMO. I dont have much time
to chase this path at the moment but what it does is give me freedom to
revisit later on with data points. More importantly you understand my
view;-> And of course you did throw a lot of rocks but it is
a definite alternative ;->
cheers,
jamal
^ permalink raw reply [flat|nested] 153+ messages in thread
* RE: [PATCH] NET: Multiqueue network device support.
2007-06-12 14:04 ` Cohen, Guy
@ 2007-06-12 15:23 ` jamal
2007-06-12 23:38 ` jamal
1 sibling, 0 replies; 153+ messages in thread
From: jamal @ 2007-06-12 15:23 UTC (permalink / raw)
To: Cohen, Guy
Cc: Patrick McHardy, Waskiewicz Jr, Peter P, davem, netdev, jeff,
Kok, Auke-jan H
Guy,
I apologize for not responding immediately - i promise to in a few hours
when i get back (and read it over some good coffee) - seems like you
have some good stuff there; thanks for taking the time despite the
overload.
cheers,
jamal
On Tue, 2007-12-06 at 17:04 +0300, Cohen, Guy wrote:
> Hi Jamal,
>
> Here is a simple scenario (nothing here is rare of extreme case):
> - Busy wireless environment
> - FTP TX on BE queue (low priority)
> - Skype TX on VO queue (high priority)
>
^ permalink raw reply [flat|nested] 153+ messages in thread
* Re: [PATCH] NET: Multiqueue network device support.
2007-06-12 13:21 ` Patrick McHardy
2007-06-12 15:12 ` jamal
@ 2007-06-12 21:02 ` David Miller
2007-06-12 21:13 ` Jeff Garzik
2007-06-12 21:17 ` Patrick McHardy
1 sibling, 2 replies; 153+ messages in thread
From: David Miller @ 2007-06-12 21:02 UTC (permalink / raw)
To: kaber; +Cc: hadi, peter.p.waskiewicz.jr, netdev, jeff, auke-jan.h.kok
From: Patrick McHardy <kaber@trash.net>
Date: Tue, 12 Jun 2007 15:21:54 +0200
> So how do we move forward?
We're going to put hw multiqueue support in, all of this discussion
has been pointless, I just watch this thread and basically laugh at
the resistance to hw multiqueue support :-)
^ permalink raw reply [flat|nested] 153+ messages in thread
* Re: [PATCH] NET: Multiqueue network device support.
2007-06-12 21:02 ` David Miller
@ 2007-06-12 21:13 ` Jeff Garzik
2007-06-12 21:17 ` Ben Greear
2007-06-13 16:44 ` Rick Jones
2007-06-12 21:17 ` Patrick McHardy
1 sibling, 2 replies; 153+ messages in thread
From: Jeff Garzik @ 2007-06-12 21:13 UTC (permalink / raw)
To: netdev; +Cc: David Miller, kaber, hadi, peter.p.waskiewicz.jr, auke-jan.h.kok
If hardware w/ multiple queues will have the capability for different MAC
addresses, different RX filters, etc., does it make sense to add that
below the net_device level?
We will have to add all the configuration machinery at the per-queue
level that already exists at the per-netdev level.
Jeff
^ permalink raw reply [flat|nested] 153+ messages in thread
* Re: [PATCH] NET: Multiqueue network device support.
2007-06-12 21:02 ` David Miller
2007-06-12 21:13 ` Jeff Garzik
@ 2007-06-12 21:17 ` Patrick McHardy
2007-06-13 5:56 ` Zhu Yi
1 sibling, 1 reply; 153+ messages in thread
From: Patrick McHardy @ 2007-06-12 21:17 UTC (permalink / raw)
To: David Miller; +Cc: hadi, peter.p.waskiewicz.jr, netdev, jeff, auke-jan.h.kok
David Miller wrote:
> From: Patrick McHardy <kaber@trash.net>
> Date: Tue, 12 Jun 2007 15:21:54 +0200
>
>
>>So how do we move forward?
>
>
> We're going to put hw multiqueue support in, all of this discussion
> has been pointless, I just watch this thread and basically laugh at
> the resistance to hw multiqueue support :-)
It did help me understand the consequences of the different approaches.
I'm still in favour of putting the patches in, but I've hacked up a
small multiqueue simulator device and to my big surprise my testing
showed that Jamal's suggestion of using a single queue state seems to
work better than I expected. But I've been doing mostly testing of
the device itself up to now with very simple traffic patterns (mostly
just "flood all queues"), so I'll try to get some real results tomorrow.
^ permalink raw reply [flat|nested] 153+ messages in thread
* Re: [PATCH] NET: Multiqueue network device support.
2007-06-12 21:13 ` Jeff Garzik
@ 2007-06-12 21:17 ` Ben Greear
2007-06-12 21:26 ` David Miller
2007-06-13 16:44 ` Rick Jones
1 sibling, 1 reply; 153+ messages in thread
From: Ben Greear @ 2007-06-12 21:17 UTC (permalink / raw)
To: Jeff Garzik
Cc: netdev, David Miller, kaber, hadi, peter.p.waskiewicz.jr,
auke-jan.h.kok
Jeff Garzik wrote:
>
> If hardware w/ multiple queues will have the capability for different MAC
> addresses, different RX filters, etc., does it make sense to add that
> below the net_device level?
>
> We will have to add all the configuration machinery at the per-queue
> level that already exists at the per-netdev level.
Perhaps the mac-vlan patch would be a good fit. Currently it is all
software based, but if the hardware can filter on MAC, it can basically
do mac-vlan acceleration. The mac-vlan devices are just like 'real' ethernet
devices, so they can be used with whatever schemes work with regular devices.
Thanks,
Ben
>
> Jeff
--
Ben Greear <greearb@candelatech.com>
Candela Technologies Inc http://www.candelatech.com
^ permalink raw reply [flat|nested] 153+ messages in thread
* Re: [PATCH] NET: Multiqueue network device support.
2007-06-12 21:17 ` Ben Greear
@ 2007-06-12 21:26 ` David Miller
2007-06-12 21:46 ` Jeff Garzik
` (2 more replies)
0 siblings, 3 replies; 153+ messages in thread
From: David Miller @ 2007-06-12 21:26 UTC (permalink / raw)
To: greearb; +Cc: jeff, netdev, kaber, hadi, peter.p.waskiewicz.jr, auke-jan.h.kok
From: Ben Greear <greearb@candelatech.com>
Date: Tue, 12 Jun 2007 14:17:44 -0700
> Jeff Garzik wrote:
> >
> > If hardware w/ multiple queues will have the capability for different MAC
> > addresses, different RX filters, etc., does it make sense to add that
> > below the net_device level?
> >
> > We will have to add all the configuration machinery at the per-queue
> > level that already exists at the per-netdev level.
>
> Perhaps the mac-vlan patch would be a good fit. Currently it is all
> software based, but if the hardware can filter on MAC, it can basically
> do mac-vlan acceleration. The mac-vlan devices are just like 'real' ethernet
> devices, so they can be used with whatever schemes work with regular devices.
Interesting.
But to answer Jeff's question, that's not really the model being
used to implement multiple queues.
The MAC is still very much centralized in most designs.
So one way they'll do it is to support assigning N MAC addresses,
and you configure the input filters of the chip to push packets
for each MAC to the proper receive queue.
So the MAC will accept any of those in the N MAC addresses as
it's own, then you use the filtering facilities to steer
frames to the correct RX queue.
The TX and RX queues can be so isolated as to be able to be exported
to virtualization nodes. You can give them full access to the DMA
queues and associated mailboxes. So instead of all of this bogus
virtualized device overhead, you just give the guest access to the
real device.
So you can use multiple queues either for better single node SMP
performance, or better virtualization performance.
^ permalink raw reply [flat|nested] 153+ messages in thread
* Re: [PATCH] NET: Multiqueue network device support.
2007-06-12 21:26 ` David Miller
@ 2007-06-12 21:46 ` Jeff Garzik
2007-06-12 21:52 ` Roland Dreier
2007-06-12 21:53 ` David Miller
2007-06-12 21:46 ` Ben Greear
2007-06-12 21:47 ` Jason Lunz
2 siblings, 2 replies; 153+ messages in thread
From: Jeff Garzik @ 2007-06-12 21:46 UTC (permalink / raw)
To: David Miller
Cc: greearb, netdev, kaber, hadi, peter.p.waskiewicz.jr,
auke-jan.h.kok
David Miller wrote:
> From: Ben Greear <greearb@candelatech.com>
> Date: Tue, 12 Jun 2007 14:17:44 -0700
>
>> Jeff Garzik wrote:
>>> If hardware w/ multiple queues will have the capability for different MAC
>>> addresses, different RX filters, etc., does it make sense to add that
>>> below the net_device level?
>>>
>>> We will have to add all the configuration machinery at the per-queue
>>> level that already exists at the per-netdev level.
>> Perhaps the mac-vlan patch would be a good fit. Currently it is all
>> software based, but if the hardware can filter on MAC, it can basically
>> do mac-vlan acceleration. The mac-vlan devices are just like 'real' ethernet
>> devices, so they can be used with whatever schemes work with regular devices.
>
> Interesting.
>
> But to answer Jeff's question, that's not really the model being
> used to implement multiple queues.
>
> The MAC is still very much centralized in most designs.
>
> So one way they'll do it is to support assigning N MAC addresses,
> and you configure the input filters of the chip to push packets
> for each MAC to the proper receive queue.
>
> So the MAC will accept any of those in the N MAC addresses as
> it's own, then you use the filtering facilities to steer
> frames to the correct RX queue.
Not quite... You'll have to deal with multiple Rx filters, not just the
current one-filter-for-all model present in today's NICs. Pools of
queues will have separate configured characteristics. The "steer"
portion you mention is a bottleneck that wants to be eliminated.
Jeff
^ permalink raw reply [flat|nested] 153+ messages in thread
* Re: [PATCH] NET: Multiqueue network device support.
2007-06-12 21:26 ` David Miller
2007-06-12 21:46 ` Jeff Garzik
@ 2007-06-12 21:46 ` Ben Greear
2007-06-12 21:54 ` David Miller
2007-06-12 22:30 ` Jeff Garzik
2007-06-12 21:47 ` Jason Lunz
2 siblings, 2 replies; 153+ messages in thread
From: Ben Greear @ 2007-06-12 21:46 UTC (permalink / raw)
To: David Miller
Cc: jeff, netdev, kaber, hadi, peter.p.waskiewicz.jr, auke-jan.h.kok
David Miller wrote:
> From: Ben Greear <greearb@candelatech.com>
> Date: Tue, 12 Jun 2007 14:17:44 -0700
>
>> Jeff Garzik wrote:
>>> If hardware w/ multiple queues will have the capability for different MAC
>>> addresses, different RX filters, etc., does it make sense to add that
>>> below the net_device level?
>>>
>>> We will have to add all the configuration machinery at the per-queue
>>> level that already exists at the per-netdev level.
>> Perhaps the mac-vlan patch would be a good fit. Currently it is all
>> software based, but if the hardware can filter on MAC, it can basically
>> do mac-vlan acceleration. The mac-vlan devices are just like 'real' ethernet
>> devices, so they can be used with whatever schemes work with regular devices.
>
> Interesting.
>
> But to answer Jeff's question, that's not really the model being
> used to implement multiple queues.
>
> The MAC is still very much centralized in most designs.
>
> So one way they'll do it is to support assigning N MAC addresses,
> and you configure the input filters of the chip to push packets
> for each MAC to the proper receive queue.
>
> So the MAC will accept any of those in the N MAC addresses as
> it's own, then you use the filtering facilities to steer
> frames to the correct RX queue.
>
> The TX and RX queues can be so isolated as to be able to be exported
> to virtualization nodes. You can give them full access to the DMA
> queues and associated mailboxes. So instead of all of this bogus
> virtualized device overhead, you just give the guest access to the
> real device.
>
> So you can use multiple queues either for better single node SMP
> performance, or better virtualization performance.
That sounds plausible for many uses, but it may also be useful to have
the virtual devices. Having 802.1Q VLANs be 'real' devices has worked out
quite well, so I think there is a place for a 'mac-vlan' as well.
With your description above, the 'correct RX queue' could be the
only queue that the mac-vlan sees, so it would behave somewhat like
a vanilla ethernet driver. When the mac-vlan transmits, it could
transmit directly into its particular TX queue on the underlying device.
In a non guest environment, I believe the mac-vlan will act somewhat like
a more flexible form of an ip-alias. When name-spaces are implemented,
the mac-vlan would very easily allow the different name-spaces to share the same physical
hardware. The overhead should be minimal, and it's likely that using
a 'real' network device will be a lot easier to maintain than trying to directly
share separate queues on a single device that is somehow visible in multiple
namespaces.
And, since the mac-vlan can work as pure software on top of any NIC that
can go promisc and send with arbitrary source MAC, it will already work
with virtually all wired ethernet devices currently in existence.
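Conceptually the software RX side is nothing more than this kind of
matching (a sketch with made-up names; the real mac-vlan patch is of
course more involved):

#include <linux/etherdevice.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>

struct my_macvlan_port {
	int count;
	struct net_device **vlans;	/* child devices, one MAC each */
};

static int my_macvlan_rx(struct my_macvlan_port *port, struct sk_buff *skb)
{
	const unsigned char *dest = eth_hdr(skb)->h_dest;
	int i;

	/* hand the frame to the child device that owns the destination MAC */
	for (i = 0; i < port->count; i++) {
		if (!compare_ether_addr(dest, port->vlans[i]->dev_addr)) {
			skb->dev = port->vlans[i];
			return netif_rx(skb);
		}
	}
	return NET_RX_DROP;
}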
Thanks,
Ben
--
Ben Greear <greearb@candelatech.com>
Candela Technologies Inc http://www.candelatech.com
^ permalink raw reply [flat|nested] 153+ messages in thread
* Re: [PATCH] NET: Multiqueue network device support.
2007-06-12 21:26 ` David Miller
2007-06-12 21:46 ` Jeff Garzik
2007-06-12 21:46 ` Ben Greear
@ 2007-06-12 21:47 ` Jason Lunz
2007-06-12 21:55 ` David Miller
2007-06-13 3:41 ` Leonid Grossman
2 siblings, 2 replies; 153+ messages in thread
From: Jason Lunz @ 2007-06-12 21:47 UTC (permalink / raw)
To: David Miller
Cc: greearb, jeff, netdev, kaber, hadi, peter.p.waskiewicz.jr,
auke-jan.h.kok
On Tue, Jun 12, 2007 at 02:26:58PM -0700, David Miller wrote:
> The MAC is still very much centralized in most designs.
>
> So one way they'll do it is to support assigning N MAC addresses,
> and you configure the input filters of the chip to push packets
> for each MAC to the proper receive queue.
>
> So the MAC will accept any of those in the N MAC addresses as
> it's own, then you use the filtering facilities to steer
> frames to the correct RX queue.
>
> The TX and RX queues can be so isolated as to be able to be exported
> to virtualization nodes. You can give them full access to the DMA
> queues and associated mailboxes. So instead of all of this bogus
> virtualized device overhead, you just give the guest access to the
> real device.
>
> So you can use multiple queues either for better single node SMP
> performance, or better virtualization performance.
Are you aware of any hardware designs that allow other ways to map
packets onto rx queues? I can think of several scenarios where it could
be advantageous to map packets by IP 3- or 5-tuple to get cpu locality
all the way up the stack on a flow-by-flow basis. But doing this would
require some way to request this mapping from the hardware.
In the extreme case it would be cool if it were possible to push a
bpf-like classifier down into the hardware to allow arbitrary kinds of
flow distribution.
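(In software the kind of mapping I mean is trivially something like the
sketch below -- jhash_3words() is the kernel's Jenkins hash, the rest of
the names are made up; the question is whether the hardware can be told
to do the equivalent.)

#include <linux/jhash.h>
#include <linux/types.h>

/* Map a flow (host-order 5-tuple) to one of num_rx_queues RX queues. */
static u32 my_flow_to_rx_queue(u32 saddr, u32 daddr, u16 sport, u16 dport,
			       u8 protocol, u32 num_rx_queues)
{
	u32 hash = jhash_3words(saddr, daddr,
				((u32)sport << 16) | dport, protocol);

	return hash % num_rx_queues;
}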
Jason
^ permalink raw reply [flat|nested] 153+ messages in thread
* Re: [PATCH] NET: Multiqueue network device support.
2007-06-12 21:46 ` Jeff Garzik
@ 2007-06-12 21:52 ` Roland Dreier
2007-06-12 21:59 ` Jeff Garzik
2007-06-12 22:00 ` David Miller
2007-06-12 21:53 ` David Miller
1 sibling, 2 replies; 153+ messages in thread
From: Roland Dreier @ 2007-06-12 21:52 UTC (permalink / raw)
To: Jeff Garzik
Cc: David Miller, greearb, netdev, kaber, hadi, peter.p.waskiewicz.jr,
auke-jan.h.kok
> > The MAC is still very much centralized in most designs.
> > So one way they'll do it is to support assigning N MAC addresses,
> > and you configure the input filters of the chip to push packets
> > for each MAC to the proper receive queue.
> > So the MAC will accept any of those in the N MAC addresses as
> > it's own, then you use the filtering facilities to steer
> > frames to the correct RX queue.
>
> Not quite... You'll have to deal with multiple Rx filters, not just
> the current one-filter-for-all model present in today's NICs. Pools
> of queues will have separate configured characteristics. The "steer"
> portion you mention is a bottleneck that wants to be eliminated.
I think you're misunderstanding. These NICs still have only one
physical port, so sending or receiving real packets onto a physical
wire is fundamentally serialized. The steering of packets to receive
queues is done right after the packets are received from the wire --
in fact it can be done as soon as the NIC has parsed enough of the
headers to make a decision, which might be before the full packet has
even been received. The steering is no more of a bottleneck than the
physical link is.
- R.
^ permalink raw reply [flat|nested] 153+ messages in thread
* Re: [PATCH] NET: Multiqueue network device support.
2007-06-12 21:46 ` Jeff Garzik
2007-06-12 21:52 ` Roland Dreier
@ 2007-06-12 21:53 ` David Miller
2007-06-12 22:01 ` Jeff Garzik
1 sibling, 1 reply; 153+ messages in thread
From: David Miller @ 2007-06-12 21:53 UTC (permalink / raw)
To: jeff; +Cc: greearb, netdev, kaber, hadi, peter.p.waskiewicz.jr,
auke-jan.h.kok
From: Jeff Garzik <jeff@garzik.org>
Date: Tue, 12 Jun 2007 17:46:20 -0400
> Not quite... You'll have to deal with multiple Rx filters, not just the
> current one-filter-for-all model present in today's NICs. Pools of
> queues will have separate configured characteristics. The "steer"
> portion you mention is a bottleneck that wants to be eliminated.
It runs in hardware at wire speed, what's the issue? :-)
^ permalink raw reply [flat|nested] 153+ messages in thread
* Re: [PATCH] NET: Multiqueue network device support.
2007-06-12 21:46 ` Ben Greear
@ 2007-06-12 21:54 ` David Miller
2007-06-12 22:30 ` Jeff Garzik
1 sibling, 0 replies; 153+ messages in thread
From: David Miller @ 2007-06-12 21:54 UTC (permalink / raw)
To: greearb; +Cc: jeff, netdev, kaber, hadi, peter.p.waskiewicz.jr, auke-jan.h.kok
From: Ben Greear <greearb@candelatech.com>
Date: Tue, 12 Jun 2007 14:46:50 -0700
> And, since the mac-vlan can work as pure software on top of any NIC that
> can go promisc and send with arbitrary source MAC, it will already work
> with virtually all wired ethernet devices currently in existence.
Absolutely, I'm not against something like mac-vlan at all.
^ permalink raw reply [flat|nested] 153+ messages in thread
* Re: [PATCH] NET: Multiqueue network device support.
2007-06-12 21:47 ` Jason Lunz
@ 2007-06-12 21:55 ` David Miller
2007-06-12 22:17 ` Jason Lunz
2007-06-13 3:41 ` Leonid Grossman
1 sibling, 1 reply; 153+ messages in thread
From: David Miller @ 2007-06-12 21:55 UTC (permalink / raw)
To: lunz
Cc: greearb, jeff, netdev, kaber, hadi, peter.p.waskiewicz.jr,
auke-jan.h.kok
From: Jason Lunz <lunz@falooley.org>
Date: Tue, 12 Jun 2007 17:47:53 -0400
> Are you aware of any hardware designs that allow other ways to map
> packets onto rx queues? I can think of several scenarios where it could
> be advantageous to map packets by IP 3- or 5-tuple to get cpu locality
> all the way up the stack on a flow-by-flow basis. But doing this would
> require some way to request this mapping from the hardware.
These chips allow this too, Microsoft defined a standard for
RX queue interrupt hashing by flow so everyone puts it, or
something like it, in hardware.
> In the extreme case it would be cool if it were possible to push a
> bpf-like classifier down into the hardware to allow arbitrary kinds of
> flow distribution.
Maybe not a full bpf, but many chips allow something close.
^ permalink raw reply [flat|nested] 153+ messages in thread
* Re: [PATCH] NET: Multiqueue network device support.
2007-06-12 21:52 ` Roland Dreier
@ 2007-06-12 21:59 ` Jeff Garzik
2007-06-12 22:04 ` David Miller
2007-06-12 22:00 ` David Miller
1 sibling, 1 reply; 153+ messages in thread
From: Jeff Garzik @ 2007-06-12 21:59 UTC (permalink / raw)
To: Roland Dreier
Cc: David Miller, greearb, netdev, kaber, hadi, peter.p.waskiewicz.jr,
auke-jan.h.kok
Roland Dreier wrote:
> > > The MAC is still very much centralized in most designs.
> > > So one way they'll do it is to support assigning N MAC addresses,
> > > and you configure the input filters of the chip to push packets
> > > for each MAC to the proper receive queue.
> > > So the MAC will accept any of those in the N MAC addresses as
> > > it's own, then you use the filtering facilities to steer
> > > frames to the correct RX queue.
> >
> > Not quite... You'll have to deal with multiple Rx filters, not just
> > the current one-filter-for-all model present in today's NICs. Pools
> > of queues will have separate configured characteristics. The "steer"
> > portion you mention is a bottleneck that wants to be eliminated.
>
> I think you're misunderstanding. These NICs still have only one
> physical port, so sending or receiving real packets onto a physical
> wire is fundamentally serialized. The steering of packets to receive
> queues is done right after the packets are received from the wire --
> in fact it can be done as soon as the NIC has parsed enough of the
> headers to make a decision, which might be before the full packet has
> even been received. The steering is no more of a bottleneck than the
> physical link is.
No, you're misreading. People are putting in independent configurable
Rx filters because a single Rx filter setup for all queues was a
bottleneck. Not a performance bottleneck but a configuration and
flexibility limitation that's being removed.
And where shall we put the configuration machinery, to support sub-queues?
Shall we duplicate the existing configuration code for sub-queues?
What will ifconfig/ip usage look like?
How will it differ from configuring full net_devices, if you are
assigning the same types of parameters?
Jeff
^ permalink raw reply [flat|nested] 153+ messages in thread
* Re: [PATCH] NET: Multiqueue network device support.
2007-06-12 21:52 ` Roland Dreier
2007-06-12 21:59 ` Jeff Garzik
@ 2007-06-12 22:00 ` David Miller
1 sibling, 0 replies; 153+ messages in thread
From: David Miller @ 2007-06-12 22:00 UTC (permalink / raw)
To: rdreier
Cc: jeff, greearb, netdev, kaber, hadi, peter.p.waskiewicz.jr,
auke-jan.h.kok
From: Roland Dreier <rdreier@cisco.com>
Date: Tue, 12 Jun 2007 14:52:11 -0700
> I think you're misunderstanding. These NICs still have only one
> physical port, so sending or receiving real packets onto a physical
> wire is fundamentally serialized. The steering of packets to receive
> queues is done right after the packets are received from the wire --
> in fact it can be done as soon as the NIC has parsed enough of the
> headers to make a decision, which might be before the full packet has
> even been received. The steering is no more of a bottleneck than the
> physical link is.
Yep, that's right.
^ permalink raw reply [flat|nested] 153+ messages in thread
* Re: [PATCH] NET: Multiqueue network device support.
2007-06-12 21:53 ` David Miller
@ 2007-06-12 22:01 ` Jeff Garzik
0 siblings, 0 replies; 153+ messages in thread
From: Jeff Garzik @ 2007-06-12 22:01 UTC (permalink / raw)
To: David Miller
Cc: greearb, netdev, kaber, hadi, peter.p.waskiewicz.jr,
auke-jan.h.kok
David Miller wrote:
> From: Jeff Garzik <jeff@garzik.org>
> Date: Tue, 12 Jun 2007 17:46:20 -0400
>
>> Not quite... You'll have to deal with multiple Rx filters, not just the
>> current one-filter-for-all model present in today's NICs. Pools of
>> queues will have separate configured characteristics. The "steer"
>> portion you mention is a bottleneck that wants to be eliminated.
>
> It runs in hardware at wire speed, what's the issue? :-)
Configuration is the issue.
Where shall we put the configuration machinery, to support sub-queues?
Shall we duplicate the existing configuration code for sub-queues?
What will ifconfig/ip usage look like?
How will it differ from configuring full net_devices, if you are
assigning the same types of parameters? Why?
Jeff
^ permalink raw reply [flat|nested] 153+ messages in thread
* Re: [PATCH] NET: Multiqueue network device support.
2007-06-12 21:59 ` Jeff Garzik
@ 2007-06-12 22:04 ` David Miller
2007-06-12 22:18 ` Jeff Garzik
0 siblings, 1 reply; 153+ messages in thread
From: David Miller @ 2007-06-12 22:04 UTC (permalink / raw)
To: jeff
Cc: rdreier, greearb, netdev, kaber, hadi, peter.p.waskiewicz.jr,
auke-jan.h.kok
From: Jeff Garzik <jeff@garzik.org>
Date: Tue, 12 Jun 2007 17:59:43 -0400
> And where shall we put the configuration machinery, to support sub-queues?
> Shall we duplicate the existing configuration code for sub-queues?
> What will ifconfig/ip usage look like?
> How will it differ from configurating full net_devices, if you are
> assigning the same types of parameters?
If you're asking about the virtualization scenario, the
control node (dom0 or whatever) is the only entity which
can get at programming the filters and will set it up
properly based upon which parts of the physical device
are being exported to which guest nodes.
For the non-virtualized case, it's a good question.
But really the current hardware is just about simple queue steering,
and simple static DRR/WRED fairness algorithms applied to the queues
in hardware.
We don't need to add support for configuring anything fancy from the
start just to get something working. Especially the important bits
such as the virtualization case and the interrupt and queue
distribution case on SMP. The latter can even be configured
automatically by the driver, and that's in fact what I expect
drivers to do initially.
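To make the "simple static DRR" fairness mentioned above concrete, here
is a minimal userspace sketch of deficit round robin over per-queue
packet lists; the quanta, queue count and packet sizes are invented for
the example and are not tied to any hardware:

/*
 * Minimal deficit round robin sketch (userspace, illustrative only).
 * Each queue gets 'quantum' bytes of credit per round; a packet is sent
 * only when its size fits in the accumulated deficit, so bandwidth is
 * shared roughly in the quantum ratio regardless of packet sizes.
 */
#include <stdio.h>

#define NQUEUES 4
#define MAXPKTS 16

struct queue {
	int quantum;		/* credit added per round, in bytes */
	int deficit;		/* unused credit carried over */
	int pkts[MAXPKTS];	/* packet sizes waiting */
	int head, tail;
};

static int q_empty(struct queue *q) { return q->head == q->tail; }

static void drr_round(struct queue *qs)
{
	for (int i = 0; i < NQUEUES; i++) {
		struct queue *q = &qs[i];

		if (q_empty(q)) {
			q->deficit = 0;	/* idle queues don't hoard credit */
			continue;
		}
		q->deficit += q->quantum;
		while (!q_empty(q) && q->pkts[q->head] <= q->deficit) {
			q->deficit -= q->pkts[q->head];
			printf("queue %d: sent %d bytes\n", i, q->pkts[q->head]);
			q->head++;
		}
	}
}

int main(void)
{
	struct queue qs[NQUEUES] = {
		{ .quantum = 1500, .pkts = { 1500, 1500, 1500 }, .tail = 3 },
		{ .quantum = 500,  .pkts = { 1500, 64, 64 },     .tail = 3 },
		{ .quantum = 500,  .pkts = { 300, 300 },         .tail = 2 },
		{ .quantum = 500 },
	};

	for (int round = 0; round < 4; round++)
		drr_round(qs);
	return 0;
}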
^ permalink raw reply [flat|nested] 153+ messages in thread
* Re: [PATCH] NET: Multiqueue network device support.
2007-06-12 21:55 ` David Miller
@ 2007-06-12 22:17 ` Jason Lunz
0 siblings, 0 replies; 153+ messages in thread
From: Jason Lunz @ 2007-06-12 22:17 UTC (permalink / raw)
To: David Miller
Cc: greearb, jeff, netdev, kaber, hadi, peter.p.waskiewicz.jr,
auke-jan.h.kok
On Tue, Jun 12, 2007 at 02:55:34PM -0700, David Miller wrote:
> These chips allow this too, Microsoft defined a standard for
> RX queue interrupt hashing by flow so everyone puts it, or
> something like it, in hardware.
I think you're referring to "RSS"?
http://www.microsoft.com/whdc/device/network/NDIS_RSS.mspx
http://msdn2.microsoft.com/en-us/library/ms795609.aspx
Jason
^ permalink raw reply [flat|nested] 153+ messages in thread
* Re: [PATCH] NET: Multiqueue network device support.
2007-06-12 22:04 ` David Miller
@ 2007-06-12 22:18 ` Jeff Garzik
0 siblings, 0 replies; 153+ messages in thread
From: Jeff Garzik @ 2007-06-12 22:18 UTC (permalink / raw)
To: David Miller
Cc: rdreier, greearb, netdev, kaber, hadi, peter.p.waskiewicz.jr,
auke-jan.h.kok
David Miller wrote:
> If you're asking about the virtualization scenerio, the
> control node (dom0 or whatever) is the only entity which
> can get at programming the filters and will set it up
> properly based upon which parts of the physical device
> are being exported to which guest nodes.
You're avoiding the question. Clearly guest VMs must contact the host
VM (dom0) to get real work done.
They are ultimately going to have to pass the same configuration info as
the non-virt case.
> For the non-virtualized case, it's a good question.
...
> But really the current hardware is just about simple queue steering,
> and simple static DRR/WRED fairness algorithms applied to the queues
> in hardware.
>
> We don't need to add support for configuring anything fancy from the
> start just to get something working.
Correct. But if we don't plan for the future that's currently in the
silicon pipeline, our ass will be in a sling WHEN we must figure out the
best configuration points for sub-queues.
Or are we prepared to rip out sub-queues for a non-experimental
solution, when confronted with the obvious necessity of configuring them?
You know I want multi-queue and the increased parallelism it provides. A lot.
But let's not dig ourselves into a hole we must climb out of in 6-12
months. We need to think about configuration issues -now-.
Jeff
^ permalink raw reply [flat|nested] 153+ messages in thread
* Re: [PATCH] NET: Multiqueue network device support.
2007-06-12 21:46 ` Ben Greear
2007-06-12 21:54 ` David Miller
@ 2007-06-12 22:30 ` Jeff Garzik
2007-06-12 22:40 ` Ben Greear
1 sibling, 1 reply; 153+ messages in thread
From: Jeff Garzik @ 2007-06-12 22:30 UTC (permalink / raw)
To: Ben Greear
Cc: David Miller, netdev, kaber, hadi, peter.p.waskiewicz.jr,
auke-jan.h.kok
Ben Greear wrote:
> That sounds plausible for many uses, but it may also be useful to have
> the virtual devices. Having 802.1Q VLANs be 'real' devices has worked out
> quite well, so I think there is a place for a 'mac-vlan' as well.
Virtual devices are pretty much the only solution we have right now,
both in terms of available control points, and in terms of mapping to
similar existing solutions (like wireless and its multiple net devices).
Jeff
^ permalink raw reply [flat|nested] 153+ messages in thread
* Re: [PATCH] NET: Multiqueue network device support.
2007-06-12 22:30 ` Jeff Garzik
@ 2007-06-12 22:40 ` Ben Greear
0 siblings, 0 replies; 153+ messages in thread
From: Ben Greear @ 2007-06-12 22:40 UTC (permalink / raw)
To: Jeff Garzik
Cc: David Miller, netdev, kaber, hadi, peter.p.waskiewicz.jr,
auke-jan.h.kok
Jeff Garzik wrote:
> Ben Greear wrote:
>> That sounds plausible for many uses, but it may also be useful to have
>> the virtual devices. Having 802.1Q VLANs be 'real' devices has worked
>> out
>> quite well, so I think there is a place for a 'mac-vlan' as well.
>
> Virtual devices are pretty much the only solution we have right now,
> both in terms of available control points, and in terms of mapping to
> similar existing solutions (like wireless and its multiple net devices).
I believe Patrick is working on cleaning up mac-vlans and converting them
to use the new netlink configuration API, so there should be a patch for
these hitting the list shortly.
Thanks,
Ben
--
Ben Greear <greearb@candelatech.com>
Candela Technologies Inc http://www.candelatech.com
^ permalink raw reply [flat|nested] 153+ messages in thread
* RE: [PATCH] NET: Multiqueue network device support.
2007-06-12 14:04 ` Cohen, Guy
2007-06-12 15:23 ` jamal
@ 2007-06-12 23:38 ` jamal
1 sibling, 0 replies; 153+ messages in thread
From: jamal @ 2007-06-12 23:38 UTC (permalink / raw)
To: Cohen, Guy
Cc: Patrick McHardy, Waskiewicz Jr, Peter P, davem, netdev, jeff,
Kok, Auke-jan H
Hi Guy,
On Tue, 2007-12-06 at 17:04 +0300, Cohen, Guy wrote:
> Hi Jamal,
>
> Here is a simple scenario (nothing here is rare of extreme case):
> - Busy wireless environment
> - FTP TX on BE queue (low priority)
> - Skype TX on VO queue (high priority)
>
> The channel is busy with high priority packets hence the BE packets are
> transmitted to the air rarely so the DMA/HW queue of the BE access
> category gets full and the qdisc is stopped.
> Now periodic VO-tagged Skype packets arrive. I would expect that they
> get the priority (and pass) in all stages of the stack and reach the HW
> ASAP and compete there on the medium with the other access categories
> and the other clients on the channel.
> Now this packet will be stuck in the qdisc and wait there until a BE
> packet is transmitted, which can take a long time. This is a real
> problem.
Understood.
My take is that this is resolvable by understanding the nature of the
beast. IOW, the strategy for when to open up on such a medium is not
the conventional one used for a wired netdev.
You can use signalling from the media, such as an AP giving you
signals for different ACs to open up; for example, if AC_BE is not being
allowed out and is just rotting because the AP is favoring VO, then
you need to occasionally open up the tx path for the driver etc.
> There is also a problem with the queues that will be dedicated to TX
> aggregation in 11n (currently implemented) - the packets will be
> classified to queues by the destination MAC address and not only by the
> priority class, but I don't want to get into that now.
We have an infrastructure at the qdisc level for selecting queues based
on literally anything you can think of in a packet as well as metadata.
So i think this aspect should be fine.
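As an illustration of what that infrastructure can express (a made-up
example, not the actual tc filter code): a classify step can key on the
destination MAC for the 11n aggregation case mentioned above and fall
back to the 802.1d priority otherwise:

/*
 * Conceptual sketch only. A qdisc-level classifier can look at any header
 * field or skb metadata and return a band/queue index. Here, frames to a
 * given peer MAC go to a dedicated "aggregation" queue, everything else is
 * picked via (roughly) the usual 802.1d-priority-to-access-category table.
 * All names and the queue layout are invented for the example.
 */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define AGG_QUEUE   4
#define NUM_QUEUES  5

struct fake_pkt {
	uint8_t dst_mac[6];
	uint8_t prio;		/* 802.1d priority, 0..7 */
};

static const uint8_t agg_peer[6] = { 0x00, 0x11, 0x22, 0x33, 0x44, 0x55 };

/* 802.1d priority -> AC index, with 0=BK, 1=BE, 2=VI, 3=VO. */
static const uint8_t prio2ac[8] = { 1, 0, 0, 1, 2, 2, 3, 3 };

static unsigned int classify(const struct fake_pkt *p)
{
	if (memcmp(p->dst_mac, agg_peer, 6) == 0)
		return AGG_QUEUE;		/* per-destination aggregation queue */
	return prio2ac[p->prio & 7];		/* otherwise, by access category */
}

int main(void)
{
	struct fake_pkt voice = { { 0xde, 0xad, 0xbe, 0xef, 0x00, 0x01 }, 6 };
	struct fake_pkt aggr  = { { 0x00, 0x11, 0x22, 0x33, 0x44, 0x55 }, 0 };

	printf("voice -> queue %u, aggregated peer -> queue %u\n",
	       classify(&voice), classify(&aggr));
	return 0;
}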
> I think that
> there are enough arguments now why the patch that started this thread is
> needed...
Sorry Guy, I dont see it that way - unfortunately i dont think anybody
else other than Patrick understood what i said, and this thread has been
going on for too long; i doubt 99% of the people are following any more ;->
> In most scenarios BK packets will be transmitted and will win the medium
> against VO packets (thought, in some non-favored ratio).
So if i understand you correctly: over a period of time, yes, BK will make
it out, but under contention it will lose; is that always the case? Is there some
mathematics behind this stuff?
> Sorry, I'm really overloaded - I won't be able to review the docs you
> sent (really apologize for that).
No problem. I totally understand.
> The WMM parameters of the AC are set and controlled by the network/BSS
> (access point) administrator and can be used in anyway. There are the
> default parameters but they can be changed.
It would certainly lead to unexpected behavior if you start favoring BE
over VO, no? Would that ever happen by adjusting the WMM parameters?
cheers,
jamal
^ permalink raw reply [flat|nested] 153+ messages in thread
* RE: [PATCH] NET: Multiqueue network device support.
2007-06-12 21:47 ` Jason Lunz
2007-06-12 21:55 ` David Miller
@ 2007-06-13 3:41 ` Leonid Grossman
1 sibling, 0 replies; 153+ messages in thread
From: Leonid Grossman @ 2007-06-13 3:41 UTC (permalink / raw)
To: Jason Lunz, David Miller
Cc: greearb, jeff, netdev, kaber, hadi, peter.p.waskiewicz.jr,
auke-jan.h.kok
> -----Original Message-----
> From: netdev-owner@vger.kernel.org [mailto:netdev-
> owner@vger.kernel.org] On Behalf Of Jason Lunz
> Sent: Tuesday, June 12, 2007 2:48 PM
> To: David Miller
> Cc: greearb@candelatech.com; jeff@garzik.org; netdev@vger.kernel.org;
> kaber@trash.net; hadi@cyberus.ca; peter.p.waskiewicz.jr@intel.com;
> auke-jan.h.kok@intel.com
> Subject: Re: [PATCH] NET: Multiqueue network device support.
>
> On Tue, Jun 12, 2007 at 02:26:58PM -0700, David Miller wrote:
> > The MAC is still very much centralized in most designs.
> >
> > So one way they'll do it is to support assigning N MAC addresses,
> > and you configure the input filters of the chip to push packets
> > for each MAC to the proper receive queue.
> >
> > So the MAC will accept any of those in the N MAC addresses as
> > it's own, then you use the filtering facilities to steer
> > frames to the correct RX queue.
> >
> > The TX and RX queues can be so isolated as to be able to be exported
> > to virtualization nodes. You can give them full access to the DMA
> > queues and assosciated mailboxes. So instead of all of this bogus
> > virtualized device overhead, you just give the guest access to the
> > real device.
> >
> > So you can use multiple queues either for better single node SMP
> > performance, or better virtualization performance.
>
> Are you aware of any hardware designs that allow other ways to map
> packets onto rx queues? I can think of several scenarios where it
> could
> be advantageous to map packets by IP 3- or 5-tuple to get cpu locality
> all the way up the stack on a flow-by-flow basis. But doing this would
> require some way to request this mapping from the hardware.
10GbE Xframe NICs do that, as well as rx steering by MAC address, VLAN,
MS RSS, generic hashing and a bunch of other criteria (there is actually a
decent chapter on rx steering in the ASIC manual on the www.neterion.com
support page).
The caveat is that in the current products the tuple table is limited to
256 entries only. Next ASIC bumps this number to 64k.
>
> In the extreme case it would be cool if it were possible to push a
> bpf-like classifier down into the hardware to allow arbitrary kinds of
> flow distribution.
>
> Jason
^ permalink raw reply [flat|nested] 153+ messages in thread
* Re: [PATCH] NET: Multiqueue network device support.
2007-06-12 21:17 ` Patrick McHardy
@ 2007-06-13 5:56 ` Zhu Yi
2007-06-13 11:34 ` Patrick McHardy
2007-06-13 12:32 ` jamal
0 siblings, 2 replies; 153+ messages in thread
From: Zhu Yi @ 2007-06-13 5:56 UTC (permalink / raw)
To: Patrick McHardy
Cc: David Miller, hadi, peter.p.waskiewicz.jr, netdev, jeff,
auke-jan.h.kok
On Tue, 2007-06-12 at 23:17 +0200, Patrick McHardy wrote:
> I've hacked up a
> small multiqueue simulator device and to my big surprise my testing
> showed that Jamal's suggestion of using a single queue state seems to
> work better than I expected. But I've been doing mostly testing of
> the device itself up to now with very simple traffic patterns (mostly
> just "flood all queues"), so I'll try to get some real results
> tomorrow.
The key argument for Jamal's solution is that the NIC will send out the 32
packets in the full PHL in a reasonably short time (a few microsecs per
Jamal's calculation). But for wireless, the PHL hardware has a low
probability of seizing the wireless medium when the air is full of high
priority frames. That is, the chances for transmission from PHL
and PHH are not equal. Queuing packets in software will starve high
priority packets, compared to putting them into PHH as early as possible.
Patrick, I don't think your testing considered the above scenario,
right?
Thanks,
-yi
^ permalink raw reply [flat|nested] 153+ messages in thread
* Re: [PATCH] NET: Multiqueue network device support.
2007-06-13 5:56 ` Zhu Yi
@ 2007-06-13 11:34 ` Patrick McHardy
2007-06-14 1:51 ` Zhu Yi
2007-06-13 12:32 ` jamal
1 sibling, 1 reply; 153+ messages in thread
From: Patrick McHardy @ 2007-06-13 11:34 UTC (permalink / raw)
To: Zhu Yi
Cc: David Miller, hadi, peter.p.waskiewicz.jr, netdev, jeff,
auke-jan.h.kok
Zhu Yi wrote:
> On Tue, 2007-06-12 at 23:17 +0200, Patrick McHardy wrote:
>
>>I've hacked up a
>>small multiqueue simulator device and to my big surprise my testing
>>showed that Jamal's suggestion of using a single queue state seems to
>>work better than I expected. But I've been doing mostly testing of
>>the device itself up to now with very simple traffic patterns (mostly
>>just "flood all queues"), so I'll try to get some real results
>>tomorrow.
>
>
> The key argument for Jamal's solution is the NIC will send out 32
> packets in the full PHL in a reasonably short time (a few microsecs per
> Jamal's calculation). But for wireless, the PHL hardware has low
> probability to seize the wireless medium when there are full of high
> priority frames in the air. That is, the chance for transmission in PHL
> and PHH is not equal. Queuing packets in software will starve high
> priority packets than putting them to PHH as early as possible.
Well, the key result of our discussion was that it makes no difference
wrt. queuing behaviour if the queue wakeup strategy is suitably chosen
for the specific queueing discipline, but it might add some overhead.
> Patrick, I don't think your testing considered about above scenario,
> right?
No, as stated my testing so far has been very limited. I'll try to
get some better results later.
^ permalink raw reply [flat|nested] 153+ messages in thread
* Re: [PATCH] NET: Multiqueue network device support.
2007-06-13 5:56 ` Zhu Yi
2007-06-13 11:34 ` Patrick McHardy
@ 2007-06-13 12:32 ` jamal
2007-06-13 13:12 ` Robert Olsson
2007-06-14 2:44 ` Zhu Yi
1 sibling, 2 replies; 153+ messages in thread
From: jamal @ 2007-06-13 12:32 UTC (permalink / raw)
To: Zhu Yi
Cc: Patrick McHardy, David Miller, peter.p.waskiewicz.jr, netdev,
jeff, auke-jan.h.kok
On Wed, 2007-13-06 at 13:56 +0800, Zhu Yi wrote:
> The key argument for Jamal's solution is the NIC will send out 32
> packets in the full PHL in a reasonably short time (a few microsecs per
> Jamal's calculation). But for wireless, the PHL hardware has low
> probability to seize the wireless medium when there are full of high
> priority frames in the air. That is, the chance for transmission in PHL
> and PHH is not equal. Queuing packets in software will starve high
> priority packets than putting them to PHH as early as possible.
>
The key argument i make (from day one actually) is to leave the
majority of the work to the driver.
My view of wireless WMM etc is it is a different media behavior
(compared to wired ethernet) which means a different view of strategy
for when it opens the valve to allow in more packets. 802.11 media has
embedded signalling which is usable. Guy Cohen gave a good use case
which i responded to. Do you wanna look at that and respond?
cheers,
jamal
^ permalink raw reply [flat|nested] 153+ messages in thread
* Re: [PATCH] NET: Multiqueue network device support.
2007-06-13 12:32 ` jamal
@ 2007-06-13 13:12 ` Robert Olsson
2007-06-13 13:33 ` jamal
2007-06-14 2:44 ` Zhu Yi
1 sibling, 1 reply; 153+ messages in thread
From: Robert Olsson @ 2007-06-13 13:12 UTC (permalink / raw)
To: hadi
Cc: Zhu Yi, Patrick McHardy, David Miller, peter.p.waskiewicz.jr,
netdev, jeff, auke-jan.h.kok
jamal writes:
> The key arguement i make (from day one actually) is to leave the
> majority of the work to the driver.
> My view of wireless WMM etc is it is a different media behavior
> (compared to wired ethernet) which means a different view of strategy
> for when it opens the valve to allow in more packets. 802.11 media has
> embedded signalling which is usable. Guy Cohen gave a good use case
> which i responded to. Do you wanna look at that and respond?
Hello,
Haven't got all the details. IMO we need to support some "bonding-like"
scenario too, where one CPU is feeding just one TX-ring (and TX-buffers are
cleared by the same CPU). We probably don't want to stall all queuing
when one ring is full.
The scenario I see is to support parallelism in forwarding/firewalling etc.
For example, when RX load via HW gets split onto different CPUs, for
cache reasons we want to process on the same CPU even for TX.
If the RX HW split keeps packets from the same flow on the same CPU, we
shouldn't get reordering within flows.
Cheers
--ro
^ permalink raw reply [flat|nested] 153+ messages in thread
* Re: [PATCH] NET: Multiqueue network device support.
2007-06-13 13:12 ` Robert Olsson
@ 2007-06-13 13:33 ` jamal
2007-06-13 15:01 ` Leonid Grossman
` (2 more replies)
0 siblings, 3 replies; 153+ messages in thread
From: jamal @ 2007-06-13 13:33 UTC (permalink / raw)
To: Robert Olsson
Cc: Zhu Yi, Leonid Grossman, Patrick McHardy, David Miller,
peter.p.waskiewicz.jr, netdev, jeff, auke-jan.h.kok
Wow - Robert in the house, I cant resist i have to say something before
i run out;->
On Wed, 2007-13-06 at 15:12 +0200, Robert Olsson wrote:
> Haven't got all details. IMO we need to support some "bonding-like"
> scenario too. Where one CPU is feeding just one TX-ring. (and TX-buffers
> cleared by same CPU). We probably don't want to stall all queuing when
> when one ring is full.
>
For newer NICs - the kind that Leonid Grossman was talking about - this
makes a lot of sense in a non-virtual environment.
I think the one described by Leonid has not just 8 tx/rx rings but also
a separate register set, MSI binding etc iirc. The only shared resources,
as far as i understood Leonid, are the bus and the ethernet wire.
So in such a case (assuming 8 rings),
one very sensible model is to create 4 netdev devices, each based on a
single tx/rx ring and register set, and then have a mother netdev (what
you call the bond) that feeds these children netdevs based on some qos
parametrization. Each of the children netdevices (by virtue of how we
do things today) could be tied to a CPU for effectiveness (because our
per-CPU work is based on netdevs).
In virtual environments, the supervisor will be in charge of the
bond-like parent device.
Another model is creating a child netdev based on more than one ring, for
example 2 tx and 2 rcv rings per child netdevice, etc.
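A rough sketch of how the transmit side of that mother/children layering
could look (all structures and function names below are hypothetical and
not from the posted patches; the child is picked here simply by
skb->priority):

/*
 * Hypothetical sketch of the mother/children layering: the parent
 * netdev's transmit routine hands the skb to one per-ring child device,
 * chosen here simply by skb->priority. Nothing below is from the actual
 * multiqueue patches in this thread.
 */
#include <linux/kernel.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>

struct mother_priv {
	int num_children;
	struct net_device *children[8];	/* one child netdev per hw ring */
};

static int mother_hard_start_xmit(struct sk_buff *skb, struct net_device *dev)
{
	struct mother_priv *mp = netdev_priv(dev);
	unsigned int band = min_t(u32, skb->priority, mp->num_children - 1);

	skb->dev = mp->children[band];	/* pick the per-ring child */
	return dev_queue_xmit(skb);	/* its own qdisc/driver take over */
}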
> The scenario I see is to support parallelism in forwarding/firewalling etc.
> For example when RX load via HW gets split into different CPU's and for
> cache reasons we want to process in same CPU even with TX.
>
> If RX HW split keeps packets from the same flow on same CPU we shouldn't
> get reordering within flows.
For the Leonid-NIC (for lack of a better name) it may be harder to do
parallelization on rcv if you use what i said above. But you could
use a different model on receive - such as creating a single netdev
with 8 rcv rings and MSI tied on rcv to 8 different CPUs.
Anyways, it is an important discussion to have. ttl.
cheers,
jamal
^ permalink raw reply [flat|nested] 153+ messages in thread
* RE: [PATCH] NET: Multiqueue network device support.
2007-06-13 13:33 ` jamal
@ 2007-06-13 15:01 ` Leonid Grossman
2007-06-13 15:53 ` Robert Olsson
2007-06-13 18:20 ` David Miller
2 siblings, 0 replies; 153+ messages in thread
From: Leonid Grossman @ 2007-06-13 15:01 UTC (permalink / raw)
To: hadi, Robert Olsson
Cc: Zhu Yi, Patrick McHardy, David Miller, peter.p.waskiewicz.jr,
netdev, jeff, auke-jan.h.kok
> -----Original Message-----
> From: J Hadi Salim [mailto:j.hadi123@gmail.com] On Behalf Of jamal
> For the Leonid-NIC (for lack of better name) it may be harder to do
> parallelization on rcv if you use what i said above. But you could
> use a different model on receive - such as create a single netdev and
> with 8 rcv rings and MSI tied on rcv to 8 different CPUs
> Anyways, it is an important discussion to have. ttl.
Call it "IOV-style NIC" :-)
Or something like that, it's a bit too early to talk about full IOV
compliance...
From what I see in Intel's new pci-e 10GbE driver, they have quite a few
of the same attributes, and the category is likely to grow further.
In the IOV world, hw channel requirements are pretty brutal; in a nutshell
each channel could be owned by a separate OS instance (and the OS
instances do not even have to be the same type). For a non-virtualized
OS some of these capabilities are not a "must have", but they are/will
be there and Linux may as well take advantage of it.
Leonid
>
> cheers,
> jamal
>
^ permalink raw reply [flat|nested] 153+ messages in thread
* Re: [PATCH] NET: Multiqueue network device support.
2007-06-13 13:33 ` jamal
2007-06-13 15:01 ` Leonid Grossman
@ 2007-06-13 15:53 ` Robert Olsson
2007-06-13 18:20 ` David Miller
2 siblings, 0 replies; 153+ messages in thread
From: Robert Olsson @ 2007-06-13 15:53 UTC (permalink / raw)
To: hadi
Cc: Robert Olsson, Zhu Yi, Leonid Grossman, Patrick McHardy,
David Miller, peter.p.waskiewicz.jr, netdev, jeff, auke-jan.h.kok
jamal writes:
> I think the one described by Leonid has not just 8 tx/rx rings but also
> a separate register set, MSI binding etc iirc. The only shared resources
> as far as i understood Leonid are the bus and the ethernet wire.
AFAIK most new NICs will look like this...
I still lack a lot of crucial hardware understanding.
What will happen if we for some reason are not capable of serving
one TX ring? The NIC is still working, so do we continue
filling/sending/clearing on the other rings?
> So in such a case (assuming 8 rings),
> One model is creating 4 netdev devices each based on single tx/rx ring
> and register set and then having a mother netdev (what you call the
> bond) that feeds these children netdev based on some qos parametrization
> is very sensible. Each of the children netdevices (by virtue of how we
> do things today) could be tied to a CPU for effectiveness (because our
> per CPU work is based on netdevs).
Some kind of supervising function for the TX is probably needed, as we still
want to see the device as one entity. But if upcoming HW supports parallelism
straight to the TX-ring, we would of course like to use it to get minimal cache
effects. It depends on how this "master netdev" or queue supervisor can be
designed.
> For the Leonid-NIC (for lack of better name) it may be harder to do
> parallelization on rcv if you use what i said above. But you could
> use a different model on receive - such as create a single netdev and
> with 8 rcv rings and MSI tied on rcv to 8 different CPUs
Yes, that should be the way to do it... and ethtool or something to hint
to the NIC how the incoming data is classified wrt the available CPUs. Maybe
something more dynamic for the brave ones.
Cheers
-ro
^ permalink raw reply [flat|nested] 153+ messages in thread
* Re: [PATCH] NET: Multiqueue network device support.
2007-06-12 21:13 ` Jeff Garzik
2007-06-12 21:17 ` Ben Greear
@ 2007-06-13 16:44 ` Rick Jones
1 sibling, 0 replies; 153+ messages in thread
From: Rick Jones @ 2007-06-13 16:44 UTC (permalink / raw)
To: Jeff Garzik
Cc: netdev, David Miller, kaber, hadi, peter.p.waskiewicz.jr,
auke-jan.h.kok
I'm starting to wonder how a multi-queue NIC differs from a bunch of
bonded single-queue NICs, and if there is a leverage opportunity there.
rick jones
^ permalink raw reply [flat|nested] 153+ messages in thread
* Re: [PATCH] NET: Multiqueue network device support.
2007-06-13 13:33 ` jamal
2007-06-13 15:01 ` Leonid Grossman
2007-06-13 15:53 ` Robert Olsson
@ 2007-06-13 18:20 ` David Miller
2007-06-13 18:22 ` Waskiewicz Jr, Peter P
2007-06-13 21:30 ` jamal
2 siblings, 2 replies; 153+ messages in thread
From: David Miller @ 2007-06-13 18:20 UTC (permalink / raw)
To: hadi
Cc: Robert.Olsson, yi.zhu, Leonid.Grossman, kaber,
peter.p.waskiewicz.jr, netdev, jeff, auke-jan.h.kok
From: jamal <hadi@cyberus.ca>
Date: Wed, 13 Jun 2007 09:33:22 -0400
> So in such a case (assuming 8 rings), One model is creating 4 netdev
> devices each based on single tx/rx ring and register set and then
> having a mother netdev (what you call the bond) that feeds these
> children netdev based on some qos parametrization is very sensible.
Why all of this layering and overhead for something so
BLOODY SIMPLE?!?!?
^ permalink raw reply [flat|nested] 153+ messages in thread
* RE: [PATCH] NET: Multiqueue network device support.
2007-06-13 18:20 ` David Miller
@ 2007-06-13 18:22 ` Waskiewicz Jr, Peter P
2007-06-13 21:30 ` jamal
1 sibling, 0 replies; 153+ messages in thread
From: Waskiewicz Jr, Peter P @ 2007-06-13 18:22 UTC (permalink / raw)
To: David Miller, hadi
Cc: Robert.Olsson, Zhu, Yi, Leonid.Grossman, kaber, netdev, jeff,
Kok, Auke-jan H
> From: jamal <hadi@cyberus.ca>
> Date: Wed, 13 Jun 2007 09:33:22 -0400
>
> > So in such a case (assuming 8 rings), One model is creating 4 netdev
> > devices each based on single tx/rx ring and register set and then
> > having a mother netdev (what you call the bond) that feeds these
> > children netdev based on some qos parametrization is very sensible.
>
> Why all of this layering and overhead for something so BLOODY
> SIMPLE?!?!?
>
I am currently packing up the newest patches against 2.6.23, with
feedback from Patrick. The delay in posting them was a weird panic with
the loopback device, which I just found. Let me run a test cycle or
two, and I'll send them today for review, including an e1000 patch to
show how to use the API.
Cheers,
-PJ Waskiewicz
^ permalink raw reply [flat|nested] 153+ messages in thread
* RE: [PATCH] NET: Multiqueue network device support.
2007-06-11 17:36 ` Patrick McHardy
2007-06-11 18:05 ` Waskiewicz Jr, Peter P
@ 2007-06-13 18:34 ` Waskiewicz Jr, Peter P
1 sibling, 0 replies; 153+ messages in thread
From: Waskiewicz Jr, Peter P @ 2007-06-13 18:34 UTC (permalink / raw)
To: Patrick McHardy; +Cc: davem, netdev, jeff, Kok, Auke-jan H
> PJ Waskiewicz wrote:
> > diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
> > index f28bb2d..b9dc2a6 100644
> > --- a/net/sched/sch_generic.c
> > +++ b/net/sched/sch_generic.c
> > @@ -123,7 +123,8 @@ static inline int qdisc_restart(struct net_device *dev)
> >  	/* And release queue */
> >  	spin_unlock(&dev->queue_lock);
> >
> > -	if (!netif_queue_stopped(dev)) {
> > +	if (!netif_queue_stopped(dev) &&
> > +	    !netif_subqueue_stopped(dev, skb->queue_mapping)) {
> >  		int ret;
> >
> >  		ret = dev_hard_start_xmit(skb, dev);
>
>
> Your patch doesn't update any other users of netif_queue_stopped().
> The assumption that they can pass packets to the driver when
> the queue is running is no longer valid since they don't know
> whether the subqueue the packet will end up in is active (it
> might be different from queue 0 if packets were redirected
> from a multiqueue aware qdisc through TC actions). So they
> need to be changed to check the subqueue state as well.
The cases I found were net/core/netpoll.c, net/core/pktgen.c, and the
software device case in net/core/dev.c. In all cases, the value of
skb->queue_mapping will be zero, but they don't initialize the subqueue
lock of the single allocated queue (hence panic when trying to use
it...). I also don't think it makes sense for them to care, since
->enqueue() doesn't get called as far as I can tell, therefore the
classification won't happen. Did I miss something in looking at this?
Thanks,
-PJ
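For reference, the combined test being discussed, written out as a helper
(the helper name is made up; netif_subqueue_stopped() and
skb->queue_mapping are the additions from the patch under review, so treat
this as a sketch of the idea rather than submitted code):

#include <linux/netdevice.h>
#include <linux/skbuff.h>

/*
 * Sketch only: the combined test that callers such as qdisc_restart(),
 * netpoll and pktgen would need once per-subqueue states exist.
 */
static inline int dev_tx_blocked(struct net_device *dev, struct sk_buff *skb)
{
	return netif_queue_stopped(dev) ||
	       netif_subqueue_stopped(dev, skb->queue_mapping);
}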
^ permalink raw reply [flat|nested] 153+ messages in thread
* Re: [PATCH] NET: Multiqueue network device support.
2007-06-13 18:20 ` David Miller
2007-06-13 18:22 ` Waskiewicz Jr, Peter P
@ 2007-06-13 21:30 ` jamal
1 sibling, 0 replies; 153+ messages in thread
From: jamal @ 2007-06-13 21:30 UTC (permalink / raw)
To: David Miller
Cc: Robert.Olsson, yi.zhu, Leonid.Grossman, kaber,
peter.p.waskiewicz.jr, netdev, jeff, auke-jan.h.kok
On Wed, 2007-13-06 at 11:20 -0700, David Miller wrote:
> From: jamal <hadi@cyberus.ca>
> Date: Wed, 13 Jun 2007 09:33:22 -0400
>
> > So in such a case (assuming 8 rings), One model is creating 4 netdev
> > devices each based on single tx/rx ring and register set and then
> > having a mother netdev (what you call the bond) that feeds these
> > children netdev based on some qos parametrization is very sensible.
>
> Why all of this layering and overhead for something so
> BLOODY SIMPLE?!?!?
Are we still talking about the same thing? ;->
This was about NICs which have multiple register sets and tx/rx rings;
the only shared resources are the bus and the wire.
The e1000 cant do that. The thread is too long, so you may be talking
about the same thing.
cheers,
jamal
^ permalink raw reply [flat|nested] 153+ messages in thread
* Re: [PATCH] NET: Multiqueue network device support.
2007-06-13 11:34 ` Patrick McHardy
@ 2007-06-14 1:51 ` Zhu Yi
0 siblings, 0 replies; 153+ messages in thread
From: Zhu Yi @ 2007-06-14 1:51 UTC (permalink / raw)
To: Patrick McHardy
Cc: David Miller, hadi, peter.p.waskiewicz.jr, netdev, jeff,
auke-jan.h.kok
On Wed, 2007-06-13 at 13:34 +0200, Patrick McHardy wrote:
> > The key argument for Jamal's solution is the NIC will send out 32
> > packets in the full PHL in a reasonably short time (a few microsecs per
> > Jamal's calculation). But for wireless, the PHL hardware has low
> > probability to seize the wireless medium when there are full of high
> > priority frames in the air. That is, the chance for transmission in PHL
> > and PHH is not equal. Queuing packets in software will starve high
> > priority packets than putting them to PHH as early as possible.
>
>
> Well, the key result of our discussion was that it makes no difference
> wrt. queuing behaviour if the queue wakeup strategy is suitable chosen
> for the specific queueing discipline, but it might add some overhead.
My point is the overhead is huge for the wireless case, which makes it
unacceptable. Given the above example in the wireless medium, which queue
wakeup strategy will you choose? I guess it might be "not stop tx
ring + requeue"? If this is selected, when a low priority
packet comes (and PHL is full), the Qdisc will keep dequeueing and requeueing
the same packet for a long time (given the nature of the wireless medium) and
chew tons of CPU. We met this problem before in our driver, and this (not
stop tx ring + requeue) is not a good thing to do.
Thanks,
-yi
^ permalink raw reply [flat|nested] 153+ messages in thread
* Re: [PATCH] NET: Multiqueue network device support.
2007-06-13 12:32 ` jamal
2007-06-13 13:12 ` Robert Olsson
@ 2007-06-14 2:44 ` Zhu Yi
2007-06-14 11:48 ` jamal
1 sibling, 1 reply; 153+ messages in thread
From: Zhu Yi @ 2007-06-14 2:44 UTC (permalink / raw)
To: hadi
Cc: Patrick McHardy, David Miller, peter.p.waskiewicz.jr, netdev,
jeff, auke-jan.h.kok
On Wed, 2007-06-13 at 08:32 -0400, jamal wrote:
> The key arguement i make (from day one actually) is to leave the
> majority of the work to the driver.
But it does not seem feasible for the Qdisc to know nothing about the
hardware rings.
> My view of wireless WMM etc is it is a different media behavior
> (compared to wired ethernet) which means a different view of strategy
> for when it opens the valve to allow in more packets. 802.11 media has
> embedded signalling which is usable. Guy Cohen gave a good use case
> which i responded to. Do you wanna look at that and respond?
The key to supporting multi-ring hardware in software is to put packets
into the hardware as much/as early as possible. Guy gave a good VO vs. BK
example. To achieve this in your model, you have to keep the TX ring
running (in the case of a full PHL) and requeue. But when there are only
BK packets coming, you do want to stop the ring, right? AFAICS, the
driver is not the best place to make that decision (it only knows the
current and previous packets, but not the _next_); the Qdisc is the best
place.
Thanks,
-yi
^ permalink raw reply [flat|nested] 153+ messages in thread
* Re: [PATCH] NET: Multiqueue network device support.
2007-06-14 2:44 ` Zhu Yi
@ 2007-06-14 11:48 ` jamal
2007-06-15 1:27 ` Zhu Yi
0 siblings, 1 reply; 153+ messages in thread
From: jamal @ 2007-06-14 11:48 UTC (permalink / raw)
To: Zhu Yi
Cc: Patrick McHardy, David Miller, peter.p.waskiewicz.jr, netdev,
jeff, auke-jan.h.kok
Hi Yi,
On Thu, 2007-14-06 at 10:44 +0800, Zhu Yi wrote:
> On Wed, 2007-06-13 at 08:32 -0400, jamal wrote:
> > The key arguement i make (from day one actually) is to leave the
> > majority of the work to the driver.
>
> But it seems not feasible the Qdisc needs to know nothing about the
> hardware rings.
This discussion is addressing whether it is feasible to do it without
the qdisc knowing anything about the hardware ring.
> > My view of wireless WMM etc is it is a different media behavior
> > (compared to wired ethernet) which means a different view of strategy
> > for when it opens the valve to allow in more packets. 802.11 media has
> > embedded signalling which is usable. Guy Cohen gave a good use case
> > which i responded to. Do you wanna look at that and respond?
>
> The key to support multi-ring hardware for software is to put packets
> into hardware as much/early as possible. Guy gave a good VO vs. BK
> example. To achieve this in your model, you have to keep the TX ring
> running (in the case of PHL full) and requeue. But when there are only
> BK packets coming, you do want to stop the ring, right? AFAICS, the
> driver is not the best place to make the decision (it only knows the
> current and previous packets, but not the _next_), the Qdisc is the best
> place.
>
I dont have much time to follow up for some time to come. I have left my
answer above. To clarify, in case i wasnt clear, I am saying:
a) It is better to have the driver pick its own strategy for when to
open the tx path than to try to be generic. This shifts the burden to
the driver.
b) given the behavior of wireless media (which is very different from
wired ethernet media), you need a different strategy. In response to
Guy's question, I gave the example of being able to use management
frames to open up the tx path for VO (even when you dont know VO packets
are sitting on the qdisc); alternatively you could use a timer etc.
There are many ways to skin the cat (with apologies to cat lovers/owners),
i.e. you need to look at the media and be creative.
Peter's DCE for example could also be handled by having a specific
strategy.
I will try to continue participating in the discussion (if CCed) but
much less for about a week. In any case I think i have had the
discussion i was hoping for and trust Patrick understands both sides.
This thread has run for too long folks, eh?
cheers,
jamal
^ permalink raw reply [flat|nested] 153+ messages in thread
* Re: [PATCH] NET: Multiqueue network device support.
2007-06-14 11:48 ` jamal
@ 2007-06-15 1:27 ` Zhu Yi
2007-06-15 10:49 ` jamal
0 siblings, 1 reply; 153+ messages in thread
From: Zhu Yi @ 2007-06-15 1:27 UTC (permalink / raw)
To: hadi
Cc: Patrick McHardy, David Miller, peter.p.waskiewicz.jr, netdev,
jeff, auke-jan.h.kok
On Thu, 2007-06-14 at 07:48 -0400, jamal wrote:
> I dont have much time to followup for sometime to come. I have left my
> answer above. To clarify, incase i wasnt clear, I am saying:
> a) It is better to have the driver change via some strategy of when to
> open the tx path than trying to be generic. This shifts the burden to
> the driver.
> b) given the behavior of wireless media (which is very different from
> wired ethernet media), you need a different strategy. In response to
> Guy's question, I gave the example of being able to use management
> frames to open up the tx path for VO (even when you dont know VO packets
> are sitting on the qdisc); alternatively you could use a timer etc.
> Theres many ways to skin the cat (with apologies to cat
> lovers/owners).
> i.e you need to look at the media and be creative.
> Peters DCE for example could also be handled by having a specific
> strategy.
OK. You tried so much to guess the traffic flow pattern in the low level
driver, which could be implemented straightforwardly in the Qdisc. The pro
is that the Qdisc API is untouched. But the cons are:
1. the driver becomes complicated (the queue wakeup strategy design is
too elaborate)
2. duplicated code among drivers (otherwise you put all the queue
management logic in a new layer?)
3. it's not 100% accurate. there has to be some overhead, more or less
depending on the queue wakeup strategy the driver selected.
Time for voting?
Thanks,
-yi
^ permalink raw reply [flat|nested] 153+ messages in thread
* Re: [PATCH] NET: Multiqueue network device support.
2007-06-15 1:27 ` Zhu Yi
@ 2007-06-15 10:49 ` jamal
2007-06-18 1:18 ` Zhu Yi
0 siblings, 1 reply; 153+ messages in thread
From: jamal @ 2007-06-15 10:49 UTC (permalink / raw)
To: Zhu Yi
Cc: Patrick McHardy, David Miller, peter.p.waskiewicz.jr, netdev,
jeff, auke-jan.h.kok
Hello Yi,
On Fri, 2007-15-06 at 09:27 +0800, Zhu Yi wrote:
> 1. driver becomes complicated (as it is too elaborate in the queue
> wakeup strategies design)
I am not sure i see the complexity in the wireless driver's wakeup
strategy. I just gave some suggestions to use management frames - they
dont have to be literally that way.
> 2. duplicated code among drivers (otherwise you put all the queue
> management logics in a new layer?)
There will perhaps be some shared code among drivers of the same media for
the netif_stop/wake strategy, but not related to queue management.
> 3. it's not 100% accurate. there has to be some overhead, more or less
> depends on the queue wakeup strategy the driver selected.
Why is it not accurate for wireless? I can see the corner case Patrick
mentioned in wired ethernet but then wired ethernet doesnt have other
events such as management frames (actually DCE does) to help.
cheers,
jamal
^ permalink raw reply [flat|nested] 153+ messages in thread
* Re: [PATCH] NET: Multiqueue network device support.
2007-06-15 10:49 ` jamal
@ 2007-06-18 1:18 ` Zhu Yi
2007-06-18 15:16 ` jamal
0 siblings, 1 reply; 153+ messages in thread
From: Zhu Yi @ 2007-06-18 1:18 UTC (permalink / raw)
To: hadi
Cc: Patrick McHardy, David Miller, peter.p.waskiewicz.jr, netdev,
jeff, auke-jan.h.kok
On Fri, 2007-06-15 at 06:49 -0400, jamal wrote:
> Hello Yi,
>
> On Fri, 2007-15-06 at 09:27 +0800, Zhu Yi wrote:
>
> > 1. driver becomes complicated (as it is too elaborate in the queue
> > wakeup strategies design)
>
> I am not sure i see the complexity in the wireless driver's wakeup
> strategy. I just gave some suggestions to use management frames - they
> dont have to be literally that way.
>
> > 2. duplicated code among drivers (otherwise you put all the queue
> > management logics in a new layer?)
>
> There will be some shared code on drivers of same media on the
> netif_stop/wake strategy perhaps, but not related to queue management.
>
> > 3. it's not 100% accurate. there has to be some overhead, more or less
> > depends on the queue wakeup strategy the driver selected.
>
> Why is it not accurate for wireless? I can see the corner case Patrick
> mentioned in wired ethernet but then wired ethernet doesnt have other
> events such as management frames (actually DCE does) to help.
Would you respond to the question I asked earlier: in your model, how do you
define the queue wakeup strategy in the driver to deal with the PHL-full
situation? Consider that 1) both high prio and low prio packets could
come (you cannot predict it beforehand) and 2) the time for the PHL to send out
a packet to the wireless medium is relatively long (given the medium is
congested). If you can resolve it in an elegant way, I'm all ears.
Thanks,
-yi
^ permalink raw reply [flat|nested] 153+ messages in thread
* Re: [PATCH] NET: Multiqueue network device support.
2007-06-18 1:18 ` Zhu Yi
@ 2007-06-18 15:16 ` jamal
2007-06-19 2:12 ` Zhu Yi
0 siblings, 1 reply; 153+ messages in thread
From: jamal @ 2007-06-18 15:16 UTC (permalink / raw)
To: Zhu Yi
Cc: Patrick McHardy, David Miller, peter.p.waskiewicz.jr, netdev,
jeff, auke-jan.h.kok
Hello Yi,
On Mon, 2007-18-06 at 09:18 +0800, Zhu Yi wrote:
> Would you respond the question I asked early,
I thought i did respond to all questions you asked but some may have
been lost in the noise.
> in your model how to
> define the queue wakeup strategy in the driver to deal with the PHL full
> situation? Consider about 1) both high prio and low prio packets could
> come (you cannot predict it beforehand)
I am assuming by "come" you mean from the stack (for example an ssh packet)
as opposed to from the outside.
> 2) the time for PHL to send out
> a packet to the wireless medium is relative long (given the medium is
> congested). If you can resolve it in an elegant way, I'm all ears.
Congestion periods are the only time any of this stuff makes sense.
Ok, so let me repeat what i said earlier:
Once a packet is in the DMA ring, we dont take it out. If a high prio
packet is blocking a low prio one, i consider that to be fine. If, otoh,
you receive a management detail from the AP indicating that LP has had its
priority bumped or HP has had its prio lowered, then by all means use that
info to open up the path again. Again, that is just an example; you could use
that or other schemes (refer to my expression on cats earlier).
Anyways, you will have to forgive me - this thread is getting too long
and i dont have much time to follow up on this topic for about a week;
and given we are not meeting anywhere in the middle, i am having a hard
time continuing to repeat the same arguments over and over again. It is
ok for rational people to agree to disagree for the sake of progress.
cheers,
jamal
^ permalink raw reply [flat|nested] 153+ messages in thread
* Re: [PATCH] NET: Multiqueue network device support.
2007-06-18 15:16 ` jamal
@ 2007-06-19 2:12 ` Zhu Yi
2007-06-19 16:04 ` jamal
0 siblings, 1 reply; 153+ messages in thread
From: Zhu Yi @ 2007-06-19 2:12 UTC (permalink / raw)
To: hadi
Cc: Patrick McHardy, David Miller, peter.p.waskiewicz.jr, netdev,
jeff, auke-jan.h.kok
On Mon, 2007-06-18 at 11:16 -0400, jamal wrote:
> > in your model how to
> > define the queue wakeup strategy in the driver to deal with the PHL full
> > situation? Consider about 1) both high prio and low prio packets could
> > come (you cannot predict it beforehand)
>
> I am assuming by "come" you mean from the stack (example an ssh packet)
> as opposed from the outside.
Right.
> > 2) the time for PHL to send out
> > a packet to the wireless medium is relative long (given the medium is
> > congested). If you can resolve it in an elegant way, I'm all ears.
>
> Congestion periods are the only time any of this stuff makes sense.
We are talking about the period from the time PHL is full to the time it
can accept more packets again. How to design the queue wakeup policy in
this period is the question.
> Ok, so let me repeat what i said earlier:
>
> Once a packet is in the DMA ring, we dont take it out. If a high prio
> packet is blocking a low prio one, i consider that to be fine. If otoh,
> you receive a management detail from the AP indicating that LP has its
> priority bumped or HP has its prio lowered, then by all means use that
> info to open up the path again. Again, that is an example, you could use
> that or schemes (refer to my expression on cats earlier).
No, this is not my question. Mine was much simpler. We don't need to
consider the wireless dynamic priority change case at this time. Just
tell me what you expect the driver to do (stop|start the queue) when the
hardware PHL is full but PHH is empty?
Thanks,
-yi
^ permalink raw reply [flat|nested] 153+ messages in thread
* Re: [PATCH] NET: Multiqueue network device support.
2007-06-19 2:12 ` Zhu Yi
@ 2007-06-19 16:04 ` jamal
2007-06-20 5:51 ` Zhu Yi
0 siblings, 1 reply; 153+ messages in thread
From: jamal @ 2007-06-19 16:04 UTC (permalink / raw)
To: Zhu Yi
Cc: Patrick McHardy, David Miller, peter.p.waskiewicz.jr, netdev,
jeff, auke-jan.h.kok
On Tue, 2007-19-06 at 10:12 +0800, Zhu Yi wrote:
> Mine was much simpler. We don't need to
> consider the wireless dynamic priority change case at this time. Just
> tell me what you suppose the driver to do (stop|start queue) when the
> hardware PHL is full but PHH is empty?
I already responded to this a few emails back.
My suggestion then was:
Pick between a timer and a number of packets X transmitted, whichever
comes first. [In the e1000 for example, the opening strategy is that every
time 32 packets get transmitted, you open up.]
In the case of wireless, pick two numbers XHH and XHL with XHL < XHH.
The timers would be similar in nature (THH > THL). All these variables
are only valid if you shut down the ring.
So in the case where HL shuts down the ring, you fire THL. If either XHL
packets are transmitted or THL expires, you netif_wake.
Did that make sense?
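As a rough sketch of that strategy for a two-ring (HL/HH) device, with all
names, thresholds and timer values invented for illustration and locking
omitted:

/*
 * Sketch of the "X packets or timer, whichever comes first" wake strategy
 * for the low-priority (HL) ring. Not from any real driver; a real driver
 * would call setup_timer(&np->hl_wake_timer, hl_timer_fn, (unsigned long)np)
 * at probe time and take its tx lock around the counters.
 */
#include <linux/netdevice.h>
#include <linux/timer.h>
#include <linux/jiffies.h>

#define XHL 4			/* wake after this many HL tx completions  */
#define THL (HZ / 100)		/* ... or after this long, whichever first */

struct nic_priv {
	struct net_device *dev;
	struct timer_list  hl_wake_timer;
	unsigned int       hl_completed;	/* tx completions since stop */
	int                hl_stopped;
};

static void hl_timer_fn(unsigned long data)
{
	struct nic_priv *np = (struct nic_priv *)data;

	np->hl_stopped = 0;
	netif_wake_queue(np->dev);		/* the timer fired first */
}

/* Called when the HL ring fills and we stop the (single) tx queue. */
static void hl_ring_full(struct nic_priv *np)
{
	netif_stop_queue(np->dev);
	np->hl_stopped = 1;
	np->hl_completed = 0;
	mod_timer(&np->hl_wake_timer, jiffies + THL);
}

/* Called from the tx-completion path for the HL ring. */
static void hl_tx_complete(struct nic_priv *np)
{
	if (np->hl_stopped && ++np->hl_completed >= XHL) {
		del_timer(&np->hl_wake_timer);
		np->hl_stopped = 0;
		netif_wake_queue(np->dev);	/* XHL packets went out first */
	}
}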
BTW, this thread is going back and forth on the same recycled
arguments. As an example, i have responded to this specific question.
Can we drop the discussion?
cheers,
jamal
^ permalink raw reply [flat|nested] 153+ messages in thread
* Re: [PATCH] NET: Multiqueue network device support.
2007-06-19 16:04 ` jamal
@ 2007-06-20 5:51 ` Zhu Yi
2007-06-21 15:39 ` jamal
0 siblings, 1 reply; 153+ messages in thread
From: Zhu Yi @ 2007-06-20 5:51 UTC (permalink / raw)
To: hadi
Cc: Patrick McHardy, David Miller, peter.p.waskiewicz.jr, netdev,
jeff, auke-jan.h.kok
On Tue, 2007-06-19 at 12:04 -0400, jamal wrote:
> In the case of wireless, pick two numbers XHH and XHL with XHL < XHH.
> The timers would be similar in nature (THH > THL). All these variables
> are only valid if you shutdown the ring.
> So in the case HL shuts down the ring, you fire THL. If either XHL
> packets are transmitted or THL expires, you netif_wake.
> Did that make sense?
No, because this is over-engineered. Furthermore, don't you think the
algorithm is complicated and unnecessary (i.e. one timer per h/w queue)?
Do you think the driver maintainer will accept such a workaround
patch? You did too much to keep the Qdisc interface untouched!
Besides, the lower the THL you choose, the more CPU time is wasted in a busy
loop for the PL-only case; the higher the THL you choose, the more slowly the PH
packets will be sent out (the driver doesn't fully utilize
the device's capability -- multiple rings -- which conflicts with a device
driver's intention). You can never make a good trade-off in this model.
I think I have fully understood you, but your point is invalid. The
Qdisc must be changed to have the hardware queue information to support
multiple hardware queue devices.
Thanks,
-yi
^ permalink raw reply [flat|nested] 153+ messages in thread
* Re: [PATCH] NET: Multiqueue network device support.
2007-06-20 5:51 ` Zhu Yi
@ 2007-06-21 15:39 ` jamal
2007-06-22 1:26 ` Zhu Yi
0 siblings, 1 reply; 153+ messages in thread
From: jamal @ 2007-06-21 15:39 UTC (permalink / raw)
To: Zhu Yi
Cc: Patrick McHardy, David Miller, peter.p.waskiewicz.jr, netdev,
jeff, auke-jan.h.kok
I gave you two opportunities to bail out of this discussion; I am going
to take your rejection of that offer to mean that you, my friend, want to
get to the bottom of this, i.e. you are on a mission to find the truth.
So let's continue this.
On Wed, 2007-20-06 at 13:51 +0800, Zhu Yi wrote:
> No, because this is over-engineered.
> Furthermore, don't you think the
> algorithm is complicated and unnecessary (i.e. one timer per h/w queue)?
The (one-shot) timer is only necessary when a ring shuts down the
driver. This is only for the case of wireless media. Standard wired
Ethernet doesn't need it.
Note: you are not going to convince me by throwing cliches like "this is
over-engineering" around, because it leads to a response like "Not at
all. I think sending flow control messages back to the stack is
over-engineering." And where do we go then?
> Do you think the driver maintainer will accept such a workaround
> patch?
Give me access to your manual for the chip on my laptop wireless which
is 3945ABG and i can produce a very simple patch for you. Actually if
you answer some questions for me, it may be good enough to produce such
a patch.
> You did too much to keep the Qdisc interface untouched!
What metric do you want to define for "too much" - lines of code?
Complexity? I consider architecture cleanliness to be more important.
> Besides, the lower THL you choose, the more CPU time is wasted in busy
> loop for the only PL case;
Your choice of THL and THH has nothing to do with what i am proposing.
I am not proposing you even touch that. What numbers do you have today?
What i am saying is you use _some_ value for opening up the driver; some
enlightened drivers such as the tg3 (and the e1000 - for which i
unashamedly take credit) do have such parametrization. This has already
been proven to be valuable.
The timer fires only if a ring shuts down the interface. Where is the
busy loop? If packets go out, there is no timer.
> the higher THL you choose, the slower the PH
> packets will be sent out than expected (the driver doesn't fully utilize
> the device function -- multiple rings,
I dont think you understood: Whatever value you choose for THL and THH
today, keep those. OTOH, the wake threshold is what I was referring to.
> which conflicts with a device driver's intention).
I don't see how, given I am talking about wake thresholds.
> You can never make a good trade off in this model.
Refer to above.
> I think I have fully understood you,
Thanks for coming such a long way - you stated before that it couldn't be
done unless you sent feedback to the stack.
> but your point is invalid. The
> Qdisc must be changed to have the hardware queue information to support
> devices with multiple hardware queues.
>
Handwaving as above doesnt add value to a discussion. If you want
meaningful discussions, stop these cliches.
cheers,
jamal
^ permalink raw reply [flat|nested] 153+ messages in thread
* Re: [PATCH] NET: Multiqueue network device support.
2007-06-21 15:39 ` jamal
@ 2007-06-22 1:26 ` Zhu Yi
2007-06-25 16:47 ` jamal
0 siblings, 1 reply; 153+ messages in thread
From: Zhu Yi @ 2007-06-22 1:26 UTC (permalink / raw)
To: hadi
Cc: Patrick McHardy, David Miller, peter.p.waskiewicz.jr, netdev,
jeff, auke-jan.h.kok
On Thu, 2007-06-21 at 11:39 -0400, jamal wrote:
> I gave you two opportunities to bail out of this discussion; I am going
> to take your rejection of that offer to mean that you, my friend, want to
> get to the bottom of this, i.e. you are on a mission to find the truth.
> So let's continue this.
It sounds stupid that I'm still trying to convince you why we need multiqueue
support in the Qdisc when everybody else is already working on the code,
fixing bugs and preparing for merge. The only reason I keep up the
conversation is that I think you _might_ have some really good points
buried under everybody else's positive support for multiqueue. But
as the conversation goes on, that turns out not to be the case. Let me snip
the nonsense below and focus only on the technical points.
> > Besides, the lower THL you choose, the more CPU time is wasted in busy
> > loop for the only PL case;
>
> Your choice of THL and THH has nothing to do with what i am proposing.
> I am not proposing you even touch that. What numbers do you have today?
We don't have THL and THH in our driver. They are what you suggested.
The queue wakeup number is 1/4 of the ring size.
> What i am saying is you use _some_ value for opening up the driver; some
> enlightened drivers such as the tg3 (and the e1000 - for which i
> unashamedly take credit) do have such parametrization. This has already
> been proven to be valuable.
>
> The timer fires only if a ring shuts down the interface. Where is the
> busy loop? If packets go out, there is no timer.
The busy loop happens in the period after the ring is shut down and
before it is opened again. During this period, the Qdisc will keep
dequeuing and requeuing PL packets in the Tx SoftIRQ, where the busy
loop happens.
> > the higher THL you choose, the slower the PH
> > packets will be sent out than expected (the driver doesn't fully utilize
> > the device function -- multiple rings,
>
> I dont think you understood: Whatever value you choose for THL and THH
> today, keep those. OTOH, the wake threshold is what I was referring to.
I don't even care about the threshold. Even if you set it to 1, there is
still a busy loop during the period before the first packet is sent out
over the air. But you cannot ignore this small window, because it could be
longer when the wireless medium is congested with high-priority packets.
Thanks,
-yi
^ permalink raw reply [flat|nested] 153+ messages in thread
* Re: [PATCH] NET: Multiqueue network device support.
2007-06-22 1:26 ` Zhu Yi
@ 2007-06-25 16:47 ` jamal
2007-06-25 20:47 ` David Miller
0 siblings, 1 reply; 153+ messages in thread
From: jamal @ 2007-06-25 16:47 UTC (permalink / raw)
To: Zhu Yi
Cc: Patrick McHardy, David Miller, peter.p.waskiewicz.jr, netdev,
jeff, auke-jan.h.kok
On Fri, 2007-22-06 at 09:26 +0800, Zhu Yi wrote:
> On Thu, 2007-06-21 at 11:39 -0400, jamal wrote:
> It sounds stupid that I'm still trying to convince you why we need multiqueue
> support in the Qdisc when everybody else is already working on the code,
If you go back historically (maybe 2 years ago on netdev?) - i was a big
fan of the scheme used in those patches ;->
In a Xcracy like Linux, you have to agree to disagree at some point and
move on. I dont need any core changes to deploy what i am suggesting.
> fixing bugs and preparing for merge. The only reason I keep up the
> conversation is that I think you _might_ have some really good points
> buried under everybody else's positive support for multiqueue. But
> as the conversation goes on, that turns out not to be the case.
We have come a long way and maybe you just didnt understand me
initially.
> We don't have THL and THH in our driver. They are what you suggested.
> The queue wakeup number is 1/4 of the ring size.
So how did you pick 1/4? Experimentation? If you look at tg3 it's much
higher, for example.
> > The timer fires only if a ring shuts down the interface. Where is the
> > busy loop? If packets go out, there is no timer.
>
> The busy loop happens in the period after the ring is shut down and
> before it is opened again. During this period, the Qdisc will keep
> dequeuing and requeuing PL packets in the Tx SoftIRQ, where the busy
> loop happens.
Ok, sure - this boils down to what Patrick pointed out as well. I would see
this as being similar to any other corner case you meet. You may be able
to convince me otherwise if you can show me some numbers, for example:
how often this would happen, and how long an LP would be disallowed from
sending on the wire when the ring is full. I have read a few papers (I
posted one or two on the list) and none seem to have come across this as
an issue. You may have better insight.
So to me this is a corner case which is resolvable. I wouldn't consider
this to be any different from, say, dealing with failed allocs. You have to
deal with them.
So let's make this an engineering challenge and try to see how many ways
we can solve it ...
Here's one:
Use an exponential backoff timer, i.e. if you decide that you will open
the path every 1 sec or X packets, then the next time it turns out to be a
false positive, increase the timer interval, up to an upper bound (a lot of
protocols do this). That should substantially cut down on how many times
you open up only to find no HP packets.
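A minimal sketch of that backoff idea, with invented names and values
(THL_MIN_MSECS/THL_MAX_MSECS, wake_backoff_*); it is meant to sit on top of
whatever backstop timer the driver already arms, not to replace it:
#include <linux/types.h>

#define THL_MIN_MSECS   10
#define THL_MAX_MSECS   1000

struct wake_backoff {
	unsigned int thl_msecs;        /* current backstop period, start at THL_MIN_MSECS */
	bool hp_seen_since_wake;       /* did any HP traffic follow the last wake? */
};

/* Queue just re-opened: remember that we have not yet seen HP traffic. */
static void wake_backoff_opened(struct wake_backoff *b)
{
	b->hp_seen_since_wake = false;
}

/* An HP packet actually arrived: the last wake was useful, so reset. */
static void wake_backoff_hp_packet(struct wake_backoff *b)
{
	b->hp_seen_since_wake = true;
	b->thl_msecs = THL_MIN_MSECS;
}

/* Ring filled up again; pick the next backstop period.  A wake that saw
 * no HP traffic counts as a false positive and doubles the period, capped
 * at the upper bound.
 */
static unsigned int wake_backoff_next_period(struct wake_backoff *b)
{
	if (!b->hp_seen_since_wake) {
		b->thl_msecs *= 2;
		if (b->thl_msecs > THL_MAX_MSECS)
			b->thl_msecs = THL_MAX_MSECS;
	}
	return b->thl_msecs;
}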
> > I dont think you understood: Whatever value you choose for THL and THH
> > today, keep those. OTOH, the wake threshold is what I was referring to.
>
> I don't even care about the threshold. Even if you set it to 1, there is
> still a busy loop during the period before the first packet is sent out
> over the air. But you cannot ignore this small window, because it could be
> longer when the wireless medium is congested with high-priority packets.
Give me some numbers and you may be able to convince me that this may
not be so good for wireless. I have had no problems with the prescribed
scheme for multiqueue Ethernet chips.
cheers,
jamal
^ permalink raw reply [flat|nested] 153+ messages in thread
* Re: [PATCH] NET: Multiqueue network device support.
2007-06-25 16:47 ` jamal
@ 2007-06-25 20:47 ` David Miller
2007-06-26 13:27 ` jamal
0 siblings, 1 reply; 153+ messages in thread
From: David Miller @ 2007-06-25 20:47 UTC (permalink / raw)
To: hadi; +Cc: yi.zhu, kaber, peter.p.waskiewicz.jr, netdev, jeff,
auke-jan.h.kok
From: jamal <hadi@cyberus.ca>
Date: Mon, 25 Jun 2007 12:47:31 -0400
> On Fri, 2007-22-06 at 09:26 +0800, Zhu Yi wrote:
> > We don't have THL and THH in our driver. They are what you suggested.
> > The queue wakeup number is 1/4 of the ring size.
>
> So how did you pick 1/4? Experimentation? If you look at tg3 it's much
> higher, for example.
tg3 uses 1/4:
#define TG3_TX_WAKEUP_THRESH(tp) ((tp)->tx_pending / 4)
tp->tx_pending is the current configured ring size, configurable
via ethtool.
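To illustrate how such a threshold is typically applied, here is a generic
sketch (not tg3's actual reclaim code; the struct and field names are
invented): the tx-completion handler only re-opens a stopped queue once a
sizeable fraction of the ring has been reclaimed, so the queue does not
ping-pong between stopped and woken on every completed packet.
#include <linux/netdevice.h>

struct tx_ring_priv {
	struct net_device *dev;
	unsigned int tx_ring_size;   /* number of descriptors */
	unsigned int tx_head;        /* free-running consumer counter */
	unsigned int tx_tail;        /* free-running producer counter */
};

static unsigned int tx_free_descs(const struct tx_ring_priv *p)
{
	/* Counters are free-running; unsigned wraparound keeps this correct. */
	return p->tx_ring_size - (p->tx_tail - p->tx_head);
}

/* Tx-completion handler: after reclaiming finished descriptors (advancing
 * tx_head), wake the queue only once at least a quarter of the ring is
 * free, mirroring the TG3_TX_WAKEUP_THRESH idea quoted above.
 */
static void tx_reclaim_done(struct tx_ring_priv *p)
{
	if (netif_queue_stopped(p->dev) &&
	    tx_free_descs(p) >= p->tx_ring_size / 4)
		netif_wake_queue(p->dev);
}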
^ permalink raw reply [flat|nested] 153+ messages in thread
* Re: [PATCH] NET: Multiqueue network device support.
2007-06-25 20:47 ` David Miller
@ 2007-06-26 13:27 ` jamal
2007-06-26 20:57 ` David Miller
0 siblings, 1 reply; 153+ messages in thread
From: jamal @ 2007-06-26 13:27 UTC (permalink / raw)
To: David Miller
Cc: yi.zhu, kaber, peter.p.waskiewicz.jr, netdev, jeff,
auke-jan.h.kok
On Mon, 2007-25-06 at 13:47 -0700, David Miller wrote:
> tg3 uses 1/4:
>
> #define TG3_TX_WAKEUP_THRESH(tp) ((tp)->tx_pending / 4)
>
Sorry - I meant tg3 uses a much higher value (default is 128), say
relative to e1000 (default of 32).
My tests with batching on e1000 indicate 128 gives the best results,
which is 1/2 if you look at it from a tx ring size perspective. So I have
hardcoded it in the git tree.
> tp->tx_pending is the current configured ring size, configurable
> via ethtool.
That's what e1000 needs as well - currently it hardcodes things.
I think ethtool is the way to go - I will update the batch tree with
that fix for e1000.
Back to the question: Do you recall how this number was arrived at?
128 packets will be sent out at GiGe in about 80 microsecs, so from a
feel-the-wind-direction perspective it seems reasonable.
cheers,
jamal
^ permalink raw reply [flat|nested] 153+ messages in thread
* Re: [PATCH] NET: Multiqueue network device support.
2007-06-26 13:27 ` jamal
@ 2007-06-26 20:57 ` David Miller
2007-06-27 22:32 ` jamal
0 siblings, 1 reply; 153+ messages in thread
From: David Miller @ 2007-06-26 20:57 UTC (permalink / raw)
To: hadi; +Cc: yi.zhu, kaber, peter.p.waskiewicz.jr, netdev, jeff,
auke-jan.h.kok
From: jamal <hadi@cyberus.ca>
Date: Tue, 26 Jun 2007 09:27:28 -0400
> Back to the question: Do you recall how this number was arrived at?
> 128 packets will be sent out at GiGe in about 80 microsecs, so from a
> feel-the-wind-direction perspective it seems reasonable.
I picked it out of a hat.
^ permalink raw reply [flat|nested] 153+ messages in thread
* Re: [PATCH] NET: Multiqueue network device support.
2007-06-26 20:57 ` David Miller
@ 2007-06-27 22:32 ` jamal
2007-06-27 22:54 ` David Miller
0 siblings, 1 reply; 153+ messages in thread
From: jamal @ 2007-06-27 22:32 UTC (permalink / raw)
To: David Miller
Cc: yi.zhu, kaber, peter.p.waskiewicz.jr, netdev, jeff,
auke-jan.h.kok
On Tue, 2007-26-06 at 13:57 -0700, David Miller wrote:
> From: jamal <hadi@cyberus.ca>
> Date: Tue, 26 Jun 2007 09:27:28 -0400
>
> > Back to the question: Do you recall how this number was arrived at?
> > 128 packets will be sent out at GiGe in about 80 microsecs, so from a
> > feel-the-wind-direction perspective it seems reasonable.
>
> I picked it out of a hat.
It is not a bad value for Gige; doubt it will be a good one for 10/100
or even 10GE.
But you could say that about the ring sizes too.
cheers,
jamal
^ permalink raw reply [flat|nested] 153+ messages in thread
* Re: [PATCH] NET: Multiqueue network device support.
2007-06-27 22:32 ` jamal
@ 2007-06-27 22:54 ` David Miller
2007-06-28 0:15 ` jamal
0 siblings, 1 reply; 153+ messages in thread
From: David Miller @ 2007-06-27 22:54 UTC (permalink / raw)
To: hadi; +Cc: yi.zhu, kaber, peter.p.waskiewicz.jr, netdev, jeff,
auke-jan.h.kok
From: jamal <hadi@cyberus.ca>
Date: Wed, 27 Jun 2007 18:32:45 -0400
> On Tue, 2007-26-06 at 13:57 -0700, David Miller wrote:
> > From: jamal <hadi@cyberus.ca>
> > Date: Tue, 26 Jun 2007 09:27:28 -0400
> >
> > > Back to the question: Do you recall how this number was arrived at?
> > > 128 packets will be sent out at GiGe in about 80 microsecs, so from a
> > > feel-the-wind-direction perspective it seems reasonable.
> >
> > I picked it out of a hat.
>
> It is not a bad value for Gige; doubt it will be a good one for 10/100
> or even 10GE.
> But you could say that about the ring sizes too.
The thing that's really important is that the value is not so
large such that the TX ring can become empty.
^ permalink raw reply [flat|nested] 153+ messages in thread
* Re: [PATCH] NET: Multiqueue network device support.
2007-06-27 22:54 ` David Miller
@ 2007-06-28 0:15 ` jamal
2007-06-28 0:31 ` David Miller
0 siblings, 1 reply; 153+ messages in thread
From: jamal @ 2007-06-28 0:15 UTC (permalink / raw)
To: David Miller
Cc: yi.zhu, kaber, peter.p.waskiewicz.jr, netdev, jeff,
auke-jan.h.kok
On Wed, 2007-27-06 at 15:54 -0700, David Miller wrote:
> The thing that's really important is that the value is not so
> large such that the TX ring can become empty.
In the case of batching, varying the values makes a difference.
The logic is that the longer you can tune the driver to stay closed
("sufficiently long"), the more packets you accumulate at the qdisc and
the more you can batch to the driver when it opens up.
Deciding what "sufficiently long" means is an art - and I am sure it is
speed dependent. With e1000 at gige, 128 seems to be a good value; going
above or below that gave lower performance.
cheers,
jamal
^ permalink raw reply [flat|nested] 153+ messages in thread
* Re: [PATCH] NET: Multiqueue network device support.
2007-06-28 0:15 ` jamal
@ 2007-06-28 0:31 ` David Miller
0 siblings, 0 replies; 153+ messages in thread
From: David Miller @ 2007-06-28 0:31 UTC (permalink / raw)
To: hadi; +Cc: yi.zhu, kaber, peter.p.waskiewicz.jr, netdev, jeff,
auke-jan.h.kok
From: jamal <hadi@cyberus.ca>
Date: Wed, 27 Jun 2007 20:15:47 -0400
> On Wed, 2007-27-06 at 15:54 -0700, David Miller wrote:
>
> > The thing that's really important is that the value is not so
> > large such that the TX ring can become empty.
>
> In the case of batching, varying the values makes a difference.
> The logic is that the longer you can tune the driver to stay closed
> ("sufficiently long"), the more packets you accumulate at the qdisc and
> the more you can batch to the driver when it opens up.
> Deciding what "sufficiently long" means is an art - and I am sure it is
> speed dependent. With e1000 at gige, 128 seems to be a good value; going
> above or below that gave lower performance.
Right. And another thing you want to moderate is lock hold
times, perhaps even at the slight expense of performance.
^ permalink raw reply [flat|nested] 153+ messages in thread
Thread overview: 153+ messages
2007-06-04 21:40 [RFC] NET: Multiple queue hardware support PJ Waskiewicz
2007-06-04 21:40 ` [PATCH] NET: Multiqueue network device support PJ Waskiewicz
2007-06-05 11:50 ` jamal
2007-06-05 15:51 ` Waskiewicz Jr, Peter P
2007-06-05 22:28 ` jamal
2007-06-06 15:11 ` Patrick McHardy
2007-06-06 22:13 ` jamal
2007-06-06 22:30 ` Waskiewicz Jr, Peter P
2007-06-06 22:40 ` David Miller
2007-06-06 23:35 ` jamal
2007-06-06 23:56 ` David Miller
2007-06-07 16:08 ` Stephen Hemminger
2007-06-07 16:59 ` Waskiewicz Jr, Peter P
2007-06-11 12:08 ` Patrick McHardy
2007-06-07 22:04 ` jamal
2007-06-09 14:58 ` Leonid Grossman
2007-06-09 19:23 ` jamal
2007-06-09 21:23 ` Leonid Grossman
2007-06-09 22:14 ` Jeff Garzik
2007-06-10 3:02 ` jamal
2007-06-10 15:27 ` Leonid Grossman
2007-06-06 22:35 ` David Miller
2007-06-06 22:57 ` Waskiewicz Jr, Peter P
2007-06-06 23:00 ` David Miller
2007-06-06 23:14 ` Waskiewicz Jr, Peter P
2007-06-06 23:36 ` Jeff Garzik
2007-06-06 23:32 ` jamal
2007-06-06 23:48 ` Rick Jones
2007-06-06 23:54 ` jamal
2007-06-07 0:01 ` David Miller
2007-06-06 23:58 ` David Miller
2007-06-06 23:52 ` David Miller
2007-06-07 0:47 ` Jeff Garzik
2007-06-07 12:29 ` jamal
2007-06-07 15:03 ` Kok, Auke
2007-06-07 21:57 ` jamal
2007-06-07 22:06 ` Kok, Auke
2007-06-07 22:26 ` jamal
2007-06-07 22:30 ` Kok, Auke
2007-06-07 22:57 ` jamal
2007-06-07 22:44 ` David Miller
2007-06-07 22:54 ` jamal
2007-06-07 23:00 ` David Miller
2007-06-07 23:03 ` jamal
2007-06-08 0:31 ` Sridhar Samudrala
2007-06-08 1:35 ` jamal
2007-06-08 10:39 ` Herbert Xu
2007-06-08 11:34 ` jamal
2007-06-08 12:37 ` Herbert Xu
2007-06-08 13:12 ` jamal
2007-06-09 11:08 ` Herbert Xu
2007-06-09 14:36 ` jamal
2007-06-08 5:32 ` Krishna Kumar2
2007-06-08 19:55 ` Waskiewicz Jr, Peter P
2007-06-09 0:24 ` jamal
2007-06-07 22:55 ` Waskiewicz Jr, Peter P
2007-06-09 1:05 ` Ramkrishna Vepa
2007-06-06 23:53 ` David Miller
2007-06-07 1:08 ` jamal
2007-06-07 12:22 ` jamal
2007-06-11 12:01 ` Patrick McHardy
2007-06-11 11:58 ` Patrick McHardy
2007-06-11 12:23 ` jamal
2007-06-11 12:39 ` Patrick McHardy
2007-06-11 12:52 ` jamal
2007-06-11 13:03 ` Patrick McHardy
2007-06-11 13:29 ` jamal
2007-06-11 14:03 ` Patrick McHardy
2007-06-11 14:30 ` Cohen, Guy
2007-06-11 14:38 ` Patrick McHardy
2007-06-11 14:48 ` jamal
2007-06-11 15:00 ` Tomas Winkler
2007-06-11 15:14 ` jamal
2007-06-11 15:34 ` Cohen, Guy
2007-06-11 22:22 ` jamal
2007-06-12 14:04 ` Cohen, Guy
2007-06-12 15:23 ` jamal
2007-06-12 23:38 ` jamal
2007-06-11 14:40 ` jamal
2007-06-11 14:49 ` Patrick McHardy
2007-06-11 15:05 ` jamal
2007-06-11 15:12 ` Patrick McHardy
2007-06-11 15:25 ` jamal
2007-06-11 15:44 ` Patrick McHardy
2007-06-11 21:35 ` jamal
2007-06-11 23:01 ` Patrick McHardy
2007-06-12 0:58 ` Patrick McHardy
2007-06-12 2:29 ` jamal
2007-06-12 13:21 ` Patrick McHardy
2007-06-12 15:12 ` jamal
2007-06-12 21:02 ` David Miller
2007-06-12 21:13 ` Jeff Garzik
2007-06-12 21:17 ` Ben Greear
2007-06-12 21:26 ` David Miller
2007-06-12 21:46 ` Jeff Garzik
2007-06-12 21:52 ` Roland Dreier
2007-06-12 21:59 ` Jeff Garzik
2007-06-12 22:04 ` David Miller
2007-06-12 22:18 ` Jeff Garzik
2007-06-12 22:00 ` David Miller
2007-06-12 21:53 ` David Miller
2007-06-12 22:01 ` Jeff Garzik
2007-06-12 21:46 ` Ben Greear
2007-06-12 21:54 ` David Miller
2007-06-12 22:30 ` Jeff Garzik
2007-06-12 22:40 ` Ben Greear
2007-06-12 21:47 ` Jason Lunz
2007-06-12 21:55 ` David Miller
2007-06-12 22:17 ` Jason Lunz
2007-06-13 3:41 ` Leonid Grossman
2007-06-13 16:44 ` Rick Jones
2007-06-12 21:17 ` Patrick McHardy
2007-06-13 5:56 ` Zhu Yi
2007-06-13 11:34 ` Patrick McHardy
2007-06-14 1:51 ` Zhu Yi
2007-06-13 12:32 ` jamal
2007-06-13 13:12 ` Robert Olsson
2007-06-13 13:33 ` jamal
2007-06-13 15:01 ` Leonid Grossman
2007-06-13 15:53 ` Robert Olsson
2007-06-13 18:20 ` David Miller
2007-06-13 18:22 ` Waskiewicz Jr, Peter P
2007-06-13 21:30 ` jamal
2007-06-14 2:44 ` Zhu Yi
2007-06-14 11:48 ` jamal
2007-06-15 1:27 ` Zhu Yi
2007-06-15 10:49 ` jamal
2007-06-18 1:18 ` Zhu Yi
2007-06-18 15:16 ` jamal
2007-06-19 2:12 ` Zhu Yi
2007-06-19 16:04 ` jamal
2007-06-20 5:51 ` Zhu Yi
2007-06-21 15:39 ` jamal
2007-06-22 1:26 ` Zhu Yi
2007-06-25 16:47 ` jamal
2007-06-25 20:47 ` David Miller
2007-06-26 13:27 ` jamal
2007-06-26 20:57 ` David Miller
2007-06-27 22:32 ` jamal
2007-06-27 22:54 ` David Miller
2007-06-28 0:15 ` jamal
2007-06-28 0:31 ` David Miller
2007-06-12 9:19 ` Johannes Berg
2007-06-12 12:17 ` jamal
2007-06-11 17:36 ` Patrick McHardy
2007-06-11 18:05 ` Waskiewicz Jr, Peter P
2007-06-11 18:07 ` Patrick McHardy
2007-06-13 18:34 ` Waskiewicz Jr, Peter P
2007-06-11 17:52 ` Patrick McHardy
2007-06-11 17:57 ` Waskiewicz Jr, Peter P
2007-06-11 18:05 ` Patrick McHardy
2007-06-11 18:15 ` Waskiewicz Jr, Peter P
2007-06-11 18:24 ` Patrick McHardy