* [PATCH 2/3] NET: [UPDATED] Multiqueue network device support implementation.
2007-04-13 0:16 [PATCH 0/3] [UPDATED]: Multiqueue network device support Peter P Waskiewicz Jr
2007-04-13 0:19 ` [PATCH 1/3] NET: Multiqueue network device support documentation Peter P Waskiewicz Jr
@ 2007-04-13 0:19 ` Peter P Waskiewicz Jr
2007-04-13 0:15 ` Patrick McHardy
2007-04-13 0:19 ` [PATCH 3/3] NET: [e1000] Example implementation of multiqueue network device API Peter P Waskiewicz Jr
2 siblings, 1 reply; 7+ messages in thread
From: Peter P Waskiewicz Jr @ 2007-04-13 0:19 UTC (permalink / raw)
To: davem
Cc: netdev, linux-kernel, jgarzik, cramerj, auke-jan.h.kok,
christopher.leech
From: Peter P Waskiewicz Jr <peter.p.waskiewicz.jr@intel.com>
Update: Removed unnecessary whitespace changes. Reset skb->queue_mapping to
zero prior to enqueueing to a qdisc. Fixed the band2queue mapping algorithm for
the case where there are fewer bands than queues.
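To make the fixed mapping concrete, here is a minimal standalone sketch of the
band2queue computation as done in prio_tune() below (illustration only; the
band/queue counts in main() are made up):
/*
 * Standalone sketch of the band2queue computation in prio_tune().
 */
#include <stdio.h>

static void map_bands(int bands, int queue_count, unsigned short *band2queue)
{
	int i, queue = 0, offset = 0, mod, qmapoffset;

	if (bands < queue_count) {
		/* fewer bands than queues: one queue per band */
		qmapoffset = 1;
		mod = queue_count;
	} else {
		mod = bands % queue_count;
		qmapoffset = bands / queue_count + (mod ? 1 : 0);
	}

	for (i = 0; i < bands; i++) {
		band2queue[i] = queue;
		if (((i + 1) - offset) == qmapoffset) {
			/* this queue has its share of bands; move on */
			queue++;
			offset += qmapoffset;
			if (mod)
				mod--;
			qmapoffset = bands / queue_count + (mod ? 1 : 0);
		}
	}
}

int main(void)
{
	unsigned short map[16];
	int i;

	map_bands(3, 2, map);	/* e.g. 3 PRIO bands on a 2-queue NIC */
	for (i = 0; i < 3; i++)
		printf("band %d -> queue %d\n", i, map[i]);
	return 0;
}
This prints bands 0-1 mapped to queue 0 and band 2 to queue 1; with 2 bands on
a 4-queue device (the previously broken case) the mapping comes out 0, 1.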
Added an API and associated supporting routines for multiqueue network devices.
This allows network devices supporting multiple TX queues to configure each
queue within the netdevice and manage each queue independently. Changes to the
PRIO Qdisc also allow a user to map multiple flows to individual TX queues,
taking advantage of each queue on the device.
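To show the intended driver-side usage, here is a rough sketch (the foo_*
names, ring layout, and fullness test are hypothetical placeholders, not part
of this patch):
static int foo_xmit(struct sk_buff *skb, struct net_device *dev)
{
	struct foo_priv *priv = netdev_priv(dev);
	/* honor the queue the qdisc selected for this skb */
	struct foo_tx_ring *ring = &priv->tx_ring[skb->queue_mapping];

	foo_post_to_ring(ring, skb);		/* hypothetical */
	if (foo_ring_full(ring))		/* hypothetical */
		netif_stop_subqueue(dev, skb->queue_mapping);
	return NETDEV_TX_OK;
}

static void foo_tx_clean(struct net_device *dev, u16 queue)
{
	/* ... reclaim descriptors for this ring ... */
	if (netif_subqueue_stopped(dev, queue))
		netif_wake_subqueue(dev, queue);
}
The device itself is allocated with alloc_etherdev_mq() and advertises
NETIF_F_MULTI_QUEUE, as patch 3/3 demonstrates for e1000.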
Signed-off-by: Peter P. Waskiewicz Jr <peter.p.waskiewicz.jr@intel.com>
Signed-off-by: Auke Kok <auke-jan.h.kok@intel.com>
---
include/linux/etherdevice.h | 3 +-
include/linux/netdevice.h | 62 ++++++++++++++++++++++++++++++++++++++++++-
include/linux/skbuff.h | 2 +
net/core/dev.c | 27 +++++++++++++++----
net/core/skbuff.c | 3 ++
net/ethernet/eth.c | 9 +++---
net/sched/sch_generic.c | 3 +-
net/sched/sch_prio.c | 54 +++++++++++++++++++++++++++++++++----
8 files changed, 144 insertions(+), 19 deletions(-)
diff --git a/include/linux/etherdevice.h b/include/linux/etherdevice.h
index 745c988..446de39 100644
--- a/include/linux/etherdevice.h
+++ b/include/linux/etherdevice.h
@@ -39,7 +39,8 @@ extern void eth_header_cache_update(struct hh_cache *hh, struct net_device *dev
extern int eth_header_cache(struct neighbour *neigh,
struct hh_cache *hh);
-extern struct net_device *alloc_etherdev(int sizeof_priv);
+extern struct net_device *alloc_etherdev_mq(int sizeof_priv, int queue_count);
+#define alloc_etherdev(sizeof_priv) alloc_etherdev_mq(sizeof_priv, 1)
static inline void eth_copy_and_sum (struct sk_buff *dest,
const unsigned char *src,
int len, int base)
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 71fc8ff..f00b94a 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -106,6 +106,14 @@ struct netpoll_info;
#define MAX_HEADER (LL_MAX_HEADER + 48)
#endif
+struct net_device_subqueue
+{
+ /* Give a control state for each queue. This struct may contain
+ * per-queue locks in the future.
+ */
+ unsigned long state;
+};
+
/*
* Network device statistics. Akin to the 2.0 ether stats but
* with byte counters.
@@ -324,6 +332,7 @@ struct net_device
#define NETIF_F_GSO 2048 /* Enable software GSO. */
#define NETIF_F_LLTX 4096 /* LockLess TX */
#define NETIF_F_INTERNAL_STATS 8192 /* Use stats structure in net_device */
+#define NETIF_F_MULTI_QUEUE 16384 /* Has multiple TX/RX queues */
/* Segmentation offload features */
#define NETIF_F_GSO_SHIFT 16
@@ -534,6 +543,10 @@ struct net_device
struct device dev;
/* space for optional statistics and wireless sysfs groups */
struct attribute_group *sysfs_groups[3];
+
+ /* The TX queue control structures */
+ struct net_device_subqueue *egress_subqueue;
+ int egress_subqueue_count;
};
#define to_net_dev(d) container_of(d, struct net_device, dev)
@@ -675,6 +688,48 @@ static inline int netif_running(const struct net_device *dev)
return test_bit(__LINK_STATE_START, &dev->state);
}
+/*
+ * Routines to manage the subqueues on a device. We only need start,
+ * stop, and a check if it's stopped. All other device management is
+ * done at the overall netdevice level. There is also a test for
+ * whether the device is multiqueue.
+ */
+static inline void netif_start_subqueue(struct net_device *dev, u16 queue_index)
+{
+ clear_bit(__LINK_STATE_XOFF, &dev->egress_subqueue[queue_index].state);
+}
+
+static inline void netif_stop_subqueue(struct net_device *dev, u16 queue_index)
+{
+#ifdef CONFIG_NETPOLL_TRAP
+ if (netpoll_trap())
+ return;
+#endif
+ set_bit(__LINK_STATE_XOFF, &dev->egress_subqueue[queue_index].state);
+}
+
+static inline int netif_subqueue_stopped(const struct net_device *dev,
+ u16 queue_index)
+{
+ return test_bit(__LINK_STATE_XOFF,
+ &dev->egress_subqueue[queue_index].state);
+}
+
+static inline void netif_wake_subqueue(struct net_device *dev, u16 queue_index)
+{
+#ifdef CONFIG_NETPOLL_TRAP
+ if (netpoll_trap())
+ return;
+#endif
+ if (test_and_clear_bit(__LINK_STATE_XOFF,
+ &dev->egress_subqueue[queue_index].state))
+ __netif_schedule(dev);
+}
+
+static inline int netif_is_multiqueue(const struct net_device *dev)
+{
+ return (!!(NETIF_F_MULTI_QUEUE & dev->features));
+}
/* Use this variant when it is known for sure that it
* is executing from interrupt context.
@@ -968,8 +1023,11 @@ static inline void netif_tx_disable(struct net_device *dev)
extern void ether_setup(struct net_device *dev);
/* Support for loadable net-drivers */
-extern struct net_device *alloc_netdev(int sizeof_priv, const char *name,
- void (*setup)(struct net_device *));
+extern struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
+ void (*setup)(struct net_device *),
+ int queue_count);
+#define alloc_netdev(sizeof_priv, name, setup) \
+ alloc_netdev_mq(sizeof_priv, name, setup, 1)
extern int register_netdev(struct net_device *dev);
extern void unregister_netdev(struct net_device *dev);
/* Functions used for multicast support */
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 08fa5c8..96fd263 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -212,6 +212,7 @@ typedef unsigned char *sk_buff_data_t;
* @pkt_type: Packet class
* @fclone: skbuff clone status
* @ip_summed: Driver fed us an IP checksum
+ * @queue_mapping: Queue mapping for multiqueue devices
* @priority: Packet queueing priority
* @users: User count - see {datagram,tcp}.c
* @protocol: Packet protocol from driver
@@ -264,6 +265,7 @@ struct sk_buff {
__wsum csum;
__u32 csum_offset;
};
+ __u16 queue_mapping;
__u32 priority;
__u8 local_df:1,
cloned:1,
diff --git a/net/core/dev.c b/net/core/dev.c
index 219a57f..3ce449e 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -1471,6 +1471,8 @@ gso:
q = dev->qdisc;
if (q->enqueue) {
+ /* reset queue_mapping to zero prior to enqueue */
+ skb->queue_mapping = 0;
rc = q->enqueue(skb, q);
qdisc_run(dev);
spin_unlock(&dev->queue_lock);
@@ -3292,16 +3294,18 @@ static struct net_device_stats *maybe_internal_stats(struct net_device *dev)
}
/**
- * alloc_netdev - allocate network device
+ * alloc_netdev_mq - allocate network device
* @sizeof_priv: size of private data to allocate space for
* @name: device name format string
* @setup: callback to initialize device
+ * @queue_count: the number of subqueues to allocate
*
* Allocates a struct net_device with private data area for driver use
- * and performs basic initialization.
+ * and performs basic initialization. Also allocates subqueue structs
+ * for each queue on the device.
*/
-struct net_device *alloc_netdev(int sizeof_priv, const char *name,
- void (*setup)(struct net_device *))
+struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
+ void (*setup)(struct net_device *), int queue_count)
{
void *p;
struct net_device *dev;
@@ -3326,12 +3330,23 @@ struct net_device *alloc_netdev(int sizeof_priv, const char *name,
if (sizeof_priv)
dev->priv = netdev_priv(dev);
+ alloc_size = (sizeof(struct net_device_subqueue) * queue_count);
+
+ p = kzalloc(alloc_size, GFP_KERNEL);
+ if (!p) {
+ printk(KERN_ERR "alloc_netdev_mq: Unable to allocate queues.\n");
+ kfree((char *)dev - dev->padded);
+ return NULL;
+ }
+
+ dev->egress_subqueue = p;
+ dev->egress_subqueue_count = queue_count;
+
dev->get_stats = maybe_internal_stats;
setup(dev);
strcpy(dev->name, name);
return dev;
}
-EXPORT_SYMBOL(alloc_netdev);
+EXPORT_SYMBOL(alloc_netdev_mq);
/**
* free_netdev - free network device
@@ -3345,6 +3360,7 @@ void free_netdev(struct net_device *dev)
{
#ifdef CONFIG_SYSFS
/* Compatibility with error handling in drivers */
+ kfree(dev->egress_subqueue);
if (dev->reg_state == NETREG_UNINITIALIZED) {
kfree((char *)dev - dev->padded);
return;
@@ -3356,6 +3372,7 @@ void free_netdev(struct net_device *dev)
/* will free via device release */
put_device(&dev->dev);
#else
+ kfree(dev->egress_subqueue);
kfree((char *)dev - dev->padded);
#endif
}
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index e60864e..642aab9 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -474,6 +474,7 @@ struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t gfp_mask)
n->nohdr = 0;
C(pkt_type);
C(ip_summed);
+ C(queue_mapping);
C(priority);
#if defined(CONFIG_IP_VS) || defined(CONFIG_IP_VS_MODULE)
C(ipvs_property);
@@ -516,6 +517,7 @@ static void copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
#endif
new->sk = NULL;
new->dev = old->dev;
+ new->queue_mapping = old->queue_mapping;
new->priority = old->priority;
new->protocol = old->protocol;
new->dst = dst_clone(old->dst);
@@ -1974,6 +1976,7 @@ struct sk_buff *skb_segment(struct sk_buff *skb, int features)
tail = nskb;
nskb->dev = skb->dev;
+ nskb->queue_mapping = skb->queue_mapping;
nskb->priority = skb->priority;
nskb->protocol = skb->protocol;
nskb->dst = dst_clone(skb->dst);
diff --git a/net/ethernet/eth.c b/net/ethernet/eth.c
index 0ac2524..87a509c 100644
--- a/net/ethernet/eth.c
+++ b/net/ethernet/eth.c
@@ -316,9 +316,10 @@ void ether_setup(struct net_device *dev)
EXPORT_SYMBOL(ether_setup);
/**
- * alloc_etherdev - Allocates and sets up an Ethernet device
+ * alloc_etherdev_mq - Allocates and sets up an Ethernet device
* @sizeof_priv: Size of additional driver-private structure to be allocated
* for this Ethernet device
+ * @queue_count: The number of queues this device has.
*
* Fill in the fields of the device structure with Ethernet-generic
* values. Basically does everything except registering the device.
@@ -328,8 +329,8 @@ EXPORT_SYMBOL(ether_setup);
* this private data area.
*/
-struct net_device *alloc_etherdev(int sizeof_priv)
+struct net_device *alloc_etherdev_mq(int sizeof_priv, int queue_count)
{
- return alloc_netdev(sizeof_priv, "eth%d", ether_setup);
+ return alloc_netdev_mq(sizeof_priv, "eth%d", ether_setup, queue_count);
}
-EXPORT_SYMBOL(alloc_etherdev);
+EXPORT_SYMBOL(alloc_etherdev_mq);
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index 52eb343..caf16bb 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -133,7 +133,8 @@ static inline int qdisc_restart(struct net_device *dev)
/* And release queue */
spin_unlock(&dev->queue_lock);
- if (!netif_queue_stopped(dev)) {
+ if (!netif_queue_stopped(dev) &&
+ !netif_subqueue_stopped(dev, skb->queue_mapping)) {
int ret;
ret = dev_hard_start_xmit(skb, dev);
diff --git a/net/sched/sch_prio.c b/net/sched/sch_prio.c
index 5cfe60b..6a38905 100644
--- a/net/sched/sch_prio.c
+++ b/net/sched/sch_prio.c
@@ -43,6 +43,7 @@ struct prio_sched_data
struct tcf_proto *filter_list;
u8 prio2band[TC_PRIO_MAX+1];
struct Qdisc *queues[TCQ_PRIO_BANDS];
+ u16 band2queue[TC_PRIO_MAX + 1];
};
@@ -70,13 +71,20 @@ prio_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr)
#endif
if (TC_H_MAJ(band))
band = 0;
+ skb->queue_mapping =
+ q->band2queue[q->prio2band[band&TC_PRIO_MAX]];
+
return q->queues[q->prio2band[band&TC_PRIO_MAX]];
}
band = res.classid;
}
band = TC_H_MIN(band) - 1;
- if (band > q->bands)
+ if (band > q->bands) {
+ skb->queue_mapping = q->band2queue[q->prio2band[0]];
return q->queues[q->prio2band[0]];
+ }
+
+ skb->queue_mapping = q->band2queue[band];
return q->queues[band];
}
@@ -144,11 +152,17 @@ prio_dequeue(struct Qdisc* sch)
struct Qdisc *qdisc;
for (prio = 0; prio < q->bands; prio++) {
- qdisc = q->queues[prio];
- skb = qdisc->dequeue(qdisc);
- if (skb) {
- sch->q.qlen--;
- return skb;
+ /* Check if the target subqueue is available before
+ * pulling an skb. This way we avoid excessive requeues
+ * for slower queues.
+ */
+ if (!netif_subqueue_stopped(sch->dev, q->band2queue[prio])) {
+ qdisc = q->queues[prio];
+ skb = qdisc->dequeue(qdisc);
+ if (skb) {
+ sch->q.qlen--;
+ return skb;
+ }
}
}
return NULL;
@@ -200,6 +214,10 @@ static int prio_tune(struct Qdisc *sch, struct rtattr *opt)
struct prio_sched_data *q = qdisc_priv(sch);
struct tc_prio_qopt *qopt = RTA_DATA(opt);
int i;
+ int queue;
+ int qmapoffset;
+ int offset;
+ int mod;
if (opt->rta_len < RTA_LENGTH(sizeof(*qopt)))
return -EINVAL;
@@ -242,6 +260,30 @@ static int prio_tune(struct Qdisc *sch, struct rtattr *opt)
}
}
}
+ /* setup queue to band mapping */
+ if (q->bands < sch->dev->egress_subqueue_count) {
+ qmapoffset = 1;
+ mod = sch->dev->egress_subqueue_count;
+ } else {
+ mod = q->bands % sch->dev->egress_subqueue_count;
+ qmapoffset = q->bands / sch->dev->egress_subqueue_count +
+ ((mod) ? 1 : 0);
+ }
+
+ queue = 0;
+ offset = 0;
+ for (i = 0; i < q->bands; i++) {
+ q->band2queue[i] = queue;
+ if ( ((i + 1) - offset) == qmapoffset) {
+ queue++;
+ offset += qmapoffset;
+ if (mod)
+ mod--;
+ qmapoffset = q->bands /
+ sch->dev->egress_subqueue_count +
+ ((mod) ? 1 : 0);
+ }
+ }
return 0;
}
* [PATCH 3/3] NET: [e1000] Example implementation of multiqueue network device API
2007-04-13 0:16 [PATCH 0/3] [UPDATED]: Multiqueue network device support Peter P Waskiewicz Jr
2007-04-13 0:19 ` [PATCH 1/3] NET: Multiqueue network device support documentation Peter P Waskiewicz Jr
2007-04-13 0:19 ` [PATCH 2/3] NET: [UPDATED] Multiqueue network device support implementation Peter P Waskiewicz Jr
@ 2007-04-13 0:19 ` Peter P Waskiewicz Jr
2 siblings, 0 replies; 7+ messages in thread
From: Peter P Waskiewicz Jr @ 2007-04-13 0:19 UTC (permalink / raw)
To: davem
Cc: netdev, linux-kernel, jgarzik, cramerj, auke-jan.h.kok,
christopher.leech
From: Peter P Waskiewicz Jr <peter.p.waskiewicz.jr@intel.com>
This patch is *not* intended to be integrated into any tree. It fulfills a
request to demonstrate the proposed multiqueue network device API in a driver;
the necessary updates to the e1000 driver will come in a more official release.
This is an as-is patch against this version of e1000 and should be used for
testing purposes only.
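Note that in the diff below, e1000_xmit_frame() selects its ring directly from
skb->queue_mapping, while the per-CPU table built by e1000_setup_queue_mapping()
is populated but not consumed in the transmit path shown here. A fragment along
these lines (an assumption, not code from this patch) is one way such a table
could be used when no qdisc has assigned a mapping:
struct e1000_tx_ring *tx_ring;

if (netif_is_multiqueue(netdev)) {
	/* queue chosen by the qdisc (e.g. multiqueue-aware PRIO) */
	tx_ring = &adapter->tx_ring[skb->queue_mapping];
} else {
	/* otherwise spread load by the submitting CPU */
	int cpu = get_cpu();

	tx_ring = *per_cpu_ptr(adapter->cpu_tx_ring, cpu);
	put_cpu();
}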
Signed-off-by: Peter P. Waskiewicz Jr <peter.p.waskiewicz.jr@intel.com>
---
drivers/net/e1000/e1000.h | 8 ++
drivers/net/e1000/e1000_ethtool.c | 47 ++++++++++-
drivers/net/e1000/e1000_main.c | 164 ++++++++++++++++++++++++++++++++-----
3 files changed, 194 insertions(+), 25 deletions(-)
diff --git a/drivers/net/e1000/e1000.h b/drivers/net/e1000/e1000.h
index dd4b728..15e484e 100644
--- a/drivers/net/e1000/e1000.h
+++ b/drivers/net/e1000/e1000.h
@@ -168,6 +168,10 @@ struct e1000_buffer {
uint16_t next_to_watch;
};
+struct e1000_queue_stats {
+ u64 packets;
+ u64 bytes;
+};
struct e1000_ps_page { struct page *ps_page[PS_PAGE_BUFFERS]; };
struct e1000_ps_page_dma { uint64_t ps_page_dma[PS_PAGE_BUFFERS]; };
@@ -188,9 +192,11 @@ struct e1000_tx_ring {
/* array of buffer information structs */
struct e1000_buffer *buffer_info;
+ spinlock_t tx_queue_lock;
spinlock_t tx_lock;
uint16_t tdh;
uint16_t tdt;
+ struct e1000_queue_stats tx_stats;
boolean_t last_tx_tso;
};
@@ -218,6 +224,7 @@ struct e1000_rx_ring {
uint16_t rdh;
uint16_t rdt;
+ struct e1000_queue_stats rx_stats;
};
#define E1000_DESC_UNUSED(R) \
@@ -271,6 +278,7 @@ struct e1000_adapter {
/* TX */
struct e1000_tx_ring *tx_ring; /* One per active queue */
+ struct e1000_tx_ring **cpu_tx_ring;
unsigned int restart_queue;
unsigned long tx_queue_len;
uint32_t txd_cmd;
diff --git a/drivers/net/e1000/e1000_ethtool.c b/drivers/net/e1000/e1000_ethtool.c
index 6777887..fd466a1 100644
--- a/drivers/net/e1000/e1000_ethtool.c
+++ b/drivers/net/e1000/e1000_ethtool.c
@@ -105,7 +105,12 @@ static const struct e1000_stats e1000_gstrings_stats[] = {
{ "dropped_smbus", E1000_STAT(stats.mgpdc) },
};
-#define E1000_QUEUE_STATS_LEN 0
+#define E1000_QUEUE_STATS_LEN \
+ ((((((struct e1000_adapter *)netdev->priv)->num_rx_queues > 1) ? \
+ ((struct e1000_adapter *)netdev->priv)->num_rx_queues : 0 ) + \
+ (((((struct e1000_adapter *)netdev->priv)->num_tx_queues > 1) ? \
+ ((struct e1000_adapter *)netdev->priv)->num_tx_queues : 0 ))) * \
+ (sizeof(struct e1000_queue_stats) / sizeof(u64)))
#define E1000_GLOBAL_STATS_LEN \
sizeof(e1000_gstrings_stats) / sizeof(struct e1000_stats)
#define E1000_STATS_LEN (E1000_GLOBAL_STATS_LEN + E1000_QUEUE_STATS_LEN)
@@ -693,8 +698,10 @@ e1000_set_ringparam(struct net_device *netdev,
E1000_MAX_TXD : E1000_MAX_82544_TXD));
E1000_ROUNDUP(txdr->count, REQ_TX_DESCRIPTOR_MULTIPLE);
- for (i = 0; i < adapter->num_tx_queues; i++)
+ for (i = 0; i < adapter->num_tx_queues; i++) {
txdr[i].count = txdr->count;
+ spin_lock_init(&adapter->tx_ring[i].tx_queue_lock);
+ }
for (i = 0; i < adapter->num_rx_queues; i++)
rxdr[i].count = rxdr->count;
@@ -1909,6 +1916,9 @@ e1000_get_ethtool_stats(struct net_device *netdev,
struct ethtool_stats *stats, uint64_t *data)
{
struct e1000_adapter *adapter = netdev_priv(netdev);
+ u64 *queue_stat;
+ int stat_count = sizeof(struct e1000_queue_stats) / sizeof(u64);
+ int j, k;
int i;
e1000_update_stats(adapter);
@@ -1917,12 +1927,29 @@ e1000_get_ethtool_stats(struct net_device *netdev,
data[i] = (e1000_gstrings_stats[i].sizeof_stat ==
sizeof(uint64_t)) ? *(uint64_t *)p : *(uint32_t *)p;
}
+ if (adapter->num_tx_queues > 1) {
+ for (j = 0; j < adapter->num_tx_queues; j++) {
+ queue_stat = (u64 *)&adapter->tx_ring[j].tx_stats;
+ for (k = 0; k < stat_count; k++)
+ data[i + k] = queue_stat[k];
+ i += k;
+ }
+ }
+ if (adapter->num_rx_queues > 1) {
+ for (j = 0; j < adapter->num_rx_queues; j++) {
+ queue_stat = (u64 *)&adapter->rx_ring[j].rx_stats;
+ for (k = 0; k < stat_count; k++)
+ data[i + k] = queue_stat[k];
+ i += k;
+ }
+ }
/* BUG_ON(i != E1000_STATS_LEN); */
}
static void
e1000_get_strings(struct net_device *netdev, uint32_t stringset, uint8_t *data)
{
+ struct e1000_adapter *adapter = netdev_priv(netdev);
uint8_t *p = data;
int i;
@@ -1937,6 +1964,22 @@ e1000_get_strings(struct net_device *netdev, uint32_t stringset, uint8_t *data)
ETH_GSTRING_LEN);
p += ETH_GSTRING_LEN;
}
+ if (adapter->num_tx_queues > 1) {
+ for (i = 0; i < adapter->num_tx_queues; i++) {
+ sprintf(p, "tx_queue_%u_packets", i);
+ p += ETH_GSTRING_LEN;
+ sprintf(p, "tx_queue_%u_bytes", i);
+ p += ETH_GSTRING_LEN;
+ }
+ }
+ if (adapter->num_rx_queues > 1) {
+ for (i = 0; i < adapter->num_rx_queues; i++) {
+ sprintf(p, "rx_queue_%u_packets", i);
+ p += ETH_GSTRING_LEN;
+ sprintf(p, "rx_queue_%u_bytes", i);
+ p += ETH_GSTRING_LEN;
+ }
+ }
/* BUG_ON(p - data != E1000_STATS_LEN * ETH_GSTRING_LEN); */
break;
}
diff --git a/drivers/net/e1000/e1000_main.c b/drivers/net/e1000/e1000_main.c
index 913db0c..4753674 100644
--- a/drivers/net/e1000/e1000_main.c
+++ b/drivers/net/e1000/e1000_main.c
@@ -29,6 +29,9 @@
#include "e1000.h"
#include <net/ip6_checksum.h>
+#include <linux/cpu.h>
+#include <linux/smp.h>
+
char e1000_driver_name[] = "e1000";
static char e1000_driver_string[] = "Intel(R) PRO/1000 Network Driver";
#ifndef CONFIG_E1000_NAPI
@@ -137,6 +140,7 @@ static void e1000_exit_module(void);
static int e1000_probe(struct pci_dev *pdev, const struct pci_device_id *ent);
static void __devexit e1000_remove(struct pci_dev *pdev);
static int e1000_alloc_queues(struct e1000_adapter *adapter);
+static void e1000_setup_queue_mapping(struct e1000_adapter *adapter);
static int e1000_sw_init(struct e1000_adapter *adapter);
static int e1000_open(struct net_device *netdev);
static int e1000_close(struct net_device *netdev);
@@ -547,6 +551,8 @@ e1000_up(struct e1000_adapter *adapter)
E1000_DESC_UNUSED(ring));
}
+ e1000_setup_queue_mapping(adapter);
+
adapter->tx_queue_len = netdev->tx_queue_len;
#ifdef CONFIG_E1000_NAPI
@@ -900,7 +906,7 @@ e1000_probe(struct pci_dev *pdev,
pci_set_master(pdev);
err = -ENOMEM;
- netdev = alloc_etherdev(sizeof(struct e1000_adapter));
+ netdev = alloc_etherdev_mq(sizeof(struct e1000_adapter), 2);
if (!netdev)
goto err_alloc_etherdev;
@@ -1001,6 +1007,8 @@ e1000_probe(struct pci_dev *pdev,
netdev->features |= NETIF_F_LLTX;
+ netdev->features |= NETIF_F_MULTI_QUEUE;
+
adapter->en_mng_pt = e1000_enable_mng_pass_thru(&adapter->hw);
/* initialize eeprom parameters */
@@ -1317,8 +1325,8 @@ e1000_sw_init(struct e1000_adapter *adapter)
hw->master_slave = E1000_MASTER_SLAVE;
}
- adapter->num_tx_queues = 1;
- adapter->num_rx_queues = 1;
+ adapter->num_tx_queues = 2;
+ adapter->num_rx_queues = 2;
if (e1000_alloc_queues(adapter)) {
DPRINTK(PROBE, ERR, "Unable to allocate memory for queues\n");
@@ -1334,6 +1342,8 @@ e1000_sw_init(struct e1000_adapter *adapter)
set_bit(__LINK_STATE_START, &adapter->polling_netdev[i].state);
}
spin_lock_init(&adapter->tx_queue_lock);
+ for (i = 0; i < adapter->num_tx_queues; i++)
+ spin_lock_init(&adapter->tx_ring[i].tx_queue_lock);
#endif
atomic_set(&adapter->irq_sem, 1);
@@ -1382,10 +1392,26 @@ e1000_alloc_queues(struct e1000_adapter *adapter)
}
memset(adapter->polling_netdev, 0, size);
#endif
+ adapter->cpu_tx_ring = alloc_percpu(struct e1000_tx_ring *);
return E1000_SUCCESS;
}
+static void
+e1000_setup_queue_mapping(struct e1000_adapter *adapter)
+{
+ int i, cpu;
+
+ lock_cpu_hotplug();
+ i = 0;
+ for_each_online_cpu(cpu) {
+ *per_cpu_ptr(adapter->cpu_tx_ring, cpu) =
+ &adapter->tx_ring[i % adapter->num_tx_queues];
+ i++;
+ }
+ unlock_cpu_hotplug();
+}
+
/**
* e1000_open - Called when a network interface is made active
* @netdev: network interface device structure
@@ -1640,7 +1666,17 @@ e1000_configure_tx(struct e1000_adapter *adapter)
/* Setup the HW Tx Head and Tail descriptor pointers */
switch (adapter->num_tx_queues) {
- case 1:
+ case 2:
+ tdba = adapter->tx_ring[1].dma;
+ tdlen = adapter->tx_ring[1].count *
+ sizeof(struct e1000_tx_desc);
+ E1000_WRITE_REG(hw, TDLEN1, tdlen);
+ E1000_WRITE_REG(hw, TDBAH1, (tdba >> 32));
+ E1000_WRITE_REG(hw, TDBAL1, (tdba & 0x00000000ffffffffULL));
+ E1000_WRITE_REG(hw, TDT1, 0);
+ E1000_WRITE_REG(hw, TDH1, 0);
+ adapter->tx_ring[1].tdh = ((hw->mac_type >= e1000_82543) ? E1000_TDH1 : E1000_82542_TDH1);
+ adapter->tx_ring[1].tdt = ((hw->mac_type >= e1000_82543) ? E1000_TDT1 : E1000_82542_TDT1);
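+ /* fall through */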
default:
tdba = adapter->tx_ring[0].dma;
tdlen = adapter->tx_ring[0].count *
@@ -2043,8 +2079,7 @@ e1000_configure_rx(struct e1000_adapter *adapter)
/* Setup the HW Rx Head and Tail Descriptor Pointers and
* the Base and Length of the Rx Descriptor Ring */
switch (adapter->num_rx_queues) {
- case 1:
- default:
+ case 2:
+ rdba = adapter->rx_ring[1].dma;
+ E1000_WRITE_REG(hw, RDLEN1, rdlen);
+ E1000_WRITE_REG(hw, RDBAH1, (rdba >> 32));
+ E1000_WRITE_REG(hw, RDBAL1, (rdba & 0x00000000ffffffffULL));
+ E1000_WRITE_REG(hw, RDT1, 0);
+ E1000_WRITE_REG(hw, RDH1, 0);
+ adapter->rx_ring[1].rdh = ((hw->mac_type >= e1000_82543) ? E1000_RDH1 : E1000_82542_RDH1);
+ adapter->rx_ring[1].rdt = ((hw->mac_type >= e1000_82543) ? E1000_RDT1 : E1000_82542_RDT1);
+ /* fall through */
+ default:
rdba = adapter->rx_ring[0].dma;
E1000_WRITE_REG(hw, RDLEN, rdlen);
E1000_WRITE_REG(hw, RDBAH, (rdba >> 32));
E1000_WRITE_REG(hw, RDBAL, (rdba & 0x00000000ffffffffULL));
E1000_WRITE_REG(hw, RDT, 0);
E1000_WRITE_REG(hw, RDH, 0);
adapter->rx_ring[0].rdh = ((hw->mac_type >= e1000_82543) ? E1000_RDH : E1000_82542_RDH);
adapter->rx_ring[0].rdt = ((hw->mac_type >= e1000_82543) ? E1000_RDT : E1000_82542_RDT);
break;
}
- /* Enable 82543 Receive Checksum Offload for TCP and UDP */
- if (hw->mac_type >= e1000_82543) {
+ if (adapter->num_rx_queues > 1) {
+ u32 random[10];
+ u32 reta, mrqc;
+ int i;
+
+ get_random_bytes(&random[0], 40);
+
+ reta = 0x00800080;
+ mrqc = E1000_MRQC_ENABLE_RSS_2Q;
+ /* Fill out redirection table */
+ for (i = 0; i < 32; i++)
+ E1000_WRITE_REG_ARRAY(hw, RETA, i, reta);
+ /* Fill out hash function seeds */
+ for (i = 0; i < 10; i++)
+ E1000_WRITE_REG_ARRAY(hw, RSSRK, i, random[i]);
+
+ mrqc |= (E1000_MRQC_RSS_FIELD_IPV4 |
+ E1000_MRQC_RSS_FIELD_IPV4_TCP);
+ E1000_WRITE_REG(hw, MRQC, mrqc);
+
+ /* Multiqueue and packet checksumming are mutually exclusive. */
+ rxcsum = E1000_READ_REG(hw, RXCSUM);
+ rxcsum |= E1000_RXCSUM_PCSD;
+ E1000_WRITE_REG(hw, RXCSUM, rxcsum);
+ } else if (hw->mac_type >= e1000_82543) {
+ /* Enable 82543 Receive Checksum Offload for TCP and UDP */
rxcsum = E1000_READ_REG(hw, RXCSUM);
if (adapter->rx_csum == TRUE) {
rxcsum |= E1000_RXCSUM_TUOFL;
@@ -2555,6 +2624,7 @@ e1000_watchdog(unsigned long data)
struct e1000_tx_ring *txdr = adapter->tx_ring;
uint32_t link, tctl;
int32_t ret_val;
+ int i;
ret_val = e1000_check_for_link(&adapter->hw);
if ((ret_val == E1000_ERR_PHY) &&
@@ -2652,6 +2722,8 @@ e1000_watchdog(unsigned long data)
netif_carrier_on(netdev);
netif_wake_queue(netdev);
+ for (i = 0; i < adapter->num_tx_queues; i++)
+ netif_wake_subqueue(netdev, i);
mod_timer(&adapter->phy_info_timer, jiffies + 2 * HZ);
adapter->smartspeed = 0;
} else {
@@ -3266,7 +3338,7 @@ e1000_xmit_frame(struct sk_buff *skb, struct net_device *netdev)
* to a flow. Right now, performance is impacted slightly negatively
* if using multiple tx queues. If the stack breaks away from a
* single qdisc implementation, we can look at this again. */
- tx_ring = adapter->tx_ring;
+ tx_ring = &adapter->tx_ring[skb->queue_mapping];
if (unlikely(skb->len <= 0)) {
dev_kfree_skb_any(skb);
@@ -3751,7 +3823,8 @@ e1000_intr_msi(int irq, void *data)
struct e1000_adapter *adapter = netdev_priv(netdev);
struct e1000_hw *hw = &adapter->hw;
#ifndef CONFIG_E1000_NAPI
- int i;
+ int i, j;
+ int rx_cleaned, tx_cleaned;
#endif
uint32_t icr = E1000_READ_REG(hw, ICR);
@@ -3791,10 +3864,20 @@ e1000_intr_msi(int irq, void *data)
adapter->total_tx_packets = 0;
adapter->total_rx_packets = 0;
- for (i = 0; i < E1000_MAX_INTR; i++)
- if (unlikely(!adapter->clean_rx(adapter, adapter->rx_ring) &
- e1000_clean_tx_irq(adapter, adapter->tx_ring)))
+ for (i = 0; i < E1000_MAX_INTR; i++) {
+ rx_cleaned = 0;
+ for (j = 0; j < adapter->num_rx_queues; j++)
+ rx_cleaned |= adapter->clean_rx(adapter,
+ &adapter->rx_ring[j]);
+
+ tx_cleaned = 0;
+ for (j = 0 ; j < adapter->num_tx_queues ; j++)
+ tx_cleaned |= e1000_clean_tx_irq(adapter,
+ &adapter->tx_ring[j]);
+
+ if (!rx_cleaned & tx_cleaned)
break;
+ }
if (likely(adapter->itr_setting & 3))
e1000_set_itr(adapter);
@@ -3818,7 +3901,7 @@ e1000_intr(int irq, void *data)
struct e1000_hw *hw = &adapter->hw;
uint32_t rctl, icr = E1000_READ_REG(hw, ICR);
#ifndef CONFIG_E1000_NAPI
- int i;
+ int i, j, rx_cleaned, tx_cleaned;
#endif
if (unlikely(!icr))
return IRQ_NONE; /* Not our interrupt */
@@ -3894,10 +3977,20 @@ e1000_intr(int irq, void *data)
adapter->total_tx_packets = 0;
adapter->total_rx_packets = 0;
- for (i = 0; i < E1000_MAX_INTR; i++)
- if (unlikely(!adapter->clean_rx(adapter, adapter->rx_ring) &
- e1000_clean_tx_irq(adapter, adapter->tx_ring)))
+ for (i = 0; i < E1000_MAX_INTR; i++) {
+ rx_cleaned = 0;
+ for (j = 0; j < adapter->num_rx_queues; j++)
+ rx_cleaned |= adapter->clean_rx(adapter,
+ &adapter->rx_ring[j]);
+
+ tx_cleaned = 0;
+ for (j = 0 ; j < adapter->num_tx_queues ; j++)
+ tx_cleaned |= e1000_clean_tx_irq(adapter,
+ &adapter->tx_ring[j]);
+
+ if (!rx_cleaned & tx_cleaned)
break;
+ }
if (likely(adapter->itr_setting & 3))
e1000_set_itr(adapter);
@@ -3920,7 +4013,8 @@ e1000_clean(struct net_device *poll_dev, int *budget)
{
struct e1000_adapter *adapter;
int work_to_do = min(*budget, poll_dev->quota);
- int tx_cleaned = 0, work_done = 0;
+ int tx_cleaned = 1, work_done = 0;
+ int i;
/* Must NOT use netdev_priv macro here. */
adapter = poll_dev->priv;
@@ -3933,14 +4027,29 @@ e1000_clean(struct net_device *poll_dev, int *budget)
* tx_ring[0] from being cleaned by multiple cpus
* simultaneously. A failure obtaining the lock means
* tx_ring[0] is currently being cleaned anyway. */
- if (spin_trylock(&adapter->tx_queue_lock)) {
+ for (i = 0; i < adapter->num_tx_queues; i++) {
+ if (spin_trylock(&adapter->tx_ring[i].tx_queue_lock)) {
+ tx_cleaned &= e1000_clean_tx_irq(adapter,
+ &adapter->tx_ring[i]);
+ spin_unlock(&adapter->tx_ring[i].tx_queue_lock);
+ }
+ }
+ if (adapter->num_tx_queues == 1 &&
+ spin_trylock(&adapter->tx_queue_lock)) {
tx_cleaned = e1000_clean_tx_irq(adapter,
&adapter->tx_ring[0]);
spin_unlock(&adapter->tx_queue_lock);
}
- adapter->clean_rx(adapter, &adapter->rx_ring[0],
- &work_done, work_to_do);
+ for (i = 0; i < adapter->num_rx_queues; i++) {
+ /* XXX if the number of queues was limited to a power of two
+ * this would not need a div */
+ adapter->clean_rx(adapter, &adapter->rx_ring[i],
+ &work_done,
+ work_to_do / adapter->num_rx_queues);
+ }
*budget -= work_done;
poll_dev->quota -= work_done;
@@ -3989,6 +4098,8 @@ e1000_clean_tx_irq(struct e1000_adapter *adapter,
buffer_info = &tx_ring->buffer_info[i];
cleaned = (i == eop);
+ tx_ring->tx_stats.bytes += buffer_info->length;
+
if (cleaned) {
struct sk_buff *skb = buffer_info->skb;
unsigned int segs, bytecount;
@@ -4005,6 +4116,8 @@ e1000_clean_tx_irq(struct e1000_adapter *adapter,
if (unlikely(++i == tx_ring->count)) i = 0;
}
+ tx_ring->tx_stats.packets++;
+
eop = tx_ring->buffer_info[i].next_to_watch;
eop_desc = E1000_TX_DESC(*tx_ring, eop);
#ifdef CONFIG_E1000_NAPI
@@ -4266,6 +4379,8 @@ e1000_clean_rx_irq(struct e1000_adapter *adapter,
}
#endif /* CONFIG_E1000_NAPI */
netdev->last_rx = jiffies;
+ rx_ring->rx_stats.packets++;
+ rx_ring->rx_stats.bytes += length;
next_desc:
rx_desc->status = 0;
@@ -5222,12 +5337,15 @@ static void
e1000_netpoll(struct net_device *netdev)
{
struct e1000_adapter *adapter = netdev_priv(netdev);
+ int i;
disable_irq(adapter->pdev->irq);
e1000_intr(adapter->pdev->irq, netdev);
- e1000_clean_tx_irq(adapter, adapter->tx_ring);
+ for (i = 0; i < adapter->num_tx_queues; i++)
+ e1000_clean_tx_irq(adapter, &adapter->tx_ring[i]);
#ifndef CONFIG_E1000_NAPI
- adapter->clean_rx(adapter, adapter->rx_ring);
+ for (i = 0; i < adapter->num_rx_queues; i++)
+ adapter->clean_rx(adapter, &adapter->rx_ring[i]);
#endif
enable_irq(adapter->pdev->irq);
}