From: Changli Gao <xiaosuo@gmail.com>
To: "David S. Miller" <davem@davemloft.net>
Cc: netdev@vger.kernel.org, xiaosuo <xiaosuo@gmail.com>,
Tom Herbert <therbert@google.com>
Subject: [PATCH] ifb: add multi-queue support
Date: Tue, 10 Nov 2009 16:30:29 +0800 [thread overview]
Message-ID: <4AF924A5.1050303@gmail.com> (raw)
ifb: add multi-queue support
Add multi-queue support: one kernel thread is created per queue.
It can be used to emulate a multi-queue NIC in software and to distribute
work among CPUs.
gentux linux # modprobe ifb numtxqs=2
gentux linux # ifconfig ifb0 up
gentux linux # pgrep ifb0
18508
18509
gentux linux # taskset -p 1 18508
pid 18508's current affinity mask: 3
pid 18508's new affinity mask: 1
gentux linux # taskset -p 2 18509
pid 18509's current affinity mask: 3
pid 18509's new affinity mask: 2
gentux linux # tc qdisc add dev br0 ingress
gentux linux # tc filter add dev br0 parent ffff: protocol ip basic action mirred egress redirect dev ifb0
Signed-off-by: Changli Gao <xiaosuo@gmail.com>
----
drivers/net/ifb.c | 309 ++++++++++++++++++++++++++++++++----------------------
1 file changed, 186 insertions(+), 123 deletions(-)
diff --git a/drivers/net/ifb.c b/drivers/net/ifb.c
index 030913f..6e04188 100644
--- a/drivers/net/ifb.c
+++ b/drivers/net/ifb.c
@@ -33,139 +33,101 @@
#include <linux/etherdevice.h>
#include <linux/init.h>
#include <linux/moduleparam.h>
+#include <linux/wait.h>
+#include <linux/sched.h>
+#include <linux/kthread.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+#include <net/ip.h>
#include <net/pkt_sched.h>
#include <net/net_namespace.h>
-#define TX_TIMEOUT (2*HZ)
-
#define TX_Q_LIMIT 32
+
struct ifb_private {
- struct tasklet_struct ifb_tasklet;
- int tasklet_pending;
- /* mostly debug stats leave in for now */
- unsigned long st_task_enter; /* tasklet entered */
- unsigned long st_txq_refl_try; /* transmit queue refill attempt */
- unsigned long st_rxq_enter; /* receive queue entered */
- unsigned long st_rx2tx_tran; /* receive to trasmit transfers */
- unsigned long st_rxq_notenter; /*receiveQ not entered, resched */
- unsigned long st_rx_frm_egr; /* received from egress path */
- unsigned long st_rx_frm_ing; /* received from ingress path */
- unsigned long st_rxq_check;
- unsigned long st_rxq_rsch;
- struct sk_buff_head rq;
- struct sk_buff_head tq;
+ struct net_device *dev;
+ struct sk_buff_head rq;
+ struct sk_buff_head tq;
+ wait_queue_head_t wq;
+ struct task_struct *task;
};
+/* Number of ifb devices to be set up by this module. */
static int numifbs = 2;
+module_param(numifbs, int, 0444);
+MODULE_PARM_DESC(numifbs, "Number of ifb devices");
-static void ri_tasklet(unsigned long dev);
-static netdev_tx_t ifb_xmit(struct sk_buff *skb, struct net_device *dev);
-static int ifb_open(struct net_device *dev);
-static int ifb_close(struct net_device *dev);
+/* Number of TX queues per ifb */
+static int numtxqs = 1;
+module_param(numtxqs, int, 0444);
+MODULE_PARM_DESC(numtxqs, "Number of TX queues per ifb");
-static void ri_tasklet(unsigned long dev)
+static int ifb_thread(void *priv)
{
-
- struct net_device *_dev = (struct net_device *)dev;
- struct ifb_private *dp = netdev_priv(_dev);
- struct net_device_stats *stats = &_dev->stats;
- struct netdev_queue *txq;
+ struct ifb_private *dp = (struct ifb_private*)priv;
+ struct net_device *dev = dp->dev;
+ struct net_device_stats *stats = &dev->stats;
+ unsigned int num = dp - (struct ifb_private*)netdev_priv(dev);
+ struct netdev_queue *txq = netdev_get_tx_queue(dev, num);
struct sk_buff *skb;
-
- txq = netdev_get_tx_queue(_dev, 0);
- dp->st_task_enter++;
- if ((skb = skb_peek(&dp->tq)) == NULL) {
- dp->st_txq_refl_try++;
- if (__netif_tx_trylock(txq)) {
- dp->st_rxq_enter++;
- while ((skb = skb_dequeue(&dp->rq)) != NULL) {
+ DEFINE_WAIT(wait);
+
+ while (1) {
+ /* move skb from rq to tq */
+ while (1) {
+ prepare_to_wait(&dp->wq, &wait, TASK_UNINTERRUPTIBLE);
+ while (!__netif_tx_trylock(txq))
+ yield();
+ while ((skb = skb_dequeue(&dp->rq)) != NULL)
skb_queue_tail(&dp->tq, skb);
- dp->st_rx2tx_tran++;
- }
+ if (netif_queue_stopped(dev))
+ netif_wake_queue(dev);
__netif_tx_unlock(txq);
- } else {
- /* reschedule */
- dp->st_rxq_notenter++;
- goto resched;
+ if (kthread_should_stop() || !skb_queue_empty(&dp->tq))
+ break;
+ schedule();
}
- }
-
- while ((skb = skb_dequeue(&dp->tq)) != NULL) {
- u32 from = G_TC_FROM(skb->tc_verd);
-
- skb->tc_verd = 0;
- skb->tc_verd = SET_TC_NCLS(skb->tc_verd);
- stats->tx_packets++;
- stats->tx_bytes +=skb->len;
-
- skb->dev = dev_get_by_index(&init_net, skb->iif);
- if (!skb->dev) {
- dev_kfree_skb(skb);
- stats->tx_dropped++;
+ finish_wait(&dp->wq, &wait);
+ if (kthread_should_stop())
break;
- }
- dev_put(skb->dev);
- skb->iif = _dev->ifindex;
-
- if (from & AT_EGRESS) {
- dp->st_rx_frm_egr++;
- dev_queue_xmit(skb);
- } else if (from & AT_INGRESS) {
- dp->st_rx_frm_ing++;
- skb_pull(skb, skb->dev->hard_header_len);
- netif_rx(skb);
- } else
- BUG();
- }
- if (__netif_tx_trylock(txq)) {
- dp->st_rxq_check++;
- if ((skb = skb_peek(&dp->rq)) == NULL) {
- dp->tasklet_pending = 0;
- if (netif_queue_stopped(_dev))
- netif_wake_queue(_dev);
- } else {
- dp->st_rxq_rsch++;
- __netif_tx_unlock(txq);
- goto resched;
+ /* transfer packets */
+ while ((skb = skb_dequeue(&dp->tq)) != NULL) {
+ u32 from = G_TC_FROM(skb->tc_verd);
+
+ skb->tc_verd = 0;
+ skb->tc_verd = SET_TC_NCLS(skb->tc_verd);
+ stats->tx_packets++;
+ stats->tx_bytes +=skb->len;
+
+ skb->dev = dev_get_by_index(&init_net, skb->iif);
+ if (!skb->dev) {
+ dev_kfree_skb(skb);
+ stats->tx_dropped++;
+ break;
+ }
+ dev_put(skb->dev);
+ skb->iif = dev->ifindex;
+
+ if (from & AT_EGRESS) {
+ dev_queue_xmit(skb);
+ } else if (from & AT_INGRESS) {
+ skb_pull(skb, skb->dev->hard_header_len);
+ netif_rx_ni(skb);
+ } else
+ BUG();
}
- __netif_tx_unlock(txq);
- } else {
-resched:
- dp->tasklet_pending = 1;
- tasklet_schedule(&dp->ifb_tasklet);
}
-}
-
-static const struct net_device_ops ifb_netdev_ops = {
- .ndo_open = ifb_open,
- .ndo_stop = ifb_close,
- .ndo_start_xmit = ifb_xmit,
- .ndo_validate_addr = eth_validate_addr,
-};
-
-static void ifb_setup(struct net_device *dev)
-{
- /* Initialize the device structure. */
- dev->destructor = free_netdev;
- dev->netdev_ops = &ifb_netdev_ops;
-
- /* Fill in device structure with ethernet-generic values. */
- ether_setup(dev);
- dev->tx_queue_len = TX_Q_LIMIT;
-
- dev->flags |= IFF_NOARP;
- dev->flags &= ~IFF_MULTICAST;
- dev->priv_flags &= ~IFF_XMIT_DST_RELEASE;
- random_ether_addr(dev->dev_addr);
+ return 0;
}
static netdev_tx_t ifb_xmit(struct sk_buff *skb, struct net_device *dev)
{
- struct ifb_private *dp = netdev_priv(dev);
struct net_device_stats *stats = &dev->stats;
u32 from = G_TC_FROM(skb->tc_verd);
+ int num = skb_get_queue_mapping(skb);
+ struct ifb_private *dp = ((struct ifb_private*)netdev_priv(dev)) + num;
stats->rx_packets++;
stats->rx_bytes+=skb->len;
@@ -182,10 +144,8 @@ static netdev_tx_t ifb_xmit(struct sk_buff *skb, struct net_device *dev)
dev->trans_start = jiffies;
skb_queue_tail(&dp->rq, skb);
- if (!dp->tasklet_pending) {
- dp->tasklet_pending = 1;
- tasklet_schedule(&dp->ifb_tasklet);
- }
+ if (skb_queue_len(&dp->rq) == 1)
+ wake_up(&dp->wq);
return NETDEV_TX_OK;
}
@@ -193,26 +153,132 @@ static netdev_tx_t ifb_xmit(struct sk_buff *skb, struct net_device *dev)
static int ifb_close(struct net_device *dev)
{
struct ifb_private *dp = netdev_priv(dev);
+ int i;
+
+ for (i = 0; i < dev->real_num_tx_queues; i++) {
+ kthread_stop(dp[i].task);
+ skb_queue_purge(&dp[i].tq);
+ skb_queue_purge(&dp[i].rq);
+ }
- tasklet_kill(&dp->ifb_tasklet);
netif_stop_queue(dev);
- skb_queue_purge(&dp->rq);
- skb_queue_purge(&dp->tq);
+
return 0;
}
static int ifb_open(struct net_device *dev)
{
struct ifb_private *dp = netdev_priv(dev);
+ int i;
+
+ for (i = 0; i < dev->real_num_tx_queues; i++) {
+ dp[i].dev = dev;
+ skb_queue_head_init(&dp[i].rq);
+ skb_queue_head_init(&dp[i].tq);
+ init_waitqueue_head(&dp[i].wq);
+ dp[i].task = kthread_run(ifb_thread, &dp[i], "%s/%d", dev->name,
+ i);
+ if (IS_ERR(dp[i].task)) {
+ int err = PTR_ERR(dp[i].task);
+ while (--i >= 0)
+ kthread_stop(dp[i].task);
+ return err;
+ }
+ }
- tasklet_init(&dp->ifb_tasklet, ri_tasklet, (unsigned long)dev);
- skb_queue_head_init(&dp->rq);
- skb_queue_head_init(&dp->tq);
netif_start_queue(dev);
return 0;
}
+static u32 simple_tx_hashrnd;
+
+static u16 ifb_select_queue(struct net_device *dev, struct sk_buff *skb)
+{
+ u32 addr1, addr2;
+ u32 hash, ihl;
+ union {
+ u16 in16[2];
+ u32 in32;
+ } ports;
+ u8 ip_proto;
+
+ if ((hash = skb_rx_queue_recorded(skb))) {
+ while (hash >= dev->real_num_tx_queues)
+ hash -= dev->real_num_tx_queues;
+ return hash;
+ }
+
+ switch (skb->protocol) {
+ case __constant_htons(ETH_P_IP):
+ if (!(ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)))
+ ip_proto = ip_hdr(skb)->protocol;
+ else
+ ip_proto = 0;
+ addr1 = ip_hdr(skb)->saddr;
+ addr2 = ip_hdr(skb)->daddr;
+ ihl = ip_hdr(skb)->ihl << 2;
+ break;
+ case __constant_htons(ETH_P_IPV6):
+ ip_proto = ipv6_hdr(skb)->nexthdr;
+ addr1 = ipv6_hdr(skb)->saddr.s6_addr32[3];
+ addr2 = ipv6_hdr(skb)->daddr.s6_addr32[3];
+ ihl = 10;
+ break;
+ default:
+ return 0;
+ }
+ if (addr1 > addr2)
+ swap(addr1, addr2);
+
+ switch (ip_proto) {
+ case IPPROTO_TCP:
+ case IPPROTO_UDP:
+ case IPPROTO_DCCP:
+ case IPPROTO_ESP:
+ case IPPROTO_AH:
+ case IPPROTO_SCTP:
+ case IPPROTO_UDPLITE:
+ ports.in32 = *((u32 *) (skb_network_header(skb) + ihl));
+ if (ports.in16[0] > ports.in16[1])
+ swap(ports.in16[0], ports.in16[1]);
+ break;
+
+ default:
+ ports.in32 = 0;
+ break;
+ }
+
+ hash = jhash_3words(addr1, addr2, ports.in32,
+ simple_tx_hashrnd ^ ip_proto);
+
+ return (u16) (((u64) hash * dev->real_num_tx_queues) >> 32);
+}
+
+static const struct net_device_ops ifb_netdev_ops = {
+ .ndo_open = ifb_open,
+ .ndo_stop = ifb_close,
+ .ndo_start_xmit = ifb_xmit,
+ .ndo_validate_addr = eth_validate_addr,
+ .ndo_select_queue = ifb_select_queue,
+};
+
+static void ifb_setup(struct net_device *dev)
+{
+ /* Initialize the device structure. */
+ dev->destructor = free_netdev;
+ dev->netdev_ops = &ifb_netdev_ops;
+
+ /* Fill in device structure with ethernet-generic values. */
+ ether_setup(dev);
+ dev->tx_queue_len = TX_Q_LIMIT;
+
+ dev->flags |= IFF_NOARP;
+ dev->flags &= ~IFF_MULTICAST;
+ dev->priv_flags &= ~IFF_XMIT_DST_RELEASE;
+ random_ether_addr(dev->dev_addr);
+}
+
static int ifb_validate(struct nlattr *tb[], struct nlattr *data[])
{
if (tb[IFLA_ADDRESS]) {
@@ -231,17 +297,13 @@ static struct rtnl_link_ops ifb_link_ops __read_mostly = {
.validate = ifb_validate,
};
-/* Number of ifb devices to be set up by this module. */
-module_param(numifbs, int, 0);
-MODULE_PARM_DESC(numifbs, "Number of ifb devices");
-
static int __init ifb_init_one(int index)
{
struct net_device *dev_ifb;
int err;
- dev_ifb = alloc_netdev(sizeof(struct ifb_private),
- "ifb%d", ifb_setup);
+ dev_ifb = alloc_netdev_mq(sizeof(struct ifb_private) * numtxqs, "ifb%d",
+ ifb_setup, numtxqs);
if (!dev_ifb)
return -ENOMEM;
@@ -266,6 +328,7 @@ static int __init ifb_init_module(void)
{
int i, err;
+ get_random_bytes(&simple_tx_hashrnd, 4);
rtnl_lock();
err = __rtnl_link_register(&ifb_link_ops);
next reply other threads:[~2009-11-10 8:30 UTC|newest]
Thread overview: 62+ messages / expand[flat|nested] mbox.gz Atom feed top
2009-11-10 8:30 Changli Gao [this message]
2009-11-10 9:07 ` [PATCH] ifb: add multi-queue support Eric Dumazet
2009-11-10 9:43 ` Changli Gao
2009-11-10 10:57 ` Eric Dumazet
2009-11-10 11:14 ` Changli Gao
2009-11-10 11:41 ` Patrick McHardy
2009-11-10 12:14 ` Changli Gao
2009-11-10 12:19 ` Patrick McHardy
2009-11-10 12:37 ` Changli Gao
2009-11-10 12:45 ` Patrick McHardy
2009-11-10 13:06 ` Changli Gao
2009-11-10 13:34 ` Eric Dumazet
2009-11-10 13:49 ` Changli Gao
2009-11-10 16:45 ` Stephen Hemminger
2009-11-11 6:30 ` Changli Gao
2009-11-10 10:29 ` Patrick McHardy
2009-11-10 10:48 ` Changli Gao
2009-11-10 10:55 ` Eric Dumazet
-- strict thread matches above, loose matches on Subject: below --
2009-11-11 9:51 Changli Gao
2009-11-11 9:56 ` Changli Gao
2009-11-11 10:30 ` Eric Dumazet
2009-11-11 10:57 ` Changli Gao
2009-11-11 15:59 ` Patrick McHardy
2009-11-12 3:12 ` Changli Gao
2009-11-12 8:52 ` Jarek Poplawski
2009-11-12 9:32 ` Changli Gao
2009-11-12 15:10 ` Patrick McHardy
2009-11-13 1:28 ` Changli Gao
2009-11-12 9:44 ` Changli Gao
2009-11-12 9:48 ` Changli Gao
2009-11-12 15:11 ` Patrick McHardy
2009-11-13 1:32 ` Changli Gao
2009-11-13 7:18 ` Patrick McHardy
2009-11-12 12:48 ` Eric Dumazet
2009-11-13 1:26 ` Changli Gao
2009-11-13 5:56 ` Eric Dumazet
2009-11-13 6:16 ` Changli Gao
2009-11-13 7:45 ` Jarek Poplawski
2009-11-13 8:54 ` Changli Gao
2009-11-13 9:18 ` Jarek Poplawski
2009-11-13 9:38 ` Changli Gao
2009-11-13 9:57 ` Jarek Poplawski
2009-11-13 11:25 ` Changli Gao
2009-11-13 12:32 ` Jarek Poplawski
2009-11-13 13:10 ` Eric Dumazet
2009-11-13 16:15 ` Stephen Hemminger
2009-11-13 23:28 ` Changli Gao
2009-11-13 23:32 ` Stephen Hemminger
2009-11-13 23:42 ` Changli Gao
2009-11-14 12:53 ` Eric Dumazet
2009-11-14 13:30 ` Changli Gao
2009-11-13 13:55 ` Eric Dumazet
2009-11-13 4:37 ` Changli Gao
2009-11-16 16:39 ` Stephen Hemminger
2009-11-17 3:10 ` David Miller
2009-11-17 5:38 ` Changli Gao
2009-11-17 6:02 ` Stephen Hemminger
2009-11-13 4:42 Changli Gao
2009-11-13 4:46 ` Changli Gao
2009-11-16 7:31 Changli Gao
2009-11-16 8:19 ` Eric Dumazet
2009-11-16 8:43 ` Changli Gao
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=4AF924A5.1050303@gmail.com \
--to=xiaosuo@gmail.com \
--cc=davem@davemloft.net \
--cc=netdev@vger.kernel.org \
--cc=therbert@google.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.