xen-devel.lists.xenproject.org archive mirror
* [Pv-ops][PATCH] Netback multiple tasklet support
@ 2009-11-27  2:26 Xu, Dongxiao
  2009-11-27  9:42 ` Ian Campbell
  2009-11-27 16:15 ` Ian Pratt
  0 siblings, 2 replies; 46+ messages in thread
From: Xu, Dongxiao @ 2009-11-27  2:26 UTC (permalink / raw)
  To: xen-devel@lists.xensource.com; +Cc: Jeremy Fitzhardinge

[-- Attachment #1: Type: text/plain, Size: 1210 bytes --]

Current netback uses one pair of tasklets for Tx/Rx data transactions. A netback tasklet can only run on one CPU at a time, and it serves all the netfronts, so it has become a performance bottleneck. This patch replaces the current single pair in dom0 with multiple tasklet pairs.
Assuming that Dom0 has CPUNR VCPUs, we define CPUNR tasklet pairs (CPUNR for Tx, and CPUNR for Rx). Each pair of tasklets serves a specific group of netfronts. We also duplicate the global and static variables for each group in order to avoid spinlock contention.
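
As a quick illustration of the grouping policy (not part of the patch), the standalone sketch below models how add_domain_to_list() picks the tasklet group that currently serves the fewest domains for each new netfront; the group count of 4 is just an arbitrary stand-in for cpu_online_nr and the domids are made up.

/* Illustrative model of the least-loaded group selection; values are made up. */
#include <stdio.h>

#define NR_GROUPS 4                     /* stands in for cpu_online_nr */

struct group {
	int domain_nr;                  /* domains currently served by this group */
};

static int pick_group(const struct group *grp, int nr_groups)
{
	int i, min_idx = 0;

	for (i = 1; i < nr_groups; i++)
		if (grp[i].domain_nr < grp[min_idx].domain_nr)
			min_idx = i;
	return min_idx;
}

int main(void)
{
	struct group grp[NR_GROUPS] = { { 0 } };
	int domid;

	/* Attach ten simulated netfronts and show how they spread across groups. */
	for (domid = 1; domid <= 10; domid++) {
		int g = pick_group(grp, NR_GROUPS);
		grp[g].domain_nr++;
		printf("domid %2d -> group %d\n", domid, g);
	}
	return 0;
}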

Test scenario:
We use ten 1G NIC interfaces to talk with 10 VMs (netfronts) on the server, so the total bandwidth is 10G.
On the host machine, bind each guest's netfront to its own NIC interface.
On the client machine, run netperf against each guest.

Test Case    Packet Size    Throughput (Mbps)    Dom0 CPU Util    Guests CPU Util
w/o patch    1400           4304.30              400.33%          112.21%
w/  patch    1400           9533.13              461.64%          243.81%

BTW, while testing this patch we found that the domain_lock in the grant table operation becomes a bottleneck. We temporarily removed the global domain_lock to achieve good performance.
 
Best Regards, 
-- Dongxiao

[-- Attachment #2: 0001-Netback-multiple-tasklets-support.patch --]
[-- Type: application/octet-stream, Size: 44319 bytes --]

From 590ec4af7e7964c7249a812fc99be37b1648d058 Mon Sep 17 00:00:00 2001
From: Dongxiao Xu <dongxiao.xu@intel.com>
Date: Fri, 27 Nov 2009 10:13:57 +0800
Subject: [PATCH] Netback multiple tasklets support.
     Now netback uses one pair of tasklets for Tx/Rx data transactions. The
 netback tasklet can only run on one CPU at a time, and it serves all the
 netfronts, so it has become a performance bottleneck. This patch replaces the
 current single pair in dom0 with multiple tasklet pairs.
     Assuming that Dom0 has CPUNR VCPUs, we define CPUNR tasklet pairs (CPUNR
 for Tx, and CPUNR for Rx). Each pair of tasklets serves a specific group of
 netfronts. We also duplicate the global and static variables for each group
 in order to avoid spinlock contention.

Signed-off-by: Dongxiao Xu <dongxiao.xu@intel.com>
---
 drivers/xen/netback/common.h    |   78 ++++++
 drivers/xen/netback/interface.c |   64 +++++-
 drivers/xen/netback/netback.c   |  564 +++++++++++++++++++++------------------
 3 files changed, 445 insertions(+), 261 deletions(-)

diff --git a/drivers/xen/netback/common.h b/drivers/xen/netback/common.h
index 348644a..3e91012 100644
--- a/drivers/xen/netback/common.h
+++ b/drivers/xen/netback/common.h
@@ -56,6 +56,7 @@
 struct xen_netif {
 	/* Unique identifier for this interface. */
 	domid_t          domid;
+	int 		 grp_index;
 	unsigned int     handle;
 
 	u8               fe_dev_addr[6];
@@ -220,4 +221,81 @@ static inline int netbk_can_sg(struct net_device *dev)
 	return netif->features & NETIF_F_SG;
 }
 
+struct pending_tx_info {
+	struct xen_netif_tx_request req;
+	struct xen_netif *netif;
+};
+typedef unsigned int pending_ring_idx_t;
+
+struct page_ext {
+	unsigned long grp_index;
+	unsigned long idx;
+};
+
+struct netbk_rx_meta {
+	skb_frag_t frag;
+	int id;
+};
+
+struct netbk_tx_pending_inuse {
+	struct list_head list;
+	unsigned long alloc_time;
+};
+
+#define MAX_PENDING_REQS 256
+
+struct netbk {
+	struct tasklet_struct net_tx_tasklet;
+	struct tasklet_struct net_rx_tasklet;
+
+	struct sk_buff_head rx_queue;
+	struct sk_buff_head tx_queue;
+
+	struct timer_list net_timer;
+	struct timer_list netbk_tx_pending_timer;
+
+	struct page **mmap_pages;
+
+	struct page_ext page_extinfo[MAX_PENDING_REQS];
+
+	struct pending_tx_info pending_tx_info[MAX_PENDING_REQS];
+	struct netbk_tx_pending_inuse pending_inuse[MAX_PENDING_REQS];
+	struct gnttab_unmap_grant_ref tx_unmap_ops[MAX_PENDING_REQS];
+	struct gnttab_map_grant_ref tx_map_ops[MAX_PENDING_REQS];
+
+	grant_handle_t grant_tx_handle[MAX_PENDING_REQS];
+	u16 pending_ring[MAX_PENDING_REQS];
+	u16 dealloc_ring[MAX_PENDING_REQS];
+
+	pending_ring_idx_t pending_prod;
+	pending_ring_idx_t pending_cons;
+	pending_ring_idx_t dealloc_prod;
+	pending_ring_idx_t dealloc_cons;
+
+	struct list_head pending_inuse_head;
+	struct list_head net_schedule_list;
+
+	struct multicall_entry rx_mcl[NET_RX_RING_SIZE+3];
+	struct mmu_update rx_mmu[NET_RX_RING_SIZE];
+	struct gnttab_transfer grant_trans_op[NET_RX_RING_SIZE];
+	struct gnttab_copy grant_copy_op[NET_RX_RING_SIZE];
+	unsigned char rx_notify[NR_IRQS];
+	u16 notify_list[NET_RX_RING_SIZE];
+	struct netbk_rx_meta meta[NET_RX_RING_SIZE];
+
+	spinlock_t net_schedule_list_lock;
+	spinlock_t domain_list_lock;
+	struct list_head domains;
+	unsigned int domain_nr;
+};
+
+extern struct netbk *netbk;
+extern int cpu_online_nr;
+extern struct page_foreign_tracker *foreign_page_tracker;
+
+struct domain_entry {
+	int domid;
+	struct list_head dom;
+};
+
 #endif /* __NETIF__BACKEND__COMMON_H__ */
diff --git a/drivers/xen/netback/interface.c b/drivers/xen/netback/interface.c
index 21c1f95..e87751a 100644
--- a/drivers/xen/netback/interface.c
+++ b/drivers/xen/netback/interface.c
@@ -54,6 +54,57 @@
 static unsigned long netbk_queue_length = 32;
 module_param_named(queue_length, netbk_queue_length, ulong, 0644);
 
+static int add_domain_to_list(struct netbk *netbk, int netbk_nr,
+		       struct xen_netif *netif)
+{
+	struct domain_entry *dom_entry;
+	int min_domain_list = 0;
+	int min_domain_nr = 0;
+	int i;
+
+	dom_entry = (struct domain_entry *)
+		kmalloc(sizeof(struct domain_entry), GFP_KERNEL);
+	if (!dom_entry)
+		return -ENOMEM;
+
+	/* Find out the list which contains least number of domain */
+	min_domain_nr = netbk[0].domain_nr;
+	for (i = 0; i < netbk_nr; i++) {
+		if (netbk[i].domain_nr < min_domain_nr) {
+			min_domain_list = i;
+			min_domain_nr = netbk[i].domain_nr;
+		}
+	}
+
+	netif->grp_index = min_domain_list;
+	dom_entry->domid = netif->domid;
+	spin_lock(&netbk[netif->grp_index].domain_list_lock);
+	list_add_tail(&dom_entry->dom, &netbk[netif->grp_index].domains);
+	netbk[netif->grp_index].domain_nr++;
+	spin_unlock(&netbk[netif->grp_index].domain_list_lock);
+	return netif->grp_index;
+}
+
+static void remove_domain_from_list(struct netbk *netbk, int netbk_nr,
+			     struct xen_netif *netif)
+{
+	struct domain_entry *dom_entry = NULL;
+	int grp_index = netif->grp_index;
+
+	list_for_each_entry(dom_entry, &netbk[grp_index].domains, dom) {
+		if (dom_entry->domid == netif->domid)
+			break;
+	}
+	if (!dom_entry)
+		return;
+
+	spin_lock(&netbk[netif->grp_index].domain_list_lock);
+	netbk[netif->grp_index].domain_nr--;
+	list_del(&dom_entry->dom);
+	spin_unlock(&netbk[netif->grp_index].domain_list_lock);
+	kfree(dom_entry);
+}
+
 static void __netif_up(struct xen_netif *netif)
 {
 	enable_irq(netif->irq);
@@ -70,6 +121,7 @@ static int net_open(struct net_device *dev)
 {
 	struct xen_netif *netif = netdev_priv(dev);
 	if (netback_carrier_ok(netif)) {
+		add_domain_to_list(netbk, cpu_online_nr, netif);
 		__netif_up(netif);
 		netif_start_queue(dev);
 	}
@@ -79,8 +131,10 @@ static int net_open(struct net_device *dev)
 static int net_close(struct net_device *dev)
 {
 	struct xen_netif *netif = netdev_priv(dev);
-	if (netback_carrier_ok(netif))
+	if (netback_carrier_ok(netif)) {
 		__netif_down(netif);
+		remove_domain_from_list(netbk, cpu_online_nr, netif);
+	}
 	netif_stop_queue(dev);
 	return 0;
 }
@@ -329,6 +383,9 @@ int netif_map(struct xen_netif *netif, unsigned long tx_ring_ref,
 	if (netif->rx_comms_area == NULL)
 		goto err_rx;
 
+	if (add_domain_to_list(netbk, cpu_online_nr, netif) < 0)
+		goto err_map;
+
 	err = map_frontend_pages(netif, tx_ring_ref, rx_ring_ref);
 	if (err)
 		goto err_map;
@@ -361,6 +418,7 @@ int netif_map(struct xen_netif *netif, unsigned long tx_ring_ref,
 	return 0;
 err_hypervisor:
 	unmap_frontend_pages(netif);
+	remove_domain_from_list(netbk, cpu_online_nr, netif);
 err_map:
 	free_vm_area(netif->rx_comms_area);
 err_rx:
@@ -374,8 +432,10 @@ void netif_disconnect(struct xen_netif *netif)
 		rtnl_lock();
 		netback_carrier_off(netif);
 		netif_carrier_off(netif->dev); /* discard queued packets */
-		if (netif_running(netif->dev))
+		if (netif_running(netif->dev)) {
 			__netif_down(netif);
+			remove_domain_from_list(netbk, cpu_online_nr, netif);
+		}
 		rtnl_unlock();
 		netif_put(netif);
 	}
diff --git a/drivers/xen/netback/netback.c b/drivers/xen/netback/netback.c
index c24debf..103ee8a 100644
--- a/drivers/xen/netback/netback.c
+++ b/drivers/xen/netback/netback.c
@@ -49,18 +49,7 @@
 
 /*define NETBE_DEBUG_INTERRUPT*/
 
-struct netbk_rx_meta {
-	skb_frag_t frag;
-	int id;
-};
-
-struct netbk_tx_pending_inuse {
-	struct list_head list;
-	unsigned long alloc_time;
-};
-
-
-static void netif_idx_release(u16 pending_idx);
+static void netif_idx_release(int grp_index, u16 pending_idx);
 static void make_tx_response(struct xen_netif *netif,
 			     struct xen_netif_tx_request *txp,
 			     s8       st);
@@ -71,44 +60,39 @@ static struct xen_netif_rx_response *make_rx_response(struct xen_netif *netif,
 					     u16      size,
 					     u16      flags);
 
-static void net_tx_action(unsigned long unused);
-static DECLARE_TASKLET(net_tx_tasklet, net_tx_action, 0);
-
-static void net_rx_action(unsigned long unused);
-static DECLARE_TASKLET(net_rx_tasklet, net_rx_action, 0);
+static void net_tx_action(unsigned long grp_index);
 
-static struct timer_list net_timer;
-static struct timer_list netbk_tx_pending_timer;
+static void net_rx_action(unsigned long grp_index);
 
-#define MAX_PENDING_REQS 256
-
-static struct sk_buff_head rx_queue;
-
-static struct page **mmap_pages;
-static inline unsigned long idx_to_pfn(unsigned int idx)
+static inline unsigned long idx_to_pfn(int grp_index, unsigned int idx)
 {
-	return page_to_pfn(mmap_pages[idx]);
+	return page_to_pfn(netbk[grp_index].mmap_pages[idx]);
 }
 
-static inline unsigned long idx_to_kaddr(unsigned int idx)
+static inline unsigned long idx_to_kaddr(int grp_index, unsigned int idx)
 {
-	return (unsigned long)pfn_to_kaddr(idx_to_pfn(idx));
+	return (unsigned long)pfn_to_kaddr(idx_to_pfn(grp_index, idx));
 }
 
 /* extra field used in struct page */
-static inline void netif_set_page_index(struct page *pg, unsigned int index)
+static inline void netif_set_page_index(struct page *pg,
+					struct page_ext *page_extinfo)
 {
-	*(unsigned long *)&pg->mapping = index + 1;
+	pg->mapping = (struct address_space *)page_extinfo;
 }
 
 static inline int netif_page_index(struct page *pg)
 {
-	unsigned long idx = (unsigned long)pg->mapping - 1;
+	int grp_index;
+	int idx;
 
 	if (!PageForeign(pg))
 		return -1;
 
-	if ((idx >= MAX_PENDING_REQS) || (mmap_pages[idx] != pg))
+	grp_index = ((struct page_ext *)(pg->mapping))->grp_index;
+	idx = ((struct page_ext *)(pg->mapping))->idx;
+	if ((idx >= MAX_PENDING_REQS) ||
+			(netbk[grp_index].mmap_pages[idx] != pg))
 		return -1;
 
 	return idx;
@@ -125,67 +109,36 @@ static inline int netif_page_index(struct page *pg)
  */
 #define PKT_PROT_LEN 64
 
-static struct pending_tx_info {
-	struct xen_netif_tx_request req;
-	struct xen_netif *netif;
-} pending_tx_info[MAX_PENDING_REQS];
-static u16 pending_ring[MAX_PENDING_REQS];
-typedef unsigned int pending_ring_idx_t;
-
 static inline pending_ring_idx_t pending_index(unsigned i)
 {
 	return i & (MAX_PENDING_REQS-1);
 }
 
-static pending_ring_idx_t pending_prod, pending_cons;
-
-static inline pending_ring_idx_t nr_pending_reqs(void)
+static inline pending_ring_idx_t nr_pending_reqs(int grp_index)
 {
-	return MAX_PENDING_REQS - pending_prod + pending_cons;
+	return MAX_PENDING_REQS -
+		netbk[grp_index].pending_prod + netbk[grp_index].pending_cons;
 }
 
-static struct page_foreign_tracker *foreign_page_tracker;
-
-/* Freed TX SKBs get batched on this ring before return to pending_ring. */
-static u16 dealloc_ring[MAX_PENDING_REQS];
-static pending_ring_idx_t dealloc_prod, dealloc_cons;
+struct netbk *netbk;
 
-/* Doubly-linked list of in-use pending entries. */
-static struct netbk_tx_pending_inuse pending_inuse[MAX_PENDING_REQS];
-static LIST_HEAD(pending_inuse_head);
-
-static struct sk_buff_head tx_queue;
-
-static grant_handle_t grant_tx_handle[MAX_PENDING_REQS];
-static struct gnttab_unmap_grant_ref tx_unmap_ops[MAX_PENDING_REQS];
-static struct gnttab_map_grant_ref tx_map_ops[MAX_PENDING_REQS];
-
-static LIST_HEAD(net_schedule_list);
-static DEFINE_SPINLOCK(net_schedule_list_lock);
-
-#define MAX_MFN_ALLOC 64
-static unsigned long mfn_list[MAX_MFN_ALLOC];
-static unsigned int alloc_index = 0;
+#define GET_GROUP_INDEX(netif) ((netif)->grp_index)
 
 /* Setting this allows the safe use of this driver without netloop. */
 static int MODPARM_copy_skb = 1;
 module_param_named(copy_skb, MODPARM_copy_skb, bool, 0);
 MODULE_PARM_DESC(copy_skb, "Copy data received from netfront without netloop");
 
+int cpu_online_nr;
+struct page_foreign_tracker *foreign_page_tracker;
 int netbk_copy_skb_mode;
 
-static inline unsigned long alloc_mfn(void)
-{
-	BUG_ON(alloc_index == 0);
-	return mfn_list[--alloc_index];
-}
-
-static inline void maybe_schedule_tx_action(void)
+static inline void maybe_schedule_tx_action(int grp_index)
 {
 	smp_mb();
-	if ((nr_pending_reqs() < (MAX_PENDING_REQS/2)) &&
-	    !list_empty(&net_schedule_list))
-		tasklet_schedule(&net_tx_tasklet);
+	if ((nr_pending_reqs(grp_index) < (MAX_PENDING_REQS/2)) &&
+			!list_empty(&netbk[grp_index].net_schedule_list))
+		tasklet_schedule(&netbk[grp_index].net_tx_tasklet);
 }
 
 static struct sk_buff *netbk_copy_skb(struct sk_buff *skb)
@@ -290,6 +243,7 @@ static void tx_queue_callback(unsigned long data)
 int netif_be_start_xmit(struct sk_buff *skb, struct net_device *dev)
 {
 	struct xen_netif *netif = netdev_priv(dev);
+	int grp_index;
 
 	BUG_ON(skb->dev != dev);
 
@@ -334,9 +288,9 @@ int netif_be_start_xmit(struct sk_buff *skb, struct net_device *dev)
 			mod_timer(&netif->tx_queue_timeout, jiffies + HZ/2);
 		}
 	}
-
-	skb_queue_tail(&rx_queue, skb);
-	tasklet_schedule(&net_rx_tasklet);
+	grp_index = GET_GROUP_INDEX(netif);
+	skb_queue_tail(&netbk[grp_index].rx_queue, skb);
+	tasklet_schedule(&netbk[grp_index].net_rx_tasklet);
 
 	return 0;
 
@@ -495,7 +449,7 @@ static void netbk_add_frag_responses(struct xen_netif *netif, int status,
 	}
 }
 
-static void net_rx_action(unsigned long unused)
+static void net_rx_action(unsigned long grp_index)
 {
 	struct xen_netif *netif = NULL;
 	s8 status;
@@ -510,30 +464,19 @@ static void net_rx_action(unsigned long unused)
 	int count;
 	unsigned long offset;
 
-	/*
-	 * Putting hundreds of bytes on the stack is considered rude.
-	 * Static works because a tasklet can only be on one CPU at any time.
-	 */
-	static struct multicall_entry rx_mcl[NET_RX_RING_SIZE+3];
-	static struct mmu_update rx_mmu[NET_RX_RING_SIZE];
-	static struct gnttab_transfer grant_trans_op[NET_RX_RING_SIZE];
-	static struct gnttab_copy grant_copy_op[NET_RX_RING_SIZE];
-	static unsigned char rx_notify[NR_IRQS];
-	static u16 notify_list[NET_RX_RING_SIZE];
-	static struct netbk_rx_meta meta[NET_RX_RING_SIZE];
-
 	struct netrx_pending_operations npo = {
-		mmu: rx_mmu,
-		trans: grant_trans_op,
-		copy: grant_copy_op,
-		mcl: rx_mcl,
-		meta: meta};
+		.mmu   = netbk[grp_index].rx_mmu,
+		.trans = netbk[grp_index].grant_trans_op,
+		.copy  = netbk[grp_index].grant_copy_op,
+		.mcl   = netbk[grp_index].rx_mcl,
+		.meta  = netbk[grp_index].meta,
+	};
 
 	skb_queue_head_init(&rxq);
 
 	count = 0;
 
-	while ((skb = skb_dequeue(&rx_queue)) != NULL) {
+	while ((skb = skb_dequeue(&netbk[grp_index].rx_queue)) != NULL) {
 		nr_frags = skb_shinfo(skb)->nr_frags;
 		*(int *)skb->cb = nr_frags;
 
@@ -548,39 +491,41 @@ static void net_rx_action(unsigned long unused)
 			break;
 	}
 
-	BUG_ON(npo.meta_prod > ARRAY_SIZE(meta));
+	BUG_ON(npo.meta_prod > ARRAY_SIZE(netbk[grp_index].meta));
 
 	npo.mmu_mcl = npo.mcl_prod;
 	if (npo.mcl_prod) {
 		BUG_ON(xen_feature(XENFEAT_auto_translated_physmap));
-		BUG_ON(npo.mmu_prod > ARRAY_SIZE(rx_mmu));
+		BUG_ON(npo.mmu_prod > ARRAY_SIZE(netbk[grp_index].rx_mmu));
 		mcl = npo.mcl + npo.mcl_prod++;
 
 		BUG_ON(mcl[-1].op != __HYPERVISOR_update_va_mapping);
 		mcl[-1].args[MULTI_UVMFLAGS_INDEX] = UVMF_TLB_FLUSH|UVMF_ALL;
 
 		mcl->op = __HYPERVISOR_mmu_update;
-		mcl->args[0] = (unsigned long)rx_mmu;
+		mcl->args[0] = (unsigned long)netbk[grp_index].rx_mmu;
 		mcl->args[1] = npo.mmu_prod;
 		mcl->args[2] = 0;
 		mcl->args[3] = DOMID_SELF;
 	}
 
 	if (npo.trans_prod) {
-		BUG_ON(npo.trans_prod > ARRAY_SIZE(grant_trans_op));
+		BUG_ON(npo.trans_prod >
+				ARRAY_SIZE(netbk[grp_index].grant_trans_op));
 		mcl = npo.mcl + npo.mcl_prod++;
 		mcl->op = __HYPERVISOR_grant_table_op;
 		mcl->args[0] = GNTTABOP_transfer;
-		mcl->args[1] = (unsigned long)grant_trans_op;
+		mcl->args[1] = (unsigned long)netbk[grp_index].grant_trans_op;
 		mcl->args[2] = npo.trans_prod;
 	}
 
 	if (npo.copy_prod) {
-		BUG_ON(npo.copy_prod > ARRAY_SIZE(grant_copy_op));
+		BUG_ON(npo.copy_prod >
+				ARRAY_SIZE(netbk[grp_index].grant_copy_op));
 		mcl = npo.mcl + npo.mcl_prod++;
 		mcl->op = __HYPERVISOR_grant_table_op;
 		mcl->args[0] = GNTTABOP_copy;
-		mcl->args[1] = (unsigned long)grant_copy_op;
+		mcl->args[1] = (unsigned long)netbk[grp_index].grant_copy_op;
 		mcl->args[2] = npo.copy_prod;
 	}
 
@@ -588,7 +533,7 @@ static void net_rx_action(unsigned long unused)
 	if (!npo.mcl_prod)
 		return;
 
-	BUG_ON(npo.mcl_prod > ARRAY_SIZE(rx_mcl));
+	BUG_ON(npo.mcl_prod > ARRAY_SIZE(netbk[grp_index].rx_mcl));
 
 	ret = HYPERVISOR_multicall(npo.mcl, npo.mcl_prod);
 	BUG_ON(ret != 0);
@@ -605,7 +550,7 @@ static void net_rx_action(unsigned long unused)
 
 		status = netbk_check_gop(nr_frags, netif->domid, &npo);
 
-		id = meta[npo.meta_cons].id;
+		id = netbk[grp_index].meta[npo.meta_cons].id;
 		flags = nr_frags ? NETRXF_more_data : 0;
 
 		if (skb->ip_summed == CHECKSUM_PARTIAL) /* local packet? */
@@ -618,7 +563,7 @@ static void net_rx_action(unsigned long unused)
 		resp = make_rx_response(netif, id, status, offset,
 					skb_headlen(skb), flags);
 
-		if (meta[npo.meta_cons].frag.size) {
+		if (netbk[grp_index].meta[npo.meta_cons].frag.size) {
 			struct xen_netif_extra_info *gso =
 				(struct xen_netif_extra_info *)
 				RING_GET_RESPONSE(&netif->rx,
@@ -626,7 +571,8 @@ static void net_rx_action(unsigned long unused)
 
 			resp->flags |= NETRXF_extra_info;
 
-			gso->u.gso.size = meta[npo.meta_cons].frag.size;
+			gso->u.gso.size =
+				netbk[grp_index].meta[npo.meta_cons].frag.size;
 			gso->u.gso.type = XEN_NETIF_GSO_TYPE_TCPV4;
 			gso->u.gso.pad = 0;
 			gso->u.gso.features = 0;
@@ -636,15 +582,14 @@ static void net_rx_action(unsigned long unused)
 		}
 
 		netbk_add_frag_responses(netif, status,
-					 meta + npo.meta_cons + 1,
-					 nr_frags);
+				netbk[grp_index].meta + npo.meta_cons + 1,
+				nr_frags);
 
 		RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&netif->rx, ret);
 		irq = netif->irq;
-		if (ret && !rx_notify[irq] &&
-				(netif->smart_poll != 1)) {
-			rx_notify[irq] = 1;
-			notify_list[notify_nr++] = irq;
+		if (ret && !netbk[grp_index].rx_notify[irq]) {
+			netbk[grp_index].rx_notify[irq] = 1;
+			netbk[grp_index].notify_list[notify_nr++] = irq;
 		}
 
 		if (netif_queue_stopped(netif->dev) &&
@@ -669,28 +614,29 @@ static void net_rx_action(unsigned long unused)
 	}
 
 	while (notify_nr != 0) {
-		irq = notify_list[--notify_nr];
-		rx_notify[irq] = 0;
+		irq = netbk[grp_index].notify_list[--notify_nr];
+		netbk[grp_index].rx_notify[irq] = 0;
 		notify_remote_via_irq(irq);
 	}
 
 	/* More work to do? */
-	if (!skb_queue_empty(&rx_queue) && !timer_pending(&net_timer))
-		tasklet_schedule(&net_rx_tasklet);
+	if (!skb_queue_empty(&netbk[grp_index].rx_queue)
+			&& !timer_pending(&netbk[grp_index].net_timer))
+		tasklet_schedule(&netbk[grp_index].net_rx_tasklet);
 #if 0
 	else
 		xen_network_done_notify();
 #endif
 }
 
-static void net_alarm(unsigned long unused)
+static void net_alarm(unsigned long grp_index)
 {
-	tasklet_schedule(&net_rx_tasklet);
+	tasklet_schedule(&netbk[grp_index].net_rx_tasklet);
 }
 
-static void netbk_tx_pending_timeout(unsigned long unused)
+static void netbk_tx_pending_timeout(unsigned long grp_index)
 {
-	tasklet_schedule(&net_tx_tasklet);
+	tasklet_schedule(&netbk[grp_index].net_tx_tasklet);
 }
 
 struct net_device_stats *netif_be_get_stats(struct net_device *dev)
@@ -706,37 +652,41 @@ static int __on_net_schedule_list(struct xen_netif *netif)
 
 static void remove_from_net_schedule_list(struct xen_netif *netif)
 {
-	spin_lock_irq(&net_schedule_list_lock);
+	int grp_index = GET_GROUP_INDEX(netif);
+	spin_lock_irq(&netbk[grp_index].net_schedule_list_lock);
 	if (likely(__on_net_schedule_list(netif))) {
 		list_del_init(&netif->list);
 		netif_put(netif);
 	}
-	spin_unlock_irq(&net_schedule_list_lock);
+	spin_unlock_irq(&netbk[grp_index].net_schedule_list_lock);
 }
 
 static void add_to_net_schedule_list_tail(struct xen_netif *netif)
 {
+	int grp_index = GET_GROUP_INDEX(netif);
 	if (__on_net_schedule_list(netif))
 		return;
 
-	spin_lock_irq(&net_schedule_list_lock);
+	spin_lock_irq(&netbk[grp_index].net_schedule_list_lock);
 	if (!__on_net_schedule_list(netif) &&
 	    likely(netif_schedulable(netif))) {
-		list_add_tail(&netif->list, &net_schedule_list);
+		list_add_tail(&netif->list,
+				&netbk[grp_index].net_schedule_list);
 		netif_get(netif);
 	}
-	spin_unlock_irq(&net_schedule_list_lock);
+	spin_unlock_irq(&netbk[grp_index].net_schedule_list_lock);
 }
 
 void netif_schedule_work(struct xen_netif *netif)
 {
 	int more_to_do;
+	int grp_index = GET_GROUP_INDEX(netif);
 
 	RING_FINAL_CHECK_FOR_REQUESTS(&netif->tx, more_to_do);
 
 	if (more_to_do) {
 		add_to_net_schedule_list_tail(netif);
-		maybe_schedule_tx_action();
+		maybe_schedule_tx_action(grp_index);
 	}
 }
 
@@ -773,13 +723,15 @@ static void tx_credit_callback(unsigned long data)
 	netif_schedule_work(netif);
 }
 
-static inline int copy_pending_req(pending_ring_idx_t pending_idx)
+static inline int copy_pending_req(int grp_index,
+				   pending_ring_idx_t pending_idx)
 {
-	return gnttab_copy_grant_page(grant_tx_handle[pending_idx],
-				      &mmap_pages[pending_idx]);
+	return gnttab_copy_grant_page(
+			netbk[grp_index].grant_tx_handle[pending_idx],
+			&netbk[grp_index].mmap_pages[pending_idx]);
 }
 
-inline static void net_tx_action_dealloc(void)
+static inline void net_tx_action_dealloc(int grp_index)
 {
 	struct netbk_tx_pending_inuse *inuse, *n;
 	struct gnttab_unmap_grant_ref *gop;
@@ -789,51 +741,64 @@ inline static void net_tx_action_dealloc(void)
 	int ret;
 	LIST_HEAD(list);
 
-	dc = dealloc_cons;
-	gop = tx_unmap_ops;
+	dc = netbk[grp_index].dealloc_cons;
+	gop = netbk[grp_index].tx_unmap_ops;
 
 	/*
 	 * Free up any grants we have finished using
 	 */
 	do {
-		dp = dealloc_prod;
+		dp = netbk[grp_index].dealloc_prod;
 
 		/* Ensure we see all indices enqueued by netif_idx_release(). */
 		smp_rmb();
 
 		while (dc != dp) {
 			unsigned long pfn;
-
-			pending_idx = dealloc_ring[pending_index(dc++)];
+			struct netbk_tx_pending_inuse *pending_inuse;
+			pending_ring_idx_t p_index;
+			grant_handle_t handle;
+			struct page *page;
+
+			p_index = pending_index(dc++);
+			pending_idx = netbk[grp_index].dealloc_ring[p_index];
+			pending_inuse = netbk[grp_index].pending_inuse;
 			list_move_tail(&pending_inuse[pending_idx].list, &list);
 
-			pfn = idx_to_pfn(pending_idx);
+			pfn = idx_to_pfn(grp_index, pending_idx);
 			/* Already unmapped? */
 			if (!phys_to_machine_mapping_valid(pfn))
 				continue;
 
-			stop_tracking_page(mmap_pages[pending_idx]);
+			page = netbk[grp_index].mmap_pages[pending_idx];
+			stop_tracking_page(page);
 
-			gnttab_set_unmap_op(gop, idx_to_kaddr(pending_idx),
-					    GNTMAP_host_map,
-					    grant_tx_handle[pending_idx]);
+			handle = netbk[grp_index].grant_tx_handle[pending_idx];
+			gnttab_set_unmap_op(gop,
+					idx_to_kaddr(grp_index, pending_idx),
+					GNTMAP_host_map,
+					handle);
 			gop++;
 		}
 
 		if (netbk_copy_skb_mode != NETBK_DELAYED_COPY_SKB ||
-		    list_empty(&pending_inuse_head))
+		    list_empty(&netbk[grp_index].pending_inuse_head))
 			break;
 
 		/* Copy any entries that have been pending for too long. */
-		list_for_each_entry_safe(inuse, n, &pending_inuse_head, list) {
+		list_for_each_entry_safe(inuse, n,
+				&netbk[grp_index].pending_inuse_head, list) {
+			struct pending_tx_info *txinfo;
+
 			if (time_after(inuse->alloc_time + HZ / 2, jiffies))
 				break;
 
-			pending_idx = inuse - pending_inuse;
+			pending_idx = inuse - netbk[grp_index].pending_inuse;
 
-			pending_tx_info[pending_idx].netif->nr_copied_skbs++;
+			txinfo = &netbk[grp_index].pending_tx_info[pending_idx];
+			txinfo->netif->nr_copied_skbs++;
 
-			switch (copy_pending_req(pending_idx)) {
+			switch (copy_pending_req(grp_index, pending_idx)) {
 			case 0:
 				list_move_tail(&inuse->list, &list);
 				continue;
@@ -846,26 +811,34 @@ inline static void net_tx_action_dealloc(void)
 
 			break;
 		}
-	} while (dp != dealloc_prod);
+	} while (dp != netbk[grp_index].dealloc_prod);
 
-	dealloc_cons = dc;
+	netbk[grp_index].dealloc_cons = dc;
 
 	ret = HYPERVISOR_grant_table_op(
-		GNTTABOP_unmap_grant_ref, tx_unmap_ops, gop - tx_unmap_ops);
+			GNTTABOP_unmap_grant_ref,
+			netbk[grp_index].tx_unmap_ops,
+			gop - netbk[grp_index].tx_unmap_ops);
 	BUG_ON(ret);
 
 	list_for_each_entry_safe(inuse, n, &list, list) {
-		pending_idx = inuse - pending_inuse;
+		struct pending_tx_info *txinfo;
+		pending_ring_idx_t index;
+
+		pending_idx = inuse - netbk[grp_index].pending_inuse;
+		txinfo = netbk[grp_index].pending_tx_info;
 
-		netif = pending_tx_info[pending_idx].netif;
+		netif = txinfo[pending_idx].netif;
 
-		make_tx_response(netif, &pending_tx_info[pending_idx].req,
-				 NETIF_RSP_OKAY);
+		make_tx_response(netif, &txinfo[pending_idx].req,
+				NETIF_RSP_OKAY);
 
 		/* Ready for next use. */
-		gnttab_reset_grant_page(mmap_pages[pending_idx]);
+		gnttab_reset_grant_page(
+				netbk[grp_index].mmap_pages[pending_idx]);
 
-		pending_ring[pending_index(pending_prod++)] = pending_idx;
+		index = pending_index(netbk[grp_index].pending_prod++);
+		netbk[grp_index].pending_ring[index] = pending_idx;
 
 		netif_put(netif);
 
@@ -873,7 +846,8 @@ inline static void net_tx_action_dealloc(void)
 	}
 }
 
-static void netbk_tx_err(struct xen_netif *netif, struct xen_netif_tx_request *txp, RING_IDX end)
+static void netbk_tx_err(struct xen_netif *netif,
+		struct xen_netif_tx_request *txp, RING_IDX end)
 {
 	RING_IDX cons = netif->tx.req_cons;
 
@@ -890,7 +864,8 @@ static void netbk_tx_err(struct xen_netif *netif, struct xen_netif_tx_request *t
 
 static int netbk_count_requests(struct xen_netif *netif,
 				struct xen_netif_tx_request *first,
-				struct xen_netif_tx_request *txp, int work_to_do)
+				struct xen_netif_tx_request *txp,
+				int work_to_do)
 {
 	RING_IDX cons = netif->tx.req_cons;
 	int frags = 0;
@@ -930,35 +905,41 @@ static int netbk_count_requests(struct xen_netif *netif,
 }
 
 static struct gnttab_map_grant_ref *netbk_get_requests(struct xen_netif *netif,
-						  struct sk_buff *skb,
-						  struct xen_netif_tx_request *txp,
-						  struct gnttab_map_grant_ref *mop)
+					struct sk_buff *skb,
+					struct xen_netif_tx_request *txp,
+					struct gnttab_map_grant_ref *mop)
 {
 	struct skb_shared_info *shinfo = skb_shinfo(skb);
 	skb_frag_t *frags = shinfo->frags;
 	unsigned long pending_idx = *((u16 *)skb->data);
 	int i, start;
+	int grp_index = GET_GROUP_INDEX(netif);
 
 	/* Skip first skb fragment if it is on same page as header fragment. */
 	start = ((unsigned long)shinfo->frags[0].page == pending_idx);
 
 	for (i = start; i < shinfo->nr_frags; i++, txp++) {
-		pending_idx = pending_ring[pending_index(pending_cons++)];
+		pending_ring_idx_t index;
+		struct pending_tx_info *txinfo;
 
-		gnttab_set_map_op(mop++, idx_to_kaddr(pending_idx),
+		index = pending_index(netbk[grp_index].pending_cons++);
+		pending_idx = netbk[grp_index].pending_ring[index];
+
+		gnttab_set_map_op(mop++, idx_to_kaddr(grp_index, pending_idx),
 				  GNTMAP_host_map | GNTMAP_readonly,
 				  txp->gref, netif->domid);
 
-		memcpy(&pending_tx_info[pending_idx].req, txp, sizeof(*txp));
+		txinfo = netbk[grp_index].pending_tx_info;
+		memcpy(&txinfo[pending_idx].req, txp, sizeof(*txp));
 		netif_get(netif);
-		pending_tx_info[pending_idx].netif = netif;
+		txinfo[pending_idx].netif = netif;
 		frags[i].page = (void *)pending_idx;
 
 		start_tracking_page(foreign_page_tracker,
-				    mmap_pages[pending_idx],
+				    netbk[grp_index].mmap_pages[pending_idx],
 				    netif->domid,
-				    pending_tx_info[pending_idx].req.gref,
-				    pending_idx,
+				    txinfo[pending_idx].req.gref,
+				    grp_index * MAX_PENDING_REQS + pending_idx,
 				    NULL);
 	}
 
@@ -966,28 +947,34 @@ static struct gnttab_map_grant_ref *netbk_get_requests(struct xen_netif *netif,
 }
 
 static int netbk_tx_check_mop(struct sk_buff *skb,
-			       struct gnttab_map_grant_ref **mopp)
+			       struct gnttab_map_grant_ref **mopp,
+				int grp_index)
 {
 	struct gnttab_map_grant_ref *mop = *mopp;
 	int pending_idx = *((u16 *)skb->data);
-	struct xen_netif *netif = pending_tx_info[pending_idx].netif;
+	struct xen_netif *netif;
 	struct xen_netif_tx_request *txp;
 	struct skb_shared_info *shinfo = skb_shinfo(skb);
 	int nr_frags = shinfo->nr_frags;
 	int i, err, start;
 
+	netif = netbk[grp_index].pending_tx_info[pending_idx].netif;
 	/* Check status of header. */
 	err = mop->status;
 	if (unlikely(err)) {
-		txp = &pending_tx_info[pending_idx].req;
+		pending_ring_idx_t index;
+		index = pending_index(netbk[grp_index].pending_prod++);
+		txp = &netbk[grp_index].pending_tx_info[pending_idx].req;
 		make_tx_response(netif, txp, NETIF_RSP_ERROR);
-		pending_ring[pending_index(pending_prod++)] = pending_idx;
+		netbk[grp_index].pending_ring[index] = pending_idx;
 		netif_put(netif);
 	} else {
+		unsigned long addr;
+		addr = idx_to_kaddr(grp_index, pending_idx);
 		set_phys_to_machine(
-			__pa(idx_to_kaddr(pending_idx)) >> PAGE_SHIFT,
+			__pa(addr) >> PAGE_SHIFT,
 			FOREIGN_FRAME(mop->dev_bus_addr >> PAGE_SHIFT));
-		grant_tx_handle[pending_idx] = mop->handle;
+		netbk[grp_index].grant_tx_handle[pending_idx] = mop->handle;
 	}
 
 	/* Skip first skb fragment if it is on same page as header fragment. */
@@ -995,26 +982,31 @@ static int netbk_tx_check_mop(struct sk_buff *skb,
 
 	for (i = start; i < nr_frags; i++) {
 		int j, newerr;
+		pending_ring_idx_t index;
 
 		pending_idx = (unsigned long)shinfo->frags[i].page;
 
 		/* Check error status: if okay then remember grant handle. */
 		newerr = (++mop)->status;
 		if (likely(!newerr)) {
+			unsigned long addr;
+			addr = idx_to_kaddr(grp_index, pending_idx);
 			set_phys_to_machine(
-				__pa(idx_to_kaddr(pending_idx))>>PAGE_SHIFT,
+				__pa(addr)>>PAGE_SHIFT,
 				FOREIGN_FRAME(mop->dev_bus_addr>>PAGE_SHIFT));
-			grant_tx_handle[pending_idx] = mop->handle;
+			netbk[grp_index].grant_tx_handle[pending_idx] =
+				mop->handle;
 			/* Had a previous error? Invalidate this fragment. */
 			if (unlikely(err))
-				netif_idx_release(pending_idx);
+				netif_idx_release(grp_index, pending_idx);
 			continue;
 		}
 
 		/* Error on this fragment: respond to client with an error. */
-		txp = &pending_tx_info[pending_idx].req;
+		txp = &netbk[grp_index].pending_tx_info[pending_idx].req;
 		make_tx_response(netif, txp, NETIF_RSP_ERROR);
-		pending_ring[pending_index(pending_prod++)] = pending_idx;
+		index = pending_index(netbk[grp_index].pending_prod++);
+		netbk[grp_index].pending_ring[index] = pending_idx;
 		netif_put(netif);
 
 		/* Not the first error? Preceding frags already invalidated. */
@@ -1023,10 +1015,10 @@ static int netbk_tx_check_mop(struct sk_buff *skb,
 
 		/* First error: invalidate header and preceding fragments. */
 		pending_idx = *((u16 *)skb->data);
-		netif_idx_release(pending_idx);
+		netif_idx_release(grp_index, pending_idx);
 		for (j = start; j < i; j++) {
 			pending_idx = (unsigned long)shinfo->frags[i].page;
-			netif_idx_release(pending_idx);
+			netif_idx_release(grp_index, pending_idx);
 		}
 
 		/* Remember the error: invalidate all subsequent fragments. */
@@ -1037,7 +1029,7 @@ static int netbk_tx_check_mop(struct sk_buff *skb,
 	return err;
 }
 
-static void netbk_fill_frags(struct sk_buff *skb)
+static void netbk_fill_frags(struct sk_buff *skb, int grp_index)
 {
 	struct skb_shared_info *shinfo = skb_shinfo(skb);
 	int nr_frags = shinfo->nr_frags;
@@ -1050,12 +1042,13 @@ static void netbk_fill_frags(struct sk_buff *skb)
 
 		pending_idx = (unsigned long)frag->page;
 
-		pending_inuse[pending_idx].alloc_time = jiffies;
-		list_add_tail(&pending_inuse[pending_idx].list,
-			      &pending_inuse_head);
+		netbk[grp_index].pending_inuse[pending_idx].alloc_time =
+			jiffies;
+		list_add_tail(&netbk[grp_index].pending_inuse[pending_idx].list,
+			      &netbk[grp_index].pending_inuse_head);
 
-		txp = &pending_tx_info[pending_idx].req;
-		frag->page = virt_to_page(idx_to_kaddr(pending_idx));
+		txp = &netbk[grp_index].pending_tx_info[pending_idx].req;
+		frag->page = virt_to_page(idx_to_kaddr(grp_index, pending_idx));
 		frag->size = txp->size;
 		frag->page_offset = txp->offset;
 
@@ -1187,15 +1180,16 @@ static bool tx_credit_exceeded(struct xen_netif *netif, unsigned size)
 	return false;
 }
 
-static unsigned net_tx_build_mops(void)
+static unsigned net_tx_build_mops(int grp_index)
 {
 	struct gnttab_map_grant_ref *mop;
 	struct sk_buff *skb;
 	int ret;
 
-	mop = tx_map_ops;
-	while (((nr_pending_reqs() + MAX_SKB_FRAGS) < MAX_PENDING_REQS) &&
-		!list_empty(&net_schedule_list)) {
+	mop = netbk[grp_index].tx_map_ops;
+	while (((nr_pending_reqs(grp_index) + MAX_SKB_FRAGS) <
+				MAX_PENDING_REQS) &&
+		!list_empty(&netbk[grp_index].net_schedule_list)) {
 		struct xen_netif *netif;
 		struct xen_netif_tx_request txreq;
 		struct xen_netif_tx_request txfrags[MAX_SKB_FRAGS];
@@ -1204,9 +1198,11 @@ static unsigned net_tx_build_mops(void)
 		RING_IDX idx;
 		int work_to_do;
 		unsigned int data_len;
+		pending_ring_idx_t index;
 	
 		/* Get a netif from the list with work to do. */
-		netif = list_first_entry(&net_schedule_list, struct xen_netif, list);
+		netif = list_first_entry(&netbk[grp_index].net_schedule_list,
+				struct xen_netif, list);
 		netif_get(netif);
 		remove_from_net_schedule_list(netif);
 
@@ -1265,7 +1261,8 @@ static unsigned net_tx_build_mops(void)
 			continue;
 		}
 
-		pending_idx = pending_ring[pending_index(pending_cons)];
+		index = pending_index(netbk[grp_index].pending_cons);
+		pending_idx = netbk[grp_index].pending_ring[index];
 
 		data_len = (txreq.size > PKT_PROT_LEN &&
 			    ret < MAX_SKB_FRAGS) ?
@@ -1293,21 +1290,21 @@ static unsigned net_tx_build_mops(void)
 			}
 		}
 
-		gnttab_set_map_op(mop, idx_to_kaddr(pending_idx),
+		gnttab_set_map_op(mop, idx_to_kaddr(grp_index, pending_idx),
 				  GNTMAP_host_map | GNTMAP_readonly,
 				  txreq.gref, netif->domid);
 		mop++;
 
 		start_tracking_page(foreign_page_tracker,
-				    mmap_pages[pending_idx],
+				    netbk[grp_index].mmap_pages[pending_idx],
 				    netif->domid,
 				    txreq.gref,
-				    pending_idx,
+				    grp_index * MAX_PENDING_REQS + pending_idx,
 				    NULL);
 
-		memcpy(&pending_tx_info[pending_idx].req,
+		memcpy(&netbk[grp_index].pending_tx_info[pending_idx].req,
 		       &txreq, sizeof(txreq));
-		pending_tx_info[pending_idx].netif = netif;
+		netbk[grp_index].pending_tx_info[pending_idx].netif = netif;
 		*((u16 *)skb->data) = pending_idx;
 
 		__skb_put(skb, data_len);
@@ -1322,40 +1319,42 @@ static unsigned net_tx_build_mops(void)
 			skb_shinfo(skb)->frags[0].page = (void *)~0UL;
 		}
 
-		__skb_queue_tail(&tx_queue, skb);
+		__skb_queue_tail(&netbk[grp_index].tx_queue, skb);
 
-		pending_cons++;
+		netbk[grp_index].pending_cons++;
 
 		mop = netbk_get_requests(netif, skb, txfrags, mop);
 
 		netif->tx.req_cons = idx;
 		netif_schedule_work(netif);
 
-		if ((mop - tx_map_ops) >= ARRAY_SIZE(tx_map_ops))
+		if ((mop - netbk[grp_index].tx_map_ops) >=
+				ARRAY_SIZE(netbk[grp_index].tx_map_ops))
 			break;
 	}
 
-	return mop - tx_map_ops;
+	return mop - netbk[grp_index].tx_map_ops;
 }
 
-static void net_tx_submit(void)
+static void net_tx_submit(int grp_index)
 {
 	struct gnttab_map_grant_ref *mop;
 	struct sk_buff *skb;
 
-	mop = tx_map_ops;
-	while ((skb = __skb_dequeue(&tx_queue)) != NULL) {
+	mop = netbk[grp_index].tx_map_ops;
+	while ((skb = __skb_dequeue(&netbk[grp_index].tx_queue)) != NULL) {
 		struct xen_netif_tx_request *txp;
 		struct xen_netif *netif;
 		u16 pending_idx;
 		unsigned data_len;
+		unsigned long addr;
 
 		pending_idx = *((u16 *)skb->data);
-		netif       = pending_tx_info[pending_idx].netif;
-		txp         = &pending_tx_info[pending_idx].req;
+		netif = netbk[grp_index].pending_tx_info[pending_idx].netif;
+		txp = &netbk[grp_index].pending_tx_info[pending_idx].req;
 
 		/* Check the remap error code. */
-		if (unlikely(netbk_tx_check_mop(skb, &mop))) {
+		if (unlikely(netbk_tx_check_mop(skb, &mop, grp_index))) {
 			DPRINTK("netback grant failed.\n");
 			skb_shinfo(skb)->nr_frags = 0;
 			kfree_skb(skb);
@@ -1363,8 +1362,9 @@ static void net_tx_submit(void)
 		}
 
 		data_len = skb->len;
+		addr = idx_to_kaddr(grp_index, pending_idx);
 		memcpy(skb->data,
-		       (void *)(idx_to_kaddr(pending_idx)|txp->offset),
+		       (void *)(addr|txp->offset),
 		       data_len);
 		if (data_len < txp->size) {
 			/* Append the packet payload as a fragment. */
@@ -1372,7 +1372,7 @@ static void net_tx_submit(void)
 			txp->size -= data_len;
 		} else {
 			/* Schedule a response immediately. */
-			netif_idx_release(pending_idx);
+			netif_idx_release(grp_index, pending_idx);
 		}
 
 		/*
@@ -1384,7 +1384,7 @@ static void net_tx_submit(void)
 		else
 			skb->ip_summed = CHECKSUM_NONE;
 
-		netbk_fill_frags(skb);
+		netbk_fill_frags(skb, grp_index);
 
 		skb->dev      = netif->dev;
 		skb->protocol = eth_type_trans(skb, skb->dev);
@@ -1412,65 +1412,70 @@ static void net_tx_submit(void)
 	}
 
 	if (netbk_copy_skb_mode == NETBK_DELAYED_COPY_SKB &&
-	    !list_empty(&pending_inuse_head)) {
+	    !list_empty(&netbk[grp_index].pending_inuse_head)) {
 		struct netbk_tx_pending_inuse *oldest;
 
-		oldest = list_entry(pending_inuse_head.next,
+		oldest = list_entry(netbk[grp_index].pending_inuse_head.next,
 				    struct netbk_tx_pending_inuse, list);
-		mod_timer(&netbk_tx_pending_timer, oldest->alloc_time + HZ);
+		mod_timer(&netbk[grp_index].netbk_tx_pending_timer,
+				oldest->alloc_time + HZ);
 	}
 }
 
 /* Called after netfront has transmitted */
-static void net_tx_action(unsigned long unused)
+static void net_tx_action(unsigned long grp_index)
 {
 	unsigned nr_mops;
 	int ret;
 
-	if (dealloc_cons != dealloc_prod)
-		net_tx_action_dealloc();
+	if (netbk[grp_index].dealloc_cons != netbk[grp_index].dealloc_prod)
+		net_tx_action_dealloc(grp_index);
 
-	nr_mops = net_tx_build_mops();
+	nr_mops = net_tx_build_mops(grp_index);
 
 	if (nr_mops == 0)
 		return;
 
 	ret = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref,
-					tx_map_ops, nr_mops);
+					netbk[grp_index].tx_map_ops, nr_mops);
 	BUG_ON(ret);
 
-	net_tx_submit();
+	net_tx_submit(grp_index);
 }
 
-static void netif_idx_release(u16 pending_idx)
+static void netif_idx_release(int grp_index, u16 pending_idx)
 {
 	static DEFINE_SPINLOCK(_lock);
 	unsigned long flags;
+	pending_ring_idx_t index;
 
 	spin_lock_irqsave(&_lock, flags);
-	dealloc_ring[pending_index(dealloc_prod)] = pending_idx;
+	index = pending_index(netbk[grp_index].dealloc_prod);
+	netbk[grp_index].dealloc_ring[index] = pending_idx;
 	/* Sync with net_tx_action_dealloc: insert idx /then/ incr producer. */
 	smp_wmb();
-	dealloc_prod++;
+	netbk[grp_index].dealloc_prod++;
 	spin_unlock_irqrestore(&_lock, flags);
 
-	tasklet_schedule(&net_tx_tasklet);
+	tasklet_schedule(&netbk[grp_index].net_tx_tasklet);
 }
 
 static void netif_page_release(struct page *page, unsigned int order)
 {
 	int idx = netif_page_index(page);
+	int grp_index = ((struct page_ext *)(page->mapping))->grp_index;
 	BUG_ON(order);
 	BUG_ON(idx < 0);
-	netif_idx_release(idx);
+	netif_idx_release(grp_index, idx);
 }
 
 irqreturn_t netif_be_int(int irq, void *dev_id)
 {
 	struct xen_netif *netif = dev_id;
+	int grp_index = GET_GROUP_INDEX(netif);
 
 	add_to_net_schedule_list_tail(netif);
-	maybe_schedule_tx_action();
+	maybe_schedule_tx_action(grp_index);
 
 	if (netif_schedulable(netif) && !netbk_queue_full(netif))
 		netif_wake_queue(netif->dev);
@@ -1536,13 +1541,14 @@ static struct xen_netif_rx_response *make_rx_response(struct xen_netif *netif,
 static irqreturn_t netif_be_dbg(int irq, void *dev_id, struct pt_regs *regs)
 {
 	struct list_head *ent;
-	struct xen_netif *netif;
+	struct xen_netif *netif = dev_id;
+	int grp_index = GET_GROUP_INDEX(netif);
 	int i = 0;
 
 	printk(KERN_ALERT "netif_schedule_list:\n");
-	spin_lock_irq(&net_schedule_list_lock);
+	spin_lock_irq(&netbk[grp_index].net_schedule_list_lock);
 
-	list_for_each (ent, &net_schedule_list) {
+	list_for_each(ent, &netbk[grp_index].net_schedule_list) {
 		netif = list_entry(ent, struct xen_netif, list);
 		printk(KERN_ALERT " %d: private(rx_req_cons=%08x "
 		       "rx_resp_prod=%08x\n",
@@ -1559,7 +1565,7 @@ static irqreturn_t netif_be_dbg(int irq, void *dev_id, struct pt_regs *regs)
 		i++;
 	}
 
-	spin_unlock_irq(&net_schedule_list_lock);
+	spin_unlock_irq(&netbk[grp_index].net_schedule_list_lock);
 	printk(KERN_ALERT " ** End of netif_schedule_list **\n");
 
 	return IRQ_HANDLED;
@@ -1569,47 +1575,82 @@ static irqreturn_t netif_be_dbg(int irq, void *dev_id, struct pt_regs *regs)
 static int __init netback_init(void)
 {
 	int i;
+	int grp_index;
 	struct page *page;
 	int rc = 0;
 
 	if (!xen_domain())
 		return -ENODEV;
 
+	cpu_online_nr = num_online_cpus();
+
 	/* We can increase reservation by this much in net_rx_action(). */
 //	balloon_update_driver_allowance(NET_RX_RING_SIZE);
 
-	skb_queue_head_init(&rx_queue);
-	skb_queue_head_init(&tx_queue);
-
-	init_timer(&net_timer);
-	net_timer.data = 0;
-	net_timer.function = net_alarm;
-
-	init_timer(&netbk_tx_pending_timer);
-	netbk_tx_pending_timer.data = 0;
-	netbk_tx_pending_timer.function = netbk_tx_pending_timeout;
-
-	foreign_page_tracker = alloc_page_foreign_tracker(MAX_PENDING_REQS);
-	if (!foreign_page_tracker)
-		return -ENOMEM;
-	mmap_pages = alloc_empty_pages_and_pagevec(MAX_PENDING_REQS);
-	if (mmap_pages == NULL) {
-		printk("%s: out of memory\n", __FUNCTION__);
-		free_page_foreign_tracker(foreign_page_tracker);
+	netbk = kzalloc(cpu_online_nr * sizeof(struct netbk), GFP_KERNEL);
+	if (!netbk) {
+		printk(KERN_ALERT "%s: out of memory\n", __func__);
 		return -ENOMEM;
 	}
 
-	for (i = 0; i < MAX_PENDING_REQS; i++) {
-		page = mmap_pages[i];
-		SetPageForeign(page, netif_page_release);
-		netif_set_page_index(page, i);
-		INIT_LIST_HEAD(&pending_inuse[i].list);
+	foreign_page_tracker =
+		alloc_page_foreign_tracker(cpu_online_nr * MAX_PENDING_REQS);
+	if (!foreign_page_tracker) {
+		kfree(netbk);
+		return -ENOMEM;
 	}
 
-	pending_cons = 0;
-	pending_prod = MAX_PENDING_REQS;
-	for (i = 0; i < MAX_PENDING_REQS; i++)
-		pending_ring[i] = i;
+	for (grp_index = 0; grp_index < cpu_online_nr; grp_index++) {
+		tasklet_init(&netbk[grp_index].net_tx_tasklet,
+				net_tx_action, grp_index);
+		tasklet_init(&netbk[grp_index].net_rx_tasklet,
+				net_rx_action, grp_index);
+
+		skb_queue_head_init(&netbk[grp_index].rx_queue);
+		skb_queue_head_init(&netbk[grp_index].tx_queue);
+
+		netbk[grp_index].mmap_pages =
+			alloc_empty_pages_and_pagevec(MAX_PENDING_REQS);
+		if (netbk[grp_index].mmap_pages == NULL) {
+			printk(KERN_ALERT "%s: out of memory\n", __func__);
+			rc = -ENOMEM;
+			goto failed_init;
+		}
+
+		init_timer(&netbk[grp_index].net_timer);
+		netbk[grp_index].net_timer.data = (unsigned long)grp_index;
+		netbk[grp_index].net_timer.function = net_alarm;
+
+		init_timer(&netbk[grp_index].netbk_tx_pending_timer);
+		netbk[grp_index].netbk_tx_pending_timer.data =
+			(unsigned long)grp_index;
+		netbk[grp_index].netbk_tx_pending_timer.function =
+			netbk_tx_pending_timeout;
+
+		for (i = 0; i < MAX_PENDING_REQS; i++) {
+			page = netbk[grp_index].mmap_pages[i];
+			SetPageForeign(page, netif_page_release);
+			netbk[grp_index].page_extinfo[i].grp_index = grp_index;
+			netbk[grp_index].page_extinfo[i].idx = i;
+			netif_set_page_index(page,
+					&netbk[grp_index].page_extinfo[i]);
+			INIT_LIST_HEAD(&netbk[grp_index].pending_inuse[i].list);
+		}
+		INIT_LIST_HEAD(&netbk[grp_index].pending_inuse_head);
+		INIT_LIST_HEAD(&netbk[grp_index].net_schedule_list);
+
+		netbk[grp_index].pending_cons = 0;
+		netbk[grp_index].pending_prod = MAX_PENDING_REQS;
+
+		for (i = 0; i < MAX_PENDING_REQS; i++)
+			netbk[grp_index].pending_ring[i] = i;
+
+		spin_lock_init(&netbk[grp_index].net_schedule_list_lock);
+
+		INIT_LIST_HEAD(&netbk[grp_index].domains);
+		spin_lock_init(&netbk[grp_index].domain_list_lock);
+		netbk[grp_index].domain_nr = 0;
+	}
 
 	netbk_copy_skb_mode = NETBK_DONT_COPY_SKB;
 	if (MODPARM_copy_skb) {
@@ -1638,9 +1679,14 @@ static int __init netback_init(void)
 	return 0;
 
 failed_init:
-	free_empty_pages_and_pagevec(mmap_pages, MAX_PENDING_REQS);
-	del_timer(&netbk_tx_pending_timer);
-	del_timer(&net_timer);
+	for (i = 0; i < grp_index; i++) {
+		free_empty_pages_and_pagevec(netbk[i].mmap_pages,
+				MAX_PENDING_REQS);
+		del_timer(&netbk[i].netbk_tx_pending_timer);
+		del_timer(&netbk[i].net_timer);
+	}
+	kfree(netbk);
+	free_page_foreign_tracker(foreign_page_tracker);
 	return rc;
 
 }
-- 
1.6.3



* Re: [Pv-ops][PATCH] Netback multiple tasklet support
  2009-11-27  2:26 [Pv-ops][PATCH] Netback multiple tasklet support Xu, Dongxiao
@ 2009-11-27  9:42 ` Ian Campbell
  2009-11-27 16:08   ` Xu, Dongxiao
  2009-11-27 16:15 ` Ian Pratt
  1 sibling, 1 reply; 46+ messages in thread
From: Ian Campbell @ 2009-11-27  9:42 UTC (permalink / raw)
  To: Xu, Dongxiao; +Cc: Jeremy Fitzhardinge, xen-devel@lists.xensource.com

[-- Attachment #1: Type: text/plain, Size: 2245 bytes --]

Hi,

Does this change have any impact on the responsiveness of domain 0
userspace while the host is under heavy network load? We have found that
the netback tasklets can completely dominate dom0's VCPU to the point
where no userspace process ever gets the chance to run; since this
includes sshd and the management toolstack, that can be quite annoying.

The issue was probably specific to using a single-VCPU domain 0 in
XenServer, but since your patch introduces a tasklet per VCPU it could
possibly happen to a multi-VCPU domain 0 as well.

For XenServer we converted the tasklets into a kernel thread, at the
cost of a small reduction in overall throughput but yielding a massive
improvement in domain 0 responsiveness. Unfortunately the change was
made by someone who has since left Citrix and I cannot locate the
numbers he left behind :-(

Our patch is attached. A netback thread per domain 0 VCPU might be
interesting to experiment with?
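
For what it's worth, here is a very rough, untested sketch of that idea, combining the per-VCPU grouping from your patch with the netbk_action_thread() loop from the attached XenServer patch (passing the group index as the thread argument is hypothetical here, and the per-group wiring inside the thread is omitted):

#include <linux/kthread.h>
#include <linux/cpumask.h>
#include <linux/sched.h>
#include <linux/err.h>

/* Create one netback kthread per online dom0 VCPU and pin it there. */
static int __init netback_threads_init(void)
{
	unsigned int cpu;

	for_each_online_cpu(cpu) {
		struct task_struct *task;

		task = kthread_create(netbk_action_thread,
				      (void *)(unsigned long)cpu,
				      "netback/%u", cpu);
		if (IS_ERR(task))
			return PTR_ERR(task);

		kthread_bind(task, cpu);	/* keep this thread on its VCPU */
		wake_up_process(task);
	}
	return 0;
}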

Ian.

On Fri, 2009-11-27 at 02:26 +0000, Xu, Dongxiao wrote:
> Current netback uses one pair of tasklets for Tx/Rx data transaction.
> Netback tasklet could only run at one CPU at a time, and it is used to
> serve all the netfronts. Therefore it has become a performance
> bottleneck. This patch is to use multiple tasklet pairs to replace the
> current single pair in dom0. 
> 	Assuming that Dom0 has CPUNR VCPUs, we define CPUNR kinds of tasklets
> pair (CPUNR for Tx, and CPUNR for Rx). Each pair of tasklets serves a
> specific group of netfronts. Also for those global and static
> variables, we duplicated them for each group in order to avoid the
> spinlock. 
> 
> Test scenario:
> We use ten 1G NIC interface to talk with 10 VMs (netfront) in server.
> So the total bandwidth is 10G. 
> For host machine, bind each guest's netfront with each NIC interface.
> For client machine, do netperf testing with each guest.
> 
> Test Case	Packet Size	Throughput(Mbps)	Dom0 CPU Util	Guests CPU Util
> w/o patch	1400		4304.30		400.33%		112.21%
> w/   patch	1400		9533.13		461.64%		243.81%
> 
> BTW, when we test this patch, we found that the domain_lock in grant
> table operation becomes a bottleneck. We temporarily remove the
> global domain_lock to achieve good performance.
>  
> Best Regards, 
> -- Dongxiao


[-- Attachment #2: netback-thread --]
[-- Type: text/x-patch, Size: 4240 bytes --]

diff -r 76eb6cc5bfd1 -r 33722a4d7abd drivers/xen/netback/netback.c
--- a/drivers/xen/netback/netback.c	Mon Sep 22 11:40:53 2008 +0100
+++ b/drivers/xen/netback/netback.c	Mon Sep 22 11:40:55 2008 +0100
@@ -37,6 +37,7 @@
 #include "common.h"
 #include <xen/balloon.h>
 #include <xen/interface/memory.h>
+#include <linux/kthread.h>
 
 /*define NETBE_DEBUG_INTERRUPT*/
 
@@ -65,11 +66,9 @@ static netif_rx_response_t *make_rx_resp
 					     u16      size,
 					     u16      flags);
 
-static void net_tx_action(unsigned long unused);
-static DECLARE_TASKLET(net_tx_tasklet, net_tx_action, 0);
-
-static void net_rx_action(unsigned long unused);
-static DECLARE_TASKLET(net_rx_tasklet, net_rx_action, 0);
+static void net_rx_action(void);
+static void net_tx_action(void);
+static DECLARE_WAIT_QUEUE_HEAD(netbk_action_wq);
 
 void netback_dump_free_callpath(struct page *page);
 
@@ -190,7 +189,7 @@ static inline void maybe_schedule_tx_act
 	smp_mb();
 	if ((NR_PENDING_REQS < (MAX_PENDING_REQS/2)) &&
 	    !list_empty(&net_schedule_list))
-		tasklet_schedule(&net_tx_tasklet);
+		wake_up(&netbk_action_wq);
 }
 
 static struct sk_buff *netbk_copy_skb(struct sk_buff *skb)
@@ -347,7 +346,7 @@ int netif_be_start_xmit(struct sk_buff *
 	}
 
 	skb_queue_tail(&rx_queue, skb);
-	tasklet_schedule(&net_rx_tasklet);
+	wake_up(&netbk_action_wq);
 
 	return 0;
 
@@ -645,7 +644,12 @@ struct skb_cb_overlay {
 	int meta_slots_used;
 };
 
-static void net_rx_action(unsigned long unused)
+static inline int net_rx_action_work_to_do(void)
+{
+	return !skb_queue_empty(&rx_queue);
+}
+
+static void net_rx_action(void)
 {
 	netif_t *netif = NULL;
 	s8 status;
@@ -854,9 +858,11 @@ static void net_rx_action(unsigned long 
 		notify_remote_via_irq(irq);
 	}
 
+#if 0
 	/* More work to do? */
 	if (!skb_queue_empty(&rx_queue) && !timer_pending(&net_timer))
-		tasklet_schedule(&net_rx_tasklet);
+		wake_up(&netbk_action_wq);
+#endif
 #if 0
 	else
 		xen_network_done_notify();
@@ -865,12 +871,12 @@ static void net_rx_action(unsigned long 
 
 static void net_alarm(unsigned long unused)
 {
-	tasklet_schedule(&net_rx_tasklet);
+	wake_up(&netbk_action_wq);
 }
 
 static void netbk_tx_pending_timeout(unsigned long unused)
 {
-	tasklet_schedule(&net_tx_tasklet);
+	wake_up(&netbk_action_wq);
 }
 
 struct net_device_stats *netif_be_get_stats(struct net_device *dev)
@@ -1295,8 +1301,20 @@ static int netbk_set_skb_gso(struct sk_b
 	return 0;
 }
 
+static inline int net_tx_action_work_to_do(void)
+{
+	if (dealloc_cons != dealloc_prod)
+		return 1;
+
+	if (((NR_PENDING_REQS + MAX_SKB_FRAGS) < MAX_PENDING_REQS) &&
+		!list_empty(&net_schedule_list))
+		return 1;
+
+	return 0;
+}
+
 /* Called after netfront has transmitted */
-static void net_tx_action(unsigned long unused)
+static void net_tx_action(void)
 {
 	struct list_head *ent;
 	struct sk_buff *skb;
@@ -1528,7 +1546,7 @@ static void net_tx_action(unsigned long 
 			continue;
 		}
 
-		netif_rx(skb);
+		netif_rx_ni(skb);
 		netif->dev->last_rx = jiffies;
 	}
 
@@ -1554,7 +1572,7 @@ static void netif_idx_release(u16 pendin
 	dealloc_prod++;
 	spin_unlock_irqrestore(&_lock, flags);
 
-	tasklet_schedule(&net_tx_tasklet);
+	wake_up(&netbk_action_wq);
 }
 
 static void netif_page_release(struct page *page)
@@ -1627,6 +1645,24 @@ static netif_rx_response_t *make_rx_resp
 
 	return resp;
 }
+
+static int netbk_action_thread(void *unused)
+{
+	while (1) {
+		wait_event_interruptible(netbk_action_wq,
+			net_rx_action_work_to_do() || net_tx_action_work_to_do());
+		cond_resched();
+
+		if (net_rx_action_work_to_do())
+			net_rx_action();
+
+		if (net_tx_action_work_to_do())
+			net_tx_action();
+	}
+
+	return 0;
+}
+
 
 #ifdef NETBE_DEBUG_INTERRUPT
 static irqreturn_t netif_be_dbg(int irq, void *dev_id, struct pt_regs *regs)
@@ -1666,6 +1702,7 @@ static int __init netback_init(void)
 {
 	int i;
 	struct page *page;
+	struct task_struct *task;
 
 	if (!is_running_on_xen())
 		return -ENODEV;
@@ -1717,6 +1754,10 @@ static int __init netback_init(void)
 	netif_accel_init();
 
 	netif_xenbus_init();
+
+	task = kthread_run(netbk_action_thread, NULL, "netback");
+	if (IS_ERR(task))
+		return PTR_ERR(task);
 
 #ifdef NETBE_DEBUG_INTERRUPT
 	(void)bind_virq_to_irqhandler(VIRQ_DEBUG,


* RE: [Pv-ops][PATCH] Netback multiple tasklet support
  2009-11-27  9:42 ` Ian Campbell
@ 2009-11-27 16:08   ` Xu, Dongxiao
  0 siblings, 0 replies; 46+ messages in thread
From: Xu, Dongxiao @ 2009-11-27 16:08 UTC (permalink / raw)
  To: Ian Campbell; +Cc: Jeremy Fitzhardinge, xen-devel@lists.xensource.com

Ian, 
	Thanks for your comments. Some explanations below.

Best Regards,
-- Dongxiao

Ian Campbell wrote:
> Hi,
> 
> Does this change have any impact on the responsiveness of domain 0
> userspace while the host is under heavy network load? We have found
> that the netback tasklets can completely dominate dom0's VCPU to the
> point where no userspace process ever gets the chance to run, since
> this includes sshd and the management toolstack that can be quite
> annoying. 
> 
> The issue was probably specific to using a single-VCPU domain 0 in
> XenServer, but since your patch introduces a tasklet per VCPU it could
> possibly happen to multi-VCPU domain 0.

The case you describe arises because all the netfronts are processed by a single tasklet in dom0, so the one vcpu that handles the tasklet becomes extremely busy and has no time left to run userspace. My patch separates netback's workload onto different tasklets, and if those tasklets are bound to different vcpus in dom0 (by irqbalance or by manually pinning the interrupts), the total CPU utilization is spread evenly over the vcpus, which makes dom0 more scalable.

Take our test case as an example: the system is under heavy network load, with throughput close to the available bandwidth (9.55G out of 10G), yet it only uses ~460% of dom0's CPU (dom0 has 10 vcpus handling the network, so each vcpu spends ~46% on the network workload). So for 1G NICs there will be no problem. For current 10G NICs, most of which support multi-queue, interrupts are delivered to different cpus, so each dom0 vcpu only needs to handle part of the workload; I believe there will be no problem there either.

> 
> For XenServer we converted the tasklets into a kernel thread, at the
> cost of a small reduction in overall throughput but yielding a massive
> improvement in domain 0 responsiveness. Unfortunately the change was
> made by someone who has since left Citrix and I cannot locate the
> numbers he left behind :-(
> 
> Our patch is attached. A netback thread per domain 0 VCPU might be
> interesting to experiment with?

Adding a kernel thread mechanism to netback is a good way to improve dom0's responsiveness in the UP case. However, for a multi-vcpu dom0 I think it may not be needed. In any case, it is a separate story from my multiple-tasklet approach.

> 
> Ian.
> 
> On Fri, 2009-11-27 at 02:26 +0000, Xu, Dongxiao wrote:
>> Current netback uses one pair of tasklets for Tx/Rx data transaction.
>> Netback tasklet could only run at one CPU at a time, and it is used
>> to serve all the netfronts. Therefore it has become a performance
>> bottleneck. This patch is to use multiple tasklet pairs to replace
>> 	the current single pair in dom0. Assuming that Dom0 has CPUNR
>> VCPUs, we define CPUNR kinds of tasklets pair (CPUNR for Tx, and
>> CPUNR for Rx). Each pair of tasklets serves a specific group of
>> netfronts. Also for those global and static variables, we duplicated
>> them for each group in order to avoid the spinlock. 
>> 
>> Test senario:
>> We use ten 1G NIC interface to talk with 10 VMs (netfront) in server.
>> So the total bandwidth is 10G.
>> For host machine, bind each guest's netfront with each NIC interface.
>> For client machine, do netperf testing with each guest.
>> 
>> Test Case	Packet Size	Throughput(Mbps)	Dom0 CPU Util	Guests CPU Util
>> w/o patch	1400		4304.30		400.33%		112.21%
>> w/   patch	1400		9533.13		461.64%		243.81%
>> 
>> BTW, when we test this patch, we found that the domain_lock in grant
>> table operation becomes a bottle neck. We temporarily remove the
>> global domain_lock to achieve good performance.
>> 
>> Best Regards,
>> -- Dongxiao

^ permalink raw reply	[flat|nested] 46+ messages in thread

* RE: [Pv-ops][PATCH] Netback multiple tasklet support
  2009-11-27  2:26 [Pv-ops][PATCH] Netback multiple tasklet support Xu, Dongxiao
  2009-11-27  9:42 ` Ian Campbell
@ 2009-11-27 16:15 ` Ian Pratt
  2009-11-27 16:57   ` Xu, Dongxiao
  1 sibling, 1 reply; 46+ messages in thread
From: Ian Pratt @ 2009-11-27 16:15 UTC (permalink / raw)
  To: Xu, Dongxiao, xen-devel@lists.xensource.com
  Cc: Jeremy Fitzhardinge, Ian Pratt

 
> Test Case	Packet Size	Throughput(Mbps)	Dom0 CPU Util	Guests CPU
> Util
> w/o patch	1400		4304.30		400.33%		112.21%
> w/   patch	1400		9533.13		461.64%		243.81%
> 
> BTW, when we test this patch, we found that the domain_lock in grant table
> operation becomes a bottle neck. We temporarily remove the global
> domain_lock to achieve good performance.

What are the figures with the domain_lock still present? How many VCPUs did dom0 have? (It would be good to see numbers for 2, 3 and 4 VCPUs.)

I'd rather see kthreads used than tasklets, as this enables more control over QoS (I believe there are patches for this).

Thanks,
Ian
 

^ permalink raw reply	[flat|nested] 46+ messages in thread

* RE: [Pv-ops][PATCH] Netback multiple tasklet support
  2009-11-27 16:15 ` Ian Pratt
@ 2009-11-27 16:57   ` Xu, Dongxiao
  2009-11-28 13:15     ` Ian Pratt
  0 siblings, 1 reply; 46+ messages in thread
From: Xu, Dongxiao @ 2009-11-27 16:57 UTC (permalink / raw)
  To: Ian Pratt, xen-devel@lists.xensource.com; +Cc: Jeremy Fitzhardinge

Ian Pratt wrote:
>> Test Case	Packet Size	Throughput(Mbps)	Dom0 CPU Util	Guests CPU
>> Util
>> w/o patch	1400		4304.30		400.33%		112.21%
>> w/   patch	1400		9533.13		461.64%		243.81%
>> 
>> BTW, when we test this patch, we found that the domain_lock in grant
>> table operation becomes a bottle neck. We temporarily remove the
>> global domain_lock to achieve good performance.
> 
> What are the figures with the domain_lock still present? How many
> VCPUs did dom0 have (it would be good to see numbers for 2,3 and 4
> VCPUs).  
> 
> I'd rather see use of kthreads than tasklets as this enables more
> control over QoS (I believe there are patches for this). 
> 
> Thanks,
> Ian

The domain lock is taken in the grant_op hypercall. If multiple tasklets fight
with each other for this big domain lock, it becomes a bottleneck and
hurts performance.
Our test system has 16 logical processors in total, so dom0 has 16 VCPUs by default;
10 of them are used to handle the network load. For our test case, dom0's total
CPU utilization is ~461.64%, so each VCPU occupies ~46%.
Actually, the multiple tasklets in netback already improve the QoS of the
system, so I think they also help responsiveness on each VCPU.
I can try to write another patch that replaces the tasklets with kthreads,
because I think that is a separate job from the multi-tasklet netback support (the kthread is
used to guarantee userspace responsiveness, whereas multi-tasklet netback is
used to remove dom0's CPU utilization bottleneck). However, I am not sure
whether the QoS improvement from that change is needed on an MP system.

Thanks!
Dongxiao

^ permalink raw reply	[flat|nested] 46+ messages in thread

* RE: [Pv-ops][PATCH] Netback multiple tasklet support
  2009-11-27 16:57   ` Xu, Dongxiao
@ 2009-11-28 13:15     ` Ian Pratt
  2009-12-02 10:17       ` Xu, Dongxiao
  0 siblings, 1 reply; 46+ messages in thread
From: Ian Pratt @ 2009-11-28 13:15 UTC (permalink / raw)
  To: Xu, Dongxiao, xen-devel@lists.xensource.com
  Cc: Jeremy Fitzhardinge, Ian Pratt

> The domain lock is in grant_op hypercall. If the multiple tasklets are fighting
> with each other for this big domain lock, it would become a bottleneck and
> hurt the performance.
> Our test system has 16 LP in total, so we have 16 vcpus in dom0 by
> default.
> 10 of them are used to handle the network load. For our test case, dom0's
> totalvcpu utilization is  ~461.64%,  so each vcpu ocupies ~46%.

Having 10 VCPUs for dom0 doesn't seem like a good idea -- it really oughtn't to need that many CPUs to handle IO load. Have you got any results with e.g. 2 or 4 VCPUs?

When we switch over to using netchannel2 by default this issue should largely go away anyhow as the copy is not done by dom0. Have you done any tests with netchannel2?

> Actually the multiple tasklet in netback could already improve the the QoS of the
> system, therefore I think it can also help to get better responseness for
> that vcpu.
> I think I can try to write another patch which replace the tasklet by kthread,
> because I think is a different job with the multi-tasklet netback support.
> (kthread is used to guarantee the responseness of userspace, however multi-tasklet
> netback is used to remove the dom0's cpu utilization bottleneck). However I am not
> sure whether the improvement in QoS by this change is needed In MP system?

Have you looked at the patch that xenserver uses to replace the tasklets by kthreads?

Thanks,
Ian

^ permalink raw reply	[flat|nested] 46+ messages in thread

* RE: [Pv-ops][PATCH] Netback multiple tasklet support
  2009-11-28 13:15     ` Ian Pratt
@ 2009-12-02 10:17       ` Xu, Dongxiao
  2009-12-03 21:28         ` Jeremy Fitzhardinge
  0 siblings, 1 reply; 46+ messages in thread
From: Xu, Dongxiao @ 2009-12-02 10:17 UTC (permalink / raw)
  To: Ian Pratt, Ian Campbell, xen-devel@lists.xensource.com,
	Jeremy Fitzhardinge

[-- Attachment #1: Type: text/plain, Size: 3353 bytes --]

Hi, 
	Based on your feedback, I have revised my patch and am resending it now.
	
[PATCH 01]: Use multiple tasklet pairs to replace the current single pair in dom0.
[PATCH 02]: Replace the tasklets with kernel threads. This may hurt performance, but could improve userspace responsiveness.

Test scenario:
We use ten 1G NIC interfaces to talk with 10 VMs (netfronts) on the server, so the total bandwidth is 10G.
On the host machine, bind each guest's netfront to its own NIC interface.
On the client machine, run netperf against each guest.

Test Case			Throughput(Mbps)	Dom0 CPU Util	Guests CPU Util
w/o any patch			4304.30		400.33%		112.21%
w/   01   patch			9533.13		461.64%		243.81%
w/ 01 and 02 patches		7942.68		597.83%		250.53%

From the results we can see that the case "w/ 01 and 02 patches" did not reach (or come near) the total bandwidth. This is because some VCPUs in dom0 become saturated by context switches with other tasks, which hurts performance. To prove this idea, I ran an experiment that sets the kernel threads to the SCHED_FIFO policy, so they cannot be preempted by normal tasks. The result is shown below, and it achieves good performance. However, like the tasklet, setting the kernel thread to a high priority can also hurt userspace responsiveness, because userspace applications (for example, sshd) cannot preempt the netback kernel thread.

w/ hi-priority kthread		9535.74		543.56%		241.26%
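
The SCHED_FIFO experiment mentioned above amounts to something like the following kernel-side sketch (not part of the posted patches); it would be called on each netback kthread after kthread_create(), and the priority value is just an example:

/* Sketch only: raise a netback kthread to SCHED_FIFO so ordinary tasks
 * cannot preempt it. Priority 50 is an arbitrary example value. */
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/kthread.h>

static void netbk_set_rt_priority(struct task_struct *task)
{
	struct sched_param param = { .sched_priority = 50 };

	if (sched_setscheduler(task, SCHED_FIFO, &param))
		printk(KERN_WARNING "netback: failed to set SCHED_FIFO\n");
}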

As for netchannel2, which avoids the grant copy in dom0, I have not tried it yet. But profiling the current netback system with xenoprofile suggests that the grant copy accounts for roughly 1/6 of dom0's CPU cycles (including Xen and the dom0 vmlinux).

BTW, patch 02 is ported from the patch provided by Ian Campbell. You can add your Signed-off-by if you want. :)

Best Regards, 
-- Dongxiao


Ian Pratt wrote:
>> The domain lock is in grant_op hypercall. If the multiple tasklets
>> are fighting with each other for this big domain lock, it would
>> become a bottleneck and 
>> hurt the performance.
>> Our test system has 16 LP in total, so we have 16 vcpus in dom0 by
>> default.
>> 10 of them are used to handle the network load. For our test case,
>> dom0's totalvcpu utilization is  ~461.64%,  so each vcpu ocupies
>> ~46%. 
> 
> Having 10 VCPUs for dom0 doesn't seem like a good idea -- it really
> oughtn't to need that many CPUs to handle IO load. Have you got any
> results with e.g. 2 or 4 VCPUs?  
> 
> When we switch over to using netchannel2 by default this issue should
> largely go away anyhow as the copy is not done by dom0. Have you done
> any tests with netchannel2?  
> 
>> Actually the multiple tasklet in netback could already improve the
>> the QoS of the system, therefore I think it can also help to get
>> better responseness for 
>> that vcpu.
>> I think I can try to write another patch which replace the tasklet
>> by kthread, because I think is a different job with the
>> multi-tasklet netback support. (kthread is used to guarantee the
>> responseness of userspace, however multi-tasklet netback is used to
>> remove the dom0's cpu utilization bottleneck). However I am not sure
>> whether the improvement in QoS by this change is needed In MP
>> system?  
> 
> Have you looked at the patch that xenserver uses to replace the
> tasklets by kthreads? 
> 
> Thanks,
> Ian

[-- Attachment #2: 0001-Netback-multiple-tasklets-support.patch --]
[-- Type: application/octet-stream, Size: 44323 bytes --]

From 590ec4af7e7964c7249a812fc99be37b1648d058 Mon Sep 17 00:00:00 2001
From: Dongxiao Xu <dongxiao.xu@intel.com>
Date: Fri, 27 Nov 2009 10:13:57 +0800
Subject: [PATCH 1/2] Netback multiple tasklets support.
     Now netback uses one pair of tasklets for Tx/Rx data transaction. Netback
 tasklet could only run at one CPU at a time, and it is used to serve all the
 netfronts. Therefore it has become a performance bottle neck. This patch is to
 use multiple tasklet pairs to replace the current single pair in dom0.
     Assuming that Dom0 has CPUNR VCPUs, we define CPUNR kinds of tasklets pair
 (CPUNR for Tx, and CPUNR for Rx). Each pare of tasklets serve specific group of
 netfronts. Also for those global and static variables, we duplicated them for
 each group in order to avoid the spinlock.

Signed-off-by: Dongxiao Xu <dongxiao.xu@intel.com>
---
 drivers/xen/netback/common.h    |   78 ++++++
 drivers/xen/netback/interface.c |   64 +++++-
 drivers/xen/netback/netback.c   |  564 +++++++++++++++++++++------------------
 3 files changed, 445 insertions(+), 261 deletions(-)

diff --git a/drivers/xen/netback/common.h b/drivers/xen/netback/common.h
index 348644a..3e91012 100644
--- a/drivers/xen/netback/common.h
+++ b/drivers/xen/netback/common.h
@@ -56,6 +56,7 @@
 struct xen_netif {
 	/* Unique identifier for this interface. */
 	domid_t          domid;
+	int 		 grp_index;
 	unsigned int     handle;
 
 	u8               fe_dev_addr[6];
@@ -220,4 +221,81 @@ static inline int netbk_can_sg(struct net_device *dev)
 	return netif->features & NETIF_F_SG;
 }
 
+struct pending_tx_info {
+	struct xen_netif_tx_request req;
+	struct xen_netif *netif;
+};
+typedef unsigned int pending_ring_idx_t;
+
+struct page_ext {
+	unsigned long grp_index;
+	unsigned long idx;
+};
+
+struct netbk_rx_meta {
+	skb_frag_t frag;
+	int id;
+};
+
+struct netbk_tx_pending_inuse {
+	struct list_head list;
+	unsigned long alloc_time;
+};
+
+#define MAX_PENDING_REQS 256
+
+struct netbk {
+	struct tasklet_struct net_tx_tasklet;
+	struct tasklet_struct net_rx_tasklet;
+
+	struct sk_buff_head rx_queue;
+	struct sk_buff_head tx_queue;
+
+	struct timer_list net_timer;
+	struct timer_list netbk_tx_pending_timer;
+
+	struct page **mmap_pages;
+
+	struct page_ext page_extinfo[MAX_PENDING_REQS];
+
+	struct pending_tx_info pending_tx_info[MAX_PENDING_REQS];
+	struct netbk_tx_pending_inuse pending_inuse[MAX_PENDING_REQS];
+	struct gnttab_unmap_grant_ref tx_unmap_ops[MAX_PENDING_REQS];
+	struct gnttab_map_grant_ref tx_map_ops[MAX_PENDING_REQS];
+
+	grant_handle_t grant_tx_handle[MAX_PENDING_REQS];
+	u16 pending_ring[MAX_PENDING_REQS];
+	u16 dealloc_ring[MAX_PENDING_REQS];
+
+	pending_ring_idx_t pending_prod;
+	pending_ring_idx_t pending_cons;
+	pending_ring_idx_t dealloc_prod;
+	pending_ring_idx_t dealloc_cons;
+
+	struct list_head pending_inuse_head;
+	struct list_head net_schedule_list;
+
+	struct multicall_entry rx_mcl[NET_RX_RING_SIZE+3];
+	struct mmu_update rx_mmu[NET_RX_RING_SIZE];
+	struct gnttab_transfer grant_trans_op[NET_RX_RING_SIZE];
+	struct gnttab_copy grant_copy_op[NET_RX_RING_SIZE];
+	unsigned char rx_notify[NR_IRQS];
+	u16 notify_list[NET_RX_RING_SIZE];
+	struct netbk_rx_meta meta[NET_RX_RING_SIZE];
+
+	spinlock_t net_schedule_list_lock;
+	spinlock_t domain_list_lock;
+	struct list_head domains;
+	unsigned int domain_nr;
+};
+
+extern struct netbk *netbk;
+extern int cpu_online_nr;
+extern struct page_foreign_tracker *foreign_page_tracker;
+
+struct domain_entry {
+	int domid;
+	struct list_head dom;
+};
+
 #endif /* __NETIF__BACKEND__COMMON_H__ */
diff --git a/drivers/xen/netback/interface.c b/drivers/xen/netback/interface.c
index 21c1f95..e87751a 100644
--- a/drivers/xen/netback/interface.c
+++ b/drivers/xen/netback/interface.c
@@ -54,6 +54,57 @@
 static unsigned long netbk_queue_length = 32;
 module_param_named(queue_length, netbk_queue_length, ulong, 0644);
 
+static int add_domain_to_list(struct netbk *netbk, int netbk_nr,
+		       struct xen_netif *netif)
+{
+	struct domain_entry *dom_entry;
+	int min_domain_list = 0;
+	int min_domain_nr = 0;
+	int i;
+
+	dom_entry = (struct domain_entry *)
+		kmalloc(sizeof(struct domain_entry), GFP_KERNEL);
+	if (!dom_entry)
+		return -ENOMEM;
+
+	/* Find out the list which contains least number of domain */
+	min_domain_nr = netbk[0].domain_nr;
+	for (i = 0; i < netbk_nr; i++) {
+		if (netbk[i].domain_nr < min_domain_nr) {
+			min_domain_list = i;
+			min_domain_nr = netbk[i].domain_nr;
+		}
+	}
+
+	netif->grp_index = min_domain_list;
+	dom_entry->domid = netif->domid;
+	spin_lock(&netbk[netif->grp_index].domain_list_lock);
+	list_add_tail(&dom_entry->dom, &netbk[netif->grp_index].domains);
+	netbk[netif->grp_index].domain_nr++;
+	spin_unlock(&netbk[netif->grp_index].domain_list_lock);
+	return netif->grp_index;
+}
+
+static void remove_domain_from_list(struct netbk *netbk, int netbk_nr,
+			     struct xen_netif *netif)
+{
+	struct domain_entry *dom_entry = NULL;
+	int grp_index = netif->grp_index;
+
+	list_for_each_entry(dom_entry, &netbk[grp_index].domains, dom) {
+		if (dom_entry->domid == netif->domid)
+			break;
+	}
+	if (!dom_entry)
+		return;
+
+	spin_lock(&netbk[netif->grp_index].domain_list_lock);
+	netbk[netif->grp_index].domain_nr--;
+	list_del(&dom_entry->dom);
+	spin_unlock(&netbk[netif->grp_index].domain_list_lock);
+	kfree(dom_entry);
+}
+
 static void __netif_up(struct xen_netif *netif)
 {
 	enable_irq(netif->irq);
@@ -70,6 +121,7 @@ static int net_open(struct net_device *dev)
 {
 	struct xen_netif *netif = netdev_priv(dev);
 	if (netback_carrier_ok(netif)) {
+		add_domain_to_list(netbk, cpu_online_nr, netif);
 		__netif_up(netif);
 		netif_start_queue(dev);
 	}
@@ -79,8 +131,10 @@ static int net_open(struct net_device *dev)
 static int net_close(struct net_device *dev)
 {
 	struct xen_netif *netif = netdev_priv(dev);
-	if (netback_carrier_ok(netif))
+	if (netback_carrier_ok(netif)) {
 		__netif_down(netif);
+		remove_domain_from_list(netbk, cpu_online_nr, netif);
+	}
 	netif_stop_queue(dev);
 	return 0;
 }
@@ -329,6 +383,9 @@ int netif_map(struct xen_netif *netif, unsigned long tx_ring_ref,
 	if (netif->rx_comms_area == NULL)
 		goto err_rx;
 
+	if (add_domain_to_list(netbk, cpu_online_nr, netif) < 0)
+		goto err_map;
+
 	err = map_frontend_pages(netif, tx_ring_ref, rx_ring_ref);
 	if (err)
 		goto err_map;
@@ -361,6 +418,7 @@ int netif_map(struct xen_netif *netif, unsigned long tx_ring_ref,
 	return 0;
 err_hypervisor:
 	unmap_frontend_pages(netif);
+	remove_domain_from_list(netbk, cpu_online_nr, netif);
 err_map:
 	free_vm_area(netif->rx_comms_area);
 err_rx:
@@ -374,8 +432,10 @@ void netif_disconnect(struct xen_netif *netif)
 		rtnl_lock();
 		netback_carrier_off(netif);
 		netif_carrier_off(netif->dev); /* discard queued packets */
-		if (netif_running(netif->dev))
+		if (netif_running(netif->dev)) {
 			__netif_down(netif);
+			remove_domain_from_list(netbk, cpu_online_nr, netif);
+		}
 		rtnl_unlock();
 		netif_put(netif);
 	}
diff --git a/drivers/xen/netback/netback.c b/drivers/xen/netback/netback.c
index c24debf..103ee8a 100644
--- a/drivers/xen/netback/netback.c
+++ b/drivers/xen/netback/netback.c
@@ -49,18 +49,7 @@
 
 /*define NETBE_DEBUG_INTERRUPT*/
 
-struct netbk_rx_meta {
-	skb_frag_t frag;
-	int id;
-};
-
-struct netbk_tx_pending_inuse {
-	struct list_head list;
-	unsigned long alloc_time;
-};
-
-
-static void netif_idx_release(u16 pending_idx);
+static void netif_idx_release(int grp_index, u16 pending_idx);
 static void make_tx_response(struct xen_netif *netif,
 			     struct xen_netif_tx_request *txp,
 			     s8       st);
@@ -71,44 +60,39 @@ static struct xen_netif_rx_response *make_rx_response(struct xen_netif *netif,
 					     u16      size,
 					     u16      flags);
 
-static void net_tx_action(unsigned long unused);
-static DECLARE_TASKLET(net_tx_tasklet, net_tx_action, 0);
-
-static void net_rx_action(unsigned long unused);
-static DECLARE_TASKLET(net_rx_tasklet, net_rx_action, 0);
+static void net_tx_action(unsigned long grp_index);
 
-static struct timer_list net_timer;
-static struct timer_list netbk_tx_pending_timer;
+static void net_rx_action(unsigned long grp_index);
 
-#define MAX_PENDING_REQS 256
-
-static struct sk_buff_head rx_queue;
-
-static struct page **mmap_pages;
-static inline unsigned long idx_to_pfn(unsigned int idx)
+static inline unsigned long idx_to_pfn(int grp_index, unsigned int idx)
 {
-	return page_to_pfn(mmap_pages[idx]);
+	return page_to_pfn(netbk[grp_index].mmap_pages[idx]);
 }
 
-static inline unsigned long idx_to_kaddr(unsigned int idx)
+static inline unsigned long idx_to_kaddr(int grp_index, unsigned int idx)
 {
-	return (unsigned long)pfn_to_kaddr(idx_to_pfn(idx));
+	return (unsigned long)pfn_to_kaddr(idx_to_pfn(grp_index, idx));
 }
 
 /* extra field used in struct page */
-static inline void netif_set_page_index(struct page *pg, unsigned int index)
+static inline void netif_set_page_index(struct page *pg,
+					struct page_ext *page_extinfo)
 {
-	*(unsigned long *)&pg->mapping = index + 1;
+	pg->mapping = (struct address_space *)page_extinfo;
 }
 
 static inline int netif_page_index(struct page *pg)
 {
-	unsigned long idx = (unsigned long)pg->mapping - 1;
+	int grp_index;
+	int idx;
 
 	if (!PageForeign(pg))
 		return -1;
 
-	if ((idx >= MAX_PENDING_REQS) || (mmap_pages[idx] != pg))
+	grp_index = ((struct page_ext *)(pg->mapping))->grp_index;
+	idx = ((struct page_ext *)(pg->mapping))->idx;
+	if ((idx >= MAX_PENDING_REQS) ||
+			(netbk[grp_index].mmap_pages[idx] != pg))
 		return -1;
 
 	return idx;
@@ -125,67 +109,36 @@ static inline int netif_page_index(struct page *pg)
  */
 #define PKT_PROT_LEN 64
 
-static struct pending_tx_info {
-	struct xen_netif_tx_request req;
-	struct xen_netif *netif;
-} pending_tx_info[MAX_PENDING_REQS];
-static u16 pending_ring[MAX_PENDING_REQS];
-typedef unsigned int pending_ring_idx_t;
-
 static inline pending_ring_idx_t pending_index(unsigned i)
 {
 	return i & (MAX_PENDING_REQS-1);
 }
 
-static pending_ring_idx_t pending_prod, pending_cons;
-
-static inline pending_ring_idx_t nr_pending_reqs(void)
+static inline pending_ring_idx_t nr_pending_reqs(int grp_index)
 {
-	return MAX_PENDING_REQS - pending_prod + pending_cons;
+	return MAX_PENDING_REQS -
+		netbk[grp_index].pending_prod + netbk[grp_index].pending_cons;
 }
 
-static struct page_foreign_tracker *foreign_page_tracker;
-
-/* Freed TX SKBs get batched on this ring before return to pending_ring. */
-static u16 dealloc_ring[MAX_PENDING_REQS];
-static pending_ring_idx_t dealloc_prod, dealloc_cons;
+struct netbk *netbk;
 
-/* Doubly-linked list of in-use pending entries. */
-static struct netbk_tx_pending_inuse pending_inuse[MAX_PENDING_REQS];
-static LIST_HEAD(pending_inuse_head);
-
-static struct sk_buff_head tx_queue;
-
-static grant_handle_t grant_tx_handle[MAX_PENDING_REQS];
-static struct gnttab_unmap_grant_ref tx_unmap_ops[MAX_PENDING_REQS];
-static struct gnttab_map_grant_ref tx_map_ops[MAX_PENDING_REQS];
-
-static LIST_HEAD(net_schedule_list);
-static DEFINE_SPINLOCK(net_schedule_list_lock);
-
-#define MAX_MFN_ALLOC 64
-static unsigned long mfn_list[MAX_MFN_ALLOC];
-static unsigned int alloc_index = 0;
+#define GET_GROUP_INDEX(netif) ((netif)->grp_index)
 
 /* Setting this allows the safe use of this driver without netloop. */
 static int MODPARM_copy_skb = 1;
 module_param_named(copy_skb, MODPARM_copy_skb, bool, 0);
 MODULE_PARM_DESC(copy_skb, "Copy data received from netfront without netloop");
 
+int cpu_online_nr;
+struct page_foreign_tracker *foreign_page_tracker;
 int netbk_copy_skb_mode;
 
-static inline unsigned long alloc_mfn(void)
-{
-	BUG_ON(alloc_index == 0);
-	return mfn_list[--alloc_index];
-}
-
-static inline void maybe_schedule_tx_action(void)
+static inline void maybe_schedule_tx_action(int grp_index)
 {
 	smp_mb();
-	if ((nr_pending_reqs() < (MAX_PENDING_REQS/2)) &&
-	    !list_empty(&net_schedule_list))
-		tasklet_schedule(&net_tx_tasklet);
+	if ((nr_pending_reqs(grp_index) < (MAX_PENDING_REQS/2)) &&
+			!list_empty(&netbk[grp_index].net_schedule_list))
+		tasklet_schedule(&netbk[grp_index].net_tx_tasklet);
 }
 
 static struct sk_buff *netbk_copy_skb(struct sk_buff *skb)
@@ -290,6 +243,7 @@ static void tx_queue_callback(unsigned long data)
 int netif_be_start_xmit(struct sk_buff *skb, struct net_device *dev)
 {
 	struct xen_netif *netif = netdev_priv(dev);
+	int grp_index;
 
 	BUG_ON(skb->dev != dev);
 
@@ -334,9 +288,9 @@ int netif_be_start_xmit(struct sk_buff *skb, struct net_device *dev)
 			mod_timer(&netif->tx_queue_timeout, jiffies + HZ/2);
 		}
 	}
-
-	skb_queue_tail(&rx_queue, skb);
-	tasklet_schedule(&net_rx_tasklet);
+	grp_index = GET_GROUP_INDEX(netif);
+	skb_queue_tail(&netbk[grp_index].rx_queue, skb);
+	tasklet_schedule(&netbk[grp_index].net_rx_tasklet);
 
 	return 0;
 
@@ -495,7 +449,7 @@ static void netbk_add_frag_responses(struct xen_netif *netif, int status,
 	}
 }
 
-static void net_rx_action(unsigned long unused)
+static void net_rx_action(unsigned long grp_index)
 {
 	struct xen_netif *netif = NULL;
 	s8 status;
@@ -510,30 +464,19 @@ static void net_rx_action(unsigned long unused)
 	int count;
 	unsigned long offset;
 
-	/*
-	 * Putting hundreds of bytes on the stack is considered rude.
-	 * Static works because a tasklet can only be on one CPU at any time.
-	 */
-	static struct multicall_entry rx_mcl[NET_RX_RING_SIZE+3];
-	static struct mmu_update rx_mmu[NET_RX_RING_SIZE];
-	static struct gnttab_transfer grant_trans_op[NET_RX_RING_SIZE];
-	static struct gnttab_copy grant_copy_op[NET_RX_RING_SIZE];
-	static unsigned char rx_notify[NR_IRQS];
-	static u16 notify_list[NET_RX_RING_SIZE];
-	static struct netbk_rx_meta meta[NET_RX_RING_SIZE];
-
 	struct netrx_pending_operations npo = {
-		mmu: rx_mmu,
-		trans: grant_trans_op,
-		copy: grant_copy_op,
-		mcl: rx_mcl,
-		meta: meta};
+		.mmu   = netbk[grp_index].rx_mmu,
+		.trans = netbk[grp_index].grant_trans_op,
+		.copy  = netbk[grp_index].grant_copy_op,
+		.mcl   = netbk[grp_index].rx_mcl,
+		.meta  = netbk[grp_index].meta,
+	};
 
 	skb_queue_head_init(&rxq);
 
 	count = 0;
 
-	while ((skb = skb_dequeue(&rx_queue)) != NULL) {
+	while ((skb = skb_dequeue(&netbk[grp_index].rx_queue)) != NULL) {
 		nr_frags = skb_shinfo(skb)->nr_frags;
 		*(int *)skb->cb = nr_frags;
 
@@ -548,39 +491,41 @@ static void net_rx_action(unsigned long unused)
 			break;
 	}
 
-	BUG_ON(npo.meta_prod > ARRAY_SIZE(meta));
+	BUG_ON(npo.meta_prod > ARRAY_SIZE(netbk[grp_index].meta));
 
 	npo.mmu_mcl = npo.mcl_prod;
 	if (npo.mcl_prod) {
 		BUG_ON(xen_feature(XENFEAT_auto_translated_physmap));
-		BUG_ON(npo.mmu_prod > ARRAY_SIZE(rx_mmu));
+		BUG_ON(npo.mmu_prod > ARRAY_SIZE(netbk[grp_index].rx_mmu));
 		mcl = npo.mcl + npo.mcl_prod++;
 
 		BUG_ON(mcl[-1].op != __HYPERVISOR_update_va_mapping);
 		mcl[-1].args[MULTI_UVMFLAGS_INDEX] = UVMF_TLB_FLUSH|UVMF_ALL;
 
 		mcl->op = __HYPERVISOR_mmu_update;
-		mcl->args[0] = (unsigned long)rx_mmu;
+		mcl->args[0] = (unsigned long)netbk[grp_index].rx_mmu;
 		mcl->args[1] = npo.mmu_prod;
 		mcl->args[2] = 0;
 		mcl->args[3] = DOMID_SELF;
 	}
 
 	if (npo.trans_prod) {
-		BUG_ON(npo.trans_prod > ARRAY_SIZE(grant_trans_op));
+		BUG_ON(npo.trans_prod >
+				ARRAY_SIZE(netbk[grp_index].grant_trans_op));
 		mcl = npo.mcl + npo.mcl_prod++;
 		mcl->op = __HYPERVISOR_grant_table_op;
 		mcl->args[0] = GNTTABOP_transfer;
-		mcl->args[1] = (unsigned long)grant_trans_op;
+		mcl->args[1] = (unsigned long)netbk[grp_index].grant_trans_op;
 		mcl->args[2] = npo.trans_prod;
 	}
 
 	if (npo.copy_prod) {
-		BUG_ON(npo.copy_prod > ARRAY_SIZE(grant_copy_op));
+		BUG_ON(npo.copy_prod >
+				ARRAY_SIZE(netbk[grp_index].grant_copy_op));
 		mcl = npo.mcl + npo.mcl_prod++;
 		mcl->op = __HYPERVISOR_grant_table_op;
 		mcl->args[0] = GNTTABOP_copy;
-		mcl->args[1] = (unsigned long)grant_copy_op;
+		mcl->args[1] = (unsigned long)netbk[grp_index].grant_copy_op;
 		mcl->args[2] = npo.copy_prod;
 	}
 
@@ -588,7 +533,7 @@ static void net_rx_action(unsigned long unused)
 	if (!npo.mcl_prod)
 		return;
 
-	BUG_ON(npo.mcl_prod > ARRAY_SIZE(rx_mcl));
+	BUG_ON(npo.mcl_prod > ARRAY_SIZE(netbk[grp_index].rx_mcl));
 
 	ret = HYPERVISOR_multicall(npo.mcl, npo.mcl_prod);
 	BUG_ON(ret != 0);
@@ -605,7 +550,7 @@ static void net_rx_action(unsigned long unused)
 
 		status = netbk_check_gop(nr_frags, netif->domid, &npo);
 
-		id = meta[npo.meta_cons].id;
+		id = netbk[grp_index].meta[npo.meta_cons].id;
 		flags = nr_frags ? NETRXF_more_data : 0;
 
 		if (skb->ip_summed == CHECKSUM_PARTIAL) /* local packet? */
@@ -618,7 +563,7 @@ static void net_rx_action(unsigned long unused)
 		resp = make_rx_response(netif, id, status, offset,
 					skb_headlen(skb), flags);
 
-		if (meta[npo.meta_cons].frag.size) {
+		if (netbk[grp_index].meta[npo.meta_cons].frag.size) {
 			struct xen_netif_extra_info *gso =
 				(struct xen_netif_extra_info *)
 				RING_GET_RESPONSE(&netif->rx,
@@ -626,7 +571,8 @@ static void net_rx_action(unsigned long unused)
 
 			resp->flags |= NETRXF_extra_info;
 
-			gso->u.gso.size = meta[npo.meta_cons].frag.size;
+			gso->u.gso.size =
+				netbk[grp_index].meta[npo.meta_cons].frag.size;
 			gso->u.gso.type = XEN_NETIF_GSO_TYPE_TCPV4;
 			gso->u.gso.pad = 0;
 			gso->u.gso.features = 0;
@@ -636,15 +582,14 @@ static void net_rx_action(unsigned long unused)
 		}
 
 		netbk_add_frag_responses(netif, status,
-					 meta + npo.meta_cons + 1,
-					 nr_frags);
+				netbk[grp_index].meta + npo.meta_cons + 1,
+				nr_frags);
 
 		RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&netif->rx, ret);
 		irq = netif->irq;
-		if (ret && !rx_notify[irq] &&
-				(netif->smart_poll != 1)) {
-			rx_notify[irq] = 1;
-			notify_list[notify_nr++] = irq;
+		if (ret && !netbk[grp_index].rx_notify[irq]) {
+			netbk[grp_index].rx_notify[irq] = 1;
+			netbk[grp_index].notify_list[notify_nr++] = irq;
 		}
 
 		if (netif_queue_stopped(netif->dev) &&
@@ -669,28 +614,29 @@ static void net_rx_action(unsigned long unused)
 	}
 
 	while (notify_nr != 0) {
-		irq = notify_list[--notify_nr];
-		rx_notify[irq] = 0;
+		irq = netbk[grp_index].notify_list[--notify_nr];
+		netbk[grp_index].rx_notify[irq] = 0;
 		notify_remote_via_irq(irq);
 	}
 
 	/* More work to do? */
-	if (!skb_queue_empty(&rx_queue) && !timer_pending(&net_timer))
-		tasklet_schedule(&net_rx_tasklet);
+	if (!skb_queue_empty(&netbk[grp_index].rx_queue)
+			&& !timer_pending(&netbk[grp_index].net_timer))
+		tasklet_schedule(&netbk[grp_index].net_rx_tasklet);
 #if 0
 	else
 		xen_network_done_notify();
 #endif
 }
 
-static void net_alarm(unsigned long unused)
+static void net_alarm(unsigned long grp_index)
 {
-	tasklet_schedule(&net_rx_tasklet);
+	tasklet_schedule(&netbk[grp_index].net_rx_tasklet);
 }
 
-static void netbk_tx_pending_timeout(unsigned long unused)
+static void netbk_tx_pending_timeout(unsigned long grp_index)
 {
-	tasklet_schedule(&net_tx_tasklet);
+	tasklet_schedule(&netbk[grp_index].net_tx_tasklet);
 }
 
 struct net_device_stats *netif_be_get_stats(struct net_device *dev)
@@ -706,37 +652,41 @@ static int __on_net_schedule_list(struct xen_netif *netif)
 
 static void remove_from_net_schedule_list(struct xen_netif *netif)
 {
-	spin_lock_irq(&net_schedule_list_lock);
+	int grp_index = GET_GROUP_INDEX(netif);
+	spin_lock_irq(&netbk[grp_index].net_schedule_list_lock);
 	if (likely(__on_net_schedule_list(netif))) {
 		list_del_init(&netif->list);
 		netif_put(netif);
 	}
-	spin_unlock_irq(&net_schedule_list_lock);
+	spin_unlock_irq(&netbk[grp_index].net_schedule_list_lock);
 }
 
 static void add_to_net_schedule_list_tail(struct xen_netif *netif)
 {
+	int grp_index = GET_GROUP_INDEX(netif);
 	if (__on_net_schedule_list(netif))
 		return;
 
-	spin_lock_irq(&net_schedule_list_lock);
+	spin_lock_irq(&netbk[grp_index].net_schedule_list_lock);
 	if (!__on_net_schedule_list(netif) &&
 	    likely(netif_schedulable(netif))) {
-		list_add_tail(&netif->list, &net_schedule_list);
+		list_add_tail(&netif->list,
+				&netbk[grp_index].net_schedule_list);
 		netif_get(netif);
 	}
-	spin_unlock_irq(&net_schedule_list_lock);
+	spin_unlock_irq(&netbk[grp_index].net_schedule_list_lock);
 }
 
 void netif_schedule_work(struct xen_netif *netif)
 {
 	int more_to_do;
+	int grp_index = GET_GROUP_INDEX(netif);
 
 	RING_FINAL_CHECK_FOR_REQUESTS(&netif->tx, more_to_do);
 
 	if (more_to_do) {
 		add_to_net_schedule_list_tail(netif);
-		maybe_schedule_tx_action();
+		maybe_schedule_tx_action(grp_index);
 	}
 }
 
@@ -773,13 +723,15 @@ static void tx_credit_callback(unsigned long data)
 	netif_schedule_work(netif);
 }
 
-static inline int copy_pending_req(pending_ring_idx_t pending_idx)
+static inline int copy_pending_req(int grp_index,
+				   pending_ring_idx_t pending_idx)
 {
-	return gnttab_copy_grant_page(grant_tx_handle[pending_idx],
-				      &mmap_pages[pending_idx]);
+	return gnttab_copy_grant_page(
+			netbk[grp_index].grant_tx_handle[pending_idx],
+			&netbk[grp_index].mmap_pages[pending_idx]);
 }
 
-inline static void net_tx_action_dealloc(void)
+static inline void net_tx_action_dealloc(int grp_index)
 {
 	struct netbk_tx_pending_inuse *inuse, *n;
 	struct gnttab_unmap_grant_ref *gop;
@@ -789,51 +741,64 @@ inline static void net_tx_action_dealloc(void)
 	int ret;
 	LIST_HEAD(list);
 
-	dc = dealloc_cons;
-	gop = tx_unmap_ops;
+	dc = netbk[grp_index].dealloc_cons;
+	gop = netbk[grp_index].tx_unmap_ops;
 
 	/*
 	 * Free up any grants we have finished using
 	 */
 	do {
-		dp = dealloc_prod;
+		dp = netbk[grp_index].dealloc_prod;
 
 		/* Ensure we see all indices enqueued by netif_idx_release(). */
 		smp_rmb();
 
 		while (dc != dp) {
 			unsigned long pfn;
-
-			pending_idx = dealloc_ring[pending_index(dc++)];
+			struct netbk_tx_pending_inuse *pending_inuse;
+			pending_ring_idx_t p_index;
+			grant_handle_t handle;
+			struct page *page;
+
+			p_index = pending_index(dc++);
+			pending_idx = netbk[grp_index].dealloc_ring[p_index];
+			pending_inuse = netbk[grp_index].pending_inuse;
 			list_move_tail(&pending_inuse[pending_idx].list, &list);
 
-			pfn = idx_to_pfn(pending_idx);
+			pfn = idx_to_pfn(grp_index, pending_idx);
 			/* Already unmapped? */
 			if (!phys_to_machine_mapping_valid(pfn))
 				continue;
 
-			stop_tracking_page(mmap_pages[pending_idx]);
+			page = netbk[grp_index].mmap_pages[pending_idx];
+			stop_tracking_page(page);
 
-			gnttab_set_unmap_op(gop, idx_to_kaddr(pending_idx),
-					    GNTMAP_host_map,
-					    grant_tx_handle[pending_idx]);
+			handle = netbk[grp_index].grant_tx_handle[pending_idx];
+			gnttab_set_unmap_op(gop,
+					idx_to_kaddr(grp_index, pending_idx),
+					GNTMAP_host_map,
+					handle);
 			gop++;
 		}
 
 		if (netbk_copy_skb_mode != NETBK_DELAYED_COPY_SKB ||
-		    list_empty(&pending_inuse_head))
+		    list_empty(&netbk[grp_index].pending_inuse_head))
 			break;
 
 		/* Copy any entries that have been pending for too long. */
-		list_for_each_entry_safe(inuse, n, &pending_inuse_head, list) {
+		list_for_each_entry_safe(inuse, n,
+				&netbk[grp_index].pending_inuse_head, list) {
+			struct pending_tx_info *txinfo;
+
 			if (time_after(inuse->alloc_time + HZ / 2, jiffies))
 				break;
 
-			pending_idx = inuse - pending_inuse;
+			pending_idx = inuse - netbk[grp_index].pending_inuse;
 
-			pending_tx_info[pending_idx].netif->nr_copied_skbs++;
+			txinfo = &netbk[grp_index].pending_tx_info[pending_idx];
+			txinfo->netif->nr_copied_skbs++;
 
-			switch (copy_pending_req(pending_idx)) {
+			switch (copy_pending_req(grp_index, pending_idx)) {
 			case 0:
 				list_move_tail(&inuse->list, &list);
 				continue;
@@ -846,26 +811,34 @@ inline static void net_tx_action_dealloc(void)
 
 			break;
 		}
-	} while (dp != dealloc_prod);
+	} while (dp != netbk[grp_index].dealloc_prod);
 
-	dealloc_cons = dc;
+	netbk[grp_index].dealloc_cons = dc;
 
 	ret = HYPERVISOR_grant_table_op(
-		GNTTABOP_unmap_grant_ref, tx_unmap_ops, gop - tx_unmap_ops);
+			GNTTABOP_unmap_grant_ref,
+			netbk[grp_index].tx_unmap_ops,
+			gop - netbk[grp_index].tx_unmap_ops);
 	BUG_ON(ret);
 
 	list_for_each_entry_safe(inuse, n, &list, list) {
-		pending_idx = inuse - pending_inuse;
+		struct pending_tx_info *txinfo;
+		pending_ring_idx_t index;
+
+		pending_idx = inuse - netbk[grp_index].pending_inuse;
+		txinfo = netbk[grp_index].pending_tx_info;
 
-		netif = pending_tx_info[pending_idx].netif;
+		netif = txinfo[pending_idx].netif;
 
-		make_tx_response(netif, &pending_tx_info[pending_idx].req,
-				 NETIF_RSP_OKAY);
+		make_tx_response(netif, &txinfo[pending_idx].req,
+				NETIF_RSP_OKAY);
 
 		/* Ready for next use. */
-		gnttab_reset_grant_page(mmap_pages[pending_idx]);
+		gnttab_reset_grant_page(
+				netbk[grp_index].mmap_pages[pending_idx]);
 
-		pending_ring[pending_index(pending_prod++)] = pending_idx;
+		index = pending_index(netbk[grp_index].pending_prod++);
+		netbk[grp_index].pending_ring[index] = pending_idx;
 
 		netif_put(netif);
 
@@ -873,7 +846,8 @@ inline static void net_tx_action_dealloc(void)
 	}
 }
 
-static void netbk_tx_err(struct xen_netif *netif, struct xen_netif_tx_request *txp, RING_IDX end)
+static void netbk_tx_err(struct xen_netif *netif,
+		struct xen_netif_tx_request *txp, RING_IDX end)
 {
 	RING_IDX cons = netif->tx.req_cons;
 
@@ -890,7 +864,8 @@ static void netbk_tx_err(struct xen_netif *netif, struct xen_netif_tx_request *t
 
 static int netbk_count_requests(struct xen_netif *netif,
 				struct xen_netif_tx_request *first,
-				struct xen_netif_tx_request *txp, int work_to_do)
+				struct xen_netif_tx_request *txp,
+				int work_to_do)
 {
 	RING_IDX cons = netif->tx.req_cons;
 	int frags = 0;
@@ -930,35 +905,41 @@ static int netbk_count_requests(struct xen_netif *netif,
 }
 
 static struct gnttab_map_grant_ref *netbk_get_requests(struct xen_netif *netif,
-						  struct sk_buff *skb,
-						  struct xen_netif_tx_request *txp,
-						  struct gnttab_map_grant_ref *mop)
+					struct sk_buff *skb,
+					struct xen_netif_tx_request *txp,
+					struct gnttab_map_grant_ref *mop)
 {
 	struct skb_shared_info *shinfo = skb_shinfo(skb);
 	skb_frag_t *frags = shinfo->frags;
 	unsigned long pending_idx = *((u16 *)skb->data);
 	int i, start;
+	int grp_index = GET_GROUP_INDEX(netif);
 
 	/* Skip first skb fragment if it is on same page as header fragment. */
 	start = ((unsigned long)shinfo->frags[0].page == pending_idx);
 
 	for (i = start; i < shinfo->nr_frags; i++, txp++) {
-		pending_idx = pending_ring[pending_index(pending_cons++)];
+		pending_ring_idx_t index;
+		struct pending_tx_info *txinfo;
 
-		gnttab_set_map_op(mop++, idx_to_kaddr(pending_idx),
+		index = pending_index(netbk[grp_index].pending_cons++);
+		pending_idx = netbk[grp_index].pending_ring[index];
+
+		gnttab_set_map_op(mop++, idx_to_kaddr(grp_index, pending_idx),
 				  GNTMAP_host_map | GNTMAP_readonly,
 				  txp->gref, netif->domid);
 
-		memcpy(&pending_tx_info[pending_idx].req, txp, sizeof(*txp));
+		txinfo = netbk[grp_index].pending_tx_info;
+		memcpy(&txinfo[pending_idx].req, txp, sizeof(*txp));
 		netif_get(netif);
-		pending_tx_info[pending_idx].netif = netif;
+		txinfo[pending_idx].netif = netif;
 		frags[i].page = (void *)pending_idx;
 
 		start_tracking_page(foreign_page_tracker,
-				    mmap_pages[pending_idx],
+				    netbk[grp_index].mmap_pages[pending_idx],
 				    netif->domid,
-				    pending_tx_info[pending_idx].req.gref,
-				    pending_idx,
+				    txinfo[pending_idx].req.gref,
+				    grp_index * MAX_PENDING_REQS + pending_idx,
 				    NULL);
 	}
 
@@ -966,28 +947,34 @@ static struct gnttab_map_grant_ref *netbk_get_requests(struct xen_netif *netif,
 }
 
 static int netbk_tx_check_mop(struct sk_buff *skb,
-			       struct gnttab_map_grant_ref **mopp)
+			       struct gnttab_map_grant_ref **mopp,
+				int grp_index)
 {
 	struct gnttab_map_grant_ref *mop = *mopp;
 	int pending_idx = *((u16 *)skb->data);
-	struct xen_netif *netif = pending_tx_info[pending_idx].netif;
+	struct xen_netif *netif;
 	struct xen_netif_tx_request *txp;
 	struct skb_shared_info *shinfo = skb_shinfo(skb);
 	int nr_frags = shinfo->nr_frags;
 	int i, err, start;
 
+	netif = netbk[grp_index].pending_tx_info[pending_idx].netif;
 	/* Check status of header. */
 	err = mop->status;
 	if (unlikely(err)) {
-		txp = &pending_tx_info[pending_idx].req;
+		pending_ring_idx_t index;
+		index = pending_index(netbk[grp_index].pending_prod++);
+		txp = &netbk[grp_index].pending_tx_info[pending_idx].req;
 		make_tx_response(netif, txp, NETIF_RSP_ERROR);
-		pending_ring[pending_index(pending_prod++)] = pending_idx;
+		netbk[grp_index].pending_ring[index] = pending_idx;
 		netif_put(netif);
 	} else {
+		unsigned long addr;
+		addr = idx_to_kaddr(grp_index, pending_idx);
 		set_phys_to_machine(
-			__pa(idx_to_kaddr(pending_idx)) >> PAGE_SHIFT,
+			__pa(addr) >> PAGE_SHIFT,
 			FOREIGN_FRAME(mop->dev_bus_addr >> PAGE_SHIFT));
-		grant_tx_handle[pending_idx] = mop->handle;
+		netbk[grp_index].grant_tx_handle[pending_idx] = mop->handle;
 	}
 
 	/* Skip first skb fragment if it is on same page as header fragment. */
@@ -995,26 +982,31 @@ static int netbk_tx_check_mop(struct sk_buff *skb,
 
 	for (i = start; i < nr_frags; i++) {
 		int j, newerr;
+		pending_ring_idx_t index;
 
 		pending_idx = (unsigned long)shinfo->frags[i].page;
 
 		/* Check error status: if okay then remember grant handle. */
 		newerr = (++mop)->status;
 		if (likely(!newerr)) {
+			unsigned long addr;
+			addr = idx_to_kaddr(grp_index, pending_idx);
 			set_phys_to_machine(
-				__pa(idx_to_kaddr(pending_idx))>>PAGE_SHIFT,
+				__pa(addr)>>PAGE_SHIFT,
 				FOREIGN_FRAME(mop->dev_bus_addr>>PAGE_SHIFT));
-			grant_tx_handle[pending_idx] = mop->handle;
+			netbk[grp_index].grant_tx_handle[pending_idx] =
+				mop->handle;
 			/* Had a previous error? Invalidate this fragment. */
 			if (unlikely(err))
-				netif_idx_release(pending_idx);
+				netif_idx_release(grp_index, pending_idx);
 			continue;
 		}
 
 		/* Error on this fragment: respond to client with an error. */
-		txp = &pending_tx_info[pending_idx].req;
+		txp = &netbk[grp_index].pending_tx_info[pending_idx].req;
 		make_tx_response(netif, txp, NETIF_RSP_ERROR);
-		pending_ring[pending_index(pending_prod++)] = pending_idx;
+		index = pending_index(netbk[grp_index].pending_prod++);
+		netbk[grp_index].pending_ring[index] = pending_idx;
 		netif_put(netif);
 
 		/* Not the first error? Preceding frags already invalidated. */
@@ -1023,10 +1015,10 @@ static int netbk_tx_check_mop(struct sk_buff *skb,
 
 		/* First error: invalidate header and preceding fragments. */
 		pending_idx = *((u16 *)skb->data);
-		netif_idx_release(pending_idx);
+		netif_idx_release(grp_index, pending_idx);
 		for (j = start; j < i; j++) {
 			pending_idx = (unsigned long)shinfo->frags[i].page;
-			netif_idx_release(pending_idx);
+			netif_idx_release(grp_index, pending_idx);
 		}
 
 		/* Remember the error: invalidate all subsequent fragments. */
@@ -1037,7 +1029,7 @@ static int netbk_tx_check_mop(struct sk_buff *skb,
 	return err;
 }
 
-static void netbk_fill_frags(struct sk_buff *skb)
+static void netbk_fill_frags(struct sk_buff *skb, int grp_index)
 {
 	struct skb_shared_info *shinfo = skb_shinfo(skb);
 	int nr_frags = shinfo->nr_frags;
@@ -1050,12 +1042,13 @@ static void netbk_fill_frags(struct sk_buff *skb)
 
 		pending_idx = (unsigned long)frag->page;
 
-		pending_inuse[pending_idx].alloc_time = jiffies;
-		list_add_tail(&pending_inuse[pending_idx].list,
-			      &pending_inuse_head);
+		netbk[grp_index].pending_inuse[pending_idx].alloc_time =
+			jiffies;
+		list_add_tail(&netbk[grp_index].pending_inuse[pending_idx].list,
+			      &netbk[grp_index].pending_inuse_head);
 
-		txp = &pending_tx_info[pending_idx].req;
-		frag->page = virt_to_page(idx_to_kaddr(pending_idx));
+		txp = &netbk[grp_index].pending_tx_info[pending_idx].req;
+		frag->page = virt_to_page(idx_to_kaddr(grp_index, pending_idx));
 		frag->size = txp->size;
 		frag->page_offset = txp->offset;
 
@@ -1187,15 +1180,16 @@ static bool tx_credit_exceeded(struct xen_netif *netif, unsigned size)
 	return false;
 }
 
-static unsigned net_tx_build_mops(void)
+static unsigned net_tx_build_mops(int grp_index)
 {
 	struct gnttab_map_grant_ref *mop;
 	struct sk_buff *skb;
 	int ret;
 
-	mop = tx_map_ops;
-	while (((nr_pending_reqs() + MAX_SKB_FRAGS) < MAX_PENDING_REQS) &&
-		!list_empty(&net_schedule_list)) {
+	mop = netbk[grp_index].tx_map_ops;
+	while (((nr_pending_reqs(grp_index) + MAX_SKB_FRAGS) <
+				MAX_PENDING_REQS) &&
+		!list_empty(&netbk[grp_index].net_schedule_list)) {
 		struct xen_netif *netif;
 		struct xen_netif_tx_request txreq;
 		struct xen_netif_tx_request txfrags[MAX_SKB_FRAGS];
@@ -1204,9 +1198,11 @@ static unsigned net_tx_build_mops(void)
 		RING_IDX idx;
 		int work_to_do;
 		unsigned int data_len;
+		pending_ring_idx_t index;
 	
 		/* Get a netif from the list with work to do. */
-		netif = list_first_entry(&net_schedule_list, struct xen_netif, list);
+		netif = list_first_entry(&netbk[grp_index].net_schedule_list,
+				struct xen_netif, list);
 		netif_get(netif);
 		remove_from_net_schedule_list(netif);
 
@@ -1265,7 +1261,8 @@ static unsigned net_tx_build_mops(void)
 			continue;
 		}
 
-		pending_idx = pending_ring[pending_index(pending_cons)];
+		index = pending_index(netbk[grp_index].pending_cons);
+		pending_idx = netbk[grp_index].pending_ring[index];
 
 		data_len = (txreq.size > PKT_PROT_LEN &&
 			    ret < MAX_SKB_FRAGS) ?
@@ -1293,21 +1290,21 @@ static unsigned net_tx_build_mops(void)
 			}
 		}
 
-		gnttab_set_map_op(mop, idx_to_kaddr(pending_idx),
+		gnttab_set_map_op(mop, idx_to_kaddr(grp_index, pending_idx),
 				  GNTMAP_host_map | GNTMAP_readonly,
 				  txreq.gref, netif->domid);
 		mop++;
 
 		start_tracking_page(foreign_page_tracker,
-				    mmap_pages[pending_idx],
+				    netbk[grp_index].mmap_pages[pending_idx],
 				    netif->domid,
 				    txreq.gref,
-				    pending_idx,
+				    grp_index * MAX_PENDING_REQS + pending_idx,
 				    NULL);
 
-		memcpy(&pending_tx_info[pending_idx].req,
+		memcpy(&netbk[grp_index].pending_tx_info[pending_idx].req,
 		       &txreq, sizeof(txreq));
-		pending_tx_info[pending_idx].netif = netif;
+		netbk[grp_index].pending_tx_info[pending_idx].netif = netif;
 		*((u16 *)skb->data) = pending_idx;
 
 		__skb_put(skb, data_len);
@@ -1322,40 +1319,42 @@ static unsigned net_tx_build_mops(void)
 			skb_shinfo(skb)->frags[0].page = (void *)~0UL;
 		}
 
-		__skb_queue_tail(&tx_queue, skb);
+		__skb_queue_tail(&netbk[grp_index].tx_queue, skb);
 
-		pending_cons++;
+		netbk[grp_index].pending_cons++;
 
 		mop = netbk_get_requests(netif, skb, txfrags, mop);
 
 		netif->tx.req_cons = idx;
 		netif_schedule_work(netif);
 
-		if ((mop - tx_map_ops) >= ARRAY_SIZE(tx_map_ops))
+		if ((mop - netbk[grp_index].tx_map_ops) >=
+				ARRAY_SIZE(netbk[grp_index].tx_map_ops))
 			break;
 	}
 
-	return mop - tx_map_ops;
+	return mop - netbk[grp_index].tx_map_ops;
 }
 
-static void net_tx_submit(void)
+static void net_tx_submit(int grp_index)
 {
 	struct gnttab_map_grant_ref *mop;
 	struct sk_buff *skb;
 
-	mop = tx_map_ops;
-	while ((skb = __skb_dequeue(&tx_queue)) != NULL) {
+	mop = netbk[grp_index].tx_map_ops;
+	while ((skb = __skb_dequeue(&netbk[grp_index].tx_queue)) != NULL) {
 		struct xen_netif_tx_request *txp;
 		struct xen_netif *netif;
 		u16 pending_idx;
 		unsigned data_len;
+		unsigned long addr;
 
 		pending_idx = *((u16 *)skb->data);
-		netif       = pending_tx_info[pending_idx].netif;
-		txp         = &pending_tx_info[pending_idx].req;
+		netif = netbk[grp_index].pending_tx_info[pending_idx].netif;
+		txp = &netbk[grp_index].pending_tx_info[pending_idx].req;
 
 		/* Check the remap error code. */
-		if (unlikely(netbk_tx_check_mop(skb, &mop))) {
+		if (unlikely(netbk_tx_check_mop(skb, &mop, grp_index))) {
 			DPRINTK("netback grant failed.\n");
 			skb_shinfo(skb)->nr_frags = 0;
 			kfree_skb(skb);
@@ -1363,8 +1362,9 @@ static void net_tx_submit(void)
 		}
 
 		data_len = skb->len;
+		addr = idx_to_kaddr(grp_index, pending_idx);
 		memcpy(skb->data,
-		       (void *)(idx_to_kaddr(pending_idx)|txp->offset),
+		       (void *)(addr|txp->offset),
 		       data_len);
 		if (data_len < txp->size) {
 			/* Append the packet payload as a fragment. */
@@ -1372,7 +1372,7 @@ static void net_tx_submit(void)
 			txp->size -= data_len;
 		} else {
 			/* Schedule a response immediately. */
-			netif_idx_release(pending_idx);
+			netif_idx_release(grp_index, pending_idx);
 		}
 
 		/*
@@ -1384,7 +1384,7 @@ static void net_tx_submit(void)
 		else
 			skb->ip_summed = CHECKSUM_NONE;
 
-		netbk_fill_frags(skb);
+		netbk_fill_frags(skb, grp_index);
 
 		skb->dev      = netif->dev;
 		skb->protocol = eth_type_trans(skb, skb->dev);
@@ -1412,65 +1412,70 @@ static void net_tx_submit(void)
 	}
 
 	if (netbk_copy_skb_mode == NETBK_DELAYED_COPY_SKB &&
-	    !list_empty(&pending_inuse_head)) {
+	    !list_empty(&netbk[grp_index].pending_inuse_head)) {
 		struct netbk_tx_pending_inuse *oldest;
 
-		oldest = list_entry(pending_inuse_head.next,
+		oldest = list_entry(netbk[grp_index].pending_inuse_head.next,
 				    struct netbk_tx_pending_inuse, list);
-		mod_timer(&netbk_tx_pending_timer, oldest->alloc_time + HZ);
+		mod_timer(&netbk[grp_index].netbk_tx_pending_timer,
+				oldest->alloc_time + HZ);
 	}
 }
 
 /* Called after netfront has transmitted */
-static void net_tx_action(unsigned long unused)
+static void net_tx_action(unsigned long grp_index)
 {
 	unsigned nr_mops;
 	int ret;
 
-	if (dealloc_cons != dealloc_prod)
-		net_tx_action_dealloc();
+	if (netbk[grp_index].dealloc_cons != netbk[grp_index].dealloc_prod)
+		net_tx_action_dealloc(grp_index);
 
-	nr_mops = net_tx_build_mops();
+	nr_mops = net_tx_build_mops(grp_index);
 
 	if (nr_mops == 0)
 		return;
 
 	ret = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref,
-					tx_map_ops, nr_mops);
+					netbk[grp_index].tx_map_ops, nr_mops);
 	BUG_ON(ret);
 
-	net_tx_submit();
+	net_tx_submit(grp_index);
 }
 
-static void netif_idx_release(u16 pending_idx)
+static void netif_idx_release(int grp_index, u16 pending_idx)
 {
 	static DEFINE_SPINLOCK(_lock);
 	unsigned long flags;
+	pending_ring_idx_t index;
 
 	spin_lock_irqsave(&_lock, flags);
-	dealloc_ring[pending_index(dealloc_prod)] = pending_idx;
+	index = pending_index(netbk[grp_index].dealloc_prod);
+	netbk[grp_index].dealloc_ring[index] = pending_idx;
 	/* Sync with net_tx_action_dealloc: insert idx /then/ incr producer. */
 	smp_wmb();
-	dealloc_prod++;
+	netbk[grp_index].dealloc_prod++;
 	spin_unlock_irqrestore(&_lock, flags);
 
-	tasklet_schedule(&net_tx_tasklet);
+	tasklet_schedule(&netbk[grp_index].net_tx_tasklet);
 }
 
 static void netif_page_release(struct page *page, unsigned int order)
 {
 	int idx = netif_page_index(page);
+	int grp_index = ((struct page_ext *)(page->mapping))->grp_index;
 	BUG_ON(order);
 	BUG_ON(idx < 0);
-	netif_idx_release(idx);
+	netif_idx_release(grp_index, idx);
 }
 
 irqreturn_t netif_be_int(int irq, void *dev_id)
 {
 	struct xen_netif *netif = dev_id;
+	int grp_index = GET_GROUP_INDEX(netif);
 
 	add_to_net_schedule_list_tail(netif);
-	maybe_schedule_tx_action();
+	maybe_schedule_tx_action(grp_index);
 
 	if (netif_schedulable(netif) && !netbk_queue_full(netif))
 		netif_wake_queue(netif->dev);
@@ -1536,13 +1541,14 @@ static struct xen_netif_rx_response *make_rx_response(struct xen_netif *netif,
 static irqreturn_t netif_be_dbg(int irq, void *dev_id, struct pt_regs *regs)
 {
 	struct list_head *ent;
-	struct xen_netif *netif;
+	struct xen_netif *netif = dev_id;
+	int grp_index = GET_GROUP_INDEX(netif);
 	int i = 0;
 
 	printk(KERN_ALERT "netif_schedule_list:\n");
-	spin_lock_irq(&net_schedule_list_lock);
+	spin_lock_irq(&netbk[grp_index].net_schedule_list_lock);
 
-	list_for_each (ent, &net_schedule_list) {
+	list_for_each(ent, &netbk[grp_index].net_schedule_list) {
 		netif = list_entry(ent, struct xen_netif, list);
 		printk(KERN_ALERT " %d: private(rx_req_cons=%08x "
 		       "rx_resp_prod=%08x\n",
@@ -1559,7 +1565,7 @@ static irqreturn_t netif_be_dbg(int irq, void *dev_id, struct pt_regs *regs)
 		i++;
 	}
 
-	spin_unlock_irq(&net_schedule_list_lock);
+	spin_unlock_irq(&netbk[grp_index].net_schedule_list_lock);
 	printk(KERN_ALERT " ** End of netif_schedule_list **\n");
 
 	return IRQ_HANDLED;
@@ -1569,47 +1575,82 @@ static irqreturn_t netif_be_dbg(int irq, void *dev_id, struct pt_regs *regs)
 static int __init netback_init(void)
 {
 	int i;
+	int grp_index;
 	struct page *page;
 	int rc = 0;
 
 	if (!xen_domain())
 		return -ENODEV;
 
+	cpu_online_nr = num_online_cpus();
+
 	/* We can increase reservation by this much in net_rx_action(). */
 //	balloon_update_driver_allowance(NET_RX_RING_SIZE);
 
-	skb_queue_head_init(&rx_queue);
-	skb_queue_head_init(&tx_queue);
-
-	init_timer(&net_timer);
-	net_timer.data = 0;
-	net_timer.function = net_alarm;
-
-	init_timer(&netbk_tx_pending_timer);
-	netbk_tx_pending_timer.data = 0;
-	netbk_tx_pending_timer.function = netbk_tx_pending_timeout;
-
-	foreign_page_tracker = alloc_page_foreign_tracker(MAX_PENDING_REQS);
-	if (!foreign_page_tracker)
-		return -ENOMEM;
-	mmap_pages = alloc_empty_pages_and_pagevec(MAX_PENDING_REQS);
-	if (mmap_pages == NULL) {
-		printk("%s: out of memory\n", __FUNCTION__);
-		free_page_foreign_tracker(foreign_page_tracker);
+	netbk = kzalloc(cpu_online_nr * sizeof(struct netbk), GFP_KERNEL);
+	if (!netbk) {
+		printk(KERN_ALERT "%s: out of memory\n", __func__);
 		return -ENOMEM;
 	}
 
-	for (i = 0; i < MAX_PENDING_REQS; i++) {
-		page = mmap_pages[i];
-		SetPageForeign(page, netif_page_release);
-		netif_set_page_index(page, i);
-		INIT_LIST_HEAD(&pending_inuse[i].list);
+	foreign_page_tracker =
+		alloc_page_foreign_tracker(cpu_online_nr * MAX_PENDING_REQS);
+	if (!foreign_page_tracker) {
+		kfree(netbk);
+		return -ENOMEM;
 	}
 
-	pending_cons = 0;
-	pending_prod = MAX_PENDING_REQS;
-	for (i = 0; i < MAX_PENDING_REQS; i++)
-		pending_ring[i] = i;
+	for (grp_index = 0; grp_index < cpu_online_nr; grp_index++) {
+		tasklet_init(&netbk[grp_index].net_tx_tasklet,
+				net_tx_action, grp_index);
+		tasklet_init(&netbk[grp_index].net_rx_tasklet,
+				net_rx_action, grp_index);
+
+		skb_queue_head_init(&netbk[grp_index].rx_queue);
+		skb_queue_head_init(&netbk[grp_index].tx_queue);
+
+		netbk[grp_index].mmap_pages =
+			alloc_empty_pages_and_pagevec(MAX_PENDING_REQS);
+		if (netbk[grp_index].mmap_pages == NULL) {
+			printk(KERN_ALERT "%s: out of memory\n", __func__);
+			rc = -ENOMEM;
+			goto failed_init;
+		}
+
+		init_timer(&netbk[grp_index].net_timer);
+		netbk[grp_index].net_timer.data = (unsigned long)grp_index;
+		netbk[grp_index].net_timer.function = net_alarm;
+
+		init_timer(&netbk[grp_index].netbk_tx_pending_timer);
+		netbk[grp_index].netbk_tx_pending_timer.data =
+			(unsigned long)grp_index;
+		netbk[grp_index].netbk_tx_pending_timer.function =
+			netbk_tx_pending_timeout;
+
+		for (i = 0; i < MAX_PENDING_REQS; i++) {
+			page = netbk[grp_index].mmap_pages[i];
+			SetPageForeign(page, netif_page_release);
+			netbk[grp_index].page_extinfo[i].grp_index = grp_index;
+			netbk[grp_index].page_extinfo[i].idx = i;
+			netif_set_page_index(page,
+					&netbk[grp_index].page_extinfo[i]);
+			INIT_LIST_HEAD(&netbk[grp_index].pending_inuse[i].list);
+		}
+		INIT_LIST_HEAD(&netbk[grp_index].pending_inuse_head);
+		INIT_LIST_HEAD(&netbk[grp_index].net_schedule_list);
+
+		netbk[grp_index].pending_cons = 0;
+		netbk[grp_index].pending_prod = MAX_PENDING_REQS;
+
+		for (i = 0; i < MAX_PENDING_REQS; i++)
+			netbk[grp_index].pending_ring[i] = i;
+
+		spin_lock_init(&netbk[grp_index].net_schedule_list_lock);
+
+		INIT_LIST_HEAD(&netbk[grp_index].domains);
+		spin_lock_init(&netbk[grp_index].domain_list_lock);
+		netbk[grp_index].domain_nr = 0;
+	}
 
 	netbk_copy_skb_mode = NETBK_DONT_COPY_SKB;
 	if (MODPARM_copy_skb) {
@@ -1638,9 +1679,14 @@ static int __init netback_init(void)
 	return 0;
 
 failed_init:
-	free_empty_pages_and_pagevec(mmap_pages, MAX_PENDING_REQS);
-	del_timer(&netbk_tx_pending_timer);
-	del_timer(&net_timer);
+	for (i = 0; i < grp_index; i++) {
+		free_empty_pages_and_pagevec(netbk[i].mmap_pages,
+				MAX_PENDING_REQS);
+		del_timer(&netbk[i].netbk_tx_pending_timer);
+		del_timer(&netbk[i].net_timer);
+	}
+	kfree(netbk);
+	free_page_foreign_tracker(foreign_page_tracker);
 	return rc;
 
 }
-- 
1.6.3


[-- Attachment #3: 0002-Use-Kernel-thread-to-replace-the-tasklet.patch --]
[-- Type: application/octet-stream, Size: 5826 bytes --]

From ee7acfe5019b98e176d567d4300efb6911e1d903 Mon Sep 17 00:00:00 2001
From: Dongxiao Xu <dongxiao.xu@intel.com>
Date: Wed, 2 Dec 2009 17:27:32 +0800
Subject: [PATCH 2/2] Use Kernel thread to replace the tasklet.
     Kernel thread has more control over QoS, and could improve
 dom0's userspace responseness.

Signed-off-by: Dongxiao Xu <dongxiao.xu@intel.com>
---
 drivers/xen/netback/common.h  |    4 +-
 drivers/xen/netback/netback.c |   71 ++++++++++++++++++++++++++++++++++-------
 2 files changed, 61 insertions(+), 14 deletions(-)

diff --git a/drivers/xen/netback/common.h b/drivers/xen/netback/common.h
index 3e91012..732019f 100644
--- a/drivers/xen/netback/common.h
+++ b/drivers/xen/netback/common.h
@@ -245,8 +245,8 @@ struct netbk_tx_pending_inuse {
 #define MAX_PENDING_REQS 256
 
 struct netbk {
-	struct tasklet_struct net_tx_tasklet;
-	struct tasklet_struct net_rx_tasklet;
+	wait_queue_head_t netbk_action_wq;
+	struct task_struct *task;
 
 	struct sk_buff_head rx_queue;
 	struct sk_buff_head tx_queue;
diff --git a/drivers/xen/netback/netback.c b/drivers/xen/netback/netback.c
index 103ee8a..f329659 100644
--- a/drivers/xen/netback/netback.c
+++ b/drivers/xen/netback/netback.c
@@ -38,6 +38,7 @@
 
 #include <linux/tcp.h>
 #include <linux/udp.h>
+#include <linux/kthread.h>
 
 #include <xen/balloon.h>
 #include <xen/events.h>
@@ -138,7 +139,7 @@ static inline void maybe_schedule_tx_action(int grp_index)
 	smp_mb();
 	if ((nr_pending_reqs(grp_index) < (MAX_PENDING_REQS/2)) &&
 			!list_empty(&netbk[grp_index].net_schedule_list))
-		tasklet_schedule(&netbk[grp_index].net_tx_tasklet);
+		wake_up(&netbk[grp_index].netbk_action_wq);
 }
 
 static struct sk_buff *netbk_copy_skb(struct sk_buff *skb)
@@ -290,7 +291,7 @@ int netif_be_start_xmit(struct sk_buff *skb, struct net_device *dev)
 	}
 	grp_index = GET_GROUP_INDEX(netif);
 	skb_queue_tail(&netbk[grp_index].rx_queue, skb);
-	tasklet_schedule(&netbk[grp_index].net_rx_tasklet);
+	wake_up(&netbk[grp_index].netbk_action_wq);
 
 	return 0;
 
@@ -622,7 +623,7 @@ static void net_rx_action(unsigned long grp_index)
 	/* More work to do? */
 	if (!skb_queue_empty(&netbk[grp_index].rx_queue)
 			&& !timer_pending(&netbk[grp_index].net_timer))
-		tasklet_schedule(&netbk[grp_index].net_rx_tasklet);
+		wake_up(&netbk[grp_index].netbk_action_wq);
 #if 0
 	else
 		xen_network_done_notify();
@@ -631,12 +632,12 @@ static void net_rx_action(unsigned long grp_index)
 
 static void net_alarm(unsigned long grp_index)
 {
-	tasklet_schedule(&netbk[grp_index].net_rx_tasklet);
+	wake_up(&netbk[grp_index].netbk_action_wq);
 }
 
 static void netbk_tx_pending_timeout(unsigned long grp_index)
 {
-	tasklet_schedule(&netbk[grp_index].net_tx_tasklet);
+	wake_up(&netbk[grp_index].netbk_action_wq);
 }
 
 struct net_device_stats *netif_be_get_stats(struct net_device *dev)
@@ -1407,7 +1408,7 @@ static void net_tx_submit(int grp_index)
 			continue;
 		}
 
-		netif_rx(skb);
+		netif_rx_ni(skb);
 		netif->dev->last_rx = jiffies;
 	}
 
@@ -1457,7 +1458,7 @@ static void netif_idx_release(int grp_index, u16 pending_idx)
 	netbk[grp_index].dealloc_prod++;
 	spin_unlock_irqrestore(&_lock, flags);
 
-	tasklet_schedule(&netbk[grp_index].net_tx_tasklet);
+	wake_up(&netbk[grp_index].netbk_action_wq);
 }
 
 static void netif_page_release(struct page *page, unsigned int order)
@@ -1572,10 +1573,46 @@ static irqreturn_t netif_be_dbg(int irq, void *dev_id, struct pt_regs *regs)
 }
 #endif
 
+static inline int rx_work_todo(int grp_index)
+{
+	return !skb_queue_empty(&netbk[grp_index].rx_queue);
+}
+
+static inline int tx_work_todo(int grp_index)
+{
+	if (netbk[grp_index].dealloc_cons != netbk[grp_index].dealloc_prod)
+		return 1;
+
+	if (((nr_pending_reqs(grp_index) + MAX_SKB_FRAGS) < MAX_PENDING_REQS) &&
+			!list_empty(&netbk[grp_index].net_schedule_list))
+		return 1;
+
+	return 0;
+}
+
+static int netbk_action_thread(void *index)
+{
+	unsigned long grp_index = (unsigned long)index;
+	while (1) {
+		wait_event_interruptible(netbk[grp_index].netbk_action_wq,
+				rx_work_todo(grp_index)
+				|| tx_work_todo(grp_index));
+		cond_resched();
+
+		if (rx_work_todo(grp_index))
+			net_rx_action(grp_index);
+
+		if (tx_work_todo(grp_index))
+			net_tx_action(grp_index);
+	}
+
+	return 0;
+}
+
 static int __init netback_init(void)
 {
 	int i;
-	int grp_index;
+	unsigned long grp_index;
 	struct page *page;
 	int rc = 0;
 
@@ -1601,10 +1638,18 @@ static int __init netback_init(void)
 	}
 
 	for (grp_index = 0; grp_index < cpu_online_nr; grp_index++) {
-		tasklet_init(&netbk[grp_index].net_tx_tasklet,
-				net_tx_action, grp_index);
-		tasklet_init(&netbk[grp_index].net_rx_tasklet,
-				net_rx_action, grp_index);
+		init_waitqueue_head(&netbk[grp_index].netbk_action_wq);
+		netbk[grp_index].task =	kthread_create(netbk_action_thread,
+				(void *)grp_index, "netback/%lu", grp_index);
+
+		if (!IS_ERR(netbk[grp_index].task)) {
+			kthread_bind(netbk[grp_index].task, grp_index);
+			wake_up_process(netbk[grp_index].task);
+		} else {
+			printk(KERN_ALERT "kthread_run() fails at netback\n");
+			rc = PTR_ERR(netbk[grp_index].task);
+			goto failed_init;
+		}
 
 		skb_queue_head_init(&netbk[grp_index].rx_queue);
 		skb_queue_head_init(&netbk[grp_index].tx_queue);
@@ -1613,6 +1658,7 @@ static int __init netback_init(void)
 			alloc_empty_pages_and_pagevec(MAX_PENDING_REQS);
 		if (netbk[grp_index].mmap_pages == NULL) {
 			printk(KERN_ALERT "%s: out of memory\n", __func__);
+			kthread_stop(netbk[grp_index].task);
 			rc = -ENOMEM;
 			goto failed_init;
 		}
@@ -1680,6 +1726,7 @@ static int __init netback_init(void)
 
 failed_init:
 	for (i = 0; i < grp_index; i++) {
+		kthread_stop(netbk[i].task);
 		free_empty_pages_and_pagevec(netbk[i].mmap_pages,
 				MAX_PENDING_REQS);
 		del_timer(&netbk[i].netbk_tx_pending_timer);
-- 
1.6.3


[-- Attachment #4: Type: text/plain, Size: 138 bytes --]

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xensource.com
http://lists.xensource.com/xen-devel

^ permalink raw reply related	[flat|nested] 46+ messages in thread

* Re: [Pv-ops][PATCH] Netback multiple tasklet support
  2009-12-02 10:17       ` Xu, Dongxiao
@ 2009-12-03 21:28         ` Jeremy Fitzhardinge
  2009-12-04  2:13           ` Xu, Dongxiao
  0 siblings, 1 reply; 46+ messages in thread
From: Jeremy Fitzhardinge @ 2009-12-03 21:28 UTC (permalink / raw)
  To: Xu, Dongxiao
  Cc: Steven Smith, Ian Pratt, xen-devel@lists.xensource.com,
	Ian Campbell

On 12/02/09 02:17, Xu, Dongxiao wrote:
> Hi,
> 	According to your feedback, I have revised my patch and am resending it now.
> 	
> [PATCH 01]: Use multiple tasklet pairs to replace the current single pair in dom0.
> [PATCH 02]: Replace the tasklet with a kernel thread. It may hurt the performance, but could improve the responsiveness of userspace.
>
> Test scenario:
> We use ten 1G NIC interfaces to talk with 10 VMs (netfront) in the server, so the total bandwidth is 10G.
> For the host machine, bind each guest's netfront to its own NIC interface.
> For the client machine, run netperf testing against each guest.
>
> Test Case			Throughput(Mbps)	Dom0 CPU Util	Guests CPU Util
> w/o any patch			4304.30		400.33%		112.21%
> w/   01   patch			9533.13		461.64%		243.81%
> w/ 01 and 02 patches		7942.68		597.83%		250.53%
>
> From the result we can see that the case "w/ 01 and 02 patches" didn't reach or come near the total bandwidth. This is because some vcpus in dom0 are saturated by context switches with other tasks, which hurts the performance. To prove this idea, I did an experiment which sets the kernel thread to the SCHED_FIFO policy, in order to avoid preemption by normal tasks (a sketch of this follows below). The experiment result is shown below, and it gets good performance. However, like the tasklet, setting the kernel thread to high priority could also hurt userspace responsiveness, because userspace applications (for example, sshd) could not preempt that netback kernel thread.
>
> w/ hi-priority kthread		9535.74		543.56%		241.26%
>
> For netchannel2, which omits the grant copy in dom0, I didn't try it yet. But I used xenoprofile on the current netback system to get a feeling that grant copy occupies ~1/6 of dom0's cpu cycles (including Xen and the dom0 vmlinux).
>
> BTW, the 02 patch is ported from the patch given by Ian Campbell. You can add your Signed-off-by if you want. :)
>    
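
For reference, a minimal sketch of how a netback kernel thread could be given SCHED_FIFO priority as described in the quoted experiment (the helper name and priority value are assumptions for illustration, not code from the posted patches):

#include <linux/kernel.h>
#include <linux/sched.h>

/* Sketch only: raise a netback thread to a real-time policy so that
 * ordinary SCHED_NORMAL tasks (e.g. sshd) can no longer preempt it. */
static void netbk_set_rt_priority(struct task_struct *task)
{
	struct sched_param param = { .sched_priority = 1 };

	if (sched_setscheduler(task, SCHED_FIFO, &param))
		printk(KERN_WARNING "netback: failed to set SCHED_FIFO\n");
}

In the patch above this would presumably be called on netbk[grp_index].task right after kthread_create() in netback_init().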

I've applied this to the xen/dom0/backend/netback-tasklet branch for 
now.  However, I noticed a number of problems with a quick lookover of 
the code:

    * "netbk" should either be static, or have a longer name (mentioning
      xen)
    * same with "foreign_page_tracker"
          o (the foreign page tracker API should have better names, but
            that's not your problem)
    * What's cpu_online_nr for?  I don't think it should be necessary at
      all, and if it is, then it needs a much more distinct name.
    * If they're really per-cpu variables, they should use the percpu
      mechanism
    * How do you relate the number of online CPUs to the whole group
      index/pending index computation?  It isn't obvious how they're
      connected, or how it guarantees that the index is enough.
    * What happens if you start hotplugging cpus?
    * All the repeated netbk[group_idx]. expressions would be improved
      by defining a local pointer for that value (a short sketch of this
      idiom follows below).
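
For illustration, a minimal sketch of that local-pointer idiom, using names from the posted patches (netbk_rx_kick() is a made-up wrapper, not code from the series):

static void netbk_rx_kick(unsigned long grp_index)
{
	/* fetch the per-group state once instead of repeating
	 * netbk[grp_index]. on every access */
	struct netbk *np = &netbk[grp_index];

	if (!skb_queue_empty(&np->rx_queue) && !timer_pending(&np->net_timer))
		wake_up(&np->netbk_action_wq);
}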

     J

^ permalink raw reply	[flat|nested] 46+ messages in thread

* RE: [Pv-ops][PATCH] Netback multiple tasklet support
  2009-12-03 21:28         ` Jeremy Fitzhardinge
@ 2009-12-04  2:13           ` Xu, Dongxiao
  2009-12-04  2:33             ` Jeremy Fitzhardinge
  0 siblings, 1 reply; 46+ messages in thread
From: Xu, Dongxiao @ 2009-12-04  2:13 UTC (permalink / raw)
  To: Jeremy Fitzhardinge
  Cc: Steven Smith, Ian Pratt, xen-devel@lists.xensource.com,
	Ian Campbell

Hi, Jeremy, 
	Thanks very much for your comments; here are some explanations for the 01 patch. 

Best Regards, 
-- Dongxiao

Jeremy Fitzhardinge wrote:
> On 12/02/09 02:17, Xu, Dongxiao wrote:
>> Hi,
>> 	According to your feedback, I have revised my patch and am resending it now.
>> 
>> [PATCH 01]: Use multiple tasklet pairs to replace the current single
>> pair in dom0. [PATCH 02]: Replace the tasklet with a kernel thread. It
>> may hurt the performance, but could improve the responsiveness of
>> userspace.  
>> 
>> Test scenario:
>> We use ten 1G NIC interfaces to talk with 10 VMs (netfront) in the
>> server, so the total bandwidth is 10G. 
>> For the host machine, bind each guest's netfront to its own NIC interface.
>> For the client machine, run netperf testing against each guest.
>> 
>> Test Case			Throughput(Mbps)	Dom0 CPU Util	Guests CPU Util
>> w/o any patch			4304.30		400.33%		112.21%
>> w/   01   patch			9533.13		461.64%		243.81%
>> w/ 01 and 02 patches		7942.68		597.83%		250.53%
>> 
>> From the result we can see that the case "w/ 01 and 02 patches"
>> didn't reach or come near the total bandwidth. This is because some
>> vcpus in dom0 are saturated by context switches with other tasks,
>> which hurts the performance. To prove this idea, I did an experiment
>> which sets the kernel thread to the SCHED_FIFO policy, in order to
>> avoid preemption by normal tasks. The experiment result is shown
>> below, and it gets good performance. However, like the tasklet,
>> setting the kernel thread to high priority could also hurt userspace
>> responsiveness, because userspace applications (for example, sshd)
>> could not preempt that netback kernel thread.
>> 
>> w/ hi-priority kthread		9535.74		543.56%		241.26%
>> 
>> For netchannel2, which omits the grant copy in dom0, I didn't try it
>> yet. But I used xenoprofile on the current netback system to get a
>> feeling that grant copy occupies ~1/6 of dom0's cpu cycles (including
>> Xen and the dom0 vmlinux).   
>> 
>> BTW, the 02 patch is ported from the patch given by Ian Campbell. You
>> can add your Signed-off-by if you want. :) 
>> 
> 
> I've applied this to the xen/dom0/backend/netback-tasklet branch for
> now.  However, I noticed a number of problems with a quick lookover of
> the code:
> 
>     * "netbk" should either be static, or have a longer name
>       (mentioning xen)

OK, I will rename it. 

>     * same with "foreign_page_tracker"
>           o (the foreign page tracker API should have better names,
>             but that's not your problem)
>     * What's cpu_online_nr for?  I don't think it should be necessary
>       at all, and if it is, then it needs a much more distinct name.
>     * If they're really per-cpu variables, they should use the percpu
>       mechanism

Actually those tasklets are not per-cpu variables. 
We just defined cpu_online_nr tasklet pairs, in order to get the best performance 
if each tasklet could run on its own cpu. However, they are not bound to cpus. 
Some tasklets may run on the same vcpu of dom0 due to interrupt delivery
affinity. Therefore these tasklets are not per-cpu variables. 

>     * How do you relate the number of online CPUs to the whole group
>       index/pending index computation?  It isn't obvious how they're
>       connected, or how it guarantees that the index is enough.

Same explanation as above. Whether the number of online cpus is greater or less than 
the number of tasklets does not matter in our case. We set them to the same value
only to get the best performance.
 
>     * What happens if you start hotplugging cpus?

It doesn't matter. 
Assume that dom0 has 3 vcpus now, so there are 3 tasklets to handle network traffic. 
At some point the admin adds one vcpu to dom0. In this case, 4 vcpus will handle the three 
tasklets.  Compare this with the current situation without my patch, where all of dom0's 
vcpus handle only one tasklet, which is a bottleneck. 
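
For illustration, a rough sketch of how the group count and the group index relate in this scheme (names approximate the patch; least_loaded_group() is a hypothetical helper standing in for the real assignment logic):

/* Sketch only: the group count is fixed once at module init; hot-adding
 * vcpus later does not change it, it just changes how many vcpus end up
 * servicing the same fixed set of groups. */
static int netbk_group_count;

static int __init netbk_groups_init(void)
{
	netbk_group_count = num_online_cpus();
	return 0;
}

/* Each netfront is assigned an index below the fixed count, so the
 * per-group state is never indexed out of range. */
static void netbk_assign_group(struct xen_netif *netif)
{
	netif->grp_index = least_loaded_group();  /* always < netbk_group_count */
}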

>     * All the repeated netbk[group_idx]. expressions would be improved
>       by defining a local pointer for that value.
OK, I will improve it. Thanks!
> 
>      J

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [Pv-ops][PATCH] Netback multiple tasklet support
  2009-12-04  2:13           ` Xu, Dongxiao
@ 2009-12-04  2:33             ` Jeremy Fitzhardinge
  2009-12-08  9:22               ` Xu, Dongxiao
  0 siblings, 1 reply; 46+ messages in thread
From: Jeremy Fitzhardinge @ 2009-12-04  2:33 UTC (permalink / raw)
  To: Xu, Dongxiao
  Cc: Steven Smith, Ian Pratt, xen-devel@lists.xensource.com,
	Ian Campbell

On 12/03/09 18:13, Xu, Dongxiao wrote:
>>      * same with "foreign_page_tracker"
>>            o (the foreign page tracker API should have better names,
>>              but that's not your problem)
>>      * What's cpu_online_nr for?  I don't think it should be necessary
>>        at all, and if it is, then it needs a much more distinct name.
>>      * If they're really per-cpu variables, they should use the percpu
>>        mechanism
>>      
> Actually those tasklets are not per-cpu variables.
> We just defined cpu_online_nr tasklet pairs, in order to get the best performance
> if each tasklet could run on its own cpu. However, they are not bound to cpus.
> Some tasklets may run on the same vcpu of dom0 due to interrupt delivery
> affinity. Therefore these tasklets are not per-cpu variables.
>    

OK, you should name the variable to what it actually means, not what its 
value happens to be.  It seems like a parameter which should be 
adjustable via sysfs or something.

How did you arrive at 3?
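
For illustration, the simplest form such a knob could take as a module parameter (a sketch only; the parameter name is an assumption, not something in the posted patches):

#include <linux/module.h>

/* sketch: make the number of tx/rx processing groups a load-time knob
 * rather than silently tying it to the number of online cpus */
static unsigned int netback_groups = 4;	/* default here is arbitrary */
module_param_named(groups, netback_groups, uint, 0444);
MODULE_PARM_DESC(groups, "Number of netback tx/rx processing groups");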

>>      * How do you relate the number of online CPUs to the whole group
>>        index/pending index computation?  It isn't obvious how they're
>>        connected, or how it guarantees that the index is enough.
>>      
> Same explanation as above. Whether the number of online cpus is greater or less than
> the number of tasklets does not matter in our case. We set them to the same value
> only to get the best performance.
>    

Nevertheless, it isn't at all clear how we can be certain the index 
calculations are guaranteed to be less than the number of 
tasklets.  There is a lot of code scattered around the place; perhaps 
you could condense it into a smaller number of places?

In fact, the overall patch size is very large, and hard to review and 
test.  Could you please give some thought to how you can incrementally 
modify netback to get the result you want?  For example, keep the 
current functional structure, but make the changes to generalize to N 
processing handlers (but keeping N=1), then convert the softirq to a 
tasklet, then make N > 1.
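
For illustration, the kind of N=1 first step being suggested here, roughly in the shape the later 0001 patch takes (names follow that patch):

/* step 1 sketch: all state lives in a per-group struct, but there is
 * still exactly one group, so behaviour is unchanged */
struct xen_netbk *xen_netbk;
int group_nr = 1;

/* later steps replace this with a real per-netfront group lookup */
#define GET_GROUP_INDEX(netif) (0)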

Thanks,
     J

^ permalink raw reply	[flat|nested] 46+ messages in thread

* RE: [Pv-ops][PATCH] Netback multiple tasklet support
  2009-12-04  2:33             ` Jeremy Fitzhardinge
@ 2009-12-08  9:22               ` Xu, Dongxiao
  2009-12-09 20:23                 ` Jeremy Fitzhardinge
  0 siblings, 1 reply; 46+ messages in thread
From: Xu, Dongxiao @ 2009-12-08  9:22 UTC (permalink / raw)
  To: Jeremy Fitzhardinge
  Cc: Steven Smith, Ian Pratt, xen-devel@lists.xensource.com,
	Ian Campbell

[-- Attachment #1: Type: text/plain, Size: 2609 bytes --]

Jeremy, 
	I have revised the patches according to your suggestions. See the attachments. 
0001: Keep the group number as 1, and move all the global/static variables into struct xen_netbk. Do some preparation for multiple tasklets support.
0002: Support for netback multiple tasklets.
0003: Use kernel threads to replace the tasklets in order to ensure dom0 userspace QoS.

Thanks!
Dongxiao.

Jeremy Fitzhardinge wrote:
> On 12/03/09 18:13, Xu, Dongxiao wrote:
>>>      * same with "foreign_page_tracker"
>>>            o (the foreign page tracker API should have better names,
>>>              but that's not your problem)
>>>      * What's cpu_online_nr for?  I don't think it should be
>>>        necessary at all, and if it is, then it needs a much more
>>>        distinct name.
>>>      * If they're really per-cpu variables, they should use the
>>>        percpu mechanism 
>>> 
>> Actually those tasklets are not per-cpu variables.
>> We just defined cpu_online_nr tasklet pairs, in order to get the best
>> performance if each tasklet could run on its own cpu. However, they are
>> not bound to cpus. Some tasklets may run on the same vcpu of dom0
>> due to interrupt delivery affinity. Therefore these tasklets are not
>> per-cpu variables. 
>> 
> 
> OK, you should name the variable to what it actually means, not what
> its value happens to be.  It seems like a parameter which should be
> adjustable via sysfs or something.
> 
> How did you arrive at 3?
> 
>>>      * How do you relate the number of online CPUs to the whole
>>>        group index/pending index computation?  It isn't obvious how
>>>        they're connected, or how it guarantees that the index is
>>> enough. 
>>> 
>> Same explanation as above. Whether the number of online cpus is greater
>> or less than the number of tasklets does not matter in our case. We set
>> them to the same value only to get the best performance.
>> 
> 
> Nevertheless, it isn't at all clear how we can be certain the index
> calculations are guaranteed to be less than the number of
> tasklets.  There is a lot of code scattered around the place; perhaps
> you could condense it into a smaller number of places?
> 
> In fact, the overall patch size is very large, and hard to review and
> test.  Could you please give some thought to how you can incrementally
> modify netback to get the result you want?  For example, keep the
> current functional structure, but make the changes to generalize to N
> processing handlers (but keeping N=1), then convert the softirq to a
> tasklet, then make N > 1.
> 
> Thanks,
>      J

[-- Attachment #2: 0001-Netback-Generilize-static-global-variables-into-stru.patch --]
[-- Type: application/octet-stream, Size: 37867 bytes --]

From 08bb27c1a63fb21be402181dbdaade4f8eea50b8 Mon Sep 17 00:00:00 2001
From: Dongxiao Xu <dongxiao.xu@intel.com>
Date: Tue, 8 Dec 2009 14:36:44 +0800
Subject: [PATCH 1/3] Netback: Generalize static/global variables into 'struct xen_netbk'.
     Put all the static/global variables in netback.c into the xen_netbk
 structure. Do some preparation for the support of netback multiple
 threads.

Signed-off-by: Dongxiao Xu <dongxiao.xu@intel.com>
---
 drivers/xen/netback/common.h  |   64 +++++
 drivers/xen/netback/netback.c |  542 +++++++++++++++++++++--------------------
 2 files changed, 348 insertions(+), 258 deletions(-)

diff --git a/drivers/xen/netback/common.h b/drivers/xen/netback/common.h
index 348644a..8eff6c8 100644
--- a/drivers/xen/netback/common.h
+++ b/drivers/xen/netback/common.h
@@ -220,4 +220,68 @@ static inline int netbk_can_sg(struct net_device *dev)
 	return netif->features & NETIF_F_SG;
 }
 
+struct pending_tx_info {
+	struct xen_netif_tx_request req;
+	struct xen_netif *netif;
+};
+typedef unsigned int pending_ring_idx_t;
+
+struct page_ext {
+	unsigned long group;
+	unsigned long idx;
+};
+
+struct netbk_rx_meta {
+	skb_frag_t frag;
+	int id;
+};
+
+struct netbk_tx_pending_inuse {
+	struct list_head list;
+	unsigned long alloc_time;
+};
+
+#define MAX_PENDING_REQS 256
+
+struct xen_netbk {
+	struct tasklet_struct net_tx_tasklet;
+	struct tasklet_struct net_rx_tasklet;
+
+	struct sk_buff_head rx_queue;
+	struct sk_buff_head tx_queue;
+
+	struct timer_list net_timer;
+	struct timer_list netbk_tx_pending_timer;
+
+	struct page **mmap_pages;
+
+	struct page_ext page_extinfo[MAX_PENDING_REQS];
+
+	struct pending_tx_info pending_tx_info[MAX_PENDING_REQS];
+	struct netbk_tx_pending_inuse pending_inuse[MAX_PENDING_REQS];
+	struct gnttab_unmap_grant_ref tx_unmap_ops[MAX_PENDING_REQS];
+	struct gnttab_map_grant_ref tx_map_ops[MAX_PENDING_REQS];
+
+	grant_handle_t grant_tx_handle[MAX_PENDING_REQS];
+	u16 pending_ring[MAX_PENDING_REQS];
+	u16 dealloc_ring[MAX_PENDING_REQS];
+
+	pending_ring_idx_t pending_prod;
+	pending_ring_idx_t pending_cons;
+	pending_ring_idx_t dealloc_prod;
+	pending_ring_idx_t dealloc_cons;
+
+	struct list_head pending_inuse_head;
+	struct list_head net_schedule_list;
+
+	struct multicall_entry rx_mcl[NET_RX_RING_SIZE+3];
+	struct mmu_update rx_mmu[NET_RX_RING_SIZE];
+	struct gnttab_transfer grant_trans_op[NET_RX_RING_SIZE];
+	struct gnttab_copy grant_copy_op[NET_RX_RING_SIZE];
+	unsigned char rx_notify[NR_IRQS];
+	u16 notify_list[NET_RX_RING_SIZE];
+	struct netbk_rx_meta meta[NET_RX_RING_SIZE];
+
+	spinlock_t net_schedule_list_lock;
+};
 #endif /* __NETIF__BACKEND__COMMON_H__ */
diff --git a/drivers/xen/netback/netback.c b/drivers/xen/netback/netback.c
index c24debf..a484b0a 100644
--- a/drivers/xen/netback/netback.c
+++ b/drivers/xen/netback/netback.c
@@ -49,18 +49,13 @@
 
 /*define NETBE_DEBUG_INTERRUPT*/
 
-struct netbk_rx_meta {
-	skb_frag_t frag;
-	int id;
-};
-
-struct netbk_tx_pending_inuse {
-	struct list_head list;
-	unsigned long alloc_time;
-};
+struct xen_netbk *xen_netbk;
+int group_nr = 1;
+struct page_foreign_tracker *foreign_page_tracker;
 
+#define GET_GROUP_INDEX(netif) (0)
 
-static void netif_idx_release(u16 pending_idx);
+static void netif_idx_release(int group, u16 pending_idx);
 static void make_tx_response(struct xen_netif *netif,
 			     struct xen_netif_tx_request *txp,
 			     s8       st);
@@ -71,47 +66,26 @@ static struct xen_netif_rx_response *make_rx_response(struct xen_netif *netif,
 					     u16      size,
 					     u16      flags);
 
-static void net_tx_action(unsigned long unused);
-static DECLARE_TASKLET(net_tx_tasklet, net_tx_action, 0);
-
-static void net_rx_action(unsigned long unused);
-static DECLARE_TASKLET(net_rx_tasklet, net_rx_action, 0);
+static void net_tx_action(unsigned long group);
 
-static struct timer_list net_timer;
-static struct timer_list netbk_tx_pending_timer;
+static void net_rx_action(unsigned long group);
 
-#define MAX_PENDING_REQS 256
-
-static struct sk_buff_head rx_queue;
-
-static struct page **mmap_pages;
-static inline unsigned long idx_to_pfn(unsigned int idx)
+static inline unsigned long idx_to_pfn(int group, unsigned int idx)
 {
-	return page_to_pfn(mmap_pages[idx]);
+	struct xen_netbk *netbk = &xen_netbk[group];
+	return page_to_pfn(netbk->mmap_pages[idx]);
 }
 
-static inline unsigned long idx_to_kaddr(unsigned int idx)
+static inline unsigned long idx_to_kaddr(int group, unsigned int idx)
 {
-	return (unsigned long)pfn_to_kaddr(idx_to_pfn(idx));
+	return (unsigned long)pfn_to_kaddr(idx_to_pfn(group, idx));
 }
 
 /* extra field used in struct page */
-static inline void netif_set_page_index(struct page *pg, unsigned int index)
+static inline void netif_set_page_index(struct page *pg,
+					struct page_ext *page_extinfo)
 {
-	*(unsigned long *)&pg->mapping = index + 1;
-}
-
-static inline int netif_page_index(struct page *pg)
-{
-	unsigned long idx = (unsigned long)pg->mapping - 1;
-
-	if (!PageForeign(pg))
-		return -1;
-
-	if ((idx >= MAX_PENDING_REQS) || (mmap_pages[idx] != pg))
-		return -1;
-
-	return idx;
+	pg->mapping = (struct address_space *)page_extinfo;
 }
 
 /*
@@ -125,48 +99,18 @@ static inline int netif_page_index(struct page *pg)
  */
 #define PKT_PROT_LEN 64
 
-static struct pending_tx_info {
-	struct xen_netif_tx_request req;
-	struct xen_netif *netif;
-} pending_tx_info[MAX_PENDING_REQS];
-static u16 pending_ring[MAX_PENDING_REQS];
-typedef unsigned int pending_ring_idx_t;
-
 static inline pending_ring_idx_t pending_index(unsigned i)
 {
 	return i & (MAX_PENDING_REQS-1);
 }
 
-static pending_ring_idx_t pending_prod, pending_cons;
-
-static inline pending_ring_idx_t nr_pending_reqs(void)
+static inline pending_ring_idx_t nr_pending_reqs(int group)
 {
-	return MAX_PENDING_REQS - pending_prod + pending_cons;
+	struct xen_netbk *netbk = &xen_netbk[group];
+	return MAX_PENDING_REQS -
+		netbk->pending_prod + netbk->pending_cons;
 }
 
-static struct page_foreign_tracker *foreign_page_tracker;
-
-/* Freed TX SKBs get batched on this ring before return to pending_ring. */
-static u16 dealloc_ring[MAX_PENDING_REQS];
-static pending_ring_idx_t dealloc_prod, dealloc_cons;
-
-/* Doubly-linked list of in-use pending entries. */
-static struct netbk_tx_pending_inuse pending_inuse[MAX_PENDING_REQS];
-static LIST_HEAD(pending_inuse_head);
-
-static struct sk_buff_head tx_queue;
-
-static grant_handle_t grant_tx_handle[MAX_PENDING_REQS];
-static struct gnttab_unmap_grant_ref tx_unmap_ops[MAX_PENDING_REQS];
-static struct gnttab_map_grant_ref tx_map_ops[MAX_PENDING_REQS];
-
-static LIST_HEAD(net_schedule_list);
-static DEFINE_SPINLOCK(net_schedule_list_lock);
-
-#define MAX_MFN_ALLOC 64
-static unsigned long mfn_list[MAX_MFN_ALLOC];
-static unsigned int alloc_index = 0;
-
 /* Setting this allows the safe use of this driver without netloop. */
 static int MODPARM_copy_skb = 1;
 module_param_named(copy_skb, MODPARM_copy_skb, bool, 0);
@@ -174,18 +118,13 @@ MODULE_PARM_DESC(copy_skb, "Copy data received from netfront without netloop");
 
 int netbk_copy_skb_mode;
 
-static inline unsigned long alloc_mfn(void)
-{
-	BUG_ON(alloc_index == 0);
-	return mfn_list[--alloc_index];
-}
-
-static inline void maybe_schedule_tx_action(void)
+static inline void maybe_schedule_tx_action(int group)
 {
+	struct xen_netbk *netbk = &xen_netbk[group];
 	smp_mb();
-	if ((nr_pending_reqs() < (MAX_PENDING_REQS/2)) &&
-	    !list_empty(&net_schedule_list))
-		tasklet_schedule(&net_tx_tasklet);
+	if ((nr_pending_reqs(group) < (MAX_PENDING_REQS/2)) &&
+	    !list_empty(&netbk->net_schedule_list))
+		tasklet_schedule(&netbk->net_tx_tasklet);
 }
 
 static struct sk_buff *netbk_copy_skb(struct sk_buff *skb)
@@ -289,7 +228,9 @@ static void tx_queue_callback(unsigned long data)
 
 int netif_be_start_xmit(struct sk_buff *skb, struct net_device *dev)
 {
+	struct xen_netbk *netbk;
 	struct xen_netif *netif = netdev_priv(dev);
+	int group;
 
 	BUG_ON(skb->dev != dev);
 
@@ -334,9 +275,10 @@ int netif_be_start_xmit(struct sk_buff *skb, struct net_device *dev)
 			mod_timer(&netif->tx_queue_timeout, jiffies + HZ/2);
 		}
 	}
-
-	skb_queue_tail(&rx_queue, skb);
-	tasklet_schedule(&net_rx_tasklet);
+	group = GET_GROUP_INDEX(netif);
+	netbk = &xen_netbk[group];
+	skb_queue_tail(&netbk->rx_queue, skb);
+	tasklet_schedule(&netbk->net_rx_tasklet);
 
 	return 0;
 
@@ -495,7 +437,7 @@ static void netbk_add_frag_responses(struct xen_netif *netif, int status,
 	}
 }
 
-static void net_rx_action(unsigned long unused)
+static void net_rx_action(unsigned long group)
 {
 	struct xen_netif *netif = NULL;
 	s8 status;
@@ -509,31 +451,21 @@ static void net_rx_action(unsigned long unused)
 	int nr_frags;
 	int count;
 	unsigned long offset;
-
-	/*
-	 * Putting hundreds of bytes on the stack is considered rude.
-	 * Static works because a tasklet can only be on one CPU at any time.
-	 */
-	static struct multicall_entry rx_mcl[NET_RX_RING_SIZE+3];
-	static struct mmu_update rx_mmu[NET_RX_RING_SIZE];
-	static struct gnttab_transfer grant_trans_op[NET_RX_RING_SIZE];
-	static struct gnttab_copy grant_copy_op[NET_RX_RING_SIZE];
-	static unsigned char rx_notify[NR_IRQS];
-	static u16 notify_list[NET_RX_RING_SIZE];
-	static struct netbk_rx_meta meta[NET_RX_RING_SIZE];
+	struct xen_netbk *netbk = &xen_netbk[group];
 
 	struct netrx_pending_operations npo = {
-		mmu: rx_mmu,
-		trans: grant_trans_op,
-		copy: grant_copy_op,
-		mcl: rx_mcl,
-		meta: meta};
+		.mmu   = netbk->rx_mmu,
+		.trans = netbk->grant_trans_op,
+		.copy  = netbk->grant_copy_op,
+		.mcl   = netbk->rx_mcl,
+		.meta  = netbk->meta,
+	};
 
 	skb_queue_head_init(&rxq);
 
 	count = 0;
 
-	while ((skb = skb_dequeue(&rx_queue)) != NULL) {
+	while ((skb = skb_dequeue(&netbk->rx_queue)) != NULL) {
 		nr_frags = skb_shinfo(skb)->nr_frags;
 		*(int *)skb->cb = nr_frags;
 
@@ -548,39 +480,39 @@ static void net_rx_action(unsigned long unused)
 			break;
 	}
 
-	BUG_ON(npo.meta_prod > ARRAY_SIZE(meta));
+	BUG_ON(npo.meta_prod > ARRAY_SIZE(netbk->meta));
 
 	npo.mmu_mcl = npo.mcl_prod;
 	if (npo.mcl_prod) {
 		BUG_ON(xen_feature(XENFEAT_auto_translated_physmap));
-		BUG_ON(npo.mmu_prod > ARRAY_SIZE(rx_mmu));
+		BUG_ON(npo.mmu_prod > ARRAY_SIZE(netbk->rx_mmu));
 		mcl = npo.mcl + npo.mcl_prod++;
 
 		BUG_ON(mcl[-1].op != __HYPERVISOR_update_va_mapping);
 		mcl[-1].args[MULTI_UVMFLAGS_INDEX] = UVMF_TLB_FLUSH|UVMF_ALL;
 
 		mcl->op = __HYPERVISOR_mmu_update;
-		mcl->args[0] = (unsigned long)rx_mmu;
+		mcl->args[0] = (unsigned long)netbk->rx_mmu;
 		mcl->args[1] = npo.mmu_prod;
 		mcl->args[2] = 0;
 		mcl->args[3] = DOMID_SELF;
 	}
 
 	if (npo.trans_prod) {
-		BUG_ON(npo.trans_prod > ARRAY_SIZE(grant_trans_op));
+		BUG_ON(npo.trans_prod > ARRAY_SIZE(netbk->grant_trans_op));
 		mcl = npo.mcl + npo.mcl_prod++;
 		mcl->op = __HYPERVISOR_grant_table_op;
 		mcl->args[0] = GNTTABOP_transfer;
-		mcl->args[1] = (unsigned long)grant_trans_op;
+		mcl->args[1] = (unsigned long)netbk->grant_trans_op;
 		mcl->args[2] = npo.trans_prod;
 	}
 
 	if (npo.copy_prod) {
-		BUG_ON(npo.copy_prod > ARRAY_SIZE(grant_copy_op));
+		BUG_ON(npo.copy_prod > ARRAY_SIZE(netbk->grant_copy_op));
 		mcl = npo.mcl + npo.mcl_prod++;
 		mcl->op = __HYPERVISOR_grant_table_op;
 		mcl->args[0] = GNTTABOP_copy;
-		mcl->args[1] = (unsigned long)grant_copy_op;
+		mcl->args[1] = (unsigned long)netbk->grant_copy_op;
 		mcl->args[2] = npo.copy_prod;
 	}
 
@@ -588,7 +520,7 @@ static void net_rx_action(unsigned long unused)
 	if (!npo.mcl_prod)
 		return;
 
-	BUG_ON(npo.mcl_prod > ARRAY_SIZE(rx_mcl));
+	BUG_ON(npo.mcl_prod > ARRAY_SIZE(netbk->rx_mcl));
 
 	ret = HYPERVISOR_multicall(npo.mcl, npo.mcl_prod);
 	BUG_ON(ret != 0);
@@ -605,7 +537,7 @@ static void net_rx_action(unsigned long unused)
 
 		status = netbk_check_gop(nr_frags, netif->domid, &npo);
 
-		id = meta[npo.meta_cons].id;
+		id = netbk->meta[npo.meta_cons].id;
 		flags = nr_frags ? NETRXF_more_data : 0;
 
 		if (skb->ip_summed == CHECKSUM_PARTIAL) /* local packet? */
@@ -618,7 +550,7 @@ static void net_rx_action(unsigned long unused)
 		resp = make_rx_response(netif, id, status, offset,
 					skb_headlen(skb), flags);
 
-		if (meta[npo.meta_cons].frag.size) {
+		if (netbk->meta[npo.meta_cons].frag.size) {
 			struct xen_netif_extra_info *gso =
 				(struct xen_netif_extra_info *)
 				RING_GET_RESPONSE(&netif->rx,
@@ -626,7 +558,7 @@ static void net_rx_action(unsigned long unused)
 
 			resp->flags |= NETRXF_extra_info;
 
-			gso->u.gso.size = meta[npo.meta_cons].frag.size;
+			gso->u.gso.size = netbk->meta[npo.meta_cons].frag.size;
 			gso->u.gso.type = XEN_NETIF_GSO_TYPE_TCPV4;
 			gso->u.gso.pad = 0;
 			gso->u.gso.features = 0;
@@ -636,15 +568,15 @@ static void net_rx_action(unsigned long unused)
 		}
 
 		netbk_add_frag_responses(netif, status,
-					 meta + npo.meta_cons + 1,
-					 nr_frags);
+				netbk->meta + npo.meta_cons + 1,
+				nr_frags);
 
 		RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&netif->rx, ret);
 		irq = netif->irq;
-		if (ret && !rx_notify[irq] &&
+		if (ret && !netbk->rx_notify[irq] &&
 				(netif->smart_poll != 1)) {
-			rx_notify[irq] = 1;
-			notify_list[notify_nr++] = irq;
+			netbk->rx_notify[irq] = 1;
+			netbk->notify_list[notify_nr++] = irq;
 		}
 
 		if (netif_queue_stopped(netif->dev) &&
@@ -669,28 +601,31 @@ static void net_rx_action(unsigned long unused)
 	}
 
 	while (notify_nr != 0) {
-		irq = notify_list[--notify_nr];
-		rx_notify[irq] = 0;
+		irq = netbk->notify_list[--notify_nr];
+		netbk->rx_notify[irq] = 0;
 		notify_remote_via_irq(irq);
 	}
 
 	/* More work to do? */
-	if (!skb_queue_empty(&rx_queue) && !timer_pending(&net_timer))
-		tasklet_schedule(&net_rx_tasklet);
+	if (!skb_queue_empty(&netbk->rx_queue) &&
+			!timer_pending(&netbk->net_timer))
+		tasklet_schedule(&netbk->net_rx_tasklet);
 #if 0
 	else
 		xen_network_done_notify();
 #endif
 }
 
-static void net_alarm(unsigned long unused)
+static void net_alarm(unsigned long group)
 {
-	tasklet_schedule(&net_rx_tasklet);
+	struct xen_netbk *netbk = &xen_netbk[group];
+	tasklet_schedule(&netbk->net_rx_tasklet);
 }
 
-static void netbk_tx_pending_timeout(unsigned long unused)
+static void netbk_tx_pending_timeout(unsigned long group)
 {
-	tasklet_schedule(&net_tx_tasklet);
+	struct xen_netbk *netbk = &xen_netbk[group];
+	tasklet_schedule(&netbk->net_tx_tasklet);
 }
 
 struct net_device_stats *netif_be_get_stats(struct net_device *dev)
@@ -706,37 +641,42 @@ static int __on_net_schedule_list(struct xen_netif *netif)
 
 static void remove_from_net_schedule_list(struct xen_netif *netif)
 {
-	spin_lock_irq(&net_schedule_list_lock);
+	int group = GET_GROUP_INDEX(netif);
+	struct xen_netbk *netbk = &xen_netbk[group];
+	spin_lock_irq(&netbk->net_schedule_list_lock);
 	if (likely(__on_net_schedule_list(netif))) {
 		list_del_init(&netif->list);
 		netif_put(netif);
 	}
-	spin_unlock_irq(&net_schedule_list_lock);
+	spin_unlock_irq(&netbk->net_schedule_list_lock);
 }
 
 static void add_to_net_schedule_list_tail(struct xen_netif *netif)
 {
+	int group = GET_GROUP_INDEX(netif);
+	struct xen_netbk *netbk = &xen_netbk[group];
 	if (__on_net_schedule_list(netif))
 		return;
 
-	spin_lock_irq(&net_schedule_list_lock);
+	spin_lock_irq(&netbk->net_schedule_list_lock);
 	if (!__on_net_schedule_list(netif) &&
 	    likely(netif_schedulable(netif))) {
-		list_add_tail(&netif->list, &net_schedule_list);
+		list_add_tail(&netif->list, &netbk->net_schedule_list);
 		netif_get(netif);
 	}
-	spin_unlock_irq(&net_schedule_list_lock);
+	spin_unlock_irq(&netbk->net_schedule_list_lock);
 }
 
 void netif_schedule_work(struct xen_netif *netif)
 {
 	int more_to_do;
+	int group = GET_GROUP_INDEX(netif);
 
 	RING_FINAL_CHECK_FOR_REQUESTS(&netif->tx, more_to_do);
 
 	if (more_to_do) {
 		add_to_net_schedule_list_tail(netif);
-		maybe_schedule_tx_action();
+		maybe_schedule_tx_action(group);
 	}
 }
 
@@ -773,13 +713,16 @@ static void tx_credit_callback(unsigned long data)
 	netif_schedule_work(netif);
 }
 
-static inline int copy_pending_req(pending_ring_idx_t pending_idx)
+static inline int copy_pending_req(int group,
+				   pending_ring_idx_t pending_idx)
 {
-	return gnttab_copy_grant_page(grant_tx_handle[pending_idx],
-				      &mmap_pages[pending_idx]);
+	struct xen_netbk *netbk = &xen_netbk[group];
+	return gnttab_copy_grant_page(
+			netbk->grant_tx_handle[pending_idx],
+			&netbk->mmap_pages[pending_idx]);
 }
 
-inline static void net_tx_action_dealloc(void)
+static inline void net_tx_action_dealloc(int group)
 {
 	struct netbk_tx_pending_inuse *inuse, *n;
 	struct gnttab_unmap_grant_ref *gop;
@@ -787,53 +730,61 @@ inline static void net_tx_action_dealloc(void)
 	pending_ring_idx_t dc, dp;
 	struct xen_netif *netif;
 	int ret;
+	struct xen_netbk *netbk = &xen_netbk[group];
 	LIST_HEAD(list);
 
-	dc = dealloc_cons;
-	gop = tx_unmap_ops;
+	dc = netbk->dealloc_cons;
+	gop = netbk->tx_unmap_ops;
 
 	/*
 	 * Free up any grants we have finished using
 	 */
 	do {
-		dp = dealloc_prod;
+		dp = netbk->dealloc_prod;
 
 		/* Ensure we see all indices enqueued by netif_idx_release(). */
 		smp_rmb();
 
 		while (dc != dp) {
 			unsigned long pfn;
+			struct netbk_tx_pending_inuse *pending_inuse =
+					netbk->pending_inuse;
 
-			pending_idx = dealloc_ring[pending_index(dc++)];
+			pending_idx = netbk->dealloc_ring[pending_index(dc++)];
 			list_move_tail(&pending_inuse[pending_idx].list, &list);
 
-			pfn = idx_to_pfn(pending_idx);
+			pfn = idx_to_pfn(group, pending_idx);
 			/* Already unmapped? */
 			if (!phys_to_machine_mapping_valid(pfn))
 				continue;
 
-			stop_tracking_page(mmap_pages[pending_idx]);
+			stop_tracking_page(netbk->mmap_pages[pending_idx]);
 
-			gnttab_set_unmap_op(gop, idx_to_kaddr(pending_idx),
-					    GNTMAP_host_map,
-					    grant_tx_handle[pending_idx]);
+			gnttab_set_unmap_op(gop,
+					idx_to_kaddr(group, pending_idx),
+					GNTMAP_host_map,
+					netbk->grant_tx_handle[pending_idx]);
 			gop++;
 		}
 
 		if (netbk_copy_skb_mode != NETBK_DELAYED_COPY_SKB ||
-		    list_empty(&pending_inuse_head))
+		    list_empty(&netbk->pending_inuse_head))
 			break;
 
 		/* Copy any entries that have been pending for too long. */
-		list_for_each_entry_safe(inuse, n, &pending_inuse_head, list) {
+		list_for_each_entry_safe(inuse, n,
+				&netbk->pending_inuse_head, list) {
+			struct pending_tx_info *pending_tx_info;
+			pending_tx_info = netbk->pending_tx_info;
+
 			if (time_after(inuse->alloc_time + HZ / 2, jiffies))
 				break;
 
-			pending_idx = inuse - pending_inuse;
+			pending_idx = inuse - netbk->pending_inuse;
 
 			pending_tx_info[pending_idx].netif->nr_copied_skbs++;
 
-			switch (copy_pending_req(pending_idx)) {
+			switch (copy_pending_req(group, pending_idx)) {
 			case 0:
 				list_move_tail(&inuse->list, &list);
 				continue;
@@ -846,16 +797,21 @@ inline static void net_tx_action_dealloc(void)
 
 			break;
 		}
-	} while (dp != dealloc_prod);
+	} while (dp != netbk->dealloc_prod);
 
-	dealloc_cons = dc;
+	netbk->dealloc_cons = dc;
 
 	ret = HYPERVISOR_grant_table_op(
-		GNTTABOP_unmap_grant_ref, tx_unmap_ops, gop - tx_unmap_ops);
+		GNTTABOP_unmap_grant_ref, netbk->tx_unmap_ops,
+		gop - netbk->tx_unmap_ops);
 	BUG_ON(ret);
 
 	list_for_each_entry_safe(inuse, n, &list, list) {
-		pending_idx = inuse - pending_inuse;
+		struct pending_tx_info *pending_tx_info;
+		pending_ring_idx_t index;
+
+		pending_tx_info = netbk->pending_tx_info;
+		pending_idx = inuse - netbk->pending_inuse;
 
 		netif = pending_tx_info[pending_idx].netif;
 
@@ -863,9 +819,10 @@ inline static void net_tx_action_dealloc(void)
 				 NETIF_RSP_OKAY);
 
 		/* Ready for next use. */
-		gnttab_reset_grant_page(mmap_pages[pending_idx]);
+		gnttab_reset_grant_page(netbk->mmap_pages[pending_idx]);
 
-		pending_ring[pending_index(pending_prod++)] = pending_idx;
+		index = pending_index(netbk->pending_prod++);
+		netbk->pending_ring[index] = pending_idx;
 
 		netif_put(netif);
 
@@ -873,7 +830,8 @@ inline static void net_tx_action_dealloc(void)
 	}
 }
 
-static void netbk_tx_err(struct xen_netif *netif, struct xen_netif_tx_request *txp, RING_IDX end)
+static void netbk_tx_err(struct xen_netif *netif,
+		struct xen_netif_tx_request *txp, RING_IDX end)
 {
 	RING_IDX cons = netif->tx.req_cons;
 
@@ -943,9 +901,16 @@ static struct gnttab_map_grant_ref *netbk_get_requests(struct xen_netif *netif,
 	start = ((unsigned long)shinfo->frags[0].page == pending_idx);
 
 	for (i = start; i < shinfo->nr_frags; i++, txp++) {
-		pending_idx = pending_ring[pending_index(pending_cons++)];
+		int group = GET_GROUP_INDEX(netif);
+		struct xen_netbk *netbk = &xen_netbk[group];
+		pending_ring_idx_t index;
+		struct pending_tx_info *pending_tx_info =
+			netbk->pending_tx_info;
+
+		index = pending_index(netbk->pending_cons++);
+		pending_idx = netbk->pending_ring[index];
 
-		gnttab_set_map_op(mop++, idx_to_kaddr(pending_idx),
+		gnttab_set_map_op(mop++, idx_to_kaddr(group, pending_idx),
 				  GNTMAP_host_map | GNTMAP_readonly,
 				  txp->gref, netif->domid);
 
@@ -955,10 +920,10 @@ static struct gnttab_map_grant_ref *netbk_get_requests(struct xen_netif *netif,
 		frags[i].page = (void *)pending_idx;
 
 		start_tracking_page(foreign_page_tracker,
-				    mmap_pages[pending_idx],
+				    netbk->mmap_pages[pending_idx],
 				    netif->domid,
 				    pending_tx_info[pending_idx].req.gref,
-				    pending_idx,
+				    group * MAX_PENDING_REQS + pending_idx,
 				    NULL);
 	}
 
@@ -966,10 +931,12 @@ static struct gnttab_map_grant_ref *netbk_get_requests(struct xen_netif *netif,
 }
 
 static int netbk_tx_check_mop(struct sk_buff *skb,
-			       struct gnttab_map_grant_ref **mopp)
+			       struct gnttab_map_grant_ref **mopp, int group)
 {
 	struct gnttab_map_grant_ref *mop = *mopp;
 	int pending_idx = *((u16 *)skb->data);
+	struct xen_netbk *netbk = &xen_netbk[group];
+	struct pending_tx_info *pending_tx_info = netbk->pending_tx_info;
 	struct xen_netif *netif = pending_tx_info[pending_idx].netif;
 	struct xen_netif_tx_request *txp;
 	struct skb_shared_info *shinfo = skb_shinfo(skb);
@@ -979,15 +946,17 @@ static int netbk_tx_check_mop(struct sk_buff *skb,
 	/* Check status of header. */
 	err = mop->status;
 	if (unlikely(err)) {
+		pending_ring_idx_t index;
+		index = pending_index(netbk->pending_prod++);
 		txp = &pending_tx_info[pending_idx].req;
 		make_tx_response(netif, txp, NETIF_RSP_ERROR);
-		pending_ring[pending_index(pending_prod++)] = pending_idx;
+		netbk->pending_ring[index] = pending_idx;
 		netif_put(netif);
 	} else {
 		set_phys_to_machine(
-			__pa(idx_to_kaddr(pending_idx)) >> PAGE_SHIFT,
+			__pa(idx_to_kaddr(group, pending_idx)) >> PAGE_SHIFT,
 			FOREIGN_FRAME(mop->dev_bus_addr >> PAGE_SHIFT));
-		grant_tx_handle[pending_idx] = mop->handle;
+		netbk->grant_tx_handle[pending_idx] = mop->handle;
 	}
 
 	/* Skip first skb fragment if it is on same page as header fragment. */
@@ -995,26 +964,30 @@ static int netbk_tx_check_mop(struct sk_buff *skb,
 
 	for (i = start; i < nr_frags; i++) {
 		int j, newerr;
+		pending_ring_idx_t index;
 
 		pending_idx = (unsigned long)shinfo->frags[i].page;
 
 		/* Check error status: if okay then remember grant handle. */
 		newerr = (++mop)->status;
 		if (likely(!newerr)) {
+			unsigned long addr;
+			addr = idx_to_kaddr(group, pending_idx);
 			set_phys_to_machine(
-				__pa(idx_to_kaddr(pending_idx))>>PAGE_SHIFT,
+				__pa(addr)>>PAGE_SHIFT,
 				FOREIGN_FRAME(mop->dev_bus_addr>>PAGE_SHIFT));
-			grant_tx_handle[pending_idx] = mop->handle;
+			netbk->grant_tx_handle[pending_idx] = mop->handle;
 			/* Had a previous error? Invalidate this fragment. */
 			if (unlikely(err))
-				netif_idx_release(pending_idx);
+				netif_idx_release(group, pending_idx);
 			continue;
 		}
 
 		/* Error on this fragment: respond to client with an error. */
-		txp = &pending_tx_info[pending_idx].req;
+		txp = &netbk->pending_tx_info[pending_idx].req;
 		make_tx_response(netif, txp, NETIF_RSP_ERROR);
-		pending_ring[pending_index(pending_prod++)] = pending_idx;
+		index = pending_index(netbk->pending_prod++);
+		netbk->pending_ring[index] = pending_idx;
 		netif_put(netif);
 
 		/* Not the first error? Preceding frags already invalidated. */
@@ -1023,10 +996,10 @@ static int netbk_tx_check_mop(struct sk_buff *skb,
 
 		/* First error: invalidate header and preceding fragments. */
 		pending_idx = *((u16 *)skb->data);
-		netif_idx_release(pending_idx);
+		netif_idx_release(group, pending_idx);
 		for (j = start; j < i; j++) {
 			pending_idx = (unsigned long)shinfo->frags[i].page;
-			netif_idx_release(pending_idx);
+			netif_idx_release(group, pending_idx);
 		}
 
 		/* Remember the error: invalidate all subsequent fragments. */
@@ -1037,10 +1010,11 @@ static int netbk_tx_check_mop(struct sk_buff *skb,
 	return err;
 }
 
-static void netbk_fill_frags(struct sk_buff *skb)
+static void netbk_fill_frags(struct sk_buff *skb, int group)
 {
 	struct skb_shared_info *shinfo = skb_shinfo(skb);
 	int nr_frags = shinfo->nr_frags;
+	struct xen_netbk *netbk = &xen_netbk[group];
 	int i;
 
 	for (i = 0; i < nr_frags; i++) {
@@ -1050,12 +1024,12 @@ static void netbk_fill_frags(struct sk_buff *skb)
 
 		pending_idx = (unsigned long)frag->page;
 
-		pending_inuse[pending_idx].alloc_time = jiffies;
-		list_add_tail(&pending_inuse[pending_idx].list,
-			      &pending_inuse_head);
+		netbk->pending_inuse[pending_idx].alloc_time = jiffies;
+		list_add_tail(&netbk->pending_inuse[pending_idx].list,
+			      &netbk->pending_inuse_head);
 
-		txp = &pending_tx_info[pending_idx].req;
-		frag->page = virt_to_page(idx_to_kaddr(pending_idx));
+		txp = &netbk->pending_tx_info[pending_idx].req;
+		frag->page = virt_to_page(idx_to_kaddr(group, pending_idx));
 		frag->size = txp->size;
 		frag->page_offset = txp->offset;
 
@@ -1187,15 +1161,16 @@ static bool tx_credit_exceeded(struct xen_netif *netif, unsigned size)
 	return false;
 }
 
-static unsigned net_tx_build_mops(void)
+static unsigned net_tx_build_mops(int group)
 {
 	struct gnttab_map_grant_ref *mop;
 	struct sk_buff *skb;
+	struct xen_netbk *netbk = &xen_netbk[group];
 	int ret;
 
-	mop = tx_map_ops;
-	while (((nr_pending_reqs() + MAX_SKB_FRAGS) < MAX_PENDING_REQS) &&
-		!list_empty(&net_schedule_list)) {
+	mop = netbk->tx_map_ops;
+	while (((nr_pending_reqs(group) + MAX_SKB_FRAGS) < MAX_PENDING_REQS) &&
+		!list_empty(&netbk->net_schedule_list)) {
 		struct xen_netif *netif;
 		struct xen_netif_tx_request txreq;
 		struct xen_netif_tx_request txfrags[MAX_SKB_FRAGS];
@@ -1204,9 +1179,11 @@ static unsigned net_tx_build_mops(void)
 		RING_IDX idx;
 		int work_to_do;
 		unsigned int data_len;
+		pending_ring_idx_t index;
 	
 		/* Get a netif from the list with work to do. */
-		netif = list_first_entry(&net_schedule_list, struct xen_netif, list);
+		netif = list_first_entry(&netbk->net_schedule_list,
+				struct xen_netif, list);
 		netif_get(netif);
 		remove_from_net_schedule_list(netif);
 
@@ -1265,7 +1242,8 @@ static unsigned net_tx_build_mops(void)
 			continue;
 		}
 
-		pending_idx = pending_ring[pending_index(pending_cons)];
+		index = pending_index(netbk->pending_cons);
+		pending_idx = netbk->pending_ring[index];
 
 		data_len = (txreq.size > PKT_PROT_LEN &&
 			    ret < MAX_SKB_FRAGS) ?
@@ -1293,21 +1271,21 @@ static unsigned net_tx_build_mops(void)
 			}
 		}
 
-		gnttab_set_map_op(mop, idx_to_kaddr(pending_idx),
+		gnttab_set_map_op(mop, idx_to_kaddr(group, pending_idx),
 				  GNTMAP_host_map | GNTMAP_readonly,
 				  txreq.gref, netif->domid);
 		mop++;
 
 		start_tracking_page(foreign_page_tracker,
-				    mmap_pages[pending_idx],
+				    netbk->mmap_pages[pending_idx],
 				    netif->domid,
 				    txreq.gref,
-				    pending_idx,
+				    group * MAX_PENDING_REQS + pending_idx,
 				    NULL);
 
-		memcpy(&pending_tx_info[pending_idx].req,
+		memcpy(&netbk->pending_tx_info[pending_idx].req,
 		       &txreq, sizeof(txreq));
-		pending_tx_info[pending_idx].netif = netif;
+		netbk->pending_tx_info[pending_idx].netif = netif;
 		*((u16 *)skb->data) = pending_idx;
 
 		__skb_put(skb, data_len);
@@ -1322,40 +1300,41 @@ static unsigned net_tx_build_mops(void)
 			skb_shinfo(skb)->frags[0].page = (void *)~0UL;
 		}
 
-		__skb_queue_tail(&tx_queue, skb);
+		__skb_queue_tail(&netbk->tx_queue, skb);
 
-		pending_cons++;
+		netbk->pending_cons++;
 
 		mop = netbk_get_requests(netif, skb, txfrags, mop);
 
 		netif->tx.req_cons = idx;
 		netif_schedule_work(netif);
 
-		if ((mop - tx_map_ops) >= ARRAY_SIZE(tx_map_ops))
+		if ((mop - netbk->tx_map_ops) >= ARRAY_SIZE(netbk->tx_map_ops))
 			break;
 	}
 
-	return mop - tx_map_ops;
+	return mop - netbk->tx_map_ops;
 }
 
-static void net_tx_submit(void)
+static void net_tx_submit(int group)
 {
 	struct gnttab_map_grant_ref *mop;
 	struct sk_buff *skb;
+	struct xen_netbk *netbk = &xen_netbk[group];
 
-	mop = tx_map_ops;
-	while ((skb = __skb_dequeue(&tx_queue)) != NULL) {
+	mop = netbk->tx_map_ops;
+	while ((skb = __skb_dequeue(&netbk->tx_queue)) != NULL) {
 		struct xen_netif_tx_request *txp;
 		struct xen_netif *netif;
 		u16 pending_idx;
 		unsigned data_len;
 
 		pending_idx = *((u16 *)skb->data);
-		netif       = pending_tx_info[pending_idx].netif;
-		txp         = &pending_tx_info[pending_idx].req;
+		netif = netbk->pending_tx_info[pending_idx].netif;
+		txp = &netbk->pending_tx_info[pending_idx].req;
 
 		/* Check the remap error code. */
-		if (unlikely(netbk_tx_check_mop(skb, &mop))) {
+		if (unlikely(netbk_tx_check_mop(skb, &mop, group))) {
 			DPRINTK("netback grant failed.\n");
 			skb_shinfo(skb)->nr_frags = 0;
 			kfree_skb(skb);
@@ -1364,7 +1343,7 @@ static void net_tx_submit(void)
 
 		data_len = skb->len;
 		memcpy(skb->data,
-		       (void *)(idx_to_kaddr(pending_idx)|txp->offset),
+		       (void *)(idx_to_kaddr(group, pending_idx)|txp->offset),
 		       data_len);
 		if (data_len < txp->size) {
 			/* Append the packet payload as a fragment. */
@@ -1372,7 +1351,7 @@ static void net_tx_submit(void)
 			txp->size -= data_len;
 		} else {
 			/* Schedule a response immediately. */
-			netif_idx_release(pending_idx);
+			netif_idx_release(group, pending_idx);
 		}
 
 		/*
@@ -1384,7 +1363,7 @@ static void net_tx_submit(void)
 		else
 			skb->ip_summed = CHECKSUM_NONE;
 
-		netbk_fill_frags(skb);
+		netbk_fill_frags(skb, group);
 
 		skb->dev      = netif->dev;
 		skb->protocol = eth_type_trans(skb, skb->dev);
@@ -1412,65 +1391,75 @@ static void net_tx_submit(void)
 	}
 
 	if (netbk_copy_skb_mode == NETBK_DELAYED_COPY_SKB &&
-	    !list_empty(&pending_inuse_head)) {
+	    !list_empty(&netbk->pending_inuse_head)) {
 		struct netbk_tx_pending_inuse *oldest;
 
-		oldest = list_entry(pending_inuse_head.next,
+		oldest = list_entry(netbk->pending_inuse_head.next,
 				    struct netbk_tx_pending_inuse, list);
-		mod_timer(&netbk_tx_pending_timer, oldest->alloc_time + HZ);
+		mod_timer(&netbk->netbk_tx_pending_timer,
+				oldest->alloc_time + HZ);
 	}
 }
 
 /* Called after netfront has transmitted */
-static void net_tx_action(unsigned long unused)
+static void net_tx_action(unsigned long group)
 {
 	unsigned nr_mops;
+	struct xen_netbk *netbk = &xen_netbk[group];
 	int ret;
 
-	if (dealloc_cons != dealloc_prod)
-		net_tx_action_dealloc();
+	if (netbk->dealloc_cons != netbk->dealloc_prod)
+		net_tx_action_dealloc(group);
 
-	nr_mops = net_tx_build_mops();
+	nr_mops = net_tx_build_mops(group);
 
 	if (nr_mops == 0)
 		return;
 
 	ret = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref,
-					tx_map_ops, nr_mops);
+					netbk->tx_map_ops, nr_mops);
 	BUG_ON(ret);
 
-	net_tx_submit();
+	net_tx_submit(group);
 }
 
-static void netif_idx_release(u16 pending_idx)
+static void netif_idx_release(int group, u16 pending_idx)
 {
 	static DEFINE_SPINLOCK(_lock);
 	unsigned long flags;
+	struct xen_netbk *netbk = &xen_netbk[group];
+	pending_ring_idx_t index;
 
 	spin_lock_irqsave(&_lock, flags);
-	dealloc_ring[pending_index(dealloc_prod)] = pending_idx;
+	index = pending_index(netbk->dealloc_prod);
+	netbk->dealloc_ring[index] = pending_idx;
 	/* Sync with net_tx_action_dealloc: insert idx /then/ incr producer. */
 	smp_wmb();
-	dealloc_prod++;
+	netbk->dealloc_prod++;
 	spin_unlock_irqrestore(&_lock, flags);
 
-	tasklet_schedule(&net_tx_tasklet);
+	tasklet_schedule(&netbk->net_tx_tasklet);
 }
 
 static void netif_page_release(struct page *page, unsigned int order)
 {
-	int idx = netif_page_index(page);
-	BUG_ON(order);
-	BUG_ON(idx < 0);
-	netif_idx_release(idx);
+	int idx = ((struct page_ext *)(page->mapping))->idx;
+	int group = ((struct page_ext *)(page->mapping))->group;
+	struct xen_netbk *netbk = &xen_netbk[group];
+	BUG_ON(order ||
+	       group < 0 || group >= group_nr ||
+	       idx < 0 || idx >= MAX_PENDING_REQS ||
+	       netbk->mmap_pages[idx] != page);
+	netif_idx_release(group, idx);
 }
 
 irqreturn_t netif_be_int(int irq, void *dev_id)
 {
 	struct xen_netif *netif = dev_id;
+	int group = GET_GROUP_INDEX(netif);
 
 	add_to_net_schedule_list_tail(netif);
-	maybe_schedule_tx_action();
+	maybe_schedule_tx_action(group);
 
 	if (netif_schedulable(netif) && !netbk_queue_full(netif))
 		netif_wake_queue(netif->dev);
@@ -1536,13 +1525,15 @@ static struct xen_netif_rx_response *make_rx_response(struct xen_netif *netif,
 static irqreturn_t netif_be_dbg(int irq, void *dev_id, struct pt_regs *regs)
 {
 	struct list_head *ent;
-	struct xen_netif *netif;
+	struct xen_netif *netif = dev_id;
+	int group = GET_GROUP_INDEX(netif);
+	struct xen_netbk *netbk = &xen_netbk[group];
 	int i = 0;
 
 	printk(KERN_ALERT "netif_schedule_list:\n");
-	spin_lock_irq(&net_schedule_list_lock);
+	spin_lock_irq(&netbk->net_schedule_list_lock);
 
-	list_for_each (ent, &net_schedule_list) {
+	list_for_each(ent, &netbk->net_schedule_list) {
 		netif = list_entry(ent, struct xen_netif, list);
 		printk(KERN_ALERT " %d: private(rx_req_cons=%08x "
 		       "rx_resp_prod=%08x\n",
@@ -1559,7 +1550,7 @@ static irqreturn_t netif_be_dbg(int irq, void *dev_id, struct pt_regs *regs)
 		i++;
 	}
 
-	spin_unlock_irq(&net_schedule_list_lock);
+	spin_unlock_irq(&netbk->net_schedule_list_lock);
 	printk(KERN_ALERT " ** End of netif_schedule_list **\n");
 
 	return IRQ_HANDLED;
@@ -1569,6 +1560,7 @@ static irqreturn_t netif_be_dbg(int irq, void *dev_id, struct pt_regs *regs)
 static int __init netback_init(void)
 {
 	int i;
+	int group;
 	struct page *page;
 	int rc = 0;
 
@@ -1578,38 +1570,67 @@ static int __init netback_init(void)
 	/* We can increase reservation by this much in net_rx_action(). */
 //	balloon_update_driver_allowance(NET_RX_RING_SIZE);
 
-	skb_queue_head_init(&rx_queue);
-	skb_queue_head_init(&tx_queue);
-
-	init_timer(&net_timer);
-	net_timer.data = 0;
-	net_timer.function = net_alarm;
-
-	init_timer(&netbk_tx_pending_timer);
-	netbk_tx_pending_timer.data = 0;
-	netbk_tx_pending_timer.function = netbk_tx_pending_timeout;
-
-	foreign_page_tracker = alloc_page_foreign_tracker(MAX_PENDING_REQS);
-	if (!foreign_page_tracker)
-		return -ENOMEM;
-	mmap_pages = alloc_empty_pages_and_pagevec(MAX_PENDING_REQS);
-	if (mmap_pages == NULL) {
-		printk("%s: out of memory\n", __FUNCTION__);
-		free_page_foreign_tracker(foreign_page_tracker);
+	xen_netbk = kzalloc(group_nr * sizeof(struct xen_netbk), GFP_KERNEL);
+	if (!xen_netbk) {
+		printk(KERN_ALERT "%s: out of memory\n", __func__);
 		return -ENOMEM;
 	}
 
-	for (i = 0; i < MAX_PENDING_REQS; i++) {
-		page = mmap_pages[i];
-		SetPageForeign(page, netif_page_release);
-		netif_set_page_index(page, i);
-		INIT_LIST_HEAD(&pending_inuse[i].list);
+	foreign_page_tracker =
+		alloc_page_foreign_tracker(group_nr * MAX_PENDING_REQS);
+	if (!foreign_page_tracker) {
+		kfree(xen_netbk);
+		return -ENOMEM;
 	}
 
-	pending_cons = 0;
-	pending_prod = MAX_PENDING_REQS;
-	for (i = 0; i < MAX_PENDING_REQS; i++)
-		pending_ring[i] = i;
+	for (group = 0; group < group_nr; group++) {
+		struct xen_netbk *netbk = &xen_netbk[group];
+		tasklet_init(&netbk->net_tx_tasklet,
+				net_tx_action, group);
+		tasklet_init(&netbk->net_rx_tasklet,
+				net_rx_action, group);
+
+		skb_queue_head_init(&netbk->rx_queue);
+		skb_queue_head_init(&netbk->tx_queue);
+
+		netbk->mmap_pages =
+			alloc_empty_pages_and_pagevec(MAX_PENDING_REQS);
+		if (netbk->mmap_pages == NULL) {
+			printk(KERN_ALERT "%s: out of memory\n", __func__);
+			rc = -ENOMEM;
+			goto failed_init;
+		}
+
+		init_timer(&netbk->net_timer);
+		netbk->net_timer.data = (unsigned long)group;
+		netbk->net_timer.function = net_alarm;
+
+		init_timer(&netbk->netbk_tx_pending_timer);
+		netbk->netbk_tx_pending_timer.data =
+			(unsigned long)group;
+		netbk->netbk_tx_pending_timer.function =
+			netbk_tx_pending_timeout;
+
+		for (i = 0; i < MAX_PENDING_REQS; i++) {
+			page = netbk->mmap_pages[i];
+			SetPageForeign(page, netif_page_release);
+			netbk->page_extinfo[i].group = group;
+			netbk->page_extinfo[i].idx = i;
+			netif_set_page_index(page,
+					&netbk->page_extinfo[i]);
+			INIT_LIST_HEAD(&netbk->pending_inuse[i].list);
+		}
+		INIT_LIST_HEAD(&netbk->pending_inuse_head);
+		INIT_LIST_HEAD(&netbk->net_schedule_list);
+
+		netbk->pending_cons = 0;
+		netbk->pending_prod = MAX_PENDING_REQS;
+
+		for (i = 0; i < MAX_PENDING_REQS; i++)
+			netbk->pending_ring[i] = i;
+
+		spin_lock_init(&netbk->net_schedule_list_lock);
+	}
 
 	netbk_copy_skb_mode = NETBK_DONT_COPY_SKB;
 	if (MODPARM_copy_skb) {
@@ -1638,9 +1659,14 @@ static int __init netback_init(void)
 	return 0;
 
 failed_init:
-	free_empty_pages_and_pagevec(mmap_pages, MAX_PENDING_REQS);
-	del_timer(&netbk_tx_pending_timer);
-	del_timer(&net_timer);
+	for (i = 0; i < group; i++) {
+		free_empty_pages_and_pagevec(xen_netbk[i].mmap_pages,
+				MAX_PENDING_REQS);
+		del_timer(&xen_netbk[i].netbk_tx_pending_timer);
+		del_timer(&xen_netbk[i].net_timer);
+	}
+	kfree(xen_netbk);
+	free_page_foreign_tracker(foreign_page_tracker);
 	return rc;
 
 }
-- 
1.6.3


[-- Attachment #3: 0002-Netback-Multiple-tasklets-support.patch --]
[-- Type: application/octet-stream, Size: 6745 bytes --]

From b2c084555cd792a6756651324aa2883ebc14a98b Mon Sep 17 00:00:00 2001
From: Dongxiao Xu <dongxiao.xu@intel.com>
Date: Tue, 8 Dec 2009 16:36:08 +0800
Subject: [PATCH 2/3] Netback: Multiple tasklets support.
      Now netback uses one pair of tasklets for Tx/Rx data transactions. A netback
  tasklet can only run on one CPU at a time, and it is used to serve all the
  netfronts. Therefore it has become a performance bottleneck. This patch uses
  multiple tasklet pairs to replace the current single pair in dom0.
      Assuming that Dom0 has CPUNR VCPUs, we define CPUNR tasklet pairs
  (CPUNR for Tx, and CPUNR for Rx). Each pair of tasklets serves a specific group of
  netfronts. Also, for those global and static variables, we duplicate them for
  each group in order to avoid spinlocks.

Signed-off-by: Dongxiao Xu <dongxiao.xu@intel.com>
---
 drivers/xen/netback/common.h    |   13 ++++++++
 drivers/xen/netback/interface.c |   66 +++++++++++++++++++++++++++++++++++++-
 drivers/xen/netback/netback.c   |    9 ++++-
 3 files changed, 84 insertions(+), 4 deletions(-)

diff --git a/drivers/xen/netback/common.h b/drivers/xen/netback/common.h
index 8eff6c8..f343353 100644
--- a/drivers/xen/netback/common.h
+++ b/drivers/xen/netback/common.h
@@ -56,6 +56,7 @@
 struct xen_netif {
 	/* Unique identifier for this interface. */
 	domid_t          domid;
+	int              group;
 	unsigned int     handle;
 
 	u8               fe_dev_addr[6];
@@ -241,6 +242,11 @@ struct netbk_tx_pending_inuse {
 	unsigned long alloc_time;
 };
 
+struct domain_entry {
+       int domid;
+       struct list_head dom;
+};
+
 #define MAX_PENDING_REQS 256
 
 struct xen_netbk {
@@ -283,5 +289,12 @@ struct xen_netbk {
 	struct netbk_rx_meta meta[NET_RX_RING_SIZE];
 
 	spinlock_t net_schedule_list_lock;
+	spinlock_t group_domain_list_lock;
+	struct list_head group_domain_list;
+	unsigned int group_domain_nr;
 };
+
+extern struct xen_netbk *xen_netbk;
+extern int group_nr;
+extern struct page_foreign_tracker *foreign_page_tracker;
 #endif /* __NETIF__BACKEND__COMMON_H__ */
diff --git a/drivers/xen/netback/interface.c b/drivers/xen/netback/interface.c
index 21c1f95..bdf3c1d 100644
--- a/drivers/xen/netback/interface.c
+++ b/drivers/xen/netback/interface.c
@@ -54,6 +54,59 @@
 static unsigned long netbk_queue_length = 32;
 module_param_named(queue_length, netbk_queue_length, ulong, 0644);
 
+static int add_domain_to_list(struct xen_netbk *netbk, int group_nr,
+		       struct xen_netif *netif)
+{
+	struct domain_entry *dom_entry;
+	int min_group = 0;
+	int min_group_domain_nr = 0;
+	int i;
+
+	dom_entry = (struct domain_entry *)
+		kmalloc(sizeof(struct domain_entry), GFP_KERNEL);
+	if (!dom_entry)
+		return -ENOMEM;
+
+	/* Find out the list which contains least number of domain */
+	min_group_domain_nr = netbk[0].group_domain_nr;
+	for (i = 0; i < group_nr; i++) {
+		if (netbk[i].group_domain_nr < min_group_domain_nr) {
+			min_group = i;
+			min_group_domain_nr = netbk[i].group_domain_nr;
+		}
+	}
+
+	netif->group = min_group;
+	dom_entry->domid = netif->domid;
+	spin_lock(&netbk[netif->group].group_domain_list_lock);
+	list_add_tail(&dom_entry->dom,
+			&netbk[netif->group].group_domain_list);
+	netbk[netif->group].group_domain_nr++;
+	spin_unlock(&netbk[netif->group].group_domain_list_lock);
+	return netif->group;
+}
+
+static void remove_domain_from_list(struct xen_netbk *netbk,
+			     struct xen_netif *netif)
+{
+	struct domain_entry *dom_entry = NULL;
+	int group = netif->group;
+
+	list_for_each_entry(dom_entry,
+			&netbk[group].group_domain_list, dom) {
+		if (dom_entry->domid == netif->domid)
+			break;
+	}
+	if (!dom_entry)
+		return;
+
+	spin_lock(&netbk[netif->group].group_domain_list_lock);
+	netbk[netif->group].group_domain_nr--;
+	list_del(&dom_entry->dom);
+	spin_unlock(&netbk[netif->group].group_domain_list_lock);
+	kfree(dom_entry);
+}
+
 static void __netif_up(struct xen_netif *netif)
 {
 	enable_irq(netif->irq);
@@ -70,6 +123,7 @@ static int net_open(struct net_device *dev)
 {
 	struct xen_netif *netif = netdev_priv(dev);
 	if (netback_carrier_ok(netif)) {
+		add_domain_to_list(xen_netbk, group_nr, netif);
 		__netif_up(netif);
 		netif_start_queue(dev);
 	}
@@ -79,8 +133,10 @@ static int net_open(struct net_device *dev)
 static int net_close(struct net_device *dev)
 {
 	struct xen_netif *netif = netdev_priv(dev);
-	if (netback_carrier_ok(netif))
+	if (netback_carrier_ok(netif)) {
 		__netif_down(netif);
+		remove_domain_from_list(xen_netbk, netif);
+	}
 	netif_stop_queue(dev);
 	return 0;
 }
@@ -329,6 +385,9 @@ int netif_map(struct xen_netif *netif, unsigned long tx_ring_ref,
 	if (netif->rx_comms_area == NULL)
 		goto err_rx;
 
+	if (add_domain_to_list(xen_netbk, group_nr, netif) < 0)
+		goto err_map;
+
 	err = map_frontend_pages(netif, tx_ring_ref, rx_ring_ref);
 	if (err)
 		goto err_map;
@@ -361,6 +420,7 @@ int netif_map(struct xen_netif *netif, unsigned long tx_ring_ref,
 	return 0;
 err_hypervisor:
 	unmap_frontend_pages(netif);
+	remove_domain_from_list(xen_netbk, netif);
 err_map:
 	free_vm_area(netif->rx_comms_area);
 err_rx:
@@ -374,8 +434,10 @@ void netif_disconnect(struct xen_netif *netif)
 		rtnl_lock();
 		netback_carrier_off(netif);
 		netif_carrier_off(netif->dev); /* discard queued packets */
-		if (netif_running(netif->dev))
+		if (netif_running(netif->dev)) {
 			__netif_down(netif);
+			remove_domain_from_list(xen_netbk, netif);
+		}
 		rtnl_unlock();
 		netif_put(netif);
 	}
diff --git a/drivers/xen/netback/netback.c b/drivers/xen/netback/netback.c
index a484b0a..773cd4f 100644
--- a/drivers/xen/netback/netback.c
+++ b/drivers/xen/netback/netback.c
@@ -50,10 +50,10 @@
 /*define NETBE_DEBUG_INTERRUPT*/
 
 struct xen_netbk *xen_netbk;
-int group_nr = 1;
+int group_nr;
 struct page_foreign_tracker *foreign_page_tracker;
 
-#define GET_GROUP_INDEX(netif) (0)
+#define GET_GROUP_INDEX(netif) ((netif)->group)
 
 static void netif_idx_release(int group, u16 pending_idx);
 static void make_tx_response(struct xen_netif *netif,
@@ -1570,6 +1570,7 @@ static int __init netback_init(void)
 	/* We can increase reservation by this much in net_rx_action(). */
 //	balloon_update_driver_allowance(NET_RX_RING_SIZE);
 
+	group_nr = num_online_cpus();
 	xen_netbk = kzalloc(group_nr * sizeof(struct xen_netbk), GFP_KERNEL);
 	if (!xen_netbk) {
 		printk(KERN_ALERT "%s: out of memory\n", __func__);
@@ -1630,6 +1631,10 @@ static int __init netback_init(void)
 			netbk->pending_ring[i] = i;
 
 		spin_lock_init(&netbk->net_schedule_list_lock);
+
+		INIT_LIST_HEAD(&netbk->group_domain_list);
+		spin_lock_init(&netbk->group_domain_list_lock);
+		netbk->group_domain_nr = 0;
 	}
 
 	netbk_copy_skb_mode = NETBK_DONT_COPY_SKB;
-- 
1.6.3


[-- Attachment #4: 0003-Use-Kernel-thread-to-replace-the-tasklet.patch --]
[-- Type: application/octet-stream, Size: 5929 bytes --]

From db653f69fa23de737151335fe6542fcb373e1ce1 Mon Sep 17 00:00:00 2001
From: Dongxiao Xu <dongxiao.xu@intel.com>
Date: Tue, 8 Dec 2009 17:08:43 +0800
Subject: [PATCH 3/3] Use Kernel thread to replace the tasklet.
     Kernel thread has more control over QoS, and could improve
 dom0's userspace responsiveness.

Signed-off-by: Dongxiao Xu <dongxiao.xu@intel.com>
---
 drivers/xen/netback/common.h  |    4 +-
 drivers/xen/netback/netback.c |   81 +++++++++++++++++++++++++++++++++--------
 2 files changed, 68 insertions(+), 17 deletions(-)

diff --git a/drivers/xen/netback/common.h b/drivers/xen/netback/common.h
index f343353..de6e171 100644
--- a/drivers/xen/netback/common.h
+++ b/drivers/xen/netback/common.h
@@ -250,8 +250,8 @@ struct domain_entry {
 #define MAX_PENDING_REQS 256
 
 struct xen_netbk {
-	struct tasklet_struct net_tx_tasklet;
-	struct tasklet_struct net_rx_tasklet;
+	wait_queue_head_t netbk_action_wq;
+	struct task_struct *task;
 
 	struct sk_buff_head rx_queue;
 	struct sk_buff_head tx_queue;
diff --git a/drivers/xen/netback/netback.c b/drivers/xen/netback/netback.c
index 773cd4f..8b55efc 100644
--- a/drivers/xen/netback/netback.c
+++ b/drivers/xen/netback/netback.c
@@ -38,6 +38,7 @@
 
 #include <linux/tcp.h>
 #include <linux/udp.h>
+#include <linux/kthread.h>
 
 #include <xen/balloon.h>
 #include <xen/events.h>
@@ -124,7 +125,7 @@ static inline void maybe_schedule_tx_action(int group)
 	smp_mb();
 	if ((nr_pending_reqs(group) < (MAX_PENDING_REQS/2)) &&
 	    !list_empty(&netbk->net_schedule_list))
-		tasklet_schedule(&netbk->net_tx_tasklet);
+		wake_up(&netbk->netbk_action_wq);
 }
 
 static struct sk_buff *netbk_copy_skb(struct sk_buff *skb)
@@ -278,7 +279,7 @@ int netif_be_start_xmit(struct sk_buff *skb, struct net_device *dev)
 	group = GET_GROUP_INDEX(netif);
 	netbk = &xen_netbk[group];
 	skb_queue_tail(&netbk->rx_queue, skb);
-	tasklet_schedule(&netbk->net_rx_tasklet);
+	wake_up(&netbk->netbk_action_wq);
 
 	return 0;
 
@@ -609,7 +610,7 @@ static void net_rx_action(unsigned long group)
 	/* More work to do? */
 	if (!skb_queue_empty(&netbk->rx_queue) &&
 			!timer_pending(&netbk->net_timer))
-		tasklet_schedule(&netbk->net_rx_tasklet);
+		wake_up(&netbk->netbk_action_wq);
 #if 0
 	else
 		xen_network_done_notify();
@@ -619,13 +620,13 @@ static void net_rx_action(unsigned long group)
 static void net_alarm(unsigned long group)
 {
 	struct xen_netbk *netbk = &xen_netbk[group];
-	tasklet_schedule(&netbk->net_rx_tasklet);
+	wake_up(&netbk->netbk_action_wq);
 }
 
 static void netbk_tx_pending_timeout(unsigned long group)
 {
 	struct xen_netbk *netbk = &xen_netbk[group];
-	tasklet_schedule(&netbk->net_tx_tasklet);
+	wake_up(&netbk->netbk_action_wq);
 }
 
 struct net_device_stats *netif_be_get_stats(struct net_device *dev)
@@ -1386,7 +1387,7 @@ static void net_tx_submit(int group)
 			continue;
 		}
 
-		netif_rx(skb);
+		netif_rx_ni(skb);
 		netif->dev->last_rx = jiffies;
 	}
 
@@ -1438,7 +1439,7 @@ static void netif_idx_release(int group, u16 pending_idx)
 	netbk->dealloc_prod++;
 	spin_unlock_irqrestore(&_lock, flags);
 
-	tasklet_schedule(&netbk->net_tx_tasklet);
+	wake_up(&netbk->netbk_action_wq);
 }
 
 static void netif_page_release(struct page *page, unsigned int order)
@@ -1557,6 +1558,46 @@ static irqreturn_t netif_be_dbg(int irq, void *dev_id, struct pt_regs *regs)
 }
 #endif
 
+static inline int rx_work_todo(int group)
+{
+	struct xen_netbk *netbk = &xen_netbk[group];
+	return !skb_queue_empty(&netbk->rx_queue);
+}
+
+static inline int tx_work_todo(int group)
+{
+	struct xen_netbk *netbk = &xen_netbk[group];
+	if (netbk->dealloc_cons != netbk->dealloc_prod)
+		return 1;
+
+	if (((nr_pending_reqs(group) + MAX_SKB_FRAGS) < MAX_PENDING_REQS) &&
+			!list_empty(&netbk->net_schedule_list))
+		return 1;
+
+	return 0;
+}
+
+static int netbk_action_thread(void *index)
+{
+	int group = (long)index;
+	struct xen_netbk *netbk = &xen_netbk[group];
+	while (1) {
+		wait_event_interruptible(netbk->netbk_action_wq,
+				rx_work_todo(group)
+				|| tx_work_todo(group));
+		cond_resched();
+
+		if (rx_work_todo(group))
+			net_rx_action(group);
+
+		if (tx_work_todo(group))
+			net_tx_action(group);
+	}
+
+	return 0;
+}
+
+
 static int __init netback_init(void)
 {
 	int i;
@@ -1586,10 +1627,18 @@ static int __init netback_init(void)
 
 	for (group = 0; group < group_nr; group++) {
 		struct xen_netbk *netbk = &xen_netbk[group];
-		tasklet_init(&netbk->net_tx_tasklet,
-				net_tx_action, group);
-		tasklet_init(&netbk->net_rx_tasklet,
-				net_rx_action, group);
+		init_waitqueue_head(&netbk->netbk_action_wq);
+		netbk->task = kthread_create(netbk_action_thread,
+				(void *)(long)group, "netback/%u", group);
+
+		if (!IS_ERR(netbk->task)) {
+			kthread_bind(netbk->task, group);
+			wake_up_process(netbk->task);
+		} else {
+			printk(KERN_ALERT "kthread_run() fails at netback\n");
+			rc = PTR_ERR(netbk->task);
+			goto failed_init;
+		}
 
 		skb_queue_head_init(&netbk->rx_queue);
 		skb_queue_head_init(&netbk->tx_queue);
@@ -1598,6 +1647,7 @@ static int __init netback_init(void)
 			alloc_empty_pages_and_pagevec(MAX_PENDING_REQS);
 		if (netbk->mmap_pages == NULL) {
 			printk(KERN_ALERT "%s: out of memory\n", __func__);
+			kthread_stop(netbk->task);
 			rc = -ENOMEM;
 			goto failed_init;
 		}
@@ -1665,15 +1715,16 @@ static int __init netback_init(void)
 
 failed_init:
 	for (i = 0; i < group; i++) {
-		free_empty_pages_and_pagevec(xen_netbk[i].mmap_pages,
+		struct xen_netbk *netbk = &xen_netbk[i];
+		kthread_stop(netbk->task);
+		free_empty_pages_and_pagevec(xen_netbk->mmap_pages,
 				MAX_PENDING_REQS);
-		del_timer(&xen_netbk[i].netbk_tx_pending_timer);
-		del_timer(&xen_netbk[i].net_timer);
+		del_timer(&xen_netbk->netbk_tx_pending_timer);
+		del_timer(&xen_netbk->net_timer);
 	}
 	kfree(xen_netbk);
 	free_page_foreign_tracker(foreign_page_tracker);
 	return rc;
-
 }
 
 module_init(netback_init);
-- 
1.6.3


[-- Attachment #5: Type: text/plain, Size: 138 bytes --]

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xensource.com
http://lists.xensource.com/xen-devel

^ permalink raw reply related	[flat|nested] 46+ messages in thread

* Re: [Pv-ops][PATCH] Netback multiple tasklet support
  2009-12-08  9:22               ` Xu, Dongxiao
@ 2009-12-09 20:23                 ` Jeremy Fitzhardinge
  2009-12-10  3:29                   ` Xu, Dongxiao
  2009-12-10  9:07                   ` [Pv-ops][PATCH] Netback multiple tasklet support Ian Campbell
  0 siblings, 2 replies; 46+ messages in thread
From: Jeremy Fitzhardinge @ 2009-12-09 20:23 UTC (permalink / raw)
  To: Xu, Dongxiao
  Cc: Steven Smith, Ian Pratt, xen-devel@lists.xensource.com,
	Ian Campbell

On 12/08/09 01:22, Xu, Dongxiao wrote:
> Jeremy,
> 	I have revised the patch according to your suggestion. See attachment.
> 0001: Keep group number as 1, and put all the global/static variables to struct xen_netbk. Do some preparations for multiple tasklets support.
> 0002: Support for netback multiple tasklet.
> 0003: Use kernel thread to replace the tasklet in order to ensure the dom0 userspace QoS.
>    

Thanks, this looks much better.  The only thing which leapt out at me 
from a first reading is that "group_nr" is still too generic a name for 
a global symbol.  Something like "xen_netbk_nr_groups" perhaps?

Also, is it worth making it a tunable?  Presumably it needn't scale 
exactly with the number of dom0 cpus; if you only have one or two gbit 
interfaces, then you could saturate that pretty quickly with a small 
number of cpus, regardless of how many domains you have.

I've pushed this out in its own branch: 
xen/dom0/backend/netback-tasklet; please post any future patches against 
this branch.

     J

^ permalink raw reply	[flat|nested] 46+ messages in thread

* RE: [Pv-ops][PATCH] Netback multiple tasklet support
  2009-12-09 20:23                 ` Jeremy Fitzhardinge
@ 2009-12-10  3:29                   ` Xu, Dongxiao
  2009-12-10 18:01                     ` Jeremy Fitzhardinge
  2009-12-10  9:07                   ` [Pv-ops][PATCH] Netback multiple tasklet support Ian Campbell
  1 sibling, 1 reply; 46+ messages in thread
From: Xu, Dongxiao @ 2009-12-10  3:29 UTC (permalink / raw)
  To: Jeremy Fitzhardinge
  Cc: Steven Smith, Ian Pratt, xen-devel@lists.xensource.com,
	Ian Campbell

Jeremy Fitzhardinge wrote:
> On 12/08/09 01:22, Xu, Dongxiao wrote:
>> Jeremy,
>> 	I have revised the patch according to your suggestion. See
>> attachment. 0001: Keep group number as 1, and put all the
>> global/static variables to struct xen_netbk. Do some preparations
>> for multiple tasklets support. 0002: Support for netback multiple
>> tasklet. 0003: Use kernel thread to replace the tasklet in order to
>> ensure the dom0 userspace QoS.  
>> 
> 
> Thanks, this looks much better.  The only thing which lept out at me
> from a first reading is that "group_nr" is still too generic a name
> for a global symbol.  Something like "xen_netbk_nr_groups" perhaps?

Thanks, I will change the name for it. 

> 
> Also, is it worth making it a tunable?  Presumably it needn't scale
> exactly with the number of dom0 cpus; if you only have one or two gbit
> interfaces, then you could saturate that pretty quickly with a small
> number of cpus, regardless of how many domains you have.

How many CPUs serve a NIC interface is determined by how its interrupts
are delivered. If the system only has two gbit interfaces and they
deliver interrupts to CPU0 and CPU1, then two CPUs handle two tasklets
and the other CPUs are idle. group_nr just defines the maximum number of
tasklets; it doesn't decide which CPU handles a given tasklet.

> 
> I've pushed this out in its own branch:
> xen/dom0/backend/netback-tasklet; please post any future patches
> against this branch.

What's my next step for getting this netback-tasklet tree merged into xen/master? 

Thanks!
Dongxiao

> 
>      J

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [Pv-ops][PATCH] Netback multiple tasklet support
  2009-12-09 20:23                 ` Jeremy Fitzhardinge
  2009-12-10  3:29                   ` Xu, Dongxiao
@ 2009-12-10  9:07                   ` Ian Campbell
  2009-12-10 17:54                     ` Jeremy Fitzhardinge
  2010-01-13 10:17                     ` [Pv-ops][PATCH] Netback multiple tasklet support Jan Beulich
  1 sibling, 2 replies; 46+ messages in thread
From: Ian Campbell @ 2009-12-10  9:07 UTC (permalink / raw)
  To: Jeremy Fitzhardinge
  Cc: Steven Smith, Xu, Dongxiao, xen-devel@lists.xensource.com,
	Ian Pratt

On Wed, 2009-12-09 at 20:23 +0000, Jeremy Fitzhardinge wrote:
> 
> I've pushed this out in its own branch: 
> xen/dom0/backend/netback-tasklet; please post any future patches
> against 
> this branch. 

This is probably a bit pointless given the code is commented out but I
wanted to get it out there while it was in my head:

Subject: xen: ensure locking gnttab_copy_grant_page is safe against interrupts.

Now that netback processing occurs in a thread instead of a tasklet
gnttab_copy_grant_page needs to be safe against interrupts.

The code is currently commented out in this tree but on 2.6.18 we observed a
deadlock where the netback thread called gnttab_copy_grant_page, locked
gnttab_dma_lock for writing, was interrupted and on return from interrupt the
network stack's TX tasklet ended up calling __gnttab_dma_map_page via the
hardware driver->swiotlb and tries to take gnttab_dma_lock for reading.

Correct the commented code so we don't get bitten if/when it is re-enabled.

Signed-off-by: Ian Campbell <ian.campbell@citrix.com>
Cc: Jeremy Fitzhardinge <jeremy@goop.org>
Cc: "Xu, Dongxiao" <dongxiao.xu@intel.com>
---
 drivers/xen/grant-table.c |    6 +++---
 1 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/xen/grant-table.c b/drivers/xen/grant-table.c
index c653970..f25a2bc 100644
--- a/drivers/xen/grant-table.c
+++ b/drivers/xen/grant-table.c
@@ -828,14 +828,14 @@ int gnttab_copy_grant_page(grant_ref_t ref, struct page **pagep)
 	mfn = pfn_to_mfn(pfn);
 	new_mfn = virt_to_mfn(new_addr);
 
-//	write_seqlock(&gnttab_dma_lock); /* protects __gnttab_dma_map_page on 2.6.18 */
+//	write_seqlock_irq(&gnttab_dma_lock); /* protects __gnttab_dma_map_page on 2.6.18 */
 
 	/* Make seq visible before checking page_mapped. */
 	smp_mb();
 
 	/* Has the page been DMA-mapped? */
 	if (unlikely(page_mapped(page))) {
-		//write_sequnlock(&gnttab_dma_lock);
+		//write_sequnlock_irq(&gnttab_dma_lock);
 		put_page(new_page);
 		err = -EBUSY;
 		goto out;
@@ -855,7 +855,7 @@ int gnttab_copy_grant_page(grant_ref_t ref, struct page **pagep)
 	BUG_ON(err);
 	BUG_ON(unmap.status);
 
-//	write_sequnlock(&gnttab_dma_lock);
+//	write_sequnlock_irq(&gnttab_dma_lock);
 
 	if (!xen_feature(XENFEAT_auto_translated_physmap)) {
 		set_phys_to_machine(page_to_pfn(new_page), INVALID_P2M_ENTRY);
-- 
1.5.6.5

^ permalink raw reply related	[flat|nested] 46+ messages in thread

* Re: [Pv-ops][PATCH] Netback multiple tasklet support
  2009-12-10  9:07                   ` [Pv-ops][PATCH] Netback multiple tasklet support Ian Campbell
@ 2009-12-10 17:54                     ` Jeremy Fitzhardinge
  2009-12-10 18:07                       ` Ian Campbell
  2010-03-17  8:46                       ` [PATCH] [pv-ops] fix dom0 S3 when MSI is used Cui, Dexuan
  2010-01-13 10:17                     ` [Pv-ops][PATCH] Netback multiple tasklet support Jan Beulich
  1 sibling, 2 replies; 46+ messages in thread
From: Jeremy Fitzhardinge @ 2009-12-10 17:54 UTC (permalink / raw)
  To: Ian Campbell
  Cc: Steven Smith, Xu, Dongxiao, xen-devel@lists.xensource.com,
	Ian Pratt

On 12/10/09 01:07, Ian Campbell wrote:
> Subject: xen: ensure locking gnttab_copy_grant_page is safe against interrupts.
>
> Now that netback processing occurs in a thread instead of a tasklet
> gnttab_copy_grant_page needs to be safe against interrupts.
>
> The code is currently commented out in this tree but on 2.6.18 we observed a
> deadlock where the netback thread called gnttab_copy_grant_page, locked
> gnttab_dma_lock for writing, was interrupted and on return from interrupt the
> network stack's TX tasklet ended up calling __gnttab_dma_map_page via the
> hardware driver->swiotlb and tries to take gnttab_dma_lock for reading.
>
> Correct the commented code so we don't get bitten if/when it is re-enabled.
>    

What's the issue here?  I'm inclined to just remove the commented-out 
code if it isn't being used, and re-evaluate the locking if/when it 
becomes necessary.

     J

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [Pv-ops][PATCH] Netback multiple tasklet support
  2009-12-10  3:29                   ` Xu, Dongxiao
@ 2009-12-10 18:01                     ` Jeremy Fitzhardinge
  2009-12-11  1:34                       ` Xu, Dongxiao
  2010-04-26 14:27                       ` [Pv-ops][PATCH 0/3] Resend: Netback multiple thread support Xu, Dongxiao
  0 siblings, 2 replies; 46+ messages in thread
From: Jeremy Fitzhardinge @ 2009-12-10 18:01 UTC (permalink / raw)
  To: Xu, Dongxiao
  Cc: Steven Smith, Ian Pratt, xen-devel@lists.xensource.com,
	Ian Campbell

On 12/09/09 19:29, Xu, Dongxiao wrote:
>> Also, is it worth making it a tunable?  Presumably it needn't scale
>> exactly with the number of dom0 cpus; if you only have one or two gbit
>> interfaces, then you could saturate that pretty quickly with a small
>> number of cpus, regardless of how many domains you have.
>>      
> How many CPUs are serving for the NIC interface is determined by how
> interrupt is delivered. If system only has two gbit interfaces, and they
> delivier interrupts to CPU0 and CPU1, then the case is: two CPUs handle
> two tasklets. Other CPUs are idle. The group_nr just defines the max
> number of tasklets, however it doesn't decide how tasklet is handled by
> CPU.
>    

So does this mean that a given vcpu will be used to handle the interrupt 
if happens to be running on a pcpu with affinity for the device?  Or 
that particular devices will be handled by particular vcpus?

>> I've pushed this out in its own branch:
>> xen/dom0/backend/netback-tasklet; please post any future patches
>> against this branch.
>>      
> What's my next step for this netback-tasklet tree merging into xen/master?
>    

Hm, well, I guess:

    * I'd like to see some comments Keir/Ian(s)/others that this is
      basically the right approach.  It looks OK to me, but I don't have
      much experience with performance in the field.
          o does nc2 make nc1 obsolete?
    * Testing to make sure it really works.  Netback is clearly critical
      functionality, so I'd like to be sure we're not introducing big
      regressions

     J

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [Pv-ops][PATCH] Netback multiple tasklet support
  2009-12-10 17:54                     ` Jeremy Fitzhardinge
@ 2009-12-10 18:07                       ` Ian Campbell
  2009-12-11  8:34                         ` Jan Beulich
  2010-03-17  8:46                       ` [PATCH] [pv-ops] fix dom0 S3 when MSI is used Cui, Dexuan
  1 sibling, 1 reply; 46+ messages in thread
From: Ian Campbell @ 2009-12-10 18:07 UTC (permalink / raw)
  To: Jeremy Fitzhardinge
  Cc: Steven Smith, Xu, Dongxiao, xen-devel@lists.xensource.com,
	Ian Pratt

On Thu, 2009-12-10 at 17:54 +0000, Jeremy Fitzhardinge wrote: 
> On 12/10/09 01:07, Ian Campbell wrote:
> > Subject: xen: ensure locking gnttab_copy_grant_page is safe against interrupts.
> >
> > Now that netback processing occurs in a thread instead of a tasklet
> > gnttab_copy_grant_page needs to be safe against interrupts.
> >
> > The code is currently commented out in this tree but on 2.6.18 we observed a
> > deadlock where the netback thread called gnttab_copy_grant_page, locked
> > gnttab_dma_lock for writing, was interrupted and on return from interrupt the
> > network stack's TX tasklet ended up calling __gnttab_dma_map_page via the
> > hardware driver->swiotlb and tries to take gnttab_dma_lock for reading.
> >
> > Correct the commented code so we don't get bitten if/when it is re-enabled.
> >    
> 
> What's the issue here?

a deadlock if someone naively uncomments the existing code.

>   I'm inclined to just remove the commented-out 
> code if it isn't being used, and re-evaluate the locking if/when it 
> becomes necessary.

that would be fine also.

Ian.

^ permalink raw reply	[flat|nested] 46+ messages in thread

* RE: [Pv-ops][PATCH] Netback multiple tasklet support
  2009-12-10 18:01                     ` Jeremy Fitzhardinge
@ 2009-12-11  1:34                       ` Xu, Dongxiao
  2010-04-26 14:27                       ` [Pv-ops][PATCH 0/3] Resend: Netback multiple thread support Xu, Dongxiao
  1 sibling, 0 replies; 46+ messages in thread
From: Xu, Dongxiao @ 2009-12-11  1:34 UTC (permalink / raw)
  To: Jeremy Fitzhardinge
  Cc: Steven Smith, Ian Pratt, xen-devel@lists.xensource.com,
	Ian Campbell

Jeremy Fitzhardinge wrote:
> On 12/09/09 19:29, Xu, Dongxiao wrote:
>>> Also, is it worth making it a tunable?  Presumably it needn't scale
>>> exactly with the number of dom0 cpus; if you only have one or two
>>> gbit interfaces, then you could saturate that pretty quickly with a
>>> small number of cpus, regardless of how many domains you have.
>>> 
>> How many CPUs are serving for the NIC interface is determined by how
>> interrupt is delivered. If system only has two gbit interfaces, and
>> they delivier interrupts to CPU0 and CPU1, then the case is: two
>> CPUs handle two tasklets. Other CPUs are idle. The group_nr just
>> defines the max number of tasklets, however it doesn't decide how
>> tasklet is handled by CPU. 
>> 
> 
> So does this mean that a given vcpu will be used to handle the
> interrupt if happens to be running on a pcpu with affinity for the
> device?  Or that particular devices will be handled by particular
> vcpus? 

If the NIC device is owned by Dom0, then its interrupt affinity relates
to Dom0's *VCPUs* (I think it's not the PCPU). Which VCPU will handle
the device interrupt is determined by the interrupt affinity, either set
manually by commands such as:
"echo XXX > /proc/irq/<irq_num>/smp_affinity", or automatically
adjusted by the irqbalance daemon.

> 
>>> I've pushed this out in its own branch:
>>> xen/dom0/backend/netback-tasklet; please post any future patches
>>> against this branch. 
>>> 
>> What's my next step for this netback-tasklet tree merging into
>> xen/master? 
>> 
> 
> Hm, well, I guess:
> 
>     * I'd like to see some comments Keir/Ian(s)/others that this is
>       basically the right approach.  It looks OK to me, but I don't
>       have much experience with performance in the field.
>           o does nc2 make nc1 obsolete?
>     * Testing to make sure it really works.  Netback is clearly
>       critical functionality, so I'd like to be sure we're not
>       introducing big regressions

I will do another round of testing for this patch and will get back to you with the results. 

Thanks!
Dongxiao
> 
>      J

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [Pv-ops][PATCH] Netback multiple tasklet support
  2009-12-10 18:07                       ` Ian Campbell
@ 2009-12-11  8:34                         ` Jan Beulich
  2009-12-11  9:34                           ` Ian Campbell
  0 siblings, 1 reply; 46+ messages in thread
From: Jan Beulich @ 2009-12-11  8:34 UTC (permalink / raw)
  To: Ian Campbell, Jeremy Fitzhardinge
  Cc: Steven Smith, Ian Pratt, xen-devel@lists.xensource.com,
	Dongxiao Xu

>>> Ian Campbell <Ian.Campbell@citrix.com> 10.12.09 19:07 >>>
>On Thu, 2009-12-10 at 17:54 +0000, Jeremy Fitzhardinge wrote: 
>> On 12/10/09 01:07, Ian Campbell wrote:
>> > Subject: xen: ensure locking gnttab_copy_grant_page is safe against interrupts.
>> >
>> > Now that netback processing occurs in a thread instead of a tasklet
>> > gnttab_copy_grant_page needs to be safe against interrupts.
>> >
>> > The code is currently commented out in this tree but on 2.6.18 we observed a
>> > deadlock where the netback thread called gnttab_copy_grant_page, locked
>> > gnttab_dma_lock for writing, was interrupted and on return from interrupt the
>> > network stack's TX tasklet ended up calling __gnttab_dma_map_page via the
>> > hardware driver->swiotlb and tries to take gnttab_dma_lock for reading.
>> >
>> > Correct the commented code so we don't get bitten if/when it is re-enabled.
>> >    
>> 
>> What's the issue here?
>
>a deadlock if someone naively uncomments the existing code.

Btw., can any of you explain why 2.6.18 needs this (and the related) code,
but pv-ops doesn't?

Thanks, Jan

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [Pv-ops][PATCH] Netback multiple tasklet  support
  2009-12-11  8:34                         ` Jan Beulich
@ 2009-12-11  9:34                           ` Ian Campbell
  2009-12-11 14:24                             ` Konrad Rzeszutek Wilk
  0 siblings, 1 reply; 46+ messages in thread
From: Ian Campbell @ 2009-12-11  9:34 UTC (permalink / raw)
  To: Jan Beulich
  Cc: Steven Smith, Jeremy Fitzhardinge, xen-devel@lists.xensource.com,
	Ian Pratt, Dongxiao Xu

On Fri, 2009-12-11 at 08:34 +0000, Jan Beulich wrote:
> 
> Btw., can any of you explain why 2.6.18 needs this (and the related)
> code,but pv-ops doesn't? 

The 2.6.18 bits came from
http://xenbits.xensource.com/linux-2.6.18-xen.hg?rev/a395e58bd234

I can't immediately see why this wouldn't apply to pv ops too. There are
comments in drivers/pci/xen-iommu.c (e.g. in xen_dma_unmap_page) which
suggest someone has thought about it at some point...

Ian.

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [Pv-ops][PATCH] Netback multiple tasklet  support
  2009-12-11  9:34                           ` Ian Campbell
@ 2009-12-11 14:24                             ` Konrad Rzeszutek Wilk
  0 siblings, 0 replies; 46+ messages in thread
From: Konrad Rzeszutek Wilk @ 2009-12-11 14:24 UTC (permalink / raw)
  To: Ian Campbell
  Cc: Jeremy Fitzhardinge, xen-devel@lists.xensource.com, Ian Pratt,
	Jan Beulich, Dongxiao Xu, Steven Smith

On Fri, Dec 11, 2009 at 09:34:42AM +0000, Ian Campbell wrote:
> On Fri, 2009-12-11 at 08:34 +0000, Jan Beulich wrote:
> > 
> > Btw., can any of you explain why 2.6.18 needs this (and the related)
> > code,but pv-ops doesn't? 
> 
> The 2.6.18 bits came from
> http://xenbits.xensource.com/linux-2.6.18-xen.hg?rev/a395e58bd234

Oh boy. That code (and its friends):
/* Has the page been DMA-mapped? */
+	if (unlikely(page_mapped(page))) {

caused us at Virtual Iron so much headache. If a guest
had iSCSI-backed storage, and the page was grant-mapped back
to Dom0, in some cases we would see two page references taken with one
put - meaning that 'page_mapped(page)' would be true forever.
Joshua Nicholas traced it down to the TCP stack
taking a page reference and never giving it up. The end result is
that during migration (and shutdown) the guest would never die due
to outstanding page grants.

We never debugged the TCP stack itself, but we implemented in blkback a
mechanism similar to what netback does - squash the page reference
when done. Not the cleanest, but it did the job.

> 
> I can't immediately see why this wouldn't apply to pv ops too. There are
> comments in drivers/pci/xen-iommu.c (e.g. in xen_dma_unmap_page) which
> suggest someone has though about it at some point...

Let's wait and see if it's needed?

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [Pv-ops][PATCH] Netback multiple tasklet support
  2009-12-10  9:07                   ` [Pv-ops][PATCH] Netback multiple tasklet support Ian Campbell
  2009-12-10 17:54                     ` Jeremy Fitzhardinge
@ 2010-01-13 10:17                     ` Jan Beulich
  2010-01-14 16:55                       ` Ian Campbell
  1 sibling, 1 reply; 46+ messages in thread
From: Jan Beulich @ 2010-01-13 10:17 UTC (permalink / raw)
  To: Ian Campbell
  Cc: Steven Smith, Ian Pratt, xen-devel@lists.xensource.com,
	Jeremy Fitzhardinge, Dongxiao Xu

>>> Ian Campbell <Ian.Campbell@citrix.com> 10.12.09 10:07 >>>
>Subject: xen: ensure locking gnttab_copy_grant_page is safe against interrupts.
>
>Now that netback processing occurs in a thread instead of a tasklet
>gnttab_copy_grant_page needs to be safe against interrupts.
>
>The code is currently commented out in this tree but on 2.6.18 we observed a
>deadlock where the netback thread called gnttab_copy_grant_page, locked
>gnttab_dma_lock for writing, was interrupted and on return from interrupt the
>network stack's TX tasklet ended up calling __gnttab_dma_map_page via the
>hardware driver->swiotlb and tries to take gnttab_dma_lock for reading.
>
>Correct the commented code so we don't get bitten if/when it is re-enabled.

Wouldn't safe-against-softirq be sufficient then? Or if not (i.e. if meant
to be generic), wouldn't the irq-safe variant need to be used
independent of the new feature (and then also in the 2.6.18 tree)?

Jan

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [Pv-ops][PATCH] Netback multiple tasklet  support
  2010-01-13 10:17                     ` [Pv-ops][PATCH] Netback multiple tasklet support Jan Beulich
@ 2010-01-14 16:55                       ` Ian Campbell
  0 siblings, 0 replies; 46+ messages in thread
From: Ian Campbell @ 2010-01-14 16:55 UTC (permalink / raw)
  To: Jan Beulich
  Cc: Jeremy Fitzhardinge, xen-devel@lists.xensource.com, Xu, Ian Pratt,
	Steven Smith, Dongxiao

On Wed, 2010-01-13 at 10:17 +0000, Jan Beulich wrote:
> >>> Ian Campbell <Ian.Campbell@citrix.com> 10.12.09 10:07 >>>
> >Subject: xen: ensure locking gnttab_copy_grant_page is safe against interrupts.
> >
> >Now that netback processing occurs in a thread instead of a tasklet
> >gnttab_copy_grant_page needs to be safe against interrupts.
> >
> >The code is currently commented out in this tree but on 2.6.18 we observed a
> >deadlock where the netback thread called gnttab_copy_grant_page, locked
> >gnttab_dma_lock for writing, was interrupted and on return from interrupt the
> >network stack's TX tasklet ended up calling __gnttab_dma_map_page via the
> >hardware driver->swiotlb and tries to take gnttab_dma_lock for reading.
> >
> >Correct the commented code so we don't get bitten if/when it is re-enabled.
> 
> Wouldn't safe-against-softirq be sufficient then?

Yes, I guess so.
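
Illustratively (a sketch only, not a proposed patch), the softirq-safe
variant would be along these lines, assuming the only contending reader
runs from softirq context such as the network stack's TX tasklet:

	/* Sketch only: disable softirqs, not hard IRQs, around the writer. */
	write_seqlock_bh(&gnttab_dma_lock);
	/* ... switch the grant-backed page as gnttab_copy_grant_page does ... */
	write_sequnlock_bh(&gnttab_dma_lock);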

> Or if not (i.e. if meant to be generic), wouldn't the irq-safe variant need to be used
> independent of the new feature (and then also in the 2.6.18 tree)?

Ian.

^ permalink raw reply	[flat|nested] 46+ messages in thread

* [PATCH] [pv-ops] fix dom0 S3 when MSI is used.
  2009-12-10 17:54                     ` Jeremy Fitzhardinge
  2009-12-10 18:07                       ` Ian Campbell
@ 2010-03-17  8:46                       ` Cui, Dexuan
  2010-03-17 14:28                         ` Konrad Rzeszutek Wilk
  2010-03-19  1:03                         ` Jeremy Fitzhardinge
  1 sibling, 2 replies; 46+ messages in thread
From: Cui, Dexuan @ 2010-03-17  8:46 UTC (permalink / raw)
  To: Jeremy Fitzhardinge; +Cc: xen-devel@lists.xensource.com

[-- Attachment #1: Type: text/plain, Size: 1934 bytes --]

The old commit a234848f works only when the device supports D3hot; when the
device only supports D3cold, the device doesn't work properly after resuming
from Dom0 S3.
A better workaround is invoking the PHYSDEVOP_restore_msi hypercall.
The patch reverts the old commit and invokes the hypercall.

Signed-off-by: Dexuan Cui <dexuan.cui@intel.com>

diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c
index b40c6d0..c6bffe2 100644
--- a/drivers/pci/msi.c
+++ b/drivers/pci/msi.c
@@ -20,6 +20,7 @@
 #include <asm/errno.h>
 #include <asm/io.h>
 
+#include <asm/xen/hypercall.h>
 #include <asm/xen/hypervisor.h>
 
 #include "pci.h"
@@ -271,8 +272,7 @@ void write_msi_msg(unsigned int irq, struct msi_msg *msg)
 {
 	struct irq_desc *desc = irq_to_desc(irq);
 
-	if (!xen_initial_domain())
-		write_msi_msg_desc(desc, msg);
+	write_msi_msg_desc(desc, msg);
 }
 
 static int msi_free_irqs(struct pci_dev* dev);
@@ -347,6 +347,20 @@ static void __pci_restore_msix_state(struct pci_dev *dev)
 
 void pci_restore_msi_state(struct pci_dev *dev)
 {
+	if (xen_initial_domain()) {
+		struct physdev_restore_msi physdev;
+
+		if (!dev->msi_enabled && !dev->msix_enabled)
+			return;
+
+		pci_intx_for_msi(dev, 0);
+
+		physdev.bus = dev->bus->number;
+		physdev.devfn = dev->devfn;
+		HYPERVISOR_physdev_op(PHYSDEVOP_restore_msi, &physdev);
+
+		return;
+	}
 	__pci_restore_msi_state(dev);
 	__pci_restore_msix_state(dev);
 }
diff --git a/include/xen/interface/physdev.h b/include/xen/interface/physdev.h
index ac5de37..516d2b4 100644
--- a/include/xen/interface/physdev.h
+++ b/include/xen/interface/physdev.h
@@ -144,6 +144,13 @@ struct physdev_manage_pci {
 	uint8_t devfn;
 };
 
+#define PHYSDEVOP_restore_msi		19
+struct physdev_restore_msi {
+	/* IN */
+	uint8_t bus;
+	uint8_t devfn;
+};
+
 #define PHYSDEVOP_manage_pci_add_ext	20
 struct physdev_manage_pci_ext {
 	/* IN */

[-- Attachment #2: fix_pvops_dom0_s3.patch --]
[-- Type: application/octet-stream, Size: 1896 bytes --]

fix dom0 S3 when MSI is used.

The old commit a234848f works only when the device supports D3hot; when the
device only supports D3cold, the device doesn't work properly after resuming
from Dom0 S3.
A better workaround is invoking the PHYSDEVOP_restore_msi hypercall.
The patch reverts the old commit and invokes the hypercall.

Signed-off-by: Dexuan Cui <dexuan.cui@intel.com>

diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c
index b40c6d0..c6bffe2 100644
--- a/drivers/pci/msi.c
+++ b/drivers/pci/msi.c
@@ -20,6 +20,7 @@
 #include <asm/errno.h>
 #include <asm/io.h>
 
+#include <asm/xen/hypercall.h>
 #include <asm/xen/hypervisor.h>
 
 #include "pci.h"
@@ -271,8 +272,7 @@ void write_msi_msg(unsigned int irq, struct msi_msg *msg)
 {
 	struct irq_desc *desc = irq_to_desc(irq);
 
-	if (!xen_initial_domain())
-		write_msi_msg_desc(desc, msg);
+	write_msi_msg_desc(desc, msg);
 }
 
 static int msi_free_irqs(struct pci_dev* dev);
@@ -347,6 +347,20 @@ static void __pci_restore_msix_state(struct pci_dev *dev)
 
 void pci_restore_msi_state(struct pci_dev *dev)
 {
+	if (xen_initial_domain()) {
+		struct physdev_restore_msi physdev;
+
+		if (!dev->msi_enabled && !dev->msix_enabled)
+			return;
+
+		pci_intx_for_msi(dev, 0);
+
+		physdev.bus = dev->bus->number;
+		physdev.devfn = dev->devfn;
+		HYPERVISOR_physdev_op(PHYSDEVOP_restore_msi, &physdev);
+
+		return;
+	}
 	__pci_restore_msi_state(dev);
 	__pci_restore_msix_state(dev);
 }
diff --git a/include/xen/interface/physdev.h b/include/xen/interface/physdev.h
index ac5de37..516d2b4 100644
--- a/include/xen/interface/physdev.h
+++ b/include/xen/interface/physdev.h
@@ -144,6 +144,13 @@ struct physdev_manage_pci {
 	uint8_t devfn;
 };
 
+#define PHYSDEVOP_restore_msi		19
+struct physdev_restore_msi {
+	/* IN */
+	uint8_t bus;
+	uint8_t devfn;
+};
+
 #define PHYSDEVOP_manage_pci_add_ext	20
 struct physdev_manage_pci_ext {
 	/* IN */

[-- Attachment #3: Type: text/plain, Size: 138 bytes --]

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xensource.com
http://lists.xensource.com/xen-devel

^ permalink raw reply related	[flat|nested] 46+ messages in thread

* Re: [PATCH] [pv-ops] fix dom0 S3 when MSI is used.
  2010-03-17  8:46                       ` [PATCH] [pv-ops] fix dom0 S3 when MSI is used Cui, Dexuan
@ 2010-03-17 14:28                         ` Konrad Rzeszutek Wilk
  2010-03-18  3:05                           ` Cui, Dexuan
  2010-03-19  1:04                           ` Jeremy Fitzhardinge
  2010-03-19  1:03                         ` Jeremy Fitzhardinge
  1 sibling, 2 replies; 46+ messages in thread
From: Konrad Rzeszutek Wilk @ 2010-03-17 14:28 UTC (permalink / raw)
  To: Cui, Dexuan; +Cc: Jeremy Fitzhardinge, xen-devel@lists.xensource.com

On Wed, Mar 17, 2010 at 04:46:47PM +0800, Cui, Dexuan wrote:
> The old commit a234848f works only when the device supports D3hot; when the
> device only supports D3cold, the device doesn't work properly after resuming
> from Dom0 S3.
> A better workaround is invoking the PHYSDEVOP_restore_msi hypercall.
> The patch reverts the old commit and invokes the hypercall.
> 
> Signed-off-by: Dexuan Cui <dexuan.cui@intel.com>
> 
> diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c
> index b40c6d0..c6bffe2 100644
> --- a/drivers/pci/msi.c
> +++ b/drivers/pci/msi.c
> @@ -20,6 +20,7 @@
>  #include <asm/errno.h>
>  #include <asm/io.h>
>  
> +#include <asm/xen/hypercall.h>
>  #include <asm/xen/hypervisor.h>
>  
>  #include "pci.h"
> @@ -271,8 +272,7 @@ void write_msi_msg(unsigned int irq, struct msi_msg *msg)
>  {
>  	struct irq_desc *desc = irq_to_desc(irq);
>  
> -	if (!xen_initial_domain())
> -		write_msi_msg_desc(desc, msg);
> +	write_msi_msg_desc(desc, msg);

Nice. That will remove the other platform build problem.
>  }
>  
>  static int msi_free_irqs(struct pci_dev* dev);
> @@ -347,6 +347,20 @@ static void __pci_restore_msix_state(struct pci_dev *dev)
>  
>  void pci_restore_msi_state(struct pci_dev *dev)
>  {
> +	if (xen_initial_domain()) {

That won't do. If you try to compile this kernel on other platforms (say
PPC), this will throw a huge problem.

> +		struct physdev_restore_msi physdev;
> +
> +		if (!dev->msi_enabled && !dev->msix_enabled)
> +			return;

This seems redundant.

I think the problem you are trying to address is going to show up when
doing PCI passthrough using pciback and xen-pcifront.

The mechanism that was employed there to make it work was to utilize
the arch_setup_msi_irqs hook in arch/x86/kernel/apic/io_apic.c and make it
call functions in arch/x86/pci/xen.c. That code then figures out if you
are running in privileged or unprivileged mode and makes the appropriate
call.

Perhaps you should use that idea and expand on it. I would suggest you
take a look at how PPC implements this and see if there is something
that can be borrowed from their mechanism.
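
A purely hypothetical sketch of the dispatch pattern I mean (the xen_*
and native_* helper names below are assumptions, not symbols in this
tree):

	/* Hypothetical sketch: the arch hook decides at run time
	 * whether to take the Xen path or the native one. */
	int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
	{
		if (xen_domain())
			return xen_setup_msi_irqs(dev, nvec, type);
		return native_setup_msi_irqs(dev, nvec, type);
	}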

> +
> +		pci_intx_for_msi(dev, 0);
> +
> +		physdev.bus = dev->bus->number;
> +		physdev.devfn = dev->devfn;
> +		HYPERVISOR_physdev_op(PHYSDEVOP_restore_msi, &physdev);
> +
> +		return;
> +	}
>  	__pci_restore_msi_state(dev);
>  	__pci_restore_msix_state(dev);
>  }
> diff --git a/include/xen/interface/physdev.h b/include/xen/interface/physdev.h
> index ac5de37..516d2b4 100644
> --- a/include/xen/interface/physdev.h
> +++ b/include/xen/interface/physdev.h
> @@ -144,6 +144,13 @@ struct physdev_manage_pci {
>  	uint8_t devfn;
>  };
>  
> +#define PHYSDEVOP_restore_msi		19
> +struct physdev_restore_msi {
> +	/* IN */
> +	uint8_t bus;
> +	uint8_t devfn;
> +};
> +
>  #define PHYSDEVOP_manage_pci_add_ext	20
>  struct physdev_manage_pci_ext {
>  	/* IN */


> _______________________________________________
> Xen-devel mailing list
> Xen-devel@lists.xensource.com
> http://lists.xensource.com/xen-devel

^ permalink raw reply	[flat|nested] 46+ messages in thread

* RE: [PATCH] [pv-ops] fix dom0 S3 when MSI is used.
  2010-03-17 14:28                         ` Konrad Rzeszutek Wilk
@ 2010-03-18  3:05                           ` Cui, Dexuan
  2010-03-19  1:04                           ` Jeremy Fitzhardinge
  1 sibling, 0 replies; 46+ messages in thread
From: Cui, Dexuan @ 2010-03-18  3:05 UTC (permalink / raw)
  To: Konrad Rzeszutek Wilk; +Cc: Jeremy Fitzhardinge, xen-devel@lists.xensource.com

Konrad Rzeszutek Wilk wrote:
> On Wed, Mar 17, 2010 at 04:46:47PM +0800, Cui, Dexuan wrote:
>> The old commit a234848f works only when the device supports D3hot;
>> when the device only supports D3cold, the device doesn't work
>> properly after resuming from Dom0 S3. A better workaround is
>> invoking the PHYSDEVOP_restore_msi hypercall. 
>> The patch reverts the old commit and invokes the hypercall.
>> 
>> Signed-off-by: Dexuan Cui <dexuan.cui@intel.com>
>> 
>> diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c
>> index b40c6d0..c6bffe2 100644
>> --- a/drivers/pci/msi.c
>> +++ b/drivers/pci/msi.c
>> @@ -20,6 +20,7 @@
>>  #include <asm/errno.h>
>>  #include <asm/io.h>
>> 
>> +#include <asm/xen/hypercall.h>
>>  #include <asm/xen/hypervisor.h>
>> 
>>  #include "pci.h"
>> @@ -271,8 +272,7 @@ void write_msi_msg(unsigned int irq, struct
>>  	msi_msg *msg)  { struct irq_desc *desc = irq_to_desc(irq);
>> 
>> -	if (!xen_initial_domain())
>> -		write_msi_msg_desc(desc, msg);
>> +	write_msi_msg_desc(desc, msg);
> 
> Nice. That will remove the other platofmr build problem.
>>  }
>> 
>>  static int msi_free_irqs(struct pci_dev* dev);
>> @@ -347,6 +347,20 @@ static void __pci_restore_msix_state(struct
>> pci_dev *dev) 
>> 
>>  void pci_restore_msi_state(struct pci_dev *dev)
>>  {
>> +	if (xen_initial_domain()) {
> 
> That won't do. If you try to compile this kernel on other platforms
Hi Konrad,
Thanks for the comments!
The patch is meant as "a better workaround" for the actual Dom0 S3 issue for now.
I think the MSI implementation in pv-ops dom0 will be reworked in the future, just like the IOAPIC rework that happened.
This patch is not meant to fix all the existing issues. :-)

> (say PPC), this will throw a huge problem.
> 
>> +		struct physdev_restore_msi physdev;
>> +
>> +		if (!dev->msi_enabled && !dev->msix_enabled)
>> +			return;
> 
> This seems redundant.
It doesn't seem redundant:
if (!dev->msi_enabled && !dev->msix_enabled), we don't need to go any further and invoke the hypercall.

> 
> I think the problem you are trying to address is doing to show up when
> doing PC passthrough using pciback and xen-pcifront.
Actually the bug I'm trying to fix is limited to the pv dom0 itself:
On my host, after Dom0 S3 resume, the SATA disk doesn't work properly in Dom0 and Dom0's filesystem becomes inaccessible.
With pci=nomsi, or with the patch, Dom0 S3 works fine.

> 
> The mechanims that was employed there to make it work, was to utilize
> the arch_setup_msi_irqs in (arch/x86/kernel/apic/io_apic.c) and make
> it call functions in arch/x86/pci/xen.c. That code then figures out
> if you are running in priviliged or un-priviliged mode and makes the
> appropiate call.
> 
> Perhaps you should using that idea and expand on it. I would suggest
> you take a look at the how PPC implements this and see if there is
> something that can be borrowed from their mechanism.
Agree. I  think this could be considered in the coming MSI rework in pv-ops dom0.

Thanks,
-- Dexuan

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH] [pv-ops] fix dom0 S3 when MSI is used.
  2010-03-17  8:46                       ` [PATCH] [pv-ops] fix dom0 S3 when MSI is used Cui, Dexuan
  2010-03-17 14:28                         ` Konrad Rzeszutek Wilk
@ 2010-03-19  1:03                         ` Jeremy Fitzhardinge
  2010-03-19  1:29                           ` Cui, Dexuan
  1 sibling, 1 reply; 46+ messages in thread
From: Jeremy Fitzhardinge @ 2010-03-19  1:03 UTC (permalink / raw)
  To: Cui, Dexuan; +Cc: xen-devel@lists.xensource.com

On 03/17/2010 01:46 AM, Cui, Dexuan wrote:
> The old commit a234848f works only when the device supports D3hot; when the
> device only supports D3cold, the device doesn't work properly after resuming
> from Dom0 S3.
>    
What branches should I apply this to?  Both 2.6.31 and .32, or just .32?
> A better workaround is invoking the PHYSDEVOP_restore_msi hypercall.
> The patch reverts the old commit and invokes the hypercall.
>    
Is this a new hypercall?

Aside from that, it looks fine as a stopgap until we do MSI properly.

Thanks,
     J

> Signed-off-by: Dexuan Cui<dexuan.cui@intel.com>
>
> diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c
> index b40c6d0..c6bffe2 100644
> --- a/drivers/pci/msi.c
> +++ b/drivers/pci/msi.c
> @@ -20,6 +20,7 @@
>   #include<asm/errno.h>
>   #include<asm/io.h>
>
> +#include<asm/xen/hypercall.h>
>   #include<asm/xen/hypervisor.h>
>
>   #include "pci.h"
> @@ -271,8 +272,7 @@ void write_msi_msg(unsigned int irq, struct msi_msg *msg)
>   {
>   	struct irq_desc *desc = irq_to_desc(irq);
>
> -	if (!xen_initial_domain())
> -		write_msi_msg_desc(desc, msg);
> +	write_msi_msg_desc(desc, msg);
>   }
>
>   static int msi_free_irqs(struct pci_dev* dev);
> @@ -347,6 +347,20 @@ static void __pci_restore_msix_state(struct pci_dev *dev)
>
>   void pci_restore_msi_state(struct pci_dev *dev)
>   {
> +	if (xen_initial_domain()) {
> +		struct physdev_restore_msi physdev;
> +
> +		if (!dev->msi_enabled&&  !dev->msix_enabled)
> +			return;
> +
> +		pci_intx_for_msi(dev, 0);
> +
> +		physdev.bus = dev->bus->number;
> +		physdev.devfn = dev->devfn;
> +		HYPERVISOR_physdev_op(PHYSDEVOP_restore_msi,&physdev);
> +
> +		return;
> +	}
>   	__pci_restore_msi_state(dev);
>   	__pci_restore_msix_state(dev);
>   }
> diff --git a/include/xen/interface/physdev.h b/include/xen/interface/physdev.h
> index ac5de37..516d2b4 100644
> --- a/include/xen/interface/physdev.h
> +++ b/include/xen/interface/physdev.h
> @@ -144,6 +144,13 @@ struct physdev_manage_pci {
>   	uint8_t devfn;
>   };
>
> +#define PHYSDEVOP_restore_msi		19
> +struct physdev_restore_msi {
> +	/* IN */
> +	uint8_t bus;
> +	uint8_t devfn;
> +};
> +
>   #define PHYSDEVOP_manage_pci_add_ext	20
>   struct physdev_manage_pci_ext {
>   	/* IN */
>    

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH] [pv-ops] fix dom0 S3 when MSI is used.
  2010-03-17 14:28                         ` Konrad Rzeszutek Wilk
  2010-03-18  3:05                           ` Cui, Dexuan
@ 2010-03-19  1:04                           ` Jeremy Fitzhardinge
  1 sibling, 0 replies; 46+ messages in thread
From: Jeremy Fitzhardinge @ 2010-03-19  1:04 UTC (permalink / raw)
  To: Konrad Rzeszutek Wilk; +Cc: xen-devel@lists.xensource.com, Cui, Dexuan

On 03/17/2010 07:28 AM, Konrad Rzeszutek Wilk wrote:
>>   static int msi_free_irqs(struct pci_dev* dev);
>> @@ -347,6 +347,20 @@ static void __pci_restore_msix_state(struct pci_dev *dev)
>>
>>   void pci_restore_msi_state(struct pci_dev *dev)
>>   {
>> +	if (xen_initial_domain()) {
>>      
> That won't do. If you try to compile this kernel on other platforms (say
> PPC), this will throw a huge problem.
>    

Actually, <xen/xen.h> now defines these for all architectures, so 
they're safe to put into arch-independent code.

     J

^ permalink raw reply	[flat|nested] 46+ messages in thread

* RE: [PATCH] [pv-ops] fix dom0 S3 when MSI is used.
  2010-03-19  1:03                         ` Jeremy Fitzhardinge
@ 2010-03-19  1:29                           ` Cui, Dexuan
  0 siblings, 0 replies; 46+ messages in thread
From: Cui, Dexuan @ 2010-03-19  1:29 UTC (permalink / raw)
  To: Jeremy Fitzhardinge; +Cc: xen-devel@lists.xensource.com

Jeremy Fitzhardinge wrote:
> On 03/17/2010 01:46 AM, Cui, Dexuan wrote:
>> The old commit a234848f works only when the device supports D3hot;
>> when the device only supports D3cold, the device doesn't work
>> properly after resuming from Dom0 S3. 
>> 
> What branches should I apply this to?  Both 2.6.31 and .32, or just
> .32? 
I think it should be both. :-)

>> A better workaround is invoking the PHYSDEVOP_restore_msi hypercall.
>> The patch reverts the old commit and invokes the hypercall.
>> 
> Is this a new hypercall?
No. The hypercall has been there since changeset 18937: 2dffa6ceb0af, 15 months ago. :-)

> 
> Aside from that, it looks fine as a stopgap until we do MSI properly.
Agree.

Thanks,
-- Dexuan

^ permalink raw reply	[flat|nested] 46+ messages in thread

* [Pv-ops][PATCH 0/3] Resend: Netback multiple thread support
  2009-12-10 18:01                     ` Jeremy Fitzhardinge
  2009-12-11  1:34                       ` Xu, Dongxiao
@ 2010-04-26 14:27                       ` Xu, Dongxiao
  2010-04-27  0:19                         ` Konrad Rzeszutek Wilk
                                           ` (2 more replies)
  1 sibling, 3 replies; 46+ messages in thread
From: Xu, Dongxiao @ 2010-04-26 14:27 UTC (permalink / raw)
  To: xen-devel@lists.xensource.com, Jeremy Fitzhardinge

Hi Jeremy and all,

I'd like to post an update of these patches. The main logic is
unchanged; I only rebased them onto the upstream pv-ops kernel.
See the attached patches. The original patches are checked into Jeremy's
netback-tasklet branch.

Let me explain the main idea of the patchset again:

Current netback uses one pair of tasklets for Tx/Rx data transactions.
A netback tasklet can only run on one CPU at a time, and it serves all
the netfronts, so it has become a performance bottleneck. This patchset
uses multiple tasklet pairs to replace the current single pair in dom0.

Assuming that Dom0 has CPUNR VCPUs, we define CPUNR tasklet pairs
(CPUNR for Tx, and CPUNR for Rx). Each pair of tasklets serves a
specific group of netfronts. We also duplicate the global and static
variables for each group in order to avoid spinlocks.

PATCH 01: Generalize static/global variables into 'struct xen_netbk'.

PATCH 02: Multiple tasklets support.

PATCH 03: Use Kernel thread to replace the tasklet.
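
As a rough illustration of the per-group model (a sketch only; see
patch 03 for the real code), each group gets one worker thread bound to
the corresponding CPU:

	/* Sketch: one worker per group, pinned to its CPU;
	 * error handling and cleanup omitted. */
	for (group = 0; group < nr_groups; group++) {
		struct task_struct *task;

		task = kthread_create(netbk_action_thread,
				      (void *)(long)group, "netback/%u", group);
		if (IS_ERR(task))
			break;
		kthread_bind(task, group);
		wake_up_process(task);
	}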

Recently I re-tested the patchset with an Intel 10G multi-queue NIC,
using 10 external 1G NICs to run netperf tests against that 10G NIC.

Case 1: Dom0 has more than 10 VCPUs, each pinned to a physical CPU.
With the patchset, throughput is 2x the original.

Case 2: Dom0 has 4 VCPUs pinned to 4 physical CPUs.
With the patchset, throughput is 3.7x the original.

When we tested this patchset, we found that the domain_lock in the grant
table operation (gnttab_copy()) becomes a bottleneck. We temporarily
removed the global domain_lock to achieve good performance.

Thanks,
Dongxiao

Jeremy Fitzhardinge wrote:
> On 12/09/09 19:29, Xu, Dongxiao wrote:
>>> Also, is it worth making it a tunable?  Presumably it needn't scale
>>> exactly with the number of dom0 cpus; if you only have one or two
>>> gbit interfaces, then you could saturate that pretty quickly with a
>>> small number of cpus, regardless of how many domains you have.
>>> 
>> How many CPUs are serving for the NIC interface is determined by how
>> interrupt is delivered. If system only has two gbit interfaces, and
>> they delivier interrupts to CPU0 and CPU1, then the case is: two
>> CPUs handle two tasklets. Other CPUs are idle. The group_nr just
>> defines the max number of tasklets, however it doesn't decide how
>> tasklet is handled by CPU. 
>> 
> 
> So does this mean that a given vcpu will be used to handle the
> interrupt if happens to be running on a pcpu with affinity for the
> device?  Or that particular devices will be handled by particular
> vcpus? 
> 
>>> I've pushed this out in its own branch:
>>> xen/dom0/backend/netback-tasklet; please post any future patches
>>> against this branch. 
>>> 
>> What's my next step for this netback-tasklet tree merging into
>> xen/master? 
>> 
> 
> Hm, well, I guess:
> 
>     * I'd like to see some comments Keir/Ian(s)/others that this is
>       basically the right approach.  It looks OK to me, but I don't
>       have much experience with performance in the field.
>           o does nc2 make nc1 obsolete?
>     * Testing to make sure it really works.  Netback is clearly
>       critical functionality, so I'd like to be sure we're not
>       introducing big regressions
> 
>      J

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [Pv-ops][PATCH 0/3] Resend: Netback multiple thread support
  2010-04-26 14:27                       ` [Pv-ops][PATCH 0/3] Resend: Netback multiple thread support Xu, Dongxiao
@ 2010-04-27  0:19                         ` Konrad Rzeszutek Wilk
  2010-04-27  0:40                           ` Xu, Dongxiao
  2010-04-27  3:02                         ` Xu, Dongxiao
  2010-04-27 10:49                         ` Steven Smith
  2 siblings, 1 reply; 46+ messages in thread
From: Konrad Rzeszutek Wilk @ 2010-04-27  0:19 UTC (permalink / raw)
  To: Xu, Dongxiao; +Cc: Jeremy Fitzhardinge, xen-devel@lists.xensource.com

> Case 1: Dom0 has more than 10 vcpus pinned with each physical CPU.
> With the patchset, the performance is 2x of the original throughput.
> 
> Case 2: Dom0 has 4 vcpus pinned with 4 physical CPUs.
> With the patchset, the performance is 3.7x of the original throughput. 

What was the original throughput? 1GB?

^ permalink raw reply	[flat|nested] 46+ messages in thread

* RE: [Pv-ops][PATCH 0/3] Resend: Netback multiple thread support
  2010-04-27  0:19                         ` Konrad Rzeszutek Wilk
@ 2010-04-27  0:40                           ` Xu, Dongxiao
  0 siblings, 0 replies; 46+ messages in thread
From: Xu, Dongxiao @ 2010-04-27  0:40 UTC (permalink / raw)
  To: Konrad Rzeszutek Wilk; +Cc: Fitzhardinge, xen-devel@lists.xensource.com, Jeremy

Konrad Rzeszutek Wilk wrote:
>> Case 1: Dom0 has more than 10 vcpus pinned with each physical CPU.
>> With the patchset, the performance is 2x of the original throughput.
>> 
>> Case 2: Dom0 has 4 vcpus pinned with 4 physical CPUs.
>> With the patchset, the performance is 3.7x of the original
>> throughput. 
> 
> What was the original throughput? 1GB?

The total bandwidth for both cases should be 10G, since we are using a 10G NIC card.

For case 1, the original throughput is 3539.33 Mbps.
And for case 2, the original throughput is 1310.72 Mbps.

Thanks,
Dongxiao

^ permalink raw reply	[flat|nested] 46+ messages in thread

* RE: [Pv-ops][PATCH 0/3] Resend: Netback multiple thread support
  2010-04-26 14:27                       ` [Pv-ops][PATCH 0/3] Resend: Netback multiple thread support Xu, Dongxiao
  2010-04-27  0:19                         ` Konrad Rzeszutek Wilk
@ 2010-04-27  3:02                         ` Xu, Dongxiao
  2010-04-27 10:49                         ` Steven Smith
  2 siblings, 0 replies; 46+ messages in thread
From: Xu, Dongxiao @ 2010-04-27  3:02 UTC (permalink / raw)
  To: xen-devel@lists.xensource.com, Jeremy Fitzhardinge; +Cc: Konrad Rzeszutek Wilk

Xu, Dongxiao wrote:
> Hi Jeremy and all,
> 
> I'd like to make an update on these patches. The main logic is not
> changed, and I only did a rebase towards the upstream pv-ops kernel.
> See attached patch. The original patches are checked in Jeremy's
> netback-tasklet branch.
> 
> Let me explain the main idea of the patchset again:
> 
> Current netback uses one pair of tasklets for Tx/Rx data transaction.
> Netback tasklet could only run at one CPU at a time, and it is used to
> serve all the netfronts. Therefore it has become a performance bottle
> neck. This patch is to use multiple tasklet pairs to replace the
> current single pair in dom0.
> 
> Assuming that Dom0 has CPUNR VCPUs, we define CPUNR kinds of
> tasklets pair (CPUNR for Tx, and CPUNR for Rx). Each pare of tasklets
> serve specific group of netfronts. Also for those global and static
> variables, we duplicated them for each group in order to avoid the
> spinlock.
> 
> PATCH 01: Generilize static/global variables into 'struct xen_netbk'.
> 
> PATCH 02: Multiple tasklets support.
> 
> PATCH 03: Use Kernel thread to replace the tasklet.
> 
> Recently I re-tested the patchset with Intel 10G multi-queue NIC
> device, and use 10 outside 1G NICs to do netperf tests with that 10G
> NIC. 

Here are more descriptions about the test:

On the host side, we launch 10 HVM guests, each installed with the PV
VNIF driver; all the vif interfaces are bound to a 10G NIC through a
bridge, so the 10 guests share the 10G bandwidth.

Outside the host, we use 10 1G NIC interfaces to run netperf tests
against the 10 HVM guests.

Thanks,
Dongxiao

> 
> Case 1: Dom0 has more than 10 vcpus pinned with each physical CPU.
> With the patchset, the performance is 2x of the original throughput.
> 
> Case 2: Dom0 has 4 vcpus pinned with 4 physical CPUs.
> With the patchset, the performance is 3.7x of the original throughput.
> 
> when we test this patch, we found that the domain_lock in grant table
> operation (gnttab_copy()) becomes a bottle neck. We temporarily
> remove the global domain_lock to achieve good performance.
> 
> Thanks,
> Dongxiao
> 
> Jeremy Fitzhardinge wrote:
>> On 12/09/09 19:29, Xu, Dongxiao wrote:
>>>> Also, is it worth making it a tunable?  Presumably it needn't scale
>>>> exactly with the number of dom0 cpus; if you only have one or two
>>>> gbit interfaces, then you could saturate that pretty quickly with a
>>>> small number of cpus, regardless of how many domains you have.
>>>> 
>>> How many CPUs are serving for the NIC interface is determined by how
>>> interrupt is delivered. If system only has two gbit interfaces, and
>>> they delivier interrupts to CPU0 and CPU1, then the case is: two
>>> CPUs handle two tasklets. Other CPUs are idle. The group_nr just
>>> defines the max number of tasklets, however it doesn't decide how
>>> tasklet is handled by CPU. 
>>> 
>> 
>> So does this mean that a given vcpu will be used to handle the
>> interrupt if happens to be running on a pcpu with affinity for the
>> device?  Or that particular devices will be handled by particular
>> vcpus? 
>> 
>>>> I've pushed this out in its own branch:
>>>> xen/dom0/backend/netback-tasklet; please post any future patches
>>>> against this branch. 
>>>> 
>>> What's my next step for this netback-tasklet tree merging into
>>> xen/master? 
>>> 
>> 
>> Hm, well, I guess:
>> 
>>     * I'd like to see some comments Keir/Ian(s)/others that this is
>>       basically the right approach.  It looks OK to me, but I don't
>>       have much experience with performance in the field.
>>           o does nc2 make nc1 obsolete?
>>     * Testing to make sure it really works.  Netback is clearly
>>       critical functionality, so I'd like to be sure we're not
>>       introducing big regressions
>> 
>>      J
> 
> 
> _______________________________________________
> Xen-devel mailing list
> Xen-devel@lists.xensource.com
> http://lists.xensource.com/xen-devel

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [Pv-ops][PATCH 0/3] Resend: Netback multiple thread support
  2010-04-26 14:27                       ` [Pv-ops][PATCH 0/3] Resend: Netback multiple thread support Xu, Dongxiao
  2010-04-27  0:19                         ` Konrad Rzeszutek Wilk
  2010-04-27  3:02                         ` Xu, Dongxiao
@ 2010-04-27 10:49                         ` Steven Smith
  2010-04-27 18:37                           ` Jeremy Fitzhardinge
  2010-04-28 10:27                           ` Xu, Dongxiao
  2 siblings, 2 replies; 46+ messages in thread
From: Steven Smith @ 2010-04-27 10:49 UTC (permalink / raw)
  To: Xu, Dongxiao; +Cc: Fitzhardinge, xen-devel@lists.xensource.com, Jeremy


[-- Attachment #1.1: Type: text/plain, Size: 5830 bytes --]

> I'd like to make an update on these patches. The main logic is not
> changed, and I only did a rebase towards the upstream pv-ops kernel.
> See attached patch. The original patches are checked in Jeremy's
> netback-tasklet branch.
I have a couple of (quite minor) comments on the patches:

0001-Netback-Generilize-static-global-variables-into-stru.txt:
> diff --git a/drivers/xen/netback/netback.c b/drivers/xen/netback/netback.c
> index c24debf..a484b0a 100644
> --- a/drivers/xen/netback/netback.c
> +++ b/drivers/xen/netback/netback.c
> @@ -49,18 +49,13 @@
>  
>  /*define NETBE_DEBUG_INTERRUPT*/
>  
> +struct xen_netbk *xen_netbk;

> +int group_nr = 1;
> +struct page_foreign_tracker *foreign_page_tracker;
I think these two would benefit from more descriptive names, given
that they're not static.



If I was feeling pedantic, I'd complain that this includes some bits
of support for multiple struct xen_netbks, rather than just moving all
of the fields around, which reduces its obviously-correct-ness quite a
bit.

Even more pedantically, it might be better to pass around a struct
xen_netbk in a few places, rather than an int group, so that you get
better compiler type checking.



0002-Netback-Multiple-tasklets-support.txt:
Design question: you do your balancing by making each tasklet serve
roughly equal numbers of remote domains.  Would it not have been
easier to make them serve equal numbers of netfronts?  You could then
get rid of the domain_entry business completely and just have a count
of serviced netfronts in each xen_netbk, which might be a bit easier
to deal with.
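
For illustration, a minimal sketch of that count-based alternative (the field and helper names here are hypothetical, not taken from the posted patches):

	struct xen_netbk {
		/* ... existing per-group state ... */
		atomic_t netfront_count;	/* netfronts currently served by this group */
	};

	/* Pick the group currently serving the fewest netfronts and claim
	 * a slot in it. */
	static int netbk_pick_group(struct xen_netbk *netbk, int group_nr)
	{
		int i, best = 0;
		int min = atomic_read(&netbk[0].netfront_count);

		for (i = 1; i < group_nr; i++) {
			int n = atomic_read(&netbk[i].netfront_count);
			if (n < min) {
				min = n;
				best = i;
			}
		}
		atomic_inc(&netbk[best].netfront_count);
		return best;
	}

add_domain_to_list()/remove_domain_from_list() would then reduce to an atomic increment/decrement on the chosen group.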

> diff --git a/drivers/xen/netback/interface.c b/drivers/xen/netback/interface.c
> index 21c1f95..bdf3c1d 100644
> --- a/drivers/xen/netback/interface.c
> +++ b/drivers/xen/netback/interface.c
> @@ -54,6 +54,59 @@
>  static unsigned long netbk_queue_length = 32;
>  module_param_named(queue_length, netbk_queue_length, ulong, 0644);
>  
> 
> +static void remove_domain_from_list(struct xen_netbk *netbk,
> +			     struct xen_netif *netif)
> +{
> +	struct domain_entry *dom_entry = NULL;
> +	int group = netif->group;
> +
> +	list_for_each_entry(dom_entry,
> +			&netbk[group].group_domain_list, dom) {
> +		if (dom_entry->domid == netif->domid)
> +			break;
> +	}
> +	if (!dom_entry)
> +		return;
Can this ever happen?  If so, you might have issues when several
netfronts all end in the same frontend domain e.g.:

-- netfront A arrives and is added to list
-- netfront B arrives and is added to list
-- netfront B is removed
-- netfront B is removed again.  It's no longer in the list,
   but A is, so A gets removed instead.

The end result being that netback thinks the group is idle, but it
actually has netfront A in it.  I guess the worst that can happen is
that you don't balance across tasklets properly, but it'd still be
better avoided.

If it *can't* happen, there should probably be some kind of warning
when it does.
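
To make the concern concrete, one way the lookup could be made robust is sketched below (the netif field in domain_entry is an assumption here; the posted code keys on domid):

	static void remove_netif_from_list(struct xen_netbk *netbk,
					   struct xen_netif *netif)
	{
		struct domain_entry *ent, *found = NULL;

		spin_lock(&netbk->group_domain_list_lock);
		list_for_each_entry(ent, &netbk->group_domain_list, dom) {
			if (ent->netif == netif) {	/* match the netif, not just the domid */
				found = ent;
				break;
			}
		}
		WARN_ON(!found);	/* "can't happen" -- make it visible if it does */
		if (found) {
			netbk->group_domain_nr--;
			list_del(&found->dom);
		}
		spin_unlock(&netbk->group_domain_list_lock);
		kfree(found);		/* kfree(NULL) is a no-op */
	}

Note also that after list_for_each_entry() the iterator is never NULL, so the posted `if (!dom_entry)` test cannot fire; an explicit `found` variable avoids that trap as well.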

> +
> +	spin_lock(&netbk[netif->group].group_domain_list_lock);
> +	netbk[netif->group].group_domain_nr--;
> +	list_del(&dom_entry->dom);
> +	spin_unlock(&netbk[netif->group].group_domain_list_lock);
> +	kfree(dom_entry);
> +}
> +
>  static void __netif_up(struct xen_netif *netif)
>  {
>  	enable_irq(netif->irq);
> 
> @@ -70,6 +123,7 @@ static int net_open(struct net_device *dev)
>  {
>  	struct xen_netif *netif = netdev_priv(dev);
>  	if (netback_carrier_ok(netif)) {
> +		add_domain_to_list(xen_netbk, group_nr, netif);
>  		__netif_up(netif);
>  		netif_start_queue(dev);
>  	}
> @@ -79,8 +133,10 @@ static int net_open(struct net_device *dev)
>  static int net_close(struct net_device *dev)
>  {
>  	struct xen_netif *netif = netdev_priv(dev);
> -	if (netback_carrier_ok(netif))
> +	if (netback_carrier_ok(netif)) {
>  		__netif_down(netif);
> +		remove_domain_from_list(xen_netbk, netif);
> +	}
>  	netif_stop_queue(dev);
>  	return 0;
>  }
Okay, so if the interface gets down'd and then up'd it'll potentially
move to a different group.  How much testing has that situation had?

I'd be tempted to add the interface to the list as soon as it's
created and then leave it there until it's removed, and not rebalance
during its lifetime at all, and hence avoid the issue.

> @@ -1570,6 +1570,7 @@ static int __init netback_init(void)
>  	/* We can increase reservation by this much in net_rx_action(). */
>  //	balloon_update_driver_allowance(NET_RX_RING_SIZE);
>  
> +	group_nr = num_online_cpus();
>  	xen_netbk = kzalloc(group_nr * sizeof(struct xen_netbk), GFP_KERNEL);
>  	if (!xen_netbk) {
>  		printk(KERN_ALERT "%s: out of memory\n", __func__);
What happens if the number of online CPUs changes while netback is
running?  In particular, do we stop trying to send a tasklet/thread to
a CPU which has been offlined?


0003-Use-Kernel-thread-to-replace-the-tasklet.txt:
> diff --git a/drivers/xen/netback/netback.c b/drivers/xen/netback/netback.c
> index 773cd4f..8b55efc 100644
> --- a/drivers/xen/netback/netback.c
> +++ b/drivers/xen/netback/netback.c
> +static int netbk_action_thread(void *index)
> +{
> +	int group = (long)index;
> +	struct xen_netbk *netbk = &xen_netbk[group];
> +	while (1) {
> +		wait_event_interruptible(netbk->netbk_action_wq,
> +				rx_work_todo(group)
> +				|| tx_work_todo(group));
> +		cond_resched();
> +
> +		if (rx_work_todo(group))
> +			net_rx_action(group);
> +
> +		if (tx_work_todo(group))
> +			net_tx_action(group);
> +	}
Hmm... You use kthread_stop() on this thread, so you should probably
test kthread_should_stop() every so often.
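
A rough sketch of the loop with that check folded in (illustrative only, reusing the names from the quoted hunk):

	while (!kthread_should_stop()) {
		wait_event_interruptible(netbk->netbk_action_wq,
				rx_work_todo(group) ||
				tx_work_todo(group) ||
				kthread_should_stop());

		if (kthread_should_stop())
			break;

		cond_resched();

		if (rx_work_todo(group))
			net_rx_action(group);

		if (tx_work_todo(group))
			net_tx_action(group);
	}

	return 0;

Putting kthread_should_stop() in the wait_event_interruptible() condition as well makes sure kthread_stop() can actually wake the thread up.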

> +
> +	return 0;
> +}
> +
> +
>  static int __init netback_init(void)
>  {
>  	int i;


Apart from that, it all looks fine to me.

Steven.

[-- Attachment #1.2: Digital signature --]
[-- Type: application/pgp-signature, Size: 189 bytes --]

[-- Attachment #2: Type: text/plain, Size: 138 bytes --]

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xensource.com
http://lists.xensource.com/xen-devel

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [Pv-ops][PATCH 0/3] Resend: Netback multiple thread support
  2010-04-27 10:49                         ` Steven Smith
@ 2010-04-27 18:37                           ` Jeremy Fitzhardinge
  2010-04-28  9:31                             ` Steven Smith
  2010-04-28 10:27                           ` Xu, Dongxiao
  1 sibling, 1 reply; 46+ messages in thread
From: Jeremy Fitzhardinge @ 2010-04-27 18:37 UTC (permalink / raw)
  To: Steven Smith; +Cc: Xu, Dongxiao, xen-devel@lists.xensource.com

On 04/27/2010 03:49 AM, Steven Smith wrote:
>> I'd like to make an update on these patches. The main logic is not
>> changed, and I only did a rebase towards the upstream pv-ops kernel.
>> See attached patch. The original patches are checked in Jeremy's
>> netback-tasklet branch.
>>     
> I have a couple of (quite minor) comments on the patches:
>
> 0001-Netback-Generilize-static-global-variables-into-stru.txt:
>   
>> diff --git a/drivers/xen/netback/netback.c b/drivers/xen/netback/netback.c
>> index c24debf..a484b0a 100644
>> --- a/drivers/xen/netback/netback.c
>> +++ b/drivers/xen/netback/netback.c
>> @@ -49,18 +49,13 @@
>>  
>>  /*define NETBE_DEBUG_INTERRUPT*/
>>  
>> +struct xen_netbk *xen_netbk;
>>     
>   
>> +int group_nr = 1;
>> +struct page_foreign_tracker *foreign_page_tracker;
>>     
> I think these two would benefit from more descriptive names, given
> that they're not static.
>
>   

Yes.  Actually I thought I raised the same points the first time through
and Dongxiao had posted patches addressing them.  I have to admit I
haven't looked at the reposted patches in detail yet.  Have we suffered
a regression here?

Hm, maybe it's just this issue which slipped through.


> If I was feeling pedantic, I'd complain that this includes some bits
> of support for multiple struct xen_netbks, rather than just moving all
> of the fields around, which reduces its obviously-correct-ness quite a
> bit.
>   

I'm always in favour of having more, smaller, functionally distinct
patches, so if this patch can be further subdivided then it would be
nice to do so.  It's especially useful when trying to track down
regressions via bisection.

> Even more pedantically, it might be better to pass around a struct
> xen_netbk in a few places, rather than an int group, so that you get
> better compiler type checking.
>   
+1

> Apart from that, it all looks fine to me.
>   

Thanks for looking at this.  It had been missing the gaze of some
networking-savvy eyes.

    J

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [Pv-ops][PATCH 0/3] Resend: Netback multiple thread support
  2010-04-27 18:37                           ` Jeremy Fitzhardinge
@ 2010-04-28  9:31                             ` Steven Smith
  2010-04-28 11:36                               ` Xu, Dongxiao
  0 siblings, 1 reply; 46+ messages in thread
From: Steven Smith @ 2010-04-28  9:31 UTC (permalink / raw)
  To: Jeremy Fitzhardinge
  Cc: Steven Smith, Xu, Dongxiao, xen-devel@lists.xensource.com


[-- Attachment #1.1: Type: text/plain, Size: 2693 bytes --]

> >> diff --git a/drivers/xen/netback/netback.c b/drivers/xen/netback/netback.c
> >> index c24debf..a484b0a 100644
> >> --- a/drivers/xen/netback/netback.c
> >> +++ b/drivers/xen/netback/netback.c
> >> @@ -49,18 +49,13 @@
> >>  
> >>  /*define NETBE_DEBUG_INTERRUPT*/
> >>  
> >> +struct xen_netbk *xen_netbk;
> >>     
> >   
> >> +int group_nr = 1;
> >> +struct page_foreign_tracker *foreign_page_tracker;
> >>     
> > I think these two would benefit from more descriptive names, given
> > that they're not static.
> 
> Yes.  Actually I thought I raised the same points the first time through
> and Dongxiao had posted patches addressing them.  I have to admit I
> haven't looked at the reposted patches in detail yet.  Have we suffered
> a regression here?
> 
> Hm, maybe its just this issue which slipped through.
I think so, yes (assuming the patches posted on the 26th of April are
the most recent version).

> > Apart from that, it all looks fine to me.
> Thanks for looking at this.  It had been missing the gaze of some
> networking-savvy eyes.
There is one other potential issue which just occurred to me.  These
patches assign netifs to groups pretty much arbitrarily, beyond trying
to keep the groups balanced.  It might be better to try to group
interfaces so that the tasklet runs on the same VCPU as the interrupt
i.e. grouping interfaces according to interrupt affinity.  That would
have two main benefits:

-- Less cross-VCPU traffic, and hence better cache etc. behaviour.
-- Potentially better balancing.  If you find that you've accidentally
   assigned two high-traffic interfaces to the same group, irqbalance
   or whatnot should rebalance the interrupts to different vcpus, but
   that doesn't automatically do us much good because most of the work
   is done in the tasklet (which will still only run on one vcpu and
   hence become a bottleneck).  If we rebalanced the netif groups when
   irqbalance rebalanced the interrupts then we'd bypass the issue.

Of course, someone would need to go and implement the
rebalance-in-response-to-irqbalance, which would be non-trivial.

You could imagine doing it by just getting rid of the explicit group
field in struct xen_netif and using smp_processor_id() instead, but
that would need quite a bit of extra thought about what happens if
e.g. the start_xmit and the tx complete interrupt happen on different
vcpus.
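
As a purely illustrative sketch of that variant (names assumed; it ignores the start_xmit caveat just mentioned):

	/* Called from the event-channel interrupt handler: pick the group
	 * belonging to the CPU the interrupt was delivered on, instead of
	 * a group stored in struct xen_netif. */
	static struct xen_netbk *netbk_for_this_cpu(void)
	{
		return &xen_netbk[smp_processor_id() % group_nr];
	}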

It sounds like the patch provides a useful improvement as it stands,
and the rebalancing would probably be a bit of a pain, so I don't
think this is a blocker to an immediate merge, but it'd be nice if
someone could look at it eventually.

Steven.

[-- Attachment #1.2: Digital signature --]
[-- Type: application/pgp-signature, Size: 189 bytes --]

[-- Attachment #2: Type: text/plain, Size: 138 bytes --]

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xensource.com
http://lists.xensource.com/xen-devel

^ permalink raw reply	[flat|nested] 46+ messages in thread

* RE: [Pv-ops][PATCH 0/3] Resend: Netback multiple thread support
  2010-04-27 10:49                         ` Steven Smith
  2010-04-27 18:37                           ` Jeremy Fitzhardinge
@ 2010-04-28 10:27                           ` Xu, Dongxiao
  2010-04-28 11:51                             ` Steven Smith
  1 sibling, 1 reply; 46+ messages in thread
From: Xu, Dongxiao @ 2010-04-28 10:27 UTC (permalink / raw)
  To: Steven Smith; +Cc: Fitzhardinge, xen-devel@lists.xensource.com, Jeremy

Hi Steven,

Thanks for your careful review. Some explanation inline.
Regarding your point about grouping and interrupt affinity, I will reply in
another thread.

Thanks,
Dongxiao

Steven Smith wrote:
>> I'd like to make an update on these patches. The main logic is not
>> changed, and I only did a rebase towards the upstream pv-ops kernel.
>> See attached patch. The original patches are checked in Jeremy's
>> netback-tasklet branch.
> I have a couple of (quite minor) comments on the patches:
> 
> 0001-Netback-Generilize-static-global-variables-into-stru.txt:
>> diff --git a/drivers/xen/netback/netback.c b/drivers/xen/netback/netback.c
>> index c24debf..a484b0a 100644
>> --- a/drivers/xen/netback/netback.c
>> +++ b/drivers/xen/netback/netback.c
>> @@ -49,18 +49,13 @@
>> 
>>  /*define NETBE_DEBUG_INTERRUPT*/
>> 
>> +struct xen_netbk *xen_netbk;
> 
>> +int group_nr = 1;
>> +struct page_foreign_tracker *foreign_page_tracker;
> I think these two would benefit from more descriptive names, given
> that they're not static.

Oops... I thought I had modified this when Jeremy commented on it
last time; maybe there was some mistake and I left it until today...
I swear I will change this in the next version of the patchset. ;-)

> 
> 
> 
> If I was feeling pedantic, I'd complain that this includes some bits
> of support for multiple struct xen_netbks, rather than just moving all
> of the fields around, which reduces its obviously-correct-ness quite a
> bit.

Actually I was struggling with how to split the first patch into smaller
ones, but I don't have many ideas, since the patch changes function
interfaces/data structures, so the corresponding callers need to change
too, which generates a ~1k-line patch...

> 
> Even more pedantically, it might be better to pass around a struct
> xen_netbk in a few places, rather than an int group, so that you get
> better compiler type checking.

I will change this in the next version of the patch.

> 
> 
> 
> 0002-Netback-Multiple-tasklets-support.txt:
> Design question: you do your balancing by making each tasklet serve
> roughly equal numbers of remote domains.  Would it not have been
> easier to make them serve equal numbers of netfronts?  You could then
> get rid of the domain_entry business completely and just have a count
> of serviced netfronts in each xen_netbk, which might be a bit easier
> to deal with.

According to my understanding, one guest with the VNIF driver is represented
by one netfront. Is this true? If so, I don't understand the difference
between "number of domains" and "number of netfronts"; I used to think
they were the same. Please correct me if my understanding is wrong.

Actually, in the very early stage of this patch, I used a simple method to
identify which group a netfront belongs to, by calculating
(domid % online_cpu_nr()). The advantage is simplicity; however, it may
leave the netfront count unbalanced across the groups.

I will try to remove the "domain_entry" related code in the next version of the patch.

> 
>> diff --git a/drivers/xen/netback/interface.c b/drivers/xen/netback/interface.c
>> index 21c1f95..bdf3c1d 100644
>> --- a/drivers/xen/netback/interface.c
>> +++ b/drivers/xen/netback/interface.c
>> @@ -54,6 +54,59 @@
>>  static unsigned long netbk_queue_length = 32;
>>  module_param_named(queue_length, netbk_queue_length, ulong, 0644);
>> 
>> 
>> +static void remove_domain_from_list(struct xen_netbk *netbk,
>> +			     struct xen_netif *netif)
>> +{
>> +	struct domain_entry *dom_entry = NULL;
>> +	int group = netif->group;
>> +
>> +	list_for_each_entry(dom_entry,
>> +			&netbk[group].group_domain_list, dom) {
>> +		if (dom_entry->domid == netif->domid)
>> +			break;
>> +	}
>> +	if (!dom_entry)
>> +		return;
> Can this ever happen?  If so, you might have issues when several
> netfronts all end in the same frontend domain e.g.:
> 
> -- netfront A arrives and is added to list
> -- netfront B arrives and is added to list
> -- netfront B is removed
> -- netfront B is removed again.  It's no longer in the list,
>    but A is, so A gets removed instead.
> 
> The end result being that netback thinks the group is idle, but it
> actually has netfront A in it.  I guess the worst that can happen is
> that you don't balance across tasklets properly, but it'd still be
> better avoided.
> 
> If it *can't* happen, there should probably be some kind of warning
> when it does.
> 
>> +
>> +	spin_lock(&netbk[netif->group].group_domain_list_lock);
>> +	netbk[netif->group].group_domain_nr--;
>> +	list_del(&dom_entry->dom);
>> +	spin_unlock(&netbk[netif->group].group_domain_list_lock);
>> +	kfree(dom_entry);
>> +}
>> +
>>  static void __netif_up(struct xen_netif *netif)
>>  {
>>  	enable_irq(netif->irq);
>> 
>> @@ -70,6 +123,7 @@ static int net_open(struct net_device *dev)
>>  {
>>  	struct xen_netif *netif = netdev_priv(dev);
>>  	if (netback_carrier_ok(netif)) {
>> +		add_domain_to_list(xen_netbk, group_nr, netif);
>>  		__netif_up(netif);
>>  		netif_start_queue(dev);
>>  	}
>> @@ -79,8 +133,10 @@ static int net_open(struct net_device *dev)
>>  static int net_close(struct net_device *dev)
>>  {
>>  	struct xen_netif *netif = netdev_priv(dev);
>> -	if (netback_carrier_ok(netif))
>> +	if (netback_carrier_ok(netif)) {
>>  		__netif_down(netif);
>> +		remove_domain_from_list(xen_netbk, netif);
>> +	}
>>  	netif_stop_queue(dev);
>>  	return 0;
>>  }
> Okay, so if the interface gets down'd and then up'd it'll potentially
> move to a different group.  How much testing has that situation had?
> 
> I'd be tempted to add the interface to the list as soon as it's
> created and then leave it there until it's removed, and not rebalance
> during its lifetime at all, and hence avoid the issue.
> 
>> @@ -1570,6 +1570,7 @@ static int __init netback_init(void)
>>  	/* We can increase reservation by this much in net_rx_action(). */
>>  //	balloon_update_driver_allowance(NET_RX_RING_SIZE);
>> 
>> +	group_nr = num_online_cpus();
>>  	xen_netbk = kzalloc(group_nr * sizeof(struct xen_netbk), GFP_KERNEL);
>>  	if (!xen_netbk) {
>>  		printk(KERN_ALERT "%s: out of memory\n", __func__);
> What happens if the number of online CPUs changes while netback is
> running?  In particular, do we stop trying to send a tasklet/thread to
> a CPU which has been offlined?

The group_nr just defines the max number of tasklets; however, it doesn't decide
which tasklet is handled by which CPU. That is decided by how the interrupts are delivered.

> 
> 
> 0003-Use-Kernel-thread-to-replace-the-tasklet.txt:
>> diff --git a/drivers/xen/netback/netback.c b/drivers/xen/netback/netback.c
>> index 773cd4f..8b55efc 100644
>> --- a/drivers/xen/netback/netback.c
>> +++ b/drivers/xen/netback/netback.c
>> +static int netbk_action_thread(void *index)
>> +{
>> +	int group = (long)index;
>> +	struct xen_netbk *netbk = &xen_netbk[group];
>> +	while (1) {
>> +		wait_event_interruptible(netbk->netbk_action_wq,
>> +				rx_work_todo(group)
>> +				|| tx_work_todo(group));
>> +		cond_resched();
>> +
>> +		if (rx_work_todo(group))
>> +			net_rx_action(group);
>> +
>> +		if (tx_work_todo(group))
>> +			net_tx_action(group);
>> +	}
> Hmm... You use kthread_stop() on this thread, so you should probably
> test kthread_should_stop() every so often.

OK, I will modify it.

> 
>> +
>> +	return 0;
>> +}
>> +
>> +
>>  static int __init netback_init(void)
>>  {
>>  	int i;
> 
> 
> Apart from that, it all looks fine to me.
> 
> Steven.

^ permalink raw reply	[flat|nested] 46+ messages in thread

* RE: [Pv-ops][PATCH 0/3] Resend: Netback multiple thread support
  2010-04-28  9:31                             ` Steven Smith
@ 2010-04-28 11:36                               ` Xu, Dongxiao
  2010-04-28 12:04                                 ` Steven Smith
  0 siblings, 1 reply; 46+ messages in thread
From: Xu, Dongxiao @ 2010-04-28 11:36 UTC (permalink / raw)
  To: Steven Smith, Jeremy Fitzhardinge
  Cc: Steven Smith, xen-devel@lists.xensource.com

Steven Smith wrote:
>>>> diff --git a/drivers/xen/netback/netback.c
>>>> b/drivers/xen/netback/netback.c index c24debf..a484b0a 100644 ---
>>>> a/drivers/xen/netback/netback.c +++ b/drivers/xen/netback/netback.c
>>>> @@ -49,18 +49,13 @@
>>>> 
>>>>  /*define NETBE_DEBUG_INTERRUPT*/
>>>> 
>>>> +struct xen_netbk *xen_netbk;
>>>> 
>>> 
>>>> +int group_nr = 1;
>>>> +struct page_foreign_tracker *foreign_page_tracker;
>>>> 
>>> I think these two would benefit from more descriptive names, given
>>> that they're not static.
>> 
>> Yes.  Actually I thought I raised the same points the first time
>> through and Dongxiao had posted patches addressing them.  I have to
>> admit I haven't looked at the reposted patches in detail yet.  Have
>> we suffered a regression here? 
>> 
>> Hm, maybe its just this issue which slipped through.
> I think so, yes (assuming the patches posted on the 26th of April are
> the most recent version).
> 
>>> Apart from that, it all looks fine to me.
>> Thanks for looking at this.  It had been missing the gaze of some
>> networking-savvy eyes.
> There is one other potential issue which just occurred to me.  These
> patches assign netifs to groups pretty much arbitrarily, beyond trying
> to keep the groups balanced.  It might be better to try to group
> interfaces so that the tasklet runs on the same VCPU as the interrupt
> i.e. grouping interfaces according to interrupt affinity.  That would
> have two main benefits:
> 
> -- Less cross-VCPU traffic, and hence better cache etc. behaviour.
> -- Potentially better balancing.  If you find that you've accidentally
>    assigned two high-traffic interfaces to the same group, irqbalance
>    or whatnot should rebalance the interrupts to different vcpus, but
>    that doesn't automatically do us much good because most of the work
>    is done in the tasklet (which will still only run on one vcpu and
>    hence become a bottleneck).  If we rebalanced the netif groups when
>    irqbalance rebalanced the interrupts then we'd bypass the issue.
> 
> Of course, someone would need to go and implement the
> rebalance-in-response-to-irqbalance, which would be non-trivial.

Your idea is workable if the netfront is bound to a single-queue
NIC via a bridge. In that case we know which interrupt serves the
netfront, and we can group netfronts according to the interrupt
affinity. And as you said, the effort is non-trivial.

However, a multi-queue NIC has only one interface but multiple
interrupt queues, and all netfronts are bound to this interface via
one bridge. We have no idea which interrupt queue is serving a
specific netfront, so rebalancing according to interrupt affinity
is a challenge. Do you have any ideas on this point?

Thanks,
Dongxiao

> 
> You could imagine doing it by just getting rid of the explicit group
> field in struct xen_netif and using smp_processor_id() instead, but
> that would need quite a bit of extra thought about what happens if
> e.g. the start_xmit and the tx complete interrupt happen on different
> vcpus.
> 
> It sounds like the patch provides a useful improvement as it stands,
> and the rebalancing would probably be a bit of a pain, so I don't
> think this is a blocker to an immediate merge, but it'd be nice if
> someone could look at it eventually.
> 
> Steven.

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [Pv-ops][PATCH 0/3] Resend: Netback multiple thread support
  2010-04-28 10:27                           ` Xu, Dongxiao
@ 2010-04-28 11:51                             ` Steven Smith
  2010-04-28 12:23                               ` Xu, Dongxiao
  2010-04-28 12:43                               ` Jan Beulich
  0 siblings, 2 replies; 46+ messages in thread
From: Steven Smith @ 2010-04-28 11:51 UTC (permalink / raw)
  To: Xu, Dongxiao
  Cc: Steven Smith, Jeremy Fitzhardinge, xen-devel@lists.xensource.com


[-- Attachment #1.1: Type: text/plain, Size: 6770 bytes --]

> >> I'd like to make an update on these patches. The main logic is not
> >> changed, and I only did a rebase towards the upstream pv-ops kernel.
> >> See attached patch. The original patches are checked in Jeremy's
> >> netback-tasklet branch.
> > I have a couple of (quite minor) comments on the patches:
> > 
> > 0001-Netback-Generilize-static-global-variables-into-stru.txt:
> >> diff --git a/drivers/xen/netback/netback.c
> >> b/drivers/xen/netback/netback.c index c24debf..a484b0a 100644 ---
> >> a/drivers/xen/netback/netback.c +++ b/drivers/xen/netback/netback.c
> >> @@ -49,18 +49,13 @@
> >> 
> >>  /*define NETBE_DEBUG_INTERRUPT*/
> >> 
> >> +struct xen_netbk *xen_netbk;
> > 
> >> +int group_nr = 1;
> >> +struct page_foreign_tracker *foreign_page_tracker;
> > I think these two would benefit from more descriptive names, given
> > that they're not static.
> Oops...I thought I had modified this when Jeremy commented this
> last time, maybe there was some mistake and I left it until today...
Easily done.

> > If I was feeling pedantic, I'd complain that this includes some bits
> > of support for multiple struct xen_netbks, rather than just moving all
> > of the fields around, which reduces its obviously-correct-ness quite a
> > bit.
> Actually I was struggling how to split the first patch into small ones,
> however I don't have much idea since the patch changes the function
> Interface/data structure, so the corresponding caller needs change too,
> which generates a 1k line of patch...
The approach I would take would be something like this:

1) Gather all the global data together into a struct xen_netbk, and
   then have a single, global, instance of that structure.  Go through
   and turn every reference to a bit of global data into a reference
   to a field of that structure.  This will be a massive patch, but
   it's purely mechanical and it's very easy to check that it's safe.

2) Introduce struct ext_page and use it everywhere you use it in the
   current patch.  This should be fairly small.

3) Generalise to multiple struct xen_netbks by changing the single
   global instance into a struct xen_netbk * and fixing the resulting
   compile errors.  Another big patch, but provided you remember to
   initialise everything properly the compiler will check almost all
   of it for you.

This is to some extent a bikeshed argument, so if you prefer the
current patch structure then that would work as well.
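
To make the three steps concrete, here is how a single call site might evolve (a sketch only; the exact field names are assumptions):

	/* Today (global data): */
	skb_queue_tail(&rx_queue, skb);
	tasklet_schedule(&net_rx_tasklet);

	/* After step 1 (everything gathered into one global instance): */
	skb_queue_tail(&netbk.rx_queue, skb);
	tasklet_schedule(&netbk.net_rx_tasklet);

	/* After step 3 (an array of struct xen_netbk, one per group): */
	struct xen_netbk *netbk = &xen_netbk[netif->group];
	skb_queue_tail(&netbk->rx_queue, skb);
	tasklet_schedule(&netbk->net_rx_tasklet);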

> > Even more pedantically, it might be better to pass around a struct
> > xen_netbk in a few places, rather than an int group, so that you get
> > better compiler type checking.
> I will change this in next version of patch.
Thanks.

> > 0002-Netback-Multiple-tasklets-support.txt:
> > Design question: you do your balancing by making each tasklet serve
> > roughly equal numbers of remote domains.  Would it not have been
> > easier to make them serve equal numbers of netfronts?  You could then
> > get rid of the domain_entry business completely and just have a count
> > of serviced netfronts in each xen_netbk, which might be a bit easier
> > to deal with.
> According to my understanding, one guest with VNIF driver represented
> by one netfront. Is this true? Therefore I don't understand the difference
> between "number of domains" and "number of netfronts", I used to thought
> they were the same. Please correct me my understanding is wrong.
I think we might be using slightly different terminology here.  When I
say ``netfront'', I mean the frontend half of a virtual network
interface, rather than the netfront driver, so a single domain can be
configured with multiple netfronts in the same way that a single
physical host can have multiple ixgbes (say), despite only having one
ixgbe driver loaded.

So, my original point was that it might be better to balance
interfaces such that the number of interfaces in each group is the
same, ignoring the frontend domain ID completely.  This would mean
that if, for instance, a domain had two very busy NICs then they
wouldn't be forced to share a tasklet, which might otherwise be a
bottleneck.

The downside, of course, is that it would allow domains with multiple
vNICs to use more dom0 CPU time, potentially aggravating starvation
and unfairness problems.  On the other hand, a domain with N vNICs
wouldn't be able to do any more damage than N domains with 1 vNIC
each, so I don't think it's too bad.

> Actually in the very early stage of this patch, I use a simple method to
> identify which group does a netfront belong to, by calculating
> (domid % online_cpu_nr()). The advantage is simple, however it may
> cause netfront count unbalanced between different groups.
Well, any static scheme will potentially come unbalanced some of the
time, if different interfaces experience different levels of traffic.
But see the other thread for another discussion of balancing issues.

> I will try to remove "domain_entry" related code in next version patch.
Thanks.

> >> @@ -1570,6 +1570,7 @@ static int __init netback_init(void)
> >>  	/* We can increase reservation by this much in net_rx_action(). */
> >>  //	balloon_update_driver_allowance(NET_RX_RING_SIZE);
> >> 
> >> +	group_nr = num_online_cpus();
> >>  	xen_netbk = kzalloc(group_nr * sizeof(struct xen_netbk),
> >>  		GFP_KERNEL);  	if (!xen_netbk) { printk(KERN_ALERT "%s: out of
> >> memory\n", __func__); 
> > What happens if the number of online CPUs changes while netback is
> > running?  In particular, do we stop trying to send a tasklet/thread to
> > a CPU which has been offlined?
> The group_nr just defines the max number of tasklets, however it doesn't decide
> which tasklet is handled by which CPU. It is decided by the delivery of interrupt. 
Ah, yes, so it is.  Thanks for explaining it.

> > 0003-Use-Kernel-thread-to-replace-the-tasklet.txt:
> >> diff --git a/drivers/xen/netback/netback.c
> >> b/drivers/xen/netback/netback.c index 773cd4f..8b55efc 100644 ---
> >> a/drivers/xen/netback/netback.c +++ b/drivers/xen/netback/netback.c
> >> +static int netbk_action_thread(void *index)
> >> +{
> >> +	int group = (long)index;
> >> +	struct xen_netbk *netbk = &xen_netbk[group];
> >> +	while (1) {
> >> +		wait_event_interruptible(netbk->netbk_action_wq,
> >> +				rx_work_todo(group) +				|| tx_work_todo(group));
> >> +		cond_resched();
> >> +
> >> +		if (rx_work_todo(group))
> >> +			net_rx_action(group);
> >> +
> >> +		if (tx_work_todo(group))
> >> +			net_tx_action(group);
> >> +	}
> > Hmm... You use kthread_stop() on this thread, so you should probably
> > test kthread_should_stop() every so often.
> OK, I will modify it.
Thanks.

Steven.

[-- Attachment #1.2: Digital signature --]
[-- Type: application/pgp-signature, Size: 189 bytes --]

[-- Attachment #2: Type: text/plain, Size: 138 bytes --]

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xensource.com
http://lists.xensource.com/xen-devel

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [Pv-ops][PATCH 0/3] Resend: Netback multiple thread support
  2010-04-28 11:36                               ` Xu, Dongxiao
@ 2010-04-28 12:04                                 ` Steven Smith
  2010-04-28 13:33                                   ` Xu, Dongxiao
  0 siblings, 1 reply; 46+ messages in thread
From: Steven Smith @ 2010-04-28 12:04 UTC (permalink / raw)
  To: Xu, Dongxiao
  Cc: Steven Smith, Jeremy Fitzhardinge, xen-devel@lists.xensource.com


[-- Attachment #1.1: Type: text/plain, Size: 2842 bytes --]

> >>> Apart from that, it all looks fine to me.
> >> Thanks for looking at this.  It had been missing the gaze of some
> >> networking-savvy eyes.
> > There is one other potential issue which just occurred to me.  These
> > patches assign netifs to groups pretty much arbitrarily, beyond trying
> > to keep the groups balanced.  It might be better to try to group
> > interfaces so that the tasklet runs on the same VCPU as the interrupt
> > i.e. grouping interfaces according to interrupt affinity.  That would
> > have two main benefits:
> > 
> > -- Less cross-VCPU traffic, and hence better cache etc. behaviour.
> > -- Potentially better balancing.  If you find that you've accidentally
> >    assigned two high-traffic interfaces to the same group, irqbalance
> >    or whatnot should rebalance the interrupts to different vcpus, but
> >    that doesn't automatically do us much good because most of the work
> >    is done in the tasklet (which will still only run on one vcpu and
> >    hence become a bottleneck).  If we rebalanced the netif groups when
> >    irqbalance rebalanced the interrupts then we'd bypass the issue.
> > 
> > Of course, someone would need to go and implement the
> > rebalance-in-response-to-irqbalance, which would be non-trivial.
> Your idea is workable if the netfront is bound with a single queue
> NIC via a bridge. Hence we know which interrupt is used to serve the
> netfront, and then we can group netfronts according to the interrupt
> affinity. And as you said, the effort is non-trivial.
> 
> However if the NIC is multi-queued, which has only one interface but
> multiple interrupt queues. All netfronts are bounded with this interface
> via one bridge. We have no idea which interrupt queue is serving for
> a specified netfront. So the rebalance according to interrupt affinity
> is a challenge. Do you have idea on this point?
Sorry, I should have been clearer here.  When I said ``interrupt'' I
meant the event channel interrupt which the netfront instance will use
to notify netback, not the physical hardware interrupt of whatever
physical NIC is ultimately associated with it.  We should always know
which event channel a given netfront is using, and hence which
interrupt, and so we should be able to find out its affinity.  In
effect, we'd rebalance in response to messages from the guest to
netback, which is at least vaguely reasonable as a proxy for actual
load.

There are at least three relevant contexts here:

-- Interrupts generated by the hardware
-- The netback tasklets
-- Interrupts generated by the guest

As you say, doing anything based on where hardware interrupts are
being delivered is somewhere between hard an impossible, but it might
be possible to do something useful with the interrupts from the guest.

Steven.

[-- Attachment #1.2: Digital signature --]
[-- Type: application/pgp-signature, Size: 189 bytes --]

[-- Attachment #2: Type: text/plain, Size: 138 bytes --]

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xensource.com
http://lists.xensource.com/xen-devel

^ permalink raw reply	[flat|nested] 46+ messages in thread

* RE: [Pv-ops][PATCH 0/3] Resend: Netback multiple thread support
  2010-04-28 11:51                             ` Steven Smith
@ 2010-04-28 12:23                               ` Xu, Dongxiao
  2010-04-28 12:43                               ` Jan Beulich
  1 sibling, 0 replies; 46+ messages in thread
From: Xu, Dongxiao @ 2010-04-28 12:23 UTC (permalink / raw)
  To: Steven Smith
  Cc: Steven Smith, Jeremy Fitzhardinge, xen-devel@lists.xensource.com

Steven Smith wrote:
>>>> I'd like to make an update on these patches. The main logic is not
>>>> changed, and I only did a rebase towards the upstream pv-ops
>>>> kernel. See attached patch. The original patches are checked in
>>>> Jeremy's netback-tasklet branch.
>>> I have a couple of (quite minor) comments on the patches:
>>> 
>>> 0001-Netback-Generilize-static-global-variables-into-stru.txt:
>>>> diff --git a/drivers/xen/netback/netback.c
>>>> b/drivers/xen/netback/netback.c index c24debf..a484b0a 100644 ---
>>>> a/drivers/xen/netback/netback.c +++
>>>> b/drivers/xen/netback/netback.c @@ -49,18 +49,13 @@ 
>>>> 
>>>>  /*define NETBE_DEBUG_INTERRUPT*/
>>>> 
>>>> +struct xen_netbk *xen_netbk;
>>> 
>>>> +int group_nr = 1;
>>>> +struct page_foreign_tracker *foreign_page_tracker;
>>> I think these two would benefit from more descriptive names, given
>>> that they're not static.
>> Oops...I thought I had modified this when Jeremy commented this
>> last time, maybe there was some mistake and I left it until today...
>> Easily done. 
> 
>>> If I was feeling pedantic, I'd complain that this includes some bits
>>> of support for multiple struct xen_netbks, rather than just moving
>>> all of the fields around, which reduces its obviously-correct-ness
>>> quite a bit.
>> Actually I was struggling how to split the first patch into small
>> ones, 
>> however I don't have much idea since the patch changes the function
>> Interface/data structure, so the corresponding caller needs change
>> too, 
>> which generates a 1k line of patch...
> The approach I would take would be something like this:
> 
> 1) Gather all the global data together into a struct xen_netbk, and
>    then have a single, global, instance of that structure.  Go through
>    and turn every reference to a bit of global data into a reference
>    to a field of that structure.  This will be a massive patch, but
>    it's purely mechanical and it's very easy to check that it's safe.
> 
> 2) Introduce struct ext_page and use it everywhere you use it in the
>    current patch.  This should be fairly small.
> 
> 3) Generalise to multiple struct xen_netbks by changing the single
>    global instance into a struct xen_netbk * and fixing the resulting
>    compile errors.  Another big patch, but provided you remember to
>    initialise everything properly the compiler will check almost all
>    of it for you.
> 
> This is to some extent a bikeshed argument, so if you prefer the
> current patch structure then that would work as well.

Thanks for your suggestion; I will give it a try.

> 
>>> Even more pedantically, it might be better to pass around a struct
>>> xen_netbk in a few places, rather than an int group, so that you get
>>> better compiler type checking.
>> I will change this in next version of patch. Thanks.
> 
>>> 0002-Netback-Multiple-tasklets-support.txt:
>>> Design question: you do your balancing by making each tasklet serve
>>> roughly equal numbers of remote domains.  Would it not have been
>>> easier to make them serve equal numbers of netfronts?  You could
>>> then get rid of the domain_entry business completely and just have
>>> a count of serviced netfronts in each xen_netbk, which might be a
>>> bit easier to deal with.
>> According to my understanding, one guest with VNIF driver represented
>> by one netfront. Is this true? Therefore I don't understand the
>> difference between "number of domains" and "number of netfronts", I
>> used to thought 
>> they were the same. Please correct me my understanding is wrong.
> I think we might be using slightly different terminology here.  When I
> say ``netfront'', I mean the frontend half of a virtual network
> interface, rather than the netfront driver, so a single domain can be
> configured with multiple netfronts in the same way that a single
> physical host can have multiple ixgbes (say), despite only having one
> ixgbe driver loaded.
> 
> So, my original point was that it might be better to balance
> interfaces such that the number of interfaces in each group is the
> same, ignoring the frontend domain ID completely.  This would mean
> that if, for instance, a domain had two very busy NICs then they
> wouldn't be forced to share a tasklet, which might otherwise be a
> bottleneck.
> 
> The downside, of course, is that it would allow domains with multiple
> vNICs to use more dom0 CPU time, potentially aggravating starvation
> and unfairness problems.  On the other hand, a domain with N vNICs
> wouldn't be able to do any more damage than N domains with 1 vNIC
> each, so I don't think it's too bad.

It was my misunderstanding previously; thanks for the explanation on this point.

Regards,
Dongxiao

> 
>> Actually in the very early stage of this patch, I use a simple
>> method to 
>> identify which group does a netfront belong to, by calculating
>> (domid % online_cpu_nr()). The advantage is simple, however it may
>> cause netfront count unbalanced between different groups.
> Well, any static scheme will potentially come unbalanced some of the
> time, if different interfaces experience different levels of traffic.
> But see the other thread for another discussion of balancing issues.
> 
>> I will try to remove "domain_entry" related code in next version
>> patch. Thanks. 
> 
>>>> @@ -1570,6 +1570,7 @@ static int __init netback_init(void)
>>>>  	/* We can increase reservation by this much in net_rx_action().
>>>>  */ //	balloon_update_driver_allowance(NET_RX_RING_SIZE);
>>>> 
>>>> +	group_nr = num_online_cpus();
>>>>  	xen_netbk = kzalloc(group_nr * sizeof(struct xen_netbk),
>>>>  		GFP_KERNEL);  	if (!xen_netbk) { printk(KERN_ALERT "%s: out of
>>>> memory\n", __func__);
>>> What happens if the number of online CPUs changes while netback is
>>> running?  In particular, do we stop trying to send a tasklet/thread
>>> to a CPU which has been offlined?
>> The group_nr just defines the max number of tasklets, however it
>> doesn't decide which tasklet is handled by which CPU. It is decided
>> by the delivery of interrupt. 
> Ah, yes, so it is.  Thanks for explaining it.
> 
>>> 0003-Use-Kernel-thread-to-replace-the-tasklet.txt:
>>>> diff --git a/drivers/xen/netback/netback.c
>>>> b/drivers/xen/netback/netback.c index 773cd4f..8b55efc 100644 ---
>>>> a/drivers/xen/netback/netback.c +++ b/drivers/xen/netback/netback.c
>>>> +static int netbk_action_thread(void *index)
>>>> +{
>>>> +	int group = (long)index;
>>>> +	struct xen_netbk *netbk = &xen_netbk[group];
>>>> +	while (1) {
>>>> +		wait_event_interruptible(netbk->netbk_action_wq,
>>>> +				rx_work_todo(group) +				|| tx_work_todo(group));
>>>> +		cond_resched(); +
>>>> +		if (rx_work_todo(group))
>>>> +			net_rx_action(group);
>>>> +
>>>> +		if (tx_work_todo(group))
>>>> +			net_tx_action(group);
>>>> +	}
>>> Hmm... You use kthread_stop() on this thread, so you should probably
>>> test kthread_should_stop() every so often.
>> OK, I will modify it.
> Thanks.
> 
> Steven.

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [Pv-ops][PATCH 0/3] Resend: Netback multiple thread support
  2010-04-28 11:51                             ` Steven Smith
  2010-04-28 12:23                               ` Xu, Dongxiao
@ 2010-04-28 12:43                               ` Jan Beulich
  2010-04-30  7:29                                 ` Steven Smith
  1 sibling, 1 reply; 46+ messages in thread
From: Jan Beulich @ 2010-04-28 12:43 UTC (permalink / raw)
  To: Steven Smith, Dongxiao Xu
  Cc: Steven Smith, Jeremy Fitzhardinge, xen-devel@lists.xensource.com

>>> Steven Smith <steven.smith@citrix.com> 28.04.10 13:51 >>>
>2) Introduce struct ext_page and use it everywhere you use it in the
>   current patch.  This should be fairly small.

In working through the patches to make them usable on our forward
ported trees, I wondered what this is good for at all, for two reasons:

On 64-bit, embedding the data directly into page->mapping would
be possible without any other consideration.

Even on 32-bit, embedding is possible based on the observation
that the two fields together don't need more than 32 bits (idx
always being less than MAX_PENDING_REQS [which itself could
even grow significantly] and group being bounded by NR_CPUS).
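
A sketch of such an encoding (helper names and the 16-bit split are assumptions, just to illustrate the observation):

	/* idx < MAX_PENDING_REQS and group < NR_CPUS, so both comfortably fit
	 * in one unsigned long even on 32-bit.  Storing group+1 keeps the
	 * value distinguishable from a NULL ->mapping. */
	static void netbk_set_page_ext(struct page *pg, unsigned int group,
				       unsigned int idx)
	{
		unsigned long val = ((unsigned long)(group + 1) << 16) | idx;

		pg->mapping = (struct address_space *)val;
	}

	static int netbk_get_page_ext(struct page *pg, unsigned int *pgroup,
				      unsigned int *pidx)
	{
		unsigned long val = (unsigned long)pg->mapping;

		if (!val)
			return 0;		/* not a netback page */
		*pgroup = (val >> 16) - 1;
		*pidx = val & 0xffff;
		return 1;
	}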

>I think we might be using slightly different terminology here.  When I
>say ``netfront'', I mean the frontend half of a virtual network
>interface, rather than the netfront driver, so a single domain can be
>configured with multiple netfronts in the same way that a single
>physical host can have multiple ixgbes (say), despite only having one
>ixgbe driver loaded.
>
>So, my original point was that it might be better to balance
>interfaces such that the number of interfaces in each group is the
>same, ignoring the frontend domain ID completely.  This would mean
>that if, for instance, a domain had two very busy NICs then they
>wouldn't be forced to share a tasklet, which might otherwise be a
>bottleneck.

As you had pointed out in an earlier reply, the use of the domain
ID here is flawed anyway - it had to be replaced for the whole
set to be usable for us. We count netif-s and balance based on
that count, at once eliminating the need to do any allocation
when adding a new netif.

Jan

^ permalink raw reply	[flat|nested] 46+ messages in thread

* RE: [Pv-ops][PATCH 0/3] Resend: Netback multiple thread support
  2010-04-28 12:04                                 ` Steven Smith
@ 2010-04-28 13:33                                   ` Xu, Dongxiao
  2010-04-30  7:35                                     ` Steven Smith
  0 siblings, 1 reply; 46+ messages in thread
From: Xu, Dongxiao @ 2010-04-28 13:33 UTC (permalink / raw)
  To: Steven Smith
  Cc: Steven Smith, Jeremy Fitzhardinge, xen-devel@lists.xensource.com

Steven Smith wrote:
>>>>> Apart from that, it all looks fine to me.
>>>> Thanks for looking at this.  It had been missing the gaze of some
>>>> networking-savvy eyes.
>>> There is one other potential issue which just occurred to me.  These
>>> patches assign netifs to groups pretty much arbitrarily, beyond
>>> trying to keep the groups balanced.  It might be better to try to
>>> group interfaces so that the tasklet runs on the same VCPU as the
>>> interrupt i.e. grouping interfaces according to interrupt affinity.
>>> That would have two main benefits: 
>>> 
>>> -- Less cross-VCPU traffic, and hence better cache etc. behaviour.
>>> -- Potentially better balancing.  If you find that you've
>>>    accidentally assigned two high-traffic interfaces to the same
>>>    group, irqbalance or whatnot should rebalance the interrupts to
>>>    different vcpus, but that doesn't automatically do us much good
>>>    because most of the work is done in the tasklet (which will
>>>    still only run on one vcpu and hence become a bottleneck).  If
>>>    we rebalanced the netif groups when irqbalance rebalanced the
>>> interrupts then we'd bypass the issue. 
>>> 
>>> Of course, someone would need to go and implement the
>>> rebalance-in-response-to-irqbalance, which would be non-trivial.
>> Your idea is workable if the netfront is bound with a single queue
>> NIC via a bridge. Hence we know which interrupt is used to serve the
>> netfront, and then we can group netfronts according to the interrupt
>> affinity. And as you said, the effort is non-trivial.
>> 
>> However if the NIC is multi-queued, which has only one interface but
>> multiple interrupt queues. All netfronts are bounded with this
>> interface via one bridge. We have no idea which interrupt queue is
>> serving for 
>> a specified netfront. So the rebalance according to interrupt
>> affinity is a challenge. Do you have idea on this point?
> Sorry, I should have been clearer here.  When I said ``interrupt'' I
> meant the event channel interrupt which the netfront instance will use
> to notify netback, not the physical hardware interrupt of whatever
> physical NIC is ultimately associated with it.  We should always know
> which event channel a given netfront is using, and hence which
> interrupt, and so we should be able to find out its affinity.  In
> effect, we'd rebalance in response to messages from the guest to
> netback, which is at least vaguely reasonable as a proxy for actual
> load.

OK, I understand; what you were thinking about is netfront TX,
while I was talking about netfront RX.

In my solution, each tasklet PAIR is assigned to a group, so I think
the optimization should work for both directions.

Since we share the view that RX rebalancing is hard to implement,
the optimization point is TX rebalancing. Do you think TX rebalancing
would have side effects on the RX direction?

However, in my next version of the patch, I will not include this logic,
since the change is not small and needs more effort.

Thanks,
Dongxiao

> 
> There are at least three relevant contexts here:
> 
> -- Interrupts generated by the hardware
> -- The netback tasklets
> -- Interrupts generated by the guest
> 
> As you say, doing anything based on where hardware interrupts are
> being delivered is somewhere between hard an impossible, but it might
> be possible to do something useful with the interrupts from the guest.
> 
> Steven.

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [Pv-ops][PATCH 0/3] Resend: Netback multiple  thread support
  2010-04-28 12:43                               ` Jan Beulich
@ 2010-04-30  7:29                                 ` Steven Smith
  2010-04-30  8:27                                   ` Jan Beulich
  0 siblings, 1 reply; 46+ messages in thread
From: Steven Smith @ 2010-04-30  7:29 UTC (permalink / raw)
  To: Jan Beulich
  Cc: Steven Smith, Dongxiao Xu, xen-devel@lists.xensource.com,
	Jeremy Fitzhardinge


[-- Attachment #1.1: Type: text/plain, Size: 923 bytes --]

> >>> Steven Smith <steven.smith@citrix.com> 28.04.10 13:51 >>>
> >2) Introduce struct ext_page and use it everywhere you use it in the
> >   current patch.  This should be fairly small.
> In working through the patches to make them usable on our forward
> ported trees, I wondered what this is good for at all, for two reasons:
> 
> On 64-bits embedding the data directly into page->mapping would
> be possible without any other consideration.
> 
> Even on 32-bits embedding is possible based on the observation
> that the two fields together don't need more than 32 bits (idx
> always being less than MAX_PENDING_REQS [which itself could
> even grow significantly] and group being bounded by NR_CPUS).
Good point; I hadn't noticed that.  That would be a nicer way of doing
things.

It sounds like you've had a pretty good look at these patches.  Did
you see anything else worth pointing out?

Steven.

[-- Attachment #1.2: Digital signature --]
[-- Type: application/pgp-signature, Size: 189 bytes --]

[-- Attachment #2: Type: text/plain, Size: 138 bytes --]

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xensource.com
http://lists.xensource.com/xen-devel

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [Pv-ops][PATCH 0/3] Resend: Netback multiple thread support
  2010-04-28 13:33                                   ` Xu, Dongxiao
@ 2010-04-30  7:35                                     ` Steven Smith
  0 siblings, 0 replies; 46+ messages in thread
From: Steven Smith @ 2010-04-30  7:35 UTC (permalink / raw)
  To: Xu, Dongxiao
  Cc: Steven Smith, Jeremy Fitzhardinge, xen-devel@lists.xensource.com


[-- Attachment #1.1: Type: text/plain, Size: 2160 bytes --]

> >> However if the NIC is multi-queued, which has only one interface but
> >> multiple interrupt queues. All netfronts are bounded with this
> >> interface via one bridge. We have no idea which interrupt queue is
> >> serving for 
> >> a specified netfront. So the rebalance according to interrupt
> >> affinity is a challenge. Do you have idea on this point?
> > Sorry, I should have been clearer here.  When I said ``interrupt'' I
> > meant the event channel interrupt which the netfront instance will use
> > to notify netback, not the physical hardware interrupt of whatever
> > physical NIC is ultimately associated with it.  We should always know
> > which event channel a given netfront is using, and hence which
> > interrupt, and so we should be able to find out its affinity.  In
> > effect, we'd rebalance in response to messages from the guest to
> > netback, which is at least vaguely reasonable as a proxy for actual
> > load.
> 
> OK, I understand, what you were thinking about is on netfront TX,
> while I was talking about the netfront RX.
> 
> In my solution, each tasklet PAIR will be assigned to a group. So I think
> the optimization should work for both directions.
Yep.

> As we have a common view that rebalance on RX rebalance is hard to
> implement, and the optimization point is on TX rebalance. Do you think
> if TX rebalance would have side effect on RX direction?
Most network traffic is at least a little bit symmetrical, so
rebalancing for TX should help at least a little bit for RX-heavy
workloads as well.  There are some potential oddities if you have some
interfaces which are TX-heavy and some which are RX-heavy, though.  I
think this needs more thought, and probably some experimentation,
before we can decide on the best approach.

Of course, there's no reason not to use a very simple balancer
initially (e.g. equal numbers of netifs in each group) and only do the
more complicated bits if it starts causing problems.
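
A minimal sketch of such a simple balancer (the struct field and helper
name are assumptions, not taken from the posted patches): when a new
interface comes up, put it into whichever group currently serves the
fewest netfronts.

static int netbk_pick_group(struct xen_netbk *netbk, unsigned int group_nr)
{
	unsigned int i, min_group = 0;
	/* netif_count is an assumed per-group counter of attached netifs. */
	unsigned int min_netifs = netbk[0].netif_count;

	for (i = 1; i < group_nr; i++) {
		if (netbk[i].netif_count < min_netifs) {
			min_netifs = netbk[i].netif_count;
			min_group = i;
		}
	}
	return min_group;
}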

> However, in my next version of the patch I will not include this logic,
> since the change is not small and needs more effort.
Perfectly reasonable.

Steven.

[-- Attachment #1.2: Digital signature --]
[-- Type: application/pgp-signature, Size: 189 bytes --]

[-- Attachment #2: Type: text/plain, Size: 138 bytes --]

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xensource.com
http://lists.xensource.com/xen-devel

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [Pv-ops][PATCH 0/3] Resend: Netback multiple thread support
  2010-04-30  7:29                                 ` Steven Smith
@ 2010-04-30  8:27                                   ` Jan Beulich
  0 siblings, 0 replies; 46+ messages in thread
From: Jan Beulich @ 2010-04-30  8:27 UTC (permalink / raw)
  To: Steven Smith, Steven.Smith
  Cc: Jeremy Fitzhardinge, xen-devel@lists.xensource.com, Dongxiao Xu

>>> Steven Smith <steven.smith@citrix.com> 30.04.10 09:29 >>>
>It sounds like you've had a pretty good look at these patches.  Did
>you see anything else worth pointing out?

No, the major ones you had already pointed out.

Despite your earlier comments, even in the latest version the load
balancing code still doesn't seem right. I found it necessary to do
this right in __netif_{up,down}(), so that there is no potential for
mis-balanced increments and decrements (we also still have the
[pointless] list inserts/removes in our version of the patches, which
is what actually exposed the issue, in the form of crashes or endless
loops). This in turn made it necessary to add logic to ignore the
first (couple of?) invocation(s) of netif_be_int(), i.e. those arriving
before the netif was assigned to a group.
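
As a sketch of that arrangement (names are illustrative; netbk_pick_group()
stands in for whatever balancing policy is used), the group assignment and
release bracket exactly the up/down transitions, so every increment has a
matching decrement:

static void __netif_up(struct xen_netif *netif)
{
	/* Assign a group exactly once, when the interface comes up... */
	netif->group = netbk_pick_group(xen_netbk, xen_netbk_group_nr);
	xen_netbk[netif->group].netif_count++;

	enable_irq(netif->irq);
	netif_schedule_work(netif);
}

static void __netif_down(struct xen_netif *netif)
{
	disable_irq(netif->irq);
	netif_deschedule_work(netif);

	/* ...and release it exactly once, when the interface goes down.
	 * group == -1 also lets netif_be_int() ignore early interrupts. */
	xen_netbk[netif->group].netif_count--;
	netif->group = -1;
}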

Also in spite of your earlier comments, the use of
kthread_should_stop() in the latest version of the patches still
seems insufficient - afaict it should also be used in the
expression passed to wait_event_interruptible().
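
Concretely, the per-group thread loop would need to look something like
the following sketch (the wait queue field and the rx_work_todo()/
tx_work_todo() predicates are assumed names), with kthread_should_stop()
inside the wait predicate so that a stop request can wake the thread:

#include <linux/kthread.h>
#include <linux/sched.h>
#include <linux/wait.h>

static int netbk_kthread(void *data)
{
	struct xen_netbk *netbk = data;

	while (!kthread_should_stop()) {
		/* Stop condition is part of the predicate, so kthread_stop()
		 * plus a wake-up cannot be missed while sleeping here. */
		wait_event_interruptible(netbk->wq,
					 rx_work_todo(netbk) ||
					 tx_work_todo(netbk) ||
					 kthread_should_stop());
		cond_resched();

		if (kthread_should_stop())
			break;

		/* Process this group's pending Tx/Rx work here. */
	}

	return 0;
}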

Minor ones are:
- kzalloc() should not be used for allocating the (huge) array of struct
  xen_netbk in netback_init() (see the sketch after this list)
- the changes to netif_be_dbg() are bogus
- tasklets get replaced unconditionally with kthreads - I opted to
  make this dependent on a command line option, as the general
  description hinted that both have their upsides and downsides
- placement of fields in struct xen_netbk (I'd prefer all small fields to
  precede the large arrays - on x86 this results in smaller code)
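
On the first point, a sketch of the intended fix (variable names follow
the patch's style but are assumptions here): allocate the per-group array
with vmalloc(), which only needs virtually contiguous memory, rather than
asking kzalloc() for a large physically contiguous block.

	/* In netback_init(), with <linux/vmalloc.h> included:
	 * one struct xen_netbk per group. */
	xen_netbk = vmalloc(xen_netbk_group_nr * sizeof(struct xen_netbk));
	if (!xen_netbk) {
		printk(KERN_ALERT "%s: out of memory\n", __func__);
		return -ENOMEM;
	}
	memset(xen_netbk, 0, xen_netbk_group_nr * sizeof(struct xen_netbk));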

Jan

^ permalink raw reply	[flat|nested] 46+ messages in thread

end of thread, other threads:[~2010-04-30  8:27 UTC | newest]

Thread overview: 46+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2009-11-27  2:26 [Pv-ops][PATCH] Netback multiple tasklet support Xu, Dongxiao
2009-11-27  9:42 ` Ian Campbell
2009-11-27 16:08   ` Xu, Dongxiao
2009-11-27 16:15 ` Ian Pratt
2009-11-27 16:57   ` Xu, Dongxiao
2009-11-28 13:15     ` Ian Pratt
2009-12-02 10:17       ` Xu, Dongxiao
2009-12-03 21:28         ` Jeremy Fitzhardinge
2009-12-04  2:13           ` Xu, Dongxiao
2009-12-04  2:33             ` Jeremy Fitzhardinge
2009-12-08  9:22               ` Xu, Dongxiao
2009-12-09 20:23                 ` Jeremy Fitzhardinge
2009-12-10  3:29                   ` Xu, Dongxiao
2009-12-10 18:01                     ` Jeremy Fitzhardinge
2009-12-11  1:34                       ` Xu, Dongxiao
2010-04-26 14:27                       ` [Pv-ops][PATCH 0/3] Resend: Netback multiple thread support Xu, Dongxiao
2010-04-27  0:19                         ` Konrad Rzeszutek Wilk
2010-04-27  0:40                           ` Xu, Dongxiao
2010-04-27  3:02                         ` Xu, Dongxiao
2010-04-27 10:49                         ` Steven Smith
2010-04-27 18:37                           ` Jeremy Fitzhardinge
2010-04-28  9:31                             ` Steven Smith
2010-04-28 11:36                               ` Xu, Dongxiao
2010-04-28 12:04                                 ` Steven Smith
2010-04-28 13:33                                   ` Xu, Dongxiao
2010-04-30  7:35                                     ` Steven Smith
2010-04-28 10:27                           ` Xu, Dongxiao
2010-04-28 11:51                             ` Steven Smith
2010-04-28 12:23                               ` Xu, Dongxiao
2010-04-28 12:43                               ` Jan Beulich
2010-04-30  7:29                                 ` Steven Smith
2010-04-30  8:27                                   ` Jan Beulich
2009-12-10  9:07                   ` [Pv-ops][PATCH] Netback multiple tasklet support Ian Campbell
2009-12-10 17:54                     ` Jeremy Fitzhardinge
2009-12-10 18:07                       ` Ian Campbell
2009-12-11  8:34                         ` Jan Beulich
2009-12-11  9:34                           ` Ian Campbell
2009-12-11 14:24                             ` Konrad Rzeszutek Wilk
2010-03-17  8:46                       ` [PATCH] [pv-ops] fix dom0 S3 when MSI is used Cui, Dexuan
2010-03-17 14:28                         ` Konrad Rzeszutek Wilk
2010-03-18  3:05                           ` Cui, Dexuan
2010-03-19  1:04                           ` Jeremy Fitzhardinge
2010-03-19  1:03                         ` Jeremy Fitzhardinge
2010-03-19  1:29                           ` Cui, Dexuan
2010-01-13 10:17                     ` [Pv-ops][PATCH] Netback multiple tasklet support Jan Beulich
2010-01-14 16:55                       ` Ian Campbell

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).