netdev.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [patch 26/26] Xen-paravirt_ops: Add the Xen virtual network device driver.
       [not found] <20070301232443.195603797@goop.org>
@ 2007-03-01 23:25 ` Jeremy Fitzhardinge
  2007-03-02  0:42   ` Stephen Hemminger
       [not found] ` <20070301232527.956565107@goop.org>
  1 sibling, 1 reply; 93+ messages in thread
From: Jeremy Fitzhardinge @ 2007-03-01 23:25 UTC (permalink / raw)
  To: Andi Kleen
  Cc: Andrew Morton, linux-kernel, virtualization, xen-devel,
	Chris Wright, Zachary Amsden, Rusty Russell, Ian Pratt,
	Christian Limpach, netdev, Jeff Garzik

[-- Attachment #1: xen-netfront.patch --]
[-- Type: text/plain, Size: 56757 bytes --]

The network device frontend driver allows the kernel to access network
devices exported exported by a virtual machine containing a physical
network device driver.

Signed-off-by: Ian Pratt <ian.pratt@xensource.com>
Signed-off-by: Christian Limpach <Christian.Limpach@cl.cam.ac.uk>
Signed-off-by: Chris Wright <chrisw@sous-sol.org>
Signed-off-by: Jeremy Fitzhardinge <jeremy@xensource.com>
Cc: netdev@vger.kernel.org
Cc: Jeff Garzik <jeff@garzik.org>

---
 drivers/net/Kconfig        |   12 
 drivers/net/Makefile       |    2 
 drivers/net/xen-netfront.c | 2066 ++++++++++++++++++++++++++++++++++++++++++++
 include/xen/events.h       |    2 
 4 files changed, 2082 insertions(+)

===================================================================
--- a/drivers/net/Kconfig
+++ b/drivers/net/Kconfig
@@ -2525,6 +2525,18 @@ source "drivers/atm/Kconfig"
 
 source "drivers/s390/net/Kconfig"
 
+config XEN_NETDEV_FRONTEND
+	tristate "Xen network device frontend driver"
+	depends on XEN
+	default y
+	help
+	  The network device frontend driver allows the kernel to
+	  access network devices exported exported by a virtual
+	  machine containing a physical network device driver. The
+	  frontend driver is intended for unprivileged guest domains;
+	  if you are compiling a kernel for a Xen guest, you almost
+	  certainly want to enable this.
+
 config ISERIES_VETH
 	tristate "iSeries Virtual Ethernet driver support"
 	depends on PPC_ISERIES
===================================================================
--- a/drivers/net/Makefile
+++ b/drivers/net/Makefile
@@ -218,3 +218,5 @@ obj-$(CONFIG_FS_ENET) += fs_enet/
 obj-$(CONFIG_FS_ENET) += fs_enet/
 
 obj-$(CONFIG_NETXEN_NIC) += netxen/
+
+obj-$(CONFIG_XEN_NETDEV_FRONTEND) += xen-netfront.o
===================================================================
--- /dev/null
+++ b/drivers/net/xen-netfront.c
@@ -0,0 +1,2066 @@
+/******************************************************************************
+ * Virtual network driver for conversing with remote driver backends.
+ *
+ * Copyright (c) 2002-2005, K A Fraser
+ * Copyright (c) 2005, XenSource Ltd
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include <linux/module.h>
+#include <linux/version.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/netdevice.h>
+#include <linux/inetdevice.h>
+#include <linux/etherdevice.h>
+#include <linux/skbuff.h>
+#include <linux/init.h>
+#include <linux/bitops.h>
+#include <linux/ethtool.h>
+#include <linux/in.h>
+#include <linux/if_ether.h>
+#include <linux/io.h>
+#include <linux/moduleparam.h>
+#include <net/sock.h>
+#include <net/pkt_sched.h>
+#include <net/arp.h>
+#include <net/route.h>
+#include <asm/uaccess.h>
+#include <xen/xenbus.h>
+#include <xen/interface/io/netif.h>
+#include <xen/interface/memory.h>
+#ifdef CONFIG_XEN_BALLOON
+#include <xen/balloon.h>
+#endif
+#include <asm/page.h>
+#include <xen/interface/grant_table.h>
+
+#include <xen/events.h>
+#include <xen/page.h>
+#include <xen/grant_table.h>
+
+/*
+ * Mutually-exclusive module options to select receive data path:
+ *  rx_copy : Packets are copied by network backend into local memory
+ *  rx_flip : Page containing packet data is transferred to our ownership
+ * For fully-virtualised guests there is no option - copying must be used.
+ * For paravirtualised guests, flipping is the default.
+ */
+#ifdef CONFIG_XEN
+static int MODPARM_rx_copy = 0;
+module_param_named(rx_copy, MODPARM_rx_copy, bool, 0);
+MODULE_PARM_DESC(rx_copy, "Copy packets from network card (rather than flip)");
+static int MODPARM_rx_flip = 0;
+module_param_named(rx_flip, MODPARM_rx_flip, bool, 0);
+MODULE_PARM_DESC(rx_flip, "Flip packets from network card (rather than copy)");
+#else
+static const int MODPARM_rx_copy = 1;
+static const int MODPARM_rx_flip = 0;
+#endif
+
+#define RX_COPY_THRESHOLD 256
+
+#define GRANT_INVALID_REF	0
+
+#define NET_TX_RING_SIZE __RING_SIZE((struct netif_tx_sring *)0, PAGE_SIZE)
+#define NET_RX_RING_SIZE __RING_SIZE((struct netif_rx_sring *)0, PAGE_SIZE)
+
+struct netfront_info {
+	struct list_head list;
+	struct net_device *netdev;
+
+	struct net_device_stats stats;
+
+	struct netif_tx_front_ring tx;
+	struct netif_rx_front_ring rx;
+
+	spinlock_t   tx_lock;
+	spinlock_t   rx_lock;
+
+	unsigned int evtchn, irq;
+	unsigned int copying_receiver;
+
+	/* Receive-ring batched refills. */
+#define RX_MIN_TARGET 8
+#define RX_DFL_MIN_TARGET 64
+#define RX_MAX_TARGET min_t(int, NET_RX_RING_SIZE, 256)
+	unsigned rx_min_target, rx_max_target, rx_target;
+	struct sk_buff_head rx_batch;
+
+	struct timer_list rx_refill_timer;
+
+	/*
+	 * {tx,rx}_skbs store outstanding skbuffs. The first entry in tx_skbs
+	 * is an index into a chain of free entries.
+	 */
+	struct sk_buff *tx_skbs[NET_TX_RING_SIZE+1];
+	struct sk_buff *rx_skbs[NET_RX_RING_SIZE];
+
+#define TX_MAX_TARGET min_t(int, NET_RX_RING_SIZE, 256)
+	grant_ref_t gref_tx_head;
+	grant_ref_t grant_tx_ref[NET_TX_RING_SIZE + 1];
+	grant_ref_t gref_rx_head;
+	grant_ref_t grant_rx_ref[NET_RX_RING_SIZE];
+
+	struct xenbus_device *xbdev;
+	int tx_ring_ref;
+	int rx_ring_ref;
+	u8 mac[ETH_ALEN];
+
+	unsigned long rx_pfn_array[NET_RX_RING_SIZE];
+	struct multicall_entry rx_mcl[NET_RX_RING_SIZE+1];
+	struct mmu_update rx_mmu[NET_RX_RING_SIZE];
+};
+
+struct netfront_rx_info {
+	struct netif_rx_response rx;
+	struct netif_extra_info extras[XEN_NETIF_EXTRA_TYPE_MAX - 1];
+};
+
+/*
+ * Access macros for acquiring freeing slots in tx_skbs[].
+ */
+
+static inline void add_id_to_freelist(struct sk_buff **list, unsigned short id)
+{
+	list[id] = list[0];
+	list[0]  = (void *)(unsigned long)id;
+}
+
+static inline unsigned short get_id_from_freelist(struct sk_buff **list)
+{
+	unsigned int id = (unsigned int)(unsigned long)list[0];
+	list[0] = list[id];
+	return id;
+}
+
+static inline int xennet_rxidx(RING_IDX idx)
+{
+	return idx & (NET_RX_RING_SIZE - 1);
+}
+
+static inline struct sk_buff *xennet_get_rx_skb(struct netfront_info *np,
+						RING_IDX ri)
+{
+	int i = xennet_rxidx(ri);
+	struct sk_buff *skb = np->rx_skbs[i];
+	np->rx_skbs[i] = NULL;
+	return skb;
+}
+
+static inline grant_ref_t xennet_get_rx_ref(struct netfront_info *np,
+					    RING_IDX ri)
+{
+	int i = xennet_rxidx(ri);
+	grant_ref_t ref = np->grant_rx_ref[i];
+	np->grant_rx_ref[i] = GRANT_INVALID_REF;
+	return ref;
+}
+
+#define DPRINTK(fmt, args...)				\
+	pr_debug("netfront (%s:%d) " fmt,		\
+		 __FUNCTION__, __LINE__, ##args)
+#define IPRINTK(fmt, args...)				\
+	printk(KERN_INFO "netfront: " fmt, ##args)
+#define WPRINTK(fmt, args...)				\
+	printk(KERN_WARNING "netfront: " fmt, ##args)
+
+static int setup_device(struct xenbus_device *, struct netfront_info *);
+static struct net_device *create_netdev(struct xenbus_device *);
+
+static void netfront_closing(struct xenbus_device *);
+
+static void end_access(int, void *);
+static void netif_disconnect_backend(struct netfront_info *);
+static int open_netdev(struct netfront_info *);
+static void close_netdev(struct netfront_info *);
+
+static int network_connect(struct net_device *);
+static void network_tx_buf_gc(struct net_device *);
+static void network_alloc_rx_buffers(struct net_device *);
+static int send_fake_arp(struct net_device *);
+
+static irqreturn_t netif_int(int irq, void *dev_id);
+
+#ifdef CONFIG_SYSFS
+static int xennet_sysfs_addif(struct net_device *netdev);
+static void xennet_sysfs_delif(struct net_device *netdev);
+#else /* !CONFIG_SYSFS */
+#define xennet_sysfs_addif(dev) (0)
+#define xennet_sysfs_delif(dev) do { } while(0)
+#endif
+
+static inline int xennet_can_sg(struct net_device *dev)
+{
+	return dev->features & NETIF_F_SG;
+}
+
+/**
+ * Entry point to this code when a new device is created.  Allocate the basic
+ * structures and the ring buffers for communication with the backend, and
+ * inform the backend of the appropriate details for those.
+ */
+static int __devinit netfront_probe(struct xenbus_device *dev,
+				    const struct xenbus_device_id *id)
+{
+	int err;
+	struct net_device *netdev;
+	struct netfront_info *info;
+
+	netdev = create_netdev(dev);
+	if (IS_ERR(netdev)) {
+		err = PTR_ERR(netdev);
+		xenbus_dev_fatal(dev, err, "creating netdev");
+		return err;
+	}
+
+	info = netdev_priv(netdev);
+	dev->dev.driver_data = info;
+
+	err = open_netdev(info);
+	if (err)
+		goto fail;
+
+	return 0;
+
+ fail:
+	free_netdev(netdev);
+	dev->dev.driver_data = NULL;
+	return err;
+}
+
+
+/**
+ * We are reconnecting to the backend, due to a suspend/resume, or a backend
+ * driver restart.  We tear down our netif structure and recreate it, but
+ * leave the device-layer structures intact so that this is transparent to the
+ * rest of the kernel.
+ */
+static int netfront_resume(struct xenbus_device *dev)
+{
+	struct netfront_info *info = dev->dev.driver_data;
+
+	DPRINTK("%s\n", dev->nodename);
+
+	netif_disconnect_backend(info);
+	return 0;
+}
+
+static int xen_net_read_mac(struct xenbus_device *dev, u8 mac[])
+{
+	char *s, *e, *macstr;
+	int i;
+
+	macstr = s = xenbus_read(XBT_NIL, dev->nodename, "mac", NULL);
+	if (IS_ERR(macstr))
+		return PTR_ERR(macstr);
+
+	for (i = 0; i < ETH_ALEN; i++) {
+		mac[i] = simple_strtoul(s, &e, 16);
+		if ((s == e) || (*e != ((i == ETH_ALEN-1) ? '\0' : ':'))) {
+			kfree(macstr);
+			return -ENOENT;
+		}
+		s = e+1;
+	}
+
+	kfree(macstr);
+	return 0;
+}
+
+/* Common code used when first setting up, and when resuming. */
+static int talk_to_backend(struct xenbus_device *dev,
+			   struct netfront_info *info)
+{
+	const char *message;
+	struct xenbus_transaction xbt;
+	int err;
+
+	err = xen_net_read_mac(dev, info->mac);
+	if (err) {
+		xenbus_dev_fatal(dev, err, "parsing %s/mac", dev->nodename);
+		goto out;
+	}
+
+	/* Create shared ring, alloc event channel. */
+	err = setup_device(dev, info);
+	if (err)
+		goto out;
+
+again:
+	err = xenbus_transaction_start(&xbt);
+	if (err) {
+		xenbus_dev_fatal(dev, err, "starting transaction");
+		goto destroy_ring;
+	}
+
+	err = xenbus_printf(xbt, dev->nodename, "tx-ring-ref","%u",
+			    info->tx_ring_ref);
+	if (err) {
+		message = "writing tx ring-ref";
+		goto abort_transaction;
+	}
+	err = xenbus_printf(xbt, dev->nodename, "rx-ring-ref","%u",
+			    info->rx_ring_ref);
+	if (err) {
+		message = "writing rx ring-ref";
+		goto abort_transaction;
+	}
+	err = xenbus_printf(xbt, dev->nodename,
+			    "event-channel", "%u", info->evtchn);
+	if (err) {
+		message = "writing event-channel";
+		goto abort_transaction;
+	}
+
+	err = xenbus_printf(xbt, dev->nodename, "request-rx-copy", "%u",
+			    info->copying_receiver);
+	if (err) {
+		message = "writing request-rx-copy";
+		goto abort_transaction;
+	}
+
+	err = xenbus_printf(xbt, dev->nodename, "feature-rx-notify", "%d", 1);
+	if (err) {
+		message = "writing feature-rx-notify";
+		goto abort_transaction;
+	}
+
+	err = xenbus_printf(xbt, dev->nodename, "feature-sg", "%d", 1);
+	if (err) {
+		message = "writing feature-sg";
+		goto abort_transaction;
+	}
+
+	err = xenbus_printf(xbt, dev->nodename, "feature-gso-tcpv4", "%d", 1);
+	if (err) {
+		message = "writing feature-gso-tcpv4";
+		goto abort_transaction;
+	}
+
+	err = xenbus_transaction_end(xbt, 0);
+	if (err) {
+		if (err == -EAGAIN)
+			goto again;
+		xenbus_dev_fatal(dev, err, "completing transaction");
+		goto destroy_ring;
+	}
+
+	return 0;
+
+ abort_transaction:
+	xenbus_transaction_end(xbt, 1);
+	xenbus_dev_fatal(dev, err, "%s", message);
+ destroy_ring:
+	netif_disconnect_backend(info);
+ out:
+	return err;
+}
+
+static int setup_device(struct xenbus_device *dev, struct netfront_info *info)
+{
+	struct netif_tx_sring *txs;
+	struct netif_rx_sring *rxs;
+	int err;
+	struct net_device *netdev = info->netdev;
+
+	info->tx_ring_ref = GRANT_INVALID_REF;
+	info->rx_ring_ref = GRANT_INVALID_REF;
+	info->rx.sring = NULL;
+	info->tx.sring = NULL;
+	info->irq = 0;
+
+	txs = (struct netif_tx_sring *)get_zeroed_page(GFP_KERNEL);
+	if (!txs) {
+		err = -ENOMEM;
+		xenbus_dev_fatal(dev, err, "allocating tx ring page");
+		goto fail;
+	}
+	SHARED_RING_INIT(txs);
+	FRONT_RING_INIT(&info->tx, txs, PAGE_SIZE);
+
+	err = xenbus_grant_ring(dev, virt_to_mfn(txs));
+	if (err < 0) {
+		free_page((unsigned long)txs);
+		goto fail;
+	}
+
+	info->tx_ring_ref = err;
+	rxs = (struct netif_rx_sring *)get_zeroed_page(GFP_KERNEL);
+	if (!rxs) {
+		err = -ENOMEM;
+		xenbus_dev_fatal(dev, err, "allocating rx ring page");
+		goto fail;
+	}
+	SHARED_RING_INIT(rxs);
+	FRONT_RING_INIT(&info->rx, rxs, PAGE_SIZE);
+
+	err = xenbus_grant_ring(dev, virt_to_mfn(rxs));
+	if (err < 0) {
+		free_page((unsigned long)rxs);
+		goto fail;
+	}
+	info->rx_ring_ref = err;
+
+	err = xenbus_alloc_evtchn(dev, &info->evtchn);
+	if (err)
+		goto fail;
+
+	memcpy(netdev->dev_addr, info->mac, ETH_ALEN);
+	err = bind_evtchn_to_irqhandler(info->evtchn, netif_int,
+					SA_SAMPLE_RANDOM, netdev->name,
+					netdev);
+	if (err < 0)
+		goto fail;
+	info->irq = err;
+	return 0;
+
+ fail:
+	return err;
+}
+
+/**
+ * Callback received when the backend's state changes.
+ */
+static void backend_changed(struct xenbus_device *dev,
+			    enum xenbus_state backend_state)
+{
+	struct netfront_info *np = dev->dev.driver_data;
+	struct net_device *netdev = np->netdev;
+
+	DPRINTK("%s\n", xenbus_strstate(backend_state));
+
+	switch (backend_state) {
+	case XenbusStateInitialising:
+	case XenbusStateInitialised:
+	case XenbusStateConnected:
+	case XenbusStateUnknown:
+	case XenbusStateClosed:
+		break;
+
+	case XenbusStateInitWait:
+		if (dev->state != XenbusStateInitialising)
+			break;
+		if (network_connect(netdev) != 0)
+			break;
+		xenbus_switch_state(dev, XenbusStateConnected);
+		(void)send_fake_arp(netdev);
+		break;
+
+	case XenbusStateClosing:
+		if (dev->state == XenbusStateClosed)
+			break;
+		netfront_closing(dev);
+		break;
+	}
+}
+
+/** Send a packet on a net device to encourage switches to learn the
+ * MAC. We send a fake ARP request.
+ *
+ * @param dev device
+ * @return 0 on success, error code otherwise
+ */
+static int send_fake_arp(struct net_device *dev)
+{
+	struct sk_buff *skb;
+	u32             src_ip, dst_ip;
+
+	dst_ip = INADDR_BROADCAST;
+	src_ip = inet_select_addr(dev, dst_ip, RT_SCOPE_LINK);
+
+	/* No IP? Then nothing to do. */
+	if (src_ip == 0)
+		return 0;
+
+	skb = arp_create(ARPOP_REPLY, ETH_P_ARP,
+			 dst_ip, dev, src_ip,
+			 /*dst_hw*/ NULL, /*src_hw*/ NULL,
+			 /*target_hw*/ dev->dev_addr);
+	if (skb == NULL)
+		return -ENOMEM;
+
+	return dev_queue_xmit(skb);
+}
+
+static int network_open(struct net_device *dev)
+{
+	struct netfront_info *np = netdev_priv(dev);
+
+	memset(&np->stats, 0, sizeof(np->stats));
+
+	spin_lock(&np->rx_lock);
+	if (netif_carrier_ok(dev)) {
+		network_alloc_rx_buffers(dev);
+		np->rx.sring->rsp_event = np->rx.rsp_cons + 1;
+		if (RING_HAS_UNCONSUMED_RESPONSES(&np->rx))
+			netif_rx_schedule(dev);
+	}
+	spin_unlock(&np->rx_lock);
+
+	netif_start_queue(dev);
+
+	return 0;
+}
+
+static inline int netfront_tx_slot_available(struct netfront_info *np)
+{
+	return RING_FREE_REQUESTS(&np->tx) >= MAX_SKB_FRAGS + 2;
+}
+
+static inline void network_maybe_wake_tx(struct net_device *dev)
+{
+	struct netfront_info *np = netdev_priv(dev);
+
+	if (unlikely(netif_queue_stopped(dev)) &&
+	    netfront_tx_slot_available(np) &&
+	    likely(netif_running(dev)))
+		netif_wake_queue(dev);
+}
+
+static void network_tx_buf_gc(struct net_device *dev)
+{
+	RING_IDX cons, prod;
+	unsigned short id;
+	struct netfront_info *np = netdev_priv(dev);
+	struct sk_buff *skb;
+
+	BUG_ON(!netif_carrier_ok(dev));
+
+	do {
+		prod = np->tx.sring->rsp_prod;
+		rmb(); /* Ensure we see responses up to 'rp'. */
+
+		for (cons = np->tx.rsp_cons; cons != prod; cons++) {
+			struct netif_tx_response *txrsp;
+
+			txrsp = RING_GET_RESPONSE(&np->tx, cons);
+			if (txrsp->status == NETIF_RSP_NULL)
+				continue;
+
+			id  = txrsp->id;
+			skb = np->tx_skbs[id];
+			if (unlikely(gnttab_query_foreign_access(
+				np->grant_tx_ref[id]) != 0)) {
+				printk(KERN_ALERT "network_tx_buf_gc: warning "
+				       "-- grant still in use by backend "
+				       "domain.\n");
+				BUG();
+			}
+			gnttab_end_foreign_access_ref(
+				np->grant_tx_ref[id], GNTMAP_readonly);
+			gnttab_release_grant_reference(
+				&np->gref_tx_head, np->grant_tx_ref[id]);
+			np->grant_tx_ref[id] = GRANT_INVALID_REF;
+			add_id_to_freelist(np->tx_skbs, id);
+			dev_kfree_skb_irq(skb);
+		}
+
+		np->tx.rsp_cons = prod;
+
+		/*
+		 * Set a new event, then check for race with update of tx_cons.
+		 * Note that it is essential to schedule a callback, no matter
+		 * how few buffers are pending. Even if there is space in the
+		 * transmit ring, higher layers may be blocked because too much
+		 * data is outstanding: in such cases notification from Xen is
+		 * likely to be the only kick that we'll get.
+		 */
+		np->tx.sring->rsp_event =
+			prod + ((np->tx.sring->req_prod - prod) >> 1) + 1;
+		mb();
+	} while ((cons == prod) && (prod != np->tx.sring->rsp_prod));
+
+	network_maybe_wake_tx(dev);
+}
+
+static void rx_refill_timeout(unsigned long data)
+{
+	struct net_device *dev = (struct net_device *)data;
+	netif_rx_schedule(dev);
+}
+
+static void network_alloc_rx_buffers(struct net_device *dev)
+{
+	unsigned short id;
+	struct netfront_info *np = netdev_priv(dev);
+	struct sk_buff *skb;
+	struct page *page;
+	int i, batch_target, notify;
+	RING_IDX req_prod = np->rx.req_prod_pvt;
+	struct xen_memory_reservation reservation;
+	grant_ref_t ref;
+	unsigned long pfn;
+	void *vaddr;
+	int nr_flips;
+	struct netif_rx_request *req;
+
+	if (unlikely(!netif_carrier_ok(dev)))
+		return;
+
+	/*
+	 * Allocate skbuffs greedily, even though we batch updates to the
+	 * receive ring. This creates a less bursty demand on the memory
+	 * allocator, so should reduce the chance of failed allocation requests
+	 * both for ourself and for other kernel subsystems.
+	 */
+	batch_target = np->rx_target - (req_prod - np->rx.rsp_cons);
+	for (i = skb_queue_len(&np->rx_batch); i < batch_target; i++) {
+		/*
+		 * Allocate an skb and a page. Do not use __dev_alloc_skb as
+		 * that will allocate page-sized buffers which is not
+		 * necessary here.
+		 * 16 bytes added as necessary headroom for netif_receive_skb.
+		 */
+		skb = alloc_skb(RX_COPY_THRESHOLD + 16 + NET_IP_ALIGN,
+				GFP_ATOMIC | __GFP_NOWARN);
+		if (unlikely(!skb))
+			goto no_skb;
+
+		page = alloc_page(GFP_ATOMIC | __GFP_NOWARN);
+		if (!page) {
+			kfree_skb(skb);
+no_skb:
+			/* Any skbuffs queued for refill? Force them out. */
+			if (i != 0)
+				goto refill;
+			/* Could not allocate any skbuffs. Try again later. */
+			mod_timer(&np->rx_refill_timer,
+				  jiffies + (HZ/10));
+			break;
+		}
+
+		skb_reserve(skb, 16 + NET_IP_ALIGN); /* mimic dev_alloc_skb() */
+		skb_shinfo(skb)->frags[0].page = page;
+		skb_shinfo(skb)->nr_frags = 1;
+		__skb_queue_tail(&np->rx_batch, skb);
+	}
+
+	/* Is the batch large enough to be worthwhile? */
+	if (i < (np->rx_target/2)) {
+		if (req_prod > np->rx.sring->req_prod)
+			goto push;
+		return;
+	}
+
+	/* Adjust our fill target if we risked running out of buffers. */
+	if (((req_prod - np->rx.sring->rsp_prod) < (np->rx_target / 4)) &&
+	    ((np->rx_target *= 2) > np->rx_max_target))
+		np->rx_target = np->rx_max_target;
+
+ refill:
+	for (nr_flips = i = 0; ; i++) {
+		if ((skb = __skb_dequeue(&np->rx_batch)) == NULL)
+			break;
+
+		skb->dev = dev;
+
+		id = xennet_rxidx(req_prod + i);
+
+		BUG_ON(np->rx_skbs[id]);
+		np->rx_skbs[id] = skb;
+
+		ref = gnttab_claim_grant_reference(&np->gref_rx_head);
+		BUG_ON((signed short)ref < 0);
+		np->grant_rx_ref[id] = ref;
+
+		pfn = page_to_pfn(skb_shinfo(skb)->frags[0].page);
+		vaddr = page_address(skb_shinfo(skb)->frags[0].page);
+
+		req = RING_GET_REQUEST(&np->rx, req_prod + i);
+		if (!np->copying_receiver) {
+			gnttab_grant_foreign_transfer_ref(ref,
+							  np->xbdev->otherend_id,
+							  pfn);
+			np->rx_pfn_array[nr_flips] = pfn_to_mfn(pfn);
+			if (!xen_feature(XENFEAT_auto_translated_physmap)) {
+				/* Remove this page before passing
+				 * back to Xen. */
+				set_phys_to_machine(pfn, INVALID_P2M_ENTRY);
+				MULTI_update_va_mapping(np->rx_mcl+i,
+							(unsigned long)vaddr,
+							__pte(0), 0);
+			}
+			nr_flips++;
+		} else {
+			gnttab_grant_foreign_access_ref(ref,
+							np->xbdev->otherend_id,
+							pfn_to_mfn(pfn),
+							0);
+		}
+
+		req->id = id;
+		req->gref = ref;
+	}
+
+	if ( nr_flips != 0 ) {
+#ifdef CONFIG_XEN_BALLOON
+		/* Tell the ballon driver what is going on. */
+		balloon_update_driver_allowance(i);
+#endif
+
+		reservation.extent_start = np->rx_pfn_array;
+		reservation.nr_extents   = nr_flips;
+		reservation.extent_order = 0;
+		reservation.address_bits = 0;
+		reservation.domid        = DOMID_SELF;
+
+		if (!xen_feature(XENFEAT_auto_translated_physmap)) {
+			/* After all PTEs have been zapped, flush the TLB. */
+			np->rx_mcl[i-1].args[MULTI_UVMFLAGS_INDEX] =
+				UVMF_TLB_FLUSH|UVMF_ALL;
+
+			/* Give away a batch of pages. */
+			np->rx_mcl[i].op = __HYPERVISOR_memory_op;
+			np->rx_mcl[i].args[0] = XENMEM_decrease_reservation;
+			np->rx_mcl[i].args[1] = (unsigned long)&reservation;
+
+			/* Zap PTEs and give away pages in one big
+			 * multicall. */
+			(void)HYPERVISOR_multicall(np->rx_mcl, i+1);
+
+			/* Check return status of HYPERVISOR_memory_op(). */
+			if (unlikely(np->rx_mcl[i].result != i))
+				panic("Unable to reduce memory reservation\n");
+		} else {
+			if (HYPERVISOR_memory_op(XENMEM_decrease_reservation,
+						 &reservation) != i)
+				panic("Unable to reduce memory reservation\n");
+		}
+	} else {
+		wmb();
+	}
+
+	/* Above is a suitable barrier to ensure backend will see requests. */
+	np->rx.req_prod_pvt = req_prod + i;
+ push:
+	RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&np->rx, notify);
+	if (notify)
+		notify_remote_via_irq(np->irq);
+}
+
+static void xennet_move_rx_slot(struct netfront_info *np, struct sk_buff *skb,
+				grant_ref_t ref)
+{
+	int new = xennet_rxidx(np->rx.req_prod_pvt);
+
+	BUG_ON(np->rx_skbs[new]);
+	np->rx_skbs[new] = skb;
+	np->grant_rx_ref[new] = ref;
+	RING_GET_REQUEST(&np->rx, np->rx.req_prod_pvt)->id = new;
+	RING_GET_REQUEST(&np->rx, np->rx.req_prod_pvt)->gref = ref;
+	np->rx.req_prod_pvt++;
+}
+
+static void xennet_make_frags(struct sk_buff *skb, struct net_device *dev,
+			      struct netif_tx_request *tx)
+{
+	struct netfront_info *np = netdev_priv(dev);
+	char *data = skb->data;
+	unsigned long mfn;
+	RING_IDX prod = np->tx.req_prod_pvt;
+	int frags = skb_shinfo(skb)->nr_frags;
+	unsigned int offset = offset_in_page(data);
+	unsigned int len = skb_headlen(skb);
+	unsigned int id;
+	grant_ref_t ref;
+	int i;
+
+	while (len > PAGE_SIZE - offset) {
+		tx->size = PAGE_SIZE - offset;
+		tx->flags |= NETTXF_more_data;
+		len -= tx->size;
+		data += tx->size;
+		offset = 0;
+
+		id = get_id_from_freelist(np->tx_skbs);
+		np->tx_skbs[id] = skb_get(skb);
+		tx = RING_GET_REQUEST(&np->tx, prod++);
+		tx->id = id;
+		ref = gnttab_claim_grant_reference(&np->gref_tx_head);
+		BUG_ON((signed short)ref < 0);
+
+		mfn = virt_to_mfn(data);
+		gnttab_grant_foreign_access_ref(ref, np->xbdev->otherend_id,
+						mfn, GNTMAP_readonly);
+
+		tx->gref = np->grant_tx_ref[id] = ref;
+		tx->offset = offset;
+		tx->size = len;
+		tx->flags = 0;
+	}
+
+	for (i = 0; i < frags; i++) {
+		skb_frag_t *frag = skb_shinfo(skb)->frags + i;
+
+		tx->flags |= NETTXF_more_data;
+
+		id = get_id_from_freelist(np->tx_skbs);
+		np->tx_skbs[id] = skb_get(skb);
+		tx = RING_GET_REQUEST(&np->tx, prod++);
+		tx->id = id;
+		ref = gnttab_claim_grant_reference(&np->gref_tx_head);
+		BUG_ON((signed short)ref < 0);
+
+		mfn = pfn_to_mfn(page_to_pfn(frag->page));
+		gnttab_grant_foreign_access_ref(ref, np->xbdev->otherend_id,
+						mfn, GNTMAP_readonly);
+
+		tx->gref = np->grant_tx_ref[id] = ref;
+		tx->offset = frag->page_offset;
+		tx->size = frag->size;
+		tx->flags = 0;
+	}
+
+	np->tx.req_prod_pvt = prod;
+}
+
+static int network_start_xmit(struct sk_buff *skb, struct net_device *dev)
+{
+	unsigned short id;
+	struct netfront_info *np = netdev_priv(dev);
+	struct netif_tx_request *tx;
+	struct netif_extra_info *extra;
+	char *data = skb->data;
+	RING_IDX i;
+	grant_ref_t ref;
+	unsigned long mfn;
+	int notify;
+	int frags = skb_shinfo(skb)->nr_frags;
+	unsigned int offset = offset_in_page(data);
+	unsigned int len = skb_headlen(skb);
+
+	frags += (offset + len + PAGE_SIZE - 1) / PAGE_SIZE;
+	if (unlikely(frags > MAX_SKB_FRAGS + 1)) {
+		printk(KERN_ALERT "xennet: skb rides the rocket: %d frags\n",
+		       frags);
+		dump_stack();
+		goto drop;
+	}
+
+	spin_lock_irq(&np->tx_lock);
+
+	if (unlikely(!netif_carrier_ok(dev) ||
+		     (frags > 1 && !xennet_can_sg(dev)) ||
+		     netif_needs_gso(dev, skb))) {
+		spin_unlock_irq(&np->tx_lock);
+		goto drop;
+	}
+
+	i = np->tx.req_prod_pvt;
+
+	id = get_id_from_freelist(np->tx_skbs);
+	np->tx_skbs[id] = skb;
+
+	tx = RING_GET_REQUEST(&np->tx, i);
+
+	tx->id   = id;
+	ref = gnttab_claim_grant_reference(&np->gref_tx_head);
+	BUG_ON((signed short)ref < 0);
+	mfn = virt_to_mfn(data);
+	gnttab_grant_foreign_access_ref(
+		ref, np->xbdev->otherend_id, mfn, GNTMAP_readonly);
+	tx->gref = np->grant_tx_ref[id] = ref;
+	tx->offset = offset;
+	tx->size = len;
+	extra = NULL;
+
+ 	tx->flags = 0;
+ 	if (skb->ip_summed == CHECKSUM_PARTIAL) /* local packet? */
+ 		tx->flags |= NETTXF_csum_blank | NETTXF_data_validated;
+
+	if (skb_shinfo(skb)->gso_size) {
+		struct netif_extra_info *gso = (struct netif_extra_info *)
+			RING_GET_REQUEST(&np->tx, ++i);
+
+		if (extra)
+			extra->flags |= XEN_NETIF_EXTRA_FLAG_MORE;
+		else
+			tx->flags |= NETTXF_extra_info;
+
+		gso->u.gso.size = skb_shinfo(skb)->gso_size;
+		gso->u.gso.type = XEN_NETIF_GSO_TYPE_TCPV4;
+		gso->u.gso.pad = 0;
+		gso->u.gso.features = 0;
+
+		gso->type = XEN_NETIF_EXTRA_TYPE_GSO;
+		gso->flags = 0;
+		extra = gso;
+	}
+
+	np->tx.req_prod_pvt = i + 1;
+
+	xennet_make_frags(skb, dev, tx);
+	tx->size = skb->len;
+
+	RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&np->tx, notify);
+	if (notify)
+		notify_remote_via_irq(np->irq);
+
+	network_tx_buf_gc(dev);
+
+	if (!netfront_tx_slot_available(np))
+		netif_stop_queue(dev);
+
+	spin_unlock_irq(&np->tx_lock);
+
+	np->stats.tx_bytes += skb->len;
+	np->stats.tx_packets++;
+
+	return 0;
+
+ drop:
+	np->stats.tx_dropped++;
+	dev_kfree_skb(skb);
+	return 0;
+}
+
+static irqreturn_t netif_int(int irq, void *dev_id)
+{
+	struct net_device *dev = dev_id;
+	struct netfront_info *np = netdev_priv(dev);
+	unsigned long flags;
+
+	spin_lock_irqsave(&np->tx_lock, flags);
+
+	if (likely(netif_carrier_ok(dev))) {
+		network_tx_buf_gc(dev);
+		/* Under tx_lock: protects access to rx shared-ring indexes. */
+		if (RING_HAS_UNCONSUMED_RESPONSES(&np->rx))
+			netif_rx_schedule(dev);
+	}
+
+	spin_unlock_irqrestore(&np->tx_lock, flags);
+
+	return IRQ_HANDLED;
+}
+
+static void handle_incoming_queue(struct net_device *dev, struct sk_buff_head *rxq)
+{
+	struct sk_buff *skb;
+
+	while ((skb = __skb_dequeue(rxq)) != NULL) {
+		struct page *page = (struct page *)skb->nh.raw;
+		void *vaddr = page_address(page);
+
+		memcpy(skb->data, vaddr + (skb->h.raw - skb->nh.raw),
+		       skb_headlen(skb));
+
+		if (page != skb_shinfo(skb)->frags[0].page)
+			__free_page(page);
+
+		/* Ethernet work: Delayed to here as it peeks the header. */
+		skb->protocol = eth_type_trans(skb, dev);
+
+		/* Pass it up. */
+		netif_receive_skb(skb);
+		dev->last_rx = jiffies;
+	}
+}
+
+int xennet_get_extras(struct netfront_info *np,
+		      struct netif_extra_info *extras, RING_IDX rp)
+
+{
+	struct netif_extra_info *extra;
+	RING_IDX cons = np->rx.rsp_cons;
+	int err = 0;
+
+	do {
+		struct sk_buff *skb;
+		grant_ref_t ref;
+
+		if (unlikely(cons + 1 == rp)) {
+			if (net_ratelimit())
+				WPRINTK("Missing extra info\n");
+			err = -EBADR;
+			break;
+		}
+
+		extra = (struct netif_extra_info *)
+			RING_GET_RESPONSE(&np->rx, ++cons);
+
+		if (unlikely(!extra->type ||
+			     extra->type >= XEN_NETIF_EXTRA_TYPE_MAX)) {
+			if (net_ratelimit())
+				WPRINTK("Invalid extra type: %d\n",
+					extra->type);
+			err = -EINVAL;
+		} else {
+			memcpy(&extras[extra->type - 1], extra,
+			       sizeof(*extra));
+		}
+
+		skb = xennet_get_rx_skb(np, cons);
+		ref = xennet_get_rx_ref(np, cons);
+		xennet_move_rx_slot(np, skb, ref);
+	} while (extra->flags & XEN_NETIF_EXTRA_FLAG_MORE);
+
+	np->rx.rsp_cons = cons;
+	return err;
+}
+
+static int xennet_get_responses(struct netfront_info *np,
+				struct netfront_rx_info *rinfo, RING_IDX rp,
+				struct sk_buff_head *list,
+				int *pages_flipped_p)
+{
+	int pages_flipped = *pages_flipped_p;
+	struct mmu_update *mmu;
+	struct multicall_entry *mcl;
+	struct netif_rx_response *rx = &rinfo->rx;
+	struct netif_extra_info *extras = rinfo->extras;
+	RING_IDX cons = np->rx.rsp_cons;
+	struct sk_buff *skb = xennet_get_rx_skb(np, cons);
+	grant_ref_t ref = xennet_get_rx_ref(np, cons);
+	int max = MAX_SKB_FRAGS + (rx->status <= RX_COPY_THRESHOLD);
+	int frags = 1;
+	int err = 0;
+	unsigned long ret;
+
+	if (rx->flags & NETRXF_extra_info) {
+		err = xennet_get_extras(np, extras, rp);
+		cons = np->rx.rsp_cons;
+	}
+
+	for (;;) {
+		unsigned long mfn;
+
+		if (unlikely(rx->status < 0 ||
+			     rx->offset + rx->status > PAGE_SIZE)) {
+			if (net_ratelimit())
+				WPRINTK("rx->offset: %x, size: %u\n",
+					rx->offset, rx->status);
+			xennet_move_rx_slot(np, skb, ref);
+			err = -EINVAL;
+			goto next;
+		}
+
+		/*
+		 * This definitely indicates a bug, either in this driver or in
+		 * the backend driver. In future this should flag the bad
+		 * situation to the system controller to reboot the backed.
+		 */
+		if (ref == GRANT_INVALID_REF) {
+			if (net_ratelimit())
+				WPRINTK("Bad rx response id %d.\n", rx->id);
+			err = -EINVAL;
+			goto next;
+		}
+
+		if (!np->copying_receiver) {
+			/* Memory pressure, insufficient buffer
+			 * headroom, ... */
+			mfn = gnttab_end_foreign_transfer_ref(ref);
+			if (!mfn) {
+				if (net_ratelimit())
+					WPRINTK("Unfulfilled rx req "
+						"(id=%d, st=%d).\n",
+						rx->id, rx->status);
+				xennet_move_rx_slot(np, skb, ref);
+				err = -ENOMEM;
+				goto next;
+			}
+
+			if (!xen_feature(XENFEAT_auto_translated_physmap)) {
+				/* Remap the page. */
+				struct page *page =
+					skb_shinfo(skb)->frags[0].page;
+				unsigned long pfn = page_to_pfn(page);
+				void *vaddr = page_address(page);
+
+				mcl = np->rx_mcl + pages_flipped;
+				mmu = np->rx_mmu + pages_flipped;
+
+				MULTI_update_va_mapping(mcl,
+							(unsigned long)vaddr,
+							mfn_pte(mfn, PAGE_KERNEL),
+							0);
+				mmu->ptr = ((u64)mfn << PAGE_SHIFT)
+					| MMU_MACHPHYS_UPDATE;
+				mmu->val = pfn;
+
+				set_phys_to_machine(pfn, mfn);
+			}
+			pages_flipped++;
+		} else {
+			ret = gnttab_end_foreign_access_ref(ref, 0);
+			BUG_ON(!ret);
+		}
+
+		gnttab_release_grant_reference(&np->gref_rx_head, ref);
+
+		__skb_queue_tail(list, skb);
+
+next:
+		if (!(rx->flags & NETRXF_more_data))
+			break;
+
+		if (cons + frags == rp) {
+			if (net_ratelimit())
+				WPRINTK("Need more frags\n");
+			err = -ENOENT;
+			break;
+		}
+
+		rx = RING_GET_RESPONSE(&np->rx, cons + frags);
+		skb = xennet_get_rx_skb(np, cons + frags);
+		ref = xennet_get_rx_ref(np, cons + frags);
+		frags++;
+	}
+
+	if (unlikely(frags > max)) {
+		if (net_ratelimit())
+			WPRINTK("Too many frags\n");
+		err = -E2BIG;
+	}
+
+	if (unlikely(err))
+		np->rx.rsp_cons = cons + frags;
+
+	*pages_flipped_p = pages_flipped;
+
+	return err;
+}
+
+static RING_IDX xennet_fill_frags(struct netfront_info *np,
+				  struct sk_buff *skb,
+				  struct sk_buff_head *list)
+{
+	struct skb_shared_info *shinfo = skb_shinfo(skb);
+	int nr_frags = shinfo->nr_frags;
+	RING_IDX cons = np->rx.rsp_cons;
+	skb_frag_t *frag = shinfo->frags + nr_frags;
+	struct sk_buff *nskb;
+
+	while ((nskb = __skb_dequeue(list))) {
+		struct netif_rx_response *rx =
+			RING_GET_RESPONSE(&np->rx, ++cons);
+
+		frag->page = skb_shinfo(nskb)->frags[0].page;
+		frag->page_offset = rx->offset;
+		frag->size = rx->status;
+
+		skb->data_len += rx->status;
+
+		skb_shinfo(nskb)->nr_frags = 0;
+		kfree_skb(nskb);
+
+		frag++;
+		nr_frags++;
+	}
+
+	shinfo->nr_frags = nr_frags;
+	return cons;
+}
+
+static int xennet_set_skb_gso(struct sk_buff *skb, struct netif_extra_info *gso)
+{
+	if (!gso->u.gso.size) {
+		if (net_ratelimit())
+			WPRINTK("GSO size must not be zero.\n");
+		return -EINVAL;
+	}
+
+	/* Currently only TCPv4 S.O. is supported. */
+	if (gso->u.gso.type != XEN_NETIF_GSO_TYPE_TCPV4) {
+		if (net_ratelimit())
+			WPRINTK("Bad GSO type %d.\n", gso->u.gso.type);
+		return -EINVAL;
+	}
+
+	skb_shinfo(skb)->gso_size = gso->u.gso.size;
+	skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4;
+
+	/* Header must be checked, and gso_segs computed. */
+	skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY;
+	skb_shinfo(skb)->gso_segs = 0;
+
+	return 0;
+}
+
+static int netif_poll(struct net_device *dev, int *pbudget)
+{
+	struct netfront_info *np = netdev_priv(dev);
+	struct sk_buff *skb;
+	struct netfront_rx_info rinfo;
+	struct netif_rx_response *rx = &rinfo.rx;
+	struct netif_extra_info *extras = rinfo.extras;
+	RING_IDX i, rp;
+	struct multicall_entry *mcl;
+	int work_done, budget, more_to_do = 1;
+	struct sk_buff_head rxq;
+	struct sk_buff_head errq;
+	struct sk_buff_head tmpq;
+	unsigned long flags;
+	unsigned int len;
+	int pages_flipped = 0;
+	int err;
+
+	spin_lock(&np->rx_lock);
+
+	if (unlikely(!netif_carrier_ok(dev))) {
+		spin_unlock(&np->rx_lock);
+		return 0;
+	}
+
+	skb_queue_head_init(&rxq);
+	skb_queue_head_init(&errq);
+	skb_queue_head_init(&tmpq);
+
+	if ((budget = *pbudget) > dev->quota)
+		budget = dev->quota;
+	rp = np->rx.sring->rsp_prod;
+	rmb(); /* Ensure we see queued responses up to 'rp'. */
+
+	i = np->rx.rsp_cons;
+	work_done = 0;
+	while ((i != rp) && (work_done < budget)) {
+		memcpy(rx, RING_GET_RESPONSE(&np->rx, i), sizeof(*rx));
+		memset(extras, 0, sizeof(extras));
+
+		err = xennet_get_responses(np, &rinfo, rp, &tmpq,
+					   &pages_flipped);
+
+		if (unlikely(err)) {
+err:
+			while ((skb = __skb_dequeue(&tmpq)))
+				__skb_queue_tail(&errq, skb);
+			np->stats.rx_errors++;
+			i = np->rx.rsp_cons;
+			continue;
+		}
+
+		skb = __skb_dequeue(&tmpq);
+
+		if (extras[XEN_NETIF_EXTRA_TYPE_GSO - 1].type) {
+			struct netif_extra_info *gso;
+			gso = &extras[XEN_NETIF_EXTRA_TYPE_GSO - 1];
+
+			if (unlikely(xennet_set_skb_gso(skb, gso))) {
+				__skb_queue_head(&tmpq, skb);
+				np->rx.rsp_cons += skb_queue_len(&tmpq);
+				goto err;
+			}
+		}
+
+		skb->nh.raw = (void *)skb_shinfo(skb)->frags[0].page;
+		skb->h.raw = skb->nh.raw + rx->offset;
+
+		len = rx->status;
+		if (len > RX_COPY_THRESHOLD)
+			len = RX_COPY_THRESHOLD;
+		skb_put(skb, len);
+
+		if (rx->status > len) {
+			skb_shinfo(skb)->frags[0].page_offset =
+				rx->offset + len;
+			skb_shinfo(skb)->frags[0].size = rx->status - len;
+			skb->data_len = rx->status - len;
+		} else {
+			skb_shinfo(skb)->frags[0].page = NULL;
+			skb_shinfo(skb)->nr_frags = 0;
+		}
+
+		i = xennet_fill_frags(np, skb, &tmpq);
+
+		/*
+		 * Truesize must approximates the size of true data plus
+		 * any supervisor overheads. Adding hypervisor overheads
+		 * has been shown to significantly reduce achievable
+		 * bandwidth with the default receive buffer size. It is
+		 * therefore not wise to account for it here.
+		 *
+		 * After alloc_skb(RX_COPY_THRESHOLD), truesize is set to
+		 * RX_COPY_THRESHOLD + the supervisor overheads. Here, we
+		 * add the size of the data pulled in xennet_fill_frags().
+		 *
+		 * We also adjust for any unused space in the main data
+		 * area by subtracting (RX_COPY_THRESHOLD - len). This is
+		 * especially important with drivers which split incoming
+		 * packets into header and data, using only 66 bytes of
+		 * the main data area (see the e1000 driver for example.)
+		 * On such systems, without this last adjustement, our
+		 * achievable receive throughout using the standard receive
+		 * buffer size was cut by 25%(!!!).
+		 */
+		skb->truesize += skb->data_len - (RX_COPY_THRESHOLD - len);
+		skb->len += skb->data_len;
+
+		if (rx->flags & NETRXF_data_validated)
+			skb->ip_summed = CHECKSUM_UNNECESSARY;
+
+		np->stats.rx_packets++;
+		np->stats.rx_bytes += skb->len;
+
+		__skb_queue_tail(&rxq, skb);
+
+		np->rx.rsp_cons = ++i;
+		work_done++;
+	}
+
+	if (pages_flipped) {
+#ifdef CONFIG_XEN_BALLOON
+		/* Some pages are no longer absent... */
+		balloon_update_driver_allowance(-pages_flipped);
+#endif
+		/* Do all the remapping work, and M2P updates. */
+		if (!xen_feature(XENFEAT_auto_translated_physmap)) {
+			mcl = np->rx_mcl + pages_flipped;
+			MULTI_mmu_update(mcl, np->rx_mmu,
+					 pages_flipped, 0, DOMID_SELF);
+			(void)HYPERVISOR_multicall(np->rx_mcl,
+						   pages_flipped + 1);
+		}
+	}
+
+	while ((skb = __skb_dequeue(&errq)))
+		kfree_skb(skb);
+
+	handle_incoming_queue(dev, &rxq);
+
+	/* If we get a callback with very few responses, reduce fill target. */
+	/* NB. Note exponential increase, linear decrease. */
+	if (((np->rx.req_prod_pvt - np->rx.sring->rsp_prod) >
+	     ((3*np->rx_target) / 4)) &&
+	    (--np->rx_target < np->rx_min_target))
+		np->rx_target = np->rx_min_target;
+
+	network_alloc_rx_buffers(dev);
+
+	*pbudget   -= work_done;
+	dev->quota -= work_done;
+
+	if (work_done < budget) {
+		local_irq_save(flags);
+
+		RING_FINAL_CHECK_FOR_RESPONSES(&np->rx, more_to_do);
+		if (!more_to_do)
+			__netif_rx_complete(dev);
+
+		local_irq_restore(flags);
+	}
+
+	spin_unlock(&np->rx_lock);
+
+	return more_to_do;
+}
+
+static void netif_release_tx_bufs(struct netfront_info *np)
+{
+	struct sk_buff *skb;
+	int i;
+
+	for (i = 1; i <= NET_TX_RING_SIZE; i++) {
+		if ((unsigned long)np->tx_skbs[i] < PAGE_OFFSET)
+			continue;
+
+		skb = np->tx_skbs[i];
+		gnttab_end_foreign_access_ref(
+			np->grant_tx_ref[i], GNTMAP_readonly);
+		gnttab_release_grant_reference(
+			&np->gref_tx_head, np->grant_tx_ref[i]);
+		np->grant_tx_ref[i] = GRANT_INVALID_REF;
+		add_id_to_freelist(np->tx_skbs, i);
+		dev_kfree_skb_irq(skb);
+	}
+}
+
+static void netif_release_rx_bufs(struct netfront_info *np)
+{
+	struct mmu_update      *mmu = np->rx_mmu;
+	struct multicall_entry *mcl = np->rx_mcl;
+	struct sk_buff_head free_list;
+	struct sk_buff *skb;
+	unsigned long mfn;
+	int xfer = 0, noxfer = 0, unused = 0;
+	int id, ref;
+
+	if (np->copying_receiver) {
+		printk("%s: fix me for copying receiver.\n", __FUNCTION__);
+		return;
+	}
+
+	skb_queue_head_init(&free_list);
+
+	spin_lock(&np->rx_lock);
+
+	for (id = 0; id < NET_RX_RING_SIZE; id++) {
+		if ((ref = np->grant_rx_ref[id]) == GRANT_INVALID_REF) {
+			unused++;
+			continue;
+		}
+
+		skb = np->rx_skbs[id];
+		mfn = gnttab_end_foreign_transfer_ref(ref);
+		gnttab_release_grant_reference(&np->gref_rx_head, ref);
+		np->grant_rx_ref[id] = GRANT_INVALID_REF;
+		add_id_to_freelist(np->rx_skbs, id);
+
+		if (0 == mfn) {
+#ifdef CONFIG_XEN_BALLOON
+			struct page *page = skb_shinfo(skb)->frags[0].page;
+			balloon_release_driver_page(page);
+#endif
+			skb_shinfo(skb)->nr_frags = 0;
+			dev_kfree_skb(skb);
+			noxfer++;
+			continue;
+		}
+
+		if (!xen_feature(XENFEAT_auto_translated_physmap)) {
+			/* Remap the page. */
+			struct page *page = skb_shinfo(skb)->frags[0].page;
+			unsigned long pfn = page_to_pfn(page);
+			void *vaddr = page_address(page);
+
+			MULTI_update_va_mapping(mcl, (unsigned long)vaddr,
+						mfn_pte(mfn, PAGE_KERNEL),
+						0);
+			mcl++;
+			mmu->ptr = ((u64)mfn << PAGE_SHIFT)
+				| MMU_MACHPHYS_UPDATE;
+			mmu->val = pfn;
+			mmu++;
+
+			set_phys_to_machine(pfn, mfn);
+		}
+		__skb_queue_tail(&free_list, skb);
+		xfer++;
+	}
+
+	printk("%s: %d xfer, %d noxfer, %d unused\n",
+	       __FUNCTION__, xfer, noxfer, unused);
+
+	if (xfer) {
+#ifdef CONFIG_XEN_BALLOON
+		/* Some pages are no longer absent... */
+		balloon_update_driver_allowance(-xfer);
+#endif
+
+		if (!xen_feature(XENFEAT_auto_translated_physmap)) {
+			/* Do all the remapping work and M2P updates. */
+			mcl->op = __HYPERVISOR_mmu_update;
+			mcl->args[0] = (unsigned long)np->rx_mmu;
+			mcl->args[1] = mmu - np->rx_mmu;
+			mcl->args[2] = 0;
+			mcl->args[3] = DOMID_SELF;
+			mcl++;
+			HYPERVISOR_multicall(np->rx_mcl, mcl - np->rx_mcl);
+		}
+	}
+
+	while ((skb = __skb_dequeue(&free_list)) != NULL)
+		dev_kfree_skb(skb);
+
+	spin_unlock(&np->rx_lock);
+}
+
+static int network_close(struct net_device *dev)
+{
+	struct netfront_info *np = netdev_priv(dev);
+	netif_stop_queue(np->netdev);
+	return 0;
+}
+
+
+static struct net_device_stats *network_get_stats(struct net_device *dev)
+{
+	struct netfront_info *np = netdev_priv(dev);
+	return &np->stats;
+}
+
+static int xennet_change_mtu(struct net_device *dev, int mtu)
+{
+	int max = xennet_can_sg(dev) ? 65535 - ETH_HLEN : ETH_DATA_LEN;
+
+	if (mtu > max)
+		return -EINVAL;
+	dev->mtu = mtu;
+	return 0;
+}
+
+static int xennet_set_sg(struct net_device *dev, u32 data)
+{
+	if (data) {
+		struct netfront_info *np = netdev_priv(dev);
+		int val;
+
+		if (xenbus_scanf(XBT_NIL, np->xbdev->otherend, "feature-sg",
+				 "%d", &val) < 0)
+			val = 0;
+		if (!val)
+			return -ENOSYS;
+	} else if (dev->mtu > ETH_DATA_LEN)
+		dev->mtu = ETH_DATA_LEN;
+
+	return ethtool_op_set_sg(dev, data);
+}
+
+static int xennet_set_tso(struct net_device *dev, u32 data)
+{
+	if (data) {
+		struct netfront_info *np = netdev_priv(dev);
+		int val;
+
+		if (xenbus_scanf(XBT_NIL, np->xbdev->otherend,
+				 "feature-gso-tcpv4", "%d", &val) < 0)
+			val = 0;
+		if (!val)
+			return -ENOSYS;
+	}
+
+	return ethtool_op_set_tso(dev, data);
+}
+
+static void xennet_set_features(struct net_device *dev)
+{
+	/* Turn off all GSO bits except ROBUST. */
+	dev->features &= (1 << NETIF_F_GSO_SHIFT) - 1;
+	dev->features |= NETIF_F_GSO_ROBUST;
+	xennet_set_sg(dev, 0);
+
+	/* We need checksum offload to enable scatter/gather and TSO. */
+	if (!(dev->features & NETIF_F_IP_CSUM))
+		return;
+
+	if (!xennet_set_sg(dev, 1))
+		xennet_set_tso(dev, 1);
+}
+
+static int network_connect(struct net_device *dev)
+{
+	struct netfront_info *np = netdev_priv(dev);
+	int i, requeue_idx, err;
+	struct sk_buff *skb;
+	grant_ref_t ref;
+	struct netif_rx_request *req;
+	unsigned int feature_rx_copy, feature_rx_flip;
+
+	err = xenbus_scanf(XBT_NIL, np->xbdev->otherend,
+			   "feature-rx-copy", "%u", &feature_rx_copy);
+	if (err != 1)
+		feature_rx_copy = 0;
+	err = xenbus_scanf(XBT_NIL, np->xbdev->otherend,
+			   "feature-rx-flip", "%u", &feature_rx_flip);
+	if (err != 1)
+		feature_rx_flip = 1;
+
+	/*
+	 * Copy packets on receive path if:
+	 *  (a) This was requested by user, and the backend supports it; or
+	 *  (b) Flipping was requested, but this is unsupported by the backend.
+	 */
+	np->copying_receiver = ((MODPARM_rx_copy && feature_rx_copy) ||
+				(MODPARM_rx_flip && !feature_rx_flip));
+
+	err = talk_to_backend(np->xbdev, np);
+	if (err)
+		return err;
+
+	xennet_set_features(dev);
+
+	IPRINTK("device %s has %sing receive path.\n",
+		dev->name, np->copying_receiver ? "copy" : "flipp");
+
+	spin_lock_irq(&np->tx_lock);
+	spin_lock(&np->rx_lock);
+
+	/*
+	 * Recovery procedure:
+	 *  NB. Freelist index entries are always going to be less than
+	 *  PAGE_OFFSET, whereas pointers to skbs will always be equal or
+	 *  greater than PAGE_OFFSET: we use this property to distinguish
+	 *  them.
+	 */
+
+	/* Step 1: Discard all pending TX packet fragments. */
+	netif_release_tx_bufs(np);
+
+	/* Step 2: Rebuild the RX buffer freelist and the RX ring itself. */
+	for (requeue_idx = 0, i = 0; i < NET_RX_RING_SIZE; i++) {
+		if (!np->rx_skbs[i])
+			continue;
+
+		skb = np->rx_skbs[requeue_idx] = xennet_get_rx_skb(np, i);
+		ref = np->grant_rx_ref[requeue_idx] = xennet_get_rx_ref(np, i);
+		req = RING_GET_REQUEST(&np->rx, requeue_idx);
+
+		if (!np->copying_receiver) {
+			gnttab_grant_foreign_transfer_ref(
+				ref, np->xbdev->otherend_id,
+				page_to_pfn(skb_shinfo(skb)->frags->page));
+		} else {
+			gnttab_grant_foreign_access_ref(
+				ref, np->xbdev->otherend_id,
+				pfn_to_mfn(page_to_pfn(skb_shinfo(skb)->
+						       frags->page)),
+				0);
+		}
+		req->gref = ref;
+		req->id   = requeue_idx;
+
+		requeue_idx++;
+	}
+
+	np->rx.req_prod_pvt = requeue_idx;
+
+	/*
+	 * Step 3: All public and private state should now be sane.  Get
+	 * ready to start sending and receiving packets and give the driver
+	 * domain a kick because we've probably just requeued some
+	 * packets.
+	 */
+	netif_carrier_on(dev);
+	notify_remote_via_irq(np->irq);
+	network_tx_buf_gc(dev);
+	network_alloc_rx_buffers(dev);
+
+	spin_unlock(&np->rx_lock);
+	spin_unlock_irq(&np->tx_lock);
+
+	return 0;
+}
+
+static void netif_uninit(struct net_device *dev)
+{
+	struct netfront_info *np = netdev_priv(dev);
+	netif_release_tx_bufs(np);
+	netif_release_rx_bufs(np);
+	gnttab_free_grant_references(np->gref_tx_head);
+	gnttab_free_grant_references(np->gref_rx_head);
+}
+
+static struct ethtool_ops network_ethtool_ops =
+{
+	.get_tx_csum = ethtool_op_get_tx_csum,
+	.set_tx_csum = ethtool_op_set_tx_csum,
+	.get_sg = ethtool_op_get_sg,
+	.set_sg = xennet_set_sg,
+	.get_tso = ethtool_op_get_tso,
+	.set_tso = xennet_set_tso,
+	.get_link = ethtool_op_get_link,
+};
+
+#ifdef CONFIG_SYSFS
+static ssize_t show_rxbuf_min(struct device *dev,
+			      struct device_attribute *attr, char *buf)
+{
+	struct net_device *netdev = to_net_dev(dev);
+	struct netfront_info *info = netdev_priv(netdev);
+
+	return sprintf(buf, "%u\n", info->rx_min_target);
+}
+
+static ssize_t store_rxbuf_min(struct device *dev,
+			       struct device_attribute *attr,
+			       const char *buf, size_t len)
+{
+	struct net_device *netdev = to_net_dev(dev);
+	struct netfront_info *np = netdev_priv(netdev);
+	char *endp;
+	unsigned long target;
+
+	if (!capable(CAP_NET_ADMIN))
+		return -EPERM;
+
+	target = simple_strtoul(buf, &endp, 0);
+	if (endp == buf)
+		return -EBADMSG;
+
+	if (target < RX_MIN_TARGET)
+		target = RX_MIN_TARGET;
+	if (target > RX_MAX_TARGET)
+		target = RX_MAX_TARGET;
+
+	spin_lock(&np->rx_lock);
+	if (target > np->rx_max_target)
+		np->rx_max_target = target;
+	np->rx_min_target = target;
+	if (target > np->rx_target)
+		np->rx_target = target;
+
+	network_alloc_rx_buffers(netdev);
+
+	spin_unlock(&np->rx_lock);
+	return len;
+}
+
+static ssize_t show_rxbuf_max(struct device *dev,
+			      struct device_attribute *attr, char *buf)
+{
+	struct net_device *netdev = to_net_dev(dev);
+	struct netfront_info *info = netdev_priv(netdev);
+
+	return sprintf(buf, "%u\n", info->rx_max_target);
+}
+
+static ssize_t store_rxbuf_max(struct device *dev,
+			       struct device_attribute *attr,
+			       const char *buf, size_t len)
+{
+	struct net_device *netdev = to_net_dev(dev);
+	struct netfront_info *np = netdev_priv(netdev);
+	char *endp;
+	unsigned long target;
+
+	if (!capable(CAP_NET_ADMIN))
+		return -EPERM;
+
+	target = simple_strtoul(buf, &endp, 0);
+	if (endp == buf)
+		return -EBADMSG;
+
+	if (target < RX_MIN_TARGET)
+		target = RX_MIN_TARGET;
+	if (target > RX_MAX_TARGET)
+		target = RX_MAX_TARGET;
+
+	spin_lock(&np->rx_lock);
+	if (target < np->rx_min_target)
+		np->rx_min_target = target;
+	np->rx_max_target = target;
+	if (target < np->rx_target)
+		np->rx_target = target;
+
+	network_alloc_rx_buffers(netdev);
+
+	spin_unlock(&np->rx_lock);
+	return len;
+}
+
+static ssize_t show_rxbuf_cur(struct device *dev,
+			      struct device_attribute *attr, char *buf)
+{
+	struct net_device *netdev = to_net_dev(dev);
+	struct netfront_info *info = netdev_priv(netdev);
+
+	return sprintf(buf, "%u\n", info->rx_target);
+}
+
+static struct device_attribute xennet_attrs[] = {
+	__ATTR(rxbuf_min, S_IRUGO|S_IWUSR, show_rxbuf_min, store_rxbuf_min),
+	__ATTR(rxbuf_max, S_IRUGO|S_IWUSR, show_rxbuf_max, store_rxbuf_max),
+	__ATTR(rxbuf_cur, S_IRUGO, show_rxbuf_cur, NULL),
+};
+
+static int xennet_sysfs_addif(struct net_device *netdev)
+{
+	int i;
+	int error = 0;
+
+	for (i = 0; i < ARRAY_SIZE(xennet_attrs); i++) {
+		error = device_create_file(&netdev->dev,
+					   &xennet_attrs[i]);
+		if (error)
+			goto fail;
+	}
+	return 0;
+
+ fail:
+	while (--i >= 0)
+		device_remove_file(&netdev->dev, &xennet_attrs[i]);
+	return error;
+}
+
+static void xennet_sysfs_delif(struct net_device *netdev)
+{
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(xennet_attrs); i++) {
+		device_remove_file(&netdev->dev, &xennet_attrs[i]);
+	}
+}
+
+#endif /* CONFIG_SYSFS */
+
+
+/*
+ * Nothing to do here. Virtual interface is point-to-point and the
+ * physical interface is probably promiscuous anyway.
+ */
+static void network_set_multicast_list(struct net_device *dev)
+{
+}
+
+static struct net_device * __devinit create_netdev(struct xenbus_device *dev)
+{
+	int i, err = 0;
+	struct net_device *netdev = NULL;
+	struct netfront_info *np = NULL;
+
+	netdev = alloc_etherdev(sizeof(struct netfront_info));
+	if (!netdev) {
+		printk(KERN_WARNING "%s> alloc_etherdev failed.\n",
+		       __FUNCTION__);
+		return ERR_PTR(-ENOMEM);
+	}
+
+	np                   = netdev_priv(netdev);
+	np->xbdev            = dev;
+
+	netif_carrier_off(netdev);
+
+	spin_lock_init(&np->tx_lock);
+	spin_lock_init(&np->rx_lock);
+
+	skb_queue_head_init(&np->rx_batch);
+	np->rx_target     = RX_DFL_MIN_TARGET;
+	np->rx_min_target = RX_DFL_MIN_TARGET;
+	np->rx_max_target = RX_MAX_TARGET;
+
+	init_timer(&np->rx_refill_timer);
+	np->rx_refill_timer.data = (unsigned long)netdev;
+	np->rx_refill_timer.function = rx_refill_timeout;
+
+	/* Initialise {tx,rx}_skbs as a free chain containing every entry. */
+	for (i = 0; i <= NET_TX_RING_SIZE; i++) {
+		np->tx_skbs[i] = (void *)((unsigned long) i+1);
+		np->grant_tx_ref[i] = GRANT_INVALID_REF;
+	}
+
+	for (i = 0; i < NET_RX_RING_SIZE; i++) {
+		np->rx_skbs[i] = NULL;
+		np->grant_rx_ref[i] = GRANT_INVALID_REF;
+	}
+
+	/* A grant for every tx ring slot */
+	if (gnttab_alloc_grant_references(TX_MAX_TARGET,
+					  &np->gref_tx_head) < 0) {
+		printk(KERN_ALERT "#### netfront can't alloc tx grant refs\n");
+		err = -ENOMEM;
+		goto exit;
+	}
+	/* A grant for every rx ring slot */
+	if (gnttab_alloc_grant_references(RX_MAX_TARGET,
+					  &np->gref_rx_head) < 0) {
+		printk(KERN_ALERT "#### netfront can't alloc rx grant refs\n");
+		err = -ENOMEM;
+		goto exit_free_tx;
+	}
+
+	netdev->open            = network_open;
+	netdev->hard_start_xmit = network_start_xmit;
+	netdev->stop            = network_close;
+	netdev->get_stats       = network_get_stats;
+	netdev->poll            = netif_poll;
+	netdev->set_multicast_list = network_set_multicast_list;
+	netdev->uninit          = netif_uninit;
+	netdev->change_mtu	= xennet_change_mtu;
+	netdev->weight          = 64;
+	netdev->features        = NETIF_F_IP_CSUM;
+
+	SET_ETHTOOL_OPS(netdev, &network_ethtool_ops);
+	SET_MODULE_OWNER(netdev);
+	SET_NETDEV_DEV(netdev, &dev->dev);
+
+	np->netdev = netdev;
+	return netdev;
+
+ exit_free_tx:
+	gnttab_free_grant_references(np->gref_tx_head);
+ exit:
+	free_netdev(netdev);
+	return ERR_PTR(err);
+}
+
+/*
+ * We use this notifier to send out a fake ARP reply to reset switches and
+ * router ARP caches when an IP interface is brought up on a VIF.
+ */
+static int
+inetdev_notify(struct notifier_block *this, unsigned long event, void *ptr)
+{
+	struct in_ifaddr  *ifa = (struct in_ifaddr *)ptr;
+	struct net_device *dev = ifa->ifa_dev->dev;
+
+	/* UP event and is it one of our devices? */
+	if (event == NETDEV_UP && dev->open == network_open)
+		(void)send_fake_arp(dev);
+
+	return NOTIFY_DONE;
+}
+
+
+/* ** Close down ** */
+
+
+/**
+ * Handle the change of state of the backend to Closing.  We must delete our
+ * device-layer structures now, to ensure that writes are flushed through to
+ * the backend.  Once is this done, we can switch to Closed in
+ * acknowledgement.
+ */
+static void netfront_closing(struct xenbus_device *dev)
+{
+	struct netfront_info *info = dev->dev.driver_data;
+
+	DPRINTK("%s\n", dev->nodename);
+
+	close_netdev(info);
+	xenbus_frontend_closed(dev);
+}
+
+
+static int __devexit netfront_remove(struct xenbus_device *dev)
+{
+	struct netfront_info *info = dev->dev.driver_data;
+
+	DPRINTK("%s\n", dev->nodename);
+
+	netif_disconnect_backend(info);
+	free_netdev(info->netdev);
+
+	return 0;
+}
+
+
+static int open_netdev(struct netfront_info *info)
+{
+	int err;
+
+	err = register_netdev(info->netdev);
+	if (err) {
+		printk(KERN_WARNING "%s: register_netdev err=%d\n",
+		       __FUNCTION__, err);
+		return err;
+	}
+
+	err = xennet_sysfs_addif(info->netdev);
+	if (err) {
+		unregister_netdev(info->netdev);
+		printk(KERN_WARNING "%s: add sysfs failed err=%d\n",
+		       __FUNCTION__, err);
+		return err;
+	}
+
+	return 0;
+}
+
+static void close_netdev(struct netfront_info *info)
+{
+	del_timer_sync(&info->rx_refill_timer);
+
+	xennet_sysfs_delif(info->netdev);
+	unregister_netdev(info->netdev);
+}
+
+
+static void netif_disconnect_backend(struct netfront_info *info)
+{
+	/* Stop old i/f to prevent errors whilst we rebuild the state. */
+	spin_lock_irq(&info->tx_lock);
+	spin_lock(&info->rx_lock);
+	netif_carrier_off(info->netdev);
+	spin_unlock(&info->rx_lock);
+	spin_unlock_irq(&info->tx_lock);
+
+	if (info->irq)
+		unbind_from_irqhandler(info->irq, info->netdev);
+	info->evtchn = info->irq = 0;
+
+	end_access(info->tx_ring_ref, info->tx.sring);
+	end_access(info->rx_ring_ref, info->rx.sring);
+	info->tx_ring_ref = GRANT_INVALID_REF;
+	info->rx_ring_ref = GRANT_INVALID_REF;
+	info->tx.sring = NULL;
+	info->rx.sring = NULL;
+}
+
+
+static void end_access(int ref, void *page)
+{
+	if (ref != GRANT_INVALID_REF)
+		gnttab_end_foreign_access(ref, 0, (unsigned long)page);
+}
+
+
+/* ** Driver registration ** */
+
+
+static struct xenbus_device_id netfront_ids[] = {
+	{ "vif" },
+	{ "" }
+};
+
+
+static struct xenbus_driver netfront = {
+	.name = "vif",
+	.owner = THIS_MODULE,
+	.ids = netfront_ids,
+	.probe = netfront_probe,
+	.remove = __devexit_p(netfront_remove),
+	.resume = netfront_resume,
+	.otherend_changed = backend_changed,
+};
+
+
+static struct notifier_block notifier_inetdev = {
+	.notifier_call  = inetdev_notify,
+};
+
+static int __init netif_init(void)
+{
+	if (!is_running_on_xen())
+		return -ENODEV;
+
+#ifdef CONFIG_XEN
+	if (MODPARM_rx_flip && MODPARM_rx_copy) {
+		WPRINTK("Cannot specify both rx_copy and rx_flip.\n");
+		return -EINVAL;
+	}
+
+	if (!MODPARM_rx_flip && !MODPARM_rx_copy)
+		MODPARM_rx_flip = 1; /* Default is to flip. */
+#endif
+
+	if (is_initial_xendomain())
+		return 0;
+
+	IPRINTK("Initialising virtual ethernet driver.\n");
+
+	(void)register_inetaddr_notifier(&notifier_inetdev);
+
+	return xenbus_register_frontend(&netfront);
+}
+module_init(netif_init);
+
+
+static void __exit netif_exit(void)
+{
+	if (is_initial_xendomain())
+		return;
+
+	unregister_inetaddr_notifier(&notifier_inetdev);
+
+	return xenbus_unregister_driver(&netfront);
+}
+module_exit(netif_exit);
+
+MODULE_LICENSE("Dual BSD/GPL");
===================================================================
--- a/include/xen/events.h
+++ b/include/xen/events.h
@@ -2,6 +2,8 @@
 #define _XEN_EVENTS_H
 
 #include <linux/irq.h>
+#include <xen/interface/event_channel.h>
+#include <asm/xen/hypercall.h>
 
 int bind_evtchn_to_irq(unsigned int evtchn);
 int bind_evtchn_to_irqhandler(unsigned int evtchn,

-- 


^ permalink raw reply	[flat|nested] 93+ messages in thread

* Re: [patch 26/26] Xen-paravirt_ops: Add the Xen virtual network device driver.
  2007-03-01 23:25 ` [patch 26/26] Xen-paravirt_ops: Add the Xen virtual network device driver Jeremy Fitzhardinge
@ 2007-03-02  0:42   ` Stephen Hemminger
  2007-03-02  0:56     ` Jeremy Fitzhardinge
  2007-03-02  1:21     ` [patch 26/26] Xen-paravirt_ops: Add the Xen virtual network device driver Christoph Hellwig
  0 siblings, 2 replies; 93+ messages in thread
From: Stephen Hemminger @ 2007-03-02  0:42 UTC (permalink / raw)
  To: Jeremy Fitzhardinge
  Cc: xen-devel, Jeff Garzik, Ian Pratt, netdev, linux-kernel,
	Chris Wright, Andi Kleen, Zachary, virtualization, Andrew Morton

On Thu, 01 Mar 2007 15:25:09 -0800
Jeremy Fitzhardinge <jeremy@goop.org> wrote:

> The network device frontend driver allows the kernel to access network
> devices exported exported by a virtual machine containing a physical
> network device driver.
> 
> Signed-off-by: Ian Pratt <ian.pratt@xensource.com>
> Signed-off-by: Christian Limpach <Christian.Limpach@cl.cam.ac.uk>
> Signed-off-by: Chris Wright <chrisw@sous-sol.org>
> Signed-off-by: Jeremy Fitzhardinge <jeremy@xensource.com>
> Cc: netdev@vger.kernel.org
> Cc: Jeff Garzik <jeff@garzik.org>
> 
> ---
>  drivers/net/Kconfig        |   12 
>  drivers/net/Makefile       |    2 
>  drivers/net/xen-netfront.c | 2066 ++++++++++++++++++++++++++++++++++++++++++++
>  include/xen/events.h       |    2 
>  4 files changed, 2082 insertions(+)
> 
> ===================================================================
> --- a/drivers/net/Kconfig
> +++ b/drivers/net/Kconfig
> @@ -2525,6 +2525,18 @@ source "drivers/atm/Kconfig"
>  
>  source "drivers/s390/net/Kconfig"
>  
> +config XEN_NETDEV_FRONTEND
> +	tristate "Xen network device frontend driver"
> +	depends on XEN
> +	default y
> +	help
> +	  The network device frontend driver allows the kernel to
> +	  access network devices exported exported by a virtual
> +	  machine containing a physical network device driver. The
> +	  frontend driver is intended for unprivileged guest domains;
> +	  if you are compiling a kernel for a Xen guest, you almost
> +	  certainly want to enable this.
> +
>  config ISERIES_VETH
>  	tristate "iSeries Virtual Ethernet driver support"
>  	depends on PPC_ISERIES

Might make more sense earlier in list (near other virtual devices).
===================================================================
>
> +/*
> + * Mutually-exclusive module options to select receive data path:
> + *  rx_copy : Packets are copied by network backend into local memory
> + *  rx_flip : Page containing packet data is transferred to our ownership
> + * For fully-virtualised guests there is no option - copying must be used.
> + * For paravirtualised guests, flipping is the default.
> + */
> +#ifdef CONFIG_XEN


Hey I thought this driver depended on CONFIG_XEN already?

> +static int MODPARM_rx_copy = 0;
> +module_param_named(rx_copy, MODPARM_rx_copy, bool, 0);
> +MODULE_PARM_DESC(rx_copy, "Copy packets from network card (rather than flip)");
> +static int MODPARM_rx_flip = 0;
> +module_param_named(rx_flip, MODPARM_rx_flip, bool, 0);
> +MODULE_PARM_DESC(rx_flip, "Flip packets from network card (rather than copy)");
> +#else
> +static const int MODPARM_rx_copy = 1;
> +static const int MODPARM_rx_flip = 0;
> +#endif


No MIXED case variable names please.


Why have two mutually exclusive values instead of just one value
with three states: 0 = normal, 1 = copy, 2 = flip?


> +#define DPRINTK(fmt, args...)				\
> +	pr_debug("netfront (%s:%d) " fmt,		\
> +		 __FUNCTION__, __LINE__, ##args)
> +#define IPRINTK(fmt, args...)				\
> +	printk(KERN_INFO "netfront: " fmt, ##args)
> +#define WPRINTK(fmt, args...)				\
> +	printk(KERN_WARNING "netfront: " fmt, ##args)


Could you use dev_dbg, dev_info, dev_warn instead of these macros?

> +
> +/** Send a packet on a net device to encourage switches to learn the
> + * MAC. We send a fake ARP request.
> + *
> + * @param dev device
> + * @return 0 on success, error code otherwise
> + */
Why the sudden urge to use docbook format on one internal function.


> +static int send_fake_arp(struct net_device *dev)
> +{
> +	struct sk_buff *skb;
> +	u32             src_ip, dst_ip;
> +
> +	dst_ip = INADDR_BROADCAST;
> +	src_ip = inet_select_addr(dev, dst_ip, RT_SCOPE_LINK);
> +
> +	/* No IP? Then nothing to do. */
> +	if (src_ip == 0)
> +		return 0;
> +
> +	skb = arp_create(ARPOP_REPLY, ETH_P_ARP,
> +			 dst_ip, dev, src_ip,
> +			 /*dst_hw*/ NULL, /*src_hw*/ NULL,
> +			 /*target_hw*/ dev->dev_addr);
> +	if (skb == NULL)
> +		return -ENOMEM;
> +
> +	return dev_queue_xmit(skb);
> +}

This should probably be done in generic (non driver code).
It creates lots of dependencies here.

> +/*
> + * We use this notifier to send out a fake ARP reply to reset switches and
> + * router ARP caches when an IP interface is brought up on a VIF.
> + */
> +static int
> +inetdev_notify(struct notifier_block *this, unsigned long event, void *ptr)
> +{
> +	struct in_ifaddr  *ifa = (struct in_ifaddr *)ptr;
> +	struct net_device *dev = ifa->ifa_dev->dev;
> +
> +	/* UP event and is it one of our devices? */
> +	if (event == NETDEV_UP && dev->open == network_open)
> +		(void)send_fake_arp(dev);
> +
> +	return NOTIFY_DONE;
> +}

Shouldn't just be a global kernel option for gratuitous ARP.
Doesn't seem to be unique to this driver.
With sysctl to enable it.

-- 
Stephen Hemminger <shemminger@linux-foundation.org>

^ permalink raw reply	[flat|nested] 93+ messages in thread

* Re: [patch 26/26] Xen-paravirt_ops: Add the Xen virtual network device driver.
  2007-03-02  0:42   ` Stephen Hemminger
@ 2007-03-02  0:56     ` Jeremy Fitzhardinge
  2007-03-02  1:30       ` [RFC] Arp announce (for Xen) Stephen Hemminger
  2007-03-02  1:21     ` [patch 26/26] Xen-paravirt_ops: Add the Xen virtual network device driver Christoph Hellwig
  1 sibling, 1 reply; 93+ messages in thread
From: Jeremy Fitzhardinge @ 2007-03-02  0:56 UTC (permalink / raw)
  To: Stephen Hemminger
  Cc: Andi Kleen, Andrew Morton, linux-kernel, virtualization,
	xen-devel, Chris Wright, Zachary Amsden, Rusty Russell, Ian Pratt,
	Christian Limpach, netdev, Jeff Garzik

Stephen Hemminger wrote:
> On Thu, 01 Mar 2007 15:25:09 -0800
> Jeremy Fitzhardinge <jeremy@goop.org> wrote:
>
>   
>> The network device frontend driver allows the kernel to access network
>> devices exported exported by a virtual machine containing a physical
>> network device driver.
>>
>> Signed-off-by: Ian Pratt <ian.pratt@xensource.com>
>> Signed-off-by: Christian Limpach <Christian.Limpach@cl.cam.ac.uk>
>> Signed-off-by: Chris Wright <chrisw@sous-sol.org>
>> Signed-off-by: Jeremy Fitzhardinge <jeremy@xensource.com>
>> Cc: netdev@vger.kernel.org
>> Cc: Jeff Garzik <jeff@garzik.org>
>>
>> ---
>>  drivers/net/Kconfig        |   12 
>>  drivers/net/Makefile       |    2 
>>  drivers/net/xen-netfront.c | 2066 ++++++++++++++++++++++++++++++++++++++++++++
>>  include/xen/events.h       |    2 
>>  4 files changed, 2082 insertions(+)
>>
>> ===================================================================
>> --- a/drivers/net/Kconfig
>> +++ b/drivers/net/Kconfig
>> @@ -2525,6 +2525,18 @@ source "drivers/atm/Kconfig"
>>  
>>  source "drivers/s390/net/Kconfig"
>>  
>> +config XEN_NETDEV_FRONTEND
>> +	tristate "Xen network device frontend driver"
>> +	depends on XEN
>> +	default y
>> +	help
>> +	  The network device frontend driver allows the kernel to
>> +	  access network devices exported exported by a virtual
>> +	  machine containing a physical network device driver. The
>> +	  frontend driver is intended for unprivileged guest domains;
>> +	  if you are compiling a kernel for a Xen guest, you almost
>> +	  certainly want to enable this.
>> +
>>  config ISERIES_VETH
>>  	tristate "iSeries Virtual Ethernet driver support"
>>  	depends on PPC_ISERIES
>>     
>
> Might make more sense earlier in list (near other virtual devices).
>   

OK.

>
> Hey I thought this driver depended on CONFIG_XEN already?
>   

You're right.

>> +static int MODPARM_rx_copy = 0;
>> +module_param_named(rx_copy, MODPARM_rx_copy, bool, 0);
>> +MODULE_PARM_DESC(rx_copy, "Copy packets from network card (rather than flip)");
>> +static int MODPARM_rx_flip = 0;
>> +module_param_named(rx_flip, MODPARM_rx_flip, bool, 0);
>> +MODULE_PARM_DESC(rx_flip, "Flip packets from network card (rather than copy)");
>> +#else
>> +static const int MODPARM_rx_copy = 1;
>> +static const int MODPARM_rx_flip = 0;
>> +#endif
>>     
>
>
> No MIXED case variable names please.
>   

OK.

> Why have two mutually exclusive values instead of just one value
> with three states: 0 = normal, 1 = copy, 2 = flip?
>   

That does seem odd.  I'll fix it up.

>> +#define DPRINTK(fmt, args...)				\
>> +	pr_debug("netfront (%s:%d) " fmt,		\
>> +		 __FUNCTION__, __LINE__, ##args)
>> +#define IPRINTK(fmt, args...)				\
>> +	printk(KERN_INFO "netfront: " fmt, ##args)
>> +#define WPRINTK(fmt, args...)				\
>> +	printk(KERN_WARNING "netfront: " fmt, ##args)
>>     
>
>
> Could you use dev_dbg, dev_info, dev_warn instead of these macros?
>   

Yes.  I'd done that conversion elsewhere, but overlooked it here.

>> +
>> +/** Send a packet on a net device to encourage switches to learn the
>> + * MAC. We send a fake ARP request.
>> + *
>> + * @param dev device
>> + * @return 0 on success, error code otherwise
>> + */
>>     
> Why the sudden urge to use docbook format on one internal function.
>   

It probably got copied from somewhere.  I'll clean it up.

>> +static int send_fake_arp(struct net_device *dev)
>> +{
>> +	struct sk_buff *skb;
>> +	u32             src_ip, dst_ip;
>> +
>> +	dst_ip = INADDR_BROADCAST;
>> +	src_ip = inet_select_addr(dev, dst_ip, RT_SCOPE_LINK);
>> +
>> +	/* No IP? Then nothing to do. */
>> +	if (src_ip == 0)
>> +		return 0;
>> +
>> +	skb = arp_create(ARPOP_REPLY, ETH_P_ARP,
>> +			 dst_ip, dev, src_ip,
>> +			 /*dst_hw*/ NULL, /*src_hw*/ NULL,
>> +			 /*target_hw*/ dev->dev_addr);
>> +	if (skb == NULL)
>> +		return -ENOMEM;
>> +
>> +	return dev_queue_xmit(skb);
>> +}
>>     
>
> This should probably be done in generic (non driver code).
> It creates lots of dependencies here.
> [...]
> Shouldn't just be a global kernel option for gratuitous ARP.
> Doesn't seem to be unique to this driver.
> With sysctl to enable it.
>   

I agree it would be nice to not have to do this in the driver.  The
specific requirement here is to make it send a packet after resume
(which includes arriving after a migration) so that a switch can quickly
work that the machine is there.  Is there a generic mechanism for doing
this?

    J


^ permalink raw reply	[flat|nested] 93+ messages in thread

* Re: [patch 26/26] Xen-paravirt_ops: Add the Xen virtual network device driver.
  2007-03-02  0:42   ` Stephen Hemminger
  2007-03-02  0:56     ` Jeremy Fitzhardinge
@ 2007-03-02  1:21     ` Christoph Hellwig
  2007-03-02  1:26       ` Chris Wright
  1 sibling, 1 reply; 93+ messages in thread
From: Christoph Hellwig @ 2007-03-02  1:21 UTC (permalink / raw)
  To: Stephen Hemminger
  Cc: Jeremy Fitzhardinge, Andi Kleen, Andrew Morton, linux-kernel,
	virtualization, xen-devel, Chris Wright, Zachary Amsden,
	Rusty Russell, Ian Pratt, Christian Limpach, netdev, Jeff Garzik

On Thu, Mar 01, 2007 at 04:42:14PM -0800, Stephen Hemminger wrote:
> > +
> > +/** Send a packet on a net device to encourage switches to learn the
> > + * MAC. We send a fake ARP request.
> > + *
> > + * @param dev device
> > + * @return 0 on success, error code otherwise
> > + */
> Why the sudden urge to use docbook format on one internal function.

And it's not even proper kerneldoc but looks more like javadoc.
If you want to write kerneldoc comments please also verify them by
running the tools over it.

> This should probably be done in generic (non driver code).
> It creates lots of dependencies here.

Actually the right way to do it is in userspace, as all clustering
solutions do.  That's whay everyone told the Xen folks by the just
refuse to rip this junk out.


^ permalink raw reply	[flat|nested] 93+ messages in thread

* Re: [patch 26/26] Xen-paravirt_ops: Add the Xen virtual network device driver.
  2007-03-02  1:21     ` [patch 26/26] Xen-paravirt_ops: Add the Xen virtual network device driver Christoph Hellwig
@ 2007-03-02  1:26       ` Chris Wright
  0 siblings, 0 replies; 93+ messages in thread
From: Chris Wright @ 2007-03-02  1:26 UTC (permalink / raw)
  To: Christoph Hellwig, Stephen Hemminger, Jeremy Fitzhardinge,
	Andi Kleen, Andrew Morton, linux-kernel, virtualization,
	xen-devel, Chris Wright, Zachary Amsden, Rusty Russell, Ian Pratt,
	Christian Limpach, netdev, Jeff Garzik

* Christoph Hellwig (hch@infradead.org) wrote:
> Actually the right way to do it is in userspace, as all clustering
> solutions do.  That's whay everyone told the Xen folks by the just
> refuse to rip this junk out.

I'm ripping it out.

^ permalink raw reply	[flat|nested] 93+ messages in thread

* [RFC] Arp announce (for Xen)
  2007-03-02  0:56     ` Jeremy Fitzhardinge
@ 2007-03-02  1:30       ` Stephen Hemminger
  2007-03-02  8:09         ` Pekka Savola
                           ` (4 more replies)
  0 siblings, 5 replies; 93+ messages in thread
From: Stephen Hemminger @ 2007-03-02  1:30 UTC (permalink / raw)
  To: Jeremy Fitzhardinge
  Cc: xen-devel, Jeff Garzik, Ian Pratt, netdev, linux-kernel,
	Chris Wright, Andi Kleen, Zachary, virtualization, Andrew Morton

What about implementing the unused arp_announce flag on the inetdevice?
Something like the following.  Totally untested...

Looks like it either was there (and got removed) or was planned but
never implemented.

diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index e10794d..cefc339 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -1089,6 +1089,16 @@ static int inetdev_event(struct notifier
 			}
 		}
 		ip_mc_up(in_dev);
+		/* fallthru */
+
+	case NETDEV_CHANGEADDR:
+		/* Send gratuitous ARP in case of address change or new device */
+		if (IN_DEV_ARP_ANNOUNCE(in_dev))
+			arp_send(ARPOP_REQUEST, ETH_P_ARP,
+				 in_dev->ifa_list->ifa_address, dev,
+				 in_dev->ifa_list->ifa_address, NULL, 
+				 dev->dev_addr, NULL);
+			
 		break;
 	case NETDEV_DOWN:
 		ip_mc_down(in_dev);

^ permalink raw reply related	[flat|nested] 93+ messages in thread

* Re: [RFC] Arp announce (for Xen)
  2007-03-02  1:30       ` [RFC] Arp announce (for Xen) Stephen Hemminger
@ 2007-03-02  8:09         ` Pekka Savola
  2007-03-02 18:29           ` Ben Greear
  2007-03-02  8:46         ` Keir Fraser
                           ` (3 subsequent siblings)
  4 siblings, 1 reply; 93+ messages in thread
From: Pekka Savola @ 2007-03-02  8:09 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: netdev

On Thu, 1 Mar 2007, Stephen Hemminger wrote:
> What about implementing the unused arp_announce flag on the inetdevice?
> Something like the following.  Totally untested...
>
> Looks like it either was there (and got removed) or was planned but
> never implemented.

If something like this goes in, it wouldn't hurt to do similar with 
IPv6 (RFC2461 section 7.2.6).

There are very popular hardware-based routers which refresh their NDP 
caches only every 24 hours or 20 minutes (depending on the software 
version).  Sending unsolicited NAs would eliminate traffic 
blackholing.

> diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
> index e10794d..cefc339 100644
> --- a/net/ipv4/devinet.c
> +++ b/net/ipv4/devinet.c
> @@ -1089,6 +1089,16 @@ static int inetdev_event(struct notifier
> 			}
> 		}
> 		ip_mc_up(in_dev);
> +		/* fallthru */
> +
> +	case NETDEV_CHANGEADDR:
> +		/* Send gratuitous ARP in case of address change or new device */
> +		if (IN_DEV_ARP_ANNOUNCE(in_dev))
> +			arp_send(ARPOP_REQUEST, ETH_P_ARP,
> +				 in_dev->ifa_list->ifa_address, dev,
> +				 in_dev->ifa_list->ifa_address, NULL,
> +				 dev->dev_addr, NULL);
> +
> 		break;
> 	case NETDEV_DOWN:
> 		ip_mc_down(in_dev);
>
> -
> To unsubscribe from this list: send the line "unsubscribe netdev" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>

-- 
Pekka Savola                 "You each name yourselves king, yet the
Netcore Oy                    kingdom bleeds."
Systems. Networks. Security. -- George R.R. Martin: A Clash of Kings

^ permalink raw reply	[flat|nested] 93+ messages in thread

* Re: [RFC] Arp announce (for Xen)
  2007-03-02  1:30       ` [RFC] Arp announce (for Xen) Stephen Hemminger
  2007-03-02  8:09         ` Pekka Savola
@ 2007-03-02  8:46         ` Keir Fraser
  2007-03-02 12:54         ` Andi Kleen
                           ` (2 subsequent siblings)
  4 siblings, 0 replies; 93+ messages in thread
From: Keir Fraser @ 2007-03-02  8:46 UTC (permalink / raw)
  To: Stephen Hemminger, Jeremy Fitzhardinge
  Cc: xen-devel, Jeff Garzik, Andi Kleen, netdev, linux-kernel,
	Chris Wright, Ian Pratt, Zachary, virtualization, Andrew Morton

On 2/3/07 01:30, "Stephen Hemminger" <shemminger@linux-foundation.org>
wrote:

> What about implementing the unused arp_announce flag on the inetdevice?
> Something like the following.  Totally untested...
> 
> Looks like it either was there (and got removed) or was planned but
> never implemented.

This would be acceptable. It only needs a simple script to poke the correct
value, and that could be easily integrated into vendor scripts. The general
idea of being able to have a callback to userspace is okay but we'd like
something lighter weight for live migration, which is typically done only
within the scope of a single layer-2 network and hence all that is really
required is the gratuitous ARP to kick the switch hardware.

 -- Keir

^ permalink raw reply	[flat|nested] 93+ messages in thread

* Re: [RFC] Arp announce (for Xen)
  2007-03-02  1:30       ` [RFC] Arp announce (for Xen) Stephen Hemminger
  2007-03-02  8:09         ` Pekka Savola
  2007-03-02  8:46         ` Keir Fraser
@ 2007-03-02 12:54         ` Andi Kleen
  2007-03-02 14:08           ` jamal
  2007-03-02 18:08         ` Chris Wright
  2007-03-06  4:35         ` David Miller
  4 siblings, 1 reply; 93+ messages in thread
From: Andi Kleen @ 2007-03-02 12:54 UTC (permalink / raw)
  To: Stephen Hemminger
  Cc: Zachary Amsden, Jeremy Fitzhardinge, xen-devel, Jeff Garzik,
	Ian Pratt, netdev, Rusty Russell, linux-kernel, Chris Wright,
	virtualization, Andrew Morton, Christian Limpach

On Thu, Mar 01, 2007 at 05:30:30PM -0800, Stephen Hemminger wrote:
> What about implementing the unused arp_announce flag on the inetdevice?
> Something like the following.  Totally untested...
> 
> Looks like it either was there (and got removed) or was planned but
> never implemented.

Seems like the right solution to me (if it works)

-Andi

^ permalink raw reply	[flat|nested] 93+ messages in thread

* Re: [RFC] Arp announce (for Xen)
  2007-03-02 12:54         ` Andi Kleen
@ 2007-03-02 14:08           ` jamal
  0 siblings, 0 replies; 93+ messages in thread
From: jamal @ 2007-03-02 14:08 UTC (permalink / raw)
  To: Andi Kleen
  Cc: Stephen Hemminger, Jeremy Fitzhardinge, Andrew Morton,
	linux-kernel, virtualization, xen-devel, Chris Wright,
	Zachary Amsden, Rusty Russell, Ian Pratt, Christian Limpach,
	netdev, Jeff Garzik

On Fri, 2007-02-03 at 13:54 +0100, Andi Kleen wrote:

> Seems like the right solution to me (if it works)

I like it as well given the simple change. 
These are the kind of optimizations that justify offloading things to
the kernel (instead of user space) i.e very little lines of code, covers
a large number of user-desires, and can be turned off if you want to do
something complex. _Almost_ elegant Stephen;->

cheers,
jamal


^ permalink raw reply	[flat|nested] 93+ messages in thread

* Re: [RFC] Arp announce (for Xen)
  2007-03-02  1:30       ` [RFC] Arp announce (for Xen) Stephen Hemminger
                           ` (2 preceding siblings ...)
  2007-03-02 12:54         ` Andi Kleen
@ 2007-03-02 18:08         ` Chris Wright
  2007-03-06  4:35         ` David Miller
  4 siblings, 0 replies; 93+ messages in thread
From: Chris Wright @ 2007-03-02 18:08 UTC (permalink / raw)
  To: Stephen Hemminger
  Cc: Zachary Amsden, Jeremy Fitzhardinge, xen-devel, Jeff Garzik,
	Ian Pratt, Andi Kleen, netdev, Rusty Russell, linux-kernel,
	Chris Wright, virtualization, Andrew Morton, Christian Limpach

* Stephen Hemminger (shemminger@linux-foundation.org) wrote:
> +	case NETDEV_CHANGEADDR:
> +		/* Send gratuitous ARP in case of address change or new device */
> +		if (IN_DEV_ARP_ANNOUNCE(in_dev))

Conceptually right on, but it looks like improper hijacking
of arp_announce sysctl.  Could introduce another

thanks,
-chris

^ permalink raw reply	[flat|nested] 93+ messages in thread

* Re: [RFC] Arp announce (for Xen)
  2007-03-02  8:09         ` Pekka Savola
@ 2007-03-02 18:29           ` Ben Greear
  2007-03-02 19:59             ` Stephen Hemminger
  0 siblings, 1 reply; 93+ messages in thread
From: Ben Greear @ 2007-03-02 18:29 UTC (permalink / raw)
  To: Pekka Savola; +Cc: Stephen Hemminger, netdev

Pekka Savola wrote:
> On Thu, 1 Mar 2007, Stephen Hemminger wrote:
>> What about implementing the unused arp_announce flag on the inetdevice?
>> Something like the following.  Totally untested...
>>
>> Looks like it either was there (and got removed) or was planned but
>> never implemented.
IN_DEV_ARP_ANNOUNCE is in 2.6.18, at least..used in arp_solicit in arp.c

I really hope this didn't get removed because I find it very useful!

But, you could certainly add another sysctl...

Thanks,
Ben

-- 
Ben Greear <greearb@candelatech.com> 
Candela Technologies Inc  http://www.candelatech.com



^ permalink raw reply	[flat|nested] 93+ messages in thread

* Re: [RFC] Arp announce (for Xen)
  2007-03-02 18:29           ` Ben Greear
@ 2007-03-02 19:59             ` Stephen Hemminger
  0 siblings, 0 replies; 93+ messages in thread
From: Stephen Hemminger @ 2007-03-02 19:59 UTC (permalink / raw)
  To: Ben Greear; +Cc: Pekka Savola, netdev

On Fri, 02 Mar 2007 10:29:53 -0800
Ben Greear <greearb@candelatech.com> wrote:

> Pekka Savola wrote:
> > On Thu, 1 Mar 2007, Stephen Hemminger wrote:
> >> What about implementing the unused arp_announce flag on the inetdevice?
> >> Something like the following.  Totally untested...
> >>
> >> Looks like it either was there (and got removed) or was planned but
> >> never implemented.
> IN_DEV_ARP_ANNOUNCE is in 2.6.18, at least..used in arp_solicit in arp.c
> 
> I really hope this didn't get removed because I find it very useful!
> 
> But, you could certainly add another sysctl...
> 
> Thanks,
> Ben
> 

yeah, something new like arp_notify? or arp_gratiutous

There are other drivers that do their own arp, they need to be fixed.

-- 
Stephen Hemminger <shemminger@linux-foundation.org>

^ permalink raw reply	[flat|nested] 93+ messages in thread

* Re: [RFC] Arp announce (for Xen)
  2007-03-02  1:30       ` [RFC] Arp announce (for Xen) Stephen Hemminger
                           ` (3 preceding siblings ...)
  2007-03-02 18:08         ` Chris Wright
@ 2007-03-06  4:35         ` David Miller
  2007-03-06 18:51           ` [RFC] ARP notify option Stephen Hemminger
  4 siblings, 1 reply; 93+ messages in thread
From: David Miller @ 2007-03-06  4:35 UTC (permalink / raw)
  To: shemminger
  Cc: jeremy, ak, akpm, linux-kernel, virtualization, xen-devel, chrisw,
	zach, rusty, ian.pratt, Christian.Limpach, netdev, jeff

From: Stephen Hemminger <shemminger@linux-foundation.org>
Date: Thu, 1 Mar 2007 17:30:30 -0800

> What about implementing the unused arp_announce flag on the inetdevice?
> Something like the following.  Totally untested...
> 
> Looks like it either was there (and got removed) or was planned but
> never implemented.

This idea is fine.  But:

> +	case NETDEV_CHANGEADDR:
> +		/* Send gratuitous ARP in case of address change or new device */
> +		if (IN_DEV_ARP_ANNOUNCE(in_dev))
> +			arp_send(ARPOP_REQUEST, ETH_P_ARP,
> +				 in_dev->ifa_list->ifa_address, dev,
> +				 in_dev->ifa_list->ifa_address, NULL, 
> +				 dev->dev_addr, NULL);

We'll need to make sure the appropriate 'arp_anounce' address
selection is employed here.

One idea is to change arp_solicit() such that it can be invoked in
this context, or provide a new helper function which will do the
source address selection rules of 'arp_announce' and then invoke
arp_send() as appropriate for us.

^ permalink raw reply	[flat|nested] 93+ messages in thread

* [RFC] ARP notify option
  2007-03-06  4:35         ` David Miller
@ 2007-03-06 18:51           ` Stephen Hemminger
  2007-03-06 19:04             ` Jeremy Fitzhardinge
                               ` (2 more replies)
  0 siblings, 3 replies; 93+ messages in thread
From: Stephen Hemminger @ 2007-03-06 18:51 UTC (permalink / raw)
  To: David Miller
  Cc: xen-devel, jeff, ian.pratt, netdev, linux-kernel, chrisw,
	virtualization, akpm, ak

This adds another inet device option to enable gratuitous ARP
when device is brought up or address change. This is handy for
clusters or virtualization.

Tested on a normal device (not Xen).

Signed-off-by: Stephen Hemminger <shemminge@linux-foundation.org>

---
 Documentation/networking/ip-sysctl.txt |    6 ++++++
 include/linux/inetdevice.h             |    2 ++
 include/linux/sysctl.h                 |    1 +
 net/ipv4/devinet.c                     |   16 ++++++++++++++++
 4 files changed, 25 insertions(+)

--- net-2.6.22.orig/Documentation/networking/ip-sysctl.txt	2007-03-05 14:35:31.000000000 -0800
+++ net-2.6.22/Documentation/networking/ip-sysctl.txt	2007-03-05 16:46:47.000000000 -0800
@@ -732,6 +732,12 @@
 	The max value from conf/{all,interface}/arp_ignore is used
 	when ARP request is received on the {interface}
 
+arp_notify - BOOLEAN
+	Define mode for notification of address and device changes.
+	0 - (default): do nothing
+	1 - Generate gratuitous arp replies when device is brought up
+	    or hardware address changes.
+
 arp_accept - BOOLEAN
 	Define behavior when gratuitous arp replies are received:
 	0 - drop gratuitous arp frames
--- net-2.6.22.orig/include/linux/inetdevice.h	2007-03-05 14:35:34.000000000 -0800
+++ net-2.6.22/include/linux/inetdevice.h	2007-03-05 16:46:47.000000000 -0800
@@ -26,6 +26,7 @@
 	int	arp_announce;
 	int	arp_ignore;
 	int	arp_accept;
+	int	arp_notify;
 	int	medium_id;
 	int	no_xfrm;
 	int	no_policy;
@@ -84,6 +85,7 @@
 #define IN_DEV_ARPFILTER(in_dev)	(ipv4_devconf.arp_filter || (in_dev)->cnf.arp_filter)
 #define IN_DEV_ARP_ANNOUNCE(in_dev)	(max(ipv4_devconf.arp_announce, (in_dev)->cnf.arp_announce))
 #define IN_DEV_ARP_IGNORE(in_dev)	(max(ipv4_devconf.arp_ignore, (in_dev)->cnf.arp_ignore))
+#define IN_DEV_ARP_NOTIFY(in_dev)	(ipv4_devconf.arp_notify || (in_dev)->cnf.arp_notify)
 
 struct in_ifaddr
 {
--- net-2.6.22.orig/include/linux/sysctl.h	2007-03-05 14:35:34.000000000 -0800
+++ net-2.6.22/include/linux/sysctl.h	2007-03-05 16:46:47.000000000 -0800
@@ -495,6 +495,7 @@
 	NET_IPV4_CONF_ARP_IGNORE=19,
 	NET_IPV4_CONF_PROMOTE_SECONDARIES=20,
 	NET_IPV4_CONF_ARP_ACCEPT=21,
+	NET_IPV4_CONF_ARP_NOTIFY=22,
 	__NET_IPV4_CONF_MAX
 };
 
--- net-2.6.22.orig/net/ipv4/devinet.c	2007-03-05 14:35:34.000000000 -0800
+++ net-2.6.22/net/ipv4/devinet.c	2007-03-05 16:46:47.000000000 -0800
@@ -1089,6 +1089,14 @@
 			}
 		}
 		ip_mc_up(in_dev);
+		/* fall through */
+	case NETDEV_CHANGEADDR:
+		if (IN_DEV_ARP_NOTIFY(in_dev))
+			arp_send(ARPOP_REQUEST, ETH_P_ARP,
+				 in_dev->ifa_list->ifa_address,
+				 dev,
+				 in_dev->ifa_list->ifa_address,
+				 NULL, dev->dev_addr, NULL);
 		break;
 	case NETDEV_DOWN:
 		ip_mc_down(in_dev);
@@ -1495,6 +1503,14 @@
 			.proc_handler	= &proc_dointvec,
 		},
 		{
+			.ctl_name	= NET_IPV4_CONF_ARP_NOTIFY,
+			.procname	= "arp_notify",
+			.data		= &ipv4_devconf.arp_notify,
+			.maxlen		= sizeof(int),
+			.mode		= 0644,
+			.proc_handler	= &proc_dointvec,
+		},
+		{
 			.ctl_name	= NET_IPV4_CONF_NOXFRM,
 			.procname	= "disable_xfrm",
 			.data		= &ipv4_devconf.no_xfrm,

^ permalink raw reply	[flat|nested] 93+ messages in thread

* Re: [RFC] ARP notify option
  2007-03-06 18:51           ` [RFC] ARP notify option Stephen Hemminger
@ 2007-03-06 19:04             ` Jeremy Fitzhardinge
  2007-03-06 19:07             ` Chris Wright
  2007-03-06 21:18             ` Chris Friesen
  2 siblings, 0 replies; 93+ messages in thread
From: Jeremy Fitzhardinge @ 2007-03-06 19:04 UTC (permalink / raw)
  To: Stephen Hemminger
  Cc: David Miller, ak, akpm, linux-kernel, virtualization, xen-devel,
	chrisw, zach, rusty, ian.pratt, Christian.Limpach, netdev, jeff

Stephen Hemminger wrote:
> This adds another inet device option to enable gratuitous ARP
> when device is brought up or address change. This is handy for
> clusters or virtualization.
>   

Thanks Stephen.  Haven't tested this yet, but it definitely cleans up a
warty corner of netfront.

    J

^ permalink raw reply	[flat|nested] 93+ messages in thread

* Re: [RFC] ARP notify option
  2007-03-06 18:51           ` [RFC] ARP notify option Stephen Hemminger
  2007-03-06 19:04             ` Jeremy Fitzhardinge
@ 2007-03-06 19:07             ` Chris Wright
  2007-03-06 21:18             ` Chris Friesen
  2 siblings, 0 replies; 93+ messages in thread
From: Chris Wright @ 2007-03-06 19:07 UTC (permalink / raw)
  To: Stephen Hemminger
  Cc: zach, jeremy, xen-devel, jeff, ian.pratt, ak, netdev, rusty,
	linux-kernel, chrisw, virtualization, akpm, David Miller,
	Christian.Limpach

* Stephen Hemminger (shemminger@linux-foundation.org) wrote:
> This adds another inet device option to enable gratuitous ARP
> when device is brought up or address change. This is handy for
> clusters or virtualization.

This looks good.  I'll test with Xen.  What about the source
addr selection?

thanks,
-chris

^ permalink raw reply	[flat|nested] 93+ messages in thread

* Re: [RFC] ARP notify option
  2007-03-06 18:51           ` [RFC] ARP notify option Stephen Hemminger
  2007-03-06 19:04             ` Jeremy Fitzhardinge
  2007-03-06 19:07             ` Chris Wright
@ 2007-03-06 21:18             ` Chris Friesen
  2007-03-06 22:52               ` Stephen Hemminger
  2007-03-07  6:42               ` Pekka Savola
  2 siblings, 2 replies; 93+ messages in thread
From: Chris Friesen @ 2007-03-06 21:18 UTC (permalink / raw)
  To: Stephen Hemminger
  Cc: xen-devel, jeff, ian.pratt, netdev, linux-kernel, chrisw,
	virtualization, akpm, David Miller, ak

Stephen Hemminger wrote:

> +arp_notify - BOOLEAN
> +	Define mode for notification of address and device changes.
> +	0 - (default): do nothing
> +	1 - Generate gratuitous arp replies when device is brought up
> +	    or hardware address changes.

Did you consider using gratuitous arp requests instead?  I remember 
reading about some hardware that updated its arp cache on gratuitous 
requests but not gratuitous replies.

Chris

^ permalink raw reply	[flat|nested] 93+ messages in thread

* Re: [RFC] ARP notify option
  2007-03-06 21:18             ` Chris Friesen
@ 2007-03-06 22:52               ` Stephen Hemminger
  2007-03-07  6:42               ` Pekka Savola
  1 sibling, 0 replies; 93+ messages in thread
From: Stephen Hemminger @ 2007-03-06 22:52 UTC (permalink / raw)
  To: Chris Friesen
  Cc: xen-devel, jeff, ian.pratt, netdev, linux-kernel, chrisw,
	virtualization, akpm, David Miller, ak

On Tue, 06 Mar 2007 15:18:07 -0600
"Chris Friesen" <cfriesen@nortel.com> wrote:

> Stephen Hemminger wrote:
> 
> > +arp_notify - BOOLEAN
> > +	Define mode for notification of address and device changes.
> > +	0 - (default): do nothing
> > +	1 - Generate gratuitous arp replies when device is brought up
> > +	    or hardware address changes.
> 
> Did you consider using gratuitous arp requests instead?  I remember 
> reading about some hardware that updated its arp cache on gratuitous 
> requests but not gratuitous replies.
> 
> Chris

I copied the ARP generation from other places that were doing
gratuitous ARP already:  Xen and irlan.
Our local switch used REPLY's to do the same thing.

One could imagine making it a ternary value and having 2 generate
REQUEST's.


-- 
Stephen Hemminger <shemminger@linux-foundation.org>

^ permalink raw reply	[flat|nested] 93+ messages in thread

* Re: [RFC] ARP notify option
  2007-03-06 21:18             ` Chris Friesen
  2007-03-06 22:52               ` Stephen Hemminger
@ 2007-03-07  6:42               ` Pekka Savola
  2007-03-07 17:00                 ` Stephen Hemminger
  1 sibling, 1 reply; 93+ messages in thread
From: Pekka Savola @ 2007-03-07  6:42 UTC (permalink / raw)
  To: Chris Friesen
  Cc: xen-devel, jeff, ian.pratt, virtualization, netdev, linux-kernel,
	chrisw, ak, akpm, Stephen Hemminger, David Miller

On Tue, 6 Mar 2007, Chris Friesen wrote:
> Stephen Hemminger wrote:
>>  +arp_notify - BOOLEAN
>>  +	Define mode for notification of address and device changes.
>>  +	0 - (default): do nothing
>>  +	1 - Generate gratuitous arp replies when device is brought up
>>  +	    or hardware address changes.
>
> Did you consider using gratuitous arp requests instead?  I remember reading 
> about some hardware that updated its arp cache on gratuitous requests but not 
> gratuitous replies.

You might be interested in taking a look at:

http://tools.ietf.org/id/draft-cheshire-ipv4-acd

There has been some follow-up discussion on this in the thread 
starting at:

http://www1.ietf.org/mail-archive/web/int-area/current/msg00611.html

In particular, you may be interested in this comment about ARP 
request and ARP reply for gratuitous ARP:

http://www1.ietf.org/mail-archive/web/int-area/current/msg00669.html

-- 
Pekka Savola                 "You each name yourselves king, yet the
Netcore Oy                    kingdom bleeds."
Systems. Networks. Security. -- George R.R. Martin: A Clash of Kings

^ permalink raw reply	[flat|nested] 93+ messages in thread

* Re: [RFC] ARP notify option
  2007-03-07  6:42               ` Pekka Savola
@ 2007-03-07 17:00                 ` Stephen Hemminger
  0 siblings, 0 replies; 93+ messages in thread
From: Stephen Hemminger @ 2007-03-07 17:00 UTC (permalink / raw)
  To: Pekka Savola
  Cc: xen-devel, jeff, ian.pratt, ak, netdev, linux-kernel, chrisw,
	Chris Friesen, virtualization, akpm, David Miller

On Wed, 7 Mar 2007 08:42:39 +0200 (EET)
Pekka Savola <pekkas@netcore.fi> wrote:

> On Tue, 6 Mar 2007, Chris Friesen wrote:
> > Stephen Hemminger wrote:
> >>  +arp_notify - BOOLEAN
> >>  +	Define mode for notification of address and device changes.
> >>  +	0 - (default): do nothing
> >>  +	1 - Generate gratuitous arp replies when device is brought up
> >>  +	    or hardware address changes.
> >
> > Did you consider using gratuitous arp requests instead?  I remember reading 
> > about some hardware that updated its arp cache on gratuitous requests but not 
> > gratuitous replies.
> 
> You might be interested in taking a look at:
> 
> http://tools.ietf.org/id/draft-cheshire-ipv4-acd
> 
> There has been some follow-up discussion on this in the thread 
> starting at:
> 
> http://www1.ietf.org/mail-archive/web/int-area/current/msg00611.html
> 
> In particular, you may be interested in this comment about ARP 
> request and ARP reply for gratuitous ARP:
> 
> http://www1.ietf.org/mail-archive/web/int-area/current/msg00669.html

Looks like REQUESTS make more sense. (See below). I will
rework this patch.

5. Why are ARP Announcements performed using ARP Request packets
   and not ARP Reply packets?

   During IETF deliberation of IPv4 Address Conflict Detection from 2000
   to 2005, a question that kept being asked repeatedly was, "Shouldn't
   ARP Announcements be performed using gratuitous ARP Reply packets?"

   On the face of it, this seems reasonable.  A conventional ARP Reply
   is an answer to a question.  If in fact no question had been asked,
   then it would be reasonable to describe such a reply as gratuitous. 
   This description would seem to apply perfectly to an ARP
   Announcement: an answer to an implied question that in fact no one
   asked.

   However reasonable this may seem in principle, there are two reasons
   why in practice ARP Request packets are the better choice.  One is
   historical precedent, and the other is practicality.


Expires 11th January 2006             Cheshire                 [Page 14]
\f
Internet Draft       IPv4 Address Conflict Detection      11th July 2005


   The historical precedent is that, as described above in Section 4,
   Gratuitous ARP is described in Stevens Networking [Ste94] as using
   ARP Request packets.  BSD Unix, Windows, Mac OS 9, Mac OS X, etc.
   all use ARP Request packets as described in Stevens.  At this stage,
   trying to mandate that they all switch to using ARP Reply packets
   would be futile.

   The practical reason is that ARP Request packets are more likely to
   work correctly with more existing ARP implementations, some of which
   may not implement RFC 826 correctly.  The Packet Reception rules in
   RFC 826 state that the opcode is the last thing to check in packet
   processing, so it really shouldn't matter, but there may be
   "creative" implementations that have different packet processing
   depending on the ar$op field, and there are several reasons why these
   are more likely to accept gratuitous ARP Requests than gratuitous ARP
   Replies:

   * An incorrect ARP implementation may expect that ARP Replies are
     only sent via unicast.  RFC 826 does not say this, but an incorrect
     implementation may assume it, and the "principle of least surprise"
     dictates that where there are two or more ways to solve a
     networking problem that are otherwise equally good, the one with
     the fewest unusual properties is the one likely to have the fewest
     interoperability problems with existing implementations.  An ARP
     Announcement needs to broadcast information to all hosts on the
     link.  Since ARP Request packets are always broadcast, and ARP
     Reply packets are not, receiving an ARP Request packet via
     broadcast is less surprising than receiving an ARP Reply packet via
     broadcast.

   * An incorrect ARP implementation may expect that ARP Replies are
     only received in response to ARP Requests that have been issued
     recently by that implementation.  Unexpected unsolicited Replies
     may be ignored.

   * An incorrect ARP implementation may ignore ARP Replies where
     ar$tha doesn't match its hardware address.

   * An incorrect ARP implementation may ignore ARP Replies where
     ar$tpa doesn't match its IP address.

   In summary, there are more ways that an incorrect ARP implementation
   might plausibly reject an ARP Reply (which usually occurs as a result
   of being solicited by the client) than an ARP Request (which is
   already expected to occur unsolicited).

^ permalink raw reply	[flat|nested] 93+ messages in thread

* Re: [patch 13/26] Xen-paravirt_ops: Consistently wrap paravirt ops callsites to make them patchable
       [not found]     ` <20070316.023331.59468179.davem@davemloft.net>
@ 2007-03-16 20:38       ` Jeremy Fitzhardinge
  2007-03-17 10:33         ` Rusty Russell
  0 siblings, 1 reply; 93+ messages in thread
From: Jeremy Fitzhardinge @ 2007-03-16 20:38 UTC (permalink / raw)
  To: David Miller
  Cc: mingo, ak, akpm, linux-kernel, virtualization, xen-devel, chrisw,
	zach, rusty, anthony, torvalds, NetDev

David Miller wrote:
> Perhaps the problem can be dealt with using ELF relocations.
>
> There is another case, discussed yesterday on netdev, where run-time
> resolution of ELF relocations would be useful (for
> very-very-very-read-only variables) so if it can solve this problem
> too it would be nice to have a generic infrastructure for it.

That's an interesting idea.  Have you or anyone else looked at what it
would take to code up?

For this case, I guess you'd walk the relocs looking for references into
the paravirt_ops structure.  You'd need to check that was a reference
from an indirect jump or call instruction, which would identify a
patchable callsite.  The offset into the pv_ops structure would identify
which operation is involved.  That would be enough to use the existing
paravirt_ops patch machinery to insert a patch, though it doesn't
provide quite as much information as the current scheme.  The current
scheme also tells us how much space is available for patching over, and
what registers the patched-in code can use/clobber.

What are the netdev requirements?

    J

^ permalink raw reply	[flat|nested] 93+ messages in thread

* Re: [patch 13/26] Xen-paravirt_ops: Consistently wrap paravirt ops callsites to make them patchable
  2007-03-16 20:38       ` [patch 13/26] Xen-paravirt_ops: Consistently wrap paravirt ops callsites to make them patchable Jeremy Fitzhardinge
@ 2007-03-17 10:33         ` Rusty Russell
  2007-03-18  7:33           ` David Miller
  0 siblings, 1 reply; 93+ messages in thread
From: Rusty Russell @ 2007-03-17 10:33 UTC (permalink / raw)
  To: Jeremy Fitzhardinge
  Cc: zach, xen-devel, akpm, ak, NetDev, linux-kernel, chrisw,
	virtualization, anthony, mingo, torvalds, David Miller

On Fri, 2007-03-16 at 13:38 -0700, Jeremy Fitzhardinge wrote:
> David Miller wrote:
> > Perhaps the problem can be dealt with using ELF relocations.
> >
> > There is another case, discussed yesterday on netdev, where run-time
> > resolution of ELF relocations would be useful (for
> > very-very-very-read-only variables) so if it can solve this problem
> > too it would be nice to have a generic infrastructure for it.
> 
> That's an interesting idea.  Have you or anyone else looked at what it
> would take to code up?
> 
> For this case, I guess you'd walk the relocs looking for references into
> the paravirt_ops structure.  You'd need to check that was a reference
> from an indirect jump or call instruction, which would identify a
> patchable callsite.  The offset into the pv_ops structure would identify
> which operation is involved.

I wrote a whole email on ways to do this, BUT...

Perhaps our patching code can already be vastly simplified to something
like:

#define pv_patch(call, args...) \
	asm volatile("8888:"); 
	call(args);
	asm volatile("8889:"
	 [ stuff to put 8889, 8888 and call in fixup section ]

The patching code doesn't even need to decode between those two: it
simply looks for an indirect call insn (0xff 0xd?).  If it finds more
than one, it doesn't patch.  If it only finds one, it patches.  We'll
probably hit them all just doing that.

> What are the netdev requirements?

Reading Ben LaHaise's (very cool!) patch, it's not clear that using
reloc postprocessing is going to be clearer than open-coding it as he
has done.

Rusty.

^ permalink raw reply	[flat|nested] 93+ messages in thread

* Re: [patch 13/26] Xen-paravirt_ops: Consistently wrap paravirt ops callsites to make them patchable
  2007-03-17 10:33         ` Rusty Russell
@ 2007-03-18  7:33           ` David Miller
  2007-03-18  7:59             ` Jeremy Fitzhardinge
  2007-03-18 12:08             ` Andi Kleen
  0 siblings, 2 replies; 93+ messages in thread
From: David Miller @ 2007-03-18  7:33 UTC (permalink / raw)
  To: rusty
  Cc: jeremy, mingo, ak, akpm, linux-kernel, virtualization, xen-devel,
	chrisw, zach, anthony, torvalds, netdev

From: Rusty Russell <rusty@rustcorp.com.au>
Date: Sat, 17 Mar 2007 21:33:58 +1100

> On Fri, 2007-03-16 at 13:38 -0700, Jeremy Fitzhardinge wrote:
> > David Miller wrote:
> > > Perhaps the problem can be dealt with using ELF relocations.
> > >
> > > There is another case, discussed yesterday on netdev, where run-time
> > > resolution of ELF relocations would be useful (for
> > > very-very-very-read-only variables) so if it can solve this problem
> > > too it would be nice to have a generic infrastructure for it.
> > 
> > That's an interesting idea.  Have you or anyone else looked at what it
> > would take to code up?
> > 
> > For this case, I guess you'd walk the relocs looking for references into
> > the paravirt_ops structure.  You'd need to check that was a reference
> > from an indirect jump or call instruction, which would identify a
> > patchable callsite.  The offset into the pv_ops structure would identify
> > which operation is involved.
> 
> I wrote a whole email on ways to do this, BUT...

The idea is _NOT_ that you go look for references to the paravirt_ops
members structure, that would be stupid and you wouldn't be able to
use the most efficient addressing mode on a given cpu, you'd be
patching up indirect calls and crap like that.  Just say no...

Instead you get rid of paravirt ops completely, and you call functions
whose symbol name will not resolve in the initial kernel link.

You do an initial link of the kernel, look for the unresolved symbols
in the ELF relocation tables (just like the linker does), and put
those references into a table that is use to patch things up and you
can use standard ELF relocation code to handle this, exactly like code
we already have for module loading in the kernel already.

This idea is about 15 years old, sparc32 has been doing exactly this
via something called "btfixup" to handle the page table, TLB, and
cache differences of 15 different cpu+cache type combinations.

> #define pv_patch(call, args...) \
> 	asm volatile("8888:"); 
> 	call(args);
> 	asm volatile("8889:"
> 	 [ stuff to put 8889, 8888 and call in fixup section ]

Please, use ELF and it's powerful and clean existing way to
do this please. :-)

> > What are the netdev requirements?
> 
> Reading Ben LaHaise's (very cool!) patch, it's not clear that using
> reloc postprocessing is going to be clearer than open-coding it as he
> has done.

Ben's case can be handled in the same way.  Just do not define the
symbols, pre-link, look for the references in the relocation tables,
and run through that when you do the set_very_readonly() or
install_paravirt_ops() thing.

^ permalink raw reply	[flat|nested] 93+ messages in thread

* Re: [patch 13/26] Xen-paravirt_ops: Consistently wrap paravirt ops callsites to make them patchable
  2007-03-18  7:33           ` David Miller
@ 2007-03-18  7:59             ` Jeremy Fitzhardinge
  2007-03-18 12:08             ` Andi Kleen
  1 sibling, 0 replies; 93+ messages in thread
From: Jeremy Fitzhardinge @ 2007-03-18  7:59 UTC (permalink / raw)
  To: David Miller
  Cc: zach, xen-devel, ak, netdev, rusty, linux-kernel, chrisw,
	virtualization, anthony, mingo, torvalds, akpm

David Miller wrote:
> The idea is _NOT_ that you go look for references to the paravirt_ops
> members structure, that would be stupid and you wouldn't be able to
> use the most efficient addressing mode on a given cpu, you'd be
> patching up indirect calls and crap like that.  Just say no...
>
> Instead you get rid of paravirt ops completely, and you call functions
> whose symbol name will not resolve in the initial kernel link.
>   

Yeah, I came to that conclusion after thinking about it for a while. 
Thanks for the pointer to the sparc stuff; it looks very interesting.

    J

^ permalink raw reply	[flat|nested] 93+ messages in thread

* Re: [patch 13/26] Xen-paravirt_ops: Consistently wrap paravirt ops callsites to make them patchable
  2007-03-18  7:33           ` David Miller
  2007-03-18  7:59             ` Jeremy Fitzhardinge
@ 2007-03-18 12:08             ` Andi Kleen
  2007-03-18 15:58               ` Jeremy Fitzhardinge
  2007-03-19  2:47               ` Rusty Russell
  1 sibling, 2 replies; 93+ messages in thread
From: Andi Kleen @ 2007-03-18 12:08 UTC (permalink / raw)
  To: David Miller
  Cc: rusty, jeremy, mingo, akpm, linux-kernel, virtualization,
	xen-devel, chrisw, zach, anthony, torvalds, netdev

> The idea is _NOT_ that you go look for references to the paravirt_ops
> members structure, that would be stupid and you wouldn't be able to
> use the most efficient addressing mode on a given cpu, you'd be
> patching up indirect calls and crap like that.  Just say no...

That wouldn't handle inlines though. At least some of the current
paravirtops like cli/sti are critical enough to require inlining.

-Andi

^ permalink raw reply	[flat|nested] 93+ messages in thread

* Re: [patch 13/26] Xen-paravirt_ops: Consistently wrap paravirt ops callsites to make them patchable
  2007-03-18 12:08             ` Andi Kleen
@ 2007-03-18 15:58               ` Jeremy Fitzhardinge
  2007-03-18 17:04                 ` Andi Kleen
  2007-03-19  2:47               ` Rusty Russell
  1 sibling, 1 reply; 93+ messages in thread
From: Jeremy Fitzhardinge @ 2007-03-18 15:58 UTC (permalink / raw)
  To: Andi Kleen
  Cc: David Miller, rusty, mingo, akpm, linux-kernel, virtualization,
	xen-devel, chrisw, zach, anthony, torvalds, netdev

Andi Kleen wrote:
>> The idea is _NOT_ that you go look for references to the paravirt_ops
>> members structure, that would be stupid and you wouldn't be able to
>> use the most efficient addressing mode on a given cpu, you'd be
>> patching up indirect calls and crap like that.  Just say no...
>>     
>
> That wouldn't handle inlines though. At least some of the current
> paravirtops like cli/sti are critical enough to require inlining.
>   

You could special-case it in the thing handling the relocs; if you're
relocating to point to a function which you have an inline substitution,
then inline.

The bigger problem is that you don't know what registers you can
clobber.  For the pv_ops in hand-written asm, that a big pain.  The code
either has to assume the worst, or the "relocator" has to do something
more sophisticated (like look for push/pop pairs surrounding the call,
perhaps?).

    J

^ permalink raw reply	[flat|nested] 93+ messages in thread

* Re: [patch 13/26] Xen-paravirt_ops: Consistently wrap paravirt ops callsites to make them patchable
  2007-03-18 15:58               ` Jeremy Fitzhardinge
@ 2007-03-18 17:04                 ` Andi Kleen
  2007-03-18 17:29                   ` Jeremy Fitzhardinge
  0 siblings, 1 reply; 93+ messages in thread
From: Andi Kleen @ 2007-03-18 17:04 UTC (permalink / raw)
  To: Jeremy Fitzhardinge
  Cc: David Miller, rusty, mingo, akpm, linux-kernel, virtualization,
	xen-devel, chrisw, zach, anthony, torvalds, netdev

> The bigger problem is that you don't know what registers you can
> clobber.  For the pv_ops in hand-written asm, that a big pain.  The code
> either has to assume the worst, or the "relocator" has to do something
> more sophisticated (like look for push/pop pairs surrounding the call,
> perhaps?).

You could use the dwarf2 unwind tables. They have exact information
what register has what. But it would likely get complicated.

-Andi

^ permalink raw reply	[flat|nested] 93+ messages in thread

* Re: [patch 13/26] Xen-paravirt_ops: Consistently wrap paravirt ops callsites to make them patchable
  2007-03-18 17:04                 ` Andi Kleen
@ 2007-03-18 17:29                   ` Jeremy Fitzhardinge
  2007-03-18 19:30                     ` Andi Kleen
  0 siblings, 1 reply; 93+ messages in thread
From: Jeremy Fitzhardinge @ 2007-03-18 17:29 UTC (permalink / raw)
  To: Andi Kleen
  Cc: David Miller, rusty, mingo, akpm, linux-kernel, virtualization,
	xen-devel, chrisw, zach, anthony, torvalds, netdev

Andi Kleen wrote:
> You could use the dwarf2 unwind tables. They have exact information
> what register has what. But it would likely get complicated.

Yes.  And would they be accurate for hand-written asm, which is where we
have this problem?

    J

^ permalink raw reply	[flat|nested] 93+ messages in thread

* Re: [patch 13/26] Xen-paravirt_ops: Consistently wrap paravirt ops callsites to make them patchable
  2007-03-18 17:29                   ` Jeremy Fitzhardinge
@ 2007-03-18 19:30                     ` Andi Kleen
  2007-03-18 23:46                       ` Jeremy Fitzhardinge
  0 siblings, 1 reply; 93+ messages in thread
From: Andi Kleen @ 2007-03-18 19:30 UTC (permalink / raw)
  To: Jeremy Fitzhardinge
  Cc: David Miller, rusty, mingo, akpm, linux-kernel, virtualization,
	xen-devel, chrisw, zach, anthony, torvalds, netdev

On Sun, Mar 18, 2007 at 10:29:10AM -0700, Jeremy Fitzhardinge wrote:
> Andi Kleen wrote:
> > You could use the dwarf2 unwind tables. They have exact information
> > what register has what. But it would likely get complicated.
> 
> Yes.  And would they be accurate for hand-written asm, which is where we
> have this problem?

Yes. All inline assembly tells gcc what registers are clobbered
and it fills in the tables. Hand clobbering in inline assembly cannot
be expressed with the current toolchain, so we moved all those
out of line.

But again I'm not sure it will work anyways. For once you would
need large padding around the calls anyways for inline replacement --
how would you generate that? I expect you would need to put the calls
into asm() again and with that a custom annotiation format looks reasonable.

-Andi

^ permalink raw reply	[flat|nested] 93+ messages in thread

* Re: [patch 13/26] Xen-paravirt_ops: Consistently wrap paravirt ops callsites to make them patchable
  2007-03-18 19:30                     ` Andi Kleen
@ 2007-03-18 23:46                       ` Jeremy Fitzhardinge
  2007-03-19 10:57                         ` Andi Kleen
  2007-03-20  1:23                         ` Zachary Amsden
  0 siblings, 2 replies; 93+ messages in thread
From: Jeremy Fitzhardinge @ 2007-03-18 23:46 UTC (permalink / raw)
  To: Andi Kleen
  Cc: zach, xen-devel, akpm, netdev, rusty, linux-kernel, chrisw,
	virtualization, anthony, mingo, torvalds, David Miller

Andi Kleen wrote:
> Yes. All inline assembly tells gcc what registers are clobbered
> and it fills in the tables. Hand clobbering in inline assembly cannot
> be expressed with the current toolchain, so we moved all those
> out of line.
>
> But again I'm not sure it will work anyways. For once you would
> need large padding around the calls anyways for inline replacement --
> how would you generate that? I expect you would need to put the calls
> into asm() again and with that a custom annotiation format looks reasonable.

Inlining is most important for very small code: sti, cli, pushf;pop eax,
etc (in many cases, no-ops).  We'd have at least 5 bytes to work in, and
maybe more if there are surrounding push/pops to be consumed.

For example, say we wanted to put a general call for sti into entry.S,
where its expected it won't touch any registers.  In that case, we'd
have a sequence like:

    push %eax
    push %ecx
    push %edx
    call paravirt_cli
    pop %edx
    pop %ecx
    pop %eax
      

If we parse the relocs, then we'd find the reference to paravirt_cli. 
If we look at the byte before and see 0xe8, then we can see if its a
call.  If we then work out in each direction and see matched push/pops,
then we know what registers can be trashed in the call.  This also
allows us to determine the callsite size, and therefore how much space
we need for inlining.

So in this case, we see that there are 5 bytes for the call and a
further 6 bytes of push/pops available for inlining.

Of course this is hand-written code anyway, so there's no particular
burden to having some extra metadata stashed away in another section. 
For compiler-generated code, we know that it's already expecting
standard C ABI calling conventions.  The downside, of course, is that
only the 5 byte call space is available for inline patching.

    J

^ permalink raw reply	[flat|nested] 93+ messages in thread

* Re: [patch 13/26] Xen-paravirt_ops: Consistently wrap paravirt ops callsites to make them patchable
  2007-03-18 12:08             ` Andi Kleen
  2007-03-18 15:58               ` Jeremy Fitzhardinge
@ 2007-03-19  2:47               ` Rusty Russell
  2007-03-19 18:25                 ` Eric W. Biederman
  1 sibling, 1 reply; 93+ messages in thread
From: Rusty Russell @ 2007-03-19  2:47 UTC (permalink / raw)
  To: Andi Kleen
  Cc: David Miller, jeremy, mingo, akpm, linux-kernel, virtualization,
	xen-devel, chrisw, zach, anthony, torvalds, netdev

On Sun, 2007-03-18 at 13:08 +0100, Andi Kleen wrote:
> > The idea is _NOT_ that you go look for references to the paravirt_ops
> > members structure, that would be stupid and you wouldn't be able to
> > use the most efficient addressing mode on a given cpu, you'd be
> > patching up indirect calls and crap like that.  Just say no...
> 
> That wouldn't handle inlines though. At least some of the current
> paravirtops like cli/sti are critical enough to require inlining.

Well, we'd patch the inline over the call if we have room.

Magic patching would be neat, but the downsides are that (1) we can't
expand the patching room and (2) there's no way of attaching clobber
info to the call site (doing register liveness analysis is not
appealing).

Now, this may not be fatal.  5 bytes is enough for all the native ops to
be patched inline.   For lguest this covers popf and pushf, but not cli
and sti (10 bytes): they'd have to be calls.

As for clobber info, it turns out that almost all of the calls can
clobber %eax, which is probably enough.  We just need to mark the
handful of asm ones where this isn't true.

Thoughts?
Rusty.


^ permalink raw reply	[flat|nested] 93+ messages in thread

* Re: [patch 13/26] Xen-paravirt_ops: Consistently wrap paravirt ops callsites to make them patchable
  2007-03-18 23:46                       ` Jeremy Fitzhardinge
@ 2007-03-19 10:57                         ` Andi Kleen
  2007-03-19 17:58                           ` Jeremy Fitzhardinge
  2007-03-19 19:08                           ` David Miller
  2007-03-20  1:23                         ` Zachary Amsden
  1 sibling, 2 replies; 93+ messages in thread
From: Andi Kleen @ 2007-03-19 10:57 UTC (permalink / raw)
  To: virtualization, jbeulich
  Cc: Jeremy Fitzhardinge, Andi Kleen, xen-devel, netdev, linux-kernel,
	David Miller, chrisw, virtualization, anthony, akpm, torvalds,
	mingo

On Monday 19 March 2007 00:46, Jeremy Fitzhardinge wrote:
> Andi Kleen wrote:
> > Yes. All inline assembly tells gcc what registers are clobbered
> > and it fills in the tables. Hand clobbering in inline assembly cannot
> > be expressed with the current toolchain, so we moved all those
> > out of line.
> >
> > But again I'm not sure it will work anyways. For once you would
> > need large padding around the calls anyways for inline replacement --
> > how would you generate that? I expect you would need to put the calls
> > into asm() again and with that a custom annotiation format looks
> > reasonable.
>
> Inlining is most important for very small code: sti, cli, pushf;pop eax,
> etc (in many cases, no-ops).  We'd have at least 5 bytes to work in, and
> maybe more if there are surrounding push/pops to be consumed.
>
> For example, say we wanted to put a general call for sti into entry.S,
> where its expected it won't touch any registers.  In that case, we'd
> have a sequence like:
>
>     push %eax
>     push %ecx
>     push %edx
>     call paravirt_cli
>     pop %edx
>     pop %ecx
>     pop %eax

This cannot right now be expressed as inline assembly in the unwinder at all 
because there is no way to inject the push/pops into the compiler generated
ehframe tables.

[BTW I plan to resubmit the unwinder with some changes]

>
>
> If we parse the relocs, then we'd find the reference to paravirt_cli.
> If we look at the byte before and see 0xe8, then we can see if its a
> call.  If we then work out in each direction and see matched push/pops,
> then we know what registers can be trashed in the call.  This also
> allows us to determine the callsite size, and therefore how much space
> we need for inlining.

gcc normally doesn't generate push/pops around directly around the
call site, but somewhere else due to the way its register allocator works.
It can be anywhere in the function or even not there at all if the register
didn't contain anything useful. And they're not necessarily push/pops of 
course.

So you would need to write it as inline assembly. I'm not sure it would
be significantly cleaner than just having tables then.

> So in this case, we see that there are 5 bytes for the call and a
> further 6 bytes of push/pops available for inlining.
>
> Of course this is hand-written code anyway, so there's no particular
> burden to having some extra metadata stashed away in another section.
> For compiler-generated code, we know that it's already expecting
> standard C ABI calling conventions.  The downside, of course, is that
> only the 5 byte call space is available for inline patching.

It's unlikely you can do much useful in 5 bytes I guess.

Regarding cli/sti: i've been actually thinking about changing it in the
non paravirt kernel. IIRC most save_flags/restore_flags are inside
spin_lock_irqsave/restore() and that is a separate function anyways
so a little larger special case code is ok as long as it is not slower. 
There is some evidence that at least on P4 a software cli/sti flag without 
pushf/popf would be faster.

-Andi


^ permalink raw reply	[flat|nested] 93+ messages in thread

* Re: [patch 13/26] Xen-paravirt_ops: Consistently wrap paravirt ops callsites to make them patchable
  2007-03-19 10:57                         ` Andi Kleen
@ 2007-03-19 17:58                           ` Jeremy Fitzhardinge
  2007-03-19 19:08                           ` David Miller
  1 sibling, 0 replies; 93+ messages in thread
From: Jeremy Fitzhardinge @ 2007-03-19 17:58 UTC (permalink / raw)
  To: Andi Kleen
  Cc: virtualization, jbeulich, Andi Kleen, xen-devel, netdev,
	linux-kernel, David Miller, chrisw, virtualization, anthony, akpm,
	torvalds, mingo

Andi Kleen wrote:
>>
>>     push %eax
>>     push %ecx
>>     push %edx
>>     call paravirt_cli
>>     pop %edx
>>     pop %ecx
>>     pop %eax
>>     
>
> This cannot right now be expressed as inline assembly in the unwinder at all 
> because there is no way to inject the push/pops into the compiler generated
> ehframe tables.
>   

I was thinking more of entry.S rather than inline asms.  The only ones I
can think of which would be relevant are the interrupt operations in
spinlocks, and even then we get a bit more leeway.

> gcc normally doesn't generate push/pops around directly around the
> call site, but somewhere else due to the way its register allocator works.
> It can be anywhere in the function or even not there at all if the register
> didn't contain anything useful. And they're not necessarily push/pops of 
> course.
>   

Right, I'm not making any assumptions about what gcc generates for
calls, other than assuming it uses a standard "call X" direct call
instruction.

> So you would need to write it as inline assembly. I'm not sure it would
> be significantly cleaner than just having tables then.
>   

No, my intention is that the vast majority of pv_ops uses, which are
calls from C code, would simply be normal C calls, syntatically and
semantically.

> It's unlikely you can do much useful in 5 bytes I guess.
>   

I think the main value is retaining non-PARAVIRT native performance
levels.  In the native case, the inlined instructions amount to 1 or 2
bytes, and having small patch sites is an advantage because there's less
space wasted on nops.  My understanding is that a direct call has almost
zero overhead on modern processors, because both the call and the return
can be completely predicted and prefetched, so the threshhold at which
you make inline vs call tradeoff is pretty small.

> Regarding cli/sti: i've been actually thinking about changing it in the
> non paravirt kernel. IIRC most save_flags/restore_flags are inside
> spin_lock_irqsave/restore() and that is a separate function anyways
> so a little larger special case code is ok as long as it is not slower. 
> There is some evidence that at least on P4 a software cli/sti flag without 
> pushf/popf would be faster.
>   

There are also the ones in entry.S.  I suppose spinlocks do get used
more than syscalls and interrupts.


    J

^ permalink raw reply	[flat|nested] 93+ messages in thread

* Re: [patch 13/26] Xen-paravirt_ops: Consistently wrap paravirt ops callsites to make them patchable
  2007-03-19  2:47               ` Rusty Russell
@ 2007-03-19 18:25                 ` Eric W. Biederman
  2007-03-19 18:38                   ` Linus Torvalds
                                     ` (2 more replies)
  0 siblings, 3 replies; 93+ messages in thread
From: Eric W. Biederman @ 2007-03-19 18:25 UTC (permalink / raw)
  To: Rusty Russell
  Cc: Andi Kleen, David Miller, jeremy, mingo, akpm, linux-kernel,
	virtualization, xen-devel, chrisw, zach, anthony, torvalds,
	netdev

Rusty Russell <rusty@rustcorp.com.au> writes:

> On Sun, 2007-03-18 at 13:08 +0100, Andi Kleen wrote:
>> > The idea is _NOT_ that you go look for references to the paravirt_ops
>> > members structure, that would be stupid and you wouldn't be able to
>> > use the most efficient addressing mode on a given cpu, you'd be
>> > patching up indirect calls and crap like that.  Just say no...
>> 
>> That wouldn't handle inlines though. At least some of the current
>> paravirtops like cli/sti are critical enough to require inlining.
>
> Well, we'd patch the inline over the call if we have room.
>
> Magic patching would be neat, but the downsides are that (1) we can't
> expand the patching room and (2) there's no way of attaching clobber
> info to the call site (doing register liveness analysis is not
> appealing).

True.  You can use all of the call clobbered registers.

> Now, this may not be fatal.  5 bytes is enough for all the native ops to
> be patched inline.   For lguest this covers popf and pushf, but not cli
> and sti (10 bytes): they'd have to be calls.
>
> As for clobber info, it turns out that almost all of the calls can
> clobber %eax, which is probably enough.  We just need to mark the
> handful of asm ones where this isn't true.

I guess if the code is larger than a function call I'm failing to see
the disadvantage in making it a direct function call.  Any modern
processor ought to be able to predict it perfectly, and processors
like the P4 may even optimize the call out of their L1 instruction
cache.

If what David is suggesting works, making all of these direct calls
looks easy and very maintainable.   At which point patching
instructions inline is quite possibly overkill.

Is it truly critical to inline any of these instructions?

Eric

^ permalink raw reply	[flat|nested] 93+ messages in thread

* Re: [patch 13/26] Xen-paravirt_ops: Consistently wrap paravirt ops callsites to make them patchable
  2007-03-19 18:25                 ` Eric W. Biederman
@ 2007-03-19 18:38                   ` Linus Torvalds
  2007-03-19 18:44                     ` Linus Torvalds
                                       ` (2 more replies)
  2007-03-19 18:41                   ` Chris Wright
  2007-03-19 19:10                   ` Jeremy Fitzhardinge
  2 siblings, 3 replies; 93+ messages in thread
From: Linus Torvalds @ 2007-03-19 18:38 UTC (permalink / raw)
  To: Eric W. Biederman
  Cc: xen-devel, virtualization, netdev, linux-kernel, David Miller,
	chrisw, Andi Kleen, anthony, mingo, akpm



On Mon, 19 Mar 2007, Eric W. Biederman wrote:
> 
> True.  You can use all of the call clobbered registers.

Quite often, the biggest single win of inlining is not so much the code 
size (although if done right, that will be smaller too), but the fact that 
inlining DOES NOT CLOBBER AS MANY REGISTERS!

The lack of register clobbering, and the freedom for the compiler to 
choose registers around an inlined function is usually the biggest win! If 
you can't do that, then inlining generally doesn't actually even help: a 
call/return to a single instruction isn't all that much slower than just 
doing the "cli" in the first place.

If we end up with a setup where any inlined instruction needs to act as if 
it was a function call (just with the "call" instruction papered over with 
the inlined instruction sequence), then there is no point to this at all. 

In short: people here seem to think that inlining is about avoiding the 
call/ret sequence. Not so. The real advantages of inlining are elsewhere.

So *please* don't believe that you can make it "as cheap" to have some 
automatic fixup of two sequences, one inlined and one as a "call".  It may 
look so when you look at the single instruction generated, but you're 
ignoring all the instructions *around* the site.

			Linus

^ permalink raw reply	[flat|nested] 93+ messages in thread

* Re: [patch 13/26] Xen-paravirt_ops: Consistently wrap paravirt ops callsites to make them patchable
  2007-03-19 18:25                 ` Eric W. Biederman
  2007-03-19 18:38                   ` Linus Torvalds
@ 2007-03-19 18:41                   ` Chris Wright
  2007-03-19 19:10                   ` Jeremy Fitzhardinge
  2 siblings, 0 replies; 93+ messages in thread
From: Chris Wright @ 2007-03-19 18:41 UTC (permalink / raw)
  To: Eric W. Biederman
  Cc: zach, jeremy, xen-devel, anthony, virtualization, netdev,
	Rusty Russell, linux-kernel, chrisw, Andi Kleen, mingo, akpm,
	torvalds, David Miller

* Eric W. Biederman (ebiederm@xmission.com) wrote:
> Is it truly critical to inline any of these instructions?

I don't have any current measurements.  But we'd been aiming
at getting irq_{en,dis}able to a simple memory write to pda.
But simplicity, maintenance, etc. win over trimming a couple
cycles, so still worth real look.

thanks,
-chris

^ permalink raw reply	[flat|nested] 93+ messages in thread

* Re: [patch 13/26] Xen-paravirt_ops: Consistently wrap paravirt ops callsites to make them patchable
  2007-03-19 18:38                   ` Linus Torvalds
@ 2007-03-19 18:44                     ` Linus Torvalds
  2007-03-19 19:33                     ` Jeremy Fitzhardinge
  2007-03-20  0:01                     ` Rusty Russell
  2 siblings, 0 replies; 93+ messages in thread
From: Linus Torvalds @ 2007-03-19 18:44 UTC (permalink / raw)
  To: Eric W. Biederman
  Cc: xen-devel, virtualization, netdev, linux-kernel, David Miller,
	chrisw, Andi Kleen, anthony, mingo, akpm



On Mon, 19 Mar 2007, Linus Torvalds wrote:
> 
> So *please* don't believe that you can make it "as cheap" to have some 
> automatic fixup of two sequences, one inlined and one as a "call".  It may 
> look so when you look at the single instruction generated, but you're 
> ignoring all the instructions *around* the site.

Side note, you can certainly fix things like this at least in theory, but 
it requires:

 - the "call" instruction that is used instead of the inlining should 
   basically have no callee-clobbers. Any paravirt_ops called this way
   should have a special calling sequence where they simple save and 
   restore all the registers they use.

   This is usually not that bad. Just create a per-architecture wrapper 
   function that saves/restores anything that the C calling convention on 
   that architecture says is clobbered by calls.

 - if the function has arguments, and the inlined sequence can take the 
   arguments in arbitrary registers, you are going to penalize the inlined 
   sequence anyway (by forcing some fixed arbitrary register allocation 
   policy).This thing is largely unfixable without some really extreme 
   compiler games (like post-processing the assembler output and having 
   different entry-points depending on where the arguments are..)

.. it will obviously depend on how thngs are done whether these things are 
useful or not. But it does mean that it's always a good idea to just have 
a config option of "turn off all the paravirt crap, because it *does* add 
overhead, and replacing instructions on the fly doesn't make that 
overhead go away".

		Linus

^ permalink raw reply	[flat|nested] 93+ messages in thread

* Re: [patch 13/26] Xen-paravirt_ops: Consistently wrap paravirt ops callsites to make them patchable
  2007-03-19 10:57                         ` Andi Kleen
  2007-03-19 17:58                           ` Jeremy Fitzhardinge
@ 2007-03-19 19:08                           ` David Miller
  2007-03-19 20:59                             ` Andi Kleen
  1 sibling, 1 reply; 93+ messages in thread
From: David Miller @ 2007-03-19 19:08 UTC (permalink / raw)
  To: ak
  Cc: virtualization, jbeulich, jeremy, ak, xen-devel, netdev,
	linux-kernel, chrisw, virtualization, anthony, akpm, torvalds,
	mingo

From: Andi Kleen <ak@suse.de>
Date: Mon, 19 Mar 2007 11:57:28 +0100

> On Monday 19 March 2007 00:46, Jeremy Fitzhardinge wrote:
> > Andi Kleen wrote:
> > For example, say we wanted to put a general call for sti into entry.S,
> > where its expected it won't touch any registers.  In that case, we'd
> > have a sequence like:
> >
> >     push %eax
> >     push %ecx
> >     push %edx
> >     call paravirt_cli
> >     pop %edx
> >     pop %ecx
> >     pop %eax
> 
> This cannot right now be expressed as inline assembly in the unwinder at all 
> because there is no way to inject the push/pops into the compiler generated
> ehframe tables.
> 
> [BTW I plan to resubmit the unwinder with some changes]

It's inability to handle sequences like the above sounds to me like
a very good argument to _not_ merge the unwinder back into the tree.

To me, that unwinder is nothing but trouble, it severly limits what
cases you can use special calling conventions via inline asm (and we
have done that on several occaisions) and even ignoring that the
unwinder only works half the time.

Please don't subject us to another couple months of hair-pulling only
to have Linus yank the thing out again, there are certainly more
useful things to spend time on :-)

^ permalink raw reply	[flat|nested] 93+ messages in thread

* Re: [patch 13/26] Xen-paravirt_ops: Consistently wrap paravirt ops callsites to make them patchable
  2007-03-19 18:25                 ` Eric W. Biederman
  2007-03-19 18:38                   ` Linus Torvalds
  2007-03-19 18:41                   ` Chris Wright
@ 2007-03-19 19:10                   ` Jeremy Fitzhardinge
  2007-03-19 19:46                     ` David Miller
  2007-03-19 23:42                     ` Andi Kleen
  2 siblings, 2 replies; 93+ messages in thread
From: Jeremy Fitzhardinge @ 2007-03-19 19:10 UTC (permalink / raw)
  To: Eric W. Biederman
  Cc: zach, xen-devel, akpm, virtualization, netdev, Rusty Russell,
	linux-kernel, chrisw, Andi Kleen, anthony, mingo, torvalds,
	David Miller

Eric W. Biederman wrote:
> Rusty Russell <rusty@rustcorp.com.au> writes:
>
>   
>> On Sun, 2007-03-18 at 13:08 +0100, Andi Kleen wrote:
>>     
>>>> The idea is _NOT_ that you go look for references to the paravirt_ops
>>>> members structure, that would be stupid and you wouldn't be able to
>>>> use the most efficient addressing mode on a given cpu, you'd be
>>>> patching up indirect calls and crap like that.  Just say no...
>>>>         
>>> That wouldn't handle inlines though. At least some of the current
>>> paravirtops like cli/sti are critical enough to require inlining.
>>>       
>> Well, we'd patch the inline over the call if we have room.
>>
>> Magic patching would be neat, but the downsides are that (1) we can't
>> expand the patching room and (2) there's no way of attaching clobber
>> info to the call site (doing register liveness analysis is not
>> appealing).
>>     
>
> True.  You can use all of the call clobbered registers.
>   

The trouble is that there are places where we want to intercept things
like sti/cli in entry.S in a context where all the registers are in use,
so we can't generally make the assumption that caller-saved regs are
clobberable.  For any call-site in compiler-generated code we can make
that assumption, of course.

>> Now, this may not be fatal.  5 bytes is enough for all the native ops to
>> be patched inline.   For lguest this covers popf and pushf, but not cli
>> and sti (10 bytes): they'd have to be calls.
>>
>> As for clobber info, it turns out that almost all of the calls can
>> clobber %eax, which is probably enough.  We just need to mark the
>> handful of asm ones where this isn't true.
>>     
>
> I guess if the code is larger than a function call I'm failing to see
> the disadvantage in making it a direct function call.  Any modern
> processor ought to be able to predict it perfectly, and processors
> like the P4 may even optimize the call out of their L1 instruction
> cache.
>   

For current processors, I think i-cache pollution is really the only
downside to a call.  But maybe you could put multiple pv-op functions
into a cacheline if they're used in close proximity
(cli/sti/save_fl/restore_fl would all fit into a single cacheline).

> If what David is suggesting works, making all of these direct calls
> looks easy and very maintainable.   At which point patching
> instructions inline is quite possibly overkill.
>   

I spent some time looking at what it would take to get this going.  It
would probably look something like:

   1. make CONFIG_PARAVIRT select CONFIG_RELOCATABLE
   2. generate vmlinux
   3. extract relocs, and process the references to paravirt symbols
      into a table
   4. compile and link that table into the kernel again (like the ksyms
      dance)
   5. at boot time, use that table to redirect calls/references into the
      paravirt backend to the appropriate one

I think if we store the reloc references as section-relative, and the
reloc table itself is linked into its own section, then there's no need
for multipass linking to combine the reloc table into the kernel image,
since adding that table won't affect any existing reloc.

All this is doable; I'd probably end up hacking boot/compressed/relocs.c
to generate the appropriate reloc table.  My main concern is hacking the
kernel build process itself; I'm unsure of what it would actually take
to implement all this.

> Is it truly critical to inline any of these instructions?
>   

Possibly not, but I'd like to be able to say with confidence that
running a PARAVIRT kernel on bare hardware has no performance loss
compared to running a !PARAVIRT kernel.  There's the case of small
instruction sequences which have been replaced with calls (such as
sti/cli/push;popf/etc), and also the case of hooks which are noops on
native (like make_pte/pte_val, etc).  In those cases it would be nice to
patch in a replacement, either a real instruction or just nops.


    J

^ permalink raw reply	[flat|nested] 93+ messages in thread

* Re: [patch 13/26] Xen-paravirt_ops: Consistently wrap paravirt ops callsites to make them patchable
  2007-03-19 18:38                   ` Linus Torvalds
  2007-03-19 18:44                     ` Linus Torvalds
@ 2007-03-19 19:33                     ` Jeremy Fitzhardinge
  2007-03-20  0:01                     ` Rusty Russell
  2 siblings, 0 replies; 93+ messages in thread
From: Jeremy Fitzhardinge @ 2007-03-19 19:33 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Eric W. Biederman, Rusty Russell, Andi Kleen, David Miller, mingo,
	akpm, linux-kernel, virtualization, xen-devel, chrisw, zach,
	anthony, netdev

Linus Torvalds wrote:
> On Mon, 19 Mar 2007, Eric W. Biederman wrote:
>   
>> True.  You can use all of the call clobbered registers.
>>     
>
> Quite often, the biggest single win of inlining is not so much the code 
> size (although if done right, that will be smaller too), but the fact that 
> inlining DOES NOT CLOBBER AS MANY REGISTERS!
>   

Yes, that's something that we've been conscious of when designing the
pv_ops patching mechanism.  As it stands, each time you emit a call to a
pv-ops operation, it not only stores the start and end of the patchable
area, but also which registers are available for clobbering at that
callsite by the patched-in code.  For things like sti/cli, where the
surrounding code expects nothing to be clobbered, we make sure that the
regs are preserved on the pv-ops side, even though there's a call to a
normal C function in the middle.  That gives in the pvops backend the
flexibility to patch over that site with either some inline code or a
call out to some function which doesn't necessarily clobber the full
caller-save set (or even any registers).

> In short: people here seem to think that inlining is about avoiding the 
> call/ret sequence. Not so. The real advantages of inlining are elsewhere.
>   

Yes.  Probably the biggest win of inlining is constant propagation into
the inline function, but register-use flexibility is good small-scale
win (vs the micro-scale call/ret elimination).

> Side note, you can certainly fix things like this at least in theory, but 
> it requires:
>
>  - the "call" instruction that is used instead of the inlining should 
>    basically have no callee-clobbers. Any paravirt_ops called this way
>    should have a special calling sequence where they simple save and 
>    restore all the registers they use.
>
>    This is usually not that bad. Just create a per-architecture wrapper 
>    function that saves/restores anything that the C calling convention on 
>    that architecture says is clobbered by calls.
>   

Most of the places we intercept are normal C calls anyway, so this isn't
a big issue.  Its mainly the places where we replace single instructions
that this will have a big effect.

The trouble with this is that we're back to having to have special
wrappers around each call to hide the normal C calling convention from
the compiler, so that it knows that it has more registers to play with. 
This was the main complaint about the original version of the patch,
because it all looks very ugly.

>  - if the function has arguments, and the inlined sequence can take the 
>    arguments in arbitrary registers, you are going to penalize the inlined 
>    sequence anyway (by forcing some fixed arbitrary register allocation 
>    policy).This thing is largely unfixable without some really extreme 
>    compiler games (like post-processing the assembler output and having 
>    different entry-points depending on where the arguments are..)
>   

Yeah, that doesn't sound like much fun.  I think using the normal
regparm calling convention will be OK.  Aside from some slightly longer
instruction encodings, all the registers are more or less
interchangeable anyway.

> .. it will obviously depend on how thngs are done whether these things are 
> useful or not. But it does mean that it's always a good idea to just have 
> a config option of "turn off all the paravirt crap, because it *does* add 
> overhead, and replacing instructions on the fly doesn't make that 
> overhead go away".

Yes, there's a big switch to turn all this off.  It would be nice if we
could get things to the point that it isn't necessary to have (ie,
running on bare hardware really is indistinguishable either way), but
we're a fair way from being able to prove that.  In the meantime, the
goal is to try to keep the source-level changes a local as possible so
that maintaining the CONFIG_PARAVIRT vs non-PARAVIRT distinction is
straightforward.


    J

^ permalink raw reply	[flat|nested] 93+ messages in thread

* Re: [patch 13/26] Xen-paravirt_ops: Consistently wrap paravirt ops callsites to make them patchable
  2007-03-19 19:10                   ` Jeremy Fitzhardinge
@ 2007-03-19 19:46                     ` David Miller
  2007-03-19 20:06                       ` Jeremy Fitzhardinge
  2007-03-19 23:42                     ` Andi Kleen
  1 sibling, 1 reply; 93+ messages in thread
From: David Miller @ 2007-03-19 19:46 UTC (permalink / raw)
  To: jeremy
  Cc: ebiederm, rusty, ak, mingo, akpm, linux-kernel, virtualization,
	xen-devel, chrisw, zach, anthony, torvalds, netdev

From: Jeremy Fitzhardinge <jeremy@goop.org>
Date: Mon, 19 Mar 2007 12:10:08 -0700

> All this is doable; I'd probably end up hacking boot/compressed/relocs.c
> to generate the appropriate reloc table.  My main concern is hacking the
> kernel build process itself; I'm unsure of what it would actually take
> to implement all this.

32-bit Sparc's btfixup might be usable as a guide.

Another point worth making is that for function calls you
can fix things up lazily if you want.

So you link, build the reloc tables, then link in a *.o file that
does provide the functions in the form of stubs.  The stubs intercept
the call, and patch the callsite, then revector to the real handler.

I don't like this idea actually because it essentially means you
either:

1) Only allow one setting of the operations

OR

2) Need to have code which walks the whole reloc table anyways
   to handle settings after the first so you can revector
   everyone back to the stubs and lazy reloc properly again

In fact forget I mentioned this idea :)

As another note, I do agree with Linus about the register usage
arguments.  It is important.  I think it's been mentioned but what you
could do is save nothing (so that "sti" and "cli" are just that and
cost nothing), but the more complicated versions save and restore
enough registers to operate.

It all depends upon what you're trying to do.  For example, it's
easy to use patching to make different PTE layouts be supportable
in the same kernel image.  We do this on sparc64 since sun4v
has a different PTE layout than sun4u, you can see the code in
asm-sparc64/pgtable.h for details (search for "sun4v_*_patch")

^ permalink raw reply	[flat|nested] 93+ messages in thread

* Re: [patch 13/26] Xen-paravirt_ops: Consistently wrap paravirt ops callsites to make them patchable
  2007-03-19 19:46                     ` David Miller
@ 2007-03-19 20:06                       ` Jeremy Fitzhardinge
  0 siblings, 0 replies; 93+ messages in thread
From: Jeremy Fitzhardinge @ 2007-03-19 20:06 UTC (permalink / raw)
  To: David Miller
  Cc: zach, xen-devel, virtualization, netdev, rusty, linux-kernel,
	chrisw, ak, ebiederm, anthony, mingo, torvalds, akpm

David Miller wrote:
> Another point worth making is that for function calls you
> can fix things up lazily if you want.
> [...]
> In fact forget I mentioned this idea :)
>   

OK :)  I think we'll only ever want to bind to a hypervisor once, since
the underlying hypervisor can't change on the fly (well, in principle it
might if you migrate, but you'll have more problems than just dealing
with the hook points).  And lazy binding doesn't buy much; I think its
much better to get it all out of the way at once, and then throw the
reloc info away.

> As another note, I do agree with Linus about the register usage
> arguments.  It is important.  I think it's been mentioned but what you
> could do is save nothing (so that "sti" and "cli" are just that and
> cost nothing), but the more complicated versions save and restore
> enough registers to operate.
>   

Right, that's pretty much what we do now.

> It all depends upon what you're trying to do.  For example, it's
> easy to use patching to make different PTE layouts be supportable
> in the same kernel image.  We do this on sparc64 since sun4v
> has a different PTE layout than sun4u, you can see the code in
> asm-sparc64/pgtable.h for details (search for "sun4v_*_patch")
>   

I see.  We want to do something conceptually like this, but we need to
handle more than just adjusting which of two constants to use as a
mask.  For example, Xen needs to run pfns through a pfn->mfn (machine
frame number) conversion and back when making/unpacking pagetable entries.

    J

^ permalink raw reply	[flat|nested] 93+ messages in thread

* Re: [patch 13/26] Xen-paravirt_ops: Consistently wrap paravirt ops callsites to make them patchable
  2007-03-19 19:08                           ` David Miller
@ 2007-03-19 20:59                             ` Andi Kleen
  2007-03-20  3:18                               ` Linus Torvalds
  0 siblings, 1 reply; 93+ messages in thread
From: Andi Kleen @ 2007-03-19 20:59 UTC (permalink / raw)
  To: David Miller
  Cc: virtualization, jbeulich, jeremy, xen-devel, netdev, linux-kernel,
	chrisw, virtualization, anthony, akpm, torvalds, mingo


> It's inability to handle sequences like the above sounds to me like
> a very good argument to _not_ merge the unwinder back into the tree.

The unwinder can handle it fine, it is just that gcc right now cannot
be taught to generate correct unwind tables for it. If paravirt ops
is widely used i guess it would be possible to find some workaround
for that though.
 
There is one case it truly cannot handle (it would be possible by switching 
to dwarf3), but that was relatively easily eliminated and wasn't a problem
imho.

> To me, that unwinder is nothing but trouble, it severly limits what
> cases you can use special calling conventions via inline asm (and we
> have done that on several occaisions) and even ignoring that the
> unwinder only works half the time.

Initially we had some bugs that accounted for near all failures, but they 
were all fixed in the latest version.
 
> Please don't subject us to another couple months of hair-pulling only
> to have Linus yank the thing out again, there are certainly more
> useful things to spend time on :-)

I think better backtraces are a good use of my time. Sorry if you
disagree on that.

-Andi

^ permalink raw reply	[flat|nested] 93+ messages in thread

* Re: [patch 13/26] Xen-paravirt_ops: Consistently wrap paravirt ops callsites to make them patchable
  2007-03-19 19:10                   ` Jeremy Fitzhardinge
  2007-03-19 19:46                     ` David Miller
@ 2007-03-19 23:42                     ` Andi Kleen
  1 sibling, 0 replies; 93+ messages in thread
From: Andi Kleen @ 2007-03-19 23:42 UTC (permalink / raw)
  To: Jeremy Fitzhardinge
  Cc: Eric W. Biederman, Rusty Russell, David Miller, mingo, akpm,
	linux-kernel, virtualization, xen-devel, chrisw, zach, anthony,
	torvalds, netdev

> Possibly not, but I'd like to be able to say with confidence that
> running a PARAVIRT kernel on bare hardware has no performance loss
> compared to running a !PARAVIRT kernel.  There's the case of small
> instruction sequences which have been replaced with calls (such as
> sti/cli/push;popf/etc), 

My guess is that most critical pushf/popf are in spin_lock_irqsave(). It would 
be possible to special case that one -- inline it -- and use out of line
versions for all the others.

-Andi

^ permalink raw reply	[flat|nested] 93+ messages in thread

* Re: [patch 13/26] Xen-paravirt_ops: Consistently wrap paravirt ops callsites to make them patchable
  2007-03-19 18:38                   ` Linus Torvalds
  2007-03-19 18:44                     ` Linus Torvalds
  2007-03-19 19:33                     ` Jeremy Fitzhardinge
@ 2007-03-20  0:01                     ` Rusty Russell
  2007-03-20  2:00                       ` Zachary Amsden
  2 siblings, 1 reply; 93+ messages in thread
From: Rusty Russell @ 2007-03-20  0:01 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Eric W. Biederman, Andi Kleen, David Miller, jeremy, mingo, akpm,
	linux-kernel, virtualization, xen-devel, chrisw, zach, anthony,
	netdev

On Mon, 2007-03-19 at 11:38 -0700, Linus Torvalds wrote:
> 
> On Mon, 19 Mar 2007, Eric W. Biederman wrote:
> > 
> > True.  You can use all of the call clobbered registers.
> 
> Quite often, the biggest single win of inlining is not so much the code 
> size (although if done right, that will be smaller too), but the fact that 
> inlining DOES NOT CLOBBER AS MANY REGISTERS!

Thanks Linus.

*This* was the reason that the current hand-coded calls only clobber %
eax.  It was a compromise between native (no clobbers) and others (might
need a reg).

Now, since we decided to allow paravirt_ops operations to be normal C
(ie. the patching is optional and done late), we actually push and pop %
ecx and %edx.  This makes the call site 10 bytes long, which is a nice
size for patching anyway (enough for a movl $0, <addr>, a-la lguest's
cli, or movw $0, %gs:<addr> if we supported SMP).

The current 6 paravirt ops which are patched cover the vast majority of
calls (until the Xen patches, then we need ~4 more?).  Jeremy chose to
expand patching to cover *all* paravirt ops, rather than just the new
hot ones, and that's where we tipped over the ugliness threshold.

Cheers,
Rusty.


^ permalink raw reply	[flat|nested] 93+ messages in thread

* Re: [patch 13/26] Xen-paravirt_ops: Consistently wrap paravirt ops callsites to make them patchable
  2007-03-18 23:46                       ` Jeremy Fitzhardinge
  2007-03-19 10:57                         ` Andi Kleen
@ 2007-03-20  1:23                         ` Zachary Amsden
  2007-03-20  1:45                           ` Jeremy Fitzhardinge
  1 sibling, 1 reply; 93+ messages in thread
From: Zachary Amsden @ 2007-03-20  1:23 UTC (permalink / raw)
  To: Jeremy Fitzhardinge
  Cc: Andi Kleen, David Miller, rusty, mingo, akpm, linux-kernel,
	virtualization, xen-devel, chrisw, anthony, torvalds, netdev

Jeremy Fitzhardinge wrote:
> For example, say we wanted to put a general call for sti into entry.S,
> where its expected it won't touch any registers.  In that case, we'd
> have a sequence like:
>
>     push %eax
>     push %ecx
>     push %edx
>     call paravirt_cli
>     pop %edx
>     pop %ecx
>     pop %eax
>       
>
> If we parse the relocs, then we'd find the reference to paravirt_cli. 
> If we look at the byte before and see 0xe8, then we can see if its a
> call.  If we then work out in each direction and see matched push/pops,
> then we know what registers can be trashed in the call.  This also
> allows us to determine the callsite size, and therefore how much space
> we need for inlining.
>   

No, that is a very dangerous suggestion.  You absolutely *cannot* do 
this safely without explicitly marking the start EIP of this code.  You 
*must* use metadata to do that.  It is never safe to disassemble 
backwards or "rewind" EIP for x86 code.

Zach

^ permalink raw reply	[flat|nested] 93+ messages in thread

* Re: [patch 13/26] Xen-paravirt_ops: Consistently wrap paravirt ops callsites to make them patchable
  2007-03-20  1:23                         ` Zachary Amsden
@ 2007-03-20  1:45                           ` Jeremy Fitzhardinge
  0 siblings, 0 replies; 93+ messages in thread
From: Jeremy Fitzhardinge @ 2007-03-20  1:45 UTC (permalink / raw)
  To: Zachary Amsden
  Cc: Andi Kleen, David Miller, rusty, mingo, akpm, linux-kernel,
	virtualization, xen-devel, chrisw, anthony, torvalds, netdev

Zachary Amsden wrote:
> Jeremy Fitzhardinge wrote:
>>  If we then work out in each direction and see matched push/pops,
>> then we know what registers can be trashed in the call.  This also
>> allows us to determine the callsite size, and therefore how much space
>> we need for inlining.
>>   
>
> No, that is a very dangerous suggestion.  You absolutely *cannot* do
> this safely without explicitly marking the start EIP of this code. 
> You *must* use metadata to do that.  It is never safe to disassemble
> backwards or "rewind" EIP for x86 code. 

What do you mean the instruction before is "mov $0x52515000,%eax"?

Yeah, you're right.  Oh well.

    J

^ permalink raw reply	[flat|nested] 93+ messages in thread

* Re: [patch 13/26] Xen-paravirt_ops: Consistently wrap paravirt ops callsites to make them patchable
  2007-03-20  0:01                     ` Rusty Russell
@ 2007-03-20  2:00                       ` Zachary Amsden
  2007-03-20  4:20                         ` Rusty Russell
  2007-03-20  5:54                         ` Jeremy Fitzhardinge
  0 siblings, 2 replies; 93+ messages in thread
From: Zachary Amsden @ 2007-03-20  2:00 UTC (permalink / raw)
  To: Rusty Russell
  Cc: Linus Torvalds, Eric W. Biederman, Andi Kleen, David Miller,
	jeremy, mingo, akpm, linux-kernel, virtualization, xen-devel,
	chrisw, anthony, netdev

Rusty Russell wrote:
> On Mon, 2007-03-19 at 11:38 -0700, Linus Torvalds wrote:
>   
>> On Mon, 19 Mar 2007, Eric W. Biederman wrote:
>>     
>>> True.  You can use all of the call clobbered registers.
>>>       
>> Quite often, the biggest single win of inlining is not so much the code 
>> size (although if done right, that will be smaller too), but the fact that 
>> inlining DOES NOT CLOBBER AS MANY REGISTERS!
>>     

For VMI, the default clobber was "cc", and you need a way to allow at 
least that, because saving and restoring flags is too expensive on x86.

> Thanks Linus.
>
> *This* was the reason that the current hand-coded calls only clobber %
> eax.  It was a compromise between native (no clobbers) and others (might
> need a reg).
>   

I still don't think this was a good trade.  The primary motivation for 
clobbering %eax was that Xen wanted a free register to use for computing 
the offset into the shared data in the case of SMP preemptible kernels.  
Xen no longer needs such a register, they can use the PDA offset 
instead.  And it does hurt native performance by unconditionally 
stealing a register in the four most commonly invoked paravirt-ops code 
sequences.

> Now, since we decided to allow paravirt_ops operations to be normal C
> (ie. the patching is optional and done late), we actually push and pop %
> ecx and %edx.  This makes the call site 10 bytes long, which is a nice
> size for patching anyway (enough for a movl $0, <addr>, a-la lguest's
> cli, or movw $0, %gs:<addr> if we supported SMP).
>   

You can do it in 11 bytes with no clobbers and normal C semantics by 
linking to a direct address instead of calling to an indirect, but then 
you need some gross fixup technology in paravirt_patch:

if (call_addr == (void*)native_sti) {
  ...
}

I think we should probably try to do it in 12 bytes.  Freeing eax to the 
inline caller is likely to make up the 2 bytes of space more we have to nop.

One thing I always tried to get in VMI was to encapsulate the actual 
code which went through the business of computing arguments that were 
not even used in the native case.  Unfortunately, that seems impossible 
in the current design, but I don't think it is an issue because I don't 
think there is actually a way to express:

SWITCHABLE_CODE_BLOCK_BEGIN {
   /* arbitrary C code for native */
} SWITCHABLE_CODE_BLOCK_ALTERNATIVE {
   /* arbitrary C code for something else */
}

Dave's linker suggestion is probably the best for things like that.

Zach

^ permalink raw reply	[flat|nested] 93+ messages in thread

* Re: [patch 13/26] Xen-paravirt_ops: Consistently wrap paravirt ops callsites to make them patchable
  2007-03-19 20:59                             ` Andi Kleen
@ 2007-03-20  3:18                               ` Linus Torvalds
  2007-03-20  3:47                                 ` David Miller
  0 siblings, 1 reply; 93+ messages in thread
From: Linus Torvalds @ 2007-03-20  3:18 UTC (permalink / raw)
  To: Andi Kleen
  Cc: xen-devel, netdev, mingo, linux-kernel, jbeulich, virtualization,
	chrisw, virtualization, anthony, akpm, David Miller



On Mon, 19 Mar 2007, Andi Kleen wrote:
> 
> Initially we had some bugs that accounted for near all failures, but they 
> were all fixed in the latest version.

No. The real bugs were that the people involved wouldn't even accept that 
unwinding information was inevitably buggy and/or incomplete.

That much more fundamental bug never got fixed, as far as I know. 

I'm not going to merge anything that depends on unwind tables as things 
stand. The pain just isn't worth it.

> > Please don't subject us to another couple months of hair-pulling only
> > to have Linus yank the thing out again, there are certainly more
> > useful things to spend time on :-)

Good call. Dwarf2 unwinding simply isn't worth doing. But I won't yank it 
out, I simply won't merge it. It was more than just totally buggy code, it 
was an inability of the people to understand that even bugfree code 
isn't enough - you have to be able to also handle buggy data.

		Linus

^ permalink raw reply	[flat|nested] 93+ messages in thread

* Re: [patch 13/26] Xen-paravirt_ops: Consistently wrap paravirt ops callsites to make them patchable
  2007-03-20  3:18                               ` Linus Torvalds
@ 2007-03-20  3:47                                 ` David Miller
  2007-03-20  4:19                                   ` Eric W. Biederman
  0 siblings, 1 reply; 93+ messages in thread
From: David Miller @ 2007-03-20  3:47 UTC (permalink / raw)
  To: torvalds
  Cc: ak, virtualization, jbeulich, jeremy, xen-devel, netdev,
	linux-kernel, chrisw, virtualization, anthony, akpm, mingo

From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Mon, 19 Mar 2007 20:18:14 -0700 (PDT)

> > > Please don't subject us to another couple months of hair-pulling only
> > > to have Linus yank the thing out again, there are certainly more
> > > useful things to spend time on :-)
> 
> Good call. Dwarf2 unwinding simply isn't worth doing. But I won't yank it 
> out, I simply won't merge it. It was more than just totally buggy code, it 
> was an inability of the people to understand that even bugfree code 
> isn't enough - you have to be able to also handle buggy data.

Thank you.

^ permalink raw reply	[flat|nested] 93+ messages in thread

* Re: [patch 13/26] Xen-paravirt_ops: Consistently wrap paravirt ops callsites to make them patchable
  2007-03-20  3:47                                 ` David Miller
@ 2007-03-20  4:19                                   ` Eric W. Biederman
  2007-03-20 13:28                                     ` Andi Kleen
  2007-03-20 16:12                                     ` Chuck Ebbert
  0 siblings, 2 replies; 93+ messages in thread
From: Eric W. Biederman @ 2007-03-20  4:19 UTC (permalink / raw)
  To: David Miller
  Cc: xen-devel, netdev, linux-kernel, jbeulich, chrisw, virtualization,
	anthony, akpm, virtualization, torvalds, mingo

David Miller <davem@davemloft.net> writes:

> From: Linus Torvalds <torvalds@linux-foundation.org>
> Date: Mon, 19 Mar 2007 20:18:14 -0700 (PDT)
>
>> > > Please don't subject us to another couple months of hair-pulling only
>> > > to have Linus yank the thing out again, there are certainly more
>> > > useful things to spend time on :-)
>> 
>> Good call. Dwarf2 unwinding simply isn't worth doing. But I won't yank it 
>> out, I simply won't merge it. It was more than just totally buggy code, it 
>> was an inability of the people to understand that even bugfree code 
>> isn't enough - you have to be able to also handle buggy data.
>
> Thank you.

Hmm..

I know the feeling I have had a similar rant about the kexec on panic
code path.   The code is still no where near as paranoid about normal
kernel things not working as it could be, but by ranting about it
periodically the people doing the work are gradually making it better.

I'm conflicted about the dwarf unwinder.  I was off doing other things
at the time so I missed the pain, but I do have a distinct recollection of
the back traces on x86_64 being distinctly worse the on i386.  Lately
I haven't seen that so it may be I was misinterpreting what I was
seeing, and the compiler optimizations were what gave me such weird
back traces.  

But if the quality of our backtraces has gone down and dwarf unwinder
could give us better back traces it is likely worth pursuing.  Of
course it would need to start with the assumption that it's tables
may be borked (the kernel is busted after all) and be much more
careful than Andi's last attempt.

Eric

^ permalink raw reply	[flat|nested] 93+ messages in thread

* Re: [patch 13/26] Xen-paravirt_ops: Consistently wrap paravirt ops callsites to make them patchable
  2007-03-20  2:00                       ` Zachary Amsden
@ 2007-03-20  4:20                         ` Rusty Russell
  2007-03-20  5:54                         ` Jeremy Fitzhardinge
  1 sibling, 0 replies; 93+ messages in thread
From: Rusty Russell @ 2007-03-20  4:20 UTC (permalink / raw)
  To: Zachary Amsden
  Cc: jeremy, xen-devel, akpm, virtualization, netdev, linux-kernel,
	chrisw, Andi Kleen, Eric W. Biederman, anthony, mingo,
	Linus Torvalds, David Miller

On Mon, 2007-03-19 at 18:00 -0800, Zachary Amsden wrote:
> Rusty Russell wrote:
> > *This* was the reason that the current hand-coded calls only clobber %
> > eax.  It was a compromise between native (no clobbers) and others (might
> > need a reg).
> 
> I still don't think this was a good trade.
...
> Xen no longer needs such a register

Hmm, well, if VMI is happy, Xen is happy, and lguest is happy, then
perhaps we're better off with a cc-only clobber rule?  Certainly makes
life simpler.

> > Now, since we decided to allow paravirt_ops operations to be normal C
> > (ie. the patching is optional and done late), we actually push and pop %
> > ecx and %edx.  This makes the call site 10 bytes long, which is a nice
> > size for patching anyway (enough for a movl $0, <addr>, a-la lguest's
> > cli, or movw $0, %gs:<addr> if we supported SMP).
> 
> You can do it in 11 bytes with no clobbers and normal C semantics by 
> linking to a direct address instead of calling to an indirect, but then 
> you need some gross fixup technology in paravirt_patch:
> 
> if (call_addr == (void*)native_sti) {
>   ...
> }

Well, I don't think we need such hacks: since we have to use handcoded
asm and mark the callsites anyway, marking what they're calling is
trivial.

The other idea from "btfixup" is that we can do the patching *much*
earlier, so we don't need the initial code to be valid at all if we
wanted to: we just need room to patch in a call insn.  We could then
generate trampolines which do the necessary pushes & pops automatically
for backends which want to use C calling conventions.

Perhaps it's time for code and benchmarks?

Rusty.

^ permalink raw reply	[flat|nested] 93+ messages in thread

* Re: [patch 13/26] Xen-paravirt_ops: Consistently wrap paravirt ops callsites to make them patchable
  2007-03-20  2:00                       ` Zachary Amsden
  2007-03-20  4:20                         ` Rusty Russell
@ 2007-03-20  5:54                         ` Jeremy Fitzhardinge
  2007-03-20 11:33                           ` Andreas Kleen
  2007-03-20 15:09                           ` Linus Torvalds
  1 sibling, 2 replies; 93+ messages in thread
From: Jeremy Fitzhardinge @ 2007-03-20  5:54 UTC (permalink / raw)
  To: Zachary Amsden
  Cc: Rusty Russell, Linus Torvalds, Eric W. Biederman, Andi Kleen,
	David Miller, mingo, akpm, linux-kernel, virtualization,
	xen-devel, chrisw, anthony, netdev

Zachary Amsden wrote:
> For VMI, the default clobber was "cc", and you need a way to allow at
> least that, because saving and restoring flags is too expensive on x86.

According to lore (Andi, I think), asm() always clobbers cc. 

> I still don't think this was a good trade.  The primary motivation for
> clobbering %eax was that Xen wanted a free register to use for
> computing the offset into the shared data in the case of SMP
> preemptible kernels.  Xen no longer needs such a register, they can
> use the PDA offset instead.  And it does hurt native performance by
> unconditionally stealing a register in the four most commonly invoked
> paravirt-ops code sequences.

Actually, it still does need a temp register.  The sequence for cli is:

    mov %fs:xen_vcpu, %eax
    movb $1,1(%eax)

At some point I hope to move the vcpu structure directly into the
pda/percpu variables, at which point it will need no temps.

    J

^ permalink raw reply	[flat|nested] 93+ messages in thread

* Re: [patch 13/26] Xen-paravirt_ops: Consistently wrap paravirt ops callsites to make them patchable
  2007-03-20  5:54                         ` Jeremy Fitzhardinge
@ 2007-03-20 11:33                           ` Andreas Kleen
  2007-03-20 15:09                           ` Linus Torvalds
  1 sibling, 0 replies; 93+ messages in thread
From: Andreas Kleen @ 2007-03-20 11:33 UTC (permalink / raw)
  To: Jeremy Fitzhardinge
  Cc: Zachary Amsden, xen-devel, akpm, virtualization, netdev,
	linux-kernel, chrisw, Andi Kleen, Eric W. Biederman, anthony,
	mingo, Linus Torvalds, David Miller

Am Di 20.03.2007 06:54 schrieb Jeremy Fitzhardinge <jeremy@goop.org>:

> Zachary Amsden wrote:
> > For VMI, the default clobber was "cc", and you need a way to allow
> > at
> > least that, because saving and restoring flags is too expensive on
> > x86.
>
> According to lore (Andi, I think), asm() always clobbers cc.

asm with input and/or output. I'm not sure about asms without that.

-Andi



^ permalink raw reply	[flat|nested] 93+ messages in thread

* Re: [patch 13/26] Xen-paravirt_ops: Consistently wrap paravirt ops callsites to make them patchable
  2007-03-20  4:19                                   ` Eric W. Biederman
@ 2007-03-20 13:28                                     ` Andi Kleen
  2007-03-20 16:25                                       ` Eric W. Biederman
  2007-03-20 16:12                                     ` Chuck Ebbert
  1 sibling, 1 reply; 93+ messages in thread
From: Andi Kleen @ 2007-03-20 13:28 UTC (permalink / raw)
  To: Eric W. Biederman
  Cc: jeremy, xen-devel, netdev, mingo, linux-kernel, jbeulich,
	virtualization, chrisw, virtualization, anthony, akpm, torvalds,
	David Miller

\
> I'm conflicted about the dwarf unwinder.  I was off doing other things
> at the time so I missed the pain, but I do have a distinct recollection of
> the back traces on x86_64 being distinctly worse the on i386. 

The only case were i386 was better was with frame pointers, which
was never fully implemented for x86-64. However i find that hilarious: 
people are spending a lot of time right here in this thread to squeeze
out the best call sequences for the paravirt ops, but then accept
losing a full frame pointer register on i386. I never found that
acceptable, that is why I prefered the unwinder instead. 

This said the big problem with the frame pointers is mostly gone now:
on older CPUs it tended to cause a pipeline stall early in the function.
That is now fixed in the latest Intel/upcomming AMD CPUs, but there 
are still millions and millions of older CPUs out there so I still
don't consider it acceptable.

> Lately 
> I haven't seen that so it may be I was misinterpreting what I was
> seeing, and the compiler optimizations were what gave me such weird
> back traces. 

The main problem is that subsystems are getting more and more complex
and especially callbacks seem to multiply far too quickly.

In 2.4 it was often very reasonable to just sort out the false positives,
but with sometimes 20-30+ level deep call chains in 2.6 with many callbacks that just
gets far too tenuous. 
 
> But if the quality of our backtraces has gone down and dwarf unwinder
> could give us better back traces it is likely worth pursuing.  Of
> course it would need to start with the assumption that it's tables
> may be borked (the kernel is busted after all) and be much more
> careful than Andi's last attempt.

The latest version validates the stack always. It was only a few lines
of change. I doubt it will make much difference though. The few true crashes
we had were not actually due the unwinder itself, but the buggy fallback code
(which were fixed quickly). But anyways it should satisfy everybody's paranoia now.

Although in future it would be good if people did some more analysis in root causes
for failures before let the paranoia take over and revert patches.

We see a good example here of what I call the JFS/ACPI effect: code gets merged
too early with some visible problems. It gets a bad name and afterwards people never
look objectively at it again and just trust their prejudices. 

But that's not a good strategy to get good code in the end I think. If there
is enough evidence the early problems were fixed then prejudices should
be reevaluated.

I will let it cook some time in -mm* and we will see if it works now or not.
I'm pretty confident it will though. And if it does there is no reason not 
to resubmit it.

-Andi

^ permalink raw reply	[flat|nested] 93+ messages in thread

* Re: [patch 13/26] Xen-paravirt_ops: Consistently wrap paravirt ops callsites to make them patchable
  2007-03-20  5:54                         ` Jeremy Fitzhardinge
  2007-03-20 11:33                           ` Andreas Kleen
@ 2007-03-20 15:09                           ` Linus Torvalds
  2007-03-20 15:58                             ` Eric W. Biederman
                                               ` (2 more replies)
  1 sibling, 3 replies; 93+ messages in thread
From: Linus Torvalds @ 2007-03-20 15:09 UTC (permalink / raw)
  To: Jeremy Fitzhardinge
  Cc: xen-devel, akpm, virtualization, netdev, linux-kernel, chrisw,
	Andi Kleen, Eric W. Biederman, anthony, mingo, David Miller



On Mon, 19 Mar 2007, Jeremy Fitzhardinge wrote:
>
> Zachary Amsden wrote:
> > For VMI, the default clobber was "cc", and you need a way to allow at
> > least that, because saving and restoring flags is too expensive on x86.
> 
> According to lore (Andi, I think), asm() always clobbers cc. 

On x86, yes. Practically any instruction will clobber cc anyway, so it's 
the default.

Although the gcc guys have occasionally been suggesting we should set it 
in our asms.

> Actually, it still does need a temp register.  The sequence for cli is:
> 
>     mov %fs:xen_vcpu, %eax
>     movb $1,1(%eax)

We should just do this natively. There's been several tests over the years 
saying that it's much more efficient to do sti/cli as a simple store, and 
handling the "oops, we got an interrupt while interrupts were disabled" as 
a special case.

I have this dim memory that ARM has done it that way for a long time 
because it's so expensive to do a "real" cli/sti.

And I think -rt does it for other reasons. It's just more flexible.

		Linus

^ permalink raw reply	[flat|nested] 93+ messages in thread

* Re: [patch 13/26] Xen-paravirt_ops: Consistently wrap paravirt ops callsites to make them patchable
  2007-03-20 15:09                           ` Linus Torvalds
@ 2007-03-20 15:58                             ` Eric W. Biederman
  2007-03-20 16:06                               ` Linus Torvalds
                                                 ` (2 more replies)
  2007-03-20 17:00                             ` Ingo Molnar
  2007-03-21  0:03                             ` Paul Mackerras
  2 siblings, 3 replies; 93+ messages in thread
From: Eric W. Biederman @ 2007-03-20 15:58 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Jeremy Fitzhardinge, Zachary Amsden, Rusty Russell, Andi Kleen,
	David Miller, mingo, akpm, linux-kernel, virtualization,
	xen-devel, chrisw, anthony, netdev

Linus Torvalds <torvalds@linux-foundation.org> writes:

> On Mon, 19 Mar 2007, Jeremy Fitzhardinge wrote:
>> Actually, it still does need a temp register.  The sequence for cli is:
>> 
>>     mov %fs:xen_vcpu, %eax
>>     movb $1,1(%eax)
>
> We should just do this natively. There's been several tests over the years 
> saying that it's much more efficient to do sti/cli as a simple store, and 
> handling the "oops, we got an interrupt while interrupts were disabled" as 
> a special case.
>
> I have this dim memory that ARM has done it that way for a long time 
> because it's so expensive to do a "real" cli/sti.
>
> And I think -rt does it for other reasons. It's just more flexible.

If that is the case.  In the normal kernel what would
the "the oops, we got an interrupt code do?"
I assume it would leave interrupts disabled when it returns?
Like we currently do with the delayed disable of normal interrupts?

I'm trying to understand the proposed semantics.

Looking at the above code snippet.  I guess it is about time to
merge our per_cpu and pda variables...

Eric

^ permalink raw reply	[flat|nested] 93+ messages in thread

* Re: [patch 13/26] Xen-paravirt_ops: Consistently wrap paravirt ops callsites to make them patchable
  2007-03-20 15:58                             ` Eric W. Biederman
@ 2007-03-20 16:06                               ` Linus Torvalds
  2007-03-20 16:31                                 ` Jeremy Fitzhardinge
  2007-03-20 19:28                                 ` Andi Kleen
  2007-03-20 16:26                               ` Jeremy Fitzhardinge
  2007-03-20 22:41                               ` Rusty Russell
  2 siblings, 2 replies; 93+ messages in thread
From: Linus Torvalds @ 2007-03-20 16:06 UTC (permalink / raw)
  To: Eric W. Biederman
  Cc: xen-devel, virtualization, netdev, linux-kernel, David Miller,
	chrisw, Andi Kleen, anthony, akpm, mingo



On Tue, 20 Mar 2007, Eric W. Biederman wrote:
> 
> If that is the case.  In the normal kernel what would
> the "the oops, we got an interrupt code do?"
> I assume it would leave interrupts disabled when it returns?
> Like we currently do with the delayed disable of normal interrupts?

Yeah, disable interrupts, and set a flag that the fake "sti" can test, and 
just return without doing anything.

(You may or may not also need to do extra work to Ack the hardware 
interrupt etc, which may be irq-controller specific. Once the CPU has 
accepted the interrupt, you may not be able to just leave it dangling)

But I was throwing that out as a long-term thing. I'm not claiming it's 
trivial, but it should be doable.

		Linus

^ permalink raw reply	[flat|nested] 93+ messages in thread

* Re: [patch 13/26] Xen-paravirt_ops: Consistently wrap paravirt ops callsites to make them patchable
  2007-03-20  4:19                                   ` Eric W. Biederman
  2007-03-20 13:28                                     ` Andi Kleen
@ 2007-03-20 16:12                                     ` Chuck Ebbert
  1 sibling, 0 replies; 93+ messages in thread
From: Chuck Ebbert @ 2007-03-20 16:12 UTC (permalink / raw)
  To: Eric W. Biederman
  Cc: xen-devel, netdev, mingo, jbeulich, linux-kernel, chrisw,
	virtualization, anthony, akpm, virtualization, torvalds,
	David Miller

Eric W. Biederman wrote:
 
> I'm conflicted about the dwarf unwinder.  I was off doing other things
> at the time so I missed the pain, but I do have a distinct recollection of
> the back traces on x86_64 being distinctly worse the on i386.  Lately
> I haven't seen that so it may be I was misinterpreting what I was
> seeing, and the compiler optimizations were what gave me such weird
> back traces.  
> 

Well, if you compile x86_64 with frame pointers it helps a bit because
the compiler doesn't tail merge function calls. But the stack backtrace
ignores the frame pointers even if they're present, unlike i386 which
will use them.

^ permalink raw reply	[flat|nested] 93+ messages in thread

* Re: [patch 13/26] Xen-paravirt_ops: Consistently wrap paravirt ops callsites to make them patchable
  2007-03-20 13:28                                     ` Andi Kleen
@ 2007-03-20 16:25                                       ` Eric W. Biederman
  2007-03-20 17:42                                         ` Andi Kleen
  0 siblings, 1 reply; 93+ messages in thread
From: Eric W. Biederman @ 2007-03-20 16:25 UTC (permalink / raw)
  To: Andi Kleen
  Cc: xen-devel, netdev, mingo, linux-kernel, jbeulich, virtualization,
	chrisw, virtualization, anthony, akpm, torvalds, David Miller

Andi Kleen <ak@suse.de> writes:

>> I'm conflicted about the dwarf unwinder.  I was off doing other things
>> at the time so I missed the pain, but I do have a distinct recollection of
>> the back traces on x86_64 being distinctly worse the on i386. 
>
> The only case were i386 was better was with frame pointers, which
> was never fully implemented for x86-64. However i find that hilarious: 
> people are spending a lot of time right here in this thread to squeeze
> out the best call sequences for the paravirt ops, but then accept
> losing a full frame pointer register on i386. I never found that
> acceptable, that is why I prefered the unwinder instead. 
>
> This said the big problem with the frame pointers is mostly gone now:
> on older CPUs it tended to cause a pipeline stall early in the function.
> That is now fixed in the latest Intel/upcomming AMD CPUs, but there 
> are still millions and millions of older CPUs out there so I still
> don't consider it acceptable.

What I recall observing is call traces that made no sense.  Not just
extra noise in the stack trace but things like seeing a function that
has exactly one path to it, and not seeing all of the functions on
that path in the call trace.

In my later debugging I have been reasonably able to attribute those
kinds of things to compiler optimizations like inlining and tail call
optimization.

Now I will agree that having fewer or no false positives to weed
through is a good thing, if we can do it reliably.

>> Lately 
>> I haven't seen that so it may be I was misinterpreting what I was
>> seeing, and the compiler optimizations were what gave me such weird
>> back traces. 
>
> The main problem is that subsystems are getting more and more complex
> and especially callbacks seem to multiply far too quickly.
>
> In 2.4 it was often very reasonable to just sort out the false positives,
> but with sometimes 20-30+ level deep call chains in 2.6 with many callbacks that
> just
> gets far too tenuous. 

Hmm.  I haven't seen those traces, but I wonder if the size of those
stack traces indicates potential stack overflow problems.
  
>> But if the quality of our backtraces has gone down and dwarf unwinder
>> could give us better back traces it is likely worth pursuing.  Of
>> course it would need to start with the assumption that it's tables
>> may be borked (the kernel is busted after all) and be much more
>> careful than Andi's last attempt.
>
> The latest version validates the stack always. It was only a few lines
> of change. I doubt it will make much difference though. The few true crashes
> we had were not actually due the unwinder itself, but the buggy fallback code
> (which were fixed quickly). But anyways it should satisfy everybody's paranoia
> now.

Do you also validate the unwind data?

> Although in future it would be good if people did some more analysis in root
> causes for failures before let the paranoia take over and revert patches.
>
> We see a good example here of what I call the JFS/ACPI effect: code gets merged
> too early with some visible problems. It gets a bad name and afterwards people
> never look objectively at it again and just trust their prejudices. 

I don't know.  The impression I got was the root cause analysis stopped 
when it was observed that the code was unsuitable for solving the problem.
When asked about it, it appeared the developer did not understand the
question.  Therefore the root cause was assumed to be the developer.

At least that is how I have read the few little bits I have seen.

> But that's not a good strategy to get good code in the end I think. If there
> is enough evidence the early problems were fixed then prejudices should
> be reevaluated.

Certainly.  However if the developer has lost a certain amount of
initial trust, the burden becomes much higher.

Eric

^ permalink raw reply	[flat|nested] 93+ messages in thread

* Re: [patch 13/26] Xen-paravirt_ops: Consistently wrap paravirt ops callsites to make them patchable
  2007-03-20 15:58                             ` Eric W. Biederman
  2007-03-20 16:06                               ` Linus Torvalds
@ 2007-03-20 16:26                               ` Jeremy Fitzhardinge
  2007-03-20 22:41                               ` Rusty Russell
  2 siblings, 0 replies; 93+ messages in thread
From: Jeremy Fitzhardinge @ 2007-03-20 16:26 UTC (permalink / raw)
  To: Eric W. Biederman
  Cc: Linus Torvalds, Zachary Amsden, Rusty Russell, Andi Kleen,
	David Miller, mingo, akpm, linux-kernel, virtualization,
	xen-devel, chrisw, anthony, netdev

Eric W. Biederman wrote:
> Looking at the above code snippet.  I guess it is about time to
> merge our per_cpu and pda variables...

Rusty has a nice patch series to do just that; I think he's still
looking for a taker.

    J

^ permalink raw reply	[flat|nested] 93+ messages in thread

* Re: [patch 13/26] Xen-paravirt_ops: Consistently wrap paravirt ops callsites to make them patchable
  2007-03-20 16:06                               ` Linus Torvalds
@ 2007-03-20 16:31                                 ` Jeremy Fitzhardinge
  2007-03-20 22:09                                   ` Zachary Amsden
  2007-03-20 22:43                                   ` Matt Mackall
  2007-03-20 19:28                                 ` Andi Kleen
  1 sibling, 2 replies; 93+ messages in thread
From: Jeremy Fitzhardinge @ 2007-03-20 16:31 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Zachary Amsden, xen-devel, akpm, virtualization, netdev,
	Rusty Russell, linux-kernel, chrisw, Andi Kleen,
	Eric W. Biederman, anthony, mingo, David Miller

Linus Torvalds wrote:
> On Tue, 20 Mar 2007, Eric W. Biederman wrote:
>   
>> If that is the case.  In the normal kernel what would
>> the "the oops, we got an interrupt code do?"
>> I assume it would leave interrupts disabled when it returns?
>> Like we currently do with the delayed disable of normal interrupts?
>>     
>
> Yeah, disable interrupts, and set a flag that the fake "sti" can test, and 
> just return without doing anything.
>
> (You may or may not also need to do extra work to Ack the hardware 
> interrupt etc, which may be irq-controller specific. Once the CPU has 
> accepted the interrupt, you may not be able to just leave it dangling)
>   

So it would be something like:

    pda.intr_mask = 1;		/* disable interrupts */
    ...
    pda.intr_mask = 0;		/* enable interrupts */
    if (xchg(&pda.intr_pending, 0))	/* check pending */
    	asm("sti");		/* was pending; isr left cpu interrupts masked */
      

and in the interrupt handler:

    if (pda.intr_mask) {
    	pda.intr_pending = 1;
    	regs->eflags &= ~IF;
    	maybe_ack_interrupt_controller();
    	iret
      

    }

?

    J

^ permalink raw reply	[flat|nested] 93+ messages in thread

* Re: [patch 13/26] Xen-paravirt_ops: Consistently wrap paravirt ops callsites to make them patchable
  2007-03-20 17:42                                         ` Andi Kleen
@ 2007-03-20 16:52                                           ` Linus Torvalds
  2007-03-20 18:03                                             ` Andi Kleen
  0 siblings, 1 reply; 93+ messages in thread
From: Linus Torvalds @ 2007-03-20 16:52 UTC (permalink / raw)
  To: Andi Kleen
  Cc: xen-devel, netdev, mingo, linux-kernel, jbeulich, virtualization,
	chrisw, virtualization, Eric W. Biederman, anthony, akpm,
	David Miller



On Tue, 20 Mar 2007, Andi Kleen wrote:
> 
> No, me and Jan fixed all reported bugs as far as I know.

No you did not. You didn't fix the ones I reported. Which is why it got 
removed, and will not get added back until there is another maintainer.

The ones I reported were all about trusting the stack contents implicitly, 
and assuming that the unwind info was there and valid. Using things like 
"__get_user()" didn't fix it, because if a WARN_ON() happened while we 
held the mm semaphore and the unwind info was bogus, it would take a 
page-fault and deadlock.

Those kinds of things are not acceptable for debugging output. If I cannot 
use WARN_ON() because I hold the MM lock and I'm afraid there might be 
kernel corruption, then something is *wrong*!

And I told you guys this. Over *months*. And you ignored me. You told me 
everything was fine. Each time, somebody else ended up reporting a hang 
where the unwinder was at fault. And since I couldn't trust the 
maintainers to fix it, removing the broken feature that only caused more 
problems than it fixed was the only option.

And you clearly *still* haven't accepted the fact that the code was buggy. 

Does anybody wonder why I wouldn't merge it back?

		Linus

^ permalink raw reply	[flat|nested] 93+ messages in thread

* Re: [patch 13/26] Xen-paravirt_ops: Consistently wrap paravirt ops callsites to make them patchable
  2007-03-20 15:09                           ` Linus Torvalds
  2007-03-20 15:58                             ` Eric W. Biederman
@ 2007-03-20 17:00                             ` Ingo Molnar
  2007-03-21  0:03                             ` Paul Mackerras
  2 siblings, 0 replies; 93+ messages in thread
From: Ingo Molnar @ 2007-03-20 17:00 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Jeremy Fitzhardinge, Zachary Amsden, Rusty Russell,
	Eric W. Biederman, Andi Kleen, David Miller, akpm, linux-kernel,
	virtualization, xen-devel, chrisw, anthony, netdev


* Linus Torvalds <torvalds@linux-foundation.org> wrote:

> I have this dim memory that ARM has done it that way for a long time 
> because it's so expensive to do a "real" cli/sti.
> 
> And I think -rt does it for other reasons. It's just more flexible.

-rt doesnt wrap cli/sti anymore: spin_lock_irq*() doesnt disable irq 
flags on -rt, thus the amount of real irqs-off sections is very small 
and reviewable.

But nevertheless an incarnation of the code survived and is upstream 
already, in the form of TRACE_IRQFLAGS lockdep code ;) This implements a 
soft hardirq flag _today_: all that would be needed is for Xen to define 
raw_local_irq_disable() as a NOP, and to use the 
current->hardirqs_enabled as 'soft IRQ-off flag'.

Note that ->hardirqs_enabled is self-maintained, i.e. it's not just a 
stupid shadow of the hardirq flag, it's an independently maintained flag 
that does not rely on the existence of the hard flag.

[ this code even has its own debugging code, so out-of-sync-flags,
  double-off and double-on is detected and complained about. So all the 
  hard stuff has already been done as part of lockdep, and it's even 
  long-term maintainable because under a native lockdep kernel we check 
  the soft flag against the hard flag. ]

so i think a soft cli/sti flag support should be merged/unified with the 
trace_hardirqs_on()/trace_hardirqs_off() code.

	Ingo

^ permalink raw reply	[flat|nested] 93+ messages in thread

* Re: [patch 13/26] Xen-paravirt_ops: Consistently wrap paravirt ops callsites to make them patchable
  2007-03-20 18:03                                             ` Andi Kleen
@ 2007-03-20 17:27                                               ` Linus Torvalds
  2007-03-20 19:21                                                 ` Andi Kleen
  0 siblings, 1 reply; 93+ messages in thread
From: Linus Torvalds @ 2007-03-20 17:27 UTC (permalink / raw)
  To: Andi Kleen
  Cc: xen-devel, netdev, mingo, linux-kernel, jbeulich, virtualization,
	chrisw, virtualization, Eric W. Biederman, anthony, akpm,
	David Miller



On Tue, 20 Mar 2007, Andi Kleen wrote:
> 
> The code never did that. In fact many of the problems we had initially
> especially came out of that -- the fallback code that would handle
> this case wasn't fully correct.

I don't keep my emails any more, but you *never* fixed the problems in 
arch/*/kernel/traps.c.

Yes, the kernel/unwind.c issues generally got fixed. The infinite loops in 
the *callers* never did.

> Also frankly often your analysis about what went wrong was just
> incorrect.

Still in denial, I see.

Do you still claim that "the fallback position always did the right 
thing"? Despite the fact that the unwinder had sometimes *corrupted* the 
incoming information so much that the fallback position was the one that 
oopsed? And no, you didn't fix that.

And no, IT DID NOT use probe_kernel_address like you still claim.

Anyway, you work for Suse, I don't care what you do to the Suse kernel. 
Maybe it will get stable some day. Somehow, I doubt it. 

			Linus

^ permalink raw reply	[flat|nested] 93+ messages in thread

* Re: [patch 13/26] Xen-paravirt_ops: Consistently wrap paravirt ops callsites to make them patchable
  2007-03-20 16:25                                       ` Eric W. Biederman
@ 2007-03-20 17:42                                         ` Andi Kleen
  2007-03-20 16:52                                           ` Linus Torvalds
  0 siblings, 1 reply; 93+ messages in thread
From: Andi Kleen @ 2007-03-20 17:42 UTC (permalink / raw)
  To: Eric W. Biederman
  Cc: Andi Kleen, David Miller, torvalds, virtualization, jbeulich,
	jeremy, xen-devel, netdev, linux-kernel, chrisw, virtualization,
	anthony, akpm, mingo

On Tue, Mar 20, 2007 at 10:25:20AM -0600, Eric W. Biederman wrote:
> What I recall observing is call traces that made no sense.  Not just
> extra noise in the stack trace but things like seeing a function that
> has exactly one path to it, and not seeing all of the functions on
> that path in the call trace.

That's tail call/sibling call optimization. No unwinder can untangle that
because the return address is lost.  But it's also an quite important optimization.

> >
> > In 2.4 it was often very reasonable to just sort out the false positives,
> > but with sometimes 20-30+ level deep call chains in 2.6 with many callbacks that
> > just
> > gets far too tenuous. 
> 
> Hmm.  I haven't seen those traces, but I wonder if the size of those
> stack traces indicates potential stack overflow problems.

Most functions have quite small frames, so 20-30 is still not a problem

> Do you also validate the unwind data?

There are many sanity checks in the unwind code and it will fall back
to the old unwinder when it gets stuck.

> 
> > Although in future it would be good if people did some more analysis in root
> > causes for failures before let the paranoia take over and revert patches.
> >
> > We see a good example here of what I call the JFS/ACPI effect: code gets merged
> > too early with some visible problems. It gets a bad name and afterwards people
> > never look objectively at it again and just trust their prejudices. 
> 
> I don't know.  The impression I got was the root cause analysis stopped 
> when it was observed that the code was unsuitable for solving the problem.

No, me and Jan fixed all reported bugs as far as I know.

-Andi


^ permalink raw reply	[flat|nested] 93+ messages in thread

* Re: [patch 13/26] Xen-paravirt_ops: Consistently wrap paravirt ops callsites to make them patchable
  2007-03-20 16:52                                           ` Linus Torvalds
@ 2007-03-20 18:03                                             ` Andi Kleen
  2007-03-20 17:27                                               ` Linus Torvalds
  0 siblings, 1 reply; 93+ messages in thread
From: Andi Kleen @ 2007-03-20 18:03 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Andi Kleen, Eric W. Biederman, David Miller, virtualization,
	jbeulich, jeremy, xen-devel, netdev, linux-kernel, chrisw,
	virtualization, anthony, akpm, mingo

On Tue, Mar 20, 2007 at 09:52:42AM -0700, Linus Torvalds wrote:
> The ones I reported were all about trusting the stack contents implicitly, 

The latest version added the full double check you wanted - every
new RSP is validated against the current stack or the exception stacks.

> and assuming that the unwind info was there and valid. Using things like 

The code never did that. In fact many of the problems we had initially
especially came out of that -- the fallback code that would handle
this case wasn't fully correct.

> "__get_user()" didn't fix it, because if a WARN_ON() happened while we 
> held the mm semaphore and the unwind info was bogus, it would take a 
> page-fault and deadlock.

That was fixed too even before you dropped it by using probe_kernel_address()

> And I told you guys this. Over *months*. And you ignored me. You told me 
> everything was fine. Each time, somebody else ended up reporting a hang 
> where the unwinder was at fault. 

That was because most of the bugs were reported many times duplicated --
and reports from older releases kept streaming in even after the fix
went in.

Also frankly often your analysis about what went wrong was just
incorrect.
 
> And you clearly *still* haven't accepted the fact that the code was buggy. 

There were some bugs, but they were all fixed. Or at least I hope -- 
we will find out during testing.

> Does anybody wonder why I wouldn't merge it back?

So do you have an alternative to the unwinder? Don't tell me 
"real men decode 30+ entry long stack traces with lots of callbacks
by hand". Or do you prefer to use frame pointers everywhere? 

-Andi


^ permalink raw reply	[flat|nested] 93+ messages in thread

* Re: [patch 13/26] Xen-paravirt_ops: Consistently wrap paravirt ops callsites to make them patchable
  2007-03-20 19:21                                                 ` Andi Kleen
@ 2007-03-20 18:49                                                   ` Linus Torvalds
  2007-03-20 20:23                                                     ` Andi Kleen
  0 siblings, 1 reply; 93+ messages in thread
From: Linus Torvalds @ 2007-03-20 18:49 UTC (permalink / raw)
  To: Andi Kleen
  Cc: xen-devel, netdev, mingo, linux-kernel, jbeulich, virtualization,
	chrisw, virtualization, Eric W. Biederman, anthony, akpm,
	David Miller



On Tue, 20 Mar 2007, Andi Kleen wrote:
> 
> So what is your proposed alternative to handle long backtraces? 
> You didn't answer that question. Please do, I'm curious about your thoughts
> in this area.

the thing is, I'd rather see a long backtrace that is hard to decipher but 
that *never* *ever* causes any additional problems, over a pretty one.

Because that's really the issue: do you want a "pretty" backtrace, or do 
you want one that is rock solid but has some crud in it.

I'll take the rock solid one any day. Especially as even the pretty one 
won't fix the most common problem, which is "I don't see the caller" (due 
to inlining or tail-calls).

In contrast, the ugly backtrace will have some "garbage entries" from 
previous frames that didn't get overwritten, but there have actually been 
(admittedly rare) cases where those garbage entries have given hints about 
what happened just before.

			Linus

^ permalink raw reply	[flat|nested] 93+ messages in thread

* Re: [patch 13/26] Xen-paravirt_ops: Consistently wrap paravirt ops callsites to make them patchable
  2007-03-20 17:27                                               ` Linus Torvalds
@ 2007-03-20 19:21                                                 ` Andi Kleen
  2007-03-20 18:49                                                   ` Linus Torvalds
  0 siblings, 1 reply; 93+ messages in thread
From: Andi Kleen @ 2007-03-20 19:21 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Andi Kleen, Eric W. Biederman, David Miller, virtualization,
	jbeulich, jeremy, xen-devel, netdev, linux-kernel, chrisw,
	virtualization, anthony, akpm, mingo

On Tue, Mar 20, 2007 at 10:27:00AM -0700, Linus Torvalds wrote:
> 
> 
> On Tue, 20 Mar 2007, Andi Kleen wrote:
> > 
> > The code never did that. In fact many of the problems we had initially
> > especially came out of that -- the fallback code that would handle
> > this case wasn't fully correct.
> 
> I don't keep my emails any more, but you *never* fixed the problems in 
> arch/*/kernel/traps.c.

I fixed that one after you dropped it (hmm, double checking: or at least
I thought I had fixed it, but don't see the code right now; will redo then) 
Basically it was just a one liner anyways - always check against all the
stacks that are there.


> Yes, the kernel/unwind.c issues generally got fixed. The infinite loops in 
> the *callers* never did.

There was later a weaker form that should have caught most loops, but admittedly
it wasn't 100% bullet-proof with exception stacks.

> 
> > Also frankly often your analysis about what went wrong was just
> > incorrect.
> 
> Still in denial, I see.
> 
> Do you still claim that "the fallback position always did the right 
> thing"

No initially it was buggy and that caused several of the crashes.

> Despite the fact that the unwinder had sometimes *corrupted* the 
> incoming information so much that the fallback position was the one that 
> oopsed? And no, you didn't fix that.

No, it oopsed because it was broken by itself.
Anyways that got fixed quickly.

> 
> And no, IT DID NOT use probe_kernel_address like you still claim.

There definitely was a patch that made it use it. You might have
not merged it though.
 
> Anyway, you work for Suse, I don't care what you do to the Suse kernel. 
> Maybe it will get stable some day. Somehow, I doubt it. 

So what is your proposed alternative to handle long backtraces? 
You didn't answer that question. Please do, I'm curious about your thoughts
in this area.

-Andi


^ permalink raw reply	[flat|nested] 93+ messages in thread

* Re: [patch 13/26] Xen-paravirt_ops: Consistently wrap paravirt ops callsites to make them patchable
  2007-03-20 16:06                               ` Linus Torvalds
  2007-03-20 16:31                                 ` Jeremy Fitzhardinge
@ 2007-03-20 19:28                                 ` Andi Kleen
  2007-03-20 19:54                                   ` Zachary Amsden
  1 sibling, 1 reply; 93+ messages in thread
From: Andi Kleen @ 2007-03-20 19:28 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Eric W. Biederman, Jeremy Fitzhardinge, Zachary Amsden,
	Rusty Russell, David Miller, mingo, akpm, linux-kernel,
	virtualization, xen-devel, chrisw, anthony, netdev

> But I was throwing that out as a long-term thing. I'm not claiming it's 
> trivial, but it should be doable.

One thing I was pondering was to replace the expensive popfs with

bt  $IF,(%rsp) 
jnc 1f 
sti
1: 

But would be mostly a P4 optimization again and I'm not 100% sure it is
worth it.

-Andi


^ permalink raw reply	[flat|nested] 93+ messages in thread

* Re: [patch 13/26] Xen-paravirt_ops: Consistently wrap paravirt ops callsites to make them patchable
  2007-03-20 19:28                                 ` Andi Kleen
@ 2007-03-20 19:54                                   ` Zachary Amsden
  2007-03-20 20:02                                     ` Andi Kleen
  0 siblings, 1 reply; 93+ messages in thread
From: Zachary Amsden @ 2007-03-20 19:54 UTC (permalink / raw)
  To: Andi Kleen
  Cc: Linus Torvalds, Eric W. Biederman, Jeremy Fitzhardinge,
	Rusty Russell, David Miller, mingo, akpm, linux-kernel,
	virtualization, xen-devel, chrisw, anthony, netdev

Andi Kleen wrote:
> One thing I was pondering was to replace the expensive popfs with
>
> bt  $IF,(%rsp) 
> jnc 1f 
> sti
> 1: 
>
> But would be mostly a P4 optimization again and I'm not 100% sure it is
> worth it.
>   

Worth it on 32-bit.  On AMD64, probably not.  On Intel 64-bit, maybe, 
but less important than in P4 days.

This could change character completely if used at the tail of a function 
where you now have

sti; 1: ret

Which generates an interrupt holdoff on the ret, an unusual thing to do.

Zach

^ permalink raw reply	[flat|nested] 93+ messages in thread

* Re: [patch 13/26] Xen-paravirt_ops: Consistently wrap paravirt ops callsites to make them patchable
  2007-03-20 19:54                                   ` Zachary Amsden
@ 2007-03-20 20:02                                     ` Andi Kleen
  0 siblings, 0 replies; 93+ messages in thread
From: Andi Kleen @ 2007-03-20 20:02 UTC (permalink / raw)
  To: Zachary Amsden
  Cc: Linus Torvalds, Eric W. Biederman, Jeremy Fitzhardinge,
	Rusty Russell, David Miller, mingo, akpm, linux-kernel,
	virtualization, xen-devel, chrisw, anthony, netdev

> Worth it on 32-bit.  On AMD64, probably not.  On Intel 64-bit, maybe, 
> but less important than in P4 days.

Well most of Intel 64bit is P4 -- and Intel is still shipping
millions more of them each quarter.

> This could change character completely if used at the tail of a function 
> where you now have
> 
> sti; 1: ret
> 
> Which generates an interrupt holdoff on the ret, an unusual thing to do.

Unusual yes, but I don't see how it should cause problems. Or do you
have anything specific in mind? 

Worse is probably that on K8 this case might cause a pipeline stall
if there isn't a prefix in front of the ret.

-Andi

^ permalink raw reply	[flat|nested] 93+ messages in thread

* Re: [patch 13/26] Xen-paravirt_ops: Consistently wrap paravirt ops callsites to make them patchable
  2007-03-20 18:49                                                   ` Linus Torvalds
@ 2007-03-20 20:23                                                     ` Andi Kleen
  2007-03-20 21:39                                                       ` Alan Cox
                                                                         ` (2 more replies)
  0 siblings, 3 replies; 93+ messages in thread
From: Andi Kleen @ 2007-03-20 20:23 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Andi Kleen, Eric W. Biederman, David Miller, virtualization,
	jbeulich, jeremy, xen-devel, netdev, linux-kernel, chrisw,
	virtualization, anthony, akpm, mingo

On Tue, Mar 20, 2007 at 11:49:39AM -0700, Linus Torvalds wrote:
> 
> 
> On Tue, 20 Mar 2007, Andi Kleen wrote:
> > 
> > So what is your proposed alternative to handle long backtraces? 
> > You didn't answer that question. Please do, I'm curious about your thoughts
> > in this area.
> 
> the thing is, I'd rather see a long backtrace that is hard to decipher but 
> that *never* *ever* causes any additional problems, over a pretty one.

Well it causes additional problems. We had some cases where it was really
hard to distingush garbage and the true call chain. I can probably dig
out some examples if you want.

With lots of call backs (e.g. common with sysfs) it is also frequently
not obvious how the call chains are supposed to go.

I would have agreed with you in the 2.4 time frame. It was also
always fun to explain this to a newbie with oopses and watch their
face when they secretly think you're crazy ;-) But code is getting
much and more complex unfortunately and I had a few cases where
it was really ugly to sort them out. 

> Because that's really the issue: do you want a "pretty" backtrace, or do 
> you want one that is rock solid but has some crud in it.

I just want an as exact backtrace as possible. I also think
that we can make the unwinder robust enough.

> I'll take the rock solid one any day. Especially as even the pretty one 
> won't fix the most common problem, which is "I don't see the caller" (due 
> to inlining or tail-calls).

The problem is not one or a few levels of calls. That can be always figured
out from the source. The problem is when you have a double
digit number of them with non obvious (dynamic indirect) dependencies.  Yes it 
can be figured out too, but it is a long and very annoying process
with lots of grepping etc. and even then you sometimes can't be 100% 
sure in some cases. 

It's also a mechanic process that just cries to be done by a machine instead.

> In contrast, the ugly backtrace will have some "garbage entries" from 
> previous frames that didn't get overwritten, but there have actually been 

If it was only a few that would be great ...

-Andi


^ permalink raw reply	[flat|nested] 93+ messages in thread

* Re: Re: [patch 13/26] Xen-paravirt_ops: Consistently wrap paravirt ops callsites to make them patchable
  2007-03-20 20:23                                                     ` Andi Kleen
@ 2007-03-20 21:39                                                       ` Alan Cox
  2007-03-20 21:49                                                         ` [Xen-devel] " Andi Kleen
  2007-03-20 23:43                                                       ` Linus Torvalds
  2007-03-21  6:08                                                       ` Andrew Morton
  2 siblings, 1 reply; 93+ messages in thread
From: Alan Cox @ 2007-03-20 21:39 UTC (permalink / raw)
  To: Andi Kleen
  Cc: jeremy, xen-devel, Andi, David Miller, netdev, Kleen, jbeulich,
	virtualization, chrisw, virtualization, Eric W. Biederman,
	anthony, mingo, Linus Torvalds, akpm, linux-kernel

> > Because that's really the issue: do you want a "pretty" backtrace, or do 
> > you want one that is rock solid but has some crud in it.
> 
> I just want an as exact backtrace as possible. I also think
> that we can make the unwinder robust enough.

Any reason you can't put the exact back trace in "[xxx]" and the ones we
see on the stack which dont look like call trace as ?xxx? It makes the
code a bit trickier but we depend on the quality of traces

Alan

^ permalink raw reply	[flat|nested] 93+ messages in thread

* Re: [Xen-devel] Re: [patch 13/26] Xen-paravirt_ops: Consistently wrap paravirt ops callsites to make them patchable
  2007-03-20 21:39                                                       ` Alan Cox
@ 2007-03-20 21:49                                                         ` Andi Kleen
  2007-03-20 23:51                                                           ` Linus Torvalds
  0 siblings, 1 reply; 93+ messages in thread
From: Andi Kleen @ 2007-03-20 21:49 UTC (permalink / raw)
  To: Alan Cox
  Cc: Andi Kleen, Linus Torvalds, jeremy, xen-devel, netdev, mingo,
	jbeulich, virtualization, chrisw, virtualization,
	Eric W. Biederman, anthony, akpm, David Miller, linux-kernel

On Tue, Mar 20, 2007 at 09:39:18PM +0000, Alan Cox wrote:
> > > Because that's really the issue: do you want a "pretty" backtrace, or do 
> > > you want one that is rock solid but has some crud in it.
> > 
> > I just want an as exact backtrace as possible. I also think
> > that we can make the unwinder robust enough.
> 
> Any reason you can't put the exact back trace in "[xxx]" and the ones we
> see on the stack which dont look like call trace as ?xxx? It makes the
> code a bit trickier but we depend on the quality of traces

Linus is worried about the unwinder crashing -- that wouldn't help with that.

What the (now out of tree) unwinder does is to check if it finishes
the trace and if not fall back to the old unwinder.

-Andi

^ permalink raw reply	[flat|nested] 93+ messages in thread

* Re: [patch 13/26] Xen-paravirt_ops: Consistently wrap paravirt ops callsites to make them patchable
  2007-03-20 16:31                                 ` Jeremy Fitzhardinge
@ 2007-03-20 22:09                                   ` Zachary Amsden
  2007-03-21  0:24                                     ` Linus Torvalds
  2007-03-20 22:43                                   ` Matt Mackall
  1 sibling, 1 reply; 93+ messages in thread
From: Zachary Amsden @ 2007-03-20 22:09 UTC (permalink / raw)
  To: Jeremy Fitzhardinge
  Cc: xen-devel, akpm, virtualization, netdev, Rusty Russell,
	linux-kernel, chrisw, Andi Kleen, Eric W. Biederman, anthony,
	mingo, Linus Torvalds, David Miller

Jeremy Fitzhardinge wrote:

>> Yeah, disable interrupts, and set a flag that the fake "sti" can test, and 
>> just return without doing anything.
>>
>> (You may or may not also need to do extra work to Ack the hardware 
>> interrupt etc, which may be irq-controller specific. Once the CPU has 
>> accepted the interrupt, you may not be able to just leave it dangling)
>>   
>>     
>
> So it would be something like:
>
>     pda.intr_mask = 1;		/* disable interrupts */
>     ...
>     pda.intr_mask = 0;		/* enable interrupts */
>     if (xchg(&pda.intr_pending, 0))	/* check pending */
>   

Well, can't do xchg, since it implies #LOCK, and you'll lose more than 
you gain on the processors where it matters.  Cmpxchg is fine, but 
processor dependent.

Or, just make the interrupt handlers use software resend for IRQs when 
pda.intr_mask is set to zero.  Now, local_irq_save / restore are very 
pretty:

int local_irq_save(void)
{
   int tmp = pda.intr_mask;
   pda.intr_mask = 0;
   /*
    * note there is a window here where local IRQs notice intr_mask == 0
    * in that case, they will attempt to resend the IRQ via a tasklet,
    * and will succeed, albeit through a slightly longer path
    */
   local_bh_disable();
   return tmp;
}

void local_irq_restore(int enabled)
{
    pda.intr_mask = enabled;
    /*
     * note there is a window here where softirqs are not processed by
     * the interrupt handler, but that is not a problem, since it will
     * get done here in the outer enable of any nested pair.
     */
    if (enabled)
        local_bh_enable();
}

I think Ingo's suggestion of using the hardirq tracing is another way 
that could work, but it seems to be too heavyweight and tied too much to 
the lockdep verification code - plus it inserts additional 
raw_irq_disable in places that seem counter to the goal of getting rid 
of them in the first place.  Perhaps I'm misunderstanding the code, though.

Zach

^ permalink raw reply	[flat|nested] 93+ messages in thread

* Re: [patch 13/26] Xen-paravirt_ops: Consistently wrap paravirt ops callsites to make them patchable
  2007-03-20 15:58                             ` Eric W. Biederman
  2007-03-20 16:06                               ` Linus Torvalds
  2007-03-20 16:26                               ` Jeremy Fitzhardinge
@ 2007-03-20 22:41                               ` Rusty Russell
  2 siblings, 0 replies; 93+ messages in thread
From: Rusty Russell @ 2007-03-20 22:41 UTC (permalink / raw)
  To: Eric W. Biederman
  Cc: Zachary Amsden, Jeremy Fitzhardinge, xen-devel, akpm,
	virtualization, netdev, linux-kernel, chrisw, Andi Kleen, anthony,
	mingo, Linus Torvalds, David Miller

On Tue, 2007-03-20 at 09:58 -0600, Eric W. Biederman wrote:
> Looking at the above code snippet.  I guess it is about time to
> merge our per_cpu and pda variables...

Indeed, thanks for the prod.  Now 2.6.21-rc4-mm1 is out, I'll resend the
patches.

Cheers,
Rusty.

^ permalink raw reply	[flat|nested] 93+ messages in thread

* Re: [patch 13/26] Xen-paravirt_ops: Consistently wrap paravirt ops callsites to make them patchable
  2007-03-20 16:31                                 ` Jeremy Fitzhardinge
  2007-03-20 22:09                                   ` Zachary Amsden
@ 2007-03-20 22:43                                   ` Matt Mackall
  2007-03-20 23:08                                     ` Zachary Amsden
  2007-03-21  0:20                                     ` Jeremy Fitzhardinge
  1 sibling, 2 replies; 93+ messages in thread
From: Matt Mackall @ 2007-03-20 22:43 UTC (permalink / raw)
  To: Jeremy Fitzhardinge
  Cc: xen-devel, akpm, virtualization, netdev, linux-kernel, chrisw,
	Andi Kleen, Eric W. Biederman, anthony, mingo, Linus Torvalds,
	David Miller

On Tue, Mar 20, 2007 at 09:31:58AM -0700, Jeremy Fitzhardinge wrote:
> Linus Torvalds wrote:
> > On Tue, 20 Mar 2007, Eric W. Biederman wrote:
> >   
> >> If that is the case.  In the normal kernel what would
> >> the "the oops, we got an interrupt code do?"
> >> I assume it would leave interrupts disabled when it returns?
> >> Like we currently do with the delayed disable of normal interrupts?
> >>     
> >
> > Yeah, disable interrupts, and set a flag that the fake "sti" can test, and 
> > just return without doing anything.
> >
> > (You may or may not also need to do extra work to Ack the hardware 
> > interrupt etc, which may be irq-controller specific. Once the CPU has 
> > accepted the interrupt, you may not be able to just leave it dangling)
> >   
> 
> So it would be something like:
> 
>     pda.intr_mask = 1;		/* disable interrupts */
>     ...
>     pda.intr_mask = 0;		/* enable interrupts */
>     if (xchg(&pda.intr_pending, 0))	/* check pending */
>     	asm("sti");		/* was pending; isr left cpu interrupts masked */

I don't know that you need an xchg there. If you're still on the same
CPU, it should all be nice and causal even across an interrupt handler.
So it could be:

   pda.intr_mask = 0; /* intr_pending can't get set after this */
   if (unlikely(pda.intr_pending)) {
      pda.intr_pending = 0;
      asm("sti");
   }

(This would actually need a C barrier, but I'll ignore that as this'd
end up being asm...)

But other interesting things could happen. If we never did a real CLI
and we get preempted and switched to another CPU between clearing
intr_mask and checking intr_pending, we get a little confused. 

But perhaps that doesn't matter because we'd by definition have no
pending interrupts on either processor?

Is it expensive to do an STI if interrupts are already enabled?

-- 
Mathematics is the supreme nostalgia of our time.

^ permalink raw reply	[flat|nested] 93+ messages in thread

* Re: [patch 13/26] Xen-paravirt_ops: Consistently wrap paravirt ops callsites to make them patchable
  2007-03-20 22:43                                   ` Matt Mackall
@ 2007-03-20 23:08                                     ` Zachary Amsden
  2007-03-20 23:33                                       ` Jeremy Fitzhardinge
  2007-03-20 23:41                                       ` Matt Mackall
  2007-03-21  0:20                                     ` Jeremy Fitzhardinge
  1 sibling, 2 replies; 93+ messages in thread
From: Zachary Amsden @ 2007-03-20 23:08 UTC (permalink / raw)
  To: Matt Mackall
  Cc: Jeremy Fitzhardinge, Linus Torvalds, Eric W. Biederman,
	Rusty Russell, Andi Kleen, David Miller, mingo, akpm,
	linux-kernel, virtualization, xen-devel, chrisw, anthony, netdev

Matt Mackall wrote:
> I don't know that you need an xchg there. If you're still on the same
> CPU, it should all be nice and causal even across an interrupt handler.
> So it could be:
>
>    pda.intr_mask = 0; /* intr_pending can't get set after this */
>   

Why not?  Oh, I see.  intr_mask is inverted form of EFLAGS_IF.

>    if (unlikely(pda.intr_pending)) {
>       pda.intr_pending = 0;
>       asm("sti");
>    }
>
> (This would actually need a C barrier, but I'll ignore that as this'd
> end up being asm...)
>
> But other interesting things could happen. If we never did a real CLI
> and we get preempted and switched to another CPU between clearing
> intr_mask and checking intr_pending, we get a little confused. 
>   

I think Jeremy's idea was to have interrupt handlers leave interrupts 
disabled on exit if pda.intr_mask was set.  In which case, they would 
bypass all work and we could never get preempted.  I don't think leaving 
hardware interrupts disabled for such a long time is good though.

> But perhaps that doesn't matter because we'd by definition have no
> pending interrupts on either processor?
>
> Is it expensive to do an STI if interrupts are already enabled?
>   

Yes.

Zach

^ permalink raw reply	[flat|nested] 93+ messages in thread

* Re: [patch 13/26] Xen-paravirt_ops: Consistently wrap paravirt ops callsites to make them patchable
  2007-03-20 23:08                                     ` Zachary Amsden
@ 2007-03-20 23:33                                       ` Jeremy Fitzhardinge
  2007-03-21  1:14                                         ` Zachary Amsden
  2007-03-20 23:41                                       ` Matt Mackall
  1 sibling, 1 reply; 93+ messages in thread
From: Jeremy Fitzhardinge @ 2007-03-20 23:33 UTC (permalink / raw)
  To: Zachary Amsden
  Cc: xen-devel, akpm, virtualization, netdev, Rusty Russell,
	linux-kernel, chrisw, Andi Kleen, Eric W. Biederman, anthony,
	Matt Mackall, mingo, Linus Torvalds, David Miller

Zachary Amsden wrote:
> I think Jeremy's idea was to have interrupt handlers leave interrupts
> disabled on exit if pda.intr_mask was set.  In which case, they would
> bypass all work and we could never get preempted.

Yes, I was worried that if we left the isr without actually handling the
interrupt, it would still be asserted and we'd just get interrupted
again.  The idea is that we avoid touching cli/sti for the common case
of no interrupts while interrupts are disabled, but we'd still need to
fall back to using them if an interrupt becomes pending.

> I don't think leaving hardware interrupts disabled for such a long
> time is good though. 

How long?  It would be no longer than now, and possibly less, wouldn't it?

    J

^ permalink raw reply	[flat|nested] 93+ messages in thread

* Re: [patch 13/26] Xen-paravirt_ops: Consistently wrap paravirt ops callsites to make them patchable
  2007-03-20 23:08                                     ` Zachary Amsden
  2007-03-20 23:33                                       ` Jeremy Fitzhardinge
@ 2007-03-20 23:41                                       ` Matt Mackall
  1 sibling, 0 replies; 93+ messages in thread
From: Matt Mackall @ 2007-03-20 23:41 UTC (permalink / raw)
  To: Zachary Amsden
  Cc: xen-devel, akpm, virtualization, netdev, linux-kernel, chrisw,
	Andi Kleen, Eric W. Biederman, anthony, mingo, Linus Torvalds,
	David Miller

On Tue, Mar 20, 2007 at 03:08:19PM -0800, Zachary Amsden wrote:
> Matt Mackall wrote:
> >I don't know that you need an xchg there. If you're still on the same
> >CPU, it should all be nice and causal even across an interrupt handler.
> >So it could be:
> >
> >   pda.intr_mask = 0; /* intr_pending can't get set after this */
> >  
> 
> Why not?  Oh, I see.  intr_mask is inverted form of EFLAGS_IF.

It's not even that. There are two things that can happen:

case 1:

  intr_mask = 1;
            <interrupt occurs and is deferred>
  intr_mask = 0;
  /* intr_pending is already set and CLI is in effect */
  if(intr_pending)

case 2:

  intr_mask = 1;
  intr_mask = 0;
            <interrupt occurs and is processed>
  /* intr_pending remains cleared */
  if(intr_pending)

As this is all about local interrupts, it's all on a single CPU and
out of order issues aren't visible..
 
> >(This would actually need a C barrier, but I'll ignore that as this'd
> >end up being asm...)

..unless the compiler is doing the reordering, of course.

> >But other interesting things could happen. If we never did a real CLI
> >and we get preempted and switched to another CPU between clearing
> >intr_mask and checking intr_pending, we get a little confused. 
> 
> I think Jeremy's idea was to have interrupt handlers leave interrupts 
> disabled on exit if pda.intr_mask was set.  In which case, they would 
> bypass all work and we could never get preempted.

I was actually worrying about the case where the interrupt came in
"late". But I don't think it's a problem there either.

> I don't think leaving 
> hardware interrupts disabled for such a long time is good though.

It can only be worse than the current situation by the amount of time
it takes to defer an interrupt once. On average, it'll be a lot
better as most critical sections are -tiny-.

-- 
Mathematics is the supreme nostalgia of our time.

^ permalink raw reply	[flat|nested] 93+ messages in thread

* Re: [patch 13/26] Xen-paravirt_ops: Consistently wrap paravirt ops callsites to make them patchable
  2007-03-20 20:23                                                     ` Andi Kleen
  2007-03-20 21:39                                                       ` Alan Cox
@ 2007-03-20 23:43                                                       ` Linus Torvalds
  2007-03-21  6:08                                                       ` Andrew Morton
  2 siblings, 0 replies; 93+ messages in thread
From: Linus Torvalds @ 2007-03-20 23:43 UTC (permalink / raw)
  To: Andi Kleen
  Cc: xen-devel, netdev, mingo, linux-kernel, jbeulich, virtualization,
	chrisw, virtualization, Eric W. Biederman, anthony, akpm,
	David Miller



On Tue, 20 Mar 2007, Andi Kleen wrote:

> On Tue, Mar 20, 2007 at 11:49:39AM -0700, Linus Torvalds wrote:
> > 
> > the thing is, I'd rather see a long backtrace that is hard to decipher but 
> > that *never* *ever* causes any additional problems, over a pretty one.
> 
> Well it causes additional problems. We had some cases where it was really
> hard to distingush garbage and the true call chain. I can probably dig
> out some examples if you want.

Well, by "additional problems" _I_ mean things like "a warning turned into 
a fatal oops and didn't get logged at all".

That's a lot more serious than "there were a few extra entries in the 
traceback that caused us some confusion".

And yes, we had exactly that case happen several times.

> With lots of call backs (e.g. common with sysfs) it is also frequently
> not obvious how the call chains are supposed to go.

With callbacks, it's actually often nice to see the callback data that is 
on the stack (and it's very obvious from the "<function+0>" ksymtab 
explanation: you can't have a <function+0> that is anything but a callback 
pointer (since it isn't a return address).

		Linus

^ permalink raw reply	[flat|nested] 93+ messages in thread

* Re: [Xen-devel] Re: [patch 13/26] Xen-paravirt_ops: Consistently wrap paravirt ops callsites to make them patchable
  2007-03-20 21:49                                                         ` [Xen-devel] " Andi Kleen
@ 2007-03-20 23:51                                                           ` Linus Torvalds
  0 siblings, 0 replies; 93+ messages in thread
From: Linus Torvalds @ 2007-03-20 23:51 UTC (permalink / raw)
  To: Andi Kleen
  Cc: xen-devel, David Miller, netdev, linux-kernel, jbeulich,
	virtualization, chrisw, virtualization, Eric W. Biederman,
	anthony, mingo, akpm, Alan Cox



On Tue, 20 Mar 2007, Andi Kleen wrote:
> 
> Linus is worried about the unwinder crashing -- that wouldn't help with that.

And to make it clear: this is not a theoretical worry. It happened many 
times over the months the unwinder was in. 

It was supposed to help debugging, but it made bugs that *would* have been 
nicely debuggable without it into nightmares. So the only reason for it 
existing in the first place was actually the thing that made it not work.

		Linus

^ permalink raw reply	[flat|nested] 93+ messages in thread

* Re: [patch 13/26] Xen-paravirt_ops: Consistently wrap paravirt ops callsites to make them patchable
  2007-03-20 15:09                           ` Linus Torvalds
  2007-03-20 15:58                             ` Eric W. Biederman
  2007-03-20 17:00                             ` Ingo Molnar
@ 2007-03-21  0:03                             ` Paul Mackerras
  2007-04-12 23:16                               ` David Miller
  2 siblings, 1 reply; 93+ messages in thread
From: Paul Mackerras @ 2007-03-21  0:03 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Jeremy Fitzhardinge, Zachary Amsden, Rusty Russell,
	Eric W. Biederman, Andi Kleen, David Miller, mingo, akpm,
	linux-kernel, virtualization, xen-devel, chrisw, anthony, netdev

Linus Torvalds writes:

> We should just do this natively. There's been several tests over the years 
> saying that it's much more efficient to do sti/cli as a simple store, and 
> handling the "oops, we got an interrupt while interrupts were disabled" as 
> a special case.
> 
> I have this dim memory that ARM has done it that way for a long time 
> because it's so expensive to do a "real" cli/sti.
> 
> And I think -rt does it for other reasons. It's just more flexible.

64-bit powerpc does this now as well.

Paul.

^ permalink raw reply	[flat|nested] 93+ messages in thread

* Re: [patch 13/26] Xen-paravirt_ops: Consistently wrap paravirt ops callsites to make them patchable
  2007-03-20 22:43                                   ` Matt Mackall
  2007-03-20 23:08                                     ` Zachary Amsden
@ 2007-03-21  0:20                                     ` Jeremy Fitzhardinge
  1 sibling, 0 replies; 93+ messages in thread
From: Jeremy Fitzhardinge @ 2007-03-21  0:20 UTC (permalink / raw)
  To: Matt Mackall
  Cc: Linus Torvalds, Eric W. Biederman, Zachary Amsden, Rusty Russell,
	Andi Kleen, David Miller, mingo, akpm, linux-kernel,
	virtualization, xen-devel, chrisw, anthony, netdev

Matt Mackall wrote:
> On Tue, Mar 20, 2007 at 09:31:58AM -0700, Jeremy Fitzhardinge wrote:
>   
>> Linus Torvalds wrote:
>>     
>>> On Tue, 20 Mar 2007, Eric W. Biederman wrote:
>>>   
>>>       
>>>> If that is the case.  In the normal kernel what would
>>>> the "the oops, we got an interrupt code do?"
>>>> I assume it would leave interrupts disabled when it returns?
>>>> Like we currently do with the delayed disable of normal interrupts?
>>>>     
>>>>         
>>> Yeah, disable interrupts, and set a flag that the fake "sti" can test, and 
>>> just return without doing anything.
>>>
>>> (You may or may not also need to do extra work to Ack the hardware 
>>> interrupt etc, which may be irq-controller specific. Once the CPU has 
>>> accepted the interrupt, you may not be able to just leave it dangling)
>>>   
>>>       
>> So it would be something like:
>>
>>     pda.intr_mask = 1;		/* disable interrupts */
>>     ...
>>     pda.intr_mask = 0;		/* enable interrupts */
>>     if (xchg(&pda.intr_pending, 0))	/* check pending */
>>     	asm("sti");		/* was pending; isr left cpu interrupts masked */
>>     
>
> I don't know that you need an xchg there. If you're still on the same
> CPU, it should all be nice and causal even across an interrupt handler.
> So it could be:
>
>    pda.intr_mask = 0; /* intr_pending can't get set after this */
>    if (unlikely(pda.intr_pending)) {
>       pda.intr_pending = 0;
>       asm("sti");
>    }
>
> (This would actually need a C barrier, but I'll ignore that as this'd
> end up being asm...)
>
> But other interesting things could happen. If we never did a real CLI
> and we get preempted and switched to another CPU between clearing
> intr_mask and checking intr_pending, we get a little confused. 
>   

Could prevent preempt if pda.intr_mask is set.  preemptible() is defined as:

    # define preemptible()    (preempt_count() == 0 && !irqs_disabled())

anyway, so that would be changed to look at the intr_mask rather than
eflags.
(I'm not sure if preemptible() is actually used to determine whether
preempt or not).

Alternatively, the intr_mask could be encoded in a bit of preempt_count...

    J

^ permalink raw reply	[flat|nested] 93+ messages in thread

* Re: [patch 13/26] Xen-paravirt_ops: Consistently wrap paravirt ops callsites to make them patchable
  2007-03-20 22:09                                   ` Zachary Amsden
@ 2007-03-21  0:24                                     ` Linus Torvalds
  2007-03-21  2:53                                       ` Zachary Amsden
  0 siblings, 1 reply; 93+ messages in thread
From: Linus Torvalds @ 2007-03-21  0:24 UTC (permalink / raw)
  To: Zachary Amsden
  Cc: xen-devel, akpm, virtualization, netdev, linux-kernel, chrisw,
	Andi Kleen, Eric W. Biederman, anthony, mingo, David Miller



On Tue, 20 Mar 2007, Zachary Amsden wrote:
> 
> void local_irq_restore(int enabled)
> {
>    pda.intr_mask = enabled;
>    /*
>     * note there is a window here where softirqs are not processed by
>     * the interrupt handler, but that is not a problem, since it will
>     * get done here in the outer enable of any nested pair.
>     */
>    if (enabled)
>        local_bh_enable();
> }

Actually, this one is more complicated. You also need to actually enable 
hardware interrupts again if they got disabled by an interrupt actually 
occurring while the "soft-interrupt" was disabled.

But since it's all a local-cpu issue, you can do things like test 
cpu-local memory flags for whetehr that has happened or not.

So it *should* be something as simple as

	local_irq_disable()
	{
		pda.irq_enable = 0;
	}

	handle_interrupt()
	{
		if (!pda.irq_enable) {
			pda.irq_queued = 1;
			queue_interrupt();
			.. make sure we return with hardirq's now 
			   disabled: just clear IF in the pt_regs ..
			return;
		}
		.. normal ..
	}

	local_irq_enable()
	{
		pda.irq_enable = 1;
		barrier();
		/* Common case - nothing happened while we were fake-disabled.. */
		if (!pda.irq_queued)
			return; 

		/* Ok, actually handle the things! */
		handle_queued_irqs();

		/*
		 * And enable the hw interrupts again, they got disabled 
		 * when we were queueing stuff.. 
		 */
		hardware_sti();
	}

but I haven't really gone over it in any detail, I may have missed 
something really obvious.

Anyway, it really *should* be pretty damn simple. No need to disable 
preemption, there should be no events that can *cause* it, since all 
interrupts get headed off at the pass.. (the return-from-interrupt thng 
should already notice that it's returning to an interrupts-disabled 
section and not try to do any preemption).

What did I miss?

		Linus

^ permalink raw reply	[flat|nested] 93+ messages in thread

* Re: [patch 13/26] Xen-paravirt_ops: Consistently wrap paravirt ops callsites to make them patchable
  2007-03-20 23:33                                       ` Jeremy Fitzhardinge
@ 2007-03-21  1:14                                         ` Zachary Amsden
  0 siblings, 0 replies; 93+ messages in thread
From: Zachary Amsden @ 2007-03-21  1:14 UTC (permalink / raw)
  To: Jeremy Fitzhardinge
  Cc: Matt Mackall, Linus Torvalds, Eric W. Biederman, Rusty Russell,
	Andi Kleen, David Miller, mingo, akpm, linux-kernel,
	virtualization, xen-devel, chrisw, anthony, netdev

Jeremy Fitzhardinge wrote:
> Zachary Amsden wrote:
>   
>> I think Jeremy's idea was to have interrupt handlers leave interrupts
>> disabled on exit if pda.intr_mask was set.  In which case, they would
>> bypass all work and we could never get preempted.
>>     
>
> Yes, I was worried that if we left the isr without actually handling the
> interrupt, it would still be asserted and we'd just get interrupted
> again.  The idea is that we avoid touching cli/sti for the common case
> of no interrupts while interrupts are disabled, but we'd still need to
> fall back to using them if an interrupt becomes pending.
>
>   
>> I don't think leaving hardware interrupts disabled for such a long
>> time is good though. 
>>     
>
> How long?  It would be no longer than now, and possibly less, wouldn't it?
>   

Hmm.  Perhaps.  Something about the asymmetry bothers me alot though.

Zach


^ permalink raw reply	[flat|nested] 93+ messages in thread

* Re: [patch 13/26] Xen-paravirt_ops: Consistently wrap paravirt ops callsites to make them patchable
  2007-03-21  2:53                                       ` Zachary Amsden
@ 2007-03-21  2:15                                         ` Linus Torvalds
  2007-03-21  3:43                                           ` Zachary Amsden
  0 siblings, 1 reply; 93+ messages in thread
From: Linus Torvalds @ 2007-03-21  2:15 UTC (permalink / raw)
  To: Zachary Amsden
  Cc: xen-devel, akpm, virtualization, netdev, linux-kernel, chrisw,
	Andi Kleen, Eric W. Biederman, anthony, mingo, David Miller



On Tue, 20 Mar 2007, Zachary Amsden wrote:
> 
> Actually, I was thinking the irq handlers would just not mess around with
> eflags on the stack, just call the chip to ack the interrupt and re-enable
> hardware interrupts when they left, since that is free anyway with the iret.

No can do. Think level-triggered. You *need* to disable the interrupt, and 
disabling it at the CPU is the easiest approach. Even so, you need to 
worry about SMP and screaming interrupts at all CPU's, but if you don't 
ack it to the IO-APIC until later, that should be ok (alternatively, you 
need to just mask-and-ack the irq controller).

> Maybe leaving irqs disabled is better.

One of the advantages of doing that is that you only ever have a queue of 
one single entry, which then makes it easier to do the replay.

		Linus

^ permalink raw reply	[flat|nested] 93+ messages in thread

* Re: [patch 13/26] Xen-paravirt_ops: Consistently wrap paravirt ops callsites to make them patchable
  2007-03-21  0:24                                     ` Linus Torvalds
@ 2007-03-21  2:53                                       ` Zachary Amsden
  2007-03-21  2:15                                         ` Linus Torvalds
  0 siblings, 1 reply; 93+ messages in thread
From: Zachary Amsden @ 2007-03-21  2:53 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Jeremy Fitzhardinge, Eric W. Biederman, Rusty Russell, Andi Kleen,
	David Miller, mingo, akpm, linux-kernel, virtualization,
	xen-devel, chrisw, anthony, netdev

Linus Torvalds wrote:
> On Tue, 20 Mar 2007, Zachary Amsden wrote:
>   
>> void local_irq_restore(int enabled)
>> {
>>    pda.intr_mask = enabled;
>>    /*
>>     * note there is a window here where softirqs are not processed by
>>     * the interrupt handler, but that is not a problem, since it will
>>     * get done here in the outer enable of any nested pair.
>>     */
>>    if (enabled)
>>        local_bh_enable();
>> }
>>     
>
> Actually, this one is more complicated. You also need to actually enable 
> hardware interrupts again if they got disabled by an interrupt actually 
> occurring while the "soft-interrupt" was disabled.
>   

Actually, I was thinking the irq handlers would just not mess around 
with eflags on the stack, just call the chip to ack the interrupt and 
re-enable hardware interrupts when they left, since that is free anyway 
with the iret.  Maybe leaving irqs disabled is better.

> Anyway, it really *should* be pretty damn simple. No need to disable 
> preemption, there should be no events that can *cause* it, since all 
> interrupts get headed off at the pass.. (the return-from-interrupt thng 
> should already notice that it's returning to an interrupts-disabled 
> section and not try to do any preemption).
>   
> What did I miss?
>   

I wasn't disabling preemption to actually disable preemption.  I was 
just using bh_disable as a global hammer to stop softirqs (thus the irq 
replay tasklet) from running during the normal irq_exit path.  Then, we 
can just use the existing software IRQ replay code, and I think barely 
any new code (queue_irq(), etc) has to be written.

Zach

^ permalink raw reply	[flat|nested] 93+ messages in thread

* Re: [patch 13/26] Xen-paravirt_ops: Consistently wrap paravirt ops callsites to make them patchable
  2007-03-21  2:15                                         ` Linus Torvalds
@ 2007-03-21  3:43                                           ` Zachary Amsden
  0 siblings, 0 replies; 93+ messages in thread
From: Zachary Amsden @ 2007-03-21  3:43 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Jeremy Fitzhardinge, Eric W. Biederman, Rusty Russell, Andi Kleen,
	David Miller, mingo, akpm, linux-kernel, virtualization,
	xen-devel, chrisw, anthony, netdev

Linus Torvalds wrote:
> On Tue, 20 Mar 2007, Zachary Amsden wrote:
>   
>> Actually, I was thinking the irq handlers would just not mess around with
>> eflags on the stack, just call the chip to ack the interrupt and re-enable
>> hardware interrupts when they left, since that is free anyway with the iret.
>>     
>
> No can do. Think level-triggered. You *need* to disable the interrupt, and 
> disabling it at the CPU is the easiest approach. Even so, you need to 
> worry about SMP and screaming interrupts at all CPU's, but if you don't 
> ack it to the IO-APIC until later, that should be ok (alternatively, you 
> need to just mask-and-ack the irq controller).
>   

Well, you can keep it masked, but a more important point is that I've 
entirely neglected local interrupts.  This might work for IRQs, but for 
local timer or thermal or IPIs, using the tasklet based replay simply 
will not work.

> One of the advantages of doing that is that you only ever have a queue of 
> one single entry, which then makes it easier to do the replay.
>   

Yes.  Unfortunately now both do_IRQ and all the smp_foo interrupt 
handlers need to detect and queue for replay, but fortunately they all 
have the interrupt number conveniently on the stack.

Zach

^ permalink raw reply	[flat|nested] 93+ messages in thread

* Re: [patch 13/26] Xen-paravirt_ops: Consistently wrap paravirt ops callsites to make them patchable
  2007-03-20 20:23                                                     ` Andi Kleen
  2007-03-20 21:39                                                       ` Alan Cox
  2007-03-20 23:43                                                       ` Linus Torvalds
@ 2007-03-21  6:08                                                       ` Andrew Morton
  2 siblings, 0 replies; 93+ messages in thread
From: Andrew Morton @ 2007-03-21  6:08 UTC (permalink / raw)
  To: Andi Kleen
  Cc: xen-devel, netdev, linux-kernel, jbeulich, virtualization, chrisw,
	virtualization, Eric W. Biederman, anthony, mingo, Linus Torvalds,
	David Miller

On Tue, 20 Mar 2007 21:23:52 +0100 Andi Kleen <ak@suse.de> wrote:

> Well it causes additional problems. We had some cases where it was really
> hard to distingush garbage and the true call chain.

yes, for some reason the naive backtraces seem to have got messier and messier over
the years and some of them are really quite hard to piece together nowadays.

An accurate backtrace would have some value, if we can get it bullet-proof.

The fault-injection driver wants it too.  And lockdep, I guess.

^ permalink raw reply	[flat|nested] 93+ messages in thread

* Re: [patch 13/26] Xen-paravirt_ops: Consistently wrap paravirt ops callsites to make them patchable
  2007-03-21  0:03                             ` Paul Mackerras
@ 2007-04-12 23:16                               ` David Miller
  0 siblings, 0 replies; 93+ messages in thread
From: David Miller @ 2007-04-12 23:16 UTC (permalink / raw)
  To: paulus
  Cc: xen-devel, virtualization, netdev, linux-kernel, chrisw, ak,
	ebiederm, anthony, mingo, torvalds, akpm

From: Paul Mackerras <paulus@samba.org>
Date: Wed, 21 Mar 2007 11:03:14 +1100

> Linus Torvalds writes:
> 
> > We should just do this natively. There's been several tests over the years 
> > saying that it's much more efficient to do sti/cli as a simple store, and 
> > handling the "oops, we got an interrupt while interrupts were disabled" as 
> > a special case.
> > 
> > I have this dim memory that ARM has done it that way for a long time 
> > because it's so expensive to do a "real" cli/sti.
> > 
> > And I think -rt does it for other reasons. It's just more flexible.
> 
> 64-bit powerpc does this now as well.

I was curious about this so I had a look.

There appears to be three pieces of state used to manage this
on powerpc, PACASOFTIRQEN(r13), PACAHARDIRQEN(r13) and the
SOFTE() in the stackframe.

Plus there is all of this complicated logic on trap entry and
exit to manage these three values properly.

local_irq_restore() doesn't look like a simple piece of code
either.  Logically it should be simple, update the software
binary state, and if enabling see if any interrupts came in
while we were disable so we can run them.

Given all of that, is it really cheaper than just flipping the
bit in the cpu control register? :-/

^ permalink raw reply	[flat|nested] 93+ messages in thread

end of thread, other threads:[~2007-04-12 23:16 UTC | newest]

Thread overview: 93+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
     [not found] <20070301232443.195603797@goop.org>
2007-03-01 23:25 ` [patch 26/26] Xen-paravirt_ops: Add the Xen virtual network device driver Jeremy Fitzhardinge
2007-03-02  0:42   ` Stephen Hemminger
2007-03-02  0:56     ` Jeremy Fitzhardinge
2007-03-02  1:30       ` [RFC] Arp announce (for Xen) Stephen Hemminger
2007-03-02  8:09         ` Pekka Savola
2007-03-02 18:29           ` Ben Greear
2007-03-02 19:59             ` Stephen Hemminger
2007-03-02  8:46         ` Keir Fraser
2007-03-02 12:54         ` Andi Kleen
2007-03-02 14:08           ` jamal
2007-03-02 18:08         ` Chris Wright
2007-03-06  4:35         ` David Miller
2007-03-06 18:51           ` [RFC] ARP notify option Stephen Hemminger
2007-03-06 19:04             ` Jeremy Fitzhardinge
2007-03-06 19:07             ` Chris Wright
2007-03-06 21:18             ` Chris Friesen
2007-03-06 22:52               ` Stephen Hemminger
2007-03-07  6:42               ` Pekka Savola
2007-03-07 17:00                 ` Stephen Hemminger
2007-03-02  1:21     ` [patch 26/26] Xen-paravirt_ops: Add the Xen virtual network device driver Christoph Hellwig
2007-03-02  1:26       ` Chris Wright
     [not found] ` <20070301232527.956565107@goop.org>
     [not found]   ` <20070316092445.GM23174@elte.hu>
     [not found]     ` <20070316.023331.59468179.davem@davemloft.net>
2007-03-16 20:38       ` [patch 13/26] Xen-paravirt_ops: Consistently wrap paravirt ops callsites to make them patchable Jeremy Fitzhardinge
2007-03-17 10:33         ` Rusty Russell
2007-03-18  7:33           ` David Miller
2007-03-18  7:59             ` Jeremy Fitzhardinge
2007-03-18 12:08             ` Andi Kleen
2007-03-18 15:58               ` Jeremy Fitzhardinge
2007-03-18 17:04                 ` Andi Kleen
2007-03-18 17:29                   ` Jeremy Fitzhardinge
2007-03-18 19:30                     ` Andi Kleen
2007-03-18 23:46                       ` Jeremy Fitzhardinge
2007-03-19 10:57                         ` Andi Kleen
2007-03-19 17:58                           ` Jeremy Fitzhardinge
2007-03-19 19:08                           ` David Miller
2007-03-19 20:59                             ` Andi Kleen
2007-03-20  3:18                               ` Linus Torvalds
2007-03-20  3:47                                 ` David Miller
2007-03-20  4:19                                   ` Eric W. Biederman
2007-03-20 13:28                                     ` Andi Kleen
2007-03-20 16:25                                       ` Eric W. Biederman
2007-03-20 17:42                                         ` Andi Kleen
2007-03-20 16:52                                           ` Linus Torvalds
2007-03-20 18:03                                             ` Andi Kleen
2007-03-20 17:27                                               ` Linus Torvalds
2007-03-20 19:21                                                 ` Andi Kleen
2007-03-20 18:49                                                   ` Linus Torvalds
2007-03-20 20:23                                                     ` Andi Kleen
2007-03-20 21:39                                                       ` Alan Cox
2007-03-20 21:49                                                         ` [Xen-devel] " Andi Kleen
2007-03-20 23:51                                                           ` Linus Torvalds
2007-03-20 23:43                                                       ` Linus Torvalds
2007-03-21  6:08                                                       ` Andrew Morton
2007-03-20 16:12                                     ` Chuck Ebbert
2007-03-20  1:23                         ` Zachary Amsden
2007-03-20  1:45                           ` Jeremy Fitzhardinge
2007-03-19  2:47               ` Rusty Russell
2007-03-19 18:25                 ` Eric W. Biederman
2007-03-19 18:38                   ` Linus Torvalds
2007-03-19 18:44                     ` Linus Torvalds
2007-03-19 19:33                     ` Jeremy Fitzhardinge
2007-03-20  0:01                     ` Rusty Russell
2007-03-20  2:00                       ` Zachary Amsden
2007-03-20  4:20                         ` Rusty Russell
2007-03-20  5:54                         ` Jeremy Fitzhardinge
2007-03-20 11:33                           ` Andreas Kleen
2007-03-20 15:09                           ` Linus Torvalds
2007-03-20 15:58                             ` Eric W. Biederman
2007-03-20 16:06                               ` Linus Torvalds
2007-03-20 16:31                                 ` Jeremy Fitzhardinge
2007-03-20 22:09                                   ` Zachary Amsden
2007-03-21  0:24                                     ` Linus Torvalds
2007-03-21  2:53                                       ` Zachary Amsden
2007-03-21  2:15                                         ` Linus Torvalds
2007-03-21  3:43                                           ` Zachary Amsden
2007-03-20 22:43                                   ` Matt Mackall
2007-03-20 23:08                                     ` Zachary Amsden
2007-03-20 23:33                                       ` Jeremy Fitzhardinge
2007-03-21  1:14                                         ` Zachary Amsden
2007-03-20 23:41                                       ` Matt Mackall
2007-03-21  0:20                                     ` Jeremy Fitzhardinge
2007-03-20 19:28                                 ` Andi Kleen
2007-03-20 19:54                                   ` Zachary Amsden
2007-03-20 20:02                                     ` Andi Kleen
2007-03-20 16:26                               ` Jeremy Fitzhardinge
2007-03-20 22:41                               ` Rusty Russell
2007-03-20 17:00                             ` Ingo Molnar
2007-03-21  0:03                             ` Paul Mackerras
2007-04-12 23:16                               ` David Miller
2007-03-19 18:41                   ` Chris Wright
2007-03-19 19:10                   ` Jeremy Fitzhardinge
2007-03-19 19:46                     ` David Miller
2007-03-19 20:06                       ` Jeremy Fitzhardinge
2007-03-19 23:42                     ` Andi Kleen

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).