Re: Unstableness in grant table block drivers

All of lore.kernel.org
 help / color / mirror / Atom feed

From: Christopher Clark <christopher.w.clark@gmail.com>
To: Ryan Harper <ryanh@us.ibm.com>, Anthony Liguori <aliguori@us.ibm.com>
Cc: xen-devel@lists.xensource.com
Subject: Re: Unstableness in grant table block drivers
Date: Mon, 18 Apr 2005 09:38:50 -0700	[thread overview]
Message-ID: <eab0875405041809385510d0f9@mail.gmail.com> (raw)
In-Reply-To: <20050416153922.GH27157@us.ibm.com>

[-- Attachment #1: Type: text/plain, Size: 801 bytes --]

> > Ryan: is this an SMP guest?
> 
> Yes, that only shows up when trying to boot an SMP guest.

Does the crash only occur when trying to boot an SMP guest - ie. do UP
guests start correctly?

Ryan and Anthony: I've attached a patch that I'd like you to apply to
your unstable tree -- it just printk's almost everything so I can get
an idea of the code path you're running.
Alternatively, the three .c files modified by the patch are also
attached - they're taken from 2005-04-14 unstable and you should be
able to just replace the files in your tree.

After changing your tree and rebuilding, please send the console logs
from boot onwards for xen/dom0 and domU, in a uniprocessor
configuration, both with and without using grant tables for block
transport.

thanks

Christopher

[-- Warning: decoded text below may be mangled, UTF-8 assumed --]
[-- Attachment #2: instrument.patch --]
[-- Type: text/x-patch; name="instrument.patch", Size: 18364 bytes --]

===== linux-2.6.11-xen-sparse/drivers/xen/blkback/blkback.c 1.38 vs edited =====
--- 1.38/linux-2.6.11-xen-sparse/drivers/xen/blkback/blkback.c	2005-04-05 05:28:05 -07:00
+++ edited/linux-2.6.11-xen-sparse/drivers/xen/blkback/blkback.c	2005-04-17 21:14:38 -07:00
@@ -115,6 +115,7 @@
 
 static void fast_flush_area(int idx, int nr_pages)
 {
+    printk("cwc:%s:in\n", __FUNCTION__);
 #ifdef CONFIG_XEN_BLKDEV_GRANT
     gnttab_op_t       aop[BLKIF_MAX_SEGMENTS_PER_REQUEST];
     unsigned int      i, invcount = 0;
@@ -151,6 +152,7 @@
     if ( unlikely(HYPERVISOR_multicall(mcl, nr_pages) != 0) )
         BUG();
 #endif
+    printk("cwc:%s:out\n", __FUNCTION__);
 }
 
 
@@ -374,6 +376,7 @@
 {
     int rsp = BLKIF_RSP_ERROR;
     int pending_idx = pending_ring[MASK_PEND_IDX(pending_cons)];
+    printk("cwc:%s:in id:%lu\n", __FUNCTION__, req->id);
 
     /* We expect one buffer only. */
     if ( unlikely(req->nr_segments != 1) )
@@ -384,6 +387,7 @@
          (blkif_last_sect(req->frame_and_sects[0]) != 7) )
         goto out;
 
+    printk("cwc:%s:mid\n", __FUNCTION__);
 #ifdef CONFIG_XEN_BLKDEV_GRANT
     {
         gnttab_op_t     op;
@@ -428,12 +432,14 @@
 #endif
 #endif /* endif CONFIG_XEN_BLKDEV_GRANT */
    
+    printk("cwc:%s:vbd_probe\n", __FUNCTION__);
     rsp = vbd_probe(blkif, (vdisk_t *)MMAP_VADDR(pending_idx, 0), 
                     PAGE_SIZE / sizeof(vdisk_t));
 
  out:
     fast_flush_area(pending_idx, 1);
     make_response(blkif, req->id, req->operation, rsp);
+    printk("cwc:%s:out response:%d\n", __FUNCTION__, rsp);
 }
 
 static void dispatch_rw_block_io(blkif_t *blkif, blkif_request_t *req)
@@ -464,6 +470,7 @@
 
     /* Check that number of segments is sane. */
     nseg = req->nr_segments;
+    printk("cwc:%s:in nsegs:%u\n", __FUNCTION__, nseg);
     if ( unlikely(nseg == 0) || 
          unlikely(nseg > BLKIF_MAX_SEGMENTS_PER_REQUEST) )
     {
@@ -497,6 +504,7 @@
     if ( unlikely(HYPERVISOR_grant_table_op(
                     GNTTABOP_map_grant_ref, aop, nseg)))
         BUG();
+    printk("cwc:%s:post map\n", __FUNCTION__);
 
     for ( i = 0; i < nseg; i++ )
     {
@@ -513,6 +521,7 @@
         pending_handle(pending_idx, i) = aop[i].u.map_grant_ref.handle;
     }
 #endif
+    printk("cwc:%s:pre populate segs\n", __FUNCTION__);
 
     for ( i = 0; i < nseg; i++ )
     {
@@ -527,6 +536,8 @@
             goto bad_descriptor;
         preq.nr_sects += seg[i].nsec;
 #endif
+     	printk("cwc:%s: buf:%lx populated %u sects\n",
+               __FUNCTION__, seg[i].buf, seg[i].nsec);
     }
 
     if ( vbd_translate(&preq, blkif, operation) != 0 )
@@ -646,6 +657,7 @@
 
         preq.sector_number += seg[i].nsec;
     }
+    printk("cwc:%s: post bio_add_page\n", __FUNCTION__);
 
     if ( (q = bdev_get_queue(bio->bi_bdev)) != plugged_queue )
     {
@@ -662,11 +674,13 @@
         submit_bio(operation, biolist[i]);
 
 #endif
+    printk("cwc:%s: %d bios submitted\n", __FUNCTION__, nbio);
 
     return;
 
  bad_descriptor:
     make_response(blkif, req->id, req->operation, BLKIF_RSP_ERROR);
+    printk("cwc:%s: %d bad descriptor\n", __FUNCTION__);
 } 
 
 
===== linux-2.6.11-xen-sparse/drivers/xen/blkfront/blkfront.c 1.52 vs edited =====
--- 1.52/linux-2.6.11-xen-sparse/drivers/xen/blkfront/blkfront.c	2005-04-05 00:43:40 -07:00
+++ edited/linux-2.6.11-xen-sparse/drivers/xen/blkfront/blkfront.c	2005-04-17 22:34:21 -07:00
@@ -226,6 +226,7 @@
 {
     struct gendisk *gd = inode->i_bdev->bd_disk;
     struct xlbd_disk_info *di = (struct xlbd_disk_info *)gd->private_data;
+    printk("cwc:%s: called\n", __FUNCTION__);
 
     /* Update of usage count is protected by per-device semaphore. */
     di->mi->usage++;
@@ -238,6 +239,7 @@
 {
     struct gendisk *gd = inode->i_bdev->bd_disk;
     struct xlbd_disk_info *di = (struct xlbd_disk_info *)gd->private_data;
+    printk("cwc:%s: called\n", __FUNCTION__);
 
     /*
      * When usage drops to zero it may allow more VBD updates to occur.
@@ -255,6 +257,7 @@
                 unsigned command, unsigned long argument)
 {
     int i;
+    printk("cwc:%s: called\n", __FUNCTION__);
 
     DPRINTK_IOCTL("command: 0x%x, argument: 0x%lx, dev: 0x%04x\n",
                   command, (long)argument, inode->i_rdev); 
@@ -305,6 +308,7 @@
 #ifdef CONFIG_XEN_BLKDEV_GRANT
     int ref;
 #endif
+    printk("cwc:%s: in blkif_state:%u\n", __FUNCTION__, blkif_state);
 
     if ( unlikely(blkif_state != BLKIF_STATE_CONNECTED) )
         return 1;
@@ -355,6 +359,7 @@
     /* Keep a private copy so we can reissue requests when recovering. */
     translate_req_to_pfn(&rec_ring[id], ring_req);
 
+    printk("cwc:%s: out\n", __FUNCTION__);
     return 0;
 }
 
@@ -367,6 +372,7 @@
 {
     struct request *req;
     int queued;
+    printk("cwc:%s: called\n", __FUNCTION__);
 
     DPRINTK("Entered do_blkif_request\n"); 
 
@@ -407,12 +413,15 @@
     RING_IDX i, rp;
     unsigned long flags; 
     
+    printk("cwc:%s: pre spin_lock_irqsave\n", __FUNCTION__);
     spin_lock_irqsave(&blkif_io_lock, flags);     
+    printk("cwc:%s: post spin_lock_irqsave\n", __FUNCTION__);
 
     if ( unlikely(blkif_state == BLKIF_STATE_CLOSED) || 
          unlikely(recovery) )
     {
         spin_unlock_irqrestore(&blkif_io_lock, flags);
+        printk("cwc:%s: spin_unlock_irqsave\n", __FUNCTION__);
         return IRQ_HANDLED;
     }
     
@@ -426,6 +435,7 @@
         bret = RING_GET_RESPONSE(&blk_ring, i);
         id = bret->id;
         req = (struct request *)rec_ring[id].id;
+        printk("cwc:%s: blkif_completion call id %lu\n", __FUNCTION__, id);
         blkif_completion( &rec_ring[id] );
 
         ADD_ID_TO_FREELIST(id); /* overwrites req */
@@ -454,12 +464,14 @@
             BUG();
         }
     }
+    printk("cwc:%s: post loop\n", __FUNCTION__);
 
     blk_ring.rsp_cons = i;
     
     kick_pending_request_queues();
 
     spin_unlock_irqrestore(&blkif_io_lock, flags);
+    printk("cwc:%s: out unlock_irq_restore\n", __FUNCTION__);
 
     return IRQ_HANDLED;
 }
@@ -768,6 +780,7 @@
 #ifdef CONFIG_XEN_BLKDEV_GRANT
     int ref;
 #endif
+    printk("cwc:%s: in op:%d\n", __FUNCTION__, operation);
 
     fsect = (buffer_ma & ~PAGE_MASK) >> 9;
     lsect = fsect + nr_sectors - 1;
@@ -809,6 +822,7 @@
              (sg_dev == device) &&
              (sg_next_sect == sector_number) )
         {
+            printk("cwc:%s: building req\n", __FUNCTION__);
             req = RING_GET_REQUEST(&blk_ring, 
                                    blk_ring.req_prod_pvt - 1);
             bh = (struct buffer_head *)id;
@@ -842,10 +856,12 @@
             /* Update the copy of the request in the recovery ring. */
             translate_req_to_pfn(&rec_ring[req->id], req );
 
+            printk("cwc:%s: access granted to %lx\n", __FUNCTION__, buffer_ma);
             return 0;
         }
         else if ( RING_FULL(&blk_ring) )
         {
+            printk("cwc:%s: ring full\n", __FUNCTION__);
             return 1;
         }
         else
@@ -857,8 +873,12 @@
         break;
 
     default:
+        {
+        printk("cwc:%s: pre panic for unknown op %d\n", __FUNCTION__, operation);
         panic("unknown op %d\n", operation);
+        }
     }
+    printk("cwc:%s: mid\n", __FUNCTION__);
 
     /* Fill out a communications ring structure. */
     req = RING_GET_REQUEST(&blk_ring, blk_ring.req_prod_pvt);
@@ -892,6 +912,7 @@
 
     blk_ring.req_prod_pvt++;
     
+    printk("cwc:%s: out buf:%lx\n", __FUNCTION__, buffer_ma);
     return 0;
 }
 
@@ -1051,6 +1072,7 @@
     ASSERT( ref != -ENOSPC );
 
     gnttab_grant_foreign_access_ref( ref, rdomid, address >> PAGE_SHIFT, 0 );
+    printk("cwc:%s: grant access to buf:%lx\n", __FUNCTION__, address >> PAGE_SHIFT);
 
     req->frame_and_sects[0] = (((u32) ref) << 16) | 7;
 
@@ -1062,6 +1084,7 @@
 {
     unsigned long flags, id;
     blkif_request_t *req_d;
+    printk("cwc:%s: in\n", __FUNCTION__);
 
  retry:
     while ( RING_FULL(&blk_ring) )
@@ -1070,9 +1093,12 @@
         schedule_timeout(1);
     }
 
+    printk("cwc:%s: pre spin_lock_irqsave\n", __FUNCTION__);
     spin_lock_irqsave(&blkif_io_lock, flags);
+    printk("cwc:%s: post spin_lock_irqsave\n", __FUNCTION__);
     if ( RING_FULL(&blk_ring) )
     {
+        printk("cwc:%s: spin_unlock_irqrestore\n", __FUNCTION__);
         spin_unlock_irqrestore(&blkif_io_lock, flags);
         goto retry;
     }
@@ -1090,6 +1116,7 @@
     blk_ring.req_prod_pvt++;
     flush_requests();
 
+    printk("cwc:%s: spin_unlock_irqrestore 2\n", __FUNCTION__);
     spin_unlock_irqrestore(&blkif_io_lock, flags);
 
     while ( !blkif_control_rsp_valid )
@@ -1100,6 +1127,7 @@
 
     memcpy(rsp, &blkif_control_rsp, sizeof(*rsp));
     blkif_control_rsp_valid = 0;
+    printk("cwc:%s: out\n", __FUNCTION__);
 }
 
 
@@ -1115,6 +1143,7 @@
     
     msg->status = (ok ? BLKIF_DRIVER_STATUS_UP : BLKIF_DRIVER_STATUS_DOWN);
 
+    printk("cwc:%s: ctrl_if_send_message_block\n", __FUNCTION__);
     ctrl_if_send_message_block(&cmsg, NULL, 0, TASK_UNINTERRUPTIBLE);
 }
 
@@ -1131,16 +1160,20 @@
     msg->handle      = 0;
     msg->shmem_frame = (virt_to_machine(blk_ring.sring) >> PAGE_SHIFT);
     
+    printk("cwc:%s: ctrl_if_send_message_block\n", __FUNCTION__);
     ctrl_if_send_message_block(&cmsg, NULL, 0, TASK_UNINTERRUPTIBLE);
 }
 
 static void blkif_free(void)
 {
     /* Prevent new requests being issued until we fix things up. */
+    printk("cwc:%s: pre spin_lock_irq\n", __FUNCTION__);
     spin_lock_irq(&blkif_io_lock);
+    printk("cwc:%s: post spin_lock_irq\n", __FUNCTION__);
     recovery = 1;
     blkif_state = BLKIF_STATE_DISCONNECTED;
     spin_unlock_irq(&blkif_io_lock);
+    printk("cwc:%s: spin_unlock_irq\n", __FUNCTION__);
 
     /* Free resources associated with old device channel. */
     if ( blk_ring.sring != NULL )
@@ -1153,16 +1186,19 @@
     
     unbind_evtchn_from_irq(blkif_evtchn);
     blkif_evtchn = 0;
+    printk("cwc:%s: out\n", __FUNCTION__);
 }
 
 static void blkif_close(void)
 {
+    printk("cwc:%s: called\n", __FUNCTION__);
 }
 
 /* Move from CLOSED to DISCONNECTED state. */
 static void blkif_disconnect(void)
 {
     blkif_sring_t *sring;
+    printk("cwc:%s: called\n", __FUNCTION__);
     
     if ( blk_ring.sring != NULL )
         free_page((unsigned long)blk_ring.sring);
@@ -1176,6 +1212,7 @@
 
 static void blkif_reset(void)
 {
+    printk("cwc:%s: called\n", __FUNCTION__);
     blkif_free();
     blkif_disconnect();
 }
@@ -1184,6 +1221,7 @@
 {
     int i;
     blkif_request_t *req;
+    printk("cwc:%s: in\n", __FUNCTION__);
 
     /* Hmm, requests might be re-ordered when we re-issue them.
      * This will need to be fixed once we have barriers */
@@ -1199,6 +1237,7 @@
             blk_ring.req_prod_pvt++;
         }
     }
+    printk("cwc:%s: post 1\n", __FUNCTION__);
 
     /* Stage 2 : Set up shadow list. */
     for ( i = 0; i < blk_ring.req_prod_pvt; i++ ) 
@@ -1208,12 +1247,14 @@
         req->id = i;
         translate_req_to_pfn(&rec_ring[i], req);
     }
+    printk("cwc:%s: post 2\n", __FUNCTION__);
 
     /* Stage 3 : Set up free list. */
     for ( ; i < BLK_RING_SIZE; i++ )
         rec_ring[i].id = i+1;
     rec_ring_free = blk_ring.req_prod_pvt;
     rec_ring[BLK_RING_SIZE-1].id = 0x0fffffff;
+    printk("cwc:%s: post 3\n", __FUNCTION__);
 
     /* blk_ring->req_prod will be set when we flush_requests().*/
     wmb();
@@ -1229,6 +1270,7 @@
 
     /* Now safe to left other peope use interface. */
     blkif_state = BLKIF_STATE_CONNECTED;
+    printk("cwc:%s: out\n", __FUNCTION__);
 }
 
 static void blkif_connect(blkif_fe_interface_status_t *status)
@@ -1240,6 +1282,7 @@
 #ifdef CONFIG_XEN_BLKDEV_GRANT
     rdomid       = status->domid;
 #endif
+    printk("cwc:%s: in\n", __FUNCTION__);
 
     err = request_irq(blkif_irq, blkif_int, SA_SAMPLE_RANDOM, "blkif", NULL);
     if ( err )
@@ -1250,6 +1293,7 @@
 
     if ( recovery ) 
     {
+        printk("cwc:%s: recovery\n", __FUNCTION__);
         blkif_recover();
     } 
     else 
@@ -1257,15 +1301,19 @@
         /* Transition to connected in case we need to do 
          *  a partition probe on a whole disk. */
         blkif_state = BLKIF_STATE_CONNECTED;
+        printk("cwc:%s: connected\n", __FUNCTION__);
         
         /* Probe for discs attached to the interface. */
         xlvbd_init();
     }
     
     /* Kick pending requests. */
+    printk("cwc:%s: pre spin_lock_irq\n", __FUNCTION__);
     spin_lock_irq(&blkif_io_lock);
+    printk("cwc:%s: post spin_lock_irq\n", __FUNCTION__);
     kick_pending_request_queues();
     spin_unlock_irq(&blkif_io_lock);
+    printk("cwc:%s: spin_unlock_irq\n", __FUNCTION__);
 }
 
 static void unexpected(blkif_fe_interface_status_t *status)
@@ -1282,6 +1330,7 @@
         unexpected(status);
         return;
     }
+    printk("cwc:%s: status: %u\n", __FUNCTION__, status->status);
 
     switch ( status->status ) 
     {
@@ -1353,6 +1402,7 @@
 
 static void blkif_ctrlif_rx(ctrl_msg_t *msg, unsigned long id)
 {
+    printk("cwc:%s: called\n", __FUNCTION__);
     switch ( msg->subtype )
     {
     case CMSG_BLKIF_FE_INTERFACE_STATUS:
@@ -1371,6 +1421,7 @@
 {
     int err = 0;
     int i;
+    printk("cwc:%s: in\n", __FUNCTION__);
     send_driver_status(1);
 
     /*
@@ -1389,6 +1440,7 @@
         printk(KERN_INFO "xen_blk: Timeout connecting to device!\n");
         err = -ENOSYS;
     }
+    printk("cwc:%s: out\n", __FUNCTION__);
     return err;
 }
 
@@ -1424,16 +1476,19 @@
 
 void blkdev_suspend(void)
 {
+    printk("cwc:%s: called\n", __FUNCTION__);
 }
 
 void blkdev_resume(void)
 {
+    printk("cwc:%s: called\n", __FUNCTION__);
     send_driver_status(1);
 }
 
 void blkif_completion(blkif_request_t *req)
 {
     int i;
+    printk("cwc:%s: in\n", __FUNCTION__);
 #ifdef CONFIG_XEN_BLKDEV_GRANT
     grant_ref_t gref;
 
@@ -1456,4 +1511,5 @@
         break;
     }
 #endif
+    printk("cwc:%s: out\n", __FUNCTION__);
 }
===== xen/common/grant_table.c 1.33 vs edited =====
--- 1.33/xen/common/grant_table.c	2005-04-14 13:53:23 -07:00
+++ edited/xen/common/grant_table.c	2005-04-17 22:18:32 -07:00
@@ -22,8 +22,8 @@
  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  */
 
-#define GRANT_DEBUG 0
-#define GRANT_DEBUG_VERBOSE 0
+#define GRANT_DEBUG 1
+#define GRANT_DEBUG_VERBOSE 1
 
 #include <xen/config.h>
 #include <xen/sched.h>
@@ -104,7 +104,9 @@
     act = &granting_d->grant_table->active[ref];
     sha = &granting_d->grant_table->shared[ref];
 
+    printk("cwc:%s: pre spin_lock\n", __FUNCTION__);
     spin_lock(&granting_d->grant_table->lock);
+    printk("cwc:%s: post spin_lock\n", __FUNCTION__);
 
     if ( act->pin == 0 )
     {
@@ -248,6 +250,7 @@
      */
 
     spin_unlock(&granting_d->grant_table->lock);
+    printk("cwc:%s: spin_unlock\n", __FUNCTION__);
 
     if ( (host_virt_addr != 0) && (dev_hst_ro_flags & GNTMAP_host_map) )
     {
@@ -269,7 +272,9 @@
         {
             /* Abort. */
 
+            printk("cwc:%s: pre spin_lock 2\n", __FUNCTION__);
             spin_lock(&granting_d->grant_table->lock);
+            printk("cwc:%s: post spin_lock 2\n", __FUNCTION__);
 
             if ( dev_hst_ro_flags & GNTMAP_readonly )
                 act->pin -= GNTPIN_hstr_inc;
@@ -289,14 +294,17 @@
             }
 
             spin_unlock(&granting_d->grant_table->lock);
+            printk("cwc:%s: spin_unlock 2\n", __FUNCTION__);
         }
 
     }
     *pframe = frame;
+    printk("cwc:%s: out\n", __FUNCTION__);
     return rc;
 
  unlock_out:
     spin_unlock(&granting_d->grant_table->lock);
+    printk("cwc:%s: spin_unlock out\n", __FUNCTION__);
     return rc;
 }
 
@@ -507,7 +515,9 @@
     act = &rd->grant_table->active[ref];
     sha = &rd->grant_table->shared[ref];
 
+    printk("cwc:%s: pre spin_lock\n", __FUNCTION__);
     spin_lock(&rd->grant_table->lock);
+    printk("cwc:%s: post spin_lock\n", __FUNCTION__);
 
     if ( frame == 0 )
         frame = act->frame;
@@ -562,6 +572,7 @@
             goto unmap_out;
         }
 
+        printk("cwc:%s: pre pagetable delete\n", __FUNCTION__);
         /* Delete pagetable entry
          */
         if ( unlikely(__put_user(0, (unsigned long *)pl1e)))
@@ -588,6 +599,7 @@
 
         rc = 0;
         *va = virt;
+        printk("cwc:%s: post pagetable delete\n", __FUNCTION__);
     }
 
     if ( (map->ref_and_flags & (GNTMAP_device_map|GNTMAP_host_map)) == 0)
@@ -617,6 +629,7 @@
 
  unmap_out:
     (void)__put_user(rc, &uop->status);
+    printk("cwc:%s: out spin_unlock\n", __FUNCTION__);
     spin_unlock(&rd->grant_table->lock);
     put_domain(rd);
     return rc;
@@ -897,11 +910,14 @@
             ref = (map->ref_and_flags >> MAPTRACK_REF_SHIFT);
             act = &rgt->active[ref];
 
+            printk("cwc:%s: pre spin_lock\n", __FUNCTION__);
             spin_lock(&rgt->lock);
+            printk("cwc:%s: post spin_lock\n", __FUNCTION__);
 
             if ( act->frame != frame )
             {
                 spin_unlock(&rgt->lock);
+                printk("cwc:%s: not frame spin_unlock\n", __FUNCTION__);
                 continue;
             }
 
@@ -910,6 +926,7 @@
             if ( refcount == 0 )
             {
                 spin_unlock(&rgt->lock);
+                printk("cwc:%s: refcount 0 spin_unlock\n", __FUNCTION__);
                 continue;
             }
 
@@ -937,6 +954,7 @@
                 put_page(&frame_table[frame]);
             }
             spin_unlock(&rgt->lock);
+            printk("cwc:%s: spin_unlock\n", __FUNCTION__);
 
             clear_bit(GNTMAP_host_map, &map->ref_and_flags);
 
@@ -949,6 +967,7 @@
     }
     put_domain(rd);
 
+    printk("cwc:%s: out found:%d\n", __FUNCTION__, found);
     return found;
 }
 
@@ -974,13 +993,16 @@
         return 0;
     }
 
+    printk("cwc:%s: pre spin_lock\n", __FUNCTION__);
     spin_lock(&rgt->lock);
+    printk("cwc:%s: post spin_lock\n", __FUNCTION__);
 
     sha = &rgt->shared[ref];
     
     sflags = sha->flags;
     sdom   = sha->domid;
 
+    printk("cwc:%s: pre cmpxchg loop\n", __FUNCTION__);
     for ( ; ; )
     {
         target_pfn = sha->frame;
@@ -1028,10 +1050,12 @@
     }
 
     spin_unlock(&rgt->lock);
+    printk("cwc:%s: out spin_unlock 1\n", __FUNCTION__);
     return 1;
 
  fail:
     spin_unlock(&rgt->lock);
+    printk("cwc:%s: out spin_unlock 0\n", __FUNCTION__);
     return 0;
 }
 

[-- Warning: decoded text below may be mangled, UTF-8 assumed --]
[-- Attachment #3: blkback.c --]
[-- Type: text/x-csrc; name="blkback.c", Size: 23917 bytes --]

/******************************************************************************
 * arch/xen/drivers/blkif/backend/main.c
 * 
 * Back-end of the driver for virtual block devices. This portion of the
 * driver exports a 'unified' block-device interface that can be accessed
 * by any operating system that implements a compatible front end. A 
 * reference front-end implementation can be found in:
 *  arch/xen/drivers/blkif/frontend
 * 
 * Copyright (c) 2003-2004, Keir Fraser & Steve Hand
 * Copyright (c) 2005, Christopher Clark
 */

#include "common.h"
#include <asm-xen/evtchn.h>
#ifdef CONFIG_XEN_BLKDEV_GRANT
#include <asm-xen/xen-public/grant_table.h>
#endif

/*
 * These are rather arbitrary. They are fairly large because adjacent requests
 * pulled from a communication ring are quite likely to end up being part of
 * the same scatter/gather request at the disc.
 * 
 * ** TRY INCREASING 'MAX_PENDING_REQS' IF WRITE SPEEDS SEEM TOO LOW **
 * This will increase the chances of being able to write whole tracks.
 * 64 should be enough to keep us competitive with Linux.
 */
#define MAX_PENDING_REQS 64
#define BATCH_PER_DOMAIN 16

static unsigned long mmap_vstart;
#define MMAP_PAGES                                              \
    (MAX_PENDING_REQS * BLKIF_MAX_SEGMENTS_PER_REQUEST)
#define MMAP_VADDR(_req,_seg)                                   \
    (mmap_vstart +                                              \
     ((_req) * BLKIF_MAX_SEGMENTS_PER_REQUEST * PAGE_SIZE) +    \
     ((_seg) * PAGE_SIZE))

/*
 * Each outstanding request that we've passed to the lower device layers has a 
 * 'pending_req' allocated to it. Each buffer_head that completes decrements 
 * the pendcnt towards zero. When it hits zero, the specified domain has a 
 * response queued for it, with the saved 'id' passed back.
 */
typedef struct {
    blkif_t       *blkif;
    unsigned long  id;
    int            nr_pages;
    atomic_t       pendcnt;
    unsigned short operation;
    int            status;
} pending_req_t;

/*
 * We can't allocate pending_req's in order, since they may complete out of 
 * order. We therefore maintain an allocation ring. This ring also indicates 
 * when enough work has been passed down -- at that point the allocation ring 
 * will be empty.
 */
static pending_req_t pending_reqs[MAX_PENDING_REQS];
static unsigned char pending_ring[MAX_PENDING_REQS];
static spinlock_t pend_prod_lock = SPIN_LOCK_UNLOCKED;
/* NB. We use a different index type to differentiate from shared blk rings. */
typedef unsigned int PEND_RING_IDX;
#define MASK_PEND_IDX(_i) ((_i)&(MAX_PENDING_REQS-1))
static PEND_RING_IDX pending_prod, pending_cons;
#define NR_PENDING_REQS (MAX_PENDING_REQS - pending_prod + pending_cons)

#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,0)
static kmem_cache_t *buffer_head_cachep;
#else
static request_queue_t *plugged_queue;
static inline void flush_plugged_queue(void)
{
    request_queue_t *q = plugged_queue;
    if ( q != NULL )
    {
        if ( q->unplug_fn != NULL )
            q->unplug_fn(q);
        blk_put_queue(q);
        plugged_queue = NULL;
    }
}
#endif

#ifdef CONFIG_XEN_BLKDEV_GRANT
/* When using grant tables to map a frame for device access then the
 * handle returned must be used to unmap the frame. This is needed to
 * drop the ref count on the frame.
 */
static u16 pending_grant_handles[MMAP_PAGES];
#define pending_handle(_idx, _i) \
    (pending_grant_handles[((_idx) * BLKIF_MAX_SEGMENTS_PER_REQUEST) + (_i)])
#define BLKBACK_INVALID_HANDLE (0xFFFF)
#endif

#ifdef CONFIG_XEN_BLKDEV_TAP_BE
/*
 * If the tap driver is used, we may get pages belonging to either the tap
 * or (more likely) the real frontend.  The backend must specify which domain
 * a given page belongs to in update_va_mapping though.  For the moment, 
 * the tap rewrites the ID field of the request to contain the request index
 * and the id of the real front end domain.
 */
#define BLKTAP_COOKIE 0xbeadfeed
static inline domid_t ID_TO_DOM(unsigned long id) { return (id >> 16); }
#endif

static int do_block_io_op(blkif_t *blkif, int max_to_do);
static void dispatch_probe(blkif_t *blkif, blkif_request_t *req);
static void dispatch_rw_block_io(blkif_t *blkif, blkif_request_t *req);
static void make_response(blkif_t *blkif, unsigned long id, 
                          unsigned short op, int st);

static void fast_flush_area(int idx, int nr_pages)
{
    printk("cwc:%s:in\n", __FUNCTION__);
#ifdef CONFIG_XEN_BLKDEV_GRANT
    gnttab_op_t       aop[BLKIF_MAX_SEGMENTS_PER_REQUEST];
    unsigned int      i, invcount = 0;
    u16               handle;

    for ( i = 0; i < nr_pages; i++ )
    {
        if ( BLKBACK_INVALID_HANDLE != ( handle = pending_handle(idx, i) ) )
        {
            aop[i].u.unmap_grant_ref.host_virt_addr = MMAP_VADDR(idx, i);
            aop[i].u.unmap_grant_ref.dev_bus_addr   = 0;
            aop[i].u.unmap_grant_ref.handle         = handle;
            pending_handle(idx, i) = BLKBACK_INVALID_HANDLE;
            invcount++;
        }
    }
    if ( unlikely(HYPERVISOR_grant_table_op(
                    GNTTABOP_unmap_grant_ref, aop, invcount)))
        BUG();
#else

    multicall_entry_t mcl[BLKIF_MAX_SEGMENTS_PER_REQUEST];
    int               i;

    for ( i = 0; i < nr_pages; i++ )
    {
        mcl[i].op = __HYPERVISOR_update_va_mapping;
        mcl[i].args[0] = MMAP_VADDR(idx, i);
        mcl[i].args[1] = 0;
        mcl[i].args[2] = 0;
    }

    mcl[nr_pages-1].args[2] = UVMF_TLB_FLUSH|UVMF_ALL;
    if ( unlikely(HYPERVISOR_multicall(mcl, nr_pages) != 0) )
        BUG();
#endif
    printk("cwc:%s:out\n", __FUNCTION__);
}


/******************************************************************
 * BLOCK-DEVICE SCHEDULER LIST MAINTENANCE
 */

static struct list_head blkio_schedule_list;
static spinlock_t blkio_schedule_list_lock;

static int __on_blkdev_list(blkif_t *blkif)
{
    return blkif->blkdev_list.next != NULL;
}

static void remove_from_blkdev_list(blkif_t *blkif)
{
    unsigned long flags;
    if ( !__on_blkdev_list(blkif) ) return;
    spin_lock_irqsave(&blkio_schedule_list_lock, flags);
    if ( __on_blkdev_list(blkif) )
    {
        list_del(&blkif->blkdev_list);
        blkif->blkdev_list.next = NULL;
        blkif_put(blkif);
    }
    spin_unlock_irqrestore(&blkio_schedule_list_lock, flags);
}

static void add_to_blkdev_list_tail(blkif_t *blkif)
{
    unsigned long flags;
    if ( __on_blkdev_list(blkif) ) return;
    spin_lock_irqsave(&blkio_schedule_list_lock, flags);
    if ( !__on_blkdev_list(blkif) && (blkif->status == CONNECTED) )
    {
        list_add_tail(&blkif->blkdev_list, &blkio_schedule_list);
        blkif_get(blkif);
    }
    spin_unlock_irqrestore(&blkio_schedule_list_lock, flags);
}


/******************************************************************
 * SCHEDULER FUNCTIONS
 */

static DECLARE_WAIT_QUEUE_HEAD(blkio_schedule_wait);

static int blkio_schedule(void *arg)
{
    DECLARE_WAITQUEUE(wq, current);

    blkif_t          *blkif;
    struct list_head *ent;

    daemonize(
#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0)
        "xenblkd"
#endif
        );

    for ( ; ; )
    {
        /* Wait for work to do. */
        add_wait_queue(&blkio_schedule_wait, &wq);
        set_current_state(TASK_INTERRUPTIBLE);
        if ( (NR_PENDING_REQS == MAX_PENDING_REQS) || 
             list_empty(&blkio_schedule_list) )
            schedule();
        __set_current_state(TASK_RUNNING);
        remove_wait_queue(&blkio_schedule_wait, &wq);

        /* Queue up a batch of requests. */
        while ( (NR_PENDING_REQS < MAX_PENDING_REQS) &&
                !list_empty(&blkio_schedule_list) )
        {
            ent = blkio_schedule_list.next;
            blkif = list_entry(ent, blkif_t, blkdev_list);
            blkif_get(blkif);
            remove_from_blkdev_list(blkif);
            if ( do_block_io_op(blkif, BATCH_PER_DOMAIN) )
                add_to_blkdev_list_tail(blkif);
            blkif_put(blkif);
        }

        /* Push the batch through to disc. */
#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,0)
        run_task_queue(&tq_disk);
#else
        flush_plugged_queue();
#endif
    }
}

static void maybe_trigger_blkio_schedule(void)
{
    /*
     * Needed so that two processes, who together make the following predicate
     * true, don't both read stale values and evaluate the predicate
     * incorrectly. Incredibly unlikely to stall the scheduler on x86, but...
     */
    smp_mb();

    if ( (NR_PENDING_REQS < (MAX_PENDING_REQS/2)) &&
         !list_empty(&blkio_schedule_list) )
        wake_up(&blkio_schedule_wait);
}



/******************************************************************
 * COMPLETION CALLBACK -- Called as bh->b_end_io()
 */

static void __end_block_io_op(pending_req_t *pending_req, int uptodate)
{
    unsigned long flags;

    /* An error fails the entire request. */
    if ( !uptodate )
    {
        DPRINTK("Buffer not up-to-date at end of operation\n");
        pending_req->status = BLKIF_RSP_ERROR;
    }

    if ( atomic_dec_and_test(&pending_req->pendcnt) )
    {
        int pending_idx = pending_req - pending_reqs;
        fast_flush_area(pending_idx, pending_req->nr_pages);
        make_response(pending_req->blkif, pending_req->id,
                      pending_req->operation, pending_req->status);
        blkif_put(pending_req->blkif);
        spin_lock_irqsave(&pend_prod_lock, flags);
        pending_ring[MASK_PEND_IDX(pending_prod++)] = pending_idx;
        spin_unlock_irqrestore(&pend_prod_lock, flags);
        maybe_trigger_blkio_schedule();
    }
}

#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,0)
static void end_block_io_op(struct buffer_head *bh, int uptodate)
{
    __end_block_io_op(bh->b_private, uptodate);
    kmem_cache_free(buffer_head_cachep, bh);
}
#else
static int end_block_io_op(struct bio *bio, unsigned int done, int error)
{
    if ( done || error )
        __end_block_io_op(bio->bi_private, (done && !error));
    bio_put(bio);
    return error;
}
#endif


/******************************************************************************
 * NOTIFICATION FROM GUEST OS.
 */

irqreturn_t blkif_be_int(int irq, void *dev_id, struct pt_regs *regs)
{
    blkif_t *blkif = dev_id;
    add_to_blkdev_list_tail(blkif);
    maybe_trigger_blkio_schedule();
    return IRQ_HANDLED;
}



/******************************************************************
 * DOWNWARD CALLS -- These interface with the block-device layer proper.
 */

static int do_block_io_op(blkif_t *blkif, int max_to_do)
{
    blkif_back_ring_t *blk_ring = &blkif->blk_ring;
    blkif_request_t *req;
    RING_IDX i, rp;
    int more_to_do = 0;

    rp = blk_ring->sring->req_prod;
    rmb(); /* Ensure we see queued requests up to 'rp'. */

    for ( i = blk_ring->req_cons; 
         (i != rp) && !RING_REQUEST_CONS_OVERFLOW(blk_ring, i);
          i++ )
    {
        if ( (max_to_do-- == 0) || (NR_PENDING_REQS == MAX_PENDING_REQS) )
        {
            more_to_do = 1;
            break;
        }
        
        req = RING_GET_REQUEST(blk_ring, i);
        switch ( req->operation )
        {
        case BLKIF_OP_READ:
        case BLKIF_OP_WRITE:
            dispatch_rw_block_io(blkif, req);
            break;

        case BLKIF_OP_PROBE:
            dispatch_probe(blkif, req);
            break;

        default:
            DPRINTK("error: unknown block io operation [%d]\n",
                    req->operation);
            make_response(blkif, req->id, req->operation, BLKIF_RSP_ERROR);
            break;
        }
    }

    blk_ring->req_cons = i;
    return more_to_do;
}

static void dispatch_probe(blkif_t *blkif, blkif_request_t *req)
{
    int rsp = BLKIF_RSP_ERROR;
    int pending_idx = pending_ring[MASK_PEND_IDX(pending_cons)];
    printk("cwc:%s:in id:%lu\n", __FUNCTION__, req->id);

    /* We expect one buffer only. */
    if ( unlikely(req->nr_segments != 1) )
        goto out;

    /* Make sure the buffer is page-sized. */
    if ( (blkif_first_sect(req->frame_and_sects[0]) != 0) ||
         (blkif_last_sect(req->frame_and_sects[0]) != 7) )
        goto out;

    printk("cwc:%s:mid\n", __FUNCTION__);
#ifdef CONFIG_XEN_BLKDEV_GRANT
    {
        gnttab_op_t     op;

        op.u.map_grant_ref.host_virt_addr = MMAP_VADDR(pending_idx, 0);
        op.u.map_grant_ref.flags = GNTMAP_host_map;
        op.u.map_grant_ref.ref = blkif_gref_from_fas(req->frame_and_sects[0]);
        op.u.map_grant_ref.dom = blkif->domid;

        if ( unlikely(HYPERVISOR_grant_table_op(
                        GNTTABOP_map_grant_ref, &op, 1)))
            BUG();

        if ( op.u.map_grant_ref.dev_bus_addr == 0 )
            goto out;

        pending_handle(pending_idx, 0) = op.u.map_grant_ref.handle;
    }
#else /* else CONFIG_XEN_BLKDEV_GRANT */

#ifdef CONFIG_XEN_BLKDEV_TAP_BE
    /* Grab the real frontend out of the probe message. */
    if (req->frame_and_sects[1] == BLKTAP_COOKIE) 
        blkif->is_blktap = 1;
#endif


#ifdef CONFIG_XEN_BLKDEV_TAP_BE
    if ( HYPERVISOR_update_va_mapping_otherdomain(
        MMAP_VADDR(pending_idx, 0),
        (pte_t) { (req->frame_and_sects[0] & PAGE_MASK) | __PAGE_KERNEL },
        0, (blkif->is_blktap ? ID_TO_DOM(req->id) : blkif->domid) ) )
        
        goto out;
#else
    if ( HYPERVISOR_update_va_mapping_otherdomain(
        MMAP_VADDR(pending_idx, 0),
        (pte_t) { (req->frame_and_sects[0] & PAGE_MASK) | __PAGE_KERNEL },
        0, blkif->domid) ) 
        
        goto out;
#endif
#endif /* endif CONFIG_XEN_BLKDEV_GRANT */
   
    printk("cwc:%s:vbd_probe\n", __FUNCTION__);
    rsp = vbd_probe(blkif, (vdisk_t *)MMAP_VADDR(pending_idx, 0), 
                    PAGE_SIZE / sizeof(vdisk_t));

 out:
    fast_flush_area(pending_idx, 1);
    make_response(blkif, req->id, req->operation, rsp);
    printk("cwc:%s:out response:%d\n", __FUNCTION__, rsp);
}

static void dispatch_rw_block_io(blkif_t *blkif, blkif_request_t *req)
{
    extern void ll_rw_block(int rw, int nr, struct buffer_head * bhs[]); 
    int operation = (req->operation == BLKIF_OP_WRITE) ? WRITE : READ;
    unsigned long fas = 0;
    int i, pending_idx = pending_ring[MASK_PEND_IDX(pending_cons)];
    pending_req_t *pending_req;
#ifdef CONFIG_XEN_BLKDEV_GRANT
    gnttab_op_t       aop[BLKIF_MAX_SEGMENTS_PER_REQUEST];
#else
    unsigned long remap_prot;
    multicall_entry_t mcl[BLKIF_MAX_SEGMENTS_PER_REQUEST];
#endif
    struct phys_req preq;
    struct { 
        unsigned long buf; unsigned int nsec;
    } seg[BLKIF_MAX_SEGMENTS_PER_REQUEST];
    unsigned int nseg;
#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,0)
    struct buffer_head *bh;
#else
    struct bio *bio = NULL, *biolist[BLKIF_MAX_SEGMENTS_PER_REQUEST];
    int nbio = 0;
    request_queue_t *q;
#endif

    /* Check that number of segments is sane. */
    nseg = req->nr_segments;
    printk("cwc:%s:in nsegs:%u\n", __FUNCTION__, nseg);
    if ( unlikely(nseg == 0) || 
         unlikely(nseg > BLKIF_MAX_SEGMENTS_PER_REQUEST) )
    {
        DPRINTK("Bad number of segments in request (%d)\n", nseg);
        goto bad_descriptor;
    }

    preq.dev           = req->device;
    preq.sector_number = req->sector_number;
    preq.nr_sects      = 0;

#ifdef CONFIG_XEN_BLKDEV_GRANT
    for ( i = 0; i < nseg; i++ )
    {
        fas         = req->frame_and_sects[i];
        seg[i].nsec = blkif_last_sect(fas) - blkif_first_sect(fas) + 1;

        if ( seg[i].nsec <= 0 )
            goto bad_descriptor;
        preq.nr_sects += seg[i].nsec;

        aop[i].u.map_grant_ref.host_virt_addr = MMAP_VADDR(pending_idx, i);

        aop[i].u.map_grant_ref.dom = blkif->domid;
        aop[i].u.map_grant_ref.ref = blkif_gref_from_fas(fas);
        aop[i].u.map_grant_ref.flags = ( GNTMAP_host_map   |
                                       ( ( operation == READ ) ?
                                             0 : GNTMAP_readonly ) );
    }

    if ( unlikely(HYPERVISOR_grant_table_op(
                    GNTTABOP_map_grant_ref, aop, nseg)))
        BUG();
    printk("cwc:%s:post map\n", __FUNCTION__);

    for ( i = 0; i < nseg; i++ )
    {
        if ( unlikely(aop[i].u.map_grant_ref.dev_bus_addr == 0) )
        {
            DPRINTK("invalid buffer -- could not remap it\n");
            fast_flush_area(pending_idx, nseg);
            goto bad_descriptor;
        }

        phys_to_machine_mapping[__pa(MMAP_VADDR(pending_idx, i))>>PAGE_SHIFT] =
            FOREIGN_FRAME(aop[i].u.map_grant_ref.dev_bus_addr);

        pending_handle(pending_idx, i) = aop[i].u.map_grant_ref.handle;
    }
#endif
    printk("cwc:%s:pre populate segs\n", __FUNCTION__);

    for ( i = 0; i < nseg; i++ )
    {
#ifdef CONFIG_XEN_BLKDEV_GRANT
        seg[i].buf  = (aop[i].u.map_grant_ref.dev_bus_addr << PAGE_SHIFT) |
                      (blkif_first_sect(fas) << 9);
#else
        fas          = req->frame_and_sects[i];
        seg[i].buf  = (fas & PAGE_MASK) | (blkif_first_sect(fas) << 9);
        seg[i].nsec = blkif_last_sect(fas) - blkif_first_sect(fas) + 1;
        if ( seg[i].nsec <= 0 )
            goto bad_descriptor;
        preq.nr_sects += seg[i].nsec;
#endif
     	printk("cwc:%s: buf:%lx populated %u sects\n",
               __FUNCTION__, seg[i].buf, seg[i].nsec);
    }

    if ( vbd_translate(&preq, blkif, operation) != 0 )
    {
        DPRINTK("access denied: %s of [%llu,%llu] on dev=%04x\n", 
                operation == READ ? "read" : "write", preq.sector_number,
                preq.sector_number + preq.nr_sects, preq.dev); 
        goto bad_descriptor;
    }

#ifndef CONFIG_XEN_BLKDEV_GRANT
    if ( operation == READ )
        remap_prot = _PAGE_PRESENT|_PAGE_DIRTY|_PAGE_ACCESSED|_PAGE_RW;
    else
        remap_prot = _PAGE_PRESENT|_PAGE_DIRTY|_PAGE_ACCESSED;

    for ( i = 0; i < nseg; i++ )
    {
        mcl[i].op = __HYPERVISOR_update_va_mapping_otherdomain;
        mcl[i].args[0] = MMAP_VADDR(pending_idx, i);
        mcl[i].args[1] = (seg[i].buf & PAGE_MASK) | remap_prot;
        mcl[i].args[2] = 0;
        mcl[i].args[3] = blkif->domid;
#ifdef CONFIG_XEN_BLKDEV_TAP_BE
        if ( blkif->is_blktap )
            mcl[i].args[3] = ID_TO_DOM(req->id);
#endif
        phys_to_machine_mapping[__pa(MMAP_VADDR(pending_idx, i))>>PAGE_SHIFT] =
            FOREIGN_FRAME(seg[i].buf >> PAGE_SHIFT);
    }

    BUG_ON(HYPERVISOR_multicall(mcl, nseg) != 0);

    for ( i = 0; i < nseg; i++ )
    {
        if ( unlikely(mcl[i].args[5] != 0) )
        {
            DPRINTK("invalid buffer -- could not remap it\n");
            fast_flush_area(pending_idx, nseg);
            goto bad_descriptor;
        }
    }
#endif /* end ifndef CONFIG_XEN_BLKDEV_GRANT */

    pending_req = &pending_reqs[pending_idx];
    pending_req->blkif     = blkif;
    pending_req->id        = req->id;
    pending_req->operation = operation;
    pending_req->status    = BLKIF_RSP_OKAY;
    pending_req->nr_pages  = nseg;

#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,0)

    atomic_set(&pending_req->pendcnt, nseg);
    pending_cons++;
    blkif_get(blkif);

    for ( i = 0; i < nseg; i++ )
    {
        bh = kmem_cache_alloc(buffer_head_cachep, GFP_KERNEL);
        if ( unlikely(bh == NULL) )
        {
            __end_block_io_op(pending_req, 0);
            continue;
        }

        memset(bh, 0, sizeof (struct buffer_head));

        init_waitqueue_head(&bh->b_wait);
        bh->b_size          = seg[i].nsec << 9;
        bh->b_dev           = preq.dev;
        bh->b_rdev          = preq.dev;
        bh->b_rsector       = (unsigned long)preq.sector_number;
        bh->b_data          = (char *)MMAP_VADDR(pending_idx, i) +
            (seg[i].buf & ~PAGE_MASK);
        bh->b_page          = virt_to_page(MMAP_VADDR(pending_idx, i));
        bh->b_end_io        = end_block_io_op;
        bh->b_private       = pending_req;

        bh->b_state = (1 << BH_Mapped) | (1 << BH_Lock) | 
            (1 << BH_Req) | (1 << BH_Launder);
        if ( operation == WRITE )
            bh->b_state |= (1 << BH_JBD) | (1 << BH_Req) | (1 << BH_Uptodate);

        atomic_set(&bh->b_count, 1);

        /* Dispatch a single request. We'll flush it to disc later. */
        generic_make_request(operation, bh);

        preq.sector_number += seg[i].nsec;
    }

#else

    for ( i = 0; i < nseg; i++ )
    {
        while ( (bio == NULL) ||
                (bio_add_page(bio,
                              virt_to_page(MMAP_VADDR(pending_idx, i)),
                              seg[i].nsec << 9,
                              seg[i].buf & ~PAGE_MASK) == 0) )
        {
            bio = biolist[nbio++] = bio_alloc(GFP_KERNEL, nseg-i);
            if ( unlikely(bio == NULL) )
            {
                for ( i = 0; i < (nbio-1); i++ )
                    bio_put(biolist[i]);
                fast_flush_area(pending_idx, nseg);
                goto bad_descriptor;
            }
                
            bio->bi_bdev    = preq.bdev;
            bio->bi_private = pending_req;
            bio->bi_end_io  = end_block_io_op;
            bio->bi_sector  = preq.sector_number;
        }

        preq.sector_number += seg[i].nsec;
    }
    printk("cwc:%s: post bio_add_page\n", __FUNCTION__);

    if ( (q = bdev_get_queue(bio->bi_bdev)) != plugged_queue )
    {
        flush_plugged_queue();
        blk_get_queue(q);
        plugged_queue = q;
    }

    atomic_set(&pending_req->pendcnt, nbio);
    pending_cons++;
    blkif_get(blkif);

    for ( i = 0; i < nbio; i++ )
        submit_bio(operation, biolist[i]);

#endif
    printk("cwc:%s: %d bios submitted\n", __FUNCTION__, nbio);

    return;

 bad_descriptor:
    make_response(blkif, req->id, req->operation, BLKIF_RSP_ERROR);
    printk("cwc:%s: %d bad descriptor\n", __FUNCTION__);
} 



/******************************************************************
 * MISCELLANEOUS SETUP / TEARDOWN / DEBUGGING
 */


static void make_response(blkif_t *blkif, unsigned long id, 
                          unsigned short op, int st)
{
    blkif_response_t *resp;
    unsigned long     flags;
    blkif_back_ring_t *blk_ring = &blkif->blk_ring;

    /* Place on the response ring for the relevant domain. */ 
    spin_lock_irqsave(&blkif->blk_ring_lock, flags);
    resp = RING_GET_RESPONSE(blk_ring, blk_ring->rsp_prod_pvt);
    resp->id        = id;
    resp->operation = op;
    resp->status    = st;
    wmb(); /* Ensure other side can see the response fields. */
    blk_ring->rsp_prod_pvt++;
    RING_PUSH_RESPONSES(blk_ring);
    spin_unlock_irqrestore(&blkif->blk_ring_lock, flags);

    /* Kick the relevant domain. */
    notify_via_evtchn(blkif->evtchn);
}

void blkif_deschedule(blkif_t *blkif)
{
    remove_from_blkdev_list(blkif);
}

static int __init blkif_init(void)
{
    int i;

    if ( !(xen_start_info.flags & SIF_INITDOMAIN) &&
         !(xen_start_info.flags & SIF_BLK_BE_DOMAIN) )
        return 0;

    blkif_interface_init();

    if ( (mmap_vstart = allocate_empty_lowmem_region(MMAP_PAGES)) == 0 )
        BUG();

    pending_cons = 0;
    pending_prod = MAX_PENDING_REQS;
    memset(pending_reqs, 0, sizeof(pending_reqs));
    for ( i = 0; i < MAX_PENDING_REQS; i++ )
        pending_ring[i] = i;
    
    spin_lock_init(&blkio_schedule_list_lock);
    INIT_LIST_HEAD(&blkio_schedule_list);

    if ( kernel_thread(blkio_schedule, 0, CLONE_FS | CLONE_FILES) < 0 )
        BUG();

#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,0)
    buffer_head_cachep = kmem_cache_create(
        "buffer_head_cache", sizeof(struct buffer_head),
        0, SLAB_HWCACHE_ALIGN, NULL, NULL);
#endif

    blkif_ctrlif_init();
    
#ifdef CONFIG_XEN_BLKDEV_GRANT
    memset( pending_grant_handles,  BLKBACK_INVALID_HANDLE, MMAP_PAGES );
    printk(KERN_ALERT "Blkif backend is using grant tables.\n");
#endif

#ifdef CONFIG_XEN_BLKDEV_TAP_BE
    printk(KERN_ALERT "NOTE: Blkif backend is running with tap support on!\n");
#endif

    return 0;
}

__initcall(blkif_init);

[-- Warning: decoded text below may be mangled, UTF-8 assumed --]
[-- Attachment #4: blkfront.c --]
[-- Type: text/x-csrc; name="blkfront.c", Size: 43478 bytes --]

/******************************************************************************
 * blkfront.c
 * 
 * XenLinux virtual block-device driver.
 * 
 * Copyright (c) 2003-2004, Keir Fraser & Steve Hand
 * Modifications by Mark A. Williamson are (c) Intel Research Cambridge
 * Copyright (c) 2004, Christian Limpach
 * Copyright (c) 2004, Andrew Warfield
 * Copyright (c) 2005, Christopher Clark
 * 
 * This file may be distributed separately from the Linux kernel, or
 * incorporated into other software packages, subject to the following license:
 * 
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this source file (the "Software"), to deal in the Software without
 * restriction, including without limitation the rights to use, copy, modify,
 * merge, publish, distribute, sublicense, and/or sell copies of the Software,
 * and to permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 * 
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#if 1
#define ASSERT(_p) \
    if ( !(_p) ) { printk("Assertion '%s' failed, line %d, file %s", #_p , \
    __LINE__, __FILE__); *(int*)0=0; }
#else
#define ASSERT(_p)
#endif

#include <linux/version.h>

#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0)
#include "block.h"
#else
#include "common.h"
#include <linux/blk.h>
#include <linux/tqueue.h>
#endif

#include <linux/cdrom.h>
#include <linux/sched.h>
#include <linux/interrupt.h>
#include <scsi/scsi.h>
#include <asm-xen/ctrl_if.h>
#include <asm-xen/evtchn.h>
#ifdef CONFIG_XEN_BLKDEV_GRANT
#include <asm-xen/xen-public/grant_table.h>
#include <asm-xen/gnttab.h>
#endif

typedef unsigned char byte; /* from linux/ide.h */

/* Control whether runtime update of vbds is enabled. */
#define ENABLE_VBD_UPDATE 1

#if ENABLE_VBD_UPDATE
static void vbd_update(void);
#else
static void vbd_update(void){};
#endif

#define BLKIF_STATE_CLOSED       0
#define BLKIF_STATE_DISCONNECTED 1
#define BLKIF_STATE_CONNECTED    2

#define WPRINTK(fmt, args...) printk(KERN_WARNING "xen_blk: " fmt, ##args)

static int blkif_handle = 0;
static unsigned int blkif_state = BLKIF_STATE_CLOSED;
static unsigned int blkif_evtchn = 0;
static unsigned int blkif_irq = 0;

static int blkif_control_rsp_valid;
static blkif_response_t blkif_control_rsp;

static blkif_front_ring_t blk_ring;

#define BLK_RING_SIZE __RING_SIZE((blkif_sring_t *)0, PAGE_SIZE)

#ifdef CONFIG_XEN_BLKDEV_GRANT
static domid_t rdomid = 0;
static grant_ref_t gref_head, gref_terminal;
#define MAXIMUM_OUTSTANDING_BLOCK_REQS \
    (BLKIF_MAX_SEGMENTS_PER_REQUEST * BLKIF_RING_SIZE)
#endif

unsigned long rec_ring_free;
blkif_request_t rec_ring[BLK_RING_SIZE];

static int recovery = 0;           /* "Recovery in progress" flag.  Protected
                                    * by the blkif_io_lock */

static void kick_pending_request_queues(void);

int __init xlblk_init(void);

void blkif_completion( blkif_request_t *req );

static inline int GET_ID_FROM_FREELIST( void )
{
    unsigned long free = rec_ring_free;

    BUG_ON(free > BLK_RING_SIZE);

    rec_ring_free = rec_ring[free].id;

    rec_ring[free].id = 0x0fffffee; /* debug */

    return free;
}

static inline void ADD_ID_TO_FREELIST( unsigned long id )
{
    rec_ring[id].id = rec_ring_free;
    rec_ring_free = id;
}


/************************  COMMON CODE  (inlined)  ************************/

/* Kernel-specific definitions used in the common code */
#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0)
#define DISABLE_SCATTERGATHER() 
#else
static int sg_operation = -1;
#define DISABLE_SCATTERGATHER() (sg_operation = -1)
#endif

static inline void translate_req_to_pfn(blkif_request_t *xreq,
                                        blkif_request_t *req)
{
    int i;

    xreq->operation     = req->operation;
    xreq->nr_segments   = req->nr_segments;
    xreq->device        = req->device;
    /* preserve id */
    xreq->sector_number = req->sector_number;

    for ( i = 0; i < req->nr_segments; i++ )
#ifdef CONFIG_XEN_BLKDEV_GRANT
        xreq->frame_and_sects[i] = req->frame_and_sects[i];
#else
        xreq->frame_and_sects[i] = machine_to_phys(req->frame_and_sects[i]);
#endif
}

static inline void translate_req_to_mfn(blkif_request_t *xreq,
                                        blkif_request_t *req)
{
    int i;

    xreq->operation     = req->operation;
    xreq->nr_segments   = req->nr_segments;
    xreq->device        = req->device;
    xreq->id            = req->id;   /* copy id (unlike above) */
    xreq->sector_number = req->sector_number;

    for ( i = 0; i < req->nr_segments; i++ )
#ifdef CONFIG_XEN_BLKDEV_GRANT
        xreq->frame_and_sects[i] = req->frame_and_sects[i];
#else
        xreq->frame_and_sects[i] = phys_to_machine(req->frame_and_sects[i]);
#endif
}


static inline void flush_requests(void)
{
    DISABLE_SCATTERGATHER();
    RING_PUSH_REQUESTS(&blk_ring);
    notify_via_evtchn(blkif_evtchn);
}




/**************************  KERNEL VERSION 2.6  **************************/

#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0)

module_init(xlblk_init);

#if ENABLE_VBD_UPDATE
static void update_vbds_task(void *unused)
{ 
    xlvbd_update_vbds();
}

static void vbd_update(void)
{
    static DECLARE_WORK(update_tq, update_vbds_task, NULL);
    schedule_work(&update_tq);
}
#endif /* ENABLE_VBD_UPDATE */

static void kick_pending_request_queues(void)
{

    if ( (xlbd_blk_queue != NULL) &&
         test_bit(QUEUE_FLAG_STOPPED, &xlbd_blk_queue->queue_flags) )
    {
        blk_start_queue(xlbd_blk_queue);
        /* XXXcl call to request_fn should not be needed but
         * we get stuck without...  needs investigating
         */
        xlbd_blk_queue->request_fn(xlbd_blk_queue);
    }

}


int blkif_open(struct inode *inode, struct file *filep)
{
    struct gendisk *gd = inode->i_bdev->bd_disk;
    struct xlbd_disk_info *di = (struct xlbd_disk_info *)gd->private_data;
    printk("cwc:%s: called\n", __FUNCTION__);

    /* Update of usage count is protected by per-device semaphore. */
    di->mi->usage++;
    
    return 0;
}


int blkif_release(struct inode *inode, struct file *filep)
{
    struct gendisk *gd = inode->i_bdev->bd_disk;
    struct xlbd_disk_info *di = (struct xlbd_disk_info *)gd->private_data;
    printk("cwc:%s: called\n", __FUNCTION__);

    /*
     * When usage drops to zero it may allow more VBD updates to occur.
     * Update of usage count is protected by a per-device semaphore.
     */
    if (--di->mi->usage == 0) {
        vbd_update();
    }

    return 0;
}


int blkif_ioctl(struct inode *inode, struct file *filep,
                unsigned command, unsigned long argument)
{
    int i;
    printk("cwc:%s: called\n", __FUNCTION__);

    DPRINTK_IOCTL("command: 0x%x, argument: 0x%lx, dev: 0x%04x\n",
                  command, (long)argument, inode->i_rdev); 
  
    switch (command) {

    case HDIO_GETGEO:
        /* return ENOSYS to use defaults */
        return -ENOSYS;

    case CDROMMULTISESSION:
        DPRINTK("FIXME: support multisession CDs later\n");
        for ( i = 0; i < sizeof(struct cdrom_multisession); i++ )
            if ( put_user(0, (byte *)(argument + i)) ) return -EFAULT;
        return 0;

    default:
        printk(KERN_ALERT "ioctl %08x not supported by Xen blkdev\n",
               command);
        return -ENOSYS;
    }

    return 0;
}


/*
 * blkif_queue_request
 *
 * request block io 
 * 
 * id: for guest use only.
 * operation: BLKIF_OP_{READ,WRITE,PROBE}
 * buffer: buffer to read/write into. this should be a
 *   virtual address in the guest os.
 */
static int blkif_queue_request(struct request *req)
{
    struct xlbd_disk_info *di =
        (struct xlbd_disk_info *)req->rq_disk->private_data;
    unsigned long buffer_ma;
    blkif_request_t *ring_req;
    struct bio *bio;
    struct bio_vec *bvec;
    int idx;
    unsigned long id;
    unsigned int fsect, lsect;
#ifdef CONFIG_XEN_BLKDEV_GRANT
    int ref;
#endif
    printk("cwc:%s: in blkif_state:%u\n", __FUNCTION__, blkif_state);

    if ( unlikely(blkif_state != BLKIF_STATE_CONNECTED) )
        return 1;

    /* Fill out a communications ring structure. */
    ring_req = RING_GET_REQUEST(&blk_ring, blk_ring.req_prod_pvt);
    id = GET_ID_FROM_FREELIST();
    rec_ring[id].id = (unsigned long) req;

    ring_req->id = id;
    ring_req->operation = rq_data_dir(req) ? BLKIF_OP_WRITE :
        BLKIF_OP_READ;
    ring_req->sector_number = (blkif_sector_t)req->sector;
    ring_req->device = di->xd_device;

    ring_req->nr_segments = 0;
    rq_for_each_bio(bio, req)
    {
        bio_for_each_segment(bvec, bio, idx)
        {
            if ( ring_req->nr_segments == BLKIF_MAX_SEGMENTS_PER_REQUEST )
                BUG();
            buffer_ma = page_to_phys(bvec->bv_page);
            fsect = bvec->bv_offset >> 9;
            lsect = fsect + (bvec->bv_len >> 9) - 1;
#ifdef CONFIG_XEN_BLKDEV_GRANT
            /* install a grant reference. */
            ref = gnttab_claim_grant_reference(&gref_head, gref_terminal);
            ASSERT( ref != -ENOSPC );

            gnttab_grant_foreign_access_ref(
                        ref,
                        rdomid,
                        buffer_ma >> PAGE_SHIFT,
                        rq_data_dir(req) );

            ring_req->frame_and_sects[ring_req->nr_segments++] =
                (((u32) ref) << 16) | (fsect << 3) | lsect;
#else
            ring_req->frame_and_sects[ring_req->nr_segments++] =
                buffer_ma | (fsect << 3) | lsect;
#endif
        }
    }

    blk_ring.req_prod_pvt++;
    
    /* Keep a private copy so we can reissue requests when recovering. */
    translate_req_to_pfn(&rec_ring[id], ring_req);

    printk("cwc:%s: out\n", __FUNCTION__);
    return 0;
}


/*
 * do_blkif_request
 *  read a block; request is in a request queue
 */
void do_blkif_request(request_queue_t *rq)
{
    struct request *req;
    int queued;
    printk("cwc:%s: called\n", __FUNCTION__);

    DPRINTK("Entered do_blkif_request\n"); 

    queued = 0;

    while ((req = elv_next_request(rq)) != NULL) {
        if (!blk_fs_request(req)) {
            end_request(req, 0);
            continue;
        }

        if ( RING_FULL(&blk_ring) )
        {
            blk_stop_queue(rq);
            break;
        }
        DPRINTK("do_blk_req %p: cmd %p, sec %lx, (%u/%li) buffer:%p [%s]\n",
                req, req->cmd, req->sector, req->current_nr_sectors,
                req->nr_sectors, req->buffer,
                rq_data_dir(req) ? "write" : "read");
        blkdev_dequeue_request(req);
        if (blkif_queue_request(req)) {
            blk_stop_queue(rq);
            break;
        }
        queued++;
    }

    if (queued != 0)
        flush_requests();
}


static irqreturn_t blkif_int(int irq, void *dev_id, struct pt_regs *ptregs)
{
    struct request *req;
    blkif_response_t *bret;
    RING_IDX i, rp;
    unsigned long flags; 
    
    printk("cwc:%s: pre spin_lock_irqsave\n", __FUNCTION__);
    spin_lock_irqsave(&blkif_io_lock, flags);     
    printk("cwc:%s: post spin_lock_irqsave\n", __FUNCTION__);

    if ( unlikely(blkif_state == BLKIF_STATE_CLOSED) || 
         unlikely(recovery) )
    {
        spin_unlock_irqrestore(&blkif_io_lock, flags);
        printk("cwc:%s: spin_unlock_irqsave\n", __FUNCTION__);
        return IRQ_HANDLED;
    }
    
    rp = blk_ring.sring->rsp_prod;
    rmb(); /* Ensure we see queued responses up to 'rp'. */

    for ( i = blk_ring.rsp_cons; i != rp; i++ )
    {
        unsigned long id;

        bret = RING_GET_RESPONSE(&blk_ring, i);
        id = bret->id;
        req = (struct request *)rec_ring[id].id;
        printk("cwc:%s: blkif_completion call id %lu\n", __FUNCTION__, id);
        blkif_completion( &rec_ring[id] );

        ADD_ID_TO_FREELIST(id); /* overwrites req */

        switch ( bret->operation )
        {
        case BLKIF_OP_READ:
        case BLKIF_OP_WRITE:
            if ( unlikely(bret->status != BLKIF_RSP_OKAY) )
                DPRINTK("Bad return from blkdev data request: %x\n",
                        bret->status);
     
            if ( unlikely(end_that_request_first
                          (req, 
                           (bret->status == BLKIF_RSP_OKAY),
                           req->hard_nr_sectors)) )
                BUG();
            end_that_request_last(req);

            break;
        case BLKIF_OP_PROBE:
            memcpy(&blkif_control_rsp, bret, sizeof(*bret));
            blkif_control_rsp_valid = 1;
            break;
        default:
            BUG();
        }
    }
    printk("cwc:%s: post loop\n", __FUNCTION__);

    blk_ring.rsp_cons = i;
    
    kick_pending_request_queues();

    spin_unlock_irqrestore(&blkif_io_lock, flags);
    printk("cwc:%s: out unlock_irq_restore\n", __FUNCTION__);

    return IRQ_HANDLED;
}

#else
/**************************  KERNEL VERSION 2.4  **************************/

static kdev_t        sg_dev;
static unsigned long sg_next_sect;

/*
 * Request queues with outstanding work, but ring is currently full.
 * We need no special lock here, as we always access this with the
 * blkif_io_lock held. We only need a small maximum list.
 */
#define MAX_PENDING 8
static request_queue_t *pending_queues[MAX_PENDING];
static int nr_pending;


#define blkif_io_lock io_request_lock

/*============================================================================*/
#if ENABLE_VBD_UPDATE

/*
 * blkif_update_int/update-vbds_task - handle VBD update events.
 *  Schedule a task for keventd to run, which will update the VBDs and perform 
 *  the corresponding updates to our view of VBD state.
 */
static void update_vbds_task(void *unused)
{ 
    xlvbd_update_vbds();
}

static void vbd_update(void)
{
    static struct tq_struct update_tq;
    update_tq.routine = update_vbds_task;
    schedule_task(&update_tq);
}

#endif /* ENABLE_VBD_UPDATE */
/*============================================================================*/

static void kick_pending_request_queues(void)
{
    /* We kick pending request queues if the ring is reasonably empty. */
    if ( (nr_pending != 0) && 
         (RING_PENDING_REQUESTS(&blk_ring) < (BLK_RING_SIZE >> 1)) )
    {
        /* Attempt to drain the queue, but bail if the ring becomes full. */
        while ( (nr_pending != 0) && !RING_FULL(&blk_ring) )
            do_blkif_request(pending_queues[--nr_pending]);
    }
}

int blkif_open(struct inode *inode, struct file *filep)
{
    short xldev = inode->i_rdev; 
    struct gendisk *gd = get_gendisk(xldev);
    xl_disk_t *disk = xldev_to_xldisk(inode->i_rdev);
    short minor = MINOR(xldev); 

    if ( gd->part[minor].nr_sects == 0 )
    { 
        /*
         * Device either doesn't exist, or has zero capacity; we use a few
         * cheesy heuristics to return the relevant error code
         */
        if ( (gd->sizes[minor >> gd->minor_shift] != 0) ||
             ((minor & (gd->max_p - 1)) != 0) )
        { 
            /*
             * We have a real device, but no such partition, or we just have a
             * partition number so guess this is the problem.
             */
            return -ENXIO;     /* no such device or address */
        }
        else if ( gd->flags[minor >> gd->minor_shift] & GENHD_FL_REMOVABLE )
        {
            /* This is a removable device => assume that media is missing. */ 
            return -ENOMEDIUM; /* media not present (this is a guess) */
        } 
        else
        { 
            /* Just go for the general 'no such device' error. */
            return -ENODEV;    /* no such device */
        }
    }
    
    /* Update of usage count is protected by per-device semaphore. */
    disk->usage++;

    return 0;
}


int blkif_release(struct inode *inode, struct file *filep)
{
    xl_disk_t *disk = xldev_to_xldisk(inode->i_rdev);

    /*
     * When usage drops to zero it may allow more VBD updates to occur.
     * Update of usage count is protected by a per-device semaphore.
     */
    if ( --disk->usage == 0 ) {
        vbd_update();
    }

    return 0;
}


int blkif_ioctl(struct inode *inode, struct file *filep,
                unsigned command, unsigned long argument)
{
    kdev_t dev = inode->i_rdev;
    struct hd_geometry *geo = (struct hd_geometry *)argument;
    struct gendisk *gd;     
    struct hd_struct *part; 
    int i;
    unsigned short cylinders;
    byte heads, sectors;

    /* NB. No need to check permissions. That is done for us. */
    
    DPRINTK_IOCTL("command: 0x%x, argument: 0x%lx, dev: 0x%04x\n",
                  command, (long) argument, dev); 
  
    gd = get_gendisk(dev);
    part = &gd->part[MINOR(dev)]; 

    switch ( command )
    {
    case BLKGETSIZE:
        DPRINTK_IOCTL("   BLKGETSIZE: %x %lx\n", BLKGETSIZE, part->nr_sects); 
        return put_user(part->nr_sects, (unsigned long *) argument);

    case BLKGETSIZE64:
        DPRINTK_IOCTL("   BLKGETSIZE64: %x %llx\n", BLKGETSIZE64,
                      (u64)part->nr_sects * 512);
        return put_user((u64)part->nr_sects * 512, (u64 *) argument);

    case BLKRRPART:                               /* re-read partition table */
        DPRINTK_IOCTL("   BLKRRPART: %x\n", BLKRRPART);
        return blkif_revalidate(dev);

    case BLKSSZGET:
        return hardsect_size[MAJOR(dev)][MINOR(dev)]; 

    case BLKBSZGET:                                        /* get block size */
        DPRINTK_IOCTL("   BLKBSZGET: %x\n", BLKBSZGET);
        break;

    case BLKBSZSET:                                        /* set block size */
        DPRINTK_IOCTL("   BLKBSZSET: %x\n", BLKBSZSET);
        break;

    case BLKRASET:                                         /* set read-ahead */
        DPRINTK_IOCTL("   BLKRASET: %x\n", BLKRASET);
        break;

    case BLKRAGET:                                         /* get read-ahead */
        DPRINTK_IOCTL("   BLKRAFET: %x\n", BLKRAGET);
        break;

    case HDIO_GETGEO:
        DPRINTK_IOCTL("   HDIO_GETGEO: %x\n", HDIO_GETGEO);
        if (!argument) return -EINVAL;

        /* We don't have real geometry info, but let's at least return
           values consistent with the size of the device */

        heads = 0xff;
        sectors = 0x3f; 
        cylinders = part->nr_sects / (heads * sectors);

        if (put_user(0x00,  (unsigned long *) &geo->start)) return -EFAULT;
        if (put_user(heads,  (byte *)&geo->heads)) return -EFAULT;
        if (put_user(sectors,  (byte *)&geo->sectors)) return -EFAULT;
        if (put_user(cylinders, (unsigned short *)&geo->cylinders)) return -EFAULT;

        return 0;

    case HDIO_GETGEO_BIG: 
        DPRINTK_IOCTL("   HDIO_GETGEO_BIG: %x\n", HDIO_GETGEO_BIG);
        if (!argument) return -EINVAL;

        /* We don't have real geometry info, but let's at least return
           values consistent with the size of the device */

        heads = 0xff;
        sectors = 0x3f; 
        cylinders = part->nr_sects / (heads * sectors);

        if (put_user(0x00,  (unsigned long *) &geo->start))  return -EFAULT;
        if (put_user(heads,  (byte *)&geo->heads))   return -EFAULT;
        if (put_user(sectors,  (byte *)&geo->sectors)) return -EFAULT;
        if (put_user(cylinders, (unsigned int *) &geo->cylinders)) return -EFAULT;

        return 0;

    case CDROMMULTISESSION:
        DPRINTK("FIXME: support multisession CDs later\n");
        for ( i = 0; i < sizeof(struct cdrom_multisession); i++ )
            if ( put_user(0, (byte *)(argument + i)) ) return -EFAULT;
        return 0;

    case SCSI_IOCTL_GET_BUS_NUMBER:
        DPRINTK("FIXME: SCSI_IOCTL_GET_BUS_NUMBER ioctl in XL blkif");
        return -ENOSYS;

    default:
        printk(KERN_ALERT "ioctl %08x not supported by XL blkif\n", command);
        return -ENOSYS;
    }
    
    return 0;
}



/* check media change: should probably do something here in some cases :-) */
int blkif_check(kdev_t dev)
{
    DPRINTK("blkif_check\n");
    return 0;
}

int blkif_revalidate(kdev_t dev)
{
    struct block_device *bd;
    struct gendisk *gd;
    xl_disk_t *disk;
    unsigned long capacity;
    int i, rc = 0;
    
    if ( (bd = bdget(dev)) == NULL )
        return -EINVAL;

    /*
     * Update of partition info, and check of usage count, is protected
     * by the per-block-device semaphore.
     */
    down(&bd->bd_sem);

    if ( ((gd = get_gendisk(dev)) == NULL) ||
         ((disk = xldev_to_xldisk(dev)) == NULL) ||
         ((capacity = gd->part[MINOR(dev)].nr_sects) == 0) )
    {
        rc = -EINVAL;
        goto out;
    }

    if ( disk->usage > 1 )
    {
        rc = -EBUSY;
        goto out;
    }

    /* Only reread partition table if VBDs aren't mapped to partitions. */
    if ( !(gd->flags[MINOR(dev) >> gd->minor_shift] & GENHD_FL_VIRT_PARTNS) )
    {
        for ( i = gd->max_p - 1; i >= 0; i-- )
        {
            invalidate_device(dev+i, 1);
            gd->part[MINOR(dev+i)].start_sect = 0;
            gd->part[MINOR(dev+i)].nr_sects   = 0;
            gd->sizes[MINOR(dev+i)]           = 0;
        }

        grok_partitions(gd, MINOR(dev)>>gd->minor_shift, gd->max_p, capacity);
    }

 out:
    up(&bd->bd_sem);
    bdput(bd);
    return rc;
}


/*
 * blkif_queue_request
 *
 * request block io 
 * 
 * id: for guest use only.
 * operation: BLKIF_OP_{READ,WRITE,PROBE}
 * buffer: buffer to read/write into. this should be a
 *   virtual address in the guest os.
 */
static int blkif_queue_request(unsigned long   id,
                               int             operation,
                               char *          buffer,
                               unsigned long   sector_number,
                               unsigned short  nr_sectors,
                               kdev_t          device)
{
    unsigned long       buffer_ma = virt_to_bus(buffer);
    unsigned long       xid;
    struct gendisk     *gd;
    blkif_request_t    *req;
    struct buffer_head *bh;
    unsigned int        fsect, lsect;
#ifdef CONFIG_XEN_BLKDEV_GRANT
    int ref;
#endif
    printk("cwc:%s: in op:%d\n", __FUNCTION__, operation);

    fsect = (buffer_ma & ~PAGE_MASK) >> 9;
    lsect = fsect + nr_sectors - 1;

    /* Buffer must be sector-aligned. Extent mustn't cross a page boundary. */
    if ( unlikely((buffer_ma & ((1<<9)-1)) != 0) )
        BUG();
    if ( lsect > 7 )
        BUG();

    buffer_ma &= PAGE_MASK;

    if ( unlikely(blkif_state != BLKIF_STATE_CONNECTED) )
        return 1;

    switch ( operation )
    {

    case BLKIF_OP_READ:
    case BLKIF_OP_WRITE:
        gd = get_gendisk(device); 

        /*
         * Update the sector_number we'll pass down as appropriate; note that
         * we could sanity check that resulting sector will be in this
         * partition, but this will happen in driver backend anyhow.
         */
        sector_number += gd->part[MINOR(device)].start_sect;

        /*
         * If this unit doesn't consist of virtual partitions then we clear 
         * the partn bits from the device number.
         */
        if ( !(gd->flags[MINOR(device)>>gd->minor_shift] & 
               GENHD_FL_VIRT_PARTNS) )
            device &= ~(gd->max_p - 1);

        if ( (sg_operation == operation) &&
             (sg_dev == device) &&
             (sg_next_sect == sector_number) )
        {
            printk("cwc:%s: building req\n", __FUNCTION__);
            req = RING_GET_REQUEST(&blk_ring, 
                                   blk_ring.req_prod_pvt - 1);
            bh = (struct buffer_head *)id;
     
            bh->b_reqnext = (struct buffer_head *)rec_ring[req->id].id;
     
            rec_ring[req->id].id = id;
                                                                                                
#ifdef CONFIG_XEN_BLKDEV_GRANT
            /* install a grant reference. */
            ref = gnttab_claim_grant_reference(&gref_head, gref_terminal);
            ASSERT( ref != -ENOSPC );

            gnttab_grant_foreign_access_ref(
                        ref,
                        rdomid,
                        buffer_ma >> PAGE_SHIFT,
                        ( operation == BLKIF_OP_WRITE ? 1 : 0 ) );

            req->frame_and_sects[req->nr_segments] =
                (((u32) ref ) << 16) | (fsect << 3) | lsect;
#else
            req->frame_and_sects[req->nr_segments] =
                buffer_ma | (fsect << 3) | lsect;
#endif
            if ( ++req->nr_segments < BLKIF_MAX_SEGMENTS_PER_REQUEST )
                sg_next_sect += nr_sectors;
            else
                DISABLE_SCATTERGATHER();

            /* Update the copy of the request in the recovery ring. */
            translate_req_to_pfn(&rec_ring[req->id], req );

            printk("cwc:%s: access granted to %lx\n", __FUNCTION__, buffer_ma);
            return 0;
        }
        else if ( RING_FULL(&blk_ring) )
        {
            printk("cwc:%s: ring full\n", __FUNCTION__);
            return 1;
        }
        else
        {
            sg_operation = operation;
            sg_dev       = device;
            sg_next_sect = sector_number + nr_sectors;
        }
        break;

    default:
        {
        printk("cwc:%s: pre panic for unknown op %d\n", __FUNCTION__, operation);
        panic("unknown op %d\n", operation);
        }
    }
    printk("cwc:%s: mid\n", __FUNCTION__);

    /* Fill out a communications ring structure. */
    req = RING_GET_REQUEST(&blk_ring, blk_ring.req_prod_pvt);

    xid = GET_ID_FROM_FREELIST();
    rec_ring[xid].id = id;

    req->id            = xid;
    req->operation     = operation;
    req->sector_number = (blkif_sector_t)sector_number;
    req->device        = device; 
    req->nr_segments   = 1;
#ifdef CONFIG_XEN_BLKDEV_GRANT
    /* install a grant reference. */
    ref = gnttab_claim_grant_reference(&gref_head, gref_terminal);
    ASSERT( ref != -ENOSPC );

    gnttab_grant_foreign_access_ref(
                ref,
                rdomid,
                buffer_ma >> PAGE_SHIFT,
                ( operation == BLKIF_OP_WRITE ? 1 : 0 ) );

    req->frame_and_sects[0] = (((u32) ref)<<16)  | (fsect<<3) | lsect;
#else
    req->frame_and_sects[0] = buffer_ma | (fsect<<3) | lsect;
#endif

    /* Keep a private copy so we can reissue requests when recovering. */    
    translate_req_to_pfn(&rec_ring[xid], req );

    blk_ring.req_prod_pvt++;
    
    printk("cwc:%s: out buf:%lx\n", __FUNCTION__, buffer_ma);
    return 0;
}


/*
 * do_blkif_request
 *  read a block; request is in a request queue
 */
void do_blkif_request(request_queue_t *rq)
{
    struct request *req;
    struct buffer_head *bh, *next_bh;
    int rw, nsect, full, queued = 0;

    DPRINTK("Entered do_blkif_request\n"); 

    while ( !rq->plugged && !list_empty(&rq->queue_head))
    {
        if ( (req = blkdev_entry_next_request(&rq->queue_head)) == NULL ) 
            goto out;
  
        DPRINTK("do_blkif_request %p: cmd %i, sec %lx, (%li/%li) bh:%p\n",
                req, req->cmd, req->sector,
                req->current_nr_sectors, req->nr_sectors, req->bh);

        rw = req->cmd;
        if ( rw == READA )
            rw = READ;
        if ( unlikely((rw != READ) && (rw != WRITE)) )
            panic("XenoLinux Virtual Block Device: bad cmd: %d\n", rw);

        req->errors = 0;

        bh = req->bh;
        while ( bh != NULL )
        {
            next_bh = bh->b_reqnext;
            bh->b_reqnext = NULL;

            full = blkif_queue_request(
                (unsigned long)bh,
                (rw == READ) ? BLKIF_OP_READ : BLKIF_OP_WRITE, 
                bh->b_data, bh->b_rsector, bh->b_size>>9, bh->b_rdev);

            if ( full )
            { 
                bh->b_reqnext = next_bh;
                pending_queues[nr_pending++] = rq;
                if ( unlikely(nr_pending >= MAX_PENDING) )
                    BUG();
                goto out; 
            }

            queued++;

            /* Dequeue the buffer head from the request. */
            nsect = bh->b_size >> 9;
            bh = req->bh = next_bh;
            
            if ( bh != NULL )
            {
                /* There's another buffer head to do. Update the request. */
                req->hard_sector += nsect;
                req->hard_nr_sectors -= nsect;
                req->sector = req->hard_sector;
                req->nr_sectors = req->hard_nr_sectors;
                req->current_nr_sectors = bh->b_size >> 9;
                req->buffer = bh->b_data;
            }
            else
            {
                /* That was the last buffer head. Finalise the request. */
                if ( unlikely(end_that_request_first(req, 1, "XenBlk")) )
                    BUG();
                blkdev_dequeue_request(req);
                end_that_request_last(req);
            }
        }
    }

 out:
    if ( queued != 0 )
        flush_requests();
}


static void blkif_int(int irq, void *dev_id, struct pt_regs *ptregs)
{
    RING_IDX i, rp; 
    unsigned long flags; 
    struct buffer_head *bh, *next_bh;
    
    spin_lock_irqsave(&io_request_lock, flags);     

    if ( unlikely(blkif_state == BLKIF_STATE_CLOSED || recovery) )
    {
        spin_unlock_irqrestore(&io_request_lock, flags);
        return;
    }

    rp = blk_ring.sring->rsp_prod;
    rmb(); /* Ensure we see queued responses up to 'rp'. */

    for ( i = blk_ring.rsp_cons; i != rp; i++ )
    {
        unsigned long id;
        blkif_response_t *bret;
        
        bret = RING_GET_RESPONSE(&blk_ring, i);
        id = bret->id;
        bh = (struct buffer_head *)rec_ring[id].id; 

        blkif_completion( &rec_ring[id] );

        ADD_ID_TO_FREELIST(id);

        switch ( bret->operation )
        {
        case BLKIF_OP_READ:
        case BLKIF_OP_WRITE:
            if ( unlikely(bret->status != BLKIF_RSP_OKAY) )
                DPRINTK("Bad return from blkdev data request: %lx\n",
                        bret->status);
            for ( ; bh != NULL; bh = next_bh )
            {
                next_bh = bh->b_reqnext;
                bh->b_reqnext = NULL;
                bh->b_end_io(bh, bret->status == BLKIF_RSP_OKAY);
            }

            break;
        case BLKIF_OP_PROBE:
            memcpy(&blkif_control_rsp, bret, sizeof(*bret));
            blkif_control_rsp_valid = 1;
            break;
        default:
            BUG();
        }

    }
    blk_ring.rsp_cons = i;
    
    kick_pending_request_queues();

    spin_unlock_irqrestore(&io_request_lock, flags);
}

#endif

/*****************************  COMMON CODE  *******************************/

#ifdef CONFIG_XEN_BLKDEV_GRANT
void blkif_control_probe_send(blkif_request_t *req, blkif_response_t *rsp,
                              unsigned long address)
{
    int ref = gnttab_claim_grant_reference(&gref_head, gref_terminal);
    ASSERT( ref != -ENOSPC );

    gnttab_grant_foreign_access_ref( ref, rdomid, address >> PAGE_SHIFT, 0 );
    printk("cwc:%s: grant access to buf:%lx\n", __FUNCTION__, address >> PAGE_SHIFT);

    req->frame_and_sects[0] = (((u32) ref) << 16) | 7;

    blkif_control_send(req, rsp);
}
#endif

void blkif_control_send(blkif_request_t *req, blkif_response_t *rsp)
{
    unsigned long flags, id;
    blkif_request_t *req_d;
    printk("cwc:%s: in\n", __FUNCTION__);

 retry:
    while ( RING_FULL(&blk_ring) )
    {
        set_current_state(TASK_INTERRUPTIBLE);
        schedule_timeout(1);
    }

    printk("cwc:%s: pre spin_lock_irqsave\n", __FUNCTION__);
    spin_lock_irqsave(&blkif_io_lock, flags);
    printk("cwc:%s: post spin_lock_irqsave\n", __FUNCTION__);
    if ( RING_FULL(&blk_ring) )
    {
        printk("cwc:%s: spin_unlock_irqrestore\n", __FUNCTION__);
        spin_unlock_irqrestore(&blkif_io_lock, flags);
        goto retry;
    }

    DISABLE_SCATTERGATHER();
    req_d = RING_GET_REQUEST(&blk_ring, blk_ring.req_prod_pvt);
    *req_d = *req;    

    id = GET_ID_FROM_FREELIST();
    req_d->id = id;
    rec_ring[id].id = (unsigned long) req;

    translate_req_to_pfn( &rec_ring[id], req );

    blk_ring.req_prod_pvt++;
    flush_requests();

    printk("cwc:%s: spin_unlock_irqrestore 2\n", __FUNCTION__);
    spin_unlock_irqrestore(&blkif_io_lock, flags);

    while ( !blkif_control_rsp_valid )
    {
        set_current_state(TASK_INTERRUPTIBLE);
        schedule_timeout(1);
    }

    memcpy(rsp, &blkif_control_rsp, sizeof(*rsp));
    blkif_control_rsp_valid = 0;
    printk("cwc:%s: out\n", __FUNCTION__);
}


/* Send a driver status notification to the domain controller. */
static void send_driver_status(int ok)
{
    ctrl_msg_t cmsg = {
        .type    = CMSG_BLKIF_FE,
        .subtype = CMSG_BLKIF_FE_DRIVER_STATUS,
        .length  = sizeof(blkif_fe_driver_status_t),
    };
    blkif_fe_driver_status_t *msg = (void*)cmsg.msg;
    
    msg->status = (ok ? BLKIF_DRIVER_STATUS_UP : BLKIF_DRIVER_STATUS_DOWN);

    printk("cwc:%s: ctrl_if_send_message_block\n", __FUNCTION__);
    ctrl_if_send_message_block(&cmsg, NULL, 0, TASK_UNINTERRUPTIBLE);
}

/* Tell the controller to bring up the interface. */
static void blkif_send_interface_connect(void)
{
    ctrl_msg_t cmsg = {
        .type    = CMSG_BLKIF_FE,
        .subtype = CMSG_BLKIF_FE_INTERFACE_CONNECT,
        .length  = sizeof(blkif_fe_interface_connect_t),
    };
    blkif_fe_interface_connect_t *msg = (void*)cmsg.msg;
    
    msg->handle      = 0;
    msg->shmem_frame = (virt_to_machine(blk_ring.sring) >> PAGE_SHIFT);
    
    printk("cwc:%s: ctrl_if_send_message_block\n", __FUNCTION__);
    ctrl_if_send_message_block(&cmsg, NULL, 0, TASK_UNINTERRUPTIBLE);
}

static void blkif_free(void)
{
    /* Prevent new requests being issued until we fix things up. */
    printk("cwc:%s: pre spin_lock_irq\n", __FUNCTION__);
    spin_lock_irq(&blkif_io_lock);
    printk("cwc:%s: post spin_lock_irq\n", __FUNCTION__);
    recovery = 1;
    blkif_state = BLKIF_STATE_DISCONNECTED;
    spin_unlock_irq(&blkif_io_lock);
    printk("cwc:%s: spin_unlock_irq\n", __FUNCTION__);

    /* Free resources associated with old device channel. */
    if ( blk_ring.sring != NULL )
    {
        free_page((unsigned long)blk_ring.sring);
        blk_ring.sring = NULL;
    }
    free_irq(blkif_irq, NULL);
    blkif_irq = 0;
    
    unbind_evtchn_from_irq(blkif_evtchn);
    blkif_evtchn = 0;
    printk("cwc:%s: out\n", __FUNCTION__);
}

static void blkif_close(void)
{
    printk("cwc:%s: called\n", __FUNCTION__);
}

/* Move from CLOSED to DISCONNECTED state. */
static void blkif_disconnect(void)
{
    blkif_sring_t *sring;
    printk("cwc:%s: called\n", __FUNCTION__);
    
    if ( blk_ring.sring != NULL )
        free_page((unsigned long)blk_ring.sring);
    
    sring = (blkif_sring_t *)__get_free_page(GFP_KERNEL);
    SHARED_RING_INIT(sring);
    FRONT_RING_INIT(&blk_ring, sring, PAGE_SIZE);
    blkif_state  = BLKIF_STATE_DISCONNECTED;
    blkif_send_interface_connect();
}

static void blkif_reset(void)
{
    printk("cwc:%s: called\n", __FUNCTION__);
    blkif_free();
    blkif_disconnect();
}

static void blkif_recover(void)
{
    int i;
    blkif_request_t *req;
    printk("cwc:%s: in\n", __FUNCTION__);

    /* Hmm, requests might be re-ordered when we re-issue them.
     * This will need to be fixed once we have barriers */

    /* Stage 1 : Find active and move to safety. */
    for ( i = 0; i < BLK_RING_SIZE; i++ )
    {
        if ( rec_ring[i].id >= PAGE_OFFSET )
        {
            req = RING_GET_REQUEST(&blk_ring, 
                                   blk_ring.req_prod_pvt);
            translate_req_to_mfn(req, &rec_ring[i]);
            blk_ring.req_prod_pvt++;
        }
    }
    printk("cwc:%s: post 1\n", __FUNCTION__);

    /* Stage 2 : Set up shadow list. */
    for ( i = 0; i < blk_ring.req_prod_pvt; i++ ) 
    {
        req = RING_GET_REQUEST(&blk_ring, i);
        rec_ring[i].id = req->id;  
        req->id = i;
        translate_req_to_pfn(&rec_ring[i], req);
    }
    printk("cwc:%s: post 2\n", __FUNCTION__);

    /* Stage 3 : Set up free list. */
    for ( ; i < BLK_RING_SIZE; i++ )
        rec_ring[i].id = i+1;
    rec_ring_free = blk_ring.req_prod_pvt;
    rec_ring[BLK_RING_SIZE-1].id = 0x0fffffff;
    printk("cwc:%s: post 3\n", __FUNCTION__);

    /* blk_ring->req_prod will be set when we flush_requests().*/
    wmb();

    /* Switch off recovery mode, using a memory barrier to ensure that
     * it's seen before we flush requests - we don't want to miss any
     * interrupts. */
    recovery = 0;
    wmb();

    /* Kicks things back into life. */
    flush_requests();

    /* Now safe to left other peope use interface. */
    blkif_state = BLKIF_STATE_CONNECTED;
    printk("cwc:%s: out\n", __FUNCTION__);
}

static void blkif_connect(blkif_fe_interface_status_t *status)
{
    int err = 0;

    blkif_evtchn = status->evtchn;
    blkif_irq    = bind_evtchn_to_irq(blkif_evtchn);
#ifdef CONFIG_XEN_BLKDEV_GRANT
    rdomid       = status->domid;
#endif
    printk("cwc:%s: in\n", __FUNCTION__);

    err = request_irq(blkif_irq, blkif_int, SA_SAMPLE_RANDOM, "blkif", NULL);
    if ( err )
    {
        printk(KERN_ALERT "xen_blk: request_irq failed (err=%d)\n", err);
        return;
    }

    if ( recovery ) 
    {
        printk("cwc:%s: recovery\n", __FUNCTION__);
        blkif_recover();
    } 
    else 
    {
        /* Transition to connected in case we need to do 
         *  a partition probe on a whole disk. */
        blkif_state = BLKIF_STATE_CONNECTED;
        printk("cwc:%s: connected\n", __FUNCTION__);
        
        /* Probe for discs attached to the interface. */
        xlvbd_init();
    }
    
    /* Kick pending requests. */
    printk("cwc:%s: pre spin_lock_irq\n", __FUNCTION__);
    spin_lock_irq(&blkif_io_lock);
    printk("cwc:%s: post spin_lock_irq\n", __FUNCTION__);
    kick_pending_request_queues();
    spin_unlock_irq(&blkif_io_lock);
    printk("cwc:%s: spin_unlock_irq\n", __FUNCTION__);
}

static void unexpected(blkif_fe_interface_status_t *status)
{
    DPRINTK(" Unexpected blkif status %u in state %u\n", 
            status->status, blkif_state);
}

static void blkif_status(blkif_fe_interface_status_t *status)
{
    if ( status->handle != blkif_handle )
    {
        WPRINTK(" Invalid blkif: handle=%u\n", status->handle);
        unexpected(status);
        return;
    }
    printk("cwc:%s: status: %u\n", __FUNCTION__, status->status);

    switch ( status->status ) 
    {
    case BLKIF_INTERFACE_STATUS_CLOSED:
        switch ( blkif_state )
        {
        case BLKIF_STATE_CLOSED:
            unexpected(status);
            break;
        case BLKIF_STATE_DISCONNECTED:
        case BLKIF_STATE_CONNECTED:
            unexpected(status);
            blkif_close();
            break;
        }
        break;

    case BLKIF_INTERFACE_STATUS_DISCONNECTED:
        switch ( blkif_state )
        {
        case BLKIF_STATE_CLOSED:
            blkif_disconnect();
            break;
        case BLKIF_STATE_DISCONNECTED:
        case BLKIF_STATE_CONNECTED:
            /* unexpected(status); */ /* occurs during suspend/resume */
            blkif_reset();
            break;
        }
        break;

    case BLKIF_INTERFACE_STATUS_CONNECTED:
        switch ( blkif_state )
        {
        case BLKIF_STATE_CLOSED:
            unexpected(status);
            blkif_disconnect();
            blkif_connect(status);
            break;
        case BLKIF_STATE_DISCONNECTED:
            blkif_connect(status);
            break;
        case BLKIF_STATE_CONNECTED:
            unexpected(status);
            blkif_connect(status);
            break;
        }
        break;

    case BLKIF_INTERFACE_STATUS_CHANGED:
        switch ( blkif_state )
        {
        case BLKIF_STATE_CLOSED:
        case BLKIF_STATE_DISCONNECTED:
            unexpected(status);
            break;
        case BLKIF_STATE_CONNECTED:
            vbd_update();
            break;
        }
        break;

    default:
        WPRINTK(" Invalid blkif status: %d\n", status->status);
        break;
    }
}


static void blkif_ctrlif_rx(ctrl_msg_t *msg, unsigned long id)
{
    printk("cwc:%s: called\n", __FUNCTION__);
    switch ( msg->subtype )
    {
    case CMSG_BLKIF_FE_INTERFACE_STATUS:
        blkif_status((blkif_fe_interface_status_t *)
                     &msg->msg[0]);
        break;
    default:
        msg->length = 0;
        break;
    }

    ctrl_if_send_response(msg);
}

int wait_for_blkif(void)
{
    int err = 0;
    int i;
    printk("cwc:%s: in\n", __FUNCTION__);
    send_driver_status(1);

    /*
     * We should read 'nr_interfaces' from response message and wait
     * for notifications before proceeding. For now we assume that we
     * will be notified of exactly one interface.
     */
    for ( i=0; (blkif_state != BLKIF_STATE_CONNECTED) && (i < 10*HZ); i++ )
    {
        set_current_state(TASK_INTERRUPTIBLE);
        schedule_timeout(1);
    }

    if ( blkif_state != BLKIF_STATE_CONNECTED )
    {
        printk(KERN_INFO "xen_blk: Timeout connecting to device!\n");
        err = -ENOSYS;
    }
    printk("cwc:%s: out\n", __FUNCTION__);
    return err;
}

int __init xlblk_init(void)
{
    int i;

#ifdef CONFIG_XEN_BLKDEV_GRANT
    if ( 0 > gnttab_alloc_grant_references( MAXIMUM_OUTSTANDING_BLOCK_REQS,
                                            &gref_head, &gref_terminal ))
        return 1;
    printk(KERN_ALERT "Blkif frontend is using grant tables.\n");
#endif

    if ( (xen_start_info.flags & SIF_INITDOMAIN) ||
         (xen_start_info.flags & SIF_BLK_BE_DOMAIN) )
        return 0;

    printk(KERN_INFO "xen_blk: Initialising virtual block device driver\n");

    rec_ring_free = 0;
    for ( i = 0; i < BLK_RING_SIZE; i++ )
        rec_ring[i].id = i+1;
    rec_ring[BLK_RING_SIZE-1].id = 0x0fffffff;

    (void)ctrl_if_register_receiver(CMSG_BLKIF_FE, blkif_ctrlif_rx,
                                    CALLBACK_IN_BLOCKING_CONTEXT);

    wait_for_blkif();

    return 0;
}

void blkdev_suspend(void)
{
    printk("cwc:%s: called\n", __FUNCTION__);
}

void blkdev_resume(void)
{
    printk("cwc:%s: called\n", __FUNCTION__);
    send_driver_status(1);
}

void blkif_completion(blkif_request_t *req)
{
    int i;
    printk("cwc:%s: in\n", __FUNCTION__);
#ifdef CONFIG_XEN_BLKDEV_GRANT
    grant_ref_t gref;

    for ( i = 0; i < req->nr_segments; i++ )
    {
        gref = blkif_gref_from_fas(req->frame_and_sects[i]);
        gnttab_release_grant_reference(&gref_head, gref);
    }
#else
    /* This is a hack to get the dirty logging bits set */
    switch ( req->operation )
    {
    case BLKIF_OP_READ:
        for ( i = 0; i < req->nr_segments; i++ )
        {
            unsigned long pfn = req->frame_and_sects[i] >> PAGE_SHIFT;
            unsigned long mfn = phys_to_machine_mapping[pfn];
            xen_machphys_update(mfn, pfn);
        }
        break;
    }
#endif
    printk("cwc:%s: out\n", __FUNCTION__);
}

[-- Warning: decoded text below may be mangled, UTF-8 assumed --]
[-- Attachment #5: grant_table.c --]
[-- Type: text/x-csrc; name="grant_table.c", Size: 38468 bytes --]

/******************************************************************************
 * common/grant_table.c
 * 
 * Mechanism for granting foreign access to page frames, and receiving
 * page-ownership transfers.
 * 
 * Copyright (c) 2005 Christopher Clark
 * Copyright (c) 2004 K A Fraser
 * 
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 * 
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 * 
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 */

#define GRANT_DEBUG 1
#define GRANT_DEBUG_VERBOSE 1

#include <xen/config.h>
#include <xen/sched.h>
#include <xen/shadow.h>
#include <xen/mm.h>

#define PIN_FAIL(_lbl, _rc, _f, _a...)   \
    do {                           \
        DPRINTK( _f, ## _a );      \
        rc = (_rc);                \
        goto _lbl;                 \
    } while ( 0 )

static inline int
get_maptrack_handle(
    grant_table_t *t)
{
    unsigned int h;
    if ( unlikely((h = t->maptrack_head) == t->maptrack_limit) )
        return -1;
    t->maptrack_head = t->maptrack[h].ref_and_flags >> MAPTRACK_REF_SHIFT;
    t->map_count++;
    return h;
}

static inline void
put_maptrack_handle(
    grant_table_t *t, int handle)
{
    t->maptrack[handle].ref_and_flags = t->maptrack_head << MAPTRACK_REF_SHIFT;
    t->maptrack_head = handle;
    t->map_count--;
}

static int
__gnttab_activate_grant_ref(
    struct domain          *mapping_d,          /* IN */
    struct exec_domain     *mapping_ed,
    struct domain          *granting_d,
    grant_ref_t             ref,
    u16                     dev_hst_ro_flags,
    unsigned long           host_virt_addr,
    unsigned long          *pframe )            /* OUT */
{
    domid_t               sdom;
    u16                   sflags;
    active_grant_entry_t *act;
    grant_entry_t        *sha;
    s16                   rc = 1;
    unsigned long         frame = 0;
    int                   retries = 0;

    /*
     * Objectives of this function:
     * . Make the record ( granting_d, ref ) active, if not already.
     * . Update shared grant entry of owner, indicating frame is mapped.
     * . Increment the owner act->pin reference counts.
     * . get_page on shared frame if new mapping.
     * . get_page_type if this is first RW mapping of frame.
     * . Add PTE to virtual address space of mapping_d, if necessary.
     * Returns:
     * .  -ve: error
     * .    1: ok
     * .    0: ok and TLB invalidate of host_virt_addr needed.
     *
     * On success, *pframe contains mfn.
     */

    /*
     * We bound the number of times we retry CMPXCHG on memory locations that
     * we share with a guest OS. The reason is that the guest can modify that
     * location at a higher rate than we can read-modify-CMPXCHG, so the guest
     * could cause us to livelock. There are a few cases where it is valid for
     * the guest to race our updates (e.g., to change the GTF_readonly flag),
     * so we allow a few retries before failing.
     */

    act = &granting_d->grant_table->active[ref];
    sha = &granting_d->grant_table->shared[ref];

    printk("cwc:%s: pre spin_lock\n", __FUNCTION__);
    spin_lock(&granting_d->grant_table->lock);
    printk("cwc:%s: post spin_lock\n", __FUNCTION__);

    if ( act->pin == 0 )
    {
        /* CASE 1: Activating a previously inactive entry. */

        sflags = sha->flags;
        sdom   = sha->domid;

        for ( ; ; )
        {
            u32 scombo, prev_scombo, new_scombo;

            if ( unlikely((sflags & GTF_type_mask) != GTF_permit_access) ||
                 unlikely(sdom != mapping_d->id) )
                PIN_FAIL(unlock_out, GNTST_general_error,
                         "Bad flags (%x) or dom (%d). (NB. expected dom %d)\n",
                        sflags, sdom, mapping_d->id);

            /* Merge two 16-bit values into a 32-bit combined update. */
            /* NB. Endianness! */
            prev_scombo = scombo = ((u32)sdom << 16) | (u32)sflags;

            new_scombo = scombo | GTF_reading;
            if ( !(dev_hst_ro_flags & GNTMAP_readonly) )
            {
                new_scombo |= GTF_writing;
                if ( unlikely(sflags & GTF_readonly) )
                    PIN_FAIL(unlock_out, GNTST_general_error,
                             "Attempt to write-pin a r/o grant entry.\n");
            }

            /* NB. prev_scombo is updated in place to seen value. */
            if ( unlikely(cmpxchg_user((u32 *)&sha->flags,
                                       prev_scombo,
                                       new_scombo)) )
                PIN_FAIL(unlock_out, GNTST_general_error,
                         "Fault while modifying shared flags and domid.\n");

            /* Did the combined update work (did we see what we expected?). */
            if ( likely(prev_scombo == scombo) )
                break;

            if ( retries++ == 4 )
                PIN_FAIL(unlock_out, GNTST_general_error,
                         "Shared grant entry is unstable.\n");

            /* Didn't see what we expected. Split out the seen flags & dom. */
            /* NB. Endianness! */
            sflags = (u16)prev_scombo;
            sdom   = (u16)(prev_scombo >> 16);
        }

        /* rmb(); */ /* not on x86 */

        frame = __gpfn_to_mfn_foreign(granting_d, sha->frame);

        if ( unlikely(!pfn_is_ram(frame)) ||
             unlikely(!((dev_hst_ro_flags & GNTMAP_readonly) ?
                        get_page(&frame_table[frame], granting_d) :
                        get_page_and_type(&frame_table[frame], granting_d,
                                          PGT_writable_page))) )
        {
            clear_bit(_GTF_writing, &sha->flags);
            clear_bit(_GTF_reading, &sha->flags);
            PIN_FAIL(unlock_out, GNTST_general_error,
                     "Could not pin the granted frame (%lx)!\n", frame);
        }

        if ( dev_hst_ro_flags & GNTMAP_device_map )
            act->pin += (dev_hst_ro_flags & GNTMAP_readonly) ?
                GNTPIN_devr_inc : GNTPIN_devw_inc;
        if ( dev_hst_ro_flags & GNTMAP_host_map )
            act->pin += (dev_hst_ro_flags & GNTMAP_readonly) ?
                GNTPIN_hstr_inc : GNTPIN_hstw_inc;
        act->domid = sdom;
        act->frame = frame;
    }
    else 
    {
        /* CASE 2: Active modications to an already active entry. */

        /*
         * A cheesy check for possible pin-count overflow.
         * A more accurate check cannot be done with a single comparison.
         */
        if ( (act->pin & 0x80808080U) != 0 )
            PIN_FAIL(unlock_out, ENOSPC, "Risk of counter overflow %08x\n", act->pin);

        frame = act->frame;

        if ( !(dev_hst_ro_flags & GNTMAP_readonly) && 
             !((sflags = sha->flags) & GTF_writing) )
        {
            for ( ; ; )
            {
                u16 prev_sflags;
                
                if ( unlikely(sflags & GTF_readonly) )
                    PIN_FAIL(unlock_out, GNTST_general_error,
                             "Attempt to write-pin a r/o grant entry.\n");

                prev_sflags = sflags;

                /* NB. prev_sflags is updated in place to seen value. */
                if ( unlikely(cmpxchg_user(&sha->flags, prev_sflags, 
                                           prev_sflags | GTF_writing)) )
                    PIN_FAIL(unlock_out, GNTST_general_error,
                         "Fault while modifying shared flags.\n");

                if ( likely(prev_sflags == sflags) )
                    break;

                if ( retries++ == 4 )
                    PIN_FAIL(unlock_out, GNTST_general_error,
                             "Shared grant entry is unstable.\n");

                sflags = prev_sflags;
            }

            if ( unlikely(!get_page_type(&frame_table[frame],
                                         PGT_writable_page)) )
            {
                clear_bit(_GTF_writing, &sha->flags);
                PIN_FAIL(unlock_out, GNTST_general_error,
                         "Attempt to write-pin a unwritable page.\n");
            }
        }

        if ( dev_hst_ro_flags & GNTMAP_device_map )
            act->pin += (dev_hst_ro_flags & GNTMAP_readonly) ? 
                GNTPIN_devr_inc : GNTPIN_devw_inc;
        if ( dev_hst_ro_flags & GNTMAP_host_map )
            act->pin += (dev_hst_ro_flags & GNTMAP_readonly) ?
                GNTPIN_hstr_inc : GNTPIN_hstw_inc;
    }

    /* At this point:
     * act->pin updated to reflect mapping.
     * sha->flags updated to indicate to granting domain mapping done.
     * frame contains the mfn.
     */

    spin_unlock(&granting_d->grant_table->lock);
    printk("cwc:%s: spin_unlock\n", __FUNCTION__);

    if ( (host_virt_addr != 0) && (dev_hst_ro_flags & GNTMAP_host_map) )
    {
        /* Write update into the pagetable
         */

        rc = update_grant_va_mapping( host_virt_addr,
                                (frame << PAGE_SHIFT) | _PAGE_PRESENT  |
                                                        _PAGE_ACCESSED |
                                                        _PAGE_DIRTY    |
                       ((dev_hst_ro_flags & GNTMAP_readonly) ? 0 : _PAGE_RW),
                       mapping_d, mapping_ed );

        /* IMPORTANT: (rc == 0) => must flush / invalidate entry in TLB.
         * This is done in the outer gnttab_map_grant_ref.
         */

        if ( 0 > rc )
        {
            /* Abort. */

            printk("cwc:%s: pre spin_lock 2\n", __FUNCTION__);
            spin_lock(&granting_d->grant_table->lock);
            printk("cwc:%s: post spin_lock 2\n", __FUNCTION__);

            if ( dev_hst_ro_flags & GNTMAP_readonly )
                act->pin -= GNTPIN_hstr_inc;
            else
            {
                act->pin -= GNTPIN_hstw_inc;
                if ( (act->pin & (GNTPIN_hstw_mask|GNTPIN_devw_mask)) == 0 )
                {
                    clear_bit(_GTF_writing, &sha->flags);
                    put_page_type(&frame_table[frame]);
                }
            }
            if ( act->pin == 0 )
            {
                clear_bit(_GTF_reading, &sha->flags);
                put_page(&frame_table[frame]);
            }

            spin_unlock(&granting_d->grant_table->lock);
            printk("cwc:%s: spin_unlock 2\n", __FUNCTION__);
        }

    }
    *pframe = frame;
    printk("cwc:%s: out\n", __FUNCTION__);
    return rc;

 unlock_out:
    spin_unlock(&granting_d->grant_table->lock);
    printk("cwc:%s: spin_unlock out\n", __FUNCTION__);
    return rc;
}

static int
__gnttab_map_grant_ref(
    gnttab_map_grant_ref_t *uop,
    unsigned long *va)
{
    domid_t               dom;
    grant_ref_t           ref;
    struct domain        *ld, *rd;
    struct exec_domain   *led;
    u16                   dev_hst_ro_flags;
    int                   handle;
    unsigned long         frame, host_virt_addr;
    int                   rc;

    /* Returns 0 if TLB flush / invalidate required by caller.
     * va will indicate the address to be invalidated. */

    led = current;
    ld = led->domain;

    /* Bitwise-OR avoids short-circuiting which screws control flow. */
    if ( unlikely(__get_user(dom, &uop->dom) |
                  __get_user(ref, &uop->ref) |
                  __get_user(host_virt_addr, &uop->host_virt_addr) |
                  __get_user(dev_hst_ro_flags, &uop->flags)) )
    {
        DPRINTK("Fault while reading gnttab_map_grant_ref_t.\n");
        return -EFAULT; /* don't set status */
    }


    if ( ((host_virt_addr != 0) || (dev_hst_ro_flags & GNTMAP_host_map) ) &&
         unlikely(!__addr_ok(host_virt_addr)))
    {
        DPRINTK("Bad virtual address (%x) or flags (%x).\n",
                host_virt_addr, dev_hst_ro_flags);
        (void)__put_user(GNTST_bad_virt_addr, &uop->handle);
        return GNTST_bad_gntref;
    }

    if ( unlikely(ref >= NR_GRANT_ENTRIES) ||
         unlikely((dev_hst_ro_flags & (GNTMAP_device_map|GNTMAP_host_map)) ==
0) )
    {
        DPRINTK("Bad ref (%d) or flags (%x).\n", ref, dev_hst_ro_flags);
        (void)__put_user(GNTST_bad_gntref, &uop->handle);
        return GNTST_bad_gntref;
    }

    if ( unlikely((rd = find_domain_by_id(dom)) == NULL) ||
         unlikely(ld == rd) )
    {
        if ( rd != NULL )
            put_domain(rd);
        DPRINTK("Could not find domain %d\n", dom);
        (void)__put_user(GNTST_bad_domain, &uop->handle);
        return GNTST_bad_domain;
    }

    /* get a maptrack handle */
    if ( unlikely((handle = get_maptrack_handle(ld->grant_table)) == -1) )
    {
        int              i;
        grant_mapping_t *new_mt;
        grant_table_t   *lgt      = ld->grant_table;

        /* grow the maptrack table */
        if ( (new_mt = (void *)alloc_xenheap_pages(lgt->maptrack_order + 1)) == NULL )
        {
            put_domain(rd);
            DPRINTK("No more map handles available\n");
            (void)__put_user(GNTST_no_device_space, &uop->handle);
            return GNTST_no_device_space;
        }

        memcpy(new_mt, lgt->maptrack, PAGE_SIZE << lgt->maptrack_order);
        for ( i = lgt->maptrack_limit; i < (lgt->maptrack_limit << 1); i++ )
            new_mt[i].ref_and_flags = (i+1) << MAPTRACK_REF_SHIFT;

        free_xenheap_pages((unsigned long)lgt->maptrack, lgt->maptrack_order);
        lgt->maptrack          = new_mt;
        lgt->maptrack_order   += 1;
        lgt->maptrack_limit  <<= 1;

        printk("Doubled maptrack size\n");
        handle = get_maptrack_handle(ld->grant_table);
    }

#if GRANT_DEBUG_VERBOSE
    DPRINTK("Mapping grant ref (%hu) for domain (%hu) with flags (%x)\n",
            ref, dom, dev_hst_ro_flags);
#endif

    if ( 0 <= ( rc = __gnttab_activate_grant_ref( ld, led, rd, ref,
                                                  dev_hst_ro_flags,
                                                  host_virt_addr, &frame)))
    {
        /* Only make the maptrack live _after_ writing the pte,
         * in case we overwrite the same frame number, causing a
         *  maptrack walk to find it
         */
        ld->grant_table->maptrack[handle].domid = dom;

        ld->grant_table->maptrack[handle].ref_and_flags
            = (ref << MAPTRACK_REF_SHIFT) |
              (dev_hst_ro_flags & MAPTRACK_GNTMAP_MASK);

        (void)__put_user(frame, &uop->dev_bus_addr);

        if ( dev_hst_ro_flags & GNTMAP_host_map )
            *va = host_virt_addr;

        (void)__put_user(handle, &uop->handle);
    }
    else
    {
        (void)__put_user(rc, &uop->handle);
        put_maptrack_handle(ld->grant_table, handle);
    }

    put_domain(rd);
    return rc;
}

static long
gnttab_map_grant_ref(
    gnttab_map_grant_ref_t *uop, unsigned int count)
{
    int i, flush = 0;
    unsigned long va[8];

    for ( i = 0; i < count; i++ )
        if ( __gnttab_map_grant_ref(&uop[i],
             &va[ (flush < 8 ? flush : 0) ]   ) == 0)
            flush++;

    if ( flush != 0 )
    {
        if ( flush <= 8 )
            for ( i = 0; i < flush; i++ )
                flush_tlb_one_mask(current->domain->cpuset, va[i]);
        else 
            local_flush_tlb();
    }

    return 0;
}

static int
__gnttab_unmap_grant_ref(
    gnttab_unmap_grant_ref_t *uop,
    unsigned long *va)
{
    domid_t        dom;
    grant_ref_t    ref;
    u16            handle;
    struct domain *ld, *rd;

    active_grant_entry_t *act;
    grant_entry_t *sha;
    grant_mapping_t *map;
    u16            flags;
    s16            rc = 1;
    unsigned long  frame, virt;

    ld = current->domain;

    /* Bitwise-OR avoids short-circuiting which screws control flow. */
    if ( unlikely(__get_user(virt, &uop->host_virt_addr) |
                  __get_user(frame, &uop->dev_bus_addr) |
                  __get_user(handle, &uop->handle)) )
    {
        DPRINTK("Fault while reading gnttab_unmap_grant_ref_t.\n");
        return -EFAULT; /* don't set status */
    }

    map = &ld->grant_table->maptrack[handle];

    if ( unlikely(handle >= ld->grant_table->maptrack_limit) ||
         unlikely(!(map->ref_and_flags & MAPTRACK_GNTMAP_MASK)) )
    {
        DPRINTK("Bad handle (%d).\n", handle);
        (void)__put_user(GNTST_bad_handle, &uop->status);
        return GNTST_bad_handle;
    }

    dom   = map->domid;
    ref   = map->ref_and_flags >> MAPTRACK_REF_SHIFT;
    flags = map->ref_and_flags & MAPTRACK_GNTMAP_MASK;

    if ( unlikely((rd = find_domain_by_id(dom)) == NULL) ||
         unlikely(ld == rd) )
    {
        if ( rd != NULL )
            put_domain(rd);
        DPRINTK("Could not find domain %d\n", dom);
        (void)__put_user(GNTST_bad_domain, &uop->status);
        return GNTST_bad_domain;
    }
#if GRANT_DEBUG_VERBOSE
    DPRINTK("Unmapping grant ref (%hu) for domain (%hu) with handle (%hu)\n",
            ref, dom, handle);
#endif

    act = &rd->grant_table->active[ref];
    sha = &rd->grant_table->shared[ref];

    printk("cwc:%s: pre spin_lock\n", __FUNCTION__);
    spin_lock(&rd->grant_table->lock);
    printk("cwc:%s: post spin_lock\n", __FUNCTION__);

    if ( frame == 0 )
        frame = act->frame;
    else if ( frame == GNTUNMAP_DEV_FROM_VIRT )
    {
        if ( !( flags & GNTMAP_device_map ) )
            PIN_FAIL(unmap_out, GNTST_bad_dev_addr,
                     "Bad frame number: frame not mapped for device access.\n");
        frame = act->frame;

        /* frame will be unmapped for device access below if virt addr ok */
    }
    else
    {
        if ( unlikely(frame != act->frame) )
            PIN_FAIL(unmap_out, GNTST_general_error,
                     "Bad frame number doesn't match gntref.\n");
        if ( flags & GNTMAP_device_map )
            act->pin -= (flags & GNTMAP_readonly) ? GNTPIN_devr_inc
                                                  : GNTPIN_devw_inc;

        map->ref_and_flags &= ~GNTMAP_device_map;
        (void)__put_user(0, &uop->dev_bus_addr);

        /* frame is now unmapped for device access */
    }

    if ( (virt != 0) &&
         (flags & GNTMAP_host_map) &&
         ((act->pin & (GNTPIN_hstw_mask | GNTPIN_hstr_mask)) > 0))
    {
        l1_pgentry_t   *pl1e;
        unsigned long   _ol1e;

        pl1e = &linear_pg_table[l1_linear_offset(virt)];
                                                                                            
        if ( unlikely(__get_user(_ol1e, (unsigned long *)pl1e) != 0) )
        {
            DPRINTK("Could not find PTE entry for address %x\n", virt);
            rc = -EINVAL;
            goto unmap_out;
        }

        /* check that the virtual address supplied is actually
         * mapped to act->frame.
         */
        if ( unlikely((_ol1e >> PAGE_SHIFT) != frame ))
        {
            DPRINTK("PTE entry %x for address %x doesn't match frame %x\n",
                    _ol1e, virt, frame);
            rc = -EINVAL;
            goto unmap_out;
        }

        printk("cwc:%s: pre pagetable delete\n", __FUNCTION__);
        /* Delete pagetable entry
         */
        if ( unlikely(__put_user(0, (unsigned long *)pl1e)))
        {
            DPRINTK("Cannot delete PTE entry at %x for virtual address %x\n",
                    pl1e, virt);
            rc = -EINVAL;
            goto unmap_out;
        }

        map->ref_and_flags &= ~GNTMAP_host_map;

        act->pin -= (flags & GNTMAP_readonly) ? GNTPIN_hstr_inc
                                              : GNTPIN_hstw_inc;

        if ( frame == GNTUNMAP_DEV_FROM_VIRT )
        {
            act->pin -= (flags & GNTMAP_readonly) ? GNTPIN_devr_inc
                                                  : GNTPIN_devw_inc;

            map->ref_and_flags &= ~GNTMAP_device_map;
            (void)__put_user(0, &uop->dev_bus_addr);
        }

        rc = 0;
        *va = virt;
        printk("cwc:%s: post pagetable delete\n", __FUNCTION__);
    }

    if ( (map->ref_and_flags & (GNTMAP_device_map|GNTMAP_host_map)) == 0)
    {
        map->ref_and_flags = 0;
        put_maptrack_handle(ld->grant_table, handle);
    }

    /* If just unmapped a writable mapping, mark as dirtied */
    if ( unlikely(shadow_mode_log_dirty(rd)) &&
        !( flags & GNTMAP_readonly ) )
         mark_dirty(rd, frame);

    /* If the last writable mapping has been removed, put_page_type */
    if ( ( (act->pin & (GNTPIN_devw_mask|GNTPIN_hstw_mask) ) == 0) &&
         ( !( flags & GNTMAP_readonly ) ) )
    {
        clear_bit(_GTF_writing, &sha->flags);
        put_page_type(&frame_table[frame]);
    }

    if ( act->pin == 0 )
    {
        clear_bit(_GTF_reading, &sha->flags);
        put_page(&frame_table[frame]);
    }

 unmap_out:
    (void)__put_user(rc, &uop->status);
    printk("cwc:%s: out spin_unlock\n", __FUNCTION__);
    spin_unlock(&rd->grant_table->lock);
    put_domain(rd);
    return rc;
}

static long
gnttab_unmap_grant_ref(
    gnttab_unmap_grant_ref_t *uop, unsigned int count)
{
    int i, flush = 0;
    unsigned long va[8];

    for ( i = 0; i < count; i++ )
        if ( __gnttab_unmap_grant_ref(&uop[i],
             &va[ (flush < 8 ? flush : 0) ]   ) == 0)
            flush++;

    if ( flush != 0 )
    {
        if ( flush <= 8 )
            for ( i = 0; i < flush; i++ )
                flush_tlb_one_mask(current->domain->cpuset, va[i]);
        else 
            local_flush_tlb();
    }

    return 0;
}

static long 
gnttab_setup_table(
    gnttab_setup_table_t *uop, unsigned int count)
{
    gnttab_setup_table_t  op;
    struct domain        *d;
    int                   i;

    if ( count != 1 )
        return -EINVAL;

    if ( unlikely(copy_from_user(&op, uop, sizeof(op)) != 0) )
    {
        DPRINTK("Fault while reading gnttab_setup_table_t.\n");
        return -EFAULT;
    }

    if ( unlikely(op.nr_frames > NR_GRANT_FRAMES) )
    {
        DPRINTK("Xen only supports at most %d grant-table frames per domain.\n",
                NR_GRANT_FRAMES);
        (void)put_user(GNTST_general_error, &uop->status);
        return 0;
    }

    if ( op.dom == DOMID_SELF )
    {
        op.dom = current->domain->id;
    }
    else if ( unlikely(!IS_PRIV(current->domain)) )
    {
        (void)put_user(GNTST_permission_denied, &uop->status);
        return 0;
    }

    if ( unlikely((d = find_domain_by_id(op.dom)) == NULL) )
    {
        DPRINTK("Bad domid %d.\n", op.dom);
        (void)put_user(GNTST_bad_domain, &uop->status);
        return 0;
    }

    if ( op.nr_frames <= NR_GRANT_FRAMES )
    {
        ASSERT(d->grant_table != NULL);
        (void)put_user(GNTST_okay, &uop->status);

        for ( i = 0; i < op.nr_frames; i++ )
            (void)put_user( (
                virt_to_phys( (char*)(d->grant_table->shared)+(i*PAGE_SIZE) )
                              >> PAGE_SHIFT ), &uop->frame_list[i]);
    }

    put_domain(d);
    return 0;
}

#if GRANT_DEBUG
static int
gnttab_dump_table(gnttab_dump_table_t *uop)
{
    grant_table_t        *gt;
    gnttab_dump_table_t   op;
    struct domain        *d;
    u32                   shared_mfn;
    active_grant_entry_t *act;
    grant_entry_t         sha_copy;
    grant_mapping_t      *maptrack;
    int                   i;


    if ( unlikely(copy_from_user(&op, uop, sizeof(op)) != 0) )
    {
        DPRINTK("Fault while reading gnttab_dump_table_t.\n");
        return -EFAULT;
    }

    if ( op.dom == DOMID_SELF )
    {
        op.dom = current->domain->id;
    }

    if ( unlikely((d = find_domain_by_id(op.dom)) == NULL) )
    {
        DPRINTK("Bad domid %d.\n", op.dom);
        (void)put_user(GNTST_bad_domain, &uop->status);
        return 0;
    }

    ASSERT(d->grant_table != NULL);
    gt = d->grant_table;
    (void)put_user(GNTST_okay, &uop->status);

    shared_mfn = virt_to_phys(d->grant_table->shared);

    DPRINTK("Grant table for dom (%hu) MFN (%x)\n",
            op.dom, shared_mfn);

    ASSERT(d->grant_table->active != NULL);
    ASSERT(d->grant_table->shared != NULL);
    ASSERT(d->grant_table->maptrack != NULL);

    for ( i = 0; i < NR_GRANT_ENTRIES; i++ )
    {
        sha_copy =  gt->shared[i];

        if ( sha_copy.flags )
        {
            DPRINTK("Grant: dom (%hu) SHARED (%d) flags:(%hx) dom:(%hu) frame:(%lx)\n",
                    op.dom, i, sha_copy.flags, sha_copy.domid, sha_copy.frame);
        }
    }

    spin_lock(&gt->lock);

    for ( i = 0; i < NR_GRANT_ENTRIES; i++ )
    {
        act = &gt->active[i];

        if ( act->pin )
        {
            DPRINTK("Grant: dom (%hu) ACTIVE (%d) pin:(%x) dom:(%hu) frame:(%lx)\n",
                    op.dom, i, act->pin, act->domid, act->frame);
        }
    }

    for ( i = 0; i < gt->maptrack_limit; i++ )
    {
        maptrack = &gt->maptrack[i];

        if ( maptrack->ref_and_flags & MAPTRACK_GNTMAP_MASK )
        {
            DPRINTK("Grant: dom (%hu) MAP (%d) ref:(%hu) flags:(%x) dom:(%hu)\n",
                    op.dom, i,
                    maptrack->ref_and_flags >> MAPTRACK_REF_SHIFT,
                    maptrack->ref_and_flags & MAPTRACK_GNTMAP_MASK,
                    maptrack->domid);
        }
    }

    spin_unlock(&gt->lock);

    put_domain(d);
    return 0;
}
#endif

long 
do_grant_table_op(
    unsigned int cmd, void *uop, unsigned int count)
{
    long rc;

    if ( count > 512 )
        return -EINVAL;

    LOCK_BIGLOCK(current->domain);

    rc = -EFAULT;
    switch ( cmd )
    {
    case GNTTABOP_map_grant_ref:
        if ( unlikely(!array_access_ok(
            VERIFY_WRITE, uop, count, sizeof(gnttab_map_grant_ref_t))) )
            goto out;
        rc = gnttab_map_grant_ref((gnttab_map_grant_ref_t *)uop, count);
        break;
    case GNTTABOP_unmap_grant_ref:
        if ( unlikely(!array_access_ok(
            VERIFY_WRITE, uop, count, sizeof(gnttab_unmap_grant_ref_t))) )
            goto out;
        rc = gnttab_unmap_grant_ref((gnttab_unmap_grant_ref_t *)uop, count);
        break;
    case GNTTABOP_setup_table:
        rc = gnttab_setup_table((gnttab_setup_table_t *)uop, count);
        break;
#if GRANT_DEBUG
    case GNTTABOP_dump_table:
        rc = gnttab_dump_table((gnttab_dump_table_t *)uop);
        break;
#endif
    default:
        rc = -ENOSYS;
        break;
    }

out:
    UNLOCK_BIGLOCK(current->domain);

    return rc;
}

int
gnttab_check_unmap(
    struct domain *rd, struct domain *ld, unsigned long frame, int readonly)
{
    /* Called when put_page is invoked on a page belonging to a foreign domain.
     * Instead of decrementing the frame table ref count, locate the grant
     * table entry, if any, and if found, decrement that count.
     * Called a _lot_ at domain creation because pages mapped by priv domains
     * also traverse this.
     */

    /* Note: if the same frame is mapped multiple times, and then one of
     *       the ptes is overwritten, which maptrack handle gets invalidated?
     * Advice: don't do it.
     */

    unsigned int handle, ref, refcount;
    grant_table_t        *lgt, *rgt;
    active_grant_entry_t *act;
    grant_mapping_t      *map;
    int found = 0;

    lgt = ld->grant_table;

#if GRANT_DEBUG_VERBOSE
    if ( ld->id != 0 )
    {
        DPRINTK("Foreign unref rd(%d) ld(%d) frm(%x) flgs(%x).\n",
                rd->id, ld->id, frame, readonly);
    }
#endif

    /* Fast exit if we're not mapping anything using grant tables */
    if ( lgt->map_count == 0 )
        return 0;

    if ( get_domain(rd) == 0 )
    {
        DPRINTK("gnttab_check_unmap: couldn't get_domain rd(%d)\n", rd->id);
        return 0;
    }

    rgt = rd->grant_table;

    for ( handle = 0; handle < lgt->maptrack_limit; handle++ )
    {
        map = &lgt->maptrack[handle];


        /* cwc22: if multiple grants of the same frame are disallowed,
         * then the readonly check here can be changed to cause an early abort
         * if we've matched on frame, but not on write permission.
         */
        if ( ( map->ref_and_flags & MAPTRACK_GNTMAP_MASK ) &&
             ( readonly ? 1 : (!(map->ref_and_flags & GNTMAP_readonly))))
        {
            ref = (map->ref_and_flags >> MAPTRACK_REF_SHIFT);
            act = &rgt->active[ref];

            printk("cwc:%s: pre spin_lock\n", __FUNCTION__);
            spin_lock(&rgt->lock);
            printk("cwc:%s: post spin_lock\n", __FUNCTION__);

            if ( act->frame != frame )
            {
                spin_unlock(&rgt->lock);
                printk("cwc:%s: not frame spin_unlock\n", __FUNCTION__);
                continue;
            }

            refcount = act->pin & ( readonly ? GNTPIN_hstr_mask
                                             : GNTPIN_hstw_mask );
            if ( refcount == 0 )
            {
                spin_unlock(&rgt->lock);
                printk("cwc:%s: refcount 0 spin_unlock\n", __FUNCTION__);
                continue;
            }

            /* gotcha */
            DPRINTK("Grant unref rd(%d) ld(%d) frm(%x) flgs(%x).\n",
                    rd->id, ld->id, frame, readonly);

            if ( readonly )
                act->pin -= GNTPIN_hstr_inc;
            else
            {
                act->pin -= GNTPIN_hstw_inc;

                /* any more granted writable mappings? */
                if ( (act->pin & (GNTPIN_hstw_mask|GNTPIN_devw_mask)) == 0 )
                {
                    clear_bit(_GTF_writing, &rgt->shared[ref].flags);
                    put_page_type(&frame_table[frame]);
                }
            }

            if ( act->pin == 0 )
            {
                clear_bit(_GTF_reading, &rgt->shared[ref].flags);
                put_page(&frame_table[frame]);
            }
            spin_unlock(&rgt->lock);
            printk("cwc:%s: spin_unlock\n", __FUNCTION__);

            clear_bit(GNTMAP_host_map, &map->ref_and_flags);

            if ( !(map->ref_and_flags & GNTMAP_device_map) )
                put_maptrack_handle(lgt, handle);

            found = 1;
            break;
        }
    }
    put_domain(rd);

    printk("cwc:%s: out found:%d\n", __FUNCTION__, found);
    return found;
}

int 
gnttab_prepare_for_transfer(
    struct domain *rd, struct domain *ld, grant_ref_t ref)
{
    grant_table_t *rgt;
    grant_entry_t *sha;
    domid_t        sdom;
    u16            sflags;
    u32            scombo, prev_scombo;
    int            retries = 0;
    unsigned long  target_pfn;

    DPRINTK("gnttab_prepare_for_transfer rd(%hu) ld(%hu) ref(%hu).\n",
            rd->id, ld->id, ref);

    if ( unlikely((rgt = rd->grant_table) == NULL) ||
         unlikely(ref >= NR_GRANT_ENTRIES) )
    {
        DPRINTK("Dom %d has no g.t., or ref is bad (%d).\n", rd->id, ref);
        return 0;
    }

    printk("cwc:%s: pre spin_lock\n", __FUNCTION__);
    spin_lock(&rgt->lock);
    printk("cwc:%s: post spin_lock\n", __FUNCTION__);

    sha = &rgt->shared[ref];
    
    sflags = sha->flags;
    sdom   = sha->domid;

    printk("cwc:%s: pre cmpxchg loop\n", __FUNCTION__);
    for ( ; ; )
    {
        target_pfn = sha->frame;

        if ( unlikely(target_pfn >= max_page ) )
        {
            DPRINTK("Bad pfn (%x)\n", target_pfn);
            goto fail;
        }

        if ( unlikely(sflags != GTF_accept_transfer) ||
             unlikely(sdom != ld->id) )
        {
            DPRINTK("Bad flags (%x) or dom (%d). (NB. expected dom %d)\n",
                    sflags, sdom, ld->id);
            goto fail;
        }

        /* Merge two 16-bit values into a 32-bit combined update. */
        /* NB. Endianness! */
        prev_scombo = scombo = ((u32)sdom << 16) | (u32)sflags;

        /* NB. prev_scombo is updated in place to seen value. */
        if ( unlikely(cmpxchg_user((u32 *)&sha->flags, prev_scombo, 
                                   prev_scombo | GTF_transfer_committed)) )
        {
            DPRINTK("Fault while modifying shared flags and domid.\n");
            goto fail;
        }

        /* Did the combined update work (did we see what we expected?). */
        if ( likely(prev_scombo == scombo) )
            break;

        if ( retries++ == 4 )
        {
            DPRINTK("Shared grant entry is unstable.\n");
            goto fail;
        }

        /* Didn't see what we expected. Split out the seen flags & dom. */
        /* NB. Endianness! */
        sflags = (u16)prev_scombo;
        sdom   = (u16)(prev_scombo >> 16);
    }

    spin_unlock(&rgt->lock);
    printk("cwc:%s: out spin_unlock 1\n", __FUNCTION__);
    return 1;

 fail:
    spin_unlock(&rgt->lock);
    printk("cwc:%s: out spin_unlock 0\n", __FUNCTION__);
    return 0;
}

void 
gnttab_notify_transfer(
    struct domain *rd, struct domain *ld, grant_ref_t ref, unsigned long frame)
{
    grant_entry_t  *sha;
    unsigned long   pfn;

    DPRINTK("gnttab_notify_transfer rd(%hu) ld(%hu) ref(%hu).\n",
            rd->id, ld->id, ref);

    sha = &rd->grant_table->shared[ref];

    spin_lock(&rd->grant_table->lock);

    pfn = sha->frame;

    if ( unlikely(pfn >= max_page ) )
        DPRINTK("Bad pfn (%x)\n", pfn);
    else
    {
        machine_to_phys_mapping[frame] = pfn;

        if ( unlikely(shadow_mode_log_dirty(ld)))
             mark_dirty(ld, frame);

        if (shadow_mode_translate(ld))
            __phys_to_machine_mapping[pfn] = frame;
    }
    sha->frame = __mfn_to_gpfn(rd, frame);
    sha->domid = rd->id;
    wmb();
    sha->flags = ( GTF_accept_transfer | GTF_transfer_completed );

    spin_unlock(&rd->grant_table->lock);

    return;
}

int 
grant_table_create(
    struct domain *d)
{
    grant_table_t *t;
    int            i;

    if ( (t = xmalloc(grant_table_t)) == NULL )
        goto no_mem;

    /* Simple stuff. */
    memset(t, 0, sizeof(*t));
    spin_lock_init(&t->lock);

    /* Active grant table. */
    if ( (t->active = xmalloc_array(active_grant_entry_t, NR_GRANT_ENTRIES))
         == NULL )
        goto no_mem;
    memset(t->active, 0, sizeof(active_grant_entry_t) * NR_GRANT_ENTRIES);

    /* Tracking of mapped foreign frames table */
    if ( (t->maptrack = (void *)alloc_xenheap_page()) == NULL )
        goto no_mem;
    t->maptrack_order = 0;
    t->maptrack_limit = PAGE_SIZE / sizeof(grant_mapping_t);
    memset(t->maptrack, 0, PAGE_SIZE);
    for ( i = 0; i < t->maptrack_limit; i++ )
        t->maptrack[i].ref_and_flags = (i+1) << MAPTRACK_REF_SHIFT;

    /* Shared grant table. */
    if ( (t->shared = (void *)alloc_xenheap_pages(ORDER_GRANT_FRAMES)) == NULL )
        goto no_mem;
    memset(t->shared, 0, NR_GRANT_FRAMES * PAGE_SIZE);

    for ( i = 0; i < NR_GRANT_FRAMES; i++ )
    {
        SHARE_PFN_WITH_DOMAIN(virt_to_page((char *)(t->shared)+(i*PAGE_SIZE)), d);
        machine_to_phys_mapping[ (virt_to_phys((char*)(t->shared)+(i*PAGE_SIZE))
                                 >> PAGE_SHIFT) ] = INVALID_M2P_ENTRY;
    }

    /* Okay, install the structure. */
    wmb(); /* avoid races with lock-free access to d->grant_table */
    d->grant_table = t;
    return 0;

 no_mem:
    if ( t != NULL )
    {
        if ( t->active != NULL )
            xfree(t->active);
        if ( t->maptrack != NULL )
            free_xenheap_page((unsigned long)t->maptrack);
        xfree(t);
    }
    return -ENOMEM;
}

void
gnttab_release_dev_mappings(grant_table_t *gt)
{
    grant_mapping_t        *map;
    domid_t                 dom;
    grant_ref_t             ref;
    u16                     handle;
    struct domain          *ld, *rd;
    unsigned long           frame;
    active_grant_entry_t   *act;
    grant_entry_t          *sha;

    ld = current->domain;

    for ( handle = 0; handle < gt->maptrack_limit; handle++ )
    {
        map = &gt->maptrack[handle];

        if ( map->ref_and_flags & GNTMAP_device_map )
        {
            dom = map->domid;
            ref = map->ref_and_flags >> MAPTRACK_REF_SHIFT;

            DPRINTK("Grant release (%hu) ref:(%hu) flags:(%x) dom:(%hu)\n",
                    handle, ref,
                    map->ref_and_flags & MAPTRACK_GNTMAP_MASK, dom);

            if ( unlikely((rd = find_domain_by_id(dom)) == NULL) ||
                 unlikely(ld == rd) )
            {
                if ( rd != NULL )
                    put_domain(rd);

                printk(KERN_WARNING "Grant release: Could not find domain %d\n", dom);
                continue;
            }

            act = &rd->grant_table->active[ref];
            sha = &rd->grant_table->shared[ref];

            spin_lock(&rd->grant_table->lock);

            if ( act->pin & (GNTPIN_devw_mask | GNTPIN_devr_mask) )
            {
                frame = act->frame;

                if ( ( (act->pin & GNTPIN_hstw_mask) == 0 ) &&
                     ( (act->pin & GNTPIN_devw_mask) >  0 ) )
                {
                    clear_bit(_GTF_writing, &sha->flags);
                    put_page_type(&frame_table[frame]);
                }

                act->pin &= ~(GNTPIN_devw_mask | GNTPIN_devr_mask);

                if ( act->pin == 0 )
                {
                    clear_bit(_GTF_reading, &sha->flags);
                    map->ref_and_flags = 0;
                    put_page(&frame_table[frame]);
                }
                else
                    map->ref_and_flags &= ~GNTMAP_device_map;
            }

            spin_unlock(&rd->grant_table->lock);

            put_domain(rd);
        }
    }
}


void
grant_table_destroy(
    struct domain *d)
{
    grant_table_t *t;

    if ( (t = d->grant_table) != NULL )
    {
        /* Free memory relating to this grant table. */
        d->grant_table = NULL;
        free_xenheap_pages((unsigned long)t->shared, ORDER_GRANT_FRAMES);
        free_xenheap_page((unsigned long)t->maptrack); //cwc22
        xfree(t->active);
        xfree(t);
    }
}

void
grant_table_init(
    void)
{
    /* Nothing. */
    DPRINTK("Grant table init\n");
}

/*
 * Local variables:
 * mode: C
 * c-set-style: "BSD"
 * c-basic-offset: 4
 * tab-width: 4
 * indent-tabs-mode: nil
 * End:
 */

[-- Attachment #6: Type: text/plain, Size: 138 bytes --]

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xensource.com
http://lists.xensource.com/xen-devel

next prev parent reply	other threads:[~2005-04-18 16:38 UTC|newest]

Thread overview: 42+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2005-04-16  1:32 Unstableness in grant table block drivers Ian Pratt
2005-04-16 15:39 ` Ryan Harper
2005-04-18 16:38   ` Christopher Clark [this message]
2005-04-18 16:45     ` Ryan Harper
2005-04-18 20:51     ` Ryan Harper
2005-04-20 18:52       ` Christopher Clark
2005-04-20 19:37         ` Ryan Harper
2005-04-20 20:04           ` Building unstable on a Debian Opteron box Ray Lanza
2005-04-20 20:29             ` Chris Wright
2005-04-22 18:57           ` Unstableness in grant table block drivers Ryan Harper
2005-04-22 19:06             ` Anthony Liguori
2005-05-12  1:50         ` Xiaofeng Ling
  -- strict thread matches above, loose matches on Subject: below --
2005-04-15 22:30 Ian Pratt
2005-04-16  1:18 ` Christopher Clark
2005-04-16  1:32   ` Christopher Clark
2005-04-16  3:03     ` David Hopwood
2005-04-14 10:24 Ian Pratt
2005-04-14 10:34 ` Ge van Geldorp
     [not found] <E1DLu58-0005lx-KR@host-192-168-0-1-bcn-london>
2005-04-14 10:17 ` Ge van Geldorp
     [not found] <aliguori@us.ibm.com>
2005-04-14  1:18 ` Anthony Liguori
2005-04-14  1:20   ` Steven Hand
2005-04-14  1:46     ` Anthony Liguori
2005-04-14  1:53     ` Kip Macy
2005-04-14  2:27       ` Anthony Liguori
2005-04-14  2:55         ` Kip Macy
2005-04-14  1:16 Ian Pratt
2005-04-14  1:43 ` Anthony Liguori
2005-04-14  2:04   ` Christopher Clark
2005-04-14  2:17     ` Kip Macy
2005-04-14  2:25     ` Anthony Liguori
2005-04-14 15:00     ` Ryan Harper
2005-04-14 16:34       ` Christopher Clark
2005-04-14 17:28         ` Ryan Harper
2005-04-15 18:45           ` Christopher Clark
2005-04-15 21:14             ` Ryan Harper
2005-04-15 21:32               ` Kip Macy
2005-04-15 21:41                 ` Ryan Harper
2005-04-15 21:46                   ` Kip Macy
2005-04-15 21:50                     ` Ryan Harper
2005-04-15 21:52                 ` Ryan Harper
2005-04-15 22:07                   ` Christopher Clark
2005-04-15 22:08                     ` Ryan Harper

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=eab0875405041809385510d0f9@mail.gmail.com \
    --to=christopher.w.clark@gmail.com \
    --cc=aliguori@us.ibm.com \
    --cc=cwc22@cam.ac.uk \
    --cc=ryanh@us.ibm.com \
    --cc=xen-devel@lists.xensource.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.