diff -urN linux-2.5.72/drivers/block/nbd.c linux-2.5.72-new/drivers/block/nbd.c --- linux-2.5.72/drivers/block/nbd.c 2003-06-16 22:19:44.000000000 -0600 +++ linux-2.5.72-new/drivers/block/nbd.c 2003-06-20 21:27:44.650037153 -0600 @@ -6,6 +6,7 @@ * * Copyright 1997-2000 Pavel Machek * Parts copyright 2001 Steven Whitehouse + * Parts copyright 2003 Louis D. Langholtz * * (part of code stolen from loop.c) * @@ -24,10 +25,35 @@ * 01-3-11 Make nbd work with new Linux block layer code. It now supports * plugging like all the other block devices. Also added in MSG_MORE to * reduce number of partial TCP segments sent. - * 01-12-6 Fix deadlock condition by making queue locks independent of + * 01-12-6 Fix deadlock condition by making queue locks independant of * the transmit lock. * 02-10-11 Allow hung xmit to be aborted via SIGKILL & various fixes. * + * 03-05-02 Ported thread patch by which moves + * network I/O into seperate kernel threads so request function no longer + * blocks. + * 03-05-02 Switched to configurable debugging output. + * 03-05-19 Added connection establishment code and new IP:port managing ioctls + * to support tool-less startup and simplified management. + * 03-05-19 Added module parameters to support insertion time configuration + * of various aspects of this driver. + * 03-05-27 Added procfs support for greater runtime monitorability of driver. + * + * 03-06-08 Added session management code to try reconnecting in case of + * connection shutdown. + * 03-06-10 Fixed bug in network read logic that's been there from the + * original 2.5 series nbd driver where data was being read into possibly + * non-contiguous memory using bio_data() call (and caused kernel lockups). + * 03-06-12 Added a default BLOCKING stratedgy on network downtime with a + * non-default NBD_NONBLOCKING flag. This has the net effect of blocking + * I/O when there's only transient problems like a server reboot. 
If used + * in conjunction now with RAID mirroring, transient errors (while they'll + * pause the system) will not nessesitate a complete recopying of the + * server's exported block device which could potentially take much longer + * than a reboot. + * 03-06-13 Implemented NBD_WRITE_NOCHK. + * 03-06-15 Fixed code to report proper size even when using nbd-client. + * * * possible FIXME: make set_sock / set_blksize / set_size / do_it one syscall * why not: would need verify_area and friends, would share yet another @@ -35,89 +61,139 @@ */ #define PARANOIA +#include #include - -#include -#include #include #include #include +#include #include #include #include #include #include #include -#include -#include +#include +#include +#include #include #include #include #include +#include /* for __xchg()... */ +#define atomic_exchange(x,ptr,size) __xchg((x),(ptr),(size)) +#define MAJOR_NR NBD_MAJOR #include +#define USE_ZEROCOPY #define LO_MAGIC 0x68797548 +#define DISCONNECT_NBD 2 -static struct nbd_device nbd_dev[MAX_NBD]; +#define NBD_DEBUG_OPEN 0x0001 +#define NBD_DEBUG_RELEASE 0x0002 +#define NBD_DEBUG_IOCTL 0x0004 +#define NBD_DEBUG_MEDIA 0x0008 +#define NBD_DEBUG_THREADS 0x0010 +#define NBD_DEBUG_SESSION 0x0020 +#define NBD_DEBUG_INIT 0x0040 +#define NBD_DEBUG_EXIT 0x0080 +#define NBD_DEBUG_RX 0x0100 +#define NBD_DEBUG_TX 0x0200 +#define NBD_DEBUG_BLKDEV 0x0400 + +#ifdef NDEBUG +#define dprintk(flags, fmt...) +#else +#define dprintk(flags, fmt...) do { \ + if (debugflags & (flags)) printk(fmt); \ +} while (0) +#endif -static spinlock_t nbd_lock = SPIN_LOCK_UNLOCKED; +#define DEVICE_TO_MINOR(lo) ((int)((lo)-nbd_devs)) -#define DEBUG( s ) -/* #define DEBUG( s ) printk( s ) - */ +# define REQUEST_QUEUE(req) (&(req)->queuelist) +# define REQUEST_QUEUE_NEXT_REQUEST(q) (elv_next_request(q)) +# define REQUEST_CMD(req) ((req)->cmd[0]) +# define DAEMONIZE(fmt...) 
daemonize(fmt) +# define NBD_BYTESIZE(lo) ((lo)->bytesize) +# define NBD_BLKSIZE(lo) ((lo)->blksize) +# define INODE_TO_NBD(i) ((i)->i_bdev->bd_disk->private_data) +# define DEVICE_NAME "nbd" +# define request_queue_lock(q) spin_lock_irq((q)->queue_lock) +# define request_queue_unlock(q) spin_unlock_irq((q)->queue_lock) +# define request_queue_lock_save(q,flags) \ + spin_lock_irqsave((q)->queue_lock, (flags)) +# define request_queue_unlock_restore(q,flags) \ + spin_unlock_irqrestore((q)->queue_lock, (flags)) -static int requests_in; -static int requests_out; +/* + * Private structure declarations... + */ +typedef int (*thread_fn_t)(void *); -static void nbd_end_request(struct request *req) -{ - int uptodate = (req->errors == 0) ? 1 : 0; - request_queue_t *q = req->q; - unsigned long flags; +/* + * Forward function declarations to keep compiler happy... + */ +static void nbd_set_size64(nbd_device_t *lo, u64 size); +static int nbd_thread_start(nbd_device_t *lo, nbd_thread_t *th, thread_fn_t fn); +static int nbd_thread_stop(nbd_device_t *lo, nbd_thread_t *th, int wait); +static int rx_loop(void *data); +static int tx_loop(void *data); +static int nbd_deactivate_sin(nbd_device_t *lo); +static int nbd_activate_sin(nbd_device_t *lo); +static int nbd_do_it(nbd_device_t *lo); +static int nbd_redo_queue(nbd_device_t *lo); +static int nbd_clear_queue(nbd_device_t *lo); -#ifdef PARANOIA - requests_out++; -#endif - spin_lock_irqsave(q->queue_lock, flags); - if (!end_that_request_first(req, uptodate, req->nr_sectors)) { - end_that_request_last(req); - } - spin_unlock_irqrestore(q->queue_lock, flags); -} +/* + * Private global definitions... 
+ */ -static int nbd_open(struct inode *inode, struct file *file) -{ - struct nbd_device *lo = inode->i_bdev->bd_disk->private_data; - lo->refcnt++; - return 0; -} +static nbd_device_t nbd_devs[MAX_NBD]; +static struct request_queue nbd_queue[MAX_NBD]; +static spinlock_t nbd_lock[MAX_NBD]; +static uint32_t request_magic; +static uint32_t reply_magic; +static u64 requests_in; +static u64 requests_out; +static u64 qhandler_loops; +static u64 initial_bytesize = (u64)-512; /* formerly 0x7ffffc00<<10 (~2TB) */ /* - * Send or receive packet. + * Module parameter definitions... + */ +static char *connects[64]; +#ifndef NDEBUG +static unsigned int debugflags = 0; +#endif +static char *initial_size = NULL; +static int initial_blksize_bits = 12; /* formerly 10 (or 1K block sizes) */ +static short default_port = NBD_DEFAULT_PORT; + +/** + * sock_xmit - send or receive a packet. + * @sock: the socket on which to send or receive. + * @send: receive if false, else send. + * @buf: byte address of the packet. + * @size: number of bytes to send or receive. + * @msg_flags: flags passed to sock_sendmsg or sock_recvmsg call. + * + * Returns the last result status: < 0 for particular error, 0 for closed + * and > 0 for success.
*/ -static int nbd_xmit(int send, struct socket *sock, char *buf, int size, int msg_flags) +static int sock_xmit(struct socket *sock, int send, void *buf, int size, + int msg_flags) { mm_segment_t oldfs; int result; struct msghdr msg; struct iovec iov; - unsigned long flags; - sigset_t oldset; oldfs = get_fs(); set_fs(get_ds()); - /* Allow interception of SIGKILL only - * Don't allow other signals to interrupt the transmission */ - spin_lock_irqsave(&current->sighand->siglock, flags); - oldset = current->blocked; - sigfillset(&current->blocked); - sigdelsetmask(&current->blocked, sigmask(SIGKILL)); - recalc_sigpending(); - spin_unlock_irqrestore(&current->sighand->siglock, flags); - do { sock->sk->sk_allocation = GFP_NOIO; @@ -137,481 +213,2177 @@ else result = sock_recvmsg(sock, &msg, size, 0); - if (signal_pending(current)) { - siginfo_t info; - spin_lock_irqsave(&current->sighand->siglock, flags); - printk(KERN_WARNING "NBD (pid %d: %s) got signal %d\n", - current->pid, current->comm, - dequeue_signal(current, &current->blocked, &info)); - spin_unlock_irqrestore(&current->sighand->siglock, flags); - result = -EINTR; + if (result <= 0) break; - } - - if (result <= 0) { -#ifdef PARANOIA - printk(KERN_ERR "NBD: %s - sock=%ld at buf=%ld, size=%d returned %d.\n", - send ?
"send" : "receive", (long) sock, (long) buf, size, result); -#endif - break; - } size -= result; buf += result; } while (size > 0); - spin_lock_irqsave(&current->sighand->siglock, flags); - current->blocked = oldset; - recalc_sigpending(); - spin_unlock_irqrestore(&current->sighand->siglock, flags); - set_fs(oldfs); return result; } -#define FAIL( s ) { printk( KERN_ERR "NBD: " s "(result %d)\n", result ); goto error_out; } - -void nbd_send_req(struct nbd_device *lo, struct request *req) +static inline void wait_for_completion_interuptably(struct completion *x) { - int result, i, flags; - struct nbd_request request; - unsigned long size = req->nr_sectors << 9; - struct socket *sock = lo->sock; - - DEBUG("NBD: sending control, "); - - request.magic = htonl(NBD_REQUEST_MAGIC); - request.type = htonl(nbd_cmd(req)); - request.from = cpu_to_be64( (u64) req->sector << 9); - request.len = htonl(size); - memcpy(request.handle, &req, sizeof(req)); + spin_lock_irq(&x->wait.lock); + if (!x->done) { + DECLARE_WAITQUEUE(wait, current); - down(&lo->tx_lock); - - if (!sock || !lo->sock) { - printk(KERN_ERR "NBD: Attempted sendmsg to closed socket\n"); - goto error_out; + wait.flags |= WQ_FLAG_EXCLUSIVE; + __add_wait_queue_tail(&x->wait, &wait); + do { + __set_current_state(TASK_INTERRUPTIBLE); + spin_unlock_irq(&x->wait.lock); + schedule(); + spin_lock_irq(&x->wait.lock); + } while (!x->done && !signal_pending(current)); + __remove_wait_queue(&x->wait, &wait); } + x->done--; + spin_unlock_irq(&x->wait.lock); +} - result = nbd_xmit(1, sock, (char *) &request, sizeof(request), nbd_cmd(req) == NBD_CMD_WRITE ?
MSG_MORE : 0); - if (result <= 0) - FAIL("Sendmsg failed for control."); +static int wait_for_io_threads(nbd_device_t *lo) +{ + int signaled = 0; + struct task_struct *tsk = current; + DECLARE_WAITQUEUE(wait, tsk); +#ifndef NDEBUG + int minor = DEVICE_TO_MINOR(lo); +#endif - if (nbd_cmd(req) == NBD_CMD_WRITE) { - struct bio *bio; - /* - * we are really probing at internals to determine - * whether to set MSG_MORE or not... - */ - rq_for_each_bio(bio, req) { - struct bio_vec *bvec; - bio_for_each_segment(bvec, bio, i) { - flags = 0; - if ((i < (bio->bi_vcnt - 1)) || bio->bi_next) - flags = MSG_MORE; - DEBUG("data, "); - result = nbd_xmit(1, sock, page_address(bvec->bv_page) + bvec->bv_offset, bvec->bv_len, flags); - if (result <= 0) - FAIL("Send data failed."); - } - } + add_wait_queue(&lo->no_io_waiters, &wait); + if (atomic_read(&lo->num_io_threads) > 0) { + dprintk(NBD_DEBUG_SESSION, "nb%d: %s: going to sleep...\n", + minor, __FUNCTION__); + set_current_state(TASK_INTERRUPTIBLE); + schedule(); + set_current_state(TASK_RUNNING); + if (signal_pending(current)) + signaled = 1; + dprintk(NBD_DEBUG_SESSION, "nb%d: %s: woken up (%s)\n", + minor, __FUNCTION__, signaled? "signaled": "done"); } - up(&lo->tx_lock); - return; + remove_wait_queue(&lo->no_io_waiters, &wait); + return signaled? 0: 1; +} - error_out: - up(&lo->tx_lock); - req->errors++; +/** + * request_end_while_locked - low level block device system request ender. + * @req: the block device request after blkdev_dequeue_request() done on it. + * @uptodate: 1 (true), or 0 (false) depending on if buffer sync'd with store. + * + * This function must be called with io_request_lock held & interupts disabled. 
+ */ +static void request_end_while_locked(struct request *req, int uptodate) +{ + if (!end_that_request_first(req, uptodate, req->nr_sectors)) { + end_that_request_last(req); + } + requests_out++; + if (current) { + dprintk(NBD_DEBUG_BLKDEV, "%s[%d]: released request (%p).\n", + current->comm, current->pid, req); + } + else { + /* can current ever even be null??? */ + printk(KERN_ERR DEVICE_NAME ": null current pointer!!\n"); + dprintk(NBD_DEBUG_BLKDEV, "%s: released request (%p).\n", + DEVICE_NAME, req); + } } -static struct request *nbd_find_request(struct nbd_device *lo, char *handle) +/** + * request_end - block device system request ender. + * @req: the block device request after blkdev_dequeue_request() done on it. + * @uptodate: 1 (true), or 0 (false) depending on if buffer sync'd with store. + * + * This function must not be called when io_request_lock is held! Use + * request_end_while_locked() instead when this lock is already held. + */ +static void request_end(struct request *req, int uptodate) { - struct request *req; - struct list_head *tmp; - struct request *xreq; + unsigned long flags; - memcpy(&xreq, handle, sizeof(xreq)); + if (REQUEST_CMD(req) == DISCONNECT_NBD) + return; + request_queue_lock_save(req->q, flags); + request_end_while_locked(req, uptodate); + request_queue_unlock_restore(req->q, flags); +} - spin_lock(&lo->queue_lock); - list_for_each(tmp, &lo->queue_head) { - req = list_entry(tmp, struct request, queuelist); - if (req != xreq) - continue; - list_del_init(&req->queuelist); - spin_unlock(&lo->queue_lock); - return req; - } - spin_unlock(&lo->queue_lock); - return NULL; +static void nbd_qsys_enq_tail(nbd_qsys_t *q, struct request *req) +{ + spin_lock(&q->lock); + list_add_tail(REQUEST_QUEUE(req), &q->head); + q->len++; + spin_unlock(&q->lock); + wake_up(&q->waiters); } -#define HARDFAIL( s ) { printk( KERN_ERR "NBD: " s "(result %d)\n", result ); lo->harderror = result; return NULL; } -struct request *nbd_read_stat(struct nbd_device 
*lo) - /* NULL returned = something went wrong, inform userspace */ +static void nbd_qsys_enq_head(nbd_qsys_t *q, struct request *req) { - int result; - struct nbd_reply reply; - struct request *req; + spin_lock(&q->lock); + list_add(REQUEST_QUEUE(req), &q->head); + q->len++; + spin_unlock(&q->lock); + wake_up(&q->waiters); +} - DEBUG("reading control, "); - reply.magic = 0; - result = nbd_xmit(0, lo->sock, (char *) &reply, sizeof(reply), MSG_WAITALL); - if (result <= 0) - HARDFAIL("Recv control failed."); - req = nbd_find_request(lo, reply.handle); - if (req == NULL) - HARDFAIL("Unexpected reply"); - - DEBUG("ok, "); - if (ntohl(reply.magic) != NBD_REPLY_MAGIC) - HARDFAIL("Not enough magic."); - if (ntohl(reply.error)) - FAIL("Other side returned error."); - - if (nbd_cmd(req) == NBD_CMD_READ) { - struct bio *bio = req->bio; - DEBUG("data, "); - do { - result = nbd_xmit(0, lo->sock, bio_data(bio), bio->bi_size, MSG_WAITALL); - if (result <= 0) - HARDFAIL("Recv data failed."); - bio = bio->bi_next; - } while(bio); +static struct request *nbd_qsys_deq_head(nbd_qsys_t *q) +{ + struct request *req = NULL; + spin_lock(&q->lock); + if (!list_empty(&q->head)) { + req = blkdev_entry_to_request(q->head.next); + list_del_init(REQUEST_QUEUE(req)); + q->len--; } - DEBUG("done.\n"); + spin_unlock(&q->lock); return req; +} -/* Can we get here? 
Yes, if other side returns error */ - error_out: - req->errors++; +static struct request *nbd_qsys_deq_tail(nbd_qsys_t *q) +{ + struct request *req = NULL; + spin_lock(&q->lock); + if (!list_empty(&q->head)) { + req = blkdev_entry_to_request(q->head.prev); + list_del_init(REQUEST_QUEUE(req)); + q->len--; + } + spin_unlock(&q->lock); return req; } -void nbd_do_it(struct nbd_device *lo) +static struct request *nbd_qsys_deq_request(nbd_qsys_t *q, char *handle) { - struct request *req; - - while (1) { - req = nbd_read_stat(lo); + struct list_head *tmp; + struct request *req, *xreq; - if (!req) { - printk(KERN_ALERT "req should never be null\n" ); - goto out; + memcpy(&xreq, handle, sizeof(xreq)); + spin_lock(&q->lock); + list_for_each(tmp, &q->head) { + req = blkdev_entry_to_request(tmp); + if (req == xreq) { + list_del_init(REQUEST_QUEUE(req)); + q->len--; + spin_unlock(&q->lock); + return req; } - BUG_ON(lo->magic != LO_MAGIC); - nbd_end_request(req); } - out: - return; + spin_unlock(&q->lock); + return NULL; } -void nbd_clear_que(struct nbd_device *lo) +static int nbd_qsys_len(nbd_qsys_t *q) { - struct request *req; + int len; + spin_lock(&q->lock); + len = q->len; + spin_unlock(&q->lock); + return len; +} - BUG_ON(lo->magic != LO_MAGIC); +static struct request *nbd_qsys_deq_head_eventually(nbd_qsys_t *q) +{ + struct request *req; - do { - req = NULL; - spin_lock(&lo->queue_lock); - if (!list_empty(&lo->queue_head)) { - req = list_entry(lo->queue_head.next, struct request, queuelist); - list_del_init(&req->queuelist); - } - spin_unlock(&lo->queue_lock); - if (req) { - req->errors++; - nbd_end_request(req); + req = nbd_qsys_deq_head(q); + if (!req) { + struct task_struct *tsk = current; + DECLARE_WAITQUEUE(wait, tsk); + + add_wait_queue(&q->waiters, &wait); + for (;;) { + set_current_state(TASK_INTERRUPTIBLE); + req = nbd_qsys_deq_head(q); + if (req) + break; + if (signal_pending(current)) + break; + schedule(); } - } while(req); + set_current_state(TASK_RUNNING); + 
remove_wait_queue(&q->waiters, &wait); + } + return req; } -/* - * We always wait for result of write, for now. It would be nice to make it optional - * in future - * if ((req->cmd == WRITE) && (lo->flags & NBD_WRITE_NOCHK)) - * { printk( "Warning: Ignoring result!\n"); nbd_end_request( req ); } - */ +static int nbd_qsys_clear(nbd_qsys_t *q) +{ + struct request *req; + int ncleared; -#undef FAIL -#define FAIL( s ) { printk( KERN_ERR "%s: " s "\n", req->rq_disk->disk_name ); goto error_out; } + for (ncleared = 0;; ncleared++) { + req = nbd_qsys_deq_head(q); + if (!req) + break; + req->errors++; + request_end(req, 0); + } + return ncleared; +} -static void do_nbd_request(request_queue_t * q) +static inline int sock_recv_bvec(struct socket *sock, struct bio_vec *bvec, + int flags) { - struct request *req; - - while ((req = elv_next_request(q)) != NULL) { - struct nbd_device *lo; + return sock_xmit(sock, 0, page_address(bvec->bv_page) + bvec->bv_offset, + bvec->bv_len, flags); +} - if (!(req->flags & REQ_CMD)) - goto error_out; +static inline int sock_recv_buffers(struct socket *sock, struct request *req) +{ + int result = -ENOMEM; /* only returned if bh == NULL */ + int i; + struct bio *bio; - lo = req->rq_disk->private_data; - if (!lo->file) - FAIL("Request when not-ready."); - nbd_cmd(req) = NBD_CMD_READ; - if (rq_data_dir(req) == WRITE) { - nbd_cmd(req) = NBD_CMD_WRITE; - if (lo->flags & NBD_READ_ONLY) - FAIL("Write on read-only"); + rq_for_each_bio(bio, req) { + struct bio_vec *bvec; + bio_for_each_segment(bvec, bio, i) { + result = sock_recv_bvec(sock, bvec, MSG_WAITALL); + if (result <= 0) + return result; } - BUG_ON(lo->magic != LO_MAGIC); - requests_in++; + } + return result; +} - req->errors = 0; - blkdev_dequeue_request(req); - spin_unlock_irq(q->queue_lock); +static inline int sock_send_bvec(struct socket *sock, struct bio_vec *bvec, + int flags) +{ +#ifdef USE_ZEROCOPY + struct page *page = bvec->bv_page; + size_t size = bvec->bv_len; + int offset = 
bvec->bv_offset; + int result; - spin_lock(&lo->queue_lock); + flags |= MSG_NOSIGNAL; + do { + result = sock->ops->sendpage(sock, page, offset, size, flags); + if (result <= 0) + break; + size -= result; + offset += result; + } while (size > 0); + return result; +#else /* USE_ZEROCOPY */ + return sock_xmit(sock, 1, page_address(bvec->bv_page) + bvec->bv_offset, + bvec->bv_len, flags); +#endif /* USE_ZEROCOPY */ +} - if (!lo->file) { - spin_unlock(&lo->queue_lock); - printk(KERN_ERR "nbd: failed between accept and semaphore, file lost\n"); - req->errors++; - nbd_end_request(req); - spin_lock_irq(q->queue_lock); - continue; +static inline int sock_send_buffers(struct socket *sock, struct request *req) +{ + int flags; + int result = -ENOMEM; /* returned if bh == NULL */ + int msg_eor = (sock->type == SOCK_SEQPACKET)? MSG_EOR: 0; + int i; + struct bio *bio; + /* + * we are really probing at internals to determine + * whether to set MSG_MORE or not... + */ + rq_for_each_bio(bio, req) { + struct bio_vec *bvec; + bio_for_each_segment(bvec, bio, i) { + flags = ((i + 1 < bio->bi_vcnt) || bio->bi_next)? 
+ MSG_MORE: msg_eor; + result = sock_send_bvec(sock, bvec, flags); + if (result <= 0) + return result; } + } + return result; +} - list_add(&req->queuelist, &lo->queue_head); - spin_unlock(&lo->queue_lock); +#ifndef NDEBUG +static const char *cmd_to_ascii(int cmd) +{ + switch (cmd) { + case 0: return "read"; + case 1: return "write"; + case 2: return "disconnect"; + } + return "invalid"; +} +#endif /* NDEBUG */ - nbd_send_req(lo, req); +static inline int sock_send_request(struct socket *sock, struct request *req) +{ + int result; + nbd_request_t request; + unsigned long size = req->nr_sectors << 9; - if (req->errors) { - printk(KERN_ERR "nbd: nbd_send_req failed\n"); - spin_lock(&lo->queue_lock); - list_del_init(&req->queuelist); - spin_unlock(&lo->queue_lock); - nbd_end_request(req); - spin_lock_irq(q->queue_lock); - continue; - } + request.magic = request_magic; + request.type = htonl(REQUEST_CMD(req)); + request.from = cpu_to_be64( (u64) req->sector << 9); + request.len = htonl(size); + memcpy(request.handle, &req, sizeof(req)); - spin_lock_irq(q->queue_lock); - continue; + result = sock_xmit(sock, 1, &request, sizeof(request), + (REQUEST_CMD(req) == WRITE)? MSG_MORE: 0); + if (result <= 0) + return result; + if (REQUEST_CMD(req) == WRITE) + result = sock_send_buffers(sock, req); + return result; +} - error_out: - req->errors++; - blkdev_dequeue_request(req); - spin_unlock(q->queue_lock); - nbd_end_request(req); - spin_lock(q->queue_lock); +/** + * nbd_send_request - called by tx thread to send NBD requests + * @lo: nbd device + * @req: pointer to request + * + * Should only be called by tx thread to ensure integrity of data stream. 
+ */ +static int nbd_send_request(nbd_device_t *lo, struct request *req) +{ + int result; + + result = sock_send_request(lo->sock, req); + if (result < 0) { + lo->errcnt++; + lo->lasterr = -result; + printk(KERN_ERR "nb%d: error sending request %p (%d).\n", + DEVICE_TO_MINOR(lo), req, result); } - return; + else if (result == 0) { + lo->closed |= SEND_SHUTDOWN; + lo->lasterr = ECONNRESET; + printk("nb%d: write closed on request %p\n", + DEVICE_TO_MINOR(lo), req); + } + return result; } -static int nbd_ioctl(struct inode *inode, struct file *file, - unsigned int cmd, unsigned long arg) +static int nbd_recv_reply(nbd_device_t *lo, struct request **req) { - struct nbd_device *lo = inode->i_bdev->bd_disk->private_data; - int error, temp; - struct request sreq ; + int result; + nbd_reply_t reply; + struct socket *sock; +#ifndef NDEBUG + int minor = DEVICE_TO_MINOR(lo); +#endif - /* Anyone capable of this syscall can do *real bad* things */ + *req = NULL; + sock = lo->sock; - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - switch (cmd) { - case NBD_DISCONNECT: - printk(KERN_INFO "NBD_DISCONNECT\n"); - sreq.flags = REQ_SPECIAL; - nbd_cmd(&sreq) = NBD_CMD_DISC; - if (!lo->sock) - return -EINVAL; - nbd_send_req(lo, &sreq); - return 0 ; - - case NBD_CLEAR_SOCK: - nbd_clear_que(lo); - spin_lock(&lo->queue_lock); - if (!list_empty(&lo->queue_head)) { - spin_unlock(&lo->queue_lock); - printk(KERN_ERR "nbd: Some requests are in progress -> can not turn off.\n"); - return -EBUSY; - } - file = lo->file; - if (!file) { - spin_unlock(&lo->queue_lock); - return -EINVAL; + result = sock_xmit(sock, 0, &reply, sizeof(reply), MSG_WAITALL); + if (result < 0) { + printk(KERN_ERR "nb%d: error receiving reply (%d).\n", + minor, result); + goto leave; + } + if (result == 0) { + printk("nb%d: read reply closed.\n", minor); + goto leave; + } + if (reply.magic != reply_magic) { + result = -EPROTO; + printk(KERN_ERR "nb%d: wrong reply magic.\n", minor); + goto leave; + } + *req = 
nbd_qsys_deq_request(&lo->rx_queue, reply.handle); + if (!*req) { + if (nbd_write_nochk(lo)) { + /* No big deal when NBD_WRITE_NOCHK set */ + dprintk(NBD_DEBUG_RX, "nb%d: unexpected reply %p\n", + minor, req); } - lo->file = NULL; - lo->sock = NULL; - spin_unlock(&lo->queue_lock); - fput(file); - return 0; - case NBD_SET_SOCK: - if (lo->file) - return -EBUSY; - error = -EINVAL; - file = fget(arg); - if (file) { - inode = file->f_dentry->d_inode; - if (inode->i_sock) { - lo->file = file; - lo->sock = SOCKET_I(inode); - error = 0; - } else { - fput(file); - } + else { + /* We'd better indicate error though in this case! */ + result = -ESRCH; + printk(KERN_ERR "nb%d: unexpected reply!\n", minor); } - return error; - case NBD_SET_BLKSIZE: - if ((arg & (arg-1)) || (arg < 512) || (arg > PAGE_SIZE)) - return -EINVAL; - lo->blksize = arg; - temp = arg >> 9; - lo->blksize_bits = 9; - while (temp > 1) { - lo->blksize_bits++; - temp >>= 1; + goto leave; + } + lo->harderror = ntohl(reply.error); + if (lo->harderror) { + printk(KERN_ERR "nb%d: request %p, remote error (%d).\n", + minor, *req, lo->harderror); + (*req)->errors++; + goto leave; + } + if (REQUEST_CMD(*req) == READ) { /* then more to read... */ + result = sock_recv_buffers(sock, *req); + if (result < 0) + printk(KERN_ERR "nb%d: request %p, error receiving data (%d).\n", + minor, *req, result); + else if (result == 0) + printk(KERN_ERR "nb%d: request %p, read data closed.\n", + minor, *req); + else { + dprintk(NBD_DEBUG_RX, "nb%d: request %p, received reply\n", + minor, *req); + /* yeah! got entire request! 
*/ + goto leave; } - lo->bytesize &= ~(lo->blksize-1); - set_capacity(lo->disk, lo->bytesize >> 9); - return 0; - case NBD_SET_SIZE: - lo->bytesize = arg & ~(lo->blksize-1); - set_capacity(lo->disk, lo->bytesize >> 9); - return 0; - case NBD_SET_SIZE_BLOCKS: - lo->bytesize = ((u64) arg) << lo->blksize_bits; - set_capacity(lo->disk, lo->bytesize >> 9); - return 0; - case NBD_DO_IT: - if (!lo->file) - return -EINVAL; - nbd_do_it(lo); - /* on return tidy up in case we have a signal */ - /* Forcibly shutdown the socket causing all listeners - * to error - * - * FIXME: This code is duplicated from sys_shutdown, but - * there should be a more generic interface rather than - * calling socket ops directly here */ - down(&lo->tx_lock); - printk(KERN_WARNING "nbd: shutting down socket\n"); - lo->sock->ops->shutdown(lo->sock, SEND_SHUTDOWN|RCV_SHUTDOWN); - lo->sock = NULL; - up(&lo->tx_lock); - spin_lock(&lo->queue_lock); - file = lo->file; - lo->file = NULL; - spin_unlock(&lo->queue_lock); - nbd_clear_que(lo); - printk(KERN_WARNING "nbd: queue cleared\n"); - if (file) - fput(file); - return lo->harderror; - case NBD_CLEAR_QUE: - nbd_clear_que(lo); - return 0; -#ifdef PARANOIA - case NBD_PRINT_DEBUG: - printk(KERN_INFO "%s: next = %p, prev = %p. Global: in %d, out %d\n", - inode->i_bdev->bd_disk->disk_name, lo->queue_head.next, - lo->queue_head.prev, requests_in, requests_out); - return 0; -#endif + /* re-rx-queue request for possible future redo... 
*/ + nbd_qsys_enq_head(&lo->rx_queue, *req); + *req = NULL; /* make it seem like we never got request */ } - return -EINVAL; +leave: + if (result < 0) { + lo->errcnt++; + lo->lasterr = -result; + } + else if (result == 0) { + lo->closed |= RCV_SHUTDOWN; + lo->lasterr = ECONNRESET; + } + return result; } -static int nbd_release(struct inode *inode, struct file *file) +static inline int nbd_activate_kids(nbd_device_t *lo) { - struct nbd_device *lo = inode->i_bdev->bd_disk->private_data; - if (lo->refcnt <= 0) - printk(KERN_ALERT "nbd_release: refcount(%d) <= 0\n", lo->refcnt); - lo->refcnt--; - /* N.B. Doesn't lo->file need an fput?? */ + int rv; + + rv = nbd_thread_start(lo, &lo->rx_thread, rx_loop); + if (rv < 0) + return rv; + rv = nbd_thread_start(lo, &lo->tx_thread, tx_loop); + if (rv < 0) { + nbd_thread_stop(lo, &lo->rx_thread, 1); + return rv; + } return 0; } -static struct block_device_operations nbd_fops = +static inline int nbd_deactivate_kids(nbd_device_t *lo) { - .owner = THIS_MODULE, - .open = nbd_open, - .release = nbd_release, - .ioctl = nbd_ioctl, -}; + nbd_thread_stop(lo, &lo->tx_thread, 1); + nbd_thread_stop(lo, &lo->rx_thread, 1); + return 0; +} -/* - * And here should be modules and kernel interface - * (Just smiley confuses emacs :-) +/** + * nbd_shutdown - shutdown kids & network connection as needed. + * @lo: pointer to nbd_device_t + * + * Must be called with lo->semalock held! 
*/ - -static struct request_queue nbd_queue; - -static int __init nbd_init(void) +static int nbd_shutdown(nbd_device_t *lo) { - int err = -ENOMEM; - int i; - - if (sizeof(struct nbd_request) != 28) { - printk(KERN_CRIT "Sizeof nbd_request needs to be 28 in order to work!\n" ); - return -EIO; - } - - for (i = 0; i < MAX_NBD; i++) { - struct gendisk *disk = alloc_disk(1); - if (!disk) - goto out; - nbd_dev[i].disk = disk; - } +#ifdef PARANOIA + BUG_ON(atomic_read(&lo->semalock.count) != 0); +#endif + dprintk(NBD_DEBUG_SESSION, "nb%d: %s called.\n", + DEVICE_TO_MINOR(lo), __FUNCTION__); + nbd_deactivate_kids(lo); + if (lo->sin.sin_addr.s_addr) + nbd_deactivate_sin(lo); + return 0; +} - if (register_blkdev(NBD_MAJOR, "nbd")) { - err = -EIO; - goto out; - } -#ifdef MODULE - printk("nbd: registered device at major %d\n", NBD_MAJOR); +/** + * nbd_startup - startup network connection & kids as needed. + * @lo: pointer to nbd_device_t + * + * Must be called with lo->semalock held! + */ +static int nbd_startup(nbd_device_t *lo) +{ + int rv; +#ifndef NDEBUG + int minor = DEVICE_TO_MINOR(lo); +#endif /* NDEBUG */ +#ifdef PARANOIA + BUG_ON(atomic_read(&lo->semalock.count) != 0); #endif - blk_init_queue(&nbd_queue, do_nbd_request, &nbd_lock); - devfs_mk_dir("nbd"); - for (i = 0; i < MAX_NBD; i++) { - struct gendisk *disk = nbd_dev[i].disk; - nbd_dev[i].refcnt = 0; - nbd_dev[i].file = NULL; - nbd_dev[i].magic = LO_MAGIC; - nbd_dev[i].flags = 0; - spin_lock_init(&nbd_dev[i].queue_lock); - INIT_LIST_HEAD(&nbd_dev[i].queue_head); - init_MUTEX(&nbd_dev[i].tx_lock); - nbd_dev[i].blksize = 1024; - nbd_dev[i].blksize_bits = 10; - nbd_dev[i].bytesize = ((u64)0x7ffffc00) << 10; /* 2TB */ - disk->major = NBD_MAJOR; - disk->first_minor = i; - disk->fops = &nbd_fops; - disk->private_data = &nbd_dev[i]; - disk->queue = &nbd_queue; - sprintf(disk->disk_name, "nbd%d", i); - sprintf(disk->devfs_name, "nbd/%d", i); - set_capacity(disk, 0x3ffffe); - add_disk(disk); + dprintk(NBD_DEBUG_SESSION,
"nb%d: %s called.\n", minor, __FUNCTION__); + if (lo->sin.sin_addr.s_addr) { + dprintk(NBD_DEBUG_SESSION, "nb%d: %s: trying to connect\n", + minor, __FUNCTION__); + /* + * The following nbd_activate_sin() then will + * block till the remote host or network is up + * or the connection attempt times out... + * Don't forget we're holding lo->semalock now! + */ + if (nbd_activate_sin(lo) < 0) + return -ENOMEDIUM; + dprintk(NBD_DEBUG_SESSION, "nb%d: %s: connection established\n", + minor, __FUNCTION__); } - + if (!lo->sock) + return -ENOMEDIUM; + rv = nbd_activate_kids(lo); + if (rv < 0) { + nbd_deactivate_sin(lo); + return rv; + } + dprintk(NBD_DEBUG_SESSION, "nb%d: %s: succeeded.\n", minor, __FUNCTION__); return 0; -out: - while (i--) - put_disk(nbd_dev[i].disk); - return err; } -static void __exit nbd_cleanup(void) +static void unblock_sigkill(void) { - int i; - for (i = 0; i < MAX_NBD; i++) { - del_gendisk(nbd_dev[i].disk); - put_disk(nbd_dev[i].disk); + int unblocked = 0; + unsigned long flags; + + spin_lock_irqsave(&current->sighand->siglock, flags); + if (sigismember(&current->blocked, SIGKILL)) { + sigdelsetmask(&current->blocked, sigmask(SIGKILL)); + recalc_sigpending(); + unblocked = 1; } - devfs_remove("nbd"); - blk_cleanup_queue(&nbd_queue); - unregister_blkdev(NBD_MAJOR, "nbd"); + spin_unlock_irqrestore(&current->sighand->siglock, flags); + if (unblocked) + dprintk(NBD_DEBUG_THREADS, "%s[%d]: SIGKILL unblocked.\n", + current->comm, current->pid); } -module_init(nbd_init); -module_exit(nbd_cleanup); +static int session_loop(void *data) +{ + nbd_device_t *lo = (nbd_device_t *)data; + int rv = 0, seconds, ncleared; +#ifndef NDEBUG + int minor = DEVICE_TO_MINOR(lo); +#endif /* NDEBUG */ + + __module_get(THIS_MODULE); + DAEMONIZE("nb%d-sess", lo - nbd_devs); + spin_lock(&lo->lock); + lo->ss_thread.task = current; + spin_unlock(&lo->lock); + unblock_sigkill(); + complete(&lo->ss_thread.startup); -MODULE_DESCRIPTION("Network Block Device"); -MODULE_LICENSE("GPL"); +
dprintk(NBD_DEBUG_THREADS, "%s[%d]: started.\n", + current->comm, current->pid); + seconds = 10; + do { + if (nbd_do_it(lo) == 0) { + /* connection succeeded at least so reset sleep... */ + seconds = 60; /* a minimum server reboot time */ + } + if (!(lo->flags & NBD_RESTARTABLE)) + break; + if (signal_pending(current)) + break; + dprintk(NBD_DEBUG_SESSION, + "nb%d: sleeping %ds before attempting re-connect\n", + minor, seconds); + set_current_state(TASK_INTERRUPTIBLE); + schedule_timeout(HZ * seconds); + set_current_state(TASK_RUNNING); + /* + * Already waited seconds, so probably don't need to sleep + * as long next time... + */ + if (seconds > 10) + seconds -= 10; + } while (!signal_pending(current)); + + ncleared = nbd_qsys_clear(&lo->tx_queue); + if (ncleared) + dprintk(NBD_DEBUG_SESSION, + "nb%d: cleared %d tx requests.\n", + DEVICE_TO_MINOR(lo), ncleared); + ncleared = nbd_qsys_clear(&lo->rx_queue); + if (ncleared) + dprintk(NBD_DEBUG_SESSION, + "nb%d: cleared %d rx requests.\n", + DEVICE_TO_MINOR(lo), ncleared); + + dprintk(NBD_DEBUG_THREADS, "%s[%d]: ending.\n", + current->comm, current->pid); + + spin_lock(&lo->lock); + lo->ss_thread.task = NULL; + spin_unlock(&lo->lock); + complete(&lo->ss_thread.finish); + module_put(THIS_MODULE); + return rv; +} + +static void end_io_cycle(nbd_device_t *lo) +{ + dprintk(NBD_DEBUG_THREADS, "%s[%d]: called %s (flags=%x)\n", + current->comm, current->pid, __FUNCTION__, lo->flags); + down(&lo->semalock); + if (lo->flags & NBD_NONBLOCKING) + nbd_clear_queue(lo); + else + nbd_redo_queue(lo); + up(&lo->semalock); + wake_up(&lo->no_io_waiters); +} + +static int rx_loop(void *data) +{ + nbd_device_t *lo = (nbd_device_t *)data; + int signr; + struct request *req; + int result; + + __module_get(THIS_MODULE); + atomic_inc(&lo->num_io_threads); + DAEMONIZE("nb%d-rx", lo - nbd_devs); + spin_lock(&lo->lock); + lo->rx_thread.task = current; + spin_unlock(&lo->lock); + unblock_sigkill(); + complete(&lo->rx_thread.startup); + + 
dprintk(NBD_DEBUG_THREADS, "%s[%d]: started.\n", + current->comm, current->pid); + + while ((signr = signal_pending(current)) == 0) { + result = nbd_recv_reply(lo, &req); + if (result <= 0) { + if (result == -ERESTARTSYS || result == -EINTR) + signr = signal_pending(current); + if (req) + request_end(req, 0); + break; + } + dprintk(NBD_DEBUG_RX, "nb%d: received reply, req=%p\n", + DEVICE_TO_MINOR(lo), req); + if (req) + request_end(req, req->errors? 0: 1); + } + + if (signr) + printk("%s[%d]: signaled to exit (state=%ld)\n", + current->comm, current->pid, current->state); + set_current_state(TASK_RUNNING); + dprintk(NBD_DEBUG_THREADS, "%s[%d]: ending.\n", + current->comm, current->pid); + + /* + * Nullify our task ptr before trying to stop tx thread so tx + * thread does not wind up trying to stop us back. + */ + spin_lock(&lo->lock); + lo->rx_thread.task = NULL; + spin_unlock(&lo->lock); + nbd_thread_stop(lo, &lo->tx_thread, 0); + complete(&lo->rx_thread.finish); + if (atomic_dec_and_test(&lo->num_io_threads)) + end_io_cycle(lo); + module_put(THIS_MODULE); + return 0; +} + +/** + * tx_loop - transmit loop + * @data: pointer to nbd_device_t + * + * Invoked as a seperate kernel thread to process the transmit queue for + * the device. 
+ */ +static int tx_loop(void *data) +{ + nbd_device_t *lo = (nbd_device_t *)data; + struct request *req; + int result, signr, enqueued; +#ifndef NDEBUG + int minor = DEVICE_TO_MINOR(lo); +#endif + + __module_get(THIS_MODULE); + atomic_inc(&lo->num_io_threads); + DAEMONIZE("nb%d-tx", lo - nbd_devs); + spin_lock(&lo->lock); + lo->tx_thread.task = current; + spin_unlock(&lo->lock); + unblock_sigkill(); + complete(&lo->tx_thread.startup); + + dprintk(NBD_DEBUG_THREADS, "%s[%d]: started.\n", + current->comm, current->pid); + + while ((signr = signal_pending(current)) == 0) { + req = nbd_qsys_deq_head_eventually(&lo->tx_queue); + if (!req) + continue; + enqueued = 0; + if (REQUEST_CMD(req) != DISCONNECT_NBD) { + if (REQUEST_CMD(req) != WRITE || !nbd_write_nochk(lo)) { + nbd_qsys_enq_tail(&lo->rx_queue, req); + enqueued = 1; + } + } + result = nbd_send_request(lo, req); + if (result <= 0) { + if (result == -ERESTARTSYS || result == -EINTR) + signr = signal_pending(current); + if (!enqueued) + break; + if (!nbd_qsys_deq_request(&lo->rx_queue, (char *)req)) { + dprintk(NBD_DEBUG_TX, + "nb%d: lost request %p???\n", + minor, req); + break; + } + goto reque_request; + } + dprintk(NBD_DEBUG_TX, "nb%d: sent request %p(%s@%llu,%luB)\n", + minor, req, + cmd_to_ascii(REQUEST_CMD(req)), + (u64) req->sector << 9, + req->nr_sectors << 9); + if (REQUEST_CMD(req) == DISCONNECT_NBD) + break; + if (REQUEST_CMD(req) == WRITE && nbd_write_nochk(lo)) + request_end(req, 1); + continue; + +reque_request: + /* put request back on tx queue... 
*/
+		nbd_qsys_enq_head(&lo->tx_queue, req);
+		dprintk(NBD_DEBUG_TX, "nb%d: re-tx-queued request %p\n",
+			minor, req);
+		break;
+	}
+
+	if (signr)
+		printk("%s[%d]: signaled to exit (state=%ld)\n",
+			current->comm, current->pid, current->state);
+	set_current_state(TASK_RUNNING);
+	dprintk(NBD_DEBUG_THREADS, "%s[%d]: ending.\n",
+		current->comm, current->pid);
+
+	spin_lock(&lo->lock);
+	lo->tx_thread.task = NULL;
+	spin_unlock(&lo->lock);
+	nbd_thread_stop(lo, &lo->rx_thread, 0);
+	complete(&lo->tx_thread.finish);
+	if (atomic_dec_and_test(&lo->num_io_threads))
+		end_io_cycle(lo);
+	module_put(THIS_MODULE);
+	return 0;
+}
+
+/**
+ * nbd_thread_start - launch @fn as a kernel thread for thread slot @th
+ * @lo: device the thread belongs to; passed to @fn as its argument
+ * @th: thread slot; its task pointer is claimed under lo->lock
+ * @fn: thread entry point
+ *
+ * Returns -EBUSY when the slot already has a task, a negative value when
+ * kernel_thread() fails, otherwise kernel_thread()'s return value once the
+ * new thread has signalled th->startup.
+ */
+static int nbd_thread_start(nbd_device_t *lo, nbd_thread_t *th, thread_fn_t fn)
+{
+	int rv;
+	struct task_struct *task;
+
+	dprintk(NBD_DEBUG_THREADS, "nb%d: %s[%d] called %s.\n",
+		DEVICE_TO_MINOR(lo), current->comm, current->pid,
+		__FUNCTION__);
+
+	spin_lock(&lo->lock);
+	task = th->task;
+	if (!task) {
+		/* ensure no thread start races: claim slot with a placeholder */
+		th->task = (struct task_struct *)-1;
+	}
+	spin_unlock(&lo->lock);
+	if (task)
+		return -EBUSY;
+	init_completion(&th->startup);
+	init_completion(&th->finish);
+	rv = kernel_thread(fn, lo, CLONE_FS|CLONE_FILES);
+	if (rv < 0) {
+		/*
+		 * No thread exists to replace the placeholder with a real
+		 * task pointer, so release the slot here. Otherwise every
+		 * later start reports -EBUSY and nbd_thread_stop() would
+		 * signal an invalid task pointer.
+		 */
+		spin_lock(&lo->lock);
+		th->task = NULL;
+		spin_unlock(&lo->lock);
+		return rv;
+	}
+	wait_for_completion(&th->startup);
+	return rv;
+}
+
+static int nbd_thread_stop(nbd_device_t *lo, nbd_thread_t *th, int wait)
+{
+	pid_t signaled;
+	struct task_struct *task;
+
+	dprintk(NBD_DEBUG_THREADS, "nb%d: %s[%d] called %s.\n",
+		DEVICE_TO_MINOR(lo), current->comm, current->pid,
+		__FUNCTION__);
+
+	signaled = 0;
+	// read_lock(&tasklist_lock);
+	spin_lock(&lo->lock);
+	task = th->task;
+	if (task) {
+		// XXX task no longer has field counter
+		// th->task->counter = 5 * HZ;
+		// XXX Why add PF_MEMALLOC?? Not in memalloc routine!
+ // th->task->flags |= PF_MEMALLOC; + force_sig(SIGKILL, task); + signaled = task->pid; + // th->task = NULL; + } + spin_unlock(&lo->lock); + // read_unlock(&tasklist_lock); + if (signaled) { + dprintk(NBD_DEBUG_THREADS, + "nb%d: %s signaled pid %d.\n", + DEVICE_TO_MINOR(lo), __FUNCTION__, signaled); + if (wait) + wait_for_completion(&th->finish); + return 1; + } + return 0; +} + +static int getconnection(struct sockaddr_in *sin, struct socket **sock) +{ + int rv; + + rv = sock_create(PF_INET, SOCK_STREAM, IPPROTO_TCP, sock); + if (rv < 0) { + printk(DEVICE_NAME ": error creating socket: %d\n", rv); + return rv; + } + rv = (*sock)->ops->connect(*sock, (struct sockaddr *)sin, + sizeof(*sin), 0); + if (rv < 0) { + printk(DEVICE_NAME ": error connecting to 0x%X:%d (%d).\n", + ntohl(sin->sin_addr.s_addr), + ntohs(sin->sin_port), rv); + return rv; + } + return 0; +} + +static unsigned char protomagic[] = NBD_PROTOMAGIC; + +static int handshake(struct socket *sock, nbd_svr_info_t *svrinfo) +{ + int rv; + static const char initpwd[] = NBD_INIT_PASSWD; + + /* get initial NBD password (more of a header magic really) */ + rv = sock_xmit(sock, 0, svrinfo->initpwd, sizeof(svrinfo->initpwd), + MSG_WAITALL); + if (rv <= 0) + goto error; + if (strncmp(svrinfo->initpwd, initpwd, sizeof(svrinfo->initpwd))) { + printk(DEVICE_NAME ": bad handshake: expecting \"%s\", got \"%.8s\"!\n", + initpwd, svrinfo->initpwd); + goto error; + } + + /* get protocol magic */ + rv = sock_xmit(sock, 0, svrinfo->magic, sizeof(svrinfo->magic), + MSG_WAITALL); + if (rv <= 0) + goto error; + if (*(u64 *)svrinfo->magic != *(u64 *)protomagic) { + printk(DEVICE_NAME ": bad handshake: wrong magic (got 0x%llx)!\n", + *((u64 *)svrinfo->magic)); + goto error; + } + + /* get server storage size */ + rv = sock_xmit(sock, 0, &svrinfo->size, sizeof(svrinfo->size), + MSG_WAITALL); + if (rv <= 0) + goto error; + svrinfo->size = ntohll(svrinfo->size); + printk(DEVICE_NAME ": server says size is %lld bytes.\n", 
svrinfo->size);
+
+	/* get zeros */
+	rv = sock_xmit(sock, 0, svrinfo->zeros, sizeof(svrinfo->zeros),
+		MSG_WAITALL);
+	if (rv <= 0)
+		goto error;
+
+	return 0;
+
+error:
+	if (rv < 0) return rv;
+	return -EPROTO;
+}
+
+/*
+ * Must be called with semalock held!
+ *
+ * Detaches lo->sock (under lo->lock), closes and releases it, and drops one
+ * reference on lo->refcnt. Returns -ENOTCONN, without touching refcnt, when
+ * there was no socket to tear down; returns 0 otherwise.
+ */
+static int nbd_deactivate_sin(nbd_device_t *lo)
+{
+	struct socket *sock;
+
+	spin_lock(&lo->lock);
+	sock = lo->sock;
+	if (sock)
+		lo->sock = NULL;
+	spin_unlock(&lo->lock);
+	if (!sock)
+		return -ENOTCONN;
+	lo->closed = (RCV_SHUTDOWN|SEND_SHUTDOWN);
+	// XXX is this right???
+	if (sock->state == SS_CONNECTED) {
+		struct sock *sk = sock->sk;
+		sock->sk = NULL;
+		sk->sk_prot->close(sk, 0L);
+	}
+	sock_release(sock);
+	if (!lo->lasterr)
+		lo->lasterr = ENOTCONN;
+	if (atomic_dec_and_test(&lo->refcnt)) {
+		if (lo->flags & NBD_CLEARED) {
+			lo->flags &= ~NBD_NONBLOCKING;
+			lo->flags &= ~NBD_CLEARED;
+		}
+	}
+	return 0;
+}
+
+/*
+ * nbd_activate_sin - connect to lo->sin and perform the NBD handshake.
+ *
+ * Takes a reference on lo->refcnt for the connection. On success the size
+ * reported by the server is applied with nbd_set_size64() and lo->lasterr
+ * is cleared. On failure lo->lasterr records the error, any socket is torn
+ * down and the reference is dropped again.
+ *
+ * Returns -ENOENT when no address is configured, otherwise the result of
+ * getconnection()/handshake() (negative on failure).
+ */
+static int nbd_activate_sin(nbd_device_t *lo)
+{
+	int rv;
+	nbd_svr_info_t svrinfo;
+
+	if (!lo->sin.sin_addr.s_addr)
+		return -ENOENT;
+	atomic_inc(&lo->refcnt);
+	lo->errcnt = 0;
+	rv = getconnection(&lo->sin, &lo->sock);
+	if (rv < 0)
+		goto leave;
+	lo->closed = 0;
+	rv = handshake(lo->sock, &svrinfo);
+leave:
+	if (rv < 0) {
+		lo->lasterr = -rv;
+		/*
+		 * nbd_deactivate_sin() only drops our reference when it had
+		 * a socket to release. If no socket was ever created it
+		 * bails out early, so drop the reference here to keep
+		 * refcnt balanced.
+		 */
+		if (nbd_deactivate_sin(lo) < 0 &&
+		    atomic_dec_and_test(&lo->refcnt)) {
+			if (lo->flags & NBD_CLEARED) {
+				lo->flags &= ~NBD_NONBLOCKING;
+				lo->flags &= ~NBD_CLEARED;
+			}
+		}
+	}
+	else {
+		lo->lasterr = 0;
+		nbd_set_size64(lo, svrinfo.size);
+	}
+	return rv;
+}
+
+/* Initialise a request queue: lock, zero length, empty list, wait queue. */
+static void nbd_qsys_init(nbd_qsys_t *q)
+{
+	spin_lock_init(&q->lock);
+	q->len = 0;
+	INIT_LIST_HEAD(&q->head);
+	init_waitqueue_head(&q->waiters);
+}
+
+/*
+ * Here begins definitions of ioctl handling functions...
+ */ + +/* Must be called with lo->semalock held */ +static int nbd_clear_queue(nbd_device_t *lo) +{ + int rv, ncleared, set_nonblocking; + +#ifdef PARANOIA + BUG_ON(lo->magic != LO_MAGIC); +#endif + ncleared = 0; + set_nonblocking = 0; + + dprintk(NBD_DEBUG_IOCTL, "nb%d: %s called\n", + DEVICE_TO_MINOR(lo), __FUNCTION__); + if (lo->file) { + spin_lock(&lo->lock); + if (lo->rx_thread.task) + printk("nb%d: NBD_CLEAR_QUEUE ioctl invoked while RX active, upgrade tools!\n", + DEVICE_TO_MINOR(lo)); + else { + if (!(lo->flags & NBD_NONBLOCKING)) { + lo->flags |= NBD_NONBLOCKING; + set_nonblocking++; + } + ncleared += nbd_qsys_clear(&lo->rx_queue); + } + if (lo->tx_thread.task) + printk("nb%d: NBD_CLEAR_QUEUE ioctl invoked while TX active, upgrade tools!\n", + DEVICE_TO_MINOR(lo)); + else { + if (!(lo->flags & NBD_NONBLOCKING)) { + lo->flags |= NBD_NONBLOCKING; + set_nonblocking++; + } + ncleared += nbd_qsys_clear(&lo->tx_queue); + } + spin_unlock(&lo->lock); + if (set_nonblocking) + lo->flags |= NBD_CLEARED; + dprintk(NBD_DEBUG_IOCTL, + "nb%d: %s cleared %d requests.\n", + DEVICE_TO_MINOR(lo), __FUNCTION__, ncleared); + return 0; + } + + spin_lock(&lo->lock); + if (lo->rx_thread.task || lo->tx_thread.task) + rv = -EBUSY; + else { + if (!(lo->flags & NBD_NONBLOCKING)) { + lo->flags |= NBD_NONBLOCKING; + set_nonblocking++; + } + ncleared += nbd_qsys_clear(&lo->rx_queue); + ncleared += nbd_qsys_clear(&lo->tx_queue); + rv = ncleared; + } + spin_unlock(&lo->lock); + if (set_nonblocking) + lo->flags |= NBD_CLEARED; + dprintk(NBD_DEBUG_IOCTL, + "nb%d: %s ended %d requests.\n", + DEVICE_TO_MINOR(lo), __FUNCTION__, ncleared); + return rv; +} + +static int nbd_redo_queue(nbd_device_t *lo) +{ + int rv; + struct request *req; + + rv = 0; + spin_lock(&lo->lock); + if (lo->rx_thread.task || lo->tx_thread.task) { + rv = -EBUSY; + goto leave; + } + /* must preserve order of requests... 
*/ + while ((req = nbd_qsys_deq_tail(&lo->rx_queue)) != NULL) { + nbd_qsys_enq_head(&lo->tx_queue, req); + rv++; + } +leave: + spin_unlock(&lo->lock); + if (rv >= 0) + dprintk(NBD_DEBUG_SESSION, + "nb%d: %s moved %d requests from rx queue\n", + DEVICE_TO_MINOR(lo), __FUNCTION__, rv); + return rv; +} + +static int nbd_set_sin(nbd_device_t *lo, struct sockaddr_in *sin) +{ + if (lo->file) + return -EINVAL; + if (lo->closed != (RCV_SHUTDOWN|SEND_SHUTDOWN)) + return -EBUSY; + if (lo->sock) + return -EBUSY; + if (lo->sin.sin_addr.s_addr) + return -EBUSY; + memcpy(&lo->sin, sin, sizeof(lo->sin)); + return 0; +} + +static int nbd_clr_sin(nbd_device_t *lo) +{ + if (lo->file) + return -EINVAL; + if (lo->sock) + return -EBUSY; + if (!lo->sin.sin_addr.s_addr) + return -EINVAL; + memset(&lo->sin, 0, sizeof(lo->sin)); + return 0; +} + +/* + * This must implement old NBD_SET_SOCK sematics... + */ +static int nbd_set_sock(nbd_device_t *lo, unsigned long arg) +{ + int rv; + struct file *file; + struct inode *inode; + struct socket *sock; + + rv = 0; + file = NULL; + + /* + * Establish EBUSY or not first to help user space tool(s) know. + * Using an illegal file descriptor of -1 then provides user space + * programs with an easy way to figure out if device is already + * connected without worry that descriptor will get used. 
+ */ + if (lo->sock || lo->file) { + rv = -EBUSY; + goto leave; + } + + file = fget(arg); + if (!file) { + rv = -EBADF; + goto leave; + } + inode = file->f_dentry->d_inode; + if (!S_ISSOCK(inode->i_mode)) { + rv = -ENOTSOCK; + goto leave; + } + if (!inode->i_sock) { + rv = -ENOTSOCK; + goto leave; + } + sock = SOCKET_I(inode); + if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET) { + rv = -EPROTONOSUPPORT; + goto leave; + } + lo->file = file; + lo->sock = sock; + memset(&lo->sin, 0, sizeof(lo->sin)); + lo->errcnt = 0; + lo->lasterr = 0; + lo->closed = 0; +leave: + if (rv < 0) { + if (file) + fput(file); + } + dprintk(NBD_DEBUG_IOCTL, + "nb%d: %s descriptor %d, returning %d\n", + DEVICE_TO_MINOR(lo), __FUNCTION__, (int)arg, rv); + return rv; +} + +/** + * nbd_clear_sock - Supports old style nbd shutdown. + * @lo: pointer to network block device. + * + * It's left up to the nbd_client user process to cause the + * close() of the socket (as it was in the old). + */ +static int nbd_clear_sock(nbd_device_t *lo) +{ + int rv; + struct file *file; + + rv = 0; + file = NULL; + if (!lo->sock) { + rv = -ENOTCONN; + goto leave; + } + file = lo->file; + if (!file) { + /* + * Must have been setup using nbd_set_sin(). + * User needs to use nbd_clr_sin() instead! 
+ */ + rv = -EINVAL; + goto leave; + } + rv = -EBUSY; + spin_lock(&lo->lock); + if (!lo->rx_thread.task && !lo->tx_thread.task) + rv = 0; + spin_unlock(&lo->lock); + if (rv) + goto leave; + lo->file = NULL; + lo->sock = NULL; + lo->closed = (RCV_SHUTDOWN|SEND_SHUTDOWN); + if (!lo->lasterr) + lo->lasterr = ENOTCONN; +leave: + if (file) + fput(file); + return rv; +} + +static inline int nbd_start(nbd_device_t *lo) +{ + int rv; + + down(&lo->semalock); + rv = nbd_startup(lo); + up(&lo->semalock); + return rv; +} + +static inline int nbd_stop(nbd_device_t *lo) +{ + int rv; + + down(&lo->semalock); + rv = nbd_shutdown(lo); + up(&lo->semalock); + return rv; +} + +/** + * nbd_do_it - startup rx & tx threads then block till they're done. + * @lo: Pointer to network block device. + * + * Note: This function is meant for backward compatability with oldstyle + * NBD usage & the oldstyle nbd_client program. This function is invoked + * via the NBD_DO_IT ioctl or by the internal session thread. + */ +static int nbd_do_it(nbd_device_t *lo) +{ + int rv; +#ifndef NDEBUG + int minor = DEVICE_TO_MINOR(lo); +#endif + + dprintk(NBD_DEBUG_SESSION, "nb%d: %s called.\n", minor, __FUNCTION__); + + rv = 0; + spin_lock(&lo->lock); + if (lo->ss_thread.task && lo->ss_thread.task != current) + rv = -EBUSY; + spin_unlock(&lo->lock); + if (rv < 0) + goto leave; + + rv = nbd_start(lo); + if (rv < 0) + goto leave; + wait_for_io_threads(lo); + nbd_stop(lo); + +leave: + dprintk(NBD_DEBUG_SESSION, + "nb%d: %s returning %d (rxlen %d, txlen %d)\n", + minor, __FUNCTION__, rv, + nbd_qsys_len(&lo->rx_queue), nbd_qsys_len(&lo->tx_queue)); + return rv; +} + +static int nbd_disconnect(nbd_device_t *lo) +{ + int rv, signaled_ss, signaled_tx, managed; + struct request sreq; + + dprintk(NBD_DEBUG_SESSION, "nb%d: %s called\n", + DEVICE_TO_MINOR(lo), __FUNCTION__); + + rv = 0; + signaled_ss = signaled_tx = 0; + down(&lo->semalock); + if (!lo->sock) { + rv = -ENOTCONN; + goto leave; + } + if (lo->flags & 
NBD_RESTARTABLE) { + dprintk(NBD_DEBUG_SESSION, "nb%d: disabling restart\n", + DEVICE_TO_MINOR(lo)); + lo->flags &= ~NBD_RESTARTABLE; + } + spin_lock(&lo->lock); + managed = lo->ss_thread.task? 1: 0; + spin_unlock(&lo->lock); + REQUEST_CMD(&sreq) = DISCONNECT_NBD; + /* Enqueue request in turn onto Tx request queue if Tx thread up */ + if (!lo->tx_thread.task) + goto leave; + nbd_qsys_enq_tail(&lo->tx_queue, &sreq); + + /* + * Any waiting we do for kids should be after up'ing + * semalock to avoid deadlock'ing on the semaphore. + */ + up(&lo->semalock); + if (managed) { + dprintk(NBD_DEBUG_SESSION, + "nb%d: %s: waiting for session completion\n", + DEVICE_TO_MINOR(lo), __FUNCTION__); + wait_for_completion(&lo->ss_thread.finish); + } + else + wait_for_io_threads(lo); + down(&lo->semalock); + +leave: + if (rv == 0) { + nbd_shutdown(lo); + lo->lasterr = ENOTCONN; + } + up(&lo->semalock); + dprintk(NBD_DEBUG_SESSION, "nb%d: %s returning %d\n", + DEVICE_TO_MINOR(lo), __FUNCTION__, rv); + return rv; +} + +/** + * do_nbd_request - the block device system request handling function. + * @q: queue of pending requests to operate on. + * + * This function may not actually handle any requests at all if it can't + * get all the needed resources. It's assumed that in this case, returning + * will let other things schedule and that this is okay with the blkdev + * system, specifically with linux/drivers/block/ll_rw_blk.c. 
+ */ +static void do_nbd_request(request_queue_t *q) +{ + struct request *req; + int minor, nreqs; + nbd_device_t *lo; + + dprintk(NBD_DEBUG_BLKDEV, DEVICE_NAME ": %s(%p) called\n", + __FUNCTION__, q); + qhandler_loops++; + nreqs = 0; + while ((req = REQUEST_QUEUE_NEXT_REQUEST(q)) != NULL) { + dprintk(NBD_DEBUG_BLKDEV, DEVICE_NAME ": got request (%p)\n", + req); + blkdev_dequeue_request(req); + nreqs++; + lo = req->rq_disk->private_data; + minor = DEVICE_TO_MINOR(lo); +#ifdef PARANOIA + BUG_ON(lo->magic != LO_MAGIC); + if (minor >= MAX_NBD) { + printk(KERN_ERR DEVICE_NAME + ": request %p: minor too big (%d>=%d)!\n", + req, minor, MAX_NBD); + goto fail_request; + } +#endif + if (!blk_fs_request(req)) { + printk(KERN_ERR + "nb%d: request %p: not for fs - flags=0x%lX\n", + minor, req, req->flags); + goto fail_request; + } + REQUEST_CMD(req) = READ; + if (rq_data_dir(req) == WRITE) + REQUEST_CMD(req) = WRITE; + if ((REQUEST_CMD(req) == WRITE) && nbd_read_only(lo)) { + printk(KERN_ERR + "nb%d: request %p: write on read-only\n", + minor, req); + goto fail_request; + } + if (!lo->tx_thread.task && (lo->flags & NBD_NONBLOCKING)) { + printk(KERN_ERR + "nb%d: request %p: no tx task & non-blocking\n", + minor, req); + goto fail_request; + } + dprintk(NBD_DEBUG_BLKDEV, "nb%d: enqueuing tx request (%p)\n", + minor, req); + nbd_qsys_enq_tail(&lo->tx_queue, req); + continue; + +fail_request: + /* + * Fail the request: anyone waiting on a read or write gets + * an error and can move on to their close() call. + */ + req->errors++; + request_end_while_locked(req, 0); + } + requests_in += nreqs; + dprintk(NBD_DEBUG_BLKDEV, DEVICE_NAME ": %s(%p) returning\n", + __FUNCTION__, q); + return; +} + +static void bd_set_size(struct block_device *bdev, loff_t size, int block_size) +{ +#if 0 + atomic_inc(&bdev->bd_count); + bdev->bd_openers++; +#endif + // XXX need to roll this in with media-change handling! 
+ if (bdev->bd_inode->i_size != size) { + dprintk(NBD_DEBUG_OPEN, DEVICE_NAME + ": %s: i_size was %llu, changing to %llu\n", + __FUNCTION__, bdev->bd_inode->i_size, size); + bdev->bd_inode->i_size = size; + } + if (bdev->bd_block_size != block_size) { + dprintk(NBD_DEBUG_OPEN, DEVICE_NAME + ": %s: block size was %u, changing to %u\n", + __FUNCTION__, bdev->bd_block_size, block_size); + bdev->bd_block_size = block_size; + } + if (bdev->bd_inode->i_blkbits != blksize_bits(block_size)) { + dprintk(NBD_DEBUG_OPEN, DEVICE_NAME + ": %s: blkbits was %u, changing to %u\n", + __FUNCTION__, bdev->bd_inode->i_blkbits, + blksize_bits(block_size)); + bdev->bd_inode->i_blkbits = blksize_bits(block_size); + } +} + +static int nbd_open(struct inode *inode, struct file *file) +{ + nbd_device_t *lo; + int minor, rv, running; + + if (!inode) + return -EINVAL; + lo = INODE_TO_NBD(inode); + minor = DEVICE_TO_MINOR(lo); + if (minor >= MAX_NBD) + return -ENODEV; +#ifdef PARANOIA + BUG_ON(lo->magic != LO_MAGIC); + BUG_ON(file == NULL); +#endif + + rv = 0; + down(&lo->semalock); + if ((file->f_mode & FMODE_WRITE) && nbd_read_only(lo)) { + rv = -EROFS; + goto leave; + } + if ((file->f_flags & O_EXCL) && (atomic_read(&lo->refcnt) > 0)) { + rv = -EBUSY; + goto leave; + } + if (file->f_flags & O_NDELAY) { + dprintk(NBD_DEBUG_OPEN, + "nb%d: %s[%d] called %s(%p,%p(no-delay)).\n", + minor, current->comm, current->pid, + __FUNCTION__, inode, file); + atomic_inc(&lo->refcnt); + goto leave; + } + dprintk(NBD_DEBUG_OPEN, "nb%d: %s[%d] called %s(%p,%p).\n", + minor, current->comm, current->pid, + __FUNCTION__, inode, file); + + /* + * As much as possible, keep requests from queuing if not ready. + * We accomplish that by returning ENOMEDIUM when we're not ready + * and we can't be made ready... 
+ */ + running = 0; + spin_lock(&lo->lock); + if (lo->tx_thread.task) + running = 1; + spin_unlock(&lo->lock); + if (running) { + dprintk(NBD_DEBUG_OPEN, "nb%d: %s: ready, refcnt=%d.\n", + minor, __FUNCTION__, atomic_read(&lo->refcnt)); + atomic_inc(&lo->refcnt); + goto leave; + } + dprintk(NBD_DEBUG_OPEN, "nb%d: %s: not ready.\n", + minor, __FUNCTION__); + if (lo->sin.sin_addr.s_addr) { + if (lo->sock) { + dprintk(NBD_DEBUG_OPEN, + "nb%d: %s: terminating old connection\n", + minor, __FUNCTION__); + /* + * Connection died. Remote host or network is + * down. Make sure it's complete... + */ + nbd_shutdown(lo); + } + if (lo->flags & NBD_RESTARTABLE) + rv = nbd_thread_start(lo, &lo->ss_thread, session_loop); + else + rv = nbd_startup(lo); + if (rv < 0 && (lo->flags & NBD_NONBLOCKING)) + goto leave; + rv = 0; + atomic_inc(&lo->refcnt); + goto leave; + } + if (lo->flags & NBD_NONBLOCKING) + rv = -ENOMEDIUM; + else + atomic_inc(&lo->refcnt); +leave: + up(&lo->semalock); + if (rv == 0) { + bd_set_size(inode->i_bdev, get_capacity(lo->disk) << 9, + NBD_BLKSIZE(lo)); + } + dprintk(NBD_DEBUG_OPEN, "nb%d: %s: returning %d.\n", + minor, __FUNCTION__, rv); + return rv; +} + +static int nbd_release(struct inode *inode, struct file *file) +{ + nbd_device_t *lo; + int minor, refcnt; + + if (!inode) + return -ENODEV; + lo = INODE_TO_NBD(inode); + minor = DEVICE_TO_MINOR(lo); + if (minor >= MAX_NBD) + return -ENODEV; + dprintk(NBD_DEBUG_RELEASE, "nb%d: %s[%d] called %s(%p,%p)\n", + minor, current->comm, current->pid, + __FUNCTION__, inode, file); + + down(&lo->semalock); + refcnt = atomic_read(&lo->refcnt); + if (refcnt <= 0) + printk(KERN_ALERT "nb%d: %s: refcount(%d) <= 0\n", + minor, __FUNCTION__, refcnt); + if (refcnt == 1 && lo->sin.sin_addr.s_addr && lo->sock) { + int running = 0; + spin_lock(&lo->lock); + if (lo->ss_thread.task) + running = 1; + spin_unlock(&lo->lock); + if (!running) { + dprintk(NBD_DEBUG_RELEASE, + "nb%d: last holder: rxlen %d, txlen %d.\n", + minor, + 
nbd_qsys_len(&lo->rx_queue), + nbd_qsys_len(&lo->tx_queue)); + BUG_ON(nbd_qsys_len(&lo->rx_queue) != 0); + BUG_ON(nbd_qsys_len(&lo->tx_queue) != 0); + nbd_shutdown(lo); + } + } + else if (atomic_dec_and_test(&lo->refcnt)) { + if (lo->flags & NBD_CLEARED) { + lo->flags &= ~NBD_NONBLOCKING; + lo->flags &= ~NBD_CLEARED; + } + } + up(&lo->semalock); + dprintk(NBD_DEBUG_RELEASE, "nb%d: %s returning (refcnt %d)\n", + minor, __FUNCTION__, atomic_read(&lo->refcnt)); + return 0; +} + +/** + * nbd_set_size64 - sets byte size of nbd device + * @lo: pointer to nbd device + * @size: size of device in bytes + * + * Only call when lo inactive (ie. when rx/tx threads not running). + */ +static void nbd_set_size64(nbd_device_t *lo, u64 size) +{ + u64 oldsize; + oldsize = lo->bytesize; + size = size & ~(lo->blksize - 1); + lo->bytesize = size; + set_capacity(lo->disk, size >> 9); + dprintk(NBD_DEBUG_IOCTL, "nb%d: %s: size now %llu\n", + DEVICE_TO_MINOR(lo), __FUNCTION__, size); +} + +static int nbd_set_blksize(nbd_device_t *lo, unsigned long arg) +{ + int rv; + + // XXX ldl: maybe we'd be better off calling block_dev.c + // set_blocksize(dev, arg) as this would also handle + // any block buffers: return set_blocksize(dev, arg); + if ((arg & (arg - 1)) || (arg < 512) || (arg > PAGE_SIZE)) + return -EINVAL; + + rv = 0; + spin_lock(&lo->lock); + if (lo->rx_thread.task || lo->tx_thread.task) { + rv = -EBUSY; + goto unlock; + } + + lo->blksize = arg; + lo->bytesize &= ~(lo->blksize - 1); + set_capacity(lo->disk, lo->bytesize >> 9); + +unlock: + spin_unlock(&lo->lock); + return rv; +} + +static int nbd_set_size(nbd_device_t *lo, unsigned long arg) +{ + int rv; + + rv = 0; + spin_lock(&lo->lock); + if (lo->rx_thread.task || lo->tx_thread.task) { + rv = -EBUSY; + goto unlock; + } + nbd_set_size64(lo, arg); +unlock: + spin_unlock(&lo->lock); + return rv; +} + +static int nbd_set_size_blocks(nbd_device_t *lo, unsigned long arg) +{ + int rv; + + rv = 0; + spin_lock(&lo->lock); + if 
(lo->rx_thread.task || lo->tx_thread.task) { + rv = -EBUSY; + goto unlock; + } + nbd_set_size64(lo, ((u64)arg) * NBD_BLKSIZE(lo)); +unlock: + spin_unlock(&lo->lock); + return rv; +} + +static int nbd_print_debug(nbd_device_t *lo) +{ + printk(KERN_INFO "nb%d:", DEVICE_TO_MINOR(lo)); + printk(" refcnt=%d", atomic_read(&lo->refcnt)); + printk(" flags=0x%x", lo->flags); + printk(" harderror=%d", lo->harderror); + printk(" q-loops=%llu", qhandler_loops); + printk(" reqsin=%llu", requests_in); + printk(" reqsout=%llu", requests_out); + printk(" rx=%p(%d)", lo->rx_thread.task, lo->rx_queue.len); + printk(" tx=%p(%d)", lo->tx_thread.task, lo->tx_queue.len); + printk(" ss=%p", lo->ss_thread.task); + printk("\n"); + return 0; +} + +#ifndef NDEBUG +static const char *ioctl_cmd_to_ascii(int cmd) +{ + switch (cmd) { + case NBD_SET_SOCK: return "set-sock"; + case NBD_SET_BLKSIZE: return "set-blksize"; + case NBD_SET_SIZE: return "set-size"; + case NBD_DO_IT: return "do-it"; + case NBD_CLEAR_SOCK: return "clear-sock"; + case NBD_CLEAR_QUE: return "clear-que"; + case NBD_PRINT_DEBUG: return "print-debug"; + case NBD_SET_SIZE_BLOCKS: return "set-size-blocks"; + case NBD_DISCONNECT: return "disconnect"; + case NBD_GET_SIN: return "get-sin"; + case NBD_SET_SIN: return "set-sin"; + case NBD_CLR_SIN: return "clear-sin"; + case NBD_SET_FLAGS: return "set-flags"; + case NBD_GET_FLAGS: return "get-flags"; + case BLKROSET: return "set-read-only"; + case BLKROGET: return "get-read-only"; + case BLKGETSIZE: return "block-get-size"; + case BLKFLSBUF: return "flush-buffer-cache"; + } + return "unknown"; +} +#endif /* NDEBUG */ + +static int nbd_ioctl(struct inode *inode, struct file *file, + unsigned int cmd, unsigned long arg) +{ + nbd_device_t *lo; + int minor, rv; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + if (!inode) + return -EINVAL; + lo = inode->i_bdev->bd_disk->private_data; + minor = DEVICE_TO_MINOR(lo); + if (minor >= MAX_NBD) + return -ENODEV; + dprintk(NBD_DEBUG_IOCTL, + 
"nb%d: %s[%d] called %s: cmd=%s(0x%x) arg=%lu.\n", + minor, current->comm, current->pid, __FUNCTION__, + ioctl_cmd_to_ascii(cmd), cmd, arg); + + /* + * Handled outside normal ioctl handling since it + * needs to play with semalock differently... + */ + switch (cmd) { + case NBD_DO_IT: + return nbd_do_it(lo); + case NBD_DISCONNECT: + return nbd_disconnect(lo); + } + + rv = 0; + down(&lo->semalock); + switch (cmd) { + case NBD_SET_SIN: + { + struct sockaddr_in sin; + rv = copy_from_user(&sin, (void *)arg, sizeof(sin))? + -EFAULT: nbd_set_sin(lo, &sin); + } + break; + case NBD_GET_SIN: + rv = copy_to_user((void *)arg, &lo->sin, sizeof(lo->sin))? + -EFAULT: 0; + break; + case NBD_CLR_SIN: + rv = nbd_clr_sin(lo); + break; + case NBD_CLEAR_QUE: + rv = nbd_clear_queue(lo); + break; + case NBD_PRINT_DEBUG: + rv = nbd_print_debug(lo); + break; + case NBD_SET_SOCK: + rv = nbd_set_sock(lo, arg); + break; + case NBD_CLEAR_SOCK: + rv = nbd_clear_sock(lo); + break; + case NBD_SET_BLKSIZE: + rv = nbd_set_blksize(lo, arg); + break; + case NBD_SET_SIZE: + rv = nbd_set_size(lo, arg); + break; + case NBD_SET_SIZE_BLOCKS: + rv = nbd_set_size_blocks(lo, arg); + break; + case NBD_SET_FLAGS: + arg &= ~NBD_CLEARED; /* not valid user flag */ + if ((lo->flags & NBD_READ_ONLY) != (arg & NBD_READ_ONLY)) + set_disk_ro(lo->disk, (arg & NBD_READ_ONLY)? 
1: 0);
+		lo->flags = (unsigned int)arg;
+		break;
+	case NBD_GET_FLAGS:
+		rv = put_user(lo->flags, (unsigned int *)arg);
+		break;
+	case BLKROSET:
+		if (arg)
+			lo->flags |= NBD_READ_ONLY;
+		else
+			lo->flags &= ~NBD_READ_ONLY;
+		set_disk_ro(lo->disk, arg);
+		break;
+	default:
+		rv = -EINVAL;
+		dprintk(NBD_DEBUG_IOCTL,
+			"nb%d: %s: no handler for 0x%x ioctl.\n",
+			minor, __FUNCTION__, cmd);
+		break;
+	}
+	up(&lo->semalock);
+	return rv;
+}
+
+static struct block_device_operations nbd_fops =
+{
+	owner:		THIS_MODULE,
+	open:		nbd_open,
+	release:	nbd_release,
+	ioctl:		nbd_ioctl,
+};
+
+#ifdef CONFIG_PROC_FS
+/* Map the errno values this driver records to text for the /proc files. */
+static const char *strerror(int err)
+{
+	switch (err) {
+	case 0:
+		return "OK";
+	case EIO:
+		return "I/O error";
+	case ENOTCONN:
+		return "Not connected";
+	case ECONNRESET:
+		return "Connection reset by peer";
+	case ETIMEDOUT:
+		return "Connection timed out";
+	case ENETUNREACH:
+		return "Network is unreachable";
+	case EINTR:
+		return "Interupted";
+	case ENOMEM:
+		return "Out of memory";
+	case ENOBUFS:
+		return "Queue full";
+	case ECONNREFUSED:
+		return "Remote host refused connection";
+	case EPROTO:
+		return "Protocol error";
+	case ESRCH:
+		return "Unmatched reply packet";
+	case ENOMEDIUM:
+		return "No medium found";
+	}
+	return "error";
+}
+
+#ifndef NDEBUG
+/* proc read handler: prints the current debugflags value in hex. */
+static int nbd_debug_read_proc(char *page, char **start, off_t offset,
+	int count, int *eof, void *data)
+{
+	int sz = 0;
+
+	*eof = 1;
+	sz += sprintf(page + sz, "%x\n", debugflags);
+	return sz;
+}
+
+/*
+ * proc write handler: parses a hex value from the written data and stores
+ * it in debugflags.
+ *
+ * @buffer is supplied by the writer, so it is first copied into a small
+ * NUL-terminated kernel buffer; input beyond that buffer is ignored.
+ * Returns @count on success, -EACCES without CAP_SYS_ADMIN, -EFAULT when
+ * the copy fails and -EINVAL when no hex number could be parsed.
+ */
+static int nbd_debug_write_proc(struct file *file, const char *buffer,
+	unsigned long count, void *data)
+{
+	char kbuf[16];
+	unsigned long len;
+	unsigned int newflags;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EACCES;
+	len = count;
+	if (len > sizeof(kbuf) - 1)
+		len = sizeof(kbuf) - 1;
+	if (copy_from_user(kbuf, buffer, len))
+		return -EFAULT;
+	kbuf[len] = '\0';
+	if (sscanf(kbuf, "%x", &newflags) != 1)
+		return -EINVAL;
+	printk(KERN_INFO DEVICE_NAME ": changing debugflags to 0x%x\n",
+		newflags);
+	debugflags = newflags;
+	return count;
+}
+#endif
+
+static int nbd_totals_read_proc(char *page, char **start, off_t offset,
+	int count, int *eof, void *data)
+{
+	int sz =
0; + + *eof = 1; + sz += sprintf(page + sz, " Requests in: %llu\n", requests_in); + sz += sprintf(page + sz, "Requests out: %llu\n", requests_out); + sz += sprintf(page + sz, " Queue loops: %llu\n", qhandler_loops); + return sz; +} + +static int nbd_device_read_proc(char *page, char **start, off_t offset, + int count, int *eof, void *data) +{ + nbd_device_t *lo = (nbd_device_t *)data; + int sz, n; + struct socket *sock; + struct file *file; + struct sockaddr_in sin; + pid_t rxpid, txpid, sspid; + + /* Careful! Buffer "page" is just that - a 1 page sized buffer! */ + sz = 0; + *eof = 1; + + sz += sprintf(page + sz, "Reference count: %d\n", + atomic_read(&lo->refcnt)); + sz += sprintf(page + sz, " Device flags: 0x%X\n", lo->flags); + sz += sprintf(page + sz, " Network status: %s (%d)\n", + strerror(lo->lasterr), lo->lasterr); + sz += sprintf(page + sz, " Disk status: %s (%d)\n", + strerror(lo->harderror), lo->harderror); + sz += sprintf(page + sz, "Device size (B): %llu\n", NBD_BYTESIZE(lo)); + sz += sprintf(page + sz, " Block size (B): %d\n", NBD_BLKSIZE(lo)); + sz += sprintf(page + sz, "Rx queue length: %u\n", lo->rx_queue.len); + sz += sprintf(page + sz, "Tx queue length: %u\n", lo->tx_queue.len); + sock = lo->sock; + if (sock) { + struct proto_ops *ops = sock->ops; + n = sizeof(sin); + sz += sprintf(page + sz, " Socket pointer: %p", lo->sock); + if (ops && ops->getname(sock, (struct sockaddr *)&sin, &n, 1) + == 0) + sz += sprintf(page + sz, " (0x%x:%d)", + ntohl(sin.sin_addr.s_addr), + ntohs(sin.sin_port)); + sz += sprintf(page + sz, "\n"); + } + memcpy(&sin, &lo->sin, sizeof(sin)); + if (sin.sin_addr.s_addr) + sz += sprintf(page + sz, " Remote address: 0x%x:%d\n", + ntohl(sin.sin_addr.s_addr), ntohs(sin.sin_port)); + file = lo->file; + if (file) + sz += sprintf(page + sz, " File pointer: %p\n", file); + + spin_lock(&lo->lock); + sspid = (lo->ss_thread.task)? lo->ss_thread.task->pid: 0; + rxpid = (lo->rx_thread.task)? 
lo->rx_thread.task->pid: 0;
+	txpid = (lo->tx_thread.task)? lo->tx_thread.task->pid: 0;
+	spin_unlock(&lo->lock);
+
+	if (sspid)
+		sz += sprintf(page + sz, "Ses. thread PID: %d\n", sspid);
+	if (rxpid)
+		sz += sprintf(page + sz, " Rx thread PID: %d\n", rxpid);
+	if (txpid)
+		sz += sprintf(page + sz, " Tx thread PID: %d\n", txpid);
+	n = atomic_read(&lo->num_io_threads);
+	if (n)
+		sz += sprintf(page + sz, " # I/O Threads: %d\n", n);
+	return sz;
+}
+
+/*
+ * Driver proc directory (created under proc_root_driver). Declared outside
+ * the NDEBUG guard because init_proc_fs()/exit_proc_fs() use it in every
+ * build.
+ */
+static struct proc_dir_entry *proc_array;
+
+#ifndef NDEBUG
+static struct proc_dir_entry *proc_debug = NULL;
+
+/* Create the read/write "debugflags" entry inside proc_array. */
+static void __init mk_debug_proc_entry(void)
+{
+	proc_debug = create_proc_entry("debugflags", S_IFREG|S_IRUSR|S_IWUSR,
+		proc_array);
+	if (!proc_debug)
+		return;
+	proc_debug->nlink = 1;
+	proc_debug->data = NULL;
+	proc_debug->read_proc = nbd_debug_read_proc;
+	proc_debug->write_proc = nbd_debug_write_proc;
+}
+
+/*
+ * Remove the "debugflags" entry if it was created. The second argument
+ * of remove_proc_entry() is the parent directory, i.e. proc_array, which
+ * is where mk_debug_proc_entry() created the entry.
+ */
+static void __exit rm_debug_proc_entry(void)
+{
+	if (proc_debug)
+		remove_proc_entry("debugflags", proc_array);
+}
+#endif /* NDEBUG */
+
+/*
+ * Create the driver's proc directory (if not yet present), the "totals"
+ * entry, the debug entry (non-NDEBUG builds) and one numbered entry per
+ * device. Does nothing further if the directory cannot be created.
+ */
+static void __init init_proc_fs(void)
+{
+	int i;
+	char devname[16];
+
+	if (!proc_array) {
+		proc_array = proc_mkdir(DEVICE_NAME, proc_root_driver);
+		if (!proc_array)
+			return;
+	}
+	create_proc_read_entry("totals", 0, proc_array,
+		nbd_totals_read_proc, NULL);
+#ifndef NDEBUG
+	mk_debug_proc_entry();
+#endif
+	for (i = 0; i < MAX_NBD; i++) {
+		sprintf(devname, "%d", i);
+		create_proc_read_entry(devname, 0, proc_array,
+			nbd_device_read_proc, nbd_devs + i);
+	}
+}
+
+/* Remove everything init_proc_fs() created, the directory last. */
+static void __exit exit_proc_fs(void)
+{
+	int i;
+	char devname[16];
+
+	for (i = 0; i < MAX_NBD; i++) {
+		sprintf(devname, "%d", i);
+		remove_proc_entry(devname, proc_array);
+	}
+#ifndef NDEBUG
+	rm_debug_proc_entry();
+#endif
+	remove_proc_entry("totals", proc_array);
+	remove_proc_entry(DEVICE_NAME, proc_root_driver);
+}
+#endif
+
+/*
+ * And here should be modules and kernel interface
+ * (Just smiley confuses emacs :-)
+ */
+
+static struct gendisk *nbd_alloc_disk(nbd_device_t *dev)
+{
+	
struct gendisk *disk = alloc_disk(1); + if (disk) { + int minor = DEVICE_TO_MINOR(dev); + disk->major = NBD_MAJOR; + disk->first_minor = minor; + disk->fops = &nbd_fops; + disk->private_data = dev; + disk->queue = &nbd_queue[minor]; + sprintf(disk->disk_name, "%s%d", DEVICE_NAME, minor); + sprintf(disk->devfs_name, "%s/%d", DEVICE_NAME, minor); + set_capacity(disk, dev->bytesize >> 9); + } + return disk; +} + +static void __init init_nbd_dev(nbd_device_t *dev) +{ +#ifdef PARANOIA + dev->magic = LO_MAGIC; +#endif + atomic_set(&dev->refcnt, 0); + dev->flags = 0; + spin_lock_init(&dev->lock); + dev->harderror = 0; + nbd_qsys_init(&dev->tx_queue); + nbd_qsys_init(&dev->rx_queue); + dev->file = NULL; + dev->sock = NULL; + memset(&dev->sin, 0, sizeof(dev->sin)); + dev->errcnt = 0; + dev->lasterr = ENOTCONN; + dev->closed = (RCV_SHUTDOWN|SEND_SHUTDOWN); + dev->ss_thread.task = NULL; + dev->tx_thread.task = NULL; + dev->rx_thread.task = NULL; + atomic_set(&dev->num_io_threads, 0); + init_waitqueue_head(&dev->no_io_waiters); + init_MUTEX(&dev->semalock); + dev->blksize = 1 << initial_blksize_bits; + dev->bytesize = initial_bytesize; + dev->disk = nbd_alloc_disk(dev); + if (dev->disk) + add_disk(dev->disk); +} + +static inline void parse_sin(char *str, struct sockaddr_in *sin) +{ + char *s = str; + u16 port = default_port; + + /* parse format like: "10.0.0.5[:30666]" */ + while (*s && *s != ':') + s++; + if (*s == ':') { + *s++ = '\0'; + sscanf(s, "%hu", &port); + } + sin->sin_family = PF_INET; + sin->sin_port = htons(port); + sin->sin_addr.s_addr = in_aton(str); +} + +static void __init parse_connects(void) +{ + int i, maxi; + + maxi = sizeof(connects)/sizeof(char *); + for (i = 0; i < maxi && connects[i]; i++) { + struct sockaddr_in sin; + int result; + nbd_device_t *lo = nbd_devs + i; + + dprintk(NBD_DEBUG_INIT, DEVICE_NAME ": connects[%d]=\"%s\"\n", + i, connects[i]); + parse_sin(connects[i], &sin); + result = nbd_set_sin(lo, &sin); + if (result < 0) + printk(KERN_ERR 
"nb%d: bad connect \"%s\" (%d)\n", + i, connects[i], result); + else { + lo->flags |= NBD_RESTARTABLE; + result = nbd_thread_start(lo, &lo->ss_thread, + session_loop); + if (result < 0) + printk(KERN_ERR "nb%d: can't start session manager (%d)\n", + i, result); + } + } +} + +static void __init parse_initial_size(void) +{ + if (!initial_size) + return; + dprintk(NBD_DEBUG_INIT, DEVICE_NAME ": initial_size=\"%s\"\n", + initial_size); + sscanf(initial_size, "%Lu", &initial_bytesize); +} + +int __init nbd_init(void) +{ + int i; + + if (sizeof(nbd_request_t) != 28) { + printk(KERN_CRIT DEVICE_NAME ": Size of nbd_request must be 28 bytes to work!\n"); + return -EIO; + } + if (sizeof(nbd_reply_t) != 16) { + printk(KERN_CRIT DEVICE_NAME ": Size of nbd_reply must be 16 bytes to work!\n"); + return -EIO; + } + + if (register_blkdev(MAJOR_NR, DEVICE_NAME)) { + printk(DEVICE_NAME ": Unable to get major number %d.\n", MAJOR_NR); + return -EIO; + } + requests_in = 0; + requests_out = 0; + qhandler_loops = 0; + + request_magic = htonl(NBD_REQUEST_MAGIC); + reply_magic = htonl(NBD_REPLY_MAGIC); + parse_initial_size(); + + printk(KERN_INFO DEVICE_NAME ": registered at major %d\n", MAJOR_NR); + dprintk(NBD_DEBUG_INIT, DEVICE_NAME ": debugflags=0x%x.\n", + debugflags); + dprintk(NBD_DEBUG_INIT, DEVICE_NAME ": default_port=%d.\n", + default_port); + dprintk(NBD_DEBUG_INIT, DEVICE_NAME ": initial_size=%llu.\n", + initial_bytesize); + dprintk(NBD_DEBUG_INIT, DEVICE_NAME ": initial_blksize_bits=%d.\n", + initial_blksize_bits); + for (i = 0; i < MAX_NBD; i++) { + nbd_lock[i] = SPIN_LOCK_UNLOCKED; + blk_init_queue(&nbd_queue[i], do_nbd_request, &nbd_lock[i]); + } + devfs_mk_dir(DEVICE_NAME); + for (i = 0; i < MAX_NBD; i++) { + init_nbd_dev(nbd_devs + i); + } + if (connects[0]) + parse_connects(); + +#ifdef CONFIG_PROC_FS + init_proc_fs(); +#endif + return 0; +} + +static void __exit dprintk_disk(int minor, char *msg, struct gendisk *disk) +{ + dprintk(NBD_DEBUG_EXIT, + "nb%d: %s %p: %d 
%d-%d ref#=%d q=%p q.e.ref#=%d\n", + minor, msg, disk, disk->major, disk->first_minor, + disk->first_minor + disk->minors - 1, + atomic_read(&disk->kobj.refcount), disk->queue, + atomic_read(&disk->queue->elevator.kobj.refcount)); +} + +static void __exit nbd_exit(void) +{ + int i; + + dprintk(NBD_DEBUG_EXIT, DEVICE_NAME ": %s called.\n", __FUNCTION__); + + for (i = 0; i < MAX_NBD; i++) { + if (nbd_devs[i].sin.sin_addr.s_addr) + nbd_clr_sin(nbd_devs + i); + else if (nbd_devs[i].file && nbd_devs[i].sock) { + nbd_disconnect(nbd_devs + i); + nbd_clear_sock(nbd_devs + i); + } + } +#ifdef CONFIG_PROC_FS + exit_proc_fs(); +#endif + { + int minor; + struct gendisk *disk; + for (minor = 0; minor < MAX_NBD; minor++) { + disk = nbd_devs[minor].disk; + if (disk) { + dprintk_disk(minor, "...del_gendisk", disk); + del_gendisk(disk); + dprintk_disk(minor, "...put_disk", disk); + put_disk(disk); + } + blk_cleanup_queue(&nbd_queue[minor]); + } + } + devfs_remove(DEVICE_NAME); + if (unregister_blkdev(MAJOR_NR, DEVICE_NAME) != 0) + printk(KERN_ERR DEVICE_NAME ": module cleanup failed!\n"); + else + printk(KERN_INFO DEVICE_NAME ": module exiting.\n"); +} + +module_init(nbd_init); +module_exit(nbd_exit); + +/* + * XXX Should the module author refer to the last developer to release + * this code, or the very first developer? + * This driver is very much a derivative of previous work by: + * Pavel Machek . + */ +MODULE_AUTHOR("Louis D. 
Langholtz"); + +MODULE_DESCRIPTION("Network Block Device"); +MODULE_LICENSE("GPL"); +MODULE_PARM(default_port, "i"); +MODULE_PARM_DESC(default_port, "default " DEVICE_NAME " service port number"); +MODULE_PARM(initial_size, "s"); +MODULE_PARM_DESC(initial_size, "initial byte size of " DEVICE_NAME " devices"); +MODULE_PARM(initial_blksize_bits, "i"); +MODULE_PARM_DESC(initial_blksize_bits, "initial block size in bits (9-12)"); +MODULE_PARM(connects, "1-64s"); +#ifndef NDEBUG +MODULE_PARM(debugflags, "i"); +MODULE_PARM_DESC(debugflags, "flags for controlling debugging output"); +#endif diff -urN linux-2.5.72/include/linux/nbd.h linux-2.5.72-new/include/linux/nbd.h --- linux-2.5.72/include/linux/nbd.h 2003-06-16 22:20:23.000000000 -0600 +++ linux-2.5.72-new/include/linux/nbd.h 2003-06-20 21:28:10.209232024 -0600 @@ -5,82 +5,233 @@ * 2001 Copyright (C) Steven Whitehouse * New nbd_end_request() for compatibility with new linux block * layer code. + * 2003 Copyright (C) Louis D. Langholtz ldl@aros.net + * Moved nbd_end_request like code to nbd.c, added structure + * type definitions, changed device structure to accommodate + * changes made to nbd.c (see nbd.c for more details). */ #ifndef LINUX_NBD_H #define LINUX_NBD_H -#define NBD_SET_SOCK _IO( 0xab, 0 ) -#define NBD_SET_BLKSIZE _IO( 0xab, 1 ) -#define NBD_SET_SIZE _IO( 0xab, 2 ) -#define NBD_DO_IT _IO( 0xab, 3 ) -#define NBD_CLEAR_SOCK _IO( 0xab, 4 ) -#define NBD_CLEAR_QUE _IO( 0xab, 5 ) -#define NBD_PRINT_DEBUG _IO( 0xab, 6 ) -#define NBD_SET_SIZE_BLOCKS _IO( 0xab, 7 ) -#define NBD_DISCONNECT _IO( 0xab, 8 ) - -enum { - NBD_CMD_READ = 0, - NBD_CMD_WRITE = 1, - NBD_CMD_DISC = 2 -}; - +/* + * Everything up till we check if MAJOR_NR is defined, is needed + * for kernel (driver) as well as user space tools. 
+ */ -#ifdef PARANOIA -extern int requests_in; -extern int requests_out; +#include + +#if !defined(__BYTE_ORDER) +# if defined(__LITTLE_ENDIAN) && !defined(__BIG_ENDIAN) +# define __BYTE_ORDER __LITTLE_ENDIAN +# else +# if defined(__BIG_ENDIAN) && !defined(__LITTLE_ENDIAN) +# define __BYTE_ORDER __BIG_ENDIAN +# else +# error Need to know __BYTE_ORDER! +# endif +# endif #endif -#define nbd_cmd(req) ((req)->cmd[0]) +#if __BYTE_ORDER == __BIG_ENDIAN +# define ntohll(x) (x) +# define htonll(x) (x) +#else +# if __BYTE_ORDER == __LITTLE_ENDIAN +# define ntohll(x) __constant_be64_to_cpu(x) +# define htonll(x) __constant_cpu_to_be64(x) +# else +# error Need to know __BYTE_ORDER! +# endif +#endif -#define MAX_NBD 128 +/* Define available ioctl requests... */ +#define NBD_SET_SOCK _IO( 0xab, 0 ) +#define NBD_SET_BLKSIZE _IO( 0xab, 1 ) +#define NBD_SET_SIZE _IO( 0xab, 2 ) +#define NBD_DO_IT _IO( 0xab, 3 ) +#define NBD_CLEAR_SOCK _IO( 0xab, 4 ) +#define NBD_CLEAR_QUE _IO( 0xab, 5 ) +#define NBD_PRINT_DEBUG _IO( 0xab, 6 ) +#define NBD_SET_SIZE_BLOCKS _IO( 0xab, 7 ) +#define NBD_DISCONNECT _IO( 0xab, 8 ) +#define NBD_SET_SIN _IO( 0xab, 9 ) +#define NBD_CLR_SIN _IO( 0xab, 10 ) +#define NBD_GET_SIN _IO( 0xab, 11 ) +#define NBD_SET_FLAGS _IO( 0xab, 12 ) +#define NBD_GET_FLAGS _IO( 0xab, 13 ) -struct nbd_device { - int refcnt; - int flags; - int harderror; /* Code of hard error */ -#define NBD_READ_ONLY 0x0001 -#define NBD_WRITE_NOCHK 0x0002 - struct socket * sock; - struct file * file; /* If == NULL, device is not ready, yet */ - int magic; /* FIXME: not if debugging is off */ - spinlock_t queue_lock; - struct list_head queue_head;/* Requests are added here... */ - struct semaphore tx_lock; - struct gendisk *disk; - int blksize; - int blksize_bits; - u64 bytesize; -}; +/* + * Define protocol data types, structures, and values for initial + * client/server handshake... + */ -/* This now IS in some kind of include file... 
*/ +#define NBD_DEFAULT_PORT 30666 +#define NBD_INIT_PASSWD "NBDMAGIC" -/* These are send over network in request/reply magic field */ +/* defined as an array so no need for byte swapping (ever) */ +#define NBD_PROTOMAGIC { 0x00, 0x00, 0x42, 0x02, 0x81, 0x86, 0x12, 0x53 } +typedef struct nbd_protomagic { unsigned char bytes[8]; } nbd_protomagic_t +#ifdef __GNUC__ + __attribute__ ((packed)) +#endif +; -#define NBD_REQUEST_MAGIC 0x25609513 -#define NBD_REPLY_MAGIC 0x67446698 -/* Do *not* use magics: 0x12560953 0x96744668. */ +typedef struct nbd_svr_info { + char initpwd[8]; /* Always "NBDMAGIC" */ + unsigned char magic[8]; /* Protocol magic (NBD_PROTOMAGIC) */ + uint64_t size; /* Size in bytes of server storage */ + unsigned char zeros[128]; +} nbd_svr_info_t +#ifdef __GNUC__ + __attribute__ ((packed)) +#endif +; /* - * This is packet used for communication between client and - * server. All data are in network byte order. + * Define protocol data types, structures, and values for servicing + * block requests... */ + +/* Do *not* use magics: 0x12560953 0x96744668. 
*/ +#define NBD_REQUEST_MAGIC 0x25609513 +#define NBD_REPLY_MAGIC 0x67446698 + struct nbd_request { - u32 magic; - u32 type; /* == READ || == WRITE */ - char handle[8]; - u64 from; - u32 len; + uint32_t magic; /* NBD_REQUEST_MAGIC */ + uint32_t type; /* == READ || == WRITE */ + char handle[8]; /* address of request for 32 or 64 bit arches */ + uint64_t from; /* in NBO */ + uint32_t len; /* in NBO */ } #ifdef __GNUC__ __attribute__ ((packed)) #endif ; +typedef struct nbd_request nbd_request_t; struct nbd_reply { - u32 magic; - u32 error; /* 0 = ok, else error */ + uint32_t magic; + uint32_t error; /* 0 = ok, else error */ char handle[8]; /* handle you got from request */ -}; +} +#ifdef __GNUC__ + __attribute__ ((packed)) #endif +; +typedef struct nbd_reply nbd_reply_t; + +/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * + * + * Values for nbd_device flags: set/get with NBD_SET_FLAGS/NBD_GET_FLAGS + * ioctl calls. Some of these flags are also manipulated by other system + * calls. + */ + +#define NBD_READ_ONLY 0x0001 + +/* + * Set NBD_WRITE_NOCHK when you care more about speed than data integrity. + * This flag has been available for some time but only recently was actually + * implemented. If you turn it on, the driver will immediately react to any + * write requests as though they have been successfully written onto the + * server, and will ignore the actual server response code. So if the server + * has a disk I/O error, the client won't know it unless it later verifies + * what's actually been saved to disk. + */ +#define NBD_WRITE_NOCHK 0x0002 + +#define NBD_RESTARTABLE 0x0004 + +/* + * Set NBD_NONBLOCKING only when you want network errors to be counted as + * non-transient errors. By default, this flag is off so that only actual + * disk errors are recorded as non-transient errors. When off, the driver + * blocks on network failures assuming that something transient - like a + * server reboot - is in progress. 
Root can call the NBD_CLEAR_QUE ioctl + * to cause any such blocked I/O operations to immediately fail. + */ +#define NBD_NONBLOCKING 0x0008 /* fail requests when disconnected */ + +/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * + * + * End of definitions shared with user space. + * From here on out, these definitions are only for kernel (driver). + */ + +#ifdef MAJOR_NR + +#include +#include + +#define LOCAL_END_REQUEST + +#include + +#define MAX_NBD 128 +#define NBD_NAME_MAXLEN 128 +#define NBD_MAX_PATHS (MAX_NBD * 2) + +typedef struct nbd_device nbd_device_t; +typedef struct nbd_thread nbd_thread_t; +typedef struct nbd_qsys nbd_qsys_t; + +struct nbd_thread { + struct task_struct *task; + struct completion startup; + struct completion finish; +}; + +struct nbd_qsys { + spinlock_t lock; + unsigned int len; + struct list_head head; + wait_queue_head_t waiters; +}; + +/* + * The nbd_device struct is a block device association with a remote + * served block device. 
+ */ +struct nbd_device { +#ifdef PARANOIA + unsigned int magic; +#endif /* PARANOIA */ + atomic_t refcnt; /* parent rw, child no */ + unsigned int flags; /* parent rw, child no */ + spinlock_t lock; /* for atomic access to (rx|tx)_thread.task */ + int harderror; /* Code of hard error */ + nbd_qsys_t tx_queue; + nbd_qsys_t rx_queue; + struct socket *sock; /* parent rw, child r */ + struct file *file; /* saved from userland via fget() */ + struct sockaddr_in sin; + unsigned int errcnt; /* to count path dependent errors */ + int lasterr; /* last error for path (not request) */ + int closed; /* parent rw, child rw */ + nbd_thread_t ss_thread; /* session thread */ + nbd_thread_t tx_thread; /* transmit I/O thread */ + nbd_thread_t rx_thread; /* receive I/O thread */ + atomic_t num_io_threads; /* count of existing I/O threads (0-2) */ + wait_queue_head_t no_io_waiters; + struct semaphore semalock; /* protect parent level rw access */ + struct gendisk *disk; + int blksize; + u64 bytesize; +}; + +/* nbd_device flags not settable (nor interpretable) by users */ +#define NBD_CLEARED 0x0010 + +static inline unsigned long nbd_read_only(nbd_device_t *lo) +{ + return lo->flags & NBD_READ_ONLY; +} + +static inline unsigned long nbd_write_nochk(nbd_device_t *lo) +{ + return lo->flags & NBD_WRITE_NOCHK; +} + +#endif /* MAJOR_NR */ +#endif /* LINUX_NBD_H */