[PATCHSET 0/8] exofs RAID0 support

All of lore.kernel.org
 help / color / mirror / Atom feed

* [PATCHSET 0/8] exofs RAID0 support
@ 2010-02-01 12:32 Boaz Harrosh
  2010-02-01 14:56 ` [PATCH 1/8] exofs: debug print even less Boaz Harrosh
                   ` (7 more replies)
  0 siblings, 8 replies; 9+ messages in thread
From: Boaz Harrosh @ 2010-02-01 12:32 UTC (permalink / raw)
  To: open-osd, linux-fsdevel, pNFS Mailing List

Submitted for review a striped IO support in exofs. Borrowing from pnfs-objects-layout,
stripping can be optionally combined with mirroring.

The code was tested and passes tests. An associated patch to pnfs-objects-layout and
pnfsd-exofs which enables striping export over pnfs, is also tested. (Not included in
this patchset)

So without further ado:
[PATCH 1/8] exofs: debug print even less
[PATCH 2/8] exofs: Micro-optimize exofs_i_info

	A couple of patches for next Kernel, not really related.

[PATCH 3/8] exofs: Recover in the case of read-passed-end-of-file
[PATCH 4/8] exofs: Move layout related members to a layout structure
[PATCH 5/8] exofs: unindent exofs_sbi_read

	These are some preliminary preparations that stand by themselves

[PATCH 6/8] exofs: Define on-disk per-inode optional layout attribute
[PATCH 7/8] exofs: RAID0 support

	Finally the RAID0 support

[PATCH 8/8] exofs: convert io_state to use pages array instead of bio at input

	And, a now requested change

Thank you for reviewing
Boaz

^ permalink raw reply	[flat|nested] 9+ messages in thread

* [PATCH 1/8] exofs: debug print even less
  2010-02-01 12:32 [PATCHSET 0/8] exofs RAID0 support Boaz Harrosh
@ 2010-02-01 14:56 ` Boaz Harrosh
  2010-02-01 14:57 ` [PATCH 2/8] exofs: Micro-optimize exofs_i_info Boaz Harrosh
                   ` (6 subsequent siblings)
  7 siblings, 0 replies; 9+ messages in thread
From: Boaz Harrosh @ 2010-02-01 14:56 UTC (permalink / raw)
  To: open-osd, pNFS Mailing List, linux-fsdevel

* Last debug trimming left in some stupid print, remove them.
  Fixup some other prints
* Shift printing from inode.c to ios.c
* Add couple of prints when memory allocation fails.

Signed-off-by: Boaz Harrosh <bharrosh@panasas.com>
---
 fs/exofs/inode.c |   23 +++++++++++++----------
 fs/exofs/ios.c   |   38 ++++++++++++++++++++++++++++++++------
 2 files changed, 45 insertions(+), 16 deletions(-)

diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c
index 2afbceb..c88a0c5 100644
--- a/fs/exofs/inode.c
+++ b/fs/exofs/inode.c
@@ -193,7 +193,7 @@ static int __readpages_done(struct page_collect *pcol, bool do_unlock)
 	else
 		good_bytes = pcol->length - resid;
 
-	EXOFS_DBGMSG("readpages_done(0x%lx) good_bytes=0x%llx"
+	EXOFS_DBGMSG2("readpages_done(0x%lx) good_bytes=0x%llx"
 		     " length=0x%lx nr_pages=%u\n",
 		     pcol->inode->i_ino, _LLU(good_bytes), pcol->length,
 		     pcol->nr_pages);
@@ -222,7 +222,7 @@ static int __readpages_done(struct page_collect *pcol, bool do_unlock)
 	}
 
 	pcol_free(pcol);
-	EXOFS_DBGMSG("readpages_done END\n");
+	EXOFS_DBGMSG2("readpages_done END\n");
 	return ret;
 }
 
@@ -290,7 +290,7 @@ static int read_exec(struct page_collect *pcol, bool is_sync)
 
 	atomic_inc(&pcol->sbi->s_curr_pending);
 
-	EXOFS_DBGMSG("read_exec obj=0x%llx start=0x%llx length=0x%lx\n",
+	EXOFS_DBGMSG2("read_exec obj=0x%llx start=0x%llx length=0x%lx\n",
 		  ios->obj.id, _LLU(ios->offset), pcol->length);
 
 	/* pages ownership was passed to pcol_copy */
@@ -462,7 +462,7 @@ static void writepages_done(struct exofs_io_state *ios, void *p)
 	else
 		good_bytes = pcol->length - resid;
 
-	EXOFS_DBGMSG("writepages_done(0x%lx) good_bytes=0x%llx"
+	EXOFS_DBGMSG2("writepages_done(0x%lx) good_bytes=0x%llx"
 		     " length=0x%lx nr_pages=%u\n",
 		     pcol->inode->i_ino, _LLU(good_bytes), pcol->length,
 		     pcol->nr_pages);
@@ -490,7 +490,7 @@ static void writepages_done(struct exofs_io_state *ios, void *p)
 
 	pcol_free(pcol);
 	kfree(pcol);
-	EXOFS_DBGMSG("writepages_done END\n");
+	EXOFS_DBGMSG2("writepages_done END\n");
 }
 
 static int write_exec(struct page_collect *pcol)
@@ -527,7 +527,7 @@ static int write_exec(struct page_collect *pcol)
 	}
 
 	atomic_inc(&pcol->sbi->s_curr_pending);
-	EXOFS_DBGMSG("write_exec(0x%lx, 0x%llx) start=0x%llx length=0x%lx\n",
+	EXOFS_DBGMSG2("write_exec(0x%lx, 0x%llx) start=0x%llx length=0x%lx\n",
 		  pcol->inode->i_ino, pcol->pg_first, _LLU(ios->offset),
 		  pcol->length);
 	/* pages ownership was passed to pcol_copy */
@@ -616,7 +616,7 @@ try_again:
 
 	ret = pcol_add_page(pcol, page, len);
 	if (unlikely(ret)) {
-		EXOFS_DBGMSG("Failed pcol_add_page "
+		EXOFS_DBGMSG2("Failed pcol_add_page "
 			     "nr_pages=%u total_length=0x%lx\n",
 			     pcol->nr_pages, pcol->length);
 
@@ -663,7 +663,7 @@ static int exofs_writepages(struct address_space *mapping,
 	if (expected_pages < 32L)
 		expected_pages = 32L;
 
-	EXOFS_DBGMSG("inode(0x%lx) wbc->start=0x%llx wbc->end=0x%llx "
+	EXOFS_DBGMSG2("inode(0x%lx) wbc->start=0x%llx wbc->end=0x%llx "
 		     "nrpages=%lu start=0x%lx end=0x%lx expected_pages=%ld\n",
 		     mapping->host->i_ino, wbc->range_start, wbc->range_end,
 		     mapping->nrpages, start, end, expected_pages);
@@ -1170,8 +1170,10 @@ static int exofs_update_inode(struct inode *inode, int do_sync)
 	int ret;
 
 	args = kzalloc(sizeof(*args), GFP_KERNEL);
-	if (!args)
+	if (!args) {
+		EXOFS_DBGMSG("Faild kzalloc of args\n");
 		return -ENOMEM;
+	}
 
 	fcb = &args->fcb;
 
@@ -1234,7 +1236,8 @@ static int exofs_update_inode(struct inode *inode, int do_sync)
 free_args:
 	kfree(args);
 out:
-	EXOFS_DBGMSG("ret=>%d\n", ret);
+	EXOFS_DBGMSG("(0x%lx) do_sync=%d ret=>%d\n",
+		     inode->i_ino, do_sync, ret);
 	return ret;
 }
 
diff --git a/fs/exofs/ios.c b/fs/exofs/ios.c
index 5bad01f..3cc0dd3 100644
--- a/fs/exofs/ios.c
+++ b/fs/exofs/ios.c
@@ -26,6 +26,9 @@
 
 #include "exofs.h"
 
+#define EXOFS_DBGMSG2(M...) do {} while (0)
+/* #define EXOFS_DBGMSG2 EXOFS_DBGMSG */
+
 void exofs_make_credential(u8 cred_a[OSD_CAP_LEN], const struct osd_obj_id *obj)
 {
 	osd_sec_init_nosec_doall_caps(cred_a, obj, false, true);
@@ -73,6 +76,8 @@ int exofs_get_io_state(struct exofs_sb_info *sbi, struct exofs_io_state** pios)
 	 */
 	ios = kzalloc(exofs_io_state_size(sbi->s_numdevs), GFP_KERNEL);
 	if (unlikely(!ios)) {
+		EXOFS_DBGMSG("Faild kzalloc bytes=%d\n",
+			     exofs_io_state_size(sbi->s_numdevs));
 		*pios = NULL;
 		return -ENOMEM;
 	}
@@ -276,6 +281,9 @@ int exofs_sbi_write(struct exofs_io_state *ios)
 				bio = bio_kmalloc(GFP_KERNEL,
 						  ios->bio->bi_max_vecs);
 				if (unlikely(!bio)) {
+					EXOFS_DBGMSG(
+					      "Faild to allocate BIO size=%u\n",
+					      ios->bio->bi_max_vecs);
 					ret = -ENOMEM;
 					goto out;
 				}
@@ -290,14 +298,21 @@ int exofs_sbi_write(struct exofs_io_state *ios)
 
 			osd_req_write(or, &ios->obj, ios->offset, bio,
 				      ios->length);
-/*			EXOFS_DBGMSG("write sync=%d\n", sync);*/
+			EXOFS_DBGMSG("write(0x%llx) offset=0x%llx "
+				      "length=0x%llx dev=%d\n",
+				     _LLU(ios->obj.id), _LLU(ios->offset),
+				     _LLU(ios->length), i);
 		} else if (ios->kern_buff) {
 			osd_req_write_kern(or, &ios->obj, ios->offset,
 					   ios->kern_buff, ios->length);
-/*			EXOFS_DBGMSG("write_kern sync=%d\n", sync);*/
+			EXOFS_DBGMSG2("write_kern(0x%llx) offset=0x%llx "
+				      "length=0x%llx dev=%d\n",
+				     _LLU(ios->obj.id), _LLU(ios->offset),
+				     _LLU(ios->length), i);
 		} else {
 			osd_req_set_attributes(or, &ios->obj);
-/*			EXOFS_DBGMSG("set_attributes sync=%d\n", sync);*/
+			EXOFS_DBGMSG2("obj(0x%llx) set_attributes=%d dev=%d\n",
+				     _LLU(ios->obj.id), ios->out_attr_len, i);
 		}
 
 		if (ios->out_attr)
@@ -335,14 +350,25 @@ int exofs_sbi_read(struct exofs_io_state *ios)
 		if (ios->bio) {
 			osd_req_read(or, &ios->obj, ios->offset, ios->bio,
 				     ios->length);
-/*			EXOFS_DBGMSG("read sync=%d\n", sync);*/
+			EXOFS_DBGMSG("read(0x%llx) offset=0x%llx length=0x%llx"
+				      " dev=%d\n", _LLU(ios->obj.id),
+				     _LLU(ios->offset),
+				     _LLU(ios->length),
+				     first_dev);
 		} else if (ios->kern_buff) {
 			osd_req_read_kern(or, &ios->obj, ios->offset,
 					   ios->kern_buff, ios->length);
-/*			EXOFS_DBGMSG("read_kern sync=%d\n", sync);*/
+			EXOFS_DBGMSG2("read_kern(0x%llx) offset=0x%llx "
+				      "length=0x%llx dev=%d\n",
+				     _LLU(ios->obj.id),
+				     _LLU(ios->offset),
+				     _LLU(ios->length),
+				     first_dev);
 		} else {
 			osd_req_get_attributes(or, &ios->obj);
-/*			EXOFS_DBGMSG("get_attributes sync=%d\n", sync);*/
+			EXOFS_DBGMSG2("obj(0x%llx) get_attributes=%d dev=%d\n",
+				     _LLU(ios->obj.id), ios->in_attr_len,
+				     first_dev);
 		}
 
 		if (ios->out_attr)
-- 
1.6.6


^ permalink raw reply related	[flat|nested] 9+ messages in thread

* [PATCH 2/8] exofs: Micro-optimize exofs_i_info
  2010-02-01 12:32 [PATCHSET 0/8] exofs RAID0 support Boaz Harrosh
  2010-02-01 14:56 ` [PATCH 1/8] exofs: debug print even less Boaz Harrosh
@ 2010-02-01 14:57 ` Boaz Harrosh
  2010-02-01 14:57 ` [PATCH 3/8] exofs: Recover in the case of read-passed-end-of-file Boaz Harrosh
                   ` (5 subsequent siblings)
  7 siblings, 0 replies; 9+ messages in thread
From: Boaz Harrosh @ 2010-02-01 14:57 UTC (permalink / raw)
  To: open-osd, pNFS Mailing List, linux-fsdevel

optimize the exofs_i_info struct usage by moving the embedded
vfs_inode to be first. A compiler might optimize away an "add"
operation with constant zero. (Which it cannot with other constants)

Signed-off-by: Boaz Harrosh <bharrosh@panasas.com>
---
 fs/exofs/exofs.h |    4 ++--
 1 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/fs/exofs/exofs.h b/fs/exofs/exofs.h
index c35fd46..13663da 100644
--- a/fs/exofs/exofs.h
+++ b/fs/exofs/exofs.h
@@ -78,13 +78,13 @@ struct exofs_sb_info {
  * our extension to the in-memory inode
  */
 struct exofs_i_info {
+	struct inode   vfs_inode;          /* normal in-memory inode          */
+	wait_queue_head_t i_wq;            /* wait queue for inode            */
 	unsigned long  i_flags;            /* various atomic flags            */
 	uint32_t       i_data[EXOFS_IDATA];/*short symlink names and device #s*/
 	uint32_t       i_dir_start_lookup; /* which page to start lookup      */
-	wait_queue_head_t i_wq;            /* wait queue for inode            */
 	uint64_t       i_commit_size;      /* the object's written length     */
 	uint8_t        i_cred[OSD_CAP_LEN];/* all-powerful credential         */
-	struct inode   vfs_inode;          /* normal in-memory inode          */
 };
 
 static inline osd_id exofs_oi_objno(struct exofs_i_info *oi)
-- 
1.6.6


^ permalink raw reply related	[flat|nested] 9+ messages in thread

* [PATCH 3/8] exofs: Recover in the case of read-passed-end-of-file
  2010-02-01 12:32 [PATCHSET 0/8] exofs RAID0 support Boaz Harrosh
  2010-02-01 14:56 ` [PATCH 1/8] exofs: debug print even less Boaz Harrosh
  2010-02-01 14:57 ` [PATCH 2/8] exofs: Micro-optimize exofs_i_info Boaz Harrosh
@ 2010-02-01 14:57 ` Boaz Harrosh
  2010-02-01 14:57 ` [PATCH 4/8] exofs: Move layout related members to a layout structure Boaz Harrosh
                   ` (4 subsequent siblings)
  7 siblings, 0 replies; 9+ messages in thread
From: Boaz Harrosh @ 2010-02-01 14:57 UTC (permalink / raw)
  To: open-osd, pNFS Mailing List, linux-fsdevel

In check_io, implement the case of reading passed end of
file, by clearing the pages and recover with no error. In
a raid arrangement this can become a legitimate situation
in case of holes in the file.

Signed-off-by: Boaz Harrosh <bharrosh@panasas.com>
---
 fs/exofs/ios.c |   36 ++++++++++++++++++++++++++++++------
 1 files changed, 30 insertions(+), 6 deletions(-)

diff --git a/fs/exofs/ios.c b/fs/exofs/ios.c
index 3cc0dd3..439c5d0 100644
--- a/fs/exofs/ios.c
+++ b/fs/exofs/ios.c
@@ -173,6 +173,21 @@ static int exofs_io_execute(struct exofs_io_state *ios)
 	return ret;
 }
 
+static void _clear_bio(struct bio *bio)
+{
+	struct bio_vec *bv;
+	unsigned i;
+
+	__bio_for_each_segment(bv, bio, i, 0) {
+		unsigned this_count = bv->bv_len;
+
+		if (likely(PAGE_SIZE == this_count))
+			clear_highpage(bv->bv_page);
+		else
+			zero_user(bv->bv_page, bv->bv_offset, this_count);
+	}
+}
+
 int exofs_check_io(struct exofs_io_state *ios, u64 *resid)
 {
 	enum osd_err_priority acumulated_osd_err = 0;
@@ -181,16 +196,25 @@ int exofs_check_io(struct exofs_io_state *ios, u64 *resid)
 
 	for (i = 0; i < ios->numdevs; i++) {
 		struct osd_sense_info osi;
-		int ret = osd_req_decode_sense(ios->per_dev[i].or, &osi);
+		struct osd_request *or = ios->per_dev[i].or;
+		int ret;
+
+		if (unlikely(!or))
+			continue;
 
+		ret = osd_req_decode_sense(or, &osi);
 		if (likely(!ret))
 			continue;
 
-		if (unlikely(ret == -EFAULT)) {
-			EXOFS_DBGMSG("%s: EFAULT Need page clear\n", __func__);
-			/*FIXME: All the pages in this device range should:
-			 *	clear_highpage(page);
-			 */
+		if (OSD_ERR_PRI_CLEAR_PAGES == osi.osd_err_pri) {
+			/* start read offset passed endof file */
+			_clear_bio(ios->per_dev[i].bio);
+			EXOFS_DBGMSG("start read offset passed end of file "
+				"offset=0x%llx, length=0x%llx\n",
+				_LLU(ios->offset),
+				_LLU(ios->length));
+
+			continue; /* we recovered */
 		}
 
 		if (osi.osd_err_pri >= acumulated_osd_err) {
-- 
1.6.6


^ permalink raw reply related	[flat|nested] 9+ messages in thread

* [PATCH 4/8] exofs: Move layout related members to a layout structure
  2010-02-01 12:32 [PATCHSET 0/8] exofs RAID0 support Boaz Harrosh
                   ` (2 preceding siblings ...)
  2010-02-01 14:57 ` [PATCH 3/8] exofs: Recover in the case of read-passed-end-of-file Boaz Harrosh
@ 2010-02-01 14:57 ` Boaz Harrosh
  2010-02-01 14:57 ` [PATCH 5/8] exofs: unindent exofs_sbi_read Boaz Harrosh
                   ` (3 subsequent siblings)
  7 siblings, 0 replies; 9+ messages in thread
From: Boaz Harrosh @ 2010-02-01 14:57 UTC (permalink / raw)
  To: open-osd, pNFS Mailing List, linux-fsdevel

* Abstract away those members in exofs_sb_info that are related/needed
  by a layout into a new exofs_layout structure. Embed it in exofs_sb_info.

* At exofs_io_state receive/keep a pointer to an exofs_layout. No need for
  an exofs_sb_info pointer, all we need is at exofs_layout.

* Change any usage of above exofs_sb_info members to their new name.

Signed-off-by: Boaz Harrosh <bharrosh@panasas.com>
---
 fs/exofs/exofs.h |   24 ++++++++++++++++++------
 fs/exofs/inode.c |   14 +++++++-------
 fs/exofs/ios.c   |   36 +++++++++++++++++++-----------------
 fs/exofs/super.c |   51 +++++++++++++++++++++++++++------------------------
 4 files changed, 71 insertions(+), 54 deletions(-)

diff --git a/fs/exofs/exofs.h b/fs/exofs/exofs.h
index 13663da..33c6856 100644
--- a/fs/exofs/exofs.h
+++ b/fs/exofs/exofs.h
@@ -55,12 +55,18 @@
 /* u64 has problems with printk this will cast it to unsigned long long */
 #define _LLU(x) (unsigned long long)(x)
 
+struct exofs_layout {
+	osd_id		s_pid;			/* partition ID of file system*/
+
+	unsigned	s_numdevs;		/* Num of devices in array    */
+	struct osd_dev	*s_ods[0];		/* Variable length            */
+};
+
 /*
  * our extension to the in-memory superblock
  */
 struct exofs_sb_info {
 	struct exofs_fscb s_fscb;		/* Written often, pre-allocate*/
-	osd_id		s_pid;			/* partition ID of file system*/
 	int		s_timeout;		/* timeout for OSD operations */
 	uint64_t	s_nextid;		/* highest object ID used     */
 	uint32_t	s_numfiles;		/* number of files on fs      */
@@ -69,9 +75,14 @@ struct exofs_sb_info {
 	atomic_t	s_curr_pending;		/* number of pending commands */
 	uint8_t		s_cred[OSD_CAP_LEN];	/* credential for the fscb    */
 
-	struct pnfs_osd_data_map data_map;	/* Default raid to use        */
-	unsigned	s_numdevs;		/* Num of devices in array    */
-	struct osd_dev	*s_ods[1];		/* Variable length, minimum 1 */
+	struct pnfs_osd_data_map data_map;	/* Default raid to use
+						 * FIXME: Needed ?
+						 */
+/*	struct exofs_layout	dir_layout;*/	/* Default dir layout */
+	struct exofs_layout	layout;		/* Default files layout,
+						 * contains the variable osd_dev
+						 * array. Keep last */
+	struct osd_dev	*_min_one_dev[1];	/* Place holder for one dev   */
 };
 
 /*
@@ -101,7 +112,7 @@ struct exofs_io_state {
 	void			*private;
 	exofs_io_done_fn	done;
 
-	struct exofs_sb_info	*sbi;
+	struct exofs_layout	*layout;
 	struct osd_obj_id	obj;
 	u8			*cred;
 
@@ -189,7 +200,8 @@ void exofs_make_credential(u8 cred_a[OSD_CAP_LEN],
 int exofs_read_kern(struct osd_dev *od, u8 *cred, struct osd_obj_id *obj,
 		    u64 offset, void *p, unsigned length);
 
-int  exofs_get_io_state(struct exofs_sb_info *sbi, struct exofs_io_state** ios);
+int  exofs_get_io_state(struct exofs_layout *layout,
+			struct exofs_io_state **ios);
 void exofs_put_io_state(struct exofs_io_state *ios);
 
 int exofs_check_io(struct exofs_io_state *ios, u64 *resid);
diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c
index c88a0c5..03189a9 100644
--- a/fs/exofs/inode.c
+++ b/fs/exofs/inode.c
@@ -65,7 +65,7 @@ static void _pcol_init(struct page_collect *pcol, unsigned expected_pages,
 	/* Create master bios on first Q, later on cloning, each clone will be
 	 * allocated on it's destination Q
 	 */
-	pcol->req_q = osd_request_queue(sbi->s_ods[0]);
+	pcol->req_q = osd_request_queue(sbi->layout.s_ods[0]);
 	pcol->inode = inode;
 	pcol->expected_pages = expected_pages;
 
@@ -99,7 +99,7 @@ static int pcol_try_alloc(struct page_collect *pcol)
 			  BIO_MAX_PAGES_KMALLOC);
 
 	if (!pcol->ios) { /* First time allocate io_state */
-		int ret = exofs_get_io_state(pcol->sbi, &pcol->ios);
+		int ret = exofs_get_io_state(&pcol->sbi->layout, &pcol->ios);
 
 		if (ret)
 			return ret;
@@ -872,7 +872,7 @@ static int exofs_get_inode(struct super_block *sb, struct exofs_i_info *oi,
 	int ret;
 
 	*obj_size = ~0;
-	ret = exofs_get_io_state(sbi, &ios);
+	ret = exofs_get_io_state(&sbi->layout, &ios);
 	if (unlikely(ret)) {
 		EXOFS_ERR("%s: exofs_get_io_state failed.\n", __func__);
 		return ret;
@@ -1043,7 +1043,7 @@ static void create_done(struct exofs_io_state *ios, void *p)
 
 	if (unlikely(ret)) {
 		EXOFS_ERR("object=0x%llx creation faild in pid=0x%llx",
-			  _LLU(exofs_oi_objno(oi)), _LLU(sbi->s_pid));
+			  _LLU(exofs_oi_objno(oi)), _LLU(sbi->layout.s_pid));
 		/*TODO: When FS is corrupted creation can fail, object already
 		 * exist. Get rid of this asynchronous creation, if exist
 		 * increment the obj counter and try the next object. Until we
@@ -1104,7 +1104,7 @@ struct inode *exofs_new_inode(struct inode *dir, int mode)
 
 	mark_inode_dirty(inode);
 
-	ret = exofs_get_io_state(sbi, &ios);
+	ret = exofs_get_io_state(&sbi->layout, &ios);
 	if (unlikely(ret)) {
 		EXOFS_ERR("exofs_new_inode: exofs_get_io_state failed\n");
 		return ERR_PTR(ret);
@@ -1202,7 +1202,7 @@ static int exofs_update_inode(struct inode *inode, int do_sync)
 	} else
 		memcpy(fcb->i_data, oi->i_data, sizeof(fcb->i_data));
 
-	ret = exofs_get_io_state(sbi, &ios);
+	ret = exofs_get_io_state(&sbi->layout, &ios);
 	if (unlikely(ret)) {
 		EXOFS_ERR("%s: exofs_get_io_state failed.\n", __func__);
 		goto free_args;
@@ -1286,7 +1286,7 @@ void exofs_delete_inode(struct inode *inode)
 
 	clear_inode(inode);
 
-	ret = exofs_get_io_state(sbi, &ios);
+	ret = exofs_get_io_state(&sbi->layout, &ios);
 	if (unlikely(ret)) {
 		EXOFS_ERR("%s: exofs_get_io_state failed\n", __func__);
 		return;
diff --git a/fs/exofs/ios.c b/fs/exofs/ios.c
index 439c5d0..83e54a7 100644
--- a/fs/exofs/ios.c
+++ b/fs/exofs/ios.c
@@ -67,23 +67,24 @@ out:
 	return ret;
 }
 
-int exofs_get_io_state(struct exofs_sb_info *sbi, struct exofs_io_state** pios)
+int exofs_get_io_state(struct exofs_layout *layout,
+		       struct exofs_io_state **pios)
 {
 	struct exofs_io_state *ios;
 
 	/*TODO: Maybe use kmem_cach per sbi of size
-	 * exofs_io_state_size(sbi->s_numdevs)
+	 * exofs_io_state_size(layout->s_numdevs)
 	 */
-	ios = kzalloc(exofs_io_state_size(sbi->s_numdevs), GFP_KERNEL);
+	ios = kzalloc(exofs_io_state_size(layout->s_numdevs), GFP_KERNEL);
 	if (unlikely(!ios)) {
 		EXOFS_DBGMSG("Faild kzalloc bytes=%d\n",
-			     exofs_io_state_size(sbi->s_numdevs));
+			     exofs_io_state_size(layout->s_numdevs));
 		*pios = NULL;
 		return -ENOMEM;
 	}
 
-	ios->sbi = sbi;
-	ios->obj.partition = sbi->s_pid;
+	ios->layout = layout;
+	ios->obj.partition = layout->s_pid;
 	*pios = ios;
 	return 0;
 }
@@ -238,10 +239,10 @@ int exofs_sbi_create(struct exofs_io_state *ios)
 {
 	int i, ret;
 
-	for (i = 0; i < ios->sbi->s_numdevs; i++) {
+	for (i = 0; i < ios->layout->s_numdevs; i++) {
 		struct osd_request *or;
 
-		or = osd_start_request(ios->sbi->s_ods[i], GFP_KERNEL);
+		or = osd_start_request(ios->layout->s_ods[i], GFP_KERNEL);
 		if (unlikely(!or)) {
 			EXOFS_ERR("%s: osd_start_request failed\n", __func__);
 			ret = -ENOMEM;
@@ -262,10 +263,10 @@ int exofs_sbi_remove(struct exofs_io_state *ios)
 {
 	int i, ret;
 
-	for (i = 0; i < ios->sbi->s_numdevs; i++) {
+	for (i = 0; i < ios->layout->s_numdevs; i++) {
 		struct osd_request *or;
 
-		or = osd_start_request(ios->sbi->s_ods[i], GFP_KERNEL);
+		or = osd_start_request(ios->layout->s_ods[i], GFP_KERNEL);
 		if (unlikely(!or)) {
 			EXOFS_ERR("%s: osd_start_request failed\n", __func__);
 			ret = -ENOMEM;
@@ -286,10 +287,10 @@ int exofs_sbi_write(struct exofs_io_state *ios)
 {
 	int i, ret;
 
-	for (i = 0; i < ios->sbi->s_numdevs; i++) {
+	for (i = 0; i < ios->layout->s_numdevs; i++) {
 		struct osd_request *or;
 
-		or = osd_start_request(ios->sbi->s_ods[i], GFP_KERNEL);
+		or = osd_start_request(ios->layout->s_ods[i], GFP_KERNEL);
 		if (unlikely(!or)) {
 			EXOFS_ERR("%s: osd_start_request failed\n", __func__);
 			ret = -ENOMEM;
@@ -361,8 +362,9 @@ int exofs_sbi_read(struct exofs_io_state *ios)
 		struct osd_request *or;
 		unsigned first_dev = (unsigned)ios->obj.id;
 
-		first_dev %= ios->sbi->s_numdevs;
-		or = osd_start_request(ios->sbi->s_ods[first_dev], GFP_KERNEL);
+		first_dev %= ios->layout->s_numdevs;
+		or = osd_start_request(ios->layout->s_ods[first_dev],
+				       GFP_KERNEL);
 		if (unlikely(!or)) {
 			EXOFS_ERR("%s: osd_start_request failed\n", __func__);
 			ret = -ENOMEM;
@@ -438,7 +440,7 @@ int exofs_oi_truncate(struct exofs_i_info *oi, u64 size)
 	__be64 newsize;
 	int i, ret;
 
-	if (exofs_get_io_state(sbi, &ios))
+	if (exofs_get_io_state(&sbi->layout, &ios))
 		return -ENOMEM;
 
 	ios->obj.id = exofs_oi_objno(oi);
@@ -448,10 +450,10 @@ int exofs_oi_truncate(struct exofs_i_info *oi, u64 size)
 	attr = g_attr_logical_length;
 	attr.val_ptr = &newsize;
 
-	for (i = 0; i < sbi->s_numdevs; i++) {
+	for (i = 0; i < sbi->layout.s_numdevs; i++) {
 		struct osd_request *or;
 
-		or = osd_start_request(sbi->s_ods[i], GFP_KERNEL);
+		or = osd_start_request(sbi->layout.s_ods[i], GFP_KERNEL);
 		if (unlikely(!or)) {
 			EXOFS_ERR("%s: osd_start_request failed\n", __func__);
 			ret = -ENOMEM;
diff --git a/fs/exofs/super.c b/fs/exofs/super.c
index a1d1e77..fc88751 100644
--- a/fs/exofs/super.c
+++ b/fs/exofs/super.c
@@ -210,7 +210,7 @@ int exofs_sync_fs(struct super_block *sb, int wait)
 	sbi = sb->s_fs_info;
 	fscb = &sbi->s_fscb;
 
-	ret = exofs_get_io_state(sbi, &ios);
+	ret = exofs_get_io_state(&sbi->layout, &ios);
 	if (ret)
 		goto out;
 
@@ -264,12 +264,12 @@ static void _exofs_print_device(const char *msg, const char *dev_path,
 
 void exofs_free_sbi(struct exofs_sb_info *sbi)
 {
-	while (sbi->s_numdevs) {
-		int i = --sbi->s_numdevs;
-		struct osd_dev *od = sbi->s_ods[i];
+	while (sbi->layout.s_numdevs) {
+		int i = --sbi->layout.s_numdevs;
+		struct osd_dev *od = sbi->layout.s_ods[i];
 
 		if (od) {
-			sbi->s_ods[i] = NULL;
+			sbi->layout.s_ods[i] = NULL;
 			osduld_put_device(od);
 		}
 	}
@@ -298,7 +298,8 @@ static void exofs_put_super(struct super_block *sb)
 				  msecs_to_jiffies(100));
 	}
 
-	_exofs_print_device("Unmounting", NULL, sbi->s_ods[0], sbi->s_pid);
+	_exofs_print_device("Unmounting", NULL, sbi->layout.s_ods[0],
+			    sbi->layout.s_pid);
 
 	exofs_free_sbi(sbi);
 	sb->s_fs_info = NULL;
@@ -361,7 +362,7 @@ static int exofs_read_lookup_dev_table(struct exofs_sb_info **psbi,
 {
 	struct exofs_sb_info *sbi = *psbi;
 	struct osd_dev *fscb_od;
-	struct osd_obj_id obj = {.partition = sbi->s_pid,
+	struct osd_obj_id obj = {.partition = sbi->layout.s_pid,
 				 .id = EXOFS_DEVTABLE_ID};
 	struct exofs_device_table *dt;
 	unsigned table_bytes = table_count * sizeof(dt->dt_dev_table[0]) +
@@ -376,9 +377,9 @@ static int exofs_read_lookup_dev_table(struct exofs_sb_info **psbi,
 		return -ENOMEM;
 	}
 
-	fscb_od = sbi->s_ods[0];
-	sbi->s_ods[0] = NULL;
-	sbi->s_numdevs = 0;
+	fscb_od = sbi->layout.s_ods[0];
+	sbi->layout.s_ods[0] = NULL;
+	sbi->layout.s_numdevs = 0;
 	ret = exofs_read_kern(fscb_od, sbi->s_cred, &obj, 0, dt, table_bytes);
 	if (unlikely(ret)) {
 		EXOFS_ERR("ERROR: reading device table\n");
@@ -397,14 +398,15 @@ static int exofs_read_lookup_dev_table(struct exofs_sb_info **psbi,
 		goto out;
 
 	if (likely(numdevs > 1)) {
-		unsigned size = numdevs * sizeof(sbi->s_ods[0]);
+		unsigned size = numdevs * sizeof(sbi->layout.s_ods[0]);
 
 		sbi = krealloc(sbi, sizeof(*sbi) + size, GFP_KERNEL);
 		if (unlikely(!sbi)) {
 			ret = -ENOMEM;
 			goto out;
 		}
-		memset(&sbi->s_ods[1], 0, size - sizeof(sbi->s_ods[0]));
+		memset(&sbi->layout.s_ods[1], 0,
+		       size - sizeof(sbi->layout.s_ods[0]));
 		*psbi = sbi;
 	}
 
@@ -427,8 +429,8 @@ static int exofs_read_lookup_dev_table(struct exofs_sb_info **psbi,
 		 * line. We always keep them in device-table order.
 		 */
 		if (fscb_od && osduld_device_same(fscb_od, &odi)) {
-			sbi->s_ods[i] = fscb_od;
-			++sbi->s_numdevs;
+			sbi->layout.s_ods[i] = fscb_od;
+			++sbi->layout.s_numdevs;
 			fscb_od = NULL;
 			continue;
 		}
@@ -441,8 +443,8 @@ static int exofs_read_lookup_dev_table(struct exofs_sb_info **psbi,
 			goto out;
 		}
 
-		sbi->s_ods[i] = od;
-		++sbi->s_numdevs;
+		sbi->layout.s_ods[i] = od;
+		++sbi->layout.s_numdevs;
 
 		/* Read the fscb of the other devices to make sure the FS
 		 * partition is there.
@@ -499,9 +501,10 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent)
 		goto free_sbi;
 	}
 
-	sbi->s_ods[0] = od;
-	sbi->s_numdevs = 1;
-	sbi->s_pid = opts->pid;
+	/* Default layout in case we do not have a device-table */
+	sbi->layout.s_ods[0] = od;
+	sbi->layout.s_numdevs = 1;
+	sbi->layout.s_pid = opts->pid;
 	sbi->s_timeout = opts->timeout;
 
 	/* fill in some other data by hand */
@@ -514,7 +517,7 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent)
 	sb->s_bdev = NULL;
 	sb->s_dev = 0;
 
-	obj.partition = sbi->s_pid;
+	obj.partition = sbi->layout.s_pid;
 	obj.id = EXOFS_SUPER_ID;
 	exofs_make_credential(sbi->s_cred, &obj);
 
@@ -578,13 +581,13 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent)
 		goto free_sbi;
 	}
 
-	_exofs_print_device("Mounting", opts->dev_name, sbi->s_ods[0],
-			    sbi->s_pid);
+	_exofs_print_device("Mounting", opts->dev_name, sbi->layout.s_ods[0],
+			    sbi->layout.s_pid);
 	return 0;
 
 free_sbi:
 	EXOFS_ERR("Unable to mount exofs on %s pid=0x%llx err=%d\n",
-		  opts->dev_name, sbi->s_pid, ret);
+		  opts->dev_name, sbi->layout.s_pid, ret);
 	exofs_free_sbi(sbi);
 	return ret;
 }
@@ -627,7 +630,7 @@ static int exofs_statfs(struct dentry *dentry, struct kstatfs *buf)
 	uint8_t cred_a[OSD_CAP_LEN];
 	int ret;
 
-	ret = exofs_get_io_state(sbi, &ios);
+	ret = exofs_get_io_state(&sbi->layout, &ios);
 	if (ret) {
 		EXOFS_DBGMSG("exofs_get_io_state failed.\n");
 		return ret;
-- 
1.6.6


^ permalink raw reply related	[flat|nested] 9+ messages in thread

* [PATCH 5/8] exofs: unindent exofs_sbi_read
  2010-02-01 12:32 [PATCHSET 0/8] exofs RAID0 support Boaz Harrosh
                   ` (3 preceding siblings ...)
  2010-02-01 14:57 ` [PATCH 4/8] exofs: Move layout related members to a layout structure Boaz Harrosh
@ 2010-02-01 14:57 ` Boaz Harrosh
  2010-02-01 14:58 ` [PATCH 6/8] exofs: Define on-disk per-inode optional layout attribute Boaz Harrosh
                   ` (2 subsequent siblings)
  7 siblings, 0 replies; 9+ messages in thread
From: Boaz Harrosh @ 2010-02-01 14:57 UTC (permalink / raw)
  To: open-osd, pNFS Mailing List, linux-fsdevel

The original idea was that a mirror read can be sub-divided
to multiple devices. But this has very little gain and only
at very large IOes so it's not going to be implemented soon.

Signed-off-by: Boaz Harrosh <bharrosh@panasas.com>
---
 fs/exofs/ios.c |   87 ++++++++++++++++++++++++-------------------------------
 1 files changed, 38 insertions(+), 49 deletions(-)

diff --git a/fs/exofs/ios.c b/fs/exofs/ios.c
index 83e54a7..4f67931 100644
--- a/fs/exofs/ios.c
+++ b/fs/exofs/ios.c
@@ -356,59 +356,48 @@ out:
 
 int exofs_sbi_read(struct exofs_io_state *ios)
 {
-	int i, ret;
-
-	for (i = 0; i < 1; i++) {
-		struct osd_request *or;
-		unsigned first_dev = (unsigned)ios->obj.id;
-
-		first_dev %= ios->layout->s_numdevs;
-		or = osd_start_request(ios->layout->s_ods[first_dev],
-				       GFP_KERNEL);
-		if (unlikely(!or)) {
-			EXOFS_ERR("%s: osd_start_request failed\n", __func__);
-			ret = -ENOMEM;
-			goto out;
-		}
-		ios->per_dev[i].or = or;
-		ios->numdevs++;
+	struct osd_request *or;
+	struct exofs_per_dev_state *per_dev = &ios->per_dev[0];
+	unsigned first_dev = (unsigned)ios->obj.id;
 
-		if (ios->bio) {
-			osd_req_read(or, &ios->obj, ios->offset, ios->bio,
-				     ios->length);
-			EXOFS_DBGMSG("read(0x%llx) offset=0x%llx length=0x%llx"
-				      " dev=%d\n", _LLU(ios->obj.id),
-				     _LLU(ios->offset),
-				     _LLU(ios->length),
-				     first_dev);
-		} else if (ios->kern_buff) {
-			osd_req_read_kern(or, &ios->obj, ios->offset,
-					   ios->kern_buff, ios->length);
-			EXOFS_DBGMSG2("read_kern(0x%llx) offset=0x%llx "
-				      "length=0x%llx dev=%d\n",
-				     _LLU(ios->obj.id),
-				     _LLU(ios->offset),
-				     _LLU(ios->length),
-				     first_dev);
-		} else {
-			osd_req_get_attributes(or, &ios->obj);
-			EXOFS_DBGMSG2("obj(0x%llx) get_attributes=%d dev=%d\n",
-				     _LLU(ios->obj.id), ios->in_attr_len,
-				     first_dev);
-		}
+	first_dev %= ios->layout->s_numdevs;
+	or = osd_start_request(ios->layout->s_ods[first_dev], GFP_KERNEL);
+	if (unlikely(!or)) {
+		EXOFS_ERR("%s: osd_start_request failed\n", __func__);
+		return -ENOMEM;
+	}
+	per_dev->or = or;
+	ios->numdevs++;
+
+	if (ios->bio) {
+		osd_req_read(or, &ios->obj, ios->offset, ios->bio, ios->length);
+		EXOFS_DBGMSG("read(0x%llx) offset=0x%llx length=0x%llx"
+			     " dev=%d\n", _LLU(ios->obj.id),
+			     _LLU(ios->offset), _LLU(ios->length),
+			     first_dev);
+	} else if (ios->kern_buff) {
+		int ret = osd_req_read_kern(or, &ios->obj, ios->offset,
+					    ios->kern_buff, ios->length);
+
+		EXOFS_DBGMSG2("read_kern(0x%llx) offset=0x%llx "
+			      "length=0x%llx dev=%d ret=>%d\n",
+			      _LLU(ios->obj.id), _LLU(ios->offset),
+			      _LLU(ios->length), first_dev, ret);
+		if (unlikely(ret))
+			return ret;
+	} else {
+		osd_req_get_attributes(or, &ios->obj);
+		EXOFS_DBGMSG2("obj(0x%llx) get_attributes=%d dev=%d\n",
+			      _LLU(ios->obj.id), ios->in_attr_len, first_dev);
+	}
 
-		if (ios->out_attr)
-			osd_req_add_set_attr_list(or, ios->out_attr,
-						  ios->out_attr_len);
+	if (ios->out_attr)
+		osd_req_add_set_attr_list(or, ios->out_attr, ios->out_attr_len);
 
-		if (ios->in_attr)
-			osd_req_add_get_attr_list(or, ios->in_attr,
-						  ios->in_attr_len);
-	}
-	ret = exofs_io_execute(ios);
+	if (ios->in_attr)
+		osd_req_add_get_attr_list(or, ios->in_attr, ios->in_attr_len);
 
-out:
-	return ret;
+	return exofs_io_execute(ios);
 }
 
 int extract_attr_from_ios(struct exofs_io_state *ios, struct osd_attr *attr)
-- 
1.6.6


^ permalink raw reply related	[flat|nested] 9+ messages in thread

* [PATCH 6/8] exofs: Define on-disk per-inode optional layout attribute
  2010-02-01 12:32 [PATCHSET 0/8] exofs RAID0 support Boaz Harrosh
                   ` (4 preceding siblings ...)
  2010-02-01 14:57 ` [PATCH 5/8] exofs: unindent exofs_sbi_read Boaz Harrosh
@ 2010-02-01 14:58 ` Boaz Harrosh
  2010-02-01 14:58 ` [PATCH 7/8] exofs: RAID0 support Boaz Harrosh
  2010-02-01 14:58 ` [PATCH 8/8] exofs: convert io_state to use pages array instead of bio at input Boaz Harrosh
  7 siblings, 0 replies; 9+ messages in thread
From: Boaz Harrosh @ 2010-02-01 14:58 UTC (permalink / raw)
  To: open-osd, pNFS Mailing List, linux-fsdevel

* Layouts describe the way a file is spread on multiple devices.
  The layout information is stored in the objects attribute introduced
  in this patch.

* There can be multiple generating function for the layout.
  Currently defined:
    - No attribute present - use below moving-window on global
      device table, all devices.
      (This is the only one currently used in exofs)
    - an obj_id generated moving window - the obj_id is a randomizing
      factor in the otherwise global map layout.
    - An explicit layout stored, including a data_map and a device
      index list.
    - More might be defined in future ...

* There are two attributes defined of the same structure:
  A-data-files-layout - This layout is used by data-files. If present
                        at a directory, all files of that directory will
                        be created with this layout.
  A-meta-data-layout - This layout is used by a directory and other
                       meta-data information. Also inherited at creation
                       of subdirectories.

* At creation time inodes are created with the layout specified above.
  A usermode utility may change the creation layout on a give directory
  or file. Which in the case of directories, will also apply to newly
  created files/subdirectories, children of that directory.
  In the simple unaltered case of a newly created exofs, no layout
  attributes are present, and all layouts adhere to the layout specified
  at the device-table.

* In case of a future file system loaded in an old exofs-driver.
  At iget(), the generating_function is inspected and if not supported
  will return an IO error to the application and the inode will not
  be loaded. So not to damage any data.
  Note: After this patch we do not yet support any type of layout
        only the RAID0 patch that enables striping at the super-block
        level will add support for RAID0 layouts above. This way we
        are past and future compatible and fully bisectable.

* Access to the device table is done by an accessor since
  it will change according to above information.

Signed-off-by: Boaz Harrosh <bharrosh@panasas.com>
---
 fs/exofs/common.h |   39 ++++++++++++++++++++++++++++++++++++
 fs/exofs/exofs.h  |    6 +++++
 fs/exofs/inode.c  |   56 ++++++++++++++++++++++++++++++++++++++++++++++++----
 fs/exofs/ios.c    |   23 +++++++++++++++++----
 4 files changed, 114 insertions(+), 10 deletions(-)

diff --git a/fs/exofs/common.h b/fs/exofs/common.h
index b1b178e..f0d5203 100644
--- a/fs/exofs/common.h
+++ b/fs/exofs/common.h
@@ -55,6 +55,8 @@
 /* exofs Application specific page/attribute */
 # define EXOFS_APAGE_FS_DATA	(OSD_APAGE_APP_DEFINED_FIRST + 3)
 # define EXOFS_ATTR_INODE_DATA	1
+# define EXOFS_ATTR_INODE_FILE_LAYOUT	2
+# define EXOFS_ATTR_INODE_DIR_LAYOUT	3
 
 /*
  * The maximum number of files we can have is limited by the size of the
@@ -206,4 +208,41 @@ enum {
 	(((name_len) + offsetof(struct exofs_dir_entry, name)  + \
 	  EXOFS_DIR_ROUND) & ~EXOFS_DIR_ROUND)
 
+/*
+ * The on-disk (optional) layout structure.
+ * sits in an EXOFS_ATTR_INODE_FILE_LAYOUT or EXOFS_ATTR_INODE_DIR_LAYOUT
+ * attribute, attached to any inode, usually to a directory.
+ */
+
+enum exofs_inode_layout_gen_functions {
+	LAYOUT_MOVING_WINDOW = 0,
+	LAYOUT_IMPLICT = 1,
+};
+
+struct exofs_on_disk_inode_layout {
+	__le16 gen_func; /* One of enum exofs_inode_layout_gen_functions */
+	__le16 pad;
+	union {
+		/* gen_func == LAYOUT_MOVING_WINDOW (default) */
+		struct exofs_layout_sliding_window {
+			__le32 num_devices; /* first n devices in global-table*/
+		} sliding_window __packed;
+
+		/* gen_func == LAYOUT_IMPLICT */
+		struct exofs_layout_implict_list {
+			struct exofs_dt_data_map data_map;
+			/* Variable array of size data_map.cb_num_comps. These
+			 * are device indexes of the devices in the global table
+			 */
+			__le32 dev_indexes[];
+		} implict __packed;
+	};
+} __packed;
+
+static inline size_t exofs_on_disk_inode_layout_size(unsigned max_devs)
+{
+	return sizeof(struct exofs_on_disk_inode_layout) +
+		max_devs * sizeof(__le32);
+}
+
 #endif /*ifndef __EXOFS_COM_H__*/
diff --git a/fs/exofs/exofs.h b/fs/exofs/exofs.h
index 33c6856..09e3319 100644
--- a/fs/exofs/exofs.h
+++ b/fs/exofs/exofs.h
@@ -186,6 +186,12 @@ static inline struct exofs_i_info *exofs_i(struct inode *inode)
 }
 
 /*
+ * Given a layout, object_number and stripe_index return the associated global
+ * dev_index
+ */
+unsigned exofs_layout_od_id(struct exofs_layout *layout,
+			    osd_id obj_no, unsigned layout_index);
+/*
  * Maximum count of links to a file
  */
 #define EXOFS_LINK_MAX           32000
diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c
index 03189a9..0163546 100644
--- a/fs/exofs/inode.c
+++ b/fs/exofs/inode.c
@@ -859,6 +859,15 @@ int exofs_setattr(struct dentry *dentry, struct iattr *iattr)
 	return error;
 }
 
+static const struct osd_attr g_attr_inode_file_layout = ATTR_DEF(
+	EXOFS_APAGE_FS_DATA,
+	EXOFS_ATTR_INODE_FILE_LAYOUT,
+	0);
+static const struct osd_attr g_attr_inode_dir_layout = ATTR_DEF(
+	EXOFS_APAGE_FS_DATA,
+	EXOFS_ATTR_INODE_DIR_LAYOUT,
+	0);
+
 /*
  * Read an inode from the OSD, and return it as is.  We also return the size
  * attribute in the 'obj_size' argument.
@@ -867,11 +876,16 @@ static int exofs_get_inode(struct super_block *sb, struct exofs_i_info *oi,
 		    struct exofs_fcb *inode, uint64_t *obj_size)
 {
 	struct exofs_sb_info *sbi = sb->s_fs_info;
-	struct osd_attr attrs[2];
+	struct osd_attr attrs[] = {
+		[0] = g_attr_inode_data,
+		[1] = g_attr_inode_file_layout,
+		[2] = g_attr_inode_dir_layout,
+		[3] = g_attr_logical_length,
+	};
 	struct exofs_io_state *ios;
+	struct exofs_on_disk_inode_layout *layout;
 	int ret;
 
-	*obj_size = ~0;
 	ret = exofs_get_io_state(&sbi->layout, &ios);
 	if (unlikely(ret)) {
 		EXOFS_ERR("%s: exofs_get_io_state failed.\n", __func__);
@@ -882,8 +896,9 @@ static int exofs_get_inode(struct super_block *sb, struct exofs_i_info *oi,
 	exofs_make_credential(oi->i_cred, &ios->obj);
 	ios->cred = oi->i_cred;
 
-	attrs[0] = g_attr_inode_data;
-	attrs[1] = g_attr_logical_length;
+	attrs[1].len = exofs_on_disk_inode_layout_size(sbi->layout.s_numdevs);
+	attrs[2].len = exofs_on_disk_inode_layout_size(sbi->layout.s_numdevs);
+
 	ios->in_attr = attrs;
 	ios->in_attr_len = ARRAY_SIZE(attrs);
 
@@ -901,11 +916,42 @@ static int exofs_get_inode(struct super_block *sb, struct exofs_i_info *oi,
 
 	ret = extract_attr_from_ios(ios, &attrs[1]);
 	if (ret) {
+		EXOFS_ERR("%s: extract_attr of inode_data failed\n", __func__);
+		goto out;
+	}
+	if (attrs[1].len) {
+		layout = attrs[1].val_ptr;
+		if (layout->gen_func != cpu_to_le16(LAYOUT_MOVING_WINDOW)) {
+			EXOFS_ERR("%s: unsupported files layout %d\n",
+				__func__, layout->gen_func);
+			ret = -ENOTSUPP;
+			goto out;
+		}
+	}
+
+	ret = extract_attr_from_ios(ios, &attrs[2]);
+	if (ret) {
+		EXOFS_ERR("%s: extract_attr of inode_data failed\n", __func__);
+		goto out;
+	}
+	if (attrs[2].len) {
+		layout = attrs[2].val_ptr;
+		if (layout->gen_func != cpu_to_le16(LAYOUT_MOVING_WINDOW)) {
+			EXOFS_ERR("%s: unsupported meta-data layout %d\n",
+				__func__, layout->gen_func);
+			ret = -ENOTSUPP;
+			goto out;
+		}
+	}
+
+	*obj_size = ~0;
+	ret = extract_attr_from_ios(ios, &attrs[3]);
+	if (ret) {
 		EXOFS_ERR("%s: extract_attr of logical_length failed\n",
 			  __func__);
 		goto out;
 	}
-	*obj_size = get_unaligned_be64(attrs[1].val_ptr);
+	*obj_size = get_unaligned_be64(attrs[3].val_ptr);
 
 out:
 	exofs_put_io_state(ios);
diff --git a/fs/exofs/ios.c b/fs/exofs/ios.c
index 4f67931..2b81f99 100644
--- a/fs/exofs/ios.c
+++ b/fs/exofs/ios.c
@@ -107,6 +107,19 @@ void exofs_put_io_state(struct exofs_io_state *ios)
 	}
 }
 
+unsigned exofs_layout_od_id(struct exofs_layout *layout,
+			    osd_id obj_no, unsigned layout_index)
+{
+	return layout_index;
+}
+
+static inline struct osd_dev *exofs_ios_od(struct exofs_io_state *ios,
+					   unsigned layout_index)
+{
+	return ios->layout->s_ods[
+		exofs_layout_od_id(ios->layout, ios->obj.id, layout_index)];
+}
+
 static void _sync_done(struct exofs_io_state *ios, void *p)
 {
 	struct completion *waiting = p;
@@ -242,7 +255,7 @@ int exofs_sbi_create(struct exofs_io_state *ios)
 	for (i = 0; i < ios->layout->s_numdevs; i++) {
 		struct osd_request *or;
 
-		or = osd_start_request(ios->layout->s_ods[i], GFP_KERNEL);
+		or = osd_start_request(exofs_ios_od(ios, i), GFP_KERNEL);
 		if (unlikely(!or)) {
 			EXOFS_ERR("%s: osd_start_request failed\n", __func__);
 			ret = -ENOMEM;
@@ -266,7 +279,7 @@ int exofs_sbi_remove(struct exofs_io_state *ios)
 	for (i = 0; i < ios->layout->s_numdevs; i++) {
 		struct osd_request *or;
 
-		or = osd_start_request(ios->layout->s_ods[i], GFP_KERNEL);
+		or = osd_start_request(exofs_ios_od(ios, i), GFP_KERNEL);
 		if (unlikely(!or)) {
 			EXOFS_ERR("%s: osd_start_request failed\n", __func__);
 			ret = -ENOMEM;
@@ -290,7 +303,7 @@ int exofs_sbi_write(struct exofs_io_state *ios)
 	for (i = 0; i < ios->layout->s_numdevs; i++) {
 		struct osd_request *or;
 
-		or = osd_start_request(ios->layout->s_ods[i], GFP_KERNEL);
+		or = osd_start_request(exofs_ios_od(ios, i), GFP_KERNEL);
 		if (unlikely(!or)) {
 			EXOFS_ERR("%s: osd_start_request failed\n", __func__);
 			ret = -ENOMEM;
@@ -361,7 +374,7 @@ int exofs_sbi_read(struct exofs_io_state *ios)
 	unsigned first_dev = (unsigned)ios->obj.id;
 
 	first_dev %= ios->layout->s_numdevs;
-	or = osd_start_request(ios->layout->s_ods[first_dev], GFP_KERNEL);
+	or = osd_start_request(exofs_ios_od(ios, first_dev), GFP_KERNEL);
 	if (unlikely(!or)) {
 		EXOFS_ERR("%s: osd_start_request failed\n", __func__);
 		return -ENOMEM;
@@ -442,7 +455,7 @@ int exofs_oi_truncate(struct exofs_i_info *oi, u64 size)
 	for (i = 0; i < sbi->layout.s_numdevs; i++) {
 		struct osd_request *or;
 
-		or = osd_start_request(sbi->layout.s_ods[i], GFP_KERNEL);
+		or = osd_start_request(exofs_ios_od(ios, i), GFP_KERNEL);
 		if (unlikely(!or)) {
 			EXOFS_ERR("%s: osd_start_request failed\n", __func__);
 			ret = -ENOMEM;
-- 
1.6.6


^ permalink raw reply related	[flat|nested] 9+ messages in thread

* [PATCH 7/8] exofs: RAID0 support
  2010-02-01 12:32 [PATCHSET 0/8] exofs RAID0 support Boaz Harrosh
                   ` (5 preceding siblings ...)
  2010-02-01 14:58 ` [PATCH 6/8] exofs: Define on-disk per-inode optional layout attribute Boaz Harrosh
@ 2010-02-01 14:58 ` Boaz Harrosh
  2010-02-01 14:58 ` [PATCH 8/8] exofs: convert io_state to use pages array instead of bio at input Boaz Harrosh
  7 siblings, 0 replies; 9+ messages in thread
From: Boaz Harrosh @ 2010-02-01 14:58 UTC (permalink / raw)
  To: open-osd, pNFS Mailing List, linux-fsdevel

We now support striping over mirror devices. Including variable sized
stripe_unit.

Some limits:
* stripe_unit must be a multiple of PAGE_SIZE
* stripe_unit * stripe_count is maximum upto 32-bit (4Gb)

Tested RAID0 over mirrors, RAID0 only, mirrors only. All check.

Design notes:
* I'm not using a vectored raid-engine mechanism yet. Following the
  pnfs-objects-layout data-map structure, "Mirror" is just a private
  case of "group_width" == 1, and RAID0 is a private case of
  "Mirrors" == 1. The performance lose of the general case over the
  particular special case optimization is totally negligible, also
  considering the extra code size.

* In general I added a prepare_stripes() stage that divides the
  to-be-io pages to the participating devices, the previous
  exofs_ios_write/read, now becomes _write/read_mirrors and a new
  write/read upper layer loops on all devices calling
  _write/read_mirrors. Effectively the prepare_stripes stage is the all
  secret.
  Also truncate need fixing to accommodate for striping.

* In a RAID0 arrangement, in a regular usage scenario, if all inode
  layouts will start at the same device, the small files fill up the
  first device and the later devices stay empty, the farther the device
  the emptier it is.

  To fix that, each inode will start at a different stripe_unit,
  according to it's obj_id modulus number-of-stripe-units. And
  will then span all stripe-units in the same incrementing order
  wrapping back to the beginning of the device table. We call it
  a stripe-units moving window.

  Special consideration was taken to keep all devices in a mirror
  arrangement identical. So a broken osd-device could just be cloned
  from one of the mirrors and no FS scrubbing is needed. (We do that
  by rotating stripe-unit at a time and not a single device at a time.)

TODO:
 We no longer verify object_length == inode->i_size in exofs_iget.
 (since i_size is stripped on multiple objects now).
 I should introduce a multiple-device attribute reading, and use
 it in exofs_iget.

Signed-off-by: Boaz Harrosh <bharrosh@panasas.com>
---
 fs/exofs/exofs.h |   14 +++
 fs/exofs/inode.c |   26 +----
 fs/exofs/ios.c   |  327 +++++++++++++++++++++++++++++++++++++++++++++---------
 fs/exofs/super.c |   52 ++++++++-
 4 files changed, 336 insertions(+), 83 deletions(-)

diff --git a/fs/exofs/exofs.h b/fs/exofs/exofs.h
index 09e3319..5828bde 100644
--- a/fs/exofs/exofs.h
+++ b/fs/exofs/exofs.h
@@ -58,6 +58,17 @@
 struct exofs_layout {
 	osd_id		s_pid;			/* partition ID of file system*/
 
+	/* Our way of looking at the data_map */
+	unsigned stripe_unit;
+	unsigned mirrors_p1;
+
+	unsigned group_width; /* Without data integ components */
+	unsigned data_integ; /* 0, 1, or 2 */
+	unsigned group_depth;
+	unsigned group_count;
+
+	enum exofs_inode_layout_gen_functions lay_func;
+
 	unsigned	s_numdevs;		/* Num of devices in array    */
 	struct osd_dev	*s_ods[0];		/* Variable length            */
 };
@@ -133,6 +144,9 @@ struct exofs_io_state {
 	struct exofs_per_dev_state {
 		struct osd_request *or;
 		struct bio *bio;
+		loff_t offset;
+		unsigned length;
+		unsigned dev;
 	} per_dev[];
 };
 
diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c
index 0163546..2b3163e 100644
--- a/fs/exofs/inode.c
+++ b/fs/exofs/inode.c
@@ -869,18 +869,17 @@ static const struct osd_attr g_attr_inode_dir_layout = ATTR_DEF(
 	0);
 
 /*
- * Read an inode from the OSD, and return it as is.  We also return the size
- * attribute in the 'obj_size' argument.
+ * Read the Linux inode info from the OSD, and return it as is. In exofs the
+ * inode info is in an application specific page/attribute of the osd-object.
  */
 static int exofs_get_inode(struct super_block *sb, struct exofs_i_info *oi,
-		    struct exofs_fcb *inode, uint64_t *obj_size)
+		    struct exofs_fcb *inode)
 {
 	struct exofs_sb_info *sbi = sb->s_fs_info;
 	struct osd_attr attrs[] = {
 		[0] = g_attr_inode_data,
 		[1] = g_attr_inode_file_layout,
 		[2] = g_attr_inode_dir_layout,
-		[3] = g_attr_logical_length,
 	};
 	struct exofs_io_state *ios;
 	struct exofs_on_disk_inode_layout *layout;
@@ -944,15 +943,6 @@ static int exofs_get_inode(struct super_block *sb, struct exofs_i_info *oi,
 		}
 	}
 
-	*obj_size = ~0;
-	ret = extract_attr_from_ios(ios, &attrs[3]);
-	if (ret) {
-		EXOFS_ERR("%s: extract_attr of logical_length failed\n",
-			  __func__);
-		goto out;
-	}
-	*obj_size = get_unaligned_be64(attrs[3].val_ptr);
-
 out:
 	exofs_put_io_state(ios);
 	return ret;
@@ -971,7 +961,6 @@ struct inode *exofs_iget(struct super_block *sb, unsigned long ino)
 	struct exofs_i_info *oi;
 	struct exofs_fcb fcb;
 	struct inode *inode;
-	uint64_t obj_size;
 	int ret;
 
 	inode = iget_locked(sb, ino);
@@ -983,7 +972,7 @@ struct inode *exofs_iget(struct super_block *sb, unsigned long ino)
 	__oi_init(oi);
 
 	/* read the inode from the osd */
-	ret = exofs_get_inode(sb, oi, &fcb, &obj_size);
+	ret = exofs_get_inode(sb, oi, &fcb);
 	if (ret)
 		goto bad_inode;
 
@@ -1004,13 +993,6 @@ struct inode *exofs_iget(struct super_block *sb, unsigned long ino)
 	inode->i_blkbits = EXOFS_BLKSHIFT;
 	inode->i_generation = le32_to_cpu(fcb.i_generation);
 
-	if ((inode->i_size != obj_size) &&
-		(!exofs_inode_is_fast_symlink(inode))) {
-		EXOFS_ERR("WARNING: Size of inode=%llu != object=%llu\n",
-			  inode->i_size, _LLU(obj_size));
-		/* FIXME: call exofs_inode_recovery() */
-	}
-
 	oi->i_dir_start_lookup = 0;
 
 	if ((inode->i_nlink == 0) && (inode->i_mode == 0)) {
diff --git a/fs/exofs/ios.c b/fs/exofs/ios.c
index 2b81f99..6e446b2 100644
--- a/fs/exofs/ios.c
+++ b/fs/exofs/ios.c
@@ -23,6 +23,7 @@
  */
 
 #include <scsi/scsi_device.h>
+#include <asm/div64.h>
 
 #include "exofs.h"
 
@@ -110,7 +111,17 @@ void exofs_put_io_state(struct exofs_io_state *ios)
 unsigned exofs_layout_od_id(struct exofs_layout *layout,
 			    osd_id obj_no, unsigned layout_index)
 {
-	return layout_index;
+/*	switch (layout->lay_func) {
+	case LAYOUT_MOVING_WINDOW:
+	{*/
+		unsigned dev_mod = obj_no;
+
+		return (layout_index + dev_mod * layout->mirrors_p1) %
+							      layout->s_numdevs;
+/*	}
+	case LAYOUT_FUNC_IMPLICT:
+		return layout->devs[layout_index];
+	}*/
 }
 
 static inline struct osd_dev *exofs_ios_od(struct exofs_io_state *ios,
@@ -225,8 +236,8 @@ int exofs_check_io(struct exofs_io_state *ios, u64 *resid)
 			_clear_bio(ios->per_dev[i].bio);
 			EXOFS_DBGMSG("start read offset passed end of file "
 				"offset=0x%llx, length=0x%llx\n",
-				_LLU(ios->offset),
-				_LLU(ios->length));
+				_LLU(ios->per_dev[i].offset),
+				_LLU(ios->per_dev[i].length));
 
 			continue; /* we recovered */
 		}
@@ -248,6 +259,127 @@ int exofs_check_io(struct exofs_io_state *ios, u64 *resid)
 	return acumulated_lin_err;
 }
 
+/* REMOVEME: After review
+   Some quoteing from the standard
+
+   L = logical offset into the file
+   W = number of data components in a stripe
+   S = W * stripe_unit (S is Stripe length)
+   N = L / S (N is the stripe Number)
+   C = (L-(N*S)) / stripe_unit (C is the component)
+   O = (N*stripe_unit)+(L%stripe_unit) (O is the object's offset)
+*/
+
+static void _offset_dev_unit_off(struct exofs_io_state *ios, u64 file_offset,
+			u64 *obj_offset, unsigned *dev, unsigned *unit_off)
+{
+	unsigned stripe_unit = ios->layout->stripe_unit;
+	unsigned stripe_length = stripe_unit * ios->layout->group_width;
+	u64 stripe_no = file_offset;
+	unsigned stripe_mod = do_div(stripe_no, stripe_length);
+
+	*unit_off = stripe_mod % stripe_unit;
+	*obj_offset = stripe_no * stripe_unit + *unit_off;
+	*dev = stripe_mod / stripe_unit * ios->layout->mirrors_p1;
+}
+
+static int _add_stripe_unit(struct exofs_io_state *ios,  unsigned *cur_bvec,
+		     struct exofs_per_dev_state *per_dev, int cur_len)
+{
+	unsigned bv = *cur_bvec;
+	struct request_queue *q =
+			osd_request_queue(exofs_ios_od(ios, per_dev->dev));
+
+	per_dev->length += cur_len;
+
+	if (per_dev->bio == NULL) {
+		unsigned pages_in_stripe = ios->layout->group_width *
+					(ios->layout->stripe_unit / PAGE_SIZE);
+		unsigned bio_size = (ios->bio->bi_vcnt + pages_in_stripe) /
+						ios->layout->group_width;
+
+		per_dev->bio = bio_kmalloc(GFP_KERNEL, bio_size);
+		if (unlikely(!per_dev->bio)) {
+			EXOFS_DBGMSG("Faild to allocate BIO size=%u\n",
+				     bio_size);
+			return -ENOMEM;
+		}
+	}
+
+	while (cur_len > 0) {
+		int added_len;
+		struct bio_vec *bvec = &ios->bio->bi_io_vec[bv];
+
+		BUG_ON(ios->bio->bi_vcnt <= bv);
+		cur_len -= bvec->bv_len;
+
+		added_len = bio_add_pc_page(q, per_dev->bio, bvec->bv_page,
+					    bvec->bv_len, bvec->bv_offset);
+		if (unlikely(bvec->bv_len != added_len))
+			return -ENOMEM;
+		++bv;
+	}
+	BUG_ON(cur_len);
+
+	*cur_bvec = bv;
+	return 0;
+}
+
+static int _prepare_for_striping(struct exofs_io_state *ios)
+{
+	u64 length = ios->length;
+	u64 offset = ios->offset;
+	unsigned stripe_unit = ios->layout->stripe_unit;
+	unsigned comp = 0;
+	unsigned stripes = 0;
+	unsigned cur_bvec = 0;
+	int ret;
+
+	if (!ios->bio) {
+		if (ios->kern_buff) {
+			struct exofs_per_dev_state *per_dev = &ios->per_dev[0];
+			unsigned unit_off;
+
+			_offset_dev_unit_off(ios, offset, &per_dev->offset,
+					     &per_dev->dev, &unit_off);
+			/* no cross device without page array */
+			BUG_ON((ios->layout->group_width > 1) &&
+			       (unit_off + length > stripe_unit));
+		}
+		ios->numdevs = ios->layout->mirrors_p1;
+		return 0;
+	}
+
+	while (length) {
+		struct exofs_per_dev_state *per_dev = &ios->per_dev[comp];
+		unsigned cur_len;
+
+		if (!per_dev->length) {
+			unsigned unit_off;
+
+			_offset_dev_unit_off(ios, offset, &per_dev->offset,
+					     &per_dev->dev, &unit_off);
+			stripes++;
+			cur_len = min_t(u64, stripe_unit - unit_off, length);
+			offset += cur_len;
+		} else {
+			cur_len = min_t(u64, stripe_unit, length);
+		}
+
+		ret = _add_stripe_unit(ios, &cur_bvec, per_dev, cur_len);
+		if (unlikely(ret))
+			goto out;
+
+		comp += ios->layout->mirrors_p1;
+		comp %= ios->layout->s_numdevs;
+
+		length -= cur_len;
+	}
+out:
+	ios->numdevs = stripes * ios->layout->mirrors_p1;
+	return ret;
+}
+
 int exofs_sbi_create(struct exofs_io_state *ios)
 {
 	int i, ret;
@@ -296,61 +428,71 @@ out:
 	return ret;
 }
 
-int exofs_sbi_write(struct exofs_io_state *ios)
+static int _sbi_write_mirror(struct exofs_io_state *ios, int cur_comp)
 {
-	int i, ret;
+	struct exofs_per_dev_state *master_dev = &ios->per_dev[cur_comp];
+	unsigned dev = ios->per_dev[cur_comp].dev;
+	unsigned last_comp = cur_comp + ios->layout->mirrors_p1;
+	int ret = 0;
 
-	for (i = 0; i < ios->layout->s_numdevs; i++) {
+	for (; cur_comp < last_comp; ++cur_comp, ++dev) {
+		struct exofs_per_dev_state *per_dev = &ios->per_dev[cur_comp];
 		struct osd_request *or;
 
-		or = osd_start_request(exofs_ios_od(ios, i), GFP_KERNEL);
+		or = osd_start_request(exofs_ios_od(ios, dev), GFP_KERNEL);
 		if (unlikely(!or)) {
 			EXOFS_ERR("%s: osd_start_request failed\n", __func__);
 			ret = -ENOMEM;
 			goto out;
 		}
-		ios->per_dev[i].or = or;
-		ios->numdevs++;
+		per_dev->or = or;
+		per_dev->offset = master_dev->offset;
 
 		if (ios->bio) {
 			struct bio *bio;
 
-			if (i != 0) {
+			if (per_dev != master_dev) {
 				bio = bio_kmalloc(GFP_KERNEL,
-						  ios->bio->bi_max_vecs);
+						  master_dev->bio->bi_max_vecs);
 				if (unlikely(!bio)) {
 					EXOFS_DBGMSG(
 					      "Faild to allocate BIO size=%u\n",
-					      ios->bio->bi_max_vecs);
+					      master_dev->bio->bi_max_vecs);
 					ret = -ENOMEM;
 					goto out;
 				}
 
-				__bio_clone(bio, ios->bio);
+				__bio_clone(bio, master_dev->bio);
 				bio->bi_bdev = NULL;
 				bio->bi_next = NULL;
-				ios->per_dev[i].bio =  bio;
+				per_dev->length = master_dev->length;
+				per_dev->bio =  bio;
+				per_dev->dev = dev;
 			} else {
-				bio = ios->bio;
+				bio = master_dev->bio;
+				/* FIXME: bio_set_dir() */
+				bio->bi_rw |= (1 << BIO_RW);
 			}
 
-			osd_req_write(or, &ios->obj, ios->offset, bio,
-				      ios->length);
+			osd_req_write(or, &ios->obj, per_dev->offset, bio,
+				      per_dev->length);
 			EXOFS_DBGMSG("write(0x%llx) offset=0x%llx "
 				      "length=0x%llx dev=%d\n",
-				     _LLU(ios->obj.id), _LLU(ios->offset),
-				     _LLU(ios->length), i);
+				     _LLU(ios->obj.id), _LLU(per_dev->offset),
+				     _LLU(per_dev->length), dev);
 		} else if (ios->kern_buff) {
-			osd_req_write_kern(or, &ios->obj, ios->offset,
+			ret = osd_req_write_kern(or, &ios->obj, per_dev->offset,
 					   ios->kern_buff, ios->length);
+			if (unlikely(ret))
+				goto out;
 			EXOFS_DBGMSG2("write_kern(0x%llx) offset=0x%llx "
 				      "length=0x%llx dev=%d\n",
-				     _LLU(ios->obj.id), _LLU(ios->offset),
-				     _LLU(ios->length), i);
+				     _LLU(ios->obj.id), _LLU(per_dev->offset),
+				     _LLU(ios->length), dev);
 		} else {
 			osd_req_set_attributes(or, &ios->obj);
 			EXOFS_DBGMSG2("obj(0x%llx) set_attributes=%d dev=%d\n",
-				     _LLU(ios->obj.id), ios->out_attr_len, i);
+				     _LLU(ios->obj.id), ios->out_attr_len, dev);
 		}
 
 		if (ios->out_attr)
@@ -361,40 +503,57 @@ int exofs_sbi_write(struct exofs_io_state *ios)
 			osd_req_add_get_attr_list(or, ios->in_attr,
 						  ios->in_attr_len);
 	}
-	ret = exofs_io_execute(ios);
 
 out:
 	return ret;
 }
 
-int exofs_sbi_read(struct exofs_io_state *ios)
+int exofs_sbi_write(struct exofs_io_state *ios)
+{
+	int i;
+	int ret;
+
+	ret = _prepare_for_striping(ios);
+	if (unlikely(ret))
+		return ret;
+
+	for (i = 0; i < ios->numdevs; i += ios->layout->mirrors_p1) {
+		ret = _sbi_write_mirror(ios, i);
+		if (unlikely(ret))
+			return ret;
+	}
+
+	ret = exofs_io_execute(ios);
+	return ret;
+}
+
+static int _sbi_read_mirror(struct exofs_io_state *ios, unsigned cur_comp)
 {
 	struct osd_request *or;
-	struct exofs_per_dev_state *per_dev = &ios->per_dev[0];
+	struct exofs_per_dev_state *per_dev = &ios->per_dev[cur_comp];
 	unsigned first_dev = (unsigned)ios->obj.id;
 
-	first_dev %= ios->layout->s_numdevs;
+	first_dev = per_dev->dev + first_dev % ios->layout->mirrors_p1;
 	or = osd_start_request(exofs_ios_od(ios, first_dev), GFP_KERNEL);
 	if (unlikely(!or)) {
 		EXOFS_ERR("%s: osd_start_request failed\n", __func__);
 		return -ENOMEM;
 	}
 	per_dev->or = or;
-	ios->numdevs++;
 
 	if (ios->bio) {
-		osd_req_read(or, &ios->obj, ios->offset, ios->bio, ios->length);
+		osd_req_read(or, &ios->obj, per_dev->offset,
+				per_dev->bio, per_dev->length);
 		EXOFS_DBGMSG("read(0x%llx) offset=0x%llx length=0x%llx"
 			     " dev=%d\n", _LLU(ios->obj.id),
-			     _LLU(ios->offset), _LLU(ios->length),
+			     _LLU(per_dev->offset), _LLU(per_dev->length),
 			     first_dev);
 	} else if (ios->kern_buff) {
-		int ret = osd_req_read_kern(or, &ios->obj, ios->offset,
+		int ret = osd_req_read_kern(or, &ios->obj, per_dev->offset,
 					    ios->kern_buff, ios->length);
-
 		EXOFS_DBGMSG2("read_kern(0x%llx) offset=0x%llx "
 			      "length=0x%llx dev=%d ret=>%d\n",
-			      _LLU(ios->obj.id), _LLU(ios->offset),
+			      _LLU(ios->obj.id), _LLU(per_dev->offset),
 			      _LLU(ios->length), first_dev, ret);
 		if (unlikely(ret))
 			return ret;
@@ -403,14 +562,32 @@ int exofs_sbi_read(struct exofs_io_state *ios)
 		EXOFS_DBGMSG2("obj(0x%llx) get_attributes=%d dev=%d\n",
 			      _LLU(ios->obj.id), ios->in_attr_len, first_dev);
 	}
-
 	if (ios->out_attr)
 		osd_req_add_set_attr_list(or, ios->out_attr, ios->out_attr_len);
 
 	if (ios->in_attr)
 		osd_req_add_get_attr_list(or, ios->in_attr, ios->in_attr_len);
 
-	return exofs_io_execute(ios);
+	return 0;
+}
+
+int exofs_sbi_read(struct exofs_io_state *ios)
+{
+	int i;
+	int ret;
+
+	ret = _prepare_for_striping(ios);
+	if (unlikely(ret))
+		return ret;
+
+	for (i = 0; i < ios->numdevs; i += ios->layout->mirrors_p1) {
+		ret = _sbi_read_mirror(ios, i);
+		if (unlikely(ret))
+			return ret;
+	}
+
+	ret = exofs_io_execute(ios);
+	return ret;
 }
 
 int extract_attr_from_ios(struct exofs_io_state *ios, struct osd_attr *attr)
@@ -434,42 +611,84 @@ int extract_attr_from_ios(struct exofs_io_state *ios, struct osd_attr *attr)
 	return -EIO;
 }
 
+static int _truncate_mirrors(struct exofs_io_state *ios, unsigned cur_comp,
+			     struct osd_attr *attr)
+{
+	int last_comp = cur_comp + ios->layout->mirrors_p1;
+
+	for (; cur_comp < last_comp; ++cur_comp) {
+		struct exofs_per_dev_state *per_dev = &ios->per_dev[cur_comp];
+		struct osd_request *or;
+
+		or = osd_start_request(exofs_ios_od(ios, cur_comp), GFP_KERNEL);
+		if (unlikely(!or)) {
+			EXOFS_ERR("%s: osd_start_request failed\n", __func__);
+			return -ENOMEM;
+		}
+		per_dev->or = or;
+
+		osd_req_set_attributes(or, &ios->obj);
+		osd_req_add_set_attr_list(or, attr, 1);
+	}
+
+	return 0;
+}
+
 int exofs_oi_truncate(struct exofs_i_info *oi, u64 size)
 {
 	struct exofs_sb_info *sbi = oi->vfs_inode.i_sb->s_fs_info;
 	struct exofs_io_state *ios;
-	struct osd_attr attr;
-	__be64 newsize;
+	struct exofs_trunc_attr {
+		struct osd_attr attr;
+		__be64 newsize;
+	} *size_attrs;
+	u64 this_obj_size;
+	unsigned dev;
+	unsigned unit_off;
 	int i, ret;
 
-	if (exofs_get_io_state(&sbi->layout, &ios))
-		return -ENOMEM;
+	ret = exofs_get_io_state(&sbi->layout, &ios);
+	if (unlikely(ret))
+		return ret;
+
+	size_attrs = kcalloc(ios->layout->group_width, sizeof(*size_attrs),
+			     GFP_KERNEL);
+	if (unlikely(!size_attrs)) {
+		ret = -ENOMEM;
+		goto out;
+	}
 
 	ios->obj.id = exofs_oi_objno(oi);
 	ios->cred = oi->i_cred;
 
-	newsize = cpu_to_be64(size);
-	attr = g_attr_logical_length;
-	attr.val_ptr = &newsize;
+	ios->numdevs = ios->layout->s_numdevs;
+	_offset_dev_unit_off(ios, size, &this_obj_size, &dev, &unit_off);
 
-	for (i = 0; i < sbi->layout.s_numdevs; i++) {
-		struct osd_request *or;
+	for (i = 0; i < ios->layout->group_width; ++i) {
+		struct exofs_trunc_attr *size_attr = &size_attrs[i];
+		u64 obj_size;
 
-		or = osd_start_request(exofs_ios_od(ios, i), GFP_KERNEL);
-		if (unlikely(!or)) {
-			EXOFS_ERR("%s: osd_start_request failed\n", __func__);
-			ret = -ENOMEM;
-			goto out;
-		}
-		ios->per_dev[i].or = or;
-		ios->numdevs++;
+		if (i < dev)
+			obj_size = this_obj_size +
+					ios->layout->stripe_unit - unit_off;
+		else if (i == dev)
+			obj_size = this_obj_size;
+		else /* i > dev */
+			obj_size = this_obj_size - unit_off;
 
-		osd_req_set_attributes(or, &ios->obj);
-		osd_req_add_set_attr_list(or, &attr, 1);
+		size_attr->newsize = cpu_to_be64(obj_size);
+		size_attr->attr = g_attr_logical_length;
+		size_attr->attr.val_ptr = &size_attr->newsize;
+
+		ret = _truncate_mirrors(ios, i * ios->layout->mirrors_p1,
+					&size_attr->attr);
+		if (unlikely(ret))
+			goto out;
 	}
 	ret = exofs_io_execute(ios);
 
 out:
+	kfree(size_attrs);
 	exofs_put_io_state(ios);
 	return ret;
 }
diff --git a/fs/exofs/super.c b/fs/exofs/super.c
index fc88751..8f4e4b3 100644
--- a/fs/exofs/super.c
+++ b/fs/exofs/super.c
@@ -308,6 +308,8 @@ static void exofs_put_super(struct super_block *sb)
 static int _read_and_match_data_map(struct exofs_sb_info *sbi, unsigned numdevs,
 				    struct exofs_device_table *dt)
 {
+	u64 stripe_length;
+
 	sbi->data_map.odm_num_comps   =
 				le32_to_cpu(dt->dt_data_map.cb_num_comps);
 	sbi->data_map.odm_stripe_unit =
@@ -321,14 +323,47 @@ static int _read_and_match_data_map(struct exofs_sb_info *sbi, unsigned numdevs,
 	sbi->data_map.odm_raid_algorithm  =
 				le32_to_cpu(dt->dt_data_map.cb_raid_algorithm);
 
-/* FIXME: Hard coded mirror only for now. if not so do not mount */
-	if ((sbi->data_map.odm_num_comps != numdevs) ||
-	    (sbi->data_map.odm_stripe_unit != EXOFS_BLKSIZE) ||
-	    (sbi->data_map.odm_raid_algorithm != PNFS_OSD_RAID_0) ||
-	    (sbi->data_map.odm_mirror_cnt != (numdevs - 1)))
+/* FIXME: Only raid0 !group_width/depth for now. if not so, do not mount */
+	if (sbi->data_map.odm_group_width || sbi->data_map.odm_group_depth) {
+		EXOFS_ERR("Group width/depth not supported\n");
 		return -EINVAL;
-	else
-		return 0;
+	}
+	if (sbi->data_map.odm_num_comps != numdevs) {
+		EXOFS_ERR("odm_num_comps(%u) != numdevs(%u)\n",
+			  sbi->data_map.odm_num_comps, numdevs);
+		return -EINVAL;
+	}
+	if (sbi->data_map.odm_raid_algorithm != PNFS_OSD_RAID_0) {
+		EXOFS_ERR("Only RAID_0 for now\n");
+		return -EINVAL;
+	}
+	if (0 != (numdevs % (sbi->data_map.odm_mirror_cnt + 1))) {
+		EXOFS_ERR("Data Map wrong, numdevs=%d mirrors=%d\n",
+			  numdevs, sbi->data_map.odm_mirror_cnt);
+		return -EINVAL;
+	}
+
+	stripe_length = sbi->data_map.odm_stripe_unit *
+			(numdevs / (sbi->data_map.odm_mirror_cnt + 1));
+	if (stripe_length >= (1ULL << 32)) {
+		EXOFS_ERR("Total Stripe length(0x%llx)"
+			  " >= 32bit is not supported\n", _LLU(stripe_length));
+		return -EINVAL;
+	}
+
+	if (0 != (sbi->data_map.odm_stripe_unit & ~PAGE_MASK)) {
+		EXOFS_ERR("Stripe Unit(0x%llx)"
+			  " must be Multples of PAGE_SIZE(0x%lx)\n",
+			  _LLU(sbi->data_map.odm_stripe_unit), PAGE_SIZE);
+		return -EINVAL;
+	}
+
+	sbi->layout.stripe_unit = sbi->data_map.odm_stripe_unit;
+	sbi->layout.mirrors_p1 = sbi->data_map.odm_mirror_cnt + 1;
+	sbi->layout.group_width = sbi->data_map.odm_num_comps /
+							sbi->layout.mirrors_p1;
+
+	return 0;
 }
 
 /* @odi is valid only as long as @fscb_dev is valid */
@@ -502,6 +537,9 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent)
 	}
 
 	/* Default layout in case we do not have a device-table */
+	sbi->layout.stripe_unit = PAGE_SIZE;
+	sbi->layout.mirrors_p1 = 1;
+	sbi->layout.group_width = 1;
 	sbi->layout.s_ods[0] = od;
 	sbi->layout.s_numdevs = 1;
 	sbi->layout.s_pid = opts->pid;
-- 
1.6.6


^ permalink raw reply related	[flat|nested] 9+ messages in thread

* [PATCH 8/8] exofs: convert io_state to use pages array instead of bio at input
  2010-02-01 12:32 [PATCHSET 0/8] exofs RAID0 support Boaz Harrosh
                   ` (6 preceding siblings ...)
  2010-02-01 14:58 ` [PATCH 7/8] exofs: RAID0 support Boaz Harrosh
@ 2010-02-01 14:58 ` Boaz Harrosh
  7 siblings, 0 replies; 9+ messages in thread
From: Boaz Harrosh @ 2010-02-01 14:58 UTC (permalink / raw)
  To: open-osd, pNFS Mailing List, linux-fsdevel

* inode.c operations are full-pages based, and not actually
  true scatter-gather
* Lets us use more pages at once upto 512 (from 249) in 64 bit
* Brings us much much closer to be able to use exofs's io_state engine
  from objlayout driver. (Once I decide where to put the common code)

After RAID0 patch the outer (input) bio was never used as a bio, but
was simply a page carrier into the raid engine. Even in the simple
mirror/single-dev arrangement pages info was copied into a second bio.
It is now easer to just pass a pages array into the io_state and prepare
bio(s) once.

Signed-off-by: Boaz Harrosh <bharrosh@panasas.com>
---
 fs/exofs/exofs.h |    5 ++-
 fs/exofs/inode.c |   81 +++++++++++++++++++++++++++--------------------------
 fs/exofs/ios.c   |   46 +++++++++++++++++-------------
 3 files changed, 71 insertions(+), 61 deletions(-)

diff --git a/fs/exofs/exofs.h b/fs/exofs/exofs.h
index 5828bde..9995480 100644
--- a/fs/exofs/exofs.h
+++ b/fs/exofs/exofs.h
@@ -131,7 +131,10 @@ struct exofs_io_state {
 	loff_t			offset;
 	unsigned long		length;
 	void			*kern_buff;
-	struct bio		*bio;
+
+	struct page		**pages;
+	unsigned		nr_pages;
+	unsigned		pgbase;
 
 	/* Attributes */
 	unsigned		in_attr_len;
diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c
index 2b3163e..6ca0b01 100644
--- a/fs/exofs/inode.c
+++ b/fs/exofs/inode.c
@@ -41,16 +41,18 @@
 
 enum { BIO_MAX_PAGES_KMALLOC =
 		(PAGE_SIZE - sizeof(struct bio)) / sizeof(struct bio_vec),
+	MAX_PAGES_KMALLOC =
+		PAGE_SIZE / sizeof(struct page *),
 };
 
 struct page_collect {
 	struct exofs_sb_info *sbi;
-	struct request_queue *req_q;
 	struct inode *inode;
 	unsigned expected_pages;
 	struct exofs_io_state *ios;
 
-	struct bio *bio;
+	struct page **pages;
+	unsigned alloc_pages;
 	unsigned nr_pages;
 	unsigned long length;
 	loff_t pg_first; /* keep 64bit also in 32-arches */
@@ -62,15 +64,12 @@ static void _pcol_init(struct page_collect *pcol, unsigned expected_pages,
 	struct exofs_sb_info *sbi = inode->i_sb->s_fs_info;
 
 	pcol->sbi = sbi;
-	/* Create master bios on first Q, later on cloning, each clone will be
-	 * allocated on it's destination Q
-	 */
-	pcol->req_q = osd_request_queue(sbi->layout.s_ods[0]);
 	pcol->inode = inode;
 	pcol->expected_pages = expected_pages;
 
 	pcol->ios = NULL;
-	pcol->bio = NULL;
+	pcol->pages = NULL;
+	pcol->alloc_pages = 0;
 	pcol->nr_pages = 0;
 	pcol->length = 0;
 	pcol->pg_first = -1;
@@ -80,7 +79,8 @@ static void _pcol_reset(struct page_collect *pcol)
 {
 	pcol->expected_pages -= min(pcol->nr_pages, pcol->expected_pages);
 
-	pcol->bio = NULL;
+	pcol->pages = NULL;
+	pcol->alloc_pages = 0;
 	pcol->nr_pages = 0;
 	pcol->length = 0;
 	pcol->pg_first = -1;
@@ -90,13 +90,13 @@ static void _pcol_reset(struct page_collect *pcol)
 	 * it might not end here. don't be left with nothing
 	 */
 	if (!pcol->expected_pages)
-		pcol->expected_pages = BIO_MAX_PAGES_KMALLOC;
+		pcol->expected_pages = MAX_PAGES_KMALLOC;
 }
 
 static int pcol_try_alloc(struct page_collect *pcol)
 {
-	int pages = min_t(unsigned, pcol->expected_pages,
-			  BIO_MAX_PAGES_KMALLOC);
+	unsigned pages = min_t(unsigned, pcol->expected_pages,
+			  MAX_PAGES_KMALLOC);
 
 	if (!pcol->ios) { /* First time allocate io_state */
 		int ret = exofs_get_io_state(&pcol->sbi->layout, &pcol->ios);
@@ -105,23 +105,28 @@ static int pcol_try_alloc(struct page_collect *pcol)
 			return ret;
 	}
 
+	/* TODO: easily support bio chaining */
+	pages =  min_t(unsigned, pages,
+		       pcol->sbi->layout.group_width * BIO_MAX_PAGES_KMALLOC);
+
 	for (; pages; pages >>= 1) {
-		pcol->bio = bio_kmalloc(GFP_KERNEL, pages);
-		if (likely(pcol->bio))
+		pcol->pages = kmalloc(pages * sizeof(struct page *),
+				      GFP_KERNEL);
+		if (likely(pcol->pages)) {
+			pcol->alloc_pages = pages;
 			return 0;
+		}
 	}
 
-	EXOFS_ERR("Failed to bio_kmalloc expected_pages=%u\n",
+	EXOFS_ERR("Failed to kmalloc expected_pages=%u\n",
 		  pcol->expected_pages);
 	return -ENOMEM;
 }
 
 static void pcol_free(struct page_collect *pcol)
 {
-	if (pcol->bio) {
-		bio_put(pcol->bio);
-		pcol->bio = NULL;
-	}
+	kfree(pcol->pages);
+	pcol->pages = NULL;
 
 	if (pcol->ios) {
 		exofs_put_io_state(pcol->ios);
@@ -132,11 +137,10 @@ static void pcol_free(struct page_collect *pcol)
 static int pcol_add_page(struct page_collect *pcol, struct page *page,
 			 unsigned len)
 {
-	int added_len = bio_add_pc_page(pcol->req_q, pcol->bio, page, len, 0);
-	if (unlikely(len != added_len))
+	if (unlikely(pcol->nr_pages >= pcol->alloc_pages))
 		return -ENOMEM;
 
-	++pcol->nr_pages;
+	pcol->pages[pcol->nr_pages++] = page;
 	pcol->length += len;
 	return 0;
 }
@@ -181,7 +185,6 @@ static void update_write_page(struct page *page, int ret)
  */
 static int __readpages_done(struct page_collect *pcol, bool do_unlock)
 {
-	struct bio_vec *bvec;
 	int i;
 	u64 resid;
 	u64 good_bytes;
@@ -198,8 +201,8 @@ static int __readpages_done(struct page_collect *pcol, bool do_unlock)
 		     pcol->inode->i_ino, _LLU(good_bytes), pcol->length,
 		     pcol->nr_pages);
 
-	__bio_for_each_segment(bvec, pcol->bio, i, 0) {
-		struct page *page = bvec->bv_page;
+	for (i = 0; i < pcol->nr_pages; i++) {
+		struct page *page = pcol->pages[i];
 		struct inode *inode = page->mapping->host;
 		int page_stat;
 
@@ -218,7 +221,7 @@ static int __readpages_done(struct page_collect *pcol, bool do_unlock)
 		ret = update_read_page(page, page_stat);
 		if (do_unlock)
 			unlock_page(page);
-		length += bvec->bv_len;
+		length += PAGE_SIZE;
 	}
 
 	pcol_free(pcol);
@@ -238,11 +241,10 @@ static void readpages_done(struct exofs_io_state *ios, void *p)
 
 static void _unlock_pcol_pages(struct page_collect *pcol, int ret, int rw)
 {
-	struct bio_vec *bvec;
 	int i;
 
-	__bio_for_each_segment(bvec, pcol->bio, i, 0) {
-		struct page *page = bvec->bv_page;
+	for (i = 0; i < pcol->nr_pages; i++) {
+		struct page *page = pcol->pages[i];
 
 		if (rw == READ)
 			update_read_page(page, ret);
@@ -260,13 +262,14 @@ static int read_exec(struct page_collect *pcol, bool is_sync)
 	struct page_collect *pcol_copy = NULL;
 	int ret;
 
-	if (!pcol->bio)
+	if (!pcol->pages)
 		return 0;
 
 	/* see comment in _readpage() about sync reads */
 	WARN_ON(is_sync && (pcol->nr_pages != 1));
 
-	ios->bio = pcol->bio;
+	ios->pages = pcol->pages;
+	ios->nr_pages = pcol->nr_pages;
 	ios->length = pcol->length;
 	ios->offset = pcol->pg_first << PAGE_CACHE_SHIFT;
 
@@ -366,7 +369,7 @@ try_again:
 		goto try_again;
 	}
 
-	if (!pcol->bio) {
+	if (!pcol->pages) {
 		ret = pcol_try_alloc(pcol);
 		if (unlikely(ret))
 			goto fail;
@@ -448,7 +451,6 @@ static int exofs_readpage(struct file *file, struct page *page)
 static void writepages_done(struct exofs_io_state *ios, void *p)
 {
 	struct page_collect *pcol = p;
-	struct bio_vec *bvec;
 	int i;
 	u64 resid;
 	u64  good_bytes;
@@ -467,8 +469,8 @@ static void writepages_done(struct exofs_io_state *ios, void *p)
 		     pcol->inode->i_ino, _LLU(good_bytes), pcol->length,
 		     pcol->nr_pages);
 
-	__bio_for_each_segment(bvec, pcol->bio, i, 0) {
-		struct page *page = bvec->bv_page;
+	for (i = 0; i < pcol->nr_pages; i++) {
+		struct page *page = pcol->pages[i];
 		struct inode *inode = page->mapping->host;
 		int page_stat;
 
@@ -485,7 +487,7 @@ static void writepages_done(struct exofs_io_state *ios, void *p)
 		EXOFS_DBGMSG2("    writepages_done(0x%lx, 0x%lx) status=%d\n",
 			     inode->i_ino, page->index, page_stat);
 
-		length += bvec->bv_len;
+		length += PAGE_SIZE;
 	}
 
 	pcol_free(pcol);
@@ -500,7 +502,7 @@ static int write_exec(struct page_collect *pcol)
 	struct page_collect *pcol_copy = NULL;
 	int ret;
 
-	if (!pcol->bio)
+	if (!pcol->pages)
 		return 0;
 
 	pcol_copy = kmalloc(sizeof(*pcol_copy), GFP_KERNEL);
@@ -512,9 +514,8 @@ static int write_exec(struct page_collect *pcol)
 
 	*pcol_copy = *pcol;
 
-	pcol_copy->bio->bi_rw |= (1 << BIO_RW); /* FIXME: bio_set_dir() */
-
-	ios->bio = pcol_copy->bio;
+	ios->pages = pcol_copy->pages;
+	ios->nr_pages = pcol_copy->nr_pages;
 	ios->offset = pcol_copy->pg_first << PAGE_CACHE_SHIFT;
 	ios->length = pcol_copy->length;
 	ios->done = writepages_done;
@@ -605,7 +606,7 @@ try_again:
 		goto try_again;
 	}
 
-	if (!pcol->bio) {
+	if (!pcol->pages) {
 		ret = pcol_try_alloc(pcol);
 		if (unlikely(ret))
 			goto fail;
diff --git a/fs/exofs/ios.c b/fs/exofs/ios.c
index 6e446b2..263052c 100644
--- a/fs/exofs/ios.c
+++ b/fs/exofs/ios.c
@@ -283,10 +283,11 @@ static void _offset_dev_unit_off(struct exofs_io_state *ios, u64 file_offset,
 	*dev = stripe_mod / stripe_unit * ios->layout->mirrors_p1;
 }
 
-static int _add_stripe_unit(struct exofs_io_state *ios,  unsigned *cur_bvec,
-		     struct exofs_per_dev_state *per_dev, int cur_len)
+static int _add_stripe_unit(struct exofs_io_state *ios,  unsigned *cur_pg,
+		unsigned pgbase, struct exofs_per_dev_state *per_dev,
+		int cur_len)
 {
-	unsigned bv = *cur_bvec;
+	unsigned pg = *cur_pg;
 	struct request_queue *q =
 			osd_request_queue(exofs_ios_od(ios, per_dev->dev));
 
@@ -295,7 +296,7 @@ static int _add_stripe_unit(struct exofs_io_state *ios,  unsigned *cur_bvec,
 	if (per_dev->bio == NULL) {
 		unsigned pages_in_stripe = ios->layout->group_width *
 					(ios->layout->stripe_unit / PAGE_SIZE);
-		unsigned bio_size = (ios->bio->bi_vcnt + pages_in_stripe) /
+		unsigned bio_size = (ios->nr_pages + pages_in_stripe) /
 						ios->layout->group_width;
 
 		per_dev->bio = bio_kmalloc(GFP_KERNEL, bio_size);
@@ -307,21 +308,22 @@ static int _add_stripe_unit(struct exofs_io_state *ios,  unsigned *cur_bvec,
 	}
 
 	while (cur_len > 0) {
-		int added_len;
-		struct bio_vec *bvec = &ios->bio->bi_io_vec[bv];
+		unsigned pglen = min_t(unsigned, PAGE_SIZE - pgbase, cur_len);
+		unsigned added_len;
 
-		BUG_ON(ios->bio->bi_vcnt <= bv);
-		cur_len -= bvec->bv_len;
+		BUG_ON(ios->nr_pages <= pg);
+		cur_len -= pglen;
 
-		added_len = bio_add_pc_page(q, per_dev->bio, bvec->bv_page,
-					    bvec->bv_len, bvec->bv_offset);
-		if (unlikely(bvec->bv_len != added_len))
+		added_len = bio_add_pc_page(q, per_dev->bio, ios->pages[pg],
+					    pglen, pgbase);
+		if (unlikely(pglen != added_len))
 			return -ENOMEM;
-		++bv;
+		pgbase = 0;
+		++pg;
 	}
 	BUG_ON(cur_len);
 
-	*cur_bvec = bv;
+	*cur_pg = pg;
 	return 0;
 }
 
@@ -332,10 +334,10 @@ static int _prepare_for_striping(struct exofs_io_state *ios)
 	unsigned stripe_unit = ios->layout->stripe_unit;
 	unsigned comp = 0;
 	unsigned stripes = 0;
-	unsigned cur_bvec = 0;
-	int ret;
+	unsigned cur_pg = 0;
+	int ret = 0;
 
-	if (!ios->bio) {
+	if (!ios->pages) {
 		if (ios->kern_buff) {
 			struct exofs_per_dev_state *per_dev = &ios->per_dev[0];
 			unsigned unit_off;
@@ -352,7 +354,7 @@ static int _prepare_for_striping(struct exofs_io_state *ios)
 
 	while (length) {
 		struct exofs_per_dev_state *per_dev = &ios->per_dev[comp];
-		unsigned cur_len;
+		unsigned cur_len, page_off;
 
 		if (!per_dev->length) {
 			unsigned unit_off;
@@ -362,11 +364,15 @@ static int _prepare_for_striping(struct exofs_io_state *ios)
 			stripes++;
 			cur_len = min_t(u64, stripe_unit - unit_off, length);
 			offset += cur_len;
+			page_off = unit_off & ~PAGE_MASK;
+			BUG_ON(page_off != ios->pgbase);
 		} else {
 			cur_len = min_t(u64, stripe_unit, length);
+			page_off = 0;
 		}
 
-		ret = _add_stripe_unit(ios, &cur_bvec, per_dev, cur_len);
+		ret = _add_stripe_unit(ios, &cur_pg, page_off , per_dev,
+				       cur_len);
 		if (unlikely(ret))
 			goto out;
 
@@ -448,7 +454,7 @@ static int _sbi_write_mirror(struct exofs_io_state *ios, int cur_comp)
 		per_dev->or = or;
 		per_dev->offset = master_dev->offset;
 
-		if (ios->bio) {
+		if (ios->pages) {
 			struct bio *bio;
 
 			if (per_dev != master_dev) {
@@ -541,7 +547,7 @@ static int _sbi_read_mirror(struct exofs_io_state *ios, unsigned cur_comp)
 	}
 	per_dev->or = or;
 
-	if (ios->bio) {
+	if (ios->pages) {
 		osd_req_read(or, &ios->obj, per_dev->offset,
 				per_dev->bio, per_dev->length);
 		EXOFS_DBGMSG("read(0x%llx) offset=0x%llx length=0x%llx"
-- 
1.6.6


^ permalink raw reply related	[flat|nested] 9+ messages in thread

end of thread, other threads:[~2010-02-01 14:58 UTC | newest]

Thread overview: 9+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2010-02-01 12:32 [PATCHSET 0/8] exofs RAID0 support Boaz Harrosh
2010-02-01 14:56 ` [PATCH 1/8] exofs: debug print even less Boaz Harrosh
2010-02-01 14:57 ` [PATCH 2/8] exofs: Micro-optimize exofs_i_info Boaz Harrosh
2010-02-01 14:57 ` [PATCH 3/8] exofs: Recover in the case of read-passed-end-of-file Boaz Harrosh
2010-02-01 14:57 ` [PATCH 4/8] exofs: Move layout related members to a layout structure Boaz Harrosh
2010-02-01 14:57 ` [PATCH 5/8] exofs: unindent exofs_sbi_read Boaz Harrosh
2010-02-01 14:58 ` [PATCH 6/8] exofs: Define on-disk per-inode optional layout attribute Boaz Harrosh
2010-02-01 14:58 ` [PATCH 7/8] exofs: RAID0 support Boaz Harrosh
2010-02-01 14:58 ` [PATCH 8/8] exofs: convert io_state to use pages array instead of bio at input Boaz Harrosh

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.