Re: Block quirks redux + Toshiba performance quirk

linux-mmc.vger.kernel.org archive mirror
 help / color / mirror / Atom feed

* Re: Block quirks redux + Toshiba performance quirk
  2011-03-10  0:54 Block quirks redux + Toshiba performance quirk Andrei Warkentin
@ 2011-03-10  0:12 ` Andrei Warkentin
  2011-03-10  0:54 ` [RFC 1/5] MMC: Extends card quicks with MMC/SD quirks matching the CID Andrei Warkentin
                   ` (5 subsequent siblings)
  6 siblings, 0 replies; 28+ messages in thread
From: Andrei Warkentin @ 2011-03-10  0:12 UTC (permalink / raw)
  To: linux-mmc; +Cc: tardyp, Linus Walleij, Arnd Bergmann

On Wed, Mar 9, 2011 at 6:54 PM, Andrei Warkentin <andreiw@motorola.com> wrote:
> I'm sending this as a new mail instead of a reply-to, because it's really a
> different set of patches. I'm holding off on the reliability improvement, because
> the generic-code equiv. depends on reliable write support, so I'll wait until everybody
> is satisfied with these changes until sending those in.
>
> This is based on linux-next tree with Pierre's quirks.c.
>
> I've tested it on K36 with backported Pierre's changes.
>
> Thanks,
> A
>
>

Oops, forgot to add CCs... sorry about that.

A

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Block quirks redux + Toshiba performance quirk
@ 2011-03-10  0:54 Andrei Warkentin
  2011-03-10  0:12 ` Andrei Warkentin
                   ` (6 more replies)
  0 siblings, 7 replies; 28+ messages in thread
From: Andrei Warkentin @ 2011-03-10  0:54 UTC (permalink / raw)
  To: linux-mmc

I'm sending this as a new mail instead of a reply-to, because it's really a
different set of patches. I'm holding off on the reliability improvement, because
the generic-code equivalent depends on reliable write support, so I'll wait until
everybody is satisfied with these changes before sending those in.

This is based on linux-next tree with Pierre's quirks.c.

I've tested it on K36 with backported Pierre's changes.

Thanks,
A

^ permalink raw reply	[flat|nested] 28+ messages in thread

* [RFC 1/5] MMC: Extends card quicks with MMC/SD quirks matching the CID.
  2011-03-10  0:54 Block quirks redux + Toshiba performance quirk Andrei Warkentin
  2011-03-10  0:12 ` Andrei Warkentin
@ 2011-03-10  0:54 ` Andrei Warkentin
  2011-03-10  0:54 ` [RFC 2/5] MMC: Allow function-specific quirks Andrei Warkentin
                   ` (4 subsequent siblings)
  6 siblings, 0 replies; 28+ messages in thread
From: Andrei Warkentin @ 2011-03-10  0:54 UTC (permalink / raw)
  To: linux-mmc; +Cc: Andrei Warkentin

The current mechanism is SDIO-only.

Signed-off-by: Andrei Warkentin <andreiw@motorola.com>
---
 drivers/mmc/core/quirks.c |  130 ++++++++++++++++++++++++++++++++++-----------
 1 files changed, 98 insertions(+), 32 deletions(-)

diff --git a/drivers/mmc/core/quirks.c b/drivers/mmc/core/quirks.c
index 4fb16ac..f795b8d 100644
--- a/drivers/mmc/core/quirks.c
+++ b/drivers/mmc/core/quirks.c
@@ -13,41 +13,95 @@
 #include <linux/mmc/card.h>
 #include <linux/mod_devicetable.h>
 
+#define cid_rev(hwrev, fwrev, year, month)	\
+	(((u64) hwrev) << 40 |                  \
+	 ((u64) fwrev) << 32 |                  \
+	 ((u64) year) << 16 |                   \
+	 ((u64) month))
+
+#define cid_rev_card(card)		  \
+	cid_rev(card->cid.hwrev,	  \
+		    card->cid.fwrev,      \
+		    card->cid.year,	  \
+		    card->cid.month)
+
+
+
 /*
  *  The world is not perfect and supplies us with broken mmc/sdio devices.
  *  For at least a part of these bugs we need a work-around
  */
 
 struct mmc_fixup {
-	u16 vendor, device;	/* You can use SDIO_ANY_ID here of course */
+
+	/* CID-specific fields. */
+	const char *name;
+
+	/* Valid revision range */
+	u64 rev_start, rev_end;
+
+	unsigned int manfid;
+	unsigned short oemid;
+
+       /* SDIO-specfic fields. You can use SDIO_ANY_ID here of course */
+	u16 cis_vendor, cis_device;
+
 	void (*vendor_fixup)(struct mmc_card *card, int data);
 	int data;
 };
 
-/*
- * This hook just adds a quirk unconditionnally
- */
-static void __maybe_unused add_quirk(struct mmc_card *card, int data)
-{
-	card->quirks |= data;
-}
+#define CID_MANFID_ANY (-1ul)
+#define CID_OEMID_ANY ((unsigned short) -1)
+#define CID_NAME_ANY (NULL)
 
-/*
- * This hook just removes a quirk unconditionnally
- */
-static void __maybe_unused remove_quirk(struct mmc_card *card, int data)
-{
-	card->quirks &= ~data;
-}
+#define END_FIXUP { 0 }
 
-/*
- * This hook just adds a quirk for all sdio devices
- */
-static void add_quirk_for_sdio_devices(struct mmc_card *card, int data)
-{
-	if (mmc_card_sdio(card))
-		card->quirks |= data;
-}
+#define _FIXUP_EXT(_name, _manfid, _oemid, _rev_start, _rev_end,	\
+		   _cis_vendor, _cis_device,				\
+		   _fixup, _data)					\
+	{						   \
+		.name = (_name),			   \
+		.manfid = (_manfid),			   \
+		.oemid = (_oemid),			   \
+		.rev_start = (_rev_start),		   \
+		.rev_end = (_rev_end),			   \
+		.cis_vendor = (_cis_vendor),		   \
+		.cis_device = (_cis_device),		   \
+		.vendor_fixup = (_fixup),		   \
+		.data = (_data),			   \
+	 }
+
+#define MMC_FIXUP_REV(_name, _manfid, _oemid, _rev_start, _rev_end,	\
+		      _fixup, _data)					\
+	_FIXUP_EXT(_name, _manfid,					\
+		   _oemid, _rev_start, _rev_end,			\
+		   SDIO_ANY_ID, SDIO_ANY_ID,				\
+		   _fixup, _data)					\
+
+#define MMC_FIXUP(_name, _manfid, _oemid, _fixup, _data) \
+	MMC_FIXUP_REV(_name, _manfid, _oemid, 0, -1ull, _fixup, _data)
+
+#define SDIO_FIXUP(_vendor, _device, _fixup, _data)			\
+	_FIXUP_EXT(CID_NAME_ANY, CID_MANFID_ANY,			\
+		    CID_OEMID_ANY, 0, -1ull,				\
+		   _vendor, _device,					\
+		   _fixup, _data)					\
+
+ /*
+  * This hook just adds a quirk unconditionnally
+  */
+ static void __maybe_unused add_quirk(struct mmc_card *card, int data)
+ {
+	 card->quirks |= data;
+ }
+
+ /*
+  * This hook just removes a quirk unconditionnally
+  */
+ static void __maybe_unused remove_quirk(struct mmc_card *card, int data)
+ {
+	 card->quirks &= ~data;
+ }
 
 #ifndef SDIO_VENDOR_ID_TI
 #define SDIO_VENDOR_ID_TI		0x0097
@@ -58,24 +112,36 @@ static void add_quirk_for_sdio_devices(struct mmc_card *card, int data)
 #endif
 
 static const struct mmc_fixup mmc_fixup_methods[] = {
+
 	/* by default sdio devices are considered CLK_GATING broken */
 	/* good cards will be whitelisted as they are tested */
-	{ SDIO_ANY_ID, SDIO_ANY_ID,
-		add_quirk_for_sdio_devices, MMC_QUIRK_BROKEN_CLK_GATING },
-	{ SDIO_VENDOR_ID_TI, SDIO_DEVICE_ID_TI_WL1271,
-		remove_quirk, MMC_QUIRK_BROKEN_CLK_GATING },
-	{ 0 }
+	SDIO_FIXUP(SDIO_ANY_ID, SDIO_ANY_ID,
+		   add_quirk, MMC_QUIRK_BROKEN_CLK_GATING),
+
+	SDIO_FIXUP(SDIO_VENDOR_ID_TI, SDIO_DEVICE_ID_TI_WL1271,
+		   remove_quirk, MMC_QUIRK_BROKEN_CLK_GATING),
+
+	END_FIXUP
 };
 
 void mmc_fixup_device(struct mmc_card *card)
 {
 	const struct mmc_fixup *f;
+	u64 rev = cid_rev_card(card);
 
 	for (f = mmc_fixup_methods; f->vendor_fixup; f++) {
-		if ((f->vendor == card->cis.vendor
-		     || f->vendor == (u16) SDIO_ANY_ID) &&
-		    (f->device == card->cis.device
-		     || f->device == (u16) SDIO_ANY_ID)) {
+		if ((f->manfid == CID_MANFID_ANY
+		     || f->manfid == card->cid.manfid) &&
+		    (f->oemid == CID_OEMID_ANY
+		     || f->oemid == card->cid.oemid) &&
+		    (f->name == CID_NAME_ANY
+		     || !strcmp(f->name, card->cid.prod_name)) &&
+		    (f->cis_vendor == card->cis.vendor
+		     || f->cis_vendor == (u16) SDIO_ANY_ID) &&
+		    (f->cis_device == card->cis.device
+		    || f->cis_device == (u16) SDIO_ANY_ID) &&
+		    rev >= f->rev_start &&
+		    rev <= f->rev_end)	{
 			dev_dbg(&card->dev, "calling %pF\n", f->vendor_fixup);
 			f->vendor_fixup(card, f->data);
 		}
-- 
1.7.0.4


^ permalink raw reply related	[flat|nested] 28+ messages in thread

* [RFC 2/5] MMC: Allow function-specific quirks.
  2011-03-10  0:54 Block quirks redux + Toshiba performance quirk Andrei Warkentin
  2011-03-10  0:12 ` Andrei Warkentin
  2011-03-10  0:54 ` [RFC 1/5] MMC: Extends card quicks with MMC/SD quirks matching the CID Andrei Warkentin
@ 2011-03-10  0:54 ` Andrei Warkentin
  2011-03-10 15:09   ` Arnd Bergmann
  2011-03-10  0:54 ` [RFC 3/5] MMC: Support for block quirks Andrei Warkentin
                   ` (3 subsequent siblings)
  6 siblings, 1 reply; 28+ messages in thread
From: Andrei Warkentin @ 2011-03-10  0:54 UTC (permalink / raw)
  To: linux-mmc; +Cc: Andrei Warkentin

This allows us to create quirks for block devices, without
creating messy Kconfig dependencies, or polluting core/ with
function-specific code.

Change-Id: I0fd466f115718a23edd0636e1e73d91c77b63887
Signed-off-by: Andrei Warkentin <andreiw@motorola.com>
---
 drivers/mmc/core/core.h   |    2 -
 drivers/mmc/core/quirks.c |  100 +++-----------------------------------------
 drivers/mmc/core/sdio.c   |    2 +-
 include/linux/mmc/card.h  |   92 +++++++++++++++++++++++++++++++++++++++++-
 4 files changed, 99 insertions(+), 97 deletions(-)

diff --git a/drivers/mmc/core/core.h b/drivers/mmc/core/core.h
index a2a956b..406a50f 100644
--- a/drivers/mmc/core/core.h
+++ b/drivers/mmc/core/core.h
@@ -59,8 +59,6 @@ int mmc_attach_mmc(struct mmc_host *host, u32 ocr);
 int mmc_attach_sd(struct mmc_host *host, u32 ocr);
 int mmc_attach_sdio(struct mmc_host *host, u32 ocr);
 
-void mmc_fixup_device(struct mmc_card *card);
-
 /* Module parameters */
 extern int use_spi_crc;
 extern int mmc_assume_removable;
diff --git a/drivers/mmc/core/quirks.c b/drivers/mmc/core/quirks.c
index f795b8d..4816c1f 100644
--- a/drivers/mmc/core/quirks.c
+++ b/drivers/mmc/core/quirks.c
@@ -11,97 +11,6 @@
 #include <linux/types.h>
 #include <linux/kernel.h>
 #include <linux/mmc/card.h>
-#include <linux/mod_devicetable.h>
-
-#define cid_rev(hwrev, fwrev, year, month)	\
-	(((u64) hwrev) << 40 |                  \
-	 ((u64) fwrev) << 32 |                  \
-	 ((u64) year) << 16 |                   \
-	 ((u64) month))
-
-#define cid_rev_card(card)		  \
-	cid_rev(card->cid.hwrev,	  \
-		    card->cid.fwrev,      \
-		    card->cid.year,	  \
-		    card->cid.month)
-
-
-
-/*
- *  The world is not perfect and supplies us with broken mmc/sdio devices.
- *  For at least a part of these bugs we need a work-around
- */
-
-struct mmc_fixup {
-
-	/* CID-specific fields. */
-	const char *name;
-
-	/* Valid revision range */
-	u64 rev_start, rev_end;
-
-	unsigned int manfid;
-	unsigned short oemid;
-
-       /* SDIO-specfic fields. You can use SDIO_ANY_ID here of course */
-	u16 cis_vendor, cis_device;
-
-	void (*vendor_fixup)(struct mmc_card *card, int data);
-	int data;
-};
-
-#define CID_MANFID_ANY (-1ul)
-#define CID_OEMID_ANY ((unsigned short) -1)
-#define CID_NAME_ANY (NULL)
-
-#define END_FIXUP { 0 }
-
-#define _FIXUP_EXT(_name, _manfid, _oemid, _rev_start, _rev_end,	\
-		   _cis_vendor, _cis_device,				\
-		   _fixup, _data)					\
-	{						   \
-		.name = (_name),			   \
-		.manfid = (_manfid),			   \
-		.oemid = (_oemid),			   \
-		.rev_start = (_rev_start),		   \
-		.rev_end = (_rev_end),			   \
-		.cis_vendor = (_cis_vendor),		   \
-		.cis_device = (_cis_device),		   \
-		.vendor_fixup = (_fixup),		   \
-		.data = (_data),			   \
-	 }
-
-#define MMC_FIXUP_REV(_name, _manfid, _oemid, _rev_start, _rev_end,	\
-		      _fixup, _data)					\
-	_FIXUP_EXT(_name, _manfid,					\
-		   _oemid, _rev_start, _rev_end,			\
-		   SDIO_ANY_ID, SDIO_ANY_ID,				\
-		   _fixup, _data)					\
-
-#define MMC_FIXUP(_name, _manfid, _oemid, _fixup, _data) \
-	MMC_FIXUP_REV(_name, _manfid, _oemid, 0, -1ull, _fixup, _data)
-
-#define SDIO_FIXUP(_vendor, _device, _fixup, _data)			\
-	_FIXUP_EXT(CID_NAME_ANY, CID_MANFID_ANY,			\
-		    CID_OEMID_ANY, 0, -1ull,				\
-		   _vendor, _device,					\
-		   _fixup, _data)					\
-
- /*
-  * This hook just adds a quirk unconditionnally
-  */
- static void __maybe_unused add_quirk(struct mmc_card *card, int data)
- {
-	 card->quirks |= data;
- }
-
- /*
-  * This hook just removes a quirk unconditionnally
-  */
- static void __maybe_unused remove_quirk(struct mmc_card *card, int data)
- {
-	 card->quirks &= ~data;
- }
 
 #ifndef SDIO_VENDOR_ID_TI
 #define SDIO_VENDOR_ID_TI		0x0097
@@ -124,12 +33,17 @@ static const struct mmc_fixup mmc_fixup_methods[] = {
 	END_FIXUP
 };
 
-void mmc_fixup_device(struct mmc_card *card)
+void mmc_fixup_device(struct mmc_card *card,
+	const struct mmc_fixup *table)
 {
 	const struct mmc_fixup *f;
 	u64 rev = cid_rev_card(card);
 
-	for (f = mmc_fixup_methods; f->vendor_fixup; f++) {
+	/* Non-core specific workarounds. */
+	if (!table)
+		table = mmc_fixup_methods;
+
+	for (f = table; f->vendor_fixup; f++) {
 		if ((f->manfid == CID_MANFID_ANY
 		     || f->manfid == card->cid.manfid) &&
 		    (f->oemid == CID_OEMID_ANY
diff --git a/drivers/mmc/core/sdio.c b/drivers/mmc/core/sdio.c
index 50749f5..d3ec4dc 100644
--- a/drivers/mmc/core/sdio.c
+++ b/drivers/mmc/core/sdio.c
@@ -480,7 +480,7 @@ static int mmc_sdio_init_card(struct mmc_host *host, u32 ocr,
 		card = oldcard;
 		return 0;
 	}
-	mmc_fixup_device(card);
+	mmc_fixup_device(card, NULL);
 
 	if (card->type == MMC_TYPE_SD_COMBO) {
 		err = mmc_sd_setup_card(host, card, oldcard != NULL);
diff --git a/include/linux/mmc/card.h b/include/linux/mmc/card.h
index fe9d7be..00fdeb9 100644
--- a/include/linux/mmc/card.h
+++ b/include/linux/mmc/card.h
@@ -11,6 +11,7 @@
 #define LINUX_MMC_CARD_H
 
 #include <linux/mmc/core.h>
+#include <linux/mod_devicetable.h>
 
 struct mmc_cid {
 	unsigned int		manfid;
@@ -149,7 +150,93 @@ struct mmc_card {
 	struct dentry		*debugfs_root;
 };
 
-void mmc_fixup_device(struct mmc_card *dev);
+/*
+ *  The world is not perfect and supplies us with broken mmc/sdio devices.
+ *  For at least a part of these bugs we need a work-around
+ */
+
+struct mmc_fixup {
+
+	/* CID-specific fields. */
+	const char *name;
+
+	/* Valid revision range */
+	u64 rev_start, rev_end;
+
+	unsigned int manfid;
+	unsigned short oemid;
+
+       /* SDIO-specfic fields. You can use SDIO_ANY_ID here of course */
+	u16 cis_vendor, cis_device;
+
+	void (*vendor_fixup)(struct mmc_card *card, int data);
+	int data;
+};
+
+#define CID_MANFID_ANY (-1ul)
+#define CID_OEMID_ANY ((unsigned short) -1)
+#define CID_NAME_ANY (NULL)
+
+#define END_FIXUP { 0 }
+
+#define _FIXUP_EXT(_name, _manfid, _oemid, _rev_start, _rev_end,	\
+		   _cis_vendor, _cis_device,				\
+		   _fixup, _data)					\
+	{						   \
+		.name = (_name),			   \
+		.manfid = (_manfid),			   \
+		.oemid = (_oemid),			   \
+		.rev_start = (_rev_start),		   \
+		.rev_end = (_rev_end),			   \
+		.cis_vendor = (_cis_vendor),		   \
+		.cis_device = (_cis_device),		   \
+		.vendor_fixup = (_fixup),		   \
+		.data = (_data),			   \
+	 }
+
+#define MMC_FIXUP_REV(_name, _manfid, _oemid, _rev_start, _rev_end,	\
+		      _fixup, _data)					\
+	_FIXUP_EXT(_name, _manfid,					\
+		   _oemid, _rev_start, _rev_end,			\
+		   SDIO_ANY_ID, SDIO_ANY_ID,				\
+		   _fixup, _data)					\
+
+#define MMC_FIXUP(_name, _manfid, _oemid, _fixup, _data) \
+	MMC_FIXUP_REV(_name, _manfid, _oemid, 0, -1ull, _fixup, _data)
+
+#define SDIO_FIXUP(_vendor, _device, _fixup, _data)			\
+	_FIXUP_EXT(CID_NAME_ANY, CID_MANFID_ANY,			\
+		    CID_OEMID_ANY, 0, -1ull,				\
+		   _vendor, _device,					\
+		   _fixup, _data)					\
+
+#define cid_rev(hwrev, fwrev, year, month)	\
+	(((u64) hwrev) << 40 |                  \
+	 ((u64) fwrev) << 32 |                  \
+	 ((u64) year) << 16 |                   \
+	 ((u64) month))
+
+#define cid_rev_card(card)		  \
+	cid_rev(card->cid.hwrev,	  \
+		    card->cid.fwrev,      \
+		    card->cid.year,	  \
+		    card->cid.month)
+
+/*
+ * This hook just adds a quirk unconditionnally
+ */
+static inline void __maybe_unused add_quirk(struct mmc_card *card, int data)
+{
+	card->quirks |= data;
+}
+
+/*
+ * This hook just removes a quirk unconditionnally
+ */
+static inline void __maybe_unused remove_quirk(struct mmc_card *card, int data)
+{
+	card->quirks &= ~data;
+}
 
 #define mmc_card_mmc(c)		((c)->type == MMC_TYPE_MMC)
 #define mmc_card_sd(c)		((c)->type == MMC_TYPE_SD)
@@ -196,4 +283,7 @@ struct mmc_driver {
 extern int mmc_register_driver(struct mmc_driver *);
 extern void mmc_unregister_driver(struct mmc_driver *);
 
+extern void mmc_fixup_device(struct mmc_card *card,
+			     const struct mmc_fixup *table);
+
 #endif
-- 
1.7.0.4


^ permalink raw reply related	[flat|nested] 28+ messages in thread

* [RFC 3/5] MMC: Support for block quirks.
  2011-03-10  0:54 Block quirks redux + Toshiba performance quirk Andrei Warkentin
                   ` (2 preceding siblings ...)
  2011-03-10  0:54 ` [RFC 2/5] MMC: Allow function-specific quirks Andrei Warkentin
@ 2011-03-10  0:54 ` Andrei Warkentin
  2011-03-10  0:54 ` [RFC 4/5] MMC: Adjust unaligned write accesses Andrei Warkentin
                   ` (2 subsequent siblings)
  6 siblings, 0 replies; 28+ messages in thread
From: Andrei Warkentin @ 2011-03-10  0:54 UTC (permalink / raw)
  To: linux-mmc; +Cc: Andrei Warkentin

Block quirks implemented using core/quirks.c support.

Change-Id: I81d9ad57a7ae95c60ee8026f090c8df7c75fd069
Signed-off-by: Andrei Warkentin <andreiw@motorola.com>
---
 drivers/mmc/card/block.c |    7 +++++++
 1 files changed, 7 insertions(+), 0 deletions(-)

diff --git a/drivers/mmc/card/block.c b/drivers/mmc/card/block.c
index 7054fd5..913f394 100644
--- a/drivers/mmc/card/block.c
+++ b/drivers/mmc/card/block.c
@@ -686,6 +686,11 @@ mmc_blk_set_blksize(struct mmc_blk_data *md, struct mmc_card *card)
 	return 0;
 }
 
+static const struct mmc_fixup blk_fixups[] =
+{
+	END_FIXUP
+};
+
 static int mmc_blk_probe(struct mmc_card *card)
 {
 	struct mmc_blk_data *md;
@@ -714,6 +719,8 @@ static int mmc_blk_probe(struct mmc_card *card)
 		cap_str, md->read_only ? "(ro)" : "");
 
 	mmc_set_drvdata(card, md);
+	mmc_fixup_device(card, blk_fixups);
+
 #ifdef CONFIG_MMC_BLOCK_DEFERRED_RESUME
 	mmc_set_bus_resume_policy(card->host, 1);
 #endif
-- 
1.7.0.4


^ permalink raw reply related	[flat|nested] 28+ messages in thread

* [RFC 4/5] MMC: Adjust unaligned write accesses.
  2011-03-10  0:54 Block quirks redux + Toshiba performance quirk Andrei Warkentin
                   ` (3 preceding siblings ...)
  2011-03-10  0:54 ` [RFC 3/5] MMC: Support for block quirks Andrei Warkentin
@ 2011-03-10  0:54 ` Andrei Warkentin
  2011-03-10 16:05   ` Arnd Bergmann
  2011-03-10  0:54 ` [RFC 5/5] MMC: Toshiba eMMC - Split 8K-unaligned accesses Andrei Warkentin
  2011-03-10  1:03 ` [RFC] MMC: Extends card quicks with MMC/SD quirks matching the CID Andrei Warkentin
  6 siblings, 1 reply; 28+ messages in thread
From: Andrei Warkentin @ 2011-03-10  0:54 UTC (permalink / raw)
  To: linux-mmc; +Cc: Andrei Warkentin

Split unaligned write accesses that span the preferred alignment
size into two accesses - an unaligned one and an aligned one.
This is meant to be used for card quirks, and is off
by default. An upper limit on the transfer size to which this
adjustment applies is available, as on some cards there is a
performance decrease for larger transfers.

Signed-off-by: Andrei Warkentin <andreiw@motorola.com>
---
 drivers/mmc/card/block.c |   43 +++++++++++++++++++++++++++++++++++++++++++
 1 files changed, 43 insertions(+), 0 deletions(-)

diff --git a/drivers/mmc/card/block.c b/drivers/mmc/card/block.c
index 913f394..a8f18c7 100644
--- a/drivers/mmc/card/block.c
+++ b/drivers/mmc/card/block.c
@@ -63,6 +63,8 @@ struct mmc_blk_data {
 
 	unsigned int	usage;
 	unsigned int	read_only;
+	unsigned int	write_align_size;
+	unsigned int	write_align_limit;
 };
 
 static DEFINE_MUTEX(open_lock);
@@ -312,6 +314,43 @@ out:
 	return err ? 0 : 1;
 }
 
+/*
+ * If the request is not aligned, split it into an unaligned
+ * and an aligned portion. Here we can adjust
+ * the size of the MMC request and let the block layer request handler
+ * deal with generating another MMC request.
+ */
+
+static void mmc_adjust_write(struct mmc_card *card,
+			     struct mmc_request *mrq)
+{
+	unsigned int left_in_page;
+	unsigned int wa_size_blocks;
+	struct mmc_blk_data *md = mmc_get_drvdata(card);
+
+	if (!md->write_align_size)
+		return;
+
+	if (md->write_align_limit &&
+	    (md->write_align_limit / mrq->data->blksz)
+	    < mrq->data->blocks)
+		return;
+
+	wa_size_blocks = md->write_align_size / mrq->data->blksz;
+	left_in_page = wa_size_blocks -
+		(mrq->cmd->arg % wa_size_blocks);
+
+	/* Aligned access. */
+	if (left_in_page == wa_size_blocks)
+		return;
+
+	/* Not straddling page boundary. */
+	if (mrq->data->blocks <= left_in_page)
+		return;
+
+	mrq->data->blocks = left_in_page;
+}
+
 static int mmc_blk_issue_rw_rq(struct mmc_queue *mq, struct request *req)
 {
 	struct mmc_blk_data *md = mq->data;
@@ -339,6 +378,10 @@ static int mmc_blk_issue_rw_rq(struct mmc_queue *mq, struct request *req)
 		brq.stop.flags = MMC_RSP_SPI_R1B | MMC_RSP_R1B | MMC_CMD_AC;
 		brq.data.blocks = blk_rq_sectors(req);
 
+		/* Check for unaligned accesses straddling pages. */
+		if (rq_data_dir(req) == WRITE)
+			mmc_adjust_write(card, &brq.mrq);
+
 		/*
 		 * The block layer doesn't support all sector count
 		 * restrictions, so we need to be prepared for too big
-- 
1.7.0.4


^ permalink raw reply related	[flat|nested] 28+ messages in thread

* [RFC 5/5] MMC: Toshiba eMMC - Split 8K-unaligned accesses.
  2011-03-10  0:54 Block quirks redux + Toshiba performance quirk Andrei Warkentin
                   ` (4 preceding siblings ...)
  2011-03-10  0:54 ` [RFC 4/5] MMC: Adjust unaligned write accesses Andrei Warkentin
@ 2011-03-10  0:54 ` Andrei Warkentin
  2011-03-10  1:03 ` [RFC] MMC: Extends card quicks with MMC/SD quirks matching the CID Andrei Warkentin
  6 siblings, 0 replies; 28+ messages in thread
From: Andrei Warkentin @ 2011-03-10  0:54 UTC (permalink / raw)
  To: linux-mmc; +Cc: Andrei Warkentin

These cards show abysmal write performance when writes smaller than
12K that are not 8K-aligned cross an 8K boundary.

Change-Id: I3b015a93ae21d077b447e7d6fbe81255dbd2b0df
Signed-off-by: Andrei Warkentin <andreiw@motorola.com>
---
 drivers/mmc/card/Kconfig |    9 +++++++++
 drivers/mmc/card/block.c |   20 ++++++++++++++++++++
 2 files changed, 29 insertions(+), 0 deletions(-)

diff --git a/drivers/mmc/card/Kconfig b/drivers/mmc/card/Kconfig
index 86948f9..1a9e4aa 100644
--- a/drivers/mmc/card/Kconfig
+++ b/drivers/mmc/card/Kconfig
@@ -14,6 +14,15 @@ config MMC_BLOCK
 	  mount the filesystem. Almost everyone wishing MMC support
 	  should say Y or M here.
 
+config MMC_BLOCK_QUIRK_TOSHIBA_32NM
+       tristate "Toshiba MMC 32nm technology flash device quirks"
+       depends on MMC_BLOCK
+       default n
+       help
+         Say Y if you have a Toshiba 32nm technology flash device,
+	 such as MMC32G or MMC16G eMMCs. This enables a performance
+	 improvement for flash page unaligned writes.
+
 config MMC_BLOCK_BOUNCE
 	bool "Use bounce buffer for simple hosts"
 	depends on MMC_BLOCK
diff --git a/drivers/mmc/card/block.c b/drivers/mmc/card/block.c
index a8f18c7..5250748 100644
--- a/drivers/mmc/card/block.c
+++ b/drivers/mmc/card/block.c
@@ -729,8 +729,28 @@ mmc_blk_set_blksize(struct mmc_blk_data *md, struct mmc_card *card)
 	return 0;
 }
 
+#ifdef CONFIG_MMC_BLOCK_QUIRK_TOSHIBA_32NM
+static void toshiba_32nm_fixup(struct mmc_card *card, int data)
+{
+	struct mmc_blk_data *md = mmc_get_drvdata(card);
+	printk(KERN_INFO "Applying Toshiba 32nm workarounds\n");
+
+	/* Page size 8K, this card doesn't like unaligned writes
+	   across 8K boundary. */
+	md->write_align_size = 8192;
+
+	/* Doing the alignment for accesses > 12K seems to
+	   result in decreased perf. */
+	md->write_align_limit = 12288;
+}
+#endif /* CONFIG_MMC_BLOCK_QUIRK_TOSHIBA_32NM */
+
 static const struct mmc_fixup blk_fixups[] =
 {
+#ifdef CONFIG_MMC_BLOCK_QUIRK_TOSHIBA_32NM
+	MMC_FIXUP("MMC16G", 0x11, 0x0, toshiba_32nm_fixup, 0),
+	MMC_FIXUP("MMC32G", 0x11, 0x0100, toshiba_32nm_fixup, 0),
+#endif /* CONFIG_MMC_BLOCK_QUIRK_TOSHIBA_32NM */
 	END_FIXUP
 };
 
-- 
1.7.0.4


^ permalink raw reply related	[flat|nested] 28+ messages in thread

* [RFC] MMC: Extends card quicks with MMC/SD quirks matching the CID.
  2011-03-10  0:54 Block quirks redux + Toshiba performance quirk Andrei Warkentin
                   ` (5 preceding siblings ...)
  2011-03-10  0:54 ` [RFC 5/5] MMC: Toshiba eMMC - Split 8K-unaligned accesses Andrei Warkentin
@ 2011-03-10  1:03 ` Andrei Warkentin
  6 siblings, 0 replies; 28+ messages in thread
From: Andrei Warkentin @ 2011-03-10  1:03 UTC (permalink / raw)
  To: linux-mmc; +Cc: Andrei Warkentin

The current mechanism is SDIO-only.

Signed-off-by: Andrei Warkentin <andreiw@motorola.com>
---
 drivers/mmc/core/quirks.c |  142 +++++++++++++++++++++++++++++++++++----------
 1 files changed, 110 insertions(+), 32 deletions(-)

diff --git a/drivers/mmc/core/quirks.c b/drivers/mmc/core/quirks.c
index 4fb16ac..5c5c948 100644
--- a/drivers/mmc/core/quirks.c
+++ b/drivers/mmc/core/quirks.c
@@ -13,41 +13,95 @@
 #include <linux/mmc/card.h>
 #include <linux/mod_devicetable.h>
 
+#define cid_rev(hwrev, fwrev, year, month)	\
+	(((u64) hwrev) << 40 |                  \
+	 ((u64) fwrev) << 32 |                  \
+	 ((u64) year) << 16 |                   \
+	 ((u64) month))
+
+#define cid_rev_card(card)		  \
+	cid_rev(card->cid.hwrev,	  \
+		    card->cid.fwrev,      \
+		    card->cid.year,	  \
+		    card->cid.month)
+
+
+
 /*
  *  The world is not perfect and supplies us with broken mmc/sdio devices.
  *  For at least a part of these bugs we need a work-around
  */
 
 struct mmc_fixup {
-	u16 vendor, device;	/* You can use SDIO_ANY_ID here of course */
+
+	/* CID-specific fields. */
+	const char *name;
+
+	/* Valid revision range */
+	u64 rev_start, rev_end;
+
+	unsigned int manfid;
+	unsigned short oemid;
+
+       /* SDIO-specfic fields. You can use SDIO_ANY_ID here of course */
+	u16 cis_vendor, cis_device;
+
 	void (*vendor_fixup)(struct mmc_card *card, int data);
 	int data;
 };
 
-/*
- * This hook just adds a quirk unconditionnally
- */
-static void __maybe_unused add_quirk(struct mmc_card *card, int data)
-{
-	card->quirks |= data;
-}
+#define CID_MANFID_ANY (-1ul)
+#define CID_OEMID_ANY ((unsigned short) -1)
+#define CID_NAME_ANY (NULL)
 
-/*
- * This hook just removes a quirk unconditionnally
- */
-static void __maybe_unused remove_quirk(struct mmc_card *card, int data)
-{
-	card->quirks &= ~data;
-}
+#define END_FIXUP { 0 }
 
-/*
- * This hook just adds a quirk for all sdio devices
- */
-static void add_quirk_for_sdio_devices(struct mmc_card *card, int data)
-{
-	if (mmc_card_sdio(card))
-		card->quirks |= data;
-}
+#define _FIXUP_EXT(_name, _manfid, _oemid, _rev_start, _rev_end,	\
+		   _cis_vendor, _cis_device,				\
+		   _fixup, _data)					\
+	{						   \
+		.name = (_name),			   \
+		.manfid = (_manfid),			   \
+		.oemid = (_oemid),			   \
+		.rev_start = (_rev_start),		   \
+		.rev_end = (_rev_end),			   \
+		.cis_vendor = (_cis_vendor),		   \
+		.cis_device = (_cis_device),		   \
+		.vendor_fixup = (_fixup),		   \
+		.data = (_data),			   \
+	 }
+
+#define MMC_FIXUP_REV(_name, _manfid, _oemid, _rev_start, _rev_end,	\
+		      _fixup, _data)					\
+	_FIXUP_EXT(_name, _manfid,					\
+		   _oemid, _rev_start, _rev_end,			\
+		   SDIO_ANY_ID, SDIO_ANY_ID,				\
+		   _fixup, _data)					\
+
+#define MMC_FIXUP(_name, _manfid, _oemid, _fixup, _data) \
+	MMC_FIXUP_REV(_name, _manfid, _oemid, 0, -1ull, _fixup, _data)
+
+#define SDIO_FIXUP(_vendor, _device, _fixup, _data)			\
+	_FIXUP_EXT(CID_NAME_ANY, CID_MANFID_ANY,			\
+		    CID_OEMID_ANY, 0, -1ull,				\
+		   _vendor, _device,					\
+		   _fixup, _data)					\
+
+ /*
+  * This hook just adds a quirk unconditionnally
+  */
+ static void __maybe_unused add_quirk(struct mmc_card *card, int data)
+ {
+	 card->quirks |= data;
+ }
+
+ /*
+  * This hook just removes a quirk unconditionnally
+  */
+ static void __maybe_unused remove_quirk(struct mmc_card *card, int data)
+ {
+	 card->quirks &= ~data;
+ }
 
 #ifndef SDIO_VENDOR_ID_TI
 #define SDIO_VENDOR_ID_TI		0x0097
@@ -57,25 +111,49 @@ static void add_quirk_for_sdio_devices(struct mmc_card *card, int data)
 #define SDIO_DEVICE_ID_TI_WL1271	0x4076
 #endif
 
+
+/*
+ * This hook just adds a quirk for all sdio devices
+*/
+
+static void add_quirk_for_sdio_devices(struct mmc_card *card, int data)
+{
+	if (mmc_card_sdio(card))
+		card->quirks |= data;
+}
+
 static const struct mmc_fixup mmc_fixup_methods[] = {
+
 	/* by default sdio devices are considered CLK_GATING broken */
 	/* good cards will be whitelisted as they are tested */
-	{ SDIO_ANY_ID, SDIO_ANY_ID,
-		add_quirk_for_sdio_devices, MMC_QUIRK_BROKEN_CLK_GATING },
-	{ SDIO_VENDOR_ID_TI, SDIO_DEVICE_ID_TI_WL1271,
-		remove_quirk, MMC_QUIRK_BROKEN_CLK_GATING },
-	{ 0 }
+	SDIO_FIXUP(SDIO_ANY_ID, SDIO_ANY_ID,
+		   add_quirk_for_sdio_devices,
+		   MMC_QUIRK_BROKEN_CLK_GATING),
+
+	SDIO_FIXUP(SDIO_VENDOR_ID_TI, SDIO_DEVICE_ID_TI_WL1271,
+		   remove_quirk, MMC_QUIRK_BROKEN_CLK_GATING),
+
+	END_FIXUP
 };
 
 void mmc_fixup_device(struct mmc_card *card)
 {
 	const struct mmc_fixup *f;
+	u64 rev = cid_rev_card(card);
 
 	for (f = mmc_fixup_methods; f->vendor_fixup; f++) {
-		if ((f->vendor == card->cis.vendor
-		     || f->vendor == (u16) SDIO_ANY_ID) &&
-		    (f->device == card->cis.device
-		     || f->device == (u16) SDIO_ANY_ID)) {
+		if ((f->manfid == CID_MANFID_ANY
+		     || f->manfid == card->cid.manfid) &&
+		    (f->oemid == CID_OEMID_ANY
+		     || f->oemid == card->cid.oemid) &&
+		    (f->name == CID_NAME_ANY
+		     || !strcmp(f->name, card->cid.prod_name)) &&
+		    (f->cis_vendor == card->cis.vendor
+		     || f->cis_vendor == (u16) SDIO_ANY_ID) &&
+		    (f->cis_device == card->cis.device
+		    || f->cis_device == (u16) SDIO_ANY_ID) &&
+		    rev >= f->rev_start &&
+		    rev <= f->rev_end)	{
 			dev_dbg(&card->dev, "calling %pF\n", f->vendor_fixup);
 			f->vendor_fixup(card, f->data);
 		}
-- 
1.7.0.4


^ permalink raw reply related	[flat|nested] 28+ messages in thread

* Re: [RFC 2/5] MMC: Allow function-specific quirks.
  2011-03-10  0:54 ` [RFC 2/5] MMC: Allow function-specific quirks Andrei Warkentin
@ 2011-03-10 15:09   ` Arnd Bergmann
  2011-03-10 20:41     ` Andrei Warkentin
  0 siblings, 1 reply; 28+ messages in thread
From: Arnd Bergmann @ 2011-03-10 15:09 UTC (permalink / raw)
  To: Andrei Warkentin; +Cc: linux-mmc

On Thursday 10 March 2011, Andrei Warkentin wrote:
> 
> This allows us to create quirks for block devices, without
> creating messy Kconfig dependencies, or polluting core/ with
> function-specific code.

The change looks good to me, but it's hard to read because
you are moving around code and changing it in one patch.

A better split of the first three patches would be to first
move the code to a header file without changing it, and then
make the changes.

Or you could directly add the code to the header file when you
introduce it in the first patch.

> Change-Id: I0fd466f115718a23edd0636e1e73d91c77b63887

Please remove these lines.

	Arnd

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [RFC 4/5] MMC: Adjust unaligned write accesses.
  2011-03-10  0:54 ` [RFC 4/5] MMC: Adjust unaligned write accesses Andrei Warkentin
@ 2011-03-10 16:05   ` Arnd Bergmann
  2011-03-10 20:45     ` Andrei Warkentin
  0 siblings, 1 reply; 28+ messages in thread
From: Arnd Bergmann @ 2011-03-10 16:05 UTC (permalink / raw)
  To: Andrei Warkentin; +Cc: linux-mmc

On Thursday 10 March 2011, Andrei Warkentin wrote:
> Adjust unaligned write accesses spanning preferred align
> size into two accesses - an unaligned and an aligned access.
> This is meant to be used for card quirks, and is off
> by default. A limiting value in transfer size
> for this adjustment is available, as on some cards there is a
> perf decrease for larger transfers.
> 
> Signed-off-by: Andrei Warkentin <andreiw@motorola.com>

Sorry for repeating myself, but I'm not sure if you didn't understand
me or if I missed your arguments against the more generic approach.

Instead of encoding specific parameters for this quirk, I would
much prefer to have the values be meaningful, and have the
code deal with the interpretation.

> @@ -63,6 +63,8 @@ struct mmc_blk_data {
>  
>  	unsigned int	usage;
>  	unsigned int	read_only;
> +	unsigned int	write_align_size;
> +	unsigned int	write_align_limit;
>  };

These numbers are generally speaking useless, except for the
one quirk. What I was trying to suggest here is to have
a page_size field that reflects the underlying page size
of the NAND flash, and a flag that says "please split all
requests under 1.5 times the page size along page boundaries".

We can pre-initialize the page size to some common value
(e.g. 16 KB for 4GB or larger cards, 4 KB for smaller than
4 GB), and use quirks to override it for cards where we
know it's different.

There are a lot of optimizations based on the page size
(partition alignment, fs block size, readahead, ...), so it
can become a generic blockdev attribute and get used by
both kernel and user code, rather than assuming we can
do 512 byte sector accesses efficiently.

The block layer already has physical_block_size and
io_min fields in various places, so the answer may be
to simply set one of those instead of introducing another
page_size for mmc.

	Arnd

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [RFC 2/5] MMC: Allow function-specific quirks.
  2011-03-10 15:09   ` Arnd Bergmann
@ 2011-03-10 20:41     ` Andrei Warkentin
  2011-03-10 21:55       ` Arnd Bergmann
  0 siblings, 1 reply; 28+ messages in thread
From: Andrei Warkentin @ 2011-03-10 20:41 UTC (permalink / raw)
  To: Arnd Bergmann; +Cc: linux-mmc

On Thu, Mar 10, 2011 at 9:09 AM, Arnd Bergmann <arnd@arndb.de> wrote:
> On Thursday 10 March 2011, Andrei Warkentin wrote:
>>
>> This allows us to create quirks for block devices, without
>> creating messy Kconfig dependencies, or polluting core/ with
>> function-specific code.
>
> The change looks good to me, but it's hard to read because
> you are moving around code and changing it in one patch.
>
> A better split of the first three patches would be to first
> move the code to a header file without changing it, and then
> make the changes.
>
> Or you could directly add the code to the header file when you
> introduce it in the first patch.

So basically squash the three patches dealing with quirks infrastructure?

A

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [RFC 4/5] MMC: Adjust unaligned write accesses.
  2011-03-10 16:05   ` Arnd Bergmann
@ 2011-03-10 20:45     ` Andrei Warkentin
  2011-03-10 21:54       ` Arnd Bergmann
  0 siblings, 1 reply; 28+ messages in thread
From: Andrei Warkentin @ 2011-03-10 20:45 UTC (permalink / raw)
  To: Arnd Bergmann; +Cc: linux-mmc

On Thu, Mar 10, 2011 at 10:05 AM, Arnd Bergmann <arnd@arndb.de> wrote:
> Sorry for repeating myself, but I'm not sure if you didn't understand
> me or if I missed your arguments against the more generic approach.
>
> Instead of encoding specific parameters for this quirk, I would
> much prefer to have the values be meaningful, and have the
> code deal with the interpretation.
>
>> @@ -63,6 +63,8 @@ struct mmc_blk_data {
>>
>>       unsigned int    usage;
>>       unsigned int    read_only;
>> +     unsigned int    write_align_size;
>> +     unsigned int    write_align_limit;
>>  };
>
> These numbers are generally speaking useless, except for the
> one quirk. What I was trying to suggest here is to have
> a page_size field that reflects the underlying page size
> of the NAND flash, and a flag that says "please split all
> requests under 1.5 times the page size along page boundaries".

Ah ok, I really did just misinterpret. Sorry. Will refactor.

>
> We can pre-initialize the page size to some common value
> (e.g. 16 KB for 4GB or larger cards, 4 KB for smaller than
> 4 GB), and use quirks to override it for cards where we
> know it's different.
>

Even better. We have the "super page size" in the EXT_CSD. For example
for Sandisk it's 16KB, Toshiba is reporting 32KB, but we know that's
not right...

Thanks again,
A

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [RFC 4/5] MMC: Adjust unaligned write accesses.
  2011-03-10 20:45     ` Andrei Warkentin
@ 2011-03-10 21:54       ` Arnd Bergmann
  2011-03-10 23:06         ` Andrei Warkentin
  0 siblings, 1 reply; 28+ messages in thread
From: Arnd Bergmann @ 2011-03-10 21:54 UTC (permalink / raw)
  To: Andrei Warkentin; +Cc: linux-mmc

On Thursday 10 March 2011 21:45:07 Andrei Warkentin wrote:
> > We can pre-initialize the page size to some common value
> > (e.g. 16 KB for 4GB or larger cards, 4 KB for smaller than
> > 4 GB), and use quirks to override it for cards where we
> > know it's different.
> >
> 
> Even better. We have the "super page size" in the EXT_CSD. For example
> for Sandisk it's 16KB, Toshiba is reporting 32KB, but we know that's
> not right...

Ok, that sounds good. I don't think we have anything like this
for SD cards though, so we'd still need to make some reasonable
assumption there.

The one thing we know is that all SD cards should support writes of
32KB alignment, because that is the largest cluster size supported
by FAT16 and FAT32. I have experimentally shown that most cards
can reasonably do 16KB, and only few modern cards can do smaller
than that.

I also don't know what the effect of setting physical_block_size
and/or io_min is, possibly it no longer works if they are larger
than the MMU page size. Need to try this.

	Arnd

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [RFC 2/5] MMC: Allow function-specific quirks.
  2011-03-10 20:41     ` Andrei Warkentin
@ 2011-03-10 21:55       ` Arnd Bergmann
  0 siblings, 0 replies; 28+ messages in thread
From: Arnd Bergmann @ 2011-03-10 21:55 UTC (permalink / raw)
  To: Andrei Warkentin; +Cc: linux-mmc

On Thursday 10 March 2011 21:41:45 Andrei Warkentin wrote:
> > Or you could directly add the code to the header file when you
> > introduce it in the first patch.
> 
> So basically squash the three patches dealing with quirks infrastructure?

Yes, you could do that, or maybe just squash the first two patches.

	Arnd

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [RFC 4/5] MMC: Adjust unaligned write accesses.
  2011-03-10 21:54       ` Arnd Bergmann
@ 2011-03-10 23:06         ` Andrei Warkentin
  2011-03-11 10:23           ` Arnd Bergmann
  0 siblings, 1 reply; 28+ messages in thread
From: Andrei Warkentin @ 2011-03-10 23:06 UTC (permalink / raw)
  To: Arnd Bergmann; +Cc: linux-mmc

On Thu, Mar 10, 2011 at 3:54 PM, Arnd Bergmann <arnd@arndb.de> wrote:
> On Thursday 10 March 2011 21:45:07 Andrei Warkentin wrote:
>> > We can pre-initialize the page size to some common value
>> > (e.g. 16 KB for 4GB or larger cards, 4 KB for smaller than
>> > 4 GB), and use quirks to override it for cards where we
>> > know it's different.
>> >
>>
>> Even better. We have the "super page size" in the EXT_CSD. For example
>> for Sandisk it's 16KB, Toshiba is reporting 32KB, but we know that's
>> not right...
>
> Ok, that sounds good. I don't think we have anything like this
> for SD cards though, so we'd still need to make some reasonable
> assumption there.
>
> The one thing we know is that all SD cards should support writes of
> 32KB alignment, because that is the largest cluster size supported
> by FAT16 and FAT32. I have experimentally shown that most cards
> can reasonably do 16KB, and only few modern cards can do smaller
> than that.
>
> I also don't know what the effect of setting physical_block_size
> and/or io_min is, possibly it no longer works if they are larger
> than the MMU page size. Need to try this.

The other thing is figuring out the default size limit for when the
page align should be performed. I suppose it's safe enough to
set it to 1.5 times the super_page_size. But that number came from the
Toshiba card tests. Or it could be unbounded by default. I'd rather do
the latter.

A

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [RFC 4/5] MMC: Adjust unaligned write accesses.
  2011-03-10 23:06         ` Andrei Warkentin
@ 2011-03-11 10:23           ` Arnd Bergmann
  2011-03-13 13:00             ` Andrei Warkentin
  0 siblings, 1 reply; 28+ messages in thread
From: Arnd Bergmann @ 2011-03-11 10:23 UTC (permalink / raw)
  To: Andrei Warkentin; +Cc: linux-mmc

On Friday 11 March 2011, Andrei Warkentin wrote:
> > I also don't know what the effect of setting physical_block_size
> > and/or io_min is, possibly it no longer works if they are larger
> > than the MMU page size. Need to try this.
> 
> The other thing is figuring out the default size limit for when the
> page align should be performed. I suppose it's safe enough to
> set it to 1.5 size the super_page_size. But that number came from the
> Toshiba card tests. Or it could be unbounded by default. I'd rather do
> the later.

You mean always splitting (multiples of) full super-pages from partial
super-pages when the quirk flag is enabled?

I think it depends on the performance numbers. Do you have any
meaningful measurements without the quirk, with the current implementation
and with the unbounded case?

If the latter two are not much different on the toshiba card, that
would be a simpler implementation, and more likely to be useful on
other cards.

I think we should also do measurements to see if the same quirk
actually has any negative effects on other cards, or if there
are even cases where it helps.

	Arnd

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [RFC 4/5] MMC: Adjust unaligned write accesses.
  2011-03-11 10:23           ` Arnd Bergmann
@ 2011-03-13 13:00             ` Andrei Warkentin
  2011-03-13 14:54               ` Arnd Bergmann
  0 siblings, 1 reply; 28+ messages in thread
From: Andrei Warkentin @ 2011-03-13 13:00 UTC (permalink / raw)
  To: Arnd Bergmann; +Cc: linux-mmc

On Fri, Mar 11, 2011 at 4:23 AM, Arnd Bergmann <arnd@arndb.de> wrote:
> On Friday 11 March 2011, Andrei Warkentin wrote:
>> > I also don't know what the effect of setting physical_block_size
>> > and/or io_min is, possibly it no longer works if they are larger
>> > than the MMU page size. Need to try this.
>>
>> The other thing is figuring out the default size limit for when the
>> page align should be performed. I suppose it's safe enough to
>> set it to 1.5 size the super_page_size. But that number came from the
>> Toshiba card tests. Or it could be unbounded by default. I'd rather do
>> the later.
>
> You mean always splitting (multiples of) full super-pages from partial
> super-pages when the quirk flag is enabled?
>
> I think it depends on the performance numbers. Do you have any
> meaningful measurements without the quirk, with the current implementation
> and with the unbounded case?
>
> If the latter two are not much different on the toshiba card, that
> would be a simpler implementation, and more likely to be useful on
> other cards.
>

Revalidating the data now, along with some more tests, to get a better
picture. It seems the more data I get, the less it makes sense :(.

> I think we should also do measurements to see if the same quirk
> actually has any negative effects on other cards, or if there
> are even cases where it helps.

Going to test on Sandisk eMMC as well.

Thanks,
A

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [RFC 4/5] MMC: Adjust unaligned write accesses.
  2011-03-13 13:00             ` Andrei Warkentin
@ 2011-03-13 14:54               ` Arnd Bergmann
  2011-03-14  7:40                 ` Andrei Warkentin
  0 siblings, 1 reply; 28+ messages in thread
From: Arnd Bergmann @ 2011-03-13 14:54 UTC (permalink / raw)
  To: Andrei Warkentin; +Cc: linux-mmc

On Sunday 13 March 2011 14:00:04 Andrei Warkentin wrote:
> > If the latter two are not much different on the toshiba card, that
> > would be a simpler implementation, and more likely to be useful on
> > other cards.
> >
> 
> Revalidating the data now, along with some more tests, to get a better
> picture. It seems the more data I get, the less it makes sense :(.

I was already fearing that the change would only benefit low-level
benchmarks. It certainly helps writing small chunks to the buffer
that is meant for FAT32 directories, but at some point, the card
will have to write back the entire logical erase block, so you
might not be able to gain much in real-world workloads.

Of course, I also have only a very limited understanding of the
algorithm used in the Toshiba chip.

	Arnd

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [RFC 4/5] MMC: Adjust unaligned write accesses.
  2011-03-13 14:54               ` Arnd Bergmann
@ 2011-03-14  7:40                 ` Andrei Warkentin
  2011-03-19 11:09                   ` Andrei Warkentin
  0 siblings, 1 reply; 28+ messages in thread
From: Andrei Warkentin @ 2011-03-14  7:40 UTC (permalink / raw)
  To: Arnd Bergmann; +Cc: linux-mmc

On Sun, Mar 13, 2011 at 9:54 AM, Arnd Bergmann <arnd@arndb.de> wrote:
> On Sunday 13 March 2011 14:00:04 Andrei Warkentin wrote:
>> > If the latter two are not much different on the toshiba card, that
>> > would be a simpler implementation, and more likely to be useful on
>> > other cards.
>> >
>>
>> Revalidating the data now, along with some more tests, to get a better
>> picture. It seems the more data I get, the less it makes sense :(.
>
> I was already fearing that the change would only benefit low-level
> benchmarks. It certainly helps writing small chunks to the buffer
> that is meant for FAT32 directories, but at some point, the card
> will have to write back the entire logical erase block, so you
> might not be able to gain much in real-world workloads.
>

Original data was collected by someone else on a 16G eMMC device. The
data I have collected now for a 32G device of seemingly similar
technology doesn't quite match the original data in terms of
performance characteristics, so I am trying to collect the data on the
16G device and figure out what the difference is. The align change may
only benefit certain types of accesses, but it for sure must not hurt
the general case, otherwise there is really no point behind even
considering it...

It takes forever to get a good amount of data to see past the noise,
too. My last set of tests has already been running for 10 hours...

> Of course, I also have only a very limited understanding of the
> algorithm used in the Toshiba chip.
>

Unfortunately, so do I. As far as manufacturer suggestions, we get a
very controlled and limited view of things :).

Thanks,
A

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [RFC 4/5] MMC: Adjust unaligned write accesses.
  2011-03-14  7:40                 ` Andrei Warkentin
@ 2011-03-19 11:09                   ` Andrei Warkentin
  2011-03-21 14:21                     ` Arnd Bergmann
  0 siblings, 1 reply; 28+ messages in thread
From: Andrei Warkentin @ 2011-03-19 11:09 UTC (permalink / raw)
  To: Arnd Bergmann; +Cc: linux-mmc

[-- Attachment #1: Type: text/plain, Size: 3911 bytes --]

Hi Arnd, all...

On Mon, Mar 14, 2011 at 2:40 AM, Andrei Warkentin <andreiw@motorola.com> wrote:

>>>
>>> Revalidating the data now, along with some more tests, to get a better
>>> picture. It seems the more data I get, the less it makes sense :(.
>>
>> I was already fearing that the change would only benefit low-level
>> benchmarks. It certainly helps writing small chunks to the buffer
>> that is meant for FAT32 directories, but at some point, the card
>> will have to write back the entire logical erase block, so you
>> might not be able to gain much in real-world workloads.
>>
>

Attached is some data I have collected on the MMC32G part. I tried
to make the collection process as controlled as possible, as well as
use more-or-less a "real life" usage case that involves running a user
application, so it's not just a purely synthetic test at block level.

Attached file (I hope you don't mind PDFs) contains data collected for
two possible optimizations. The second page of the document tests the
vendor suggested optimization that is basically -
if (request_blocks < 24) {
     /* given request offset, calculate sectors remaining on 8K page
containing offset */
     sectors = 16 - (request_offset % 16);
     if (request_blocks > sectors) {
        request_blocks = sectors;
     }
}
...I'll call this optimization A.

...the first page of the document tests the optimization that floated
up on the list when I first sent a patch with the vendor suggestions.
That optimization being - align all unaligned accesses (either all
completely, or under a certain size threshold) on flash page size.
I'll call this optimization B.

To test, I collect time info for 2000 small inserts into a table with
sqlite into 20 separate tables. So that's 20 x 2000 sqlite inserts per
test. The test is executed for ext2, ext3 and ext4 with a 4k block
size. Every test begins with a flash discard and format operation on
the partition where the tables are created and accessed, to ensure
similar accesses to flash on every test. All other partitions are RO,
and no processes other than those needed by the tests run. All power
management is disabled. The results are thus repeatable, consistent
and stable across reboots and power-on time...

Each test consists of:
1) Unmount partition
2) Flash erase
3) Format with fs
4) Mount
5) Sync
6) echo 3 > /proc/sys/vm/drop_caches
7) run 20 x 2000 inserts as described above
8) unmount

For optimization B testing, the alignment size and alignment access
size threshold (same parameters as in my RFC patch) are exposed
through debugfs. To get B test data, the flow was

1) Set alignment to none (no optimization)
2) Sql test on ext2
3) Sql test on ext3
4) Sql test on ext4

6) Set alignment to 8k, no threshold
7) Sql test on ext2
8) Sql test on ext3
9) Sql test on ext4

10) Set alignment to 8k, < 8k only
11) Sql test on ext2
12) Sql test on ext3
13) Sql test on ext4

...all the way up to 32K threshold.

For optimization A testing, the optimization was turned off/on with a
debugfs attribute, and the data collected with this flow:

1) Turn off optimization
2) Sql test on ext2
3) Sql test on ext3
4) Sql test on ext4
5) Turn on optimization
6) Sql test on ext2
7) Sql test on ext3
8) Sql test on ext4

My interpretation of the results: Any kind of alignment-on-flash page
optimization produced data that in all cases was either
indistinguishable from control, or was worse. Do you agree with my
interpretation?

So I guess that hexes the align optimization, at least until I can get
data for MMC16G with the same controlled setup. Sorry about that. I'll
work on the "reliability optimization" now, which I guess is pretty
generic for cards with similar buffer schemes. It relies on reliable
writes, so exposing that will be first for review here...

Even though I'm rescinding the adjust/align patch, is there any chance
for pulling in my quirks changes?

Thanks,
A

[-- Attachment #2: flash data MMC32G.pdf --]
[-- Type: application/pdf, Size: 55157 bytes --]

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [RFC 4/5] MMC: Adjust unaligned write accesses.
  2011-03-19 11:09                   ` Andrei Warkentin
@ 2011-03-21 14:21                     ` Arnd Bergmann
  2011-03-21 14:41                       ` Andrei Warkentin
  0 siblings, 1 reply; 28+ messages in thread
From: Arnd Bergmann @ 2011-03-21 14:21 UTC (permalink / raw)
  To: Andrei Warkentin; +Cc: linux-mmc, linux-ext4

On Saturday 19 March 2011, Andrei Warkentin wrote:
> On Mon, Mar 14, 2011 at 2:40 AM, Andrei Warkentin <andreiw@motorola.com> wrote:
> 
> >>>
> >>> Revalidating the data now, along with some more tests, to get a better
> >>> picture. It seems the more data I get, the less it makes sense :(.
> >>
> >> I was already fearing that the change would only benefit low-level
> >> benchmarks. It certainly helps writing small chunks to the buffer
> >> that is meant for FAT32 directories, but at some point, the card
> >> will have to write back the entire logical erase block, so you
> >> might not be able to gain much in real-world workloads.
> >>
> >
> 
> Attaching is some data I have collected  on the MMC32G part. I tried
> to make the collection process as controlled as possible, as well as
> use more-or-less a "real life" usage case that involves running a user
> application, so it's not just a purely synthetic test at block level.
> 
> Attached file (I hope you don't mind PDFs) contains data collected for
> two possible optimizations. The second page of the document tests the
> vendor suggested optimization that is basically -
> if (request_blocks < 24) {
>      /* given request offset, calculate sectors remaining on 8K page
> containing offset */
>      sectors = 16 - (request_offset % 16);
>      if (request_blocks > sectors) {
>         request_blocks = sectors;
>      }
> }
> ...I'll call this optimization A.
> 
> ...the first page of the document tests the optimization that floated
> up on the list when I first sent a patch with the vendor suggestions.
> That optimization being - align all unaligned accesses (either all
> completely, or under a certain size threshold) on flash page size.
> I'll call this optimization B.

I'm not sure if I really understand the difference between the two.
Do you mean optimization A makes sure that you don't have partial
pages at the start of a request, while optimization B also splits
small requests on page boundary if the first page in it is aligned?

> To test, a collect time info for 2000 small inserts into a table with
> sqlite into 20 separate tables. So that's 20 x 2000 sqlite inserts per
> test. The test is executed for ext2, ext3 and ext4 with a 4k block
> size. Every test begins with a flash discard and format operation on
> the partition where the tables are created and accessed, to ensure
> similar acceses to flash on every test. All other partitions are RO,
> and no processes other than those needed by the tests run. All power
> management is disabled. The results are thus repeatable, consistent
> and stable across reboots and power-on time...
> 
> Each test consists of:
> 1) Unmount partition
> 2) Flash erase
> 3) Format with fs
> 4) Mount
> 5) Sync
> 6) echo 3 > /proc/sys/vm/drop_caches
> 7) run 20 x 2000 inserts as described above
> 8) unmount

Just to make sure: Did you properly align the partition start on an
erase block boundary of 4MB?

I would have loved to see results with nilfs2 and btrfs as well, but
I can understand that these were less relevant to you, especially
since you don't really want to compare the file systems as much as
your own changes.

One very surprising result to me is how much worse the ext4 numbers
are compared to ext2/ext3. I would have guessed that they should
be much better, given that the ext4 developers are specifically
trying to optimize for this case. I've taken the ext4 mailing
list on Cc here and will forward your test results there as
well.

> For optimization B testing, the alignment size and alignment access
> size threshold (same parameters as in my RFC patch) are exposed
> through debugfs. To get B test data, the flow was
> 
> 1) Set alignment to none (no optimization)
> 2) Sql test on ext2
> 3) Sql test on ext3
> 4) Sql test on ext4
> 
> 6) Set alignment to 8k, no threshold
> 7) Sql test on ext2
> 8) Sql test on ext3
> 9) Sql test on ext4
> 
> 10) Set alignment to 8k, < 8k only
> 11) Sql test on ext2
> 12) Sql test on ext3
> 13) Sql test on ext4
> 
> ...all the way up to 32K threshold.
> 
> For optimization A testing, the optimization was turned off/on with a
> debugfs attribute, and the data collected with this flow:
> 
> 1) Turn off optimization
> 2) Sql test on ext2
> 3) Sql test on ext3
> 4) Sql test on ext4
> 5) Turn on optimization
> 6) Sql test on ext2
> 7) Sql test on ext3
> 8) Sql test on ext4
> 
> My interpretation of the results: Any kind of alignment-on-flash page
> optimization produced data that in all cases was either
> indistinguishable from control, or was worse. Do you agree with my
> interpretation?

I suppose that when the result is total runtime in seconds, larger numbers
are always worse, so I agree.

One potential flaw in the measurement might be that running the test
a second time means that the card is already in a state that requires
garbage collection and therefore slower. Running the test in the opposite
order (optimized first, then unoptimized) might theoretically lead
to other results. It's not clear from your description whether your
test method has taken this into account (I would assume yes).

> So I guess that hexes the align optimization, at least until I can get
> data for MMC16G with the same controlled setup. Sorry about that. I'll
> work on the "reliability optimization" now, which I guess are pretty
> generic for cards with similar buffer schemes. It relies on reliable
> writes, so exposing that will be first for review here...
> 
> Even though I'm rescinding the adjust/align patch, is there any chance
> for pulling in my quirks changes?

The quirks patch still looks fine to me, I'd just recommend that we
don't apply it before we have a need for it, i.e. at least a single
card specific quirk.

	Arnd

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [RFC 4/5] MMC: Adjust unaligned write accesses.
  2011-03-21 14:21                     ` Arnd Bergmann
@ 2011-03-21 14:41                       ` Andrei Warkentin
  2011-03-21 18:03                         ` Andreas Dilger
  0 siblings, 1 reply; 28+ messages in thread
From: Andrei Warkentin @ 2011-03-21 14:41 UTC (permalink / raw)
  To: Arnd Bergmann; +Cc: linux-mmc, linux-ext4

On Mon, Mar 21, 2011 at 9:21 AM, Arnd Bergmann <arnd@arndb.de> wrote:
>> Attached file (I hope you don't mind PDFs) contains data collected for
>> two possible optimizations. The second page of the document tests the
>> vendor suggested optimization that is basically -
>> if (request_blocks < 24) {
>>      /* given request offset, calculate sectors remaining on 8K page
>> containing offset */
>>      sectors = 16 - (request_offset % 16);
>>      if (request_blocks > sectors) {
>>         request_blocks = sectors;
>>      }
>> }
>> ...I'll call this optimization A.
>>
>> ...the first page of the document tests the optimization that floated
>> up on the list when I first sent a patch with the vendor suggestions.
>> That optimization being - align all unaligned accesses (either all
>> completely, or under a certain size threshold) on flash page size.
>> I'll call this optimization B.
>
> I'm not sure if I really understand the difference between the two.
> Do you mean optimization A makes sure that you don't have partial
> pages at the start of a request, while optimization B also splits
> small requests on page boundary if the first page in it is aligned?

The vendor optimization always splits accesses under 12k, even if they
are aligned. There are (still) some outstanding questions
on how that's supposed to work (to improve anything), but that's the algorithm.

"our" optimization, suggested on this list, was to align accesses onto
flash page size, thus splitting each request into (small) unaligned
and aligned portions.

>
>> To test, a collect time info for 2000 small inserts into a table with
>> sqlite into 20 separate tables. So that's 20 x 2000 sqlite inserts per
>> test. The test is executed for ext2, ext3 and ext4 with a 4k block
>> size. Every test begins with a flash discard and format operation on
>> the partition where the tables are created and accessed, to ensure
>> similar acceses to flash on every test. All other partitions are RO,
>> and no processes other than those needed by the tests run. All power
>> management is disabled. The results are thus repeatable, consistent
>> and stable across reboots and power-on time...
>>
>> Each test consists of:
>> 1) Unmount partition
>> 2) Flash erase
>> 3) Format with fs
>> 4) Mount
>> 5) Sync
>> 6) echo 3 > /proc/sys/vm/drop_caches
>> 7) run 20 x 2000 inserts as described above
>> 8) unmount
>
> Just to make sure: Did you properly align the partition start on an
> erase block boundary of 4MB?
>

Yes, absolutely.

> I would have loved to see results with nilfs2 and btrfs as well, but
> I can understand that these were less relevant to you, especially
> since you don't really want to compare the file systems as much as
> your own changes.
>

In the context of looking at this anyway, I will try and get
comparison data for sqlite on different fs (and different fs tunables)
on flash.

> One very surprising result to me is how much worse the ext4 numbers
> are compared to ext2/ext3. I would have guessed that they should
> be much better, given that the ext4 developers are specifically
> trying to optimize for this case. I've taken the ext4 mailing
> list on Cc here and will forward your test results there as
> well.

I was surprised too.

> One potential flaw in the measurement might be that running the test
> a second time means that the card is already in a state that requires
> garbage collection and therefore slower. Running the test in the opposite
> order (optimized first, then unoptimized) might theoretically lead
> to other results. It's not clear from your description whether your
> test method has taken this into account (I would assume yes).
>

I've done tests across reboots that showed consistent results.
Additionally, repeating a test after another showed same results. At
least on this flash medium, block erase (used erase utility from
flashbench, modified to erase everything if no argument provided)
prior to formatting with fs prior to every test seemed to make results
consistent.

>> So I guess that hexes the align optimization, at least until I can get
>> data for MMC16G with the same controlled setup. Sorry about that. I'll
>> work on the "reliability optimization" now, which I guess are pretty
>> generic for cards with similar buffer schemes. It relies on reliable
>> writes, so exposing that will be first for review here...
>>
>> Even though I'm rescinding the adjust/align patch, is there any chance
>> for pulling in my quirks changes?
>
> The quirks patch still looks fine to me, I'd just recommend that we
> don't apply it before we have a need for it, i.e. at least a single
> card specific quirk.
>

Ok. Sounds good. Back to reliable writes it is, so I can roll up the
second quirk...

A
--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [RFC 4/5] MMC: Adjust unaligned write accesses.
  2011-03-21 14:41                       ` Andrei Warkentin
@ 2011-03-21 18:03                         ` Andreas Dilger
  2011-03-21 19:05                           ` Arnd Bergmann
  0 siblings, 1 reply; 28+ messages in thread
From: Andreas Dilger @ 2011-03-21 18:03 UTC (permalink / raw)
  To: Andrei Warkentin; +Cc: Arnd Bergmann, linux-mmc, linux-ext4

On 2011-03-21, at 3:41 PM, Andrei Warkentin wrote:
> On Mon, Mar 21, 2011 at 9:21 AM, Arnd Bergmann <arnd@arndb.de> wrote:
>>> Attached file (I hope you don't mind PDFs) contains data collected for
>>> two possible optimizations. The second page of the document tests the
>>> vendor suggested optimization that is basically -
>>> if (request_blocks < 24) {
>>>    /* given request offset, calculate sectors remaining on 8K page
>>> containing offset */
>>>    sectors = 16 - (request_offset % 16);
>>>    if (request_blocks > sectors) {
>>>       request_blocks = sectors;
>>>    }
>>> }
>>> ...I'll call this optimization A.
>>> 
>>> ...the first page of the document tests the optimization that floated
>>> up on the list when I first sent a patch with the vendor suggestions.
>>> That optimization being - align all unaligned accesses (either all
>>> completely, or under a certain size threshold) on flash page size.
>>> I'll call this optimization B.
>> 
>> I'm not sure if I really understand the difference between the two.
>> Do you mean optimization A makes sure that you don't have partial
>> pages at the start of a request, while optimization B also splits
>> small requests on page boundary if the first page in it is aligned?
> 
> The vendor optimization always splits accesses under 12k, even if they
> are aligned. There are (still) some outstanding questions
> on how that's supposed to work (to improve anything), but that's the algorithm.
> 
> "our" optimization, suggested on this list, was to align accesses onto
> flash page size, thus splitting each request into (small) unaligned
> and aligned portions.

Note that mballoc was specifically designed to handle allocation requests that are aligned on RAID stripe boundaries, so it should be able to handle this for MMC as well.  What is needed is to tell the filesystem what the underlying alignment is.  That can be done at format time with mke2fs or afterward with tune2fs by using the "-E stripe_width" option.

>>> To test, I collect time info for 2000 small inserts into a table with
>>> sqlite into 20 separate tables. So that's 20 x 2000 sqlite inserts per
>>> test. The test is executed for ext2, ext3 and ext4 with a 4k block
>>> size. Every test begins with a flash discard and format operation on
>>> the partition where the tables are created and accessed, to ensure
>>> similar accesses to flash on every test. All other partitions are RO,
>>> and no processes other than those needed by the tests run. All power
>>> management is disabled. The results are thus repeatable, consistent
>>> and stable across reboots and power-on time...
>>> 
>>> Each test consists of:
>>> 1) Unmount partition
>>> 2) Flash erase
>>> 3) Format with fs
>>> 4) Mount
>>> 5) Sync
>>> 6) echo 3 > /proc/sys/vm/drop_caches
>>> 7) run 20 x 2000 inserts as described above
>>> 8) unmount
>> 
>> Just to make sure: Did you properly align the partition start on an
>> erase block boundary of 4MB?
>> 
> 
> Yes, absolutely.
> 
>> I would have loved to see results with nilfs2 and btrfs as well, but
>> I can understand that these were less relevant to you, especially
>> since you don't really want to compare the file systems as much as
>> your own changes.
>> 
> 
> In the context of looking at this anyway, I will try and get
> comparison data for sqlite on different fs (and different fs tunables)
> on flash.
> 
>> One very surprising result to me is how much worse the ext4 numbers
>> are compared to ext2/ext3. I would have guessed that they should
>> be much better, given that the ext4 developers are specifically
>> trying to optimize for this case. I've taken the ext4 mailing
>> list on Cc here and will forward your test results there as
>> well.
> 
> I was surprised too.
> 
>> One potential flaw in the measurement might be that running the test
>> a second time means that the card is already in a state that requires
>> garbage collection and therefore slower. Running the test in the opposite
>> order (optimized first, then unoptimized) might theoretically lead
>> to other results. It's not clear from your description whether your
>> test method has taken this into account (I would assume yes).
>> 
> 
> I've done tests across reboots that showed consistent results.
> Additionally, repeating a test after another showed same results. At
> least on this flash medium, block erase (used erase utility from
> flashbench, modified to erase everything if no argument provided)
> prior to formatting with fs prior to every test seemed to make results
> consistent.
> 
>>> So I guess that hexes the align optimization, at least until I can get
>>> data for MMC16G with the same controlled setup. Sorry about that. I'll
>>> work on the "reliability optimization" now, which I guess are pretty
>>> generic for cards with similar buffer schemes. It relies on reliable
>>> writes, so exposing that will be first for review here...
>>> 
>>> Even though I'm rescinding the adjust/align patch, is there any chance
>>> for pulling in my quirks changes?
>> 
>> The quirks patch still looks fine to me, I'd just recommend that we
>> don't apply it before we have a need for it, i.e. at least a single
>> card specific quirk.
>> 
> 
> Ok. Sounds good. Back to reliable writes it is, so I can roll up the
> second quirk...
> 
> A
> --
> To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html


Cheers, Andreas






^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [RFC 4/5] MMC: Adjust unaligned write accesses.
  2011-03-21 18:03                         ` Andreas Dilger
@ 2011-03-21 19:05                           ` Arnd Bergmann
  2011-03-21 23:58                             ` Andreas Dilger
  0 siblings, 1 reply; 28+ messages in thread
From: Arnd Bergmann @ 2011-03-21 19:05 UTC (permalink / raw)
  To: Andreas Dilger; +Cc: Andrei Warkentin, linux-mmc, linux-ext4

On Monday 21 March 2011 19:03:09 Andreas Dilger wrote:
> Note that mballoc was specifically designed to handle allocation
> requests that are aligned on RAID stripe boundaries, so it should
> be able to handle this for MMC as well.  What is needed is to tell
> the filesystem what the underlying alignment is.  That can be done
> at format time with mke2fs or afterward with tune2fs by using the
> "-E stripe_width" option.

Ah, that sounds useful. So would I set the stripe_width to the
erase block size, and the block group size to a multiple of that?

Does this also work in (rare) cases where the erase block size is
not a power of two?

	Arnd

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [RFC 4/5] MMC: Adjust unaligned write accesses.
  2011-03-21 19:05                           ` Arnd Bergmann
@ 2011-03-21 23:58                             ` Andreas Dilger
  2011-03-22 13:56                               ` Arnd Bergmann
  0 siblings, 1 reply; 28+ messages in thread
From: Andreas Dilger @ 2011-03-21 23:58 UTC (permalink / raw)
  To: Arnd Bergmann; +Cc: Andrei Warkentin, linux-mmc, linux-ext4

On 2011-03-21, at 8:05 PM, Arnd Bergmann wrote:
> On Monday 21 March 2011 19:03:09 Andreas Dilger wrote:
>> Note that mballoc was specifically designed to handle allocation
>> requests that are aligned on RAID stripe boundaries, so it should
>> be able to handle this for MMC as well.  What is needed is to tell
>> the filesystem what the underlying alignment is.  That can be done
>> at format time with mke2fs or afterward with tune2fs by using the
>> "-E stripe_width" option.
> 
> Ah, that sounds useful. So would I set the stripe_width to the
> erase block size, and the block group size to a multiple of that?

When you write "block group size" do you mean the ext4 block group?  Then yes it would help.  You could also consider setting the flex_bg size to a multiple of this, so that the bitmap blocks are grouped as a multiple of this size.  However, they may not be aligned correctly, which needs extra effort that isn't obvious.  

I think it would be nice to have mke2fs take the stripe_width and/or flex_bg factor into account when sizing/aligning the bitmaps, but it doesn't yet.

> Does this also work in (rare) cases where the erase block size is
> not a power of two?

It does (or is supposed to), but that isn't code that is exercised very much (most installations use a power-of-two size).

Cheers, Andreas

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [RFC 4/5] MMC: Adjust unaligned write accesses.
  2011-03-21 23:58                             ` Andreas Dilger
@ 2011-03-22 13:56                               ` Arnd Bergmann
  2011-03-22 15:02                                 ` Andreas Dilger
  0 siblings, 1 reply; 28+ messages in thread
From: Arnd Bergmann @ 2011-03-22 13:56 UTC (permalink / raw)
  To: Andreas Dilger; +Cc: Andrei Warkentin, linux-mmc, linux-ext4

On Tuesday 22 March 2011, Andreas Dilger wrote:
> On 2011-03-21, at 8:05 PM, Arnd Bergmann wrote:
> > On Monday 21 March 2011 19:03:09 Andreas Dilger wrote:
> >> Note that mballoc was specifically designed to handle allocation
> >> requests that are aligned on RAID stripe boundaries, so it should
> >> be able to handle this for MMC as well.  What is needed is to tell
> >> the filesystem what the underlying alignment is.  That can be done
> >> at format time with mke2fs or afterward with tune2fs by using the
> >> "-E stripe_width" option.
> > 
> > Ah, that sounds useful. So would I set the stripe_width to the
> > erase block size, and the block group size to a multiple of that?
> 
> When you write "block group size" do you mean the ext4 block group? 

Yes.

> Then yes it would help.  You could also consider setting the flex_bg
> size to a multiple of this, so that the bitmap blocks are grouped as
> a multiple of this size.  However, they may not be aligned correctly,
> which needs extra effort that isn't obvious.  
> 
> I think it would be nice to have mke2fs take the stripe_width and/or
> flex_bg factor into account when sizing/aligning the bitmaps, but it
> doesn't yet.

A few more questions: 

* On cards that can only write to a single erase block at a time,
should I make the block group size the same as the erase
block? I suppose writing both block bitmaps, inode and data to
separate erase blocks would create multiple eraseblock
read-modify-write cycles for every single file otherwise.

* Is it guaranteed that inode bitmap, inode, block bitmap and
blocks are always written in low-to-high sector order within
one ext4 block group? A lot of the drives will do a garbage-collect
step (adding hundreds of milliseconds) every time you move back
inside of the eraseblock.

* Is there any way to make ext4 use effective blocks larger
than 4 KB? The most common size for a NAND flash page is 16
KB right now (effectively, ignoring what the hardware does), so
it would be good to never write smaller.

* Calling TRIM on SD cards is probably counterproductive unless
you trim entire erase blocks. Is that even possible with ext4,
assuming that we use block group == erase block?

* Is there a way to put the journal into specific parts of the
drive? Almost all SD cards have an area in the second 4 MB
(more for larger cards) that can be written using random access
without forcing garbage collection on other parts.

> > Does this also work in (rare) cases where the erase block size is
> > not a power of two?
> 
> It does (or is supposed to), but that isn't code that is exercised
> very much (most installations use a power-of-two size).

Ok. Recently, cheap TLC (three-level cell, 3-bit MLC) NAND is
becoming popular. I've seen erase block sizes of 6 MiB, 1376 KiB
(4096 / 3, rounded up) and 4128 KiB (1376 * 3) because of this, in
place of the common 4096 KiB. The SD card standard specifies
values of 12 MB and 24 MB aside from the usual power-of-two values
up to 64 MB for large cards (>32GB), while smaller cards are allowed
only up to 4 MB erase blocks and need to be power-of-two. Many
cards do not use the size they claim in their registers.

	Arnd

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [RFC 4/5] MMC: Adjust unaligned write accesses.
  2011-03-22 13:56                               ` Arnd Bergmann
@ 2011-03-22 15:02                                 ` Andreas Dilger
  2011-03-22 15:44                                   ` Arnd Bergmann
  0 siblings, 1 reply; 28+ messages in thread
From: Andreas Dilger @ 2011-03-22 15:02 UTC (permalink / raw)
  To: Arnd Bergmann; +Cc: Andrei Warkentin, linux-mmc, linux-ext4

On 2011-03-22, at 2:56 PM, Arnd Bergmann wrote:
> On Tuesday 22 March 2011, Andreas Dilger wrote:
>> On 2011-03-21, at 8:05 PM, Arnd Bergmann wrote:
>>> So would I set the stripe_width to the erase block size, and the
>>> block group size to a multiple of that?
>> 
>> When you write "block group size" do you mean the ext4 block group? 
> 
> Yes.
> 
>> Then yes it would help.  You could also consider setting the flex_bg
>> size to a multiple of this, so that the bitmap blocks are grouped as
>> a multiple of this size.  However, they may not be aligned correctly,
>> which needs extra effort that isn't obvious.  
>> 
>> I think it would be nice to have mke2fs take the stripe_width and/or
>> flex_bg factor into account when sizing/aligning the bitmaps, but it
>> doesn't yet.
> 
> A few more questions: 
> 
> * On cards that can only write to a single erase block at a time,
> should I make the block group size the same as the erase
> block? I suppose writing both block bitmaps, inode and data to
> separate erase blocks would create multiple eraseblock
> read-modify-write cycles for every single file otherwise.

That doesn't seem like a very good idea.  It will significantly limit the size of the filesystem, and will cause a lot of overhead (two bitmaps per group for only a handful of blocks).

> * Is it guaranteed that inode bitmap, inode, block bitmap and
> blocks are always written in low-to-high sector order within
> one ext4 block group? A lot of the drives will do a garbage-collect
> step (adding hundreds of milliseconds) every time you move back
> inside of the eraseblock.

Generally, yes.  I don't think there is a hard guarantee, but the block device elevator will sort the blocks.

> * Is there any way to make ext4 use effective blocks larger
> than 4 KB? The most common size for a NAND flash page is 16
> KB right (effectively, ignoring what the hardware does), so
> it would be good to never write smaller.

You may be interested in Ted's bigalloc patchset.  This will force block allocation to be at a power-of-two multiple of the blocksize, so it could be 16kB or whatever.  However, this is inefficient if the average filesize is not large enough.

> * Calling TRIM on SD cards is probably counterproductive unless
> you trim entire erase blocks. Is that even possible with ext4,
> assuming that we use block group == erase block?

That is already the case, if the underlying storage reports the erase block size to the filesystem.

> * Is there a way to put the journal into specific parts of the
> drive? Almost all SD cards have an area in the second 4 MB
> (more for larger cards) that can be written using random access
> without forcing garbage collection on other parts.

That would need a small patch to mke2fs.  I've been interested in this also for other reasons, but haven't had time to work on it.  It will likely need only some small adjustments to ext2fs_add_journal_inode() to allow passing the goal block, and write_journal_inode() to use the goal block instead of its internal heuristic.  The default location of the journal inode was previously moved from the beginning of the filesystem to the middle of the filesystem for performance reasons, so this is mostly already handled.

>>> Does this also work in (rare) cases where the erase block size is
>>> not a power of two?
>> 
>> It does (or is supposed to), but that isn't code that is exercised
>> very much (most installations use a power-of-two size).
> 
> Ok. Recently, cheap TLC (three-level cell, 3-bit MLC) NAND is
> becoming popular. I've seen erase block sizes of 6 MiB, 1376 KiB
> (4096 / 3, rounded up) and 4128 KiB (1376 * 3) because of this, in
> place of the common 4096 KiB. The SD card standard specifies
> values of 12 MB and 24 MB aside from the usual power-of-two values
> up to 64 MB for large cards (>32GB), while smaller cards are allowed
> only up to 4 MB erase blocks and need to be power-of-two. Many
> cards do not use the size they claim in their registers.

Well, the large erase block size is not in itself a problem, but if the devices do not use the reported erase block size internally, there is nothing much that ext4 or the rest of the kernel can do about it, since it has no other way of knowing what the real erase block size is.

Cheers, Andreas






^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [RFC 4/5] MMC: Adjust unaligned write accesses.
  2011-03-22 15:02                                 ` Andreas Dilger
@ 2011-03-22 15:44                                   ` Arnd Bergmann
  0 siblings, 0 replies; 28+ messages in thread
From: Arnd Bergmann @ 2011-03-22 15:44 UTC (permalink / raw)
  To: Andreas Dilger; +Cc: Andrei Warkentin, linux-mmc, linux-ext4

On Tuesday 22 March 2011, Andreas Dilger wrote:
> On 2011-03-22, at 2:56 PM, Arnd Bergmann wrote:
> > On Tuesday 22 March 2011, Andreas Dilger wrote:
> >> On 2011-03-21, at 8:05 PM, Arnd Bergmann wrote:
> > 
> > * On cards that can only write to a single erase block at a time,
> > should I make the block group size the same as the erase
> > block? I suppose writing both block bitmaps, inode and data to
> > separate erase blocks would create multiple eraseblock
> > read-modify-write cycles for every single file otherwise.
> 
> That doesn't seem like a very good idea.  It will significantly limit
> the size of the filesystem, and will cause a lot of overhead (two bitmaps
> per group for only a handful of blocks).

I'm willing to spend a little space overhead in return for one
or two orders of magnitude in performance and life expectancy
for the card ;-)

A typical case is that a single-page (16KB) write to the currently
open erase block takes 1ms, but since writing to another erase block
requires a garbage-collection (erase-rewrite 4 MB), it takes 500 ms,
just like the following access to the first erase block, which
has now been closed.

Every erase cycle ages the drive, and on some cheap ones, you only
have about 2000 guaranteed erases per erase block!

> > * Is it guaranteed that inode bitmap, inode, block bitmap and
> > blocks are always written in low-to-high sector order within
> > one ext4 block group? A lot of the drives will do a garbage-collect
> > step (adding hundreds of milliseconds) every time you move back
> > inside of the eraseblock.
> 
> Generally, yes.  I don't think there is a hard guarantee,
> but the block device elevator will sort the blocks.

Ok. 
 
> > * Is there any way to make ext4 use effective blocks larger
> > than 4 KB? The most common size for a NAND flash page is 16
> > KB right (effectively, ignoring what the hardware does), so
> > it would be good to never write smaller.
> 
> You may be interested in Ted's bigalloc patchset.  This will force
> block allocation to be at a power-of-two multiple of the blocksize,
> so it could be 16kB or whatever.  However, this is inefficient if
> the average filesize is not large enough.

Is it just a performance/space tradeoff, or is there also a
performance overhead in this?

> > * Calling TRIM on SD cards is probably counterproductive unless
> > you trim entire erase blocks. Is that even possible with ext4,
> > assuming that we use block group == erase block?
> 
> That is already the case, if the underlying storage reports the
> erase block size to the filesystem.

Ok, I should try to find out how this is done on SD cards.
The hardware interface allows erasing 512 byte sectors, so
we might be reporting that instead.
 
> > * Is there a way to put the journal into specific parts of the
> > drive? Almost all SD cards have an area in the second 4 MB
> > (more for larger cards) that can be written using random access
> > without forcing garbage collection on other parts.
> 
> That would need a small patch to mke2fs.  I've been interested in
> this also for other reasons, but haven't had time to work on it. 
> It will likely need only some small adjustments to 
> ext2fs_add_journal_inode() to allow passing the goal block, and
> write_journal_inode() to use the goal block instead of its internal
> heuristic.  The default location of the journal inode was previously
> moved from the beginning of the filesystem to the middle of the
> filesystem for performance reasons, so this is mostly already handled.

Ok. It was previously suggested to put an external journal on a
4 MB partition for experimenting with this. I hope I can back this
up with performance numbers soon.

> Well, the large erase block size is not in itself a problem, but
> if the devices do not use the reported erase block size internally,
> there is nothing much that ext4 or the rest of the kernel can do
> about it, since it has no other way of knowing what the real erase
> block size is.

For SDHC cards, the typical case is that they are reasonably efficient
when you use the reported size, because they are tested that way.
A lot of cards use 2MiB internally but report 4MiB, which is fine
as long as you write the 4MB consecutively and don't alternate
between the two halves.

The split into three erase blocks of (4 MiB / 3) is on low-end
SanDisk cards, and I believe it has mostly advantages and will
work well if we use the reported 4 MB.

The 4128 KiB erase blocks are on a USB stick, and those devices
do not report any erase block size at all.

I have written a tool to detect the actual erase block size,
and perhaps that could be integrated into mke2fs and similar
tools.

	Arnd

^ permalink raw reply	[flat|nested] 28+ messages in thread

end of thread, other threads:[~2011-03-22 15:44 UTC | newest]

Thread overview: 28+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2011-03-10  0:54 Block quirks redux + Toshiba performance quirk Andrei Warkentin
2011-03-10  0:12 ` Andrei Warkentin
2011-03-10  0:54 ` [RFC 1/5] MMC: Extends card quicks with MMC/SD quirks matching the CID Andrei Warkentin
2011-03-10  0:54 ` [RFC 2/5] MMC: Allow function-specific quirks Andrei Warkentin
2011-03-10 15:09   ` Arnd Bergmann
2011-03-10 20:41     ` Andrei Warkentin
2011-03-10 21:55       ` Arnd Bergmann
2011-03-10  0:54 ` [RFC 3/5] MMC: Support for block quirks Andrei Warkentin
2011-03-10  0:54 ` [RFC 4/5] MMC: Adjust unaligned write accesses Andrei Warkentin
2011-03-10 16:05   ` Arnd Bergmann
2011-03-10 20:45     ` Andrei Warkentin
2011-03-10 21:54       ` Arnd Bergmann
2011-03-10 23:06         ` Andrei Warkentin
2011-03-11 10:23           ` Arnd Bergmann
2011-03-13 13:00             ` Andrei Warkentin
2011-03-13 14:54               ` Arnd Bergmann
2011-03-14  7:40                 ` Andrei Warkentin
2011-03-19 11:09                   ` Andrei Warkentin
2011-03-21 14:21                     ` Arnd Bergmann
2011-03-21 14:41                       ` Andrei Warkentin
2011-03-21 18:03                         ` Andreas Dilger
2011-03-21 19:05                           ` Arnd Bergmann
2011-03-21 23:58                             ` Andreas Dilger
2011-03-22 13:56                               ` Arnd Bergmann
2011-03-22 15:02                                 ` Andreas Dilger
2011-03-22 15:44                                   ` Arnd Bergmann
2011-03-10  0:54 ` [RFC 5/5] MMC: Toshiba eMMC - Split 8K-unaligned accesses Andrei Warkentin
2011-03-10  1:03 ` [RFC] MMC: Extends card quicks with MMC/SD quirks matching the CID Andrei Warkentin

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).