[PATCH v4 0/5] mtd: nand: Add on-die ECC support

public inbox for linux-mtd@lists.infradead.org
 help / color / mirror / Atom feed

* [PATCH v4 0/5] mtd: nand: Add on-die ECC support
@ 2014-03-31 23:28 David Mosberger
  2014-03-31 23:28 ` [PATCH v4 1/5] mtd: nand: Detect Micron flash with on-die ECC (aka "internal ECC") enabled David Mosberger
                   ` (5 more replies)
  0 siblings, 6 replies; 30+ messages in thread
From: David Mosberger @ 2014-03-31 23:28 UTC (permalink / raw)
  To: pekon; +Cc: David Mosberger, gsi, computersforpeace, linux-mtd, dedekind1

This patch-series is designed to go on top of Gerhard Sittig's
patch series "mtd: nand: introduce a READMODE command".

The patches add on-die ECC support as found on some Micron flash chips.

David Mosberger (5):
  mtd: nand: Detect Micron flash with on-die ECC (aka "internal ECC")
    enabled.
  mtd: nand: Add NAND_ECC_HW_ON_DIE ECC-mode.
  mtd: nand: Enable subpage-reads on flashes with on-die ECC enabled.
  mtd: nand: Allocate extra buffers needed for on-die ECC controller.
  mtd: nand: Improve bitflip detection for on-die ECC scheme.

 drivers/mtd/nand/nand_base.c |  264 +++++++++++++++++++++++++++++++++++++++++-
 drivers/of/of_mtd.c          |    1 +
 include/linux/mtd/nand.h     |    8 ++
 3 files changed, 267 insertions(+), 6 deletions(-)

-- 
1.7.9.5

^ permalink raw reply	[flat|nested] 30+ messages in thread

* [PATCH v4 1/5] mtd: nand: Detect Micron flash with on-die ECC (aka "internal ECC") enabled.
  2014-03-31 23:28 [PATCH v4 0/5] mtd: nand: Add on-die ECC support David Mosberger
@ 2014-03-31 23:28 ` David Mosberger
  2014-04-01  6:39   ` Brian Norris
  2014-03-31 23:28 ` [PATCH v4 2/5] mtd: nand: Add NAND_ECC_HW_ON_DIE ECC-mode David Mosberger
                   ` (4 subsequent siblings)
  5 siblings, 1 reply; 30+ messages in thread
From: David Mosberger @ 2014-03-31 23:28 UTC (permalink / raw)
  To: pekon; +Cc: David Mosberger, gsi, computersforpeace, linux-mtd, dedekind1

Signed-off-by: David Mosberger <davidm@egauge.net>
---
 drivers/mtd/nand/nand_base.c |   27 ++++++++++++++++++++++++---
 include/linux/mtd/nand.h     |    4 ++++
 2 files changed, 28 insertions(+), 3 deletions(-)

diff --git a/drivers/mtd/nand/nand_base.c b/drivers/mtd/nand/nand_base.c
index 09fe1b1..f145f00 100644
--- a/drivers/mtd/nand/nand_base.c
+++ b/drivers/mtd/nand/nand_base.c
@@ -3054,11 +3054,30 @@ static int nand_setup_read_retry_micron(struct mtd_info *mtd, int retry_mode)
 			feature);
 }
 
+static void
+nand_onfi_detect_micron_on_die_ecc(struct mtd_info *mtd,
+				   struct nand_chip *chip)
+{
+	u8 features[ONFI_SUBFEATURE_PARAM_LEN];
+
+	if (chip->onfi_get_features(mtd, chip, ONFI_FEATURE_ADDR_ARRAY_OP_MODE,
+				    features) < 0)
+		return;
+
+	if (features[0] & ONFI_FEATURE_ARRAY_OP_MODE_ENABLE_ON_DIE_ECC) {
+		/*
+		 * If the chip has on-die ECC enabled, we kind of have
+		 * to do the same...
+		 */
+		pr_info("Using on-die ECC\n");
+	}
+}
+
 /*
  * Configure chip properties from Micron vendor-specific ONFI table
  */
-static void nand_onfi_detect_micron(struct nand_chip *chip,
-		struct nand_onfi_params *p)
+static void nand_onfi_detect_micron(struct mtd_info *mtd,
+		struct nand_chip *chip, struct nand_onfi_params *p)
 {
 	struct nand_onfi_vendor_micron *micron = (void *)p->vendor;
 
@@ -3067,6 +3086,8 @@ static void nand_onfi_detect_micron(struct nand_chip *chip,
 
 	chip->read_retries = micron->read_retry_options;
 	chip->setup_read_retry = nand_setup_read_retry_micron;
+
+	nand_onfi_detect_micron_on_die_ecc(mtd, chip);
 }
 
 /*
@@ -3168,7 +3189,7 @@ static int nand_flash_detect_onfi(struct mtd_info *mtd, struct nand_chip *chip,
 	}
 
 	if (p->jedec_id == NAND_MFR_MICRON)
-		nand_onfi_detect_micron(chip, p);
+		nand_onfi_detect_micron(mtd, chip, p);
 
 	return 1;
 }
diff --git a/include/linux/mtd/nand.h b/include/linux/mtd/nand.h
index 2bd6e4e..780ab58 100644
--- a/include/linux/mtd/nand.h
+++ b/include/linux/mtd/nand.h
@@ -225,6 +225,10 @@ struct nand_chip;
 /* Vendor-specific feature address (Micron) */
 #define ONFI_FEATURE_ADDR_READ_RETRY	0x89
 
+/* Vendor-specific array operation mode (Micron) */
+#define ONFI_FEATURE_ADDR_ARRAY_OP_MODE	0x90
+#define ONFI_FEATURE_ARRAY_OP_MODE_ENABLE_ON_DIE_ECC		0x08
+
 /* ONFI subfeature parameters length */
 #define ONFI_SUBFEATURE_PARAM_LEN	4
 
-- 
1.7.9.5

^ permalink raw reply related	[flat|nested] 30+ messages in thread

* Re: [PATCH v4 1/5] mtd: nand: Detect Micron flash with on-die ECC (aka "internal ECC") enabled.
  2014-03-31 23:28 ` [PATCH v4 1/5] mtd: nand: Detect Micron flash with on-die ECC (aka "internal ECC") enabled David Mosberger
@ 2014-04-01  6:39   ` Brian Norris
  2014-04-01 15:26     ` David Mosberger
  0 siblings, 1 reply; 30+ messages in thread
From: Brian Norris @ 2014-04-01  6:39 UTC (permalink / raw)
  To: David Mosberger; +Cc: gsi, linux-mtd, pekon, dedekind1

On Mon, Mar 31, 2014 at 05:28:53PM -0600, David Mosberger wrote:
> Signed-off-by: David Mosberger <davidm@egauge.net>
> ---
>  drivers/mtd/nand/nand_base.c |   27 ++++++++++++++++++++++++---
>  include/linux/mtd/nand.h     |    4 ++++
>  2 files changed, 28 insertions(+), 3 deletions(-)
> 
> diff --git a/drivers/mtd/nand/nand_base.c b/drivers/mtd/nand/nand_base.c
> index 09fe1b1..f145f00 100644
> --- a/drivers/mtd/nand/nand_base.c
> +++ b/drivers/mtd/nand/nand_base.c
> @@ -3054,11 +3054,30 @@ static int nand_setup_read_retry_micron(struct mtd_info *mtd, int retry_mode)
>  			feature);
>  }
>  
> +static void

Does this really need to be on its own line? It doesn't match the style
of anything else.

> +nand_onfi_detect_micron_on_die_ecc(struct mtd_info *mtd,
> +				   struct nand_chip *chip)

I'm really not sure why the inconsistent style throughout nand_base on
using both an 'mtd' and a 'chip' parameter (we often assume that
'mtd->priv == chip'). If you need the mtd parameter, let's just pass it
instead of 'chip'.

> +{
> +	u8 features[ONFI_SUBFEATURE_PARAM_LEN];
> +
> +	if (chip->onfi_get_features(mtd, chip, ONFI_FEATURE_ADDR_ARRAY_OP_MODE,
> +				    features) < 0)
> +		return;
> +
> +	if (features[0] & ONFI_FEATURE_ARRAY_OP_MODE_ENABLE_ON_DIE_ECC) {
> +		/*
> +		 * If the chip has on-die ECC enabled, we kind of have
> +		 * to do the same...
> +		 */

That's not really true at all, is it? We can simply disable the on-die
ECC and use the provided spare area with a different ECC scheme, can't
we? (e.g., software BCH; or an SoC's HW ECC) In fact, I think disabling
the on-die ECC is a more reasonable default; nand_base is used by many
drivers, most of which provide their own implementations, and many of
which would not be compatible with the implementations you provide. A
system should have to "opt in" somehow to enable this, I think.

Additionally, you need a way to inform the hardware driver that you're
using on-die ECC, so they can make the appropriate choices (to disable
their HW ECC, for instance).

BTW, what driver/controller/SoC are you running this with?

> +		pr_info("Using on-die ECC\n");
> +	}
> +}
> +
>  /*
>   * Configure chip properties from Micron vendor-specific ONFI table
>   */
> -static void nand_onfi_detect_micron(struct nand_chip *chip,
> -		struct nand_onfi_params *p)
> +static void nand_onfi_detect_micron(struct mtd_info *mtd,
> +		struct nand_chip *chip, struct nand_onfi_params *p)

Ditto on mtd + chip.

>  {
>  	struct nand_onfi_vendor_micron *micron = (void *)p->vendor;
>  
> @@ -3067,6 +3086,8 @@ static void nand_onfi_detect_micron(struct nand_chip *chip,
>  
>  	chip->read_retries = micron->read_retry_options;
>  	chip->setup_read_retry = nand_setup_read_retry_micron;
> +
> +	nand_onfi_detect_micron_on_die_ecc(mtd, chip);
>  }
>  
>  /*
> @@ -3168,7 +3189,7 @@ static int nand_flash_detect_onfi(struct mtd_info *mtd, struct nand_chip *chip,
>  	}
>  
>  	if (p->jedec_id == NAND_MFR_MICRON)
> -		nand_onfi_detect_micron(chip, p);
> +		nand_onfi_detect_micron(mtd, chip, p);
>  
>  	return 1;
>  }
> diff --git a/include/linux/mtd/nand.h b/include/linux/mtd/nand.h
> index 2bd6e4e..780ab58 100644
> --- a/include/linux/mtd/nand.h
> +++ b/include/linux/mtd/nand.h
> @@ -225,6 +225,10 @@ struct nand_chip;
>  /* Vendor-specific feature address (Micron) */
>  #define ONFI_FEATURE_ADDR_READ_RETRY	0x89
>  
> +/* Vendor-specific array operation mode (Micron) */
> +#define ONFI_FEATURE_ADDR_ARRAY_OP_MODE	0x90
> +#define ONFI_FEATURE_ARRAY_OP_MODE_ENABLE_ON_DIE_ECC		0x08
> +
>  /* ONFI subfeature parameters length */
>  #define ONFI_SUBFEATURE_PARAM_LEN	4
>  

Brian

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH v4 1/5] mtd: nand: Detect Micron flash with on-die ECC (aka "internal ECC") enabled.
  2014-04-01  6:39   ` Brian Norris
@ 2014-04-01 15:26     ` David Mosberger
  2014-04-02  7:27       ` Gupta, Pekon
  0 siblings, 1 reply; 30+ messages in thread
From: David Mosberger @ 2014-04-01 15:26 UTC (permalink / raw)
  To: Brian Norris
  Cc: Gerhard Sittig, linux-mtd@lists.infradead.org, Gupta, Pekon,
	Artem Bityutskiy

Brian,

On Tue, Apr 1, 2014 at 12:39 AM, Brian Norris
<computersforpeace@gmail.com> wrote:
> On Mon, Mar 31, 2014 at 05:28:53PM -0600, David Mosberger wrote:
>> Signed-off-by: David Mosberger <davidm@egauge.net>
>> ---
>>  drivers/mtd/nand/nand_base.c |   27 ++++++++++++++++++++++++---
>>  include/linux/mtd/nand.h     |    4 ++++
>>  2 files changed, 28 insertions(+), 3 deletions(-)
>>
>> diff --git a/drivers/mtd/nand/nand_base.c b/drivers/mtd/nand/nand_base.c
>> index 09fe1b1..f145f00 100644
>> --- a/drivers/mtd/nand/nand_base.c
>> +++ b/drivers/mtd/nand/nand_base.c
>> @@ -3054,11 +3054,30 @@ static int nand_setup_read_retry_micron(struct mtd_info *mtd, int retry_mode)
>>                       feature);
>>  }
>>
>> +static void
>
> Does this really need to be on its own line? It doesn't match the style
> of anything else.

Sure, I changed that.  It's not something that's flagged by chkpatch
and I'm working on enough different code-bases that I'm pretty
much blind to such things.

>> +nand_onfi_detect_micron_on_die_ecc(struct mtd_info *mtd,
>> +                                struct nand_chip *chip)
>
> I'm really not sure why the inconsistent style throughout nand_base on
> using both an 'mtd' and a 'chip' parameter (we often assume that
> 'mtd->priv == chip'). If you need the mtd parameter, let's just pass it
> instead of 'chip'.

Ah, yes, that makes sense.  I updated my code.

>> +{
>> +     u8 features[ONFI_SUBFEATURE_PARAM_LEN];
>> +
>> +     if (chip->onfi_get_features(mtd, chip, ONFI_FEATURE_ADDR_ARRAY_OP_MODE,
>> +                                 features) < 0)
>> +             return;
>> +
>> +     if (features[0] & ONFI_FEATURE_ARRAY_OP_MODE_ENABLE_ON_DIE_ECC) {
>> +             /*
>> +              * If the chip has on-die ECC enabled, we kind of have
>> +              * to do the same...
>> +              */
>
> That's not really true at all, is it? We can simply disable the on-die
> ECC and use the provided spare area with a different ECC scheme, can't
> we? (e.g., software BCH; or an SoC's HW ECC) In fact, I think disabling
> the on-die ECC is a more reasonable default; nand_base is used by many
> drivers, most of which provide their own implementations, and many of
> which would not be compatible with the implementations you provide. A
> system should have to "opt in" somehow to enable this, I think.
>
> Additionally, you need a way to inform the hardware driver that you're
> using on-die ECC, so they can make the appropriate choices (to disable
> their HW ECC, for instance).
>
> BTW, what driver/controller/SoC are you running this with?

No, if you booted with on-die ECC enabled, presumably you have the
on-die ECC layout and you can't switch (sure, you can turn off ECC
but all hell will break lose when you actually try to read data with
the wrong ECC layout).

There seems to be this misconception that somehow on-die ECC would
be turned on "accidentially".  It is not.  Either a boot loader has to turn
it on or you have to pay Micron extra money (several dollars/chip, AFAIR)
to have them send you custom chips with on-die ECC enabled by default.

This is not to say that you may want a way to override it, but the patch
as it stands is safe.

  --david
-- 
eGauge Systems LLC, http://egauge.net/, 1.877-EGAUGE1, fax 720.545.9768

^ permalink raw reply	[flat|nested] 30+ messages in thread

* RE: [PATCH v4 1/5] mtd: nand: Detect Micron flash with on-die ECC (aka "internal ECC") enabled.
  2014-04-01 15:26     ` David Mosberger
@ 2014-04-02  7:27       ` Gupta, Pekon
  2014-04-02 15:07         ` David Mosberger-Tang
  0 siblings, 1 reply; 30+ messages in thread
From: Gupta, Pekon @ 2014-04-02  7:27 UTC (permalink / raw)
  To: David Mosberger, Brian Norris
  Cc: Gerhard Sittig, linux-mtd@lists.infradead.org, Artem Bityutskiy

Hi David,

>From: David Mosberger [mailto:davidm@egauge.net]
>>On Tue, Apr 1, 2014 at 12:39 AM, Brian Norris ><computersforpeace@gmail.com> wrote:
>>> On Mon, Mar 31, 2014 at 05:28:53PM -0600, David Mosberger wrote:
[...]

>>> +{
>>> +     u8 features[ONFI_SUBFEATURE_PARAM_LEN];
>>> +
>>> +     if (chip->onfi_get_features(mtd, chip, ONFI_FEATURE_ADDR_ARRAY_OP_MODE,
>>> +                                 features) < 0)
>>> +             return;
>>> +
>>> +     if (features[0] & ONFI_FEATURE_ARRAY_OP_MODE_ENABLE_ON_DIE_ECC) {
>>> +             /*
>>> +              * If the chip has on-die ECC enabled, we kind of have
>>> +              * to do the same...
>>> +              */
>>
>> That's not really true at all, is it? We can simply disable the on-die
>> ECC and use the provided spare area with a different ECC scheme, can't
>> we? (e.g., software BCH; or an SoC's HW ECC) In fact, I think disabling
>> the on-die ECC is a more reasonable default; nand_base is used by many
>> drivers, most of which provide their own implementations, and many of
>> which would not be compatible with the implementations you provide. A
>> system should have to "opt in" somehow to enable this, I think.
>>
>> Additionally, you need a way to inform the hardware driver that you're
>> using on-die ECC, so they can make the appropriate choices (to disable
>> their HW ECC, for instance).
>>
>> BTW, what driver/controller/SoC are you running this with?
>
>No, if you booted with on-die ECC enabled, presumably you have the
>on-die ECC layout and you can't switch (sure, you can turn off ECC
>but all hell will break lose when you actually try to read data with
>the wrong ECC layout).
>
You have to consider multiple scenarios here (as Brian also suggested).

(1) If a system does _not_ boot from NAND, instead it enables NAND
only in kernel (like for rootfs). Then you need a mechanism to
enable/disable on-die ECC in the kernel, the same way boot-loader does.

(2) If a boot-loader, has enabled on-die ECC, but kernel driver supports
even higher ECC scheme, then you need a hook in driver to allow user
to 'disable' this feature.

>There seems to be this misconception that somehow on-die ECC would
>be turned on "accidentially".  It is not.  Either a boot loader has to turn
>it on or you have to pay Micron extra money (several dollars/chip, AFAIR)
>to have them send you custom chips with on-die ECC enabled by default.
>
>This is not to say that you may want a way to override it, but the patch
>as it stands is safe.
>
>  --david
>--
>eGauge Systems LLC, http://egauge.net/, 1.877-EGAUGE1, fax 720.545.9768


with regards, pekon

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH v4 1/5] mtd: nand: Detect Micron flash with on-die ECC (aka "internal ECC") enabled.
  2014-04-02  7:27       ` Gupta, Pekon
@ 2014-04-02 15:07         ` David Mosberger-Tang
  2014-04-02 16:50           ` Gerhard Sittig
  0 siblings, 1 reply; 30+ messages in thread
From: David Mosberger-Tang @ 2014-04-02 15:07 UTC (permalink / raw)
  To: Gupta, Pekon
  Cc: David Mosberger, Gerhard Sittig, Brian Norris,
	linux-mtd@lists.infradead.org, Artem Bityutskiy

Pekon,

On Wed, Apr 2, 2014 at 1:27 AM, Gupta, Pekon <pekon@ti.com> wrote:

> You have to consider multiple scenarios here (as Brian also suggested).
>
> (1) If a system does _not_ boot from NAND, instead it enables NAND
> only in kernel (like for rootfs). Then you need a mechanism to
> enable/disable on-die ECC in the kernel, the same way boot-loader does.

Why?  Like I said before, on-die ECC does not get turned on "accidentally".
Furthermore, it is always *safe* to use on-die ECC,  unlike the ECC provided
by the platform-specific drivers.

> (2) If a boot-loader, has enabled on-die ECC, but kernel driver supports
> even higher ECC scheme, then you need a hook in driver to allow user
> to 'disable' this feature.

Does that really happen in practice?

I'm much more worried about someone using too weak an ECC than wanting
to use a stronger than required ECC.  In fact, that's what got us
into this mess: we ended up using SWECC on a chip that required multi-bit
ECC (yes, shame on us that we didn't catch that, but stuff happens; obviously
we were not the only ones caught off-guard by this change).

  --david

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH v4 1/5] mtd: nand: Detect Micron flash with on-die ECC (aka "internal ECC") enabled.
  2014-04-02 15:07         ` David Mosberger-Tang
@ 2014-04-02 16:50           ` Gerhard Sittig
  2014-04-02 17:02             ` David Mosberger
  0 siblings, 1 reply; 30+ messages in thread
From: Gerhard Sittig @ 2014-04-02 16:50 UTC (permalink / raw)
  To: David Mosberger-Tang
  Cc: Brian Norris, linux-mtd@lists.infradead.org, Gupta, Pekon,
	Artem Bityutskiy

On Wed, 2014-04-02 at 09:07 -0600, David Mosberger-Tang wrote:
> 
> Pekon,
> 
> On Wed, Apr 2, 2014 at 1:27 AM, Gupta, Pekon <pekon@ti.com> wrote:
> 
> > You have to consider multiple scenarios here (as Brian also suggested).
> >
> > (1) If a system does _not_ boot from NAND, instead it enables NAND
> > only in kernel (like for rootfs). Then you need a mechanism to
> > enable/disable on-die ECC in the kernel, the same way boot-loader does.
> 
> Why?  Like I said before, on-die ECC does not get turned on "accidentally".
> Furthermore, it is always *safe* to use on-die ECC,  unlike the ECC provided
> by the platform-specific drivers.

I feel that the motivation has been given in previous messages.

You appear to assume that the complete system runtime is using
the same ECC method.  This just need not be a necessity, and need
not be the best idea either.

Any component in the startup sequence may choose any arbitrary
method to deal with the chips.  It's perfectly fine for a ROM
loader to enable the on-die-ECC which was off after POR.  It's
fine for a bootloader to find the chip in this state and keep
using this method.  And it's fine for an operating system to find
the chip in this state yet picking a different ECC method, and
thus disabling the on-die-ECC within the chip.

All that is required for data integrity is that those who work
with data do so by the method which was used to create that data.
Still you can have boot files in areas that use on-die-ECC, and
have a filesystem in an area that uses software or hardware ECC
different from the on-die-ECC method, using the OOB as they
please.

Just provide tools, don't encode policy.  Leave it to the users
what they want to do.  Don't assume that everybody is in the very
same situation which you are in, or shares your priorities.

> > (2) If a boot-loader, has enabled on-die ECC, but kernel driver supports
> > even higher ECC scheme, then you need a hook in driver to allow user
> > to 'disable' this feature.
> 
> Does that really happen in practice?

Simple answer: Yes.  The example was given, when hardware or
software support methods which are stronger than the chip's ECC,
or evenly strong yet preferrable for some other reason (like
speed, or existing support and portability or general
availability).

Please take a step back, take a deep breath, and try to see why
people are asking questions.  It's not that you are being
rejected or need to "defend" something.  It's the opposite that
we are trying to create something that scratches more than one
personal itch, and serves other people's needs as well as yours.
And please accept that thorough review isn't free either, yet
spending the work upfront is cheaper than maintaining things that
don't fit, or have stuff rot and cause problems.  Getting ignored
is worse than receiving feedback and help. :)  Try to see the
benefit of it, please.  People spend their time on communicating
with you, nobody's fighting (it's easier to turn around and do
something else, if emotions get in the way and hinder progress --
we are all busy after all).

virtually yours
Gerhard Sittig
-- 
DENX Software Engineering GmbH,     MD: Wolfgang Denk & Detlev Zundel
HRB 165235 Munich, Office: Kirchenstr. 5, D-82194 Groebenzell, Germany
Phone: +49-8142-66989-0 Fax: +49-8142-66989-80  Email: office@denx.de

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH v4 1/5] mtd: nand: Detect Micron flash with on-die ECC (aka "internal ECC") enabled.
  2014-04-02 16:50           ` Gerhard Sittig
@ 2014-04-02 17:02             ` David Mosberger
  2014-04-03  7:10               ` Gerhard Sittig
  0 siblings, 1 reply; 30+ messages in thread
From: David Mosberger @ 2014-04-02 17:02 UTC (permalink / raw)
  To: Gerhard Sittig
  Cc: Brian Norris, linux-mtd@lists.infradead.org, Gupta, Pekon,
	Artem Bityutskiy

Gerhard,

On Wed, Apr 2, 2014 at 10:50 AM, Gerhard Sittig <gsi@denx.de> wrote:

> Just provide tools, don't encode policy.

Hey, I'm all for separating mechanism and policy.
It's just not what exists in the NAND driver now.
AFAIK, ECC selection is *per chip* (not per filesystem)
and is hard-coded by a combination of config-options
and platform-specific drivers.  Isn't that true?

> Please take a step back, take a deep breath, and try to see why
> people are asking questions.  It's not that you are being
> rejected or need to "defend" something.  It's the opposite that
> we are trying to create something that scratches more than one
> personal itch, and serves other people's needs as well as yours.

Yes, and I'm just trying to understand where you're coming from.

  --david
-- 
eGauge Systems LLC, http://egauge.net/, 1.877-EGAUGE1, fax 720.545.9768

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH v4 1/5] mtd: nand: Detect Micron flash with on-die ECC (aka "internal ECC") enabled.
  2014-04-02 17:02             ` David Mosberger
@ 2014-04-03  7:10               ` Gerhard Sittig
       [not found]                 ` <CALnQHM1VLY=t6CaQtHGtp=enNCCj=Xz_QN7sj20hUCd8ZJjKpA@mail.gmail.com>
  0 siblings, 1 reply; 30+ messages in thread
From: Gerhard Sittig @ 2014-04-03  7:10 UTC (permalink / raw)
  To: David Mosberger
  Cc: Brian Norris, linux-mtd@lists.infradead.org, Gupta, Pekon,
	Artem Bityutskiy

On Wed, 2014-04-02 at 11:02 -0600, David Mosberger wrote:
> 
> AFAIK, ECC selection is *per chip* (not per filesystem)
> and is hard-coded by a combination of config-options
> and platform-specific drivers.  Isn't that true?

Let's look at the boot stages:

- ROM loader code reads bootloader code
- bootloader code reads bootloader configuration, and boot files
  (kernel image, device tree)
- OS kernels read and write the filesystem

So each of those phases run at different times and operate on
different areas of the chip.

- the bootloader need not write nor modify ROM loader data
- the OS kernel need not write nor modify bootloader data or boot
  files

Which means that you can perfectly operate those individual areas
with different ECC methods, depending on the capabilities of the
software that is running at this moment, and your preferences or
having access to different levels of hardware support, the will
or capability/incapability to sync among those components, or to
adjust and update these individual boot phases, etc.

I really don't see that enabling on-die-ECC in one stage requires
all other stages to follow.  It's one apparent approach, but need
not be the only one.  That's what previous messages tried to say.

Does this explanation help answer your question?

virtually yours
Gerhard Sittig
-- 
DENX Software Engineering GmbH,     MD: Wolfgang Denk & Detlev Zundel
HRB 165235 Munich, Office: Kirchenstr. 5, D-82194 Groebenzell, Germany
Phone: +49-8142-66989-0 Fax: +49-8142-66989-80  Email: office@denx.de

^ permalink raw reply	[flat|nested] 30+ messages in thread

[parent not found: <CALnQHM1VLY=t6CaQtHGtp=enNCCj=Xz_QN7sj20hUCd8ZJjKpA@mail.gmail.com>]

* Re: [PATCH v4 1/5] mtd: nand: Detect Micron flash with on-die ECC (aka "internal ECC") enabled.
       [not found]                 ` <CALnQHM1VLY=t6CaQtHGtp=enNCCj=Xz_QN7sj20hUCd8ZJjKpA@mail.gmail.com>
@ 2014-04-03 15:26                   ` David Mosberger
  0 siblings, 0 replies; 30+ messages in thread
From: David Mosberger @ 2014-04-03 15:26 UTC (permalink / raw)
  To: Gerhard Sittig
  Cc: Brian Norris, linux-mtd@lists.infradead.org, Gupta, Pekon,
	Artem Bityutskiy

[Resend in plain text, sorry...]

On Thu, Apr 3, 2014 at 9:24 AM, David Mosberger <davidm@egauge.net> wrote:
>
> Gerhard,
>
> On Thu, Apr 3, 2014 at 1:10 AM, Gerhard Sittig <gsi@denx.de> wrote:
>>
>> I really don't see that enabling on-die-ECC in one stage requires
>> all other stages to follow.  It's one apparent approach, but need
>> not be the only one.  That's what previous messages tried to say.
>
>
>>
>> Does this explanation help answer your question?
>
>
> Yes, that helps, thanks for taking the time to answer!
>
> Leaving on-die ECC on is certainly not the only way to do it,
> but I still think it's a reasonable way.
>
> I doubt it's reasonable to modify each and every platform-driver
> to be able to decide whether to use on-die ECC or the best
> ECC mode it can supply.
>
> Maybe it's possible to use the new(ish) ecc_strength_ds/ecc_step_ds
> to chose whether or not to keep on-die ECC on?  Say, if the driver's
> ECC is stronger, turn on-die ECC off, otherwise leave it on?
>
> Again, I think the present patch is safe and reasonable, but I'm certainly
> open to better suggestions.
>
>   --david
> --
> eGauge Systems LLC, http://egauge.net/, 1.877-EGAUGE1, fax 720.545.9768




-- 
eGauge Systems LLC, http://egauge.net/, 1.877-EGAUGE1, fax 720.545.9768

^ permalink raw reply	[flat|nested] 30+ messages in thread

* [PATCH v4 2/5] mtd: nand: Add NAND_ECC_HW_ON_DIE ECC-mode.
  2014-03-31 23:28 [PATCH v4 0/5] mtd: nand: Add on-die ECC support David Mosberger
  2014-03-31 23:28 ` [PATCH v4 1/5] mtd: nand: Detect Micron flash with on-die ECC (aka "internal ECC") enabled David Mosberger
@ 2014-03-31 23:28 ` David Mosberger
  2014-04-01  6:02   ` Gupta, Pekon
  2014-04-01  7:24   ` Brian Norris
  2014-03-31 23:28 ` [PATCH v4 3/5] mtd: nand: Enable subpage-reads on flashes with on-die ECC enabled David Mosberger
                   ` (3 subsequent siblings)
  5 siblings, 2 replies; 30+ messages in thread
From: David Mosberger @ 2014-03-31 23:28 UTC (permalink / raw)
  To: pekon; +Cc: David Mosberger, gsi, computersforpeace, linux-mtd, dedekind1

Some Micron chips such as the MT29F4G16ABADAWP have an internal
(on-die) ECC-controller that is useful when the host-platform doesn't
have hardware-support for multi-bit ECC.  This patch adds
NAND_ECC_HW_ON_DIE to support such chips when the driver detects that
the on-die ECC controller is enabled.

Signed-off-by: David Mosberger <davidm@egauge.net>
---
 drivers/mtd/nand/nand_base.c |  133 ++++++++++++++++++++++++++++++++++++++++++
 drivers/of/of_mtd.c          |    1 +
 include/linux/mtd/nand.h     |    2 +
 3 files changed, 136 insertions(+)

diff --git a/drivers/mtd/nand/nand_base.c b/drivers/mtd/nand/nand_base.c
index f145f00..7047e9f5 100644
--- a/drivers/mtd/nand/nand_base.c
+++ b/drivers/mtd/nand/nand_base.c
@@ -78,6 +78,20 @@ static struct nand_ecclayout nand_oob_64 = {
 		 .length = 38} }
 };
 
+static struct nand_ecclayout nand_oob_64_on_die = {
+	.eccbytes = 32,
+	.eccpos = {
+		    8,  9, 10, 11, 12, 13, 14, 15,
+		   24, 25, 26, 27, 28, 29, 30, 31,
+		   40, 41, 42, 43, 44, 45, 46, 47,
+		   56, 57, 58, 59, 60, 61, 62, 63},
+	.oobfree = {
+		{.offset =  4,	 .length = 4},
+		{.offset = 20,	 .length = 4},
+		{.offset = 36,	 .length = 4},
+		{.offset = 52,	 .length = 4}}
+};
+
 static struct nand_ecclayout nand_oob_128 = {
 	.eccbytes = 48,
 	.eccpos = {
@@ -1258,6 +1272,105 @@ static int nand_read_subpage(struct mtd_info *mtd, struct nand_chip *chip,
 	return max_bitflips;
 }
 
+static int check_read_status_on_die(struct mtd_info *mtd,
+				    struct nand_chip *chip, int page)
+{
+	int max_bitflips = 0;
+	uint8_t status;
+
+	/* Check ECC status of page just transferred into NAND's page buffer: */
+	chip->cmdfunc(mtd, NAND_CMD_STATUS, -1, -1);
+	status = chip->read_byte(mtd);
+
+	/* Switch back to data reading: */
+	chip->cmdfunc(mtd, NAND_CMD_READMODE, -1, -1);
+
+	if (status & NAND_STATUS_FAIL) {
+		/* Page has invalid ECC. */
+		mtd->ecc_stats.failed++;
+	} else if (status & NAND_STATUS_REWRITE) {
+		/*
+		 * Simple but suboptimal: any page with a single stuck
+		 * bit will be unusable since it'll be rewritten on
+		 * each read...
+		 */
+		max_bitflips = mtd->bitflip_threshold;
+	}
+	return max_bitflips;
+}
+
+/**
+ * nand_read_subpage_on_die - [REPLACEABLE] raw sub-page read function
+ * @mtd: mtd info structure
+ * @chip: nand chip info structure
+ * @data_offs: offset of requested data within the page
+ * @readlen: data length
+ * @bufpoi: buffer to store read data
+ */
+static int nand_read_subpage_on_die(struct mtd_info *mtd,
+				    struct nand_chip *chip,
+				    uint32_t data_offs, uint32_t readlen,
+				    uint8_t *bufpoi, int page)
+{
+	int start_step, end_step, num_steps, ret;
+	int data_col_addr;
+	int datafrag_len;
+	uint32_t failed;
+	uint8_t *p;
+
+	/* Column address within the page aligned to ECC size */
+	start_step = data_offs / chip->ecc.size;
+	end_step = (data_offs + readlen - 1) / chip->ecc.size;
+	num_steps = end_step - start_step + 1;
+
+	/* Data size aligned to ECC ecc.size */
+	datafrag_len = num_steps * chip->ecc.size;
+	data_col_addr = start_step * chip->ecc.size;
+	p = bufpoi + data_col_addr;
+
+	failed = mtd->ecc_stats.failed;
+
+	ret = check_read_status_on_die(mtd, chip, page);
+	if (ret < 0 || mtd->ecc_stats.failed != failed) {
+		memset(p, 0, datafrag_len);
+		return ret;
+	}
+
+	/* If we read not a page aligned data */
+	if (data_col_addr != 0)
+		chip->cmdfunc(mtd, NAND_CMD_RNDOUT, data_col_addr, -1);
+
+	chip->read_buf(mtd, p, datafrag_len);
+
+	return ret;
+}
+
+/**
+ * nand_read_page_on_die - [INTERN] read raw page data without ecc
+ * @mtd: mtd info structure
+ * @chip: nand chip info structure
+ * @buf: buffer to store read data
+ * @oob_required: caller requires OOB data read to chip->oob_poi
+ * @page: page number to read
+ */
+static int nand_read_page_on_die(struct mtd_info *mtd, struct nand_chip *chip,
+				 uint8_t *buf, int oob_required, int page)
+{
+	uint32_t failed;
+	int ret;
+
+	failed = mtd->ecc_stats.failed;
+
+	ret = check_read_status_on_die(mtd, chip, page);
+	if (ret < 0 || mtd->ecc_stats.failed != failed)
+		return ret;
+
+	chip->read_buf(mtd, buf, mtd->writesize);
+	if (oob_required)
+		chip->read_buf(mtd, chip->oob_poi, mtd->oobsize);
+	return ret;
+}
+
 /**
  * nand_read_page_hwecc - [REPLACEABLE] hardware ECC based page read function
  * @mtd: mtd info structure
@@ -3069,6 +3182,7 @@ nand_onfi_detect_micron_on_die_ecc(struct mtd_info *mtd,
 		 * If the chip has on-die ECC enabled, we kind of have
 		 * to do the same...
 		 */
+		chip->ecc.mode = NAND_ECC_HW_ON_DIE;
 		pr_info("Using on-die ECC\n");
 	}
 }
@@ -3985,6 +4099,25 @@ int nand_scan_tail(struct mtd_info *mtd)
 		ecc->strength = ecc->bytes * 8 / fls(8 * ecc->size);
 		break;
 
+	case NAND_ECC_HW_ON_DIE:
+		/* nand_bbt attempts to put Bbt marker at offset 8 in
+		   oob, which is used for ECC by Micron
+		   MT29F4G16ABADAWP, for example.  Fixed by not using
+		   OOB for BBT marker.  */
+		chip->bbt_options |= NAND_BBT_NO_OOB;
+		chip->ecc.layout = &nand_oob_64_on_die;
+		chip->ecc.read_page = nand_read_page_on_die;
+		chip->ecc.read_subpage = nand_read_subpage_on_die;
+		chip->ecc.write_page = nand_write_page_raw;
+		chip->ecc.read_oob = nand_read_oob_std;
+		chip->ecc.read_page_raw = nand_read_page_raw;
+		chip->ecc.write_page_raw = nand_write_page_raw;
+		chip->ecc.write_oob = nand_write_oob_std;
+		chip->ecc.size = 512;
+		chip->ecc.bytes = 8;
+		chip->ecc.strength = 4;
+		break;
+
 	case NAND_ECC_NONE:
 		pr_warn("NAND_ECC_NONE selected by board driver. "
 			   "This is not recommended!\n");
diff --git a/drivers/of/of_mtd.c b/drivers/of/of_mtd.c
index b7361ed..c844c84 100644
--- a/drivers/of/of_mtd.c
+++ b/drivers/of/of_mtd.c
@@ -23,6 +23,7 @@ static const char *nand_ecc_modes[] = {
 	[NAND_ECC_HW_SYNDROME]	= "hw_syndrome",
 	[NAND_ECC_HW_OOB_FIRST]	= "hw_oob_first",
 	[NAND_ECC_SOFT_BCH]	= "soft_bch",
+	[NAND_ECC_HW_ON_DIE]	= "hw_on_die",
 };
 
 /**
diff --git a/include/linux/mtd/nand.h b/include/linux/mtd/nand.h
index 780ab58..dbb99b3 100644
--- a/include/linux/mtd/nand.h
+++ b/include/linux/mtd/nand.h
@@ -112,6 +112,7 @@ extern int nand_unlock(struct mtd_info *mtd, loff_t ofs, uint64_t len);
 /* Status bits */
 #define NAND_STATUS_FAIL	0x01
 #define NAND_STATUS_FAIL_N1	0x02
+#define NAND_STATUS_REWRITE	0x08
 #define NAND_STATUS_TRUE_READY	0x20
 #define NAND_STATUS_READY	0x40
 #define NAND_STATUS_WP		0x80
@@ -126,6 +127,7 @@ typedef enum {
 	NAND_ECC_HW_SYNDROME,
 	NAND_ECC_HW_OOB_FIRST,
 	NAND_ECC_SOFT_BCH,
+	NAND_ECC_HW_ON_DIE,
 } nand_ecc_modes_t;
 
 /*
-- 
1.7.9.5

^ permalink raw reply related	[flat|nested] 30+ messages in thread

* RE: [PATCH v4 2/5] mtd: nand: Add NAND_ECC_HW_ON_DIE ECC-mode.
  2014-03-31 23:28 ` [PATCH v4 2/5] mtd: nand: Add NAND_ECC_HW_ON_DIE ECC-mode David Mosberger
@ 2014-04-01  6:02   ` Gupta, Pekon
  2014-04-01 15:32     ` David Mosberger
  2014-04-01  7:24   ` Brian Norris
  1 sibling, 1 reply; 30+ messages in thread
From: Gupta, Pekon @ 2014-04-01  6:02 UTC (permalink / raw)
  To: David Mosberger
  Cc: gsi@denx.de, computersforpeace@gmail.com,
	linux-mtd@lists.infradead.org, dedekind1@gmail.com

Hi David,

>From: David Mosberger
>Some Micron chips such as the MT29F4G16ABADAWP have an internal
>(on-die) ECC-controller that is useful when the host-platform doesn't
>have hardware-support for multi-bit ECC.  This patch adds
>NAND_ECC_HW_ON_DIE to support such chips when the driver detects that
>the on-die ECC controller is enabled.
>
>Signed-off-by: David Mosberger <davidm@egauge.net>
>---
> drivers/mtd/nand/nand_base.c |  133 ++++++++++++++++++++++++++++++++++++++++++
> drivers/of/of_mtd.c          |    1 +
> include/linux/mtd/nand.h     |    2 +
> 3 files changed, 136 insertions(+)
>
>diff --git a/drivers/mtd/nand/nand_base.c b/drivers/mtd/nand/nand_base.c
>index f145f00..7047e9f5 100644
>--- a/drivers/mtd/nand/nand_base.c
>+++ b/drivers/mtd/nand/nand_base.c
>@@ -78,6 +78,20 @@ static struct nand_ecclayout nand_oob_64 = {
> 		 .length = 38} }
> };
>
>+static struct nand_ecclayout nand_oob_64_on_die = {
>+	.eccbytes = 32,
>+	.eccpos = {
>+		    8,  9, 10, 11, 12, 13, 14, 15,
>+		   24, 25, 26, 27, 28, 29, 30, 31,
>+		   40, 41, 42, 43, 44, 45, 46, 47,
>+		   56, 57, 58, 59, 60, 61, 62, 63},
>+	.oobfree = {
>+		{.offset =  4,	 .length = 4},
>+		{.offset = 20,	 .length = 4},
>+		{.offset = 36,	 .length = 4},
>+		{.offset = 52,	 .length = 4}}
>+};
>+
> static struct nand_ecclayout nand_oob_128 = {
> 	.eccbytes = 48,
> 	.eccpos = {
>@@ -1258,6 +1272,105 @@ static int nand_read_subpage(struct mtd_info *mtd, struct nand_chip *chip,
> 	return max_bitflips;
> }
>
>+static int check_read_status_on_die(struct mtd_info *mtd,
>+				    struct nand_chip *chip, int page)
>+{
>+	int max_bitflips = 0;
>+	uint8_t status;
>+
>+	/* Check ECC status of page just transferred into NAND's page buffer: */
>+	chip->cmdfunc(mtd, NAND_CMD_STATUS, -1, -1);
>+	status = chip->read_byte(mtd);
>+
>+	/* Switch back to data reading: */
>+	chip->cmdfunc(mtd, NAND_CMD_READMODE, -1, -1);
>+
>+	if (status & NAND_STATUS_FAIL) {
>+		/* Page has invalid ECC. */
>+		mtd->ecc_stats.failed++;
>+	} else if (status & NAND_STATUS_REWRITE) {
>+		/*
>+		 * Simple but suboptimal: any page with a single stuck
>+		 * bit will be unusable since it'll be rewritten on
>+		 * each read...
>+		 */
>+		max_bitflips = mtd->bitflip_threshold;

This is not correct. You need to count the bit-flips. This would lead to
un-necessary scrubbing of the pages by UBI even for single-bit errors.

>+	}
>+	return max_bitflips;
>+}
>+
>+/**
>+ * nand_read_subpage_on_die - [REPLACEABLE] raw sub-page read function
>+ * @mtd: mtd info structure
>+ * @chip: nand chip info structure
>+ * @data_offs: offset of requested data within the page
>+ * @readlen: data length
>+ * @bufpoi: buffer to store read data
>+ */
>+static int nand_read_subpage_on_die(struct mtd_info *mtd,
>+				    struct nand_chip *chip,
>+				    uint32_t data_offs, uint32_t readlen,
>+				    uint8_t *bufpoi, int page)
>+{
>+	int start_step, end_step, num_steps, ret;
>+	int data_col_addr;
>+	int datafrag_len;
>+	uint32_t failed;
>+	uint8_t *p;
>+
>+	/* Column address within the page aligned to ECC size */
>+	start_step = data_offs / chip->ecc.size;
>+	end_step = (data_offs + readlen - 1) / chip->ecc.size;
>+	num_steps = end_step - start_step + 1;
>+
>+	/* Data size aligned to ECC ecc.size */
>+	datafrag_len = num_steps * chip->ecc.size;
>+	data_col_addr = start_step * chip->ecc.size;
>+	p = bufpoi + data_col_addr;
>+
>+	failed = mtd->ecc_stats.failed;
>+
>+	ret = check_read_status_on_die(mtd, chip, page);
>+	if (ret < 0 || mtd->ecc_stats.failed != failed) {
>+		memset(p, 0, datafrag_len);
>+		return ret;
>+	}
>+
>+	/* If we read not a page aligned data */
>+	if (data_col_addr != 0)
>+		chip->cmdfunc(mtd, NAND_CMD_RNDOUT, data_col_addr, -1);
>+
>+	chip->read_buf(mtd, p, datafrag_len);
>+
>+	return ret;
>+}
>+
Please spawn out nand_read_subpage_on_die() into a separate patch
(or may be later) so that basic framework with nand_read_page_on_die()
can be independently reviewed and accepted.


>+/**
>+ * nand_read_page_on_die - [INTERN] read raw page data without ecc
>+ * @mtd: mtd info structure
>+ * @chip: nand chip info structure
>+ * @buf: buffer to store read data
>+ * @oob_required: caller requires OOB data read to chip->oob_poi
>+ * @page: page number to read
>+ */
>+static int nand_read_page_on_die(struct mtd_info *mtd, struct nand_chip *chip,
>+				 uint8_t *buf, int oob_required, int page)
>+{
>+	uint32_t failed;
>+	int ret;
>+
>+	failed = mtd->ecc_stats.failed;
>+
>+	ret = check_read_status_on_die(mtd, chip, page);
>+	if (ret < 0 || mtd->ecc_stats.failed != failed)
>+		return ret;
>+
>+	chip->read_buf(mtd, buf, mtd->writesize);
>+	if (oob_required)
>+		chip->read_buf(mtd, chip->oob_poi, mtd->oobsize);
>+	return ret;
>+}
>+
> /**
>  * nand_read_page_hwecc - [REPLACEABLE] hardware ECC based page read function
>  * @mtd: mtd info structure
>@@ -3069,6 +3182,7 @@ nand_onfi_detect_micron_on_die_ecc(struct mtd_info *mtd,
> 		 * If the chip has on-die ECC enabled, we kind of have
> 		 * to do the same...
> 		 */
>+		chip->ecc.mode = NAND_ECC_HW_ON_DIE;
> 		pr_info("Using on-die ECC\n");
> 	}
> }
>@@ -3985,6 +4099,25 @@ int nand_scan_tail(struct mtd_info *mtd)
> 		ecc->strength = ecc->bytes * 8 / fls(8 * ecc->size);
> 		break;
>
>+	case NAND_ECC_HW_ON_DIE:
>+		/* nand_bbt attempts to put Bbt marker at offset 8 in
>+		   oob, which is used for ECC by Micron
>+		   MT29F4G16ABADAWP, for example.  Fixed by not using
>+		   OOB for BBT marker.  */
>+		chip->bbt_options |= NAND_BBT_NO_OOB;
>+		chip->ecc.layout = &nand_oob_64_on_die;
>+		chip->ecc.read_page = nand_read_page_on_die;
>+		chip->ecc.read_subpage = nand_read_subpage_on_die;
>+		chip->ecc.write_page = nand_write_page_raw;
>+		chip->ecc.read_oob = nand_read_oob_std;
>+		chip->ecc.read_page_raw = nand_read_page_raw;
>+		chip->ecc.write_page_raw = nand_write_page_raw;
>+		chip->ecc.write_oob = nand_write_oob_std;
>+		chip->ecc.size = 512;
>+		chip->ecc.bytes = 8;
>+		chip->ecc.strength = 4;

Shouldn't you parse these value from ONFI params ?
Do Byte[112] (Number of bits ECC bits) of ONFI param page still holds
good when on-die-ECC is enabled ? 

>+		break;
>+
> 	case NAND_ECC_NONE:
> 		pr_warn("NAND_ECC_NONE selected by board driver. "
> 			   "This is not recommended!\n");
>diff --git a/drivers/of/of_mtd.c b/drivers/of/of_mtd.c
>index b7361ed..c844c84 100644
>--- a/drivers/of/of_mtd.c
>+++ b/drivers/of/of_mtd.c
>@@ -23,6 +23,7 @@ static const char *nand_ecc_modes[] = {
> 	[NAND_ECC_HW_SYNDROME]	= "hw_syndrome",
> 	[NAND_ECC_HW_OOB_FIRST]	= "hw_oob_first",
> 	[NAND_ECC_SOFT_BCH]	= "soft_bch",
>+	[NAND_ECC_HW_ON_DIE]	= "hw_on_die",
> };
>
Why are you adding this an selectable option for "nand-ecc-mode"
DT binding ? (as you are not giving user to choose ECC mode).
Also, description of this new DT binding value needs to be added in
Documentation/devicetree/bindings/mtd/nand.txt


> /**
>diff --git a/include/linux/mtd/nand.h b/include/linux/mtd/nand.h
>index 780ab58..dbb99b3 100644
>--- a/include/linux/mtd/nand.h
>+++ b/include/linux/mtd/nand.h
>@@ -112,6 +112,7 @@ extern int nand_unlock(struct mtd_info *mtd, loff_t ofs, uint64_t len);
> /* Status bits */
> #define NAND_STATUS_FAIL	0x01
> #define NAND_STATUS_FAIL_N1	0x02
>+#define NAND_STATUS_REWRITE	0x08
> #define NAND_STATUS_TRUE_READY	0x20
> #define NAND_STATUS_READY	0x40
> #define NAND_STATUS_WP		0x80
>@@ -126,6 +127,7 @@ typedef enum {
> 	NAND_ECC_HW_SYNDROME,
> 	NAND_ECC_HW_OOB_FIRST,
> 	NAND_ECC_SOFT_BCH,
>+	NAND_ECC_HW_ON_DIE,
> } nand_ecc_modes_t;
>
> /*
>--
>1.7.9.5
>

In summary, It seems that you havn't followed the recommendations by
given on previous patches. If you feel those are not applicable to your driver
then please feel free to discuss before re-sending the new version.


with regards, pekon

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH v4 2/5] mtd: nand: Add NAND_ECC_HW_ON_DIE ECC-mode.
  2014-04-01  6:02   ` Gupta, Pekon
@ 2014-04-01 15:32     ` David Mosberger
  0 siblings, 0 replies; 30+ messages in thread
From: David Mosberger @ 2014-04-01 15:32 UTC (permalink / raw)
  To: Gupta, Pekon
  Cc: gsi@denx.de, computersforpeace@gmail.com,
	linux-mtd@lists.infradead.org, dedekind1@gmail.com

Pekon,

On Tue, Apr 1, 2014 at 12:02 AM, Gupta, Pekon <pekon@ti.com> wrote:

>>+      } else if (status & NAND_STATUS_REWRITE) {
>>+              /*
>>+               * Simple but suboptimal: any page with a single stuck
>>+               * bit will be unusable since it'll be rewritten on
>>+               * each read...
>>+               */
>>+              max_bitflips = mtd->bitflip_threshold;
>
> This is not correct. You need to count the bit-flips. This would lead to
> un-necessary scrubbing of the pages by UBI even for single-bit errors.

Wrong.  It is correct, it's just not optimal.
Patch 5 of 5 changes this part to count the bitflips, to get optimal behavior.

> Please spawn out nand_read_subpage_on_die() into a separate patch
> (or may be later) so that basic framework with nand_read_page_on_die()
> can be independently reviewed and accepted.

Good point, thanks.


>>@@ -3985,6 +4099,25 @@ int nand_scan_tail(struct mtd_info *mtd)
>>               ecc->strength = ecc->bytes * 8 / fls(8 * ecc->size);
>>               break;
>>
>>+      case NAND_ECC_HW_ON_DIE:
>>+              /* nand_bbt attempts to put Bbt marker at offset 8 in
>>+                 oob, which is used for ECC by Micron
>>+                 MT29F4G16ABADAWP, for example.  Fixed by not using
>>+                 OOB for BBT marker.  */
>>+              chip->bbt_options |= NAND_BBT_NO_OOB;
>>+              chip->ecc.layout = &nand_oob_64_on_die;
>>+              chip->ecc.read_page = nand_read_page_on_die;
>>+              chip->ecc.read_subpage = nand_read_subpage_on_die;
>>+              chip->ecc.write_page = nand_write_page_raw;
>>+              chip->ecc.read_oob = nand_read_oob_std;
>>+              chip->ecc.read_page_raw = nand_read_page_raw;
>>+              chip->ecc.write_page_raw = nand_write_page_raw;
>>+              chip->ecc.write_oob = nand_write_oob_std;
>>+              chip->ecc.size = 512;
>>+              chip->ecc.bytes = 8;
>>+              chip->ecc.strength = 4;
>
> Shouldn't you parse these value from ONFI params ?
> Do Byte[112] (Number of bits ECC bits) of ONFI param page still holds
> good when on-die-ECC is enabled ?

Probably.  I just don't know how to do that.  If someone wants to send
me a patch,
I'll be happy to test it.


>>--- a/drivers/of/of_mtd.c
>>+++ b/drivers/of/of_mtd.c
>>@@ -23,6 +23,7 @@ static const char *nand_ecc_modes[] = {
>>       [NAND_ECC_HW_SYNDROME]  = "hw_syndrome",
>>       [NAND_ECC_HW_OOB_FIRST] = "hw_oob_first",
>>       [NAND_ECC_SOFT_BCH]     = "soft_bch",
>>+      [NAND_ECC_HW_ON_DIE]    = "hw_on_die",
>> };
>>
> Why are you adding this an selectable option for "nand-ecc-mode"
> DT binding ? (as you are not giving user to choose ECC mode).
> Also, description of this new DT binding value needs to be added in
> Documentation/devicetree/bindings/mtd/nand.txt

I don't know how this is used.  If it's not needed, we can leave it out.
I was just worried about some code indexing into nand_ecc_modes[].

> In summary, It seems that you havn't followed the recommendations by
> given on previous patches. If you feel those are not applicable to your driver
> then please feel free to discuss before re-sending the new version.

That's a bit thick, isn't it?

  --david
-- 
eGauge Systems LLC, http://egauge.net/, 1.877-EGAUGE1, fax 720.545.9768

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH v4 2/5] mtd: nand: Add NAND_ECC_HW_ON_DIE ECC-mode.
  2014-03-31 23:28 ` [PATCH v4 2/5] mtd: nand: Add NAND_ECC_HW_ON_DIE ECC-mode David Mosberger
  2014-04-01  6:02   ` Gupta, Pekon
@ 2014-04-01  7:24   ` Brian Norris
  2014-04-01 15:41     ` David Mosberger
  1 sibling, 1 reply; 30+ messages in thread
From: Brian Norris @ 2014-04-01  7:24 UTC (permalink / raw)
  To: David Mosberger; +Cc: gsi, linux-mtd, pekon, dedekind1

On Mon, Mar 31, 2014 at 05:28:54PM -0600, David Mosberger wrote:
> Some Micron chips such as the MT29F4G16ABADAWP have an internal
> (on-die) ECC-controller that is useful when the host-platform doesn't
> have hardware-support for multi-bit ECC.  This patch adds
> NAND_ECC_HW_ON_DIE to support such chips when the driver detects that
> the on-die ECC controller is enabled.
> 
> Signed-off-by: David Mosberger <davidm@egauge.net>
> ---
>  drivers/mtd/nand/nand_base.c |  133 ++++++++++++++++++++++++++++++++++++++++++
>  drivers/of/of_mtd.c          |    1 +
>  include/linux/mtd/nand.h     |    2 +
>  3 files changed, 136 insertions(+)
> 
> diff --git a/drivers/mtd/nand/nand_base.c b/drivers/mtd/nand/nand_base.c
> index f145f00..7047e9f5 100644
> --- a/drivers/mtd/nand/nand_base.c
> +++ b/drivers/mtd/nand/nand_base.c
> @@ -78,6 +78,20 @@ static struct nand_ecclayout nand_oob_64 = {
>  		 .length = 38} }
>  };
>  
> +static struct nand_ecclayout nand_oob_64_on_die = {

This is not the only possible layout for on-die ECC. It's probably not
even the only one used by Micron, is it? What about larger page sizes,
for instance?

> +	.eccbytes = 32,
> +	.eccpos = {
> +		    8,  9, 10, 11, 12, 13, 14, 15,
> +		   24, 25, 26, 27, 28, 29, 30, 31,
> +		   40, 41, 42, 43, 44, 45, 46, 47,
> +		   56, 57, 58, 59, 60, 61, 62, 63},
> +	.oobfree = {
> +		{.offset =  4,	 .length = 4},
> +		{.offset = 20,	 .length = 4},
> +		{.offset = 36,	 .length = 4},
> +		{.offset = 52,	 .length = 4}}
> +};
> +
>  static struct nand_ecclayout nand_oob_128 = {
>  	.eccbytes = 48,
>  	.eccpos = {
> @@ -1258,6 +1272,105 @@ static int nand_read_subpage(struct mtd_info *mtd, struct nand_chip *chip,
>  	return max_bitflips;
>  }
>  
> +static int check_read_status_on_die(struct mtd_info *mtd,
> +				    struct nand_chip *chip, int page)
> +{
> +	int max_bitflips = 0;
> +	uint8_t status;
> +
> +	/* Check ECC status of page just transferred into NAND's page buffer: */
> +	chip->cmdfunc(mtd, NAND_CMD_STATUS, -1, -1);
> +	status = chip->read_byte(mtd);
> +
> +	/* Switch back to data reading: */
> +	chip->cmdfunc(mtd, NAND_CMD_READMODE, -1, -1);
> +
> +	if (status & NAND_STATUS_FAIL) {
> +		/* Page has invalid ECC. */
> +		mtd->ecc_stats.failed++;
> +	} else if (status & NAND_STATUS_REWRITE) {
> +		/*
> +		 * Simple but suboptimal: any page with a single stuck
> +		 * bit will be unusable since it'll be rewritten on
> +		 * each read...
> +		 */
> +		max_bitflips = mtd->bitflip_threshold;
> +	}
> +	return max_bitflips;
> +}
> +
> +/**
> + * nand_read_subpage_on_die - [REPLACEABLE] raw sub-page read function
> + * @mtd: mtd info structure
> + * @chip: nand chip info structure
> + * @data_offs: offset of requested data within the page
> + * @readlen: data length
> + * @bufpoi: buffer to store read data
> + */
> +static int nand_read_subpage_on_die(struct mtd_info *mtd,
> +				    struct nand_chip *chip,
> +				    uint32_t data_offs, uint32_t readlen,
> +				    uint8_t *bufpoi, int page)
> +{
> +	int start_step, end_step, num_steps, ret;
> +	int data_col_addr;
> +	int datafrag_len;
> +	uint32_t failed;
> +	uint8_t *p;
> +
> +	/* Column address within the page aligned to ECC size */
> +	start_step = data_offs / chip->ecc.size;
> +	end_step = (data_offs + readlen - 1) / chip->ecc.size;
> +	num_steps = end_step - start_step + 1;
> +
> +	/* Data size aligned to ECC ecc.size */
> +	datafrag_len = num_steps * chip->ecc.size;
> +	data_col_addr = start_step * chip->ecc.size;
> +	p = bufpoi + data_col_addr;
> +
> +	failed = mtd->ecc_stats.failed;
> +
> +	ret = check_read_status_on_die(mtd, chip, page);
> +	if (ret < 0 || mtd->ecc_stats.failed != failed) {
> +		memset(p, 0, datafrag_len);
> +		return ret;
> +	}
> +
> +	/* If we read not a page aligned data */
> +	if (data_col_addr != 0)
> +		chip->cmdfunc(mtd, NAND_CMD_RNDOUT, data_col_addr, -1);
> +
> +	chip->read_buf(mtd, p, datafrag_len);
> +
> +	return ret;
> +}
> +
> +/**
> + * nand_read_page_on_die - [INTERN] read raw page data without ecc
> + * @mtd: mtd info structure
> + * @chip: nand chip info structure
> + * @buf: buffer to store read data
> + * @oob_required: caller requires OOB data read to chip->oob_poi
> + * @page: page number to read
> + */
> +static int nand_read_page_on_die(struct mtd_info *mtd, struct nand_chip *chip,
> +				 uint8_t *buf, int oob_required, int page)
> +{
> +	uint32_t failed;
> +	int ret;
> +
> +	failed = mtd->ecc_stats.failed;
> +
> +	ret = check_read_status_on_die(mtd, chip, page);
> +	if (ret < 0 || mtd->ecc_stats.failed != failed)
> +		return ret;
> +
> +	chip->read_buf(mtd, buf, mtd->writesize);
> +	if (oob_required)
> +		chip->read_buf(mtd, chip->oob_poi, mtd->oobsize);
> +	return ret;
> +}
> +
>  /**
>   * nand_read_page_hwecc - [REPLACEABLE] hardware ECC based page read function
>   * @mtd: mtd info structure
> @@ -3069,6 +3182,7 @@ nand_onfi_detect_micron_on_die_ecc(struct mtd_info *mtd,
>  		 * If the chip has on-die ECC enabled, we kind of have
>  		 * to do the same...
>  		 */
> +		chip->ecc.mode = NAND_ECC_HW_ON_DIE;

Nope. nand_base does not make choices about the ECC type to use; that's
the job of the low-level / hardware driver. Refer to my comment on the
first patch; drivers should opt into this somehow.

>  		pr_info("Using on-die ECC\n");
>  	}
>  }
> @@ -3985,6 +4099,25 @@ int nand_scan_tail(struct mtd_info *mtd)
>  		ecc->strength = ecc->bytes * 8 / fls(8 * ecc->size);
>  		break;
>  
> +	case NAND_ECC_HW_ON_DIE:
> +		/* nand_bbt attempts to put Bbt marker at offset 8 in

s/Bbt/BBT/

Also, please see Documentation/CodingStyle regarding the proper
multiline comment style.

> +		   oob, which is used for ECC by Micron

s/oob/OOB/

> +		   MT29F4G16ABADAWP, for example.  Fixed by not using
> +		   OOB for BBT marker.  */
> +		chip->bbt_options |= NAND_BBT_NO_OOB;
> +		chip->ecc.layout = &nand_oob_64_on_die;
> +		chip->ecc.read_page = nand_read_page_on_die;
> +		chip->ecc.read_subpage = nand_read_subpage_on_die;
> +		chip->ecc.write_page = nand_write_page_raw;
> +		chip->ecc.read_oob = nand_read_oob_std;
> +		chip->ecc.read_page_raw = nand_read_page_raw;
> +		chip->ecc.write_page_raw = nand_write_page_raw;
> +		chip->ecc.write_oob = nand_write_oob_std;
> +		chip->ecc.size = 512;
> +		chip->ecc.bytes = 8;
> +		chip->ecc.strength = 4;

This is all way too specific to the particular Micron flash you're
looking at. There are other vendors that use on-die ECC, and any support
here should be written to accommodate future additions (by Micron or
other vendors).

Particularly: the ECC strength/size should be determined per
vendor/flash, not here. Can you get this from ONFI or the ID? At least
the ecc.{strength,bytes,size} should probably be assigned from the
flash-vendor-specific callback.

> +		break;
> +
>  	case NAND_ECC_NONE:
>  		pr_warn("NAND_ECC_NONE selected by board driver. "
>  			   "This is not recommended!\n");
> diff --git a/drivers/of/of_mtd.c b/drivers/of/of_mtd.c
> index b7361ed..c844c84 100644
> --- a/drivers/of/of_mtd.c
> +++ b/drivers/of/of_mtd.c
> @@ -23,6 +23,7 @@ static const char *nand_ecc_modes[] = {
>  	[NAND_ECC_HW_SYNDROME]	= "hw_syndrome",
>  	[NAND_ECC_HW_OOB_FIRST]	= "hw_oob_first",
>  	[NAND_ECC_SOFT_BCH]	= "soft_bch",
> +	[NAND_ECC_HW_ON_DIE]	= "hw_on_die",
>  };
>  
>  /**
> diff --git a/include/linux/mtd/nand.h b/include/linux/mtd/nand.h
> index 780ab58..dbb99b3 100644
> --- a/include/linux/mtd/nand.h
> +++ b/include/linux/mtd/nand.h
> @@ -112,6 +112,7 @@ extern int nand_unlock(struct mtd_info *mtd, loff_t ofs, uint64_t len);
>  /* Status bits */
>  #define NAND_STATUS_FAIL	0x01
>  #define NAND_STATUS_FAIL_N1	0x02
> +#define NAND_STATUS_REWRITE	0x08

How "standard" is this? Is this a Micron-specific status bit? If so, we
should note that in a comment.

>  #define NAND_STATUS_TRUE_READY	0x20
>  #define NAND_STATUS_READY	0x40
>  #define NAND_STATUS_WP		0x80
> @@ -126,6 +127,7 @@ typedef enum {
>  	NAND_ECC_HW_SYNDROME,
>  	NAND_ECC_HW_OOB_FIRST,
>  	NAND_ECC_SOFT_BCH,
> +	NAND_ECC_HW_ON_DIE,
>  } nand_ecc_modes_t;
>  
>  /*

Brian

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH v4 2/5] mtd: nand: Add NAND_ECC_HW_ON_DIE ECC-mode.
  2014-04-01  7:24   ` Brian Norris
@ 2014-04-01 15:41     ` David Mosberger
  0 siblings, 0 replies; 30+ messages in thread
From: David Mosberger @ 2014-04-01 15:41 UTC (permalink / raw)
  To: Brian Norris
  Cc: Gerhard Sittig, linux-mtd@lists.infradead.org, Gupta, Pekon,
	Artem Bityutskiy

Brian,

On Tue, Apr 1, 2014 at 1:24 AM, Brian Norris
<computersforpeace@gmail.com> wrote:
> On Mon, Mar 31, 2014 at 05:28:54PM -0600, David Mosberger wrote:
>> Some Micron chips such as the MT29F4G16ABADAWP have an internal
>> (on-die) ECC-controller that is useful when the host-platform doesn't
>> have hardware-support for multi-bit ECC.  This patch adds
>> NAND_ECC_HW_ON_DIE to support such chips when the driver detects that
>> the on-die ECC controller is enabled.
>>
>> Signed-off-by: David Mosberger <davidm@egauge.net>
>> ---
>>  drivers/mtd/nand/nand_base.c |  133 ++++++++++++++++++++++++++++++++++++++++++
>>  drivers/of/of_mtd.c          |    1 +
>>  include/linux/mtd/nand.h     |    2 +
>>  3 files changed, 136 insertions(+)
>>
>> diff --git a/drivers/mtd/nand/nand_base.c b/drivers/mtd/nand/nand_base.c
>> index f145f00..7047e9f5 100644
>> --- a/drivers/mtd/nand/nand_base.c
>> +++ b/drivers/mtd/nand/nand_base.c
>> @@ -78,6 +78,20 @@ static struct nand_ecclayout nand_oob_64 = {
>>                .length = 38} }
>>  };
>>
>> +static struct nand_ecclayout nand_oob_64_on_die = {
>
> This is not the only possible layout for on-die ECC. It's probably not
> even the only one used by Micron, is it? What about larger page sizes,
> for instance?

I don't know.  I never claimed my patch is perfect, but it's better than
what's in the standard kernel now.

If somebody knows how to improve this, I'm happy to test any patches
to make sure it keeps working with the NAND we happened to use.

>> +     case NAND_ECC_HW_ON_DIE:
>> +             /* nand_bbt attempts to put Bbt marker at offset 8 in
>
> s/Bbt/BBT/

Check nand_bbt.c.  bbt_patern is actually Bbt, not BBT.

> Also, please see Documentation/CodingStyle regarding the proper
> multiline comment style.

Again, not something flagged by chkpatch, but sure, I updated my code, thanks.

>> +                oob, which is used for ECC by Micron
>
> s/oob/OOB/

OK.

>> +                MT29F4G16ABADAWP, for example.  Fixed by not using
>> +                OOB for BBT marker.  */
>> +             chip->bbt_options |= NAND_BBT_NO_OOB;
>> +             chip->ecc.layout = &nand_oob_64_on_die;
>> +             chip->ecc.read_page = nand_read_page_on_die;
>> +             chip->ecc.read_subpage = nand_read_subpage_on_die;
>> +             chip->ecc.write_page = nand_write_page_raw;
>> +             chip->ecc.read_oob = nand_read_oob_std;
>> +             chip->ecc.read_page_raw = nand_read_page_raw;
>> +             chip->ecc.write_page_raw = nand_write_page_raw;
>> +             chip->ecc.write_oob = nand_write_oob_std;
>> +             chip->ecc.size = 512;
>> +             chip->ecc.bytes = 8;
>> +             chip->ecc.strength = 4;
>
> This is all way too specific to the particular Micron flash you're
> looking at. There are other vendors that use on-die ECC, and any support
> here should be written to accommodate future additions (by Micron or
> other vendors).
>
> Particularly: the ECC strength/size should be determined per
> vendor/flash, not here. Can you get this from ONFI or the ID? At least
> the ecc.{strength,bytes,size} should probably be assigned from the
> flash-vendor-specific callback.

Perhaps, I just don't know how to do that.  If somebody sends me a patch,
I'll be happy to test.

>> @@ -112,6 +112,7 @@ extern int nand_unlock(struct mtd_info *mtd, loff_t ofs, uint64_t len);
>>  /* Status bits */
>>  #define NAND_STATUS_FAIL     0x01
>>  #define NAND_STATUS_FAIL_N1  0x02
>> +#define NAND_STATUS_REWRITE  0x08
>
> How "standard" is this? Is this a Micron-specific status bit? If so, we
> should note that in a comment.

I don't know.

  --david
-- 
eGauge Systems LLC, http://egauge.net/, 1.877-EGAUGE1, fax 720.545.9768

^ permalink raw reply	[flat|nested] 30+ messages in thread

* [PATCH v4 3/5] mtd: nand: Enable subpage-reads on flashes with on-die ECC enabled.
  2014-03-31 23:28 [PATCH v4 0/5] mtd: nand: Add on-die ECC support David Mosberger
  2014-03-31 23:28 ` [PATCH v4 1/5] mtd: nand: Detect Micron flash with on-die ECC (aka "internal ECC") enabled David Mosberger
  2014-03-31 23:28 ` [PATCH v4 2/5] mtd: nand: Add NAND_ECC_HW_ON_DIE ECC-mode David Mosberger
@ 2014-03-31 23:28 ` David Mosberger
  2014-03-31 23:28 ` [PATCH v4 4/5] mtd: nand: Allocate extra buffers needed for on-die ECC controller David Mosberger
                   ` (2 subsequent siblings)
  5 siblings, 0 replies; 30+ messages in thread
From: David Mosberger @ 2014-03-31 23:28 UTC (permalink / raw)
  To: pekon; +Cc: David Mosberger, gsi, computersforpeace, linux-mtd, dedekind1

As far as we know, all flash chips with the Micron on-die ECC hardware
have subpage capability, so enable it for that case.

Signed-off-by: David Mosberger <davidm@egauge.net>
---
 drivers/mtd/nand/nand_base.c |    9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/drivers/mtd/nand/nand_base.c b/drivers/mtd/nand/nand_base.c
index 7047e9f5..435ef44 100644
--- a/drivers/mtd/nand/nand_base.c
+++ b/drivers/mtd/nand/nand_base.c
@@ -4185,8 +4185,13 @@ int nand_scan_tail(struct mtd_info *mtd)
 	/* Invalidate the pagebuffer reference */
 	chip->pagebuf = -1;
 
-	/* Large page NAND with SOFT_ECC should support subpage reads */
-	if ((ecc->mode == NAND_ECC_SOFT) && (chip->page_shift > 9))
+	/*
+	 * Large page NAND with SOFT_ECC or on-die ECC should support
+	 * subpage reads.
+	 */
+	if (((ecc->mode == NAND_ECC_SOFT)
+	     || (chip->ecc.mode == NAND_ECC_HW_ON_DIE))
+	    && (chip->page_shift > 9))
 		chip->options |= NAND_SUBPAGE_READ;
 
 	/* Fill in remaining MTD driver data */
-- 
1.7.9.5

^ permalink raw reply related	[flat|nested] 30+ messages in thread

* [PATCH v4 4/5] mtd: nand: Allocate extra buffers needed for on-die ECC controller.
  2014-03-31 23:28 [PATCH v4 0/5] mtd: nand: Add on-die ECC support David Mosberger
                   ` (2 preceding siblings ...)
  2014-03-31 23:28 ` [PATCH v4 3/5] mtd: nand: Enable subpage-reads on flashes with on-die ECC enabled David Mosberger
@ 2014-03-31 23:28 ` David Mosberger
  2014-04-01  7:28   ` Brian Norris
  2014-03-31 23:28 ` [PATCH v4 5/5] mtd: nand: Improve bitflip detection for on-die ECC scheme David Mosberger
  2014-04-01  8:02 ` [PATCH v4 0/5] mtd: nand: Add on-die ECC support Brian Norris
  5 siblings, 1 reply; 30+ messages in thread
From: David Mosberger @ 2014-03-31 23:28 UTC (permalink / raw)
  To: pekon; +Cc: David Mosberger, gsi, computersforpeace, linux-mtd, dedekind1

To avoid unnecessary rewrites, it is necessary to count the number of
actual bitflips that occurred when the NAND_STATUS_REWRITE bit is set.
This patch introduces the extra buffers needed to implement that
counting.  The actual counting is in the next patch.

Signed-off-by: David Mosberger <davidm@egauge.net>
---
 drivers/mtd/nand/nand_base.c |   13 ++++++++++++-
 include/linux/mtd/nand.h     |    2 ++
 2 files changed, 14 insertions(+), 1 deletion(-)

diff --git a/drivers/mtd/nand/nand_base.c b/drivers/mtd/nand/nand_base.c
index 435ef44..834352a 100644
--- a/drivers/mtd/nand/nand_base.c
+++ b/drivers/mtd/nand/nand_base.c
@@ -3935,13 +3935,24 @@ int nand_scan_tail(struct mtd_info *mtd)
 			!(chip->bbt_options & NAND_BBT_USE_FLASH));
 
 	if (!(chip->options & NAND_OWN_BUFFERS)) {
+		size_t on_die_bufsz = 0;
+
+		if (chip->ecc.mode == NAND_ECC_HW_ON_DIE)
+			on_die_bufsz = 2*(mtd->writesize + mtd->oobsize);
+
 		nbuf = kzalloc(sizeof(*nbuf) + mtd->writesize
-				+ mtd->oobsize * 3, GFP_KERNEL);
+				+ mtd->oobsize * 3 + on_die_bufsz, GFP_KERNEL);
 		if (!nbuf)
 			return -ENOMEM;
 		nbuf->ecccalc = (uint8_t *)(nbuf + 1);
 		nbuf->ecccode = nbuf->ecccalc + mtd->oobsize;
 		nbuf->databuf = nbuf->ecccode + mtd->oobsize;
+		if (chip->ecc.mode == NAND_ECC_HW_ON_DIE) {
+			nbuf->chkbuf = (nbuf->databuf + mtd->writesize
+					+ mtd->oobsize);
+			nbuf->rawbuf = (nbuf->chkbuf + mtd->writesize
+					+ mtd->oobsize);
+		}
 
 		chip->buffers = nbuf;
 	} else {
diff --git a/include/linux/mtd/nand.h b/include/linux/mtd/nand.h
index dbb99b3..456809b 100644
--- a/include/linux/mtd/nand.h
+++ b/include/linux/mtd/nand.h
@@ -533,6 +533,8 @@ struct nand_buffers {
 	uint8_t	*ecccalc;
 	uint8_t	*ecccode;
 	uint8_t *databuf;
+	uint8_t *chkbuf;
+	uint8_t *rawbuf;
 };
 
 /**
-- 
1.7.9.5

^ permalink raw reply related	[flat|nested] 30+ messages in thread

* Re: [PATCH v4 4/5] mtd: nand: Allocate extra buffers needed for on-die ECC controller.
  2014-03-31 23:28 ` [PATCH v4 4/5] mtd: nand: Allocate extra buffers needed for on-die ECC controller David Mosberger
@ 2014-04-01  7:28   ` Brian Norris
  2014-04-01  7:37     ` Gupta, Pekon
  0 siblings, 1 reply; 30+ messages in thread
From: Brian Norris @ 2014-04-01  7:28 UTC (permalink / raw)
  To: David Mosberger; +Cc: gsi, linux-mtd, pekon, dedekind1

On Mon, Mar 31, 2014 at 05:28:56PM -0600, David Mosberger wrote:
> To avoid unnecessary rewrites, it is necessary to count the number of
> actual bitflips that occurred when the NAND_STATUS_REWRITE bit is set.
> This patch introduces the extra buffers needed to implement that
> counting.  The actual counting is in the next patch.
> 
> Signed-off-by: David Mosberger <davidm@egauge.net>
> ---
>  drivers/mtd/nand/nand_base.c |   13 ++++++++++++-
>  include/linux/mtd/nand.h     |    2 ++
>  2 files changed, 14 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/mtd/nand/nand_base.c b/drivers/mtd/nand/nand_base.c
> index 435ef44..834352a 100644
> --- a/drivers/mtd/nand/nand_base.c
> +++ b/drivers/mtd/nand/nand_base.c
> @@ -3935,13 +3935,24 @@ int nand_scan_tail(struct mtd_info *mtd)
>  			!(chip->bbt_options & NAND_BBT_USE_FLASH));
>  
>  	if (!(chip->options & NAND_OWN_BUFFERS)) {
> +		size_t on_die_bufsz = 0;
> +
> +		if (chip->ecc.mode == NAND_ECC_HW_ON_DIE)
> +			on_die_bufsz = 2*(mtd->writesize + mtd->oobsize);

Spacing:

			on_die_bufsz = 2 * (mtd->writesize + mtd->oobsize);

> +
>  		nbuf = kzalloc(sizeof(*nbuf) + mtd->writesize
> -				+ mtd->oobsize * 3, GFP_KERNEL);
> +				+ mtd->oobsize * 3 + on_die_bufsz, GFP_KERNEL);
>  		if (!nbuf)
>  			return -ENOMEM;
>  		nbuf->ecccalc = (uint8_t *)(nbuf + 1);
>  		nbuf->ecccode = nbuf->ecccalc + mtd->oobsize;
>  		nbuf->databuf = nbuf->ecccode + mtd->oobsize;
> +		if (chip->ecc.mode == NAND_ECC_HW_ON_DIE) {
> +			nbuf->chkbuf = (nbuf->databuf + mtd->writesize
> +					+ mtd->oobsize);
> +			nbuf->rawbuf = (nbuf->chkbuf + mtd->writesize
> +					+ mtd->oobsize);
> +		}
>  
>  		chip->buffers = nbuf;
>  	} else {
> diff --git a/include/linux/mtd/nand.h b/include/linux/mtd/nand.h
> index dbb99b3..456809b 100644
> --- a/include/linux/mtd/nand.h
> +++ b/include/linux/mtd/nand.h
> @@ -533,6 +533,8 @@ struct nand_buffers {
>  	uint8_t	*ecccalc;
>  	uint8_t	*ecccode;
>  	uint8_t *databuf;
> +	uint8_t *chkbuf;
> +	uint8_t *rawbuf;

Do you really need two additional buffers? Can you get by with just one
of them?

>  };
>  
>  /**

Brian

^ permalink raw reply	[flat|nested] 30+ messages in thread

* RE: [PATCH v4 4/5] mtd: nand: Allocate extra buffers needed for on-die ECC controller.
  2014-04-01  7:28   ` Brian Norris
@ 2014-04-01  7:37     ` Gupta, Pekon
  2014-04-01  8:24       ` Brian Norris
  0 siblings, 1 reply; 30+ messages in thread
From: Gupta, Pekon @ 2014-04-01  7:37 UTC (permalink / raw)
  To: Brian Norris, David Mosberger
  Cc: gsi@denx.de, linux-mtd@lists.infradead.org, dedekind1@gmail.com

Hi Brian,

>From: Brian Norris [mailto:computersforpeace@gmail.com]
>>On Mon, Mar 31, 2014 at 05:28:56PM -0600, David Mosberger wrote:
[...]
>> diff --git a/include/linux/mtd/nand.h b/include/linux/mtd/nand.h
>> index dbb99b3..456809b 100644
>> --- a/include/linux/mtd/nand.h
>> +++ b/include/linux/mtd/nand.h
>> @@ -533,6 +533,8 @@ struct nand_buffers {
>>  	uint8_t	*ecccalc;
>>  	uint8_t	*ecccode;
>>  	uint8_t *databuf;
>> +	uint8_t *chkbuf;
>> +	uint8_t *rawbuf;
>
>Do you really need two additional buffers? Can you get by with just one
>of them?
>

Some similar comments have been provided in the previous versions
of the patch. But due to in-consistency in $subject, they might not
be visible as a thread of same discussion.
You may like to review and comment on below threads also.

[PATCH] mtd: nand: Add support for Micron on-die ECC controller (rev2).
http://lists.infradead.org/pipermail/linux-mtd/2014-March/052949.html

[RFC] mtd: nand: Preparatory patch for adding on-die ECC controller support ...
http://lists.infradead.org/pipermail/linux-mtd/2014-March/052969.html

with regards, pekon

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH v4 4/5] mtd: nand: Allocate extra buffers needed for on-die ECC controller.
  2014-04-01  7:37     ` Gupta, Pekon
@ 2014-04-01  8:24       ` Brian Norris
  0 siblings, 0 replies; 30+ messages in thread
From: Brian Norris @ 2014-04-01  8:24 UTC (permalink / raw)
  To: Gupta, Pekon
  Cc: David Mosberger, gsi@denx.de, linux-mtd@lists.infradead.org,
	dedekind1@gmail.com

Hi Pekon,

On Tue, Apr 01, 2014 at 07:37:46AM +0000, Pekon Gupta wrote:
> >From: Brian Norris [mailto:computersforpeace@gmail.com]
> >>On Mon, Mar 31, 2014 at 05:28:56PM -0600, David Mosberger wrote:
> [...]
> >> diff --git a/include/linux/mtd/nand.h b/include/linux/mtd/nand.h
> >> index dbb99b3..456809b 100644
> >> --- a/include/linux/mtd/nand.h
> >> +++ b/include/linux/mtd/nand.h
> >> @@ -533,6 +533,8 @@ struct nand_buffers {
> >>  	uint8_t	*ecccalc;
> >>  	uint8_t	*ecccode;
> >>  	uint8_t *databuf;
> >> +	uint8_t *chkbuf;
> >> +	uint8_t *rawbuf;
> >
> >Do you really need two additional buffers? Can you get by with just one
> >of them?
> >
> 
> Some similar comments have been provided in the previous versions
> of the patch. But due to in-consistency in $subject, they might not
> be visible as a thread of same discussion.

Yeah, I didn't review every iteration so far. I'm possibly repeating
some things. I'll see where to respond on the old threads.

But here's a note to David: it looks like you sent 4 revisions in one
week. That's a bit fast. Perhaps you can wait a few more days on
spinning patch sets, especially when review is straddling a weekend like
that.

> You may like to review and comment on below threads also.
> 
> [PATCH] mtd: nand: Add support for Micron on-die ECC controller (rev2).
> http://lists.infradead.org/pipermail/linux-mtd/2014-March/052949.html
> 
> [RFC] mtd: nand: Preparatory patch for adding on-die ECC controller support ...
> http://lists.infradead.org/pipermail/linux-mtd/2014-March/052969.html

Brian

^ permalink raw reply	[flat|nested] 30+ messages in thread

* [PATCH v4 5/5] mtd: nand: Improve bitflip detection for on-die ECC scheme.
  2014-03-31 23:28 [PATCH v4 0/5] mtd: nand: Add on-die ECC support David Mosberger
                   ` (3 preceding siblings ...)
  2014-03-31 23:28 ` [PATCH v4 4/5] mtd: nand: Allocate extra buffers needed for on-die ECC controller David Mosberger
@ 2014-03-31 23:28 ` David Mosberger
  2014-04-01  6:29   ` Gupta, Pekon
  2014-04-01  7:50   ` Brian Norris
  2014-04-01  8:02 ` [PATCH v4 0/5] mtd: nand: Add on-die ECC support Brian Norris
  5 siblings, 2 replies; 30+ messages in thread
From: David Mosberger @ 2014-03-31 23:28 UTC (permalink / raw)
  To: pekon; +Cc: David Mosberger, gsi, computersforpeace, linux-mtd, dedekind1

When NAND_STATUS_REWRITE is set, there is no direct indication as to
how many bits have were flipped.  This patch improves the existing
code by manually counting the number of bitflips, rather than just
assuming the worst-case of mtd->bitflip_threshold.  This avoids
unnecessary rewrites, e.g., due to a single stuck bit.

Signed-off-by: David Mosberger <davidm@egauge.net>
---
 drivers/mtd/nand/nand_base.c |   90 ++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 86 insertions(+), 4 deletions(-)

diff --git a/drivers/mtd/nand/nand_base.c b/drivers/mtd/nand/nand_base.c
index 834352a..80cfaa8 100644
--- a/drivers/mtd/nand/nand_base.c
+++ b/drivers/mtd/nand/nand_base.c
@@ -1272,6 +1272,84 @@ static int nand_read_subpage(struct mtd_info *mtd, struct nand_chip *chip,
 	return max_bitflips;
 }
 
+static int
+set_on_die_ecc(struct mtd_info *mtd, struct nand_chip *chip, int on)
+{
+	u8 data[ONFI_SUBFEATURE_PARAM_LEN] = { 0, };
+
+	if (chip->ecc.mode != NAND_ECC_HW_ON_DIE)
+		return 0;
+
+	if (on)
+		data[0] = ONFI_FEATURE_ARRAY_OP_MODE_ENABLE_ON_DIE_ECC;
+
+	return chip->onfi_set_features(mtd, chip,
+				       ONFI_FEATURE_ADDR_ARRAY_OP_MODE, data);
+}
+
+/*
+ * Return the number of bits that differ between buffers SRC1 and
+ * SRC2, both of which are LEN bytes long.
+ *
+ * This code could be optimized for, but it only gets called on pages
+ * with bitflips and compared to the cost of migrating an eraseblock,
+ * the execution time here is trivial...
+ */
+static int
+bitdiff(const void *s1, const void *s2, size_t len)
+{
+	const uint8_t *src1 = s1, *src2 = s2;
+	int count = 0, i;
+
+	for (i = 0; i < len; ++i)
+		count += hweight8(*src1++ ^ *src2++);
+	return count;
+}
+
+static int
+check_for_bitflips(struct mtd_info *mtd, struct nand_chip *chip, int page)
+{
+	int flips = 0, max_bitflips = 0, i, j, read_size;
+	uint8_t *chkbuf, *rawbuf, *chkoob, *rawoob;
+	uint32_t *eccpos;
+
+	chkbuf = chip->buffers->chkbuf;
+	rawbuf = chip->buffers->rawbuf;
+	read_size = mtd->writesize + mtd->oobsize;
+
+	/* Read entire page w/OOB area with on-die ECC on: */
+	chip->read_buf(mtd, chkbuf, read_size);
+
+	/* Re-read page with on-die ECC off: */
+	set_on_die_ecc(mtd, chip, 0);
+	chip->cmdfunc(mtd, NAND_CMD_READ0, 0x00, page);
+	chip->read_buf(mtd, rawbuf, read_size);
+	set_on_die_ecc(mtd, chip, 1);
+
+	chkoob = chkbuf + mtd->writesize;
+	rawoob = rawbuf + mtd->writesize;
+	eccpos = chip->ecc.layout->eccpos;
+	for (i = 0; i < chip->ecc.steps; ++i) {
+		/* Count bit flips in the actual data area: */
+		flips = bitdiff(chkbuf, rawbuf, chip->ecc.size);
+		/* Count bit flips in the ECC bytes: */
+		for (j = 0; j < chip->ecc.bytes; ++j) {
+			flips += hweight8(chkoob[*eccpos] ^ rawoob[*eccpos]);
+			++eccpos;
+		}
+		if (flips > 0)
+			mtd->ecc_stats.corrected += flips;
+		max_bitflips = max_t(int, max_bitflips, flips);
+		chkbuf += chip->ecc.size;
+		rawbuf += chip->ecc.size;
+	}
+
+	/* Re-issue the READ command for the actual data read that follows.  */
+	chip->cmdfunc(mtd, NAND_CMD_READ0, 0x00, page);
+
+	return max_bitflips;
+}
+
 static int check_read_status_on_die(struct mtd_info *mtd,
 				    struct nand_chip *chip, int page)
 {
@@ -1290,11 +1368,15 @@ static int check_read_status_on_die(struct mtd_info *mtd,
 		mtd->ecc_stats.failed++;
 	} else if (status & NAND_STATUS_REWRITE) {
 		/*
-		 * Simple but suboptimal: any page with a single stuck
-		 * bit will be unusable since it'll be rewritten on
-		 * each read...
+		 * The Micron chips turn on the REWRITE status bit for
+		 * ANY bit flips.  Some pages have stuck bits, so we
+		 * don't want to migrate a block just because of
+		 * single bit errors because otherwise, that block
+		 * would effectively become unusable.  So, work out in
+		 * software what the max number of flipped bits is for
+		 * all subpages in a page:
 		 */
-		max_bitflips = mtd->bitflip_threshold;
+		max_bitflips = check_for_bitflips(mtd, chip, page);
 	}
 	return max_bitflips;
 }
-- 
1.7.9.5

^ permalink raw reply related	[flat|nested] 30+ messages in thread

* RE: [PATCH v4 5/5] mtd: nand: Improve bitflip detection for on-die ECC scheme.
  2014-03-31 23:28 ` [PATCH v4 5/5] mtd: nand: Improve bitflip detection for on-die ECC scheme David Mosberger
@ 2014-04-01  6:29   ` Gupta, Pekon
  2014-04-01 15:51     ` David Mosberger
  2014-04-01  7:50   ` Brian Norris
  1 sibling, 1 reply; 30+ messages in thread
From: Gupta, Pekon @ 2014-04-01  6:29 UTC (permalink / raw)
  To: David Mosberger
  Cc: gsi@denx.de, computersforpeace@gmail.com,
	linux-mtd@lists.infradead.org, dedekind1@gmail.com

Hi David,

>From: David Mosberger
>
>When NAND_STATUS_REWRITE is set, there is no direct indication as to
>how many bits have were flipped.  This patch improves the existing
>code by manually counting the number of bitflips, rather than just
>assuming the worst-case of mtd->bitflip_threshold.  This avoids
>unnecessary rewrites, e.g., due to a single stuck bit.
>
>Signed-off-by: David Mosberger <davidm@egauge.net>
>---
> drivers/mtd/nand/nand_base.c |   90 ++++++++++++++++++++++++++++++++++++++++--
> 1 file changed, 86 insertions(+), 4 deletions(-)
>
>diff --git a/drivers/mtd/nand/nand_base.c b/drivers/mtd/nand/nand_base.c
>index 834352a..80cfaa8 100644
>--- a/drivers/mtd/nand/nand_base.c
>+++ b/drivers/mtd/nand/nand_base.c
>@@ -1272,6 +1272,84 @@ static int nand_read_subpage(struct mtd_info *mtd, struct nand_chip *chip,
> 	return max_bitflips;
> }
>
>+static int
>+set_on_die_ecc(struct mtd_info *mtd, struct nand_chip *chip, int on)
>+{
>+	u8 data[ONFI_SUBFEATURE_PARAM_LEN] = { 0, };
>+
>+	if (chip->ecc.mode != NAND_ECC_HW_ON_DIE)
>+		return 0;
>+
>+	if (on)
>+		data[0] = ONFI_FEATURE_ARRAY_OP_MODE_ENABLE_ON_DIE_ECC;
>+
>+	return chip->onfi_set_features(mtd, chip,
>+				       ONFI_FEATURE_ADDR_ARRAY_OP_MODE, data);
>+}
>+
>+/*
>+ * Return the number of bits that differ between buffers SRC1 and
>+ * SRC2, both of which are LEN bytes long.
>+ *
>+ * This code could be optimized for, but it only gets called on pages
>+ * with bitflips and compared to the cost of migrating an eraseblock,
>+ * the execution time here is trivial...
>+ */
>+static int
>+bitdiff(const void *s1, const void *s2, size_t len)
>+{
>+	const uint8_t *src1 = s1, *src2 = s2;
>+	int count = 0, i;
>+
>+	for (i = 0; i < len; ++i)
>+		count += hweight8(*src1++ ^ *src2++);
>+	return count;

hweight8() might not be good option, how about using hweight32 ?

>+}
>+
>+static int
>+check_for_bitflips(struct mtd_info *mtd, struct nand_chip *chip, int page)
>+{
>+	int flips = 0, max_bitflips = 0, i, j, read_size;
>+	uint8_t *chkbuf, *rawbuf, *chkoob, *rawoob;
>+	uint32_t *eccpos;
>+
>+	chkbuf = chip->buffers->chkbuf;
>+	rawbuf = chip->buffers->rawbuf;
>+	read_size = mtd->writesize + mtd->oobsize;
>+
>+	/* Read entire page w/OOB area with on-die ECC on: */
>+	chip->read_buf(mtd, chkbuf, read_size);
>+
>+	/* Re-read page with on-die ECC off: */
>+	set_on_die_ecc(mtd, chip, 0);
>+	chip->cmdfunc(mtd, NAND_CMD_READ0, 0x00, page);
>+	chip->read_buf(mtd, rawbuf, read_size);
>+	set_on_die_ecc(mtd, chip, 1);
>+
>+	chkoob = chkbuf + mtd->writesize;
>+	rawoob = rawbuf + mtd->writesize;
>+	eccpos = chip->ecc.layout->eccpos;
>+	for (i = 0; i < chip->ecc.steps; ++i) {
>+		/* Count bit flips in the actual data area: */
>+		flips = bitdiff(chkbuf, rawbuf, chip->ecc.size);

Do you really need this as separate function call ?
You could make it inline, like you did below of OOB.
Also, you may stop checking bitflips, once count > chip->ecc.strength.

Some of the discussion on different patch may be helpful here, please see
Brian's comments on [1].

>+		/* Count bit flips in the ECC bytes: */
>+		for (j = 0; j < chip->ecc.bytes; ++j) {

Again, why checking only for ecc.bytes ? should have been.

>+			flips += hweight8(chkoob[*eccpos] ^ rawoob[*eccpos]);
>+			++eccpos;
>+		}
>+		if (flips > 0)
>+			mtd->ecc_stats.corrected += flips;
>+		max_bitflips = max_t(int, max_bitflips, flips);
>+		chkbuf += chip->ecc.size;
>+		rawbuf += chip->ecc.size;
>+	}
>+
>+	/* Re-issue the READ command for the actual data read that follows.  */
>+	chip->cmdfunc(mtd, NAND_CMD_READ0, 0x00, page);
>+

If you refer my previous feedbacks, you could avoided this extra
"chip->read_buf(mtd, chkbuf, read_size);" by calling it in nand_page_read_on_die();

>+	return max_bitflips;
>+}
>+
> static int check_read_status_on_die(struct mtd_info *mtd,
> 				    struct nand_chip *chip, int page)
> {
>@@ -1290,11 +1368,15 @@ static int check_read_status_on_die(struct mtd_info *mtd,
> 		mtd->ecc_stats.failed++;
> 	} else if (status & NAND_STATUS_REWRITE) {
> 		/*
>-		 * Simple but suboptimal: any page with a single stuck
>-		 * bit will be unusable since it'll be rewritten on
>-		 * each read...
>+		 * The Micron chips turn on the REWRITE status bit for
>+		 * ANY bit flips.  Some pages have stuck bits, so we
>+		 * don't want to migrate a block just because of
>+		 * single bit errors because otherwise, that block
>+		 * would effectively become unusable.  So, work out in
>+		 * software what the max number of flipped bits is for
>+		 * all subpages in a page:
> 		 */
>-		max_bitflips = mtd->bitflip_threshold;
>+		max_bitflips = check_for_bitflips(mtd, chip, page);

We usually don't delete the changes, introduced in previous patches
of same series. So, take one approach and then logically divide the patches.

> 	}
> 	return max_bitflips;
> }
>--
>1.7.9.5
>

Again, please review all the comments given by Gerhard and myself
on previous patches. Don't hurry in sending newer revisions, as it
would just increase your iterations.

[1] http://lists.infradead.org/pipermail/linux-mtd/2014-March/052625.html

with regards, pekon

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH v4 5/5] mtd: nand: Improve bitflip detection for on-die ECC scheme.
  2014-04-01  6:29   ` Gupta, Pekon
@ 2014-04-01 15:51     ` David Mosberger
  2014-04-01 17:30       ` Brian Norris
  0 siblings, 1 reply; 30+ messages in thread
From: David Mosberger @ 2014-04-01 15:51 UTC (permalink / raw)
  To: Gupta, Pekon
  Cc: gsi@denx.de, computersforpeace@gmail.com,
	linux-mtd@lists.infradead.org, dedekind1@gmail.com

Pekon,

On Tue, Apr 1, 2014 at 12:29 AM, Gupta, Pekon <pekon@ti.com> wrote:

>>+/*
>>+ * Return the number of bits that differ between buffers SRC1 and
>>+ * SRC2, both of which are LEN bytes long.
>>+ *
>>+ * This code could be optimized for, but it only gets called on pages
>>+ * with bitflips and compared to the cost of migrating an eraseblock,
>>+ * the execution time here is trivial...
>>+ */
>>+static int
>>+bitdiff(const void *s1, const void *s2, size_t len)
>>+{
>>+      const uint8_t *src1 = s1, *src2 = s2;
>>+      int count = 0, i;
>>+
>>+      for (i = 0; i < len; ++i)
>>+              count += hweight8(*src1++ ^ *src2++);
>>+      return count;
>
> hweight8() might not be good option, how about using hweight32 ?

Why don't you read the comment above.  Yes, you can add more code
and make it faster and it will not matter one bit, so I chose to go with
the shortest code possible.  Sue me.

>>+                      flips += hweight8(chkoob[*eccpos] ^ rawoob[*eccpos]);
>>+                      ++eccpos;
>>+              }
>>+              if (flips > 0)
>>+                      mtd->ecc_stats.corrected += flips;
>>+              max_bitflips = max_t(int, max_bitflips, flips);
>>+              chkbuf += chip->ecc.size;
>>+              rawbuf += chip->ecc.size;
>>+      }
>>+
>>+      /* Re-issue the READ command for the actual data read that follows.  */
>>+      chip->cmdfunc(mtd, NAND_CMD_READ0, 0x00, page);
>>+
>
> If you refer my previous feedbacks, you could avoided this extra
> "chip->read_buf(mtd, chkbuf, read_size);" by calling it in nand_page_read_on_die();

The code you quote has no chip->read_buf call so I'm not sure how to avoid
something that isn't there.  I'm more than happy to address your concern once
I understand what it is.

  --david
-- 
eGauge Systems LLC, http://egauge.net/, 1.877-EGAUGE1, fax 720.545.9768

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH v4 5/5] mtd: nand: Improve bitflip detection for on-die ECC scheme.
  2014-04-01 15:51     ` David Mosberger
@ 2014-04-01 17:30       ` Brian Norris
  0 siblings, 0 replies; 30+ messages in thread
From: Brian Norris @ 2014-04-01 17:30 UTC (permalink / raw)
  To: David Mosberger
  Cc: gsi@denx.de, linux-mtd@lists.infradead.org, Gupta, Pekon,
	dedekind1@gmail.com

On Tue, Apr 01, 2014 at 09:51:13AM -0600, David Mosberger wrote:
> On Tue, Apr 1, 2014 at 12:29 AM, Gupta, Pekon <pekon@ti.com> wrote:
> >>+/*
> >>+ * Return the number of bits that differ between buffers SRC1 and
> >>+ * SRC2, both of which are LEN bytes long.
> >>+ *
> >>+ * This code could be optimized for, but it only gets called on pages
> >>+ * with bitflips and compared to the cost of migrating an eraseblock,
> >>+ * the execution time here is trivial...
> >>+ */
> >>+static int
> >>+bitdiff(const void *s1, const void *s2, size_t len)
> >>+{
> >>+      const uint8_t *src1 = s1, *src2 = s2;
> >>+      int count = 0, i;
> >>+
> >>+      for (i = 0; i < len; ++i)
> >>+              count += hweight8(*src1++ ^ *src2++);
> >>+      return count;
> >
> > hweight8() might not be good option, how about using hweight32 ?
> 
> Why don't you read the comment above.  Yes, you can add more code
> and make it faster and it will not matter one bit, so I chose to go with
> the shortest code possible.  Sue me.

I'm guilty of making the same comment, and it really isn't warranted in
some cases where simplicity should win. I'm fine taking this piece
as-is, I suppose, since it likely experiences a low bit-error rate. But
I would caution that bit errors are becoming increasingly common, so
your "trivial" comment may not age well.

> >>+                      flips += hweight8(chkoob[*eccpos] ^ rawoob[*eccpos]);
> >>+                      ++eccpos;
> >>+              }
> >>+              if (flips > 0)
> >>+                      mtd->ecc_stats.corrected += flips;
> >>+              max_bitflips = max_t(int, max_bitflips, flips);
> >>+              chkbuf += chip->ecc.size;
> >>+              rawbuf += chip->ecc.size;
> >>+      }

Brian

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH v4 5/5] mtd: nand: Improve bitflip detection for on-die ECC scheme.
  2014-03-31 23:28 ` [PATCH v4 5/5] mtd: nand: Improve bitflip detection for on-die ECC scheme David Mosberger
  2014-04-01  6:29   ` Gupta, Pekon
@ 2014-04-01  7:50   ` Brian Norris
       [not found]     ` <CALnQHM2Afp8LD6MtGQTT5jrcb9xJdYXRGD0TZ_s5GASZsbRZeg@mail.gmail.com>
  1 sibling, 1 reply; 30+ messages in thread
From: Brian Norris @ 2014-04-01  7:50 UTC (permalink / raw)
  To: David Mosberger; +Cc: gsi, linux-mtd, pekon, dedekind1

On Mon, Mar 31, 2014 at 05:28:57PM -0600, David Mosberger wrote:
> When NAND_STATUS_REWRITE is set, there is no direct indication as to
> how many bits have were flipped.  This patch improves the existing
> code by manually counting the number of bitflips, rather than just
> assuming the worst-case of mtd->bitflip_threshold.  This avoids
> unnecessary rewrites, e.g., due to a single stuck bit.
> 
> Signed-off-by: David Mosberger <davidm@egauge.net>
> ---
>  drivers/mtd/nand/nand_base.c |   90 ++++++++++++++++++++++++++++++++++++++++--
>  1 file changed, 86 insertions(+), 4 deletions(-)
> 
> diff --git a/drivers/mtd/nand/nand_base.c b/drivers/mtd/nand/nand_base.c
> index 834352a..80cfaa8 100644
> --- a/drivers/mtd/nand/nand_base.c
> +++ b/drivers/mtd/nand/nand_base.c
> @@ -1272,6 +1272,84 @@ static int nand_read_subpage(struct mtd_info *mtd, struct nand_chip *chip,
>  	return max_bitflips;
>  }
>  
> +static int
> +set_on_die_ecc(struct mtd_info *mtd, struct nand_chip *chip, int on)
> +{
> +	u8 data[ONFI_SUBFEATURE_PARAM_LEN] = { 0, };
> +
> +	if (chip->ecc.mode != NAND_ECC_HW_ON_DIE)
> +		return 0;

I think this check is unnecessary, and probably wrong. The caller should
make sure not to call this for devices that don't support it. Or else,
there should at least be an error code, like -EOPNOTSUPP.

> +
> +	if (on)
> +		data[0] = ONFI_FEATURE_ARRAY_OP_MODE_ENABLE_ON_DIE_ECC;
> +
> +	return chip->onfi_set_features(mtd, chip,
> +				       ONFI_FEATURE_ADDR_ARRAY_OP_MODE, data);
> +}

This should be implemented on a per-vendor basis and provided as a
callback (perhaps chip->set_internal_ecc()?). Then, you would only make
chip->set_internal_ecc non-NULL for flash that support it.

> +
> +/*
> + * Return the number of bits that differ between buffers SRC1 and
> + * SRC2, both of which are LEN bytes long.
> + *
> + * This code could be optimized for, but it only gets called on pages
> + * with bitflips and compared to the cost of migrating an eraseblock,
> + * the execution time here is trivial...
> + */
> +static int
> +bitdiff(const void *s1, const void *s2, size_t len)

Please join the above 2 lines.

> +{
> +	const uint8_t *src1 = s1, *src2 = s2;

Kill the local variables.

> +	int count = 0, i;
> +
> +	for (i = 0; i < len; ++i)
> +		count += hweight8(*src1++ ^ *src2++);

It's rather unfortunate that we have to resort to this...

but anyway, we might as well use hweight32() or hweight_long(), right?
And any leftover odd bytes can be caught up with hweight8().

> +	return count;
> +}
> +
> +static int
> +check_for_bitflips(struct mtd_info *mtd, struct nand_chip *chip, int page)
> +{
> +	int flips = 0, max_bitflips = 0, i, j, read_size;
> +	uint8_t *chkbuf, *rawbuf, *chkoob, *rawoob;
> +	uint32_t *eccpos;
> +
> +	chkbuf = chip->buffers->chkbuf;
> +	rawbuf = chip->buffers->rawbuf;
> +	read_size = mtd->writesize + mtd->oobsize;
> +
> +	/* Read entire page w/OOB area with on-die ECC on: */
> +	chip->read_buf(mtd, chkbuf, read_size);

Do you actually need to re-read, or can you use the existing data? Or at
least, you could overwrite the databuf, instead of using a new chkbuf.

> +
> +	/* Re-read page with on-die ECC off: */
> +	set_on_die_ecc(mtd, chip, 0);
> +	chip->cmdfunc(mtd, NAND_CMD_READ0, 0x00, page);
> +	chip->read_buf(mtd, rawbuf, read_size);
> +	set_on_die_ecc(mtd, chip, 1);
> +
> +	chkoob = chkbuf + mtd->writesize;
> +	rawoob = rawbuf + mtd->writesize;
> +	eccpos = chip->ecc.layout->eccpos;
> +	for (i = 0; i < chip->ecc.steps; ++i) {
> +		/* Count bit flips in the actual data area: */
> +		flips = bitdiff(chkbuf, rawbuf, chip->ecc.size);
> +		/* Count bit flips in the ECC bytes: */
> +		for (j = 0; j < chip->ecc.bytes; ++j) {
> +			flips += hweight8(chkoob[*eccpos] ^ rawoob[*eccpos]);

Why didn't you use bitdiff() here too?

> +			++eccpos;
> +		}
> +		if (flips > 0)
> +			mtd->ecc_stats.corrected += flips;
> +		max_bitflips = max_t(int, max_bitflips, flips);
> +		chkbuf += chip->ecc.size;
> +		rawbuf += chip->ecc.size;
> +	}
> +
> +	/* Re-issue the READ command for the actual data read that follows.  */
> +	chip->cmdfunc(mtd, NAND_CMD_READ0, 0x00, page);
> +
> +	return max_bitflips;
> +}
> +
>  static int check_read_status_on_die(struct mtd_info *mtd,
>  				    struct nand_chip *chip, int page)
>  {
> @@ -1290,11 +1368,15 @@ static int check_read_status_on_die(struct mtd_info *mtd,
>  		mtd->ecc_stats.failed++;
>  	} else if (status & NAND_STATUS_REWRITE) {
>  		/*
> -		 * Simple but suboptimal: any page with a single stuck
> -		 * bit will be unusable since it'll be rewritten on
> -		 * each read...
> +		 * The Micron chips turn on the REWRITE status bit for
> +		 * ANY bit flips.  Some pages have stuck bits, so we
> +		 * don't want to migrate a block just because of
> +		 * single bit errors because otherwise, that block
> +		 * would effectively become unusable.  So, work out in
> +		 * software what the max number of flipped bits is for
> +		 * all subpages in a page:

Can you shorten this comment? It's rather verbose, and it's making
assumptions about upper-layer "migrations". I think we can leave it at
something much simpler, like:

	/*
	 * Micron on-die ECC doesn't report the number of bitflips, so
	 * we have to count them ourself to see if the error rate is too
	 * high.
	 */

>  		 */
> -		max_bitflips = mtd->bitflip_threshold;
> +		max_bitflips = check_for_bitflips(mtd, chip, page);
>  	}
>  	return max_bitflips;
>  }

Brian

^ permalink raw reply	[flat|nested] 30+ messages in thread

[parent not found: <CALnQHM2Afp8LD6MtGQTT5jrcb9xJdYXRGD0TZ_s5GASZsbRZeg@mail.gmail.com>]

* Re: [PATCH v4 5/5] mtd: nand: Improve bitflip detection for on-die ECC scheme.
       [not found]     ` <CALnQHM2Afp8LD6MtGQTT5jrcb9xJdYXRGD0TZ_s5GASZsbRZeg@mail.gmail.com>
@ 2014-04-01 17:33       ` Brian Norris
  2014-04-01 18:01         ` Brian Norris
  0 siblings, 1 reply; 30+ messages in thread
From: Brian Norris @ 2014-04-01 17:33 UTC (permalink / raw)
  To: David Mosberger; +Cc: gsi, linux-mtd, Pekon Gupta, Artem Bityutskiy

(Re-constructing CC list and leaving message intact, since you missed
the "Reply-All" button)

On Tue, Apr 01, 2014 at 10:03:00AM -0600, David Mosberger wrote:
> Brian,
> 
> On Tue, Apr 1, 2014 at 1:50 AM, Brian Norris
> <computersforpeace@gmail.com> wrote:
> > On Mon, Mar 31, 2014 at 05:28:57PM -0600, David Mosberger wrote:
> 
> >> +static int
> >> +set_on_die_ecc(struct mtd_info *mtd, struct nand_chip *chip, int on)
> >> +{
> >> +     u8 data[ONFI_SUBFEATURE_PARAM_LEN] = { 0, };
> >> +
> >> +     if (chip->ecc.mode != NAND_ECC_HW_ON_DIE)
> >> +             return 0;
> >
> > I think this check is unnecessary, and probably wrong. The caller should
> > make sure not to call this for devices that don't support it. Or else,
> > there should at least be an error code, like -EOPNOTSUPP.
> 
> Fair enough.  I removed the check for ecc.mode.
> 
> >> +
> >> +     if (on)
> >> +             data[0] = ONFI_FEATURE_ARRAY_OP_MODE_ENABLE_ON_DIE_ECC;
> >> +
> >> +     return chip->onfi_set_features(mtd, chip,
> >> +                                    ONFI_FEATURE_ADDR_ARRAY_OP_MODE, data);
> >> +}
> >
> > This should be implemented on a per-vendor basis and provided as a
> > callback (perhaps chip->set_internal_ecc()?). Then, you would only make
> > chip->set_internal_ecc non-NULL for flash that support it.
> 
> It's not clear at all to me how (un-)standardized this stuff is.  It
> may be Micron specific,
> but it may not be.  I don't know.  Since it's only called for Micron
> chips with on-die enabled,
> the code is safe as it is.
> 
> > Do you actually need to re-read, or can you use the existing data? Or at
> > least, you could overwrite the databuf, instead of using a new chkbuf.
> 
> In general,  you have to (re-)read.  Consider read_oob or read_subpage.
> 
> >> +
> >> +     /* Re-read page with on-die ECC off: */
> >> +     set_on_die_ecc(mtd, chip, 0);
> >> +     chip->cmdfunc(mtd, NAND_CMD_READ0, 0x00, page);
> >> +     chip->read_buf(mtd, rawbuf, read_size);
> >> +     set_on_die_ecc(mtd, chip, 1);
> >> +
> >> +     chkoob = chkbuf + mtd->writesize;
> >> +     rawoob = rawbuf + mtd->writesize;
> >> +     eccpos = chip->ecc.layout->eccpos;
> >> +     for (i = 0; i < chip->ecc.steps; ++i) {
> >> +             /* Count bit flips in the actual data area: */
> >> +             flips = bitdiff(chkbuf, rawbuf, chip->ecc.size);
> >> +             /* Count bit flips in the ECC bytes: */
> >> +             for (j = 0; j < chip->ecc.bytes; ++j) {
> >> +                     flips += hweight8(chkoob[*eccpos] ^ rawoob[*eccpos]);
> >
> > Why didn't you use bitdiff() here too?
> 
> Because the data is not contiguous and I didn't think the overhead
> of an extra function call was warranted for individual bytes.  But yeah,
> we could certainly use bitdiff() here on individual bytes, if you prefer.
> 
> >>               /*
> >> -              * Simple but suboptimal: any page with a single stuck
> >> -              * bit will be unusable since it'll be rewritten on
> >> -              * each read...
> >> +              * The Micron chips turn on the REWRITE status bit for
> >> +              * ANY bit flips.  Some pages have stuck bits, so we
> >> +              * don't want to migrate a block just because of
> >> +              * single bit errors because otherwise, that block
> >> +              * would effectively become unusable.  So, work out in
> >> +              * software what the max number of flipped bits is for
> >> +              * all subpages in a page:
> >
> > Can you shorten this comment? It's rather verbose, and it's making
> > assumptions about upper-layer "migrations". I think we can leave it at
> > something much simpler, like:
> >
> >         /*
> >          * Micron on-die ECC doesn't report the number of bitflips, so
> >          * we have to count them ourself to see if the error rate is too
> >          * high.
> >          */
> 
> Sure, I did add "This is particularly important for pages with stuck
> bits." since
> I think that is an important case to think about here.
> 
>   --david
> -- 
> eGauge Systems LLC, http://egauge.net/, 1.877-EGAUGE1, fax 720.545.9768

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH v4 5/5] mtd: nand: Improve bitflip detection for on-die ECC scheme.
  2014-04-01 17:33       ` Brian Norris
@ 2014-04-01 18:01         ` Brian Norris
  2014-04-01 18:13           ` David Mosberger-Tang
  0 siblings, 1 reply; 30+ messages in thread
From: Brian Norris @ 2014-04-01 18:01 UTC (permalink / raw)
  To: David Mosberger; +Cc: gsi, linux-mtd, Pekon Gupta, Artem Bityutskiy

On Tue, Apr 01, 2014 at 10:33:05AM -0700, Brian Norris wrote:
> (Re-constructing CC list and leaving message intact, since you missed
> the "Reply-All" button)
> 
> On Tue, Apr 01, 2014 at 10:03:00AM -0600, David Mosberger wrote:
> > On Tue, Apr 1, 2014 at 1:50 AM, Brian Norris <computersforpeace@gmail.com> wrote:
> > > On Mon, Mar 31, 2014 at 05:28:57PM -0600, David Mosberger wrote:
> > >> +
> > >> +     if (on)
> > >> +             data[0] = ONFI_FEATURE_ARRAY_OP_MODE_ENABLE_ON_DIE_ECC;
> > >> +
> > >> +     return chip->onfi_set_features(mtd, chip,
> > >> +                                    ONFI_FEATURE_ADDR_ARRAY_OP_MODE, data);
> > >> +}
> > >
> > > This should be implemented on a per-vendor basis and provided as a
> > > callback (perhaps chip->set_internal_ecc()?). Then, you would only make
> > > chip->set_internal_ecc non-NULL for flash that support it.
> > 
> > It's not clear at all to me how (un-)standardized this stuff is.  It
> > may be Micron specific,
> > but it may not be.  I don't know.  Since it's only called for Micron
> > chips with on-die enabled,
> > the code is safe as it is.

The point is that we don't write code into the generic framework that
assumes Micron is the only one to implement it, if possible. This type
of replaceable feature is best left as a callback which can be set to
NULL, I think. Or if you can find a better point at which the
implementation specifics can be abstracted, that can work as well. But
regardless, my high level comment must be addressed -- you wrote this
code as if Micron is the only one to implement on-die ECC.

FWIW, the Toshiba BENAND (Built-in ECC NAND) datasheet I saw doesn't
advertise the ability to disable its ECC, but it does report per-sector
bitflip information (nice!). Also, I think it hides the ECC syndrome
bytes from the user, so most drivers could possibly ignore the built-in
ECC entirely if desired.

> > > Do you actually need to re-read, or can you use the existing data? Or at
> > > least, you could overwrite the databuf, instead of using a new chkbuf.
> > 
> > In general,  you have to (re-)read.  Consider read_oob or read_subpage.
> > 
> > >> +
> > >> +     /* Re-read page with on-die ECC off: */
> > >> +     set_on_die_ecc(mtd, chip, 0);
> > >> +     chip->cmdfunc(mtd, NAND_CMD_READ0, 0x00, page);
> > >> +     chip->read_buf(mtd, rawbuf, read_size);
> > >> +     set_on_die_ecc(mtd, chip, 1);
> > >> +
> > >> +     chkoob = chkbuf + mtd->writesize;
> > >> +     rawoob = rawbuf + mtd->writesize;
> > >> +     eccpos = chip->ecc.layout->eccpos;
> > >> +     for (i = 0; i < chip->ecc.steps; ++i) {
> > >> +             /* Count bit flips in the actual data area: */
> > >> +             flips = bitdiff(chkbuf, rawbuf, chip->ecc.size);
> > >> +             /* Count bit flips in the ECC bytes: */
> > >> +             for (j = 0; j < chip->ecc.bytes; ++j) {
> > >> +                     flips += hweight8(chkoob[*eccpos] ^ rawoob[*eccpos]);
> > >
> > > Why didn't you use bitdiff() here too?
> > 
> > Because the data is not contiguous and I didn't think the overhead
> > of an extra function call was warranted for individual bytes.  But yeah,
> > we could certainly use bitdiff() here on individual bytes, if you prefer.

Sorry, I misread the loop. Never mind. (Although it does then suggest
that maybe the bitdiff() function doesn't really need to stand alone,
for symmetry. Your call.)

> > >>               /*
> > >> -              * Simple but suboptimal: any page with a single stuck
> > >> -              * bit will be unusable since it'll be rewritten on
> > >> -              * each read...
> > >> +              * The Micron chips turn on the REWRITE status bit for
> > >> +              * ANY bit flips.  Some pages have stuck bits, so we
> > >> +              * don't want to migrate a block just because of
> > >> +              * single bit errors because otherwise, that block
> > >> +              * would effectively become unusable.  So, work out in
> > >> +              * software what the max number of flipped bits is for
> > >> +              * all subpages in a page:
> > >
> > > Can you shorten this comment? It's rather verbose, and it's making
> > > assumptions about upper-layer "migrations". I think we can leave it at
> > > something much simpler, like:
> > >
> > >         /*
> > >          * Micron on-die ECC doesn't report the number of bitflips, so
> > >          * we have to count them ourself to see if the error rate is too
> > >          * high.
> > >          */
> > 
> > Sure, I did add "This is particularly important for pages with stuck
> > bits." since
> > I think that is an important case to think about here.

Use your judgment, but please at least kill the migration comment.

Brian

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH v4 5/5] mtd: nand: Improve bitflip detection for on-die ECC scheme.
  2014-04-01 18:01         ` Brian Norris
@ 2014-04-01 18:13           ` David Mosberger-Tang
  2014-04-02  7:57             ` Gupta, Pekon
  0 siblings, 1 reply; 30+ messages in thread
From: David Mosberger-Tang @ 2014-04-01 18:13 UTC (permalink / raw)
  To: Brian Norris
  Cc: David Mosberger, Gerhard Sittig, linux-mtd@lists.infradead.org,
	Pekon Gupta, Artem Bityutskiy

Brian,

On Tue, Apr 1, 2014 at 12:01 PM, Brian Norris
<computersforpeace@gmail.com> wrote:

> The point is that we don't write code into the generic framework that
> assumes Micron is the only one to implement it, if possible. This type
> of replaceable feature is best left as a callback which can be set to
> NULL, I think. Or if you can find a better point at which the
> implementation specifics can be abstracted, that can work as well. But
> regardless, my high level comment must be addressed -- you wrote this
> code as if Micron is the only one to implement on-die ECC.
>
> FWIW, the Toshiba BENAND (Built-in ECC NAND) datasheet I saw doesn't
> advertise the ability to disable its ECC, but it does report per-sector
> bitflip information (nice!). Also, I think it hides the ECC syndrome
> bytes from the user, so most drivers could possibly ignore the built-in
> ECC entirely if desired.

I can only make the patch as good as I know how to.
If you need a perfect patch before doing *anything*, chances are
the situation won't improve at all.

I really don't understand when Linux kernel development stopped
accepting improvements just because they're not perfect.  Incremental
improvements are part and parcel of Linux development, at least that's
what I always thought.

The patch as it stands is safe and it improves on the status quo.
If that's not good enough, fine, I tried to do my part.

Really, there is no direct benefit for me to spend time getting this merged.
I'm doing it because I thought it's the right thing to do to contribute back
a little bit since my company by relying on Linux is really is standing on
the shoulders of giants.

I'm sorry if I have sent too many patch revisions in too short amount
of time.  I tried to incorporate feedback and avoid duplicate comments
by providing updated patches.  Also, I don't have much time available,
so the longer this process takes, the less likely I'll be able to finish it
(please don't take that as a threat, it's really just my reality).

  --david

^ permalink raw reply	[flat|nested] 30+ messages in thread

* RE: [PATCH v4 5/5] mtd: nand: Improve bitflip detection for on-die ECC scheme.
  2014-04-01 18:13           ` David Mosberger-Tang
@ 2014-04-02  7:57             ` Gupta, Pekon
  0 siblings, 0 replies; 30+ messages in thread
From: Gupta, Pekon @ 2014-04-02  7:57 UTC (permalink / raw)
  To: David Mosberger-Tang, Brian Norris, dmosberger@gmail.com
  Cc: Gerhard Sittig, linux-mtd@lists.infradead.org, Artem Bityutskiy

Hi David,

>From: dmosberger@gmail.com >Sent: Tuesday, April 01, 2014 11:44 PM
>>On Tue, Apr 1, 2014 at 12:01 PM, Brian Norris ><computersforpeace@gmail.com> wrote:
>> The point is that we don't write code into the generic framework that
>> assumes Micron is the only one to implement it, if possible. This type
>> of replaceable feature is best left as a callback which can be set to
>> NULL, I think. Or if you can find a better point at which the
>> implementation specifics can be abstracted, that can work as well. But
>> regardless, my high level comment must be addressed -- you wrote this
>> code as if Micron is the only one to implement on-die ECC.
>>
>> FWIW, the Toshiba BENAND (Built-in ECC NAND) datasheet I saw doesn't
>> advertise the ability to disable its ECC, but it does report per-sector
>> bitflip information (nice!). Also, I think it hides the ECC syndrome
>> bytes from the user, so most drivers could possibly ignore the built-in
>> ECC entirely if desired.
>
>I can only make the patch as good as I know how to.
>If you need a perfect patch before doing *anything*, chances are
>the situation won't improve at all.
>
>I really don't understand when Linux kernel development stopped
>accepting improvements just because they're not perfect.  Incremental
>improvements are part and parcel of Linux development, at least that's
>what I always thought.
>
>The patch as it stands is safe and it improves on the status quo.
>If that's not good enough, fine, I tried to do my part.
>
>Really, there is no direct benefit for me to spend time getting this merged.
>I'm doing it because I thought it's the right thing to do to contribute back
>a little bit since my company by relying on Linux is really is standing on
>the shoulders of giants.
>
Some ground realities ..
Most developers (including me) are working for companies,
and so our implicit priorities become to maintain controller drivers first.
Like you we also get limited bandwidth to work on generic frameworks
(like nand_base.c) and review patches. So the actual attention given to
generic or core framework components is quite less.

(1) Thus usually whatever gets accepted first time in generic-framework
 remains as-it-is unless there is a bug reported or new feature addition.

(2) Changing a generic framework also means regress testing on multiple
 Platform, as you might accidently break some old platform or driver which
 is already in production. So it's always better to get it correct first time.

So, Brian's and my feedbacks were constructive to get things right at the
first introduction of this feature. Once it's accepted this might remain
untouched even it is sub-optimal. And Maintainer can always take a call
when a patch is good enough for acceptance.
However, there was never an intention to discourage you from submitting patches.

>I'm sorry if I have sent too many patch revisions in too short amount
>of time.  I tried to incorporate feedback and avoid duplicate comments
>by providing updated patches.  Also, I don't have much time available,
>so the longer this process takes, the less likely I'll be able to finish it
>(please don't take that as a threat, it's really just my reality).
>
Having too many patch versions would un-necessary burden you only
with re-testing and cleaning. Therefore I suggested to discuss and
iron-out differences, so that there is no duplication of effort on you side.

I'll try to help you out with some patches from your next version, as there
are number of comments already to fix for this one. But I don't have
Micron device with on-die ECC support, so you might have to help in testing.
Hope that helps..

>  --david


with regards, pekon

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH v4 0/5] mtd: nand: Add on-die ECC support
  2014-03-31 23:28 [PATCH v4 0/5] mtd: nand: Add on-die ECC support David Mosberger
                   ` (4 preceding siblings ...)
  2014-03-31 23:28 ` [PATCH v4 5/5] mtd: nand: Improve bitflip detection for on-die ECC scheme David Mosberger
@ 2014-04-01  8:02 ` Brian Norris
  5 siblings, 0 replies; 30+ messages in thread
From: Brian Norris @ 2014-04-01  8:02 UTC (permalink / raw)
  To: David Mosberger; +Cc: gsi, linux-mtd, pekon, dedekind1

Hi David,

On Mon, Mar 31, 2014 at 05:28:52PM -0600, David Mosberger wrote:
> This patch-series is designed to go on top of Gerhard Sittig's
> patch series "mtd: nand: introduce a READMODE command".
> 
> The patches add on-die ECC support as found on some Micron flash chips.
> 
> David Mosberger (5):
>   mtd: nand: Detect Micron flash with on-die ECC (aka "internal ECC")
>     enabled.
>   mtd: nand: Add NAND_ECC_HW_ON_DIE ECC-mode.
>   mtd: nand: Enable subpage-reads on flashes with on-die ECC enabled.
>   mtd: nand: Allocate extra buffers needed for on-die ECC controller.
>   mtd: nand: Improve bitflip detection for on-die ECC scheme.

As I commented on your patches, I think this series is tied too closely
to a single flash part. I gave a few pointers where it can be unraveled
a bit, but please consider more ways to do so if possible. And ask
questions if anything is unclear or difficult.

Aside: if we continue to get more pseudo-generic features with
(relatively) small per-vendor differences, it may be time to do some
cleanup, and push some of the vendor-specific stuff into a new file
drivers/mtd/nand/nand_vendor.c. Already, I think a lot of the
nand_get_flash_type() stuff is so logically separate from much of the
NAND operation core, that maybe it should be pulled out. I don't expect
you to do this cleanup for me, but it may help guide you in determining
how to split certain functionality cleanly into pieces.

Thanks,
Brian

^ permalink raw reply	[flat|nested] 30+ messages in thread

end of thread, other threads:[~2014-04-03 15:26 UTC | newest]

Thread overview: 30+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2014-03-31 23:28 [PATCH v4 0/5] mtd: nand: Add on-die ECC support David Mosberger
2014-03-31 23:28 ` [PATCH v4 1/5] mtd: nand: Detect Micron flash with on-die ECC (aka "internal ECC") enabled David Mosberger
2014-04-01  6:39   ` Brian Norris
2014-04-01 15:26     ` David Mosberger
2014-04-02  7:27       ` Gupta, Pekon
2014-04-02 15:07         ` David Mosberger-Tang
2014-04-02 16:50           ` Gerhard Sittig
2014-04-02 17:02             ` David Mosberger
2014-04-03  7:10               ` Gerhard Sittig
     [not found]                 ` <CALnQHM1VLY=t6CaQtHGtp=enNCCj=Xz_QN7sj20hUCd8ZJjKpA@mail.gmail.com>
2014-04-03 15:26                   ` David Mosberger
2014-03-31 23:28 ` [PATCH v4 2/5] mtd: nand: Add NAND_ECC_HW_ON_DIE ECC-mode David Mosberger
2014-04-01  6:02   ` Gupta, Pekon
2014-04-01 15:32     ` David Mosberger
2014-04-01  7:24   ` Brian Norris
2014-04-01 15:41     ` David Mosberger
2014-03-31 23:28 ` [PATCH v4 3/5] mtd: nand: Enable subpage-reads on flashes with on-die ECC enabled David Mosberger
2014-03-31 23:28 ` [PATCH v4 4/5] mtd: nand: Allocate extra buffers needed for on-die ECC controller David Mosberger
2014-04-01  7:28   ` Brian Norris
2014-04-01  7:37     ` Gupta, Pekon
2014-04-01  8:24       ` Brian Norris
2014-03-31 23:28 ` [PATCH v4 5/5] mtd: nand: Improve bitflip detection for on-die ECC scheme David Mosberger
2014-04-01  6:29   ` Gupta, Pekon
2014-04-01 15:51     ` David Mosberger
2014-04-01 17:30       ` Brian Norris
2014-04-01  7:50   ` Brian Norris
     [not found]     ` <CALnQHM2Afp8LD6MtGQTT5jrcb9xJdYXRGD0TZ_s5GASZsbRZeg@mail.gmail.com>
2014-04-01 17:33       ` Brian Norris
2014-04-01 18:01         ` Brian Norris
2014-04-01 18:13           ` David Mosberger-Tang
2014-04-02  7:57             ` Gupta, Pekon
2014-04-01  8:02 ` [PATCH v4 0/5] mtd: nand: Add on-die ECC support Brian Norris

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox