[PATCH] MMC: Refine block layer waiting for card state

public inbox for linux-mmc@vger.kernel.org
 help / color / mirror / Atom feed

* [PATCH] MMC: Refine block layer waiting for card state
@ 2010-09-27  3:32 Ethan Du
  2010-09-28  6:51 ` Adrian Hunter
  0 siblings, 1 reply; 4+ messages in thread
From: Ethan Du @ 2010-09-27  3:32 UTC (permalink / raw)
  To: linux-mmc

    The while loop when handling rw request may become deadloop in
case of bad card
    I've seen mmcqd gets blocked forever after a single error message:

mmcblk0: error -110 sending read/write command, response 0x900, card
status 0x80e00

    Also there was case of card reports status without error

mmcblk0: error -110 sending read/write command, response 0x900, card
status 0xe00

    After this error, the card can stay in prg state, and never comes back,
    and may not report any error further. So a break out condition
    should be set in mmc block layer:
* should not enter the waiting loop in case of error
* should break out from the waiting loop, if card response with error
* should break out from the waiting loop when timeout

    These will not help with the card, one more thing to do:
* re-init the card in case of too many errors

Signed-off-by: Ethan Du <ethan.too@gmail.com>
---
 drivers/mmc/card/block.c |   35 +++++++++++++++++++++++++++--------
 drivers/mmc/core/mmc.c   |    9 +++++++--
 drivers/mmc/core/sd.c    |    8 ++++++--
 include/linux/mmc/card.h |    3 +++
 include/linux/mmc/mmc.h  |    1 +
 5 files changed, 44 insertions(+), 12 deletions(-)

diff --git a/drivers/mmc/card/block.c b/drivers/mmc/card/block.c
index d545f79..1d304a2 100644
--- a/drivers/mmc/card/block.c
+++ b/drivers/mmc/card/block.c
@@ -316,12 +316,14 @@ out:
 	return err ? 0 : 1;
 }

+#define BUSY_TIMEOUT_MS (16 * 1024)
 static int mmc_blk_issue_rw_rq(struct mmc_queue *mq, struct request *req)
 {
 	struct mmc_blk_data *md = mq->data;
 	struct mmc_card *card = md->queue.card;
 	struct mmc_blk_request brq;
 	int ret = 1, disable_multi = 0;
+	unsigned long timeout = 0;

 	mmc_claim_host(card->host);

@@ -453,7 +455,9 @@ static int mmc_blk_issue_rw_rq(struct mmc_queue
*mq, struct request *req)
 			       brq.stop.resp[0], status);
 		}

-		if (!mmc_host_is_spi(card->host) && rq_data_dir(req) != READ) {
+		if (!mmc_host_is_spi(card->host) && rq_data_dir(req) != READ &&
+		    !brq.cmd.error && !brq.data.error && !brq.stop.error) {
+			timeout = jiffies + msecs_to_jiffies(BUSY_TIMEOUT_MS);
 			do {
 				int err;

@@ -466,13 +470,22 @@ static int mmc_blk_issue_rw_rq(struct mmc_queue
*mq, struct request *req)
 					       req->rq_disk->disk_name, err);
 					goto cmd_err;
 				}
+				if (cmd.resp[0] & R1_ERROR_MASK) {
+					printk(KERN_ERR "%s: card err %#x\n",
+						req->rq_disk->disk_name,
+						cmd.resp[0]);
+					goto cmd_err;
+				}
 				/*
 				 * Some cards mishandle the status bits,
 				 * so make sure to check both the busy
 				 * indication and the card state.
 				 */
-			} while (!(cmd.resp[0] & R1_READY_FOR_DATA) ||
-				(R1_CURRENT_STATE(cmd.resp[0]) == 7));
+				if ((cmd.resp[0] & R1_READY_FOR_DATA) &&
+				    (R1_CURRENT_STATE(cmd.resp[0]) != 7))
+					break;
+			} while (time_before(jiffies, timeout));
+			/* Ignore timeout out */

 #if 0
 			if (cmd.resp[0] & ~0x00000900)
@@ -510,11 +523,11 @@ static int mmc_blk_issue_rw_rq(struct mmc_queue
*mq, struct request *req)

 	return 1;

- cmd_err:
- 	/*
- 	 * If this is an SD card and we're writing, we can first
- 	 * mark the known good sectors as ok.
- 	 *
+cmd_err:
+	/*
+	 * If this is an SD card and we're writing, we can first
+	 * mark the known good sectors as ok.
+	 *
 	 * If the card is not SD, we can still ok written sectors
 	 * as reported by the controller (which might be less than
 	 * the real number of written sectors, but never more).
@@ -541,6 +554,12 @@ static int mmc_blk_issue_rw_rq(struct mmc_queue
*mq, struct request *req)
 		ret = __blk_end_request(req, -EIO, blk_rq_cur_bytes(req));
 	spin_unlock_irq(&md->lock);

+	card->err_count++;
+	if (card->err_count >= ERR_TRIGGER_REINIT) {
+		printk(KERN_WARNING "%s: re-init the card due to error\n",
+			req->rq_disk->disk_name);
+		mmc_detect_change(card->host, 0);
+	}
 	return 0;
 }

diff --git a/drivers/mmc/core/mmc.c b/drivers/mmc/core/mmc.c
index 6909a54..79c4759 100644
--- a/drivers/mmc/core/mmc.c
+++ b/drivers/mmc/core/mmc.c
@@ -535,6 +535,8 @@ static int mmc_init_card(struct mmc_host *host, u32 ocr,

 	if (!oldcard)
 		host->card = card;
+	else
+		oldcard->err_count = 0;

 	return 0;

@@ -563,17 +565,20 @@ static void mmc_remove(struct mmc_host *host)
  */
 static void mmc_detect(struct mmc_host *host)
 {
-	int err;
+	int err = 0;

 	BUG_ON(!host);
 	BUG_ON(!host->card);

 	mmc_claim_host(host);

+	if (host->card->err_count >= ERR_TRIGGER_REINIT)
+		err = mmc_init_card(host, host->ocr, host->card);
 	/*
 	 * Just check if our card has been removed.
 	 */
-	err = mmc_send_status(host->card, NULL);
+	if (!err)
+		err = mmc_send_status(host->card, NULL);

 	mmc_release_host(host);

diff --git a/drivers/mmc/core/sd.c b/drivers/mmc/core/sd.c
index 0f52410..820b0d1 100644
--- a/drivers/mmc/core/sd.c
+++ b/drivers/mmc/core/sd.c
@@ -635,6 +635,7 @@ static int mmc_sd_init_card(struct mmc_host *host, u32 ocr,
 		mmc_set_bus_width(host, MMC_BUS_WIDTH_4);
 	}

+	card->err_count = 0;
 	host->card = card;
 	return 0;

@@ -662,17 +663,20 @@ static void mmc_sd_remove(struct mmc_host *host)
  */
 static void mmc_sd_detect(struct mmc_host *host)
 {
-	int err;
+	int err = 0;

 	BUG_ON(!host);
 	BUG_ON(!host->card);

 	mmc_claim_host(host);

+	if (host->card->err_count >= ERR_TRIGGER_REINIT)
+		err = mmc_sd_init_card(host, host->ocr, host->card);
 	/*
 	 * Just check if our card has been removed.
 	 */
-	err = mmc_send_status(host->card, NULL);
+	if (!err)
+		err = mmc_send_status(host->card, NULL);

 	mmc_release_host(host);

diff --git a/include/linux/mmc/card.h b/include/linux/mmc/card.h
index 6b75250..178de17 100644
--- a/include/linux/mmc/card.h
+++ b/include/linux/mmc/card.h
@@ -143,6 +143,9 @@ struct mmc_card {
 	const char		**info;		/* info strings */
 	struct sdio_func_tuple	*tuples;	/* unknown common tuples */

+	unsigned int		err_count;
+#define ERR_TRIGGER_REINIT 1024
+
 	struct dentry		*debugfs_root;
 };

diff --git a/include/linux/mmc/mmc.h b/include/linux/mmc/mmc.h
index dd11ae5..3b979a1 100644
--- a/include/linux/mmc/mmc.h
+++ b/include/linux/mmc/mmc.h
@@ -122,6 +122,7 @@
 #define R1_UNDERRUN		(1 << 18)	/* ex, c */
 #define R1_OVERRUN		(1 << 17)	/* ex, c */
 #define R1_CID_CSD_OVERWRITE	(1 << 16)	/* erx, c, CID/CSD overwrite */
+#define R1_ERROR_MASK		0xffff0000
 #define R1_WP_ERASE_SKIP	(1 << 15)	/* sx, c */
 #define R1_CARD_ECC_DISABLED	(1 << 14)	/* sx, a */
 #define R1_ERASE_RESET		(1 << 13)	/* sr, c */
-- 
1.5.6.5

^ permalink raw reply related	[flat|nested] 4+ messages in thread

* Re: [PATCH] MMC: Refine block layer waiting for card state
  2010-09-27  3:32 [PATCH] MMC: Refine block layer waiting for card state Ethan Du
@ 2010-09-28  6:51 ` Adrian Hunter
  2010-09-28 11:03   ` Ethan Du
  0 siblings, 1 reply; 4+ messages in thread
From: Adrian Hunter @ 2010-09-28  6:51 UTC (permalink / raw)
  To: Ethan Du; +Cc: linux-mmc

On 27/09/10 06:32, Ethan Du wrote:
>
>      The while loop when handling rw request may become deadloop in
> case of bad card
>      I've seen mmcqd gets blocked forever after a single error message:
>
> mmcblk0: error -110 sending read/write command, response 0x900, card
> status 0x80e00
>
>      Also there was case of card reports status without error
>
> mmcblk0: error -110 sending read/write command, response 0x900, card
> status 0xe00
>
>      After this error, the card can stay in prg state, and never comes back,
>      and may not report any error further. So a break out condition
>      should be set in mmc block layer:
> * should not enter the waiting loop in case of error

How do you know there are not any errors where you do need to wait?

> * should break out from the waiting loop, if card response with error
> * should break out from the waiting loop when timeout
>
>      These will not help with the card, one more thing to do:
> * re-init the card in case of too many errors

Using mmc_detect_change() will give you a new block device.
That is probably a bad idea for a non-removable card.  Also
if the reinitialization is going to help, then why wait for
lots of errors before doing it.

>
> Signed-off-by: Ethan Du<ethan.too@gmail.com>
> ---
>   drivers/mmc/card/block.c |   35 +++++++++++++++++++++++++++--------
>   drivers/mmc/core/mmc.c   |    9 +++++++--
>   drivers/mmc/core/sd.c    |    8 ++++++--
>   include/linux/mmc/card.h |    3 +++
>   include/linux/mmc/mmc.h  |    1 +
>   5 files changed, 44 insertions(+), 12 deletions(-)
>
> diff --git a/drivers/mmc/card/block.c b/drivers/mmc/card/block.c
> index d545f79..1d304a2 100644
> --- a/drivers/mmc/card/block.c
> +++ b/drivers/mmc/card/block.c
> @@ -316,12 +316,14 @@ out:
>   	return err ? 0 : 1;
>   }
>
> +#define BUSY_TIMEOUT_MS (16 * 1024)
>   static int mmc_blk_issue_rw_rq(struct mmc_queue *mq, struct request *req)
>   {
>   	struct mmc_blk_data *md = mq->data;
>   	struct mmc_card *card = md->queue.card;
>   	struct mmc_blk_request brq;
>   	int ret = 1, disable_multi = 0;
> +	unsigned long timeout = 0;
>
>   	mmc_claim_host(card->host);
>
> @@ -453,7 +455,9 @@ static int mmc_blk_issue_rw_rq(struct mmc_queue
> *mq, struct request *req)
>   			       brq.stop.resp[0], status);
>   		}
>
> -		if (!mmc_host_is_spi(card->host)&&  rq_data_dir(req) != READ) {
> +		if (!mmc_host_is_spi(card->host)&&  rq_data_dir(req) != READ&&
> +		    !brq.cmd.error&&  !brq.data.error&&  !brq.stop.error) {
> +			timeout = jiffies + msecs_to_jiffies(BUSY_TIMEOUT_MS);
>   			do {
>   				int err;
>
> @@ -466,13 +470,22 @@ static int mmc_blk_issue_rw_rq(struct mmc_queue
> *mq, struct request *req)
>   					       req->rq_disk->disk_name, err);
>   					goto cmd_err;
>   				}
> +				if (cmd.resp[0]&  R1_ERROR_MASK) {
> +					printk(KERN_ERR "%s: card err %#x\n",
> +						req->rq_disk->disk_name,
> +						cmd.resp[0]);
> +					goto cmd_err;
> +				}
>   				/*
>   				 * Some cards mishandle the status bits,
>   				 * so make sure to check both the busy
>   				 * indication and the card state.
>   				 */
> -			} while (!(cmd.resp[0]&  R1_READY_FOR_DATA) ||
> -				(R1_CURRENT_STATE(cmd.resp[0]) == 7));
> +				if ((cmd.resp[0]&  R1_READY_FOR_DATA)&&
> +				    (R1_CURRENT_STATE(cmd.resp[0]) != 7))
> +					break;
> +			} while (time_before(jiffies, timeout));
> +			/* Ignore timeout out */
>
>   #if 0
>   			if (cmd.resp[0]&  ~0x00000900)
> @@ -510,11 +523,11 @@ static int mmc_blk_issue_rw_rq(struct mmc_queue
> *mq, struct request *req)
>
>   	return 1;
>
> - cmd_err:
> - 	/*
> - 	 * If this is an SD card and we're writing, we can first
> - 	 * mark the known good sectors as ok.
> - 	 *
> +cmd_err:
> +	/*
> +	 * If this is an SD card and we're writing, we can first
> +	 * mark the known good sectors as ok.
> +	 *
>   	 * If the card is not SD, we can still ok written sectors
>   	 * as reported by the controller (which might be less than
>   	 * the real number of written sectors, but never more).
> @@ -541,6 +554,12 @@ static int mmc_blk_issue_rw_rq(struct mmc_queue
> *mq, struct request *req)
>   		ret = __blk_end_request(req, -EIO, blk_rq_cur_bytes(req));
>   	spin_unlock_irq(&md->lock);
>
> +	card->err_count++;
> +	if (card->err_count>= ERR_TRIGGER_REINIT) {
> +		printk(KERN_WARNING "%s: re-init the card due to error\n",
> +			req->rq_disk->disk_name);
> +		mmc_detect_change(card->host, 0);
> +	}
>   	return 0;
>   }
>
> diff --git a/drivers/mmc/core/mmc.c b/drivers/mmc/core/mmc.c
> index 6909a54..79c4759 100644
> --- a/drivers/mmc/core/mmc.c
> +++ b/drivers/mmc/core/mmc.c
> @@ -535,6 +535,8 @@ static int mmc_init_card(struct mmc_host *host, u32 ocr,
>
>   	if (!oldcard)
>   		host->card = card;
> +	else
> +		oldcard->err_count = 0;
>
>   	return 0;
>
> @@ -563,17 +565,20 @@ static void mmc_remove(struct mmc_host *host)
>    */
>   static void mmc_detect(struct mmc_host *host)
>   {
> -	int err;
> +	int err = 0;
>
>   	BUG_ON(!host);
>   	BUG_ON(!host->card);
>
>   	mmc_claim_host(host);
>
> +	if (host->card->err_count>= ERR_TRIGGER_REINIT)
> +		err = mmc_init_card(host, host->ocr, host->card);
>   	/*
>   	 * Just check if our card has been removed.
>   	 */
> -	err = mmc_send_status(host->card, NULL);
> +	if (!err)
> +		err = mmc_send_status(host->card, NULL);
>
>   	mmc_release_host(host);
>
> diff --git a/drivers/mmc/core/sd.c b/drivers/mmc/core/sd.c
> index 0f52410..820b0d1 100644
> --- a/drivers/mmc/core/sd.c
> +++ b/drivers/mmc/core/sd.c
> @@ -635,6 +635,7 @@ static int mmc_sd_init_card(struct mmc_host *host, u32 ocr,
>   		mmc_set_bus_width(host, MMC_BUS_WIDTH_4);
>   	}
>
> +	card->err_count = 0;
>   	host->card = card;
>   	return 0;
>
> @@ -662,17 +663,20 @@ static void mmc_sd_remove(struct mmc_host *host)
>    */
>   static void mmc_sd_detect(struct mmc_host *host)
>   {
> -	int err;
> +	int err = 0;
>
>   	BUG_ON(!host);
>   	BUG_ON(!host->card);
>
>   	mmc_claim_host(host);
>
> +	if (host->card->err_count>= ERR_TRIGGER_REINIT)
> +		err = mmc_sd_init_card(host, host->ocr, host->card);
>   	/*
>   	 * Just check if our card has been removed.
>   	 */
> -	err = mmc_send_status(host->card, NULL);
> +	if (!err)
> +		err = mmc_send_status(host->card, NULL);
>
>   	mmc_release_host(host);
>
> diff --git a/include/linux/mmc/card.h b/include/linux/mmc/card.h
> index 6b75250..178de17 100644
> --- a/include/linux/mmc/card.h
> +++ b/include/linux/mmc/card.h
> @@ -143,6 +143,9 @@ struct mmc_card {
>   	const char		**info;		/* info strings */
>   	struct sdio_func_tuple	*tuples;	/* unknown common tuples */
>
> +	unsigned int		err_count;
> +#define ERR_TRIGGER_REINIT 1024
> +
>   	struct dentry		*debugfs_root;
>   };
>
> diff --git a/include/linux/mmc/mmc.h b/include/linux/mmc/mmc.h
> index dd11ae5..3b979a1 100644
> --- a/include/linux/mmc/mmc.h
> +++ b/include/linux/mmc/mmc.h
> @@ -122,6 +122,7 @@
>   #define R1_UNDERRUN		(1<<  18)	/* ex, c */
>   #define R1_OVERRUN		(1<<  17)	/* ex, c */
>   #define R1_CID_CSD_OVERWRITE	(1<<  16)	/* erx, c, CID/CSD overwrite */
> +#define R1_ERROR_MASK		0xffff0000
>   #define R1_WP_ERASE_SKIP	(1<<  15)	/* sx, c */
>   #define R1_CARD_ECC_DISABLED	(1<<  14)	/* sx, a */
>   #define R1_ERASE_RESET		(1<<  13)	/* sr, c */


^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: [PATCH] MMC: Refine block layer waiting for card state
  2010-09-28  6:51 ` Adrian Hunter
@ 2010-09-28 11:03   ` Ethan Du
  2010-09-28 11:10     ` [PATCH v2] " Ethan Du
  0 siblings, 1 reply; 4+ messages in thread
From: Ethan Du @ 2010-09-28 11:03 UTC (permalink / raw)
  To: Adrian Hunter; +Cc: linux-mmc

Hi

On Tue, Sep 28, 2010 at 2:51 PM, Adrian Hunter <adrian.hunter@nokia.com> wrote:
> On 27/09/10 06:32, Ethan Du wrote:
>>
>>     The while loop when handling rw request may become deadloop in
>> case of bad card
>>     I've seen mmcqd gets blocked forever after a single error message:
>>
>> mmcblk0: error -110 sending read/write command, response 0x900, card
>> status 0x80e00
>>
>>     Also there was case of card reports status without error
>>
>> mmcblk0: error -110 sending read/write command, response 0x900, card
>> status 0xe00
>>
>>     After this error, the card can stay in prg state, and never comes
>> back,
>>     and may not report any error further. So a break out condition
>>     should be set in mmc block layer:
>> * should not enter the waiting loop in case of error
>
> How do you know there are not any errors where you do need to wait?

Would you enumerate an example?
I am not sure all hosts, seem most host controllers already wait
enough time before return to block layer when cmd->flags has
MMC_RSP_BUSY

>
>> * should break out from the waiting loop, if card response with error
>> * should break out from the waiting loop when timeout
>>
>>     These will not help with the card, one more thing to do:
>> * re-init the card in case of too many errors
>
> Using mmc_detect_change() will give you a new block device.
> That is probably a bad idea for a non-removable card.  Also

Yes, figured it is better to do something like host resume.
I made a function mmc_reinit_host(), will send out the patch

> if the reinitialization is going to help, then why wait for
> lots of errors before doing it.

With the hope that, card can still work after this error

>
>>
>> Signed-off-by: Ethan Du<ethan.too@gmail.com>
>> ---
>>  drivers/mmc/card/block.c |   35 +++++++++++++++++++++++++++--------
>>  drivers/mmc/core/mmc.c   |    9 +++++++--
>>  drivers/mmc/core/sd.c    |    8 ++++++--
>>  include/linux/mmc/card.h |    3 +++
>>  include/linux/mmc/mmc.h  |    1 +
>>  5 files changed, 44 insertions(+), 12 deletions(-)
>>
>> diff --git a/drivers/mmc/card/block.c b/drivers/mmc/card/block.c
>> index d545f79..1d304a2 100644
>> --- a/drivers/mmc/card/block.c
>> +++ b/drivers/mmc/card/block.c
>> @@ -316,12 +316,14 @@ out:
>>        return err ? 0 : 1;
>>  }
>>
>> +#define BUSY_TIMEOUT_MS (16 * 1024)
>>  static int mmc_blk_issue_rw_rq(struct mmc_queue *mq, struct request *req)
>>  {
>>        struct mmc_blk_data *md = mq->data;
>>        struct mmc_card *card = md->queue.card;
>>        struct mmc_blk_request brq;
>>        int ret = 1, disable_multi = 0;
>> +       unsigned long timeout = 0;
>>
>>        mmc_claim_host(card->host);
>>
>> @@ -453,7 +455,9 @@ static int mmc_blk_issue_rw_rq(struct mmc_queue
>> *mq, struct request *req)
>>                               brq.stop.resp[0], status);
>>                }
>>
>> -               if (!mmc_host_is_spi(card->host)&&  rq_data_dir(req) !=
>> READ) {
>> +               if (!mmc_host_is_spi(card->host)&&  rq_data_dir(req) !=
>> READ&&
>> +                   !brq.cmd.error&&  !brq.data.error&&  !brq.stop.error)
>> {
>> +                       timeout = jiffies +
>> msecs_to_jiffies(BUSY_TIMEOUT_MS);
>>                        do {
>>                                int err;
>>
>> @@ -466,13 +470,22 @@ static int mmc_blk_issue_rw_rq(struct mmc_queue
>> *mq, struct request *req)
>>                                               req->rq_disk->disk_name,
>> err);
>>                                        goto cmd_err;
>>                                }
>> +                               if (cmd.resp[0]&  R1_ERROR_MASK) {
>> +                                       printk(KERN_ERR "%s: card err
>> %#x\n",
>> +                                               req->rq_disk->disk_name,
>> +                                               cmd.resp[0]);
>> +                                       goto cmd_err;
>> +                               }
>>                                /*
>>                                 * Some cards mishandle the status bits,
>>                                 * so make sure to check both the busy
>>                                 * indication and the card state.
>>                                 */
>> -                       } while (!(cmd.resp[0]&  R1_READY_FOR_DATA) ||
>> -                               (R1_CURRENT_STATE(cmd.resp[0]) == 7));
>> +                               if ((cmd.resp[0]&  R1_READY_FOR_DATA)&&
>> +                                   (R1_CURRENT_STATE(cmd.resp[0]) != 7))
>> +                                       break;
>> +                       } while (time_before(jiffies, timeout));
>> +                       /* Ignore timeout out */
>>
>>  #if 0
>>                        if (cmd.resp[0]&  ~0x00000900)
>> @@ -510,11 +523,11 @@ static int mmc_blk_issue_rw_rq(struct mmc_queue
>> *mq, struct request *req)
>>
>>        return 1;
>>
>> - cmd_err:
>> -       /*
>> -        * If this is an SD card and we're writing, we can first
>> -        * mark the known good sectors as ok.
>> -        *
>> +cmd_err:
>> +       /*
>> +        * If this is an SD card and we're writing, we can first
>> +        * mark the known good sectors as ok.
>> +        *
>>         * If the card is not SD, we can still ok written sectors
>>         * as reported by the controller (which might be less than
>>         * the real number of written sectors, but never more).
>> @@ -541,6 +554,12 @@ static int mmc_blk_issue_rw_rq(struct mmc_queue
>> *mq, struct request *req)
>>                ret = __blk_end_request(req, -EIO, blk_rq_cur_bytes(req));
>>        spin_unlock_irq(&md->lock);
>>
>> +       card->err_count++;
>> +       if (card->err_count>= ERR_TRIGGER_REINIT) {
>> +               printk(KERN_WARNING "%s: re-init the card due to error\n",
>> +                       req->rq_disk->disk_name);
>> +               mmc_detect_change(card->host, 0);
>> +       }
>>        return 0;
>>  }
>>
>> diff --git a/drivers/mmc/core/mmc.c b/drivers/mmc/core/mmc.c
>> index 6909a54..79c4759 100644
>> --- a/drivers/mmc/core/mmc.c
>> +++ b/drivers/mmc/core/mmc.c
>> @@ -535,6 +535,8 @@ static int mmc_init_card(struct mmc_host *host, u32
>> ocr,
>>
>>        if (!oldcard)
>>                host->card = card;
>> +       else
>> +               oldcard->err_count = 0;
>>
>>        return 0;
>>
>> @@ -563,17 +565,20 @@ static void mmc_remove(struct mmc_host *host)
>>   */
>>  static void mmc_detect(struct mmc_host *host)
>>  {
>> -       int err;
>> +       int err = 0;
>>
>>        BUG_ON(!host);
>>        BUG_ON(!host->card);
>>
>>        mmc_claim_host(host);
>>
>> +       if (host->card->err_count>= ERR_TRIGGER_REINIT)
>> +               err = mmc_init_card(host, host->ocr, host->card);
>>        /*
>>         * Just check if our card has been removed.
>>         */
>> -       err = mmc_send_status(host->card, NULL);
>> +       if (!err)
>> +               err = mmc_send_status(host->card, NULL);
>>
>>        mmc_release_host(host);
>>
>> diff --git a/drivers/mmc/core/sd.c b/drivers/mmc/core/sd.c
>> index 0f52410..820b0d1 100644
>> --- a/drivers/mmc/core/sd.c
>> +++ b/drivers/mmc/core/sd.c
>> @@ -635,6 +635,7 @@ static int mmc_sd_init_card(struct mmc_host *host, u32
>> ocr,
>>                mmc_set_bus_width(host, MMC_BUS_WIDTH_4);
>>        }
>>
>> +       card->err_count = 0;
>>        host->card = card;
>>        return 0;
>>
>> @@ -662,17 +663,20 @@ static void mmc_sd_remove(struct mmc_host *host)
>>   */
>>  static void mmc_sd_detect(struct mmc_host *host)
>>  {
>> -       int err;
>> +       int err = 0;
>>
>>        BUG_ON(!host);
>>        BUG_ON(!host->card);
>>
>>        mmc_claim_host(host);
>>
>> +       if (host->card->err_count>= ERR_TRIGGER_REINIT)
>> +               err = mmc_sd_init_card(host, host->ocr, host->card);
>>        /*
>>         * Just check if our card has been removed.
>>         */
>> -       err = mmc_send_status(host->card, NULL);
>> +       if (!err)
>> +               err = mmc_send_status(host->card, NULL);
>>
>>        mmc_release_host(host);
>>
>> diff --git a/include/linux/mmc/card.h b/include/linux/mmc/card.h
>> index 6b75250..178de17 100644
>> --- a/include/linux/mmc/card.h
>> +++ b/include/linux/mmc/card.h
>> @@ -143,6 +143,9 @@ struct mmc_card {
>>        const char              **info;         /* info strings */
>>        struct sdio_func_tuple  *tuples;        /* unknown common tuples */
>>
>> +       unsigned int            err_count;
>> +#define ERR_TRIGGER_REINIT 1024
>> +
>>        struct dentry           *debugfs_root;
>>  };
>>
>> diff --git a/include/linux/mmc/mmc.h b/include/linux/mmc/mmc.h
>> index dd11ae5..3b979a1 100644
>> --- a/include/linux/mmc/mmc.h
>> +++ b/include/linux/mmc/mmc.h
>> @@ -122,6 +122,7 @@
>>  #define R1_UNDERRUN           (1<<  18)       /* ex, c */
>>  #define R1_OVERRUN            (1<<  17)       /* ex, c */
>>  #define R1_CID_CSD_OVERWRITE  (1<<  16)       /* erx, c, CID/CSD
>> overwrite */
>> +#define R1_ERROR_MASK          0xffff0000
>>  #define R1_WP_ERASE_SKIP      (1<<  15)       /* sx, c */
>>  #define R1_CARD_ECC_DISABLED  (1<<  14)       /* sx, a */
>>  #define R1_ERASE_RESET                (1<<  13)       /* sr, c */
>
>

^ permalink raw reply	[flat|nested] 4+ messages in thread

* [PATCH v2] MMC: Refine block layer waiting for card state
  2010-09-28 11:03   ` Ethan Du
@ 2010-09-28 11:10     ` Ethan Du
  0 siblings, 0 replies; 4+ messages in thread
From: Ethan Du @ 2010-09-28 11:10 UTC (permalink / raw)
  To: linux-mmc; +Cc: Adrian Hunter

    The while loop when handling rw request may become deadloop in
case of bad card
    I've seen mmcqd gets blocked forever after a single error message:

mmcblk0: error -110 sending read/write command, response 0x900, card
status 0x80e00

    Also there was case of card reports status without error

mmcblk0: error -110 sending read/write command, response 0x900, card
status 0xe00

    After this error, the card can stay in prg state, and never comes back,
    and may not report any error further. So a break out condition
    should be set in mmc block layer:
* should not enter the waiting loop in case of error
* should break out from the waiting loop, if card response with error
* should break out from the waiting loop when timeout

    These will not help with the card, one more thing to do:
* re-init the card in case of too many errors

Signed-off-by: Ethan Du <ethan.too@gmail.com>
---

Changes since v1:
       * Use mmc_reinit_host instead of mmc_detect_change

 drivers/mmc/card/block.c |   36 ++++++++++++++++++++++++++++--------
 drivers/mmc/core/core.c  |   38 ++++++++++++++++++++++++++++++++++++++
 include/linux/mmc/card.h |    3 +++
 include/linux/mmc/host.h |    1 +
 include/linux/mmc/mmc.h  |    1 +
 5 files changed, 71 insertions(+), 8 deletions(-)

diff --git a/drivers/mmc/card/block.c b/drivers/mmc/card/block.c
index d545f79..cc28a20 100644
--- a/drivers/mmc/card/block.c
+++ b/drivers/mmc/card/block.c
@@ -316,12 +316,14 @@ out:
 	return err ? 0 : 1;
 }

+#define BUSY_TIMEOUT_MS (16 * 1024)
 static int mmc_blk_issue_rw_rq(struct mmc_queue *mq, struct request *req)
 {
 	struct mmc_blk_data *md = mq->data;
 	struct mmc_card *card = md->queue.card;
 	struct mmc_blk_request brq;
 	int ret = 1, disable_multi = 0;
+	unsigned long timeout = 0;

 	mmc_claim_host(card->host);

@@ -453,7 +455,9 @@ static int mmc_blk_issue_rw_rq(struct mmc_queue
*mq, struct request *req)
 			       brq.stop.resp[0], status);
 		}

-		if (!mmc_host_is_spi(card->host) && rq_data_dir(req) != READ) {
+		if (!mmc_host_is_spi(card->host) && rq_data_dir(req) != READ &&
+		    !brq.cmd.error && !brq.data.error && !brq.stop.error) {
+			timeout = jiffies + msecs_to_jiffies(BUSY_TIMEOUT_MS);
 			do {
 				int err;

@@ -466,13 +470,22 @@ static int mmc_blk_issue_rw_rq(struct mmc_queue
*mq, struct request *req)
 					       req->rq_disk->disk_name, err);
 					goto cmd_err;
 				}
+				if (cmd.resp[0] & R1_ERROR_MASK) {
+					printk(KERN_ERR "%s: card err %#x\n",
+						req->rq_disk->disk_name,
+						cmd.resp[0]);
+					goto cmd_err;
+				}
 				/*
 				 * Some cards mishandle the status bits,
 				 * so make sure to check both the busy
 				 * indication and the card state.
 				 */
-			} while (!(cmd.resp[0] & R1_READY_FOR_DATA) ||
-				(R1_CURRENT_STATE(cmd.resp[0]) == 7));
+				if ((cmd.resp[0] & R1_READY_FOR_DATA) &&
+				    (R1_CURRENT_STATE(cmd.resp[0]) != 7))
+					break;
+			} while (time_before(jiffies, timeout));
+			/* Ignore timeout out */

 #if 0
 			if (cmd.resp[0] & ~0x00000900)
@@ -510,11 +523,11 @@ static int mmc_blk_issue_rw_rq(struct mmc_queue
*mq, struct request *req)

 	return 1;

- cmd_err:
- 	/*
- 	 * If this is an SD card and we're writing, we can first
- 	 * mark the known good sectors as ok.
- 	 *
+cmd_err:
+	/*
+	 * If this is an SD card and we're writing, we can first
+	 * mark the known good sectors as ok.
+	 *
 	 * If the card is not SD, we can still ok written sectors
 	 * as reported by the controller (which might be less than
 	 * the real number of written sectors, but never more).
@@ -534,6 +547,13 @@ static int mmc_blk_issue_rw_rq(struct mmc_queue
*mq, struct request *req)
 		spin_unlock_irq(&md->lock);
 	}

+	card->err_count++;
+	if (card->err_count >= ERR_TRIGGER_REINIT) {
+		card->err_count = 0;
+		printk(KERN_WARNING "%s: re-init the card due to error\n",
+			md->disk->disk_name);
+		mmc_reinit_host(card->host);
+	}
 	mmc_release_host(card->host);

 	spin_lock_irq(&md->lock);
diff --git a/drivers/mmc/core/core.c b/drivers/mmc/core/core.c
index 5db49b1..b61de33 100644
--- a/drivers/mmc/core/core.c
+++ b/drivers/mmc/core/core.c
@@ -1671,6 +1671,44 @@ int mmc_resume_host(struct mmc_host *host)
 }
 EXPORT_SYMBOL(mmc_resume_host);

+/**
+ *	mmc_reinit_host - reinit a host
+ *	@host: mmc host
+ */
+int mmc_reinit_host(struct mmc_host *host)
+{
+	int err = 0;
+
+	mmc_bus_get(host);
+
+	if (!host->bus_ops->resume)
+		return 0;
+
+	if (host->bus_ops && !host->bus_dead) {
+		if (!(host->pm_flags & MMC_PM_KEEP_POWER)) {
+			mmc_power_up(host);
+			mmc_select_voltage(host, host->ocr);
+		}
+		err = host->bus_ops->resume(host);
+		if (err) {
+			printk(KERN_WARNING "%s: error %d during resume "
+					    "(card was removed?)\n",
+					    mmc_hostname(host), err);
+			err = 0;
+		}
+	}
+	mmc_bus_put(host);
+
+	/*
+	 * We add a slight delay here so that resume can progress
+	 * in parallel.
+	 */
+	mmc_detect_change(host, 1);
+
+	return err;
+}
+EXPORT_SYMBOL(mmc_reinit_host);
+
 /* Do the card removal on suspend if card is assumed removeable
  * Do that in pm notifier while userspace isn't yet frozen, so we will be able
    to sync the card.
diff --git a/include/linux/mmc/card.h b/include/linux/mmc/card.h
index 6b75250..178de17 100644
--- a/include/linux/mmc/card.h
+++ b/include/linux/mmc/card.h
@@ -143,6 +143,9 @@ struct mmc_card {
 	const char		**info;		/* info strings */
 	struct sdio_func_tuple	*tuples;	/* unknown common tuples */

+	unsigned int		err_count;
+#define ERR_TRIGGER_REINIT 1024
+
 	struct dentry		*debugfs_root;
 };

diff --git a/include/linux/mmc/host.h b/include/linux/mmc/host.h
index 1575b52..cff9c69 100644
--- a/include/linux/mmc/host.h
+++ b/include/linux/mmc/host.h
@@ -235,6 +235,7 @@ static inline void *mmc_priv(struct mmc_host *host)

 extern int mmc_suspend_host(struct mmc_host *);
 extern int mmc_resume_host(struct mmc_host *);
+extern int mmc_reinit_host(struct mmc_host *);

 extern void mmc_power_save_host(struct mmc_host *host);
 extern void mmc_power_restore_host(struct mmc_host *host);
diff --git a/include/linux/mmc/mmc.h b/include/linux/mmc/mmc.h
index dd11ae5..3b979a1 100644
--- a/include/linux/mmc/mmc.h
+++ b/include/linux/mmc/mmc.h
@@ -122,6 +122,7 @@
 #define R1_UNDERRUN		(1 << 18)	/* ex, c */
 #define R1_OVERRUN		(1 << 17)	/* ex, c */
 #define R1_CID_CSD_OVERWRITE	(1 << 16)	/* erx, c, CID/CSD overwrite */
+#define R1_ERROR_MASK		0xffff0000
 #define R1_WP_ERASE_SKIP	(1 << 15)	/* sx, c */
 #define R1_CARD_ECC_DISABLED	(1 << 14)	/* sx, a */
 #define R1_ERASE_RESET		(1 << 13)	/* sr, c */
-- 
1.5.6.5

^ permalink raw reply related	[flat|nested] 4+ messages in thread

end of thread, other threads:[~2010-09-28 11:10 UTC | newest]

Thread overview: 4+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2010-09-27  3:32 [PATCH] MMC: Refine block layer waiting for card state Ethan Du
2010-09-28  6:51 ` Adrian Hunter
2010-09-28 11:03   ` Ethan Du
2010-09-28 11:10     ` [PATCH v2] " Ethan Du

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox