All of lore.kernel.org
 help / color / mirror / Atom feed
From: "Jiang, Dave" <dave.jiang@intel.com>
To: "allenbh@gmail.com" <allenbh@gmail.com>
Cc: "allen.hubbe@emc.com" <allen.hubbe@emc.com>,
	"dmaengine@vger.kernel.org" <dmaengine@vger.kernel.org>,
	"Williams, Dan J" <dan.j.williams@intel.com>,
	"jdmason@kudzu.us" <jdmason@kudzu.us>,
	"Koul, Vinod" <vinod.koul@intel.com>,
	"linux-ntb@googlegroups.com" <linux-ntb@googlegroups.com>
Subject: Re: [PATCH 2/5] dmaengine: IOATDMA: Add error handling to driver
Date: Tue, 28 Jun 2016 16:11:52 +0000	[thread overview]
Message-ID: <1467130291.3754.8.camel@intel.com> (raw)
In-Reply-To: <CAJ80satySmQk_O9hu6E7ytEovSq7v=DMFo9jawU-Sizx2Or8Lg@mail.gmail.com>

On Tue, 2016-06-28 at 01:08 +0000, Allen Hubbe wrote:
> On Mon, Jun 27, 2016 at 7:09 PM, Dave Jiang <dave.jiang@intel.com>
> wrote:
> > Signed-off-by: Dave Jiang <dave.jiang@intel.com>
> > ---
> >  drivers/dma/ioat/dma.c       |  136
> > ++++++++++++++++++++++++++++++++++++------
> >  drivers/dma/ioat/registers.h |    2 +
> >  2 files changed, 120 insertions(+), 18 deletions(-)
> > 
> > diff --git a/drivers/dma/ioat/dma.c b/drivers/dma/ioat/dma.c
> > index bd09961..1bbc005 100644
> > --- a/drivers/dma/ioat/dma.c
> > +++ b/drivers/dma/ioat/dma.c
> > @@ -622,7 +622,8 @@ static void ioat_cleanup(struct ioatdma_chan
> > *ioat_chan)
> >         if (is_ioat_halted(*ioat_chan->completion)) {
> >                 u32 chanerr = readl(ioat_chan->reg_base +
> > IOAT_CHANERR_OFFSET);
> > 
> > -               if (chanerr & IOAT_CHANERR_HANDLE_MASK) {
> > +               if (chanerr &
> > +                   (IOAT_CHANERR_HANDLE_MASK |
> > IOAT_CHANERR_RECOVER_MASK)) {
> >                         mod_timer(&ioat_chan->timer, jiffies +
> > IDLE_TIMEOUT);
> >                         ioat_eh(ioat_chan);
> >                 }
> > @@ -652,6 +653,60 @@ static void ioat_restart_channel(struct
> > ioatdma_chan *ioat_chan)
> >         __ioat_restart_chan(ioat_chan);
> >  }
> > 
> > +
> > +static void ioat_abort_descs(struct ioatdma_chan *ioat_chan)
> > +{
> > +       struct ioatdma_device *ioat_dma = ioat_chan->ioat_dma;
> > +       struct ioat_ring_ent *desc;
> > +       u16 active;
> > +       int idx = ioat_chan->tail, i;
> > +
> > +       /*
> > +        * We assume that the failed descriptor has been processed.
> > +        * Now we are just returning all the remaining submitted
> > +        * descriptors to abort.
> > +        */
> > +       active = ioat_ring_active(ioat_chan);
> > +
> > +       /* we skip the failed descriptor that tail points to */
> > +       for (i = 1; i < active; i++) {
> > +               struct dma_async_tx_descriptor *tx;
> > +
> > +               smp_read_barrier_depends();
> > +               prefetch(ioat_get_ring_ent(ioat_chan, idx + i +
> > 1));
> > +               desc = ioat_get_ring_ent(ioat_chan, idx + i);
> > +               desc->txd.result |= ERR_DMA_ABORT;
> 
> See below, txd.result doesn't seem to be a bit mask.

I'll fix that. It shouldn't be a bit mask.

> 
> > +
> > +               tx = &desc->txd;
> > +               if (tx->cookie) {
> > +                       dma_cookie_complete(tx);
> > +                       dma_descriptor_unmap(tx);
> > +                       if (tx->callback) {
> > +                               tx->callback(tx->callback_param);
> > +                               tx->callback = NULL;
> > +                       }
> > +               }
> > +
> > +               /* skip extended descriptors */
> > +               if (desc_has_ext(desc)) {
> > +                       WARN_ON(i + 1 >= active);
> > +                       i++;
> 
> Is it always true that extended descriptors (note: descriptors is
> plural) means "exactly one?"  Otherwise, ++ aught to be += how how
> many.  I didn't notice anything in dma_prep.c that would use more
> than
> one extra.

There can only be 1 extended descriptor always.

> 
> > +               }
> > +
> > +               /* cleanup super extended descriptors */
> > +               if (desc->sed) {
> > +                       ioat_free_sed(ioat_dma, desc->sed);
> > +                       desc->sed = NULL;
> > +               }
> > +       }
> > +
> > +       smp_mb(); /* finish all descriptor reads before
> > incrementing tail */
> > +       ioat_chan->tail = idx + i;
> 
> Would it also be correct to say `idx + active` here?  If so, that
> might be more clear.

ok

> 
> > +
> > +       desc = ioat_get_ring_ent(ioat_chan, ioat_chan->tail);
> > +       ioat_chan->last_completion = *ioat_chan->completion = desc-
> > >txd.phys;
> > +}
> > +
> >  static void ioat_eh(struct ioatdma_chan *ioat_chan)
> >  {
> >         struct pci_dev *pdev = to_pdev(ioat_chan);
> > @@ -662,6 +717,7 @@ static void ioat_eh(struct ioatdma_chan
> > *ioat_chan)
> >         u32 err_handled = 0;
> >         u32 chanerr_int;
> >         u32 chanerr;
> > +       bool abort = false;
> > 
> >         /* cleanup so tail points to descriptor that caused the
> > error */
> >         if (ioat_cleanup_preamble(ioat_chan, &phys_complete))
> > @@ -697,30 +753,50 @@ static void ioat_eh(struct ioatdma_chan
> > *ioat_chan)
> >                 break;
> >         }
> > 
> > +       if (chanerr & IOAT_CHANERR_RECOVER_MASK) {
> > +               if (chanerr & IOAT_CHANERR_READ_DATA_ERR) {
> > +                       desc->txd.result |= ERR_DMA_READ;
> 
> In [PATCH 1/5] ERR_DMA_XYZ values do not look appropriate for use as
> a bit mask.
> 
> +enum err_result_flags {
> +       ERR_DMA_NONE = 0,
> +       ERR_DMA_READ,
> +       ERR_DMA_WRITE,
> +       ERR_DMA_ABORT,
> +};
> 
> And in [PATCH 4/5] and [PATCH 5/5] txd.result is not used as a bit
> mask.
> 
> +               switch (txd->result) {
> +               case ERR_DMA_READ:
> +               case ERR_DMA_WRITE:
> +                       entry->errors++;
> +               case ERR_DMA_ABORT:
> +                       flags = DESC_ABORT_FLAG;
> +                       break;
> +               case ERR_DMA_NONE:
> +               default:
> +                       break;
> +               }
> 
> > +                       err_handled |= IOAT_CHANERR_READ_DATA_ERR;
> > +               } else if (chanerr & IOAT_CHANERR_WRITE_DATA_ERR) {
> > +                       desc->txd.result |= ERR_DMA_WRITE;
> > +                       err_handled |= IOAT_CHANERR_WRITE_DATA_ERR;
> > +               }
> > +
> > +               abort = true;
> > +       }
> > +
> >         /* fault on unhandled error or spurious halt */
> >         if (chanerr ^ err_handled || chanerr == 0) {
> 
> Not part of your change, but chanerr == 0 is confusing.  Doesn't that
> mean "no errors?"  No errors would make it hard to justify BUG() that
> follows.

Legacy code. This is the beginning of me cleaning out all the BUG() in
the driver and put in proper error handling. 

> 
> >                 dev_err(to_dev(ioat_chan), "%s: fatal error
> > (%x:%x)\n",
> >                         __func__, chanerr, err_handled);
> >                 BUG();
> > -       } else { /* cleanup the faulty descriptor */
> > -               tx = &desc->txd;
> > -               if (tx->cookie) {
> > -                       dma_cookie_complete(tx);
> > -                       dma_descriptor_unmap(tx);
> > -                       if (tx->callback) {
> > -                               tx->callback(tx->callback_param);
> > -                               tx->callback = NULL;
> > -                       }
> > -               }
> >         }
> > 
> > -       writel(chanerr, ioat_chan->reg_base + IOAT_CHANERR_OFFSET);
> > -       pci_write_config_dword(pdev, IOAT_PCI_CHANERR_INT_OFFSET,
> > chanerr_int);
> > +       /* cleanup the faulty descriptor since we are continuing */
> > +       tx = &desc->txd;
> > +       if (tx->cookie) {
> > +               dma_cookie_complete(tx);
> > +               dma_descriptor_unmap(tx);
> > +               if (tx->callback) {
> > +                       tx->callback(tx->callback_param);
> > +                       tx->callback = NULL;
> > +               }
> > +       }
> > 
> >         /* mark faulting descriptor as complete */
> >         *ioat_chan->completion = desc->txd.phys;
> > 
> >         spin_lock_bh(&ioat_chan->prep_lock);
> > +       /* we need abort all descriptors */
> > +       if (abort) {
> > +               ioat_abort_descs(ioat_chan);
> > +               /* clean up the channel, we could be in weird state
> > */
> > +               ioat_reset_hw(ioat_chan);
> > +       }
> > +
> > +       writel(chanerr, ioat_chan->reg_base + IOAT_CHANERR_OFFSET);
> > +       pci_write_config_dword(pdev, IOAT_PCI_CHANERR_INT_OFFSET,
> > chanerr_int);
> > +
> >         ioat_restart_channel(ioat_chan);
> >         spin_unlock_bh(&ioat_chan->prep_lock);
> >  }
> > @@ -753,10 +829,25 @@ void ioat_timer_event(unsigned long data)
> >                 chanerr = readl(ioat_chan->reg_base +
> > IOAT_CHANERR_OFFSET);
> >                 dev_err(to_dev(ioat_chan), "%s: Channel halted
> > (%x)\n",
> >                         __func__, chanerr);
> > -               if (test_bit(IOAT_RUN, &ioat_chan->state))
> > -                       BUG_ON(is_ioat_bug(chanerr));
> > -               else /* we never got off the ground */
> > -                       return;
> > +               if (test_bit(IOAT_RUN, &ioat_chan->state)) {
> > +                       spin_lock_bh(&ioat_chan->cleanup_lock);
> > +                       spin_lock_bh(&ioat_chan->prep_lock);
> > +                       set_bit(IOAT_CHAN_DOWN, &ioat_chan->state);
> > +                       spin_unlock_bh(&ioat_chan->prep_lock);
> > +
> > +                       ioat_abort_descs(ioat_chan);
> > +                       dev_warn(to_dev(ioat_chan), "Reset
> > channel...\n");
> > +                       ioat_reset_hw(ioat_chan);
> > +                       dev_warn(to_dev(ioat_chan), "Restart
> > channel...\n");
> > +                       ioat_restart_channel(ioat_chan);
> > +
> > +                       spin_lock_bh(&ioat_chan->prep_lock);
> > +                       clear_bit(IOAT_CHAN_DOWN, &ioat_chan-
> > >state);
> > +                       spin_unlock_bh(&ioat_chan->prep_lock);
> > +                       spin_unlock_bh(&ioat_chan->cleanup_lock);
> > +               }
> > +
> > +               return;
> >         }
> > 
> >         spin_lock_bh(&ioat_chan->cleanup_lock);
> > @@ -780,14 +871,23 @@ void ioat_timer_event(unsigned long data)
> >                 u32 chanerr;
> > 
> >                 chanerr = readl(ioat_chan->reg_base +
> > IOAT_CHANERR_OFFSET);
> > -               dev_warn(to_dev(ioat_chan), "Restarting
> > channel...\n");
> >                 dev_warn(to_dev(ioat_chan), "CHANSTS: %#Lx CHANERR:
> > %#x\n",
> >                          status, chanerr);
> >                 dev_warn(to_dev(ioat_chan), "Active descriptors:
> > %d\n",
> >                          ioat_ring_active(ioat_chan));
> > 
> >                 spin_lock_bh(&ioat_chan->prep_lock);
> > +               set_bit(IOAT_CHAN_DOWN, &ioat_chan->state);
> > +               spin_unlock_bh(&ioat_chan->prep_lock);
> > +
> > +               ioat_abort_descs(ioat_chan);
> > +               dev_warn(to_dev(ioat_chan), "Resetting
> > channel...\n");
> > +               ioat_reset_hw(ioat_chan);
> > +               dev_warn(to_dev(ioat_chan), "Restarting
> > channel...\n");
> >                 ioat_restart_channel(ioat_chan);
> > +
> > +               spin_lock_bh(&ioat_chan->prep_lock);
> > +               clear_bit(IOAT_CHAN_DOWN, &ioat_chan->state);
> >                 spin_unlock_bh(&ioat_chan->prep_lock);
> >                 spin_unlock_bh(&ioat_chan->cleanup_lock);
> >                 return;
> > diff --git a/drivers/dma/ioat/registers.h
> > b/drivers/dma/ioat/registers.h
> > index 4994a36..faf20fa 100644
> > --- a/drivers/dma/ioat/registers.h
> > +++ b/drivers/dma/ioat/registers.h
> > @@ -233,6 +233,8 @@
> >  #define IOAT_CHANERR_DESCRIPTOR_COUNT_ERR      0x40000
> > 
> >  #define IOAT_CHANERR_HANDLE_MASK (IOAT_CHANERR_XOR_P_OR_CRC_ERR |
> > IOAT_CHANERR_XOR_Q_ERR)
> > +#define IOAT_CHANERR_RECOVER_MASK (IOAT_CHANERR_READ_DATA_ERR | \
> > +                                  IOAT_CHANERR_WRITE_DATA_ERR)
> > 
> >  #define IOAT_CHANERR_MASK_OFFSET               0x2C    /* 32-bit
> > Channel Error Register */
> > 
> > 
> 
> -- 

  reply	other threads:[~2016-06-28 16:12 UTC|newest]

Thread overview: 10+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2016-06-27 23:09 [PATCH 0/5] dmaengine: ioatdma: DMA error reporting Dave Jiang
2016-06-27 23:09 ` [PATCH 1/5] dmaengine: Adding error handling flag Dave Jiang
2016-06-27 23:09 ` [PATCH 2/5] dmaengine: IOATDMA: Add error handling to driver Dave Jiang
2016-06-28  1:08   ` Allen Hubbe
2016-06-28 16:11     ` Jiang, Dave [this message]
2016-06-27 23:09 ` [PATCH 3/5] dmaengine: ioatdma: add error strings to chanerr output Dave Jiang
2016-06-27 23:09 ` [PATCH 4/5] ntb: add DMA error handling for TX DMA Dave Jiang
2016-06-28  1:53   ` Allen Hubbe
2016-06-28 16:15     ` Jiang, Dave
2016-06-27 23:10 ` [PATCH 5/5] ntb: add DMA error handling for RX DMA Dave Jiang

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=1467130291.3754.8.camel@intel.com \
    --to=dave.jiang@intel.com \
    --cc=allen.hubbe@emc.com \
    --cc=allenbh@gmail.com \
    --cc=dan.j.williams@intel.com \
    --cc=dmaengine@vger.kernel.org \
    --cc=jdmason@kudzu.us \
    --cc=linux-ntb@googlegroups.com \
    --cc=vinod.koul@intel.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.