* [PATCHv2] nvme: ANA transition timeout handling
@ 2018-06-08 12:13 Hannes Reinecke
2018-06-11 14:22 ` Christoph Hellwig
0 siblings, 1 reply; 3+ messages in thread
From: Hannes Reinecke @ 2018-06-08 12:13 UTC (permalink / raw)
Turn the ana_state array into an array of ana groups, and add
a timer to each group for tracking ANA transition timeout.
Once the timeout expires the controller will be reset.
Signed-off-by: Hannes Reinecke <hare at suse.com>
---
drivers/nvme/host/core.c | 2 ++
drivers/nvme/host/multipath.c | 78 ++++++++++++++++++++++++++++++++++---------
drivers/nvme/host/nvme.h | 14 +++++++-
3 files changed, 77 insertions(+), 17 deletions(-)
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index e62de51209b2..c2362e81619a 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -114,6 +114,7 @@ int nvme_reset_ctrl(struct nvme_ctrl *ctrl)
{
if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING))
return -EBUSY;
+ nvme_stop_anatt(ctrl);
if (!queue_work(nvme_reset_wq, &ctrl->reset_work))
return -EBUSY;
return 0;
@@ -2378,6 +2379,7 @@ int nvme_init_identify(struct nvme_ctrl *ctrl)
ctrl->kas = le16_to_cpu(id->kas);
ctrl->max_namespaces = le32_to_cpu(id->mnan);
ctrl->anacap = id->anacap;
+ ctrl->anatt = id->anatt;
ctrl->nanagrpid = le32_to_cpu(id->nanagrpid);
ctrl->anagrpmax = le32_to_cpu(id->anagrpmax);
diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
index 67809e4a1752..b14026e41cac 100644
--- a/drivers/nvme/host/multipath.c
+++ b/drivers/nvme/host/multipath.c
@@ -44,7 +44,7 @@ void nvme_set_disk_name(char *disk_name, struct nvme_ns *ns,
static void nvme_update_ana_state(struct nvme_ns *ns, enum nvme_ana_state state)
{
- WRITE_ONCE(ns->ctrl->ana_state[ns->anagrpid], state);
+ WRITE_ONCE(ns->ctrl->ana_groups[ns->anagrpid].state, state);
}
void nvme_failover_req(struct request *req)
@@ -64,13 +64,17 @@ void nvme_failover_req(struct request *req)
*/
switch (status) {
case NVME_SC_ANA_TRANSITION:
- /*
- * XXX: We should verify the controller doesn't die on during
- * the transition. But that means we per-group timeout from
- * when we first hit the change state, so this won't be
- * entirely trivial..
- */
nvme_update_ana_state(ns, NVME_ANA_CHANGE);
+ if (ns->ctrl->state == NVME_CTRL_LIVE) {
+ struct nvme_ana_group *grp =
+ &ns->ctrl->ana_groups[ns->anagrpid];
+ /*
+ * Use timer_reduce() to ensure we're not modifying
+ * an already running timer.
+ */
+ timer_reduce(&grp->anatt_timer,
+ ns->ctrl->anatt * HZ + jiffies);
+ }
break;
case NVME_SC_ANA_PERSISTENT_LOSS:
nvme_update_ana_state(ns, NVME_ANA_PERSISTENT_LOSS);
@@ -104,7 +108,7 @@ static inline enum nvme_ana_state nvme_ns_ana_state(struct nvme_ns *ns)
return NVME_ANA_OPTIMIZED;
if (WARN_ON_ONCE(ns->anagrpid > ns->ctrl->anagrpmax))
return 0;
- return READ_ONCE(ns->ctrl->ana_state[ns->anagrpid]);
+ return READ_ONCE(ns->ctrl->ana_groups[ns->anagrpid].state);
}
static const char *nvme_ana_state_names[] = {
@@ -351,7 +355,7 @@ static int nvme_process_ana_log(struct nvme_ctrl *ctrl, bool groups_only)
dev_info(ctrl->device, "ANA group %d: %s.\n",
grpid, nvme_ana_state_names[desc->state]);
- WRITE_ONCE(ctrl->ana_state[grpid], desc->state);
+ WRITE_ONCE(ctrl->ana_groups[grpid].state, desc->state);
offset += sizeof(*desc);
if (!nr_nsids)
continue;
@@ -395,14 +399,48 @@ static int nvme_process_ana_log(struct nvme_ctrl *ctrl, bool groups_only)
static void nvme_ana_work(struct work_struct *work)
{
struct nvme_ctrl *ctrl = container_of(work, struct nvme_ctrl, ana_work);
+ u32 grpid;
nvme_process_ana_log(ctrl, false);
+ for (grpid = 1; grpid < ctrl->anagrpmax; grpid++) {
+ struct nvme_ana_group *grp = &ctrl->ana_groups[grpid];
+ enum nvme_ana_state state = READ_ONCE(grp->state);
+
+ if (ctrl->state == NVME_CTRL_LIVE && state == NVME_ANA_CHANGE)
+ timer_reduce(&grp->anatt_timer,
+ ctrl->anatt * HZ + jiffies);
+ else
+ del_timer(&grp->anatt_timer);
+ }
nvme_kick_requeue_lists(ctrl);
}
+void nvme_anatt_timedout(struct timer_list *t)
+{
+ struct nvme_ana_group *grp = from_timer(grp, t, anatt_timer);
+
+ if (grp->ctrl->state != NVME_CTRL_LIVE)
+ return;
+ dev_info(grp->ctrl->device, "ANA group %d: ANATT timeout, resetting\n",
+ grp->grpid);
+ nvme_reset_ctrl(grp->ctrl);
+}
+
+void nvme_stop_anatt(struct nvme_ctrl *ctrl)
+{
+ u32 grpid;
+
+ for (grpid = 0; grpid < ctrl->anagrpmax; grpid++) {
+ struct nvme_ana_group *grp = &ctrl->ana_groups[grpid];
+
+ del_timer(&grp->anatt_timer);
+ }
+}
+
int nvme_configure_ana(struct nvme_ctrl *ctrl)
{
int error;
+ u32 grpid;
if (!nvme_ctrl_has_ana(ctrl))
return 0;
@@ -422,14 +460,21 @@ int nvme_configure_ana(struct nvme_ctrl *ctrl)
}
INIT_WORK(&ctrl->ana_work, nvme_ana_work);
- ctrl->ana_state = kcalloc(ctrl->anagrpmax, sizeof(*ctrl->ana_state),
- GFP_KERNEL);
- if (!ctrl->ana_state)
+ ctrl->ana_groups = kcalloc(ctrl->anagrpmax, sizeof(*ctrl->ana_groups),
+ GFP_KERNEL);
+ if (!ctrl->ana_groups)
return -ENOMEM;
+ for (grpid = 1; grpid < ctrl->anagrpmax; grpid++) {
+ struct nvme_ana_group *grp = &ctrl->ana_groups[grpid];
+ grp->grpid = grpid;
+ grp->ctrl = ctrl;
+ timer_setup(&grp->anatt_timer, nvme_anatt_timedout, 0);
+ }
+
ctrl->ana_log_buf = kmalloc(ctrl->ana_log_size, GFP_KERNEL);
if (!ctrl->ana_log_buf)
- goto out_free_ana_state;
+ goto out_free_ana_groups;
error = nvme_process_ana_log(ctrl, true);
if (error)
@@ -437,15 +482,16 @@ int nvme_configure_ana(struct nvme_ctrl *ctrl)
return 0;
out_free_ana_log_buf:
kfree(ctrl->ana_log_buf);
-out_free_ana_state:
- kfree(ctrl->ana_state);
+out_free_ana_groups:
+ kfree(ctrl->ana_groups);
return -ENOMEM;
}
void nvme_deconfigure_ana(struct nvme_ctrl *ctrl)
{
+ nvme_stop_anatt(ctrl);
kfree(ctrl->ana_log_buf);
- kfree(ctrl->ana_state);
+ kfree(ctrl->ana_groups);
}
static ssize_t ana_grpid_show(struct device *dev, struct device_attribute *attr,
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index be2585576bad..7d909d1f4843 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -119,6 +119,13 @@ static inline struct nvme_request *nvme_req(struct request *req)
return blk_mq_rq_to_pdu(req);
}
+struct nvme_ana_group {
+ u32 grpid;
+ struct nvme_ctrl *ctrl;
+ enum nvme_ana_state state;
+ struct timer_list anatt_timer;
+};
+
/* The below value is the specific amount of delay needed before checking
* readiness in case of the PCI_DEVICE(0x1c58, 0x0003), which needs the
* NVME_QUIRK_DELAY_BEFORE_CHK_RDY quirk enabled. The value (in ms) was
@@ -200,9 +207,10 @@ struct nvme_ctrl {
/* asymmetric namespace access: */
u8 anacap;
+ u8 anatt;
u32 anagrpmax;
u32 nanagrpid;
- enum nvme_ana_state *ana_state;
+ struct nvme_ana_group *ana_groups;
size_t ana_log_size;
struct nvme_ana_rsp_hdr *ana_log_buf;
struct work_struct ana_work;
@@ -469,6 +477,7 @@ void nvme_mpath_add_disk(struct nvme_ns *ns);
void nvme_mpath_remove_disk(struct nvme_ns_head *head);
int nvme_configure_ana(struct nvme_ctrl *ctrl);
void nvme_deconfigure_ana(struct nvme_ctrl *ctrl);
+void nvme_stop_anatt(struct nvme_ctrl *ctrl);
static inline void nvme_mpath_clear_current_path(struct nvme_ns *ns)
{
@@ -531,6 +540,9 @@ static inline int nvme_configure_ana(struct nvme_ctrl *ctrl)
static inline void nvme_deconfigure_ana(struct nvme_ctrl *ctrl)
{
}
+static inline void nvme_stop_anatt(struct nvme_ctrl *ctrl)
+{
+}
#endif /* CONFIG_NVME_MULTIPATH */
#ifdef CONFIG_NVM
--
2.12.3
^ permalink raw reply related [flat|nested] 3+ messages in thread* [PATCHv2] nvme: ANA transition timeout handling
2018-06-08 12:13 [PATCHv2] nvme: ANA transition timeout handling Hannes Reinecke
@ 2018-06-11 14:22 ` Christoph Hellwig
2018-06-11 15:51 ` Hannes Reinecke
0 siblings, 1 reply; 3+ messages in thread
From: Christoph Hellwig @ 2018-06-11 14:22 UTC (permalink / raw)
> + /*
> + * Use timer_reduce() to ensure we're not modifying
> + * an already running timer.
> + */
> + timer_reduce(&grp->anatt_timer,
> + ns->ctrl->anatt * HZ + jiffies);
That's a bit of a gross hack for a timer that never is actually
reduced, just added.
> + if (ctrl->state == NVME_CTRL_LIVE && state == NVME_ANA_CHANGE)
> + timer_reduce(&grp->anatt_timer,
> + ctrl->anatt * HZ + jiffies);
> + else
> + del_timer(&grp->anatt_timer);
> + for (grpid = 0; grpid < ctrl->anagrpmax; grpid++) {
> + struct nvme_ana_group *grp = &ctrl->ana_groups[grpid];
> +
> + del_timer(&grp->anatt_timer);
Without del_timer_sync you are going to create use after free
conditions.
Anyway, v4 of the ANA series I've just sent contains simple ANATT
handling based on a per-controller timer, which should be sufficient
to catch the intent behind ANATT. Comments welcome.
^ permalink raw reply [flat|nested] 3+ messages in thread* [PATCHv2] nvme: ANA transition timeout handling
2018-06-11 14:22 ` Christoph Hellwig
@ 2018-06-11 15:51 ` Hannes Reinecke
0 siblings, 0 replies; 3+ messages in thread
From: Hannes Reinecke @ 2018-06-11 15:51 UTC (permalink / raw)
On 06/11/2018 04:22 PM, Christoph Hellwig wrote:
>> + /*
>> + * Use timer_reduce() to ensure we're not modifying
>> + * an already running timer.
>> + */
>> + timer_reduce(&grp->anatt_timer,
>> + ns->ctrl->anatt * HZ + jiffies);
>
> That's a bit of a gross hack for a timer that never is actually
> reduced, just added.
>
Well, yes; but using timer_reduce allows me to use the timer code
without having to take a lock; if I would've used add_timer() I always
would have to have two statements (one for setting the timer, and on
call to add_timer) which from my understanding would need to be
protected by a lock.
>> + if (ctrl->state == NVME_CTRL_LIVE && state == NVME_ANA_CHANGE)
>> + timer_reduce(&grp->anatt_timer,
>> + ctrl->anatt * HZ + jiffies);
>> + else
>> + del_timer(&grp->anatt_timer);
>
>
>> + for (grpid = 0; grpid < ctrl->anagrpmax; grpid++) {
>> + struct nvme_ana_group *grp = &ctrl->ana_groups[grpid];
>> +
>> + del_timer(&grp->anatt_timer);
>
> Without del_timer_sync you are going to create use after free
> conditions.
>
Hmm. Possibly.
Will be looking into it if
>
> Anyway, v4 of the ANA series I've just sent contains simple ANATT
> handling based on a per-controller timer, which should be sufficient
> to catch the intent behind ANATT. Comments welcome.
> Actually, I looked into moving the timer handling into the controller, too.
But then the problem was that we'll catch only the most current
transition; if we have two namespaces, both going into the 'change
state' mode independently (as would be the case with our nvmet, if both
namespaces are configured on different ANA groups), we would only able
to register the most current transition.
If we have a scenario like
1. NS A: ANA change state
2. NS A: ANA AEN
-> start ANATT
3. NS B: ANA change state
4. NS B: ANA AEN
-> not starting ANATT as it's already running
5. NS A: final state
6. NS A: ANA AEN
-> kill ANATT
we will start ANATT for NS B (with the full transition timeout) only
after 6., failing to reduce the timer by the time spent between 4. and 6.
If that's not an issue, okay, we can use that approach.
I just didn't dare so, knowing as it has design flaws.
Cheers,
Hannes
^ permalink raw reply [flat|nested] 3+ messages in thread
end of thread, other threads:[~2018-06-11 15:51 UTC | newest]
Thread overview: 3+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2018-06-08 12:13 [PATCHv2] nvme: ANA transition timeout handling Hannes Reinecke
2018-06-11 14:22 ` Christoph Hellwig
2018-06-11 15:51 ` Hannes Reinecke
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.