* [PATCH V1 vfio 1/5] net/mlx5: Add the IFC related bits for query tracker
2024-02-05 12:48 [PATCH V1 vfio 0/5] Improve mlx5 driver to better handle some error cases Yishai Hadas
@ 2024-02-05 12:48 ` Yishai Hadas
2024-02-05 12:48 ` [PATCH V1 vfio 2/5] vfio/mlx5: Add support for tracker object change event Yishai Hadas
` (5 subsequent siblings)
6 siblings, 0 replies; 13+ messages in thread
From: Yishai Hadas @ 2024-02-05 12:48 UTC (permalink / raw)
To: alex.williamson, jgg
Cc: kvm, kevin.tian, joao.m.martins, leonro, yishaih, maorg
Add the IFC related bits for query tracker.
Signed-off-by: Yishai Hadas <yishaih@nvidia.com>
---
include/linux/mlx5/mlx5_ifc.h | 5 +++++
1 file changed, 5 insertions(+)
diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index 6f3631425f38..cb08b5e36c21 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -12619,6 +12619,11 @@ struct mlx5_ifc_modify_page_track_obj_in_bits {
struct mlx5_ifc_page_track_bits obj_context;
};
+struct mlx5_ifc_query_page_track_obj_out_bits {
+ struct mlx5_ifc_general_obj_out_cmd_hdr_bits general_obj_out_cmd_hdr;
+ struct mlx5_ifc_page_track_bits obj_context;
+};
+
struct mlx5_ifc_msecq_reg_bits {
u8 reserved_at_0[0x20];
--
2.18.1
^ permalink raw reply related [flat|nested] 13+ messages in thread* [PATCH V1 vfio 2/5] vfio/mlx5: Add support for tracker object change event
2024-02-05 12:48 [PATCH V1 vfio 0/5] Improve mlx5 driver to better handle some error cases Yishai Hadas
2024-02-05 12:48 ` [PATCH V1 vfio 1/5] net/mlx5: Add the IFC related bits for query tracker Yishai Hadas
@ 2024-02-05 12:48 ` Yishai Hadas
2024-02-05 12:48 ` [PATCH V1 vfio 3/5] vfio/mlx5: Handle the EREMOTEIO error upon the SAVE command Yishai Hadas
` (4 subsequent siblings)
6 siblings, 0 replies; 13+ messages in thread
From: Yishai Hadas @ 2024-02-05 12:48 UTC (permalink / raw)
To: alex.williamson, jgg
Cc: kvm, kevin.tian, joao.m.martins, leonro, yishaih, maorg
Add support for the tracker object change event by handling the
MLX5_EVENT_TYPE_OBJECT_CHANGE event when it occurs.
This lets the driver recognize whether the firmware moved the tracker
object to an error state.
In that case, the driver will skip/block any usage of that object
including an early exit in case the object was previously marked with an
error.
This functionality also covers the case when no CQE is delivered because of
the error state.
The driver was adapted to the device specification to handle the above.
Signed-off-by: Yishai Hadas <yishaih@nvidia.com>
---
drivers/vfio/pci/mlx5/cmd.c | 48 +++++++++++++++++++++++++++++++++++++
drivers/vfio/pci/mlx5/cmd.h | 1 +
2 files changed, 49 insertions(+)
diff --git a/drivers/vfio/pci/mlx5/cmd.c b/drivers/vfio/pci/mlx5/cmd.c
index efd1d252cdc9..8a39ff19da28 100644
--- a/drivers/vfio/pci/mlx5/cmd.c
+++ b/drivers/vfio/pci/mlx5/cmd.c
@@ -149,6 +149,12 @@ int mlx5vf_cmd_query_vhca_migration_state(struct mlx5vf_pci_core_device *mvdev,
return 0;
}
+static void set_tracker_change_event(struct mlx5vf_pci_core_device *mvdev)
+{
+ mvdev->tracker.object_changed = true;
+ complete(&mvdev->tracker_comp);
+}
+
static void set_tracker_error(struct mlx5vf_pci_core_device *mvdev)
{
/* Mark the tracker under an error and wake it up if it's running */
@@ -900,6 +906,29 @@ static int mlx5vf_cmd_modify_tracker(struct mlx5_core_dev *mdev,
return mlx5_cmd_exec(mdev, in, sizeof(in), out, sizeof(out));
}
+static int mlx5vf_cmd_query_tracker(struct mlx5_core_dev *mdev,
+ struct mlx5_vhca_page_tracker *tracker)
+{
+ u32 out[MLX5_ST_SZ_DW(query_page_track_obj_out)] = {};
+ u32 in[MLX5_ST_SZ_DW(general_obj_in_cmd_hdr)] = {};
+ void *obj_context;
+ void *cmd_hdr;
+ int err;
+
+ cmd_hdr = MLX5_ADDR_OF(modify_page_track_obj_in, in, general_obj_in_cmd_hdr);
+ MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, opcode, MLX5_CMD_OP_QUERY_GENERAL_OBJECT);
+ MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_type, MLX5_OBJ_TYPE_PAGE_TRACK);
+ MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_id, tracker->id);
+
+ err = mlx5_cmd_exec(mdev, in, sizeof(in), out, sizeof(out));
+ if (err)
+ return err;
+
+ obj_context = MLX5_ADDR_OF(query_page_track_obj_out, out, obj_context);
+ tracker->status = MLX5_GET(page_track, obj_context, state);
+ return 0;
+}
+
static int alloc_cq_frag_buf(struct mlx5_core_dev *mdev,
struct mlx5_vhca_cq_buf *buf, int nent,
int cqe_size)
@@ -957,9 +986,11 @@ static int mlx5vf_event_notifier(struct notifier_block *nb, unsigned long type,
mlx5_nb_cof(nb, struct mlx5_vhca_page_tracker, nb);
struct mlx5vf_pci_core_device *mvdev = container_of(
tracker, struct mlx5vf_pci_core_device, tracker);
+ struct mlx5_eqe_obj_change *object;
struct mlx5_eqe *eqe = data;
u8 event_type = (u8)type;
u8 queue_type;
+ u32 obj_id;
int qp_num;
switch (event_type) {
@@ -975,6 +1006,12 @@ static int mlx5vf_event_notifier(struct notifier_block *nb, unsigned long type,
break;
set_tracker_error(mvdev);
break;
+ case MLX5_EVENT_TYPE_OBJECT_CHANGE:
+ object = &eqe->data.obj_change;
+ obj_id = be32_to_cpu(object->obj_id);
+ if (obj_id == tracker->id)
+ set_tracker_change_event(mvdev);
+ break;
default:
break;
}
@@ -1634,6 +1671,11 @@ int mlx5vf_tracker_read_and_clear(struct vfio_device *vdev, unsigned long iova,
goto end;
}
+ if (tracker->is_err) {
+ err = -EIO;
+ goto end;
+ }
+
mdev = mvdev->mdev;
err = mlx5vf_cmd_modify_tracker(mdev, tracker->id, iova, length,
MLX5_PAGE_TRACK_STATE_REPORTING);
@@ -1652,6 +1694,12 @@ int mlx5vf_tracker_read_and_clear(struct vfio_device *vdev, unsigned long iova,
dirty, &tracker->status);
if (poll_err == CQ_EMPTY) {
wait_for_completion(&mvdev->tracker_comp);
+ if (tracker->object_changed) {
+ tracker->object_changed = false;
+ err = mlx5vf_cmd_query_tracker(mdev, tracker);
+ if (err)
+ goto end;
+ }
continue;
}
}
diff --git a/drivers/vfio/pci/mlx5/cmd.h b/drivers/vfio/pci/mlx5/cmd.h
index f2c7227fa683..0d6a2db3d801 100644
--- a/drivers/vfio/pci/mlx5/cmd.h
+++ b/drivers/vfio/pci/mlx5/cmd.h
@@ -162,6 +162,7 @@ struct mlx5_vhca_page_tracker {
u32 id;
u32 pdn;
u8 is_err:1;
+ u8 object_changed:1;
struct mlx5_uars_page *uar;
struct mlx5_vhca_cq cq;
struct mlx5_vhca_qp *host_qp;
--
2.18.1
^ permalink raw reply related [flat|nested] 13+ messages in thread* [PATCH V1 vfio 3/5] vfio/mlx5: Handle the EREMOTEIO error upon the SAVE command
2024-02-05 12:48 [PATCH V1 vfio 0/5] Improve mlx5 driver to better handle some error cases Yishai Hadas
2024-02-05 12:48 ` [PATCH V1 vfio 1/5] net/mlx5: Add the IFC related bits for query tracker Yishai Hadas
2024-02-05 12:48 ` [PATCH V1 vfio 2/5] vfio/mlx5: Add support for tracker object change event Yishai Hadas
@ 2024-02-05 12:48 ` Yishai Hadas
2024-02-05 12:48 ` [PATCH V1 vfio 4/5] vfio/mlx5: Block incremental query upon migf state error Yishai Hadas
` (3 subsequent siblings)
6 siblings, 0 replies; 13+ messages in thread
From: Yishai Hadas @ 2024-02-05 12:48 UTC (permalink / raw)
To: alex.williamson, jgg
Cc: kvm, kevin.tian, joao.m.martins, leonro, yishaih, maorg
The SAVE command uses the async command interface over the PF.
Upon a failure in the firmware, -EREMOTEIO is returned.
In that case call mlx5_cmd_out_err() to let it print the command failure
details including the firmware syndrome.
Note:
The other commands in the driver use the sync command interface in a way
that a firmware syndrome is printed upon an error inside mlx5_core.
Signed-off-by: Yishai Hadas <yishaih@nvidia.com>
---
drivers/vfio/pci/mlx5/cmd.c | 7 ++++++-
1 file changed, 6 insertions(+), 1 deletion(-)
diff --git a/drivers/vfio/pci/mlx5/cmd.c b/drivers/vfio/pci/mlx5/cmd.c
index 8a39ff19da28..6b45bd7d89ad 100644
--- a/drivers/vfio/pci/mlx5/cmd.c
+++ b/drivers/vfio/pci/mlx5/cmd.c
@@ -614,8 +614,13 @@ static void mlx5vf_save_callback(int status, struct mlx5_async_work *context)
err:
/* The error flow can't run from an interrupt context */
- if (status == -EREMOTEIO)
+ if (status == -EREMOTEIO) {
status = MLX5_GET(save_vhca_state_out, async_data->out, status);
+ /* Failed in FW, print cmd out failure details */
+ mlx5_cmd_out_err(migf->mvdev->mdev, MLX5_CMD_OP_SAVE_VHCA_STATE, 0,
+ async_data->out);
+ }
+
async_data->status = status;
queue_work(migf->mvdev->cb_wq, &async_data->work);
}
--
2.18.1
^ permalink raw reply related [flat|nested] 13+ messages in thread* [PATCH V1 vfio 4/5] vfio/mlx5: Block incremental query upon migf state error
2024-02-05 12:48 [PATCH V1 vfio 0/5] Improve mlx5 driver to better handle some error cases Yishai Hadas
` (2 preceding siblings ...)
2024-02-05 12:48 ` [PATCH V1 vfio 3/5] vfio/mlx5: Handle the EREMOTEIO error upon the SAVE command Yishai Hadas
@ 2024-02-05 12:48 ` Yishai Hadas
2024-02-05 12:48 ` [PATCH V1 vfio 5/5] vfio/mlx5: Let firmware knows upon leaving PRE_COPY back to RUNNING Yishai Hadas
` (2 subsequent siblings)
6 siblings, 0 replies; 13+ messages in thread
From: Yishai Hadas @ 2024-02-05 12:48 UTC (permalink / raw)
To: alex.williamson, jgg
Cc: kvm, kevin.tian, joao.m.martins, leonro, yishaih, maorg
Block the incremental query, which is state-dependent, once the migration
file has been marked with an error state.
This may prevent redundant calls to firmware upon PRE_COPY, which would
end up with a failure and a syndrome printed in dmesg.
Signed-off-by: Yishai Hadas <yishaih@nvidia.com>
---
drivers/vfio/pci/mlx5/cmd.c | 5 +++++
1 file changed, 5 insertions(+)
diff --git a/drivers/vfio/pci/mlx5/cmd.c b/drivers/vfio/pci/mlx5/cmd.c
index 6b45bd7d89ad..6800e4ffe9ee 100644
--- a/drivers/vfio/pci/mlx5/cmd.c
+++ b/drivers/vfio/pci/mlx5/cmd.c
@@ -121,6 +121,11 @@ int mlx5vf_cmd_query_vhca_migration_state(struct mlx5vf_pci_core_device *mvdev,
}
query_flags &= ~MLX5VF_QUERY_INC;
}
+ /* Block incremental query which is state-dependent */
+ if (mvdev->saving_migf->state == MLX5_MIGF_STATE_ERROR) {
+ complete(&mvdev->saving_migf->save_comp);
+ return -ENODEV;
+ }
}
MLX5_SET(query_vhca_migration_state_in, in, opcode,
--
2.18.1
^ permalink raw reply related [flat|nested] 13+ messages in thread* [PATCH V1 vfio 5/5] vfio/mlx5: Let firmware knows upon leaving PRE_COPY back to RUNNING
2024-02-05 12:48 [PATCH V1 vfio 0/5] Improve mlx5 driver to better handle some error cases Yishai Hadas
` (3 preceding siblings ...)
2024-02-05 12:48 ` [PATCH V1 vfio 4/5] vfio/mlx5: Block incremental query upon migf state error Yishai Hadas
@ 2024-02-05 12:48 ` Yishai Hadas
2024-02-06 7:35 ` [PATCH V1 vfio 0/5] Improve mlx5 driver to better handle some error cases Tian, Kevin
2024-02-22 21:37 ` Alex Williamson
6 siblings, 0 replies; 13+ messages in thread
From: Yishai Hadas @ 2024-02-05 12:48 UTC (permalink / raw)
To: alex.williamson, jgg
Cc: kvm, kevin.tian, joao.m.martins, leonro, yishaih, maorg
Let the firmware know upon leaving PRE_COPY back to RUNNING because of an
error in the target or a migration cancellation.
This will let the firmware clean up its internal resources that were turned
on upon PRE_COPY.
The flow is based on the device specification in this area.
Signed-off-by: Yishai Hadas <yishaih@nvidia.com>
---
drivers/vfio/pci/mlx5/cmd.c | 14 +++++++++----
drivers/vfio/pci/mlx5/cmd.h | 4 +++-
drivers/vfio/pci/mlx5/main.c | 39 +++++++++++++++++++++++++++++-------
3 files changed, 45 insertions(+), 12 deletions(-)
diff --git a/drivers/vfio/pci/mlx5/cmd.c b/drivers/vfio/pci/mlx5/cmd.c
index 6800e4ffe9ee..c54bcd5d0917 100644
--- a/drivers/vfio/pci/mlx5/cmd.c
+++ b/drivers/vfio/pci/mlx5/cmd.c
@@ -108,8 +108,9 @@ int mlx5vf_cmd_query_vhca_migration_state(struct mlx5vf_pci_core_device *mvdev,
ret = wait_for_completion_interruptible(&mvdev->saving_migf->save_comp);
if (ret)
return ret;
- if (mvdev->saving_migf->state ==
- MLX5_MIGF_STATE_PRE_COPY_ERROR) {
+ /* Upon cleanup, ignore previous pre_copy error state */
+ if (mvdev->saving_migf->state == MLX5_MIGF_STATE_PRE_COPY_ERROR &&
+ !(query_flags & MLX5VF_QUERY_CLEANUP)) {
/*
* In case we had a PRE_COPY error, only query full
* image for final image
@@ -200,7 +201,7 @@ void mlx5vf_cmd_close_migratable(struct mlx5vf_pci_core_device *mvdev)
/* Must be done outside the lock to let it progress */
set_tracker_error(mvdev);
mutex_lock(&mvdev->state_mutex);
- mlx5vf_disable_fds(mvdev);
+ mlx5vf_disable_fds(mvdev, NULL);
_mlx5vf_free_page_tracker_resources(mvdev);
mlx5vf_state_mutex_unlock(mvdev);
}
@@ -639,6 +640,7 @@ int mlx5vf_cmd_save_vhca_state(struct mlx5vf_pci_core_device *mvdev,
u32 in[MLX5_ST_SZ_DW(save_vhca_state_in)] = {};
struct mlx5_vhca_data_buffer *header_buf = NULL;
struct mlx5vf_async_data *async_data;
+ bool pre_copy_cleanup = false;
int err;
lockdep_assert_held(&mvdev->state_mutex);
@@ -649,6 +651,10 @@ int mlx5vf_cmd_save_vhca_state(struct mlx5vf_pci_core_device *mvdev,
if (err)
return err;
+ if ((migf->state == MLX5_MIGF_STATE_PRE_COPY ||
+ migf->state == MLX5_MIGF_STATE_PRE_COPY_ERROR) && !track && !inc)
+ pre_copy_cleanup = true;
+
if (migf->state == MLX5_MIGF_STATE_PRE_COPY_ERROR)
/*
* In case we had a PRE_COPY error, SAVE is triggered only for
@@ -667,7 +673,7 @@ int mlx5vf_cmd_save_vhca_state(struct mlx5vf_pci_core_device *mvdev,
async_data = &migf->async_data;
async_data->buf = buf;
- async_data->stop_copy_chunk = !track;
+ async_data->stop_copy_chunk = (!track && !pre_copy_cleanup);
async_data->out = kvzalloc(out_size, GFP_KERNEL);
if (!async_data->out) {
err = -ENOMEM;
diff --git a/drivers/vfio/pci/mlx5/cmd.h b/drivers/vfio/pci/mlx5/cmd.h
index 0d6a2db3d801..707393df36c4 100644
--- a/drivers/vfio/pci/mlx5/cmd.h
+++ b/drivers/vfio/pci/mlx5/cmd.h
@@ -197,6 +197,7 @@ struct mlx5vf_pci_core_device {
enum {
MLX5VF_QUERY_INC = (1UL << 0),
MLX5VF_QUERY_FINAL = (1UL << 1),
+ MLX5VF_QUERY_CLEANUP = (1UL << 2),
};
int mlx5vf_cmd_suspend_vhca(struct mlx5vf_pci_core_device *mvdev, u16 op_mod);
@@ -232,7 +233,8 @@ int mlx5vf_add_migration_pages(struct mlx5_vhca_data_buffer *buf,
struct page *mlx5vf_get_migration_page(struct mlx5_vhca_data_buffer *buf,
unsigned long offset);
void mlx5vf_state_mutex_unlock(struct mlx5vf_pci_core_device *mvdev);
-void mlx5vf_disable_fds(struct mlx5vf_pci_core_device *mvdev);
+void mlx5vf_disable_fds(struct mlx5vf_pci_core_device *mvdev,
+ enum mlx5_vf_migf_state *last_save_state);
void mlx5vf_mig_file_cleanup_cb(struct work_struct *_work);
void mlx5vf_mig_file_set_save_work(struct mlx5_vf_migration_file *migf,
u8 chunk_num, size_t next_required_umem_size);
diff --git a/drivers/vfio/pci/mlx5/main.c b/drivers/vfio/pci/mlx5/main.c
index fe09a8c8af95..3982fcf60cf2 100644
--- a/drivers/vfio/pci/mlx5/main.c
+++ b/drivers/vfio/pci/mlx5/main.c
@@ -1146,7 +1146,8 @@ mlx5vf_pci_resume_device_data(struct mlx5vf_pci_core_device *mvdev)
return ERR_PTR(ret);
}
-void mlx5vf_disable_fds(struct mlx5vf_pci_core_device *mvdev)
+void mlx5vf_disable_fds(struct mlx5vf_pci_core_device *mvdev,
+ enum mlx5_vf_migf_state *last_save_state)
{
if (mvdev->resuming_migf) {
mlx5vf_disable_fd(mvdev->resuming_migf);
@@ -1157,6 +1158,8 @@ void mlx5vf_disable_fds(struct mlx5vf_pci_core_device *mvdev)
if (mvdev->saving_migf) {
mlx5_cmd_cleanup_async_ctx(&mvdev->saving_migf->async_ctx);
cancel_work_sync(&mvdev->saving_migf->async_data.work);
+ if (last_save_state)
+ *last_save_state = mvdev->saving_migf->state;
mlx5vf_disable_fd(mvdev->saving_migf);
wake_up_interruptible(&mvdev->saving_migf->poll_wait);
mlx5fv_cmd_clean_migf_resources(mvdev->saving_migf);
@@ -1217,12 +1220,34 @@ mlx5vf_pci_step_device_state_locked(struct mlx5vf_pci_core_device *mvdev,
return migf->filp;
}
- if ((cur == VFIO_DEVICE_STATE_STOP_COPY && new == VFIO_DEVICE_STATE_STOP) ||
- (cur == VFIO_DEVICE_STATE_PRE_COPY && new == VFIO_DEVICE_STATE_RUNNING) ||
+ if (cur == VFIO_DEVICE_STATE_STOP_COPY && new == VFIO_DEVICE_STATE_STOP) {
+ mlx5vf_disable_fds(mvdev, NULL);
+ return NULL;
+ }
+
+ if ((cur == VFIO_DEVICE_STATE_PRE_COPY && new == VFIO_DEVICE_STATE_RUNNING) ||
(cur == VFIO_DEVICE_STATE_PRE_COPY_P2P &&
new == VFIO_DEVICE_STATE_RUNNING_P2P)) {
- mlx5vf_disable_fds(mvdev);
- return NULL;
+ struct mlx5_vf_migration_file *migf = mvdev->saving_migf;
+ struct mlx5_vhca_data_buffer *buf;
+ enum mlx5_vf_migf_state state;
+ size_t size;
+
+ ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &size, NULL,
+ MLX5VF_QUERY_INC | MLX5VF_QUERY_CLEANUP);
+ if (ret)
+ return ERR_PTR(ret);
+ buf = mlx5vf_get_data_buffer(migf, size, DMA_FROM_DEVICE);
+ if (IS_ERR(buf))
+ return ERR_CAST(buf);
+ /* pre_copy cleanup */
+ ret = mlx5vf_cmd_save_vhca_state(mvdev, migf, buf, false, false);
+ if (ret) {
+ mlx5vf_put_data_buffer(buf);
+ return ERR_PTR(ret);
+ }
+ mlx5vf_disable_fds(mvdev, &state);
+ return (state != MLX5_MIGF_STATE_ERROR) ? NULL : ERR_PTR(-EIO);
}
if (cur == VFIO_DEVICE_STATE_STOP && new == VFIO_DEVICE_STATE_RESUMING) {
@@ -1244,7 +1269,7 @@ mlx5vf_pci_step_device_state_locked(struct mlx5vf_pci_core_device *mvdev,
if (ret)
return ERR_PTR(ret);
}
- mlx5vf_disable_fds(mvdev);
+ mlx5vf_disable_fds(mvdev, NULL);
return NULL;
}
@@ -1289,7 +1314,7 @@ void mlx5vf_state_mutex_unlock(struct mlx5vf_pci_core_device *mvdev)
mvdev->deferred_reset = false;
spin_unlock(&mvdev->reset_lock);
mvdev->mig_state = VFIO_DEVICE_STATE_RUNNING;
- mlx5vf_disable_fds(mvdev);
+ mlx5vf_disable_fds(mvdev, NULL);
goto again;
}
mutex_unlock(&mvdev->state_mutex);
--
2.18.1
^ permalink raw reply related [flat|nested] 13+ messages in thread* RE: [PATCH V1 vfio 0/5] Improve mlx5 driver to better handle some error cases
2024-02-05 12:48 [PATCH V1 vfio 0/5] Improve mlx5 driver to better handle some error cases Yishai Hadas
` (4 preceding siblings ...)
2024-02-05 12:48 ` [PATCH V1 vfio 5/5] vfio/mlx5: Let firmware knows upon leaving PRE_COPY back to RUNNING Yishai Hadas
@ 2024-02-06 7:35 ` Tian, Kevin
2024-02-06 8:06 ` Yishai Hadas
2024-02-22 21:37 ` Alex Williamson
6 siblings, 1 reply; 13+ messages in thread
From: Tian, Kevin @ 2024-02-06 7:35 UTC (permalink / raw)
To: Yishai Hadas, alex.williamson@redhat.com, jgg@nvidia.com
Cc: kvm@vger.kernel.org, joao.m.martins@oracle.com, leonro@nvidia.com,
maorg@nvidia.com
> From: Yishai Hadas <yishaih@nvidia.com>
> Sent: Monday, February 5, 2024 8:48 PM
>
> This series improves the mlx5 driver to better handle some error cases
> as of below.
>
> The first two patches let the driver recognize whether the firmware
> moved the tracker object to an error state. In that case, the driver
> will skip/block any usage of that object.
>
> The next two patches (#3, #4), improve the driver to better include the
> proper firmware syndrome in dmesg upon a failure in some firmware
> commands.
>
> The last patch follows the device specification to let the firmware know
> upon leaving PRE_COPY back to RUNNING. (e.g. error in the target,
> migration cancellation, etc.).
>
> This will let the firmware clean its internal resources that were turned
> on upon PRE_COPY.
>
> Note:
> As the first patch should go to net/mlx5, we may need to send it as a
> pull request format to vfio before acceptance of the series, to avoid
> conflicts.
>
> Changes from V0: https://lore.kernel.org/kvm/20240130170227.153464-1-
> yishaih@nvidia.com/
> Patch #2:
> - Rename to use 'object changed' in some places to make it clearer.
> - Enhance the commit log to better clarify the usage/use case.
>
> The above was suggested by Tian, Kevin <kevin.tian@intel.com>.
>
this series looks good to me except a small remark on patch2:
Reviewed-by: Kevin Tian <kevin.tian@intel.com>
^ permalink raw reply [flat|nested] 13+ messages in thread* Re: [PATCH V1 vfio 0/5] Improve mlx5 driver to better handle some error cases
2024-02-06 7:35 ` [PATCH V1 vfio 0/5] Improve mlx5 driver to better handle some error cases Tian, Kevin
@ 2024-02-06 8:06 ` Yishai Hadas
2024-02-08 8:16 ` Yishai Hadas
0 siblings, 1 reply; 13+ messages in thread
From: Yishai Hadas @ 2024-02-06 8:06 UTC (permalink / raw)
To: Tian, Kevin, alex.williamson@redhat.com, jgg@nvidia.com
Cc: kvm@vger.kernel.org, joao.m.martins@oracle.com, leonro@nvidia.com,
maorg@nvidia.com
On 06/02/2024 9:35, Tian, Kevin wrote:
>> From: Yishai Hadas <yishaih@nvidia.com>
>> Sent: Monday, February 5, 2024 8:48 PM
>>
>> This series improves the mlx5 driver to better handle some error cases
>> as of below.
>>
>> The first two patches let the driver recognize whether the firmware
>> moved the tracker object to an error state. In that case, the driver
>> will skip/block any usage of that object.
>>
>> The next two patches (#3, #4), improve the driver to better include the
>> proper firmware syndrome in dmesg upon a failure in some firmware
>> commands.
>>
>> The last patch follows the device specification to let the firmware know
>> upon leaving PRE_COPY back to RUNNING. (e.g. error in the target,
>> migration cancellation, etc.).
>>
>> This will let the firmware clean its internal resources that were turned
>> on upon PRE_COPY.
>>
>> Note:
>> As the first patch should go to net/mlx5, we may need to send it as a
>> pull request format to vfio before acceptance of the series, to avoid
>> conflicts.
>>
>> Changes from V0: https://lore.kernel.org/kvm/20240130170227.153464-1-
>> yishaih@nvidia.com/
>> Patch #2:
>> - Rename to use 'object changed' in some places to make it clearer.
>> - Enhance the commit log to better clarify the usage/use case.
>>
>> The above was suggested by Tian, Kevin <kevin.tian@intel.com>.
>>
>
> this series looks good to me except a small remark on patch2:
We should be fine there, see my answer on V0.
>
> Reviewed-by: Kevin Tian <kevin.tian@intel.com>
Thanks Kevin, for your reviewed-by.
Yishai
^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [PATCH V1 vfio 0/5] Improve mlx5 driver to better handle some error cases
2024-02-06 8:06 ` Yishai Hadas
@ 2024-02-08 8:16 ` Yishai Hadas
2024-02-21 7:45 ` Yishai Hadas
0 siblings, 1 reply; 13+ messages in thread
From: Yishai Hadas @ 2024-02-08 8:16 UTC (permalink / raw)
To: alex.williamson@redhat.com, jgg@nvidia.com
Cc: kvm@vger.kernel.org, joao.m.martins@oracle.com, leonro@nvidia.com,
maorg@nvidia.com, Tian, Kevin
On 06/02/2024 10:06, Yishai Hadas wrote:
> On 06/02/2024 9:35, Tian, Kevin wrote:
>>> From: Yishai Hadas <yishaih@nvidia.com>
>>> Sent: Monday, February 5, 2024 8:48 PM
>>>
>>> This series improves the mlx5 driver to better handle some error cases
>>> as of below.
>>>
>>> The first two patches let the driver recognize whether the firmware
>>> moved the tracker object to an error state. In that case, the driver
>>> will skip/block any usage of that object.
>>>
>>> The next two patches (#3, #4), improve the driver to better include the
>>> proper firmware syndrome in dmesg upon a failure in some firmware
>>> commands.
>>>
>>> The last patch follows the device specification to let the firmware know
>>> upon leaving PRE_COPY back to RUNNING. (e.g. error in the target,
>>> migration cancellation, etc.).
>>>
>>> This will let the firmware clean its internal resources that were turned
>>> on upon PRE_COPY.
>>>
>>> Note:
>>> As the first patch should go to net/mlx5, we may need to send it as a
>>> pull request format to vfio before acceptance of the series, to avoid
>>> conflicts.
>>>
>>> Changes from V0: https://lore.kernel.org/kvm/20240130170227.153464-1-
>>> yishaih@nvidia.com/
>>> Patch #2:
>>> - Rename to use 'object changed' in some places to make it clearer.
>>> - Enhance the commit log to better clarify the usage/use case.
>>>
>>> The above was suggested by Tian, Kevin <kevin.tian@intel.com>.
>>>
>>
>> this series looks good to me except a small remark on patch2:
>
> We should be fine there, see my answer on V0.
>
>>
>> Reviewed-by: Kevin Tian <kevin.tian@intel.com>
>
> Thanks Kevin, for your reviewed-by.
>
> Yishai
>
Alex
Are we OK here to continue with a PR for the first patch ?
It seems that we should be fine here.
Thanks,
Yishai
^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [PATCH V1 vfio 0/5] Improve mlx5 driver to better handle some error cases
2024-02-08 8:16 ` Yishai Hadas
@ 2024-02-21 7:45 ` Yishai Hadas
2024-02-22 18:04 ` Alex Williamson
0 siblings, 1 reply; 13+ messages in thread
From: Yishai Hadas @ 2024-02-21 7:45 UTC (permalink / raw)
To: alex.williamson@redhat.com, jgg@nvidia.com
Cc: kvm@vger.kernel.org, joao.m.martins@oracle.com, leonro@nvidia.com,
maorg@nvidia.com, Tian, Kevin
On 08/02/2024 10:16, Yishai Hadas wrote:
> On 06/02/2024 10:06, Yishai Hadas wrote:
>> On 06/02/2024 9:35, Tian, Kevin wrote:
>>>> From: Yishai Hadas <yishaih@nvidia.com>
>>>> Sent: Monday, February 5, 2024 8:48 PM
>>>>
>>>> This series improves the mlx5 driver to better handle some error cases
>>>> as of below.
>>>>
>>>> The first two patches let the driver recognize whether the firmware
>>>> moved the tracker object to an error state. In that case, the driver
>>>> will skip/block any usage of that object.
>>>>
>>>> The next two patches (#3, #4), improve the driver to better include the
>>>> proper firmware syndrome in dmesg upon a failure in some firmware
>>>> commands.
>>>>
>>>> The last patch follows the device specification to let the firmware
>>>> know
>>>> upon leaving PRE_COPY back to RUNNING. (e.g. error in the target,
>>>> migration cancellation, etc.).
>>>>
>>>> This will let the firmware clean its internal resources that were
>>>> turned
>>>> on upon PRE_COPY.
>>>>
>>>> Note:
>>>> As the first patch should go to net/mlx5, we may need to send it as a
>>>> pull request format to vfio before acceptance of the series, to avoid
>>>> conflicts.
>>>>
>>>> Changes from V0: https://lore.kernel.org/kvm/20240130170227.153464-1-
>>>> yishaih@nvidia.com/
>>>> Patch #2:
>>>> - Rename to use 'object changed' in some places to make it clearer.
>>>> - Enhance the commit log to better clarify the usage/use case.
>>>>
>>>> The above was suggested by Tian, Kevin <kevin.tian@intel.com>.
>>>>
>>>
>>> this series looks good to me except a small remark on patch2:
>>
>> We should be fine there, see my answer on V0.
>>
>>>
>>> Reviewed-by: Kevin Tian <kevin.tian@intel.com>
>>
>> Thanks Kevin, for your reviewed-by.
>>
>> Yishai
>>
>
> Alex
>
> Are we OK here to continue with a PR for the first patch ?
>
> It seems that we should be fine here.
>
> Thanks,
> Yishai
>
Hi Alex,
Any update here ?
Thanks,
Yishai
^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [PATCH V1 vfio 0/5] Improve mlx5 driver to better handle some error cases
2024-02-21 7:45 ` Yishai Hadas
@ 2024-02-22 18:04 ` Alex Williamson
2024-02-22 18:33 ` Leon Romanovsky
0 siblings, 1 reply; 13+ messages in thread
From: Alex Williamson @ 2024-02-22 18:04 UTC (permalink / raw)
To: Yishai Hadas
Cc: jgg@nvidia.com, kvm@vger.kernel.org, joao.m.martins@oracle.com,
leonro@nvidia.com, maorg@nvidia.com, Tian, Kevin
On Wed, 21 Feb 2024 09:45:14 +0200
Yishai Hadas <yishaih@nvidia.com> wrote:
> On 08/02/2024 10:16, Yishai Hadas wrote:
> > On 06/02/2024 10:06, Yishai Hadas wrote:
> >> On 06/02/2024 9:35, Tian, Kevin wrote:
> >>>> From: Yishai Hadas <yishaih@nvidia.com>
> >>>> Sent: Monday, February 5, 2024 8:48 PM
> >>>>
> >>>> This series improves the mlx5 driver to better handle some error cases
> >>>> as of below.
> >>>>
> >>>> The first two patches let the driver recognize whether the firmware
> >>>> moved the tracker object to an error state. In that case, the driver
> >>>> will skip/block any usage of that object.
> >>>>
> >>>> The next two patches (#3, #4), improve the driver to better include the
> >>>> proper firmware syndrome in dmesg upon a failure in some firmware
> >>>> commands.
> >>>>
> >>>> The last patch follows the device specification to let the firmware
> >>>> know
> >>>> upon leaving PRE_COPY back to RUNNING. (e.g. error in the target,
> >>>> migration cancellation, etc.).
> >>>>
> >>>> This will let the firmware clean its internal resources that were
> >>>> turned
> >>>> on upon PRE_COPY.
> >>>>
> >>>> Note:
> >>>> As the first patch should go to net/mlx5, we may need to send it as a
> >>>> pull request format to vfio before acceptance of the series, to avoid
> >>>> conflicts.
> >>>>
> >>>> Changes from V0: https://lore.kernel.org/kvm/20240130170227.153464-1-
> >>>> yishaih@nvidia.com/
> >>>> Patch #2:
> >>>> - Rename to use 'object changed' in some places to make it clearer.
> >>>> - Enhance the commit log to better clarify the usage/use case.
> >>>>
> >>>> The above was suggested by Tian, Kevin <kevin.tian@intel.com>.
> >>>>
> >>>
> >>> this series looks good to me except a small remark on patch2:
> >>
> >> We should be fine there, see my answer on V0.
> >>
> >>>
> >>> Reviewed-by: Kevin Tian <kevin.tian@intel.com>
> >>
> >> Thanks Kevin, for your reviewed-by.
> >>
> >> Yishai
> >>
> >
> > Alex
> >
> > Are we OK here to continue with a PR for the first patch ?
> >
> > It seems that we should be fine here.
> >
> > Thanks,
> > Yishai
> >
>
> Hi Alex,
> Any update here ?
Sure, if Leon wants to do a PR for struct
mlx5_ifc_query_page_track_obj_out_bits, that's fine. The series looks
ok to me. The struct definition is small enough to go through the vfio
tree with Leon's ack, but I'll leave it to you to do the right thing
relative to potential conflicts. Thanks,
Alex
^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [PATCH V1 vfio 0/5] Improve mlx5 driver to better handle some error cases
2024-02-22 18:04 ` Alex Williamson
@ 2024-02-22 18:33 ` Leon Romanovsky
0 siblings, 0 replies; 13+ messages in thread
From: Leon Romanovsky @ 2024-02-22 18:33 UTC (permalink / raw)
To: Alex Williamson
Cc: Yishai Hadas, jgg@nvidia.com, kvm@vger.kernel.org,
joao.m.martins@oracle.com, maorg@nvidia.com, Tian, Kevin
On Thu, Feb 22, 2024 at 11:04:05AM -0700, Alex Williamson wrote:
> On Wed, 21 Feb 2024 09:45:14 +0200
> Yishai Hadas <yishaih@nvidia.com> wrote:
>
> > On 08/02/2024 10:16, Yishai Hadas wrote:
> > > On 06/02/2024 10:06, Yishai Hadas wrote:
> > >> On 06/02/2024 9:35, Tian, Kevin wrote:
> > >>>> From: Yishai Hadas <yishaih@nvidia.com>
> > >>>> Sent: Monday, February 5, 2024 8:48 PM
> > >>>>
> > >>>> This series improves the mlx5 driver to better handle some error cases
> > >>>> as of below.
> > >>>>
> > >>>> The first two patches let the driver recognize whether the firmware
> > >>>> moved the tracker object to an error state. In that case, the driver
> > >>>> will skip/block any usage of that object.
> > >>>>
> > >>>> The next two patches (#3, #4), improve the driver to better include the
> > >>>> proper firmware syndrome in dmesg upon a failure in some firmware
> > >>>> commands.
> > >>>>
> > >>>> The last patch follows the device specification to let the firmware
> > >>>> know
> > >>>> upon leaving PRE_COPY back to RUNNING. (e.g. error in the target,
> > >>>> migration cancellation, etc.).
> > >>>>
> > >>>> This will let the firmware clean its internal resources that were
> > >>>> turned
> > >>>> on upon PRE_COPY.
> > >>>>
> > >>>> Note:
> > >>>> As the first patch should go to net/mlx5, we may need to send it as a
> > >>>> pull request format to vfio before acceptance of the series, to avoid
> > >>>> conflicts.
> > >>>>
> > >>>> Changes from V0: https://lore.kernel.org/kvm/20240130170227.153464-1-
> > >>>> yishaih@nvidia.com/
> > >>>> Patch #2:
> > >>>> - Rename to use 'object changed' in some places to make it clearer.
> > >>>> - Enhance the commit log to better clarify the usage/use case.
> > >>>>
> > >>>> The above was suggested by Tian, Kevin <kevin.tian@intel.com>.
> > >>>>
> > >>>
> > >>> this series looks good to me except a small remark on patch2:
> > >>
> > >> We should be fine there, see my answer on V0.
> > >>
> > >>>
> > >>> Reviewed-by: Kevin Tian <kevin.tian@intel.com>
> > >>
> > >> Thanks Kevin, for your reviewed-by.
> > >>
> > >> Yishai
> > >>
> > >
> > > Alex
> > >
> > > Are we OK here to continue with a PR for the first patch ?
> > >
> > > It seems that we should be fine here.
> > >
> > > Thanks,
> > > Yishai
> > >
> >
> > Hi Alex,
> > Any update here ?
>
> Sure, if Leon wants to do a PR for struct
> mlx5_ifc_query_page_track_obj_out_bits, that's fine. The series looks
> ok to me. The struct definition is small enough to go through the vfio
> tree with Leon's ack, but I'll leave it to you to do the right thing
> relative to potential conflicts. Thanks,
Alex, you are right, there is no need to send a PR for the first patch.
Please take it directly through your tree.
We don't have anything in our shared branch this cycle.
Acked-by: Leon Romanovsky <leon@kernel.org>
Thanks
>
> Alex
>
^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [PATCH V1 vfio 0/5] Improve mlx5 driver to better handle some error cases
2024-02-05 12:48 [PATCH V1 vfio 0/5] Improve mlx5 driver to better handle some error cases Yishai Hadas
` (5 preceding siblings ...)
2024-02-06 7:35 ` [PATCH V1 vfio 0/5] Improve mlx5 driver to better handle some error cases Tian, Kevin
@ 2024-02-22 21:37 ` Alex Williamson
6 siblings, 0 replies; 13+ messages in thread
From: Alex Williamson @ 2024-02-22 21:37 UTC (permalink / raw)
To: Yishai Hadas; +Cc: jgg, kvm, kevin.tian, joao.m.martins, leonro, maorg
On Mon, 5 Feb 2024 14:48:23 +0200
Yishai Hadas <yishaih@nvidia.com> wrote:
> This series improves the mlx5 driver to better handle some error cases
> as of below.
>
> The first two patches let the driver recognize whether the firmware
> moved the tracker object to an error state. In that case, the driver
> will skip/block any usage of that object.
>
> The next two patches (#3, #4), improve the driver to better include the
> proper firmware syndrome in dmesg upon a failure in some firmware
> commands.
>
> The last patch follows the device specification to let the firmware know
> upon leaving PRE_COPY back to RUNNING. (e.g. error in the target,
> migration cancellation, etc.).
>
> This will let the firmware clean its internal resources that were turned
> on upon PRE_COPY.
>
> Note:
> As the first patch should go to net/mlx5, we may need to send it as a
> pull request format to vfio before acceptance of the series, to avoid
> conflicts.
>
> Changes from V0: https://lore.kernel.org/kvm/20240130170227.153464-1-yishaih@nvidia.com/
> Patch #2:
> - Rename to use 'object changed' in some places to make it clearer.
> - Enhance the commit log to better clarify the usage/use case.
>
> The above was suggested by Tian, Kevin <kevin.tian@intel.com>.
>
> Yishai
>
> Yishai Hadas (5):
> net/mlx5: Add the IFC related bits for query tracker
> vfio/mlx5: Add support for tracker object change event
> vfio/mlx5: Handle the EREMOTEIO error upon the SAVE command
> vfio/mlx5: Block incremental query upon migf state error
> vfio/mlx5: Let firmware knows upon leaving PRE_COPY back to RUNNING
>
> drivers/vfio/pci/mlx5/cmd.c | 74 ++++++++++++++++++++++++++++++++---
> drivers/vfio/pci/mlx5/cmd.h | 5 ++-
> drivers/vfio/pci/mlx5/main.c | 39 ++++++++++++++----
> include/linux/mlx5/mlx5_ifc.h | 5 +++
> 4 files changed, 110 insertions(+), 13 deletions(-)
>
Applied to vfio next branch for v6.9. Thanks,
Alex
^ permalink raw reply [flat|nested] 13+ messages in thread