From: Tariq Toukan <tariqt@nvidia.com>
To: Eric Dumazet <edumazet@google.com>,
Jakub Kicinski <kuba@kernel.org>, Paolo Abeni <pabeni@redhat.com>,
Andrew Lunn <andrew+netdev@lunn.ch>,
"David S. Miller" <davem@davemloft.net>
Cc: Saeed Mahameed <saeedm@nvidia.com>,
Leon Romanovsky <leon@kernel.org>,
Tariq Toukan <tariqt@nvidia.com>, Mark Bloch <mbloch@nvidia.com>,
<netdev@vger.kernel.org>, <linux-rdma@vger.kernel.org>,
<linux-kernel@vger.kernel.org>, Gal Pressman <gal@nvidia.com>,
Moshe Shemesh <moshe@nvidia.com>,
Dragos Tatulea <dtatulea@nvidia.com>
Subject: [PATCH net-next 3/3] net/mlx5e: Speed up channel creation by initializing MKEY entries via UMR WQE
Date: Thu, 19 Mar 2026 09:43:38 +0200
Message-ID: <20260319074338.24265-4-tariqt@nvidia.com>
In-Reply-To: <20260319074338.24265-1-tariqt@nvidia.com>
Initializing all UMR MKEY entries as part of the CREATE_MKEY firmware
command is relatively slow. Since this operation is performed per RQ,
the cumulative latency becomes significant with a large number of
queues.
Move the initialization of the entries out of the CREATE_MKEY command
and perform it in the fast path instead, by posting an appropriate UMR
WQE on the ICOSQ.
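
In rough outline, the per-RQ flow becomes (a condensed sketch of the
code in the diff below; error unwinding and struct details omitted):

  /* 1. Create the MKEY via firmware, without passing any translation
   *    entries. The command input is now constant-size and fast.
   */
  err = mlx5e_create_umr_mkey(mdev, num_entries, rq->mpwqe.page_shift,
                              &umr_mkey, rq->mpwqe.umr_mode);

  /* 2. Build a UMR WQE on the ICOSQ whose single data segment points
   *    at a DMA-mapped buffer holding the initial MTT/KSM/KLM entries.
   *    HW executes it once the channel is activated.
   */
  err = mlx5e_rq_umr_mkey_data_init(rq, num_entries);
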
The UMR WQE is prepared and written to the ICOSQ before the channel is
activated. This makes it safe without additional locking, as the
posting cannot race with NOP postings or early NAPI refills.
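
Roughly, the ordering that guarantees this is (a simplified view,
assuming the usual mlx5e channel activation flow):

  mlx5e_create_rq_umr_mkey()        /* channel not active yet */
    mlx5e_rq_umr_mkey_data_init()   /* writes the WQE, advances sq->pc */
  ...
  channel activation                /* ICOSQ enabled; NOP postings and
                                     * NAPI now see a consistent sq->pc */
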
Performance results:
Setup: 248 channels, MTU 9000, RX/TX ring size 8K.
Interface up:
Before: 5.618 secs
After: 3.537 secs (2.081 secs faster)
Saves ~8.4 msec per channel.
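(Derivation: (5.618 - 3.537) secs / 248 channels = 2.081 / 248 ~= 8.39
msec per channel.)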
Signed-off-by: Tariq Toukan <tariqt@nvidia.com>
Reviewed-by: Dragos Tatulea <dtatulea@nvidia.com>
---
drivers/net/ethernet/mellanox/mlx5/core/en.h | 6 +
.../net/ethernet/mellanox/mlx5/core/en/txrx.h | 1 +
.../net/ethernet/mellanox/mlx5/core/en_main.c | 237 +++++++++++++-----
.../net/ethernet/mellanox/mlx5/core/en_rx.c | 1 +
4 files changed, 185 insertions(+), 60 deletions(-)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h
index 6c773a75b514..1a6c86b5919a 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
@@ -247,6 +247,7 @@ struct mlx5e_umr_wqe {
DECLARE_FLEX_ARRAY(struct mlx5_mtt, inline_mtts);
DECLARE_FLEX_ARRAY(struct mlx5_klm, inline_klms);
DECLARE_FLEX_ARRAY(struct mlx5_ksm, inline_ksms);
+ DECLARE_FLEX_ARRAY(struct mlx5_wqe_data_seg, dseg);
};
};
static_assert(offsetof(struct mlx5e_umr_wqe, inline_mtts) == sizeof(struct mlx5e_umr_wqe_hdr),
@@ -747,6 +748,11 @@ struct mlx5e_rq {
struct mlx5e_dma_info wqe_overflow;
struct {
__be32 umr_mkey_be;
+ struct {
+ void *p_unaligned;
+ int sz;
+ dma_addr_t addr;
+ } init_data;
} mpwqe_sp;
/* XDP read-mostly */
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/txrx.h b/drivers/net/ethernet/mellanox/mlx5/core/en/txrx.h
index f2a8453d8dce..948d22f508b0 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/txrx.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/txrx.h
@@ -64,6 +64,7 @@ ktime_t mlx5e_cqe_ts_to_ns(cqe_ts_to_ns func, struct mlx5_clock *clock, u64 cqe_
enum mlx5e_icosq_wqe_type {
MLX5E_ICOSQ_WQE_NOP,
+ MLX5E_ICOSQ_WQE_UMR_RX_INIT,
MLX5E_ICOSQ_WQE_UMR_RX,
#ifdef CONFIG_MLX5_EN_TLS
MLX5E_ICOSQ_WQE_UMR_TLS,
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index 5a31c79cec06..eaed05865042 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -348,7 +348,6 @@ static inline void mlx5e_build_umr_wqe(struct mlx5e_rq *rq,
ucseg->flags = MLX5_UMR_TRANSLATION_OFFSET_EN | MLX5_UMR_INLINE;
octowords = mlx5e_mpwrq_umr_octowords(rq->mpwqe.pages_per_wqe, rq->mpwqe.umr_mode);
ucseg->xlt_octowords = cpu_to_be16(octowords);
- ucseg->mkey_mask = cpu_to_be64(MLX5_MKEY_MASK_FREE);
}
static int mlx5e_rq_alloc_mpwqe_info(struct mlx5e_rq *rq, int node)
@@ -397,57 +396,61 @@ static u8 mlx5e_mpwrq_access_mode(enum mlx5e_mpwrq_umr_mode umr_mode)
return 0;
}
-static int mlx5e_create_umr_mkey(struct mlx5_core_dev *mdev,
- u32 npages, u8 page_shift, u32 *umr_mkey,
- dma_addr_t filler_addr,
- enum mlx5e_mpwrq_umr_mode umr_mode,
- u32 xsk_chunk_size)
+static void mlx5e_rq_umr_mkey_data_free(struct mlx5e_rq *rq)
{
- struct mlx5_mtt *mtt;
- struct mlx5_ksm *ksm;
- struct mlx5_klm *klm;
- u32 octwords;
- int inlen;
- void *mkc;
- u32 *in;
- int err;
- int i;
+ if (!rq->mpwqe_sp.init_data.p_unaligned)
+ return;
- if ((umr_mode == MLX5E_MPWRQ_UMR_MODE_UNALIGNED ||
- umr_mode == MLX5E_MPWRQ_UMR_MODE_TRIPLE) &&
- !MLX5_CAP_GEN(mdev, fixed_buffer_size)) {
- mlx5_core_warn(mdev, "Unaligned AF_XDP requires fixed_buffer_size capability\n");
- return -EINVAL;
- }
+ dma_unmap_single(rq->pdev, rq->mpwqe_sp.init_data.addr,
+ rq->mpwqe_sp.init_data.sz, DMA_TO_DEVICE);
+ kfree(rq->mpwqe_sp.init_data.p_unaligned);
+ rq->mpwqe_sp.init_data.p_unaligned = NULL;
+}
- octwords = mlx5e_mpwrq_umr_octowords(npages, umr_mode);
+static int mlx5e_rq_umr_mkey_data_alloc(struct mlx5e_rq *rq, u32 npages,
+ struct mlx5_wqe_data_seg *dseg)
+{
+ dma_addr_t data_addr;
+ int data_sz;
+ void *data;
- inlen = MLX5_FLEXIBLE_INLEN(mdev, MLX5_ST_SZ_BYTES(create_mkey_in),
- MLX5_OCTWORD, octwords);
- if (inlen < 0)
- return inlen;
+ data_sz = mlx5e_mpwrq_umr_octowords(npages, rq->mpwqe.umr_mode) *
+ MLX5_OCTWORD;
+ rq->mpwqe_sp.init_data.p_unaligned =
+ kzalloc(data_sz + MLX5_UMR_ALIGN - 1, GFP_KERNEL);
+ if (!rq->mpwqe_sp.init_data.p_unaligned)
+ return -ENOMEM;
- in = kvzalloc(inlen, GFP_KERNEL);
- if (!in)
+ data = PTR_ALIGN(rq->mpwqe_sp.init_data.p_unaligned, MLX5_UMR_ALIGN);
+ data_addr = dma_map_single(rq->pdev, data, data_sz, DMA_TO_DEVICE);
+ if (dma_mapping_error(rq->pdev, data_addr)) {
+ kfree(rq->mpwqe_sp.init_data.p_unaligned);
+ rq->mpwqe_sp.init_data.p_unaligned = NULL;
return -ENOMEM;
+ }
- mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
+ rq->mpwqe_sp.init_data.sz = data_sz;
+ rq->mpwqe_sp.init_data.addr = data_addr;
- MLX5_SET(mkc, mkc, free, 1);
- MLX5_SET(mkc, mkc, umr_en, 1);
- MLX5_SET(mkc, mkc, lw, 1);
- MLX5_SET(mkc, mkc, lr, 1);
- MLX5_SET(mkc, mkc, access_mode_1_0, mlx5e_mpwrq_access_mode(umr_mode));
- mlx5e_mkey_set_relaxed_ordering(mdev, mkc);
- MLX5_SET(mkc, mkc, qpn, 0xffffff);
- MLX5_SET(mkc, mkc, pd, mdev->mlx5e_res.hw_objs.pdn);
- MLX5_SET64(mkc, mkc, len, npages << page_shift);
- MLX5_SET(mkc, mkc, translations_octword_size, octwords);
- if (umr_mode == MLX5E_MPWRQ_UMR_MODE_TRIPLE)
- MLX5_SET(mkc, mkc, log_page_size, page_shift - 2);
- else if (umr_mode != MLX5E_MPWRQ_UMR_MODE_OVERSIZED)
- MLX5_SET(mkc, mkc, log_page_size, page_shift);
- MLX5_SET(create_mkey_in, in, translations_octword_actual_size, octwords);
+ dseg->addr = cpu_to_be64(data_addr);
+ dseg->byte_count = cpu_to_be32(data_sz);
+ dseg->lkey = rq->mkey_be;
+
+ return 0;
+}
+
+static void mlx5e_rq_umr_mkey_data_fill(struct mlx5e_rq *rq, u32 npages)
+{
+ struct mlx5_core_dev *mdev = rq->mdev;
+ u32 xsk_chunk_size, xsk_rem;
+ dma_addr_t filler_addr;
+ struct mlx5_mtt *mtt;
+ struct mlx5_ksm *ksm;
+ struct mlx5_klm *klm;
+ __be32 mkey_be;
+ void *data;
+ u8 pad;
+ int i;
/* Initialize the mkey with all MTTs pointing to a default
* page (filler_addr). When the channels are activated, UMR
* WQEs will redirect the RX WQEs to the actual memory from
@@ -455,48 +458,152 @@ static int mlx5e_create_umr_mkey(struct mlx5_core_dev *mdev,
* the RQ's pool, while the gaps (wqe_overflow) remain mapped
* to the default page.
*/
- switch (umr_mode) {
+ filler_addr = rq->wqe_overflow.addr;
+
+ mkey_be = cpu_to_be32(mdev->mlx5e_res.hw_objs.mkey);
+ data = PTR_ALIGN(rq->mpwqe_sp.init_data.p_unaligned, MLX5_UMR_ALIGN);
+
+ switch (rq->mpwqe.umr_mode) {
case MLX5E_MPWRQ_UMR_MODE_OVERSIZED:
- klm = MLX5_ADDR_OF(create_mkey_in, in, klm_pas_mtt);
+ /* Must have xsk_pool != NULL at this point */
+ xsk_chunk_size = rq->xsk_pool->chunk_size;
+ xsk_rem = (1 << rq->mpwqe.page_shift) - xsk_chunk_size;
+ klm = data;
for (i = 0; i < npages; i++) {
klm[i << 1] = (struct mlx5_klm) {
.va = cpu_to_be64(filler_addr),
.bcount = cpu_to_be32(xsk_chunk_size),
- .key = cpu_to_be32(mdev->mlx5e_res.hw_objs.mkey),
+ .key = mkey_be,
};
klm[(i << 1) + 1] = (struct mlx5_klm) {
.va = cpu_to_be64(filler_addr),
- .bcount = cpu_to_be32((1 << page_shift) - xsk_chunk_size),
- .key = cpu_to_be32(mdev->mlx5e_res.hw_objs.mkey),
+ .bcount = cpu_to_be32(xsk_rem),
+ .key = mkey_be,
};
}
break;
case MLX5E_MPWRQ_UMR_MODE_UNALIGNED:
- ksm = MLX5_ADDR_OF(create_mkey_in, in, klm_pas_mtt);
+ ksm = data;
for (i = 0; i < npages; i++)
ksm[i] = (struct mlx5_ksm) {
- .key = cpu_to_be32(mdev->mlx5e_res.hw_objs.mkey),
+ .key = mkey_be,
.va = cpu_to_be64(filler_addr),
};
break;
case MLX5E_MPWRQ_UMR_MODE_ALIGNED:
- mtt = MLX5_ADDR_OF(create_mkey_in, in, klm_pas_mtt);
+ mtt = data;
for (i = 0; i < npages; i++)
mtt[i] = (struct mlx5_mtt) {
.ptag = cpu_to_be64(filler_addr),
};
break;
case MLX5E_MPWRQ_UMR_MODE_TRIPLE:
- ksm = MLX5_ADDR_OF(create_mkey_in, in, klm_pas_mtt);
- for (i = 0; i < npages * 4; i++) {
+ ksm = data;
+ for (i = 0; i < npages * 4; i++)
ksm[i] = (struct mlx5_ksm) {
- .key = cpu_to_be32(mdev->mlx5e_res.hw_objs.mkey),
+ .key = mkey_be,
.va = cpu_to_be64(filler_addr),
};
- }
break;
}
+ /* Pad is not expected, as we init the whole MKEY here */
+ pad = mlx5e_mpwrq_umr_entries_pad(npages, rq->mpwqe.umr_mode);
+ WARN_ONCE(pad, "MPWRQ pad is not expected! UMR mode %u npages %d pad %u\n",
+ rq->mpwqe.umr_mode, npages, pad);
+}
+
+static int mlx5e_rq_umr_mkey_data_init(struct mlx5e_rq *rq, u32 npages)
+{
+ struct mlx5_wqe_ctrl_seg *cseg;
+ struct mlx5_wqe_umr_ctrl_seg *ucseg;
+ struct mlx5e_icosq *sq = rq->icosq;
+ struct mlx5e_umr_wqe *umr_wqe;
+ u16 pi, num_wqebbs, octowords;
+ u8 ds_cnt;
+ int err;
+
+ /* + 1 for the data segment */
+ ds_cnt = 1 + DIV_ROUND_UP(offsetof(struct mlx5e_umr_wqe, dseg),
+ MLX5_SEND_WQE_DS);
+ num_wqebbs = DIV_ROUND_UP(ds_cnt, MLX5_SEND_WQEBB_NUM_DS);
+ pi = mlx5e_icosq_get_next_pi(sq, num_wqebbs);
+ umr_wqe = mlx5_wq_cyc_get_wqe(&sq->wq, pi);
+ memset(umr_wqe, 0, num_wqebbs * MLX5_SEND_WQE_BB);
+
+ cseg = &umr_wqe->hdr.ctrl;
+ ucseg = &umr_wqe->hdr.uctrl;
+
+ cseg->opmod_idx_opcode =
+ cpu_to_be32((sq->pc << MLX5_WQE_CTRL_WQE_INDEX_SHIFT) |
+ MLX5_OPCODE_UMR);
+ cseg->qpn_ds = cpu_to_be32((sq->sqn << MLX5_WQE_CTRL_QPN_SHIFT) |
+ ds_cnt);
+ cseg->umr_mkey = rq->mpwqe_sp.umr_mkey_be;
+
+ octowords = mlx5e_mpwrq_umr_octowords(npages, rq->mpwqe.umr_mode);
+ ucseg->xlt_octowords = cpu_to_be16(octowords);
+ ucseg->mkey_mask = cpu_to_be64(MLX5_MKEY_MASK_FREE);
+
+ err = mlx5e_rq_umr_mkey_data_alloc(rq, npages, umr_wqe->dseg);
+ if (err)
+ return err;
+
+ mlx5e_rq_umr_mkey_data_fill(rq, npages);
+
+ sq->db.wqe_info[pi] = (struct mlx5e_icosq_wqe_info) {
+ .wqe_type = MLX5E_ICOSQ_WQE_UMR_RX_INIT,
+ .num_wqebbs = num_wqebbs,
+ .umr.rq = rq,
+ };
+
+ sq->pc += num_wqebbs;
+
+ sq->doorbell_cseg = cseg;
+
+ return 0;
+}
+
+static int mlx5e_create_umr_mkey(struct mlx5_core_dev *mdev,
+ u32 npages, u8 page_shift, u32 *umr_mkey,
+ enum mlx5e_mpwrq_umr_mode umr_mode)
+{
+ int inlen;
+ void *mkc;
+ u32 *in;
+ int err;
+
+ if ((umr_mode == MLX5E_MPWRQ_UMR_MODE_UNALIGNED ||
+ umr_mode == MLX5E_MPWRQ_UMR_MODE_TRIPLE) &&
+ !MLX5_CAP_GEN(mdev, fixed_buffer_size)) {
+ mlx5_core_warn(mdev, "Unaligned AF_XDP requires fixed_buffer_size capability\n");
+ return -EINVAL;
+ }
+
+ inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
+ in = kvzalloc(inlen, GFP_KERNEL);
+ if (!in)
+ return -ENOMEM;
+
+ mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
+
+ MLX5_SET(mkc, mkc, free, 1);
+ MLX5_SET(mkc, mkc, umr_en, 1);
+ MLX5_SET(mkc, mkc, lw, 1);
+ MLX5_SET(mkc, mkc, lr, 1);
+ MLX5_SET(mkc, mkc, access_mode_1_0, mlx5e_mpwrq_access_mode(umr_mode));
+ mlx5e_mkey_set_relaxed_ordering(mdev, mkc);
+ MLX5_SET(mkc, mkc, qpn, 0xffffff);
+ MLX5_SET(mkc, mkc, pd, mdev->mlx5e_res.hw_objs.pdn);
+ MLX5_SET64(mkc, mkc, len, npages << page_shift);
+ MLX5_SET(mkc, mkc, translations_octword_size,
+ mlx5e_mpwrq_umr_octowords(npages, umr_mode));
+ if (umr_mode == MLX5E_MPWRQ_UMR_MODE_TRIPLE)
+ MLX5_SET(mkc, mkc, log_page_size, page_shift - 2);
+ else if (umr_mode != MLX5E_MPWRQ_UMR_MODE_OVERSIZED)
+ MLX5_SET(mkc, mkc, log_page_size, page_shift);
+
err = mlx5_core_create_mkey(mdev, umr_mkey, in, inlen);
kvfree(in);
@@ -505,7 +612,6 @@ static int mlx5e_create_umr_mkey(struct mlx5_core_dev *mdev,
static int mlx5e_create_rq_umr_mkey(struct mlx5_core_dev *mdev, struct mlx5e_rq *rq)
{
- u32 xsk_chunk_size = rq->xsk_pool ? rq->xsk_pool->chunk_size : 0;
u32 wq_size = mlx5_wq_ll_get_size(&rq->mpwqe.wq);
u32 num_entries, max_num_entries;
u32 umr_mkey;
@@ -522,9 +628,16 @@ static int mlx5e_create_rq_umr_mkey(struct mlx5_core_dev *mdev, struct mlx5e_rq
max_num_entries);
err = mlx5e_create_umr_mkey(mdev, num_entries, rq->mpwqe.page_shift,
- &umr_mkey, rq->wqe_overflow.addr,
- rq->mpwqe.umr_mode, xsk_chunk_size);
+ &umr_mkey, rq->mpwqe.umr_mode);
+ if (err)
+ return err;
+
rq->mpwqe_sp.umr_mkey_be = cpu_to_be32(umr_mkey);
+
+ err = mlx5e_rq_umr_mkey_data_init(rq, num_entries);
+ if (err)
+ mlx5_core_destroy_mkey(mdev, umr_mkey);
+
return err;
}
@@ -1097,6 +1210,7 @@ static void mlx5e_free_rq(struct mlx5e_rq *rq)
case MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ:
mlx5e_rq_free_shampo(rq);
kvfree(rq->mpwqe.info);
+ mlx5e_rq_umr_mkey_data_free(rq);
mlx5_core_destroy_mkey(rq->mdev,
be32_to_cpu(rq->mpwqe_sp.umr_mkey_be));
mlx5e_free_mpwqe_rq_drop_page(rq);
@@ -1275,8 +1389,11 @@ int mlx5e_wait_for_min_rx_wqes(struct mlx5e_rq *rq, int wait_time)
u16 min_wqes = mlx5_min_rx_wqes(rq->wq_type, mlx5e_rqwq_get_size(rq));
do {
- if (mlx5e_rqwq_get_cur_sz(rq) >= min_wqes)
+ if (mlx5e_rqwq_get_cur_sz(rq) >= min_wqes) {
+ /* The init data is no longer in use; free it early */
+ mlx5e_rq_umr_mkey_data_free(rq);
return 0;
+ }
msleep(20);
} while (time_before(jiffies, exp_time));
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
index 580bb51ad7ef..5edaa416cedd 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
@@ -837,6 +837,7 @@ int mlx5e_poll_ico_cq(struct mlx5e_cq *cq)
wi->umr.rq->mpwqe.umr_completed++;
break;
case MLX5E_ICOSQ_WQE_NOP:
+ case MLX5E_ICOSQ_WQE_UMR_RX_INIT:
break;
#ifdef CONFIG_MLX5_EN_TLS
case MLX5E_ICOSQ_WQE_UMR_TLS:
--
2.44.0