public inbox for igt-dev@lists.freedesktop.org
 help / color / mirror / Atom feed
From: "Thomas Hellström" <thomas.hellstrom@linux.intel.com>
To: Sobin Thomas <sobin.thomas@intel.com>, igt-dev@lists.freedesktop.org
Cc: nishit.sharma@intel.com
Subject: Re: [PATCH i-g-t 1/1] tests/xe_vm: Add oversubscribe concurrent bind stress test
Date: Mon, 23 Mar 2026 18:37:04 +0100	[thread overview]
Message-ID: <5e48237f792717be2532bfe5a86b134bccc069bb.camel@linux.intel.com> (raw)
In-Reply-To: <20260218164417.856114-2-sobin.thomas@intel.com>

On Wed, 2026-02-18 at 16:44 +0000, Sobin Thomas wrote:
> Add an xe_vm subtest that oversubscribes VRAM and issues
> concurrent binds into a single VM (scratch-page mode) to
> reproduce the dma-resv/bind race found under memory pressure.
> Prior coverage lacked any case that combined multi-process bind
> pressure with VRAM oversubscription, so bind/submit could
> panic (NULL deref in xe_pt_stage_bind) instead of failing cleanly.
> The new test expects successful completion or ENOMEM/EDEADLK.
> 
> Signed-off-by: Sobin Thomas <sobin.thomas@intel.com>
> ---
>  tests/intel/xe_vm.c | 421
> ++++++++++++++++++++++++++++++++++++++++++++
>  1 file changed, 421 insertions(+)
> 
> diff --git a/tests/intel/xe_vm.c b/tests/intel/xe_vm.c
> index ccff8f804..5c9d5ff0f 100644
> --- a/tests/intel/xe_vm.c
> +++ b/tests/intel/xe_vm.c
> @@ -21,6 +21,176 @@
>  #include "xe/xe_spin.h"
>  #include <string.h>
>  
> +#define MI_BB_END		(0 << 29 | 0x0A << 23 |  0)
> +#define MI_LOAD_REG_MEM		(0 << 29 | 0x29 << 23 | 0 <<
> 22 | 0 << 21 | 1 << 19 | 2)
> +#define MI_STORE_REG_MEM	(0 << 29 | 0x24 << 23 | 0 << 22 | 0
> << 21 | 1 << 19 | 2)
> +#define MI_MATH_R(length)		(0 << 29 | 0x1A << 23 |
> ((length) & 0xFF))
> +#define GPR_RX_ADDR(x)		(0x600 + (x) * 8)
> +#define ALU_LOAD(dst, src)	(0x080 << 20 | ((dst) << 10) |
> (src))
> +#define ALU_STORE(dst, src)	(0x180 << 20 | (dst) << 10 | (src))
> +#define ALU_ADD			(0x100 << 20)
> +#define ALU_RX(x)		(x)
> +#define ALU_SRCA		0x20
> +#define ALU_SRCB		0x21
> +#define ALU_ACCU		0x31
> +#define GB(x) (1024ULL * 1024ULL * 1024ULL * (x))

Why are you open-coding these in the test instead of relying on
intel_gpu_commands.h?


> +
> +struct gem_bo {
> +	uint32_t handle;
> +	uint64_t size;
> +	int *ptr;
> +	uint64_t addr;
> +};
> +
> +struct xe_test_ctx {
> +	int fd;
> +	uint32_t vm_id;
> +
> +	uint32_t exec_queue_id;
> +
> +	uint16_t sram_instance;
> +	uint16_t vram_instance;
> +	bool has_vram;
> +};
> +
> +static uint64_t align_to_page_size(uint64_t size)
> +{
> +	return (size + 4095UL) & ~4095UL;
> +}
> +
> +static void create_exec_queue(int fd, struct xe_test_ctx *ctx)
> +{
> +	struct drm_xe_engine_class_instance *hwe;
> +	struct drm_xe_engine_class_instance eci = {
> +		.engine_class = DRM_XE_ENGINE_CLASS_RENDER,
> +	};
> +
> +	/* Find first render engine */
> +	xe_for_each_engine(fd, hwe) {
> +		if (hwe->engine_class == DRM_XE_ENGINE_CLASS_RENDER)
> {
> +			eci = *hwe;
> +			break;
> +		}
> +	}
> +	ctx->exec_queue_id = xe_exec_queue_create(fd, ctx->vm_id,
> &eci, 0);
> +}
> +
> +static void vm_bind_gem_bo(int fd, struct xe_test_ctx *ctx, uint32_t
> handle, uint64_t addr, uint64_t size)
> +{
> +	int rc;
> +	uint64_t timeline_val = 1;
> +	uint32_t syncobj_handle = syncobj_create(fd, 0);
> +
> +	struct drm_xe_sync bind_sync = {
> +		.extensions = 0,
> +		.type = DRM_XE_SYNC_TYPE_TIMELINE_SYNCOBJ,
> +		.flags = DRM_XE_SYNC_FLAG_SIGNAL,
> +		.handle = syncobj_handle,
> +		.timeline_value = timeline_val,
> +	};
> +	struct drm_xe_vm_bind vm_bind = {
> +		.extensions = 0,
> +		.vm_id = ctx->vm_id,
> +		.exec_queue_id = 0,
> +		.num_binds = 1,
> +		.bind = {
> +			.obj = handle,
> +			.obj_offset = 0,
> +			.range = size,
> +			.addr = addr,
> +			.op = DRM_XE_VM_BIND_OP_MAP,
> +			.flags = 0,
> +		},
> +		.num_syncs = 1,
> +		.syncs = (uintptr_t)&bind_sync,
> +	};
> +	rc = igt_ioctl(fd, DRM_IOCTL_XE_VM_BIND, &vm_bind);
> +
> +	igt_info("Bind returned %d\n", rc);
> +	igt_assert(rc == 0);
> +
> +	/* The right way to do this in the real world is to not wait
> for the
> +	 * syncobj here - since it just makes everything synchronous
> -, but
> +	 * instead pass the syncobj as a 'wait'-type object to thie
> execbuf
> +	 * ioctl. We do it here just to make the example simpler.
> +	 */
> +	//wait_syncobj(fd,syncobj_handle, timeline_val);
> +	igt_assert(syncobj_timeline_wait(fd, &syncobj_handle,
> &timeline_val,
> +					 1, INT64_MAX, 0, NULL));
> +
> +	syncobj_destroy(fd, syncobj_handle);
> +}

Why not use xe_vm_bind_sync(), or even better, xe_vm_bind_lr_sync()? That
way you can make a variation of the test with LR-mode VMs.


> +
> +static uint32_t
> +vm_bind_gem_bos(int fd, struct xe_test_ctx *ctx, struct gem_bo *bos,
> int size)
> +{
> +	int rc;
> +	uint32_t syncobj_handle = syncobj_create(fd, 0);
> +	uint64_t timeline_val = 1;
> +	struct drm_xe_sync bind_sync = {
> +		.extensions = 0,
> +		.type = DRM_XE_SYNC_TYPE_TIMELINE_SYNCOBJ,
> +		.flags = DRM_XE_SYNC_FLAG_SIGNAL,
> +		.handle = syncobj_handle,
> +		.timeline_value = timeline_val,
> +	};

Use a user-fence so that it can be reused in LR-mode?

> +	struct drm_xe_vm_bind_op binds[size];
> +	struct drm_xe_vm_bind vm_bind = {
> +		.extensions = 0,
> +		.vm_id = ctx->vm_id,
> +		.exec_queue_id = 0,
> +		.num_binds = size,
> +		.vector_of_binds = (uintptr_t)binds,
> +		.num_syncs = 1,
> +		.syncs = (uintptr_t)&bind_sync,
> +	};
> +
> +	/* Need to call the ioctl differently when size is 1. */
> +	igt_assert(size != 1);
> +
> +	for (int i = 0; i < size; i++) {
> +		binds[i] = (struct drm_xe_vm_bind_op) {
> +			.extensions = 0,
> +			.obj = bos[i].handle,
> +			.pat_index = 0,
> +			.pad = 0,
> +			.obj_offset = 0,
> +			.range = bos[i].size,
> +			.addr = bos[i].addr,
> +			.op = DRM_XE_VM_BIND_OP_MAP,
> +			.flags = 0,
> +			.prefetch_mem_region_instance = 0,
> +			.pad2 = 0,
> +		};
> +	}
> +	rc = igt_ioctl(fd, DRM_IOCTL_XE_VM_BIND, &vm_bind);

Use xe_vm_bind_array()


> +	igt_assert(rc == 0);
> +
> +	return syncobj_handle;




> +}
> +
> +static void query_mem_info(int fd, struct xe_test_ctx *ctx)
> +{
> +	uint64_t vram_reg, sys_reg;
> +	struct drm_xe_mem_region *region;
> +
> +	ctx->has_vram = xe_has_vram(fd);
> +	if (ctx->has_vram) {
> +		/* Get VRAM instance - vram_memory returns a
> bitmask,
> +		 * so we extract the instance from it
> +		 */
> +		vram_reg = vram_memory(fd, 0);
> +		region = xe_mem_region(fd, vram_reg);
> +		ctx->vram_instance = region->instance;
> +	}
> +
> +	/* Get SRAM instance */
> +	sys_reg = system_memory(fd);
> +	region = xe_mem_region(fd, sys_reg);
> +	ctx->sram_instance = region->instance;
> +	igt_debug("has_vram: %d\n", ctx->has_vram);
> +}

Where is the information obtained by the above function used?

> +
>  static uint32_t
>  addr_low(uint64_t addr)
>  {
> @@ -2450,6 +2620,252 @@ static void test_oom(int fd)
>  	}
>  }
>  
> +/**
> + * SUBTEST: oversubscribe-concurrent-bind
> + * Description: Test for oversubscribing the VM with multiple
> processes
> + * doing binds at the same time, and ensure they all complete
> successfully.
> + * Functionality: This check is for a specific bug where if multiple
> processes
> + * oversubscribe the VM, some of the binds may fail with  ENOMEM due
> to
> + * deadlock in the bind code.
> + * Test category: stress test
> + */
> +static void test_vm_oversubscribe_concurrent_bind(int fd, int
> n_vram_bufs,
> +						  int n_sram_bufs,
> int n_proc)
> +{
> +	igt_fork(child, n_proc) {
> +		struct xe_test_ctx ctx = {0};
> +		int rc;
> +		uint64_t addr = GB(1);
> +		struct timespec start, end;
> +		uint32_t vram_binds_syncobj, sram_binds_syncobj;
> +		struct gem_bo vram_bufs[n_vram_bufs];
> +		struct gem_bo sram_bufs[n_sram_bufs];
> +		int expected_result = 0;
> +		int ints_to_add = 4;
> +		int gpu_result;
> +		int retries;
> +		int max_retries = 1024;
> +		uint32_t batch_syncobj;
> +		/* integers_bo contains the integers we're going to
> add. */
> +		struct gem_bo integers_bo, result_bo, batch_bo;
> +		uint64_t tmp_addr;
> +		struct drm_xe_sync batch_syncs[3];
> +		int n_batch_syncs = 0;
> +		int pos = 0;
> +		uint64_t timeline_val = 1;
> +		struct drm_xe_exec exec;
> +
> +		rc = clock_gettime(CLOCK_MONOTONIC, &start);
> +		igt_assert(rc == 0);
> +		ctx.vm_id = xe_vm_create(fd,
> DRM_XE_VM_CREATE_FLAG_SCRATCH_PAGE, 0);
> +		query_mem_info(fd, &ctx);
> +		create_exec_queue(fd, &ctx);
> +		for (int i = 0; i < n_vram_bufs; i++) {
> +			struct gem_bo *bo = &vram_bufs[i];
> +
> +			bo->size = GB(1);
> +			bo->handle = xe_bo_create_caching(fd,
> ctx.vm_id, vram_bufs[i].size,
> +							 
> vram_memory(fd, 0), 0,
> +							 
> DRM_XE_GEM_CPU_CACHING_WC);
> +			bo->ptr = NULL;
> +			bo->addr = addr;
> +			addr += bo->size;
> +			igt_info("vram buffer %d created at
> 0x%016lx\n",
> +				 i, bo->addr);
> +		}
> +		for (int i = 0; i < n_sram_bufs; i++) {
> +			struct gem_bo *bo = &sram_bufs[i];
> +
> +			bo->size = GB(1);
> +			bo->handle = xe_bo_create_caching(fd,
> ctx.vm_id, sram_bufs[i].size,
> +							 
> system_memory(fd), 0,
> +							 
> DRM_XE_GEM_CPU_CACHING_WC);
> +			bo->ptr = NULL;
> +			bo->addr = addr;
> +			addr += bo->size;
> +			igt_info("sram buffer %d created at
> 0x%016lx\n",
> +				 i, bo->addr);

Isn't igt_debug a better choice here and in the rest of the function?
Typically when the tests are run, people are mostly interested in
whether they fail or pass, and if they have an additional interest
beyond that, they can enable debugging.


> +		}
> +		igt_info("\n Binding the buffers to the vm");
> +
> +		if (n_vram_bufs) {
> +			igt_info("binding vram buffers");
> +			vram_binds_syncobj = vm_bind_gem_bos(fd,
> &ctx, vram_bufs, n_vram_bufs);
> +		}
> +		if (n_sram_bufs) {
> +			igt_info("binding sram buffers");
> +			sram_binds_syncobj = vm_bind_gem_bos(fd,
> &ctx, sram_bufs, n_sram_bufs);
> +		}
> +		integers_bo.size = align_to_page_size(sizeof(int) *
> ints_to_add);
> +		integers_bo.handle = xe_bo_create_caching(fd,
> ctx.vm_id, integers_bo.size,
> +							 
> system_memory(fd), 0,
> +							 
> DRM_XE_GEM_CPU_CACHING_WC);
> +		integers_bo.ptr = (int *)xe_bo_map(fd,
> integers_bo.handle, integers_bo.size);
> +
> +		integers_bo.addr = 0x100000;
> +
> +		for (int i = 0; i < ints_to_add; i++) {
> +			int random_int = rand() % 8;
> +
> +			integers_bo.ptr[i] = random_int;
> +			expected_result += random_int;
> +
> +			igt_info("%d", random_int);
> +			if (i + 1 != ints_to_add)
> +				igt_info(" + ");
> +			else
> +				igt_info(" = ");
> +		}
> +		igt_assert_eq(munmap(integers_bo.ptr,
> integers_bo.size), 0);
> +		integers_bo.ptr = NULL;
> +
> +		igt_info("Creating the result buffer object");
> +
> +		result_bo.size = align_to_page_size(sizeof(int));
> +		result_bo.handle  = xe_bo_create_caching(fd,
> ctx.vm_id, result_bo.size,
> +							
> system_memory(fd), 0,
> +							
> DRM_XE_GEM_CPU_CACHING_WC);
> +		result_bo.ptr = NULL;
> +		result_bo.addr = 0x200000;
> +		/* batch_bo contains the commands the GPU will run.
> */
> +
> +		igt_info("Creating the batch buffer object");
> +		batch_bo.size = 4096;
> +		//batch_bo.handle = create_gem_bo_sram(fd,
> batch_bo.size);
> +		batch_bo.handle = xe_bo_create_caching(fd,
> ctx.vm_id, batch_bo.size,
> +						      
> system_memory(fd), 0,
> +						      
> DRM_XE_GEM_CPU_CACHING_WC);
> +
> +		batch_bo.ptr = (int *)xe_bo_map(fd, batch_bo.handle,
> batch_bo.size);
> +		batch_bo.addr = 0x300000;
> +
> +		/* r0 = integers_bo[0] */
> +		batch_bo.ptr[pos++] = MI_LOAD_REG_MEM;
> +		batch_bo.ptr[pos++] = GPR_RX_ADDR(0);
> +		tmp_addr = integers_bo.addr + 0 * sizeof(uint32_t);
> +		batch_bo.ptr[pos++] = tmp_addr & 0xFFFFFFFF;
> +		batch_bo.ptr[pos++] = (tmp_addr >> 32) & 0xFFFFFFFF;
> +		for (int i = 1; i < ints_to_add; i++) {
> +			/* r1 = integers_bo[i] */
> +			batch_bo.ptr[pos++] = MI_LOAD_REG_MEM;
> +			batch_bo.ptr[pos++] = GPR_RX_ADDR(1);
> +			tmp_addr = integers_bo.addr + i *
> sizeof(uint32_t);
> +			batch_bo.ptr[pos++] = tmp_addr & 0xFFFFFFFF;
> +			batch_bo.ptr[pos++] = (tmp_addr >> 32) &
> 0xFFFFFFFF;
> +			/* r0 = r0 + r1 */
> +			batch_bo.ptr[pos++] = MI_MATH_R(3);
> +			batch_bo.ptr[pos++] = ALU_LOAD(ALU_SRCA,
> ALU_RX(0));
> +			batch_bo.ptr[pos++] = ALU_LOAD(ALU_SRCB,
> ALU_RX(1));
> +			batch_bo.ptr[pos++] = ALU_ADD;
> +			batch_bo.ptr[pos++] = ALU_STORE(ALU_RX(0),
> ALU_ACCU);
> +		}
> +		/* result_bo[0] = r0 */
> +		batch_bo.ptr[pos++] = MI_STORE_REG_MEM;
> +		batch_bo.ptr[pos++] = GPR_RX_ADDR(0);
> +		tmp_addr = result_bo.addr + 0 * sizeof(uint32_t);
> +		batch_bo.ptr[pos++] = tmp_addr & 0xFFFFFFFF;
> +		batch_bo.ptr[pos++] = (tmp_addr >> 32) & 0xFFFFFFFF;
> +
> +		batch_bo.ptr[pos++] = MI_BB_END;
> +		while (pos % 4 != 0)
> +			batch_bo.ptr[pos++] = MI_NOOP;
> +
> +		igt_assert(pos * sizeof(int) <= batch_bo.size);
> +
> +		vm_bind_gem_bo(fd, &ctx, integers_bo.handle,
> integers_bo.addr, integers_bo.size);
> +		vm_bind_gem_bo(fd, &ctx, result_bo.handle,
> result_bo.addr, result_bo.size);
> +		vm_bind_gem_bo(fd, &ctx, batch_bo.handle,
> batch_bo.addr, batch_bo.size);
> +
> +		/* Now we do the actual batch submission to the GPU.
> */
> +		batch_syncobj = syncobj_create(fd, 0);
> +
> +		/* Wait for the other threads to create their stuff
> too. */
> +
> +		end = start;
> +		end.tv_sec += 5;
> +		rc = clock_nanosleep(CLOCK_MONOTONIC, TIMER_ABSTIME,
> &end, NULL);
> +		igt_assert_eq(rc, 0);
> +
> +		batch_syncs[n_batch_syncs++] = (struct drm_xe_sync)
> {
> +			.extensions = 0,
> +			.type = DRM_XE_SYNC_TYPE_TIMELINE_SYNCOBJ,
> +			.flags = DRM_XE_SYNC_FLAG_SIGNAL,
> +			.handle = batch_syncobj,
> +			.timeline_value = timeline_val,
> +		};
> +		if (n_vram_bufs) {
> +			batch_syncs[n_batch_syncs++] = (struct
> drm_xe_sync) {
> +				.extensions = 0,
> +				.type =
> DRM_XE_SYNC_TYPE_TIMELINE_SYNCOBJ,
> +				.flags = 0, /* wait */
> +				.handle = vram_binds_syncobj,
> +				.timeline_value = 1,
> +			};
> +		}
> +		if (n_sram_bufs) {
> +			batch_syncs[n_batch_syncs++] = (struct
> drm_xe_sync) {
> +				.extensions = 0,
> +				.type =
> DRM_XE_SYNC_TYPE_TIMELINE_SYNCOBJ,
> +				.flags = 0, /* wait */
> +				.handle = sram_binds_syncobj,
> +				.timeline_value = 1,
> +			};
> +		}
> +		exec = (struct drm_xe_exec) {
> +			.exec_queue_id = ctx.exec_queue_id,
> +			.num_syncs = n_batch_syncs,
> +			.syncs = (uintptr_t)batch_syncs,
> +			.address = batch_bo.addr,
> +			.num_batch_buffer = 1,
> +		};
> +		for (retries = 0; retries < max_retries; retries++)
> {
> +			rc = igt_ioctl(fd, DRM_IOCTL_XE_EXEC,
> &exec);
> +
> +			if (!(rc && errno == ENOMEM))
> +				break;
> +
> +			usleep(100 * retries);
> +			if (retries == 0)
> +				igt_warn("got ENOMEM\n");
> +		}
> +		if (retries == max_retries)
> +			igt_warn("gave up after %d retries\n",
> retries);
> +
> +		if (rc) {
> +			igt_warn("errno: %d (%s)\n", errno,
> strerror(errno));
> +			perror(__func__);
> +		}
> +		igt_assert_eq(rc, 0);
> +
> +		if (retries)
> +			igt_info("!!!!!! succeeded after %d retries
> !!!!!!\n",
> +				 retries);
> +
> +		/* We need to wait for the GPU to finish. */
> +		igt_assert(syncobj_timeline_wait(fd, &batch_syncobj,
> +						 &timeline_val, 1,
> INT64_MAX, 0, NULL));
> +		result_bo.ptr = (int *)xe_bo_map(fd,
> result_bo.handle, result_bo.size);
> +		gpu_result = result_bo.ptr[0];
> +		igt_info("gpu_result = %d\n", gpu_result);
> +		igt_info("expected_result = %d\n", expected_result);
> +
> +		igt_assert_eq(gpu_result, expected_result);
> +		igt_assert_eq(munmap(result_bo.ptr, result_bo.size),
> 0);
> +		result_bo.ptr = NULL;
> +
> +		end.tv_sec += 10;
> +		rc = clock_nanosleep(CLOCK_MONOTONIC, TIMER_ABSTIME,
> &end, NULL);
> +		assert(rc == 0);
> +		gem_close(fd, batch_bo.handle);
> +		gem_close(fd, result_bo.handle);
> +		gem_close(fd, integers_bo.handle);
> +
> +		xe_vm_destroy(fd, ctx.vm_id);
> +		close(fd);
> +	}
> +	igt_waitchildren();
> +}
> +
>  int igt_main()
>  {
>  	struct drm_xe_engine_class_instance *hwe, *hwe_non_copy =
> NULL;
> @@ -2850,6 +3266,11 @@ int igt_main()
>  		test_oom(fd);
>  	}
>  
> +	igt_subtest("oversubscribe-concurrent-bind") {
> +		igt_require(xe_has_vram(fd));
> +		test_vm_oversubscribe_concurrent_bind(fd, 2, 4, 4);

AFAIK there are multiple tests in xe_evict() that do more or less the
same as this test. What is this test doing differently compared to those
tests? Is it the array bind?

Also, those hard-coded numbers need some explanation. Shouldn't they
relate to the amount of VRAM on the system, to the system memory, and
possibly also to the swap space available?

Thanks,
Thomas


> +	}
> +
>  	igt_fixture()
>  		drm_close_driver(fd);
>  }

  parent reply	other threads:[~2026-03-23 17:37 UTC|newest]

Thread overview: 11+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2026-02-18 16:44 [PATCH i-g-t 0/1] tests/xe_vm: Add oversubscribe concurrent bind stress test Sobin Thomas
2026-02-18 16:44 ` [PATCH i-g-t 1/1] " Sobin Thomas
2026-02-19 10:43   ` Sharma, Nishit
2026-03-05  3:25     ` Thomas, Sobin
2026-02-25 16:38   ` Kamil Konieczny
2026-03-23 17:37   ` Thomas Hellström [this message]
2026-02-18 18:09 ` ✓ Xe.CI.BAT: success for " Patchwork
2026-02-18 18:17 ` ✓ i915.CI.BAT: " Patchwork
2026-02-18 20:20 ` ✗ Xe.CI.FULL: failure " Patchwork
2026-02-18 20:42 ` ✗ i915.CI.Full: " Patchwork
2026-03-23 17:05 ` [PATCH i-g-t 0/1] " Thomas Hellström

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=5e48237f792717be2532bfe5a86b134bccc069bb.camel@linux.intel.com \
    --to=thomas.hellstrom@linux.intel.com \
    --cc=igt-dev@lists.freedesktop.org \
    --cc=nishit.sharma@intel.com \
    --cc=sobin.thomas@intel.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox