All of lore.kernel.org
 help / color / mirror / Atom feed
From: Thomas Monjalon <thomas@monjalon.net>
To: Aman Kumar <aman.kumar@vvdntech.in>
Cc: dev@dpdk.org, rasland@nvidia.com, asafp@nvidia.com,
	shys@nvidia.com, viacheslavo@nvidia.com, akozyrev@nvidia.com,
	matan@nvidia.com, anatoly.burakov@intel.com,
	keesang.song@amd.com, aman.kumar@vvdntech.in,
	jerinjacobk@gmail.com, bruce.richardson@intel.com,
	konstantin.ananyev@intel.com, david.marchand@redhat.com
Subject: Re: [dpdk-dev] [PATCH v2 1/2] lib/eal: add amd epyc2 memcpy routine to eal
Date: Tue, 19 Oct 2021 14:31:01 +0200	[thread overview]
Message-ID: <1936725.1xRPv8SPVf@thomas> (raw)
In-Reply-To: <20211019104724.19416-1-aman.kumar@vvdntech.in>

19/10/2021 12:47, Aman Kumar:
> This patch provides rte_memcpy* calls optimized for
> AMD EPYC platforms. Use config/x86/x86_amd_epyc_linux_gcc
> as cross-file with meson to build dpdk for AMD EPYC platforms.

Please split this into 2 patches: platform & memcpy.

What optimization is specific to EPYC?

I dislike the asm code below.
What is AMD-specific in it?
Could it use compiler intrinsics, as is done elsewhere?

> +static __rte_always_inline void *
> +rte_memcpy_aligned_ntload_tstore16_amdepyc2(void *dst,
> +					    const void *src,
> +					    size_t size)
> +{
> +	asm volatile goto("movq %0, %%rsi\n\t"
> +	"movq %1, %%rdi\n\t"
> +	"movq %2, %%rdx\n\t"
> +	"cmpq   $(128), %%rdx\n\t"
> +	"jb     202f\n\t"
> +	"201:\n\t"
> +	"vmovntdqa (%%rsi), %%ymm0\n\t"
> +	"vmovntdqa 32(%%rsi), %%ymm1\n\t"
> +	"vmovntdqa 64(%%rsi), %%ymm2\n\t"
> +	"vmovntdqa 96(%%rsi), %%ymm3\n\t"
> +	"vmovdqu  %%ymm0, (%%rdi)\n\t"
> +	"vmovdqu  %%ymm1, 32(%%rdi)\n\t"
> +	"vmovdqu  %%ymm2, 64(%%rdi)\n\t"
> +	"vmovdqu  %%ymm3, 96(%%rdi)\n\t"
> +	"addq   $128, %%rsi\n\t"
> +	"addq   $128, %%rdi\n\t"
> +	"subq   $128, %%rdx\n\t"
> +	"jz     %l[done]\n\t"
> +	"cmpq   $128, %%rdx\n\t" /*Vector Size 32B.  */
> +	"jae    201b\n\t"
> +	"202:\n\t"
> +	"cmpq   $64, %%rdx\n\t"
> +	"jb     203f\n\t"
> +	"vmovntdqa (%%rsi), %%ymm0\n\t"
> +	"vmovntdqa 32(%%rsi), %%ymm1\n\t"
> +	"vmovdqu  %%ymm0, (%%rdi)\n\t"
> +	"vmovdqu  %%ymm1, 32(%%rdi)\n\t"
> +	"addq   $64, %%rsi\n\t"
> +	"addq   $64, %%rdi\n\t"
> +	"subq   $64, %%rdx\n\t"
> +	"jz     %l[done]\n\t"
> +	"203:\n\t"
> +	"cmpq   $32, %%rdx\n\t"
> +	"jb     204f\n\t"
> +	"vmovntdqa (%%rsi), %%ymm0\n\t"
> +	"vmovdqu  %%ymm0, (%%rdi)\n\t"
> +	"addq   $32, %%rsi\n\t"
> +	"addq   $32, %%rdi\n\t"
> +	"subq   $32, %%rdx\n\t"
> +	"jz     %l[done]\n\t"
> +	"204:\n\t"
> +	"cmpb   $16, %%dl\n\t"
> +	"jb     205f\n\t"
> +	"vmovntdqa (%%rsi), %%xmm0\n\t"
> +	"vmovdqu  %%xmm0, (%%rdi)\n\t"
> +	"addq   $16, %%rsi\n\t"
> +	"addq   $16, %%rdi\n\t"
> +	"subq   $16, %%rdx\n\t"
> +	"jz     %l[done]\n\t"
> +	"205:\n\t"
> +	"cmpb   $2, %%dl\n\t"
> +	"jb     208f\n\t"
> +	"cmpb   $4, %%dl\n\t"
> +	"jbe    207f\n\t"
> +	"cmpb   $8, %%dl\n\t"
> +	"jbe    206f\n\t"
> +	"movq   -8(%%rsi,%%rdx), %%rcx\n\t"
> +	"movq   (%%rsi), %%rsi\n\t"
> +	"movq   %%rcx, -8(%%rdi,%%rdx)\n\t"
> +	"movq   %%rsi, (%%rdi)\n\t"
> +	"jmp    %l[done]\n\t"
> +	"206:\n\t"
> +	"movl   -4(%%rsi,%%rdx), %%ecx\n\t"
> +	"movl   (%%rsi), %%esi\n\t"
> +	"movl   %%ecx, -4(%%rdi,%%rdx)\n\t"
> +	"movl   %%esi, (%%rdi)\n\t"
> +	"jmp    %l[done]\n\t"
> +	"207:\n\t"
> +	"movzwl -2(%%rsi,%%rdx), %%ecx\n\t"
> +	"movzwl (%%rsi), %%esi\n\t"
> +	"movw   %%cx, -2(%%rdi,%%rdx)\n\t"
> +	"movw   %%si, (%%rdi)\n\t"
> +	"jmp    %l[done]\n\t"
> +	"208:\n\t"
> +	"movzbl (%%rsi), %%ecx\n\t"
> +	"movb   %%cl, (%%rdi)"
> +	:
> +	: "r"(src), "r"(dst), "r"(size)
> +	: "rcx", "rdx", "rsi", "rdi", "ymm0", "ymm1", "ymm2", "ymm3", "memory"
> +	: done
> +	);
> +done:
> +	return dst;
> +}
> +
> +static __rte_always_inline void *
> +rte_memcpy_generic(void *dst, const void *src, size_t len)
> +{
> +	asm goto("movq	%0, %%rsi\n\t"
> +	"movq	%1, %%rdi\n\t"
> +	"movq	%2, %%rdx\n\t"
> +	"movq    %%rdi, %%rax\n\t"
> +	"cmp     $32, %%rdx\n\t"
> +	"jb      101f\n\t"
> +	"cmp     $(32 * 2), %%rdx\n\t"
> +	"ja      108f\n\t"
> +	"vmovdqu   (%%rsi), %%ymm0\n\t"
> +	"vmovdqu   -32(%%rsi,%%rdx), %%ymm1\n\t"
> +	"vmovdqu   %%ymm0, (%%rdi)\n\t"
> +	"vmovdqu   %%ymm1, -32(%%rdi,%%rdx)\n\t"
> +	"vzeroupper\n\t"
> +	"jmp %l[done]\n\t"
> +	"101:\n\t"
> +	/* Less than 1 VEC.  */
> +	"cmpb    $32, %%dl\n\t"
> +	"jae     103f\n\t"
> +	"cmpb    $16, %%dl\n\t"
> +	"jae     104f\n\t"
> +	"cmpb    $8, %%dl\n\t"
> +	"jae     105f\n\t"
> +	"cmpb    $4, %%dl\n\t"
> +	"jae     106f\n\t"
> +	"cmpb    $1, %%dl\n\t"
> +	"ja      107f\n\t"
> +	"jb      102f\n\t"
> +	"movzbl  (%%rsi), %%ecx\n\t"
> +	"movb    %%cl, (%%rdi)\n\t"
> +	"102:\n\t"
> +	"jmp %l[done]\n\t"
> +	"103:\n\t"
> +	/* From 32 to 63.  No branch when size == 32.  */
> +	"vmovdqu (%%rsi), %%ymm0\n\t"
> +	"vmovdqu -32(%%rsi,%%rdx), %%ymm1\n\t"
> +	"vmovdqu %%ymm0, (%%rdi)\n\t"
> +	"vmovdqu %%ymm1, -32(%%rdi,%%rdx)\n\t"
> +	"vzeroupper\n\t"
> +	"jmp %l[done]\n\t"
> +	/* From 16 to 31.  No branch when size == 16.  */
> +	"104:\n\t"
> +	"vmovdqu (%%rsi), %%xmm0\n\t"
> +	"vmovdqu -16(%%rsi,%%rdx), %%xmm1\n\t"
> +	"vmovdqu %%xmm0, (%%rdi)\n\t"
> +	"vmovdqu %%xmm1, -16(%%rdi,%%rdx)\n\t"
> +	"jmp %l[done]\n\t"
> +	"105:\n\t"
> +	/* From 8 to 15.  No branch when size == 8.  */
> +	"movq    -8(%%rsi,%%rdx), %%rcx\n\t"
> +	"movq    (%%rsi), %%rsi\n\t"
> +	"movq    %%rcx, -8(%%rdi,%%rdx)\n\t"
> +	"movq    %%rsi, (%%rdi)\n\t"
> +	"jmp %l[done]\n\t"
> +	"106:\n\t"
> +	/* From 4 to 7.  No branch when size == 4.  */
> +	"movl    -4(%%rsi,%%rdx), %%ecx\n\t"
> +	"movl    (%%rsi), %%esi\n\t"
> +	"movl    %%ecx, -4(%%rdi,%%rdx)\n\t"
> +	"movl    %%esi, (%%rdi)\n\t"
> +	"jmp %l[done]\n\t"
> +	"107:\n\t"
> +	/* From 2 to 3.  No branch when size == 2.  */
> +	"movzwl  -2(%%rsi,%%rdx), %%ecx\n\t"
> +	"movzwl  (%%rsi), %%esi\n\t"
> +	"movw    %%cx, -2(%%rdi,%%rdx)\n\t"
> +	"movw    %%si, (%%rdi)\n\t"
> +	"jmp %l[done]\n\t"
> +	"108:\n\t"
> +	/* More than 2 * VEC and there may be overlap between destination */
> +	/* and source.  */
> +	"cmpq    $(32 * 8), %%rdx\n\t"
> +	"ja      111f\n\t"
> +	"cmpq    $(32 * 4), %%rdx\n\t"
> +	"jb      109f\n\t"
> +	/* Copy from 4 * VEC to 8 * VEC, inclusively. */
> +	"vmovdqu   (%%rsi), %%ymm0\n\t"
> +	"vmovdqu   32(%%rsi), %%ymm1\n\t"
> +	"vmovdqu   (32 * 2)(%%rsi), %%ymm2\n\t"
> +	"vmovdqu   (32 * 3)(%%rsi), %%ymm3\n\t"
> +	"vmovdqu   -32(%%rsi,%%rdx), %%ymm4\n\t"
> +	"vmovdqu   -(32 * 2)(%%rsi,%%rdx), %%ymm5\n\t"
> +	"vmovdqu   -(32 * 3)(%%rsi,%%rdx), %%ymm6\n\t"
> +	"vmovdqu   -(32 * 4)(%%rsi,%%rdx), %%ymm7\n\t"
> +	"vmovdqu   %%ymm0, (%%rdi)\n\t"
> +	"vmovdqu   %%ymm1, 32(%%rdi)\n\t"
> +	"vmovdqu   %%ymm2, (32 * 2)(%%rdi)\n\t"
> +	"vmovdqu   %%ymm3, (32 * 3)(%%rdi)\n\t"
> +	"vmovdqu   %%ymm4, -32(%%rdi,%%rdx)\n\t"
> +	"vmovdqu   %%ymm5, -(32 * 2)(%%rdi,%%rdx)\n\t"
> +	"vmovdqu   %%ymm6, -(32 * 3)(%%rdi,%%rdx)\n\t"
> +	"vmovdqu   %%ymm7, -(32 * 4)(%%rdi,%%rdx)\n\t"
> +	"vzeroupper\n\t"
> +	"jmp %l[done]\n\t"
> +	"109:\n\t"
> +	/* Copy from 2 * VEC to 4 * VEC. */
> +	"vmovdqu   (%%rsi), %%ymm0\n\t"
> +	"vmovdqu   32(%%rsi), %%ymm1\n\t"
> +	"vmovdqu   -32(%%rsi,%%rdx), %%ymm2\n\t"
> +	"vmovdqu   -(32 * 2)(%%rsi,%%rdx), %%ymm3\n\t"
> +	"vmovdqu   %%ymm0, (%%rdi)\n\t"
> +	"vmovdqu   %%ymm1, 32(%%rdi)\n\t"
> +	"vmovdqu   %%ymm2, -32(%%rdi,%%rdx)\n\t"
> +	"vmovdqu   %%ymm3, -(32 * 2)(%%rdi,%%rdx)\n\t"
> +	"vzeroupper\n\t"
> +	"110:\n\t"
> +	"jmp %l[done]\n\t"
> +	"111:\n\t"
> +	"cmpq    %%rsi, %%rdi\n\t"
> +	"ja      113f\n\t"
> +	/* Source == destination is less common.  */
> +	"je      110b\n\t"
> +	/* Load the first VEC and last 4 * VEC to
> +	 * support overlapping addresses.
> +	 */
> +	"vmovdqu   (%%rsi), %%ymm4\n\t"
> +	"vmovdqu   -32(%%rsi, %%rdx), %%ymm5\n\t"
> +	"vmovdqu   -(32 * 2)(%%rsi, %%rdx), %%ymm6\n\t"
> +	"vmovdqu   -(32 * 3)(%%rsi, %%rdx), %%ymm7\n\t"
> +	"vmovdqu   -(32 * 4)(%%rsi, %%rdx), %%ymm8\n\t"
> +	/* Save start and stop of the destination buffer.  */
> +	"movq    %%rdi, %%r11\n\t"
> +	"leaq    -32(%%rdi, %%rdx), %%rcx\n\t"
> +	/* Align destination for aligned stores in the loop.  Compute */
> +	/* how much destination is misaligned.  */
> +	"movq    %%rdi, %%r8\n\t"
> +	"andq    $(32 - 1), %%r8\n\t"
> +	/* Get the negative of offset for alignment.  */
> +	"subq    $32, %%r8\n\t"
> +	/* Adjust source.  */
> +	"subq    %%r8, %%rsi\n\t"
> +	/* Adjust destination which should be aligned now.  */
> +	"subq    %%r8, %%rdi\n\t"
> +	/* Adjust length.  */
> +	"addq    %%r8, %%rdx\n\t"
> +	/* Check non-temporal store threshold.  */
> +	"cmpq	 $(1024*1024), %%rdx\n\t"
> +	"ja      115f\n\t"
> +	"112:\n\t"
> +	/* Copy 4 * VEC a time forward.  */
> +	"vmovdqu   (%%rsi), %%ymm0\n\t"
> +	"vmovdqu   32(%%rsi), %%ymm1\n\t"
> +	"vmovdqu   (32 * 2)(%%rsi), %%ymm2\n\t"
> +	"vmovdqu   (32 * 3)(%%rsi), %%ymm3\n\t"
> +	"addq    $(32 * 4), %%rsi\n\t"
> +	"subq    $(32 * 4), %%rdx\n\t"
> +	"vmovdqa   %%ymm0, (%%rdi)\n\t"
> +	"vmovdqa   %%ymm1, 32(%%rdi)\n\t"
> +	"vmovdqa   %%ymm2, (32 * 2)(%%rdi)\n\t"
> +	"vmovdqa   %%ymm3, (32 * 3)(%%rdi)\n\t"
> +	"addq    $(32 * 4), %%rdi\n\t"
> +	"cmpq    $(32 * 4), %%rdx\n\t"
> +	"ja      112b\n\t"
> +	/* Store the last 4 * VEC.  */
> +	"vmovdqu   %%ymm5, (%%rcx)\n\t"
> +	"vmovdqu   %%ymm6, -32(%%rcx)\n\t"
> +	"vmovdqu   %%ymm7, -(32 * 2)(%%rcx)\n\t"
> +	"vmovdqu   %%ymm8, -(32 * 3)(%%rcx)\n\t"
> +	/* Store the first VEC.  */
> +	"vmovdqu   %%ymm4, (%%r11)\n\t"
> +	"vzeroupper\n\t"
> +	"jmp %l[done]\n\t"
> +	"113:\n\t"
> +	/* Load the first 4*VEC and last VEC to support overlapping addresses.*/
> +	"vmovdqu   (%%rsi), %%ymm4\n\t"
> +	"vmovdqu   32(%%rsi), %%ymm5\n\t"
> +	"vmovdqu   (32 * 2)(%%rsi), %%ymm6\n\t"
> +	"vmovdqu   (32 * 3)(%%rsi), %%ymm7\n\t"
> +	"vmovdqu   -32(%%rsi,%%rdx), %%ymm8\n\t"
> +	/* Save stop of the destination buffer.  */
> +	"leaq    -32(%%rdi, %%rdx), %%r11\n\t"
> +	/* Align destination end for aligned stores in the loop.  Compute */
> +	/* how much destination end is misaligned.  */
> +	"leaq    -32(%%rsi, %%rdx), %%rcx\n\t"
> +	"movq    %%r11, %%r9\n\t"
> +	"movq    %%r11, %%r8\n\t"
> +	"andq    $(32 - 1), %%r8\n\t"
> +	/* Adjust source.  */
> +	"subq    %%r8, %%rcx\n\t"
> +	/* Adjust the end of destination which should be aligned now.  */
> +	"subq    %%r8, %%r9\n\t"
> +	/* Adjust length.  */
> +	"subq    %%r8, %%rdx\n\t"
> +	 /* Check non-temporal store threshold.  */
> +	"cmpq	 $(1024*1024), %%rdx\n\t"
> +	"ja      117f\n\t"
> +	"114:\n\t"
> +	/* Copy 4 * VEC a time backward.  */
> +	"vmovdqu   (%%rcx), %%ymm0\n\t"
> +	"vmovdqu   -32(%%rcx), %%ymm1\n\t"
> +	"vmovdqu   -(32 * 2)(%%rcx), %%ymm2\n\t"
> +	"vmovdqu   -(32 * 3)(%%rcx), %%ymm3\n\t"
> +	"subq    $(32 * 4), %%rcx\n\t"
> +	"subq    $(32 * 4), %%rdx\n\t"
> +	"vmovdqa   %%ymm0, (%%r9)\n\t"
> +	"vmovdqa   %%ymm1, -32(%%r9)\n\t"
> +	"vmovdqa   %%ymm2, -(32 * 2)(%%r9)\n\t"
> +	"vmovdqa   %%ymm3, -(32 * 3)(%%r9)\n\t"
> +	"subq    $(32 * 4), %%r9\n\t"
> +	"cmpq    $(32 * 4), %%rdx\n\t"
> +	"ja      114b\n\t"
> +	/* Store the first 4 * VEC. */
> +	"vmovdqu   %%ymm4, (%%rdi)\n\t"
> +	"vmovdqu   %%ymm5, 32(%%rdi)\n\t"
> +	"vmovdqu   %%ymm6, (32 * 2)(%%rdi)\n\t"
> +	"vmovdqu   %%ymm7, (32 * 3)(%%rdi)\n\t"
> +	/* Store the last VEC. */
> +	"vmovdqu   %%ymm8, (%%r11)\n\t"
> +	"vzeroupper\n\t"
> +	"jmp %l[done]\n\t"
> +
> +	"115:\n\t"
> +	/* Don't use non-temporal store if there is overlap between */
> +	/* destination and source since destination may be in cache */
> +	/* when source is loaded. */
> +	"leaq    (%%rdi, %%rdx), %%r10\n\t"
> +	"cmpq    %%r10, %%rsi\n\t"
> +	"jb      112b\n\t"
> +	"116:\n\t"
> +	/* Copy 4 * VEC a time forward with non-temporal stores.  */
> +	"prefetcht0 (32*4*2)(%%rsi)\n\t"
> +	"prefetcht0 (32*4*2 + 64)(%%rsi)\n\t"
> +	"prefetcht0 (32*4*3)(%%rsi)\n\t"
> +	"prefetcht0 (32*4*3 + 64)(%%rsi)\n\t"
> +	"vmovdqu   (%%rsi), %%ymm0\n\t"
> +	"vmovdqu   32(%%rsi), %%ymm1\n\t"
> +	"vmovdqu   (32 * 2)(%%rsi), %%ymm2\n\t"
> +	"vmovdqu   (32 * 3)(%%rsi), %%ymm3\n\t"
> +	"addq    $(32*4), %%rsi\n\t"
> +	"subq    $(32*4), %%rdx\n\t"
> +	"vmovntdq  %%ymm0, (%%rdi)\n\t"
> +	"vmovntdq  %%ymm1, 32(%%rdi)\n\t"
> +	"vmovntdq  %%ymm2, (32 * 2)(%%rdi)\n\t"
> +	"vmovntdq  %%ymm3, (32 * 3)(%%rdi)\n\t"
> +	"addq    $(32*4), %%rdi\n\t"
> +	"cmpq    $(32*4), %%rdx\n\t"
> +	"ja      116b\n\t"
> +	"sfence\n\t"
> +	/* Store the last 4 * VEC.  */
> +	"vmovdqu   %%ymm5, (%%rcx)\n\t"
> +	"vmovdqu   %%ymm6, -32(%%rcx)\n\t"
> +	"vmovdqu   %%ymm7, -(32 * 2)(%%rcx)\n\t"
> +	"vmovdqu   %%ymm8, -(32 * 3)(%%rcx)\n\t"
> +	/* Store the first VEC.  */
> +	"vmovdqu   %%ymm4, (%%r11)\n\t"
> +	"vzeroupper\n\t"
> +	"jmp %l[done]\n\t"
> +	"117:\n\t"
> +	/* Don't use non-temporal store if there is overlap between */
> +	/* destination and source since destination may be in cache */
> +	/* when source is loaded.  */
> +	"leaq    (%%rcx, %%rdx), %%r10\n\t"
> +	"cmpq    %%r10, %%r9\n\t"
> +	"jb      114b\n\t"
> +	"118:\n\t"
> +	/* Copy 4 * VEC a time backward with non-temporal stores. */
> +	"prefetcht0 (-32 * 4 * 2)(%%rcx)\n\t"
> +	"prefetcht0 (-32 * 4 * 2 - 64)(%%rcx)\n\t"
> +	"prefetcht0 (-32 * 4 * 3)(%%rcx)\n\t"
> +	"prefetcht0 (-32 * 4 * 3 - 64)(%%rcx)\n\t"
> +	"vmovdqu   (%%rcx), %%ymm0\n\t"
> +	"vmovdqu   -32(%%rcx), %%ymm1\n\t"
> +	"vmovdqu   -(32 * 2)(%%rcx), %%ymm2\n\t"
> +	"vmovdqu   -(32 * 3)(%%rcx), %%ymm3\n\t"
> +	"subq    $(32*4), %%rcx\n\t"
> +	"subq    $(32*4), %%rdx\n\t"
> +	"vmovntdq  %%ymm0, (%%r9)\n\t"
> +	"vmovntdq  %%ymm1, -32(%%r9)\n\t"
> +	"vmovntdq  %%ymm2, -(32 * 2)(%%r9)\n\t"
> +	"vmovntdq  %%ymm3, -(32 * 3)(%%r9)\n\t"
> +	"subq    $(32 * 4), %%r9\n\t"
> +	"cmpq    $(32 * 4), %%rdx\n\t"
> +	"ja      118b\n\t"
> +	"sfence\n\t"
> +	/* Store the first 4 * VEC.  */
> +	"vmovdqu   %%ymm4, (%%rdi)\n\t"
> +	"vmovdqu   %%ymm5, 32(%%rdi)\n\t"
> +	"vmovdqu   %%ymm6, (32 * 2)(%%rdi)\n\t"
> +	"vmovdqu   %%ymm7, (32 * 3)(%%rdi)\n\t"
> +	/* Store the last VEC.  */
> +	"vmovdqu   %%ymm8, (%%r11)\n\t"
> +	"vzeroupper\n\t"
> +	"jmp %l[done]"
> +	:
> +	: "r"(src), "r"(dst), "r"(len)
> +	: "rax", "rcx", "rdx", "rdi", "rsi", "r8", "r9", "r10", "r11", "r12", "ymm0",
> +	"ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", "ymm7", "ymm8", "memory"
> +	: done
> +	);
> +done:
> +	return dst;
> +}




  parent reply	other threads:[~2021-10-19 12:31 UTC|newest]

Thread overview: 43+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2021-08-23  8:44 [dpdk-dev] [PATCH 1/2] lib/eal: add amd epyc2 memcpy routine to eal Aman Kumar
2021-08-23  8:44 ` [dpdk-dev] [PATCH 2/2] net/mlx5: optimize mprq memcpy for AMD EPYC2 platforms Aman Kumar
2021-10-13 16:53   ` Thomas Monjalon
2021-10-19 10:52     ` Aman Kumar
2021-08-23 15:21 ` [dpdk-dev] [PATCH 1/2] lib/eal: add amd epyc2 memcpy routine to eal Jerin Jacob
2021-08-30  9:39   ` Aman Kumar
2021-10-19 10:47 ` [dpdk-dev] [PATCH v2 " Aman Kumar
2021-10-19 10:47   ` [dpdk-dev] [PATCH v2 2/2] net/mlx5: optimize mprq memcpy for AMD EPYC2 plaform Aman Kumar
2021-10-19 12:31   ` Thomas Monjalon [this message]
2021-10-19 15:35     ` [dpdk-dev] [PATCH v2 1/2] lib/eal: add amd epyc2 memcpy routine to eal Stephen Hemminger
2021-10-21 17:10     ` Song, Keesang
2021-10-21 17:40       ` Ananyev, Konstantin
2021-10-21 18:12         ` Song, Keesang
2021-10-21 18:41           ` Thomas Monjalon
2021-10-21 19:03             ` Song, Keesang
2021-10-21 19:50               ` Thomas Monjalon
2021-10-21 20:14   ` Thomas Monjalon
2021-10-22  8:45     ` Bruce Richardson
2021-10-26 15:56   ` [dpdk-dev] [PATCH v3 1/3] config/x86: add support for AMD platform Aman Kumar
2021-10-26 15:56     ` [dpdk-dev] [PATCH v3 2/3] doc/guides: add dpdk build instruction for AMD platforms Aman Kumar
2021-10-26 16:07       ` Thomas Monjalon
2021-10-27  6:30         ` Aman Kumar
2021-10-26 15:56     ` [dpdk-dev] [PATCH v3 3/3] lib/eal: add temporal store memcpy support on AMD platform Aman Kumar
2021-10-26 16:14       ` Thomas Monjalon
2021-10-27  6:34         ` Aman Kumar
2021-10-27  7:59           ` Thomas Monjalon
2021-10-26 21:10       ` Stephen Hemminger
2021-10-27  6:43         ` Aman Kumar
2021-10-26 16:01     ` [dpdk-dev] [PATCH v3 1/3] config/x86: add support for " Thomas Monjalon
2021-10-27  6:26       ` Aman Kumar
2021-10-27  7:28     ` [dpdk-dev] [PATCH v4 1/2] " Aman Kumar
2021-10-27  7:28       ` [dpdk-dev] [PATCH v4 2/2] lib/eal: add temporal store memcpy " Aman Kumar
2021-10-27  8:13         ` Thomas Monjalon
2021-10-27 11:03           ` Van Haaren, Harry
2021-10-27 11:41             ` Mattias Rönnblom
2021-10-27 12:15               ` Van Haaren, Harry
2021-10-27 12:22                 ` Ananyev, Konstantin
2021-10-27 13:34                   ` Aman Kumar
2021-10-27 14:10                     ` Van Haaren, Harry
2021-10-27 14:31                       ` Thomas Monjalon
2021-10-29 16:01                         ` Song, Keesang
2021-10-27 14:26                     ` Ananyev, Konstantin
2021-10-27 11:33         ` Mattias Rönnblom

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=1936725.1xRPv8SPVf@thomas \
    --to=thomas@monjalon.net \
    --cc=akozyrev@nvidia.com \
    --cc=aman.kumar@vvdntech.in \
    --cc=anatoly.burakov@intel.com \
    --cc=asafp@nvidia.com \
    --cc=bruce.richardson@intel.com \
    --cc=david.marchand@redhat.com \
    --cc=dev@dpdk.org \
    --cc=jerinjacobk@gmail.com \
    --cc=keesang.song@amd.com \
    --cc=konstantin.ananyev@intel.com \
    --cc=matan@nvidia.com \
    --cc=rasland@nvidia.com \
    --cc=shys@nvidia.com \
    --cc=viacheslavo@nvidia.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.