LinuxPPC-Dev Archive on lore.kernel.org
 help / color / mirror / Atom feed
* Re: [RFC V2] virtio: Add platform specific DMA API translation for virito devices
From: Benjamin Herrenschmidt @ 2018-05-23 22:27 UTC (permalink / raw)
  To: Michael S. Tsirkin, Anshuman Khandual
  Cc: virtualization, linux-kernel, linuxppc-dev, aik, robh, joe,
	elfring, david, jasowang, mpe, hch
In-Reply-To: <20180523213703-mutt-send-email-mst@kernel.org>

On Wed, 2018-05-23 at 21:50 +0300, Michael S. Tsirkin wrote:

> I re-read that discussion and I'm still unclear on the
> original question, since I got several apparently
> conflicting answers.
> 
> I asked:
> 
> 	Why isn't setting VIRTIO_F_IOMMU_PLATFORM on the
> 	hypervisor side sufficient?

I thought I had replied to this...

There are a couple of reasons:

- First qemu doesn't know that the guest will switch to "secure mode"
in advance. There is no difference between a normal and a secure
partition until the partition does the magic UV call to "enter secure
mode" and qemu doesn't see any of it. So who can set the flag here ?

- Second, when using VIRTIO_F_IOMMU_PLATFORM, we also make qemu (or
vhost) go through the emulated MMIO for every access to the guest,
which adds additional overhead.

Cheers,
Ben.

> 
> 
> >  arch/powerpc/include/asm/dma-mapping.h |  6 ++++++
> >  arch/powerpc/platforms/pseries/iommu.c | 11 +++++++++++
> >  drivers/virtio/virtio_ring.c           | 10 ++++++++++
> >  3 files changed, 27 insertions(+)
> > 
> > diff --git a/arch/powerpc/include/asm/dma-mapping.h b/arch/powerpc/include/asm/dma-mapping.h
> > index 8fa3945..056e578 100644
> > --- a/arch/powerpc/include/asm/dma-mapping.h
> > +++ b/arch/powerpc/include/asm/dma-mapping.h
> > @@ -115,4 +115,10 @@ extern u64 __dma_get_required_mask(struct device *dev);
> >  #define ARCH_HAS_DMA_MMAP_COHERENT
> >  
> >  #endif /* __KERNEL__ */
> > +
> > +#define platform_forces_virtio_dma platform_forces_virtio_dma
> > +
> > +struct virtio_device;
> > +
> > +extern bool platform_forces_virtio_dma(struct virtio_device *vdev);
> >  #endif	/* _ASM_DMA_MAPPING_H */
> > diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c
> > index 06f0296..a2ec15a 100644
> > --- a/arch/powerpc/platforms/pseries/iommu.c
> > +++ b/arch/powerpc/platforms/pseries/iommu.c
> > @@ -38,6 +38,7 @@
> >  #include <linux/of.h>
> >  #include <linux/iommu.h>
> >  #include <linux/rculist.h>
> > +#include <linux/virtio.h>
> >  #include <asm/io.h>
> >  #include <asm/prom.h>
> >  #include <asm/rtas.h>
> > @@ -1396,3 +1397,13 @@ static int __init disable_multitce(char *str)
> >  __setup("multitce=", disable_multitce);
> >  
> >  machine_subsys_initcall_sync(pseries, tce_iommu_bus_notifier_init);
> > +
> > +bool platform_forces_virtio_dma(struct virtio_device *vdev)
> > +{
> > +	/*
> > +	 * On protected guest platforms, force virtio core to use DMA
> > +	 * MAP API for all virtio devices. But there can also be some
> > +	 * exceptions for individual devices like virtio balloon.
> > +	 */
> > +	return (of_find_compatible_node(NULL, NULL, "ibm,ultravisor") != NULL);
> > +}
> 
> Isn't this kind of slow?  vring_use_dma_api is on
> data path and supposed to be very fast.
> 
> > diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
> > index 21d464a..47ea6c3 100644
> > --- a/drivers/virtio/virtio_ring.c
> > +++ b/drivers/virtio/virtio_ring.c
> > @@ -141,8 +141,18 @@ struct vring_virtqueue {
> >   * unconditionally on data path.
> >   */
> >  
> > +#ifndef platform_forces_virtio_dma
> > +static inline bool platform_forces_virtio_dma(struct virtio_device *vdev)
> > +{
> > +	return false;
> > +}
> > +#endif
> > +
> >  static bool vring_use_dma_api(struct virtio_device *vdev)
> >  {
> > +	if (platform_forces_virtio_dma(vdev))
> > +		return true;
> > +
> >  	if (!virtio_has_iommu_quirk(vdev))
> >  		return true;
> >  
> > -- 
> > 2.9.3

^ permalink raw reply

* Re: [PATCH 07/14] powerpc: Add support for restartable sequences
From: Michael Ellerman @ 2018-05-24  1:03 UTC (permalink / raw)
  To: Mathieu Desnoyers, Boqun Feng
  Cc: Will Deacon, Peter Zijlstra, Paul E. McKenney, Andy Lutomirski,
	Dave Watson, linux-kernel, linux-api, Paul Turner, Andrew Morton,
	Russell King, Thomas Gleixner, Ingo Molnar, H. Peter Anvin,
	Andrew Hunter, Andi Kleen, Chris Lameter, Ben Maurer, rostedt,
	Josh Triplett, Linus Torvalds, Catalin Marinas, Michael Kerrisk,
	Joel Fernandes, Benjamin Herrenschmidt, Paul Mackerras,
	linuxppc-dev
In-Reply-To: <37442352.1577.1527110985175.JavaMail.zimbra@efficios.com>

Mathieu Desnoyers <mathieu.desnoyers@efficios.com> writes:
> ----- On May 23, 2018, at 4:14 PM, Mathieu Desnoyers mathieu.desnoyers@efficios.com wrote:
...
>> 
>> Hi Boqun,
>> 
>> I tried your patch in a ppc64 le environment, and it does not survive boot
>> with CONFIG_DEBUG_RSEQ=y. init gets killed right away.


Sorry this code is super gross and hard to deal with.

> The following fixup gets ppc64 to work:
>
> --- a/arch/powerpc/kernel/entry_64.S
> +++ b/arch/powerpc/kernel/entry_64.S
> @@ -208,6 +208,7 @@ system_call_exit:
>         /* Check whether the syscall is issued inside a restartable sequence */
>         addi    r3,r1,STACK_FRAME_OVERHEAD
>         bl      rseq_syscall
> +       ld      r3,RESULT(r1)
>  #endif
>         /*
>          * Disable interrupts so current_thread_info()->flags can't change,

I don't think that's safe.

If you look above that, we have r3, r8 and r12 all live:

.Lsyscall_exit:
	std	r3,RESULT(r1)
	CURRENT_THREAD_INFO(r12, r1)

	ld	r8,_MSR(r1)
#ifdef CONFIG_PPC_BOOK3S
	/* No MSR:RI on BookE */
	andi.	r10,r8,MSR_RI
	beq-	.Lunrecov_restore
#endif


They're all volatile across function calls:

  http://openpowerfoundation.org/wp-content/uploads/resources/leabi/content/dbdoclet.50655240_68174.html


The system_call_exit symbol is actually there for kprobes and cosmetic
purposes. The actual syscall return flow starts at .Lsyscall_exit.

So I think this would work:

diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S
index db4df061c33a..e19f377a25e0 100644
--- a/arch/powerpc/kernel/entry_64.S
+++ b/arch/powerpc/kernel/entry_64.S
@@ -184,6 +184,14 @@ system_call:			/* label this so stack traces look sane */
 
 .Lsyscall_exit:
 	std	r3,RESULT(r1)
+
+#ifdef CONFIG_DEBUG_RSEQ
+	/* Check whether the syscall is issued inside a restartable sequence */
+	addi    r3,r1,STACK_FRAME_OVERHEAD
+	bl      rseq_syscall
+	ld	r3,RESULT(r1)
+#endif
+
 	CURRENT_THREAD_INFO(r12, r1)
 
 	ld	r8,_MSR(r1)


I'll try and get this series into my test setup at some point, been a
bit busy lately :)

cheers

^ permalink raw reply related

* Re: [PATCH] sound: Use octal not symbolic permissions
From: Vinod @ 2018-05-24  4:11 UTC (permalink / raw)
  To: Joe Perches
  Cc: Clemens Ladisch, Brian Austin, Paul Handrigan, Timur Tabi,
	Nicolin Chen, Xiubo Li, Fabio Estevam, Jaroslav Kysela,
	Takashi Iwai, Liam Girdwood, Mark Brown, alsa-devel, linux-kernel,
	patches, linuxppc-dev
In-Reply-To: <b32d7b5f7827cc3283e4e46c3135144373404621.1527103197.git.joe@perches.com>

On 23-05-18, 12:20, Joe Perches wrote:
> Convert the S_<FOO> symbolic permissions to their octal equivalents as
> using octal and not symbolic permissions is preferred by many as more
> readable.
> 
> see: https://lkml.org/lkml/2016/8/2/1945
> 
> Done with automated conversion via:
> $ ./scripts/checkpatch.pl -f --types=SYMBOLIC_PERMS --fix-inplace <files...>
> 
> Miscellanea:
> 
> o Wrapped one multi-line call to a single line
> 
> Signed-off-by: Joe Perches <joe@perches.com>
> ---
>  sound/core/compress_offload.c             |  2 +-

Acked-By: Vinod Koul <vkoul@kernel.org>

-- 
~Vinod

^ permalink raw reply

* Re: [PATCH v3] powerpc: Implement csum_ipv6_magic in assembly
From: Christophe LEROY @ 2018-05-24  6:20 UTC (permalink / raw)
  To: Segher Boessenkool
  Cc: Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman,
	linux-kernel, linuxppc-dev, netdev
In-Reply-To: <20180523183447.GV17342@gate.crashing.org>



Le 23/05/2018 à 20:34, Segher Boessenkool a écrit :
> On Tue, May 22, 2018 at 08:57:01AM +0200, Christophe Leroy wrote:
>> The generic csum_ipv6_magic() generates a pretty bad result
> 
> <snip>
> 
> Please try with a more recent compiler, what you used is pretty ancient.
> It's not like recent compilers do great on this either, but it's not
> *that* bad anymore ;-)
> 
>> --- a/arch/powerpc/lib/checksum_32.S
>> +++ b/arch/powerpc/lib/checksum_32.S
>> @@ -293,3 +293,36 @@ dst_error:
>>   	EX_TABLE(51b, dst_error);
>>   
>>   EXPORT_SYMBOL(csum_partial_copy_generic)
>> +
>> +/*
>> + * static inline __sum16 csum_ipv6_magic(const struct in6_addr *saddr,
>> + *				      const struct in6_addr *daddr,
>> + *				      __u32 len, __u8 proto, __wsum sum)
>> + */
>> +
>> +_GLOBAL(csum_ipv6_magic)
>> +	lwz	r8, 0(r3)
>> +	lwz	r9, 4(r3)
>> +	lwz	r10, 8(r3)
>> +	lwz	r11, 12(r3)
>> +	addc	r0, r5, r6
>> +	adde	r0, r0, r7
>> +	adde	r0, r0, r8
>> +	adde	r0, r0, r9
>> +	adde	r0, r0, r10
>> +	adde	r0, r0, r11
>> +	lwz	r8, 0(r4)
>> +	lwz	r9, 4(r4)
>> +	lwz	r10, 8(r4)
>> +	lwz	r11, 12(r4)
>> +	adde	r0, r0, r8
>> +	adde	r0, r0, r9
>> +	adde	r0, r0, r10
>> +	adde	r0, r0, r11
>> +	addze	r0, r0
>> +	rotlwi	r3, r0, 16
>> +	add	r3, r0, r3
>> +	not	r3, r3
>> +	rlwinm	r3, r3, 16, 16, 31
>> +	blr
>> +EXPORT_SYMBOL(csum_ipv6_magic)
> 
> Clustering the loads and carry insns together is pretty much the worst you
> can do on most 32-bit CPUs.

Oh, really ? __csum_partial is written that way too.

Right, now I tried interleaving the lwz and adde. I get no improvment at 
all on a 885, but I get a 15% improvment on a 8321.

Christophe

> 
> 
> Segher
> 

^ permalink raw reply

* Re: [PATCH] cpuidle/powernv : init all present cpus for deep states
From: Akshay Adiga @ 2018-05-24  6:53 UTC (permalink / raw)
  To: linux-kernel, linuxppc-dev; +Cc: npiggin, ego, mpe
In-Reply-To: <1526472134-23757-1-git-send-email-akshay.adiga@linux.vnet.ibm.com>

On Wed, May 16, 2018 at 05:32:14PM +0530, Akshay Adiga wrote:
> Init all present cpus for deep states instead of "all possible" cpus.
> Init fails if the possible cpu is gaurded. Resulting in making only
> non-deep states available for cpuidle/hotplug.
> 
> Signed-off-by: Akshay Adiga <akshay.adiga@linux.vnet.ibm.com>
> ---
>  arch/powerpc/platforms/powernv/idle.c | 4 ++--
>  1 file changed, 2 insertions(+), 2 deletions(-)
> 
> diff --git a/arch/powerpc/platforms/powernv/idle.c b/arch/powerpc/platforms/powernv/idle.c
> index 1f12ab1..1c5d067 100644
> --- a/arch/powerpc/platforms/powernv/idle.c
> +++ b/arch/powerpc/platforms/powernv/idle.c
> @@ -79,7 +79,7 @@ static int pnv_save_sprs_for_deep_states(void)
>  	uint64_t msr_val = MSR_IDLE;
>  	uint64_t psscr_val = pnv_deepest_stop_psscr_val;
> 
> -	for_each_possible_cpu(cpu) {
> +	for_each_present_cpu(cpu) {
>  		uint64_t pir = get_hard_smp_processor_id(cpu);
>  		uint64_t hsprg0_val = (uint64_t)paca_ptrs[cpu];
> 
> @@ -814,7 +814,7 @@ static int __init pnv_init_idle_states(void)
>  		int cpu;
> 
>  		pr_info("powernv: idle: Saving PACA pointers of all CPUs in their thread sibling PACA\n");
> -		for_each_possible_cpu(cpu) {
> +		for_each_present_cpu(cpu) {
>  			int base_cpu = cpu_first_thread_sibling(cpu);
>  			int idx = cpu_thread_in_core(cpu);
>  			int i;
> -- 
> 2.5.5
>

Missed CC'ing to  mpe@ellerman.id.au 

^ permalink raw reply

* [PATCH bpf-next v4 00/10] bpf: enhancements for multi-function programs
From: Sandipan Das @ 2018-05-24  6:56 UTC (permalink / raw)
  To: ast, daniel; +Cc: netdev, linuxppc-dev, mpe, naveen.n.rao, jakub.kicinski

[1] Support for bpf-to-bpf function calls in the powerpc64 JIT compiler.

[2] Provide a way for resolving function calls because of the way JITed
    images are allocated in powerpc64.

[3] Fix to get JITed instruction dumps for multi-function programs from
    the bpf system call.

[4] Fix for bpftool to show delimited multi-function JITed image dumps.

v4:
 - Incorporate review comments from Jakub.
 - Fix JSON output for bpftool.

v3:
 - Change base tree tag to bpf-next.
 - Incorporate review comments from Alexei, Daniel and Jakub.
 - Make sure that the JITed image does not grow or shrink after
   the last pass due to the way the instruction sequence used
   to load a callee's address maybe optimized.
 - Make additional changes to the bpf system call and bpftool to
   make multi-function JITed dumps easier to correlate.

v2:
 - Incorporate review comments from Jakub.

Sandipan Das (10):
  bpf: support 64-bit offsets for bpf function calls
  bpf: powerpc64: pad function address loads with NOPs
  bpf: powerpc64: add JIT support for multi-function programs
  bpf: get kernel symbol addresses via syscall
  tools: bpf: sync bpf uapi header
  tools: bpftool: resolve calls without using imm field
  bpf: fix multi-function JITed dump obtained via syscall
  bpf: get JITed image lengths of functions via syscall
  tools: bpf: sync bpf uapi header
  tools: bpftool: add delimiters to multi-function JITed dumps

 arch/powerpc/net/bpf_jit_comp64.c | 110 ++++++++++++++++++++++++++++++--------
 include/uapi/linux/bpf.h          |   4 ++
 kernel/bpf/syscall.c              |  82 ++++++++++++++++++++++++++--
 kernel/bpf/verifier.c             |  22 +++++---
 tools/bpf/bpftool/prog.c          |  97 ++++++++++++++++++++++++++++++++-
 tools/bpf/bpftool/xlated_dumper.c |  14 +++--
 tools/bpf/bpftool/xlated_dumper.h |   3 ++
 tools/include/uapi/linux/bpf.h    |   4 ++
 8 files changed, 301 insertions(+), 35 deletions(-)

-- 
2.14.3

^ permalink raw reply

* [PATCH bpf-next v4 01/10] bpf: support 64-bit offsets for bpf function calls
From: Sandipan Das @ 2018-05-24  6:56 UTC (permalink / raw)
  To: ast, daniel; +Cc: netdev, linuxppc-dev, mpe, naveen.n.rao, jakub.kicinski
In-Reply-To: <cover.1527143877.git.sandipan@linux.vnet.ibm.com>

The imm field of a bpf instruction is a signed 32-bit integer.
For JITed bpf-to-bpf function calls, it holds the offset of the
start address of the callee's JITed image from __bpf_call_base.

For some architectures, such as powerpc64, this offset may be
as large as 64 bits and cannot be accomodated in the imm field
without truncation.

We resolve this by:

[1] Additionally using the auxiliary data of each function to
    keep a list of start addresses of the JITed images for all
    functions determined by the verifier.

[2] Retaining the subprog id inside the off field of the call
    instructions and using it to index into the list mentioned
    above and lookup the callee's address.

To make sure that the existing JIT compilers continue to work
without requiring changes, we keep the imm field as it is.

Signed-off-by: Sandipan Das <sandipan@linux.vnet.ibm.com>
---
 kernel/bpf/verifier.c | 15 ++++++++++++++-
 1 file changed, 14 insertions(+), 1 deletion(-)

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index a9e4b1372da6..559cb74ba29e 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -5383,11 +5383,24 @@ static int jit_subprogs(struct bpf_verifier_env *env)
 			    insn->src_reg != BPF_PSEUDO_CALL)
 				continue;
 			subprog = insn->off;
-			insn->off = 0;
 			insn->imm = (u64 (*)(u64, u64, u64, u64, u64))
 				func[subprog]->bpf_func -
 				__bpf_call_base;
 		}
+
+		/* we use the aux data to keep a list of the start addresses
+		 * of the JITed images for each function in the program
+		 *
+		 * for some architectures, such as powerpc64, the imm field
+		 * might not be large enough to hold the offset of the start
+		 * address of the callee's JITed image from __bpf_call_base
+		 *
+		 * in such cases, we can lookup the start address of a callee
+		 * by using its subprog id, available from the off field of
+		 * the call instruction, as an index for this list
+		 */
+		func[i]->aux->func = func;
+		func[i]->aux->func_cnt = env->subprog_cnt;
 	}
 	for (i = 0; i < env->subprog_cnt; i++) {
 		old_bpf_func = func[i]->bpf_func;
-- 
2.14.3

^ permalink raw reply related

* [PATCH bpf-next v4 02/10] bpf: powerpc64: pad function address loads with NOPs
From: Sandipan Das @ 2018-05-24  6:56 UTC (permalink / raw)
  To: ast, daniel; +Cc: netdev, linuxppc-dev, mpe, naveen.n.rao, jakub.kicinski
In-Reply-To: <cover.1527143877.git.sandipan@linux.vnet.ibm.com>

For multi-function programs, loading the address of a callee
function to a register requires emitting instructions whose
count varies from one to five depending on the nature of the
address.

Since we come to know of the callee's address only before the
extra pass, the number of instructions required to load this
address may vary from what was previously generated. This can
make the JITed image grow or shrink.

To avoid this, we should generate a constant five-instruction
when loading function addresses by padding the optimized load
sequence with NOPs.

Signed-off-by: Sandipan Das <sandipan@linux.vnet.ibm.com>
---
 arch/powerpc/net/bpf_jit_comp64.c | 34 +++++++++++++++++++++++-----------
 1 file changed, 23 insertions(+), 11 deletions(-)

diff --git a/arch/powerpc/net/bpf_jit_comp64.c b/arch/powerpc/net/bpf_jit_comp64.c
index 1bdb1aff0619..e4582744a31d 100644
--- a/arch/powerpc/net/bpf_jit_comp64.c
+++ b/arch/powerpc/net/bpf_jit_comp64.c
@@ -167,25 +167,37 @@ static void bpf_jit_build_epilogue(u32 *image, struct codegen_context *ctx)
 
 static void bpf_jit_emit_func_call(u32 *image, struct codegen_context *ctx, u64 func)
 {
+	unsigned int i, ctx_idx = ctx->idx;
+
+	/* Load function address into r12 */
+	PPC_LI64(12, func);
+
+	/* For bpf-to-bpf function calls, the callee's address is unknown
+	 * until the last extra pass. As seen above, we use PPC_LI64() to
+	 * load the callee's address, but this may optimize the number of
+	 * instructions required based on the nature of the address.
+	 *
+	 * Since we don't want the number of instructions emitted to change,
+	 * we pad the optimized PPC_LI64() call with NOPs to guarantee that
+	 * we always have a five-instruction sequence, which is the maximum
+	 * that PPC_LI64() can emit.
+	 */
+	for (i = ctx->idx - ctx_idx; i < 5; i++)
+		PPC_NOP();
+
 #ifdef PPC64_ELF_ABI_v1
-	/* func points to the function descriptor */
-	PPC_LI64(b2p[TMP_REG_2], func);
-	/* Load actual entry point from function descriptor */
-	PPC_BPF_LL(b2p[TMP_REG_1], b2p[TMP_REG_2], 0);
-	/* ... and move it to LR */
-	PPC_MTLR(b2p[TMP_REG_1]);
 	/*
 	 * Load TOC from function descriptor at offset 8.
 	 * We can clobber r2 since we get called through a
 	 * function pointer (so caller will save/restore r2)
 	 * and since we don't use a TOC ourself.
 	 */
-	PPC_BPF_LL(2, b2p[TMP_REG_2], 8);
-#else
-	/* We can clobber r12 */
-	PPC_FUNC_ADDR(12, func);
-	PPC_MTLR(12);
+	PPC_BPF_LL(2, 12, 8);
+	/* Load actual entry point from function descriptor */
+	PPC_BPF_LL(12, 12, 0);
 #endif
+
+	PPC_MTLR(12);
 	PPC_BLRL();
 }
 
-- 
2.14.3

^ permalink raw reply related

* [PATCH bpf-next v4 04/10] bpf: get kernel symbol addresses via syscall
From: Sandipan Das @ 2018-05-24  6:56 UTC (permalink / raw)
  To: ast, daniel; +Cc: netdev, linuxppc-dev, mpe, naveen.n.rao, jakub.kicinski
In-Reply-To: <cover.1527143877.git.sandipan@linux.vnet.ibm.com>

This adds new two new fields to struct bpf_prog_info. For
multi-function programs, these fields can be used to pass
a list of kernel symbol addresses for all functions in a
given program to userspace using the bpf system call with
the BPF_OBJ_GET_INFO_BY_FD command.

When bpf_jit_kallsyms is enabled, we can get the address
of the corresponding kernel symbol for a callee function
and resolve the symbol's name. The address is determined
by adding the value of the call instruction's imm field
to __bpf_call_base. This offset gets assigned to the imm
field by the verifier.

For some architectures, such as powerpc64, the imm field
is not large enough to hold this offset.

We resolve this by:

[1] Assigning the subprog id to the imm field of a call
    instruction in the verifier instead of the offset of
    the callee's symbol's address from __bpf_call_base.

[2] Determining the address of a callee's corresponding
    symbol by using the imm field as an index for the
    list of kernel symbol addresses now available from
    the program info.

Suggested-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Sandipan Das <sandipan@linux.vnet.ibm.com>
---
v3:
 - Copy addresses to jited_ksyms only if bpf_dump_raw_ok()
   is true.
 - Move new fields to the end of bpf_prog_info to avoid
   breaking userspace.
---
 include/uapi/linux/bpf.h |  2 ++
 kernel/bpf/syscall.c     | 25 +++++++++++++++++++++++++
 kernel/bpf/verifier.c    |  7 +------
 3 files changed, 28 insertions(+), 6 deletions(-)

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index c3e502d06bc3..0be90965867d 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -2205,6 +2205,8 @@ struct bpf_prog_info {
 	__u32 gpl_compatible:1;
 	__u64 netns_dev;
 	__u64 netns_ino;
+	__u32 nr_jited_ksyms;
+	__aligned_u64 jited_ksyms;
 } __attribute__((aligned(8)));
 
 struct bpf_map_info {
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 0b4c94551001..068a4fc79ddb 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1933,6 +1933,7 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog,
 	if (!capable(CAP_SYS_ADMIN)) {
 		info.jited_prog_len = 0;
 		info.xlated_prog_len = 0;
+		info.nr_jited_ksyms = 0;
 		goto done;
 	}
 
@@ -1981,6 +1982,30 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog,
 		}
 	}
 
+	ulen = info.nr_jited_ksyms;
+	info.nr_jited_ksyms = prog->aux->func_cnt;
+	if (info.nr_jited_ksyms && ulen) {
+		if (bpf_dump_raw_ok()) {
+			u64 __user *user_ksyms;
+			ulong ksym_addr;
+			u32 i;
+
+			/* copy the address of the kernel symbol
+			 * corresponding to each function
+			 */
+			ulen = min_t(u32, info.nr_jited_ksyms, ulen);
+			user_ksyms = u64_to_user_ptr(info.jited_ksyms);
+			for (i = 0; i < ulen; i++) {
+				ksym_addr = (ulong) prog->aux->func[i]->bpf_func;
+				ksym_addr &= PAGE_MASK;
+				if (put_user((u64) ksym_addr, &user_ksyms[i]))
+					return -EFAULT;
+			}
+		} else {
+			info.jited_ksyms = 0;
+		}
+	}
+
 done:
 	if (copy_to_user(uinfo, &info, info_len) ||
 	    put_user(info_len, &uattr->info.info_len))
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 559cb74ba29e..8c4d9d0fd3ab 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -5426,17 +5426,12 @@ static int jit_subprogs(struct bpf_verifier_env *env)
 	 * later look the same as if they were interpreted only.
 	 */
 	for (i = 0, insn = prog->insnsi; i < prog->len; i++, insn++) {
-		unsigned long addr;
-
 		if (insn->code != (BPF_JMP | BPF_CALL) ||
 		    insn->src_reg != BPF_PSEUDO_CALL)
 			continue;
 		insn->off = env->insn_aux_data[i].call_imm;
 		subprog = find_subprog(env, i + insn->off + 1);
-		addr  = (unsigned long)func[subprog]->bpf_func;
-		addr &= PAGE_MASK;
-		insn->imm = (u64 (*)(u64, u64, u64, u64, u64))
-			    addr - __bpf_call_base;
+		insn->imm = subprog;
 	}
 
 	prog->jited = 1;
-- 
2.14.3

^ permalink raw reply related

* [PATCH bpf-next v4 03/10] bpf: powerpc64: add JIT support for multi-function programs
From: Sandipan Das @ 2018-05-24  6:56 UTC (permalink / raw)
  To: ast, daniel; +Cc: netdev, linuxppc-dev, mpe, naveen.n.rao, jakub.kicinski
In-Reply-To: <cover.1527143877.git.sandipan@linux.vnet.ibm.com>

This adds support for bpf-to-bpf function calls in the powerpc64
JIT compiler. The JIT compiler converts the bpf call instructions
to native branch instructions. After a round of the usual passes,
the start addresses of the JITed images for the callee functions
are known. Finally, to fixup the branch target addresses, we need
to perform an extra pass.

Because of the address range in which JITed images are allocated
on powerpc64, the offsets of the start addresses of these images
from __bpf_call_base are as large as 64 bits. So, for a function
call, we cannot use the imm field of the instruction to determine
the callee's address. Instead, we use the alternative method of
getting it from the list of function addresses in the auxiliary
data of the caller by using the off field as an index.

Signed-off-by: Sandipan Das <sandipan@linux.vnet.ibm.com>
---
v3:
 - Fix memory leak for jit_data when we fail to allocated addrs.
 - Remove unnecessary bpf_jit_binary_lock_ro() call.
---
 arch/powerpc/net/bpf_jit_comp64.c | 76 +++++++++++++++++++++++++++++++++------
 1 file changed, 66 insertions(+), 10 deletions(-)

diff --git a/arch/powerpc/net/bpf_jit_comp64.c b/arch/powerpc/net/bpf_jit_comp64.c
index e4582744a31d..f1c95779843b 100644
--- a/arch/powerpc/net/bpf_jit_comp64.c
+++ b/arch/powerpc/net/bpf_jit_comp64.c
@@ -268,7 +268,7 @@ static void bpf_jit_emit_tail_call(u32 *image, struct codegen_context *ctx, u32
 /* Assemble the body code between the prologue & epilogue */
 static int bpf_jit_build_body(struct bpf_prog *fp, u32 *image,
 			      struct codegen_context *ctx,
-			      u32 *addrs)
+			      u32 *addrs, bool extra_pass)
 {
 	const struct bpf_insn *insn = fp->insnsi;
 	int flen = fp->len;
@@ -724,11 +724,25 @@ static int bpf_jit_build_body(struct bpf_prog *fp, u32 *image,
 			break;
 
 		/*
-		 * Call kernel helper
+		 * Call kernel helper or bpf function
 		 */
 		case BPF_JMP | BPF_CALL:
 			ctx->seen |= SEEN_FUNC;
-			func = (u8 *) __bpf_call_base + imm;
+
+			/* bpf function call */
+			if (insn[i].src_reg == BPF_PSEUDO_CALL)
+				if (!extra_pass)
+					func = NULL;
+				else if (fp->aux->func && off < fp->aux->func_cnt)
+					/* use the subprog id from the off
+					 * field to lookup the callee address
+					 */
+					func = (u8 *) fp->aux->func[off]->bpf_func;
+				else
+					return -EINVAL;
+			/* kernel helper call */
+			else
+				func = (u8 *) __bpf_call_base + imm;
 
 			bpf_jit_emit_func_call(image, ctx, (u64)func);
 
@@ -876,6 +890,14 @@ static int bpf_jit_build_body(struct bpf_prog *fp, u32 *image,
 	return 0;
 }
 
+struct powerpc64_jit_data {
+	struct bpf_binary_header *header;
+	u32 *addrs;
+	u8 *image;
+	u32 proglen;
+	struct codegen_context ctx;
+};
+
 struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *fp)
 {
 	u32 proglen;
@@ -883,6 +905,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *fp)
 	u8 *image = NULL;
 	u32 *code_base;
 	u32 *addrs;
+	struct powerpc64_jit_data *jit_data;
 	struct codegen_context cgctx;
 	int pass;
 	int flen;
@@ -890,6 +913,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *fp)
 	struct bpf_prog *org_fp = fp;
 	struct bpf_prog *tmp_fp;
 	bool bpf_blinded = false;
+	bool extra_pass = false;
 
 	if (!fp->jit_requested)
 		return org_fp;
@@ -903,11 +927,32 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *fp)
 		fp = tmp_fp;
 	}
 
+	jit_data = fp->aux->jit_data;
+	if (!jit_data) {
+		jit_data = kzalloc(sizeof(*jit_data), GFP_KERNEL);
+		if (!jit_data) {
+			fp = org_fp;
+			goto out;
+		}
+		fp->aux->jit_data = jit_data;
+	}
+
 	flen = fp->len;
+	addrs = jit_data->addrs;
+	if (addrs) {
+		cgctx = jit_data->ctx;
+		image = jit_data->image;
+		bpf_hdr = jit_data->header;
+		proglen = jit_data->proglen;
+		alloclen = proglen + FUNCTION_DESCR_SIZE;
+		extra_pass = true;
+		goto skip_init_ctx;
+	}
+
 	addrs = kzalloc((flen+1) * sizeof(*addrs), GFP_KERNEL);
 	if (addrs == NULL) {
 		fp = org_fp;
-		goto out;
+		goto out_addrs;
 	}
 
 	memset(&cgctx, 0, sizeof(struct codegen_context));
@@ -916,10 +961,10 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *fp)
 	cgctx.stack_size = round_up(fp->aux->stack_depth, 16);
 
 	/* Scouting faux-generate pass 0 */
-	if (bpf_jit_build_body(fp, 0, &cgctx, addrs)) {
+	if (bpf_jit_build_body(fp, 0, &cgctx, addrs, false)) {
 		/* We hit something illegal or unsupported. */
 		fp = org_fp;
-		goto out;
+		goto out_addrs;
 	}
 
 	/*
@@ -937,9 +982,10 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *fp)
 			bpf_jit_fill_ill_insns);
 	if (!bpf_hdr) {
 		fp = org_fp;
-		goto out;
+		goto out_addrs;
 	}
 
+skip_init_ctx:
 	code_base = (u32 *)(image + FUNCTION_DESCR_SIZE);
 
 	/* Code generation passes 1-2 */
@@ -947,7 +993,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *fp)
 		/* Now build the prologue, body code & epilogue for real. */
 		cgctx.idx = 0;
 		bpf_jit_build_prologue(code_base, &cgctx);
-		bpf_jit_build_body(fp, code_base, &cgctx, addrs);
+		bpf_jit_build_body(fp, code_base, &cgctx, addrs, extra_pass);
 		bpf_jit_build_epilogue(code_base, &cgctx);
 
 		if (bpf_jit_enable > 1)
@@ -973,10 +1019,20 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *fp)
 	fp->jited_len = alloclen;
 
 	bpf_flush_icache(bpf_hdr, (u8 *)bpf_hdr + (bpf_hdr->pages * PAGE_SIZE));
+	if (!fp->is_func || extra_pass) {
+out_addrs:
+		kfree(addrs);
+		kfree(jit_data);
+		fp->aux->jit_data = NULL;
+	} else {
+		jit_data->addrs = addrs;
+		jit_data->ctx = cgctx;
+		jit_data->proglen = proglen;
+		jit_data->image = image;
+		jit_data->header = bpf_hdr;
+	}
 
 out:
-	kfree(addrs);
-
 	if (bpf_blinded)
 		bpf_jit_prog_release_other(fp, fp == org_fp ? tmp_fp : org_fp);
 
-- 
2.14.3

^ permalink raw reply related

* [PATCH bpf-next v4 05/10] tools: bpf: sync bpf uapi header
From: Sandipan Das @ 2018-05-24  6:56 UTC (permalink / raw)
  To: ast, daniel; +Cc: netdev, linuxppc-dev, mpe, naveen.n.rao, jakub.kicinski
In-Reply-To: <cover.1527143877.git.sandipan@linux.vnet.ibm.com>

Syncing the bpf.h uapi header with tools so that struct
bpf_prog_info has the two new fields for passing on the
addresses of the kernel symbols corresponding to each
function in a program.

Signed-off-by: Sandipan Das <sandipan@linux.vnet.ibm.com>
---
v3:
 - Move new fields to the end of bpf_prog_info to avoid
   breaking userspace.
---
 tools/include/uapi/linux/bpf.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index c3e502d06bc3..0be90965867d 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -2205,6 +2205,8 @@ struct bpf_prog_info {
 	__u32 gpl_compatible:1;
 	__u64 netns_dev;
 	__u64 netns_ino;
+	__u32 nr_jited_ksyms;
+	__aligned_u64 jited_ksyms;
 } __attribute__((aligned(8)));
 
 struct bpf_map_info {
-- 
2.14.3

^ permalink raw reply related

* [PATCH bpf-next v4 06/10] tools: bpftool: resolve calls without using imm field
From: Sandipan Das @ 2018-05-24  6:56 UTC (permalink / raw)
  To: ast, daniel; +Cc: netdev, linuxppc-dev, mpe, naveen.n.rao, jakub.kicinski
In-Reply-To: <cover.1527143877.git.sandipan@linux.vnet.ibm.com>

Currently, we resolve the callee's address for a JITed function
call by using the imm field of the call instruction as an offset
from __bpf_call_base. If bpf_jit_kallsyms is enabled, we further
use this address to get the callee's kernel symbol's name.

For some architectures, such as powerpc64, the imm field is not
large enough to hold this offset. So, instead of assigning this
offset to the imm field, the verifier now assigns the subprog
id. Also, a list of kernel symbol addresses for all the JITed
functions is provided in the program info. We now use the imm
field as an index for this list to lookup a callee's symbol's
address and resolve its name.

Suggested-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Sandipan Das <sandipan@linux.vnet.ibm.com>
Reviewed-by: Jakub Kicinski <jakub.kicinski@netronome.com>
---
v3:
 - Avoid using redundant pointers.
 - Fix indentation.

v2:
 - Order variables from longest to shortest.
 - Make sure that ksyms_ptr and ksyms_len are always initialized.
 - Simplify code.
---
 tools/bpf/bpftool/prog.c          | 24 ++++++++++++++++++++++++
 tools/bpf/bpftool/xlated_dumper.c | 10 +++++++++-
 tools/bpf/bpftool/xlated_dumper.h |  2 ++
 3 files changed, 35 insertions(+), 1 deletion(-)

diff --git a/tools/bpf/bpftool/prog.c b/tools/bpf/bpftool/prog.c
index 9bdfdf2d3fbe..e05ab58d39e2 100644
--- a/tools/bpf/bpftool/prog.c
+++ b/tools/bpf/bpftool/prog.c
@@ -420,7 +420,9 @@ static int do_show(int argc, char **argv)
 
 static int do_dump(int argc, char **argv)
 {
+	unsigned long *func_ksyms = NULL;
 	struct bpf_prog_info info = {};
+	unsigned int nr_func_ksyms;
 	struct dump_data dd = {};
 	__u32 len = sizeof(info);
 	unsigned int buf_size;
@@ -496,10 +498,22 @@ static int do_dump(int argc, char **argv)
 		return -1;
 	}
 
+	nr_func_ksyms = info.nr_jited_ksyms;
+	if (nr_func_ksyms) {
+		func_ksyms = malloc(nr_func_ksyms * sizeof(__u64));
+		if (!func_ksyms) {
+			p_err("mem alloc failed");
+			close(fd);
+			goto err_free;
+		}
+	}
+
 	memset(&info, 0, sizeof(info));
 
 	*member_ptr = ptr_to_u64(buf);
 	*member_len = buf_size;
+	info.jited_ksyms = ptr_to_u64(func_ksyms);
+	info.nr_jited_ksyms = nr_func_ksyms;
 
 	err = bpf_obj_get_info_by_fd(fd, &info, &len);
 	close(fd);
@@ -513,6 +527,11 @@ static int do_dump(int argc, char **argv)
 		goto err_free;
 	}
 
+	if (info.nr_jited_ksyms > nr_func_ksyms) {
+		p_err("too many addresses returned");
+		goto err_free;
+	}
+
 	if ((member_len == &info.jited_prog_len &&
 	     info.jited_prog_insns == 0) ||
 	    (member_len == &info.xlated_prog_len &&
@@ -558,6 +577,9 @@ static int do_dump(int argc, char **argv)
 			dump_xlated_cfg(buf, *member_len);
 	} else {
 		kernel_syms_load(&dd);
+		dd.nr_jited_ksyms = info.nr_jited_ksyms;
+		dd.jited_ksyms = (__u64 *) info.jited_ksyms;
+
 		if (json_output)
 			dump_xlated_json(&dd, buf, *member_len, opcodes);
 		else
@@ -566,10 +588,12 @@ static int do_dump(int argc, char **argv)
 	}
 
 	free(buf);
+	free(func_ksyms);
 	return 0;
 
 err_free:
 	free(buf);
+	free(func_ksyms);
 	return -1;
 }
 
diff --git a/tools/bpf/bpftool/xlated_dumper.c b/tools/bpf/bpftool/xlated_dumper.c
index 7a3173b76c16..efdc8fecf2bb 100644
--- a/tools/bpf/bpftool/xlated_dumper.c
+++ b/tools/bpf/bpftool/xlated_dumper.c
@@ -174,7 +174,11 @@ static const char *print_call_pcrel(struct dump_data *dd,
 				    unsigned long address,
 				    const struct bpf_insn *insn)
 {
-	if (sym)
+	if (!dd->nr_jited_ksyms)
+		/* Do not show address for interpreted programs */
+		snprintf(dd->scratch_buff, sizeof(dd->scratch_buff),
+			"%+d", insn->off);
+	else if (sym)
 		snprintf(dd->scratch_buff, sizeof(dd->scratch_buff),
 			 "%+d#%s", insn->off, sym->name);
 	else
@@ -203,6 +207,10 @@ static const char *print_call(void *private_data,
 	unsigned long address = dd->address_call_base + insn->imm;
 	struct kernel_sym *sym;
 
+	if (insn->src_reg == BPF_PSEUDO_CALL &&
+	    (__u32) insn->imm < dd->nr_jited_ksyms)
+		address = dd->jited_ksyms[insn->imm];
+
 	sym = kernel_syms_search(dd, address);
 	if (insn->src_reg == BPF_PSEUDO_CALL)
 		return print_call_pcrel(dd, sym, address, insn);
diff --git a/tools/bpf/bpftool/xlated_dumper.h b/tools/bpf/bpftool/xlated_dumper.h
index b34affa7ef2d..eafbb49c8d0b 100644
--- a/tools/bpf/bpftool/xlated_dumper.h
+++ b/tools/bpf/bpftool/xlated_dumper.h
@@ -49,6 +49,8 @@ struct dump_data {
 	unsigned long address_call_base;
 	struct kernel_sym *sym_mapping;
 	__u32 sym_count;
+	__u64 *jited_ksyms;
+	__u32 nr_jited_ksyms;
 	char scratch_buff[SYM_MAX_NAME + 8];
 };
 
-- 
2.14.3

^ permalink raw reply related

* [PATCH bpf-next v4 07/10] bpf: fix multi-function JITed dump obtained via syscall
From: Sandipan Das @ 2018-05-24  6:56 UTC (permalink / raw)
  To: ast, daniel; +Cc: netdev, linuxppc-dev, mpe, naveen.n.rao, jakub.kicinski
In-Reply-To: <cover.1527143877.git.sandipan@linux.vnet.ibm.com>

Currently, for multi-function programs, we cannot get the JITed
instructions using the bpf system call's BPF_OBJ_GET_INFO_BY_FD
command. Because of this, userspace tools such as bpftool fail
to identify a multi-function program as being JITed or not.

With the JIT enabled and the test program running, this can be
verified as follows:

  # cat /proc/sys/net/core/bpf_jit_enable
  1

Before applying this patch:

  # bpftool prog list
  1: kprobe  name foo  tag b811aab41a39ad3d  gpl
          loaded_at 2018-05-16T11:43:38+0530  uid 0
          xlated 216B  not jited  memlock 65536B
  ...

  # bpftool prog dump jited id 1
  no instructions returned

After applying this patch:

  # bpftool prog list
  1: kprobe  name foo  tag b811aab41a39ad3d  gpl
          loaded_at 2018-05-16T12:13:01+0530  uid 0
          xlated 216B  jited 308B  memlock 65536B
  ...

  # bpftool prog dump jited id 1
     0:   nop
     4:   nop
     8:   mflr    r0
     c:   std     r0,16(r1)
    10:   stdu    r1,-112(r1)
    14:   std     r31,104(r1)
    18:   addi    r31,r1,48
    1c:   li      r3,10
  ...

Signed-off-by: Sandipan Das <sandipan@linux.vnet.ibm.com>
---
v4:
 - If the image allocated in userspace is not large enough,
   fill it up till the end even if the last JITed image can
   only be copied partially.
---
 kernel/bpf/syscall.c | 37 ++++++++++++++++++++++++++++++++++---
 1 file changed, 34 insertions(+), 3 deletions(-)

diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 068a4fc79ddb..c8e987a612b5 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1970,13 +1970,44 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog,
 	 * for offload.
 	 */
 	ulen = info.jited_prog_len;
-	info.jited_prog_len = prog->jited_len;
+	if (prog->aux->func_cnt) {
+		u32 i;
+
+		info.jited_prog_len = 0;
+		for (i = 0; i < prog->aux->func_cnt; i++)
+			info.jited_prog_len += prog->aux->func[i]->jited_len;
+	} else {
+		info.jited_prog_len = prog->jited_len;
+	}
+
 	if (info.jited_prog_len && ulen) {
 		if (bpf_dump_raw_ok()) {
 			uinsns = u64_to_user_ptr(info.jited_prog_insns);
 			ulen = min_t(u32, info.jited_prog_len, ulen);
-			if (copy_to_user(uinsns, prog->bpf_func, ulen))
-				return -EFAULT;
+
+			/* for multi-function programs, copy the JITed
+			 * instructions for all the functions
+			 */
+			if (prog->aux->func_cnt) {
+				u32 len, free, i;
+				u8 *img;
+
+				free = ulen;
+				for (i = 0; i < prog->aux->func_cnt; i++) {
+					len = prog->aux->func[i]->jited_len;
+					len = min_t(u32, len, free);
+					img = (u8 *) prog->aux->func[i]->bpf_func;
+					if (copy_to_user(uinsns, img, len))
+						return -EFAULT;
+					uinsns += len;
+					free -= len;
+					if (!free)
+						break;
+				}
+			} else {
+				if (copy_to_user(uinsns, prog->bpf_func, ulen))
+					return -EFAULT;
+			}
 		} else {
 			info.jited_prog_insns = 0;
 		}
-- 
2.14.3

^ permalink raw reply related

* [PATCH bpf-next v4 08/10] bpf: get JITed image lengths of functions via syscall
From: Sandipan Das @ 2018-05-24  6:56 UTC (permalink / raw)
  To: ast, daniel; +Cc: netdev, linuxppc-dev, mpe, naveen.n.rao, jakub.kicinski
In-Reply-To: <cover.1527143877.git.sandipan@linux.vnet.ibm.com>

This adds new two new fields to struct bpf_prog_info. For
multi-function programs, these fields can be used to pass
a list of the JITed image lengths of each function for a
given program to userspace using the bpf system call with
the BPF_OBJ_GET_INFO_BY_FD command.

This can be used by userspace applications like bpftool
to split up the contiguous JITed dump, also obtained via
the system call, into more relatable chunks corresponding
to each function.

Signed-off-by: Sandipan Das <sandipan@linux.vnet.ibm.com>
---
 include/uapi/linux/bpf.h |  2 ++
 kernel/bpf/syscall.c     | 20 ++++++++++++++++++++
 2 files changed, 22 insertions(+)

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 0be90965867d..344d2ddcef49 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -2206,7 +2206,9 @@ struct bpf_prog_info {
 	__u64 netns_dev;
 	__u64 netns_ino;
 	__u32 nr_jited_ksyms;
+	__u32 nr_jited_func_lens;
 	__aligned_u64 jited_ksyms;
+	__aligned_u64 jited_func_lens;
 } __attribute__((aligned(8)));
 
 struct bpf_map_info {
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index c8e987a612b5..788456c18617 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -2037,6 +2037,26 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog,
 		}
 	}
 
+	ulen = info.nr_jited_func_lens;
+	info.nr_jited_func_lens = prog->aux->func_cnt;
+	if (info.nr_jited_func_lens && ulen) {
+		if (bpf_dump_raw_ok()) {
+			u32 __user *user_lens;
+			u32 func_len, i;
+
+			/* copy the JITed image lengths for each function */
+			ulen = min_t(u32, info.nr_jited_func_lens, ulen);
+			user_lens = u64_to_user_ptr(info.jited_func_lens);
+			for (i = 0; i < ulen; i++) {
+				func_len = prog->aux->func[i]->jited_len;
+				if (put_user(func_len, &user_lens[i]))
+					return -EFAULT;
+			}
+		} else {
+			info.jited_func_lens = 0;
+		}
+	}
+
 done:
 	if (copy_to_user(uinfo, &info, info_len) ||
 	    put_user(info_len, &uattr->info.info_len))
-- 
2.14.3

^ permalink raw reply related

* [PATCH bpf-next v4 09/10] tools: bpf: sync bpf uapi header
From: Sandipan Das @ 2018-05-24  6:56 UTC (permalink / raw)
  To: ast, daniel; +Cc: netdev, linuxppc-dev, mpe, naveen.n.rao, jakub.kicinski
In-Reply-To: <cover.1527143877.git.sandipan@linux.vnet.ibm.com>

Syncing the bpf.h uapi header with tools so that struct
bpf_prog_info has the two new fields for passing on the
JITed image lengths of each function in a multi-function
program.

Signed-off-by: Sandipan Das <sandipan@linux.vnet.ibm.com>
---
 tools/include/uapi/linux/bpf.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 0be90965867d..344d2ddcef49 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -2206,7 +2206,9 @@ struct bpf_prog_info {
 	__u64 netns_dev;
 	__u64 netns_ino;
 	__u32 nr_jited_ksyms;
+	__u32 nr_jited_func_lens;
 	__aligned_u64 jited_ksyms;
+	__aligned_u64 jited_func_lens;
 } __attribute__((aligned(8)));
 
 struct bpf_map_info {
-- 
2.14.3

^ permalink raw reply related

* [PATCH bpf-next v4 10/10] tools: bpftool: add delimiters to multi-function JITed dumps
From: Sandipan Das @ 2018-05-24  6:56 UTC (permalink / raw)
  To: ast, daniel; +Cc: netdev, linuxppc-dev, mpe, naveen.n.rao, jakub.kicinski
In-Reply-To: <cover.1527143877.git.sandipan@linux.vnet.ibm.com>

This splits up the contiguous JITed dump obtained via the bpf
system call into more relatable chunks for each function in
the program. If the kernel symbols corresponding to these are
known, they are printed in the header for each JIT image dump
otherwise the masked start address is printed.

Before applying this patch:

  # bpftool prog dump jited id 1

     0:	push   %rbp
     1:	mov    %rsp,%rbp
  ...
    70:	leaveq
    71:	retq
    72:	push   %rbp
    73:	mov    %rsp,%rbp
  ...
    dd:	leaveq
    de:	retq

  # bpftool -p prog dump jited id 1

  [{
          "pc": "0x0",
          "operation": "push",
          "operands": ["%rbp"
          ]
      },{
  ...
      },{
          "pc": "0x71",
          "operation": "retq",
          "operands": [null
          ]
      },{
          "pc": "0x72",
          "operation": "push",
          "operands": ["%rbp"
          ]
      },{
  ...
      },{
          "pc": "0xde",
          "operation": "retq",
          "operands": [null
          ]
      }
  ]

After applying this patch:

  # echo 0 > /proc/sys/net/core/bpf_jit_kallsyms
  # bpftool prog dump jited id 1

  0xffffffffc02c7000:
     0:	push   %rbp
     1:	mov    %rsp,%rbp
  ...
    70:	leaveq
    71:	retq

  0xffffffffc02cf000:
     0:	push   %rbp
     1:	mov    %rsp,%rbp
  ...
    6b:	leaveq
    6c:	retq

  # bpftool -p prog dump jited id 1

  [{
          "name": "0xffffffffc02c7000",
          "insns": [{
                  "pc": "0x0",
                  "operation": "push",
                  "operands": ["%rbp"
                  ]
              },{
  ...
              },{
                  "pc": "0x71",
                  "operation": "retq",
                  "operands": [null
                  ]
              }
          ]
      },{
          "name": "0xffffffffc02cf000",
          "insns": [{
                  "pc": "0x0",
                  "operation": "push",
                  "operands": ["%rbp"
                  ]
              },{
  ...
              },{
                  "pc": "0x6c",
                  "operation": "retq",
                  "operands": [null
                  ]
              }
          ]
      }
  ]

  # echo 1 > /proc/sys/net/core/bpf_jit_kallsyms
  # bpftool prog dump jited id 1

  bpf_prog_b811aab41a39ad3d_foo:
     0:	push   %rbp
     1:	mov    %rsp,%rbp
  ...
    70:	leaveq
    71:	retq

  bpf_prog_cf418ac8b67bebd9_F:
     0:	push   %rbp
     1:	mov    %rsp,%rbp
  ...
    6b:	leaveq
    6c:	retq

  # bpftool -p prog dump jited id 1

  [{
          "name": "bpf_prog_b811aab41a39ad3d_foo",
          "insns": [{
                  "pc": "0x0",
                  "operation": "push",
                  "operands": ["%rbp"
                  ]
              },{
  ...
              },{
                  "pc": "0x71",
                  "operation": "retq",
                  "operands": [null
                  ]
              }
          ]
      },{
          "name": "bpf_prog_cf418ac8b67bebd9_F",
          "insns": [{
                  "pc": "0x0",
                  "operation": "push",
                  "operands": ["%rbp"
                  ]
              },{
  ...
              },{
                  "pc": "0x6c",
                  "operation": "retq",
                  "operands": [null
                  ]
              }
          ]
      }
  ]

Signed-off-by: Sandipan Das <sandipan@linux.vnet.ibm.com>
---
v4:
 - Fix JSON output.
---
 tools/bpf/bpftool/prog.c          | 73 ++++++++++++++++++++++++++++++++++++++-
 tools/bpf/bpftool/xlated_dumper.c |  4 +--
 tools/bpf/bpftool/xlated_dumper.h |  1 +
 3 files changed, 75 insertions(+), 3 deletions(-)

diff --git a/tools/bpf/bpftool/prog.c b/tools/bpf/bpftool/prog.c
index e05ab58d39e2..39b88e760367 100644
--- a/tools/bpf/bpftool/prog.c
+++ b/tools/bpf/bpftool/prog.c
@@ -422,7 +422,9 @@ static int do_dump(int argc, char **argv)
 {
 	unsigned long *func_ksyms = NULL;
 	struct bpf_prog_info info = {};
+	unsigned int *func_lens = NULL;
 	unsigned int nr_func_ksyms;
+	unsigned int nr_func_lens;
 	struct dump_data dd = {};
 	__u32 len = sizeof(info);
 	unsigned int buf_size;
@@ -508,12 +510,24 @@ static int do_dump(int argc, char **argv)
 		}
 	}
 
+	nr_func_lens = info.nr_jited_func_lens;
+	if (nr_func_lens) {
+		func_lens = malloc(nr_func_lens * sizeof(__u32));
+		if (!func_lens) {
+			p_err("mem alloc failed");
+			close(fd);
+			goto err_free;
+		}
+	}
+
 	memset(&info, 0, sizeof(info));
 
 	*member_ptr = ptr_to_u64(buf);
 	*member_len = buf_size;
 	info.jited_ksyms = ptr_to_u64(func_ksyms);
 	info.nr_jited_ksyms = nr_func_ksyms;
+	info.jited_func_lens = ptr_to_u64(func_lens);
+	info.nr_jited_func_lens = nr_func_lens;
 
 	err = bpf_obj_get_info_by_fd(fd, &info, &len);
 	close(fd);
@@ -532,6 +546,11 @@ static int do_dump(int argc, char **argv)
 		goto err_free;
 	}
 
+	if (info.nr_jited_func_lens > nr_func_lens) {
+		p_err("too many values returned");
+		goto err_free;
+	}
+
 	if ((member_len == &info.jited_prog_len &&
 	     info.jited_prog_insns == 0) ||
 	    (member_len == &info.xlated_prog_len &&
@@ -569,7 +588,57 @@ static int do_dump(int argc, char **argv)
 				goto err_free;
 		}
 
-		disasm_print_insn(buf, *member_len, opcodes, name);
+		if (info.nr_jited_func_lens && info.jited_func_lens) {
+			struct kernel_sym *sym = NULL;
+			char sym_name[SYM_MAX_NAME];
+			unsigned char *img = buf;
+			__u64 *ksyms = NULL;
+			__u32 *lens;
+			__u32 i;
+
+			if (info.nr_jited_ksyms) {
+				kernel_syms_load(&dd);
+				ksyms = (__u64 *) info.jited_ksyms;
+			}
+
+			if (json_output)
+				jsonw_start_array(json_wtr);
+
+			lens = (__u32 *) info.jited_func_lens;
+			for (i = 0; i < info.nr_jited_func_lens; i++) {
+				if (ksyms) {
+					sym = kernel_syms_search(&dd, ksyms[i]);
+					if (sym)
+						sprintf(sym_name, "%s", sym->name);
+					else
+						sprintf(sym_name, "0x%016llx", ksyms[i]);
+				} else {
+					strcpy(sym_name, "unknown");
+				}
+
+				if (json_output) {
+					jsonw_start_object(json_wtr);
+					jsonw_name(json_wtr, "name");
+					jsonw_string(json_wtr, sym_name);
+					jsonw_name(json_wtr, "insns");
+				} else {
+					printf("%s:\n", sym_name);
+				}
+
+				disasm_print_insn(img, lens[i], opcodes, name);
+				img += lens[i];
+
+				if (json_output)
+					jsonw_end_object(json_wtr);
+				else
+					printf("\n");
+			}
+
+			if (json_output)
+				jsonw_end_array(json_wtr);
+		} else {
+			disasm_print_insn(buf, *member_len, opcodes, name);
+		}
 	} else if (visual) {
 		if (json_output)
 			jsonw_null(json_wtr);
@@ -589,11 +658,13 @@ static int do_dump(int argc, char **argv)
 
 	free(buf);
 	free(func_ksyms);
+	free(func_lens);
 	return 0;
 
 err_free:
 	free(buf);
 	free(func_ksyms);
+	free(func_lens);
 	return -1;
 }
 
diff --git a/tools/bpf/bpftool/xlated_dumper.c b/tools/bpf/bpftool/xlated_dumper.c
index efdc8fecf2bb..b97f1da60dd1 100644
--- a/tools/bpf/bpftool/xlated_dumper.c
+++ b/tools/bpf/bpftool/xlated_dumper.c
@@ -102,8 +102,8 @@ void kernel_syms_destroy(struct dump_data *dd)
 	free(dd->sym_mapping);
 }
 
-static struct kernel_sym *kernel_syms_search(struct dump_data *dd,
-					     unsigned long key)
+struct kernel_sym *kernel_syms_search(struct dump_data *dd,
+				      unsigned long key)
 {
 	struct kernel_sym sym = {
 		.address = key,
diff --git a/tools/bpf/bpftool/xlated_dumper.h b/tools/bpf/bpftool/xlated_dumper.h
index eafbb49c8d0b..33d86e2b369b 100644
--- a/tools/bpf/bpftool/xlated_dumper.h
+++ b/tools/bpf/bpftool/xlated_dumper.h
@@ -56,6 +56,7 @@ struct dump_data {
 
 void kernel_syms_load(struct dump_data *dd);
 void kernel_syms_destroy(struct dump_data *dd);
+struct kernel_sym *kernel_syms_search(struct dump_data *dd, unsigned long key);
 void dump_xlated_json(struct dump_data *dd, void *buf, unsigned int len,
 		      bool opcodes);
 void dump_xlated_plain(struct dump_data *dd, void *buf, unsigned int len,
-- 
2.14.3

^ permalink raw reply related

* Re: [PATCH bpf-next v4 10/10] tools: bpftool: add delimiters to multi-function JITed dumps
From: Jakub Kicinski @ 2018-05-24  7:04 UTC (permalink / raw)
  To: Sandipan Das; +Cc: ast, daniel, netdev, linuxppc-dev, mpe, naveen.n.rao
In-Reply-To: <477bc3286831947c2eb774e8787f72680cf378cf.1527143877.git.sandipan@linux.vnet.ibm.com>

On Thu, 24 May 2018 12:26:54 +0530, Sandipan Das wrote:
> This splits up the contiguous JITed dump obtained via the bpf
> system call into more relatable chunks for each function in
> the program. If the kernel symbols corresponding to these are
> known, they are printed in the header for each JIT image dump
> otherwise the masked start address is printed.

...
 
> Signed-off-by: Sandipan Das <sandipan@linux.vnet.ibm.com>

Reviewed-by: Jakub Kicinski <jakub.kicinski@netronome.com>

Thank you!

^ permalink raw reply

* Re: [RFC V2] virtio: Add platform specific DMA API translation for virito devices
From: Ram Pai @ 2018-05-24  7:21 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: Anshuman Khandual, robh, aik, jasowang, linux-kernel,
	virtualization, hch, joe, linuxppc-dev, elfring, david
In-Reply-To: <20180523213703-mutt-send-email-mst@kernel.org>

On Wed, May 23, 2018 at 09:50:02PM +0300, Michael S. Tsirkin wrote:
> subj: s/virito/virtio/
> 
..snip..
> >  machine_subsys_initcall_sync(pseries, tce_iommu_bus_notifier_init);
> > +
> > +bool platform_forces_virtio_dma(struct virtio_device *vdev)
> > +{
> > +	/*
> > +	 * On protected guest platforms, force virtio core to use DMA
> > +	 * MAP API for all virtio devices. But there can also be some
> > +	 * exceptions for individual devices like virtio balloon.
> > +	 */
> > +	return (of_find_compatible_node(NULL, NULL, "ibm,ultravisor") != NULL);
> > +}
> 
> Isn't this kind of slow?  vring_use_dma_api is on
> data path and supposed to be very fast.

Yes it is slow and not ideal. This won't be the final code. The final
code will cache the information in some global variable and used
in this function.

However this code was added to the RFC to illustrate the idea that dma
operation are needed only in a secure/protected environment.

RP

^ permalink raw reply

* Re: [PATCH bpf-next v4 00/10] bpf: enhancements for multi-function programs
From: Daniel Borkmann @ 2018-05-24  7:31 UTC (permalink / raw)
  To: Sandipan Das, ast; +Cc: netdev, linuxppc-dev, mpe, naveen.n.rao, jakub.kicinski
In-Reply-To: <cover.1527143877.git.sandipan@linux.vnet.ibm.com>

On 05/24/2018 08:56 AM, Sandipan Das wrote:
> [1] Support for bpf-to-bpf function calls in the powerpc64 JIT compiler.
> 
> [2] Provide a way for resolving function calls because of the way JITed
>     images are allocated in powerpc64.
> 
> [3] Fix to get JITed instruction dumps for multi-function programs from
>     the bpf system call.
> 
> [4] Fix for bpftool to show delimited multi-function JITed image dumps.
> 
> v4:
>  - Incorporate review comments from Jakub.
>  - Fix JSON output for bpftool.
> 
> v3:
>  - Change base tree tag to bpf-next.
>  - Incorporate review comments from Alexei, Daniel and Jakub.
>  - Make sure that the JITed image does not grow or shrink after
>    the last pass due to the way the instruction sequence used
>    to load a callee's address maybe optimized.
>  - Make additional changes to the bpf system call and bpftool to
>    make multi-function JITed dumps easier to correlate.
> 
> v2:
>  - Incorporate review comments from Jakub.
> 
> Sandipan Das (10):
>   bpf: support 64-bit offsets for bpf function calls
>   bpf: powerpc64: pad function address loads with NOPs
>   bpf: powerpc64: add JIT support for multi-function programs
>   bpf: get kernel symbol addresses via syscall
>   tools: bpf: sync bpf uapi header
>   tools: bpftool: resolve calls without using imm field
>   bpf: fix multi-function JITed dump obtained via syscall
>   bpf: get JITed image lengths of functions via syscall
>   tools: bpf: sync bpf uapi header
>   tools: bpftool: add delimiters to multi-function JITed dumps
> 
>  arch/powerpc/net/bpf_jit_comp64.c | 110 ++++++++++++++++++++++++++++++--------
>  include/uapi/linux/bpf.h          |   4 ++
>  kernel/bpf/syscall.c              |  82 ++++++++++++++++++++++++++--
>  kernel/bpf/verifier.c             |  22 +++++---
>  tools/bpf/bpftool/prog.c          |  97 ++++++++++++++++++++++++++++++++-
>  tools/bpf/bpftool/xlated_dumper.c |  14 +++--
>  tools/bpf/bpftool/xlated_dumper.h |   3 ++
>  tools/include/uapi/linux/bpf.h    |   4 ++
>  8 files changed, 301 insertions(+), 35 deletions(-)

Applied to bpf-next, thanks a lot Sandipan!

^ permalink raw reply

* Re: [PATCH bpf-next v4 02/10] bpf: powerpc64: pad function address loads with NOPs
From: Daniel Borkmann @ 2018-05-24  7:34 UTC (permalink / raw)
  To: Sandipan Das, ast; +Cc: netdev, linuxppc-dev, mpe, naveen.n.rao, jakub.kicinski
In-Reply-To: <d0db970711596827ba88209e545c686d77c22b7d.1527143877.git.sandipan@linux.vnet.ibm.com>

On 05/24/2018 08:56 AM, Sandipan Das wrote:
> For multi-function programs, loading the address of a callee
> function to a register requires emitting instructions whose
> count varies from one to five depending on the nature of the
> address.
> 
> Since we come to know of the callee's address only before the
> extra pass, the number of instructions required to load this
> address may vary from what was previously generated. This can
> make the JITed image grow or shrink.
> 
> To avoid this, we should generate a constant five-instruction
> when loading function addresses by padding the optimized load
> sequence with NOPs.
> 
> Signed-off-by: Sandipan Das <sandipan@linux.vnet.ibm.com>
> ---
>  arch/powerpc/net/bpf_jit_comp64.c | 34 +++++++++++++++++++++++-----------
>  1 file changed, 23 insertions(+), 11 deletions(-)
> 
> diff --git a/arch/powerpc/net/bpf_jit_comp64.c b/arch/powerpc/net/bpf_jit_comp64.c
> index 1bdb1aff0619..e4582744a31d 100644
> --- a/arch/powerpc/net/bpf_jit_comp64.c
> +++ b/arch/powerpc/net/bpf_jit_comp64.c
> @@ -167,25 +167,37 @@ static void bpf_jit_build_epilogue(u32 *image, struct codegen_context *ctx)
>  
>  static void bpf_jit_emit_func_call(u32 *image, struct codegen_context *ctx, u64 func)
>  {
> +	unsigned int i, ctx_idx = ctx->idx;
> +
> +	/* Load function address into r12 */
> +	PPC_LI64(12, func);
> +
> +	/* For bpf-to-bpf function calls, the callee's address is unknown
> +	 * until the last extra pass. As seen above, we use PPC_LI64() to
> +	 * load the callee's address, but this may optimize the number of
> +	 * instructions required based on the nature of the address.
> +	 *
> +	 * Since we don't want the number of instructions emitted to change,
> +	 * we pad the optimized PPC_LI64() call with NOPs to guarantee that
> +	 * we always have a five-instruction sequence, which is the maximum
> +	 * that PPC_LI64() can emit.
> +	 */
> +	for (i = ctx->idx - ctx_idx; i < 5; i++)
> +		PPC_NOP();

By the way, I think you can still optimize this. The nops are not really
needed in case of insn->src_reg != BPF_PSEUDO_CALL since the address of
a normal BPF helper call will always be at a fixed location and known a
priori.

>  #ifdef PPC64_ELF_ABI_v1
> -	/* func points to the function descriptor */
> -	PPC_LI64(b2p[TMP_REG_2], func);
> -	/* Load actual entry point from function descriptor */
> -	PPC_BPF_LL(b2p[TMP_REG_1], b2p[TMP_REG_2], 0);
> -	/* ... and move it to LR */
> -	PPC_MTLR(b2p[TMP_REG_1]);
>  	/*
>  	 * Load TOC from function descriptor at offset 8.
>  	 * We can clobber r2 since we get called through a
>  	 * function pointer (so caller will save/restore r2)
>  	 * and since we don't use a TOC ourself.
>  	 */
> -	PPC_BPF_LL(2, b2p[TMP_REG_2], 8);
> -#else
> -	/* We can clobber r12 */
> -	PPC_FUNC_ADDR(12, func);
> -	PPC_MTLR(12);
> +	PPC_BPF_LL(2, 12, 8);
> +	/* Load actual entry point from function descriptor */
> +	PPC_BPF_LL(12, 12, 0);
>  #endif
> +
> +	PPC_MTLR(12);
>  	PPC_BLRL();
>  }
>  
> 

^ permalink raw reply

* Re: [RFC V2] virtio: Add platform specific DMA API translation for virito devices
From: Christoph Hellwig @ 2018-05-24  7:17 UTC (permalink / raw)
  To: Benjamin Herrenschmidt
  Cc: Michael S. Tsirkin, Anshuman Khandual, virtualization,
	linux-kernel, linuxppc-dev, aik, robh, joe, elfring, david,
	jasowang, mpe, hch
In-Reply-To: <f77638ddef3af52dd71341083707c9e3745dd505.camel@kernel.crashing.org>

On Thu, May 24, 2018 at 08:27:04AM +1000, Benjamin Herrenschmidt wrote:
> - First qemu doesn't know that the guest will switch to "secure mode"
> in advance. There is no difference between a normal and a secure
> partition until the partition does the magic UV call to "enter secure
> mode" and qemu doesn't see any of it. So who can set the flag here ?
> 
> - Second, when using VIRTIO_F_IOMMU_PLATFORM, we also make qemu (or
> vhost) go through the emulated MMIO for every access to the guest,
> which adds additional overhead.

Also this whole scheme is simply the wrong way around.  No driver should
opt out of the DMA API in general.  For legacy reasons we might have to
opt out of the dma API for some virtio cases due to qemu bugs, but
this should never have been the default, but a quirk for the affected
versions.

We need to fix this now and make the dma ops bypass the quirk instead of
the default, which will also solve the power issue.

^ permalink raw reply

* Re: [PATCH v5 2/4] powerpc/64: enhance memcmp() with VMX instruction for long bytes comparision
From: Michael Ellerman @ 2018-05-24  7:44 UTC (permalink / raw)
  To: wei.guo.simon, linuxppc-dev
  Cc: Paul Mackerras, Naveen N.  Rao, Cyril Bur, Simon Guo
In-Reply-To: <1527058083-6998-3-git-send-email-wei.guo.simon@gmail.com>

Hi Simon,

wei.guo.simon@gmail.com writes:
> From: Simon Guo <wei.guo.simon@gmail.com>
>
> This patch add VMX primitives to do memcmp() in case the compare size
> exceeds 4K bytes. KSM feature can benefit from this.

You say "exceeds 4K" here.

> diff --git a/arch/powerpc/lib/memcmp_64.S b/arch/powerpc/lib/memcmp_64.S
> index f20e883..6303bbf 100644
> --- a/arch/powerpc/lib/memcmp_64.S
> +++ b/arch/powerpc/lib/memcmp_64.S
> @@ -27,12 +27,73 @@
>  #define LH	lhbrx
>  #define LW	lwbrx
>  #define LD	ldbrx
> +#define LVS	lvsr
> +#define VPERM(_VRT,_VRA,_VRB,_VRC) \
> +	vperm _VRT,_VRB,_VRA,_VRC
>  #else
>  #define LH	lhzx
>  #define LW	lwzx
>  #define LD	ldx
> +#define LVS	lvsl
> +#define VPERM(_VRT,_VRA,_VRB,_VRC) \
> +	vperm _VRT,_VRA,_VRB,_VRC
>  #endif
>  
> +#define VMX_OPS_THRES 4096

THRES == 4096

BTW, can we call it VMX_THRESH ?

> +#define ENTER_VMX_OPS	\
> +	mflr    r0;	\
> +	std     r3,-STACKFRAMESIZE+STK_REG(R31)(r1); \
> +	std     r4,-STACKFRAMESIZE+STK_REG(R30)(r1); \
> +	std     r5,-STACKFRAMESIZE+STK_REG(R29)(r1); \
> +	std     r0,16(r1); \
> +	stdu    r1,-STACKFRAMESIZE(r1); \
> +	bl      enter_vmx_ops; \
> +	cmpwi   cr1,r3,0; \
> +	ld      r0,STACKFRAMESIZE+16(r1); \
> +	ld      r3,STK_REG(R31)(r1); \
> +	ld      r4,STK_REG(R30)(r1); \
> +	ld      r5,STK_REG(R29)(r1); \
> +	addi	r1,r1,STACKFRAMESIZE; \
> +	mtlr    r0
> +
> +#define EXIT_VMX_OPS \
> +	mflr    r0; \
> +	std     r3,-STACKFRAMESIZE+STK_REG(R31)(r1); \
> +	std     r4,-STACKFRAMESIZE+STK_REG(R30)(r1); \
> +	std     r5,-STACKFRAMESIZE+STK_REG(R29)(r1); \
> +	std     r0,16(r1); \
> +	stdu    r1,-STACKFRAMESIZE(r1); \
> +	bl      exit_vmx_ops; \
> +	ld      r0,STACKFRAMESIZE+16(r1); \
> +	ld      r3,STK_REG(R31)(r1); \
> +	ld      r4,STK_REG(R30)(r1); \
> +	ld      r5,STK_REG(R29)(r1); \
> +	addi	r1,r1,STACKFRAMESIZE; \
> +	mtlr    r0
> +
> +/*
> + * LD_VSR_CROSS16B load the 2nd 16 bytes for _vaddr which is unaligned with
> + * 16 bytes boundary and permute the result with the 1st 16 bytes.
> +
> + *    |  y y y y y y y y y y y y y 0 1 2 | 3 4 5 6 7 8 9 a b c d e f z z z |
> + *    ^                                  ^                                 ^
> + * 0xbbbb10                          0xbbbb20                          0xbbb30
> + *                                 ^
> + *                                _vaddr
> + *
> + *
> + * _vmask is the mask generated by LVS
> + * _v1st_qw is the 1st aligned QW of current addr which is already loaded.
> + *   for example: 0xyyyyyyyyyyyyy012 for big endian
> + * _v2nd_qw is the 2nd aligned QW of cur _vaddr to be loaded.
> + *   for example: 0x3456789abcdefzzz for big endian
> + * The permute result is saved in _v_res.
> + *   for example: 0x0123456789abcdef for big endian.
> + */
> +#define LD_VSR_CROSS16B(_vaddr,_vmask,_v1st_qw,_v2nd_qw,_v_res) \
> +        lvx     _v2nd_qw,_vaddr,off16; \
> +        VPERM(_v_res,_v1st_qw,_v2nd_qw,_vmask)
> +
>  /*
>   * There are 2 categories for memcmp:
>   * 1) src/dst has the same offset to the 8 bytes boundary. The handlers
> @@ -174,6 +235,13 @@ _GLOBAL(memcmp)
>  	blr
>  
>  .Llong:
> +#ifdef CONFIG_ALTIVEC
> +	/* Try to use vmx loop if length is larger than 4K */
> +	cmpldi  cr6,r5,VMX_OPS_THRES
> +	bge	cr6,.Lsameoffset_vmx_cmp

Here we compare the length to 4K and if it's greater *or equal* then we
go to the VMX case. Or am I reading it backward?

So we should say "if the size is 4K or more we do VMX" shouldn't we?

cheers

^ permalink raw reply

* Re: [PATCH v5 2/4] powerpc/64: enhance memcmp() with VMX instruction for long bytes comparision
From: Simon Guo @ 2018-05-23 15:37 UTC (permalink / raw)
  To: Michael Ellerman; +Cc: linuxppc-dev, Paul Mackerras, Naveen N.  Rao, Cyril Bur
In-Reply-To: <87d0xlsdjy.fsf@concordia.ellerman.id.au>

Hi Michael,
On Thu, May 24, 2018 at 05:44:33PM +1000, Michael Ellerman wrote:
> Hi Simon,
> 
> wei.guo.simon@gmail.com writes:
> > From: Simon Guo <wei.guo.simon@gmail.com>
> >
> > This patch add VMX primitives to do memcmp() in case the compare size
> > exceeds 4K bytes. KSM feature can benefit from this.
> 
> You say "exceeds 4K" here.
> 
it should be >= 4k. I will correct the message.

> > diff --git a/arch/powerpc/lib/memcmp_64.S b/arch/powerpc/lib/memcmp_64.S
> > index f20e883..6303bbf 100644
> > --- a/arch/powerpc/lib/memcmp_64.S
> > +++ b/arch/powerpc/lib/memcmp_64.S
> > @@ -27,12 +27,73 @@
> >  #define LH	lhbrx
> >  #define LW	lwbrx
> >  #define LD	ldbrx
> > +#define LVS	lvsr
> > +#define VPERM(_VRT,_VRA,_VRB,_VRC) \
> > +	vperm _VRT,_VRB,_VRA,_VRC
> >  #else
> >  #define LH	lhzx
> >  #define LW	lwzx
> >  #define LD	ldx
> > +#define LVS	lvsl
> > +#define VPERM(_VRT,_VRA,_VRB,_VRC) \
> > +	vperm _VRT,_VRA,_VRB,_VRC
> >  #endif
> >  
> > +#define VMX_OPS_THRES 4096
> 
> THRES == 4096
> 
> BTW, can we call it VMX_THRESH ?
> 
Sure. I will update it.

> > +#define ENTER_VMX_OPS	\
> > +	mflr    r0;	\
> > +	std     r3,-STACKFRAMESIZE+STK_REG(R31)(r1); \
> > +	std     r4,-STACKFRAMESIZE+STK_REG(R30)(r1); \
> > +	std     r5,-STACKFRAMESIZE+STK_REG(R29)(r1); \
> > +	std     r0,16(r1); \
> > +	stdu    r1,-STACKFRAMESIZE(r1); \
> > +	bl      enter_vmx_ops; \
> > +	cmpwi   cr1,r3,0; \
> > +	ld      r0,STACKFRAMESIZE+16(r1); \
> > +	ld      r3,STK_REG(R31)(r1); \
> > +	ld      r4,STK_REG(R30)(r1); \
> > +	ld      r5,STK_REG(R29)(r1); \
> > +	addi	r1,r1,STACKFRAMESIZE; \
> > +	mtlr    r0
> > +
> > +#define EXIT_VMX_OPS \
> > +	mflr    r0; \
> > +	std     r3,-STACKFRAMESIZE+STK_REG(R31)(r1); \
> > +	std     r4,-STACKFRAMESIZE+STK_REG(R30)(r1); \
> > +	std     r5,-STACKFRAMESIZE+STK_REG(R29)(r1); \
> > +	std     r0,16(r1); \
> > +	stdu    r1,-STACKFRAMESIZE(r1); \
> > +	bl      exit_vmx_ops; \
> > +	ld      r0,STACKFRAMESIZE+16(r1); \
> > +	ld      r3,STK_REG(R31)(r1); \
> > +	ld      r4,STK_REG(R30)(r1); \
> > +	ld      r5,STK_REG(R29)(r1); \
> > +	addi	r1,r1,STACKFRAMESIZE; \
> > +	mtlr    r0
> > +
> > +/*
> > + * LD_VSR_CROSS16B load the 2nd 16 bytes for _vaddr which is unaligned with
> > + * 16 bytes boundary and permute the result with the 1st 16 bytes.
> > +
> > + *    |  y y y y y y y y y y y y y 0 1 2 | 3 4 5 6 7 8 9 a b c d e f z z z |
> > + *    ^                                  ^                                 ^
> > + * 0xbbbb10                          0xbbbb20                          0xbbb30
> > + *                                 ^
> > + *                                _vaddr
> > + *
> > + *
> > + * _vmask is the mask generated by LVS
> > + * _v1st_qw is the 1st aligned QW of current addr which is already loaded.
> > + *   for example: 0xyyyyyyyyyyyyy012 for big endian
> > + * _v2nd_qw is the 2nd aligned QW of cur _vaddr to be loaded.
> > + *   for example: 0x3456789abcdefzzz for big endian
> > + * The permute result is saved in _v_res.
> > + *   for example: 0x0123456789abcdef for big endian.
> > + */
> > +#define LD_VSR_CROSS16B(_vaddr,_vmask,_v1st_qw,_v2nd_qw,_v_res) \
> > +        lvx     _v2nd_qw,_vaddr,off16; \
> > +        VPERM(_v_res,_v1st_qw,_v2nd_qw,_vmask)
> > +
> >  /*
> >   * There are 2 categories for memcmp:
> >   * 1) src/dst has the same offset to the 8 bytes boundary. The handlers
> > @@ -174,6 +235,13 @@ _GLOBAL(memcmp)
> >  	blr
> >  
> >  .Llong:
> > +#ifdef CONFIG_ALTIVEC
> > +	/* Try to use vmx loop if length is larger than 4K */
> > +	cmpldi  cr6,r5,VMX_OPS_THRES
> > +	bge	cr6,.Lsameoffset_vmx_cmp
> 
> Here we compare the length to 4K and if it's greater *or equal* then we
> go to the VMX case. Or am I reading it backward?
> 
> So we should say "if the size is 4K or more we do VMX" shouldn't we?
Yes. Again I need reword the comment to "equal or greater than 4K"
here.

Thanks,
- Simon

^ permalink raw reply

* Re: [PATCH bpf-next v4 02/10] bpf: powerpc64: pad function address loads with NOPs
From: Sandipan Das @ 2018-05-24  8:25 UTC (permalink / raw)
  To: Daniel Borkmann
  Cc: ast, netdev, linuxppc-dev, mpe, naveen.n.rao, jakub.kicinski
In-Reply-To: <43081c8c-d8a8-254a-69f0-7941acab90a3@iogearbox.net>



On 05/24/2018 01:04 PM, Daniel Borkmann wrote:
> On 05/24/2018 08:56 AM, Sandipan Das wrote:
>> For multi-function programs, loading the address of a callee
>> function to a register requires emitting instructions whose
>> count varies from one to five depending on the nature of the
>> address.
>>
>> Since we come to know of the callee's address only before the
>> extra pass, the number of instructions required to load this
>> address may vary from what was previously generated. This can
>> make the JITed image grow or shrink.
>>
>> To avoid this, we should generate a constant five-instruction
>> when loading function addresses by padding the optimized load
>> sequence with NOPs.
>>
>> Signed-off-by: Sandipan Das <sandipan@linux.vnet.ibm.com>
>> ---
>>  arch/powerpc/net/bpf_jit_comp64.c | 34 +++++++++++++++++++++++-----------
>>  1 file changed, 23 insertions(+), 11 deletions(-)
>>
>> diff --git a/arch/powerpc/net/bpf_jit_comp64.c b/arch/powerpc/net/bpf_jit_comp64.c
>> index 1bdb1aff0619..e4582744a31d 100644
>> --- a/arch/powerpc/net/bpf_jit_comp64.c
>> +++ b/arch/powerpc/net/bpf_jit_comp64.c
>> @@ -167,25 +167,37 @@ static void bpf_jit_build_epilogue(u32 *image, struct codegen_context *ctx)
>>  
>>  static void bpf_jit_emit_func_call(u32 *image, struct codegen_context *ctx, u64 func)
>>  {
>> +	unsigned int i, ctx_idx = ctx->idx;
>> +
>> +	/* Load function address into r12 */
>> +	PPC_LI64(12, func);
>> +
>> +	/* For bpf-to-bpf function calls, the callee's address is unknown
>> +	 * until the last extra pass. As seen above, we use PPC_LI64() to
>> +	 * load the callee's address, but this may optimize the number of
>> +	 * instructions required based on the nature of the address.
>> +	 *
>> +	 * Since we don't want the number of instructions emitted to change,
>> +	 * we pad the optimized PPC_LI64() call with NOPs to guarantee that
>> +	 * we always have a five-instruction sequence, which is the maximum
>> +	 * that PPC_LI64() can emit.
>> +	 */
>> +	for (i = ctx->idx - ctx_idx; i < 5; i++)
>> +		PPC_NOP();
> 
> By the way, I think you can still optimize this. The nops are not really
> needed in case of insn->src_reg != BPF_PSEUDO_CALL since the address of
> a normal BPF helper call will always be at a fixed location and known a
> priori.
> 

Ah, true. Thanks for pointing this out. There are a few other things that
we are planning to do for the ppc64 JIT compiler. Will put out a patch for
this with that series.

- Sandipan

>>  #ifdef PPC64_ELF_ABI_v1
>> -	/* func points to the function descriptor */
>> -	PPC_LI64(b2p[TMP_REG_2], func);
>> -	/* Load actual entry point from function descriptor */
>> -	PPC_BPF_LL(b2p[TMP_REG_1], b2p[TMP_REG_2], 0);
>> -	/* ... and move it to LR */
>> -	PPC_MTLR(b2p[TMP_REG_1]);
>>  	/*
>>  	 * Load TOC from function descriptor at offset 8.
>>  	 * We can clobber r2 since we get called through a
>>  	 * function pointer (so caller will save/restore r2)
>>  	 * and since we don't use a TOC ourself.
>>  	 */
>> -	PPC_BPF_LL(2, b2p[TMP_REG_2], 8);
>> -#else
>> -	/* We can clobber r12 */
>> -	PPC_FUNC_ADDR(12, func);
>> -	PPC_MTLR(12);
>> +	PPC_BPF_LL(2, 12, 8);
>> +	/* Load actual entry point from function descriptor */
>> +	PPC_BPF_LL(12, 12, 0);
>>  #endif
>> +
>> +	PPC_MTLR(12);
>>  	PPC_BLRL();
>>  }
>>  
>>
> 

^ permalink raw reply

* [PATCH] KVM: PPC: remove mmio_vsx_tx_sx_enabled in PR KVM MMIO emulation
From: wei.guo.simon @ 2018-05-24  9:01 UTC (permalink / raw)
  To: kvm-ppc; +Cc: Paul Mackerras, kvm, linuxppc-dev, Simon Guo

From: Simon Guo <wei.guo.simon@gmail.com>

Originally PR KVM MMIO emulation uses only 0~31#(5 bits) for VSR
reg number, and use mmio_vsx_tx_sx_enabled field together for
0~63# VSR regs.

Currently PR KVM MMIO emulation is reimplemented with analyse_instr()
assistence. analyse_instr() returns 0~63 for VSR register number, so
it is not necessary to use additional mmio_vsx_tx_sx_enabled field
any more.

This patch extends related reg bits(expand io_gpr to u16 from u8
and use 6 bits for VSR reg#), so that mmio_vsx_tx_sx_enabled can
be removed.

Signed-off-by: Simon Guo <wei.guo.simon@gmail.com>
---
 arch/powerpc/include/asm/kvm_host.h  | 17 ++++++++---------
 arch/powerpc/kvm/emulate_loadstore.c |  7 +++----
 arch/powerpc/kvm/powerpc.c           | 30 +++++++++++++++---------------
 3 files changed, 26 insertions(+), 28 deletions(-)

diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index 8dc5e43..bd220a3 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -673,7 +673,7 @@ struct kvm_vcpu_arch {
 	gva_t vaddr_accessed;
 	pgd_t *pgdir;
 
-	u8 io_gpr; /* GPR used as IO source/target */
+	u16 io_gpr; /* GPR used as IO source/target */
 	u8 mmio_host_swabbed;
 	u8 mmio_sign_extend;
 	/* conversion between single and double precision */
@@ -689,7 +689,6 @@ struct kvm_vcpu_arch {
 	 */
 	u8 mmio_vsx_copy_nums;
 	u8 mmio_vsx_offset;
-	u8 mmio_vsx_tx_sx_enabled;
 	u8 mmio_vmx_copy_nums;
 	u8 mmio_vmx_offset;
 	u8 mmio_copy_type;
@@ -802,14 +801,14 @@ struct kvm_vcpu_arch {
 #define KVMPPC_VCPU_BUSY_IN_HOST	2
 
 /* Values for vcpu->arch.io_gpr */
-#define KVM_MMIO_REG_MASK	0x001f
-#define KVM_MMIO_REG_EXT_MASK	0xffe0
+#define KVM_MMIO_REG_MASK	0x003f
+#define KVM_MMIO_REG_EXT_MASK	0xffc0
 #define KVM_MMIO_REG_GPR	0x0000
-#define KVM_MMIO_REG_FPR	0x0020
-#define KVM_MMIO_REG_QPR	0x0040
-#define KVM_MMIO_REG_FQPR	0x0060
-#define KVM_MMIO_REG_VSX	0x0080
-#define KVM_MMIO_REG_VMX	0x00c0
+#define KVM_MMIO_REG_FPR	0x0040
+#define KVM_MMIO_REG_QPR	0x0080
+#define KVM_MMIO_REG_FQPR	0x00c0
+#define KVM_MMIO_REG_VSX	0x0100
+#define KVM_MMIO_REG_VMX	0x0180
 
 #define __KVM_HAVE_ARCH_WQP
 #define __KVM_HAVE_CREATE_DEVICE
diff --git a/arch/powerpc/kvm/emulate_loadstore.c b/arch/powerpc/kvm/emulate_loadstore.c
index dca7f1c..64b325b 100644
--- a/arch/powerpc/kvm/emulate_loadstore.c
+++ b/arch/powerpc/kvm/emulate_loadstore.c
@@ -106,7 +106,6 @@ int kvmppc_emulate_loadstore(struct kvm_vcpu *vcpu)
 	 * if mmio_vsx_tx_sx_enabled == 1, copy data between
 	 * VSR[32..63] and memory
 	 */
-	vcpu->arch.mmio_vsx_tx_sx_enabled = get_tx_or_sx(inst);
 	vcpu->arch.mmio_vsx_copy_nums = 0;
 	vcpu->arch.mmio_vsx_offset = 0;
 	vcpu->arch.mmio_copy_type = KVMPPC_VSX_COPY_NONE;
@@ -242,8 +241,8 @@ int kvmppc_emulate_loadstore(struct kvm_vcpu *vcpu)
 			}
 
 			emulated = kvmppc_handle_vsx_load(run, vcpu,
-					KVM_MMIO_REG_VSX | (op.reg & 0x1f),
-					io_size_each, 1, op.type & SIGNEXT);
+					KVM_MMIO_REG_VSX|op.reg, io_size_each,
+					1, op.type & SIGNEXT);
 			break;
 		}
 #endif
@@ -363,7 +362,7 @@ int kvmppc_emulate_loadstore(struct kvm_vcpu *vcpu)
 			}
 
 			emulated = kvmppc_handle_vsx_store(run, vcpu,
-					op.reg & 0x1f, io_size_each, 1);
+					op.reg, io_size_each, 1);
 			break;
 		}
 #endif
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index 05eccdc..dcc7982 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -881,10 +881,10 @@ static inline void kvmppc_set_vsr_dword(struct kvm_vcpu *vcpu,
 	if (offset == -1)
 		return;
 
-	if (vcpu->arch.mmio_vsx_tx_sx_enabled) {
-		val.vval = VCPU_VSX_VR(vcpu, index);
+	if (index >= 32) {
+		val.vval = VCPU_VSX_VR(vcpu, index - 32);
 		val.vsxval[offset] = gpr;
-		VCPU_VSX_VR(vcpu, index) = val.vval;
+		VCPU_VSX_VR(vcpu, index - 32) = val.vval;
 	} else {
 		VCPU_VSX_FPR(vcpu, index, offset) = gpr;
 	}
@@ -896,11 +896,11 @@ static inline void kvmppc_set_vsr_dword_dump(struct kvm_vcpu *vcpu,
 	union kvmppc_one_reg val;
 	int index = vcpu->arch.io_gpr & KVM_MMIO_REG_MASK;
 
-	if (vcpu->arch.mmio_vsx_tx_sx_enabled) {
-		val.vval = VCPU_VSX_VR(vcpu, index);
+	if (index >= 32) {
+		val.vval = VCPU_VSX_VR(vcpu, index - 32);
 		val.vsxval[0] = gpr;
 		val.vsxval[1] = gpr;
-		VCPU_VSX_VR(vcpu, index) = val.vval;
+		VCPU_VSX_VR(vcpu, index - 32) = val.vval;
 	} else {
 		VCPU_VSX_FPR(vcpu, index, 0) = gpr;
 		VCPU_VSX_FPR(vcpu, index, 1) = gpr;
@@ -913,12 +913,12 @@ static inline void kvmppc_set_vsr_word_dump(struct kvm_vcpu *vcpu,
 	union kvmppc_one_reg val;
 	int index = vcpu->arch.io_gpr & KVM_MMIO_REG_MASK;
 
-	if (vcpu->arch.mmio_vsx_tx_sx_enabled) {
+	if (index >= 32) {
 		val.vsx32val[0] = gpr;
 		val.vsx32val[1] = gpr;
 		val.vsx32val[2] = gpr;
 		val.vsx32val[3] = gpr;
-		VCPU_VSX_VR(vcpu, index) = val.vval;
+		VCPU_VSX_VR(vcpu, index - 32) = val.vval;
 	} else {
 		val.vsx32val[0] = gpr;
 		val.vsx32val[1] = gpr;
@@ -938,10 +938,10 @@ static inline void kvmppc_set_vsr_word(struct kvm_vcpu *vcpu,
 	if (offset == -1)
 		return;
 
-	if (vcpu->arch.mmio_vsx_tx_sx_enabled) {
-		val.vval = VCPU_VSX_VR(vcpu, index);
+	if (index >= 32) {
+		val.vval = VCPU_VSX_VR(vcpu, index - 32);
 		val.vsx32val[offset] = gpr32;
-		VCPU_VSX_VR(vcpu, index) = val.vval;
+		VCPU_VSX_VR(vcpu, index - 32) = val.vval;
 	} else {
 		dword_offset = offset / 2;
 		word_offset = offset % 2;
@@ -1362,10 +1362,10 @@ static inline int kvmppc_get_vsr_data(struct kvm_vcpu *vcpu, int rs, u64 *val)
 			break;
 		}
 
-		if (!vcpu->arch.mmio_vsx_tx_sx_enabled) {
+		if (rs < 32) {
 			*val = VCPU_VSX_FPR(vcpu, rs, vsx_offset);
 		} else {
-			reg.vval = VCPU_VSX_VR(vcpu, rs);
+			reg.vval = VCPU_VSX_VR(vcpu, rs - 32);
 			*val = reg.vsxval[vsx_offset];
 		}
 		break;
@@ -1379,13 +1379,13 @@ static inline int kvmppc_get_vsr_data(struct kvm_vcpu *vcpu, int rs, u64 *val)
 			break;
 		}
 
-		if (!vcpu->arch.mmio_vsx_tx_sx_enabled) {
+		if (rs < 32) {
 			dword_offset = vsx_offset / 2;
 			word_offset = vsx_offset % 2;
 			reg.vsxval[0] = VCPU_VSX_FPR(vcpu, rs, dword_offset);
 			*val = reg.vsx32val[word_offset];
 		} else {
-			reg.vval = VCPU_VSX_VR(vcpu, rs);
+			reg.vval = VCPU_VSX_VR(vcpu, rs - 32);
 			*val = reg.vsx32val[vsx_offset];
 		}
 		break;
-- 
1.8.3.1

^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox